datapath: Add tunnel header caching.
author     Jesse Gross <jesse@nicira.com>
           Fri, 27 Aug 2010 20:55:02 +0000 (13:55 -0700)
committer  Jesse Gross <jesse@nicira.com>
           Wed, 22 Sep 2010 20:43:02 +0000 (13:43 -0700)
On the transmit path we generate essentially the same tunnel header
for every packet sent to a given destination.  However, for each packet
we currently assemble the headers piece by piece, look up the destination
in the routing table, and look up the flow in OVS.  This patch avoids
that extra work by caching all of the header and output path information
and rebuilding it only when something actually changes.

This optimization reduces CPU load on transmit by approximately 13%.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Reviewed-by: Ben Pfaff <blp@nicira.com>
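
The idea can be summarized with a small, self-contained user-space sketch
(illustrative only; names such as hdr_cache and rebuild_header are made up
and do not appear in the patch below): the transmit path reuses a prebuilt
header with a single memcpy and falls back to rebuilding it only when a
validity check fails.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

#define HDR_MAX 64

struct hdr_cache {
        unsigned config_seq;            /* config generation the header was built from */
        time_t expiration;              /* coarse fallback invalidation, like MAX_CACHE_EXP */
        size_t len;
        unsigned char data[HDR_MAX];
};

static bool cache_valid(const struct hdr_cache *c, unsigned cur_seq)
{
        return c->len != 0 &&
               c->config_seq == cur_seq &&
               time(NULL) < c->expiration;
}

/* Slow path: assemble the header from scratch (stubbed out here). */
static void rebuild_header(struct hdr_cache *c, unsigned cur_seq)
{
        memset(c->data, 0xab, sizeof(c->data)); /* stand-in for IP + tunnel header bytes */
        c->len = 24;
        c->config_seq = cur_seq;
        c->expiration = time(NULL) + 1;
}

static void xmit(struct hdr_cache *c, unsigned cur_seq,
                 const unsigned char *payload, size_t plen)
{
        unsigned char pkt[HDR_MAX + 256];

        if (!cache_valid(c, cur_seq))
                rebuild_header(c, cur_seq);     /* only when something changed */

        memcpy(pkt, c->data, c->len);           /* fast path: one memcpy */
        memcpy(pkt + c->len, payload, plen);
        printf("sent %zu byte packet\n", c->len + plen);
}

int main(void)
{
        struct hdr_cache c = { 0 };
        unsigned seq = 1;
        unsigned char payload[100] = { 0 };

        xmit(&c, seq, payload, sizeof(payload)); /* rebuilds */
        xmit(&c, seq, payload, sizeof(payload)); /* reuses the cached header */
        seq++;                                   /* configuration changed */
        xmit(&c, seq, payload, sizeof(payload)); /* rebuilds again */
        return 0;
}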
datapath/tunnel.c
datapath/tunnel.h
datapath/vport-capwap.c
datapath/vport-gre.c
include/openvswitch/tunnel.h

index 6fa369be0c49628d4cfd43c12a4e68b8eddb07f0..77f976fdc531aae6f2b4a40d3235ec6ff2d2ebdd 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/jhash.h>
 #include <linux/kernel.h>
 #include <linux/version.h>
+#include <linux/workqueue.h>
 
 #include <net/dsfield.h>
 #include <net/dst.h>
 #include "tunnel.h"
 #include "vport.h"
 #include "vport-generic.h"
+#include "vport-internal_dev.h"
+
+#ifdef NEED_CACHE_TIMEOUT
+/*
+ * On kernels where we can't quickly detect changes in the rest of the system
+ * we use an expiration time to invalidate the cache.  A shorter expiration
+ * reduces the length of time that we may potentially blackhole packets while
+ * a longer time increases performance by reducing the frequency that the
+ * cache needs to be rebuilt.  A variety of factors may cause the cache to be
+ * invalidated before the expiration time but this is the maximum.  The time
+ * is expressed in jiffies.
+ */
+#define MAX_CACHE_EXP HZ
+#endif
+
+/*
+ * Interval to check for and remove caches that are no longer valid.  Caches
+ * are checked for validity before they are used for packet encapsulation and
+ * old caches are removed at that time.  However, if no packets are sent through
+ * the tunnel then the cache will never be destroyed.  Since it holds
+ * references to a number of system objects, the cache will continue to use
+ * system resources by not allowing those objects to be destroyed.  The cache
+ * cleaner is periodically run to free invalid caches.  It does not
+ * significantly affect system performance.  A lower interval will release
+ * resources faster but will itself consume resources by requiring more frequent
+ * checks.  A longer interval may result in messages being printed to the kernel
+ * message buffer about unreleased resources.  The interval is expressed in
+ * jiffies.
+ */
+#define CACHE_CLEANER_INTERVAL (5 * HZ)
+
+#define CACHE_DATA_ALIGN 16
 
 /* Protected by RCU. */
 static struct tbl *port_table;
 
+static void cache_cleaner(struct work_struct *work);
+DECLARE_DELAYED_WORK(cache_cleaner_wq, cache_cleaner);
+
 /*
  * These are just used as an optimization: they don't require any kind of
  * synchronization because we could have just as easily read the value before
@@ -63,22 +99,54 @@ static inline struct tnl_vport *tnl_vport_table_cast(const struct tbl_node *node
        return container_of(node, struct tnl_vport, tbl_node);
 }
 
-/* RCU callback. */
-static void free_config(struct rcu_head *rcu)
+static inline void schedule_cache_cleaner(void)
+{
+       schedule_delayed_work(&cache_cleaner_wq, CACHE_CLEANER_INTERVAL);
+}
+
+static void free_cache(struct tnl_cache *cache)
+{
+       if (!cache)
+               return;
+
+       flow_put(cache->flow);
+       ip_rt_put(cache->rt);
+       kfree(cache);
+}
+
+static void free_config_rcu(struct rcu_head *rcu)
 {
        struct tnl_mutable_config *c = container_of(rcu, struct tnl_mutable_config, rcu);
        kfree(c);
 }
 
+static void free_cache_rcu(struct rcu_head *rcu)
+{
+       struct tnl_cache *c = container_of(rcu, struct tnl_cache, rcu);
+       free_cache(c);
+}
+
 static void assign_config_rcu(struct vport *vport,
                              struct tnl_mutable_config *new_config)
 {
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        struct tnl_mutable_config *old_config;
 
-       old_config = rcu_dereference(tnl_vport->mutable);
+       old_config = tnl_vport->mutable;
        rcu_assign_pointer(tnl_vport->mutable, new_config);
-       call_rcu(&old_config->rcu, free_config);
+       call_rcu(&old_config->rcu, free_config_rcu);
+}
+
+static void assign_cache_rcu(struct vport *vport, struct tnl_cache *new_cache)
+{
+       struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
+       struct tnl_cache *old_cache;
+
+       old_cache = tnl_vport->cache;
+       rcu_assign_pointer(tnl_vport->cache, new_cache);
+
+       if (old_cache)
+               call_rcu(&old_cache->rcu, free_cache_rcu);
 }
 
 static unsigned int *find_port_pool(const struct tnl_mutable_config *mutable)
@@ -130,10 +198,32 @@ static u32 port_hash(struct port_lookup_key *lookup)
        return jhash2(lookup->vals, ARRAY_SIZE(lookup->vals), 0);
 }
 
+static u32 mutable_hash(const struct tnl_mutable_config *mutable)
+{
+       struct port_lookup_key lookup;
+
+       lookup.vals[LOOKUP_SADDR] = mutable->port_config.saddr;
+       lookup.vals[LOOKUP_DADDR] = mutable->port_config.daddr;
+       lookup.vals[LOOKUP_KEY] = mutable->port_config.in_key;
+       lookup.vals[LOOKUP_TUNNEL_TYPE] = mutable->tunnel_type;
+
+       return port_hash(&lookup);
+}
+
+static void check_table_empty(void)
+{
+       if (tbl_count(port_table) == 0) {
+               struct tbl *old_table = port_table;
+
+               cancel_delayed_work_sync(&cache_cleaner_wq);
+               rcu_assign_pointer(port_table, NULL);
+               tbl_deferred_destroy(old_table, NULL);
+       }
+}
+
 static int add_port(struct vport *vport)
 {
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
-       struct port_lookup_key lookup;
        int err;
 
        if (!port_table) {
@@ -144,6 +234,7 @@ static int add_port(struct vport *vport)
                        return -ENOMEM;
 
                rcu_assign_pointer(port_table, new_table);
+               schedule_cache_cleaner();
 
        } else if (tbl_count(port_table) > tbl_n_buckets(port_table)) {
                struct tbl *old_table = port_table;
@@ -157,16 +248,44 @@ static int add_port(struct vport *vport)
                tbl_deferred_destroy(old_table, NULL);
        }
 
-       lookup.vals[LOOKUP_SADDR] = tnl_vport->mutable->port_config.saddr;
-       lookup.vals[LOOKUP_DADDR] = tnl_vport->mutable->port_config.daddr;
-       lookup.vals[LOOKUP_KEY] = tnl_vport->mutable->port_config.in_key;
-       lookup.vals[LOOKUP_TUNNEL_TYPE] = tnl_vport->mutable->tunnel_type;
+       err = tbl_insert(port_table, &tnl_vport->tbl_node, mutable_hash(tnl_vport->mutable));
+       if (err) {
+               check_table_empty();
+               return err;
+       }
+
+       (*find_port_pool(tnl_vport->mutable))++;
+
+       return 0;
+}
+
+static int move_port(struct vport *vport, struct tnl_mutable_config *new_mutable)
+{
+       int err;
+       struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
+       u32 hash;
+
+       hash = mutable_hash(new_mutable);
+       if (hash == tnl_vport->tbl_node.hash)
+               goto table_updated;
 
-       err = tbl_insert(port_table, &tnl_vport->tbl_node, port_hash(&lookup));
+       /*
+        * Ideally we should make this move atomic to avoid having gaps in
+        * finding tunnels or the possibility of failure.  However, if we do
+        * find a tunnel it will always be consistent.
+        */
+       err = tbl_remove(port_table, &tnl_vport->tbl_node);
        if (err)
                return err;
 
-       (*find_port_pool(tnl_vport->mutable))++;
+       err = tbl_insert(port_table, &tnl_vport->tbl_node, hash);
+       if (err) {
+               check_table_empty();
+               return err;
+       }
+
+table_updated:
+       assign_config_rcu(vport, new_mutable);
 
        return 0;
 }
@@ -180,6 +299,7 @@ static int del_port(struct vport *vport)
        if (err)
                return err;
 
+       check_table_empty();
        (*find_port_pool(tnl_vport->mutable))--;
 
        return 0;
@@ -193,7 +313,7 @@ struct vport *tnl_find_port(__be32 saddr, __be32 daddr, __be32 key,
        struct tbl *table = rcu_dereference(port_table);
        struct tbl_node *tbl_node;
 
-       if (!table)
+       if (unlikely(!table))
                return NULL;
 
        lookup.vals[LOOKUP_SADDR] = saddr;
@@ -246,6 +366,60 @@ found:
        return tnl_vport_to_vport(tnl_vport_table_cast(tbl_node));
 }
 
+static inline void ecn_decapsulate(struct sk_buff *skb)
+{
+       u8 tos = ip_hdr(skb)->tos;
+
+       if (INET_ECN_is_ce(tos)) {
+               __be16 protocol = skb->protocol;
+               unsigned int nw_header = skb_network_offset(skb);
+
+               if (skb->protocol == htons(ETH_P_8021Q)) {
+                       if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
+                               return;
+
+                       protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
+                       nw_header += VLAN_HLEN;
+               }
+
+               if (protocol == htons(ETH_P_IP)) {
+                       if (unlikely(!pskb_may_pull(skb, nw_header
+                           + sizeof(struct iphdr))))
+                               return;
+
+                       IP_ECN_set_ce((struct iphdr *)(skb->data + nw_header));
+               }
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+               else if (protocol == htons(ETH_P_IPV6)) {
+                       if (unlikely(!pskb_may_pull(skb, nw_header
+                           + sizeof(struct ipv6hdr))))
+                               return;
+
+                       IP6_ECN_set_ce((struct ipv6hdr *)(skb->data + nw_header));
+               }
+#endif
+       }
+}
+
+/* Called with rcu_read_lock. */
+void tnl_rcv(struct vport *vport, struct sk_buff *skb)
+{
+       skb->pkt_type = PACKET_HOST;
+       skb->protocol = eth_type_trans(skb, skb->dev);
+
+       skb_dst_drop(skb);
+       nf_reset(skb);
+       secpath_reset(skb);
+       skb_reset_network_header(skb);
+
+       ecn_decapsulate(skb);
+
+       skb_push(skb, ETH_HLEN);
+       compute_ip_summed(skb, false);
+
+       vport_receive(vport, skb);
+}
+
 static bool check_ipv4_address(__be32 addr)
 {
        if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr)
@@ -514,179 +688,412 @@ bool tnl_frag_needed(struct vport *vport, const struct tnl_mutable_config *mutab
        return true;
 }
 
-static struct sk_buff *check_headroom(struct sk_buff *skb, int headroom)
+static bool check_mtu(struct sk_buff *skb,
+                     struct vport *vport,
+                     const struct tnl_mutable_config *mutable,
+                     const struct rtable *rt, __be16 *frag_offp)
 {
-       if (skb_headroom(skb) < headroom || skb_header_cloned(skb)) {
-               struct sk_buff *nskb = skb_realloc_headroom(skb, headroom + 16);
-               if (unlikely(!nskb)) {
-                       kfree_skb(skb);
-                       return ERR_PTR(-ENOMEM);
+       int mtu;
+       __be16 frag_off;
+
+       frag_off = (mutable->port_config.flags & TNL_F_PMTUD) ? htons(IP_DF) : 0;
+       if (frag_off)
+               mtu = dst_mtu(&rt_dst(rt))
+                       - ETH_HLEN
+                       - mutable->tunnel_hlen
+                       - (eth_hdr(skb)->h_proto == htons(ETH_P_8021Q) ? VLAN_HLEN : 0);
+       else
+               mtu = mutable->mtu;
+
+       if (skb->protocol == htons(ETH_P_IP)) {
+               struct iphdr *old_iph = ip_hdr(skb);
+
+               frag_off |= old_iph->frag_off & htons(IP_DF);
+               mtu = max(mtu, IP_MIN_MTU);
+
+               if ((old_iph->frag_off & htons(IP_DF)) &&
+                   mtu < ntohs(old_iph->tot_len)) {
+                       if (tnl_frag_needed(vport, mutable, skb, mtu, OVS_CB(skb)->tun_id))
+                               goto drop;
                }
+       }
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+       else if (skb->protocol == htons(ETH_P_IPV6)) {
+               unsigned int packet_length = skb->len - ETH_HLEN
+                       - (eth_hdr(skb)->h_proto == htons(ETH_P_8021Q) ? VLAN_HLEN : 0);
 
-               set_skb_csum_bits(skb, nskb);
+               mtu = max(mtu, IPV6_MIN_MTU);
 
-               if (skb->sk)
-                       skb_set_owner_w(nskb, skb->sk);
+               /* IPv6 requires PMTUD if the packet is above the minimum MTU. */
+               if (packet_length > IPV6_MIN_MTU)
+                       frag_off = htons(IP_DF);
 
-               dev_kfree_skb(skb);
-               return nskb;
+               if (mtu < packet_length) {
+                       if (tnl_frag_needed(vport, mutable, skb, mtu, OVS_CB(skb)->tun_id))
+                               goto drop;
+               }
        }
+#endif
 
-       return skb;
+       *frag_offp = frag_off;
+       return true;
+
+drop:
+       *frag_offp = 0;
+       return false;
 }
 
-static inline u8 ecn_encapsulate(u8 tos, struct sk_buff *skb)
+static void create_tunnel_header(const struct vport *vport,
+                                const struct tnl_mutable_config *mutable,
+                                const struct rtable *rt, void *header)
 {
-       u8 inner;
+       struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
+       struct iphdr *iph = header;
+
+       iph->version    = 4;
+       iph->ihl        = sizeof(struct iphdr) >> 2;
+       iph->frag_off   = htons(IP_DF);
+       iph->protocol   = tnl_vport->tnl_ops->ipproto;
+       iph->tos        = mutable->port_config.tos;
+       iph->daddr      = rt->rt_dst;
+       iph->saddr      = rt->rt_src;
+       iph->ttl        = mutable->port_config.ttl;
+       if (!iph->ttl)
+               iph->ttl = dst_metric(&rt_dst(rt), RTAX_HOPLIMIT);
+
+       tnl_vport->tnl_ops->build_header(vport, mutable, iph + 1);
+}
 
-       if (skb->protocol == htons(ETH_P_IP))
-               inner = ((struct iphdr *)skb_network_header(skb))->tos;
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-       else if (skb->protocol == htons(ETH_P_IPV6))
-               inner = ipv6_get_dsfield((struct ipv6hdr *)skb_network_header(skb));
-#endif
-       else
-               inner = 0;
+static inline void *get_cached_header(const struct tnl_cache *cache)
+{
+       return (void *)cache + ALIGN(sizeof(struct tnl_cache), CACHE_DATA_ALIGN);
+}
 
-       return INET_ECN_encapsulate(tos, inner);
+static inline bool check_cache_valid(const struct tnl_cache *cache,
+                                    const struct tnl_mutable_config *mutable)
+{
+       return cache &&
+#ifdef NEED_CACHE_TIMEOUT
+               time_before(jiffies, cache->expiration) &&
+#endif
+#ifdef HAVE_RT_GENID
+               atomic_read(&init_net.ipv4.rt_genid) == cache->rt->rt_genid &&
+#endif
+#ifdef HAVE_HH_SEQ
+               rt_dst(cache->rt).hh->hh_lock.sequence == cache->hh_seq &&
+#endif
+               mutable->seq == cache->mutable_seq &&
+               (!is_internal_dev(rt_dst(cache->rt).dev) ||
+               (cache->flow && !cache->flow->dead));
 }
 
-static inline void ecn_decapsulate(struct sk_buff *skb)
+static int cache_cleaner_cb(struct tbl_node *tbl_node, void *aux)
 {
-       u8 tos = ip_hdr(skb)->tos;
+       struct tnl_vport *tnl_vport = tnl_vport_table_cast(tbl_node);
+       const struct tnl_mutable_config *mutable = rcu_dereference(tnl_vport->mutable);
+       const struct tnl_cache *cache = rcu_dereference(tnl_vport->cache);
 
-       if (INET_ECN_is_ce(tos)) {
-               __be16 protocol = skb->protocol;
-               unsigned int nw_header = skb_network_header(skb) - skb->data;
+       if (cache && !check_cache_valid(cache, mutable) &&
+           spin_trylock_bh(&tnl_vport->cache_lock)) {
+               assign_cache_rcu(tnl_vport_to_vport(tnl_vport), NULL);
+               spin_unlock_bh(&tnl_vport->cache_lock);
+       }
 
-               if (skb->protocol == htons(ETH_P_8021Q)) {
-                       if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
-                               return;
+       return 0;
+}
 
-                       protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
-                       nw_header += VLAN_HLEN;
-               }
+static void cache_cleaner(struct work_struct *work)
+{
+       schedule_cache_cleaner();
 
-               if (protocol == htons(ETH_P_IP)) {
-                       if (unlikely(!pskb_may_pull(skb, nw_header
-                           + sizeof(struct iphdr))))
-                               return;
+       rcu_read_lock();
+       tbl_foreach(port_table, cache_cleaner_cb, NULL);
+       rcu_read_unlock();
+}
 
-                       IP_ECN_set_ce((struct iphdr *)(nw_header + skb->data));
-               }
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-               else if (protocol == htons(ETH_P_IPV6)) {
-                       if (unlikely(!pskb_may_pull(skb, nw_header
-                           + sizeof(struct ipv6hdr))))
-                               return;
+static inline void create_eth_hdr(struct tnl_cache *cache,
+                                 const struct rtable *rt)
+{
+       void *cache_data = get_cached_header(cache);
+       int hh_len = rt_dst(rt).hh->hh_len;
+       int hh_off = HH_DATA_ALIGN(rt_dst(rt).hh->hh_len) - hh_len;
 
-                       IP6_ECN_set_ce((struct ipv6hdr *)(nw_header
-                                                         + skb->data));
-               }
+#ifdef HAVE_HH_SEQ
+       unsigned hh_seq;
+
+       do {
+               hh_seq = read_seqbegin(&rt_dst(rt).hh->hh_lock);
+               memcpy(cache_data, (void *)rt_dst(rt).hh->hh_data + hh_off, hh_len);
+       } while (read_seqretry(&rt_dst(rt).hh->hh_lock, hh_seq));
+
+       cache->hh_seq = hh_seq;
+#else
+       read_lock_bh(&rt_dst(rt).hh->hh_lock);
+       memcpy(cache_data, (void *)rt_dst(rt).hh->hh_data + hh_off, hh_len);
+       read_unlock_bh(&rt_dst(rt).hh->hh_lock);
 #endif
-       }
 }
 
-static struct sk_buff *handle_gso(struct sk_buff *skb)
+static struct tnl_cache *build_cache(struct vport *vport,
+                                    const struct tnl_mutable_config *mutable,
+                                    struct rtable *rt)
 {
-       if (skb_is_gso(skb)) {
-               struct sk_buff *nskb = skb_gso_segment(skb, 0);
+       struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
+       struct tnl_cache *cache;
+       void *cache_data;
+       int cache_len;
 
-               dev_kfree_skb(skb);
-               return nskb;
+       if (!(mutable->port_config.flags & TNL_F_HDR_CACHE))
+               return NULL;
+
+       /*
+        * If there is no entry in the ARP cache or if this device does not
+        * support hard header caching just fall back to the IP stack.
+        */
+       if (!rt_dst(rt).hh)
+               return NULL;
+
+       /*
+        * If lock is contended fall back to directly building the header.
+        * We're not going to help performance by sitting here spinning.
+        */
+       if (!spin_trylock_bh(&tnl_vport->cache_lock))
+               return NULL;
+
+       cache = tnl_vport->cache;
+       if (check_cache_valid(cache, mutable))
+               goto unlock;
+       else
+               cache = NULL;
+
+       cache_len = rt_dst(rt).hh->hh_len + mutable->tunnel_hlen;
+
+       cache = kzalloc(ALIGN(sizeof(struct tnl_cache), CACHE_DATA_ALIGN) +
+                       cache_len, GFP_ATOMIC);
+       if (!cache)
+               goto unlock;
+
+       cache->len = cache_len;
+
+       create_eth_hdr(cache, rt);
+       cache_data = get_cached_header(cache) + rt_dst(rt).hh->hh_len;
+
+       create_tunnel_header(vport, mutable, rt, cache_data);
+
+       cache->mutable_seq = mutable->seq;
+       cache->rt = rt;
+#ifdef NEED_CACHE_TIMEOUT
+       cache->expiration = jiffies + tnl_vport->cache_exp_interval;
+#endif
+
+       if (is_internal_dev(rt_dst(rt).dev)) {
+               int err;
+               struct vport *vport;
+               struct dp_port *dp_port;
+               struct sk_buff *skb;
+               bool is_frag;
+               struct odp_flow_key flow_key;
+               struct tbl_node *flow_node;
+
+               vport = internal_dev_get_vport(rt_dst(rt).dev);
+               if (!vport)
+                       goto done;
+
+               dp_port = vport_get_dp_port(vport);
+               if (!dp_port)
+                       goto done;
+
+               skb = alloc_skb(cache->len, GFP_ATOMIC);
+               if (!skb)
+                       goto done;
+
+               __skb_put(skb, cache->len);
+               memcpy(skb->data, get_cached_header(cache), cache->len);
+
+               err = flow_extract(skb, dp_port->port_no, &flow_key, &is_frag);
+
+               kfree_skb(skb);
+               if (err || is_frag)
+                       goto done;
+
+               flow_node = tbl_lookup(rcu_dereference(dp_port->dp->table),
+                                      &flow_key, flow_hash(&flow_key),
+                                      flow_cmp);
+               if (flow_node) {
+                       struct sw_flow *flow = flow_cast(flow_node);
+
+                       cache->flow = flow;
+                       flow_hold(flow);
+               }
        }
 
-       return skb;
+done:
+       assign_cache_rcu(vport, cache);
+
+unlock:
+       spin_unlock_bh(&tnl_vport->cache_lock);
+
+       return cache;
 }
 
-static int handle_csum_offload(struct sk_buff *skb)
+static struct rtable *find_route(struct vport *vport,
+                                const struct tnl_mutable_config *mutable,
+                                u8 tos, struct tnl_cache **cache)
 {
-       if (skb->ip_summed == CHECKSUM_PARTIAL)
-               return skb_checksum_help(skb);
-       else {
-               skb->ip_summed = CHECKSUM_NONE;
-               return 0;
+       struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
+       struct tnl_cache *cur_cache = rcu_dereference(tnl_vport->cache);
+
+       *cache = NULL;
+       tos = RT_TOS(tos);
+
+       if (likely(tos == mutable->port_config.tos &&
+                  check_cache_valid(cur_cache, mutable))) {
+               *cache = cur_cache;
+               return cur_cache->rt;
+       } else {
+               struct rtable *rt;
+               struct flowi fl = { .nl_u = { .ip4_u =
+                                             { .daddr = mutable->port_config.daddr,
+                                               .saddr = mutable->port_config.saddr,
+                                               .tos = tos } },
+                                   .proto = tnl_vport->tnl_ops->ipproto };
+
+               if (unlikely(ip_route_output_key(&init_net, &rt, &fl)))
+                       return NULL;
+
+               if (likely(tos == mutable->port_config.tos))
+                       *cache = build_cache(vport, mutable, rt);
+
+               return rt;
        }
 }
 
-/* Called with rcu_read_lock. */
-void tnl_rcv(struct vport *vport, struct sk_buff *skb)
+static struct sk_buff *check_headroom(struct sk_buff *skb, int headroom)
 {
-       skb->pkt_type = PACKET_HOST;
-       skb->protocol = eth_type_trans(skb, skb->dev);
+       if (skb_headroom(skb) < headroom || skb_header_cloned(skb)) {
+               struct sk_buff *nskb = skb_realloc_headroom(skb, headroom + 16);
+               if (unlikely(!nskb)) {
+                       kfree_skb(skb);
+                       return ERR_PTR(-ENOMEM);
+               }
 
-       skb_dst_drop(skb);
-       nf_reset(skb);
-       secpath_reset(skb);
-       skb_reset_network_header(skb);
+               set_skb_csum_bits(skb, nskb);
 
-       ecn_decapsulate(skb);
+               if (skb->sk)
+                       skb_set_owner_w(nskb, skb->sk);
 
-       skb_push(skb, ETH_HLEN);
-       compute_ip_summed(skb, false);
+               kfree_skb(skb);
+               return nskb;
+       }
 
-       vport_receive(vport, skb);
+       return skb;
 }
 
-static int build_packet(struct vport *vport, const struct tnl_mutable_config *mutable,
-                       struct iphdr *iph, struct rtable *rt, int max_headroom,
-                       int mtu, struct sk_buff *skb)
+static inline bool need_linearize(const struct sk_buff *skb)
 {
-       struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
+       int i;
+
+       if (unlikely(skb_shinfo(skb)->frag_list))
+               return true;
+
+       /*
+        * Generally speaking we should linearize if there are paged frags.
+        * However, if all of the refcounts are 1 we know nobody else can
+        * change them from underneath us and we can skip the linearization.
+        */
+       for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+               if (unlikely(page_count(skb_shinfo(skb)->frags[i].page) > 1))
+                       return true;
+
+       return false;
+}
+
+static struct sk_buff *handle_offloads(struct sk_buff *skb,
+                                      const struct tnl_mutable_config *mutable,
+                                      const struct rtable *rt)
+{
+       int min_headroom;
        int err;
-       struct iphdr *new_iph;
-       int orig_len = skb->len;
-       __be16 frag_off = iph->frag_off;
 
-       skb = check_headroom(skb, max_headroom);
-       if (unlikely(IS_ERR(skb)))
-               goto error;
+       forward_ip_summed(skb);
 
-       err = handle_csum_offload(skb);
+       err = vswitch_skb_checksum_setup(skb);
        if (unlikely(err))
                goto error_free;
 
-       if (skb->protocol == htons(ETH_P_IP)) {
-               struct iphdr *old_iph = ip_hdr(skb);
+       min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
+                       + mutable->tunnel_hlen;
 
-               if ((old_iph->frag_off & htons(IP_DF)) &&
-                   mtu < ntohs(old_iph->tot_len)) {
-                       if (tnl_frag_needed(vport, mutable, skb, mtu, OVS_CB(skb)->tun_id))
-                               goto error_free;
+       if (skb_is_gso(skb)) {
+               struct sk_buff *nskb;
+
+               /*
+                * If we are doing GSO on a pskb it is better to make sure that
+                * the headroom is correct now.  We will only have to copy the
+                * portion in the linear data area and GSO will preserve
+                * headroom when it creates the segments.  This is particularly
+                * beneficial on Xen where we get a lot of GSO pskbs.
+                * Conversely, we avoid copying if it is just to get our own
+                * writable clone because GSO will do the copy for us.
+                */
+               if (skb_headroom(skb) < min_headroom) {
+                       skb = check_headroom(skb, min_headroom);
+                       if (unlikely(IS_ERR(skb))) {
+                               err = PTR_ERR(skb);
+                               goto error;
+                       }
                }
 
-       }
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-       else if (skb->protocol == htons(ETH_P_IPV6)) {
-               unsigned int packet_length = skb->len - ETH_HLEN
-                       - (eth_hdr(skb)->h_proto == htons(ETH_P_8021Q) ? VLAN_HLEN : 0);
+               nskb = skb_gso_segment(skb, 0);
+               kfree_skb(skb);
+               if (unlikely(IS_ERR(nskb))) {
+                       err = PTR_ERR(nskb);
+                       goto error;
+               }
 
-               /* IPv6 requires PMTUD if the packet is above the minimum MTU. */
-               if (packet_length > IPV6_MIN_MTU)
-                       frag_off = htons(IP_DF);
+               skb = nskb;
+       } else {
+               skb = check_headroom(skb, min_headroom);
+               if (unlikely(IS_ERR(skb))) {
+                       err = PTR_ERR(skb);
+                       goto error;
+               }
 
-               if (mtu < packet_length) {
-                       if (tnl_frag_needed(vport, mutable, skb, mtu, OVS_CB(skb)->tun_id))
+               if (skb->ip_summed == CHECKSUM_PARTIAL) {
+                       /*
+                        * Pages aren't locked and could change at any time.
+                        * If this happens after we compute the checksum, the
+                        * checksum will be wrong.  We linearize now to avoid
+                        * this problem.
+                        */
+                       if (unlikely(need_linearize(skb))) {
+                               err = __skb_linearize(skb);
+                               if (unlikely(err))
+                                       goto error_free;
+                       }
+
+                       err = skb_checksum_help(skb);
+                       if (unlikely(err))
                                goto error_free;
-               }
+               } else if (skb->ip_summed == CHECKSUM_COMPLETE)
+                       skb->ip_summed = CHECKSUM_NONE;
        }
-#endif
 
-       new_iph = (struct iphdr *)skb_push(skb, mutable->tunnel_hlen);
-       skb_reset_network_header(skb);
-       skb_set_transport_header(skb, sizeof(struct iphdr));
-
-       memcpy(new_iph, iph, sizeof(struct iphdr));
-       new_iph->frag_off = frag_off;
-       ip_select_ident(new_iph, &rt_dst(rt), NULL);
+       return skb;
 
-       memset(&IPCB(skb)->opt, 0, sizeof(IPCB(skb)->opt));
-       IPCB(skb)->flags = 0;
+error_free:
+       kfree_skb(skb);
+error:
+       return ERR_PTR(err);
+}
 
-       skb = tnl_vport->tnl_ops->build_header(skb, vport, mutable, &rt_dst(rt));
-       if (unlikely(!skb))
-               goto error;
+static int send_frags(struct sk_buff *skb,
+                     const struct tnl_mutable_config *mutable)
+{
+       int sent_len;
+       int err;
 
+       sent_len = 0;
        while (skb) {
                struct sk_buff *next = skb->next;
                int frag_len = skb->len - mutable->tunnel_hlen;
@@ -694,34 +1101,26 @@ static int build_packet(struct vport *vport, const struct tnl_mutable_config *mu
                skb->next = NULL;
 
                err = ip_local_out(skb);
-               if (unlikely(net_xmit_eval(err) != 0)) {
-                       orig_len -= frag_len;
+               if (likely(net_xmit_eval(err) == 0))
+                       sent_len += frag_len;
+               else {
                        skb = next;
                        goto free_frags;
                }
 
                skb = next;
-       };
+       }
 
-       return orig_len;
+       return sent_len;
 
-error_free:
-       kfree_skb(skb);
-error:
-       return 0;
 free_frags:
        /*
         * There's no point in continuing to send fragments once one has been
         * dropped so just free the rest.  This may help improve the congestion
         * that caused the first packet to be dropped.
         */
-       while (skb) {
-               struct sk_buff *next = skb->next;
-               orig_len -= skb->len - mutable->tunnel_hlen;
-               kfree_skb(skb);
-               skb = next;
-       };
-       return orig_len;
+       tnl_free_linked_skbs(skb);
+       return sent_len;
 }
 
 int tnl_send(struct vport *vport, struct sk_buff *skb)
@@ -729,12 +1128,15 @@ int tnl_send(struct vport *vport, struct sk_buff *skb)
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        const struct tnl_mutable_config *mutable = rcu_dereference(tnl_vport->mutable);
 
-       struct iphdr *old_iph;
-       int orig_len;
-       struct iphdr iph;
+       enum vport_err_type err = VPORT_E_TX_ERROR;
        struct rtable *rt;
-       int max_headroom;
-       int mtu;
+       struct dst_entry *unattached_dst = NULL;
+       struct tnl_cache *cache;
+       int sent_len = 0;
+       __be16 frag_off;
+       u8 ttl;
+       u8 inner_tos;
+       u8 tos;
 
        /* Validate the protocol headers before we try to use them. */
        if (skb->protocol == htons(ETH_P_8021Q)) {
@@ -746,147 +1148,164 @@ int tnl_send(struct vport *vport, struct sk_buff *skb)
        }
 
        if (skb->protocol == htons(ETH_P_IP)) {
-               if (unlikely(!pskb_may_pull(skb, skb_network_header(skb)
-                   + sizeof(struct iphdr) - skb->data)))
+               if (unlikely(!pskb_may_pull(skb, skb_network_offset(skb)
+                   + sizeof(struct iphdr))))
                        skb->protocol = 0;
        }
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
-               if (unlikely(!pskb_may_pull(skb, skb_network_header(skb)
-                   + sizeof(struct ipv6hdr) - skb->data)))
+               if (unlikely(!pskb_may_pull(skb, skb_network_offset(skb)
+                   + sizeof(struct ipv6hdr))))
                        skb->protocol = 0;
        }
 #endif
-       old_iph = ip_hdr(skb);
 
-       iph.tos = mutable->port_config.tos;
-       if (mutable->port_config.flags & TNL_F_TOS_INHERIT) {
-               if (skb->protocol == htons(ETH_P_IP))
-                       iph.tos = old_iph->tos;
+       /* ToS */
+       if (skb->protocol == htons(ETH_P_IP))
+               inner_tos = ip_hdr(skb)->tos;
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-               else if (skb->protocol == htons(ETH_P_IPV6))
-                       iph.tos = ipv6_get_dsfield(ipv6_hdr(skb));
+       else if (skb->protocol == htons(ETH_P_IPV6))
+               inner_tos = ipv6_get_dsfield(ipv6_hdr(skb));
 #endif
-       }
-       iph.tos = ecn_encapsulate(iph.tos, skb);
+       else
+               inner_tos = 0;
 
-       {
-               struct flowi fl = { .nl_u = { .ip4_u =
-                                             { .daddr = mutable->port_config.daddr,
-                                               .saddr = mutable->port_config.saddr,
-                                               .tos = RT_TOS(iph.tos) } },
-                                   .proto = tnl_vport->tnl_ops->ipproto };
+       if (mutable->port_config.flags & TNL_F_TOS_INHERIT)
+               tos = inner_tos;
+       else
+               tos = mutable->port_config.tos;
 
-               if (unlikely(ip_route_output_key(&init_net, &rt, &fl)))
-                       goto error_free;
+       tos = INET_ECN_encapsulate(tos, inner_tos);
+
+       /* Route lookup */
+       rt = find_route(vport, mutable, tos, &cache);
+       if (unlikely(!rt))
+               goto error_free;
+       if (unlikely(!cache))
+               unattached_dst = &rt_dst(rt);
+
+       /* Reset SKB */
+       nf_reset(skb);
+       secpath_reset(skb);
+       skb_dst_drop(skb);
+
+       /* Offloading */
+       skb = handle_offloads(skb, mutable, rt);
+       if (unlikely(IS_ERR(skb)))
+               goto error;
+
+       /* MTU */
+       if (unlikely(!check_mtu(skb, vport, mutable, rt, &frag_off))) {
+               err = VPORT_E_TX_DROPPED;
+               goto error_free;
        }
 
-       iph.ttl = mutable->port_config.ttl;
+       /*
+        * If we are over the MTU, allow the IP stack to handle fragmentation.
+        * Fragmentation is a slow path anyways.
+        */
+       if (unlikely(skb->len + mutable->tunnel_hlen > dst_mtu(&rt_dst(rt)) &&
+                    cache)) {
+               unattached_dst = &rt_dst(rt);
+               dst_hold(unattached_dst);
+               cache = NULL;
+       }
+
+       /* TTL */
+       ttl = mutable->port_config.ttl;
+       if (!ttl)
+               ttl = dst_metric(&rt_dst(rt), RTAX_HOPLIMIT);
+
        if (mutable->port_config.flags & TNL_F_TTL_INHERIT) {
                if (skb->protocol == htons(ETH_P_IP))
-                       iph.ttl = old_iph->ttl;
+                       ttl = ip_hdr(skb)->ttl;
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
                else if (skb->protocol == htons(ETH_P_IPV6))
-                       iph.ttl = ipv6_hdr(skb)->hop_limit;
+                       ttl = ipv6_hdr(skb)->hop_limit;
 #endif
        }
-       if (!iph.ttl)
-               iph.ttl = dst_metric(&rt_dst(rt), RTAX_HOPLIMIT);
 
-       iph.frag_off = (mutable->port_config.flags & TNL_F_PMTUD) ? htons(IP_DF) : 0;
-       if (iph.frag_off)
-               mtu = dst_mtu(&rt_dst(rt))
-                       - ETH_HLEN
-                       - mutable->tunnel_hlen
-                       - (eth_hdr(skb)->h_proto == htons(ETH_P_8021Q) ? VLAN_HLEN : 0);
-       else
-               mtu = mutable->mtu;
+       while (skb) {
+               struct iphdr *iph;
+               struct sk_buff *next_skb = skb->next;
+               skb->next = NULL;
 
-       if (skb->protocol == htons(ETH_P_IP)) {
-               iph.frag_off |= old_iph->frag_off & htons(IP_DF);
-               mtu = max(mtu, IP_MIN_MTU);
-       }
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-       else if (skb->protocol == htons(ETH_P_IPV6))
-               mtu = max(mtu, IPV6_MIN_MTU);
-#endif
+               if (likely(cache)) {
+                       skb_push(skb, cache->len);
+                       memcpy(skb->data, get_cached_header(cache), cache->len);
+                       skb_reset_mac_header(skb);
+                       skb_set_network_header(skb, rt_dst(rt).hh->hh_len);
 
-       iph.version = 4;
-       iph.ihl = sizeof(struct iphdr) >> 2;
-       iph.protocol = tnl_vport->tnl_ops->ipproto;
-       iph.daddr = rt->rt_dst;
-       iph.saddr = rt->rt_src;
+               } else {
+                       skb_push(skb, mutable->tunnel_hlen);
+                       create_tunnel_header(vport, mutable, rt, skb->data);
+                       skb_reset_network_header(skb);
 
-       nf_reset(skb);
-       secpath_reset(skb);
-       skb_dst_drop(skb);
-       skb_dst_set(skb, &rt_dst(rt));
+                       if (next_skb)
+                               skb_dst_set(skb, dst_clone(unattached_dst));
+                       else {
+                               skb_dst_set(skb, unattached_dst);
+                               unattached_dst = NULL;
+                       }
 
-       /*
-        * If we are doing GSO on a pskb it is better to make sure that the
-        * headroom is correct now.  We will only have to copy the portion in
-        * the linear data area and GSO will preserve headroom when it creates
-        * the segments.  This is particularly beneficial on Xen where we get
-        * lots of GSO pskbs.  Conversely, we delay copying if it is just to
-        * get our own writable clone because GSO may do the copy for us.
-        */
-       max_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
-                       + mutable->tunnel_hlen;
 
-       if (skb_headroom(skb) < max_headroom) {
-               skb = check_headroom(skb, max_headroom);
-               if (unlikely(IS_ERR(skb))) {
-                       vport_record_error(vport, VPORT_E_TX_DROPPED);
-                       goto error;
+                       memset(&IPCB(skb)->opt, 0, sizeof(IPCB(skb)->opt));
+                       IPCB(skb)->flags = 0;
                }
-       }
+               skb_set_transport_header(skb, skb_network_offset(skb) + sizeof(struct iphdr));
 
-       forward_ip_summed(skb);
+               iph = ip_hdr(skb);
+               iph->tos = tos;
+               iph->ttl = ttl;
+               iph->frag_off = frag_off;
+               ip_select_ident(iph, &rt_dst(rt), NULL);
 
-       if (unlikely(vswitch_skb_checksum_setup(skb)))
-               goto error_free;
+               skb = tnl_vport->tnl_ops->update_header(vport, mutable, &rt_dst(rt), skb);
+               if (unlikely(!skb))
+                       goto next;
 
-       skb = handle_gso(skb);
-       if (unlikely(IS_ERR(skb))) {
-               vport_record_error(vport, VPORT_E_TX_DROPPED);
-               goto error;
-       }
+               if (likely(cache)) {
+                       int orig_len = skb->len - cache->len;
+                       struct vport *cache_vport = internal_dev_get_vport(rt_dst(rt).dev);
 
-       /*
-        * Process GSO segments.  Try to do any work for the entire packet that
-        * doesn't involve actually writing to it before this point.
-        */
-       orig_len = 0;
-       do {
-               struct sk_buff *next_skb = skb->next;
-               skb->next = NULL;
+                       skb->protocol = htons(ETH_P_IP);
+
+                       iph->tot_len = htons(skb->len - skb_network_offset(skb));
+                       ip_send_check(iph);
 
-               orig_len += build_packet(vport, mutable, &iph, rt, max_headroom, mtu, skb);
+                       if (likely(cache_vport)) {
+                               OVS_CB(skb)->flow = cache->flow;
+                               compute_ip_summed(skb, true);
+                               vport_receive(cache_vport, skb);
+                               sent_len += orig_len;
+                       } else {
+                               int err;
 
+                               skb->dev = rt_dst(rt).dev;
+                               err = dev_queue_xmit(skb);
+
+                               if (likely(net_xmit_eval(err) == 0))
+                                       sent_len += orig_len;
+                       }
+               } else
+                       sent_len += send_frags(skb, mutable);
+
+next:
                skb = next_skb;
-       } while (skb);
+       }
 
-       if (unlikely(orig_len == 0))
+       if (unlikely(sent_len == 0))
                vport_record_error(vport, VPORT_E_TX_DROPPED);
 
-       return orig_len;
+       goto out;
 
 error_free:
-       kfree_skb(skb);
-       vport_record_error(vport, VPORT_E_TX_ERROR);
+       tnl_free_linked_skbs(skb);
 error:
-       return 0;
-}
-
-int tnl_init(void)
-{
-       return 0;
-}
-
-void tnl_exit(void)
-{
-       tbl_destroy(port_table, NULL);
-       port_table = NULL;
+       dst_release(unattached_dst);
+       vport_record_error(vport, err);
+out:
+       return sent_len;
 }
 
 static int set_config(const void __user *uconfig, const struct tnl_ops *tnl_ops,
@@ -899,15 +1318,18 @@ static int set_config(const void __user *uconfig, const struct tnl_ops *tnl_ops,
        if (copy_from_user(&mutable->port_config, uconfig, sizeof(struct tnl_port_config)))
                return -EFAULT;
 
+       if (mutable->port_config.daddr == 0)
+               return -EINVAL;
+
+       if (mutable->port_config.tos != RT_TOS(mutable->port_config.tos))
+               return -EINVAL;
+
        mutable->tunnel_hlen = tnl_ops->hdr_len(&mutable->port_config);
        if (mutable->tunnel_hlen < 0)
                return mutable->tunnel_hlen;
 
        mutable->tunnel_hlen += sizeof(struct iphdr);
 
-       if (mutable->port_config.daddr == 0)
-               return -EINVAL;
-
        mutable->tunnel_type = tnl_ops->tunnel_type;
        if (mutable->port_config.flags & TNL_F_IN_KEY_MATCH) {
                mutable->tunnel_type |= TNL_T_KEY_MATCH;
@@ -950,7 +1372,7 @@ struct vport *tnl_create(const char *name, const void __user *config,
        strcpy(tnl_vport->name, name);
        tnl_vport->tnl_ops = tnl_ops;
 
-       tnl_vport->mutable = kmalloc(sizeof(struct tnl_mutable_config), GFP_KERNEL);
+       tnl_vport->mutable = kzalloc(sizeof(struct tnl_mutable_config), GFP_KERNEL);
        if (!tnl_vport->mutable) {
                err = -ENOMEM;
                goto error_free_vport;
@@ -966,6 +1388,13 @@ struct vport *tnl_create(const char *name, const void __user *config,
        if (err)
                goto error_free_mutable;
 
+       spin_lock_init(&tnl_vport->cache_lock);
+
+#ifdef NEED_CACHE_TIMEOUT
+       tnl_vport->cache_exp_interval = MAX_CACHE_EXP -
+                                       (net_random() % (MAX_CACHE_EXP / 2));
+#endif
+
        err = add_port(vport);
        if (err)
                goto error_free_mutable;
@@ -985,7 +1414,6 @@ int tnl_modify(struct vport *vport, const void __user *config)
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        struct tnl_mutable_config *mutable;
        int err;
-       bool update_hash = false;
 
        mutable = kmemdup(tnl_vport->mutable, sizeof(struct tnl_mutable_config), GFP_KERNEL);
        if (!mutable) {
@@ -997,35 +1425,11 @@ int tnl_modify(struct vport *vport, const void __user *config)
        if (err)
                goto error_free;
 
-       /*
-        * Only remove the port from the hash table if something that would
-        * affect the lookup has changed.
-        */
-       if (tnl_vport->mutable->port_config.saddr != mutable->port_config.saddr ||
-           tnl_vport->mutable->port_config.daddr != mutable->port_config.daddr ||
-           tnl_vport->mutable->port_config.in_key != mutable->port_config.in_key ||
-           (tnl_vport->mutable->port_config.flags & TNL_F_IN_KEY_MATCH) !=
-           (mutable->port_config.flags & TNL_F_IN_KEY_MATCH))
-               update_hash = true;
-
-
-       /*
-        * This update is not atomic but the lookup uses the config, which
-        * serves as an inherent double check.
-        */
-       if (update_hash) {
-               err = del_port(vport);
-               if (err)
-                       goto error_free;
-       }
-
-       assign_config_rcu(vport, mutable);
+       mutable->seq++;
 
-       if (update_hash) {
-               err = add_port(vport);
-               if (err)
-                       goto error_free;
-       }
+       err = move_port(vport, mutable);
+       if (err)
+               goto error_free;
 
        return 0;
 
@@ -1035,10 +1439,14 @@ error:
        return err;
 }
 
-static void free_port(struct rcu_head *rcu)
+static void free_port_rcu(struct rcu_head *rcu)
 {
        struct tnl_vport *tnl_vport = container_of(rcu, struct tnl_vport, rcu);
 
+       spin_lock_bh(&tnl_vport->cache_lock);
+       free_cache(tnl_vport->cache);
+       spin_unlock_bh(&tnl_vport->cache_lock);
+
        kfree(tnl_vport->mutable);
        vport_free(tnl_vport_to_vport(tnl_vport));
 }
@@ -1055,7 +1463,7 @@ int tnl_destroy(struct vport *vport)
            &old_mutable))
                del_port(vport);
 
-       call_rcu(&tnl_vport->rcu, free_port);
+       call_rcu(&tnl_vport->rcu, free_port_rcu);
 
        return 0;
 }
@@ -1090,7 +1498,6 @@ int tnl_set_addr(struct vport *vport, const unsigned char *addr)
        return 0;
 }
 
-
 const char *tnl_get_name(const struct vport *vport)
 {
        const struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
@@ -1108,3 +1515,15 @@ int tnl_get_mtu(const struct vport *vport)
        const struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
        return rcu_dereference(tnl_vport->mutable)->mtu;
 }
+
+void tnl_free_linked_skbs(struct sk_buff *skb)
+{
+       if (unlikely(!skb))
+               return;
+
+       while (skb) {
+               struct sk_buff *next = skb->next;
+               kfree_skb(skb);
+               skb = next;
+       }
+}
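
For illustration, a minimal user-space analogue of the locking rule used by
cache_cleaner_cb() above: a stale cache is torn down only if the per-port
lock can be taken without waiting, so the cleaner never delays the transmit
fast path.  The struct and names below are illustrative, not the datapath's.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct port {
        pthread_mutex_t cache_lock;
        void *cache;            /* NULL when no header is cached */
        bool cache_stale;       /* stand-in for !check_cache_valid() */
};

static void cleaner_pass(struct port *ports, size_t n)
{
        for (size_t i = 0; i < n; i++) {
                struct port *p = &ports[i];

                if (!p->cache || !p->cache_stale)
                        continue;
                /* Skip the port if the transmit path holds the lock. */
                if (pthread_mutex_trylock(&p->cache_lock) != 0)
                        continue;
                free(p->cache);
                p->cache = NULL;
                pthread_mutex_unlock(&p->cache_lock);
                printf("freed stale cache on port %zu\n", i);
        }
}

int main(void)
{
        struct port ports[2];

        for (size_t i = 0; i < 2; i++) {
                pthread_mutex_init(&ports[i].cache_lock, NULL);
                ports[i].cache = malloc(64);
                ports[i].cache_stale = (i == 1);   /* only port 1 is stale */
        }
        cleaner_pass(ports, 2);                    /* frees port 1's cache only */
        for (size_t i = 0; i < 2; i++)
                free(ports[i].cache);
        return 0;
}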
index 37874c57cf3d6cc9e6718a806c756573526dd339..8ffb7bf544446f9b108b683f1ba3aa154ff693d9 100644 (file)
@@ -9,6 +9,9 @@
 #ifndef TUNNEL_H
 #define TUNNEL_H 1
 
+#include <linux/version.h>
+
+#include "flow.h"
 #include "openvswitch/tunnel.h"
 #include "table.h"
 #include "vport.h"
 #define IP_MIN_MTU 68
 
 /*
- * One of these goes in your struct tnl_ops and in tnl_find_port().
+ * One of these goes in struct tnl_ops and in tnl_find_port().
  * These values are in the same namespace as other TNL_T_* values, so
- * you have only the first 10 bits to define protocol identifiers.
+ * only the least significant 10 bits are available to define protocol
+ * identifiers.
  */
 #define TNL_T_PROTO_GRE                0
 #define TNL_T_PROTO_CAPWAP     1
 
-/* You only need these flags when you are calling tnl_find_port(). */
+/* These flags are only needed when calling tnl_find_port(). */
 #define TNL_T_KEY_EXACT                (1 << 10)
 #define TNL_T_KEY_MATCH                (1 << 11)
 #define TNL_T_KEY_EITHER       (TNL_T_KEY_EXACT | TNL_T_KEY_MATCH)
 struct tnl_mutable_config {
        struct rcu_head rcu;
 
-       unsigned char eth_addr[ETH_ALEN];
-       unsigned int mtu;
-       struct tnl_port_config port_config;
+       unsigned seq;           /* Sequence number to identify this config. */
 
-       /* Set of TNL_T_* flags that define the category for lookup. */
-       u32 tunnel_type;
+       u32 tunnel_type;        /* Set of TNL_T_* flags that define lookup. */
+       unsigned tunnel_hlen;   /* Tunnel header length. */
+
+       unsigned char eth_addr[ETH_ALEN];
+       unsigned mtu;
 
-       int tunnel_hlen; /* Tunnel header length. */
+       struct tnl_port_config port_config;
 };
 
 struct tnl_ops {
-       /* Put your TNL_T_PROTO_* type in here. */
-       u32 tunnel_type;
-       u8 ipproto;
+       u32 tunnel_type;        /* Put the TNL_T_PROTO_* type in here. */
+       u8 ipproto;             /* The IP protocol for the tunnel. */
 
        /*
-        * Returns the length of the tunnel header you will add in
+        * Returns the length of the tunnel header that will be added in
         * build_header() (i.e. excludes the IP header).  Returns a negative
         * error code if the configuration is invalid.
         */
        int (*hdr_len)(const struct tnl_port_config *);
 
        /*
-        * Returns a linked list of SKBs with tunnel headers (multiple
-        * packets may be generated in the event of fragmentation).  Space
-        * will have already been allocated at the start of the packet equal
-        * to sizeof(struct iphdr) + value returned by hdr_len().  The IP
-        * header will have already been constructed.
+        * Builds the static portion of the tunnel header, which is stored in
+        * the header cache.  In general the performance of this function is
+        * not too important as we try to only call it when building the cache
+        * so it is preferable to shift as much work as possible here.  However,
+        * in some circumstances caching is disabled and this function will be
+        * called for every packet, so try not to make it too slow.
+        */
+       void (*build_header)(const struct vport *,
+                            const struct tnl_mutable_config *, void *header);
+
+       /*
+        * Updates the cached header of a packet to match the actual packet
+        * data.  Typical things that might need to be updated are length,
+        * checksum, etc.  The IP header will have already been updated and this
+        * is the final step before transmission.  Returns a linked list of
+        * completed SKBs (multiple packets may be generated in the event
+        * of fragmentation).
+        */
+       struct sk_buff *(*update_header)(const struct vport *,
+                                        const struct tnl_mutable_config *,
+                                        struct dst_entry *, struct sk_buff *);
+};
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+/*
+ * On these kernels we have a fast mechanism to tell if the ARP cache for a
+ * particular destination has changed.
+ */
+#define HAVE_HH_SEQ
+#endif
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,27)
+/*
+ * On these kernels we have a fast mechanism to tell if the routing table
+ * has changed.
+ */
+#define HAVE_RT_GENID
+#endif
+#if !defined(HAVE_HH_SEQ) || !defined(HAVE_RT_GENID)
+/* If we can't detect all system changes directly we need to use a timeout. */
+#define NEED_CACHE_TIMEOUT
+#endif
+struct tnl_cache {
+       struct rcu_head rcu;
+
+       int len;                /* Length of data to be memcpy'd from cache. */
+
+       /* Sequence number of mutable->seq from which this cache was generated. */
+       unsigned mutable_seq;
+
+#ifdef HAVE_HH_SEQ
+       /*
+        * The sequence number from the seqlock protecting the hardware header
+        * cache (in the ARP cache).  Since every write increments the counter
+        * this gives us an easy way to tell if it has changed.
+        */
+       unsigned hh_seq;
+#endif
+
+#ifdef NEED_CACHE_TIMEOUT
+       /*
+        * If we don't have direct mechanisms to detect all important changes in
+        * the system fall back to an expiration time.  This expiration time
+        * can be relatively short since at high rates there will be millions of
+        * packets per second, so we'll still get plenty of benefit from the
+        * cache.  Note that if something changes we may blackhole packets
+        * until the expiration time (depending on what changed and the kernel
+        * version we may be able to detect the change sooner).  Expiration is
+        * expressed as a time in jiffies.
         */
-       struct sk_buff *(*build_header)(struct sk_buff *,
-                                       const struct vport *,
-                                       const struct tnl_mutable_config *,
-                                       struct dst_entry *);
+       unsigned long expiration;
+#endif
+
+       /*
+        * The routing table entry that is the result of looking up the tunnel
+        * endpoints.  It also contains a sequence number (called a generation
+        * ID) that can be compared to a global sequence to tell if the routing
+        * table has changed (and therefore there is a potential that this
+        * cached route has been invalidated).
+        */
+       struct rtable *rt;
+
+       /*
+        * If the output device for tunnel traffic is an OVS internal device,
+        * the flow of that datapath.  Since all tunnel traffic will have the
+        * same headers this allows us to cache the flow lookup.  NULL if the
+        * output device is not OVS or if there is no flow installed.
+        */
+       struct sw_flow *flow;
+
+       /* The cached header follows after padding for alignment. */
 };
 
 struct tnl_vport {
@@ -77,14 +161,29 @@ struct tnl_vport {
        char name[IFNAMSIZ];
        const struct tnl_ops *tnl_ops;
 
-       /* Protected by RCU. */
-       struct tnl_mutable_config *mutable;
+       struct tnl_mutable_config *mutable;     /* Protected by RCU. */
 
+       /*
+        * ID of last fragment sent (for tunnel protocols with direct support
+        * for fragmentation).  If the protocol relies on IP fragmentation then
+        * this is not needed.
+        */
        atomic_t frag_id;
+
+       spinlock_t cache_lock;
+       struct tnl_cache *cache;                /* Protected by RCU/cache_lock. */
+
+#ifdef NEED_CACHE_TIMEOUT
+       /*
+        * If we must rely on expiration time to invalidate the cache, this is
+        * the interval.  It is randomized within a range (defined by
+        * MAX_CACHE_EXP in tunnel.c) to avoid synchronized expirations caused
+        * by creation of a large number of tunnels at one time.
+        */
+       unsigned long cache_exp_interval;
+#endif
 };
 
-int tnl_init(void);
-void tnl_exit(void);
 struct vport *tnl_create(const char *name, const void __user *config,
                         const struct vport_ops *,
                         const struct tnl_ops *);
@@ -104,10 +203,12 @@ struct vport *tnl_find_port(__be32 saddr, __be32 daddr, __be32 key,
 bool tnl_frag_needed(struct vport *vport,
                     const struct tnl_mutable_config *mutable,
                     struct sk_buff *skb, unsigned int mtu, __be32 flow_key);
+void tnl_free_linked_skbs(struct sk_buff *skb);
 
 static inline struct tnl_vport *tnl_vport_priv(const struct vport *vport)
 {
        return vport_priv(vport);
 }
 
+
 #endif /* tunnel.h */
index 7ae3790d76fae949307ecffaae7c56aca6a6f000..bf1465fc0b5d0dbc1a4c9893645afddb3886afa9 100644 (file)
@@ -128,24 +128,32 @@ static int capwap_hdr_len(const struct tnl_port_config *port_config)
        return CAPWAP_HLEN;
 }
 
-static struct sk_buff *capwap_build_header(struct sk_buff *skb,
-                                          const struct vport *vport,
-                                          const struct tnl_mutable_config *mutable,
-                                          struct dst_entry *dst)
+static void capwap_build_header(const struct vport *vport,
+                               const struct tnl_mutable_config *mutable,
+                               void *header)
 {
-       struct udphdr *udph = udp_hdr(skb);
-       struct capwaphdr *cwh = capwap_hdr(skb);
+       struct udphdr *udph = header;
+       struct capwaphdr *cwh = (struct capwaphdr *)(udph + 1);
 
        udph->source = htons(CAPWAP_SRC_PORT);
        udph->dest = htons(CAPWAP_DST_PORT);
-       udph->len = htons(skb->len - sizeof(struct iphdr));
        udph->check = 0;
 
        cwh->begin = NO_FRAG_HDR;
        cwh->frag_id = 0;
        cwh->frag_off = 0;
+}
+
+static struct sk_buff *capwap_update_header(const struct vport *vport,
+                                           const struct tnl_mutable_config *mutable,
+                                           struct dst_entry *dst,
+                                           struct sk_buff *skb)
+{
+       struct udphdr *udph = udp_hdr(skb);
 
-       if (unlikely(skb->len > dst_mtu(dst)))
+       udph->len = htons(skb->len - skb_transport_offset(skb));
+
+       if (unlikely(skb->len - skb_network_offset(skb) > dst_mtu(dst)))
                skb = fragment(skb, vport, dst);
 
        return skb;
@@ -209,6 +217,7 @@ struct tnl_ops capwap_tnl_ops = {
        .ipproto        = IPPROTO_UDP,
        .hdr_len        = capwap_hdr_len,
        .build_header   = capwap_build_header,
+       .update_header  = capwap_update_header,
 };
 
 static struct vport *capwap_create(const char *name, const void __user *config)
@@ -241,7 +250,7 @@ static int capwap_init(void)
 
        defrag_init();
 
-       return tnl_init();
+       return 0;
 
 error_sock:
        sock_release(capwap_rcv_socket);
@@ -252,7 +261,6 @@ error:
 
 static void capwap_exit(void)
 {
-       tnl_exit();
        defrag_exit();
        sock_release(capwap_rcv_socket);
 }
@@ -282,17 +290,19 @@ static struct sk_buff *fragment(struct sk_buff *skb, const struct vport *vport,
                                struct dst_entry *dst)
 {
        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
-       unsigned int hlen = sizeof(struct iphdr) + CAPWAP_HLEN;
-       unsigned int headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len;
+       unsigned int hlen = skb_transport_offset(skb) + CAPWAP_HLEN;
+       unsigned int headroom;
+       unsigned int max_frame_len = dst_mtu(dst) + skb_network_offset(skb);
        struct sk_buff *result = NULL, *list_cur = NULL;
        unsigned int remaining;
        unsigned int offset;
        __be16 frag_id;
 
-       if (hlen + ~FRAG_OFF_MASK + 1 > dst_mtu(dst)) {
+       if (hlen + ~FRAG_OFF_MASK + 1 > max_frame_len) {
                if (net_ratelimit())
                        pr_warn("capwap link mtu (%d) is less than minimum packet (%d)\n",
-                               dst_mtu(dst), hlen + ~FRAG_OFF_MASK + 1);
+                               dst_mtu(dst),
+                               hlen - skb_network_offset(skb) + ~FRAG_OFF_MASK + 1);
                goto error;
        }
 
@@ -300,14 +310,17 @@ static struct sk_buff *fragment(struct sk_buff *skb, const struct vport *vport,
        offset = 0;
        frag_id = htons(atomic_inc_return(&tnl_vport->frag_id));
 
+       headroom = dst->header_len + 16;
+       if (!skb_network_offset(skb))
+               headroom += LL_RESERVED_SPACE(dst->dev);
+
        while (remaining) {
                struct sk_buff *skb2;
                int frag_size;
-               struct iphdr *iph;
                struct udphdr *udph;
                struct capwaphdr *cwh;
 
-               frag_size = min(remaining, dst_mtu(dst) - hlen);
+               frag_size = min(remaining, max_frame_len - hlen);
                if (remaining > frag_size)
                        frag_size &= FRAG_OFF_MASK;
 
@@ -317,23 +330,22 @@ static struct sk_buff *fragment(struct sk_buff *skb, const struct vport *vport,
 
                skb_reserve(skb2, headroom);
                __skb_put(skb2, hlen + frag_size);
-               skb_reset_network_header(skb2);
-               skb_set_transport_header(skb2, sizeof(struct iphdr));
 
-               /* Copy IP/UDP/CAPWAP header. */
+               if (skb_network_offset(skb))
+                       skb_reset_mac_header(skb2);
+               skb_set_network_header(skb2, skb_network_offset(skb));
+               skb_set_transport_header(skb2, skb_transport_offset(skb));
+
+               /* Copy (Ethernet)/IP/UDP/CAPWAP header. */
                copy_skb_metadata(skb, skb2);
-               skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
+               skb_copy_from_linear_data(skb, skb2->data, hlen);
 
                /* Copy this data chunk. */
                if (skb_copy_bits(skb, hlen + offset, skb2->data + hlen, frag_size))
                        BUG();
 
-               iph = ip_hdr(skb2);
-               iph->tot_len = hlen + frag_size;
-               ip_send_check(iph);
-
                udph = udp_hdr(skb2);
-               udph->len = htons(skb2->len - sizeof(struct iphdr));
+               udph->len = htons(skb2->len - skb_transport_offset(skb2));
 
                cwh = capwap_hdr(skb2);
                if (remaining > frag_size)
@@ -356,11 +368,7 @@ static struct sk_buff *fragment(struct sk_buff *skb, const struct vport *vport,
        goto out;
 
 error:
-       while (result) {
-               list_cur = result->next;
-               kfree_skb(result);
-               result = list_cur;
-       }
+       tnl_free_linked_skbs(result);
 out:
        kfree_skb(skb);
        return result;
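
The open-coded error-path loop that this hunk removes is replaced by a call to the new tnl_free_linked_skbs() helper declared in tunnel.h.  Based on the loop it replaces, a plausible sketch of that helper is shown below; the body is illustrative rather than copied from the patch.

    void tnl_free_linked_skbs(struct sk_buff *skb)
    {
    	/* Walk the singly linked chain of fragments, freeing each one. */
    	while (skb) {
    		struct sk_buff *next = skb->next;

    		kfree_skb(skb);
    		skb = next;
    	}
    }
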
index 0a7092f96646c885a4920acaeb2a570b8c60d857..be8fb5343d2f1542b000bddd6a2070c10af35071 100644 (file)
@@ -50,41 +50,49 @@ static int gre_hdr_len(const struct tnl_port_config *port_config)
        return len;
 }
 
-static struct sk_buff *gre_build_header(struct sk_buff *skb,
-                                       const struct vport *vport,
-                                       const struct tnl_mutable_config *mutable,
-                                       struct dst_entry *dst)
+static void gre_build_header(const struct vport *vport,
+                            const struct tnl_mutable_config *mutable,
+                            void *header)
 {
-       struct gre_base_hdr *greh = (struct gre_base_hdr *)skb_transport_header(skb);
-       __be32 *options = (__be32 *)(skb_network_header(skb) + mutable->tunnel_hlen
-                                              - GRE_HEADER_SECTION);
+       struct gre_base_hdr *greh = header;
+       __be32 *options = (__be32 *)(greh + 1);
 
        greh->protocol = htons(ETH_P_TEB);
        greh->flags = 0;
 
-       /* Work backwards over the options so the checksum is last. */
+       if (mutable->port_config.flags & TNL_F_CSUM) {
+               greh->flags |= GRE_CSUM;
+               *options = 0;
+               options++;
+       }
+
        if (mutable->port_config.out_key ||
-           mutable->port_config.flags & TNL_F_OUT_KEY_ACTION) {
+           mutable->port_config.flags & TNL_F_OUT_KEY_ACTION)
                greh->flags |= GRE_KEY;
 
-               if (mutable->port_config.flags & TNL_F_OUT_KEY_ACTION)
-                       *options = OVS_CB(skb)->tun_id;
-               else
-                       *options = mutable->port_config.out_key;
+       if (mutable->port_config.out_key)
+               *options = mutable->port_config.out_key;
+}
+
+static struct sk_buff *gre_update_header(const struct vport *vport,
+                                        const struct tnl_mutable_config *mutable,
+                                        struct dst_entry *dst,
+                                        struct sk_buff *skb)
+{
+       __be32 *options = (__be32 *)(skb_network_header(skb) + mutable->tunnel_hlen
+                                              - GRE_HEADER_SECTION);
 
+       /* Work backwards over the options so the checksum is last. */
+       if (mutable->port_config.flags & TNL_F_OUT_KEY_ACTION) {
+               *options = OVS_CB(skb)->tun_id;
                options--;
        }
 
-       if (mutable->port_config.flags & TNL_F_CSUM) {
-               greh->flags |= GRE_CSUM;
-
-               *options = 0;
+       if (mutable->port_config.flags & TNL_F_CSUM)
                *(__sum16 *)options = csum_fold(skb_checksum(skb,
-                                               sizeof(struct iphdr),
-                                               skb->len - sizeof(struct iphdr),
+                                               skb_transport_offset(skb),
+                                               skb->len - skb_transport_offset(skb),
                                                0));
-       }
-
        /*
         * Allow our local IP stack to fragment the outer packet even if the
         * DF bit is set as a last resort.
@@ -329,6 +337,7 @@ struct tnl_ops gre_tnl_ops = {
        .ipproto        = IPPROTO_GRE,
        .hdr_len        = gre_hdr_len,
        .build_header   = gre_build_header,
+       .update_header  = gre_update_header,
 };
 
 static struct vport *gre_create(const char *name, const void __user *config)
@@ -346,20 +355,14 @@ static int gre_init(void)
        int err;
 
        err = inet_add_protocol(&gre_protocol_handlers, IPPROTO_GRE);
-       if (err) {
+       if (err)
                pr_warn("cannot register gre protocol handler\n");
-               goto out;
-       }
-
-       err = tnl_init();
 
-out:
        return err;
 }
 
 static void gre_exit(void)
 {
-       tnl_exit();
        inet_del_protocol(&gre_protocol_handlers, IPPROTO_GRE);
 }
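
Taken together, the GRE changes above show the split this patch introduces for caching: build_header() fills in the fields that are identical for every packet to a destination (protocol, static flags, a configured key, a zeroed checksum slot), so its output can be cached, while update_header() patches the per-packet fields (an action-supplied key and the actual checksum) just before transmission.  A rough sketch of how a generic transmit path might consume the two hooks follows; build_or_reuse_header(), cache_data(), and cache->hdr_len are assumed names for illustration, not the patch's code.

    /* Illustrative sketch only: cache_data() and cache->hdr_len are assumptions. */
    static struct sk_buff *build_or_reuse_header(struct vport *vport,
    					     const struct tnl_mutable_config *mutable,
    					     struct dst_entry *dst,
    					     struct sk_buff *skb)
    {
    	struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
    	struct tnl_cache *cache = rcu_dereference(tnl_vport->cache);

    	if (cache) {
    		/* Fast path: reuse the destination-invariant header bytes
    		 * assembled once when the cache entry was created. */
    		memcpy(skb_transport_header(skb), cache_data(cache),
    		       cache->hdr_len);
    	} else {
    		/* Slow path: assemble the destination-invariant fields now. */
    		tnl_vport->tnl_ops->build_header(vport, mutable,
    						 skb_transport_header(skb));
    	}

    	/* Per-packet fields (lengths, action-supplied keys, checksums) are
    	 * always refreshed just before the packet is sent. */
    	return tnl_vport->tnl_ops->update_header(vport, mutable, dst, skb);
    }
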
 
index 373797513333f9bb0e077cf42d48664511b4a0fa..dd700d0dc006960a7e49d44600c55c68523a8314 100644 (file)
@@ -48,6 +48,7 @@
 #define TNL_F_TOS_INHERIT      (1 << 4) /* Inherit the ToS from the inner packet. */
 #define TNL_F_TTL_INHERIT      (1 << 5) /* Inherit the TTL from the inner packet. */
 #define TNL_F_PMTUD            (1 << 6) /* Enable path MTU discovery. */
+#define TNL_F_HDR_CACHE                (1 << 7) /* Enable tunnel header caching. */
 
 struct tnl_port_config {
        __u32   flags;