Keep kernel flow stats for each NUMA node rather than for each
(logical) CPU. This avoids using the per-CPU allocator, removes most
of the kernel-side OVS locking overhead that otherwise sits at the top
of perf reports, and allows OVS to scale better with a higher number
of threads.
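
For reference, a minimal sketch of the data layout this implies; the
struct and field names here are illustrative and may not match the
patch exactly:

    /* Per-NUMA-node stats: one spinlock per node instead of per CPU. */
    struct flow_stats {
            u64 packet_count;       /* Packets matched by the flow. */
            u64 byte_count;         /* Bytes matched by the flow. */
            unsigned long used;     /* Last used time (in jiffies). */
            spinlock_t lock;        /* Guards the fields above. */
            __be16 tcp_flags;       /* Union of seen TCP flags. */
    };

    struct sw_flow {
            /* ... key, mask, actions, etc. ... */
            struct flow_stats __rcu *stats[];  /* One slot per NUMA
                                                * node; only stats[0]
                                                * is preallocated. */
    };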
With 9 handlers and 4 revalidators, the netperf TCP_CRR flow setup
rate doubles on a server with two hyper-threaded physical CPUs (16
logical cores each) compared to the current OVS master. Tested with a
non-trivial flow table that has a TCP port match rule forcing all new
connections with unique port numbers to OVS userspace. The IP
addresses are still wildcarded, so the kernel flows are not considered
exact-match 5-tuple flows. Flows of this type can be expected to
appear in large numbers as a result of the more effective wildcarding
made possible by improvements in the OVS userspace flow classifier.
There is a small increase in kernel spinlock overhead due to the same
spinlock being shared between multiple cores of the same physical CPU,
but it is barely visible in netperf TCP_CRR performance (maybe a ~1%
drop; hard to tell exactly due to variance in the test results) when
testing kernel module throughput (no userspace activity, only a
handful of kernel flows).
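
The lock sharing comes from each CPU locking the stats instance of its
own NUMA node in the update path, so sibling cores of one physical CPU
contend on one lock. A rough sketch of that path, assuming helpers
along these lines (function and field names are illustrative):

    /* Illustrative fast path; all CPUs on a node share stats->lock. */
    void flow_stats_update(struct sw_flow *flow, struct sk_buff *skb)
    {
            struct flow_stats *stats;

            /* Runs under rcu_read_lock() in packet processing. */
            stats = rcu_dereference(flow->stats[numa_node_id()]);
            if (likely(stats)) {
                    spin_lock(&stats->lock);
                    stats->used = jiffies;
                    stats->packet_count++;
                    stats->byte_count += skb->len;
                    spin_unlock(&stats->lock);
            }
            /* A missing node instance is allocated on first use; see
             * the sketch after the next paragraph. */
    }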
On flow setup, a single stats instance is allocated (for NUMA node
0). As CPUs from other NUMA nodes start updating stats, new NUMA-node
specific stats instances are allocated. This allocation on the packet
processing path is made to never block or dip into emergency memory
pools, minimizing the allocation latency. If the allocation fails, the
existing preallocated stats instance is used. Also, if only CPUs from
one NUMA node ever update the stats, no additional instances are
allocated. This eliminates the need to preallocate stats instances
that would never be used, and also relieves the stats reader from the
burden of reading stats that are never updated.
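
The never-blocking first-touch allocation described above could look
roughly like the following; the flow_stats_cache kmem cache and the
exact GFP flags are assumptions, though GFP_NOWAIT together with
__GFP_NOMEMALLOC matches the stated behavior of neither blocking nor
dipping into emergency pools:

    /* Illustrative slow path: first stats update from a new NUMA node.
     * Callers must serialize concurrent allocation attempts for the
     * same flow, e.g. while holding the stats[0] lock. */
    static void flow_stats_alloc_node(struct sw_flow *flow,
                                      struct sk_buff *skb, int node)
    {
            struct flow_stats *new_stats;

            new_stats = kmem_cache_alloc_node(flow_stats_cache,
                                              GFP_NOWAIT |
                                              __GFP_NOMEMALLOC,
                                              node);
            if (likely(new_stats)) {
                    spin_lock_init(&new_stats->lock);
                    new_stats->used = jiffies;
                    new_stats->packet_count = 1;
                    new_stats->byte_count = skb->len;
                    rcu_assign_pointer(flow->stats[node], new_stats);
            }
            /* On failure, keep updating the preallocated stats[0]
             * instance (not shown). */
    }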