smp: Avoid using two cache lines for struct call_single_data

author Ying Huang <ying.huang@intel.com>

Tue, 8 Aug 2017 04:30:00 +0000 (12:30 +0800)

committer Ingo Molnar <mingo@kernel.org>

Tue, 29 Aug 2017 13:14:38 +0000 (15:14 +0200)
author Ying Huang <ying.huang@intel.com>
Tue, 8 Aug 2017 04:30:00 +0000 (12:30 +0800)
committer Ingo Molnar <mingo@kernel.org>
Tue, 29 Aug 2017 13:14:38 +0000 (15:14 +0200)
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c

index 6bace7695788fbc3b7663aeb353a08a3c45503af..c7cbddfcdc3b88d07a7538611e4d377588384a92 100644 (file)
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -648,12 +648,12 @@ EXPORT_SYMBOL(flush_tlb_one);
  #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
  
  static DEFINE_PER_CPU(atomic_t, tick_broadcast_count);
-static DEFINE_PER_CPU(struct call_single_data, tick_broadcast_csd);
+static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd);
  
  void tick_broadcast(const struct cpumask *mask)
  {
         atomic_t *count;
-       struct call_single_data *csd;
+       call_single_data_t *csd;
         int cpu;
  
         for_each_cpu(cpu, mask) {
@@ -674,7 +674,7 @@ static void tick_broadcast_callee(void *info)
  
  static int __init tick_broadcast_init(void)
  {
-       struct call_single_data *csd;
+       call_single_data_t *csd;
         int cpu;
  
         for (cpu = 0; cpu < NR_CPUS; cpu++) {
diff --git a/block/blk-softirq.c b/block/blk-softirq.c

index 87b7df4851bffd06cc94c802267546651d15a11b..07125e7941f4a0dfe8138437180413caeac2a415 100644 (file)
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -60,7 +60,7 @@ static void trigger_softirq(void *data)
  static int raise_blk_irq(int cpu, struct request *rq)
  {
         if (cpu_online(cpu)) {
-               struct call_single_data *data = &rq->csd;
+               call_single_data_t *data = &rq->csd;
  
                 data->func = trigger_softirq;
                 data->info = rq;
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c

index 85c24cace9732fa62b67afd81c827437a04adfb9..81142ce781da78f564df58fb93f481e1be9fcea9 100644 (file)
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -13,7 +13,7 @@
  struct nullb_cmd {
         struct list_head list;
         struct llist_node ll_list;
-       struct call_single_data csd;
+       call_single_data_t csd;
         struct request *rq;
         struct bio *bio;
         unsigned int tag;
diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c

index 71e586d7df71722364f051ce0148479631c76fdb..147f38ea0fcd261a6eaccd11bf5e32939dee90c8 100644 (file)
--- a/drivers/cpuidle/coupled.c
+++ b/drivers/cpuidle/coupled.c
@@ -119,13 +119,13 @@ struct cpuidle_coupled {
  
  #define CPUIDLE_COUPLED_NOT_IDLE       (-1)
  
-static DEFINE_PER_CPU(struct call_single_data, cpuidle_coupled_poke_cb);
+static DEFINE_PER_CPU(call_single_data_t, cpuidle_coupled_poke_cb);
  
  /*
   * The cpuidle_coupled_poke_pending mask is used to avoid calling
- * __smp_call_function_single with the per cpu call_single_data struct already
+ * __smp_call_function_single with the per cpu call_single_data_t struct already
   * in use.  This prevents a deadlock where two cpus are waiting for each others
- * call_single_data struct to be available
+ * call_single_data_t struct to be available
   */
  static cpumask_t cpuidle_coupled_poke_pending;
  
@@ -339,7 +339,7 @@ static void cpuidle_coupled_handle_poke(void *info)
   */
  static void cpuidle_coupled_poke(int cpu)
  {
-       struct call_single_data *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu);
+       call_single_data_t *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu);
  
         if (!cpumask_test_and_set_cpu(cpu, &cpuidle_coupled_poke_pending))
                 smp_call_function_single_async(cpu, csd);
@@ -651,7 +651,7 @@ int cpuidle_coupled_register_device(struct cpuidle_device *dev)
  {
         int cpu;
         struct cpuidle_device *other_dev;
-       struct call_single_data *csd;
+       call_single_data_t *csd;
         struct cpuidle_coupled *coupled;
  
         if (cpumask_empty(&dev->coupled_cpus))
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c

index 51583ae4b1ebe6e6d441cf3a3d073702ac35e808..120b6e537b28f2665991f0bb203b47f64b398992 100644 (file)
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -2468,7 +2468,7 @@ static void liquidio_napi_drv_callback(void *arg)
         if (OCTEON_CN23XX_PF(oct) || droq->cpu_id == this_cpu) {
                 napi_schedule_irqoff(&droq->napi);
         } else {
-               struct call_single_data *csd = &droq->csd;
+               call_single_data_t *csd = &droq->csd;
  
                 csd->func = napi_schedule_wrapper;
                 csd->info = &droq->napi;
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_droq.h b/drivers/net/ethernet/cavium/liquidio/octeon_droq.h

index 6efd139b894d12dbbbabda11f8fdea3689f9a82c..f91bc84d171984cda866e76cd7bd9ce69b9c1a8f 100644 (file)
--- a/drivers/net/ethernet/cavium/liquidio/octeon_droq.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_droq.h
@@ -328,7 +328,7 @@ struct octeon_droq {
  
         u32 cpu_id;
  
-       struct call_single_data csd;
+       call_single_data_t csd;
  };
  
  #define OCT_DROQ_SIZE   (sizeof(struct octeon_droq))
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h

index 25f6a0cb27d3e9644c1ef42e7f1ff2b5d817293d..006fa09a641eb5b6c59367c70dd76c5b1b83ca41 100644 (file)
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -134,7 +134,7 @@ typedef __u32 __bitwise req_flags_t;
  struct request {
         struct list_head queuelist;
         union {
-               struct call_single_data csd;
+               call_single_data_t csd;
                 u64 fifo_time;
         };
  
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h

index 779b235955968353a926709436483bc3ea63dde9..6557f320b66e5a04f342b666728c057258f39d6b 100644 (file)
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2774,7 +2774,7 @@ struct softnet_data {
         unsigned int            input_queue_head ____cacheline_aligned_in_smp;
  
         /* Elements below can be accessed between CPUs for RPS/RFS */
-       struct call_single_data csd ____cacheline_aligned_in_smp;
+       call_single_data_t      csd ____cacheline_aligned_in_smp;
         struct softnet_data     *rps_ipi_next;
         unsigned int            cpu;
         unsigned int            input_queue_tail;
diff --git a/include/linux/smp.h b/include/linux/smp.h

index 68123c1fe54918c051292eb5ba3427df09f31c2f..98b1fe027fc9c653e2d9c55fc890f19be3973cb3 100644 (file)
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -14,13 +14,17 @@
  #include <linux/llist.h>
  
  typedef void (*smp_call_func_t)(void *info);
-struct call_single_data {
+struct __call_single_data {
         struct llist_node llist;
         smp_call_func_t func;
         void *info;
         unsigned int flags;
  };
  
+/* Use __aligned() to avoid to use 2 cache lines for 1 csd */
+typedef struct __call_single_data call_single_data_t
+       __aligned(sizeof(struct __call_single_data));
+
  /* total number of cpus in this system (may exceed NR_CPUS) */
  extern unsigned int total_cpus;
  
@@ -48,7 +52,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
                 smp_call_func_t func, void *info, bool wait,
                 gfp_t gfp_flags);
  
-int smp_call_function_single_async(int cpu, struct call_single_data *csd);
+int smp_call_function_single_async(int cpu, call_single_data_t *csd);
  
  #ifdef CONFIG_SMP
  
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index eeef1a3086d1e74af034782cd780bbf7e44333b2..f29a7d2b57e1921dadd9682f38f340919b490343 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -769,7 +769,7 @@ struct rq {
  #ifdef CONFIG_SCHED_HRTICK
  #ifdef CONFIG_SMP
         int hrtick_csd_pending;
-       struct call_single_data hrtick_csd;
+       call_single_data_t hrtick_csd;
  #endif
         struct hrtimer hrtick_timer;
  #endif
diff --git a/kernel/smp.c b/kernel/smp.c

index 3061483cb3ad3eac437046eda0634f8a152e07f2..81cfca9b4cc3b37557b9e2c09602f3baf1f297a8 100644 (file)
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -28,7 +28,7 @@ enum {
  };
  
  struct call_function_data {
-       struct call_single_data __percpu *csd;
+       call_single_data_t      __percpu *csd;
         cpumask_var_t           cpumask;
         cpumask_var_t           cpumask_ipi;
  };
@@ -51,7 +51,7 @@ int smpcfd_prepare_cpu(unsigned int cpu)
                 free_cpumask_var(cfd->cpumask);
                 return -ENOMEM;
         }
-       cfd->csd = alloc_percpu(struct call_single_data);
+       cfd->csd = alloc_percpu(call_single_data_t);
         if (!cfd->csd) {
                 free_cpumask_var(cfd->cpumask);
                 free_cpumask_var(cfd->cpumask_ipi);
@@ -103,12 +103,12 @@ void __init call_function_init(void)
   * previous function call. For multi-cpu calls its even more interesting
   * as we'll have to ensure no other cpu is observing our csd.
   */
-static __always_inline void csd_lock_wait(struct call_single_data *csd)
+static __always_inline void csd_lock_wait(call_single_data_t *csd)
  {
         smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
  }
  
-static __always_inline void csd_lock(struct call_single_data *csd)
+static __always_inline void csd_lock(call_single_data_t *csd)
  {
         csd_lock_wait(csd);
         csd->flags |= CSD_FLAG_LOCK;
@@ -116,12 +116,12 @@ static __always_inline void csd_lock(struct call_single_data *csd)
         /*
          * prevent CPU from reordering the above assignment
          * to ->flags with any subsequent assignments to other
-        * fields of the specified call_single_data structure:
+        * fields of the specified call_single_data_t structure:
          */
         smp_wmb();
  }
  
-static __always_inline void csd_unlock(struct call_single_data *csd)
+static __always_inline void csd_unlock(call_single_data_t *csd)
  {
         WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
  
@@ -131,14 +131,14 @@ static __always_inline void csd_unlock(struct call_single_data *csd)
         smp_store_release(&csd->flags, 0);
  }
  
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
+static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
  
  /*
- * Insert a previously allocated call_single_data element
+ * Insert a previously allocated call_single_data_t element
   * for execution on the given CPU. data must already have
   * ->func, ->info, and ->flags set.
   */
-static int generic_exec_single(int cpu, struct call_single_data *csd,
+static int generic_exec_single(int cpu, call_single_data_t *csd,
                                smp_call_func_t func, void *info)
  {
         if (cpu == smp_processor_id()) {
@@ -210,7 +210,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
  {
         struct llist_head *head;
         struct llist_node *entry;
-       struct call_single_data *csd, *csd_next;
+       call_single_data_t *csd, *csd_next;
         static bool warned;
  
         WARN_ON(!irqs_disabled());
@@ -268,8 +268,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
  int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
                              int wait)
  {
-       struct call_single_data *csd;
-       struct call_single_data csd_stack = { .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS };
+       call_single_data_t *csd;
+       call_single_data_t csd_stack = {
+               .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS,
+       };
         int this_cpu;
         int err;
  
@@ -321,7 +323,7 @@ EXPORT_SYMBOL(smp_call_function_single);
   * NOTE: Be careful, there is unfortunately no current debugging facility to
   * validate the correctness of this serialization.
   */
-int smp_call_function_single_async(int cpu, struct call_single_data *csd)
+int smp_call_function_single_async(int cpu, call_single_data_t *csd)
  {
         int err = 0;
  
@@ -444,7 +446,7 @@ void smp_call_function_many(const struct cpumask *mask,
  
         cpumask_clear(cfd->cpumask_ipi);
         for_each_cpu(cpu, cfd->cpumask) {
-               struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
+               call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);
  
                 csd_lock(csd);
                 if (wait)
@@ -460,7 +462,7 @@ void smp_call_function_many(const struct cpumask *mask,
  
         if (wait) {
                 for_each_cpu(cpu, cfd->cpumask) {
-                       struct call_single_data *csd;
+                       call_single_data_t *csd;
  
                         csd = per_cpu_ptr(cfd->csd, cpu);
                         csd_lock_wait(csd);
diff --git a/kernel/up.c b/kernel/up.c

index ee81ac9af4caff801a531a9219efd19637b76688..42c46bf3e0a5ca5b6ece7dc967fc34c72dd581a2 100644 (file)
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -23,7 +23,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
  }
  EXPORT_SYMBOL(smp_call_function_single);
  
-int smp_call_function_single_async(int cpu, struct call_single_data *csd)
+int smp_call_function_single_async(int cpu, call_single_data_t *csd)
  {
         unsigned long flags;
author	Ying Huang <ying.huang@intel.com>
	Tue, 8 Aug 2017 04:30:00 +0000 (12:30 +0800)
committer	Ingo Molnar <mingo@kernel.org>
	Tue, 29 Aug 2017 13:14:38 +0000 (15:14 +0200)
arch/mips/kernel/smp.c		patch \| blob \| blame \| history
block/blk-softirq.c		patch \| blob \| blame \| history
drivers/block/null_blk.c		patch \| blob \| blame \| history
drivers/cpuidle/coupled.c		patch \| blob \| blame \| history
drivers/net/ethernet/cavium/liquidio/lio_main.c		patch \| blob \| blame \| history
drivers/net/ethernet/cavium/liquidio/octeon_droq.h		patch \| blob \| blame \| history
include/linux/blkdev.h		patch \| blob \| blame \| history
include/linux/netdevice.h		patch \| blob \| blame \| history
include/linux/smp.h		patch \| blob \| blame \| history
kernel/sched/sched.h		patch \| blob \| blame \| history
kernel/smp.c		patch \| blob \| blame \| history
kernel/up.c		patch \| blob \| blame \| history