mm: change the call sites of numa statistics items

author Kemi Wang <kemi.wang@intel.com>

Fri, 8 Sep 2017 23:12:48 +0000 (16:12 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sat, 9 Sep 2017 01:26:47 +0000 (18:26 -0700)
author Kemi Wang <kemi.wang@intel.com>
Fri, 8 Sep 2017 23:12:48 +0000 (16:12 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 9 Sep 2017 01:26:47 +0000 (18:26 -0700)
diff --git a/drivers/base/node.c b/drivers/base/node.c

index d8dc83017d8dc09f27c5c7eff657e05cf6dd028e..3855902f2c5b369dc538759950f5b4c951c484d6 100644 (file)
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -160,12 +160,12 @@ static ssize_t node_read_numastat(struct device *dev,
                        "interleave_hit %lu\n"
                        "local_node %lu\n"
                        "other_node %lu\n",
-                      sum_zone_node_page_state(dev->id, NUMA_HIT),
-                      sum_zone_node_page_state(dev->id, NUMA_MISS),
-                      sum_zone_node_page_state(dev->id, NUMA_FOREIGN),
-                      sum_zone_node_page_state(dev->id, NUMA_INTERLEAVE_HIT),
-                      sum_zone_node_page_state(dev->id, NUMA_LOCAL),
-                      sum_zone_node_page_state(dev->id, NUMA_OTHER));
+                      sum_zone_numa_state(dev->id, NUMA_HIT),
+                      sum_zone_numa_state(dev->id, NUMA_MISS),
+                      sum_zone_numa_state(dev->id, NUMA_FOREIGN),
+                      sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT),
+                      sum_zone_numa_state(dev->id, NUMA_LOCAL),
+                      sum_zone_numa_state(dev->id, NUMA_OTHER));
  }
  static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL);
  
@@ -181,9 +181,17 @@ static ssize_t node_read_vmstat(struct device *dev,
                 n += sprintf(buf+n, "%s %lu\n", vmstat_text[i],
                              sum_zone_node_page_state(nid, i));
  
-       for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+#ifdef CONFIG_NUMA
+       for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
                 n += sprintf(buf+n, "%s %lu\n",
                              vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
+                            sum_zone_numa_state(nid, i));
+#endif
+
+       for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+               n += sprintf(buf+n, "%s %lu\n",
+                            vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
+                            NR_VM_NUMA_STAT_ITEMS],
                              node_page_state(pgdat, i));
  
         return n;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h

index e7e92c8f4883740b9fea3f636d31d583cb03ee71..e65d91c02e3077019c0b0b98da6222c3141f6397 100644 (file)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -114,6 +114,20 @@ struct zone_padding {
  #define ZONE_PADDING(name)
  #endif
  
+#ifdef CONFIG_NUMA
+enum numa_stat_item {   /* NUMA allocation counters, indexing the vm_numa_stat arrays */
+       NUMA_HIT,               /* allocated in intended node */
+       NUMA_MISS,              /* allocated in non intended node */
+       NUMA_FOREIGN,           /* was intended here, hit elsewhere */
+       NUMA_INTERLEAVE_HIT,    /* interleaver preferred this zone */
+       NUMA_LOCAL,             /* allocation from local node */
+       NUMA_OTHER,             /* allocation from other node */
+       NR_VM_NUMA_STAT_ITEMS   /* number of counters above; keep as the last enumerator */
+};
+#else
+#define NR_VM_NUMA_STAT_ITEMS 0        /* no NUMA counters when CONFIG_NUMA is not set */
+#endif
+
  enum zone_stat_item {
         /* First 128 byte cacheline (assuming 64 bit words) */
         NR_FREE_PAGES,
@@ -131,14 +145,6 @@ enum zone_stat_item {
         NR_BOUNCE,
  #if IS_ENABLED(CONFIG_ZSMALLOC)
         NR_ZSPAGES,             /* allocated in zsmalloc */
-#endif
-#ifdef CONFIG_NUMA
-       NUMA_HIT,               /* allocated in intended node */
-       NUMA_MISS,              /* allocated in non intended node */
-       NUMA_FOREIGN,           /* was intended here, hit elsewhere */
-       NUMA_INTERLEAVE_HIT,    /* interleaver preferred this zone */
-       NUMA_LOCAL,             /* allocation from local node */
-       NUMA_OTHER,             /* allocation from other node */
  #endif
         NR_FREE_CMA_PAGES,
         NR_VM_ZONE_STAT_ITEMS };
@@ -276,6 +282,8 @@ struct per_cpu_pageset {
         struct per_cpu_pages pcp;
  #ifdef CONFIG_NUMA
         s8 expire;
+       s8 numa_stat_threshold;
+       s8 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
  #endif
  #ifdef CONFIG_SMP
         s8 stat_threshold;
@@ -496,6 +504,7 @@ struct zone {
         ZONE_PADDING(_pad3_)
         /* Zone statistics */
         atomic_long_t           vm_stat[NR_VM_ZONE_STAT_ITEMS];
+       atomic_long_t           vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
  } ____cacheline_internodealigned_in_smp;
  
  enum pgdat_flags {
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h

index 97e11ab573f0812d7862b03d5650cd482419a399..9ac82e29948f5ecad6c44d5edb0af1a1f5ab3576 100644 (file)
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -107,8 +107,33 @@ static inline void vm_events_fold_cpu(int cpu)
   * Zone and node-based page accounting with per cpu differentials.
   */
  extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS];
+extern atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
  extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS];
  
+#ifdef CONFIG_NUMA
+static inline void zone_numa_state_add(long x, struct zone *zone,
+                                enum numa_stat_item item)
+{      /* Add delta x for NUMA counter 'item' to both the zone and the global totals. */
+       atomic_long_add(x, &zone->vm_numa_stat[item]);  /* per-zone counter */
+       atomic_long_add(x, &vm_numa_stat[item]);        /* global counter */
+}
+
+static inline unsigned long global_numa_state(enum numa_stat_item item)
+{      /* Return the current global total for NUMA counter 'item'. */
+       long x = atomic_long_read(&vm_numa_stat[item]);
+
+       return x;       /* signed value converted to unsigned long; no clamping is applied here */
+}
+
+static inline unsigned long zone_numa_state(struct zone *zone,
+                                       enum numa_stat_item item)
+{      /* Return the current total for NUMA counter 'item' in the given zone. */
+       long x = atomic_long_read(&zone->vm_numa_stat[item]);
+
+       return x;       /* signed value converted to unsigned long; no clamping is applied here */
+}
+#endif /* CONFIG_NUMA */
+
  static inline void zone_page_state_add(long x, struct zone *zone,
                                  enum zone_stat_item item)
  {
@@ -194,8 +219,10 @@ static inline unsigned long node_page_state_snapshot(pg_data_t *pgdat,
  
  
  #ifdef CONFIG_NUMA
+extern void __inc_numa_state(struct zone *zone, enum numa_stat_item item);
  extern unsigned long sum_zone_node_page_state(int node,
-                                               enum zone_stat_item item);
+                                             enum zone_stat_item item);
+extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item);
  extern unsigned long node_page_state(struct pglist_data *pgdat,
                                                 enum node_stat_item item);
  #else
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index a9add06fe7687eba0812a112f5aa213693401d9d..45583cd8dd568ad3c30675a5881a417dc64e69f2 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2741,18 +2741,18 @@ int __isolate_free_page(struct page *page, unsigned int order)
  static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
  {
  #ifdef CONFIG_NUMA
-       enum zone_stat_item local_stat = NUMA_LOCAL;
+       enum numa_stat_item local_stat = NUMA_LOCAL;    /* switched to NUMA_OTHER below when z->node != numa_node_id() */
  
         if (z->node != numa_node_id())
                 local_stat = NUMA_OTHER;
  
         if (z->node == preferred_zone->node)
-               __inc_zone_state(z, NUMA_HIT);
+               __inc_numa_state(z, NUMA_HIT);  /* z is on the same node as the preferred zone */
         else {
-               __inc_zone_state(z, NUMA_MISS);
-               __inc_zone_state(preferred_zone, NUMA_FOREIGN);
+               __inc_numa_state(z, NUMA_MISS); /* z is on a different node than the preferred zone */
+               __inc_numa_state(preferred_zone, NUMA_FOREIGN); /* counted against the preferred zone */
         }
-       __inc_zone_state(z, local_stat);
+       __inc_numa_state(z, local_stat);        /* NUMA_LOCAL or NUMA_OTHER, as chosen above */
  #endif
  }
  
diff --git a/mm/vmstat.c b/mm/vmstat.c

index c7e4b84580235624f1c877ba8b921c577ad64f47..daea02833e2eed3b38e36f81fa59ef62a3731cf5 100644 (file)
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -87,8 +87,10 @@ void vm_events_fold_cpu(int cpu)
   * vm_stat contains the global counters
   */
  atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
+atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp;
  atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
  EXPORT_SYMBOL(vm_zone_stat);
+EXPORT_SYMBOL(vm_numa_stat);
  EXPORT_SYMBOL(vm_node_stat);
  
  #ifdef CONFIG_SMP
@@ -192,7 +194,10 @@ void refresh_zone_stat_thresholds(void)
  
                         per_cpu_ptr(zone->pageset, cpu)->stat_threshold
                                                         = threshold;
-
+#ifdef CONFIG_NUMA
+                       per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold
+                                                       = threshold;
+#endif
                         /* Base nodestat threshold on the largest populated zone. */
                         pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
                         per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
@@ -226,9 +231,14 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
                         continue;
  
                 threshold = (*calculate_pressure)(zone);
-               for_each_online_cpu(cpu)
+               for_each_online_cpu(cpu) {
                         per_cpu_ptr(zone->pageset, cpu)->stat_threshold
                                                         = threshold;
+#ifdef CONFIG_NUMA
+                       per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold
+                                                       = threshold;
+#endif
+               }
         }
  }
  
@@ -604,6 +614,32 @@ EXPORT_SYMBOL(dec_node_page_state);
   * Fold a differential into the global counters.
   * Returns the number of counters updated.
   */
+#ifdef CONFIG_NUMA
+static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff)
+{      /* CONFIG_NUMA variant: also folds the NUMA differentials into vm_numa_stat. */
+       int i;
+       int changes = 0;        /* count of non-zero differentials that were folded */
+
+       for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+               if (zone_diff[i]) {     /* zero entries are skipped and not counted */
+                       atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
+                       changes++;
+       }
+
+       for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+               if (numa_diff[i]) {     /* same folding rule for the NUMA counters */
+                       atomic_long_add(numa_diff[i], &vm_numa_stat[i]);
+                       changes++;
+       }
+
+       for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+               if (node_diff[i]) {     /* same folding rule for the node counters */
+                       atomic_long_add(node_diff[i], &vm_node_stat[i]);
+                       changes++;
+       }
+       return changes;
+}
+#else
  static int fold_diff(int *zone_diff, int *node_diff)
  {
         int i;
@@ -622,6 +658,7 @@ static int fold_diff(int *zone_diff, int *node_diff)
         }
         return changes;
  }
+#endif /* CONFIG_NUMA */
  
  /*
   * Update the zone counters for the current cpu.
@@ -645,6 +682,9 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
         struct zone *zone;
         int i;
         int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+#ifdef CONFIG_NUMA
+       int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
+#endif
         int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
         int changes = 0;
  
@@ -666,6 +706,18 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
                         }
                 }
  #ifdef CONFIG_NUMA
+               for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
+                       int v;
+
+                       v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0);
+                       if (v) {
+
+                               atomic_long_add(v, &zone->vm_numa_stat[i]);
+                               global_numa_diff[i] += v;
+                               __this_cpu_write(p->expire, 3);
+                       }
+               }
+
                 if (do_pagesets) {
                         cond_resched();
                         /*
@@ -712,7 +764,12 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
                 }
         }
  
+#ifdef CONFIG_NUMA
+       changes += fold_diff(global_zone_diff, global_numa_diff,
+                            global_node_diff);
+#else
         changes += fold_diff(global_zone_diff, global_node_diff);
+#endif
         return changes;
  }
  
@@ -727,6 +784,9 @@ void cpu_vm_stats_fold(int cpu)
         struct zone *zone;
         int i;
         int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+#ifdef CONFIG_NUMA
+       int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
+#endif
         int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
  
         for_each_populated_zone(zone) {
@@ -743,6 +803,18 @@ void cpu_vm_stats_fold(int cpu)
                                 atomic_long_add(v, &zone->vm_stat[i]);
                                 global_zone_diff[i] += v;
                         }
+
+#ifdef CONFIG_NUMA
+               for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+                       if (p->vm_numa_stat_diff[i]) {
+                               int v;
+
+                               v = p->vm_numa_stat_diff[i];
+                               p->vm_numa_stat_diff[i] = 0;
+                               atomic_long_add(v, &zone->vm_numa_stat[i]);
+                               global_numa_diff[i] += v;
+                       }
+#endif
         }
  
         for_each_online_pgdat(pgdat) {
@@ -761,7 +833,11 @@ void cpu_vm_stats_fold(int cpu)
                         }
         }
  
+#ifdef CONFIG_NUMA
+       fold_diff(global_zone_diff, global_numa_diff, global_node_diff);
+#else
         fold_diff(global_zone_diff, global_node_diff);
+#endif
  }
  
  /*
@@ -779,10 +855,38 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
                         atomic_long_add(v, &zone->vm_stat[i]);
                         atomic_long_add(v, &vm_zone_stat[i]);
                 }
+
+#ifdef CONFIG_NUMA
+       for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+               if (pset->vm_numa_stat_diff[i]) {
+                       int v = pset->vm_numa_stat_diff[i];
+
+                       pset->vm_numa_stat_diff[i] = 0;
+                       atomic_long_add(v, &zone->vm_numa_stat[i]);
+                       atomic_long_add(v, &vm_numa_stat[i]);
+               }
+#endif
  }
  #endif
  
  #ifdef CONFIG_NUMA
+void __inc_numa_state(struct zone *zone,
+                                enum numa_stat_item item)
+{      /* Increment this CPU's pending delta for 'item'; fold it into the shared counters once it exceeds the threshold. */
+       struct per_cpu_pageset __percpu *pcp = zone->pageset;
+       s8 __percpu *p = pcp->vm_numa_stat_diff + item; /* per-cpu pending delta slot for 'item' */
+       s8 v, t;
+
+       v = __this_cpu_inc_return(*p);  /* v is the delta after the increment */
+       t = __this_cpu_read(pcp->numa_stat_threshold);
+       if (unlikely(v > t)) {
+               s8 overstep = t >> 1;   /* half of the threshold */
+
+               zone_numa_state_add(v + overstep, zone, item);  /* publish the delta plus the overstep */
+               __this_cpu_write(*p, -overstep);        /* pending delta restarts at -overstep to offset the extra amount published */
+       }
+}
+
  /*
   * Determine the per node value of a stat item. This function
   * is called frequently in a NUMA machine, so try to be as
@@ -801,6 +905,19 @@ unsigned long sum_zone_node_page_state(int node,
         return count;
  }
  
+unsigned long sum_zone_numa_state(int node,
+                                enum numa_stat_item item)
+{      /* Return the sum of NUMA counter 'item' over every zone slot of 'node'. */
+       struct zone *zones = NODE_DATA(node)->node_zones;
+       int i;
+       unsigned long count = 0;
+
+       for (i = 0; i < MAX_NR_ZONES; i++)      /* walks all MAX_NR_ZONES entries of node_zones */
+               count += zone_numa_state(zones + i, item);
+
+       return count;
+}
+
  /*
   * Determine the per node value of a stat item.
   */
@@ -937,6 +1054,9 @@ const char * const vmstat_text[] = {
  #if IS_ENABLED(CONFIG_ZSMALLOC)
         "nr_zspages",
  #endif
+       "nr_free_cma",
+
+       /* enum numa_stat_item counters */
  #ifdef CONFIG_NUMA
         "numa_hit",
         "numa_miss",
@@ -945,7 +1065,6 @@ const char * const vmstat_text[] = {
         "numa_local",
         "numa_other",
  #endif
-       "nr_free_cma",
  
         /* Node-based counters */
         "nr_inactive_anon",
@@ -1106,7 +1225,6 @@ const char * const vmstat_text[] = {
  };
  #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
  
-
  #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
       defined(CONFIG_PROC_FS)
  static void *frag_start(struct seq_file *m, loff_t *pos)
@@ -1384,7 +1502,8 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
                 seq_printf(m, "\n  per-node stats");
                 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
                         seq_printf(m, "\n      %-12s %lu",
-                               vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
+                               vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
+                               NR_VM_NUMA_STAT_ITEMS],
                                 node_page_state(pgdat, i));
                 }
         }
@@ -1421,6 +1540,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
                 seq_printf(m, "\n      %-12s %lu", vmstat_text[i],
                                 zone_page_state(zone, i));
  
+#ifdef CONFIG_NUMA
+       for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+               seq_printf(m, "\n      %-12s %lu",
+                               vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
+                               zone_numa_state(zone, i));
+#endif
+
         seq_printf(m, "\n  pagesets");
         for_each_online_cpu(i) {
                 struct per_cpu_pageset *pageset;
@@ -1497,6 +1623,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
         if (*pos >= ARRAY_SIZE(vmstat_text))
                 return NULL;
         stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
+                         NR_VM_NUMA_STAT_ITEMS * sizeof(unsigned long) +
                           NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) +
                           NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
  
@@ -1512,6 +1639,12 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
                 v[i] = global_zone_page_state(i);
         v += NR_VM_ZONE_STAT_ITEMS;
  
+#ifdef CONFIG_NUMA
+       for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+               v[i] = global_numa_state(i);
+       v += NR_VM_NUMA_STAT_ITEMS;
+#endif
+
         for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
                 v[i] = global_node_page_state(i);
         v += NR_VM_NODE_STAT_ITEMS;
@@ -1613,6 +1746,16 @@ int vmstat_refresh(struct ctl_table *table, int write,
                         err = -EINVAL;
                 }
         }
+#ifdef CONFIG_NUMA
+       for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
+               val = atomic_long_read(&vm_numa_stat[i]);
+               if (val < 0) {
+                       pr_warn("%s: %s %ld\n",
+                               __func__, vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], val);
+                       err = -EINVAL;
+               }
+       }
+#endif
         if (err)
                 return err;
         if (write)
@@ -1654,13 +1797,19 @@ static bool need_update(int cpu)
                 struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
  
                 BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1);
+#ifdef CONFIG_NUMA
+               BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 1);
+#endif
                 /*
                  * The fast way of checking if there are any vmstat diffs.
                  * This works because the diffs are byte sized items.
                  */
                 if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS))
                         return true;
-
+#ifdef CONFIG_NUMA
+               if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS))
+                       return true;
+#endif
         }
         return false;
  }
author	Kemi Wang <kemi.wang@intel.com>
	Fri, 8 Sep 2017 23:12:48 +0000 (16:12 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 9 Sep 2017 01:26:47 +0000 (18:26 -0700)
drivers/base/node.c		patch | blob | blame | history
include/linux/mmzone.h		patch | blob | blame | history
include/linux/vmstat.h		patch | blob | blame | history
mm/page_alloc.c		patch | blob | blame | history
mm/vmstat.c		patch | blob | blame | history