mm: numa: Add fault driven placement and migration
[mirror_ubuntu-zesty-kernel.git] / kernel / sched / fair.c
index 6b800a14b9903c81d7bc9b2176b03e750c951108..6831abb5dbefd74a5d75fa125a3fd89a92fbf5c6 100644
@@ -26,6 +26,8 @@
 #include <linux/slab.h>
 #include <linux/profile.h>
 #include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/task_work.h>
 
 #include <trace/events/sched.h>
 
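The two headers added here feed the code introduced in the next hunk: <linux/mempolicy.h> is needed for the VMA-walk helpers used by the scan loop (vma_migratable() and friends), while <linux/task_work.h> provides the deferred-callback API (init_task_work()/task_work_add()) that lets the scheduler tick push the expensive address-space walk out to return-to-userspace context. A rough sketch of that deferral pattern, reusing only the calls the patch itself makes; do_numa_scan() and poke_numa_scan() are invented names standing in for task_numa_work() and task_tick_numa() below:

/*
 * Illustrative sketch only, not part of the patch.
 */
#include <linux/sched.h>
#include <linux/task_work.h>

static void do_numa_scan(struct callback_head *work)
{
	work->next = work;	/* re-arm the "not queued" sentinel */
	/* runs as the task returns to user space: may sleep, may take mmap_sem */
}

static void poke_numa_scan(struct task_struct *p)
{
	struct callback_head *work = &p->numa_work;

	if (work->next != work)		/* already queued */
		return;

	init_task_work(work, do_numa_scan);
	task_work_add(p, work, true);	/* run before returning to user space */
}
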
@@ -776,6 +778,126 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * numa task sample period in ms: 5s min, 80s (16x) max
+ */
+unsigned int sysctl_numa_balancing_scan_period_min = 5000;
+unsigned int sysctl_numa_balancing_scan_period_max = 5000*16;
+
+static void task_numa_placement(struct task_struct *p)
+{
+       int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+
+       if (p->numa_scan_seq == seq)
+               return;
+       p->numa_scan_seq = seq;
+
+       /* FIXME: Scheduling placement policy hints go here */
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */
+void task_numa_fault(int node, int pages)
+{
+       struct task_struct *p = current;
+
+       /* FIXME: Allocate task-specific structure for placement policy here */
+
+       task_numa_placement(p);
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_work(struct callback_head *work)
+{
+       unsigned long migrate, next_scan, now = jiffies;
+       struct task_struct *p = current;
+       struct mm_struct *mm = p->mm;
+
+       WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+
+       work->next = work; /* protect against double add */
+       /*
+        * Who cares about NUMA placement when they're dying.
+        *
+        * NOTE: make sure not to dereference p->mm before this check,
+        * exit_task_work() happens _after_ exit_mm() so we could be called
+        * without p->mm even though we still had it when we enqueued this
+        * work.
+        */
+       if (p->flags & PF_EXITING)
+               return;
+
+       /*
+        * Enforce maximal scan/migration frequency..
+        */
+       migrate = mm->numa_next_scan;
+       if (time_before(now, migrate))
+               return;
+
+       if (p->numa_scan_period == 0)
+               p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+
+       next_scan = now + 2*msecs_to_jiffies(p->numa_scan_period);
+       if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+               return;
+
+       ACCESS_ONCE(mm->numa_scan_seq)++;
+       {
+               struct vm_area_struct *vma;
+
+               down_read(&mm->mmap_sem);
+               for (vma = mm->mmap; vma; vma = vma->vm_next) {
+                       if (!vma_migratable(vma))
+                               continue;
+                       change_prot_numa(vma, vma->vm_start, vma->vm_end);
+               }
+               up_read(&mm->mmap_sem);
+       }
+}
+
+/*
+ * Drive the periodic memory faults..
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+       struct callback_head *work = &curr->numa_work;
+       u64 period, now;
+
+       /*
+        * We don't care about NUMA placement if we don't have memory.
+        */
+       if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+               return;
+
+       /*
+        * Using runtime rather than walltime has the dual advantage that
+        * we (mostly) drive the selection from busy threads and that the
+        * task needs to have done some actual work before we bother with
+        * NUMA placement.
+        */
+       now = curr->se.sum_exec_runtime;
+       period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
+
+       if (now - curr->node_stamp > period) {
+               curr->node_stamp = now;
+
+               if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+                       init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+                       task_work_add(curr, work, true);
+               }
+       }
+}
+#else
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
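Worth noting before the final hunk: the scan above is throttled per address space, not per thread. Every thread sharing the mm reads the same mm->numa_next_scan, and only the one whose cmpxchg() advances it to the next window gets to take mmap_sem and walk the VMA list; the others return early, either at the time_before() check or when their cmpxchg() fails. A minimal userspace analogue of that "claim the window with one compare-and-swap" idiom, with C11 atomics standing in for the kernel's cmpxchg() and every name invented for illustration:

/* claim_window.c - userspace sketch of the mm->numa_next_scan throttle.
 * Build with: cc claim_window.c
 */
#define _POSIX_C_SOURCE 200809L
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static _Atomic unsigned long next_scan;	/* stands in for mm->numa_next_scan */

static unsigned long now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (unsigned long)ts.tv_sec * 1000 + ts.tv_nsec / 1000000;
}

/* Returns true for exactly one caller per scan window. */
static bool claim_scan_window(unsigned long period_ms)
{
	unsigned long now = now_ms();
	unsigned long deadline = atomic_load(&next_scan);

	if (now < deadline)		/* window not open yet */
		return false;

	/* Like cmpxchg(&mm->numa_next_scan, migrate, next_scan):
	 * only the caller that moves the deadline forward wins. */
	return atomic_compare_exchange_strong(&next_scan, &deadline,
					      now + 2 * period_ms);
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		printf("attempt %d: %s\n", i,
		       claim_scan_window(5000) ? "scan" : "skip");
	return 0;
}

Only the first call in any given window prints "scan"; the kernel code behaves the same way per mm, which keeps a heavily threaded process from rescanning its address space once per thread per tick.
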
@@ -4954,6 +5076,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
                cfs_rq = cfs_rq_of(se);
                entity_tick(cfs_rq, se, queued);
        }
+
+       if (sched_feat_numa(NUMA))
+               task_tick_numa(rq, curr);
 }
 
 /*