sched/uclamp: Extend CPU's cgroup controller

author Patrick Bellasi <patrick.bellasi@arm.com>

Thu, 22 Aug 2019 13:28:06 +0000 (14:28 +0100)

committer Ingo Molnar <mingo@kernel.org>

Tue, 3 Sep 2019 07:17:37 +0000 (09:17 +0200)
author Patrick Bellasi <patrick.bellasi@arm.com>
Thu, 22 Aug 2019 13:28:06 +0000 (14:28 +0100)
committer Ingo Molnar <mingo@kernel.org>
Tue, 3 Sep 2019 07:17:37 +0000 (09:17 +0200)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst

index 3b29005aa9810d98a6604bed10e1d4c98dde6360..5f1c266131b08b7d349b68afe28086b0bed7b005 100644 (file)
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -951,6 +951,13 @@ controller implements weight and absolute bandwidth limit models for
  normal scheduling policy and absolute bandwidth allocation model for
  realtime scheduling policy.
  
+In all the above models, cycles distribution is defined only on a temporal
+basis and it does not account for the frequency at which tasks are executed.
+The (optional) utilization clamping support allows hinting the schedutil
+cpufreq governor about the minimum desired frequency which should always be
+provided by a CPU, as well as the maximum desired frequency, which should not
+be exceeded by a CPU.
+
  WARNING: cgroup2 doesn't yet support control of realtime processes and
  the cpu controller can only be enabled when all RT processes are in
  the root cgroup.  Be aware that system management software may already
@@ -1016,6 +1023,33 @@ All time durations are in microseconds.
         Shows pressure stall information for CPU. See
         Documentation/accounting/psi.rst for details.
  
+  cpu.uclamp.min
+        A read-write single value file which exists on non-root cgroups.
+        The default is "0", i.e. no utilization boosting.
+
+        The requested minimum utilization (protection) as a percentage
+        rational number, e.g. 12.34 for 12.34%.
+
+        This interface allows reading and setting minimum utilization clamp
+        values similar to sched_setattr(2). This minimum utilization
+        value is used to clamp the task-specific minimum utilization clamp.
+
+        The requested minimum utilization (protection) is always capped by
+        the current value for the maximum utilization (limit), i.e.
+        `cpu.uclamp.max`.
+
+  cpu.uclamp.max
+        A read-write single value file which exists on non-root cgroups.
+        The default is "max", i.e. no utilization capping.
+
+        The requested maximum utilization (limit) as a percentage rational
+        number, e.g. 98.76 for 98.76%.
+
+        This interface allows reading and setting maximum utilization clamp
+        values similar to sched_setattr(2). This maximum utilization
+        value is used to clamp the task-specific maximum utilization clamp.
+
+
  
  Memory
  ------
diff --git a/init/Kconfig b/init/Kconfig

index bd7d650d4a996c7c10d2349a340f52c1f99c1e49..ac285cfa78b6ca22254bcb6868e9abb3eacd7bdb 100644 (file)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -928,6 +928,28 @@ config RT_GROUP_SCHED
  
  endif #CGROUP_SCHED
  
+config UCLAMP_TASK_GROUP
+       bool "Utilization clamping per group of tasks"
+       depends on CGROUP_SCHED
+       depends on UCLAMP_TASK
+       default n
+       help
+         This feature enables the scheduler to track the clamped utilization
+         of each CPU based on RUNNABLE tasks currently scheduled on that CPU.
+
+         When this option is enabled, the user can specify a min and max
+         CPU bandwidth which is allowed for each single task in a group.
+         The max bandwidth allows clamping the maximum frequency a task
+         can use, while the min bandwidth allows defining a minimum
+         frequency a task will always use.
+
+         When task group based utilization clamping is enabled, any
+         task-specific clamp value, if set, is constrained by the cgroup
+         specified clamp value. Both minimum and maximum task clamping cannot
+         be bigger than the corresponding clamping defined at task group level.
+
+         If in doubt, say N.
+
  config CGROUP_PIDS
         bool "PIDs controller"
         help
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index a6661852907bdac981c4ed709a8a938c92a5819f..c186abed5c6dc11a871ddf292f4800273a8e3752 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -773,6 +773,18 @@ static void set_load_weight(struct task_struct *p, bool update_load)
  }
  
  #ifdef CONFIG_UCLAMP_TASK
+/*
+ * Serializes updates of utilization clamp values
+ *
+ * The (slow-path) user-space triggers utilization clamp value updates which
+ * can require updates on (fast-path) scheduler's data structures used to
+ * support enqueue/dequeue operations.
+ * While the per-CPU rq lock protects fast-path update operations, user-space
+ * requests are serialized using a mutex to reduce the risk of conflicting
+ * updates or API abuses.
+ *
+ * The same mutex is taken by both the sysctl handler and the cgroup
+ * cpu.uclamp.{min,max} write path, so updates coming from either interface
+ * never run concurrently with each other.
+ */
+static DEFINE_MUTEX(uclamp_mutex);
+
  /* Max allowed minimum utilization */
  unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
  
@@ -1010,10 +1022,9 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
                                 loff_t *ppos)
  {
         int old_min, old_max;
-       static DEFINE_MUTEX(mutex);
         int result;
  
-       mutex_lock(&mutex);
+       mutex_lock(&uclamp_mutex);
         old_min = sysctl_sched_uclamp_util_min;
         old_max = sysctl_sched_uclamp_util_max;
  
@@ -1048,7 +1059,7 @@ undo:
         sysctl_sched_uclamp_util_min = old_min;
         sysctl_sched_uclamp_util_max = old_max;
  done:
-       mutex_unlock(&mutex);
+       mutex_unlock(&uclamp_mutex);
  
         return result;
  }
@@ -1137,6 +1148,8 @@ static void __init init_uclamp(void)
         unsigned int clamp_id;
         int cpu;
  
+       mutex_init(&uclamp_mutex);
+
         for_each_possible_cpu(cpu) {
                 memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
                 cpu_rq(cpu)->uclamp_flags = 0;
@@ -1149,8 +1162,12 @@ static void __init init_uclamp(void)
  
         /* System defaults allow max clamp values for both indexes */
         uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
-       for_each_clamp_id(clamp_id)
+       for_each_clamp_id(clamp_id) {
                 uclamp_default[clamp_id] = uc_max;
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+               root_task_group.uclamp_req[clamp_id] = uc_max;
+#endif
+       }
  }
  
  #else /* CONFIG_UCLAMP_TASK */
@@ -6798,6 +6815,19 @@ void ia64_set_curr_task(int cpu, struct task_struct *p)
  /* task_group_lock serializes the addition/removal of task groups */
  static DEFINE_SPINLOCK(task_group_lock);
  
+static inline void alloc_uclamp_sched_group(struct task_group *tg,
+                                           struct task_group *parent)
+{
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+       int clamp_id;
+
+       for_each_clamp_id(clamp_id) {
+               uclamp_se_set(&tg->uclamp_req[clamp_id],
+                             uclamp_none(clamp_id), false);
+       }
+#endif
+}
+
  static void sched_free_group(struct task_group *tg)
  {
         free_fair_sched_group(tg);
@@ -6821,6 +6851,8 @@ struct task_group *sched_create_group(struct task_group *parent)
         if (!alloc_rt_sched_group(tg, parent))
                 goto err;
  
+       alloc_uclamp_sched_group(tg, parent);
+
         return tg;
  
  err:
@@ -7037,6 +7069,131 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
                 sched_move_task(task);
  }
  
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+
+/*
+ * Integer 10^N with a given N exponent by casting to integer the literal "1eN"
+ * C expression. Since there is no way to convert a macro argument (N) into a
+ * character constant, use two levels of macros.
+ *
+ * The extra level lets an argument which is itself a macro (e.g.
+ * UCLAMP_PERCENT_SHIFT) be expanded to its numeric value before it is
+ * token-pasted after "1e" by _POW10().
+ */
+#define _POW10(exp) ((unsigned int)1e##exp)
+#define POW10(exp) _POW10(exp)
+
+/*
+ * Result of parsing a cpu.uclamp.{min,max} write: the requested percentage
+ * together with the corresponding utilization value.
+ */
+struct uclamp_request {
+/* Number of decimal digits kept for percentage values */
+#define UCLAMP_PERCENT_SHIFT   2
+/* 100% expressed with UCLAMP_PERCENT_SHIFT decimal digits */
+#define UCLAMP_PERCENT_SCALE   (100 * POW10(UCLAMP_PERCENT_SHIFT))
+       /* Requested percentage, scaled by 10^UCLAMP_PERCENT_SHIFT */
+       s64 percent;
+       /* @percent in capacity units, SCHED_CAPACITY_SCALE being 100% */
+       u64 util;
+       /* 0 on success, error code if the request was rejected */
+       int ret;
+};
+
+/*
+ * Parse a cpu.uclamp.{min,max} value written from user-space.
+ *
+ * @buf, once stripped with strim(), is either the string "max" or a
+ * percentage parsed by cgroup_parse_float() with UCLAMP_PERCENT_SHIFT
+ * decimal digits, e.g. "12.34".
+ *
+ * Returns a uclamp_request where:
+ *  - .ret is 0 on success, the cgroup_parse_float() error if @buf cannot be
+ *    parsed, or -ERANGE if the value is outside the [0..100]% range;
+ *  - .percent is the requested value scaled by 10^UCLAMP_PERCENT_SHIFT;
+ *  - .util is .percent converted to capacity units, rounded to closest.
+ * "max" maps to UCLAMP_PERCENT_SCALE and SCHED_CAPACITY_SCALE respectively.
+ */
+static inline struct uclamp_request
+capacity_from_percent(char *buf)
+{
+       struct uclamp_request req = {
+               .percent = UCLAMP_PERCENT_SCALE,
+               .util = SCHED_CAPACITY_SCALE,
+               .ret = 0,
+       };
+
+       buf = strim(buf);
+       if (strcmp(buf, "max")) {
+               req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
+                                            &req.percent);
+               if (req.ret)
+                       return req;
+               /*
+                * req.percent is signed: compare it as unsigned so that
+                * negative inputs (e.g. "-1") are rejected too, instead of
+                * being shifted and stored as a huge clamp value.
+                */
+               if ((u64)req.percent > UCLAMP_PERCENT_SCALE) {
+                       req.ret = -ERANGE;
+                       return req;
+               }
+
+               req.util = req.percent << SCHED_CAPACITY_SHIFT;
+               req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
+       }
+
+       return req;
+}
+
+/*
+ * Common write handler for the cpu.uclamp.{min,max} cgroup files.
+ *
+ * Parses @buf via capacity_from_percent() and, under uclamp_mutex, stores
+ * the result as the @clamp_id request of the task group backing @of.
+ * Returns @nbytes on success or the parsing error code. @off is unused.
+ */
+static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
+                               size_t nbytes, loff_t off,
+                               enum uclamp_id clamp_id)
+{
+       struct uclamp_request req;
+       struct task_group *tg;
+
+       /* Validate the input before taking any lock */
+       req = capacity_from_percent(buf);
+       if (req.ret)
+               return req.ret;
+
+       /* Serialize against the other users of uclamp_mutex */
+       mutex_lock(&uclamp_mutex);
+       rcu_read_lock();
+
+       tg = css_tg(of_css(of));
+       /* Only touch the clamp request when the value really changes */
+       if (tg->uclamp_req[clamp_id].value != req.util)
+               uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
+
+       /*
+        * Because of not recoverable conversion rounding we keep track of the
+        * exact requested value
+        */
+       tg->uclamp_pct[clamp_id] = req.percent;
+
+       rcu_read_unlock();
+       mutex_unlock(&uclamp_mutex);
+
+       return nbytes;
+}
+
+/* Write handler of the "uclamp.min" file: updates the UCLAMP_MIN request */
+static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
+                                   char *buf, size_t nbytes,
+                                   loff_t off)
+{
+       const enum uclamp_id id = UCLAMP_MIN;
+
+       return cpu_uclamp_write(of, buf, nbytes, off, id);
+}
+
+/* Write handler of the "uclamp.max" file: updates the UCLAMP_MAX request */
+static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
+                                   char *buf, size_t nbytes,
+                                   loff_t off)
+{
+       const enum uclamp_id id = UCLAMP_MAX;
+
+       return cpu_uclamp_write(of, buf, nbytes, off, id);
+}
+
+/*
+ * Print the @clamp_id clamp request of the task group backing @sf.
+ *
+ * Emits "max" when the request equals SCHED_CAPACITY_SCALE; otherwise the
+ * stored percentage is printed with UCLAMP_PERCENT_SHIFT decimal digits.
+ */
+static inline void cpu_uclamp_print(struct seq_file *sf,
+                                   enum uclamp_id clamp_id)
+{
+       struct task_group *tg;
+       u64 util_clamp;
+       u64 percent;
+       u32 rem;
+
+       rcu_read_lock();
+       tg = css_tg(seq_css(sf));
+       util_clamp = tg->uclamp_req[clamp_id].value;
+       rcu_read_unlock();
+
+       if (util_clamp == SCHED_CAPACITY_SCALE) {
+               seq_puts(sf, "max\n");
+               return;
+       }
+
+       /*
+        * Report the percentage as written by user-space rather than
+        * converting util_clamp back, which would expose rounding errors.
+        *
+        * NOTE(review): tg is dereferenced here outside the RCU read-side
+        * section above; presumably the open file pins the css - confirm.
+        */
+       percent = tg->uclamp_pct[clamp_id];
+       /* Split into integer part and UCLAMP_PERCENT_SHIFT decimal digits */
+       percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
+       seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
+}
+
+/* seq_show handler of the "uclamp.min" file; always returns 0 */
+static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
+{
+       const enum uclamp_id id = UCLAMP_MIN;
+
+       cpu_uclamp_print(sf, id);
+
+       return 0;
+}
+
+/* seq_show handler of the "uclamp.max" file; always returns 0 */
+static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
+{
+       const enum uclamp_id id = UCLAMP_MAX;
+
+       cpu_uclamp_print(sf, id);
+
+       return 0;
+}
+#endif /* CONFIG_UCLAMP_TASK_GROUP */
+
  #ifdef CONFIG_FAIR_GROUP_SCHED
  static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
                                 struct cftype *cftype, u64 shareval)
@@ -7381,6 +7538,20 @@ static struct cftype cpu_legacy_files[] = {
                 .read_u64 = cpu_rt_period_read_uint,
                 .write_u64 = cpu_rt_period_write_uint,
         },
+#endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+       {
+               .name = "uclamp.min",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_min_show,
+               .write = cpu_uclamp_min_write,
+       },
+       {
+               .name = "uclamp.max",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_max_show,
+               .write = cpu_uclamp_max_write,
+       },
  #endif
         { }     /* Terminate */
  };
@@ -7548,6 +7719,20 @@ static struct cftype cpu_files[] = {
                 .seq_show = cpu_max_show,
                 .write = cpu_max_write,
         },
+#endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+       {
+               .name = "uclamp.min",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_min_show,
+               .write = cpu_uclamp_min_write,
+       },
+       {
+               .name = "uclamp.max",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_uclamp_max_show,
+               .write = cpu_uclamp_max_write,
+       },
  #endif
         { }     /* terminate */
  };
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index 7111e3a1eeb4d929f2ea46a50c2fa62df65ab452..ae1be61fb279257763f12c33bdbdbe8e263879d0 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -391,6 +391,14 @@ struct task_group {
  #endif
  
         struct cfs_bandwidth    cfs_bandwidth;
+
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+       /* The two decimal precision [%] value requested from user-space */
+       unsigned int            uclamp_pct[UCLAMP_CNT];
+       /* Clamp values requested for a task group */
+       struct uclamp_se        uclamp_req[UCLAMP_CNT];
+#endif
+
  };
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
author	Patrick Bellasi <patrick.bellasi@arm.com>
	Thu, 22 Aug 2019 13:28:06 +0000 (14:28 +0100)
committer	Ingo Molnar <mingo@kernel.org>
	Tue, 3 Sep 2019 07:17:37 +0000 (09:17 +0200)
Documentation/admin-guide/cgroup-v2.rst		patch \| blob \| blame \| history
init/Kconfig		patch \| blob \| blame \| history
kernel/sched/core.c		patch \| blob \| blame \| history
kernel/sched/sched.h		patch \| blob \| blame \| history