import 14.2.4 nautilus point release

[ceph.git] / ceph / src / common / config.cc
diff --git a/ceph/src/common/config.cc b/ceph/src/common/config.cc

index 1cd96ae99a2a3cccd509c5d1f8e5392de03dca7f..4154ac66194f9ed2f6df46ee7f77351c48871f91 100644 (file)
--- a/ceph/src/common/config.cc
+++ b/ceph/src/common/config.cc
@@ -475,19 +475,93 @@ void md_config_t::parse_env(unsigned entity_type,
        _set_val(values, tracker, dir, *o, CONF_ENV, &err);
      }
    }
-  if (auto pod_req = getenv("POD_MEMORY_REQUEST"); pod_req) {
+
+  // Apply pod memory limits:
+  //
+  // There are two types of resource requests: `limits` and `requests`.
+  //
+  // - Requests: Used by the K8s scheduler to determine on which nodes to
+  //   schedule the pods. This helps spread the pods to different nodes. This
+  //   value should be conservative in order to make sure all the pods are
+  //   schedulable. This corresponds to POD_MEMORY_REQUEST (set by the Rook
+  //   CRD) and is the target memory utilization we try to maintain for daemons
+  //   that respect it.
+  //
+  //   If POD_MEMORY_REQUEST is present, we use it as the target.
+  //
+  // - Limits: At runtime, the container runtime (and Linux) will use the
+  //   limits to see if the pod is using too many resources. In that case, the
+  //   pod will be killed/restarted automatically if the pod goes over the limit.
+  //   This should be higher than what is specified for requests (potentially
+  //   much higher). This corresponds to the cgroup memory limit that will
+  //   trigger the Linux OOM killer.
+  //
+  //   If POD_MEMORY_LIMIT is present, we use it as the /default/ value for
+  //   the target, which means it will only apply if the *_memory_target option
+  //   isn't set via some other path (e.g., POD_MEMORY_REQUEST, or the cluster
+  //   config, or whatever.)
+  //
+  // Here are the documented best practices:
+  //   https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/#motivation-for-cpu-requests-and-limits
+  //
+  // When the operator creates the CephCluster CR, it will need to generate the
+  // desired requests and limits. As long as we are conservative in our choice
+  // for requests and generous with the limits we should be in a good place to
+  // get started.
+  //
+  // The support in Rook is already there for applying the limits as seen in
+  // these links.
+  //
+  // Rook docs on the resource requests and limits:
+  //   https://rook.io/docs/rook/v1.0/ceph-cluster-crd.html#cluster-wide-resources-configuration-settings
+  // Example CR settings:
+  //   https://github.com/rook/rook/blob/6d2ef936698593036185aabcb00d1d74f9c7bfc1/cluster/examples/kubernetes/ceph/cluster.yaml#L90
+  //
+  uint64_t pod_limit = 0, pod_request = 0;
+  if (auto pod_lim = getenv("POD_MEMORY_LIMIT"); pod_lim) {
      string err;
-    uint64_t v = atoll(pod_req);
+    uint64_t v = atoll(pod_lim);
      if (v) {
        switch (entity_type) {
        case CEPH_ENTITY_TYPE_OSD:
-       _set_val(values, tracker, stringify(v),
-                *find_option("osd_memory_target"),
-                CONF_ENV, &err);
-       break;
+        {
+         double cgroup_ratio = get_val<double>(
+           values, "osd_memory_target_cgroup_limit_ratio");
+         if (cgroup_ratio > 0.0) {
+           pod_limit = v * cgroup_ratio;
+           // set osd_memory_target *default* based on cgroup limit, so that
+           // it can be overridden by any explicit settings elsewhere.
+           set_val_default(values, tracker,
+                           "osd_memory_target", stringify(pod_limit));
+         }
+       }
        }
      }
    }
+  if (auto pod_req = getenv("POD_MEMORY_REQUEST"); pod_req) {
+    if (uint64_t v = atoll(pod_req); v) {
+      pod_request = v;
+    }
+  }
+  if (pod_request && pod_limit) {
+    // If both LIMIT and REQUEST are set, ensure that we use the
+    // min of request and limit*ratio.  This is important
+    // because k8s sets LIMIT == REQUEST if only LIMIT is
+    // specified, and we want to apply the ratio in that case,
+    // even though REQUEST is present.
+    pod_request = std::min<uint64_t>(pod_request, pod_limit);
+  }
+  if (pod_request) {
+    string err;
+    switch (entity_type) {
+    case CEPH_ENTITY_TYPE_OSD:
+      _set_val(values, tracker, stringify(pod_request),
+              *find_option("osd_memory_target"),
+              CONF_ENV, &err);
+      break;
+    }
+  }
+
    if (getenv(args_var)) {
      vector<const char *> env_args;
      env_to_vec(env_args, args_var);
@@ -1458,14 +1532,16 @@ void md_config_t::diff(
    string name) const
  {
    values.for_each([this, f, &values] (auto& name, auto& configs) {
-    if (configs.size() == 1 &&
-       configs.begin()->first == CONF_DEFAULT) {
-      // we only have a default value; exclude from diff
+    if (configs.empty()) {
        return;
      }
      f->open_object_section(name.c_str());
      const Option *o = find_option(name);
-    dump(f, CONF_DEFAULT, _get_val_default(*o));
+    if (configs.size() &&
+       configs.begin()->first != CONF_DEFAULT) {
+      // show compiled-in default only if an override default wasn't provided
+      dump(f, CONF_DEFAULT, _get_val_default(*o));
+    }
      for (auto& j : configs) {
        dump(f, j.first, j.second);
      }