update sources to v12.2.3

diff --git a/ceph/src/mgr/DaemonServer.cc b/ceph/src/mgr/DaemonServer.cc
index 6454c8da306a52b96bd631527f2b61c2dc09e5b9..29f380ca5a6b5d398ca6d613c27062a939f6a65e 100644
--- a/ceph/src/mgr/DaemonServer.cc
+++ b/ceph/src/mgr/DaemonServer.cc
  */
 
 #include "DaemonServer.h"
+#include "mgr/Mgr.h"
 
+#include "include/stringify.h"
 #include "include/str_list.h"
 #include "auth/RotatingKeyRing.h"
 #include "json_spirit/json_spirit_writer.h"
 
+#include "mgr/mgr_commands.h"
+#include "mgr/OSDHealthMetricCollector.h"
+#include "mon/MonCommand.h"
+
 #include "messages/MMgrOpen.h"
 #include "messages/MMgrConfigure.h"
 #include "messages/MMonMgrReport.h"
@@ -24,6 +30,7 @@
 #include "messages/MCommandReply.h"
 #include "messages/MPGStats.h"
 #include "messages/MOSDScrub.h"
+#include "messages/MOSDForceRecovery.h"
 #include "common/errno.h"
 
 #define dout_context g_ceph_context
 #undef dout_prefix
 #define dout_prefix *_dout << "mgr.server " << __func__ << " "
 
+
+
 DaemonServer::DaemonServer(MonClient *monc_,
                            Finisher &finisher_,
                           DaemonStateIndex &daemon_state_,
                           ClusterState &cluster_state_,
-                          PyModules &py_modules_,
+                          PyModuleRegistry &py_modules_,
                           LogChannelRef clog_,
                           LogChannelRef audit_clog_)
     : Dispatcher(g_ceph_context),
       client_byte_throttler(new Throttle(g_ceph_context, "mgr_client_bytes",
-                                        g_conf->mgr_client_bytes)),
+                                        g_conf->get_val<uint64_t>("mgr_client_bytes"))),
       client_msg_throttler(new Throttle(g_ceph_context, "mgr_client_messages",
-                                       g_conf->mgr_client_messages)),
+                                       g_conf->get_val<uint64_t>("mgr_client_messages"))),
       osd_byte_throttler(new Throttle(g_ceph_context, "mgr_osd_bytes",
-                                     g_conf->mgr_osd_bytes)),
+                                     g_conf->get_val<uint64_t>("mgr_osd_bytes"))),
       osd_msg_throttler(new Throttle(g_ceph_context, "mgr_osd_messsages",
-                                    g_conf->mgr_osd_messages)),
+                                    g_conf->get_val<uint64_t>("mgr_osd_messages"))),
       mds_byte_throttler(new Throttle(g_ceph_context, "mgr_mds_bytes",
-                                     g_conf->mgr_mds_bytes)),
+                                     g_conf->get_val<uint64_t>("mgr_mds_bytes"))),
       mds_msg_throttler(new Throttle(g_ceph_context, "mgr_mds_messsages",
-                                    g_conf->mgr_mds_messages)),
+                                    g_conf->get_val<uint64_t>("mgr_mds_messages"))),
       mon_byte_throttler(new Throttle(g_ceph_context, "mgr_mon_bytes",
-                                     g_conf->mgr_mon_bytes)),
+                                     g_conf->get_val<uint64_t>("mgr_mon_bytes"))),
       mon_msg_throttler(new Throttle(g_ceph_context, "mgr_mon_messsages",
-                                    g_conf->mgr_mon_messages)),
+                                    g_conf->get_val<uint64_t>("mgr_mon_messages"))),
       msgr(nullptr),
       monc(monc_),
       finisher(finisher_),
@@ -63,15 +72,23 @@ DaemonServer::DaemonServer(MonClient *monc_,
       py_modules(py_modules_),
       clog(clog_),
       audit_clog(audit_clog_),
-      auth_registry(g_ceph_context,
+      auth_cluster_registry(g_ceph_context,
                     g_conf->auth_supported.empty() ?
                       g_conf->auth_cluster_required :
                       g_conf->auth_supported),
-      lock("DaemonServer")
-{}
+      auth_service_registry(g_ceph_context,
+                   g_conf->auth_supported.empty() ?
+                      g_conf->auth_service_required :
+                      g_conf->auth_supported),
+      lock("DaemonServer"),
+      pgmap_ready(false)
+{
+  g_conf->add_observer(this);
+}
 
 DaemonServer::~DaemonServer() {
   delete msgr;
+  g_conf->remove_observer(this);
 }
 
 int DaemonServer::init(uint64_t gid, entity_addr_t client_addr)
@@ -132,7 +149,15 @@ bool DaemonServer::ms_verify_authorizer(Connection *con,
     bool& is_valid,
     CryptoKey& session_key)
 {
-  auto handler = auth_registry.get_handler(protocol);
+  AuthAuthorizeHandler *handler = nullptr;
+  if (peer_type == CEPH_ENTITY_TYPE_OSD ||
+      peer_type == CEPH_ENTITY_TYPE_MON ||
+      peer_type == CEPH_ENTITY_TYPE_MDS ||
+      peer_type == CEPH_ENTITY_TYPE_MGR) {
+    handler = auth_cluster_registry.get_handler(protocol);
+  } else {
+    handler = auth_service_registry.get_handler(protocol);
+  }
   if (!handler) {
     dout(0) << "No AuthAuthorizeHandler found for protocol " << protocol << dendl;
     is_valid = false;
@@ -143,12 +168,18 @@ bool DaemonServer::ms_verify_authorizer(Connection *con,
   s->inst.addr = con->get_peer_addr();
   AuthCapsInfo caps_info;
 
-  is_valid = handler->verify_authorizer(
-    cct, monc->rotating_secrets.get(),
-    authorizer_data,
-    authorizer_reply, s->entity_name,
-    s->global_id, caps_info,
-    session_key);
+  RotatingKeyRing *keys = monc->rotating_secrets.get();
+  if (keys) {
+    is_valid = handler->verify_authorizer(
+      cct, keys,
+      authorizer_data,
+      authorizer_reply, s->entity_name,
+      s->global_id, caps_info,
+      session_key);
+  } else {
+    dout(10) << __func__ << " no rotating_keys (yet), denied" << dendl;
+    is_valid = false;
+  }
 
   if (is_valid) {
     if (caps_info.allow_all) {
@@ -220,6 +251,11 @@ bool DaemonServer::ms_handle_reset(Connection *con)
     dout(10) << "unregistering osd." << session->osd_id
             << "  session " << session << " con " << con << dendl;
     osd_cons[session->osd_id].erase(con);
+
+    auto iter = daemon_connections.find(con);
+    if (iter != daemon_connections.end()) {
+      daemon_connections.erase(iter);
+    }
   }
   return false;
 }
@@ -232,8 +268,9 @@ bool DaemonServer::ms_handle_refused(Connection *con)
 
 bool DaemonServer::ms_dispatch(Message *m)
 {
-  Mutex::Locker l(lock);
-
+  // Note that we do *not* take ::lock here, in order to avoid
+  // serializing all message handling.  It's up to each handler
+  // to take whatever locks it needs.
   switch (m->get_type()) {
     case MSG_PGSTATS:
       cluster_state.ingest_pgstats(static_cast<MPGStats*>(m));
@@ -254,29 +291,36 @@ bool DaemonServer::ms_dispatch(Message *m)
 
 void DaemonServer::maybe_ready(int32_t osd_id)
 {
-  if (!pgmap_ready && reported_osds.find(osd_id) == reported_osds.end()) {
-    dout(4) << "initial report from osd " << osd_id << dendl;
-    reported_osds.insert(osd_id);
-    std::set<int32_t> up_osds;
+  if (pgmap_ready.load()) {
+    // Fast path: we don't need to take lock because pgmap_ready
+    // is already set
+  } else {
+    Mutex::Locker l(lock);
 
-    cluster_state.with_osdmap([&](const OSDMap& osdmap) {
-        osdmap.get_up_osds(up_osds);
-    });
+    if (reported_osds.find(osd_id) == reported_osds.end()) {
+      dout(4) << "initial report from osd " << osd_id << dendl;
+      reported_osds.insert(osd_id);
+      std::set<int32_t> up_osds;
 
-    std::set<int32_t> unreported_osds;
-    std::set_difference(up_osds.begin(), up_osds.end(),
-                        reported_osds.begin(), reported_osds.end(),
-                        std::inserter(unreported_osds, unreported_osds.begin()));
+      cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+          osdmap.get_up_osds(up_osds);
+      });
 
-    if (unreported_osds.size() == 0) {
-      dout(4) << "all osds have reported, sending PG state to mon" << dendl;
-      pgmap_ready = true;
-      reported_osds.clear();
-      // Avoid waiting for next tick
-      send_report();
-    } else {
-      dout(4) << "still waiting for " << unreported_osds.size() << " osds"
-                 " to report in before PGMap is ready" << dendl;
+      std::set<int32_t> unreported_osds;
+      std::set_difference(up_osds.begin(), up_osds.end(),
+                          reported_osds.begin(), reported_osds.end(),
+                          std::inserter(unreported_osds, unreported_osds.begin()));
+
+      if (unreported_osds.size() == 0) {
+        dout(4) << "all osds have reported, sending PG state to mon" << dendl;
+        pgmap_ready = true;
+        reported_osds.clear();
+        // Avoid waiting for next tick
+        send_report();
+      } else {
+        dout(4) << "still waiting for " << unreported_osds.size() << " osds"
+                   " to report in before PGMap is ready" << dendl;
+      }
     }
   }
 }
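
The rewritten maybe_ready() above pairs an atomic pgmap_ready flag (checked without the lock) with a mutex-protected slow path, which is also why ms_dispatch() no longer serializes all message handling. A minimal stand-alone sketch of that pattern, using std::atomic and std::mutex in place of Ceph's Mutex and the real OSD bookkeeping:

    // Hedged sketch, not Ceph code: fast-path readiness flag plus a
    // lock-protected "who has reported" set, as in maybe_ready() above.
    #include <atomic>
    #include <cstdint>
    #include <mutex>
    #include <set>

    class ReadyTracker {
      std::atomic<bool> ready{false};   // fast path: read without taking the lock
      std::mutex lock;                  // guards `reported` on the slow path
      std::set<int32_t> reported;
    public:
      // Returns true once every id in `expected` has reported at least once.
      bool maybe_ready(int32_t id, const std::set<int32_t>& expected) {
        if (ready.load()) {
          return true;                  // already ready: skip the lock entirely
        }
        std::lock_guard<std::mutex> l(lock);
        reported.insert(id);
        for (auto e : expected) {
          if (!reported.count(e)) {
            return false;               // still waiting on at least one reporter
          }
        }
        ready = true;                   // everyone has reported; flip the flag once
        reported.clear();
        return true;
      }
    };

    int main() {
      ReadyTracker t;
      std::set<int32_t> up{0, 1};
      t.maybe_ready(0, up);                 // false: id 1 has not reported yet
      return t.maybe_ready(1, up) ? 0 : 1;  // true: all expected ids have reported
    }
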
@@ -293,6 +337,8 @@ void DaemonServer::shutdown()
 
 bool DaemonServer::handle_open(MMgrOpen *m)
 {
+  Mutex::Locker l(lock);
+
   DaemonKey key;
   if (!m->service_name.empty()) {
     key.first = m->service_name;
@@ -303,20 +349,20 @@ bool DaemonServer::handle_open(MMgrOpen *m)
 
   dout(4) << "from " << m->get_connection() << "  " << key << dendl;
 
-  auto configure = new MMgrConfigure();
-  configure->stats_period = g_conf->mgr_stats_period;
-  m->get_connection()->send_message(configure);
+  _send_configure(m->get_connection());
 
+  DaemonStatePtr daemon;
   if (daemon_state.exists(key)) {
+    daemon = daemon_state.get(key);
+  }
+  if (daemon) {
     dout(20) << "updating existing DaemonState for " << m->daemon_name << dendl;
-    daemon_state.get(key)->perf_counters.clear();
+    Mutex::Locker l(daemon->lock);
+    daemon->perf_counters.clear();
   }
 
   if (m->service_daemon) {
-    DaemonStatePtr daemon;
-    if (daemon_state.exists(key)) {
-      daemon = daemon_state.get(key);
-    } else {
+    if (!daemon) {
       dout(4) << "constructing new DaemonState for " << key << dendl;
       daemon = std::make_shared<DaemonState>(daemon_state.types);
       daemon->key = key;
@@ -325,6 +371,7 @@ bool DaemonServer::handle_open(MMgrOpen *m)
       }
       daemon_state.insert(daemon);
     }
+    Mutex::Locker l(daemon->lock);
     daemon->service_daemon = true;
     daemon->metadata = m->daemon_metadata;
     daemon->service_status = m->daemon_status;
@@ -343,6 +390,15 @@ bool DaemonServer::handle_open(MMgrOpen *m)
     }
   }
 
+  if (m->get_connection()->get_peer_type() != entity_name_t::TYPE_CLIENT &&
+      m->service_name.empty())
+  {
+    // Store in set of the daemon/service connections, i.e. those
+    // connections that require an update in the event of stats
+    // configuration changes.
+    daemon_connections.insert(m->get_connection());
+  }
+
   m->put();
   return true;
 }
@@ -365,61 +421,103 @@ bool DaemonServer::handle_report(MMgrReport *m)
     // themselves to be a daemon for some service.
     dout(4) << "rejecting report from non-daemon client " << m->daemon_name
            << dendl;
+    m->get_connection()->mark_down();
     m->put();
     return true;
   }
 
+  // Look up the DaemonState
   DaemonStatePtr daemon;
   if (daemon_state.exists(key)) {
     dout(20) << "updating existing DaemonState for " << key << dendl;
     daemon = daemon_state.get(key);
   } else {
-    dout(4) << "constructing new DaemonState for " << key << dendl;
-    daemon = std::make_shared<DaemonState>(daemon_state.types);
-    // FIXME: crap, we don't know the hostname at this stage.
-    daemon->key = key;
-    daemon_state.insert(daemon);
-    // FIXME: we should avoid this case by rejecting MMgrReport from
-    // daemons without sessions, and ensuring that session open
-    // always contains metadata.
+    // we don't know the hostname at this stage, reject MMgrReport here.
+    dout(5) << "rejecting report from " << key << ", since we do not have its metadata now."
+           << dendl;
+
+    // issue metadata request in background
+    if (!daemon_state.is_updating(key) && 
+       (key.first == "osd" || key.first == "mds")) {
+
+      std::ostringstream oss;
+      auto c = new MetadataUpdate(daemon_state, key);
+      if (key.first == "osd") {
+        oss << "{\"prefix\": \"osd metadata\", \"id\": "
+            << key.second<< "}";
+
+      } else if (key.first == "mds") {
+        c->set_default("addr", stringify(m->get_source_addr()));
+        oss << "{\"prefix\": \"mds metadata\", \"who\": \""
+            << key.second << "\"}";
+      } else {
+       ceph_abort();
+      }
+
+      monc->start_mon_command({oss.str()}, {}, &c->outbl, &c->outs, c);
+    }
+    
+    {
+      Mutex::Locker l(lock);
+      // kill session
+      MgrSessionRef session(static_cast<MgrSession*>(m->get_connection()->get_priv()));
+      if (!session) {
+       return false;
+      }
+      m->get_connection()->mark_down();
+      session->put();
+
+      dout(10) << "unregistering osd." << session->osd_id
+              << "  session " << session << " con " << m->get_connection() << dendl;
+      
+      if (osd_cons.find(session->osd_id) != osd_cons.end()) {
+          osd_cons[session->osd_id].erase(m->get_connection());
+      } 
+
+      auto iter = daemon_connections.find(m->get_connection());
+      if (iter != daemon_connections.end()) {
+       daemon_connections.erase(iter);
+      }
+    }
+
+    return false;
   }
-  assert(daemon != nullptr);
-  auto &daemon_counters = daemon->perf_counters;
-  daemon_counters.update(m);
 
-  if (daemon->service_daemon) {
-    utime_t now = ceph_clock_now();
-    if (m->daemon_status) {
-      daemon->service_status = *m->daemon_status;
-      daemon->service_status_stamp = now;
+  // Update the DaemonState
+  assert(daemon != nullptr);
+  {
+    Mutex::Locker l(daemon->lock);
+    auto &daemon_counters = daemon->perf_counters;
+    daemon_counters.update(m);
+
+    if (daemon->service_daemon) {
+      utime_t now = ceph_clock_now();
+      if (m->daemon_status) {
+        daemon->service_status = *m->daemon_status;
+        daemon->service_status_stamp = now;
+      }
+      daemon->last_service_beacon = now;
+    } else if (m->daemon_status) {
+      derr << "got status from non-daemon " << key << dendl;
     }
-    daemon->last_service_beacon = now;
-  } else if (m->daemon_status) {
-    derr << "got status from non-daemon " << key << dendl;
+    if (m->get_connection()->peer_is_osd()) {
+      // only OSD sends health_checks to me now
+      daemon->osd_health_metrics = std::move(m->osd_health_metrics);
+    }
+  }
+
+  // if there are any schema updates, notify the python modules
+  if (!m->declare_types.empty() || !m->undeclare_types.empty()) {
+    ostringstream oss;
+    oss << key.first << '.' << key.second;
+    py_modules.notify_all("perf_schema_update", oss.str());
   }
 
   m->put();
   return true;
 }
 
-struct MgrCommand {
-  string cmdstring;
-  string helpstring;
-  string module;
-  string perm;
-  string availability;
-
-  bool requires_perm(char p) const {
-    return (perm.find(p) != string::npos);
-  }
-
-} mgr_commands[] = {
-
-#define COMMAND(parsesig, helptext, module, perm, availability) \
-  {parsesig, helptext, module, perm, availability},
-#include "MgrCommands.h"
-#undef COMMAND
-};
 
 void DaemonServer::_generate_command_map(
   map<string,cmd_vartype>& cmdmap,
@@ -444,16 +542,14 @@ void DaemonServer::_generate_command_map(
   }
 }
 
-const MgrCommand *DaemonServer::_get_mgrcommand(
+const MonCommand *DaemonServer::_get_mgrcommand(
   const string &cmd_prefix,
-  MgrCommand *cmds,
-  int cmds_size)
+  const std::vector<MonCommand> &cmds)
 {
-  MgrCommand *this_cmd = NULL;
-  for (MgrCommand *cp = cmds;
-       cp < &cmds[cmds_size]; cp++) {
-    if (cp->cmdstring.compare(0, cmd_prefix.size(), cmd_prefix) == 0) {
-      this_cmd = cp;
+  const MonCommand *this_cmd = nullptr;
+  for (const auto &cmd : cmds) {
+    if (cmd.cmdstring.compare(0, cmd_prefix.size(), cmd_prefix) == 0) {
+      this_cmd = &cmd;
       break;
     }
   }
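
_get_mgrcommand() treats a command descriptor as matching when its cmdstring begins with the requested prefix. A tiny illustration of that comparison (the descriptor string below is invented for the example, not copied from MgrCommands.h):

    #include <cassert>
    #include <string>

    int main() {
      std::string cmdstring = "pg force-recovery name=pgid,type=CephPgid,n=N";
      std::string prefix    = "pg force-recovery";
      // compare(0, prefix.size(), prefix) == 0  <=>  cmdstring starts with prefix
      assert(cmdstring.compare(0, prefix.size(), prefix) == 0);
      return 0;
    }
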
@@ -466,7 +562,7 @@ bool DaemonServer::_allowed_command(
   const string &prefix,
   const map<string,cmd_vartype>& cmdmap,
   const map<string,string>& param_str_map,
-  const MgrCommand *this_cmd) {
+  const MonCommand *this_cmd) {
 
   if (s->entity_name.is_mon()) {
     // mon is all-powerful.  even when it is forwarding commands on behalf of
@@ -492,6 +588,7 @@ bool DaemonServer::_allowed_command(
 
 bool DaemonServer::handle_command(MCommand *m)
 {
+  Mutex::Locker l(lock);
   int r = 0;
   std::stringstream ss;
   std::string prefix;
@@ -593,30 +690,29 @@ bool DaemonServer::handle_command(MCommand *m)
   dout(4) << "prefix=" << prefix << dendl;
 
   if (prefix == "get_command_descriptions") {
-    int cmdnum = 0;
-
     dout(10) << "reading commands from python modules" << dendl;
-    auto py_commands = py_modules.get_commands();
+    const auto py_commands = py_modules.get_commands();
 
+    int cmdnum = 0;
     JSONFormatter f;
     f.open_object_section("command_descriptions");
-    for (const auto &pyc : py_commands) {
+
+    auto dump_cmd = [&cmdnum, &f](const MonCommand &mc){
       ostringstream secname;
       secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
-      dout(20) << "Dumping " << pyc.cmdstring << " (" << pyc.helpstring
-               << ")" << dendl;
-      dump_cmddesc_to_json(&f, secname.str(), pyc.cmdstring, pyc.helpstring,
-                          "mgr", pyc.perm, "cli", 0);
+      dump_cmddesc_to_json(&f, secname.str(), mc.cmdstring, mc.helpstring,
+                           mc.module, mc.req_perms, mc.availability, 0);
       cmdnum++;
+    };
+
+    for (const auto &pyc : py_commands) {
+      dump_cmd(pyc);
     }
 
-    for (const auto &cp : mgr_commands) {
-      ostringstream secname;
-      secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
-      dump_cmddesc_to_json(&f, secname.str(), cp.cmdstring, cp.helpstring,
-                          cp.module, cp.perm, cp.availability, 0);
-      cmdnum++;
+    for (const auto &mgr_cmd : mgr_commands) {
+      dump_cmd(mgr_cmd);
     }
+
     f.close_section(); // command_descriptions
     f.flush(cmdctx->odata);
     cmdctx->reply(0, ss);
@@ -624,11 +720,10 @@ bool DaemonServer::handle_command(MCommand *m)
   }
 
   // lookup command
-  const MgrCommand *mgr_cmd = _get_mgrcommand(prefix, mgr_commands,
-                                              ARRAY_SIZE(mgr_commands));
+  const MonCommand *mgr_cmd = _get_mgrcommand(prefix, mgr_commands);
   _generate_command_map(cmdctx->cmdmap, param_str_map);
   if (!mgr_cmd) {
-    MgrCommand py_command = {"", "", "py", "rw", "cli"};
+    MonCommand py_command = {"", "", "py", "rw", "cli"};
     if (!_allowed_command(session.get(), py_command.module, prefix, cmdctx->cmdmap,
                           param_str_map, &py_command)) {
       dout(1) << " access denied" << dendl;
@@ -685,6 +780,7 @@ bool DaemonServer::handle_command(MCommand *m)
        DaemonKey key(p.first, q.first);
        assert(daemon_state.exists(key));
        auto daemon = daemon_state.get(key);
+       Mutex::Locker l(daemon->lock);
        f->dump_stream("status_stamp") << daemon->service_status_stamp;
        f->dump_stream("last_beacon") << daemon->last_service_beacon;
        f->open_object_section("status");
@@ -702,6 +798,19 @@ bool DaemonServer::handle_command(MCommand *m)
     return true;
   }
 
+  if (prefix == "config set") {
+    std::string key;
+    std::string val;
+    cmd_getval(cct, cmdctx->cmdmap, "key", key);
+    cmd_getval(cct, cmdctx->cmdmap, "value", val);
+    r = cct->_conf->set_val(key, val, true, &ss);
+    if (r == 0) {
+      cct->_conf->apply_changes(nullptr);
+    }
+    cmdctx->reply(0, ss);
+    return true;
+  }
+
   // -----------
   // PG commands
 
@@ -921,6 +1030,305 @@ bool DaemonServer::handle_command(MCommand *m)
       });
     cmdctx->reply(r, "");
     return true;
+  } else if (prefix == "osd safe-to-destroy") {
+    vector<string> ids;
+    cmd_getval(g_ceph_context, cmdctx->cmdmap, "ids", ids);
+    set<int> osds;
+    int r;
+    cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+       r = osdmap.parse_osd_id_list(ids, &osds, &ss);
+      });
+    if (!r && osds.empty()) {
+      ss << "must specify one or more OSDs";
+      r = -EINVAL;
+    }
+    if (r < 0) {
+      cmdctx->reply(r, ss);
+      return true;
+    }
+    set<int> active_osds, missing_stats, stored_pgs;
+    int affected_pgs = 0;
+    cluster_state.with_pgmap([&](const PGMap& pg_map) {
+       if (pg_map.num_pg_unknown > 0) {
+         ss << pg_map.num_pg_unknown << " pgs have unknown state; cannot draw"
+            << " any conclusions";
+         r = -EAGAIN;
+         return;
+       }
+       int num_active_clean = 0;
+       for (auto& p : pg_map.num_pg_by_state) {
+         unsigned want = PG_STATE_ACTIVE|PG_STATE_CLEAN;
+         if ((p.first & want) == want) {
+           num_active_clean += p.second;
+         }
+       }
+       cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+           for (auto osd : osds) {
+             if (!osdmap.exists(osd)) {
+               continue;  // clearly safe to destroy
+             }
+             auto q = pg_map.num_pg_by_osd.find(osd);
+             if (q != pg_map.num_pg_by_osd.end()) {
+               if (q->second.acting > 0 || q->second.up > 0) {
+                 active_osds.insert(osd);
+                 affected_pgs += q->second.acting + q->second.up;
+                 continue;
+               }
+             }
+             if (num_active_clean < pg_map.num_pg) {
+               // all pgs aren't active+clean; we need to be careful.
+               auto p = pg_map.osd_stat.find(osd);
+               if (p == pg_map.osd_stat.end()) {
+                 missing_stats.insert(osd);
+               } else if (p->second.num_pgs > 0) {
+                 stored_pgs.insert(osd);
+               }
+             }
+           }
+         });
+      });
+    if (!r && !active_osds.empty()) {
+      ss << "OSD(s) " << active_osds << " have " << affected_pgs
+        << " pgs currently mapped to them";
+      r = -EBUSY;
+    } else if (!missing_stats.empty()) {
+      ss << "OSD(s) " << missing_stats << " have no reported stats, and not all"
+        << " PGs are active+clean; we cannot draw any conclusions";
+      r = -EAGAIN;
+    } else if (!stored_pgs.empty()) {
+      ss << "OSD(s) " << stored_pgs << " last reported they still store some PG"
+        << " data, and not all PGs are active+clean; we cannot be sure they"
+        << " aren't still needed.";
+      r = -EBUSY;
+    }
+    if (r) {
+      cmdctx->reply(r, ss);
+      return true;
+    }
+    ss << "OSD(s) " << osds << " are safe to destroy without reducing data"
+       << " durability.";
+    cmdctx->reply(0, ss);
+    return true;
+  } else if (prefix == "osd ok-to-stop") {
+    vector<string> ids;
+    cmd_getval(g_ceph_context, cmdctx->cmdmap, "ids", ids);
+    set<int> osds;
+    int r;
+    cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+       r = osdmap.parse_osd_id_list(ids, &osds, &ss);
+      });
+    if (!r && osds.empty()) {
+      ss << "must specify one or more OSDs";
+      r = -EINVAL;
+    }
+    if (r < 0) {
+      cmdctx->reply(r, ss);
+      return true;
+    }
+    map<pg_t,int> pg_delta;  // pgid -> net acting set size change
+    int dangerous_pgs = 0;
+    cluster_state.with_pgmap([&](const PGMap& pg_map) {
+       return cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+           if (pg_map.num_pg_unknown > 0) {
+             ss << pg_map.num_pg_unknown << " pgs have unknown state; "
+                << "cannot draw any conclusions";
+             r = -EAGAIN;
+             return;
+           }
+           for (auto osd : osds) {
+             auto p = pg_map.pg_by_osd.find(osd);
+             if (p != pg_map.pg_by_osd.end()) {
+               for (auto& pgid : p->second) {
+                 --pg_delta[pgid];
+               }
+             }
+           }
+           for (auto& p : pg_delta) {
+             auto q = pg_map.pg_stat.find(p.first);
+             if (q == pg_map.pg_stat.end()) {
+               ss << "missing information about " << p.first << "; cannot draw"
+                  << " any conclusions";
+               r = -EAGAIN;
+               return;
+             }
+             if (!(q->second.state & PG_STATE_ACTIVE) ||
+                 (q->second.state & PG_STATE_DEGRADED)) {
+               // we don't currently have a good way to tell *how* degraded
+               // a degraded PG is, so we have to assume we cannot remove
+               // any more replicas/shards.
+               ++dangerous_pgs;
+               continue;
+             }
+             const pg_pool_t *pi = osdmap.get_pg_pool(p.first.pool());
+             if (!pi) {
+               ++dangerous_pgs; // pool is creating or deleting
+             } else {
+               if (q->second.acting.size() + p.second < pi->min_size) {
+                 ++dangerous_pgs;
+               }
+             }
+           }
+         });
+      });
+    if (r) {
+      cmdctx->reply(r, ss);
+      return true;
+    }
+    if (dangerous_pgs) {
+      ss << dangerous_pgs << " PGs are already degraded or might become "
+        << "unavailable";
+      cmdctx->reply(-EBUSY, ss);
+      return true;
+    }
+    ss << "OSD(s) " << osds << " are ok to stop without reducing"
+       << " availability, provided there are no other concurrent failures"
+       << " or interventions. " << pg_delta.size() << " PGs are likely to be"
+       << " degraded (but remain available) as a result.";
+    cmdctx->reply(0, ss);
+    return true;
+  } else if (prefix == "pg force-recovery" ||
+              prefix == "pg force-backfill" ||
+              prefix == "pg cancel-force-recovery" ||
+              prefix == "pg cancel-force-backfill") {
+    string forceop = prefix.substr(3, string::npos);
+    list<pg_t> parsed_pgs;
+    map<int, list<pg_t> > osdpgs;
+
+    // figure out actual op just once
+    int actual_op = 0;
+    if (forceop == "force-recovery") {
+      actual_op = OFR_RECOVERY;
+    } else if (forceop == "force-backfill") {
+      actual_op = OFR_BACKFILL;
+    } else if (forceop == "cancel-force-backfill") {
+      actual_op = OFR_BACKFILL | OFR_CANCEL;
+    } else if (forceop == "cancel-force-recovery") {
+      actual_op = OFR_RECOVERY | OFR_CANCEL;
+    }
+
+    // convert pg names to pgs, discard any invalid ones while at it
+    {
+      // we don't want to keep pgidstr and pgidstr_nodup forever
+      vector<string> pgidstr;
+      // get pgids to process and prune duplicates
+      cmd_getval(g_ceph_context, cmdctx->cmdmap, "pgid", pgidstr);
+      set<string> pgidstr_nodup(pgidstr.begin(), pgidstr.end());
+      if (pgidstr.size() != pgidstr_nodup.size()) {
+       // move elements only when there were duplicates, as this
+       // reorders them
+       pgidstr.resize(pgidstr_nodup.size());
+       auto it = pgidstr_nodup.begin();
+       for (size_t i = 0 ; i < pgidstr_nodup.size(); i++) {
+         pgidstr[i] = std::move(*it++);
+       }
+      }
+
+      cluster_state.with_pgmap([&](const PGMap& pg_map) {
+       for (auto& pstr : pgidstr) {
+         pg_t parsed_pg;
+         if (!parsed_pg.parse(pstr.c_str())) {
+           ss << "invalid pgid '" << pstr << "'; ";
+           r = -EINVAL;
+         } else {
+           auto workit = pg_map.pg_stat.find(parsed_pg);
+           if (workit == pg_map.pg_stat.end()) {
+             ss << "pg " << pstr << " does not exist; ";
+             r = -ENOENT;
+           } else {
+             pg_stat_t workpg = workit->second;
+
+             // discard pgs for which user requests are pointless
+             switch (actual_op)
+             {
+               case OFR_RECOVERY:
+                 if ((workpg.state & (PG_STATE_DEGRADED | PG_STATE_RECOVERY_WAIT | PG_STATE_RECOVERING)) == 0) {
+                   // don't return error, user script may be racing with cluster. not fatal.
+                   ss << "pg " << pstr << " doesn't require recovery; ";
+                   continue;
+                 } else  if (workpg.state & PG_STATE_FORCED_RECOVERY) {
+                   ss << "pg " << pstr << " recovery already forced; ";
+                   // return error, as it may be a bug in user script
+                   r = -EINVAL;
+                   continue;
+                 }
+                 break;
+               case OFR_BACKFILL:
+                 if ((workpg.state & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILLING)) == 0) {
+                   ss << "pg " << pstr << " doesn't require backfilling; ";
+                   continue;
+                 } else  if (workpg.state & PG_STATE_FORCED_BACKFILL) {
+                   ss << "pg " << pstr << " backfill already forced; ";
+                   r = -EINVAL;
+                   continue;
+                 }
+                 break;
+               case OFR_BACKFILL | OFR_CANCEL:
+                 if ((workpg.state & PG_STATE_FORCED_BACKFILL) == 0) {
+                   ss << "pg " << pstr << " backfill not forced; ";
+                   continue;
+                 }
+                 break;
+               case OFR_RECOVERY | OFR_CANCEL:
+                 if ((workpg.state & PG_STATE_FORCED_RECOVERY) == 0) {
+                   ss << "pg " << pstr << " recovery not forced; ";
+                   continue;
+                 }
+                 break;
+               default:
+                 assert(0 == "actual_op value is not supported");
+             }
+
+             parsed_pgs.push_back(std::move(parsed_pg));
+           }
+         }
+       }
+
+       // group pgs to process by osd
+       for (auto& pgid : parsed_pgs) {
+         auto workit = pg_map.pg_stat.find(pgid);
+         if (workit != pg_map.pg_stat.end()) {
+           pg_stat_t workpg = workit->second;
+           set<int32_t> osds(workpg.up.begin(), workpg.up.end());
+           osds.insert(workpg.acting.begin(), workpg.acting.end());
+           for (auto i : osds) {
+             osdpgs[i].push_back(pgid);
+           }
+         }
+       }
+
+      });
+    }
+
+    // respond with error only when no pgs are correct
+    // yes, in case of mixed errors, only the last one will be emitted,
+    // but the message presented will be fine
+    if (parsed_pgs.size() != 0) {
+      // clear error to not confuse users/scripts
+      r = 0;
+    }
+
+    // optimize the command -> messages conversion, use only one message per distinct OSD
+    cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+      for (auto& i : osdpgs) {
+       if (osdmap.is_up(i.first)) {
+         vector<pg_t> pgvec(make_move_iterator(i.second.begin()), make_move_iterator(i.second.end()));
+         auto p = osd_cons.find(i.first);
+         if (p == osd_cons.end()) {
+           ss << "osd." << i.first << " is not currently connected";
+           r = -EAGAIN;
+           continue;
+         }
+         for (auto& con : p->second) {
+           con->send_message(new MOSDForceRecovery(monc->get_fsid(), pgvec, actual_op));
+         }
+         ss << "instructing pg(s) " << i.second << " on osd." << i.first << " to " << forceop << "; ";
+       }
+      }
+    });
+    ss << std::endl;
+    cmdctx->reply(r, ss);
+    return true;
   } else {
     r = cluster_state.with_pgmap([&](const PGMap& pg_map) {
        return cluster_state.with_osdmap([&](const OSDMap& osdmap) {
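
In the "osd ok-to-stop" branch above, pg_delta holds one -1 per stopping OSD that currently serves a PG, and the PG is counted as dangerous when the remaining acting set would drop below the pool's min_size. A hedged, self-contained sketch of just that arithmetic (illustration only, not Ceph code):

    #include <iostream>

    // delta is the net change in acting-set size if the named OSDs stop
    // (one -1 per stopping OSD that serves this PG).
    static bool pg_ok_to_stop(int acting_size, int delta, int min_size) {
      return acting_size + delta >= min_size;
    }

    int main() {
      // Example: a pool with min_size=2 and a PG with 3 acting OSDs.
      std::cout << pg_ok_to_stop(3, -1, 2) << "\n";  // 1: two replicas remain
      std::cout << pg_ok_to_stop(3, -2, 2) << "\n";  // 0: only one replica would remain
      return 0;
    }
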
@@ -936,8 +1344,8 @@ bool DaemonServer::handle_command(MCommand *m)
   }
 
   // None of the special native commands, 
-  MgrPyModule *handler = nullptr;
-  auto py_commands = py_modules.get_commands();
+  ActivePyModule *handler = nullptr;
+  auto py_commands = py_modules.get_py_commands();
   for (const auto &pyc : py_commands) {
     auto pyc_prefix = cmddesc_get_prefix(pyc.cmdstring);
     dout(1) << "pyc_prefix: '" << pyc_prefix << "'" << dendl;
@@ -971,7 +1379,7 @@ bool DaemonServer::handle_command(MCommand *m)
 void DaemonServer::_prune_pending_service_map()
 {
   utime_t cutoff = ceph_clock_now();
-  cutoff -= g_conf->mgr_service_beacon_grace;
+  cutoff -= g_conf->get_val<double>("mgr_service_beacon_grace");
   auto p = pending_service_map.services.begin();
   while (p != pending_service_map.services.end()) {
     auto q = p->second.daemons.begin();
@@ -983,6 +1391,7 @@ void DaemonServer::_prune_pending_service_map()
        continue;
       }
       auto daemon = daemon_state.get(key);
+      Mutex::Locker l(daemon->lock);
       if (daemon->last_service_beacon == utime_t()) {
        // we must have just restarted; assume they are alive now.
        daemon->last_service_beacon = ceph_clock_now();
@@ -1010,7 +1419,7 @@ void DaemonServer::_prune_pending_service_map()
 void DaemonServer::send_report()
 {
   if (!pgmap_ready) {
-    if (ceph_clock_now() - started_at > g_conf->mgr_stats_period * 4.0) {
+    if (ceph_clock_now() - started_at > g_conf->get_val<int64_t>("mgr_stats_period") * 4.0) {
       pgmap_ready = true;
       reported_osds.clear();
       dout(1) << "Giving up on OSDs that haven't reported yet, sending "
@@ -1023,6 +1432,8 @@ void DaemonServer::send_report()
   }
 
   auto m = new MMonMgrReport();
+  py_modules.get_health_checks(&m->health_checks);
+
   cluster_state.with_pgmap([&](const PGMap& pg_map) {
       cluster_state.update_delta_stats();
 
@@ -1045,6 +1456,7 @@ void DaemonServer::send_report()
 
          pg_map.get_health_checks(g_ceph_context, osdmap,
                                   &m->health_checks);
+
          dout(10) << m->health_checks.checks.size() << " health checks"
                   << dendl;
          dout(20) << "health checks:\n";
@@ -1054,6 +1466,30 @@ void DaemonServer::send_report()
          *_dout << dendl;
        });
     });
+
+  auto osds = daemon_state.get_by_service("osd");
+  map<osd_metric, unique_ptr<OSDHealthMetricCollector>> accumulated;
+  for (const auto& osd : osds) {
+    Mutex::Locker l(osd.second->lock);
+    for (const auto& metric : osd.second->osd_health_metrics) {
+      auto acc = accumulated.find(metric.get_type());
+      if (acc == accumulated.end()) {
+       auto collector = OSDHealthMetricCollector::create(metric.get_type());
+       if (!collector) {
+         derr << __func__ << " " << osd.first << "." << osd.second
+              << " sent me an unknown health metric: "
+              << static_cast<uint8_t>(metric.get_type()) << dendl;
+         continue;
+       }
+       tie(acc, std::ignore) = accumulated.emplace(metric.get_type(),
+                                                   std::move(collector));
+      }
+      acc->second->update(osd.first, metric);
+    }
+  }
+  for (const auto& acc : accumulated) {
+    acc.second->summarize(m->health_checks);
+  }
   // TODO? We currently do not notify the PyModules
   // TODO: respect needs_send, so we send the report only if we are asked to do
   //       so, or the state is updated.
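
The loop above lazily creates one OSDHealthMetricCollector per metric type, feeds it every OSD's sample, and only afterwards folds each collector's result into the report's health checks. A generic sketch of that accumulate-then-summarize shape, with invented stand-in types rather than the real collector classes:

    #include <iostream>
    #include <map>
    #include <memory>
    #include <string>
    #include <utility>
    #include <vector>

    enum class MetricType { EXAMPLE_METRIC, UNKNOWN };

    struct Collector {
      virtual ~Collector() = default;
      virtual void update(const std::string& reporter, int value) = 0;
      virtual void summarize(std::ostream& out) const = 0;
    };

    // One concrete collector: just sums the per-daemon samples.
    struct SumCollector : Collector {
      long total = 0;
      void update(const std::string&, int value) override { total += value; }
      void summarize(std::ostream& out) const override { out << "total=" << total << "\n"; }
    };

    // Factory returns nullptr for metric types it does not understand,
    // mirroring the "unknown health metric" branch above.
    static std::unique_ptr<Collector> create_collector(MetricType t) {
      if (t == MetricType::EXAMPLE_METRIC) return std::make_unique<SumCollector>();
      return nullptr;
    }

    int main() {
      std::map<MetricType, std::unique_ptr<Collector>> accumulated;
      const std::vector<std::pair<std::string, int>> samples = {{"osd.0", 3}, {"osd.1", 5}};
      for (const auto& s : samples) {
        const MetricType type = MetricType::EXAMPLE_METRIC;
        auto acc = accumulated.find(type);
        if (acc == accumulated.end()) {
          auto collector = create_collector(type);
          if (!collector) continue;  // unknown metric type: skip it
          acc = accumulated.emplace(type, std::move(collector)).first;
        }
        acc->second->update(s.first, s.second);
      }
      for (const auto& acc : accumulated) acc.second->summarize(std::cout);
      return 0;
    }
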
@@ -1098,3 +1534,48 @@ void DaemonServer::got_service_map()
     daemon_state.cull(p.first, names);
   }
 }
+
+
+const char** DaemonServer::get_tracked_conf_keys() const
+{
+  static const char *KEYS[] = {
+    "mgr_stats_threshold",
+    "mgr_stats_period",
+    nullptr
+  };
+
+  return KEYS;
+}
+
+void DaemonServer::handle_conf_change(const struct md_config_t *conf,
+                                              const std::set <std::string> &changed)
+{
+  dout(4) << "ohai" << dendl;
+  // We may be called within lock (via MCommand `config set`) or outwith the
+  // lock (via admin socket `config set`), so handle either case.
+  const bool initially_locked = lock.is_locked_by_me();
+  if (!initially_locked) {
+    lock.Lock();
+  }
+
+  if (changed.count("mgr_stats_threshold") || changed.count("mgr_stats_period")) {
+    dout(4) << "Updating stats threshold/period on "
+            << daemon_connections.size() << " clients" << dendl;
+    // Send a fresh MMgrConfigure to all clients, so that they can follow
+    // the new policy for transmitting stats
+    for (auto &c : daemon_connections) {
+      _send_configure(c);
+    }
+  }
+
+  // Release the lock only if we took it ourselves above.
+  if (!initially_locked) {
+    lock.Unlock();
+  }
+}
+
+void DaemonServer::_send_configure(ConnectionRef c)
+{
+  assert(lock.is_locked_by_me());
+
+  auto configure = new MMgrConfigure();
+  configure->stats_period = g_conf->get_val<int64_t>("mgr_stats_period");
+  configure->stats_threshold = g_conf->get_val<int64_t>("mgr_stats_threshold");
+  c->send_message(configure);
+}
+
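
Taken together, the configuration-observer pieces of this diff form one loop: the constructor registers the server with g_conf->add_observer(), get_tracked_conf_keys() advertises the two stats keys, a `config set` command (or the admin socket) changes a value and applies it, and handle_conf_change() then re-sends MMgrConfigure to every connection recorded in daemon_connections by handle_open(). A compilable toy sketch of that wiring, using invented MiniConfig/MiniServer stand-ins rather than md_config_t and the real messenger:

    #include <algorithm>
    #include <iostream>
    #include <set>
    #include <string>
    #include <vector>

    // Stand-in for the config-observer interface used in the diff.
    struct ConfObserver {
      virtual ~ConfObserver() = default;
      virtual const char** get_tracked_conf_keys() const = 0;
      virtual void handle_conf_change(const std::set<std::string>& changed) = 0;
    };

    // Invented stand-in for md_config_t: notifies interested observers on set_val().
    struct MiniConfig {
      std::vector<ConfObserver*> observers;
      void add_observer(ConfObserver* o) { observers.push_back(o); }
      void remove_observer(ConfObserver* o) {
        observers.erase(std::remove(observers.begin(), observers.end(), o), observers.end());
      }
      void set_val(const std::string& key) {
        const std::set<std::string> changed{key};
        for (auto* o : observers) {
          for (const char** k = o->get_tracked_conf_keys(); *k; ++k) {
            if (key == *k) { o->handle_conf_change(changed); break; }
          }
        }
      }
    };

    // Invented stand-in for DaemonServer: tracks connections, re-sends config on change.
    struct MiniServer : ConfObserver {
      std::vector<std::string> daemon_connections{"osd.0", "mds.a"};
      const char** get_tracked_conf_keys() const override {
        static const char* KEYS[] = {"mgr_stats_threshold", "mgr_stats_period", nullptr};
        return KEYS;
      }
      void handle_conf_change(const std::set<std::string>& changed) override {
        if (changed.count("mgr_stats_period") || changed.count("mgr_stats_threshold")) {
          for (const auto& c : daemon_connections)
            std::cout << "re-sending configure to " << c << "\n";  // _send_configure() stand-in
        }
      }
    };

    int main() {
      MiniConfig conf;
      MiniServer server;
      conf.add_observer(&server);         // constructor: g_conf->add_observer(this)
      conf.set_val("mgr_stats_period");   // `config set` path: set_val + notify observers
      conf.remove_observer(&server);      // destructor: g_conf->remove_observer(this)
      return 0;
    }
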