]> git.proxmox.com Git - ceph.git/blobdiff - ceph/src/mds/MDSDaemon.cc
import ceph 15.2.14
[ceph.git] / ceph / src / mds / MDSDaemon.cc
index f42d031f7326f3dbc8b43ede5d1bc1245e228f26..f875715009170c37b793e3355d0b6bd6b33e3f34 100644 (file)
 #define dout_subsys ceph_subsys_mds
 #undef dout_prefix
 #define dout_prefix *_dout << "mds." << name << ' '
-
+using TOPNSPC::common::cmd_getval;
 // cons/des
 MDSDaemon::MDSDaemon(std::string_view n, Messenger *m, MonClient *mc) :
   Dispatcher(m->cct),
-  mds_lock("MDSDaemon::mds_lock"),
-  stopping(false),
   timer(m->cct, mds_lock),
   gss_ktfile_client(m->cct->_conf.get_val<std::string>("gss_ktab_client_file")),
   beacon(m->cct, mc, n),
   name(n),
   messenger(m),
   monc(mc),
-  mgrc(m->cct, m),
+  mgrc(m->cct, m, &mc->monmap),
   log_client(m->cct, messenger, &mc->monmap, LogClient::NO_FLAGS),
-  mds_rank(NULL),
-  asok_hook(NULL),
   starttime(mono_clock::now())
 {
   orig_argc = 0;
@@ -97,8 +93,6 @@ MDSDaemon::MDSDaemon(std::string_view n, Messenger *m, MonClient *mc) :
     ceph_assert(set_result == 0);
   }
 
-  monc->set_messenger(messenger);
-
   mdsmap.reset(new MDSMap);
 }
 
@@ -113,43 +107,99 @@ class MDSSocketHook : public AdminSocketHook {
   MDSDaemon *mds;
 public:
   explicit MDSSocketHook(MDSDaemon *m) : mds(m) {}
-  bool call(std::string_view command, const cmdmap_t& cmdmap,
-           std::string_view format, bufferlist& out) override {
-    stringstream ss;
-    bool r = mds->asok_command(command, cmdmap, format, ss);
-    out.append(ss);
-    return r;
+  int call(
+    std::string_view command,
+    const cmdmap_t& cmdmap,
+    Formatter *f,
+    std::ostream& errss,
+    ceph::buffer::list& out) override {
+    ceph_abort("shoudl go to call_async");
+  }
+  void call_async(
+    std::string_view command,
+    const cmdmap_t& cmdmap,
+    Formatter *f,
+    const bufferlist& inbl,
+    std::function<void(int,const std::string&,bufferlist&)> on_finish) override {
+    mds->asok_command(command, cmdmap, f, inbl, on_finish);
   }
 };
 
-bool MDSDaemon::asok_command(std::string_view command, const cmdmap_t& cmdmap,
-                            std::string_view format, std::ostream& ss)
+void MDSDaemon::asok_command(
+  std::string_view command,
+  const cmdmap_t& cmdmap,
+  Formatter *f,
+  const bufferlist& inbl,
+  std::function<void(int,const std::string&,bufferlist&)> on_finish)
 {
-  dout(1) << "asok_command: " << command << " (starting...)" << dendl;
+  dout(1) << "asok_command: " << command << " " << cmdmap
+         << " (starting...)" << dendl;
 
-  Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
-  bool handled = false;
+  int r = -ENOSYS;
+  bufferlist outbl;
+  stringstream ss;
   if (command == "status") {
     dump_status(f);
-    handled = true;
+    r = 0;
+  } else if (command == "exit") {
+    outbl.append("Exiting...\n");
+    r = 0;
+    std::thread t([this](){
+                   // Wait a little to improve chances of caller getting
+                   // our response before seeing us disappear from mdsmap
+                   sleep(1);
+                   std::lock_guard l(mds_lock);
+                   suicide();
+                 });
+    t.detach();
+  } else if (command == "respawn") {
+    outbl.append("Respawning...\n");
+    r = 0;
+    std::thread t([this](){
+                   // Wait a little to improve chances of caller getting
+                   // our response before seeing us disappear from mdsmap
+                   sleep(1);
+                   std::lock_guard l(mds_lock);
+                   respawn();
+                 });
+    t.detach();
+  } else if (command == "heap") {
+    if (!ceph_using_tcmalloc()) {
+      ss << "not using tcmalloc";
+      r = -EOPNOTSUPP;
+    } else {
+      string heapcmd;
+      cmd_getval(cmdmap, "heapcmd", heapcmd);
+      vector<string> heapcmd_vec;
+      get_str_vec(heapcmd, heapcmd_vec);
+      string value;
+      if (cmd_getval(cmdmap, "value", value)) {
+       heapcmd_vec.push_back(value);
+      }
+      ceph_heap_profiler_handle_command(heapcmd_vec, ss);
+    }
+  } else if (command == "cpu_profiler") {
+    string arg;
+    cmd_getval(cmdmap, "arg", arg);
+    vector<string> argvec;
+    get_str_vec(arg, argvec);
+    cpu_profiler_handle_command(argvec, ss);
+    r = 0;
   } else {
     if (mds_rank == NULL) {
       dout(1) << "Can't run that command on an inactive MDS!" << dendl;
       f->dump_string("error", "mds_not_active");
     } else {
       try {
-       handled = mds_rank->handle_asok_command(command, cmdmap, f, ss);
-      } catch (const bad_cmd_get& e) {
+       mds_rank->handle_asok_command(command, cmdmap, f, inbl, on_finish);
+       return;
+      } catch (const TOPNSPC::common::bad_cmd_get& e) {
        ss << e.what();
+       r = -EINVAL;
       }
     }
   }
-  f->flush(ss);
-  delete f;
-
-  dout(1) << "asok_command: " << command << " (complete)" << dendl;
-
-  return handled;
+  on_finish(r, ss.str(), outbl);
 }
 
 void MDSDaemon::dump_status(Formatter *f)
@@ -191,150 +241,214 @@ void MDSDaemon::set_up_admin_socket()
   AdminSocket *admin_socket = g_ceph_context->get_admin_socket();
   ceph_assert(asok_hook == nullptr);
   asok_hook = new MDSSocketHook(this);
-  r = admin_socket->register_command("status", "status", asok_hook,
+  r = admin_socket->register_command("status", asok_hook,
                                     "high-level status of MDS");
   ceph_assert(r == 0);
-  r = admin_socket->register_command("dump_ops_in_flight",
-                                    "dump_ops_in_flight", asok_hook,
+  r = admin_socket->register_command("dump_ops_in_flight", asok_hook,
                                     "show the ops currently in flight");
   ceph_assert(r == 0);
-  r = admin_socket->register_command("ops",
-                                    "ops", asok_hook,
+  r = admin_socket->register_command("ops", asok_hook,
                                     "show the ops currently in flight");
   ceph_assert(r == 0);
-  r = admin_socket->register_command("dump_blocked_ops", "dump_blocked_ops",
+  r = admin_socket->register_command("dump_blocked_ops",
       asok_hook,
       "show the blocked ops currently in flight");
   ceph_assert(r == 0);
-  r = admin_socket->register_command("dump_historic_ops", "dump_historic_ops",
+  r = admin_socket->register_command("dump_historic_ops",
                                     asok_hook,
                                     "show recent ops");
   ceph_assert(r == 0);
-  r = admin_socket->register_command("dump_historic_ops_by_duration", "dump_historic_ops_by_duration",
+  r = admin_socket->register_command("dump_historic_ops_by_duration",
                                     asok_hook,
                                     "show recent ops, sorted by op duration");
   ceph_assert(r == 0);
-  r = admin_socket->register_command("scrub_path",
-                                    "scrub_path name=path,type=CephString "
+  r = admin_socket->register_command("scrub_path name=path,type=CephString "
                                     "name=scrubops,type=CephChoices,"
-                                    "strings=force|recursive|repair,n=N,req=false",
+                                    "strings=force|recursive|repair,n=N,req=false "
+                                    "name=tag,type=CephString,req=false",
                                      asok_hook,
                                      "scrub an inode and output results");
   ceph_assert(r == 0);
-  r = admin_socket->register_command("tag path",
-                                     "tag path name=path,type=CephString"
+  r = admin_socket->register_command("scrub start "
+                                    "name=path,type=CephString "
+                                    "name=scrubops,type=CephChoices,strings=force|recursive|repair,n=N,req=false "
+                                    "name=tag,type=CephString,req=false",
+                                    asok_hook,
+                                    "scrub and inode and output results");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("scrub abort",
+                                     asok_hook,
+                                     "Abort in progress scrub operations(s)");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("scrub pause",
+                                     asok_hook,
+                                     "Pause in progress scrub operations(s)");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("scrub resume",
+                                     asok_hook,
+                                     "Resume paused scrub operations(s)");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("scrub status",
+                                     asok_hook,
+                                     "Status of scrub operations(s)");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("tag path name=path,type=CephString"
                                      " name=tag,type=CephString",
                                      asok_hook,
                                      "Apply scrub tag recursively");
    ceph_assert(r == 0);
-  r = admin_socket->register_command("flush_path",
-                                     "flush_path name=path,type=CephString",
+  r = admin_socket->register_command("flush_path name=path,type=CephString",
                                      asok_hook,
                                      "flush an inode (and its dirfrags)");
   ceph_assert(r == 0);
-  r = admin_socket->register_command("export dir",
-                                     "export dir "
+  r = admin_socket->register_command("export dir "
                                      "name=path,type=CephString "
                                      "name=rank,type=CephInt",
                                      asok_hook,
                                      "migrate a subtree to named MDS");
   ceph_assert(r == 0);
-  r = admin_socket->register_command("dump cache",
-                                     "dump cache name=path,type=CephString,req=false",
+  r = admin_socket->register_command("dump cache name=path,type=CephString,req=false",
                                      asok_hook,
                                      "dump metadata cache (optionally to a file)");
   ceph_assert(r == 0);
+  r = admin_socket->register_command("cache drop "
+                                    "name=timeout,type=CephInt,range=0,req=false",
+                                    asok_hook,
+                                    "trim cache and optionally request client to release all caps and flush the journal");
+  ceph_assert(r == 0);
   r = admin_socket->register_command("cache status",
-                                     "cache status",
                                      asok_hook,
                                      "show cache status");
   ceph_assert(r == 0);
-  r = admin_socket->register_command("dump tree",
-                                    "dump tree "
+  r = admin_socket->register_command("dump tree "
                                     "name=root,type=CephString,req=true "
                                     "name=depth,type=CephInt,req=false ",
                                     asok_hook,
                                     "dump metadata cache for subtree");
   ceph_assert(r == 0);
   r = admin_socket->register_command("dump loads",
-                                     "dump loads",
                                      asok_hook,
                                      "dump metadata loads");
   ceph_assert(r == 0);
-  r = admin_socket->register_command("dump snaps",
-                                     "dump snaps name=server,type=CephChoices,strings=--server,req=false",
+  r = admin_socket->register_command("dump snaps name=server,type=CephChoices,strings=--server,req=false",
                                      asok_hook,
                                      "dump snapshots");
   ceph_assert(r == 0);
-  r = admin_socket->register_command("session evict",
-                                    "session evict name=client_id,type=CephString",
+  r = admin_socket->register_command("session ls "
+                                    "name=cap_dump,type=CephBool,req=false "
+                                    "name=filters,type=CephString,n=N,req=false ",
+                                    asok_hook,
+                                    "List client sessions based on a filter");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("client ls "
+                                    "name=cap_dump,type=CephBool,req=false "
+                                    "name=filters,type=CephString,n=N,req=false ",
+                                    asok_hook,
+                                    "List client sessions based on a filter");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("session evict name=filters,type=CephString,n=N,req=false",
                                     asok_hook,
-                                    "Evict a CephFS client");
+                                    "Evict client session(s) based on a filter");
   ceph_assert(r == 0);
-  r = admin_socket->register_command("session ls",
-                                    "session ls",
+  r = admin_socket->register_command("client evict name=filters,type=CephString,n=N,req=false",
+                                    asok_hook,
+                                    "Evict client session(s) based on a filter");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("session kill name=client_id,type=CephString",
+                                    asok_hook,
+                                    "Evict a client session by id");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("session ls name=cap_dump,type=CephBool,req=false",
                                     asok_hook,
                                     "Enumerate connected CephFS clients");
   ceph_assert(r == 0);
-  r = admin_socket->register_command("session config",
-                                    "session config name=client_id,type=CephInt,req=true "
+  r = admin_socket->register_command("session config "
+                                    "name=client_id,type=CephInt,req=true "
+                                    "name=option,type=CephString,req=true "
+                                    "name=value,type=CephString,req=false ",
+                                    asok_hook,
+                                    "Config a CephFS client session");
+  assert(r == 0);
+  r = admin_socket->register_command("client config "
+                                    "name=client_id,type=CephInt,req=true "
                                     "name=option,type=CephString,req=true "
                                     "name=value,type=CephString,req=false ",
                                     asok_hook,
                                     "Config a CephFS client session");
   assert(r == 0);
-  r = admin_socket->register_command("osdmap barrier",
-                                    "osdmap barrier name=target_epoch,type=CephInt",
+  r = admin_socket->register_command("damage ls",
+                                    asok_hook,
+                                    "List detected metadata damage");
+  assert(r == 0);
+  r = admin_socket->register_command("damage rm "
+                                    "name=damage_id,type=CephInt",
+                                    asok_hook,
+                                    "Remove a damage table entry");
+  assert(r == 0);
+  r = admin_socket->register_command("osdmap barrier name=target_epoch,type=CephInt",
                                     asok_hook,
                                     "Wait until the MDS has this OSD map epoch");
   ceph_assert(r == 0);
   r = admin_socket->register_command("flush journal",
-                                    "flush journal",
                                     asok_hook,
                                     "Flush the journal to the backing store");
   ceph_assert(r == 0);
   r = admin_socket->register_command("force_readonly",
-                                    "force_readonly",
                                     asok_hook,
                                     "Force MDS to read-only mode");
   ceph_assert(r == 0);
   r = admin_socket->register_command("get subtrees",
-                                    "get subtrees",
                                     asok_hook,
                                     "Return the subtree map");
   ceph_assert(r == 0);
-  r = admin_socket->register_command("dirfrag split",
-                                    "dirfrag split "
+  r = admin_socket->register_command("dirfrag split "
                                      "name=path,type=CephString,req=true "
                                      "name=frag,type=CephString,req=true "
                                      "name=bits,type=CephInt,req=true ",
                                     asok_hook,
                                     "Fragment directory by path");
   ceph_assert(r == 0);
-  r = admin_socket->register_command("dirfrag merge",
-                                    "dirfrag merge "
+  r = admin_socket->register_command("dirfrag merge "
                                      "name=path,type=CephString,req=true "
                                      "name=frag,type=CephString,req=true",
                                     asok_hook,
                                     "De-fragment directory by path");
   ceph_assert(r == 0);
-  r = admin_socket->register_command("dirfrag ls",
-                                    "dirfrag ls "
+  r = admin_socket->register_command("dirfrag ls "
                                      "name=path,type=CephString,req=true",
                                     asok_hook,
                                     "List fragments in directory");
   ceph_assert(r == 0);
   r = admin_socket->register_command("openfiles ls",
-                                     "openfiles ls",
                                      asok_hook,
                                      "List the opening files and their caps");
   ceph_assert(r == 0);
-  r = admin_socket->register_command("dump inode",
-                                    "dump inode " 
+  r = admin_socket->register_command("dump inode "
                                      "name=number,type=CephInt,req=true",
                                     asok_hook,
                                     "dump inode by inode number");
   ceph_assert(r == 0);
+  r = admin_socket->register_command("exit",
+                                    asok_hook,
+                                    "Terminate this MDS");
+  r = admin_socket->register_command("respawn",
+                                    asok_hook,
+                                    "Respawn this MDS");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command(
+    "heap " \
+    "name=heapcmd,type=CephChoices,strings="                           \
+    "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
+    "name=value,type=CephString,req=false",
+    asok_hook,
+    "show heap usage info (available only if compiled with tcmalloc)");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command(
+    "cpu_profiler " \
+    "name=arg,type=CephChoices,strings=status|flush",
+    asok_hook,
+    "run cpu profiling on daemon");
+  ceph_assert(r == 0);
 }
 
 void MDSDaemon::clean_up_admin_socket()
@@ -378,9 +492,9 @@ int MDSDaemon::init()
   r = monc->init();
   if (r < 0) {
     derr << "ERROR: failed to init monc: " << cpp_strerror(-r) << dendl;
-    mds_lock.Lock();
+    mds_lock.lock();
     suicide();
-    mds_lock.Unlock();
+    mds_lock.unlock();
     return r;
   }
 
@@ -394,9 +508,9 @@ int MDSDaemon::init()
   r = monc->authenticate();
   if (r < 0) {
     derr << "ERROR: failed to authenticate: " << cpp_strerror(-r) << dendl;
-    mds_lock.Lock();
+    mds_lock.lock();
     suicide();
-    mds_lock.Unlock();
+    mds_lock.unlock();
     return r;
   }
 
@@ -410,36 +524,30 @@ int MDSDaemon::init()
     }
     derr << "ERROR: failed to refresh rotating keys, "
          << "maximum retry time reached." << dendl;
-    mds_lock.Lock();
+    std::lock_guard locker{mds_lock};
     suicide();
-    mds_lock.Unlock();
     return -ETIMEDOUT;
   }
 
-  mgrc.init();
-  messenger->add_dispatcher_head(&mgrc);
-
-  mds_lock.Lock();
+  mds_lock.lock();
   if (beacon.get_want_state() == CEPH_MDS_STATE_DNE) {
     dout(4) << __func__ << ": terminated already, dropping out" << dendl;
-    mds_lock.Unlock();
+    mds_lock.unlock();
     return 0;
   }
 
   monc->sub_want("mdsmap", 0, 0);
-  monc->sub_want("mgrmap", 0, 0);
   monc->renew_subs();
 
-  mds_lock.Unlock();
+  mds_lock.unlock();
 
   // Set up admin socket before taking mds_lock, so that ordering
   // is consistent (later we take mds_lock within asok callbacks)
   set_up_admin_socket();
-  mds_lock.Lock();
+  std::lock_guard locker{mds_lock};
   if (beacon.get_want_state() == MDSMap::STATE_DNE) {
     suicide();  // we could do something more graceful here
     dout(4) << __func__ << ": terminated already, dropping out" << dendl;
-    mds_lock.Unlock();
     return 0; 
   }
 
@@ -450,8 +558,6 @@ int MDSDaemon::init()
 
   // schedule tick
   reset_tick();
-  mds_lock.Unlock();
-
   return 0;
 }
 
@@ -463,8 +569,8 @@ void MDSDaemon::reset_tick()
   // schedule
   tick_event = timer.add_event_after(
     g_conf()->mds_tick_interval,
-    new FunctionContext([this](int) {
-       ceph_assert(mds_lock.is_locked_by_me());
+    new LambdaContext([this](int) {
+       ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));
        tick();
       }));
 }
@@ -480,13 +586,18 @@ void MDSDaemon::tick()
   }
 }
 
-void MDSDaemon::send_command_reply(const MCommand::const_ref &m, MDSRank *mds_rank,
-                                  int r, bufferlist outbl,
-                                  std::string_view outs)
+void MDSDaemon::handle_command(const cref_t<MCommand> &m)
 {
   auto priv = m->get_connection()->get_priv();
   auto session = static_cast<Session *>(priv.get());
   ceph_assert(session != NULL);
+
+  int r = 0;
+  cmdmap_t cmdmap;
+  std::stringstream ss;
+  std::string outs;
+  bufferlist outbl;
+
   // If someone is using a closed session for sending commands (e.g.
   // the ceph CLI) then we should feel free to clean up this connection
   // as soon as we've sent them a response.
@@ -503,279 +614,32 @@ void MDSDaemon::send_command_reply(const MCommand::const_ref &m, MDSRank *mds_ra
   }
   priv.reset();
 
-  auto reply = MCommandReply::create(r, outs);
-  reply->set_tid(m->get_tid());
-  reply->set_data(outbl);
-  m->get_connection()->send_message2(reply);
-}
-
-void MDSDaemon::handle_command(const MCommand::const_ref &m)
-{
-  auto priv = m->get_connection()->get_priv();
-  auto session = static_cast<Session *>(priv.get());
-  ceph_assert(session != NULL);
-
-  int r = 0;
-  cmdmap_t cmdmap;
-  std::stringstream ss;
-  std::string outs;
-  bufferlist outbl;
-  Context *run_after = NULL;
-  bool need_reply = true;
-
   if (!session->auth_caps.allow_all()) {
     dout(1) << __func__
       << ": received command from client without `tell` capability: "
       << *m->get_connection()->peer_addrs << dendl;
 
     ss << "permission denied";
-    r = -EPERM;
+    r = -EACCES;
   } else if (m->cmd.empty()) {
     r = -EINVAL;
     ss << "no command given";
     outs = ss.str();
-  } else if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+  } else if (!TOPNSPC::common::cmdmap_from_json(m->cmd, &cmdmap, ss)) {
     r = -EINVAL;
     outs = ss.str();
   } else {
-    try {
-      r = _handle_command(cmdmap, m, &outbl, &outs, &run_after, &need_reply);
-    } catch (const bad_cmd_get& e) {
-      outs = e.what();
-      r = -EINVAL;
-    }
-  }
-  priv.reset();
-
-  if (need_reply) {
-    send_command_reply(m, mds_rank, r, outbl, outs);
-  }
-
-  if (run_after) {
-    run_after->complete(0);
-  }
-}
-
-const std::vector<MDSDaemon::MDSCommand>& MDSDaemon::get_commands()
-{
-  static const std::vector<MDSCommand> commands = {
-    MDSCommand("injectargs name=injected_args,type=CephString,n=N", "inject configuration arguments into running MDS"),
-    MDSCommand("config set name=key,type=CephString name=value,type=CephString", "Set a configuration option at runtime (not persistent)"),
-    MDSCommand("config unset name=key,type=CephString", "Unset a configuration option at runtime (not persistent)"),
-    MDSCommand("exit", "Terminate this MDS"),
-    MDSCommand("respawn", "Restart this MDS"),
-    MDSCommand("session kill name=session_id,type=CephInt", "End a client session"),
-    MDSCommand("cpu_profiler name=arg,type=CephChoices,strings=status|flush", "run cpu profiling on daemon"),
-    MDSCommand("session ls name=filters,type=CephString,n=N,req=false", "List client sessions"),
-    MDSCommand("client ls name=filters,type=CephString,n=N,req=false", "List client sessions"),
-    MDSCommand("session evict name=filters,type=CephString,n=N,req=false", "Evict client session(s)"),
-    MDSCommand("client evict name=filters,type=CephString,n=N,req=false", "Evict client session(s)"),
-    MDSCommand("session config name=client_id,type=CephInt name=option,type=CephString name=value,type=CephString,req=false",
-       "Config a client session"),
-    MDSCommand("client config name=client_id,type=CephInt name=option,type=CephString name=value,type=CephString,req=false",
-       "Config a client session"),
-    MDSCommand("damage ls", "List detected metadata damage"),
-    MDSCommand("damage rm name=damage_id,type=CephInt", "Remove a damage table entry"),
-    MDSCommand("version", "report version of MDS"),
-    MDSCommand("heap "
-        "name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats",
-        "show heap usage info (available only if compiled with tcmalloc)"),
-    MDSCommand("cache drop name=timeout,type=CephInt,range=0,req=false", "trim cache and optionally request client to release all caps and flush the journal"),
-    MDSCommand("scrub start name=path,type=CephString name=scrubops,type=CephChoices,strings=force|recursive|repair,n=N,req=false name=tag,type=CephString,req=false",
-               "scrub an inode and output results"),
-    MDSCommand("scrub abort", "Abort in progress scrub operation(s)"),
-    MDSCommand("scrub pause", "Pause in progress scrub operation(s)"),
-    MDSCommand("scrub resume", "Resume paused scrub operation(s)"),
-    MDSCommand("scrub status", "Status of scrub operation"),
-  };
-  return commands;
-};
-
-int MDSDaemon::_handle_command(
-    const cmdmap_t &cmdmap,
-    const MCommand::const_ref &m,
-    bufferlist *outbl,
-    std::string *outs,
-    Context **run_later,
-    bool *need_reply)
-{
-  ceph_assert(outbl != NULL);
-  ceph_assert(outs != NULL);
-
-  class SuicideLater : public Context
-  {
-    MDSDaemon *mds;
-
-    public:
-    explicit SuicideLater(MDSDaemon *mds_) : mds(mds_) {}
-    void finish(int r) override {
-      // Wait a little to improve chances of caller getting
-      // our response before seeing us disappear from mdsmap
-      sleep(1);
-
-      mds->suicide();
-    }
-  };
-
-
-  class RespawnLater : public Context
-  {
-    MDSDaemon *mds;
-
-    public:
-
-    explicit RespawnLater(MDSDaemon *mds_) : mds(mds_) {}
-    void finish(int r) override {
-      // Wait a little to improve chances of caller getting
-      // our response before seeing us disappear from mdsmap
-      sleep(1);
-
-      mds->respawn();
-    }
-  };
-
-  std::stringstream ds;
-  std::stringstream ss;
-  std::string prefix;
-  std::string format;
-  std::unique_ptr<Formatter> f(Formatter::create(format));
-  cmd_getval(cct, cmdmap, "prefix", prefix);
-
-  int r = 0;
-
-  if (prefix == "get_command_descriptions") {
-    int cmdnum = 0;
-    std::unique_ptr<JSONFormatter> f(std::make_unique<JSONFormatter>());
-    f->open_object_section("command_descriptions");
-    for (auto& c : get_commands()) {
-      ostringstream secname;
-      secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
-      dump_cmddesc_to_json(f.get(), m->get_connection()->get_features(),
-                           secname.str(), c.cmdstring, c.helpstring,
-                          c.module, "*", 0);
-      cmdnum++;
-    }
-    f->close_section();        // command_descriptions
-
-    f->flush(ds);
-    goto out; 
-  }
-
-  cmd_getval(cct, cmdmap, "format", format);
-  if (prefix == "version") {
-    if (f) {
-      f->open_object_section("version");
-      f->dump_string("version", pretty_version_to_str());
-      f->close_section();
-      f->flush(ds);
-    } else {
-      ds << pretty_version_to_str();
-    }
-  } else if (prefix == "injectargs") {
-    vector<string> argsvec;
-    cmd_getval(cct, cmdmap, "injected_args", argsvec);
-
-    if (argsvec.empty()) {
-      r = -EINVAL;
-      ss << "ignoring empty injectargs";
-      goto out;
-    }
-    string args = argsvec.front();
-    for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
-      args += " " + *a;
-    r = cct->_conf.injectargs(args, &ss);
-  } else if (prefix == "config set") {
-    std::string key;
-    cmd_getval(cct, cmdmap, "key", key);
-    std::string val;
-    cmd_getval(cct, cmdmap, "value", val);
-    r = cct->_conf.set_val(key, val, &ss);
-    if (r == 0) {
-      cct->_conf.apply_changes(nullptr);
-    }
-  } else if (prefix == "config unset") {
-    std::string key;
-    cmd_getval(cct, cmdmap, "key", key);
-    r = cct->_conf.rm_val(key);
-    if (r == 0) {
-      cct->_conf.apply_changes(nullptr);
-    }
-    if (r == -ENOENT) {
-      r = 0; // idempotent
-    }
-  } else if (prefix == "exit") {
-    // We will send response before executing
-    ss << "Exiting...";
-    *run_later = new SuicideLater(this);
-  } else if (prefix == "respawn") {
-    // We will send response before executing
-    ss << "Respawning...";
-    *run_later = new RespawnLater(this);
-  } else if (prefix == "session kill") {
-    if (mds_rank == NULL) {
-      r = -EINVAL;
-      ss << "MDS not active";
-      goto out;
-    }
-    // FIXME harmonize `session kill` with admin socket session evict
-    int64_t session_id = 0;
-    bool got = cmd_getval(cct, cmdmap, "session_id", session_id);
-    ceph_assert(got);
-    bool killed = mds_rank->evict_client(session_id, false,
-                                         g_conf()->mds_session_blacklist_on_evict,
-                                         ss);
-    if (!killed)
-      r = -ENOENT;
-  } else if (prefix == "heap") {
-    if (!ceph_using_tcmalloc()) {
-      r = -EOPNOTSUPP;
-      ss << "could not issue heap profiler command -- not using tcmalloc!";
-    } else {
-      string heapcmd;
-      cmd_getval(cct, cmdmap, "heapcmd", heapcmd);
-      vector<string> heapcmd_vec;
-      get_str_vec(heapcmd, heapcmd_vec);
-      string value;
-      if (cmd_getval(cct, cmdmap, "value", value))
-        heapcmd_vec.push_back(value);
-      ceph_heap_profiler_handle_command(heapcmd_vec, ds);
-    }
-  } else if (prefix == "cpu_profiler") {
-    string arg;
-    cmd_getval(cct, cmdmap, "arg", arg);
-    vector<string> argvec;
-    get_str_vec(arg, argvec);
-    cpu_profiler_handle_command(argvec, ds);
-  } else {
-    // Give MDSRank a shot at the command
-    if (!mds_rank) {
-      ss << "MDS not active";
-      r = -EINVAL;
-    }
-    else {
-      bool handled;
-      try {
-        handled = mds_rank->handle_command(cmdmap, m, &r, &ds, &ss,
-                                           run_later, need_reply);
-       if (!handled) {
-         // MDSDaemon doesn't know this command
-         ss << "unrecognized command! " << prefix;
-         r = -EINVAL;
-       }
-      } catch (const bad_cmd_get& e) {
-       ss << e.what();
-       r = -EINVAL;
-      }
-    }
+    cct->get_admin_socket()->queue_tell_command(m);
+    return;
   }
 
-out:
-  *outs = ss.str();
-  outbl->append(ds);
-  return r;
+  auto reply = make_message<MCommandReply>(r, outs);
+  reply->set_tid(m->get_tid());
+  reply->set_data(outbl);
+  m->get_connection()->send_message2(reply);
 }
 
-void MDSDaemon::handle_mds_map(const MMDSMap::const_ref &m)
+void MDSDaemon::handle_mds_map(const cref_t<MMDSMap> &m)
 {
   version_t epoch = m->get_epoch();
 
@@ -788,8 +652,6 @@ void MDSDaemon::handle_mds_map(const MMDSMap::const_ref &m)
 
   dout(1) << "Updating MDS map to version " << epoch << " from " << m->get_source() << dendl;
 
-  entity_addrvec_t addrs;
-
   // keep old map, for a moment
   std::unique_ptr<MDSMap> oldmap;
   oldmap.swap(mdsmap);
@@ -797,14 +659,9 @@ void MDSDaemon::handle_mds_map(const MMDSMap::const_ref &m)
   // decode and process
   mdsmap.reset(new MDSMap);
   mdsmap->decode(m->get_encoded());
-  const MDSMap::DaemonState new_state = mdsmap->get_state_gid(mds_gid_t(monc->get_global_id()));
-  const int incarnation = mdsmap->get_inc_gid(mds_gid_t(monc->get_global_id()));
 
   monc->sub_got("mdsmap", mdsmap->get_epoch());
 
-  // Calculate my effective rank (either my owned rank or the rank I'm following if STATE_STANDBY_REPLAY
-  mds_rank_t whoami = mdsmap->get_rank_gid(mds_gid_t(monc->get_global_id()));
-
   // verify compatset
   CompatSet mdsmap_compat(MDSMap::get_compat_set_all());
   dout(10) << "     my compat " << mdsmap_compat << dendl;
@@ -814,56 +671,71 @@ void MDSDaemon::handle_mds_map(const MMDSMap::const_ref &m)
            << " not writeable with daemon features " << mdsmap_compat
            << ", killing myself" << dendl;
     suicide();
-    goto out;
-  }
-
-  // mark down any failed peers
-  for (const auto &p : oldmap->get_mds_info()) {
-    if (mdsmap->get_mds_info().count(p.first) == 0) {
-      dout(10) << " peer mds gid " << p.first << " removed from map" << dendl;
-      messenger->mark_down_addrs(p.second.addrs);
-    }
+    return;
   }
 
-  // see who i am
-  dout(10) << "my gid is " << monc->get_global_id() << dendl;
+  // Calculate my effective rank (either my owned rank or the rank I'm following if STATE_STANDBY_REPLAY
+  const auto addrs = messenger->get_myaddrs();
+  const auto myid = monc->get_global_id();
+  const auto mygid = mds_gid_t(myid);
+  const auto whoami = mdsmap->get_rank_gid(mygid);
+  const auto old_state = oldmap->get_state_gid(mygid);
+  const auto new_state = mdsmap->get_state_gid(mygid);
+  const auto incarnation = mdsmap->get_inc_gid(mygid);
+  dout(10) << "my gid is " << myid << dendl;
   dout(10) << "map says I am mds." << whoami << "." << incarnation
           << " state " << ceph_mds_state_name(new_state) << dendl;
+  dout(10) << "msgr says I am " << addrs << dendl;
+
+  // If we're removed from the MDSMap, stop all processing.
+  using DS = MDSMap::DaemonState;
+  if (old_state != DS::STATE_NULL && new_state == DS::STATE_NULL) {
+    const auto& oldinfo = oldmap->get_info_gid(mygid);
+    dout(1) << "Map removed me " << oldinfo
+            << " from cluster; respawning! See cluster/monitor logs for details." << dendl;
+    respawn();
+  }
+
+  if (old_state == DS::STATE_NULL && new_state != DS::STATE_NULL) {
+    /* The MDS has been added to the FSMap, now we can init the MgrClient */
+    mgrc.init();
+    messenger->add_dispatcher_tail(&mgrc);
+    monc->sub_want("mgrmap", 0, 0);
+    monc->renew_subs(); /* MgrMap receipt drives connection to ceph-mgr */
+  }
 
-  addrs = messenger->get_myaddrs();
-  dout(10) << "msgr says i am " << addrs << dendl;
+  // mark down any failed peers
+  for (const auto& [gid, info] : oldmap->get_mds_info()) {
+    if (mdsmap->get_mds_info().count(gid) == 0) {
+      dout(10) << " peer mds gid " << gid << " removed from map" << dendl;
+      messenger->mark_down_addrs(info.addrs);
+    }
+  }
 
   if (whoami == MDS_RANK_NONE) {
-    if (mds_rank != NULL) {
-      const auto myid = monc->get_global_id();
-      // We have entered a rank-holding state, we shouldn't be back
-      // here!
-      if (g_conf()->mds_enforce_unique_name) {
-        if (mds_gid_t existing = mdsmap->find_mds_gid_by_name(name)) {
-          const MDSMap::mds_info_t& i = mdsmap->get_info_gid(existing);
-          if (i.global_id > myid) {
-            dout(1) << "Map replaced me with another mds." << whoami
-                    << " with gid (" << i.global_id << ") larger than myself ("
-                    << myid << "); quitting!" << dendl;
-            // Call suicide() rather than respawn() because if someone else
-            // has taken our ID, we don't want to keep restarting and
-            // fighting them for the ID.
-            suicide();
-            return;
-          }
-        }
-      }
+    // We do not hold a rank:
+    dout(10) <<  __func__ << ": handling map in rankless mode" << dendl;
 
-      dout(1) << "Map removed me (mds." << whoami << " gid:"
-              << myid << ") from cluster due to lost contact; respawning" << dendl;
-      respawn();
+    if (new_state == DS::STATE_STANDBY) {
+      /* Note: STATE_BOOT is never an actual state in the FSMap. The Monitors
+       * generally mark a new MDS as STANDBY (although it's possible to
+       * immediately be assigned a rank).
+       */
+      if (old_state == DS::STATE_NULL) {
+        dout(1) << "Monitors have assigned me to become a standby." << dendl;
+        beacon.set_want_state(*mdsmap, new_state);
+      } else if (old_state == DS::STATE_STANDBY) {
+        dout(5) << "I am still standby" << dendl;
+      }
+    } else if (new_state == DS::STATE_NULL) {
+      /* We are not in the MDSMap yet! Keep waiting: */
+      ceph_assert(beacon.get_want_state() == DS::STATE_BOOT);
+      dout(10) << "not in map yet" << dendl;
+    } else {
+      /* We moved to standby somehow from another state */
+      ceph_abort("invalid transition to standby");
     }
-    // MDSRank not active: process the map here to see if we have
-    // been assigned a rank.
-    dout(10) <<  __func__ << ": handling map in rankless mode" << dendl;
-    _handle_mds_map(*mdsmap);
   } else {
-
     // Did we already hold a different rank?  MDSMonitor shouldn't try
     // to change that out from under me!
     if (mds_rank && whoami != mds_rank->get_nodeid()) {
@@ -875,9 +747,9 @@ void MDSDaemon::handle_mds_map(const MMDSMap::const_ref &m)
     // Did I previously not hold a rank?  Initialize!
     if (mds_rank == NULL) {
       mds_rank = new MDSRankDispatcher(whoami, mds_lock, clog,
-          timer, beacon, mdsmap, messenger, monc,
-          new FunctionContext([this](int r){respawn();}),
-          new FunctionContext([this](int r){suicide();}));
+          timer, beacon, mdsmap, messenger, monc, &mgrc,
+          new LambdaContext([this](int r){respawn();}),
+          new LambdaContext([this](int r){suicide();}));
       dout(10) <<  __func__ << ": initializing MDS rank "
                << mds_rank->get_nodeid() << dendl;
       mds_rank->init();
@@ -889,36 +761,9 @@ void MDSDaemon::handle_mds_map(const MMDSMap::const_ref &m)
     mds_rank->handle_mds_map(m, *oldmap);
   }
 
-out:
   beacon.notify_mdsmap(*mdsmap);
 }
 
-void MDSDaemon::_handle_mds_map(const MDSMap &mdsmap)
-{
-  MDSMap::DaemonState new_state = mdsmap.get_state_gid(mds_gid_t(monc->get_global_id()));
-
-  // Normal rankless case, we're marked as standby
-  if (new_state == MDSMap::STATE_STANDBY) {
-    beacon.set_want_state(mdsmap, new_state);
-    dout(1) << "Map has assigned me to become a standby" << dendl;
-
-    return;
-  }
-
-  // Case where we thought we were standby, but MDSMap disagrees
-  if (beacon.get_want_state() == MDSMap::STATE_STANDBY) {
-    dout(10) << "dropped out of mdsmap, try to re-add myself" << dendl;
-    new_state = MDSMap::STATE_BOOT;
-    beacon.set_want_state(mdsmap, new_state);
-    return;
-  }
-
-  // Case where we have sent a boot beacon that isn't reflected yet
-  if (beacon.get_want_state() == MDSMap::STATE_BOOT) {
-    dout(10) << "not in map yet" << dendl;
-  }
-}
-
 void MDSDaemon::handle_signal(int signum)
 {
   ceph_assert(signum == SIGINT || signum == SIGTERM);
@@ -934,7 +779,7 @@ void MDSDaemon::handle_signal(int signum)
 
 void MDSDaemon::suicide()
 {
-  ceph_assert(mds_lock.is_locked());
+  ceph_assert(ceph_mutex_is_locked(mds_lock));
   
   // make sure we don't suicide twice
   ceph_assert(stopping == false);
@@ -950,17 +795,17 @@ void MDSDaemon::suicide()
 
   clean_up_admin_socket();
 
-  // Inform MDS we are going away, then shut down beacon
+  // Notify the Monitors (MDSMonitor) that we're dying, so that it doesn't have
+  // to wait for us to go laggy. Only do this if we're actually in the MDSMap,
+  // because otherwise the MDSMonitor will drop our message.
   beacon.set_want_state(*mdsmap, MDSMap::STATE_DNE);
   if (!mdsmap->is_dne_gid(mds_gid_t(monc->get_global_id()))) {
-    // Notify the MDSMonitor that we're dying, so that it doesn't have to
-    // wait for us to go laggy.  Only do this if we're actually in the
-    // MDSMap, because otherwise the MDSMonitor will drop our message.
     beacon.send_and_wait(1);
   }
   beacon.shutdown();
 
-  mgrc.shutdown();
+  if (mgrc.is_initialized())
+    mgrc.shutdown();
 
   if (mds_rank) {
     mds_rank->shutdown();
@@ -1034,7 +879,7 @@ void MDSDaemon::respawn()
 
 
 
-bool MDSDaemon::ms_dispatch2(const Message::ref &m)
+bool MDSDaemon::ms_dispatch2(const ref_t<Message> &m)
 {
   std::lock_guard l(mds_lock);
   if (stopping) {
@@ -1061,24 +906,21 @@ bool MDSDaemon::ms_dispatch2(const Message::ref &m)
   }
 }
 
-bool MDSDaemon::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer)
-{
-  dout(10) << "MDSDaemon::ms_get_authorizer type="
-           << ceph_entity_type_name(dest_type) << dendl;
-
-  /* monitor authorization is being handled on different layer */
-  if (dest_type == CEPH_ENTITY_TYPE_MON)
-    return true;
-
-  *authorizer = monc->build_authorizer(dest_type);
-  return *authorizer != NULL;
-}
-
-
 /*
  * high priority messages we always process
  */
-bool MDSDaemon::handle_core_message(const Message::const_ref &m)
+
+#define ALLOW_MESSAGES_FROM(peers)                                      \
+  do {                                                                  \
+    if (m->get_connection() && (m->get_connection()->get_peer_type() & (peers)) == 0) { \
+      dout(0) << __FILE__ << "." << __LINE__ << ": filtered out request, peer=" \
+              << m->get_connection()->get_peer_type() << " allowing="   \
+              << #peers << " message=" << *m << dendl;                  \
+      return true;                                                      \
+    }                                                                   \
+  } while (0)
+
+bool MDSDaemon::handle_core_message(const cref_t<Message> &m)
 {
   switch (m->get_type()) {
   case CEPH_MSG_MON_MAP:
@@ -1088,12 +930,17 @@ bool MDSDaemon::handle_core_message(const Message::const_ref &m)
     // MDS
   case CEPH_MSG_MDS_MAP:
     ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_MDS);
-    handle_mds_map(MMDSMap::msgref_cast(m));
+    handle_mds_map(ref_cast<MMDSMap>(m));
+    break;
+
+  case MSG_REMOVE_SNAPS:
+    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON);
+    mds_rank->snapserver->handle_remove_snaps(ref_cast<MRemoveSnaps>(m));
     break;
 
     // OSD
   case MSG_COMMAND:
-    handle_command(MCommand::msgref_cast(m));
+    handle_command(ref_cast<MCommand>(m));
     break;
   case CEPH_MSG_OSD_MAP:
     ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD);
@@ -1175,11 +1022,6 @@ bool MDSDaemon::ms_handle_refused(Connection *con)
   return false;
 }
 
-KeyStore *MDSDaemon::ms_get_auth1_authorizer_keystore()
-{
-  return monc->rotating_secrets.get();
-}
-
 bool MDSDaemon::parse_caps(const AuthCapsInfo& info, MDSAuthCaps& caps)
 {
   caps.clear();