update sources to v12.2.3

[ceph.git] / ceph / src / mon / MDSMonitor.cc
diff --git a/ceph/src/mon/MDSMonitor.cc b/ceph/src/mon/MDSMonitor.cc

index d5a350efb37b7dd5e2f407704171193fc23ceb6e..5c51d2f84d554a46d8389ead0a0659e5e96058ef 100644 (file)
--- a/ceph/src/mon/MDSMonitor.cc
+++ b/ceph/src/mon/MDSMonitor.cc
@@ -49,6 +49,10 @@ static ostream& _prefix(std::ostream *_dout, Monitor *mon, FSMap const& fsmap) {
                 << ").mds e" << fsmap.get_epoch() << " ";
  }
  
+static const string MDS_METADATA_PREFIX("mds_metadata");  // inserted by get_store_prefixes()
+static const string MDS_HEALTH_PREFIX("mds_health");      // inserted by get_store_prefixes()
+
+
  /*
   * Specialized implementation of cmd_getval to allow us to parse
   * out strongly-typedef'd types
@@ -71,9 +75,6 @@ template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
    return cmd_getval(cct, cmdmap, k, (int64_t&)val);
  }
  
-static const string MDS_METADATA_PREFIX("mds_metadata");
-
-
  // my methods
  
  void MDSMonitor::print_map(FSMap &m, int dbl)
@@ -89,6 +90,12 @@ void MDSMonitor::create_initial()
    dout(10) << "create_initial" << dendl;
  }
  
+void MDSMonitor::get_store_prefixes(std::set<string>& s)
+{  // Inserts this service's name plus the MDS metadata/health prefixes into `s`.
+  s.insert(service_name);         // the service's own name
+  s.insert(MDS_METADATA_PREFIX);  // "mds_metadata"
+  s.insert(MDS_HEALTH_PREFIX);    // "mds_health"
+}
  
  void MDSMonitor::update_from_paxos(bool *need_bootstrap)
  {
@@ -133,6 +140,11 @@ void MDSMonitor::create_pending()
    pending_fsmap = fsmap;
    pending_fsmap.epoch++;
  
+  if (mon->osdmon()->is_readable()) {
+    auto &osdmap = mon->osdmon()->osdmap;
+    pending_fsmap.sanitize([&osdmap](int64_t pool){return osdmap.have_pg_pool(pool);});
+  }
+
    dout(10) << "create_pending e" << pending_fsmap.epoch << dendl;
  }
  
@@ -202,7 +214,7 @@ void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
        health.decode(bl_i);
      }
      for (const auto &metric : health.metrics) {
-      int const rank = info.rank;
+      const int rank = info.rank;
        health_check_t *check = &new_checks.get_or_add(
         mds_metric_name(metric.type),
         metric.sev,
@@ -234,6 +246,10 @@ void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
        p.second.summary,
        boost::regex("%isorare%"),
        p.second.detail.size() > 1 ? "are" : "is");
+    p.second.summary = boost::regex_replace(
+      p.second.summary,
+      boost::regex("%hasorhave%"),
+      p.second.detail.size() > 1 ? "have" : "has");
    }
    encode_health(new_checks, t);
  }
@@ -565,7 +581,9 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
            mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
            return false;
          }
-        mon->clog->info() << "MDS daemon '" << m->get_name() << "' restarted";
+        const MDSMap::mds_info_t &existing_info =
+          pending_fsmap.get_info_gid(existing);
+        mon->clog->info() << existing_info.human_name() << " restarted";
         fail_mds_gid(existing);
          failed_mds = true;
        }
@@ -599,9 +617,8 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
        if (leaderinfo && (leaderinfo->rank >= 0)) {
          auto fscid = pending_fsmap.mds_roles.at(leaderinfo->global_id);
          auto fs = pending_fsmap.get_filesystem(fscid);
-        bool followable = fs->mds_map.is_followable(leaderinfo->rank);
  
-        pending_fsmap.modify_daemon(gid, [fscid, leaderinfo, followable](
+        pending_fsmap.modify_daemon(gid, [fscid, leaderinfo](
                MDSMap::mds_info_t *info) {
              info->standby_for_rank = leaderinfo->rank;
              info->standby_for_fscid = fscid;
@@ -659,6 +676,14 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
              << "  standby_for_rank=" << m->get_standby_for_rank()
              << dendl;
      if (state == MDSMap::STATE_STOPPED) {
+      const auto fscid = pending_fsmap.mds_roles.at(gid);
+      auto fs = pending_fsmap.get_filesystem(fscid);
+
+      mon->clog->info() << info.human_name() << " finished "
+                        << "deactivating rank " << info.rank << " in filesystem "
+                        << fs->mds_map.fs_name << " (now has "
+                        << fs->mds_map.get_num_in_mds() - 1 << " ranks)";
+
        auto erased = pending_fsmap.stop(gid);
        erased.push_back(gid);
  
@@ -669,6 +694,8 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
            pending_daemon_health_rm.insert(erased_gid);
          }
        }
+
+
      } else if (state == MDSMap::STATE_DAMAGED) {
        if (!mon->osdmon()->is_writeable()) {
          dout(4) << __func__ << ": DAMAGED from rank " << info.rank
@@ -683,7 +710,7 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
                << info.rank << " damaged" << dendl;
  
        utime_t until = ceph_clock_now();
-      until += g_conf->mds_blacklist_interval;
+      until += g_conf->get_val<double>("mon_mds_blacklist_interval");
        const auto blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
        request_proposal(mon->osdmon());
        pending_fsmap.damaged(gid, blacklist_epoch);
@@ -728,6 +755,14 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
             << ceph_mds_state_name(state) << dendl;
        return true;
      } else {
+      if (info.state != MDSMap::STATE_ACTIVE && state == MDSMap::STATE_ACTIVE) {
+        auto fscid = pending_fsmap.mds_roles.at(gid);
+        auto fs = pending_fsmap.get_filesystem(fscid);
+        mon->clog->info() << info.human_name() << " is now active in "
+                          << "filesystem " << fs->mds_map.fs_name << " as rank "
+                          << info.rank;
+      }
+
        // Made it through special cases and validations, record the
        // daemon's reported state to the FSMap.
        pending_fsmap.modify_daemon(gid, [state, seq](MDSMap::mds_info_t *info) {
@@ -778,7 +813,7 @@ void MDSMonitor::_updated(MonOpRequestRef op)
    op->mark_mdsmon_event(__func__);
    MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
    dout(10) << "_updated " << m->get_orig_source() << " " << *m << dendl;
-  mon->clog->info() << m->get_orig_source_inst() << " "
+  mon->clog->debug() << m->get_orig_source_inst() << " "
           << ceph_mds_state_name(m->get_state());
  
    if (m->get_state() == MDSMap::STATE_STOPPED) {
@@ -832,7 +867,7 @@ void MDSMonitor::get_health(list<pair<health_status_t, string> >& summary,
      health.decode(bl_i);
  
      for (const auto &metric : health.metrics) {
-      int const rank = info.rank;
+      const int rank = info.rank;
        std::ostringstream message;
        message << "mds" << rank << ": " << metric.message;
        summary.push_back(std::make_pair(metric.sev, message.str()));
@@ -946,11 +981,11 @@ bool MDSMonitor::preprocess_command(MonOpRequestRef op)
        } else {
         mdsmap->print(ds);
         r = 0;
-      } 
-      if (r == 0) {
-       rdata.append(ds);
-       ss << "dumped fsmap epoch " << p->get_epoch();
        }
+
+      rdata.append(ds);
+      ss << "dumped fsmap epoch " << p->get_epoch();
+
        if (p != &fsmap) {
         delete p;
        }
@@ -985,11 +1020,11 @@ bool MDSMonitor::preprocess_command(MonOpRequestRef op)
        } else {
         p->print(ds);
         r = 0;
-      } 
-      if (r == 0) {
-       rdata.append(ds);
-       ss << "dumped fsmap epoch " << p->get_epoch();
        }
+
+      rdata.append(ds);
+      ss << "dumped fsmap epoch " << p->get_epoch();
+
        if (p != &fsmap)
         delete p;
      }
@@ -1021,6 +1056,7 @@ bool MDSMonitor::preprocess_command(MonOpRequestRef op)
            derr << "Unexpected error reading metadata: " << cpp_strerror(r)
                 << dendl;
            ss << get_err.str();
+          f->close_section();
            break;
          }
          f->close_section();
@@ -1181,7 +1217,7 @@ bool MDSMonitor::fail_mds_gid(mds_gid_t gid)
    epoch_t blacklist_epoch = 0;
    if (info.rank >= 0 && info.state != MDSMap::STATE_STANDBY_REPLAY) {
      utime_t until = ceph_clock_now();
-    until += g_conf->mds_blacklist_interval;
+    until += g_conf->get_val<double>("mon_mds_blacklist_interval");
      blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
    }
  
@@ -1243,8 +1279,11 @@ mds_gid_t MDSMonitor::gid_from_arg(const std::string& arg, std::ostream &ss)
    return MDS_GID_NONE;
  }
  
-int MDSMonitor::fail_mds(std::ostream &ss, const std::string &arg)
+int MDSMonitor::fail_mds(std::ostream &ss, const std::string &arg,
+    MDSMap::mds_info_t *failed_info)
  {
+  assert(failed_info != nullptr);
+
    mds_gid_t gid = gid_from_arg(arg, ss);
    if (gid == MDS_GID_NONE) {
      return 0;
@@ -1252,6 +1291,11 @@ int MDSMonitor::fail_mds(std::ostream &ss, const std::string &arg)
    if (!mon->osdmon()->is_writeable()) {
      return -EAGAIN;
    }
+
+  // Take a copy of the info before removing the MDS from the map,
+  // so that the caller knows which mds (if any) they ended up removing.
+  *failed_info = pending_fsmap.get_info_gid(gid);
+
    fail_mds_gid(gid);
    ss << "failed mds gid " << gid;
    assert(mon->osdmon()->is_writeable());
@@ -1284,9 +1328,18 @@ bool MDSMonitor::prepare_command(MonOpRequestRef op)
      return true;
    }
  
+  bool batched_propose = false;
    for (auto h : handlers) {
      if (h->can_handle(prefix)) {
+      batched_propose = h->batched_propose();
+      if (batched_propose) {
+        paxos->plug();
+      }
        r = h->handle(mon, pending_fsmap, op, cmdmap, ss);
+      if (batched_propose) {
+        paxos->unplug();
+      }
+
        if (r == -EAGAIN) {
          // message has been enqueued for retry; return.
          dout(4) << __func__ << " enqueue for retry by prepare_command" << dendl;
@@ -1340,6 +1393,9 @@ out:
      // success.. delay reply
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
                                               get_last_committed() + 1));
+    if (batched_propose) {
+      force_immediate_propose();
+    }
      return true;
    } else {
      // reply immediately
@@ -1443,10 +1499,18 @@ int MDSMonitor::filesystem_command(
    } else if (prefix == "mds fail") {
      string who;
      cmd_getval(g_ceph_context, cmdmap, "who", who);
-    r = fail_mds(ss, who);
+
+    MDSMap::mds_info_t failed_info;
+    r = fail_mds(ss, who, &failed_info);
      if (r < 0 && r == -EAGAIN) {
        mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
        return -EAGAIN; // don't propose yet; wait for message to be retried
+    } else if (r == 0) {
+      // Only log if we really did something (not when the MDS was already gone)
+      if (failed_info.global_id != MDS_GID_NONE) {
+        mon->clog->info() << failed_info.human_name() << " marked failed by "
+                          << op->get_session()->entity_name;
+      }
      }
    } else if (prefix == "mds rm") {
      mds_gid_t gid;
@@ -1852,19 +1916,24 @@ int MDSMonitor::load_metadata(map<mds_gid_t, Metadata>& m)
    return 0;
  }
  
-void MDSMonitor::count_metadata(const string& field, Formatter *f)
+void MDSMonitor::count_metadata(const string& field, map<string,int> *out)  // tally `field` values into *out
  {
-  map<string,int> by_val;
    map<mds_gid_t,Metadata> meta;
    load_metadata(meta);
    for (auto& p : meta) {
      auto q = p.second.find(field);
      if (q == p.second.end()) {
-      by_val["unknown"]++;
+      (*out)["unknown"]++;  // this gid's metadata has no `field` key
      } else {
-      by_val[q->second]++;
+      (*out)[q->second]++;  // one more entry with this value; *out is only incremented, never reset
      }
    }
+}
+
+void MDSMonitor::count_metadata(const string& field, Formatter *f)
+{
+  map<string,int> by_val;
+  count_metadata(field, &by_val);
    f->open_object_section(field.c_str());
    for (auto& p : by_val) {
      f->dump_int(p.first.c_str(), p.second);
@@ -1954,8 +2023,14 @@ bool MDSMonitor::maybe_expand_cluster(std::shared_ptr<Filesystem> fs)
        break;
      }
  
-    dout(1) << "adding standby " << pending_fsmap.get_info_gid(newgid).addr
+    const auto &new_info = pending_fsmap.get_info_gid(newgid);
+    dout(1) << "assigned standby " << new_info.addr
              << " as mds." << mds << dendl;
+
+    mon->clog->info() << new_info.human_name() << " assigned to "
+                         "filesystem " << fs->mds_map.fs_name << " as rank "
+                      << mds << " (now has " << fs->mds_map.get_num_in_mds() + 1
+                      << " ranks)";
      pending_fsmap.promote(newgid, fs, mds);
      do_propose = true;
    }
@@ -1969,21 +2044,14 @@ bool MDSMonitor::maybe_expand_cluster(std::shared_ptr<Filesystem> fs)
   * is available, fail this daemon (remove from map) and pass its
   * role to another daemon.
   */
-void MDSMonitor::maybe_replace_gid(mds_gid_t gid,
-    const beacon_info_t &beacon,
+void MDSMonitor::maybe_replace_gid(mds_gid_t gid, const MDSMap::mds_info_t& info,
      bool *mds_propose, bool *osd_propose)
  {
    assert(mds_propose != nullptr);
    assert(osd_propose != nullptr);
  
-  const MDSMap::mds_info_t info = pending_fsmap.get_info_gid(gid);
    const auto fscid = pending_fsmap.mds_roles.at(gid);
  
-  dout(10) << "no beacon from " << gid << " " << info.addr << " mds."
-    << info.rank << "." << info.inc
-    << " " << ceph_mds_state_name(info.state)
-    << " since " << beacon.stamp << dendl;
-
    // We will only take decisive action (replacing/removing a daemon)
    // if we have some indicating that some other daemon(s) are successfully
    // getting beacons through recently.
@@ -2013,10 +2081,10 @@ void MDSMonitor::maybe_replace_gid(mds_gid_t gid,
        << " " << ceph_mds_state_name(info.state)
        << " with " << sgid << "/" << si.name << " " << si.addr << dendl;
  
-    mon->clog->warn() << "MDS daemon '" << info.name << "'"
+    mon->clog->warn() << info.human_name() 
                        << " is not responding, replacing it "
                        << "as rank " << info.rank
-                      << " with standby '" << si.name << "'";
+                      << " with standby " << si.human_name();
  
      // Remember what NS the old one was in
      const fs_cluster_id_t fscid = pending_fsmap.mds_roles.at(gid);
@@ -2034,9 +2102,8 @@ void MDSMonitor::maybe_replace_gid(mds_gid_t gid,
      dout(10) << " failing and removing " << gid << " " << info.addr << " mds." << info.rank 
        << "." << info.inc << " " << ceph_mds_state_name(info.state)
        << dendl;
-    mon->clog->info() << "MDS standby '"  << info.name
-                      << "' is not responding, removing it from the set of "
-                      << "standbys";
+    mon->clog->info() << "Standby " << info.human_name() << " is not "
+                         "responding, dropping it";
      fail_mds_gid(gid);
      *mds_propose = true;
    } else if (!info.laggy()) {
@@ -2069,6 +2136,10 @@ bool MDSMonitor::maybe_promote_standby(std::shared_ptr<Filesystem> fs)
          const MDSMap::mds_info_t si = pending_fsmap.get_info_gid(sgid);
          dout(0) << " taking over failed mds." << f << " with " << sgid
                  << "/" << si.name << " " << si.addr << dendl;
+        mon->clog->info() << "Standby " << si.human_name()
+                          << " assigned to filesystem " << fs->mds_map.fs_name
+                          << " as rank " << f;
+
          pending_fsmap.promote(sgid, fs, f);
         do_propose = true;
        }
@@ -2207,32 +2278,36 @@ void MDSMonitor::tick()
      }
    }
  
-  // If the OSDMap is writeable, we can blacklist things, so we can
-  // try failing any laggy MDS daemons.  Consider each one for failure.
-  if (mon->osdmon()->is_writeable()) {
-    bool propose_osdmap = false;
-
-    map<mds_gid_t, beacon_info_t>::iterator p = last_beacon.begin();
-    while (p != last_beacon.end()) {
-      mds_gid_t gid = p->first;
-      auto beacon_info = p->second;
-      ++p;
-
-      if (!pending_fsmap.gid_exists(gid)) {
-       // clean it out
-       last_beacon.erase(gid);
-       continue;
-      }
+  bool propose_osdmap = false;
+  bool osdmap_writeable = mon->osdmon()->is_writeable();
+  auto p = last_beacon.begin();
+  while (p != last_beacon.end()) {
+    mds_gid_t gid = p->first;
+    auto beacon_info = p->second;
+    ++p;
  
-      if (beacon_info.stamp < cutoff) {
-        maybe_replace_gid(gid, beacon_info, &do_propose, &propose_osdmap);
-      }
+    if (!pending_fsmap.gid_exists(gid)) {
+      // clean it out
+      last_beacon.erase(gid);
+      continue;
      }
  
-    if (propose_osdmap) {
-      request_proposal(mon->osdmon());
+    if (beacon_info.stamp < cutoff) {
+      auto &info = pending_fsmap.get_info_gid(gid);
+      dout(1) << "no beacon from mds." << info.rank << "." << info.inc
+              << " (gid: " << gid << " addr: " << info.addr
+              << " state: " << ceph_mds_state_name(info.state) << ")"
+              << " since " << beacon_info.stamp << dendl;
+      // If the OSDMap is writeable, we can blacklist things, so we can
+      // try failing any laggy MDS daemons.  Consider each one for failure.
+      if (osdmap_writeable) {
+        maybe_replace_gid(gid, info, &do_propose, &propose_osdmap);
+      }
      }
    }
+  if (propose_osdmap) {
+    request_proposal(mon->osdmon());
+  }
  
    for (auto i : pending_fsmap.filesystems) {
      auto fs = i.second;
@@ -2271,7 +2346,7 @@ bool MDSMonitor::try_standby_replay(
  MDSMonitor::MDSMonitor(Monitor *mn, Paxos *p, string service_name)
    : PaxosService(mn, p, service_name)
  {
-  handlers = FileSystemCommandHandler::load();
+  handlers = FileSystemCommandHandler::load(p);  // pass the Paxos pointer through to the handler loader
  }
  
  void MDSMonitor::on_restart()