git.proxmox.com Git - ceph.git/blobdiff - ceph/src/mon/PGMonitor.cc
update sources to v12.1.0
[ceph.git] / ceph / src / mon / PGMonitor.cc
index 151468a58970ce7f03198469e211f1505b0811cc..7fdb5bd6357a786afbc30ed4acb8625d5c78078d 100644 (file)
 #include "Monitor.h"
 #include "OSDMonitor.h"
 #include "MonitorDBStore.h"
+#include "PGStatService.h"
 
 #include "messages/MPGStats.h"
 #include "messages/MPGStatsAck.h"
-#include "messages/MGetPoolStats.h"
-#include "messages/MGetPoolStatsReply.h"
 
-#include "messages/MStatfs.h"
-#include "messages/MStatfsReply.h"
 #include "messages/MOSDPGCreate.h"
 #include "messages/MMonCommand.h"
 #include "messages/MOSDScrub.h"
@@ -72,12 +69,17 @@ void PGMonitor::on_active()
 
   update_logger();
 
-  if (mon->is_leader())
+  if (mon->is_leader() &&
+      mon->osdmon()->osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
     mon->clog->info() << "pgmap " << pg_map;
+  }
 }
 
 void PGMonitor::update_logger()
 {
+  if (mon->osdmon()->osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
+    return;
+  }
   dout(10) << "update_logger" << dendl;
 
   mon->cluster_logger->set(l_cluster_osd_bytes, pg_map.osd_sum.kb * 1024ull);
@@ -115,6 +117,9 @@ void PGMonitor::update_logger()
 void PGMonitor::tick()
 {
   if (!is_active()) return;
+  if (mon->osdmon()->osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
+    return;
+  }
 
   handle_osd_timeouts();
 
@@ -162,6 +167,19 @@ void PGMonitor::create_initial()
 
 void PGMonitor::update_from_paxos(bool *need_bootstrap)
 {
+  if (did_delete)
+    return;
+
+  if (get_value("deleted")) {
+    did_delete = true;
+    dout(10) << __func__ << " deleted, clearing in-memory PGMap" << dendl;
+    pg_map = PGMap();
+    pending_inc = PGMap::Incremental();
+    pgservice.reset();
+    last_osd_report.clear();
+    return;
+  }
+
   version_t version = get_last_committed();
   if (version == pg_map.version)
     return;
@@ -216,21 +234,13 @@ void PGMonitor::on_upgrade()
 void PGMonitor::upgrade_format()  // checks format_version against the single value this code supports
 {
   unsigned current = 1;  // the only format version accepted below
-  assert(format_version <= current);
-  if (format_version == current)
-    return;
-
-  dout(1) << __func__ << " to " << current << dendl;
-
-  // upgrade by dirtying it all
-  pg_map.dirty_all(pending_inc);
-
-  format_version = current;
-  propose_pending();
+  assert(format_version == current);  // the dirty-all upgrade path is removed: any other format_version trips this assert
 }
 
 void PGMonitor::post_paxos_update()
 {
+  if (did_delete)
+    return;
   dout(10) << __func__ << dendl;
   OSDMap& osdmap = mon->osdmon()->osdmap;
   if (mon->monmap->get_required_features().contains_all(
@@ -250,6 +260,8 @@ void PGMonitor::handle_osd_timeouts()
 {
   if (!mon->is_leader())
     return;
+  if (did_delete)
+    return;
 
   utime_t now(ceph_clock_now());
   utime_t timeo(g_conf->mon_osd_report_timeout, 0);
@@ -264,6 +276,9 @@ void PGMonitor::handle_osd_timeouts()
 
 void PGMonitor::create_pending()
 {
+  if (did_delete)
+    return;
+  do_delete = false;
   pending_inc = PGMap::Incremental();
   pending_inc.version = pg_map.version + 1;
   if (pg_map.version == 0) {
@@ -362,7 +377,7 @@ void PGMonitor::apply_pgmap_delta(bufferlist& bl)
   }
 
   pool_stat_t pg_sum_old = pg_map.pg_sum;
-  ceph::unordered_map<uint64_t, pool_stat_t> pg_pool_sum_old;
+  mempool::pgmap::unordered_map<uint64_t, pool_stat_t> pg_pool_sum_old;
 
   // pgs
   set<int64_t> deleted_pools;
@@ -429,6 +444,32 @@ void PGMonitor::apply_pgmap_delta(bufferlist& bl)
 
 void PGMonitor::encode_pending(MonitorDBStore::TransactionRef t)
 {
+  if (did_delete)
+    return;
+
+  string prefix = pgmap_meta_prefix;
+  if (do_delete) {
+    dout(1) << __func__ << " clearing pgmap data at v" << pending_inc.version
+           << dendl;
+    do_delete = false;
+    for (auto key : { "version", "stamp", "last_osdmap_epoch",
+         "last_pg_scan", "full_ratio", "nearfull_ratio" }) {
+      t->erase(prefix, key);
+    }
+    for (auto& p : pg_map.pg_stat) {
+      t->erase(prefix, stringify(p.first));
+    }
+    for (auto& p : pg_map.osd_stat) {
+      t->erase(prefix, stringify(p.first));
+    }
+    put_last_committed(t, pending_inc.version);
+    put_value(t, "deleted", 1);
+    return;
+  }
+
+  assert(mon->osdmon()->osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS ||
+        pending_inc.version == 1  /* rebuild-mondb.yaml case */);
+
   version_t version = pending_inc.version;
   dout(10) << __func__ << " v " << version << dendl;
   assert(get_last_committed() + 1 == version);
@@ -436,8 +477,6 @@ void PGMonitor::encode_pending(MonitorDBStore::TransactionRef t)
 
   uint64_t features = mon->get_quorum_con_features();
 
-  string prefix = pgmap_meta_prefix;
-
   t->put(prefix, "version", pending_inc.version);
   {
     bufferlist bl;
@@ -519,17 +558,14 @@ version_t PGMonitor::get_trim_to()
 
 bool PGMonitor::preprocess_query(MonOpRequestRef op)
 {
+  if (mon->osdmon()->osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
+    return false;
+  }
+
   op->mark_pgmon_event(__func__);
   PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
   dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
   switch (m->get_type()) {
-  case CEPH_MSG_STATFS:
-    handle_statfs(op);
-    return true;
-
-  case MSG_GETPOOLSTATS:
-    return preprocess_getpoolstats(op);
-
   case MSG_PGSTATS:
     return preprocess_pg_stats(op);
 
@@ -545,6 +581,10 @@ bool PGMonitor::preprocess_query(MonOpRequestRef op)
 
 bool PGMonitor::prepare_update(MonOpRequestRef op)
 {
+  if (mon->osdmon()->osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
+    return false;
+  }
+
   op->mark_pgmon_event(__func__);
   PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
   dout(10) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;
@@ -561,85 +601,6 @@ bool PGMonitor::prepare_update(MonOpRequestRef op)
   }
 }
 
-void PGMonitor::handle_statfs(MonOpRequestRef op)
-{
-  op->mark_pgmon_event(__func__);
-  MStatfs *statfs = static_cast<MStatfs*>(op->get_req());
-  // check caps
-  MonSession *session = statfs->get_session();
-  if (!session)
-    return;
-
-  if (!session->is_capable("pg", MON_CAP_R)) {
-    dout(0) << "MStatfs received from entity with insufficient privileges "
-            << session->caps << dendl;
-    return;
-  }
-
-  if (statfs->fsid != mon->monmap->fsid) {
-    dout(0) << "handle_statfs on fsid " << statfs->fsid
-            << " != " << mon->monmap->fsid << dendl;
-    return;
-  }
-
-
-  dout(10) << "handle_statfs " << *statfs
-           << " from " << statfs->get_orig_source() << dendl;
-
-  // fill out stfs
-  MStatfsReply *reply = new MStatfsReply(mon->monmap->fsid, statfs->get_tid(),
-    get_last_committed());
-
-  // these are in KB.
-  reply->h.st.kb = pg_map.osd_sum.kb;
-  reply->h.st.kb_used = pg_map.osd_sum.kb_used;
-  reply->h.st.kb_avail = pg_map.osd_sum.kb_avail;
-  reply->h.st.num_objects = pg_map.pg_sum.stats.sum.num_objects;
-
-  // reply
-  mon->send_reply(op, reply);
-}
-
-bool PGMonitor::preprocess_getpoolstats(MonOpRequestRef op)
-{
-  op->mark_pgmon_event(__func__);
-  MGetPoolStats *m = static_cast<MGetPoolStats*>(op->get_req());
-  MGetPoolStatsReply *reply;
-
-  MonSession *session = m->get_session();
-  if (!session)
-    goto out;
-  if (!session->is_capable("pg", MON_CAP_R)) {
-    dout(0) << "MGetPoolStats received from entity with insufficient caps "
-            << session->caps << dendl;
-    goto out;
-  }
-
-  if (m->fsid != mon->monmap->fsid) {
-    dout(0) << "preprocess_getpoolstats on fsid " << m->fsid << " != " << mon->monmap->fsid << dendl;
-    goto out;
-  }
-
-  reply = new MGetPoolStatsReply(m->fsid, m->get_tid(), get_last_committed());
-
-  for (list<string>::iterator p = m->pools.begin();
-       p != m->pools.end();
-       ++p) {
-    int64_t poolid = mon->osdmon()->osdmap.lookup_pg_pool_name(p->c_str());
-    if (poolid < 0)
-      continue;
-    if (pg_map.pg_pool_sum.count(poolid) == 0)
-      continue;
-    reply->pool_stats[*p] = pg_map.pg_pool_sum[poolid];
-  }
-
-  mon->send_reply(op, reply);
-
-out:
-  return true;
-}
-
-
 bool PGMonitor::preprocess_pg_stats(MonOpRequestRef op)
 {
   op->mark_pgmon_event(__func__);
@@ -758,7 +719,7 @@ bool PGMonitor::prepare_pg_stats(MonOpRequestRef op)
 
   // osd stat
   if (mon->osdmon()->osdmap.is_in(from)) {
-    pending_inc.update_stat(from, stats->epoch, stats->osd_stat);
+    pending_inc.update_stat(from, stats->epoch, std::move(stats->osd_stat));
   } else {
     pending_inc.update_stat(from, stats->epoch, osd_stat_t());
   }
@@ -844,6 +805,9 @@ void PGMonitor::check_osd_map(epoch_t epoch)
   if (mon->is_peon())
     return;  // whatever.
 
+  if (did_delete)
+    return;
+
   if (pg_map.last_osdmap_epoch >= epoch) {
     dout(10) << __func__ << " already seen " << pg_map.last_osdmap_epoch
              << " >= " << epoch << dendl;
@@ -862,11 +826,19 @@ void PGMonitor::check_osd_map(epoch_t epoch)
     return;
   }
 
+  const OSDMap& osdmap = mon->osdmon()->osdmap;
+  if (!did_delete && osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
+    // delete all my data
+    dout(1) << __func__ << " will clear pg_map data" << dendl;
+    do_delete = true;
+    propose_pending();
+    return;
+  }
+
   // osds that went up or down
   set<int> need_check_down_pg_osds;
 
   // apply latest map(s)
-  const OSDMap& osdmap = mon->osdmon()->osdmap;
   epoch = std::max(epoch, osdmap.get_epoch());
   for (epoch_t e = pg_map.last_osdmap_epoch+1;
        e <= epoch;
@@ -938,16 +910,6 @@ epoch_t PGMonitor::send_pg_creates(int osd, Connection *con, epoch_t next)
   return last + 1;
 }
 
-void PGMonitor::dump_info(Formatter *f) const
-{
-  f->open_object_section("pgmap");
-  pg_map.dump(f);
-  f->close_section();
-
-  f->dump_unsigned("pgmap_first_committed", get_first_committed());
-  f->dump_unsigned("pgmap_last_committed", get_last_committed());
-}
-
 bool PGMonitor::preprocess_command(MonOpRequestRef op)
 {
   op->mark_pgmon_event(__func__);
@@ -1000,12 +962,12 @@ bool PGMonitor::preprocess_command(MonOpRequestRef op)
       r = -ENOENT;
       goto reply;
     }
-    if (pg_map.pg_stat[pgid].acting_primary == -1) {
+    int osd = pg_map.pg_stat[pgid].acting_primary;
+    if (osd == -1) {
       ss << "pg " << pgid << " has no primary osd";
       r = -EAGAIN;
       goto reply;
     }
-    int osd = pg_map.pg_stat[pgid].acting_primary;
     if (!mon->osdmon()->osdmap.is_up(osd)) {
       ss << "pg " << pgid << " primary osd." << osd << " not up";
       r = -EAGAIN;
@@ -1098,7 +1060,7 @@ bool PGMonitor::prepare_command(MonOpRequestRef op)
     goto update;
   } else if (prefix == "pg set_full_ratio" ||
              prefix == "pg set_nearfull_ratio") {
-    if (mon->osdmon()->osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
+    if (mon->osdmon()->osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
       ss << "please use the new luminous interfaces"
         << " ('osd set-full-ratio' and 'osd set-nearfull-ratio')";
       r = -EPERM;
@@ -1136,498 +1098,23 @@ update:
   return true;
 }
 
-// Only called with a single bit set in "what"
-static void note_stuck_detail(int what,
-                              ceph::unordered_map<pg_t,pg_stat_t>& stuck_pgs,
-                              list<pair<health_status_t,string> > *detail)
-{
-  for (ceph::unordered_map<pg_t,pg_stat_t>::iterator p = stuck_pgs.begin();
-       p != stuck_pgs.end();
-       ++p) {
-    ostringstream ss;
-    utime_t since;
-    const char *whatname = 0;
-    switch (what) {
-    case PGMap::STUCK_INACTIVE:
-      since = p->second.last_active;
-      whatname = "inactive";
-      break;
-    case PGMap::STUCK_UNCLEAN:
-      since = p->second.last_clean;
-      whatname = "unclean";
-      break;
-    case PGMap::STUCK_DEGRADED:
-      since = p->second.last_undegraded;
-      whatname = "degraded";
-      break;
-    case PGMap::STUCK_UNDERSIZED:
-      since = p->second.last_fullsized;
-      whatname = "undersized";
-      break;
-    case PGMap::STUCK_STALE:
-      since = p->second.last_unstale;
-      whatname = "stale";
-      break;
-    default:
-      ceph_abort();
-    }
-    ss << "pg " << p->first << " is stuck " << whatname;
-    if (since == utime_t()) {
-      ss << " since forever";
-    } else {
-      utime_t dur = ceph_clock_now() - since;
-      ss << " for " << dur;
-    }
-    ss << ", current state " << pg_state_string(p->second.state)
-       << ", last acting " << p->second.acting;
-    detail->push_back(make_pair(HEALTH_WARN, ss.str()));
-  }
-}
-
-int PGMonitor::_warn_slow_request_histogram(const pow2_hist_t& h, string suffix,
-                                            list<pair<health_status_t,string> >& summary,
-                                            list<pair<health_status_t,string> > *detail) const
-{
-  if (h.h.empty())
-    return 0;
-
-  unsigned sum = 0;
-  for (unsigned i = h.h.size() - 1; i > 0; --i) {
-    float ub = (float)(1 << i) / 1000.0;
-    if (ub < g_conf->mon_osd_max_op_age)
-      break;
-    ostringstream ss;
-    if (h.h[i]) {
-      ss << h.h[i] << " ops are blocked > " << ub << " sec" << suffix;
-      if (detail)
-       detail->push_back(make_pair(HEALTH_WARN, ss.str()));
-      sum += h.h[i];
-    }
-  }
-  return sum;
-}
-
-namespace {
-  enum class scrubbed_or_deepscrubbed_t { SCRUBBED, DEEPSCRUBBED };
-
-  void print_unscrubbed_detailed(const std::pair<const pg_t,pg_stat_t> &pg_entry,
-                                list<pair<health_status_t,string> > *detail,
-                                scrubbed_or_deepscrubbed_t how_scrubbed) {
-
-    std::stringstream ss;
-    const auto& pg_stat(pg_entry.second);
-
-    ss << "pg " << pg_entry.first << " is not ";
-    if (how_scrubbed == scrubbed_or_deepscrubbed_t::SCRUBBED) {
-      ss << "scrubbed, last_scrub_stamp "
-        << pg_stat.last_scrub_stamp;
-    } else if (how_scrubbed == scrubbed_or_deepscrubbed_t::DEEPSCRUBBED) {
-      ss << "deep-scrubbed, last_deep_scrub_stamp "
-        << pg_stat.last_deep_scrub_stamp;
-    }
-
-    detail->push_back(make_pair(HEALTH_WARN, ss.str()));
-  }
-
-
-  using pg_stat_map_t = const ceph::unordered_map<pg_t,pg_stat_t>;
-
-  void print_unscrubbed_pgs(pg_stat_map_t& pg_stats,
-                           list<pair<health_status_t,string> > &summary,
-                           list<pair<health_status_t,string> > *detail,
-                           const CephContext* cct) {
-    if (cct->_conf->mon_warn_not_scrubbed == 0 &&
-      cct->_conf->mon_warn_not_deep_scrubbed == 0)
-      return;
-
-    int pgs_count = 0;
-    const utime_t now = ceph_clock_now();
-    for (const auto& pg_entry : pg_stats) {
-      const auto& pg_stat(pg_entry.second);
-      const utime_t time_since_ls = now - pg_stat.last_scrub_stamp;
-      const utime_t time_since_lds = now - pg_stat.last_deep_scrub_stamp;
-
-      const int mon_warn_not_scrubbed =
-       cct->_conf->mon_warn_not_scrubbed + cct->_conf->mon_scrub_interval;
-
-      const int mon_warn_not_deep_scrubbed =
-       cct->_conf->mon_warn_not_deep_scrubbed + cct->_conf->osd_deep_scrub_interval;
-
-      bool not_scrubbed = (time_since_ls >= mon_warn_not_scrubbed &&
-                          cct->_conf->mon_warn_not_scrubbed != 0);
-
-      bool not_deep_scrubbed = (time_since_lds >= mon_warn_not_deep_scrubbed &&
-                               cct->_conf->mon_warn_not_deep_scrubbed != 0);
-
-      if (detail != nullptr) {
-       if (not_scrubbed) {
-         print_unscrubbed_detailed(pg_entry,
-                                   detail,
-                                   scrubbed_or_deepscrubbed_t::SCRUBBED);
-       }
-        if (not_deep_scrubbed) {
-         print_unscrubbed_detailed(pg_entry,
-                                   detail,
-                                   scrubbed_or_deepscrubbed_t::DEEPSCRUBBED);
-       }
-      }
-      if (not_scrubbed || not_deep_scrubbed) {
-       ++pgs_count;
-      }
-    }
-
-    if (pgs_count > 0) {
-      std::stringstream ss;
-      ss << pgs_count << " unscrubbed pgs";
-      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
-    }
-
-  }
-}
-
 void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
                           list<pair<health_status_t,string> > *detail,
                           CephContext *cct) const
 {  // after this change, entries are added to summary/detail only while require_osd_release is below luminous
-  map<string,int> note;
-  ceph::unordered_map<int,int>::const_iterator p = pg_map.num_pg_by_state.begin();
-  ceph::unordered_map<int,int>::const_iterator p_end = pg_map.num_pg_by_state.end();
-  for (; p != p_end; ++p) {
-    if (p->first & PG_STATE_STALE)
-      note["stale"] += p->second;
-    if (p->first & PG_STATE_DOWN)
-      note["down"] += p->second;
-    if (p->first & PG_STATE_UNDERSIZED)
-      note["undersized"] += p->second;
-    if (p->first & PG_STATE_DEGRADED)
-      note["degraded"] += p->second;
-    if (p->first & PG_STATE_INCONSISTENT)
-      note["inconsistent"] += p->second;
-    if (p->first & PG_STATE_PEERING)
-      note["peering"] += p->second;
-    if (p->first & PG_STATE_REPAIR)
-      note["repair"] += p->second;
-    if (p->first & PG_STATE_RECOVERING)
-      note["recovering"] += p->second;
-    if (p->first & PG_STATE_RECOVERY_WAIT)
-      note["recovery_wait"] += p->second;
-    if (p->first & PG_STATE_INCOMPLETE)
-      note["incomplete"] += p->second;
-    if (p->first & PG_STATE_BACKFILL_WAIT)
-      note["backfill_wait"] += p->second;
-    if (p->first & PG_STATE_BACKFILL)
-      note["backfilling"] += p->second;
-    if (p->first & PG_STATE_BACKFILL_TOOFULL)
-      note["backfill_toofull"] += p->second;
-    if (p->first & PG_STATE_RECOVERY_TOOFULL)
-      note["recovery_toofull"] += p->second;
-  }
-
-  ceph::unordered_map<pg_t, pg_stat_t> stuck_pgs;
-  utime_t now(ceph_clock_now());
-  utime_t cutoff = now - utime_t(g_conf->mon_pg_stuck_threshold, 0);
-  uint64_t num_inactive_pgs = 0;
-  
-  if (detail) {
-    
-    // we need to collect details of stuck pgs, first do a quick check
-    // whether this will yield any results
-    if (pg_map.get_stuck_counts(cutoff, note)) {
-      
-      // there are stuck pgs. gather details for specified statuses
-      // only if we know that there are pgs stuck in that status
-      
-      if (note.find("stuck inactive") != note.end()) {
-        pg_map.get_stuck_stats(PGMap::STUCK_INACTIVE, cutoff, stuck_pgs);
-        note["stuck inactive"] = stuck_pgs.size();
-        num_inactive_pgs += stuck_pgs.size();
-        note_stuck_detail(PGMap::STUCK_INACTIVE, stuck_pgs, detail);
-        stuck_pgs.clear();
-      }
-
-      if (note.find("stuck unclean") != note.end()) {
-        pg_map.get_stuck_stats(PGMap::STUCK_UNCLEAN, cutoff, stuck_pgs);
-        note["stuck unclean"] = stuck_pgs.size();
-        note_stuck_detail(PGMap::STUCK_UNCLEAN, stuck_pgs, detail);
-        stuck_pgs.clear();
-      }
-
-      if (note.find("stuck undersized") != note.end()) {
-        pg_map.get_stuck_stats(PGMap::STUCK_UNDERSIZED, cutoff, stuck_pgs);
-        note["stuck undersized"] = stuck_pgs.size();
-        note_stuck_detail(PGMap::STUCK_UNDERSIZED, stuck_pgs, detail);
-        stuck_pgs.clear();
-      }
-
-      if (note.find("stuck degraded") != note.end()) {
-        pg_map.get_stuck_stats(PGMap::STUCK_DEGRADED, cutoff, stuck_pgs);
-        note["stuck degraded"] = stuck_pgs.size();
-        note_stuck_detail(PGMap::STUCK_DEGRADED, stuck_pgs, detail);
-        stuck_pgs.clear();
-      }
-
-      if (note.find("stuck stale") != note.end()) {
-        pg_map.get_stuck_stats(PGMap::STUCK_STALE, cutoff, stuck_pgs);
-        note["stuck stale"] = stuck_pgs.size();
-        num_inactive_pgs += stuck_pgs.size();
-        note_stuck_detail(PGMap::STUCK_STALE, stuck_pgs, detail);
-      }
-    }
-  } else {
-    pg_map.get_stuck_counts(cutoff, note);
-    map<string,int>::const_iterator p = note.find("stuck inactive");
-    if (p != note.end()) 
-      num_inactive_pgs += p->second;
-    p = note.find("stuck stale");
-    if (p != note.end()) 
-      num_inactive_pgs += p->second;
-  }
-
-  if (g_conf->mon_pg_min_inactive > 0 && num_inactive_pgs >= g_conf->mon_pg_min_inactive) {
-    ostringstream ss;
-    ss << num_inactive_pgs << " pgs are stuck inactive for more than " << g_conf->mon_pg_stuck_threshold << " seconds";
-    summary.push_back(make_pair(HEALTH_ERR, ss.str()));
-  }
-
-  if (!note.empty()) {
-    for (map<string,int>::iterator p = note.begin(); p != note.end(); ++p) {
-      ostringstream ss;
-      ss << p->second << " pgs " << p->first;
-      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
-    }
-    if (detail) {
-      for (ceph::unordered_map<pg_t,pg_stat_t>::const_iterator p = pg_map.pg_stat.begin();
-           p != pg_map.pg_stat.end();
-           ++p) {
-       if ((p->second.state & (PG_STATE_STALE |
-                               PG_STATE_DOWN |
-                               PG_STATE_UNDERSIZED |
-                               PG_STATE_DEGRADED |
-                               PG_STATE_INCONSISTENT |
-                               PG_STATE_PEERING |
-                               PG_STATE_REPAIR |
-                               PG_STATE_RECOVERING |
-                               PG_STATE_RECOVERY_WAIT |
-                               PG_STATE_RECOVERY_TOOFULL |
-                               PG_STATE_INCOMPLETE |
-                               PG_STATE_BACKFILL_WAIT |
-                               PG_STATE_BACKFILL |
-                               PG_STATE_BACKFILL_TOOFULL)) &&
-           stuck_pgs.count(p->first) == 0) {
-         ostringstream ss;
-         ss << "pg " << p->first << " is " << pg_state_string(p->second.state);
-         ss << ", acting " << p->second.acting;
-         if (p->second.stats.sum.num_objects_unfound)
-           ss << ", " << p->second.stats.sum.num_objects_unfound << " unfound";
-         if (p->second.state & PG_STATE_INCOMPLETE) {
-           const pg_pool_t *pi = mon->osdmon()->osdmap.get_pg_pool(p->first.pool());
-           if (pi && pi->min_size > 1) {
-             ss << " (reducing pool " << mon->osdmon()->osdmap.get_pool_name(p->first.pool())
-                << " min_size from " << (int)pi->min_size << " may help; search ceph.com/docs for 'incomplete')";
-           }
-         }
-         detail->push_back(make_pair(HEALTH_WARN, ss.str()));
-       }
-      }
-    }
-  }
-
-  // slow requests
-  if (g_conf->mon_osd_max_op_age > 0 &&
-      pg_map.osd_sum.op_queue_age_hist.upper_bound() > g_conf->mon_osd_max_op_age) {
-    unsigned sum = _warn_slow_request_histogram(pg_map.osd_sum.op_queue_age_hist, "", summary, NULL);
-    if (sum > 0) {
-      ostringstream ss;
-      ss << sum << " requests are blocked > " << g_conf->mon_osd_max_op_age << " sec";
-      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
-
-      if (detail) {
-       unsigned num_slow_osds = 0;
-       // do per-osd warnings
-       for (ceph::unordered_map<int32_t,osd_stat_t>::const_iterator p = pg_map.osd_stat.begin();
-            p != pg_map.osd_stat.end();
-            ++p) {
-         if (_warn_slow_request_histogram(p->second.op_queue_age_hist,
-                                          string(" on osd.") + stringify(p->first),
-                                          summary, detail))
-           ++num_slow_osds;
-       }
-       ostringstream ss2;
-       ss2 << num_slow_osds << " osds have slow requests";
-       summary.push_back(make_pair(HEALTH_WARN, ss2.str()));
-       detail->push_back(make_pair(HEALTH_WARN, ss2.str()));
-      }
-    }
-  }
-
-  if (g_conf->mon_warn_osd_usage_min_max_delta) {
-    float max_osd_usage = 0.0, min_osd_usage = 1.0;
-    for (auto p = pg_map.osd_stat.begin(); p != pg_map.osd_stat.end(); ++p) {
-      // kb should never be 0, but avoid divide by zero in case of corruption
-      if (p->second.kb <= 0)
-        continue;
-      float usage = ((float)p->second.kb_used) / ((float)p->second.kb);
-      if (usage > max_osd_usage)
-        max_osd_usage = usage;
-      if (usage < min_osd_usage)
-        min_osd_usage = usage;
-    }
-    float diff = max_osd_usage - min_osd_usage;
-    if (diff > g_conf->mon_warn_osd_usage_min_max_delta) {
-      ostringstream ss;
-      ss << "difference between min (" << roundf(min_osd_usage*1000.0)/100.0
-        << "%) and max (" << roundf(max_osd_usage*1000.0)/100.0
-        << "%) osd usage " << roundf(diff*1000.0)/100.0 << "% > "
-        << roundf(g_conf->mon_warn_osd_usage_min_max_delta*1000.0)/100.0
-        << " (mon_warn_osd_usage_min_max_delta)";
-      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
-      if (detail)
-        detail->push_back(make_pair(HEALTH_WARN, ss.str()));
-    }
-  }
-
-  // recovery
-  list<string> sl;
-  pg_map.overall_recovery_summary(NULL, &sl);
-  for (list<string>::iterator p = sl.begin(); p != sl.end(); ++p) {
-    summary.push_back(make_pair(HEALTH_WARN, "recovery " + *p));
-    if (detail)
-      detail->push_back(make_pair(HEALTH_WARN, "recovery " + *p));
-  }
-
-  // full/nearfull
-  if (!mon->osdmon()->osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
+  // legacy pre-luminous full/nearfull
+  if (mon->osdmon()->osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
     check_full_osd_health(summary, detail, pg_map.full_osds, "full",
                           HEALTH_ERR);
     check_full_osd_health(summary, detail, pg_map.nearfull_osds, "near full",
                           HEALTH_WARN);
+    pg_map.get_health(cct, mon->osdmon()->osdmap, summary, detail);  // NOTE(review): presumably covers the checks removed from this function - confirm in PGMap
   }
-
-  // near-target max pools
-  auto& pools = mon->osdmon()->osdmap.get_pools();
-  for (auto p = pools.begin();
-       p != pools.end(); ++p) {
-    if ((!p->second.target_max_objects && !p->second.target_max_bytes) ||
-        !pg_map.pg_pool_sum.count(p->first))
-      continue;
-    bool nearfull = false;
-    const string& name = mon->osdmon()->osdmap.get_pool_name(p->first);
-    const pool_stat_t& st = pg_map.get_pg_pool_sum_stat(p->first);
-    uint64_t ratio = p->second.cache_target_full_ratio_micro +
-                     ((1000000 - p->second.cache_target_full_ratio_micro) *
-                      g_conf->mon_cache_target_full_warn_ratio);
-    if (p->second.target_max_objects && (uint64_t)(st.stats.sum.num_objects - st.stats.sum.num_objects_hit_set_archive) >
-        p->second.target_max_objects * (ratio / 1000000.0)) {
-      nearfull = true;
-      if (detail) {
-       ostringstream ss;
-       ss << "cache pool '" << name << "' with "
-          << si_t(st.stats.sum.num_objects)
-          << " objects at/near target max "
-          << si_t(p->second.target_max_objects) << " objects";
-       detail->push_back(make_pair(HEALTH_WARN, ss.str()));
-      }
-    }
-    if (p->second.target_max_bytes && (uint64_t)(st.stats.sum.num_bytes - st.stats.sum.num_bytes_hit_set_archive) >
-        p->second.target_max_bytes * (ratio / 1000000.0)) {
-      nearfull = true;
-      if (detail) {
-       ostringstream ss;
-       ss << "cache pool '" << name
-          << "' with " << si_t(st.stats.sum.num_bytes)
-          << "B at/near target max "
-          << si_t(p->second.target_max_bytes) << "B";
-       detail->push_back(make_pair(HEALTH_WARN, ss.str()));
-      }
-    }
-    if (nearfull) {
-      ostringstream ss;
-      ss << "'" << name << "' at/near target max";
-      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
-    }
-  }
-
-  // scrub
-  if (pg_map.pg_sum.stats.sum.num_scrub_errors) {
-    ostringstream ss;
-    ss << pg_map.pg_sum.stats.sum.num_scrub_errors << " scrub errors";
-    summary.push_back(make_pair(HEALTH_ERR, ss.str()));
-    if (detail) {
-      detail->push_back(make_pair(HEALTH_ERR, ss.str()));
-    }
-  }
-
-  // pg skew
-  int num_in = mon->osdmon()->osdmap.get_num_in_osds();
-  int sum_pg_up = MAX(pg_map.pg_sum.up, static_cast<int32_t>(pg_map.pg_stat.size()));
-  if (num_in && g_conf->mon_pg_warn_min_per_osd > 0) {
-    int per = sum_pg_up / num_in;
-    if (per < g_conf->mon_pg_warn_min_per_osd && per) {
-      ostringstream ss;
-      ss << "too few PGs per OSD (" << per << " < min " << g_conf->mon_pg_warn_min_per_osd << ")";
-      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
-      if (detail)
-       detail->push_back(make_pair(HEALTH_WARN, ss.str()));
-    }
-  }
-  if (num_in && g_conf->mon_pg_warn_max_per_osd > 0) {
-    int per = sum_pg_up / num_in;
-    if (per > g_conf->mon_pg_warn_max_per_osd) {
-      ostringstream ss;
-      ss << "too many PGs per OSD (" << per << " > max " << g_conf->mon_pg_warn_max_per_osd << ")";
-      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
-      if (detail)
-       detail->push_back(make_pair(HEALTH_WARN, ss.str()));
-    }
-  }
-  if (!pg_map.pg_stat.empty()) {
-    for (ceph::unordered_map<int,pool_stat_t>::const_iterator p = pg_map.pg_pool_sum.begin();
-         p != pg_map.pg_pool_sum.end();
-         ++p) {
-      const pg_pool_t *pi = mon->osdmon()->osdmap.get_pg_pool(p->first);
-      if (!pi)
-       continue;   // in case osdmap changes haven't propagated to PGMap yet
-      const string& name = mon->osdmon()->osdmap.get_pool_name(p->first);
-      if (pi->get_pg_num() > pi->get_pgp_num() &&
-         !(name.find(".DELETED") != string::npos &&
-           g_conf->mon_fake_pool_delete)) {
-       ostringstream ss;
-       ss << "pool " << name << " pg_num "
-          << pi->get_pg_num() << " > pgp_num " << pi->get_pgp_num();
-       summary.push_back(make_pair(HEALTH_WARN, ss.str()));
-       if (detail)
-         detail->push_back(make_pair(HEALTH_WARN, ss.str()));
-      }
-      int average_objects_per_pg = pg_map.pg_sum.stats.sum.num_objects / pg_map.pg_stat.size();
-      if (average_objects_per_pg > 0 &&
-          pg_map.pg_sum.stats.sum.num_objects >= g_conf->mon_pg_warn_min_objects &&
-          p->second.stats.sum.num_objects >= g_conf->mon_pg_warn_min_pool_objects) {
-       int objects_per_pg = p->second.stats.sum.num_objects / pi->get_pg_num();
-       float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
-       if (g_conf->mon_pg_warn_max_object_skew > 0 &&
-           ratio > g_conf->mon_pg_warn_max_object_skew) {
-         ostringstream ss;
-         ss << "pool " << name << " has many more objects per pg than average (too few pgs?)";
-         summary.push_back(make_pair(HEALTH_WARN, ss.str()));
-         if (detail) {
-           ostringstream ss;
-           ss << "pool " << name << " objects per pg ("
-              << objects_per_pg << ") is more than " << ratio << " times cluster average ("
-              << average_objects_per_pg << ")";
-           detail->push_back(make_pair(HEALTH_WARN, ss.str()));
-         }
-       }
-      }
-    }
-  }
-
-  print_unscrubbed_pgs(pg_map.pg_stat, summary, detail, cct);
-
 }
 
 void PGMonitor::check_full_osd_health(list<pair<health_status_t,string> >& summary,
                                       list<pair<health_status_t,string> > *detail,
-                                      const set<int>& s, const char *desc,
+                                      const mempool::pgmap::set<int>& s, const char *desc,
                                       health_status_t sev) const
 {
   if (!s.empty()) {
@@ -1648,6 +1135,10 @@ void PGMonitor::check_full_osd_health(list<pair<health_status_t,string> >& summa
 
 void PGMonitor::check_subs()
 {
+  if (mon->osdmon()->osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
+    return;
+  }
+
   dout(10) << __func__ << dendl;
   const string type = "osd_pg_creates";
 
@@ -1680,3 +1171,95 @@ bool PGMonitor::check_sub(Subscription *sub)
   }
   return true;
 }
+
+class PGMonStatService : public MonPGStatService, public PGMapStatService {  // implements the MonPGStatService overrides below on top of a PGMap plus a PGMonitor back-pointer
+  PGMonitor *pgmon;  // back-pointer to the monitor service; never deleted by this class
+public:
+  PGMonStatService(const PGMap& o, PGMonitor *pgm)
+    : MonPGStatService(), PGMapStatService(o), pgmon(pgm) {}  // NOTE(review): `pgmap` used below is not declared here - presumably the PGMapStatService member bound to `o`
+       
+
+  bool is_readable() const override { return pgmon->is_readable(); }  // delegates to the owning PGMonitor
+
+  unsigned maybe_add_creating_pgs(epoch_t scan_epoch,
+     const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
+     creating_pgs_t *pending_creates) const override
+  {  // returns how many entries were newly inserted into pending_creates->pgs
+    if (pgmap.last_pg_scan < scan_epoch) {  // pgmap's last_pg_scan is older than scan_epoch: add nothing
+      return 0;
+    }
+    unsigned added = 0;
+    for (auto& pgid : pgmap.creating_pgs) {
+      if (!pools.count(pgid.pool())) {  // skip pgs whose pool is absent from `pools`
+       continue;
+      }
+      auto st = pgmap.pg_stat.find(pgid);
+      assert(st != pgmap.pg_stat.end());  // every creating pg is expected to have a pg_stat entry
+      auto created = make_pair(st->second.created,
+                              st->second.last_scrub_stamp);
+      // emplace does not overwrite: only pgs not already in pending_creates->pgs are counted
+      if (pending_creates->pgs.emplace(pgid, created).second) {
+       added++;
+      }
+    }
+    return added;
+  }
+  void maybe_trim_creating_pgs(creating_pgs_t *creates) const override {  // drops entries whose pg_stat state no longer has PG_STATE_CREATING set
+    auto p = creates->pgs.begin();
+    while (p != creates->pgs.end()) {
+      auto q = pgmap.pg_stat.find(p->first);
+      if (q != pgmap.pg_stat.end() &&
+         !(q->second.state & PG_STATE_CREATING)) {
+       p = creates->pgs.erase(p);  // erase returns the next iterator, so the loop must not also advance p
+       creates->created_pools.insert(q->first.pool());  // q points into pgmap.pg_stat, so it is unaffected by the erase above
+      } else {
+       ++p;  // pg has no stat entry or is still creating: keep it
+      }
+    }
+  }
+  void dump_info(Formatter *f) const override {  // emits the pgmap object plus pgmon's first/last committed versions
+    f->dump_object("pgmap", pgmap);
+    f->dump_unsigned("pgmap_first_committed", pgmon->get_first_committed());
+    f->dump_unsigned("pgmap_last_committed", pgmon->get_last_committed());
+  }
+  int process_pg_command(const string& prefix,
+                        const map<string,cmd_vartype>& cmdmap,
+                        const OSDMap& osdmap,
+                        Formatter *f,
+                        stringstream *ss,
+                        bufferlist *odata) const override {
+    return process_pg_map_command(prefix, cmdmap, pgmap, osdmap, f, ss, odata);  // pure forwarder: supplies this service's pgmap
+  }
+
+  int reweight_by_utilization(const OSDMap &osd_map,
+                             int oload,
+                             double max_changef,
+                             int max_osds,
+                             bool by_pg, const set<int64_t> *pools,
+                             bool no_increasing,
+                             mempool::osdmap::map<int32_t, uint32_t>* new_weights,
+                             std::stringstream *ss,
+                             std::string *out_str,
+                             Formatter *f) const override {
+    return reweight::by_utilization(osd_map, pgmap, oload, max_changef,  // pure forwarder: inserts pgmap as the second argument
+                                   max_osds, by_pg, pools, no_increasing,
+                                   new_weights, ss, out_str, f);
+  }
+};
+
+MonPGStatService *PGMonitor::get_pg_stat_service()  // returns the stat-service view of pg_map, creating it on first use
+{
+  if (!pgservice) {  // not created yet, or reset by update_from_paxos() after the pgmap was deleted
+    pgservice.reset(new PGMonStatService(pg_map, this));
+  }
+  return pgservice.get();  // pgservice keeps ownership; NOTE(review): a later pgservice.reset() would invalidate this pointer - confirm callers do not cache it
+}
+
+PGMonitor::PGMonitor(Monitor *mn, Paxos *p, const string& service_name)  // forwards its arguments to PaxosService and fixes the three store key prefixes
+  : PaxosService(mn, p, service_name),
+    pgmap_meta_prefix("pgmap_meta"),  // prefix encode_pending() uses for "version" and the other metadata keys
+    pgmap_pg_prefix("pgmap_pg"),  // NOTE(review): encode_pending()'s delete path erases per-pg keys under pgmap_meta_prefix, not this one - confirm intended
+    pgmap_osd_prefix("pgmap_osd")  // NOTE(review): same question for the per-osd keys erased in encode_pending()
+{}
+
+PGMonitor::~PGMonitor() = default;  // defaulted out of line; NOTE(review): presumably so PGMonStatService is a complete type where pgservice is destroyed - confirm against the header