git.proxmox.com Git - ceph.git/blobdiff - ceph/src/mon/PGMap.cc
update sources to v12.2.0
[ceph.git] / ceph / src / mon / PGMap.cc
index c0277b7a518a235ce18162afcb856e59b934a0cd..b1aa8a5e3b22863f0cce5352af98668bae322b89 100644 (file)
@@ -1,6 +1,8 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
 
+#include <boost/algorithm/string.hpp>
+
 #include "PGMap.h"
 
 #define dout_subsys ceph_subsys_mon
@@ -394,7 +396,7 @@ void PGMapDigest::recovery_summary(Formatter *f, list<string> *psl,
     } else {
       ostringstream ss;
       ss << delta_sum.stats.sum.num_objects_unfound
-         << "/" << delta_sum.stats.sum.num_objects << " unfound (" << b << "%)";
+         << "/" << delta_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
       psl->push_back(ss.str());
     }
   }
@@ -612,6 +614,68 @@ void PGMapDigest::pool_cache_io_rate_summary(Formatter *f, ostream *out,
   cache_io_rate_summary(f, out, p->second.first, ts->second);
 }
 
+/**
+ * Ratio of raw space consumed to logical data stored for one pool.
+ *
+ * Replicated pools yield pool->get_size().  Erasure-coded pools yield
+ * (k + m) / k, using the "k" and "m" entries of the pool's erasure code
+ * profile, or 0.0 when either entry is missing from the profile; callers
+ * that divide by the result must therefore check for zero.
+ *
+ * @param osd_map map used to look up the pool and its erasure code profile
+ * @param poolid  id of the pool; it must exist in osd_map
+ * @return the raw/logical ratio, or 0.0 if it cannot be determined
+ */
+static float pool_raw_used_rate(const OSDMap &osd_map, int64_t poolid)
+{
+  const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
+  // get_pg_pool() can return null for an unknown pool; fail with a clear
+  // assertion instead of dereferencing a null pointer below.
+  assert(pool != nullptr);
+
+  switch (pool->get_type()) {
+  case pg_pool_t::TYPE_REPLICATED:
+    return pool->get_size();
+  case pg_pool_t::TYPE_ERASURE:
+  {
+    auto& ecp =
+      osd_map.get_erasure_code_profile(pool->erasure_code_profile);
+    auto pm = ecp.find("m");
+    auto pk = ecp.find("k");
+    if (pm == ecp.end() || pk == ecp.end()) {
+      // Profile does not describe k/m: the rate is unknown.
+      return 0.0;
+    }
+    int k = atoi(pk->second.c_str());
+    int m = atoi(pm->second.c_str());
+    int mk = m + k;
+    assert(mk != 0);
+    assert(k != 0);
+    return (float)mk / k;
+  }
+  default:
+    assert(0 == "unrecognized pool type");
+  }
+  // Only reachable if the assert above is compiled out; every path of a
+  // value-returning function must still return something.
+  return 0.0;
+}
+
+// Build a ceph_statfs for either a single pool or the whole cluster.
+//
+// When data_pool is set and has an entry in pg_pool_sum, usage is taken
+// from that pool's object_stat_sum_t and the available space from
+// get_pool_free_space(); both are shifted right by 10 to fill the kb_*
+// fields.  Otherwise (no pool given, or no stats for it) the cluster-wide
+// osd_sum / pg_sum totals are reported.
+ceph_statfs PGMapDigest::get_statfs(OSDMap &osdmap,
+                                   boost::optional<int64_t> data_pool) const
+{
+  ceph_statfs statfs;
+
+  // Locate the per-pool totals, if a pool was requested and is known.
+  auto pool_it = pg_pool_sum.end();
+  if (data_pool) {
+    pool_it = pg_pool_sum.find(*data_pool);
+  }
+
+  if (pool_it == pg_pool_sum.end()) {
+    // Cluster-wide view; these are in KB.
+    statfs.kb = osd_sum.kb;
+    statfs.kb_used = osd_sum.kb_used;
+    statfs.kb_avail = osd_sum.kb_avail;
+    statfs.num_objects = pg_sum.stats.sum.num_objects;
+    return statfs;
+  }
+
+  // Per-pool view.
+  const object_stat_sum_t &pool_sum = pool_it->second.stats.sum;
+  statfs.kb_used = (pool_sum.num_bytes >> 10);
+  statfs.kb_avail = get_pool_free_space(osdmap, *data_pool) >> 10;
+  statfs.num_objects = pool_sum.num_objects;
+  statfs.kb = statfs.kb_used + statfs.kb_avail;
+  return statfs;
+}
+
 void PGMapDigest::dump_pool_stats_full(
   const OSDMap &osd_map,
   stringstream *ss,
@@ -666,32 +730,8 @@ void PGMapDigest::dump_pool_stats_full(
     } else {
       avail = avail_by_rule[ruleno];
     }
-    switch (pool->get_type()) {
-    case pg_pool_t::TYPE_REPLICATED:
-      avail /= pool->get_size();
-      raw_used_rate = pool->get_size();
-      break;
-    case pg_pool_t::TYPE_ERASURE:
-    {
-      auto& ecp =
-        osd_map.get_erasure_code_profile(pool->erasure_code_profile);
-      auto pm = ecp.find("m");
-      auto pk = ecp.find("k");
-      if (pm != ecp.end() && pk != ecp.end()) {
-       int k = atoi(pk->second.c_str());
-       int m = atoi(pm->second.c_str());
-       int mk = m + k;
-       assert(mk != 0);
-       avail = avail * k / mk;
-       raw_used_rate = (float)mk / k;
-      } else {
-       raw_used_rate = 0.0;
-      }
-    }
-    break;
-    default:
-      assert(0 == "unrecognized pool type");
-    }
+
+    raw_used_rate = ::pool_raw_used_rate(osd_map, pool_id);
 
     if (f) {
       f->open_object_section("pool");
@@ -821,6 +861,21 @@ void PGMapDigest::dump_object_stat_sum(
   }
 }
 
+/**
+ * Estimate the space still usable by a pool.
+ *
+ * Takes get_rule_avail() for the pool's crush rule (negative results are
+ * clamped to 0) and divides it by the pool's raw-used rate, so the result
+ * is expressed in logical rather than raw terms.
+ *
+ * @param osd_map map used to look up the pool and its crush rule
+ * @param poolid  id of the pool
+ * @return the estimated free space; 0 if the pool is not present in
+ *         osd_map or its raw-used rate cannot be determined
+ */
+int64_t PGMapDigest::get_pool_free_space(const OSDMap &osd_map,
+                                        int64_t poolid) const
+{
+  const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
+  if (!pool) {
+    // The pool may be missing from this OSDMap; report no free space
+    // instead of dereferencing a null pointer.
+    return 0;
+  }
+  int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
+                                       pool->get_type(),
+                                       pool->get_size());
+  int64_t avail = get_rule_avail(ruleno);
+  if (avail < 0)
+    avail = 0;
+
+  const float rate = ::pool_raw_used_rate(osd_map, poolid);
+  if (rate <= 0) {
+    // pool_raw_used_rate() returns 0.0 for an erasure-coded pool whose
+    // profile lacks k or m.  Dividing by it would give infinity, and
+    // converting that to int64_t is undefined.
+    return 0;
+  }
+
+  // Divide in double: float cannot represent large int64_t values exactly.
+  return static_cast<int64_t>(static_cast<double>(avail) / rate);
+}
+
 int64_t PGMap::get_rule_avail(const OSDMap& osdmap, int ruleno) const
 {
   map<int,float> wm;
@@ -2547,7 +2602,695 @@ namespace {
       ss << pgs_count << " unscrubbed pgs";
       summary.push_back(make_pair(HEALTH_WARN, ss.str()));
     }
+  }
+}
+
+/**
+ * Populate @p checks with the health checks derived from the PG map.
+ *
+ * Checks that may be added: PG_AVAILABILITY, PG_DEGRADED, PG_DEGRADED_FULL,
+ * PG_DAMAGED, OSD_SCRUB_ERRORS, CACHE_POOL_NEAR_FULL, TOO_FEW_PGS,
+ * TOO_MANY_PGS, SMALLER_PGP_NUM, MANY_OBJECTS_PER_PG, POOL_FULL,
+ * POOL_NEAR_FULL, OBJECT_MISPLACED, OBJECT_UNFOUND, REQUEST_SLOW,
+ * REQUEST_STUCK, PG_NOT_SCRUBBED, PG_NOT_DEEP_SCRUBBED and
+ * POOL_APP_NOT_ENABLED.
+ *
+ * @param cct    context supplying the configuration thresholds read below
+ * @param osdmap map used for pool definitions, pool names and the number
+ *               of "in" OSDs
+ * @param checks output map; one entry is added per failing check
+ */
+void PGMap::get_health_checks(
+  CephContext *cct,
+  const OSDMap& osdmap,
+  health_check_map_t *checks) const
+{
+  utime_t now = ceph_clock_now();
+  // Cap on the number of per-item detail lines recorded for a check
+  const unsigned max = cct->_conf->mon_health_max_detail;
+  const auto& pools = osdmap.get_pools();
+
+  typedef enum pg_consequence_t {
+    UNAVAILABLE = 1,   // Client IO to the pool may block
+    DEGRADED = 2,      // Fewer than the requested number of replicas are present
+    DEGRADED_FULL = 3, // Fewer than the requested number of replicas may be
+                       //  present and insufficient resources are present to
+                       //  fix this
+    DAMAGED = 4        // The data may be missing or inconsistent on disk and
+                       //  requires repair
+  } pg_consequence_t;
+
+  // For a given PG state, how should it be reported at the pool level?
+  class PgStateResponse {
+    public:
+    pg_consequence_t consequence;
+    // Returns the time since which the PG has been in the condition; an
+    // empty callback means the state is reported immediately.
+    typedef std::function< utime_t(const pg_stat_t&) > stuck_cb;
+    stuck_cb stuck_since;
+    // When true the condition is the *absence* of the state flag
+    bool invert;
+
+    PgStateResponse(const pg_consequence_t &c, stuck_cb s)
+      : consequence(c), stuck_since(s), invert(false)
+    {
+    }
+
+    PgStateResponse(const pg_consequence_t &c, stuck_cb s, bool i)
+      : consequence(c), stuck_since(s), invert(i)
+    {
+    }
+  };
+
+  // Record the PG state counts that contributed to a reported pool state
+  class PgCauses {
+    public:
+    // Map of PG_STATE_* to number of pgs in that state.
+    std::map<unsigned, unsigned> states;
+
+    // List of all PG IDs that had a state contributing
+    // to this health condition.
+    std::set<pg_t> pgs;
+
+    // Detail line for each contributing PG
+    std::map<pg_t, std::string> pg_messages;
+  };
+
+  // Map of PG state to how to respond to it
+  std::map<unsigned, PgStateResponse> state_to_response = {
+    // Immediate reports
+    { PG_STATE_INCONSISTENT,     {DAMAGED,     {}} },
+    { PG_STATE_INCOMPLETE,       {UNAVAILABLE, {}} },
+    { PG_STATE_REPAIR,           {DAMAGED,     {}} },
+    { PG_STATE_SNAPTRIM_ERROR,   {DAMAGED,     {}} },
+    { PG_STATE_BACKFILL_TOOFULL, {DEGRADED_FULL, {}} },
+    { PG_STATE_RECOVERY_TOOFULL, {DEGRADED_FULL, {}} },
+    { PG_STATE_DEGRADED,         {DEGRADED,    {}} },
+    { PG_STATE_DOWN,             {UNAVAILABLE, {}} },
+    // Delayed (wait until stuck) reports
+    { PG_STATE_PEERING,          {UNAVAILABLE, [](const pg_stat_t &p){return p.last_peered;}    } },
+    { PG_STATE_UNDERSIZED,       {DEGRADED,    [](const pg_stat_t &p){return p.last_fullsized;} } },
+    { PG_STATE_STALE,            {UNAVAILABLE, [](const pg_stat_t &p){return p.last_unstale;}   } },
+    // Delayed and inverted reports
+    { PG_STATE_ACTIVE,           {UNAVAILABLE, [](const pg_stat_t &p){return p.last_active;}, true} },
+    { PG_STATE_CLEAN,            {DEGRADED,    [](const pg_stat_t &p){return p.last_clean;}, true} }
+  };
+
+  // Specialized state printer that takes account of inversion of
+  // ACTIVE, CLEAN checks.
+  auto state_name = [](const uint32_t &state) {
+    // Special cases for the states that are inverted checks
+    if (state == PG_STATE_CLEAN) {
+      return std::string("unclean");
+    } else if (state == PG_STATE_ACTIVE) {
+      return std::string("inactive");
+    } else {
+      return pg_state_string(state);
+    }
+  };
+
+  // Map of what is wrong to information about why, implicitly also stores
+  // the list of what is wrong.
+  std::map<pg_consequence_t, PgCauses> detected;
+
+  // Optimisation: work out from the summary counters whether any check can
+  // possibly fire, so the per-PG loop below can be skipped entirely.
+  std::map<unsigned, PgStateResponse> possible_responses;
+  for (const auto &i : num_pg_by_state) {
+    for (const auto &j : state_to_response) {
+      if (!j.second.invert) {
+        // Check for normal tests by seeing if any pgs have the flag
+        if (i.first & j.first) {
+          possible_responses.insert(j);
+        }
+      }
+    }
+  }
+
+  for (const auto &j : state_to_response) {
+    if (j.second.invert) {
+      // Check for inverted tests by seeing if not-all pgs have the flag
+      const auto &found = num_pg_by_state.find(j.first);
+      if (found == num_pg_by_state.end() || found->second != num_pg) {
+        possible_responses.insert(j);
+      }
+    }
+  }
+
+  utime_t cutoff = now - utime_t(cct->_conf->mon_pg_stuck_threshold, 0);
+  // Loop over all PGs, if there are any possibly-unhealthy states in there
+  if (!possible_responses.empty()) {
+    for (const auto& i : pg_stat) {
+      const auto &pg_id = i.first;
+      const auto &pg_info = i.second;
+
+      for (const auto &j : state_to_response) {
+        const auto &pg_response_state = j.first;
+        const auto &pg_response = j.second;
+
+        // Apply the state test
+        if (!(bool(pg_info.state & pg_response_state) != pg_response.invert)) {
+          continue;
+        }
+
+        // Apply stuckness test if needed: delayed responses only count
+        // once the timestamp returned by the callback is before the cutoff.
+        utime_t since;
+        if (pg_response.stuck_since) {
+          since = pg_response.stuck_since(pg_info);
+          if (since >= cutoff) {
+            // Not stuck enough, ignore.
+            continue;
+          }
+        }
+
+        auto &causes = detected[pg_response.consequence];
+        causes.states[pg_response_state]++;
+        causes.pgs.insert(pg_id);
+
+        // Don't bother composing detail string if we have already recorded
+        // too many
+        if (causes.pg_messages.size() > max) {
+          continue;
+        }
+
+        std::ostringstream ss;
+        if (pg_response.stuck_since) {
+          ss << "pg " << pg_id << " is stuck " << state_name(pg_response_state);
+          if (since == utime_t()) {
+            ss << " since forever";
+          } else {
+            utime_t dur = now - since;
+            ss << " for " << dur;
+          }
+          ss << ", current state " << pg_state_string(pg_info.state)
+             << ", last acting " << pg_info.acting;
+        } else {
+          ss << "pg " << pg_id << " is "
+             << pg_state_string(pg_info.state);
+          ss << ", acting " << pg_info.acting;
+          if (pg_info.stats.sum.num_objects_unfound) {
+            ss << ", " << pg_info.stats.sum.num_objects_unfound
+               << " unfound";
+          }
+        }
+
+        if (pg_info.state & PG_STATE_INCOMPLETE) {
+          const pg_pool_t *pi = osdmap.get_pg_pool(pg_id.pool());
+          if (pi && pi->min_size > 1) {
+            ss << " (reducing pool "
+               << osdmap.get_pool_name(pg_id.pool())
+               << " min_size from " << (int)pi->min_size
+               << " may help; search ceph.com/docs for 'incomplete')";
+          }
+        }
+
+        causes.pg_messages[pg_id] = ss.str();
+      }
+    }
+  } else {
+    dout(10) << __func__ << " skipping loop over PGs: counters look OK" << dendl;
+  }
+
+  // Turn each detected consequence into one health check
+  for (const auto &i : detected) {
+    std::string health_code;
+    health_status_t sev = HEALTH_WARN;  // every case below assigns it
+    std::string summary;
+    switch(i.first) {
+      case UNAVAILABLE:
+        health_code = "PG_AVAILABILITY";
+        sev = HEALTH_WARN;
+        summary = "Reduced data availability: ";
+        break;
+      case DEGRADED:
+        health_code = "PG_DEGRADED";
+        summary = "Degraded data redundancy: ";
+        sev = HEALTH_WARN;
+        break;
+      case DEGRADED_FULL:
+        health_code = "PG_DEGRADED_FULL";
+        summary = "Degraded data redundancy (low space): ";
+        sev = HEALTH_ERR;
+        break;
+      case DAMAGED:
+        health_code = "PG_DAMAGED";
+        summary = "Possible data damage: ";
+        sev = HEALTH_ERR;
+        break;
+      default:
+        assert(false);
+    }
+
+    if (i.first == DEGRADED) {
+      if (pg_sum.stats.sum.num_objects_degraded &&
+          pg_sum.stats.sum.num_object_copies > 0) {
+        double pc = (double)pg_sum.stats.sum.num_objects_degraded /
+          (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
+        char b[20];
+        snprintf(b, sizeof(b), "%.3lf", pc);
+        ostringstream ss;
+        ss << pg_sum.stats.sum.num_objects_degraded
+           << "/" << pg_sum.stats.sum.num_object_copies << " objects degraded ("
+           << b << "%)";
+
+        // Throw in a comma for the benefit of the following PG counts
+        summary += ss.str() + ", ";
+      }
+    }
+
+    // Compose summary message saying how many PGs in what states led
+    // to this health check failing
+    std::vector<std::string> pg_msgs;
+    for (const auto &j : i.second.states) {
+      std::ostringstream msg;
+      msg << j.second << (j.second > 1 ? " pgs " : " pg ") << state_name(j.first);
+      pg_msgs.push_back(msg.str());
+    }
+    summary += joinify(pg_msgs.begin(), pg_msgs.end(), std::string(", "));
+
+    health_check_t *check = &checks->add(
+        health_code,
+        sev,
+        summary);
+
+    // Compose list of PGs contributing to this health check failing
+    for (const auto &j : i.second.pg_messages) {
+      check->detail.push_back(j.second);
+    }
+  }
+
+  // OSD_SCRUB_ERRORS
+  if (pg_sum.stats.sum.num_scrub_errors) {
+    ostringstream ss;
+    ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors";
+    checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str());
+  }
+
+  // CACHE_POOL_NEAR_FULL
+  {
+    list<string> detail;
+    unsigned num_pools = 0;
+    for (auto& p : pools) {
+      // Only pools with a target size and known stats are considered
+      if ((!p.second.target_max_objects && !p.second.target_max_bytes) ||
+          !pg_pool_sum.count(p.first)) {
+        continue;
+      }
+      bool nearfull = false;
+      const string& name = osdmap.get_pool_name(p.first);
+      const pool_stat_t& st = get_pg_pool_sum_stat(p.first);
+      // Warning point, in millionths of the target size
+      uint64_t ratio = p.second.cache_target_full_ratio_micro +
+        ((1000000 - p.second.cache_target_full_ratio_micro) *
+         cct->_conf->mon_cache_target_full_warn_ratio);
+      if (p.second.target_max_objects &&
+          (uint64_t)(st.stats.sum.num_objects -
+                     st.stats.sum.num_objects_hit_set_archive) >
+          p.second.target_max_objects * (ratio / 1000000.0)) {
+        ostringstream ss;
+        ss << "cache pool '" << name << "' with "
+           << si_t(st.stats.sum.num_objects)
+           << " objects at/near target max "
+           << si_t(p.second.target_max_objects) << " objects";
+        detail.push_back(ss.str());
+        nearfull = true;
+      }
+      if (p.second.target_max_bytes &&
+          (uint64_t)(st.stats.sum.num_bytes -
+                     st.stats.sum.num_bytes_hit_set_archive) >
+          p.second.target_max_bytes * (ratio / 1000000.0)) {
+        ostringstream ss;
+        ss << "cache pool '" << name
+           << "' with " << si_t(st.stats.sum.num_bytes)
+           << "B at/near target max "
+           << si_t(p.second.target_max_bytes) << "B";
+        detail.push_back(ss.str());
+        nearfull = true;
+      }
+      if (nearfull) {
+        ++num_pools;
+      }
+    }
+    if (!detail.empty()) {
+      ostringstream ss;
+      ss << num_pools << " cache pools at or near target size";
+      auto& d = checks->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN, ss.str());
+      d.detail.swap(detail);
+    }
+  }
+
+  // TOO_FEW_PGS
+  int num_in = osdmap.get_num_in_osds();
+  int sum_pg_up = MAX(pg_sum.up, static_cast<int32_t>(pg_stat.size()));
+  if (num_in &&
+      cct->_conf->mon_pg_warn_min_per_osd > 0 &&
+      osdmap.get_pools().size() > 0) {
+    int per = sum_pg_up / num_in;
+    if (per < cct->_conf->mon_pg_warn_min_per_osd && per) {
+      ostringstream ss;
+      ss << "too few PGs per OSD (" << per
+         << " < min " << cct->_conf->mon_pg_warn_min_per_osd << ")";
+      checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str());
+    }
+  }
+
+  // TOO_MANY_PGS
+  if (num_in && cct->_conf->mon_pg_warn_max_per_osd > 0) {
+    int per = sum_pg_up / num_in;
+    if (per > cct->_conf->mon_pg_warn_max_per_osd) {
+      ostringstream ss;
+      ss << "too many PGs per OSD (" << per
+         << " > max " << cct->_conf->mon_pg_warn_max_per_osd << ")";
+      checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str());
+    }
+  }
+
+  // SMALLER_PGP_NUM
+  // MANY_OBJECTS_PER_PG
+  if (!pg_stat.empty()) {
+    list<string> pgp_detail, many_detail;
+    // Cluster-wide average; it does not depend on the pool being examined,
+    // so compute it once (pg_stat is known to be non-empty here).
+    const int average_objects_per_pg =
+      pg_sum.stats.sum.num_objects / pg_stat.size();
+    for (auto p = pg_pool_sum.begin();
+         p != pg_pool_sum.end();
+         ++p) {
+      const pg_pool_t *pi = osdmap.get_pg_pool(p->first);
+      if (!pi)
+        continue;   // in case osdmap changes haven't propagated to PGMap yet
+      const string& name = osdmap.get_pool_name(p->first);
+      if (pi->get_pg_num() > pi->get_pgp_num() &&
+          !(name.find(".DELETED") != string::npos &&
+            cct->_conf->mon_fake_pool_delete)) {
+        ostringstream ss;
+        ss << "pool " << name << " pg_num "
+           << pi->get_pg_num() << " > pgp_num " << pi->get_pgp_num();
+        pgp_detail.push_back(ss.str());
+      }
+      if (average_objects_per_pg > 0 &&
+          pg_sum.stats.sum.num_objects >= cct->_conf->mon_pg_warn_min_objects &&
+          p->second.stats.sum.num_objects >=
+          cct->_conf->mon_pg_warn_min_pool_objects) {
+        int objects_per_pg = p->second.stats.sum.num_objects / pi->get_pg_num();
+        float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
+        if (cct->_conf->mon_pg_warn_max_object_skew > 0 &&
+            ratio > cct->_conf->mon_pg_warn_max_object_skew) {
+          ostringstream ss;
+          ss << "pool " << name << " objects per pg ("
+             << objects_per_pg << ") is more than " << ratio
+             << " times cluster average ("
+             << average_objects_per_pg << ")";
+          many_detail.push_back(ss.str());
+        }
+      }
+    }
+    if (!pgp_detail.empty()) {
+      ostringstream ss;
+      ss << pgp_detail.size() << " pools have pg_num > pgp_num";
+      auto& d = checks->add("SMALLER_PGP_NUM", HEALTH_WARN, ss.str());
+      d.detail.swap(pgp_detail);
+    }
+    if (!many_detail.empty()) {
+      ostringstream ss;
+      ss << many_detail.size() << " pools have many more objects per pg than"
+         << " average";
+      auto& d = checks->add("MANY_OBJECTS_PER_PG", HEALTH_WARN, ss.str());
+      d.detail.swap(many_detail);
+    }
+  }
+
+  // POOL_FULL
+  // POOL_NEAR_FULL
+  {
+    // Thresholds are configured as percentages of the pool quota
+    float warn_threshold =
+      (float)cct->_conf->mon_pool_quota_warn_threshold/100;
+    float crit_threshold =
+      (float)cct->_conf->mon_pool_quota_crit_threshold/100;
+    list<string> full_detail, nearfull_detail;
+    unsigned full_pools = 0, nearfull_pools = 0;
+    // Iterate by reference: copying each pg_pool_t is unnecessary
+    for (const auto& it : pools) {
+      auto it2 = pg_pool_sum.find(it.first);
+      if (it2 == pg_pool_sum.end()) {
+        continue;
+      }
+      const pool_stat_t *pstat = &it2->second;
+      const object_stat_sum_t& sum = pstat->stats.sum;
+      const string& pool_name = osdmap.get_pool_name(it.first);
+      const pg_pool_t &pool = it.second;
+      bool full = false, nearfull = false;
+      if (pool.quota_max_objects > 0) {
+        stringstream ss;
+        if ((uint64_t)sum.num_objects >= pool.quota_max_objects) {
+          // At or over the quota: nothing is reported by this check
+        } else if (crit_threshold > 0 &&
+                   sum.num_objects >= pool.quota_max_objects*crit_threshold) {
+          ss << "pool '" << pool_name
+             << "' has " << sum.num_objects << " objects"
+             << " (max " << pool.quota_max_objects << ")";
+          full_detail.push_back(ss.str());
+          full = true;
+        } else if (warn_threshold > 0 &&
+                   sum.num_objects >= pool.quota_max_objects*warn_threshold) {
+          ss << "pool '" << pool_name
+             << "' has " << sum.num_objects << " objects"
+             << " (max " << pool.quota_max_objects << ")";
+          nearfull_detail.push_back(ss.str());
+          nearfull = true;
+        }
+      }
+      if (pool.quota_max_bytes > 0) {
+        stringstream ss;
+        if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
+          // At or over the quota: nothing is reported by this check
+        } else if (crit_threshold > 0 &&
+                   sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
+          ss << "pool '" << pool_name
+             << "' has " << si_t(sum.num_bytes) << " bytes"
+             << " (max " << si_t(pool.quota_max_bytes) << ")";
+          full_detail.push_back(ss.str());
+          full = true;
+        } else if (warn_threshold > 0 &&
+                   sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
+          ss << "pool '" << pool_name
+             << "' has " << si_t(sum.num_bytes) << " bytes"
+             << " (max " << si_t(pool.quota_max_bytes) << ")";
+          nearfull_detail.push_back(ss.str());
+          nearfull = true;
+        }
+      }
+      if (full) {
+        ++full_pools;
+      }
+      if (nearfull) {
+        ++nearfull_pools;
+      }
+    }
+    if (full_pools) {
+      ostringstream ss;
+      ss << full_pools << " pools full";
+      auto& d = checks->add("POOL_FULL", HEALTH_ERR, ss.str());
+      d.detail.swap(full_detail);
+    }
+    if (nearfull_pools) {
+      ostringstream ss;
+      // Distinct wording from POOL_FULL so the two summaries can be told apart
+      ss << nearfull_pools << " pools nearfull";
+      auto& d = checks->add("POOL_NEAR_FULL", HEALTH_WARN, ss.str());
+      d.detail.swap(nearfull_detail);
+    }
+  }
 
+  // OBJECT_MISPLACED
+  if (pg_sum.stats.sum.num_objects_misplaced &&
+      pg_sum.stats.sum.num_object_copies > 0) {
+    double pc = (double)pg_sum.stats.sum.num_objects_misplaced /
+      (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
+    char b[20];
+    snprintf(b, sizeof(b), "%.3lf", pc);
+    ostringstream ss;
+    ss << pg_sum.stats.sum.num_objects_misplaced
+       << "/" << pg_sum.stats.sum.num_object_copies << " objects misplaced ("
+       << b << "%)";
+    checks->add("OBJECT_MISPLACED", HEALTH_WARN, ss.str());
+  }
+
+  // OBJECT_UNFOUND
+  if (pg_sum.stats.sum.num_objects_unfound &&
+      pg_sum.stats.sum.num_objects) {
+    double pc = (double)pg_sum.stats.sum.num_objects_unfound /
+      (double)pg_sum.stats.sum.num_objects * (double)100.0;
+    char b[20];
+    snprintf(b, sizeof(b), "%.3lf", pc);
+    ostringstream ss;
+    ss << pg_sum.stats.sum.num_objects_unfound
+       << "/" << pg_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
+    auto& d = checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str());
+
+    for (auto& p : pg_stat) {
+      if (p.second.stats.sum.num_objects_unfound) {
+        ostringstream pg_ss;
+        pg_ss << "pg " << p.first
+              << " has " << p.second.stats.sum.num_objects_unfound
+              << " unfound objects";
+        d.detail.push_back(pg_ss.str());
+        if (d.detail.size() > max) {
+          d.detail.push_back("(additional pgs left out for brevity)");
+          break;
+        }
+      }
+    }
+  }
+
+  // REQUEST_SLOW
+  // REQUEST_STUCK
+  if (cct->_conf->mon_osd_warn_op_age > 0 &&
+      !osd_sum.op_queue_age_hist.h.empty() &&
+      osd_sum.op_queue_age_hist.upper_bound() / 1000.0 >
+      cct->_conf->mon_osd_warn_op_age) {
+    list<string> warn_detail, error_detail;
+    unsigned warn = 0, error = 0;
+    float err_age =
+      cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio;
+    // Walk the cluster-wide histogram from the highest bucket down; the
+    // condition above guarantees it is non-empty.
+    const pow2_hist_t& h = osd_sum.op_queue_age_hist;
+    for (unsigned i = h.h.size() - 1; i > 0; --i) {
+      // Bound for bucket i, reported in seconds in the messages below
+      float ub = (float)(1 << i) / 1000.0;
+      if (ub < cct->_conf->mon_osd_warn_op_age)
+        break;
+      if (h.h[i]) {
+        ostringstream ss;
+        ss << h.h[i] << " ops are blocked > " << ub << " sec";
+        if (ub > err_age) {
+          error += h.h[i];
+          error_detail.push_back(ss.str());
+        } else {
+          warn += h.h[i];
+          warn_detail.push_back(ss.str());
+        }
+      }
+    }
+
+    map<float,set<int>> warn_osd_by_max; // max -> osds
+    map<float,set<int>> error_osd_by_max; // max -> osds
+    if (!warn_detail.empty() || !error_detail.empty()) {
+      for (auto& p : osd_stat) {
+        const pow2_hist_t& osd_hist = p.second.op_queue_age_hist;
+        if (osd_hist.h.empty()) {
+          // size() - 1 below is unsigned and would wrap around for an
+          // empty histogram, indexing far out of bounds.
+          continue;
+        }
+        for (unsigned i = osd_hist.h.size() - 1; i > 0; --i) {
+          float ub = (float)(1 << i) / 1000.0;
+          if (ub < cct->_conf->mon_osd_warn_op_age)
+            break;
+          if (osd_hist.h[i]) {
+            if (ub > err_age) {
+              error_osd_by_max[ub].insert(p.first);
+            } else {
+              warn_osd_by_max[ub].insert(p.first);
+            }
+            // Only the highest populated bucket of each OSD is recorded
+            break;
+          }
+        }
+      }
+    }
+
+    if (!warn_detail.empty()) {
+      ostringstream ss;
+      ss << warn << " slow requests are blocked > "
+         << cct->_conf->mon_osd_warn_op_age << " sec";
+      auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str());
+      d.detail.swap(warn_detail);
+      int left = max;
+      for (auto& p : warn_osd_by_max) {
+        ostringstream osd_ss;
+        if (p.second.size() > 1) {
+          osd_ss << "osds " << p.second
+                 << " have blocked requests > " << p.first << " sec";
+        } else {
+          osd_ss << "osd." << *p.second.begin()
+                 << " has blocked requests > " << p.first << " sec";
+        }
+        d.detail.push_back(osd_ss.str());
+        if (--left == 0) {
+          break;
+        }
+      }
+    }
+    if (!error_detail.empty()) {
+      ostringstream ss;
+      ss << error << " stuck requests are blocked > "
+         << err_age << " sec";
+      auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str());
+      d.detail.swap(error_detail);
+      int left = max;
+      for (auto& p : error_osd_by_max) {
+        ostringstream osd_ss;
+        if (p.second.size() > 1) {
+          osd_ss << "osds " << p.second
+                 << " have stuck requests > " << p.first << " sec";
+        } else {
+          osd_ss << "osd." << *p.second.begin()
+                 << " has stuck requests > " << p.first << " sec";
+        }
+        d.detail.push_back(osd_ss.str());
+        if (--left == 0) {
+          break;
+        }
+      }
+    }
+  }
+
+  // PG_NOT_SCRUBBED
+  // PG_NOT_DEEP_SCRUBBED
+  {
+    if (cct->_conf->mon_warn_not_scrubbed ||
+        cct->_conf->mon_warn_not_deep_scrubbed) {
+      list<string> detail, deep_detail;
+      const double age = cct->_conf->mon_warn_not_scrubbed +
+        cct->_conf->mon_scrub_interval;
+      utime_t scrub_cutoff = now;
+      scrub_cutoff -= age;
+      const double deep_age = cct->_conf->mon_warn_not_deep_scrubbed +
+        cct->_conf->osd_deep_scrub_interval;
+      utime_t deep_cutoff = now;
+      deep_cutoff -= deep_age;
+      for (auto& p : pg_stat) {
+        if (cct->_conf->mon_warn_not_scrubbed &&
+            p.second.last_scrub_stamp < scrub_cutoff) {
+          ostringstream ss;
+          ss << "pg " << p.first << " not scrubbed since "
+             << p.second.last_scrub_stamp;
+          detail.push_back(ss.str());
+        }
+        if (cct->_conf->mon_warn_not_deep_scrubbed &&
+            p.second.last_deep_scrub_stamp < deep_cutoff) {
+          ostringstream ss;
+          ss << "pg " << p.first << " not deep-scrubbed since "
+             << p.second.last_deep_scrub_stamp;
+          deep_detail.push_back(ss.str());
+        }
+      }
+      if (!detail.empty()) {
+        ostringstream ss;
+        ss << detail.size() << " pgs not scrubbed for " << age;
+        auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str());
+        d.detail.swap(detail);
+      }
+      if (!deep_detail.empty()) {
+        ostringstream ss;
+        ss << deep_detail.size() << " pgs not deep-scrubbed for " << deep_age;
+        auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str());
+        d.detail.swap(deep_detail);
+      }
+    }
+  }
+
+  // POOL_APP
+  if (cct->_conf->get_val<bool>("mon_warn_on_pool_no_app")) {
+    list<string> detail;
+    for (auto &it : pools) {
+      const pg_pool_t &pool = it.second;
+      const string& pool_name = osdmap.get_pool_name(it.first);
+      auto it2 = pg_pool_sum.find(it.first);
+      if (it2 == pg_pool_sum.end()) {
+        continue;
+      }
+      const object_stat_sum_t& sum = it2->second.stats.sum;
+      // application metadata is not encoded until luminous is minimum
+      // required release
+      if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
+          sum.num_objects > 0 && pool.application_metadata.empty() &&
+          !pool.is_tier() && !cct->_conf->mon_debug_no_require_luminous) {
+        stringstream ss;
+        ss << "application not enabled on pool '" << pool_name << "'";
+        detail.push_back(ss.str());
+      }
+    }
+    if (!detail.empty()) {
+      ostringstream ss;
+      ss << "application not enabled on " << detail.size() << " pool(s)";
+      auto& d = checks->add("POOL_APP_NOT_ENABLED", HEALTH_WARN, ss.str());
+      stringstream tip;
+      tip << "use 'ceph osd pool application enable <pool-name> "
+          << "<app-name>', where <app-name> is 'cephfs', 'rbd', 'rgw', "
+          << "or freeform for custom applications.";
+      detail.push_back(tip.str());
+      d.detail.swap(detail);
+    }
   }
 }
 
@@ -2589,6 +3332,8 @@ void PGMap::get_health(
       note["backfill_toofull"] += p->second;
     if (p->first & PG_STATE_RECOVERY_TOOFULL)
       note["recovery_toofull"] += p->second;
+    if (p->first & PG_STATE_SNAPTRIM_ERROR)
+      note["snaptrim_error"] += p->second;
   }
 
   mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pgs;
@@ -2722,7 +3467,8 @@ void PGMap::get_health(
 
   // slow requests
   if (cct->_conf->mon_osd_warn_op_age > 0 &&
-      osd_sum.op_queue_age_hist.upper_bound() > cct->_conf->mon_osd_warn_op_age) {
+      osd_sum.op_queue_age_hist.upper_bound() / 1000.0  >
+      cct->_conf->mon_osd_warn_op_age) {
     auto sum = _warn_slow_request_histogram(
       cct, osd_sum.op_queue_age_hist, "", summary, NULL);
     if (sum.first > 0 || sum.second > 0) {
@@ -2735,7 +3481,7 @@ void PGMap::get_health(
       }
       if (sum.second > 0) {
        ostringstream ss;
-       ss << sum.first << " requests are blocked > "
+       ss << sum.second << " requests are blocked > "
           << (cct->_conf->mon_osd_warn_op_age *
               cct->_conf->mon_osd_err_op_age_ratio)
           << " sec";
@@ -2774,32 +3520,6 @@ void PGMap::get_health(
     }
   }
 
-  if (cct->_conf->mon_warn_osd_usage_min_max_delta) {
-    float max_osd_usage = 0.0, min_osd_usage = 1.0;
-    for (auto p = osd_stat.begin(); p != osd_stat.end(); ++p) {
-      // kb should never be 0, but avoid divide by zero in case of corruption
-      if (p->second.kb <= 0)
-        continue;
-      float usage = ((float)p->second.kb_used) / ((float)p->second.kb);
-      if (usage > max_osd_usage)
-        max_osd_usage = usage;
-      if (usage < min_osd_usage)
-        min_osd_usage = usage;
-    }
-    float diff = max_osd_usage - min_osd_usage;
-    if (diff > cct->_conf->mon_warn_osd_usage_min_max_delta) {
-      ostringstream ss;
-      ss << "difference between min (" << roundf(min_osd_usage*1000.0)/100.0
-        << "%) and max (" << roundf(max_osd_usage*1000.0)/100.0
-        << "%) osd usage " << roundf(diff*1000.0)/100.0 << "% > "
-        << roundf(cct->_conf->mon_warn_osd_usage_min_max_delta*1000.0)/100.0
-        << " (mon_warn_osd_usage_min_max_delta)";
-      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
-      if (detail)
-        detail->push_back(make_pair(HEALTH_WARN, ss.str()));
-    }
-  }
-
   // recovery
   list<string> sl;
   overall_recovery_summary(NULL, &sl);
@@ -2870,6 +3590,10 @@ void PGMap::get_health(
   // pg skew
   int num_in = osdmap.get_num_in_osds();
   int sum_pg_up = MAX(pg_sum.up, static_cast<int32_t>(pg_stat.size()));
+  int sum_objects = pg_sum.stats.sum.num_objects;
+  if (sum_objects < cct->_conf->mon_pg_warn_min_objects) {
+    return;
+  }
   if (num_in && cct->_conf->mon_pg_warn_min_per_osd > 0) {
     int per = sum_pg_up / num_in;
     if (per < cct->_conf->mon_pg_warn_min_per_osd && per) {
@@ -2931,6 +3655,70 @@ void PGMap::get_health(
     }
   }
 
+  for (auto it : pools) {
+    auto it2 = pg_pool_sum.find(it.first);
+    if (it2 == pg_pool_sum.end()) {
+      continue;
+    }
+    const pool_stat_t *pstat = &it2->second;
+    const object_stat_sum_t& sum = pstat->stats.sum;
+    const string& pool_name = osdmap.get_pool_name(it.first);
+    const pg_pool_t &pool = it.second;
+
+    float warn_threshold = (float)g_conf->mon_pool_quota_warn_threshold/100;
+    float crit_threshold = (float)g_conf->mon_pool_quota_crit_threshold/100;
+
+    if (pool.quota_max_objects > 0) {
+      stringstream ss;
+      health_status_t status = HEALTH_OK;
+      if ((uint64_t)sum.num_objects >= pool.quota_max_objects) {
+      } else if (crit_threshold > 0 &&
+                sum.num_objects >= pool.quota_max_objects*crit_threshold) {
+        ss << "pool '" << pool_name
+           << "' has " << sum.num_objects << " objects"
+           << " (max " << pool.quota_max_objects << ")";
+        status = HEALTH_ERR;
+      } else if (warn_threshold > 0 &&
+                sum.num_objects >= pool.quota_max_objects*warn_threshold) {
+        ss << "pool '" << pool_name
+           << "' has " << sum.num_objects << " objects"
+           << " (max " << pool.quota_max_objects << ")";
+        status = HEALTH_WARN;
+      }
+      if (status != HEALTH_OK) {
+        pair<health_status_t,string> s(status, ss.str());
+        summary.push_back(s);
+        if (detail)
+          detail->push_back(s);
+      }
+    }
+
+    if (pool.quota_max_bytes > 0) {
+      health_status_t status = HEALTH_OK;
+      stringstream ss;
+      if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
+      } else if (crit_threshold > 0 &&
+                sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
+        ss << "pool '" << pool_name
+           << "' has " << si_t(sum.num_bytes) << " bytes"
+           << " (max " << si_t(pool.quota_max_bytes) << ")";
+        status = HEALTH_ERR;
+      } else if (warn_threshold > 0 &&
+                sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
+        ss << "pool '" << pool_name
+           << "' has " << si_t(sum.num_bytes) << " bytes"
+           << " (max " << si_t(pool.quota_max_bytes) << ")";
+        status = HEALTH_WARN;
+      }
+      if (status != HEALTH_OK) {
+        pair<health_status_t,string> s(status, ss.str());
+        summary.push_back(s);
+        if (detail)
+          detail->push_back(s);
+      }
+    }
+  }
+
   print_unscrubbed_pgs(pg_stat, summary, detail, cct);
 }
 
@@ -3095,7 +3883,11 @@ int process_pg_map_command(
         break;
       } else {
         int filter = pg_string_state(state_str);
-        assert(filter != -1);
+        if (filter < 0) {
+          *ss << "'" << state_str << "' is not a valid pg state,"
+              << " available choices: " << pg_state_string(0xFFFFFFFF);
+          return -EINVAL;
+        }
         state |= filter;
       }
 
@@ -3410,9 +4202,12 @@ void PGMapUpdater::check_osd_map(
       my_pg_num = q->second;
     unsigned pg_num = pi.get_pg_num();
     if (my_pg_num != pg_num) {
+      ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
+                   << " != my pg_num " << my_pg_num << dendl;
       for (unsigned ps = my_pg_num; ps < pg_num; ++ps) {
        pg_t pgid(ps, poolid);
        if (pending_inc->pg_stat_updates.count(pgid) == 0) {
+         ldout(cct,20) << __func__ << " adding " << pgid << dendl;
          pg_stat_t &stats = pending_inc->pg_stat_updates[pgid];
          stats.last_fresh = osdmap.get_modified();
          stats.last_active = osdmap.get_modified();
@@ -3772,6 +4567,9 @@ int reweight::by_utilization(
       if (pools && pools->count(pg.first.pool()) == 0)
        continue;
       for (const auto acting : pg.second.acting) {
+        if (!osdmap.exists(acting)) {
+          continue;
+        }
        if (acting >= (int)pgs_by_osd.size())
          pgs_by_osd.resize(acting);
        if (pgs_by_osd[acting] == 0) {