}
void PGMapDigest::recovery_summary(Formatter *f, list<string> *psl,
- const pool_stat_t& delta_sum) const
+ const pool_stat_t& pool_sum) const
{
- if (delta_sum.stats.sum.num_objects_degraded && delta_sum.stats.sum.num_object_copies > 0) {
- double pc = (double)delta_sum.stats.sum.num_objects_degraded /
- (double)delta_sum.stats.sum.num_object_copies * (double)100.0;
+ if (pool_sum.stats.sum.num_objects_degraded && pool_sum.stats.sum.num_object_copies > 0) {
+ double pc = (double)pool_sum.stats.sum.num_objects_degraded /
+ (double)pool_sum.stats.sum.num_object_copies * (double)100.0;
char b[20];
snprintf(b, sizeof(b), "%.3lf", pc);
if (f) {
- f->dump_unsigned("degraded_objects", delta_sum.stats.sum.num_objects_degraded);
- f->dump_unsigned("degraded_total", delta_sum.stats.sum.num_object_copies);
+ f->dump_unsigned("degraded_objects", pool_sum.stats.sum.num_objects_degraded);
+ f->dump_unsigned("degraded_total", pool_sum.stats.sum.num_object_copies);
f->dump_float("degraded_ratio", pc / 100.0);
} else {
ostringstream ss;
- ss << delta_sum.stats.sum.num_objects_degraded
- << "/" << delta_sum.stats.sum.num_object_copies << " objects degraded (" << b << "%)";
+ ss << pool_sum.stats.sum.num_objects_degraded
+ << "/" << pool_sum.stats.sum.num_object_copies << " objects degraded (" << b << "%)";
psl->push_back(ss.str());
}
}
- if (delta_sum.stats.sum.num_objects_misplaced && delta_sum.stats.sum.num_object_copies > 0) {
- double pc = (double)delta_sum.stats.sum.num_objects_misplaced /
- (double)delta_sum.stats.sum.num_object_copies * (double)100.0;
+ if (pool_sum.stats.sum.num_objects_misplaced && pool_sum.stats.sum.num_object_copies > 0) {
+ double pc = (double)pool_sum.stats.sum.num_objects_misplaced /
+ (double)pool_sum.stats.sum.num_object_copies * (double)100.0;
char b[20];
snprintf(b, sizeof(b), "%.3lf", pc);
if (f) {
- f->dump_unsigned("misplaced_objects", delta_sum.stats.sum.num_objects_misplaced);
- f->dump_unsigned("misplaced_total", delta_sum.stats.sum.num_object_copies);
+ f->dump_unsigned("misplaced_objects", pool_sum.stats.sum.num_objects_misplaced);
+ f->dump_unsigned("misplaced_total", pool_sum.stats.sum.num_object_copies);
f->dump_float("misplaced_ratio", pc / 100.0);
} else {
ostringstream ss;
- ss << delta_sum.stats.sum.num_objects_misplaced
- << "/" << delta_sum.stats.sum.num_object_copies << " objects misplaced (" << b << "%)";
+ ss << pool_sum.stats.sum.num_objects_misplaced
+ << "/" << pool_sum.stats.sum.num_object_copies << " objects misplaced (" << b << "%)";
psl->push_back(ss.str());
}
}
- if (delta_sum.stats.sum.num_objects_unfound && delta_sum.stats.sum.num_objects) {
- double pc = (double)delta_sum.stats.sum.num_objects_unfound /
- (double)delta_sum.stats.sum.num_objects * (double)100.0;
+ if (pool_sum.stats.sum.num_objects_unfound && pool_sum.stats.sum.num_objects) {
+ double pc = (double)pool_sum.stats.sum.num_objects_unfound /
+ (double)pool_sum.stats.sum.num_objects * (double)100.0;
char b[20];
snprintf(b, sizeof(b), "%.3lf", pc);
if (f) {
- f->dump_unsigned("unfound_objects", delta_sum.stats.sum.num_objects_unfound);
- f->dump_unsigned("unfound_total", delta_sum.stats.sum.num_objects);
+ f->dump_unsigned("unfound_objects", pool_sum.stats.sum.num_objects_unfound);
+ f->dump_unsigned("unfound_total", pool_sum.stats.sum.num_objects);
f->dump_float("unfound_ratio", pc / 100.0);
} else {
ostringstream ss;
- ss << delta_sum.stats.sum.num_objects_unfound
- << "/" << delta_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
+ ss << pool_sum.stats.sum.num_objects_unfound
+ << "/" << pool_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
psl->push_back(ss.str());
}
}
void PGMapDigest::pool_recovery_summary(Formatter *f, list<string> *psl,
uint64_t poolid) const
{
- auto p = per_pool_sum_delta.find(poolid);
- if (p == per_pool_sum_delta.end())
+ auto p = pg_pool_sum.find(poolid);
+ if (p == pg_pool_sum.end())
return;
- recovery_summary(f, psl, p->second.first);
+ recovery_summary(f, psl, p->second);
}
void PGMapDigest::client_io_rate_summary(Formatter *f, ostream *out,
assert(inc.version == version+1);
version++;
- utime_t delta_t;
- delta_t = inc.stamp;
- delta_t -= stamp;
- stamp = inc.stamp;
-
pool_stat_t pg_sum_old = pg_sum;
mempool::pgmap::unordered_map<uint64_t, pool_stat_t> pg_pool_sum_old;
full_osds.erase(*p);
}
- // calculate a delta, and average over the last 2 deltas.
- pool_stat_t d = pg_sum;
- d.stats.sub(pg_sum_old.stats);
- pg_sum_deltas.push_back(make_pair(d, delta_t));
- stamp_delta += delta_t;
-
- pg_sum_delta.stats.add(d.stats);
- if (pg_sum_deltas.size() > (unsigned)MAX(1, cct ? cct->_conf->mon_stat_smooth_intervals : 1)) {
- pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats);
- stamp_delta -= pg_sum_deltas.front().second;
- pg_sum_deltas.pop_front();
+ // Skip the delta calculation until the sum has been synchronized, i.e.
+ // until both the previous stamp and the previous sum are non-zero.
+ if (!stamp.is_zero() && !pg_sum_old.stats.sum.is_zero()) {
+ utime_t delta_t;
+ delta_t = inc.stamp;
+ delta_t -= stamp;
+ // calculate a delta, and average over the last
+ // mon_stat_smooth_intervals deltas.
+ pool_stat_t d = pg_sum;
+ d.stats.sub(pg_sum_old.stats);
+ pg_sum_deltas.push_back(make_pair(d, delta_t));
+ stamp_delta += delta_t;
+ pg_sum_delta.stats.add(d.stats);
+ auto smooth_intervals =
+ cct ? cct->_conf->get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
+ if (pg_sum_deltas.size() > smooth_intervals) {
+ pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats);
+ stamp_delta -= pg_sum_deltas.front().second;
+ pg_sum_deltas.pop_front();
+ }
}
+ stamp = inc.stamp;
update_pool_deltas(cct, inc.stamp, pg_pool_sum_old);
tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT);
tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
+ tab.define_column("SNAPTRIMQ_LEN", TextTable::LEFT, TextTable::RIGHT);
}
for (auto i = pg_stats.begin();
<< st.last_scrub_stamp
<< st.last_deep_scrub
<< st.last_deep_scrub_stamp
+ << st.snaptrimq_len
<< TextTable::endrow;
}
}
*/
pool_stat_t d = current_pool_sum;
d.stats.sub(old_pool_sum.stats);
- delta_avg_list->push_back(make_pair(d,delta_t));
- *result_ts_delta += delta_t;
/* Aggregate current delta, and take out the last seen delta (if any) to
* average it out.
+ * Skip the delta while the previous pool sum is still zero, i.e. while
+ * the sum has not been synchronized yet.
*/
- result_pool_delta->stats.add(d.stats);
- size_t s = MAX(1, cct ? cct->_conf->mon_stat_smooth_intervals : 1);
+ if(!old_pool_sum.stats.sum.is_zero()) {
+ delta_avg_list->push_back(make_pair(d,delta_t));
+ *result_ts_delta += delta_t;
+ result_pool_delta->stats.add(d.stats);
+ }
+ size_t s = cct ? cct->_conf->get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
if (delta_avg_list->size() > s) {
result_pool_delta->stats.sub(delta_avg_list->front().first.stats);
*result_ts_delta -= delta_avg_list->front().second;
health_check_map_t *checks) const
{
utime_t now = ceph_clock_now();
- const unsigned max = cct->_conf->mon_health_max_detail;
+ const auto max = cct->_conf->get_val<uint64_t>("mon_health_max_detail");
const auto& pools = osdmap.get_pools();
typedef enum pg_consequence_t {
{ PG_STATE_INCOMPLETE, {UNAVAILABLE, {}} },
{ PG_STATE_REPAIR, {DAMAGED, {}} },
{ PG_STATE_SNAPTRIM_ERROR, {DAMAGED, {}} },
+ { PG_STATE_RECOVERY_UNFOUND, {DAMAGED, {}} },
+ { PG_STATE_BACKFILL_UNFOUND, {DAMAGED, {}} },
{ PG_STATE_BACKFILL_TOOFULL, {DEGRADED_FULL, {}} },
{ PG_STATE_RECOVERY_TOOFULL, {DEGRADED_FULL, {}} },
{ PG_STATE_DEGRADED, {DEGRADED, {}} },
{ PG_STATE_UNDERSIZED, {DEGRADED, [](const pg_stat_t &p){return p.last_fullsized;} } },
{ PG_STATE_STALE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_unstale;} } },
// Delayed and inverted reports
- { PG_STATE_ACTIVE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_active;}, true} },
- { PG_STATE_CLEAN, {DEGRADED, [](const pg_stat_t &p){return p.last_clean;}, true} }
+ { PG_STATE_ACTIVE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_active;}, true} }
};
// Specialized state printer that takes account of inversion of
}
}
- utime_t cutoff = now - utime_t(cct->_conf->mon_pg_stuck_threshold, 0);
+ utime_t cutoff = now - utime_t(cct->_conf->get_val<int64_t>("mon_pg_stuck_threshold"), 0);
// Loop over all PGs, if there are any possibly-unhealthy states in there
if (!possible_responses.empty()) {
for (const auto& i : pg_stat) {
// MANY_OBJECTS_PER_PG
if (!pg_stat.empty()) {
list<string> pgp_detail, many_detail;
+ const auto mon_pg_warn_min_objects =
+ cct->_conf->get_val<int64_t>("mon_pg_warn_min_objects");
+ const auto mon_pg_warn_min_pool_objects =
+ cct->_conf->get_val<int64_t>("mon_pg_warn_min_pool_objects");
+ const auto mon_pg_warn_max_object_skew =
+ cct->_conf->get_val<double>("mon_pg_warn_max_object_skew");
for (auto p = pg_pool_sum.begin();
p != pg_pool_sum.end();
++p) {
}
int average_objects_per_pg = pg_sum.stats.sum.num_objects / pg_stat.size();
if (average_objects_per_pg > 0 &&
- pg_sum.stats.sum.num_objects >= cct->_conf->mon_pg_warn_min_objects &&
- p->second.stats.sum.num_objects >=
- cct->_conf->mon_pg_warn_min_pool_objects) {
+ pg_sum.stats.sum.num_objects >= mon_pg_warn_min_objects &&
+ p->second.stats.sum.num_objects >= mon_pg_warn_min_pool_objects) {
int objects_per_pg = p->second.stats.sum.num_objects / pi->get_pg_num();
float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
- if (cct->_conf->mon_pg_warn_max_object_skew > 0 &&
- ratio > cct->_conf->mon_pg_warn_max_object_skew) {
+ if (mon_pg_warn_max_object_skew > 0 &&
+ ratio > mon_pg_warn_max_object_skew) {
ostringstream ss;
ss << "pool " << name << " objects per pg ("
<< objects_per_pg << ") is more than " << ratio
// POOL_FULL
// POOL_NEAR_FULL
{
- float warn_threshold = (float)g_conf->mon_pool_quota_warn_threshold/100;
- float crit_threshold = (float)g_conf->mon_pool_quota_crit_threshold/100;
+ float warn_threshold = (float)g_conf->get_val<int64_t>("mon_pool_quota_warn_threshold")/100;
+ float crit_threshold = (float)g_conf->get_val<int64_t>("mon_pool_quota_crit_threshold")/100;
list<string> full_detail, nearfull_detail;
unsigned full_pools = 0, nearfull_pools = 0;
for (auto it : pools) {
d.detail.swap(detail);
}
}
+
+ // PG_SLOW_SNAP_TRIMMING
+ // Raise a HEALTH_WARN when at least one PG has a snap trim queue whose
+ // length reaches mon_osd_snap_trim_queue_warn_on.  A threshold of 0 (or
+ // an empty pg_stat) skips the check altogether.
+ if (!pg_stat.empty() && cct->_conf->mon_osd_snap_trim_queue_warn_on > 0) {
+   uint32_t snapthreshold = cct->_conf->mon_osd_snap_trim_queue_warn_on;
+   uint64_t snaptrimq_exceeded = 0;   // number of PGs at/over the threshold
+   uint32_t longest_queue = 0;        // longest queue seen among those PGs
+   const pg_t* longest_q_pg = nullptr;
+   list<string> detail;
+
+   for (auto& i: pg_stat) {
+     uint32_t current_len = i.second.snaptrimq_len;
+     if (current_len < snapthreshold) {
+       continue;
+     }
+     snaptrimq_exceeded++;
+     // "<=" guarantees longest_q_pg is set by the first offending PG, so
+     // it is never null once snaptrimq_exceeded is non-zero.
+     if (longest_queue <= current_len) {
+       longest_q_pg = &i.first;
+       longest_queue = current_len;
+     }
+     // Emit at most max - 1 per-PG lines followed by one "...more" marker.
+     // max is unsigned, so the bound is written as "size + 1 < max" rather
+     // than "size < max - 1": the latter wraps around when max is 0 and
+     // would remove the cap on detail lines entirely.
+     if (detail.size() + 1 < max) {
+       stringstream ss;
+       ss << "snap trim queue for pg " << i.first << " at " << current_len;
+       detail.push_back(ss.str());
+     } else if (detail.size() < max) {
+       detail.push_back("...more pgs affected");
+     }
+   }
+
+   if (snaptrimq_exceeded) {
+     {
+       ostringstream ss;
+       ss << "longest queue on pg " << *longest_q_pg << " at " << longest_queue;
+       detail.push_back(ss.str());
+     }
+
+     stringstream ss;
+     ss << "snap trim queue for " << snaptrimq_exceeded << " pg(s) >= " << snapthreshold << " (mon_osd_snap_trim_queue_warn_on)";
+     auto& d = checks->add("PG_SLOW_SNAP_TRIMMING", HEALTH_WARN, ss.str());
+     detail.push_back("try decreasing \"osd snap trim sleep\" and/or increasing \"osd pg max concurrent snap trims\".");
+     d.detail.swap(detail);
+   }
+ }
}
void PGMap::get_health(
mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pgs;
utime_t now(ceph_clock_now());
- utime_t cutoff = now - utime_t(cct->_conf->mon_pg_stuck_threshold, 0);
+ utime_t cutoff = now - utime_t(g_conf->get_val<int64_t>("mon_pg_stuck_threshold"), 0);
uint64_t num_inactive_pgs = 0;
if (detail) {
note["stuck inactive"] = stuck_pgs.size();
num_inactive_pgs += stuck_pgs.size();
note_stuck_detail(PGMap::STUCK_INACTIVE, stuck_pgs,
- cct->_conf->mon_health_max_detail, detail);
+ cct->_conf->get_val<uint64_t>("mon_health_max_detail"), detail);
stuck_pgs.clear();
}
get_stuck_stats(PGMap::STUCK_UNCLEAN, cutoff, stuck_pgs);
note["stuck unclean"] = stuck_pgs.size();
note_stuck_detail(PGMap::STUCK_UNCLEAN, stuck_pgs,
- cct->_conf->mon_health_max_detail, detail);
+ cct->_conf->get_val<uint64_t>("mon_health_max_detail"), detail);
stuck_pgs.clear();
}
get_stuck_stats(PGMap::STUCK_UNDERSIZED, cutoff, stuck_pgs);
note["stuck undersized"] = stuck_pgs.size();
note_stuck_detail(PGMap::STUCK_UNDERSIZED, stuck_pgs,
- cct->_conf->mon_health_max_detail, detail);
+ cct->_conf->get_val<uint64_t>("mon_health_max_detail"), detail);
stuck_pgs.clear();
}
get_stuck_stats(PGMap::STUCK_DEGRADED, cutoff, stuck_pgs);
note["stuck degraded"] = stuck_pgs.size();
note_stuck_detail(PGMap::STUCK_DEGRADED, stuck_pgs,
- cct->_conf->mon_health_max_detail, detail);
+ cct->_conf->get_val<uint64_t>("mon_health_max_detail"), detail);
stuck_pgs.clear();
}
note["stuck stale"] = stuck_pgs.size();
num_inactive_pgs += stuck_pgs.size();
note_stuck_detail(PGMap::STUCK_STALE, stuck_pgs,
- cct->_conf->mon_health_max_detail, detail);
+ cct->_conf->get_val<uint64_t>("mon_health_max_detail"), detail);
}
}
} else {
if (cct->_conf->mon_pg_min_inactive > 0 &&
num_inactive_pgs >= cct->_conf->mon_pg_min_inactive) {
ostringstream ss;
- ss << num_inactive_pgs << " pgs are stuck inactive for more than " << cct->_conf->mon_pg_stuck_threshold << " seconds";
+ ss << num_inactive_pgs << " pgs are stuck inactive for more than " << g_conf->get_val<int64_t>("mon_pg_stuck_threshold") << " seconds";
summary.push_back(make_pair(HEALTH_ERR, ss.str()));
}
}
if (detail) {
int n = 0, more = 0;
- int max = cct->_conf->mon_health_max_detail;
+ int max = cct->_conf->get_val<uint64_t>("mon_health_max_detail");
for (auto p = pg_stat.begin();
p != pg_stat.end();
++p) {
stuckop_vec.push_back("unclean");
int64_t threshold;
cmd_getval(g_ceph_context, cmdmap, "threshold", threshold,
- int64_t(g_conf->mon_pg_stuck_threshold));
+ g_conf->get_val<int64_t>("mon_pg_stuck_threshold"));
r = pg_map.dump_stuck_pg_stats(ds, f, (int)threshold, stuckop_vec);
odata->append(ds);
// if a large number of osds changed state, just iterate over the whole
// pg map.
if (need_check_down_pg_osds.size() > (unsigned)osdmap.get_num_osds() *
- g_conf->mon_pg_check_down_all_threshold) {
+ g_conf->get_val<double>("mon_pg_check_down_all_threshold")) {
check_all = true;
}