1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include <boost/algorithm/string.hpp>
6 #include "include/rados.h"
9 #define dout_subsys ceph_subsys_mon
10 #include "common/debug.h"
11 #include "common/Clock.h"
12 #include "common/Formatter.h"
13 #include "global/global_context.h"
14 #include "include/ceph_features.h"
15 #include "include/stringify.h"
17 #include "osd/osd_types.h"
18 #include "osd/OSDMap.h"
19 #include <boost/range/adaptor/reversed.hpp>
21 #define dout_context g_ceph_context
28 using std::ostringstream
;
31 using std::stringstream
;
34 using ceph::bufferlist
;
35 using ceph::fixed_u_to_string
;
36 using ceph::common::cmd_getval
;
37 using ceph::common::cmd_getval_or
;
38 using ceph::common::cmd_putval
;
40 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMapDigest
, pgmap_digest
, pgmap
);
41 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap
, pgmap
, pgmap
);
42 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap::Incremental
, pgmap_inc
, pgmap
);
45 // ---------------------
48 void PGMapDigest::encode(bufferlist
& bl
, uint64_t features
) const
50 // NOTE: see PGMap::encode_digest
52 assert(HAVE_FEATURE(features
, SERVER_NAUTILUS
));
53 ENCODE_START(v
, 1, bl
);
55 encode(num_pg_active
, bl
);
56 encode(num_pg_unknown
, bl
);
58 encode(pg_pool_sum
, bl
, features
);
59 encode(pg_sum
, bl
, features
);
60 encode(osd_sum
, bl
, features
);
61 encode(num_pg_by_state
, bl
);
62 encode(num_pg_by_osd
, bl
);
63 encode(num_pg_by_pool
, bl
);
64 encode(osd_last_seq
, bl
);
65 encode(per_pool_sum_delta
, bl
, features
);
66 encode(per_pool_sum_deltas_stamps
, bl
);
67 encode(pg_sum_delta
, bl
, features
);
68 encode(stamp_delta
, bl
);
69 encode(avail_space_by_rule
, bl
);
70 encode(purged_snaps
, bl
);
71 encode(osd_sum_by_class
, bl
, features
);
75 void PGMapDigest::decode(bufferlist::const_iterator
& p
)
78 assert(struct_v
>= 4);
80 decode(num_pg_active
, p
);
81 decode(num_pg_unknown
, p
);
83 decode(pg_pool_sum
, p
);
86 decode(num_pg_by_state
, p
);
87 decode(num_pg_by_osd
, p
);
88 decode(num_pg_by_pool
, p
);
89 decode(osd_last_seq
, p
);
90 decode(per_pool_sum_delta
, p
);
91 decode(per_pool_sum_deltas_stamps
, p
);
92 decode(pg_sum_delta
, p
);
93 decode(stamp_delta
, p
);
94 decode(avail_space_by_rule
, p
);
95 decode(purged_snaps
, p
);
96 decode(osd_sum_by_class
, p
);
100 void PGMapDigest::dump(ceph::Formatter
*f
) const
102 f
->dump_unsigned("num_pg", num_pg
);
103 f
->dump_unsigned("num_pg_active", num_pg_active
);
104 f
->dump_unsigned("num_pg_unknown", num_pg_unknown
);
105 f
->dump_unsigned("num_osd", num_osd
);
106 f
->dump_object("pool_sum", pg_sum
);
107 f
->dump_object("osd_sum", osd_sum
);
109 f
->open_object_section("osd_sum_by_class");
110 for (auto& i
: osd_sum_by_class
) {
111 f
->dump_object(i
.first
.c_str(), i
.second
);
115 f
->open_array_section("pool_stats");
116 for (auto& p
: pg_pool_sum
) {
117 f
->open_object_section("pool_stat");
118 f
->dump_int("poolid", p
.first
);
119 auto q
= num_pg_by_pool
.find(p
.first
);
120 if (q
!= num_pg_by_pool
.end())
121 f
->dump_unsigned("num_pg", q
->second
);
126 f
->open_array_section("osd_stats");
128 // TODO: this isn't really correct since we can dump non-existent OSDs
129 // I dunno what osd_last_seq is set to in that case...
130 for (auto& p
: osd_last_seq
) {
131 f
->open_object_section("osd_stat");
132 f
->dump_int("osd", i
);
133 f
->dump_unsigned("seq", p
);
138 f
->open_array_section("num_pg_by_state");
139 for (auto& p
: num_pg_by_state
) {
140 f
->open_object_section("count");
141 f
->dump_string("state", pg_state_string(p
.first
));
142 f
->dump_unsigned("num", p
.second
);
146 f
->open_array_section("num_pg_by_osd");
147 for (auto& p
: num_pg_by_osd
) {
148 f
->open_object_section("count");
149 f
->dump_unsigned("osd", p
.first
);
150 f
->dump_unsigned("num_primary_pg", p
.second
.primary
);
151 f
->dump_unsigned("num_acting_pg", p
.second
.acting
);
152 f
->dump_unsigned("num_up_not_acting_pg", p
.second
.up_not_acting
);
156 f
->open_array_section("purged_snaps");
157 for (auto& j
: purged_snaps
) {
158 f
->open_object_section("pool");
159 f
->dump_int("pool", j
.first
);
160 f
->open_object_section("purged_snaps");
161 for (auto i
= j
.second
.begin(); i
!= j
.second
.end(); ++i
) {
162 f
->open_object_section("interval");
163 f
->dump_stream("start") << i
.get_start();
164 f
->dump_stream("length") << i
.get_len();
173 void PGMapDigest::generate_test_instances(list
<PGMapDigest
*>& ls
)
175 ls
.push_back(new PGMapDigest
);
178 inline std::string
percentify(const float& a
) {
179 std::stringstream ss
;
183 ss
<< std::fixed
<< std::setprecision(2) << a
;
187 void PGMapDigest::print_summary(ceph::Formatter
*f
, ostream
*out
) const
190 f
->open_array_section("pgs_by_state");
192 // list is descending numeric order (by count)
193 std::multimap
<int,uint64_t> state_by_count
; // count -> state
194 for (auto p
= num_pg_by_state
.begin();
195 p
!= num_pg_by_state
.end();
197 state_by_count
.insert(make_pair(p
->second
, p
->first
));
200 for (auto p
= state_by_count
.rbegin();
201 p
!= state_by_count
.rend();
204 f
->open_object_section("pgs_by_state_element");
205 f
->dump_string("state_name", pg_state_string(p
->second
));
206 f
->dump_unsigned("count", p
->first
);
214 f
->dump_unsigned("num_pgs", num_pg
);
215 f
->dump_unsigned("num_pools", pg_pool_sum
.size());
216 f
->dump_unsigned("num_objects", pg_sum
.stats
.sum
.num_objects
);
217 f
->dump_unsigned("data_bytes", pg_sum
.stats
.sum
.num_bytes
);
218 f
->dump_unsigned("bytes_used", osd_sum
.statfs
.get_used_raw());
219 f
->dump_unsigned("bytes_avail", osd_sum
.statfs
.available
);
220 f
->dump_unsigned("bytes_total", osd_sum
.statfs
.total
);
222 *out
<< " pools: " << pg_pool_sum
.size() << " pools, "
223 << num_pg
<< " pgs\n";
224 *out
<< " objects: " << si_u_t(pg_sum
.stats
.sum
.num_objects
) << " objects, "
225 << byte_u_t(pg_sum
.stats
.sum
.num_bytes
) << "\n";
227 << byte_u_t(osd_sum
.statfs
.get_used_raw()) << " used, "
228 << byte_u_t(osd_sum
.statfs
.available
) << " / "
229 << byte_u_t(osd_sum
.statfs
.total
) << " avail\n";
235 if (num_pg_unknown
> 0) {
236 float p
= (float)num_pg_unknown
/ (float)num_pg
;
238 f
->dump_float("unknown_pgs_ratio", p
);
241 snprintf(b
, sizeof(b
), "%.3lf", p
* 100.0);
242 *out
<< b
<< "% pgs unknown\n";
247 int num_pg_inactive
= num_pg
- num_pg_active
- num_pg_unknown
;
248 if (num_pg_inactive
> 0) {
249 float p
= (float)num_pg_inactive
/ (float)num_pg
;
251 f
->dump_float("inactive_pgs_ratio", p
);
257 snprintf(b
, sizeof(b
), "%.3f", p
* 100.0);
258 *out
<< b
<< "% pgs not active\n";
264 overall_recovery_summary(f
, &sl
);
265 if (!f
&& !sl
.empty()) {
266 for (auto p
= sl
.begin(); p
!= sl
.end(); ++p
) {
277 unsigned max_width
= 1;
278 for (auto p
= state_by_count
.rbegin(); p
!= state_by_count
.rend(); ++p
)
280 std::stringstream ss
;
282 max_width
= std::max
<size_t>(ss
.str().size(), max_width
);
285 for (auto p
= state_by_count
.rbegin(); p
!= state_by_count
.rend(); ++p
)
291 out
->setf(std::ios::left
);
292 *out
<< std::setw(max_width
) << p
->first
293 << " " << pg_state_string(p
->second
) << "\n";
294 out
->unsetf(std::ios::left
);
298 ostringstream ss_rec_io
;
299 overall_recovery_rate_summary(f
, &ss_rec_io
);
300 ostringstream ss_client_io
;
301 overall_client_io_rate_summary(f
, &ss_client_io
);
302 ostringstream ss_cache_io
;
303 overall_cache_io_rate_summary(f
, &ss_cache_io
);
305 if (!f
&& (ss_client_io
.str().length() || ss_rec_io
.str().length()
306 || ss_cache_io
.str().length())) {
311 if (!f
&& ss_client_io
.str().length())
312 *out
<< " client: " << ss_client_io
.str() << "\n";
313 if (!f
&& ss_rec_io
.str().length())
314 *out
<< " recovery: " << ss_rec_io
.str() << "\n";
315 if (!f
&& ss_cache_io
.str().length())
316 *out
<< " cache: " << ss_cache_io
.str() << "\n";
319 void PGMapDigest::print_oneline_summary(ceph::Formatter
*f
, ostream
*out
) const
321 std::stringstream ss
;
324 f
->open_array_section("num_pg_by_state");
325 for (auto p
= num_pg_by_state
.begin();
326 p
!= num_pg_by_state
.end();
329 f
->open_object_section("state");
330 f
->dump_string("name", pg_state_string(p
->first
));
331 f
->dump_unsigned("num", p
->second
);
334 if (p
!= num_pg_by_state
.begin())
336 ss
<< p
->second
<< " " << pg_state_string(p
->first
);
341 string states
= ss
.str();
343 *out
<< num_pg
<< " pgs: "
345 << byte_u_t(pg_sum
.stats
.sum
.num_bytes
) << " data, "
346 << byte_u_t(osd_sum
.statfs
.get_used()) << " used, "
347 << byte_u_t(osd_sum
.statfs
.available
) << " / "
348 << byte_u_t(osd_sum
.statfs
.total
) << " avail";
350 f
->dump_unsigned("num_pgs", num_pg
);
351 f
->dump_unsigned("num_bytes", pg_sum
.stats
.sum
.num_bytes
);
352 f
->dump_int("total_bytes", osd_sum
.statfs
.total
);
353 f
->dump_int("total_avail_bytes", osd_sum
.statfs
.available
);
354 f
->dump_int("total_used_bytes", osd_sum
.statfs
.get_used());
355 f
->dump_int("total_used_raw_bytes", osd_sum
.statfs
.get_used_raw());
358 // make non-negative; we can get negative values if osds send
359 // uncommitted stats and then "go backward" or if they are just
361 pool_stat_t pos_delta
= pg_sum_delta
;
363 if (pos_delta
.stats
.sum
.num_rd
||
364 pos_delta
.stats
.sum
.num_wr
) {
367 if (pos_delta
.stats
.sum
.num_rd
) {
368 int64_t rd
= (pos_delta
.stats
.sum
.num_rd_kb
<< 10) / (double)stamp_delta
;
370 *out
<< byte_u_t(rd
) << "/s rd, ";
372 f
->dump_unsigned("read_bytes_sec", rd
);
374 if (pos_delta
.stats
.sum
.num_wr
) {
375 int64_t wr
= (pos_delta
.stats
.sum
.num_wr_kb
<< 10) / (double)stamp_delta
;
377 *out
<< byte_u_t(wr
) << "/s wr, ";
379 f
->dump_unsigned("write_bytes_sec", wr
);
381 int64_t iops
= (pos_delta
.stats
.sum
.num_rd
+ pos_delta
.stats
.sum
.num_wr
) / (double)stamp_delta
;
383 *out
<< si_u_t(iops
) << " op/s";
385 f
->dump_unsigned("io_sec", iops
);
389 overall_recovery_summary(f
, &sl
);
391 for (auto p
= sl
.begin(); p
!= sl
.end(); ++p
)
393 std::stringstream ssr
;
394 overall_recovery_rate_summary(f
, &ssr
);
395 if (out
&& ssr
.str().length())
396 *out
<< "; " << ssr
.str() << " recovering";
399 void PGMapDigest::get_recovery_stats(
400 double *misplaced_ratio
,
401 double *degraded_ratio
,
402 double *inactive_pgs_ratio
,
403 double *unknown_pgs_ratio
) const
405 if (pg_sum
.stats
.sum
.num_objects_degraded
&&
406 pg_sum
.stats
.sum
.num_object_copies
> 0) {
407 *degraded_ratio
= (double)pg_sum
.stats
.sum
.num_objects_degraded
/
408 (double)pg_sum
.stats
.sum
.num_object_copies
;
412 if (pg_sum
.stats
.sum
.num_objects_misplaced
&&
413 pg_sum
.stats
.sum
.num_object_copies
> 0) {
414 *misplaced_ratio
= (double)pg_sum
.stats
.sum
.num_objects_misplaced
/
415 (double)pg_sum
.stats
.sum
.num_object_copies
;
417 *misplaced_ratio
= 0;
420 int num_pg_inactive
= num_pg
- num_pg_active
- num_pg_unknown
;
421 *inactive_pgs_ratio
= (double)num_pg_inactive
/ (double)num_pg
;
422 *unknown_pgs_ratio
= (double)num_pg_unknown
/ (double)num_pg
;
424 *inactive_pgs_ratio
= 0;
425 *unknown_pgs_ratio
= 0;
429 void PGMapDigest::recovery_summary(ceph::Formatter
*f
, list
<string
> *psl
,
430 const pool_stat_t
& pool_sum
) const
432 if (pool_sum
.stats
.sum
.num_objects_degraded
&& pool_sum
.stats
.sum
.num_object_copies
> 0) {
433 double pc
= (double)pool_sum
.stats
.sum
.num_objects_degraded
/
434 (double)pool_sum
.stats
.sum
.num_object_copies
* (double)100.0;
436 snprintf(b
, sizeof(b
), "%.3lf", pc
);
438 f
->dump_unsigned("degraded_objects", pool_sum
.stats
.sum
.num_objects_degraded
);
439 f
->dump_unsigned("degraded_total", pool_sum
.stats
.sum
.num_object_copies
);
440 f
->dump_float("degraded_ratio", pc
/ 100.0);
443 ss
<< pool_sum
.stats
.sum
.num_objects_degraded
444 << "/" << pool_sum
.stats
.sum
.num_object_copies
<< " objects degraded (" << b
<< "%)";
445 psl
->push_back(ss
.str());
448 if (pool_sum
.stats
.sum
.num_objects_misplaced
&& pool_sum
.stats
.sum
.num_object_copies
> 0) {
449 double pc
= (double)pool_sum
.stats
.sum
.num_objects_misplaced
/
450 (double)pool_sum
.stats
.sum
.num_object_copies
* (double)100.0;
452 snprintf(b
, sizeof(b
), "%.3lf", pc
);
454 f
->dump_unsigned("misplaced_objects", pool_sum
.stats
.sum
.num_objects_misplaced
);
455 f
->dump_unsigned("misplaced_total", pool_sum
.stats
.sum
.num_object_copies
);
456 f
->dump_float("misplaced_ratio", pc
/ 100.0);
459 ss
<< pool_sum
.stats
.sum
.num_objects_misplaced
460 << "/" << pool_sum
.stats
.sum
.num_object_copies
<< " objects misplaced (" << b
<< "%)";
461 psl
->push_back(ss
.str());
464 if (pool_sum
.stats
.sum
.num_objects_unfound
&& pool_sum
.stats
.sum
.num_objects
) {
465 double pc
= (double)pool_sum
.stats
.sum
.num_objects_unfound
/
466 (double)pool_sum
.stats
.sum
.num_objects
* (double)100.0;
468 snprintf(b
, sizeof(b
), "%.3lf", pc
);
470 f
->dump_unsigned("unfound_objects", pool_sum
.stats
.sum
.num_objects_unfound
);
471 f
->dump_unsigned("unfound_total", pool_sum
.stats
.sum
.num_objects
);
472 f
->dump_float("unfound_ratio", pc
/ 100.0);
475 ss
<< pool_sum
.stats
.sum
.num_objects_unfound
476 << "/" << pool_sum
.stats
.sum
.num_objects
<< " objects unfound (" << b
<< "%)";
477 psl
->push_back(ss
.str());
482 void PGMapDigest::recovery_rate_summary(ceph::Formatter
*f
, ostream
*out
,
483 const pool_stat_t
& delta_sum
,
484 utime_t delta_stamp
) const
486 // make non-negative; we can get negative values if osds send
487 // uncommitted stats and then "go backward" or if they are just
489 pool_stat_t pos_delta
= delta_sum
;
491 if (pos_delta
.stats
.sum
.num_objects_recovered
||
492 pos_delta
.stats
.sum
.num_bytes_recovered
||
493 pos_delta
.stats
.sum
.num_keys_recovered
) {
494 int64_t objps
= pos_delta
.stats
.sum
.num_objects_recovered
/ (double)delta_stamp
;
495 int64_t bps
= pos_delta
.stats
.sum
.num_bytes_recovered
/ (double)delta_stamp
;
496 int64_t kps
= pos_delta
.stats
.sum
.num_keys_recovered
/ (double)delta_stamp
;
498 f
->dump_int("recovering_objects_per_sec", objps
);
499 f
->dump_int("recovering_bytes_per_sec", bps
);
500 f
->dump_int("recovering_keys_per_sec", kps
);
501 f
->dump_int("num_objects_recovered", pos_delta
.stats
.sum
.num_objects_recovered
);
502 f
->dump_int("num_bytes_recovered", pos_delta
.stats
.sum
.num_bytes_recovered
);
503 f
->dump_int("num_keys_recovered", pos_delta
.stats
.sum
.num_keys_recovered
);
505 *out
<< byte_u_t(bps
) << "/s";
506 if (pos_delta
.stats
.sum
.num_keys_recovered
)
507 *out
<< ", " << si_u_t(kps
) << " keys/s";
508 *out
<< ", " << si_u_t(objps
) << " objects/s";
513 void PGMapDigest::overall_recovery_rate_summary(ceph::Formatter
*f
, ostream
*out
) const
515 recovery_rate_summary(f
, out
, pg_sum_delta
, stamp_delta
);
518 void PGMapDigest::overall_recovery_summary(ceph::Formatter
*f
, list
<string
> *psl
) const
520 recovery_summary(f
, psl
, pg_sum
);
523 void PGMapDigest::pool_recovery_rate_summary(ceph::Formatter
*f
, ostream
*out
,
524 uint64_t poolid
) const
526 auto p
= per_pool_sum_delta
.find(poolid
);
527 if (p
== per_pool_sum_delta
.end())
530 auto ts
= per_pool_sum_deltas_stamps
.find(p
->first
);
531 ceph_assert(ts
!= per_pool_sum_deltas_stamps
.end());
532 recovery_rate_summary(f
, out
, p
->second
.first
, ts
->second
);
535 void PGMapDigest::pool_recovery_summary(ceph::Formatter
*f
, list
<string
> *psl
,
536 uint64_t poolid
) const
538 auto p
= pg_pool_sum
.find(poolid
);
539 if (p
== pg_pool_sum
.end())
542 recovery_summary(f
, psl
, p
->second
);
545 void PGMapDigest::client_io_rate_summary(ceph::Formatter
*f
, ostream
*out
,
546 const pool_stat_t
& delta_sum
,
547 utime_t delta_stamp
) const
549 pool_stat_t pos_delta
= delta_sum
;
551 if (pos_delta
.stats
.sum
.num_rd
||
552 pos_delta
.stats
.sum
.num_wr
) {
553 if (pos_delta
.stats
.sum
.num_rd
) {
554 int64_t rd
= (pos_delta
.stats
.sum
.num_rd_kb
<< 10) / (double)delta_stamp
;
556 f
->dump_int("read_bytes_sec", rd
);
558 *out
<< byte_u_t(rd
) << "/s rd, ";
561 if (pos_delta
.stats
.sum
.num_wr
) {
562 int64_t wr
= (pos_delta
.stats
.sum
.num_wr_kb
<< 10) / (double)delta_stamp
;
564 f
->dump_int("write_bytes_sec", wr
);
566 *out
<< byte_u_t(wr
) << "/s wr, ";
569 int64_t iops_rd
= pos_delta
.stats
.sum
.num_rd
/ (double)delta_stamp
;
570 int64_t iops_wr
= pos_delta
.stats
.sum
.num_wr
/ (double)delta_stamp
;
572 f
->dump_int("read_op_per_sec", iops_rd
);
573 f
->dump_int("write_op_per_sec", iops_wr
);
575 *out
<< si_u_t(iops_rd
) << " op/s rd, " << si_u_t(iops_wr
) << " op/s wr";
580 void PGMapDigest::overall_client_io_rate_summary(ceph::Formatter
*f
, ostream
*out
) const
582 client_io_rate_summary(f
, out
, pg_sum_delta
, stamp_delta
);
585 void PGMapDigest::pool_client_io_rate_summary(ceph::Formatter
*f
, ostream
*out
,
586 uint64_t poolid
) const
588 auto p
= per_pool_sum_delta
.find(poolid
);
589 if (p
== per_pool_sum_delta
.end())
592 auto ts
= per_pool_sum_deltas_stamps
.find(p
->first
);
593 ceph_assert(ts
!= per_pool_sum_deltas_stamps
.end());
594 client_io_rate_summary(f
, out
, p
->second
.first
, ts
->second
);
597 void PGMapDigest::cache_io_rate_summary(ceph::Formatter
*f
, ostream
*out
,
598 const pool_stat_t
& delta_sum
,
599 utime_t delta_stamp
) const
601 pool_stat_t pos_delta
= delta_sum
;
603 bool have_output
= false;
605 if (pos_delta
.stats
.sum
.num_flush
) {
606 int64_t flush
= (pos_delta
.stats
.sum
.num_flush_kb
<< 10) / (double)delta_stamp
;
608 f
->dump_int("flush_bytes_sec", flush
);
610 *out
<< byte_u_t(flush
) << "/s flush";
614 if (pos_delta
.stats
.sum
.num_evict
) {
615 int64_t evict
= (pos_delta
.stats
.sum
.num_evict_kb
<< 10) / (double)delta_stamp
;
617 f
->dump_int("evict_bytes_sec", evict
);
621 *out
<< byte_u_t(evict
) << "/s evict";
625 if (pos_delta
.stats
.sum
.num_promote
) {
626 int64_t promote
= pos_delta
.stats
.sum
.num_promote
/ (double)delta_stamp
;
628 f
->dump_int("promote_op_per_sec", promote
);
632 *out
<< si_u_t(promote
) << " op/s promote";
636 if (pos_delta
.stats
.sum
.num_flush_mode_low
) {
638 f
->dump_int("num_flush_mode_low", pos_delta
.stats
.sum
.num_flush_mode_low
);
642 *out
<< si_u_t(pos_delta
.stats
.sum
.num_flush_mode_low
) << " PGs flushing";
646 if (pos_delta
.stats
.sum
.num_flush_mode_high
) {
648 f
->dump_int("num_flush_mode_high", pos_delta
.stats
.sum
.num_flush_mode_high
);
652 *out
<< si_u_t(pos_delta
.stats
.sum
.num_flush_mode_high
) << " PGs flushing (high)";
656 if (pos_delta
.stats
.sum
.num_evict_mode_some
) {
658 f
->dump_int("num_evict_mode_some", pos_delta
.stats
.sum
.num_evict_mode_some
);
662 *out
<< si_u_t(pos_delta
.stats
.sum
.num_evict_mode_some
) << " PGs evicting";
666 if (pos_delta
.stats
.sum
.num_evict_mode_full
) {
668 f
->dump_int("num_evict_mode_full", pos_delta
.stats
.sum
.num_evict_mode_full
);
672 *out
<< si_u_t(pos_delta
.stats
.sum
.num_evict_mode_full
) << " PGs evicting (full)";
677 void PGMapDigest::overall_cache_io_rate_summary(ceph::Formatter
*f
, ostream
*out
) const
679 cache_io_rate_summary(f
, out
, pg_sum_delta
, stamp_delta
);
682 void PGMapDigest::pool_cache_io_rate_summary(ceph::Formatter
*f
, ostream
*out
,
683 uint64_t poolid
) const
685 auto p
= per_pool_sum_delta
.find(poolid
);
686 if (p
== per_pool_sum_delta
.end())
689 auto ts
= per_pool_sum_deltas_stamps
.find(p
->first
);
690 ceph_assert(ts
!= per_pool_sum_deltas_stamps
.end());
691 cache_io_rate_summary(f
, out
, p
->second
.first
, ts
->second
);
694 ceph_statfs
PGMapDigest::get_statfs(OSDMap
&osdmap
,
695 std::optional
<int64_t> data_pool
) const
699 object_stat_sum_t sum
;
702 auto i
= pg_pool_sum
.find(*data_pool
);
703 if (i
!= pg_pool_sum
.end()) {
704 sum
= i
->second
.stats
.sum
;
710 statfs
.kb_used
= (sum
.num_bytes
>> 10);
711 statfs
.kb_avail
= get_pool_free_space(osdmap
, *data_pool
) >> 10;
712 statfs
.num_objects
= sum
.num_objects
;
713 statfs
.kb
= statfs
.kb_used
+ statfs
.kb_avail
;
716 statfs
.kb
= osd_sum
.statfs
.kb();
717 statfs
.kb_used
= osd_sum
.statfs
.kb_used_raw();
718 statfs
.kb_avail
= osd_sum
.statfs
.kb_avail();
719 statfs
.num_objects
= pg_sum
.stats
.sum
.num_objects
;
725 void PGMapDigest::dump_pool_stats_full(
726 const OSDMap
&osd_map
,
734 f
->open_array_section("pools");
736 tbl
.define_column("POOL", TextTable::LEFT
, TextTable::LEFT
);
737 tbl
.define_column("ID", TextTable::RIGHT
, TextTable::RIGHT
);
738 tbl
.define_column("PGS", TextTable::RIGHT
, TextTable::RIGHT
);
739 tbl
.define_column("STORED", TextTable::RIGHT
, TextTable::RIGHT
);
741 tbl
.define_column("(DATA)", TextTable::RIGHT
, TextTable::RIGHT
);
742 tbl
.define_column("(OMAP)", TextTable::RIGHT
, TextTable::RIGHT
);
744 tbl
.define_column("OBJECTS", TextTable::RIGHT
, TextTable::RIGHT
);
745 tbl
.define_column("USED", TextTable::RIGHT
, TextTable::RIGHT
);
747 tbl
.define_column("(DATA)", TextTable::RIGHT
, TextTable::RIGHT
);
748 tbl
.define_column("(OMAP)", TextTable::RIGHT
, TextTable::RIGHT
);
750 tbl
.define_column("%USED", TextTable::RIGHT
, TextTable::RIGHT
);
751 tbl
.define_column("MAX AVAIL", TextTable::RIGHT
, TextTable::RIGHT
);
754 tbl
.define_column("QUOTA OBJECTS", TextTable::RIGHT
, TextTable::RIGHT
);
755 tbl
.define_column("QUOTA BYTES", TextTable::RIGHT
, TextTable::RIGHT
);
756 tbl
.define_column("DIRTY", TextTable::RIGHT
, TextTable::RIGHT
);
757 tbl
.define_column("USED COMPR", TextTable::RIGHT
, TextTable::RIGHT
);
758 tbl
.define_column("UNDER COMPR", TextTable::RIGHT
, TextTable::RIGHT
);
762 map
<int,uint64_t> avail_by_rule
;
763 for (auto p
= osd_map
.get_pools().begin();
764 p
!= osd_map
.get_pools().end(); ++p
) {
765 int64_t pool_id
= p
->first
;
766 if ((pool_id
< 0) || (pg_pool_sum
.count(pool_id
) == 0))
769 const string
& pool_name
= osd_map
.get_pool_name(pool_id
);
770 auto pool_pg_num
= osd_map
.get_pg_num(pool_id
);
771 const pool_stat_t
&stat
= pg_pool_sum
.at(pool_id
);
773 const pg_pool_t
*pool
= osd_map
.get_pg_pool(pool_id
);
774 int ruleno
= pool
->get_crush_rule();
776 if (avail_by_rule
.count(ruleno
) == 0) {
777 // FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked
778 avail
= get_rule_avail(ruleno
);
781 avail_by_rule
[ruleno
] = avail
;
783 avail
= avail_by_rule
[ruleno
];
786 f
->open_object_section("pool");
787 f
->dump_string("name", pool_name
);
788 f
->dump_int("id", pool_id
);
789 f
->open_object_section("stats");
795 float raw_used_rate
= osd_map
.pool_raw_used_rate(pool_id
);
796 bool per_pool
= use_per_pool_stats();
797 bool per_pool_omap
= use_per_pool_omap_stats();
798 dump_object_stat_sum(tbl
, f
, stat
, avail
, raw_used_rate
, verbose
, per_pool
,
799 per_pool_omap
, pool
);
801 f
->close_section(); // stats
802 f
->close_section(); // pool
804 tbl
<< TextTable::endrow
;
810 ceph_assert(ss
!= nullptr);
811 *ss
<< "--- POOLS ---\n";
816 void PGMapDigest::dump_cluster_stats(stringstream
*ss
,
821 f
->open_object_section("stats");
822 f
->dump_int("total_bytes", osd_sum
.statfs
.total
);
823 f
->dump_int("total_avail_bytes", osd_sum
.statfs
.available
);
824 f
->dump_int("total_used_bytes", osd_sum
.statfs
.get_used());
825 f
->dump_int("total_used_raw_bytes", osd_sum
.statfs
.get_used_raw());
826 f
->dump_float("total_used_raw_ratio", osd_sum
.statfs
.get_used_raw_ratio());
827 f
->dump_unsigned("num_osds", osd_sum
.num_osds
);
828 f
->dump_unsigned("num_per_pool_osds", osd_sum
.num_per_pool_osds
);
829 f
->dump_unsigned("num_per_pool_omap_osds", osd_sum
.num_per_pool_omap_osds
);
831 f
->open_object_section("stats_by_class");
832 for (auto& i
: osd_sum_by_class
) {
833 f
->open_object_section(i
.first
.c_str());
834 f
->dump_int("total_bytes", i
.second
.statfs
.total
);
835 f
->dump_int("total_avail_bytes", i
.second
.statfs
.available
);
836 f
->dump_int("total_used_bytes", i
.second
.statfs
.get_used());
837 f
->dump_int("total_used_raw_bytes", i
.second
.statfs
.get_used_raw());
838 f
->dump_float("total_used_raw_ratio",
839 i
.second
.statfs
.get_used_raw_ratio());
844 ceph_assert(ss
!= nullptr);
846 tbl
.define_column("CLASS", TextTable::LEFT
, TextTable::LEFT
);
847 tbl
.define_column("SIZE", TextTable::RIGHT
, TextTable::RIGHT
);
848 tbl
.define_column("AVAIL", TextTable::RIGHT
, TextTable::RIGHT
);
849 tbl
.define_column("USED", TextTable::RIGHT
, TextTable::RIGHT
);
850 tbl
.define_column("RAW USED", TextTable::RIGHT
, TextTable::RIGHT
);
851 tbl
.define_column("%RAW USED", TextTable::RIGHT
, TextTable::RIGHT
);
854 for (auto& i
: osd_sum_by_class
) {
856 tbl
<< stringify(byte_u_t(i
.second
.statfs
.total
))
857 << stringify(byte_u_t(i
.second
.statfs
.available
))
858 << stringify(byte_u_t(i
.second
.statfs
.get_used()))
859 << stringify(byte_u_t(i
.second
.statfs
.get_used_raw()))
860 << percentify(i
.second
.statfs
.get_used_raw_ratio()*100.0)
861 << TextTable::endrow
;
864 tbl
<< stringify(byte_u_t(osd_sum
.statfs
.total
))
865 << stringify(byte_u_t(osd_sum
.statfs
.available
))
866 << stringify(byte_u_t(osd_sum
.statfs
.get_used()))
867 << stringify(byte_u_t(osd_sum
.statfs
.get_used_raw()))
868 << percentify(osd_sum
.statfs
.get_used_raw_ratio()*100.0)
869 << TextTable::endrow
;
871 *ss
<< "--- RAW STORAGE ---\n";
876 void PGMapDigest::dump_object_stat_sum(
877 TextTable
&tbl
, ceph::Formatter
*f
,
878 const pool_stat_t
&pool_stat
, uint64_t avail
,
879 float raw_used_rate
, bool verbose
, bool per_pool
, bool per_pool_omap
,
880 const pg_pool_t
*pool
)
882 const object_stat_sum_t
&sum
= pool_stat
.stats
.sum
;
883 const store_statfs_t statfs
= pool_stat
.store_stats
;
885 if (sum
.num_object_copies
> 0) {
886 raw_used_rate
*= (float)(sum
.num_object_copies
- sum
.num_objects_degraded
) / sum
.num_object_copies
;
889 uint64_t used_data_bytes
= pool_stat
.get_allocated_data_bytes(per_pool
);
890 uint64_t used_omap_bytes
= pool_stat
.get_allocated_omap_bytes(per_pool_omap
);
891 uint64_t used_bytes
= used_data_bytes
+ used_omap_bytes
;
894 // note avail passed in is raw_avail, calc raw_used here.
897 used
/= used
+ avail
;
898 } else if (used_bytes
) {
901 auto avail_res
= raw_used_rate
? avail
/ raw_used_rate
: 0;
902 // an approximation for actually stored user data
903 auto stored_data_normalized
= pool_stat
.get_user_data_bytes(
904 raw_used_rate
, per_pool
);
905 auto stored_omap_normalized
= pool_stat
.get_user_omap_bytes(
906 raw_used_rate
, per_pool_omap
);
907 auto stored_normalized
= stored_data_normalized
+ stored_omap_normalized
;
908 // same, amplied by replication or EC
909 auto stored_raw
= stored_normalized
* raw_used_rate
;
911 f
->dump_int("stored", stored_normalized
);
913 f
->dump_int("stored_data", stored_data_normalized
);
914 f
->dump_int("stored_omap", stored_omap_normalized
);
916 f
->dump_int("objects", sum
.num_objects
);
917 f
->dump_int("kb_used", shift_round_up(used_bytes
, 10));
918 f
->dump_int("bytes_used", used_bytes
);
920 f
->dump_int("data_bytes_used", used_data_bytes
);
921 f
->dump_int("omap_bytes_used", used_omap_bytes
);
923 f
->dump_float("percent_used", used
);
924 f
->dump_unsigned("max_avail", avail_res
);
926 f
->dump_int("quota_objects", pool
->quota_max_objects
);
927 f
->dump_int("quota_bytes", pool
->quota_max_bytes
);
928 if (pool
->is_tier()) {
929 f
->dump_int("dirty", sum
.num_objects_dirty
);
931 f
->dump_int("dirty", 0);
933 f
->dump_int("rd", sum
.num_rd
);
934 f
->dump_int("rd_bytes", sum
.num_rd_kb
* 1024ull);
935 f
->dump_int("wr", sum
.num_wr
);
936 f
->dump_int("wr_bytes", sum
.num_wr_kb
* 1024ull);
937 f
->dump_int("compress_bytes_used", statfs
.data_compressed_allocated
);
938 f
->dump_int("compress_under_bytes", statfs
.data_compressed_original
);
939 // Stored by user amplified by replication
940 f
->dump_int("stored_raw", stored_raw
);
941 f
->dump_unsigned("avail_raw", avail
);
944 tbl
<< stringify(byte_u_t(stored_normalized
));
946 tbl
<< stringify(byte_u_t(stored_data_normalized
));
947 tbl
<< stringify(byte_u_t(stored_omap_normalized
));
949 tbl
<< stringify(si_u_t(sum
.num_objects
));
950 tbl
<< stringify(byte_u_t(used_bytes
));
952 tbl
<< stringify(byte_u_t(used_data_bytes
));
953 tbl
<< stringify(byte_u_t(used_omap_bytes
));
955 tbl
<< percentify(used
*100);
956 tbl
<< stringify(byte_u_t(avail_res
));
958 if (pool
->quota_max_objects
== 0)
961 tbl
<< stringify(si_u_t(pool
->quota_max_objects
));
962 if (pool
->quota_max_bytes
== 0)
965 tbl
<< stringify(byte_u_t(pool
->quota_max_bytes
));
966 if (pool
->is_tier()) {
967 tbl
<< stringify(si_u_t(sum
.num_objects_dirty
));
971 tbl
<< stringify(byte_u_t(statfs
.data_compressed_allocated
));
972 tbl
<< stringify(byte_u_t(statfs
.data_compressed_original
));
977 int64_t PGMapDigest::get_pool_free_space(const OSDMap
&osd_map
,
978 int64_t poolid
) const
980 const pg_pool_t
*pool
= osd_map
.get_pg_pool(poolid
);
981 int ruleno
= pool
->get_crush_rule();
983 avail
= get_rule_avail(ruleno
);
987 return avail
/ osd_map
.pool_raw_used_rate(poolid
);
990 int64_t PGMap::get_rule_avail(const OSDMap
& osdmap
, int ruleno
) const
993 int r
= osdmap
.crush
->get_rule_weight_osd_map(ruleno
, &wm
);
1001 float fratio
= osdmap
.get_full_ratio();
1004 for (auto p
= wm
.begin(); p
!= wm
.end(); ++p
) {
1005 auto osd_info
= osd_stat
.find(p
->first
);
1006 if (osd_info
!= osd_stat
.end()) {
1007 if (osd_info
->second
.statfs
.total
== 0 || p
->second
== 0) {
1008 // osd must be out, hence its stats have been zeroed
1009 // (unless we somehow managed to have a disk with size 0...)
1011 // (p->second == 0), if osd weight is 0, no need to
1012 // calculate proj below.
1015 double unusable
= (double)osd_info
->second
.statfs
.kb() *
1017 double avail
= std::max(0.0, (double)osd_info
->second
.statfs
.kb_avail() - unusable
);
1019 int64_t proj
= (int64_t)(avail
/ (double)p
->second
);
1020 if (min
< 0 || proj
< min
) {
1024 if (osdmap
.is_up(p
->first
)) {
1025 // This is a level 4 rather than an error, because we might have
1026 // only just started, and not received the first stats message yet.
1027 dout(4) << "OSD " << p
->first
<< " is up, but has no stats" << dendl
;
1034 void PGMap::get_rules_avail(const OSDMap
& osdmap
,
1035 std::map
<int,int64_t> *avail_map
) const
1038 for (auto p
: osdmap
.get_pools()) {
1039 int64_t pool_id
= p
.first
;
1040 if ((pool_id
< 0) || (pg_pool_sum
.count(pool_id
) == 0))
1042 const pg_pool_t
*pool
= osdmap
.get_pg_pool(pool_id
);
1043 int ruleno
= pool
->get_crush_rule();
1044 if (avail_map
->count(ruleno
) == 0)
1045 (*avail_map
)[ruleno
] = get_rule_avail(osdmap
, ruleno
);
1049 // ---------------------
1052 void PGMap::Incremental::dump(ceph::Formatter
*f
) const
1054 f
->dump_unsigned("version", version
);
1055 f
->dump_stream("stamp") << stamp
;
1056 f
->dump_unsigned("osdmap_epoch", osdmap_epoch
);
1057 f
->dump_unsigned("pg_scan_epoch", pg_scan
);
1059 f
->open_array_section("pg_stat_updates");
1060 for (auto p
= pg_stat_updates
.begin(); p
!= pg_stat_updates
.end(); ++p
) {
1061 f
->open_object_section("pg_stat");
1062 f
->dump_stream("pgid") << p
->first
;
1068 f
->open_array_section("osd_stat_updates");
1069 for (auto p
= osd_stat_updates
.begin(); p
!= osd_stat_updates
.end(); ++p
) {
1070 f
->open_object_section("osd_stat");
1071 f
->dump_int("osd", p
->first
);
1076 f
->open_array_section("pool_statfs_updates");
1077 for (auto p
= pool_statfs_updates
.begin(); p
!= pool_statfs_updates
.end(); ++p
) {
1078 f
->open_object_section("pool_statfs");
1079 f
->dump_stream("poolid/osd") << p
->first
;
1085 f
->open_array_section("osd_stat_removals");
1086 for (auto p
= osd_stat_rm
.begin(); p
!= osd_stat_rm
.end(); ++p
)
1087 f
->dump_int("osd", *p
);
1090 f
->open_array_section("pg_removals");
1091 for (auto p
= pg_remove
.begin(); p
!= pg_remove
.end(); ++p
)
1092 f
->dump_stream("pgid") << *p
;
1096 void PGMap::Incremental::generate_test_instances(list
<PGMap::Incremental
*>& o
)
1098 o
.push_back(new Incremental
);
1099 o
.push_back(new Incremental
);
1100 o
.back()->version
= 1;
1101 o
.back()->stamp
= utime_t(123,345);
1102 o
.push_back(new Incremental
);
1103 o
.back()->version
= 2;
1104 o
.back()->pg_stat_updates
[pg_t(1,2)] = pg_stat_t();
1105 o
.back()->osd_stat_updates
[5] = osd_stat_t();
1106 o
.push_back(new Incremental
);
1107 o
.back()->version
= 3;
1108 o
.back()->osdmap_epoch
= 1;
1109 o
.back()->pg_scan
= 2;
1110 o
.back()->pg_stat_updates
[pg_t(4,5)] = pg_stat_t();
1111 o
.back()->osd_stat_updates
[6] = osd_stat_t();
1112 o
.back()->pg_remove
.insert(pg_t(1,2));
1113 o
.back()->osd_stat_rm
.insert(5);
1114 o
.back()->pool_statfs_updates
[std::make_pair(1234,4)] = store_statfs_t();
1119 void PGMap::apply_incremental(CephContext
*cct
, const Incremental
& inc
)
1121 ceph_assert(inc
.version
== version
+1);
1124 pool_stat_t pg_sum_old
= pg_sum
;
1125 mempool::pgmap::unordered_map
<int32_t, pool_stat_t
> pg_pool_sum_old
;
1126 pg_pool_sum_old
= pg_pool_sum
;
1128 for (auto p
= inc
.pg_stat_updates
.begin();
1129 p
!= inc
.pg_stat_updates
.end();
1131 const pg_t
&update_pg(p
->first
);
1132 auto update_pool
= update_pg
.pool();
1133 const pg_stat_t
&update_stat(p
->second
);
1135 auto pg_stat_iter
= pg_stat
.find(update_pg
);
1136 pool_stat_t
&pool_sum_ref
= pg_pool_sum
[update_pool
];
1137 if (pg_stat_iter
== pg_stat
.end()) {
1138 pg_stat
.insert(make_pair(update_pg
, update_stat
));
1140 stat_pg_sub(update_pg
, pg_stat_iter
->second
);
1141 pool_sum_ref
.sub(pg_stat_iter
->second
);
1142 pg_stat_iter
->second
= update_stat
;
1144 stat_pg_add(update_pg
, update_stat
);
1145 pool_sum_ref
.add(update_stat
);
1148 for (auto p
= inc
.pool_statfs_updates
.begin();
1149 p
!= inc
.pool_statfs_updates
.end();
1151 auto update_pool
= p
->first
.first
;
1152 auto update_osd
= p
->first
.second
;
1153 auto& statfs_inc
= p
->second
;
1155 auto pool_statfs_iter
=
1156 pool_statfs
.find(std::make_pair(update_pool
, update_osd
));
1157 if (pg_pool_sum
.count(update_pool
)) {
1158 pool_stat_t
&pool_sum_ref
= pg_pool_sum
[update_pool
];
1159 if (pool_statfs_iter
== pool_statfs
.end()) {
1160 pool_statfs
.emplace(std::make_pair(update_pool
, update_osd
), statfs_inc
);
1162 pool_sum_ref
.sub(pool_statfs_iter
->second
);
1163 pool_statfs_iter
->second
= statfs_inc
;
1165 pool_sum_ref
.add(statfs_inc
);
1169 for (auto p
= inc
.get_osd_stat_updates().begin();
1170 p
!= inc
.get_osd_stat_updates().end();
1173 const osd_stat_t
&new_stats(p
->second
);
1175 auto t
= osd_stat
.find(osd
);
1176 if (t
== osd_stat
.end()) {
1177 osd_stat
.insert(make_pair(osd
, new_stats
));
1179 stat_osd_sub(t
->first
, t
->second
);
1180 t
->second
= new_stats
;
1182 stat_osd_add(osd
, new_stats
);
1184 set
<int64_t> deleted_pools
;
1185 for (auto p
= inc
.pg_remove
.begin();
1186 p
!= inc
.pg_remove
.end();
1188 const pg_t
&removed_pg(*p
);
1189 auto s
= pg_stat
.find(removed_pg
);
1190 bool pool_erased
= false;
1191 if (s
!= pg_stat
.end()) {
1192 pool_erased
= stat_pg_sub(removed_pg
, s
->second
);
1194 // decrease pool stats if pg was removed
1195 auto pool_stats_it
= pg_pool_sum
.find(removed_pg
.pool());
1196 if (pool_stats_it
!= pg_pool_sum
.end()) {
1197 pool_stats_it
->second
.sub(s
->second
);
1202 deleted_pools
.insert(removed_pg
.pool());
1207 for (auto p
= inc
.get_osd_stat_rm().begin();
1208 p
!= inc
.get_osd_stat_rm().end();
1210 auto t
= osd_stat
.find(*p
);
1211 if (t
!= osd_stat
.end()) {
1212 stat_osd_sub(t
->first
, t
->second
);
1215 for (auto i
= pool_statfs
.begin(); i
!= pool_statfs
.end();) {
1216 if (i
->first
.second
== *p
) {
1217 pg_pool_sum
[i
->first
.first
].sub(i
->second
);
1218 i
= pool_statfs
.erase(i
);
1225 // skip calculating delta while sum was not synchronized
1226 if (!stamp
.is_zero() && !pg_sum_old
.stats
.sum
.is_zero()) {
1228 delta_t
= inc
.stamp
;
1230 // calculate a delta, and average over the last 2 deltas.
1231 pool_stat_t d
= pg_sum
;
1232 d
.stats
.sub(pg_sum_old
.stats
);
1233 pg_sum_deltas
.push_back(make_pair(d
, delta_t
));
1234 stamp_delta
+= delta_t
;
1235 pg_sum_delta
.stats
.add(d
.stats
);
1236 auto smooth_intervals
=
1237 cct
? cct
->_conf
.get_val
<uint64_t>("mon_stat_smooth_intervals") : 1;
1238 while (pg_sum_deltas
.size() > smooth_intervals
) {
1239 pg_sum_delta
.stats
.sub(pg_sum_deltas
.front().first
.stats
);
1240 stamp_delta
-= pg_sum_deltas
.front().second
;
1241 pg_sum_deltas
.pop_front();
1246 update_pool_deltas(cct
, inc
.stamp
, pg_pool_sum_old
);
1248 for (auto p
: deleted_pools
) {
1250 dout(20) << " deleted pool " << p
<< dendl
;
1254 if (inc
.osdmap_epoch
)
1255 last_osdmap_epoch
= inc
.osdmap_epoch
;
1257 last_pg_scan
= inc
.pg_scan
;
1260 void PGMap::calc_stats()
1266 pg_pool_sum
.clear();
1267 num_pg_by_pool
.clear();
1269 pg_sum
= pool_stat_t();
1270 osd_sum
= osd_stat_t();
1271 osd_sum_by_class
.clear();
1272 num_pg_by_state
.clear();
1273 num_pg_by_pool_state
.clear();
1274 num_pg_by_osd
.clear();
1276 for (auto p
= pg_stat
.begin();
1280 stat_pg_add(pg
, p
->second
);
1281 pg_pool_sum
[pg
.pool()].add(p
->second
);
1283 for (auto p
= pool_statfs
.begin();
1284 p
!= pool_statfs
.end();
1286 auto pool
= p
->first
.first
;
1287 pg_pool_sum
[pool
].add(p
->second
);
1289 for (auto p
= osd_stat
.begin();
1290 p
!= osd_stat
.end();
1292 stat_osd_add(p
->first
, p
->second
);
1295 void PGMap::stat_pg_add(const pg_t
&pgid
, const pg_stat_t
&s
,
1298 auto pool
= pgid
.pool();
1302 num_pg_by_state
[s
.state
]++;
1303 num_pg_by_pool_state
[pgid
.pool()][s
.state
]++;
1304 num_pg_by_pool
[pool
]++;
1306 if ((s
.state
& PG_STATE_CREATING
) &&
1307 s
.parent_split_bits
== 0) {
1308 creating_pgs
.insert(pgid
);
1309 if (s
.acting_primary
>= 0) {
1310 creating_pgs_by_osd_epoch
[s
.acting_primary
][s
.mapping_epoch
].insert(pgid
);
1314 if (s
.state
& PG_STATE_ACTIVE
) {
1324 for (auto p
= s
.blocked_by
.begin();
1325 p
!= s
.blocked_by
.end();
1327 ++blocked_by_sum
[*p
];
1330 for (auto p
= s
.acting
.begin(); p
!= s
.acting
.end(); ++p
) {
1331 pg_by_osd
[*p
].insert(pgid
);
1332 num_pg_by_osd
[*p
].acting
++;
1334 for (auto p
= s
.up
.begin(); p
!= s
.up
.end(); ++p
) {
1335 auto& t
= pg_by_osd
[*p
];
1336 if (t
.find(pgid
) == t
.end()) {
1338 num_pg_by_osd
[*p
].up_not_acting
++;
1342 if (s
.up_primary
>= 0) {
1343 num_pg_by_osd
[s
.up_primary
].primary
++;
1347 bool PGMap::stat_pg_sub(const pg_t
&pgid
, const pg_stat_t
&s
,
1350 bool pool_erased
= false;
1354 int end
= --num_pg_by_state
[s
.state
];
1355 ceph_assert(end
>= 0);
1357 num_pg_by_state
.erase(s
.state
);
1358 if (--num_pg_by_pool_state
[pgid
.pool()][s
.state
] == 0) {
1359 num_pg_by_pool_state
[pgid
.pool()].erase(s
.state
);
1361 end
= --num_pg_by_pool
[pgid
.pool()];
1366 if ((s
.state
& PG_STATE_CREATING
) &&
1367 s
.parent_split_bits
== 0) {
1368 creating_pgs
.erase(pgid
);
1369 if (s
.acting_primary
>= 0) {
1370 map
<epoch_t
,set
<pg_t
> >& r
= creating_pgs_by_osd_epoch
[s
.acting_primary
];
1371 r
[s
.mapping_epoch
].erase(pgid
);
1372 if (r
[s
.mapping_epoch
].empty())
1373 r
.erase(s
.mapping_epoch
);
1375 creating_pgs_by_osd_epoch
.erase(s
.acting_primary
);
1379 if (s
.state
& PG_STATE_ACTIVE
) {
1389 for (auto p
= s
.blocked_by
.begin();
1390 p
!= s
.blocked_by
.end();
1392 auto q
= blocked_by_sum
.find(*p
);
1393 ceph_assert(q
!= blocked_by_sum
.end());
1396 blocked_by_sum
.erase(q
);
1399 set
<int32_t> actingset
;
1400 for (auto p
= s
.acting
.begin(); p
!= s
.acting
.end(); ++p
) {
1401 actingset
.insert(*p
);
1402 auto& oset
= pg_by_osd
[*p
];
1405 pg_by_osd
.erase(*p
);
1406 auto it
= num_pg_by_osd
.find(*p
);
1407 if (it
!= num_pg_by_osd
.end() && it
->second
.acting
> 0)
1408 it
->second
.acting
--;
1410 for (auto p
= s
.up
.begin(); p
!= s
.up
.end(); ++p
) {
1411 auto& oset
= pg_by_osd
[*p
];
1414 pg_by_osd
.erase(*p
);
1415 if (actingset
.count(*p
))
1417 auto it
= num_pg_by_osd
.find(*p
);
1418 if (it
!= num_pg_by_osd
.end() && it
->second
.up_not_acting
> 0)
1419 it
->second
.up_not_acting
--;
1422 if (s
.up_primary
>= 0) {
1423 auto it
= num_pg_by_osd
.find(s
.up_primary
);
1424 if (it
!= num_pg_by_osd
.end() && it
->second
.primary
> 0)
1425 it
->second
.primary
--;
1430 void PGMap::calc_purged_snaps()
1432 purged_snaps
.clear();
1433 set
<int64_t> unknown
;
1434 for (auto& i
: pg_stat
) {
1435 if (i
.second
.state
== 0) {
1436 unknown
.insert(i
.first
.pool());
1437 purged_snaps
.erase(i
.first
.pool());
1439 } else if (unknown
.count(i
.first
.pool())) {
1442 auto j
= purged_snaps
.find(i
.first
.pool());
1443 if (j
== purged_snaps
.end()) {
1445 purged_snaps
[i
.first
.pool()] = i
.second
.purged_snaps
;
1447 j
->second
.intersection_of(i
.second
.purged_snaps
);
1452 void PGMap::calc_osd_sum_by_class(const OSDMap
& osdmap
)
1454 osd_sum_by_class
.clear();
1455 for (auto& i
: osd_stat
) {
1456 const char *class_name
= osdmap
.crush
->get_item_class(i
.first
);
1458 osd_sum_by_class
[class_name
].add(i
.second
);
1463 void PGMap::stat_osd_add(int osd
, const osd_stat_t
&s
)
1467 if (osd
>= (int)osd_last_seq
.size()) {
1468 osd_last_seq
.resize(osd
+ 1);
1470 osd_last_seq
[osd
] = s
.seq
;
1473 void PGMap::stat_osd_sub(int osd
, const osd_stat_t
&s
)
1477 ceph_assert(osd
< (int)osd_last_seq
.size());
1478 osd_last_seq
[osd
] = 0;
/**
 * Refresh the derived digest fields and then encode the PGMapDigest
 * portion of this map into @p bl.
 *
 * @param osdmap   OSD map handed to get_rules_avail() and
 *                 calc_osd_sum_by_class()
 * @param bl       buffer passed to PGMapDigest::encode()
 * @param features feature bits passed through to PGMapDigest::encode()
 *
 * This method is not const: avail_space_by_rule, osd_sum_by_class and
 * purged_snaps are recomputed here immediately before they are encoded.
 */
void PGMap::encode_digest(const OSDMap& osdmap,
                          bufferlist& bl, uint64_t features)
{
  // fill in the per-rule available space
  get_rules_avail(osdmap, &avail_space_by_rule);
  // rebuild osd_sum_by_class from osd_stat
  calc_osd_sum_by_class(osdmap);
  // rebuild purged_snaps from pg_stat
  calc_purged_snaps();
  // explicitly call the base-class encoder, not PGMap::encode()
  PGMapDigest::encode(bl, features);
}
1490 void PGMap::encode(bufferlist
&bl
, uint64_t features
) const
1492 ENCODE_START(8, 8, bl
);
1493 encode(version
, bl
);
1494 encode(pg_stat
, bl
);
1495 encode(osd_stat
, bl
, features
);
1496 encode(last_osdmap_epoch
, bl
);
1497 encode(last_pg_scan
, bl
);
1499 encode(pool_statfs
, bl
, features
);
1503 void PGMap::decode(bufferlist::const_iterator
&bl
)
1505 DECODE_START(8, bl
);
1506 decode(version
, bl
);
1507 decode(pg_stat
, bl
);
1508 decode(osd_stat
, bl
);
1509 decode(last_osdmap_epoch
, bl
);
1510 decode(last_pg_scan
, bl
);
1512 decode(pool_statfs
, bl
);
1518 void PGMap::dump(ceph::Formatter
*f
, bool with_net
) const
1521 dump_pg_stats(f
, false);
1523 dump_osd_stats(f
, with_net
);
1526 void PGMap::dump_basic(ceph::Formatter
*f
) const
1528 f
->dump_unsigned("version", version
);
1529 f
->dump_stream("stamp") << stamp
;
1530 f
->dump_unsigned("last_osdmap_epoch", last_osdmap_epoch
);
1531 f
->dump_unsigned("last_pg_scan", last_pg_scan
);
1533 f
->open_object_section("pg_stats_sum");
1537 f
->open_object_section("osd_stats_sum");
1544 void PGMap::dump_delta(ceph::Formatter
*f
) const
1546 f
->open_object_section("pg_stats_delta");
1547 pg_sum_delta
.dump(f
);
1548 f
->dump_stream("stamp_delta") << stamp_delta
;
1552 void PGMap::dump_pg_stats(ceph::Formatter
*f
, bool brief
) const
1554 f
->open_array_section("pg_stats");
1555 for (auto i
= pg_stat
.begin();
1558 f
->open_object_section("pg_stat");
1559 f
->dump_stream("pgid") << i
->first
;
1561 i
->second
.dump_brief(f
);
1569 void PGMap::dump_pg_progress(ceph::Formatter
*f
) const
1571 f
->open_object_section("pgs");
1572 for (auto& i
: pg_stat
) {
1573 std::string n
= stringify(i
.first
);
1574 f
->open_object_section(n
.c_str());
1575 f
->dump_int("num_bytes_recovered", i
.second
.stats
.sum
.num_bytes_recovered
);
1576 f
->dump_int("num_bytes", i
.second
.stats
.sum
.num_bytes
);
1577 f
->dump_unsigned("reported_epoch", i
.second
.reported_epoch
);
1578 f
->dump_string("state", pg_state_string(i
.second
.state
));
1584 void PGMap::dump_pool_stats(ceph::Formatter
*f
) const
1586 f
->open_array_section("pool_stats");
1587 for (auto p
= pg_pool_sum
.begin();
1588 p
!= pg_pool_sum
.end();
1590 f
->open_object_section("pool_stat");
1591 f
->dump_int("poolid", p
->first
);
1592 auto q
= num_pg_by_pool
.find(p
->first
);
1593 if (q
!= num_pg_by_pool
.end())
1594 f
->dump_unsigned("num_pg", q
->second
);
1601 void PGMap::dump_osd_stats(ceph::Formatter
*f
, bool with_net
) const
1603 f
->open_array_section("osd_stats");
1604 for (auto q
= osd_stat
.begin();
1605 q
!= osd_stat
.end();
1607 f
->open_object_section("osd_stat");
1608 f
->dump_int("osd", q
->first
);
1609 q
->second
.dump(f
, with_net
);
1614 f
->open_array_section("pool_statfs");
1615 for (auto& p
: pool_statfs
) {
1616 f
->open_object_section("item");
1617 f
->dump_int("poolid", p
.first
.first
);
1618 f
->dump_int("osd", p
.first
.second
);
1625 void PGMap::dump_osd_ping_times(ceph::Formatter
*f
) const
1627 f
->open_array_section("osd_ping_times");
1628 for (const auto& [osd
, stat
] : osd_stat
) {
1629 f
->open_object_section("osd_ping_time");
1630 f
->dump_int("osd", osd
);
1631 stat
.dump_ping_time(f
);
1637 // note: dump_pg_stats_plain() is static
1638 void PGMap::dump_pg_stats_plain(
1640 const mempool::pgmap::unordered_map
<pg_t
, pg_stat_t
>& pg_stats
,
1646 tab
.define_column("PG_STAT", TextTable::LEFT
, TextTable::LEFT
);
1647 tab
.define_column("STATE", TextTable::LEFT
, TextTable::RIGHT
);
1648 tab
.define_column("UP", TextTable::LEFT
, TextTable::RIGHT
);
1649 tab
.define_column("UP_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1650 tab
.define_column("ACTING", TextTable::LEFT
, TextTable::RIGHT
);
1651 tab
.define_column("ACTING_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1654 tab
.define_column("PG_STAT", TextTable::LEFT
, TextTable::LEFT
);
1655 tab
.define_column("OBJECTS", TextTable::LEFT
, TextTable::RIGHT
);
1656 tab
.define_column("MISSING_ON_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1657 tab
.define_column("DEGRADED", TextTable::LEFT
, TextTable::RIGHT
);
1658 tab
.define_column("MISPLACED", TextTable::LEFT
, TextTable::RIGHT
);
1659 tab
.define_column("UNFOUND", TextTable::LEFT
, TextTable::RIGHT
);
1660 tab
.define_column("BYTES", TextTable::LEFT
, TextTable::RIGHT
);
1661 tab
.define_column("OMAP_BYTES*", TextTable::LEFT
, TextTable::RIGHT
);
1662 tab
.define_column("OMAP_KEYS*", TextTable::LEFT
, TextTable::RIGHT
);
1663 tab
.define_column("LOG", TextTable::LEFT
, TextTable::RIGHT
);
1664 tab
.define_column("LOG_DUPS", TextTable::LEFT
, TextTable::RIGHT
);
1665 tab
.define_column("DISK_LOG", TextTable::LEFT
, TextTable::RIGHT
);
1666 tab
.define_column("STATE", TextTable::LEFT
, TextTable::RIGHT
);
1667 tab
.define_column("STATE_STAMP", TextTable::LEFT
, TextTable::RIGHT
);
1668 tab
.define_column("VERSION", TextTable::LEFT
, TextTable::RIGHT
);
1669 tab
.define_column("REPORTED", TextTable::LEFT
, TextTable::RIGHT
);
1670 tab
.define_column("UP", TextTable::LEFT
, TextTable::RIGHT
);
1671 tab
.define_column("UP_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1672 tab
.define_column("ACTING", TextTable::LEFT
, TextTable::RIGHT
);
1673 tab
.define_column("ACTING_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1674 tab
.define_column("LAST_SCRUB", TextTable::LEFT
, TextTable::RIGHT
);
1675 tab
.define_column("SCRUB_STAMP", TextTable::LEFT
, TextTable::RIGHT
);
1676 tab
.define_column("LAST_DEEP_SCRUB", TextTable::LEFT
, TextTable::RIGHT
);
1677 tab
.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT
, TextTable::RIGHT
);
1678 tab
.define_column("SNAPTRIMQ_LEN", TextTable::LEFT
, TextTable::RIGHT
);
1679 tab
.define_column("LAST_SCRUB_DURATION", TextTable::LEFT
, TextTable::RIGHT
);
1680 tab
.define_column("SCRUB_SCHEDULING", TextTable::LEFT
, TextTable::LEFT
);
1681 tab
.define_column("OBJECTS_SCRUBBED", TextTable::LEFT
, TextTable::RIGHT
);
1682 tab
.define_column("OBJECTS_TRIMMED", TextTable::LEFT
, TextTable::RIGHT
);
1685 for (const auto& [pg
, st
] : pg_stats
) {
1688 << pg_state_string(st
.state
)
1692 << st
.acting_primary
1693 << TextTable::endrow
;
1695 ostringstream reported
;
1696 reported
<< st
.reported_epoch
<< ":" << st
.reported_seq
;
1699 << st
.stats
.sum
.num_objects
1700 << st
.stats
.sum
.num_objects_missing_on_primary
1701 << st
.stats
.sum
.num_objects_degraded
1702 << st
.stats
.sum
.num_objects_misplaced
1703 << st
.stats
.sum
.num_objects_unfound
1704 << st
.stats
.sum
.num_bytes
1705 << st
.stats
.sum
.num_omap_bytes
1706 << st
.stats
.sum
.num_omap_keys
1709 << st
.ondisk_log_size
1710 << pg_state_string(st
.state
)
1714 << pg_vector_string(st
.up
)
1716 << pg_vector_string(st
.acting
)
1717 << st
.acting_primary
1719 << st
.last_scrub_stamp
1720 << st
.last_deep_scrub
1721 << st
.last_deep_scrub_stamp
1723 << st
.last_scrub_duration
1724 << st
.dump_scrub_schedule()
1725 << st
.objects_scrubbed
1726 << st
.objects_trimmed
1727 << TextTable::endrow
;
1734 void PGMap::dump(ostream
& ss
) const
1737 dump_pg_stats(ss
, false);
1738 dump_pool_stats(ss
, false);
1739 dump_pg_sum_stats(ss
, false);
1743 void PGMap::dump_basic(ostream
& ss
) const
1745 ss
<< "version " << version
<< std::endl
;
1746 ss
<< "stamp " << stamp
<< std::endl
;
1747 ss
<< "last_osdmap_epoch " << last_osdmap_epoch
<< std::endl
;
1748 ss
<< "last_pg_scan " << last_pg_scan
<< std::endl
;
/**
 * Write this map's per-PG stats to @p ss as plain text.
 *
 * This is a thin wrapper: it forwards the member pg_stat to the static
 * helper dump_pg_stats_plain().
 *
 * @param ss    stream that receives the output
 * @param brief forwarded unchanged to dump_pg_stats_plain()
 */
void PGMap::dump_pg_stats(ostream& ss, bool brief) const
{
  dump_pg_stats_plain(ss, pg_stat, brief);
}
1756 void PGMap::dump_pool_stats(ostream
& ss
, bool header
) const
1761 tab
.define_column("POOLID", TextTable::LEFT
, TextTable::LEFT
);
1762 tab
.define_column("OBJECTS", TextTable::LEFT
, TextTable::RIGHT
);
1763 tab
.define_column("MISSING_ON_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1764 tab
.define_column("DEGRADED", TextTable::LEFT
, TextTable::RIGHT
);
1765 tab
.define_column("MISPLACED", TextTable::LEFT
, TextTable::RIGHT
);
1766 tab
.define_column("UNFOUND", TextTable::LEFT
, TextTable::RIGHT
);
1767 tab
.define_column("BYTES", TextTable::LEFT
, TextTable::RIGHT
);
1768 tab
.define_column("OMAP_BYTES*", TextTable::LEFT
, TextTable::RIGHT
);
1769 tab
.define_column("OMAP_KEYS*", TextTable::LEFT
, TextTable::RIGHT
);
1770 tab
.define_column("LOG", TextTable::LEFT
, TextTable::RIGHT
);
1771 tab
.define_column("DISK_LOG", TextTable::LEFT
, TextTable::RIGHT
);
1773 tab
.define_column("", TextTable::LEFT
, TextTable::LEFT
);
1774 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1775 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1776 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1777 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1778 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1779 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1780 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1781 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1782 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1783 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1786 for (auto p
= pg_pool_sum
.begin();
1787 p
!= pg_pool_sum
.end();
1790 << p
->second
.stats
.sum
.num_objects
1791 << p
->second
.stats
.sum
.num_objects_missing_on_primary
1792 << p
->second
.stats
.sum
.num_objects_degraded
1793 << p
->second
.stats
.sum
.num_objects_misplaced
1794 << p
->second
.stats
.sum
.num_objects_unfound
1795 << p
->second
.stats
.sum
.num_bytes
1796 << p
->second
.stats
.sum
.num_omap_bytes
1797 << p
->second
.stats
.sum
.num_omap_keys
1798 << p
->second
.log_size
1799 << p
->second
.ondisk_log_size
1800 << TextTable::endrow
;
1806 void PGMap::dump_pg_sum_stats(ostream
& ss
, bool header
) const
1811 tab
.define_column("PG_STAT", TextTable::LEFT
, TextTable::LEFT
);
1812 tab
.define_column("OBJECTS", TextTable::LEFT
, TextTable::RIGHT
);
1813 tab
.define_column("MISSING_ON_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1814 tab
.define_column("DEGRADED", TextTable::LEFT
, TextTable::RIGHT
);
1815 tab
.define_column("MISPLACED", TextTable::LEFT
, TextTable::RIGHT
);
1816 tab
.define_column("UNFOUND", TextTable::LEFT
, TextTable::RIGHT
);
1817 tab
.define_column("BYTES", TextTable::LEFT
, TextTable::RIGHT
);
1818 tab
.define_column("OMAP_BYTES*", TextTable::LEFT
, TextTable::RIGHT
);
1819 tab
.define_column("OMAP_KEYS*", TextTable::LEFT
, TextTable::RIGHT
);
1820 tab
.define_column("LOG", TextTable::LEFT
, TextTable::RIGHT
);
1821 tab
.define_column("DISK_LOG", TextTable::LEFT
, TextTable::RIGHT
);
1823 tab
.define_column("", TextTable::LEFT
, TextTable::LEFT
);
1824 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1825 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1826 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1827 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1828 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1829 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1830 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1831 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1832 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1833 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1837 << pg_sum
.stats
.sum
.num_objects
1838 << pg_sum
.stats
.sum
.num_objects_missing_on_primary
1839 << pg_sum
.stats
.sum
.num_objects_degraded
1840 << pg_sum
.stats
.sum
.num_objects_misplaced
1841 << pg_sum
.stats
.sum
.num_objects_unfound
1842 << pg_sum
.stats
.sum
.num_bytes
1843 << pg_sum
.stats
.sum
.num_omap_bytes
1844 << pg_sum
.stats
.sum
.num_omap_keys
1846 << pg_sum
.ondisk_log_size
1847 << TextTable::endrow
;
1852 void PGMap::dump_osd_stats(ostream
& ss
) const
1856 tab
.define_column("OSD_STAT", TextTable::LEFT
, TextTable::LEFT
);
1857 tab
.define_column("USED", TextTable::LEFT
, TextTable::RIGHT
);
1858 tab
.define_column("AVAIL", TextTable::LEFT
, TextTable::RIGHT
);
1859 tab
.define_column("USED_RAW", TextTable::LEFT
, TextTable::RIGHT
);
1860 tab
.define_column("TOTAL", TextTable::LEFT
, TextTable::RIGHT
);
1861 tab
.define_column("HB_PEERS", TextTable::LEFT
, TextTable::RIGHT
);
1862 tab
.define_column("PG_SUM", TextTable::LEFT
, TextTable::RIGHT
);
1863 tab
.define_column("PRIMARY_PG_SUM", TextTable::LEFT
, TextTable::RIGHT
);
1865 for (auto p
= osd_stat
.begin();
1866 p
!= osd_stat
.end();
1869 << byte_u_t(p
->second
.statfs
.get_used())
1870 << byte_u_t(p
->second
.statfs
.available
)
1871 << byte_u_t(p
->second
.statfs
.get_used_raw())
1872 << byte_u_t(p
->second
.statfs
.total
)
1873 << p
->second
.hb_peers
1874 << get_num_pg_by_osd(p
->first
)
1875 << get_num_primary_pg_by_osd(p
->first
)
1876 << TextTable::endrow
;
1880 << byte_u_t(osd_sum
.statfs
.get_used())
1881 << byte_u_t(osd_sum
.statfs
.available
)
1882 << byte_u_t(osd_sum
.statfs
.get_used_raw())
1883 << byte_u_t(osd_sum
.statfs
.total
)
1884 << TextTable::endrow
;
1889 void PGMap::dump_osd_sum_stats(ostream
& ss
) const
1893 tab
.define_column("OSD_STAT", TextTable::LEFT
, TextTable::LEFT
);
1894 tab
.define_column("USED", TextTable::LEFT
, TextTable::RIGHT
);
1895 tab
.define_column("AVAIL", TextTable::LEFT
, TextTable::RIGHT
);
1896 tab
.define_column("USED_RAW", TextTable::LEFT
, TextTable::RIGHT
);
1897 tab
.define_column("TOTAL", TextTable::LEFT
, TextTable::RIGHT
);
1900 << byte_u_t(osd_sum
.statfs
.get_used())
1901 << byte_u_t(osd_sum
.statfs
.available
)
1902 << byte_u_t(osd_sum
.statfs
.get_used_raw())
1903 << byte_u_t(osd_sum
.statfs
.total
)
1904 << TextTable::endrow
;
1909 void PGMap::get_stuck_stats(
1910 int types
, const utime_t cutoff
,
1911 mempool::pgmap::unordered_map
<pg_t
, pg_stat_t
>& stuck_pgs
) const
1913 ceph_assert(types
!= 0);
1914 for (auto i
= pg_stat
.begin();
1917 utime_t val
= cutoff
; // don't care about >= cutoff so that is infinity
1919 if ((types
& STUCK_INACTIVE
) && !(i
->second
.state
& PG_STATE_ACTIVE
)) {
1920 if (i
->second
.last_active
< val
)
1921 val
= i
->second
.last_active
;
1924 if ((types
& STUCK_UNCLEAN
) && !(i
->second
.state
& PG_STATE_CLEAN
)) {
1925 if (i
->second
.last_clean
< val
)
1926 val
= i
->second
.last_clean
;
1929 if ((types
& STUCK_DEGRADED
) && (i
->second
.state
& PG_STATE_DEGRADED
)) {
1930 if (i
->second
.last_undegraded
< val
)
1931 val
= i
->second
.last_undegraded
;
1934 if ((types
& STUCK_UNDERSIZED
) && (i
->second
.state
& PG_STATE_UNDERSIZED
)) {
1935 if (i
->second
.last_fullsized
< val
)
1936 val
= i
->second
.last_fullsized
;
1939 if ((types
& STUCK_STALE
) && (i
->second
.state
& PG_STATE_STALE
)) {
1940 if (i
->second
.last_unstale
< val
)
1941 val
= i
->second
.last_unstale
;
1944 // val is now the earliest any of the requested stuck states began
1946 stuck_pgs
[i
->first
] = i
->second
;
1951 void PGMap::dump_stuck(ceph::Formatter
*f
, int types
, utime_t cutoff
) const
1953 mempool::pgmap::unordered_map
<pg_t
, pg_stat_t
> stuck_pg_stats
;
1954 get_stuck_stats(types
, cutoff
, stuck_pg_stats
);
1955 f
->open_array_section("stuck_pg_stats");
1956 for (auto i
= stuck_pg_stats
.begin();
1957 i
!= stuck_pg_stats
.end();
1959 f
->open_object_section("pg_stat");
1960 f
->dump_stream("pgid") << i
->first
;
1967 void PGMap::dump_stuck_plain(ostream
& ss
, int types
, utime_t cutoff
) const
1969 mempool::pgmap::unordered_map
<pg_t
, pg_stat_t
> stuck_pg_stats
;
1970 get_stuck_stats(types
, cutoff
, stuck_pg_stats
);
1971 if (!stuck_pg_stats
.empty())
1972 dump_pg_stats_plain(ss
, stuck_pg_stats
, true);
1975 int PGMap::dump_stuck_pg_stats(
1979 vector
<string
>& args
) const
1981 int stuck_types
= 0;
1983 for (auto i
= args
.begin(); i
!= args
.end(); ++i
) {
1984 if (*i
== "inactive")
1985 stuck_types
|= PGMap::STUCK_INACTIVE
;
1986 else if (*i
== "unclean")
1987 stuck_types
|= PGMap::STUCK_UNCLEAN
;
1988 else if (*i
== "undersized")
1989 stuck_types
|= PGMap::STUCK_UNDERSIZED
;
1990 else if (*i
== "degraded")
1991 stuck_types
|= PGMap::STUCK_DEGRADED
;
1992 else if (*i
== "stale")
1993 stuck_types
|= PGMap::STUCK_STALE
;
1995 ds
<< "Unknown type: " << *i
<< std::endl
;
2000 utime_t
now(ceph_clock_now());
2001 utime_t cutoff
= now
- utime_t(threshold
, 0);
2004 dump_stuck_plain(ds
, stuck_types
, cutoff
);
2006 dump_stuck(f
, stuck_types
, cutoff
);
2013 void PGMap::dump_osd_perf_stats(ceph::Formatter
*f
) const
2015 f
->open_array_section("osd_perf_infos");
2016 for (auto i
= osd_stat
.begin();
2017 i
!= osd_stat
.end();
2019 f
->open_object_section("osd");
2020 f
->dump_int("id", i
->first
);
2022 f
->open_object_section("perf_stats");
2023 i
->second
.os_perf_stat
.dump(f
);
2030 void PGMap::print_osd_perf_stats(std::ostream
*ss
) const
2033 tab
.define_column("osd", TextTable::LEFT
, TextTable::RIGHT
);
2034 tab
.define_column("commit_latency(ms)", TextTable::LEFT
, TextTable::RIGHT
);
2035 tab
.define_column("apply_latency(ms)", TextTable::LEFT
, TextTable::RIGHT
);
2036 for (auto i
= osd_stat
.begin();
2037 i
!= osd_stat
.end();
2040 tab
<< i
->second
.os_perf_stat
.os_commit_latency_ns
/ 1000000ull;
2041 tab
<< i
->second
.os_perf_stat
.os_apply_latency_ns
/ 1000000ull;
2042 tab
<< TextTable::endrow
;
2047 void PGMap::dump_osd_blocked_by_stats(ceph::Formatter
*f
) const
2049 f
->open_array_section("osd_blocked_by_infos");
2050 for (auto i
= blocked_by_sum
.begin();
2051 i
!= blocked_by_sum
.end();
2053 f
->open_object_section("osd");
2054 f
->dump_int("id", i
->first
);
2055 f
->dump_int("num_blocked", i
->second
);
2060 void PGMap::print_osd_blocked_by_stats(std::ostream
*ss
) const
2063 tab
.define_column("osd", TextTable::LEFT
, TextTable::RIGHT
);
2064 tab
.define_column("num_blocked", TextTable::LEFT
, TextTable::RIGHT
);
2065 for (auto i
= blocked_by_sum
.begin();
2066 i
!= blocked_by_sum
.end();
2070 tab
<< TextTable::endrow
;
2077 * update aggregated delta
2079 * @param cct ceph context
2080 * @param ts Timestamp for the stats being delta'ed
2081 * @param old_pool_sum Previous stats sum
2082 * @param last_ts Last timestamp for pool
2083 * @param current_pool_sum Current stats sum
2084 * @param result_pool_delta Resulting pool delta
2085 * @param result_ts_delta Resulting timestamp delta
2086 * @param delta_avg_list List of last N computed deltas, used to average
2088 void PGMap::update_delta(
2091 const pool_stat_t
& old_pool_sum
,
2093 const pool_stat_t
& current_pool_sum
,
2094 pool_stat_t
*result_pool_delta
,
2095 utime_t
*result_ts_delta
,
2096 mempool::pgmap::list
<pair
<pool_stat_t
,utime_t
> > *delta_avg_list
)
2098 /* @p ts is the timestamp we want to associate with the data
2099 * in @p old_pool_sum, and on which we will base ourselves to
2100 * calculate the delta, stored in 'delta_t'.
2103 delta_t
= ts
; // start with the provided timestamp
2104 delta_t
-= *last_ts
; // take the last timestamp we saw
2105 *last_ts
= ts
; // @p ts becomes the last timestamp we saw
2107 // adjust delta_t, quick start if there is no update in a long period
2108 delta_t
= std::min(delta_t
,
2109 utime_t(2 * (cct
? cct
->_conf
->mon_delta_reset_interval
: 10), 0));
2111 // calculate a delta, and average over the last 6 deltas by default.
2112 /* start by taking a copy of @p current_pool_sum, and by
2113 * taking out the stats from @p old_pool_sum. This generates a stats
2114 * delta. Stash this stats delta in @p delta_avg_list, along with the
2115 * timestamp delta for these results.
2117 pool_stat_t d
= current_pool_sum
;
2118 d
.stats
.sub(old_pool_sum
.stats
);
2120 /* Aggregate current delta, and take out the last seen delta (if any) to
2122 * Skip calculating delta while sum was not synchronized.
2124 if(!old_pool_sum
.stats
.sum
.is_zero()) {
2125 delta_avg_list
->push_back(make_pair(d
,delta_t
));
2126 *result_ts_delta
+= delta_t
;
2127 result_pool_delta
->stats
.add(d
.stats
);
2129 size_t s
= cct
? cct
->_conf
.get_val
<uint64_t>("mon_stat_smooth_intervals") : 1;
2130 while (delta_avg_list
->size() > s
) {
2131 result_pool_delta
->stats
.sub(delta_avg_list
->front().first
.stats
);
2132 *result_ts_delta
-= delta_avg_list
->front().second
;
2133 delta_avg_list
->pop_front();
2138 * Update a given pool's deltas
2140 * @param cct Ceph Context
2141 * @param ts Timestamp for the stats being delta'ed
2142 * @param pool Pool's id
2143 * @param old_pool_sum Previous stats sum
2145 void PGMap::update_one_pool_delta(
2149 const pool_stat_t
& old_pool_sum
)
2151 if (per_pool_sum_deltas
.count(pool
) == 0) {
2152 ceph_assert(per_pool_sum_deltas_stamps
.count(pool
) == 0);
2153 ceph_assert(per_pool_sum_delta
.count(pool
) == 0);
2156 auto& sum_delta
= per_pool_sum_delta
[pool
];
2158 update_delta(cct
, ts
, old_pool_sum
, &sum_delta
.second
, pg_pool_sum
[pool
],
2159 &sum_delta
.first
, &per_pool_sum_deltas_stamps
[pool
],
2160 &per_pool_sum_deltas
[pool
]);
2164 * Update pools' deltas
2166 * @param cct CephContext
2167 * @param ts Timestamp for the stats being delta'ed
2168 * @param pg_pool_sum_old Map of pool stats for delta calcs.
2170 void PGMap::update_pool_deltas(
2171 CephContext
*cct
, const utime_t ts
,
2172 const mempool::pgmap::unordered_map
<int32_t,pool_stat_t
>& pg_pool_sum_old
)
2174 for (auto it
= pg_pool_sum_old
.begin();
2175 it
!= pg_pool_sum_old
.end(); ++it
) {
2176 update_one_pool_delta(cct
, ts
, it
->first
, it
->second
);
/**
 * Reset the aggregate (all-pools) delta state.
 *
 * Only pg_sum_delta, pg_sum_deltas and stamp_delta are touched here;
 * this function does not modify any per-pool delta members.
 */
void PGMap::clear_delta()
{
  pg_sum_delta = pool_stat_t();   // back to a default-constructed sum
  pg_sum_deltas.clear();          // drop the history of recorded deltas
  stamp_delta = utime_t();        // back to a default-constructed time
}
2187 void PGMap::generate_test_instances(list
<PGMap
*>& o
)
2189 o
.push_back(new PGMap
);
2190 list
<Incremental
*> inc
;
2191 Incremental::generate_test_instances(inc
);
2194 while (!inc
.empty()) {
2195 PGMap
*pmp
= new PGMap();
2198 o
.back()->apply_incremental(NULL
, *inc
.front());
2204 void PGMap::get_filtered_pg_stats(uint64_t state
, int64_t poolid
, int64_t osdid
,
2205 bool primary
, set
<pg_t
>& pgs
) const
2207 for (auto i
= pg_stat
.begin();
2210 if ((poolid
>= 0) && (poolid
!= i
->first
.pool()))
2212 if ((osdid
>= 0) && !(i
->second
.is_acting_osd(osdid
,primary
)))
2214 if (state
== (uint64_t)-1 || // "all"
2215 (i
->second
.state
& state
) || // matches a state bit
2216 (state
== 0 && i
->second
.state
== 0)) { // matches "unknown" (== 0)
2217 pgs
.insert(i
->first
);
2222 void PGMap::dump_filtered_pg_stats(ceph::Formatter
*f
, set
<pg_t
>& pgs
) const
2224 f
->open_array_section("pg_stats");
2225 for (auto i
= pgs
.begin(); i
!= pgs
.end(); ++i
) {
2226 const pg_stat_t
& st
= pg_stat
.at(*i
);
2227 f
->open_object_section("pg_stat");
2228 f
->dump_stream("pgid") << *i
;
// Plain-text variant: build a table with one row per pgid in `pgs`.
//
// @param ss   output stream (presumably the finished table is written to it
//             at the end -- TODO confirm)
// @param pgs  ids of the PGs to list; each one is looked up with
//             pg_stat.at(), so every id is expected to exist in pg_stat
2235 void PGMap::dump_filtered_pg_stats(ostream
& ss
, set
<pg_t
>& pgs
) const
// Sampled once so the SINCE column (now - last_change) uses the same
// reference time for every row.
2238 utime_t now
= ceph_clock_now();
// Column layout. PG and SCRUB_SCHEDULING pass LEFT for both alignment
// arguments; every other column passes LEFT, RIGHT (presumably heading
// alignment and cell alignment -- TODO confirm against TextTable).
2240 tab
.define_column("PG", TextTable::LEFT
, TextTable::LEFT
);
2241 tab
.define_column("OBJECTS", TextTable::LEFT
, TextTable::RIGHT
);
2242 tab
.define_column("DEGRADED", TextTable::LEFT
, TextTable::RIGHT
);
2243 tab
.define_column("MISPLACED", TextTable::LEFT
, TextTable::RIGHT
);
2244 tab
.define_column("UNFOUND", TextTable::LEFT
, TextTable::RIGHT
);
2245 tab
.define_column("BYTES", TextTable::LEFT
, TextTable::RIGHT
);
2246 tab
.define_column("OMAP_BYTES*", TextTable::LEFT
, TextTable::RIGHT
);
2247 tab
.define_column("OMAP_KEYS*", TextTable::LEFT
, TextTable::RIGHT
);
2248 tab
.define_column("LOG", TextTable::LEFT
, TextTable::RIGHT
);
2249 tab
.define_column("LOG_DUPS", TextTable::LEFT
, TextTable::RIGHT
);
2250 tab
.define_column("STATE", TextTable::LEFT
, TextTable::RIGHT
);
2251 tab
.define_column("SINCE", TextTable::LEFT
, TextTable::RIGHT
);
2252 tab
.define_column("VERSION", TextTable::LEFT
, TextTable::RIGHT
);
2253 tab
.define_column("REPORTED", TextTable::LEFT
, TextTable::RIGHT
);
2254 tab
.define_column("UP", TextTable::LEFT
, TextTable::RIGHT
);
2255 tab
.define_column("ACTING", TextTable::LEFT
, TextTable::RIGHT
);
2256 tab
.define_column("SCRUB_STAMP", TextTable::LEFT
, TextTable::RIGHT
);
2257 tab
.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT
, TextTable::RIGHT
);
2258 tab
.define_column("LAST_SCRUB_DURATION", TextTable::LEFT
, TextTable::RIGHT
);
2259 tab
.define_column("SCRUB_SCHEDULING", TextTable::LEFT
, TextTable::LEFT
);
// One table row per requested pgid.
2261 for (auto i
= pgs
.begin(); i
!= pgs
.end(); ++i
) {
2262 const pg_stat_t
& st
= pg_stat
.at(*i
);
// REPORTED cell text: "<reported_epoch>:<reported_seq>".
2264 ostringstream reported
;
2265 reported
<< st
.reported_epoch
<< ":" << st
.reported_seq
;
// UP / ACTING cell text: the osd vector followed by 'p' and the primary.
2267 ostringstream upstr
, actingstr
;
2268 upstr
<< pg_vector_string(st
.up
) << 'p' << st
.up_primary
;
2269 actingstr
<< pg_vector_string(st
.acting
) << 'p' << st
.acting_primary
;
// Row cells, streamed in the same order as the columns defined above.
2271 << st
.stats
.sum
.num_objects
2272 << st
.stats
.sum
.num_objects_degraded
2273 << st
.stats
.sum
.num_objects_misplaced
2274 << st
.stats
.sum
.num_objects_unfound
2275 << st
.stats
.sum
.num_bytes
2276 << st
.stats
.sum
.num_omap_bytes
2277 << st
.stats
.sum
.num_omap_keys
2280 << pg_state_string(st
.state
)
// SINCE: time elapsed since the PG's last_change stamp.
2281 << utimespan_str(now
- st
.last_change
)
2286 << st
.last_scrub_stamp
2287 << st
.last_deep_scrub_stamp
2288 << st
.last_scrub_duration
2289 << st
.dump_scrub_schedule()
// endrow terminates the current table row.
2290 << TextTable::endrow
;
// Report, for one pool, its recovery status, recovery rate, client I/O rate
// and (for tier pools) cache tier I/O rate.
//
// Structured output goes to `f` as a "pool" object section with nested
// "recovery", "recovery_rate", "client_io_rate" and "cache_io_rate" sections.
// Text output is collected in `tss` and appended to `*rs`.
//
// @param poolid   id of the pool to report on
// @param osd_map  used to resolve the pool name and the pg_pool_t entry
// @param rs       stream receiving the text report
// `f` is the formatter; the `!f` tests below show it may be null, in which
// case the text lines are composed instead.
// NOTE(review): confirm each f-> call is guarded against a null `f`.
2296 void PGMap::dump_pool_stats_and_io_rate(int64_t poolid
, const OSDMap
&osd_map
,
2298 stringstream
*rs
) const {
2299 const string
& pool_name
= osd_map
.get_pool_name(poolid
);
2301 f
->open_object_section("pool");
2302 f
->dump_string("pool_name", pool_name
.c_str());
2303 f
->dump_int("pool_id", poolid
);
2304 f
->open_object_section("recovery");
// Recovery status: the helper receives both the formatter and `sl`; the
// entries of `sl` are only turned into text when there is no formatter.
2308 pool_recovery_summary(f
, &sl
, poolid
);
2309 if (!f
&& !sl
.empty()) {
2311 tss
<< " " << p
<< "\n";
2314 f
->close_section(); // object section recovery
2315 f
->open_object_section("recovery_rate");
// Recovery rate: text line is added only without a formatter and when the
// helper produced something in `rss`.
2318 pool_recovery_rate_summary(f
, &rss
, poolid
);
2319 if (!f
&& !rss
.str().empty())
2320 tss
<< " recovery io " << rss
.str() << "\n";
2322 f
->close_section(); // object section recovery_rate
2323 f
->open_object_section("client_io_rate");
// Client I/O rate, same pattern as above.
2327 pool_client_io_rate_summary(f
, &rss
, poolid
);
2328 if (!f
&& !rss
.str().empty())
2329 tss
<< " client io " << rss
.str() << "\n";
2330 // dump cache tier IO rate for cache pool
// NOTE(review): `pool` is dereferenced without a visible null check --
// confirm get_pg_pool() cannot return null for this poolid.
2331 const pg_pool_t
*pool
= osd_map
.get_pg_pool(poolid
);
2332 if (pool
->is_tier()) {
2334 f
->close_section(); // object section client_io_rate
2335 f
->open_object_section("cache_io_rate");
2339 pool_cache_io_rate_summary(f
, &rss
, poolid
);
2340 if (!f
&& !rss
.str().empty())
2341 tss
<< " cache tier io " << rss
.str() << "\n";
2344 f
->close_section(); // object section cache_io_rate
2345 f
->close_section(); // object section pool
// Text report: a "pool <name> id <id>" heading, then the collected lines
// when `tss` is non-empty.
2347 *rs
<< "pool " << pool_name
<< " id " << poolid
<< "\n";
2348 if (!tss
.str().empty())
2349 *rs
<< tss
.str() << "\n";
// Placeholder text (presumably the else branch for an empty `tss` -- TODO
// confirm).
2351 *rs
<< " nothing is going on\n\n";
2355 // Get crush parentage for an osd (skip root)
2356 set
<std::string
> PGMap::osd_parentage(const OSDMap
& osdmap
, int id
) const
2358 set
<std::string
> reporters_by_subtree
;
2359 auto reporter_subtree_level
= g_conf().get_val
<string
>("mon_osd_reporter_subtree_level");
2361 auto loc
= osdmap
.crush
->get_full_location(id
);
2362 for (auto& [parent_bucket_type
, parent_id
] : loc
) {
2363 // Should we show the root? Might not be too informative like "default"
2364 if (parent_bucket_type
!= "root" &&
2365 parent_bucket_type
!= reporter_subtree_level
) {
2366 reporters_by_subtree
.insert(parent_id
);
2369 return reporters_by_subtree
;
2372 void PGMap::get_health_checks(
2374 const OSDMap
& osdmap
,
2375 health_check_map_t
*checks
) const
2377 utime_t now
= ceph_clock_now();
2378 const auto max
= cct
->_conf
.get_val
<uint64_t>("mon_health_max_detail");
2379 const auto& pools
= osdmap
.get_pools();
2381 typedef enum pg_consequence_t
{
2382 UNAVAILABLE
= 1, // Client IO to the pool may block
2383 DEGRADED
= 2, // Fewer than the requested number of replicas are present
2384 BACKFILL_FULL
= 3, // Backfill is blocked for space considerations
2385 // This may or may not be a deadlock condition.
2386 DAMAGED
= 4, // The data may be missing or inconsistent on disk and
2388 RECOVERY_FULL
= 5 // Recovery is blocked because OSDs are full
2391 // For a given PG state, how should it be reported at the pool level?
2392 class PgStateResponse
{
2394 pg_consequence_t consequence
;
2395 typedef std::function
< utime_t(const pg_stat_t
&) > stuck_cb
;
2396 stuck_cb stuck_since
;
2399 PgStateResponse(const pg_consequence_t
& c
, stuck_cb
&& s
)
2400 : consequence(c
), stuck_since(std::move(s
)), invert(false)
2404 PgStateResponse(const pg_consequence_t
& c
, stuck_cb
&& s
, bool i
)
2405 : consequence(c
), stuck_since(std::move(s
)), invert(i
)
2410 // Record the PG state counts that contributed to a reported pool state
2413 // Map of PG_STATE_* to number of pgs in that state.
2414 std::map
<unsigned, unsigned> states
;
2416 // List of all PG IDs that had a state contributing
2417 // to this health condition.
2420 std::map
<pg_t
, std::string
> pg_messages
;
2423 // Map of PG state to how to respond to it
2424 std::map
<unsigned, PgStateResponse
> state_to_response
= {
2425 // Immediate reports
2426 { PG_STATE_INCONSISTENT
, {DAMAGED
, {}} },
2427 { PG_STATE_INCOMPLETE
, {UNAVAILABLE
, {}} },
2428 { PG_STATE_SNAPTRIM_ERROR
, {DAMAGED
, {}} },
2429 { PG_STATE_RECOVERY_UNFOUND
, {DAMAGED
, {}} },
2430 { PG_STATE_BACKFILL_UNFOUND
, {DAMAGED
, {}} },
2431 { PG_STATE_BACKFILL_TOOFULL
, {BACKFILL_FULL
, {}} },
2432 { PG_STATE_RECOVERY_TOOFULL
, {RECOVERY_FULL
, {}} },
2433 { PG_STATE_DEGRADED
, {DEGRADED
, {}} },
2434 { PG_STATE_DOWN
, {UNAVAILABLE
, {}} },
2435 // Delayed (wait until stuck) reports
2436 { PG_STATE_PEERING
, {UNAVAILABLE
, [](const pg_stat_t
&p
){return p
.last_peered
;} } },
2437 { PG_STATE_UNDERSIZED
, {DEGRADED
, [](const pg_stat_t
&p
){return p
.last_fullsized
;} } },
2438 { PG_STATE_STALE
, {UNAVAILABLE
, [](const pg_stat_t
&p
){return p
.last_unstale
;} } },
2439 // Delayed and inverted reports
2440 { PG_STATE_ACTIVE
, {UNAVAILABLE
, [](const pg_stat_t
&p
){return p
.last_active
;}, true} }
2443 // Specialized state printer that takes account of inversion of
2444 // ACTIVE, CLEAN checks.
2445 auto state_name
= [](const uint64_t &state
) {
2446 // Special cases for the states that are inverted checks
2447 if (state
== PG_STATE_CLEAN
) {
2448 return std::string("unclean");
2449 } else if (state
== PG_STATE_ACTIVE
) {
2450 return std::string("inactive");
2452 return pg_state_string(state
);
2456 // Map of what is wrong to information about why, implicitly also stores
2457 // the list of what is wrong.
2458 std::map
<pg_consequence_t
, PgCauses
> detected
;
2460 // Optimisation: trim down the number of checks to apply based on
2461 // the summary counters
2462 std::map
<unsigned, PgStateResponse
> possible_responses
;
2463 for (const auto &i
: num_pg_by_state
) {
2464 for (const auto &j
: state_to_response
) {
2465 if (!j
.second
.invert
) {
2466 // Check for normal tests by seeing if any pgs have the flag
2467 if (i
.first
& j
.first
) {
2468 possible_responses
.insert(j
);
2474 for (const auto &j
: state_to_response
) {
2475 if (j
.second
.invert
) {
2476 // Check for inverted tests by seeing if not-all pgs have the flag
2477 const auto &found
= num_pg_by_state
.find(j
.first
);
2478 if (found
== num_pg_by_state
.end() || found
->second
!= num_pg
) {
2479 possible_responses
.insert(j
);
2484 utime_t cutoff
= now
- utime_t(cct
->_conf
.get_val
<int64_t>("mon_pg_stuck_threshold"), 0);
2485 // Loop over all PGs, if there are any possibly-unhealthy states in there
2486 if (!possible_responses
.empty()) {
2487 for (const auto& i
: pg_stat
) {
2488 const auto &pg_id
= i
.first
;
2489 const auto &pg_info
= i
.second
;
2491 for (const auto &j
: state_to_response
) {
2492 const auto &pg_response_state
= j
.first
;
2493 const auto &pg_response
= j
.second
;
2495 // Apply the state test
2496 if (!(bool(pg_info
.state
& pg_response_state
) != pg_response
.invert
)) {
2500 // Apply stuckness test if needed
2501 if (pg_response
.stuck_since
) {
2502 // Delayed response, check for stuckness
2503 utime_t last_whatever
= pg_response
.stuck_since(pg_info
);
2504 if (last_whatever
.is_zero() &&
2505 pg_info
.last_change
>= cutoff
) {
2506 // still moving, ignore
2508 } else if (last_whatever
>= cutoff
) {
2509 // Not stuck enough, ignore.
2516 auto &causes
= detected
[pg_response
.consequence
];
2517 causes
.states
[pg_response_state
]++;
2518 causes
.pgs
.insert(pg_id
);
2520 // Don't bother composing detail string if we have already recorded
2522 if (causes
.pg_messages
.size() > max
) {
2526 std::ostringstream ss
;
2527 if (pg_response
.stuck_since
) {
2528 utime_t since
= pg_response
.stuck_since(pg_info
);
2529 ss
<< "pg " << pg_id
<< " is stuck " << state_name(pg_response_state
);
2530 if (since
== utime_t()) {
2531 ss
<< " since forever";
2533 utime_t dur
= now
- since
;
2534 ss
<< " for " << utimespan_str(dur
);
2536 ss
<< ", current state " << pg_state_string(pg_info
.state
)
2537 << ", last acting " << pg_vector_string(pg_info
.acting
);
2539 ss
<< "pg " << pg_id
<< " is "
2540 << pg_state_string(pg_info
.state
);
2541 ss
<< ", acting " << pg_vector_string(pg_info
.acting
);
2542 if (pg_info
.stats
.sum
.num_objects_unfound
) {
2543 ss
<< ", " << pg_info
.stats
.sum
.num_objects_unfound
2548 if (pg_info
.state
& PG_STATE_INCOMPLETE
) {
2549 const pg_pool_t
*pi
= osdmap
.get_pg_pool(pg_id
.pool());
2550 if (pi
&& pi
->min_size
> 1) {
2551 ss
<< " (reducing pool "
2552 << osdmap
.get_pool_name(pg_id
.pool())
2553 << " min_size from " << (int)pi
->min_size
2554 << " may help; search ceph.com/docs for 'incomplete')";
2558 causes
.pg_messages
[pg_id
] = ss
.str();
2562 dout(10) << __func__
<< " skipping loop over PGs: counters look OK" << dendl
;
2565 for (const auto &i
: detected
) {
2566 std::string health_code
;
2567 health_status_t sev
;
2568 std::string summary
;
2571 health_code
= "PG_AVAILABILITY";
2573 summary
= "Reduced data availability: ";
2576 health_code
= "PG_DEGRADED";
2577 summary
= "Degraded data redundancy: ";
2581 health_code
= "PG_BACKFILL_FULL";
2582 summary
= "Low space hindering backfill (add storage if this doesn't resolve itself): ";
2586 health_code
= "PG_DAMAGED";
2587 summary
= "Possible data damage: ";
2591 health_code
= "PG_RECOVERY_FULL";
2592 summary
= "Full OSDs blocking recovery: ";
2599 if (i
.first
== DEGRADED
) {
2600 if (pg_sum
.stats
.sum
.num_objects_degraded
&&
2601 pg_sum
.stats
.sum
.num_object_copies
> 0) {
2602 double pc
= (double)pg_sum
.stats
.sum
.num_objects_degraded
/
2603 (double)pg_sum
.stats
.sum
.num_object_copies
* (double)100.0;
2605 snprintf(b
, sizeof(b
), "%.3lf", pc
);
2607 ss
<< pg_sum
.stats
.sum
.num_objects_degraded
2608 << "/" << pg_sum
.stats
.sum
.num_object_copies
<< " objects degraded ("
2611 // Throw in a comma for the benefit of the following PG counts
2612 summary
+= ss
.str() + ", ";
2616 // Compose summary message saying how many PGs in what states led
2617 // to this health check failing
2618 std::vector
<std::string
> pg_msgs
;
2620 for (const auto &j
: i
.second
.states
) {
2621 std::ostringstream msg
;
2622 msg
<< j
.second
<< (j
.second
> 1 ? " pgs " : " pg ") << state_name(j
.first
);
2623 pg_msgs
.push_back(msg
.str());
2626 summary
+= joinify(pg_msgs
.begin(), pg_msgs
.end(), std::string(", "));
2628 health_check_t
*check
= &checks
->add(
2634 // Compose list of PGs contributing to this health check failing
2635 for (const auto &j
: i
.second
.pg_messages
) {
2636 check
->detail
.push_back(j
.second
);
2641 if (pg_sum
.stats
.sum
.num_scrub_errors
) {
2643 ss
<< pg_sum
.stats
.sum
.num_scrub_errors
<< " scrub errors";
2644 checks
->add("OSD_SCRUB_ERRORS", HEALTH_ERR
, ss
.str(),
2645 pg_sum
.stats
.sum
.num_scrub_errors
);
2648 // LARGE_OMAP_OBJECTS
2649 if (pg_sum
.stats
.sum
.num_large_omap_objects
) {
2650 list
<string
> detail
;
2651 for (auto &pool
: pools
) {
2652 const string
& pool_name
= osdmap
.get_pool_name(pool
.first
);
2653 auto it2
= pg_pool_sum
.find(pool
.first
);
2654 if (it2
== pg_pool_sum
.end()) {
2657 const pool_stat_t
*pstat
= &it2
->second
;
2658 if (pstat
== nullptr) {
2661 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
2662 if (sum
.num_large_omap_objects
) {
2664 ss
<< sum
.num_large_omap_objects
<< " large objects found in pool "
2665 << "'" << pool_name
<< "'";
2666 detail
.push_back(ss
.str());
2669 if (!detail
.empty()) {
2671 ss
<< pg_sum
.stats
.sum
.num_large_omap_objects
<< " large omap objects";
2672 auto& d
= checks
->add("LARGE_OMAP_OBJECTS", HEALTH_WARN
, ss
.str(),
2673 pg_sum
.stats
.sum
.num_large_omap_objects
);
2675 tip
<< "Search the cluster log for 'Large omap object found' for more "
2677 detail
.push_back(tip
.str());
2678 d
.detail
.swap(detail
);
2682 // CACHE_POOL_NEAR_FULL
2684 list
<string
> detail
;
2685 unsigned num_pools
= 0;
2686 for (auto& p
: pools
) {
2687 if ((!p
.second
.target_max_objects
&& !p
.second
.target_max_bytes
) ||
2688 !pg_pool_sum
.count(p
.first
)) {
2691 bool nearfull
= false;
2692 const string
& name
= osdmap
.get_pool_name(p
.first
);
2693 const pool_stat_t
& st
= get_pg_pool_sum_stat(p
.first
);
2694 uint64_t ratio
= p
.second
.cache_target_full_ratio_micro
+
2695 ((1000000 - p
.second
.cache_target_full_ratio_micro
) *
2696 cct
->_conf
->mon_cache_target_full_warn_ratio
);
2697 if (p
.second
.target_max_objects
&&
2698 (uint64_t)(st
.stats
.sum
.num_objects
-
2699 st
.stats
.sum
.num_objects_hit_set_archive
) >
2700 p
.second
.target_max_objects
* (ratio
/ 1000000.0)) {
2702 ss
<< "cache pool '" << name
<< "' with "
2703 << si_u_t(st
.stats
.sum
.num_objects
)
2704 << " objects at/near target max "
2705 << si_u_t(p
.second
.target_max_objects
) << " objects";
2706 detail
.push_back(ss
.str());
2709 if (p
.second
.target_max_bytes
&&
2710 (uint64_t)(st
.stats
.sum
.num_bytes
-
2711 st
.stats
.sum
.num_bytes_hit_set_archive
) >
2712 p
.second
.target_max_bytes
* (ratio
/ 1000000.0)) {
2714 ss
<< "cache pool '" << name
2715 << "' with " << byte_u_t(st
.stats
.sum
.num_bytes
)
2716 << " at/near target max "
2717 << byte_u_t(p
.second
.target_max_bytes
);
2718 detail
.push_back(ss
.str());
2725 if (!detail
.empty()) {
2727 ss
<< num_pools
<< " cache pools at or near target size";
2728 auto& d
= checks
->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN
, ss
.str(),
2730 d
.detail
.swap(detail
);
2735 unsigned num_in
= osdmap
.get_num_in_osds();
2736 auto sum_pg_up
= std::max(static_cast<size_t>(pg_sum
.up
), pg_stat
.size());
2737 const auto min_pg_per_osd
=
2738 cct
->_conf
.get_val
<uint64_t>("mon_pg_warn_min_per_osd");
2739 if (num_in
&& min_pg_per_osd
> 0 && osdmap
.get_pools().size() > 0) {
2740 auto per
= sum_pg_up
/ num_in
;
2741 if (per
< min_pg_per_osd
&& per
) {
2743 ss
<< "too few PGs per OSD (" << per
2744 << " < min " << min_pg_per_osd
<< ")";
2745 checks
->add("TOO_FEW_PGS", HEALTH_WARN
, ss
.str(),
2746 min_pg_per_osd
- per
);
2751 auto max_pg_per_osd
= cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd");
2752 if (num_in
&& max_pg_per_osd
> 0) {
2753 auto per
= sum_pg_up
/ num_in
;
2754 if (per
> max_pg_per_osd
) {
2756 ss
<< "too many PGs per OSD (" << per
2757 << " > max " << max_pg_per_osd
<< ")";
2758 checks
->add("TOO_MANY_PGS", HEALTH_WARN
, ss
.str(),
2759 per
- max_pg_per_osd
);
2764 auto warn_too_few_osds
= cct
->_conf
.get_val
<bool>("mon_warn_on_too_few_osds");
2765 auto osd_pool_default_size
= cct
->_conf
.get_val
<uint64_t>("osd_pool_default_size");
2766 if (warn_too_few_osds
&& osdmap
.get_num_osds() < osd_pool_default_size
) {
2768 ss
<< "OSD count " << osdmap
.get_num_osds()
2769 << " < osd_pool_default_size " << osd_pool_default_size
;
2770 checks
->add("TOO_FEW_OSDS", HEALTH_WARN
, ss
.str(),
2771 osd_pool_default_size
- osdmap
.get_num_osds());
2775 // Convert milliseconds to microseconds
2776 auto warn_slow_ping_time
= cct
->_conf
.get_val
<double>("mon_warn_on_slow_ping_time") * 1000;
2777 auto grace
= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_grace");
2778 if (warn_slow_ping_time
== 0) {
2779 double ratio
= cct
->_conf
.get_val
<double>("mon_warn_on_slow_ping_ratio");
2780 warn_slow_ping_time
= grace
;
2781 warn_slow_ping_time
*= 1000000 * ratio
; // Seconds of grace to microseconds at ratio
2783 if (warn_slow_ping_time
> 0) {
2785 struct mon_ping_item_t
{
2791 bool operator<(const mon_ping_item_t
& rhs
) const {
2792 if (pingtime
< rhs
.pingtime
)
2794 if (pingtime
> rhs
.pingtime
)
2796 if (from
< rhs
.from
)
2798 if (from
> rhs
.from
)
2804 list
<string
> detail_back
;
2805 list
<string
> detail_front
;
2806 list
<string
> detail
;
2807 set
<mon_ping_item_t
> back_sorted
, front_sorted
;
2808 for (auto i
: osd_stat
) {
2809 for (auto j
: i
.second
.hb_pingtime
) {
2811 // Maybe source info is old
2812 if (now
.sec() - j
.second
.last_update
> grace
* 60)
2815 mon_ping_item_t back
;
2816 back
.pingtime
= std::max(j
.second
.back_pingtime
[0], j
.second
.back_pingtime
[1]);
2817 back
.pingtime
= std::max(back
.pingtime
, j
.second
.back_pingtime
[2]);
2818 back
.from
= i
.first
;
2820 if (back
.pingtime
> warn_slow_ping_time
) {
2821 back
.improving
= (j
.second
.back_pingtime
[0] < j
.second
.back_pingtime
[1]
2822 && j
.second
.back_pingtime
[1] < j
.second
.back_pingtime
[2]);
2823 back_sorted
.emplace(back
);
2826 mon_ping_item_t front
;
2827 front
.pingtime
= std::max(j
.second
.front_pingtime
[0], j
.second
.front_pingtime
[1]);
2828 front
.pingtime
= std::max(front
.pingtime
, j
.second
.front_pingtime
[2]);
2829 front
.from
= i
.first
;
2831 if (front
.pingtime
> warn_slow_ping_time
) {
2832 front
.improving
= (j
.second
.front_pingtime
[0] < j
.second
.front_pingtime
[1]
2833 && j
.second
.front_pingtime
[1] < j
.second
.back_pingtime
[2]);
2834 front_sorted
.emplace(front
);
2837 if (i
.second
.num_shards_repaired
>
2838 cct
->_conf
.get_val
<uint64_t>("mon_osd_warn_num_repaired")) {
2840 ss
<< "osd." << i
.first
<< " had " << i
.second
.num_shards_repaired
<< " reads repaired";
2841 detail
.push_back(ss
.str());
2844 if (!detail
.empty()) {
2846 ss
<< "Too many repaired reads on " << detail
.size() << " OSDs";
2847 auto& d
= checks
->add("OSD_TOO_MANY_REPAIRS", HEALTH_WARN
, ss
.str(),
2849 d
.detail
.swap(detail
);
2851 int max_detail
= 10;
2852 for (auto &sback
: boost::adaptors::reverse(back_sorted
)) {
2854 if (max_detail
== 0) {
2855 ss
<< "Truncated long network list. Use ceph daemon mgr.# dump_osd_network for more information";
2856 detail_back
.push_back(ss
.str());
2860 ss
<< "Slow OSD heartbeats on back from osd." << sback
.from
2861 << " [" << osd_parentage(osdmap
, sback
.from
) << "]"
2862 << (osdmap
.is_down(sback
.from
) ? " (down)" : "")
2863 << " to osd." << sback
.to
2864 << " [" << osd_parentage(osdmap
, sback
.to
) << "]"
2865 << (osdmap
.is_down(sback
.to
) ? " (down)" : "")
2866 << " " << fixed_u_to_string(sback
.pingtime
, 3) << " msec"
2867 << (sback
.improving
? " possibly improving" : "");
2868 detail_back
.push_back(ss
.str());
2871 for (auto &sfront
: boost::adaptors::reverse(front_sorted
)) {
2873 if (max_detail
== 0) {
2874 ss
<< "Truncated long network list. Use ceph daemon mgr.# dump_osd_network for more information";
2875 detail_front
.push_back(ss
.str());
2879 // Get crush parentage for each osd
2880 ss
<< "Slow OSD heartbeats on front from osd." << sfront
.from
2881 << " [" << osd_parentage(osdmap
, sfront
.from
) << "]"
2882 << (osdmap
.is_down(sfront
.from
) ? " (down)" : "")
2883 << " to osd." << sfront
.to
2884 << " [" << osd_parentage(osdmap
, sfront
.to
) << "]"
2885 << (osdmap
.is_down(sfront
.to
) ? " (down)" : "")
2886 << " " << fixed_u_to_string(sfront
.pingtime
, 3) << " msec"
2887 << (sfront
.improving
? " possibly improving" : "");
2888 detail_front
.push_back(ss
.str());
2890 if (detail_back
.size() != 0) {
2892 ss
<< "Slow OSD heartbeats on back (longest "
2893 << fixed_u_to_string(back_sorted
.rbegin()->pingtime
, 3) << "ms)";
2894 auto& d
= checks
->add("OSD_SLOW_PING_TIME_BACK", HEALTH_WARN
, ss
.str(),
2895 back_sorted
.size());
2896 d
.detail
.swap(detail_back
);
2898 if (detail_front
.size() != 0) {
2900 ss
<< "Slow OSD heartbeats on front (longest "
2901 << fixed_u_to_string(front_sorted
.rbegin()->pingtime
, 3) << "ms)";
2902 auto& d
= checks
->add("OSD_SLOW_PING_TIME_FRONT", HEALTH_WARN
, ss
.str(),
2903 front_sorted
.size());
2904 d
.detail
.swap(detail_front
);
2909 // MANY_OBJECTS_PER_PG
2910 if (!pg_stat
.empty()) {
2911 list
<string
> pgp_detail
, many_detail
;
2912 const auto mon_pg_warn_min_objects
=
2913 cct
->_conf
.get_val
<int64_t>("mon_pg_warn_min_objects");
2914 const auto mon_pg_warn_min_pool_objects
=
2915 cct
->_conf
.get_val
<int64_t>("mon_pg_warn_min_pool_objects");
2916 const auto mon_pg_warn_max_object_skew
=
2917 cct
->_conf
.get_val
<double>("mon_pg_warn_max_object_skew");
2918 for (auto p
= pg_pool_sum
.begin();
2919 p
!= pg_pool_sum
.end();
2921 const pg_pool_t
*pi
= osdmap
.get_pg_pool(p
->first
);
2923 continue; // in case osdmap changes haven't propagated to PGMap yet
2924 const string
& name
= osdmap
.get_pool_name(p
->first
);
2925 // NOTE: we use pg_num_target and pgp_num_target for the purposes of
2926 // the warnings. If the cluster is failing to converge on the target
2927 // values that is a separate issue!
2928 if (pi
->get_pg_num_target() > pi
->get_pgp_num_target() &&
2929 !(name
.find(".DELETED") != string::npos
&&
2930 cct
->_conf
->mon_fake_pool_delete
)) {
2932 ss
<< "pool " << name
<< " pg_num "
2933 << pi
->get_pg_num_target()
2934 << " > pgp_num " << pi
->get_pgp_num_target();
2935 pgp_detail
.push_back(ss
.str());
2937 int average_objects_per_pg
= pg_sum
.stats
.sum
.num_objects
/ pg_stat
.size();
2938 if (average_objects_per_pg
> 0 &&
2939 pg_sum
.stats
.sum
.num_objects
>= mon_pg_warn_min_objects
&&
2940 p
->second
.stats
.sum
.num_objects
>= mon_pg_warn_min_pool_objects
) {
2941 int objects_per_pg
= p
->second
.stats
.sum
.num_objects
/
2942 pi
->get_pg_num_target();
2943 float ratio
= (float)objects_per_pg
/ (float)average_objects_per_pg
;
2944 if (mon_pg_warn_max_object_skew
> 0 &&
2945 ratio
> mon_pg_warn_max_object_skew
) {
2947 if (pi
->pg_autoscale_mode
!= pg_pool_t::pg_autoscale_mode_t::ON
) {
2948 ss
<< "pool " << name
<< " objects per pg ("
2949 << objects_per_pg
<< ") is more than " << ratio
2950 << " times cluster average ("
2951 << average_objects_per_pg
<< ")";
2952 many_detail
.push_back(ss
.str());
2957 if (!pgp_detail
.empty()) {
2959 ss
<< pgp_detail
.size() << " pools have pg_num > pgp_num";
2960 auto& d
= checks
->add("SMALLER_PGP_NUM", HEALTH_WARN
, ss
.str(),
2962 d
.detail
.swap(pgp_detail
);
2964 if (!many_detail
.empty()) {
2966 ss
<< many_detail
.size() << " pools have many more objects per pg than"
2968 auto& d
= checks
->add("MANY_OBJECTS_PER_PG", HEALTH_WARN
, ss
.str(),
2969 many_detail
.size());
2970 d
.detail
.swap(many_detail
);
2977 float warn_threshold
= (float)g_conf().get_val
<int64_t>("mon_pool_quota_warn_threshold")/100;
2978 float crit_threshold
= (float)g_conf().get_val
<int64_t>("mon_pool_quota_crit_threshold")/100;
2979 list
<string
> full_detail
, nearfull_detail
;
2980 unsigned full_pools
= 0, nearfull_pools
= 0;
2981 for (auto it
: pools
) {
2982 auto it2
= pg_pool_sum
.find(it
.first
);
2983 if (it2
== pg_pool_sum
.end()) {
2986 const pool_stat_t
*pstat
= &it2
->second
;
2987 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
2988 const string
& pool_name
= osdmap
.get_pool_name(it
.first
);
2989 const pg_pool_t
&pool
= it
.second
;
2990 bool full
= false, nearfull
= false;
2991 if (pool
.quota_max_objects
> 0) {
2993 if ((uint64_t)sum
.num_objects
>= pool
.quota_max_objects
) {
2994 } else if (crit_threshold
> 0 &&
2995 sum
.num_objects
>= pool
.quota_max_objects
*crit_threshold
) {
2996 ss
<< "pool '" << pool_name
2997 << "' has " << sum
.num_objects
<< " objects"
2998 << " (max " << pool
.quota_max_objects
<< ")";
2999 full_detail
.push_back(ss
.str());
3001 } else if (warn_threshold
> 0 &&
3002 sum
.num_objects
>= pool
.quota_max_objects
*warn_threshold
) {
3003 ss
<< "pool '" << pool_name
3004 << "' has " << sum
.num_objects
<< " objects"
3005 << " (max " << pool
.quota_max_objects
<< ")";
3006 nearfull_detail
.push_back(ss
.str());
3010 if (pool
.quota_max_bytes
> 0) {
3012 if ((uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) {
3013 } else if (crit_threshold
> 0 &&
3014 sum
.num_bytes
>= pool
.quota_max_bytes
*crit_threshold
) {
3015 ss
<< "pool '" << pool_name
3016 << "' has " << byte_u_t(sum
.num_bytes
)
3017 << " (max " << byte_u_t(pool
.quota_max_bytes
) << ")";
3018 full_detail
.push_back(ss
.str());
3020 } else if (warn_threshold
> 0 &&
3021 sum
.num_bytes
>= pool
.quota_max_bytes
*warn_threshold
) {
3022 ss
<< "pool '" << pool_name
3023 << "' has " << byte_u_t(sum
.num_bytes
)
3024 << " (max " << byte_u_t(pool
.quota_max_bytes
) << ")";
3025 nearfull_detail
.push_back(ss
.str());
3038 ss
<< full_pools
<< " pools full";
3039 auto& d
= checks
->add("POOL_FULL", HEALTH_ERR
, ss
.str(), full_pools
);
3040 d
.detail
.swap(full_detail
);
3042 if (nearfull_pools
) {
3044 ss
<< nearfull_pools
<< " pools nearfull";
3045 auto& d
= checks
->add("POOL_NEAR_FULL", HEALTH_WARN
, ss
.str(), nearfull_pools
);
3046 d
.detail
.swap(nearfull_detail
);
3051 if (pg_sum
.stats
.sum
.num_objects_misplaced
&&
3052 pg_sum
.stats
.sum
.num_object_copies
> 0 &&
3053 cct
->_conf
->mon_warn_on_misplaced
) {
3054 double pc
= (double)pg_sum
.stats
.sum
.num_objects_misplaced
/
3055 (double)pg_sum
.stats
.sum
.num_object_copies
* (double)100.0;
3057 snprintf(b
, sizeof(b
), "%.3lf", pc
);
3059 ss
<< pg_sum
.stats
.sum
.num_objects_misplaced
3060 << "/" << pg_sum
.stats
.sum
.num_object_copies
<< " objects misplaced ("
3062 checks
->add("OBJECT_MISPLACED", HEALTH_WARN
, ss
.str(),
3063 pg_sum
.stats
.sum
.num_objects_misplaced
);
3067 if (pg_sum
.stats
.sum
.num_objects_unfound
&&
3068 pg_sum
.stats
.sum
.num_objects
) {
3069 double pc
= (double)pg_sum
.stats
.sum
.num_objects_unfound
/
3070 (double)pg_sum
.stats
.sum
.num_objects
* (double)100.0;
3072 snprintf(b
, sizeof(b
), "%.3lf", pc
);
3074 ss
<< pg_sum
.stats
.sum
.num_objects_unfound
3075 << "/" << pg_sum
.stats
.sum
.num_objects
<< " objects unfound (" << b
<< "%)";
3076 auto& d
= checks
->add("OBJECT_UNFOUND", HEALTH_WARN
, ss
.str(),
3077 pg_sum
.stats
.sum
.num_objects_unfound
);
3079 for (auto& p
: pg_stat
) {
3080 if (p
.second
.stats
.sum
.num_objects_unfound
) {
3082 ss
<< "pg " << p
.first
3083 << " has " << p
.second
.stats
.sum
.num_objects_unfound
3084 << " unfound objects";
3085 d
.detail
.push_back(ss
.str());
3086 if (d
.detail
.size() > max
) {
3087 d
.detail
.push_back("(additional pgs left out for brevity)");
3096 // SLOW_OPS unifies them in mimic.
3097 if (osdmap
.require_osd_release
< ceph_release_t::mimic
&&
3098 cct
->_conf
->mon_osd_warn_op_age
> 0 &&
3099 !osd_sum
.op_queue_age_hist
.h
.empty() &&
3100 osd_sum
.op_queue_age_hist
.upper_bound() / 1000.0 >
3101 cct
->_conf
->mon_osd_warn_op_age
) {
3102 list
<string
> warn_detail
, error_detail
;
3103 unsigned warn
= 0, error
= 0;
3105 cct
->_conf
->mon_osd_warn_op_age
* cct
->_conf
->mon_osd_err_op_age_ratio
;
3106 const pow2_hist_t
& h
= osd_sum
.op_queue_age_hist
;
3107 for (unsigned i
= h
.h
.size() - 1; i
> 0; --i
) {
3108 float ub
= (float)(1 << i
) / 1000.0;
3109 if (ub
< cct
->_conf
->mon_osd_warn_op_age
)
3113 ss
<< h
.h
[i
] << " ops are blocked > " << ub
<< " sec";
3116 error_detail
.push_back(ss
.str());
3119 warn_detail
.push_back(ss
.str());
3124 map
<float,set
<int>> warn_osd_by_max
; // max -> osds
3125 map
<float,set
<int>> error_osd_by_max
; // max -> osds
3126 if (!warn_detail
.empty() || !error_detail
.empty()) {
3127 for (auto& p
: osd_stat
) {
3128 const pow2_hist_t
& h
= p
.second
.op_queue_age_hist
;
3129 for (unsigned i
= h
.h
.size() - 1; i
> 0; --i
) {
3130 float ub
= (float)(1 << i
) / 1000.0;
3131 if (ub
< cct
->_conf
->mon_osd_warn_op_age
)
3135 error_osd_by_max
[ub
].insert(p
.first
);
3137 warn_osd_by_max
[ub
].insert(p
.first
);
3145 if (!warn_detail
.empty()) {
3147 ss
<< warn
<< " slow requests are blocked > "
3148 << cct
->_conf
->mon_osd_warn_op_age
<< " sec";
3149 auto& d
= checks
->add("REQUEST_SLOW", HEALTH_WARN
, ss
.str(), warn
);
3150 d
.detail
.swap(warn_detail
);
3152 for (auto& p
: warn_osd_by_max
) {
3154 if (p
.second
.size() > 1) {
3155 ss
<< "osds " << p
.second
3156 << " have blocked requests > " << p
.first
<< " sec";
3158 ss
<< "osd." << *p
.second
.begin()
3159 << " has blocked requests > " << p
.first
<< " sec";
3161 d
.detail
.push_back(ss
.str());
3167 if (!error_detail
.empty()) {
3169 ss
<< error
<< " stuck requests are blocked > "
3170 << err_age
<< " sec";
3171 auto& d
= checks
->add("REQUEST_STUCK", HEALTH_ERR
, ss
.str(), error
);
3172 d
.detail
.swap(error_detail
);
3174 for (auto& p
: error_osd_by_max
) {
3176 if (p
.second
.size() > 1) {
3177 ss
<< "osds " << p
.second
3178 << " have stuck requests > " << p
.first
<< " sec";
3180 ss
<< "osd." << *p
.second
.begin()
3181 << " has stuck requests > " << p
.first
<< " sec";
3183 d
.detail
.push_back(ss
.str());
3191 // OBJECT_STORE_WARN
3192 if (osd_sum
.os_alerts
.size()) {
3193 map
<string
, pair
<size_t, list
<string
>>> os_alerts_sum
;
3195 for (auto& a
: osd_sum
.os_alerts
) {
3197 string s0
= " osd.";
3198 s0
+= stringify(a
.first
);
3199 for (auto& aa
: a
.second
) {
3203 auto it
= os_alerts_sum
.find(aa
.first
);
3204 if (it
== os_alerts_sum
.end()) {
3207 os_alerts_sum
.emplace(aa
.first
, std::make_pair(1, d
));
3209 auto& p
= it
->second
;
3211 p
.second
.emplace_back(s
);
3219 for (auto& asum
: os_alerts_sum
) {
3220 string summary
= stringify(asum
.second
.first
) + " OSD(s)";
3221 if (asum
.first
== "BLUEFS_SPILLOVER") {
3222 summary
+= " experiencing BlueFS spillover";
3223 } else if (asum
.first
== "BLUESTORE_NO_COMPRESSION") {
3224 summary
+= " have broken BlueStore compression";
3225 } else if (asum
.first
== "BLUESTORE_LEGACY_STATFS") {
3226 summary
+= " reporting legacy (not per-pool) BlueStore stats";
3227 } else if (asum
.first
== "BLUESTORE_DISK_SIZE_MISMATCH") {
3228 summary
+= " have dangerous mismatch between BlueStore block device and free list sizes";
3229 } else if (asum
.first
== "BLUESTORE_NO_PER_PG_OMAP") {
3230 summary
+= " reporting legacy (not per-pg) BlueStore omap";
3231 } else if (asum
.first
== "BLUESTORE_NO_PER_POOL_OMAP") {
3232 summary
+= " reporting legacy (not per-pool) BlueStore omap usage stats";
3233 } else if (asum
.first
== "BLUESTORE_SPURIOUS_READ_ERRORS") {
3234 summary
+= " have spurious read errors";
3237 auto& d
= checks
->add(asum
.first
, HEALTH_WARN
, summary
, asum
.second
.first
);
3238 for (auto& s
: asum
.second
.second
) {
3239 d
.detail
.push_back(s
);
3244 // PG_NOT_DEEP_SCRUBBED
3245 if (cct
->_conf
->mon_warn_pg_not_scrubbed_ratio
||
3246 cct
->_conf
->mon_warn_pg_not_deep_scrubbed_ratio
) {
3247 list
<string
> detail
, deep_detail
;
3248 int detail_max
= max
, deep_detail_max
= max
;
3249 int detail_more
= 0, deep_detail_more
= 0;
3250 int detail_total
= 0, deep_detail_total
= 0;
3251 for (auto& p
: pg_stat
) {
3252 int64_t pnum
= p
.first
.pool();
3253 auto pool
= osdmap
.get_pg_pool(pnum
);
3256 if (cct
->_conf
->mon_warn_pg_not_scrubbed_ratio
) {
3257 double scrub_max_interval
= 0;
3258 pool
->opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &scrub_max_interval
);
3259 if (scrub_max_interval
<= 0) {
3260 scrub_max_interval
= cct
->_conf
->osd_scrub_max_interval
;
3262 const double age
= (cct
->_conf
->mon_warn_pg_not_scrubbed_ratio
* scrub_max_interval
) +
3264 utime_t cutoff
= now
;
3266 if (p
.second
.last_scrub_stamp
< cutoff
) {
3267 if (detail_max
> 0) {
3269 ss
<< "pg " << p
.first
<< " not scrubbed since "
3270 << p
.second
.last_scrub_stamp
;
3271 detail
.push_back(ss
.str());
3279 if (cct
->_conf
->mon_warn_pg_not_deep_scrubbed_ratio
) {
3280 double deep_scrub_interval
= 0;
3281 pool
->opts
.get(pool_opts_t::DEEP_SCRUB_INTERVAL
, &deep_scrub_interval
);
3282 if (deep_scrub_interval
<= 0) {
3283 deep_scrub_interval
= cct
->_conf
->osd_deep_scrub_interval
;
3285 double deep_age
= (cct
->_conf
->mon_warn_pg_not_deep_scrubbed_ratio
* deep_scrub_interval
) +
3286 deep_scrub_interval
;
3287 utime_t deep_cutoff
= now
;
3288 deep_cutoff
-= deep_age
;
3289 if (p
.second
.last_deep_scrub_stamp
< deep_cutoff
) {
3290 if (deep_detail_max
> 0) {
3292 ss
<< "pg " << p
.first
<< " not deep-scrubbed since "
3293 << p
.second
.last_deep_scrub_stamp
;
3294 deep_detail
.push_back(ss
.str());
3299 ++deep_detail_total
;
3305 ss
<< detail_total
<< " pgs not scrubbed in time";
3306 auto& d
= checks
->add("PG_NOT_SCRUBBED", HEALTH_WARN
, ss
.str(), detail_total
);
3308 if (!detail
.empty()) {
3309 d
.detail
.swap(detail
);
3313 ss
<< detail_more
<< " more pgs... ";
3314 d
.detail
.push_back(ss
.str());
3318 if (deep_detail_total
) {
3320 ss
<< deep_detail_total
<< " pgs not deep-scrubbed in time";
3321 auto& d
= checks
->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN
, ss
.str(),
3324 if (!deep_detail
.empty()) {
3325 d
.detail
.swap(deep_detail
);
3327 if (deep_detail_more
) {
3329 ss
<< deep_detail_more
<< " more pgs... ";
3330 d
.detail
.push_back(ss
.str());
3337 if (g_conf().get_val
<bool>("mon_warn_on_pool_no_app")) {
3338 list
<string
> detail
;
3339 for (auto &it
: pools
) {
3340 const pg_pool_t
&pool
= it
.second
;
3341 const string
& pool_name
= osdmap
.get_pool_name(it
.first
);
3342 // application metadata is not encoded until luminous is minimum
3344 if (pool
.application_metadata
.empty() && !pool
.is_tier()) {
3346 ss
<< "application not enabled on pool '" << pool_name
<< "'";
3347 detail
.push_back(ss
.str());
3350 if (!detail
.empty()) {
3352 ss
<< detail
.size() << " pool(s) do not have an application enabled";
3353 auto& d
= checks
->add("POOL_APP_NOT_ENABLED", HEALTH_WARN
, ss
.str(),
3356 tip
<< "use 'ceph osd pool application enable <pool-name> "
3357 << "<app-name>', where <app-name> is 'cephfs', 'rbd', 'rgw', "
3358 << "or freeform for custom applications.";
3359 detail
.push_back(tip
.str());
3360 d
.detail
.swap(detail
);
3364 // PG_SLOW_SNAP_TRIMMING
3365 if (!pg_stat
.empty() && cct
->_conf
->mon_osd_snap_trim_queue_warn_on
> 0) {
3366 uint32_t snapthreshold
= cct
->_conf
->mon_osd_snap_trim_queue_warn_on
;
3367 uint64_t snaptrimq_exceeded
= 0;
3368 uint32_t longest_queue
= 0;
3369 const pg_t
* longest_q_pg
= nullptr;
3370 list
<string
> detail
;
3372 for (auto& i
: pg_stat
) {
3373 uint32_t current_len
= i
.second
.snaptrimq_len
;
3374 if (current_len
>= snapthreshold
) {
3375 snaptrimq_exceeded
++;
3376 if (longest_queue
<= current_len
) {
3377 longest_q_pg
= &i
.first
;
3378 longest_queue
= current_len
;
3380 if (detail
.size() < max
- 1) {
3382 ss
<< "snap trim queue for pg " << i
.first
<< " at " << current_len
;
3383 detail
.push_back(ss
.str());
3386 if (detail
.size() < max
) {
3387 detail
.push_back("...more pgs affected");
3393 if (snaptrimq_exceeded
) {
3396 ss
<< "longest queue on pg " << *longest_q_pg
<< " at " << longest_queue
;
3397 detail
.push_back(ss
.str());
3401 ss
<< "snap trim queue for " << snaptrimq_exceeded
<< " pg(s) >= " << snapthreshold
<< " (mon_osd_snap_trim_queue_warn_on)";
3402 auto& d
= checks
->add("PG_SLOW_SNAP_TRIMMING", HEALTH_WARN
, ss
.str(),
3403 snaptrimq_exceeded
);
3404 detail
.push_back("try decreasing \"osd snap trim sleep\" and/or increasing \"osd pg max concurrent snap trims\".");
3405 d
.detail
.swap(detail
);
// Emit the per-pool PG state breakdown through the formatter, then delegate
// to PGMapDigest::print_summary(f, out) for the rest of the summary.
//
// Formatter layout produced by the visible calls:
//   "pgs_by_pool_state": [
//     "per_pool_pgs_by_state": { "pool_id": <key of num_pg_by_pool_state>,
//       "pg_state_counts": [
//         "pg_state_count": { "state_name": pg_state_string(state),
//                             "count": <count> } ... ] } ... ]
//
// NOTE(review): the matching close_section() calls, the braces and any
// guard on `f` are not visible in this view (the embedded line numbers skip
// 3411-3412 and 3422-3428) -- confirm against the complete file.
3410 void PGMap::print_summary(ceph::Formatter
*f
, ostream
*out
) const
3413 f
->open_array_section("pgs_by_pool_state");
// one object per pool: i.first is dumped as the pool id, i.second is
// iterated as (state, count) pairs
3414 for (auto& i
: num_pg_by_pool_state
) {
3415 f
->open_object_section("per_pool_pgs_by_state");
3416 f
->dump_int("pool_id", i
.first
);
3417 f
->open_array_section("pg_state_counts");
3418 for (auto& j
: i
.second
) {
3419 f
->open_object_section("pg_state_count");
3420 f
->dump_string("state_name", pg_state_string(j
.first
));
3421 f
->dump_int("count", j
.second
);
// everything else in the summary comes from the digest base class
3429 PGMapDigest::print_summary(f
, out
);
// Serve the PG-map backed commands, selected by command prefix. The prefixes
// handled in the visible code are:
//   "pg dump_json", "pg dump_pools_json", "pg ls-by-primary", "pg ls-by-osd",
//   "pg ls-by-pool"  (pre-processing of the command map),
//   "pg stat", "pg getmap", "pg dump", "pg ls", "pg dump_stuck", "pg debug",
//   "osd perf", "osd blocked-by".
//
// orig_prefix / orig_cmdmap are copied into locals so the pre-processing
// step can modify them without touching the caller's objects.
//
// NOTE(review): `f`, `ss`, `odata`, `ds`, `what`, `pgs`, `state` and the
// return statements are used/implied below but their declarations are not
// visible in this view (embedded line numbers skip e.g. 3437-3440) --
// confirm the remaining parameters and return values in the complete file.
3432 int process_pg_map_command(
3433 const string
& orig_prefix
,
3434 const cmdmap_t
& orig_cmdmap
,
3435 const PGMap
& pg_map
,
3436 const OSDMap
& osdmap
,
3441 string prefix
= orig_prefix
;
3442 auto cmdmap
= orig_cmdmap
;
// Text appended to the output buffer whenever omap statistics were part of
// a dump (see the omap_stats_note_required flag below).
3444 string omap_stats_note
=
3445 "\n* NOTE: Omap statistics are gathered during deep scrub and "
3446 "may be inaccurate soon afterwards depending on utilization. See "
3447 "http://docs.ceph.com/en/latest/dev/placement-group/#omap-statistics "
3448 "for further details.\n";
3449 bool omap_stats_note_required
= false;
3451 // perhaps these would be better in the parsing, but it's weird
3452 bool primary
= false;
// Pre-processing: the *_json dump variants inject a "dumpcontents" value
// ("all" / "pools") into the command map.
// NOTE(review): presumably each alias also rewrites `prefix` to the base
// command; those lines are not visible here.
3453 if (prefix
== "pg dump_json") {
3455 v
.push_back(string("all"));
3456 cmd_putval(g_ceph_context
, cmdmap
, "dumpcontents", v
);
3458 } else if (prefix
== "pg dump_pools_json") {
3460 v
.push_back(string("pools"));
3461 cmd_putval(g_ceph_context
, cmdmap
, "dumpcontents", v
);
3463 } else if (prefix
== "pg ls-by-primary") {
3466 } else if (prefix
== "pg ls-by-osd") {
3468 } else if (prefix
== "pg ls-by-pool") {
// resolve the pool name to an id and store the id under "pool"; the
// "does not exist" message goes to *ss (its guarding condition is not
// visible in this view)
3471 cmd_getval(cmdmap
, "poolstr", poolstr
);
3472 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
3474 *ss
<< "pool " << poolstr
<< " does not exist";
3477 cmd_putval(g_ceph_context
, cmdmap
, "pool", pool
);
// "pg stat": one-line summary through the formatter
3481 if (prefix
== "pg stat") {
3483 f
->open_object_section("pg_summary");
3484 pg_map
.print_oneline_summary(f
, NULL
);
// "pg getmap": encode the whole PG map into *odata and report its version
3494 if (prefix
== "pg getmap") {
3495 pg_map
.encode(*odata
);
3496 *ss
<< "got pgmap version " << pg_map
.version
;
// "pg dump": `what` receives the requested "dumpcontents" values and
// selects which sections are emitted.
3500 if (prefix
== "pg dump") {
3502 vector
<string
> dumpcontents
;
3504 if (cmd_getval(cmdmap
, "dumpcontents", dumpcontents
)) {
3505 copy(dumpcontents
.begin(), dumpcontents
.end(),
3506 inserter(what
, what
.end()));
// First group of section checks: every dump call takes the formatter `f`.
3511 if (what
.count("all")) {
3512 f
->open_object_section("pg_map");
3515 } else if (what
.count("summary") || what
.count("sum")) {
3516 f
->open_object_section("pg_map");
3517 pg_map
.dump_basic(f
);
3520 if (what
.count("pools")) {
3521 pg_map
.dump_pool_stats(f
);
3523 if (what
.count("osds")) {
3524 pg_map
.dump_osd_stats(f
);
3526 if (what
.count("pgs")) {
3527 pg_map
.dump_pg_stats(f
, false);
3529 if (what
.count("pgs_brief")) {
3530 pg_map
.dump_pg_stats(f
, true);
3532 if (what
.count("delta")) {
3533 f
->open_object_section("delta");
3534 pg_map
.dump_delta(f
);
// Second group: the same sections written to the stream `ds`; the branches
// for "all", "summary"/"sum", "pgs" and "pools" also request the omap note.
3540 if (what
.count("all")) {
3542 omap_stats_note_required
= true;
3543 } else if (what
.count("summary") || what
.count("sum")) {
3544 pg_map
.dump_basic(ds
);
3545 pg_map
.dump_pg_sum_stats(ds
, true);
3546 pg_map
.dump_osd_sum_stats(ds
);
3547 omap_stats_note_required
= true;
3549 if (what
.count("pgs_brief")) {
3550 pg_map
.dump_pg_stats(ds
, true);
3553 if (what
.count("pgs")) {
3554 pg_map
.dump_pg_stats(ds
, false);
3556 omap_stats_note_required
= true;
3558 if (what
.count("pools")) {
3559 pg_map
.dump_pool_stats(ds
, header
);
3560 omap_stats_note_required
= true;
3562 if (what
.count("osds")) {
3563 pg_map
.dump_osd_stats(ds
);
3567 if (omap_stats_note_required
) {
3568 odata
->append(omap_stats_note
);
3571 *ss
<< "dumped " << what
;
// "pg ls": list PGs filtered by pool, osd and state names. A non-negative
// pool must exist in the osdmap and a non-negative osd must be up; otherwise
// the corresponding message is written to *ss.
3575 if (prefix
== "pg ls") {
3578 vector
<string
>states
;
3580 cmd_getval(cmdmap
, "pool", pool
);
3581 cmd_getval(cmdmap
, "osd", osd
);
3582 cmd_getval(cmdmap
, "states", states
);
3583 if (pool
>= 0 && !osdmap
.have_pg_pool(pool
)) {
3584 *ss
<< "pool " << pool
<< " does not exist";
3587 if (osd
>= 0 && !osdmap
.is_up(osd
)) {
3588 *ss
<< "osd " << osd
<< " is not up";
// NOTE(review): presumably "all" is only added when no states were given;
// the guarding condition is not visible in this view.
3592 states
.push_back("all");
// consume the state names from the back; each name other than "all" is
// translated with pg_string_state(), and a name that does not translate is
// reported together with the list of valid state names
3596 while (!states
.empty()) {
3597 string state_str
= states
.back();
3599 if (state_str
== "all") {
3603 auto filter
= pg_string_state(state_str
);
3605 *ss
<< "'" << state_str
<< "' is not a valid pg state,"
3606 << " available choices: " << pg_state_string(0xFFFFFFFF);
3615 pg_map
.get_filtered_pg_stats(state
, pool
, osd
, primary
, pgs
);
// formatter output when `f` is set, stream output (followed by the omap
// note) otherwise; nothing is dumped for an empty result
3617 if (f
&& !pgs
.empty()) {
3618 pg_map
.dump_filtered_pg_stats(f
, pgs
);
3620 } else if (!pgs
.empty()) {
3621 pg_map
.dump_filtered_pg_stats(ds
, pgs
);
3623 odata
->append(omap_stats_note
);
// "pg dump_stuck": "stuckops" defaults to {"unclean"}; "threshold" defaults
// to the mon_pg_stuck_threshold config value
3628 if (prefix
== "pg dump_stuck") {
3629 vector
<string
> stuckop_vec
;
3630 cmd_getval(cmdmap
, "stuckops", stuckop_vec
);
3631 if (stuckop_vec
.empty())
3632 stuckop_vec
.push_back("unclean");
3633 const int64_t threshold
= cmd_getval_or
<int64_t>(
3634 cmdmap
, "threshold",
3635 g_conf().get_val
<int64_t>("mon_pg_stuck_threshold"));
3637 if (pg_map
.dump_stuck_pg_stats(ds
, f
, (int)threshold
, stuckop_vec
) < 0) {
// "pg debug": the debug op defaults to "unfound_objects_exist". Both
// visible ops scan pg_stat for any PG whose summed stats report a positive
// count (unfound / degraded objects).
// NOTE(review): what is reported for each outcome is not visible here.
3646 if (prefix
== "pg debug") {
3647 const string debugop
= cmd_getval_or
<string
>(
3649 "unfound_objects_exist");
3650 if (debugop
== "unfound_objects_exist") {
3651 bool unfound_objects_exist
= false;
3652 for (const auto& p
: pg_map
.pg_stat
) {
3653 if (p
.second
.stats
.sum
.num_objects_unfound
> 0) {
3654 unfound_objects_exist
= true;
3658 if (unfound_objects_exist
)
3665 if (debugop
== "degraded_pgs_exist") {
3666 bool degraded_pgs_exist
= false;
3667 for (const auto& p
: pg_map
.pg_stat
) {
3668 if (p
.second
.stats
.sum
.num_objects_degraded
> 0) {
3669 degraded_pgs_exist
= true;
3673 if (degraded_pgs_exist
)
// "osd perf": per-OSD perf stats, via the formatter ("osdstats" section) or
// printed to the stream
3682 if (prefix
== "osd perf") {
3684 f
->open_object_section("osdstats");
3685 pg_map
.dump_osd_perf_stats(f
);
3689 pg_map
.print_osd_perf_stats(&ds
);
// "osd blocked-by": same pattern for the blocked-by statistics
3695 if (prefix
== "osd blocked-by") {
3697 f
->open_object_section("osd_blocked_by");
3698 pg_map
.dump_osd_blocked_by_stats(f
);
3702 pg_map
.print_osd_blocked_by_stats(&ds
);
// Reconcile the PG map with an OSDMap by queuing changes in *pending_inc.
// Four passes are visible:
//   1. per-OSD stats: OSDs that no longer exist, are out, or are not up;
//   2. pools present in pgmap.pg_pool_sum but gone from the osdmap;
//   3. per-pool pg_num changes (new pool / split => add, merge => remove);
//   4. pending stat updates that refer to pgs outside the new pg_num.
//
// NOTE(review): the `cct` and `pgmap` parameters are used below but their
// declarations are not visible in this view (embedded line numbers skip
// 3712 and 3714) -- confirm the full signature in the complete file.
3711 void PGMapUpdater::check_osd_map(
3713 const OSDMap
& osdmap
,
3715 PGMap::Incremental
*pending_inc
)
// Pass 1: walk the known OSD stats.
//  - OSD not in the osdmap             -> rm_stat()
//  - OSD out and statfs.total != 0     -> stat_osd_out()
//  - OSD not up and op_queue_age_hist non-empty -> stat_osd_down_up()
3717 for (auto& p
: pgmap
.osd_stat
) {
3718 if (!osdmap
.exists(p
.first
)) {
3720 pending_inc
->rm_stat(p
.first
);
3721 } else if (osdmap
.is_out(p
.first
)) {
3723 if (p
.second
.statfs
.total
!= 0) {
3724 pending_inc
->stat_osd_out(p
.first
);
3726 } else if (!osdmap
.is_up(p
.first
)) {
3727 // zero the op_queue_age_hist
3728 if (!p
.second
.op_queue_age_hist
.empty()) {
3729 pending_inc
->stat_osd_down_up(p
.first
, pgmap
);
// Pass 2: for every pool the osdmap no longer has, queue removal of all of
// its pgs and drop any pending stat updates for that pool.
3734 // deleted pgs (pools)?
3735 for (auto& p
: pgmap
.pg_pool_sum
) {
3736 if (!osdmap
.have_pg_pool(p
.first
)) {
3737 ldout(cct
, 10) << __func__
<< " pool " << p
.first
<< " gone, removing pgs"
3739 for (auto& q
: pgmap
.pg_stat
) {
3740 if (q
.first
.pool() == p
.first
) {
3741 pending_inc
->pg_remove
.insert(q
.first
);
// erase() returns the next iterator, so matching entries can be removed
// while walking the map
3744 auto q
= pending_inc
->pg_stat_updates
.begin();
3745 while (q
!= pending_inc
->pg_stat_updates
.end()) {
3746 if (q
->first
.pool() == p
.first
) {
3747 q
= pending_inc
->pg_stat_updates
.erase(q
);
// Pass 3: compare the pg count the PG map tracks for each pool (0 when the
// pool is unknown to it) with the osdmap's pg_num; new_pg_num records the
// osdmap value per pool for pass 4.
3755 // new (split or new pool) or merged pgs?
3756 map
<int64_t,unsigned> new_pg_num
;
3757 for (auto& p
: osdmap
.get_pools()) {
3758 int64_t poolid
= p
.first
;
3759 const pg_pool_t
& pi
= p
.second
;
3760 auto q
= pgmap
.num_pg_by_pool
.find(poolid
);
3761 unsigned my_pg_num
= 0;
3762 if (q
!= pgmap
.num_pg_by_pool
.end())
3763 my_pg_num
= q
->second
;
3764 unsigned pg_num
= pi
.get_pg_num();
3765 new_pg_num
[poolid
] = pg_num
;
// fewer pgs tracked than the pool has: create a pending stat entry for each
// missing pg that has none yet, with every timestamp set to the osdmap's
// modification time
3766 if (my_pg_num
< pg_num
) {
3767 ldout(cct
,10) << __func__
<< " pool " << poolid
<< " pg_num " << pg_num
3768 << " > my pg_num " << my_pg_num
<< dendl
;
3769 for (unsigned ps
= my_pg_num
; ps
< pg_num
; ++ps
) {
3770 pg_t
pgid(ps
, poolid
);
3771 if (pending_inc
->pg_stat_updates
.count(pgid
) == 0) {
3772 ldout(cct
,20) << __func__
<< " adding " << pgid
<< dendl
;
3773 pg_stat_t
&stats
= pending_inc
->pg_stat_updates
[pgid
];
3774 stats
.last_fresh
= osdmap
.get_modified();
3775 stats
.last_active
= osdmap
.get_modified();
3776 stats
.last_change
= osdmap
.get_modified();
3777 stats
.last_peered
= osdmap
.get_modified();
3778 stats
.last_clean
= osdmap
.get_modified();
3779 stats
.last_unstale
= osdmap
.get_modified();
3780 stats
.last_undegraded
= osdmap
.get_modified();
3781 stats
.last_fullsized
= osdmap
.get_modified();
3782 stats
.last_scrub_stamp
= osdmap
.get_modified();
3783 stats
.last_deep_scrub_stamp
= osdmap
.get_modified();
3784 stats
.last_clean_scrub_stamp
= osdmap
.get_modified();
// more pgs tracked than the pool has (merge): pgs in [pg_num, my_pg_num)
// that exist in pgmap.pg_stat are queued for removal, and pending updates
// for those pg ids are erased
3787 } else if (my_pg_num
> pg_num
) {
3788 ldout(cct
,10) << __func__
<< " pool " << poolid
<< " pg_num " << pg_num
3789 << " < my pg_num " << my_pg_num
<< dendl
;
3790 for (unsigned i
= pg_num
; i
< my_pg_num
; ++i
) {
3791 pg_t
pgid(i
, poolid
);
3792 ldout(cct
,20) << __func__
<< " removing merged " << pgid
<< dendl
;
3793 if (pgmap
.pg_stat
.count(pgid
)) {
3794 pending_inc
->pg_remove
.insert(pgid
);
3796 pending_inc
->pg_stat_updates
.erase(pgid
);
// Pass 4: drop pending updates whose pool is not in new_pg_num or whose
// placement seed is >= that pool's new pg_num.
3800 auto i
= pending_inc
->pg_stat_updates
.begin();
3801 while (i
!= pending_inc
->pg_stat_updates
.end()) {
3802 auto j
= new_pg_num
.find(i
->first
.pool());
3803 if (j
== new_pg_num
.end() ||
3804 i
->first
.ps() >= j
->second
) {
3805 ldout(cct
,20) << __func__
<< " removing pending update to old "
3806 << i
->first
<< dendl
;
3807 i
= pending_inc
->pg_stat_updates
.erase(i
);
// Mark one pg stale in *pending_inc when its current stats `cur` are not
// already stale, it has an acting primary (!= -1), and the osdmap reports
// that primary as down. Marking means OR-ing PG_STATE_STALE into the
// selected stat's state and setting its last_unstale to the current time.
//
// NOTE(review): `pgid` is used below but its parameter declaration, and the
// declaration of `newstat`, are not visible in this view (embedded line
// numbers skip 3816 and 3823) -- confirm in the complete file.
3814 static void _try_mark_pg_stale(
3815 const OSDMap
& osdmap
,
3817 const pg_stat_t
& cur
,
3818 PGMap::Incremental
*pending_inc
)
3820 if ((cur
.state
& PG_STATE_STALE
) == 0 &&
3821 cur
.acting_primary
!= -1 &&
3822 osdmap
.is_down(cur
.acting_primary
)) {
// If an update for this pg is already pending, reuse it only when it names
// the same acting primary as `cur`, or when it is itself not stale and its
// own acting primary is down.
3824 auto q
= pending_inc
->pg_stat_updates
.find(pgid
);
3825 if (q
!= pending_inc
->pg_stat_updates
.end()) {
3826 if ((q
->second
.acting_primary
== cur
.acting_primary
) ||
3827 ((q
->second
.state
& PG_STATE_STALE
) == 0 &&
3828 q
->second
.acting_primary
!= -1 &&
3829 osdmap
.is_down(q
->second
.acting_primary
))) {
3830 newstat
= &q
->second
;
3832 // pending update is no longer down or already stale
// no pending update yet: operator[] creates the entry to be marked
// NOTE(review): presumably the entry is then seeded from `cur`; that line
// is not visible here.
3836 newstat
= &pending_inc
->pg_stat_updates
[pgid
];
3839 dout(10) << __func__
<< " marking pg " << pgid
3840 << " stale (acting_primary " << newstat
->acting_primary
3842 newstat
->state
|= PG_STATE_STALE
;
3843 newstat
->last_unstale
= ceph_clock_now();
// Try to mark pgs stale (via _try_mark_pg_stale) after OSD state changes.
// Two scan strategies are visible: a full walk over pg_map.pg_stat, and a
// targeted walk over the pgs that pg_map.pg_by_osd lists for each down OSD
// in need_check_down_pg_osds.
//
// NOTE(review): the statement selecting between the two strategies, and one
// parameter (embedded line 3850), are not visible in this view -- confirm
// in the complete file.
3847 void PGMapUpdater::check_down_pgs(
3848 const OSDMap
&osdmap
,
3849 const PGMap
&pg_map
,
3851 const set
<int>& need_check_down_pg_osds
,
3852 PGMap::Incremental
*pending_inc
)
// The set of changed OSDs counts as "large" when its size exceeds
// num_osds * mon_pg_check_down_all_threshold.
3854 // if a large number of osds changed state, just iterate over the whole
3856 if (need_check_down_pg_osds
.size() > (unsigned)osdmap
.get_num_osds() *
3857 g_conf().get_val
<double>("mon_pg_check_down_all_threshold")) {
// full scan: every pg in the map is a candidate
3862 for (const auto& p
: pg_map
.pg_stat
) {
3863 _try_mark_pg_stale(osdmap
, p
.first
, p
.second
, pending_inc
);
// targeted scan: only OSDs that are down are considered; each pg listed
// for such an OSD is asserted to have that OSD as acting primary
3866 for (auto osd
: need_check_down_pg_osds
) {
3867 if (osdmap
.is_down(osd
)) {
3868 auto p
= pg_map
.pg_by_osd
.find(osd
);
3869 if (p
== pg_map
.pg_by_osd
.end()) {
3872 for (auto pgid
: p
->second
) {
// at() throws if the pg is missing from pg_stat
3873 const pg_stat_t
&stat
= pg_map
.pg_stat
.at(pgid
);
3874 ceph_assert(stat
.acting_primary
== osd
);
3875 _try_mark_pg_stale(osdmap
, pgid
, stat
, pending_inc
);
3882 int reweight::by_utilization(
3883 const OSDMap
&osdmap
,
3888 bool by_pg
, const set
<int64_t> *pools
,
3890 mempool::osdmap::map
<int32_t, uint32_t>* new_weights
,
3891 std::stringstream
*ss
,
3892 std::string
*out_str
,
3896 *ss
<< "You must give a percentage higher than 100. "
3897 "The reweighting threshold will be calculated as <average-utilization> "
3898 "times <input-percentage>. For example, an argument of 200 would "
3899 "reweight OSDs which are twice as utilized as the average OSD.\n";
3903 vector
<int> pgs_by_osd(osdmap
.get_max_osd());
3905 // Avoid putting a small number (or 0) in the denominator when calculating
3907 double average_util
;
3910 double weight_sum
= 0.0; // sum up the crush weights
3911 unsigned num_pg_copies
= 0;
3913 for (const auto& pg
: pgm
.pg_stat
) {
3914 if (pools
&& pools
->count(pg
.first
.pool()) == 0)
3916 for (const auto acting
: pg
.second
.acting
) {
3917 if (!osdmap
.exists(acting
)) {
3920 if (acting
>= (int)pgs_by_osd
.size())
3921 pgs_by_osd
.resize(acting
);
3922 if (pgs_by_osd
[acting
] == 0) {
3923 if (osdmap
.crush
->get_item_weightf(acting
) <= 0) {
3924 //skip if we currently can not identify item
3927 weight_sum
+= osdmap
.crush
->get_item_weightf(acting
);
3930 ++pgs_by_osd
[acting
];
3935 if (!num_osds
|| (num_pg_copies
/ num_osds
< g_conf()->mon_reweight_min_pgs_per_osd
)) {
3936 *ss
<< "Refusing to reweight: we only have " << num_pg_copies
3937 << " PGs across " << num_osds
<< " osds!\n";
3941 average_util
= (double)num_pg_copies
/ weight_sum
;
3943 // by osd utilization
3944 int num_osd
= std::max
<size_t>(1, pgm
.osd_stat
.size());
3945 if ((uint64_t)pgm
.osd_sum
.statfs
.total
/ num_osd
3946 < g_conf()->mon_reweight_min_bytes_per_osd
) {
3947 *ss
<< "Refusing to reweight: we only have " << pgm
.osd_sum
.statfs
.kb()
3948 << " kb across all osds!\n";
3951 if ((uint64_t)pgm
.osd_sum
.statfs
.get_used_raw() / num_osd
3952 < g_conf()->mon_reweight_min_bytes_per_osd
) {
3953 *ss
<< "Refusing to reweight: we only have "
3954 << pgm
.osd_sum
.statfs
.kb_used_raw()
3955 << " kb used across all osds!\n";
3959 average_util
= (double)pgm
.osd_sum
.statfs
.get_used_raw() /
3960 (double)pgm
.osd_sum
.statfs
.total
;
3963 // adjust down only if we are above the threshold
3964 const double overload_util
= average_util
* (double)oload
/ 100.0;
3966 // but aggressively adjust weights up whenever possible.
3967 const double underload_util
= average_util
;
3969 const unsigned max_change
= (unsigned)(max_changef
* (double)CEPH_OSD_IN
);
3973 f
->open_object_section("reweight_by_utilization");
3974 f
->dump_int("overload_min", oload
);
3975 f
->dump_float("max_change", max_changef
);
3976 f
->dump_int("max_change_osds", max_osds
);
3977 f
->dump_float("average_utilization", average_util
);
3978 f
->dump_float("overload_utilization", overload_util
);
3980 oss
<< "oload " << oload
<< "\n";
3981 oss
<< "max_change " << max_changef
<< "\n";
3982 oss
<< "max_change_osds " << max_osds
<< "\n";
3984 oss
<< "average_utilization " << std::fixed
<< average_util
<< "\n";
3985 oss
<< "overload_utilization " << overload_util
<< "\n";
3987 int num_changed
= 0;
3989 // precompute util for each OSD
3990 std::vector
<std::pair
<int, float> > util_by_osd
;
3991 for (const auto& p
: pgm
.osd_stat
) {
3992 std::pair
<int, float> osd_util
;
3993 osd_util
.first
= p
.first
;
3995 if (p
.first
>= (int)pgs_by_osd
.size() ||
3996 pgs_by_osd
[p
.first
] == 0) {
3997 // skip if this OSD does not contain any pg
3998 // belonging to the specified pool(s).
4002 if (osdmap
.crush
->get_item_weightf(p
.first
) <= 0) {
4003 // skip if we are unable to locate item.
4008 pgs_by_osd
[p
.first
] / osdmap
.crush
->get_item_weightf(p
.first
);
4011 (double)p
.second
.statfs
.get_used_raw() / (double)p
.second
.statfs
.total
;
4013 util_by_osd
.push_back(osd_util
);
4016 // sort by absolute deviation from the mean utilization,
4017 // in descending order.
4018 std::sort(util_by_osd
.begin(), util_by_osd
.end(),
4019 [average_util
](std::pair
<int, float> l
, std::pair
<int, float> r
) {
4020 return abs(l
.second
- average_util
) > abs(r
.second
- average_util
);
4025 f
->open_array_section("reweights");
4027 for (const auto& p
: util_by_osd
) {
4028 unsigned weight
= osdmap
.get_weight(p
.first
);
4030 // skip if OSD is currently out
4033 float util
= p
.second
;
4035 if (util
>= overload_util
) {
4036 // Assign a lower weight to overloaded OSDs. The current weight
4037 // is a factor to take into account the original weights,
4038 // to represent e.g. differing storage capacities
4039 unsigned new_weight
= (unsigned)((average_util
/ util
) * (float)weight
);
4040 if (weight
> max_change
)
4041 new_weight
= std::max(new_weight
, weight
- max_change
);
4042 new_weights
->insert({p
.first
, new_weight
});
4044 f
->open_object_section("osd");
4045 f
->dump_int("osd", p
.first
);
4046 f
->dump_float("weight", (float)weight
/ (float)CEPH_OSD_IN
);
4047 f
->dump_float("new_weight", (float)new_weight
/ (float)CEPH_OSD_IN
);
4050 oss
<< "osd." << p
.first
<< " weight "
4051 << (float)weight
/ (float)CEPH_OSD_IN
<< " -> "
4052 << (float)new_weight
/ (float)CEPH_OSD_IN
<< "\n";
4054 if (++num_changed
>= max_osds
)
4057 if (!no_increasing
&& util
<= underload_util
) {
4058 // assign a higher weight.. if we can.
4059 unsigned new_weight
= (unsigned)((average_util
/ util
) * (float)weight
);
4060 new_weight
= std::min(new_weight
, weight
+ max_change
);
4061 if (new_weight
> CEPH_OSD_IN
)
4062 new_weight
= CEPH_OSD_IN
;
4063 if (new_weight
> weight
) {
4064 new_weights
->insert({p
.first
, new_weight
});
4065 oss
<< "osd." << p
.first
<< " weight "
4066 << (float)weight
/ (float)CEPH_OSD_IN
<< " -> "
4067 << (float)new_weight
/ (float)CEPH_OSD_IN
<< "\n";
4068 if (++num_changed
>= max_osds
)
4078 newmap
.deepish_copy_from(osdmap
);
4079 OSDMap::Incremental newinc
;
4080 newinc
.fsid
= newmap
.get_fsid();
4081 newinc
.epoch
= newmap
.get_epoch() + 1;
4082 newinc
.new_weight
= *new_weights
;
4083 newmap
.apply_incremental(newinc
);
4085 osdmap
.summarize_mapping_stats(&newmap
, pools
, out_str
, f
);
4091 *out_str
+= oss
.str();