1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include <boost/algorithm/string.hpp>
8 #define dout_subsys ceph_subsys_mon
9 #include "common/debug.h"
10 #include "common/Clock.h"
11 #include "common/Formatter.h"
12 #include "global/global_context.h"
13 #include "include/ceph_features.h"
14 #include "include/stringify.h"
16 #include "osd/osd_types.h"
17 #include "osd/OSDMap.h"
18 #include <boost/range/adaptor/reversed.hpp>
20 #define dout_context g_ceph_context
22 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMapDigest
, pgmap_digest
, pgmap
);
23 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap
, pgmap
, pgmap
);
24 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap::Incremental
, pgmap_inc
, pgmap
);
27 // ---------------------
30 void PGMapDigest::encode(bufferlist
& bl
, uint64_t features
) const
32 // NOTE: see PGMap::encode_digest
34 if (!HAVE_FEATURE(features
, SERVER_MIMIC
)) {
36 } else if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
39 ENCODE_START(v
, 1, bl
);
41 encode(num_pg_active
, bl
);
42 encode(num_pg_unknown
, bl
);
44 encode(pg_pool_sum
, bl
, features
);
45 encode(pg_sum
, bl
, features
);
46 encode(osd_sum
, bl
, features
);
48 encode(num_pg_by_state
, bl
);
50 uint32_t n
= num_pg_by_state
.size();
52 for (auto p
: num_pg_by_state
) {
53 encode((uint32_t)p
.first
, bl
);
57 encode(num_pg_by_osd
, bl
);
58 encode(num_pg_by_pool
, bl
);
59 encode(osd_last_seq
, bl
);
60 encode(per_pool_sum_delta
, bl
, features
);
61 encode(per_pool_sum_deltas_stamps
, bl
);
62 encode(pg_sum_delta
, bl
, features
);
63 encode(stamp_delta
, bl
);
64 encode(avail_space_by_rule
, bl
);
66 encode(purged_snaps
, bl
);
69 encode(osd_sum_by_class
, bl
, features
);
74 void PGMapDigest::decode(bufferlist::const_iterator
& p
)
78 decode(num_pg_active
, p
);
79 decode(num_pg_unknown
, p
);
81 decode(pg_pool_sum
, p
);
85 decode(num_pg_by_state
, p
);
87 map
<int32_t, int32_t> nps
;
89 num_pg_by_state
.clear();
91 num_pg_by_state
[i
.first
] = i
.second
;
94 decode(num_pg_by_osd
, p
);
95 decode(num_pg_by_pool
, p
);
96 decode(osd_last_seq
, p
);
97 decode(per_pool_sum_delta
, p
);
98 decode(per_pool_sum_deltas_stamps
, p
);
99 decode(pg_sum_delta
, p
);
100 decode(stamp_delta
, p
);
101 decode(avail_space_by_rule
, p
);
103 decode(purged_snaps
, p
);
106 decode(osd_sum_by_class
, p
);
111 void PGMapDigest::dump(Formatter
*f
) const
113 f
->dump_unsigned("num_pg", num_pg
);
114 f
->dump_unsigned("num_pg_active", num_pg_active
);
115 f
->dump_unsigned("num_pg_unknown", num_pg_unknown
);
116 f
->dump_unsigned("num_osd", num_osd
);
117 f
->dump_object("pool_sum", pg_sum
);
118 f
->dump_object("osd_sum", osd_sum
);
120 f
->open_object_section("osd_sum_by_class");
121 for (auto& i
: osd_sum_by_class
) {
122 f
->dump_object(i
.first
.c_str(), i
.second
);
126 f
->open_array_section("pool_stats");
127 for (auto& p
: pg_pool_sum
) {
128 f
->open_object_section("pool_stat");
129 f
->dump_int("poolid", p
.first
);
130 auto q
= num_pg_by_pool
.find(p
.first
);
131 if (q
!= num_pg_by_pool
.end())
132 f
->dump_unsigned("num_pg", q
->second
);
137 f
->open_array_section("osd_stats");
139 // TODO: this isn't really correct since we can dump non-existent OSDs
140 // I dunno what osd_last_seq is set to in that case...
141 for (auto& p
: osd_last_seq
) {
142 f
->open_object_section("osd_stat");
143 f
->dump_int("osd", i
);
144 f
->dump_unsigned("seq", p
);
149 f
->open_array_section("num_pg_by_state");
150 for (auto& p
: num_pg_by_state
) {
151 f
->open_object_section("count");
152 f
->dump_string("state", pg_state_string(p
.first
));
153 f
->dump_unsigned("num", p
.second
);
157 f
->open_array_section("num_pg_by_osd");
158 for (auto& p
: num_pg_by_osd
) {
159 f
->open_object_section("count");
160 f
->dump_unsigned("osd", p
.first
);
161 f
->dump_unsigned("num_primary_pg", p
.second
.primary
);
162 f
->dump_unsigned("num_acting_pg", p
.second
.acting
);
163 f
->dump_unsigned("num_up_not_acting_pg", p
.second
.up_not_acting
);
167 f
->open_array_section("purged_snaps");
168 for (auto& j
: purged_snaps
) {
169 f
->open_object_section("pool");
170 f
->dump_int("pool", j
.first
);
171 f
->open_object_section("purged_snaps");
172 for (auto i
= j
.second
.begin(); i
!= j
.second
.end(); ++i
) {
173 f
->open_object_section("interval");
174 f
->dump_stream("start") << i
.get_start();
175 f
->dump_stream("length") << i
.get_len();
184 void PGMapDigest::generate_test_instances(list
<PGMapDigest
*>& ls
)
186 ls
.push_back(new PGMapDigest
);
189 inline std::string
percentify(const float& a
) {
190 std::stringstream ss
;
194 ss
<< std::fixed
<< std::setprecision(2) << a
;
198 void PGMapDigest::print_summary(Formatter
*f
, ostream
*out
) const
201 f
->open_array_section("pgs_by_state");
203 // list is descending numeric order (by count)
204 multimap
<int,int> state_by_count
; // count -> state
205 for (auto p
= num_pg_by_state
.begin();
206 p
!= num_pg_by_state
.end();
208 state_by_count
.insert(make_pair(p
->second
, p
->first
));
211 for (auto p
= state_by_count
.rbegin();
212 p
!= state_by_count
.rend();
215 f
->open_object_section("pgs_by_state_element");
216 f
->dump_string("state_name", pg_state_string(p
->second
));
217 f
->dump_unsigned("count", p
->first
);
225 f
->dump_unsigned("num_pgs", num_pg
);
226 f
->dump_unsigned("num_pools", pg_pool_sum
.size());
227 f
->dump_unsigned("num_objects", pg_sum
.stats
.sum
.num_objects
);
228 f
->dump_unsigned("data_bytes", pg_sum
.stats
.sum
.num_bytes
);
229 f
->dump_unsigned("bytes_used", osd_sum
.statfs
.get_used_raw());
230 f
->dump_unsigned("bytes_avail", osd_sum
.statfs
.available
);
231 f
->dump_unsigned("bytes_total", osd_sum
.statfs
.total
);
233 *out
<< " pools: " << pg_pool_sum
.size() << " pools, "
234 << num_pg
<< " pgs\n";
235 *out
<< " objects: " << si_u_t(pg_sum
.stats
.sum
.num_objects
) << " objects, "
236 << byte_u_t(pg_sum
.stats
.sum
.num_bytes
) << "\n";
238 << byte_u_t(osd_sum
.statfs
.get_used_raw()) << " used, "
239 << byte_u_t(osd_sum
.statfs
.available
) << " / "
240 << byte_u_t(osd_sum
.statfs
.total
) << " avail\n";
246 if (num_pg_unknown
> 0) {
247 float p
= (float)num_pg_unknown
/ (float)num_pg
;
249 f
->dump_float("unknown_pgs_ratio", p
);
252 snprintf(b
, sizeof(b
), "%.3lf", p
* 100.0);
253 *out
<< b
<< "% pgs unknown\n";
258 int num_pg_inactive
= num_pg
- num_pg_active
- num_pg_unknown
;
259 if (num_pg_inactive
> 0) {
260 float p
= (float)num_pg_inactive
/ (float)num_pg
;
262 f
->dump_float("inactive_pgs_ratio", p
);
268 snprintf(b
, sizeof(b
), "%.3f", p
* 100.0);
269 *out
<< b
<< "% pgs not active\n";
275 overall_recovery_summary(f
, &sl
);
276 if (!f
&& !sl
.empty()) {
277 for (auto p
= sl
.begin(); p
!= sl
.end(); ++p
) {
288 unsigned max_width
= 1;
289 for (multimap
<int,int>::reverse_iterator p
= state_by_count
.rbegin();
290 p
!= state_by_count
.rend();
293 std::stringstream ss
;
295 max_width
= std::max
<size_t>(ss
.str().size(), max_width
);
298 for (multimap
<int,int>::reverse_iterator p
= state_by_count
.rbegin();
299 p
!= state_by_count
.rend();
306 out
->setf(std::ios::left
);
307 *out
<< std::setw(max_width
) << p
->first
308 << " " << pg_state_string(p
->second
) << "\n";
309 out
->unsetf(std::ios::left
);
313 ostringstream ss_rec_io
;
314 overall_recovery_rate_summary(f
, &ss_rec_io
);
315 ostringstream ss_client_io
;
316 overall_client_io_rate_summary(f
, &ss_client_io
);
317 ostringstream ss_cache_io
;
318 overall_cache_io_rate_summary(f
, &ss_cache_io
);
320 if (!f
&& (ss_client_io
.str().length() || ss_rec_io
.str().length()
321 || ss_cache_io
.str().length())) {
326 if (!f
&& ss_client_io
.str().length())
327 *out
<< " client: " << ss_client_io
.str() << "\n";
328 if (!f
&& ss_rec_io
.str().length())
329 *out
<< " recovery: " << ss_rec_io
.str() << "\n";
330 if (!f
&& ss_cache_io
.str().length())
331 *out
<< " cache: " << ss_cache_io
.str() << "\n";
334 void PGMapDigest::print_oneline_summary(Formatter
*f
, ostream
*out
) const
336 std::stringstream ss
;
339 f
->open_array_section("num_pg_by_state");
340 for (auto p
= num_pg_by_state
.begin();
341 p
!= num_pg_by_state
.end();
344 f
->open_object_section("state");
345 f
->dump_string("name", pg_state_string(p
->first
));
346 f
->dump_unsigned("num", p
->second
);
349 if (p
!= num_pg_by_state
.begin())
351 ss
<< p
->second
<< " " << pg_state_string(p
->first
);
356 string states
= ss
.str();
358 *out
<< num_pg
<< " pgs: "
360 << byte_u_t(pg_sum
.stats
.sum
.num_bytes
) << " data, "
361 << byte_u_t(osd_sum
.statfs
.get_used()) << " used, "
362 << byte_u_t(osd_sum
.statfs
.available
) << " / "
363 << byte_u_t(osd_sum
.statfs
.total
) << " avail";
365 f
->dump_unsigned("num_pgs", num_pg
);
366 f
->dump_unsigned("num_bytes", pg_sum
.stats
.sum
.num_bytes
);
367 f
->dump_int("total_bytes", osd_sum
.statfs
.total
);
368 f
->dump_int("total_avail_bytes", osd_sum
.statfs
.available
);
369 f
->dump_int("total_used_bytes", osd_sum
.statfs
.get_used());
370 f
->dump_int("total_used_raw_bytes", osd_sum
.statfs
.get_used_raw());
373 // make non-negative; we can get negative values if osds send
374 // uncommitted stats and then "go backward" or if they are just
376 pool_stat_t pos_delta
= pg_sum_delta
;
378 if (pos_delta
.stats
.sum
.num_rd
||
379 pos_delta
.stats
.sum
.num_wr
) {
382 if (pos_delta
.stats
.sum
.num_rd
) {
383 int64_t rd
= (pos_delta
.stats
.sum
.num_rd_kb
<< 10) / (double)stamp_delta
;
385 *out
<< byte_u_t(rd
) << "/s rd, ";
387 f
->dump_unsigned("read_bytes_sec", rd
);
389 if (pos_delta
.stats
.sum
.num_wr
) {
390 int64_t wr
= (pos_delta
.stats
.sum
.num_wr_kb
<< 10) / (double)stamp_delta
;
392 *out
<< byte_u_t(wr
) << "/s wr, ";
394 f
->dump_unsigned("write_bytes_sec", wr
);
396 int64_t iops
= (pos_delta
.stats
.sum
.num_rd
+ pos_delta
.stats
.sum
.num_wr
) / (double)stamp_delta
;
398 *out
<< si_u_t(iops
) << " op/s";
400 f
->dump_unsigned("io_sec", iops
);
404 overall_recovery_summary(f
, &sl
);
406 for (auto p
= sl
.begin(); p
!= sl
.end(); ++p
)
408 std::stringstream ssr
;
409 overall_recovery_rate_summary(f
, &ssr
);
410 if (out
&& ssr
.str().length())
411 *out
<< "; " << ssr
.str() << " recovering";
414 void PGMapDigest::get_recovery_stats(
415 double *misplaced_ratio
,
416 double *degraded_ratio
,
417 double *inactive_pgs_ratio
,
418 double *unknown_pgs_ratio
) const
420 if (pg_sum
.stats
.sum
.num_objects_degraded
&&
421 pg_sum
.stats
.sum
.num_object_copies
> 0) {
422 *degraded_ratio
= (double)pg_sum
.stats
.sum
.num_objects_degraded
/
423 (double)pg_sum
.stats
.sum
.num_object_copies
;
427 if (pg_sum
.stats
.sum
.num_objects_misplaced
&&
428 pg_sum
.stats
.sum
.num_object_copies
> 0) {
429 *misplaced_ratio
= (double)pg_sum
.stats
.sum
.num_objects_misplaced
/
430 (double)pg_sum
.stats
.sum
.num_object_copies
;
432 *misplaced_ratio
= 0;
435 int num_pg_inactive
= num_pg
- num_pg_active
- num_pg_unknown
;
436 *inactive_pgs_ratio
= (double)num_pg_inactive
/ (double)num_pg
;
437 *unknown_pgs_ratio
= (double)num_pg_unknown
/ (double)num_pg
;
439 *inactive_pgs_ratio
= 0;
440 *unknown_pgs_ratio
= 0;
444 void PGMapDigest::recovery_summary(Formatter
*f
, list
<string
> *psl
,
445 const pool_stat_t
& pool_sum
) const
447 if (pool_sum
.stats
.sum
.num_objects_degraded
&& pool_sum
.stats
.sum
.num_object_copies
> 0) {
448 double pc
= (double)pool_sum
.stats
.sum
.num_objects_degraded
/
449 (double)pool_sum
.stats
.sum
.num_object_copies
* (double)100.0;
451 snprintf(b
, sizeof(b
), "%.3lf", pc
);
453 f
->dump_unsigned("degraded_objects", pool_sum
.stats
.sum
.num_objects_degraded
);
454 f
->dump_unsigned("degraded_total", pool_sum
.stats
.sum
.num_object_copies
);
455 f
->dump_float("degraded_ratio", pc
/ 100.0);
458 ss
<< pool_sum
.stats
.sum
.num_objects_degraded
459 << "/" << pool_sum
.stats
.sum
.num_object_copies
<< " objects degraded (" << b
<< "%)";
460 psl
->push_back(ss
.str());
463 if (pool_sum
.stats
.sum
.num_objects_misplaced
&& pool_sum
.stats
.sum
.num_object_copies
> 0) {
464 double pc
= (double)pool_sum
.stats
.sum
.num_objects_misplaced
/
465 (double)pool_sum
.stats
.sum
.num_object_copies
* (double)100.0;
467 snprintf(b
, sizeof(b
), "%.3lf", pc
);
469 f
->dump_unsigned("misplaced_objects", pool_sum
.stats
.sum
.num_objects_misplaced
);
470 f
->dump_unsigned("misplaced_total", pool_sum
.stats
.sum
.num_object_copies
);
471 f
->dump_float("misplaced_ratio", pc
/ 100.0);
474 ss
<< pool_sum
.stats
.sum
.num_objects_misplaced
475 << "/" << pool_sum
.stats
.sum
.num_object_copies
<< " objects misplaced (" << b
<< "%)";
476 psl
->push_back(ss
.str());
479 if (pool_sum
.stats
.sum
.num_objects_unfound
&& pool_sum
.stats
.sum
.num_objects
) {
480 double pc
= (double)pool_sum
.stats
.sum
.num_objects_unfound
/
481 (double)pool_sum
.stats
.sum
.num_objects
* (double)100.0;
483 snprintf(b
, sizeof(b
), "%.3lf", pc
);
485 f
->dump_unsigned("unfound_objects", pool_sum
.stats
.sum
.num_objects_unfound
);
486 f
->dump_unsigned("unfound_total", pool_sum
.stats
.sum
.num_objects
);
487 f
->dump_float("unfound_ratio", pc
/ 100.0);
490 ss
<< pool_sum
.stats
.sum
.num_objects_unfound
491 << "/" << pool_sum
.stats
.sum
.num_objects
<< " objects unfound (" << b
<< "%)";
492 psl
->push_back(ss
.str());
497 void PGMapDigest::recovery_rate_summary(Formatter
*f
, ostream
*out
,
498 const pool_stat_t
& delta_sum
,
499 utime_t delta_stamp
) const
501 // make non-negative; we can get negative values if osds send
502 // uncommitted stats and then "go backward" or if they are just
504 pool_stat_t pos_delta
= delta_sum
;
506 if (pos_delta
.stats
.sum
.num_objects_recovered
||
507 pos_delta
.stats
.sum
.num_bytes_recovered
||
508 pos_delta
.stats
.sum
.num_keys_recovered
) {
509 int64_t objps
= pos_delta
.stats
.sum
.num_objects_recovered
/ (double)delta_stamp
;
510 int64_t bps
= pos_delta
.stats
.sum
.num_bytes_recovered
/ (double)delta_stamp
;
511 int64_t kps
= pos_delta
.stats
.sum
.num_keys_recovered
/ (double)delta_stamp
;
513 f
->dump_int("recovering_objects_per_sec", objps
);
514 f
->dump_int("recovering_bytes_per_sec", bps
);
515 f
->dump_int("recovering_keys_per_sec", kps
);
516 f
->dump_int("num_objects_recovered", pos_delta
.stats
.sum
.num_objects_recovered
);
517 f
->dump_int("num_bytes_recovered", pos_delta
.stats
.sum
.num_bytes_recovered
);
518 f
->dump_int("num_keys_recovered", pos_delta
.stats
.sum
.num_keys_recovered
);
520 *out
<< byte_u_t(bps
) << "/s";
521 if (pos_delta
.stats
.sum
.num_keys_recovered
)
522 *out
<< ", " << si_u_t(kps
) << " keys/s";
523 *out
<< ", " << si_u_t(objps
) << " objects/s";
528 void PGMapDigest::overall_recovery_rate_summary(Formatter
*f
, ostream
*out
) const
530 recovery_rate_summary(f
, out
, pg_sum_delta
, stamp_delta
);
533 void PGMapDigest::overall_recovery_summary(Formatter
*f
, list
<string
> *psl
) const
535 recovery_summary(f
, psl
, pg_sum
);
538 void PGMapDigest::pool_recovery_rate_summary(Formatter
*f
, ostream
*out
,
539 uint64_t poolid
) const
541 auto p
= per_pool_sum_delta
.find(poolid
);
542 if (p
== per_pool_sum_delta
.end())
545 auto ts
= per_pool_sum_deltas_stamps
.find(p
->first
);
546 ceph_assert(ts
!= per_pool_sum_deltas_stamps
.end());
547 recovery_rate_summary(f
, out
, p
->second
.first
, ts
->second
);
550 void PGMapDigest::pool_recovery_summary(Formatter
*f
, list
<string
> *psl
,
551 uint64_t poolid
) const
553 auto p
= pg_pool_sum
.find(poolid
);
554 if (p
== pg_pool_sum
.end())
557 recovery_summary(f
, psl
, p
->second
);
560 void PGMapDigest::client_io_rate_summary(Formatter
*f
, ostream
*out
,
561 const pool_stat_t
& delta_sum
,
562 utime_t delta_stamp
) const
564 pool_stat_t pos_delta
= delta_sum
;
566 if (pos_delta
.stats
.sum
.num_rd
||
567 pos_delta
.stats
.sum
.num_wr
) {
568 if (pos_delta
.stats
.sum
.num_rd
) {
569 int64_t rd
= (pos_delta
.stats
.sum
.num_rd_kb
<< 10) / (double)delta_stamp
;
571 f
->dump_int("read_bytes_sec", rd
);
573 *out
<< byte_u_t(rd
) << "/s rd, ";
576 if (pos_delta
.stats
.sum
.num_wr
) {
577 int64_t wr
= (pos_delta
.stats
.sum
.num_wr_kb
<< 10) / (double)delta_stamp
;
579 f
->dump_int("write_bytes_sec", wr
);
581 *out
<< byte_u_t(wr
) << "/s wr, ";
584 int64_t iops_rd
= pos_delta
.stats
.sum
.num_rd
/ (double)delta_stamp
;
585 int64_t iops_wr
= pos_delta
.stats
.sum
.num_wr
/ (double)delta_stamp
;
587 f
->dump_int("read_op_per_sec", iops_rd
);
588 f
->dump_int("write_op_per_sec", iops_wr
);
590 *out
<< si_u_t(iops_rd
) << " op/s rd, " << si_u_t(iops_wr
) << " op/s wr";
595 void PGMapDigest::overall_client_io_rate_summary(Formatter
*f
, ostream
*out
) const
597 client_io_rate_summary(f
, out
, pg_sum_delta
, stamp_delta
);
600 void PGMapDigest::pool_client_io_rate_summary(Formatter
*f
, ostream
*out
,
601 uint64_t poolid
) const
603 auto p
= per_pool_sum_delta
.find(poolid
);
604 if (p
== per_pool_sum_delta
.end())
607 auto ts
= per_pool_sum_deltas_stamps
.find(p
->first
);
608 ceph_assert(ts
!= per_pool_sum_deltas_stamps
.end());
609 client_io_rate_summary(f
, out
, p
->second
.first
, ts
->second
);
612 void PGMapDigest::cache_io_rate_summary(Formatter
*f
, ostream
*out
,
613 const pool_stat_t
& delta_sum
,
614 utime_t delta_stamp
) const
616 pool_stat_t pos_delta
= delta_sum
;
618 bool have_output
= false;
620 if (pos_delta
.stats
.sum
.num_flush
) {
621 int64_t flush
= (pos_delta
.stats
.sum
.num_flush_kb
<< 10) / (double)delta_stamp
;
623 f
->dump_int("flush_bytes_sec", flush
);
625 *out
<< byte_u_t(flush
) << "/s flush";
629 if (pos_delta
.stats
.sum
.num_evict
) {
630 int64_t evict
= (pos_delta
.stats
.sum
.num_evict_kb
<< 10) / (double)delta_stamp
;
632 f
->dump_int("evict_bytes_sec", evict
);
636 *out
<< byte_u_t(evict
) << "/s evict";
640 if (pos_delta
.stats
.sum
.num_promote
) {
641 int64_t promote
= pos_delta
.stats
.sum
.num_promote
/ (double)delta_stamp
;
643 f
->dump_int("promote_op_per_sec", promote
);
647 *out
<< si_u_t(promote
) << " op/s promote";
651 if (pos_delta
.stats
.sum
.num_flush_mode_low
) {
653 f
->dump_int("num_flush_mode_low", pos_delta
.stats
.sum
.num_flush_mode_low
);
657 *out
<< si_u_t(pos_delta
.stats
.sum
.num_flush_mode_low
) << " PGs flushing";
661 if (pos_delta
.stats
.sum
.num_flush_mode_high
) {
663 f
->dump_int("num_flush_mode_high", pos_delta
.stats
.sum
.num_flush_mode_high
);
667 *out
<< si_u_t(pos_delta
.stats
.sum
.num_flush_mode_high
) << " PGs flushing (high)";
671 if (pos_delta
.stats
.sum
.num_evict_mode_some
) {
673 f
->dump_int("num_evict_mode_some", pos_delta
.stats
.sum
.num_evict_mode_some
);
677 *out
<< si_u_t(pos_delta
.stats
.sum
.num_evict_mode_some
) << " PGs evicting";
681 if (pos_delta
.stats
.sum
.num_evict_mode_full
) {
683 f
->dump_int("num_evict_mode_full", pos_delta
.stats
.sum
.num_evict_mode_full
);
687 *out
<< si_u_t(pos_delta
.stats
.sum
.num_evict_mode_full
) << " PGs evicting (full)";
692 void PGMapDigest::overall_cache_io_rate_summary(Formatter
*f
, ostream
*out
) const
694 cache_io_rate_summary(f
, out
, pg_sum_delta
, stamp_delta
);
697 void PGMapDigest::pool_cache_io_rate_summary(Formatter
*f
, ostream
*out
,
698 uint64_t poolid
) const
700 auto p
= per_pool_sum_delta
.find(poolid
);
701 if (p
== per_pool_sum_delta
.end())
704 auto ts
= per_pool_sum_deltas_stamps
.find(p
->first
);
705 ceph_assert(ts
!= per_pool_sum_deltas_stamps
.end());
706 cache_io_rate_summary(f
, out
, p
->second
.first
, ts
->second
);
709 ceph_statfs
PGMapDigest::get_statfs(OSDMap
&osdmap
,
710 boost::optional
<int64_t> data_pool
) const
714 object_stat_sum_t sum
;
717 auto i
= pg_pool_sum
.find(*data_pool
);
718 if (i
!= pg_pool_sum
.end()) {
719 sum
= i
->second
.stats
.sum
;
725 statfs
.kb_used
= (sum
.num_bytes
>> 10);
726 statfs
.kb_avail
= get_pool_free_space(osdmap
, *data_pool
) >> 10;
727 statfs
.num_objects
= sum
.num_objects
;
728 statfs
.kb
= statfs
.kb_used
+ statfs
.kb_avail
;
731 statfs
.kb
= osd_sum
.statfs
.kb();
732 statfs
.kb_used
= osd_sum
.statfs
.kb_used_raw();
733 statfs
.kb_avail
= osd_sum
.statfs
.kb_avail();
734 statfs
.num_objects
= pg_sum
.stats
.sum
.num_objects
;
740 void PGMapDigest::dump_pool_stats_full(
741 const OSDMap
&osd_map
,
749 f
->open_array_section("pools");
751 tbl
.define_column("POOL", TextTable::LEFT
, TextTable::LEFT
);
752 tbl
.define_column("ID", TextTable::LEFT
, TextTable::RIGHT
);
753 tbl
.define_column("STORED", TextTable::LEFT
, TextTable::RIGHT
);
754 tbl
.define_column("OBJECTS", TextTable::LEFT
, TextTable::RIGHT
);
755 tbl
.define_column("USED", TextTable::LEFT
, TextTable::RIGHT
);
756 tbl
.define_column("%USED", TextTable::LEFT
, TextTable::RIGHT
);
757 tbl
.define_column("MAX AVAIL", TextTable::LEFT
, TextTable::RIGHT
);
760 tbl
.define_column("QUOTA OBJECTS", TextTable::LEFT
, TextTable::LEFT
);
761 tbl
.define_column("QUOTA BYTES", TextTable::LEFT
, TextTable::LEFT
);
762 tbl
.define_column("DIRTY", TextTable::LEFT
, TextTable::RIGHT
);
763 tbl
.define_column("USED COMPR", TextTable::LEFT
, TextTable::RIGHT
);
764 tbl
.define_column("UNDER COMPR", TextTable::LEFT
, TextTable::RIGHT
);
768 map
<int,uint64_t> avail_by_rule
;
769 for (auto p
= osd_map
.get_pools().begin();
770 p
!= osd_map
.get_pools().end(); ++p
) {
771 int64_t pool_id
= p
->first
;
772 if ((pool_id
< 0) || (pg_pool_sum
.count(pool_id
) == 0))
775 const string
& pool_name
= osd_map
.get_pool_name(pool_id
);
776 const pool_stat_t
&stat
= pg_pool_sum
.at(pool_id
);
778 const pg_pool_t
*pool
= osd_map
.get_pg_pool(pool_id
);
779 int ruleno
= osd_map
.crush
->find_rule(pool
->get_crush_rule(),
783 if (avail_by_rule
.count(ruleno
) == 0) {
784 // FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked
785 avail
= get_rule_avail(ruleno
);
788 avail_by_rule
[ruleno
] = avail
;
790 avail
= avail_by_rule
[ruleno
];
793 f
->open_object_section("pool");
794 f
->dump_string("name", pool_name
);
795 f
->dump_int("id", pool_id
);
796 f
->open_object_section("stats");
801 float raw_used_rate
= osd_map
.pool_raw_used_rate(pool_id
);
802 bool per_pool
= use_per_pool_stats();
803 dump_object_stat_sum(tbl
, f
, stat
, avail
, raw_used_rate
, verbose
, per_pool
,
806 f
->close_section(); // stats
807 f
->close_section(); // pool
809 tbl
<< TextTable::endrow
;
815 ceph_assert(ss
!= nullptr);
822 void PGMapDigest::dump_cluster_stats(stringstream
*ss
,
827 f
->open_object_section("stats");
828 f
->dump_int("total_bytes", osd_sum
.statfs
.total
);
829 f
->dump_int("total_avail_bytes", osd_sum
.statfs
.available
);
830 f
->dump_int("total_used_bytes", osd_sum
.statfs
.get_used());
831 f
->dump_int("total_used_raw_bytes", osd_sum
.statfs
.get_used_raw());
832 f
->dump_float("total_used_raw_ratio", osd_sum
.statfs
.get_used_raw_ratio());
833 f
->dump_unsigned("num_osds", osd_sum
.num_osds
);
834 f
->dump_unsigned("num_per_pool_osds", osd_sum
.num_per_pool_osds
);
836 f
->open_object_section("stats_by_class");
837 for (auto& i
: osd_sum_by_class
) {
838 f
->open_object_section(i
.first
.c_str());
839 f
->dump_int("total_bytes", i
.second
.statfs
.total
);
840 f
->dump_int("total_avail_bytes", i
.second
.statfs
.available
);
841 f
->dump_int("total_used_bytes", i
.second
.statfs
.get_used());
842 f
->dump_int("total_used_raw_bytes", i
.second
.statfs
.get_used_raw());
843 f
->dump_float("total_used_raw_ratio",
844 i
.second
.statfs
.get_used_raw_ratio());
849 ceph_assert(ss
!= nullptr);
851 tbl
.define_column("CLASS", TextTable::LEFT
, TextTable::LEFT
);
852 tbl
.define_column("SIZE", TextTable::LEFT
, TextTable::RIGHT
);
853 tbl
.define_column("AVAIL", TextTable::LEFT
, TextTable::RIGHT
);
854 tbl
.define_column("USED", TextTable::LEFT
, TextTable::RIGHT
);
855 tbl
.define_column("RAW USED", TextTable::LEFT
, TextTable::RIGHT
);
856 tbl
.define_column("%RAW USED", TextTable::LEFT
, TextTable::RIGHT
);
859 for (auto& i
: osd_sum_by_class
) {
861 tbl
<< stringify(byte_u_t(i
.second
.statfs
.total
))
862 << stringify(byte_u_t(i
.second
.statfs
.available
))
863 << stringify(byte_u_t(i
.second
.statfs
.get_used()))
864 << stringify(byte_u_t(i
.second
.statfs
.get_used_raw()))
865 << percentify(i
.second
.statfs
.get_used_raw_ratio()*100.0)
866 << TextTable::endrow
;
869 tbl
<< stringify(byte_u_t(osd_sum
.statfs
.total
))
870 << stringify(byte_u_t(osd_sum
.statfs
.available
))
871 << stringify(byte_u_t(osd_sum
.statfs
.get_used()))
872 << stringify(byte_u_t(osd_sum
.statfs
.get_used_raw()))
873 << percentify(osd_sum
.statfs
.get_used_raw_ratio()*100.0)
874 << TextTable::endrow
;
876 *ss
<< "RAW STORAGE:\n";
882 void PGMapDigest::dump_object_stat_sum(
883 TextTable
&tbl
, Formatter
*f
,
884 const pool_stat_t
&pool_stat
, uint64_t avail
,
885 float raw_used_rate
, bool verbose
, bool per_pool
,
886 const pg_pool_t
*pool
)
888 const object_stat_sum_t
&sum
= pool_stat
.stats
.sum
;
889 const store_statfs_t statfs
= pool_stat
.store_stats
;
891 if (sum
.num_object_copies
> 0) {
892 raw_used_rate
*= (float)(sum
.num_object_copies
- sum
.num_objects_degraded
) / sum
.num_object_copies
;
895 uint64_t used_bytes
= pool_stat
.get_allocated_bytes(per_pool
);
898 // note avail passed in is raw_avail, calc raw_used here.
901 used
/= used
+ avail
;
902 } else if (used_bytes
) {
905 auto avail_res
= raw_used_rate
? avail
/ raw_used_rate
: 0;
906 // an approximation for actually stored user data
907 auto stored_normalized
= pool_stat
.get_user_bytes(raw_used_rate
, per_pool
);
909 f
->dump_int("stored", stored_normalized
);
910 f
->dump_int("objects", sum
.num_objects
);
911 f
->dump_int("kb_used", shift_round_up(used_bytes
, 10));
912 f
->dump_int("bytes_used", used_bytes
);
913 f
->dump_float("percent_used", used
);
914 f
->dump_unsigned("max_avail", avail_res
);
916 f
->dump_int("quota_objects", pool
->quota_max_objects
);
917 f
->dump_int("quota_bytes", pool
->quota_max_bytes
);
918 f
->dump_int("dirty", sum
.num_objects_dirty
);
919 f
->dump_int("rd", sum
.num_rd
);
920 f
->dump_int("rd_bytes", sum
.num_rd_kb
* 1024ull);
921 f
->dump_int("wr", sum
.num_wr
);
922 f
->dump_int("wr_bytes", sum
.num_wr_kb
* 1024ull);
923 f
->dump_int("compress_bytes_used", statfs
.data_compressed_allocated
);
924 f
->dump_int("compress_under_bytes", statfs
.data_compressed_original
);
925 // Stored by user amplified by replication
926 f
->dump_int("stored_raw", pool_stat
.get_user_bytes(1.0, per_pool
));
929 tbl
<< stringify(byte_u_t(stored_normalized
));
930 tbl
<< stringify(si_u_t(sum
.num_objects
));
931 tbl
<< stringify(byte_u_t(used_bytes
));
932 tbl
<< percentify(used
*100);
933 tbl
<< stringify(byte_u_t(avail_res
));
935 if (pool
->quota_max_objects
== 0)
938 tbl
<< stringify(si_u_t(pool
->quota_max_objects
));
940 if (pool
->quota_max_bytes
== 0)
943 tbl
<< stringify(byte_u_t(pool
->quota_max_bytes
));
945 tbl
<< stringify(si_u_t(sum
.num_objects_dirty
))
946 << stringify(byte_u_t(statfs
.data_compressed_allocated
))
947 << stringify(byte_u_t(statfs
.data_compressed_original
))
953 int64_t PGMapDigest::get_pool_free_space(const OSDMap
&osd_map
,
954 int64_t poolid
) const
956 const pg_pool_t
*pool
= osd_map
.get_pg_pool(poolid
);
957 int ruleno
= osd_map
.crush
->find_rule(pool
->get_crush_rule(),
961 avail
= get_rule_avail(ruleno
);
965 return avail
/ osd_map
.pool_raw_used_rate(poolid
);
968 int64_t PGMap::get_rule_avail(const OSDMap
& osdmap
, int ruleno
) const
971 int r
= osdmap
.crush
->get_rule_weight_osd_map(ruleno
, &wm
);
979 float fratio
= osdmap
.get_full_ratio();
982 for (auto p
= wm
.begin(); p
!= wm
.end(); ++p
) {
983 auto osd_info
= osd_stat
.find(p
->first
);
984 if (osd_info
!= osd_stat
.end()) {
985 if (osd_info
->second
.statfs
.total
== 0 || p
->second
== 0) {
986 // osd must be out, hence its stats have been zeroed
987 // (unless we somehow managed to have a disk with size 0...)
989 // (p->second == 0), if osd weight is 0, no need to
990 // calculate proj below.
993 double unusable
= (double)osd_info
->second
.statfs
.kb() *
995 double avail
= std::max(0.0, (double)osd_info
->second
.statfs
.kb_avail() - unusable
);
997 int64_t proj
= (int64_t)(avail
/ (double)p
->second
);
998 if (min
< 0 || proj
< min
) {
1002 if (osdmap
.is_up(p
->first
)) {
1003 // This is a level 4 rather than an error, because we might have
1004 // only just started, and not received the first stats message yet.
1005 dout(4) << "OSD " << p
->first
<< " is up, but has no stats" << dendl
;
1012 void PGMap::get_rules_avail(const OSDMap
& osdmap
,
1013 std::map
<int,int64_t> *avail_map
) const
1016 for (auto p
: osdmap
.get_pools()) {
1017 int64_t pool_id
= p
.first
;
1018 if ((pool_id
< 0) || (pg_pool_sum
.count(pool_id
) == 0))
1020 const pg_pool_t
*pool
= osdmap
.get_pg_pool(pool_id
);
1021 int ruleno
= osdmap
.crush
->find_rule(pool
->get_crush_rule(),
1024 if (avail_map
->count(ruleno
) == 0)
1025 (*avail_map
)[ruleno
] = get_rule_avail(osdmap
, ruleno
);
1029 // ---------------------
1032 void PGMap::Incremental::dump(Formatter
*f
) const
1034 f
->dump_unsigned("version", version
);
1035 f
->dump_stream("stamp") << stamp
;
1036 f
->dump_unsigned("osdmap_epoch", osdmap_epoch
);
1037 f
->dump_unsigned("pg_scan_epoch", pg_scan
);
1039 f
->open_array_section("pg_stat_updates");
1040 for (auto p
= pg_stat_updates
.begin(); p
!= pg_stat_updates
.end(); ++p
) {
1041 f
->open_object_section("pg_stat");
1042 f
->dump_stream("pgid") << p
->first
;
1048 f
->open_array_section("osd_stat_updates");
1049 for (auto p
= osd_stat_updates
.begin(); p
!= osd_stat_updates
.end(); ++p
) {
1050 f
->open_object_section("osd_stat");
1051 f
->dump_int("osd", p
->first
);
1056 f
->open_array_section("pool_statfs_updates");
1057 for (auto p
= pool_statfs_updates
.begin(); p
!= pool_statfs_updates
.end(); ++p
) {
1058 f
->open_object_section("pool_statfs");
1059 f
->dump_stream("poolid/osd") << p
->first
;
1065 f
->open_array_section("osd_stat_removals");
1066 for (auto p
= osd_stat_rm
.begin(); p
!= osd_stat_rm
.end(); ++p
)
1067 f
->dump_int("osd", *p
);
1070 f
->open_array_section("pg_removals");
1071 for (auto p
= pg_remove
.begin(); p
!= pg_remove
.end(); ++p
)
1072 f
->dump_stream("pgid") << *p
;
1076 void PGMap::Incremental::generate_test_instances(list
<PGMap::Incremental
*>& o
)
1078 o
.push_back(new Incremental
);
1079 o
.push_back(new Incremental
);
1080 o
.back()->version
= 1;
1081 o
.back()->stamp
= utime_t(123,345);
1082 o
.push_back(new Incremental
);
1083 o
.back()->version
= 2;
1084 o
.back()->pg_stat_updates
[pg_t(1,2)] = pg_stat_t();
1085 o
.back()->osd_stat_updates
[5] = osd_stat_t();
1086 o
.push_back(new Incremental
);
1087 o
.back()->version
= 3;
1088 o
.back()->osdmap_epoch
= 1;
1089 o
.back()->pg_scan
= 2;
1090 o
.back()->pg_stat_updates
[pg_t(4,5)] = pg_stat_t();
1091 o
.back()->osd_stat_updates
[6] = osd_stat_t();
1092 o
.back()->pg_remove
.insert(pg_t(1,2));
1093 o
.back()->osd_stat_rm
.insert(5);
1094 o
.back()->pool_statfs_updates
[std::make_pair(1234,4)] = store_statfs_t();
1099 void PGMap::apply_incremental(CephContext
*cct
, const Incremental
& inc
)
1101 ceph_assert(inc
.version
== version
+1);
1104 pool_stat_t pg_sum_old
= pg_sum
;
1105 mempool::pgmap::unordered_map
<int32_t, pool_stat_t
> pg_pool_sum_old
;
1106 pg_pool_sum_old
= pg_pool_sum
;
1108 for (auto p
= inc
.pg_stat_updates
.begin();
1109 p
!= inc
.pg_stat_updates
.end();
1111 const pg_t
&update_pg(p
->first
);
1112 auto update_pool
= update_pg
.pool();
1113 const pg_stat_t
&update_stat(p
->second
);
1115 auto pg_stat_iter
= pg_stat
.find(update_pg
);
1116 pool_stat_t
&pool_sum_ref
= pg_pool_sum
[update_pool
];
1117 if (pg_stat_iter
== pg_stat
.end()) {
1118 pg_stat
.insert(make_pair(update_pg
, update_stat
));
1120 stat_pg_sub(update_pg
, pg_stat_iter
->second
);
1121 pool_sum_ref
.sub(pg_stat_iter
->second
);
1122 pg_stat_iter
->second
= update_stat
;
1124 stat_pg_add(update_pg
, update_stat
);
1125 pool_sum_ref
.add(update_stat
);
1128 for (auto p
= inc
.pool_statfs_updates
.begin();
1129 p
!= inc
.pool_statfs_updates
.end();
1131 auto update_pool
= p
->first
.first
;
1132 auto update_osd
= p
->first
.second
;
1133 auto& statfs_inc
= p
->second
;
1135 auto pool_statfs_iter
=
1136 pool_statfs
.find(std::make_pair(update_pool
, update_osd
));
1137 if (pg_pool_sum
.count(update_pool
)) {
1138 pool_stat_t
&pool_sum_ref
= pg_pool_sum
[update_pool
];
1139 if (pool_statfs_iter
== pool_statfs
.end()) {
1140 pool_statfs
.emplace(std::make_pair(update_pool
, update_osd
), statfs_inc
);
1142 pool_sum_ref
.sub(pool_statfs_iter
->second
);
1143 pool_statfs_iter
->second
= statfs_inc
;
1145 pool_sum_ref
.add(statfs_inc
);
1149 for (auto p
= inc
.get_osd_stat_updates().begin();
1150 p
!= inc
.get_osd_stat_updates().end();
1153 const osd_stat_t
&new_stats(p
->second
);
1155 auto t
= osd_stat
.find(osd
);
1156 if (t
== osd_stat
.end()) {
1157 osd_stat
.insert(make_pair(osd
, new_stats
));
1159 stat_osd_sub(t
->first
, t
->second
);
1160 t
->second
= new_stats
;
1162 stat_osd_add(osd
, new_stats
);
1164 set
<int64_t> deleted_pools
;
1165 for (auto p
= inc
.pg_remove
.begin();
1166 p
!= inc
.pg_remove
.end();
1168 const pg_t
&removed_pg(*p
);
1169 auto s
= pg_stat
.find(removed_pg
);
1170 bool pool_erased
= false;
1171 if (s
!= pg_stat
.end()) {
1172 pool_erased
= stat_pg_sub(removed_pg
, s
->second
);
1175 deleted_pools
.insert(removed_pg
.pool());
1180 for (auto p
= inc
.get_osd_stat_rm().begin();
1181 p
!= inc
.get_osd_stat_rm().end();
1183 auto t
= osd_stat
.find(*p
);
1184 if (t
!= osd_stat
.end()) {
1185 stat_osd_sub(t
->first
, t
->second
);
1188 for (auto i
= pool_statfs
.begin(); i
!= pool_statfs
.end(); ++i
) {
1189 if (i
->first
.second
== *p
) {
1190 pg_pool_sum
[i
->first
.first
].sub(i
->second
);
1191 pool_statfs
.erase(i
);
1196 // skip calculating delta while sum was not synchronized
1197 if (!stamp
.is_zero() && !pg_sum_old
.stats
.sum
.is_zero()) {
1199 delta_t
= inc
.stamp
;
1201 // calculate a delta, and average over the last 2 deltas.
1202 pool_stat_t d
= pg_sum
;
1203 d
.stats
.sub(pg_sum_old
.stats
);
1204 pg_sum_deltas
.push_back(make_pair(d
, delta_t
));
1205 stamp_delta
+= delta_t
;
1206 pg_sum_delta
.stats
.add(d
.stats
);
1207 auto smooth_intervals
=
1208 cct
? cct
->_conf
.get_val
<uint64_t>("mon_stat_smooth_intervals") : 1;
1209 while (pg_sum_deltas
.size() > smooth_intervals
) {
1210 pg_sum_delta
.stats
.sub(pg_sum_deltas
.front().first
.stats
);
1211 stamp_delta
-= pg_sum_deltas
.front().second
;
1212 pg_sum_deltas
.pop_front();
1217 update_pool_deltas(cct
, inc
.stamp
, pg_pool_sum_old
);
1219 for (auto p
: deleted_pools
) {
1221 dout(20) << " deleted pool " << p
<< dendl
;
1225 if (inc
.osdmap_epoch
)
1226 last_osdmap_epoch
= inc
.osdmap_epoch
;
1228 last_pg_scan
= inc
.pg_scan
;
1231 void PGMap::calc_stats()
1237 pg_pool_sum
.clear();
1238 num_pg_by_pool
.clear();
1240 pg_sum
= pool_stat_t();
1241 osd_sum
= osd_stat_t();
1242 osd_sum_by_class
.clear();
1243 num_pg_by_state
.clear();
1244 num_pg_by_pool_state
.clear();
1245 num_pg_by_osd
.clear();
1247 for (auto p
= pg_stat
.begin();
1251 stat_pg_add(pg
, p
->second
);
1252 pg_pool_sum
[pg
.pool()].add(p
->second
);
1254 for (auto p
= pool_statfs
.begin();
1255 p
!= pool_statfs
.end();
1257 auto pool
= p
->first
.first
;
1258 pg_pool_sum
[pool
].add(p
->second
);
1260 for (auto p
= osd_stat
.begin();
1261 p
!= osd_stat
.end();
1263 stat_osd_add(p
->first
, p
->second
);
1266 void PGMap::stat_pg_add(const pg_t
&pgid
, const pg_stat_t
&s
,
1269 auto pool
= pgid
.pool();
1273 num_pg_by_state
[s
.state
]++;
1274 num_pg_by_pool_state
[pgid
.pool()][s
.state
]++;
1275 num_pg_by_pool
[pool
]++;
1277 if ((s
.state
& PG_STATE_CREATING
) &&
1278 s
.parent_split_bits
== 0) {
1279 creating_pgs
.insert(pgid
);
1280 if (s
.acting_primary
>= 0) {
1281 creating_pgs_by_osd_epoch
[s
.acting_primary
][s
.mapping_epoch
].insert(pgid
);
1285 if (s
.state
& PG_STATE_ACTIVE
) {
1295 for (auto p
= s
.blocked_by
.begin();
1296 p
!= s
.blocked_by
.end();
1298 ++blocked_by_sum
[*p
];
1301 for (auto p
= s
.acting
.begin(); p
!= s
.acting
.end(); ++p
) {
1302 pg_by_osd
[*p
].insert(pgid
);
1303 num_pg_by_osd
[*p
].acting
++;
1305 for (auto p
= s
.up
.begin(); p
!= s
.up
.end(); ++p
) {
1306 auto& t
= pg_by_osd
[*p
];
1307 if (t
.find(pgid
) == t
.end()) {
1309 num_pg_by_osd
[*p
].up_not_acting
++;
1313 if (s
.up_primary
>= 0) {
1314 num_pg_by_osd
[s
.up_primary
].primary
++;
1318 bool PGMap::stat_pg_sub(const pg_t
&pgid
, const pg_stat_t
&s
,
1321 bool pool_erased
= false;
1325 int end
= --num_pg_by_state
[s
.state
];
1326 ceph_assert(end
>= 0);
1328 num_pg_by_state
.erase(s
.state
);
1329 if (--num_pg_by_pool_state
[pgid
.pool()][s
.state
] == 0) {
1330 num_pg_by_pool_state
[pgid
.pool()].erase(s
.state
);
1332 end
= --num_pg_by_pool
[pgid
.pool()];
1337 if ((s
.state
& PG_STATE_CREATING
) &&
1338 s
.parent_split_bits
== 0) {
1339 creating_pgs
.erase(pgid
);
1340 if (s
.acting_primary
>= 0) {
1341 map
<epoch_t
,set
<pg_t
> >& r
= creating_pgs_by_osd_epoch
[s
.acting_primary
];
1342 r
[s
.mapping_epoch
].erase(pgid
);
1343 if (r
[s
.mapping_epoch
].empty())
1344 r
.erase(s
.mapping_epoch
);
1346 creating_pgs_by_osd_epoch
.erase(s
.acting_primary
);
1350 if (s
.state
& PG_STATE_ACTIVE
) {
1360 for (auto p
= s
.blocked_by
.begin();
1361 p
!= s
.blocked_by
.end();
1363 auto q
= blocked_by_sum
.find(*p
);
1364 ceph_assert(q
!= blocked_by_sum
.end());
1367 blocked_by_sum
.erase(q
);
1370 set
<int32_t> actingset
;
1371 for (auto p
= s
.acting
.begin(); p
!= s
.acting
.end(); ++p
) {
1372 actingset
.insert(*p
);
1373 auto& oset
= pg_by_osd
[*p
];
1376 pg_by_osd
.erase(*p
);
1377 auto it
= num_pg_by_osd
.find(*p
);
1378 if (it
!= num_pg_by_osd
.end() && it
->second
.acting
> 0)
1379 it
->second
.acting
--;
1381 for (auto p
= s
.up
.begin(); p
!= s
.up
.end(); ++p
) {
1382 auto& oset
= pg_by_osd
[*p
];
1385 pg_by_osd
.erase(*p
);
1386 if (actingset
.count(*p
))
1388 auto it
= num_pg_by_osd
.find(*p
);
1389 if (it
!= num_pg_by_osd
.end() && it
->second
.up_not_acting
> 0)
1390 it
->second
.up_not_acting
--;
1393 if (s
.up_primary
>= 0) {
1394 auto it
= num_pg_by_osd
.find(s
.up_primary
);
1395 if (it
!= num_pg_by_osd
.end() && it
->second
.primary
> 0)
1396 it
->second
.primary
--;
1401 void PGMap::calc_purged_snaps()
1403 purged_snaps
.clear();
1404 set
<int64_t> unknown
;
1405 for (auto& i
: pg_stat
) {
1406 if (i
.second
.state
== 0) {
1407 unknown
.insert(i
.first
.pool());
1408 purged_snaps
.erase(i
.first
.pool());
1410 } else if (unknown
.count(i
.first
.pool())) {
1413 auto j
= purged_snaps
.find(i
.first
.pool());
1414 if (j
== purged_snaps
.end()) {
1416 purged_snaps
[i
.first
.pool()] = i
.second
.purged_snaps
;
1418 j
->second
.intersection_of(i
.second
.purged_snaps
);
1423 void PGMap::calc_osd_sum_by_class(const OSDMap
& osdmap
)
1425 osd_sum_by_class
.clear();
1426 for (auto& i
: osd_stat
) {
1427 const char *class_name
= osdmap
.crush
->get_item_class(i
.first
);
1429 osd_sum_by_class
[class_name
].add(i
.second
);
1434 void PGMap::stat_osd_add(int osd
, const osd_stat_t
&s
)
1438 if (osd
>= (int)osd_last_seq
.size()) {
1439 osd_last_seq
.resize(osd
+ 1);
1441 osd_last_seq
[osd
] = s
.seq
;
1444 void PGMap::stat_osd_sub(int osd
, const osd_stat_t
&s
)
1448 ceph_assert(osd
< (int)osd_last_seq
.size());
1449 osd_last_seq
[osd
] = 0;
1452 void PGMap::encode_digest(const OSDMap
& osdmap
,
1453 bufferlist
& bl
, uint64_t features
)
1455 get_rules_avail(osdmap
, &avail_space_by_rule
);
1456 calc_osd_sum_by_class(osdmap
);
1457 calc_purged_snaps();
1458 PGMapDigest::encode(bl
, features
);
1461 void PGMap::encode(bufferlist
&bl
, uint64_t features
) const
1463 ENCODE_START(8, 8, bl
);
1464 encode(version
, bl
);
1465 encode(pg_stat
, bl
);
1466 encode(osd_stat
, bl
, features
);
1467 encode(last_osdmap_epoch
, bl
);
1468 encode(last_pg_scan
, bl
);
1470 encode(pool_statfs
, bl
, features
);
1474 void PGMap::decode(bufferlist::const_iterator
&bl
)
1476 DECODE_START(8, bl
);
1477 decode(version
, bl
);
1478 decode(pg_stat
, bl
);
1479 decode(osd_stat
, bl
);
1480 decode(last_osdmap_epoch
, bl
);
1481 decode(last_pg_scan
, bl
);
1483 decode(pool_statfs
, bl
);
1489 void PGMap::dump(Formatter
*f
) const
1492 dump_pg_stats(f
, false);
1497 void PGMap::dump_basic(Formatter
*f
) const
1499 f
->dump_unsigned("version", version
);
1500 f
->dump_stream("stamp") << stamp
;
1501 f
->dump_unsigned("last_osdmap_epoch", last_osdmap_epoch
);
1502 f
->dump_unsigned("last_pg_scan", last_pg_scan
);
1504 f
->open_object_section("pg_stats_sum");
1508 f
->open_object_section("osd_stats_sum");
1515 void PGMap::dump_delta(Formatter
*f
) const
1517 f
->open_object_section("pg_stats_delta");
1518 pg_sum_delta
.dump(f
);
1519 f
->dump_stream("stamp_delta") << stamp_delta
;
1523 void PGMap::dump_pg_stats(Formatter
*f
, bool brief
) const
1525 f
->open_array_section("pg_stats");
1526 for (auto i
= pg_stat
.begin();
1529 f
->open_object_section("pg_stat");
1530 f
->dump_stream("pgid") << i
->first
;
1532 i
->second
.dump_brief(f
);
1540 void PGMap::dump_pool_stats(Formatter
*f
) const
1542 f
->open_array_section("pool_stats");
1543 for (auto p
= pg_pool_sum
.begin();
1544 p
!= pg_pool_sum
.end();
1546 f
->open_object_section("pool_stat");
1547 f
->dump_int("poolid", p
->first
);
1548 auto q
= num_pg_by_pool
.find(p
->first
);
1549 if (q
!= num_pg_by_pool
.end())
1550 f
->dump_unsigned("num_pg", q
->second
);
1557 void PGMap::dump_osd_stats(Formatter
*f
) const
1559 f
->open_array_section("osd_stats");
1560 for (auto q
= osd_stat
.begin();
1561 q
!= osd_stat
.end();
1563 f
->open_object_section("osd_stat");
1564 f
->dump_int("osd", q
->first
);
1571 void PGMap::dump_pg_stats_plain(
1573 const mempool::pgmap::unordered_map
<pg_t
, pg_stat_t
>& pg_stats
,
1579 tab
.define_column("PG_STAT", TextTable::LEFT
, TextTable::LEFT
);
1580 tab
.define_column("STATE", TextTable::LEFT
, TextTable::RIGHT
);
1581 tab
.define_column("UP", TextTable::LEFT
, TextTable::RIGHT
);
1582 tab
.define_column("UP_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1583 tab
.define_column("ACTING", TextTable::LEFT
, TextTable::RIGHT
);
1584 tab
.define_column("ACTING_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1587 tab
.define_column("PG_STAT", TextTable::LEFT
, TextTable::LEFT
);
1588 tab
.define_column("OBJECTS", TextTable::LEFT
, TextTable::RIGHT
);
1589 tab
.define_column("MISSING_ON_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1590 tab
.define_column("DEGRADED", TextTable::LEFT
, TextTable::RIGHT
);
1591 tab
.define_column("MISPLACED", TextTable::LEFT
, TextTable::RIGHT
);
1592 tab
.define_column("UNFOUND", TextTable::LEFT
, TextTable::RIGHT
);
1593 tab
.define_column("BYTES", TextTable::LEFT
, TextTable::RIGHT
);
1594 tab
.define_column("OMAP_BYTES*", TextTable::LEFT
, TextTable::RIGHT
);
1595 tab
.define_column("OMAP_KEYS*", TextTable::LEFT
, TextTable::RIGHT
);
1596 tab
.define_column("LOG", TextTable::LEFT
, TextTable::RIGHT
);
1597 tab
.define_column("DISK_LOG", TextTable::LEFT
, TextTable::RIGHT
);
1598 tab
.define_column("STATE", TextTable::LEFT
, TextTable::RIGHT
);
1599 tab
.define_column("STATE_STAMP", TextTable::LEFT
, TextTable::RIGHT
);
1600 tab
.define_column("VERSION", TextTable::LEFT
, TextTable::RIGHT
);
1601 tab
.define_column("REPORTED", TextTable::LEFT
, TextTable::RIGHT
);
1602 tab
.define_column("UP", TextTable::LEFT
, TextTable::RIGHT
);
1603 tab
.define_column("UP_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1604 tab
.define_column("ACTING", TextTable::LEFT
, TextTable::RIGHT
);
1605 tab
.define_column("ACTING_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1606 tab
.define_column("LAST_SCRUB", TextTable::LEFT
, TextTable::RIGHT
);
1607 tab
.define_column("SCRUB_STAMP", TextTable::LEFT
, TextTable::RIGHT
);
1608 tab
.define_column("LAST_DEEP_SCRUB", TextTable::LEFT
, TextTable::RIGHT
);
1609 tab
.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT
, TextTable::RIGHT
);
1610 tab
.define_column("SNAPTRIMQ_LEN", TextTable::LEFT
, TextTable::RIGHT
);
1613 for (auto i
= pg_stats
.begin();
1614 i
!= pg_stats
.end(); ++i
) {
1615 const pg_stat_t
&st(i
->second
);
1618 << pg_state_string(st
.state
)
1622 << st
.acting_primary
1623 << TextTable::endrow
;
1625 ostringstream reported
;
1626 reported
<< st
.reported_epoch
<< ":" << st
.reported_seq
;
1629 << st
.stats
.sum
.num_objects
1630 << st
.stats
.sum
.num_objects_missing_on_primary
1631 << st
.stats
.sum
.num_objects_degraded
1632 << st
.stats
.sum
.num_objects_misplaced
1633 << st
.stats
.sum
.num_objects_unfound
1634 << st
.stats
.sum
.num_bytes
1635 << st
.stats
.sum
.num_omap_bytes
1636 << st
.stats
.sum
.num_omap_keys
1638 << st
.ondisk_log_size
1639 << pg_state_string(st
.state
)
1643 << pg_vector_string(st
.up
)
1645 << pg_vector_string(st
.acting
)
1646 << st
.acting_primary
1648 << st
.last_scrub_stamp
1649 << st
.last_deep_scrub
1650 << st
.last_deep_scrub_stamp
1652 << TextTable::endrow
;
1659 void PGMap::dump(ostream
& ss
) const
1662 dump_pg_stats(ss
, false);
1663 dump_pool_stats(ss
, false);
1664 dump_pg_sum_stats(ss
, false);
1668 void PGMap::dump_basic(ostream
& ss
) const
1670 ss
<< "version " << version
<< std::endl
;
1671 ss
<< "stamp " << stamp
<< std::endl
;
1672 ss
<< "last_osdmap_epoch " << last_osdmap_epoch
<< std::endl
;
1673 ss
<< "last_pg_scan " << last_pg_scan
<< std::endl
;
1676 void PGMap::dump_pg_stats(ostream
& ss
, bool brief
) const
1678 dump_pg_stats_plain(ss
, pg_stat
, brief
);
1681 void PGMap::dump_pool_stats(ostream
& ss
, bool header
) const
1686 tab
.define_column("POOLID", TextTable::LEFT
, TextTable::LEFT
);
1687 tab
.define_column("OBJECTS", TextTable::LEFT
, TextTable::RIGHT
);
1688 tab
.define_column("MISSING_ON_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1689 tab
.define_column("DEGRADED", TextTable::LEFT
, TextTable::RIGHT
);
1690 tab
.define_column("MISPLACED", TextTable::LEFT
, TextTable::RIGHT
);
1691 tab
.define_column("UNFOUND", TextTable::LEFT
, TextTable::RIGHT
);
1692 tab
.define_column("BYTES", TextTable::LEFT
, TextTable::RIGHT
);
1693 tab
.define_column("OMAP_BYTES*", TextTable::LEFT
, TextTable::RIGHT
);
1694 tab
.define_column("OMAP_KEYS*", TextTable::LEFT
, TextTable::RIGHT
);
1695 tab
.define_column("LOG", TextTable::LEFT
, TextTable::RIGHT
);
1696 tab
.define_column("DISK_LOG", TextTable::LEFT
, TextTable::RIGHT
);
1698 tab
.define_column("", TextTable::LEFT
, TextTable::LEFT
);
1699 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1700 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1701 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1702 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1703 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1704 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1705 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1706 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1707 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1708 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1711 for (auto p
= pg_pool_sum
.begin();
1712 p
!= pg_pool_sum
.end();
1715 << p
->second
.stats
.sum
.num_objects
1716 << p
->second
.stats
.sum
.num_objects_missing_on_primary
1717 << p
->second
.stats
.sum
.num_objects_degraded
1718 << p
->second
.stats
.sum
.num_objects_misplaced
1719 << p
->second
.stats
.sum
.num_objects_unfound
1720 << p
->second
.stats
.sum
.num_bytes
1721 << p
->second
.stats
.sum
.num_omap_bytes
1722 << p
->second
.stats
.sum
.num_omap_keys
1723 << p
->second
.log_size
1724 << p
->second
.ondisk_log_size
1725 << TextTable::endrow
;
1731 void PGMap::dump_pg_sum_stats(ostream
& ss
, bool header
) const
1736 tab
.define_column("PG_STAT", TextTable::LEFT
, TextTable::LEFT
);
1737 tab
.define_column("OBJECTS", TextTable::LEFT
, TextTable::RIGHT
);
1738 tab
.define_column("MISSING_ON_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1739 tab
.define_column("DEGRADED", TextTable::LEFT
, TextTable::RIGHT
);
1740 tab
.define_column("MISPLACED", TextTable::LEFT
, TextTable::RIGHT
);
1741 tab
.define_column("UNFOUND", TextTable::LEFT
, TextTable::RIGHT
);
1742 tab
.define_column("BYTES", TextTable::LEFT
, TextTable::RIGHT
);
1743 tab
.define_column("OMAP_BYTES*", TextTable::LEFT
, TextTable::RIGHT
);
1744 tab
.define_column("OMAP_KEYS*", TextTable::LEFT
, TextTable::RIGHT
);
1745 tab
.define_column("LOG", TextTable::LEFT
, TextTable::RIGHT
);
1746 tab
.define_column("DISK_LOG", TextTable::LEFT
, TextTable::RIGHT
);
1748 tab
.define_column("", TextTable::LEFT
, TextTable::LEFT
);
1749 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1750 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1751 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1752 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1753 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1754 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1755 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1756 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1757 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1758 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1762 << pg_sum
.stats
.sum
.num_objects
1763 << pg_sum
.stats
.sum
.num_objects_missing_on_primary
1764 << pg_sum
.stats
.sum
.num_objects_degraded
1765 << pg_sum
.stats
.sum
.num_objects_misplaced
1766 << pg_sum
.stats
.sum
.num_objects_unfound
1767 << pg_sum
.stats
.sum
.num_bytes
1768 << pg_sum
.stats
.sum
.num_omap_bytes
1769 << pg_sum
.stats
.sum
.num_omap_keys
1771 << pg_sum
.ondisk_log_size
1772 << TextTable::endrow
;
1777 void PGMap::dump_osd_stats(ostream
& ss
) const
1781 tab
.define_column("OSD_STAT", TextTable::LEFT
, TextTable::LEFT
);
1782 tab
.define_column("USED", TextTable::LEFT
, TextTable::RIGHT
);
1783 tab
.define_column("AVAIL", TextTable::LEFT
, TextTable::RIGHT
);
1784 tab
.define_column("USED_RAW", TextTable::LEFT
, TextTable::RIGHT
);
1785 tab
.define_column("TOTAL", TextTable::LEFT
, TextTable::RIGHT
);
1786 tab
.define_column("HB_PEERS", TextTable::LEFT
, TextTable::RIGHT
);
1787 tab
.define_column("PG_SUM", TextTable::LEFT
, TextTable::RIGHT
);
1788 tab
.define_column("PRIMARY_PG_SUM", TextTable::LEFT
, TextTable::RIGHT
);
1790 for (auto p
= osd_stat
.begin();
1791 p
!= osd_stat
.end();
1794 << byte_u_t(p
->second
.statfs
.get_used())
1795 << byte_u_t(p
->second
.statfs
.available
)
1796 << byte_u_t(p
->second
.statfs
.get_used_raw())
1797 << byte_u_t(p
->second
.statfs
.total
)
1798 << p
->second
.hb_peers
1799 << get_num_pg_by_osd(p
->first
)
1800 << get_num_primary_pg_by_osd(p
->first
)
1801 << TextTable::endrow
;
1805 << byte_u_t(osd_sum
.statfs
.get_used())
1806 << byte_u_t(osd_sum
.statfs
.available
)
1807 << byte_u_t(osd_sum
.statfs
.get_used_raw())
1808 << byte_u_t(osd_sum
.statfs
.total
)
1809 << TextTable::endrow
;
1814 void PGMap::dump_osd_sum_stats(ostream
& ss
) const
1818 tab
.define_column("OSD_STAT", TextTable::LEFT
, TextTable::LEFT
);
1819 tab
.define_column("USED", TextTable::LEFT
, TextTable::RIGHT
);
1820 tab
.define_column("AVAIL", TextTable::LEFT
, TextTable::RIGHT
);
1821 tab
.define_column("USED_RAW", TextTable::LEFT
, TextTable::RIGHT
);
1822 tab
.define_column("TOTAL", TextTable::LEFT
, TextTable::RIGHT
);
1825 << byte_u_t(osd_sum
.statfs
.get_used())
1826 << byte_u_t(osd_sum
.statfs
.available
)
1827 << byte_u_t(osd_sum
.statfs
.get_used_raw())
1828 << byte_u_t(osd_sum
.statfs
.total
)
1829 << TextTable::endrow
;
1834 void PGMap::get_stuck_stats(
1835 int types
, const utime_t cutoff
,
1836 mempool::pgmap::unordered_map
<pg_t
, pg_stat_t
>& stuck_pgs
) const
1838 ceph_assert(types
!= 0);
1839 for (auto i
= pg_stat
.begin();
1842 utime_t val
= cutoff
; // don't care about >= cutoff so that is infinity
1844 if ((types
& STUCK_INACTIVE
) && !(i
->second
.state
& PG_STATE_ACTIVE
)) {
1845 if (i
->second
.last_active
< val
)
1846 val
= i
->second
.last_active
;
1849 if ((types
& STUCK_UNCLEAN
) && !(i
->second
.state
& PG_STATE_CLEAN
)) {
1850 if (i
->second
.last_clean
< val
)
1851 val
= i
->second
.last_clean
;
1854 if ((types
& STUCK_DEGRADED
) && (i
->second
.state
& PG_STATE_DEGRADED
)) {
1855 if (i
->second
.last_undegraded
< val
)
1856 val
= i
->second
.last_undegraded
;
1859 if ((types
& STUCK_UNDERSIZED
) && (i
->second
.state
& PG_STATE_UNDERSIZED
)) {
1860 if (i
->second
.last_fullsized
< val
)
1861 val
= i
->second
.last_fullsized
;
1864 if ((types
& STUCK_STALE
) && (i
->second
.state
& PG_STATE_STALE
)) {
1865 if (i
->second
.last_unstale
< val
)
1866 val
= i
->second
.last_unstale
;
1869 // val is now the earliest any of the requested stuck states began
1871 stuck_pgs
[i
->first
] = i
->second
;
1876 bool PGMap::get_stuck_counts(const utime_t cutoff
, map
<string
, int>& note
) const
1884 for (auto i
= pg_stat
.begin();
1887 if (! (i
->second
.state
& PG_STATE_ACTIVE
)) {
1888 if (i
->second
.last_active
< cutoff
)
1891 if (! (i
->second
.state
& PG_STATE_CLEAN
)) {
1892 if (i
->second
.last_clean
< cutoff
)
1895 if (i
->second
.state
& PG_STATE_DEGRADED
) {
1896 if (i
->second
.last_undegraded
< cutoff
)
1899 if (i
->second
.state
& PG_STATE_UNDERSIZED
) {
1900 if (i
->second
.last_fullsized
< cutoff
)
1903 if (i
->second
.state
& PG_STATE_STALE
) {
1904 if (i
->second
.last_unstale
< cutoff
)
1910 note
["stuck inactive"] = inactive
;
1913 note
["stuck unclean"] = unclean
;
1916 note
["stuck undersized"] = undersized
;
1919 note
["stuck degraded"] = degraded
;
1922 note
["stuck stale"] = stale
;
1924 return inactive
|| unclean
|| undersized
|| degraded
|| stale
;
1927 void PGMap::dump_stuck(Formatter
*f
, int types
, utime_t cutoff
) const
1929 mempool::pgmap::unordered_map
<pg_t
, pg_stat_t
> stuck_pg_stats
;
1930 get_stuck_stats(types
, cutoff
, stuck_pg_stats
);
1931 f
->open_array_section("stuck_pg_stats");
1932 for (auto i
= stuck_pg_stats
.begin();
1933 i
!= stuck_pg_stats
.end();
1935 f
->open_object_section("pg_stat");
1936 f
->dump_stream("pgid") << i
->first
;
1943 void PGMap::dump_stuck_plain(ostream
& ss
, int types
, utime_t cutoff
) const
1945 mempool::pgmap::unordered_map
<pg_t
, pg_stat_t
> stuck_pg_stats
;
1946 get_stuck_stats(types
, cutoff
, stuck_pg_stats
);
1947 if (!stuck_pg_stats
.empty())
1948 dump_pg_stats_plain(ss
, stuck_pg_stats
, true);
1951 int PGMap::dump_stuck_pg_stats(
1955 vector
<string
>& args
) const
1957 int stuck_types
= 0;
1959 for (auto i
= args
.begin(); i
!= args
.end(); ++i
) {
1960 if (*i
== "inactive")
1961 stuck_types
|= PGMap::STUCK_INACTIVE
;
1962 else if (*i
== "unclean")
1963 stuck_types
|= PGMap::STUCK_UNCLEAN
;
1964 else if (*i
== "undersized")
1965 stuck_types
|= PGMap::STUCK_UNDERSIZED
;
1966 else if (*i
== "degraded")
1967 stuck_types
|= PGMap::STUCK_DEGRADED
;
1968 else if (*i
== "stale")
1969 stuck_types
|= PGMap::STUCK_STALE
;
1971 ds
<< "Unknown type: " << *i
<< std::endl
;
1976 utime_t
now(ceph_clock_now());
1977 utime_t cutoff
= now
- utime_t(threshold
, 0);
1980 dump_stuck_plain(ds
, stuck_types
, cutoff
);
1982 dump_stuck(f
, stuck_types
, cutoff
);
1989 void PGMap::dump_osd_perf_stats(Formatter
*f
) const
1991 f
->open_array_section("osd_perf_infos");
1992 for (auto i
= osd_stat
.begin();
1993 i
!= osd_stat
.end();
1995 f
->open_object_section("osd");
1996 f
->dump_int("id", i
->first
);
1998 f
->open_object_section("perf_stats");
1999 i
->second
.os_perf_stat
.dump(f
);
2006 void PGMap::print_osd_perf_stats(std::ostream
*ss
) const
2009 tab
.define_column("osd", TextTable::LEFT
, TextTable::RIGHT
);
2010 tab
.define_column("commit_latency(ms)", TextTable::LEFT
, TextTable::RIGHT
);
2011 tab
.define_column("apply_latency(ms)", TextTable::LEFT
, TextTable::RIGHT
);
2012 for (auto i
= osd_stat
.begin();
2013 i
!= osd_stat
.end();
2016 tab
<< i
->second
.os_perf_stat
.os_commit_latency_ns
/ 1000000ull;
2017 tab
<< i
->second
.os_perf_stat
.os_apply_latency_ns
/ 1000000ull;
2018 tab
<< TextTable::endrow
;
2023 void PGMap::dump_osd_blocked_by_stats(Formatter
*f
) const
2025 f
->open_array_section("osd_blocked_by_infos");
2026 for (auto i
= blocked_by_sum
.begin();
2027 i
!= blocked_by_sum
.end();
2029 f
->open_object_section("osd");
2030 f
->dump_int("id", i
->first
);
2031 f
->dump_int("num_blocked", i
->second
);
2036 void PGMap::print_osd_blocked_by_stats(std::ostream
*ss
) const
2039 tab
.define_column("osd", TextTable::LEFT
, TextTable::RIGHT
);
2040 tab
.define_column("num_blocked", TextTable::LEFT
, TextTable::RIGHT
);
2041 for (auto i
= blocked_by_sum
.begin();
2042 i
!= blocked_by_sum
.end();
2046 tab
<< TextTable::endrow
;
2053 * update aggregated delta
2055 * @param cct ceph context
2056 * @param ts Timestamp for the stats being delta'ed
2057 * @param old_pool_sum Previous stats sum
2058 * @param last_ts Last timestamp for pool
2059 * @param result_pool_sum Resulting stats
2060 * @param result_pool_delta Resulting pool delta
2061 * @param result_ts_delta Resulting timestamp delta
2062 * @param delta_avg_list List of last N computed deltas, used to average
2064 void PGMap::update_delta(
2067 const pool_stat_t
& old_pool_sum
,
2069 const pool_stat_t
& current_pool_sum
,
2070 pool_stat_t
*result_pool_delta
,
2071 utime_t
*result_ts_delta
,
2072 mempool::pgmap::list
<pair
<pool_stat_t
,utime_t
> > *delta_avg_list
)
2074 /* @p ts is the timestamp we want to associate with the data
2075 * in @p old_pool_sum, and on which we will base ourselves to
2076 * calculate the delta, stored in 'delta_t'.
2079 delta_t
= ts
; // start with the provided timestamp
2080 delta_t
-= *last_ts
; // take the last timestamp we saw
2081 *last_ts
= ts
; // @p ts becomes the last timestamp we saw
2083 // adjust delta_t, quick start if there is no update in a long period
2084 delta_t
= std::min(delta_t
,
2085 utime_t(2 * (cct
? cct
->_conf
->mon_delta_reset_interval
: 10), 0));
2087 // calculate a delta, and average over the last 6 deltas by default.
2088 /* start by taking a copy of our current @p result_pool_sum, and by
2089 * taking out the stats from @p old_pool_sum. This generates a stats
2090 * delta. Stash this stats delta in @p delta_avg_list, along with the
2091 * timestamp delta for these results.
2093 pool_stat_t d
= current_pool_sum
;
2094 d
.stats
.sub(old_pool_sum
.stats
);
2096 /* Aggregate current delta, and take out the last seen delta (if any) to
2098 * Skip calculating delta while sum was not synchronized.
2100 if(!old_pool_sum
.stats
.sum
.is_zero()) {
2101 delta_avg_list
->push_back(make_pair(d
,delta_t
));
2102 *result_ts_delta
+= delta_t
;
2103 result_pool_delta
->stats
.add(d
.stats
);
2105 size_t s
= cct
? cct
->_conf
.get_val
<uint64_t>("mon_stat_smooth_intervals") : 1;
2106 while (delta_avg_list
->size() > s
) {
2107 result_pool_delta
->stats
.sub(delta_avg_list
->front().first
.stats
);
2108 *result_ts_delta
-= delta_avg_list
->front().second
;
2109 delta_avg_list
->pop_front();
2114 * Update a given pool's deltas
2116 * @param cct Ceph Context
2117 * @param ts Timestamp for the stats being delta'ed
2118 * @param pool Pool's id
2119 * @param old_pool_sum Previous stats sum
2121 void PGMap::update_one_pool_delta(
2125 const pool_stat_t
& old_pool_sum
)
2127 if (per_pool_sum_deltas
.count(pool
) == 0) {
2128 ceph_assert(per_pool_sum_deltas_stamps
.count(pool
) == 0);
2129 ceph_assert(per_pool_sum_delta
.count(pool
) == 0);
2132 auto& sum_delta
= per_pool_sum_delta
[pool
];
2134 update_delta(cct
, ts
, old_pool_sum
, &sum_delta
.second
, pg_pool_sum
[pool
],
2135 &sum_delta
.first
, &per_pool_sum_deltas_stamps
[pool
],
2136 &per_pool_sum_deltas
[pool
]);
2140 * Update pools' deltas
2142 * @param cct CephContext
2143 * @param ts Timestamp for the stats being delta'ed
2144 * @param pg_pool_sum_old Map of pool stats for delta calcs.
2146 void PGMap::update_pool_deltas(
2147 CephContext
*cct
, const utime_t ts
,
2148 const mempool::pgmap::unordered_map
<int32_t,pool_stat_t
>& pg_pool_sum_old
)
2150 for (auto it
= pg_pool_sum_old
.begin();
2151 it
!= pg_pool_sum_old
.end(); ++it
) {
2152 update_one_pool_delta(cct
, ts
, it
->first
, it
->second
);
2156 void PGMap::clear_delta()
2158 pg_sum_delta
= pool_stat_t();
2159 pg_sum_deltas
.clear();
2160 stamp_delta
= utime_t();
2163 void PGMap::generate_test_instances(list
<PGMap
*>& o
)
2165 o
.push_back(new PGMap
);
2166 list
<Incremental
*> inc
;
2167 Incremental::generate_test_instances(inc
);
2170 while (!inc
.empty()) {
2171 PGMap
*pmp
= new PGMap();
2174 o
.back()->apply_incremental(NULL
, *inc
.front());
2180 void PGMap::get_filtered_pg_stats(uint64_t state
, int64_t poolid
, int64_t osdid
,
2181 bool primary
, set
<pg_t
>& pgs
) const
2183 for (auto i
= pg_stat
.begin();
2186 if ((poolid
>= 0) && (poolid
!= i
->first
.pool()))
2188 if ((osdid
>= 0) && !(i
->second
.is_acting_osd(osdid
,primary
)))
2190 if (state
== (uint64_t)-1 || // "all"
2191 (i
->second
.state
& state
) || // matches a state bit
2192 (state
== 0 && i
->second
.state
== 0)) { // matches "unknown" (== 0)
2193 pgs
.insert(i
->first
);
2198 void PGMap::dump_filtered_pg_stats(Formatter
*f
, set
<pg_t
>& pgs
) const
2200 f
->open_array_section("pg_stats");
2201 for (auto i
= pgs
.begin(); i
!= pgs
.end(); ++i
) {
2202 const pg_stat_t
& st
= pg_stat
.at(*i
);
2203 f
->open_object_section("pg_stat");
2204 f
->dump_stream("pgid") << *i
;
2211 void PGMap::dump_filtered_pg_stats(ostream
& ss
, set
<pg_t
>& pgs
) const
2214 utime_t now
= ceph_clock_now();
2216 tab
.define_column("PG", TextTable::LEFT
, TextTable::LEFT
);
2217 tab
.define_column("OBJECTS", TextTable::LEFT
, TextTable::RIGHT
);
2218 tab
.define_column("DEGRADED", TextTable::LEFT
, TextTable::RIGHT
);
2219 tab
.define_column("MISPLACED", TextTable::LEFT
, TextTable::RIGHT
);
2220 tab
.define_column("UNFOUND", TextTable::LEFT
, TextTable::RIGHT
);
2221 tab
.define_column("BYTES", TextTable::LEFT
, TextTable::RIGHT
);
2222 tab
.define_column("OMAP_BYTES*", TextTable::LEFT
, TextTable::RIGHT
);
2223 tab
.define_column("OMAP_KEYS*", TextTable::LEFT
, TextTable::RIGHT
);
2224 tab
.define_column("LOG", TextTable::LEFT
, TextTable::RIGHT
);
2225 tab
.define_column("STATE", TextTable::LEFT
, TextTable::RIGHT
);
2226 tab
.define_column("SINCE", TextTable::LEFT
, TextTable::RIGHT
);
2227 tab
.define_column("VERSION", TextTable::LEFT
, TextTable::RIGHT
);
2228 tab
.define_column("REPORTED", TextTable::LEFT
, TextTable::RIGHT
);
2229 tab
.define_column("UP", TextTable::LEFT
, TextTable::RIGHT
);
2230 tab
.define_column("ACTING", TextTable::LEFT
, TextTable::RIGHT
);
2231 tab
.define_column("SCRUB_STAMP", TextTable::LEFT
, TextTable::RIGHT
);
2232 tab
.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT
, TextTable::RIGHT
);
2234 for (auto i
= pgs
.begin(); i
!= pgs
.end(); ++i
) {
2235 const pg_stat_t
& st
= pg_stat
.at(*i
);
2237 ostringstream reported
;
2238 reported
<< st
.reported_epoch
<< ":" << st
.reported_seq
;
2240 ostringstream upstr
, actingstr
;
2241 upstr
<< st
.up
<< 'p' << st
.up_primary
;
2242 actingstr
<< st
.acting
<< 'p' << st
.acting_primary
;
2244 << st
.stats
.sum
.num_objects
2245 << st
.stats
.sum
.num_objects_degraded
2246 << st
.stats
.sum
.num_objects_misplaced
2247 << st
.stats
.sum
.num_objects_unfound
2248 << st
.stats
.sum
.num_bytes
2249 << st
.stats
.sum
.num_omap_bytes
2250 << st
.stats
.sum
.num_omap_keys
2252 << pg_state_string(st
.state
)
2253 << utimespan_str(now
- st
.last_change
)
2258 << st
.last_scrub_stamp
2259 << st
.last_deep_scrub_stamp
2260 << TextTable::endrow
;
2266 void PGMap::dump_pool_stats_and_io_rate(int64_t poolid
, const OSDMap
&osd_map
,
2268 stringstream
*rs
) const {
2269 string pool_name
= osd_map
.get_pool_name(poolid
);
2271 f
->open_object_section("pool");
2272 f
->dump_string("pool_name", pool_name
.c_str());
2273 f
->dump_int("pool_id", poolid
);
2274 f
->open_object_section("recovery");
2278 pool_recovery_summary(f
, &sl
, poolid
);
2279 if (!f
&& !sl
.empty()) {
2281 tss
<< " " << p
<< "\n";
2284 f
->close_section(); // object section recovery
2285 f
->open_object_section("recovery_rate");
2288 pool_recovery_rate_summary(f
, &rss
, poolid
);
2289 if (!f
&& !rss
.str().empty())
2290 tss
<< " recovery io " << rss
.str() << "\n";
2292 f
->close_section(); // object section recovery_rate
2293 f
->open_object_section("client_io_rate");
2297 pool_client_io_rate_summary(f
, &rss
, poolid
);
2298 if (!f
&& !rss
.str().empty())
2299 tss
<< " client io " << rss
.str() << "\n";
2300 // dump cache tier IO rate for cache pool
2301 const pg_pool_t
*pool
= osd_map
.get_pg_pool(poolid
);
2302 if (pool
->is_tier()) {
2304 f
->close_section(); // object section client_io_rate
2305 f
->open_object_section("cache_io_rate");
2309 pool_cache_io_rate_summary(f
, &rss
, poolid
);
2310 if (!f
&& !rss
.str().empty())
2311 tss
<< " cache tier io " << rss
.str() << "\n";
2314 f
->close_section(); // object section cache_io_rate
2315 f
->close_section(); // object section pool
2317 *rs
<< "pool " << pool_name
<< " id " << poolid
<< "\n";
2318 if (!tss
.str().empty())
2319 *rs
<< tss
.str() << "\n";
2321 *rs
<< " nothing is going on\n\n";
2325 void PGMap::get_health_checks(
2327 const OSDMap
& osdmap
,
2328 health_check_map_t
*checks
) const
2330 utime_t now
= ceph_clock_now();
2331 const auto max
= cct
->_conf
.get_val
<uint64_t>("mon_health_max_detail");
2332 const auto& pools
= osdmap
.get_pools();
2334 typedef enum pg_consequence_t
{
2335 UNAVAILABLE
= 1, // Client IO to the pool may block
2336 DEGRADED
= 2, // Fewer than the requested number of replicas are present
2337 BACKFILL_FULL
= 3, // Backfill is blocked for space considerations
2338 // This may or may not be a deadlock condition.
2339 DAMAGED
= 4, // The data may be missing or inconsistent on disk and
2341 RECOVERY_FULL
= 5 // Recovery is blocked because OSDs are full
2344 // For a given PG state, how should it be reported at the pool level?
2345 class PgStateResponse
{
2347 pg_consequence_t consequence
;
2348 typedef std::function
< utime_t(const pg_stat_t
&) > stuck_cb
;
2349 stuck_cb stuck_since
;
2352 PgStateResponse(const pg_consequence_t
& c
, stuck_cb
&& s
)
2353 : consequence(c
), stuck_since(std::move(s
)), invert(false)
2357 PgStateResponse(const pg_consequence_t
& c
, stuck_cb
&& s
, bool i
)
2358 : consequence(c
), stuck_since(std::move(s
)), invert(i
)
2363 // Record the PG state counts that contributed to a reported pool state
2366 // Map of PG_STATE_* to number of pgs in that state.
2367 std::map
<unsigned, unsigned> states
;
2369 // List of all PG IDs that had a state contributing
2370 // to this health condition.
2373 std::map
<pg_t
, std::string
> pg_messages
;
2376 // Map of PG state to how to respond to it
2377 std::map
<unsigned, PgStateResponse
> state_to_response
= {
2378 // Immediate reports
2379 { PG_STATE_INCONSISTENT
, {DAMAGED
, {}} },
2380 { PG_STATE_INCOMPLETE
, {UNAVAILABLE
, {}} },
2381 { PG_STATE_SNAPTRIM_ERROR
, {DAMAGED
, {}} },
2382 { PG_STATE_RECOVERY_UNFOUND
, {DAMAGED
, {}} },
2383 { PG_STATE_BACKFILL_UNFOUND
, {DAMAGED
, {}} },
2384 { PG_STATE_BACKFILL_TOOFULL
, {BACKFILL_FULL
, {}} },
2385 { PG_STATE_RECOVERY_TOOFULL
, {RECOVERY_FULL
, {}} },
2386 { PG_STATE_DEGRADED
, {DEGRADED
, {}} },
2387 { PG_STATE_DOWN
, {UNAVAILABLE
, {}} },
2388 // Delayed (wait until stuck) reports
2389 { PG_STATE_PEERING
, {UNAVAILABLE
, [](const pg_stat_t
&p
){return p
.last_peered
;} } },
2390 { PG_STATE_UNDERSIZED
, {DEGRADED
, [](const pg_stat_t
&p
){return p
.last_fullsized
;} } },
2391 { PG_STATE_STALE
, {UNAVAILABLE
, [](const pg_stat_t
&p
){return p
.last_unstale
;} } },
2392 // Delayed and inverted reports
2393 { PG_STATE_ACTIVE
, {UNAVAILABLE
, [](const pg_stat_t
&p
){return p
.last_active
;}, true} }
2396 // Specialized state printer that takes account of inversion of
2397 // ACTIVE, CLEAN checks.
2398 auto state_name
= [](const uint64_t &state
) {
2399 // Special cases for the states that are inverted checks
2400 if (state
== PG_STATE_CLEAN
) {
2401 return std::string("unclean");
2402 } else if (state
== PG_STATE_ACTIVE
) {
2403 return std::string("inactive");
2405 return pg_state_string(state
);
2409 // Map of what is wrong to information about why, implicitly also stores
2410 // the list of what is wrong.
2411 std::map
<pg_consequence_t
, PgCauses
> detected
;
2413 // Optimisation: trim down the number of checks to apply based on
2414 // the summary counters
2415 std::map
<unsigned, PgStateResponse
> possible_responses
;
2416 for (const auto &i
: num_pg_by_state
) {
2417 for (const auto &j
: state_to_response
) {
2418 if (!j
.second
.invert
) {
2419 // Check for normal tests by seeing if any pgs have the flag
2420 if (i
.first
& j
.first
) {
2421 possible_responses
.insert(j
);
2427 for (const auto &j
: state_to_response
) {
2428 if (j
.second
.invert
) {
2429 // Check for inverted tests by seeing if not-all pgs have the flag
2430 const auto &found
= num_pg_by_state
.find(j
.first
);
2431 if (found
== num_pg_by_state
.end() || found
->second
!= num_pg
) {
2432 possible_responses
.insert(j
);
2437 utime_t cutoff
= now
- utime_t(cct
->_conf
.get_val
<int64_t>("mon_pg_stuck_threshold"), 0);
2438 // Loop over all PGs, if there are any possibly-unhealthy states in there
2439 if (!possible_responses
.empty()) {
2440 for (const auto& i
: pg_stat
) {
2441 const auto &pg_id
= i
.first
;
2442 const auto &pg_info
= i
.second
;
2444 for (const auto &j
: state_to_response
) {
2445 const auto &pg_response_state
= j
.first
;
2446 const auto &pg_response
= j
.second
;
2448 // Apply the state test
2449 if (!(bool(pg_info
.state
& pg_response_state
) != pg_response
.invert
)) {
2453 // Apply stuckness test if needed
2454 if (pg_response
.stuck_since
) {
2455 // Delayed response, check for stuckness
2456 utime_t last_whatever
= pg_response
.stuck_since(pg_info
);
2457 if (last_whatever
>= cutoff
) {
2458 // Not stuck enough, ignore.
2465 auto &causes
= detected
[pg_response
.consequence
];
2466 causes
.states
[pg_response_state
]++;
2467 causes
.pgs
.insert(pg_id
);
2469 // Don't bother composing detail string if we have already recorded
2471 if (causes
.pg_messages
.size() > max
) {
2475 std::ostringstream ss
;
2476 if (pg_response
.stuck_since
) {
2477 utime_t since
= pg_response
.stuck_since(pg_info
);
2478 ss
<< "pg " << pg_id
<< " is stuck " << state_name(pg_response_state
);
2479 if (since
== utime_t()) {
2480 ss
<< " since forever";
2482 utime_t dur
= now
- since
;
2483 ss
<< " for " << dur
;
2485 ss
<< ", current state " << pg_state_string(pg_info
.state
)
2486 << ", last acting " << pg_info
.acting
;
2488 ss
<< "pg " << pg_id
<< " is "
2489 << pg_state_string(pg_info
.state
);
2490 ss
<< ", acting " << pg_info
.acting
;
2491 if (pg_info
.stats
.sum
.num_objects_unfound
) {
2492 ss
<< ", " << pg_info
.stats
.sum
.num_objects_unfound
2497 if (pg_info
.state
& PG_STATE_INCOMPLETE
) {
2498 const pg_pool_t
*pi
= osdmap
.get_pg_pool(pg_id
.pool());
2499 if (pi
&& pi
->min_size
> 1) {
2500 ss
<< " (reducing pool "
2501 << osdmap
.get_pool_name(pg_id
.pool())
2502 << " min_size from " << (int)pi
->min_size
2503 << " may help; search ceph.com/docs for 'incomplete')";
2507 causes
.pg_messages
[pg_id
] = ss
.str();
2511 dout(10) << __func__
<< " skipping loop over PGs: counters look OK" << dendl
;
2514 for (const auto &i
: detected
) {
2515 std::string health_code
;
2516 health_status_t sev
;
2517 std::string summary
;
2520 health_code
= "PG_AVAILABILITY";
2522 summary
= "Reduced data availability: ";
2525 health_code
= "PG_DEGRADED";
2526 summary
= "Degraded data redundancy: ";
2530 health_code
= "PG_BACKFILL_FULL";
2531 summary
= "Low space hindering backfill (add storage if this doesn't resolve itself): ";
2535 health_code
= "PG_DAMAGED";
2536 summary
= "Possible data damage: ";
2540 health_code
= "PG_RECOVERY_FULL";
2541 summary
= "Full OSDs blocking recovery: ";
2548 if (i
.first
== DEGRADED
) {
2549 if (pg_sum
.stats
.sum
.num_objects_degraded
&&
2550 pg_sum
.stats
.sum
.num_object_copies
> 0) {
2551 double pc
= (double)pg_sum
.stats
.sum
.num_objects_degraded
/
2552 (double)pg_sum
.stats
.sum
.num_object_copies
* (double)100.0;
2554 snprintf(b
, sizeof(b
), "%.3lf", pc
);
2556 ss
<< pg_sum
.stats
.sum
.num_objects_degraded
2557 << "/" << pg_sum
.stats
.sum
.num_object_copies
<< " objects degraded ("
2560 // Throw in a comma for the benefit of the following PG counts
2561 summary
+= ss
.str() + ", ";
2565 // Compose summary message saying how many PGs in what states led
2566 // to this health check failing
2567 std::vector
<std::string
> pg_msgs
;
2568 for (const auto &j
: i
.second
.states
) {
2569 std::ostringstream msg
;
2570 msg
<< j
.second
<< (j
.second
> 1 ? " pgs " : " pg ") << state_name(j
.first
);
2571 pg_msgs
.push_back(msg
.str());
2573 summary
+= joinify(pg_msgs
.begin(), pg_msgs
.end(), std::string(", "));
2577 health_check_t
*check
= &checks
->add(
2582 // Compose list of PGs contributing to this health check failing
2583 for (const auto &j
: i
.second
.pg_messages
) {
2584 check
->detail
.push_back(j
.second
);
2589 if (pg_sum
.stats
.sum
.num_scrub_errors
) {
2591 ss
<< pg_sum
.stats
.sum
.num_scrub_errors
<< " scrub errors";
2592 checks
->add("OSD_SCRUB_ERRORS", HEALTH_ERR
, ss
.str());
2595 // LARGE_OMAP_OBJECTS
2596 if (pg_sum
.stats
.sum
.num_large_omap_objects
) {
2597 list
<string
> detail
;
2598 for (auto &pool
: pools
) {
2599 const string
& pool_name
= osdmap
.get_pool_name(pool
.first
);
2600 auto it2
= pg_pool_sum
.find(pool
.first
);
2601 if (it2
== pg_pool_sum
.end()) {
2604 const pool_stat_t
*pstat
= &it2
->second
;
2605 if (pstat
== nullptr) {
2608 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
2609 if (sum
.num_large_omap_objects
) {
2611 ss
<< sum
.num_large_omap_objects
<< " large objects found in pool "
2612 << "'" << pool_name
<< "'";
2613 detail
.push_back(ss
.str());
2616 if (!detail
.empty()) {
2618 ss
<< pg_sum
.stats
.sum
.num_large_omap_objects
<< " large omap objects";
2619 auto& d
= checks
->add("LARGE_OMAP_OBJECTS", HEALTH_WARN
, ss
.str());
2621 tip
<< "Search the cluster log for 'Large omap object found' for more "
2623 detail
.push_back(tip
.str());
2624 d
.detail
.swap(detail
);
2628 // CACHE_POOL_NEAR_FULL
2630 list
<string
> detail
;
2631 unsigned num_pools
= 0;
2632 for (auto& p
: pools
) {
2633 if ((!p
.second
.target_max_objects
&& !p
.second
.target_max_bytes
) ||
2634 !pg_pool_sum
.count(p
.first
)) {
2637 bool nearfull
= false;
2638 const string
& name
= osdmap
.get_pool_name(p
.first
);
2639 const pool_stat_t
& st
= get_pg_pool_sum_stat(p
.first
);
2640 uint64_t ratio
= p
.second
.cache_target_full_ratio_micro
+
2641 ((1000000 - p
.second
.cache_target_full_ratio_micro
) *
2642 cct
->_conf
->mon_cache_target_full_warn_ratio
);
2643 if (p
.second
.target_max_objects
&&
2644 (uint64_t)(st
.stats
.sum
.num_objects
-
2645 st
.stats
.sum
.num_objects_hit_set_archive
) >
2646 p
.second
.target_max_objects
* (ratio
/ 1000000.0)) {
2648 ss
<< "cache pool '" << name
<< "' with "
2649 << si_u_t(st
.stats
.sum
.num_objects
)
2650 << " objects at/near target max "
2651 << si_u_t(p
.second
.target_max_objects
) << " objects";
2652 detail
.push_back(ss
.str());
2655 if (p
.second
.target_max_bytes
&&
2656 (uint64_t)(st
.stats
.sum
.num_bytes
-
2657 st
.stats
.sum
.num_bytes_hit_set_archive
) >
2658 p
.second
.target_max_bytes
* (ratio
/ 1000000.0)) {
2660 ss
<< "cache pool '" << name
2661 << "' with " << byte_u_t(st
.stats
.sum
.num_bytes
)
2662 << " at/near target max "
2663 << byte_u_t(p
.second
.target_max_bytes
);
2664 detail
.push_back(ss
.str());
2671 if (!detail
.empty()) {
2673 ss
<< num_pools
<< " cache pools at or near target size";
2674 auto& d
= checks
->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN
, ss
.str());
2675 d
.detail
.swap(detail
);
2680 unsigned num_in
= osdmap
.get_num_in_osds();
2681 auto sum_pg_up
= std::max(static_cast<size_t>(pg_sum
.up
), pg_stat
.size());
2682 const auto min_pg_per_osd
=
2683 cct
->_conf
.get_val
<uint64_t>("mon_pg_warn_min_per_osd");
2684 if (num_in
&& min_pg_per_osd
> 0 && osdmap
.get_pools().size() > 0) {
2685 auto per
= sum_pg_up
/ num_in
;
2686 if (per
< min_pg_per_osd
&& per
) {
2688 ss
<< "too few PGs per OSD (" << per
2689 << " < min " << min_pg_per_osd
<< ")";
2690 checks
->add("TOO_FEW_PGS", HEALTH_WARN
, ss
.str());
2695 auto max_pg_per_osd
= cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd");
2696 if (num_in
&& max_pg_per_osd
> 0) {
2697 auto per
= sum_pg_up
/ num_in
;
2698 if (per
> max_pg_per_osd
) {
2700 ss
<< "too many PGs per OSD (" << per
2701 << " > max " << max_pg_per_osd
<< ")";
2702 checks
->add("TOO_MANY_PGS", HEALTH_WARN
, ss
.str());
2707 auto warn_too_few_osds
= cct
->_conf
.get_val
<bool>("mon_warn_on_too_few_osds");
2708 auto osd_pool_default_size
= cct
->_conf
.get_val
<uint64_t>("osd_pool_default_size");
2709 if (warn_too_few_osds
&& osdmap
.get_num_osds() < osd_pool_default_size
) {
2711 ss
<< "OSD count " << osdmap
.get_num_osds()
2712 << " < osd_pool_default_size " << osd_pool_default_size
;
2713 checks
->add("TOO_FEW_OSDS", HEALTH_WARN
, ss
.str());
2717 // Convert milliseconds to microseconds
2718 auto warn_slow_ping_time
= cct
->_conf
.get_val
<double>("mon_warn_on_slow_ping_time") * 1000;
2719 auto grace
= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_grace");
2720 if (warn_slow_ping_time
== 0) {
2721 double ratio
= cct
->_conf
.get_val
<double>("mon_warn_on_slow_ping_ratio");
2722 warn_slow_ping_time
= grace
;
2723 warn_slow_ping_time
*= 1000000 * ratio
; // Seconds of grace to microseconds at ratio
2725 if (warn_slow_ping_time
> 0) {
2727 struct mon_ping_item_t
{
2733 bool operator<(const mon_ping_item_t
& rhs
) const {
2734 if (pingtime
< rhs
.pingtime
)
2736 if (pingtime
> rhs
.pingtime
)
2738 if (from
< rhs
.from
)
2740 if (from
> rhs
.from
)
2746 list
<string
> detail_back
;
2747 list
<string
> detail_front
;
2748 set
<mon_ping_item_t
> back_sorted
, front_sorted
;
2749 for (auto i
: osd_stat
) {
2750 for (auto j
: i
.second
.hb_pingtime
) {
2752 // Maybe source info is old
2753 if (now
.sec() - j
.second
.last_update
> grace
* 60)
2756 mon_ping_item_t back
;
2757 back
.pingtime
= std::max(j
.second
.back_pingtime
[0], j
.second
.back_pingtime
[1]);
2758 back
.pingtime
= std::max(back
.pingtime
, j
.second
.back_pingtime
[2]);
2759 back
.from
= i
.first
;
2761 if (back
.pingtime
> warn_slow_ping_time
) {
2762 back
.improving
= (j
.second
.back_pingtime
[0] < j
.second
.back_pingtime
[1]
2763 && j
.second
.back_pingtime
[1] < j
.second
.back_pingtime
[2]);
2764 back_sorted
.emplace(back
);
2767 mon_ping_item_t front
;
2768 front
.pingtime
= std::max(j
.second
.front_pingtime
[0], j
.second
.front_pingtime
[1]);
2769 front
.pingtime
= std::max(front
.pingtime
, j
.second
.front_pingtime
[2]);
2770 front
.from
= i
.first
;
2772 if (front
.pingtime
> warn_slow_ping_time
) {
2773 front
.improving
= (j
.second
.front_pingtime
[0] < j
.second
.front_pingtime
[1]
2774 && j
.second
.front_pingtime
[1] < j
.second
.back_pingtime
[2]);
2775 front_sorted
.emplace(front
);
2779 int max_detail
= 10;
2780 for (auto &sback
: boost::adaptors::reverse(back_sorted
)) {
2782 if (max_detail
== 0) {
2783 ss
<< "Truncated long network list. Use ceph daemon mgr.# dump_osd_network for more information";
2784 detail_back
.push_back(ss
.str());
2788 ss
<< "Slow heartbeat ping on back interface from osd." << sback
.from
2789 << (osdmap
.is_down(sback
.from
) ? " (down)" : "")
2790 << " to osd." << sback
.to
2791 << (osdmap
.is_down(sback
.to
) ? " (down)" : "")
2792 << " " << fixed_u_to_string(sback
.pingtime
, 3) << " msec"
2793 << (sback
.improving
? " possibly improving" : "");
2794 detail_back
.push_back(ss
.str());
2797 for (auto &sfront
: boost::adaptors::reverse(front_sorted
)) {
2799 if (max_detail
== 0) {
2800 ss
<< "Truncated long network list. Use ceph daemon mgr.# dump_osd_network for more information";
2801 detail_front
.push_back(ss
.str());
2805 ss
<< "Slow heartbeat ping on front interface from osd." << sfront
.from
2806 << (osdmap
.is_down(sfront
.from
) ? " (down)" : "")
2807 << " to osd." << sfront
.to
2808 << (osdmap
.is_down(sfront
.to
) ? " (down)" : "")
2809 << " " << fixed_u_to_string(sfront
.pingtime
, 3) << " msec"
2810 << (sfront
.improving
? " possibly improving" : "");
2811 detail_front
.push_back(ss
.str());
2813 if (detail_back
.size() != 0) {
2815 ss
<< "Long heartbeat ping times on back interface seen, longest is "
2816 << fixed_u_to_string(back_sorted
.rbegin()->pingtime
, 3) << " msec";
2817 auto& d
= checks
->add("OSD_SLOW_PING_TIME_BACK", HEALTH_WARN
, ss
.str());
2818 d
.detail
.swap(detail_back
);
2820 if (detail_front
.size() != 0) {
2822 ss
<< "Long heartbeat ping times on front interface seen, longest is "
2823 << fixed_u_to_string(front_sorted
.rbegin()->pingtime
, 3) << " msec";
2824 auto& d
= checks
->add("OSD_SLOW_PING_TIME_FRONT", HEALTH_WARN
, ss
.str());
2825 d
.detail
.swap(detail_front
);
2830 // MANY_OBJECTS_PER_PG
2831 if (!pg_stat
.empty()) {
2832 list
<string
> pgp_detail
, many_detail
;
2833 const auto mon_pg_warn_min_objects
=
2834 cct
->_conf
.get_val
<int64_t>("mon_pg_warn_min_objects");
2835 const auto mon_pg_warn_min_pool_objects
=
2836 cct
->_conf
.get_val
<int64_t>("mon_pg_warn_min_pool_objects");
2837 const auto mon_pg_warn_max_object_skew
=
2838 cct
->_conf
.get_val
<double>("mon_pg_warn_max_object_skew");
2839 for (auto p
= pg_pool_sum
.begin();
2840 p
!= pg_pool_sum
.end();
2842 const pg_pool_t
*pi
= osdmap
.get_pg_pool(p
->first
);
2844 continue; // in case osdmap changes haven't propagated to PGMap yet
2845 const string
& name
= osdmap
.get_pool_name(p
->first
);
2846 // NOTE: we use pg_num_target and pgp_num_target for the purposes of
2847 // the warnings. If the cluster is failing to converge on the target
2848 // values that is a separate issue!
2849 if (pi
->get_pg_num_target() > pi
->get_pgp_num_target() &&
2850 !(name
.find(".DELETED") != string::npos
&&
2851 cct
->_conf
->mon_fake_pool_delete
)) {
2853 ss
<< "pool " << name
<< " pg_num "
2854 << pi
->get_pg_num_target()
2855 << " > pgp_num " << pi
->get_pgp_num_target();
2856 pgp_detail
.push_back(ss
.str());
2858 int average_objects_per_pg
= pg_sum
.stats
.sum
.num_objects
/ pg_stat
.size();
2859 if (average_objects_per_pg
> 0 &&
2860 pg_sum
.stats
.sum
.num_objects
>= mon_pg_warn_min_objects
&&
2861 p
->second
.stats
.sum
.num_objects
>= mon_pg_warn_min_pool_objects
) {
2862 int objects_per_pg
= p
->second
.stats
.sum
.num_objects
/
2863 pi
->get_pg_num_target();
2864 float ratio
= (float)objects_per_pg
/ (float)average_objects_per_pg
;
2865 if (mon_pg_warn_max_object_skew
> 0 &&
2866 ratio
> mon_pg_warn_max_object_skew
) {
2868 ss
<< "pool " << name
<< " objects per pg ("
2869 << objects_per_pg
<< ") is more than " << ratio
2870 << " times cluster average ("
2871 << average_objects_per_pg
<< ")";
2872 many_detail
.push_back(ss
.str());
2876 if (!pgp_detail
.empty()) {
2878 ss
<< pgp_detail
.size() << " pools have pg_num > pgp_num";
2879 auto& d
= checks
->add("SMALLER_PGP_NUM", HEALTH_WARN
, ss
.str());
2880 d
.detail
.swap(pgp_detail
);
2882 if (!many_detail
.empty()) {
2884 ss
<< many_detail
.size() << " pools have many more objects per pg than"
2886 auto& d
= checks
->add("MANY_OBJECTS_PER_PG", HEALTH_WARN
, ss
.str());
2887 d
.detail
.swap(many_detail
);
2894 float warn_threshold
= (float)g_conf().get_val
<int64_t>("mon_pool_quota_warn_threshold")/100;
2895 float crit_threshold
= (float)g_conf().get_val
<int64_t>("mon_pool_quota_crit_threshold")/100;
2896 list
<string
> full_detail
, nearfull_detail
;
2897 unsigned full_pools
= 0, nearfull_pools
= 0;
2898 for (auto it
: pools
) {
2899 auto it2
= pg_pool_sum
.find(it
.first
);
2900 if (it2
== pg_pool_sum
.end()) {
2903 const pool_stat_t
*pstat
= &it2
->second
;
2904 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
2905 const string
& pool_name
= osdmap
.get_pool_name(it
.first
);
2906 const pg_pool_t
&pool
= it
.second
;
2907 bool full
= false, nearfull
= false;
2908 if (pool
.quota_max_objects
> 0) {
2910 if ((uint64_t)sum
.num_objects
>= pool
.quota_max_objects
) {
2911 } else if (crit_threshold
> 0 &&
2912 sum
.num_objects
>= pool
.quota_max_objects
*crit_threshold
) {
2913 ss
<< "pool '" << pool_name
2914 << "' has " << sum
.num_objects
<< " objects"
2915 << " (max " << pool
.quota_max_objects
<< ")";
2916 full_detail
.push_back(ss
.str());
2918 } else if (warn_threshold
> 0 &&
2919 sum
.num_objects
>= pool
.quota_max_objects
*warn_threshold
) {
2920 ss
<< "pool '" << pool_name
2921 << "' has " << sum
.num_objects
<< " objects"
2922 << " (max " << pool
.quota_max_objects
<< ")";
2923 nearfull_detail
.push_back(ss
.str());
2927 if (pool
.quota_max_bytes
> 0) {
2929 if ((uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) {
2930 } else if (crit_threshold
> 0 &&
2931 sum
.num_bytes
>= pool
.quota_max_bytes
*crit_threshold
) {
2932 ss
<< "pool '" << pool_name
2933 << "' has " << byte_u_t(sum
.num_bytes
)
2934 << " (max " << byte_u_t(pool
.quota_max_bytes
) << ")";
2935 full_detail
.push_back(ss
.str());
2937 } else if (warn_threshold
> 0 &&
2938 sum
.num_bytes
>= pool
.quota_max_bytes
*warn_threshold
) {
2939 ss
<< "pool '" << pool_name
2940 << "' has " << byte_u_t(sum
.num_bytes
)
2941 << " (max " << byte_u_t(pool
.quota_max_bytes
) << ")";
2942 nearfull_detail
.push_back(ss
.str());
2955 ss
<< full_pools
<< " pools full";
2956 auto& d
= checks
->add("POOL_FULL", HEALTH_ERR
, ss
.str());
2957 d
.detail
.swap(full_detail
);
2959 if (nearfull_pools
) {
2961 ss
<< nearfull_pools
<< " pools nearfull";
2962 auto& d
= checks
->add("POOL_NEAR_FULL", HEALTH_WARN
, ss
.str());
2963 d
.detail
.swap(nearfull_detail
);
2968 if (pg_sum
.stats
.sum
.num_objects_misplaced
&&
2969 pg_sum
.stats
.sum
.num_object_copies
> 0 &&
2970 cct
->_conf
->mon_warn_on_misplaced
) {
2971 double pc
= (double)pg_sum
.stats
.sum
.num_objects_misplaced
/
2972 (double)pg_sum
.stats
.sum
.num_object_copies
* (double)100.0;
2974 snprintf(b
, sizeof(b
), "%.3lf", pc
);
2976 ss
<< pg_sum
.stats
.sum
.num_objects_misplaced
2977 << "/" << pg_sum
.stats
.sum
.num_object_copies
<< " objects misplaced ("
2979 checks
->add("OBJECT_MISPLACED", HEALTH_WARN
, ss
.str());
2983 if (pg_sum
.stats
.sum
.num_objects_unfound
&&
2984 pg_sum
.stats
.sum
.num_objects
) {
2985 double pc
= (double)pg_sum
.stats
.sum
.num_objects_unfound
/
2986 (double)pg_sum
.stats
.sum
.num_objects
* (double)100.0;
2988 snprintf(b
, sizeof(b
), "%.3lf", pc
);
2990 ss
<< pg_sum
.stats
.sum
.num_objects_unfound
2991 << "/" << pg_sum
.stats
.sum
.num_objects
<< " objects unfound (" << b
<< "%)";
2992 auto& d
= checks
->add("OBJECT_UNFOUND", HEALTH_WARN
, ss
.str());
2994 for (auto& p
: pg_stat
) {
2995 if (p
.second
.stats
.sum
.num_objects_unfound
) {
2997 ss
<< "pg " << p
.first
2998 << " has " << p
.second
.stats
.sum
.num_objects_unfound
2999 << " unfound objects";
3000 d
.detail
.push_back(ss
.str());
3001 if (d
.detail
.size() > max
) {
3002 d
.detail
.push_back("(additional pgs left out for brevity)");
3011 // SLOW_OPS unifies them in mimic.
3012 if (osdmap
.require_osd_release
< CEPH_RELEASE_MIMIC
&&
3013 cct
->_conf
->mon_osd_warn_op_age
> 0 &&
3014 !osd_sum
.op_queue_age_hist
.h
.empty() &&
3015 osd_sum
.op_queue_age_hist
.upper_bound() / 1000.0 >
3016 cct
->_conf
->mon_osd_warn_op_age
) {
3017 list
<string
> warn_detail
, error_detail
;
3018 unsigned warn
= 0, error
= 0;
3020 cct
->_conf
->mon_osd_warn_op_age
* cct
->_conf
->mon_osd_err_op_age_ratio
;
3021 const pow2_hist_t
& h
= osd_sum
.op_queue_age_hist
;
3022 for (unsigned i
= h
.h
.size() - 1; i
> 0; --i
) {
3023 float ub
= (float)(1 << i
) / 1000.0;
3024 if (ub
< cct
->_conf
->mon_osd_warn_op_age
)
3028 ss
<< h
.h
[i
] << " ops are blocked > " << ub
<< " sec";
3031 error_detail
.push_back(ss
.str());
3034 warn_detail
.push_back(ss
.str());
3039 map
<float,set
<int>> warn_osd_by_max
; // max -> osds
3040 map
<float,set
<int>> error_osd_by_max
; // max -> osds
3041 if (!warn_detail
.empty() || !error_detail
.empty()) {
3042 for (auto& p
: osd_stat
) {
3043 const pow2_hist_t
& h
= p
.second
.op_queue_age_hist
;
3044 for (unsigned i
= h
.h
.size() - 1; i
> 0; --i
) {
3045 float ub
= (float)(1 << i
) / 1000.0;
3046 if (ub
< cct
->_conf
->mon_osd_warn_op_age
)
3050 error_osd_by_max
[ub
].insert(p
.first
);
3052 warn_osd_by_max
[ub
].insert(p
.first
);
3060 if (!warn_detail
.empty()) {
3062 ss
<< warn
<< " slow requests are blocked > "
3063 << cct
->_conf
->mon_osd_warn_op_age
<< " sec";
3064 auto& d
= checks
->add("REQUEST_SLOW", HEALTH_WARN
, ss
.str());
3065 d
.detail
.swap(warn_detail
);
3067 for (auto& p
: warn_osd_by_max
) {
3069 if (p
.second
.size() > 1) {
3070 ss
<< "osds " << p
.second
3071 << " have blocked requests > " << p
.first
<< " sec";
3073 ss
<< "osd." << *p
.second
.begin()
3074 << " has blocked requests > " << p
.first
<< " sec";
3076 d
.detail
.push_back(ss
.str());
3082 if (!error_detail
.empty()) {
3084 ss
<< error
<< " stuck requests are blocked > "
3085 << err_age
<< " sec";
3086 auto& d
= checks
->add("REQUEST_STUCK", HEALTH_ERR
, ss
.str());
3087 d
.detail
.swap(error_detail
);
3089 for (auto& p
: error_osd_by_max
) {
3091 if (p
.second
.size() > 1) {
3092 ss
<< "osds " << p
.second
3093 << " have stuck requests > " << p
.first
<< " sec";
3095 ss
<< "osd." << *p
.second
.begin()
3096 << " has stuck requests > " << p
.first
<< " sec";
3098 d
.detail
.push_back(ss
.str());
3106 // OBJECT_STORE_WARN
3107 if (osd_sum
.os_alerts
.size()) {
3108 map
<string
, pair
<size_t, list
<string
>>> os_alerts_sum
;
3110 for (auto& a
: osd_sum
.os_alerts
) {
3112 string s0
= " osd.";
3113 s0
+= stringify(a
.first
);
3114 for (auto& aa
: a
.second
) {
3118 auto it
= os_alerts_sum
.find(aa
.first
);
3119 if (it
== os_alerts_sum
.end()) {
3122 os_alerts_sum
.emplace(aa
.first
, std::make_pair(1, d
));
3124 auto& p
= it
->second
;
3126 p
.second
.emplace_back(s
);
3134 for (auto& asum
: os_alerts_sum
) {
3136 if (asum
.first
== "BLUEFS_SPILLOVER") {
3137 summary
= "BlueFS spillover detected";
3138 } else if (asum
.first
== "BLUESTORE_NO_COMPRESSION") {
3139 summary
= "BlueStore compression broken";
3140 } else if (asum
.first
== "BLUESTORE_LEGACY_STATFS") {
3141 summary
= "Legacy BlueStore stats reporting detected";
3142 } else if (asum
.first
== "BLUESTORE_DISK_SIZE_MISMATCH") {
3143 summary
= "BlueStore has dangerous mismatch between block device and free list sizes";
3146 summary
+= stringify(asum
.second
.first
);
3147 summary
+= " OSD(s)";
3148 auto& d
= checks
->add(asum
.first
, HEALTH_WARN
, summary
);
3149 for (auto& s
: asum
.second
.second
) {
3150 d
.detail
.push_back(s
);
3155 // PG_NOT_DEEP_SCRUBBED
3156 if (cct
->_conf
->mon_warn_pg_not_scrubbed_ratio
||
3157 cct
->_conf
->mon_warn_pg_not_deep_scrubbed_ratio
) {
3158 list
<string
> detail
, deep_detail
;
3159 int detail_max
= max
, deep_detail_max
= max
;
3160 int detail_more
= 0, deep_detail_more
= 0;
3161 int detail_total
= 0, deep_detail_total
= 0;
3162 for (auto& p
: pg_stat
) {
3163 int64_t pnum
= p
.first
.pool();
3164 auto pool
= osdmap
.get_pg_pool(pnum
);
3167 if (cct
->_conf
->mon_warn_pg_not_scrubbed_ratio
) {
3168 double scrub_max_interval
= 0;
3169 pool
->opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &scrub_max_interval
);
3170 if (scrub_max_interval
<= 0) {
3171 scrub_max_interval
= cct
->_conf
->osd_scrub_max_interval
;
3173 const double age
= (cct
->_conf
->mon_warn_pg_not_scrubbed_ratio
* scrub_max_interval
) +
3175 utime_t cutoff
= now
;
3177 if (p
.second
.last_scrub_stamp
< cutoff
) {
3178 if (detail_max
> 0) {
3180 ss
<< "pg " << p
.first
<< " not scrubbed since "
3181 << p
.second
.last_scrub_stamp
;
3182 detail
.push_back(ss
.str());
3190 if (cct
->_conf
->mon_warn_pg_not_deep_scrubbed_ratio
) {
3191 double deep_scrub_interval
= 0;
3192 pool
->opts
.get(pool_opts_t::DEEP_SCRUB_INTERVAL
, &deep_scrub_interval
);
3193 if (deep_scrub_interval
<= 0) {
3194 deep_scrub_interval
= cct
->_conf
->osd_deep_scrub_interval
;
3196 double deep_age
= (cct
->_conf
->mon_warn_pg_not_deep_scrubbed_ratio
* deep_scrub_interval
) +
3197 deep_scrub_interval
;
3198 utime_t deep_cutoff
= now
;
3199 deep_cutoff
-= deep_age
;
3200 if (p
.second
.last_deep_scrub_stamp
< deep_cutoff
) {
3201 if (deep_detail_max
> 0) {
3203 ss
<< "pg " << p
.first
<< " not deep-scrubbed since "
3204 << p
.second
.last_deep_scrub_stamp
;
3205 deep_detail
.push_back(ss
.str());
3210 ++deep_detail_total
;
3216 ss
<< detail_total
<< " pgs not scrubbed in time";
3217 auto& d
= checks
->add("PG_NOT_SCRUBBED", HEALTH_WARN
, ss
.str());
3219 if (!detail
.empty()) {
3220 d
.detail
.swap(detail
);
3224 ss
<< detail_more
<< " more pgs... ";
3225 d
.detail
.push_back(ss
.str());
3229 if (deep_detail_total
) {
3231 ss
<< deep_detail_total
<< " pgs not deep-scrubbed in time";
3232 auto& d
= checks
->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN
, ss
.str());
3234 if (!deep_detail
.empty()) {
3235 d
.detail
.swap(deep_detail
);
3237 if (deep_detail_more
) {
3239 ss
<< deep_detail_more
<< " more pgs... ";
3240 d
.detail
.push_back(ss
.str());
3247 if (g_conf().get_val
<bool>("mon_warn_on_pool_no_app")) {
3248 list
<string
> detail
;
3249 for (auto &it
: pools
) {
3250 const pg_pool_t
&pool
= it
.second
;
3251 const string
& pool_name
= osdmap
.get_pool_name(it
.first
);
3252 auto it2
= pg_pool_sum
.find(it
.first
);
3253 if (it2
== pg_pool_sum
.end()) {
3256 const pool_stat_t
*pstat
= &it2
->second
;
3257 if (pstat
== nullptr) {
3260 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
3261 // application metadata is not encoded until luminous is minimum
3263 if (sum
.num_objects
> 0 && pool
.application_metadata
.empty() &&
3266 ss
<< "application not enabled on pool '" << pool_name
<< "'";
3267 detail
.push_back(ss
.str());
3270 if (!detail
.empty()) {
3272 ss
<< "application not enabled on " << detail
.size() << " pool(s)";
3273 auto& d
= checks
->add("POOL_APP_NOT_ENABLED", HEALTH_WARN
, ss
.str());
3275 tip
<< "use 'ceph osd pool application enable <pool-name> "
3276 << "<app-name>', where <app-name> is 'cephfs', 'rbd', 'rgw', "
3277 << "or freeform for custom applications.";
3278 detail
.push_back(tip
.str());
3279 d
.detail
.swap(detail
);
3283 // PG_SLOW_SNAP_TRIMMING
3284 if (!pg_stat
.empty() && cct
->_conf
->mon_osd_snap_trim_queue_warn_on
> 0) {
3285 uint32_t snapthreshold
= cct
->_conf
->mon_osd_snap_trim_queue_warn_on
;
3286 uint64_t snaptrimq_exceeded
= 0;
3287 uint32_t longest_queue
= 0;
3288 const pg_t
* longest_q_pg
= nullptr;
3289 list
<string
> detail
;
3291 for (auto& i
: pg_stat
) {
3292 uint32_t current_len
= i
.second
.snaptrimq_len
;
3293 if (current_len
>= snapthreshold
) {
3294 snaptrimq_exceeded
++;
3295 if (longest_queue
<= current_len
) {
3296 longest_q_pg
= &i
.first
;
3297 longest_queue
= current_len
;
3299 if (detail
.size() < max
- 1) {
3301 ss
<< "snap trim queue for pg " << i
.first
<< " at " << current_len
;
3302 detail
.push_back(ss
.str());
3305 if (detail
.size() < max
) {
3306 detail
.push_back("...more pgs affected");
3312 if (snaptrimq_exceeded
) {
3315 ss
<< "longest queue on pg " << *longest_q_pg
<< " at " << longest_queue
;
3316 detail
.push_back(ss
.str());
3320 ss
<< "snap trim queue for " << snaptrimq_exceeded
<< " pg(s) >= " << snapthreshold
<< " (mon_osd_snap_trim_queue_warn_on)";
3321 auto& d
= checks
->add("PG_SLOW_SNAP_TRIMMING", HEALTH_WARN
, ss
.str());
3322 detail
.push_back("try decreasing \"osd snap trim sleep\" and/or increasing \"osd pg max concurrent snap trims\".");
3323 d
.detail
.swap(detail
);
// Dispatch the pg-map related commands, selected by command prefix:
// "pg stat", "pg getmap", "pg dump", "pg ls", "pg dump_stuck", "pg debug",
// "osd perf" and "osd blocked-by".  The prefixes "pg dump_json",
// "pg dump_pools_json" and "pg ls-by-{primary,osd,pool}" are pre-processed
// first by amending a local copy of the command map.
//
// pg_map and osdmap are only read.  Messages for the caller are streamed to
// *ss, output is appended to / encoded into *odata, and structured output
// goes through the formatter f (the "pg ls" path tests f for null before
// using it).
// NOTE(review): the remaining parameters, the declarations of several
// locals (v, what, ds, pool, osd, ...) and every return statement are not
// visible in this excerpt -- confirm the exact contract against the full
// definition.
3328 int process_pg_map_command(
3329 const string
& orig_prefix
,
3330 const cmdmap_t
& orig_cmdmap
,
3331 const PGMap
& pg_map
,
3332 const OSDMap
& osdmap
,
// Local copies of the inputs; cmdmap is amended below through cmd_putval().
3337 string prefix
= orig_prefix
;
3338 auto cmdmap
= orig_cmdmap
;
// Explanatory text appended to *odata when omap_stats_note_required has been
// set by the plain-text "pg dump" path, and by the plain-text "pg ls" path.
3340 string omap_stats_note
=
3341 "\n* NOTE: Omap statistics are gathered during deep scrub and "
3342 "may be inaccurate soon afterwards depending on utilisation. See "
3343 "http://docs.ceph.com/docs/master/dev/placement-group/#omap-statistics "
3344 "for further details.\n";
3345 bool omap_stats_note_required
= false;
3347 // perhaps these would be better in the parsing, but it's weird
3348 bool primary
= false;
// The *_json variants force format=json and a fixed "dumpcontents" value.
3349 if (prefix
== "pg dump_json") {
3351 v
.push_back(string("all"));
3352 cmd_putval(g_ceph_context
, cmdmap
, "format", string("json"));
3353 cmd_putval(g_ceph_context
, cmdmap
, "dumpcontents", v
);
3355 } else if (prefix
== "pg dump_pools_json") {
3357 v
.push_back(string("pools"));
3358 cmd_putval(g_ceph_context
, cmdmap
, "format", string("json"));
3359 cmd_putval(g_ceph_context
, cmdmap
, "dumpcontents", v
);
3361 } else if (prefix
== "pg ls-by-primary") {
3364 } else if (prefix
== "pg ls-by-osd") {
3366 } else if (prefix
== "pg ls-by-pool") {
// Resolve the pool name ("poolstr") to a pool id and store it under "pool";
// a name that does not resolve is reported through *ss.
3369 cmd_getval(g_ceph_context
, cmdmap
, "poolstr", poolstr
);
3370 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
3372 *ss
<< "pool " << poolstr
<< " does not exist";
3375 cmd_putval(g_ceph_context
, cmdmap
, "pool", pool
);
// "pg stat": one-line summary of the pg map.
3379 if (prefix
== "pg stat") {
3381 f
->open_object_section("pg_summary");
3382 pg_map
.print_oneline_summary(f
, NULL
);
// "pg getmap": encode the whole PGMap into *odata.
3392 if (prefix
== "pg getmap") {
3393 pg_map
.encode(*odata
);
3394 *ss
<< "got pgmap version " << pg_map
.version
;
// "pg dump": emit the sections named in "dumpcontents" (collected in `what`).
3398 if (prefix
== "pg dump") {
3400 vector
<string
> dumpcontents
;
3402 if (cmd_getval(g_ceph_context
, cmdmap
, "dumpcontents", dumpcontents
)) {
3403 copy(dumpcontents
.begin(), dumpcontents
.end(),
3404 inserter(what
, what
.end()));
// Structured output through the formatter f.
3409 if (what
.count("all")) {
3410 f
->open_object_section("pg_map");
3413 } else if (what
.count("summary") || what
.count("sum")) {
3414 f
->open_object_section("pg_map");
3415 pg_map
.dump_basic(f
);
3418 if (what
.count("pools")) {
3419 pg_map
.dump_pool_stats(f
);
3421 if (what
.count("osds")) {
3422 pg_map
.dump_osd_stats(f
);
3424 if (what
.count("pgs")) {
3425 pg_map
.dump_pg_stats(f
, false);
3427 if (what
.count("pgs_brief")) {
3428 pg_map
.dump_pg_stats(f
, true);
3430 if (what
.count("delta")) {
3431 f
->open_object_section("delta");
3432 pg_map
.dump_delta(f
);
// Plain-text output written to ds; the branches below that set
// omap_stats_note_required cause the omap note to be appended afterwards.
3438 if (what
.count("all")) {
3440 omap_stats_note_required
= true;
3441 } else if (what
.count("summary") || what
.count("sum")) {
3442 pg_map
.dump_basic(ds
);
3443 pg_map
.dump_pg_sum_stats(ds
, true);
3444 pg_map
.dump_osd_sum_stats(ds
);
3445 omap_stats_note_required
= true;
3447 if (what
.count("pgs_brief")) {
3448 pg_map
.dump_pg_stats(ds
, true);
3451 if (what
.count("pgs")) {
3452 pg_map
.dump_pg_stats(ds
, false);
3454 omap_stats_note_required
= true;
3456 if (what
.count("pools")) {
3457 pg_map
.dump_pool_stats(ds
, header
);
3458 omap_stats_note_required
= true;
3460 if (what
.count("osds")) {
3461 pg_map
.dump_osd_stats(ds
);
3465 if (omap_stats_note_required
) {
3466 odata
->append(omap_stats_note
);
3469 *ss
<< "dumped " << what
;
// "pg ls": list pgs filtered by pool, osd, primary flag and state.
3473 if (prefix
== "pg ls") {
3476 vector
<string
>states
;
3478 cmd_getval(g_ceph_context
, cmdmap
, "pool", pool
);
3479 cmd_getval(g_ceph_context
, cmdmap
, "osd", osd
);
3480 cmd_getval(g_ceph_context
, cmdmap
, "states", states
);
// A non-negative pool / osd filter must name an existing pool / an up osd.
3481 if (pool
>= 0 && !osdmap
.have_pg_pool(pool
)) {
3482 *ss
<< "pool " << pool
<< " does not exist";
3485 if (osd
>= 0 && !osdmap
.is_up(osd
)) {
3486 *ss
<< "osd " << osd
<< " is not up";
3490 states
.push_back("all");
// Translate each requested state name with pg_string_state(); an invalid
// name is reported through *ss together with the list of valid choices.
3494 while (!states
.empty()) {
3495 string state_str
= states
.back();
3497 if (state_str
== "all") {
3501 auto filter
= pg_string_state(state_str
);
3503 *ss
<< "'" << state_str
<< "' is not a valid pg state,"
3504 << " available choices: " << pg_state_string(0xFFFFFFFF);
3513 pg_map
.get_filtered_pg_stats(state
, pool
, osd
, primary
, pgs
);
3515 if (f
&& !pgs
.empty()) {
3516 pg_map
.dump_filtered_pg_stats(f
, pgs
);
3518 } else if (!pgs
.empty()) {
3519 pg_map
.dump_filtered_pg_stats(ds
, pgs
);
3521 odata
->append(omap_stats_note
);
// "pg dump_stuck": the stuck type defaults to "unclean" and the threshold to
// the mon_pg_stuck_threshold configuration value.
3526 if (prefix
== "pg dump_stuck") {
3527 vector
<string
> stuckop_vec
;
3528 cmd_getval(g_ceph_context
, cmdmap
, "stuckops", stuckop_vec
);
3529 if (stuckop_vec
.empty())
3530 stuckop_vec
.push_back("unclean");
3532 cmd_getval(g_ceph_context
, cmdmap
, "threshold", threshold
,
3533 g_conf().get_val
<int64_t>("mon_pg_stuck_threshold"));
3535 if (pg_map
.dump_stuck_pg_stats(ds
, f
, (int)threshold
, stuckop_vec
) < 0) {
// "pg debug": scan pg_stat to determine whether any pg reports unfound
// objects ("unfound_objects_exist", the default debugop) or degraded objects
// ("degraded_pgs_exist").
3544 if (prefix
== "pg debug") {
3546 cmd_getval(g_ceph_context
, cmdmap
, "debugop", debugop
,
3547 string("unfound_objects_exist"));
3548 if (debugop
== "unfound_objects_exist") {
3549 bool unfound_objects_exist
= false;
3550 for (const auto& p
: pg_map
.pg_stat
) {
3551 if (p
.second
.stats
.sum
.num_objects_unfound
> 0) {
3552 unfound_objects_exist
= true;
3556 if (unfound_objects_exist
)
3563 if (debugop
== "degraded_pgs_exist") {
3564 bool degraded_pgs_exist
= false;
3565 for (const auto& p
: pg_map
.pg_stat
) {
3566 if (p
.second
.stats
.sum
.num_objects_degraded
> 0) {
3567 degraded_pgs_exist
= true;
3571 if (degraded_pgs_exist
)
// "osd perf": per-osd performance stats, via the formatter or as text in ds.
3580 if (prefix
== "osd perf") {
3582 f
->open_object_section("osdstats");
3583 pg_map
.dump_osd_perf_stats(f
);
3587 pg_map
.print_osd_perf_stats(&ds
);
// "osd blocked-by": blocked-by stats, via the formatter or as text in ds.
3593 if (prefix
== "osd blocked-by") {
3595 f
->open_object_section("osd_blocked_by");
3596 pg_map
.dump_osd_blocked_by_stats(f
);
3600 pg_map
.print_osd_blocked_by_stats(&ds
);
// Reconcile the pg map with the given osdmap, recording the needed changes
// in *pending_inc:
//  - osd stats: entries for osds that no longer exist are removed
//    (rm_stat); out osds with a non-zero statfs.total go through
//    stat_osd_out(); osds that are not up and still have a non-empty
//    op_queue_age_hist go through stat_osd_down_up();
//  - pools missing from the osdmap: their pgs are queued in pg_remove and
//    their pending stat updates are erased;
//  - pg_num changes: for pgs beyond the count recorded in
//    pgmap.num_pg_by_pool a pending pg_stat_t is created (unless one is
//    already pending) with all of its last_* stamps set to
//    osdmap.get_modified(); pgs at or above a reduced pg_num are removed;
//  - finally, pending updates for unknown pools or out-of-range pgs are
//    dropped.
// NOTE(review): the first and third parameters (cct and pgmap, judging by
// their use below) are not visible in this excerpt.
3609 void PGMapUpdater::check_osd_map(
3611 const OSDMap
& osdmap
,
3613 PGMap::Incremental
*pending_inc
)
// Walk the recorded osd stats and adjust them to each osd's state in osdmap.
3615 for (auto& p
: pgmap
.osd_stat
) {
3616 if (!osdmap
.exists(p
.first
)) {
3618 pending_inc
->rm_stat(p
.first
);
3619 } else if (osdmap
.is_out(p
.first
)) {
3621 if (p
.second
.statfs
.total
!= 0) {
3622 pending_inc
->stat_osd_out(p
.first
);
3624 } else if (!osdmap
.is_up(p
.first
)) {
3625 // zero the op_queue_age_hist
3626 if (!p
.second
.op_queue_age_hist
.empty()) {
3627 pending_inc
->stat_osd_down_up(p
.first
, pgmap
);
3632 // deleted pgs (pools)?
3633 for (auto& p
: pgmap
.pg_pool_sum
) {
3634 if (!osdmap
.have_pg_pool(p
.first
)) {
3635 ldout(cct
, 10) << __func__
<< " pool " << p
.first
<< " gone, removing pgs"
3637 for (auto& q
: pgmap
.pg_stat
) {
3638 if (q
.first
.pool() == p
.first
) {
3639 pending_inc
->pg_remove
.insert(q
.first
);
// Also erase pending stat updates that belong to the removed pool.
3642 auto q
= pending_inc
->pg_stat_updates
.begin();
3643 while (q
!= pending_inc
->pg_stat_updates
.end()) {
3644 if (q
->first
.pool() == p
.first
) {
3645 q
= pending_inc
->pg_stat_updates
.erase(q
);
3653 // new (split or new pool) or merged pgs?
// new_pg_num remembers the osdmap's pg_num per pool for the final pass.
3654 map
<int64_t,unsigned> new_pg_num
;
3655 for (auto& p
: osdmap
.get_pools()) {
3656 int64_t poolid
= p
.first
;
3657 const pg_pool_t
& pi
= p
.second
;
// my_pg_num is the pg count this pg map has for the pool (0 if unknown).
3658 auto q
= pgmap
.num_pg_by_pool
.find(poolid
);
3659 unsigned my_pg_num
= 0;
3660 if (q
!= pgmap
.num_pg_by_pool
.end())
3661 my_pg_num
= q
->second
;
3662 unsigned pg_num
= pi
.get_pg_num();
3663 new_pg_num
[poolid
] = pg_num
;
3664 if (my_pg_num
< pg_num
) {
3665 ldout(cct
,10) << __func__
<< " pool " << poolid
<< " pg_num " << pg_num
3666 << " > my pg_num " << my_pg_num
<< dendl
;
// Create a pending stat entry for every pg in [my_pg_num, pg_num) that does
// not have one yet, stamping all last_* times with the osdmap's mtime.
3667 for (unsigned ps
= my_pg_num
; ps
< pg_num
; ++ps
) {
3668 pg_t
pgid(ps
, poolid
);
3669 if (pending_inc
->pg_stat_updates
.count(pgid
) == 0) {
3670 ldout(cct
,20) << __func__
<< " adding " << pgid
<< dendl
;
3671 pg_stat_t
&stats
= pending_inc
->pg_stat_updates
[pgid
];
3672 stats
.last_fresh
= osdmap
.get_modified();
3673 stats
.last_active
= osdmap
.get_modified();
3674 stats
.last_change
= osdmap
.get_modified();
3675 stats
.last_peered
= osdmap
.get_modified();
3676 stats
.last_clean
= osdmap
.get_modified();
3677 stats
.last_unstale
= osdmap
.get_modified();
3678 stats
.last_undegraded
= osdmap
.get_modified();
3679 stats
.last_fullsized
= osdmap
.get_modified();
3680 stats
.last_scrub_stamp
= osdmap
.get_modified();
3681 stats
.last_deep_scrub_stamp
= osdmap
.get_modified();
3682 stats
.last_clean_scrub_stamp
= osdmap
.get_modified();
3685 } else if (my_pg_num
> pg_num
) {
3686 ldout(cct
,10) << __func__
<< " pool " << poolid
<< " pg_num " << pg_num
3687 << " < my pg_num " << my_pg_num
<< dendl
;
// pg_num shrank: queue removal of the known pgs in [pg_num, my_pg_num) and
// erase any pending update for them.
3688 for (unsigned i
= pg_num
; i
< my_pg_num
; ++i
) {
3689 pg_t
pgid(i
, poolid
);
3690 ldout(cct
,20) << __func__
<< " removing merged " << pgid
<< dendl
;
3691 if (pgmap
.pg_stat
.count(pgid
)) {
3692 pending_inc
->pg_remove
.insert(pgid
);
3694 pending_inc
->pg_stat_updates
.erase(pgid
);
// Final pass: drop pending updates whose pool is not in the osdmap or whose
// ps is at or beyond that pool's pg_num.
3698 auto i
= pending_inc
->pg_stat_updates
.begin();
3699 while (i
!= pending_inc
->pg_stat_updates
.end()) {
3700 auto j
= new_pg_num
.find(i
->first
.pool());
3701 if (j
== new_pg_num
.end() ||
3702 i
->first
.ps() >= j
->second
) {
3703 ldout(cct
,20) << __func__
<< " removing pending update to old "
3704 << i
->first
<< dendl
;
3705 i
= pending_inc
->pg_stat_updates
.erase(i
);
// Flag one pg as stale in *pending_inc.  Nothing is done unless the pg's
// current stats (cur) are not already stale and its acting primary is set
// (!= -1) and down according to osdmap.
//
// If a pending update for pgid exists it is reused when it names the same
// acting primary as cur, or when it is itself non-stale with a set, down
// acting primary.  If no pending update exists, the entry is taken from
// pending_inc->pg_stat_updates[pgid].  The chosen entry gets PG_STATE_STALE
// or-ed into its state and last_unstale set to ceph_clock_now().
// NOTE(review): the pgid parameter, the declaration of newstat and the
// handling of a pending update that fails both tests are not visible in
// this excerpt.
3712 static void _try_mark_pg_stale(
3713 const OSDMap
& osdmap
,
3715 const pg_stat_t
& cur
,
3716 PGMap::Incremental
*pending_inc
)
3718 if ((cur
.state
& PG_STATE_STALE
) == 0 &&
3719 cur
.acting_primary
!= -1 &&
3720 osdmap
.is_down(cur
.acting_primary
)) {
// Prefer an already-pending update for this pg, if it still qualifies.
3722 auto q
= pending_inc
->pg_stat_updates
.find(pgid
);
3723 if (q
!= pending_inc
->pg_stat_updates
.end()) {
3724 if ((q
->second
.acting_primary
== cur
.acting_primary
) ||
3725 ((q
->second
.state
& PG_STATE_STALE
) == 0 &&
3726 q
->second
.acting_primary
!= -1 &&
3727 osdmap
.is_down(q
->second
.acting_primary
))) {
3728 newstat
= &q
->second
;
3730 // pending update is no longer down or already stale
3734 newstat
= &pending_inc
->pg_stat_updates
[pgid
];
3737 dout(10) << __func__
<< " marking pg " << pgid
3738 << " stale (acting_primary " << newstat
->acting_primary
3740 newstat
->state
|= PG_STATE_STALE
;
3741 newstat
->last_unstale
= ceph_clock_now();
// Run _try_mark_pg_stale() over pgs that may have lost their acting primary,
// recording the results in *pending_inc.
//
// The number of osds in need_check_down_pg_osds is compared against
// osdmap.get_num_osds() * mon_pg_check_down_all_threshold.  Two scans are
// visible below: one over every entry of pg_map.pg_stat, and one that, for
// each osd in need_check_down_pg_osds that is down, visits only the pgs
// listed for it in pg_map.pg_by_osd.
// NOTE(review): the statements selecting between the two scans are not
// visible in this excerpt; presumably the threshold test picks the full
// scan -- confirm against the full definition.
3745 void PGMapUpdater::check_down_pgs(
3746 const OSDMap
&osdmap
,
3747 const PGMap
&pg_map
,
3749 const set
<int>& need_check_down_pg_osds
,
3750 PGMap::Incremental
*pending_inc
)
3752 // if a large number of osds changed state, just iterate over the whole
3754 if (need_check_down_pg_osds
.size() > (unsigned)osdmap
.get_num_osds() *
3755 g_conf().get_val
<double>("mon_pg_check_down_all_threshold")) {
// Full scan: consider every pg in the map.
3760 for (const auto& p
: pg_map
.pg_stat
) {
3761 _try_mark_pg_stale(osdmap
, p
.first
, p
.second
, pending_inc
);
// Targeted scan: only the pgs recorded for each osd that is actually down.
3764 for (auto osd
: need_check_down_pg_osds
) {
3765 if (osdmap
.is_down(osd
)) {
3766 auto p
= pg_map
.pg_by_osd
.find(osd
);
3767 if (p
== pg_map
.pg_by_osd
.end()) {
3770 for (auto pgid
: p
->second
) {
3771 const pg_stat_t
&stat
= pg_map
.pg_stat
.at(pgid
);
// pg_by_osd is expected to list only pgs whose acting primary is this osd.
3772 ceph_assert(stat
.acting_primary
== osd
);
3773 _try_mark_pg_stale(osdmap
, pgid
, stat
, pending_inc
);
3780 int reweight::by_utilization(
3781 const OSDMap
&osdmap
,
3786 bool by_pg
, const set
<int64_t> *pools
,
3788 mempool::osdmap::map
<int32_t, uint32_t>* new_weights
,
3789 std::stringstream
*ss
,
3790 std::string
*out_str
,
3794 *ss
<< "You must give a percentage higher than 100. "
3795 "The reweighting threshold will be calculated as <average-utilization> "
3796 "times <input-percentage>. For example, an argument of 200 would "
3797 "reweight OSDs which are twice as utilized as the average OSD.\n";
3801 vector
<int> pgs_by_osd(osdmap
.get_max_osd());
3803 // Avoid putting a small number (or 0) in the denominator when calculating
3805 double average_util
;
3808 double weight_sum
= 0.0; // sum up the crush weights
3809 unsigned num_pg_copies
= 0;
3811 for (const auto& pg
: pgm
.pg_stat
) {
3812 if (pools
&& pools
->count(pg
.first
.pool()) == 0)
3814 for (const auto acting
: pg
.second
.acting
) {
3815 if (!osdmap
.exists(acting
)) {
3818 if (acting
>= (int)pgs_by_osd
.size())
3819 pgs_by_osd
.resize(acting
);
3820 if (pgs_by_osd
[acting
] == 0) {
3821 if (osdmap
.crush
->get_item_weightf(acting
) <= 0) {
3822 //skip if we currently can not identify item
3825 weight_sum
+= osdmap
.crush
->get_item_weightf(acting
);
3828 ++pgs_by_osd
[acting
];
3833 if (!num_osds
|| (num_pg_copies
/ num_osds
< g_conf()->mon_reweight_min_pgs_per_osd
)) {
3834 *ss
<< "Refusing to reweight: we only have " << num_pg_copies
3835 << " PGs across " << num_osds
<< " osds!\n";
3839 average_util
= (double)num_pg_copies
/ weight_sum
;
3841 // by osd utilization
3842 int num_osd
= std::max
<size_t>(1, pgm
.osd_stat
.size());
3843 if ((uint64_t)pgm
.osd_sum
.statfs
.total
/ num_osd
3844 < g_conf()->mon_reweight_min_bytes_per_osd
) {
3845 *ss
<< "Refusing to reweight: we only have " << pgm
.osd_sum
.statfs
.kb()
3846 << " kb across all osds!\n";
3849 if ((uint64_t)pgm
.osd_sum
.statfs
.get_used_raw() / num_osd
3850 < g_conf()->mon_reweight_min_bytes_per_osd
) {
3851 *ss
<< "Refusing to reweight: we only have "
3852 << pgm
.osd_sum
.statfs
.kb_used_raw()
3853 << " kb used across all osds!\n";
3857 average_util
= (double)pgm
.osd_sum
.statfs
.get_used_raw() /
3858 (double)pgm
.osd_sum
.statfs
.total
;
3861 // adjust down only if we are above the threshold
3862 const double overload_util
= average_util
* (double)oload
/ 100.0;
3864 // but aggressively adjust weights up whenever possible.
3865 const double underload_util
= average_util
;
3867 const unsigned max_change
= (unsigned)(max_changef
* (double)0x10000);
3871 f
->open_object_section("reweight_by_utilization");
3872 f
->dump_int("overload_min", oload
);
3873 f
->dump_float("max_change", max_changef
);
3874 f
->dump_int("max_change_osds", max_osds
);
3875 f
->dump_float("average_utilization", average_util
);
3876 f
->dump_float("overload_utilization", overload_util
);
3878 oss
<< "oload " << oload
<< "\n";
3879 oss
<< "max_change " << max_changef
<< "\n";
3880 oss
<< "max_change_osds " << max_osds
<< "\n";
3882 oss
<< "average_utilization " << std::fixed
<< average_util
<< "\n";
3883 oss
<< "overload_utilization " << overload_util
<< "\n";
3885 int num_changed
= 0;
3887 // precompute util for each OSD
3888 std::vector
<std::pair
<int, float> > util_by_osd
;
3889 for (const auto& p
: pgm
.osd_stat
) {
3890 std::pair
<int, float> osd_util
;
3891 osd_util
.first
= p
.first
;
3893 if (p
.first
>= (int)pgs_by_osd
.size() ||
3894 pgs_by_osd
[p
.first
] == 0) {
3895 // skip if this OSD does not contain any pg
3896 // belonging to the specified pool(s).
3900 if (osdmap
.crush
->get_item_weightf(p
.first
) <= 0) {
3901 // skip if we are unable to locate item.
3906 pgs_by_osd
[p
.first
] / osdmap
.crush
->get_item_weightf(p
.first
);
3909 (double)p
.second
.statfs
.get_used_raw() / (double)p
.second
.statfs
.total
;
3911 util_by_osd
.push_back(osd_util
);
3914 // sort by absolute deviation from the mean utilization,
3915 // in descending order.
3916 std::sort(util_by_osd
.begin(), util_by_osd
.end(),
3917 [average_util
](std::pair
<int, float> l
, std::pair
<int, float> r
) {
3918 return abs(l
.second
- average_util
) > abs(r
.second
- average_util
);
3923 f
->open_array_section("reweights");
3925 for (const auto& p
: util_by_osd
) {
3926 unsigned weight
= osdmap
.get_weight(p
.first
);
3928 // skip if OSD is currently out
3931 float util
= p
.second
;
3933 if (util
>= overload_util
) {
3934 // Assign a lower weight to overloaded OSDs. The current weight
3935 // is a factor to take into account the original weights,
3936 // to represent e.g. differing storage capacities
3937 unsigned new_weight
= (unsigned)((average_util
/ util
) * (float)weight
);
3938 if (weight
> max_change
)
3939 new_weight
= std::max(new_weight
, weight
- max_change
);
3940 new_weights
->insert({p
.first
, new_weight
});
3942 f
->open_object_section("osd");
3943 f
->dump_int("osd", p
.first
);
3944 f
->dump_float("weight", (float)weight
/ (float)0x10000);
3945 f
->dump_float("new_weight", (float)new_weight
/ (float)0x10000);
3948 oss
<< "osd." << p
.first
<< " weight "
3949 << (float)weight
/ (float)0x10000 << " -> "
3950 << (float)new_weight
/ (float)0x10000 << "\n";
3952 if (++num_changed
>= max_osds
)
3955 if (!no_increasing
&& util
<= underload_util
) {
3956 // assign a higher weight.. if we can.
3957 unsigned new_weight
= (unsigned)((average_util
/ util
) * (float)weight
);
3958 new_weight
= std::min(new_weight
, weight
+ max_change
);
3959 if (new_weight
> 0x10000)
3960 new_weight
= 0x10000;
3961 if (new_weight
> weight
) {
3962 new_weights
->insert({p
.first
, new_weight
});
3963 oss
<< "osd." << p
.first
<< " weight "
3964 << (float)weight
/ (float)0x10000 << " -> "
3965 << (float)new_weight
/ (float)0x10000 << "\n";
3966 if (++num_changed
>= max_osds
)
3976 newmap
.deepish_copy_from(osdmap
);
3977 OSDMap::Incremental newinc
;
3978 newinc
.fsid
= newmap
.get_fsid();
3979 newinc
.epoch
= newmap
.get_epoch() + 1;
3980 newinc
.new_weight
= *new_weights
;
3981 newmap
.apply_incremental(newinc
);
3983 osdmap
.summarize_mapping_stats(&newmap
, pools
, out_str
, f
);
3989 *out_str
+= oss
.str();