1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include <boost/algorithm/string.hpp>
8 #define dout_subsys ceph_subsys_mon
9 #include "common/debug.h"
10 #include "common/Formatter.h"
11 #include "include/ceph_features.h"
12 #include "include/stringify.h"
14 #include "osd/osd_types.h"
15 #include "osd/OSDMap.h"
17 #define dout_context g_ceph_context
19 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMapDigest
, pgmap_digest
, pgmap
);
20 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap
, pgmap
, pgmap
);
21 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap::Incremental
, pgmap_inc
, pgmap
);
24 // ---------------------
27 void PGMapDigest::encode(bufferlist
& bl
, uint64_t features
) const
29 // NOTE: see PGMap::encode_digest
30 ENCODE_START(1, 1, bl
);
32 ::encode(num_pg_active
, bl
);
33 ::encode(num_pg_unknown
, bl
);
34 ::encode(num_osd
, bl
);
35 ::encode(pg_pool_sum
, bl
, features
);
36 ::encode(pg_sum
, bl
, features
);
37 ::encode(osd_sum
, bl
);
38 ::encode(num_pg_by_state
, bl
);
39 ::encode(num_pg_by_osd
, bl
);
40 ::encode(num_pg_by_pool
, bl
);
41 ::encode(osd_last_seq
, bl
);
42 ::encode(per_pool_sum_delta
, bl
, features
);
43 ::encode(per_pool_sum_deltas_stamps
, bl
);
44 ::encode(pg_sum_delta
, bl
, features
);
45 ::encode(stamp_delta
, bl
);
46 ::encode(avail_space_by_rule
, bl
);
50 void PGMapDigest::decode(bufferlist::iterator
& p
)
54 ::decode(num_pg_active
, p
);
55 ::decode(num_pg_unknown
, p
);
57 ::decode(pg_pool_sum
, p
);
60 ::decode(num_pg_by_state
, p
);
61 ::decode(num_pg_by_osd
, p
);
62 ::decode(num_pg_by_pool
, p
);
63 ::decode(osd_last_seq
, p
);
64 ::decode(per_pool_sum_delta
, p
);
65 ::decode(per_pool_sum_deltas_stamps
, p
);
66 ::decode(pg_sum_delta
, p
);
67 ::decode(stamp_delta
, p
);
68 ::decode(avail_space_by_rule
, p
);
72 void PGMapDigest::dump(Formatter
*f
) const
74 f
->dump_unsigned("num_pg", num_pg
);
75 f
->dump_unsigned("num_pg_active", num_pg_active
);
76 f
->dump_unsigned("num_pg_unknown", num_pg_unknown
);
77 f
->dump_unsigned("num_osd", num_osd
);
78 f
->dump_object("pool_sum", pg_sum
);
79 f
->dump_object("osd_sum", osd_sum
);
80 f
->open_array_section("pool_stats");
81 for (auto& p
: pg_pool_sum
) {
82 f
->open_object_section("pool_stat");
83 f
->dump_int("poolid", p
.first
);
84 auto q
= num_pg_by_pool
.find(p
.first
);
85 if (q
!= num_pg_by_pool
.end())
86 f
->dump_unsigned("num_pg", q
->second
);
91 f
->open_array_section("osd_stats");
93 // TODO: this isn't really correct since we can dump non-existent OSDs
94 // I dunno what osd_last_seq is set to in that case...
95 for (auto& p
: osd_last_seq
) {
96 f
->open_object_section("osd_stat");
97 f
->dump_int("osd", i
);
98 f
->dump_unsigned("seq", p
);
103 f
->open_array_section("num_pg_by_state");
104 for (auto& p
: num_pg_by_state
) {
105 f
->open_object_section("count");
106 f
->dump_string("state", pg_state_string(p
.first
));
107 f
->dump_unsigned("num", p
.second
);
111 f
->open_array_section("num_pg_by_osd");
112 for (auto& p
: num_pg_by_osd
) {
113 f
->open_object_section("count");
114 f
->dump_unsigned("osd", p
.first
);
115 f
->dump_unsigned("num_primary_pg", p
.second
.primary
);
116 f
->dump_unsigned("num_acting_pg", p
.second
.acting
);
117 f
->dump_unsigned("num_up_pg", p
.second
.up
);
123 void PGMapDigest::generate_test_instances(list
<PGMapDigest
*>& ls
)
125 ls
.push_back(new PGMapDigest
);
128 inline std::string
percentify(const float& a
) {
129 std::stringstream ss
;
133 ss
<< std::fixed
<< std::setprecision(2) << a
;
137 void PGMapDigest::print_summary(Formatter
*f
, ostream
*out
) const
140 f
->open_array_section("pgs_by_state");
142 // list is descending numeric order (by count)
143 multimap
<int,int> state_by_count
; // count -> state
144 for (auto p
= num_pg_by_state
.begin();
145 p
!= num_pg_by_state
.end();
147 state_by_count
.insert(make_pair(p
->second
, p
->first
));
150 for (auto p
= state_by_count
.rbegin();
151 p
!= state_by_count
.rend();
154 f
->open_object_section("pgs_by_state_element");
155 f
->dump_string("state_name", pg_state_string(p
->second
));
156 f
->dump_unsigned("count", p
->first
);
164 f
->dump_unsigned("num_pgs", num_pg
);
165 f
->dump_unsigned("num_pools", pg_pool_sum
.size());
166 f
->dump_unsigned("num_objects", pg_sum
.stats
.sum
.num_objects
);
167 f
->dump_unsigned("data_bytes", pg_sum
.stats
.sum
.num_bytes
);
168 f
->dump_unsigned("bytes_used", osd_sum
.kb_used
* 1024ull);
169 f
->dump_unsigned("bytes_avail", osd_sum
.kb_avail
* 1024ull);
170 f
->dump_unsigned("bytes_total", osd_sum
.kb
* 1024ull);
172 *out
<< " pools: " << pg_pool_sum
.size() << " pools, "
173 << num_pg
<< " pgs\n";
174 *out
<< " objects: " << si_t(pg_sum
.stats
.sum
.num_objects
) << " objects, "
175 << prettybyte_t(pg_sum
.stats
.sum
.num_bytes
) << "\n";
177 << kb_t(osd_sum
.kb_used
) << " used, "
178 << kb_t(osd_sum
.kb_avail
) << " / "
179 << kb_t(osd_sum
.kb
) << " avail\n";
185 if (num_pg_unknown
> 0) {
186 float p
= (float)num_pg_unknown
/ (float)num_pg
;
188 f
->dump_float("unknown_pgs_ratio", p
);
191 snprintf(b
, sizeof(b
), "%.3lf", p
* 100.0);
192 *out
<< b
<< "% pgs unknown\n";
197 int num_pg_inactive
= num_pg
- num_pg_active
- num_pg_unknown
;
198 if (num_pg_inactive
> 0) {
199 float p
= (float)num_pg_inactive
/ (float)num_pg
;
201 f
->dump_float("inactive_pgs_ratio", p
);
207 snprintf(b
, sizeof(b
), "%.3f", p
* 100.0);
208 *out
<< b
<< "% pgs not active\n";
214 overall_recovery_summary(f
, &sl
);
215 if (!f
&& !sl
.empty()) {
216 for (auto p
= sl
.begin(); p
!= sl
.end(); ++p
) {
227 unsigned max_width
= 1;
228 for (multimap
<int,int>::reverse_iterator p
= state_by_count
.rbegin();
229 p
!= state_by_count
.rend();
232 std::stringstream ss
;
234 max_width
= MAX(ss
.str().size(), max_width
);
237 for (multimap
<int,int>::reverse_iterator p
= state_by_count
.rbegin();
238 p
!= state_by_count
.rend();
245 out
->setf(std::ios::left
);
246 *out
<< std::setw(max_width
) << p
->first
247 << " " << pg_state_string(p
->second
) << "\n";
248 out
->unsetf(std::ios::left
);
252 ostringstream ss_rec_io
;
253 overall_recovery_rate_summary(f
, &ss_rec_io
);
254 ostringstream ss_client_io
;
255 overall_client_io_rate_summary(f
, &ss_client_io
);
256 ostringstream ss_cache_io
;
257 overall_cache_io_rate_summary(f
, &ss_cache_io
);
259 if (!f
&& (ss_client_io
.str().length() || ss_rec_io
.str().length()
260 || ss_cache_io
.str().length())) {
265 if (!f
&& ss_client_io
.str().length())
266 *out
<< " client: " << ss_client_io
.str() << "\n";
267 if (!f
&& ss_rec_io
.str().length())
268 *out
<< " recovery: " << ss_rec_io
.str() << "\n";
269 if (!f
&& ss_cache_io
.str().length())
270 *out
<< " cache: " << ss_cache_io
.str() << "\n";
273 void PGMapDigest::print_oneline_summary(Formatter
*f
, ostream
*out
) const
275 std::stringstream ss
;
278 f
->open_array_section("num_pg_by_state");
279 for (auto p
= num_pg_by_state
.begin();
280 p
!= num_pg_by_state
.end();
283 f
->open_object_section("state");
284 f
->dump_string("name", pg_state_string(p
->first
));
285 f
->dump_unsigned("num", p
->second
);
288 if (p
!= num_pg_by_state
.begin())
290 ss
<< p
->second
<< " " << pg_state_string(p
->first
);
295 string states
= ss
.str();
297 *out
<< num_pg
<< " pgs: "
299 << prettybyte_t(pg_sum
.stats
.sum
.num_bytes
) << " data, "
300 << kb_t(osd_sum
.kb_used
) << " used, "
301 << kb_t(osd_sum
.kb_avail
) << " / "
302 << kb_t(osd_sum
.kb
) << " avail";
304 f
->dump_unsigned("num_pgs", num_pg
);
305 f
->dump_unsigned("num_bytes", pg_sum
.stats
.sum
.num_bytes
);
306 f
->dump_unsigned("raw_bytes_used", osd_sum
.kb_used
<< 10);
307 f
->dump_unsigned("raw_bytes_avail", osd_sum
.kb_avail
<< 10);
308 f
->dump_unsigned("raw_bytes", osd_sum
.kb
<< 10);
311 // make non-negative; we can get negative values if osds send
312 // uncommitted stats and then "go backward" or if they are just
314 pool_stat_t pos_delta
= pg_sum_delta
;
316 if (pos_delta
.stats
.sum
.num_rd
||
317 pos_delta
.stats
.sum
.num_wr
) {
320 if (pos_delta
.stats
.sum
.num_rd
) {
321 int64_t rd
= (pos_delta
.stats
.sum
.num_rd_kb
<< 10) / (double)stamp_delta
;
323 *out
<< pretty_si_t(rd
) << "B/s rd, ";
325 f
->dump_unsigned("read_bytes_sec", rd
);
327 if (pos_delta
.stats
.sum
.num_wr
) {
328 int64_t wr
= (pos_delta
.stats
.sum
.num_wr_kb
<< 10) / (double)stamp_delta
;
330 *out
<< pretty_si_t(wr
) << "B/s wr, ";
332 f
->dump_unsigned("write_bytes_sec", wr
);
334 int64_t iops
= (pos_delta
.stats
.sum
.num_rd
+ pos_delta
.stats
.sum
.num_wr
) / (double)stamp_delta
;
336 *out
<< pretty_si_t(iops
) << "op/s";
338 f
->dump_unsigned("io_sec", iops
);
342 overall_recovery_summary(f
, &sl
);
344 for (auto p
= sl
.begin(); p
!= sl
.end(); ++p
)
346 std::stringstream ssr
;
347 overall_recovery_rate_summary(f
, &ssr
);
348 if (out
&& ssr
.str().length())
349 *out
<< "; " << ssr
.str() << " recovering";
352 void PGMapDigest::recovery_summary(Formatter
*f
, list
<string
> *psl
,
353 const pool_stat_t
& pool_sum
) const
355 if (pool_sum
.stats
.sum
.num_objects_degraded
&& pool_sum
.stats
.sum
.num_object_copies
> 0) {
356 double pc
= (double)pool_sum
.stats
.sum
.num_objects_degraded
/
357 (double)pool_sum
.stats
.sum
.num_object_copies
* (double)100.0;
359 snprintf(b
, sizeof(b
), "%.3lf", pc
);
361 f
->dump_unsigned("degraded_objects", pool_sum
.stats
.sum
.num_objects_degraded
);
362 f
->dump_unsigned("degraded_total", pool_sum
.stats
.sum
.num_object_copies
);
363 f
->dump_float("degraded_ratio", pc
/ 100.0);
366 ss
<< pool_sum
.stats
.sum
.num_objects_degraded
367 << "/" << pool_sum
.stats
.sum
.num_object_copies
<< " objects degraded (" << b
<< "%)";
368 psl
->push_back(ss
.str());
371 if (pool_sum
.stats
.sum
.num_objects_misplaced
&& pool_sum
.stats
.sum
.num_object_copies
> 0) {
372 double pc
= (double)pool_sum
.stats
.sum
.num_objects_misplaced
/
373 (double)pool_sum
.stats
.sum
.num_object_copies
* (double)100.0;
375 snprintf(b
, sizeof(b
), "%.3lf", pc
);
377 f
->dump_unsigned("misplaced_objects", pool_sum
.stats
.sum
.num_objects_misplaced
);
378 f
->dump_unsigned("misplaced_total", pool_sum
.stats
.sum
.num_object_copies
);
379 f
->dump_float("misplaced_ratio", pc
/ 100.0);
382 ss
<< pool_sum
.stats
.sum
.num_objects_misplaced
383 << "/" << pool_sum
.stats
.sum
.num_object_copies
<< " objects misplaced (" << b
<< "%)";
384 psl
->push_back(ss
.str());
387 if (pool_sum
.stats
.sum
.num_objects_unfound
&& pool_sum
.stats
.sum
.num_objects
) {
388 double pc
= (double)pool_sum
.stats
.sum
.num_objects_unfound
/
389 (double)pool_sum
.stats
.sum
.num_objects
* (double)100.0;
391 snprintf(b
, sizeof(b
), "%.3lf", pc
);
393 f
->dump_unsigned("unfound_objects", pool_sum
.stats
.sum
.num_objects_unfound
);
394 f
->dump_unsigned("unfound_total", pool_sum
.stats
.sum
.num_objects
);
395 f
->dump_float("unfound_ratio", pc
/ 100.0);
398 ss
<< pool_sum
.stats
.sum
.num_objects_unfound
399 << "/" << pool_sum
.stats
.sum
.num_objects
<< " objects unfound (" << b
<< "%)";
400 psl
->push_back(ss
.str());
405 void PGMapDigest::recovery_rate_summary(Formatter
*f
, ostream
*out
,
406 const pool_stat_t
& delta_sum
,
407 utime_t delta_stamp
) const
409 // make non-negative; we can get negative values if osds send
410 // uncommitted stats and then "go backward" or if they are just
412 pool_stat_t pos_delta
= delta_sum
;
414 if (pos_delta
.stats
.sum
.num_objects_recovered
||
415 pos_delta
.stats
.sum
.num_bytes_recovered
||
416 pos_delta
.stats
.sum
.num_keys_recovered
) {
417 int64_t objps
= pos_delta
.stats
.sum
.num_objects_recovered
/ (double)delta_stamp
;
418 int64_t bps
= pos_delta
.stats
.sum
.num_bytes_recovered
/ (double)delta_stamp
;
419 int64_t kps
= pos_delta
.stats
.sum
.num_keys_recovered
/ (double)delta_stamp
;
421 f
->dump_int("recovering_objects_per_sec", objps
);
422 f
->dump_int("recovering_bytes_per_sec", bps
);
423 f
->dump_int("recovering_keys_per_sec", kps
);
424 f
->dump_int("num_objects_recovered", pos_delta
.stats
.sum
.num_objects_recovered
);
425 f
->dump_int("num_bytes_recovered", pos_delta
.stats
.sum
.num_bytes_recovered
);
426 f
->dump_int("num_keys_recovered", pos_delta
.stats
.sum
.num_keys_recovered
);
428 *out
<< pretty_si_t(bps
) << "B/s";
429 if (pos_delta
.stats
.sum
.num_keys_recovered
)
430 *out
<< ", " << pretty_si_t(kps
) << "keys/s";
431 *out
<< ", " << pretty_si_t(objps
) << "objects/s";
436 void PGMapDigest::overall_recovery_rate_summary(Formatter
*f
, ostream
*out
) const
438 recovery_rate_summary(f
, out
, pg_sum_delta
, stamp_delta
);
441 void PGMapDigest::overall_recovery_summary(Formatter
*f
, list
<string
> *psl
) const
443 recovery_summary(f
, psl
, pg_sum
);
446 void PGMapDigest::pool_recovery_rate_summary(Formatter
*f
, ostream
*out
,
447 uint64_t poolid
) const
449 auto p
= per_pool_sum_delta
.find(poolid
);
450 if (p
== per_pool_sum_delta
.end())
453 auto ts
= per_pool_sum_deltas_stamps
.find(p
->first
);
454 assert(ts
!= per_pool_sum_deltas_stamps
.end());
455 recovery_rate_summary(f
, out
, p
->second
.first
, ts
->second
);
458 void PGMapDigest::pool_recovery_summary(Formatter
*f
, list
<string
> *psl
,
459 uint64_t poolid
) const
461 auto p
= pg_pool_sum
.find(poolid
);
462 if (p
== pg_pool_sum
.end())
465 recovery_summary(f
, psl
, p
->second
);
468 void PGMapDigest::client_io_rate_summary(Formatter
*f
, ostream
*out
,
469 const pool_stat_t
& delta_sum
,
470 utime_t delta_stamp
) const
472 pool_stat_t pos_delta
= delta_sum
;
474 if (pos_delta
.stats
.sum
.num_rd
||
475 pos_delta
.stats
.sum
.num_wr
) {
476 if (pos_delta
.stats
.sum
.num_rd
) {
477 int64_t rd
= (pos_delta
.stats
.sum
.num_rd_kb
<< 10) / (double)delta_stamp
;
479 f
->dump_int("read_bytes_sec", rd
);
481 *out
<< pretty_si_t(rd
) << "B/s rd, ";
484 if (pos_delta
.stats
.sum
.num_wr
) {
485 int64_t wr
= (pos_delta
.stats
.sum
.num_wr_kb
<< 10) / (double)delta_stamp
;
487 f
->dump_int("write_bytes_sec", wr
);
489 *out
<< pretty_si_t(wr
) << "B/s wr, ";
492 int64_t iops_rd
= pos_delta
.stats
.sum
.num_rd
/ (double)delta_stamp
;
493 int64_t iops_wr
= pos_delta
.stats
.sum
.num_wr
/ (double)delta_stamp
;
495 f
->dump_int("read_op_per_sec", iops_rd
);
496 f
->dump_int("write_op_per_sec", iops_wr
);
498 *out
<< pretty_si_t(iops_rd
) << "op/s rd, " << pretty_si_t(iops_wr
) << "op/s wr";
503 void PGMapDigest::overall_client_io_rate_summary(Formatter
*f
, ostream
*out
) const
505 client_io_rate_summary(f
, out
, pg_sum_delta
, stamp_delta
);
508 void PGMapDigest::pool_client_io_rate_summary(Formatter
*f
, ostream
*out
,
509 uint64_t poolid
) const
511 auto p
= per_pool_sum_delta
.find(poolid
);
512 if (p
== per_pool_sum_delta
.end())
515 auto ts
= per_pool_sum_deltas_stamps
.find(p
->first
);
516 assert(ts
!= per_pool_sum_deltas_stamps
.end());
517 client_io_rate_summary(f
, out
, p
->second
.first
, ts
->second
);
520 void PGMapDigest::cache_io_rate_summary(Formatter
*f
, ostream
*out
,
521 const pool_stat_t
& delta_sum
,
522 utime_t delta_stamp
) const
524 pool_stat_t pos_delta
= delta_sum
;
526 bool have_output
= false;
528 if (pos_delta
.stats
.sum
.num_flush
) {
529 int64_t flush
= (pos_delta
.stats
.sum
.num_flush_kb
<< 10) / (double)delta_stamp
;
531 f
->dump_int("flush_bytes_sec", flush
);
533 *out
<< pretty_si_t(flush
) << "B/s flush";
537 if (pos_delta
.stats
.sum
.num_evict
) {
538 int64_t evict
= (pos_delta
.stats
.sum
.num_evict_kb
<< 10) / (double)delta_stamp
;
540 f
->dump_int("evict_bytes_sec", evict
);
544 *out
<< pretty_si_t(evict
) << "B/s evict";
548 if (pos_delta
.stats
.sum
.num_promote
) {
549 int64_t promote
= pos_delta
.stats
.sum
.num_promote
/ (double)delta_stamp
;
551 f
->dump_int("promote_op_per_sec", promote
);
555 *out
<< pretty_si_t(promote
) << "op/s promote";
559 if (pos_delta
.stats
.sum
.num_flush_mode_low
) {
561 f
->dump_int("num_flush_mode_low", pos_delta
.stats
.sum
.num_flush_mode_low
);
565 *out
<< pretty_si_t(pos_delta
.stats
.sum
.num_flush_mode_low
) << "PG(s) flushing";
569 if (pos_delta
.stats
.sum
.num_flush_mode_high
) {
571 f
->dump_int("num_flush_mode_high", pos_delta
.stats
.sum
.num_flush_mode_high
);
575 *out
<< pretty_si_t(pos_delta
.stats
.sum
.num_flush_mode_high
) << "PG(s) flushing (high)";
579 if (pos_delta
.stats
.sum
.num_evict_mode_some
) {
581 f
->dump_int("num_evict_mode_some", pos_delta
.stats
.sum
.num_evict_mode_some
);
585 *out
<< pretty_si_t(pos_delta
.stats
.sum
.num_evict_mode_some
) << "PG(s) evicting";
589 if (pos_delta
.stats
.sum
.num_evict_mode_full
) {
591 f
->dump_int("num_evict_mode_full", pos_delta
.stats
.sum
.num_evict_mode_full
);
595 *out
<< pretty_si_t(pos_delta
.stats
.sum
.num_evict_mode_full
) << "PG(s) evicting (full)";
600 void PGMapDigest::overall_cache_io_rate_summary(Formatter
*f
, ostream
*out
) const
602 cache_io_rate_summary(f
, out
, pg_sum_delta
, stamp_delta
);
605 void PGMapDigest::pool_cache_io_rate_summary(Formatter
*f
, ostream
*out
,
606 uint64_t poolid
) const
608 auto p
= per_pool_sum_delta
.find(poolid
);
609 if (p
== per_pool_sum_delta
.end())
612 auto ts
= per_pool_sum_deltas_stamps
.find(p
->first
);
613 assert(ts
!= per_pool_sum_deltas_stamps
.end());
614 cache_io_rate_summary(f
, out
, p
->second
.first
, ts
->second
);
617 static float pool_raw_used_rate(const OSDMap
&osd_map
, int64_t poolid
)
619 const pg_pool_t
*pool
= osd_map
.get_pg_pool(poolid
);
621 switch (pool
->get_type()) {
622 case pg_pool_t::TYPE_REPLICATED
:
623 return pool
->get_size();
625 case pg_pool_t::TYPE_ERASURE
:
628 osd_map
.get_erasure_code_profile(pool
->erasure_code_profile
);
629 auto pm
= ecp
.find("m");
630 auto pk
= ecp
.find("k");
631 if (pm
!= ecp
.end() && pk
!= ecp
.end()) {
632 int k
= atoi(pk
->second
.c_str());
633 int m
= atoi(pm
->second
.c_str());
637 return (float)mk
/ k
;
644 assert(0 == "unrecognized pool type");
648 ceph_statfs
PGMapDigest::get_statfs(OSDMap
&osdmap
,
649 boost::optional
<int64_t> data_pool
) const
653 object_stat_sum_t sum
;
656 auto i
= pg_pool_sum
.find(*data_pool
);
657 if (i
!= pg_pool_sum
.end()) {
658 sum
= i
->second
.stats
.sum
;
664 statfs
.kb_used
= (sum
.num_bytes
>> 10);
665 statfs
.kb_avail
= get_pool_free_space(osdmap
, *data_pool
) >> 10;
666 statfs
.num_objects
= sum
.num_objects
;
667 statfs
.kb
= statfs
.kb_used
+ statfs
.kb_avail
;
670 statfs
.kb
= osd_sum
.kb
;
671 statfs
.kb_used
= osd_sum
.kb_used
;
672 statfs
.kb_avail
= osd_sum
.kb_avail
;
673 statfs
.num_objects
= pg_sum
.stats
.sum
.num_objects
;
679 void PGMapDigest::dump_pool_stats_full(
680 const OSDMap
&osd_map
,
688 f
->open_array_section("pools");
690 tbl
.define_column("NAME", TextTable::LEFT
, TextTable::LEFT
);
691 tbl
.define_column("ID", TextTable::LEFT
, TextTable::LEFT
);
693 tbl
.define_column("QUOTA OBJECTS", TextTable::LEFT
, TextTable::LEFT
);
694 tbl
.define_column("QUOTA BYTES", TextTable::LEFT
, TextTable::LEFT
);
697 tbl
.define_column("USED", TextTable::LEFT
, TextTable::RIGHT
);
698 tbl
.define_column("%USED", TextTable::LEFT
, TextTable::RIGHT
);
699 tbl
.define_column("MAX AVAIL", TextTable::LEFT
, TextTable::RIGHT
);
700 tbl
.define_column("OBJECTS", TextTable::LEFT
, TextTable::RIGHT
);
702 tbl
.define_column("DIRTY", TextTable::LEFT
, TextTable::RIGHT
);
703 tbl
.define_column("READ", TextTable::LEFT
, TextTable::RIGHT
);
704 tbl
.define_column("WRITE", TextTable::LEFT
, TextTable::RIGHT
);
705 tbl
.define_column("RAW USED", TextTable::LEFT
, TextTable::RIGHT
);
709 map
<int,uint64_t> avail_by_rule
;
710 for (auto p
= osd_map
.get_pools().begin();
711 p
!= osd_map
.get_pools().end(); ++p
) {
712 int64_t pool_id
= p
->first
;
713 if ((pool_id
< 0) || (pg_pool_sum
.count(pool_id
) == 0))
715 const string
& pool_name
= osd_map
.get_pool_name(pool_id
);
716 const pool_stat_t
&stat
= pg_pool_sum
.at(pool_id
);
718 const pg_pool_t
*pool
= osd_map
.get_pg_pool(pool_id
);
719 int ruleno
= osd_map
.crush
->find_rule(pool
->get_crush_rule(),
724 if (avail_by_rule
.count(ruleno
) == 0) {
725 // FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked
726 avail
= get_rule_avail(ruleno
);
729 avail_by_rule
[ruleno
] = avail
;
731 avail
= avail_by_rule
[ruleno
];
734 raw_used_rate
= ::pool_raw_used_rate(osd_map
, pool_id
);
737 f
->open_object_section("pool");
738 f
->dump_string("name", pool_name
);
739 f
->dump_int("id", pool_id
);
740 f
->open_object_section("stats");
745 if (pool
->quota_max_objects
== 0)
748 tbl
<< si_t(pool
->quota_max_objects
);
750 if (pool
->quota_max_bytes
== 0)
753 tbl
<< si_t(pool
->quota_max_bytes
);
757 dump_object_stat_sum(tbl
, f
, stat
.stats
.sum
, avail
, raw_used_rate
, verbose
, pool
);
759 f
->close_section(); // stats
761 tbl
<< TextTable::endrow
;
764 f
->close_section(); // pool
769 assert(ss
!= nullptr);
776 void PGMapDigest::dump_fs_stats(stringstream
*ss
, Formatter
*f
, bool verbose
) const
779 f
->open_object_section("stats");
780 f
->dump_int("total_bytes", osd_sum
.kb
* 1024ull);
781 f
->dump_int("total_used_bytes", osd_sum
.kb_used
* 1024ull);
782 f
->dump_int("total_avail_bytes", osd_sum
.kb_avail
* 1024ull);
784 f
->dump_int("total_objects", pg_sum
.stats
.sum
.num_objects
);
788 assert(ss
!= nullptr);
790 tbl
.define_column("SIZE", TextTable::LEFT
, TextTable::RIGHT
);
791 tbl
.define_column("AVAIL", TextTable::LEFT
, TextTable::RIGHT
);
792 tbl
.define_column("RAW USED", TextTable::LEFT
, TextTable::RIGHT
);
793 tbl
.define_column("%RAW USED", TextTable::LEFT
, TextTable::RIGHT
);
795 tbl
.define_column("OBJECTS", TextTable::LEFT
, TextTable::RIGHT
);
797 tbl
<< stringify(si_t(osd_sum
.kb
*1024))
798 << stringify(si_t(osd_sum
.kb_avail
*1024))
799 << stringify(si_t(osd_sum
.kb_used
*1024));
801 if (osd_sum
.kb
> 0) {
802 used
= ((float)osd_sum
.kb_used
/ osd_sum
.kb
);
804 tbl
<< percentify(used
*100);
806 tbl
<< stringify(si_t(pg_sum
.stats
.sum
.num_objects
));
808 tbl
<< TextTable::endrow
;
816 void PGMapDigest::dump_object_stat_sum(
817 TextTable
&tbl
, Formatter
*f
,
818 const object_stat_sum_t
&sum
, uint64_t avail
,
819 float raw_used_rate
, bool verbose
,
820 const pg_pool_t
*pool
)
822 float curr_object_copies_rate
= 0.0;
823 if (sum
.num_object_copies
> 0)
824 curr_object_copies_rate
= (float)(sum
.num_object_copies
- sum
.num_objects_degraded
) / sum
.num_object_copies
;
827 // note avail passed in is raw_avail, calc raw_used here.
829 used
= sum
.num_bytes
* raw_used_rate
* curr_object_copies_rate
;
830 used
/= used
+ avail
;
831 } else if (sum
.num_bytes
) {
836 f
->dump_int("kb_used", SHIFT_ROUND_UP(sum
.num_bytes
, 10));
837 f
->dump_int("bytes_used", sum
.num_bytes
);
838 f
->dump_format_unquoted("percent_used", "%.2f", (used
*100));
839 f
->dump_unsigned("max_avail", avail
/ raw_used_rate
);
840 f
->dump_int("objects", sum
.num_objects
);
842 f
->dump_int("quota_objects", pool
->quota_max_objects
);
843 f
->dump_int("quota_bytes", pool
->quota_max_bytes
);
844 f
->dump_int("dirty", sum
.num_objects_dirty
);
845 f
->dump_int("rd", sum
.num_rd
);
846 f
->dump_int("rd_bytes", sum
.num_rd_kb
* 1024ull);
847 f
->dump_int("wr", sum
.num_wr
);
848 f
->dump_int("wr_bytes", sum
.num_wr_kb
* 1024ull);
849 f
->dump_int("raw_bytes_used", sum
.num_bytes
* raw_used_rate
* curr_object_copies_rate
);
852 tbl
<< stringify(si_t(sum
.num_bytes
));
853 tbl
<< percentify(used
*100);
854 tbl
<< si_t(avail
/ raw_used_rate
);
855 tbl
<< sum
.num_objects
;
857 tbl
<< stringify(si_t(sum
.num_objects_dirty
))
858 << stringify(si_t(sum
.num_rd
))
859 << stringify(si_t(sum
.num_wr
))
860 << stringify(si_t(sum
.num_bytes
* raw_used_rate
* curr_object_copies_rate
));
865 int64_t PGMapDigest::get_pool_free_space(const OSDMap
&osd_map
,
866 int64_t poolid
) const
868 const pg_pool_t
*pool
= osd_map
.get_pg_pool(poolid
);
869 int ruleno
= osd_map
.crush
->find_rule(pool
->get_crush_rule(),
873 avail
= get_rule_avail(ruleno
);
877 return avail
/ ::pool_raw_used_rate(osd_map
, poolid
);
880 int64_t PGMap::get_rule_avail(const OSDMap
& osdmap
, int ruleno
) const
883 int r
= osdmap
.crush
->get_rule_weight_osd_map(ruleno
, &wm
);
892 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
&&
893 osdmap
.get_full_ratio() > 0) {
894 fratio
= osdmap
.get_full_ratio();
896 fratio
= get_fallback_full_ratio();
900 for (auto p
= wm
.begin(); p
!= wm
.end(); ++p
) {
901 auto osd_info
= osd_stat
.find(p
->first
);
902 if (osd_info
!= osd_stat
.end()) {
903 if (osd_info
->second
.kb
== 0 || p
->second
== 0) {
904 // osd must be out, hence its stats have been zeroed
905 // (unless we somehow managed to have a disk with size 0...)
907 // (p->second == 0), if osd weight is 0, no need to
908 // calculate proj below.
911 double unusable
= (double)osd_info
->second
.kb
*
913 double avail
= MAX(0.0, (double)osd_info
->second
.kb_avail
- unusable
);
915 int64_t proj
= (int64_t)(avail
/ (double)p
->second
);
916 if (min
< 0 || proj
< min
) {
920 if (osdmap
.is_up(p
->first
)) {
921 // This is a level 4 rather than an error, because we might have
922 // only just started, and not received the first stats message yet.
923 dout(4) << "OSD " << p
->first
<< " is up, but has no stats" << dendl
;
930 void PGMap::get_rules_avail(const OSDMap
& osdmap
,
931 std::map
<int,int64_t> *avail_map
) const
934 for (auto p
: osdmap
.get_pools()) {
935 int64_t pool_id
= p
.first
;
936 if ((pool_id
< 0) || (pg_pool_sum
.count(pool_id
) == 0))
938 const pg_pool_t
*pool
= osdmap
.get_pg_pool(pool_id
);
939 int ruleno
= osdmap
.crush
->find_rule(pool
->get_crush_rule(),
942 if (avail_map
->count(ruleno
) == 0)
943 (*avail_map
)[ruleno
] = get_rule_avail(osdmap
, ruleno
);
947 // ---------------------
950 void PGMap::Incremental::encode(bufferlist
&bl
, uint64_t features
) const
952 if ((features
& CEPH_FEATURE_MONENC
) == 0) {
955 ::encode(version
, bl
);
956 ::encode(pg_stat_updates
, bl
);
957 ::encode(osd_stat_updates
, bl
);
958 ::encode(osd_stat_rm
, bl
);
959 ::encode(osdmap_epoch
, bl
);
960 ::encode(pg_scan
, bl
);
961 ::encode(full_ratio
, bl
);
962 ::encode(nearfull_ratio
, bl
);
963 ::encode(pg_remove
, bl
);
967 ENCODE_START(7, 5, bl
);
968 ::encode(version
, bl
);
969 ::encode(pg_stat_updates
, bl
);
970 ::encode(osd_stat_updates
, bl
);
971 ::encode(osd_stat_rm
, bl
);
972 ::encode(osdmap_epoch
, bl
);
973 ::encode(pg_scan
, bl
);
974 ::encode(full_ratio
, bl
);
975 ::encode(nearfull_ratio
, bl
);
976 ::encode(pg_remove
, bl
);
978 ::encode(osd_epochs
, bl
);
982 void PGMap::Incremental::decode(bufferlist::iterator
&bl
)
984 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl
);
985 ::decode(version
, bl
);
987 pg_stat_updates
.clear();
994 ::decode(pg_stat_updates
[pgid
], bl
);
997 ::decode(pg_stat_updates
, bl
);
999 ::decode(osd_stat_updates
, bl
);
1000 ::decode(osd_stat_rm
, bl
);
1001 ::decode(osdmap_epoch
, bl
);
1002 ::decode(pg_scan
, bl
);
1003 if (struct_v
>= 2) {
1004 ::decode(full_ratio
, bl
);
1005 ::decode(nearfull_ratio
, bl
);
1013 ::decode(opgid
, bl
);
1014 pg_remove
.insert(pg_t(opgid
));
1017 ::decode(pg_remove
, bl
);
1019 if (struct_v
< 4 && full_ratio
== 0) {
1022 if (struct_v
< 4 && nearfull_ratio
== 0) {
1023 nearfull_ratio
= -1;
1026 ::decode(stamp
, bl
);
1027 if (struct_v
>= 7) {
1028 ::decode(osd_epochs
, bl
);
1030 for (auto i
= osd_stat_updates
.begin();
1031 i
!= osd_stat_updates
.end();
1033 // This isn't accurate, but will cause trimming to behave like
1035 osd_epochs
.insert(make_pair(i
->first
, osdmap_epoch
));
1041 void PGMap::Incremental::dump(Formatter
*f
) const
1043 f
->dump_unsigned("version", version
);
1044 f
->dump_stream("stamp") << stamp
;
1045 f
->dump_unsigned("osdmap_epoch", osdmap_epoch
);
1046 f
->dump_unsigned("pg_scan_epoch", pg_scan
);
1047 f
->dump_float("full_ratio", full_ratio
);
1048 f
->dump_float("nearfull_ratio", nearfull_ratio
);
1050 f
->open_array_section("pg_stat_updates");
1051 for (auto p
= pg_stat_updates
.begin(); p
!= pg_stat_updates
.end(); ++p
) {
1052 f
->open_object_section("pg_stat");
1053 f
->dump_stream("pgid") << p
->first
;
1059 f
->open_array_section("osd_stat_updates");
1060 for (auto p
= osd_stat_updates
.begin(); p
!= osd_stat_updates
.end(); ++p
) {
1061 f
->open_object_section("osd_stat");
1062 f
->dump_int("osd", p
->first
);
1068 f
->open_array_section("osd_stat_removals");
1069 for (auto p
= osd_stat_rm
.begin(); p
!= osd_stat_rm
.end(); ++p
)
1070 f
->dump_int("osd", *p
);
1073 f
->open_array_section("pg_removals");
1074 for (auto p
= pg_remove
.begin(); p
!= pg_remove
.end(); ++p
)
1075 f
->dump_stream("pgid") << *p
;
1079 void PGMap::Incremental::generate_test_instances(list
<PGMap::Incremental
*>& o
)
1081 o
.push_back(new Incremental
);
1082 o
.push_back(new Incremental
);
1083 o
.back()->version
= 1;
1084 o
.back()->stamp
= utime_t(123,345);
1085 o
.push_back(new Incremental
);
1086 o
.back()->version
= 2;
1087 o
.back()->pg_stat_updates
[pg_t(1,2,3)] = pg_stat_t();
1088 o
.back()->osd_stat_updates
[5] = osd_stat_t();
1089 o
.back()->osd_epochs
[5] = 12;
1090 o
.push_back(new Incremental
);
1091 o
.back()->version
= 3;
1092 o
.back()->osdmap_epoch
= 1;
1093 o
.back()->pg_scan
= 2;
1094 o
.back()->full_ratio
= .2;
1095 o
.back()->nearfull_ratio
= .3;
1096 o
.back()->pg_stat_updates
[pg_t(4,5,6)] = pg_stat_t();
1097 o
.back()->osd_stat_updates
[6] = osd_stat_t();
1098 o
.back()->osd_epochs
[6] = 12;
1099 o
.back()->pg_remove
.insert(pg_t(1,2,3));
1100 o
.back()->osd_stat_rm
.insert(5);
1106 void PGMap::apply_incremental(CephContext
*cct
, const Incremental
& inc
)
1108 assert(inc
.version
== version
+1);
1111 pool_stat_t pg_sum_old
= pg_sum
;
1112 mempool::pgmap::unordered_map
<uint64_t, pool_stat_t
> pg_pool_sum_old
;
1114 bool ratios_changed
= false;
1115 if (inc
.full_ratio
!= full_ratio
&& inc
.full_ratio
!= -1) {
1116 full_ratio
= inc
.full_ratio
;
1117 ratios_changed
= true;
1119 if (inc
.nearfull_ratio
!= nearfull_ratio
&& inc
.nearfull_ratio
!= -1) {
1120 nearfull_ratio
= inc
.nearfull_ratio
;
1121 ratios_changed
= true;
1126 for (auto p
= inc
.pg_stat_updates
.begin();
1127 p
!= inc
.pg_stat_updates
.end();
1129 const pg_t
&update_pg(p
->first
);
1130 const pg_stat_t
&update_stat(p
->second
);
1132 if (pg_pool_sum_old
.count(update_pg
.pool()) == 0)
1133 pg_pool_sum_old
[update_pg
.pool()] = pg_pool_sum
[update_pg
.pool()];
1135 auto t
= pg_stat
.find(update_pg
);
1136 if (t
== pg_stat
.end()) {
1137 pg_stat
.insert(make_pair(update_pg
, update_stat
));
1139 stat_pg_sub(update_pg
, t
->second
);
1140 t
->second
= update_stat
;
1142 stat_pg_add(update_pg
, update_stat
);
1144 assert(osd_stat
.size() == osd_epochs
.size());
1145 for (auto p
= inc
.get_osd_stat_updates().begin();
1146 p
!= inc
.get_osd_stat_updates().end();
1149 const osd_stat_t
&new_stats(p
->second
);
1151 auto t
= osd_stat
.find(osd
);
1152 if (t
== osd_stat
.end()) {
1153 osd_stat
.insert(make_pair(osd
, new_stats
));
1155 stat_osd_sub(t
->first
, t
->second
);
1156 t
->second
= new_stats
;
1158 auto i
= osd_epochs
.find(osd
);
1159 auto j
= inc
.get_osd_epochs().find(osd
);
1160 assert(j
!= inc
.get_osd_epochs().end());
1162 if (i
== osd_epochs
.end())
1163 osd_epochs
.insert(*j
);
1165 i
->second
= j
->second
;
1167 stat_osd_add(osd
, new_stats
);
1169 // adjust [near]full status
1170 register_nearfull_status(osd
, new_stats
);
1172 set
<int64_t> deleted_pools
;
1173 for (auto p
= inc
.pg_remove
.begin();
1174 p
!= inc
.pg_remove
.end();
1176 const pg_t
&removed_pg(*p
);
1177 auto s
= pg_stat
.find(removed_pg
);
1178 if (s
!= pg_stat
.end()) {
1179 stat_pg_sub(removed_pg
, s
->second
);
1182 deleted_pools
.insert(removed_pg
.pool());
1185 for (auto p
= inc
.get_osd_stat_rm().begin();
1186 p
!= inc
.get_osd_stat_rm().end();
1188 auto t
= osd_stat
.find(*p
);
1189 if (t
!= osd_stat
.end()) {
1190 stat_osd_sub(t
->first
, t
->second
);
1192 osd_epochs
.erase(*p
);
1195 // remove these old osds from full/nearfull set(s), too
1196 nearfull_osds
.erase(*p
);
1197 full_osds
.erase(*p
);
1200 // skip calculating delta while sum was not synchronized
1201 if (!stamp
.is_zero() && !pg_sum_old
.stats
.sum
.is_zero()) {
1203 delta_t
= inc
.stamp
;
1205 // calculate a delta, and average over the last 2 deltas.
1206 pool_stat_t d
= pg_sum
;
1207 d
.stats
.sub(pg_sum_old
.stats
);
1208 pg_sum_deltas
.push_back(make_pair(d
, delta_t
));
1209 stamp_delta
+= delta_t
;
1210 pg_sum_delta
.stats
.add(d
.stats
);
1211 auto smooth_intervals
=
1212 cct
? cct
->_conf
->get_val
<uint64_t>("mon_stat_smooth_intervals") : 1;
1213 if (pg_sum_deltas
.size() > smooth_intervals
) {
1214 pg_sum_delta
.stats
.sub(pg_sum_deltas
.front().first
.stats
);
1215 stamp_delta
-= pg_sum_deltas
.front().second
;
1216 pg_sum_deltas
.pop_front();
1221 update_pool_deltas(cct
, inc
.stamp
, pg_pool_sum_old
);
1223 for (auto p
: deleted_pools
) {
1225 dout(20) << " deleted pool " << p
<< dendl
;
1229 if (inc
.osdmap_epoch
)
1230 last_osdmap_epoch
= inc
.osdmap_epoch
;
1232 last_pg_scan
= inc
.pg_scan
;
1234 min_last_epoch_clean
= 0; // invalidate
1237 void PGMap::redo_full_sets()
1240 nearfull_osds
.clear();
1241 for (auto i
= osd_stat
.begin();
1242 i
!= osd_stat
.end();
1244 register_nearfull_status(i
->first
, i
->second
);
1248 void PGMap::register_nearfull_status(int osd
, const osd_stat_t
& s
)
1250 float ratio
= ((float)s
.kb_used
) / ((float)s
.kb
);
1252 if (full_ratio
> 0 && ratio
> full_ratio
) {
1254 full_osds
.insert(osd
);
1255 nearfull_osds
.erase(osd
);
1256 } else if (nearfull_ratio
> 0 && ratio
> nearfull_ratio
) {
1258 full_osds
.erase(osd
);
1259 nearfull_osds
.insert(osd
);
1262 full_osds
.erase(osd
);
1263 nearfull_osds
.erase(osd
);
1267 void PGMap::calc_stats()
1273 pg_pool_sum
.clear();
1274 num_pg_by_pool
.clear();
1276 pg_sum
= pool_stat_t();
1277 osd_sum
= osd_stat_t();
1278 num_pg_by_state
.clear();
1279 num_pg_by_osd
.clear();
1281 for (auto p
= pg_stat
.begin();
1284 stat_pg_add(p
->first
, p
->second
);
1286 for (auto p
= osd_stat
.begin();
1287 p
!= osd_stat
.end();
1289 stat_osd_add(p
->first
, p
->second
);
1293 min_last_epoch_clean
= calc_min_last_epoch_clean();
1296 void PGMap::update_pg(pg_t pgid
, bufferlist
& bl
)
1298 bufferlist::iterator p
= bl
.begin();
1299 auto s
= pg_stat
.find(pgid
);
1300 epoch_t old_lec
= 0, lec
;
1301 if (s
!= pg_stat
.end()) {
1302 old_lec
= s
->second
.get_effective_last_epoch_clean();
1303 stat_pg_update(pgid
, s
->second
, p
);
1304 lec
= s
->second
.get_effective_last_epoch_clean();
1306 pg_stat_t
& r
= pg_stat
[pgid
];
1308 stat_pg_add(pgid
, r
);
1309 lec
= r
.get_effective_last_epoch_clean();
1312 if (min_last_epoch_clean
&&
1313 (lec
< min_last_epoch_clean
|| // we did
1314 (lec
> min_last_epoch_clean
&& // we might
1315 old_lec
== min_last_epoch_clean
)
1317 min_last_epoch_clean
= 0;
1320 void PGMap::remove_pg(pg_t pgid
)
1322 auto s
= pg_stat
.find(pgid
);
1323 if (s
!= pg_stat
.end()) {
1324 if (min_last_epoch_clean
&&
1325 s
->second
.get_effective_last_epoch_clean() == min_last_epoch_clean
)
1326 min_last_epoch_clean
= 0;
1327 stat_pg_sub(pgid
, s
->second
);
1332 void PGMap::update_osd(int osd
, bufferlist
& bl
)
1334 bufferlist::iterator p
= bl
.begin();
1335 auto o
= osd_stat
.find(osd
);
1336 epoch_t old_lec
= 0;
1337 if (o
!= osd_stat
.end()) {
1338 auto i
= osd_epochs
.find(osd
);
1339 if (i
!= osd_epochs
.end())
1340 old_lec
= i
->second
;
1341 stat_osd_sub(osd
, o
->second
);
1343 osd_stat_t
& r
= osd_stat
[osd
];
1345 stat_osd_add(osd
, r
);
1347 // adjust [near]full status
1348 register_nearfull_status(osd
, r
);
1355 if (e
< min_last_epoch_clean
||
1356 (e
> min_last_epoch_clean
&&
1357 old_lec
== min_last_epoch_clean
))
1358 min_last_epoch_clean
= 0;
1360 // WARNING: we are not refreshing min_last_epoch_clean! must be old store
1361 // or old mon running.
1365 void PGMap::remove_osd(int osd
)
1367 auto o
= osd_stat
.find(osd
);
1368 if (o
!= osd_stat
.end()) {
1369 stat_osd_sub(osd
, o
->second
);
1372 // remove these old osds from full/nearfull set(s), too
1373 nearfull_osds
.erase(osd
);
1374 full_osds
.erase(osd
);
1378 void PGMap::stat_pg_add(const pg_t
&pgid
, const pg_stat_t
&s
,
1381 pg_pool_sum
[pgid
.pool()].add(s
);
1385 num_pg_by_state
[s
.state
]++;
1386 num_pg_by_pool
[pgid
.pool()]++;
1388 if ((s
.state
& PG_STATE_CREATING
) &&
1389 s
.parent_split_bits
== 0) {
1390 creating_pgs
.insert(pgid
);
1391 if (s
.acting_primary
>= 0) {
1392 creating_pgs_by_osd_epoch
[s
.acting_primary
][s
.mapping_epoch
].insert(pgid
);
1396 if (s
.state
& PG_STATE_ACTIVE
) {
1406 for (auto p
= s
.blocked_by
.begin();
1407 p
!= s
.blocked_by
.end();
1409 ++blocked_by_sum
[*p
];
1412 for (auto p
= s
.acting
.begin(); p
!= s
.acting
.end(); ++p
) {
1413 pg_by_osd
[*p
].insert(pgid
);
1414 num_pg_by_osd
[*p
].acting
++;
1416 for (auto p
= s
.up
.begin(); p
!= s
.up
.end(); ++p
) {
1417 pg_by_osd
[*p
].insert(pgid
);
1418 num_pg_by_osd
[*p
].up
++;
1421 if (s
.up_primary
>= 0) {
1422 num_pg_by_osd
[s
.up_primary
].primary
++;
1426 void PGMap::stat_pg_sub(const pg_t
&pgid
, const pg_stat_t
&s
,
1429 pool_stat_t
& ps
= pg_pool_sum
[pgid
.pool()];
1434 int end
= --num_pg_by_state
[s
.state
];
1437 num_pg_by_state
.erase(s
.state
);
1438 end
= --num_pg_by_pool
[pgid
.pool()];
1440 num_pg_by_pool
.erase(pgid
.pool());
1441 pg_pool_sum
.erase(pgid
.pool());
1444 if ((s
.state
& PG_STATE_CREATING
) &&
1445 s
.parent_split_bits
== 0) {
1446 creating_pgs
.erase(pgid
);
1447 if (s
.acting_primary
>= 0) {
1448 map
<epoch_t
,set
<pg_t
> >& r
= creating_pgs_by_osd_epoch
[s
.acting_primary
];
1449 r
[s
.mapping_epoch
].erase(pgid
);
1450 if (r
[s
.mapping_epoch
].empty())
1451 r
.erase(s
.mapping_epoch
);
1453 creating_pgs_by_osd_epoch
.erase(s
.acting_primary
);
1457 if (s
.state
& PG_STATE_ACTIVE
) {
1467 for (auto p
= s
.blocked_by
.begin();
1468 p
!= s
.blocked_by
.end();
1470 auto q
= blocked_by_sum
.find(*p
);
1471 assert(q
!= blocked_by_sum
.end());
1474 blocked_by_sum
.erase(q
);
1477 for (auto p
= s
.acting
.begin(); p
!= s
.acting
.end(); ++p
) {
1478 auto& oset
= pg_by_osd
[*p
];
1481 pg_by_osd
.erase(*p
);
1482 auto it
= num_pg_by_osd
.find(*p
);
1483 if (it
!= num_pg_by_osd
.end() && it
->second
.acting
> 0)
1484 it
->second
.acting
--;
1486 for (auto p
= s
.up
.begin(); p
!= s
.up
.end(); ++p
) {
1487 auto& oset
= pg_by_osd
[*p
];
1490 pg_by_osd
.erase(*p
);
1491 auto it
= num_pg_by_osd
.find(*p
);
1492 if (it
!= num_pg_by_osd
.end() && it
->second
.up
> 0)
1496 if (s
.up_primary
>= 0) {
1497 auto it
= num_pg_by_osd
.find(s
.up_primary
);
1498 if (it
!= num_pg_by_osd
.end() && it
->second
.primary
> 0)
1499 it
->second
.primary
--;
1503 void PGMap::stat_pg_update(const pg_t pgid
, pg_stat_t
& s
,
1504 bufferlist::iterator
& blp
)
1510 s
.acting
== n
.acting
&&
1512 s
.blocked_by
== n
.blocked_by
;
1514 stat_pg_sub(pgid
, s
, sameosds
);
1516 // if acting_primary has shift to an just restored osd, and pg yet to finish
1517 // peering, many attributes in current stats remain stale. others seem don't
1518 // mater much while faulty last_active will make "pg stuck in" check unhappy.
1519 if (!(n
.state
& (PG_STATE_ACTIVE
| PG_STATE_PEERED
)) &&
1520 n
.last_active
< s
.last_active
)
1521 n
.last_active
= s
.last_active
;
1523 stat_pg_add(pgid
, n
, sameosds
);
1526 void PGMap::stat_osd_add(int osd
, const osd_stat_t
&s
)
1530 if (osd
>= (int)osd_last_seq
.size()) {
1531 osd_last_seq
.resize(osd
+ 1);
1533 osd_last_seq
[osd
] = s
.seq
;
1536 void PGMap::stat_osd_sub(int osd
, const osd_stat_t
&s
)
1540 assert(osd
< (int)osd_last_seq
.size());
1541 osd_last_seq
[osd
] = 0;
1544 epoch_t
PGMap::calc_min_last_epoch_clean() const
1546 if (pg_stat
.empty())
1549 auto p
= pg_stat
.begin();
1550 epoch_t min
= p
->second
.get_effective_last_epoch_clean();
1551 for (++p
; p
!= pg_stat
.end(); ++p
) {
1552 epoch_t lec
= p
->second
.get_effective_last_epoch_clean();
1556 // also scan osd epochs
1557 // don't trim past the oldest reported osd epoch
1558 for (auto i
= osd_epochs
.begin();
1559 i
!= osd_epochs
.end();
1561 if (i
->second
< min
)
1567 void PGMap::encode_digest(const OSDMap
& osdmap
,
1568 bufferlist
& bl
, uint64_t features
) const
1570 get_rules_avail(osdmap
, &avail_space_by_rule
);
1571 PGMapDigest::encode(bl
, features
);
1574 void PGMap::encode(bufferlist
&bl
, uint64_t features
) const
1576 if ((features
& CEPH_FEATURE_MONENC
) == 0) {
1579 ::encode(version
, bl
);
1580 ::encode(pg_stat
, bl
);
1581 ::encode(osd_stat
, bl
);
1582 ::encode(last_osdmap_epoch
, bl
);
1583 ::encode(last_pg_scan
, bl
);
1584 ::encode(full_ratio
, bl
);
1585 ::encode(nearfull_ratio
, bl
);
1589 ENCODE_START(6, 4, bl
);
1590 ::encode(version
, bl
);
1591 ::encode(pg_stat
, bl
);
1592 ::encode(osd_stat
, bl
);
1593 ::encode(last_osdmap_epoch
, bl
);
1594 ::encode(last_pg_scan
, bl
);
1595 ::encode(full_ratio
, bl
);
1596 ::encode(nearfull_ratio
, bl
);
1597 ::encode(stamp
, bl
);
1598 ::encode(osd_epochs
, bl
);
1602 void PGMap::decode(bufferlist::iterator
&bl
)
1604 DECODE_START_LEGACY_COMPAT_LEN(6, 4, 4, bl
);
1605 ::decode(version
, bl
);
1612 ::decode(opgid
, bl
);
1614 ::decode(pg_stat
[pgid
], bl
);
1617 ::decode(pg_stat
, bl
);
1619 ::decode(osd_stat
, bl
);
1620 ::decode(last_osdmap_epoch
, bl
);
1621 ::decode(last_pg_scan
, bl
);
1622 if (struct_v
>= 2) {
1623 ::decode(full_ratio
, bl
);
1624 ::decode(nearfull_ratio
, bl
);
1627 ::decode(stamp
, bl
);
1628 if (struct_v
>= 6) {
1629 ::decode(osd_epochs
, bl
);
1631 for (auto i
= osd_stat
.begin();
1632 i
!= osd_stat
.end();
1634 // This isn't accurate, but will cause trimming to behave like
1636 osd_epochs
.insert(make_pair(i
->first
, last_osdmap_epoch
));
1644 void PGMap::dirty_all(Incremental
& inc
)
1646 inc
.osdmap_epoch
= last_osdmap_epoch
;
1647 inc
.pg_scan
= last_pg_scan
;
1648 inc
.full_ratio
= full_ratio
;
1649 inc
.nearfull_ratio
= nearfull_ratio
;
1651 for (auto p
= pg_stat
.begin(); p
!= pg_stat
.end(); ++p
) {
1652 inc
.pg_stat_updates
[p
->first
] = p
->second
;
1654 for (auto p
= osd_stat
.begin(); p
!= osd_stat
.end(); ++p
) {
1655 assert(osd_epochs
.count(p
->first
));
1656 inc
.update_stat(p
->first
,
1657 inc
.get_osd_epochs().find(p
->first
)->second
,
1662 void PGMap::dump(Formatter
*f
) const
1665 dump_pg_stats(f
, false);
1670 void PGMap::dump_basic(Formatter
*f
) const
1672 f
->dump_unsigned("version", version
);
1673 f
->dump_stream("stamp") << stamp
;
1674 f
->dump_unsigned("last_osdmap_epoch", last_osdmap_epoch
);
1675 f
->dump_unsigned("last_pg_scan", last_pg_scan
);
1676 f
->dump_unsigned("min_last_epoch_clean", min_last_epoch_clean
);
1677 f
->dump_float("full_ratio", full_ratio
);
1678 f
->dump_float("near_full_ratio", nearfull_ratio
);
1680 f
->open_object_section("pg_stats_sum");
1684 f
->open_object_section("osd_stats_sum");
1688 f
->open_array_section("osd_epochs");
1689 for (auto p
= osd_epochs
.begin(); p
!= osd_epochs
.end(); ++p
) {
1690 f
->open_object_section("osd");
1691 f
->dump_unsigned("osd", p
->first
);
1692 f
->dump_unsigned("epoch", p
->second
);
1700 void PGMap::dump_delta(Formatter
*f
) const
1702 f
->open_object_section("pg_stats_delta");
1703 pg_sum_delta
.dump(f
);
1707 void PGMap::dump_pg_stats(Formatter
*f
, bool brief
) const
1709 f
->open_array_section("pg_stats");
1710 for (auto i
= pg_stat
.begin();
1713 f
->open_object_section("pg_stat");
1714 f
->dump_stream("pgid") << i
->first
;
1716 i
->second
.dump_brief(f
);
1724 void PGMap::dump_pool_stats(Formatter
*f
) const
1726 f
->open_array_section("pool_stats");
1727 for (auto p
= pg_pool_sum
.begin();
1728 p
!= pg_pool_sum
.end();
1730 f
->open_object_section("pool_stat");
1731 f
->dump_int("poolid", p
->first
);
1732 auto q
= num_pg_by_pool
.find(p
->first
);
1733 if (q
!= num_pg_by_pool
.end())
1734 f
->dump_unsigned("num_pg", q
->second
);
1741 void PGMap::dump_osd_stats(Formatter
*f
) const
1743 f
->open_array_section("osd_stats");
1744 for (auto q
= osd_stat
.begin();
1745 q
!= osd_stat
.end();
1747 f
->open_object_section("osd_stat");
1748 f
->dump_int("osd", q
->first
);
1755 void PGMap::dump_pg_stats_plain(
1757 const mempool::pgmap::unordered_map
<pg_t
, pg_stat_t
>& pg_stats
,
1763 tab
.define_column("PG_STAT", TextTable::LEFT
, TextTable::LEFT
);
1764 tab
.define_column("STATE", TextTable::LEFT
, TextTable::RIGHT
);
1765 tab
.define_column("UP", TextTable::LEFT
, TextTable::RIGHT
);
1766 tab
.define_column("UP_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1767 tab
.define_column("ACTING", TextTable::LEFT
, TextTable::RIGHT
);
1768 tab
.define_column("ACTING_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1771 tab
.define_column("PG_STAT", TextTable::LEFT
, TextTable::LEFT
);
1772 tab
.define_column("OBJECTS", TextTable::LEFT
, TextTable::RIGHT
);
1773 tab
.define_column("MISSING_ON_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1774 tab
.define_column("DEGRADED", TextTable::LEFT
, TextTable::RIGHT
);
1775 tab
.define_column("MISPLACED", TextTable::LEFT
, TextTable::RIGHT
);
1776 tab
.define_column("UNFOUND", TextTable::LEFT
, TextTable::RIGHT
);
1777 tab
.define_column("BYTES", TextTable::LEFT
, TextTable::RIGHT
);
1778 tab
.define_column("LOG", TextTable::LEFT
, TextTable::RIGHT
);
1779 tab
.define_column("DISK_LOG", TextTable::LEFT
, TextTable::RIGHT
);
1780 tab
.define_column("STATE", TextTable::LEFT
, TextTable::RIGHT
);
1781 tab
.define_column("STATE_STAMP", TextTable::LEFT
, TextTable::RIGHT
);
1782 tab
.define_column("VERSION", TextTable::LEFT
, TextTable::RIGHT
);
1783 tab
.define_column("REPORTED", TextTable::LEFT
, TextTable::RIGHT
);
1784 tab
.define_column("UP", TextTable::LEFT
, TextTable::RIGHT
);
1785 tab
.define_column("UP_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1786 tab
.define_column("ACTING", TextTable::LEFT
, TextTable::RIGHT
);
1787 tab
.define_column("ACTING_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1788 tab
.define_column("LAST_SCRUB", TextTable::LEFT
, TextTable::RIGHT
);
1789 tab
.define_column("SCRUB_STAMP", TextTable::LEFT
, TextTable::RIGHT
);
1790 tab
.define_column("LAST_DEEP_SCRUB", TextTable::LEFT
, TextTable::RIGHT
);
1791 tab
.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT
, TextTable::RIGHT
);
1792 tab
.define_column("SNAPTRIMQ_LEN", TextTable::LEFT
, TextTable::RIGHT
);
1795 for (auto i
= pg_stats
.begin();
1796 i
!= pg_stats
.end(); ++i
) {
1797 const pg_stat_t
&st(i
->second
);
1800 << pg_state_string(st
.state
)
1804 << st
.acting_primary
1805 << TextTable::endrow
;
1807 ostringstream reported
;
1808 reported
<< st
.reported_epoch
<< ":" << st
.reported_seq
;
1811 << st
.stats
.sum
.num_objects
1812 << st
.stats
.sum
.num_objects_missing_on_primary
1813 << st
.stats
.sum
.num_objects_degraded
1814 << st
.stats
.sum
.num_objects_misplaced
1815 << st
.stats
.sum
.num_objects_unfound
1816 << st
.stats
.sum
.num_bytes
1818 << st
.ondisk_log_size
1819 << pg_state_string(st
.state
)
1823 << pg_vector_string(st
.up
)
1825 << pg_vector_string(st
.acting
)
1826 << st
.acting_primary
1828 << st
.last_scrub_stamp
1829 << st
.last_deep_scrub
1830 << st
.last_deep_scrub_stamp
1832 << TextTable::endrow
;
1839 void PGMap::dump(ostream
& ss
) const
1842 dump_pg_stats(ss
, false);
1843 dump_pool_stats(ss
, false);
1844 dump_pg_sum_stats(ss
, false);
1848 void PGMap::dump_basic(ostream
& ss
) const
1850 ss
<< "version " << version
<< std::endl
;
1851 ss
<< "stamp " << stamp
<< std::endl
;
1852 ss
<< "last_osdmap_epoch " << last_osdmap_epoch
<< std::endl
;
1853 ss
<< "last_pg_scan " << last_pg_scan
<< std::endl
;
1854 ss
<< "full_ratio " << full_ratio
<< std::endl
;
1855 ss
<< "nearfull_ratio " << nearfull_ratio
<< std::endl
;
1858 void PGMap::dump_pg_stats(ostream
& ss
, bool brief
) const
1860 dump_pg_stats_plain(ss
, pg_stat
, brief
);
1863 void PGMap::dump_pool_stats(ostream
& ss
, bool header
) const
1868 tab
.define_column("POOLID", TextTable::LEFT
, TextTable::LEFT
);
1869 tab
.define_column("OBJECTS", TextTable::LEFT
, TextTable::RIGHT
);
1870 tab
.define_column("MISSING_ON_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1871 tab
.define_column("DEGRADED", TextTable::LEFT
, TextTable::RIGHT
);
1872 tab
.define_column("MISPLACED", TextTable::LEFT
, TextTable::RIGHT
);
1873 tab
.define_column("UNFOUND", TextTable::LEFT
, TextTable::RIGHT
);
1874 tab
.define_column("BYTES", TextTable::LEFT
, TextTable::RIGHT
);
1875 tab
.define_column("LOG", TextTable::LEFT
, TextTable::RIGHT
);
1876 tab
.define_column("DISK_LOG", TextTable::LEFT
, TextTable::RIGHT
);
1878 tab
.define_column("", TextTable::LEFT
, TextTable::LEFT
);
1879 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1880 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1881 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1882 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1883 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1884 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1885 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1886 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1889 for (auto p
= pg_pool_sum
.begin();
1890 p
!= pg_pool_sum
.end();
1893 << p
->second
.stats
.sum
.num_objects
1894 << p
->second
.stats
.sum
.num_objects_missing_on_primary
1895 << p
->second
.stats
.sum
.num_objects_degraded
1896 << p
->second
.stats
.sum
.num_objects_misplaced
1897 << p
->second
.stats
.sum
.num_objects_unfound
1898 << p
->second
.stats
.sum
.num_bytes
1899 << p
->second
.log_size
1900 << p
->second
.ondisk_log_size
1901 << TextTable::endrow
;
1907 void PGMap::dump_pg_sum_stats(ostream
& ss
, bool header
) const
1912 tab
.define_column("PG_STAT", TextTable::LEFT
, TextTable::LEFT
);
1913 tab
.define_column("OBJECTS", TextTable::LEFT
, TextTable::RIGHT
);
1914 tab
.define_column("MISSING_ON_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1915 tab
.define_column("DEGRADED", TextTable::LEFT
, TextTable::RIGHT
);
1916 tab
.define_column("MISPLACED", TextTable::LEFT
, TextTable::RIGHT
);
1917 tab
.define_column("UNFOUND", TextTable::LEFT
, TextTable::RIGHT
);
1918 tab
.define_column("BYTES", TextTable::LEFT
, TextTable::RIGHT
);
1919 tab
.define_column("LOG", TextTable::LEFT
, TextTable::RIGHT
);
1920 tab
.define_column("DISK_LOG", TextTable::LEFT
, TextTable::RIGHT
);
1922 tab
.define_column("", TextTable::LEFT
, TextTable::LEFT
);
1923 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1924 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1925 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1926 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1927 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1928 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1929 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1930 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1934 << pg_sum
.stats
.sum
.num_objects
1935 << pg_sum
.stats
.sum
.num_objects_missing_on_primary
1936 << pg_sum
.stats
.sum
.num_objects_degraded
1937 << pg_sum
.stats
.sum
.num_objects_misplaced
1938 << pg_sum
.stats
.sum
.num_objects_unfound
1939 << pg_sum
.stats
.sum
.num_bytes
1941 << pg_sum
.ondisk_log_size
1942 << TextTable::endrow
;
1947 void PGMap::dump_osd_stats(ostream
& ss
) const
1951 tab
.define_column("OSD_STAT", TextTable::LEFT
, TextTable::LEFT
);
1952 tab
.define_column("USED", TextTable::LEFT
, TextTable::RIGHT
);
1953 tab
.define_column("AVAIL", TextTable::LEFT
, TextTable::RIGHT
);
1954 tab
.define_column("TOTAL", TextTable::LEFT
, TextTable::RIGHT
);
1955 tab
.define_column("HB_PEERS", TextTable::LEFT
, TextTable::RIGHT
);
1956 tab
.define_column("PG_SUM", TextTable::LEFT
, TextTable::RIGHT
);
1957 tab
.define_column("PRIMARY_PG_SUM", TextTable::LEFT
, TextTable::RIGHT
);
1959 for (auto p
= osd_stat
.begin();
1960 p
!= osd_stat
.end();
1963 << si_t(p
->second
.kb_used
<< 10)
1964 << si_t(p
->second
.kb_avail
<< 10)
1965 << si_t(p
->second
.kb
<< 10)
1966 << p
->second
.hb_peers
1967 << get_num_pg_by_osd(p
->first
)
1968 << get_num_primary_pg_by_osd(p
->first
)
1969 << TextTable::endrow
;
1973 << si_t(osd_sum
.kb_used
<< 10)
1974 << si_t(osd_sum
.kb_avail
<< 10)
1975 << si_t(osd_sum
.kb
<< 10)
1976 << TextTable::endrow
;
1981 void PGMap::dump_osd_sum_stats(ostream
& ss
) const
1985 tab
.define_column("OSD_STAT", TextTable::LEFT
, TextTable::LEFT
);
1986 tab
.define_column("USED", TextTable::LEFT
, TextTable::RIGHT
);
1987 tab
.define_column("AVAIL", TextTable::LEFT
, TextTable::RIGHT
);
1988 tab
.define_column("TOTAL", TextTable::LEFT
, TextTable::RIGHT
);
1991 << si_t(osd_sum
.kb_used
<< 10)
1992 << si_t(osd_sum
.kb_avail
<< 10)
1993 << si_t(osd_sum
.kb
<< 10)
1994 << TextTable::endrow
;
1999 void PGMap::get_stuck_stats(
2000 int types
, const utime_t cutoff
,
2001 mempool::pgmap::unordered_map
<pg_t
, pg_stat_t
>& stuck_pgs
) const
2004 for (auto i
= pg_stat
.begin();
2007 utime_t val
= cutoff
; // don't care about >= cutoff so that is infinity
2009 if ((types
& STUCK_INACTIVE
) && !(i
->second
.state
& PG_STATE_ACTIVE
)) {
2010 if (i
->second
.last_active
< val
)
2011 val
= i
->second
.last_active
;
2014 if ((types
& STUCK_UNCLEAN
) && !(i
->second
.state
& PG_STATE_CLEAN
)) {
2015 if (i
->second
.last_clean
< val
)
2016 val
= i
->second
.last_clean
;
2019 if ((types
& STUCK_DEGRADED
) && (i
->second
.state
& PG_STATE_DEGRADED
)) {
2020 if (i
->second
.last_undegraded
< val
)
2021 val
= i
->second
.last_undegraded
;
2024 if ((types
& STUCK_UNDERSIZED
) && (i
->second
.state
& PG_STATE_UNDERSIZED
)) {
2025 if (i
->second
.last_fullsized
< val
)
2026 val
= i
->second
.last_fullsized
;
2029 if ((types
& STUCK_STALE
) && (i
->second
.state
& PG_STATE_STALE
)) {
2030 if (i
->second
.last_unstale
< val
)
2031 val
= i
->second
.last_unstale
;
2034 // val is now the earliest any of the requested stuck states began
2036 stuck_pgs
[i
->first
] = i
->second
;
2041 bool PGMap::get_stuck_counts(const utime_t cutoff
, map
<string
, int>& note
) const
2049 for (auto i
= pg_stat
.begin();
2052 if (! (i
->second
.state
& PG_STATE_ACTIVE
)) {
2053 if (i
->second
.last_active
< cutoff
)
2056 if (! (i
->second
.state
& PG_STATE_CLEAN
)) {
2057 if (i
->second
.last_clean
< cutoff
)
2060 if (i
->second
.state
& PG_STATE_DEGRADED
) {
2061 if (i
->second
.last_undegraded
< cutoff
)
2064 if (i
->second
.state
& PG_STATE_UNDERSIZED
) {
2065 if (i
->second
.last_fullsized
< cutoff
)
2068 if (i
->second
.state
& PG_STATE_STALE
) {
2069 if (i
->second
.last_unstale
< cutoff
)
2075 note
["stuck inactive"] = inactive
;
2078 note
["stuck unclean"] = unclean
;
2081 note
["stuck undersized"] = undersized
;
2084 note
["stuck degraded"] = degraded
;
2087 note
["stuck stale"] = stale
;
2089 return inactive
|| unclean
|| undersized
|| degraded
|| stale
;
2092 void PGMap::dump_stuck(Formatter
*f
, int types
, utime_t cutoff
) const
2094 mempool::pgmap::unordered_map
<pg_t
, pg_stat_t
> stuck_pg_stats
;
2095 get_stuck_stats(types
, cutoff
, stuck_pg_stats
);
2096 f
->open_array_section("stuck_pg_stats");
2097 for (auto i
= stuck_pg_stats
.begin();
2098 i
!= stuck_pg_stats
.end();
2100 f
->open_object_section("pg_stat");
2101 f
->dump_stream("pgid") << i
->first
;
2108 void PGMap::dump_stuck_plain(ostream
& ss
, int types
, utime_t cutoff
) const
2110 mempool::pgmap::unordered_map
<pg_t
, pg_stat_t
> stuck_pg_stats
;
2111 get_stuck_stats(types
, cutoff
, stuck_pg_stats
);
2112 if (!stuck_pg_stats
.empty())
2113 dump_pg_stats_plain(ss
, stuck_pg_stats
, true);
2116 int PGMap::dump_stuck_pg_stats(
2120 vector
<string
>& args
) const
2122 int stuck_types
= 0;
2124 for (auto i
= args
.begin(); i
!= args
.end(); ++i
) {
2125 if (*i
== "inactive")
2126 stuck_types
|= PGMap::STUCK_INACTIVE
;
2127 else if (*i
== "unclean")
2128 stuck_types
|= PGMap::STUCK_UNCLEAN
;
2129 else if (*i
== "undersized")
2130 stuck_types
|= PGMap::STUCK_UNDERSIZED
;
2131 else if (*i
== "degraded")
2132 stuck_types
|= PGMap::STUCK_DEGRADED
;
2133 else if (*i
== "stale")
2134 stuck_types
|= PGMap::STUCK_STALE
;
2136 ds
<< "Unknown type: " << *i
<< std::endl
;
2141 utime_t
now(ceph_clock_now());
2142 utime_t cutoff
= now
- utime_t(threshold
, 0);
2145 dump_stuck_plain(ds
, stuck_types
, cutoff
);
2147 dump_stuck(f
, stuck_types
, cutoff
);
2154 void PGMap::dump_osd_perf_stats(Formatter
*f
) const
2156 f
->open_array_section("osd_perf_infos");
2157 for (auto i
= osd_stat
.begin();
2158 i
!= osd_stat
.end();
2160 f
->open_object_section("osd");
2161 f
->dump_int("id", i
->first
);
2163 f
->open_object_section("perf_stats");
2164 i
->second
.os_perf_stat
.dump(f
);
2171 void PGMap::print_osd_perf_stats(std::ostream
*ss
) const
2174 tab
.define_column("osd", TextTable::LEFT
, TextTable::RIGHT
);
2175 tab
.define_column("commit_latency(ms)", TextTable::LEFT
, TextTable::RIGHT
);
2176 tab
.define_column("apply_latency(ms)", TextTable::LEFT
, TextTable::RIGHT
);
2177 for (auto i
= osd_stat
.begin();
2178 i
!= osd_stat
.end();
2181 tab
<< i
->second
.os_perf_stat
.os_commit_latency
;
2182 tab
<< i
->second
.os_perf_stat
.os_apply_latency
;
2183 tab
<< TextTable::endrow
;
2188 void PGMap::dump_osd_blocked_by_stats(Formatter
*f
) const
2190 f
->open_array_section("osd_blocked_by_infos");
2191 for (auto i
= blocked_by_sum
.begin();
2192 i
!= blocked_by_sum
.end();
2194 f
->open_object_section("osd");
2195 f
->dump_int("id", i
->first
);
2196 f
->dump_int("num_blocked", i
->second
);
2201 void PGMap::print_osd_blocked_by_stats(std::ostream
*ss
) const
2204 tab
.define_column("osd", TextTable::LEFT
, TextTable::RIGHT
);
2205 tab
.define_column("num_blocked", TextTable::LEFT
, TextTable::RIGHT
);
2206 for (auto i
= blocked_by_sum
.begin();
2207 i
!= blocked_by_sum
.end();
2211 tab
<< TextTable::endrow
;
2218 * update aggregated delta
2220 * @param cct ceph context
2221 * @param ts Timestamp for the stats being delta'ed
2222 * @param old_pool_sum Previous stats sum
2223 * @param last_ts Last timestamp for pool
2224 * @param result_pool_sum Resulting stats
2225 * @param result_pool_delta Resulting pool delta
2226 * @param result_ts_delta Resulting timestamp delta
2227 * @param delta_avg_list List of last N computed deltas, used to average
2229 void PGMap::update_delta(
2232 const pool_stat_t
& old_pool_sum
,
2234 const pool_stat_t
& current_pool_sum
,
2235 pool_stat_t
*result_pool_delta
,
2236 utime_t
*result_ts_delta
,
2237 mempool::pgmap::list
<pair
<pool_stat_t
,utime_t
> > *delta_avg_list
)
2239 /* @p ts is the timestamp we want to associate with the data
2240 * in @p old_pool_sum, and on which we will base ourselves to
2241 * calculate the delta, stored in 'delta_t'.
2244 delta_t
= ts
; // start with the provided timestamp
2245 delta_t
-= *last_ts
; // take the last timestamp we saw
2246 *last_ts
= ts
; // @p ts becomes the last timestamp we saw
2248 // adjust delta_t, quick start if there is no update in a long period
2249 delta_t
= std::min(delta_t
,
2250 utime_t(2 * (cct
? cct
->_conf
->mon_delta_reset_interval
: 10), 0));
2252 // calculate a delta, and average over the last 6 deltas by default.
2253 /* start by taking a copy of our current @p result_pool_sum, and by
2254 * taking out the stats from @p old_pool_sum. This generates a stats
2255 * delta. Stash this stats delta in @p delta_avg_list, along with the
2256 * timestamp delta for these results.
2258 pool_stat_t d
= current_pool_sum
;
2259 d
.stats
.sub(old_pool_sum
.stats
);
2261 /* Aggregate current delta, and take out the last seen delta (if any) to
2263 * Skip calculating delta while sum was not synchronized.
2265 if(!old_pool_sum
.stats
.sum
.is_zero()) {
2266 delta_avg_list
->push_back(make_pair(d
,delta_t
));
2267 *result_ts_delta
+= delta_t
;
2268 result_pool_delta
->stats
.add(d
.stats
);
2270 size_t s
= cct
? cct
->_conf
->get_val
<uint64_t>("mon_stat_smooth_intervals") : 1;
2271 if (delta_avg_list
->size() > s
) {
2272 result_pool_delta
->stats
.sub(delta_avg_list
->front().first
.stats
);
2273 *result_ts_delta
-= delta_avg_list
->front().second
;
2274 delta_avg_list
->pop_front();
2279 * update aggregated delta
2281 * @param cct ceph context
2282 * @param ts Timestamp
2283 * @param pg_sum_old Old pg_sum
2285 void PGMap::update_global_delta(CephContext
*cct
,
2286 const utime_t ts
, const pool_stat_t
& pg_sum_old
)
2288 update_delta(cct
, ts
, pg_sum_old
, &stamp
, pg_sum
, &pg_sum_delta
,
2289 &stamp_delta
, &pg_sum_deltas
);
2293 * Update a given pool's deltas
2295 * @param cct Ceph Context
2296 * @param ts Timestamp for the stats being delta'ed
2297 * @param pool Pool's id
2298 * @param old_pool_sum Previous stats sum
2300 void PGMap::update_one_pool_delta(
2303 const uint64_t pool
,
2304 const pool_stat_t
& old_pool_sum
)
2306 if (per_pool_sum_deltas
.count(pool
) == 0) {
2307 assert(per_pool_sum_deltas_stamps
.count(pool
) == 0);
2308 assert(per_pool_sum_delta
.count(pool
) == 0);
2311 auto& sum_delta
= per_pool_sum_delta
[pool
];
2313 update_delta(cct
, ts
, old_pool_sum
, &sum_delta
.second
, pg_pool_sum
[pool
],
2314 &sum_delta
.first
, &per_pool_sum_deltas_stamps
[pool
],
2315 &per_pool_sum_deltas
[pool
]);
2319 * Update pools' deltas
2321 * @param cct CephContext
2322 * @param ts Timestamp for the stats being delta'ed
2323 * @param pg_pool_sum_old Map of pool stats for delta calcs.
2325 void PGMap::update_pool_deltas(
2326 CephContext
*cct
, const utime_t ts
,
2327 const mempool::pgmap::unordered_map
<uint64_t,pool_stat_t
>& pg_pool_sum_old
)
2329 for (auto it
= pg_pool_sum_old
.begin();
2330 it
!= pg_pool_sum_old
.end(); ++it
) {
2331 update_one_pool_delta(cct
, ts
, it
->first
, it
->second
);
2335 void PGMap::clear_delta()
2337 pg_sum_delta
= pool_stat_t();
2338 pg_sum_deltas
.clear();
2339 stamp_delta
= utime_t();
2342 void PGMap::generate_test_instances(list
<PGMap
*>& o
)
2344 o
.push_back(new PGMap
);
2345 list
<Incremental
*> inc
;
2346 Incremental::generate_test_instances(inc
);
2349 while (!inc
.empty()) {
2350 PGMap
*pmp
= new PGMap();
2353 o
.back()->apply_incremental(NULL
, *inc
.front());
2359 void PGMap::get_filtered_pg_stats(uint32_t state
, int64_t poolid
, int64_t osdid
,
2360 bool primary
, set
<pg_t
>& pgs
) const
2362 for (auto i
= pg_stat
.begin();
2365 if ((poolid
>= 0) && (uint64_t(poolid
) != i
->first
.pool()))
2367 if ((osdid
>= 0) && !(i
->second
.is_acting_osd(osdid
,primary
)))
2369 if (!(i
->second
.state
& state
))
2371 pgs
.insert(i
->first
);
2375 void PGMap::dump_filtered_pg_stats(Formatter
*f
, set
<pg_t
>& pgs
) const
2377 f
->open_array_section("pg_stats");
2378 for (auto i
= pgs
.begin(); i
!= pgs
.end(); ++i
) {
2379 const pg_stat_t
& st
= pg_stat
.at(*i
);
2380 f
->open_object_section("pg_stat");
2381 f
->dump_stream("pgid") << *i
;
// Plain-text variant of dump_filtered_pg_stats.
// Declares the columns of the table `tab` and then adds one row per pg id
// in `pgs`, built from that PG's pg_stat_t entry.
// NOTE(review): as in the Formatter variant, pg_stat.at() is used for the
// lookup, so a missing id presumably throws -- confirm.
2388 void PGMap::dump_filtered_pg_stats(ostream
& ss
, set
<pg_t
>& pgs
) const
// Column layout. Every column passes TextTable::LEFT as its second argument;
// PG_STAT also passes LEFT as the third, all others pass RIGHT (presumably
// heading alignment followed by value alignment -- confirm against TextTable).
2392 tab
.define_column("PG_STAT", TextTable::LEFT
, TextTable::LEFT
);
2393 tab
.define_column("OBJECTS", TextTable::LEFT
, TextTable::RIGHT
);
2394 tab
.define_column("MISSING_ON_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
2395 tab
.define_column("DEGRADED", TextTable::LEFT
, TextTable::RIGHT
);
2396 tab
.define_column("MISPLACED", TextTable::LEFT
, TextTable::RIGHT
);
2397 tab
.define_column("UNFOUND", TextTable::LEFT
, TextTable::RIGHT
);
2398 tab
.define_column("BYTES", TextTable::LEFT
, TextTable::RIGHT
);
2399 tab
.define_column("LOG", TextTable::LEFT
, TextTable::RIGHT
);
2400 tab
.define_column("DISK_LOG", TextTable::LEFT
, TextTable::RIGHT
);
2401 tab
.define_column("STATE", TextTable::LEFT
, TextTable::RIGHT
);
2402 tab
.define_column("STATE_STAMP", TextTable::LEFT
, TextTable::RIGHT
);
2403 tab
.define_column("VERSION", TextTable::LEFT
, TextTable::RIGHT
);
2404 tab
.define_column("REPORTED", TextTable::LEFT
, TextTable::RIGHT
);
2405 tab
.define_column("UP", TextTable::LEFT
, TextTable::RIGHT
);
2406 tab
.define_column("UP_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
2407 tab
.define_column("ACTING", TextTable::LEFT
, TextTable::RIGHT
);
2408 tab
.define_column("ACTING_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
2409 tab
.define_column("LAST_SCRUB", TextTable::LEFT
, TextTable::RIGHT
);
2410 tab
.define_column("SCRUB_STAMP", TextTable::LEFT
, TextTable::RIGHT
);
2411 tab
.define_column("LAST_DEEP_SCRUB", TextTable::LEFT
, TextTable::RIGHT
);
2412 tab
.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT
, TextTable::RIGHT
);
// One row per requested PG.
2414 for (auto i
= pgs
.begin(); i
!= pgs
.end(); ++i
) {
2415 const pg_stat_t
& st
= pg_stat
.at(*i
);
// The REPORTED value is rendered as "<reported_epoch>:<reported_seq>".
2417 ostringstream reported
;
2418 reported
<< st
.reported_epoch
<< ":" << st
.reported_seq
;
// Row values are streamed in the same order as the column definitions
// above; TextTable::endrow terminates the row.
2421 << st
.stats
.sum
.num_objects
2422 << st
.stats
.sum
.num_objects_missing_on_primary
2423 << st
.stats
.sum
.num_objects_degraded
2424 << st
.stats
.sum
.num_objects_misplaced
2425 << st
.stats
.sum
.num_objects_unfound
2426 << st
.stats
.sum
.num_bytes
2428 << st
.ondisk_log_size
2429 << pg_state_string(st
.state
)
2436 << st
.acting_primary
2438 << st
.last_scrub_stamp
2439 << st
.last_deep_scrub
2440 << st
.last_deep_scrub_stamp
2441 << TextTable::endrow
;
// Append HEALTH_WARN detail lines describing the PGs in `stuck_pgs`.
// For each PG the stuck condition selects which "last time it was OK"
// timestamp is used and the word that names the condition in the message.
// Each regular line reads
//   "pg <id> is stuck <what> for <duration>|since forever,
//    current state <state>, last acting <acting>".
// When decrementing `max_detail` reaches zero, a single
// "<n> more pgs are also stuck <what>" line is pushed instead.
2449 // Only called with a single bit set in "what"
2450 static void note_stuck_detail(
2452 mempool::pgmap::unordered_map
<pg_t
,pg_stat_t
>& stuck_pgs
,
2454 list
<pair
<health_status_t
,string
> > *detail
)
2457 for (auto p
= stuck_pgs
.begin();
2458 p
!= stuck_pgs
.end();
2462 const char *whatname
= 0;
// Pick the reference timestamp and the display name for the condition.
2464 case PGMap::STUCK_INACTIVE
:
2465 since
= p
->second
.last_active
;
2466 whatname
= "inactive";
2468 case PGMap::STUCK_UNCLEAN
:
2469 since
= p
->second
.last_clean
;
2470 whatname
= "unclean";
2472 case PGMap::STUCK_DEGRADED
:
2473 since
= p
->second
.last_undegraded
;
2474 whatname
= "degraded";
2476 case PGMap::STUCK_UNDERSIZED
:
2477 since
= p
->second
.last_fullsized
;
2478 whatname
= "undersized";
2480 case PGMap::STUCK_STALE
:
2481 since
= p
->second
.last_unstale
;
// Detail budget exhausted: summarise the PGs that were not listed.
// NOTE(review): `--max_detail == 0` never fires if max_detail is already 0
// on entry and unsigned (it would wrap) -- confirm the parameter's type and
// the callers' values.
2487 if (--max_detail
== 0) {
2489 ss
<< (stuck_pgs
.size() - n
) << " more pgs are also stuck " << whatname
;
2490 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
2494 ss
<< "pg " << p
->first
<< " is stuck " << whatname
;
// A default-constructed timestamp means the PG was never seen in the good
// state, so no duration can be computed.
2495 if (since
== utime_t()) {
2496 ss
<< " since forever";
2498 utime_t dur
= ceph_clock_now() - since
;
2499 ss
<< " for " << dur
;
2501 ss
<< ", current state " << pg_state_string(p
->second
.state
)
2502 << ", last acting " << p
->second
.acting
;
2503 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
// Scan the power-of-two op-age histogram `h` from its highest bucket down
// and report buckets holding blocked ops.
// Returns a (warn, error) pair of counters; there is an early exit that
// returns (0, 0).
// For each reported bucket a "<count> ops are blocked > <ub> sec<suffix>"
// line is pushed to `detail` with that bucket's severity.
2507 static pair
<int,int> _warn_slow_request_histogram(
2509 const pow2_hist_t
& h
,
2511 list
<pair
<health_status_t
,string
> >& summary
,
2512 list
<pair
<health_status_t
,string
> > *detail
)
2515 return make_pair(0, 0);
2517 unsigned warn
= 0, error
= 0;
// Threshold is mon_osd_warn_op_age scaled by mon_osd_err_op_age_ratio.
2519 cct
->_conf
->mon_osd_warn_op_age
* cct
->_conf
->mon_osd_err_op_age_ratio
;
// The loop stops at i > 0, so bucket 0 is never examined.
// NOTE(review): `h.h.size() - 1` wraps around if h.h is empty unless that
// case is excluded beforehand; `1 << i` is an int shift, which is undefined
// for i >= 31 -- confirm the histogram can never be that wide.
2520 for (unsigned i
= h
.h
.size() - 1; i
> 0; --i
) {
// Bucket bound 2^i divided by 1000 (presumably milliseconds -> seconds,
// matching the " sec" unit in the message).
2521 float ub
= (float)(1 << i
) / 1000.0;
2522 if (ub
< cct
->_conf
->mon_osd_warn_op_age
)
2525 auto sev
= HEALTH_WARN
;
2534 ss
<< h
.h
[i
] << " ops are blocked > " << ub
<< " sec" << suffix
;
2535 detail
->push_back(make_pair(sev
, ss
.str()));
2539 return make_pair(warn
, error
);
// Selects which scrub kind a "not scrubbed" detail message refers to:
// SCRUBBED for a regular scrub, DEEPSCRUBBED for a deep scrub.
2543 enum class scrubbed_or_deepscrubbed_t
{ SCRUBBED
, DEEPSCRUBBED
};
2545 void print_unscrubbed_detailed(
2546 const std::pair
<const pg_t
,pg_stat_t
> &pg_entry
,
2547 list
<pair
<health_status_t
,string
> > *detail
,
2548 scrubbed_or_deepscrubbed_t how_scrubbed
)
2550 std::stringstream ss
;
2551 const auto& pg_stat(pg_entry
.second
);
2553 ss
<< "pg " << pg_entry
.first
<< " is not ";
2554 if (how_scrubbed
== scrubbed_or_deepscrubbed_t::SCRUBBED
) {
2555 ss
<< "scrubbed, last_scrub_stamp "
2556 << pg_stat
.last_scrub_stamp
;
2557 } else if (how_scrubbed
== scrubbed_or_deepscrubbed_t::DEEPSCRUBBED
) {
2558 ss
<< "deep-scrubbed, last_deep_scrub_stamp "
2559 << pg_stat
.last_deep_scrub_stamp
;
2562 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
// Read-only alias for the pg id -> pg_stat_t map consumed by
// print_unscrubbed_pgs (note the alias itself is const-qualified).
2565 using pg_stat_map_t
= const mempool::pgmap::unordered_map
<pg_t
,pg_stat_t
>;
// Report PGs whose last scrub or deep scrub is too old.
// A PG counts as not scrubbed when the time since last_scrub_stamp is at
// least mon_warn_not_scrubbed + mon_scrub_interval, and as not deep-scrubbed
// when the time since last_deep_scrub_stamp is at least
// mon_warn_not_deep_scrubbed + osd_deep_scrub_interval. Setting either
// mon_warn_* option to 0 disables that test.
// When `detail` is non-null, per-PG lines are added through
// print_unscrubbed_detailed. If any PG was counted, a
// "<n> unscrubbed pgs" HEALTH_WARN entry is pushed to `summary`.
2567 void print_unscrubbed_pgs(
2568 pg_stat_map_t
& pg_stats
,
2569 list
<pair
<health_status_t
,string
> > &summary
,
2570 list
<pair
<health_status_t
,string
> > *detail
,
2571 const CephContext
* cct
)
// Both warnings disabled.
2573 if (cct
->_conf
->mon_warn_not_scrubbed
== 0 &&
2574 cct
->_conf
->mon_warn_not_deep_scrubbed
== 0)
2578 const utime_t now
= ceph_clock_now();
2579 for (const auto& pg_entry
: pg_stats
) {
2580 const auto& pg_stat(pg_entry
.second
);
2581 const utime_t time_since_ls
= now
- pg_stat
.last_scrub_stamp
;
2582 const utime_t time_since_lds
= now
- pg_stat
.last_deep_scrub_stamp
;
// NOTE(review): both thresholds are loop-invariant but recomputed for every
// PG, and they are narrowed to int -- confirm the config option types.
2584 const int mon_warn_not_scrubbed
=
2585 cct
->_conf
->mon_warn_not_scrubbed
+ cct
->_conf
->mon_scrub_interval
;
2587 const int mon_warn_not_deep_scrubbed
=
2588 cct
->_conf
->mon_warn_not_deep_scrubbed
+ cct
->_conf
->osd_deep_scrub_interval
;
2590 bool not_scrubbed
= (time_since_ls
>= mon_warn_not_scrubbed
&&
2591 cct
->_conf
->mon_warn_not_scrubbed
!= 0);
2593 bool not_deep_scrubbed
= (time_since_lds
>= mon_warn_not_deep_scrubbed
&&
2594 cct
->_conf
->mon_warn_not_deep_scrubbed
!= 0);
// Per-PG detail is only produced when the caller asked for it.
2596 if (detail
!= nullptr) {
2598 print_unscrubbed_detailed(pg_entry
,
2600 scrubbed_or_deepscrubbed_t::SCRUBBED
);
2602 if (not_deep_scrubbed
) {
2603 print_unscrubbed_detailed(pg_entry
,
2605 scrubbed_or_deepscrubbed_t::DEEPSCRUBBED
);
2608 if (not_scrubbed
|| not_deep_scrubbed
) {
2613 if (pgs_count
> 0) {
2614 std::stringstream ss
;
2615 ss
<< pgs_count
<< " unscrubbed pgs";
2616 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
// Populate `checks` with the health checks that can be derived from the PG
// map, using `osdmap` for pool metadata and the context's configuration for
// thresholds.
// Check codes raised in this function: PG_AVAILABILITY, PG_DEGRADED,
// PG_DEGRADED_FULL, PG_DAMAGED, OSD_SCRUB_ERRORS, LARGE_OMAP_OBJECTS,
// CACHE_POOL_NEAR_FULL, TOO_FEW_PGS, TOO_MANY_PGS, SMALLER_PGP_NUM,
// MANY_OBJECTS_PER_PG, POOL_FULL, POOL_NEAR_FULL, OBJECT_MISPLACED,
// OBJECT_UNFOUND, REQUEST_SLOW, REQUEST_STUCK, PG_NOT_SCRUBBED,
// PG_NOT_DEEP_SCRUBBED, POOL_APP_NOT_ENABLED and PG_SLOW_SNAP_TRIMMING.
// `max` (mon_health_max_detail) bounds how many per-PG detail entries are
// composed for several of the checks.
2621 void PGMap::get_health_checks(
2623 const OSDMap
& osdmap
,
2624 health_check_map_t
*checks
) const
2626 utime_t now
= ceph_clock_now();
2627 const auto max
= cct
->_conf
->get_val
<uint64_t>("mon_health_max_detail");
2628 const auto& pools
= osdmap
.get_pools();
2630 typedef enum pg_consequence_t
{
2631 UNAVAILABLE
= 1, // Client IO to the pool may block
2632 DEGRADED
= 2, // Fewer than the requested number of replicas are present
2633 DEGRADED_FULL
= 3, // Fewer than the requested number of replicas may be present
2634 // and insufficient resources are present to fix this
2635 DAMAGED
= 4 // The data may be missing or inconsistent on disk and
2639 // For a given PG state, how should it be reported at the pool level?
2640 class PgStateResponse
{
2642 pg_consequence_t consequence
;
// Optional callback returning the timestamp a PG is considered stuck since;
// an empty callback means the state is reported immediately.
2643 typedef std::function
< utime_t(const pg_stat_t
&) > stuck_cb
;
2644 stuck_cb stuck_since
;
2647 PgStateResponse(const pg_consequence_t
&c
, stuck_cb s
)
2648 : consequence(c
), stuck_since(s
), invert(false)
2652 PgStateResponse(const pg_consequence_t
&c
, stuck_cb s
, bool i
)
2653 : consequence(c
), stuck_since(s
), invert(i
)
2658 // Record the PG state counts that contributed to a reported pool state
2661 // Map of PG_STATE_* to number of pgs in that state.
2662 std::map
<unsigned, unsigned> states
;
2664 // List of all PG IDs that had a state contributing
2665 // to this health condition.
2668 std::map
<pg_t
, std::string
> pg_messages
;
2671 // Map of PG state to how to respond to it
2672 std::map
<unsigned, PgStateResponse
> state_to_response
= {
2673 // Immediate reports
2674 { PG_STATE_INCONSISTENT
, {DAMAGED
, {}} },
2675 { PG_STATE_INCOMPLETE
, {UNAVAILABLE
, {}} },
2676 { PG_STATE_REPAIR
, {DAMAGED
, {}} },
2677 { PG_STATE_SNAPTRIM_ERROR
, {DAMAGED
, {}} },
2678 { PG_STATE_RECOVERY_UNFOUND
, {DAMAGED
, {}} },
2679 { PG_STATE_BACKFILL_UNFOUND
, {DAMAGED
, {}} },
2680 { PG_STATE_BACKFILL_TOOFULL
, {DEGRADED_FULL
, {}} },
2681 { PG_STATE_RECOVERY_TOOFULL
, {DEGRADED_FULL
, {}} },
2682 { PG_STATE_DEGRADED
, {DEGRADED
, {}} },
2683 { PG_STATE_DOWN
, {UNAVAILABLE
, {}} },
2684 // Delayed (wait until stuck) reports
2685 { PG_STATE_PEERING
, {UNAVAILABLE
, [](const pg_stat_t
&p
){return p
.last_peered
;} } },
2686 { PG_STATE_UNDERSIZED
, {DEGRADED
, [](const pg_stat_t
&p
){return p
.last_fullsized
;} } },
2687 { PG_STATE_STALE
, {UNAVAILABLE
, [](const pg_stat_t
&p
){return p
.last_unstale
;} } },
2688 // Delayed and inverted reports
2689 { PG_STATE_ACTIVE
, {UNAVAILABLE
, [](const pg_stat_t
&p
){return p
.last_active
;}, true} }
2692 // Specialized state printer that takes account of inversion of
2693 // ACTIVE, CLEAN checks.
2694 auto state_name
= [](const uint32_t &state
) {
2695 // Special cases for the states that are inverted checks
2696 if (state
== PG_STATE_CLEAN
) {
2697 return std::string("unclean");
2698 } else if (state
== PG_STATE_ACTIVE
) {
2699 return std::string("inactive");
2701 return pg_state_string(state
);
2705 // Map of what is wrong to information about why, implicitly also stores
2706 // the list of what is wrong.
2707 std::map
<pg_consequence_t
, PgCauses
> detected
;
2709 // Optimisation: trim down the number of checks to apply based on
2710 // the summary counters
2711 std::map
<unsigned, PgStateResponse
> possible_responses
;
2712 for (const auto &i
: num_pg_by_state
) {
2713 for (const auto &j
: state_to_response
) {
2714 if (!j
.second
.invert
) {
2715 // Check for normal tests by seeing if any pgs have the flag
2716 if (i
.first
& j
.first
) {
2717 possible_responses
.insert(j
);
2723 for (const auto &j
: state_to_response
) {
2724 if (j
.second
.invert
) {
2725 // Check for inverted tests by seeing if not-all pgs have the flag
2726 const auto &found
= num_pg_by_state
.find(j
.first
);
2727 if (found
== num_pg_by_state
.end() || found
->second
!= num_pg
) {
2728 possible_responses
.insert(j
);
// PGs whose "last good" timestamp is at or after this cutoff are not yet
// considered stuck.
2733 utime_t cutoff
= now
- utime_t(cct
->_conf
->get_val
<int64_t>("mon_pg_stuck_threshold"), 0);
2734 // Loop over all PGs, if there are any possibly-unhealthy states in there
2735 if (!possible_responses
.empty()) {
2736 for (const auto& i
: pg_stat
) {
2737 const auto &pg_id
= i
.first
;
2738 const auto &pg_info
= i
.second
;
// NOTE(review): this iterates the full state_to_response table, not the
// trimmed possible_responses built above; possible_responses is only used
// for the emptiness test -- confirm whether that is intended.
2740 for (const auto &j
: state_to_response
) {
2741 const auto &pg_response_state
= j
.first
;
2742 const auto &pg_response
= j
.second
;
2744 // Apply the state test
2745 if (!(bool(pg_info
.state
& pg_response_state
) != pg_response
.invert
)) {
2749 // Apply stuckness test if needed
2750 if (pg_response
.stuck_since
) {
2751 // Delayed response, check for stuckness
2752 utime_t last_whatever
= pg_response
.stuck_since(pg_info
);
2753 if (last_whatever
>= cutoff
) {
2754 // Not stuck enough, ignore.
2761 auto &causes
= detected
[pg_response
.consequence
];
2762 causes
.states
[pg_response_state
]++;
2763 causes
.pgs
.insert(pg_id
);
2765 // Don't bother composing detail string if we have already recorded
2767 if (causes
.pg_messages
.size() > max
) {
2771 std::ostringstream ss
;
2772 if (pg_response
.stuck_since
) {
2773 utime_t since
= pg_response
.stuck_since(pg_info
);
2774 ss
<< "pg " << pg_id
<< " is stuck " << state_name(pg_response_state
);
2775 if (since
== utime_t()) {
2776 ss
<< " since forever";
2778 utime_t dur
= now
- since
;
2779 ss
<< " for " << dur
;
2781 ss
<< ", current state " << pg_state_string(pg_info
.state
)
2782 << ", last acting " << pg_info
.acting
;
2784 ss
<< "pg " << pg_id
<< " is "
2785 << pg_state_string(pg_info
.state
);
2786 ss
<< ", acting " << pg_info
.acting
;
2787 if (pg_info
.stats
.sum
.num_objects_unfound
) {
2788 ss
<< ", " << pg_info
.stats
.sum
.num_objects_unfound
2793 if (pg_info
.state
& PG_STATE_INCOMPLETE
) {
2794 const pg_pool_t
*pi
= osdmap
.get_pg_pool(pg_id
.pool());
2795 if (pi
&& pi
->min_size
> 1) {
2796 ss
<< " (reducing pool "
2797 << osdmap
.get_pool_name(pg_id
.pool())
2798 << " min_size from " << (int)pi
->min_size
2799 << " may help; search ceph.com/docs for 'incomplete')";
// Keyed by pg id, so a later matching state overwrites an earlier message
// recorded for the same PG under the same consequence.
2803 causes
.pg_messages
[pg_id
] = ss
.str();
2807 dout(10) << __func__
<< " skipping loop over PGs: counters look OK" << dendl
;
// Turn every detected consequence into one health check: a code, a summary
// prefix, the per-state PG counts and the recorded per-PG messages.
2810 for (const auto &i
: detected
) {
2811 std::string health_code
;
2812 health_status_t sev
;
2813 std::string summary
;
2816 health_code
= "PG_AVAILABILITY";
2818 summary
= "Reduced data availability: ";
2821 health_code
= "PG_DEGRADED";
2822 summary
= "Degraded data redundancy: ";
2826 health_code
= "PG_DEGRADED_FULL";
2827 summary
= "Degraded data redundancy (low space): ";
2831 health_code
= "PG_DAMAGED";
2832 summary
= "Possible data damage: ";
// For DEGRADED the summary additionally carries the cluster-wide degraded
// object ratio, computed only when num_object_copies is positive.
2839 if (i
.first
== DEGRADED
) {
2840 if (pg_sum
.stats
.sum
.num_objects_degraded
&&
2841 pg_sum
.stats
.sum
.num_object_copies
> 0) {
2842 double pc
= (double)pg_sum
.stats
.sum
.num_objects_degraded
/
2843 (double)pg_sum
.stats
.sum
.num_object_copies
* (double)100.0;
2845 snprintf(b
, sizeof(b
), "%.3lf", pc
);
2847 ss
<< pg_sum
.stats
.sum
.num_objects_degraded
2848 << "/" << pg_sum
.stats
.sum
.num_object_copies
<< " objects degraded ("
2851 // Throw in a comma for the benefit of the following PG counts
2852 summary
+= ss
.str() + ", ";
2856 // Compose summary message saying how many PGs in what states led
2857 // to this health check failing
2858 std::vector
<std::string
> pg_msgs
;
2859 for (const auto &j
: i
.second
.states
) {
2860 std::ostringstream msg
;
2861 msg
<< j
.second
<< (j
.second
> 1 ? " pgs " : " pg ") << state_name(j
.first
);
2862 pg_msgs
.push_back(msg
.str());
2864 summary
+= joinify(pg_msgs
.begin(), pg_msgs
.end(), std::string(", "));
2868 health_check_t
*check
= &checks
->add(
2873 // Compose list of PGs contributing to this health check failing
2874 for (const auto &j
: i
.second
.pg_messages
) {
2875 check
->detail
.push_back(j
.second
);
// OSD_SCRUB_ERRORS: raised as HEALTH_ERR whenever the summed scrub error
// counter is non-zero.
2880 if (pg_sum
.stats
.sum
.num_scrub_errors
) {
2882 ss
<< pg_sum
.stats
.sum
.num_scrub_errors
<< " scrub errors";
2883 checks
->add("OSD_SCRUB_ERRORS", HEALTH_ERR
, ss
.str());
2886 // LARGE_OMAP_OBJECTS
2887 if (pg_sum
.stats
.sum
.num_large_omap_objects
) {
2888 list
<string
> detail
;
2889 for (auto &pool
: pools
) {
2890 const string
& pool_name
= osdmap
.get_pool_name(pool
.first
);
2891 auto it2
= pg_pool_sum
.find(pool
.first
);
2892 if (it2
== pg_pool_sum
.end()) {
// NOTE(review): pstat is the address of a map element, so the nullptr test
// below can never be true.
2895 const pool_stat_t
*pstat
= &it2
->second
;
2896 if (pstat
== nullptr) {
2899 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
2900 if (sum
.num_large_omap_objects
) {
2902 ss
<< sum
.num_large_omap_objects
<< " large objects found in pool "
2903 << "'" << pool_name
<< "'";
2904 detail
.push_back(ss
.str());
2907 if (!detail
.empty()) {
2909 ss
<< pg_sum
.stats
.sum
.num_large_omap_objects
<< " large omap objects";
2910 auto& d
= checks
->add("LARGE_OMAP_OBJECTS", HEALTH_WARN
, ss
.str());
2912 tip
<< "Search the cluster log for 'Large omap object found' for more "
2914 detail
.push_back(tip
.str());
2915 d
.detail
.swap(detail
);
2919 // CACHE_POOL_NEAR_FULL
2921 list
<string
> detail
;
2922 unsigned num_pools
= 0;
2923 for (auto& p
: pools
) {
// Only pools that set a target_max_objects or target_max_bytes and have an
// entry in pg_pool_sum are examined.
2924 if ((!p
.second
.target_max_objects
&& !p
.second
.target_max_bytes
) ||
2925 !pg_pool_sum
.count(p
.first
)) {
2928 bool nearfull
= false;
2929 const string
& name
= osdmap
.get_pool_name(p
.first
);
2930 const pool_stat_t
& st
= get_pg_pool_sum_stat(p
.first
);
// Warning ratio in millionths: the pool's cache_target_full_ratio_micro plus
// mon_cache_target_full_warn_ratio of the remaining headroom up to 1000000.
2931 uint64_t ratio
= p
.second
.cache_target_full_ratio_micro
+
2932 ((1000000 - p
.second
.cache_target_full_ratio_micro
) *
2933 cct
->_conf
->mon_cache_target_full_warn_ratio
);
// Hit-set archive objects/bytes are subtracted before comparing usage with
// the target.
2934 if (p
.second
.target_max_objects
&&
2935 (uint64_t)(st
.stats
.sum
.num_objects
-
2936 st
.stats
.sum
.num_objects_hit_set_archive
) >
2937 p
.second
.target_max_objects
* (ratio
/ 1000000.0)) {
2939 ss
<< "cache pool '" << name
<< "' with "
2940 << si_t(st
.stats
.sum
.num_objects
)
2941 << " objects at/near target max "
2942 << si_t(p
.second
.target_max_objects
) << " objects";
2943 detail
.push_back(ss
.str());
2946 if (p
.second
.target_max_bytes
&&
2947 (uint64_t)(st
.stats
.sum
.num_bytes
-
2948 st
.stats
.sum
.num_bytes_hit_set_archive
) >
2949 p
.second
.target_max_bytes
* (ratio
/ 1000000.0)) {
2951 ss
<< "cache pool '" << name
2952 << "' with " << si_t(st
.stats
.sum
.num_bytes
)
2953 << "B at/near target max "
2954 << si_t(p
.second
.target_max_bytes
) << "B";
2955 detail
.push_back(ss
.str());
2962 if (!detail
.empty()) {
2964 ss
<< num_pools
<< " cache pools at or near target size";
2965 auto& d
= checks
->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN
, ss
.str());
2966 d
.detail
.swap(detail
);
// TOO_FEW_PGS / TOO_MANY_PGS: compare PGs per "in" OSD against the
// configured bounds. sum_pg_up is the larger of pg_sum.up and the number of
// PGs in pg_stat.
2971 unsigned num_in
= osdmap
.get_num_in_osds();
2972 auto sum_pg_up
= std::max(static_cast<size_t>(pg_sum
.up
), pg_stat
.size());
2973 const auto min_pg_per_osd
=
2974 cct
->_conf
->get_val
<uint64_t>("mon_pg_warn_min_per_osd");
2975 if (num_in
&& min_pg_per_osd
> 0 && osdmap
.get_pools().size() > 0) {
2976 auto per
= sum_pg_up
/ num_in
;
// A quotient of zero is deliberately not reported.
2977 if (per
< min_pg_per_osd
&& per
) {
2979 ss
<< "too few PGs per OSD (" << per
2980 << " < min " << min_pg_per_osd
<< ")";
2981 checks
->add("TOO_FEW_PGS", HEALTH_WARN
, ss
.str());
2986 auto max_pg_per_osd
= cct
->_conf
->get_val
<uint64_t>("mon_max_pg_per_osd");
2987 if (num_in
&& max_pg_per_osd
> 0) {
2988 auto per
= sum_pg_up
/ num_in
;
2989 if (per
> max_pg_per_osd
) {
2991 ss
<< "too many PGs per OSD (" << per
2992 << " > max " << max_pg_per_osd
<< ")";
2993 checks
->add("TOO_MANY_PGS", HEALTH_WARN
, ss
.str());
2998 // MANY_OBJECTS_PER_PG
2999 if (!pg_stat
.empty()) {
3000 list
<string
> pgp_detail
, many_detail
;
3001 const auto mon_pg_warn_min_objects
=
3002 cct
->_conf
->get_val
<int64_t>("mon_pg_warn_min_objects");
3003 const auto mon_pg_warn_min_pool_objects
=
3004 cct
->_conf
->get_val
<int64_t>("mon_pg_warn_min_pool_objects");
3005 const auto mon_pg_warn_max_object_skew
=
3006 cct
->_conf
->get_val
<double>("mon_pg_warn_max_object_skew");
3007 for (auto p
= pg_pool_sum
.begin();
3008 p
!= pg_pool_sum
.end();
3010 const pg_pool_t
*pi
= osdmap
.get_pg_pool(p
->first
);
3012 continue; // in case osdmap changes haven't propagated to PGMap yet
3013 const string
& name
= osdmap
.get_pool_name(p
->first
);
// SMALLER_PGP_NUM detail: pg_num > pgp_num, except for pools whose name
// contains ".DELETED" while mon_fake_pool_delete is set.
3014 if (pi
->get_pg_num() > pi
->get_pgp_num() &&
3015 !(name
.find(".DELETED") != string::npos
&&
3016 cct
->_conf
->mon_fake_pool_delete
)) {
3018 ss
<< "pool " << name
<< " pg_num "
3019 << pi
->get_pg_num() << " > pgp_num " << pi
->get_pgp_num();
3020 pgp_detail
.push_back(ss
.str());
// NOTE(review): the cluster-wide average is loop-invariant but recomputed
// for every pool, and both per-PG figures are truncated to int.
3022 int average_objects_per_pg
= pg_sum
.stats
.sum
.num_objects
/ pg_stat
.size();
3023 if (average_objects_per_pg
> 0 &&
3024 pg_sum
.stats
.sum
.num_objects
>= mon_pg_warn_min_objects
&&
3025 p
->second
.stats
.sum
.num_objects
>= mon_pg_warn_min_pool_objects
) {
3026 int objects_per_pg
= p
->second
.stats
.sum
.num_objects
/ pi
->get_pg_num();
3027 float ratio
= (float)objects_per_pg
/ (float)average_objects_per_pg
;
3028 if (mon_pg_warn_max_object_skew
> 0 &&
3029 ratio
> mon_pg_warn_max_object_skew
) {
3031 ss
<< "pool " << name
<< " objects per pg ("
3032 << objects_per_pg
<< ") is more than " << ratio
3033 << " times cluster average ("
3034 << average_objects_per_pg
<< ")";
3035 many_detail
.push_back(ss
.str());
3039 if (!pgp_detail
.empty()) {
3041 ss
<< pgp_detail
.size() << " pools have pg_num > pgp_num";
3042 auto& d
= checks
->add("SMALLER_PGP_NUM", HEALTH_WARN
, ss
.str());
3043 d
.detail
.swap(pgp_detail
);
3045 if (!many_detail
.empty()) {
3047 ss
<< many_detail
.size() << " pools have many more objects per pg than"
3049 auto& d
= checks
->add("MANY_OBJECTS_PER_PG", HEALTH_WARN
, ss
.str());
3050 d
.detail
.swap(many_detail
);
// POOL_FULL / POOL_NEAR_FULL: compare per-pool object and byte usage with
// the pool quotas. The thresholds are configured as percentages and
// converted to fractions here.
3057 float warn_threshold
= (float)g_conf
->get_val
<int64_t>("mon_pool_quota_warn_threshold")/100;
3058 float crit_threshold
= (float)g_conf
->get_val
<int64_t>("mon_pool_quota_crit_threshold")/100;
3059 list
<string
> full_detail
, nearfull_detail
;
3060 unsigned full_pools
= 0, nearfull_pools
= 0;
// NOTE(review): `auto it` copies every (pool id, pg_pool_t) pair on each
// iteration; the sibling loops over `pools` bind by reference.
3061 for (auto it
: pools
) {
3062 auto it2
= pg_pool_sum
.find(it
.first
);
3063 if (it2
== pg_pool_sum
.end()) {
3066 const pool_stat_t
*pstat
= &it2
->second
;
3067 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
3068 const string
& pool_name
= osdmap
.get_pool_name(it
.first
);
3069 const pg_pool_t
&pool
= it
.second
;
3070 bool full
= false, nearfull
= false;
3071 if (pool
.quota_max_objects
> 0) {
3073 if ((uint64_t)sum
.num_objects
>= pool
.quota_max_objects
) {
3074 } else if (crit_threshold
> 0 &&
3075 sum
.num_objects
>= pool
.quota_max_objects
*crit_threshold
) {
3076 ss
<< "pool '" << pool_name
3077 << "' has " << sum
.num_objects
<< " objects"
3078 << " (max " << pool
.quota_max_objects
<< ")";
3079 full_detail
.push_back(ss
.str());
3081 } else if (warn_threshold
> 0 &&
3082 sum
.num_objects
>= pool
.quota_max_objects
*warn_threshold
) {
3083 ss
<< "pool '" << pool_name
3084 << "' has " << sum
.num_objects
<< " objects"
3085 << " (max " << pool
.quota_max_objects
<< ")";
3086 nearfull_detail
.push_back(ss
.str());
3090 if (pool
.quota_max_bytes
> 0) {
3092 if ((uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) {
3093 } else if (crit_threshold
> 0 &&
3094 sum
.num_bytes
>= pool
.quota_max_bytes
*crit_threshold
) {
3095 ss
<< "pool '" << pool_name
3096 << "' has " << si_t(sum
.num_bytes
) << " bytes"
3097 << " (max " << si_t(pool
.quota_max_bytes
) << ")";
3098 full_detail
.push_back(ss
.str());
3100 } else if (warn_threshold
> 0 &&
3101 sum
.num_bytes
>= pool
.quota_max_bytes
*warn_threshold
) {
3102 ss
<< "pool '" << pool_name
3103 << "' has " << si_t(sum
.num_bytes
) << " bytes"
3104 << " (max " << si_t(pool
.quota_max_bytes
) << ")";
3105 nearfull_detail
.push_back(ss
.str());
3118 ss
<< full_pools
<< " pools full";
3119 auto& d
= checks
->add("POOL_FULL", HEALTH_ERR
, ss
.str());
3120 d
.detail
.swap(full_detail
);
// NOTE(review): the POOL_NEAR_FULL summary below reuses the POOL_FULL
// wording (" pools full"); this looks like a copy-paste slip and probably
// should read " pools nearfull" -- confirm before changing user-visible text.
3122 if (nearfull_pools
) {
3124 ss
<< nearfull_pools
<< " pools full";
3125 auto& d
= checks
->add("POOL_NEAR_FULL", HEALTH_WARN
, ss
.str());
3126 d
.detail
.swap(nearfull_detail
);
// OBJECT_MISPLACED: misplaced objects as a percentage of all object copies.
3131 if (pg_sum
.stats
.sum
.num_objects_misplaced
&&
3132 pg_sum
.stats
.sum
.num_object_copies
> 0) {
3133 double pc
= (double)pg_sum
.stats
.sum
.num_objects_misplaced
/
3134 (double)pg_sum
.stats
.sum
.num_object_copies
* (double)100.0;
3136 snprintf(b
, sizeof(b
), "%.3lf", pc
);
3138 ss
<< pg_sum
.stats
.sum
.num_objects_misplaced
3139 << "/" << pg_sum
.stats
.sum
.num_object_copies
<< " objects misplaced ("
3141 checks
->add("OBJECT_MISPLACED", HEALTH_WARN
, ss
.str());
// OBJECT_UNFOUND: unfound objects as a percentage of all objects, with one
// detail line per affected PG.
3145 if (pg_sum
.stats
.sum
.num_objects_unfound
&&
3146 pg_sum
.stats
.sum
.num_objects
) {
3147 double pc
= (double)pg_sum
.stats
.sum
.num_objects_unfound
/
3148 (double)pg_sum
.stats
.sum
.num_objects
* (double)100.0;
3150 snprintf(b
, sizeof(b
), "%.3lf", pc
);
3152 ss
<< pg_sum
.stats
.sum
.num_objects_unfound
3153 << "/" << pg_sum
.stats
.sum
.num_objects
<< " objects unfound (" << b
<< "%)";
3154 auto& d
= checks
->add("OBJECT_UNFOUND", HEALTH_WARN
, ss
.str());
3156 for (auto& p
: pg_stat
) {
3157 if (p
.second
.stats
.sum
.num_objects_unfound
) {
3159 ss
<< "pg " << p
.first
3160 << " has " << p
.second
.stats
.sum
.num_objects_unfound
3161 << " unfound objects";
3162 d
.detail
.push_back(ss
.str());
// The size test runs after the push, so more than `max` per-PG lines can
// be present before the "left out" marker is appended.
3163 if (d
.detail
.size() > max
) {
3164 d
.detail
.push_back("(additional pgs left out for brevity)");
// REQUEST_SLOW / REQUEST_STUCK: driven by the op queue age histograms.
// This block is entered only when the warning age is positive, the summed
// histogram is non-empty and its upper bound (divided by 1000) exceeds
// mon_osd_warn_op_age.
3173 if (cct
->_conf
->mon_osd_warn_op_age
> 0 &&
3174 !osd_sum
.op_queue_age_hist
.h
.empty() &&
3175 osd_sum
.op_queue_age_hist
.upper_bound() / 1000.0 >
3176 cct
->_conf
->mon_osd_warn_op_age
) {
3177 list
<string
> warn_detail
, error_detail
;
3178 unsigned warn
= 0, error
= 0;
3180 cct
->_conf
->mon_osd_warn_op_age
* cct
->_conf
->mon_osd_err_op_age_ratio
;
3181 const pow2_hist_t
& h
= osd_sum
.op_queue_age_hist
;
// Walk buckets from the highest down; bucket 0 is never examined.
// NOTE(review): `1 << i` is an int shift and is undefined for i >= 31.
3182 for (unsigned i
= h
.h
.size() - 1; i
> 0; --i
) {
3183 float ub
= (float)(1 << i
) / 1000.0;
3184 if (ub
< cct
->_conf
->mon_osd_warn_op_age
)
3188 ss
<< h
.h
[i
] << " ops are blocked > " << ub
<< " sec";
3191 error_detail
.push_back(ss
.str());
3194 warn_detail
.push_back(ss
.str());
3199 map
<float,set
<int>> warn_osd_by_max
; // max -> osds
3200 map
<float,set
<int>> error_osd_by_max
; // max -> osds
3201 if (!warn_detail
.empty() || !error_detail
.empty()) {
3202 for (auto& p
: osd_stat
) {
3203 const pow2_hist_t
& h
= p
.second
.op_queue_age_hist
;
// NOTE(review): unlike the summed histogram above, nothing visible here
// checks that this OSD's histogram is non-empty; with an empty h.h,
// `h.h.size() - 1` wraps around -- confirm and add a guard if needed.
3204 for (unsigned i
= h
.h
.size() - 1; i
> 0; --i
) {
3205 float ub
= (float)(1 << i
) / 1000.0;
3206 if (ub
< cct
->_conf
->mon_osd_warn_op_age
)
3210 error_osd_by_max
[ub
].insert(p
.first
);
3212 warn_osd_by_max
[ub
].insert(p
.first
);
3220 if (!warn_detail
.empty()) {
3222 set
<int> implicated_osds
;
3223 for (auto& p
: warn_osd_by_max
) {
3225 implicated_osds
.insert(p
.second
.begin(), p
.second
.end());
3226 if (p
.second
.size() > 1) {
3227 ss
<< "osds " << p
.second
3228 << " have blocked requests > " << p
.first
<< " sec";
3230 ss
<< "osd." << *p
.second
.begin()
3231 << " has blocked requests > " << p
.first
<< " sec";
3233 warn_detail
.push_back(ss
.str());
3239 ss
<< warn
<< " slow requests are blocked > "
3240 << cct
->_conf
->mon_osd_warn_op_age
<< " sec. Implicated osds "
3242 auto& d
= checks
->add("REQUEST_SLOW", HEALTH_WARN
, ss
.str());
3243 d
.detail
.swap(warn_detail
);
3245 if (!error_detail
.empty()) {
3247 set
<int> implicated_osds
;
3248 for (auto& p
: error_osd_by_max
) {
3250 implicated_osds
.insert(p
.second
.begin(), p
.second
.end());
3251 if (p
.second
.size() > 1) {
3252 ss
<< "osds " << p
.second
3253 << " have stuck requests > " << p
.first
<< " sec";
3255 ss
<< "osd." << *p
.second
.begin()
3256 << " has stuck requests > " << p
.first
<< " sec";
3258 error_detail
.push_back(ss
.str());
3264 ss
<< error
<< " stuck requests are blocked > "
3265 << err_age
<< " sec. Implicated osds " << implicated_osds
;
3266 auto& d
= checks
->add("REQUEST_STUCK", HEALTH_ERR
, ss
.str());
3267 d
.detail
.swap(error_detail
);
3272 // PG_NOT_DEEP_SCRUBBED
3274 if (cct
->_conf
->mon_warn_not_scrubbed
||
3275 cct
->_conf
->mon_warn_not_deep_scrubbed
) {
3276 list
<string
> detail
, deep_detail
;
// Allowed age is the warning margin plus the corresponding scrub interval.
3277 const double age
= cct
->_conf
->mon_warn_not_scrubbed
+
3278 cct
->_conf
->mon_scrub_interval
;
3279 utime_t cutoff
= now
;
3281 const double deep_age
= cct
->_conf
->mon_warn_not_deep_scrubbed
+
3282 cct
->_conf
->osd_deep_scrub_interval
;
3283 utime_t deep_cutoff
= now
;
3284 deep_cutoff
-= deep_age
;
3285 for (auto& p
: pg_stat
) {
3286 if (cct
->_conf
->mon_warn_not_scrubbed
&&
3287 p
.second
.last_scrub_stamp
< cutoff
) {
3289 ss
<< "pg " << p
.first
<< " not scrubbed since "
3290 << p
.second
.last_scrub_stamp
;
3291 detail
.push_back(ss
.str());
3293 if (cct
->_conf
->mon_warn_not_deep_scrubbed
&&
3294 p
.second
.last_deep_scrub_stamp
< deep_cutoff
) {
3296 ss
<< "pg " << p
.first
<< " not deep-scrubbed since "
3297 << p
.second
.last_deep_scrub_stamp
;
3298 deep_detail
.push_back(ss
.str());
3301 if (!detail
.empty()) {
3303 ss
<< detail
.size() << " pgs not scrubbed for " << age
;
3304 auto& d
= checks
->add("PG_NOT_SCRUBBED", HEALTH_WARN
, ss
.str());
3305 d
.detail
.swap(detail
);
3307 if (!deep_detail
.empty()) {
3309 ss
<< deep_detail
.size() << " pgs not deep-scrubbed for " << deep_age
;
3310 auto& d
= checks
->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN
, ss
.str());
3311 d
.detail
.swap(deep_detail
);
// POOL_APP_NOT_ENABLED: pools that hold objects but have no application
// metadata, gated by mon_warn_on_pool_no_app.
3317 if (g_conf
->get_val
<bool>("mon_warn_on_pool_no_app")) {
3318 list
<string
> detail
;
3319 for (auto &it
: pools
) {
3320 const pg_pool_t
&pool
= it
.second
;
3321 const string
& pool_name
= osdmap
.get_pool_name(it
.first
);
3322 auto it2
= pg_pool_sum
.find(it
.first
);
3323 if (it2
== pg_pool_sum
.end()) {
// NOTE(review): pstat is the address of a map element, so the nullptr test
// below can never be true.
3326 const pool_stat_t
*pstat
= &it2
->second
;
3327 if (pstat
== nullptr) {
3330 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
3331 // application metadata is not encoded until luminous is minimum
3333 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
&&
3334 sum
.num_objects
> 0 && pool
.application_metadata
.empty() &&
3335 !pool
.is_tier() && !g_conf
->mon_debug_no_require_luminous
) {
3337 ss
<< "application not enabled on pool '" << pool_name
<< "'";
3338 detail
.push_back(ss
.str());
3341 if (!detail
.empty()) {
3343 ss
<< "application not enabled on " << detail
.size() << " pool(s)";
3344 auto& d
= checks
->add("POOL_APP_NOT_ENABLED", HEALTH_WARN
, ss
.str());
3346 tip
<< "use 'ceph osd pool application enable <pool-name> "
3347 << "<app-name>', where <app-name> is 'cephfs', 'rbd', 'rgw', "
3348 << "or freeform for custom applications.";
3349 detail
.push_back(tip
.str());
3350 d
.detail
.swap(detail
);
3354 // PG_SLOW_SNAP_TRIMMING
3355 if (!pg_stat
.empty() && cct
->_conf
->mon_osd_snap_trim_queue_warn_on
> 0) {
3356 uint32_t snapthreshold
= cct
->_conf
->mon_osd_snap_trim_queue_warn_on
;
3357 uint64_t snaptrimq_exceeded
= 0;
3358 uint32_t longest_queue
= 0;
3359 const pg_t
* longest_q_pg
= nullptr;
3360 list
<string
> detail
;
3362 for (auto& i
: pg_stat
) {
3363 uint32_t current_len
= i
.second
.snaptrimq_len
;
3364 if (current_len
>= snapthreshold
) {
3365 snaptrimq_exceeded
++;
// `<=` means that among equally long queues the PG visited last wins.
3366 if (longest_queue
<= current_len
) {
3367 longest_q_pg
= &i
.first
;
3368 longest_queue
= current_len
;
// NOTE(review): `max` is an unsigned 64-bit value, so `max - 1` wraps when
// mon_health_max_detail is 0 and the detail list is then effectively
// unbounded -- confirm whether 0 is a permitted setting.
3370 if (detail
.size() < max
- 1) {
3372 ss
<< "snap trim queue for pg " << i
.first
<< " at " << current_len
;
3373 detail
.push_back(ss
.str());
3376 if (detail
.size() < max
) {
3377 detail
.push_back("...more pgs affected");
3383 if (snaptrimq_exceeded
) {
3386 ss
<< "longest queue on pg " << *longest_q_pg
<< " at " << longest_queue
;
3387 detail
.push_back(ss
.str());
3391 ss
<< "snap trim queue for " << snaptrimq_exceeded
<< " pg(s) >= " << snapthreshold
<< " (mon_osd_snap_trim_queue_warn_on)";
3392 auto& d
= checks
->add("PG_SLOW_SNAP_TRIMMING", HEALTH_WARN
, ss
.str());
3393 detail
.push_back("try decreasing \"osd snap trim sleep\" and/or increasing \"osd pg max concurrent snap trims\".");
3394 d
.detail
.swap(detail
);
3399 void PGMap::get_health(
3401 const OSDMap
& osdmap
,
3402 list
<pair
<health_status_t
,string
> >& summary
,
3403 list
<pair
<health_status_t
,string
> > *detail
) const
3405 map
<string
,int> note
;
3406 auto p
= num_pg_by_state
.begin();
3407 auto p_end
= num_pg_by_state
.end();
3408 for (; p
!= p_end
; ++p
) {
3409 if (p
->first
& PG_STATE_STALE
)
3410 note
["stale"] += p
->second
;
3411 if (p
->first
& PG_STATE_DOWN
)
3412 note
["down"] += p
->second
;
3413 if (p
->first
& PG_STATE_UNDERSIZED
)
3414 note
["undersized"] += p
->second
;
3415 if (p
->first
& PG_STATE_DEGRADED
)
3416 note
["degraded"] += p
->second
;
3417 if (p
->first
& PG_STATE_INCONSISTENT
)
3418 note
["inconsistent"] += p
->second
;
3419 if (p
->first
& PG_STATE_PEERING
)
3420 note
["peering"] += p
->second
;
3421 if (p
->first
& PG_STATE_REPAIR
)
3422 note
["repair"] += p
->second
;
3423 if (p
->first
& PG_STATE_RECOVERING
)
3424 note
["recovering"] += p
->second
;
3425 if (p
->first
& PG_STATE_RECOVERY_WAIT
)
3426 note
["recovery_wait"] += p
->second
;
3427 if (p
->first
& PG_STATE_INCOMPLETE
)
3428 note
["incomplete"] += p
->second
;
3429 if (p
->first
& PG_STATE_BACKFILL_WAIT
)
3430 note
["backfill_wait"] += p
->second
;
3431 if (p
->first
& PG_STATE_BACKFILLING
)
3432 note
["backfilling"] += p
->second
;
3433 if (p
->first
& PG_STATE_BACKFILL_TOOFULL
)
3434 note
["backfill_toofull"] += p
->second
;
3435 if (p
->first
& PG_STATE_RECOVERY_TOOFULL
)
3436 note
["recovery_toofull"] += p
->second
;
3437 if (p
->first
& PG_STATE_SNAPTRIM_ERROR
)
3438 note
["snaptrim_error"] += p
->second
;
3441 mempool::pgmap::unordered_map
<pg_t
, pg_stat_t
> stuck_pgs
;
3442 utime_t
now(ceph_clock_now());
3443 utime_t cutoff
= now
- utime_t(g_conf
->get_val
<int64_t>("mon_pg_stuck_threshold"), 0);
3444 uint64_t num_inactive_pgs
= 0;
3447 // we need to collect details of stuck pgs, first do a quick check
3448 // whether this will yield any results
3449 if (get_stuck_counts(cutoff
, note
)) {
3451 // there are stuck pgs. gather details for specified statuses
3452 // only if we know that there are pgs stuck in that status
3454 if (note
.find("stuck inactive") != note
.end()) {
3455 get_stuck_stats(PGMap::STUCK_INACTIVE
, cutoff
, stuck_pgs
);
3456 note
["stuck inactive"] = stuck_pgs
.size();
3457 num_inactive_pgs
+= stuck_pgs
.size();
3458 note_stuck_detail(PGMap::STUCK_INACTIVE
, stuck_pgs
,
3459 cct
->_conf
->get_val
<uint64_t>("mon_health_max_detail"), detail
);
3463 if (note
.find("stuck unclean") != note
.end()) {
3464 get_stuck_stats(PGMap::STUCK_UNCLEAN
, cutoff
, stuck_pgs
);
3465 note
["stuck unclean"] = stuck_pgs
.size();
3466 note_stuck_detail(PGMap::STUCK_UNCLEAN
, stuck_pgs
,
3467 cct
->_conf
->get_val
<uint64_t>("mon_health_max_detail"), detail
);
3471 if (note
.find("stuck undersized") != note
.end()) {
3472 get_stuck_stats(PGMap::STUCK_UNDERSIZED
, cutoff
, stuck_pgs
);
3473 note
["stuck undersized"] = stuck_pgs
.size();
3474 note_stuck_detail(PGMap::STUCK_UNDERSIZED
, stuck_pgs
,
3475 cct
->_conf
->get_val
<uint64_t>("mon_health_max_detail"), detail
);
3479 if (note
.find("stuck degraded") != note
.end()) {
3480 get_stuck_stats(PGMap::STUCK_DEGRADED
, cutoff
, stuck_pgs
);
3481 note
["stuck degraded"] = stuck_pgs
.size();
3482 note_stuck_detail(PGMap::STUCK_DEGRADED
, stuck_pgs
,
3483 cct
->_conf
->get_val
<uint64_t>("mon_health_max_detail"), detail
);
3487 if (note
.find("stuck stale") != note
.end()) {
3488 get_stuck_stats(PGMap::STUCK_STALE
, cutoff
, stuck_pgs
);
3489 note
["stuck stale"] = stuck_pgs
.size();
3490 num_inactive_pgs
+= stuck_pgs
.size();
3491 note_stuck_detail(PGMap::STUCK_STALE
, stuck_pgs
,
3492 cct
->_conf
->get_val
<uint64_t>("mon_health_max_detail"), detail
);
3496 get_stuck_counts(cutoff
, note
);
3497 auto p
= note
.find("stuck inactive");
3498 if (p
!= note
.end())
3499 num_inactive_pgs
+= p
->second
;
3500 p
= note
.find("stuck stale");
3501 if (p
!= note
.end())
3502 num_inactive_pgs
+= p
->second
;
3505 if (cct
->_conf
->mon_pg_min_inactive
> 0 &&
3506 num_inactive_pgs
>= cct
->_conf
->mon_pg_min_inactive
) {
3508 ss
<< num_inactive_pgs
<< " pgs are stuck inactive for more than " << g_conf
->get_val
<int64_t>("mon_pg_stuck_threshold") << " seconds";
3509 summary
.push_back(make_pair(HEALTH_ERR
, ss
.str()));
3512 if (!note
.empty()) {
3513 for (auto p
= note
.begin(); p
!= note
.end(); ++p
) {
3515 ss
<< p
->second
<< " pgs " << p
->first
;
3516 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3519 int n
= 0, more
= 0;
3520 int max
= cct
->_conf
->get_val
<uint64_t>("mon_health_max_detail");
3521 for (auto p
= pg_stat
.begin();
3524 if ((p
->second
.state
& (PG_STATE_STALE
|
3526 PG_STATE_UNDERSIZED
|
3528 PG_STATE_INCONSISTENT
|
3531 PG_STATE_RECOVERING
|
3532 PG_STATE_RECOVERY_WAIT
|
3533 PG_STATE_RECOVERY_TOOFULL
|
3534 PG_STATE_INCOMPLETE
|
3535 PG_STATE_BACKFILL_WAIT
|
3536 PG_STATE_BACKFILLING
|
3537 PG_STATE_BACKFILL_TOOFULL
)) &&
3538 stuck_pgs
.count(p
->first
) == 0) {
3547 ss
<< "pg " << p
->first
<< " is " << pg_state_string(p
->second
.state
);
3548 ss
<< ", acting " << p
->second
.acting
;
3549 if (p
->second
.stats
.sum
.num_objects_unfound
)
3550 ss
<< ", " << p
->second
.stats
.sum
.num_objects_unfound
<< " unfound";
3551 if (p
->second
.state
& PG_STATE_INCOMPLETE
) {
3552 const pg_pool_t
*pi
= osdmap
.get_pg_pool(p
->first
.pool());
3553 if (pi
&& pi
->min_size
> 1) {
3554 ss
<< " (reducing pool " << osdmap
.get_pool_name(p
->first
.pool())
3555 << " min_size from " << (int)pi
->min_size
3556 << " may help; search ceph.com/docs for 'incomplete')";
3559 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3564 ss
<< more
<< " more pgs are also unhealthy";
3565 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3571 if (cct
->_conf
->mon_osd_warn_op_age
> 0 &&
3572 osd_sum
.op_queue_age_hist
.upper_bound() / 1000.0 >
3573 cct
->_conf
->mon_osd_warn_op_age
) {
3574 auto sum
= _warn_slow_request_histogram(
3575 cct
, osd_sum
.op_queue_age_hist
, "", summary
, NULL
);
3576 if (sum
.first
> 0 || sum
.second
> 0) {
3577 if (sum
.first
> 0) {
3579 ss
<< sum
.first
<< " requests are blocked > "
3580 << cct
->_conf
->mon_osd_warn_op_age
3582 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3584 if (sum
.second
> 0) {
3586 ss
<< sum
.second
<< " requests are blocked > "
3587 << (cct
->_conf
->mon_osd_warn_op_age
*
3588 cct
->_conf
->mon_osd_err_op_age_ratio
)
3590 summary
.push_back(make_pair(HEALTH_ERR
, ss
.str()));
3594 unsigned num_warn
= 0, num_err
= 0;
3595 // do per-osd warnings
3596 for (auto p
= osd_stat
.begin();
3597 p
!= osd_stat
.end();
3599 auto sum
= _warn_slow_request_histogram(
3601 p
->second
.op_queue_age_hist
,
3602 string(" on osd.") + stringify(p
->first
),
3611 ss2
<< num_err
<< " osds have very slow requests";
3612 summary
.push_back(make_pair(HEALTH_ERR
, ss2
.str()));
3613 detail
->push_back(make_pair(HEALTH_ERR
, ss2
.str()));
3617 ss2
<< num_warn
<< " osds have slow requests";
3618 summary
.push_back(make_pair(HEALTH_WARN
, ss2
.str()));
3619 detail
->push_back(make_pair(HEALTH_WARN
, ss2
.str()));
3627 overall_recovery_summary(NULL
, &sl
);
3628 for (auto p
= sl
.begin(); p
!= sl
.end(); ++p
) {
3629 summary
.push_back(make_pair(HEALTH_WARN
, "recovery " + *p
));
3631 detail
->push_back(make_pair(HEALTH_WARN
, "recovery " + *p
));
3634 // near-target max pools
3635 auto& pools
= osdmap
.get_pools();
3636 for (auto p
= pools
.begin();
3637 p
!= pools
.end(); ++p
) {
3638 if ((!p
->second
.target_max_objects
&& !p
->second
.target_max_bytes
) ||
3639 !pg_pool_sum
.count(p
->first
))
3641 bool nearfull
= false;
3642 const string
& name
= osdmap
.get_pool_name(p
->first
);
3643 const pool_stat_t
& st
= get_pg_pool_sum_stat(p
->first
);
3644 uint64_t ratio
= p
->second
.cache_target_full_ratio_micro
+
3645 ((1000000 - p
->second
.cache_target_full_ratio_micro
) *
3646 cct
->_conf
->mon_cache_target_full_warn_ratio
);
3647 if (p
->second
.target_max_objects
&&
3648 (uint64_t)(st
.stats
.sum
.num_objects
-
3649 st
.stats
.sum
.num_objects_hit_set_archive
) >
3650 p
->second
.target_max_objects
* (ratio
/ 1000000.0)) {
3654 ss
<< "cache pool '" << name
<< "' with "
3655 << si_t(st
.stats
.sum
.num_objects
)
3656 << " objects at/near target max "
3657 << si_t(p
->second
.target_max_objects
) << " objects";
3658 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3661 if (p
->second
.target_max_bytes
&&
3662 (uint64_t)(st
.stats
.sum
.num_bytes
-
3663 st
.stats
.sum
.num_bytes_hit_set_archive
) >
3664 p
->second
.target_max_bytes
* (ratio
/ 1000000.0)) {
3668 ss
<< "cache pool '" << name
3669 << "' with " << si_t(st
.stats
.sum
.num_bytes
)
3670 << "B at/near target max "
3671 << si_t(p
->second
.target_max_bytes
) << "B";
3672 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3677 ss
<< "'" << name
<< "' at/near target max";
3678 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3683 if (pg_sum
.stats
.sum
.num_scrub_errors
) {
3685 ss
<< pg_sum
.stats
.sum
.num_scrub_errors
<< " scrub errors";
3686 summary
.push_back(make_pair(HEALTH_ERR
, ss
.str()));
3688 detail
->push_back(make_pair(HEALTH_ERR
, ss
.str()));
3693 auto num_in
= osdmap
.get_num_in_osds();
3694 auto sum_pg_up
= MAX(static_cast<unsigned>(pg_sum
.up
), pg_stat
.size());
3695 int sum_objects
= pg_sum
.stats
.sum
.num_objects
;
3696 if (sum_objects
< cct
->_conf
->mon_pg_warn_min_objects
) {
3699 const auto min_pg_per_osd
=
3700 cct
->_conf
->get_val
<uint64_t>("mon_pg_warn_min_per_osd");
3701 if (num_in
&& min_pg_per_osd
> 0) {
3702 auto per
= sum_pg_up
/ num_in
;
3703 if (per
< min_pg_per_osd
&& per
) {
3705 ss
<< "too few PGs per OSD (" << per
<< " < min " << min_pg_per_osd
<< ")";
3706 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3708 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3711 int64_t max_pg_per_osd
= cct
->_conf
->get_val
<uint64_t>("mon_max_pg_per_osd");
3712 if (num_in
&& max_pg_per_osd
> 0) {
3713 int per
= sum_pg_up
/ num_in
;
3714 if (per
> max_pg_per_osd
) {
3716 ss
<< "too many PGs per OSD (" << per
<< " > max "
3717 << max_pg_per_osd
<< ")";
3718 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3720 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3723 if (!pg_stat
.empty()) {
3724 for (auto p
= pg_pool_sum
.begin();
3725 p
!= pg_pool_sum
.end();
3727 const pg_pool_t
*pi
= osdmap
.get_pg_pool(p
->first
);
3729 continue; // in case osdmap changes haven't propagated to PGMap yet
3730 const string
& name
= osdmap
.get_pool_name(p
->first
);
3731 if (pi
->get_pg_num() > pi
->get_pgp_num() &&
3732 !(name
.find(".DELETED") != string::npos
&&
3733 cct
->_conf
->mon_fake_pool_delete
)) {
3735 ss
<< "pool " << name
<< " pg_num "
3736 << pi
->get_pg_num() << " > pgp_num " << pi
->get_pgp_num();
3737 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3739 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3741 int average_objects_per_pg
= pg_sum
.stats
.sum
.num_objects
/ pg_stat
.size();
3742 if (average_objects_per_pg
> 0 &&
3743 pg_sum
.stats
.sum
.num_objects
>= cct
->_conf
->mon_pg_warn_min_objects
&&
3744 p
->second
.stats
.sum
.num_objects
>= cct
->_conf
->mon_pg_warn_min_pool_objects
) {
3745 int objects_per_pg
= p
->second
.stats
.sum
.num_objects
/ pi
->get_pg_num();
3746 float ratio
= (float)objects_per_pg
/ (float)average_objects_per_pg
;
3747 if (cct
->_conf
->mon_pg_warn_max_object_skew
> 0 &&
3748 ratio
> cct
->_conf
->mon_pg_warn_max_object_skew
) {
3750 ss
<< "pool " << name
<< " has many more objects per pg than average (too few pgs?)";
3751 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
3754 ss
<< "pool " << name
<< " objects per pg ("
3755 << objects_per_pg
<< ") is more than " << ratio
<< " times cluster average ("
3756 << average_objects_per_pg
<< ")";
3757 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
3764 for (auto it
: pools
) {
3765 auto it2
= pg_pool_sum
.find(it
.first
);
3766 if (it2
== pg_pool_sum
.end()) {
3769 const pool_stat_t
*pstat
= &it2
->second
;
3770 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
3771 const string
& pool_name
= osdmap
.get_pool_name(it
.first
);
3772 const pg_pool_t
&pool
= it
.second
;
3774 float warn_threshold
= (float)g_conf
->mon_pool_quota_warn_threshold
/100;
3775 float crit_threshold
= (float)g_conf
->mon_pool_quota_crit_threshold
/100;
3777 if (pool
.quota_max_objects
> 0) {
3779 health_status_t status
= HEALTH_OK
;
3780 if ((uint64_t)sum
.num_objects
>= pool
.quota_max_objects
) {
3781 } else if (crit_threshold
> 0 &&
3782 sum
.num_objects
>= pool
.quota_max_objects
*crit_threshold
) {
3783 ss
<< "pool '" << pool_name
3784 << "' has " << sum
.num_objects
<< " objects"
3785 << " (max " << pool
.quota_max_objects
<< ")";
3786 status
= HEALTH_ERR
;
3787 } else if (warn_threshold
> 0 &&
3788 sum
.num_objects
>= pool
.quota_max_objects
*warn_threshold
) {
3789 ss
<< "pool '" << pool_name
3790 << "' has " << sum
.num_objects
<< " objects"
3791 << " (max " << pool
.quota_max_objects
<< ")";
3792 status
= HEALTH_WARN
;
3794 if (status
!= HEALTH_OK
) {
3795 pair
<health_status_t
,string
> s(status
, ss
.str());
3796 summary
.push_back(s
);
3798 detail
->push_back(s
);
3802 if (pool
.quota_max_bytes
> 0) {
3803 health_status_t status
= HEALTH_OK
;
3805 if ((uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) {
3806 } else if (crit_threshold
> 0 &&
3807 sum
.num_bytes
>= pool
.quota_max_bytes
*crit_threshold
) {
3808 ss
<< "pool '" << pool_name
3809 << "' has " << si_t(sum
.num_bytes
) << " bytes"
3810 << " (max " << si_t(pool
.quota_max_bytes
) << ")";
3811 status
= HEALTH_ERR
;
3812 } else if (warn_threshold
> 0 &&
3813 sum
.num_bytes
>= pool
.quota_max_bytes
*warn_threshold
) {
3814 ss
<< "pool '" << pool_name
3815 << "' has " << si_t(sum
.num_bytes
) << " bytes"
3816 << " (max " << si_t(pool
.quota_max_bytes
) << ")";
3817 status
= HEALTH_WARN
;
3819 if (status
!= HEALTH_OK
) {
3820 pair
<health_status_t
,string
> s(status
, ss
.str());
3821 summary
.push_back(s
);
3823 detail
->push_back(s
);
3828 print_unscrubbed_pgs(pg_stat
, summary
, detail
, cct
);
// process_pg_map_command: answer a "pg ..." / "osd ..." command from the
// supplied PGMap and OSDMap.
//
// orig_prefix and orig_cmdmap are const references; they are copied into the
// locals `prefix` and `cmdmap` so the aliases below can add arguments without
// touching the caller's data. The function then compares `prefix` against
// each supported command in turn: "pg stat", "pg getmap", "pg dump", "pg ls",
// "pg dump_stuck", "pg debug", "osd perf", "osd blocked-by" and
// "osd pool stats".
//
// Output channels as used below: status/error text is streamed into *ss,
// payload is written to *odata, and `f` is a Formatter pointer that the code
// tests (`f` / `!f`) before choosing structured or plain-text output.
//
// NOTE(review): the embedded source-line numbers in this listing are not
// contiguous (e.g. 3836-3839, 3846, 3850 are absent), so some parameter
// declarations, return statements and closing braces are not visible here.
// Confirm details against the complete file before relying on this summary.
3831 int process_pg_map_command(
3832 const string
& orig_prefix
,
3833 const map
<string
,cmd_vartype
>& orig_cmdmap
,
3834 const PGMap
& pg_map
,
3835 const OSDMap
& osdmap
,
3840 string prefix
= orig_prefix
;
3841 map
<string
,cmd_vartype
> cmdmap
= orig_cmdmap
;
3843 // perhaps these would be better in the parsing, but it's weird
3844 bool primary
= false;
// Alias handling: the *_json variants force format=json and preset the
// "dumpcontents" argument ("all" or "pools").
3845 if (prefix
== "pg dump_json") {
3847 v
.push_back(string("all"));
3848 cmd_putval(g_ceph_context
, cmdmap
, "format", string("json"));
3849 cmd_putval(g_ceph_context
, cmdmap
, "dumpcontents", v
);
3851 } else if (prefix
== "pg dump_pools_json") {
3853 v
.push_back(string("pools"));
3854 cmd_putval(g_ceph_context
, cmdmap
, "format", string("json"));
3855 cmd_putval(g_ceph_context
, cmdmap
, "dumpcontents", v
);
3857 } else if (prefix
== "pg ls-by-primary") {
3860 } else if (prefix
== "pg ls-by-osd") {
3862 } else if (prefix
== "pg ls-by-pool") {
// Translate the "poolstr" argument into a pool id and store it as "pool";
// the "does not exist" message is emitted through *ss (the guarding
// condition is not visible in this listing).
3865 cmd_getval(g_ceph_context
, cmdmap
, "poolstr", poolstr
);
3866 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
3868 *ss
<< "pool " << poolstr
<< " does not exist";
3871 cmd_putval(g_ceph_context
, cmdmap
, "pool", pool
);
// "pg stat": one-line summary of the pg map.
3876 if (prefix
== "pg stat") {
3878 f
->open_object_section("pg_summary");
3879 pg_map
.print_oneline_summary(f
, NULL
);
// "pg getmap": encode the whole PGMap into *odata and report its version.
3889 if (prefix
== "pg getmap") {
3890 pg_map
.encode(*odata
);
3891 *ss
<< "got pgmap version " << pg_map
.version
;
// "pg dump": the "dumpcontents" values are copied into `what`, which then
// selects the sections to emit (all, summary/sum, pools, osds, pgs,
// pgs_brief, delta).
3895 if (prefix
== "pg dump") {
3897 vector
<string
> dumpcontents
;
3899 if (cmd_getval(g_ceph_context
, cmdmap
, "dumpcontents", dumpcontents
)) {
3900 copy(dumpcontents
.begin(), dumpcontents
.end(),
3901 inserter(what
, what
.end()));
3906 if (what
.count("all")) {
3907 f
->open_object_section("pg_map");
3910 } else if (what
.count("summary") || what
.count("sum")) {
3911 f
->open_object_section("pg_map");
3912 pg_map
.dump_basic(f
);
3915 if (what
.count("pools")) {
3916 pg_map
.dump_pool_stats(f
);
3918 if (what
.count("osds")) {
3919 pg_map
.dump_osd_stats(f
);
3921 if (what
.count("pgs")) {
3922 pg_map
.dump_pg_stats(f
, false);
3924 if (what
.count("pgs_brief")) {
3925 pg_map
.dump_pg_stats(f
, true);
3927 if (what
.count("delta")) {
3928 f
->open_object_section("delta");
3929 pg_map
.dump_delta(f
);
// Same selection again, this time writing to the stream `ds` rather than the
// Formatter (presumably the branch taken when no Formatter is in use --
// the branching lines are not visible here).
3935 if (what
.count("all")) {
3937 } else if (what
.count("summary") || what
.count("sum")) {
3938 pg_map
.dump_basic(ds
);
3939 pg_map
.dump_pg_sum_stats(ds
, true);
3940 pg_map
.dump_osd_sum_stats(ds
);
3942 if (what
.count("pgs_brief")) {
3943 pg_map
.dump_pg_stats(ds
, true);
3946 if (what
.count("pgs")) {
3947 pg_map
.dump_pg_stats(ds
, false);
3950 if (what
.count("pools")) {
3951 pg_map
.dump_pool_stats(ds
, header
);
3953 if (what
.count("osds")) {
3954 pg_map
.dump_osd_stats(ds
);
3959 *ss
<< "dumped " << what
;
// "pg ls": list pgs filtered by the "pool", "osd" and "states" arguments.
// A non-negative pool id that osdmap does not have, or a non-negative osd
// that is not up, produces an error message in *ss.
3963 if (prefix
== "pg ls") {
3966 vector
<string
>states
;
3968 cmd_getval(g_ceph_context
, cmdmap
, "pool", pool
);
3969 cmd_getval(g_ceph_context
, cmdmap
, "osd", osd
);
3970 cmd_getval(g_ceph_context
, cmdmap
, "states", states
);
3971 if (pool
>= 0 && !osdmap
.have_pg_pool(pool
)) {
3972 *ss
<< "pool " << pool
<< " does not exist";
3975 if (osd
>= 0 && !osdmap
.is_up(osd
)) {
3976 *ss
<< "osd " << osd
<< " is not up";
3980 states
.push_back("all");
// State names are taken from the back of the vector; "all" is special-cased
// and any other name is mapped with pg_string_state(). An unknown name is
// reported together with the full list of valid state names.
3984 while (!states
.empty()) {
3985 string state_str
= states
.back();
3987 if (state_str
== "all") {
3991 auto filter
= pg_string_state(state_str
);
3993 *ss
<< "'" << state_str
<< "' is not a valid pg state,"
3994 << " available choices: " << pg_state_string(0xFFFFFFFF);
4003 pg_map
.get_filtered_pg_stats(state
, pool
, osd
, primary
, pgs
);
4005 if (f
&& !pgs
.empty()) {
4006 pg_map
.dump_filtered_pg_stats(f
, pgs
);
4008 } else if (!pgs
.empty()) {
4009 pg_map
.dump_filtered_pg_stats(ds
, pgs
);
// "pg dump_stuck": falls back to the "unclean" category when no "stuckops"
// were supplied; the "threshold" argument defaults to the
// mon_pg_stuck_threshold option.
4015 if (prefix
== "pg dump_stuck") {
4016 vector
<string
> stuckop_vec
;
4017 cmd_getval(g_ceph_context
, cmdmap
, "stuckops", stuckop_vec
);
4018 if (stuckop_vec
.empty())
4019 stuckop_vec
.push_back("unclean");
4021 cmd_getval(g_ceph_context
, cmdmap
, "threshold", threshold
,
4022 g_conf
->get_val
<int64_t>("mon_pg_stuck_threshold"));
4024 r
= pg_map
.dump_stuck_pg_stats(ds
, f
, (int)threshold
, stuckop_vec
);
// "pg debug": "debugop" defaults to "unfound_objects_exist". Each supported
// op scans pg_map.pg_stat for a pg whose corresponding counter is non-zero.
4033 if (prefix
== "pg debug") {
4035 cmd_getval(g_ceph_context
, cmdmap
, "debugop", debugop
,
4036 string("unfound_objects_exist"));
4037 if (debugop
== "unfound_objects_exist") {
4038 bool unfound_objects_exist
= false;
4039 for (const auto& p
: pg_map
.pg_stat
) {
4040 if (p
.second
.stats
.sum
.num_objects_unfound
> 0) {
4041 unfound_objects_exist
= true;
4045 if (unfound_objects_exist
)
4052 if (debugop
== "degraded_pgs_exist") {
4053 bool degraded_pgs_exist
= false;
4054 for (const auto& p
: pg_map
.pg_stat
) {
4055 if (p
.second
.stats
.sum
.num_objects_degraded
> 0) {
4056 degraded_pgs_exist
= true;
4060 if (degraded_pgs_exist
)
// "osd perf": per-OSD perf stats, either through the Formatter
// (dump_osd_perf_stats) or as text into `ds` (print_osd_perf_stats).
4069 if (prefix
== "osd perf") {
4071 f
->open_object_section("osdstats");
4072 pg_map
.dump_osd_perf_stats(f
);
4076 pg_map
.print_osd_perf_stats(&ds
);
// "osd blocked-by": same two output forms for the blocked-by statistics.
4082 if (prefix
== "osd blocked-by") {
4084 f
->open_object_section("osd_blocked_by");
4085 pg_map
.dump_osd_blocked_by_stats(f
);
4089 pg_map
.print_osd_blocked_by_stats(&ds
);
// "osd pool stats": per-pool recovery, recovery-rate and client-io
// summaries, plus cache-io for pools where is_tier() is true. An optional
// "name" argument is resolved to a pool id; a failed lookup is reported as
// "unrecognized pool".
4095 if (prefix
== "osd pool stats") {
4097 cmd_getval(g_ceph_context
, cmdmap
, "name", pool_name
);
4099 int64_t poolid
= -ENOENT
;
4100 bool one_pool
= false;
4101 if (!pool_name
.empty()) {
4102 poolid
= osdmap
.lookup_pg_pool_name(pool_name
);
4104 assert(poolid
== -ENOENT
);
4105 *ss
<< "unrecognized pool '" << pool_name
<< "'";
4114 f
->open_array_section("pool_stats");
4116 if (osdmap
.get_pools().empty()) {
4117 *ss
<< "there are no pools!";
4122 for (auto& p
: osdmap
.get_pools()) {
4126 pool_name
= osdmap
.get_pool_name(poolid
);
4129 f
->open_object_section("pool");
4130 f
->dump_string("pool_name", pool_name
.c_str());
4131 f
->dump_int("pool_id", poolid
);
4132 f
->open_object_section("recovery");
4137 pg_map
.pool_recovery_summary(f
, &sl
, poolid
);
4138 if (!f
&& !sl
.empty()) {
4140 tss
<< " " << p
<< "\n";
4145 f
->open_object_section("recovery_rate");
4149 pg_map
.pool_recovery_rate_summary(f
, &rss
, poolid
);
4150 if (!f
&& !rss
.str().empty())
4151 tss
<< " recovery io " << rss
.str() << "\n";
4155 f
->open_object_section("client_io_rate");
4160 pg_map
.pool_client_io_rate_summary(f
, &rss
, poolid
);
4161 if (!f
&& !rss
.str().empty())
4162 tss
<< " client io " << rss
.str() << "\n";
4164 // dump cache tier IO rate for cache pool
4165 const pg_pool_t
*pool
= osdmap
.get_pg_pool(poolid
);
4166 if (pool
->is_tier()) {
4169 f
->open_object_section("cache_io_rate");
4174 pg_map
.pool_cache_io_rate_summary(f
, &rss
, poolid
);
4175 if (!f
&& !rss
.str().empty())
4176 tss
<< " cache tier io " << rss
.str() << "\n";
// Plain-text assembly: a "pool <name> id <id>" header followed by whatever
// the per-pool summaries wrote into `tss`, or a "nothing is going on" line.
4182 rs
<< "pool " << pool_name
<< " id " << poolid
<< "\n";
4183 if (!tss
.str().empty())
4184 rs
<< tss
.str() << "\n";
4186 rs
<< " nothing is going on\n\n";
4198 odata
->append(rs
.str());
// Apply the OSD changes carried by one OSDMap::Incremental to pending_inc.
//  - each osd whose new weight equals CEPH_OSD_OUT and that has an entry in
//    pg_map->osd_epochs gets stat_osd_out() queued with that epoch;
//  - each osd whose new_state has CEPH_OSD_UP set is added to
//    *need_check_down_pg_osds, removed from *last_osd_report, and gets
//    stat_osd_down_up() queued;
//  - each osd whose new_state has CEPH_OSD_EXISTS set gets rm_stat() queued
//    and is erased from pg_map's nearfull_osds and full_osds sets.
// NOTE(review): embedded line 4209 is absent from this listing; it presumably
// declares the `pg_map` pointer used below -- confirm against the full file.
4206 void PGMapUpdater::check_osd_map(const OSDMap::Incremental
&osd_inc
,
4207 std::set
<int> *need_check_down_pg_osds
,
4208 std::map
<int,utime_t
> *last_osd_report
,
4210 PGMap::Incremental
*pending_inc
)
4212 for (const auto &p
: osd_inc
.new_weight
) {
4213 if (p
.second
== CEPH_OSD_OUT
) {
4214 dout(10) << __func__
<< " osd." << p
.first
<< " went OUT" << dendl
;
4215 auto j
= pg_map
->osd_epochs
.find(p
.first
);
4216 if (j
!= pg_map
->osd_epochs
.end())
4217 pending_inc
->stat_osd_out(p
.first
, j
->second
);
4221 // this is conservative: we want to know if any osds (maybe) got marked down.
4222 for (const auto &p
: osd_inc
.new_state
) {
4223 if (p
.second
& CEPH_OSD_UP
) { // true if marked up OR down,
4224 // but we're too lazy to check
4226 need_check_down_pg_osds
->insert(p
.first
);
4228 // clear out the last_osd_report for this OSD
4229 auto report
= last_osd_report
->find(p
.first
);
4230 if (report
!= last_osd_report
->end()) {
4231 last_osd_report
->erase(report
);
4234 // clear out osd_stat slow request histogram
4235 dout(20) << __func__
<< " clearing osd." << p
.first
4236 << " request histogram" << dendl
;
4237 pending_inc
->stat_osd_down_up(p
.first
, osd_inc
.epoch
, *pg_map
);
4240 if (p
.second
& CEPH_OSD_EXISTS
) {
4241 // whether it was created *or* destroyed, we can safely drop
4242 // its osd_stat_t record.
4243 dout(10) << __func__
<< " osd." << p
.first
4244 << " created or destroyed" << dendl
;
4245 pending_inc
->rm_stat(p
.first
);
4247 // and adjust full, nearfull set
4248 pg_map
->nearfull_osds
.erase(p
.first
);
4249 pg_map
->full_osds
.erase(p
.first
);
// Reconcile a PGMap with a full OSDMap, queueing the differences in
// pending_inc. Three passes are visible below: osd stats, pools that have
// disappeared, and pgs that are missing for existing pools.
// NOTE(review): embedded lines 4255 and 4257 are absent from this listing;
// they presumably declare the `cct` and `pgmap` parameters used below --
// confirm against the full file.
4254 void PGMapUpdater::check_osd_map(
4256 const OSDMap
& osdmap
,
4258 PGMap::Incremental
*pending_inc
)
// Pass 1: for every osd with recorded stats -- queue rm_stat() if the osd no
// longer exists; queue stat_osd_out() if it is out, still reports kb != 0 and
// has an osd_epochs entry; queue stat_osd_down_up() if it is not up and its
// op_queue_age_hist is non-empty.
4260 for (auto& p
: pgmap
.osd_stat
) {
4261 if (!osdmap
.exists(p
.first
)) {
4263 pending_inc
->rm_stat(p
.first
);
4264 } else if (osdmap
.is_out(p
.first
)) {
4266 if (p
.second
.kb
!= 0) {
4267 auto j
= pgmap
.osd_epochs
.find(p
.first
);
4268 if (j
!= pgmap
.osd_epochs
.end()) {
4269 pending_inc
->stat_osd_out(p
.first
, j
->second
);
4272 } else if (!osdmap
.is_up(p
.first
)) {
4273 // zero the op_queue_age_hist
4274 if (!p
.second
.op_queue_age_hist
.empty()) {
4275 pending_inc
->stat_osd_down_up(p
.first
, osdmap
.get_epoch(), pgmap
);
4280 // deleted pgs (pools)?
4281 for (auto& p
: pgmap
.pg_pool_sum
) {
4282 if (!osdmap
.have_pg_pool(p
.first
)) {
4283 ldout(cct
, 10) << __func__
<< " pool " << p
.first
<< " gone, removing pgs"
// Queue removal of every known pg that belongs to the vanished pool.
4285 for (auto& q
: pgmap
.pg_stat
) {
4286 if (q
.first
.pool() == (uint64_t)p
.first
) {
4287 pending_inc
->pg_remove
.insert(q
.first
);
// Also discard pending stat updates that belong to the vanished pool; the
// iterator returned by erase() is used to continue the walk.
4290 auto q
= pending_inc
->pg_stat_updates
.begin();
4291 while (q
!= pending_inc
->pg_stat_updates
.end()) {
4292 if (q
->first
.pool() == (uint64_t)p
.first
) {
4293 q
= pending_inc
->pg_stat_updates
.erase(q
);
4301 // new pgs (split or new pool)?
4302 for (auto& p
: osdmap
.get_pools()) {
4303 int64_t poolid
= p
.first
;
4304 const pg_pool_t
& pi
= p
.second
;
4305 auto q
= pgmap
.num_pg_by_pool
.find(poolid
);
// my_pg_num stays 0 when the PGMap has no count recorded for this pool.
4306 unsigned my_pg_num
= 0;
4307 if (q
!= pgmap
.num_pg_by_pool
.end())
4308 my_pg_num
= q
->second
;
4309 unsigned pg_num
= pi
.get_pg_num();
4310 if (my_pg_num
!= pg_num
) {
4311 ldout(cct
,10) << __func__
<< " pool " << poolid
<< " pg_num " << pg_num
4312 << " != my pg_num " << my_pg_num
<< dendl
;
// Seed a pending pg_stat_t for each ps in [my_pg_num, pg_num) that has no
// pending update yet, with every timestamp set to the osdmap's modification
// time.
4313 for (unsigned ps
= my_pg_num
; ps
< pg_num
; ++ps
) {
4314 pg_t
pgid(ps
, poolid
);
4315 if (pending_inc
->pg_stat_updates
.count(pgid
) == 0) {
4316 ldout(cct
,20) << __func__
<< " adding " << pgid
<< dendl
;
4317 pg_stat_t
&stats
= pending_inc
->pg_stat_updates
[pgid
];
4318 stats
.last_fresh
= osdmap
.get_modified();
4319 stats
.last_active
= osdmap
.get_modified();
4320 stats
.last_change
= osdmap
.get_modified();
4321 stats
.last_peered
= osdmap
.get_modified();
4322 stats
.last_clean
= osdmap
.get_modified();
4323 stats
.last_unstale
= osdmap
.get_modified();
4324 stats
.last_undegraded
= osdmap
.get_modified();
4325 stats
.last_fullsized
= osdmap
.get_modified();
4326 stats
.last_scrub_stamp
= osdmap
.get_modified();
4327 stats
.last_deep_scrub_stamp
= osdmap
.get_modified();
4328 stats
.last_clean_scrub_stamp
= osdmap
.get_modified();
// Queue creation of pg `pgid` in pending_inc.
//
// A parent candidate is derived by clearing the most significant set bit of
// the candidate's ps and looking it up in pg_map.pg_stat; a hit whose state
// is not exactly PG_STATE_CREATING is logged as the parent. The pending
// pg_stat_t for `pgid` is then set to PG_STATE_CREATING with created and
// mapping_epoch taken from `epoch`, its timestamps are filled in (from the
// parent's stats and/or osd_map.get_modified()), and the up/acting mapping is
// computed via osd_map.pg_to_up_acting_osds().
//
// NOTE(review): embedded lines such as 4338, 4341-4343, 4345-4347, 4350-4351
// and 4384 are absent from this listing, so the declarations of `parent` and
// `split_bits`, the loop around the parent search, and the branch separating
// the two timestamp groups are not visible -- confirm against the full file.
4335 void PGMapUpdater::register_pg(
4336 const OSDMap
&osd_map
,
4337 pg_t pgid
, epoch_t epoch
,
4339 const PGMap
&pg_map
,
4340 PGMap::Incremental
*pending_inc
)
// end() doubles as the "no parent found" marker tested further down.
4344 auto parent_stat
= pg_map
.pg_stat
.end();
4348 // remove most significant bit
4349 int msb
= cbits(parent
.ps());
4352 parent
.set_ps(parent
.ps() & ~(1<<(msb
-1)));
4354 dout(30) << " is " << pgid
<< " parent " << parent
<< " ?" << dendl
;
4355 parent_stat
= pg_map
.pg_stat
.find(parent
);
4356 if (parent_stat
!= pg_map
.pg_stat
.end() &&
4357 parent_stat
->second
.state
!= PG_STATE_CREATING
) {
4358 dout(10) << " parent is " << parent
<< dendl
;
// operator[] creates the pending entry if it does not exist yet.
4364 pg_stat_t
&stats
= pending_inc
->pg_stat_updates
[pgid
];
4365 stats
.state
= PG_STATE_CREATING
;
4366 stats
.created
= epoch
;
4367 stats
.parent
= parent
;
4368 stats
.parent_split_bits
= split_bits
;
4369 stats
.mapping_epoch
= epoch
;
// With a parent: inherit all of the parent's timestamps.
4371 if (parent_stat
!= pg_map
.pg_stat
.end()) {
4372 const pg_stat_t
&ps
= parent_stat
->second
;
4373 stats
.last_fresh
= ps
.last_fresh
;
4374 stats
.last_active
= ps
.last_active
;
4375 stats
.last_change
= ps
.last_change
;
4376 stats
.last_peered
= ps
.last_peered
;
4377 stats
.last_clean
= ps
.last_clean
;
4378 stats
.last_unstale
= ps
.last_unstale
;
4379 stats
.last_undegraded
= ps
.last_undegraded
;
4380 stats
.last_fullsized
= ps
.last_fullsized
;
4381 stats
.last_scrub_stamp
= ps
.last_scrub_stamp
;
4382 stats
.last_deep_scrub_stamp
= ps
.last_deep_scrub_stamp
;
4383 stats
.last_clean_scrub_stamp
= ps
.last_clean_scrub_stamp
;
// Timestamps taken from the osdmap's modification time (presumably the
// no-parent branch; the separating `else` is not visible in this listing).
4385 utime_t now
= osd_map
.get_modified();
4386 stats
.last_fresh
= now
;
4387 stats
.last_active
= now
;
4388 stats
.last_change
= now
;
4389 stats
.last_peered
= now
;
4390 stats
.last_clean
= now
;
4391 stats
.last_unstale
= now
;
4392 stats
.last_undegraded
= now
;
4393 stats
.last_fullsized
= now
;
4394 stats
.last_scrub_stamp
= now
;
4395 stats
.last_deep_scrub_stamp
= now
;
4396 stats
.last_clean_scrub_stamp
= now
;
4399 osd_map
.pg_to_up_acting_osds(
4404 &stats
.acting_primary
);
// Log the decision; the second message adds the parent and split-bit count.
4406 if (split_bits
== 0) {
4407 dout(10) << __func__
<< " will create " << pgid
4408 << " primary " << stats
.acting_primary
4409 << " acting " << stats
.acting
4412 dout(10) << __func__
<< " will create " << pgid
4413 << " primary " << stats
.acting_primary
4414 << " acting " << stats
.acting
4415 << " parent " << parent
4416 << " by " << split_bits
<< " bits"
// Scan the pools of osd_map, register pgs that pg_map does not know yet, and
// queue removal of obsolete pg entries.
//  - for each pool, the crush rule is resolved with find_rule() and checked
//    with rule_exists();
//  - a pool whose last_change is not newer than pg_map.last_pg_scan or
//    pending_inc->pg_scan is logged as "no change in pool";
//  - for scanned pools, each ps in [0, pg_num) is looked up in
//    pg_map.pg_stat and register_pg() is called for the pool's pgs;
//  - entries of pg_map.creating_pgs and pg_map.pg_stat are inserted into
//    pending_inc->pg_remove when they are localized (preferred() >= 0) or
//    their pool no longer exists in osd_map.
// Finally pending_inc->pg_scan is set to the osdmap epoch.
// NOTE(review): several embedded lines (e.g. 4429-4430, 4439-4440,
// 4444-4445, 4457-4459) are absent from this listing, so the `continue`
// statements and the `created`/`removed` counters are not visible -- confirm
// against the full file.
4421 void PGMapUpdater::register_new_pgs(
4422 const OSDMap
&osd_map
,
4423 const PGMap
&pg_map
,
4424 PGMap::Incremental
*pending_inc
)
4426 epoch_t epoch
= osd_map
.get_epoch();
4427 dout(10) << __func__
<< " checking pg pools for osdmap epoch " << epoch
4428 << ", last_pg_scan " << pg_map
.last_pg_scan
<< dendl
;
4431 const auto &pools
= osd_map
.get_pools();
4433 for (const auto &p
: pools
) {
4434 int64_t poolid
= p
.first
;
4435 const pg_pool_t
&pool
= p
.second
;
4436 int ruleno
= osd_map
.crush
->find_rule(pool
.get_crush_rule(),
4437 pool
.get_type(), pool
.get_size());
4438 if (ruleno
< 0 || !osd_map
.crush
->rule_exists(ruleno
))
4441 if (pool
.get_last_change() <= pg_map
.last_pg_scan
||
4442 pool
.get_last_change() <= pending_inc
->pg_scan
) {
4443 dout(10) << " no change in pool " << poolid
<< " " << pool
<< dendl
;
4447 dout(10) << __func__
<< " scanning pool " << poolid
4448 << " " << pool
<< dendl
;
4450 // first pgs in this pool
4451 bool new_pool
= pg_map
.pg_pool_sum
.count(poolid
) == 0;
4453 for (ps_t ps
= 0; ps
< pool
.get_pg_num(); ps
++) {
4454 pg_t
pgid(ps
, poolid
, -1);
4455 if (pg_map
.pg_stat
.count(pgid
)) {
4456 dout(20) << "register_new_pgs have " << pgid
<< dendl
;
4460 register_pg(osd_map
, pgid
, pool
.get_last_change(), new_pool
,
4461 pg_map
, pending_inc
);
// Drop creating pgs that are localized or whose pool has been deleted.
4466 for (const auto &p
: pg_map
.creating_pgs
) {
4467 if (p
.preferred() >= 0) {
4468 dout(20) << " removing creating_pg " << p
4469 << " because it is localized and obsolete" << dendl
;
4470 pending_inc
->pg_remove
.insert(p
);
4472 } else if (!osd_map
.have_pg_pool(p
.pool())) {
4473 dout(20) << " removing creating_pg " << p
4474 << " because containing pool deleted" << dendl
;
4475 pending_inc
->pg_remove
.insert(p
);
// Same cleanup for every pg that has stats recorded.
4481 for (const auto &p
: pg_map
.pg_stat
) {
4482 if (!osd_map
.have_pg_pool(p
.first
.pool())) {
4483 dout(20) << " removing pg_stat " << p
.first
<< " because "
4484 << "containing pool deleted" << dendl
;
4485 pending_inc
->pg_remove
.insert(p
.first
);
4487 } else if (p
.first
.preferred() >= 0) {
4488 dout(20) << " removing localized pg " << p
.first
<< dendl
;
4489 pending_inc
->pg_remove
.insert(p
.first
);
4494 // we don't want to redo this work if we can avoid it.
4495 pending_inc
->pg_scan
= epoch
;
4497 dout(10) << "register_new_pgs registered " << created
<< " new pgs, removed "
4498 << removed
<< " uncreated pgs" << dendl
;
// Re-evaluate the up/acting mapping of every pg in pg_map.creating_pgs
// against osd_map and refresh the pending stats where it changed.
//
// For each creating pg the recorded pg_stat_t must exist (asserted). The
// mapping is recomputed with osd_map.pg_to_up_acting_osds() and compared
// with the recorded up_primary, acting and acting_primary. On a difference
// the pending entry pending_inc->pg_stat_updates[pgid] is updated, but only
// if the osdmap epoch is newer than that entry's reported_epoch; otherwise a
// "has pending update from newer epoch" message is logged. mapping_epoch is
// set to the osdmap epoch when the acting primary differs from the pending
// entry's.
//
// NOTE(review): embedded lines such as 4514-4516, 4522, 4527-4533, 4549 and
// 4554-4555 are absent from this listing, so the handling of pgs with
// parent_split_bits set, the arguments of pg_to_up_acting_osds(), and the
// first-time initialisation of the pending entry are not visible -- confirm
// against the full file.
4502 void PGMapUpdater::update_creating_pgs(
4503 const OSDMap
&osd_map
,
4504 const PGMap
&pg_map
,
4505 PGMap::Incremental
*pending_inc
)
4507 dout(10) << __func__
<< " to " << pg_map
.creating_pgs
.size()
4508 << " pgs, osdmap epoch " << osd_map
.get_epoch()
4511 unsigned changed
= 0;
4512 for (auto p
= pg_map
.creating_pgs
.begin();
4513 p
!= pg_map
.creating_pgs
.end();
4517 auto q
= pg_map
.pg_stat
.find(pgid
);
4518 assert(q
!= pg_map
.pg_stat
.end());
4519 const pg_stat_t
*s
= &q
->second
;
4521 if (s
->parent_split_bits
)
4524 vector
<int> up
, acting
;
4525 int up_primary
, acting_primary
;
4526 osd_map
.pg_to_up_acting_osds(
4534 up_primary
!= s
->up_primary
||
4535 acting
!= s
->acting
||
4536 acting_primary
!= s
->acting_primary
) {
// operator[] creates the pending entry if none exists yet.
4537 pg_stat_t
*ns
= &pending_inc
->pg_stat_updates
[pgid
];
4538 if (osd_map
.get_epoch() > ns
->reported_epoch
) {
4539 dout(20) << __func__
<< " " << pgid
<< " "
4540 << " acting_primary: " << s
->acting_primary
4541 << " -> " << acting_primary
4542 << " acting: " << s
->acting
<< " -> " << acting
4543 << " up_primary: " << s
->up_primary
<< " -> " << up_primary
4544 << " up: " << s
->up
<< " -> " << up
4547 // only initialize if it wasn't already a pending update
4548 if (ns
->reported_epoch
== 0)
4551 // note epoch if the target of the create message changed
4552 if (acting_primary
!= ns
->acting_primary
)
4553 ns
->mapping_epoch
= osd_map
.get_epoch();
4556 ns
->up_primary
= up_primary
;
4557 ns
->acting
= acting
;
4558 ns
->acting_primary
= acting_primary
;
4562 dout(20) << __func__
<< " " << pgid
<< " has pending update from newer"
4563 << " epoch " << ns
->reported_epoch
4569 dout(10) << __func__
<< " " << changed
<< " pgs changed primary" << dendl
;
// Mark pg `pgid` as stale in pending_inc when its acting primary is down.
//
// Nothing happens unless `cur` is not already PG_STATE_STALE, has an acting
// primary (!= -1) and osdmap reports that primary as down. If a pending
// update for the pg already exists it is reused when it names the same
// acting primary as `cur`, or when it is itself non-stale with an acting
// primary that is down. The selected entry gets PG_STATE_STALE or-ed into
// its state and last_unstale stamped with ceph_clock_now().
//
// NOTE(review): embedded lines 4575, 4582, 4590, 4592-4594 and 4596 are
// absent from this listing, so the `pgid` parameter, the declaration of
// `newstat`, the early-exit for an unsuitable pending update, and the branch
// that creates a fresh pending entry are only partly visible -- confirm
// against the full file.
4573 static void _try_mark_pg_stale(
4574 const OSDMap
& osdmap
,
4576 const pg_stat_t
& cur
,
4577 PGMap::Incremental
*pending_inc
)
4579 if ((cur
.state
& PG_STATE_STALE
) == 0 &&
4580 cur
.acting_primary
!= -1 &&
4581 osdmap
.is_down(cur
.acting_primary
)) {
4583 auto q
= pending_inc
->pg_stat_updates
.find(pgid
);
4584 if (q
!= pending_inc
->pg_stat_updates
.end()) {
4585 if ((q
->second
.acting_primary
== cur
.acting_primary
) ||
4586 ((q
->second
.state
& PG_STATE_STALE
) == 0 &&
4587 q
->second
.acting_primary
!= -1 &&
4588 osdmap
.is_down(q
->second
.acting_primary
))) {
4589 newstat
= &q
->second
;
4591 // pending update is no longer down or already stale
// No pending update found: operator[] creates one for this pg (presumably
// the else-branch of the lookup above; the `else` line is not visible).
4595 newstat
= &pending_inc
->pg_stat_updates
[pgid
];
4598 dout(10) << __func__
<< " marking pg " << pgid
4599 << " stale (acting_primary " << newstat
->acting_primary
4601 newstat
->state
|= PG_STATE_STALE
;
4602 newstat
->last_unstale
= ceph_clock_now();
// Mark pgs stale whose acting primary sits on a down osd, by calling
// _try_mark_pg_stale() for candidate pgs.
//
// Two walks are visible below: one over every entry of pg_map.pg_stat, and
// one that, for each osd in need_check_down_pg_osds that osdmap reports as
// down, visits only the pgs listed for that osd in pg_map.pg_by_osd. The
// size of need_check_down_pg_osds is compared against
// get_num_osds() * mon_pg_check_down_all_threshold before the walks.
//
// NOTE(review): embedded lines 4609, 4614, 4617-4620, 4623-4624 and
// 4629-4630 are absent from this listing, so the statement that chooses
// between the two walks is not visible (per the existing comment, a large
// number of changed osds selects the full walk) -- confirm against the full
// file.
4606 void PGMapUpdater::check_down_pgs(
4607 const OSDMap
&osdmap
,
4608 const PGMap
&pg_map
,
4610 const set
<int>& need_check_down_pg_osds
,
4611 PGMap::Incremental
*pending_inc
)
4613 // if a large number of osds changed state, just iterate over the whole
4615 if (need_check_down_pg_osds
.size() > (unsigned)osdmap
.get_num_osds() *
4616 g_conf
->get_val
<double>("mon_pg_check_down_all_threshold")) {
// Full walk: every pg with recorded stats is a candidate.
4621 for (const auto& p
: pg_map
.pg_stat
) {
4622 _try_mark_pg_stale(osdmap
, p
.first
, p
.second
, pending_inc
);
// Targeted walk: only pgs indexed under osds that are down.
4625 for (auto osd
: need_check_down_pg_osds
) {
4626 if (osdmap
.is_down(osd
)) {
4627 auto p
= pg_map
.pg_by_osd
.find(osd
);
4628 if (p
== pg_map
.pg_by_osd
.end()) {
4631 for (auto pgid
: p
->second
) {
// at() throws if pg_by_osd references a pg without stats; the assert
// requires the indexed osd to be that pg's acting primary.
4632 const pg_stat_t
&stat
= pg_map
.pg_stat
.at(pgid
);
4633 assert(stat
.acting_primary
== osd
);
4634 _try_mark_pg_stale(osdmap
, pgid
, stat
, pending_inc
);
4641 int reweight::by_utilization(
4642 const OSDMap
&osdmap
,
4647 bool by_pg
, const set
<int64_t> *pools
,
4649 mempool::osdmap::map
<int32_t, uint32_t>* new_weights
,
4650 std::stringstream
*ss
,
4651 std::string
*out_str
,
4655 *ss
<< "You must give a percentage higher than 100. "
4656 "The reweighting threshold will be calculated as <average-utilization> "
4657 "times <input-percentage>. For example, an argument of 200 would "
4658 "reweight OSDs which are twice as utilized as the average OSD.\n";
4662 vector
<int> pgs_by_osd(osdmap
.get_max_osd());
4664 // Avoid putting a small number (or 0) in the denominator when calculating
4666 double average_util
;
4669 double weight_sum
= 0.0; // sum up the crush weights
4670 unsigned num_pg_copies
= 0;
4672 for (const auto& pg
: pgm
.pg_stat
) {
4673 if (pools
&& pools
->count(pg
.first
.pool()) == 0)
4675 for (const auto acting
: pg
.second
.acting
) {
4676 if (!osdmap
.exists(acting
)) {
4679 if (acting
>= (int)pgs_by_osd
.size())
4680 pgs_by_osd
.resize(acting
);
4681 if (pgs_by_osd
[acting
] == 0) {
4682 if (osdmap
.crush
->get_item_weightf(acting
) <= 0) {
4683 //skip if we currently can not identify item
4686 weight_sum
+= osdmap
.crush
->get_item_weightf(acting
);
4689 ++pgs_by_osd
[acting
];
4694 if (!num_osds
|| (num_pg_copies
/ num_osds
< g_conf
->mon_reweight_min_pgs_per_osd
)) {
4695 *ss
<< "Refusing to reweight: we only have " << num_pg_copies
4696 << " PGs across " << num_osds
<< " osds!\n";
4700 average_util
= (double)num_pg_copies
/ weight_sum
;
4702 // by osd utilization
4703 int num_osd
= MAX(1, pgm
.osd_stat
.size());
4704 if ((uint64_t)pgm
.osd_sum
.kb
* 1024 / num_osd
4705 < g_conf
->mon_reweight_min_bytes_per_osd
) {
4706 *ss
<< "Refusing to reweight: we only have " << pgm
.osd_sum
.kb
4707 << " kb across all osds!\n";
4710 if ((uint64_t)pgm
.osd_sum
.kb_used
* 1024 / num_osd
4711 < g_conf
->mon_reweight_min_bytes_per_osd
) {
4712 *ss
<< "Refusing to reweight: we only have " << pgm
.osd_sum
.kb_used
4713 << " kb used across all osds!\n";
4717 average_util
= (double)pgm
.osd_sum
.kb_used
/ (double)pgm
.osd_sum
.kb
;
4720 // adjust down only if we are above the threshold
4721 const double overload_util
= average_util
* (double)oload
/ 100.0;
4723 // but aggressively adjust weights up whenever possible.
4724 const double underload_util
= average_util
;
4726 const unsigned max_change
= (unsigned)(max_changef
* (double)0x10000);
4730 f
->open_object_section("reweight_by_utilization");
4731 f
->dump_int("overload_min", oload
);
4732 f
->dump_float("max_change", max_changef
);
4733 f
->dump_int("max_change_osds", max_osds
);
4734 f
->dump_float("average_utilization", average_util
);
4735 f
->dump_float("overload_utilization", overload_util
);
4737 oss
<< "oload " << oload
<< "\n";
4738 oss
<< "max_change " << max_changef
<< "\n";
4739 oss
<< "max_change_osds " << max_osds
<< "\n";
4741 oss
<< "average_utilization " << std::fixed
<< average_util
<< "\n";
4742 oss
<< "overload_utilization " << overload_util
<< "\n";
4744 int num_changed
= 0;
4746 // precompute util for each OSD
4747 std::vector
<std::pair
<int, float> > util_by_osd
;
4748 for (const auto& p
: pgm
.osd_stat
) {
4749 std::pair
<int, float> osd_util
;
4750 osd_util
.first
= p
.first
;
4752 if (p
.first
>= (int)pgs_by_osd
.size() ||
4753 pgs_by_osd
[p
.first
] == 0) {
4754 // skip if this OSD does not contain any pg
4755 // belonging to the specified pool(s).
4759 if (osdmap
.crush
->get_item_weightf(p
.first
) <= 0) {
4760 // skip if we are unable to locate item.
4764 osd_util
.second
= pgs_by_osd
[p
.first
] / osdmap
.crush
->get_item_weightf(p
.first
);
4766 osd_util
.second
= (double)p
.second
.kb_used
/ (double)p
.second
.kb
;
4768 util_by_osd
.push_back(osd_util
);
4771 // sort by absolute deviation from the mean utilization,
4772 // in descending order.
4773 std::sort(util_by_osd
.begin(), util_by_osd
.end(),
4774 [average_util
](std::pair
<int, float> l
, std::pair
<int, float> r
) {
4775 return abs(l
.second
- average_util
) > abs(r
.second
- average_util
);
4780 f
->open_array_section("reweights");
4782 for (const auto& p
: util_by_osd
) {
4783 unsigned weight
= osdmap
.get_weight(p
.first
);
4785 // skip if OSD is currently out
4788 float util
= p
.second
;
4790 if (util
>= overload_util
) {
4791 // Assign a lower weight to overloaded OSDs. The current weight
4792 // is a factor to take into account the original weights,
4793 // to represent e.g. differing storage capacities
4794 unsigned new_weight
= (unsigned)((average_util
/ util
) * (float)weight
);
4795 if (weight
> max_change
)
4796 new_weight
= MAX(new_weight
, weight
- max_change
);
4797 new_weights
->insert({p
.first
, new_weight
});
4799 f
->open_object_section("osd");
4800 f
->dump_int("osd", p
.first
);
4801 f
->dump_float("weight", (float)weight
/ (float)0x10000);
4802 f
->dump_float("new_weight", (float)new_weight
/ (float)0x10000);
4805 oss
<< "osd." << p
.first
<< " weight "
4806 << (float)weight
/ (float)0x10000 << " -> "
4807 << (float)new_weight
/ (float)0x10000 << "\n";
4809 if (++num_changed
>= max_osds
)
4812 if (!no_increasing
&& util
<= underload_util
) {
4813 // assign a higher weight.. if we can.
4814 unsigned new_weight
= (unsigned)((average_util
/ util
) * (float)weight
);
4815 new_weight
= MIN(new_weight
, weight
+ max_change
);
4816 if (new_weight
> 0x10000)
4817 new_weight
= 0x10000;
4818 if (new_weight
> weight
) {
4819 new_weights
->insert({p
.first
, new_weight
});
4820 oss
<< "osd." << p
.first
<< " weight "
4821 << (float)weight
/ (float)0x10000 << " -> "
4822 << (float)new_weight
/ (float)0x10000 << "\n";
4823 if (++num_changed
>= max_osds
)
4833 newmap
.deepish_copy_from(osdmap
);
4834 OSDMap::Incremental newinc
;
4835 newinc
.fsid
= newmap
.get_fsid();
4836 newinc
.epoch
= newmap
.get_epoch() + 1;
4837 newinc
.new_weight
= *new_weights
;
4838 newmap
.apply_incremental(newinc
);
4840 osdmap
.summarize_mapping_stats(&newmap
, pools
, out_str
, f
);
4846 *out_str
+= oss
.str();