1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
6 #define dout_subsys ceph_subsys_mon
7 #include "common/debug.h"
8 #include "common/Formatter.h"
9 #include "include/ceph_features.h"
10 #include "include/stringify.h"
12 #include "osd/osd_types.h"
13 #include "osd/OSDMap.h"
15 #define dout_context g_ceph_context
17 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMapDigest
, pgmap_digest
, pgmap
);
18 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap
, pgmap
, pgmap
);
19 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap::Incremental
, pgmap_inc
, pgmap
);
22 // ---------------------
25 void PGMapDigest::encode(bufferlist
& bl
, uint64_t features
) const
27 // NOTE: see PGMap::encode_digest
28 ENCODE_START(1, 1, bl
);
30 ::encode(num_pg_active
, bl
);
31 ::encode(num_pg_unknown
, bl
);
32 ::encode(num_osd
, bl
);
33 ::encode(pg_pool_sum
, bl
, features
);
34 ::encode(pg_sum
, bl
, features
);
35 ::encode(osd_sum
, bl
);
36 ::encode(num_pg_by_state
, bl
);
37 ::encode(num_pg_by_osd
, bl
);
38 ::encode(num_pg_by_pool
, bl
);
39 ::encode(osd_last_seq
, bl
);
40 ::encode(per_pool_sum_delta
, bl
, features
);
41 ::encode(per_pool_sum_deltas_stamps
, bl
);
42 ::encode(pg_sum_delta
, bl
, features
);
43 ::encode(stamp_delta
, bl
);
44 ::encode(avail_space_by_rule
, bl
);
48 void PGMapDigest::decode(bufferlist::iterator
& p
)
52 ::decode(num_pg_active
, p
);
53 ::decode(num_pg_unknown
, p
);
55 ::decode(pg_pool_sum
, p
);
58 ::decode(num_pg_by_state
, p
);
59 ::decode(num_pg_by_osd
, p
);
60 ::decode(num_pg_by_pool
, p
);
61 ::decode(osd_last_seq
, p
);
62 ::decode(per_pool_sum_delta
, p
);
63 ::decode(per_pool_sum_deltas_stamps
, p
);
64 ::decode(pg_sum_delta
, p
);
65 ::decode(stamp_delta
, p
);
66 ::decode(avail_space_by_rule
, p
);
70 void PGMapDigest::dump(Formatter
*f
) const
72 f
->dump_unsigned("num_pg", num_pg
);
73 f
->dump_unsigned("num_pg_active", num_pg_active
);
74 f
->dump_unsigned("num_pg_unknown", num_pg_unknown
);
75 f
->dump_unsigned("num_osd", num_osd
);
76 f
->dump_object("pool_sum", pg_sum
);
77 f
->dump_object("osd_sum", osd_sum
);
78 f
->open_array_section("pool_stats");
79 for (auto& p
: pg_pool_sum
) {
80 f
->open_object_section("pool_stat");
81 f
->dump_int("poolid", p
.first
);
82 auto q
= num_pg_by_pool
.find(p
.first
);
83 if (q
!= num_pg_by_pool
.end())
84 f
->dump_unsigned("num_pg", q
->second
);
89 f
->open_array_section("osd_stats");
91 // TODO: this isn't really correct since we can dump non-existent OSDs
92 // I dunno what osd_last_seq is set to in that case...
93 for (auto& p
: osd_last_seq
) {
94 f
->open_object_section("osd_stat");
95 f
->dump_int("osd", i
);
96 f
->dump_unsigned("seq", p
);
101 f
->open_array_section("num_pg_by_state");
102 for (auto& p
: num_pg_by_state
) {
103 f
->open_object_section("count");
104 f
->dump_string("state", pg_state_string(p
.first
));
105 f
->dump_unsigned("num", p
.second
);
109 f
->open_array_section("num_pg_by_osd");
110 for (auto& p
: num_pg_by_osd
) {
111 f
->open_object_section("count");
112 f
->dump_unsigned("osd", p
.first
);
113 f
->dump_unsigned("num_primary_pg", p
.second
.primary
);
114 f
->dump_unsigned("num_acting_pg", p
.second
.acting
);
115 f
->dump_unsigned("num_up_pg", p
.second
.up
);
121 void PGMapDigest::generate_test_instances(list
<PGMapDigest
*>& ls
)
123 ls
.push_back(new PGMapDigest
);
126 inline std::string
percentify(const float& a
) {
127 std::stringstream ss
;
131 ss
<< std::fixed
<< std::setprecision(2) << a
;
135 void PGMapDigest::print_summary(Formatter
*f
, ostream
*out
) const
138 f
->open_array_section("pgs_by_state");
140 // list is descending numeric order (by count)
141 multimap
<int,int> state_by_count
; // count -> state
142 for (auto p
= num_pg_by_state
.begin();
143 p
!= num_pg_by_state
.end();
145 state_by_count
.insert(make_pair(p
->second
, p
->first
));
148 for (auto p
= state_by_count
.rbegin();
149 p
!= state_by_count
.rend();
152 f
->open_object_section("pgs_by_state_element");
153 f
->dump_string("state_name", pg_state_string(p
->second
));
154 f
->dump_unsigned("count", p
->first
);
162 f
->dump_unsigned("num_pgs", num_pg
);
163 f
->dump_unsigned("num_pools", pg_pool_sum
.size());
164 f
->dump_unsigned("num_objects", pg_sum
.stats
.sum
.num_objects
);
165 f
->dump_unsigned("data_bytes", pg_sum
.stats
.sum
.num_bytes
);
166 f
->dump_unsigned("bytes_used", osd_sum
.kb_used
* 1024ull);
167 f
->dump_unsigned("bytes_avail", osd_sum
.kb_avail
* 1024ull);
168 f
->dump_unsigned("bytes_total", osd_sum
.kb
* 1024ull);
170 *out
<< " pools: " << pg_pool_sum
.size() << " pools, "
171 << num_pg
<< " pgs\n";
172 *out
<< " objects: " << si_t(pg_sum
.stats
.sum
.num_objects
) << " objects, "
173 << prettybyte_t(pg_sum
.stats
.sum
.num_bytes
) << "\n";
175 << kb_t(osd_sum
.kb_used
) << " used, "
176 << kb_t(osd_sum
.kb_avail
) << " / "
177 << kb_t(osd_sum
.kb
) << " avail\n";
183 if (num_pg_unknown
> 0) {
184 float p
= (float)num_pg_unknown
/ (float)num_pg
;
186 f
->dump_float("unknown_pgs_ratio", p
);
189 snprintf(b
, sizeof(b
), "%.3lf", p
* 100.0);
190 *out
<< b
<< "% pgs unknown\n";
195 int num_pg_inactive
= num_pg
- num_pg_active
- num_pg_unknown
;
196 if (num_pg_inactive
> 0) {
197 float p
= (float)num_pg_inactive
/ (float)num_pg
;
199 f
->dump_float("inactive_pgs_ratio", p
);
205 snprintf(b
, sizeof(b
), "%.3f", p
* 100.0);
206 *out
<< b
<< "% pgs not active\n";
212 overall_recovery_summary(f
, &sl
);
213 if (!f
&& !sl
.empty()) {
214 for (auto p
= sl
.begin(); p
!= sl
.end(); ++p
) {
225 unsigned max_width
= 1;
226 for (multimap
<int,int>::reverse_iterator p
= state_by_count
.rbegin();
227 p
!= state_by_count
.rend();
230 std::stringstream ss
;
232 max_width
= MAX(ss
.str().size(), max_width
);
235 for (multimap
<int,int>::reverse_iterator p
= state_by_count
.rbegin();
236 p
!= state_by_count
.rend();
243 out
->setf(std::ios::left
);
244 *out
<< std::setw(max_width
) << p
->first
245 << " " << pg_state_string(p
->second
) << "\n";
246 out
->unsetf(std::ios::left
);
250 ostringstream ss_rec_io
;
251 overall_recovery_rate_summary(f
, &ss_rec_io
);
252 ostringstream ss_client_io
;
253 overall_client_io_rate_summary(f
, &ss_client_io
);
254 ostringstream ss_cache_io
;
255 overall_cache_io_rate_summary(f
, &ss_cache_io
);
257 if (!f
&& (ss_client_io
.str().length() || ss_rec_io
.str().length()
258 || ss_cache_io
.str().length())) {
263 if (!f
&& ss_client_io
.str().length())
264 *out
<< " client: " << ss_client_io
.str() << "\n";
265 if (!f
&& ss_rec_io
.str().length())
266 *out
<< " recovery: " << ss_rec_io
.str() << "\n";
267 if (!f
&& ss_cache_io
.str().length())
268 *out
<< " cache: " << ss_cache_io
.str() << "\n";
271 void PGMapDigest::print_oneline_summary(Formatter
*f
, ostream
*out
) const
273 std::stringstream ss
;
276 f
->open_array_section("num_pg_by_state");
277 for (auto p
= num_pg_by_state
.begin();
278 p
!= num_pg_by_state
.end();
281 f
->open_object_section("state");
282 f
->dump_string("name", pg_state_string(p
->first
));
283 f
->dump_unsigned("num", p
->second
);
286 if (p
!= num_pg_by_state
.begin())
288 ss
<< p
->second
<< " " << pg_state_string(p
->first
);
293 string states
= ss
.str();
295 *out
<< num_pg
<< " pgs: "
297 << prettybyte_t(pg_sum
.stats
.sum
.num_bytes
) << " data, "
298 << kb_t(osd_sum
.kb_used
) << " used, "
299 << kb_t(osd_sum
.kb_avail
) << " / "
300 << kb_t(osd_sum
.kb
) << " avail";
302 f
->dump_unsigned("num_pgs", num_pg
);
303 f
->dump_unsigned("num_bytes", pg_sum
.stats
.sum
.num_bytes
);
304 f
->dump_unsigned("raw_bytes_used", osd_sum
.kb_used
<< 10);
305 f
->dump_unsigned("raw_bytes_avail", osd_sum
.kb_avail
<< 10);
306 f
->dump_unsigned("raw_bytes", osd_sum
.kb
<< 10);
309 // make non-negative; we can get negative values if osds send
310 // uncommitted stats and then "go backward" or if they are just
312 pool_stat_t pos_delta
= pg_sum_delta
;
314 if (pos_delta
.stats
.sum
.num_rd
||
315 pos_delta
.stats
.sum
.num_wr
) {
318 if (pos_delta
.stats
.sum
.num_rd
) {
319 int64_t rd
= (pos_delta
.stats
.sum
.num_rd_kb
<< 10) / (double)stamp_delta
;
321 *out
<< pretty_si_t(rd
) << "B/s rd, ";
323 f
->dump_unsigned("read_bytes_sec", rd
);
325 if (pos_delta
.stats
.sum
.num_wr
) {
326 int64_t wr
= (pos_delta
.stats
.sum
.num_wr_kb
<< 10) / (double)stamp_delta
;
328 *out
<< pretty_si_t(wr
) << "B/s wr, ";
330 f
->dump_unsigned("write_bytes_sec", wr
);
332 int64_t iops
= (pos_delta
.stats
.sum
.num_rd
+ pos_delta
.stats
.sum
.num_wr
) / (double)stamp_delta
;
334 *out
<< pretty_si_t(iops
) << "op/s";
336 f
->dump_unsigned("io_sec", iops
);
340 overall_recovery_summary(f
, &sl
);
342 for (auto p
= sl
.begin(); p
!= sl
.end(); ++p
)
344 std::stringstream ssr
;
345 overall_recovery_rate_summary(f
, &ssr
);
346 if (out
&& ssr
.str().length())
347 *out
<< "; " << ssr
.str() << " recovering";
350 void PGMapDigest::recovery_summary(Formatter
*f
, list
<string
> *psl
,
351 const pool_stat_t
& delta_sum
) const
353 if (delta_sum
.stats
.sum
.num_objects_degraded
&& delta_sum
.stats
.sum
.num_object_copies
> 0) {
354 double pc
= (double)delta_sum
.stats
.sum
.num_objects_degraded
/
355 (double)delta_sum
.stats
.sum
.num_object_copies
* (double)100.0;
357 snprintf(b
, sizeof(b
), "%.3lf", pc
);
359 f
->dump_unsigned("degraded_objects", delta_sum
.stats
.sum
.num_objects_degraded
);
360 f
->dump_unsigned("degraded_total", delta_sum
.stats
.sum
.num_object_copies
);
361 f
->dump_float("degraded_ratio", pc
/ 100.0);
364 ss
<< delta_sum
.stats
.sum
.num_objects_degraded
365 << "/" << delta_sum
.stats
.sum
.num_object_copies
<< " objects degraded (" << b
<< "%)";
366 psl
->push_back(ss
.str());
369 if (delta_sum
.stats
.sum
.num_objects_misplaced
&& delta_sum
.stats
.sum
.num_object_copies
> 0) {
370 double pc
= (double)delta_sum
.stats
.sum
.num_objects_misplaced
/
371 (double)delta_sum
.stats
.sum
.num_object_copies
* (double)100.0;
373 snprintf(b
, sizeof(b
), "%.3lf", pc
);
375 f
->dump_unsigned("misplaced_objects", delta_sum
.stats
.sum
.num_objects_misplaced
);
376 f
->dump_unsigned("misplaced_total", delta_sum
.stats
.sum
.num_object_copies
);
377 f
->dump_float("misplaced_ratio", pc
/ 100.0);
380 ss
<< delta_sum
.stats
.sum
.num_objects_misplaced
381 << "/" << delta_sum
.stats
.sum
.num_object_copies
<< " objects misplaced (" << b
<< "%)";
382 psl
->push_back(ss
.str());
385 if (delta_sum
.stats
.sum
.num_objects_unfound
&& delta_sum
.stats
.sum
.num_objects
) {
386 double pc
= (double)delta_sum
.stats
.sum
.num_objects_unfound
/
387 (double)delta_sum
.stats
.sum
.num_objects
* (double)100.0;
389 snprintf(b
, sizeof(b
), "%.3lf", pc
);
391 f
->dump_unsigned("unfound_objects", delta_sum
.stats
.sum
.num_objects_unfound
);
392 f
->dump_unsigned("unfound_total", delta_sum
.stats
.sum
.num_objects
);
393 f
->dump_float("unfound_ratio", pc
/ 100.0);
396 ss
<< delta_sum
.stats
.sum
.num_objects_unfound
397 << "/" << delta_sum
.stats
.sum
.num_objects
<< " unfound (" << b
<< "%)";
398 psl
->push_back(ss
.str());
403 void PGMapDigest::recovery_rate_summary(Formatter
*f
, ostream
*out
,
404 const pool_stat_t
& delta_sum
,
405 utime_t delta_stamp
) const
407 // make non-negative; we can get negative values if osds send
408 // uncommitted stats and then "go backward" or if they are just
410 pool_stat_t pos_delta
= delta_sum
;
412 if (pos_delta
.stats
.sum
.num_objects_recovered
||
413 pos_delta
.stats
.sum
.num_bytes_recovered
||
414 pos_delta
.stats
.sum
.num_keys_recovered
) {
415 int64_t objps
= pos_delta
.stats
.sum
.num_objects_recovered
/ (double)delta_stamp
;
416 int64_t bps
= pos_delta
.stats
.sum
.num_bytes_recovered
/ (double)delta_stamp
;
417 int64_t kps
= pos_delta
.stats
.sum
.num_keys_recovered
/ (double)delta_stamp
;
419 f
->dump_int("recovering_objects_per_sec", objps
);
420 f
->dump_int("recovering_bytes_per_sec", bps
);
421 f
->dump_int("recovering_keys_per_sec", kps
);
422 f
->dump_int("num_objects_recovered", pos_delta
.stats
.sum
.num_objects_recovered
);
423 f
->dump_int("num_bytes_recovered", pos_delta
.stats
.sum
.num_bytes_recovered
);
424 f
->dump_int("num_keys_recovered", pos_delta
.stats
.sum
.num_keys_recovered
);
426 *out
<< pretty_si_t(bps
) << "B/s";
427 if (pos_delta
.stats
.sum
.num_keys_recovered
)
428 *out
<< ", " << pretty_si_t(kps
) << "keys/s";
429 *out
<< ", " << pretty_si_t(objps
) << "objects/s";
434 void PGMapDigest::overall_recovery_rate_summary(Formatter
*f
, ostream
*out
) const
436 recovery_rate_summary(f
, out
, pg_sum_delta
, stamp_delta
);
439 void PGMapDigest::overall_recovery_summary(Formatter
*f
, list
<string
> *psl
) const
441 recovery_summary(f
, psl
, pg_sum
);
444 void PGMapDigest::pool_recovery_rate_summary(Formatter
*f
, ostream
*out
,
445 uint64_t poolid
) const
447 auto p
= per_pool_sum_delta
.find(poolid
);
448 if (p
== per_pool_sum_delta
.end())
451 auto ts
= per_pool_sum_deltas_stamps
.find(p
->first
);
452 assert(ts
!= per_pool_sum_deltas_stamps
.end());
453 recovery_rate_summary(f
, out
, p
->second
.first
, ts
->second
);
456 void PGMapDigest::pool_recovery_summary(Formatter
*f
, list
<string
> *psl
,
457 uint64_t poolid
) const
459 auto p
= per_pool_sum_delta
.find(poolid
);
460 if (p
== per_pool_sum_delta
.end())
463 recovery_summary(f
, psl
, p
->second
.first
);
466 void PGMapDigest::client_io_rate_summary(Formatter
*f
, ostream
*out
,
467 const pool_stat_t
& delta_sum
,
468 utime_t delta_stamp
) const
470 pool_stat_t pos_delta
= delta_sum
;
472 if (pos_delta
.stats
.sum
.num_rd
||
473 pos_delta
.stats
.sum
.num_wr
) {
474 if (pos_delta
.stats
.sum
.num_rd
) {
475 int64_t rd
= (pos_delta
.stats
.sum
.num_rd_kb
<< 10) / (double)delta_stamp
;
477 f
->dump_int("read_bytes_sec", rd
);
479 *out
<< pretty_si_t(rd
) << "B/s rd, ";
482 if (pos_delta
.stats
.sum
.num_wr
) {
483 int64_t wr
= (pos_delta
.stats
.sum
.num_wr_kb
<< 10) / (double)delta_stamp
;
485 f
->dump_int("write_bytes_sec", wr
);
487 *out
<< pretty_si_t(wr
) << "B/s wr, ";
490 int64_t iops_rd
= pos_delta
.stats
.sum
.num_rd
/ (double)delta_stamp
;
491 int64_t iops_wr
= pos_delta
.stats
.sum
.num_wr
/ (double)delta_stamp
;
493 f
->dump_int("read_op_per_sec", iops_rd
);
494 f
->dump_int("write_op_per_sec", iops_wr
);
496 *out
<< pretty_si_t(iops_rd
) << "op/s rd, " << pretty_si_t(iops_wr
) << "op/s wr";
501 void PGMapDigest::overall_client_io_rate_summary(Formatter
*f
, ostream
*out
) const
503 client_io_rate_summary(f
, out
, pg_sum_delta
, stamp_delta
);
506 void PGMapDigest::pool_client_io_rate_summary(Formatter
*f
, ostream
*out
,
507 uint64_t poolid
) const
509 auto p
= per_pool_sum_delta
.find(poolid
);
510 if (p
== per_pool_sum_delta
.end())
513 auto ts
= per_pool_sum_deltas_stamps
.find(p
->first
);
514 assert(ts
!= per_pool_sum_deltas_stamps
.end());
515 client_io_rate_summary(f
, out
, p
->second
.first
, ts
->second
);
518 void PGMapDigest::cache_io_rate_summary(Formatter
*f
, ostream
*out
,
519 const pool_stat_t
& delta_sum
,
520 utime_t delta_stamp
) const
522 pool_stat_t pos_delta
= delta_sum
;
524 bool have_output
= false;
526 if (pos_delta
.stats
.sum
.num_flush
) {
527 int64_t flush
= (pos_delta
.stats
.sum
.num_flush_kb
<< 10) / (double)delta_stamp
;
529 f
->dump_int("flush_bytes_sec", flush
);
531 *out
<< pretty_si_t(flush
) << "B/s flush";
535 if (pos_delta
.stats
.sum
.num_evict
) {
536 int64_t evict
= (pos_delta
.stats
.sum
.num_evict_kb
<< 10) / (double)delta_stamp
;
538 f
->dump_int("evict_bytes_sec", evict
);
542 *out
<< pretty_si_t(evict
) << "B/s evict";
546 if (pos_delta
.stats
.sum
.num_promote
) {
547 int64_t promote
= pos_delta
.stats
.sum
.num_promote
/ (double)delta_stamp
;
549 f
->dump_int("promote_op_per_sec", promote
);
553 *out
<< pretty_si_t(promote
) << "op/s promote";
557 if (pos_delta
.stats
.sum
.num_flush_mode_low
) {
559 f
->dump_int("num_flush_mode_low", pos_delta
.stats
.sum
.num_flush_mode_low
);
563 *out
<< pretty_si_t(pos_delta
.stats
.sum
.num_flush_mode_low
) << "PG(s) flushing";
567 if (pos_delta
.stats
.sum
.num_flush_mode_high
) {
569 f
->dump_int("num_flush_mode_high", pos_delta
.stats
.sum
.num_flush_mode_high
);
573 *out
<< pretty_si_t(pos_delta
.stats
.sum
.num_flush_mode_high
) << "PG(s) flushing (high)";
577 if (pos_delta
.stats
.sum
.num_evict_mode_some
) {
579 f
->dump_int("num_evict_mode_some", pos_delta
.stats
.sum
.num_evict_mode_some
);
583 *out
<< pretty_si_t(pos_delta
.stats
.sum
.num_evict_mode_some
) << "PG(s) evicting";
587 if (pos_delta
.stats
.sum
.num_evict_mode_full
) {
589 f
->dump_int("num_evict_mode_full", pos_delta
.stats
.sum
.num_evict_mode_full
);
593 *out
<< pretty_si_t(pos_delta
.stats
.sum
.num_evict_mode_full
) << "PG(s) evicting (full)";
598 void PGMapDigest::overall_cache_io_rate_summary(Formatter
*f
, ostream
*out
) const
600 cache_io_rate_summary(f
, out
, pg_sum_delta
, stamp_delta
);
603 void PGMapDigest::pool_cache_io_rate_summary(Formatter
*f
, ostream
*out
,
604 uint64_t poolid
) const
606 auto p
= per_pool_sum_delta
.find(poolid
);
607 if (p
== per_pool_sum_delta
.end())
610 auto ts
= per_pool_sum_deltas_stamps
.find(p
->first
);
611 assert(ts
!= per_pool_sum_deltas_stamps
.end());
612 cache_io_rate_summary(f
, out
, p
->second
.first
, ts
->second
);
615 void PGMapDigest::dump_pool_stats_full(
616 const OSDMap
&osd_map
,
624 f
->open_array_section("pools");
626 tbl
.define_column("NAME", TextTable::LEFT
, TextTable::LEFT
);
627 tbl
.define_column("ID", TextTable::LEFT
, TextTable::LEFT
);
629 tbl
.define_column("QUOTA OBJECTS", TextTable::LEFT
, TextTable::LEFT
);
630 tbl
.define_column("QUOTA BYTES", TextTable::LEFT
, TextTable::LEFT
);
633 tbl
.define_column("USED", TextTable::LEFT
, TextTable::RIGHT
);
634 tbl
.define_column("%USED", TextTable::LEFT
, TextTable::RIGHT
);
635 tbl
.define_column("MAX AVAIL", TextTable::LEFT
, TextTable::RIGHT
);
636 tbl
.define_column("OBJECTS", TextTable::LEFT
, TextTable::RIGHT
);
638 tbl
.define_column("DIRTY", TextTable::LEFT
, TextTable::RIGHT
);
639 tbl
.define_column("READ", TextTable::LEFT
, TextTable::RIGHT
);
640 tbl
.define_column("WRITE", TextTable::LEFT
, TextTable::RIGHT
);
641 tbl
.define_column("RAW USED", TextTable::LEFT
, TextTable::RIGHT
);
645 map
<int,uint64_t> avail_by_rule
;
646 for (auto p
= osd_map
.get_pools().begin();
647 p
!= osd_map
.get_pools().end(); ++p
) {
648 int64_t pool_id
= p
->first
;
649 if ((pool_id
< 0) || (pg_pool_sum
.count(pool_id
) == 0))
651 const string
& pool_name
= osd_map
.get_pool_name(pool_id
);
652 const pool_stat_t
&stat
= pg_pool_sum
.at(pool_id
);
654 const pg_pool_t
*pool
= osd_map
.get_pg_pool(pool_id
);
655 int ruleno
= osd_map
.crush
->find_rule(pool
->get_crush_rule(),
660 if (avail_by_rule
.count(ruleno
) == 0) {
661 // FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked
662 avail
= get_rule_avail(ruleno
);
665 avail_by_rule
[ruleno
] = avail
;
667 avail
= avail_by_rule
[ruleno
];
669 switch (pool
->get_type()) {
670 case pg_pool_t::TYPE_REPLICATED
:
671 avail
/= pool
->get_size();
672 raw_used_rate
= pool
->get_size();
674 case pg_pool_t::TYPE_ERASURE
:
677 osd_map
.get_erasure_code_profile(pool
->erasure_code_profile
);
678 auto pm
= ecp
.find("m");
679 auto pk
= ecp
.find("k");
680 if (pm
!= ecp
.end() && pk
!= ecp
.end()) {
681 int k
= atoi(pk
->second
.c_str());
682 int m
= atoi(pm
->second
.c_str());
685 avail
= avail
* k
/ mk
;
686 raw_used_rate
= (float)mk
/ k
;
693 assert(0 == "unrecognized pool type");
697 f
->open_object_section("pool");
698 f
->dump_string("name", pool_name
);
699 f
->dump_int("id", pool_id
);
700 f
->open_object_section("stats");
705 if (pool
->quota_max_objects
== 0)
708 tbl
<< si_t(pool
->quota_max_objects
);
710 if (pool
->quota_max_bytes
== 0)
713 tbl
<< si_t(pool
->quota_max_bytes
);
717 dump_object_stat_sum(tbl
, f
, stat
.stats
.sum
, avail
, raw_used_rate
, verbose
, pool
);
719 f
->close_section(); // stats
721 tbl
<< TextTable::endrow
;
724 f
->close_section(); // pool
729 assert(ss
!= nullptr);
736 void PGMapDigest::dump_fs_stats(stringstream
*ss
, Formatter
*f
, bool verbose
) const
739 f
->open_object_section("stats");
740 f
->dump_int("total_bytes", osd_sum
.kb
* 1024ull);
741 f
->dump_int("total_used_bytes", osd_sum
.kb_used
* 1024ull);
742 f
->dump_int("total_avail_bytes", osd_sum
.kb_avail
* 1024ull);
744 f
->dump_int("total_objects", pg_sum
.stats
.sum
.num_objects
);
748 assert(ss
!= nullptr);
750 tbl
.define_column("SIZE", TextTable::LEFT
, TextTable::RIGHT
);
751 tbl
.define_column("AVAIL", TextTable::LEFT
, TextTable::RIGHT
);
752 tbl
.define_column("RAW USED", TextTable::LEFT
, TextTable::RIGHT
);
753 tbl
.define_column("%RAW USED", TextTable::LEFT
, TextTable::RIGHT
);
755 tbl
.define_column("OBJECTS", TextTable::LEFT
, TextTable::RIGHT
);
757 tbl
<< stringify(si_t(osd_sum
.kb
*1024))
758 << stringify(si_t(osd_sum
.kb_avail
*1024))
759 << stringify(si_t(osd_sum
.kb_used
*1024));
761 if (osd_sum
.kb
> 0) {
762 used
= ((float)osd_sum
.kb_used
/ osd_sum
.kb
);
764 tbl
<< percentify(used
*100);
766 tbl
<< stringify(si_t(pg_sum
.stats
.sum
.num_objects
));
768 tbl
<< TextTable::endrow
;
776 void PGMapDigest::dump_object_stat_sum(
777 TextTable
&tbl
, Formatter
*f
,
778 const object_stat_sum_t
&sum
, uint64_t avail
,
779 float raw_used_rate
, bool verbose
,
780 const pg_pool_t
*pool
)
782 float curr_object_copies_rate
= 0.0;
783 if (sum
.num_object_copies
> 0)
784 curr_object_copies_rate
= (float)(sum
.num_object_copies
- sum
.num_objects_degraded
) / sum
.num_object_copies
;
788 used
= sum
.num_bytes
* curr_object_copies_rate
;
789 used
/= used
+ avail
;
790 } else if (sum
.num_bytes
) {
795 f
->dump_int("kb_used", SHIFT_ROUND_UP(sum
.num_bytes
, 10));
796 f
->dump_int("bytes_used", sum
.num_bytes
);
797 f
->dump_format_unquoted("percent_used", "%.2f", (used
*100));
798 f
->dump_unsigned("max_avail", avail
);
799 f
->dump_int("objects", sum
.num_objects
);
801 f
->dump_int("quota_objects", pool
->quota_max_objects
);
802 f
->dump_int("quota_bytes", pool
->quota_max_bytes
);
803 f
->dump_int("dirty", sum
.num_objects_dirty
);
804 f
->dump_int("rd", sum
.num_rd
);
805 f
->dump_int("rd_bytes", sum
.num_rd_kb
* 1024ull);
806 f
->dump_int("wr", sum
.num_wr
);
807 f
->dump_int("wr_bytes", sum
.num_wr_kb
* 1024ull);
808 f
->dump_int("raw_bytes_used", sum
.num_bytes
* raw_used_rate
* curr_object_copies_rate
);
811 tbl
<< stringify(si_t(sum
.num_bytes
));
812 tbl
<< percentify(used
*100);
814 tbl
<< sum
.num_objects
;
816 tbl
<< stringify(si_t(sum
.num_objects_dirty
))
817 << stringify(si_t(sum
.num_rd
))
818 << stringify(si_t(sum
.num_wr
))
819 << stringify(si_t(sum
.num_bytes
* raw_used_rate
* curr_object_copies_rate
));
824 int64_t PGMap::get_rule_avail(const OSDMap
& osdmap
, int ruleno
) const
827 int r
= osdmap
.crush
->get_rule_weight_osd_map(ruleno
, &wm
);
836 if (osdmap
.require_osd_release
>= CEPH_RELEASE_LUMINOUS
&&
837 osdmap
.get_full_ratio() > 0) {
838 fratio
= osdmap
.get_full_ratio();
840 fratio
= get_fallback_full_ratio();
844 for (auto p
= wm
.begin(); p
!= wm
.end(); ++p
) {
845 auto osd_info
= osd_stat
.find(p
->first
);
846 if (osd_info
!= osd_stat
.end()) {
847 if (osd_info
->second
.kb
== 0 || p
->second
== 0) {
848 // osd must be out, hence its stats have been zeroed
849 // (unless we somehow managed to have a disk with size 0...)
851 // (p->second == 0), if osd weight is 0, no need to
852 // calculate proj below.
855 double unusable
= (double)osd_info
->second
.kb
*
857 double avail
= MAX(0.0, (double)osd_info
->second
.kb_avail
- unusable
);
859 int64_t proj
= (int64_t)(avail
/ (double)p
->second
);
860 if (min
< 0 || proj
< min
) {
864 dout(0) << "Cannot get stat of OSD " << p
->first
<< dendl
;
870 void PGMap::get_rules_avail(const OSDMap
& osdmap
,
871 std::map
<int,int64_t> *avail_map
) const
874 for (auto p
: osdmap
.get_pools()) {
875 int64_t pool_id
= p
.first
;
876 if ((pool_id
< 0) || (pg_pool_sum
.count(pool_id
) == 0))
878 const pg_pool_t
*pool
= osdmap
.get_pg_pool(pool_id
);
879 int ruleno
= osdmap
.crush
->find_rule(pool
->get_crush_rule(),
882 if (avail_map
->count(ruleno
) == 0)
883 (*avail_map
)[ruleno
] = get_rule_avail(osdmap
, ruleno
);
887 // ---------------------
890 void PGMap::Incremental::encode(bufferlist
&bl
, uint64_t features
) const
892 if ((features
& CEPH_FEATURE_MONENC
) == 0) {
895 ::encode(version
, bl
);
896 ::encode(pg_stat_updates
, bl
);
897 ::encode(osd_stat_updates
, bl
);
898 ::encode(osd_stat_rm
, bl
);
899 ::encode(osdmap_epoch
, bl
);
900 ::encode(pg_scan
, bl
);
901 ::encode(full_ratio
, bl
);
902 ::encode(nearfull_ratio
, bl
);
903 ::encode(pg_remove
, bl
);
907 ENCODE_START(7, 5, bl
);
908 ::encode(version
, bl
);
909 ::encode(pg_stat_updates
, bl
);
910 ::encode(osd_stat_updates
, bl
);
911 ::encode(osd_stat_rm
, bl
);
912 ::encode(osdmap_epoch
, bl
);
913 ::encode(pg_scan
, bl
);
914 ::encode(full_ratio
, bl
);
915 ::encode(nearfull_ratio
, bl
);
916 ::encode(pg_remove
, bl
);
918 ::encode(osd_epochs
, bl
);
922 void PGMap::Incremental::decode(bufferlist::iterator
&bl
)
924 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl
);
925 ::decode(version
, bl
);
927 pg_stat_updates
.clear();
934 ::decode(pg_stat_updates
[pgid
], bl
);
937 ::decode(pg_stat_updates
, bl
);
939 ::decode(osd_stat_updates
, bl
);
940 ::decode(osd_stat_rm
, bl
);
941 ::decode(osdmap_epoch
, bl
);
942 ::decode(pg_scan
, bl
);
944 ::decode(full_ratio
, bl
);
945 ::decode(nearfull_ratio
, bl
);
954 pg_remove
.insert(pg_t(opgid
));
957 ::decode(pg_remove
, bl
);
959 if (struct_v
< 4 && full_ratio
== 0) {
962 if (struct_v
< 4 && nearfull_ratio
== 0) {
968 ::decode(osd_epochs
, bl
);
970 for (auto i
= osd_stat_updates
.begin();
971 i
!= osd_stat_updates
.end();
973 // This isn't accurate, but will cause trimming to behave like
975 osd_epochs
.insert(make_pair(i
->first
, osdmap_epoch
));
981 void PGMap::Incremental::dump(Formatter
*f
) const
983 f
->dump_unsigned("version", version
);
984 f
->dump_stream("stamp") << stamp
;
985 f
->dump_unsigned("osdmap_epoch", osdmap_epoch
);
986 f
->dump_unsigned("pg_scan_epoch", pg_scan
);
987 f
->dump_float("full_ratio", full_ratio
);
988 f
->dump_float("nearfull_ratio", nearfull_ratio
);
990 f
->open_array_section("pg_stat_updates");
991 for (auto p
= pg_stat_updates
.begin(); p
!= pg_stat_updates
.end(); ++p
) {
992 f
->open_object_section("pg_stat");
993 f
->dump_stream("pgid") << p
->first
;
999 f
->open_array_section("osd_stat_updates");
1000 for (auto p
= osd_stat_updates
.begin(); p
!= osd_stat_updates
.end(); ++p
) {
1001 f
->open_object_section("osd_stat");
1002 f
->dump_int("osd", p
->first
);
1008 f
->open_array_section("osd_stat_removals");
1009 for (auto p
= osd_stat_rm
.begin(); p
!= osd_stat_rm
.end(); ++p
)
1010 f
->dump_int("osd", *p
);
1013 f
->open_array_section("pg_removals");
1014 for (auto p
= pg_remove
.begin(); p
!= pg_remove
.end(); ++p
)
1015 f
->dump_stream("pgid") << *p
;
1019 void PGMap::Incremental::generate_test_instances(list
<PGMap::Incremental
*>& o
)
1021 o
.push_back(new Incremental
);
1022 o
.push_back(new Incremental
);
1023 o
.back()->version
= 1;
1024 o
.back()->stamp
= utime_t(123,345);
1025 o
.push_back(new Incremental
);
1026 o
.back()->version
= 2;
1027 o
.back()->pg_stat_updates
[pg_t(1,2,3)] = pg_stat_t();
1028 o
.back()->osd_stat_updates
[5] = osd_stat_t();
1029 o
.back()->osd_epochs
[5] = 12;
1030 o
.push_back(new Incremental
);
1031 o
.back()->version
= 3;
1032 o
.back()->osdmap_epoch
= 1;
1033 o
.back()->pg_scan
= 2;
1034 o
.back()->full_ratio
= .2;
1035 o
.back()->nearfull_ratio
= .3;
1036 o
.back()->pg_stat_updates
[pg_t(4,5,6)] = pg_stat_t();
1037 o
.back()->osd_stat_updates
[6] = osd_stat_t();
1038 o
.back()->osd_epochs
[6] = 12;
1039 o
.back()->pg_remove
.insert(pg_t(1,2,3));
1040 o
.back()->osd_stat_rm
.insert(5);
1046 void PGMap::apply_incremental(CephContext
*cct
, const Incremental
& inc
)
1048 assert(inc
.version
== version
+1);
1052 delta_t
= inc
.stamp
;
1056 pool_stat_t pg_sum_old
= pg_sum
;
1057 mempool::pgmap::unordered_map
<uint64_t, pool_stat_t
> pg_pool_sum_old
;
1059 bool ratios_changed
= false;
1060 if (inc
.full_ratio
!= full_ratio
&& inc
.full_ratio
!= -1) {
1061 full_ratio
= inc
.full_ratio
;
1062 ratios_changed
= true;
1064 if (inc
.nearfull_ratio
!= nearfull_ratio
&& inc
.nearfull_ratio
!= -1) {
1065 nearfull_ratio
= inc
.nearfull_ratio
;
1066 ratios_changed
= true;
1071 for (auto p
= inc
.pg_stat_updates
.begin();
1072 p
!= inc
.pg_stat_updates
.end();
1074 const pg_t
&update_pg(p
->first
);
1075 const pg_stat_t
&update_stat(p
->second
);
1077 if (pg_pool_sum_old
.count(update_pg
.pool()) == 0)
1078 pg_pool_sum_old
[update_pg
.pool()] = pg_pool_sum
[update_pg
.pool()];
1080 auto t
= pg_stat
.find(update_pg
);
1081 if (t
== pg_stat
.end()) {
1082 pg_stat
.insert(make_pair(update_pg
, update_stat
));
1084 stat_pg_sub(update_pg
, t
->second
);
1085 t
->second
= update_stat
;
1087 stat_pg_add(update_pg
, update_stat
);
1089 assert(osd_stat
.size() == osd_epochs
.size());
1090 for (auto p
= inc
.get_osd_stat_updates().begin();
1091 p
!= inc
.get_osd_stat_updates().end();
1094 const osd_stat_t
&new_stats(p
->second
);
1096 auto t
= osd_stat
.find(osd
);
1097 if (t
== osd_stat
.end()) {
1098 osd_stat
.insert(make_pair(osd
, new_stats
));
1100 stat_osd_sub(t
->first
, t
->second
);
1101 t
->second
= new_stats
;
1103 auto i
= osd_epochs
.find(osd
);
1104 auto j
= inc
.get_osd_epochs().find(osd
);
1105 assert(j
!= inc
.get_osd_epochs().end());
1107 if (i
== osd_epochs
.end())
1108 osd_epochs
.insert(*j
);
1110 i
->second
= j
->second
;
1112 stat_osd_add(osd
, new_stats
);
1114 // adjust [near]full status
1115 register_nearfull_status(osd
, new_stats
);
1117 set
<int64_t> deleted_pools
;
1118 for (auto p
= inc
.pg_remove
.begin();
1119 p
!= inc
.pg_remove
.end();
1121 const pg_t
&removed_pg(*p
);
1122 auto s
= pg_stat
.find(removed_pg
);
1123 if (s
!= pg_stat
.end()) {
1124 stat_pg_sub(removed_pg
, s
->second
);
1127 deleted_pools
.insert(removed_pg
.pool());
1130 for (auto p
= inc
.get_osd_stat_rm().begin();
1131 p
!= inc
.get_osd_stat_rm().end();
1133 auto t
= osd_stat
.find(*p
);
1134 if (t
!= osd_stat
.end()) {
1135 stat_osd_sub(t
->first
, t
->second
);
1137 osd_epochs
.erase(*p
);
1140 // remove these old osds from full/nearfull set(s), too
1141 nearfull_osds
.erase(*p
);
1142 full_osds
.erase(*p
);
1145 // calculate a delta, and average over the last 2 deltas.
1146 pool_stat_t d
= pg_sum
;
1147 d
.stats
.sub(pg_sum_old
.stats
);
1148 pg_sum_deltas
.push_back(make_pair(d
, delta_t
));
1149 stamp_delta
+= delta_t
;
1151 pg_sum_delta
.stats
.add(d
.stats
);
1152 if (pg_sum_deltas
.size() > (unsigned)MAX(1, cct
? cct
->_conf
->mon_stat_smooth_intervals
: 1)) {
1153 pg_sum_delta
.stats
.sub(pg_sum_deltas
.front().first
.stats
);
1154 stamp_delta
-= pg_sum_deltas
.front().second
;
1155 pg_sum_deltas
.pop_front();
1158 update_pool_deltas(cct
, inc
.stamp
, pg_pool_sum_old
);
1160 for (auto p
: deleted_pools
) {
1162 dout(20) << " deleted pool " << p
<< dendl
;
1166 if (inc
.osdmap_epoch
)
1167 last_osdmap_epoch
= inc
.osdmap_epoch
;
1169 last_pg_scan
= inc
.pg_scan
;
1171 min_last_epoch_clean
= 0; // invalidate
1174 void PGMap::redo_full_sets()
1177 nearfull_osds
.clear();
1178 for (auto i
= osd_stat
.begin();
1179 i
!= osd_stat
.end();
1181 register_nearfull_status(i
->first
, i
->second
);
1185 void PGMap::register_nearfull_status(int osd
, const osd_stat_t
& s
)
1187 float ratio
= ((float)s
.kb_used
) / ((float)s
.kb
);
1189 if (full_ratio
> 0 && ratio
> full_ratio
) {
1191 full_osds
.insert(osd
);
1192 nearfull_osds
.erase(osd
);
1193 } else if (nearfull_ratio
> 0 && ratio
> nearfull_ratio
) {
1195 full_osds
.erase(osd
);
1196 nearfull_osds
.insert(osd
);
1199 full_osds
.erase(osd
);
1200 nearfull_osds
.erase(osd
);
1204 void PGMap::calc_stats()
1210 pg_pool_sum
.clear();
1211 num_pg_by_pool
.clear();
1213 pg_sum
= pool_stat_t();
1214 osd_sum
= osd_stat_t();
1215 num_pg_by_state
.clear();
1216 num_pg_by_osd
.clear();
1218 for (auto p
= pg_stat
.begin();
1221 stat_pg_add(p
->first
, p
->second
);
1223 for (auto p
= osd_stat
.begin();
1224 p
!= osd_stat
.end();
1226 stat_osd_add(p
->first
, p
->second
);
1230 min_last_epoch_clean
= calc_min_last_epoch_clean();
1233 void PGMap::update_pg(pg_t pgid
, bufferlist
& bl
)
1235 bufferlist::iterator p
= bl
.begin();
1236 auto s
= pg_stat
.find(pgid
);
1237 epoch_t old_lec
= 0, lec
;
1238 if (s
!= pg_stat
.end()) {
1239 old_lec
= s
->second
.get_effective_last_epoch_clean();
1240 stat_pg_update(pgid
, s
->second
, p
);
1241 lec
= s
->second
.get_effective_last_epoch_clean();
1243 pg_stat_t
& r
= pg_stat
[pgid
];
1245 stat_pg_add(pgid
, r
);
1246 lec
= r
.get_effective_last_epoch_clean();
1249 if (min_last_epoch_clean
&&
1250 (lec
< min_last_epoch_clean
|| // we did
1251 (lec
> min_last_epoch_clean
&& // we might
1252 old_lec
== min_last_epoch_clean
)
1254 min_last_epoch_clean
= 0;
1257 void PGMap::remove_pg(pg_t pgid
)
1259 auto s
= pg_stat
.find(pgid
);
1260 if (s
!= pg_stat
.end()) {
1261 if (min_last_epoch_clean
&&
1262 s
->second
.get_effective_last_epoch_clean() == min_last_epoch_clean
)
1263 min_last_epoch_clean
= 0;
1264 stat_pg_sub(pgid
, s
->second
);
1269 void PGMap::update_osd(int osd
, bufferlist
& bl
)
1271 bufferlist::iterator p
= bl
.begin();
1272 auto o
= osd_stat
.find(osd
);
1273 epoch_t old_lec
= 0;
1274 if (o
!= osd_stat
.end()) {
1275 auto i
= osd_epochs
.find(osd
);
1276 if (i
!= osd_epochs
.end())
1277 old_lec
= i
->second
;
1278 stat_osd_sub(osd
, o
->second
);
1280 osd_stat_t
& r
= osd_stat
[osd
];
1282 stat_osd_add(osd
, r
);
1284 // adjust [near]full status
1285 register_nearfull_status(osd
, r
);
1292 if (e
< min_last_epoch_clean
||
1293 (e
> min_last_epoch_clean
&&
1294 old_lec
== min_last_epoch_clean
))
1295 min_last_epoch_clean
= 0;
1297 // WARNING: we are not refreshing min_last_epoch_clean! must be old store
1298 // or old mon running.
1302 void PGMap::remove_osd(int osd
)
1304 auto o
= osd_stat
.find(osd
);
1305 if (o
!= osd_stat
.end()) {
1306 stat_osd_sub(osd
, o
->second
);
1309 // remove these old osds from full/nearfull set(s), too
1310 nearfull_osds
.erase(osd
);
1311 full_osds
.erase(osd
);
1315 void PGMap::stat_pg_add(const pg_t
&pgid
, const pg_stat_t
&s
,
1318 pg_pool_sum
[pgid
.pool()].add(s
);
1322 num_pg_by_state
[s
.state
]++;
1323 num_pg_by_pool
[pgid
.pool()]++;
1325 if ((s
.state
& PG_STATE_CREATING
) &&
1326 s
.parent_split_bits
== 0) {
1327 creating_pgs
.insert(pgid
);
1328 if (s
.acting_primary
>= 0) {
1329 creating_pgs_by_osd_epoch
[s
.acting_primary
][s
.mapping_epoch
].insert(pgid
);
1333 if (s
.state
& PG_STATE_ACTIVE
) {
1343 for (auto p
= s
.blocked_by
.begin();
1344 p
!= s
.blocked_by
.end();
1346 ++blocked_by_sum
[*p
];
1349 for (auto p
= s
.acting
.begin(); p
!= s
.acting
.end(); ++p
) {
1350 pg_by_osd
[*p
].insert(pgid
);
1351 num_pg_by_osd
[*p
].acting
++;
1353 for (auto p
= s
.up
.begin(); p
!= s
.up
.end(); ++p
) {
1354 pg_by_osd
[*p
].insert(pgid
);
1355 num_pg_by_osd
[*p
].up
++;
1358 if (s
.up_primary
>= 0) {
1359 num_pg_by_osd
[s
.up_primary
].primary
++;
1363 void PGMap::stat_pg_sub(const pg_t
&pgid
, const pg_stat_t
&s
,
1366 pool_stat_t
& ps
= pg_pool_sum
[pgid
.pool()];
1371 int end
= --num_pg_by_state
[s
.state
];
1374 num_pg_by_state
.erase(s
.state
);
1375 end
= --num_pg_by_pool
[pgid
.pool()];
1377 num_pg_by_pool
.erase(pgid
.pool());
1378 pg_pool_sum
.erase(pgid
.pool());
1381 if ((s
.state
& PG_STATE_CREATING
) &&
1382 s
.parent_split_bits
== 0) {
1383 creating_pgs
.erase(pgid
);
1384 if (s
.acting_primary
>= 0) {
1385 map
<epoch_t
,set
<pg_t
> >& r
= creating_pgs_by_osd_epoch
[s
.acting_primary
];
1386 r
[s
.mapping_epoch
].erase(pgid
);
1387 if (r
[s
.mapping_epoch
].empty())
1388 r
.erase(s
.mapping_epoch
);
1390 creating_pgs_by_osd_epoch
.erase(s
.acting_primary
);
1394 if (s
.state
& PG_STATE_ACTIVE
) {
1404 for (auto p
= s
.blocked_by
.begin();
1405 p
!= s
.blocked_by
.end();
1407 auto q
= blocked_by_sum
.find(*p
);
1408 assert(q
!= blocked_by_sum
.end());
1411 blocked_by_sum
.erase(q
);
1414 for (auto p
= s
.acting
.begin(); p
!= s
.acting
.end(); ++p
) {
1415 auto& oset
= pg_by_osd
[*p
];
1418 pg_by_osd
.erase(*p
);
1419 auto it
= num_pg_by_osd
.find(*p
);
1420 if (it
!= num_pg_by_osd
.end() && it
->second
.acting
> 0)
1421 it
->second
.acting
--;
1423 for (auto p
= s
.up
.begin(); p
!= s
.up
.end(); ++p
) {
1424 auto& oset
= pg_by_osd
[*p
];
1427 pg_by_osd
.erase(*p
);
1428 auto it
= num_pg_by_osd
.find(*p
);
1429 if (it
!= num_pg_by_osd
.end() && it
->second
.up
> 0)
1433 if (s
.up_primary
>= 0) {
1434 auto it
= num_pg_by_osd
.find(s
.up_primary
);
1435 if (it
!= num_pg_by_osd
.end() && it
->second
.primary
> 0)
1436 it
->second
.primary
--;
1440 void PGMap::stat_pg_update(const pg_t pgid
, pg_stat_t
& s
,
1441 bufferlist::iterator
& blp
)
1447 s
.acting
== n
.acting
&&
1449 s
.blocked_by
== n
.blocked_by
;
1451 stat_pg_sub(pgid
, s
, sameosds
);
1453 // if acting_primary has shift to an just restored osd, and pg yet to finish
1454 // peering, many attributes in current stats remain stale. others seem don't
1455 // mater much while faulty last_active will make "pg stuck in" check unhappy.
1456 if (!(n
.state
& (PG_STATE_ACTIVE
| PG_STATE_PEERED
)) &&
1457 n
.last_active
< s
.last_active
)
1458 n
.last_active
= s
.last_active
;
1460 stat_pg_add(pgid
, n
, sameosds
);
1463 void PGMap::stat_osd_add(int osd
, const osd_stat_t
&s
)
1467 if (osd
>= (int)osd_last_seq
.size()) {
1468 osd_last_seq
.resize(osd
+ 1);
1470 osd_last_seq
[osd
] = s
.seq
;
1473 void PGMap::stat_osd_sub(int osd
, const osd_stat_t
&s
)
1477 assert(osd
< (int)osd_last_seq
.size());
1478 osd_last_seq
[osd
] = 0;
1481 epoch_t
PGMap::calc_min_last_epoch_clean() const
1483 if (pg_stat
.empty())
1486 auto p
= pg_stat
.begin();
1487 epoch_t min
= p
->second
.get_effective_last_epoch_clean();
1488 for (++p
; p
!= pg_stat
.end(); ++p
) {
1489 epoch_t lec
= p
->second
.get_effective_last_epoch_clean();
1493 // also scan osd epochs
1494 // don't trim past the oldest reported osd epoch
1495 for (auto i
= osd_epochs
.begin();
1496 i
!= osd_epochs
.end();
1498 if (i
->second
< min
)
1504 void PGMap::encode_digest(const OSDMap
& osdmap
,
1505 bufferlist
& bl
, uint64_t features
) const
1507 get_rules_avail(osdmap
, &avail_space_by_rule
);
1508 PGMapDigest::encode(bl
, features
);
1511 void PGMap::encode(bufferlist
&bl
, uint64_t features
) const
1513 if ((features
& CEPH_FEATURE_MONENC
) == 0) {
1516 ::encode(version
, bl
);
1517 ::encode(pg_stat
, bl
);
1518 ::encode(osd_stat
, bl
);
1519 ::encode(last_osdmap_epoch
, bl
);
1520 ::encode(last_pg_scan
, bl
);
1521 ::encode(full_ratio
, bl
);
1522 ::encode(nearfull_ratio
, bl
);
1526 ENCODE_START(6, 4, bl
);
1527 ::encode(version
, bl
);
1528 ::encode(pg_stat
, bl
);
1529 ::encode(osd_stat
, bl
);
1530 ::encode(last_osdmap_epoch
, bl
);
1531 ::encode(last_pg_scan
, bl
);
1532 ::encode(full_ratio
, bl
);
1533 ::encode(nearfull_ratio
, bl
);
1534 ::encode(stamp
, bl
);
1535 ::encode(osd_epochs
, bl
);
1539 void PGMap::decode(bufferlist::iterator
&bl
)
1541 DECODE_START_LEGACY_COMPAT_LEN(6, 4, 4, bl
);
1542 ::decode(version
, bl
);
1549 ::decode(opgid
, bl
);
1551 ::decode(pg_stat
[pgid
], bl
);
1554 ::decode(pg_stat
, bl
);
1556 ::decode(osd_stat
, bl
);
1557 ::decode(last_osdmap_epoch
, bl
);
1558 ::decode(last_pg_scan
, bl
);
1559 if (struct_v
>= 2) {
1560 ::decode(full_ratio
, bl
);
1561 ::decode(nearfull_ratio
, bl
);
1564 ::decode(stamp
, bl
);
1565 if (struct_v
>= 6) {
1566 ::decode(osd_epochs
, bl
);
1568 for (auto i
= osd_stat
.begin();
1569 i
!= osd_stat
.end();
1571 // This isn't accurate, but will cause trimming to behave like
1573 osd_epochs
.insert(make_pair(i
->first
, last_osdmap_epoch
));
1581 void PGMap::dirty_all(Incremental
& inc
)
1583 inc
.osdmap_epoch
= last_osdmap_epoch
;
1584 inc
.pg_scan
= last_pg_scan
;
1585 inc
.full_ratio
= full_ratio
;
1586 inc
.nearfull_ratio
= nearfull_ratio
;
1588 for (auto p
= pg_stat
.begin(); p
!= pg_stat
.end(); ++p
) {
1589 inc
.pg_stat_updates
[p
->first
] = p
->second
;
1591 for (auto p
= osd_stat
.begin(); p
!= osd_stat
.end(); ++p
) {
1592 assert(osd_epochs
.count(p
->first
));
1593 inc
.update_stat(p
->first
,
1594 inc
.get_osd_epochs().find(p
->first
)->second
,
1599 void PGMap::dump(Formatter
*f
) const
1602 dump_pg_stats(f
, false);
1607 void PGMap::dump_basic(Formatter
*f
) const
1609 f
->dump_unsigned("version", version
);
1610 f
->dump_stream("stamp") << stamp
;
1611 f
->dump_unsigned("last_osdmap_epoch", last_osdmap_epoch
);
1612 f
->dump_unsigned("last_pg_scan", last_pg_scan
);
1613 f
->dump_unsigned("min_last_epoch_clean", min_last_epoch_clean
);
1614 f
->dump_float("full_ratio", full_ratio
);
1615 f
->dump_float("near_full_ratio", nearfull_ratio
);
1617 f
->open_object_section("pg_stats_sum");
1621 f
->open_object_section("osd_stats_sum");
1625 f
->open_array_section("osd_epochs");
1626 for (auto p
= osd_epochs
.begin(); p
!= osd_epochs
.end(); ++p
) {
1627 f
->open_object_section("osd");
1628 f
->dump_unsigned("osd", p
->first
);
1629 f
->dump_unsigned("epoch", p
->second
);
1637 void PGMap::dump_delta(Formatter
*f
) const
1639 f
->open_object_section("pg_stats_delta");
1640 pg_sum_delta
.dump(f
);
1644 void PGMap::dump_pg_stats(Formatter
*f
, bool brief
) const
1646 f
->open_array_section("pg_stats");
1647 for (auto i
= pg_stat
.begin();
1650 f
->open_object_section("pg_stat");
1651 f
->dump_stream("pgid") << i
->first
;
1653 i
->second
.dump_brief(f
);
1661 void PGMap::dump_pool_stats(Formatter
*f
) const
1663 f
->open_array_section("pool_stats");
1664 for (auto p
= pg_pool_sum
.begin();
1665 p
!= pg_pool_sum
.end();
1667 f
->open_object_section("pool_stat");
1668 f
->dump_int("poolid", p
->first
);
1669 auto q
= num_pg_by_pool
.find(p
->first
);
1670 if (q
!= num_pg_by_pool
.end())
1671 f
->dump_unsigned("num_pg", q
->second
);
1678 void PGMap::dump_osd_stats(Formatter
*f
) const
1680 f
->open_array_section("osd_stats");
1681 for (auto q
= osd_stat
.begin();
1682 q
!= osd_stat
.end();
1684 f
->open_object_section("osd_stat");
1685 f
->dump_int("osd", q
->first
);
1692 void PGMap::dump_pg_stats_plain(
1694 const mempool::pgmap::unordered_map
<pg_t
, pg_stat_t
>& pg_stats
,
1700 tab
.define_column("PG_STAT", TextTable::LEFT
, TextTable::LEFT
);
1701 tab
.define_column("STATE", TextTable::LEFT
, TextTable::RIGHT
);
1702 tab
.define_column("UP", TextTable::LEFT
, TextTable::RIGHT
);
1703 tab
.define_column("UP_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1704 tab
.define_column("ACTING", TextTable::LEFT
, TextTable::RIGHT
);
1705 tab
.define_column("ACTING_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1708 tab
.define_column("PG_STAT", TextTable::LEFT
, TextTable::LEFT
);
1709 tab
.define_column("OBJECTS", TextTable::LEFT
, TextTable::RIGHT
);
1710 tab
.define_column("MISSING_ON_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1711 tab
.define_column("DEGRADED", TextTable::LEFT
, TextTable::RIGHT
);
1712 tab
.define_column("MISPLACED", TextTable::LEFT
, TextTable::RIGHT
);
1713 tab
.define_column("UNFOUND", TextTable::LEFT
, TextTable::RIGHT
);
1714 tab
.define_column("BYTES", TextTable::LEFT
, TextTable::RIGHT
);
1715 tab
.define_column("LOG", TextTable::LEFT
, TextTable::RIGHT
);
1716 tab
.define_column("DISK_LOG", TextTable::LEFT
, TextTable::RIGHT
);
1717 tab
.define_column("STATE", TextTable::LEFT
, TextTable::RIGHT
);
1718 tab
.define_column("STATE_STAMP", TextTable::LEFT
, TextTable::RIGHT
);
1719 tab
.define_column("VERSION", TextTable::LEFT
, TextTable::RIGHT
);
1720 tab
.define_column("REPORTED", TextTable::LEFT
, TextTable::RIGHT
);
1721 tab
.define_column("UP", TextTable::LEFT
, TextTable::RIGHT
);
1722 tab
.define_column("UP_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1723 tab
.define_column("ACTING", TextTable::LEFT
, TextTable::RIGHT
);
1724 tab
.define_column("ACTING_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1725 tab
.define_column("LAST_SCRUB", TextTable::LEFT
, TextTable::RIGHT
);
1726 tab
.define_column("SCRUB_STAMP", TextTable::LEFT
, TextTable::RIGHT
);
1727 tab
.define_column("LAST_DEEP_SCRUB", TextTable::LEFT
, TextTable::RIGHT
);
1728 tab
.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT
, TextTable::RIGHT
);
1731 for (auto i
= pg_stats
.begin();
1732 i
!= pg_stats
.end(); ++i
) {
1733 const pg_stat_t
&st(i
->second
);
1736 << pg_state_string(st
.state
)
1740 << st
.acting_primary
1741 << TextTable::endrow
;
1743 ostringstream reported
;
1744 reported
<< st
.reported_epoch
<< ":" << st
.reported_seq
;
1747 << st
.stats
.sum
.num_objects
1748 << st
.stats
.sum
.num_objects_missing_on_primary
1749 << st
.stats
.sum
.num_objects_degraded
1750 << st
.stats
.sum
.num_objects_misplaced
1751 << st
.stats
.sum
.num_objects_unfound
1752 << st
.stats
.sum
.num_bytes
1754 << st
.ondisk_log_size
1755 << pg_state_string(st
.state
)
1759 << pg_vector_string(st
.up
)
1761 << pg_vector_string(st
.acting
)
1762 << st
.acting_primary
1764 << st
.last_scrub_stamp
1765 << st
.last_deep_scrub
1766 << st
.last_deep_scrub_stamp
1767 << TextTable::endrow
;
1774 void PGMap::dump(ostream
& ss
) const
1777 dump_pg_stats(ss
, false);
1778 dump_pool_stats(ss
, false);
1779 dump_pg_sum_stats(ss
, false);
1783 void PGMap::dump_basic(ostream
& ss
) const
1785 ss
<< "version " << version
<< std::endl
;
1786 ss
<< "stamp " << stamp
<< std::endl
;
1787 ss
<< "last_osdmap_epoch " << last_osdmap_epoch
<< std::endl
;
1788 ss
<< "last_pg_scan " << last_pg_scan
<< std::endl
;
1789 ss
<< "full_ratio " << full_ratio
<< std::endl
;
1790 ss
<< "nearfull_ratio " << nearfull_ratio
<< std::endl
;
1793 void PGMap::dump_pg_stats(ostream
& ss
, bool brief
) const
1795 dump_pg_stats_plain(ss
, pg_stat
, brief
);
1798 void PGMap::dump_pool_stats(ostream
& ss
, bool header
) const
1803 tab
.define_column("POOLID", TextTable::LEFT
, TextTable::LEFT
);
1804 tab
.define_column("OBJECTS", TextTable::LEFT
, TextTable::RIGHT
);
1805 tab
.define_column("MISSING_ON_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1806 tab
.define_column("DEGRADED", TextTable::LEFT
, TextTable::RIGHT
);
1807 tab
.define_column("MISPLACED", TextTable::LEFT
, TextTable::RIGHT
);
1808 tab
.define_column("UNFOUND", TextTable::LEFT
, TextTable::RIGHT
);
1809 tab
.define_column("BYTES", TextTable::LEFT
, TextTable::RIGHT
);
1810 tab
.define_column("LOG", TextTable::LEFT
, TextTable::RIGHT
);
1811 tab
.define_column("DISK_LOG", TextTable::LEFT
, TextTable::RIGHT
);
1813 tab
.define_column("", TextTable::LEFT
, TextTable::LEFT
);
1814 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1815 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1816 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1817 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1818 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1819 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1820 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1821 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1824 for (auto p
= pg_pool_sum
.begin();
1825 p
!= pg_pool_sum
.end();
1828 << p
->second
.stats
.sum
.num_objects
1829 << p
->second
.stats
.sum
.num_objects_missing_on_primary
1830 << p
->second
.stats
.sum
.num_objects_degraded
1831 << p
->second
.stats
.sum
.num_objects_misplaced
1832 << p
->second
.stats
.sum
.num_objects_unfound
1833 << p
->second
.stats
.sum
.num_bytes
1834 << p
->second
.log_size
1835 << p
->second
.ondisk_log_size
1836 << TextTable::endrow
;
1842 void PGMap::dump_pg_sum_stats(ostream
& ss
, bool header
) const
1847 tab
.define_column("PG_STAT", TextTable::LEFT
, TextTable::LEFT
);
1848 tab
.define_column("OBJECTS", TextTable::LEFT
, TextTable::RIGHT
);
1849 tab
.define_column("MISSING_ON_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
1850 tab
.define_column("DEGRADED", TextTable::LEFT
, TextTable::RIGHT
);
1851 tab
.define_column("MISPLACED", TextTable::LEFT
, TextTable::RIGHT
);
1852 tab
.define_column("UNFOUND", TextTable::LEFT
, TextTable::RIGHT
);
1853 tab
.define_column("BYTES", TextTable::LEFT
, TextTable::RIGHT
);
1854 tab
.define_column("LOG", TextTable::LEFT
, TextTable::RIGHT
);
1855 tab
.define_column("DISK_LOG", TextTable::LEFT
, TextTable::RIGHT
);
1857 tab
.define_column("", TextTable::LEFT
, TextTable::LEFT
);
1858 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1859 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1860 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1861 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1862 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1863 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1864 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1865 tab
.define_column("", TextTable::LEFT
, TextTable::RIGHT
);
1869 << pg_sum
.stats
.sum
.num_objects
1870 << pg_sum
.stats
.sum
.num_objects_missing_on_primary
1871 << pg_sum
.stats
.sum
.num_objects_degraded
1872 << pg_sum
.stats
.sum
.num_objects_misplaced
1873 << pg_sum
.stats
.sum
.num_objects_unfound
1874 << pg_sum
.stats
.sum
.num_bytes
1876 << pg_sum
.ondisk_log_size
1877 << TextTable::endrow
;
1882 void PGMap::dump_osd_stats(ostream
& ss
) const
1886 tab
.define_column("OSD_STAT", TextTable::LEFT
, TextTable::LEFT
);
1887 tab
.define_column("USED", TextTable::LEFT
, TextTable::RIGHT
);
1888 tab
.define_column("AVAIL", TextTable::LEFT
, TextTable::RIGHT
);
1889 tab
.define_column("TOTAL", TextTable::LEFT
, TextTable::RIGHT
);
1890 tab
.define_column("HB_PEERS", TextTable::LEFT
, TextTable::RIGHT
);
1891 tab
.define_column("PG_SUM", TextTable::LEFT
, TextTable::RIGHT
);
1892 tab
.define_column("PRIMARY_PG_SUM", TextTable::LEFT
, TextTable::RIGHT
);
1894 for (auto p
= osd_stat
.begin();
1895 p
!= osd_stat
.end();
1898 << si_t(p
->second
.kb_used
<< 10)
1899 << si_t(p
->second
.kb_avail
<< 10)
1900 << si_t(p
->second
.kb
<< 10)
1901 << p
->second
.hb_peers
1902 << get_num_pg_by_osd(p
->first
)
1903 << get_num_primary_pg_by_osd(p
->first
)
1904 << TextTable::endrow
;
1908 << si_t(osd_sum
.kb_used
<< 10)
1909 << si_t(osd_sum
.kb_avail
<< 10)
1910 << si_t(osd_sum
.kb
<< 10)
1911 << TextTable::endrow
;
1916 void PGMap::dump_osd_sum_stats(ostream
& ss
) const
1920 tab
.define_column("OSD_STAT", TextTable::LEFT
, TextTable::LEFT
);
1921 tab
.define_column("USED", TextTable::LEFT
, TextTable::RIGHT
);
1922 tab
.define_column("AVAIL", TextTable::LEFT
, TextTable::RIGHT
);
1923 tab
.define_column("TOTAL", TextTable::LEFT
, TextTable::RIGHT
);
1926 << si_t(osd_sum
.kb_used
<< 10)
1927 << si_t(osd_sum
.kb_avail
<< 10)
1928 << si_t(osd_sum
.kb
<< 10)
1929 << TextTable::endrow
;
1934 void PGMap::get_stuck_stats(
1935 int types
, const utime_t cutoff
,
1936 mempool::pgmap::unordered_map
<pg_t
, pg_stat_t
>& stuck_pgs
) const
1939 for (auto i
= pg_stat
.begin();
1942 utime_t val
= cutoff
; // don't care about >= cutoff so that is infinity
1944 if ((types
& STUCK_INACTIVE
) && !(i
->second
.state
& PG_STATE_ACTIVE
)) {
1945 if (i
->second
.last_active
< val
)
1946 val
= i
->second
.last_active
;
1949 if ((types
& STUCK_UNCLEAN
) && !(i
->second
.state
& PG_STATE_CLEAN
)) {
1950 if (i
->second
.last_clean
< val
)
1951 val
= i
->second
.last_clean
;
1954 if ((types
& STUCK_DEGRADED
) && (i
->second
.state
& PG_STATE_DEGRADED
)) {
1955 if (i
->second
.last_undegraded
< val
)
1956 val
= i
->second
.last_undegraded
;
1959 if ((types
& STUCK_UNDERSIZED
) && (i
->second
.state
& PG_STATE_UNDERSIZED
)) {
1960 if (i
->second
.last_fullsized
< val
)
1961 val
= i
->second
.last_fullsized
;
1964 if ((types
& STUCK_STALE
) && (i
->second
.state
& PG_STATE_STALE
)) {
1965 if (i
->second
.last_unstale
< val
)
1966 val
= i
->second
.last_unstale
;
1969 // val is now the earliest any of the requested stuck states began
1971 stuck_pgs
[i
->first
] = i
->second
;
1976 bool PGMap::get_stuck_counts(const utime_t cutoff
, map
<string
, int>& note
) const
1984 for (auto i
= pg_stat
.begin();
1987 if (! (i
->second
.state
& PG_STATE_ACTIVE
)) {
1988 if (i
->second
.last_active
< cutoff
)
1991 if (! (i
->second
.state
& PG_STATE_CLEAN
)) {
1992 if (i
->second
.last_clean
< cutoff
)
1995 if (i
->second
.state
& PG_STATE_DEGRADED
) {
1996 if (i
->second
.last_undegraded
< cutoff
)
1999 if (i
->second
.state
& PG_STATE_UNDERSIZED
) {
2000 if (i
->second
.last_fullsized
< cutoff
)
2003 if (i
->second
.state
& PG_STATE_STALE
) {
2004 if (i
->second
.last_unstale
< cutoff
)
2010 note
["stuck inactive"] = inactive
;
2013 note
["stuck unclean"] = unclean
;
2016 note
["stuck undersized"] = undersized
;
2019 note
["stuck degraded"] = degraded
;
2022 note
["stuck stale"] = stale
;
2024 return inactive
|| unclean
|| undersized
|| degraded
|| stale
;
2027 void PGMap::dump_stuck(Formatter
*f
, int types
, utime_t cutoff
) const
2029 mempool::pgmap::unordered_map
<pg_t
, pg_stat_t
> stuck_pg_stats
;
2030 get_stuck_stats(types
, cutoff
, stuck_pg_stats
);
2031 f
->open_array_section("stuck_pg_stats");
2032 for (auto i
= stuck_pg_stats
.begin();
2033 i
!= stuck_pg_stats
.end();
2035 f
->open_object_section("pg_stat");
2036 f
->dump_stream("pgid") << i
->first
;
2043 void PGMap::dump_stuck_plain(ostream
& ss
, int types
, utime_t cutoff
) const
2045 mempool::pgmap::unordered_map
<pg_t
, pg_stat_t
> stuck_pg_stats
;
2046 get_stuck_stats(types
, cutoff
, stuck_pg_stats
);
2047 if (!stuck_pg_stats
.empty())
2048 dump_pg_stats_plain(ss
, stuck_pg_stats
, true);
2051 int PGMap::dump_stuck_pg_stats(
2055 vector
<string
>& args
) const
2057 int stuck_types
= 0;
2059 for (auto i
= args
.begin(); i
!= args
.end(); ++i
) {
2060 if (*i
== "inactive")
2061 stuck_types
|= PGMap::STUCK_INACTIVE
;
2062 else if (*i
== "unclean")
2063 stuck_types
|= PGMap::STUCK_UNCLEAN
;
2064 else if (*i
== "undersized")
2065 stuck_types
|= PGMap::STUCK_UNDERSIZED
;
2066 else if (*i
== "degraded")
2067 stuck_types
|= PGMap::STUCK_DEGRADED
;
2068 else if (*i
== "stale")
2069 stuck_types
|= PGMap::STUCK_STALE
;
2071 ds
<< "Unknown type: " << *i
<< std::endl
;
2076 utime_t
now(ceph_clock_now());
2077 utime_t cutoff
= now
- utime_t(threshold
, 0);
2080 dump_stuck_plain(ds
, stuck_types
, cutoff
);
2082 dump_stuck(f
, stuck_types
, cutoff
);
2089 void PGMap::dump_osd_perf_stats(Formatter
*f
) const
2091 f
->open_array_section("osd_perf_infos");
2092 for (auto i
= osd_stat
.begin();
2093 i
!= osd_stat
.end();
2095 f
->open_object_section("osd");
2096 f
->dump_int("id", i
->first
);
2098 f
->open_object_section("perf_stats");
2099 i
->second
.os_perf_stat
.dump(f
);
2106 void PGMap::print_osd_perf_stats(std::ostream
*ss
) const
2109 tab
.define_column("osd", TextTable::LEFT
, TextTable::RIGHT
);
2110 tab
.define_column("commit_latency(ms)", TextTable::LEFT
, TextTable::RIGHT
);
2111 tab
.define_column("apply_latency(ms)", TextTable::LEFT
, TextTable::RIGHT
);
2112 for (auto i
= osd_stat
.begin();
2113 i
!= osd_stat
.end();
2116 tab
<< i
->second
.os_perf_stat
.os_commit_latency
;
2117 tab
<< i
->second
.os_perf_stat
.os_apply_latency
;
2118 tab
<< TextTable::endrow
;
2123 void PGMap::dump_osd_blocked_by_stats(Formatter
*f
) const
2125 f
->open_array_section("osd_blocked_by_infos");
2126 for (auto i
= blocked_by_sum
.begin();
2127 i
!= blocked_by_sum
.end();
2129 f
->open_object_section("osd");
2130 f
->dump_int("id", i
->first
);
2131 f
->dump_int("num_blocked", i
->second
);
2136 void PGMap::print_osd_blocked_by_stats(std::ostream
*ss
) const
2139 tab
.define_column("osd", TextTable::LEFT
, TextTable::RIGHT
);
2140 tab
.define_column("num_blocked", TextTable::LEFT
, TextTable::RIGHT
);
2141 for (auto i
= blocked_by_sum
.begin();
2142 i
!= blocked_by_sum
.end();
2146 tab
<< TextTable::endrow
;
2153 * update aggregated delta
2155 * @param cct ceph context
2156 * @param ts Timestamp for the stats being delta'ed
2157 * @param old_pool_sum Previous stats sum
2158 * @param last_ts Last timestamp for pool
2159 * @param result_pool_sum Resulting stats
2160 * @param result_pool_delta Resulting pool delta
2161 * @param result_ts_delta Resulting timestamp delta
2162 * @param delta_avg_list List of last N computed deltas, used to average
2164 void PGMap::update_delta(
2167 const pool_stat_t
& old_pool_sum
,
2169 const pool_stat_t
& current_pool_sum
,
2170 pool_stat_t
*result_pool_delta
,
2171 utime_t
*result_ts_delta
,
2172 mempool::pgmap::list
<pair
<pool_stat_t
,utime_t
> > *delta_avg_list
)
2174 /* @p ts is the timestamp we want to associate with the data
2175 * in @p old_pool_sum, and on which we will base ourselves to
2176 * calculate the delta, stored in 'delta_t'.
2179 delta_t
= ts
; // start with the provided timestamp
2180 delta_t
-= *last_ts
; // take the last timestamp we saw
2181 *last_ts
= ts
; // @p ts becomes the last timestamp we saw
2183 // adjust delta_t, quick start if there is no update in a long period
2184 delta_t
= std::min(delta_t
,
2185 utime_t(2 * (cct
? cct
->_conf
->mon_delta_reset_interval
: 10), 0));
2187 // calculate a delta, and average over the last 6 deltas by default.
2188 /* start by taking a copy of our current @p result_pool_sum, and by
2189 * taking out the stats from @p old_pool_sum. This generates a stats
2190 * delta. Stash this stats delta in @p delta_avg_list, along with the
2191 * timestamp delta for these results.
2193 pool_stat_t d
= current_pool_sum
;
2194 d
.stats
.sub(old_pool_sum
.stats
);
2195 delta_avg_list
->push_back(make_pair(d
,delta_t
));
2196 *result_ts_delta
+= delta_t
;
2198 /* Aggregate current delta, and take out the last seen delta (if any) to
2201 result_pool_delta
->stats
.add(d
.stats
);
2202 size_t s
= MAX(1, cct
? cct
->_conf
->mon_stat_smooth_intervals
: 1);
2203 if (delta_avg_list
->size() > s
) {
2204 result_pool_delta
->stats
.sub(delta_avg_list
->front().first
.stats
);
2205 *result_ts_delta
-= delta_avg_list
->front().second
;
2206 delta_avg_list
->pop_front();
2211 * update aggregated delta
2213 * @param cct ceph context
2214 * @param ts Timestamp
2215 * @param pg_sum_old Old pg_sum
2217 void PGMap::update_global_delta(CephContext
*cct
,
2218 const utime_t ts
, const pool_stat_t
& pg_sum_old
)
2220 update_delta(cct
, ts
, pg_sum_old
, &stamp
, pg_sum
, &pg_sum_delta
,
2221 &stamp_delta
, &pg_sum_deltas
);
2225 * Update a given pool's deltas
2227 * @param cct Ceph Context
2228 * @param ts Timestamp for the stats being delta'ed
2229 * @param pool Pool's id
2230 * @param old_pool_sum Previous stats sum
2232 void PGMap::update_one_pool_delta(
2235 const uint64_t pool
,
2236 const pool_stat_t
& old_pool_sum
)
2238 if (per_pool_sum_deltas
.count(pool
) == 0) {
2239 assert(per_pool_sum_deltas_stamps
.count(pool
) == 0);
2240 assert(per_pool_sum_delta
.count(pool
) == 0);
2243 auto& sum_delta
= per_pool_sum_delta
[pool
];
2245 update_delta(cct
, ts
, old_pool_sum
, &sum_delta
.second
, pg_pool_sum
[pool
],
2246 &sum_delta
.first
, &per_pool_sum_deltas_stamps
[pool
],
2247 &per_pool_sum_deltas
[pool
]);
2251 * Update pools' deltas
2253 * @param cct CephContext
2254 * @param ts Timestamp for the stats being delta'ed
2255 * @param pg_pool_sum_old Map of pool stats for delta calcs.
2257 void PGMap::update_pool_deltas(
2258 CephContext
*cct
, const utime_t ts
,
2259 const mempool::pgmap::unordered_map
<uint64_t,pool_stat_t
>& pg_pool_sum_old
)
2261 for (auto it
= pg_pool_sum_old
.begin();
2262 it
!= pg_pool_sum_old
.end(); ++it
) {
2263 update_one_pool_delta(cct
, ts
, it
->first
, it
->second
);
2267 void PGMap::clear_delta()
2269 pg_sum_delta
= pool_stat_t();
2270 pg_sum_deltas
.clear();
2271 stamp_delta
= utime_t();
2274 void PGMap::generate_test_instances(list
<PGMap
*>& o
)
2276 o
.push_back(new PGMap
);
2277 list
<Incremental
*> inc
;
2278 Incremental::generate_test_instances(inc
);
2281 while (!inc
.empty()) {
2282 PGMap
*pmp
= new PGMap();
2285 o
.back()->apply_incremental(NULL
, *inc
.front());
2291 void PGMap::get_filtered_pg_stats(uint32_t state
, int64_t poolid
, int64_t osdid
,
2292 bool primary
, set
<pg_t
>& pgs
) const
2294 for (auto i
= pg_stat
.begin();
2297 if ((poolid
>= 0) && (uint64_t(poolid
) != i
->first
.pool()))
2299 if ((osdid
>= 0) && !(i
->second
.is_acting_osd(osdid
,primary
)))
2301 if (!(i
->second
.state
& state
))
2303 pgs
.insert(i
->first
);
2307 void PGMap::dump_filtered_pg_stats(Formatter
*f
, set
<pg_t
>& pgs
) const
2309 f
->open_array_section("pg_stats");
2310 for (auto i
= pgs
.begin(); i
!= pgs
.end(); ++i
) {
2311 const pg_stat_t
& st
= pg_stat
.at(*i
);
2312 f
->open_object_section("pg_stat");
2313 f
->dump_stream("pgid") << *i
;
2320 void PGMap::dump_filtered_pg_stats(ostream
& ss
, set
<pg_t
>& pgs
) const
2324 tab
.define_column("PG_STAT", TextTable::LEFT
, TextTable::LEFT
);
2325 tab
.define_column("OBJECTS", TextTable::LEFT
, TextTable::RIGHT
);
2326 tab
.define_column("MISSING_ON_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
2327 tab
.define_column("DEGRADED", TextTable::LEFT
, TextTable::RIGHT
);
2328 tab
.define_column("MISPLACED", TextTable::LEFT
, TextTable::RIGHT
);
2329 tab
.define_column("UNFOUND", TextTable::LEFT
, TextTable::RIGHT
);
2330 tab
.define_column("BYTES", TextTable::LEFT
, TextTable::RIGHT
);
2331 tab
.define_column("LOG", TextTable::LEFT
, TextTable::RIGHT
);
2332 tab
.define_column("DISK_LOG", TextTable::LEFT
, TextTable::RIGHT
);
2333 tab
.define_column("STATE", TextTable::LEFT
, TextTable::RIGHT
);
2334 tab
.define_column("STATE_STAMP", TextTable::LEFT
, TextTable::RIGHT
);
2335 tab
.define_column("VERSION", TextTable::LEFT
, TextTable::RIGHT
);
2336 tab
.define_column("REPORTED", TextTable::LEFT
, TextTable::RIGHT
);
2337 tab
.define_column("UP", TextTable::LEFT
, TextTable::RIGHT
);
2338 tab
.define_column("UP_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
2339 tab
.define_column("ACTING", TextTable::LEFT
, TextTable::RIGHT
);
2340 tab
.define_column("ACTING_PRIMARY", TextTable::LEFT
, TextTable::RIGHT
);
2341 tab
.define_column("LAST_SCRUB", TextTable::LEFT
, TextTable::RIGHT
);
2342 tab
.define_column("SCRUB_STAMP", TextTable::LEFT
, TextTable::RIGHT
);
2343 tab
.define_column("LAST_DEEP_SCRUB", TextTable::LEFT
, TextTable::RIGHT
);
2344 tab
.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT
, TextTable::RIGHT
);
2346 for (auto i
= pgs
.begin(); i
!= pgs
.end(); ++i
) {
2347 const pg_stat_t
& st
= pg_stat
.at(*i
);
2349 ostringstream reported
;
2350 reported
<< st
.reported_epoch
<< ":" << st
.reported_seq
;
2353 << st
.stats
.sum
.num_objects
2354 << st
.stats
.sum
.num_objects_missing_on_primary
2355 << st
.stats
.sum
.num_objects_degraded
2356 << st
.stats
.sum
.num_objects_misplaced
2357 << st
.stats
.sum
.num_objects_unfound
2358 << st
.stats
.sum
.num_bytes
2360 << st
.ondisk_log_size
2361 << pg_state_string(st
.state
)
2368 << st
.acting_primary
2370 << st
.last_scrub_stamp
2371 << st
.last_deep_scrub
2372 << st
.last_deep_scrub_stamp
2373 << TextTable::endrow
;
2381 // Only called with a single bit set in "what"
// Appends HEALTH_WARN lines to *detail for the pgs in stuck_pgs.  Each line
// reads "pg <id> is stuck <whatname> ...", followed by the pg's current state
// and last acting set.
// NOTE(review): the embedded line numbers skip inside this function, so some
// original lines (parameters, the switch header, breaks) are absent from this
// listing; the comments below describe only what is visible.
2382 static void note_stuck_detail(
2384 mempool::pgmap::unordered_map
<pg_t
,pg_stat_t
>& stuck_pgs
,
2386 list
<pair
<health_status_t
,string
> > *detail
)
2389 for (auto p
= stuck_pgs
.begin();
2390 p
!= stuck_pgs
.end();
2394 const char *whatname
= 0;
// Each stuck category selects the matching "last time it was fine" timestamp
// of the pg and a label used in the message text.
2396 case PGMap::STUCK_INACTIVE
:
2397 since
= p
->second
.last_active
;
2398 whatname
= "inactive";
2400 case PGMap::STUCK_UNCLEAN
:
2401 since
= p
->second
.last_clean
;
2402 whatname
= "unclean";
2404 case PGMap::STUCK_DEGRADED
:
2405 since
= p
->second
.last_undegraded
;
2406 whatname
= "degraded";
2408 case PGMap::STUCK_UNDERSIZED
:
2409 since
= p
->second
.last_fullsized
;
2410 whatname
= "undersized";
2412 case PGMap::STUCK_STALE
:
2413 since
= p
->second
.last_unstale
;
// When the pre-decremented detail budget hits zero, a single line counting the
// pgs not listed individually (stuck_pgs.size() - n) is pushed.
2419 if (--max_detail
== 0) {
2421 ss
<< (stuck_pgs
.size() - n
) << " more pgs are also stuck " << whatname
;
2422 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
2426 ss
<< "pg " << p
->first
<< " is stuck " << whatname
;
// A default-constructed timestamp is reported as "since forever"; the visible
// alternative prints the time elapsed since that timestamp.
2427 if (since
== utime_t()) {
2428 ss
<< " since forever";
2430 utime_t dur
= ceph_clock_now() - since
;
2431 ss
<< " for " << dur
;
2433 ss
<< ", current state " << pg_state_string(p
->second
.state
)
2434 << ", last acting " << p
->second
.acting
;
2435 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
// Scans the buckets of histogram h from the highest index down to index 1
// (bucket 0 is never visited: the loop condition is i > 0) and returns the
// pair (warn, error).  For bucket i the bound "ub" is (1 << i) / 1000.0 and is
// printed as seconds; detail lines read
// "<count> ops are blocked > <ub> sec<suffix>" with severity sev.
// NOTE(review): the lines that accumulate warn/error and that may raise sev
// above HEALTH_WARN are not visible in this listing.
2439 static pair
<int,int> _warn_slow_request_histogram(
2441 const pow2_hist_t
& h
,
2443 list
<pair
<health_status_t
,string
> >& summary
,
2444 list
<pair
<health_status_t
,string
> > *detail
)
// NOTE(review): the condition guarding this early return is not visible;
// presumably it covers an empty histogram, which the unsigned
// "h.h.size() - 1" below would otherwise wrap on -- confirm.
2447 return make_pair(0, 0);
2449 unsigned warn
= 0, error
= 0;
// Product of the warning age and the error ratio from the configuration.
2451 cct
->_conf
->mon_osd_warn_op_age
* cct
->_conf
->mon_osd_err_op_age_ratio
;
2452 for (unsigned i
= h
.h
.size() - 1; i
> 0; --i
) {
2453 float ub
= (float)(1 << i
) / 1000.0;
2454 if (ub
< cct
->_conf
->mon_osd_warn_op_age
)
2457 auto sev
= HEALTH_WARN
;
2466 ss
<< h
.h
[i
] << " ops are blocked > " << ub
<< " sec" << suffix
;
2467 detail
->push_back(make_pair(sev
, ss
.str()));
2471 return make_pair(warn
, error
);
/// Selects which kind of scrub an "unscrubbed pg" detail message refers to.
enum class scrubbed_or_deepscrubbed_t {
  SCRUBBED,      ///< message about the regular scrub (last_scrub_stamp)
  DEEPSCRUBBED   ///< message about the deep scrub (last_deep_scrub_stamp)
};
2477 void print_unscrubbed_detailed(
2478 const std::pair
<const pg_t
,pg_stat_t
> &pg_entry
,
2479 list
<pair
<health_status_t
,string
> > *detail
,
2480 scrubbed_or_deepscrubbed_t how_scrubbed
)
2482 std::stringstream ss
;
2483 const auto& pg_stat(pg_entry
.second
);
2485 ss
<< "pg " << pg_entry
.first
<< " is not ";
2486 if (how_scrubbed
== scrubbed_or_deepscrubbed_t::SCRUBBED
) {
2487 ss
<< "scrubbed, last_scrub_stamp "
2488 << pg_stat
.last_scrub_stamp
;
2489 } else if (how_scrubbed
== scrubbed_or_deepscrubbed_t::DEEPSCRUBBED
) {
2490 ss
<< "deep-scrubbed, last_deep_scrub_stamp "
2491 << pg_stat
.last_deep_scrub_stamp
;
2494 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
2497 using pg_stat_map_t
= const mempool::pgmap::unordered_map
<pg_t
,pg_stat_t
>;
2499 void print_unscrubbed_pgs(
2500 pg_stat_map_t
& pg_stats
,
2501 list
<pair
<health_status_t
,string
> > &summary
,
2502 list
<pair
<health_status_t
,string
> > *detail
,
2503 const CephContext
* cct
)
2505 if (cct
->_conf
->mon_warn_not_scrubbed
== 0 &&
2506 cct
->_conf
->mon_warn_not_deep_scrubbed
== 0)
2510 const utime_t now
= ceph_clock_now();
2511 for (const auto& pg_entry
: pg_stats
) {
2512 const auto& pg_stat(pg_entry
.second
);
2513 const utime_t time_since_ls
= now
- pg_stat
.last_scrub_stamp
;
2514 const utime_t time_since_lds
= now
- pg_stat
.last_deep_scrub_stamp
;
2516 const int mon_warn_not_scrubbed
=
2517 cct
->_conf
->mon_warn_not_scrubbed
+ cct
->_conf
->mon_scrub_interval
;
2519 const int mon_warn_not_deep_scrubbed
=
2520 cct
->_conf
->mon_warn_not_deep_scrubbed
+ cct
->_conf
->osd_deep_scrub_interval
;
2522 bool not_scrubbed
= (time_since_ls
>= mon_warn_not_scrubbed
&&
2523 cct
->_conf
->mon_warn_not_scrubbed
!= 0);
2525 bool not_deep_scrubbed
= (time_since_lds
>= mon_warn_not_deep_scrubbed
&&
2526 cct
->_conf
->mon_warn_not_deep_scrubbed
!= 0);
2528 if (detail
!= nullptr) {
2530 print_unscrubbed_detailed(pg_entry
,
2532 scrubbed_or_deepscrubbed_t::SCRUBBED
);
2534 if (not_deep_scrubbed
) {
2535 print_unscrubbed_detailed(pg_entry
,
2537 scrubbed_or_deepscrubbed_t::DEEPSCRUBBED
);
2540 if (not_scrubbed
|| not_deep_scrubbed
) {
2545 if (pgs_count
> 0) {
2546 std::stringstream ss
;
2547 ss
<< pgs_count
<< " unscrubbed pgs";
2548 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
// Collects health messages derived from the PG map.  The visible code covers:
// per-state pg counts, stuck pgs, individually listed unhealthy pgs, slow
// requests, OSD usage spread, recovery, cache pools near their target max,
// scrub errors, PGs-per-OSD bounds, pg_num > pgp_num, per-pool object skew and
// unscrubbed pgs.  Messages are pushed to `summary` and, in places, to
// `*detail`.
// NOTE(review): the embedded line numbers skip throughout this function, so
// some original lines are absent from this listing; comments describe only
// what is visible.
2554 void PGMap::get_health(
2556 const OSDMap
& osdmap
,
2557 list
<pair
<health_status_t
,string
> >& summary
,
2558 list
<pair
<health_status_t
,string
> > *detail
) const
// Tally pgs per unhealthy state bit.  The tests are independent ifs, so one
// num_pg_by_state entry can add to several labels.
2560 map
<string
,int> note
;
2561 auto p
= num_pg_by_state
.begin();
2562 auto p_end
= num_pg_by_state
.end();
2563 for (; p
!= p_end
; ++p
) {
2564 if (p
->first
& PG_STATE_STALE
)
2565 note
["stale"] += p
->second
;
2566 if (p
->first
& PG_STATE_DOWN
)
2567 note
["down"] += p
->second
;
2568 if (p
->first
& PG_STATE_UNDERSIZED
)
2569 note
["undersized"] += p
->second
;
2570 if (p
->first
& PG_STATE_DEGRADED
)
2571 note
["degraded"] += p
->second
;
2572 if (p
->first
& PG_STATE_INCONSISTENT
)
2573 note
["inconsistent"] += p
->second
;
2574 if (p
->first
& PG_STATE_PEERING
)
2575 note
["peering"] += p
->second
;
2576 if (p
->first
& PG_STATE_REPAIR
)
2577 note
["repair"] += p
->second
;
2578 if (p
->first
& PG_STATE_RECOVERING
)
2579 note
["recovering"] += p
->second
;
2580 if (p
->first
& PG_STATE_RECOVERY_WAIT
)
2581 note
["recovery_wait"] += p
->second
;
2582 if (p
->first
& PG_STATE_INCOMPLETE
)
2583 note
["incomplete"] += p
->second
;
2584 if (p
->first
& PG_STATE_BACKFILL_WAIT
)
2585 note
["backfill_wait"] += p
->second
;
2586 if (p
->first
& PG_STATE_BACKFILL
)
2587 note
["backfilling"] += p
->second
;
2588 if (p
->first
& PG_STATE_BACKFILL_TOOFULL
)
2589 note
["backfill_toofull"] += p
->second
;
2590 if (p
->first
& PG_STATE_RECOVERY_TOOFULL
)
2591 note
["recovery_toofull"] += p
->second
;
2594 mempool::pgmap::unordered_map
<pg_t
, pg_stat_t
> stuck_pgs
;
2595 utime_t
now(ceph_clock_now());
// cutoff = now - mon_pg_stuck_threshold seconds; it is the reference time
// handed to get_stuck_counts() and get_stuck_stats().
2596 utime_t cutoff
= now
- utime_t(cct
->_conf
->mon_pg_stuck_threshold
, 0);
2597 uint64_t num_inactive_pgs
= 0;
2600 // we need to collect details of stuck pgs, first do a quick check
2601 // whether this will yield any results
2602 if (get_stuck_counts(cutoff
, note
)) {
2604 // there are stuck pgs. gather details for specified statuses
2605 // only if we know that there are pgs stuck in that status
// Each category below overwrites its note[] count with the size of the
// gathered set; "stuck inactive" and "stuck stale" also add to
// num_inactive_pgs.
2607 if (note
.find("stuck inactive") != note
.end()) {
2608 get_stuck_stats(PGMap::STUCK_INACTIVE
, cutoff
, stuck_pgs
);
2609 note
["stuck inactive"] = stuck_pgs
.size();
2610 num_inactive_pgs
+= stuck_pgs
.size();
2611 note_stuck_detail(PGMap::STUCK_INACTIVE
, stuck_pgs
,
2612 cct
->_conf
->mon_health_max_detail
, detail
);
2616 if (note
.find("stuck unclean") != note
.end()) {
2617 get_stuck_stats(PGMap::STUCK_UNCLEAN
, cutoff
, stuck_pgs
);
2618 note
["stuck unclean"] = stuck_pgs
.size();
2619 note_stuck_detail(PGMap::STUCK_UNCLEAN
, stuck_pgs
,
2620 cct
->_conf
->mon_health_max_detail
, detail
);
2624 if (note
.find("stuck undersized") != note
.end()) {
2625 get_stuck_stats(PGMap::STUCK_UNDERSIZED
, cutoff
, stuck_pgs
);
2626 note
["stuck undersized"] = stuck_pgs
.size();
2627 note_stuck_detail(PGMap::STUCK_UNDERSIZED
, stuck_pgs
,
2628 cct
->_conf
->mon_health_max_detail
, detail
);
2632 if (note
.find("stuck degraded") != note
.end()) {
2633 get_stuck_stats(PGMap::STUCK_DEGRADED
, cutoff
, stuck_pgs
);
2634 note
["stuck degraded"] = stuck_pgs
.size();
2635 note_stuck_detail(PGMap::STUCK_DEGRADED
, stuck_pgs
,
2636 cct
->_conf
->mon_health_max_detail
, detail
);
2640 if (note
.find("stuck stale") != note
.end()) {
2641 get_stuck_stats(PGMap::STUCK_STALE
, cutoff
, stuck_pgs
);
2642 note
["stuck stale"] = stuck_pgs
.size();
2643 num_inactive_pgs
+= stuck_pgs
.size();
2644 note_stuck_detail(PGMap::STUCK_STALE
, stuck_pgs
,
2645 cct
->_conf
->mon_health_max_detail
, detail
);
// Counts-only path: the inactive total is taken from the note[] entries that
// get_stuck_counts() filled in.
2649 get_stuck_counts(cutoff
, note
);
2650 auto p
= note
.find("stuck inactive");
2651 if (p
!= note
.end())
2652 num_inactive_pgs
+= p
->second
;
2653 p
= note
.find("stuck stale");
2654 if (p
!= note
.end())
2655 num_inactive_pgs
+= p
->second
;
// HEALTH_ERR once the inactive total reaches mon_pg_min_inactive; a setting of
// 0 or less disables this check.
2658 if (cct
->_conf
->mon_pg_min_inactive
> 0 &&
2659 num_inactive_pgs
>= cct
->_conf
->mon_pg_min_inactive
) {
2661 ss
<< num_inactive_pgs
<< " pgs are stuck inactive for more than " << cct
->_conf
->mon_pg_stuck_threshold
<< " seconds";
2662 summary
.push_back(make_pair(HEALTH_ERR
, ss
.str()));
// One HEALTH_WARN summary line per label collected in note.
2665 if (!note
.empty()) {
2666 for (auto p
= note
.begin(); p
!= note
.end(); ++p
) {
2668 ss
<< p
->second
<< " pgs " << p
->first
;
2669 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
2672 int n
= 0, more
= 0;
2673 int max
= cct
->_conf
->mon_health_max_detail
;
// Detail lines for pgs whose state matches the mask below and that are not in
// stuck_pgs.
2674 for (auto p
= pg_stat
.begin();
2677 if ((p
->second
.state
& (PG_STATE_STALE
|
2679 PG_STATE_UNDERSIZED
|
2681 PG_STATE_INCONSISTENT
|
2684 PG_STATE_RECOVERING
|
2685 PG_STATE_RECOVERY_WAIT
|
2686 PG_STATE_RECOVERY_TOOFULL
|
2687 PG_STATE_INCOMPLETE
|
2688 PG_STATE_BACKFILL_WAIT
|
2690 PG_STATE_BACKFILL_TOOFULL
)) &&
2691 stuck_pgs
.count(p
->first
) == 0) {
2700 ss
<< "pg " << p
->first
<< " is " << pg_state_string(p
->second
.state
);
2701 ss
<< ", acting " << p
->second
.acting
;
2702 if (p
->second
.stats
.sum
.num_objects_unfound
)
2703 ss
<< ", " << p
->second
.stats
.sum
.num_objects_unfound
<< " unfound";
// For incomplete pgs, a min_size hint is added only when the pool is known to
// the osdmap and its min_size is above 1.
2704 if (p
->second
.state
& PG_STATE_INCOMPLETE
) {
2705 const pg_pool_t
*pi
= osdmap
.get_pg_pool(p
->first
.pool());
2706 if (pi
&& pi
->min_size
> 1) {
2707 ss
<< " (reducing pool " << osdmap
.get_pool_name(p
->first
.pool())
2708 << " min_size from " << (int)pi
->min_size
2709 << " may help; search ceph.com/docs for 'incomplete')";
2712 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
2717 ss
<< more
<< " more pgs are also unhealthy";
2718 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
// Slow requests: the cluster-wide histogram is evaluated first, with a NULL
// detail pointer; sum is the (warn, error) pair it returns.
2724 if (cct
->_conf
->mon_osd_warn_op_age
> 0 &&
2725 osd_sum
.op_queue_age_hist
.upper_bound() > cct
->_conf
->mon_osd_warn_op_age
) {
2726 auto sum
= _warn_slow_request_histogram(
2727 cct
, osd_sum
.op_queue_age_hist
, "", summary
, NULL
);
2728 if (sum
.first
> 0 || sum
.second
> 0) {
2729 if (sum
.first
> 0) {
2731 ss
<< sum
.first
<< " requests are blocked > "
2732 << cct
->_conf
->mon_osd_warn_op_age
2734 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
2736 if (sum
.second
> 0) {
// NOTE(review): this branch is entered when sum.second > 0 and reports the
// higher age (warn age * err ratio) at HEALTH_ERR, yet it prints sum.first.
// sum.second looks intended here -- confirm and fix.
2738 ss
<< sum
.first
<< " requests are blocked > "
2739 << (cct
->_conf
->mon_osd_warn_op_age
*
2740 cct
->_conf
->mon_osd_err_op_age_ratio
)
2742 summary
.push_back(make_pair(HEALTH_ERR
, ss
.str()));
2746 unsigned num_warn
= 0, num_err
= 0;
2747 // do per-osd warnings
2748 for (auto p
= osd_stat
.begin();
2749 p
!= osd_stat
.end();
2751 auto sum
= _warn_slow_request_histogram(
2753 p
->second
.op_queue_age_hist
,
2754 string(" on osd.") + stringify(p
->first
),
2763 ss2
<< num_err
<< " osds have very slow requests";
2764 summary
.push_back(make_pair(HEALTH_ERR
, ss2
.str()));
2765 detail
->push_back(make_pair(HEALTH_ERR
, ss2
.str()));
2769 ss2
<< num_warn
<< " osds have slow requests";
2770 summary
.push_back(make_pair(HEALTH_WARN
, ss2
.str()));
2771 detail
->push_back(make_pair(HEALTH_WARN
, ss2
.str()));
// OSD usage spread: usage is the ratio kb_used / kb per OSD; the check is
// skipped when mon_warn_osd_usage_min_max_delta is zero.
2777 if (cct
->_conf
->mon_warn_osd_usage_min_max_delta
) {
2778 float max_osd_usage
= 0.0, min_osd_usage
= 1.0;
2779 for (auto p
= osd_stat
.begin(); p
!= osd_stat
.end(); ++p
) {
2780 // kb should never be 0, but avoid divide by zero in case of corruption
2781 if (p
->second
.kb
<= 0)
2783 float usage
= ((float)p
->second
.kb_used
) / ((float)p
->second
.kb
);
2784 if (usage
> max_osd_usage
)
2785 max_osd_usage
= usage
;
2786 if (usage
< min_osd_usage
)
2787 min_osd_usage
= usage
;
2789 float diff
= max_osd_usage
- min_osd_usage
;
2790 if (diff
> cct
->_conf
->mon_warn_osd_usage_min_max_delta
) {
// NOTE(review): the values are ratios, so roundf(x*1000.0)/100.0 yields one
// tenth of the percentage (0.25 prints as "2.5%").  Dividing by 10.0 looks
// intended for all four values -- confirm and fix.
2792 ss
<< "difference between min (" << roundf(min_osd_usage
*1000.0)/100.0
2793 << "%) and max (" << roundf(max_osd_usage
*1000.0)/100.0
2794 << "%) osd usage " << roundf(diff
*1000.0)/100.0 << "% > "
2795 << roundf(cct
->_conf
->mon_warn_osd_usage_min_max_delta
*1000.0)/100.0
2796 << " (mon_warn_osd_usage_min_max_delta)";
2797 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
2799 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
// Recovery: every string produced by overall_recovery_summary() is reported
// with a "recovery " prefix.
2805 overall_recovery_summary(NULL
, &sl
);
2806 for (auto p
= sl
.begin(); p
!= sl
.end(); ++p
) {
2807 summary
.push_back(make_pair(HEALTH_WARN
, "recovery " + *p
));
2809 detail
->push_back(make_pair(HEALTH_WARN
, "recovery " + *p
));
2812 // near-target max pools
2813 auto& pools
= osdmap
.get_pools();
2814 for (auto p
= pools
.begin();
2815 p
!= pools
.end(); ++p
) {
2816 if ((!p
->second
.target_max_objects
&& !p
->second
.target_max_bytes
) ||
2817 !pg_pool_sum
.count(p
->first
))
2819 bool nearfull
= false;
2820 const string
& name
= osdmap
.get_pool_name(p
->first
);
2821 const pool_stat_t
& st
= get_pg_pool_sum_stat(p
->first
);
// ratio is in millionths: the pool's cache_target_full_ratio_micro plus the
// fraction mon_cache_target_full_warn_ratio of the remaining headroom.
// Hit-set archive objects/bytes are subtracted before comparing.
2822 uint64_t ratio
= p
->second
.cache_target_full_ratio_micro
+
2823 ((1000000 - p
->second
.cache_target_full_ratio_micro
) *
2824 cct
->_conf
->mon_cache_target_full_warn_ratio
);
2825 if (p
->second
.target_max_objects
&&
2826 (uint64_t)(st
.stats
.sum
.num_objects
-
2827 st
.stats
.sum
.num_objects_hit_set_archive
) >
2828 p
->second
.target_max_objects
* (ratio
/ 1000000.0)) {
2832 ss
<< "cache pool '" << name
<< "' with "
2833 << si_t(st
.stats
.sum
.num_objects
)
2834 << " objects at/near target max "
2835 << si_t(p
->second
.target_max_objects
) << " objects";
2836 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
2839 if (p
->second
.target_max_bytes
&&
2840 (uint64_t)(st
.stats
.sum
.num_bytes
-
2841 st
.stats
.sum
.num_bytes_hit_set_archive
) >
2842 p
->second
.target_max_bytes
* (ratio
/ 1000000.0)) {
2846 ss
<< "cache pool '" << name
2847 << "' with " << si_t(st
.stats
.sum
.num_bytes
)
2848 << "B at/near target max "
2849 << si_t(p
->second
.target_max_bytes
) << "B";
2850 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
2855 ss
<< "'" << name
<< "' at/near target max";
2856 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
// Scrub errors are reported at HEALTH_ERR.
2861 if (pg_sum
.stats
.sum
.num_scrub_errors
) {
2863 ss
<< pg_sum
.stats
.sum
.num_scrub_errors
<< " scrub errors";
2864 summary
.push_back(make_pair(HEALTH_ERR
, ss
.str()));
2866 detail
->push_back(make_pair(HEALTH_ERR
, ss
.str()));
// PGs per OSD: the larger of pg_sum.up and the number of pgs, integer-divided
// by the number of "in" OSDs.  Each bound is skipped when num_in is 0 or the
// corresponding option is not positive.
2871 int num_in
= osdmap
.get_num_in_osds();
2872 int sum_pg_up
= MAX(pg_sum
.up
, static_cast<int32_t>(pg_stat
.size()));
2873 if (num_in
&& cct
->_conf
->mon_pg_warn_min_per_osd
> 0) {
2874 int per
= sum_pg_up
/ num_in
;
// The trailing "&& per" suppresses the "too few" warning when per is 0.
2875 if (per
< cct
->_conf
->mon_pg_warn_min_per_osd
&& per
) {
2877 ss
<< "too few PGs per OSD (" << per
<< " < min " << cct
->_conf
->mon_pg_warn_min_per_osd
<< ")";
2878 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
2880 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
2883 if (num_in
&& cct
->_conf
->mon_pg_warn_max_per_osd
> 0) {
2884 int per
= sum_pg_up
/ num_in
;
2885 if (per
> cct
->_conf
->mon_pg_warn_max_per_osd
) {
2887 ss
<< "too many PGs per OSD (" << per
<< " > max " << cct
->_conf
->mon_pg_warn_max_per_osd
<< ")";
2888 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
2890 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
// Per-pool checks; the enclosing test guarantees pg_stat.size() is non-zero
// for the division further down.
2893 if (!pg_stat
.empty()) {
2894 for (auto p
= pg_pool_sum
.begin();
2895 p
!= pg_pool_sum
.end();
2897 const pg_pool_t
*pi
= osdmap
.get_pg_pool(p
->first
);
2899 continue; // in case osdmap changes haven't propagated to PGMap yet
2900 const string
& name
= osdmap
.get_pool_name(p
->first
);
// pg_num > pgp_num is not reported for pools whose name contains ".DELETED"
// while mon_fake_pool_delete is set.
2901 if (pi
->get_pg_num() > pi
->get_pgp_num() &&
2902 !(name
.find(".DELETED") != string::npos
&&
2903 cct
->_conf
->mon_fake_pool_delete
)) {
2905 ss
<< "pool " << name
<< " pg_num "
2906 << pi
->get_pg_num() << " > pgp_num " << pi
->get_pgp_num();
2907 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
2909 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
// Object skew: integer averages are used on both sides.  The check needs a
// non-zero cluster average and both object-count minimums to be met.
2911 int average_objects_per_pg
= pg_sum
.stats
.sum
.num_objects
/ pg_stat
.size();
2912 if (average_objects_per_pg
> 0 &&
2913 pg_sum
.stats
.sum
.num_objects
>= cct
->_conf
->mon_pg_warn_min_objects
&&
2914 p
->second
.stats
.sum
.num_objects
>= cct
->_conf
->mon_pg_warn_min_pool_objects
) {
// NOTE(review): divides by pi->get_pg_num(); presumably never 0 for an
// existing pool -- confirm.
2915 int objects_per_pg
= p
->second
.stats
.sum
.num_objects
/ pi
->get_pg_num();
2916 float ratio
= (float)objects_per_pg
/ (float)average_objects_per_pg
;
2917 if (cct
->_conf
->mon_pg_warn_max_object_skew
> 0 &&
2918 ratio
> cct
->_conf
->mon_pg_warn_max_object_skew
) {
2920 ss
<< "pool " << name
<< " has many more objects per pg than average (too few pgs?)";
2921 summary
.push_back(make_pair(HEALTH_WARN
, ss
.str()));
2924 ss
<< "pool " << name
<< " objects per pg ("
2925 << objects_per_pg
<< ") is more than " << ratio
<< " times cluster average ("
2926 << average_objects_per_pg
<< ")";
2927 detail
->push_back(make_pair(HEALTH_WARN
, ss
.str()));
// Finally, the unscrubbed-pg report over all pg stats.
2934 print_unscrubbed_pgs(pg_stat
, summary
, detail
, cct
);
// Handles read-only pg/osd commands against a PGMap and an OSDMap, selected by
// the command prefix.  Visible prefixes: "pg dump_json", "pg dump_pools_json",
// "pg ls-by-primary", "pg ls-by-osd", "pg ls-by-pool", "pg stat", "pg getmap",
// "pg dump", "pg ls", "pg dump_stuck", "pg debug", "osd perf",
// "osd blocked-by" and "osd pool stats".  Status text is written through *ss.
// NOTE(review): the embedded line numbers skip throughout this function
// (remaining parameters, return statements and several branches are absent
// from this listing); comments describe only what is visible.
2937 int process_pg_map_command(
2938 const string
& orig_prefix
,
2939 const map
<string
,cmd_vartype
>& orig_cmdmap
,
2940 const PGMap
& pg_map
,
2941 const OSDMap
& osdmap
,
// Work on local copies so the alias handling below can add arguments without
// touching the caller's prefix and argument map.
2946 string prefix
= orig_prefix
;
2947 map
<string
,cmd_vartype
> cmdmap
= orig_cmdmap
;
2949 // perhaps these would be better in the parsing, but it's weird
2950 bool primary
= false;
// The *_json aliases store format "json" plus a fixed "dumpcontents" list
// ("all" or "pools") in cmdmap.
2951 if (prefix
== "pg dump_json") {
2953 v
.push_back(string("all"));
2954 cmd_putval(g_ceph_context
, cmdmap
, "format", string("json"));
2955 cmd_putval(g_ceph_context
, cmdmap
, "dumpcontents", v
);
2957 } else if (prefix
== "pg dump_pools_json") {
2959 v
.push_back(string("pools"));
2960 cmd_putval(g_ceph_context
, cmdmap
, "format", string("json"));
2961 cmd_putval(g_ceph_context
, cmdmap
, "dumpcontents", v
);
2963 } else if (prefix
== "pg ls-by-primary") {
2966 } else if (prefix
== "pg ls-by-osd") {
// ls-by-pool: the pool name from "poolstr" is looked up in the osdmap and the
// resulting id is stored as the "pool" argument; an error text is prepared for
// a name that does not resolve.
2968 } else if (prefix
== "pg ls-by-pool") {
2971 cmd_getval(g_ceph_context
, cmdmap
, "poolstr", poolstr
);
2972 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
2974 *ss
<< "pool " << poolstr
<< " does not exist";
2977 cmd_putval(g_ceph_context
, cmdmap
, "pool", pool
);
2982 if (prefix
== "pg stat") {
2984 f
->open_object_section("pg_summary");
2985 pg_map
.print_oneline_summary(f
, NULL
);
// getmap: the encoded PGMap itself is the output payload.
2995 if (prefix
== "pg getmap") {
2996 pg_map
.encode(*odata
);
2997 *ss
<< "got pgmap version " << pg_map
.version
;
// dump: the requested "dumpcontents" names are collected into the set `what`,
// which then selects the sections emitted (formatter variants first, plain
// text variants after).
3001 if (prefix
== "pg dump") {
3003 vector
<string
> dumpcontents
;
3005 if (cmd_getval(g_ceph_context
, cmdmap
, "dumpcontents", dumpcontents
)) {
3006 copy(dumpcontents
.begin(), dumpcontents
.end(),
3007 inserter(what
, what
.end()));
3012 if (what
.count("all")) {
3013 f
->open_object_section("pg_map");
3016 } else if (what
.count("summary") || what
.count("sum")) {
3017 f
->open_object_section("pg_map");
3018 pg_map
.dump_basic(f
);
3021 if (what
.count("pools")) {
3022 pg_map
.dump_pool_stats(f
);
3024 if (what
.count("osds")) {
3025 pg_map
.dump_osd_stats(f
);
3027 if (what
.count("pgs")) {
3028 pg_map
.dump_pg_stats(f
, false);
3030 if (what
.count("pgs_brief")) {
3031 pg_map
.dump_pg_stats(f
, true);
3033 if (what
.count("delta")) {
3034 f
->open_object_section("delta");
3035 pg_map
.dump_delta(f
);
3041 if (what
.count("all")) {
3043 } else if (what
.count("summary") || what
.count("sum")) {
3044 pg_map
.dump_basic(ds
);
3045 pg_map
.dump_pg_sum_stats(ds
, true);
3046 pg_map
.dump_osd_sum_stats(ds
);
3048 if (what
.count("pgs_brief")) {
3049 pg_map
.dump_pg_stats(ds
, true);
3052 if (what
.count("pgs")) {
3053 pg_map
.dump_pg_stats(ds
, false);
3056 if (what
.count("pools")) {
3057 pg_map
.dump_pool_stats(ds
, header
);
3059 if (what
.count("osds")) {
3060 pg_map
.dump_osd_stats(ds
);
3065 *ss
<< "dumped " << what
;
// ls: optional "pool", "osd" and "states" filters.  Error texts are prepared
// for a pool id unknown to the osdmap and for an osd that is not up.
3069 if (prefix
== "pg ls") {
3072 vector
<string
>states
;
3074 cmd_getval(g_ceph_context
, cmdmap
, "pool", pool
);
3075 cmd_getval(g_ceph_context
, cmdmap
, "osd", osd
);
3076 cmd_getval(g_ceph_context
, cmdmap
, "states", states
);
3077 if (pool
>= 0 && !osdmap
.have_pg_pool(pool
)) {
3078 *ss
<< "pool " << pool
<< " does not exist";
3081 if (osd
>= 0 && !osdmap
.is_up(osd
)) {
3082 *ss
<< "osd " << osd
<< " is not up";
3086 states
.push_back("all");
// The state names are consumed from the back of the vector.
3090 while (!states
.empty()) {
3091 string state_str
= states
.back();
3093 if (state_str
== "all") {
// NOTE(review): state names come from the command arguments; presumably
// pg_string_state() returns -1 for an unknown name, in which case this assert
// fires on user input instead of producing an error reply -- confirm.
3097 int filter
= pg_string_state(state_str
);
3098 assert(filter
!= -1);
3105 pg_map
.get_filtered_pg_stats(state
, pool
, osd
, primary
, pgs
);
// Nothing is dumped when the filtered set is empty.
3107 if (f
&& !pgs
.empty()) {
3108 pg_map
.dump_filtered_pg_stats(f
, pgs
);
3110 } else if (!pgs
.empty()) {
3111 pg_map
.dump_filtered_pg_stats(ds
, pgs
);
// dump_stuck: defaults to the "unclean" category when none is given, and to
// mon_pg_stuck_threshold when no "threshold" argument is present.
3117 if (prefix
== "pg dump_stuck") {
3118 vector
<string
> stuckop_vec
;
3119 cmd_getval(g_ceph_context
, cmdmap
, "stuckops", stuckop_vec
);
3120 if (stuckop_vec
.empty())
3121 stuckop_vec
.push_back("unclean");
3123 cmd_getval(g_ceph_context
, cmdmap
, "threshold", threshold
,
3124 int64_t(g_conf
->mon_pg_stuck_threshold
));
3126 r
= pg_map
.dump_stuck_pg_stats(ds
, f
, (int)threshold
, stuckop_vec
);
// debug: "debugop" defaults to "unfound_objects_exist"; both visible ops scan
// every pg stat for a non-zero counter.
3135 if (prefix
== "pg debug") {
3137 cmd_getval(g_ceph_context
, cmdmap
, "debugop", debugop
,
3138 string("unfound_objects_exist"));
3139 if (debugop
== "unfound_objects_exist") {
3140 bool unfound_objects_exist
= false;
3141 for (const auto& p
: pg_map
.pg_stat
) {
3142 if (p
.second
.stats
.sum
.num_objects_unfound
> 0) {
3143 unfound_objects_exist
= true;
3147 if (unfound_objects_exist
)
3154 if (debugop
== "degraded_pgs_exist") {
3155 bool degraded_pgs_exist
= false;
3156 for (const auto& p
: pg_map
.pg_stat
) {
3157 if (p
.second
.stats
.sum
.num_objects_degraded
> 0) {
3158 degraded_pgs_exist
= true;
3162 if (degraded_pgs_exist
)
3171 if (prefix
== "osd perf") {
3173 f
->open_object_section("osdstats");
3174 pg_map
.dump_osd_perf_stats(f
);
3178 pg_map
.print_osd_perf_stats(&ds
);
3184 if (prefix
== "osd blocked-by") {
3186 f
->open_object_section("osd_blocked_by");
3187 pg_map
.dump_osd_blocked_by_stats(f
);
3191 pg_map
.print_osd_blocked_by_stats(&ds
);
// pool stats: an optional "name" argument is resolved to a pool id through the
// osdmap.  Per pool the visible code emits recovery, recovery rate and client
// io rate, plus a cache io rate for tier pools.
3197 if (prefix
== "osd pool stats") {
3199 cmd_getval(g_ceph_context
, cmdmap
, "name", pool_name
);
3201 int64_t poolid
= -ENOENT
;
3202 bool one_pool
= false;
3203 if (!pool_name
.empty()) {
3204 poolid
= osdmap
.lookup_pg_pool_name(pool_name
);
3206 assert(poolid
== -ENOENT
);
3207 *ss
<< "unrecognized pool '" << pool_name
<< "'";
3216 f
->open_array_section("pool_stats");
3218 if (osdmap
.get_pools().empty()) {
3219 *ss
<< "there are no pools!";
3224 for (auto& p
: osdmap
.get_pools()) {
3228 pool_name
= osdmap
.get_pool_name(poolid
);
3231 f
->open_object_section("pool");
3232 f
->dump_string("pool_name", pool_name
.c_str());
3233 f
->dump_int("pool_id", poolid
);
3234 f
->open_object_section("recovery");
3239 pg_map
.pool_recovery_summary(f
, &sl
, poolid
);
3240 if (!f
&& !sl
.empty()) {
3242 tss
<< " " << p
<< "\n";
3247 f
->open_object_section("recovery_rate");
3251 pg_map
.pool_recovery_rate_summary(f
, &rss
, poolid
);
3252 if (!f
&& !rss
.str().empty())
3253 tss
<< " recovery io " << rss
.str() << "\n";
3257 f
->open_object_section("client_io_rate");
3262 pg_map
.pool_client_io_rate_summary(f
, &rss
, poolid
);
3263 if (!f
&& !rss
.str().empty())
3264 tss
<< " client io " << rss
.str() << "\n";
3266 // dump cache tier IO rate for cache pool
// NOTE(review): pool is dereferenced without a visible null check; presumably
// poolid always names an existing pool at this point -- confirm.
3267 const pg_pool_t
*pool
= osdmap
.get_pg_pool(poolid
);
3268 if (pool
->is_tier()) {
3271 f
->open_object_section("cache_io_rate");
3276 pg_map
.pool_cache_io_rate_summary(f
, &rss
, poolid
);
3277 if (!f
&& !rss
.str().empty())
3278 tss
<< " cache tier io " << rss
.str() << "\n";
// Plain-text output: a pool header line followed by the collected text, or by
// a "nothing is going on" line.
3284 rs
<< "pool " << pool_name
<< " id " << poolid
<< "\n";
3285 if (!tss
.str().empty())
3286 rs
<< tss
.str() << "\n";
3288 rs
<< " nothing is going on\n\n";
3300 odata
->append(rs
.str());
// Applies the effects of an OSDMap incremental to the pending PGMap update.
//  - OSDs whose new weight is CEPH_OSD_OUT: stat_osd_out() is queued when an
//    epoch is recorded for the OSD in pg_map->osd_epochs.
//  - new_state entries carrying CEPH_OSD_UP: the OSD id is added to
//    *need_check_down_pg_osds; the visible code also drops its entry from
//    *last_osd_report and queues stat_osd_down_up().
//  - new_state entries carrying CEPH_OSD_EXISTS: the OSD's stat is removed and
//    the id is erased from pg_map's nearfull/full sets (a direct change to
//    *pg_map rather than to pending_inc).
// NOTE(review): some original lines (one parameter, closing braces) are absent
// from this listing.
3308 void PGMapUpdater::check_osd_map(const OSDMap::Incremental
&osd_inc
,
3309 std::set
<int> *need_check_down_pg_osds
,
3310 std::map
<int,utime_t
> *last_osd_report
,
3312 PGMap::Incremental
*pending_inc
)
3314 for (const auto &p
: osd_inc
.new_weight
) {
3315 if (p
.second
== CEPH_OSD_OUT
) {
3316 dout(10) << __func__
<< " osd." << p
.first
<< " went OUT" << dendl
;
3317 auto j
= pg_map
->osd_epochs
.find(p
.first
);
3318 if (j
!= pg_map
->osd_epochs
.end())
3319 pending_inc
->stat_osd_out(p
.first
, j
->second
);
3323 // this is conservative: we want to know if any osds (maybe) got marked down.
3324 for (const auto &p
: osd_inc
.new_state
) {
3325 if (p
.second
& CEPH_OSD_UP
) { // true if marked up OR down,
3326 // but we're too lazy to check
3328 need_check_down_pg_osds
->insert(p
.first
);
3330 // clear out the last_osd_report for this OSD
3331 auto report
= last_osd_report
->find(p
.first
);
3332 if (report
!= last_osd_report
->end()) {
3333 last_osd_report
->erase(report
);
3336 // clear out osd_stat slow request histogram
3337 dout(20) << __func__
<< " clearing osd." << p
.first
3338 << " request histogram" << dendl
;
3339 pending_inc
->stat_osd_down_up(p
.first
, osd_inc
.epoch
, *pg_map
);
3342 if (p
.second
& CEPH_OSD_EXISTS
) {
3343 // whether it was created *or* destroyed, we can safely drop
3344 // it's osd_stat_t record.
3345 dout(10) << __func__
<< " osd." << p
.first
3346 << " created or destroyed" << dendl
;
3347 pending_inc
->rm_stat(p
.first
);
3349 // and adjust full, nearfull set
3350 pg_map
->nearfull_osds
.erase(p
.first
);
3351 pg_map
->full_osds
.erase(p
.first
);
// Reconciles a PGMap with a full OSDMap by queueing changes in pending_inc:
//  1. osd_stat entries: removed for OSDs unknown to the osdmap; stat_osd_out()
//     for "out" OSDs that still report kb != 0 and have a recorded epoch;
//     stat_osd_down_up() for OSDs that are not up and still have a non-empty
//     op_queue_age_hist.
//  2. pools present in pg_pool_sum but gone from the osdmap: their pgs are
//     added to pg_remove and erased from pg_stat_updates.
//  3. pools whose pg_num differs from the count in num_pg_by_pool: each pg id
//     from the known count up to pg_num - 1 that has no pending update gets a
//     new pg_stat_t with its timestamps set to osdmap.get_modified().
// NOTE(review): some original lines (two parameters, braces, an iterator
// advance) are absent from this listing.
3356 void PGMapUpdater::check_osd_map(
3358 const OSDMap
& osdmap
,
3360 PGMap::Incremental
*pending_inc
)
3362 for (auto& p
: pgmap
.osd_stat
) {
3363 if (!osdmap
.exists(p
.first
)) {
3365 pending_inc
->rm_stat(p
.first
);
3366 } else if (osdmap
.is_out(p
.first
)) {
3368 if (p
.second
.kb
!= 0) {
3369 auto j
= pgmap
.osd_epochs
.find(p
.first
);
3370 if (j
!= pgmap
.osd_epochs
.end()) {
3371 pending_inc
->stat_osd_out(p
.first
, j
->second
);
3374 } else if (!osdmap
.is_up(p
.first
)) {
3375 // zero the op_queue_age_hist
3376 if (!p
.second
.op_queue_age_hist
.empty()) {
3377 pending_inc
->stat_osd_down_up(p
.first
, osdmap
.get_epoch(), pgmap
);
3382 // deleted pgs (pools)?
3383 for (auto& p
: pgmap
.pg_pool_sum
) {
3384 if (!osdmap
.have_pg_pool(p
.first
)) {
3385 ldout(cct
, 10) << __func__
<< " pool " << p
.first
<< " gone, removing pgs"
// Every pg stat is scanned once per deleted pool to find that pool's pgs.
3387 for (auto& q
: pgmap
.pg_stat
) {
3388 if (q
.first
.pool() == (uint64_t)p
.first
) {
3389 pending_inc
->pg_remove
.insert(q
.first
);
// Pending updates for the deleted pool are dropped; erase() returns the
// iterator to continue from.
3392 auto q
= pending_inc
->pg_stat_updates
.begin();
3393 while (q
!= pending_inc
->pg_stat_updates
.end()) {
3394 if (q
->first
.pool() == (uint64_t)p
.first
) {
3395 q
= pending_inc
->pg_stat_updates
.erase(q
);
3403 // new pgs (split or new pool)?
3404 for (auto& p
: osdmap
.get_pools()) {
3405 int64_t poolid
= p
.first
;
3406 const pg_pool_t
& pi
= p
.second
;
// my_pg_num stays 0 for a pool the PGMap has not counted yet.
3407 auto q
= pgmap
.num_pg_by_pool
.find(poolid
);
3408 unsigned my_pg_num
= 0;
3409 if (q
!= pgmap
.num_pg_by_pool
.end())
3410 my_pg_num
= q
->second
;
3411 unsigned pg_num
= pi
.get_pg_num();
3412 if (my_pg_num
!= pg_num
) {
// The loop body only runs when pg_num is larger than my_pg_num.
3413 for (unsigned ps
= my_pg_num
; ps
< pg_num
; ++ps
) {
3414 pg_t
pgid(ps
, poolid
);
3415 if (pending_inc
->pg_stat_updates
.count(pgid
) == 0) {
3416 pg_stat_t
&stats
= pending_inc
->pg_stat_updates
[pgid
];
3417 stats
.last_fresh
= osdmap
.get_modified();
3418 stats
.last_active
= osdmap
.get_modified();
3419 stats
.last_change
= osdmap
.get_modified();
3420 stats
.last_peered
= osdmap
.get_modified();
3421 stats
.last_clean
= osdmap
.get_modified();
3422 stats
.last_unstale
= osdmap
.get_modified();
3423 stats
.last_undegraded
= osdmap
.get_modified();
3424 stats
.last_fullsized
= osdmap
.get_modified();
3425 stats
.last_scrub_stamp
= osdmap
.get_modified();
3426 stats
.last_deep_scrub_stamp
= osdmap
.get_modified();
3427 stats
.last_clean_scrub_stamp
= osdmap
.get_modified();
// Queues the creation of pg `pgid` in pending_inc->pg_stat_updates: the entry
// is put in state PG_STATE_CREATING with created/mapping_epoch = epoch and the
// parent / parent_split_bits found by the search below.  Timestamps are copied
// from the parent's stats when a parent entry was found in pg_map.pg_stat;
// the second group of assignments uses osd_map.get_modified() instead.
// NOTE(review): several original lines (a parameter, the parent search loop
// header, the branch between the two timestamp groups, the arguments of
// pg_to_up_acting_osds) are absent from this listing.
3434 void PGMapUpdater::register_pg(
3435 const OSDMap
&osd_map
,
3436 pg_t pgid
, epoch_t epoch
,
3438 const PGMap
&pg_map
,
3439 PGMap::Incremental
*pending_inc
)
3443 auto parent_stat
= pg_map
.pg_stat
.end();
3447 // remove most significant bit
// NOTE(review): presumably cbits() gives the number of significant bits of the
// placement seed, so that bit (msb-1) is its top set bit -- confirm.
3448 int msb
= cbits(parent
.ps());
3451 parent
.set_ps(parent
.ps() & ~(1<<(msb
-1)));
3453 dout(30) << " is " << pgid
<< " parent " << parent
<< " ?" << dendl
;
// A candidate counts as the parent only if it has stats and its state is not
// exactly PG_STATE_CREATING.
3454 parent_stat
= pg_map
.pg_stat
.find(parent
);
3455 if (parent_stat
!= pg_map
.pg_stat
.end() &&
3456 parent_stat
->second
.state
!= PG_STATE_CREATING
) {
3457 dout(10) << " parent is " << parent
<< dendl
;
3463 pg_stat_t
&stats
= pending_inc
->pg_stat_updates
[pgid
];
3464 stats
.state
= PG_STATE_CREATING
;
3465 stats
.created
= epoch
;
3466 stats
.parent
= parent
;
3467 stats
.parent_split_bits
= split_bits
;
3468 stats
.mapping_epoch
= epoch
;
3470 if (parent_stat
!= pg_map
.pg_stat
.end()) {
3471 const pg_stat_t
&ps
= parent_stat
->second
;
3472 stats
.last_fresh
= ps
.last_fresh
;
3473 stats
.last_active
= ps
.last_active
;
3474 stats
.last_change
= ps
.last_change
;
3475 stats
.last_peered
= ps
.last_peered
;
3476 stats
.last_clean
= ps
.last_clean
;
3477 stats
.last_unstale
= ps
.last_unstale
;
3478 stats
.last_undegraded
= ps
.last_undegraded
;
3479 stats
.last_fullsized
= ps
.last_fullsized
;
3480 stats
.last_scrub_stamp
= ps
.last_scrub_stamp
;
3481 stats
.last_deep_scrub_stamp
= ps
.last_deep_scrub_stamp
;
3482 stats
.last_clean_scrub_stamp
= ps
.last_clean_scrub_stamp
;
// NOTE(review): presumably the else-branch of the test above (the line in
// between is not visible): without a parent, all timestamps start at the
// osdmap's modification time.
3484 utime_t now
= osd_map
.get_modified();
3485 stats
.last_fresh
= now
;
3486 stats
.last_active
= now
;
3487 stats
.last_change
= now
;
3488 stats
.last_peered
= now
;
3489 stats
.last_clean
= now
;
3490 stats
.last_unstale
= now
;
3491 stats
.last_undegraded
= now
;
3492 stats
.last_fullsized
= now
;
3493 stats
.last_scrub_stamp
= now
;
3494 stats
.last_deep_scrub_stamp
= now
;
3495 stats
.last_clean_scrub_stamp
= now
;
// The mapping computed by the osdmap is written into the new entry
// (acting_primary is the one output argument visible here).
3498 osd_map
.pg_to_up_acting_osds(
3503 &stats
.acting_primary
);
// The log line carries parent/split information only in the second variant.
3505 if (split_bits
== 0) {
3506 dout(10) << __func__
<< " will create " << pgid
3507 << " primary " << stats
.acting_primary
3508 << " acting " << stats
.acting
3511 dout(10) << __func__
<< " will create " << pgid
3512 << " primary " << stats
.acting_primary
3513 << " acting " << stats
.acting
3514 << " parent " << parent
3515 << " by " << split_bits
<< " bits"
// Scans the osdmap's pools for pgs that the PGMap does not know yet and queues
// their creation through register_pg(); also queues removal (pg_remove) of
// creating pgs and pg stats that are localized (preferred() >= 0) or whose
// pool no longer exists.  Finally records the osdmap epoch in
// pending_inc->pg_scan.
// NOTE(review): several original lines (the `continue`s after the skip
// conditions, the created/removed counters) are absent from this listing.
3520 void PGMapUpdater::register_new_pgs(
3521 const OSDMap
&osd_map
,
3522 const PGMap
&pg_map
,
3523 PGMap::Incremental
*pending_inc
)
3525 epoch_t epoch
= osd_map
.get_epoch();
3526 dout(10) << __func__
<< " checking pg pools for osdmap epoch " << epoch
3527 << ", last_pg_scan " << pg_map
.last_pg_scan
<< dendl
;
3530 const auto &pools
= osd_map
.get_pools();
3532 for (const auto &p
: pools
) {
3533 int64_t poolid
= p
.first
;
3534 const pg_pool_t
&pool
= p
.second
;
// Pools whose crush rule cannot be resolved are tested for here (the action
// taken is on a line not shown).
3535 int ruleno
= osd_map
.crush
->find_rule(pool
.get_crush_rule(),
3536 pool
.get_type(), pool
.get_size());
3537 if (ruleno
< 0 || !osd_map
.crush
->rule_exists(ruleno
))
// A pool counts as unchanged when its last_change is not newer than either the
// PGMap's last_pg_scan or the pending pg_scan.
3540 if (pool
.get_last_change() <= pg_map
.last_pg_scan
||
3541 pool
.get_last_change() <= pending_inc
->pg_scan
) {
3542 dout(10) << " no change in pool " << poolid
<< " " << pool
<< dendl
;
3546 dout(10) << __func__
<< " scanning pool " << poolid
3547 << " " << pool
<< dendl
;
3549 // first pgs in this pool
3550 bool new_pool
= pg_map
.pg_pool_sum
.count(poolid
) == 0;
3552 for (ps_t ps
= 0; ps
< pool
.get_pg_num(); ps
++) {
3553 pg_t
pgid(ps
, poolid
, -1);
3554 if (pg_map
.pg_stat
.count(pgid
)) {
3555 dout(20) << "register_new_pgs have " << pgid
<< dendl
;
// The pool's last_change is passed as the creation epoch of the new pg.
3559 register_pg(osd_map
, pgid
, pool
.get_last_change(), new_pool
,
3560 pg_map
, pending_inc
);
3565 for (const auto &p
: pg_map
.creating_pgs
) {
3566 if (p
.preferred() >= 0) {
3567 dout(20) << " removing creating_pg " << p
3568 << " because it is localized and obsolete" << dendl
;
3569 pending_inc
->pg_remove
.insert(p
);
3571 } else if (!osd_map
.have_pg_pool(p
.pool())) {
3572 dout(20) << " removing creating_pg " << p
3573 << " because containing pool deleted" << dendl
;
3574 pending_inc
->pg_remove
.insert(p
);
3580 for (const auto &p
: pg_map
.pg_stat
) {
3581 if (!osd_map
.have_pg_pool(p
.first
.pool())) {
3582 dout(20) << " removing pg_stat " << p
.first
<< " because "
3583 << "containing pool deleted" << dendl
;
3584 pending_inc
->pg_remove
.insert(p
.first
);
3586 } else if (p
.first
.preferred() >= 0) {
3587 dout(20) << " removing localized pg " << p
.first
<< dendl
;
3588 pending_inc
->pg_remove
.insert(p
.first
);
3593 // we don't want to redo this work if we can avoid it.
3594 pending_inc
->pg_scan
= epoch
;
3596 dout(10) << "register_new_pgs registered " << created
<< " new pgs, removed "
3597 << removed
<< " uncreated pgs" << dendl
;
// Re-maps every pg in pg_map.creating_pgs against the current osdmap.  When
// the freshly computed mapping differs from the recorded stats, the pending
// update for that pg is refreshed -- but only if the osdmap epoch is newer
// than the pending entry's reported_epoch.  mapping_epoch is bumped to the
// osdmap epoch when the acting primary changed.
// NOTE(review): several original lines (loop increment, pgid definition, the
// arguments of pg_to_up_acting_osds, some assignments) are absent from this
// listing.
3601 void PGMapUpdater::update_creating_pgs(
3602 const OSDMap
&osd_map
,
3603 const PGMap
&pg_map
,
3604 PGMap::Incremental
*pending_inc
)
3606 dout(10) << __func__
<< " to " << pg_map
.creating_pgs
.size()
3607 << " pgs, osdmap epoch " << osd_map
.get_epoch()
3610 unsigned changed
= 0;
3611 for (auto p
= pg_map
.creating_pgs
.begin();
3612 p
!= pg_map
.creating_pgs
.end();
// Every creating pg is expected to have a pg_stat entry; the assert enforces
// that.
3616 auto q
= pg_map
.pg_stat
.find(pgid
);
3617 assert(q
!= pg_map
.pg_stat
.end());
3618 const pg_stat_t
*s
= &q
->second
;
3620 if (s
->parent_split_bits
)
3623 vector
<int> up
, acting
;
3624 int up_primary
, acting_primary
;
3625 osd_map
.pg_to_up_acting_osds(
3633 up_primary
!= s
->up_primary
||
3634 acting
!= s
->acting
||
3635 acting_primary
!= s
->acting_primary
) {
// operator[] creates the pending entry if there was none yet.
3636 pg_stat_t
*ns
= &pending_inc
->pg_stat_updates
[pgid
];
3637 if (osd_map
.get_epoch() > ns
->reported_epoch
) {
3638 dout(20) << __func__
<< " " << pgid
<< " "
3639 << " acting_primary: " << s
->acting_primary
3640 << " -> " << acting_primary
3641 << " acting: " << s
->acting
<< " -> " << acting
3642 << " up_primary: " << s
->up_primary
<< " -> " << up_primary
3643 << " up: " << s
->up
<< " -> " << up
3646 // only initialize if it wasn't already a pending update
3647 if (ns
->reported_epoch
== 0)
3650 // note epoch if the target of the create message changed
3651 if (acting_primary
!= ns
->acting_primary
)
3652 ns
->mapping_epoch
= osd_map
.get_epoch();
3655 ns
->up_primary
= up_primary
;
3656 ns
->acting
= acting
;
3657 ns
->acting_primary
= acting_primary
;
3661 dout(20) << __func__
<< " " << pgid
<< " has pending update from newer"
3662 << " epoch " << ns
->reported_epoch
3668 dout(10) << __func__
<< " " << changed
<< " pgs changed primary" << dendl
;
3672 static void _try_mark_pg_stale(
3673 const OSDMap
& osdmap
,
3675 const pg_stat_t
& cur
,
3676 PGMap::Incremental
*pending_inc
)
3678 if ((cur
.state
& PG_STATE_STALE
) == 0 &&
3679 cur
.acting_primary
!= -1 &&
3680 osdmap
.is_down(cur
.acting_primary
)) {
3682 auto q
= pending_inc
->pg_stat_updates
.find(pgid
);
3683 if (q
!= pending_inc
->pg_stat_updates
.end()) {
3684 if ((q
->second
.acting_primary
== cur
.acting_primary
) ||
3685 ((q
->second
.state
& PG_STATE_STALE
) == 0 &&
3686 q
->second
.acting_primary
!= -1 &&
3687 osdmap
.is_down(q
->second
.acting_primary
))) {
3688 newstat
= &q
->second
;
3690 // pending update is no longer down or already stale
3694 newstat
= &pending_inc
->pg_stat_updates
[pgid
];
3697 dout(10) << __func__
<< " marking pg " << pgid
3698 << " stale (acting_primary " << newstat
->acting_primary
3700 newstat
->state
|= PG_STATE_STALE
;
3701 newstat
->last_unstale
= ceph_clock_now();
3705 void PGMapUpdater::check_down_pgs(
3706 const OSDMap
&osdmap
,
3707 const PGMap
&pg_map
,
3709 const set
<int>& need_check_down_pg_osds
,
3710 PGMap::Incremental
*pending_inc
)
3712 // if a large number of osds changed state, just iterate over the whole
3714 if (need_check_down_pg_osds
.size() > (unsigned)osdmap
.get_num_osds() *
3715 g_conf
->mon_pg_check_down_all_threshold
) {
3720 for (const auto& p
: pg_map
.pg_stat
) {
3721 _try_mark_pg_stale(osdmap
, p
.first
, p
.second
, pending_inc
);
3724 for (auto osd
: need_check_down_pg_osds
) {
3725 if (osdmap
.is_down(osd
)) {
3726 auto p
= pg_map
.pg_by_osd
.find(osd
);
3727 if (p
== pg_map
.pg_by_osd
.end()) {
3730 for (auto pgid
: p
->second
) {
3731 const pg_stat_t
&stat
= pg_map
.pg_stat
.at(pgid
);
3732 assert(stat
.acting_primary
== osd
);
3733 _try_mark_pg_stale(osdmap
, pgid
, stat
, pending_inc
);
3740 int reweight::by_utilization(
3741 const OSDMap
&osdmap
,
3746 bool by_pg
, const set
<int64_t> *pools
,
3748 mempool::osdmap::map
<int32_t, uint32_t>* new_weights
,
3749 std::stringstream
*ss
,
3750 std::string
*out_str
,
3754 *ss
<< "You must give a percentage higher than 100. "
3755 "The reweighting threshold will be calculated as <average-utilization> "
3756 "times <input-percentage>. For example, an argument of 200 would "
3757 "reweight OSDs which are twice as utilized as the average OSD.\n";
3761 vector
<int> pgs_by_osd(osdmap
.get_max_osd());
3763 // Avoid putting a small number (or 0) in the denominator when calculating
3765 double average_util
;
3768 double weight_sum
= 0.0; // sum up the crush weights
3769 unsigned num_pg_copies
= 0;
3771 for (const auto& pg
: pgm
.pg_stat
) {
3772 if (pools
&& pools
->count(pg
.first
.pool()) == 0)
3774 for (const auto acting
: pg
.second
.acting
) {
3775 if (acting
>= (int)pgs_by_osd
.size())
3776 pgs_by_osd
.resize(acting
);
3777 if (pgs_by_osd
[acting
] == 0) {
3778 if (osdmap
.crush
->get_item_weightf(acting
) <= 0) {
3779 //skip if we currently can not identify item
3782 weight_sum
+= osdmap
.crush
->get_item_weightf(acting
);
3785 ++pgs_by_osd
[acting
];
3790 if (!num_osds
|| (num_pg_copies
/ num_osds
< g_conf
->mon_reweight_min_pgs_per_osd
)) {
3791 *ss
<< "Refusing to reweight: we only have " << num_pg_copies
3792 << " PGs across " << num_osds
<< " osds!\n";
3796 average_util
= (double)num_pg_copies
/ weight_sum
;
3798 // by osd utilization
3799 int num_osd
= MAX(1, pgm
.osd_stat
.size());
3800 if ((uint64_t)pgm
.osd_sum
.kb
* 1024 / num_osd
3801 < g_conf
->mon_reweight_min_bytes_per_osd
) {
3802 *ss
<< "Refusing to reweight: we only have " << pgm
.osd_sum
.kb
3803 << " kb across all osds!\n";
3806 if ((uint64_t)pgm
.osd_sum
.kb_used
* 1024 / num_osd
3807 < g_conf
->mon_reweight_min_bytes_per_osd
) {
3808 *ss
<< "Refusing to reweight: we only have " << pgm
.osd_sum
.kb_used
3809 << " kb used across all osds!\n";
3813 average_util
= (double)pgm
.osd_sum
.kb_used
/ (double)pgm
.osd_sum
.kb
;
3816 // adjust down only if we are above the threshold
3817 const double overload_util
= average_util
* (double)oload
/ 100.0;
3819 // but aggressively adjust weights up whenever possible.
3820 const double underload_util
= average_util
;
3822 const unsigned max_change
= (unsigned)(max_changef
* (double)0x10000);
3826 f
->open_object_section("reweight_by_utilization");
3827 f
->dump_int("overload_min", oload
);
3828 f
->dump_float("max_change", max_changef
);
3829 f
->dump_int("max_change_osds", max_osds
);
3830 f
->dump_float("average_utilization", average_util
);
3831 f
->dump_float("overload_utilization", overload_util
);
3833 oss
<< "oload " << oload
<< "\n";
3834 oss
<< "max_change " << max_changef
<< "\n";
3835 oss
<< "max_change_osds " << max_osds
<< "\n";
3837 oss
<< "average_utilization " << std::fixed
<< average_util
<< "\n";
3838 oss
<< "overload_utilization " << overload_util
<< "\n";
3840 int num_changed
= 0;
3842 // precompute util for each OSD
3843 std::vector
<std::pair
<int, float> > util_by_osd
;
3844 for (const auto& p
: pgm
.osd_stat
) {
3845 std::pair
<int, float> osd_util
;
3846 osd_util
.first
= p
.first
;
3848 if (p
.first
>= (int)pgs_by_osd
.size() ||
3849 pgs_by_osd
[p
.first
] == 0) {
3850 // skip if this OSD does not contain any pg
3851 // belonging to the specified pool(s).
3855 if (osdmap
.crush
->get_item_weightf(p
.first
) <= 0) {
3856 // skip if we are unable to locate item.
3860 osd_util
.second
= pgs_by_osd
[p
.first
] / osdmap
.crush
->get_item_weightf(p
.first
);
3862 osd_util
.second
= (double)p
.second
.kb_used
/ (double)p
.second
.kb
;
3864 util_by_osd
.push_back(osd_util
);
3867 // sort by absolute deviation from the mean utilization,
3868 // in descending order.
3869 std::sort(util_by_osd
.begin(), util_by_osd
.end(),
3870 [average_util
](std::pair
<int, float> l
, std::pair
<int, float> r
) {
3871 return abs(l
.second
- average_util
) > abs(r
.second
- average_util
);
3876 f
->open_array_section("reweights");
3878 for (const auto& p
: util_by_osd
) {
3879 unsigned weight
= osdmap
.get_weight(p
.first
);
3881 // skip if OSD is currently out
3884 float util
= p
.second
;
3886 if (util
>= overload_util
) {
3887 // Assign a lower weight to overloaded OSDs. The current weight
3888 // is a factor to take into account the original weights,
3889 // to represent e.g. differing storage capacities
3890 unsigned new_weight
= (unsigned)((average_util
/ util
) * (float)weight
);
3891 if (weight
> max_change
)
3892 new_weight
= MAX(new_weight
, weight
- max_change
);
3893 new_weights
->insert({p
.first
, new_weight
});
3895 f
->open_object_section("osd");
3896 f
->dump_int("osd", p
.first
);
3897 f
->dump_float("weight", (float)weight
/ (float)0x10000);
3898 f
->dump_float("new_weight", (float)new_weight
/ (float)0x10000);
3901 oss
<< "osd." << p
.first
<< " weight "
3902 << (float)weight
/ (float)0x10000 << " -> "
3903 << (float)new_weight
/ (float)0x10000 << "\n";
3905 if (++num_changed
>= max_osds
)
3908 if (!no_increasing
&& util
<= underload_util
) {
3909 // assign a higher weight.. if we can.
3910 unsigned new_weight
= (unsigned)((average_util
/ util
) * (float)weight
);
3911 new_weight
= MIN(new_weight
, weight
+ max_change
);
3912 if (new_weight
> 0x10000)
3913 new_weight
= 0x10000;
3914 if (new_weight
> weight
) {
3915 new_weights
->insert({p
.first
, new_weight
});
3916 oss
<< "osd." << p
.first
<< " weight "
3917 << (float)weight
/ (float)0x10000 << " -> "
3918 << (float)new_weight
/ (float)0x10000 << "\n";
3919 if (++num_changed
>= max_osds
)
3929 newmap
.deepish_copy_from(osdmap
);
3930 OSDMap::Incremental newinc
;
3931 newinc
.fsid
= newmap
.get_fsid();
3932 newinc
.epoch
= newmap
.get_epoch() + 1;
3933 newinc
.new_weight
= *new_weights
;
3934 newmap
.apply_incremental(newinc
);
3936 osdmap
.summarize_mapping_stats(&newmap
, pools
, out_str
, f
);
3942 *out_str
+= oss
.str();