]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/PGMap.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / mon / PGMap.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
224ce89b
WB
4#include <boost/algorithm/string.hpp>
5
20effc67 6#include "include/rados.h"
7c673cae
FG
7#include "PGMap.h"
8
9#define dout_subsys ceph_subsys_mon
10#include "common/debug.h"
11fdf7f2 11#include "common/Clock.h"
7c673cae 12#include "common/Formatter.h"
11fdf7f2 13#include "global/global_context.h"
7c673cae
FG
14#include "include/ceph_features.h"
15#include "include/stringify.h"
16
17#include "osd/osd_types.h"
18#include "osd/OSDMap.h"
eafe8130 19#include <boost/range/adaptor/reversed.hpp>
7c673cae
FG
20
21#define dout_context g_ceph_context
22
9f95a23c
TL
23using std::list;
24using std::make_pair;
25using std::map;
26using std::pair;
27using std::ostream;
28using std::ostringstream;
29using std::set;
30using std::string;
31using std::stringstream;
32using std::vector;
33
34using ceph::bufferlist;
f67539c2 35using ceph::fixed_u_to_string;
1e59de90
TL
36using ceph::common::cmd_getval;
37using ceph::common::cmd_getval_or;
38using ceph::common::cmd_putval;
9f95a23c 39
31f18b77
FG
40MEMPOOL_DEFINE_OBJECT_FACTORY(PGMapDigest, pgmap_digest, pgmap);
41MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap, pgmap, pgmap);
42MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap::Incremental, pgmap_inc, pgmap);
43
44
45// ---------------------
46// PGMapDigest
47
48void PGMapDigest::encode(bufferlist& bl, uint64_t features) const
49{
50 // NOTE: see PGMap::encode_digest
11fdf7f2 51 uint8_t v = 4;
20effc67 52 assert(HAVE_FEATURE(features, SERVER_NAUTILUS));
11fdf7f2
TL
53 ENCODE_START(v, 1, bl);
54 encode(num_pg, bl);
55 encode(num_pg_active, bl);
56 encode(num_pg_unknown, bl);
57 encode(num_osd, bl);
58 encode(pg_pool_sum, bl, features);
59 encode(pg_sum, bl, features);
60 encode(osd_sum, bl, features);
20effc67 61 encode(num_pg_by_state, bl);
11fdf7f2
TL
62 encode(num_pg_by_osd, bl);
63 encode(num_pg_by_pool, bl);
64 encode(osd_last_seq, bl);
65 encode(per_pool_sum_delta, bl, features);
66 encode(per_pool_sum_deltas_stamps, bl);
67 encode(pg_sum_delta, bl, features);
68 encode(stamp_delta, bl);
69 encode(avail_space_by_rule, bl);
20effc67
TL
70 encode(purged_snaps, bl);
71 encode(osd_sum_by_class, bl, features);
7c673cae
FG
72 ENCODE_FINISH(bl);
73}
74
11fdf7f2 75void PGMapDigest::decode(bufferlist::const_iterator& p)
31f18b77 76{
11fdf7f2 77 DECODE_START(4, p);
20effc67 78 assert(struct_v >= 4);
11fdf7f2
TL
79 decode(num_pg, p);
80 decode(num_pg_active, p);
81 decode(num_pg_unknown, p);
82 decode(num_osd, p);
83 decode(pg_pool_sum, p);
84 decode(pg_sum, p);
85 decode(osd_sum, p);
20effc67 86 decode(num_pg_by_state, p);
11fdf7f2
TL
87 decode(num_pg_by_osd, p);
88 decode(num_pg_by_pool, p);
89 decode(osd_last_seq, p);
90 decode(per_pool_sum_delta, p);
91 decode(per_pool_sum_deltas_stamps, p);
92 decode(pg_sum_delta, p);
93 decode(stamp_delta, p);
94 decode(avail_space_by_rule, p);
20effc67
TL
95 decode(purged_snaps, p);
96 decode(osd_sum_by_class, p);
31f18b77
FG
97 DECODE_FINISH(p);
98}
99
9f95a23c 100void PGMapDigest::dump(ceph::Formatter *f) const
31f18b77
FG
101{
102 f->dump_unsigned("num_pg", num_pg);
103 f->dump_unsigned("num_pg_active", num_pg_active);
104 f->dump_unsigned("num_pg_unknown", num_pg_unknown);
105 f->dump_unsigned("num_osd", num_osd);
106 f->dump_object("pool_sum", pg_sum);
107 f->dump_object("osd_sum", osd_sum);
11fdf7f2
TL
108
109 f->open_object_section("osd_sum_by_class");
110 for (auto& i : osd_sum_by_class) {
111 f->dump_object(i.first.c_str(), i.second);
112 }
113 f->close_section();
114
31f18b77
FG
115 f->open_array_section("pool_stats");
116 for (auto& p : pg_pool_sum) {
117 f->open_object_section("pool_stat");
118 f->dump_int("poolid", p.first);
119 auto q = num_pg_by_pool.find(p.first);
120 if (q != num_pg_by_pool.end())
121 f->dump_unsigned("num_pg", q->second);
122 p.second.dump(f);
7c673cae
FG
123 f->close_section();
124 }
125 f->close_section();
31f18b77
FG
126 f->open_array_section("osd_stats");
127 int i = 0;
128 // TODO: this isn't really correct since we can dump non-existent OSDs
129 // I dunno what osd_last_seq is set to in that case...
130 for (auto& p : osd_last_seq) {
7c673cae 131 f->open_object_section("osd_stat");
31f18b77
FG
132 f->dump_int("osd", i);
133 f->dump_unsigned("seq", p);
7c673cae 134 f->close_section();
31f18b77 135 ++i;
7c673cae
FG
136 }
137 f->close_section();
31f18b77
FG
138 f->open_array_section("num_pg_by_state");
139 for (auto& p : num_pg_by_state) {
140 f->open_object_section("count");
141 f->dump_string("state", pg_state_string(p.first));
142 f->dump_unsigned("num", p.second);
143 f->close_section();
144 }
7c673cae 145 f->close_section();
31f18b77
FG
146 f->open_array_section("num_pg_by_osd");
147 for (auto& p : num_pg_by_osd) {
148 f->open_object_section("count");
149 f->dump_unsigned("osd", p.first);
150 f->dump_unsigned("num_primary_pg", p.second.primary);
151 f->dump_unsigned("num_acting_pg", p.second.acting);
81eedcae 152 f->dump_unsigned("num_up_not_acting_pg", p.second.up_not_acting);
31f18b77
FG
153 f->close_section();
154 }
7c673cae 155 f->close_section();
11fdf7f2
TL
156 f->open_array_section("purged_snaps");
157 for (auto& j : purged_snaps) {
158 f->open_object_section("pool");
159 f->dump_int("pool", j.first);
160 f->open_object_section("purged_snaps");
161 for (auto i = j.second.begin(); i != j.second.end(); ++i) {
162 f->open_object_section("interval");
163 f->dump_stream("start") << i.get_start();
164 f->dump_stream("length") << i.get_len();
165 f->close_section();
166 }
167 f->close_section();
168 f->close_section();
169 }
170 f->close_section();
7c673cae
FG
171}
172
31f18b77 173void PGMapDigest::generate_test_instances(list<PGMapDigest*>& ls)
7c673cae 174{
31f18b77 175 ls.push_back(new PGMapDigest);
7c673cae
FG
176}
177
31f18b77
FG
// Format a percentage value for table output: anything below 0.01
// (including negative values) is shown as a bare "0", everything else
// with exactly two digits after the decimal point.
inline std::string percentify(const float& a) {
  if (a < 0.01) {
    return "0";
  }
  std::ostringstream formatted;
  formatted << std::fixed << std::setprecision(2) << a;
  return formatted.str();
}
7c673cae 186
9f95a23c 187void PGMapDigest::print_summary(ceph::Formatter *f, ostream *out) const
7c673cae 188{
31f18b77
FG
189 if (f)
190 f->open_array_section("pgs_by_state");
7c673cae 191
31f18b77 192 // list is descending numeric order (by count)
9f95a23c 193 std::multimap<int,uint64_t> state_by_count; // count -> state
31f18b77
FG
194 for (auto p = num_pg_by_state.begin();
195 p != num_pg_by_state.end();
196 ++p) {
197 state_by_count.insert(make_pair(p->second, p->first));
7c673cae 198 }
31f18b77
FG
199 if (f) {
200 for (auto p = state_by_count.rbegin();
201 p != state_by_count.rend();
202 ++p)
203 {
204 f->open_object_section("pgs_by_state_element");
205 f->dump_string("state_name", pg_state_string(p->second));
206 f->dump_unsigned("count", p->first);
207 f->close_section();
208 }
7c673cae 209 }
31f18b77
FG
210 if (f)
211 f->close_section();
7c673cae 212
31f18b77
FG
213 if (f) {
214 f->dump_unsigned("num_pgs", num_pg);
215 f->dump_unsigned("num_pools", pg_pool_sum.size());
216 f->dump_unsigned("num_objects", pg_sum.stats.sum.num_objects);
217 f->dump_unsigned("data_bytes", pg_sum.stats.sum.num_bytes);
11fdf7f2
TL
218 f->dump_unsigned("bytes_used", osd_sum.statfs.get_used_raw());
219 f->dump_unsigned("bytes_avail", osd_sum.statfs.available);
220 f->dump_unsigned("bytes_total", osd_sum.statfs.total);
31f18b77
FG
221 } else {
222 *out << " pools: " << pg_pool_sum.size() << " pools, "
223 << num_pg << " pgs\n";
1adf2230
AA
224 *out << " objects: " << si_u_t(pg_sum.stats.sum.num_objects) << " objects, "
225 << byte_u_t(pg_sum.stats.sum.num_bytes) << "\n";
31f18b77 226 *out << " usage: "
11fdf7f2
TL
227 << byte_u_t(osd_sum.statfs.get_used_raw()) << " used, "
228 << byte_u_t(osd_sum.statfs.available) << " / "
229 << byte_u_t(osd_sum.statfs.total) << " avail\n";
31f18b77
FG
230 *out << " pgs: ";
231 }
7c673cae 232
31f18b77 233 bool pad = false;
7c673cae 234
31f18b77
FG
235 if (num_pg_unknown > 0) {
236 float p = (float)num_pg_unknown / (float)num_pg;
237 if (f) {
238 f->dump_float("unknown_pgs_ratio", p);
7c673cae 239 } else {
31f18b77
FG
240 char b[20];
241 snprintf(b, sizeof(b), "%.3lf", p * 100.0);
242 *out << b << "% pgs unknown\n";
243 pad = true;
7c673cae 244 }
7c673cae 245 }
7c673cae 246
31f18b77
FG
247 int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
248 if (num_pg_inactive > 0) {
249 float p = (float)num_pg_inactive / (float)num_pg;
250 if (f) {
251 f->dump_float("inactive_pgs_ratio", p);
7c673cae 252 } else {
31f18b77
FG
253 if (pad) {
254 *out << " ";
255 }
256 char b[20];
257 snprintf(b, sizeof(b), "%.3f", p * 100.0);
258 *out << b << "% pgs not active\n";
259 pad = true;
7c673cae 260 }
7c673cae 261 }
31f18b77
FG
262
263 list<string> sl;
264 overall_recovery_summary(f, &sl);
265 if (!f && !sl.empty()) {
266 for (auto p = sl.begin(); p != sl.end(); ++p) {
267 if (pad) {
268 *out << " ";
269 }
270 *out << *p << "\n";
271 pad = true;
7c673cae 272 }
7c673cae 273 }
31f18b77 274 sl.clear();
7c673cae 275
31f18b77
FG
276 if (!f) {
277 unsigned max_width = 1;
9f95a23c 278 for (auto p = state_by_count.rbegin(); p != state_by_count.rend(); ++p)
31f18b77
FG
279 {
280 std::stringstream ss;
281 ss << p->first;
11fdf7f2 282 max_width = std::max<size_t>(ss.str().size(), max_width);
7c673cae
FG
283 }
284
9f95a23c 285 for (auto p = state_by_count.rbegin(); p != state_by_count.rend(); ++p)
31f18b77
FG
286 {
287 if (pad) {
288 *out << " ";
289 }
290 pad = true;
291 out->setf(std::ios::left);
292 *out << std::setw(max_width) << p->first
293 << " " << pg_state_string(p->second) << "\n";
294 out->unsetf(std::ios::left);
295 }
7c673cae
FG
296 }
297
31f18b77
FG
298 ostringstream ss_rec_io;
299 overall_recovery_rate_summary(f, &ss_rec_io);
300 ostringstream ss_client_io;
301 overall_client_io_rate_summary(f, &ss_client_io);
302 ostringstream ss_cache_io;
303 overall_cache_io_rate_summary(f, &ss_cache_io);
7c673cae 304
31f18b77
FG
305 if (!f && (ss_client_io.str().length() || ss_rec_io.str().length()
306 || ss_cache_io.str().length())) {
307 *out << "\n \n";
308 *out << " io:\n";
7c673cae
FG
309 }
310
31f18b77
FG
311 if (!f && ss_client_io.str().length())
312 *out << " client: " << ss_client_io.str() << "\n";
313 if (!f && ss_rec_io.str().length())
314 *out << " recovery: " << ss_rec_io.str() << "\n";
315 if (!f && ss_cache_io.str().length())
316 *out << " cache: " << ss_cache_io.str() << "\n";
7c673cae
FG
317}
318
9f95a23c 319void PGMapDigest::print_oneline_summary(ceph::Formatter *f, ostream *out) const
7c673cae 320{
31f18b77
FG
321 std::stringstream ss;
322
323 if (f)
324 f->open_array_section("num_pg_by_state");
325 for (auto p = num_pg_by_state.begin();
326 p != num_pg_by_state.end();
327 ++p) {
328 if (f) {
329 f->open_object_section("state");
330 f->dump_string("name", pg_state_string(p->first));
331 f->dump_unsigned("num", p->second);
332 f->close_section();
333 }
334 if (p != num_pg_by_state.begin())
335 ss << ", ";
336 ss << p->second << " " << pg_state_string(p->first);
7c673cae 337 }
31f18b77
FG
338 if (f)
339 f->close_section();
7c673cae 340
31f18b77
FG
341 string states = ss.str();
342 if (out)
343 *out << num_pg << " pgs: "
344 << states << "; "
1adf2230 345 << byte_u_t(pg_sum.stats.sum.num_bytes) << " data, "
11fdf7f2
TL
346 << byte_u_t(osd_sum.statfs.get_used()) << " used, "
347 << byte_u_t(osd_sum.statfs.available) << " / "
348 << byte_u_t(osd_sum.statfs.total) << " avail";
31f18b77
FG
349 if (f) {
350 f->dump_unsigned("num_pgs", num_pg);
351 f->dump_unsigned("num_bytes", pg_sum.stats.sum.num_bytes);
11fdf7f2
TL
352 f->dump_int("total_bytes", osd_sum.statfs.total);
353 f->dump_int("total_avail_bytes", osd_sum.statfs.available);
354 f->dump_int("total_used_bytes", osd_sum.statfs.get_used());
355 f->dump_int("total_used_raw_bytes", osd_sum.statfs.get_used_raw());
31f18b77 356 }
7c673cae 357
31f18b77
FG
358 // make non-negative; we can get negative values if osds send
359 // uncommitted stats and then "go backward" or if they are just
360 // buggy/wrong.
361 pool_stat_t pos_delta = pg_sum_delta;
362 pos_delta.floor(0);
363 if (pos_delta.stats.sum.num_rd ||
364 pos_delta.stats.sum.num_wr) {
365 if (out)
366 *out << "; ";
367 if (pos_delta.stats.sum.num_rd) {
368 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)stamp_delta;
369 if (out)
1adf2230 370 *out << byte_u_t(rd) << "/s rd, ";
31f18b77
FG
371 if (f)
372 f->dump_unsigned("read_bytes_sec", rd);
373 }
374 if (pos_delta.stats.sum.num_wr) {
375 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)stamp_delta;
376 if (out)
1adf2230 377 *out << byte_u_t(wr) << "/s wr, ";
31f18b77
FG
378 if (f)
379 f->dump_unsigned("write_bytes_sec", wr);
380 }
381 int64_t iops = (pos_delta.stats.sum.num_rd + pos_delta.stats.sum.num_wr) / (double)stamp_delta;
382 if (out)
11fdf7f2 383 *out << si_u_t(iops) << " op/s";
31f18b77
FG
384 if (f)
385 f->dump_unsigned("io_sec", iops);
7c673cae 386 }
31f18b77
FG
387
388 list<string> sl;
389 overall_recovery_summary(f, &sl);
390 if (out)
391 for (auto p = sl.begin(); p != sl.end(); ++p)
392 *out << "; " << *p;
393 std::stringstream ssr;
394 overall_recovery_rate_summary(f, &ssr);
395 if (out && ssr.str().length())
396 *out << "; " << ssr.str() << " recovering";
7c673cae
FG
397}
398
11fdf7f2
TL
399void PGMapDigest::get_recovery_stats(
400 double *misplaced_ratio,
401 double *degraded_ratio,
402 double *inactive_pgs_ratio,
403 double *unknown_pgs_ratio) const
404{
405 if (pg_sum.stats.sum.num_objects_degraded &&
406 pg_sum.stats.sum.num_object_copies > 0) {
407 *degraded_ratio = (double)pg_sum.stats.sum.num_objects_degraded /
408 (double)pg_sum.stats.sum.num_object_copies;
409 } else {
410 *degraded_ratio = 0;
411 }
412 if (pg_sum.stats.sum.num_objects_misplaced &&
413 pg_sum.stats.sum.num_object_copies > 0) {
414 *misplaced_ratio = (double)pg_sum.stats.sum.num_objects_misplaced /
415 (double)pg_sum.stats.sum.num_object_copies;
416 } else {
417 *misplaced_ratio = 0;
418 }
419 if (num_pg > 0) {
420 int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
421 *inactive_pgs_ratio = (double)num_pg_inactive / (double)num_pg;
422 *unknown_pgs_ratio = (double)num_pg_unknown / (double)num_pg;
423 } else {
424 *inactive_pgs_ratio = 0;
425 *unknown_pgs_ratio = 0;
426 }
427}
428
9f95a23c 429void PGMapDigest::recovery_summary(ceph::Formatter *f, list<string> *psl,
b32b8144 430 const pool_stat_t& pool_sum) const
7c673cae 431{
b32b8144
FG
432 if (pool_sum.stats.sum.num_objects_degraded && pool_sum.stats.sum.num_object_copies > 0) {
433 double pc = (double)pool_sum.stats.sum.num_objects_degraded /
434 (double)pool_sum.stats.sum.num_object_copies * (double)100.0;
31f18b77
FG
435 char b[20];
436 snprintf(b, sizeof(b), "%.3lf", pc);
437 if (f) {
b32b8144
FG
438 f->dump_unsigned("degraded_objects", pool_sum.stats.sum.num_objects_degraded);
439 f->dump_unsigned("degraded_total", pool_sum.stats.sum.num_object_copies);
31f18b77
FG
440 f->dump_float("degraded_ratio", pc / 100.0);
441 } else {
442 ostringstream ss;
b32b8144
FG
443 ss << pool_sum.stats.sum.num_objects_degraded
444 << "/" << pool_sum.stats.sum.num_object_copies << " objects degraded (" << b << "%)";
31f18b77
FG
445 psl->push_back(ss.str());
446 }
447 }
b32b8144
FG
448 if (pool_sum.stats.sum.num_objects_misplaced && pool_sum.stats.sum.num_object_copies > 0) {
449 double pc = (double)pool_sum.stats.sum.num_objects_misplaced /
450 (double)pool_sum.stats.sum.num_object_copies * (double)100.0;
31f18b77
FG
451 char b[20];
452 snprintf(b, sizeof(b), "%.3lf", pc);
453 if (f) {
b32b8144
FG
454 f->dump_unsigned("misplaced_objects", pool_sum.stats.sum.num_objects_misplaced);
455 f->dump_unsigned("misplaced_total", pool_sum.stats.sum.num_object_copies);
31f18b77
FG
456 f->dump_float("misplaced_ratio", pc / 100.0);
457 } else {
458 ostringstream ss;
b32b8144
FG
459 ss << pool_sum.stats.sum.num_objects_misplaced
460 << "/" << pool_sum.stats.sum.num_object_copies << " objects misplaced (" << b << "%)";
31f18b77
FG
461 psl->push_back(ss.str());
462 }
463 }
b32b8144
FG
464 if (pool_sum.stats.sum.num_objects_unfound && pool_sum.stats.sum.num_objects) {
465 double pc = (double)pool_sum.stats.sum.num_objects_unfound /
466 (double)pool_sum.stats.sum.num_objects * (double)100.0;
31f18b77
FG
467 char b[20];
468 snprintf(b, sizeof(b), "%.3lf", pc);
469 if (f) {
b32b8144
FG
470 f->dump_unsigned("unfound_objects", pool_sum.stats.sum.num_objects_unfound);
471 f->dump_unsigned("unfound_total", pool_sum.stats.sum.num_objects);
31f18b77
FG
472 f->dump_float("unfound_ratio", pc / 100.0);
473 } else {
474 ostringstream ss;
b32b8144
FG
475 ss << pool_sum.stats.sum.num_objects_unfound
476 << "/" << pool_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
31f18b77
FG
477 psl->push_back(ss.str());
478 }
7c673cae 479 }
7c673cae
FG
480}
481
9f95a23c 482void PGMapDigest::recovery_rate_summary(ceph::Formatter *f, ostream *out,
31f18b77
FG
483 const pool_stat_t& delta_sum,
484 utime_t delta_stamp) const
7c673cae 485{
31f18b77
FG
486 // make non-negative; we can get negative values if osds send
487 // uncommitted stats and then "go backward" or if they are just
488 // buggy/wrong.
489 pool_stat_t pos_delta = delta_sum;
490 pos_delta.floor(0);
491 if (pos_delta.stats.sum.num_objects_recovered ||
492 pos_delta.stats.sum.num_bytes_recovered ||
493 pos_delta.stats.sum.num_keys_recovered) {
494 int64_t objps = pos_delta.stats.sum.num_objects_recovered / (double)delta_stamp;
495 int64_t bps = pos_delta.stats.sum.num_bytes_recovered / (double)delta_stamp;
496 int64_t kps = pos_delta.stats.sum.num_keys_recovered / (double)delta_stamp;
497 if (f) {
498 f->dump_int("recovering_objects_per_sec", objps);
499 f->dump_int("recovering_bytes_per_sec", bps);
500 f->dump_int("recovering_keys_per_sec", kps);
501 f->dump_int("num_objects_recovered", pos_delta.stats.sum.num_objects_recovered);
502 f->dump_int("num_bytes_recovered", pos_delta.stats.sum.num_bytes_recovered);
503 f->dump_int("num_keys_recovered", pos_delta.stats.sum.num_keys_recovered);
504 } else {
1adf2230 505 *out << byte_u_t(bps) << "/s";
31f18b77 506 if (pos_delta.stats.sum.num_keys_recovered)
11fdf7f2
TL
507 *out << ", " << si_u_t(kps) << " keys/s";
508 *out << ", " << si_u_t(objps) << " objects/s";
31f18b77 509 }
7c673cae 510 }
31f18b77 511}
7c673cae 512
9f95a23c 513void PGMapDigest::overall_recovery_rate_summary(ceph::Formatter *f, ostream *out) const
31f18b77
FG
514{
515 recovery_rate_summary(f, out, pg_sum_delta, stamp_delta);
7c673cae
FG
516}
517
9f95a23c 518void PGMapDigest::overall_recovery_summary(ceph::Formatter *f, list<string> *psl) const
7c673cae 519{
31f18b77 520 recovery_summary(f, psl, pg_sum);
7c673cae
FG
521}
522
9f95a23c 523void PGMapDigest::pool_recovery_rate_summary(ceph::Formatter *f, ostream *out,
31f18b77 524 uint64_t poolid) const
7c673cae 525{
31f18b77
FG
526 auto p = per_pool_sum_delta.find(poolid);
527 if (p == per_pool_sum_delta.end())
528 return;
7c673cae 529
31f18b77 530 auto ts = per_pool_sum_deltas_stamps.find(p->first);
11fdf7f2 531 ceph_assert(ts != per_pool_sum_deltas_stamps.end());
31f18b77
FG
532 recovery_rate_summary(f, out, p->second.first, ts->second);
533}
7c673cae 534
9f95a23c 535void PGMapDigest::pool_recovery_summary(ceph::Formatter *f, list<string> *psl,
31f18b77
FG
536 uint64_t poolid) const
537{
b32b8144
FG
538 auto p = pg_pool_sum.find(poolid);
539 if (p == pg_pool_sum.end())
31f18b77 540 return;
7c673cae 541
b32b8144 542 recovery_summary(f, psl, p->second);
7c673cae
FG
543}
544
9f95a23c 545void PGMapDigest::client_io_rate_summary(ceph::Formatter *f, ostream *out,
31f18b77
FG
546 const pool_stat_t& delta_sum,
547 utime_t delta_stamp) const
7c673cae 548{
31f18b77
FG
549 pool_stat_t pos_delta = delta_sum;
550 pos_delta.floor(0);
551 if (pos_delta.stats.sum.num_rd ||
552 pos_delta.stats.sum.num_wr) {
553 if (pos_delta.stats.sum.num_rd) {
554 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)delta_stamp;
555 if (f) {
556 f->dump_int("read_bytes_sec", rd);
557 } else {
1adf2230 558 *out << byte_u_t(rd) << "/s rd, ";
31f18b77
FG
559 }
560 }
561 if (pos_delta.stats.sum.num_wr) {
562 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)delta_stamp;
563 if (f) {
564 f->dump_int("write_bytes_sec", wr);
565 } else {
1adf2230 566 *out << byte_u_t(wr) << "/s wr, ";
31f18b77
FG
567 }
568 }
569 int64_t iops_rd = pos_delta.stats.sum.num_rd / (double)delta_stamp;
570 int64_t iops_wr = pos_delta.stats.sum.num_wr / (double)delta_stamp;
571 if (f) {
572 f->dump_int("read_op_per_sec", iops_rd);
573 f->dump_int("write_op_per_sec", iops_wr);
574 } else {
11fdf7f2 575 *out << si_u_t(iops_rd) << " op/s rd, " << si_u_t(iops_wr) << " op/s wr";
31f18b77 576 }
7c673cae
FG
577 }
578}
579
9f95a23c 580void PGMapDigest::overall_client_io_rate_summary(ceph::Formatter *f, ostream *out) const
7c673cae 581{
31f18b77
FG
582 client_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
583}
7c673cae 584
9f95a23c 585void PGMapDigest::pool_client_io_rate_summary(ceph::Formatter *f, ostream *out,
31f18b77
FG
586 uint64_t poolid) const
587{
588 auto p = per_pool_sum_delta.find(poolid);
589 if (p == per_pool_sum_delta.end())
7c673cae
FG
590 return;
591
31f18b77 592 auto ts = per_pool_sum_deltas_stamps.find(p->first);
11fdf7f2 593 ceph_assert(ts != per_pool_sum_deltas_stamps.end());
31f18b77 594 client_io_rate_summary(f, out, p->second.first, ts->second);
7c673cae
FG
595}
596
9f95a23c 597void PGMapDigest::cache_io_rate_summary(ceph::Formatter *f, ostream *out,
31f18b77
FG
598 const pool_stat_t& delta_sum,
599 utime_t delta_stamp) const
7c673cae 600{
31f18b77
FG
601 pool_stat_t pos_delta = delta_sum;
602 pos_delta.floor(0);
603 bool have_output = false;
7c673cae 604
31f18b77
FG
605 if (pos_delta.stats.sum.num_flush) {
606 int64_t flush = (pos_delta.stats.sum.num_flush_kb << 10) / (double)delta_stamp;
607 if (f) {
608 f->dump_int("flush_bytes_sec", flush);
609 } else {
1adf2230 610 *out << byte_u_t(flush) << "/s flush";
31f18b77 611 have_output = true;
7c673cae
FG
612 }
613 }
31f18b77
FG
614 if (pos_delta.stats.sum.num_evict) {
615 int64_t evict = (pos_delta.stats.sum.num_evict_kb << 10) / (double)delta_stamp;
616 if (f) {
617 f->dump_int("evict_bytes_sec", evict);
618 } else {
619 if (have_output)
620 *out << ", ";
1adf2230 621 *out << byte_u_t(evict) << "/s evict";
31f18b77
FG
622 have_output = true;
623 }
7c673cae 624 }
31f18b77
FG
625 if (pos_delta.stats.sum.num_promote) {
626 int64_t promote = pos_delta.stats.sum.num_promote / (double)delta_stamp;
627 if (f) {
628 f->dump_int("promote_op_per_sec", promote);
629 } else {
630 if (have_output)
631 *out << ", ";
11fdf7f2 632 *out << si_u_t(promote) << " op/s promote";
31f18b77
FG
633 have_output = true;
634 }
7c673cae 635 }
31f18b77
FG
636 if (pos_delta.stats.sum.num_flush_mode_low) {
637 if (f) {
638 f->dump_int("num_flush_mode_low", pos_delta.stats.sum.num_flush_mode_low);
639 } else {
640 if (have_output)
641 *out << ", ";
11fdf7f2 642 *out << si_u_t(pos_delta.stats.sum.num_flush_mode_low) << " PGs flushing";
31f18b77
FG
643 have_output = true;
644 }
7c673cae 645 }
31f18b77
FG
646 if (pos_delta.stats.sum.num_flush_mode_high) {
647 if (f) {
648 f->dump_int("num_flush_mode_high", pos_delta.stats.sum.num_flush_mode_high);
649 } else {
650 if (have_output)
651 *out << ", ";
11fdf7f2 652 *out << si_u_t(pos_delta.stats.sum.num_flush_mode_high) << " PGs flushing (high)";
31f18b77
FG
653 have_output = true;
654 }
7c673cae 655 }
31f18b77
FG
656 if (pos_delta.stats.sum.num_evict_mode_some) {
657 if (f) {
658 f->dump_int("num_evict_mode_some", pos_delta.stats.sum.num_evict_mode_some);
659 } else {
660 if (have_output)
661 *out << ", ";
11fdf7f2 662 *out << si_u_t(pos_delta.stats.sum.num_evict_mode_some) << " PGs evicting";
31f18b77
FG
663 have_output = true;
664 }
665 }
666 if (pos_delta.stats.sum.num_evict_mode_full) {
667 if (f) {
668 f->dump_int("num_evict_mode_full", pos_delta.stats.sum.num_evict_mode_full);
669 } else {
670 if (have_output)
671 *out << ", ";
11fdf7f2 672 *out << si_u_t(pos_delta.stats.sum.num_evict_mode_full) << " PGs evicting (full)";
31f18b77 673 }
7c673cae
FG
674 }
675}
676
9f95a23c 677void PGMapDigest::overall_cache_io_rate_summary(ceph::Formatter *f, ostream *out) const
7c673cae 678{
31f18b77 679 cache_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
7c673cae
FG
680}
681
9f95a23c 682void PGMapDigest::pool_cache_io_rate_summary(ceph::Formatter *f, ostream *out,
31f18b77 683 uint64_t poolid) const
7c673cae 684{
31f18b77
FG
685 auto p = per_pool_sum_delta.find(poolid);
686 if (p == per_pool_sum_delta.end())
687 return;
7c673cae 688
31f18b77 689 auto ts = per_pool_sum_deltas_stamps.find(p->first);
11fdf7f2 690 ceph_assert(ts != per_pool_sum_deltas_stamps.end());
31f18b77 691 cache_io_rate_summary(f, out, p->second.first, ts->second);
7c673cae
FG
692}
693
d2e6a577 694ceph_statfs PGMapDigest::get_statfs(OSDMap &osdmap,
20effc67 695 std::optional<int64_t> data_pool) const
d2e6a577
FG
696{
697 ceph_statfs statfs;
698 bool filter = false;
699 object_stat_sum_t sum;
700
701 if (data_pool) {
702 auto i = pg_pool_sum.find(*data_pool);
703 if (i != pg_pool_sum.end()) {
704 sum = i->second.stats.sum;
705 filter = true;
706 }
707 }
708
709 if (filter) {
710 statfs.kb_used = (sum.num_bytes >> 10);
711 statfs.kb_avail = get_pool_free_space(osdmap, *data_pool) >> 10;
712 statfs.num_objects = sum.num_objects;
713 statfs.kb = statfs.kb_used + statfs.kb_avail;
714 } else {
715 // these are in KB.
11fdf7f2
TL
716 statfs.kb = osd_sum.statfs.kb();
717 statfs.kb_used = osd_sum.statfs.kb_used_raw();
718 statfs.kb_avail = osd_sum.statfs.kb_avail();
d2e6a577
FG
719 statfs.num_objects = pg_sum.stats.sum.num_objects;
720 }
721
722 return statfs;
723}
724
31f18b77
FG
725void PGMapDigest::dump_pool_stats_full(
726 const OSDMap &osd_map,
727 stringstream *ss,
9f95a23c 728 ceph::Formatter *f,
31f18b77 729 bool verbose) const
7c673cae 730{
31f18b77 731 TextTable tbl;
7c673cae 732
31f18b77
FG
733 if (f) {
734 f->open_array_section("pools");
735 } else {
11fdf7f2 736 tbl.define_column("POOL", TextTable::LEFT, TextTable::LEFT);
f67539c2
TL
737 tbl.define_column("ID", TextTable::RIGHT, TextTable::RIGHT);
738 tbl.define_column("PGS", TextTable::RIGHT, TextTable::RIGHT);
739 tbl.define_column("STORED", TextTable::RIGHT, TextTable::RIGHT);
9f95a23c 740 if (verbose) {
f67539c2
TL
741 tbl.define_column("(DATA)", TextTable::RIGHT, TextTable::RIGHT);
742 tbl.define_column("(OMAP)", TextTable::RIGHT, TextTable::RIGHT);
9f95a23c 743 }
f67539c2
TL
744 tbl.define_column("OBJECTS", TextTable::RIGHT, TextTable::RIGHT);
745 tbl.define_column("USED", TextTable::RIGHT, TextTable::RIGHT);
9f95a23c 746 if (verbose) {
f67539c2
TL
747 tbl.define_column("(DATA)", TextTable::RIGHT, TextTable::RIGHT);
748 tbl.define_column("(OMAP)", TextTable::RIGHT, TextTable::RIGHT);
9f95a23c 749 }
f67539c2
TL
750 tbl.define_column("%USED", TextTable::RIGHT, TextTable::RIGHT);
751 tbl.define_column("MAX AVAIL", TextTable::RIGHT, TextTable::RIGHT);
11fdf7f2 752
31f18b77 753 if (verbose) {
f67539c2
TL
754 tbl.define_column("QUOTA OBJECTS", TextTable::RIGHT, TextTable::RIGHT);
755 tbl.define_column("QUOTA BYTES", TextTable::RIGHT, TextTable::RIGHT);
756 tbl.define_column("DIRTY", TextTable::RIGHT, TextTable::RIGHT);
757 tbl.define_column("USED COMPR", TextTable::RIGHT, TextTable::RIGHT);
758 tbl.define_column("UNDER COMPR", TextTable::RIGHT, TextTable::RIGHT);
31f18b77
FG
759 }
760 }
761
762 map<int,uint64_t> avail_by_rule;
763 for (auto p = osd_map.get_pools().begin();
764 p != osd_map.get_pools().end(); ++p) {
765 int64_t pool_id = p->first;
766 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
767 continue;
11fdf7f2 768
31f18b77 769 const string& pool_name = osd_map.get_pool_name(pool_id);
f91f0fd5 770 auto pool_pg_num = osd_map.get_pg_num(pool_id);
31f18b77
FG
771 const pool_stat_t &stat = pg_pool_sum.at(pool_id);
772
773 const pg_pool_t *pool = osd_map.get_pg_pool(pool_id);
20effc67 774 int ruleno = pool->get_crush_rule();
31f18b77 775 int64_t avail;
31f18b77
FG
776 if (avail_by_rule.count(ruleno) == 0) {
777 // FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked
778 avail = get_rule_avail(ruleno);
779 if (avail < 0)
780 avail = 0;
781 avail_by_rule[ruleno] = avail;
782 } else {
783 avail = avail_by_rule[ruleno];
784 }
31f18b77
FG
785 if (f) {
786 f->open_object_section("pool");
787 f->dump_string("name", pool_name);
788 f->dump_int("id", pool_id);
789 f->open_object_section("stats");
790 } else {
791 tbl << pool_name
f91f0fd5
TL
792 << pool_id
793 << pool_pg_num;
31f18b77 794 }
11fdf7f2 795 float raw_used_rate = osd_map.pool_raw_used_rate(pool_id);
81eedcae 796 bool per_pool = use_per_pool_stats();
9f95a23c 797 bool per_pool_omap = use_per_pool_omap_stats();
81eedcae 798 dump_object_stat_sum(tbl, f, stat, avail, raw_used_rate, verbose, per_pool,
9f95a23c 799 per_pool_omap, pool);
11fdf7f2 800 if (f) {
31f18b77 801 f->close_section(); // stats
31f18b77 802 f->close_section(); // pool
11fdf7f2
TL
803 } else {
804 tbl << TextTable::endrow;
805 }
31f18b77
FG
806 }
807 if (f)
808 f->close_section();
809 else {
11fdf7f2 810 ceph_assert(ss != nullptr);
9f95a23c 811 *ss << "--- POOLS ---\n";
31f18b77
FG
812 *ss << tbl;
813 }
814}
815
11fdf7f2 816void PGMapDigest::dump_cluster_stats(stringstream *ss,
9f95a23c 817 ceph::Formatter *f,
11fdf7f2 818 bool verbose) const
31f18b77
FG
819{
820 if (f) {
821 f->open_object_section("stats");
11fdf7f2
TL
822 f->dump_int("total_bytes", osd_sum.statfs.total);
823 f->dump_int("total_avail_bytes", osd_sum.statfs.available);
824 f->dump_int("total_used_bytes", osd_sum.statfs.get_used());
825 f->dump_int("total_used_raw_bytes", osd_sum.statfs.get_used_raw());
826 f->dump_float("total_used_raw_ratio", osd_sum.statfs.get_used_raw_ratio());
81eedcae
TL
827 f->dump_unsigned("num_osds", osd_sum.num_osds);
828 f->dump_unsigned("num_per_pool_osds", osd_sum.num_per_pool_osds);
9f95a23c 829 f->dump_unsigned("num_per_pool_omap_osds", osd_sum.num_per_pool_omap_osds);
11fdf7f2
TL
830 f->close_section();
831 f->open_object_section("stats_by_class");
832 for (auto& i : osd_sum_by_class) {
833 f->open_object_section(i.first.c_str());
834 f->dump_int("total_bytes", i.second.statfs.total);
835 f->dump_int("total_avail_bytes", i.second.statfs.available);
836 f->dump_int("total_used_bytes", i.second.statfs.get_used());
837 f->dump_int("total_used_raw_bytes", i.second.statfs.get_used_raw());
838 f->dump_float("total_used_raw_ratio",
839 i.second.statfs.get_used_raw_ratio());
840 f->close_section();
31f18b77
FG
841 }
842 f->close_section();
843 } else {
11fdf7f2 844 ceph_assert(ss != nullptr);
31f18b77 845 TextTable tbl;
11fdf7f2 846 tbl.define_column("CLASS", TextTable::LEFT, TextTable::LEFT);
f67539c2
TL
847 tbl.define_column("SIZE", TextTable::RIGHT, TextTable::RIGHT);
848 tbl.define_column("AVAIL", TextTable::RIGHT, TextTable::RIGHT);
849 tbl.define_column("USED", TextTable::RIGHT, TextTable::RIGHT);
850 tbl.define_column("RAW USED", TextTable::RIGHT, TextTable::RIGHT);
851 tbl.define_column("%RAW USED", TextTable::RIGHT, TextTable::RIGHT);
31f18b77 852
11fdf7f2
TL
853
854 for (auto& i : osd_sum_by_class) {
855 tbl << i.first;
856 tbl << stringify(byte_u_t(i.second.statfs.total))
857 << stringify(byte_u_t(i.second.statfs.available))
858 << stringify(byte_u_t(i.second.statfs.get_used()))
859 << stringify(byte_u_t(i.second.statfs.get_used_raw()))
860 << percentify(i.second.statfs.get_used_raw_ratio()*100.0)
861 << TextTable::endrow;
862 }
863 tbl << "TOTAL";
864 tbl << stringify(byte_u_t(osd_sum.statfs.total))
865 << stringify(byte_u_t(osd_sum.statfs.available))
866 << stringify(byte_u_t(osd_sum.statfs.get_used()))
867 << stringify(byte_u_t(osd_sum.statfs.get_used_raw()))
868 << percentify(osd_sum.statfs.get_used_raw_ratio()*100.0)
869 << TextTable::endrow;
870
9f95a23c 871 *ss << "--- RAW STORAGE ---\n";
31f18b77
FG
872 *ss << tbl;
873 }
874}
875
876void PGMapDigest::dump_object_stat_sum(
9f95a23c 877 TextTable &tbl, ceph::Formatter *f,
11fdf7f2 878 const pool_stat_t &pool_stat, uint64_t avail,
9f95a23c 879 float raw_used_rate, bool verbose, bool per_pool, bool per_pool_omap,
31f18b77
FG
880 const pg_pool_t *pool)
881{
11fdf7f2
TL
882 const object_stat_sum_t &sum = pool_stat.stats.sum;
883 const store_statfs_t statfs = pool_stat.store_stats;
884
885 if (sum.num_object_copies > 0) {
886 raw_used_rate *= (float)(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies;
887 }
81eedcae 888
9f95a23c
TL
889 uint64_t used_data_bytes = pool_stat.get_allocated_data_bytes(per_pool);
890 uint64_t used_omap_bytes = pool_stat.get_allocated_omap_bytes(per_pool_omap);
891 uint64_t used_bytes = used_data_bytes + used_omap_bytes;
31f18b77
FG
892
893 float used = 0.0;
3efd9988 894 // note avail passed in is raw_avail, calc raw_used here.
31f18b77 895 if (avail) {
11fdf7f2 896 used = used_bytes;
31f18b77 897 used /= used + avail;
11fdf7f2 898 } else if (used_bytes) {
31f18b77
FG
899 used = 1.0;
900 }
11fdf7f2
TL
901 auto avail_res = raw_used_rate ? avail / raw_used_rate : 0;
902 // an approximation for actually stored user data
9f95a23c
TL
903 auto stored_data_normalized = pool_stat.get_user_data_bytes(
904 raw_used_rate, per_pool);
905 auto stored_omap_normalized = pool_stat.get_user_omap_bytes(
906 raw_used_rate, per_pool_omap);
907 auto stored_normalized = stored_data_normalized + stored_omap_normalized;
908 // same, amplied by replication or EC
909 auto stored_raw = stored_normalized * raw_used_rate;
31f18b77 910 if (f) {
11fdf7f2 911 f->dump_int("stored", stored_normalized);
9f95a23c
TL
912 if (verbose) {
913 f->dump_int("stored_data", stored_data_normalized);
914 f->dump_int("stored_omap", stored_omap_normalized);
915 }
31f18b77 916 f->dump_int("objects", sum.num_objects);
11fdf7f2
TL
917 f->dump_int("kb_used", shift_round_up(used_bytes, 10));
918 f->dump_int("bytes_used", used_bytes);
9f95a23c
TL
919 if (verbose) {
920 f->dump_int("data_bytes_used", used_data_bytes);
921 f->dump_int("omap_bytes_used", used_omap_bytes);
922 }
11fdf7f2
TL
923 f->dump_float("percent_used", used);
924 f->dump_unsigned("max_avail", avail_res);
31f18b77
FG
925 if (verbose) {
926 f->dump_int("quota_objects", pool->quota_max_objects);
927 f->dump_int("quota_bytes", pool->quota_max_bytes);
522d829b
TL
928 if (pool->is_tier()) {
929 f->dump_int("dirty", sum.num_objects_dirty);
930 } else {
931 f->dump_int("dirty", 0);
932 }
31f18b77
FG
933 f->dump_int("rd", sum.num_rd);
934 f->dump_int("rd_bytes", sum.num_rd_kb * 1024ull);
935 f->dump_int("wr", sum.num_wr);
936 f->dump_int("wr_bytes", sum.num_wr_kb * 1024ull);
11fdf7f2
TL
937 f->dump_int("compress_bytes_used", statfs.data_compressed_allocated);
938 f->dump_int("compress_under_bytes", statfs.data_compressed_original);
939 // Stored by user amplified by replication
9f95a23c 940 f->dump_int("stored_raw", stored_raw);
f6b5b4d7 941 f->dump_unsigned("avail_raw", avail);
31f18b77
FG
942 }
943 } else {
11fdf7f2 944 tbl << stringify(byte_u_t(stored_normalized));
9f95a23c
TL
945 if (verbose) {
946 tbl << stringify(byte_u_t(stored_data_normalized));
947 tbl << stringify(byte_u_t(stored_omap_normalized));
948 }
11fdf7f2
TL
949 tbl << stringify(si_u_t(sum.num_objects));
950 tbl << stringify(byte_u_t(used_bytes));
9f95a23c
TL
951 if (verbose) {
952 tbl << stringify(byte_u_t(used_data_bytes));
953 tbl << stringify(byte_u_t(used_omap_bytes));
954 }
31f18b77 955 tbl << percentify(used*100);
11fdf7f2 956 tbl << stringify(byte_u_t(avail_res));
31f18b77 957 if (verbose) {
11fdf7f2
TL
958 if (pool->quota_max_objects == 0)
959 tbl << "N/A";
960 else
961 tbl << stringify(si_u_t(pool->quota_max_objects));
11fdf7f2
TL
962 if (pool->quota_max_bytes == 0)
963 tbl << "N/A";
964 else
965 tbl << stringify(byte_u_t(pool->quota_max_bytes));
522d829b
TL
966 if (pool->is_tier()) {
967 tbl << stringify(si_u_t(sum.num_objects_dirty));
968 } else {
969 tbl << "N/A";
970 }
971 tbl << stringify(byte_u_t(statfs.data_compressed_allocated));
972 tbl << stringify(byte_u_t(statfs.data_compressed_original));
31f18b77
FG
973 }
974 }
975}
976
d2e6a577
FG
977int64_t PGMapDigest::get_pool_free_space(const OSDMap &osd_map,
978 int64_t poolid) const
979{
980 const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
20effc67 981 int ruleno = pool->get_crush_rule();
d2e6a577
FG
982 int64_t avail;
983 avail = get_rule_avail(ruleno);
984 if (avail < 0)
985 avail = 0;
986
11fdf7f2 987 return avail / osd_map.pool_raw_used_rate(poolid);
d2e6a577
FG
988}
989
31f18b77
FG
990int64_t PGMap::get_rule_avail(const OSDMap& osdmap, int ruleno) const
991{
992 map<int,float> wm;
993 int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm);
994 if (r < 0) {
995 return r;
996 }
997 if (wm.empty()) {
998 return 0;
999 }
1000
11fdf7f2 1001 float fratio = osdmap.get_full_ratio();
31f18b77
FG
1002
1003 int64_t min = -1;
1004 for (auto p = wm.begin(); p != wm.end(); ++p) {
1005 auto osd_info = osd_stat.find(p->first);
1006 if (osd_info != osd_stat.end()) {
11fdf7f2 1007 if (osd_info->second.statfs.total == 0 || p->second == 0) {
31f18b77
FG
1008 // osd must be out, hence its stats have been zeroed
1009 // (unless we somehow managed to have a disk with size 0...)
1010 //
1011 // (p->second == 0), if osd weight is 0, no need to
1012 // calculate proj below.
1013 continue;
1014 }
11fdf7f2 1015 double unusable = (double)osd_info->second.statfs.kb() *
31f18b77 1016 (1.0 - fratio);
11fdf7f2 1017 double avail = std::max(0.0, (double)osd_info->second.statfs.kb_avail() - unusable);
31f18b77
FG
1018 avail *= 1024.0;
1019 int64_t proj = (int64_t)(avail / (double)p->second);
1020 if (min < 0 || proj < min) {
1021 min = proj;
1022 }
1023 } else {
94b18763
FG
1024 if (osdmap.is_up(p->first)) {
1025 // This is a level 4 rather than an error, because we might have
1026 // only just started, and not received the first stats message yet.
1027 dout(4) << "OSD " << p->first << " is up, but has no stats" << dendl;
1028 }
31f18b77
FG
1029 }
1030 }
1031 return min;
1032}
1033
1034void PGMap::get_rules_avail(const OSDMap& osdmap,
1035 std::map<int,int64_t> *avail_map) const
1036{
1037 avail_map->clear();
1038 for (auto p : osdmap.get_pools()) {
1039 int64_t pool_id = p.first;
1040 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
1041 continue;
1042 const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
20effc67 1043 int ruleno = pool->get_crush_rule();
31f18b77
FG
1044 if (avail_map->count(ruleno) == 0)
1045 (*avail_map)[ruleno] = get_rule_avail(osdmap, ruleno);
1046 }
1047}
1048
1049// ---------------------
1050// PGMap
1051
9f95a23c 1052void PGMap::Incremental::dump(ceph::Formatter *f) const
7c673cae
FG
1053{
1054 f->dump_unsigned("version", version);
1055 f->dump_stream("stamp") << stamp;
31f18b77
FG
1056 f->dump_unsigned("osdmap_epoch", osdmap_epoch);
1057 f->dump_unsigned("pg_scan_epoch", pg_scan);
7c673cae 1058
31f18b77
FG
1059 f->open_array_section("pg_stat_updates");
1060 for (auto p = pg_stat_updates.begin(); p != pg_stat_updates.end(); ++p) {
1061 f->open_object_section("pg_stat");
1062 f->dump_stream("pgid") << p->first;
1063 p->second.dump(f);
1064 f->close_section();
1065 }
7c673cae
FG
1066 f->close_section();
1067
31f18b77
FG
1068 f->open_array_section("osd_stat_updates");
1069 for (auto p = osd_stat_updates.begin(); p != osd_stat_updates.end(); ++p) {
1070 f->open_object_section("osd_stat");
1071 f->dump_int("osd", p->first);
1072 p->second.dump(f);
7c673cae
FG
1073 f->close_section();
1074 }
1075 f->close_section();
11fdf7f2
TL
1076 f->open_array_section("pool_statfs_updates");
1077 for (auto p = pool_statfs_updates.begin(); p != pool_statfs_updates.end(); ++p) {
1078 f->open_object_section("pool_statfs");
1079 f->dump_stream("poolid/osd") << p->first;
1080 p->second.dump(f);
1081 f->close_section();
1082 }
1083 f->close_section();
7c673cae 1084
31f18b77
FG
1085 f->open_array_section("osd_stat_removals");
1086 for (auto p = osd_stat_rm.begin(); p != osd_stat_rm.end(); ++p)
1087 f->dump_int("osd", *p);
7c673cae 1088 f->close_section();
7c673cae 1089
31f18b77
FG
1090 f->open_array_section("pg_removals");
1091 for (auto p = pg_remove.begin(); p != pg_remove.end(); ++p)
1092 f->dump_stream("pgid") << *p;
7c673cae
FG
1093 f->close_section();
1094}
1095
31f18b77 1096void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
7c673cae 1097{
31f18b77
FG
1098 o.push_back(new Incremental);
1099 o.push_back(new Incremental);
1100 o.back()->version = 1;
1101 o.back()->stamp = utime_t(123,345);
1102 o.push_back(new Incremental);
1103 o.back()->version = 2;
11fdf7f2 1104 o.back()->pg_stat_updates[pg_t(1,2)] = pg_stat_t();
31f18b77 1105 o.back()->osd_stat_updates[5] = osd_stat_t();
31f18b77
FG
1106 o.push_back(new Incremental);
1107 o.back()->version = 3;
1108 o.back()->osdmap_epoch = 1;
1109 o.back()->pg_scan = 2;
11fdf7f2 1110 o.back()->pg_stat_updates[pg_t(4,5)] = pg_stat_t();
31f18b77 1111 o.back()->osd_stat_updates[6] = osd_stat_t();
11fdf7f2 1112 o.back()->pg_remove.insert(pg_t(1,2));
31f18b77 1113 o.back()->osd_stat_rm.insert(5);
11fdf7f2 1114 o.back()->pool_statfs_updates[std::make_pair(1234,4)] = store_statfs_t();
7c673cae
FG
1115}
1116
31f18b77
FG
1117// --
1118
1119void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
7c673cae 1120{
11fdf7f2 1121 ceph_assert(inc.version == version+1);
31f18b77 1122 version++;
7c673cae 1123
31f18b77 1124 pool_stat_t pg_sum_old = pg_sum;
11fdf7f2
TL
1125 mempool::pgmap::unordered_map<int32_t, pool_stat_t> pg_pool_sum_old;
1126 pg_pool_sum_old = pg_pool_sum;
7c673cae 1127
31f18b77
FG
1128 for (auto p = inc.pg_stat_updates.begin();
1129 p != inc.pg_stat_updates.end();
1130 ++p) {
1131 const pg_t &update_pg(p->first);
11fdf7f2 1132 auto update_pool = update_pg.pool();
31f18b77 1133 const pg_stat_t &update_stat(p->second);
7c673cae 1134
11fdf7f2
TL
1135 auto pg_stat_iter = pg_stat.find(update_pg);
1136 pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool];
1137 if (pg_stat_iter == pg_stat.end()) {
31f18b77
FG
1138 pg_stat.insert(make_pair(update_pg, update_stat));
1139 } else {
11fdf7f2
TL
1140 stat_pg_sub(update_pg, pg_stat_iter->second);
1141 pool_sum_ref.sub(pg_stat_iter->second);
1142 pg_stat_iter->second = update_stat;
7c673cae 1143 }
31f18b77 1144 stat_pg_add(update_pg, update_stat);
11fdf7f2 1145 pool_sum_ref.add(update_stat);
7c673cae 1146 }
11fdf7f2
TL
1147
1148 for (auto p = inc.pool_statfs_updates.begin();
1149 p != inc.pool_statfs_updates.end();
1150 ++p) {
1151 auto update_pool = p->first.first;
1152 auto update_osd = p->first.second;
1153 auto& statfs_inc = p->second;
1154
1155 auto pool_statfs_iter =
1156 pool_statfs.find(std::make_pair(update_pool, update_osd));
f6b5b4d7 1157 if (pg_pool_sum.count(update_pool)) {
eafe8130
TL
1158 pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool];
1159 if (pool_statfs_iter == pool_statfs.end()) {
1160 pool_statfs.emplace(std::make_pair(update_pool, update_osd), statfs_inc);
1161 } else {
1162 pool_sum_ref.sub(pool_statfs_iter->second);
1163 pool_statfs_iter->second = statfs_inc;
1164 }
1165 pool_sum_ref.add(statfs_inc);
11fdf7f2 1166 }
11fdf7f2
TL
1167 }
1168
31f18b77
FG
1169 for (auto p = inc.get_osd_stat_updates().begin();
1170 p != inc.get_osd_stat_updates().end();
1171 ++p) {
1172 int osd = p->first;
1173 const osd_stat_t &new_stats(p->second);
7c673cae 1174
31f18b77
FG
1175 auto t = osd_stat.find(osd);
1176 if (t == osd_stat.end()) {
1177 osd_stat.insert(make_pair(osd, new_stats));
1178 } else {
1179 stat_osd_sub(t->first, t->second);
1180 t->second = new_stats;
1181 }
31f18b77 1182 stat_osd_add(osd, new_stats);
31f18b77
FG
1183 }
1184 set<int64_t> deleted_pools;
1185 for (auto p = inc.pg_remove.begin();
1186 p != inc.pg_remove.end();
1187 ++p) {
1188 const pg_t &removed_pg(*p);
1189 auto s = pg_stat.find(removed_pg);
11fdf7f2 1190 bool pool_erased = false;
31f18b77 1191 if (s != pg_stat.end()) {
11fdf7f2 1192 pool_erased = stat_pg_sub(removed_pg, s->second);
f6b5b4d7
TL
1193
1194 // decrease pool stats if pg was removed
1195 auto pool_stats_it = pg_pool_sum.find(removed_pg.pool());
1196 if (pool_stats_it != pg_pool_sum.end()) {
1197 pool_stats_it->second.sub(s->second);
1198 }
1199
31f18b77 1200 pg_stat.erase(s);
11fdf7f2
TL
1201 if (pool_erased) {
1202 deleted_pools.insert(removed_pg.pool());
1203 }
31f18b77 1204 }
7c673cae
FG
1205 }
1206
31f18b77
FG
1207 for (auto p = inc.get_osd_stat_rm().begin();
1208 p != inc.get_osd_stat_rm().end();
7c673cae 1209 ++p) {
31f18b77
FG
1210 auto t = osd_stat.find(*p);
1211 if (t != osd_stat.end()) {
1212 stat_osd_sub(t->first, t->second);
1213 osd_stat.erase(t);
31f18b77 1214 }
11fdf7f2
TL
1215 for (auto i = pool_statfs.begin(); i != pool_statfs.end(); ++i) {
1216 if (i->first.second == *p) {
1217 pg_pool_sum[i->first.first].sub(i->second);
1218 pool_statfs.erase(i);
1219 }
1220 }
7c673cae
FG
1221 }
1222
b32b8144
FG
1223 // skip calculating delta while sum was not synchronized
1224 if (!stamp.is_zero() && !pg_sum_old.stats.sum.is_zero()) {
1225 utime_t delta_t;
1226 delta_t = inc.stamp;
1227 delta_t -= stamp;
1228 // calculate a delta, and average over the last 2 deltas.
1229 pool_stat_t d = pg_sum;
1230 d.stats.sub(pg_sum_old.stats);
1231 pg_sum_deltas.push_back(make_pair(d, delta_t));
1232 stamp_delta += delta_t;
1233 pg_sum_delta.stats.add(d.stats);
1234 auto smooth_intervals =
11fdf7f2
TL
1235 cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
1236 while (pg_sum_deltas.size() > smooth_intervals) {
b32b8144
FG
1237 pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats);
1238 stamp_delta -= pg_sum_deltas.front().second;
1239 pg_sum_deltas.pop_front();
1240 }
31f18b77 1241 }
b32b8144 1242 stamp = inc.stamp;
7c673cae 1243
31f18b77 1244 update_pool_deltas(cct, inc.stamp, pg_pool_sum_old);
7c673cae 1245
31f18b77
FG
1246 for (auto p : deleted_pools) {
1247 if (cct)
1248 dout(20) << " deleted pool " << p << dendl;
1249 deleted_pool(p);
1250 }
7c673cae 1251
31f18b77
FG
1252 if (inc.osdmap_epoch)
1253 last_osdmap_epoch = inc.osdmap_epoch;
1254 if (inc.pg_scan)
1255 last_pg_scan = inc.pg_scan;
7c673cae
FG
1256}
1257
31f18b77 1258void PGMap::calc_stats()
7c673cae 1259{
31f18b77
FG
1260 num_pg = 0;
1261 num_pg_active = 0;
1262 num_pg_unknown = 0;
1263 num_osd = 0;
1264 pg_pool_sum.clear();
1265 num_pg_by_pool.clear();
1266 pg_by_osd.clear();
1267 pg_sum = pool_stat_t();
1268 osd_sum = osd_stat_t();
11fdf7f2 1269 osd_sum_by_class.clear();
31f18b77 1270 num_pg_by_state.clear();
11fdf7f2 1271 num_pg_by_pool_state.clear();
31f18b77 1272 num_pg_by_osd.clear();
7c673cae 1273
31f18b77
FG
1274 for (auto p = pg_stat.begin();
1275 p != pg_stat.end();
1276 ++p) {
11fdf7f2
TL
1277 auto pg = p->first;
1278 stat_pg_add(pg, p->second);
1279 pg_pool_sum[pg.pool()].add(p->second);
1280 }
1281 for (auto p = pool_statfs.begin();
1282 p != pool_statfs.end();
1283 ++p) {
1284 auto pool = p->first.first;
1285 pg_pool_sum[pool].add(p->second);
31f18b77
FG
1286 }
1287 for (auto p = osd_stat.begin();
1288 p != osd_stat.end();
1289 ++p)
1290 stat_osd_add(p->first, p->second);
7c673cae
FG
1291}
1292
31f18b77
FG
1293void PGMap::stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
1294 bool sameosds)
7c673cae 1295{
11fdf7f2 1296 auto pool = pgid.pool();
31f18b77 1297 pg_sum.add(s);
7c673cae 1298
31f18b77
FG
1299 num_pg++;
1300 num_pg_by_state[s.state]++;
11fdf7f2
TL
1301 num_pg_by_pool_state[pgid.pool()][s.state]++;
1302 num_pg_by_pool[pool]++;
7c673cae 1303
31f18b77
FG
1304 if ((s.state & PG_STATE_CREATING) &&
1305 s.parent_split_bits == 0) {
1306 creating_pgs.insert(pgid);
1307 if (s.acting_primary >= 0) {
1308 creating_pgs_by_osd_epoch[s.acting_primary][s.mapping_epoch].insert(pgid);
7c673cae
FG
1309 }
1310 }
1311
31f18b77
FG
1312 if (s.state & PG_STATE_ACTIVE) {
1313 ++num_pg_active;
1314 }
1315 if (s.state == 0) {
1316 ++num_pg_unknown;
7c673cae
FG
1317 }
1318
31f18b77
FG
1319 if (sameosds)
1320 return;
7c673cae 1321
31f18b77
FG
1322 for (auto p = s.blocked_by.begin();
1323 p != s.blocked_by.end();
1324 ++p) {
1325 ++blocked_by_sum[*p];
7c673cae 1326 }
31f18b77
FG
1327
1328 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
1329 pg_by_osd[*p].insert(pgid);
1330 num_pg_by_osd[*p].acting++;
1331 }
1332 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
81eedcae
TL
1333 auto& t = pg_by_osd[*p];
1334 if (t.find(pgid) == t.end()) {
1335 t.insert(pgid);
1336 num_pg_by_osd[*p].up_not_acting++;
1337 }
7c673cae 1338 }
7c673cae 1339
31f18b77
FG
1340 if (s.up_primary >= 0) {
1341 num_pg_by_osd[s.up_primary].primary++;
7c673cae 1342 }
7c673cae 1343}
31f18b77 1344
11fdf7f2 1345bool PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
31f18b77 1346 bool sameosds)
7c673cae 1347{
11fdf7f2 1348 bool pool_erased = false;
31f18b77
FG
1349 pg_sum.sub(s);
1350
1351 num_pg--;
1352 int end = --num_pg_by_state[s.state];
11fdf7f2 1353 ceph_assert(end >= 0);
31f18b77
FG
1354 if (end == 0)
1355 num_pg_by_state.erase(s.state);
11fdf7f2
TL
1356 if (--num_pg_by_pool_state[pgid.pool()][s.state] == 0) {
1357 num_pg_by_pool_state[pgid.pool()].erase(s.state);
1358 }
31f18b77
FG
1359 end = --num_pg_by_pool[pgid.pool()];
1360 if (end == 0) {
11fdf7f2 1361 pool_erased = true;
7c673cae 1362 }
7c673cae 1363
31f18b77
FG
1364 if ((s.state & PG_STATE_CREATING) &&
1365 s.parent_split_bits == 0) {
1366 creating_pgs.erase(pgid);
1367 if (s.acting_primary >= 0) {
1368 map<epoch_t,set<pg_t> >& r = creating_pgs_by_osd_epoch[s.acting_primary];
1369 r[s.mapping_epoch].erase(pgid);
1370 if (r[s.mapping_epoch].empty())
1371 r.erase(s.mapping_epoch);
1372 if (r.empty())
1373 creating_pgs_by_osd_epoch.erase(s.acting_primary);
7c673cae
FG
1374 }
1375 }
31f18b77
FG
1376
1377 if (s.state & PG_STATE_ACTIVE) {
1378 --num_pg_active;
1379 }
1380 if (s.state == 0) {
1381 --num_pg_unknown;
1382 }
1383
1384 if (sameosds)
11fdf7f2 1385 return pool_erased;
31f18b77
FG
1386
1387 for (auto p = s.blocked_by.begin();
1388 p != s.blocked_by.end();
1389 ++p) {
1390 auto q = blocked_by_sum.find(*p);
11fdf7f2 1391 ceph_assert(q != blocked_by_sum.end());
31f18b77
FG
1392 --q->second;
1393 if (q->second == 0)
1394 blocked_by_sum.erase(q);
1395 }
1396
81eedcae 1397 set<int32_t> actingset;
31f18b77 1398 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
81eedcae 1399 actingset.insert(*p);
31f18b77
FG
1400 auto& oset = pg_by_osd[*p];
1401 oset.erase(pgid);
1402 if (oset.empty())
1403 pg_by_osd.erase(*p);
1404 auto it = num_pg_by_osd.find(*p);
1405 if (it != num_pg_by_osd.end() && it->second.acting > 0)
1406 it->second.acting--;
1407 }
1408 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
1409 auto& oset = pg_by_osd[*p];
1410 oset.erase(pgid);
1411 if (oset.empty())
1412 pg_by_osd.erase(*p);
81eedcae
TL
1413 if (actingset.count(*p))
1414 continue;
31f18b77 1415 auto it = num_pg_by_osd.find(*p);
81eedcae
TL
1416 if (it != num_pg_by_osd.end() && it->second.up_not_acting > 0)
1417 it->second.up_not_acting--;
31f18b77
FG
1418 }
1419
1420 if (s.up_primary >= 0) {
1421 auto it = num_pg_by_osd.find(s.up_primary);
1422 if (it != num_pg_by_osd.end() && it->second.primary > 0)
1423 it->second.primary--;
1424 }
11fdf7f2
TL
1425 return pool_erased;
1426}
1427
1428void PGMap::calc_purged_snaps()
1429{
1430 purged_snaps.clear();
1431 set<int64_t> unknown;
1432 for (auto& i : pg_stat) {
1433 if (i.second.state == 0) {
1434 unknown.insert(i.first.pool());
1435 purged_snaps.erase(i.first.pool());
1436 continue;
1437 } else if (unknown.count(i.first.pool())) {
1438 continue;
1439 }
1440 auto j = purged_snaps.find(i.first.pool());
1441 if (j == purged_snaps.end()) {
1442 // base case
1443 purged_snaps[i.first.pool()] = i.second.purged_snaps;
1444 } else {
1445 j->second.intersection_of(i.second.purged_snaps);
1446 }
1447 }
31f18b77
FG
1448}
1449
11fdf7f2 1450void PGMap::calc_osd_sum_by_class(const OSDMap& osdmap)
31f18b77 1451{
11fdf7f2
TL
1452 osd_sum_by_class.clear();
1453 for (auto& i : osd_stat) {
1454 const char *class_name = osdmap.crush->get_item_class(i.first);
1455 if (class_name) {
1456 osd_sum_by_class[class_name].add(i.second);
1457 }
1458 }
31f18b77
FG
1459}
1460
1461void PGMap::stat_osd_add(int osd, const osd_stat_t &s)
1462{
1463 num_osd++;
1464 osd_sum.add(s);
1465 if (osd >= (int)osd_last_seq.size()) {
1466 osd_last_seq.resize(osd + 1);
1467 }
1468 osd_last_seq[osd] = s.seq;
1469}
1470
1471void PGMap::stat_osd_sub(int osd, const osd_stat_t &s)
1472{
1473 num_osd--;
1474 osd_sum.sub(s);
11fdf7f2 1475 ceph_assert(osd < (int)osd_last_seq.size());
31f18b77
FG
1476 osd_last_seq[osd] = 0;
1477}
1478
31f18b77 1479void PGMap::encode_digest(const OSDMap& osdmap,
11fdf7f2 1480 bufferlist& bl, uint64_t features)
31f18b77
FG
1481{
1482 get_rules_avail(osdmap, &avail_space_by_rule);
11fdf7f2
TL
1483 calc_osd_sum_by_class(osdmap);
1484 calc_purged_snaps();
31f18b77
FG
1485 PGMapDigest::encode(bl, features);
1486}
1487
1488void PGMap::encode(bufferlist &bl, uint64_t features) const
1489{
11fdf7f2
TL
1490 ENCODE_START(8, 8, bl);
1491 encode(version, bl);
1492 encode(pg_stat, bl);
1493 encode(osd_stat, bl, features);
1494 encode(last_osdmap_epoch, bl);
1495 encode(last_pg_scan, bl);
1496 encode(stamp, bl);
1497 encode(pool_statfs, bl, features);
31f18b77
FG
1498 ENCODE_FINISH(bl);
1499}
1500
11fdf7f2 1501void PGMap::decode(bufferlist::const_iterator &bl)
31f18b77 1502{
11fdf7f2
TL
1503 DECODE_START(8, bl);
1504 decode(version, bl);
1505 decode(pg_stat, bl);
1506 decode(osd_stat, bl);
1507 decode(last_osdmap_epoch, bl);
1508 decode(last_pg_scan, bl);
1509 decode(stamp, bl);
1510 decode(pool_statfs, bl);
31f18b77
FG
1511 DECODE_FINISH(bl);
1512
1513 calc_stats();
7c673cae
FG
1514}
1515
9f95a23c 1516void PGMap::dump(ceph::Formatter *f, bool with_net) const
31f18b77
FG
1517{
1518 dump_basic(f);
1519 dump_pg_stats(f, false);
1520 dump_pool_stats(f);
9f95a23c 1521 dump_osd_stats(f, with_net);
31f18b77
FG
1522}
1523
9f95a23c 1524void PGMap::dump_basic(ceph::Formatter *f) const
31f18b77
FG
1525{
1526 f->dump_unsigned("version", version);
1527 f->dump_stream("stamp") << stamp;
1528 f->dump_unsigned("last_osdmap_epoch", last_osdmap_epoch);
1529 f->dump_unsigned("last_pg_scan", last_pg_scan);
31f18b77
FG
1530
1531 f->open_object_section("pg_stats_sum");
1532 pg_sum.dump(f);
1533 f->close_section();
1534
1535 f->open_object_section("osd_stats_sum");
1536 osd_sum.dump(f);
1537 f->close_section();
1538
31f18b77
FG
1539 dump_delta(f);
1540}
1541
9f95a23c 1542void PGMap::dump_delta(ceph::Formatter *f) const
31f18b77
FG
1543{
1544 f->open_object_section("pg_stats_delta");
1545 pg_sum_delta.dump(f);
11fdf7f2 1546 f->dump_stream("stamp_delta") << stamp_delta;
31f18b77
FG
1547 f->close_section();
1548}
1549
9f95a23c 1550void PGMap::dump_pg_stats(ceph::Formatter *f, bool brief) const
31f18b77
FG
1551{
1552 f->open_array_section("pg_stats");
1553 for (auto i = pg_stat.begin();
1554 i != pg_stat.end();
1555 ++i) {
1556 f->open_object_section("pg_stat");
1557 f->dump_stream("pgid") << i->first;
1558 if (brief)
1559 i->second.dump_brief(f);
1560 else
1561 i->second.dump(f);
1562 f->close_section();
1563 }
1564 f->close_section();
1565}
1566
20effc67
TL
1567void PGMap::dump_pg_progress(ceph::Formatter *f) const
1568{
1569 f->open_object_section("pgs");
1570 for (auto& i : pg_stat) {
1571 std::string n = stringify(i.first);
1572 f->open_object_section(n.c_str());
1573 f->dump_int("num_bytes_recovered", i.second.stats.sum.num_bytes_recovered);
1574 f->dump_int("num_bytes", i.second.stats.sum.num_bytes);
1575 f->dump_unsigned("reported_epoch", i.second.reported_epoch);
1576 f->dump_string("state", pg_state_string(i.second.state));
1577 f->close_section();
1578 }
1579 f->close_section();
1580}
1581
9f95a23c 1582void PGMap::dump_pool_stats(ceph::Formatter *f) const
31f18b77
FG
1583{
1584 f->open_array_section("pool_stats");
1585 for (auto p = pg_pool_sum.begin();
1586 p != pg_pool_sum.end();
1587 ++p) {
1588 f->open_object_section("pool_stat");
1589 f->dump_int("poolid", p->first);
1590 auto q = num_pg_by_pool.find(p->first);
1591 if (q != num_pg_by_pool.end())
1592 f->dump_unsigned("num_pg", q->second);
1593 p->second.dump(f);
1594 f->close_section();
1595 }
1596 f->close_section();
1597}
1598
9f95a23c 1599void PGMap::dump_osd_stats(ceph::Formatter *f, bool with_net) const
31f18b77
FG
1600{
1601 f->open_array_section("osd_stats");
1602 for (auto q = osd_stat.begin();
1603 q != osd_stat.end();
1604 ++q) {
1605 f->open_object_section("osd_stat");
1606 f->dump_int("osd", q->first);
ded94939 1607 q->second.dump(f, with_net);
31f18b77
FG
1608 f->close_section();
1609 }
1610 f->close_section();
f67539c2
TL
1611
1612 f->open_array_section("pool_statfs");
1613 for (auto& p : pool_statfs) {
1614 f->open_object_section("item");
1615 f->dump_int("poolid", p.first.first);
1616 f->dump_int("osd", p.first.second);
1617 p.second.dump(f);
1618 f->close_section();
1619 }
1620 f->close_section();
31f18b77
FG
1621}
1622
9f95a23c
TL
1623void PGMap::dump_osd_ping_times(ceph::Formatter *f) const
1624{
1625 f->open_array_section("osd_ping_times");
20effc67 1626 for (const auto& [osd, stat] : osd_stat) {
9f95a23c
TL
1627 f->open_object_section("osd_ping_time");
1628 f->dump_int("osd", osd);
1629 stat.dump_ping_time(f);
1630 f->close_section();
1631 }
1632 f->close_section();
1633}
1634
20effc67 1635// note: dump_pg_stats_plain() is static
31f18b77
FG
1636void PGMap::dump_pg_stats_plain(
1637 ostream& ss,
1638 const mempool::pgmap::unordered_map<pg_t, pg_stat_t>& pg_stats,
20effc67 1639 bool brief)
31f18b77
FG
1640{
1641 TextTable tab;
1642
1643 if (brief){
1644 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1645 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1646 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1647 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1648 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1649 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1650 }
1651 else {
1652 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1653 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1654 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1655 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1656 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1657 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1658 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
1659 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
1660 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
31f18b77 1661 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1e59de90 1662 tab.define_column("LOG_DUPS", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
1663 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1664 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1665 tab.define_column("STATE_STAMP", TextTable::LEFT, TextTable::RIGHT);
1666 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
1667 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
1668 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1669 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1670 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1671 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1672 tab.define_column("LAST_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1673 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1674 tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1675 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
b32b8144 1676 tab.define_column("SNAPTRIMQ_LEN", TextTable::LEFT, TextTable::RIGHT);
20effc67
TL
1677 tab.define_column("LAST_SCRUB_DURATION", TextTable::LEFT, TextTable::RIGHT);
1678 tab.define_column("SCRUB_SCHEDULING", TextTable::LEFT, TextTable::LEFT);
1679 tab.define_column("OBJECTS_SCRUBBED", TextTable::LEFT, TextTable::RIGHT);
1d09f67e 1680 tab.define_column("OBJECTS_TRIMMED", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
1681 }
1682
20effc67 1683 for (const auto& [pg, st] : pg_stats) {
31f18b77 1684 if (brief) {
20effc67 1685 tab << pg
31f18b77
FG
1686 << pg_state_string(st.state)
1687 << st.up
1688 << st.up_primary
1689 << st.acting
1690 << st.acting_primary
1691 << TextTable::endrow;
7c673cae 1692 } else {
31f18b77
FG
1693 ostringstream reported;
1694 reported << st.reported_epoch << ":" << st.reported_seq;
1695
20effc67 1696 tab << pg
31f18b77
FG
1697 << st.stats.sum.num_objects
1698 << st.stats.sum.num_objects_missing_on_primary
1699 << st.stats.sum.num_objects_degraded
1700 << st.stats.sum.num_objects_misplaced
1701 << st.stats.sum.num_objects_unfound
1702 << st.stats.sum.num_bytes
11fdf7f2
TL
1703 << st.stats.sum.num_omap_bytes
1704 << st.stats.sum.num_omap_keys
31f18b77 1705 << st.log_size
1e59de90 1706 << st.log_dups_size
31f18b77
FG
1707 << st.ondisk_log_size
1708 << pg_state_string(st.state)
1709 << st.last_change
1710 << st.version
1711 << reported.str()
1712 << pg_vector_string(st.up)
1713 << st.up_primary
1714 << pg_vector_string(st.acting)
1715 << st.acting_primary
1716 << st.last_scrub
1717 << st.last_scrub_stamp
1718 << st.last_deep_scrub
1719 << st.last_deep_scrub_stamp
b32b8144 1720 << st.snaptrimq_len
20effc67
TL
1721 << st.last_scrub_duration
1722 << st.dump_scrub_schedule()
1723 << st.objects_scrubbed
1d09f67e 1724 << st.objects_trimmed
31f18b77 1725 << TextTable::endrow;
7c673cae
FG
1726 }
1727 }
7c673cae 1728
31f18b77
FG
1729 ss << tab;
1730}
1731
1732void PGMap::dump(ostream& ss) const
1733{
1734 dump_basic(ss);
1735 dump_pg_stats(ss, false);
1736 dump_pool_stats(ss, false);
1737 dump_pg_sum_stats(ss, false);
1738 dump_osd_stats(ss);
1739}
1740
1741void PGMap::dump_basic(ostream& ss) const
1742{
1743 ss << "version " << version << std::endl;
1744 ss << "stamp " << stamp << std::endl;
1745 ss << "last_osdmap_epoch " << last_osdmap_epoch << std::endl;
1746 ss << "last_pg_scan " << last_pg_scan << std::endl;
31f18b77
FG
1747}
1748
1749void PGMap::dump_pg_stats(ostream& ss, bool brief) const
1750{
1751 dump_pg_stats_plain(ss, pg_stat, brief);
1752}
1753
1754void PGMap::dump_pool_stats(ostream& ss, bool header) const
1755{
1756 TextTable tab;
1757
1758 if (header) {
1759 tab.define_column("POOLID", TextTable::LEFT, TextTable::LEFT);
1760 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1761 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1762 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1763 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1764 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1765 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
1766 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
1767 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
1768 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1769 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1770 } else {
1771 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1772 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1773 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1774 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1775 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1776 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1777 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1778 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1779 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
1780 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1781 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
1782 }
1783
1784 for (auto p = pg_pool_sum.begin();
1785 p != pg_pool_sum.end();
1786 ++p) {
1787 tab << p->first
1788 << p->second.stats.sum.num_objects
1789 << p->second.stats.sum.num_objects_missing_on_primary
1790 << p->second.stats.sum.num_objects_degraded
1791 << p->second.stats.sum.num_objects_misplaced
1792 << p->second.stats.sum.num_objects_unfound
1793 << p->second.stats.sum.num_bytes
11fdf7f2
TL
1794 << p->second.stats.sum.num_omap_bytes
1795 << p->second.stats.sum.num_omap_keys
31f18b77
FG
1796 << p->second.log_size
1797 << p->second.ondisk_log_size
1798 << TextTable::endrow;
1799 }
1800
1801 ss << tab;
1802}
1803
1804void PGMap::dump_pg_sum_stats(ostream& ss, bool header) const
1805{
1806 TextTable tab;
1807
1808 if (header) {
1809 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1810 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1811 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1812 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1813 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1814 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1815 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
1816 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
1817 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
1818 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1819 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1820 } else {
1821 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1822 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1823 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1824 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1825 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1826 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1827 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1828 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1829 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
1830 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1831 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
1832 };
1833
1834 tab << "sum"
1835 << pg_sum.stats.sum.num_objects
1836 << pg_sum.stats.sum.num_objects_missing_on_primary
1837 << pg_sum.stats.sum.num_objects_degraded
1838 << pg_sum.stats.sum.num_objects_misplaced
1839 << pg_sum.stats.sum.num_objects_unfound
1840 << pg_sum.stats.sum.num_bytes
11fdf7f2
TL
1841 << pg_sum.stats.sum.num_omap_bytes
1842 << pg_sum.stats.sum.num_omap_keys
31f18b77
FG
1843 << pg_sum.log_size
1844 << pg_sum.ondisk_log_size
1845 << TextTable::endrow;
1846
1847 ss << tab;
1848}
1849
1850void PGMap::dump_osd_stats(ostream& ss) const
1851{
1852 TextTable tab;
1853
1854 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1855 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1856 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2 1857 tab.define_column("USED_RAW", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
1858 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
1859 tab.define_column("HB_PEERS", TextTable::LEFT, TextTable::RIGHT);
1860 tab.define_column("PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1861 tab.define_column("PRIMARY_PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1862
1863 for (auto p = osd_stat.begin();
1864 p != osd_stat.end();
1865 ++p) {
1866 tab << p->first
11fdf7f2
TL
1867 << byte_u_t(p->second.statfs.get_used())
1868 << byte_u_t(p->second.statfs.available)
1869 << byte_u_t(p->second.statfs.get_used_raw())
1870 << byte_u_t(p->second.statfs.total)
31f18b77
FG
1871 << p->second.hb_peers
1872 << get_num_pg_by_osd(p->first)
1873 << get_num_primary_pg_by_osd(p->first)
1874 << TextTable::endrow;
1875 }
1876
1877 tab << "sum"
11fdf7f2
TL
1878 << byte_u_t(osd_sum.statfs.get_used())
1879 << byte_u_t(osd_sum.statfs.available)
1880 << byte_u_t(osd_sum.statfs.get_used_raw())
1881 << byte_u_t(osd_sum.statfs.total)
31f18b77 1882 << TextTable::endrow;
7c673cae 1883
31f18b77 1884 ss << tab;
7c673cae
FG
1885}
1886
31f18b77 1887void PGMap::dump_osd_sum_stats(ostream& ss) const
7c673cae 1888{
31f18b77 1889 TextTable tab;
7c673cae 1890
31f18b77
FG
1891 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1892 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1893 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2 1894 tab.define_column("USED_RAW", TextTable::LEFT, TextTable::RIGHT);
31f18b77 1895 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
7c673cae 1896
31f18b77 1897 tab << "sum"
11fdf7f2
TL
1898 << byte_u_t(osd_sum.statfs.get_used())
1899 << byte_u_t(osd_sum.statfs.available)
1900 << byte_u_t(osd_sum.statfs.get_used_raw())
1901 << byte_u_t(osd_sum.statfs.total)
31f18b77 1902 << TextTable::endrow;
7c673cae 1903
31f18b77 1904 ss << tab;
7c673cae
FG
1905}
1906
31f18b77
FG
1907void PGMap::get_stuck_stats(
1908 int types, const utime_t cutoff,
1909 mempool::pgmap::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const
7c673cae 1910{
11fdf7f2 1911 ceph_assert(types != 0);
31f18b77
FG
1912 for (auto i = pg_stat.begin();
1913 i != pg_stat.end();
1914 ++i) {
1915 utime_t val = cutoff; // don't care about >= cutoff so that is infinity
1916
1917 if ((types & STUCK_INACTIVE) && !(i->second.state & PG_STATE_ACTIVE)) {
1918 if (i->second.last_active < val)
1919 val = i->second.last_active;
7c673cae 1920 }
31f18b77
FG
1921
1922 if ((types & STUCK_UNCLEAN) && !(i->second.state & PG_STATE_CLEAN)) {
1923 if (i->second.last_clean < val)
1924 val = i->second.last_clean;
7c673cae 1925 }
31f18b77
FG
1926
1927 if ((types & STUCK_DEGRADED) && (i->second.state & PG_STATE_DEGRADED)) {
1928 if (i->second.last_undegraded < val)
1929 val = i->second.last_undegraded;
7c673cae 1930 }
7c673cae 1931
31f18b77
FG
1932 if ((types & STUCK_UNDERSIZED) && (i->second.state & PG_STATE_UNDERSIZED)) {
1933 if (i->second.last_fullsized < val)
1934 val = i->second.last_fullsized;
1935 }
7c673cae 1936
31f18b77
FG
1937 if ((types & STUCK_STALE) && (i->second.state & PG_STATE_STALE)) {
1938 if (i->second.last_unstale < val)
1939 val = i->second.last_unstale;
1940 }
7c673cae 1941
31f18b77
FG
1942 // val is now the earliest any of the requested stuck states began
1943 if (val < cutoff) {
1944 stuck_pgs[i->first] = i->second;
1945 }
1946 }
7c673cae
FG
1947}
1948
9f95a23c 1949void PGMap::dump_stuck(ceph::Formatter *f, int types, utime_t cutoff) const
31f18b77
FG
1950{
1951 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
1952 get_stuck_stats(types, cutoff, stuck_pg_stats);
1953 f->open_array_section("stuck_pg_stats");
1954 for (auto i = stuck_pg_stats.begin();
1955 i != stuck_pg_stats.end();
1956 ++i) {
1957 f->open_object_section("pg_stat");
1958 f->dump_stream("pgid") << i->first;
1959 i->second.dump(f);
1960 f->close_section();
1961 }
1962 f->close_section();
1963}
1964
1965void PGMap::dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const
1966{
1967 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
1968 get_stuck_stats(types, cutoff, stuck_pg_stats);
1969 if (!stuck_pg_stats.empty())
1970 dump_pg_stats_plain(ss, stuck_pg_stats, true);
1971}
1972
1973int PGMap::dump_stuck_pg_stats(
1974 stringstream &ds,
9f95a23c 1975 ceph::Formatter *f,
31f18b77
FG
1976 int threshold,
1977 vector<string>& args) const
1978{
1979 int stuck_types = 0;
1980
1981 for (auto i = args.begin(); i != args.end(); ++i) {
1982 if (*i == "inactive")
1983 stuck_types |= PGMap::STUCK_INACTIVE;
1984 else if (*i == "unclean")
1985 stuck_types |= PGMap::STUCK_UNCLEAN;
1986 else if (*i == "undersized")
1987 stuck_types |= PGMap::STUCK_UNDERSIZED;
1988 else if (*i == "degraded")
1989 stuck_types |= PGMap::STUCK_DEGRADED;
1990 else if (*i == "stale")
1991 stuck_types |= PGMap::STUCK_STALE;
1992 else {
1993 ds << "Unknown type: " << *i << std::endl;
1994 return -EINVAL;
7c673cae
FG
1995 }
1996 }
31f18b77
FG
1997
1998 utime_t now(ceph_clock_now());
1999 utime_t cutoff = now - utime_t(threshold, 0);
2000
2001 if (!f) {
2002 dump_stuck_plain(ds, stuck_types, cutoff);
2003 } else {
2004 dump_stuck(f, stuck_types, cutoff);
2005 f->flush(ds);
7c673cae 2006 }
31f18b77
FG
2007
2008 return 0;
7c673cae
FG
2009}
2010
9f95a23c 2011void PGMap::dump_osd_perf_stats(ceph::Formatter *f) const
7c673cae 2012{
31f18b77
FG
2013 f->open_array_section("osd_perf_infos");
2014 for (auto i = osd_stat.begin();
2015 i != osd_stat.end();
2016 ++i) {
2017 f->open_object_section("osd");
2018 f->dump_int("id", i->first);
2019 {
2020 f->open_object_section("perf_stats");
2021 i->second.os_perf_stat.dump(f);
2022 f->close_section();
2023 }
2024 f->close_section();
2025 }
2026 f->close_section();
7c673cae 2027}
31f18b77 2028void PGMap::print_osd_perf_stats(std::ostream *ss) const
7c673cae 2029{
31f18b77
FG
2030 TextTable tab;
2031 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2032 tab.define_column("commit_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2033 tab.define_column("apply_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2034 for (auto i = osd_stat.begin();
2035 i != osd_stat.end();
2036 ++i) {
2037 tab << i->first;
11fdf7f2
TL
2038 tab << i->second.os_perf_stat.os_commit_latency_ns / 1000000ull;
2039 tab << i->second.os_perf_stat.os_apply_latency_ns / 1000000ull;
31f18b77
FG
2040 tab << TextTable::endrow;
2041 }
2042 (*ss) << tab;
2043}
7c673cae 2044
9f95a23c 2045void PGMap::dump_osd_blocked_by_stats(ceph::Formatter *f) const
31f18b77
FG
2046{
2047 f->open_array_section("osd_blocked_by_infos");
2048 for (auto i = blocked_by_sum.begin();
2049 i != blocked_by_sum.end();
2050 ++i) {
2051 f->open_object_section("osd");
2052 f->dump_int("id", i->first);
2053 f->dump_int("num_blocked", i->second);
2054 f->close_section();
2055 }
2056 f->close_section();
2057}
2058void PGMap::print_osd_blocked_by_stats(std::ostream *ss) const
2059{
2060 TextTable tab;
2061 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2062 tab.define_column("num_blocked", TextTable::LEFT, TextTable::RIGHT);
2063 for (auto i = blocked_by_sum.begin();
2064 i != blocked_by_sum.end();
2065 ++i) {
2066 tab << i->first;
2067 tab << i->second;
2068 tab << TextTable::endrow;
2069 }
2070 (*ss) << tab;
7c673cae
FG
2071}
2072
31f18b77 2073
7c673cae
FG
2074/**
2075 * update aggregated delta
2076 *
2077 * @param cct ceph context
2078 * @param ts Timestamp for the stats being delta'ed
2079 * @param old_pool_sum Previous stats sum
2080 * @param last_ts Last timestamp for pool
2081 * @param result_pool_sum Resulting stats
2082 * @param result_pool_delta Resulting pool delta
2083 * @param result_ts_delta Resulting timestamp delta
2084 * @param delta_avg_list List of last N computed deltas, used to average
2085 */
31f18b77
FG
2086void PGMap::update_delta(
2087 CephContext *cct,
2088 const utime_t ts,
2089 const pool_stat_t& old_pool_sum,
2090 utime_t *last_ts,
2091 const pool_stat_t& current_pool_sum,
2092 pool_stat_t *result_pool_delta,
2093 utime_t *result_ts_delta,
2094 mempool::pgmap::list<pair<pool_stat_t,utime_t> > *delta_avg_list)
7c673cae
FG
2095{
2096 /* @p ts is the timestamp we want to associate with the data
2097 * in @p old_pool_sum, and on which we will base ourselves to
2098 * calculate the delta, stored in 'delta_t'.
2099 */
2100 utime_t delta_t;
2101 delta_t = ts; // start with the provided timestamp
2102 delta_t -= *last_ts; // take the last timestamp we saw
2103 *last_ts = ts; // @p ts becomes the last timestamp we saw
2104
31f18b77
FG
2105 // adjust delta_t, quick start if there is no update in a long period
2106 delta_t = std::min(delta_t,
2107 utime_t(2 * (cct ? cct->_conf->mon_delta_reset_interval : 10), 0));
2108
2109 // calculate a delta, and average over the last 6 deltas by default.
7c673cae
FG
2110 /* start by taking a copy of our current @p result_pool_sum, and by
2111 * taking out the stats from @p old_pool_sum. This generates a stats
2112 * delta. Stash this stats delta in @p delta_avg_list, along with the
2113 * timestamp delta for these results.
2114 */
2115 pool_stat_t d = current_pool_sum;
2116 d.stats.sub(old_pool_sum.stats);
7c673cae
FG
2117
2118 /* Aggregate current delta, and take out the last seen delta (if any) to
2119 * average it out.
b32b8144 2120 * Skip calculating delta while sum was not synchronized.
7c673cae 2121 */
b32b8144
FG
2122 if(!old_pool_sum.stats.sum.is_zero()) {
2123 delta_avg_list->push_back(make_pair(d,delta_t));
2124 *result_ts_delta += delta_t;
2125 result_pool_delta->stats.add(d.stats);
2126 }
11fdf7f2
TL
2127 size_t s = cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
2128 while (delta_avg_list->size() > s) {
7c673cae
FG
2129 result_pool_delta->stats.sub(delta_avg_list->front().first.stats);
2130 *result_ts_delta -= delta_avg_list->front().second;
2131 delta_avg_list->pop_front();
2132 }
2133}
2134
7c673cae
FG
2135/**
2136 * Update a given pool's deltas
2137 *
2138 * @param cct Ceph Context
2139 * @param ts Timestamp for the stats being delta'ed
2140 * @param pool Pool's id
2141 * @param old_pool_sum Previous stats sum
2142 */
31f18b77
FG
2143void PGMap::update_one_pool_delta(
2144 CephContext *cct,
2145 const utime_t ts,
11fdf7f2 2146 const int64_t pool,
31f18b77 2147 const pool_stat_t& old_pool_sum)
7c673cae
FG
2148{
2149 if (per_pool_sum_deltas.count(pool) == 0) {
11fdf7f2
TL
2150 ceph_assert(per_pool_sum_deltas_stamps.count(pool) == 0);
2151 ceph_assert(per_pool_sum_delta.count(pool) == 0);
7c673cae
FG
2152 }
2153
31f18b77 2154 auto& sum_delta = per_pool_sum_delta[pool];
7c673cae
FG
2155
2156 update_delta(cct, ts, old_pool_sum, &sum_delta.second, pg_pool_sum[pool],
2157 &sum_delta.first, &per_pool_sum_deltas_stamps[pool],
2158 &per_pool_sum_deltas[pool]);
2159}
2160
2161/**
2162 * Update pools' deltas
2163 *
2164 * @param cct CephContext
2165 * @param ts Timestamp for the stats being delta'ed
2166 * @param pg_pool_sum_old Map of pool stats for delta calcs.
2167 */
31f18b77
FG
2168void PGMap::update_pool_deltas(
2169 CephContext *cct, const utime_t ts,
11fdf7f2 2170 const mempool::pgmap::unordered_map<int32_t,pool_stat_t>& pg_pool_sum_old)
7c673cae 2171{
31f18b77 2172 for (auto it = pg_pool_sum_old.begin();
7c673cae
FG
2173 it != pg_pool_sum_old.end(); ++it) {
2174 update_one_pool_delta(cct, ts, it->first, it->second);
2175 }
2176}
2177
2178void PGMap::clear_delta()
2179{
2180 pg_sum_delta = pool_stat_t();
2181 pg_sum_deltas.clear();
2182 stamp_delta = utime_t();
2183}
2184
7c673cae
FG
2185void PGMap::generate_test_instances(list<PGMap*>& o)
2186{
2187 o.push_back(new PGMap);
2188 list<Incremental*> inc;
2189 Incremental::generate_test_instances(inc);
2190 delete inc.front();
2191 inc.pop_front();
2192 while (!inc.empty()) {
2193 PGMap *pmp = new PGMap();
2194 *pmp = *o.back();
2195 o.push_back(pmp);
2196 o.back()->apply_incremental(NULL, *inc.front());
2197 delete inc.front();
2198 inc.pop_front();
2199 }
2200}
2201
11fdf7f2 2202void PGMap::get_filtered_pg_stats(uint64_t state, int64_t poolid, int64_t osdid,
7c673cae
FG
2203 bool primary, set<pg_t>& pgs) const
2204{
31f18b77 2205 for (auto i = pg_stat.begin();
7c673cae
FG
2206 i != pg_stat.end();
2207 ++i) {
11fdf7f2 2208 if ((poolid >= 0) && (poolid != i->first.pool()))
7c673cae
FG
2209 continue;
2210 if ((osdid >= 0) && !(i->second.is_acting_osd(osdid,primary)))
2211 continue;
11fdf7f2
TL
2212 if (state == (uint64_t)-1 || // "all"
2213 (i->second.state & state) || // matches a state bit
2214 (state == 0 && i->second.state == 0)) { // matches "unknown" (== 0)
2215 pgs.insert(i->first);
2216 }
7c673cae
FG
2217 }
2218}
2219
9f95a23c 2220void PGMap::dump_filtered_pg_stats(ceph::Formatter *f, set<pg_t>& pgs) const
7c673cae
FG
2221{
2222 f->open_array_section("pg_stats");
31f18b77 2223 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
7c673cae
FG
2224 const pg_stat_t& st = pg_stat.at(*i);
2225 f->open_object_section("pg_stat");
2226 f->dump_stream("pgid") << *i;
2227 st.dump(f);
2228 f->close_section();
2229 }
2230 f->close_section();
2231}
2232
2233void PGMap::dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const
2234{
2235 TextTable tab;
11fdf7f2 2236 utime_t now = ceph_clock_now();
7c673cae 2237
11fdf7f2 2238 tab.define_column("PG", TextTable::LEFT, TextTable::LEFT);
7c673cae 2239 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
7c673cae
FG
2240 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
2241 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
2242 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
2243 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
2244 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
2245 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
7c673cae 2246 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1e59de90 2247 tab.define_column("LOG_DUPS", TextTable::LEFT, TextTable::RIGHT);
7c673cae 2248 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2 2249 tab.define_column("SINCE", TextTable::LEFT, TextTable::RIGHT);
7c673cae
FG
2250 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
2251 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
2252 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
7c673cae 2253 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
7c673cae 2254 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
7c673cae 2255 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
20effc67
TL
2256 tab.define_column("LAST_SCRUB_DURATION", TextTable::LEFT, TextTable::RIGHT);
2257 tab.define_column("SCRUB_SCHEDULING", TextTable::LEFT, TextTable::LEFT);
7c673cae 2258
31f18b77 2259 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
7c673cae
FG
2260 const pg_stat_t& st = pg_stat.at(*i);
2261
2262 ostringstream reported;
2263 reported << st.reported_epoch << ":" << st.reported_seq;
2264
11fdf7f2 2265 ostringstream upstr, actingstr;
9f95a23c
TL
2266 upstr << pg_vector_string(st.up) << 'p' << st.up_primary;
2267 actingstr << pg_vector_string(st.acting) << 'p' << st.acting_primary;
7c673cae
FG
2268 tab << *i
2269 << st.stats.sum.num_objects
7c673cae
FG
2270 << st.stats.sum.num_objects_degraded
2271 << st.stats.sum.num_objects_misplaced
2272 << st.stats.sum.num_objects_unfound
2273 << st.stats.sum.num_bytes
11fdf7f2
TL
2274 << st.stats.sum.num_omap_bytes
2275 << st.stats.sum.num_omap_keys
7c673cae 2276 << st.log_size
1e59de90 2277 << st.log_dups_size
7c673cae 2278 << pg_state_string(st.state)
11fdf7f2 2279 << utimespan_str(now - st.last_change)
7c673cae
FG
2280 << st.version
2281 << reported.str()
11fdf7f2
TL
2282 << upstr.str()
2283 << actingstr.str()
7c673cae 2284 << st.last_scrub_stamp
7c673cae 2285 << st.last_deep_scrub_stamp
20effc67
TL
2286 << st.last_scrub_duration
2287 << st.dump_scrub_schedule()
2288 << TextTable::endrow;
7c673cae
FG
2289 }
2290
2291 ss << tab;
2292}
2293
11fdf7f2 2294void PGMap::dump_pool_stats_and_io_rate(int64_t poolid, const OSDMap &osd_map,
9f95a23c 2295 ceph::Formatter *f,
11fdf7f2 2296 stringstream *rs) const {
20effc67 2297 const string& pool_name = osd_map.get_pool_name(poolid);
11fdf7f2
TL
2298 if (f) {
2299 f->open_object_section("pool");
2300 f->dump_string("pool_name", pool_name.c_str());
2301 f->dump_int("pool_id", poolid);
2302 f->open_object_section("recovery");
2303 }
2304 list<string> sl;
2305 stringstream tss;
2306 pool_recovery_summary(f, &sl, poolid);
2307 if (!f && !sl.empty()) {
2308 for (auto &p : sl)
2309 tss << " " << p << "\n";
2310 }
2311 if (f) {
2312 f->close_section(); // object section recovery
2313 f->open_object_section("recovery_rate");
2314 }
2315 ostringstream rss;
2316 pool_recovery_rate_summary(f, &rss, poolid);
2317 if (!f && !rss.str().empty())
2318 tss << " recovery io " << rss.str() << "\n";
2319 if (f) {
2320 f->close_section(); // object section recovery_rate
2321 f->open_object_section("client_io_rate");
2322 }
2323 rss.clear();
2324 rss.str("");
2325 pool_client_io_rate_summary(f, &rss, poolid);
2326 if (!f && !rss.str().empty())
2327 tss << " client io " << rss.str() << "\n";
2328 // dump cache tier IO rate for cache pool
2329 const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
2330 if (pool->is_tier()) {
2331 if (f) {
2332 f->close_section(); // object section client_io_rate
2333 f->open_object_section("cache_io_rate");
7c673cae 2334 }
11fdf7f2
TL
2335 rss.clear();
2336 rss.str("");
2337 pool_cache_io_rate_summary(f, &rss, poolid);
2338 if (!f && !rss.str().empty())
2339 tss << " cache tier io " << rss.str() << "\n";
2340 }
2341 if (f) {
2342 f->close_section(); // object section cache_io_rate
2343 f->close_section(); // object section pool
2344 } else {
2345 *rs << "pool " << pool_name << " id " << poolid << "\n";
2346 if (!tss.str().empty())
2347 *rs << tss.str() << "\n";
2348 else
2349 *rs << " nothing is going on\n\n";
7c673cae 2350 }
7c673cae
FG
2351}
2352
9f95a23c
TL
2353// Get crush parentage for an osd (skip root)
2354set<std::string> PGMap::osd_parentage(const OSDMap& osdmap, int id) const
2355{
2356 set<std::string> reporters_by_subtree;
2357 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
2358
2359 auto loc = osdmap.crush->get_full_location(id);
2360 for (auto& [parent_bucket_type, parent_id] : loc) {
2361 // Should we show the root? Might not be too informative like "default"
2362 if (parent_bucket_type != "root" &&
2363 parent_bucket_type != reporter_subtree_level) {
2364 reporters_by_subtree.insert(parent_id);
2365 }
2366 }
2367 return reporters_by_subtree;
2368}
2369
11fdf7f2 2370void PGMap::get_health_checks(
31f18b77 2371 CephContext *cct,
11fdf7f2
TL
2372 const OSDMap& osdmap,
2373 health_check_map_t *checks) const
7c673cae 2374{
11fdf7f2
TL
2375 utime_t now = ceph_clock_now();
2376 const auto max = cct->_conf.get_val<uint64_t>("mon_health_max_detail");
2377 const auto& pools = osdmap.get_pools();
224ce89b 2378
224ce89b
WB
2379 typedef enum pg_consequence_t {
2380 UNAVAILABLE = 1, // Client IO to the pool may block
2381 DEGRADED = 2, // Fewer than the requested number of replicas are present
eafe8130
TL
2382 BACKFILL_FULL = 3, // Backfill is blocked for space considerations
2383 // This may or may not be a deadlock condition.
2384 DAMAGED = 4, // The data may be missing or inconsistent on disk and
224ce89b 2385 // requires repair
eafe8130 2386 RECOVERY_FULL = 5 // Recovery is blocked because OSDs are full
224ce89b
WB
2387 } pg_consequence_t;
2388
2389 // For a given PG state, how should it be reported at the pool level?
2390 class PgStateResponse {
2391 public:
2392 pg_consequence_t consequence;
2393 typedef std::function< utime_t(const pg_stat_t&) > stuck_cb;
2394 stuck_cb stuck_since;
2395 bool invert;
2396
11fdf7f2
TL
2397 PgStateResponse(const pg_consequence_t& c, stuck_cb&& s)
2398 : consequence(c), stuck_since(std::move(s)), invert(false)
224ce89b
WB
2399 {
2400 }
2401
11fdf7f2
TL
2402 PgStateResponse(const pg_consequence_t& c, stuck_cb&& s, bool i)
2403 : consequence(c), stuck_since(std::move(s)), invert(i)
224ce89b
WB
2404 {
2405 }
2406 };
2407
2408 // Record the PG state counts that contributed to a reported pool state
2409 class PgCauses {
2410 public:
2411 // Map of PG_STATE_* to number of pgs in that state.
2412 std::map<unsigned, unsigned> states;
2413
2414 // List of all PG IDs that had a state contributing
2415 // to this health condition.
2416 std::set<pg_t> pgs;
2417
2418 std::map<pg_t, std::string> pg_messages;
2419 };
2420
2421 // Map of PG state to how to respond to it
2422 std::map<unsigned, PgStateResponse> state_to_response = {
2423 // Immediate reports
2424 { PG_STATE_INCONSISTENT, {DAMAGED, {}} },
c07f9fc5 2425 { PG_STATE_INCOMPLETE, {UNAVAILABLE, {}} },
224ce89b 2426 { PG_STATE_SNAPTRIM_ERROR, {DAMAGED, {}} },
b32b8144
FG
2427 { PG_STATE_RECOVERY_UNFOUND, {DAMAGED, {}} },
2428 { PG_STATE_BACKFILL_UNFOUND, {DAMAGED, {}} },
eafe8130
TL
2429 { PG_STATE_BACKFILL_TOOFULL, {BACKFILL_FULL, {}} },
2430 { PG_STATE_RECOVERY_TOOFULL, {RECOVERY_FULL, {}} },
224ce89b
WB
2431 { PG_STATE_DEGRADED, {DEGRADED, {}} },
2432 { PG_STATE_DOWN, {UNAVAILABLE, {}} },
2433 // Delayed (wait until stuck) reports
2434 { PG_STATE_PEERING, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_peered;} } },
2435 { PG_STATE_UNDERSIZED, {DEGRADED, [](const pg_stat_t &p){return p.last_fullsized;} } },
2436 { PG_STATE_STALE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_unstale;} } },
2437 // Delayed and inverted reports
b32b8144 2438 { PG_STATE_ACTIVE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_active;}, true} }
224ce89b
WB
2439 };
2440
2441 // Specialized state printer that takes account of inversion of
2442 // ACTIVE, CLEAN checks.
11fdf7f2 2443 auto state_name = [](const uint64_t &state) {
224ce89b
WB
2444 // Special cases for the states that are inverted checks
2445 if (state == PG_STATE_CLEAN) {
2446 return std::string("unclean");
2447 } else if (state == PG_STATE_ACTIVE) {
2448 return std::string("inactive");
2449 } else {
2450 return pg_state_string(state);
2451 }
2452 };
2453
2454 // Map of what is wrong to information about why, implicitly also stores
2455 // the list of what is wrong.
2456 std::map<pg_consequence_t, PgCauses> detected;
2457
2458 // Optimisation: trim down the number of checks to apply based on
2459 // the summary counters
2460 std::map<unsigned, PgStateResponse> possible_responses;
2461 for (const auto &i : num_pg_by_state) {
2462 for (const auto &j : state_to_response) {
2463 if (!j.second.invert) {
2464 // Check for normal tests by seeing if any pgs have the flag
2465 if (i.first & j.first) {
2466 possible_responses.insert(j);
2467 }
2468 }
2469 }
2470 }
2471
2472 for (const auto &j : state_to_response) {
2473 if (j.second.invert) {
2474 // Check for inverted tests by seeing if not-all pgs have the flag
2475 const auto &found = num_pg_by_state.find(j.first);
2476 if (found == num_pg_by_state.end() || found->second != num_pg) {
2477 possible_responses.insert(j);
2478 }
2479 }
2480 }
2481
11fdf7f2 2482 utime_t cutoff = now - utime_t(cct->_conf.get_val<int64_t>("mon_pg_stuck_threshold"), 0);
224ce89b
WB
2483 // Loop over all PGs, if there are any possibly-unhealthy states in there
2484 if (!possible_responses.empty()) {
2485 for (const auto& i : pg_stat) {
2486 const auto &pg_id = i.first;
2487 const auto &pg_info = i.second;
2488
2489 for (const auto &j : state_to_response) {
2490 const auto &pg_response_state = j.first;
2491 const auto &pg_response = j.second;
2492
2493 // Apply the state test
2494 if (!(bool(pg_info.state & pg_response_state) != pg_response.invert)) {
2495 continue;
2496 }
2497
2498 // Apply stuckness test if needed
2499 if (pg_response.stuck_since) {
2500 // Delayed response, check for stuckness
2501 utime_t last_whatever = pg_response.stuck_since(pg_info);
f6b5b4d7
TL
2502 if (last_whatever.is_zero() &&
2503 pg_info.last_change >= cutoff) {
2504 // still moving, ignore
2505 continue;
2506 } else if (last_whatever >= cutoff) {
224ce89b
WB
2507 // Not stuck enough, ignore.
2508 continue;
2509 } else {
2510
2511 }
2512 }
2513
2514 auto &causes = detected[pg_response.consequence];
2515 causes.states[pg_response_state]++;
2516 causes.pgs.insert(pg_id);
2517
2518 // Don't bother composing detail string if we have already recorded
2519 // too many
2520 if (causes.pg_messages.size() > max) {
2521 continue;
2522 }
2523
2524 std::ostringstream ss;
2525 if (pg_response.stuck_since) {
2526 utime_t since = pg_response.stuck_since(pg_info);
2527 ss << "pg " << pg_id << " is stuck " << state_name(pg_response_state);
2528 if (since == utime_t()) {
2529 ss << " since forever";
2530 } else {
2531 utime_t dur = now - since;
9f95a23c 2532 ss << " for " << utimespan_str(dur);
224ce89b
WB
2533 }
2534 ss << ", current state " << pg_state_string(pg_info.state)
20effc67 2535 << ", last acting " << pg_vector_string(pg_info.acting);
224ce89b
WB
2536 } else {
2537 ss << "pg " << pg_id << " is "
2538 << pg_state_string(pg_info.state);
20effc67 2539 ss << ", acting " << pg_vector_string(pg_info.acting);
224ce89b
WB
2540 if (pg_info.stats.sum.num_objects_unfound) {
2541 ss << ", " << pg_info.stats.sum.num_objects_unfound
2542 << " unfound";
2543 }
2544 }
2545
2546 if (pg_info.state & PG_STATE_INCOMPLETE) {
2547 const pg_pool_t *pi = osdmap.get_pg_pool(pg_id.pool());
2548 if (pi && pi->min_size > 1) {
2549 ss << " (reducing pool "
2550 << osdmap.get_pool_name(pg_id.pool())
2551 << " min_size from " << (int)pi->min_size
2552 << " may help; search ceph.com/docs for 'incomplete')";
2553 }
2554 }
2555
2556 causes.pg_messages[pg_id] = ss.str();
2557 }
2558 }
2559 } else {
2560 dout(10) << __func__ << " skipping loop over PGs: counters look OK" << dendl;
2561 }
2562
2563 for (const auto &i : detected) {
2564 std::string health_code;
2565 health_status_t sev;
2566 std::string summary;
2567 switch(i.first) {
2568 case UNAVAILABLE:
2569 health_code = "PG_AVAILABILITY";
2570 sev = HEALTH_WARN;
2571 summary = "Reduced data availability: ";
2572 break;
2573 case DEGRADED:
2574 health_code = "PG_DEGRADED";
2575 summary = "Degraded data redundancy: ";
2576 sev = HEALTH_WARN;
2577 break;
eafe8130
TL
2578 case BACKFILL_FULL:
2579 health_code = "PG_BACKFILL_FULL";
2580 summary = "Low space hindering backfill (add storage if this doesn't resolve itself): ";
2581 sev = HEALTH_WARN;
224ce89b
WB
2582 break;
2583 case DAMAGED:
2584 health_code = "PG_DAMAGED";
2585 summary = "Possible data damage: ";
2586 sev = HEALTH_ERR;
2587 break;
eafe8130
TL
2588 case RECOVERY_FULL:
2589 health_code = "PG_RECOVERY_FULL";
2590 summary = "Full OSDs blocking recovery: ";
2591 sev = HEALTH_ERR;
2592 break;
224ce89b 2593 default:
11fdf7f2 2594 ceph_abort();
224ce89b
WB
2595 }
2596
2597 if (i.first == DEGRADED) {
2598 if (pg_sum.stats.sum.num_objects_degraded &&
2599 pg_sum.stats.sum.num_object_copies > 0) {
2600 double pc = (double)pg_sum.stats.sum.num_objects_degraded /
2601 (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
2602 char b[20];
2603 snprintf(b, sizeof(b), "%.3lf", pc);
2604 ostringstream ss;
2605 ss << pg_sum.stats.sum.num_objects_degraded
2606 << "/" << pg_sum.stats.sum.num_object_copies << " objects degraded ("
2607 << b << "%)";
2608
2609 // Throw in a comma for the benefit of the following PG counts
2610 summary += ss.str() + ", ";
2611 }
2612 }
2613
2614 // Compose summary message saying how many PGs in what states led
2615 // to this health check failing
2616 std::vector<std::string> pg_msgs;
9f95a23c 2617 int64_t count = 0;
224ce89b
WB
2618 for (const auto &j : i.second.states) {
2619 std::ostringstream msg;
2620 msg << j.second << (j.second > 1 ? " pgs " : " pg ") << state_name(j.first);
2621 pg_msgs.push_back(msg.str());
9f95a23c 2622 count += j.second;
224ce89b
WB
2623 }
2624 summary += joinify(pg_msgs.begin(), pg_msgs.end(), std::string(", "));
2625
224ce89b
WB
2626 health_check_t *check = &checks->add(
2627 health_code,
2628 sev,
9f95a23c
TL
2629 summary,
2630 count);
224ce89b
WB
2631
2632 // Compose list of PGs contributing to this health check failing
2633 for (const auto &j : i.second.pg_messages) {
2634 check->detail.push_back(j.second);
2635 }
2636 }
2637
224ce89b
WB
2638 // OSD_SCRUB_ERRORS
2639 if (pg_sum.stats.sum.num_scrub_errors) {
2640 ostringstream ss;
2641 ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors";
9f95a23c
TL
2642 checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str(),
2643 pg_sum.stats.sum.num_scrub_errors);
224ce89b
WB
2644 }
2645
28e407b8
AA
2646 // LARGE_OMAP_OBJECTS
2647 if (pg_sum.stats.sum.num_large_omap_objects) {
2648 list<string> detail;
2649 for (auto &pool : pools) {
2650 const string& pool_name = osdmap.get_pool_name(pool.first);
2651 auto it2 = pg_pool_sum.find(pool.first);
2652 if (it2 == pg_pool_sum.end()) {
2653 continue;
2654 }
2655 const pool_stat_t *pstat = &it2->second;
2656 if (pstat == nullptr) {
2657 continue;
2658 }
2659 const object_stat_sum_t& sum = pstat->stats.sum;
2660 if (sum.num_large_omap_objects) {
2661 stringstream ss;
2662 ss << sum.num_large_omap_objects << " large objects found in pool "
2663 << "'" << pool_name << "'";
2664 detail.push_back(ss.str());
2665 }
2666 }
2667 if (!detail.empty()) {
2668 ostringstream ss;
2669 ss << pg_sum.stats.sum.num_large_omap_objects << " large omap objects";
9f95a23c
TL
2670 auto& d = checks->add("LARGE_OMAP_OBJECTS", HEALTH_WARN, ss.str(),
2671 pg_sum.stats.sum.num_large_omap_objects);
28e407b8
AA
2672 stringstream tip;
2673 tip << "Search the cluster log for 'Large omap object found' for more "
2674 << "details.";
2675 detail.push_back(tip.str());
2676 d.detail.swap(detail);
2677 }
2678 }
2679
224ce89b
WB
2680 // CACHE_POOL_NEAR_FULL
2681 {
2682 list<string> detail;
2683 unsigned num_pools = 0;
2684 for (auto& p : pools) {
2685 if ((!p.second.target_max_objects && !p.second.target_max_bytes) ||
2686 !pg_pool_sum.count(p.first)) {
2687 continue;
2688 }
2689 bool nearfull = false;
2690 const string& name = osdmap.get_pool_name(p.first);
2691 const pool_stat_t& st = get_pg_pool_sum_stat(p.first);
2692 uint64_t ratio = p.second.cache_target_full_ratio_micro +
2693 ((1000000 - p.second.cache_target_full_ratio_micro) *
2694 cct->_conf->mon_cache_target_full_warn_ratio);
2695 if (p.second.target_max_objects &&
2696 (uint64_t)(st.stats.sum.num_objects -
2697 st.stats.sum.num_objects_hit_set_archive) >
2698 p.second.target_max_objects * (ratio / 1000000.0)) {
2699 ostringstream ss;
2700 ss << "cache pool '" << name << "' with "
1adf2230 2701 << si_u_t(st.stats.sum.num_objects)
224ce89b 2702 << " objects at/near target max "
1adf2230 2703 << si_u_t(p.second.target_max_objects) << " objects";
224ce89b
WB
2704 detail.push_back(ss.str());
2705 nearfull = true;
2706 }
2707 if (p.second.target_max_bytes &&
2708 (uint64_t)(st.stats.sum.num_bytes -
2709 st.stats.sum.num_bytes_hit_set_archive) >
2710 p.second.target_max_bytes * (ratio / 1000000.0)) {
2711 ostringstream ss;
2712 ss << "cache pool '" << name
1adf2230
AA
2713 << "' with " << byte_u_t(st.stats.sum.num_bytes)
2714 << " at/near target max "
2715 << byte_u_t(p.second.target_max_bytes);
224ce89b
WB
2716 detail.push_back(ss.str());
2717 nearfull = true;
2718 }
2719 if (nearfull) {
2720 ++num_pools;
2721 }
2722 }
2723 if (!detail.empty()) {
2724 ostringstream ss;
2725 ss << num_pools << " cache pools at or near target size";
9f95a23c
TL
2726 auto& d = checks->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN, ss.str(),
2727 num_pools);
224ce89b
WB
2728 d.detail.swap(detail);
2729 }
2730 }
2731
2732 // TOO_FEW_PGS
3efd9988
FG
2733 unsigned num_in = osdmap.get_num_in_osds();
2734 auto sum_pg_up = std::max(static_cast<size_t>(pg_sum.up), pg_stat.size());
2735 const auto min_pg_per_osd =
11fdf7f2 2736 cct->_conf.get_val<uint64_t>("mon_pg_warn_min_per_osd");
3efd9988
FG
2737 if (num_in && min_pg_per_osd > 0 && osdmap.get_pools().size() > 0) {
2738 auto per = sum_pg_up / num_in;
2739 if (per < min_pg_per_osd && per) {
224ce89b
WB
2740 ostringstream ss;
2741 ss << "too few PGs per OSD (" << per
3efd9988 2742 << " < min " << min_pg_per_osd << ")";
9f95a23c
TL
2743 checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str(),
2744 min_pg_per_osd - per);
224ce89b
WB
2745 }
2746 }
2747
2748 // TOO_MANY_PGS
11fdf7f2 2749 auto max_pg_per_osd = cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd");
3efd9988
FG
2750 if (num_in && max_pg_per_osd > 0) {
2751 auto per = sum_pg_up / num_in;
2752 if (per > max_pg_per_osd) {
224ce89b
WB
2753 ostringstream ss;
2754 ss << "too many PGs per OSD (" << per
3efd9988 2755 << " > max " << max_pg_per_osd << ")";
9f95a23c
TL
2756 checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str(),
2757 per - max_pg_per_osd);
224ce89b
WB
2758 }
2759 }
2760
eafe8130
TL
2761 // TOO_FEW_OSDS
2762 auto warn_too_few_osds = cct->_conf.get_val<bool>("mon_warn_on_too_few_osds");
2763 auto osd_pool_default_size = cct->_conf.get_val<uint64_t>("osd_pool_default_size");
2764 if (warn_too_few_osds && osdmap.get_num_osds() < osd_pool_default_size) {
2765 ostringstream ss;
2766 ss << "OSD count " << osdmap.get_num_osds()
2767 << " < osd_pool_default_size " << osd_pool_default_size;
9f95a23c
TL
2768 checks->add("TOO_FEW_OSDS", HEALTH_WARN, ss.str(),
2769 osd_pool_default_size - osdmap.get_num_osds());
eafe8130
TL
2770 }
2771
2772 // SLOW_PING_TIME
2773 // Convert milliseconds to microseconds
2774 auto warn_slow_ping_time = cct->_conf.get_val<double>("mon_warn_on_slow_ping_time") * 1000;
2775 auto grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
2776 if (warn_slow_ping_time == 0) {
2777 double ratio = cct->_conf.get_val<double>("mon_warn_on_slow_ping_ratio");
2778 warn_slow_ping_time = grace;
2779 warn_slow_ping_time *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
2780 }
2781 if (warn_slow_ping_time > 0) {
2782
2783 struct mon_ping_item_t {
2784 uint32_t pingtime;
2785 int from;
2786 int to;
2787 bool improving;
2788
2789 bool operator<(const mon_ping_item_t& rhs) const {
2790 if (pingtime < rhs.pingtime)
2791 return true;
2792 if (pingtime > rhs.pingtime)
2793 return false;
2794 if (from < rhs.from)
2795 return true;
2796 if (from > rhs.from)
2797 return false;
2798 return to < rhs.to;
2799 }
2800 };
2801
2802 list<string> detail_back;
2803 list<string> detail_front;
f6b5b4d7 2804 list<string> detail;
eafe8130
TL
2805 set<mon_ping_item_t> back_sorted, front_sorted;
2806 for (auto i : osd_stat) {
2807 for (auto j : i.second.hb_pingtime) {
2808
2809 // Maybe source info is old
2810 if (now.sec() - j.second.last_update > grace * 60)
2811 continue;
2812
2813 mon_ping_item_t back;
2814 back.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
2815 back.pingtime = std::max(back.pingtime, j.second.back_pingtime[2]);
2816 back.from = i.first;
2817 back.to = j.first;
2818 if (back.pingtime > warn_slow_ping_time) {
2819 back.improving = (j.second.back_pingtime[0] < j.second.back_pingtime[1]
2820 && j.second.back_pingtime[1] < j.second.back_pingtime[2]);
2821 back_sorted.emplace(back);
2822 }
2823
2824 mon_ping_item_t front;
2825 front.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
2826 front.pingtime = std::max(front.pingtime, j.second.front_pingtime[2]);
2827 front.from = i.first;
2828 front.to = j.first;
2829 if (front.pingtime > warn_slow_ping_time) {
2830 front.improving = (j.second.front_pingtime[0] < j.second.front_pingtime[1]
2831 && j.second.front_pingtime[1] < j.second.back_pingtime[2]);
2832 front_sorted.emplace(front);
2833 }
2834 }
f6b5b4d7
TL
2835 if (i.second.num_shards_repaired >
2836 cct->_conf.get_val<uint64_t>("mon_osd_warn_num_repaired")) {
2837 ostringstream ss;
2838 ss << "osd." << i.first << " had " << i.second.num_shards_repaired << " reads repaired";
2839 detail.push_back(ss.str());
2840 }
2841 }
2842 if (!detail.empty()) {
2843 ostringstream ss;
2844 ss << "Too many repaired reads on " << detail.size() << " OSDs";
2845 auto& d = checks->add("OSD_TOO_MANY_REPAIRS", HEALTH_WARN, ss.str(),
2846 detail.size());
2847 d.detail.swap(detail);
eafe8130
TL
2848 }
2849 int max_detail = 10;
2850 for (auto &sback : boost::adaptors::reverse(back_sorted)) {
2851 ostringstream ss;
2852 if (max_detail == 0) {
2853 ss << "Truncated long network list. Use ceph daemon mgr.# dump_osd_network for more information";
2854 detail_back.push_back(ss.str());
2855 break;
2856 }
2857 max_detail--;
9f95a23c
TL
2858 ss << "Slow OSD heartbeats on back from osd." << sback.from
2859 << " [" << osd_parentage(osdmap, sback.from) << "]"
eafe8130
TL
2860 << (osdmap.is_down(sback.from) ? " (down)" : "")
2861 << " to osd." << sback.to
9f95a23c 2862 << " [" << osd_parentage(osdmap, sback.to) << "]"
eafe8130
TL
2863 << (osdmap.is_down(sback.to) ? " (down)" : "")
2864 << " " << fixed_u_to_string(sback.pingtime, 3) << " msec"
2865 << (sback.improving ? " possibly improving" : "");
2866 detail_back.push_back(ss.str());
2867 }
2868 max_detail = 10;
2869 for (auto &sfront : boost::adaptors::reverse(front_sorted)) {
2870 ostringstream ss;
2871 if (max_detail == 0) {
2872 ss << "Truncated long network list. Use ceph daemon mgr.# dump_osd_network for more information";
2873 detail_front.push_back(ss.str());
2874 break;
2875 }
2876 max_detail--;
9f95a23c
TL
2877 // Get crush parentage for each osd
2878 ss << "Slow OSD heartbeats on front from osd." << sfront.from
2879 << " [" << osd_parentage(osdmap, sfront.from) << "]"
eafe8130
TL
2880 << (osdmap.is_down(sfront.from) ? " (down)" : "")
2881 << " to osd." << sfront.to
9f95a23c 2882 << " [" << osd_parentage(osdmap, sfront.to) << "]"
eafe8130
TL
2883 << (osdmap.is_down(sfront.to) ? " (down)" : "")
2884 << " " << fixed_u_to_string(sfront.pingtime, 3) << " msec"
2885 << (sfront.improving ? " possibly improving" : "");
2886 detail_front.push_back(ss.str());
2887 }
2888 if (detail_back.size() != 0) {
2889 ostringstream ss;
9f95a23c
TL
2890 ss << "Slow OSD heartbeats on back (longest "
2891 << fixed_u_to_string(back_sorted.rbegin()->pingtime, 3) << "ms)";
2892 auto& d = checks->add("OSD_SLOW_PING_TIME_BACK", HEALTH_WARN, ss.str(),
2893 back_sorted.size());
eafe8130
TL
2894 d.detail.swap(detail_back);
2895 }
2896 if (detail_front.size() != 0) {
2897 ostringstream ss;
9f95a23c
TL
2898 ss << "Slow OSD heartbeats on front (longest "
2899 << fixed_u_to_string(front_sorted.rbegin()->pingtime, 3) << "ms)";
2900 auto& d = checks->add("OSD_SLOW_PING_TIME_FRONT", HEALTH_WARN, ss.str(),
2901 front_sorted.size());
eafe8130
TL
2902 d.detail.swap(detail_front);
2903 }
2904 }
2905
224ce89b
WB
2906 // SMALLER_PGP_NUM
2907 // MANY_OBJECTS_PER_PG
2908 if (!pg_stat.empty()) {
2909 list<string> pgp_detail, many_detail;
b32b8144 2910 const auto mon_pg_warn_min_objects =
11fdf7f2 2911 cct->_conf.get_val<int64_t>("mon_pg_warn_min_objects");
b32b8144 2912 const auto mon_pg_warn_min_pool_objects =
11fdf7f2 2913 cct->_conf.get_val<int64_t>("mon_pg_warn_min_pool_objects");
b32b8144 2914 const auto mon_pg_warn_max_object_skew =
11fdf7f2 2915 cct->_conf.get_val<double>("mon_pg_warn_max_object_skew");
224ce89b
WB
2916 for (auto p = pg_pool_sum.begin();
2917 p != pg_pool_sum.end();
2918 ++p) {
2919 const pg_pool_t *pi = osdmap.get_pg_pool(p->first);
2920 if (!pi)
2921 continue; // in case osdmap changes haven't propagated to PGMap yet
2922 const string& name = osdmap.get_pool_name(p->first);
11fdf7f2
TL
2923 // NOTE: we use pg_num_target and pgp_num_target for the purposes of
2924 // the warnings. If the cluster is failing to converge on the target
2925 // values that is a separate issue!
2926 if (pi->get_pg_num_target() > pi->get_pgp_num_target() &&
224ce89b
WB
2927 !(name.find(".DELETED") != string::npos &&
2928 cct->_conf->mon_fake_pool_delete)) {
2929 ostringstream ss;
2930 ss << "pool " << name << " pg_num "
11fdf7f2
TL
2931 << pi->get_pg_num_target()
2932 << " > pgp_num " << pi->get_pgp_num_target();
224ce89b
WB
2933 pgp_detail.push_back(ss.str());
2934 }
2935 int average_objects_per_pg = pg_sum.stats.sum.num_objects / pg_stat.size();
2936 if (average_objects_per_pg > 0 &&
b32b8144
FG
2937 pg_sum.stats.sum.num_objects >= mon_pg_warn_min_objects &&
2938 p->second.stats.sum.num_objects >= mon_pg_warn_min_pool_objects) {
11fdf7f2
TL
2939 int objects_per_pg = p->second.stats.sum.num_objects /
2940 pi->get_pg_num_target();
224ce89b 2941 float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
b32b8144
FG
2942 if (mon_pg_warn_max_object_skew > 0 &&
2943 ratio > mon_pg_warn_max_object_skew) {
224ce89b 2944 ostringstream ss;
20effc67
TL
2945 if (pi->pg_autoscale_mode != pg_pool_t::pg_autoscale_mode_t::ON) {
2946 ss << "pool " << name << " objects per pg ("
2947 << objects_per_pg << ") is more than " << ratio
2948 << " times cluster average ("
2949 << average_objects_per_pg << ")";
2950 many_detail.push_back(ss.str());
2951 }
224ce89b
WB
2952 }
2953 }
2954 }
2955 if (!pgp_detail.empty()) {
2956 ostringstream ss;
2957 ss << pgp_detail.size() << " pools have pg_num > pgp_num";
9f95a23c
TL
2958 auto& d = checks->add("SMALLER_PGP_NUM", HEALTH_WARN, ss.str(),
2959 pgp_detail.size());
224ce89b
WB
2960 d.detail.swap(pgp_detail);
2961 }
2962 if (!many_detail.empty()) {
2963 ostringstream ss;
2964 ss << many_detail.size() << " pools have many more objects per pg than"
2965 << " average";
9f95a23c
TL
2966 auto& d = checks->add("MANY_OBJECTS_PER_PG", HEALTH_WARN, ss.str(),
2967 many_detail.size());
224ce89b
WB
2968 d.detail.swap(many_detail);
2969 }
2970 }
2971
2972 // POOL_FULL
2973 // POOL_NEAR_FULL
2974 {
11fdf7f2
TL
2975 float warn_threshold = (float)g_conf().get_val<int64_t>("mon_pool_quota_warn_threshold")/100;
2976 float crit_threshold = (float)g_conf().get_val<int64_t>("mon_pool_quota_crit_threshold")/100;
224ce89b
WB
2977 list<string> full_detail, nearfull_detail;
2978 unsigned full_pools = 0, nearfull_pools = 0;
2979 for (auto it : pools) {
2980 auto it2 = pg_pool_sum.find(it.first);
2981 if (it2 == pg_pool_sum.end()) {
2982 continue;
2983 }
2984 const pool_stat_t *pstat = &it2->second;
2985 const object_stat_sum_t& sum = pstat->stats.sum;
2986 const string& pool_name = osdmap.get_pool_name(it.first);
2987 const pg_pool_t &pool = it.second;
2988 bool full = false, nearfull = false;
2989 if (pool.quota_max_objects > 0) {
2990 stringstream ss;
2991 if ((uint64_t)sum.num_objects >= pool.quota_max_objects) {
2992 } else if (crit_threshold > 0 &&
2993 sum.num_objects >= pool.quota_max_objects*crit_threshold) {
2994 ss << "pool '" << pool_name
2995 << "' has " << sum.num_objects << " objects"
2996 << " (max " << pool.quota_max_objects << ")";
2997 full_detail.push_back(ss.str());
2998 full = true;
2999 } else if (warn_threshold > 0 &&
3000 sum.num_objects >= pool.quota_max_objects*warn_threshold) {
3001 ss << "pool '" << pool_name
3002 << "' has " << sum.num_objects << " objects"
3003 << " (max " << pool.quota_max_objects << ")";
3004 nearfull_detail.push_back(ss.str());
3005 nearfull = true;
3006 }
3007 }
3008 if (pool.quota_max_bytes > 0) {
3009 stringstream ss;
3010 if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
3011 } else if (crit_threshold > 0 &&
3012 sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
3013 ss << "pool '" << pool_name
1adf2230
AA
3014 << "' has " << byte_u_t(sum.num_bytes)
3015 << " (max " << byte_u_t(pool.quota_max_bytes) << ")";
224ce89b
WB
3016 full_detail.push_back(ss.str());
3017 full = true;
3018 } else if (warn_threshold > 0 &&
3019 sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
3020 ss << "pool '" << pool_name
1adf2230
AA
3021 << "' has " << byte_u_t(sum.num_bytes)
3022 << " (max " << byte_u_t(pool.quota_max_bytes) << ")";
224ce89b
WB
3023 nearfull_detail.push_back(ss.str());
3024 nearfull = true;
3025 }
3026 }
3027 if (full) {
3028 ++full_pools;
3029 }
3030 if (nearfull) {
3031 ++nearfull_pools;
3032 }
3033 }
3034 if (full_pools) {
3035 ostringstream ss;
3036 ss << full_pools << " pools full";
9f95a23c 3037 auto& d = checks->add("POOL_FULL", HEALTH_ERR, ss.str(), full_pools);
224ce89b
WB
3038 d.detail.swap(full_detail);
3039 }
3040 if (nearfull_pools) {
3041 ostringstream ss;
11fdf7f2 3042 ss << nearfull_pools << " pools nearfull";
9f95a23c 3043 auto& d = checks->add("POOL_NEAR_FULL", HEALTH_WARN, ss.str(), nearfull_pools);
224ce89b
WB
3044 d.detail.swap(nearfull_detail);
3045 }
3046 }
3047
3048 // OBJECT_MISPLACED
3049 if (pg_sum.stats.sum.num_objects_misplaced &&
11fdf7f2
TL
3050 pg_sum.stats.sum.num_object_copies > 0 &&
3051 cct->_conf->mon_warn_on_misplaced) {
224ce89b
WB
3052 double pc = (double)pg_sum.stats.sum.num_objects_misplaced /
3053 (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
3054 char b[20];
3055 snprintf(b, sizeof(b), "%.3lf", pc);
3056 ostringstream ss;
3057 ss << pg_sum.stats.sum.num_objects_misplaced
3058 << "/" << pg_sum.stats.sum.num_object_copies << " objects misplaced ("
3059 << b << "%)";
9f95a23c
TL
3060 checks->add("OBJECT_MISPLACED", HEALTH_WARN, ss.str(),
3061 pg_sum.stats.sum.num_objects_misplaced);
224ce89b
WB
3062 }
3063
3064 // OBJECT_UNFOUND
3065 if (pg_sum.stats.sum.num_objects_unfound &&
3066 pg_sum.stats.sum.num_objects) {
3067 double pc = (double)pg_sum.stats.sum.num_objects_unfound /
3068 (double)pg_sum.stats.sum.num_objects * (double)100.0;
3069 char b[20];
3070 snprintf(b, sizeof(b), "%.3lf", pc);
3071 ostringstream ss;
3072 ss << pg_sum.stats.sum.num_objects_unfound
b5b8bbf5 3073 << "/" << pg_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
9f95a23c
TL
3074 auto& d = checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str(),
3075 pg_sum.stats.sum.num_objects_unfound);
c07f9fc5
FG
3076
3077 for (auto& p : pg_stat) {
3078 if (p.second.stats.sum.num_objects_unfound) {
3079 ostringstream ss;
3080 ss << "pg " << p.first
3081 << " has " << p.second.stats.sum.num_objects_unfound
3082 << " unfound objects";
3083 d.detail.push_back(ss.str());
3084 if (d.detail.size() > max) {
3085 d.detail.push_back("(additional pgs left out for brevity)");
3086 break;
3087 }
3088 }
3089 }
224ce89b
WB
3090 }
3091
3092 // REQUEST_SLOW
3093 // REQUEST_STUCK
11fdf7f2 3094 // SLOW_OPS unifies them in mimic.
9f95a23c 3095 if (osdmap.require_osd_release < ceph_release_t::mimic &&
11fdf7f2 3096 cct->_conf->mon_osd_warn_op_age > 0 &&
c07f9fc5
FG
3097 !osd_sum.op_queue_age_hist.h.empty() &&
3098 osd_sum.op_queue_age_hist.upper_bound() / 1000.0 >
224ce89b
WB
3099 cct->_conf->mon_osd_warn_op_age) {
3100 list<string> warn_detail, error_detail;
3101 unsigned warn = 0, error = 0;
3102 float err_age =
3103 cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio;
3104 const pow2_hist_t& h = osd_sum.op_queue_age_hist;
3105 for (unsigned i = h.h.size() - 1; i > 0; --i) {
3106 float ub = (float)(1 << i) / 1000.0;
3107 if (ub < cct->_conf->mon_osd_warn_op_age)
3108 break;
3109 if (h.h[i]) {
3110 ostringstream ss;
3111 ss << h.h[i] << " ops are blocked > " << ub << " sec";
3112 if (ub > err_age) {
3113 error += h.h[i];
3114 error_detail.push_back(ss.str());
3115 } else {
3116 warn += h.h[i];
3117 warn_detail.push_back(ss.str());
3118 }
3119 }
3120 }
3121
3122 map<float,set<int>> warn_osd_by_max; // max -> osds
3123 map<float,set<int>> error_osd_by_max; // max -> osds
3124 if (!warn_detail.empty() || !error_detail.empty()) {
3125 for (auto& p : osd_stat) {
3126 const pow2_hist_t& h = p.second.op_queue_age_hist;
3127 for (unsigned i = h.h.size() - 1; i > 0; --i) {
3128 float ub = (float)(1 << i) / 1000.0;
3129 if (ub < cct->_conf->mon_osd_warn_op_age)
3130 break;
3131 if (h.h[i]) {
3132 if (ub > err_age) {
3133 error_osd_by_max[ub].insert(p.first);
3134 } else {
3135 warn_osd_by_max[ub].insert(p.first);
3136 }
3137 break;
3138 }
3139 }
3140 }
3141 }
3142
3143 if (!warn_detail.empty()) {
11fdf7f2
TL
3144 ostringstream ss;
3145 ss << warn << " slow requests are blocked > "
3146 << cct->_conf->mon_osd_warn_op_age << " sec";
9f95a23c 3147 auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str(), warn);
11fdf7f2 3148 d.detail.swap(warn_detail);
224ce89b
WB
3149 int left = max;
3150 for (auto& p : warn_osd_by_max) {
3151 ostringstream ss;
3152 if (p.second.size() > 1) {
c07f9fc5
FG
3153 ss << "osds " << p.second
3154 << " have blocked requests > " << p.first << " sec";
224ce89b 3155 } else {
c07f9fc5
FG
3156 ss << "osd." << *p.second.begin()
3157 << " has blocked requests > " << p.first << " sec";
224ce89b 3158 }
11fdf7f2 3159 d.detail.push_back(ss.str());
224ce89b
WB
3160 if (--left == 0) {
3161 break;
3162 }
3163 }
3164 }
3165 if (!error_detail.empty()) {
11fdf7f2
TL
3166 ostringstream ss;
3167 ss << error << " stuck requests are blocked > "
3168 << err_age << " sec";
9f95a23c 3169 auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str(), error);
11fdf7f2 3170 d.detail.swap(error_detail);
224ce89b
WB
3171 int left = max;
3172 for (auto& p : error_osd_by_max) {
3173 ostringstream ss;
3174 if (p.second.size() > 1) {
c07f9fc5
FG
3175 ss << "osds " << p.second
3176 << " have stuck requests > " << p.first << " sec";
224ce89b 3177 } else {
c07f9fc5
FG
3178 ss << "osd." << *p.second.begin()
3179 << " has stuck requests > " << p.first << " sec";
224ce89b 3180 }
11fdf7f2 3181 d.detail.push_back(ss.str());
224ce89b
WB
3182 if (--left == 0) {
3183 break;
3184 }
3185 }
3186 }
3187 }
7c673cae 3188
11fdf7f2
TL
3189 // OBJECT_STORE_WARN
3190 if (osd_sum.os_alerts.size()) {
3191 map<string, pair<size_t, list<string>>> os_alerts_sum;
3192
3193 for (auto& a : osd_sum.os_alerts) {
3194 int left = max;
3195 string s0 = " osd.";
3196 s0 += stringify(a.first);
3197 for (auto& aa : a.second) {
3198 string s(s0);
3199 s += " ";
3200 s += aa.second;
3201 auto it = os_alerts_sum.find(aa.first);
3202 if (it == os_alerts_sum.end()) {
3203 list<string> d;
3204 d.emplace_back(s);
3205 os_alerts_sum.emplace(aa.first, std::make_pair(1, d));
3206 } else {
3207 auto& p = it->second;
3208 ++p.first;
3209 p.second.emplace_back(s);
3210 }
3211 if (--left == 0) {
3212 break;
3213 }
3214 }
3215 }
3216
3217 for (auto& asum : os_alerts_sum) {
9f95a23c 3218 string summary = stringify(asum.second.first) + " OSD(s)";
11fdf7f2 3219 if (asum.first == "BLUEFS_SPILLOVER") {
9f95a23c 3220 summary += " experiencing BlueFS spillover";
11fdf7f2 3221 } else if (asum.first == "BLUESTORE_NO_COMPRESSION") {
9f95a23c 3222 summary += " have broken BlueStore compression";
81eedcae 3223 } else if (asum.first == "BLUESTORE_LEGACY_STATFS") {
9f95a23c 3224 summary += " reporting legacy (not per-pool) BlueStore stats";
81eedcae 3225 } else if (asum.first == "BLUESTORE_DISK_SIZE_MISMATCH") {
9f95a23c 3226 summary += " have dangerous mismatch between BlueStore block device and free list sizes";
f67539c2
TL
3227 } else if (asum.first == "BLUESTORE_NO_PER_PG_OMAP") {
3228 summary += " reporting legacy (not per-pg) BlueStore omap";
9f95a23c
TL
3229 } else if (asum.first == "BLUESTORE_NO_PER_POOL_OMAP") {
3230 summary += " reporting legacy (not per-pool) BlueStore omap usage stats";
f67539c2
TL
3231 } else if (asum.first == "BLUESTORE_SPURIOUS_READ_ERRORS") {
3232 summary += " have spurious read errors";
11fdf7f2 3233 }
f67539c2 3234
9f95a23c 3235 auto& d = checks->add(asum.first, HEALTH_WARN, summary, asum.second.first);
11fdf7f2
TL
3236 for (auto& s : asum.second.second) {
3237 d.detail.push_back(s);
3238 }
3239 }
3240 }
224ce89b
WB
3241 // PG_NOT_SCRUBBED
3242 // PG_NOT_DEEP_SCRUBBED
11fdf7f2
TL
3243 if (cct->_conf->mon_warn_pg_not_scrubbed_ratio ||
3244 cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
a8e16298
TL
3245 list<string> detail, deep_detail;
3246 int detail_max = max, deep_detail_max = max;
3247 int detail_more = 0, deep_detail_more = 0;
3248 int detail_total = 0, deep_detail_total = 0;
3249 for (auto& p : pg_stat) {
3250 int64_t pnum = p.first.pool();
3251 auto pool = osdmap.get_pg_pool(pnum);
3252 if (!pool)
3253 continue;
11fdf7f2 3254 if (cct->_conf->mon_warn_pg_not_scrubbed_ratio) {
a8e16298
TL
3255 double scrub_max_interval = 0;
3256 pool->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
3257 if (scrub_max_interval <= 0) {
3258 scrub_max_interval = cct->_conf->osd_scrub_max_interval;
c07f9fc5 3259 }
11fdf7f2 3260 const double age = (cct->_conf->mon_warn_pg_not_scrubbed_ratio * scrub_max_interval) +
a8e16298
TL
3261 scrub_max_interval;
3262 utime_t cutoff = now;
3263 cutoff -= age;
3264 if (p.second.last_scrub_stamp < cutoff) {
3265 if (detail_max > 0) {
3266 ostringstream ss;
3267 ss << "pg " << p.first << " not scrubbed since "
3268 << p.second.last_scrub_stamp;
3269 detail.push_back(ss.str());
3270 --detail_max;
3271 } else {
3272 ++detail_more;
3273 }
3274 ++detail_total;
3275 }
3276 }
11fdf7f2 3277 if (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
a8e16298
TL
3278 double deep_scrub_interval = 0;
3279 pool->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
3280 if (deep_scrub_interval <= 0) {
3281 deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
3282 }
11fdf7f2 3283 double deep_age = (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio * deep_scrub_interval) +
a8e16298
TL
3284 deep_scrub_interval;
3285 utime_t deep_cutoff = now;
3286 deep_cutoff -= deep_age;
3287 if (p.second.last_deep_scrub_stamp < deep_cutoff) {
3288 if (deep_detail_max > 0) {
3289 ostringstream ss;
3290 ss << "pg " << p.first << " not deep-scrubbed since "
3291 << p.second.last_deep_scrub_stamp;
3292 deep_detail.push_back(ss.str());
3293 --deep_detail_max;
3294 } else {
3295 ++deep_detail_more;
3296 }
3297 ++deep_detail_total;
c07f9fc5 3298 }
224ce89b 3299 }
a8e16298
TL
3300 }
3301 if (detail_total) {
3302 ostringstream ss;
3303 ss << detail_total << " pgs not scrubbed in time";
9f95a23c 3304 auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str(), detail_total);
a8e16298 3305
c07f9fc5 3306 if (!detail.empty()) {
c07f9fc5 3307 d.detail.swap(detail);
a8e16298
TL
3308
3309 if (detail_more) {
3310 ostringstream ss;
3311 ss << detail_more << " more pgs... ";
3312 d.detail.push_back(ss.str());
3313 }
c07f9fc5 3314 }
a8e16298
TL
3315 }
3316 if (deep_detail_total) {
3317 ostringstream ss;
3318 ss << deep_detail_total << " pgs not deep-scrubbed in time";
9f95a23c
TL
3319 auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str(),
3320 deep_detail_total);
a8e16298 3321
c07f9fc5 3322 if (!deep_detail.empty()) {
c07f9fc5 3323 d.detail.swap(deep_detail);
a8e16298
TL
3324
3325 if (deep_detail_more) {
3326 ostringstream ss;
3327 ss << deep_detail_more << " more pgs... ";
3328 d.detail.push_back(ss.str());
3329 }
c07f9fc5
FG
3330 }
3331 }
3332 }
3333
3334 // POOL_APP
11fdf7f2 3335 if (g_conf().get_val<bool>("mon_warn_on_pool_no_app")) {
c07f9fc5
FG
3336 list<string> detail;
3337 for (auto &it : pools) {
3338 const pg_pool_t &pool = it.second;
3339 const string& pool_name = osdmap.get_pool_name(it.first);
3340 auto it2 = pg_pool_sum.find(it.first);
3341 if (it2 == pg_pool_sum.end()) {
3342 continue;
3343 }
3344 const pool_stat_t *pstat = &it2->second;
3345 if (pstat == nullptr) {
3346 continue;
3347 }
3348 const object_stat_sum_t& sum = pstat->stats.sum;
3349 // application metadata is not encoded until luminous is minimum
3350 // required release
11fdf7f2
TL
3351 if (sum.num_objects > 0 && pool.application_metadata.empty() &&
3352 !pool.is_tier()) {
c07f9fc5
FG
3353 stringstream ss;
3354 ss << "application not enabled on pool '" << pool_name << "'";
3355 detail.push_back(ss.str());
224ce89b
WB
3356 }
3357 }
3358 if (!detail.empty()) {
3359 ostringstream ss;
9f95a23c
TL
3360 ss << detail.size() << " pool(s) do not have an application enabled";
3361 auto& d = checks->add("POOL_APP_NOT_ENABLED", HEALTH_WARN, ss.str(),
3362 detail.size());
c07f9fc5
FG
3363 stringstream tip;
3364 tip << "use 'ceph osd pool application enable <pool-name> "
3365 << "<app-name>', where <app-name> is 'cephfs', 'rbd', 'rgw', "
3366 << "or freeform for custom applications.";
3367 detail.push_back(tip.str());
224ce89b
WB
3368 d.detail.swap(detail);
3369 }
31f18b77 3370 }
b32b8144
FG
3371
3372 // PG_SLOW_SNAP_TRIMMING
3373 if (!pg_stat.empty() && cct->_conf->mon_osd_snap_trim_queue_warn_on > 0) {
3374 uint32_t snapthreshold = cct->_conf->mon_osd_snap_trim_queue_warn_on;
3375 uint64_t snaptrimq_exceeded = 0;
3376 uint32_t longest_queue = 0;
3377 const pg_t* longest_q_pg = nullptr;
3378 list<string> detail;
3379
3380 for (auto& i: pg_stat) {
3381 uint32_t current_len = i.second.snaptrimq_len;
3382 if (current_len >= snapthreshold) {
3383 snaptrimq_exceeded++;
3384 if (longest_queue <= current_len) {
3385 longest_q_pg = &i.first;
3386 longest_queue = current_len;
3387 }
3388 if (detail.size() < max - 1) {
3389 stringstream ss;
3390 ss << "snap trim queue for pg " << i.first << " at " << current_len;
3391 detail.push_back(ss.str());
3392 continue;
3393 }
3394 if (detail.size() < max) {
3395 detail.push_back("...more pgs affected");
3396 continue;
3397 }
3398 }
3399 }
3400
3401 if (snaptrimq_exceeded) {
3402 {
3403 ostringstream ss;
3404 ss << "longest queue on pg " << *longest_q_pg << " at " << longest_queue;
3405 detail.push_back(ss.str());
3406 }
3407
3408 stringstream ss;
3409 ss << "snap trim queue for " << snaptrimq_exceeded << " pg(s) >= " << snapthreshold << " (mon_osd_snap_trim_queue_warn_on)";
9f95a23c
TL
3410 auto& d = checks->add("PG_SLOW_SNAP_TRIMMING", HEALTH_WARN, ss.str(),
3411 snaptrimq_exceeded);
b32b8144
FG
3412 detail.push_back("try decreasing \"osd snap trim sleep\" and/or increasing \"osd pg max concurrent snap trims\".");
3413 d.detail.swap(detail);
3414 }
3415 }
31f18b77 3416}
7c673cae 3417
9f95a23c
TL
3418void PGMap::print_summary(ceph::Formatter *f, ostream *out) const
3419{
3420 if (f) {
3421 f->open_array_section("pgs_by_pool_state");
3422 for (auto& i: num_pg_by_pool_state) {
3423 f->open_object_section("per_pool_pgs_by_state");
3424 f->dump_int("pool_id", i.first);
3425 f->open_array_section("pg_state_counts");
3426 for (auto& j : i.second) {
3427 f->open_object_section("pg_state_count");
3428 f->dump_string("state_name", pg_state_string(j.first));
3429 f->dump_int("count", j.second);
3430 f->close_section();
3431 }
3432 f->close_section();
3433 f->close_section();
3434 }
3435 f->close_section();
3436 }
3437 PGMapDigest::print_summary(f, out);
3438}
3439
7c673cae
FG
3440int process_pg_map_command(
3441 const string& orig_prefix,
11fdf7f2 3442 const cmdmap_t& orig_cmdmap,
7c673cae
FG
3443 const PGMap& pg_map,
3444 const OSDMap& osdmap,
9f95a23c 3445 ceph::Formatter *f,
7c673cae
FG
3446 stringstream *ss,
3447 bufferlist *odata)
3448{
3449 string prefix = orig_prefix;
11fdf7f2
TL
3450 auto cmdmap = orig_cmdmap;
3451
3452 string omap_stats_note =
3453 "\n* NOTE: Omap statistics are gathered during deep scrub and "
9f95a23c 3454 "may be inaccurate soon afterwards depending on utilization. See "
f67539c2 3455 "http://docs.ceph.com/en/latest/dev/placement-group/#omap-statistics "
11fdf7f2
TL
3456 "for further details.\n";
3457 bool omap_stats_note_required = false;
7c673cae
FG
3458
3459 // perhaps these would be better in the parsing, but it's weird
3460 bool primary = false;
3461 if (prefix == "pg dump_json") {
3462 vector<string> v;
3463 v.push_back(string("all"));
7c673cae
FG
3464 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3465 prefix = "pg dump";
3466 } else if (prefix == "pg dump_pools_json") {
3467 vector<string> v;
3468 v.push_back(string("pools"));
7c673cae
FG
3469 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3470 prefix = "pg dump";
3471 } else if (prefix == "pg ls-by-primary") {
3472 primary = true;
3473 prefix = "pg ls";
3474 } else if (prefix == "pg ls-by-osd") {
3475 prefix = "pg ls";
3476 } else if (prefix == "pg ls-by-pool") {
3477 prefix = "pg ls";
3478 string poolstr;
9f95a23c 3479 cmd_getval(cmdmap, "poolstr", poolstr);
7c673cae
FG
3480 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
3481 if (pool < 0) {
3482 *ss << "pool " << poolstr << " does not exist";
3483 return -ENOENT;
3484 }
3485 cmd_putval(g_ceph_context, cmdmap, "pool", pool);
3486 }
3487
7c673cae
FG
3488 stringstream ds;
3489 if (prefix == "pg stat") {
3490 if (f) {
3491 f->open_object_section("pg_summary");
3492 pg_map.print_oneline_summary(f, NULL);
3493 f->close_section();
3494 f->flush(ds);
3495 } else {
3496 ds << pg_map;
3497 }
3498 odata->append(ds);
3499 return 0;
3500 }
3501
3502 if (prefix == "pg getmap") {
3503 pg_map.encode(*odata);
3504 *ss << "got pgmap version " << pg_map.version;
3505 return 0;
3506 }
3507
3508 if (prefix == "pg dump") {
3509 string val;
3510 vector<string> dumpcontents;
3511 set<string> what;
9f95a23c 3512 if (cmd_getval(cmdmap, "dumpcontents", dumpcontents)) {
7c673cae
FG
3513 copy(dumpcontents.begin(), dumpcontents.end(),
3514 inserter(what, what.end()));
3515 }
3516 if (what.empty())
3517 what.insert("all");
3518 if (f) {
3519 if (what.count("all")) {
3520 f->open_object_section("pg_map");
3521 pg_map.dump(f);
3522 f->close_section();
3523 } else if (what.count("summary") || what.count("sum")) {
3524 f->open_object_section("pg_map");
3525 pg_map.dump_basic(f);
3526 f->close_section();
3527 } else {
3528 if (what.count("pools")) {
3529 pg_map.dump_pool_stats(f);
3530 }
3531 if (what.count("osds")) {
3532 pg_map.dump_osd_stats(f);
3533 }
3534 if (what.count("pgs")) {
3535 pg_map.dump_pg_stats(f, false);
3536 }
3537 if (what.count("pgs_brief")) {
3538 pg_map.dump_pg_stats(f, true);
3539 }
3540 if (what.count("delta")) {
3541 f->open_object_section("delta");
3542 pg_map.dump_delta(f);
3543 f->close_section();
3544 }
3545 }
3546 f->flush(*odata);
3547 } else {
3548 if (what.count("all")) {
3549 pg_map.dump(ds);
11fdf7f2 3550 omap_stats_note_required = true;
7c673cae
FG
3551 } else if (what.count("summary") || what.count("sum")) {
3552 pg_map.dump_basic(ds);
3553 pg_map.dump_pg_sum_stats(ds, true);
3554 pg_map.dump_osd_sum_stats(ds);
11fdf7f2 3555 omap_stats_note_required = true;
7c673cae
FG
3556 } else {
3557 if (what.count("pgs_brief")) {
3558 pg_map.dump_pg_stats(ds, true);
3559 }
3560 bool header = true;
3561 if (what.count("pgs")) {
3562 pg_map.dump_pg_stats(ds, false);
3563 header = false;
11fdf7f2 3564 omap_stats_note_required = true;
7c673cae
FG
3565 }
3566 if (what.count("pools")) {
3567 pg_map.dump_pool_stats(ds, header);
11fdf7f2 3568 omap_stats_note_required = true;
7c673cae
FG
3569 }
3570 if (what.count("osds")) {
3571 pg_map.dump_osd_stats(ds);
3572 }
3573 }
3574 odata->append(ds);
11fdf7f2
TL
3575 if (omap_stats_note_required) {
3576 odata->append(omap_stats_note);
3577 }
7c673cae
FG
3578 }
3579 *ss << "dumped " << what;
3580 return 0;
3581 }
3582
3583 if (prefix == "pg ls") {
3584 int64_t osd = -1;
3585 int64_t pool = -1;
3586 vector<string>states;
3587 set<pg_t> pgs;
9f95a23c
TL
3588 cmd_getval(cmdmap, "pool", pool);
3589 cmd_getval(cmdmap, "osd", osd);
3590 cmd_getval(cmdmap, "states", states);
7c673cae
FG
3591 if (pool >= 0 && !osdmap.have_pg_pool(pool)) {
3592 *ss << "pool " << pool << " does not exist";
3593 return -ENOENT;
3594 }
3595 if (osd >= 0 && !osdmap.is_up(osd)) {
3596 *ss << "osd " << osd << " is not up";
3597 return -EAGAIN;
3598 }
3599 if (states.empty())
3600 states.push_back("all");
3601
11fdf7f2 3602 uint64_t state = 0;
7c673cae
FG
3603
3604 while (!states.empty()) {
3605 string state_str = states.back();
3606
3607 if (state_str == "all") {
3608 state = -1;
3609 break;
3610 } else {
3efd9988
FG
3611 auto filter = pg_string_state(state_str);
3612 if (!filter) {
c07f9fc5
FG
3613 *ss << "'" << state_str << "' is not a valid pg state,"
3614 << " available choices: " << pg_state_string(0xFFFFFFFF);
3615 return -EINVAL;
3616 }
3efd9988 3617 state |= *filter;
7c673cae
FG
3618 }
3619
3620 states.pop_back();
3621 }
3622
3623 pg_map.get_filtered_pg_stats(state, pool, osd, primary, pgs);
3624
3625 if (f && !pgs.empty()) {
3626 pg_map.dump_filtered_pg_stats(f, pgs);
3627 f->flush(*odata);
3628 } else if (!pgs.empty()) {
3629 pg_map.dump_filtered_pg_stats(ds, pgs);
3630 odata->append(ds);
11fdf7f2 3631 odata->append(omap_stats_note);
7c673cae
FG
3632 }
3633 return 0;
3634 }
3635
3636 if (prefix == "pg dump_stuck") {
3637 vector<string> stuckop_vec;
9f95a23c 3638 cmd_getval(cmdmap, "stuckops", stuckop_vec);
7c673cae
FG
3639 if (stuckop_vec.empty())
3640 stuckop_vec.push_back("unclean");
20effc67
TL
3641 const int64_t threshold = cmd_getval_or<int64_t>(
3642 cmdmap, "threshold",
3643 g_conf().get_val<int64_t>("mon_pg_stuck_threshold"));
7c673cae 3644
11fdf7f2 3645 if (pg_map.dump_stuck_pg_stats(ds, f, (int)threshold, stuckop_vec) < 0) {
7c673cae 3646 *ss << "failed";
11fdf7f2 3647 } else {
7c673cae 3648 *ss << "ok";
11fdf7f2
TL
3649 }
3650 odata->append(ds);
7c673cae
FG
3651 return 0;
3652 }
3653
3654 if (prefix == "pg debug") {
20effc67
TL
3655 const string debugop = cmd_getval_or<string>(
3656 cmdmap, "debugop",
3657 "unfound_objects_exist");
7c673cae
FG
3658 if (debugop == "unfound_objects_exist") {
3659 bool unfound_objects_exist = false;
3660 for (const auto& p : pg_map.pg_stat) {
3661 if (p.second.stats.sum.num_objects_unfound > 0) {
3662 unfound_objects_exist = true;
3663 break;
3664 }
3665 }
3666 if (unfound_objects_exist)
3667 ds << "TRUE";
3668 else
3669 ds << "FALSE";
3670 odata->append(ds);
3671 return 0;
3672 }
3673 if (debugop == "degraded_pgs_exist") {
3674 bool degraded_pgs_exist = false;
3675 for (const auto& p : pg_map.pg_stat) {
3676 if (p.second.stats.sum.num_objects_degraded > 0) {
3677 degraded_pgs_exist = true;
3678 break;
3679 }
3680 }
3681 if (degraded_pgs_exist)
3682 ds << "TRUE";
3683 else
3684 ds << "FALSE";
3685 odata->append(ds);
3686 return 0;
3687 }
3688 }
3689
3690 if (prefix == "osd perf") {
3691 if (f) {
3692 f->open_object_section("osdstats");
3693 pg_map.dump_osd_perf_stats(f);
3694 f->close_section();
3695 f->flush(ds);
3696 } else {
3697 pg_map.print_osd_perf_stats(&ds);
3698 }
3699 odata->append(ds);
3700 return 0;
3701 }
3702
3703 if (prefix == "osd blocked-by") {
3704 if (f) {
3705 f->open_object_section("osd_blocked_by");
3706 pg_map.dump_osd_blocked_by_stats(f);
3707 f->close_section();
3708 f->flush(ds);
3709 } else {
3710 pg_map.print_osd_blocked_by_stats(&ds);
3711 }
3712 odata->append(ds);
3713 return 0;
3714 }
3715
7c673cae
FG
3716 return -EOPNOTSUPP;
3717}
3718
31f18b77
FG
3719void PGMapUpdater::check_osd_map(
3720 CephContext *cct,
3721 const OSDMap& osdmap,
3722 const PGMap& pgmap,
3723 PGMap::Incremental *pending_inc)
3724{
3725 for (auto& p : pgmap.osd_stat) {
3726 if (!osdmap.exists(p.first)) {
3727 // remove osd_stat
3728 pending_inc->rm_stat(p.first);
3729 } else if (osdmap.is_out(p.first)) {
3730 // zero osd_stat
11fdf7f2
TL
3731 if (p.second.statfs.total != 0) {
3732 pending_inc->stat_osd_out(p.first);
31f18b77
FG
3733 }
3734 } else if (!osdmap.is_up(p.first)) {
3735 // zero the op_queue_age_hist
3736 if (!p.second.op_queue_age_hist.empty()) {
11fdf7f2 3737 pending_inc->stat_osd_down_up(p.first, pgmap);
31f18b77
FG
3738 }
3739 }
3740 }
3741
3742 // deleted pgs (pools)?
3743 for (auto& p : pgmap.pg_pool_sum) {
3744 if (!osdmap.have_pg_pool(p.first)) {
3745 ldout(cct, 10) << __func__ << " pool " << p.first << " gone, removing pgs"
3746 << dendl;
3747 for (auto& q : pgmap.pg_stat) {
11fdf7f2 3748 if (q.first.pool() == p.first) {
31f18b77
FG
3749 pending_inc->pg_remove.insert(q.first);
3750 }
3751 }
3752 auto q = pending_inc->pg_stat_updates.begin();
3753 while (q != pending_inc->pg_stat_updates.end()) {
11fdf7f2 3754 if (q->first.pool() == p.first) {
31f18b77
FG
3755 q = pending_inc->pg_stat_updates.erase(q);
3756 } else {
3757 ++q;
3758 }
3759 }
3760 }
3761 }
3762
11fdf7f2
TL
3763 // new (split or new pool) or merged pgs?
3764 map<int64_t,unsigned> new_pg_num;
31f18b77
FG
3765 for (auto& p : osdmap.get_pools()) {
3766 int64_t poolid = p.first;
3767 const pg_pool_t& pi = p.second;
3768 auto q = pgmap.num_pg_by_pool.find(poolid);
3769 unsigned my_pg_num = 0;
3770 if (q != pgmap.num_pg_by_pool.end())
3771 my_pg_num = q->second;
3772 unsigned pg_num = pi.get_pg_num();
11fdf7f2
TL
3773 new_pg_num[poolid] = pg_num;
3774 if (my_pg_num < pg_num) {
224ce89b 3775 ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
11fdf7f2 3776 << " > my pg_num " << my_pg_num << dendl;
31f18b77
FG
3777 for (unsigned ps = my_pg_num; ps < pg_num; ++ps) {
3778 pg_t pgid(ps, poolid);
3779 if (pending_inc->pg_stat_updates.count(pgid) == 0) {
224ce89b 3780 ldout(cct,20) << __func__ << " adding " << pgid << dendl;
31f18b77
FG
3781 pg_stat_t &stats = pending_inc->pg_stat_updates[pgid];
3782 stats.last_fresh = osdmap.get_modified();
3783 stats.last_active = osdmap.get_modified();
3784 stats.last_change = osdmap.get_modified();
3785 stats.last_peered = osdmap.get_modified();
3786 stats.last_clean = osdmap.get_modified();
3787 stats.last_unstale = osdmap.get_modified();
3788 stats.last_undegraded = osdmap.get_modified();
3789 stats.last_fullsized = osdmap.get_modified();
3790 stats.last_scrub_stamp = osdmap.get_modified();
3791 stats.last_deep_scrub_stamp = osdmap.get_modified();
3792 stats.last_clean_scrub_stamp = osdmap.get_modified();
3793 }
3794 }
11fdf7f2
TL
3795 } else if (my_pg_num > pg_num) {
3796 ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
3797 << " < my pg_num " << my_pg_num << dendl;
3798 for (unsigned i = pg_num; i < my_pg_num; ++i) {
3799 pg_t pgid(i, poolid);
3800 ldout(cct,20) << __func__ << " removing merged " << pgid << dendl;
3801 if (pgmap.pg_stat.count(pgid)) {
3802 pending_inc->pg_remove.insert(pgid);
3803 }
3804 pending_inc->pg_stat_updates.erase(pgid);
7c673cae 3805 }
7c673cae
FG
3806 }
3807 }
11fdf7f2
TL
3808 auto i = pending_inc->pg_stat_updates.begin();
3809 while (i != pending_inc->pg_stat_updates.end()) {
3810 auto j = new_pg_num.find(i->first.pool());
3811 if (j == new_pg_num.end() ||
3812 i->first.ps() >= j->second) {
3813 ldout(cct,20) << __func__ << " removing pending update to old "
3814 << i->first << dendl;
3815 i = pending_inc->pg_stat_updates.erase(i);
3816 } else {
3817 ++i;
7c673cae
FG
3818 }
3819 }
7c673cae
FG
3820}
3821
3822static void _try_mark_pg_stale(
3823 const OSDMap& osdmap,
3824 pg_t pgid,
3825 const pg_stat_t& cur,
3826 PGMap::Incremental *pending_inc)
3827{
3828 if ((cur.state & PG_STATE_STALE) == 0 &&
3829 cur.acting_primary != -1 &&
3830 osdmap.is_down(cur.acting_primary)) {
3831 pg_stat_t *newstat;
3832 auto q = pending_inc->pg_stat_updates.find(pgid);
3833 if (q != pending_inc->pg_stat_updates.end()) {
3834 if ((q->second.acting_primary == cur.acting_primary) ||
3835 ((q->second.state & PG_STATE_STALE) == 0 &&
3836 q->second.acting_primary != -1 &&
3837 osdmap.is_down(q->second.acting_primary))) {
3838 newstat = &q->second;
3839 } else {
3840 // pending update is no longer down or already stale
3841 return;
3842 }
3843 } else {
3844 newstat = &pending_inc->pg_stat_updates[pgid];
3845 *newstat = cur;
3846 }
3847 dout(10) << __func__ << " marking pg " << pgid
3848 << " stale (acting_primary " << newstat->acting_primary
3849 << ")" << dendl;
3850 newstat->state |= PG_STATE_STALE;
3851 newstat->last_unstale = ceph_clock_now();
3852 }
3853}
3854
3855void PGMapUpdater::check_down_pgs(
3856 const OSDMap &osdmap,
3857 const PGMap &pg_map,
3858 bool check_all,
3859 const set<int>& need_check_down_pg_osds,
3860 PGMap::Incremental *pending_inc)
3861{
3862 // if a large number of osds changed state, just iterate over the whole
3863 // pg map.
3864 if (need_check_down_pg_osds.size() > (unsigned)osdmap.get_num_osds() *
11fdf7f2 3865 g_conf().get_val<double>("mon_pg_check_down_all_threshold")) {
7c673cae
FG
3866 check_all = true;
3867 }
3868
3869 if (check_all) {
3870 for (const auto& p : pg_map.pg_stat) {
3871 _try_mark_pg_stale(osdmap, p.first, p.second, pending_inc);
3872 }
3873 } else {
3874 for (auto osd : need_check_down_pg_osds) {
3875 if (osdmap.is_down(osd)) {
3876 auto p = pg_map.pg_by_osd.find(osd);
3877 if (p == pg_map.pg_by_osd.end()) {
3878 continue;
3879 }
3880 for (auto pgid : p->second) {
3881 const pg_stat_t &stat = pg_map.pg_stat.at(pgid);
11fdf7f2 3882 ceph_assert(stat.acting_primary == osd);
7c673cae
FG
3883 _try_mark_pg_stale(osdmap, pgid, stat, pending_inc);
3884 }
3885 }
3886 }
3887 }
3888}
3889
3890int reweight::by_utilization(
3891 const OSDMap &osdmap,
3892 const PGMap &pgm,
3893 int oload,
3894 double max_changef,
3895 int max_osds,
3896 bool by_pg, const set<int64_t> *pools,
3897 bool no_increasing,
3898 mempool::osdmap::map<int32_t, uint32_t>* new_weights,
3899 std::stringstream *ss,
3900 std::string *out_str,
9f95a23c 3901 ceph::Formatter *f)
7c673cae
FG
3902{
3903 if (oload <= 100) {
3904 *ss << "You must give a percentage higher than 100. "
3905 "The reweighting threshold will be calculated as <average-utilization> "
3906 "times <input-percentage>. For example, an argument of 200 would "
3907 "reweight OSDs which are twice as utilized as the average OSD.\n";
3908 return -EINVAL;
3909 }
3910
3911 vector<int> pgs_by_osd(osdmap.get_max_osd());
3912
3913 // Avoid putting a small number (or 0) in the denominator when calculating
3914 // average_util
3915 double average_util;
3916 if (by_pg) {
3917 // by pg mapping
3918 double weight_sum = 0.0; // sum up the crush weights
3919 unsigned num_pg_copies = 0;
3920 int num_osds = 0;
3921 for (const auto& pg : pgm.pg_stat) {
3922 if (pools && pools->count(pg.first.pool()) == 0)
3923 continue;
3924 for (const auto acting : pg.second.acting) {
b5b8bbf5
FG
3925 if (!osdmap.exists(acting)) {
3926 continue;
3927 }
7c673cae
FG
3928 if (acting >= (int)pgs_by_osd.size())
3929 pgs_by_osd.resize(acting);
3930 if (pgs_by_osd[acting] == 0) {
3931 if (osdmap.crush->get_item_weightf(acting) <= 0) {
3932 //skip if we currently can not identify item
3933 continue;
3934 }
3935 weight_sum += osdmap.crush->get_item_weightf(acting);
3936 ++num_osds;
3937 }
3938 ++pgs_by_osd[acting];
3939 ++num_pg_copies;
3940 }
3941 }
3942
11fdf7f2 3943 if (!num_osds || (num_pg_copies / num_osds < g_conf()->mon_reweight_min_pgs_per_osd)) {
7c673cae
FG
3944 *ss << "Refusing to reweight: we only have " << num_pg_copies
3945 << " PGs across " << num_osds << " osds!\n";
3946 return -EDOM;
3947 }
3948
3949 average_util = (double)num_pg_copies / weight_sum;
3950 } else {
3951 // by osd utilization
11fdf7f2
TL
3952 int num_osd = std::max<size_t>(1, pgm.osd_stat.size());
3953 if ((uint64_t)pgm.osd_sum.statfs.total / num_osd
3954 < g_conf()->mon_reweight_min_bytes_per_osd) {
3955 *ss << "Refusing to reweight: we only have " << pgm.osd_sum.statfs.kb()
7c673cae
FG
3956 << " kb across all osds!\n";
3957 return -EDOM;
3958 }
11fdf7f2
TL
3959 if ((uint64_t)pgm.osd_sum.statfs.get_used_raw() / num_osd
3960 < g_conf()->mon_reweight_min_bytes_per_osd) {
3961 *ss << "Refusing to reweight: we only have "
3962 << pgm.osd_sum.statfs.kb_used_raw()
7c673cae
FG
3963 << " kb used across all osds!\n";
3964 return -EDOM;
3965 }
3966
11fdf7f2
TL
3967 average_util = (double)pgm.osd_sum.statfs.get_used_raw() /
3968 (double)pgm.osd_sum.statfs.total;
7c673cae
FG
3969 }
3970
3971 // adjust down only if we are above the threshold
3972 const double overload_util = average_util * (double)oload / 100.0;
3973
3974 // but aggressively adjust weights up whenever possible.
3975 const double underload_util = average_util;
3976
20effc67 3977 const unsigned max_change = (unsigned)(max_changef * (double)CEPH_OSD_IN);
7c673cae
FG
3978
3979 ostringstream oss;
3980 if (f) {
3981 f->open_object_section("reweight_by_utilization");
3982 f->dump_int("overload_min", oload);
3983 f->dump_float("max_change", max_changef);
3984 f->dump_int("max_change_osds", max_osds);
3985 f->dump_float("average_utilization", average_util);
3986 f->dump_float("overload_utilization", overload_util);
3987 } else {
3988 oss << "oload " << oload << "\n";
3989 oss << "max_change " << max_changef << "\n";
3990 oss << "max_change_osds " << max_osds << "\n";
3991 oss.precision(4);
3992 oss << "average_utilization " << std::fixed << average_util << "\n";
3993 oss << "overload_utilization " << overload_util << "\n";
3994 }
3995 int num_changed = 0;
3996
3997 // precompute util for each OSD
3998 std::vector<std::pair<int, float> > util_by_osd;
3999 for (const auto& p : pgm.osd_stat) {
4000 std::pair<int, float> osd_util;
4001 osd_util.first = p.first;
4002 if (by_pg) {
4003 if (p.first >= (int)pgs_by_osd.size() ||
4004 pgs_by_osd[p.first] == 0) {
4005 // skip if this OSD does not contain any pg
4006 // belonging to the specified pool(s).
4007 continue;
4008 }
4009
4010 if (osdmap.crush->get_item_weightf(p.first) <= 0) {
4011 // skip if we are unable to locate item.
4012 continue;
4013 }
4014
11fdf7f2
TL
4015 osd_util.second =
4016 pgs_by_osd[p.first] / osdmap.crush->get_item_weightf(p.first);
7c673cae 4017 } else {
11fdf7f2
TL
4018 osd_util.second =
4019 (double)p.second.statfs.get_used_raw() / (double)p.second.statfs.total;
7c673cae
FG
4020 }
4021 util_by_osd.push_back(osd_util);
4022 }
4023
4024 // sort by absolute deviation from the mean utilization,
4025 // in descending order.
4026 std::sort(util_by_osd.begin(), util_by_osd.end(),
4027 [average_util](std::pair<int, float> l, std::pair<int, float> r) {
4028 return abs(l.second - average_util) > abs(r.second - average_util);
4029 }
4030 );
4031
4032 if (f)
4033 f->open_array_section("reweights");
4034
4035 for (const auto& p : util_by_osd) {
4036 unsigned weight = osdmap.get_weight(p.first);
4037 if (weight == 0) {
4038 // skip if OSD is currently out
4039 continue;
4040 }
4041 float util = p.second;
4042
4043 if (util >= overload_util) {
4044 // Assign a lower weight to overloaded OSDs. The current weight
4045 // is a factor to take into account the original weights,
4046 // to represent e.g. differing storage capacities
4047 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
4048 if (weight > max_change)
11fdf7f2 4049 new_weight = std::max(new_weight, weight - max_change);
7c673cae
FG
4050 new_weights->insert({p.first, new_weight});
4051 if (f) {
4052 f->open_object_section("osd");
4053 f->dump_int("osd", p.first);
20effc67
TL
4054 f->dump_float("weight", (float)weight / (float)CEPH_OSD_IN);
4055 f->dump_float("new_weight", (float)new_weight / (float)CEPH_OSD_IN);
7c673cae
FG
4056 f->close_section();
4057 } else {
4058 oss << "osd." << p.first << " weight "
20effc67
TL
4059 << (float)weight / (float)CEPH_OSD_IN << " -> "
4060 << (float)new_weight / (float)CEPH_OSD_IN << "\n";
7c673cae
FG
4061 }
4062 if (++num_changed >= max_osds)
4063 break;
4064 }
4065 if (!no_increasing && util <= underload_util) {
4066 // assign a higher weight.. if we can.
4067 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
11fdf7f2 4068 new_weight = std::min(new_weight, weight + max_change);
20effc67
TL
4069 if (new_weight > CEPH_OSD_IN)
4070 new_weight = CEPH_OSD_IN;
7c673cae
FG
4071 if (new_weight > weight) {
4072 new_weights->insert({p.first, new_weight});
4073 oss << "osd." << p.first << " weight "
20effc67
TL
4074 << (float)weight / (float)CEPH_OSD_IN << " -> "
4075 << (float)new_weight / (float)CEPH_OSD_IN << "\n";
7c673cae
FG
4076 if (++num_changed >= max_osds)
4077 break;
4078 }
4079 }
4080 }
4081 if (f) {
4082 f->close_section();
4083 }
4084
4085 OSDMap newmap;
4086 newmap.deepish_copy_from(osdmap);
4087 OSDMap::Incremental newinc;
4088 newinc.fsid = newmap.get_fsid();
4089 newinc.epoch = newmap.get_epoch() + 1;
4090 newinc.new_weight = *new_weights;
4091 newmap.apply_incremental(newinc);
4092
4093 osdmap.summarize_mapping_stats(&newmap, pools, out_str, f);
4094
4095 if (f) {
4096 f->close_section();
4097 } else {
4098 *out_str += "\n";
4099 *out_str += oss.str();
4100 }
4101 return num_changed;
4102}