]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/PGMap.cc
Import ceph 15.2.8
[ceph.git] / ceph / src / mon / PGMap.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
224ce89b
WB
4#include <boost/algorithm/string.hpp>
5
7c673cae
FG
6#include "PGMap.h"
7
8#define dout_subsys ceph_subsys_mon
9#include "common/debug.h"
11fdf7f2 10#include "common/Clock.h"
7c673cae 11#include "common/Formatter.h"
11fdf7f2 12#include "global/global_context.h"
7c673cae
FG
13#include "include/ceph_features.h"
14#include "include/stringify.h"
15
16#include "osd/osd_types.h"
17#include "osd/OSDMap.h"
eafe8130 18#include <boost/range/adaptor/reversed.hpp>
7c673cae
FG
19
20#define dout_context g_ceph_context
21
9f95a23c
TL
22using std::list;
23using std::make_pair;
24using std::map;
25using std::pair;
26using std::ostream;
27using std::ostringstream;
28using std::set;
29using std::string;
30using std::stringstream;
31using std::vector;
32
33using ceph::bufferlist;
34using TOPNSPC::common::cmd_getval;
35
31f18b77
FG
36MEMPOOL_DEFINE_OBJECT_FACTORY(PGMapDigest, pgmap_digest, pgmap);
37MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap, pgmap, pgmap);
38MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap::Incremental, pgmap_inc, pgmap);
39
40
41// ---------------------
42// PGMapDigest
43
44void PGMapDigest::encode(bufferlist& bl, uint64_t features) const
45{
46 // NOTE: see PGMap::encode_digest
11fdf7f2
TL
47 uint8_t v = 4;
48 if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
49 v = 1;
50 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
51 v = 3;
52 }
53 ENCODE_START(v, 1, bl);
54 encode(num_pg, bl);
55 encode(num_pg_active, bl);
56 encode(num_pg_unknown, bl);
57 encode(num_osd, bl);
58 encode(pg_pool_sum, bl, features);
59 encode(pg_sum, bl, features);
60 encode(osd_sum, bl, features);
61 if (v >= 2) {
62 encode(num_pg_by_state, bl);
63 } else {
64 uint32_t n = num_pg_by_state.size();
65 encode(n, bl);
66 for (auto p : num_pg_by_state) {
9f95a23c 67 encode((int32_t)p.first, bl);
11fdf7f2
TL
68 encode(p.second, bl);
69 }
70 }
71 encode(num_pg_by_osd, bl);
72 encode(num_pg_by_pool, bl);
73 encode(osd_last_seq, bl);
74 encode(per_pool_sum_delta, bl, features);
75 encode(per_pool_sum_deltas_stamps, bl);
76 encode(pg_sum_delta, bl, features);
77 encode(stamp_delta, bl);
78 encode(avail_space_by_rule, bl);
79 if (struct_v >= 3) {
80 encode(purged_snaps, bl);
81 }
82 if (struct_v >= 4) {
83 encode(osd_sum_by_class, bl, features);
84 }
7c673cae
FG
85 ENCODE_FINISH(bl);
86}
87
11fdf7f2 88void PGMapDigest::decode(bufferlist::const_iterator& p)
31f18b77 89{
11fdf7f2
TL
90 DECODE_START(4, p);
91 decode(num_pg, p);
92 decode(num_pg_active, p);
93 decode(num_pg_unknown, p);
94 decode(num_osd, p);
95 decode(pg_pool_sum, p);
96 decode(pg_sum, p);
97 decode(osd_sum, p);
98 if (struct_v >= 2) {
99 decode(num_pg_by_state, p);
100 } else {
101 map<int32_t, int32_t> nps;
102 decode(nps, p);
103 num_pg_by_state.clear();
104 for (auto i : nps) {
105 num_pg_by_state[i.first] = i.second;
106 }
107 }
108 decode(num_pg_by_osd, p);
109 decode(num_pg_by_pool, p);
110 decode(osd_last_seq, p);
111 decode(per_pool_sum_delta, p);
112 decode(per_pool_sum_deltas_stamps, p);
113 decode(pg_sum_delta, p);
114 decode(stamp_delta, p);
115 decode(avail_space_by_rule, p);
116 if (struct_v >= 3) {
117 decode(purged_snaps, p);
118 }
119 if (struct_v >= 4) {
120 decode(osd_sum_by_class, p);
121 }
31f18b77
FG
122 DECODE_FINISH(p);
123}
124
9f95a23c 125void PGMapDigest::dump(ceph::Formatter *f) const
31f18b77
FG
126{
127 f->dump_unsigned("num_pg", num_pg);
128 f->dump_unsigned("num_pg_active", num_pg_active);
129 f->dump_unsigned("num_pg_unknown", num_pg_unknown);
130 f->dump_unsigned("num_osd", num_osd);
131 f->dump_object("pool_sum", pg_sum);
132 f->dump_object("osd_sum", osd_sum);
11fdf7f2
TL
133
134 f->open_object_section("osd_sum_by_class");
135 for (auto& i : osd_sum_by_class) {
136 f->dump_object(i.first.c_str(), i.second);
137 }
138 f->close_section();
139
31f18b77
FG
140 f->open_array_section("pool_stats");
141 for (auto& p : pg_pool_sum) {
142 f->open_object_section("pool_stat");
143 f->dump_int("poolid", p.first);
144 auto q = num_pg_by_pool.find(p.first);
145 if (q != num_pg_by_pool.end())
146 f->dump_unsigned("num_pg", q->second);
147 p.second.dump(f);
7c673cae
FG
148 f->close_section();
149 }
150 f->close_section();
31f18b77
FG
151 f->open_array_section("osd_stats");
152 int i = 0;
153 // TODO: this isn't really correct since we can dump non-existent OSDs
154 // I dunno what osd_last_seq is set to in that case...
155 for (auto& p : osd_last_seq) {
7c673cae 156 f->open_object_section("osd_stat");
31f18b77
FG
157 f->dump_int("osd", i);
158 f->dump_unsigned("seq", p);
7c673cae 159 f->close_section();
31f18b77 160 ++i;
7c673cae
FG
161 }
162 f->close_section();
31f18b77
FG
163 f->open_array_section("num_pg_by_state");
164 for (auto& p : num_pg_by_state) {
165 f->open_object_section("count");
166 f->dump_string("state", pg_state_string(p.first));
167 f->dump_unsigned("num", p.second);
168 f->close_section();
169 }
7c673cae 170 f->close_section();
31f18b77
FG
171 f->open_array_section("num_pg_by_osd");
172 for (auto& p : num_pg_by_osd) {
173 f->open_object_section("count");
174 f->dump_unsigned("osd", p.first);
175 f->dump_unsigned("num_primary_pg", p.second.primary);
176 f->dump_unsigned("num_acting_pg", p.second.acting);
81eedcae 177 f->dump_unsigned("num_up_not_acting_pg", p.second.up_not_acting);
31f18b77
FG
178 f->close_section();
179 }
7c673cae 180 f->close_section();
11fdf7f2
TL
181 f->open_array_section("purged_snaps");
182 for (auto& j : purged_snaps) {
183 f->open_object_section("pool");
184 f->dump_int("pool", j.first);
185 f->open_object_section("purged_snaps");
186 for (auto i = j.second.begin(); i != j.second.end(); ++i) {
187 f->open_object_section("interval");
188 f->dump_stream("start") << i.get_start();
189 f->dump_stream("length") << i.get_len();
190 f->close_section();
191 }
192 f->close_section();
193 f->close_section();
194 }
195 f->close_section();
7c673cae
FG
196}
197
31f18b77 198void PGMapDigest::generate_test_instances(list<PGMapDigest*>& ls)
7c673cae 199{
31f18b77 200 ls.push_back(new PGMapDigest);
7c673cae
FG
201}
202
31f18b77
FG
// Format a percentage for tabular output: values below 0.01 collapse to
// a bare "0"; everything else is printed in fixed notation with two
// decimal places.
inline std::string percentify(const float& a) {
  if (a < 0.01) {
    return "0";
  }
  std::ostringstream formatted;
  formatted << std::fixed << std::setprecision(2) << a;
  return formatted.str();
}
7c673cae 211
9f95a23c 212void PGMapDigest::print_summary(ceph::Formatter *f, ostream *out) const
7c673cae 213{
31f18b77
FG
214 if (f)
215 f->open_array_section("pgs_by_state");
7c673cae 216
31f18b77 217 // list is descending numeric order (by count)
9f95a23c 218 std::multimap<int,uint64_t> state_by_count; // count -> state
31f18b77
FG
219 for (auto p = num_pg_by_state.begin();
220 p != num_pg_by_state.end();
221 ++p) {
222 state_by_count.insert(make_pair(p->second, p->first));
7c673cae 223 }
31f18b77
FG
224 if (f) {
225 for (auto p = state_by_count.rbegin();
226 p != state_by_count.rend();
227 ++p)
228 {
229 f->open_object_section("pgs_by_state_element");
230 f->dump_string("state_name", pg_state_string(p->second));
231 f->dump_unsigned("count", p->first);
232 f->close_section();
233 }
7c673cae 234 }
31f18b77
FG
235 if (f)
236 f->close_section();
7c673cae 237
31f18b77
FG
238 if (f) {
239 f->dump_unsigned("num_pgs", num_pg);
240 f->dump_unsigned("num_pools", pg_pool_sum.size());
241 f->dump_unsigned("num_objects", pg_sum.stats.sum.num_objects);
242 f->dump_unsigned("data_bytes", pg_sum.stats.sum.num_bytes);
11fdf7f2
TL
243 f->dump_unsigned("bytes_used", osd_sum.statfs.get_used_raw());
244 f->dump_unsigned("bytes_avail", osd_sum.statfs.available);
245 f->dump_unsigned("bytes_total", osd_sum.statfs.total);
31f18b77
FG
246 } else {
247 *out << " pools: " << pg_pool_sum.size() << " pools, "
248 << num_pg << " pgs\n";
1adf2230
AA
249 *out << " objects: " << si_u_t(pg_sum.stats.sum.num_objects) << " objects, "
250 << byte_u_t(pg_sum.stats.sum.num_bytes) << "\n";
31f18b77 251 *out << " usage: "
11fdf7f2
TL
252 << byte_u_t(osd_sum.statfs.get_used_raw()) << " used, "
253 << byte_u_t(osd_sum.statfs.available) << " / "
254 << byte_u_t(osd_sum.statfs.total) << " avail\n";
31f18b77
FG
255 *out << " pgs: ";
256 }
7c673cae 257
31f18b77 258 bool pad = false;
7c673cae 259
31f18b77
FG
260 if (num_pg_unknown > 0) {
261 float p = (float)num_pg_unknown / (float)num_pg;
262 if (f) {
263 f->dump_float("unknown_pgs_ratio", p);
7c673cae 264 } else {
31f18b77
FG
265 char b[20];
266 snprintf(b, sizeof(b), "%.3lf", p * 100.0);
267 *out << b << "% pgs unknown\n";
268 pad = true;
7c673cae 269 }
7c673cae 270 }
7c673cae 271
31f18b77
FG
272 int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
273 if (num_pg_inactive > 0) {
274 float p = (float)num_pg_inactive / (float)num_pg;
275 if (f) {
276 f->dump_float("inactive_pgs_ratio", p);
7c673cae 277 } else {
31f18b77
FG
278 if (pad) {
279 *out << " ";
280 }
281 char b[20];
282 snprintf(b, sizeof(b), "%.3f", p * 100.0);
283 *out << b << "% pgs not active\n";
284 pad = true;
7c673cae 285 }
7c673cae 286 }
31f18b77
FG
287
288 list<string> sl;
289 overall_recovery_summary(f, &sl);
290 if (!f && !sl.empty()) {
291 for (auto p = sl.begin(); p != sl.end(); ++p) {
292 if (pad) {
293 *out << " ";
294 }
295 *out << *p << "\n";
296 pad = true;
7c673cae 297 }
7c673cae 298 }
31f18b77 299 sl.clear();
7c673cae 300
31f18b77
FG
301 if (!f) {
302 unsigned max_width = 1;
9f95a23c 303 for (auto p = state_by_count.rbegin(); p != state_by_count.rend(); ++p)
31f18b77
FG
304 {
305 std::stringstream ss;
306 ss << p->first;
11fdf7f2 307 max_width = std::max<size_t>(ss.str().size(), max_width);
7c673cae
FG
308 }
309
9f95a23c 310 for (auto p = state_by_count.rbegin(); p != state_by_count.rend(); ++p)
31f18b77
FG
311 {
312 if (pad) {
313 *out << " ";
314 }
315 pad = true;
316 out->setf(std::ios::left);
317 *out << std::setw(max_width) << p->first
318 << " " << pg_state_string(p->second) << "\n";
319 out->unsetf(std::ios::left);
320 }
7c673cae
FG
321 }
322
31f18b77
FG
323 ostringstream ss_rec_io;
324 overall_recovery_rate_summary(f, &ss_rec_io);
325 ostringstream ss_client_io;
326 overall_client_io_rate_summary(f, &ss_client_io);
327 ostringstream ss_cache_io;
328 overall_cache_io_rate_summary(f, &ss_cache_io);
7c673cae 329
31f18b77
FG
330 if (!f && (ss_client_io.str().length() || ss_rec_io.str().length()
331 || ss_cache_io.str().length())) {
332 *out << "\n \n";
333 *out << " io:\n";
7c673cae
FG
334 }
335
31f18b77
FG
336 if (!f && ss_client_io.str().length())
337 *out << " client: " << ss_client_io.str() << "\n";
338 if (!f && ss_rec_io.str().length())
339 *out << " recovery: " << ss_rec_io.str() << "\n";
340 if (!f && ss_cache_io.str().length())
341 *out << " cache: " << ss_cache_io.str() << "\n";
7c673cae
FG
342}
343
9f95a23c 344void PGMapDigest::print_oneline_summary(ceph::Formatter *f, ostream *out) const
7c673cae 345{
31f18b77
FG
346 std::stringstream ss;
347
348 if (f)
349 f->open_array_section("num_pg_by_state");
350 for (auto p = num_pg_by_state.begin();
351 p != num_pg_by_state.end();
352 ++p) {
353 if (f) {
354 f->open_object_section("state");
355 f->dump_string("name", pg_state_string(p->first));
356 f->dump_unsigned("num", p->second);
357 f->close_section();
358 }
359 if (p != num_pg_by_state.begin())
360 ss << ", ";
361 ss << p->second << " " << pg_state_string(p->first);
7c673cae 362 }
31f18b77
FG
363 if (f)
364 f->close_section();
7c673cae 365
31f18b77
FG
366 string states = ss.str();
367 if (out)
368 *out << num_pg << " pgs: "
369 << states << "; "
1adf2230 370 << byte_u_t(pg_sum.stats.sum.num_bytes) << " data, "
11fdf7f2
TL
371 << byte_u_t(osd_sum.statfs.get_used()) << " used, "
372 << byte_u_t(osd_sum.statfs.available) << " / "
373 << byte_u_t(osd_sum.statfs.total) << " avail";
31f18b77
FG
374 if (f) {
375 f->dump_unsigned("num_pgs", num_pg);
376 f->dump_unsigned("num_bytes", pg_sum.stats.sum.num_bytes);
11fdf7f2
TL
377 f->dump_int("total_bytes", osd_sum.statfs.total);
378 f->dump_int("total_avail_bytes", osd_sum.statfs.available);
379 f->dump_int("total_used_bytes", osd_sum.statfs.get_used());
380 f->dump_int("total_used_raw_bytes", osd_sum.statfs.get_used_raw());
31f18b77 381 }
7c673cae 382
31f18b77
FG
383 // make non-negative; we can get negative values if osds send
384 // uncommitted stats and then "go backward" or if they are just
385 // buggy/wrong.
386 pool_stat_t pos_delta = pg_sum_delta;
387 pos_delta.floor(0);
388 if (pos_delta.stats.sum.num_rd ||
389 pos_delta.stats.sum.num_wr) {
390 if (out)
391 *out << "; ";
392 if (pos_delta.stats.sum.num_rd) {
393 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)stamp_delta;
394 if (out)
1adf2230 395 *out << byte_u_t(rd) << "/s rd, ";
31f18b77
FG
396 if (f)
397 f->dump_unsigned("read_bytes_sec", rd);
398 }
399 if (pos_delta.stats.sum.num_wr) {
400 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)stamp_delta;
401 if (out)
1adf2230 402 *out << byte_u_t(wr) << "/s wr, ";
31f18b77
FG
403 if (f)
404 f->dump_unsigned("write_bytes_sec", wr);
405 }
406 int64_t iops = (pos_delta.stats.sum.num_rd + pos_delta.stats.sum.num_wr) / (double)stamp_delta;
407 if (out)
11fdf7f2 408 *out << si_u_t(iops) << " op/s";
31f18b77
FG
409 if (f)
410 f->dump_unsigned("io_sec", iops);
7c673cae 411 }
31f18b77
FG
412
413 list<string> sl;
414 overall_recovery_summary(f, &sl);
415 if (out)
416 for (auto p = sl.begin(); p != sl.end(); ++p)
417 *out << "; " << *p;
418 std::stringstream ssr;
419 overall_recovery_rate_summary(f, &ssr);
420 if (out && ssr.str().length())
421 *out << "; " << ssr.str() << " recovering";
7c673cae
FG
422}
423
11fdf7f2
TL
424void PGMapDigest::get_recovery_stats(
425 double *misplaced_ratio,
426 double *degraded_ratio,
427 double *inactive_pgs_ratio,
428 double *unknown_pgs_ratio) const
429{
430 if (pg_sum.stats.sum.num_objects_degraded &&
431 pg_sum.stats.sum.num_object_copies > 0) {
432 *degraded_ratio = (double)pg_sum.stats.sum.num_objects_degraded /
433 (double)pg_sum.stats.sum.num_object_copies;
434 } else {
435 *degraded_ratio = 0;
436 }
437 if (pg_sum.stats.sum.num_objects_misplaced &&
438 pg_sum.stats.sum.num_object_copies > 0) {
439 *misplaced_ratio = (double)pg_sum.stats.sum.num_objects_misplaced /
440 (double)pg_sum.stats.sum.num_object_copies;
441 } else {
442 *misplaced_ratio = 0;
443 }
444 if (num_pg > 0) {
445 int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
446 *inactive_pgs_ratio = (double)num_pg_inactive / (double)num_pg;
447 *unknown_pgs_ratio = (double)num_pg_unknown / (double)num_pg;
448 } else {
449 *inactive_pgs_ratio = 0;
450 *unknown_pgs_ratio = 0;
451 }
452}
453
9f95a23c 454void PGMapDigest::recovery_summary(ceph::Formatter *f, list<string> *psl,
b32b8144 455 const pool_stat_t& pool_sum) const
7c673cae 456{
b32b8144
FG
457 if (pool_sum.stats.sum.num_objects_degraded && pool_sum.stats.sum.num_object_copies > 0) {
458 double pc = (double)pool_sum.stats.sum.num_objects_degraded /
459 (double)pool_sum.stats.sum.num_object_copies * (double)100.0;
31f18b77
FG
460 char b[20];
461 snprintf(b, sizeof(b), "%.3lf", pc);
462 if (f) {
b32b8144
FG
463 f->dump_unsigned("degraded_objects", pool_sum.stats.sum.num_objects_degraded);
464 f->dump_unsigned("degraded_total", pool_sum.stats.sum.num_object_copies);
31f18b77
FG
465 f->dump_float("degraded_ratio", pc / 100.0);
466 } else {
467 ostringstream ss;
b32b8144
FG
468 ss << pool_sum.stats.sum.num_objects_degraded
469 << "/" << pool_sum.stats.sum.num_object_copies << " objects degraded (" << b << "%)";
31f18b77
FG
470 psl->push_back(ss.str());
471 }
472 }
b32b8144
FG
473 if (pool_sum.stats.sum.num_objects_misplaced && pool_sum.stats.sum.num_object_copies > 0) {
474 double pc = (double)pool_sum.stats.sum.num_objects_misplaced /
475 (double)pool_sum.stats.sum.num_object_copies * (double)100.0;
31f18b77
FG
476 char b[20];
477 snprintf(b, sizeof(b), "%.3lf", pc);
478 if (f) {
b32b8144
FG
479 f->dump_unsigned("misplaced_objects", pool_sum.stats.sum.num_objects_misplaced);
480 f->dump_unsigned("misplaced_total", pool_sum.stats.sum.num_object_copies);
31f18b77
FG
481 f->dump_float("misplaced_ratio", pc / 100.0);
482 } else {
483 ostringstream ss;
b32b8144
FG
484 ss << pool_sum.stats.sum.num_objects_misplaced
485 << "/" << pool_sum.stats.sum.num_object_copies << " objects misplaced (" << b << "%)";
31f18b77
FG
486 psl->push_back(ss.str());
487 }
488 }
b32b8144
FG
489 if (pool_sum.stats.sum.num_objects_unfound && pool_sum.stats.sum.num_objects) {
490 double pc = (double)pool_sum.stats.sum.num_objects_unfound /
491 (double)pool_sum.stats.sum.num_objects * (double)100.0;
31f18b77
FG
492 char b[20];
493 snprintf(b, sizeof(b), "%.3lf", pc);
494 if (f) {
b32b8144
FG
495 f->dump_unsigned("unfound_objects", pool_sum.stats.sum.num_objects_unfound);
496 f->dump_unsigned("unfound_total", pool_sum.stats.sum.num_objects);
31f18b77
FG
497 f->dump_float("unfound_ratio", pc / 100.0);
498 } else {
499 ostringstream ss;
b32b8144
FG
500 ss << pool_sum.stats.sum.num_objects_unfound
501 << "/" << pool_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
31f18b77
FG
502 psl->push_back(ss.str());
503 }
7c673cae 504 }
7c673cae
FG
505}
506
9f95a23c 507void PGMapDigest::recovery_rate_summary(ceph::Formatter *f, ostream *out,
31f18b77
FG
508 const pool_stat_t& delta_sum,
509 utime_t delta_stamp) const
7c673cae 510{
31f18b77
FG
511 // make non-negative; we can get negative values if osds send
512 // uncommitted stats and then "go backward" or if they are just
513 // buggy/wrong.
514 pool_stat_t pos_delta = delta_sum;
515 pos_delta.floor(0);
516 if (pos_delta.stats.sum.num_objects_recovered ||
517 pos_delta.stats.sum.num_bytes_recovered ||
518 pos_delta.stats.sum.num_keys_recovered) {
519 int64_t objps = pos_delta.stats.sum.num_objects_recovered / (double)delta_stamp;
520 int64_t bps = pos_delta.stats.sum.num_bytes_recovered / (double)delta_stamp;
521 int64_t kps = pos_delta.stats.sum.num_keys_recovered / (double)delta_stamp;
522 if (f) {
523 f->dump_int("recovering_objects_per_sec", objps);
524 f->dump_int("recovering_bytes_per_sec", bps);
525 f->dump_int("recovering_keys_per_sec", kps);
526 f->dump_int("num_objects_recovered", pos_delta.stats.sum.num_objects_recovered);
527 f->dump_int("num_bytes_recovered", pos_delta.stats.sum.num_bytes_recovered);
528 f->dump_int("num_keys_recovered", pos_delta.stats.sum.num_keys_recovered);
529 } else {
1adf2230 530 *out << byte_u_t(bps) << "/s";
31f18b77 531 if (pos_delta.stats.sum.num_keys_recovered)
11fdf7f2
TL
532 *out << ", " << si_u_t(kps) << " keys/s";
533 *out << ", " << si_u_t(objps) << " objects/s";
31f18b77 534 }
7c673cae 535 }
31f18b77 536}
7c673cae 537
9f95a23c 538void PGMapDigest::overall_recovery_rate_summary(ceph::Formatter *f, ostream *out) const
31f18b77
FG
539{
540 recovery_rate_summary(f, out, pg_sum_delta, stamp_delta);
7c673cae
FG
541}
542
9f95a23c 543void PGMapDigest::overall_recovery_summary(ceph::Formatter *f, list<string> *psl) const
7c673cae 544{
31f18b77 545 recovery_summary(f, psl, pg_sum);
7c673cae
FG
546}
547
9f95a23c 548void PGMapDigest::pool_recovery_rate_summary(ceph::Formatter *f, ostream *out,
31f18b77 549 uint64_t poolid) const
7c673cae 550{
31f18b77
FG
551 auto p = per_pool_sum_delta.find(poolid);
552 if (p == per_pool_sum_delta.end())
553 return;
7c673cae 554
31f18b77 555 auto ts = per_pool_sum_deltas_stamps.find(p->first);
11fdf7f2 556 ceph_assert(ts != per_pool_sum_deltas_stamps.end());
31f18b77
FG
557 recovery_rate_summary(f, out, p->second.first, ts->second);
558}
7c673cae 559
9f95a23c 560void PGMapDigest::pool_recovery_summary(ceph::Formatter *f, list<string> *psl,
31f18b77
FG
561 uint64_t poolid) const
562{
b32b8144
FG
563 auto p = pg_pool_sum.find(poolid);
564 if (p == pg_pool_sum.end())
31f18b77 565 return;
7c673cae 566
b32b8144 567 recovery_summary(f, psl, p->second);
7c673cae
FG
568}
569
9f95a23c 570void PGMapDigest::client_io_rate_summary(ceph::Formatter *f, ostream *out,
31f18b77
FG
571 const pool_stat_t& delta_sum,
572 utime_t delta_stamp) const
7c673cae 573{
31f18b77
FG
574 pool_stat_t pos_delta = delta_sum;
575 pos_delta.floor(0);
576 if (pos_delta.stats.sum.num_rd ||
577 pos_delta.stats.sum.num_wr) {
578 if (pos_delta.stats.sum.num_rd) {
579 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)delta_stamp;
580 if (f) {
581 f->dump_int("read_bytes_sec", rd);
582 } else {
1adf2230 583 *out << byte_u_t(rd) << "/s rd, ";
31f18b77
FG
584 }
585 }
586 if (pos_delta.stats.sum.num_wr) {
587 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)delta_stamp;
588 if (f) {
589 f->dump_int("write_bytes_sec", wr);
590 } else {
1adf2230 591 *out << byte_u_t(wr) << "/s wr, ";
31f18b77
FG
592 }
593 }
594 int64_t iops_rd = pos_delta.stats.sum.num_rd / (double)delta_stamp;
595 int64_t iops_wr = pos_delta.stats.sum.num_wr / (double)delta_stamp;
596 if (f) {
597 f->dump_int("read_op_per_sec", iops_rd);
598 f->dump_int("write_op_per_sec", iops_wr);
599 } else {
11fdf7f2 600 *out << si_u_t(iops_rd) << " op/s rd, " << si_u_t(iops_wr) << " op/s wr";
31f18b77 601 }
7c673cae
FG
602 }
603}
604
9f95a23c 605void PGMapDigest::overall_client_io_rate_summary(ceph::Formatter *f, ostream *out) const
7c673cae 606{
31f18b77
FG
607 client_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
608}
7c673cae 609
9f95a23c 610void PGMapDigest::pool_client_io_rate_summary(ceph::Formatter *f, ostream *out,
31f18b77
FG
611 uint64_t poolid) const
612{
613 auto p = per_pool_sum_delta.find(poolid);
614 if (p == per_pool_sum_delta.end())
7c673cae
FG
615 return;
616
31f18b77 617 auto ts = per_pool_sum_deltas_stamps.find(p->first);
11fdf7f2 618 ceph_assert(ts != per_pool_sum_deltas_stamps.end());
31f18b77 619 client_io_rate_summary(f, out, p->second.first, ts->second);
7c673cae
FG
620}
621
9f95a23c 622void PGMapDigest::cache_io_rate_summary(ceph::Formatter *f, ostream *out,
31f18b77
FG
623 const pool_stat_t& delta_sum,
624 utime_t delta_stamp) const
7c673cae 625{
31f18b77
FG
626 pool_stat_t pos_delta = delta_sum;
627 pos_delta.floor(0);
628 bool have_output = false;
7c673cae 629
31f18b77
FG
630 if (pos_delta.stats.sum.num_flush) {
631 int64_t flush = (pos_delta.stats.sum.num_flush_kb << 10) / (double)delta_stamp;
632 if (f) {
633 f->dump_int("flush_bytes_sec", flush);
634 } else {
1adf2230 635 *out << byte_u_t(flush) << "/s flush";
31f18b77 636 have_output = true;
7c673cae
FG
637 }
638 }
31f18b77
FG
639 if (pos_delta.stats.sum.num_evict) {
640 int64_t evict = (pos_delta.stats.sum.num_evict_kb << 10) / (double)delta_stamp;
641 if (f) {
642 f->dump_int("evict_bytes_sec", evict);
643 } else {
644 if (have_output)
645 *out << ", ";
1adf2230 646 *out << byte_u_t(evict) << "/s evict";
31f18b77
FG
647 have_output = true;
648 }
7c673cae 649 }
31f18b77
FG
650 if (pos_delta.stats.sum.num_promote) {
651 int64_t promote = pos_delta.stats.sum.num_promote / (double)delta_stamp;
652 if (f) {
653 f->dump_int("promote_op_per_sec", promote);
654 } else {
655 if (have_output)
656 *out << ", ";
11fdf7f2 657 *out << si_u_t(promote) << " op/s promote";
31f18b77
FG
658 have_output = true;
659 }
7c673cae 660 }
31f18b77
FG
661 if (pos_delta.stats.sum.num_flush_mode_low) {
662 if (f) {
663 f->dump_int("num_flush_mode_low", pos_delta.stats.sum.num_flush_mode_low);
664 } else {
665 if (have_output)
666 *out << ", ";
11fdf7f2 667 *out << si_u_t(pos_delta.stats.sum.num_flush_mode_low) << " PGs flushing";
31f18b77
FG
668 have_output = true;
669 }
7c673cae 670 }
31f18b77
FG
671 if (pos_delta.stats.sum.num_flush_mode_high) {
672 if (f) {
673 f->dump_int("num_flush_mode_high", pos_delta.stats.sum.num_flush_mode_high);
674 } else {
675 if (have_output)
676 *out << ", ";
11fdf7f2 677 *out << si_u_t(pos_delta.stats.sum.num_flush_mode_high) << " PGs flushing (high)";
31f18b77
FG
678 have_output = true;
679 }
7c673cae 680 }
31f18b77
FG
681 if (pos_delta.stats.sum.num_evict_mode_some) {
682 if (f) {
683 f->dump_int("num_evict_mode_some", pos_delta.stats.sum.num_evict_mode_some);
684 } else {
685 if (have_output)
686 *out << ", ";
11fdf7f2 687 *out << si_u_t(pos_delta.stats.sum.num_evict_mode_some) << " PGs evicting";
31f18b77
FG
688 have_output = true;
689 }
690 }
691 if (pos_delta.stats.sum.num_evict_mode_full) {
692 if (f) {
693 f->dump_int("num_evict_mode_full", pos_delta.stats.sum.num_evict_mode_full);
694 } else {
695 if (have_output)
696 *out << ", ";
11fdf7f2 697 *out << si_u_t(pos_delta.stats.sum.num_evict_mode_full) << " PGs evicting (full)";
31f18b77 698 }
7c673cae
FG
699 }
700}
701
9f95a23c 702void PGMapDigest::overall_cache_io_rate_summary(ceph::Formatter *f, ostream *out) const
7c673cae 703{
31f18b77 704 cache_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
7c673cae
FG
705}
706
9f95a23c 707void PGMapDigest::pool_cache_io_rate_summary(ceph::Formatter *f, ostream *out,
31f18b77 708 uint64_t poolid) const
7c673cae 709{
31f18b77
FG
710 auto p = per_pool_sum_delta.find(poolid);
711 if (p == per_pool_sum_delta.end())
712 return;
7c673cae 713
31f18b77 714 auto ts = per_pool_sum_deltas_stamps.find(p->first);
11fdf7f2 715 ceph_assert(ts != per_pool_sum_deltas_stamps.end());
31f18b77 716 cache_io_rate_summary(f, out, p->second.first, ts->second);
7c673cae
FG
717}
718
d2e6a577
FG
719ceph_statfs PGMapDigest::get_statfs(OSDMap &osdmap,
720 boost::optional<int64_t> data_pool) const
721{
722 ceph_statfs statfs;
723 bool filter = false;
724 object_stat_sum_t sum;
725
726 if (data_pool) {
727 auto i = pg_pool_sum.find(*data_pool);
728 if (i != pg_pool_sum.end()) {
729 sum = i->second.stats.sum;
730 filter = true;
731 }
732 }
733
734 if (filter) {
735 statfs.kb_used = (sum.num_bytes >> 10);
736 statfs.kb_avail = get_pool_free_space(osdmap, *data_pool) >> 10;
737 statfs.num_objects = sum.num_objects;
738 statfs.kb = statfs.kb_used + statfs.kb_avail;
739 } else {
740 // these are in KB.
11fdf7f2
TL
741 statfs.kb = osd_sum.statfs.kb();
742 statfs.kb_used = osd_sum.statfs.kb_used_raw();
743 statfs.kb_avail = osd_sum.statfs.kb_avail();
d2e6a577
FG
744 statfs.num_objects = pg_sum.stats.sum.num_objects;
745 }
746
747 return statfs;
748}
749
31f18b77
FG
750void PGMapDigest::dump_pool_stats_full(
751 const OSDMap &osd_map,
752 stringstream *ss,
9f95a23c 753 ceph::Formatter *f,
31f18b77 754 bool verbose) const
7c673cae 755{
31f18b77 756 TextTable tbl;
7c673cae 757
31f18b77
FG
758 if (f) {
759 f->open_array_section("pools");
760 } else {
11fdf7f2
TL
761 tbl.define_column("POOL", TextTable::LEFT, TextTable::LEFT);
762 tbl.define_column("ID", TextTable::LEFT, TextTable::RIGHT);
f91f0fd5 763 tbl.define_column("PGS", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2 764 tbl.define_column("STORED", TextTable::LEFT, TextTable::RIGHT);
9f95a23c
TL
765 if (verbose) {
766 tbl.define_column("(DATA)", TextTable::LEFT, TextTable::RIGHT);
767 tbl.define_column("(OMAP)", TextTable::LEFT, TextTable::RIGHT);
768 }
11fdf7f2 769 tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
31f18b77 770 tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
9f95a23c
TL
771 if (verbose) {
772 tbl.define_column("(DATA)", TextTable::LEFT, TextTable::RIGHT);
773 tbl.define_column("(OMAP)", TextTable::LEFT, TextTable::RIGHT);
774 }
31f18b77
FG
775 tbl.define_column("%USED", TextTable::LEFT, TextTable::RIGHT);
776 tbl.define_column("MAX AVAIL", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2 777
31f18b77 778 if (verbose) {
11fdf7f2
TL
779 tbl.define_column("QUOTA OBJECTS", TextTable::LEFT, TextTable::LEFT);
780 tbl.define_column("QUOTA BYTES", TextTable::LEFT, TextTable::LEFT);
31f18b77 781 tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
782 tbl.define_column("USED COMPR", TextTable::LEFT, TextTable::RIGHT);
783 tbl.define_column("UNDER COMPR", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
784 }
785 }
786
787 map<int,uint64_t> avail_by_rule;
788 for (auto p = osd_map.get_pools().begin();
789 p != osd_map.get_pools().end(); ++p) {
790 int64_t pool_id = p->first;
791 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
792 continue;
11fdf7f2 793
31f18b77 794 const string& pool_name = osd_map.get_pool_name(pool_id);
f91f0fd5 795 auto pool_pg_num = osd_map.get_pg_num(pool_id);
31f18b77
FG
796 const pool_stat_t &stat = pg_pool_sum.at(pool_id);
797
798 const pg_pool_t *pool = osd_map.get_pg_pool(pool_id);
799 int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
800 pool->get_type(),
801 pool->get_size());
802 int64_t avail;
31f18b77
FG
803 if (avail_by_rule.count(ruleno) == 0) {
804 // FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked
805 avail = get_rule_avail(ruleno);
806 if (avail < 0)
807 avail = 0;
808 avail_by_rule[ruleno] = avail;
809 } else {
810 avail = avail_by_rule[ruleno];
811 }
31f18b77
FG
812 if (f) {
813 f->open_object_section("pool");
814 f->dump_string("name", pool_name);
815 f->dump_int("id", pool_id);
816 f->open_object_section("stats");
817 } else {
818 tbl << pool_name
f91f0fd5
TL
819 << pool_id
820 << pool_pg_num;
31f18b77 821 }
11fdf7f2 822 float raw_used_rate = osd_map.pool_raw_used_rate(pool_id);
81eedcae 823 bool per_pool = use_per_pool_stats();
9f95a23c 824 bool per_pool_omap = use_per_pool_omap_stats();
81eedcae 825 dump_object_stat_sum(tbl, f, stat, avail, raw_used_rate, verbose, per_pool,
9f95a23c 826 per_pool_omap, pool);
11fdf7f2 827 if (f) {
31f18b77 828 f->close_section(); // stats
31f18b77 829 f->close_section(); // pool
11fdf7f2
TL
830 } else {
831 tbl << TextTable::endrow;
832 }
31f18b77
FG
833 }
834 if (f)
835 f->close_section();
836 else {
11fdf7f2 837 ceph_assert(ss != nullptr);
9f95a23c 838 *ss << "--- POOLS ---\n";
31f18b77
FG
839 *ss << tbl;
840 }
841}
842
11fdf7f2 843void PGMapDigest::dump_cluster_stats(stringstream *ss,
9f95a23c 844 ceph::Formatter *f,
11fdf7f2 845 bool verbose) const
31f18b77
FG
846{
847 if (f) {
848 f->open_object_section("stats");
11fdf7f2
TL
849 f->dump_int("total_bytes", osd_sum.statfs.total);
850 f->dump_int("total_avail_bytes", osd_sum.statfs.available);
851 f->dump_int("total_used_bytes", osd_sum.statfs.get_used());
852 f->dump_int("total_used_raw_bytes", osd_sum.statfs.get_used_raw());
853 f->dump_float("total_used_raw_ratio", osd_sum.statfs.get_used_raw_ratio());
81eedcae
TL
854 f->dump_unsigned("num_osds", osd_sum.num_osds);
855 f->dump_unsigned("num_per_pool_osds", osd_sum.num_per_pool_osds);
9f95a23c 856 f->dump_unsigned("num_per_pool_omap_osds", osd_sum.num_per_pool_omap_osds);
11fdf7f2
TL
857 f->close_section();
858 f->open_object_section("stats_by_class");
859 for (auto& i : osd_sum_by_class) {
860 f->open_object_section(i.first.c_str());
861 f->dump_int("total_bytes", i.second.statfs.total);
862 f->dump_int("total_avail_bytes", i.second.statfs.available);
863 f->dump_int("total_used_bytes", i.second.statfs.get_used());
864 f->dump_int("total_used_raw_bytes", i.second.statfs.get_used_raw());
865 f->dump_float("total_used_raw_ratio",
866 i.second.statfs.get_used_raw_ratio());
867 f->close_section();
31f18b77
FG
868 }
869 f->close_section();
870 } else {
11fdf7f2 871 ceph_assert(ss != nullptr);
31f18b77 872 TextTable tbl;
11fdf7f2 873 tbl.define_column("CLASS", TextTable::LEFT, TextTable::LEFT);
31f18b77
FG
874 tbl.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
875 tbl.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2 876 tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
877 tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
878 tbl.define_column("%RAW USED", TextTable::LEFT, TextTable::RIGHT);
31f18b77 879
11fdf7f2
TL
880
881 for (auto& i : osd_sum_by_class) {
882 tbl << i.first;
883 tbl << stringify(byte_u_t(i.second.statfs.total))
884 << stringify(byte_u_t(i.second.statfs.available))
885 << stringify(byte_u_t(i.second.statfs.get_used()))
886 << stringify(byte_u_t(i.second.statfs.get_used_raw()))
887 << percentify(i.second.statfs.get_used_raw_ratio()*100.0)
888 << TextTable::endrow;
889 }
890 tbl << "TOTAL";
891 tbl << stringify(byte_u_t(osd_sum.statfs.total))
892 << stringify(byte_u_t(osd_sum.statfs.available))
893 << stringify(byte_u_t(osd_sum.statfs.get_used()))
894 << stringify(byte_u_t(osd_sum.statfs.get_used_raw()))
895 << percentify(osd_sum.statfs.get_used_raw_ratio()*100.0)
896 << TextTable::endrow;
897
9f95a23c 898 *ss << "--- RAW STORAGE ---\n";
31f18b77
FG
899 *ss << tbl;
900 }
901}
902
903void PGMapDigest::dump_object_stat_sum(
9f95a23c 904 TextTable &tbl, ceph::Formatter *f,
11fdf7f2 905 const pool_stat_t &pool_stat, uint64_t avail,
9f95a23c 906 float raw_used_rate, bool verbose, bool per_pool, bool per_pool_omap,
31f18b77
FG
907 const pg_pool_t *pool)
908{
11fdf7f2
TL
909 const object_stat_sum_t &sum = pool_stat.stats.sum;
910 const store_statfs_t statfs = pool_stat.store_stats;
911
912 if (sum.num_object_copies > 0) {
913 raw_used_rate *= (float)(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies;
914 }
81eedcae 915
9f95a23c
TL
916 uint64_t used_data_bytes = pool_stat.get_allocated_data_bytes(per_pool);
917 uint64_t used_omap_bytes = pool_stat.get_allocated_omap_bytes(per_pool_omap);
918 uint64_t used_bytes = used_data_bytes + used_omap_bytes;
31f18b77
FG
919
920 float used = 0.0;
3efd9988 921 // note avail passed in is raw_avail, calc raw_used here.
31f18b77 922 if (avail) {
11fdf7f2 923 used = used_bytes;
31f18b77 924 used /= used + avail;
11fdf7f2 925 } else if (used_bytes) {
31f18b77
FG
926 used = 1.0;
927 }
11fdf7f2
TL
928 auto avail_res = raw_used_rate ? avail / raw_used_rate : 0;
929 // an approximation for actually stored user data
9f95a23c
TL
930 auto stored_data_normalized = pool_stat.get_user_data_bytes(
931 raw_used_rate, per_pool);
932 auto stored_omap_normalized = pool_stat.get_user_omap_bytes(
933 raw_used_rate, per_pool_omap);
934 auto stored_normalized = stored_data_normalized + stored_omap_normalized;
935 // same, amplied by replication or EC
936 auto stored_raw = stored_normalized * raw_used_rate;
31f18b77 937 if (f) {
11fdf7f2 938 f->dump_int("stored", stored_normalized);
9f95a23c
TL
939 if (verbose) {
940 f->dump_int("stored_data", stored_data_normalized);
941 f->dump_int("stored_omap", stored_omap_normalized);
942 }
31f18b77 943 f->dump_int("objects", sum.num_objects);
11fdf7f2
TL
944 f->dump_int("kb_used", shift_round_up(used_bytes, 10));
945 f->dump_int("bytes_used", used_bytes);
9f95a23c
TL
946 if (verbose) {
947 f->dump_int("data_bytes_used", used_data_bytes);
948 f->dump_int("omap_bytes_used", used_omap_bytes);
949 }
11fdf7f2
TL
950 f->dump_float("percent_used", used);
951 f->dump_unsigned("max_avail", avail_res);
31f18b77
FG
952 if (verbose) {
953 f->dump_int("quota_objects", pool->quota_max_objects);
954 f->dump_int("quota_bytes", pool->quota_max_bytes);
955 f->dump_int("dirty", sum.num_objects_dirty);
956 f->dump_int("rd", sum.num_rd);
957 f->dump_int("rd_bytes", sum.num_rd_kb * 1024ull);
958 f->dump_int("wr", sum.num_wr);
959 f->dump_int("wr_bytes", sum.num_wr_kb * 1024ull);
11fdf7f2
TL
960 f->dump_int("compress_bytes_used", statfs.data_compressed_allocated);
961 f->dump_int("compress_under_bytes", statfs.data_compressed_original);
962 // Stored by user amplified by replication
9f95a23c 963 f->dump_int("stored_raw", stored_raw);
f6b5b4d7 964 f->dump_unsigned("avail_raw", avail);
31f18b77
FG
965 }
966 } else {
11fdf7f2 967 tbl << stringify(byte_u_t(stored_normalized));
9f95a23c
TL
968 if (verbose) {
969 tbl << stringify(byte_u_t(stored_data_normalized));
970 tbl << stringify(byte_u_t(stored_omap_normalized));
971 }
11fdf7f2
TL
972 tbl << stringify(si_u_t(sum.num_objects));
973 tbl << stringify(byte_u_t(used_bytes));
9f95a23c
TL
974 if (verbose) {
975 tbl << stringify(byte_u_t(used_data_bytes));
976 tbl << stringify(byte_u_t(used_omap_bytes));
977 }
31f18b77 978 tbl << percentify(used*100);
11fdf7f2 979 tbl << stringify(byte_u_t(avail_res));
31f18b77 980 if (verbose) {
11fdf7f2
TL
981 if (pool->quota_max_objects == 0)
982 tbl << "N/A";
983 else
984 tbl << stringify(si_u_t(pool->quota_max_objects));
985
986 if (pool->quota_max_bytes == 0)
987 tbl << "N/A";
988 else
989 tbl << stringify(byte_u_t(pool->quota_max_bytes));
990
1adf2230 991 tbl << stringify(si_u_t(sum.num_objects_dirty))
11fdf7f2
TL
992 << stringify(byte_u_t(statfs.data_compressed_allocated))
993 << stringify(byte_u_t(statfs.data_compressed_original))
994 ;
31f18b77
FG
995 }
996 }
997}
998
d2e6a577
FG
999int64_t PGMapDigest::get_pool_free_space(const OSDMap &osd_map,
1000 int64_t poolid) const
1001{
1002 const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
1003 int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
1004 pool->get_type(),
1005 pool->get_size());
1006 int64_t avail;
1007 avail = get_rule_avail(ruleno);
1008 if (avail < 0)
1009 avail = 0;
1010
11fdf7f2 1011 return avail / osd_map.pool_raw_used_rate(poolid);
d2e6a577
FG
1012}
1013
31f18b77
FG
1014int64_t PGMap::get_rule_avail(const OSDMap& osdmap, int ruleno) const
1015{
1016 map<int,float> wm;
1017 int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm);
1018 if (r < 0) {
1019 return r;
1020 }
1021 if (wm.empty()) {
1022 return 0;
1023 }
1024
11fdf7f2 1025 float fratio = osdmap.get_full_ratio();
31f18b77
FG
1026
1027 int64_t min = -1;
1028 for (auto p = wm.begin(); p != wm.end(); ++p) {
1029 auto osd_info = osd_stat.find(p->first);
1030 if (osd_info != osd_stat.end()) {
11fdf7f2 1031 if (osd_info->second.statfs.total == 0 || p->second == 0) {
31f18b77
FG
1032 // osd must be out, hence its stats have been zeroed
1033 // (unless we somehow managed to have a disk with size 0...)
1034 //
1035 // (p->second == 0), if osd weight is 0, no need to
1036 // calculate proj below.
1037 continue;
1038 }
11fdf7f2 1039 double unusable = (double)osd_info->second.statfs.kb() *
31f18b77 1040 (1.0 - fratio);
11fdf7f2 1041 double avail = std::max(0.0, (double)osd_info->second.statfs.kb_avail() - unusable);
31f18b77
FG
1042 avail *= 1024.0;
1043 int64_t proj = (int64_t)(avail / (double)p->second);
1044 if (min < 0 || proj < min) {
1045 min = proj;
1046 }
1047 } else {
94b18763
FG
1048 if (osdmap.is_up(p->first)) {
1049 // This is a level 4 rather than an error, because we might have
1050 // only just started, and not received the first stats message yet.
1051 dout(4) << "OSD " << p->first << " is up, but has no stats" << dendl;
1052 }
31f18b77
FG
1053 }
1054 }
1055 return min;
1056}
1057
1058void PGMap::get_rules_avail(const OSDMap& osdmap,
1059 std::map<int,int64_t> *avail_map) const
1060{
1061 avail_map->clear();
1062 for (auto p : osdmap.get_pools()) {
1063 int64_t pool_id = p.first;
1064 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
1065 continue;
1066 const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
1067 int ruleno = osdmap.crush->find_rule(pool->get_crush_rule(),
1068 pool->get_type(),
1069 pool->get_size());
1070 if (avail_map->count(ruleno) == 0)
1071 (*avail_map)[ruleno] = get_rule_avail(osdmap, ruleno);
1072 }
1073}
1074
1075// ---------------------
1076// PGMap
1077
9f95a23c 1078void PGMap::Incremental::dump(ceph::Formatter *f) const
7c673cae
FG
1079{
1080 f->dump_unsigned("version", version);
1081 f->dump_stream("stamp") << stamp;
31f18b77
FG
1082 f->dump_unsigned("osdmap_epoch", osdmap_epoch);
1083 f->dump_unsigned("pg_scan_epoch", pg_scan);
7c673cae 1084
31f18b77
FG
1085 f->open_array_section("pg_stat_updates");
1086 for (auto p = pg_stat_updates.begin(); p != pg_stat_updates.end(); ++p) {
1087 f->open_object_section("pg_stat");
1088 f->dump_stream("pgid") << p->first;
1089 p->second.dump(f);
1090 f->close_section();
1091 }
7c673cae
FG
1092 f->close_section();
1093
31f18b77
FG
1094 f->open_array_section("osd_stat_updates");
1095 for (auto p = osd_stat_updates.begin(); p != osd_stat_updates.end(); ++p) {
1096 f->open_object_section("osd_stat");
1097 f->dump_int("osd", p->first);
1098 p->second.dump(f);
7c673cae
FG
1099 f->close_section();
1100 }
1101 f->close_section();
11fdf7f2
TL
1102 f->open_array_section("pool_statfs_updates");
1103 for (auto p = pool_statfs_updates.begin(); p != pool_statfs_updates.end(); ++p) {
1104 f->open_object_section("pool_statfs");
1105 f->dump_stream("poolid/osd") << p->first;
1106 p->second.dump(f);
1107 f->close_section();
1108 }
1109 f->close_section();
7c673cae 1110
31f18b77
FG
1111 f->open_array_section("osd_stat_removals");
1112 for (auto p = osd_stat_rm.begin(); p != osd_stat_rm.end(); ++p)
1113 f->dump_int("osd", *p);
7c673cae 1114 f->close_section();
7c673cae 1115
31f18b77
FG
1116 f->open_array_section("pg_removals");
1117 for (auto p = pg_remove.begin(); p != pg_remove.end(); ++p)
1118 f->dump_stream("pgid") << *p;
7c673cae
FG
1119 f->close_section();
1120}
1121
31f18b77 1122void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
7c673cae 1123{
31f18b77
FG
1124 o.push_back(new Incremental);
1125 o.push_back(new Incremental);
1126 o.back()->version = 1;
1127 o.back()->stamp = utime_t(123,345);
1128 o.push_back(new Incremental);
1129 o.back()->version = 2;
11fdf7f2 1130 o.back()->pg_stat_updates[pg_t(1,2)] = pg_stat_t();
31f18b77 1131 o.back()->osd_stat_updates[5] = osd_stat_t();
31f18b77
FG
1132 o.push_back(new Incremental);
1133 o.back()->version = 3;
1134 o.back()->osdmap_epoch = 1;
1135 o.back()->pg_scan = 2;
11fdf7f2 1136 o.back()->pg_stat_updates[pg_t(4,5)] = pg_stat_t();
31f18b77 1137 o.back()->osd_stat_updates[6] = osd_stat_t();
11fdf7f2 1138 o.back()->pg_remove.insert(pg_t(1,2));
31f18b77 1139 o.back()->osd_stat_rm.insert(5);
11fdf7f2 1140 o.back()->pool_statfs_updates[std::make_pair(1234,4)] = store_statfs_t();
7c673cae
FG
1141}
1142
31f18b77
FG
1143// --
1144
1145void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
7c673cae 1146{
11fdf7f2 1147 ceph_assert(inc.version == version+1);
31f18b77 1148 version++;
7c673cae 1149
31f18b77 1150 pool_stat_t pg_sum_old = pg_sum;
11fdf7f2
TL
1151 mempool::pgmap::unordered_map<int32_t, pool_stat_t> pg_pool_sum_old;
1152 pg_pool_sum_old = pg_pool_sum;
7c673cae 1153
31f18b77
FG
1154 for (auto p = inc.pg_stat_updates.begin();
1155 p != inc.pg_stat_updates.end();
1156 ++p) {
1157 const pg_t &update_pg(p->first);
11fdf7f2 1158 auto update_pool = update_pg.pool();
31f18b77 1159 const pg_stat_t &update_stat(p->second);
7c673cae 1160
11fdf7f2
TL
1161 auto pg_stat_iter = pg_stat.find(update_pg);
1162 pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool];
1163 if (pg_stat_iter == pg_stat.end()) {
31f18b77
FG
1164 pg_stat.insert(make_pair(update_pg, update_stat));
1165 } else {
11fdf7f2
TL
1166 stat_pg_sub(update_pg, pg_stat_iter->second);
1167 pool_sum_ref.sub(pg_stat_iter->second);
1168 pg_stat_iter->second = update_stat;
7c673cae 1169 }
31f18b77 1170 stat_pg_add(update_pg, update_stat);
11fdf7f2 1171 pool_sum_ref.add(update_stat);
7c673cae 1172 }
11fdf7f2
TL
1173
1174 for (auto p = inc.pool_statfs_updates.begin();
1175 p != inc.pool_statfs_updates.end();
1176 ++p) {
1177 auto update_pool = p->first.first;
1178 auto update_osd = p->first.second;
1179 auto& statfs_inc = p->second;
1180
1181 auto pool_statfs_iter =
1182 pool_statfs.find(std::make_pair(update_pool, update_osd));
f6b5b4d7 1183 if (pg_pool_sum.count(update_pool)) {
eafe8130
TL
1184 pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool];
1185 if (pool_statfs_iter == pool_statfs.end()) {
1186 pool_statfs.emplace(std::make_pair(update_pool, update_osd), statfs_inc);
1187 } else {
1188 pool_sum_ref.sub(pool_statfs_iter->second);
1189 pool_statfs_iter->second = statfs_inc;
1190 }
1191 pool_sum_ref.add(statfs_inc);
11fdf7f2 1192 }
11fdf7f2
TL
1193 }
1194
31f18b77
FG
1195 for (auto p = inc.get_osd_stat_updates().begin();
1196 p != inc.get_osd_stat_updates().end();
1197 ++p) {
1198 int osd = p->first;
1199 const osd_stat_t &new_stats(p->second);
7c673cae 1200
31f18b77
FG
1201 auto t = osd_stat.find(osd);
1202 if (t == osd_stat.end()) {
1203 osd_stat.insert(make_pair(osd, new_stats));
1204 } else {
1205 stat_osd_sub(t->first, t->second);
1206 t->second = new_stats;
1207 }
31f18b77 1208 stat_osd_add(osd, new_stats);
31f18b77
FG
1209 }
1210 set<int64_t> deleted_pools;
1211 for (auto p = inc.pg_remove.begin();
1212 p != inc.pg_remove.end();
1213 ++p) {
1214 const pg_t &removed_pg(*p);
1215 auto s = pg_stat.find(removed_pg);
11fdf7f2 1216 bool pool_erased = false;
31f18b77 1217 if (s != pg_stat.end()) {
11fdf7f2 1218 pool_erased = stat_pg_sub(removed_pg, s->second);
f6b5b4d7
TL
1219
1220 // decrease pool stats if pg was removed
1221 auto pool_stats_it = pg_pool_sum.find(removed_pg.pool());
1222 if (pool_stats_it != pg_pool_sum.end()) {
1223 pool_stats_it->second.sub(s->second);
1224 }
1225
31f18b77 1226 pg_stat.erase(s);
11fdf7f2
TL
1227 if (pool_erased) {
1228 deleted_pools.insert(removed_pg.pool());
1229 }
31f18b77 1230 }
7c673cae
FG
1231 }
1232
31f18b77
FG
1233 for (auto p = inc.get_osd_stat_rm().begin();
1234 p != inc.get_osd_stat_rm().end();
7c673cae 1235 ++p) {
31f18b77
FG
1236 auto t = osd_stat.find(*p);
1237 if (t != osd_stat.end()) {
1238 stat_osd_sub(t->first, t->second);
1239 osd_stat.erase(t);
31f18b77 1240 }
11fdf7f2
TL
1241 for (auto i = pool_statfs.begin(); i != pool_statfs.end(); ++i) {
1242 if (i->first.second == *p) {
1243 pg_pool_sum[i->first.first].sub(i->second);
1244 pool_statfs.erase(i);
1245 }
1246 }
7c673cae
FG
1247 }
1248
b32b8144
FG
1249 // skip calculating delta while sum was not synchronized
1250 if (!stamp.is_zero() && !pg_sum_old.stats.sum.is_zero()) {
1251 utime_t delta_t;
1252 delta_t = inc.stamp;
1253 delta_t -= stamp;
1254 // calculate a delta, and average over the last 2 deltas.
1255 pool_stat_t d = pg_sum;
1256 d.stats.sub(pg_sum_old.stats);
1257 pg_sum_deltas.push_back(make_pair(d, delta_t));
1258 stamp_delta += delta_t;
1259 pg_sum_delta.stats.add(d.stats);
1260 auto smooth_intervals =
11fdf7f2
TL
1261 cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
1262 while (pg_sum_deltas.size() > smooth_intervals) {
b32b8144
FG
1263 pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats);
1264 stamp_delta -= pg_sum_deltas.front().second;
1265 pg_sum_deltas.pop_front();
1266 }
31f18b77 1267 }
b32b8144 1268 stamp = inc.stamp;
7c673cae 1269
31f18b77 1270 update_pool_deltas(cct, inc.stamp, pg_pool_sum_old);
7c673cae 1271
31f18b77
FG
1272 for (auto p : deleted_pools) {
1273 if (cct)
1274 dout(20) << " deleted pool " << p << dendl;
1275 deleted_pool(p);
1276 }
7c673cae 1277
31f18b77
FG
1278 if (inc.osdmap_epoch)
1279 last_osdmap_epoch = inc.osdmap_epoch;
1280 if (inc.pg_scan)
1281 last_pg_scan = inc.pg_scan;
7c673cae
FG
1282}
1283
31f18b77 1284void PGMap::calc_stats()
7c673cae 1285{
31f18b77
FG
1286 num_pg = 0;
1287 num_pg_active = 0;
1288 num_pg_unknown = 0;
1289 num_osd = 0;
1290 pg_pool_sum.clear();
1291 num_pg_by_pool.clear();
1292 pg_by_osd.clear();
1293 pg_sum = pool_stat_t();
1294 osd_sum = osd_stat_t();
11fdf7f2 1295 osd_sum_by_class.clear();
31f18b77 1296 num_pg_by_state.clear();
11fdf7f2 1297 num_pg_by_pool_state.clear();
31f18b77 1298 num_pg_by_osd.clear();
7c673cae 1299
31f18b77
FG
1300 for (auto p = pg_stat.begin();
1301 p != pg_stat.end();
1302 ++p) {
11fdf7f2
TL
1303 auto pg = p->first;
1304 stat_pg_add(pg, p->second);
1305 pg_pool_sum[pg.pool()].add(p->second);
1306 }
1307 for (auto p = pool_statfs.begin();
1308 p != pool_statfs.end();
1309 ++p) {
1310 auto pool = p->first.first;
1311 pg_pool_sum[pool].add(p->second);
31f18b77
FG
1312 }
1313 for (auto p = osd_stat.begin();
1314 p != osd_stat.end();
1315 ++p)
1316 stat_osd_add(p->first, p->second);
7c673cae
FG
1317}
1318
31f18b77
FG
1319void PGMap::stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
1320 bool sameosds)
7c673cae 1321{
11fdf7f2 1322 auto pool = pgid.pool();
31f18b77 1323 pg_sum.add(s);
7c673cae 1324
31f18b77
FG
1325 num_pg++;
1326 num_pg_by_state[s.state]++;
11fdf7f2
TL
1327 num_pg_by_pool_state[pgid.pool()][s.state]++;
1328 num_pg_by_pool[pool]++;
7c673cae 1329
31f18b77
FG
1330 if ((s.state & PG_STATE_CREATING) &&
1331 s.parent_split_bits == 0) {
1332 creating_pgs.insert(pgid);
1333 if (s.acting_primary >= 0) {
1334 creating_pgs_by_osd_epoch[s.acting_primary][s.mapping_epoch].insert(pgid);
7c673cae
FG
1335 }
1336 }
1337
31f18b77
FG
1338 if (s.state & PG_STATE_ACTIVE) {
1339 ++num_pg_active;
1340 }
1341 if (s.state == 0) {
1342 ++num_pg_unknown;
7c673cae
FG
1343 }
1344
31f18b77
FG
1345 if (sameosds)
1346 return;
7c673cae 1347
31f18b77
FG
1348 for (auto p = s.blocked_by.begin();
1349 p != s.blocked_by.end();
1350 ++p) {
1351 ++blocked_by_sum[*p];
7c673cae 1352 }
31f18b77
FG
1353
1354 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
1355 pg_by_osd[*p].insert(pgid);
1356 num_pg_by_osd[*p].acting++;
1357 }
1358 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
81eedcae
TL
1359 auto& t = pg_by_osd[*p];
1360 if (t.find(pgid) == t.end()) {
1361 t.insert(pgid);
1362 num_pg_by_osd[*p].up_not_acting++;
1363 }
7c673cae 1364 }
7c673cae 1365
31f18b77
FG
1366 if (s.up_primary >= 0) {
1367 num_pg_by_osd[s.up_primary].primary++;
7c673cae 1368 }
7c673cae 1369}
31f18b77 1370
11fdf7f2 1371bool PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
31f18b77 1372 bool sameosds)
7c673cae 1373{
11fdf7f2 1374 bool pool_erased = false;
31f18b77
FG
1375 pg_sum.sub(s);
1376
1377 num_pg--;
1378 int end = --num_pg_by_state[s.state];
11fdf7f2 1379 ceph_assert(end >= 0);
31f18b77
FG
1380 if (end == 0)
1381 num_pg_by_state.erase(s.state);
11fdf7f2
TL
1382 if (--num_pg_by_pool_state[pgid.pool()][s.state] == 0) {
1383 num_pg_by_pool_state[pgid.pool()].erase(s.state);
1384 }
31f18b77
FG
1385 end = --num_pg_by_pool[pgid.pool()];
1386 if (end == 0) {
11fdf7f2 1387 pool_erased = true;
7c673cae 1388 }
7c673cae 1389
31f18b77
FG
1390 if ((s.state & PG_STATE_CREATING) &&
1391 s.parent_split_bits == 0) {
1392 creating_pgs.erase(pgid);
1393 if (s.acting_primary >= 0) {
1394 map<epoch_t,set<pg_t> >& r = creating_pgs_by_osd_epoch[s.acting_primary];
1395 r[s.mapping_epoch].erase(pgid);
1396 if (r[s.mapping_epoch].empty())
1397 r.erase(s.mapping_epoch);
1398 if (r.empty())
1399 creating_pgs_by_osd_epoch.erase(s.acting_primary);
7c673cae
FG
1400 }
1401 }
31f18b77
FG
1402
1403 if (s.state & PG_STATE_ACTIVE) {
1404 --num_pg_active;
1405 }
1406 if (s.state == 0) {
1407 --num_pg_unknown;
1408 }
1409
1410 if (sameosds)
11fdf7f2 1411 return pool_erased;
31f18b77
FG
1412
1413 for (auto p = s.blocked_by.begin();
1414 p != s.blocked_by.end();
1415 ++p) {
1416 auto q = blocked_by_sum.find(*p);
11fdf7f2 1417 ceph_assert(q != blocked_by_sum.end());
31f18b77
FG
1418 --q->second;
1419 if (q->second == 0)
1420 blocked_by_sum.erase(q);
1421 }
1422
81eedcae 1423 set<int32_t> actingset;
31f18b77 1424 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
81eedcae 1425 actingset.insert(*p);
31f18b77
FG
1426 auto& oset = pg_by_osd[*p];
1427 oset.erase(pgid);
1428 if (oset.empty())
1429 pg_by_osd.erase(*p);
1430 auto it = num_pg_by_osd.find(*p);
1431 if (it != num_pg_by_osd.end() && it->second.acting > 0)
1432 it->second.acting--;
1433 }
1434 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
1435 auto& oset = pg_by_osd[*p];
1436 oset.erase(pgid);
1437 if (oset.empty())
1438 pg_by_osd.erase(*p);
81eedcae
TL
1439 if (actingset.count(*p))
1440 continue;
31f18b77 1441 auto it = num_pg_by_osd.find(*p);
81eedcae
TL
1442 if (it != num_pg_by_osd.end() && it->second.up_not_acting > 0)
1443 it->second.up_not_acting--;
31f18b77
FG
1444 }
1445
1446 if (s.up_primary >= 0) {
1447 auto it = num_pg_by_osd.find(s.up_primary);
1448 if (it != num_pg_by_osd.end() && it->second.primary > 0)
1449 it->second.primary--;
1450 }
11fdf7f2
TL
1451 return pool_erased;
1452}
1453
1454void PGMap::calc_purged_snaps()
1455{
1456 purged_snaps.clear();
1457 set<int64_t> unknown;
1458 for (auto& i : pg_stat) {
1459 if (i.second.state == 0) {
1460 unknown.insert(i.first.pool());
1461 purged_snaps.erase(i.first.pool());
1462 continue;
1463 } else if (unknown.count(i.first.pool())) {
1464 continue;
1465 }
1466 auto j = purged_snaps.find(i.first.pool());
1467 if (j == purged_snaps.end()) {
1468 // base case
1469 purged_snaps[i.first.pool()] = i.second.purged_snaps;
1470 } else {
1471 j->second.intersection_of(i.second.purged_snaps);
1472 }
1473 }
31f18b77
FG
1474}
1475
11fdf7f2 1476void PGMap::calc_osd_sum_by_class(const OSDMap& osdmap)
31f18b77 1477{
11fdf7f2
TL
1478 osd_sum_by_class.clear();
1479 for (auto& i : osd_stat) {
1480 const char *class_name = osdmap.crush->get_item_class(i.first);
1481 if (class_name) {
1482 osd_sum_by_class[class_name].add(i.second);
1483 }
1484 }
31f18b77
FG
1485}
1486
1487void PGMap::stat_osd_add(int osd, const osd_stat_t &s)
1488{
1489 num_osd++;
1490 osd_sum.add(s);
1491 if (osd >= (int)osd_last_seq.size()) {
1492 osd_last_seq.resize(osd + 1);
1493 }
1494 osd_last_seq[osd] = s.seq;
1495}
1496
1497void PGMap::stat_osd_sub(int osd, const osd_stat_t &s)
1498{
1499 num_osd--;
1500 osd_sum.sub(s);
11fdf7f2 1501 ceph_assert(osd < (int)osd_last_seq.size());
31f18b77
FG
1502 osd_last_seq[osd] = 0;
1503}
1504
31f18b77 1505void PGMap::encode_digest(const OSDMap& osdmap,
11fdf7f2 1506 bufferlist& bl, uint64_t features)
31f18b77
FG
1507{
1508 get_rules_avail(osdmap, &avail_space_by_rule);
11fdf7f2
TL
1509 calc_osd_sum_by_class(osdmap);
1510 calc_purged_snaps();
31f18b77
FG
1511 PGMapDigest::encode(bl, features);
1512}
1513
1514void PGMap::encode(bufferlist &bl, uint64_t features) const
1515{
11fdf7f2
TL
1516 ENCODE_START(8, 8, bl);
1517 encode(version, bl);
1518 encode(pg_stat, bl);
1519 encode(osd_stat, bl, features);
1520 encode(last_osdmap_epoch, bl);
1521 encode(last_pg_scan, bl);
1522 encode(stamp, bl);
1523 encode(pool_statfs, bl, features);
31f18b77
FG
1524 ENCODE_FINISH(bl);
1525}
1526
11fdf7f2 1527void PGMap::decode(bufferlist::const_iterator &bl)
31f18b77 1528{
11fdf7f2
TL
1529 DECODE_START(8, bl);
1530 decode(version, bl);
1531 decode(pg_stat, bl);
1532 decode(osd_stat, bl);
1533 decode(last_osdmap_epoch, bl);
1534 decode(last_pg_scan, bl);
1535 decode(stamp, bl);
1536 decode(pool_statfs, bl);
31f18b77
FG
1537 DECODE_FINISH(bl);
1538
1539 calc_stats();
7c673cae
FG
1540}
1541
9f95a23c 1542void PGMap::dump(ceph::Formatter *f, bool with_net) const
31f18b77
FG
1543{
1544 dump_basic(f);
1545 dump_pg_stats(f, false);
1546 dump_pool_stats(f);
9f95a23c 1547 dump_osd_stats(f, with_net);
31f18b77
FG
1548}
1549
9f95a23c 1550void PGMap::dump_basic(ceph::Formatter *f) const
31f18b77
FG
1551{
1552 f->dump_unsigned("version", version);
1553 f->dump_stream("stamp") << stamp;
1554 f->dump_unsigned("last_osdmap_epoch", last_osdmap_epoch);
1555 f->dump_unsigned("last_pg_scan", last_pg_scan);
31f18b77
FG
1556
1557 f->open_object_section("pg_stats_sum");
1558 pg_sum.dump(f);
1559 f->close_section();
1560
1561 f->open_object_section("osd_stats_sum");
1562 osd_sum.dump(f);
1563 f->close_section();
1564
31f18b77
FG
1565 dump_delta(f);
1566}
1567
9f95a23c 1568void PGMap::dump_delta(ceph::Formatter *f) const
31f18b77
FG
1569{
1570 f->open_object_section("pg_stats_delta");
1571 pg_sum_delta.dump(f);
11fdf7f2 1572 f->dump_stream("stamp_delta") << stamp_delta;
31f18b77
FG
1573 f->close_section();
1574}
1575
9f95a23c 1576void PGMap::dump_pg_stats(ceph::Formatter *f, bool brief) const
31f18b77
FG
1577{
1578 f->open_array_section("pg_stats");
1579 for (auto i = pg_stat.begin();
1580 i != pg_stat.end();
1581 ++i) {
1582 f->open_object_section("pg_stat");
1583 f->dump_stream("pgid") << i->first;
1584 if (brief)
1585 i->second.dump_brief(f);
1586 else
1587 i->second.dump(f);
1588 f->close_section();
1589 }
1590 f->close_section();
1591}
1592
9f95a23c 1593void PGMap::dump_pool_stats(ceph::Formatter *f) const
31f18b77
FG
1594{
1595 f->open_array_section("pool_stats");
1596 for (auto p = pg_pool_sum.begin();
1597 p != pg_pool_sum.end();
1598 ++p) {
1599 f->open_object_section("pool_stat");
1600 f->dump_int("poolid", p->first);
1601 auto q = num_pg_by_pool.find(p->first);
1602 if (q != num_pg_by_pool.end())
1603 f->dump_unsigned("num_pg", q->second);
1604 p->second.dump(f);
1605 f->close_section();
1606 }
1607 f->close_section();
1608}
1609
9f95a23c 1610void PGMap::dump_osd_stats(ceph::Formatter *f, bool with_net) const
31f18b77
FG
1611{
1612 f->open_array_section("osd_stats");
1613 for (auto q = osd_stat.begin();
1614 q != osd_stat.end();
1615 ++q) {
1616 f->open_object_section("osd_stat");
1617 f->dump_int("osd", q->first);
ded94939 1618 q->second.dump(f, with_net);
31f18b77
FG
1619 f->close_section();
1620 }
1621 f->close_section();
1622}
1623
9f95a23c
TL
1624void PGMap::dump_osd_ping_times(ceph::Formatter *f) const
1625{
1626 f->open_array_section("osd_ping_times");
1627 for (auto& [osd, stat] : osd_stat) {
1628 f->open_object_section("osd_ping_time");
1629 f->dump_int("osd", osd);
1630 stat.dump_ping_time(f);
1631 f->close_section();
1632 }
1633 f->close_section();
1634}
1635
31f18b77
FG
1636void PGMap::dump_pg_stats_plain(
1637 ostream& ss,
1638 const mempool::pgmap::unordered_map<pg_t, pg_stat_t>& pg_stats,
1639 bool brief) const
1640{
1641 TextTable tab;
1642
1643 if (brief){
1644 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1645 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1646 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1647 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1648 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1649 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1650 }
1651 else {
1652 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1653 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1654 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1655 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1656 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1657 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1658 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
1659 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
1660 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
1661 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1662 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1663 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1664 tab.define_column("STATE_STAMP", TextTable::LEFT, TextTable::RIGHT);
1665 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
1666 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
1667 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1668 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1669 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1670 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1671 tab.define_column("LAST_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1672 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1673 tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1674 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
b32b8144 1675 tab.define_column("SNAPTRIMQ_LEN", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
1676 }
1677
1678 for (auto i = pg_stats.begin();
1679 i != pg_stats.end(); ++i) {
1680 const pg_stat_t &st(i->second);
1681 if (brief) {
1682 tab << i->first
1683 << pg_state_string(st.state)
1684 << st.up
1685 << st.up_primary
1686 << st.acting
1687 << st.acting_primary
1688 << TextTable::endrow;
7c673cae 1689 } else {
31f18b77
FG
1690 ostringstream reported;
1691 reported << st.reported_epoch << ":" << st.reported_seq;
1692
1693 tab << i->first
1694 << st.stats.sum.num_objects
1695 << st.stats.sum.num_objects_missing_on_primary
1696 << st.stats.sum.num_objects_degraded
1697 << st.stats.sum.num_objects_misplaced
1698 << st.stats.sum.num_objects_unfound
1699 << st.stats.sum.num_bytes
11fdf7f2
TL
1700 << st.stats.sum.num_omap_bytes
1701 << st.stats.sum.num_omap_keys
31f18b77
FG
1702 << st.log_size
1703 << st.ondisk_log_size
1704 << pg_state_string(st.state)
1705 << st.last_change
1706 << st.version
1707 << reported.str()
1708 << pg_vector_string(st.up)
1709 << st.up_primary
1710 << pg_vector_string(st.acting)
1711 << st.acting_primary
1712 << st.last_scrub
1713 << st.last_scrub_stamp
1714 << st.last_deep_scrub
1715 << st.last_deep_scrub_stamp
b32b8144 1716 << st.snaptrimq_len
31f18b77 1717 << TextTable::endrow;
7c673cae
FG
1718 }
1719 }
7c673cae 1720
31f18b77
FG
1721 ss << tab;
1722}
1723
1724void PGMap::dump(ostream& ss) const
1725{
1726 dump_basic(ss);
1727 dump_pg_stats(ss, false);
1728 dump_pool_stats(ss, false);
1729 dump_pg_sum_stats(ss, false);
1730 dump_osd_stats(ss);
1731}
1732
1733void PGMap::dump_basic(ostream& ss) const
1734{
1735 ss << "version " << version << std::endl;
1736 ss << "stamp " << stamp << std::endl;
1737 ss << "last_osdmap_epoch " << last_osdmap_epoch << std::endl;
1738 ss << "last_pg_scan " << last_pg_scan << std::endl;
31f18b77
FG
1739}
1740
1741void PGMap::dump_pg_stats(ostream& ss, bool brief) const
1742{
1743 dump_pg_stats_plain(ss, pg_stat, brief);
1744}
1745
1746void PGMap::dump_pool_stats(ostream& ss, bool header) const
1747{
1748 TextTable tab;
1749
1750 if (header) {
1751 tab.define_column("POOLID", TextTable::LEFT, TextTable::LEFT);
1752 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1753 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1754 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1755 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1756 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1757 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
1758 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
1759 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
1760 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1761 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1762 } else {
1763 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1764 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1765 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1766 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1767 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1768 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1769 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1770 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1771 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
1772 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1773 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
1774 }
1775
1776 for (auto p = pg_pool_sum.begin();
1777 p != pg_pool_sum.end();
1778 ++p) {
1779 tab << p->first
1780 << p->second.stats.sum.num_objects
1781 << p->second.stats.sum.num_objects_missing_on_primary
1782 << p->second.stats.sum.num_objects_degraded
1783 << p->second.stats.sum.num_objects_misplaced
1784 << p->second.stats.sum.num_objects_unfound
1785 << p->second.stats.sum.num_bytes
11fdf7f2
TL
1786 << p->second.stats.sum.num_omap_bytes
1787 << p->second.stats.sum.num_omap_keys
31f18b77
FG
1788 << p->second.log_size
1789 << p->second.ondisk_log_size
1790 << TextTable::endrow;
1791 }
1792
1793 ss << tab;
1794}
1795
1796void PGMap::dump_pg_sum_stats(ostream& ss, bool header) const
1797{
1798 TextTable tab;
1799
1800 if (header) {
1801 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1802 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1803 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1804 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1805 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1806 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1807 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
1808 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
1809 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
1810 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1811 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1812 } else {
1813 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1814 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1815 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1816 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1817 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1818 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1819 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1820 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1821 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
1822 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1823 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
1824 };
1825
1826 tab << "sum"
1827 << pg_sum.stats.sum.num_objects
1828 << pg_sum.stats.sum.num_objects_missing_on_primary
1829 << pg_sum.stats.sum.num_objects_degraded
1830 << pg_sum.stats.sum.num_objects_misplaced
1831 << pg_sum.stats.sum.num_objects_unfound
1832 << pg_sum.stats.sum.num_bytes
11fdf7f2
TL
1833 << pg_sum.stats.sum.num_omap_bytes
1834 << pg_sum.stats.sum.num_omap_keys
31f18b77
FG
1835 << pg_sum.log_size
1836 << pg_sum.ondisk_log_size
1837 << TextTable::endrow;
1838
1839 ss << tab;
1840}
1841
1842void PGMap::dump_osd_stats(ostream& ss) const
1843{
1844 TextTable tab;
1845
1846 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1847 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1848 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2 1849 tab.define_column("USED_RAW", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
1850 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
1851 tab.define_column("HB_PEERS", TextTable::LEFT, TextTable::RIGHT);
1852 tab.define_column("PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1853 tab.define_column("PRIMARY_PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1854
1855 for (auto p = osd_stat.begin();
1856 p != osd_stat.end();
1857 ++p) {
1858 tab << p->first
11fdf7f2
TL
1859 << byte_u_t(p->second.statfs.get_used())
1860 << byte_u_t(p->second.statfs.available)
1861 << byte_u_t(p->second.statfs.get_used_raw())
1862 << byte_u_t(p->second.statfs.total)
31f18b77
FG
1863 << p->second.hb_peers
1864 << get_num_pg_by_osd(p->first)
1865 << get_num_primary_pg_by_osd(p->first)
1866 << TextTable::endrow;
1867 }
1868
1869 tab << "sum"
11fdf7f2
TL
1870 << byte_u_t(osd_sum.statfs.get_used())
1871 << byte_u_t(osd_sum.statfs.available)
1872 << byte_u_t(osd_sum.statfs.get_used_raw())
1873 << byte_u_t(osd_sum.statfs.total)
31f18b77 1874 << TextTable::endrow;
7c673cae 1875
31f18b77 1876 ss << tab;
7c673cae
FG
1877}
1878
31f18b77 1879void PGMap::dump_osd_sum_stats(ostream& ss) const
7c673cae 1880{
31f18b77 1881 TextTable tab;
7c673cae 1882
31f18b77
FG
1883 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1884 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1885 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2 1886 tab.define_column("USED_RAW", TextTable::LEFT, TextTable::RIGHT);
31f18b77 1887 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
7c673cae 1888
31f18b77 1889 tab << "sum"
11fdf7f2
TL
1890 << byte_u_t(osd_sum.statfs.get_used())
1891 << byte_u_t(osd_sum.statfs.available)
1892 << byte_u_t(osd_sum.statfs.get_used_raw())
1893 << byte_u_t(osd_sum.statfs.total)
31f18b77 1894 << TextTable::endrow;
7c673cae 1895
31f18b77 1896 ss << tab;
7c673cae
FG
1897}
1898
31f18b77
FG
1899void PGMap::get_stuck_stats(
1900 int types, const utime_t cutoff,
1901 mempool::pgmap::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const
7c673cae 1902{
11fdf7f2 1903 ceph_assert(types != 0);
31f18b77
FG
1904 for (auto i = pg_stat.begin();
1905 i != pg_stat.end();
1906 ++i) {
1907 utime_t val = cutoff; // don't care about >= cutoff so that is infinity
1908
1909 if ((types & STUCK_INACTIVE) && !(i->second.state & PG_STATE_ACTIVE)) {
1910 if (i->second.last_active < val)
1911 val = i->second.last_active;
7c673cae 1912 }
31f18b77
FG
1913
1914 if ((types & STUCK_UNCLEAN) && !(i->second.state & PG_STATE_CLEAN)) {
1915 if (i->second.last_clean < val)
1916 val = i->second.last_clean;
7c673cae 1917 }
31f18b77
FG
1918
1919 if ((types & STUCK_DEGRADED) && (i->second.state & PG_STATE_DEGRADED)) {
1920 if (i->second.last_undegraded < val)
1921 val = i->second.last_undegraded;
7c673cae 1922 }
7c673cae 1923
31f18b77
FG
1924 if ((types & STUCK_UNDERSIZED) && (i->second.state & PG_STATE_UNDERSIZED)) {
1925 if (i->second.last_fullsized < val)
1926 val = i->second.last_fullsized;
1927 }
7c673cae 1928
31f18b77
FG
1929 if ((types & STUCK_STALE) && (i->second.state & PG_STATE_STALE)) {
1930 if (i->second.last_unstale < val)
1931 val = i->second.last_unstale;
1932 }
7c673cae 1933
31f18b77
FG
1934 // val is now the earliest any of the requested stuck states began
1935 if (val < cutoff) {
1936 stuck_pgs[i->first] = i->second;
1937 }
1938 }
7c673cae
FG
1939}
1940
31f18b77 1941bool PGMap::get_stuck_counts(const utime_t cutoff, map<string, int>& note) const
7c673cae 1942{
31f18b77
FG
1943 int inactive = 0;
1944 int unclean = 0;
1945 int degraded = 0;
1946 int undersized = 0;
1947 int stale = 0;
7c673cae 1948
31f18b77
FG
1949 for (auto i = pg_stat.begin();
1950 i != pg_stat.end();
1951 ++i) {
1952 if (! (i->second.state & PG_STATE_ACTIVE)) {
1953 if (i->second.last_active < cutoff)
1954 ++inactive;
7c673cae 1955 }
31f18b77
FG
1956 if (! (i->second.state & PG_STATE_CLEAN)) {
1957 if (i->second.last_clean < cutoff)
1958 ++unclean;
7c673cae 1959 }
31f18b77
FG
1960 if (i->second.state & PG_STATE_DEGRADED) {
1961 if (i->second.last_undegraded < cutoff)
1962 ++degraded;
7c673cae 1963 }
31f18b77
FG
1964 if (i->second.state & PG_STATE_UNDERSIZED) {
1965 if (i->second.last_fullsized < cutoff)
1966 ++undersized;
7c673cae 1967 }
31f18b77
FG
1968 if (i->second.state & PG_STATE_STALE) {
1969 if (i->second.last_unstale < cutoff)
1970 ++stale;
7c673cae
FG
1971 }
1972 }
31f18b77
FG
1973
1974 if (inactive)
1975 note["stuck inactive"] = inactive;
1976
1977 if (unclean)
1978 note["stuck unclean"] = unclean;
1979
1980 if (undersized)
1981 note["stuck undersized"] = undersized;
1982
1983 if (degraded)
1984 note["stuck degraded"] = degraded;
1985
1986 if (stale)
1987 note["stuck stale"] = stale;
1988
1989 return inactive || unclean || undersized || degraded || stale;
1990}
1991
9f95a23c 1992void PGMap::dump_stuck(ceph::Formatter *f, int types, utime_t cutoff) const
31f18b77
FG
1993{
1994 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
1995 get_stuck_stats(types, cutoff, stuck_pg_stats);
1996 f->open_array_section("stuck_pg_stats");
1997 for (auto i = stuck_pg_stats.begin();
1998 i != stuck_pg_stats.end();
1999 ++i) {
2000 f->open_object_section("pg_stat");
2001 f->dump_stream("pgid") << i->first;
2002 i->second.dump(f);
2003 f->close_section();
2004 }
2005 f->close_section();
2006}
2007
2008void PGMap::dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const
2009{
2010 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
2011 get_stuck_stats(types, cutoff, stuck_pg_stats);
2012 if (!stuck_pg_stats.empty())
2013 dump_pg_stats_plain(ss, stuck_pg_stats, true);
2014}
2015
2016int PGMap::dump_stuck_pg_stats(
2017 stringstream &ds,
9f95a23c 2018 ceph::Formatter *f,
31f18b77
FG
2019 int threshold,
2020 vector<string>& args) const
2021{
2022 int stuck_types = 0;
2023
2024 for (auto i = args.begin(); i != args.end(); ++i) {
2025 if (*i == "inactive")
2026 stuck_types |= PGMap::STUCK_INACTIVE;
2027 else if (*i == "unclean")
2028 stuck_types |= PGMap::STUCK_UNCLEAN;
2029 else if (*i == "undersized")
2030 stuck_types |= PGMap::STUCK_UNDERSIZED;
2031 else if (*i == "degraded")
2032 stuck_types |= PGMap::STUCK_DEGRADED;
2033 else if (*i == "stale")
2034 stuck_types |= PGMap::STUCK_STALE;
2035 else {
2036 ds << "Unknown type: " << *i << std::endl;
2037 return -EINVAL;
7c673cae
FG
2038 }
2039 }
31f18b77
FG
2040
2041 utime_t now(ceph_clock_now());
2042 utime_t cutoff = now - utime_t(threshold, 0);
2043
2044 if (!f) {
2045 dump_stuck_plain(ds, stuck_types, cutoff);
2046 } else {
2047 dump_stuck(f, stuck_types, cutoff);
2048 f->flush(ds);
7c673cae 2049 }
31f18b77
FG
2050
2051 return 0;
7c673cae
FG
2052}
2053
9f95a23c 2054void PGMap::dump_osd_perf_stats(ceph::Formatter *f) const
7c673cae 2055{
31f18b77
FG
2056 f->open_array_section("osd_perf_infos");
2057 for (auto i = osd_stat.begin();
2058 i != osd_stat.end();
2059 ++i) {
2060 f->open_object_section("osd");
2061 f->dump_int("id", i->first);
2062 {
2063 f->open_object_section("perf_stats");
2064 i->second.os_perf_stat.dump(f);
2065 f->close_section();
2066 }
2067 f->close_section();
2068 }
2069 f->close_section();
7c673cae 2070}
31f18b77 2071void PGMap::print_osd_perf_stats(std::ostream *ss) const
7c673cae 2072{
31f18b77
FG
2073 TextTable tab;
2074 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2075 tab.define_column("commit_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2076 tab.define_column("apply_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2077 for (auto i = osd_stat.begin();
2078 i != osd_stat.end();
2079 ++i) {
2080 tab << i->first;
11fdf7f2
TL
2081 tab << i->second.os_perf_stat.os_commit_latency_ns / 1000000ull;
2082 tab << i->second.os_perf_stat.os_apply_latency_ns / 1000000ull;
31f18b77
FG
2083 tab << TextTable::endrow;
2084 }
2085 (*ss) << tab;
2086}
7c673cae 2087
9f95a23c 2088void PGMap::dump_osd_blocked_by_stats(ceph::Formatter *f) const
31f18b77
FG
2089{
2090 f->open_array_section("osd_blocked_by_infos");
2091 for (auto i = blocked_by_sum.begin();
2092 i != blocked_by_sum.end();
2093 ++i) {
2094 f->open_object_section("osd");
2095 f->dump_int("id", i->first);
2096 f->dump_int("num_blocked", i->second);
2097 f->close_section();
2098 }
2099 f->close_section();
2100}
2101void PGMap::print_osd_blocked_by_stats(std::ostream *ss) const
2102{
2103 TextTable tab;
2104 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2105 tab.define_column("num_blocked", TextTable::LEFT, TextTable::RIGHT);
2106 for (auto i = blocked_by_sum.begin();
2107 i != blocked_by_sum.end();
2108 ++i) {
2109 tab << i->first;
2110 tab << i->second;
2111 tab << TextTable::endrow;
2112 }
2113 (*ss) << tab;
7c673cae
FG
2114}
2115
31f18b77 2116
7c673cae
FG
2117/**
2118 * update aggregated delta
2119 *
2120 * @param cct ceph context
2121 * @param ts Timestamp for the stats being delta'ed
2122 * @param old_pool_sum Previous stats sum
2123 * @param last_ts Last timestamp for pool
2124 * @param result_pool_sum Resulting stats
2125 * @param result_pool_delta Resulting pool delta
2126 * @param result_ts_delta Resulting timestamp delta
2127 * @param delta_avg_list List of last N computed deltas, used to average
2128 */
31f18b77
FG
2129void PGMap::update_delta(
2130 CephContext *cct,
2131 const utime_t ts,
2132 const pool_stat_t& old_pool_sum,
2133 utime_t *last_ts,
2134 const pool_stat_t& current_pool_sum,
2135 pool_stat_t *result_pool_delta,
2136 utime_t *result_ts_delta,
2137 mempool::pgmap::list<pair<pool_stat_t,utime_t> > *delta_avg_list)
7c673cae
FG
2138{
2139 /* @p ts is the timestamp we want to associate with the data
2140 * in @p old_pool_sum, and on which we will base ourselves to
2141 * calculate the delta, stored in 'delta_t'.
2142 */
2143 utime_t delta_t;
2144 delta_t = ts; // start with the provided timestamp
2145 delta_t -= *last_ts; // take the last timestamp we saw
2146 *last_ts = ts; // @p ts becomes the last timestamp we saw
2147
31f18b77
FG
2148 // adjust delta_t, quick start if there is no update in a long period
2149 delta_t = std::min(delta_t,
2150 utime_t(2 * (cct ? cct->_conf->mon_delta_reset_interval : 10), 0));
2151
2152 // calculate a delta, and average over the last 6 deltas by default.
7c673cae
FG
2153 /* start by taking a copy of our current @p result_pool_sum, and by
2154 * taking out the stats from @p old_pool_sum. This generates a stats
2155 * delta. Stash this stats delta in @p delta_avg_list, along with the
2156 * timestamp delta for these results.
2157 */
2158 pool_stat_t d = current_pool_sum;
2159 d.stats.sub(old_pool_sum.stats);
7c673cae
FG
2160
2161 /* Aggregate current delta, and take out the last seen delta (if any) to
2162 * average it out.
b32b8144 2163 * Skip calculating delta while sum was not synchronized.
7c673cae 2164 */
b32b8144
FG
2165 if(!old_pool_sum.stats.sum.is_zero()) {
2166 delta_avg_list->push_back(make_pair(d,delta_t));
2167 *result_ts_delta += delta_t;
2168 result_pool_delta->stats.add(d.stats);
2169 }
11fdf7f2
TL
2170 size_t s = cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
2171 while (delta_avg_list->size() > s) {
7c673cae
FG
2172 result_pool_delta->stats.sub(delta_avg_list->front().first.stats);
2173 *result_ts_delta -= delta_avg_list->front().second;
2174 delta_avg_list->pop_front();
2175 }
2176}
2177
7c673cae
FG
2178/**
2179 * Update a given pool's deltas
2180 *
2181 * @param cct Ceph Context
2182 * @param ts Timestamp for the stats being delta'ed
2183 * @param pool Pool's id
2184 * @param old_pool_sum Previous stats sum
2185 */
31f18b77
FG
2186void PGMap::update_one_pool_delta(
2187 CephContext *cct,
2188 const utime_t ts,
11fdf7f2 2189 const int64_t pool,
31f18b77 2190 const pool_stat_t& old_pool_sum)
7c673cae
FG
2191{
2192 if (per_pool_sum_deltas.count(pool) == 0) {
11fdf7f2
TL
2193 ceph_assert(per_pool_sum_deltas_stamps.count(pool) == 0);
2194 ceph_assert(per_pool_sum_delta.count(pool) == 0);
7c673cae
FG
2195 }
2196
31f18b77 2197 auto& sum_delta = per_pool_sum_delta[pool];
7c673cae
FG
2198
2199 update_delta(cct, ts, old_pool_sum, &sum_delta.second, pg_pool_sum[pool],
2200 &sum_delta.first, &per_pool_sum_deltas_stamps[pool],
2201 &per_pool_sum_deltas[pool]);
2202}
2203
2204/**
2205 * Update pools' deltas
2206 *
2207 * @param cct CephContext
2208 * @param ts Timestamp for the stats being delta'ed
2209 * @param pg_pool_sum_old Map of pool stats for delta calcs.
2210 */
31f18b77
FG
2211void PGMap::update_pool_deltas(
2212 CephContext *cct, const utime_t ts,
11fdf7f2 2213 const mempool::pgmap::unordered_map<int32_t,pool_stat_t>& pg_pool_sum_old)
7c673cae 2214{
31f18b77 2215 for (auto it = pg_pool_sum_old.begin();
7c673cae
FG
2216 it != pg_pool_sum_old.end(); ++it) {
2217 update_one_pool_delta(cct, ts, it->first, it->second);
2218 }
2219}
2220
2221void PGMap::clear_delta()
2222{
2223 pg_sum_delta = pool_stat_t();
2224 pg_sum_deltas.clear();
2225 stamp_delta = utime_t();
2226}
2227
7c673cae
FG
2228void PGMap::generate_test_instances(list<PGMap*>& o)
2229{
2230 o.push_back(new PGMap);
2231 list<Incremental*> inc;
2232 Incremental::generate_test_instances(inc);
2233 delete inc.front();
2234 inc.pop_front();
2235 while (!inc.empty()) {
2236 PGMap *pmp = new PGMap();
2237 *pmp = *o.back();
2238 o.push_back(pmp);
2239 o.back()->apply_incremental(NULL, *inc.front());
2240 delete inc.front();
2241 inc.pop_front();
2242 }
2243}
2244
11fdf7f2 2245void PGMap::get_filtered_pg_stats(uint64_t state, int64_t poolid, int64_t osdid,
7c673cae
FG
2246 bool primary, set<pg_t>& pgs) const
2247{
31f18b77 2248 for (auto i = pg_stat.begin();
7c673cae
FG
2249 i != pg_stat.end();
2250 ++i) {
11fdf7f2 2251 if ((poolid >= 0) && (poolid != i->first.pool()))
7c673cae
FG
2252 continue;
2253 if ((osdid >= 0) && !(i->second.is_acting_osd(osdid,primary)))
2254 continue;
11fdf7f2
TL
2255 if (state == (uint64_t)-1 || // "all"
2256 (i->second.state & state) || // matches a state bit
2257 (state == 0 && i->second.state == 0)) { // matches "unknown" (== 0)
2258 pgs.insert(i->first);
2259 }
7c673cae
FG
2260 }
2261}
2262
9f95a23c 2263void PGMap::dump_filtered_pg_stats(ceph::Formatter *f, set<pg_t>& pgs) const
7c673cae
FG
2264{
2265 f->open_array_section("pg_stats");
31f18b77 2266 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
7c673cae
FG
2267 const pg_stat_t& st = pg_stat.at(*i);
2268 f->open_object_section("pg_stat");
2269 f->dump_stream("pgid") << *i;
2270 st.dump(f);
2271 f->close_section();
2272 }
2273 f->close_section();
2274}
2275
2276void PGMap::dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const
2277{
2278 TextTable tab;
11fdf7f2 2279 utime_t now = ceph_clock_now();
7c673cae 2280
11fdf7f2 2281 tab.define_column("PG", TextTable::LEFT, TextTable::LEFT);
7c673cae 2282 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
7c673cae
FG
2283 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
2284 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
2285 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
2286 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
2287 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
2288 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
7c673cae 2289 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
7c673cae 2290 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2 2291 tab.define_column("SINCE", TextTable::LEFT, TextTable::RIGHT);
7c673cae
FG
2292 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
2293 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
2294 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
7c673cae 2295 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
7c673cae 2296 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
7c673cae
FG
2297 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
2298
31f18b77 2299 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
7c673cae
FG
2300 const pg_stat_t& st = pg_stat.at(*i);
2301
2302 ostringstream reported;
2303 reported << st.reported_epoch << ":" << st.reported_seq;
2304
11fdf7f2 2305 ostringstream upstr, actingstr;
9f95a23c
TL
2306 upstr << pg_vector_string(st.up) << 'p' << st.up_primary;
2307 actingstr << pg_vector_string(st.acting) << 'p' << st.acting_primary;
7c673cae
FG
2308 tab << *i
2309 << st.stats.sum.num_objects
7c673cae
FG
2310 << st.stats.sum.num_objects_degraded
2311 << st.stats.sum.num_objects_misplaced
2312 << st.stats.sum.num_objects_unfound
2313 << st.stats.sum.num_bytes
11fdf7f2
TL
2314 << st.stats.sum.num_omap_bytes
2315 << st.stats.sum.num_omap_keys
7c673cae 2316 << st.log_size
7c673cae 2317 << pg_state_string(st.state)
11fdf7f2 2318 << utimespan_str(now - st.last_change)
7c673cae
FG
2319 << st.version
2320 << reported.str()
11fdf7f2
TL
2321 << upstr.str()
2322 << actingstr.str()
7c673cae 2323 << st.last_scrub_stamp
7c673cae
FG
2324 << st.last_deep_scrub_stamp
2325 << TextTable::endrow;
2326 }
2327
2328 ss << tab;
2329}
2330
11fdf7f2 2331void PGMap::dump_pool_stats_and_io_rate(int64_t poolid, const OSDMap &osd_map,
9f95a23c 2332 ceph::Formatter *f,
11fdf7f2
TL
2333 stringstream *rs) const {
2334 string pool_name = osd_map.get_pool_name(poolid);
2335 if (f) {
2336 f->open_object_section("pool");
2337 f->dump_string("pool_name", pool_name.c_str());
2338 f->dump_int("pool_id", poolid);
2339 f->open_object_section("recovery");
2340 }
2341 list<string> sl;
2342 stringstream tss;
2343 pool_recovery_summary(f, &sl, poolid);
2344 if (!f && !sl.empty()) {
2345 for (auto &p : sl)
2346 tss << " " << p << "\n";
2347 }
2348 if (f) {
2349 f->close_section(); // object section recovery
2350 f->open_object_section("recovery_rate");
2351 }
2352 ostringstream rss;
2353 pool_recovery_rate_summary(f, &rss, poolid);
2354 if (!f && !rss.str().empty())
2355 tss << " recovery io " << rss.str() << "\n";
2356 if (f) {
2357 f->close_section(); // object section recovery_rate
2358 f->open_object_section("client_io_rate");
2359 }
2360 rss.clear();
2361 rss.str("");
2362 pool_client_io_rate_summary(f, &rss, poolid);
2363 if (!f && !rss.str().empty())
2364 tss << " client io " << rss.str() << "\n";
2365 // dump cache tier IO rate for cache pool
2366 const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
2367 if (pool->is_tier()) {
2368 if (f) {
2369 f->close_section(); // object section client_io_rate
2370 f->open_object_section("cache_io_rate");
7c673cae 2371 }
11fdf7f2
TL
2372 rss.clear();
2373 rss.str("");
2374 pool_cache_io_rate_summary(f, &rss, poolid);
2375 if (!f && !rss.str().empty())
2376 tss << " cache tier io " << rss.str() << "\n";
2377 }
2378 if (f) {
2379 f->close_section(); // object section cache_io_rate
2380 f->close_section(); // object section pool
2381 } else {
2382 *rs << "pool " << pool_name << " id " << poolid << "\n";
2383 if (!tss.str().empty())
2384 *rs << tss.str() << "\n";
2385 else
2386 *rs << " nothing is going on\n\n";
7c673cae 2387 }
7c673cae
FG
2388}
2389
9f95a23c
TL
2390// Get crush parentage for an osd (skip root)
2391set<std::string> PGMap::osd_parentage(const OSDMap& osdmap, int id) const
2392{
2393 set<std::string> reporters_by_subtree;
2394 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
2395
2396 auto loc = osdmap.crush->get_full_location(id);
2397 for (auto& [parent_bucket_type, parent_id] : loc) {
2398 // Should we show the root? Might not be too informative like "default"
2399 if (parent_bucket_type != "root" &&
2400 parent_bucket_type != reporter_subtree_level) {
2401 reporters_by_subtree.insert(parent_id);
2402 }
2403 }
2404 return reporters_by_subtree;
2405}
2406
11fdf7f2 2407void PGMap::get_health_checks(
31f18b77 2408 CephContext *cct,
11fdf7f2
TL
2409 const OSDMap& osdmap,
2410 health_check_map_t *checks) const
7c673cae 2411{
11fdf7f2
TL
2412 utime_t now = ceph_clock_now();
2413 const auto max = cct->_conf.get_val<uint64_t>("mon_health_max_detail");
2414 const auto& pools = osdmap.get_pools();
224ce89b 2415
224ce89b
WB
2416 typedef enum pg_consequence_t {
2417 UNAVAILABLE = 1, // Client IO to the pool may block
2418 DEGRADED = 2, // Fewer than the requested number of replicas are present
eafe8130
TL
2419 BACKFILL_FULL = 3, // Backfill is blocked for space considerations
2420 // This may or may not be a deadlock condition.
2421 DAMAGED = 4, // The data may be missing or inconsistent on disk and
224ce89b 2422 // requires repair
eafe8130 2423 RECOVERY_FULL = 5 // Recovery is blocked because OSDs are full
224ce89b
WB
2424 } pg_consequence_t;
2425
2426 // For a given PG state, how should it be reported at the pool level?
2427 class PgStateResponse {
2428 public:
2429 pg_consequence_t consequence;
2430 typedef std::function< utime_t(const pg_stat_t&) > stuck_cb;
2431 stuck_cb stuck_since;
2432 bool invert;
2433
11fdf7f2
TL
2434 PgStateResponse(const pg_consequence_t& c, stuck_cb&& s)
2435 : consequence(c), stuck_since(std::move(s)), invert(false)
224ce89b
WB
2436 {
2437 }
2438
11fdf7f2
TL
2439 PgStateResponse(const pg_consequence_t& c, stuck_cb&& s, bool i)
2440 : consequence(c), stuck_since(std::move(s)), invert(i)
224ce89b
WB
2441 {
2442 }
2443 };
2444
2445 // Record the PG state counts that contributed to a reported pool state
2446 class PgCauses {
2447 public:
2448 // Map of PG_STATE_* to number of pgs in that state.
2449 std::map<unsigned, unsigned> states;
2450
2451 // List of all PG IDs that had a state contributing
2452 // to this health condition.
2453 std::set<pg_t> pgs;
2454
2455 std::map<pg_t, std::string> pg_messages;
2456 };
2457
2458 // Map of PG state to how to respond to it
2459 std::map<unsigned, PgStateResponse> state_to_response = {
2460 // Immediate reports
2461 { PG_STATE_INCONSISTENT, {DAMAGED, {}} },
c07f9fc5 2462 { PG_STATE_INCOMPLETE, {UNAVAILABLE, {}} },
224ce89b 2463 { PG_STATE_SNAPTRIM_ERROR, {DAMAGED, {}} },
b32b8144
FG
2464 { PG_STATE_RECOVERY_UNFOUND, {DAMAGED, {}} },
2465 { PG_STATE_BACKFILL_UNFOUND, {DAMAGED, {}} },
eafe8130
TL
2466 { PG_STATE_BACKFILL_TOOFULL, {BACKFILL_FULL, {}} },
2467 { PG_STATE_RECOVERY_TOOFULL, {RECOVERY_FULL, {}} },
224ce89b
WB
2468 { PG_STATE_DEGRADED, {DEGRADED, {}} },
2469 { PG_STATE_DOWN, {UNAVAILABLE, {}} },
2470 // Delayed (wait until stuck) reports
2471 { PG_STATE_PEERING, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_peered;} } },
2472 { PG_STATE_UNDERSIZED, {DEGRADED, [](const pg_stat_t &p){return p.last_fullsized;} } },
2473 { PG_STATE_STALE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_unstale;} } },
2474 // Delayed and inverted reports
b32b8144 2475 { PG_STATE_ACTIVE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_active;}, true} }
224ce89b
WB
2476 };
2477
2478 // Specialized state printer that takes account of inversion of
2479 // ACTIVE, CLEAN checks.
11fdf7f2 2480 auto state_name = [](const uint64_t &state) {
224ce89b
WB
2481 // Special cases for the states that are inverted checks
2482 if (state == PG_STATE_CLEAN) {
2483 return std::string("unclean");
2484 } else if (state == PG_STATE_ACTIVE) {
2485 return std::string("inactive");
2486 } else {
2487 return pg_state_string(state);
2488 }
2489 };
2490
2491 // Map of what is wrong to information about why, implicitly also stores
2492 // the list of what is wrong.
2493 std::map<pg_consequence_t, PgCauses> detected;
2494
2495 // Optimisation: trim down the number of checks to apply based on
2496 // the summary counters
2497 std::map<unsigned, PgStateResponse> possible_responses;
2498 for (const auto &i : num_pg_by_state) {
2499 for (const auto &j : state_to_response) {
2500 if (!j.second.invert) {
2501 // Check for normal tests by seeing if any pgs have the flag
2502 if (i.first & j.first) {
2503 possible_responses.insert(j);
2504 }
2505 }
2506 }
2507 }
2508
2509 for (const auto &j : state_to_response) {
2510 if (j.second.invert) {
2511 // Check for inverted tests by seeing if not-all pgs have the flag
2512 const auto &found = num_pg_by_state.find(j.first);
2513 if (found == num_pg_by_state.end() || found->second != num_pg) {
2514 possible_responses.insert(j);
2515 }
2516 }
2517 }
2518
11fdf7f2 2519 utime_t cutoff = now - utime_t(cct->_conf.get_val<int64_t>("mon_pg_stuck_threshold"), 0);
224ce89b
WB
2520 // Loop over all PGs, if there are any possibly-unhealthy states in there
2521 if (!possible_responses.empty()) {
2522 for (const auto& i : pg_stat) {
2523 const auto &pg_id = i.first;
2524 const auto &pg_info = i.second;
2525
2526 for (const auto &j : state_to_response) {
2527 const auto &pg_response_state = j.first;
2528 const auto &pg_response = j.second;
2529
2530 // Apply the state test
2531 if (!(bool(pg_info.state & pg_response_state) != pg_response.invert)) {
2532 continue;
2533 }
2534
2535 // Apply stuckness test if needed
2536 if (pg_response.stuck_since) {
2537 // Delayed response, check for stuckness
2538 utime_t last_whatever = pg_response.stuck_since(pg_info);
f6b5b4d7
TL
2539 if (last_whatever.is_zero() &&
2540 pg_info.last_change >= cutoff) {
2541 // still moving, ignore
2542 continue;
2543 } else if (last_whatever >= cutoff) {
224ce89b
WB
2544 // Not stuck enough, ignore.
2545 continue;
2546 } else {
2547
2548 }
2549 }
2550
2551 auto &causes = detected[pg_response.consequence];
2552 causes.states[pg_response_state]++;
2553 causes.pgs.insert(pg_id);
2554
2555 // Don't bother composing detail string if we have already recorded
2556 // too many
2557 if (causes.pg_messages.size() > max) {
2558 continue;
2559 }
2560
2561 std::ostringstream ss;
2562 if (pg_response.stuck_since) {
2563 utime_t since = pg_response.stuck_since(pg_info);
2564 ss << "pg " << pg_id << " is stuck " << state_name(pg_response_state);
2565 if (since == utime_t()) {
2566 ss << " since forever";
2567 } else {
2568 utime_t dur = now - since;
9f95a23c 2569 ss << " for " << utimespan_str(dur);
224ce89b
WB
2570 }
2571 ss << ", current state " << pg_state_string(pg_info.state)
2572 << ", last acting " << pg_info.acting;
2573 } else {
2574 ss << "pg " << pg_id << " is "
2575 << pg_state_string(pg_info.state);
2576 ss << ", acting " << pg_info.acting;
2577 if (pg_info.stats.sum.num_objects_unfound) {
2578 ss << ", " << pg_info.stats.sum.num_objects_unfound
2579 << " unfound";
2580 }
2581 }
2582
2583 if (pg_info.state & PG_STATE_INCOMPLETE) {
2584 const pg_pool_t *pi = osdmap.get_pg_pool(pg_id.pool());
2585 if (pi && pi->min_size > 1) {
2586 ss << " (reducing pool "
2587 << osdmap.get_pool_name(pg_id.pool())
2588 << " min_size from " << (int)pi->min_size
2589 << " may help; search ceph.com/docs for 'incomplete')";
2590 }
2591 }
2592
2593 causes.pg_messages[pg_id] = ss.str();
2594 }
2595 }
2596 } else {
2597 dout(10) << __func__ << " skipping loop over PGs: counters look OK" << dendl;
2598 }
2599
2600 for (const auto &i : detected) {
2601 std::string health_code;
2602 health_status_t sev;
2603 std::string summary;
2604 switch(i.first) {
2605 case UNAVAILABLE:
2606 health_code = "PG_AVAILABILITY";
2607 sev = HEALTH_WARN;
2608 summary = "Reduced data availability: ";
2609 break;
2610 case DEGRADED:
2611 health_code = "PG_DEGRADED";
2612 summary = "Degraded data redundancy: ";
2613 sev = HEALTH_WARN;
2614 break;
eafe8130
TL
2615 case BACKFILL_FULL:
2616 health_code = "PG_BACKFILL_FULL";
2617 summary = "Low space hindering backfill (add storage if this doesn't resolve itself): ";
2618 sev = HEALTH_WARN;
224ce89b
WB
2619 break;
2620 case DAMAGED:
2621 health_code = "PG_DAMAGED";
2622 summary = "Possible data damage: ";
2623 sev = HEALTH_ERR;
2624 break;
eafe8130
TL
2625 case RECOVERY_FULL:
2626 health_code = "PG_RECOVERY_FULL";
2627 summary = "Full OSDs blocking recovery: ";
2628 sev = HEALTH_ERR;
2629 break;
224ce89b 2630 default:
11fdf7f2 2631 ceph_abort();
224ce89b
WB
2632 }
2633
2634 if (i.first == DEGRADED) {
2635 if (pg_sum.stats.sum.num_objects_degraded &&
2636 pg_sum.stats.sum.num_object_copies > 0) {
2637 double pc = (double)pg_sum.stats.sum.num_objects_degraded /
2638 (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
2639 char b[20];
2640 snprintf(b, sizeof(b), "%.3lf", pc);
2641 ostringstream ss;
2642 ss << pg_sum.stats.sum.num_objects_degraded
2643 << "/" << pg_sum.stats.sum.num_object_copies << " objects degraded ("
2644 << b << "%)";
2645
2646 // Throw in a comma for the benefit of the following PG counts
2647 summary += ss.str() + ", ";
2648 }
2649 }
2650
2651 // Compose summary message saying how many PGs in what states led
2652 // to this health check failing
2653 std::vector<std::string> pg_msgs;
9f95a23c 2654 int64_t count = 0;
224ce89b
WB
2655 for (const auto &j : i.second.states) {
2656 std::ostringstream msg;
2657 msg << j.second << (j.second > 1 ? " pgs " : " pg ") << state_name(j.first);
2658 pg_msgs.push_back(msg.str());
9f95a23c 2659 count += j.second;
224ce89b
WB
2660 }
2661 summary += joinify(pg_msgs.begin(), pg_msgs.end(), std::string(", "));
2662
224ce89b
WB
2663 health_check_t *check = &checks->add(
2664 health_code,
2665 sev,
9f95a23c
TL
2666 summary,
2667 count);
224ce89b
WB
2668
2669 // Compose list of PGs contributing to this health check failing
2670 for (const auto &j : i.second.pg_messages) {
2671 check->detail.push_back(j.second);
2672 }
2673 }
2674
224ce89b
WB
2675 // OSD_SCRUB_ERRORS
2676 if (pg_sum.stats.sum.num_scrub_errors) {
2677 ostringstream ss;
2678 ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors";
9f95a23c
TL
2679 checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str(),
2680 pg_sum.stats.sum.num_scrub_errors);
224ce89b
WB
2681 }
2682
28e407b8
AA
2683 // LARGE_OMAP_OBJECTS
2684 if (pg_sum.stats.sum.num_large_omap_objects) {
2685 list<string> detail;
2686 for (auto &pool : pools) {
2687 const string& pool_name = osdmap.get_pool_name(pool.first);
2688 auto it2 = pg_pool_sum.find(pool.first);
2689 if (it2 == pg_pool_sum.end()) {
2690 continue;
2691 }
2692 const pool_stat_t *pstat = &it2->second;
2693 if (pstat == nullptr) {
2694 continue;
2695 }
2696 const object_stat_sum_t& sum = pstat->stats.sum;
2697 if (sum.num_large_omap_objects) {
2698 stringstream ss;
2699 ss << sum.num_large_omap_objects << " large objects found in pool "
2700 << "'" << pool_name << "'";
2701 detail.push_back(ss.str());
2702 }
2703 }
2704 if (!detail.empty()) {
2705 ostringstream ss;
2706 ss << pg_sum.stats.sum.num_large_omap_objects << " large omap objects";
9f95a23c
TL
2707 auto& d = checks->add("LARGE_OMAP_OBJECTS", HEALTH_WARN, ss.str(),
2708 pg_sum.stats.sum.num_large_omap_objects);
28e407b8
AA
2709 stringstream tip;
2710 tip << "Search the cluster log for 'Large omap object found' for more "
2711 << "details.";
2712 detail.push_back(tip.str());
2713 d.detail.swap(detail);
2714 }
2715 }
2716
224ce89b
WB
2717 // CACHE_POOL_NEAR_FULL
2718 {
2719 list<string> detail;
2720 unsigned num_pools = 0;
2721 for (auto& p : pools) {
2722 if ((!p.second.target_max_objects && !p.second.target_max_bytes) ||
2723 !pg_pool_sum.count(p.first)) {
2724 continue;
2725 }
2726 bool nearfull = false;
2727 const string& name = osdmap.get_pool_name(p.first);
2728 const pool_stat_t& st = get_pg_pool_sum_stat(p.first);
2729 uint64_t ratio = p.second.cache_target_full_ratio_micro +
2730 ((1000000 - p.second.cache_target_full_ratio_micro) *
2731 cct->_conf->mon_cache_target_full_warn_ratio);
2732 if (p.second.target_max_objects &&
2733 (uint64_t)(st.stats.sum.num_objects -
2734 st.stats.sum.num_objects_hit_set_archive) >
2735 p.second.target_max_objects * (ratio / 1000000.0)) {
2736 ostringstream ss;
2737 ss << "cache pool '" << name << "' with "
1adf2230 2738 << si_u_t(st.stats.sum.num_objects)
224ce89b 2739 << " objects at/near target max "
1adf2230 2740 << si_u_t(p.second.target_max_objects) << " objects";
224ce89b
WB
2741 detail.push_back(ss.str());
2742 nearfull = true;
2743 }
2744 if (p.second.target_max_bytes &&
2745 (uint64_t)(st.stats.sum.num_bytes -
2746 st.stats.sum.num_bytes_hit_set_archive) >
2747 p.second.target_max_bytes * (ratio / 1000000.0)) {
2748 ostringstream ss;
2749 ss << "cache pool '" << name
1adf2230
AA
2750 << "' with " << byte_u_t(st.stats.sum.num_bytes)
2751 << " at/near target max "
2752 << byte_u_t(p.second.target_max_bytes);
224ce89b
WB
2753 detail.push_back(ss.str());
2754 nearfull = true;
2755 }
2756 if (nearfull) {
2757 ++num_pools;
2758 }
2759 }
2760 if (!detail.empty()) {
2761 ostringstream ss;
2762 ss << num_pools << " cache pools at or near target size";
9f95a23c
TL
2763 auto& d = checks->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN, ss.str(),
2764 num_pools);
224ce89b
WB
2765 d.detail.swap(detail);
2766 }
2767 }
2768
2769 // TOO_FEW_PGS
3efd9988
FG
2770 unsigned num_in = osdmap.get_num_in_osds();
2771 auto sum_pg_up = std::max(static_cast<size_t>(pg_sum.up), pg_stat.size());
2772 const auto min_pg_per_osd =
11fdf7f2 2773 cct->_conf.get_val<uint64_t>("mon_pg_warn_min_per_osd");
3efd9988
FG
2774 if (num_in && min_pg_per_osd > 0 && osdmap.get_pools().size() > 0) {
2775 auto per = sum_pg_up / num_in;
2776 if (per < min_pg_per_osd && per) {
224ce89b
WB
2777 ostringstream ss;
2778 ss << "too few PGs per OSD (" << per
3efd9988 2779 << " < min " << min_pg_per_osd << ")";
9f95a23c
TL
2780 checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str(),
2781 min_pg_per_osd - per);
224ce89b
WB
2782 }
2783 }
2784
2785 // TOO_MANY_PGS
11fdf7f2 2786 auto max_pg_per_osd = cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd");
3efd9988
FG
2787 if (num_in && max_pg_per_osd > 0) {
2788 auto per = sum_pg_up / num_in;
2789 if (per > max_pg_per_osd) {
224ce89b
WB
2790 ostringstream ss;
2791 ss << "too many PGs per OSD (" << per
3efd9988 2792 << " > max " << max_pg_per_osd << ")";
9f95a23c
TL
2793 checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str(),
2794 per - max_pg_per_osd);
224ce89b
WB
2795 }
2796 }
2797
eafe8130
TL
2798 // TOO_FEW_OSDS
2799 auto warn_too_few_osds = cct->_conf.get_val<bool>("mon_warn_on_too_few_osds");
2800 auto osd_pool_default_size = cct->_conf.get_val<uint64_t>("osd_pool_default_size");
2801 if (warn_too_few_osds && osdmap.get_num_osds() < osd_pool_default_size) {
2802 ostringstream ss;
2803 ss << "OSD count " << osdmap.get_num_osds()
2804 << " < osd_pool_default_size " << osd_pool_default_size;
9f95a23c
TL
2805 checks->add("TOO_FEW_OSDS", HEALTH_WARN, ss.str(),
2806 osd_pool_default_size - osdmap.get_num_osds());
eafe8130
TL
2807 }
2808
2809 // SLOW_PING_TIME
2810 // Convert milliseconds to microseconds
2811 auto warn_slow_ping_time = cct->_conf.get_val<double>("mon_warn_on_slow_ping_time") * 1000;
2812 auto grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
2813 if (warn_slow_ping_time == 0) {
2814 double ratio = cct->_conf.get_val<double>("mon_warn_on_slow_ping_ratio");
2815 warn_slow_ping_time = grace;
2816 warn_slow_ping_time *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
2817 }
2818 if (warn_slow_ping_time > 0) {
2819
2820 struct mon_ping_item_t {
2821 uint32_t pingtime;
2822 int from;
2823 int to;
2824 bool improving;
2825
2826 bool operator<(const mon_ping_item_t& rhs) const {
2827 if (pingtime < rhs.pingtime)
2828 return true;
2829 if (pingtime > rhs.pingtime)
2830 return false;
2831 if (from < rhs.from)
2832 return true;
2833 if (from > rhs.from)
2834 return false;
2835 return to < rhs.to;
2836 }
2837 };
2838
2839 list<string> detail_back;
2840 list<string> detail_front;
f6b5b4d7 2841 list<string> detail;
eafe8130
TL
2842 set<mon_ping_item_t> back_sorted, front_sorted;
2843 for (auto i : osd_stat) {
2844 for (auto j : i.second.hb_pingtime) {
2845
2846 // Maybe source info is old
2847 if (now.sec() - j.second.last_update > grace * 60)
2848 continue;
2849
2850 mon_ping_item_t back;
2851 back.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
2852 back.pingtime = std::max(back.pingtime, j.second.back_pingtime[2]);
2853 back.from = i.first;
2854 back.to = j.first;
2855 if (back.pingtime > warn_slow_ping_time) {
2856 back.improving = (j.second.back_pingtime[0] < j.second.back_pingtime[1]
2857 && j.second.back_pingtime[1] < j.second.back_pingtime[2]);
2858 back_sorted.emplace(back);
2859 }
2860
2861 mon_ping_item_t front;
2862 front.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
2863 front.pingtime = std::max(front.pingtime, j.second.front_pingtime[2]);
2864 front.from = i.first;
2865 front.to = j.first;
2866 if (front.pingtime > warn_slow_ping_time) {
2867 front.improving = (j.second.front_pingtime[0] < j.second.front_pingtime[1]
2868 && j.second.front_pingtime[1] < j.second.back_pingtime[2]);
2869 front_sorted.emplace(front);
2870 }
2871 }
f6b5b4d7
TL
2872 if (i.second.num_shards_repaired >
2873 cct->_conf.get_val<uint64_t>("mon_osd_warn_num_repaired")) {
2874 ostringstream ss;
2875 ss << "osd." << i.first << " had " << i.second.num_shards_repaired << " reads repaired";
2876 detail.push_back(ss.str());
2877 }
2878 }
2879 if (!detail.empty()) {
2880 ostringstream ss;
2881 ss << "Too many repaired reads on " << detail.size() << " OSDs";
2882 auto& d = checks->add("OSD_TOO_MANY_REPAIRS", HEALTH_WARN, ss.str(),
2883 detail.size());
2884 d.detail.swap(detail);
eafe8130
TL
2885 }
2886 int max_detail = 10;
2887 for (auto &sback : boost::adaptors::reverse(back_sorted)) {
2888 ostringstream ss;
2889 if (max_detail == 0) {
2890 ss << "Truncated long network list. Use ceph daemon mgr.# dump_osd_network for more information";
2891 detail_back.push_back(ss.str());
2892 break;
2893 }
2894 max_detail--;
9f95a23c
TL
2895 ss << "Slow OSD heartbeats on back from osd." << sback.from
2896 << " [" << osd_parentage(osdmap, sback.from) << "]"
eafe8130
TL
2897 << (osdmap.is_down(sback.from) ? " (down)" : "")
2898 << " to osd." << sback.to
9f95a23c 2899 << " [" << osd_parentage(osdmap, sback.to) << "]"
eafe8130
TL
2900 << (osdmap.is_down(sback.to) ? " (down)" : "")
2901 << " " << fixed_u_to_string(sback.pingtime, 3) << " msec"
2902 << (sback.improving ? " possibly improving" : "");
2903 detail_back.push_back(ss.str());
2904 }
2905 max_detail = 10;
2906 for (auto &sfront : boost::adaptors::reverse(front_sorted)) {
2907 ostringstream ss;
2908 if (max_detail == 0) {
2909 ss << "Truncated long network list. Use ceph daemon mgr.# dump_osd_network for more information";
2910 detail_front.push_back(ss.str());
2911 break;
2912 }
2913 max_detail--;
9f95a23c
TL
2914 // Get crush parentage for each osd
2915 ss << "Slow OSD heartbeats on front from osd." << sfront.from
2916 << " [" << osd_parentage(osdmap, sfront.from) << "]"
eafe8130
TL
2917 << (osdmap.is_down(sfront.from) ? " (down)" : "")
2918 << " to osd." << sfront.to
9f95a23c 2919 << " [" << osd_parentage(osdmap, sfront.to) << "]"
eafe8130
TL
2920 << (osdmap.is_down(sfront.to) ? " (down)" : "")
2921 << " " << fixed_u_to_string(sfront.pingtime, 3) << " msec"
2922 << (sfront.improving ? " possibly improving" : "");
2923 detail_front.push_back(ss.str());
2924 }
2925 if (detail_back.size() != 0) {
2926 ostringstream ss;
9f95a23c
TL
2927 ss << "Slow OSD heartbeats on back (longest "
2928 << fixed_u_to_string(back_sorted.rbegin()->pingtime, 3) << "ms)";
2929 auto& d = checks->add("OSD_SLOW_PING_TIME_BACK", HEALTH_WARN, ss.str(),
2930 back_sorted.size());
eafe8130
TL
2931 d.detail.swap(detail_back);
2932 }
2933 if (detail_front.size() != 0) {
2934 ostringstream ss;
9f95a23c
TL
2935 ss << "Slow OSD heartbeats on front (longest "
2936 << fixed_u_to_string(front_sorted.rbegin()->pingtime, 3) << "ms)";
2937 auto& d = checks->add("OSD_SLOW_PING_TIME_FRONT", HEALTH_WARN, ss.str(),
2938 front_sorted.size());
eafe8130
TL
2939 d.detail.swap(detail_front);
2940 }
2941 }
2942
224ce89b
WB
2943 // SMALLER_PGP_NUM
2944 // MANY_OBJECTS_PER_PG
2945 if (!pg_stat.empty()) {
2946 list<string> pgp_detail, many_detail;
b32b8144 2947 const auto mon_pg_warn_min_objects =
11fdf7f2 2948 cct->_conf.get_val<int64_t>("mon_pg_warn_min_objects");
b32b8144 2949 const auto mon_pg_warn_min_pool_objects =
11fdf7f2 2950 cct->_conf.get_val<int64_t>("mon_pg_warn_min_pool_objects");
b32b8144 2951 const auto mon_pg_warn_max_object_skew =
11fdf7f2 2952 cct->_conf.get_val<double>("mon_pg_warn_max_object_skew");
224ce89b
WB
2953 for (auto p = pg_pool_sum.begin();
2954 p != pg_pool_sum.end();
2955 ++p) {
2956 const pg_pool_t *pi = osdmap.get_pg_pool(p->first);
2957 if (!pi)
2958 continue; // in case osdmap changes haven't propagated to PGMap yet
2959 const string& name = osdmap.get_pool_name(p->first);
11fdf7f2
TL
2960 // NOTE: we use pg_num_target and pgp_num_target for the purposes of
2961 // the warnings. If the cluster is failing to converge on the target
2962 // values that is a separate issue!
2963 if (pi->get_pg_num_target() > pi->get_pgp_num_target() &&
224ce89b
WB
2964 !(name.find(".DELETED") != string::npos &&
2965 cct->_conf->mon_fake_pool_delete)) {
2966 ostringstream ss;
2967 ss << "pool " << name << " pg_num "
11fdf7f2
TL
2968 << pi->get_pg_num_target()
2969 << " > pgp_num " << pi->get_pgp_num_target();
224ce89b
WB
2970 pgp_detail.push_back(ss.str());
2971 }
2972 int average_objects_per_pg = pg_sum.stats.sum.num_objects / pg_stat.size();
2973 if (average_objects_per_pg > 0 &&
b32b8144
FG
2974 pg_sum.stats.sum.num_objects >= mon_pg_warn_min_objects &&
2975 p->second.stats.sum.num_objects >= mon_pg_warn_min_pool_objects) {
11fdf7f2
TL
2976 int objects_per_pg = p->second.stats.sum.num_objects /
2977 pi->get_pg_num_target();
224ce89b 2978 float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
b32b8144
FG
2979 if (mon_pg_warn_max_object_skew > 0 &&
2980 ratio > mon_pg_warn_max_object_skew) {
224ce89b
WB
2981 ostringstream ss;
2982 ss << "pool " << name << " objects per pg ("
2983 << objects_per_pg << ") is more than " << ratio
2984 << " times cluster average ("
2985 << average_objects_per_pg << ")";
2986 many_detail.push_back(ss.str());
2987 }
2988 }
2989 }
2990 if (!pgp_detail.empty()) {
2991 ostringstream ss;
2992 ss << pgp_detail.size() << " pools have pg_num > pgp_num";
9f95a23c
TL
2993 auto& d = checks->add("SMALLER_PGP_NUM", HEALTH_WARN, ss.str(),
2994 pgp_detail.size());
224ce89b
WB
2995 d.detail.swap(pgp_detail);
2996 }
2997 if (!many_detail.empty()) {
2998 ostringstream ss;
2999 ss << many_detail.size() << " pools have many more objects per pg than"
3000 << " average";
9f95a23c
TL
3001 auto& d = checks->add("MANY_OBJECTS_PER_PG", HEALTH_WARN, ss.str(),
3002 many_detail.size());
224ce89b
WB
3003 d.detail.swap(many_detail);
3004 }
3005 }
3006
3007 // POOL_FULL
3008 // POOL_NEAR_FULL
3009 {
11fdf7f2
TL
3010 float warn_threshold = (float)g_conf().get_val<int64_t>("mon_pool_quota_warn_threshold")/100;
3011 float crit_threshold = (float)g_conf().get_val<int64_t>("mon_pool_quota_crit_threshold")/100;
224ce89b
WB
3012 list<string> full_detail, nearfull_detail;
3013 unsigned full_pools = 0, nearfull_pools = 0;
3014 for (auto it : pools) {
3015 auto it2 = pg_pool_sum.find(it.first);
3016 if (it2 == pg_pool_sum.end()) {
3017 continue;
3018 }
3019 const pool_stat_t *pstat = &it2->second;
3020 const object_stat_sum_t& sum = pstat->stats.sum;
3021 const string& pool_name = osdmap.get_pool_name(it.first);
3022 const pg_pool_t &pool = it.second;
3023 bool full = false, nearfull = false;
3024 if (pool.quota_max_objects > 0) {
3025 stringstream ss;
3026 if ((uint64_t)sum.num_objects >= pool.quota_max_objects) {
3027 } else if (crit_threshold > 0 &&
3028 sum.num_objects >= pool.quota_max_objects*crit_threshold) {
3029 ss << "pool '" << pool_name
3030 << "' has " << sum.num_objects << " objects"
3031 << " (max " << pool.quota_max_objects << ")";
3032 full_detail.push_back(ss.str());
3033 full = true;
3034 } else if (warn_threshold > 0 &&
3035 sum.num_objects >= pool.quota_max_objects*warn_threshold) {
3036 ss << "pool '" << pool_name
3037 << "' has " << sum.num_objects << " objects"
3038 << " (max " << pool.quota_max_objects << ")";
3039 nearfull_detail.push_back(ss.str());
3040 nearfull = true;
3041 }
3042 }
3043 if (pool.quota_max_bytes > 0) {
3044 stringstream ss;
3045 if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
3046 } else if (crit_threshold > 0 &&
3047 sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
3048 ss << "pool '" << pool_name
1adf2230
AA
3049 << "' has " << byte_u_t(sum.num_bytes)
3050 << " (max " << byte_u_t(pool.quota_max_bytes) << ")";
224ce89b
WB
3051 full_detail.push_back(ss.str());
3052 full = true;
3053 } else if (warn_threshold > 0 &&
3054 sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
3055 ss << "pool '" << pool_name
1adf2230
AA
3056 << "' has " << byte_u_t(sum.num_bytes)
3057 << " (max " << byte_u_t(pool.quota_max_bytes) << ")";
224ce89b
WB
3058 nearfull_detail.push_back(ss.str());
3059 nearfull = true;
3060 }
3061 }
3062 if (full) {
3063 ++full_pools;
3064 }
3065 if (nearfull) {
3066 ++nearfull_pools;
3067 }
3068 }
3069 if (full_pools) {
3070 ostringstream ss;
3071 ss << full_pools << " pools full";
9f95a23c 3072 auto& d = checks->add("POOL_FULL", HEALTH_ERR, ss.str(), full_pools);
224ce89b
WB
3073 d.detail.swap(full_detail);
3074 }
3075 if (nearfull_pools) {
3076 ostringstream ss;
11fdf7f2 3077 ss << nearfull_pools << " pools nearfull";
9f95a23c 3078 auto& d = checks->add("POOL_NEAR_FULL", HEALTH_WARN, ss.str(), nearfull_pools);
224ce89b
WB
3079 d.detail.swap(nearfull_detail);
3080 }
3081 }
3082
3083 // OBJECT_MISPLACED
3084 if (pg_sum.stats.sum.num_objects_misplaced &&
11fdf7f2
TL
3085 pg_sum.stats.sum.num_object_copies > 0 &&
3086 cct->_conf->mon_warn_on_misplaced) {
224ce89b
WB
3087 double pc = (double)pg_sum.stats.sum.num_objects_misplaced /
3088 (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
3089 char b[20];
3090 snprintf(b, sizeof(b), "%.3lf", pc);
3091 ostringstream ss;
3092 ss << pg_sum.stats.sum.num_objects_misplaced
3093 << "/" << pg_sum.stats.sum.num_object_copies << " objects misplaced ("
3094 << b << "%)";
9f95a23c
TL
3095 checks->add("OBJECT_MISPLACED", HEALTH_WARN, ss.str(),
3096 pg_sum.stats.sum.num_objects_misplaced);
224ce89b
WB
3097 }
3098
3099 // OBJECT_UNFOUND
3100 if (pg_sum.stats.sum.num_objects_unfound &&
3101 pg_sum.stats.sum.num_objects) {
3102 double pc = (double)pg_sum.stats.sum.num_objects_unfound /
3103 (double)pg_sum.stats.sum.num_objects * (double)100.0;
3104 char b[20];
3105 snprintf(b, sizeof(b), "%.3lf", pc);
3106 ostringstream ss;
3107 ss << pg_sum.stats.sum.num_objects_unfound
b5b8bbf5 3108 << "/" << pg_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
9f95a23c
TL
3109 auto& d = checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str(),
3110 pg_sum.stats.sum.num_objects_unfound);
c07f9fc5
FG
3111
3112 for (auto& p : pg_stat) {
3113 if (p.second.stats.sum.num_objects_unfound) {
3114 ostringstream ss;
3115 ss << "pg " << p.first
3116 << " has " << p.second.stats.sum.num_objects_unfound
3117 << " unfound objects";
3118 d.detail.push_back(ss.str());
3119 if (d.detail.size() > max) {
3120 d.detail.push_back("(additional pgs left out for brevity)");
3121 break;
3122 }
3123 }
3124 }
224ce89b
WB
3125 }
3126
3127 // REQUEST_SLOW
3128 // REQUEST_STUCK
11fdf7f2 3129 // SLOW_OPS unifies them in mimic.
9f95a23c 3130 if (osdmap.require_osd_release < ceph_release_t::mimic &&
11fdf7f2 3131 cct->_conf->mon_osd_warn_op_age > 0 &&
c07f9fc5
FG
3132 !osd_sum.op_queue_age_hist.h.empty() &&
3133 osd_sum.op_queue_age_hist.upper_bound() / 1000.0 >
224ce89b
WB
3134 cct->_conf->mon_osd_warn_op_age) {
3135 list<string> warn_detail, error_detail;
3136 unsigned warn = 0, error = 0;
3137 float err_age =
3138 cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio;
3139 const pow2_hist_t& h = osd_sum.op_queue_age_hist;
3140 for (unsigned i = h.h.size() - 1; i > 0; --i) {
3141 float ub = (float)(1 << i) / 1000.0;
3142 if (ub < cct->_conf->mon_osd_warn_op_age)
3143 break;
3144 if (h.h[i]) {
3145 ostringstream ss;
3146 ss << h.h[i] << " ops are blocked > " << ub << " sec";
3147 if (ub > err_age) {
3148 error += h.h[i];
3149 error_detail.push_back(ss.str());
3150 } else {
3151 warn += h.h[i];
3152 warn_detail.push_back(ss.str());
3153 }
3154 }
3155 }
3156
3157 map<float,set<int>> warn_osd_by_max; // max -> osds
3158 map<float,set<int>> error_osd_by_max; // max -> osds
3159 if (!warn_detail.empty() || !error_detail.empty()) {
3160 for (auto& p : osd_stat) {
3161 const pow2_hist_t& h = p.second.op_queue_age_hist;
3162 for (unsigned i = h.h.size() - 1; i > 0; --i) {
3163 float ub = (float)(1 << i) / 1000.0;
3164 if (ub < cct->_conf->mon_osd_warn_op_age)
3165 break;
3166 if (h.h[i]) {
3167 if (ub > err_age) {
3168 error_osd_by_max[ub].insert(p.first);
3169 } else {
3170 warn_osd_by_max[ub].insert(p.first);
3171 }
3172 break;
3173 }
3174 }
3175 }
3176 }
3177
3178 if (!warn_detail.empty()) {
11fdf7f2
TL
3179 ostringstream ss;
3180 ss << warn << " slow requests are blocked > "
3181 << cct->_conf->mon_osd_warn_op_age << " sec";
9f95a23c 3182 auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str(), warn);
11fdf7f2 3183 d.detail.swap(warn_detail);
224ce89b
WB
3184 int left = max;
3185 for (auto& p : warn_osd_by_max) {
3186 ostringstream ss;
3187 if (p.second.size() > 1) {
c07f9fc5
FG
3188 ss << "osds " << p.second
3189 << " have blocked requests > " << p.first << " sec";
224ce89b 3190 } else {
c07f9fc5
FG
3191 ss << "osd." << *p.second.begin()
3192 << " has blocked requests > " << p.first << " sec";
224ce89b 3193 }
11fdf7f2 3194 d.detail.push_back(ss.str());
224ce89b
WB
3195 if (--left == 0) {
3196 break;
3197 }
3198 }
3199 }
3200 if (!error_detail.empty()) {
11fdf7f2
TL
3201 ostringstream ss;
3202 ss << error << " stuck requests are blocked > "
3203 << err_age << " sec";
9f95a23c 3204 auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str(), error);
11fdf7f2 3205 d.detail.swap(error_detail);
224ce89b
WB
3206 int left = max;
3207 for (auto& p : error_osd_by_max) {
3208 ostringstream ss;
3209 if (p.second.size() > 1) {
c07f9fc5
FG
3210 ss << "osds " << p.second
3211 << " have stuck requests > " << p.first << " sec";
224ce89b 3212 } else {
c07f9fc5
FG
3213 ss << "osd." << *p.second.begin()
3214 << " has stuck requests > " << p.first << " sec";
224ce89b 3215 }
11fdf7f2 3216 d.detail.push_back(ss.str());
224ce89b
WB
3217 if (--left == 0) {
3218 break;
3219 }
3220 }
3221 }
3222 }
7c673cae 3223
11fdf7f2
TL
3224 // OBJECT_STORE_WARN
3225 if (osd_sum.os_alerts.size()) {
3226 map<string, pair<size_t, list<string>>> os_alerts_sum;
3227
3228 for (auto& a : osd_sum.os_alerts) {
3229 int left = max;
3230 string s0 = " osd.";
3231 s0 += stringify(a.first);
3232 for (auto& aa : a.second) {
3233 string s(s0);
3234 s += " ";
3235 s += aa.second;
3236 auto it = os_alerts_sum.find(aa.first);
3237 if (it == os_alerts_sum.end()) {
3238 list<string> d;
3239 d.emplace_back(s);
3240 os_alerts_sum.emplace(aa.first, std::make_pair(1, d));
3241 } else {
3242 auto& p = it->second;
3243 ++p.first;
3244 p.second.emplace_back(s);
3245 }
3246 if (--left == 0) {
3247 break;
3248 }
3249 }
3250 }
3251
3252 for (auto& asum : os_alerts_sum) {
9f95a23c 3253 string summary = stringify(asum.second.first) + " OSD(s)";
11fdf7f2 3254 if (asum.first == "BLUEFS_SPILLOVER") {
9f95a23c 3255 summary += " experiencing BlueFS spillover";
11fdf7f2 3256 } else if (asum.first == "BLUESTORE_NO_COMPRESSION") {
9f95a23c 3257 summary += " have broken BlueStore compression";
81eedcae 3258 } else if (asum.first == "BLUESTORE_LEGACY_STATFS") {
9f95a23c 3259 summary += " reporting legacy (not per-pool) BlueStore stats";
81eedcae 3260 } else if (asum.first == "BLUESTORE_DISK_SIZE_MISMATCH") {
9f95a23c
TL
3261 summary += " have dangerous mismatch between BlueStore block device and free list sizes";
3262 } else if (asum.first == "BLUESTORE_NO_PER_POOL_OMAP") {
3263 summary += " reporting legacy (not per-pool) BlueStore omap usage stats";
11fdf7f2 3264 }
9f95a23c 3265 auto& d = checks->add(asum.first, HEALTH_WARN, summary, asum.second.first);
11fdf7f2
TL
3266 for (auto& s : asum.second.second) {
3267 d.detail.push_back(s);
3268 }
3269 }
3270 }
224ce89b
WB
3271 // PG_NOT_SCRUBBED
3272 // PG_NOT_DEEP_SCRUBBED
11fdf7f2
TL
3273 if (cct->_conf->mon_warn_pg_not_scrubbed_ratio ||
3274 cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
a8e16298
TL
3275 list<string> detail, deep_detail;
3276 int detail_max = max, deep_detail_max = max;
3277 int detail_more = 0, deep_detail_more = 0;
3278 int detail_total = 0, deep_detail_total = 0;
3279 for (auto& p : pg_stat) {
3280 int64_t pnum = p.first.pool();
3281 auto pool = osdmap.get_pg_pool(pnum);
3282 if (!pool)
3283 continue;
11fdf7f2 3284 if (cct->_conf->mon_warn_pg_not_scrubbed_ratio) {
a8e16298
TL
3285 double scrub_max_interval = 0;
3286 pool->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
3287 if (scrub_max_interval <= 0) {
3288 scrub_max_interval = cct->_conf->osd_scrub_max_interval;
c07f9fc5 3289 }
11fdf7f2 3290 const double age = (cct->_conf->mon_warn_pg_not_scrubbed_ratio * scrub_max_interval) +
a8e16298
TL
3291 scrub_max_interval;
3292 utime_t cutoff = now;
3293 cutoff -= age;
3294 if (p.second.last_scrub_stamp < cutoff) {
3295 if (detail_max > 0) {
3296 ostringstream ss;
3297 ss << "pg " << p.first << " not scrubbed since "
3298 << p.second.last_scrub_stamp;
3299 detail.push_back(ss.str());
3300 --detail_max;
3301 } else {
3302 ++detail_more;
3303 }
3304 ++detail_total;
3305 }
3306 }
11fdf7f2 3307 if (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
a8e16298
TL
3308 double deep_scrub_interval = 0;
3309 pool->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
3310 if (deep_scrub_interval <= 0) {
3311 deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
3312 }
11fdf7f2 3313 double deep_age = (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio * deep_scrub_interval) +
a8e16298
TL
3314 deep_scrub_interval;
3315 utime_t deep_cutoff = now;
3316 deep_cutoff -= deep_age;
3317 if (p.second.last_deep_scrub_stamp < deep_cutoff) {
3318 if (deep_detail_max > 0) {
3319 ostringstream ss;
3320 ss << "pg " << p.first << " not deep-scrubbed since "
3321 << p.second.last_deep_scrub_stamp;
3322 deep_detail.push_back(ss.str());
3323 --deep_detail_max;
3324 } else {
3325 ++deep_detail_more;
3326 }
3327 ++deep_detail_total;
c07f9fc5 3328 }
224ce89b 3329 }
a8e16298
TL
3330 }
3331 if (detail_total) {
3332 ostringstream ss;
3333 ss << detail_total << " pgs not scrubbed in time";
9f95a23c 3334 auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str(), detail_total);
a8e16298 3335
c07f9fc5 3336 if (!detail.empty()) {
c07f9fc5 3337 d.detail.swap(detail);
a8e16298
TL
3338
3339 if (detail_more) {
3340 ostringstream ss;
3341 ss << detail_more << " more pgs... ";
3342 d.detail.push_back(ss.str());
3343 }
c07f9fc5 3344 }
a8e16298
TL
3345 }
3346 if (deep_detail_total) {
3347 ostringstream ss;
3348 ss << deep_detail_total << " pgs not deep-scrubbed in time";
9f95a23c
TL
3349 auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str(),
3350 deep_detail_total);
a8e16298 3351
c07f9fc5 3352 if (!deep_detail.empty()) {
c07f9fc5 3353 d.detail.swap(deep_detail);
a8e16298
TL
3354
3355 if (deep_detail_more) {
3356 ostringstream ss;
3357 ss << deep_detail_more << " more pgs... ";
3358 d.detail.push_back(ss.str());
3359 }
c07f9fc5
FG
3360 }
3361 }
3362 }
3363
3364 // POOL_APP
11fdf7f2 3365 if (g_conf().get_val<bool>("mon_warn_on_pool_no_app")) {
c07f9fc5
FG
3366 list<string> detail;
3367 for (auto &it : pools) {
3368 const pg_pool_t &pool = it.second;
3369 const string& pool_name = osdmap.get_pool_name(it.first);
3370 auto it2 = pg_pool_sum.find(it.first);
3371 if (it2 == pg_pool_sum.end()) {
3372 continue;
3373 }
3374 const pool_stat_t *pstat = &it2->second;
3375 if (pstat == nullptr) {
3376 continue;
3377 }
3378 const object_stat_sum_t& sum = pstat->stats.sum;
3379 // application metadata is not encoded until luminous is minimum
3380 // required release
11fdf7f2
TL
3381 if (sum.num_objects > 0 && pool.application_metadata.empty() &&
3382 !pool.is_tier()) {
c07f9fc5
FG
3383 stringstream ss;
3384 ss << "application not enabled on pool '" << pool_name << "'";
3385 detail.push_back(ss.str());
224ce89b
WB
3386 }
3387 }
3388 if (!detail.empty()) {
3389 ostringstream ss;
9f95a23c
TL
3390 ss << detail.size() << " pool(s) do not have an application enabled";
3391 auto& d = checks->add("POOL_APP_NOT_ENABLED", HEALTH_WARN, ss.str(),
3392 detail.size());
c07f9fc5
FG
3393 stringstream tip;
3394 tip << "use 'ceph osd pool application enable <pool-name> "
3395 << "<app-name>', where <app-name> is 'cephfs', 'rbd', 'rgw', "
3396 << "or freeform for custom applications.";
3397 detail.push_back(tip.str());
224ce89b
WB
3398 d.detail.swap(detail);
3399 }
31f18b77 3400 }
b32b8144
FG
3401
3402 // PG_SLOW_SNAP_TRIMMING
3403 if (!pg_stat.empty() && cct->_conf->mon_osd_snap_trim_queue_warn_on > 0) {
3404 uint32_t snapthreshold = cct->_conf->mon_osd_snap_trim_queue_warn_on;
3405 uint64_t snaptrimq_exceeded = 0;
3406 uint32_t longest_queue = 0;
3407 const pg_t* longest_q_pg = nullptr;
3408 list<string> detail;
3409
3410 for (auto& i: pg_stat) {
3411 uint32_t current_len = i.second.snaptrimq_len;
3412 if (current_len >= snapthreshold) {
3413 snaptrimq_exceeded++;
3414 if (longest_queue <= current_len) {
3415 longest_q_pg = &i.first;
3416 longest_queue = current_len;
3417 }
3418 if (detail.size() < max - 1) {
3419 stringstream ss;
3420 ss << "snap trim queue for pg " << i.first << " at " << current_len;
3421 detail.push_back(ss.str());
3422 continue;
3423 }
3424 if (detail.size() < max) {
3425 detail.push_back("...more pgs affected");
3426 continue;
3427 }
3428 }
3429 }
3430
3431 if (snaptrimq_exceeded) {
3432 {
3433 ostringstream ss;
3434 ss << "longest queue on pg " << *longest_q_pg << " at " << longest_queue;
3435 detail.push_back(ss.str());
3436 }
3437
3438 stringstream ss;
3439 ss << "snap trim queue for " << snaptrimq_exceeded << " pg(s) >= " << snapthreshold << " (mon_osd_snap_trim_queue_warn_on)";
9f95a23c
TL
3440 auto& d = checks->add("PG_SLOW_SNAP_TRIMMING", HEALTH_WARN, ss.str(),
3441 snaptrimq_exceeded);
b32b8144
FG
3442 detail.push_back("try decreasing \"osd snap trim sleep\" and/or increasing \"osd pg max concurrent snap trims\".");
3443 d.detail.swap(detail);
3444 }
3445 }
31f18b77 3446}
7c673cae 3447
9f95a23c
TL
3448void PGMap::print_summary(ceph::Formatter *f, ostream *out) const
3449{
3450 if (f) {
3451 f->open_array_section("pgs_by_pool_state");
3452 for (auto& i: num_pg_by_pool_state) {
3453 f->open_object_section("per_pool_pgs_by_state");
3454 f->dump_int("pool_id", i.first);
3455 f->open_array_section("pg_state_counts");
3456 for (auto& j : i.second) {
3457 f->open_object_section("pg_state_count");
3458 f->dump_string("state_name", pg_state_string(j.first));
3459 f->dump_int("count", j.second);
3460 f->close_section();
3461 }
3462 f->close_section();
3463 f->close_section();
3464 }
3465 f->close_section();
3466 }
3467 PGMapDigest::print_summary(f, out);
3468}
3469
7c673cae
FG
3470int process_pg_map_command(
3471 const string& orig_prefix,
11fdf7f2 3472 const cmdmap_t& orig_cmdmap,
7c673cae
FG
3473 const PGMap& pg_map,
3474 const OSDMap& osdmap,
9f95a23c 3475 ceph::Formatter *f,
7c673cae
FG
3476 stringstream *ss,
3477 bufferlist *odata)
3478{
3479 string prefix = orig_prefix;
11fdf7f2
TL
3480 auto cmdmap = orig_cmdmap;
3481
3482 string omap_stats_note =
3483 "\n* NOTE: Omap statistics are gathered during deep scrub and "
9f95a23c 3484 "may be inaccurate soon afterwards depending on utilization. See "
11fdf7f2
TL
3485 "http://docs.ceph.com/docs/master/dev/placement-group/#omap-statistics "
3486 "for further details.\n";
3487 bool omap_stats_note_required = false;
7c673cae
FG
3488
3489 // perhaps these would be better in the parsing, but it's weird
3490 bool primary = false;
3491 if (prefix == "pg dump_json") {
3492 vector<string> v;
3493 v.push_back(string("all"));
3494 cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
3495 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3496 prefix = "pg dump";
3497 } else if (prefix == "pg dump_pools_json") {
3498 vector<string> v;
3499 v.push_back(string("pools"));
3500 cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
3501 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3502 prefix = "pg dump";
3503 } else if (prefix == "pg ls-by-primary") {
3504 primary = true;
3505 prefix = "pg ls";
3506 } else if (prefix == "pg ls-by-osd") {
3507 prefix = "pg ls";
3508 } else if (prefix == "pg ls-by-pool") {
3509 prefix = "pg ls";
3510 string poolstr;
9f95a23c 3511 cmd_getval(cmdmap, "poolstr", poolstr);
7c673cae
FG
3512 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
3513 if (pool < 0) {
3514 *ss << "pool " << poolstr << " does not exist";
3515 return -ENOENT;
3516 }
3517 cmd_putval(g_ceph_context, cmdmap, "pool", pool);
3518 }
3519
7c673cae
FG
3520 stringstream ds;
3521 if (prefix == "pg stat") {
3522 if (f) {
3523 f->open_object_section("pg_summary");
3524 pg_map.print_oneline_summary(f, NULL);
3525 f->close_section();
3526 f->flush(ds);
3527 } else {
3528 ds << pg_map;
3529 }
3530 odata->append(ds);
3531 return 0;
3532 }
3533
3534 if (prefix == "pg getmap") {
3535 pg_map.encode(*odata);
3536 *ss << "got pgmap version " << pg_map.version;
3537 return 0;
3538 }
3539
3540 if (prefix == "pg dump") {
3541 string val;
3542 vector<string> dumpcontents;
3543 set<string> what;
9f95a23c 3544 if (cmd_getval(cmdmap, "dumpcontents", dumpcontents)) {
7c673cae
FG
3545 copy(dumpcontents.begin(), dumpcontents.end(),
3546 inserter(what, what.end()));
3547 }
3548 if (what.empty())
3549 what.insert("all");
3550 if (f) {
3551 if (what.count("all")) {
3552 f->open_object_section("pg_map");
3553 pg_map.dump(f);
3554 f->close_section();
3555 } else if (what.count("summary") || what.count("sum")) {
3556 f->open_object_section("pg_map");
3557 pg_map.dump_basic(f);
3558 f->close_section();
3559 } else {
3560 if (what.count("pools")) {
3561 pg_map.dump_pool_stats(f);
3562 }
3563 if (what.count("osds")) {
3564 pg_map.dump_osd_stats(f);
3565 }
3566 if (what.count("pgs")) {
3567 pg_map.dump_pg_stats(f, false);
3568 }
3569 if (what.count("pgs_brief")) {
3570 pg_map.dump_pg_stats(f, true);
3571 }
3572 if (what.count("delta")) {
3573 f->open_object_section("delta");
3574 pg_map.dump_delta(f);
3575 f->close_section();
3576 }
3577 }
3578 f->flush(*odata);
3579 } else {
3580 if (what.count("all")) {
3581 pg_map.dump(ds);
11fdf7f2 3582 omap_stats_note_required = true;
7c673cae
FG
3583 } else if (what.count("summary") || what.count("sum")) {
3584 pg_map.dump_basic(ds);
3585 pg_map.dump_pg_sum_stats(ds, true);
3586 pg_map.dump_osd_sum_stats(ds);
11fdf7f2 3587 omap_stats_note_required = true;
7c673cae
FG
3588 } else {
3589 if (what.count("pgs_brief")) {
3590 pg_map.dump_pg_stats(ds, true);
3591 }
3592 bool header = true;
3593 if (what.count("pgs")) {
3594 pg_map.dump_pg_stats(ds, false);
3595 header = false;
11fdf7f2 3596 omap_stats_note_required = true;
7c673cae
FG
3597 }
3598 if (what.count("pools")) {
3599 pg_map.dump_pool_stats(ds, header);
11fdf7f2 3600 omap_stats_note_required = true;
7c673cae
FG
3601 }
3602 if (what.count("osds")) {
3603 pg_map.dump_osd_stats(ds);
3604 }
3605 }
3606 odata->append(ds);
11fdf7f2
TL
3607 if (omap_stats_note_required) {
3608 odata->append(omap_stats_note);
3609 }
7c673cae
FG
3610 }
3611 *ss << "dumped " << what;
3612 return 0;
3613 }
3614
3615 if (prefix == "pg ls") {
3616 int64_t osd = -1;
3617 int64_t pool = -1;
3618 vector<string>states;
3619 set<pg_t> pgs;
9f95a23c
TL
3620 cmd_getval(cmdmap, "pool", pool);
3621 cmd_getval(cmdmap, "osd", osd);
3622 cmd_getval(cmdmap, "states", states);
7c673cae
FG
3623 if (pool >= 0 && !osdmap.have_pg_pool(pool)) {
3624 *ss << "pool " << pool << " does not exist";
3625 return -ENOENT;
3626 }
3627 if (osd >= 0 && !osdmap.is_up(osd)) {
3628 *ss << "osd " << osd << " is not up";
3629 return -EAGAIN;
3630 }
3631 if (states.empty())
3632 states.push_back("all");
3633
11fdf7f2 3634 uint64_t state = 0;
7c673cae
FG
3635
3636 while (!states.empty()) {
3637 string state_str = states.back();
3638
3639 if (state_str == "all") {
3640 state = -1;
3641 break;
3642 } else {
3efd9988
FG
3643 auto filter = pg_string_state(state_str);
3644 if (!filter) {
c07f9fc5
FG
3645 *ss << "'" << state_str << "' is not a valid pg state,"
3646 << " available choices: " << pg_state_string(0xFFFFFFFF);
3647 return -EINVAL;
3648 }
3efd9988 3649 state |= *filter;
7c673cae
FG
3650 }
3651
3652 states.pop_back();
3653 }
3654
3655 pg_map.get_filtered_pg_stats(state, pool, osd, primary, pgs);
3656
3657 if (f && !pgs.empty()) {
3658 pg_map.dump_filtered_pg_stats(f, pgs);
3659 f->flush(*odata);
3660 } else if (!pgs.empty()) {
3661 pg_map.dump_filtered_pg_stats(ds, pgs);
3662 odata->append(ds);
11fdf7f2 3663 odata->append(omap_stats_note);
7c673cae
FG
3664 }
3665 return 0;
3666 }
3667
3668 if (prefix == "pg dump_stuck") {
3669 vector<string> stuckop_vec;
9f95a23c 3670 cmd_getval(cmdmap, "stuckops", stuckop_vec);
7c673cae
FG
3671 if (stuckop_vec.empty())
3672 stuckop_vec.push_back("unclean");
3673 int64_t threshold;
9f95a23c 3674 cmd_getval(cmdmap, "threshold", threshold,
11fdf7f2 3675 g_conf().get_val<int64_t>("mon_pg_stuck_threshold"));
7c673cae 3676
11fdf7f2 3677 if (pg_map.dump_stuck_pg_stats(ds, f, (int)threshold, stuckop_vec) < 0) {
7c673cae 3678 *ss << "failed";
11fdf7f2 3679 } else {
7c673cae 3680 *ss << "ok";
11fdf7f2
TL
3681 }
3682 odata->append(ds);
7c673cae
FG
3683 return 0;
3684 }
3685
3686 if (prefix == "pg debug") {
3687 string debugop;
9f95a23c 3688 cmd_getval(cmdmap, "debugop", debugop,
7c673cae
FG
3689 string("unfound_objects_exist"));
3690 if (debugop == "unfound_objects_exist") {
3691 bool unfound_objects_exist = false;
3692 for (const auto& p : pg_map.pg_stat) {
3693 if (p.second.stats.sum.num_objects_unfound > 0) {
3694 unfound_objects_exist = true;
3695 break;
3696 }
3697 }
3698 if (unfound_objects_exist)
3699 ds << "TRUE";
3700 else
3701 ds << "FALSE";
3702 odata->append(ds);
3703 return 0;
3704 }
3705 if (debugop == "degraded_pgs_exist") {
3706 bool degraded_pgs_exist = false;
3707 for (const auto& p : pg_map.pg_stat) {
3708 if (p.second.stats.sum.num_objects_degraded > 0) {
3709 degraded_pgs_exist = true;
3710 break;
3711 }
3712 }
3713 if (degraded_pgs_exist)
3714 ds << "TRUE";
3715 else
3716 ds << "FALSE";
3717 odata->append(ds);
3718 return 0;
3719 }
3720 }
3721
3722 if (prefix == "osd perf") {
3723 if (f) {
3724 f->open_object_section("osdstats");
3725 pg_map.dump_osd_perf_stats(f);
3726 f->close_section();
3727 f->flush(ds);
3728 } else {
3729 pg_map.print_osd_perf_stats(&ds);
3730 }
3731 odata->append(ds);
3732 return 0;
3733 }
3734
3735 if (prefix == "osd blocked-by") {
3736 if (f) {
3737 f->open_object_section("osd_blocked_by");
3738 pg_map.dump_osd_blocked_by_stats(f);
3739 f->close_section();
3740 f->flush(ds);
3741 } else {
3742 pg_map.print_osd_blocked_by_stats(&ds);
3743 }
3744 odata->append(ds);
3745 return 0;
3746 }
3747
7c673cae
FG
3748 return -EOPNOTSUPP;
3749}
3750
31f18b77
FG
3751void PGMapUpdater::check_osd_map(
3752 CephContext *cct,
3753 const OSDMap& osdmap,
3754 const PGMap& pgmap,
3755 PGMap::Incremental *pending_inc)
3756{
3757 for (auto& p : pgmap.osd_stat) {
3758 if (!osdmap.exists(p.first)) {
3759 // remove osd_stat
3760 pending_inc->rm_stat(p.first);
3761 } else if (osdmap.is_out(p.first)) {
3762 // zero osd_stat
11fdf7f2
TL
3763 if (p.second.statfs.total != 0) {
3764 pending_inc->stat_osd_out(p.first);
31f18b77
FG
3765 }
3766 } else if (!osdmap.is_up(p.first)) {
3767 // zero the op_queue_age_hist
3768 if (!p.second.op_queue_age_hist.empty()) {
11fdf7f2 3769 pending_inc->stat_osd_down_up(p.first, pgmap);
31f18b77
FG
3770 }
3771 }
3772 }
3773
3774 // deleted pgs (pools)?
3775 for (auto& p : pgmap.pg_pool_sum) {
3776 if (!osdmap.have_pg_pool(p.first)) {
3777 ldout(cct, 10) << __func__ << " pool " << p.first << " gone, removing pgs"
3778 << dendl;
3779 for (auto& q : pgmap.pg_stat) {
11fdf7f2 3780 if (q.first.pool() == p.first) {
31f18b77
FG
3781 pending_inc->pg_remove.insert(q.first);
3782 }
3783 }
3784 auto q = pending_inc->pg_stat_updates.begin();
3785 while (q != pending_inc->pg_stat_updates.end()) {
11fdf7f2 3786 if (q->first.pool() == p.first) {
31f18b77
FG
3787 q = pending_inc->pg_stat_updates.erase(q);
3788 } else {
3789 ++q;
3790 }
3791 }
3792 }
3793 }
3794
11fdf7f2
TL
3795 // new (split or new pool) or merged pgs?
3796 map<int64_t,unsigned> new_pg_num;
31f18b77
FG
3797 for (auto& p : osdmap.get_pools()) {
3798 int64_t poolid = p.first;
3799 const pg_pool_t& pi = p.second;
3800 auto q = pgmap.num_pg_by_pool.find(poolid);
3801 unsigned my_pg_num = 0;
3802 if (q != pgmap.num_pg_by_pool.end())
3803 my_pg_num = q->second;
3804 unsigned pg_num = pi.get_pg_num();
11fdf7f2
TL
3805 new_pg_num[poolid] = pg_num;
3806 if (my_pg_num < pg_num) {
224ce89b 3807 ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
11fdf7f2 3808 << " > my pg_num " << my_pg_num << dendl;
31f18b77
FG
3809 for (unsigned ps = my_pg_num; ps < pg_num; ++ps) {
3810 pg_t pgid(ps, poolid);
3811 if (pending_inc->pg_stat_updates.count(pgid) == 0) {
224ce89b 3812 ldout(cct,20) << __func__ << " adding " << pgid << dendl;
31f18b77
FG
3813 pg_stat_t &stats = pending_inc->pg_stat_updates[pgid];
3814 stats.last_fresh = osdmap.get_modified();
3815 stats.last_active = osdmap.get_modified();
3816 stats.last_change = osdmap.get_modified();
3817 stats.last_peered = osdmap.get_modified();
3818 stats.last_clean = osdmap.get_modified();
3819 stats.last_unstale = osdmap.get_modified();
3820 stats.last_undegraded = osdmap.get_modified();
3821 stats.last_fullsized = osdmap.get_modified();
3822 stats.last_scrub_stamp = osdmap.get_modified();
3823 stats.last_deep_scrub_stamp = osdmap.get_modified();
3824 stats.last_clean_scrub_stamp = osdmap.get_modified();
3825 }
3826 }
11fdf7f2
TL
3827 } else if (my_pg_num > pg_num) {
3828 ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
3829 << " < my pg_num " << my_pg_num << dendl;
3830 for (unsigned i = pg_num; i < my_pg_num; ++i) {
3831 pg_t pgid(i, poolid);
3832 ldout(cct,20) << __func__ << " removing merged " << pgid << dendl;
3833 if (pgmap.pg_stat.count(pgid)) {
3834 pending_inc->pg_remove.insert(pgid);
3835 }
3836 pending_inc->pg_stat_updates.erase(pgid);
7c673cae 3837 }
7c673cae
FG
3838 }
3839 }
11fdf7f2
TL
3840 auto i = pending_inc->pg_stat_updates.begin();
3841 while (i != pending_inc->pg_stat_updates.end()) {
3842 auto j = new_pg_num.find(i->first.pool());
3843 if (j == new_pg_num.end() ||
3844 i->first.ps() >= j->second) {
3845 ldout(cct,20) << __func__ << " removing pending update to old "
3846 << i->first << dendl;
3847 i = pending_inc->pg_stat_updates.erase(i);
3848 } else {
3849 ++i;
7c673cae
FG
3850 }
3851 }
7c673cae
FG
3852}
3853
3854static void _try_mark_pg_stale(
3855 const OSDMap& osdmap,
3856 pg_t pgid,
3857 const pg_stat_t& cur,
3858 PGMap::Incremental *pending_inc)
3859{
3860 if ((cur.state & PG_STATE_STALE) == 0 &&
3861 cur.acting_primary != -1 &&
3862 osdmap.is_down(cur.acting_primary)) {
3863 pg_stat_t *newstat;
3864 auto q = pending_inc->pg_stat_updates.find(pgid);
3865 if (q != pending_inc->pg_stat_updates.end()) {
3866 if ((q->second.acting_primary == cur.acting_primary) ||
3867 ((q->second.state & PG_STATE_STALE) == 0 &&
3868 q->second.acting_primary != -1 &&
3869 osdmap.is_down(q->second.acting_primary))) {
3870 newstat = &q->second;
3871 } else {
3872 // pending update is no longer down or already stale
3873 return;
3874 }
3875 } else {
3876 newstat = &pending_inc->pg_stat_updates[pgid];
3877 *newstat = cur;
3878 }
3879 dout(10) << __func__ << " marking pg " << pgid
3880 << " stale (acting_primary " << newstat->acting_primary
3881 << ")" << dendl;
3882 newstat->state |= PG_STATE_STALE;
3883 newstat->last_unstale = ceph_clock_now();
3884 }
3885}
3886
3887void PGMapUpdater::check_down_pgs(
3888 const OSDMap &osdmap,
3889 const PGMap &pg_map,
3890 bool check_all,
3891 const set<int>& need_check_down_pg_osds,
3892 PGMap::Incremental *pending_inc)
3893{
3894 // if a large number of osds changed state, just iterate over the whole
3895 // pg map.
3896 if (need_check_down_pg_osds.size() > (unsigned)osdmap.get_num_osds() *
11fdf7f2 3897 g_conf().get_val<double>("mon_pg_check_down_all_threshold")) {
7c673cae
FG
3898 check_all = true;
3899 }
3900
3901 if (check_all) {
3902 for (const auto& p : pg_map.pg_stat) {
3903 _try_mark_pg_stale(osdmap, p.first, p.second, pending_inc);
3904 }
3905 } else {
3906 for (auto osd : need_check_down_pg_osds) {
3907 if (osdmap.is_down(osd)) {
3908 auto p = pg_map.pg_by_osd.find(osd);
3909 if (p == pg_map.pg_by_osd.end()) {
3910 continue;
3911 }
3912 for (auto pgid : p->second) {
3913 const pg_stat_t &stat = pg_map.pg_stat.at(pgid);
11fdf7f2 3914 ceph_assert(stat.acting_primary == osd);
7c673cae
FG
3915 _try_mark_pg_stale(osdmap, pgid, stat, pending_inc);
3916 }
3917 }
3918 }
3919 }
3920}
3921
3922int reweight::by_utilization(
3923 const OSDMap &osdmap,
3924 const PGMap &pgm,
3925 int oload,
3926 double max_changef,
3927 int max_osds,
3928 bool by_pg, const set<int64_t> *pools,
3929 bool no_increasing,
3930 mempool::osdmap::map<int32_t, uint32_t>* new_weights,
3931 std::stringstream *ss,
3932 std::string *out_str,
9f95a23c 3933 ceph::Formatter *f)
7c673cae
FG
3934{
3935 if (oload <= 100) {
3936 *ss << "You must give a percentage higher than 100. "
3937 "The reweighting threshold will be calculated as <average-utilization> "
3938 "times <input-percentage>. For example, an argument of 200 would "
3939 "reweight OSDs which are twice as utilized as the average OSD.\n";
3940 return -EINVAL;
3941 }
3942
3943 vector<int> pgs_by_osd(osdmap.get_max_osd());
3944
3945 // Avoid putting a small number (or 0) in the denominator when calculating
3946 // average_util
3947 double average_util;
3948 if (by_pg) {
3949 // by pg mapping
3950 double weight_sum = 0.0; // sum up the crush weights
3951 unsigned num_pg_copies = 0;
3952 int num_osds = 0;
3953 for (const auto& pg : pgm.pg_stat) {
3954 if (pools && pools->count(pg.first.pool()) == 0)
3955 continue;
3956 for (const auto acting : pg.second.acting) {
b5b8bbf5
FG
3957 if (!osdmap.exists(acting)) {
3958 continue;
3959 }
7c673cae
FG
3960 if (acting >= (int)pgs_by_osd.size())
3961 pgs_by_osd.resize(acting);
3962 if (pgs_by_osd[acting] == 0) {
3963 if (osdmap.crush->get_item_weightf(acting) <= 0) {
3964 //skip if we currently can not identify item
3965 continue;
3966 }
3967 weight_sum += osdmap.crush->get_item_weightf(acting);
3968 ++num_osds;
3969 }
3970 ++pgs_by_osd[acting];
3971 ++num_pg_copies;
3972 }
3973 }
3974
11fdf7f2 3975 if (!num_osds || (num_pg_copies / num_osds < g_conf()->mon_reweight_min_pgs_per_osd)) {
7c673cae
FG
3976 *ss << "Refusing to reweight: we only have " << num_pg_copies
3977 << " PGs across " << num_osds << " osds!\n";
3978 return -EDOM;
3979 }
3980
3981 average_util = (double)num_pg_copies / weight_sum;
3982 } else {
3983 // by osd utilization
11fdf7f2
TL
3984 int num_osd = std::max<size_t>(1, pgm.osd_stat.size());
3985 if ((uint64_t)pgm.osd_sum.statfs.total / num_osd
3986 < g_conf()->mon_reweight_min_bytes_per_osd) {
3987 *ss << "Refusing to reweight: we only have " << pgm.osd_sum.statfs.kb()
7c673cae
FG
3988 << " kb across all osds!\n";
3989 return -EDOM;
3990 }
11fdf7f2
TL
3991 if ((uint64_t)pgm.osd_sum.statfs.get_used_raw() / num_osd
3992 < g_conf()->mon_reweight_min_bytes_per_osd) {
3993 *ss << "Refusing to reweight: we only have "
3994 << pgm.osd_sum.statfs.kb_used_raw()
7c673cae
FG
3995 << " kb used across all osds!\n";
3996 return -EDOM;
3997 }
3998
11fdf7f2
TL
3999 average_util = (double)pgm.osd_sum.statfs.get_used_raw() /
4000 (double)pgm.osd_sum.statfs.total;
7c673cae
FG
4001 }
4002
4003 // adjust down only if we are above the threshold
4004 const double overload_util = average_util * (double)oload / 100.0;
4005
4006 // but aggressively adjust weights up whenever possible.
4007 const double underload_util = average_util;
4008
4009 const unsigned max_change = (unsigned)(max_changef * (double)0x10000);
4010
4011 ostringstream oss;
4012 if (f) {
4013 f->open_object_section("reweight_by_utilization");
4014 f->dump_int("overload_min", oload);
4015 f->dump_float("max_change", max_changef);
4016 f->dump_int("max_change_osds", max_osds);
4017 f->dump_float("average_utilization", average_util);
4018 f->dump_float("overload_utilization", overload_util);
4019 } else {
4020 oss << "oload " << oload << "\n";
4021 oss << "max_change " << max_changef << "\n";
4022 oss << "max_change_osds " << max_osds << "\n";
4023 oss.precision(4);
4024 oss << "average_utilization " << std::fixed << average_util << "\n";
4025 oss << "overload_utilization " << overload_util << "\n";
4026 }
4027 int num_changed = 0;
4028
4029 // precompute util for each OSD
4030 std::vector<std::pair<int, float> > util_by_osd;
4031 for (const auto& p : pgm.osd_stat) {
4032 std::pair<int, float> osd_util;
4033 osd_util.first = p.first;
4034 if (by_pg) {
4035 if (p.first >= (int)pgs_by_osd.size() ||
4036 pgs_by_osd[p.first] == 0) {
4037 // skip if this OSD does not contain any pg
4038 // belonging to the specified pool(s).
4039 continue;
4040 }
4041
4042 if (osdmap.crush->get_item_weightf(p.first) <= 0) {
4043 // skip if we are unable to locate item.
4044 continue;
4045 }
4046
11fdf7f2
TL
4047 osd_util.second =
4048 pgs_by_osd[p.first] / osdmap.crush->get_item_weightf(p.first);
7c673cae 4049 } else {
11fdf7f2
TL
4050 osd_util.second =
4051 (double)p.second.statfs.get_used_raw() / (double)p.second.statfs.total;
7c673cae
FG
4052 }
4053 util_by_osd.push_back(osd_util);
4054 }
4055
4056 // sort by absolute deviation from the mean utilization,
4057 // in descending order.
4058 std::sort(util_by_osd.begin(), util_by_osd.end(),
4059 [average_util](std::pair<int, float> l, std::pair<int, float> r) {
4060 return abs(l.second - average_util) > abs(r.second - average_util);
4061 }
4062 );
4063
4064 if (f)
4065 f->open_array_section("reweights");
4066
4067 for (const auto& p : util_by_osd) {
4068 unsigned weight = osdmap.get_weight(p.first);
4069 if (weight == 0) {
4070 // skip if OSD is currently out
4071 continue;
4072 }
4073 float util = p.second;
4074
4075 if (util >= overload_util) {
4076 // Assign a lower weight to overloaded OSDs. The current weight
4077 // is a factor to take into account the original weights,
4078 // to represent e.g. differing storage capacities
4079 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
4080 if (weight > max_change)
11fdf7f2 4081 new_weight = std::max(new_weight, weight - max_change);
7c673cae
FG
4082 new_weights->insert({p.first, new_weight});
4083 if (f) {
4084 f->open_object_section("osd");
4085 f->dump_int("osd", p.first);
4086 f->dump_float("weight", (float)weight / (float)0x10000);
4087 f->dump_float("new_weight", (float)new_weight / (float)0x10000);
4088 f->close_section();
4089 } else {
4090 oss << "osd." << p.first << " weight "
4091 << (float)weight / (float)0x10000 << " -> "
4092 << (float)new_weight / (float)0x10000 << "\n";
4093 }
4094 if (++num_changed >= max_osds)
4095 break;
4096 }
4097 if (!no_increasing && util <= underload_util) {
4098 // assign a higher weight.. if we can.
4099 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
11fdf7f2 4100 new_weight = std::min(new_weight, weight + max_change);
7c673cae
FG
4101 if (new_weight > 0x10000)
4102 new_weight = 0x10000;
4103 if (new_weight > weight) {
4104 new_weights->insert({p.first, new_weight});
4105 oss << "osd." << p.first << " weight "
4106 << (float)weight / (float)0x10000 << " -> "
4107 << (float)new_weight / (float)0x10000 << "\n";
4108 if (++num_changed >= max_osds)
4109 break;
4110 }
4111 }
4112 }
4113 if (f) {
4114 f->close_section();
4115 }
4116
4117 OSDMap newmap;
4118 newmap.deepish_copy_from(osdmap);
4119 OSDMap::Incremental newinc;
4120 newinc.fsid = newmap.get_fsid();
4121 newinc.epoch = newmap.get_epoch() + 1;
4122 newinc.new_weight = *new_weights;
4123 newmap.apply_incremental(newinc);
4124
4125 osdmap.summarize_mapping_stats(&newmap, pools, out_str, f);
4126
4127 if (f) {
4128 f->close_section();
4129 } else {
4130 *out_str += "\n";
4131 *out_str += oss.str();
4132 }
4133 return num_changed;
4134}