]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/PGMap.cc
bump version to 15.2.4-pve1
[ceph.git] / ceph / src / mon / PGMap.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
224ce89b
WB
4#include <boost/algorithm/string.hpp>
5
7c673cae
FG
6#include "PGMap.h"
7
8#define dout_subsys ceph_subsys_mon
9#include "common/debug.h"
11fdf7f2 10#include "common/Clock.h"
7c673cae 11#include "common/Formatter.h"
11fdf7f2 12#include "global/global_context.h"
7c673cae
FG
13#include "include/ceph_features.h"
14#include "include/stringify.h"
15
16#include "osd/osd_types.h"
17#include "osd/OSDMap.h"
eafe8130 18#include <boost/range/adaptor/reversed.hpp>
7c673cae
FG
19
20#define dout_context g_ceph_context
21
9f95a23c
TL
22using std::list;
23using std::make_pair;
24using std::map;
25using std::pair;
26using std::ostream;
27using std::ostringstream;
28using std::set;
29using std::string;
30using std::stringstream;
31using std::vector;
32
33using ceph::bufferlist;
34using TOPNSPC::common::cmd_getval;
35
31f18b77
FG
36MEMPOOL_DEFINE_OBJECT_FACTORY(PGMapDigest, pgmap_digest, pgmap);
37MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap, pgmap, pgmap);
38MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap::Incremental, pgmap_inc, pgmap);
39
40
41// ---------------------
42// PGMapDigest
43
44void PGMapDigest::encode(bufferlist& bl, uint64_t features) const
45{
46 // NOTE: see PGMap::encode_digest
11fdf7f2
TL
47 uint8_t v = 4;
48 if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
49 v = 1;
50 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
51 v = 3;
52 }
53 ENCODE_START(v, 1, bl);
54 encode(num_pg, bl);
55 encode(num_pg_active, bl);
56 encode(num_pg_unknown, bl);
57 encode(num_osd, bl);
58 encode(pg_pool_sum, bl, features);
59 encode(pg_sum, bl, features);
60 encode(osd_sum, bl, features);
61 if (v >= 2) {
62 encode(num_pg_by_state, bl);
63 } else {
64 uint32_t n = num_pg_by_state.size();
65 encode(n, bl);
66 for (auto p : num_pg_by_state) {
9f95a23c 67 encode((int32_t)p.first, bl);
11fdf7f2
TL
68 encode(p.second, bl);
69 }
70 }
71 encode(num_pg_by_osd, bl);
72 encode(num_pg_by_pool, bl);
73 encode(osd_last_seq, bl);
74 encode(per_pool_sum_delta, bl, features);
75 encode(per_pool_sum_deltas_stamps, bl);
76 encode(pg_sum_delta, bl, features);
77 encode(stamp_delta, bl);
78 encode(avail_space_by_rule, bl);
79 if (struct_v >= 3) {
80 encode(purged_snaps, bl);
81 }
82 if (struct_v >= 4) {
83 encode(osd_sum_by_class, bl, features);
84 }
7c673cae
FG
85 ENCODE_FINISH(bl);
86}
87
11fdf7f2 88void PGMapDigest::decode(bufferlist::const_iterator& p)
31f18b77 89{
11fdf7f2
TL
90 DECODE_START(4, p);
91 decode(num_pg, p);
92 decode(num_pg_active, p);
93 decode(num_pg_unknown, p);
94 decode(num_osd, p);
95 decode(pg_pool_sum, p);
96 decode(pg_sum, p);
97 decode(osd_sum, p);
98 if (struct_v >= 2) {
99 decode(num_pg_by_state, p);
100 } else {
101 map<int32_t, int32_t> nps;
102 decode(nps, p);
103 num_pg_by_state.clear();
104 for (auto i : nps) {
105 num_pg_by_state[i.first] = i.second;
106 }
107 }
108 decode(num_pg_by_osd, p);
109 decode(num_pg_by_pool, p);
110 decode(osd_last_seq, p);
111 decode(per_pool_sum_delta, p);
112 decode(per_pool_sum_deltas_stamps, p);
113 decode(pg_sum_delta, p);
114 decode(stamp_delta, p);
115 decode(avail_space_by_rule, p);
116 if (struct_v >= 3) {
117 decode(purged_snaps, p);
118 }
119 if (struct_v >= 4) {
120 decode(osd_sum_by_class, p);
121 }
31f18b77
FG
122 DECODE_FINISH(p);
123}
124
9f95a23c 125void PGMapDigest::dump(ceph::Formatter *f) const
31f18b77
FG
126{
127 f->dump_unsigned("num_pg", num_pg);
128 f->dump_unsigned("num_pg_active", num_pg_active);
129 f->dump_unsigned("num_pg_unknown", num_pg_unknown);
130 f->dump_unsigned("num_osd", num_osd);
131 f->dump_object("pool_sum", pg_sum);
132 f->dump_object("osd_sum", osd_sum);
11fdf7f2
TL
133
134 f->open_object_section("osd_sum_by_class");
135 for (auto& i : osd_sum_by_class) {
136 f->dump_object(i.first.c_str(), i.second);
137 }
138 f->close_section();
139
31f18b77
FG
140 f->open_array_section("pool_stats");
141 for (auto& p : pg_pool_sum) {
142 f->open_object_section("pool_stat");
143 f->dump_int("poolid", p.first);
144 auto q = num_pg_by_pool.find(p.first);
145 if (q != num_pg_by_pool.end())
146 f->dump_unsigned("num_pg", q->second);
147 p.second.dump(f);
7c673cae
FG
148 f->close_section();
149 }
150 f->close_section();
31f18b77
FG
151 f->open_array_section("osd_stats");
152 int i = 0;
153 // TODO: this isn't really correct since we can dump non-existent OSDs
154 // I dunno what osd_last_seq is set to in that case...
155 for (auto& p : osd_last_seq) {
7c673cae 156 f->open_object_section("osd_stat");
31f18b77
FG
157 f->dump_int("osd", i);
158 f->dump_unsigned("seq", p);
7c673cae 159 f->close_section();
31f18b77 160 ++i;
7c673cae
FG
161 }
162 f->close_section();
31f18b77
FG
163 f->open_array_section("num_pg_by_state");
164 for (auto& p : num_pg_by_state) {
165 f->open_object_section("count");
166 f->dump_string("state", pg_state_string(p.first));
167 f->dump_unsigned("num", p.second);
168 f->close_section();
169 }
7c673cae 170 f->close_section();
31f18b77
FG
171 f->open_array_section("num_pg_by_osd");
172 for (auto& p : num_pg_by_osd) {
173 f->open_object_section("count");
174 f->dump_unsigned("osd", p.first);
175 f->dump_unsigned("num_primary_pg", p.second.primary);
176 f->dump_unsigned("num_acting_pg", p.second.acting);
81eedcae 177 f->dump_unsigned("num_up_not_acting_pg", p.second.up_not_acting);
31f18b77
FG
178 f->close_section();
179 }
7c673cae 180 f->close_section();
11fdf7f2
TL
181 f->open_array_section("purged_snaps");
182 for (auto& j : purged_snaps) {
183 f->open_object_section("pool");
184 f->dump_int("pool", j.first);
185 f->open_object_section("purged_snaps");
186 for (auto i = j.second.begin(); i != j.second.end(); ++i) {
187 f->open_object_section("interval");
188 f->dump_stream("start") << i.get_start();
189 f->dump_stream("length") << i.get_len();
190 f->close_section();
191 }
192 f->close_section();
193 f->close_section();
194 }
195 f->close_section();
7c673cae
FG
196}
197
31f18b77 198void PGMapDigest::generate_test_instances(list<PGMapDigest*>& ls)
7c673cae 199{
31f18b77 200 ls.push_back(new PGMapDigest);
7c673cae
FG
201}
202
31f18b77
FG
// Render a percentage value as text: anything below 0.01 becomes "0",
// everything else is printed in fixed notation with two decimals.
inline std::string percentify(const float& a) {
  if (a < 0.01) {
    return "0";
  }
  std::ostringstream text;
  text << std::fixed << std::setprecision(2) << a;
  return text.str();
}
7c673cae 211
9f95a23c 212void PGMapDigest::print_summary(ceph::Formatter *f, ostream *out) const
7c673cae 213{
31f18b77
FG
214 if (f)
215 f->open_array_section("pgs_by_state");
7c673cae 216
31f18b77 217 // list is descending numeric order (by count)
9f95a23c 218 std::multimap<int,uint64_t> state_by_count; // count -> state
31f18b77
FG
219 for (auto p = num_pg_by_state.begin();
220 p != num_pg_by_state.end();
221 ++p) {
222 state_by_count.insert(make_pair(p->second, p->first));
7c673cae 223 }
31f18b77
FG
224 if (f) {
225 for (auto p = state_by_count.rbegin();
226 p != state_by_count.rend();
227 ++p)
228 {
229 f->open_object_section("pgs_by_state_element");
230 f->dump_string("state_name", pg_state_string(p->second));
231 f->dump_unsigned("count", p->first);
232 f->close_section();
233 }
7c673cae 234 }
31f18b77
FG
235 if (f)
236 f->close_section();
7c673cae 237
31f18b77
FG
238 if (f) {
239 f->dump_unsigned("num_pgs", num_pg);
240 f->dump_unsigned("num_pools", pg_pool_sum.size());
241 f->dump_unsigned("num_objects", pg_sum.stats.sum.num_objects);
242 f->dump_unsigned("data_bytes", pg_sum.stats.sum.num_bytes);
11fdf7f2
TL
243 f->dump_unsigned("bytes_used", osd_sum.statfs.get_used_raw());
244 f->dump_unsigned("bytes_avail", osd_sum.statfs.available);
245 f->dump_unsigned("bytes_total", osd_sum.statfs.total);
31f18b77
FG
246 } else {
247 *out << " pools: " << pg_pool_sum.size() << " pools, "
248 << num_pg << " pgs\n";
1adf2230
AA
249 *out << " objects: " << si_u_t(pg_sum.stats.sum.num_objects) << " objects, "
250 << byte_u_t(pg_sum.stats.sum.num_bytes) << "\n";
31f18b77 251 *out << " usage: "
11fdf7f2
TL
252 << byte_u_t(osd_sum.statfs.get_used_raw()) << " used, "
253 << byte_u_t(osd_sum.statfs.available) << " / "
254 << byte_u_t(osd_sum.statfs.total) << " avail\n";
31f18b77
FG
255 *out << " pgs: ";
256 }
7c673cae 257
31f18b77 258 bool pad = false;
7c673cae 259
31f18b77
FG
260 if (num_pg_unknown > 0) {
261 float p = (float)num_pg_unknown / (float)num_pg;
262 if (f) {
263 f->dump_float("unknown_pgs_ratio", p);
7c673cae 264 } else {
31f18b77
FG
265 char b[20];
266 snprintf(b, sizeof(b), "%.3lf", p * 100.0);
267 *out << b << "% pgs unknown\n";
268 pad = true;
7c673cae 269 }
7c673cae 270 }
7c673cae 271
31f18b77
FG
272 int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
273 if (num_pg_inactive > 0) {
274 float p = (float)num_pg_inactive / (float)num_pg;
275 if (f) {
276 f->dump_float("inactive_pgs_ratio", p);
7c673cae 277 } else {
31f18b77
FG
278 if (pad) {
279 *out << " ";
280 }
281 char b[20];
282 snprintf(b, sizeof(b), "%.3f", p * 100.0);
283 *out << b << "% pgs not active\n";
284 pad = true;
7c673cae 285 }
7c673cae 286 }
31f18b77
FG
287
288 list<string> sl;
289 overall_recovery_summary(f, &sl);
290 if (!f && !sl.empty()) {
291 for (auto p = sl.begin(); p != sl.end(); ++p) {
292 if (pad) {
293 *out << " ";
294 }
295 *out << *p << "\n";
296 pad = true;
7c673cae 297 }
7c673cae 298 }
31f18b77 299 sl.clear();
7c673cae 300
31f18b77
FG
301 if (!f) {
302 unsigned max_width = 1;
9f95a23c 303 for (auto p = state_by_count.rbegin(); p != state_by_count.rend(); ++p)
31f18b77
FG
304 {
305 std::stringstream ss;
306 ss << p->first;
11fdf7f2 307 max_width = std::max<size_t>(ss.str().size(), max_width);
7c673cae
FG
308 }
309
9f95a23c 310 for (auto p = state_by_count.rbegin(); p != state_by_count.rend(); ++p)
31f18b77
FG
311 {
312 if (pad) {
313 *out << " ";
314 }
315 pad = true;
316 out->setf(std::ios::left);
317 *out << std::setw(max_width) << p->first
318 << " " << pg_state_string(p->second) << "\n";
319 out->unsetf(std::ios::left);
320 }
7c673cae
FG
321 }
322
31f18b77
FG
323 ostringstream ss_rec_io;
324 overall_recovery_rate_summary(f, &ss_rec_io);
325 ostringstream ss_client_io;
326 overall_client_io_rate_summary(f, &ss_client_io);
327 ostringstream ss_cache_io;
328 overall_cache_io_rate_summary(f, &ss_cache_io);
7c673cae 329
31f18b77
FG
330 if (!f && (ss_client_io.str().length() || ss_rec_io.str().length()
331 || ss_cache_io.str().length())) {
332 *out << "\n \n";
333 *out << " io:\n";
7c673cae
FG
334 }
335
31f18b77
FG
336 if (!f && ss_client_io.str().length())
337 *out << " client: " << ss_client_io.str() << "\n";
338 if (!f && ss_rec_io.str().length())
339 *out << " recovery: " << ss_rec_io.str() << "\n";
340 if (!f && ss_cache_io.str().length())
341 *out << " cache: " << ss_cache_io.str() << "\n";
7c673cae
FG
342}
343
9f95a23c 344void PGMapDigest::print_oneline_summary(ceph::Formatter *f, ostream *out) const
7c673cae 345{
31f18b77
FG
346 std::stringstream ss;
347
348 if (f)
349 f->open_array_section("num_pg_by_state");
350 for (auto p = num_pg_by_state.begin();
351 p != num_pg_by_state.end();
352 ++p) {
353 if (f) {
354 f->open_object_section("state");
355 f->dump_string("name", pg_state_string(p->first));
356 f->dump_unsigned("num", p->second);
357 f->close_section();
358 }
359 if (p != num_pg_by_state.begin())
360 ss << ", ";
361 ss << p->second << " " << pg_state_string(p->first);
7c673cae 362 }
31f18b77
FG
363 if (f)
364 f->close_section();
7c673cae 365
31f18b77
FG
366 string states = ss.str();
367 if (out)
368 *out << num_pg << " pgs: "
369 << states << "; "
1adf2230 370 << byte_u_t(pg_sum.stats.sum.num_bytes) << " data, "
11fdf7f2
TL
371 << byte_u_t(osd_sum.statfs.get_used()) << " used, "
372 << byte_u_t(osd_sum.statfs.available) << " / "
373 << byte_u_t(osd_sum.statfs.total) << " avail";
31f18b77
FG
374 if (f) {
375 f->dump_unsigned("num_pgs", num_pg);
376 f->dump_unsigned("num_bytes", pg_sum.stats.sum.num_bytes);
11fdf7f2
TL
377 f->dump_int("total_bytes", osd_sum.statfs.total);
378 f->dump_int("total_avail_bytes", osd_sum.statfs.available);
379 f->dump_int("total_used_bytes", osd_sum.statfs.get_used());
380 f->dump_int("total_used_raw_bytes", osd_sum.statfs.get_used_raw());
31f18b77 381 }
7c673cae 382
31f18b77
FG
383 // make non-negative; we can get negative values if osds send
384 // uncommitted stats and then "go backward" or if they are just
385 // buggy/wrong.
386 pool_stat_t pos_delta = pg_sum_delta;
387 pos_delta.floor(0);
388 if (pos_delta.stats.sum.num_rd ||
389 pos_delta.stats.sum.num_wr) {
390 if (out)
391 *out << "; ";
392 if (pos_delta.stats.sum.num_rd) {
393 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)stamp_delta;
394 if (out)
1adf2230 395 *out << byte_u_t(rd) << "/s rd, ";
31f18b77
FG
396 if (f)
397 f->dump_unsigned("read_bytes_sec", rd);
398 }
399 if (pos_delta.stats.sum.num_wr) {
400 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)stamp_delta;
401 if (out)
1adf2230 402 *out << byte_u_t(wr) << "/s wr, ";
31f18b77
FG
403 if (f)
404 f->dump_unsigned("write_bytes_sec", wr);
405 }
406 int64_t iops = (pos_delta.stats.sum.num_rd + pos_delta.stats.sum.num_wr) / (double)stamp_delta;
407 if (out)
11fdf7f2 408 *out << si_u_t(iops) << " op/s";
31f18b77
FG
409 if (f)
410 f->dump_unsigned("io_sec", iops);
7c673cae 411 }
31f18b77
FG
412
413 list<string> sl;
414 overall_recovery_summary(f, &sl);
415 if (out)
416 for (auto p = sl.begin(); p != sl.end(); ++p)
417 *out << "; " << *p;
418 std::stringstream ssr;
419 overall_recovery_rate_summary(f, &ssr);
420 if (out && ssr.str().length())
421 *out << "; " << ssr.str() << " recovering";
7c673cae
FG
422}
423
11fdf7f2
TL
424void PGMapDigest::get_recovery_stats(
425 double *misplaced_ratio,
426 double *degraded_ratio,
427 double *inactive_pgs_ratio,
428 double *unknown_pgs_ratio) const
429{
430 if (pg_sum.stats.sum.num_objects_degraded &&
431 pg_sum.stats.sum.num_object_copies > 0) {
432 *degraded_ratio = (double)pg_sum.stats.sum.num_objects_degraded /
433 (double)pg_sum.stats.sum.num_object_copies;
434 } else {
435 *degraded_ratio = 0;
436 }
437 if (pg_sum.stats.sum.num_objects_misplaced &&
438 pg_sum.stats.sum.num_object_copies > 0) {
439 *misplaced_ratio = (double)pg_sum.stats.sum.num_objects_misplaced /
440 (double)pg_sum.stats.sum.num_object_copies;
441 } else {
442 *misplaced_ratio = 0;
443 }
444 if (num_pg > 0) {
445 int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
446 *inactive_pgs_ratio = (double)num_pg_inactive / (double)num_pg;
447 *unknown_pgs_ratio = (double)num_pg_unknown / (double)num_pg;
448 } else {
449 *inactive_pgs_ratio = 0;
450 *unknown_pgs_ratio = 0;
451 }
452}
453
9f95a23c 454void PGMapDigest::recovery_summary(ceph::Formatter *f, list<string> *psl,
b32b8144 455 const pool_stat_t& pool_sum) const
7c673cae 456{
b32b8144
FG
457 if (pool_sum.stats.sum.num_objects_degraded && pool_sum.stats.sum.num_object_copies > 0) {
458 double pc = (double)pool_sum.stats.sum.num_objects_degraded /
459 (double)pool_sum.stats.sum.num_object_copies * (double)100.0;
31f18b77
FG
460 char b[20];
461 snprintf(b, sizeof(b), "%.3lf", pc);
462 if (f) {
b32b8144
FG
463 f->dump_unsigned("degraded_objects", pool_sum.stats.sum.num_objects_degraded);
464 f->dump_unsigned("degraded_total", pool_sum.stats.sum.num_object_copies);
31f18b77
FG
465 f->dump_float("degraded_ratio", pc / 100.0);
466 } else {
467 ostringstream ss;
b32b8144
FG
468 ss << pool_sum.stats.sum.num_objects_degraded
469 << "/" << pool_sum.stats.sum.num_object_copies << " objects degraded (" << b << "%)";
31f18b77
FG
470 psl->push_back(ss.str());
471 }
472 }
b32b8144
FG
473 if (pool_sum.stats.sum.num_objects_misplaced && pool_sum.stats.sum.num_object_copies > 0) {
474 double pc = (double)pool_sum.stats.sum.num_objects_misplaced /
475 (double)pool_sum.stats.sum.num_object_copies * (double)100.0;
31f18b77
FG
476 char b[20];
477 snprintf(b, sizeof(b), "%.3lf", pc);
478 if (f) {
b32b8144
FG
479 f->dump_unsigned("misplaced_objects", pool_sum.stats.sum.num_objects_misplaced);
480 f->dump_unsigned("misplaced_total", pool_sum.stats.sum.num_object_copies);
31f18b77
FG
481 f->dump_float("misplaced_ratio", pc / 100.0);
482 } else {
483 ostringstream ss;
b32b8144
FG
484 ss << pool_sum.stats.sum.num_objects_misplaced
485 << "/" << pool_sum.stats.sum.num_object_copies << " objects misplaced (" << b << "%)";
31f18b77
FG
486 psl->push_back(ss.str());
487 }
488 }
b32b8144
FG
489 if (pool_sum.stats.sum.num_objects_unfound && pool_sum.stats.sum.num_objects) {
490 double pc = (double)pool_sum.stats.sum.num_objects_unfound /
491 (double)pool_sum.stats.sum.num_objects * (double)100.0;
31f18b77
FG
492 char b[20];
493 snprintf(b, sizeof(b), "%.3lf", pc);
494 if (f) {
b32b8144
FG
495 f->dump_unsigned("unfound_objects", pool_sum.stats.sum.num_objects_unfound);
496 f->dump_unsigned("unfound_total", pool_sum.stats.sum.num_objects);
31f18b77
FG
497 f->dump_float("unfound_ratio", pc / 100.0);
498 } else {
499 ostringstream ss;
b32b8144
FG
500 ss << pool_sum.stats.sum.num_objects_unfound
501 << "/" << pool_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
31f18b77
FG
502 psl->push_back(ss.str());
503 }
7c673cae 504 }
7c673cae
FG
505}
506
9f95a23c 507void PGMapDigest::recovery_rate_summary(ceph::Formatter *f, ostream *out,
31f18b77
FG
508 const pool_stat_t& delta_sum,
509 utime_t delta_stamp) const
7c673cae 510{
31f18b77
FG
511 // make non-negative; we can get negative values if osds send
512 // uncommitted stats and then "go backward" or if they are just
513 // buggy/wrong.
514 pool_stat_t pos_delta = delta_sum;
515 pos_delta.floor(0);
516 if (pos_delta.stats.sum.num_objects_recovered ||
517 pos_delta.stats.sum.num_bytes_recovered ||
518 pos_delta.stats.sum.num_keys_recovered) {
519 int64_t objps = pos_delta.stats.sum.num_objects_recovered / (double)delta_stamp;
520 int64_t bps = pos_delta.stats.sum.num_bytes_recovered / (double)delta_stamp;
521 int64_t kps = pos_delta.stats.sum.num_keys_recovered / (double)delta_stamp;
522 if (f) {
523 f->dump_int("recovering_objects_per_sec", objps);
524 f->dump_int("recovering_bytes_per_sec", bps);
525 f->dump_int("recovering_keys_per_sec", kps);
526 f->dump_int("num_objects_recovered", pos_delta.stats.sum.num_objects_recovered);
527 f->dump_int("num_bytes_recovered", pos_delta.stats.sum.num_bytes_recovered);
528 f->dump_int("num_keys_recovered", pos_delta.stats.sum.num_keys_recovered);
529 } else {
1adf2230 530 *out << byte_u_t(bps) << "/s";
31f18b77 531 if (pos_delta.stats.sum.num_keys_recovered)
11fdf7f2
TL
532 *out << ", " << si_u_t(kps) << " keys/s";
533 *out << ", " << si_u_t(objps) << " objects/s";
31f18b77 534 }
7c673cae 535 }
31f18b77 536}
7c673cae 537
9f95a23c 538void PGMapDigest::overall_recovery_rate_summary(ceph::Formatter *f, ostream *out) const
31f18b77
FG
539{
540 recovery_rate_summary(f, out, pg_sum_delta, stamp_delta);
7c673cae
FG
541}
542
9f95a23c 543void PGMapDigest::overall_recovery_summary(ceph::Formatter *f, list<string> *psl) const
7c673cae 544{
31f18b77 545 recovery_summary(f, psl, pg_sum);
7c673cae
FG
546}
547
9f95a23c 548void PGMapDigest::pool_recovery_rate_summary(ceph::Formatter *f, ostream *out,
31f18b77 549 uint64_t poolid) const
7c673cae 550{
31f18b77
FG
551 auto p = per_pool_sum_delta.find(poolid);
552 if (p == per_pool_sum_delta.end())
553 return;
7c673cae 554
31f18b77 555 auto ts = per_pool_sum_deltas_stamps.find(p->first);
11fdf7f2 556 ceph_assert(ts != per_pool_sum_deltas_stamps.end());
31f18b77
FG
557 recovery_rate_summary(f, out, p->second.first, ts->second);
558}
7c673cae 559
9f95a23c 560void PGMapDigest::pool_recovery_summary(ceph::Formatter *f, list<string> *psl,
31f18b77
FG
561 uint64_t poolid) const
562{
b32b8144
FG
563 auto p = pg_pool_sum.find(poolid);
564 if (p == pg_pool_sum.end())
31f18b77 565 return;
7c673cae 566
b32b8144 567 recovery_summary(f, psl, p->second);
7c673cae
FG
568}
569
9f95a23c 570void PGMapDigest::client_io_rate_summary(ceph::Formatter *f, ostream *out,
31f18b77
FG
571 const pool_stat_t& delta_sum,
572 utime_t delta_stamp) const
7c673cae 573{
31f18b77
FG
574 pool_stat_t pos_delta = delta_sum;
575 pos_delta.floor(0);
576 if (pos_delta.stats.sum.num_rd ||
577 pos_delta.stats.sum.num_wr) {
578 if (pos_delta.stats.sum.num_rd) {
579 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)delta_stamp;
580 if (f) {
581 f->dump_int("read_bytes_sec", rd);
582 } else {
1adf2230 583 *out << byte_u_t(rd) << "/s rd, ";
31f18b77
FG
584 }
585 }
586 if (pos_delta.stats.sum.num_wr) {
587 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)delta_stamp;
588 if (f) {
589 f->dump_int("write_bytes_sec", wr);
590 } else {
1adf2230 591 *out << byte_u_t(wr) << "/s wr, ";
31f18b77
FG
592 }
593 }
594 int64_t iops_rd = pos_delta.stats.sum.num_rd / (double)delta_stamp;
595 int64_t iops_wr = pos_delta.stats.sum.num_wr / (double)delta_stamp;
596 if (f) {
597 f->dump_int("read_op_per_sec", iops_rd);
598 f->dump_int("write_op_per_sec", iops_wr);
599 } else {
11fdf7f2 600 *out << si_u_t(iops_rd) << " op/s rd, " << si_u_t(iops_wr) << " op/s wr";
31f18b77 601 }
7c673cae
FG
602 }
603}
604
9f95a23c 605void PGMapDigest::overall_client_io_rate_summary(ceph::Formatter *f, ostream *out) const
7c673cae 606{
31f18b77
FG
607 client_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
608}
7c673cae 609
9f95a23c 610void PGMapDigest::pool_client_io_rate_summary(ceph::Formatter *f, ostream *out,
31f18b77
FG
611 uint64_t poolid) const
612{
613 auto p = per_pool_sum_delta.find(poolid);
614 if (p == per_pool_sum_delta.end())
7c673cae
FG
615 return;
616
31f18b77 617 auto ts = per_pool_sum_deltas_stamps.find(p->first);
11fdf7f2 618 ceph_assert(ts != per_pool_sum_deltas_stamps.end());
31f18b77 619 client_io_rate_summary(f, out, p->second.first, ts->second);
7c673cae
FG
620}
621
9f95a23c 622void PGMapDigest::cache_io_rate_summary(ceph::Formatter *f, ostream *out,
31f18b77
FG
623 const pool_stat_t& delta_sum,
624 utime_t delta_stamp) const
7c673cae 625{
31f18b77
FG
626 pool_stat_t pos_delta = delta_sum;
627 pos_delta.floor(0);
628 bool have_output = false;
7c673cae 629
31f18b77
FG
630 if (pos_delta.stats.sum.num_flush) {
631 int64_t flush = (pos_delta.stats.sum.num_flush_kb << 10) / (double)delta_stamp;
632 if (f) {
633 f->dump_int("flush_bytes_sec", flush);
634 } else {
1adf2230 635 *out << byte_u_t(flush) << "/s flush";
31f18b77 636 have_output = true;
7c673cae
FG
637 }
638 }
31f18b77
FG
639 if (pos_delta.stats.sum.num_evict) {
640 int64_t evict = (pos_delta.stats.sum.num_evict_kb << 10) / (double)delta_stamp;
641 if (f) {
642 f->dump_int("evict_bytes_sec", evict);
643 } else {
644 if (have_output)
645 *out << ", ";
1adf2230 646 *out << byte_u_t(evict) << "/s evict";
31f18b77
FG
647 have_output = true;
648 }
7c673cae 649 }
31f18b77
FG
650 if (pos_delta.stats.sum.num_promote) {
651 int64_t promote = pos_delta.stats.sum.num_promote / (double)delta_stamp;
652 if (f) {
653 f->dump_int("promote_op_per_sec", promote);
654 } else {
655 if (have_output)
656 *out << ", ";
11fdf7f2 657 *out << si_u_t(promote) << " op/s promote";
31f18b77
FG
658 have_output = true;
659 }
7c673cae 660 }
31f18b77
FG
661 if (pos_delta.stats.sum.num_flush_mode_low) {
662 if (f) {
663 f->dump_int("num_flush_mode_low", pos_delta.stats.sum.num_flush_mode_low);
664 } else {
665 if (have_output)
666 *out << ", ";
11fdf7f2 667 *out << si_u_t(pos_delta.stats.sum.num_flush_mode_low) << " PGs flushing";
31f18b77
FG
668 have_output = true;
669 }
7c673cae 670 }
31f18b77
FG
671 if (pos_delta.stats.sum.num_flush_mode_high) {
672 if (f) {
673 f->dump_int("num_flush_mode_high", pos_delta.stats.sum.num_flush_mode_high);
674 } else {
675 if (have_output)
676 *out << ", ";
11fdf7f2 677 *out << si_u_t(pos_delta.stats.sum.num_flush_mode_high) << " PGs flushing (high)";
31f18b77
FG
678 have_output = true;
679 }
7c673cae 680 }
31f18b77
FG
681 if (pos_delta.stats.sum.num_evict_mode_some) {
682 if (f) {
683 f->dump_int("num_evict_mode_some", pos_delta.stats.sum.num_evict_mode_some);
684 } else {
685 if (have_output)
686 *out << ", ";
11fdf7f2 687 *out << si_u_t(pos_delta.stats.sum.num_evict_mode_some) << " PGs evicting";
31f18b77
FG
688 have_output = true;
689 }
690 }
691 if (pos_delta.stats.sum.num_evict_mode_full) {
692 if (f) {
693 f->dump_int("num_evict_mode_full", pos_delta.stats.sum.num_evict_mode_full);
694 } else {
695 if (have_output)
696 *out << ", ";
11fdf7f2 697 *out << si_u_t(pos_delta.stats.sum.num_evict_mode_full) << " PGs evicting (full)";
31f18b77 698 }
7c673cae
FG
699 }
700}
701
9f95a23c 702void PGMapDigest::overall_cache_io_rate_summary(ceph::Formatter *f, ostream *out) const
7c673cae 703{
31f18b77 704 cache_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
7c673cae
FG
705}
706
9f95a23c 707void PGMapDigest::pool_cache_io_rate_summary(ceph::Formatter *f, ostream *out,
31f18b77 708 uint64_t poolid) const
7c673cae 709{
31f18b77
FG
710 auto p = per_pool_sum_delta.find(poolid);
711 if (p == per_pool_sum_delta.end())
712 return;
7c673cae 713
31f18b77 714 auto ts = per_pool_sum_deltas_stamps.find(p->first);
11fdf7f2 715 ceph_assert(ts != per_pool_sum_deltas_stamps.end());
31f18b77 716 cache_io_rate_summary(f, out, p->second.first, ts->second);
7c673cae
FG
717}
718
d2e6a577
FG
719ceph_statfs PGMapDigest::get_statfs(OSDMap &osdmap,
720 boost::optional<int64_t> data_pool) const
721{
722 ceph_statfs statfs;
723 bool filter = false;
724 object_stat_sum_t sum;
725
726 if (data_pool) {
727 auto i = pg_pool_sum.find(*data_pool);
728 if (i != pg_pool_sum.end()) {
729 sum = i->second.stats.sum;
730 filter = true;
731 }
732 }
733
734 if (filter) {
735 statfs.kb_used = (sum.num_bytes >> 10);
736 statfs.kb_avail = get_pool_free_space(osdmap, *data_pool) >> 10;
737 statfs.num_objects = sum.num_objects;
738 statfs.kb = statfs.kb_used + statfs.kb_avail;
739 } else {
740 // these are in KB.
11fdf7f2
TL
741 statfs.kb = osd_sum.statfs.kb();
742 statfs.kb_used = osd_sum.statfs.kb_used_raw();
743 statfs.kb_avail = osd_sum.statfs.kb_avail();
d2e6a577
FG
744 statfs.num_objects = pg_sum.stats.sum.num_objects;
745 }
746
747 return statfs;
748}
749
31f18b77
FG
750void PGMapDigest::dump_pool_stats_full(
751 const OSDMap &osd_map,
752 stringstream *ss,
9f95a23c 753 ceph::Formatter *f,
31f18b77 754 bool verbose) const
7c673cae 755{
31f18b77 756 TextTable tbl;
7c673cae 757
31f18b77
FG
758 if (f) {
759 f->open_array_section("pools");
760 } else {
11fdf7f2
TL
761 tbl.define_column("POOL", TextTable::LEFT, TextTable::LEFT);
762 tbl.define_column("ID", TextTable::LEFT, TextTable::RIGHT);
763 tbl.define_column("STORED", TextTable::LEFT, TextTable::RIGHT);
9f95a23c
TL
764 if (verbose) {
765 tbl.define_column("(DATA)", TextTable::LEFT, TextTable::RIGHT);
766 tbl.define_column("(OMAP)", TextTable::LEFT, TextTable::RIGHT);
767 }
11fdf7f2 768 tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
31f18b77 769 tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
9f95a23c
TL
770 if (verbose) {
771 tbl.define_column("(DATA)", TextTable::LEFT, TextTable::RIGHT);
772 tbl.define_column("(OMAP)", TextTable::LEFT, TextTable::RIGHT);
773 }
31f18b77
FG
774 tbl.define_column("%USED", TextTable::LEFT, TextTable::RIGHT);
775 tbl.define_column("MAX AVAIL", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2 776
31f18b77 777 if (verbose) {
11fdf7f2
TL
778 tbl.define_column("QUOTA OBJECTS", TextTable::LEFT, TextTable::LEFT);
779 tbl.define_column("QUOTA BYTES", TextTable::LEFT, TextTable::LEFT);
31f18b77 780 tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
781 tbl.define_column("USED COMPR", TextTable::LEFT, TextTable::RIGHT);
782 tbl.define_column("UNDER COMPR", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
783 }
784 }
785
786 map<int,uint64_t> avail_by_rule;
787 for (auto p = osd_map.get_pools().begin();
788 p != osd_map.get_pools().end(); ++p) {
789 int64_t pool_id = p->first;
790 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
791 continue;
11fdf7f2 792
31f18b77
FG
793 const string& pool_name = osd_map.get_pool_name(pool_id);
794 const pool_stat_t &stat = pg_pool_sum.at(pool_id);
795
796 const pg_pool_t *pool = osd_map.get_pg_pool(pool_id);
797 int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
798 pool->get_type(),
799 pool->get_size());
800 int64_t avail;
31f18b77
FG
801 if (avail_by_rule.count(ruleno) == 0) {
802 // FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked
803 avail = get_rule_avail(ruleno);
804 if (avail < 0)
805 avail = 0;
806 avail_by_rule[ruleno] = avail;
807 } else {
808 avail = avail_by_rule[ruleno];
809 }
31f18b77
FG
810 if (f) {
811 f->open_object_section("pool");
812 f->dump_string("name", pool_name);
813 f->dump_int("id", pool_id);
814 f->open_object_section("stats");
815 } else {
816 tbl << pool_name
817 << pool_id;
31f18b77 818 }
11fdf7f2 819 float raw_used_rate = osd_map.pool_raw_used_rate(pool_id);
81eedcae 820 bool per_pool = use_per_pool_stats();
9f95a23c 821 bool per_pool_omap = use_per_pool_omap_stats();
81eedcae 822 dump_object_stat_sum(tbl, f, stat, avail, raw_used_rate, verbose, per_pool,
9f95a23c 823 per_pool_omap, pool);
11fdf7f2 824 if (f) {
31f18b77 825 f->close_section(); // stats
31f18b77 826 f->close_section(); // pool
11fdf7f2
TL
827 } else {
828 tbl << TextTable::endrow;
829 }
31f18b77
FG
830 }
831 if (f)
832 f->close_section();
833 else {
11fdf7f2 834 ceph_assert(ss != nullptr);
9f95a23c 835 *ss << "--- POOLS ---\n";
31f18b77
FG
836 *ss << tbl;
837 }
838}
839
11fdf7f2 840void PGMapDigest::dump_cluster_stats(stringstream *ss,
9f95a23c 841 ceph::Formatter *f,
11fdf7f2 842 bool verbose) const
31f18b77
FG
843{
844 if (f) {
845 f->open_object_section("stats");
11fdf7f2
TL
846 f->dump_int("total_bytes", osd_sum.statfs.total);
847 f->dump_int("total_avail_bytes", osd_sum.statfs.available);
848 f->dump_int("total_used_bytes", osd_sum.statfs.get_used());
849 f->dump_int("total_used_raw_bytes", osd_sum.statfs.get_used_raw());
850 f->dump_float("total_used_raw_ratio", osd_sum.statfs.get_used_raw_ratio());
81eedcae
TL
851 f->dump_unsigned("num_osds", osd_sum.num_osds);
852 f->dump_unsigned("num_per_pool_osds", osd_sum.num_per_pool_osds);
9f95a23c 853 f->dump_unsigned("num_per_pool_omap_osds", osd_sum.num_per_pool_omap_osds);
11fdf7f2
TL
854 f->close_section();
855 f->open_object_section("stats_by_class");
856 for (auto& i : osd_sum_by_class) {
857 f->open_object_section(i.first.c_str());
858 f->dump_int("total_bytes", i.second.statfs.total);
859 f->dump_int("total_avail_bytes", i.second.statfs.available);
860 f->dump_int("total_used_bytes", i.second.statfs.get_used());
861 f->dump_int("total_used_raw_bytes", i.second.statfs.get_used_raw());
862 f->dump_float("total_used_raw_ratio",
863 i.second.statfs.get_used_raw_ratio());
864 f->close_section();
31f18b77
FG
865 }
866 f->close_section();
867 } else {
11fdf7f2 868 ceph_assert(ss != nullptr);
31f18b77 869 TextTable tbl;
11fdf7f2 870 tbl.define_column("CLASS", TextTable::LEFT, TextTable::LEFT);
31f18b77
FG
871 tbl.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
872 tbl.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2 873 tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
874 tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
875 tbl.define_column("%RAW USED", TextTable::LEFT, TextTable::RIGHT);
31f18b77 876
11fdf7f2
TL
877
878 for (auto& i : osd_sum_by_class) {
879 tbl << i.first;
880 tbl << stringify(byte_u_t(i.second.statfs.total))
881 << stringify(byte_u_t(i.second.statfs.available))
882 << stringify(byte_u_t(i.second.statfs.get_used()))
883 << stringify(byte_u_t(i.second.statfs.get_used_raw()))
884 << percentify(i.second.statfs.get_used_raw_ratio()*100.0)
885 << TextTable::endrow;
886 }
887 tbl << "TOTAL";
888 tbl << stringify(byte_u_t(osd_sum.statfs.total))
889 << stringify(byte_u_t(osd_sum.statfs.available))
890 << stringify(byte_u_t(osd_sum.statfs.get_used()))
891 << stringify(byte_u_t(osd_sum.statfs.get_used_raw()))
892 << percentify(osd_sum.statfs.get_used_raw_ratio()*100.0)
893 << TextTable::endrow;
894
9f95a23c 895 *ss << "--- RAW STORAGE ---\n";
31f18b77
FG
896 *ss << tbl;
897 }
898}
899
900void PGMapDigest::dump_object_stat_sum(
9f95a23c 901 TextTable &tbl, ceph::Formatter *f,
11fdf7f2 902 const pool_stat_t &pool_stat, uint64_t avail,
9f95a23c 903 float raw_used_rate, bool verbose, bool per_pool, bool per_pool_omap,
31f18b77
FG
904 const pg_pool_t *pool)
905{
11fdf7f2
TL
906 const object_stat_sum_t &sum = pool_stat.stats.sum;
907 const store_statfs_t statfs = pool_stat.store_stats;
908
909 if (sum.num_object_copies > 0) {
910 raw_used_rate *= (float)(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies;
911 }
81eedcae 912
9f95a23c
TL
913 uint64_t used_data_bytes = pool_stat.get_allocated_data_bytes(per_pool);
914 uint64_t used_omap_bytes = pool_stat.get_allocated_omap_bytes(per_pool_omap);
915 uint64_t used_bytes = used_data_bytes + used_omap_bytes;
31f18b77
FG
916
917 float used = 0.0;
3efd9988 918 // note avail passed in is raw_avail, calc raw_used here.
31f18b77 919 if (avail) {
11fdf7f2 920 used = used_bytes;
31f18b77 921 used /= used + avail;
11fdf7f2 922 } else if (used_bytes) {
31f18b77
FG
923 used = 1.0;
924 }
11fdf7f2
TL
925 auto avail_res = raw_used_rate ? avail / raw_used_rate : 0;
926 // an approximation for actually stored user data
9f95a23c
TL
927 auto stored_data_normalized = pool_stat.get_user_data_bytes(
928 raw_used_rate, per_pool);
929 auto stored_omap_normalized = pool_stat.get_user_omap_bytes(
930 raw_used_rate, per_pool_omap);
931 auto stored_normalized = stored_data_normalized + stored_omap_normalized;
932 // same, amplied by replication or EC
933 auto stored_raw = stored_normalized * raw_used_rate;
31f18b77 934 if (f) {
11fdf7f2 935 f->dump_int("stored", stored_normalized);
9f95a23c
TL
936 if (verbose) {
937 f->dump_int("stored_data", stored_data_normalized);
938 f->dump_int("stored_omap", stored_omap_normalized);
939 }
31f18b77 940 f->dump_int("objects", sum.num_objects);
11fdf7f2
TL
941 f->dump_int("kb_used", shift_round_up(used_bytes, 10));
942 f->dump_int("bytes_used", used_bytes);
9f95a23c
TL
943 if (verbose) {
944 f->dump_int("data_bytes_used", used_data_bytes);
945 f->dump_int("omap_bytes_used", used_omap_bytes);
946 }
11fdf7f2
TL
947 f->dump_float("percent_used", used);
948 f->dump_unsigned("max_avail", avail_res);
31f18b77
FG
949 if (verbose) {
950 f->dump_int("quota_objects", pool->quota_max_objects);
951 f->dump_int("quota_bytes", pool->quota_max_bytes);
952 f->dump_int("dirty", sum.num_objects_dirty);
953 f->dump_int("rd", sum.num_rd);
954 f->dump_int("rd_bytes", sum.num_rd_kb * 1024ull);
955 f->dump_int("wr", sum.num_wr);
956 f->dump_int("wr_bytes", sum.num_wr_kb * 1024ull);
11fdf7f2
TL
957 f->dump_int("compress_bytes_used", statfs.data_compressed_allocated);
958 f->dump_int("compress_under_bytes", statfs.data_compressed_original);
959 // Stored by user amplified by replication
9f95a23c 960 f->dump_int("stored_raw", stored_raw);
31f18b77
FG
961 }
962 } else {
11fdf7f2 963 tbl << stringify(byte_u_t(stored_normalized));
9f95a23c
TL
964 if (verbose) {
965 tbl << stringify(byte_u_t(stored_data_normalized));
966 tbl << stringify(byte_u_t(stored_omap_normalized));
967 }
11fdf7f2
TL
968 tbl << stringify(si_u_t(sum.num_objects));
969 tbl << stringify(byte_u_t(used_bytes));
9f95a23c
TL
970 if (verbose) {
971 tbl << stringify(byte_u_t(used_data_bytes));
972 tbl << stringify(byte_u_t(used_omap_bytes));
973 }
31f18b77 974 tbl << percentify(used*100);
11fdf7f2 975 tbl << stringify(byte_u_t(avail_res));
31f18b77 976 if (verbose) {
11fdf7f2
TL
977 if (pool->quota_max_objects == 0)
978 tbl << "N/A";
979 else
980 tbl << stringify(si_u_t(pool->quota_max_objects));
981
982 if (pool->quota_max_bytes == 0)
983 tbl << "N/A";
984 else
985 tbl << stringify(byte_u_t(pool->quota_max_bytes));
986
1adf2230 987 tbl << stringify(si_u_t(sum.num_objects_dirty))
11fdf7f2
TL
988 << stringify(byte_u_t(statfs.data_compressed_allocated))
989 << stringify(byte_u_t(statfs.data_compressed_original))
990 ;
31f18b77
FG
991 }
992 }
993}
994
d2e6a577
FG
995int64_t PGMapDigest::get_pool_free_space(const OSDMap &osd_map,
996 int64_t poolid) const
997{
998 const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
999 int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
1000 pool->get_type(),
1001 pool->get_size());
1002 int64_t avail;
1003 avail = get_rule_avail(ruleno);
1004 if (avail < 0)
1005 avail = 0;
1006
11fdf7f2 1007 return avail / osd_map.pool_raw_used_rate(poolid);
d2e6a577
FG
1008}
1009
31f18b77
FG
1010int64_t PGMap::get_rule_avail(const OSDMap& osdmap, int ruleno) const
1011{
1012 map<int,float> wm;
1013 int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm);
1014 if (r < 0) {
1015 return r;
1016 }
1017 if (wm.empty()) {
1018 return 0;
1019 }
1020
11fdf7f2 1021 float fratio = osdmap.get_full_ratio();
31f18b77
FG
1022
1023 int64_t min = -1;
1024 for (auto p = wm.begin(); p != wm.end(); ++p) {
1025 auto osd_info = osd_stat.find(p->first);
1026 if (osd_info != osd_stat.end()) {
11fdf7f2 1027 if (osd_info->second.statfs.total == 0 || p->second == 0) {
31f18b77
FG
1028 // osd must be out, hence its stats have been zeroed
1029 // (unless we somehow managed to have a disk with size 0...)
1030 //
1031 // (p->second == 0), if osd weight is 0, no need to
1032 // calculate proj below.
1033 continue;
1034 }
11fdf7f2 1035 double unusable = (double)osd_info->second.statfs.kb() *
31f18b77 1036 (1.0 - fratio);
11fdf7f2 1037 double avail = std::max(0.0, (double)osd_info->second.statfs.kb_avail() - unusable);
31f18b77
FG
1038 avail *= 1024.0;
1039 int64_t proj = (int64_t)(avail / (double)p->second);
1040 if (min < 0 || proj < min) {
1041 min = proj;
1042 }
1043 } else {
94b18763
FG
1044 if (osdmap.is_up(p->first)) {
1045 // This is a level 4 rather than an error, because we might have
1046 // only just started, and not received the first stats message yet.
1047 dout(4) << "OSD " << p->first << " is up, but has no stats" << dendl;
1048 }
31f18b77
FG
1049 }
1050 }
1051 return min;
1052}
1053
1054void PGMap::get_rules_avail(const OSDMap& osdmap,
1055 std::map<int,int64_t> *avail_map) const
1056{
1057 avail_map->clear();
1058 for (auto p : osdmap.get_pools()) {
1059 int64_t pool_id = p.first;
1060 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
1061 continue;
1062 const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
1063 int ruleno = osdmap.crush->find_rule(pool->get_crush_rule(),
1064 pool->get_type(),
1065 pool->get_size());
1066 if (avail_map->count(ruleno) == 0)
1067 (*avail_map)[ruleno] = get_rule_avail(osdmap, ruleno);
1068 }
1069}
1070
1071// ---------------------
1072// PGMap
1073
9f95a23c 1074void PGMap::Incremental::dump(ceph::Formatter *f) const
7c673cae
FG
1075{
1076 f->dump_unsigned("version", version);
1077 f->dump_stream("stamp") << stamp;
31f18b77
FG
1078 f->dump_unsigned("osdmap_epoch", osdmap_epoch);
1079 f->dump_unsigned("pg_scan_epoch", pg_scan);
7c673cae 1080
31f18b77
FG
1081 f->open_array_section("pg_stat_updates");
1082 for (auto p = pg_stat_updates.begin(); p != pg_stat_updates.end(); ++p) {
1083 f->open_object_section("pg_stat");
1084 f->dump_stream("pgid") << p->first;
1085 p->second.dump(f);
1086 f->close_section();
1087 }
7c673cae
FG
1088 f->close_section();
1089
31f18b77
FG
1090 f->open_array_section("osd_stat_updates");
1091 for (auto p = osd_stat_updates.begin(); p != osd_stat_updates.end(); ++p) {
1092 f->open_object_section("osd_stat");
1093 f->dump_int("osd", p->first);
1094 p->second.dump(f);
7c673cae
FG
1095 f->close_section();
1096 }
1097 f->close_section();
11fdf7f2
TL
1098 f->open_array_section("pool_statfs_updates");
1099 for (auto p = pool_statfs_updates.begin(); p != pool_statfs_updates.end(); ++p) {
1100 f->open_object_section("pool_statfs");
1101 f->dump_stream("poolid/osd") << p->first;
1102 p->second.dump(f);
1103 f->close_section();
1104 }
1105 f->close_section();
7c673cae 1106
31f18b77
FG
1107 f->open_array_section("osd_stat_removals");
1108 for (auto p = osd_stat_rm.begin(); p != osd_stat_rm.end(); ++p)
1109 f->dump_int("osd", *p);
7c673cae 1110 f->close_section();
7c673cae 1111
31f18b77
FG
1112 f->open_array_section("pg_removals");
1113 for (auto p = pg_remove.begin(); p != pg_remove.end(); ++p)
1114 f->dump_stream("pgid") << *p;
7c673cae
FG
1115 f->close_section();
1116}
1117
31f18b77 1118void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
7c673cae 1119{
31f18b77
FG
1120 o.push_back(new Incremental);
1121 o.push_back(new Incremental);
1122 o.back()->version = 1;
1123 o.back()->stamp = utime_t(123,345);
1124 o.push_back(new Incremental);
1125 o.back()->version = 2;
11fdf7f2 1126 o.back()->pg_stat_updates[pg_t(1,2)] = pg_stat_t();
31f18b77 1127 o.back()->osd_stat_updates[5] = osd_stat_t();
31f18b77
FG
1128 o.push_back(new Incremental);
1129 o.back()->version = 3;
1130 o.back()->osdmap_epoch = 1;
1131 o.back()->pg_scan = 2;
11fdf7f2 1132 o.back()->pg_stat_updates[pg_t(4,5)] = pg_stat_t();
31f18b77 1133 o.back()->osd_stat_updates[6] = osd_stat_t();
11fdf7f2 1134 o.back()->pg_remove.insert(pg_t(1,2));
31f18b77 1135 o.back()->osd_stat_rm.insert(5);
11fdf7f2 1136 o.back()->pool_statfs_updates[std::make_pair(1234,4)] = store_statfs_t();
7c673cae
FG
1137}
1138
31f18b77
FG
1139// --
1140
1141void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
7c673cae 1142{
11fdf7f2 1143 ceph_assert(inc.version == version+1);
31f18b77 1144 version++;
7c673cae 1145
31f18b77 1146 pool_stat_t pg_sum_old = pg_sum;
11fdf7f2
TL
1147 mempool::pgmap::unordered_map<int32_t, pool_stat_t> pg_pool_sum_old;
1148 pg_pool_sum_old = pg_pool_sum;
7c673cae 1149
31f18b77
FG
1150 for (auto p = inc.pg_stat_updates.begin();
1151 p != inc.pg_stat_updates.end();
1152 ++p) {
1153 const pg_t &update_pg(p->first);
11fdf7f2 1154 auto update_pool = update_pg.pool();
31f18b77 1155 const pg_stat_t &update_stat(p->second);
7c673cae 1156
11fdf7f2
TL
1157 auto pg_stat_iter = pg_stat.find(update_pg);
1158 pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool];
1159 if (pg_stat_iter == pg_stat.end()) {
31f18b77
FG
1160 pg_stat.insert(make_pair(update_pg, update_stat));
1161 } else {
11fdf7f2
TL
1162 stat_pg_sub(update_pg, pg_stat_iter->second);
1163 pool_sum_ref.sub(pg_stat_iter->second);
1164 pg_stat_iter->second = update_stat;
7c673cae 1165 }
31f18b77 1166 stat_pg_add(update_pg, update_stat);
11fdf7f2 1167 pool_sum_ref.add(update_stat);
7c673cae 1168 }
11fdf7f2
TL
1169
1170 for (auto p = inc.pool_statfs_updates.begin();
1171 p != inc.pool_statfs_updates.end();
1172 ++p) {
1173 auto update_pool = p->first.first;
1174 auto update_osd = p->first.second;
1175 auto& statfs_inc = p->second;
1176
1177 auto pool_statfs_iter =
1178 pool_statfs.find(std::make_pair(update_pool, update_osd));
eafe8130
TL
1179 if (pg_pool_sum.count(update_pool)) {
1180 pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool];
1181 if (pool_statfs_iter == pool_statfs.end()) {
1182 pool_statfs.emplace(std::make_pair(update_pool, update_osd), statfs_inc);
1183 } else {
1184 pool_sum_ref.sub(pool_statfs_iter->second);
1185 pool_statfs_iter->second = statfs_inc;
1186 }
1187 pool_sum_ref.add(statfs_inc);
11fdf7f2 1188 }
11fdf7f2
TL
1189 }
1190
31f18b77
FG
1191 for (auto p = inc.get_osd_stat_updates().begin();
1192 p != inc.get_osd_stat_updates().end();
1193 ++p) {
1194 int osd = p->first;
1195 const osd_stat_t &new_stats(p->second);
7c673cae 1196
31f18b77
FG
1197 auto t = osd_stat.find(osd);
1198 if (t == osd_stat.end()) {
1199 osd_stat.insert(make_pair(osd, new_stats));
1200 } else {
1201 stat_osd_sub(t->first, t->second);
1202 t->second = new_stats;
1203 }
31f18b77 1204 stat_osd_add(osd, new_stats);
31f18b77
FG
1205 }
1206 set<int64_t> deleted_pools;
1207 for (auto p = inc.pg_remove.begin();
1208 p != inc.pg_remove.end();
1209 ++p) {
1210 const pg_t &removed_pg(*p);
1211 auto s = pg_stat.find(removed_pg);
11fdf7f2 1212 bool pool_erased = false;
31f18b77 1213 if (s != pg_stat.end()) {
11fdf7f2 1214 pool_erased = stat_pg_sub(removed_pg, s->second);
31f18b77 1215 pg_stat.erase(s);
11fdf7f2
TL
1216 if (pool_erased) {
1217 deleted_pools.insert(removed_pg.pool());
1218 }
31f18b77 1219 }
7c673cae
FG
1220 }
1221
31f18b77
FG
1222 for (auto p = inc.get_osd_stat_rm().begin();
1223 p != inc.get_osd_stat_rm().end();
7c673cae 1224 ++p) {
31f18b77
FG
1225 auto t = osd_stat.find(*p);
1226 if (t != osd_stat.end()) {
1227 stat_osd_sub(t->first, t->second);
1228 osd_stat.erase(t);
31f18b77 1229 }
11fdf7f2
TL
1230 for (auto i = pool_statfs.begin(); i != pool_statfs.end(); ++i) {
1231 if (i->first.second == *p) {
1232 pg_pool_sum[i->first.first].sub(i->second);
1233 pool_statfs.erase(i);
1234 }
1235 }
7c673cae
FG
1236 }
1237
b32b8144
FG
1238 // skip calculating delta while sum was not synchronized
1239 if (!stamp.is_zero() && !pg_sum_old.stats.sum.is_zero()) {
1240 utime_t delta_t;
1241 delta_t = inc.stamp;
1242 delta_t -= stamp;
1243 // calculate a delta, and average over the last 2 deltas.
1244 pool_stat_t d = pg_sum;
1245 d.stats.sub(pg_sum_old.stats);
1246 pg_sum_deltas.push_back(make_pair(d, delta_t));
1247 stamp_delta += delta_t;
1248 pg_sum_delta.stats.add(d.stats);
1249 auto smooth_intervals =
11fdf7f2
TL
1250 cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
1251 while (pg_sum_deltas.size() > smooth_intervals) {
b32b8144
FG
1252 pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats);
1253 stamp_delta -= pg_sum_deltas.front().second;
1254 pg_sum_deltas.pop_front();
1255 }
31f18b77 1256 }
b32b8144 1257 stamp = inc.stamp;
7c673cae 1258
31f18b77 1259 update_pool_deltas(cct, inc.stamp, pg_pool_sum_old);
7c673cae 1260
31f18b77
FG
1261 for (auto p : deleted_pools) {
1262 if (cct)
1263 dout(20) << " deleted pool " << p << dendl;
1264 deleted_pool(p);
1265 }
7c673cae 1266
31f18b77
FG
1267 if (inc.osdmap_epoch)
1268 last_osdmap_epoch = inc.osdmap_epoch;
1269 if (inc.pg_scan)
1270 last_pg_scan = inc.pg_scan;
7c673cae
FG
1271}
1272
31f18b77 1273void PGMap::calc_stats()
7c673cae 1274{
31f18b77
FG
1275 num_pg = 0;
1276 num_pg_active = 0;
1277 num_pg_unknown = 0;
1278 num_osd = 0;
1279 pg_pool_sum.clear();
1280 num_pg_by_pool.clear();
1281 pg_by_osd.clear();
1282 pg_sum = pool_stat_t();
1283 osd_sum = osd_stat_t();
11fdf7f2 1284 osd_sum_by_class.clear();
31f18b77 1285 num_pg_by_state.clear();
11fdf7f2 1286 num_pg_by_pool_state.clear();
31f18b77 1287 num_pg_by_osd.clear();
7c673cae 1288
31f18b77
FG
1289 for (auto p = pg_stat.begin();
1290 p != pg_stat.end();
1291 ++p) {
11fdf7f2
TL
1292 auto pg = p->first;
1293 stat_pg_add(pg, p->second);
1294 pg_pool_sum[pg.pool()].add(p->second);
1295 }
1296 for (auto p = pool_statfs.begin();
1297 p != pool_statfs.end();
1298 ++p) {
1299 auto pool = p->first.first;
1300 pg_pool_sum[pool].add(p->second);
31f18b77
FG
1301 }
1302 for (auto p = osd_stat.begin();
1303 p != osd_stat.end();
1304 ++p)
1305 stat_osd_add(p->first, p->second);
7c673cae
FG
1306}
1307
31f18b77
FG
1308void PGMap::stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
1309 bool sameosds)
7c673cae 1310{
11fdf7f2 1311 auto pool = pgid.pool();
31f18b77 1312 pg_sum.add(s);
7c673cae 1313
31f18b77
FG
1314 num_pg++;
1315 num_pg_by_state[s.state]++;
11fdf7f2
TL
1316 num_pg_by_pool_state[pgid.pool()][s.state]++;
1317 num_pg_by_pool[pool]++;
7c673cae 1318
31f18b77
FG
1319 if ((s.state & PG_STATE_CREATING) &&
1320 s.parent_split_bits == 0) {
1321 creating_pgs.insert(pgid);
1322 if (s.acting_primary >= 0) {
1323 creating_pgs_by_osd_epoch[s.acting_primary][s.mapping_epoch].insert(pgid);
7c673cae
FG
1324 }
1325 }
1326
31f18b77
FG
1327 if (s.state & PG_STATE_ACTIVE) {
1328 ++num_pg_active;
1329 }
1330 if (s.state == 0) {
1331 ++num_pg_unknown;
7c673cae
FG
1332 }
1333
31f18b77
FG
1334 if (sameosds)
1335 return;
7c673cae 1336
31f18b77
FG
1337 for (auto p = s.blocked_by.begin();
1338 p != s.blocked_by.end();
1339 ++p) {
1340 ++blocked_by_sum[*p];
7c673cae 1341 }
31f18b77
FG
1342
1343 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
1344 pg_by_osd[*p].insert(pgid);
1345 num_pg_by_osd[*p].acting++;
1346 }
1347 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
81eedcae
TL
1348 auto& t = pg_by_osd[*p];
1349 if (t.find(pgid) == t.end()) {
1350 t.insert(pgid);
1351 num_pg_by_osd[*p].up_not_acting++;
1352 }
7c673cae 1353 }
7c673cae 1354
31f18b77
FG
1355 if (s.up_primary >= 0) {
1356 num_pg_by_osd[s.up_primary].primary++;
7c673cae 1357 }
7c673cae 1358}
31f18b77 1359
11fdf7f2 1360bool PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
31f18b77 1361 bool sameosds)
7c673cae 1362{
11fdf7f2 1363 bool pool_erased = false;
31f18b77
FG
1364 pg_sum.sub(s);
1365
1366 num_pg--;
1367 int end = --num_pg_by_state[s.state];
11fdf7f2 1368 ceph_assert(end >= 0);
31f18b77
FG
1369 if (end == 0)
1370 num_pg_by_state.erase(s.state);
11fdf7f2
TL
1371 if (--num_pg_by_pool_state[pgid.pool()][s.state] == 0) {
1372 num_pg_by_pool_state[pgid.pool()].erase(s.state);
1373 }
31f18b77
FG
1374 end = --num_pg_by_pool[pgid.pool()];
1375 if (end == 0) {
11fdf7f2 1376 pool_erased = true;
7c673cae 1377 }
7c673cae 1378
31f18b77
FG
1379 if ((s.state & PG_STATE_CREATING) &&
1380 s.parent_split_bits == 0) {
1381 creating_pgs.erase(pgid);
1382 if (s.acting_primary >= 0) {
1383 map<epoch_t,set<pg_t> >& r = creating_pgs_by_osd_epoch[s.acting_primary];
1384 r[s.mapping_epoch].erase(pgid);
1385 if (r[s.mapping_epoch].empty())
1386 r.erase(s.mapping_epoch);
1387 if (r.empty())
1388 creating_pgs_by_osd_epoch.erase(s.acting_primary);
7c673cae
FG
1389 }
1390 }
31f18b77
FG
1391
1392 if (s.state & PG_STATE_ACTIVE) {
1393 --num_pg_active;
1394 }
1395 if (s.state == 0) {
1396 --num_pg_unknown;
1397 }
1398
1399 if (sameosds)
11fdf7f2 1400 return pool_erased;
31f18b77
FG
1401
1402 for (auto p = s.blocked_by.begin();
1403 p != s.blocked_by.end();
1404 ++p) {
1405 auto q = blocked_by_sum.find(*p);
11fdf7f2 1406 ceph_assert(q != blocked_by_sum.end());
31f18b77
FG
1407 --q->second;
1408 if (q->second == 0)
1409 blocked_by_sum.erase(q);
1410 }
1411
81eedcae 1412 set<int32_t> actingset;
31f18b77 1413 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
81eedcae 1414 actingset.insert(*p);
31f18b77
FG
1415 auto& oset = pg_by_osd[*p];
1416 oset.erase(pgid);
1417 if (oset.empty())
1418 pg_by_osd.erase(*p);
1419 auto it = num_pg_by_osd.find(*p);
1420 if (it != num_pg_by_osd.end() && it->second.acting > 0)
1421 it->second.acting--;
1422 }
1423 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
1424 auto& oset = pg_by_osd[*p];
1425 oset.erase(pgid);
1426 if (oset.empty())
1427 pg_by_osd.erase(*p);
81eedcae
TL
1428 if (actingset.count(*p))
1429 continue;
31f18b77 1430 auto it = num_pg_by_osd.find(*p);
81eedcae
TL
1431 if (it != num_pg_by_osd.end() && it->second.up_not_acting > 0)
1432 it->second.up_not_acting--;
31f18b77
FG
1433 }
1434
1435 if (s.up_primary >= 0) {
1436 auto it = num_pg_by_osd.find(s.up_primary);
1437 if (it != num_pg_by_osd.end() && it->second.primary > 0)
1438 it->second.primary--;
1439 }
11fdf7f2
TL
1440 return pool_erased;
1441}
1442
1443void PGMap::calc_purged_snaps()
1444{
1445 purged_snaps.clear();
1446 set<int64_t> unknown;
1447 for (auto& i : pg_stat) {
1448 if (i.second.state == 0) {
1449 unknown.insert(i.first.pool());
1450 purged_snaps.erase(i.first.pool());
1451 continue;
1452 } else if (unknown.count(i.first.pool())) {
1453 continue;
1454 }
1455 auto j = purged_snaps.find(i.first.pool());
1456 if (j == purged_snaps.end()) {
1457 // base case
1458 purged_snaps[i.first.pool()] = i.second.purged_snaps;
1459 } else {
1460 j->second.intersection_of(i.second.purged_snaps);
1461 }
1462 }
31f18b77
FG
1463}
1464
11fdf7f2 1465void PGMap::calc_osd_sum_by_class(const OSDMap& osdmap)
31f18b77 1466{
11fdf7f2
TL
1467 osd_sum_by_class.clear();
1468 for (auto& i : osd_stat) {
1469 const char *class_name = osdmap.crush->get_item_class(i.first);
1470 if (class_name) {
1471 osd_sum_by_class[class_name].add(i.second);
1472 }
1473 }
31f18b77
FG
1474}
1475
1476void PGMap::stat_osd_add(int osd, const osd_stat_t &s)
1477{
1478 num_osd++;
1479 osd_sum.add(s);
1480 if (osd >= (int)osd_last_seq.size()) {
1481 osd_last_seq.resize(osd + 1);
1482 }
1483 osd_last_seq[osd] = s.seq;
1484}
1485
1486void PGMap::stat_osd_sub(int osd, const osd_stat_t &s)
1487{
1488 num_osd--;
1489 osd_sum.sub(s);
11fdf7f2 1490 ceph_assert(osd < (int)osd_last_seq.size());
31f18b77
FG
1491 osd_last_seq[osd] = 0;
1492}
1493
31f18b77 1494void PGMap::encode_digest(const OSDMap& osdmap,
11fdf7f2 1495 bufferlist& bl, uint64_t features)
31f18b77
FG
1496{
1497 get_rules_avail(osdmap, &avail_space_by_rule);
11fdf7f2
TL
1498 calc_osd_sum_by_class(osdmap);
1499 calc_purged_snaps();
31f18b77
FG
1500 PGMapDigest::encode(bl, features);
1501}
1502
1503void PGMap::encode(bufferlist &bl, uint64_t features) const
1504{
11fdf7f2
TL
1505 ENCODE_START(8, 8, bl);
1506 encode(version, bl);
1507 encode(pg_stat, bl);
1508 encode(osd_stat, bl, features);
1509 encode(last_osdmap_epoch, bl);
1510 encode(last_pg_scan, bl);
1511 encode(stamp, bl);
1512 encode(pool_statfs, bl, features);
31f18b77
FG
1513 ENCODE_FINISH(bl);
1514}
1515
11fdf7f2 1516void PGMap::decode(bufferlist::const_iterator &bl)
31f18b77 1517{
11fdf7f2
TL
1518 DECODE_START(8, bl);
1519 decode(version, bl);
1520 decode(pg_stat, bl);
1521 decode(osd_stat, bl);
1522 decode(last_osdmap_epoch, bl);
1523 decode(last_pg_scan, bl);
1524 decode(stamp, bl);
1525 decode(pool_statfs, bl);
31f18b77
FG
1526 DECODE_FINISH(bl);
1527
1528 calc_stats();
7c673cae
FG
1529}
1530
9f95a23c 1531void PGMap::dump(ceph::Formatter *f, bool with_net) const
31f18b77
FG
1532{
1533 dump_basic(f);
1534 dump_pg_stats(f, false);
1535 dump_pool_stats(f);
9f95a23c 1536 dump_osd_stats(f, with_net);
31f18b77
FG
1537}
1538
9f95a23c 1539void PGMap::dump_basic(ceph::Formatter *f) const
31f18b77
FG
1540{
1541 f->dump_unsigned("version", version);
1542 f->dump_stream("stamp") << stamp;
1543 f->dump_unsigned("last_osdmap_epoch", last_osdmap_epoch);
1544 f->dump_unsigned("last_pg_scan", last_pg_scan);
31f18b77
FG
1545
1546 f->open_object_section("pg_stats_sum");
1547 pg_sum.dump(f);
1548 f->close_section();
1549
1550 f->open_object_section("osd_stats_sum");
1551 osd_sum.dump(f);
1552 f->close_section();
1553
31f18b77
FG
1554 dump_delta(f);
1555}
1556
9f95a23c 1557void PGMap::dump_delta(ceph::Formatter *f) const
31f18b77
FG
1558{
1559 f->open_object_section("pg_stats_delta");
1560 pg_sum_delta.dump(f);
11fdf7f2 1561 f->dump_stream("stamp_delta") << stamp_delta;
31f18b77
FG
1562 f->close_section();
1563}
1564
9f95a23c 1565void PGMap::dump_pg_stats(ceph::Formatter *f, bool brief) const
31f18b77
FG
1566{
1567 f->open_array_section("pg_stats");
1568 for (auto i = pg_stat.begin();
1569 i != pg_stat.end();
1570 ++i) {
1571 f->open_object_section("pg_stat");
1572 f->dump_stream("pgid") << i->first;
1573 if (brief)
1574 i->second.dump_brief(f);
1575 else
1576 i->second.dump(f);
1577 f->close_section();
1578 }
1579 f->close_section();
1580}
1581
9f95a23c 1582void PGMap::dump_pool_stats(ceph::Formatter *f) const
31f18b77
FG
1583{
1584 f->open_array_section("pool_stats");
1585 for (auto p = pg_pool_sum.begin();
1586 p != pg_pool_sum.end();
1587 ++p) {
1588 f->open_object_section("pool_stat");
1589 f->dump_int("poolid", p->first);
1590 auto q = num_pg_by_pool.find(p->first);
1591 if (q != num_pg_by_pool.end())
1592 f->dump_unsigned("num_pg", q->second);
1593 p->second.dump(f);
1594 f->close_section();
1595 }
1596 f->close_section();
1597}
1598
9f95a23c 1599void PGMap::dump_osd_stats(ceph::Formatter *f, bool with_net) const
31f18b77
FG
1600{
1601 f->open_array_section("osd_stats");
1602 for (auto q = osd_stat.begin();
1603 q != osd_stat.end();
1604 ++q) {
1605 f->open_object_section("osd_stat");
1606 f->dump_int("osd", q->first);
ded94939 1607 q->second.dump(f, with_net);
31f18b77
FG
1608 f->close_section();
1609 }
1610 f->close_section();
1611}
1612
9f95a23c
TL
1613void PGMap::dump_osd_ping_times(ceph::Formatter *f) const
1614{
1615 f->open_array_section("osd_ping_times");
1616 for (auto& [osd, stat] : osd_stat) {
1617 f->open_object_section("osd_ping_time");
1618 f->dump_int("osd", osd);
1619 stat.dump_ping_time(f);
1620 f->close_section();
1621 }
1622 f->close_section();
1623}
1624
31f18b77
FG
1625void PGMap::dump_pg_stats_plain(
1626 ostream& ss,
1627 const mempool::pgmap::unordered_map<pg_t, pg_stat_t>& pg_stats,
1628 bool brief) const
1629{
1630 TextTable tab;
1631
1632 if (brief){
1633 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1634 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1635 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1636 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1637 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1638 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1639 }
1640 else {
1641 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1642 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1643 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1644 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1645 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1646 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1647 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
1648 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
1649 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
1650 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1651 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1652 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1653 tab.define_column("STATE_STAMP", TextTable::LEFT, TextTable::RIGHT);
1654 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
1655 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
1656 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1657 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1658 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1659 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1660 tab.define_column("LAST_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1661 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1662 tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1663 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
b32b8144 1664 tab.define_column("SNAPTRIMQ_LEN", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
1665 }
1666
1667 for (auto i = pg_stats.begin();
1668 i != pg_stats.end(); ++i) {
1669 const pg_stat_t &st(i->second);
1670 if (brief) {
1671 tab << i->first
1672 << pg_state_string(st.state)
1673 << st.up
1674 << st.up_primary
1675 << st.acting
1676 << st.acting_primary
1677 << TextTable::endrow;
7c673cae 1678 } else {
31f18b77
FG
1679 ostringstream reported;
1680 reported << st.reported_epoch << ":" << st.reported_seq;
1681
1682 tab << i->first
1683 << st.stats.sum.num_objects
1684 << st.stats.sum.num_objects_missing_on_primary
1685 << st.stats.sum.num_objects_degraded
1686 << st.stats.sum.num_objects_misplaced
1687 << st.stats.sum.num_objects_unfound
1688 << st.stats.sum.num_bytes
11fdf7f2
TL
1689 << st.stats.sum.num_omap_bytes
1690 << st.stats.sum.num_omap_keys
31f18b77
FG
1691 << st.log_size
1692 << st.ondisk_log_size
1693 << pg_state_string(st.state)
1694 << st.last_change
1695 << st.version
1696 << reported.str()
1697 << pg_vector_string(st.up)
1698 << st.up_primary
1699 << pg_vector_string(st.acting)
1700 << st.acting_primary
1701 << st.last_scrub
1702 << st.last_scrub_stamp
1703 << st.last_deep_scrub
1704 << st.last_deep_scrub_stamp
b32b8144 1705 << st.snaptrimq_len
31f18b77 1706 << TextTable::endrow;
7c673cae
FG
1707 }
1708 }
7c673cae 1709
31f18b77
FG
1710 ss << tab;
1711}
1712
1713void PGMap::dump(ostream& ss) const
1714{
1715 dump_basic(ss);
1716 dump_pg_stats(ss, false);
1717 dump_pool_stats(ss, false);
1718 dump_pg_sum_stats(ss, false);
1719 dump_osd_stats(ss);
1720}
1721
1722void PGMap::dump_basic(ostream& ss) const
1723{
1724 ss << "version " << version << std::endl;
1725 ss << "stamp " << stamp << std::endl;
1726 ss << "last_osdmap_epoch " << last_osdmap_epoch << std::endl;
1727 ss << "last_pg_scan " << last_pg_scan << std::endl;
31f18b77
FG
1728}
1729
1730void PGMap::dump_pg_stats(ostream& ss, bool brief) const
1731{
1732 dump_pg_stats_plain(ss, pg_stat, brief);
1733}
1734
1735void PGMap::dump_pool_stats(ostream& ss, bool header) const
1736{
1737 TextTable tab;
1738
1739 if (header) {
1740 tab.define_column("POOLID", TextTable::LEFT, TextTable::LEFT);
1741 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1742 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1743 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1744 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1745 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1746 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
1747 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
1748 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
1749 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1750 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1751 } else {
1752 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1753 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1754 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1755 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1756 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1757 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1758 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1759 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1760 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
1761 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1762 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
1763 }
1764
1765 for (auto p = pg_pool_sum.begin();
1766 p != pg_pool_sum.end();
1767 ++p) {
1768 tab << p->first
1769 << p->second.stats.sum.num_objects
1770 << p->second.stats.sum.num_objects_missing_on_primary
1771 << p->second.stats.sum.num_objects_degraded
1772 << p->second.stats.sum.num_objects_misplaced
1773 << p->second.stats.sum.num_objects_unfound
1774 << p->second.stats.sum.num_bytes
11fdf7f2
TL
1775 << p->second.stats.sum.num_omap_bytes
1776 << p->second.stats.sum.num_omap_keys
31f18b77
FG
1777 << p->second.log_size
1778 << p->second.ondisk_log_size
1779 << TextTable::endrow;
1780 }
1781
1782 ss << tab;
1783}
1784
1785void PGMap::dump_pg_sum_stats(ostream& ss, bool header) const
1786{
1787 TextTable tab;
1788
1789 if (header) {
1790 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1791 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1792 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1793 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1794 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1795 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1796 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
1797 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
1798 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
1799 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1800 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1801 } else {
1802 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1803 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1804 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1805 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1806 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1807 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1808 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1809 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1810 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
1811 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1812 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
1813 };
1814
1815 tab << "sum"
1816 << pg_sum.stats.sum.num_objects
1817 << pg_sum.stats.sum.num_objects_missing_on_primary
1818 << pg_sum.stats.sum.num_objects_degraded
1819 << pg_sum.stats.sum.num_objects_misplaced
1820 << pg_sum.stats.sum.num_objects_unfound
1821 << pg_sum.stats.sum.num_bytes
11fdf7f2
TL
1822 << pg_sum.stats.sum.num_omap_bytes
1823 << pg_sum.stats.sum.num_omap_keys
31f18b77
FG
1824 << pg_sum.log_size
1825 << pg_sum.ondisk_log_size
1826 << TextTable::endrow;
1827
1828 ss << tab;
1829}
1830
1831void PGMap::dump_osd_stats(ostream& ss) const
1832{
1833 TextTable tab;
1834
1835 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1836 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1837 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2 1838 tab.define_column("USED_RAW", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
1839 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
1840 tab.define_column("HB_PEERS", TextTable::LEFT, TextTable::RIGHT);
1841 tab.define_column("PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1842 tab.define_column("PRIMARY_PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1843
1844 for (auto p = osd_stat.begin();
1845 p != osd_stat.end();
1846 ++p) {
1847 tab << p->first
11fdf7f2
TL
1848 << byte_u_t(p->second.statfs.get_used())
1849 << byte_u_t(p->second.statfs.available)
1850 << byte_u_t(p->second.statfs.get_used_raw())
1851 << byte_u_t(p->second.statfs.total)
31f18b77
FG
1852 << p->second.hb_peers
1853 << get_num_pg_by_osd(p->first)
1854 << get_num_primary_pg_by_osd(p->first)
1855 << TextTable::endrow;
1856 }
1857
1858 tab << "sum"
11fdf7f2
TL
1859 << byte_u_t(osd_sum.statfs.get_used())
1860 << byte_u_t(osd_sum.statfs.available)
1861 << byte_u_t(osd_sum.statfs.get_used_raw())
1862 << byte_u_t(osd_sum.statfs.total)
31f18b77 1863 << TextTable::endrow;
7c673cae 1864
31f18b77 1865 ss << tab;
7c673cae
FG
1866}
1867
31f18b77 1868void PGMap::dump_osd_sum_stats(ostream& ss) const
7c673cae 1869{
31f18b77 1870 TextTable tab;
7c673cae 1871
31f18b77
FG
1872 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1873 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1874 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2 1875 tab.define_column("USED_RAW", TextTable::LEFT, TextTable::RIGHT);
31f18b77 1876 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
7c673cae 1877
31f18b77 1878 tab << "sum"
11fdf7f2
TL
1879 << byte_u_t(osd_sum.statfs.get_used())
1880 << byte_u_t(osd_sum.statfs.available)
1881 << byte_u_t(osd_sum.statfs.get_used_raw())
1882 << byte_u_t(osd_sum.statfs.total)
31f18b77 1883 << TextTable::endrow;
7c673cae 1884
31f18b77 1885 ss << tab;
7c673cae
FG
1886}
1887
31f18b77
FG
1888void PGMap::get_stuck_stats(
1889 int types, const utime_t cutoff,
1890 mempool::pgmap::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const
7c673cae 1891{
11fdf7f2 1892 ceph_assert(types != 0);
31f18b77
FG
1893 for (auto i = pg_stat.begin();
1894 i != pg_stat.end();
1895 ++i) {
1896 utime_t val = cutoff; // don't care about >= cutoff so that is infinity
1897
1898 if ((types & STUCK_INACTIVE) && !(i->second.state & PG_STATE_ACTIVE)) {
1899 if (i->second.last_active < val)
1900 val = i->second.last_active;
7c673cae 1901 }
31f18b77
FG
1902
1903 if ((types & STUCK_UNCLEAN) && !(i->second.state & PG_STATE_CLEAN)) {
1904 if (i->second.last_clean < val)
1905 val = i->second.last_clean;
7c673cae 1906 }
31f18b77
FG
1907
1908 if ((types & STUCK_DEGRADED) && (i->second.state & PG_STATE_DEGRADED)) {
1909 if (i->second.last_undegraded < val)
1910 val = i->second.last_undegraded;
7c673cae 1911 }
7c673cae 1912
31f18b77
FG
1913 if ((types & STUCK_UNDERSIZED) && (i->second.state & PG_STATE_UNDERSIZED)) {
1914 if (i->second.last_fullsized < val)
1915 val = i->second.last_fullsized;
1916 }
7c673cae 1917
31f18b77
FG
1918 if ((types & STUCK_STALE) && (i->second.state & PG_STATE_STALE)) {
1919 if (i->second.last_unstale < val)
1920 val = i->second.last_unstale;
1921 }
7c673cae 1922
31f18b77
FG
1923 // val is now the earliest any of the requested stuck states began
1924 if (val < cutoff) {
1925 stuck_pgs[i->first] = i->second;
1926 }
1927 }
7c673cae
FG
1928}
1929
31f18b77 1930bool PGMap::get_stuck_counts(const utime_t cutoff, map<string, int>& note) const
7c673cae 1931{
31f18b77
FG
1932 int inactive = 0;
1933 int unclean = 0;
1934 int degraded = 0;
1935 int undersized = 0;
1936 int stale = 0;
7c673cae 1937
31f18b77
FG
1938 for (auto i = pg_stat.begin();
1939 i != pg_stat.end();
1940 ++i) {
1941 if (! (i->second.state & PG_STATE_ACTIVE)) {
1942 if (i->second.last_active < cutoff)
1943 ++inactive;
7c673cae 1944 }
31f18b77
FG
1945 if (! (i->second.state & PG_STATE_CLEAN)) {
1946 if (i->second.last_clean < cutoff)
1947 ++unclean;
7c673cae 1948 }
31f18b77
FG
1949 if (i->second.state & PG_STATE_DEGRADED) {
1950 if (i->second.last_undegraded < cutoff)
1951 ++degraded;
7c673cae 1952 }
31f18b77
FG
1953 if (i->second.state & PG_STATE_UNDERSIZED) {
1954 if (i->second.last_fullsized < cutoff)
1955 ++undersized;
7c673cae 1956 }
31f18b77
FG
1957 if (i->second.state & PG_STATE_STALE) {
1958 if (i->second.last_unstale < cutoff)
1959 ++stale;
7c673cae
FG
1960 }
1961 }
31f18b77
FG
1962
1963 if (inactive)
1964 note["stuck inactive"] = inactive;
1965
1966 if (unclean)
1967 note["stuck unclean"] = unclean;
1968
1969 if (undersized)
1970 note["stuck undersized"] = undersized;
1971
1972 if (degraded)
1973 note["stuck degraded"] = degraded;
1974
1975 if (stale)
1976 note["stuck stale"] = stale;
1977
1978 return inactive || unclean || undersized || degraded || stale;
1979}
1980
9f95a23c 1981void PGMap::dump_stuck(ceph::Formatter *f, int types, utime_t cutoff) const
31f18b77
FG
1982{
1983 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
1984 get_stuck_stats(types, cutoff, stuck_pg_stats);
1985 f->open_array_section("stuck_pg_stats");
1986 for (auto i = stuck_pg_stats.begin();
1987 i != stuck_pg_stats.end();
1988 ++i) {
1989 f->open_object_section("pg_stat");
1990 f->dump_stream("pgid") << i->first;
1991 i->second.dump(f);
1992 f->close_section();
1993 }
1994 f->close_section();
1995}
1996
1997void PGMap::dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const
1998{
1999 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
2000 get_stuck_stats(types, cutoff, stuck_pg_stats);
2001 if (!stuck_pg_stats.empty())
2002 dump_pg_stats_plain(ss, stuck_pg_stats, true);
2003}
2004
2005int PGMap::dump_stuck_pg_stats(
2006 stringstream &ds,
9f95a23c 2007 ceph::Formatter *f,
31f18b77
FG
2008 int threshold,
2009 vector<string>& args) const
2010{
2011 int stuck_types = 0;
2012
2013 for (auto i = args.begin(); i != args.end(); ++i) {
2014 if (*i == "inactive")
2015 stuck_types |= PGMap::STUCK_INACTIVE;
2016 else if (*i == "unclean")
2017 stuck_types |= PGMap::STUCK_UNCLEAN;
2018 else if (*i == "undersized")
2019 stuck_types |= PGMap::STUCK_UNDERSIZED;
2020 else if (*i == "degraded")
2021 stuck_types |= PGMap::STUCK_DEGRADED;
2022 else if (*i == "stale")
2023 stuck_types |= PGMap::STUCK_STALE;
2024 else {
2025 ds << "Unknown type: " << *i << std::endl;
2026 return -EINVAL;
7c673cae
FG
2027 }
2028 }
31f18b77
FG
2029
2030 utime_t now(ceph_clock_now());
2031 utime_t cutoff = now - utime_t(threshold, 0);
2032
2033 if (!f) {
2034 dump_stuck_plain(ds, stuck_types, cutoff);
2035 } else {
2036 dump_stuck(f, stuck_types, cutoff);
2037 f->flush(ds);
7c673cae 2038 }
31f18b77
FG
2039
2040 return 0;
7c673cae
FG
2041}
2042
9f95a23c 2043void PGMap::dump_osd_perf_stats(ceph::Formatter *f) const
7c673cae 2044{
31f18b77
FG
2045 f->open_array_section("osd_perf_infos");
2046 for (auto i = osd_stat.begin();
2047 i != osd_stat.end();
2048 ++i) {
2049 f->open_object_section("osd");
2050 f->dump_int("id", i->first);
2051 {
2052 f->open_object_section("perf_stats");
2053 i->second.os_perf_stat.dump(f);
2054 f->close_section();
2055 }
2056 f->close_section();
2057 }
2058 f->close_section();
7c673cae 2059}
31f18b77 2060void PGMap::print_osd_perf_stats(std::ostream *ss) const
7c673cae 2061{
31f18b77
FG
2062 TextTable tab;
2063 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2064 tab.define_column("commit_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2065 tab.define_column("apply_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2066 for (auto i = osd_stat.begin();
2067 i != osd_stat.end();
2068 ++i) {
2069 tab << i->first;
11fdf7f2
TL
2070 tab << i->second.os_perf_stat.os_commit_latency_ns / 1000000ull;
2071 tab << i->second.os_perf_stat.os_apply_latency_ns / 1000000ull;
31f18b77
FG
2072 tab << TextTable::endrow;
2073 }
2074 (*ss) << tab;
2075}
7c673cae 2076
9f95a23c 2077void PGMap::dump_osd_blocked_by_stats(ceph::Formatter *f) const
31f18b77
FG
2078{
2079 f->open_array_section("osd_blocked_by_infos");
2080 for (auto i = blocked_by_sum.begin();
2081 i != blocked_by_sum.end();
2082 ++i) {
2083 f->open_object_section("osd");
2084 f->dump_int("id", i->first);
2085 f->dump_int("num_blocked", i->second);
2086 f->close_section();
2087 }
2088 f->close_section();
2089}
2090void PGMap::print_osd_blocked_by_stats(std::ostream *ss) const
2091{
2092 TextTable tab;
2093 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2094 tab.define_column("num_blocked", TextTable::LEFT, TextTable::RIGHT);
2095 for (auto i = blocked_by_sum.begin();
2096 i != blocked_by_sum.end();
2097 ++i) {
2098 tab << i->first;
2099 tab << i->second;
2100 tab << TextTable::endrow;
2101 }
2102 (*ss) << tab;
7c673cae
FG
2103}
2104
31f18b77 2105
7c673cae
FG
2106/**
2107 * update aggregated delta
2108 *
2109 * @param cct ceph context
2110 * @param ts Timestamp for the stats being delta'ed
2111 * @param old_pool_sum Previous stats sum
2112 * @param last_ts Last timestamp for pool
2113 * @param result_pool_sum Resulting stats
2114 * @param result_pool_delta Resulting pool delta
2115 * @param result_ts_delta Resulting timestamp delta
2116 * @param delta_avg_list List of last N computed deltas, used to average
2117 */
31f18b77
FG
2118void PGMap::update_delta(
2119 CephContext *cct,
2120 const utime_t ts,
2121 const pool_stat_t& old_pool_sum,
2122 utime_t *last_ts,
2123 const pool_stat_t& current_pool_sum,
2124 pool_stat_t *result_pool_delta,
2125 utime_t *result_ts_delta,
2126 mempool::pgmap::list<pair<pool_stat_t,utime_t> > *delta_avg_list)
7c673cae
FG
2127{
2128 /* @p ts is the timestamp we want to associate with the data
2129 * in @p old_pool_sum, and on which we will base ourselves to
2130 * calculate the delta, stored in 'delta_t'.
2131 */
2132 utime_t delta_t;
2133 delta_t = ts; // start with the provided timestamp
2134 delta_t -= *last_ts; // take the last timestamp we saw
2135 *last_ts = ts; // @p ts becomes the last timestamp we saw
2136
31f18b77
FG
2137 // adjust delta_t, quick start if there is no update in a long period
2138 delta_t = std::min(delta_t,
2139 utime_t(2 * (cct ? cct->_conf->mon_delta_reset_interval : 10), 0));
2140
2141 // calculate a delta, and average over the last 6 deltas by default.
7c673cae
FG
2142 /* start by taking a copy of our current @p result_pool_sum, and by
2143 * taking out the stats from @p old_pool_sum. This generates a stats
2144 * delta. Stash this stats delta in @p delta_avg_list, along with the
2145 * timestamp delta for these results.
2146 */
2147 pool_stat_t d = current_pool_sum;
2148 d.stats.sub(old_pool_sum.stats);
7c673cae
FG
2149
2150 /* Aggregate current delta, and take out the last seen delta (if any) to
2151 * average it out.
b32b8144 2152 * Skip calculating delta while sum was not synchronized.
7c673cae 2153 */
b32b8144
FG
2154 if(!old_pool_sum.stats.sum.is_zero()) {
2155 delta_avg_list->push_back(make_pair(d,delta_t));
2156 *result_ts_delta += delta_t;
2157 result_pool_delta->stats.add(d.stats);
2158 }
11fdf7f2
TL
2159 size_t s = cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
2160 while (delta_avg_list->size() > s) {
7c673cae
FG
2161 result_pool_delta->stats.sub(delta_avg_list->front().first.stats);
2162 *result_ts_delta -= delta_avg_list->front().second;
2163 delta_avg_list->pop_front();
2164 }
2165}
2166
7c673cae
FG
2167/**
2168 * Update a given pool's deltas
2169 *
2170 * @param cct Ceph Context
2171 * @param ts Timestamp for the stats being delta'ed
2172 * @param pool Pool's id
2173 * @param old_pool_sum Previous stats sum
2174 */
31f18b77
FG
2175void PGMap::update_one_pool_delta(
2176 CephContext *cct,
2177 const utime_t ts,
11fdf7f2 2178 const int64_t pool,
31f18b77 2179 const pool_stat_t& old_pool_sum)
7c673cae
FG
2180{
2181 if (per_pool_sum_deltas.count(pool) == 0) {
11fdf7f2
TL
2182 ceph_assert(per_pool_sum_deltas_stamps.count(pool) == 0);
2183 ceph_assert(per_pool_sum_delta.count(pool) == 0);
7c673cae
FG
2184 }
2185
31f18b77 2186 auto& sum_delta = per_pool_sum_delta[pool];
7c673cae
FG
2187
2188 update_delta(cct, ts, old_pool_sum, &sum_delta.second, pg_pool_sum[pool],
2189 &sum_delta.first, &per_pool_sum_deltas_stamps[pool],
2190 &per_pool_sum_deltas[pool]);
2191}
2192
2193/**
2194 * Update pools' deltas
2195 *
2196 * @param cct CephContext
2197 * @param ts Timestamp for the stats being delta'ed
2198 * @param pg_pool_sum_old Map of pool stats for delta calcs.
2199 */
31f18b77
FG
2200void PGMap::update_pool_deltas(
2201 CephContext *cct, const utime_t ts,
11fdf7f2 2202 const mempool::pgmap::unordered_map<int32_t,pool_stat_t>& pg_pool_sum_old)
7c673cae 2203{
31f18b77 2204 for (auto it = pg_pool_sum_old.begin();
7c673cae
FG
2205 it != pg_pool_sum_old.end(); ++it) {
2206 update_one_pool_delta(cct, ts, it->first, it->second);
2207 }
2208}
2209
2210void PGMap::clear_delta()
2211{
2212 pg_sum_delta = pool_stat_t();
2213 pg_sum_deltas.clear();
2214 stamp_delta = utime_t();
2215}
2216
7c673cae
FG
2217void PGMap::generate_test_instances(list<PGMap*>& o)
2218{
2219 o.push_back(new PGMap);
2220 list<Incremental*> inc;
2221 Incremental::generate_test_instances(inc);
2222 delete inc.front();
2223 inc.pop_front();
2224 while (!inc.empty()) {
2225 PGMap *pmp = new PGMap();
2226 *pmp = *o.back();
2227 o.push_back(pmp);
2228 o.back()->apply_incremental(NULL, *inc.front());
2229 delete inc.front();
2230 inc.pop_front();
2231 }
2232}
2233
11fdf7f2 2234void PGMap::get_filtered_pg_stats(uint64_t state, int64_t poolid, int64_t osdid,
7c673cae
FG
2235 bool primary, set<pg_t>& pgs) const
2236{
31f18b77 2237 for (auto i = pg_stat.begin();
7c673cae
FG
2238 i != pg_stat.end();
2239 ++i) {
11fdf7f2 2240 if ((poolid >= 0) && (poolid != i->first.pool()))
7c673cae
FG
2241 continue;
2242 if ((osdid >= 0) && !(i->second.is_acting_osd(osdid,primary)))
2243 continue;
11fdf7f2
TL
2244 if (state == (uint64_t)-1 || // "all"
2245 (i->second.state & state) || // matches a state bit
2246 (state == 0 && i->second.state == 0)) { // matches "unknown" (== 0)
2247 pgs.insert(i->first);
2248 }
7c673cae
FG
2249 }
2250}
2251
9f95a23c 2252void PGMap::dump_filtered_pg_stats(ceph::Formatter *f, set<pg_t>& pgs) const
7c673cae
FG
2253{
2254 f->open_array_section("pg_stats");
31f18b77 2255 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
7c673cae
FG
2256 const pg_stat_t& st = pg_stat.at(*i);
2257 f->open_object_section("pg_stat");
2258 f->dump_stream("pgid") << *i;
2259 st.dump(f);
2260 f->close_section();
2261 }
2262 f->close_section();
2263}
2264
2265void PGMap::dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const
2266{
2267 TextTable tab;
11fdf7f2 2268 utime_t now = ceph_clock_now();
7c673cae 2269
11fdf7f2 2270 tab.define_column("PG", TextTable::LEFT, TextTable::LEFT);
7c673cae 2271 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
7c673cae
FG
2272 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
2273 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
2274 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
2275 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2
TL
2276 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
2277 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
7c673cae 2278 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
7c673cae 2279 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
11fdf7f2 2280 tab.define_column("SINCE", TextTable::LEFT, TextTable::RIGHT);
7c673cae
FG
2281 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
2282 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
2283 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
7c673cae 2284 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
7c673cae 2285 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
7c673cae
FG
2286 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
2287
31f18b77 2288 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
7c673cae
FG
2289 const pg_stat_t& st = pg_stat.at(*i);
2290
2291 ostringstream reported;
2292 reported << st.reported_epoch << ":" << st.reported_seq;
2293
11fdf7f2 2294 ostringstream upstr, actingstr;
9f95a23c
TL
2295 upstr << pg_vector_string(st.up) << 'p' << st.up_primary;
2296 actingstr << pg_vector_string(st.acting) << 'p' << st.acting_primary;
7c673cae
FG
2297 tab << *i
2298 << st.stats.sum.num_objects
7c673cae
FG
2299 << st.stats.sum.num_objects_degraded
2300 << st.stats.sum.num_objects_misplaced
2301 << st.stats.sum.num_objects_unfound
2302 << st.stats.sum.num_bytes
11fdf7f2
TL
2303 << st.stats.sum.num_omap_bytes
2304 << st.stats.sum.num_omap_keys
7c673cae 2305 << st.log_size
7c673cae 2306 << pg_state_string(st.state)
11fdf7f2 2307 << utimespan_str(now - st.last_change)
7c673cae
FG
2308 << st.version
2309 << reported.str()
11fdf7f2
TL
2310 << upstr.str()
2311 << actingstr.str()
7c673cae 2312 << st.last_scrub_stamp
7c673cae
FG
2313 << st.last_deep_scrub_stamp
2314 << TextTable::endrow;
2315 }
2316
2317 ss << tab;
2318}
2319
11fdf7f2 2320void PGMap::dump_pool_stats_and_io_rate(int64_t poolid, const OSDMap &osd_map,
9f95a23c 2321 ceph::Formatter *f,
11fdf7f2
TL
2322 stringstream *rs) const {
2323 string pool_name = osd_map.get_pool_name(poolid);
2324 if (f) {
2325 f->open_object_section("pool");
2326 f->dump_string("pool_name", pool_name.c_str());
2327 f->dump_int("pool_id", poolid);
2328 f->open_object_section("recovery");
2329 }
2330 list<string> sl;
2331 stringstream tss;
2332 pool_recovery_summary(f, &sl, poolid);
2333 if (!f && !sl.empty()) {
2334 for (auto &p : sl)
2335 tss << " " << p << "\n";
2336 }
2337 if (f) {
2338 f->close_section(); // object section recovery
2339 f->open_object_section("recovery_rate");
2340 }
2341 ostringstream rss;
2342 pool_recovery_rate_summary(f, &rss, poolid);
2343 if (!f && !rss.str().empty())
2344 tss << " recovery io " << rss.str() << "\n";
2345 if (f) {
2346 f->close_section(); // object section recovery_rate
2347 f->open_object_section("client_io_rate");
2348 }
2349 rss.clear();
2350 rss.str("");
2351 pool_client_io_rate_summary(f, &rss, poolid);
2352 if (!f && !rss.str().empty())
2353 tss << " client io " << rss.str() << "\n";
2354 // dump cache tier IO rate for cache pool
2355 const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
2356 if (pool->is_tier()) {
2357 if (f) {
2358 f->close_section(); // object section client_io_rate
2359 f->open_object_section("cache_io_rate");
7c673cae 2360 }
11fdf7f2
TL
2361 rss.clear();
2362 rss.str("");
2363 pool_cache_io_rate_summary(f, &rss, poolid);
2364 if (!f && !rss.str().empty())
2365 tss << " cache tier io " << rss.str() << "\n";
2366 }
2367 if (f) {
2368 f->close_section(); // object section cache_io_rate
2369 f->close_section(); // object section pool
2370 } else {
2371 *rs << "pool " << pool_name << " id " << poolid << "\n";
2372 if (!tss.str().empty())
2373 *rs << tss.str() << "\n";
2374 else
2375 *rs << " nothing is going on\n\n";
7c673cae 2376 }
7c673cae
FG
2377}
2378
9f95a23c
TL
2379// Get crush parentage for an osd (skip root)
2380set<std::string> PGMap::osd_parentage(const OSDMap& osdmap, int id) const
2381{
2382 set<std::string> reporters_by_subtree;
2383 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
2384
2385 auto loc = osdmap.crush->get_full_location(id);
2386 for (auto& [parent_bucket_type, parent_id] : loc) {
2387 // Should we show the root? Might not be too informative like "default"
2388 if (parent_bucket_type != "root" &&
2389 parent_bucket_type != reporter_subtree_level) {
2390 reporters_by_subtree.insert(parent_id);
2391 }
2392 }
2393 return reporters_by_subtree;
2394}
2395
11fdf7f2 2396void PGMap::get_health_checks(
31f18b77 2397 CephContext *cct,
11fdf7f2
TL
2398 const OSDMap& osdmap,
2399 health_check_map_t *checks) const
7c673cae 2400{
11fdf7f2
TL
2401 utime_t now = ceph_clock_now();
2402 const auto max = cct->_conf.get_val<uint64_t>("mon_health_max_detail");
2403 const auto& pools = osdmap.get_pools();
224ce89b 2404
224ce89b
WB
2405 typedef enum pg_consequence_t {
2406 UNAVAILABLE = 1, // Client IO to the pool may block
2407 DEGRADED = 2, // Fewer than the requested number of replicas are present
eafe8130
TL
2408 BACKFILL_FULL = 3, // Backfill is blocked for space considerations
2409 // This may or may not be a deadlock condition.
2410 DAMAGED = 4, // The data may be missing or inconsistent on disk and
224ce89b 2411 // requires repair
eafe8130 2412 RECOVERY_FULL = 5 // Recovery is blocked because OSDs are full
224ce89b
WB
2413 } pg_consequence_t;
2414
2415 // For a given PG state, how should it be reported at the pool level?
2416 class PgStateResponse {
2417 public:
2418 pg_consequence_t consequence;
2419 typedef std::function< utime_t(const pg_stat_t&) > stuck_cb;
2420 stuck_cb stuck_since;
2421 bool invert;
2422
11fdf7f2
TL
2423 PgStateResponse(const pg_consequence_t& c, stuck_cb&& s)
2424 : consequence(c), stuck_since(std::move(s)), invert(false)
224ce89b
WB
2425 {
2426 }
2427
11fdf7f2
TL
2428 PgStateResponse(const pg_consequence_t& c, stuck_cb&& s, bool i)
2429 : consequence(c), stuck_since(std::move(s)), invert(i)
224ce89b
WB
2430 {
2431 }
2432 };
2433
2434 // Record the PG state counts that contributed to a reported pool state
2435 class PgCauses {
2436 public:
2437 // Map of PG_STATE_* to number of pgs in that state.
2438 std::map<unsigned, unsigned> states;
2439
2440 // List of all PG IDs that had a state contributing
2441 // to this health condition.
2442 std::set<pg_t> pgs;
2443
2444 std::map<pg_t, std::string> pg_messages;
2445 };
2446
2447 // Map of PG state to how to respond to it
2448 std::map<unsigned, PgStateResponse> state_to_response = {
2449 // Immediate reports
2450 { PG_STATE_INCONSISTENT, {DAMAGED, {}} },
c07f9fc5 2451 { PG_STATE_INCOMPLETE, {UNAVAILABLE, {}} },
224ce89b 2452 { PG_STATE_SNAPTRIM_ERROR, {DAMAGED, {}} },
b32b8144
FG
2453 { PG_STATE_RECOVERY_UNFOUND, {DAMAGED, {}} },
2454 { PG_STATE_BACKFILL_UNFOUND, {DAMAGED, {}} },
eafe8130
TL
2455 { PG_STATE_BACKFILL_TOOFULL, {BACKFILL_FULL, {}} },
2456 { PG_STATE_RECOVERY_TOOFULL, {RECOVERY_FULL, {}} },
224ce89b
WB
2457 { PG_STATE_DEGRADED, {DEGRADED, {}} },
2458 { PG_STATE_DOWN, {UNAVAILABLE, {}} },
2459 // Delayed (wait until stuck) reports
2460 { PG_STATE_PEERING, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_peered;} } },
2461 { PG_STATE_UNDERSIZED, {DEGRADED, [](const pg_stat_t &p){return p.last_fullsized;} } },
2462 { PG_STATE_STALE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_unstale;} } },
2463 // Delayed and inverted reports
b32b8144 2464 { PG_STATE_ACTIVE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_active;}, true} }
224ce89b
WB
2465 };
2466
2467 // Specialized state printer that takes account of inversion of
2468 // ACTIVE, CLEAN checks.
11fdf7f2 2469 auto state_name = [](const uint64_t &state) {
224ce89b
WB
2470 // Special cases for the states that are inverted checks
2471 if (state == PG_STATE_CLEAN) {
2472 return std::string("unclean");
2473 } else if (state == PG_STATE_ACTIVE) {
2474 return std::string("inactive");
2475 } else {
2476 return pg_state_string(state);
2477 }
2478 };
2479
2480 // Map of what is wrong to information about why, implicitly also stores
2481 // the list of what is wrong.
2482 std::map<pg_consequence_t, PgCauses> detected;
2483
2484 // Optimisation: trim down the number of checks to apply based on
2485 // the summary counters
2486 std::map<unsigned, PgStateResponse> possible_responses;
2487 for (const auto &i : num_pg_by_state) {
2488 for (const auto &j : state_to_response) {
2489 if (!j.second.invert) {
2490 // Check for normal tests by seeing if any pgs have the flag
2491 if (i.first & j.first) {
2492 possible_responses.insert(j);
2493 }
2494 }
2495 }
2496 }
2497
2498 for (const auto &j : state_to_response) {
2499 if (j.second.invert) {
2500 // Check for inverted tests by seeing if not-all pgs have the flag
2501 const auto &found = num_pg_by_state.find(j.first);
2502 if (found == num_pg_by_state.end() || found->second != num_pg) {
2503 possible_responses.insert(j);
2504 }
2505 }
2506 }
2507
11fdf7f2 2508 utime_t cutoff = now - utime_t(cct->_conf.get_val<int64_t>("mon_pg_stuck_threshold"), 0);
224ce89b
WB
2509 // Loop over all PGs, if there are any possibly-unhealthy states in there
2510 if (!possible_responses.empty()) {
2511 for (const auto& i : pg_stat) {
2512 const auto &pg_id = i.first;
2513 const auto &pg_info = i.second;
2514
2515 for (const auto &j : state_to_response) {
2516 const auto &pg_response_state = j.first;
2517 const auto &pg_response = j.second;
2518
2519 // Apply the state test
2520 if (!(bool(pg_info.state & pg_response_state) != pg_response.invert)) {
2521 continue;
2522 }
2523
2524 // Apply stuckness test if needed
2525 if (pg_response.stuck_since) {
2526 // Delayed response, check for stuckness
2527 utime_t last_whatever = pg_response.stuck_since(pg_info);
2528 if (last_whatever >= cutoff) {
2529 // Not stuck enough, ignore.
2530 continue;
2531 } else {
2532
2533 }
2534 }
2535
2536 auto &causes = detected[pg_response.consequence];
2537 causes.states[pg_response_state]++;
2538 causes.pgs.insert(pg_id);
2539
2540 // Don't bother composing detail string if we have already recorded
2541 // too many
2542 if (causes.pg_messages.size() > max) {
2543 continue;
2544 }
2545
2546 std::ostringstream ss;
2547 if (pg_response.stuck_since) {
2548 utime_t since = pg_response.stuck_since(pg_info);
2549 ss << "pg " << pg_id << " is stuck " << state_name(pg_response_state);
2550 if (since == utime_t()) {
2551 ss << " since forever";
2552 } else {
2553 utime_t dur = now - since;
9f95a23c 2554 ss << " for " << utimespan_str(dur);
224ce89b
WB
2555 }
2556 ss << ", current state " << pg_state_string(pg_info.state)
2557 << ", last acting " << pg_info.acting;
2558 } else {
2559 ss << "pg " << pg_id << " is "
2560 << pg_state_string(pg_info.state);
2561 ss << ", acting " << pg_info.acting;
2562 if (pg_info.stats.sum.num_objects_unfound) {
2563 ss << ", " << pg_info.stats.sum.num_objects_unfound
2564 << " unfound";
2565 }
2566 }
2567
2568 if (pg_info.state & PG_STATE_INCOMPLETE) {
2569 const pg_pool_t *pi = osdmap.get_pg_pool(pg_id.pool());
2570 if (pi && pi->min_size > 1) {
2571 ss << " (reducing pool "
2572 << osdmap.get_pool_name(pg_id.pool())
2573 << " min_size from " << (int)pi->min_size
2574 << " may help; search ceph.com/docs for 'incomplete')";
2575 }
2576 }
2577
2578 causes.pg_messages[pg_id] = ss.str();
2579 }
2580 }
2581 } else {
2582 dout(10) << __func__ << " skipping loop over PGs: counters look OK" << dendl;
2583 }
2584
2585 for (const auto &i : detected) {
2586 std::string health_code;
2587 health_status_t sev;
2588 std::string summary;
2589 switch(i.first) {
2590 case UNAVAILABLE:
2591 health_code = "PG_AVAILABILITY";
2592 sev = HEALTH_WARN;
2593 summary = "Reduced data availability: ";
2594 break;
2595 case DEGRADED:
2596 health_code = "PG_DEGRADED";
2597 summary = "Degraded data redundancy: ";
2598 sev = HEALTH_WARN;
2599 break;
eafe8130
TL
2600 case BACKFILL_FULL:
2601 health_code = "PG_BACKFILL_FULL";
2602 summary = "Low space hindering backfill (add storage if this doesn't resolve itself): ";
2603 sev = HEALTH_WARN;
224ce89b
WB
2604 break;
2605 case DAMAGED:
2606 health_code = "PG_DAMAGED";
2607 summary = "Possible data damage: ";
2608 sev = HEALTH_ERR;
2609 break;
eafe8130
TL
2610 case RECOVERY_FULL:
2611 health_code = "PG_RECOVERY_FULL";
2612 summary = "Full OSDs blocking recovery: ";
2613 sev = HEALTH_ERR;
2614 break;
224ce89b 2615 default:
11fdf7f2 2616 ceph_abort();
224ce89b
WB
2617 }
2618
2619 if (i.first == DEGRADED) {
2620 if (pg_sum.stats.sum.num_objects_degraded &&
2621 pg_sum.stats.sum.num_object_copies > 0) {
2622 double pc = (double)pg_sum.stats.sum.num_objects_degraded /
2623 (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
2624 char b[20];
2625 snprintf(b, sizeof(b), "%.3lf", pc);
2626 ostringstream ss;
2627 ss << pg_sum.stats.sum.num_objects_degraded
2628 << "/" << pg_sum.stats.sum.num_object_copies << " objects degraded ("
2629 << b << "%)";
2630
2631 // Throw in a comma for the benefit of the following PG counts
2632 summary += ss.str() + ", ";
2633 }
2634 }
2635
2636 // Compose summary message saying how many PGs in what states led
2637 // to this health check failing
2638 std::vector<std::string> pg_msgs;
9f95a23c 2639 int64_t count = 0;
224ce89b
WB
2640 for (const auto &j : i.second.states) {
2641 std::ostringstream msg;
2642 msg << j.second << (j.second > 1 ? " pgs " : " pg ") << state_name(j.first);
2643 pg_msgs.push_back(msg.str());
9f95a23c 2644 count += j.second;
224ce89b
WB
2645 }
2646 summary += joinify(pg_msgs.begin(), pg_msgs.end(), std::string(", "));
2647
224ce89b
WB
2648 health_check_t *check = &checks->add(
2649 health_code,
2650 sev,
9f95a23c
TL
2651 summary,
2652 count);
224ce89b
WB
2653
2654 // Compose list of PGs contributing to this health check failing
2655 for (const auto &j : i.second.pg_messages) {
2656 check->detail.push_back(j.second);
2657 }
2658 }
2659
224ce89b
WB
2660 // OSD_SCRUB_ERRORS
2661 if (pg_sum.stats.sum.num_scrub_errors) {
2662 ostringstream ss;
2663 ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors";
9f95a23c
TL
2664 checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str(),
2665 pg_sum.stats.sum.num_scrub_errors);
224ce89b
WB
2666 }
2667
28e407b8
AA
2668 // LARGE_OMAP_OBJECTS
2669 if (pg_sum.stats.sum.num_large_omap_objects) {
2670 list<string> detail;
2671 for (auto &pool : pools) {
2672 const string& pool_name = osdmap.get_pool_name(pool.first);
2673 auto it2 = pg_pool_sum.find(pool.first);
2674 if (it2 == pg_pool_sum.end()) {
2675 continue;
2676 }
2677 const pool_stat_t *pstat = &it2->second;
2678 if (pstat == nullptr) {
2679 continue;
2680 }
2681 const object_stat_sum_t& sum = pstat->stats.sum;
2682 if (sum.num_large_omap_objects) {
2683 stringstream ss;
2684 ss << sum.num_large_omap_objects << " large objects found in pool "
2685 << "'" << pool_name << "'";
2686 detail.push_back(ss.str());
2687 }
2688 }
2689 if (!detail.empty()) {
2690 ostringstream ss;
2691 ss << pg_sum.stats.sum.num_large_omap_objects << " large omap objects";
9f95a23c
TL
2692 auto& d = checks->add("LARGE_OMAP_OBJECTS", HEALTH_WARN, ss.str(),
2693 pg_sum.stats.sum.num_large_omap_objects);
28e407b8
AA
2694 stringstream tip;
2695 tip << "Search the cluster log for 'Large omap object found' for more "
2696 << "details.";
2697 detail.push_back(tip.str());
2698 d.detail.swap(detail);
2699 }
2700 }
2701
224ce89b
WB
2702 // CACHE_POOL_NEAR_FULL
2703 {
2704 list<string> detail;
2705 unsigned num_pools = 0;
2706 for (auto& p : pools) {
2707 if ((!p.second.target_max_objects && !p.second.target_max_bytes) ||
2708 !pg_pool_sum.count(p.first)) {
2709 continue;
2710 }
2711 bool nearfull = false;
2712 const string& name = osdmap.get_pool_name(p.first);
2713 const pool_stat_t& st = get_pg_pool_sum_stat(p.first);
2714 uint64_t ratio = p.second.cache_target_full_ratio_micro +
2715 ((1000000 - p.second.cache_target_full_ratio_micro) *
2716 cct->_conf->mon_cache_target_full_warn_ratio);
2717 if (p.second.target_max_objects &&
2718 (uint64_t)(st.stats.sum.num_objects -
2719 st.stats.sum.num_objects_hit_set_archive) >
2720 p.second.target_max_objects * (ratio / 1000000.0)) {
2721 ostringstream ss;
2722 ss << "cache pool '" << name << "' with "
1adf2230 2723 << si_u_t(st.stats.sum.num_objects)
224ce89b 2724 << " objects at/near target max "
1adf2230 2725 << si_u_t(p.second.target_max_objects) << " objects";
224ce89b
WB
2726 detail.push_back(ss.str());
2727 nearfull = true;
2728 }
2729 if (p.second.target_max_bytes &&
2730 (uint64_t)(st.stats.sum.num_bytes -
2731 st.stats.sum.num_bytes_hit_set_archive) >
2732 p.second.target_max_bytes * (ratio / 1000000.0)) {
2733 ostringstream ss;
2734 ss << "cache pool '" << name
1adf2230
AA
2735 << "' with " << byte_u_t(st.stats.sum.num_bytes)
2736 << " at/near target max "
2737 << byte_u_t(p.second.target_max_bytes);
224ce89b
WB
2738 detail.push_back(ss.str());
2739 nearfull = true;
2740 }
2741 if (nearfull) {
2742 ++num_pools;
2743 }
2744 }
2745 if (!detail.empty()) {
2746 ostringstream ss;
2747 ss << num_pools << " cache pools at or near target size";
9f95a23c
TL
2748 auto& d = checks->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN, ss.str(),
2749 num_pools);
224ce89b
WB
2750 d.detail.swap(detail);
2751 }
2752 }
2753
2754 // TOO_FEW_PGS
3efd9988
FG
2755 unsigned num_in = osdmap.get_num_in_osds();
2756 auto sum_pg_up = std::max(static_cast<size_t>(pg_sum.up), pg_stat.size());
2757 const auto min_pg_per_osd =
11fdf7f2 2758 cct->_conf.get_val<uint64_t>("mon_pg_warn_min_per_osd");
3efd9988
FG
2759 if (num_in && min_pg_per_osd > 0 && osdmap.get_pools().size() > 0) {
2760 auto per = sum_pg_up / num_in;
2761 if (per < min_pg_per_osd && per) {
224ce89b
WB
2762 ostringstream ss;
2763 ss << "too few PGs per OSD (" << per
3efd9988 2764 << " < min " << min_pg_per_osd << ")";
9f95a23c
TL
2765 checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str(),
2766 min_pg_per_osd - per);
224ce89b
WB
2767 }
2768 }
2769
2770 // TOO_MANY_PGS
11fdf7f2 2771 auto max_pg_per_osd = cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd");
3efd9988
FG
2772 if (num_in && max_pg_per_osd > 0) {
2773 auto per = sum_pg_up / num_in;
2774 if (per > max_pg_per_osd) {
224ce89b
WB
2775 ostringstream ss;
2776 ss << "too many PGs per OSD (" << per
3efd9988 2777 << " > max " << max_pg_per_osd << ")";
9f95a23c
TL
2778 checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str(),
2779 per - max_pg_per_osd);
224ce89b
WB
2780 }
2781 }
2782
eafe8130
TL
2783 // TOO_FEW_OSDS
2784 auto warn_too_few_osds = cct->_conf.get_val<bool>("mon_warn_on_too_few_osds");
2785 auto osd_pool_default_size = cct->_conf.get_val<uint64_t>("osd_pool_default_size");
2786 if (warn_too_few_osds && osdmap.get_num_osds() < osd_pool_default_size) {
2787 ostringstream ss;
2788 ss << "OSD count " << osdmap.get_num_osds()
2789 << " < osd_pool_default_size " << osd_pool_default_size;
9f95a23c
TL
2790 checks->add("TOO_FEW_OSDS", HEALTH_WARN, ss.str(),
2791 osd_pool_default_size - osdmap.get_num_osds());
eafe8130
TL
2792 }
2793
2794 // SLOW_PING_TIME
2795 // Convert milliseconds to microseconds
2796 auto warn_slow_ping_time = cct->_conf.get_val<double>("mon_warn_on_slow_ping_time") * 1000;
2797 auto grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
2798 if (warn_slow_ping_time == 0) {
2799 double ratio = cct->_conf.get_val<double>("mon_warn_on_slow_ping_ratio");
2800 warn_slow_ping_time = grace;
2801 warn_slow_ping_time *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
2802 }
2803 if (warn_slow_ping_time > 0) {
2804
// One slow-heartbeat observation between a pair of OSDs.  Instances are
// kept in a std::set, so operator< defines both the sort order and what
// counts as a duplicate entry.
struct mon_ping_item_t {
  uint32_t pingtime;  // ping time used as the primary sort key
  int from;           // id of the reporting OSD
  int to;             // id of the peer OSD
  bool improving;     // informational only; not part of the ordering

  // Lexicographic order on (pingtime, from, to).
  bool operator<(const mon_ping_item_t& rhs) const {
    if (pingtime != rhs.pingtime) {
      return pingtime < rhs.pingtime;
    }
    if (from != rhs.from) {
      return from < rhs.from;
    }
    return to < rhs.to;
  }
};
2823
2824 list<string> detail_back;
2825 list<string> detail_front;
2826 set<mon_ping_item_t> back_sorted, front_sorted;
2827 for (auto i : osd_stat) {
2828 for (auto j : i.second.hb_pingtime) {
2829
2830 // Maybe source info is old
2831 if (now.sec() - j.second.last_update > grace * 60)
2832 continue;
2833
2834 mon_ping_item_t back;
2835 back.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
2836 back.pingtime = std::max(back.pingtime, j.second.back_pingtime[2]);
2837 back.from = i.first;
2838 back.to = j.first;
2839 if (back.pingtime > warn_slow_ping_time) {
2840 back.improving = (j.second.back_pingtime[0] < j.second.back_pingtime[1]
2841 && j.second.back_pingtime[1] < j.second.back_pingtime[2]);
2842 back_sorted.emplace(back);
2843 }
2844
2845 mon_ping_item_t front;
2846 front.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
2847 front.pingtime = std::max(front.pingtime, j.second.front_pingtime[2]);
2848 front.from = i.first;
2849 front.to = j.first;
2850 if (front.pingtime > warn_slow_ping_time) {
2851 front.improving = (j.second.front_pingtime[0] < j.second.front_pingtime[1]
2852 && j.second.front_pingtime[1] < j.second.back_pingtime[2]);
2853 front_sorted.emplace(front);
2854 }
2855 }
2856 }
2857 int max_detail = 10;
2858 for (auto &sback : boost::adaptors::reverse(back_sorted)) {
2859 ostringstream ss;
2860 if (max_detail == 0) {
2861 ss << "Truncated long network list. Use ceph daemon mgr.# dump_osd_network for more information";
2862 detail_back.push_back(ss.str());
2863 break;
2864 }
2865 max_detail--;
9f95a23c
TL
2866 ss << "Slow OSD heartbeats on back from osd." << sback.from
2867 << " [" << osd_parentage(osdmap, sback.from) << "]"
eafe8130
TL
2868 << (osdmap.is_down(sback.from) ? " (down)" : "")
2869 << " to osd." << sback.to
9f95a23c 2870 << " [" << osd_parentage(osdmap, sback.to) << "]"
eafe8130
TL
2871 << (osdmap.is_down(sback.to) ? " (down)" : "")
2872 << " " << fixed_u_to_string(sback.pingtime, 3) << " msec"
2873 << (sback.improving ? " possibly improving" : "");
2874 detail_back.push_back(ss.str());
2875 }
2876 max_detail = 10;
2877 for (auto &sfront : boost::adaptors::reverse(front_sorted)) {
2878 ostringstream ss;
2879 if (max_detail == 0) {
2880 ss << "Truncated long network list. Use ceph daemon mgr.# dump_osd_network for more information";
2881 detail_front.push_back(ss.str());
2882 break;
2883 }
2884 max_detail--;
9f95a23c
TL
2885 // Get crush parentage for each osd
2886 ss << "Slow OSD heartbeats on front from osd." << sfront.from
2887 << " [" << osd_parentage(osdmap, sfront.from) << "]"
eafe8130
TL
2888 << (osdmap.is_down(sfront.from) ? " (down)" : "")
2889 << " to osd." << sfront.to
9f95a23c 2890 << " [" << osd_parentage(osdmap, sfront.to) << "]"
eafe8130
TL
2891 << (osdmap.is_down(sfront.to) ? " (down)" : "")
2892 << " " << fixed_u_to_string(sfront.pingtime, 3) << " msec"
2893 << (sfront.improving ? " possibly improving" : "");
2894 detail_front.push_back(ss.str());
2895 }
2896 if (detail_back.size() != 0) {
2897 ostringstream ss;
9f95a23c
TL
2898 ss << "Slow OSD heartbeats on back (longest "
2899 << fixed_u_to_string(back_sorted.rbegin()->pingtime, 3) << "ms)";
2900 auto& d = checks->add("OSD_SLOW_PING_TIME_BACK", HEALTH_WARN, ss.str(),
2901 back_sorted.size());
eafe8130
TL
2902 d.detail.swap(detail_back);
2903 }
2904 if (detail_front.size() != 0) {
2905 ostringstream ss;
9f95a23c
TL
2906 ss << "Slow OSD heartbeats on front (longest "
2907 << fixed_u_to_string(front_sorted.rbegin()->pingtime, 3) << "ms)";
2908 auto& d = checks->add("OSD_SLOW_PING_TIME_FRONT", HEALTH_WARN, ss.str(),
2909 front_sorted.size());
eafe8130
TL
2910 d.detail.swap(detail_front);
2911 }
2912 }
2913
224ce89b
WB
2914 // SMALLER_PGP_NUM
2915 // MANY_OBJECTS_PER_PG
2916 if (!pg_stat.empty()) {
2917 list<string> pgp_detail, many_detail;
b32b8144 2918 const auto mon_pg_warn_min_objects =
11fdf7f2 2919 cct->_conf.get_val<int64_t>("mon_pg_warn_min_objects");
b32b8144 2920 const auto mon_pg_warn_min_pool_objects =
11fdf7f2 2921 cct->_conf.get_val<int64_t>("mon_pg_warn_min_pool_objects");
b32b8144 2922 const auto mon_pg_warn_max_object_skew =
11fdf7f2 2923 cct->_conf.get_val<double>("mon_pg_warn_max_object_skew");
224ce89b
WB
2924 for (auto p = pg_pool_sum.begin();
2925 p != pg_pool_sum.end();
2926 ++p) {
2927 const pg_pool_t *pi = osdmap.get_pg_pool(p->first);
2928 if (!pi)
2929 continue; // in case osdmap changes haven't propagated to PGMap yet
2930 const string& name = osdmap.get_pool_name(p->first);
11fdf7f2
TL
2931 // NOTE: we use pg_num_target and pgp_num_target for the purposes of
2932 // the warnings. If the cluster is failing to converge on the target
2933 // values that is a separate issue!
2934 if (pi->get_pg_num_target() > pi->get_pgp_num_target() &&
224ce89b
WB
2935 !(name.find(".DELETED") != string::npos &&
2936 cct->_conf->mon_fake_pool_delete)) {
2937 ostringstream ss;
2938 ss << "pool " << name << " pg_num "
11fdf7f2
TL
2939 << pi->get_pg_num_target()
2940 << " > pgp_num " << pi->get_pgp_num_target();
224ce89b
WB
2941 pgp_detail.push_back(ss.str());
2942 }
2943 int average_objects_per_pg = pg_sum.stats.sum.num_objects / pg_stat.size();
2944 if (average_objects_per_pg > 0 &&
b32b8144
FG
2945 pg_sum.stats.sum.num_objects >= mon_pg_warn_min_objects &&
2946 p->second.stats.sum.num_objects >= mon_pg_warn_min_pool_objects) {
11fdf7f2
TL
2947 int objects_per_pg = p->second.stats.sum.num_objects /
2948 pi->get_pg_num_target();
224ce89b 2949 float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
b32b8144
FG
2950 if (mon_pg_warn_max_object_skew > 0 &&
2951 ratio > mon_pg_warn_max_object_skew) {
224ce89b
WB
2952 ostringstream ss;
2953 ss << "pool " << name << " objects per pg ("
2954 << objects_per_pg << ") is more than " << ratio
2955 << " times cluster average ("
2956 << average_objects_per_pg << ")";
2957 many_detail.push_back(ss.str());
2958 }
2959 }
2960 }
2961 if (!pgp_detail.empty()) {
2962 ostringstream ss;
2963 ss << pgp_detail.size() << " pools have pg_num > pgp_num";
9f95a23c
TL
2964 auto& d = checks->add("SMALLER_PGP_NUM", HEALTH_WARN, ss.str(),
2965 pgp_detail.size());
224ce89b
WB
2966 d.detail.swap(pgp_detail);
2967 }
2968 if (!many_detail.empty()) {
2969 ostringstream ss;
2970 ss << many_detail.size() << " pools have many more objects per pg than"
2971 << " average";
9f95a23c
TL
2972 auto& d = checks->add("MANY_OBJECTS_PER_PG", HEALTH_WARN, ss.str(),
2973 many_detail.size());
224ce89b
WB
2974 d.detail.swap(many_detail);
2975 }
2976 }
2977
2978 // POOL_FULL
2979 // POOL_NEAR_FULL
2980 {
11fdf7f2
TL
2981 float warn_threshold = (float)g_conf().get_val<int64_t>("mon_pool_quota_warn_threshold")/100;
2982 float crit_threshold = (float)g_conf().get_val<int64_t>("mon_pool_quota_crit_threshold")/100;
224ce89b
WB
2983 list<string> full_detail, nearfull_detail;
2984 unsigned full_pools = 0, nearfull_pools = 0;
2985 for (auto it : pools) {
2986 auto it2 = pg_pool_sum.find(it.first);
2987 if (it2 == pg_pool_sum.end()) {
2988 continue;
2989 }
2990 const pool_stat_t *pstat = &it2->second;
2991 const object_stat_sum_t& sum = pstat->stats.sum;
2992 const string& pool_name = osdmap.get_pool_name(it.first);
2993 const pg_pool_t &pool = it.second;
2994 bool full = false, nearfull = false;
2995 if (pool.quota_max_objects > 0) {
2996 stringstream ss;
2997 if ((uint64_t)sum.num_objects >= pool.quota_max_objects) {
2998 } else if (crit_threshold > 0 &&
2999 sum.num_objects >= pool.quota_max_objects*crit_threshold) {
3000 ss << "pool '" << pool_name
3001 << "' has " << sum.num_objects << " objects"
3002 << " (max " << pool.quota_max_objects << ")";
3003 full_detail.push_back(ss.str());
3004 full = true;
3005 } else if (warn_threshold > 0 &&
3006 sum.num_objects >= pool.quota_max_objects*warn_threshold) {
3007 ss << "pool '" << pool_name
3008 << "' has " << sum.num_objects << " objects"
3009 << " (max " << pool.quota_max_objects << ")";
3010 nearfull_detail.push_back(ss.str());
3011 nearfull = true;
3012 }
3013 }
3014 if (pool.quota_max_bytes > 0) {
3015 stringstream ss;
3016 if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
3017 } else if (crit_threshold > 0 &&
3018 sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
3019 ss << "pool '" << pool_name
1adf2230
AA
3020 << "' has " << byte_u_t(sum.num_bytes)
3021 << " (max " << byte_u_t(pool.quota_max_bytes) << ")";
224ce89b
WB
3022 full_detail.push_back(ss.str());
3023 full = true;
3024 } else if (warn_threshold > 0 &&
3025 sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
3026 ss << "pool '" << pool_name
1adf2230
AA
3027 << "' has " << byte_u_t(sum.num_bytes)
3028 << " (max " << byte_u_t(pool.quota_max_bytes) << ")";
224ce89b
WB
3029 nearfull_detail.push_back(ss.str());
3030 nearfull = true;
3031 }
3032 }
3033 if (full) {
3034 ++full_pools;
3035 }
3036 if (nearfull) {
3037 ++nearfull_pools;
3038 }
3039 }
3040 if (full_pools) {
3041 ostringstream ss;
3042 ss << full_pools << " pools full";
9f95a23c 3043 auto& d = checks->add("POOL_FULL", HEALTH_ERR, ss.str(), full_pools);
224ce89b
WB
3044 d.detail.swap(full_detail);
3045 }
3046 if (nearfull_pools) {
3047 ostringstream ss;
11fdf7f2 3048 ss << nearfull_pools << " pools nearfull";
9f95a23c 3049 auto& d = checks->add("POOL_NEAR_FULL", HEALTH_WARN, ss.str(), nearfull_pools);
224ce89b
WB
3050 d.detail.swap(nearfull_detail);
3051 }
3052 }
3053
3054 // OBJECT_MISPLACED
3055 if (pg_sum.stats.sum.num_objects_misplaced &&
11fdf7f2
TL
3056 pg_sum.stats.sum.num_object_copies > 0 &&
3057 cct->_conf->mon_warn_on_misplaced) {
224ce89b
WB
3058 double pc = (double)pg_sum.stats.sum.num_objects_misplaced /
3059 (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
3060 char b[20];
3061 snprintf(b, sizeof(b), "%.3lf", pc);
3062 ostringstream ss;
3063 ss << pg_sum.stats.sum.num_objects_misplaced
3064 << "/" << pg_sum.stats.sum.num_object_copies << " objects misplaced ("
3065 << b << "%)";
9f95a23c
TL
3066 checks->add("OBJECT_MISPLACED", HEALTH_WARN, ss.str(),
3067 pg_sum.stats.sum.num_objects_misplaced);
224ce89b
WB
3068 }
3069
3070 // OBJECT_UNFOUND
3071 if (pg_sum.stats.sum.num_objects_unfound &&
3072 pg_sum.stats.sum.num_objects) {
3073 double pc = (double)pg_sum.stats.sum.num_objects_unfound /
3074 (double)pg_sum.stats.sum.num_objects * (double)100.0;
3075 char b[20];
3076 snprintf(b, sizeof(b), "%.3lf", pc);
3077 ostringstream ss;
3078 ss << pg_sum.stats.sum.num_objects_unfound
b5b8bbf5 3079 << "/" << pg_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
9f95a23c
TL
3080 auto& d = checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str(),
3081 pg_sum.stats.sum.num_objects_unfound);
c07f9fc5
FG
3082
3083 for (auto& p : pg_stat) {
3084 if (p.second.stats.sum.num_objects_unfound) {
3085 ostringstream ss;
3086 ss << "pg " << p.first
3087 << " has " << p.second.stats.sum.num_objects_unfound
3088 << " unfound objects";
3089 d.detail.push_back(ss.str());
3090 if (d.detail.size() > max) {
3091 d.detail.push_back("(additional pgs left out for brevity)");
3092 break;
3093 }
3094 }
3095 }
224ce89b
WB
3096 }
3097
3098 // REQUEST_SLOW
3099 // REQUEST_STUCK
11fdf7f2 3100 // SLOW_OPS unifies them in mimic.
9f95a23c 3101 if (osdmap.require_osd_release < ceph_release_t::mimic &&
11fdf7f2 3102 cct->_conf->mon_osd_warn_op_age > 0 &&
c07f9fc5
FG
3103 !osd_sum.op_queue_age_hist.h.empty() &&
3104 osd_sum.op_queue_age_hist.upper_bound() / 1000.0 >
224ce89b
WB
3105 cct->_conf->mon_osd_warn_op_age) {
3106 list<string> warn_detail, error_detail;
3107 unsigned warn = 0, error = 0;
3108 float err_age =
3109 cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio;
3110 const pow2_hist_t& h = osd_sum.op_queue_age_hist;
3111 for (unsigned i = h.h.size() - 1; i > 0; --i) {
3112 float ub = (float)(1 << i) / 1000.0;
3113 if (ub < cct->_conf->mon_osd_warn_op_age)
3114 break;
3115 if (h.h[i]) {
3116 ostringstream ss;
3117 ss << h.h[i] << " ops are blocked > " << ub << " sec";
3118 if (ub > err_age) {
3119 error += h.h[i];
3120 error_detail.push_back(ss.str());
3121 } else {
3122 warn += h.h[i];
3123 warn_detail.push_back(ss.str());
3124 }
3125 }
3126 }
3127
3128 map<float,set<int>> warn_osd_by_max; // max -> osds
3129 map<float,set<int>> error_osd_by_max; // max -> osds
3130 if (!warn_detail.empty() || !error_detail.empty()) {
3131 for (auto& p : osd_stat) {
3132 const pow2_hist_t& h = p.second.op_queue_age_hist;
3133 for (unsigned i = h.h.size() - 1; i > 0; --i) {
3134 float ub = (float)(1 << i) / 1000.0;
3135 if (ub < cct->_conf->mon_osd_warn_op_age)
3136 break;
3137 if (h.h[i]) {
3138 if (ub > err_age) {
3139 error_osd_by_max[ub].insert(p.first);
3140 } else {
3141 warn_osd_by_max[ub].insert(p.first);
3142 }
3143 break;
3144 }
3145 }
3146 }
3147 }
3148
3149 if (!warn_detail.empty()) {
11fdf7f2
TL
3150 ostringstream ss;
3151 ss << warn << " slow requests are blocked > "
3152 << cct->_conf->mon_osd_warn_op_age << " sec";
9f95a23c 3153 auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str(), warn);
11fdf7f2 3154 d.detail.swap(warn_detail);
224ce89b
WB
3155 int left = max;
3156 for (auto& p : warn_osd_by_max) {
3157 ostringstream ss;
3158 if (p.second.size() > 1) {
c07f9fc5
FG
3159 ss << "osds " << p.second
3160 << " have blocked requests > " << p.first << " sec";
224ce89b 3161 } else {
c07f9fc5
FG
3162 ss << "osd." << *p.second.begin()
3163 << " has blocked requests > " << p.first << " sec";
224ce89b 3164 }
11fdf7f2 3165 d.detail.push_back(ss.str());
224ce89b
WB
3166 if (--left == 0) {
3167 break;
3168 }
3169 }
3170 }
3171 if (!error_detail.empty()) {
11fdf7f2
TL
3172 ostringstream ss;
3173 ss << error << " stuck requests are blocked > "
3174 << err_age << " sec";
9f95a23c 3175 auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str(), error);
11fdf7f2 3176 d.detail.swap(error_detail);
224ce89b
WB
3177 int left = max;
3178 for (auto& p : error_osd_by_max) {
3179 ostringstream ss;
3180 if (p.second.size() > 1) {
c07f9fc5
FG
3181 ss << "osds " << p.second
3182 << " have stuck requests > " << p.first << " sec";
224ce89b 3183 } else {
c07f9fc5
FG
3184 ss << "osd." << *p.second.begin()
3185 << " has stuck requests > " << p.first << " sec";
224ce89b 3186 }
11fdf7f2 3187 d.detail.push_back(ss.str());
224ce89b
WB
3188 if (--left == 0) {
3189 break;
3190 }
3191 }
3192 }
3193 }
7c673cae 3194
11fdf7f2
TL
3195 // OBJECT_STORE_WARN
3196 if (osd_sum.os_alerts.size()) {
3197 map<string, pair<size_t, list<string>>> os_alerts_sum;
3198
3199 for (auto& a : osd_sum.os_alerts) {
3200 int left = max;
3201 string s0 = " osd.";
3202 s0 += stringify(a.first);
3203 for (auto& aa : a.second) {
3204 string s(s0);
3205 s += " ";
3206 s += aa.second;
3207 auto it = os_alerts_sum.find(aa.first);
3208 if (it == os_alerts_sum.end()) {
3209 list<string> d;
3210 d.emplace_back(s);
3211 os_alerts_sum.emplace(aa.first, std::make_pair(1, d));
3212 } else {
3213 auto& p = it->second;
3214 ++p.first;
3215 p.second.emplace_back(s);
3216 }
3217 if (--left == 0) {
3218 break;
3219 }
3220 }
3221 }
3222
3223 for (auto& asum : os_alerts_sum) {
9f95a23c 3224 string summary = stringify(asum.second.first) + " OSD(s)";
11fdf7f2 3225 if (asum.first == "BLUEFS_SPILLOVER") {
9f95a23c 3226 summary += " experiencing BlueFS spillover";
11fdf7f2 3227 } else if (asum.first == "BLUESTORE_NO_COMPRESSION") {
9f95a23c 3228 summary += " have broken BlueStore compression";
81eedcae 3229 } else if (asum.first == "BLUESTORE_LEGACY_STATFS") {
9f95a23c 3230 summary += " reporting legacy (not per-pool) BlueStore stats";
81eedcae 3231 } else if (asum.first == "BLUESTORE_DISK_SIZE_MISMATCH") {
9f95a23c
TL
3232 summary += " have dangerous mismatch between BlueStore block device and free list sizes";
3233 } else if (asum.first == "BLUESTORE_NO_PER_POOL_OMAP") {
3234 summary += " reporting legacy (not per-pool) BlueStore omap usage stats";
11fdf7f2 3235 }
9f95a23c 3236 auto& d = checks->add(asum.first, HEALTH_WARN, summary, asum.second.first);
11fdf7f2
TL
3237 for (auto& s : asum.second.second) {
3238 d.detail.push_back(s);
3239 }
3240 }
3241 }
224ce89b
WB
3242 // PG_NOT_SCRUBBED
3243 // PG_NOT_DEEP_SCRUBBED
11fdf7f2
TL
3244 if (cct->_conf->mon_warn_pg_not_scrubbed_ratio ||
3245 cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
a8e16298
TL
3246 list<string> detail, deep_detail;
3247 int detail_max = max, deep_detail_max = max;
3248 int detail_more = 0, deep_detail_more = 0;
3249 int detail_total = 0, deep_detail_total = 0;
3250 for (auto& p : pg_stat) {
3251 int64_t pnum = p.first.pool();
3252 auto pool = osdmap.get_pg_pool(pnum);
3253 if (!pool)
3254 continue;
11fdf7f2 3255 if (cct->_conf->mon_warn_pg_not_scrubbed_ratio) {
a8e16298
TL
3256 double scrub_max_interval = 0;
3257 pool->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
3258 if (scrub_max_interval <= 0) {
3259 scrub_max_interval = cct->_conf->osd_scrub_max_interval;
c07f9fc5 3260 }
11fdf7f2 3261 const double age = (cct->_conf->mon_warn_pg_not_scrubbed_ratio * scrub_max_interval) +
a8e16298
TL
3262 scrub_max_interval;
3263 utime_t cutoff = now;
3264 cutoff -= age;
3265 if (p.second.last_scrub_stamp < cutoff) {
3266 if (detail_max > 0) {
3267 ostringstream ss;
3268 ss << "pg " << p.first << " not scrubbed since "
3269 << p.second.last_scrub_stamp;
3270 detail.push_back(ss.str());
3271 --detail_max;
3272 } else {
3273 ++detail_more;
3274 }
3275 ++detail_total;
3276 }
3277 }
11fdf7f2 3278 if (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
a8e16298
TL
3279 double deep_scrub_interval = 0;
3280 pool->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
3281 if (deep_scrub_interval <= 0) {
3282 deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
3283 }
11fdf7f2 3284 double deep_age = (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio * deep_scrub_interval) +
a8e16298
TL
3285 deep_scrub_interval;
3286 utime_t deep_cutoff = now;
3287 deep_cutoff -= deep_age;
3288 if (p.second.last_deep_scrub_stamp < deep_cutoff) {
3289 if (deep_detail_max > 0) {
3290 ostringstream ss;
3291 ss << "pg " << p.first << " not deep-scrubbed since "
3292 << p.second.last_deep_scrub_stamp;
3293 deep_detail.push_back(ss.str());
3294 --deep_detail_max;
3295 } else {
3296 ++deep_detail_more;
3297 }
3298 ++deep_detail_total;
c07f9fc5 3299 }
224ce89b 3300 }
a8e16298
TL
3301 }
3302 if (detail_total) {
3303 ostringstream ss;
3304 ss << detail_total << " pgs not scrubbed in time";
9f95a23c 3305 auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str(), detail_total);
a8e16298 3306
c07f9fc5 3307 if (!detail.empty()) {
c07f9fc5 3308 d.detail.swap(detail);
a8e16298
TL
3309
3310 if (detail_more) {
3311 ostringstream ss;
3312 ss << detail_more << " more pgs... ";
3313 d.detail.push_back(ss.str());
3314 }
c07f9fc5 3315 }
a8e16298
TL
3316 }
3317 if (deep_detail_total) {
3318 ostringstream ss;
3319 ss << deep_detail_total << " pgs not deep-scrubbed in time";
9f95a23c
TL
3320 auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str(),
3321 deep_detail_total);
a8e16298 3322
c07f9fc5 3323 if (!deep_detail.empty()) {
c07f9fc5 3324 d.detail.swap(deep_detail);
a8e16298
TL
3325
3326 if (deep_detail_more) {
3327 ostringstream ss;
3328 ss << deep_detail_more << " more pgs... ";
3329 d.detail.push_back(ss.str());
3330 }
c07f9fc5
FG
3331 }
3332 }
3333 }
3334
3335 // POOL_APP
11fdf7f2 3336 if (g_conf().get_val<bool>("mon_warn_on_pool_no_app")) {
c07f9fc5
FG
3337 list<string> detail;
3338 for (auto &it : pools) {
3339 const pg_pool_t &pool = it.second;
3340 const string& pool_name = osdmap.get_pool_name(it.first);
3341 auto it2 = pg_pool_sum.find(it.first);
3342 if (it2 == pg_pool_sum.end()) {
3343 continue;
3344 }
3345 const pool_stat_t *pstat = &it2->second;
3346 if (pstat == nullptr) {
3347 continue;
3348 }
3349 const object_stat_sum_t& sum = pstat->stats.sum;
3350 // application metadata is not encoded until luminous is minimum
3351 // required release
11fdf7f2
TL
3352 if (sum.num_objects > 0 && pool.application_metadata.empty() &&
3353 !pool.is_tier()) {
c07f9fc5
FG
3354 stringstream ss;
3355 ss << "application not enabled on pool '" << pool_name << "'";
3356 detail.push_back(ss.str());
224ce89b
WB
3357 }
3358 }
3359 if (!detail.empty()) {
3360 ostringstream ss;
9f95a23c
TL
3361 ss << detail.size() << " pool(s) do not have an application enabled";
3362 auto& d = checks->add("POOL_APP_NOT_ENABLED", HEALTH_WARN, ss.str(),
3363 detail.size());
c07f9fc5
FG
3364 stringstream tip;
3365 tip << "use 'ceph osd pool application enable <pool-name> "
3366 << "<app-name>', where <app-name> is 'cephfs', 'rbd', 'rgw', "
3367 << "or freeform for custom applications.";
3368 detail.push_back(tip.str());
224ce89b
WB
3369 d.detail.swap(detail);
3370 }
31f18b77 3371 }
b32b8144
FG
3372
3373 // PG_SLOW_SNAP_TRIMMING
3374 if (!pg_stat.empty() && cct->_conf->mon_osd_snap_trim_queue_warn_on > 0) {
3375 uint32_t snapthreshold = cct->_conf->mon_osd_snap_trim_queue_warn_on;
3376 uint64_t snaptrimq_exceeded = 0;
3377 uint32_t longest_queue = 0;
3378 const pg_t* longest_q_pg = nullptr;
3379 list<string> detail;
3380
3381 for (auto& i: pg_stat) {
3382 uint32_t current_len = i.second.snaptrimq_len;
3383 if (current_len >= snapthreshold) {
3384 snaptrimq_exceeded++;
3385 if (longest_queue <= current_len) {
3386 longest_q_pg = &i.first;
3387 longest_queue = current_len;
3388 }
3389 if (detail.size() < max - 1) {
3390 stringstream ss;
3391 ss << "snap trim queue for pg " << i.first << " at " << current_len;
3392 detail.push_back(ss.str());
3393 continue;
3394 }
3395 if (detail.size() < max) {
3396 detail.push_back("...more pgs affected");
3397 continue;
3398 }
3399 }
3400 }
3401
3402 if (snaptrimq_exceeded) {
3403 {
3404 ostringstream ss;
3405 ss << "longest queue on pg " << *longest_q_pg << " at " << longest_queue;
3406 detail.push_back(ss.str());
3407 }
3408
3409 stringstream ss;
3410 ss << "snap trim queue for " << snaptrimq_exceeded << " pg(s) >= " << snapthreshold << " (mon_osd_snap_trim_queue_warn_on)";
9f95a23c
TL
3411 auto& d = checks->add("PG_SLOW_SNAP_TRIMMING", HEALTH_WARN, ss.str(),
3412 snaptrimq_exceeded);
b32b8144
FG
3413 detail.push_back("try decreasing \"osd snap trim sleep\" and/or increasing \"osd pg max concurrent snap trims\".");
3414 d.detail.swap(detail);
3415 }
3416 }
31f18b77 3417}
7c673cae 3418
9f95a23c
TL
3419void PGMap::print_summary(ceph::Formatter *f, ostream *out) const
3420{
3421 if (f) {
3422 f->open_array_section("pgs_by_pool_state");
3423 for (auto& i: num_pg_by_pool_state) {
3424 f->open_object_section("per_pool_pgs_by_state");
3425 f->dump_int("pool_id", i.first);
3426 f->open_array_section("pg_state_counts");
3427 for (auto& j : i.second) {
3428 f->open_object_section("pg_state_count");
3429 f->dump_string("state_name", pg_state_string(j.first));
3430 f->dump_int("count", j.second);
3431 f->close_section();
3432 }
3433 f->close_section();
3434 f->close_section();
3435 }
3436 f->close_section();
3437 }
3438 PGMapDigest::print_summary(f, out);
3439}
3440
7c673cae
FG
3441int process_pg_map_command(
3442 const string& orig_prefix,
11fdf7f2 3443 const cmdmap_t& orig_cmdmap,
7c673cae
FG
3444 const PGMap& pg_map,
3445 const OSDMap& osdmap,
9f95a23c 3446 ceph::Formatter *f,
7c673cae
FG
3447 stringstream *ss,
3448 bufferlist *odata)
3449{
3450 string prefix = orig_prefix;
11fdf7f2
TL
3451 auto cmdmap = orig_cmdmap;
3452
3453 string omap_stats_note =
3454 "\n* NOTE: Omap statistics are gathered during deep scrub and "
9f95a23c 3455 "may be inaccurate soon afterwards depending on utilization. See "
11fdf7f2
TL
3456 "http://docs.ceph.com/docs/master/dev/placement-group/#omap-statistics "
3457 "for further details.\n";
3458 bool omap_stats_note_required = false;
7c673cae
FG
3459
3460 // perhaps these would be better in the parsing, but it's weird
3461 bool primary = false;
3462 if (prefix == "pg dump_json") {
3463 vector<string> v;
3464 v.push_back(string("all"));
3465 cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
3466 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3467 prefix = "pg dump";
3468 } else if (prefix == "pg dump_pools_json") {
3469 vector<string> v;
3470 v.push_back(string("pools"));
3471 cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
3472 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3473 prefix = "pg dump";
3474 } else if (prefix == "pg ls-by-primary") {
3475 primary = true;
3476 prefix = "pg ls";
3477 } else if (prefix == "pg ls-by-osd") {
3478 prefix = "pg ls";
3479 } else if (prefix == "pg ls-by-pool") {
3480 prefix = "pg ls";
3481 string poolstr;
9f95a23c 3482 cmd_getval(cmdmap, "poolstr", poolstr);
7c673cae
FG
3483 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
3484 if (pool < 0) {
3485 *ss << "pool " << poolstr << " does not exist";
3486 return -ENOENT;
3487 }
3488 cmd_putval(g_ceph_context, cmdmap, "pool", pool);
3489 }
3490
7c673cae
FG
3491 stringstream ds;
3492 if (prefix == "pg stat") {
3493 if (f) {
3494 f->open_object_section("pg_summary");
3495 pg_map.print_oneline_summary(f, NULL);
3496 f->close_section();
3497 f->flush(ds);
3498 } else {
3499 ds << pg_map;
3500 }
3501 odata->append(ds);
3502 return 0;
3503 }
3504
3505 if (prefix == "pg getmap") {
3506 pg_map.encode(*odata);
3507 *ss << "got pgmap version " << pg_map.version;
3508 return 0;
3509 }
3510
3511 if (prefix == "pg dump") {
3512 string val;
3513 vector<string> dumpcontents;
3514 set<string> what;
9f95a23c 3515 if (cmd_getval(cmdmap, "dumpcontents", dumpcontents)) {
7c673cae
FG
3516 copy(dumpcontents.begin(), dumpcontents.end(),
3517 inserter(what, what.end()));
3518 }
3519 if (what.empty())
3520 what.insert("all");
3521 if (f) {
3522 if (what.count("all")) {
3523 f->open_object_section("pg_map");
3524 pg_map.dump(f);
3525 f->close_section();
3526 } else if (what.count("summary") || what.count("sum")) {
3527 f->open_object_section("pg_map");
3528 pg_map.dump_basic(f);
3529 f->close_section();
3530 } else {
3531 if (what.count("pools")) {
3532 pg_map.dump_pool_stats(f);
3533 }
3534 if (what.count("osds")) {
3535 pg_map.dump_osd_stats(f);
3536 }
3537 if (what.count("pgs")) {
3538 pg_map.dump_pg_stats(f, false);
3539 }
3540 if (what.count("pgs_brief")) {
3541 pg_map.dump_pg_stats(f, true);
3542 }
3543 if (what.count("delta")) {
3544 f->open_object_section("delta");
3545 pg_map.dump_delta(f);
3546 f->close_section();
3547 }
3548 }
3549 f->flush(*odata);
3550 } else {
3551 if (what.count("all")) {
3552 pg_map.dump(ds);
11fdf7f2 3553 omap_stats_note_required = true;
7c673cae
FG
3554 } else if (what.count("summary") || what.count("sum")) {
3555 pg_map.dump_basic(ds);
3556 pg_map.dump_pg_sum_stats(ds, true);
3557 pg_map.dump_osd_sum_stats(ds);
11fdf7f2 3558 omap_stats_note_required = true;
7c673cae
FG
3559 } else {
3560 if (what.count("pgs_brief")) {
3561 pg_map.dump_pg_stats(ds, true);
3562 }
3563 bool header = true;
3564 if (what.count("pgs")) {
3565 pg_map.dump_pg_stats(ds, false);
3566 header = false;
11fdf7f2 3567 omap_stats_note_required = true;
7c673cae
FG
3568 }
3569 if (what.count("pools")) {
3570 pg_map.dump_pool_stats(ds, header);
11fdf7f2 3571 omap_stats_note_required = true;
7c673cae
FG
3572 }
3573 if (what.count("osds")) {
3574 pg_map.dump_osd_stats(ds);
3575 }
3576 }
3577 odata->append(ds);
11fdf7f2
TL
3578 if (omap_stats_note_required) {
3579 odata->append(omap_stats_note);
3580 }
7c673cae
FG
3581 }
3582 *ss << "dumped " << what;
3583 return 0;
3584 }
3585
3586 if (prefix == "pg ls") {
3587 int64_t osd = -1;
3588 int64_t pool = -1;
3589 vector<string>states;
3590 set<pg_t> pgs;
9f95a23c
TL
3591 cmd_getval(cmdmap, "pool", pool);
3592 cmd_getval(cmdmap, "osd", osd);
3593 cmd_getval(cmdmap, "states", states);
7c673cae
FG
3594 if (pool >= 0 && !osdmap.have_pg_pool(pool)) {
3595 *ss << "pool " << pool << " does not exist";
3596 return -ENOENT;
3597 }
3598 if (osd >= 0 && !osdmap.is_up(osd)) {
3599 *ss << "osd " << osd << " is not up";
3600 return -EAGAIN;
3601 }
3602 if (states.empty())
3603 states.push_back("all");
3604
11fdf7f2 3605 uint64_t state = 0;
7c673cae
FG
3606
3607 while (!states.empty()) {
3608 string state_str = states.back();
3609
3610 if (state_str == "all") {
3611 state = -1;
3612 break;
3613 } else {
3efd9988
FG
3614 auto filter = pg_string_state(state_str);
3615 if (!filter) {
c07f9fc5
FG
3616 *ss << "'" << state_str << "' is not a valid pg state,"
3617 << " available choices: " << pg_state_string(0xFFFFFFFF);
3618 return -EINVAL;
3619 }
3efd9988 3620 state |= *filter;
7c673cae
FG
3621 }
3622
3623 states.pop_back();
3624 }
3625
3626 pg_map.get_filtered_pg_stats(state, pool, osd, primary, pgs);
3627
3628 if (f && !pgs.empty()) {
3629 pg_map.dump_filtered_pg_stats(f, pgs);
3630 f->flush(*odata);
3631 } else if (!pgs.empty()) {
3632 pg_map.dump_filtered_pg_stats(ds, pgs);
3633 odata->append(ds);
11fdf7f2 3634 odata->append(omap_stats_note);
7c673cae
FG
3635 }
3636 return 0;
3637 }
3638
3639 if (prefix == "pg dump_stuck") {
3640 vector<string> stuckop_vec;
9f95a23c 3641 cmd_getval(cmdmap, "stuckops", stuckop_vec);
7c673cae
FG
3642 if (stuckop_vec.empty())
3643 stuckop_vec.push_back("unclean");
3644 int64_t threshold;
9f95a23c 3645 cmd_getval(cmdmap, "threshold", threshold,
11fdf7f2 3646 g_conf().get_val<int64_t>("mon_pg_stuck_threshold"));
7c673cae 3647
11fdf7f2 3648 if (pg_map.dump_stuck_pg_stats(ds, f, (int)threshold, stuckop_vec) < 0) {
7c673cae 3649 *ss << "failed";
11fdf7f2 3650 } else {
7c673cae 3651 *ss << "ok";
11fdf7f2
TL
3652 }
3653 odata->append(ds);
7c673cae
FG
3654 return 0;
3655 }
3656
3657 if (prefix == "pg debug") {
3658 string debugop;
9f95a23c 3659 cmd_getval(cmdmap, "debugop", debugop,
7c673cae
FG
3660 string("unfound_objects_exist"));
3661 if (debugop == "unfound_objects_exist") {
3662 bool unfound_objects_exist = false;
3663 for (const auto& p : pg_map.pg_stat) {
3664 if (p.second.stats.sum.num_objects_unfound > 0) {
3665 unfound_objects_exist = true;
3666 break;
3667 }
3668 }
3669 if (unfound_objects_exist)
3670 ds << "TRUE";
3671 else
3672 ds << "FALSE";
3673 odata->append(ds);
3674 return 0;
3675 }
3676 if (debugop == "degraded_pgs_exist") {
3677 bool degraded_pgs_exist = false;
3678 for (const auto& p : pg_map.pg_stat) {
3679 if (p.second.stats.sum.num_objects_degraded > 0) {
3680 degraded_pgs_exist = true;
3681 break;
3682 }
3683 }
3684 if (degraded_pgs_exist)
3685 ds << "TRUE";
3686 else
3687 ds << "FALSE";
3688 odata->append(ds);
3689 return 0;
3690 }
3691 }
3692
3693 if (prefix == "osd perf") {
3694 if (f) {
3695 f->open_object_section("osdstats");
3696 pg_map.dump_osd_perf_stats(f);
3697 f->close_section();
3698 f->flush(ds);
3699 } else {
3700 pg_map.print_osd_perf_stats(&ds);
3701 }
3702 odata->append(ds);
3703 return 0;
3704 }
3705
3706 if (prefix == "osd blocked-by") {
3707 if (f) {
3708 f->open_object_section("osd_blocked_by");
3709 pg_map.dump_osd_blocked_by_stats(f);
3710 f->close_section();
3711 f->flush(ds);
3712 } else {
3713 pg_map.print_osd_blocked_by_stats(&ds);
3714 }
3715 odata->append(ds);
3716 return 0;
3717 }
3718
7c673cae
FG
3719 return -EOPNOTSUPP;
3720}
3721
31f18b77
FG
3722void PGMapUpdater::check_osd_map(
3723 CephContext *cct,
3724 const OSDMap& osdmap,
3725 const PGMap& pgmap,
3726 PGMap::Incremental *pending_inc)
3727{
3728 for (auto& p : pgmap.osd_stat) {
3729 if (!osdmap.exists(p.first)) {
3730 // remove osd_stat
3731 pending_inc->rm_stat(p.first);
3732 } else if (osdmap.is_out(p.first)) {
3733 // zero osd_stat
11fdf7f2
TL
3734 if (p.second.statfs.total != 0) {
3735 pending_inc->stat_osd_out(p.first);
31f18b77
FG
3736 }
3737 } else if (!osdmap.is_up(p.first)) {
3738 // zero the op_queue_age_hist
3739 if (!p.second.op_queue_age_hist.empty()) {
11fdf7f2 3740 pending_inc->stat_osd_down_up(p.first, pgmap);
31f18b77
FG
3741 }
3742 }
3743 }
3744
3745 // deleted pgs (pools)?
3746 for (auto& p : pgmap.pg_pool_sum) {
3747 if (!osdmap.have_pg_pool(p.first)) {
3748 ldout(cct, 10) << __func__ << " pool " << p.first << " gone, removing pgs"
3749 << dendl;
3750 for (auto& q : pgmap.pg_stat) {
11fdf7f2 3751 if (q.first.pool() == p.first) {
31f18b77
FG
3752 pending_inc->pg_remove.insert(q.first);
3753 }
3754 }
3755 auto q = pending_inc->pg_stat_updates.begin();
3756 while (q != pending_inc->pg_stat_updates.end()) {
11fdf7f2 3757 if (q->first.pool() == p.first) {
31f18b77
FG
3758 q = pending_inc->pg_stat_updates.erase(q);
3759 } else {
3760 ++q;
3761 }
3762 }
3763 }
3764 }
3765
11fdf7f2
TL
3766 // new (split or new pool) or merged pgs?
3767 map<int64_t,unsigned> new_pg_num;
31f18b77
FG
3768 for (auto& p : osdmap.get_pools()) {
3769 int64_t poolid = p.first;
3770 const pg_pool_t& pi = p.second;
3771 auto q = pgmap.num_pg_by_pool.find(poolid);
3772 unsigned my_pg_num = 0;
3773 if (q != pgmap.num_pg_by_pool.end())
3774 my_pg_num = q->second;
3775 unsigned pg_num = pi.get_pg_num();
11fdf7f2
TL
3776 new_pg_num[poolid] = pg_num;
3777 if (my_pg_num < pg_num) {
224ce89b 3778 ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
11fdf7f2 3779 << " > my pg_num " << my_pg_num << dendl;
31f18b77
FG
3780 for (unsigned ps = my_pg_num; ps < pg_num; ++ps) {
3781 pg_t pgid(ps, poolid);
3782 if (pending_inc->pg_stat_updates.count(pgid) == 0) {
224ce89b 3783 ldout(cct,20) << __func__ << " adding " << pgid << dendl;
31f18b77
FG
3784 pg_stat_t &stats = pending_inc->pg_stat_updates[pgid];
3785 stats.last_fresh = osdmap.get_modified();
3786 stats.last_active = osdmap.get_modified();
3787 stats.last_change = osdmap.get_modified();
3788 stats.last_peered = osdmap.get_modified();
3789 stats.last_clean = osdmap.get_modified();
3790 stats.last_unstale = osdmap.get_modified();
3791 stats.last_undegraded = osdmap.get_modified();
3792 stats.last_fullsized = osdmap.get_modified();
3793 stats.last_scrub_stamp = osdmap.get_modified();
3794 stats.last_deep_scrub_stamp = osdmap.get_modified();
3795 stats.last_clean_scrub_stamp = osdmap.get_modified();
3796 }
3797 }
11fdf7f2
TL
3798 } else if (my_pg_num > pg_num) {
3799 ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
3800 << " < my pg_num " << my_pg_num << dendl;
3801 for (unsigned i = pg_num; i < my_pg_num; ++i) {
3802 pg_t pgid(i, poolid);
3803 ldout(cct,20) << __func__ << " removing merged " << pgid << dendl;
3804 if (pgmap.pg_stat.count(pgid)) {
3805 pending_inc->pg_remove.insert(pgid);
3806 }
3807 pending_inc->pg_stat_updates.erase(pgid);
7c673cae 3808 }
7c673cae
FG
3809 }
3810 }
11fdf7f2
TL
3811 auto i = pending_inc->pg_stat_updates.begin();
3812 while (i != pending_inc->pg_stat_updates.end()) {
3813 auto j = new_pg_num.find(i->first.pool());
3814 if (j == new_pg_num.end() ||
3815 i->first.ps() >= j->second) {
3816 ldout(cct,20) << __func__ << " removing pending update to old "
3817 << i->first << dendl;
3818 i = pending_inc->pg_stat_updates.erase(i);
3819 } else {
3820 ++i;
7c673cae
FG
3821 }
3822 }
7c673cae
FG
3823}
3824
3825static void _try_mark_pg_stale(
3826 const OSDMap& osdmap,
3827 pg_t pgid,
3828 const pg_stat_t& cur,
3829 PGMap::Incremental *pending_inc)
3830{
3831 if ((cur.state & PG_STATE_STALE) == 0 &&
3832 cur.acting_primary != -1 &&
3833 osdmap.is_down(cur.acting_primary)) {
3834 pg_stat_t *newstat;
3835 auto q = pending_inc->pg_stat_updates.find(pgid);
3836 if (q != pending_inc->pg_stat_updates.end()) {
3837 if ((q->second.acting_primary == cur.acting_primary) ||
3838 ((q->second.state & PG_STATE_STALE) == 0 &&
3839 q->second.acting_primary != -1 &&
3840 osdmap.is_down(q->second.acting_primary))) {
3841 newstat = &q->second;
3842 } else {
3843 // pending update is no longer down or already stale
3844 return;
3845 }
3846 } else {
3847 newstat = &pending_inc->pg_stat_updates[pgid];
3848 *newstat = cur;
3849 }
3850 dout(10) << __func__ << " marking pg " << pgid
3851 << " stale (acting_primary " << newstat->acting_primary
3852 << ")" << dendl;
3853 newstat->state |= PG_STATE_STALE;
3854 newstat->last_unstale = ceph_clock_now();
3855 }
3856}
3857
3858void PGMapUpdater::check_down_pgs(
3859 const OSDMap &osdmap,
3860 const PGMap &pg_map,
3861 bool check_all,
3862 const set<int>& need_check_down_pg_osds,
3863 PGMap::Incremental *pending_inc)
3864{
3865 // if a large number of osds changed state, just iterate over the whole
3866 // pg map.
3867 if (need_check_down_pg_osds.size() > (unsigned)osdmap.get_num_osds() *
11fdf7f2 3868 g_conf().get_val<double>("mon_pg_check_down_all_threshold")) {
7c673cae
FG
3869 check_all = true;
3870 }
3871
3872 if (check_all) {
3873 for (const auto& p : pg_map.pg_stat) {
3874 _try_mark_pg_stale(osdmap, p.first, p.second, pending_inc);
3875 }
3876 } else {
3877 for (auto osd : need_check_down_pg_osds) {
3878 if (osdmap.is_down(osd)) {
3879 auto p = pg_map.pg_by_osd.find(osd);
3880 if (p == pg_map.pg_by_osd.end()) {
3881 continue;
3882 }
3883 for (auto pgid : p->second) {
3884 const pg_stat_t &stat = pg_map.pg_stat.at(pgid);
11fdf7f2 3885 ceph_assert(stat.acting_primary == osd);
7c673cae
FG
3886 _try_mark_pg_stale(osdmap, pgid, stat, pending_inc);
3887 }
3888 }
3889 }
3890 }
3891}
3892
3893int reweight::by_utilization(
3894 const OSDMap &osdmap,
3895 const PGMap &pgm,
3896 int oload,
3897 double max_changef,
3898 int max_osds,
3899 bool by_pg, const set<int64_t> *pools,
3900 bool no_increasing,
3901 mempool::osdmap::map<int32_t, uint32_t>* new_weights,
3902 std::stringstream *ss,
3903 std::string *out_str,
9f95a23c 3904 ceph::Formatter *f)
7c673cae
FG
3905{
3906 if (oload <= 100) {
3907 *ss << "You must give a percentage higher than 100. "
3908 "The reweighting threshold will be calculated as <average-utilization> "
3909 "times <input-percentage>. For example, an argument of 200 would "
3910 "reweight OSDs which are twice as utilized as the average OSD.\n";
3911 return -EINVAL;
3912 }
3913
3914 vector<int> pgs_by_osd(osdmap.get_max_osd());
3915
3916 // Avoid putting a small number (or 0) in the denominator when calculating
3917 // average_util
3918 double average_util;
3919 if (by_pg) {
3920 // by pg mapping
3921 double weight_sum = 0.0; // sum up the crush weights
3922 unsigned num_pg_copies = 0;
3923 int num_osds = 0;
3924 for (const auto& pg : pgm.pg_stat) {
3925 if (pools && pools->count(pg.first.pool()) == 0)
3926 continue;
3927 for (const auto acting : pg.second.acting) {
b5b8bbf5
FG
3928 if (!osdmap.exists(acting)) {
3929 continue;
3930 }
7c673cae
FG
3931 if (acting >= (int)pgs_by_osd.size())
3932 pgs_by_osd.resize(acting);
3933 if (pgs_by_osd[acting] == 0) {
3934 if (osdmap.crush->get_item_weightf(acting) <= 0) {
3935 //skip if we currently can not identify item
3936 continue;
3937 }
3938 weight_sum += osdmap.crush->get_item_weightf(acting);
3939 ++num_osds;
3940 }
3941 ++pgs_by_osd[acting];
3942 ++num_pg_copies;
3943 }
3944 }
3945
11fdf7f2 3946 if (!num_osds || (num_pg_copies / num_osds < g_conf()->mon_reweight_min_pgs_per_osd)) {
7c673cae
FG
3947 *ss << "Refusing to reweight: we only have " << num_pg_copies
3948 << " PGs across " << num_osds << " osds!\n";
3949 return -EDOM;
3950 }
3951
3952 average_util = (double)num_pg_copies / weight_sum;
3953 } else {
3954 // by osd utilization
11fdf7f2
TL
3955 int num_osd = std::max<size_t>(1, pgm.osd_stat.size());
3956 if ((uint64_t)pgm.osd_sum.statfs.total / num_osd
3957 < g_conf()->mon_reweight_min_bytes_per_osd) {
3958 *ss << "Refusing to reweight: we only have " << pgm.osd_sum.statfs.kb()
7c673cae
FG
3959 << " kb across all osds!\n";
3960 return -EDOM;
3961 }
11fdf7f2
TL
3962 if ((uint64_t)pgm.osd_sum.statfs.get_used_raw() / num_osd
3963 < g_conf()->mon_reweight_min_bytes_per_osd) {
3964 *ss << "Refusing to reweight: we only have "
3965 << pgm.osd_sum.statfs.kb_used_raw()
7c673cae
FG
3966 << " kb used across all osds!\n";
3967 return -EDOM;
3968 }
3969
11fdf7f2
TL
3970 average_util = (double)pgm.osd_sum.statfs.get_used_raw() /
3971 (double)pgm.osd_sum.statfs.total;
7c673cae
FG
3972 }
3973
3974 // adjust down only if we are above the threshold
3975 const double overload_util = average_util * (double)oload / 100.0;
3976
3977 // but aggressively adjust weights up whenever possible.
3978 const double underload_util = average_util;
3979
3980 const unsigned max_change = (unsigned)(max_changef * (double)0x10000);
3981
3982 ostringstream oss;
3983 if (f) {
3984 f->open_object_section("reweight_by_utilization");
3985 f->dump_int("overload_min", oload);
3986 f->dump_float("max_change", max_changef);
3987 f->dump_int("max_change_osds", max_osds);
3988 f->dump_float("average_utilization", average_util);
3989 f->dump_float("overload_utilization", overload_util);
3990 } else {
3991 oss << "oload " << oload << "\n";
3992 oss << "max_change " << max_changef << "\n";
3993 oss << "max_change_osds " << max_osds << "\n";
3994 oss.precision(4);
3995 oss << "average_utilization " << std::fixed << average_util << "\n";
3996 oss << "overload_utilization " << overload_util << "\n";
3997 }
3998 int num_changed = 0;
3999
4000 // precompute util for each OSD
4001 std::vector<std::pair<int, float> > util_by_osd;
4002 for (const auto& p : pgm.osd_stat) {
4003 std::pair<int, float> osd_util;
4004 osd_util.first = p.first;
4005 if (by_pg) {
4006 if (p.first >= (int)pgs_by_osd.size() ||
4007 pgs_by_osd[p.first] == 0) {
4008 // skip if this OSD does not contain any pg
4009 // belonging to the specified pool(s).
4010 continue;
4011 }
4012
4013 if (osdmap.crush->get_item_weightf(p.first) <= 0) {
4014 // skip if we are unable to locate item.
4015 continue;
4016 }
4017
11fdf7f2
TL
4018 osd_util.second =
4019 pgs_by_osd[p.first] / osdmap.crush->get_item_weightf(p.first);
7c673cae 4020 } else {
11fdf7f2
TL
4021 osd_util.second =
4022 (double)p.second.statfs.get_used_raw() / (double)p.second.statfs.total;
7c673cae
FG
4023 }
4024 util_by_osd.push_back(osd_util);
4025 }
4026
4027 // sort by absolute deviation from the mean utilization,
4028 // in descending order.
4029 std::sort(util_by_osd.begin(), util_by_osd.end(),
4030 [average_util](std::pair<int, float> l, std::pair<int, float> r) {
4031 return abs(l.second - average_util) > abs(r.second - average_util);
4032 }
4033 );
4034
4035 if (f)
4036 f->open_array_section("reweights");
4037
4038 for (const auto& p : util_by_osd) {
4039 unsigned weight = osdmap.get_weight(p.first);
4040 if (weight == 0) {
4041 // skip if OSD is currently out
4042 continue;
4043 }
4044 float util = p.second;
4045
4046 if (util >= overload_util) {
4047 // Assign a lower weight to overloaded OSDs. The current weight
4048 // is a factor to take into account the original weights,
4049 // to represent e.g. differing storage capacities
4050 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
4051 if (weight > max_change)
11fdf7f2 4052 new_weight = std::max(new_weight, weight - max_change);
7c673cae
FG
4053 new_weights->insert({p.first, new_weight});
4054 if (f) {
4055 f->open_object_section("osd");
4056 f->dump_int("osd", p.first);
4057 f->dump_float("weight", (float)weight / (float)0x10000);
4058 f->dump_float("new_weight", (float)new_weight / (float)0x10000);
4059 f->close_section();
4060 } else {
4061 oss << "osd." << p.first << " weight "
4062 << (float)weight / (float)0x10000 << " -> "
4063 << (float)new_weight / (float)0x10000 << "\n";
4064 }
4065 if (++num_changed >= max_osds)
4066 break;
4067 }
4068 if (!no_increasing && util <= underload_util) {
4069 // assign a higher weight.. if we can.
4070 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
11fdf7f2 4071 new_weight = std::min(new_weight, weight + max_change);
7c673cae
FG
4072 if (new_weight > 0x10000)
4073 new_weight = 0x10000;
4074 if (new_weight > weight) {
4075 new_weights->insert({p.first, new_weight});
4076 oss << "osd." << p.first << " weight "
4077 << (float)weight / (float)0x10000 << " -> "
4078 << (float)new_weight / (float)0x10000 << "\n";
4079 if (++num_changed >= max_osds)
4080 break;
4081 }
4082 }
4083 }
4084 if (f) {
4085 f->close_section();
4086 }
4087
4088 OSDMap newmap;
4089 newmap.deepish_copy_from(osdmap);
4090 OSDMap::Incremental newinc;
4091 newinc.fsid = newmap.get_fsid();
4092 newinc.epoch = newmap.get_epoch() + 1;
4093 newinc.new_weight = *new_weights;
4094 newmap.apply_incremental(newinc);
4095
4096 osdmap.summarize_mapping_stats(&newmap, pools, out_str, f);
4097
4098 if (f) {
4099 f->close_section();
4100 } else {
4101 *out_str += "\n";
4102 *out_str += oss.str();
4103 }
4104 return num_changed;
4105}