]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/PGMap.cc
4d5950fc48a0851e0b6e9851b429d28e33f4a688
[ceph.git] / ceph / src / mon / PGMap.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include <boost/algorithm/string.hpp>
5
6 #include "PGMap.h"
7
8 #define dout_subsys ceph_subsys_mon
9 #include "common/debug.h"
10 #include "common/Clock.h"
11 #include "common/Formatter.h"
12 #include "global/global_context.h"
13 #include "include/ceph_features.h"
14 #include "include/stringify.h"
15
16 #include "osd/osd_types.h"
17 #include "osd/OSDMap.h"
18 #include <boost/range/adaptor/reversed.hpp>
19
20 #define dout_context g_ceph_context
21
22 using std::list;
23 using std::make_pair;
24 using std::map;
25 using std::pair;
26 using std::ostream;
27 using std::ostringstream;
28 using std::set;
29 using std::string;
30 using std::stringstream;
31 using std::vector;
32
33 using ceph::bufferlist;
34 using TOPNSPC::common::cmd_getval;
35
36 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMapDigest, pgmap_digest, pgmap);
37 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap, pgmap, pgmap);
38 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap::Incremental, pgmap_inc, pgmap);
39
40
41 // ---------------------
42 // PGMapDigest
43
// Serialize this digest into bl.
//
// The struct version is picked from the peer's feature bits: v1 when
// SERVER_MIMIC is missing, v3 when SERVER_NAUTILUS is missing, v4
// otherwise.  Fields are appended in a fixed order; PGMapDigest::decode()
// reads them back in the same sequence, so the order must not change.
void PGMapDigest::encode(bufferlist& bl, uint64_t features) const
{
  // NOTE: see PGMap::encode_digest
  uint8_t v = 4;
  if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
    v = 1;
  } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
    v = 3;
  }
  ENCODE_START(v, 1, bl);
  encode(num_pg, bl);
  encode(num_pg_active, bl);
  encode(num_pg_unknown, bl);
  encode(num_osd, bl);
  encode(pg_pool_sum, bl, features);
  encode(pg_sum, bl, features);
  encode(osd_sum, bl, features);
  if (v >= 2) {
    encode(num_pg_by_state, bl);
  } else {
    // v1 layout: an element count followed by (state cast to int32, count)
    // pairs, written by hand instead of through the container encoder.
    uint32_t n = num_pg_by_state.size();
    encode(n, bl);
    for (auto p : num_pg_by_state) {
      encode((int32_t)p.first, bl);
      encode(p.second, bl);
    }
  }
  encode(num_pg_by_osd, bl);
  encode(num_pg_by_pool, bl);
  encode(osd_last_seq, bl);
  encode(per_pool_sum_delta, bl, features);
  encode(per_pool_sum_deltas_stamps, bl);
  encode(pg_sum_delta, bl, features);
  encode(stamp_delta, bl);
  encode(avail_space_by_rule, bl);
  // NOTE(review): struct_v is not declared in this function; presumably
  // ENCODE_START introduces it with the value of v -- confirm.
  if (struct_v >= 3) {
    encode(purged_snaps, bl);
  }
  if (struct_v >= 4) {
    encode(osd_sum_by_class, bl, features);
  }
  ENCODE_FINISH(bl);
}
87
// Deserialize a digest from p, accepting struct versions up to 4.
//
// Fields are read in the same order PGMapDigest::encode() writes them;
// fields added in later versions are only read when struct_v says they
// are present.
void PGMapDigest::decode(bufferlist::const_iterator& p)
{
  DECODE_START(4, p);
  decode(num_pg, p);
  decode(num_pg_active, p);
  decode(num_pg_unknown, p);
  decode(num_osd, p);
  decode(pg_pool_sum, p);
  decode(pg_sum, p);
  decode(osd_sum, p);
  if (struct_v >= 2) {
    decode(num_pg_by_state, p);
  } else {
    // version 1 carried the per-state counts keyed by a 32-bit state;
    // read that form and copy it into the current container.
    map<int32_t, int32_t> nps;
    decode(nps, p);
    num_pg_by_state.clear();
    for (auto i : nps) {
      num_pg_by_state[i.first] = i.second;
    }
  }
  decode(num_pg_by_osd, p);
  decode(num_pg_by_pool, p);
  decode(osd_last_seq, p);
  decode(per_pool_sum_delta, p);
  decode(per_pool_sum_deltas_stamps, p);
  decode(pg_sum_delta, p);
  decode(stamp_delta, p);
  decode(avail_space_by_rule, p);
  if (struct_v >= 3) {
    decode(purged_snaps, p);
  }
  if (struct_v >= 4) {
    decode(osd_sum_by_class, p);
  }
  DECODE_FINISH(p);
}
124
125 void PGMapDigest::dump(ceph::Formatter *f) const
126 {
127 f->dump_unsigned("num_pg", num_pg);
128 f->dump_unsigned("num_pg_active", num_pg_active);
129 f->dump_unsigned("num_pg_unknown", num_pg_unknown);
130 f->dump_unsigned("num_osd", num_osd);
131 f->dump_object("pool_sum", pg_sum);
132 f->dump_object("osd_sum", osd_sum);
133
134 f->open_object_section("osd_sum_by_class");
135 for (auto& i : osd_sum_by_class) {
136 f->dump_object(i.first.c_str(), i.second);
137 }
138 f->close_section();
139
140 f->open_array_section("pool_stats");
141 for (auto& p : pg_pool_sum) {
142 f->open_object_section("pool_stat");
143 f->dump_int("poolid", p.first);
144 auto q = num_pg_by_pool.find(p.first);
145 if (q != num_pg_by_pool.end())
146 f->dump_unsigned("num_pg", q->second);
147 p.second.dump(f);
148 f->close_section();
149 }
150 f->close_section();
151 f->open_array_section("osd_stats");
152 int i = 0;
153 // TODO: this isn't really correct since we can dump non-existent OSDs
154 // I dunno what osd_last_seq is set to in that case...
155 for (auto& p : osd_last_seq) {
156 f->open_object_section("osd_stat");
157 f->dump_int("osd", i);
158 f->dump_unsigned("seq", p);
159 f->close_section();
160 ++i;
161 }
162 f->close_section();
163 f->open_array_section("num_pg_by_state");
164 for (auto& p : num_pg_by_state) {
165 f->open_object_section("count");
166 f->dump_string("state", pg_state_string(p.first));
167 f->dump_unsigned("num", p.second);
168 f->close_section();
169 }
170 f->close_section();
171 f->open_array_section("num_pg_by_osd");
172 for (auto& p : num_pg_by_osd) {
173 f->open_object_section("count");
174 f->dump_unsigned("osd", p.first);
175 f->dump_unsigned("num_primary_pg", p.second.primary);
176 f->dump_unsigned("num_acting_pg", p.second.acting);
177 f->dump_unsigned("num_up_not_acting_pg", p.second.up_not_acting);
178 f->close_section();
179 }
180 f->close_section();
181 f->open_array_section("purged_snaps");
182 for (auto& j : purged_snaps) {
183 f->open_object_section("pool");
184 f->dump_int("pool", j.first);
185 f->open_object_section("purged_snaps");
186 for (auto i = j.second.begin(); i != j.second.end(); ++i) {
187 f->open_object_section("interval");
188 f->dump_stream("start") << i.get_start();
189 f->dump_stream("length") << i.get_len();
190 f->close_section();
191 }
192 f->close_section();
193 f->close_section();
194 }
195 f->close_section();
196 }
197
// Append a single default-constructed PGMapDigest to ls.  The instance is
// heap-allocated here and handed to the list as a raw pointer.
void PGMapDigest::generate_test_instances(list<PGMapDigest*>& ls)
{
  ls.push_back(new PGMapDigest);
}
202
// Format a percentage for table output.  Anything below 0.01 collapses
// to a bare "0"; everything else is printed in fixed notation with two
// decimal places.
inline std::string percentify(const float& a) {
  if (a < 0.01) {
    return "0";
  }
  std::ostringstream formatted;
  formatted << std::fixed << std::setprecision(2) << a;
  return formatted.str();
}
211
212 void PGMapDigest::print_summary(ceph::Formatter *f, ostream *out) const
213 {
214 if (f)
215 f->open_array_section("pgs_by_state");
216
217 // list is descending numeric order (by count)
218 std::multimap<int,uint64_t> state_by_count; // count -> state
219 for (auto p = num_pg_by_state.begin();
220 p != num_pg_by_state.end();
221 ++p) {
222 state_by_count.insert(make_pair(p->second, p->first));
223 }
224 if (f) {
225 for (auto p = state_by_count.rbegin();
226 p != state_by_count.rend();
227 ++p)
228 {
229 f->open_object_section("pgs_by_state_element");
230 f->dump_string("state_name", pg_state_string(p->second));
231 f->dump_unsigned("count", p->first);
232 f->close_section();
233 }
234 }
235 if (f)
236 f->close_section();
237
238 if (f) {
239 f->dump_unsigned("num_pgs", num_pg);
240 f->dump_unsigned("num_pools", pg_pool_sum.size());
241 f->dump_unsigned("num_objects", pg_sum.stats.sum.num_objects);
242 f->dump_unsigned("data_bytes", pg_sum.stats.sum.num_bytes);
243 f->dump_unsigned("bytes_used", osd_sum.statfs.get_used_raw());
244 f->dump_unsigned("bytes_avail", osd_sum.statfs.available);
245 f->dump_unsigned("bytes_total", osd_sum.statfs.total);
246 } else {
247 *out << " pools: " << pg_pool_sum.size() << " pools, "
248 << num_pg << " pgs\n";
249 *out << " objects: " << si_u_t(pg_sum.stats.sum.num_objects) << " objects, "
250 << byte_u_t(pg_sum.stats.sum.num_bytes) << "\n";
251 *out << " usage: "
252 << byte_u_t(osd_sum.statfs.get_used_raw()) << " used, "
253 << byte_u_t(osd_sum.statfs.available) << " / "
254 << byte_u_t(osd_sum.statfs.total) << " avail\n";
255 *out << " pgs: ";
256 }
257
258 bool pad = false;
259
260 if (num_pg_unknown > 0) {
261 float p = (float)num_pg_unknown / (float)num_pg;
262 if (f) {
263 f->dump_float("unknown_pgs_ratio", p);
264 } else {
265 char b[20];
266 snprintf(b, sizeof(b), "%.3lf", p * 100.0);
267 *out << b << "% pgs unknown\n";
268 pad = true;
269 }
270 }
271
272 int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
273 if (num_pg_inactive > 0) {
274 float p = (float)num_pg_inactive / (float)num_pg;
275 if (f) {
276 f->dump_float("inactive_pgs_ratio", p);
277 } else {
278 if (pad) {
279 *out << " ";
280 }
281 char b[20];
282 snprintf(b, sizeof(b), "%.3f", p * 100.0);
283 *out << b << "% pgs not active\n";
284 pad = true;
285 }
286 }
287
288 list<string> sl;
289 overall_recovery_summary(f, &sl);
290 if (!f && !sl.empty()) {
291 for (auto p = sl.begin(); p != sl.end(); ++p) {
292 if (pad) {
293 *out << " ";
294 }
295 *out << *p << "\n";
296 pad = true;
297 }
298 }
299 sl.clear();
300
301 if (!f) {
302 unsigned max_width = 1;
303 for (auto p = state_by_count.rbegin(); p != state_by_count.rend(); ++p)
304 {
305 std::stringstream ss;
306 ss << p->first;
307 max_width = std::max<size_t>(ss.str().size(), max_width);
308 }
309
310 for (auto p = state_by_count.rbegin(); p != state_by_count.rend(); ++p)
311 {
312 if (pad) {
313 *out << " ";
314 }
315 pad = true;
316 out->setf(std::ios::left);
317 *out << std::setw(max_width) << p->first
318 << " " << pg_state_string(p->second) << "\n";
319 out->unsetf(std::ios::left);
320 }
321 }
322
323 ostringstream ss_rec_io;
324 overall_recovery_rate_summary(f, &ss_rec_io);
325 ostringstream ss_client_io;
326 overall_client_io_rate_summary(f, &ss_client_io);
327 ostringstream ss_cache_io;
328 overall_cache_io_rate_summary(f, &ss_cache_io);
329
330 if (!f && (ss_client_io.str().length() || ss_rec_io.str().length()
331 || ss_cache_io.str().length())) {
332 *out << "\n \n";
333 *out << " io:\n";
334 }
335
336 if (!f && ss_client_io.str().length())
337 *out << " client: " << ss_client_io.str() << "\n";
338 if (!f && ss_rec_io.str().length())
339 *out << " recovery: " << ss_rec_io.str() << "\n";
340 if (!f && ss_cache_io.str().length())
341 *out << " cache: " << ss_cache_io.str() << "\n";
342 }
343
344 void PGMapDigest::print_oneline_summary(ceph::Formatter *f, ostream *out) const
345 {
346 std::stringstream ss;
347
348 if (f)
349 f->open_array_section("num_pg_by_state");
350 for (auto p = num_pg_by_state.begin();
351 p != num_pg_by_state.end();
352 ++p) {
353 if (f) {
354 f->open_object_section("state");
355 f->dump_string("name", pg_state_string(p->first));
356 f->dump_unsigned("num", p->second);
357 f->close_section();
358 }
359 if (p != num_pg_by_state.begin())
360 ss << ", ";
361 ss << p->second << " " << pg_state_string(p->first);
362 }
363 if (f)
364 f->close_section();
365
366 string states = ss.str();
367 if (out)
368 *out << num_pg << " pgs: "
369 << states << "; "
370 << byte_u_t(pg_sum.stats.sum.num_bytes) << " data, "
371 << byte_u_t(osd_sum.statfs.get_used()) << " used, "
372 << byte_u_t(osd_sum.statfs.available) << " / "
373 << byte_u_t(osd_sum.statfs.total) << " avail";
374 if (f) {
375 f->dump_unsigned("num_pgs", num_pg);
376 f->dump_unsigned("num_bytes", pg_sum.stats.sum.num_bytes);
377 f->dump_int("total_bytes", osd_sum.statfs.total);
378 f->dump_int("total_avail_bytes", osd_sum.statfs.available);
379 f->dump_int("total_used_bytes", osd_sum.statfs.get_used());
380 f->dump_int("total_used_raw_bytes", osd_sum.statfs.get_used_raw());
381 }
382
383 // make non-negative; we can get negative values if osds send
384 // uncommitted stats and then "go backward" or if they are just
385 // buggy/wrong.
386 pool_stat_t pos_delta = pg_sum_delta;
387 pos_delta.floor(0);
388 if (pos_delta.stats.sum.num_rd ||
389 pos_delta.stats.sum.num_wr) {
390 if (out)
391 *out << "; ";
392 if (pos_delta.stats.sum.num_rd) {
393 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)stamp_delta;
394 if (out)
395 *out << byte_u_t(rd) << "/s rd, ";
396 if (f)
397 f->dump_unsigned("read_bytes_sec", rd);
398 }
399 if (pos_delta.stats.sum.num_wr) {
400 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)stamp_delta;
401 if (out)
402 *out << byte_u_t(wr) << "/s wr, ";
403 if (f)
404 f->dump_unsigned("write_bytes_sec", wr);
405 }
406 int64_t iops = (pos_delta.stats.sum.num_rd + pos_delta.stats.sum.num_wr) / (double)stamp_delta;
407 if (out)
408 *out << si_u_t(iops) << " op/s";
409 if (f)
410 f->dump_unsigned("io_sec", iops);
411 }
412
413 list<string> sl;
414 overall_recovery_summary(f, &sl);
415 if (out)
416 for (auto p = sl.begin(); p != sl.end(); ++p)
417 *out << "; " << *p;
418 std::stringstream ssr;
419 overall_recovery_rate_summary(f, &ssr);
420 if (out && ssr.str().length())
421 *out << "; " << ssr.str() << " recovering";
422 }
423
424 void PGMapDigest::get_recovery_stats(
425 double *misplaced_ratio,
426 double *degraded_ratio,
427 double *inactive_pgs_ratio,
428 double *unknown_pgs_ratio) const
429 {
430 if (pg_sum.stats.sum.num_objects_degraded &&
431 pg_sum.stats.sum.num_object_copies > 0) {
432 *degraded_ratio = (double)pg_sum.stats.sum.num_objects_degraded /
433 (double)pg_sum.stats.sum.num_object_copies;
434 } else {
435 *degraded_ratio = 0;
436 }
437 if (pg_sum.stats.sum.num_objects_misplaced &&
438 pg_sum.stats.sum.num_object_copies > 0) {
439 *misplaced_ratio = (double)pg_sum.stats.sum.num_objects_misplaced /
440 (double)pg_sum.stats.sum.num_object_copies;
441 } else {
442 *misplaced_ratio = 0;
443 }
444 if (num_pg > 0) {
445 int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
446 *inactive_pgs_ratio = (double)num_pg_inactive / (double)num_pg;
447 *unknown_pgs_ratio = (double)num_pg_unknown / (double)num_pg;
448 } else {
449 *inactive_pgs_ratio = 0;
450 *unknown_pgs_ratio = 0;
451 }
452 }
453
454 void PGMapDigest::recovery_summary(ceph::Formatter *f, list<string> *psl,
455 const pool_stat_t& pool_sum) const
456 {
457 if (pool_sum.stats.sum.num_objects_degraded && pool_sum.stats.sum.num_object_copies > 0) {
458 double pc = (double)pool_sum.stats.sum.num_objects_degraded /
459 (double)pool_sum.stats.sum.num_object_copies * (double)100.0;
460 char b[20];
461 snprintf(b, sizeof(b), "%.3lf", pc);
462 if (f) {
463 f->dump_unsigned("degraded_objects", pool_sum.stats.sum.num_objects_degraded);
464 f->dump_unsigned("degraded_total", pool_sum.stats.sum.num_object_copies);
465 f->dump_float("degraded_ratio", pc / 100.0);
466 } else {
467 ostringstream ss;
468 ss << pool_sum.stats.sum.num_objects_degraded
469 << "/" << pool_sum.stats.sum.num_object_copies << " objects degraded (" << b << "%)";
470 psl->push_back(ss.str());
471 }
472 }
473 if (pool_sum.stats.sum.num_objects_misplaced && pool_sum.stats.sum.num_object_copies > 0) {
474 double pc = (double)pool_sum.stats.sum.num_objects_misplaced /
475 (double)pool_sum.stats.sum.num_object_copies * (double)100.0;
476 char b[20];
477 snprintf(b, sizeof(b), "%.3lf", pc);
478 if (f) {
479 f->dump_unsigned("misplaced_objects", pool_sum.stats.sum.num_objects_misplaced);
480 f->dump_unsigned("misplaced_total", pool_sum.stats.sum.num_object_copies);
481 f->dump_float("misplaced_ratio", pc / 100.0);
482 } else {
483 ostringstream ss;
484 ss << pool_sum.stats.sum.num_objects_misplaced
485 << "/" << pool_sum.stats.sum.num_object_copies << " objects misplaced (" << b << "%)";
486 psl->push_back(ss.str());
487 }
488 }
489 if (pool_sum.stats.sum.num_objects_unfound && pool_sum.stats.sum.num_objects) {
490 double pc = (double)pool_sum.stats.sum.num_objects_unfound /
491 (double)pool_sum.stats.sum.num_objects * (double)100.0;
492 char b[20];
493 snprintf(b, sizeof(b), "%.3lf", pc);
494 if (f) {
495 f->dump_unsigned("unfound_objects", pool_sum.stats.sum.num_objects_unfound);
496 f->dump_unsigned("unfound_total", pool_sum.stats.sum.num_objects);
497 f->dump_float("unfound_ratio", pc / 100.0);
498 } else {
499 ostringstream ss;
500 ss << pool_sum.stats.sum.num_objects_unfound
501 << "/" << pool_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
502 psl->push_back(ss.str());
503 }
504 }
505 }
506
507 void PGMapDigest::recovery_rate_summary(ceph::Formatter *f, ostream *out,
508 const pool_stat_t& delta_sum,
509 utime_t delta_stamp) const
510 {
511 // make non-negative; we can get negative values if osds send
512 // uncommitted stats and then "go backward" or if they are just
513 // buggy/wrong.
514 pool_stat_t pos_delta = delta_sum;
515 pos_delta.floor(0);
516 if (pos_delta.stats.sum.num_objects_recovered ||
517 pos_delta.stats.sum.num_bytes_recovered ||
518 pos_delta.stats.sum.num_keys_recovered) {
519 int64_t objps = pos_delta.stats.sum.num_objects_recovered / (double)delta_stamp;
520 int64_t bps = pos_delta.stats.sum.num_bytes_recovered / (double)delta_stamp;
521 int64_t kps = pos_delta.stats.sum.num_keys_recovered / (double)delta_stamp;
522 if (f) {
523 f->dump_int("recovering_objects_per_sec", objps);
524 f->dump_int("recovering_bytes_per_sec", bps);
525 f->dump_int("recovering_keys_per_sec", kps);
526 f->dump_int("num_objects_recovered", pos_delta.stats.sum.num_objects_recovered);
527 f->dump_int("num_bytes_recovered", pos_delta.stats.sum.num_bytes_recovered);
528 f->dump_int("num_keys_recovered", pos_delta.stats.sum.num_keys_recovered);
529 } else {
530 *out << byte_u_t(bps) << "/s";
531 if (pos_delta.stats.sum.num_keys_recovered)
532 *out << ", " << si_u_t(kps) << " keys/s";
533 *out << ", " << si_u_t(objps) << " objects/s";
534 }
535 }
536 }
537
// Cluster-wide recovery rate: forwards to recovery_rate_summary() with
// the digest's overall delta (pg_sum_delta) and its interval (stamp_delta).
void PGMapDigest::overall_recovery_rate_summary(ceph::Formatter *f, ostream *out) const
{
  recovery_rate_summary(f, out, pg_sum_delta, stamp_delta);
}
542
// Cluster-wide recovery figures: forwards to recovery_summary() with the
// digest's overall sum (pg_sum).
void PGMapDigest::overall_recovery_summary(ceph::Formatter *f, list<string> *psl) const
{
  recovery_summary(f, psl, pg_sum);
}
547
548 void PGMapDigest::pool_recovery_rate_summary(ceph::Formatter *f, ostream *out,
549 uint64_t poolid) const
550 {
551 auto p = per_pool_sum_delta.find(poolid);
552 if (p == per_pool_sum_delta.end())
553 return;
554
555 auto ts = per_pool_sum_deltas_stamps.find(p->first);
556 ceph_assert(ts != per_pool_sum_deltas_stamps.end());
557 recovery_rate_summary(f, out, p->second.first, ts->second);
558 }
559
560 void PGMapDigest::pool_recovery_summary(ceph::Formatter *f, list<string> *psl,
561 uint64_t poolid) const
562 {
563 auto p = pg_pool_sum.find(poolid);
564 if (p == pg_pool_sum.end())
565 return;
566
567 recovery_summary(f, psl, p->second);
568 }
569
570 void PGMapDigest::client_io_rate_summary(ceph::Formatter *f, ostream *out,
571 const pool_stat_t& delta_sum,
572 utime_t delta_stamp) const
573 {
574 pool_stat_t pos_delta = delta_sum;
575 pos_delta.floor(0);
576 if (pos_delta.stats.sum.num_rd ||
577 pos_delta.stats.sum.num_wr) {
578 if (pos_delta.stats.sum.num_rd) {
579 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)delta_stamp;
580 if (f) {
581 f->dump_int("read_bytes_sec", rd);
582 } else {
583 *out << byte_u_t(rd) << "/s rd, ";
584 }
585 }
586 if (pos_delta.stats.sum.num_wr) {
587 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)delta_stamp;
588 if (f) {
589 f->dump_int("write_bytes_sec", wr);
590 } else {
591 *out << byte_u_t(wr) << "/s wr, ";
592 }
593 }
594 int64_t iops_rd = pos_delta.stats.sum.num_rd / (double)delta_stamp;
595 int64_t iops_wr = pos_delta.stats.sum.num_wr / (double)delta_stamp;
596 if (f) {
597 f->dump_int("read_op_per_sec", iops_rd);
598 f->dump_int("write_op_per_sec", iops_wr);
599 } else {
600 *out << si_u_t(iops_rd) << " op/s rd, " << si_u_t(iops_wr) << " op/s wr";
601 }
602 }
603 }
604
// Cluster-wide client I/O rate: forwards to client_io_rate_summary() with
// the digest's overall delta (pg_sum_delta) and its interval (stamp_delta).
void PGMapDigest::overall_client_io_rate_summary(ceph::Formatter *f, ostream *out) const
{
  client_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
}
609
610 void PGMapDigest::pool_client_io_rate_summary(ceph::Formatter *f, ostream *out,
611 uint64_t poolid) const
612 {
613 auto p = per_pool_sum_delta.find(poolid);
614 if (p == per_pool_sum_delta.end())
615 return;
616
617 auto ts = per_pool_sum_deltas_stamps.find(p->first);
618 ceph_assert(ts != per_pool_sum_deltas_stamps.end());
619 client_io_rate_summary(f, out, p->second.first, ts->second);
620 }
621
622 void PGMapDigest::cache_io_rate_summary(ceph::Formatter *f, ostream *out,
623 const pool_stat_t& delta_sum,
624 utime_t delta_stamp) const
625 {
626 pool_stat_t pos_delta = delta_sum;
627 pos_delta.floor(0);
628 bool have_output = false;
629
630 if (pos_delta.stats.sum.num_flush) {
631 int64_t flush = (pos_delta.stats.sum.num_flush_kb << 10) / (double)delta_stamp;
632 if (f) {
633 f->dump_int("flush_bytes_sec", flush);
634 } else {
635 *out << byte_u_t(flush) << "/s flush";
636 have_output = true;
637 }
638 }
639 if (pos_delta.stats.sum.num_evict) {
640 int64_t evict = (pos_delta.stats.sum.num_evict_kb << 10) / (double)delta_stamp;
641 if (f) {
642 f->dump_int("evict_bytes_sec", evict);
643 } else {
644 if (have_output)
645 *out << ", ";
646 *out << byte_u_t(evict) << "/s evict";
647 have_output = true;
648 }
649 }
650 if (pos_delta.stats.sum.num_promote) {
651 int64_t promote = pos_delta.stats.sum.num_promote / (double)delta_stamp;
652 if (f) {
653 f->dump_int("promote_op_per_sec", promote);
654 } else {
655 if (have_output)
656 *out << ", ";
657 *out << si_u_t(promote) << " op/s promote";
658 have_output = true;
659 }
660 }
661 if (pos_delta.stats.sum.num_flush_mode_low) {
662 if (f) {
663 f->dump_int("num_flush_mode_low", pos_delta.stats.sum.num_flush_mode_low);
664 } else {
665 if (have_output)
666 *out << ", ";
667 *out << si_u_t(pos_delta.stats.sum.num_flush_mode_low) << " PGs flushing";
668 have_output = true;
669 }
670 }
671 if (pos_delta.stats.sum.num_flush_mode_high) {
672 if (f) {
673 f->dump_int("num_flush_mode_high", pos_delta.stats.sum.num_flush_mode_high);
674 } else {
675 if (have_output)
676 *out << ", ";
677 *out << si_u_t(pos_delta.stats.sum.num_flush_mode_high) << " PGs flushing (high)";
678 have_output = true;
679 }
680 }
681 if (pos_delta.stats.sum.num_evict_mode_some) {
682 if (f) {
683 f->dump_int("num_evict_mode_some", pos_delta.stats.sum.num_evict_mode_some);
684 } else {
685 if (have_output)
686 *out << ", ";
687 *out << si_u_t(pos_delta.stats.sum.num_evict_mode_some) << " PGs evicting";
688 have_output = true;
689 }
690 }
691 if (pos_delta.stats.sum.num_evict_mode_full) {
692 if (f) {
693 f->dump_int("num_evict_mode_full", pos_delta.stats.sum.num_evict_mode_full);
694 } else {
695 if (have_output)
696 *out << ", ";
697 *out << si_u_t(pos_delta.stats.sum.num_evict_mode_full) << " PGs evicting (full)";
698 }
699 }
700 }
701
// Cluster-wide cache I/O rate: forwards to cache_io_rate_summary() with
// the digest's overall delta (pg_sum_delta) and its interval (stamp_delta).
void PGMapDigest::overall_cache_io_rate_summary(ceph::Formatter *f, ostream *out) const
{
  cache_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
}
706
707 void PGMapDigest::pool_cache_io_rate_summary(ceph::Formatter *f, ostream *out,
708 uint64_t poolid) const
709 {
710 auto p = per_pool_sum_delta.find(poolid);
711 if (p == per_pool_sum_delta.end())
712 return;
713
714 auto ts = per_pool_sum_deltas_stamps.find(p->first);
715 ceph_assert(ts != per_pool_sum_deltas_stamps.end());
716 cache_io_rate_summary(f, out, p->second.first, ts->second);
717 }
718
719 ceph_statfs PGMapDigest::get_statfs(OSDMap &osdmap,
720 boost::optional<int64_t> data_pool) const
721 {
722 ceph_statfs statfs;
723 bool filter = false;
724 object_stat_sum_t sum;
725
726 if (data_pool) {
727 auto i = pg_pool_sum.find(*data_pool);
728 if (i != pg_pool_sum.end()) {
729 sum = i->second.stats.sum;
730 filter = true;
731 }
732 }
733
734 if (filter) {
735 statfs.kb_used = (sum.num_bytes >> 10);
736 statfs.kb_avail = get_pool_free_space(osdmap, *data_pool) >> 10;
737 statfs.num_objects = sum.num_objects;
738 statfs.kb = statfs.kb_used + statfs.kb_avail;
739 } else {
740 // these are in KB.
741 statfs.kb = osd_sum.statfs.kb();
742 statfs.kb_used = osd_sum.statfs.kb_used_raw();
743 statfs.kb_avail = osd_sum.statfs.kb_avail();
744 statfs.num_objects = pg_sum.stats.sum.num_objects;
745 }
746
747 return statfs;
748 }
749
750 void PGMapDigest::dump_pool_stats_full(
751 const OSDMap &osd_map,
752 stringstream *ss,
753 ceph::Formatter *f,
754 bool verbose) const
755 {
756 TextTable tbl;
757
758 if (f) {
759 f->open_array_section("pools");
760 } else {
761 tbl.define_column("POOL", TextTable::LEFT, TextTable::LEFT);
762 tbl.define_column("ID", TextTable::LEFT, TextTable::RIGHT);
763 tbl.define_column("STORED", TextTable::LEFT, TextTable::RIGHT);
764 if (verbose) {
765 tbl.define_column("(DATA)", TextTable::LEFT, TextTable::RIGHT);
766 tbl.define_column("(OMAP)", TextTable::LEFT, TextTable::RIGHT);
767 }
768 tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
769 tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
770 if (verbose) {
771 tbl.define_column("(DATA)", TextTable::LEFT, TextTable::RIGHT);
772 tbl.define_column("(OMAP)", TextTable::LEFT, TextTable::RIGHT);
773 }
774 tbl.define_column("%USED", TextTable::LEFT, TextTable::RIGHT);
775 tbl.define_column("MAX AVAIL", TextTable::LEFT, TextTable::RIGHT);
776
777 if (verbose) {
778 tbl.define_column("QUOTA OBJECTS", TextTable::LEFT, TextTable::LEFT);
779 tbl.define_column("QUOTA BYTES", TextTable::LEFT, TextTable::LEFT);
780 tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT);
781 tbl.define_column("USED COMPR", TextTable::LEFT, TextTable::RIGHT);
782 tbl.define_column("UNDER COMPR", TextTable::LEFT, TextTable::RIGHT);
783 }
784 }
785
786 map<int,uint64_t> avail_by_rule;
787 for (auto p = osd_map.get_pools().begin();
788 p != osd_map.get_pools().end(); ++p) {
789 int64_t pool_id = p->first;
790 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
791 continue;
792
793 const string& pool_name = osd_map.get_pool_name(pool_id);
794 const pool_stat_t &stat = pg_pool_sum.at(pool_id);
795
796 const pg_pool_t *pool = osd_map.get_pg_pool(pool_id);
797 int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
798 pool->get_type(),
799 pool->get_size());
800 int64_t avail;
801 if (avail_by_rule.count(ruleno) == 0) {
802 // FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked
803 avail = get_rule_avail(ruleno);
804 if (avail < 0)
805 avail = 0;
806 avail_by_rule[ruleno] = avail;
807 } else {
808 avail = avail_by_rule[ruleno];
809 }
810 if (f) {
811 f->open_object_section("pool");
812 f->dump_string("name", pool_name);
813 f->dump_int("id", pool_id);
814 f->open_object_section("stats");
815 } else {
816 tbl << pool_name
817 << pool_id;
818 }
819 float raw_used_rate = osd_map.pool_raw_used_rate(pool_id);
820 bool per_pool = use_per_pool_stats();
821 bool per_pool_omap = use_per_pool_omap_stats();
822 dump_object_stat_sum(tbl, f, stat, avail, raw_used_rate, verbose, per_pool,
823 per_pool_omap, pool);
824 if (f) {
825 f->close_section(); // stats
826 f->close_section(); // pool
827 } else {
828 tbl << TextTable::endrow;
829 }
830 }
831 if (f)
832 f->close_section();
833 else {
834 ceph_assert(ss != nullptr);
835 *ss << "--- POOLS ---\n";
836 *ss << tbl;
837 }
838 }
839
840 void PGMapDigest::dump_cluster_stats(stringstream *ss,
841 ceph::Formatter *f,
842 bool verbose) const
843 {
844 if (f) {
845 f->open_object_section("stats");
846 f->dump_int("total_bytes", osd_sum.statfs.total);
847 f->dump_int("total_avail_bytes", osd_sum.statfs.available);
848 f->dump_int("total_used_bytes", osd_sum.statfs.get_used());
849 f->dump_int("total_used_raw_bytes", osd_sum.statfs.get_used_raw());
850 f->dump_float("total_used_raw_ratio", osd_sum.statfs.get_used_raw_ratio());
851 f->dump_unsigned("num_osds", osd_sum.num_osds);
852 f->dump_unsigned("num_per_pool_osds", osd_sum.num_per_pool_osds);
853 f->dump_unsigned("num_per_pool_omap_osds", osd_sum.num_per_pool_omap_osds);
854 f->close_section();
855 f->open_object_section("stats_by_class");
856 for (auto& i : osd_sum_by_class) {
857 f->open_object_section(i.first.c_str());
858 f->dump_int("total_bytes", i.second.statfs.total);
859 f->dump_int("total_avail_bytes", i.second.statfs.available);
860 f->dump_int("total_used_bytes", i.second.statfs.get_used());
861 f->dump_int("total_used_raw_bytes", i.second.statfs.get_used_raw());
862 f->dump_float("total_used_raw_ratio",
863 i.second.statfs.get_used_raw_ratio());
864 f->close_section();
865 }
866 f->close_section();
867 } else {
868 ceph_assert(ss != nullptr);
869 TextTable tbl;
870 tbl.define_column("CLASS", TextTable::LEFT, TextTable::LEFT);
871 tbl.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
872 tbl.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
873 tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
874 tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
875 tbl.define_column("%RAW USED", TextTable::LEFT, TextTable::RIGHT);
876
877
878 for (auto& i : osd_sum_by_class) {
879 tbl << i.first;
880 tbl << stringify(byte_u_t(i.second.statfs.total))
881 << stringify(byte_u_t(i.second.statfs.available))
882 << stringify(byte_u_t(i.second.statfs.get_used()))
883 << stringify(byte_u_t(i.second.statfs.get_used_raw()))
884 << percentify(i.second.statfs.get_used_raw_ratio()*100.0)
885 << TextTable::endrow;
886 }
887 tbl << "TOTAL";
888 tbl << stringify(byte_u_t(osd_sum.statfs.total))
889 << stringify(byte_u_t(osd_sum.statfs.available))
890 << stringify(byte_u_t(osd_sum.statfs.get_used()))
891 << stringify(byte_u_t(osd_sum.statfs.get_used_raw()))
892 << percentify(osd_sum.statfs.get_used_raw_ratio()*100.0)
893 << TextTable::endrow;
894
895 *ss << "--- RAW STORAGE ---\n";
896 *ss << tbl;
897 }
898 }
899
// Emit the usage figures for one pool, either as formatter fields (when f
// is set) or as cells appended to the current row of tbl.
//
//   pool_stat      per-pool statistics to report
//   avail          raw available space for the pool's rule
//   raw_used_rate  raw-to-user space multiplier for the pool; it is scaled
//                  down here by the fraction of non-degraded object copies
//   verbose        also emit the data/omap split, quota, dirty, I/O and
//                  compression figures
//   per_pool, per_pool_omap
//                  forwarded to the pool_stat accessors
//   pool           pool definition; only dereferenced when verbose is set
void PGMapDigest::dump_object_stat_sum(
  TextTable &tbl, ceph::Formatter *f,
  const pool_stat_t &pool_stat, uint64_t avail,
  float raw_used_rate, bool verbose, bool per_pool, bool per_pool_omap,
  const pg_pool_t *pool)
{
  const object_stat_sum_t &sum = pool_stat.stats.sum;
  const store_statfs_t statfs = pool_stat.store_stats;

  // degraded copies do not occupy space yet, so discount them from the rate
  if (sum.num_object_copies > 0) {
    raw_used_rate *= (float)(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies;
  }

  uint64_t used_data_bytes = pool_stat.get_allocated_data_bytes(per_pool);
  uint64_t used_omap_bytes = pool_stat.get_allocated_omap_bytes(per_pool_omap);
  uint64_t used_bytes = used_data_bytes + used_omap_bytes;

  float used = 0.0;
  // note avail passed in is raw_avail, calc raw_used here.
  if (avail) {
    // fraction used = used / (used + avail)
    used = used_bytes;
    used /= used + avail;
  } else if (used_bytes) {
    // nothing available but something stored: report as completely full
    used = 1.0;
  }
  // user-visible available space; 0 when the rate is 0 to avoid dividing by it
  auto avail_res = raw_used_rate ? avail / raw_used_rate : 0;
  // an approximation for actually stored user data
  auto stored_data_normalized = pool_stat.get_user_data_bytes(
    raw_used_rate, per_pool);
  auto stored_omap_normalized = pool_stat.get_user_omap_bytes(
    raw_used_rate, per_pool_omap);
  auto stored_normalized = stored_data_normalized + stored_omap_normalized;
  // same, amplified by replication or EC
  auto stored_raw = stored_normalized * raw_used_rate;
  if (f) {
    f->dump_int("stored", stored_normalized);
    if (verbose) {
      f->dump_int("stored_data", stored_data_normalized);
      f->dump_int("stored_omap", stored_omap_normalized);
    }
    f->dump_int("objects", sum.num_objects);
    f->dump_int("kb_used", shift_round_up(used_bytes, 10));
    f->dump_int("bytes_used", used_bytes);
    if (verbose) {
      f->dump_int("data_bytes_used", used_data_bytes);
      f->dump_int("omap_bytes_used", used_omap_bytes);
    }
    // despite the name this is a 0..1 fraction, not multiplied by 100
    f->dump_float("percent_used", used);
    f->dump_unsigned("max_avail", avail_res);
    if (verbose) {
      f->dump_int("quota_objects", pool->quota_max_objects);
      f->dump_int("quota_bytes", pool->quota_max_bytes);
      f->dump_int("dirty", sum.num_objects_dirty);
      f->dump_int("rd", sum.num_rd);
      f->dump_int("rd_bytes", sum.num_rd_kb * 1024ull);
      f->dump_int("wr", sum.num_wr);
      f->dump_int("wr_bytes", sum.num_wr_kb * 1024ull);
      f->dump_int("compress_bytes_used", statfs.data_compressed_allocated);
      f->dump_int("compress_under_bytes", statfs.data_compressed_original);
      // Stored by user amplified by replication
      f->dump_int("stored_raw", stored_raw);
      f->dump_unsigned("avail_raw", avail);
    }
  } else {
    // cells are appended in the column order set up by the caller
    tbl << stringify(byte_u_t(stored_normalized));
    if (verbose) {
      tbl << stringify(byte_u_t(stored_data_normalized));
      tbl << stringify(byte_u_t(stored_omap_normalized));
    }
    tbl << stringify(si_u_t(sum.num_objects));
    tbl << stringify(byte_u_t(used_bytes));
    if (verbose) {
      tbl << stringify(byte_u_t(used_data_bytes));
      tbl << stringify(byte_u_t(used_omap_bytes));
    }
    tbl << percentify(used*100);
    tbl << stringify(byte_u_t(avail_res));
    if (verbose) {
      // a quota of 0 is shown as "N/A"
      if (pool->quota_max_objects == 0)
        tbl << "N/A";
      else
        tbl << stringify(si_u_t(pool->quota_max_objects));

      if (pool->quota_max_bytes == 0)
        tbl << "N/A";
      else
        tbl << stringify(byte_u_t(pool->quota_max_bytes));

      tbl << stringify(si_u_t(sum.num_objects_dirty))
          << stringify(byte_u_t(statfs.data_compressed_allocated))
          << stringify(byte_u_t(statfs.data_compressed_original))
        ;
    }
  }
}
995
996 int64_t PGMapDigest::get_pool_free_space(const OSDMap &osd_map,
997 int64_t poolid) const
998 {
999 const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
1000 int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
1001 pool->get_type(),
1002 pool->get_size());
1003 int64_t avail;
1004 avail = get_rule_avail(ruleno);
1005 if (avail < 0)
1006 avail = 0;
1007
1008 return avail / osd_map.pool_raw_used_rate(poolid);
1009 }
1010
1011 int64_t PGMap::get_rule_avail(const OSDMap& osdmap, int ruleno) const
1012 {
1013 map<int,float> wm;
1014 int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm);
1015 if (r < 0) {
1016 return r;
1017 }
1018 if (wm.empty()) {
1019 return 0;
1020 }
1021
1022 float fratio = osdmap.get_full_ratio();
1023
1024 int64_t min = -1;
1025 for (auto p = wm.begin(); p != wm.end(); ++p) {
1026 auto osd_info = osd_stat.find(p->first);
1027 if (osd_info != osd_stat.end()) {
1028 if (osd_info->second.statfs.total == 0 || p->second == 0) {
1029 // osd must be out, hence its stats have been zeroed
1030 // (unless we somehow managed to have a disk with size 0...)
1031 //
1032 // (p->second == 0), if osd weight is 0, no need to
1033 // calculate proj below.
1034 continue;
1035 }
1036 double unusable = (double)osd_info->second.statfs.kb() *
1037 (1.0 - fratio);
1038 double avail = std::max(0.0, (double)osd_info->second.statfs.kb_avail() - unusable);
1039 avail *= 1024.0;
1040 int64_t proj = (int64_t)(avail / (double)p->second);
1041 if (min < 0 || proj < min) {
1042 min = proj;
1043 }
1044 } else {
1045 if (osdmap.is_up(p->first)) {
1046 // This is a level 4 rather than an error, because we might have
1047 // only just started, and not received the first stats message yet.
1048 dout(4) << "OSD " << p->first << " is up, but has no stats" << dendl;
1049 }
1050 }
1051 }
1052 return min;
1053 }
1054
1055 void PGMap::get_rules_avail(const OSDMap& osdmap,
1056 std::map<int,int64_t> *avail_map) const
1057 {
1058 avail_map->clear();
1059 for (auto p : osdmap.get_pools()) {
1060 int64_t pool_id = p.first;
1061 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
1062 continue;
1063 const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
1064 int ruleno = osdmap.crush->find_rule(pool->get_crush_rule(),
1065 pool->get_type(),
1066 pool->get_size());
1067 if (avail_map->count(ruleno) == 0)
1068 (*avail_map)[ruleno] = get_rule_avail(osdmap, ruleno);
1069 }
1070 }
1071
1072 // ---------------------
1073 // PGMap
1074
1075 void PGMap::Incremental::dump(ceph::Formatter *f) const
1076 {
1077 f->dump_unsigned("version", version);
1078 f->dump_stream("stamp") << stamp;
1079 f->dump_unsigned("osdmap_epoch", osdmap_epoch);
1080 f->dump_unsigned("pg_scan_epoch", pg_scan);
1081
1082 f->open_array_section("pg_stat_updates");
1083 for (auto p = pg_stat_updates.begin(); p != pg_stat_updates.end(); ++p) {
1084 f->open_object_section("pg_stat");
1085 f->dump_stream("pgid") << p->first;
1086 p->second.dump(f);
1087 f->close_section();
1088 }
1089 f->close_section();
1090
1091 f->open_array_section("osd_stat_updates");
1092 for (auto p = osd_stat_updates.begin(); p != osd_stat_updates.end(); ++p) {
1093 f->open_object_section("osd_stat");
1094 f->dump_int("osd", p->first);
1095 p->second.dump(f);
1096 f->close_section();
1097 }
1098 f->close_section();
1099 f->open_array_section("pool_statfs_updates");
1100 for (auto p = pool_statfs_updates.begin(); p != pool_statfs_updates.end(); ++p) {
1101 f->open_object_section("pool_statfs");
1102 f->dump_stream("poolid/osd") << p->first;
1103 p->second.dump(f);
1104 f->close_section();
1105 }
1106 f->close_section();
1107
1108 f->open_array_section("osd_stat_removals");
1109 for (auto p = osd_stat_rm.begin(); p != osd_stat_rm.end(); ++p)
1110 f->dump_int("osd", *p);
1111 f->close_section();
1112
1113 f->open_array_section("pg_removals");
1114 for (auto p = pg_remove.begin(); p != pg_remove.end(); ++p)
1115 f->dump_stream("pgid") << *p;
1116 f->close_section();
1117 }
1118
1119 void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
1120 {
1121 o.push_back(new Incremental);
1122 o.push_back(new Incremental);
1123 o.back()->version = 1;
1124 o.back()->stamp = utime_t(123,345);
1125 o.push_back(new Incremental);
1126 o.back()->version = 2;
1127 o.back()->pg_stat_updates[pg_t(1,2)] = pg_stat_t();
1128 o.back()->osd_stat_updates[5] = osd_stat_t();
1129 o.push_back(new Incremental);
1130 o.back()->version = 3;
1131 o.back()->osdmap_epoch = 1;
1132 o.back()->pg_scan = 2;
1133 o.back()->pg_stat_updates[pg_t(4,5)] = pg_stat_t();
1134 o.back()->osd_stat_updates[6] = osd_stat_t();
1135 o.back()->pg_remove.insert(pg_t(1,2));
1136 o.back()->osd_stat_rm.insert(5);
1137 o.back()->pool_statfs_updates[std::make_pair(1234,4)] = store_statfs_t();
1138 }
1139
1140 // --
1141
1142 void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
1143 {
1144 ceph_assert(inc.version == version+1);
1145 version++;
1146
1147 pool_stat_t pg_sum_old = pg_sum;
1148 mempool::pgmap::unordered_map<int32_t, pool_stat_t> pg_pool_sum_old;
1149 pg_pool_sum_old = pg_pool_sum;
1150
1151 for (auto p = inc.pg_stat_updates.begin();
1152 p != inc.pg_stat_updates.end();
1153 ++p) {
1154 const pg_t &update_pg(p->first);
1155 auto update_pool = update_pg.pool();
1156 const pg_stat_t &update_stat(p->second);
1157
1158 auto pg_stat_iter = pg_stat.find(update_pg);
1159 pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool];
1160 if (pg_stat_iter == pg_stat.end()) {
1161 pg_stat.insert(make_pair(update_pg, update_stat));
1162 } else {
1163 stat_pg_sub(update_pg, pg_stat_iter->second);
1164 pool_sum_ref.sub(pg_stat_iter->second);
1165 pg_stat_iter->second = update_stat;
1166 }
1167 stat_pg_add(update_pg, update_stat);
1168 pool_sum_ref.add(update_stat);
1169 }
1170
1171 for (auto p = inc.pool_statfs_updates.begin();
1172 p != inc.pool_statfs_updates.end();
1173 ++p) {
1174 auto update_pool = p->first.first;
1175 auto update_osd = p->first.second;
1176 auto& statfs_inc = p->second;
1177
1178 auto pool_statfs_iter =
1179 pool_statfs.find(std::make_pair(update_pool, update_osd));
1180 if (pg_pool_sum.count(update_pool)) {
1181 pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool];
1182 if (pool_statfs_iter == pool_statfs.end()) {
1183 pool_statfs.emplace(std::make_pair(update_pool, update_osd), statfs_inc);
1184 } else {
1185 pool_sum_ref.sub(pool_statfs_iter->second);
1186 pool_statfs_iter->second = statfs_inc;
1187 }
1188 pool_sum_ref.add(statfs_inc);
1189 }
1190 }
1191
1192 for (auto p = inc.get_osd_stat_updates().begin();
1193 p != inc.get_osd_stat_updates().end();
1194 ++p) {
1195 int osd = p->first;
1196 const osd_stat_t &new_stats(p->second);
1197
1198 auto t = osd_stat.find(osd);
1199 if (t == osd_stat.end()) {
1200 osd_stat.insert(make_pair(osd, new_stats));
1201 } else {
1202 stat_osd_sub(t->first, t->second);
1203 t->second = new_stats;
1204 }
1205 stat_osd_add(osd, new_stats);
1206 }
1207 set<int64_t> deleted_pools;
1208 for (auto p = inc.pg_remove.begin();
1209 p != inc.pg_remove.end();
1210 ++p) {
1211 const pg_t &removed_pg(*p);
1212 auto s = pg_stat.find(removed_pg);
1213 bool pool_erased = false;
1214 if (s != pg_stat.end()) {
1215 pool_erased = stat_pg_sub(removed_pg, s->second);
1216
1217 // decrease pool stats if pg was removed
1218 auto pool_stats_it = pg_pool_sum.find(removed_pg.pool());
1219 if (pool_stats_it != pg_pool_sum.end()) {
1220 pool_stats_it->second.sub(s->second);
1221 }
1222
1223 pg_stat.erase(s);
1224 if (pool_erased) {
1225 deleted_pools.insert(removed_pg.pool());
1226 }
1227 }
1228 }
1229
1230 for (auto p = inc.get_osd_stat_rm().begin();
1231 p != inc.get_osd_stat_rm().end();
1232 ++p) {
1233 auto t = osd_stat.find(*p);
1234 if (t != osd_stat.end()) {
1235 stat_osd_sub(t->first, t->second);
1236 osd_stat.erase(t);
1237 }
1238 for (auto i = pool_statfs.begin(); i != pool_statfs.end(); ++i) {
1239 if (i->first.second == *p) {
1240 pg_pool_sum[i->first.first].sub(i->second);
1241 pool_statfs.erase(i);
1242 }
1243 }
1244 }
1245
1246 // skip calculating delta while sum was not synchronized
1247 if (!stamp.is_zero() && !pg_sum_old.stats.sum.is_zero()) {
1248 utime_t delta_t;
1249 delta_t = inc.stamp;
1250 delta_t -= stamp;
1251 // calculate a delta, and average over the last 2 deltas.
1252 pool_stat_t d = pg_sum;
1253 d.stats.sub(pg_sum_old.stats);
1254 pg_sum_deltas.push_back(make_pair(d, delta_t));
1255 stamp_delta += delta_t;
1256 pg_sum_delta.stats.add(d.stats);
1257 auto smooth_intervals =
1258 cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
1259 while (pg_sum_deltas.size() > smooth_intervals) {
1260 pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats);
1261 stamp_delta -= pg_sum_deltas.front().second;
1262 pg_sum_deltas.pop_front();
1263 }
1264 }
1265 stamp = inc.stamp;
1266
1267 update_pool_deltas(cct, inc.stamp, pg_pool_sum_old);
1268
1269 for (auto p : deleted_pools) {
1270 if (cct)
1271 dout(20) << " deleted pool " << p << dendl;
1272 deleted_pool(p);
1273 }
1274
1275 if (inc.osdmap_epoch)
1276 last_osdmap_epoch = inc.osdmap_epoch;
1277 if (inc.pg_scan)
1278 last_pg_scan = inc.pg_scan;
1279 }
1280
1281 void PGMap::calc_stats()
1282 {
1283 num_pg = 0;
1284 num_pg_active = 0;
1285 num_pg_unknown = 0;
1286 num_osd = 0;
1287 pg_pool_sum.clear();
1288 num_pg_by_pool.clear();
1289 pg_by_osd.clear();
1290 pg_sum = pool_stat_t();
1291 osd_sum = osd_stat_t();
1292 osd_sum_by_class.clear();
1293 num_pg_by_state.clear();
1294 num_pg_by_pool_state.clear();
1295 num_pg_by_osd.clear();
1296
1297 for (auto p = pg_stat.begin();
1298 p != pg_stat.end();
1299 ++p) {
1300 auto pg = p->first;
1301 stat_pg_add(pg, p->second);
1302 pg_pool_sum[pg.pool()].add(p->second);
1303 }
1304 for (auto p = pool_statfs.begin();
1305 p != pool_statfs.end();
1306 ++p) {
1307 auto pool = p->first.first;
1308 pg_pool_sum[pool].add(p->second);
1309 }
1310 for (auto p = osd_stat.begin();
1311 p != osd_stat.end();
1312 ++p)
1313 stat_osd_add(p->first, p->second);
1314 }
1315
1316 void PGMap::stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
1317 bool sameosds)
1318 {
1319 auto pool = pgid.pool();
1320 pg_sum.add(s);
1321
1322 num_pg++;
1323 num_pg_by_state[s.state]++;
1324 num_pg_by_pool_state[pgid.pool()][s.state]++;
1325 num_pg_by_pool[pool]++;
1326
1327 if ((s.state & PG_STATE_CREATING) &&
1328 s.parent_split_bits == 0) {
1329 creating_pgs.insert(pgid);
1330 if (s.acting_primary >= 0) {
1331 creating_pgs_by_osd_epoch[s.acting_primary][s.mapping_epoch].insert(pgid);
1332 }
1333 }
1334
1335 if (s.state & PG_STATE_ACTIVE) {
1336 ++num_pg_active;
1337 }
1338 if (s.state == 0) {
1339 ++num_pg_unknown;
1340 }
1341
1342 if (sameosds)
1343 return;
1344
1345 for (auto p = s.blocked_by.begin();
1346 p != s.blocked_by.end();
1347 ++p) {
1348 ++blocked_by_sum[*p];
1349 }
1350
1351 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
1352 pg_by_osd[*p].insert(pgid);
1353 num_pg_by_osd[*p].acting++;
1354 }
1355 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
1356 auto& t = pg_by_osd[*p];
1357 if (t.find(pgid) == t.end()) {
1358 t.insert(pgid);
1359 num_pg_by_osd[*p].up_not_acting++;
1360 }
1361 }
1362
1363 if (s.up_primary >= 0) {
1364 num_pg_by_osd[s.up_primary].primary++;
1365 }
1366 }
1367
1368 bool PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
1369 bool sameosds)
1370 {
1371 bool pool_erased = false;
1372 pg_sum.sub(s);
1373
1374 num_pg--;
1375 int end = --num_pg_by_state[s.state];
1376 ceph_assert(end >= 0);
1377 if (end == 0)
1378 num_pg_by_state.erase(s.state);
1379 if (--num_pg_by_pool_state[pgid.pool()][s.state] == 0) {
1380 num_pg_by_pool_state[pgid.pool()].erase(s.state);
1381 }
1382 end = --num_pg_by_pool[pgid.pool()];
1383 if (end == 0) {
1384 pool_erased = true;
1385 }
1386
1387 if ((s.state & PG_STATE_CREATING) &&
1388 s.parent_split_bits == 0) {
1389 creating_pgs.erase(pgid);
1390 if (s.acting_primary >= 0) {
1391 map<epoch_t,set<pg_t> >& r = creating_pgs_by_osd_epoch[s.acting_primary];
1392 r[s.mapping_epoch].erase(pgid);
1393 if (r[s.mapping_epoch].empty())
1394 r.erase(s.mapping_epoch);
1395 if (r.empty())
1396 creating_pgs_by_osd_epoch.erase(s.acting_primary);
1397 }
1398 }
1399
1400 if (s.state & PG_STATE_ACTIVE) {
1401 --num_pg_active;
1402 }
1403 if (s.state == 0) {
1404 --num_pg_unknown;
1405 }
1406
1407 if (sameosds)
1408 return pool_erased;
1409
1410 for (auto p = s.blocked_by.begin();
1411 p != s.blocked_by.end();
1412 ++p) {
1413 auto q = blocked_by_sum.find(*p);
1414 ceph_assert(q != blocked_by_sum.end());
1415 --q->second;
1416 if (q->second == 0)
1417 blocked_by_sum.erase(q);
1418 }
1419
1420 set<int32_t> actingset;
1421 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
1422 actingset.insert(*p);
1423 auto& oset = pg_by_osd[*p];
1424 oset.erase(pgid);
1425 if (oset.empty())
1426 pg_by_osd.erase(*p);
1427 auto it = num_pg_by_osd.find(*p);
1428 if (it != num_pg_by_osd.end() && it->second.acting > 0)
1429 it->second.acting--;
1430 }
1431 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
1432 auto& oset = pg_by_osd[*p];
1433 oset.erase(pgid);
1434 if (oset.empty())
1435 pg_by_osd.erase(*p);
1436 if (actingset.count(*p))
1437 continue;
1438 auto it = num_pg_by_osd.find(*p);
1439 if (it != num_pg_by_osd.end() && it->second.up_not_acting > 0)
1440 it->second.up_not_acting--;
1441 }
1442
1443 if (s.up_primary >= 0) {
1444 auto it = num_pg_by_osd.find(s.up_primary);
1445 if (it != num_pg_by_osd.end() && it->second.primary > 0)
1446 it->second.primary--;
1447 }
1448 return pool_erased;
1449 }
1450
1451 void PGMap::calc_purged_snaps()
1452 {
1453 purged_snaps.clear();
1454 set<int64_t> unknown;
1455 for (auto& i : pg_stat) {
1456 if (i.second.state == 0) {
1457 unknown.insert(i.first.pool());
1458 purged_snaps.erase(i.first.pool());
1459 continue;
1460 } else if (unknown.count(i.first.pool())) {
1461 continue;
1462 }
1463 auto j = purged_snaps.find(i.first.pool());
1464 if (j == purged_snaps.end()) {
1465 // base case
1466 purged_snaps[i.first.pool()] = i.second.purged_snaps;
1467 } else {
1468 j->second.intersection_of(i.second.purged_snaps);
1469 }
1470 }
1471 }
1472
1473 void PGMap::calc_osd_sum_by_class(const OSDMap& osdmap)
1474 {
1475 osd_sum_by_class.clear();
1476 for (auto& i : osd_stat) {
1477 const char *class_name = osdmap.crush->get_item_class(i.first);
1478 if (class_name) {
1479 osd_sum_by_class[class_name].add(i.second);
1480 }
1481 }
1482 }
1483
1484 void PGMap::stat_osd_add(int osd, const osd_stat_t &s)
1485 {
1486 num_osd++;
1487 osd_sum.add(s);
1488 if (osd >= (int)osd_last_seq.size()) {
1489 osd_last_seq.resize(osd + 1);
1490 }
1491 osd_last_seq[osd] = s.seq;
1492 }
1493
1494 void PGMap::stat_osd_sub(int osd, const osd_stat_t &s)
1495 {
1496 num_osd--;
1497 osd_sum.sub(s);
1498 ceph_assert(osd < (int)osd_last_seq.size());
1499 osd_last_seq[osd] = 0;
1500 }
1501
1502 void PGMap::encode_digest(const OSDMap& osdmap,
1503 bufferlist& bl, uint64_t features)
1504 {
1505 get_rules_avail(osdmap, &avail_space_by_rule);
1506 calc_osd_sum_by_class(osdmap);
1507 calc_purged_snaps();
1508 PGMapDigest::encode(bl, features);
1509 }
1510
1511 void PGMap::encode(bufferlist &bl, uint64_t features) const
1512 {
1513 ENCODE_START(8, 8, bl);
1514 encode(version, bl);
1515 encode(pg_stat, bl);
1516 encode(osd_stat, bl, features);
1517 encode(last_osdmap_epoch, bl);
1518 encode(last_pg_scan, bl);
1519 encode(stamp, bl);
1520 encode(pool_statfs, bl, features);
1521 ENCODE_FINISH(bl);
1522 }
1523
1524 void PGMap::decode(bufferlist::const_iterator &bl)
1525 {
1526 DECODE_START(8, bl);
1527 decode(version, bl);
1528 decode(pg_stat, bl);
1529 decode(osd_stat, bl);
1530 decode(last_osdmap_epoch, bl);
1531 decode(last_pg_scan, bl);
1532 decode(stamp, bl);
1533 decode(pool_statfs, bl);
1534 DECODE_FINISH(bl);
1535
1536 calc_stats();
1537 }
1538
1539 void PGMap::dump(ceph::Formatter *f, bool with_net) const
1540 {
1541 dump_basic(f);
1542 dump_pg_stats(f, false);
1543 dump_pool_stats(f);
1544 dump_osd_stats(f, with_net);
1545 }
1546
1547 void PGMap::dump_basic(ceph::Formatter *f) const
1548 {
1549 f->dump_unsigned("version", version);
1550 f->dump_stream("stamp") << stamp;
1551 f->dump_unsigned("last_osdmap_epoch", last_osdmap_epoch);
1552 f->dump_unsigned("last_pg_scan", last_pg_scan);
1553
1554 f->open_object_section("pg_stats_sum");
1555 pg_sum.dump(f);
1556 f->close_section();
1557
1558 f->open_object_section("osd_stats_sum");
1559 osd_sum.dump(f);
1560 f->close_section();
1561
1562 dump_delta(f);
1563 }
1564
1565 void PGMap::dump_delta(ceph::Formatter *f) const
1566 {
1567 f->open_object_section("pg_stats_delta");
1568 pg_sum_delta.dump(f);
1569 f->dump_stream("stamp_delta") << stamp_delta;
1570 f->close_section();
1571 }
1572
1573 void PGMap::dump_pg_stats(ceph::Formatter *f, bool brief) const
1574 {
1575 f->open_array_section("pg_stats");
1576 for (auto i = pg_stat.begin();
1577 i != pg_stat.end();
1578 ++i) {
1579 f->open_object_section("pg_stat");
1580 f->dump_stream("pgid") << i->first;
1581 if (brief)
1582 i->second.dump_brief(f);
1583 else
1584 i->second.dump(f);
1585 f->close_section();
1586 }
1587 f->close_section();
1588 }
1589
1590 void PGMap::dump_pool_stats(ceph::Formatter *f) const
1591 {
1592 f->open_array_section("pool_stats");
1593 for (auto p = pg_pool_sum.begin();
1594 p != pg_pool_sum.end();
1595 ++p) {
1596 f->open_object_section("pool_stat");
1597 f->dump_int("poolid", p->first);
1598 auto q = num_pg_by_pool.find(p->first);
1599 if (q != num_pg_by_pool.end())
1600 f->dump_unsigned("num_pg", q->second);
1601 p->second.dump(f);
1602 f->close_section();
1603 }
1604 f->close_section();
1605 }
1606
1607 void PGMap::dump_osd_stats(ceph::Formatter *f, bool with_net) const
1608 {
1609 f->open_array_section("osd_stats");
1610 for (auto q = osd_stat.begin();
1611 q != osd_stat.end();
1612 ++q) {
1613 f->open_object_section("osd_stat");
1614 f->dump_int("osd", q->first);
1615 q->second.dump(f, with_net);
1616 f->close_section();
1617 }
1618 f->close_section();
1619 }
1620
1621 void PGMap::dump_osd_ping_times(ceph::Formatter *f) const
1622 {
1623 f->open_array_section("osd_ping_times");
1624 for (auto& [osd, stat] : osd_stat) {
1625 f->open_object_section("osd_ping_time");
1626 f->dump_int("osd", osd);
1627 stat.dump_ping_time(f);
1628 f->close_section();
1629 }
1630 f->close_section();
1631 }
1632
1633 void PGMap::dump_pg_stats_plain(
1634 ostream& ss,
1635 const mempool::pgmap::unordered_map<pg_t, pg_stat_t>& pg_stats,
1636 bool brief) const
1637 {
1638 TextTable tab;
1639
1640 if (brief){
1641 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1642 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1643 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1644 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1645 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1646 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1647 }
1648 else {
1649 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1650 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1651 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1652 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1653 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1654 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1655 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1656 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
1657 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
1658 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1659 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1660 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1661 tab.define_column("STATE_STAMP", TextTable::LEFT, TextTable::RIGHT);
1662 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
1663 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
1664 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1665 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1666 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1667 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1668 tab.define_column("LAST_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1669 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1670 tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1671 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1672 tab.define_column("SNAPTRIMQ_LEN", TextTable::LEFT, TextTable::RIGHT);
1673 }
1674
1675 for (auto i = pg_stats.begin();
1676 i != pg_stats.end(); ++i) {
1677 const pg_stat_t &st(i->second);
1678 if (brief) {
1679 tab << i->first
1680 << pg_state_string(st.state)
1681 << st.up
1682 << st.up_primary
1683 << st.acting
1684 << st.acting_primary
1685 << TextTable::endrow;
1686 } else {
1687 ostringstream reported;
1688 reported << st.reported_epoch << ":" << st.reported_seq;
1689
1690 tab << i->first
1691 << st.stats.sum.num_objects
1692 << st.stats.sum.num_objects_missing_on_primary
1693 << st.stats.sum.num_objects_degraded
1694 << st.stats.sum.num_objects_misplaced
1695 << st.stats.sum.num_objects_unfound
1696 << st.stats.sum.num_bytes
1697 << st.stats.sum.num_omap_bytes
1698 << st.stats.sum.num_omap_keys
1699 << st.log_size
1700 << st.ondisk_log_size
1701 << pg_state_string(st.state)
1702 << st.last_change
1703 << st.version
1704 << reported.str()
1705 << pg_vector_string(st.up)
1706 << st.up_primary
1707 << pg_vector_string(st.acting)
1708 << st.acting_primary
1709 << st.last_scrub
1710 << st.last_scrub_stamp
1711 << st.last_deep_scrub
1712 << st.last_deep_scrub_stamp
1713 << st.snaptrimq_len
1714 << TextTable::endrow;
1715 }
1716 }
1717
1718 ss << tab;
1719 }
1720
1721 void PGMap::dump(ostream& ss) const
1722 {
1723 dump_basic(ss);
1724 dump_pg_stats(ss, false);
1725 dump_pool_stats(ss, false);
1726 dump_pg_sum_stats(ss, false);
1727 dump_osd_stats(ss);
1728 }
1729
1730 void PGMap::dump_basic(ostream& ss) const
1731 {
1732 ss << "version " << version << std::endl;
1733 ss << "stamp " << stamp << std::endl;
1734 ss << "last_osdmap_epoch " << last_osdmap_epoch << std::endl;
1735 ss << "last_pg_scan " << last_pg_scan << std::endl;
1736 }
1737
1738 void PGMap::dump_pg_stats(ostream& ss, bool brief) const
1739 {
1740 dump_pg_stats_plain(ss, pg_stat, brief);
1741 }
1742
1743 void PGMap::dump_pool_stats(ostream& ss, bool header) const
1744 {
1745 TextTable tab;
1746
1747 if (header) {
1748 tab.define_column("POOLID", TextTable::LEFT, TextTable::LEFT);
1749 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1750 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1751 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1752 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1753 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1754 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1755 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
1756 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
1757 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1758 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1759 } else {
1760 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1761 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1762 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1763 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1764 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1765 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1766 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1767 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1768 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1769 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1770 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1771 }
1772
1773 for (auto p = pg_pool_sum.begin();
1774 p != pg_pool_sum.end();
1775 ++p) {
1776 tab << p->first
1777 << p->second.stats.sum.num_objects
1778 << p->second.stats.sum.num_objects_missing_on_primary
1779 << p->second.stats.sum.num_objects_degraded
1780 << p->second.stats.sum.num_objects_misplaced
1781 << p->second.stats.sum.num_objects_unfound
1782 << p->second.stats.sum.num_bytes
1783 << p->second.stats.sum.num_omap_bytes
1784 << p->second.stats.sum.num_omap_keys
1785 << p->second.log_size
1786 << p->second.ondisk_log_size
1787 << TextTable::endrow;
1788 }
1789
1790 ss << tab;
1791 }
1792
1793 void PGMap::dump_pg_sum_stats(ostream& ss, bool header) const
1794 {
1795 TextTable tab;
1796
1797 if (header) {
1798 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1799 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1800 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1801 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1802 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1803 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1804 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1805 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
1806 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
1807 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1808 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1809 } else {
1810 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1811 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1812 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1813 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1814 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1815 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1816 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1817 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1818 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1819 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1820 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1821 };
1822
1823 tab << "sum"
1824 << pg_sum.stats.sum.num_objects
1825 << pg_sum.stats.sum.num_objects_missing_on_primary
1826 << pg_sum.stats.sum.num_objects_degraded
1827 << pg_sum.stats.sum.num_objects_misplaced
1828 << pg_sum.stats.sum.num_objects_unfound
1829 << pg_sum.stats.sum.num_bytes
1830 << pg_sum.stats.sum.num_omap_bytes
1831 << pg_sum.stats.sum.num_omap_keys
1832 << pg_sum.log_size
1833 << pg_sum.ondisk_log_size
1834 << TextTable::endrow;
1835
1836 ss << tab;
1837 }
1838
1839 void PGMap::dump_osd_stats(ostream& ss) const
1840 {
1841 TextTable tab;
1842
1843 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1844 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1845 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
1846 tab.define_column("USED_RAW", TextTable::LEFT, TextTable::RIGHT);
1847 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
1848 tab.define_column("HB_PEERS", TextTable::LEFT, TextTable::RIGHT);
1849 tab.define_column("PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1850 tab.define_column("PRIMARY_PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1851
1852 for (auto p = osd_stat.begin();
1853 p != osd_stat.end();
1854 ++p) {
1855 tab << p->first
1856 << byte_u_t(p->second.statfs.get_used())
1857 << byte_u_t(p->second.statfs.available)
1858 << byte_u_t(p->second.statfs.get_used_raw())
1859 << byte_u_t(p->second.statfs.total)
1860 << p->second.hb_peers
1861 << get_num_pg_by_osd(p->first)
1862 << get_num_primary_pg_by_osd(p->first)
1863 << TextTable::endrow;
1864 }
1865
1866 tab << "sum"
1867 << byte_u_t(osd_sum.statfs.get_used())
1868 << byte_u_t(osd_sum.statfs.available)
1869 << byte_u_t(osd_sum.statfs.get_used_raw())
1870 << byte_u_t(osd_sum.statfs.total)
1871 << TextTable::endrow;
1872
1873 ss << tab;
1874 }
1875
1876 void PGMap::dump_osd_sum_stats(ostream& ss) const
1877 {
1878 TextTable tab;
1879
1880 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1881 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1882 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
1883 tab.define_column("USED_RAW", TextTable::LEFT, TextTable::RIGHT);
1884 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
1885
1886 tab << "sum"
1887 << byte_u_t(osd_sum.statfs.get_used())
1888 << byte_u_t(osd_sum.statfs.available)
1889 << byte_u_t(osd_sum.statfs.get_used_raw())
1890 << byte_u_t(osd_sum.statfs.total)
1891 << TextTable::endrow;
1892
1893 ss << tab;
1894 }
1895
1896 void PGMap::get_stuck_stats(
1897 int types, const utime_t cutoff,
1898 mempool::pgmap::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const
1899 {
1900 ceph_assert(types != 0);
1901 for (auto i = pg_stat.begin();
1902 i != pg_stat.end();
1903 ++i) {
1904 utime_t val = cutoff; // don't care about >= cutoff so that is infinity
1905
1906 if ((types & STUCK_INACTIVE) && !(i->second.state & PG_STATE_ACTIVE)) {
1907 if (i->second.last_active < val)
1908 val = i->second.last_active;
1909 }
1910
1911 if ((types & STUCK_UNCLEAN) && !(i->second.state & PG_STATE_CLEAN)) {
1912 if (i->second.last_clean < val)
1913 val = i->second.last_clean;
1914 }
1915
1916 if ((types & STUCK_DEGRADED) && (i->second.state & PG_STATE_DEGRADED)) {
1917 if (i->second.last_undegraded < val)
1918 val = i->second.last_undegraded;
1919 }
1920
1921 if ((types & STUCK_UNDERSIZED) && (i->second.state & PG_STATE_UNDERSIZED)) {
1922 if (i->second.last_fullsized < val)
1923 val = i->second.last_fullsized;
1924 }
1925
1926 if ((types & STUCK_STALE) && (i->second.state & PG_STATE_STALE)) {
1927 if (i->second.last_unstale < val)
1928 val = i->second.last_unstale;
1929 }
1930
1931 // val is now the earliest any of the requested stuck states began
1932 if (val < cutoff) {
1933 stuck_pgs[i->first] = i->second;
1934 }
1935 }
1936 }
1937
1938 bool PGMap::get_stuck_counts(const utime_t cutoff, map<string, int>& note) const
1939 {
1940 int inactive = 0;
1941 int unclean = 0;
1942 int degraded = 0;
1943 int undersized = 0;
1944 int stale = 0;
1945
1946 for (auto i = pg_stat.begin();
1947 i != pg_stat.end();
1948 ++i) {
1949 if (! (i->second.state & PG_STATE_ACTIVE)) {
1950 if (i->second.last_active < cutoff)
1951 ++inactive;
1952 }
1953 if (! (i->second.state & PG_STATE_CLEAN)) {
1954 if (i->second.last_clean < cutoff)
1955 ++unclean;
1956 }
1957 if (i->second.state & PG_STATE_DEGRADED) {
1958 if (i->second.last_undegraded < cutoff)
1959 ++degraded;
1960 }
1961 if (i->second.state & PG_STATE_UNDERSIZED) {
1962 if (i->second.last_fullsized < cutoff)
1963 ++undersized;
1964 }
1965 if (i->second.state & PG_STATE_STALE) {
1966 if (i->second.last_unstale < cutoff)
1967 ++stale;
1968 }
1969 }
1970
1971 if (inactive)
1972 note["stuck inactive"] = inactive;
1973
1974 if (unclean)
1975 note["stuck unclean"] = unclean;
1976
1977 if (undersized)
1978 note["stuck undersized"] = undersized;
1979
1980 if (degraded)
1981 note["stuck degraded"] = degraded;
1982
1983 if (stale)
1984 note["stuck stale"] = stale;
1985
1986 return inactive || unclean || undersized || degraded || stale;
1987 }
1988
1989 void PGMap::dump_stuck(ceph::Formatter *f, int types, utime_t cutoff) const
1990 {
1991 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
1992 get_stuck_stats(types, cutoff, stuck_pg_stats);
1993 f->open_array_section("stuck_pg_stats");
1994 for (auto i = stuck_pg_stats.begin();
1995 i != stuck_pg_stats.end();
1996 ++i) {
1997 f->open_object_section("pg_stat");
1998 f->dump_stream("pgid") << i->first;
1999 i->second.dump(f);
2000 f->close_section();
2001 }
2002 f->close_section();
2003 }
2004
2005 void PGMap::dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const
2006 {
2007 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
2008 get_stuck_stats(types, cutoff, stuck_pg_stats);
2009 if (!stuck_pg_stats.empty())
2010 dump_pg_stats_plain(ss, stuck_pg_stats, true);
2011 }
2012
2013 int PGMap::dump_stuck_pg_stats(
2014 stringstream &ds,
2015 ceph::Formatter *f,
2016 int threshold,
2017 vector<string>& args) const
2018 {
2019 int stuck_types = 0;
2020
2021 for (auto i = args.begin(); i != args.end(); ++i) {
2022 if (*i == "inactive")
2023 stuck_types |= PGMap::STUCK_INACTIVE;
2024 else if (*i == "unclean")
2025 stuck_types |= PGMap::STUCK_UNCLEAN;
2026 else if (*i == "undersized")
2027 stuck_types |= PGMap::STUCK_UNDERSIZED;
2028 else if (*i == "degraded")
2029 stuck_types |= PGMap::STUCK_DEGRADED;
2030 else if (*i == "stale")
2031 stuck_types |= PGMap::STUCK_STALE;
2032 else {
2033 ds << "Unknown type: " << *i << std::endl;
2034 return -EINVAL;
2035 }
2036 }
2037
2038 utime_t now(ceph_clock_now());
2039 utime_t cutoff = now - utime_t(threshold, 0);
2040
2041 if (!f) {
2042 dump_stuck_plain(ds, stuck_types, cutoff);
2043 } else {
2044 dump_stuck(f, stuck_types, cutoff);
2045 f->flush(ds);
2046 }
2047
2048 return 0;
2049 }
2050
2051 void PGMap::dump_osd_perf_stats(ceph::Formatter *f) const
2052 {
2053 f->open_array_section("osd_perf_infos");
2054 for (auto i = osd_stat.begin();
2055 i != osd_stat.end();
2056 ++i) {
2057 f->open_object_section("osd");
2058 f->dump_int("id", i->first);
2059 {
2060 f->open_object_section("perf_stats");
2061 i->second.os_perf_stat.dump(f);
2062 f->close_section();
2063 }
2064 f->close_section();
2065 }
2066 f->close_section();
2067 }
2068 void PGMap::print_osd_perf_stats(std::ostream *ss) const
2069 {
2070 TextTable tab;
2071 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2072 tab.define_column("commit_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2073 tab.define_column("apply_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2074 for (auto i = osd_stat.begin();
2075 i != osd_stat.end();
2076 ++i) {
2077 tab << i->first;
2078 tab << i->second.os_perf_stat.os_commit_latency_ns / 1000000ull;
2079 tab << i->second.os_perf_stat.os_apply_latency_ns / 1000000ull;
2080 tab << TextTable::endrow;
2081 }
2082 (*ss) << tab;
2083 }
2084
2085 void PGMap::dump_osd_blocked_by_stats(ceph::Formatter *f) const
2086 {
2087 f->open_array_section("osd_blocked_by_infos");
2088 for (auto i = blocked_by_sum.begin();
2089 i != blocked_by_sum.end();
2090 ++i) {
2091 f->open_object_section("osd");
2092 f->dump_int("id", i->first);
2093 f->dump_int("num_blocked", i->second);
2094 f->close_section();
2095 }
2096 f->close_section();
2097 }
2098 void PGMap::print_osd_blocked_by_stats(std::ostream *ss) const
2099 {
2100 TextTable tab;
2101 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2102 tab.define_column("num_blocked", TextTable::LEFT, TextTable::RIGHT);
2103 for (auto i = blocked_by_sum.begin();
2104 i != blocked_by_sum.end();
2105 ++i) {
2106 tab << i->first;
2107 tab << i->second;
2108 tab << TextTable::endrow;
2109 }
2110 (*ss) << tab;
2111 }
2112
2113
2114 /**
2115 * update aggregated delta
2116 *
2117 * @param cct ceph context
2118 * @param ts Timestamp for the stats being delta'ed
2119 * @param old_pool_sum Previous stats sum
2120 * @param last_ts Last timestamp for pool
2121 * @param result_pool_sum Resulting stats
2122 * @param result_pool_delta Resulting pool delta
2123 * @param result_ts_delta Resulting timestamp delta
2124 * @param delta_avg_list List of last N computed deltas, used to average
2125 */
2126 void PGMap::update_delta(
2127 CephContext *cct,
2128 const utime_t ts,
2129 const pool_stat_t& old_pool_sum,
2130 utime_t *last_ts,
2131 const pool_stat_t& current_pool_sum,
2132 pool_stat_t *result_pool_delta,
2133 utime_t *result_ts_delta,
2134 mempool::pgmap::list<pair<pool_stat_t,utime_t> > *delta_avg_list)
2135 {
2136 /* @p ts is the timestamp we want to associate with the data
2137 * in @p old_pool_sum, and on which we will base ourselves to
2138 * calculate the delta, stored in 'delta_t'.
2139 */
2140 utime_t delta_t;
2141 delta_t = ts; // start with the provided timestamp
2142 delta_t -= *last_ts; // take the last timestamp we saw
2143 *last_ts = ts; // @p ts becomes the last timestamp we saw
2144
2145 // adjust delta_t, quick start if there is no update in a long period
2146 delta_t = std::min(delta_t,
2147 utime_t(2 * (cct ? cct->_conf->mon_delta_reset_interval : 10), 0));
2148
2149 // calculate a delta, and average over the last 6 deltas by default.
2150 /* start by taking a copy of our current @p result_pool_sum, and by
2151 * taking out the stats from @p old_pool_sum. This generates a stats
2152 * delta. Stash this stats delta in @p delta_avg_list, along with the
2153 * timestamp delta for these results.
2154 */
2155 pool_stat_t d = current_pool_sum;
2156 d.stats.sub(old_pool_sum.stats);
2157
2158 /* Aggregate current delta, and take out the last seen delta (if any) to
2159 * average it out.
2160 * Skip calculating delta while sum was not synchronized.
2161 */
2162 if(!old_pool_sum.stats.sum.is_zero()) {
2163 delta_avg_list->push_back(make_pair(d,delta_t));
2164 *result_ts_delta += delta_t;
2165 result_pool_delta->stats.add(d.stats);
2166 }
2167 size_t s = cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
2168 while (delta_avg_list->size() > s) {
2169 result_pool_delta->stats.sub(delta_avg_list->front().first.stats);
2170 *result_ts_delta -= delta_avg_list->front().second;
2171 delta_avg_list->pop_front();
2172 }
2173 }
2174
2175 /**
2176 * Update a given pool's deltas
2177 *
2178 * @param cct Ceph Context
2179 * @param ts Timestamp for the stats being delta'ed
2180 * @param pool Pool's id
2181 * @param old_pool_sum Previous stats sum
2182 */
2183 void PGMap::update_one_pool_delta(
2184 CephContext *cct,
2185 const utime_t ts,
2186 const int64_t pool,
2187 const pool_stat_t& old_pool_sum)
2188 {
2189 if (per_pool_sum_deltas.count(pool) == 0) {
2190 ceph_assert(per_pool_sum_deltas_stamps.count(pool) == 0);
2191 ceph_assert(per_pool_sum_delta.count(pool) == 0);
2192 }
2193
2194 auto& sum_delta = per_pool_sum_delta[pool];
2195
2196 update_delta(cct, ts, old_pool_sum, &sum_delta.second, pg_pool_sum[pool],
2197 &sum_delta.first, &per_pool_sum_deltas_stamps[pool],
2198 &per_pool_sum_deltas[pool]);
2199 }
2200
2201 /**
2202 * Update pools' deltas
2203 *
2204 * @param cct CephContext
2205 * @param ts Timestamp for the stats being delta'ed
2206 * @param pg_pool_sum_old Map of pool stats for delta calcs.
2207 */
2208 void PGMap::update_pool_deltas(
2209 CephContext *cct, const utime_t ts,
2210 const mempool::pgmap::unordered_map<int32_t,pool_stat_t>& pg_pool_sum_old)
2211 {
2212 for (auto it = pg_pool_sum_old.begin();
2213 it != pg_pool_sum_old.end(); ++it) {
2214 update_one_pool_delta(cct, ts, it->first, it->second);
2215 }
2216 }
2217
2218 void PGMap::clear_delta()
2219 {
2220 pg_sum_delta = pool_stat_t();
2221 pg_sum_deltas.clear();
2222 stamp_delta = utime_t();
2223 }
2224
2225 void PGMap::generate_test_instances(list<PGMap*>& o)
2226 {
2227 o.push_back(new PGMap);
2228 list<Incremental*> inc;
2229 Incremental::generate_test_instances(inc);
2230 delete inc.front();
2231 inc.pop_front();
2232 while (!inc.empty()) {
2233 PGMap *pmp = new PGMap();
2234 *pmp = *o.back();
2235 o.push_back(pmp);
2236 o.back()->apply_incremental(NULL, *inc.front());
2237 delete inc.front();
2238 inc.pop_front();
2239 }
2240 }
2241
2242 void PGMap::get_filtered_pg_stats(uint64_t state, int64_t poolid, int64_t osdid,
2243 bool primary, set<pg_t>& pgs) const
2244 {
2245 for (auto i = pg_stat.begin();
2246 i != pg_stat.end();
2247 ++i) {
2248 if ((poolid >= 0) && (poolid != i->first.pool()))
2249 continue;
2250 if ((osdid >= 0) && !(i->second.is_acting_osd(osdid,primary)))
2251 continue;
2252 if (state == (uint64_t)-1 || // "all"
2253 (i->second.state & state) || // matches a state bit
2254 (state == 0 && i->second.state == 0)) { // matches "unknown" (== 0)
2255 pgs.insert(i->first);
2256 }
2257 }
2258 }
2259
2260 void PGMap::dump_filtered_pg_stats(ceph::Formatter *f, set<pg_t>& pgs) const
2261 {
2262 f->open_array_section("pg_stats");
2263 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
2264 const pg_stat_t& st = pg_stat.at(*i);
2265 f->open_object_section("pg_stat");
2266 f->dump_stream("pgid") << *i;
2267 st.dump(f);
2268 f->close_section();
2269 }
2270 f->close_section();
2271 }
2272
2273 void PGMap::dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const
2274 {
2275 TextTable tab;
2276 utime_t now = ceph_clock_now();
2277
2278 tab.define_column("PG", TextTable::LEFT, TextTable::LEFT);
2279 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
2280 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
2281 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
2282 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
2283 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
2284 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
2285 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
2286 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
2287 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
2288 tab.define_column("SINCE", TextTable::LEFT, TextTable::RIGHT);
2289 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
2290 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
2291 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
2292 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
2293 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
2294 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
2295
2296 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
2297 const pg_stat_t& st = pg_stat.at(*i);
2298
2299 ostringstream reported;
2300 reported << st.reported_epoch << ":" << st.reported_seq;
2301
2302 ostringstream upstr, actingstr;
2303 upstr << pg_vector_string(st.up) << 'p' << st.up_primary;
2304 actingstr << pg_vector_string(st.acting) << 'p' << st.acting_primary;
2305 tab << *i
2306 << st.stats.sum.num_objects
2307 << st.stats.sum.num_objects_degraded
2308 << st.stats.sum.num_objects_misplaced
2309 << st.stats.sum.num_objects_unfound
2310 << st.stats.sum.num_bytes
2311 << st.stats.sum.num_omap_bytes
2312 << st.stats.sum.num_omap_keys
2313 << st.log_size
2314 << pg_state_string(st.state)
2315 << utimespan_str(now - st.last_change)
2316 << st.version
2317 << reported.str()
2318 << upstr.str()
2319 << actingstr.str()
2320 << st.last_scrub_stamp
2321 << st.last_deep_scrub_stamp
2322 << TextTable::endrow;
2323 }
2324
2325 ss << tab;
2326 }
2327
2328 void PGMap::dump_pool_stats_and_io_rate(int64_t poolid, const OSDMap &osd_map,
2329 ceph::Formatter *f,
2330 stringstream *rs) const {
2331 string pool_name = osd_map.get_pool_name(poolid);
2332 if (f) {
2333 f->open_object_section("pool");
2334 f->dump_string("pool_name", pool_name.c_str());
2335 f->dump_int("pool_id", poolid);
2336 f->open_object_section("recovery");
2337 }
2338 list<string> sl;
2339 stringstream tss;
2340 pool_recovery_summary(f, &sl, poolid);
2341 if (!f && !sl.empty()) {
2342 for (auto &p : sl)
2343 tss << " " << p << "\n";
2344 }
2345 if (f) {
2346 f->close_section(); // object section recovery
2347 f->open_object_section("recovery_rate");
2348 }
2349 ostringstream rss;
2350 pool_recovery_rate_summary(f, &rss, poolid);
2351 if (!f && !rss.str().empty())
2352 tss << " recovery io " << rss.str() << "\n";
2353 if (f) {
2354 f->close_section(); // object section recovery_rate
2355 f->open_object_section("client_io_rate");
2356 }
2357 rss.clear();
2358 rss.str("");
2359 pool_client_io_rate_summary(f, &rss, poolid);
2360 if (!f && !rss.str().empty())
2361 tss << " client io " << rss.str() << "\n";
2362 // dump cache tier IO rate for cache pool
2363 const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
2364 if (pool->is_tier()) {
2365 if (f) {
2366 f->close_section(); // object section client_io_rate
2367 f->open_object_section("cache_io_rate");
2368 }
2369 rss.clear();
2370 rss.str("");
2371 pool_cache_io_rate_summary(f, &rss, poolid);
2372 if (!f && !rss.str().empty())
2373 tss << " cache tier io " << rss.str() << "\n";
2374 }
2375 if (f) {
2376 f->close_section(); // object section cache_io_rate
2377 f->close_section(); // object section pool
2378 } else {
2379 *rs << "pool " << pool_name << " id " << poolid << "\n";
2380 if (!tss.str().empty())
2381 *rs << tss.str() << "\n";
2382 else
2383 *rs << " nothing is going on\n\n";
2384 }
2385 }
2386
2387 // Get crush parentage for an osd (skip root)
2388 set<std::string> PGMap::osd_parentage(const OSDMap& osdmap, int id) const
2389 {
2390 set<std::string> reporters_by_subtree;
2391 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
2392
2393 auto loc = osdmap.crush->get_full_location(id);
2394 for (auto& [parent_bucket_type, parent_id] : loc) {
2395 // Should we show the root? Might not be too informative like "default"
2396 if (parent_bucket_type != "root" &&
2397 parent_bucket_type != reporter_subtree_level) {
2398 reporters_by_subtree.insert(parent_id);
2399 }
2400 }
2401 return reporters_by_subtree;
2402 }
2403
2404 void PGMap::get_health_checks(
2405 CephContext *cct,
2406 const OSDMap& osdmap,
2407 health_check_map_t *checks) const
2408 {
2409 utime_t now = ceph_clock_now();
2410 const auto max = cct->_conf.get_val<uint64_t>("mon_health_max_detail");
2411 const auto& pools = osdmap.get_pools();
2412
2413 typedef enum pg_consequence_t {
2414 UNAVAILABLE = 1, // Client IO to the pool may block
2415 DEGRADED = 2, // Fewer than the requested number of replicas are present
2416 BACKFILL_FULL = 3, // Backfill is blocked for space considerations
2417 // This may or may not be a deadlock condition.
2418 DAMAGED = 4, // The data may be missing or inconsistent on disk and
2419 // requires repair
2420 RECOVERY_FULL = 5 // Recovery is blocked because OSDs are full
2421 } pg_consequence_t;
2422
2423 // For a given PG state, how should it be reported at the pool level?
2424 class PgStateResponse {
2425 public:
2426 pg_consequence_t consequence;
2427 typedef std::function< utime_t(const pg_stat_t&) > stuck_cb;
2428 stuck_cb stuck_since;
2429 bool invert;
2430
2431 PgStateResponse(const pg_consequence_t& c, stuck_cb&& s)
2432 : consequence(c), stuck_since(std::move(s)), invert(false)
2433 {
2434 }
2435
2436 PgStateResponse(const pg_consequence_t& c, stuck_cb&& s, bool i)
2437 : consequence(c), stuck_since(std::move(s)), invert(i)
2438 {
2439 }
2440 };
2441
2442 // Record the PG state counts that contributed to a reported pool state
2443 class PgCauses {
2444 public:
2445 // Map of PG_STATE_* to number of pgs in that state.
2446 std::map<unsigned, unsigned> states;
2447
2448 // List of all PG IDs that had a state contributing
2449 // to this health condition.
2450 std::set<pg_t> pgs;
2451
2452 std::map<pg_t, std::string> pg_messages;
2453 };
2454
2455 // Map of PG state to how to respond to it
2456 std::map<unsigned, PgStateResponse> state_to_response = {
2457 // Immediate reports
2458 { PG_STATE_INCONSISTENT, {DAMAGED, {}} },
2459 { PG_STATE_INCOMPLETE, {UNAVAILABLE, {}} },
2460 { PG_STATE_SNAPTRIM_ERROR, {DAMAGED, {}} },
2461 { PG_STATE_RECOVERY_UNFOUND, {DAMAGED, {}} },
2462 { PG_STATE_BACKFILL_UNFOUND, {DAMAGED, {}} },
2463 { PG_STATE_BACKFILL_TOOFULL, {BACKFILL_FULL, {}} },
2464 { PG_STATE_RECOVERY_TOOFULL, {RECOVERY_FULL, {}} },
2465 { PG_STATE_DEGRADED, {DEGRADED, {}} },
2466 { PG_STATE_DOWN, {UNAVAILABLE, {}} },
2467 // Delayed (wait until stuck) reports
2468 { PG_STATE_PEERING, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_peered;} } },
2469 { PG_STATE_UNDERSIZED, {DEGRADED, [](const pg_stat_t &p){return p.last_fullsized;} } },
2470 { PG_STATE_STALE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_unstale;} } },
2471 // Delayed and inverted reports
2472 { PG_STATE_ACTIVE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_active;}, true} }
2473 };
2474
2475 // Specialized state printer that takes account of inversion of
2476 // ACTIVE, CLEAN checks.
2477 auto state_name = [](const uint64_t &state) {
2478 // Special cases for the states that are inverted checks
2479 if (state == PG_STATE_CLEAN) {
2480 return std::string("unclean");
2481 } else if (state == PG_STATE_ACTIVE) {
2482 return std::string("inactive");
2483 } else {
2484 return pg_state_string(state);
2485 }
2486 };
2487
2488 // Map of what is wrong to information about why, implicitly also stores
2489 // the list of what is wrong.
2490 std::map<pg_consequence_t, PgCauses> detected;
2491
2492 // Optimisation: trim down the number of checks to apply based on
2493 // the summary counters
2494 std::map<unsigned, PgStateResponse> possible_responses;
2495 for (const auto &i : num_pg_by_state) {
2496 for (const auto &j : state_to_response) {
2497 if (!j.second.invert) {
2498 // Check for normal tests by seeing if any pgs have the flag
2499 if (i.first & j.first) {
2500 possible_responses.insert(j);
2501 }
2502 }
2503 }
2504 }
2505
2506 for (const auto &j : state_to_response) {
2507 if (j.second.invert) {
2508 // Check for inverted tests by seeing if not-all pgs have the flag
2509 const auto &found = num_pg_by_state.find(j.first);
2510 if (found == num_pg_by_state.end() || found->second != num_pg) {
2511 possible_responses.insert(j);
2512 }
2513 }
2514 }
2515
2516 utime_t cutoff = now - utime_t(cct->_conf.get_val<int64_t>("mon_pg_stuck_threshold"), 0);
2517 // Loop over all PGs, if there are any possibly-unhealthy states in there
2518 if (!possible_responses.empty()) {
2519 for (const auto& i : pg_stat) {
2520 const auto &pg_id = i.first;
2521 const auto &pg_info = i.second;
2522
2523 for (const auto &j : state_to_response) {
2524 const auto &pg_response_state = j.first;
2525 const auto &pg_response = j.second;
2526
2527 // Apply the state test
2528 if (!(bool(pg_info.state & pg_response_state) != pg_response.invert)) {
2529 continue;
2530 }
2531
2532 // Apply stuckness test if needed
2533 if (pg_response.stuck_since) {
2534 // Delayed response, check for stuckness
2535 utime_t last_whatever = pg_response.stuck_since(pg_info);
2536 if (last_whatever.is_zero() &&
2537 pg_info.last_change >= cutoff) {
2538 // still moving, ignore
2539 continue;
2540 } else if (last_whatever >= cutoff) {
2541 // Not stuck enough, ignore.
2542 continue;
2543 } else {
2544
2545 }
2546 }
2547
2548 auto &causes = detected[pg_response.consequence];
2549 causes.states[pg_response_state]++;
2550 causes.pgs.insert(pg_id);
2551
2552 // Don't bother composing detail string if we have already recorded
2553 // too many
2554 if (causes.pg_messages.size() > max) {
2555 continue;
2556 }
2557
2558 std::ostringstream ss;
2559 if (pg_response.stuck_since) {
2560 utime_t since = pg_response.stuck_since(pg_info);
2561 ss << "pg " << pg_id << " is stuck " << state_name(pg_response_state);
2562 if (since == utime_t()) {
2563 ss << " since forever";
2564 } else {
2565 utime_t dur = now - since;
2566 ss << " for " << utimespan_str(dur);
2567 }
2568 ss << ", current state " << pg_state_string(pg_info.state)
2569 << ", last acting " << pg_info.acting;
2570 } else {
2571 ss << "pg " << pg_id << " is "
2572 << pg_state_string(pg_info.state);
2573 ss << ", acting " << pg_info.acting;
2574 if (pg_info.stats.sum.num_objects_unfound) {
2575 ss << ", " << pg_info.stats.sum.num_objects_unfound
2576 << " unfound";
2577 }
2578 }
2579
2580 if (pg_info.state & PG_STATE_INCOMPLETE) {
2581 const pg_pool_t *pi = osdmap.get_pg_pool(pg_id.pool());
2582 if (pi && pi->min_size > 1) {
2583 ss << " (reducing pool "
2584 << osdmap.get_pool_name(pg_id.pool())
2585 << " min_size from " << (int)pi->min_size
2586 << " may help; search ceph.com/docs for 'incomplete')";
2587 }
2588 }
2589
2590 causes.pg_messages[pg_id] = ss.str();
2591 }
2592 }
2593 } else {
2594 dout(10) << __func__ << " skipping loop over PGs: counters look OK" << dendl;
2595 }
2596
2597 for (const auto &i : detected) {
2598 std::string health_code;
2599 health_status_t sev;
2600 std::string summary;
2601 switch(i.first) {
2602 case UNAVAILABLE:
2603 health_code = "PG_AVAILABILITY";
2604 sev = HEALTH_WARN;
2605 summary = "Reduced data availability: ";
2606 break;
2607 case DEGRADED:
2608 health_code = "PG_DEGRADED";
2609 summary = "Degraded data redundancy: ";
2610 sev = HEALTH_WARN;
2611 break;
2612 case BACKFILL_FULL:
2613 health_code = "PG_BACKFILL_FULL";
2614 summary = "Low space hindering backfill (add storage if this doesn't resolve itself): ";
2615 sev = HEALTH_WARN;
2616 break;
2617 case DAMAGED:
2618 health_code = "PG_DAMAGED";
2619 summary = "Possible data damage: ";
2620 sev = HEALTH_ERR;
2621 break;
2622 case RECOVERY_FULL:
2623 health_code = "PG_RECOVERY_FULL";
2624 summary = "Full OSDs blocking recovery: ";
2625 sev = HEALTH_ERR;
2626 break;
2627 default:
2628 ceph_abort();
2629 }
2630
2631 if (i.first == DEGRADED) {
2632 if (pg_sum.stats.sum.num_objects_degraded &&
2633 pg_sum.stats.sum.num_object_copies > 0) {
2634 double pc = (double)pg_sum.stats.sum.num_objects_degraded /
2635 (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
2636 char b[20];
2637 snprintf(b, sizeof(b), "%.3lf", pc);
2638 ostringstream ss;
2639 ss << pg_sum.stats.sum.num_objects_degraded
2640 << "/" << pg_sum.stats.sum.num_object_copies << " objects degraded ("
2641 << b << "%)";
2642
2643 // Throw in a comma for the benefit of the following PG counts
2644 summary += ss.str() + ", ";
2645 }
2646 }
2647
2648 // Compose summary message saying how many PGs in what states led
2649 // to this health check failing
2650 std::vector<std::string> pg_msgs;
2651 int64_t count = 0;
2652 for (const auto &j : i.second.states) {
2653 std::ostringstream msg;
2654 msg << j.second << (j.second > 1 ? " pgs " : " pg ") << state_name(j.first);
2655 pg_msgs.push_back(msg.str());
2656 count += j.second;
2657 }
2658 summary += joinify(pg_msgs.begin(), pg_msgs.end(), std::string(", "));
2659
2660 health_check_t *check = &checks->add(
2661 health_code,
2662 sev,
2663 summary,
2664 count);
2665
2666 // Compose list of PGs contributing to this health check failing
2667 for (const auto &j : i.second.pg_messages) {
2668 check->detail.push_back(j.second);
2669 }
2670 }
2671
2672 // OSD_SCRUB_ERRORS
2673 if (pg_sum.stats.sum.num_scrub_errors) {
2674 ostringstream ss;
2675 ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors";
2676 checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str(),
2677 pg_sum.stats.sum.num_scrub_errors);
2678 }
2679
2680 // LARGE_OMAP_OBJECTS
2681 if (pg_sum.stats.sum.num_large_omap_objects) {
2682 list<string> detail;
2683 for (auto &pool : pools) {
2684 const string& pool_name = osdmap.get_pool_name(pool.first);
2685 auto it2 = pg_pool_sum.find(pool.first);
2686 if (it2 == pg_pool_sum.end()) {
2687 continue;
2688 }
2689 const pool_stat_t *pstat = &it2->second;
2690 if (pstat == nullptr) {
2691 continue;
2692 }
2693 const object_stat_sum_t& sum = pstat->stats.sum;
2694 if (sum.num_large_omap_objects) {
2695 stringstream ss;
2696 ss << sum.num_large_omap_objects << " large objects found in pool "
2697 << "'" << pool_name << "'";
2698 detail.push_back(ss.str());
2699 }
2700 }
2701 if (!detail.empty()) {
2702 ostringstream ss;
2703 ss << pg_sum.stats.sum.num_large_omap_objects << " large omap objects";
2704 auto& d = checks->add("LARGE_OMAP_OBJECTS", HEALTH_WARN, ss.str(),
2705 pg_sum.stats.sum.num_large_omap_objects);
2706 stringstream tip;
2707 tip << "Search the cluster log for 'Large omap object found' for more "
2708 << "details.";
2709 detail.push_back(tip.str());
2710 d.detail.swap(detail);
2711 }
2712 }
2713
2714 // CACHE_POOL_NEAR_FULL
2715 {
2716 list<string> detail;
2717 unsigned num_pools = 0;
2718 for (auto& p : pools) {
2719 if ((!p.second.target_max_objects && !p.second.target_max_bytes) ||
2720 !pg_pool_sum.count(p.first)) {
2721 continue;
2722 }
2723 bool nearfull = false;
2724 const string& name = osdmap.get_pool_name(p.first);
2725 const pool_stat_t& st = get_pg_pool_sum_stat(p.first);
2726 uint64_t ratio = p.second.cache_target_full_ratio_micro +
2727 ((1000000 - p.second.cache_target_full_ratio_micro) *
2728 cct->_conf->mon_cache_target_full_warn_ratio);
2729 if (p.second.target_max_objects &&
2730 (uint64_t)(st.stats.sum.num_objects -
2731 st.stats.sum.num_objects_hit_set_archive) >
2732 p.second.target_max_objects * (ratio / 1000000.0)) {
2733 ostringstream ss;
2734 ss << "cache pool '" << name << "' with "
2735 << si_u_t(st.stats.sum.num_objects)
2736 << " objects at/near target max "
2737 << si_u_t(p.second.target_max_objects) << " objects";
2738 detail.push_back(ss.str());
2739 nearfull = true;
2740 }
2741 if (p.second.target_max_bytes &&
2742 (uint64_t)(st.stats.sum.num_bytes -
2743 st.stats.sum.num_bytes_hit_set_archive) >
2744 p.second.target_max_bytes * (ratio / 1000000.0)) {
2745 ostringstream ss;
2746 ss << "cache pool '" << name
2747 << "' with " << byte_u_t(st.stats.sum.num_bytes)
2748 << " at/near target max "
2749 << byte_u_t(p.second.target_max_bytes);
2750 detail.push_back(ss.str());
2751 nearfull = true;
2752 }
2753 if (nearfull) {
2754 ++num_pools;
2755 }
2756 }
2757 if (!detail.empty()) {
2758 ostringstream ss;
2759 ss << num_pools << " cache pools at or near target size";
2760 auto& d = checks->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN, ss.str(),
2761 num_pools);
2762 d.detail.swap(detail);
2763 }
2764 }
2765
2766 // TOO_FEW_PGS
2767 unsigned num_in = osdmap.get_num_in_osds();
2768 auto sum_pg_up = std::max(static_cast<size_t>(pg_sum.up), pg_stat.size());
2769 const auto min_pg_per_osd =
2770 cct->_conf.get_val<uint64_t>("mon_pg_warn_min_per_osd");
2771 if (num_in && min_pg_per_osd > 0 && osdmap.get_pools().size() > 0) {
2772 auto per = sum_pg_up / num_in;
2773 if (per < min_pg_per_osd && per) {
2774 ostringstream ss;
2775 ss << "too few PGs per OSD (" << per
2776 << " < min " << min_pg_per_osd << ")";
2777 checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str(),
2778 min_pg_per_osd - per);
2779 }
2780 }
2781
2782 // TOO_MANY_PGS
2783 auto max_pg_per_osd = cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd");
2784 if (num_in && max_pg_per_osd > 0) {
2785 auto per = sum_pg_up / num_in;
2786 if (per > max_pg_per_osd) {
2787 ostringstream ss;
2788 ss << "too many PGs per OSD (" << per
2789 << " > max " << max_pg_per_osd << ")";
2790 checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str(),
2791 per - max_pg_per_osd);
2792 }
2793 }
2794
2795 // TOO_FEW_OSDS
2796 auto warn_too_few_osds = cct->_conf.get_val<bool>("mon_warn_on_too_few_osds");
2797 auto osd_pool_default_size = cct->_conf.get_val<uint64_t>("osd_pool_default_size");
2798 if (warn_too_few_osds && osdmap.get_num_osds() < osd_pool_default_size) {
2799 ostringstream ss;
2800 ss << "OSD count " << osdmap.get_num_osds()
2801 << " < osd_pool_default_size " << osd_pool_default_size;
2802 checks->add("TOO_FEW_OSDS", HEALTH_WARN, ss.str(),
2803 osd_pool_default_size - osdmap.get_num_osds());
2804 }
2805
2806 // SLOW_PING_TIME
2807 // Convert milliseconds to microseconds
2808 auto warn_slow_ping_time = cct->_conf.get_val<double>("mon_warn_on_slow_ping_time") * 1000;
2809 auto grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
2810 if (warn_slow_ping_time == 0) {
2811 double ratio = cct->_conf.get_val<double>("mon_warn_on_slow_ping_ratio");
2812 warn_slow_ping_time = grace;
2813 warn_slow_ping_time *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
2814 }
2815 if (warn_slow_ping_time > 0) {
2816
2817 struct mon_ping_item_t {
2818 uint32_t pingtime;
2819 int from;
2820 int to;
2821 bool improving;
2822
2823 bool operator<(const mon_ping_item_t& rhs) const {
2824 if (pingtime < rhs.pingtime)
2825 return true;
2826 if (pingtime > rhs.pingtime)
2827 return false;
2828 if (from < rhs.from)
2829 return true;
2830 if (from > rhs.from)
2831 return false;
2832 return to < rhs.to;
2833 }
2834 };
2835
2836 list<string> detail_back;
2837 list<string> detail_front;
2838 list<string> detail;
2839 set<mon_ping_item_t> back_sorted, front_sorted;
2840 for (auto i : osd_stat) {
2841 for (auto j : i.second.hb_pingtime) {
2842
2843 // Maybe source info is old
2844 if (now.sec() - j.second.last_update > grace * 60)
2845 continue;
2846
2847 mon_ping_item_t back;
2848 back.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
2849 back.pingtime = std::max(back.pingtime, j.second.back_pingtime[2]);
2850 back.from = i.first;
2851 back.to = j.first;
2852 if (back.pingtime > warn_slow_ping_time) {
2853 back.improving = (j.second.back_pingtime[0] < j.second.back_pingtime[1]
2854 && j.second.back_pingtime[1] < j.second.back_pingtime[2]);
2855 back_sorted.emplace(back);
2856 }
2857
2858 mon_ping_item_t front;
2859 front.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
2860 front.pingtime = std::max(front.pingtime, j.second.front_pingtime[2]);
2861 front.from = i.first;
2862 front.to = j.first;
2863 if (front.pingtime > warn_slow_ping_time) {
2864 front.improving = (j.second.front_pingtime[0] < j.second.front_pingtime[1]
2865 && j.second.front_pingtime[1] < j.second.back_pingtime[2]);
2866 front_sorted.emplace(front);
2867 }
2868 }
2869 if (i.second.num_shards_repaired >
2870 cct->_conf.get_val<uint64_t>("mon_osd_warn_num_repaired")) {
2871 ostringstream ss;
2872 ss << "osd." << i.first << " had " << i.second.num_shards_repaired << " reads repaired";
2873 detail.push_back(ss.str());
2874 }
2875 }
2876 if (!detail.empty()) {
2877 ostringstream ss;
2878 ss << "Too many repaired reads on " << detail.size() << " OSDs";
2879 auto& d = checks->add("OSD_TOO_MANY_REPAIRS", HEALTH_WARN, ss.str(),
2880 detail.size());
2881 d.detail.swap(detail);
2882 }
2883 int max_detail = 10;
2884 for (auto &sback : boost::adaptors::reverse(back_sorted)) {
2885 ostringstream ss;
2886 if (max_detail == 0) {
2887 ss << "Truncated long network list. Use ceph daemon mgr.# dump_osd_network for more information";
2888 detail_back.push_back(ss.str());
2889 break;
2890 }
2891 max_detail--;
2892 ss << "Slow OSD heartbeats on back from osd." << sback.from
2893 << " [" << osd_parentage(osdmap, sback.from) << "]"
2894 << (osdmap.is_down(sback.from) ? " (down)" : "")
2895 << " to osd." << sback.to
2896 << " [" << osd_parentage(osdmap, sback.to) << "]"
2897 << (osdmap.is_down(sback.to) ? " (down)" : "")
2898 << " " << fixed_u_to_string(sback.pingtime, 3) << " msec"
2899 << (sback.improving ? " possibly improving" : "");
2900 detail_back.push_back(ss.str());
2901 }
2902 max_detail = 10;
2903 for (auto &sfront : boost::adaptors::reverse(front_sorted)) {
2904 ostringstream ss;
2905 if (max_detail == 0) {
2906 ss << "Truncated long network list. Use ceph daemon mgr.# dump_osd_network for more information";
2907 detail_front.push_back(ss.str());
2908 break;
2909 }
2910 max_detail--;
2911 // Get crush parentage for each osd
2912 ss << "Slow OSD heartbeats on front from osd." << sfront.from
2913 << " [" << osd_parentage(osdmap, sfront.from) << "]"
2914 << (osdmap.is_down(sfront.from) ? " (down)" : "")
2915 << " to osd." << sfront.to
2916 << " [" << osd_parentage(osdmap, sfront.to) << "]"
2917 << (osdmap.is_down(sfront.to) ? " (down)" : "")
2918 << " " << fixed_u_to_string(sfront.pingtime, 3) << " msec"
2919 << (sfront.improving ? " possibly improving" : "");
2920 detail_front.push_back(ss.str());
2921 }
2922 if (detail_back.size() != 0) {
2923 ostringstream ss;
2924 ss << "Slow OSD heartbeats on back (longest "
2925 << fixed_u_to_string(back_sorted.rbegin()->pingtime, 3) << "ms)";
2926 auto& d = checks->add("OSD_SLOW_PING_TIME_BACK", HEALTH_WARN, ss.str(),
2927 back_sorted.size());
2928 d.detail.swap(detail_back);
2929 }
2930 if (detail_front.size() != 0) {
2931 ostringstream ss;
2932 ss << "Slow OSD heartbeats on front (longest "
2933 << fixed_u_to_string(front_sorted.rbegin()->pingtime, 3) << "ms)";
2934 auto& d = checks->add("OSD_SLOW_PING_TIME_FRONT", HEALTH_WARN, ss.str(),
2935 front_sorted.size());
2936 d.detail.swap(detail_front);
2937 }
2938 }
2939
2940 // SMALLER_PGP_NUM
2941 // MANY_OBJECTS_PER_PG
2942 if (!pg_stat.empty()) {
2943 list<string> pgp_detail, many_detail;
2944 const auto mon_pg_warn_min_objects =
2945 cct->_conf.get_val<int64_t>("mon_pg_warn_min_objects");
2946 const auto mon_pg_warn_min_pool_objects =
2947 cct->_conf.get_val<int64_t>("mon_pg_warn_min_pool_objects");
2948 const auto mon_pg_warn_max_object_skew =
2949 cct->_conf.get_val<double>("mon_pg_warn_max_object_skew");
2950 for (auto p = pg_pool_sum.begin();
2951 p != pg_pool_sum.end();
2952 ++p) {
2953 const pg_pool_t *pi = osdmap.get_pg_pool(p->first);
2954 if (!pi)
2955 continue; // in case osdmap changes haven't propagated to PGMap yet
2956 const string& name = osdmap.get_pool_name(p->first);
2957 // NOTE: we use pg_num_target and pgp_num_target for the purposes of
2958 // the warnings. If the cluster is failing to converge on the target
2959 // values that is a separate issue!
2960 if (pi->get_pg_num_target() > pi->get_pgp_num_target() &&
2961 !(name.find(".DELETED") != string::npos &&
2962 cct->_conf->mon_fake_pool_delete)) {
2963 ostringstream ss;
2964 ss << "pool " << name << " pg_num "
2965 << pi->get_pg_num_target()
2966 << " > pgp_num " << pi->get_pgp_num_target();
2967 pgp_detail.push_back(ss.str());
2968 }
2969 int average_objects_per_pg = pg_sum.stats.sum.num_objects / pg_stat.size();
2970 if (average_objects_per_pg > 0 &&
2971 pg_sum.stats.sum.num_objects >= mon_pg_warn_min_objects &&
2972 p->second.stats.sum.num_objects >= mon_pg_warn_min_pool_objects) {
2973 int objects_per_pg = p->second.stats.sum.num_objects /
2974 pi->get_pg_num_target();
2975 float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
2976 if (mon_pg_warn_max_object_skew > 0 &&
2977 ratio > mon_pg_warn_max_object_skew) {
2978 ostringstream ss;
2979 ss << "pool " << name << " objects per pg ("
2980 << objects_per_pg << ") is more than " << ratio
2981 << " times cluster average ("
2982 << average_objects_per_pg << ")";
2983 many_detail.push_back(ss.str());
2984 }
2985 }
2986 }
2987 if (!pgp_detail.empty()) {
2988 ostringstream ss;
2989 ss << pgp_detail.size() << " pools have pg_num > pgp_num";
2990 auto& d = checks->add("SMALLER_PGP_NUM", HEALTH_WARN, ss.str(),
2991 pgp_detail.size());
2992 d.detail.swap(pgp_detail);
2993 }
2994 if (!many_detail.empty()) {
2995 ostringstream ss;
2996 ss << many_detail.size() << " pools have many more objects per pg than"
2997 << " average";
2998 auto& d = checks->add("MANY_OBJECTS_PER_PG", HEALTH_WARN, ss.str(),
2999 many_detail.size());
3000 d.detail.swap(many_detail);
3001 }
3002 }
3003
3004 // POOL_FULL
3005 // POOL_NEAR_FULL
3006 {
3007 float warn_threshold = (float)g_conf().get_val<int64_t>("mon_pool_quota_warn_threshold")/100;
3008 float crit_threshold = (float)g_conf().get_val<int64_t>("mon_pool_quota_crit_threshold")/100;
3009 list<string> full_detail, nearfull_detail;
3010 unsigned full_pools = 0, nearfull_pools = 0;
3011 for (auto it : pools) {
3012 auto it2 = pg_pool_sum.find(it.first);
3013 if (it2 == pg_pool_sum.end()) {
3014 continue;
3015 }
3016 const pool_stat_t *pstat = &it2->second;
3017 const object_stat_sum_t& sum = pstat->stats.sum;
3018 const string& pool_name = osdmap.get_pool_name(it.first);
3019 const pg_pool_t &pool = it.second;
3020 bool full = false, nearfull = false;
3021 if (pool.quota_max_objects > 0) {
3022 stringstream ss;
3023 if ((uint64_t)sum.num_objects >= pool.quota_max_objects) {
3024 } else if (crit_threshold > 0 &&
3025 sum.num_objects >= pool.quota_max_objects*crit_threshold) {
3026 ss << "pool '" << pool_name
3027 << "' has " << sum.num_objects << " objects"
3028 << " (max " << pool.quota_max_objects << ")";
3029 full_detail.push_back(ss.str());
3030 full = true;
3031 } else if (warn_threshold > 0 &&
3032 sum.num_objects >= pool.quota_max_objects*warn_threshold) {
3033 ss << "pool '" << pool_name
3034 << "' has " << sum.num_objects << " objects"
3035 << " (max " << pool.quota_max_objects << ")";
3036 nearfull_detail.push_back(ss.str());
3037 nearfull = true;
3038 }
3039 }
3040 if (pool.quota_max_bytes > 0) {
3041 stringstream ss;
3042 if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
3043 } else if (crit_threshold > 0 &&
3044 sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
3045 ss << "pool '" << pool_name
3046 << "' has " << byte_u_t(sum.num_bytes)
3047 << " (max " << byte_u_t(pool.quota_max_bytes) << ")";
3048 full_detail.push_back(ss.str());
3049 full = true;
3050 } else if (warn_threshold > 0 &&
3051 sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
3052 ss << "pool '" << pool_name
3053 << "' has " << byte_u_t(sum.num_bytes)
3054 << " (max " << byte_u_t(pool.quota_max_bytes) << ")";
3055 nearfull_detail.push_back(ss.str());
3056 nearfull = true;
3057 }
3058 }
3059 if (full) {
3060 ++full_pools;
3061 }
3062 if (nearfull) {
3063 ++nearfull_pools;
3064 }
3065 }
3066 if (full_pools) {
3067 ostringstream ss;
3068 ss << full_pools << " pools full";
3069 auto& d = checks->add("POOL_FULL", HEALTH_ERR, ss.str(), full_pools);
3070 d.detail.swap(full_detail);
3071 }
3072 if (nearfull_pools) {
3073 ostringstream ss;
3074 ss << nearfull_pools << " pools nearfull";
3075 auto& d = checks->add("POOL_NEAR_FULL", HEALTH_WARN, ss.str(), nearfull_pools);
3076 d.detail.swap(nearfull_detail);
3077 }
3078 }
3079
3080 // OBJECT_MISPLACED
3081 if (pg_sum.stats.sum.num_objects_misplaced &&
3082 pg_sum.stats.sum.num_object_copies > 0 &&
3083 cct->_conf->mon_warn_on_misplaced) {
3084 double pc = (double)pg_sum.stats.sum.num_objects_misplaced /
3085 (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
3086 char b[20];
3087 snprintf(b, sizeof(b), "%.3lf", pc);
3088 ostringstream ss;
3089 ss << pg_sum.stats.sum.num_objects_misplaced
3090 << "/" << pg_sum.stats.sum.num_object_copies << " objects misplaced ("
3091 << b << "%)";
3092 checks->add("OBJECT_MISPLACED", HEALTH_WARN, ss.str(),
3093 pg_sum.stats.sum.num_objects_misplaced);
3094 }
3095
3096 // OBJECT_UNFOUND
3097 if (pg_sum.stats.sum.num_objects_unfound &&
3098 pg_sum.stats.sum.num_objects) {
3099 double pc = (double)pg_sum.stats.sum.num_objects_unfound /
3100 (double)pg_sum.stats.sum.num_objects * (double)100.0;
3101 char b[20];
3102 snprintf(b, sizeof(b), "%.3lf", pc);
3103 ostringstream ss;
3104 ss << pg_sum.stats.sum.num_objects_unfound
3105 << "/" << pg_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
3106 auto& d = checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str(),
3107 pg_sum.stats.sum.num_objects_unfound);
3108
3109 for (auto& p : pg_stat) {
3110 if (p.second.stats.sum.num_objects_unfound) {
3111 ostringstream ss;
3112 ss << "pg " << p.first
3113 << " has " << p.second.stats.sum.num_objects_unfound
3114 << " unfound objects";
3115 d.detail.push_back(ss.str());
3116 if (d.detail.size() > max) {
3117 d.detail.push_back("(additional pgs left out for brevity)");
3118 break;
3119 }
3120 }
3121 }
3122 }
3123
3124 // REQUEST_SLOW
3125 // REQUEST_STUCK
3126 // SLOW_OPS unifies them in mimic.
3127 if (osdmap.require_osd_release < ceph_release_t::mimic &&
3128 cct->_conf->mon_osd_warn_op_age > 0 &&
3129 !osd_sum.op_queue_age_hist.h.empty() &&
3130 osd_sum.op_queue_age_hist.upper_bound() / 1000.0 >
3131 cct->_conf->mon_osd_warn_op_age) {
3132 list<string> warn_detail, error_detail;
3133 unsigned warn = 0, error = 0;
3134 float err_age =
3135 cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio;
3136 const pow2_hist_t& h = osd_sum.op_queue_age_hist;
3137 for (unsigned i = h.h.size() - 1; i > 0; --i) {
3138 float ub = (float)(1 << i) / 1000.0;
3139 if (ub < cct->_conf->mon_osd_warn_op_age)
3140 break;
3141 if (h.h[i]) {
3142 ostringstream ss;
3143 ss << h.h[i] << " ops are blocked > " << ub << " sec";
3144 if (ub > err_age) {
3145 error += h.h[i];
3146 error_detail.push_back(ss.str());
3147 } else {
3148 warn += h.h[i];
3149 warn_detail.push_back(ss.str());
3150 }
3151 }
3152 }
3153
3154 map<float,set<int>> warn_osd_by_max; // max -> osds
3155 map<float,set<int>> error_osd_by_max; // max -> osds
3156 if (!warn_detail.empty() || !error_detail.empty()) {
3157 for (auto& p : osd_stat) {
3158 const pow2_hist_t& h = p.second.op_queue_age_hist;
3159 for (unsigned i = h.h.size() - 1; i > 0; --i) {
3160 float ub = (float)(1 << i) / 1000.0;
3161 if (ub < cct->_conf->mon_osd_warn_op_age)
3162 break;
3163 if (h.h[i]) {
3164 if (ub > err_age) {
3165 error_osd_by_max[ub].insert(p.first);
3166 } else {
3167 warn_osd_by_max[ub].insert(p.first);
3168 }
3169 break;
3170 }
3171 }
3172 }
3173 }
3174
3175 if (!warn_detail.empty()) {
3176 ostringstream ss;
3177 ss << warn << " slow requests are blocked > "
3178 << cct->_conf->mon_osd_warn_op_age << " sec";
3179 auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str(), warn);
3180 d.detail.swap(warn_detail);
3181 int left = max;
3182 for (auto& p : warn_osd_by_max) {
3183 ostringstream ss;
3184 if (p.second.size() > 1) {
3185 ss << "osds " << p.second
3186 << " have blocked requests > " << p.first << " sec";
3187 } else {
3188 ss << "osd." << *p.second.begin()
3189 << " has blocked requests > " << p.first << " sec";
3190 }
3191 d.detail.push_back(ss.str());
3192 if (--left == 0) {
3193 break;
3194 }
3195 }
3196 }
3197 if (!error_detail.empty()) {
3198 ostringstream ss;
3199 ss << error << " stuck requests are blocked > "
3200 << err_age << " sec";
3201 auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str(), error);
3202 d.detail.swap(error_detail);
3203 int left = max;
3204 for (auto& p : error_osd_by_max) {
3205 ostringstream ss;
3206 if (p.second.size() > 1) {
3207 ss << "osds " << p.second
3208 << " have stuck requests > " << p.first << " sec";
3209 } else {
3210 ss << "osd." << *p.second.begin()
3211 << " has stuck requests > " << p.first << " sec";
3212 }
3213 d.detail.push_back(ss.str());
3214 if (--left == 0) {
3215 break;
3216 }
3217 }
3218 }
3219 }
3220
3221 // OBJECT_STORE_WARN
3222 if (osd_sum.os_alerts.size()) {
3223 map<string, pair<size_t, list<string>>> os_alerts_sum;
3224
3225 for (auto& a : osd_sum.os_alerts) {
3226 int left = max;
3227 string s0 = " osd.";
3228 s0 += stringify(a.first);
3229 for (auto& aa : a.second) {
3230 string s(s0);
3231 s += " ";
3232 s += aa.second;
3233 auto it = os_alerts_sum.find(aa.first);
3234 if (it == os_alerts_sum.end()) {
3235 list<string> d;
3236 d.emplace_back(s);
3237 os_alerts_sum.emplace(aa.first, std::make_pair(1, d));
3238 } else {
3239 auto& p = it->second;
3240 ++p.first;
3241 p.second.emplace_back(s);
3242 }
3243 if (--left == 0) {
3244 break;
3245 }
3246 }
3247 }
3248
3249 for (auto& asum : os_alerts_sum) {
3250 string summary = stringify(asum.second.first) + " OSD(s)";
3251 if (asum.first == "BLUEFS_SPILLOVER") {
3252 summary += " experiencing BlueFS spillover";
3253 } else if (asum.first == "BLUESTORE_NO_COMPRESSION") {
3254 summary += " have broken BlueStore compression";
3255 } else if (asum.first == "BLUESTORE_LEGACY_STATFS") {
3256 summary += " reporting legacy (not per-pool) BlueStore stats";
3257 } else if (asum.first == "BLUESTORE_DISK_SIZE_MISMATCH") {
3258 summary += " have dangerous mismatch between BlueStore block device and free list sizes";
3259 } else if (asum.first == "BLUESTORE_NO_PER_POOL_OMAP") {
3260 summary += " reporting legacy (not per-pool) BlueStore omap usage stats";
3261 }
3262 auto& d = checks->add(asum.first, HEALTH_WARN, summary, asum.second.first);
3263 for (auto& s : asum.second.second) {
3264 d.detail.push_back(s);
3265 }
3266 }
3267 }
3268 // PG_NOT_SCRUBBED
3269 // PG_NOT_DEEP_SCRUBBED
3270 if (cct->_conf->mon_warn_pg_not_scrubbed_ratio ||
3271 cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
3272 list<string> detail, deep_detail;
3273 int detail_max = max, deep_detail_max = max;
3274 int detail_more = 0, deep_detail_more = 0;
3275 int detail_total = 0, deep_detail_total = 0;
3276 for (auto& p : pg_stat) {
3277 int64_t pnum = p.first.pool();
3278 auto pool = osdmap.get_pg_pool(pnum);
3279 if (!pool)
3280 continue;
3281 if (cct->_conf->mon_warn_pg_not_scrubbed_ratio) {
3282 double scrub_max_interval = 0;
3283 pool->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
3284 if (scrub_max_interval <= 0) {
3285 scrub_max_interval = cct->_conf->osd_scrub_max_interval;
3286 }
3287 const double age = (cct->_conf->mon_warn_pg_not_scrubbed_ratio * scrub_max_interval) +
3288 scrub_max_interval;
3289 utime_t cutoff = now;
3290 cutoff -= age;
3291 if (p.second.last_scrub_stamp < cutoff) {
3292 if (detail_max > 0) {
3293 ostringstream ss;
3294 ss << "pg " << p.first << " not scrubbed since "
3295 << p.second.last_scrub_stamp;
3296 detail.push_back(ss.str());
3297 --detail_max;
3298 } else {
3299 ++detail_more;
3300 }
3301 ++detail_total;
3302 }
3303 }
3304 if (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
3305 double deep_scrub_interval = 0;
3306 pool->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
3307 if (deep_scrub_interval <= 0) {
3308 deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
3309 }
3310 double deep_age = (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio * deep_scrub_interval) +
3311 deep_scrub_interval;
3312 utime_t deep_cutoff = now;
3313 deep_cutoff -= deep_age;
3314 if (p.second.last_deep_scrub_stamp < deep_cutoff) {
3315 if (deep_detail_max > 0) {
3316 ostringstream ss;
3317 ss << "pg " << p.first << " not deep-scrubbed since "
3318 << p.second.last_deep_scrub_stamp;
3319 deep_detail.push_back(ss.str());
3320 --deep_detail_max;
3321 } else {
3322 ++deep_detail_more;
3323 }
3324 ++deep_detail_total;
3325 }
3326 }
3327 }
3328 if (detail_total) {
3329 ostringstream ss;
3330 ss << detail_total << " pgs not scrubbed in time";
3331 auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str(), detail_total);
3332
3333 if (!detail.empty()) {
3334 d.detail.swap(detail);
3335
3336 if (detail_more) {
3337 ostringstream ss;
3338 ss << detail_more << " more pgs... ";
3339 d.detail.push_back(ss.str());
3340 }
3341 }
3342 }
3343 if (deep_detail_total) {
3344 ostringstream ss;
3345 ss << deep_detail_total << " pgs not deep-scrubbed in time";
3346 auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str(),
3347 deep_detail_total);
3348
3349 if (!deep_detail.empty()) {
3350 d.detail.swap(deep_detail);
3351
3352 if (deep_detail_more) {
3353 ostringstream ss;
3354 ss << deep_detail_more << " more pgs... ";
3355 d.detail.push_back(ss.str());
3356 }
3357 }
3358 }
3359 }
3360
3361 // POOL_APP
3362 if (g_conf().get_val<bool>("mon_warn_on_pool_no_app")) {
3363 list<string> detail;
3364 for (auto &it : pools) {
3365 const pg_pool_t &pool = it.second;
3366 const string& pool_name = osdmap.get_pool_name(it.first);
3367 auto it2 = pg_pool_sum.find(it.first);
3368 if (it2 == pg_pool_sum.end()) {
3369 continue;
3370 }
3371 const pool_stat_t *pstat = &it2->second;
3372 if (pstat == nullptr) {
3373 continue;
3374 }
3375 const object_stat_sum_t& sum = pstat->stats.sum;
3376 // application metadata is not encoded until luminous is minimum
3377 // required release
3378 if (sum.num_objects > 0 && pool.application_metadata.empty() &&
3379 !pool.is_tier()) {
3380 stringstream ss;
3381 ss << "application not enabled on pool '" << pool_name << "'";
3382 detail.push_back(ss.str());
3383 }
3384 }
3385 if (!detail.empty()) {
3386 ostringstream ss;
3387 ss << detail.size() << " pool(s) do not have an application enabled";
3388 auto& d = checks->add("POOL_APP_NOT_ENABLED", HEALTH_WARN, ss.str(),
3389 detail.size());
3390 stringstream tip;
3391 tip << "use 'ceph osd pool application enable <pool-name> "
3392 << "<app-name>', where <app-name> is 'cephfs', 'rbd', 'rgw', "
3393 << "or freeform for custom applications.";
3394 detail.push_back(tip.str());
3395 d.detail.swap(detail);
3396 }
3397 }
3398
3399 // PG_SLOW_SNAP_TRIMMING
3400 if (!pg_stat.empty() && cct->_conf->mon_osd_snap_trim_queue_warn_on > 0) {
3401 uint32_t snapthreshold = cct->_conf->mon_osd_snap_trim_queue_warn_on;
3402 uint64_t snaptrimq_exceeded = 0;
3403 uint32_t longest_queue = 0;
3404 const pg_t* longest_q_pg = nullptr;
3405 list<string> detail;
3406
3407 for (auto& i: pg_stat) {
3408 uint32_t current_len = i.second.snaptrimq_len;
3409 if (current_len >= snapthreshold) {
3410 snaptrimq_exceeded++;
3411 if (longest_queue <= current_len) {
3412 longest_q_pg = &i.first;
3413 longest_queue = current_len;
3414 }
3415 if (detail.size() < max - 1) {
3416 stringstream ss;
3417 ss << "snap trim queue for pg " << i.first << " at " << current_len;
3418 detail.push_back(ss.str());
3419 continue;
3420 }
3421 if (detail.size() < max) {
3422 detail.push_back("...more pgs affected");
3423 continue;
3424 }
3425 }
3426 }
3427
3428 if (snaptrimq_exceeded) {
3429 {
3430 ostringstream ss;
3431 ss << "longest queue on pg " << *longest_q_pg << " at " << longest_queue;
3432 detail.push_back(ss.str());
3433 }
3434
3435 stringstream ss;
3436 ss << "snap trim queue for " << snaptrimq_exceeded << " pg(s) >= " << snapthreshold << " (mon_osd_snap_trim_queue_warn_on)";
3437 auto& d = checks->add("PG_SLOW_SNAP_TRIMMING", HEALTH_WARN, ss.str(),
3438 snaptrimq_exceeded);
3439 detail.push_back("try decreasing \"osd snap trim sleep\" and/or increasing \"osd pg max concurrent snap trims\".");
3440 d.detail.swap(detail);
3441 }
3442 }
3443 }
3444
3445 void PGMap::print_summary(ceph::Formatter *f, ostream *out) const
3446 {
3447 if (f) {
3448 f->open_array_section("pgs_by_pool_state");
3449 for (auto& i: num_pg_by_pool_state) {
3450 f->open_object_section("per_pool_pgs_by_state");
3451 f->dump_int("pool_id", i.first);
3452 f->open_array_section("pg_state_counts");
3453 for (auto& j : i.second) {
3454 f->open_object_section("pg_state_count");
3455 f->dump_string("state_name", pg_state_string(j.first));
3456 f->dump_int("count", j.second);
3457 f->close_section();
3458 }
3459 f->close_section();
3460 f->close_section();
3461 }
3462 f->close_section();
3463 }
3464 PGMapDigest::print_summary(f, out);
3465 }
3466
3467 int process_pg_map_command(
3468 const string& orig_prefix,
3469 const cmdmap_t& orig_cmdmap,
3470 const PGMap& pg_map,
3471 const OSDMap& osdmap,
3472 ceph::Formatter *f,
3473 stringstream *ss,
3474 bufferlist *odata)
3475 {
3476 string prefix = orig_prefix;
3477 auto cmdmap = orig_cmdmap;
3478
3479 string omap_stats_note =
3480 "\n* NOTE: Omap statistics are gathered during deep scrub and "
3481 "may be inaccurate soon afterwards depending on utilization. See "
3482 "http://docs.ceph.com/docs/master/dev/placement-group/#omap-statistics "
3483 "for further details.\n";
3484 bool omap_stats_note_required = false;
3485
3486 // perhaps these would be better in the parsing, but it's weird
3487 bool primary = false;
3488 if (prefix == "pg dump_json") {
3489 vector<string> v;
3490 v.push_back(string("all"));
3491 cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
3492 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3493 prefix = "pg dump";
3494 } else if (prefix == "pg dump_pools_json") {
3495 vector<string> v;
3496 v.push_back(string("pools"));
3497 cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
3498 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3499 prefix = "pg dump";
3500 } else if (prefix == "pg ls-by-primary") {
3501 primary = true;
3502 prefix = "pg ls";
3503 } else if (prefix == "pg ls-by-osd") {
3504 prefix = "pg ls";
3505 } else if (prefix == "pg ls-by-pool") {
3506 prefix = "pg ls";
3507 string poolstr;
3508 cmd_getval(cmdmap, "poolstr", poolstr);
3509 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
3510 if (pool < 0) {
3511 *ss << "pool " << poolstr << " does not exist";
3512 return -ENOENT;
3513 }
3514 cmd_putval(g_ceph_context, cmdmap, "pool", pool);
3515 }
3516
3517 stringstream ds;
3518 if (prefix == "pg stat") {
3519 if (f) {
3520 f->open_object_section("pg_summary");
3521 pg_map.print_oneline_summary(f, NULL);
3522 f->close_section();
3523 f->flush(ds);
3524 } else {
3525 ds << pg_map;
3526 }
3527 odata->append(ds);
3528 return 0;
3529 }
3530
3531 if (prefix == "pg getmap") {
3532 pg_map.encode(*odata);
3533 *ss << "got pgmap version " << pg_map.version;
3534 return 0;
3535 }
3536
3537 if (prefix == "pg dump") {
3538 string val;
3539 vector<string> dumpcontents;
3540 set<string> what;
3541 if (cmd_getval(cmdmap, "dumpcontents", dumpcontents)) {
3542 copy(dumpcontents.begin(), dumpcontents.end(),
3543 inserter(what, what.end()));
3544 }
3545 if (what.empty())
3546 what.insert("all");
3547 if (f) {
3548 if (what.count("all")) {
3549 f->open_object_section("pg_map");
3550 pg_map.dump(f);
3551 f->close_section();
3552 } else if (what.count("summary") || what.count("sum")) {
3553 f->open_object_section("pg_map");
3554 pg_map.dump_basic(f);
3555 f->close_section();
3556 } else {
3557 if (what.count("pools")) {
3558 pg_map.dump_pool_stats(f);
3559 }
3560 if (what.count("osds")) {
3561 pg_map.dump_osd_stats(f);
3562 }
3563 if (what.count("pgs")) {
3564 pg_map.dump_pg_stats(f, false);
3565 }
3566 if (what.count("pgs_brief")) {
3567 pg_map.dump_pg_stats(f, true);
3568 }
3569 if (what.count("delta")) {
3570 f->open_object_section("delta");
3571 pg_map.dump_delta(f);
3572 f->close_section();
3573 }
3574 }
3575 f->flush(*odata);
3576 } else {
3577 if (what.count("all")) {
3578 pg_map.dump(ds);
3579 omap_stats_note_required = true;
3580 } else if (what.count("summary") || what.count("sum")) {
3581 pg_map.dump_basic(ds);
3582 pg_map.dump_pg_sum_stats(ds, true);
3583 pg_map.dump_osd_sum_stats(ds);
3584 omap_stats_note_required = true;
3585 } else {
3586 if (what.count("pgs_brief")) {
3587 pg_map.dump_pg_stats(ds, true);
3588 }
3589 bool header = true;
3590 if (what.count("pgs")) {
3591 pg_map.dump_pg_stats(ds, false);
3592 header = false;
3593 omap_stats_note_required = true;
3594 }
3595 if (what.count("pools")) {
3596 pg_map.dump_pool_stats(ds, header);
3597 omap_stats_note_required = true;
3598 }
3599 if (what.count("osds")) {
3600 pg_map.dump_osd_stats(ds);
3601 }
3602 }
3603 odata->append(ds);
3604 if (omap_stats_note_required) {
3605 odata->append(omap_stats_note);
3606 }
3607 }
3608 *ss << "dumped " << what;
3609 return 0;
3610 }
3611
3612 if (prefix == "pg ls") {
3613 int64_t osd = -1;
3614 int64_t pool = -1;
3615 vector<string>states;
3616 set<pg_t> pgs;
3617 cmd_getval(cmdmap, "pool", pool);
3618 cmd_getval(cmdmap, "osd", osd);
3619 cmd_getval(cmdmap, "states", states);
3620 if (pool >= 0 && !osdmap.have_pg_pool(pool)) {
3621 *ss << "pool " << pool << " does not exist";
3622 return -ENOENT;
3623 }
3624 if (osd >= 0 && !osdmap.is_up(osd)) {
3625 *ss << "osd " << osd << " is not up";
3626 return -EAGAIN;
3627 }
3628 if (states.empty())
3629 states.push_back("all");
3630
3631 uint64_t state = 0;
3632
3633 while (!states.empty()) {
3634 string state_str = states.back();
3635
3636 if (state_str == "all") {
3637 state = -1;
3638 break;
3639 } else {
3640 auto filter = pg_string_state(state_str);
3641 if (!filter) {
3642 *ss << "'" << state_str << "' is not a valid pg state,"
3643 << " available choices: " << pg_state_string(0xFFFFFFFF);
3644 return -EINVAL;
3645 }
3646 state |= *filter;
3647 }
3648
3649 states.pop_back();
3650 }
3651
3652 pg_map.get_filtered_pg_stats(state, pool, osd, primary, pgs);
3653
3654 if (f && !pgs.empty()) {
3655 pg_map.dump_filtered_pg_stats(f, pgs);
3656 f->flush(*odata);
3657 } else if (!pgs.empty()) {
3658 pg_map.dump_filtered_pg_stats(ds, pgs);
3659 odata->append(ds);
3660 odata->append(omap_stats_note);
3661 }
3662 return 0;
3663 }
3664
3665 if (prefix == "pg dump_stuck") {
3666 vector<string> stuckop_vec;
3667 cmd_getval(cmdmap, "stuckops", stuckop_vec);
3668 if (stuckop_vec.empty())
3669 stuckop_vec.push_back("unclean");
3670 int64_t threshold;
3671 cmd_getval(cmdmap, "threshold", threshold,
3672 g_conf().get_val<int64_t>("mon_pg_stuck_threshold"));
3673
3674 if (pg_map.dump_stuck_pg_stats(ds, f, (int)threshold, stuckop_vec) < 0) {
3675 *ss << "failed";
3676 } else {
3677 *ss << "ok";
3678 }
3679 odata->append(ds);
3680 return 0;
3681 }
3682
3683 if (prefix == "pg debug") {
3684 string debugop;
3685 cmd_getval(cmdmap, "debugop", debugop,
3686 string("unfound_objects_exist"));
3687 if (debugop == "unfound_objects_exist") {
3688 bool unfound_objects_exist = false;
3689 for (const auto& p : pg_map.pg_stat) {
3690 if (p.second.stats.sum.num_objects_unfound > 0) {
3691 unfound_objects_exist = true;
3692 break;
3693 }
3694 }
3695 if (unfound_objects_exist)
3696 ds << "TRUE";
3697 else
3698 ds << "FALSE";
3699 odata->append(ds);
3700 return 0;
3701 }
3702 if (debugop == "degraded_pgs_exist") {
3703 bool degraded_pgs_exist = false;
3704 for (const auto& p : pg_map.pg_stat) {
3705 if (p.second.stats.sum.num_objects_degraded > 0) {
3706 degraded_pgs_exist = true;
3707 break;
3708 }
3709 }
3710 if (degraded_pgs_exist)
3711 ds << "TRUE";
3712 else
3713 ds << "FALSE";
3714 odata->append(ds);
3715 return 0;
3716 }
3717 }
3718
3719 if (prefix == "osd perf") {
3720 if (f) {
3721 f->open_object_section("osdstats");
3722 pg_map.dump_osd_perf_stats(f);
3723 f->close_section();
3724 f->flush(ds);
3725 } else {
3726 pg_map.print_osd_perf_stats(&ds);
3727 }
3728 odata->append(ds);
3729 return 0;
3730 }
3731
3732 if (prefix == "osd blocked-by") {
3733 if (f) {
3734 f->open_object_section("osd_blocked_by");
3735 pg_map.dump_osd_blocked_by_stats(f);
3736 f->close_section();
3737 f->flush(ds);
3738 } else {
3739 pg_map.print_osd_blocked_by_stats(&ds);
3740 }
3741 odata->append(ds);
3742 return 0;
3743 }
3744
3745 return -EOPNOTSUPP;
3746 }
3747
3748 void PGMapUpdater::check_osd_map(
3749 CephContext *cct,
3750 const OSDMap& osdmap,
3751 const PGMap& pgmap,
3752 PGMap::Incremental *pending_inc)
3753 {
3754 for (auto& p : pgmap.osd_stat) {
3755 if (!osdmap.exists(p.first)) {
3756 // remove osd_stat
3757 pending_inc->rm_stat(p.first);
3758 } else if (osdmap.is_out(p.first)) {
3759 // zero osd_stat
3760 if (p.second.statfs.total != 0) {
3761 pending_inc->stat_osd_out(p.first);
3762 }
3763 } else if (!osdmap.is_up(p.first)) {
3764 // zero the op_queue_age_hist
3765 if (!p.second.op_queue_age_hist.empty()) {
3766 pending_inc->stat_osd_down_up(p.first, pgmap);
3767 }
3768 }
3769 }
3770
3771 // deleted pgs (pools)?
3772 for (auto& p : pgmap.pg_pool_sum) {
3773 if (!osdmap.have_pg_pool(p.first)) {
3774 ldout(cct, 10) << __func__ << " pool " << p.first << " gone, removing pgs"
3775 << dendl;
3776 for (auto& q : pgmap.pg_stat) {
3777 if (q.first.pool() == p.first) {
3778 pending_inc->pg_remove.insert(q.first);
3779 }
3780 }
3781 auto q = pending_inc->pg_stat_updates.begin();
3782 while (q != pending_inc->pg_stat_updates.end()) {
3783 if (q->first.pool() == p.first) {
3784 q = pending_inc->pg_stat_updates.erase(q);
3785 } else {
3786 ++q;
3787 }
3788 }
3789 }
3790 }
3791
3792 // new (split or new pool) or merged pgs?
3793 map<int64_t,unsigned> new_pg_num;
3794 for (auto& p : osdmap.get_pools()) {
3795 int64_t poolid = p.first;
3796 const pg_pool_t& pi = p.second;
3797 auto q = pgmap.num_pg_by_pool.find(poolid);
3798 unsigned my_pg_num = 0;
3799 if (q != pgmap.num_pg_by_pool.end())
3800 my_pg_num = q->second;
3801 unsigned pg_num = pi.get_pg_num();
3802 new_pg_num[poolid] = pg_num;
3803 if (my_pg_num < pg_num) {
3804 ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
3805 << " > my pg_num " << my_pg_num << dendl;
3806 for (unsigned ps = my_pg_num; ps < pg_num; ++ps) {
3807 pg_t pgid(ps, poolid);
3808 if (pending_inc->pg_stat_updates.count(pgid) == 0) {
3809 ldout(cct,20) << __func__ << " adding " << pgid << dendl;
3810 pg_stat_t &stats = pending_inc->pg_stat_updates[pgid];
3811 stats.last_fresh = osdmap.get_modified();
3812 stats.last_active = osdmap.get_modified();
3813 stats.last_change = osdmap.get_modified();
3814 stats.last_peered = osdmap.get_modified();
3815 stats.last_clean = osdmap.get_modified();
3816 stats.last_unstale = osdmap.get_modified();
3817 stats.last_undegraded = osdmap.get_modified();
3818 stats.last_fullsized = osdmap.get_modified();
3819 stats.last_scrub_stamp = osdmap.get_modified();
3820 stats.last_deep_scrub_stamp = osdmap.get_modified();
3821 stats.last_clean_scrub_stamp = osdmap.get_modified();
3822 }
3823 }
3824 } else if (my_pg_num > pg_num) {
3825 ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
3826 << " < my pg_num " << my_pg_num << dendl;
3827 for (unsigned i = pg_num; i < my_pg_num; ++i) {
3828 pg_t pgid(i, poolid);
3829 ldout(cct,20) << __func__ << " removing merged " << pgid << dendl;
3830 if (pgmap.pg_stat.count(pgid)) {
3831 pending_inc->pg_remove.insert(pgid);
3832 }
3833 pending_inc->pg_stat_updates.erase(pgid);
3834 }
3835 }
3836 }
3837 auto i = pending_inc->pg_stat_updates.begin();
3838 while (i != pending_inc->pg_stat_updates.end()) {
3839 auto j = new_pg_num.find(i->first.pool());
3840 if (j == new_pg_num.end() ||
3841 i->first.ps() >= j->second) {
3842 ldout(cct,20) << __func__ << " removing pending update to old "
3843 << i->first << dendl;
3844 i = pending_inc->pg_stat_updates.erase(i);
3845 } else {
3846 ++i;
3847 }
3848 }
3849 }
3850
3851 static void _try_mark_pg_stale(
3852 const OSDMap& osdmap,
3853 pg_t pgid,
3854 const pg_stat_t& cur,
3855 PGMap::Incremental *pending_inc)
3856 {
3857 if ((cur.state & PG_STATE_STALE) == 0 &&
3858 cur.acting_primary != -1 &&
3859 osdmap.is_down(cur.acting_primary)) {
3860 pg_stat_t *newstat;
3861 auto q = pending_inc->pg_stat_updates.find(pgid);
3862 if (q != pending_inc->pg_stat_updates.end()) {
3863 if ((q->second.acting_primary == cur.acting_primary) ||
3864 ((q->second.state & PG_STATE_STALE) == 0 &&
3865 q->second.acting_primary != -1 &&
3866 osdmap.is_down(q->second.acting_primary))) {
3867 newstat = &q->second;
3868 } else {
3869 // pending update is no longer down or already stale
3870 return;
3871 }
3872 } else {
3873 newstat = &pending_inc->pg_stat_updates[pgid];
3874 *newstat = cur;
3875 }
3876 dout(10) << __func__ << " marking pg " << pgid
3877 << " stale (acting_primary " << newstat->acting_primary
3878 << ")" << dendl;
3879 newstat->state |= PG_STATE_STALE;
3880 newstat->last_unstale = ceph_clock_now();
3881 }
3882 }
3883
3884 void PGMapUpdater::check_down_pgs(
3885 const OSDMap &osdmap,
3886 const PGMap &pg_map,
3887 bool check_all,
3888 const set<int>& need_check_down_pg_osds,
3889 PGMap::Incremental *pending_inc)
3890 {
3891 // if a large number of osds changed state, just iterate over the whole
3892 // pg map.
3893 if (need_check_down_pg_osds.size() > (unsigned)osdmap.get_num_osds() *
3894 g_conf().get_val<double>("mon_pg_check_down_all_threshold")) {
3895 check_all = true;
3896 }
3897
3898 if (check_all) {
3899 for (const auto& p : pg_map.pg_stat) {
3900 _try_mark_pg_stale(osdmap, p.first, p.second, pending_inc);
3901 }
3902 } else {
3903 for (auto osd : need_check_down_pg_osds) {
3904 if (osdmap.is_down(osd)) {
3905 auto p = pg_map.pg_by_osd.find(osd);
3906 if (p == pg_map.pg_by_osd.end()) {
3907 continue;
3908 }
3909 for (auto pgid : p->second) {
3910 const pg_stat_t &stat = pg_map.pg_stat.at(pgid);
3911 ceph_assert(stat.acting_primary == osd);
3912 _try_mark_pg_stale(osdmap, pgid, stat, pending_inc);
3913 }
3914 }
3915 }
3916 }
3917 }
3918
3919 int reweight::by_utilization(
3920 const OSDMap &osdmap,
3921 const PGMap &pgm,
3922 int oload,
3923 double max_changef,
3924 int max_osds,
3925 bool by_pg, const set<int64_t> *pools,
3926 bool no_increasing,
3927 mempool::osdmap::map<int32_t, uint32_t>* new_weights,
3928 std::stringstream *ss,
3929 std::string *out_str,
3930 ceph::Formatter *f)
3931 {
3932 if (oload <= 100) {
3933 *ss << "You must give a percentage higher than 100. "
3934 "The reweighting threshold will be calculated as <average-utilization> "
3935 "times <input-percentage>. For example, an argument of 200 would "
3936 "reweight OSDs which are twice as utilized as the average OSD.\n";
3937 return -EINVAL;
3938 }
3939
3940 vector<int> pgs_by_osd(osdmap.get_max_osd());
3941
3942 // Avoid putting a small number (or 0) in the denominator when calculating
3943 // average_util
3944 double average_util;
3945 if (by_pg) {
3946 // by pg mapping
3947 double weight_sum = 0.0; // sum up the crush weights
3948 unsigned num_pg_copies = 0;
3949 int num_osds = 0;
3950 for (const auto& pg : pgm.pg_stat) {
3951 if (pools && pools->count(pg.first.pool()) == 0)
3952 continue;
3953 for (const auto acting : pg.second.acting) {
3954 if (!osdmap.exists(acting)) {
3955 continue;
3956 }
3957 if (acting >= (int)pgs_by_osd.size())
3958 pgs_by_osd.resize(acting);
3959 if (pgs_by_osd[acting] == 0) {
3960 if (osdmap.crush->get_item_weightf(acting) <= 0) {
3961 //skip if we currently can not identify item
3962 continue;
3963 }
3964 weight_sum += osdmap.crush->get_item_weightf(acting);
3965 ++num_osds;
3966 }
3967 ++pgs_by_osd[acting];
3968 ++num_pg_copies;
3969 }
3970 }
3971
3972 if (!num_osds || (num_pg_copies / num_osds < g_conf()->mon_reweight_min_pgs_per_osd)) {
3973 *ss << "Refusing to reweight: we only have " << num_pg_copies
3974 << " PGs across " << num_osds << " osds!\n";
3975 return -EDOM;
3976 }
3977
3978 average_util = (double)num_pg_copies / weight_sum;
3979 } else {
3980 // by osd utilization
3981 int num_osd = std::max<size_t>(1, pgm.osd_stat.size());
3982 if ((uint64_t)pgm.osd_sum.statfs.total / num_osd
3983 < g_conf()->mon_reweight_min_bytes_per_osd) {
3984 *ss << "Refusing to reweight: we only have " << pgm.osd_sum.statfs.kb()
3985 << " kb across all osds!\n";
3986 return -EDOM;
3987 }
3988 if ((uint64_t)pgm.osd_sum.statfs.get_used_raw() / num_osd
3989 < g_conf()->mon_reweight_min_bytes_per_osd) {
3990 *ss << "Refusing to reweight: we only have "
3991 << pgm.osd_sum.statfs.kb_used_raw()
3992 << " kb used across all osds!\n";
3993 return -EDOM;
3994 }
3995
3996 average_util = (double)pgm.osd_sum.statfs.get_used_raw() /
3997 (double)pgm.osd_sum.statfs.total;
3998 }
3999
4000 // adjust down only if we are above the threshold
4001 const double overload_util = average_util * (double)oload / 100.0;
4002
4003 // but aggressively adjust weights up whenever possible.
4004 const double underload_util = average_util;
4005
4006 const unsigned max_change = (unsigned)(max_changef * (double)0x10000);
4007
4008 ostringstream oss;
4009 if (f) {
4010 f->open_object_section("reweight_by_utilization");
4011 f->dump_int("overload_min", oload);
4012 f->dump_float("max_change", max_changef);
4013 f->dump_int("max_change_osds", max_osds);
4014 f->dump_float("average_utilization", average_util);
4015 f->dump_float("overload_utilization", overload_util);
4016 } else {
4017 oss << "oload " << oload << "\n";
4018 oss << "max_change " << max_changef << "\n";
4019 oss << "max_change_osds " << max_osds << "\n";
4020 oss.precision(4);
4021 oss << "average_utilization " << std::fixed << average_util << "\n";
4022 oss << "overload_utilization " << overload_util << "\n";
4023 }
4024 int num_changed = 0;
4025
4026 // precompute util for each OSD
4027 std::vector<std::pair<int, float> > util_by_osd;
4028 for (const auto& p : pgm.osd_stat) {
4029 std::pair<int, float> osd_util;
4030 osd_util.first = p.first;
4031 if (by_pg) {
4032 if (p.first >= (int)pgs_by_osd.size() ||
4033 pgs_by_osd[p.first] == 0) {
4034 // skip if this OSD does not contain any pg
4035 // belonging to the specified pool(s).
4036 continue;
4037 }
4038
4039 if (osdmap.crush->get_item_weightf(p.first) <= 0) {
4040 // skip if we are unable to locate item.
4041 continue;
4042 }
4043
4044 osd_util.second =
4045 pgs_by_osd[p.first] / osdmap.crush->get_item_weightf(p.first);
4046 } else {
4047 osd_util.second =
4048 (double)p.second.statfs.get_used_raw() / (double)p.second.statfs.total;
4049 }
4050 util_by_osd.push_back(osd_util);
4051 }
4052
4053 // sort by absolute deviation from the mean utilization,
4054 // in descending order.
4055 std::sort(util_by_osd.begin(), util_by_osd.end(),
4056 [average_util](std::pair<int, float> l, std::pair<int, float> r) {
4057 return abs(l.second - average_util) > abs(r.second - average_util);
4058 }
4059 );
4060
4061 if (f)
4062 f->open_array_section("reweights");
4063
4064 for (const auto& p : util_by_osd) {
4065 unsigned weight = osdmap.get_weight(p.first);
4066 if (weight == 0) {
4067 // skip if OSD is currently out
4068 continue;
4069 }
4070 float util = p.second;
4071
4072 if (util >= overload_util) {
4073 // Assign a lower weight to overloaded OSDs. The current weight
4074 // is a factor to take into account the original weights,
4075 // to represent e.g. differing storage capacities
4076 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
4077 if (weight > max_change)
4078 new_weight = std::max(new_weight, weight - max_change);
4079 new_weights->insert({p.first, new_weight});
4080 if (f) {
4081 f->open_object_section("osd");
4082 f->dump_int("osd", p.first);
4083 f->dump_float("weight", (float)weight / (float)0x10000);
4084 f->dump_float("new_weight", (float)new_weight / (float)0x10000);
4085 f->close_section();
4086 } else {
4087 oss << "osd." << p.first << " weight "
4088 << (float)weight / (float)0x10000 << " -> "
4089 << (float)new_weight / (float)0x10000 << "\n";
4090 }
4091 if (++num_changed >= max_osds)
4092 break;
4093 }
4094 if (!no_increasing && util <= underload_util) {
4095 // assign a higher weight.. if we can.
4096 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
4097 new_weight = std::min(new_weight, weight + max_change);
4098 if (new_weight > 0x10000)
4099 new_weight = 0x10000;
4100 if (new_weight > weight) {
4101 new_weights->insert({p.first, new_weight});
4102 oss << "osd." << p.first << " weight "
4103 << (float)weight / (float)0x10000 << " -> "
4104 << (float)new_weight / (float)0x10000 << "\n";
4105 if (++num_changed >= max_osds)
4106 break;
4107 }
4108 }
4109 }
4110 if (f) {
4111 f->close_section();
4112 }
4113
4114 OSDMap newmap;
4115 newmap.deepish_copy_from(osdmap);
4116 OSDMap::Incremental newinc;
4117 newinc.fsid = newmap.get_fsid();
4118 newinc.epoch = newmap.get_epoch() + 1;
4119 newinc.new_weight = *new_weights;
4120 newmap.apply_incremental(newinc);
4121
4122 osdmap.summarize_mapping_stats(&newmap, pools, out_str, f);
4123
4124 if (f) {
4125 f->close_section();
4126 } else {
4127 *out_str += "\n";
4128 *out_str += oss.str();
4129 }
4130 return num_changed;
4131 }