]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/PGMap.cc
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / mon / PGMap.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include <boost/algorithm/string.hpp>
5
6 #include "PGMap.h"
7
8 #define dout_subsys ceph_subsys_mon
9 #include "common/debug.h"
10 #include "common/Clock.h"
11 #include "common/Formatter.h"
12 #include "global/global_context.h"
13 #include "include/ceph_features.h"
14 #include "include/stringify.h"
15
16 #include "osd/osd_types.h"
17 #include "osd/OSDMap.h"
18 #include <boost/range/adaptor/reversed.hpp>
19
20 #define dout_context g_ceph_context
21
22 using std::list;
23 using std::make_pair;
24 using std::map;
25 using std::pair;
26 using std::ostream;
27 using std::ostringstream;
28 using std::set;
29 using std::string;
30 using std::stringstream;
31 using std::vector;
32
33 using ceph::bufferlist;
34 using ceph::fixed_u_to_string;
35
36 using TOPNSPC::common::cmd_getval;
37
38 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMapDigest, pgmap_digest, pgmap);
39 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap, pgmap, pgmap);
40 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap::Incremental, pgmap_inc, pgmap);
41
42
43 // ---------------------
44 // PGMapDigest
45
46 void PGMapDigest::encode(bufferlist& bl, uint64_t features) const
47 {
48 // NOTE: see PGMap::encode_digest
49 uint8_t v = 4;
50 if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
51 v = 1;
52 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
53 v = 3;
54 }
55 ENCODE_START(v, 1, bl);
56 encode(num_pg, bl);
57 encode(num_pg_active, bl);
58 encode(num_pg_unknown, bl);
59 encode(num_osd, bl);
60 encode(pg_pool_sum, bl, features);
61 encode(pg_sum, bl, features);
62 encode(osd_sum, bl, features);
63 if (v >= 2) {
64 encode(num_pg_by_state, bl);
65 } else {
66 uint32_t n = num_pg_by_state.size();
67 encode(n, bl);
68 for (auto p : num_pg_by_state) {
69 encode((int32_t)p.first, bl);
70 encode(p.second, bl);
71 }
72 }
73 encode(num_pg_by_osd, bl);
74 encode(num_pg_by_pool, bl);
75 encode(osd_last_seq, bl);
76 encode(per_pool_sum_delta, bl, features);
77 encode(per_pool_sum_deltas_stamps, bl);
78 encode(pg_sum_delta, bl, features);
79 encode(stamp_delta, bl);
80 encode(avail_space_by_rule, bl);
81 if (struct_v >= 3) {
82 encode(purged_snaps, bl);
83 }
84 if (struct_v >= 4) {
85 encode(osd_sum_by_class, bl, features);
86 }
87 ENCODE_FINISH(bl);
88 }
89
90 void PGMapDigest::decode(bufferlist::const_iterator& p)
91 {
92 DECODE_START(4, p);
93 decode(num_pg, p);
94 decode(num_pg_active, p);
95 decode(num_pg_unknown, p);
96 decode(num_osd, p);
97 decode(pg_pool_sum, p);
98 decode(pg_sum, p);
99 decode(osd_sum, p);
100 if (struct_v >= 2) {
101 decode(num_pg_by_state, p);
102 } else {
103 map<int32_t, int32_t> nps;
104 decode(nps, p);
105 num_pg_by_state.clear();
106 for (auto i : nps) {
107 num_pg_by_state[i.first] = i.second;
108 }
109 }
110 decode(num_pg_by_osd, p);
111 decode(num_pg_by_pool, p);
112 decode(osd_last_seq, p);
113 decode(per_pool_sum_delta, p);
114 decode(per_pool_sum_deltas_stamps, p);
115 decode(pg_sum_delta, p);
116 decode(stamp_delta, p);
117 decode(avail_space_by_rule, p);
118 if (struct_v >= 3) {
119 decode(purged_snaps, p);
120 }
121 if (struct_v >= 4) {
122 decode(osd_sum_by_class, p);
123 }
124 DECODE_FINISH(p);
125 }
126
127 void PGMapDigest::dump(ceph::Formatter *f) const
128 {
129 f->dump_unsigned("num_pg", num_pg);
130 f->dump_unsigned("num_pg_active", num_pg_active);
131 f->dump_unsigned("num_pg_unknown", num_pg_unknown);
132 f->dump_unsigned("num_osd", num_osd);
133 f->dump_object("pool_sum", pg_sum);
134 f->dump_object("osd_sum", osd_sum);
135
136 f->open_object_section("osd_sum_by_class");
137 for (auto& i : osd_sum_by_class) {
138 f->dump_object(i.first.c_str(), i.second);
139 }
140 f->close_section();
141
142 f->open_array_section("pool_stats");
143 for (auto& p : pg_pool_sum) {
144 f->open_object_section("pool_stat");
145 f->dump_int("poolid", p.first);
146 auto q = num_pg_by_pool.find(p.first);
147 if (q != num_pg_by_pool.end())
148 f->dump_unsigned("num_pg", q->second);
149 p.second.dump(f);
150 f->close_section();
151 }
152 f->close_section();
153 f->open_array_section("osd_stats");
154 int i = 0;
155 // TODO: this isn't really correct since we can dump non-existent OSDs
156 // I dunno what osd_last_seq is set to in that case...
157 for (auto& p : osd_last_seq) {
158 f->open_object_section("osd_stat");
159 f->dump_int("osd", i);
160 f->dump_unsigned("seq", p);
161 f->close_section();
162 ++i;
163 }
164 f->close_section();
165 f->open_array_section("num_pg_by_state");
166 for (auto& p : num_pg_by_state) {
167 f->open_object_section("count");
168 f->dump_string("state", pg_state_string(p.first));
169 f->dump_unsigned("num", p.second);
170 f->close_section();
171 }
172 f->close_section();
173 f->open_array_section("num_pg_by_osd");
174 for (auto& p : num_pg_by_osd) {
175 f->open_object_section("count");
176 f->dump_unsigned("osd", p.first);
177 f->dump_unsigned("num_primary_pg", p.second.primary);
178 f->dump_unsigned("num_acting_pg", p.second.acting);
179 f->dump_unsigned("num_up_not_acting_pg", p.second.up_not_acting);
180 f->close_section();
181 }
182 f->close_section();
183 f->open_array_section("purged_snaps");
184 for (auto& j : purged_snaps) {
185 f->open_object_section("pool");
186 f->dump_int("pool", j.first);
187 f->open_object_section("purged_snaps");
188 for (auto i = j.second.begin(); i != j.second.end(); ++i) {
189 f->open_object_section("interval");
190 f->dump_stream("start") << i.get_start();
191 f->dump_stream("length") << i.get_len();
192 f->close_section();
193 }
194 f->close_section();
195 f->close_section();
196 }
197 f->close_section();
198 }
199
200 void PGMapDigest::generate_test_instances(list<PGMapDigest*>& ls)
201 {
202 ls.push_back(new PGMapDigest);
203 }
204
/**
 * Render a percentage value for table output.
 *
 * Values below 0.01 (including negative ones) are shown as "0"; anything
 * else is printed in fixed notation with two decimals.
 *
 * @param a  percentage to format
 * @return   the formatted text
 */
inline std::string percentify(const float& a) {
  if (a < 0.01)
    return "0";

  std::ostringstream formatted;
  formatted << std::fixed << std::setprecision(2) << a;
  return formatted.str();
}
213
214 void PGMapDigest::print_summary(ceph::Formatter *f, ostream *out) const
215 {
216 if (f)
217 f->open_array_section("pgs_by_state");
218
219 // list is descending numeric order (by count)
220 std::multimap<int,uint64_t> state_by_count; // count -> state
221 for (auto p = num_pg_by_state.begin();
222 p != num_pg_by_state.end();
223 ++p) {
224 state_by_count.insert(make_pair(p->second, p->first));
225 }
226 if (f) {
227 for (auto p = state_by_count.rbegin();
228 p != state_by_count.rend();
229 ++p)
230 {
231 f->open_object_section("pgs_by_state_element");
232 f->dump_string("state_name", pg_state_string(p->second));
233 f->dump_unsigned("count", p->first);
234 f->close_section();
235 }
236 }
237 if (f)
238 f->close_section();
239
240 if (f) {
241 f->dump_unsigned("num_pgs", num_pg);
242 f->dump_unsigned("num_pools", pg_pool_sum.size());
243 f->dump_unsigned("num_objects", pg_sum.stats.sum.num_objects);
244 f->dump_unsigned("data_bytes", pg_sum.stats.sum.num_bytes);
245 f->dump_unsigned("bytes_used", osd_sum.statfs.get_used_raw());
246 f->dump_unsigned("bytes_avail", osd_sum.statfs.available);
247 f->dump_unsigned("bytes_total", osd_sum.statfs.total);
248 } else {
249 *out << " pools: " << pg_pool_sum.size() << " pools, "
250 << num_pg << " pgs\n";
251 *out << " objects: " << si_u_t(pg_sum.stats.sum.num_objects) << " objects, "
252 << byte_u_t(pg_sum.stats.sum.num_bytes) << "\n";
253 *out << " usage: "
254 << byte_u_t(osd_sum.statfs.get_used_raw()) << " used, "
255 << byte_u_t(osd_sum.statfs.available) << " / "
256 << byte_u_t(osd_sum.statfs.total) << " avail\n";
257 *out << " pgs: ";
258 }
259
260 bool pad = false;
261
262 if (num_pg_unknown > 0) {
263 float p = (float)num_pg_unknown / (float)num_pg;
264 if (f) {
265 f->dump_float("unknown_pgs_ratio", p);
266 } else {
267 char b[20];
268 snprintf(b, sizeof(b), "%.3lf", p * 100.0);
269 *out << b << "% pgs unknown\n";
270 pad = true;
271 }
272 }
273
274 int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
275 if (num_pg_inactive > 0) {
276 float p = (float)num_pg_inactive / (float)num_pg;
277 if (f) {
278 f->dump_float("inactive_pgs_ratio", p);
279 } else {
280 if (pad) {
281 *out << " ";
282 }
283 char b[20];
284 snprintf(b, sizeof(b), "%.3f", p * 100.0);
285 *out << b << "% pgs not active\n";
286 pad = true;
287 }
288 }
289
290 list<string> sl;
291 overall_recovery_summary(f, &sl);
292 if (!f && !sl.empty()) {
293 for (auto p = sl.begin(); p != sl.end(); ++p) {
294 if (pad) {
295 *out << " ";
296 }
297 *out << *p << "\n";
298 pad = true;
299 }
300 }
301 sl.clear();
302
303 if (!f) {
304 unsigned max_width = 1;
305 for (auto p = state_by_count.rbegin(); p != state_by_count.rend(); ++p)
306 {
307 std::stringstream ss;
308 ss << p->first;
309 max_width = std::max<size_t>(ss.str().size(), max_width);
310 }
311
312 for (auto p = state_by_count.rbegin(); p != state_by_count.rend(); ++p)
313 {
314 if (pad) {
315 *out << " ";
316 }
317 pad = true;
318 out->setf(std::ios::left);
319 *out << std::setw(max_width) << p->first
320 << " " << pg_state_string(p->second) << "\n";
321 out->unsetf(std::ios::left);
322 }
323 }
324
325 ostringstream ss_rec_io;
326 overall_recovery_rate_summary(f, &ss_rec_io);
327 ostringstream ss_client_io;
328 overall_client_io_rate_summary(f, &ss_client_io);
329 ostringstream ss_cache_io;
330 overall_cache_io_rate_summary(f, &ss_cache_io);
331
332 if (!f && (ss_client_io.str().length() || ss_rec_io.str().length()
333 || ss_cache_io.str().length())) {
334 *out << "\n \n";
335 *out << " io:\n";
336 }
337
338 if (!f && ss_client_io.str().length())
339 *out << " client: " << ss_client_io.str() << "\n";
340 if (!f && ss_rec_io.str().length())
341 *out << " recovery: " << ss_rec_io.str() << "\n";
342 if (!f && ss_cache_io.str().length())
343 *out << " cache: " << ss_cache_io.str() << "\n";
344 }
345
346 void PGMapDigest::print_oneline_summary(ceph::Formatter *f, ostream *out) const
347 {
348 std::stringstream ss;
349
350 if (f)
351 f->open_array_section("num_pg_by_state");
352 for (auto p = num_pg_by_state.begin();
353 p != num_pg_by_state.end();
354 ++p) {
355 if (f) {
356 f->open_object_section("state");
357 f->dump_string("name", pg_state_string(p->first));
358 f->dump_unsigned("num", p->second);
359 f->close_section();
360 }
361 if (p != num_pg_by_state.begin())
362 ss << ", ";
363 ss << p->second << " " << pg_state_string(p->first);
364 }
365 if (f)
366 f->close_section();
367
368 string states = ss.str();
369 if (out)
370 *out << num_pg << " pgs: "
371 << states << "; "
372 << byte_u_t(pg_sum.stats.sum.num_bytes) << " data, "
373 << byte_u_t(osd_sum.statfs.get_used()) << " used, "
374 << byte_u_t(osd_sum.statfs.available) << " / "
375 << byte_u_t(osd_sum.statfs.total) << " avail";
376 if (f) {
377 f->dump_unsigned("num_pgs", num_pg);
378 f->dump_unsigned("num_bytes", pg_sum.stats.sum.num_bytes);
379 f->dump_int("total_bytes", osd_sum.statfs.total);
380 f->dump_int("total_avail_bytes", osd_sum.statfs.available);
381 f->dump_int("total_used_bytes", osd_sum.statfs.get_used());
382 f->dump_int("total_used_raw_bytes", osd_sum.statfs.get_used_raw());
383 }
384
385 // make non-negative; we can get negative values if osds send
386 // uncommitted stats and then "go backward" or if they are just
387 // buggy/wrong.
388 pool_stat_t pos_delta = pg_sum_delta;
389 pos_delta.floor(0);
390 if (pos_delta.stats.sum.num_rd ||
391 pos_delta.stats.sum.num_wr) {
392 if (out)
393 *out << "; ";
394 if (pos_delta.stats.sum.num_rd) {
395 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)stamp_delta;
396 if (out)
397 *out << byte_u_t(rd) << "/s rd, ";
398 if (f)
399 f->dump_unsigned("read_bytes_sec", rd);
400 }
401 if (pos_delta.stats.sum.num_wr) {
402 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)stamp_delta;
403 if (out)
404 *out << byte_u_t(wr) << "/s wr, ";
405 if (f)
406 f->dump_unsigned("write_bytes_sec", wr);
407 }
408 int64_t iops = (pos_delta.stats.sum.num_rd + pos_delta.stats.sum.num_wr) / (double)stamp_delta;
409 if (out)
410 *out << si_u_t(iops) << " op/s";
411 if (f)
412 f->dump_unsigned("io_sec", iops);
413 }
414
415 list<string> sl;
416 overall_recovery_summary(f, &sl);
417 if (out)
418 for (auto p = sl.begin(); p != sl.end(); ++p)
419 *out << "; " << *p;
420 std::stringstream ssr;
421 overall_recovery_rate_summary(f, &ssr);
422 if (out && ssr.str().length())
423 *out << "; " << ssr.str() << " recovering";
424 }
425
426 void PGMapDigest::get_recovery_stats(
427 double *misplaced_ratio,
428 double *degraded_ratio,
429 double *inactive_pgs_ratio,
430 double *unknown_pgs_ratio) const
431 {
432 if (pg_sum.stats.sum.num_objects_degraded &&
433 pg_sum.stats.sum.num_object_copies > 0) {
434 *degraded_ratio = (double)pg_sum.stats.sum.num_objects_degraded /
435 (double)pg_sum.stats.sum.num_object_copies;
436 } else {
437 *degraded_ratio = 0;
438 }
439 if (pg_sum.stats.sum.num_objects_misplaced &&
440 pg_sum.stats.sum.num_object_copies > 0) {
441 *misplaced_ratio = (double)pg_sum.stats.sum.num_objects_misplaced /
442 (double)pg_sum.stats.sum.num_object_copies;
443 } else {
444 *misplaced_ratio = 0;
445 }
446 if (num_pg > 0) {
447 int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
448 *inactive_pgs_ratio = (double)num_pg_inactive / (double)num_pg;
449 *unknown_pgs_ratio = (double)num_pg_unknown / (double)num_pg;
450 } else {
451 *inactive_pgs_ratio = 0;
452 *unknown_pgs_ratio = 0;
453 }
454 }
455
456 void PGMapDigest::recovery_summary(ceph::Formatter *f, list<string> *psl,
457 const pool_stat_t& pool_sum) const
458 {
459 if (pool_sum.stats.sum.num_objects_degraded && pool_sum.stats.sum.num_object_copies > 0) {
460 double pc = (double)pool_sum.stats.sum.num_objects_degraded /
461 (double)pool_sum.stats.sum.num_object_copies * (double)100.0;
462 char b[20];
463 snprintf(b, sizeof(b), "%.3lf", pc);
464 if (f) {
465 f->dump_unsigned("degraded_objects", pool_sum.stats.sum.num_objects_degraded);
466 f->dump_unsigned("degraded_total", pool_sum.stats.sum.num_object_copies);
467 f->dump_float("degraded_ratio", pc / 100.0);
468 } else {
469 ostringstream ss;
470 ss << pool_sum.stats.sum.num_objects_degraded
471 << "/" << pool_sum.stats.sum.num_object_copies << " objects degraded (" << b << "%)";
472 psl->push_back(ss.str());
473 }
474 }
475 if (pool_sum.stats.sum.num_objects_misplaced && pool_sum.stats.sum.num_object_copies > 0) {
476 double pc = (double)pool_sum.stats.sum.num_objects_misplaced /
477 (double)pool_sum.stats.sum.num_object_copies * (double)100.0;
478 char b[20];
479 snprintf(b, sizeof(b), "%.3lf", pc);
480 if (f) {
481 f->dump_unsigned("misplaced_objects", pool_sum.stats.sum.num_objects_misplaced);
482 f->dump_unsigned("misplaced_total", pool_sum.stats.sum.num_object_copies);
483 f->dump_float("misplaced_ratio", pc / 100.0);
484 } else {
485 ostringstream ss;
486 ss << pool_sum.stats.sum.num_objects_misplaced
487 << "/" << pool_sum.stats.sum.num_object_copies << " objects misplaced (" << b << "%)";
488 psl->push_back(ss.str());
489 }
490 }
491 if (pool_sum.stats.sum.num_objects_unfound && pool_sum.stats.sum.num_objects) {
492 double pc = (double)pool_sum.stats.sum.num_objects_unfound /
493 (double)pool_sum.stats.sum.num_objects * (double)100.0;
494 char b[20];
495 snprintf(b, sizeof(b), "%.3lf", pc);
496 if (f) {
497 f->dump_unsigned("unfound_objects", pool_sum.stats.sum.num_objects_unfound);
498 f->dump_unsigned("unfound_total", pool_sum.stats.sum.num_objects);
499 f->dump_float("unfound_ratio", pc / 100.0);
500 } else {
501 ostringstream ss;
502 ss << pool_sum.stats.sum.num_objects_unfound
503 << "/" << pool_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
504 psl->push_back(ss.str());
505 }
506 }
507 }
508
509 void PGMapDigest::recovery_rate_summary(ceph::Formatter *f, ostream *out,
510 const pool_stat_t& delta_sum,
511 utime_t delta_stamp) const
512 {
513 // make non-negative; we can get negative values if osds send
514 // uncommitted stats and then "go backward" or if they are just
515 // buggy/wrong.
516 pool_stat_t pos_delta = delta_sum;
517 pos_delta.floor(0);
518 if (pos_delta.stats.sum.num_objects_recovered ||
519 pos_delta.stats.sum.num_bytes_recovered ||
520 pos_delta.stats.sum.num_keys_recovered) {
521 int64_t objps = pos_delta.stats.sum.num_objects_recovered / (double)delta_stamp;
522 int64_t bps = pos_delta.stats.sum.num_bytes_recovered / (double)delta_stamp;
523 int64_t kps = pos_delta.stats.sum.num_keys_recovered / (double)delta_stamp;
524 if (f) {
525 f->dump_int("recovering_objects_per_sec", objps);
526 f->dump_int("recovering_bytes_per_sec", bps);
527 f->dump_int("recovering_keys_per_sec", kps);
528 f->dump_int("num_objects_recovered", pos_delta.stats.sum.num_objects_recovered);
529 f->dump_int("num_bytes_recovered", pos_delta.stats.sum.num_bytes_recovered);
530 f->dump_int("num_keys_recovered", pos_delta.stats.sum.num_keys_recovered);
531 } else {
532 *out << byte_u_t(bps) << "/s";
533 if (pos_delta.stats.sum.num_keys_recovered)
534 *out << ", " << si_u_t(kps) << " keys/s";
535 *out << ", " << si_u_t(objps) << " objects/s";
536 }
537 }
538 }
539
540 void PGMapDigest::overall_recovery_rate_summary(ceph::Formatter *f, ostream *out) const
541 {
542 recovery_rate_summary(f, out, pg_sum_delta, stamp_delta);
543 }
544
545 void PGMapDigest::overall_recovery_summary(ceph::Formatter *f, list<string> *psl) const
546 {
547 recovery_summary(f, psl, pg_sum);
548 }
549
550 void PGMapDigest::pool_recovery_rate_summary(ceph::Formatter *f, ostream *out,
551 uint64_t poolid) const
552 {
553 auto p = per_pool_sum_delta.find(poolid);
554 if (p == per_pool_sum_delta.end())
555 return;
556
557 auto ts = per_pool_sum_deltas_stamps.find(p->first);
558 ceph_assert(ts != per_pool_sum_deltas_stamps.end());
559 recovery_rate_summary(f, out, p->second.first, ts->second);
560 }
561
562 void PGMapDigest::pool_recovery_summary(ceph::Formatter *f, list<string> *psl,
563 uint64_t poolid) const
564 {
565 auto p = pg_pool_sum.find(poolid);
566 if (p == pg_pool_sum.end())
567 return;
568
569 recovery_summary(f, psl, p->second);
570 }
571
572 void PGMapDigest::client_io_rate_summary(ceph::Formatter *f, ostream *out,
573 const pool_stat_t& delta_sum,
574 utime_t delta_stamp) const
575 {
576 pool_stat_t pos_delta = delta_sum;
577 pos_delta.floor(0);
578 if (pos_delta.stats.sum.num_rd ||
579 pos_delta.stats.sum.num_wr) {
580 if (pos_delta.stats.sum.num_rd) {
581 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)delta_stamp;
582 if (f) {
583 f->dump_int("read_bytes_sec", rd);
584 } else {
585 *out << byte_u_t(rd) << "/s rd, ";
586 }
587 }
588 if (pos_delta.stats.sum.num_wr) {
589 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)delta_stamp;
590 if (f) {
591 f->dump_int("write_bytes_sec", wr);
592 } else {
593 *out << byte_u_t(wr) << "/s wr, ";
594 }
595 }
596 int64_t iops_rd = pos_delta.stats.sum.num_rd / (double)delta_stamp;
597 int64_t iops_wr = pos_delta.stats.sum.num_wr / (double)delta_stamp;
598 if (f) {
599 f->dump_int("read_op_per_sec", iops_rd);
600 f->dump_int("write_op_per_sec", iops_wr);
601 } else {
602 *out << si_u_t(iops_rd) << " op/s rd, " << si_u_t(iops_wr) << " op/s wr";
603 }
604 }
605 }
606
607 void PGMapDigest::overall_client_io_rate_summary(ceph::Formatter *f, ostream *out) const
608 {
609 client_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
610 }
611
612 void PGMapDigest::pool_client_io_rate_summary(ceph::Formatter *f, ostream *out,
613 uint64_t poolid) const
614 {
615 auto p = per_pool_sum_delta.find(poolid);
616 if (p == per_pool_sum_delta.end())
617 return;
618
619 auto ts = per_pool_sum_deltas_stamps.find(p->first);
620 ceph_assert(ts != per_pool_sum_deltas_stamps.end());
621 client_io_rate_summary(f, out, p->second.first, ts->second);
622 }
623
624 void PGMapDigest::cache_io_rate_summary(ceph::Formatter *f, ostream *out,
625 const pool_stat_t& delta_sum,
626 utime_t delta_stamp) const
627 {
628 pool_stat_t pos_delta = delta_sum;
629 pos_delta.floor(0);
630 bool have_output = false;
631
632 if (pos_delta.stats.sum.num_flush) {
633 int64_t flush = (pos_delta.stats.sum.num_flush_kb << 10) / (double)delta_stamp;
634 if (f) {
635 f->dump_int("flush_bytes_sec", flush);
636 } else {
637 *out << byte_u_t(flush) << "/s flush";
638 have_output = true;
639 }
640 }
641 if (pos_delta.stats.sum.num_evict) {
642 int64_t evict = (pos_delta.stats.sum.num_evict_kb << 10) / (double)delta_stamp;
643 if (f) {
644 f->dump_int("evict_bytes_sec", evict);
645 } else {
646 if (have_output)
647 *out << ", ";
648 *out << byte_u_t(evict) << "/s evict";
649 have_output = true;
650 }
651 }
652 if (pos_delta.stats.sum.num_promote) {
653 int64_t promote = pos_delta.stats.sum.num_promote / (double)delta_stamp;
654 if (f) {
655 f->dump_int("promote_op_per_sec", promote);
656 } else {
657 if (have_output)
658 *out << ", ";
659 *out << si_u_t(promote) << " op/s promote";
660 have_output = true;
661 }
662 }
663 if (pos_delta.stats.sum.num_flush_mode_low) {
664 if (f) {
665 f->dump_int("num_flush_mode_low", pos_delta.stats.sum.num_flush_mode_low);
666 } else {
667 if (have_output)
668 *out << ", ";
669 *out << si_u_t(pos_delta.stats.sum.num_flush_mode_low) << " PGs flushing";
670 have_output = true;
671 }
672 }
673 if (pos_delta.stats.sum.num_flush_mode_high) {
674 if (f) {
675 f->dump_int("num_flush_mode_high", pos_delta.stats.sum.num_flush_mode_high);
676 } else {
677 if (have_output)
678 *out << ", ";
679 *out << si_u_t(pos_delta.stats.sum.num_flush_mode_high) << " PGs flushing (high)";
680 have_output = true;
681 }
682 }
683 if (pos_delta.stats.sum.num_evict_mode_some) {
684 if (f) {
685 f->dump_int("num_evict_mode_some", pos_delta.stats.sum.num_evict_mode_some);
686 } else {
687 if (have_output)
688 *out << ", ";
689 *out << si_u_t(pos_delta.stats.sum.num_evict_mode_some) << " PGs evicting";
690 have_output = true;
691 }
692 }
693 if (pos_delta.stats.sum.num_evict_mode_full) {
694 if (f) {
695 f->dump_int("num_evict_mode_full", pos_delta.stats.sum.num_evict_mode_full);
696 } else {
697 if (have_output)
698 *out << ", ";
699 *out << si_u_t(pos_delta.stats.sum.num_evict_mode_full) << " PGs evicting (full)";
700 }
701 }
702 }
703
704 void PGMapDigest::overall_cache_io_rate_summary(ceph::Formatter *f, ostream *out) const
705 {
706 cache_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
707 }
708
709 void PGMapDigest::pool_cache_io_rate_summary(ceph::Formatter *f, ostream *out,
710 uint64_t poolid) const
711 {
712 auto p = per_pool_sum_delta.find(poolid);
713 if (p == per_pool_sum_delta.end())
714 return;
715
716 auto ts = per_pool_sum_deltas_stamps.find(p->first);
717 ceph_assert(ts != per_pool_sum_deltas_stamps.end());
718 cache_io_rate_summary(f, out, p->second.first, ts->second);
719 }
720
721 ceph_statfs PGMapDigest::get_statfs(OSDMap &osdmap,
722 boost::optional<int64_t> data_pool) const
723 {
724 ceph_statfs statfs;
725 bool filter = false;
726 object_stat_sum_t sum;
727
728 if (data_pool) {
729 auto i = pg_pool_sum.find(*data_pool);
730 if (i != pg_pool_sum.end()) {
731 sum = i->second.stats.sum;
732 filter = true;
733 }
734 }
735
736 if (filter) {
737 statfs.kb_used = (sum.num_bytes >> 10);
738 statfs.kb_avail = get_pool_free_space(osdmap, *data_pool) >> 10;
739 statfs.num_objects = sum.num_objects;
740 statfs.kb = statfs.kb_used + statfs.kb_avail;
741 } else {
742 // these are in KB.
743 statfs.kb = osd_sum.statfs.kb();
744 statfs.kb_used = osd_sum.statfs.kb_used_raw();
745 statfs.kb_avail = osd_sum.statfs.kb_avail();
746 statfs.num_objects = pg_sum.stats.sum.num_objects;
747 }
748
749 return statfs;
750 }
751
752 void PGMapDigest::dump_pool_stats_full(
753 const OSDMap &osd_map,
754 stringstream *ss,
755 ceph::Formatter *f,
756 bool verbose) const
757 {
758 TextTable tbl;
759
760 if (f) {
761 f->open_array_section("pools");
762 } else {
763 tbl.define_column("POOL", TextTable::LEFT, TextTable::LEFT);
764 tbl.define_column("ID", TextTable::RIGHT, TextTable::RIGHT);
765 tbl.define_column("PGS", TextTable::RIGHT, TextTable::RIGHT);
766 tbl.define_column("STORED", TextTable::RIGHT, TextTable::RIGHT);
767 if (verbose) {
768 tbl.define_column("(DATA)", TextTable::RIGHT, TextTable::RIGHT);
769 tbl.define_column("(OMAP)", TextTable::RIGHT, TextTable::RIGHT);
770 }
771 tbl.define_column("OBJECTS", TextTable::RIGHT, TextTable::RIGHT);
772 tbl.define_column("USED", TextTable::RIGHT, TextTable::RIGHT);
773 if (verbose) {
774 tbl.define_column("(DATA)", TextTable::RIGHT, TextTable::RIGHT);
775 tbl.define_column("(OMAP)", TextTable::RIGHT, TextTable::RIGHT);
776 }
777 tbl.define_column("%USED", TextTable::RIGHT, TextTable::RIGHT);
778 tbl.define_column("MAX AVAIL", TextTable::RIGHT, TextTable::RIGHT);
779
780 if (verbose) {
781 tbl.define_column("QUOTA OBJECTS", TextTable::RIGHT, TextTable::RIGHT);
782 tbl.define_column("QUOTA BYTES", TextTable::RIGHT, TextTable::RIGHT);
783 tbl.define_column("DIRTY", TextTable::RIGHT, TextTable::RIGHT);
784 tbl.define_column("USED COMPR", TextTable::RIGHT, TextTable::RIGHT);
785 tbl.define_column("UNDER COMPR", TextTable::RIGHT, TextTable::RIGHT);
786 }
787 }
788
789 map<int,uint64_t> avail_by_rule;
790 for (auto p = osd_map.get_pools().begin();
791 p != osd_map.get_pools().end(); ++p) {
792 int64_t pool_id = p->first;
793 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
794 continue;
795
796 const string& pool_name = osd_map.get_pool_name(pool_id);
797 auto pool_pg_num = osd_map.get_pg_num(pool_id);
798 const pool_stat_t &stat = pg_pool_sum.at(pool_id);
799
800 const pg_pool_t *pool = osd_map.get_pg_pool(pool_id);
801 int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
802 pool->get_type(),
803 pool->get_size());
804 int64_t avail;
805 if (avail_by_rule.count(ruleno) == 0) {
806 // FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked
807 avail = get_rule_avail(ruleno);
808 if (avail < 0)
809 avail = 0;
810 avail_by_rule[ruleno] = avail;
811 } else {
812 avail = avail_by_rule[ruleno];
813 }
814 if (f) {
815 f->open_object_section("pool");
816 f->dump_string("name", pool_name);
817 f->dump_int("id", pool_id);
818 f->open_object_section("stats");
819 } else {
820 tbl << pool_name
821 << pool_id
822 << pool_pg_num;
823 }
824 float raw_used_rate = osd_map.pool_raw_used_rate(pool_id);
825 bool per_pool = use_per_pool_stats();
826 bool per_pool_omap = use_per_pool_omap_stats();
827 dump_object_stat_sum(tbl, f, stat, avail, raw_used_rate, verbose, per_pool,
828 per_pool_omap, pool);
829 if (f) {
830 f->close_section(); // stats
831 f->close_section(); // pool
832 } else {
833 tbl << TextTable::endrow;
834 }
835 }
836 if (f)
837 f->close_section();
838 else {
839 ceph_assert(ss != nullptr);
840 *ss << "--- POOLS ---\n";
841 *ss << tbl;
842 }
843 }
844
845 void PGMapDigest::dump_cluster_stats(stringstream *ss,
846 ceph::Formatter *f,
847 bool verbose) const
848 {
849 if (f) {
850 f->open_object_section("stats");
851 f->dump_int("total_bytes", osd_sum.statfs.total);
852 f->dump_int("total_avail_bytes", osd_sum.statfs.available);
853 f->dump_int("total_used_bytes", osd_sum.statfs.get_used());
854 f->dump_int("total_used_raw_bytes", osd_sum.statfs.get_used_raw());
855 f->dump_float("total_used_raw_ratio", osd_sum.statfs.get_used_raw_ratio());
856 f->dump_unsigned("num_osds", osd_sum.num_osds);
857 f->dump_unsigned("num_per_pool_osds", osd_sum.num_per_pool_osds);
858 f->dump_unsigned("num_per_pool_omap_osds", osd_sum.num_per_pool_omap_osds);
859 f->close_section();
860 f->open_object_section("stats_by_class");
861 for (auto& i : osd_sum_by_class) {
862 f->open_object_section(i.first.c_str());
863 f->dump_int("total_bytes", i.second.statfs.total);
864 f->dump_int("total_avail_bytes", i.second.statfs.available);
865 f->dump_int("total_used_bytes", i.second.statfs.get_used());
866 f->dump_int("total_used_raw_bytes", i.second.statfs.get_used_raw());
867 f->dump_float("total_used_raw_ratio",
868 i.second.statfs.get_used_raw_ratio());
869 f->close_section();
870 }
871 f->close_section();
872 } else {
873 ceph_assert(ss != nullptr);
874 TextTable tbl;
875 tbl.define_column("CLASS", TextTable::LEFT, TextTable::LEFT);
876 tbl.define_column("SIZE", TextTable::RIGHT, TextTable::RIGHT);
877 tbl.define_column("AVAIL", TextTable::RIGHT, TextTable::RIGHT);
878 tbl.define_column("USED", TextTable::RIGHT, TextTable::RIGHT);
879 tbl.define_column("RAW USED", TextTable::RIGHT, TextTable::RIGHT);
880 tbl.define_column("%RAW USED", TextTable::RIGHT, TextTable::RIGHT);
881
882
883 for (auto& i : osd_sum_by_class) {
884 tbl << i.first;
885 tbl << stringify(byte_u_t(i.second.statfs.total))
886 << stringify(byte_u_t(i.second.statfs.available))
887 << stringify(byte_u_t(i.second.statfs.get_used()))
888 << stringify(byte_u_t(i.second.statfs.get_used_raw()))
889 << percentify(i.second.statfs.get_used_raw_ratio()*100.0)
890 << TextTable::endrow;
891 }
892 tbl << "TOTAL";
893 tbl << stringify(byte_u_t(osd_sum.statfs.total))
894 << stringify(byte_u_t(osd_sum.statfs.available))
895 << stringify(byte_u_t(osd_sum.statfs.get_used()))
896 << stringify(byte_u_t(osd_sum.statfs.get_used_raw()))
897 << percentify(osd_sum.statfs.get_used_raw_ratio()*100.0)
898 << TextTable::endrow;
899
900 *ss << "--- RAW STORAGE ---\n";
901 *ss << tbl;
902 }
903 }
904
905 void PGMapDigest::dump_object_stat_sum(
906 TextTable &tbl, ceph::Formatter *f,
907 const pool_stat_t &pool_stat, uint64_t avail,
908 float raw_used_rate, bool verbose, bool per_pool, bool per_pool_omap,
909 const pg_pool_t *pool)
910 {
911 const object_stat_sum_t &sum = pool_stat.stats.sum;
912 const store_statfs_t statfs = pool_stat.store_stats;
913
914 if (sum.num_object_copies > 0) {
915 raw_used_rate *= (float)(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies;
916 }
917
918 uint64_t used_data_bytes = pool_stat.get_allocated_data_bytes(per_pool);
919 uint64_t used_omap_bytes = pool_stat.get_allocated_omap_bytes(per_pool_omap);
920 uint64_t used_bytes = used_data_bytes + used_omap_bytes;
921
922 float used = 0.0;
923 // note avail passed in is raw_avail, calc raw_used here.
924 if (avail) {
925 used = used_bytes;
926 used /= used + avail;
927 } else if (used_bytes) {
928 used = 1.0;
929 }
930 auto avail_res = raw_used_rate ? avail / raw_used_rate : 0;
931 // an approximation for actually stored user data
932 auto stored_data_normalized = pool_stat.get_user_data_bytes(
933 raw_used_rate, per_pool);
934 auto stored_omap_normalized = pool_stat.get_user_omap_bytes(
935 raw_used_rate, per_pool_omap);
936 auto stored_normalized = stored_data_normalized + stored_omap_normalized;
937 // same, amplied by replication or EC
938 auto stored_raw = stored_normalized * raw_used_rate;
939 if (f) {
940 f->dump_int("stored", stored_normalized);
941 if (verbose) {
942 f->dump_int("stored_data", stored_data_normalized);
943 f->dump_int("stored_omap", stored_omap_normalized);
944 }
945 f->dump_int("objects", sum.num_objects);
946 f->dump_int("kb_used", shift_round_up(used_bytes, 10));
947 f->dump_int("bytes_used", used_bytes);
948 if (verbose) {
949 f->dump_int("data_bytes_used", used_data_bytes);
950 f->dump_int("omap_bytes_used", used_omap_bytes);
951 }
952 f->dump_float("percent_used", used);
953 f->dump_unsigned("max_avail", avail_res);
954 if (verbose) {
955 f->dump_int("quota_objects", pool->quota_max_objects);
956 f->dump_int("quota_bytes", pool->quota_max_bytes);
957 f->dump_int("dirty", sum.num_objects_dirty);
958 f->dump_int("rd", sum.num_rd);
959 f->dump_int("rd_bytes", sum.num_rd_kb * 1024ull);
960 f->dump_int("wr", sum.num_wr);
961 f->dump_int("wr_bytes", sum.num_wr_kb * 1024ull);
962 f->dump_int("compress_bytes_used", statfs.data_compressed_allocated);
963 f->dump_int("compress_under_bytes", statfs.data_compressed_original);
964 // Stored by user amplified by replication
965 f->dump_int("stored_raw", stored_raw);
966 f->dump_unsigned("avail_raw", avail);
967 }
968 } else {
969 tbl << stringify(byte_u_t(stored_normalized));
970 if (verbose) {
971 tbl << stringify(byte_u_t(stored_data_normalized));
972 tbl << stringify(byte_u_t(stored_omap_normalized));
973 }
974 tbl << stringify(si_u_t(sum.num_objects));
975 tbl << stringify(byte_u_t(used_bytes));
976 if (verbose) {
977 tbl << stringify(byte_u_t(used_data_bytes));
978 tbl << stringify(byte_u_t(used_omap_bytes));
979 }
980 tbl << percentify(used*100);
981 tbl << stringify(byte_u_t(avail_res));
982 if (verbose) {
983 if (pool->quota_max_objects == 0)
984 tbl << "N/A";
985 else
986 tbl << stringify(si_u_t(pool->quota_max_objects));
987
988 if (pool->quota_max_bytes == 0)
989 tbl << "N/A";
990 else
991 tbl << stringify(byte_u_t(pool->quota_max_bytes));
992
993 tbl << stringify(si_u_t(sum.num_objects_dirty))
994 << stringify(byte_u_t(statfs.data_compressed_allocated))
995 << stringify(byte_u_t(statfs.data_compressed_original))
996 ;
997 }
998 }
999 }
1000
1001 int64_t PGMapDigest::get_pool_free_space(const OSDMap &osd_map,
1002 int64_t poolid) const
1003 {
1004 const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
1005 int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
1006 pool->get_type(),
1007 pool->get_size());
1008 int64_t avail;
1009 avail = get_rule_avail(ruleno);
1010 if (avail < 0)
1011 avail = 0;
1012
1013 return avail / osd_map.pool_raw_used_rate(poolid);
1014 }
1015
1016 int64_t PGMap::get_rule_avail(const OSDMap& osdmap, int ruleno) const
1017 {
1018 map<int,float> wm;
1019 int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm);
1020 if (r < 0) {
1021 return r;
1022 }
1023 if (wm.empty()) {
1024 return 0;
1025 }
1026
1027 float fratio = osdmap.get_full_ratio();
1028
1029 int64_t min = -1;
1030 for (auto p = wm.begin(); p != wm.end(); ++p) {
1031 auto osd_info = osd_stat.find(p->first);
1032 if (osd_info != osd_stat.end()) {
1033 if (osd_info->second.statfs.total == 0 || p->second == 0) {
1034 // osd must be out, hence its stats have been zeroed
1035 // (unless we somehow managed to have a disk with size 0...)
1036 //
1037 // (p->second == 0), if osd weight is 0, no need to
1038 // calculate proj below.
1039 continue;
1040 }
1041 double unusable = (double)osd_info->second.statfs.kb() *
1042 (1.0 - fratio);
1043 double avail = std::max(0.0, (double)osd_info->second.statfs.kb_avail() - unusable);
1044 avail *= 1024.0;
1045 int64_t proj = (int64_t)(avail / (double)p->second);
1046 if (min < 0 || proj < min) {
1047 min = proj;
1048 }
1049 } else {
1050 if (osdmap.is_up(p->first)) {
1051 // This is a level 4 rather than an error, because we might have
1052 // only just started, and not received the first stats message yet.
1053 dout(4) << "OSD " << p->first << " is up, but has no stats" << dendl;
1054 }
1055 }
1056 }
1057 return min;
1058 }
1059
1060 void PGMap::get_rules_avail(const OSDMap& osdmap,
1061 std::map<int,int64_t> *avail_map) const
1062 {
1063 avail_map->clear();
1064 for (auto p : osdmap.get_pools()) {
1065 int64_t pool_id = p.first;
1066 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
1067 continue;
1068 const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
1069 int ruleno = osdmap.crush->find_rule(pool->get_crush_rule(),
1070 pool->get_type(),
1071 pool->get_size());
1072 if (avail_map->count(ruleno) == 0)
1073 (*avail_map)[ruleno] = get_rule_avail(osdmap, ruleno);
1074 }
1075 }
1076
1077 // ---------------------
1078 // PGMap
1079
1080 void PGMap::Incremental::dump(ceph::Formatter *f) const
1081 {
1082 f->dump_unsigned("version", version);
1083 f->dump_stream("stamp") << stamp;
1084 f->dump_unsigned("osdmap_epoch", osdmap_epoch);
1085 f->dump_unsigned("pg_scan_epoch", pg_scan);
1086
1087 f->open_array_section("pg_stat_updates");
1088 for (auto p = pg_stat_updates.begin(); p != pg_stat_updates.end(); ++p) {
1089 f->open_object_section("pg_stat");
1090 f->dump_stream("pgid") << p->first;
1091 p->second.dump(f);
1092 f->close_section();
1093 }
1094 f->close_section();
1095
1096 f->open_array_section("osd_stat_updates");
1097 for (auto p = osd_stat_updates.begin(); p != osd_stat_updates.end(); ++p) {
1098 f->open_object_section("osd_stat");
1099 f->dump_int("osd", p->first);
1100 p->second.dump(f);
1101 f->close_section();
1102 }
1103 f->close_section();
1104 f->open_array_section("pool_statfs_updates");
1105 for (auto p = pool_statfs_updates.begin(); p != pool_statfs_updates.end(); ++p) {
1106 f->open_object_section("pool_statfs");
1107 f->dump_stream("poolid/osd") << p->first;
1108 p->second.dump(f);
1109 f->close_section();
1110 }
1111 f->close_section();
1112
1113 f->open_array_section("osd_stat_removals");
1114 for (auto p = osd_stat_rm.begin(); p != osd_stat_rm.end(); ++p)
1115 f->dump_int("osd", *p);
1116 f->close_section();
1117
1118 f->open_array_section("pg_removals");
1119 for (auto p = pg_remove.begin(); p != pg_remove.end(); ++p)
1120 f->dump_stream("pgid") << *p;
1121 f->close_section();
1122 }
1123
1124 void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
1125 {
1126 o.push_back(new Incremental);
1127 o.push_back(new Incremental);
1128 o.back()->version = 1;
1129 o.back()->stamp = utime_t(123,345);
1130 o.push_back(new Incremental);
1131 o.back()->version = 2;
1132 o.back()->pg_stat_updates[pg_t(1,2)] = pg_stat_t();
1133 o.back()->osd_stat_updates[5] = osd_stat_t();
1134 o.push_back(new Incremental);
1135 o.back()->version = 3;
1136 o.back()->osdmap_epoch = 1;
1137 o.back()->pg_scan = 2;
1138 o.back()->pg_stat_updates[pg_t(4,5)] = pg_stat_t();
1139 o.back()->osd_stat_updates[6] = osd_stat_t();
1140 o.back()->pg_remove.insert(pg_t(1,2));
1141 o.back()->osd_stat_rm.insert(5);
1142 o.back()->pool_statfs_updates[std::make_pair(1234,4)] = store_statfs_t();
1143 }
1144
1145 // --
1146
1147 void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
1148 {
1149 ceph_assert(inc.version == version+1);
1150 version++;
1151
1152 pool_stat_t pg_sum_old = pg_sum;
1153 mempool::pgmap::unordered_map<int32_t, pool_stat_t> pg_pool_sum_old;
1154 pg_pool_sum_old = pg_pool_sum;
1155
1156 for (auto p = inc.pg_stat_updates.begin();
1157 p != inc.pg_stat_updates.end();
1158 ++p) {
1159 const pg_t &update_pg(p->first);
1160 auto update_pool = update_pg.pool();
1161 const pg_stat_t &update_stat(p->second);
1162
1163 auto pg_stat_iter = pg_stat.find(update_pg);
1164 pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool];
1165 if (pg_stat_iter == pg_stat.end()) {
1166 pg_stat.insert(make_pair(update_pg, update_stat));
1167 } else {
1168 stat_pg_sub(update_pg, pg_stat_iter->second);
1169 pool_sum_ref.sub(pg_stat_iter->second);
1170 pg_stat_iter->second = update_stat;
1171 }
1172 stat_pg_add(update_pg, update_stat);
1173 pool_sum_ref.add(update_stat);
1174 }
1175
1176 for (auto p = inc.pool_statfs_updates.begin();
1177 p != inc.pool_statfs_updates.end();
1178 ++p) {
1179 auto update_pool = p->first.first;
1180 auto update_osd = p->first.second;
1181 auto& statfs_inc = p->second;
1182
1183 auto pool_statfs_iter =
1184 pool_statfs.find(std::make_pair(update_pool, update_osd));
1185 if (pg_pool_sum.count(update_pool)) {
1186 pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool];
1187 if (pool_statfs_iter == pool_statfs.end()) {
1188 pool_statfs.emplace(std::make_pair(update_pool, update_osd), statfs_inc);
1189 } else {
1190 pool_sum_ref.sub(pool_statfs_iter->second);
1191 pool_statfs_iter->second = statfs_inc;
1192 }
1193 pool_sum_ref.add(statfs_inc);
1194 }
1195 }
1196
1197 for (auto p = inc.get_osd_stat_updates().begin();
1198 p != inc.get_osd_stat_updates().end();
1199 ++p) {
1200 int osd = p->first;
1201 const osd_stat_t &new_stats(p->second);
1202
1203 auto t = osd_stat.find(osd);
1204 if (t == osd_stat.end()) {
1205 osd_stat.insert(make_pair(osd, new_stats));
1206 } else {
1207 stat_osd_sub(t->first, t->second);
1208 t->second = new_stats;
1209 }
1210 stat_osd_add(osd, new_stats);
1211 }
1212 set<int64_t> deleted_pools;
1213 for (auto p = inc.pg_remove.begin();
1214 p != inc.pg_remove.end();
1215 ++p) {
1216 const pg_t &removed_pg(*p);
1217 auto s = pg_stat.find(removed_pg);
1218 bool pool_erased = false;
1219 if (s != pg_stat.end()) {
1220 pool_erased = stat_pg_sub(removed_pg, s->second);
1221
1222 // decrease pool stats if pg was removed
1223 auto pool_stats_it = pg_pool_sum.find(removed_pg.pool());
1224 if (pool_stats_it != pg_pool_sum.end()) {
1225 pool_stats_it->second.sub(s->second);
1226 }
1227
1228 pg_stat.erase(s);
1229 if (pool_erased) {
1230 deleted_pools.insert(removed_pg.pool());
1231 }
1232 }
1233 }
1234
1235 for (auto p = inc.get_osd_stat_rm().begin();
1236 p != inc.get_osd_stat_rm().end();
1237 ++p) {
1238 auto t = osd_stat.find(*p);
1239 if (t != osd_stat.end()) {
1240 stat_osd_sub(t->first, t->second);
1241 osd_stat.erase(t);
1242 }
1243 for (auto i = pool_statfs.begin(); i != pool_statfs.end(); ++i) {
1244 if (i->first.second == *p) {
1245 pg_pool_sum[i->first.first].sub(i->second);
1246 pool_statfs.erase(i);
1247 }
1248 }
1249 }
1250
1251 // skip calculating delta while sum was not synchronized
1252 if (!stamp.is_zero() && !pg_sum_old.stats.sum.is_zero()) {
1253 utime_t delta_t;
1254 delta_t = inc.stamp;
1255 delta_t -= stamp;
1256 // calculate a delta, and average over the last 2 deltas.
1257 pool_stat_t d = pg_sum;
1258 d.stats.sub(pg_sum_old.stats);
1259 pg_sum_deltas.push_back(make_pair(d, delta_t));
1260 stamp_delta += delta_t;
1261 pg_sum_delta.stats.add(d.stats);
1262 auto smooth_intervals =
1263 cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
1264 while (pg_sum_deltas.size() > smooth_intervals) {
1265 pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats);
1266 stamp_delta -= pg_sum_deltas.front().second;
1267 pg_sum_deltas.pop_front();
1268 }
1269 }
1270 stamp = inc.stamp;
1271
1272 update_pool_deltas(cct, inc.stamp, pg_pool_sum_old);
1273
1274 for (auto p : deleted_pools) {
1275 if (cct)
1276 dout(20) << " deleted pool " << p << dendl;
1277 deleted_pool(p);
1278 }
1279
1280 if (inc.osdmap_epoch)
1281 last_osdmap_epoch = inc.osdmap_epoch;
1282 if (inc.pg_scan)
1283 last_pg_scan = inc.pg_scan;
1284 }
1285
1286 void PGMap::calc_stats()
1287 {
1288 num_pg = 0;
1289 num_pg_active = 0;
1290 num_pg_unknown = 0;
1291 num_osd = 0;
1292 pg_pool_sum.clear();
1293 num_pg_by_pool.clear();
1294 pg_by_osd.clear();
1295 pg_sum = pool_stat_t();
1296 osd_sum = osd_stat_t();
1297 osd_sum_by_class.clear();
1298 num_pg_by_state.clear();
1299 num_pg_by_pool_state.clear();
1300 num_pg_by_osd.clear();
1301
1302 for (auto p = pg_stat.begin();
1303 p != pg_stat.end();
1304 ++p) {
1305 auto pg = p->first;
1306 stat_pg_add(pg, p->second);
1307 pg_pool_sum[pg.pool()].add(p->second);
1308 }
1309 for (auto p = pool_statfs.begin();
1310 p != pool_statfs.end();
1311 ++p) {
1312 auto pool = p->first.first;
1313 pg_pool_sum[pool].add(p->second);
1314 }
1315 for (auto p = osd_stat.begin();
1316 p != osd_stat.end();
1317 ++p)
1318 stat_osd_add(p->first, p->second);
1319 }
1320
1321 void PGMap::stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
1322 bool sameosds)
1323 {
1324 auto pool = pgid.pool();
1325 pg_sum.add(s);
1326
1327 num_pg++;
1328 num_pg_by_state[s.state]++;
1329 num_pg_by_pool_state[pgid.pool()][s.state]++;
1330 num_pg_by_pool[pool]++;
1331
1332 if ((s.state & PG_STATE_CREATING) &&
1333 s.parent_split_bits == 0) {
1334 creating_pgs.insert(pgid);
1335 if (s.acting_primary >= 0) {
1336 creating_pgs_by_osd_epoch[s.acting_primary][s.mapping_epoch].insert(pgid);
1337 }
1338 }
1339
1340 if (s.state & PG_STATE_ACTIVE) {
1341 ++num_pg_active;
1342 }
1343 if (s.state == 0) {
1344 ++num_pg_unknown;
1345 }
1346
1347 if (sameosds)
1348 return;
1349
1350 for (auto p = s.blocked_by.begin();
1351 p != s.blocked_by.end();
1352 ++p) {
1353 ++blocked_by_sum[*p];
1354 }
1355
1356 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
1357 pg_by_osd[*p].insert(pgid);
1358 num_pg_by_osd[*p].acting++;
1359 }
1360 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
1361 auto& t = pg_by_osd[*p];
1362 if (t.find(pgid) == t.end()) {
1363 t.insert(pgid);
1364 num_pg_by_osd[*p].up_not_acting++;
1365 }
1366 }
1367
1368 if (s.up_primary >= 0) {
1369 num_pg_by_osd[s.up_primary].primary++;
1370 }
1371 }
1372
1373 bool PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
1374 bool sameosds)
1375 {
1376 bool pool_erased = false;
1377 pg_sum.sub(s);
1378
1379 num_pg--;
1380 int end = --num_pg_by_state[s.state];
1381 ceph_assert(end >= 0);
1382 if (end == 0)
1383 num_pg_by_state.erase(s.state);
1384 if (--num_pg_by_pool_state[pgid.pool()][s.state] == 0) {
1385 num_pg_by_pool_state[pgid.pool()].erase(s.state);
1386 }
1387 end = --num_pg_by_pool[pgid.pool()];
1388 if (end == 0) {
1389 pool_erased = true;
1390 }
1391
1392 if ((s.state & PG_STATE_CREATING) &&
1393 s.parent_split_bits == 0) {
1394 creating_pgs.erase(pgid);
1395 if (s.acting_primary >= 0) {
1396 map<epoch_t,set<pg_t> >& r = creating_pgs_by_osd_epoch[s.acting_primary];
1397 r[s.mapping_epoch].erase(pgid);
1398 if (r[s.mapping_epoch].empty())
1399 r.erase(s.mapping_epoch);
1400 if (r.empty())
1401 creating_pgs_by_osd_epoch.erase(s.acting_primary);
1402 }
1403 }
1404
1405 if (s.state & PG_STATE_ACTIVE) {
1406 --num_pg_active;
1407 }
1408 if (s.state == 0) {
1409 --num_pg_unknown;
1410 }
1411
1412 if (sameosds)
1413 return pool_erased;
1414
1415 for (auto p = s.blocked_by.begin();
1416 p != s.blocked_by.end();
1417 ++p) {
1418 auto q = blocked_by_sum.find(*p);
1419 ceph_assert(q != blocked_by_sum.end());
1420 --q->second;
1421 if (q->second == 0)
1422 blocked_by_sum.erase(q);
1423 }
1424
1425 set<int32_t> actingset;
1426 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
1427 actingset.insert(*p);
1428 auto& oset = pg_by_osd[*p];
1429 oset.erase(pgid);
1430 if (oset.empty())
1431 pg_by_osd.erase(*p);
1432 auto it = num_pg_by_osd.find(*p);
1433 if (it != num_pg_by_osd.end() && it->second.acting > 0)
1434 it->second.acting--;
1435 }
1436 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
1437 auto& oset = pg_by_osd[*p];
1438 oset.erase(pgid);
1439 if (oset.empty())
1440 pg_by_osd.erase(*p);
1441 if (actingset.count(*p))
1442 continue;
1443 auto it = num_pg_by_osd.find(*p);
1444 if (it != num_pg_by_osd.end() && it->second.up_not_acting > 0)
1445 it->second.up_not_acting--;
1446 }
1447
1448 if (s.up_primary >= 0) {
1449 auto it = num_pg_by_osd.find(s.up_primary);
1450 if (it != num_pg_by_osd.end() && it->second.primary > 0)
1451 it->second.primary--;
1452 }
1453 return pool_erased;
1454 }
1455
1456 void PGMap::calc_purged_snaps()
1457 {
1458 purged_snaps.clear();
1459 set<int64_t> unknown;
1460 for (auto& i : pg_stat) {
1461 if (i.second.state == 0) {
1462 unknown.insert(i.first.pool());
1463 purged_snaps.erase(i.first.pool());
1464 continue;
1465 } else if (unknown.count(i.first.pool())) {
1466 continue;
1467 }
1468 auto j = purged_snaps.find(i.first.pool());
1469 if (j == purged_snaps.end()) {
1470 // base case
1471 purged_snaps[i.first.pool()] = i.second.purged_snaps;
1472 } else {
1473 j->second.intersection_of(i.second.purged_snaps);
1474 }
1475 }
1476 }
1477
1478 void PGMap::calc_osd_sum_by_class(const OSDMap& osdmap)
1479 {
1480 osd_sum_by_class.clear();
1481 for (auto& i : osd_stat) {
1482 const char *class_name = osdmap.crush->get_item_class(i.first);
1483 if (class_name) {
1484 osd_sum_by_class[class_name].add(i.second);
1485 }
1486 }
1487 }
1488
1489 void PGMap::stat_osd_add(int osd, const osd_stat_t &s)
1490 {
1491 num_osd++;
1492 osd_sum.add(s);
1493 if (osd >= (int)osd_last_seq.size()) {
1494 osd_last_seq.resize(osd + 1);
1495 }
1496 osd_last_seq[osd] = s.seq;
1497 }
1498
1499 void PGMap::stat_osd_sub(int osd, const osd_stat_t &s)
1500 {
1501 num_osd--;
1502 osd_sum.sub(s);
1503 ceph_assert(osd < (int)osd_last_seq.size());
1504 osd_last_seq[osd] = 0;
1505 }
1506
1507 void PGMap::encode_digest(const OSDMap& osdmap,
1508 bufferlist& bl, uint64_t features)
1509 {
1510 get_rules_avail(osdmap, &avail_space_by_rule);
1511 calc_osd_sum_by_class(osdmap);
1512 calc_purged_snaps();
1513 PGMapDigest::encode(bl, features);
1514 }
1515
1516 void PGMap::encode(bufferlist &bl, uint64_t features) const
1517 {
1518 ENCODE_START(8, 8, bl);
1519 encode(version, bl);
1520 encode(pg_stat, bl);
1521 encode(osd_stat, bl, features);
1522 encode(last_osdmap_epoch, bl);
1523 encode(last_pg_scan, bl);
1524 encode(stamp, bl);
1525 encode(pool_statfs, bl, features);
1526 ENCODE_FINISH(bl);
1527 }
1528
1529 void PGMap::decode(bufferlist::const_iterator &bl)
1530 {
1531 DECODE_START(8, bl);
1532 decode(version, bl);
1533 decode(pg_stat, bl);
1534 decode(osd_stat, bl);
1535 decode(last_osdmap_epoch, bl);
1536 decode(last_pg_scan, bl);
1537 decode(stamp, bl);
1538 decode(pool_statfs, bl);
1539 DECODE_FINISH(bl);
1540
1541 calc_stats();
1542 }
1543
1544 void PGMap::dump(ceph::Formatter *f, bool with_net) const
1545 {
1546 dump_basic(f);
1547 dump_pg_stats(f, false);
1548 dump_pool_stats(f);
1549 dump_osd_stats(f, with_net);
1550 }
1551
1552 void PGMap::dump_basic(ceph::Formatter *f) const
1553 {
1554 f->dump_unsigned("version", version);
1555 f->dump_stream("stamp") << stamp;
1556 f->dump_unsigned("last_osdmap_epoch", last_osdmap_epoch);
1557 f->dump_unsigned("last_pg_scan", last_pg_scan);
1558
1559 f->open_object_section("pg_stats_sum");
1560 pg_sum.dump(f);
1561 f->close_section();
1562
1563 f->open_object_section("osd_stats_sum");
1564 osd_sum.dump(f);
1565 f->close_section();
1566
1567 dump_delta(f);
1568 }
1569
1570 void PGMap::dump_delta(ceph::Formatter *f) const
1571 {
1572 f->open_object_section("pg_stats_delta");
1573 pg_sum_delta.dump(f);
1574 f->dump_stream("stamp_delta") << stamp_delta;
1575 f->close_section();
1576 }
1577
1578 void PGMap::dump_pg_stats(ceph::Formatter *f, bool brief) const
1579 {
1580 f->open_array_section("pg_stats");
1581 for (auto i = pg_stat.begin();
1582 i != pg_stat.end();
1583 ++i) {
1584 f->open_object_section("pg_stat");
1585 f->dump_stream("pgid") << i->first;
1586 if (brief)
1587 i->second.dump_brief(f);
1588 else
1589 i->second.dump(f);
1590 f->close_section();
1591 }
1592 f->close_section();
1593 }
1594
1595 void PGMap::dump_pool_stats(ceph::Formatter *f) const
1596 {
1597 f->open_array_section("pool_stats");
1598 for (auto p = pg_pool_sum.begin();
1599 p != pg_pool_sum.end();
1600 ++p) {
1601 f->open_object_section("pool_stat");
1602 f->dump_int("poolid", p->first);
1603 auto q = num_pg_by_pool.find(p->first);
1604 if (q != num_pg_by_pool.end())
1605 f->dump_unsigned("num_pg", q->second);
1606 p->second.dump(f);
1607 f->close_section();
1608 }
1609 f->close_section();
1610 }
1611
1612 void PGMap::dump_osd_stats(ceph::Formatter *f, bool with_net) const
1613 {
1614 f->open_array_section("osd_stats");
1615 for (auto q = osd_stat.begin();
1616 q != osd_stat.end();
1617 ++q) {
1618 f->open_object_section("osd_stat");
1619 f->dump_int("osd", q->first);
1620 q->second.dump(f, with_net);
1621 f->close_section();
1622 }
1623 f->close_section();
1624
1625 f->open_array_section("pool_statfs");
1626 for (auto& p : pool_statfs) {
1627 f->open_object_section("item");
1628 f->dump_int("poolid", p.first.first);
1629 f->dump_int("osd", p.first.second);
1630 p.second.dump(f);
1631 f->close_section();
1632 }
1633 f->close_section();
1634 }
1635
1636 void PGMap::dump_osd_ping_times(ceph::Formatter *f) const
1637 {
1638 f->open_array_section("osd_ping_times");
1639 for (auto& [osd, stat] : osd_stat) {
1640 f->open_object_section("osd_ping_time");
1641 f->dump_int("osd", osd);
1642 stat.dump_ping_time(f);
1643 f->close_section();
1644 }
1645 f->close_section();
1646 }
1647
1648 void PGMap::dump_pg_stats_plain(
1649 ostream& ss,
1650 const mempool::pgmap::unordered_map<pg_t, pg_stat_t>& pg_stats,
1651 bool brief) const
1652 {
1653 TextTable tab;
1654
1655 if (brief){
1656 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1657 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1658 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1659 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1660 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1661 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1662 }
1663 else {
1664 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1665 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1666 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1667 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1668 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1669 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1670 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1671 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
1672 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
1673 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1674 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1675 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1676 tab.define_column("STATE_STAMP", TextTable::LEFT, TextTable::RIGHT);
1677 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
1678 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
1679 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1680 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1681 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1682 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1683 tab.define_column("LAST_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1684 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1685 tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1686 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1687 tab.define_column("SNAPTRIMQ_LEN", TextTable::LEFT, TextTable::RIGHT);
1688 }
1689
1690 for (auto i = pg_stats.begin();
1691 i != pg_stats.end(); ++i) {
1692 const pg_stat_t &st(i->second);
1693 if (brief) {
1694 tab << i->first
1695 << pg_state_string(st.state)
1696 << st.up
1697 << st.up_primary
1698 << st.acting
1699 << st.acting_primary
1700 << TextTable::endrow;
1701 } else {
1702 ostringstream reported;
1703 reported << st.reported_epoch << ":" << st.reported_seq;
1704
1705 tab << i->first
1706 << st.stats.sum.num_objects
1707 << st.stats.sum.num_objects_missing_on_primary
1708 << st.stats.sum.num_objects_degraded
1709 << st.stats.sum.num_objects_misplaced
1710 << st.stats.sum.num_objects_unfound
1711 << st.stats.sum.num_bytes
1712 << st.stats.sum.num_omap_bytes
1713 << st.stats.sum.num_omap_keys
1714 << st.log_size
1715 << st.ondisk_log_size
1716 << pg_state_string(st.state)
1717 << st.last_change
1718 << st.version
1719 << reported.str()
1720 << pg_vector_string(st.up)
1721 << st.up_primary
1722 << pg_vector_string(st.acting)
1723 << st.acting_primary
1724 << st.last_scrub
1725 << st.last_scrub_stamp
1726 << st.last_deep_scrub
1727 << st.last_deep_scrub_stamp
1728 << st.snaptrimq_len
1729 << TextTable::endrow;
1730 }
1731 }
1732
1733 ss << tab;
1734 }
1735
1736 void PGMap::dump(ostream& ss) const
1737 {
1738 dump_basic(ss);
1739 dump_pg_stats(ss, false);
1740 dump_pool_stats(ss, false);
1741 dump_pg_sum_stats(ss, false);
1742 dump_osd_stats(ss);
1743 }
1744
1745 void PGMap::dump_basic(ostream& ss) const
1746 {
1747 ss << "version " << version << std::endl;
1748 ss << "stamp " << stamp << std::endl;
1749 ss << "last_osdmap_epoch " << last_osdmap_epoch << std::endl;
1750 ss << "last_pg_scan " << last_pg_scan << std::endl;
1751 }
1752
1753 void PGMap::dump_pg_stats(ostream& ss, bool brief) const
1754 {
1755 dump_pg_stats_plain(ss, pg_stat, brief);
1756 }
1757
1758 void PGMap::dump_pool_stats(ostream& ss, bool header) const
1759 {
1760 TextTable tab;
1761
1762 if (header) {
1763 tab.define_column("POOLID", TextTable::LEFT, TextTable::LEFT);
1764 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1765 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1766 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1767 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1768 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1769 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1770 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
1771 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
1772 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1773 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1774 } else {
1775 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1776 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1777 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1778 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1779 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1780 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1781 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1782 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1783 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1784 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1785 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1786 }
1787
1788 for (auto p = pg_pool_sum.begin();
1789 p != pg_pool_sum.end();
1790 ++p) {
1791 tab << p->first
1792 << p->second.stats.sum.num_objects
1793 << p->second.stats.sum.num_objects_missing_on_primary
1794 << p->second.stats.sum.num_objects_degraded
1795 << p->second.stats.sum.num_objects_misplaced
1796 << p->second.stats.sum.num_objects_unfound
1797 << p->second.stats.sum.num_bytes
1798 << p->second.stats.sum.num_omap_bytes
1799 << p->second.stats.sum.num_omap_keys
1800 << p->second.log_size
1801 << p->second.ondisk_log_size
1802 << TextTable::endrow;
1803 }
1804
1805 ss << tab;
1806 }
1807
1808 void PGMap::dump_pg_sum_stats(ostream& ss, bool header) const
1809 {
1810 TextTable tab;
1811
1812 if (header) {
1813 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1814 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1815 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1816 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1817 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1818 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1819 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1820 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
1821 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
1822 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1823 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1824 } else {
1825 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1826 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1827 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1828 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1829 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1830 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1831 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1832 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1833 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1834 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1835 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1836 };
1837
1838 tab << "sum"
1839 << pg_sum.stats.sum.num_objects
1840 << pg_sum.stats.sum.num_objects_missing_on_primary
1841 << pg_sum.stats.sum.num_objects_degraded
1842 << pg_sum.stats.sum.num_objects_misplaced
1843 << pg_sum.stats.sum.num_objects_unfound
1844 << pg_sum.stats.sum.num_bytes
1845 << pg_sum.stats.sum.num_omap_bytes
1846 << pg_sum.stats.sum.num_omap_keys
1847 << pg_sum.log_size
1848 << pg_sum.ondisk_log_size
1849 << TextTable::endrow;
1850
1851 ss << tab;
1852 }
1853
1854 void PGMap::dump_osd_stats(ostream& ss) const
1855 {
1856 TextTable tab;
1857
1858 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1859 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1860 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
1861 tab.define_column("USED_RAW", TextTable::LEFT, TextTable::RIGHT);
1862 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
1863 tab.define_column("HB_PEERS", TextTable::LEFT, TextTable::RIGHT);
1864 tab.define_column("PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1865 tab.define_column("PRIMARY_PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1866
1867 for (auto p = osd_stat.begin();
1868 p != osd_stat.end();
1869 ++p) {
1870 tab << p->first
1871 << byte_u_t(p->second.statfs.get_used())
1872 << byte_u_t(p->second.statfs.available)
1873 << byte_u_t(p->second.statfs.get_used_raw())
1874 << byte_u_t(p->second.statfs.total)
1875 << p->second.hb_peers
1876 << get_num_pg_by_osd(p->first)
1877 << get_num_primary_pg_by_osd(p->first)
1878 << TextTable::endrow;
1879 }
1880
1881 tab << "sum"
1882 << byte_u_t(osd_sum.statfs.get_used())
1883 << byte_u_t(osd_sum.statfs.available)
1884 << byte_u_t(osd_sum.statfs.get_used_raw())
1885 << byte_u_t(osd_sum.statfs.total)
1886 << TextTable::endrow;
1887
1888 ss << tab;
1889 }
1890
1891 void PGMap::dump_osd_sum_stats(ostream& ss) const
1892 {
1893 TextTable tab;
1894
1895 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1896 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1897 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
1898 tab.define_column("USED_RAW", TextTable::LEFT, TextTable::RIGHT);
1899 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
1900
1901 tab << "sum"
1902 << byte_u_t(osd_sum.statfs.get_used())
1903 << byte_u_t(osd_sum.statfs.available)
1904 << byte_u_t(osd_sum.statfs.get_used_raw())
1905 << byte_u_t(osd_sum.statfs.total)
1906 << TextTable::endrow;
1907
1908 ss << tab;
1909 }
1910
1911 void PGMap::get_stuck_stats(
1912 int types, const utime_t cutoff,
1913 mempool::pgmap::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const
1914 {
1915 ceph_assert(types != 0);
1916 for (auto i = pg_stat.begin();
1917 i != pg_stat.end();
1918 ++i) {
1919 utime_t val = cutoff; // don't care about >= cutoff so that is infinity
1920
1921 if ((types & STUCK_INACTIVE) && !(i->second.state & PG_STATE_ACTIVE)) {
1922 if (i->second.last_active < val)
1923 val = i->second.last_active;
1924 }
1925
1926 if ((types & STUCK_UNCLEAN) && !(i->second.state & PG_STATE_CLEAN)) {
1927 if (i->second.last_clean < val)
1928 val = i->second.last_clean;
1929 }
1930
1931 if ((types & STUCK_DEGRADED) && (i->second.state & PG_STATE_DEGRADED)) {
1932 if (i->second.last_undegraded < val)
1933 val = i->second.last_undegraded;
1934 }
1935
1936 if ((types & STUCK_UNDERSIZED) && (i->second.state & PG_STATE_UNDERSIZED)) {
1937 if (i->second.last_fullsized < val)
1938 val = i->second.last_fullsized;
1939 }
1940
1941 if ((types & STUCK_STALE) && (i->second.state & PG_STATE_STALE)) {
1942 if (i->second.last_unstale < val)
1943 val = i->second.last_unstale;
1944 }
1945
1946 // val is now the earliest any of the requested stuck states began
1947 if (val < cutoff) {
1948 stuck_pgs[i->first] = i->second;
1949 }
1950 }
1951 }
1952
1953 bool PGMap::get_stuck_counts(const utime_t cutoff, map<string, int>& note) const
1954 {
1955 int inactive = 0;
1956 int unclean = 0;
1957 int degraded = 0;
1958 int undersized = 0;
1959 int stale = 0;
1960
1961 for (auto i = pg_stat.begin();
1962 i != pg_stat.end();
1963 ++i) {
1964 if (! (i->second.state & PG_STATE_ACTIVE)) {
1965 if (i->second.last_active < cutoff)
1966 ++inactive;
1967 }
1968 if (! (i->second.state & PG_STATE_CLEAN)) {
1969 if (i->second.last_clean < cutoff)
1970 ++unclean;
1971 }
1972 if (i->second.state & PG_STATE_DEGRADED) {
1973 if (i->second.last_undegraded < cutoff)
1974 ++degraded;
1975 }
1976 if (i->second.state & PG_STATE_UNDERSIZED) {
1977 if (i->second.last_fullsized < cutoff)
1978 ++undersized;
1979 }
1980 if (i->second.state & PG_STATE_STALE) {
1981 if (i->second.last_unstale < cutoff)
1982 ++stale;
1983 }
1984 }
1985
1986 if (inactive)
1987 note["stuck inactive"] = inactive;
1988
1989 if (unclean)
1990 note["stuck unclean"] = unclean;
1991
1992 if (undersized)
1993 note["stuck undersized"] = undersized;
1994
1995 if (degraded)
1996 note["stuck degraded"] = degraded;
1997
1998 if (stale)
1999 note["stuck stale"] = stale;
2000
2001 return inactive || unclean || undersized || degraded || stale;
2002 }
2003
2004 void PGMap::dump_stuck(ceph::Formatter *f, int types, utime_t cutoff) const
2005 {
2006 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
2007 get_stuck_stats(types, cutoff, stuck_pg_stats);
2008 f->open_array_section("stuck_pg_stats");
2009 for (auto i = stuck_pg_stats.begin();
2010 i != stuck_pg_stats.end();
2011 ++i) {
2012 f->open_object_section("pg_stat");
2013 f->dump_stream("pgid") << i->first;
2014 i->second.dump(f);
2015 f->close_section();
2016 }
2017 f->close_section();
2018 }
2019
2020 void PGMap::dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const
2021 {
2022 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
2023 get_stuck_stats(types, cutoff, stuck_pg_stats);
2024 if (!stuck_pg_stats.empty())
2025 dump_pg_stats_plain(ss, stuck_pg_stats, true);
2026 }
2027
2028 int PGMap::dump_stuck_pg_stats(
2029 stringstream &ds,
2030 ceph::Formatter *f,
2031 int threshold,
2032 vector<string>& args) const
2033 {
2034 int stuck_types = 0;
2035
2036 for (auto i = args.begin(); i != args.end(); ++i) {
2037 if (*i == "inactive")
2038 stuck_types |= PGMap::STUCK_INACTIVE;
2039 else if (*i == "unclean")
2040 stuck_types |= PGMap::STUCK_UNCLEAN;
2041 else if (*i == "undersized")
2042 stuck_types |= PGMap::STUCK_UNDERSIZED;
2043 else if (*i == "degraded")
2044 stuck_types |= PGMap::STUCK_DEGRADED;
2045 else if (*i == "stale")
2046 stuck_types |= PGMap::STUCK_STALE;
2047 else {
2048 ds << "Unknown type: " << *i << std::endl;
2049 return -EINVAL;
2050 }
2051 }
2052
2053 utime_t now(ceph_clock_now());
2054 utime_t cutoff = now - utime_t(threshold, 0);
2055
2056 if (!f) {
2057 dump_stuck_plain(ds, stuck_types, cutoff);
2058 } else {
2059 dump_stuck(f, stuck_types, cutoff);
2060 f->flush(ds);
2061 }
2062
2063 return 0;
2064 }
2065
2066 void PGMap::dump_osd_perf_stats(ceph::Formatter *f) const
2067 {
2068 f->open_array_section("osd_perf_infos");
2069 for (auto i = osd_stat.begin();
2070 i != osd_stat.end();
2071 ++i) {
2072 f->open_object_section("osd");
2073 f->dump_int("id", i->first);
2074 {
2075 f->open_object_section("perf_stats");
2076 i->second.os_perf_stat.dump(f);
2077 f->close_section();
2078 }
2079 f->close_section();
2080 }
2081 f->close_section();
2082 }
2083 void PGMap::print_osd_perf_stats(std::ostream *ss) const
2084 {
2085 TextTable tab;
2086 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2087 tab.define_column("commit_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2088 tab.define_column("apply_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2089 for (auto i = osd_stat.begin();
2090 i != osd_stat.end();
2091 ++i) {
2092 tab << i->first;
2093 tab << i->second.os_perf_stat.os_commit_latency_ns / 1000000ull;
2094 tab << i->second.os_perf_stat.os_apply_latency_ns / 1000000ull;
2095 tab << TextTable::endrow;
2096 }
2097 (*ss) << tab;
2098 }
2099
2100 void PGMap::dump_osd_blocked_by_stats(ceph::Formatter *f) const
2101 {
2102 f->open_array_section("osd_blocked_by_infos");
2103 for (auto i = blocked_by_sum.begin();
2104 i != blocked_by_sum.end();
2105 ++i) {
2106 f->open_object_section("osd");
2107 f->dump_int("id", i->first);
2108 f->dump_int("num_blocked", i->second);
2109 f->close_section();
2110 }
2111 f->close_section();
2112 }
2113 void PGMap::print_osd_blocked_by_stats(std::ostream *ss) const
2114 {
2115 TextTable tab;
2116 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2117 tab.define_column("num_blocked", TextTable::LEFT, TextTable::RIGHT);
2118 for (auto i = blocked_by_sum.begin();
2119 i != blocked_by_sum.end();
2120 ++i) {
2121 tab << i->first;
2122 tab << i->second;
2123 tab << TextTable::endrow;
2124 }
2125 (*ss) << tab;
2126 }
2127
2128
2129 /**
2130 * update aggregated delta
2131 *
2132 * @param cct ceph context
2133 * @param ts Timestamp for the stats being delta'ed
2134 * @param old_pool_sum Previous stats sum
2135 * @param last_ts Last timestamp for pool
2136 * @param result_pool_sum Resulting stats
2137 * @param result_pool_delta Resulting pool delta
2138 * @param result_ts_delta Resulting timestamp delta
2139 * @param delta_avg_list List of last N computed deltas, used to average
2140 */
2141 void PGMap::update_delta(
2142 CephContext *cct,
2143 const utime_t ts,
2144 const pool_stat_t& old_pool_sum,
2145 utime_t *last_ts,
2146 const pool_stat_t& current_pool_sum,
2147 pool_stat_t *result_pool_delta,
2148 utime_t *result_ts_delta,
2149 mempool::pgmap::list<pair<pool_stat_t,utime_t> > *delta_avg_list)
2150 {
2151 /* @p ts is the timestamp we want to associate with the data
2152 * in @p old_pool_sum, and on which we will base ourselves to
2153 * calculate the delta, stored in 'delta_t'.
2154 */
2155 utime_t delta_t;
2156 delta_t = ts; // start with the provided timestamp
2157 delta_t -= *last_ts; // take the last timestamp we saw
2158 *last_ts = ts; // @p ts becomes the last timestamp we saw
2159
2160 // adjust delta_t, quick start if there is no update in a long period
2161 delta_t = std::min(delta_t,
2162 utime_t(2 * (cct ? cct->_conf->mon_delta_reset_interval : 10), 0));
2163
2164 // calculate a delta, and average over the last 6 deltas by default.
2165 /* start by taking a copy of our current @p result_pool_sum, and by
2166 * taking out the stats from @p old_pool_sum. This generates a stats
2167 * delta. Stash this stats delta in @p delta_avg_list, along with the
2168 * timestamp delta for these results.
2169 */
2170 pool_stat_t d = current_pool_sum;
2171 d.stats.sub(old_pool_sum.stats);
2172
2173 /* Aggregate current delta, and take out the last seen delta (if any) to
2174 * average it out.
2175 * Skip calculating delta while sum was not synchronized.
2176 */
2177 if(!old_pool_sum.stats.sum.is_zero()) {
2178 delta_avg_list->push_back(make_pair(d,delta_t));
2179 *result_ts_delta += delta_t;
2180 result_pool_delta->stats.add(d.stats);
2181 }
2182 size_t s = cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
2183 while (delta_avg_list->size() > s) {
2184 result_pool_delta->stats.sub(delta_avg_list->front().first.stats);
2185 *result_ts_delta -= delta_avg_list->front().second;
2186 delta_avg_list->pop_front();
2187 }
2188 }
2189
2190 /**
2191 * Update a given pool's deltas
2192 *
2193 * @param cct Ceph Context
2194 * @param ts Timestamp for the stats being delta'ed
2195 * @param pool Pool's id
2196 * @param old_pool_sum Previous stats sum
2197 */
2198 void PGMap::update_one_pool_delta(
2199 CephContext *cct,
2200 const utime_t ts,
2201 const int64_t pool,
2202 const pool_stat_t& old_pool_sum)
2203 {
2204 if (per_pool_sum_deltas.count(pool) == 0) {
2205 ceph_assert(per_pool_sum_deltas_stamps.count(pool) == 0);
2206 ceph_assert(per_pool_sum_delta.count(pool) == 0);
2207 }
2208
2209 auto& sum_delta = per_pool_sum_delta[pool];
2210
2211 update_delta(cct, ts, old_pool_sum, &sum_delta.second, pg_pool_sum[pool],
2212 &sum_delta.first, &per_pool_sum_deltas_stamps[pool],
2213 &per_pool_sum_deltas[pool]);
2214 }
2215
2216 /**
2217 * Update pools' deltas
2218 *
2219 * @param cct CephContext
2220 * @param ts Timestamp for the stats being delta'ed
2221 * @param pg_pool_sum_old Map of pool stats for delta calcs.
2222 */
2223 void PGMap::update_pool_deltas(
2224 CephContext *cct, const utime_t ts,
2225 const mempool::pgmap::unordered_map<int32_t,pool_stat_t>& pg_pool_sum_old)
2226 {
2227 for (auto it = pg_pool_sum_old.begin();
2228 it != pg_pool_sum_old.end(); ++it) {
2229 update_one_pool_delta(cct, ts, it->first, it->second);
2230 }
2231 }
2232
2233 void PGMap::clear_delta()
2234 {
2235 pg_sum_delta = pool_stat_t();
2236 pg_sum_deltas.clear();
2237 stamp_delta = utime_t();
2238 }
2239
2240 void PGMap::generate_test_instances(list<PGMap*>& o)
2241 {
2242 o.push_back(new PGMap);
2243 list<Incremental*> inc;
2244 Incremental::generate_test_instances(inc);
2245 delete inc.front();
2246 inc.pop_front();
2247 while (!inc.empty()) {
2248 PGMap *pmp = new PGMap();
2249 *pmp = *o.back();
2250 o.push_back(pmp);
2251 o.back()->apply_incremental(NULL, *inc.front());
2252 delete inc.front();
2253 inc.pop_front();
2254 }
2255 }
2256
2257 void PGMap::get_filtered_pg_stats(uint64_t state, int64_t poolid, int64_t osdid,
2258 bool primary, set<pg_t>& pgs) const
2259 {
2260 for (auto i = pg_stat.begin();
2261 i != pg_stat.end();
2262 ++i) {
2263 if ((poolid >= 0) && (poolid != i->first.pool()))
2264 continue;
2265 if ((osdid >= 0) && !(i->second.is_acting_osd(osdid,primary)))
2266 continue;
2267 if (state == (uint64_t)-1 || // "all"
2268 (i->second.state & state) || // matches a state bit
2269 (state == 0 && i->second.state == 0)) { // matches "unknown" (== 0)
2270 pgs.insert(i->first);
2271 }
2272 }
2273 }
2274
2275 void PGMap::dump_filtered_pg_stats(ceph::Formatter *f, set<pg_t>& pgs) const
2276 {
2277 f->open_array_section("pg_stats");
2278 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
2279 const pg_stat_t& st = pg_stat.at(*i);
2280 f->open_object_section("pg_stat");
2281 f->dump_stream("pgid") << *i;
2282 st.dump(f);
2283 f->close_section();
2284 }
2285 f->close_section();
2286 }
2287
2288 void PGMap::dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const
2289 {
2290 TextTable tab;
2291 utime_t now = ceph_clock_now();
2292
2293 tab.define_column("PG", TextTable::LEFT, TextTable::LEFT);
2294 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
2295 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
2296 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
2297 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
2298 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
2299 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
2300 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
2301 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
2302 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
2303 tab.define_column("SINCE", TextTable::LEFT, TextTable::RIGHT);
2304 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
2305 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
2306 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
2307 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
2308 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
2309 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
2310
2311 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
2312 const pg_stat_t& st = pg_stat.at(*i);
2313
2314 ostringstream reported;
2315 reported << st.reported_epoch << ":" << st.reported_seq;
2316
2317 ostringstream upstr, actingstr;
2318 upstr << pg_vector_string(st.up) << 'p' << st.up_primary;
2319 actingstr << pg_vector_string(st.acting) << 'p' << st.acting_primary;
2320 tab << *i
2321 << st.stats.sum.num_objects
2322 << st.stats.sum.num_objects_degraded
2323 << st.stats.sum.num_objects_misplaced
2324 << st.stats.sum.num_objects_unfound
2325 << st.stats.sum.num_bytes
2326 << st.stats.sum.num_omap_bytes
2327 << st.stats.sum.num_omap_keys
2328 << st.log_size
2329 << pg_state_string(st.state)
2330 << utimespan_str(now - st.last_change)
2331 << st.version
2332 << reported.str()
2333 << upstr.str()
2334 << actingstr.str()
2335 << st.last_scrub_stamp
2336 << st.last_deep_scrub_stamp
2337 << TextTable::endrow;
2338 }
2339
2340 ss << tab;
2341 }
2342
2343 void PGMap::dump_pool_stats_and_io_rate(int64_t poolid, const OSDMap &osd_map,
2344 ceph::Formatter *f,
2345 stringstream *rs) const {
2346 string pool_name = osd_map.get_pool_name(poolid);
2347 if (f) {
2348 f->open_object_section("pool");
2349 f->dump_string("pool_name", pool_name.c_str());
2350 f->dump_int("pool_id", poolid);
2351 f->open_object_section("recovery");
2352 }
2353 list<string> sl;
2354 stringstream tss;
2355 pool_recovery_summary(f, &sl, poolid);
2356 if (!f && !sl.empty()) {
2357 for (auto &p : sl)
2358 tss << " " << p << "\n";
2359 }
2360 if (f) {
2361 f->close_section(); // object section recovery
2362 f->open_object_section("recovery_rate");
2363 }
2364 ostringstream rss;
2365 pool_recovery_rate_summary(f, &rss, poolid);
2366 if (!f && !rss.str().empty())
2367 tss << " recovery io " << rss.str() << "\n";
2368 if (f) {
2369 f->close_section(); // object section recovery_rate
2370 f->open_object_section("client_io_rate");
2371 }
2372 rss.clear();
2373 rss.str("");
2374 pool_client_io_rate_summary(f, &rss, poolid);
2375 if (!f && !rss.str().empty())
2376 tss << " client io " << rss.str() << "\n";
2377 // dump cache tier IO rate for cache pool
2378 const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
2379 if (pool->is_tier()) {
2380 if (f) {
2381 f->close_section(); // object section client_io_rate
2382 f->open_object_section("cache_io_rate");
2383 }
2384 rss.clear();
2385 rss.str("");
2386 pool_cache_io_rate_summary(f, &rss, poolid);
2387 if (!f && !rss.str().empty())
2388 tss << " cache tier io " << rss.str() << "\n";
2389 }
2390 if (f) {
2391 f->close_section(); // object section cache_io_rate
2392 f->close_section(); // object section pool
2393 } else {
2394 *rs << "pool " << pool_name << " id " << poolid << "\n";
2395 if (!tss.str().empty())
2396 *rs << tss.str() << "\n";
2397 else
2398 *rs << " nothing is going on\n\n";
2399 }
2400 }
2401
2402 // Get crush parentage for an osd (skip root)
2403 set<std::string> PGMap::osd_parentage(const OSDMap& osdmap, int id) const
2404 {
2405 set<std::string> reporters_by_subtree;
2406 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
2407
2408 auto loc = osdmap.crush->get_full_location(id);
2409 for (auto& [parent_bucket_type, parent_id] : loc) {
2410 // Should we show the root? Might not be too informative like "default"
2411 if (parent_bucket_type != "root" &&
2412 parent_bucket_type != reporter_subtree_level) {
2413 reporters_by_subtree.insert(parent_id);
2414 }
2415 }
2416 return reporters_by_subtree;
2417 }
2418
2419 void PGMap::get_health_checks(
2420 CephContext *cct,
2421 const OSDMap& osdmap,
2422 health_check_map_t *checks) const
2423 {
2424 utime_t now = ceph_clock_now();
2425 const auto max = cct->_conf.get_val<uint64_t>("mon_health_max_detail");
2426 const auto& pools = osdmap.get_pools();
2427
2428 typedef enum pg_consequence_t {
2429 UNAVAILABLE = 1, // Client IO to the pool may block
2430 DEGRADED = 2, // Fewer than the requested number of replicas are present
2431 BACKFILL_FULL = 3, // Backfill is blocked for space considerations
2432 // This may or may not be a deadlock condition.
2433 DAMAGED = 4, // The data may be missing or inconsistent on disk and
2434 // requires repair
2435 RECOVERY_FULL = 5 // Recovery is blocked because OSDs are full
2436 } pg_consequence_t;
2437
2438 // For a given PG state, how should it be reported at the pool level?
2439 class PgStateResponse {
2440 public:
2441 pg_consequence_t consequence;
2442 typedef std::function< utime_t(const pg_stat_t&) > stuck_cb;
2443 stuck_cb stuck_since;
2444 bool invert;
2445
2446 PgStateResponse(const pg_consequence_t& c, stuck_cb&& s)
2447 : consequence(c), stuck_since(std::move(s)), invert(false)
2448 {
2449 }
2450
2451 PgStateResponse(const pg_consequence_t& c, stuck_cb&& s, bool i)
2452 : consequence(c), stuck_since(std::move(s)), invert(i)
2453 {
2454 }
2455 };
2456
2457 // Record the PG state counts that contributed to a reported pool state
2458 class PgCauses {
2459 public:
2460 // Map of PG_STATE_* to number of pgs in that state.
2461 std::map<unsigned, unsigned> states;
2462
2463 // List of all PG IDs that had a state contributing
2464 // to this health condition.
2465 std::set<pg_t> pgs;
2466
2467 std::map<pg_t, std::string> pg_messages;
2468 };
2469
2470 // Map of PG state to how to respond to it
2471 std::map<unsigned, PgStateResponse> state_to_response = {
2472 // Immediate reports
2473 { PG_STATE_INCONSISTENT, {DAMAGED, {}} },
2474 { PG_STATE_INCOMPLETE, {UNAVAILABLE, {}} },
2475 { PG_STATE_SNAPTRIM_ERROR, {DAMAGED, {}} },
2476 { PG_STATE_RECOVERY_UNFOUND, {DAMAGED, {}} },
2477 { PG_STATE_BACKFILL_UNFOUND, {DAMAGED, {}} },
2478 { PG_STATE_BACKFILL_TOOFULL, {BACKFILL_FULL, {}} },
2479 { PG_STATE_RECOVERY_TOOFULL, {RECOVERY_FULL, {}} },
2480 { PG_STATE_DEGRADED, {DEGRADED, {}} },
2481 { PG_STATE_DOWN, {UNAVAILABLE, {}} },
2482 // Delayed (wait until stuck) reports
2483 { PG_STATE_PEERING, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_peered;} } },
2484 { PG_STATE_UNDERSIZED, {DEGRADED, [](const pg_stat_t &p){return p.last_fullsized;} } },
2485 { PG_STATE_STALE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_unstale;} } },
2486 // Delayed and inverted reports
2487 { PG_STATE_ACTIVE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_active;}, true} }
2488 };
2489
2490 // Specialized state printer that takes account of inversion of
2491 // ACTIVE, CLEAN checks.
2492 auto state_name = [](const uint64_t &state) {
2493 // Special cases for the states that are inverted checks
2494 if (state == PG_STATE_CLEAN) {
2495 return std::string("unclean");
2496 } else if (state == PG_STATE_ACTIVE) {
2497 return std::string("inactive");
2498 } else {
2499 return pg_state_string(state);
2500 }
2501 };
2502
2503 // Map of what is wrong to information about why, implicitly also stores
2504 // the list of what is wrong.
2505 std::map<pg_consequence_t, PgCauses> detected;
2506
2507 // Optimisation: trim down the number of checks to apply based on
2508 // the summary counters
2509 std::map<unsigned, PgStateResponse> possible_responses;
2510 for (const auto &i : num_pg_by_state) {
2511 for (const auto &j : state_to_response) {
2512 if (!j.second.invert) {
2513 // Check for normal tests by seeing if any pgs have the flag
2514 if (i.first & j.first) {
2515 possible_responses.insert(j);
2516 }
2517 }
2518 }
2519 }
2520
2521 for (const auto &j : state_to_response) {
2522 if (j.second.invert) {
2523 // Check for inverted tests by seeing if not-all pgs have the flag
2524 const auto &found = num_pg_by_state.find(j.first);
2525 if (found == num_pg_by_state.end() || found->second != num_pg) {
2526 possible_responses.insert(j);
2527 }
2528 }
2529 }
2530
2531 utime_t cutoff = now - utime_t(cct->_conf.get_val<int64_t>("mon_pg_stuck_threshold"), 0);
2532 // Loop over all PGs, if there are any possibly-unhealthy states in there
2533 if (!possible_responses.empty()) {
2534 for (const auto& i : pg_stat) {
2535 const auto &pg_id = i.first;
2536 const auto &pg_info = i.second;
2537
2538 for (const auto &j : state_to_response) {
2539 const auto &pg_response_state = j.first;
2540 const auto &pg_response = j.second;
2541
2542 // Apply the state test
2543 if (!(bool(pg_info.state & pg_response_state) != pg_response.invert)) {
2544 continue;
2545 }
2546
2547 // Apply stuckness test if needed
2548 if (pg_response.stuck_since) {
2549 // Delayed response, check for stuckness
2550 utime_t last_whatever = pg_response.stuck_since(pg_info);
2551 if (last_whatever.is_zero() &&
2552 pg_info.last_change >= cutoff) {
2553 // still moving, ignore
2554 continue;
2555 } else if (last_whatever >= cutoff) {
2556 // Not stuck enough, ignore.
2557 continue;
2558 } else {
2559
2560 }
2561 }
2562
2563 auto &causes = detected[pg_response.consequence];
2564 causes.states[pg_response_state]++;
2565 causes.pgs.insert(pg_id);
2566
2567 // Don't bother composing detail string if we have already recorded
2568 // too many
2569 if (causes.pg_messages.size() > max) {
2570 continue;
2571 }
2572
2573 std::ostringstream ss;
2574 if (pg_response.stuck_since) {
2575 utime_t since = pg_response.stuck_since(pg_info);
2576 ss << "pg " << pg_id << " is stuck " << state_name(pg_response_state);
2577 if (since == utime_t()) {
2578 ss << " since forever";
2579 } else {
2580 utime_t dur = now - since;
2581 ss << " for " << utimespan_str(dur);
2582 }
2583 ss << ", current state " << pg_state_string(pg_info.state)
2584 << ", last acting " << pg_info.acting;
2585 } else {
2586 ss << "pg " << pg_id << " is "
2587 << pg_state_string(pg_info.state);
2588 ss << ", acting " << pg_info.acting;
2589 if (pg_info.stats.sum.num_objects_unfound) {
2590 ss << ", " << pg_info.stats.sum.num_objects_unfound
2591 << " unfound";
2592 }
2593 }
2594
2595 if (pg_info.state & PG_STATE_INCOMPLETE) {
2596 const pg_pool_t *pi = osdmap.get_pg_pool(pg_id.pool());
2597 if (pi && pi->min_size > 1) {
2598 ss << " (reducing pool "
2599 << osdmap.get_pool_name(pg_id.pool())
2600 << " min_size from " << (int)pi->min_size
2601 << " may help; search ceph.com/docs for 'incomplete')";
2602 }
2603 }
2604
2605 causes.pg_messages[pg_id] = ss.str();
2606 }
2607 }
2608 } else {
2609 dout(10) << __func__ << " skipping loop over PGs: counters look OK" << dendl;
2610 }
2611
2612 for (const auto &i : detected) {
2613 std::string health_code;
2614 health_status_t sev;
2615 std::string summary;
2616 switch(i.first) {
2617 case UNAVAILABLE:
2618 health_code = "PG_AVAILABILITY";
2619 sev = HEALTH_WARN;
2620 summary = "Reduced data availability: ";
2621 break;
2622 case DEGRADED:
2623 health_code = "PG_DEGRADED";
2624 summary = "Degraded data redundancy: ";
2625 sev = HEALTH_WARN;
2626 break;
2627 case BACKFILL_FULL:
2628 health_code = "PG_BACKFILL_FULL";
2629 summary = "Low space hindering backfill (add storage if this doesn't resolve itself): ";
2630 sev = HEALTH_WARN;
2631 break;
2632 case DAMAGED:
2633 health_code = "PG_DAMAGED";
2634 summary = "Possible data damage: ";
2635 sev = HEALTH_ERR;
2636 break;
2637 case RECOVERY_FULL:
2638 health_code = "PG_RECOVERY_FULL";
2639 summary = "Full OSDs blocking recovery: ";
2640 sev = HEALTH_ERR;
2641 break;
2642 default:
2643 ceph_abort();
2644 }
2645
2646 if (i.first == DEGRADED) {
2647 if (pg_sum.stats.sum.num_objects_degraded &&
2648 pg_sum.stats.sum.num_object_copies > 0) {
2649 double pc = (double)pg_sum.stats.sum.num_objects_degraded /
2650 (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
2651 char b[20];
2652 snprintf(b, sizeof(b), "%.3lf", pc);
2653 ostringstream ss;
2654 ss << pg_sum.stats.sum.num_objects_degraded
2655 << "/" << pg_sum.stats.sum.num_object_copies << " objects degraded ("
2656 << b << "%)";
2657
2658 // Throw in a comma for the benefit of the following PG counts
2659 summary += ss.str() + ", ";
2660 }
2661 }
2662
2663 // Compose summary message saying how many PGs in what states led
2664 // to this health check failing
2665 std::vector<std::string> pg_msgs;
2666 int64_t count = 0;
2667 for (const auto &j : i.second.states) {
2668 std::ostringstream msg;
2669 msg << j.second << (j.second > 1 ? " pgs " : " pg ") << state_name(j.first);
2670 pg_msgs.push_back(msg.str());
2671 count += j.second;
2672 }
2673 summary += joinify(pg_msgs.begin(), pg_msgs.end(), std::string(", "));
2674
2675 health_check_t *check = &checks->add(
2676 health_code,
2677 sev,
2678 summary,
2679 count);
2680
2681 // Compose list of PGs contributing to this health check failing
2682 for (const auto &j : i.second.pg_messages) {
2683 check->detail.push_back(j.second);
2684 }
2685 }
2686
2687 // OSD_SCRUB_ERRORS
2688 if (pg_sum.stats.sum.num_scrub_errors) {
2689 ostringstream ss;
2690 ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors";
2691 checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str(),
2692 pg_sum.stats.sum.num_scrub_errors);
2693 }
2694
2695 // LARGE_OMAP_OBJECTS
2696 if (pg_sum.stats.sum.num_large_omap_objects) {
2697 list<string> detail;
2698 for (auto &pool : pools) {
2699 const string& pool_name = osdmap.get_pool_name(pool.first);
2700 auto it2 = pg_pool_sum.find(pool.first);
2701 if (it2 == pg_pool_sum.end()) {
2702 continue;
2703 }
2704 const pool_stat_t *pstat = &it2->second;
2705 if (pstat == nullptr) {
2706 continue;
2707 }
2708 const object_stat_sum_t& sum = pstat->stats.sum;
2709 if (sum.num_large_omap_objects) {
2710 stringstream ss;
2711 ss << sum.num_large_omap_objects << " large objects found in pool "
2712 << "'" << pool_name << "'";
2713 detail.push_back(ss.str());
2714 }
2715 }
2716 if (!detail.empty()) {
2717 ostringstream ss;
2718 ss << pg_sum.stats.sum.num_large_omap_objects << " large omap objects";
2719 auto& d = checks->add("LARGE_OMAP_OBJECTS", HEALTH_WARN, ss.str(),
2720 pg_sum.stats.sum.num_large_omap_objects);
2721 stringstream tip;
2722 tip << "Search the cluster log for 'Large omap object found' for more "
2723 << "details.";
2724 detail.push_back(tip.str());
2725 d.detail.swap(detail);
2726 }
2727 }
2728
2729 // CACHE_POOL_NEAR_FULL
2730 {
2731 list<string> detail;
2732 unsigned num_pools = 0;
2733 for (auto& p : pools) {
2734 if ((!p.second.target_max_objects && !p.second.target_max_bytes) ||
2735 !pg_pool_sum.count(p.first)) {
2736 continue;
2737 }
2738 bool nearfull = false;
2739 const string& name = osdmap.get_pool_name(p.first);
2740 const pool_stat_t& st = get_pg_pool_sum_stat(p.first);
2741 uint64_t ratio = p.second.cache_target_full_ratio_micro +
2742 ((1000000 - p.second.cache_target_full_ratio_micro) *
2743 cct->_conf->mon_cache_target_full_warn_ratio);
2744 if (p.second.target_max_objects &&
2745 (uint64_t)(st.stats.sum.num_objects -
2746 st.stats.sum.num_objects_hit_set_archive) >
2747 p.second.target_max_objects * (ratio / 1000000.0)) {
2748 ostringstream ss;
2749 ss << "cache pool '" << name << "' with "
2750 << si_u_t(st.stats.sum.num_objects)
2751 << " objects at/near target max "
2752 << si_u_t(p.second.target_max_objects) << " objects";
2753 detail.push_back(ss.str());
2754 nearfull = true;
2755 }
2756 if (p.second.target_max_bytes &&
2757 (uint64_t)(st.stats.sum.num_bytes -
2758 st.stats.sum.num_bytes_hit_set_archive) >
2759 p.second.target_max_bytes * (ratio / 1000000.0)) {
2760 ostringstream ss;
2761 ss << "cache pool '" << name
2762 << "' with " << byte_u_t(st.stats.sum.num_bytes)
2763 << " at/near target max "
2764 << byte_u_t(p.second.target_max_bytes);
2765 detail.push_back(ss.str());
2766 nearfull = true;
2767 }
2768 if (nearfull) {
2769 ++num_pools;
2770 }
2771 }
2772 if (!detail.empty()) {
2773 ostringstream ss;
2774 ss << num_pools << " cache pools at or near target size";
2775 auto& d = checks->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN, ss.str(),
2776 num_pools);
2777 d.detail.swap(detail);
2778 }
2779 }
2780
2781 // TOO_FEW_PGS
2782 unsigned num_in = osdmap.get_num_in_osds();
2783 auto sum_pg_up = std::max(static_cast<size_t>(pg_sum.up), pg_stat.size());
2784 const auto min_pg_per_osd =
2785 cct->_conf.get_val<uint64_t>("mon_pg_warn_min_per_osd");
2786 if (num_in && min_pg_per_osd > 0 && osdmap.get_pools().size() > 0) {
2787 auto per = sum_pg_up / num_in;
2788 if (per < min_pg_per_osd && per) {
2789 ostringstream ss;
2790 ss << "too few PGs per OSD (" << per
2791 << " < min " << min_pg_per_osd << ")";
2792 checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str(),
2793 min_pg_per_osd - per);
2794 }
2795 }
2796
2797 // TOO_MANY_PGS
2798 auto max_pg_per_osd = cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd");
2799 if (num_in && max_pg_per_osd > 0) {
2800 auto per = sum_pg_up / num_in;
2801 if (per > max_pg_per_osd) {
2802 ostringstream ss;
2803 ss << "too many PGs per OSD (" << per
2804 << " > max " << max_pg_per_osd << ")";
2805 checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str(),
2806 per - max_pg_per_osd);
2807 }
2808 }
2809
2810 // TOO_FEW_OSDS
2811 auto warn_too_few_osds = cct->_conf.get_val<bool>("mon_warn_on_too_few_osds");
2812 auto osd_pool_default_size = cct->_conf.get_val<uint64_t>("osd_pool_default_size");
2813 if (warn_too_few_osds && osdmap.get_num_osds() < osd_pool_default_size) {
2814 ostringstream ss;
2815 ss << "OSD count " << osdmap.get_num_osds()
2816 << " < osd_pool_default_size " << osd_pool_default_size;
2817 checks->add("TOO_FEW_OSDS", HEALTH_WARN, ss.str(),
2818 osd_pool_default_size - osdmap.get_num_osds());
2819 }
2820
2821 // SLOW_PING_TIME
2822 // Convert milliseconds to microseconds
2823 auto warn_slow_ping_time = cct->_conf.get_val<double>("mon_warn_on_slow_ping_time") * 1000;
2824 auto grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
2825 if (warn_slow_ping_time == 0) {
2826 double ratio = cct->_conf.get_val<double>("mon_warn_on_slow_ping_ratio");
2827 warn_slow_ping_time = grace;
2828 warn_slow_ping_time *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
2829 }
2830 if (warn_slow_ping_time > 0) {
2831
2832 struct mon_ping_item_t {
2833 uint32_t pingtime;
2834 int from;
2835 int to;
2836 bool improving;
2837
2838 bool operator<(const mon_ping_item_t& rhs) const {
2839 if (pingtime < rhs.pingtime)
2840 return true;
2841 if (pingtime > rhs.pingtime)
2842 return false;
2843 if (from < rhs.from)
2844 return true;
2845 if (from > rhs.from)
2846 return false;
2847 return to < rhs.to;
2848 }
2849 };
2850
2851 list<string> detail_back;
2852 list<string> detail_front;
2853 list<string> detail;
2854 set<mon_ping_item_t> back_sorted, front_sorted;
2855 for (auto i : osd_stat) {
2856 for (auto j : i.second.hb_pingtime) {
2857
2858 // Maybe source info is old
2859 if (now.sec() - j.second.last_update > grace * 60)
2860 continue;
2861
2862 mon_ping_item_t back;
2863 back.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
2864 back.pingtime = std::max(back.pingtime, j.second.back_pingtime[2]);
2865 back.from = i.first;
2866 back.to = j.first;
2867 if (back.pingtime > warn_slow_ping_time) {
2868 back.improving = (j.second.back_pingtime[0] < j.second.back_pingtime[1]
2869 && j.second.back_pingtime[1] < j.second.back_pingtime[2]);
2870 back_sorted.emplace(back);
2871 }
2872
2873 mon_ping_item_t front;
2874 front.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
2875 front.pingtime = std::max(front.pingtime, j.second.front_pingtime[2]);
2876 front.from = i.first;
2877 front.to = j.first;
2878 if (front.pingtime > warn_slow_ping_time) {
2879 front.improving = (j.second.front_pingtime[0] < j.second.front_pingtime[1]
2880 && j.second.front_pingtime[1] < j.second.back_pingtime[2]);
2881 front_sorted.emplace(front);
2882 }
2883 }
2884 if (i.second.num_shards_repaired >
2885 cct->_conf.get_val<uint64_t>("mon_osd_warn_num_repaired")) {
2886 ostringstream ss;
2887 ss << "osd." << i.first << " had " << i.second.num_shards_repaired << " reads repaired";
2888 detail.push_back(ss.str());
2889 }
2890 }
2891 if (!detail.empty()) {
2892 ostringstream ss;
2893 ss << "Too many repaired reads on " << detail.size() << " OSDs";
2894 auto& d = checks->add("OSD_TOO_MANY_REPAIRS", HEALTH_WARN, ss.str(),
2895 detail.size());
2896 d.detail.swap(detail);
2897 }
2898 int max_detail = 10;
2899 for (auto &sback : boost::adaptors::reverse(back_sorted)) {
2900 ostringstream ss;
2901 if (max_detail == 0) {
2902 ss << "Truncated long network list. Use ceph daemon mgr.# dump_osd_network for more information";
2903 detail_back.push_back(ss.str());
2904 break;
2905 }
2906 max_detail--;
2907 ss << "Slow OSD heartbeats on back from osd." << sback.from
2908 << " [" << osd_parentage(osdmap, sback.from) << "]"
2909 << (osdmap.is_down(sback.from) ? " (down)" : "")
2910 << " to osd." << sback.to
2911 << " [" << osd_parentage(osdmap, sback.to) << "]"
2912 << (osdmap.is_down(sback.to) ? " (down)" : "")
2913 << " " << fixed_u_to_string(sback.pingtime, 3) << " msec"
2914 << (sback.improving ? " possibly improving" : "");
2915 detail_back.push_back(ss.str());
2916 }
2917 max_detail = 10;
2918 for (auto &sfront : boost::adaptors::reverse(front_sorted)) {
2919 ostringstream ss;
2920 if (max_detail == 0) {
2921 ss << "Truncated long network list. Use ceph daemon mgr.# dump_osd_network for more information";
2922 detail_front.push_back(ss.str());
2923 break;
2924 }
2925 max_detail--;
2926 // Get crush parentage for each osd
2927 ss << "Slow OSD heartbeats on front from osd." << sfront.from
2928 << " [" << osd_parentage(osdmap, sfront.from) << "]"
2929 << (osdmap.is_down(sfront.from) ? " (down)" : "")
2930 << " to osd." << sfront.to
2931 << " [" << osd_parentage(osdmap, sfront.to) << "]"
2932 << (osdmap.is_down(sfront.to) ? " (down)" : "")
2933 << " " << fixed_u_to_string(sfront.pingtime, 3) << " msec"
2934 << (sfront.improving ? " possibly improving" : "");
2935 detail_front.push_back(ss.str());
2936 }
2937 if (detail_back.size() != 0) {
2938 ostringstream ss;
2939 ss << "Slow OSD heartbeats on back (longest "
2940 << fixed_u_to_string(back_sorted.rbegin()->pingtime, 3) << "ms)";
2941 auto& d = checks->add("OSD_SLOW_PING_TIME_BACK", HEALTH_WARN, ss.str(),
2942 back_sorted.size());
2943 d.detail.swap(detail_back);
2944 }
2945 if (detail_front.size() != 0) {
2946 ostringstream ss;
2947 ss << "Slow OSD heartbeats on front (longest "
2948 << fixed_u_to_string(front_sorted.rbegin()->pingtime, 3) << "ms)";
2949 auto& d = checks->add("OSD_SLOW_PING_TIME_FRONT", HEALTH_WARN, ss.str(),
2950 front_sorted.size());
2951 d.detail.swap(detail_front);
2952 }
2953 }
2954
2955 // SMALLER_PGP_NUM
2956 // MANY_OBJECTS_PER_PG
2957 if (!pg_stat.empty()) {
2958 list<string> pgp_detail, many_detail;
2959 const auto mon_pg_warn_min_objects =
2960 cct->_conf.get_val<int64_t>("mon_pg_warn_min_objects");
2961 const auto mon_pg_warn_min_pool_objects =
2962 cct->_conf.get_val<int64_t>("mon_pg_warn_min_pool_objects");
2963 const auto mon_pg_warn_max_object_skew =
2964 cct->_conf.get_val<double>("mon_pg_warn_max_object_skew");
2965 for (auto p = pg_pool_sum.begin();
2966 p != pg_pool_sum.end();
2967 ++p) {
2968 const pg_pool_t *pi = osdmap.get_pg_pool(p->first);
2969 if (!pi)
2970 continue; // in case osdmap changes haven't propagated to PGMap yet
2971 const string& name = osdmap.get_pool_name(p->first);
2972 // NOTE: we use pg_num_target and pgp_num_target for the purposes of
2973 // the warnings. If the cluster is failing to converge on the target
2974 // values that is a separate issue!
2975 if (pi->get_pg_num_target() > pi->get_pgp_num_target() &&
2976 !(name.find(".DELETED") != string::npos &&
2977 cct->_conf->mon_fake_pool_delete)) {
2978 ostringstream ss;
2979 ss << "pool " << name << " pg_num "
2980 << pi->get_pg_num_target()
2981 << " > pgp_num " << pi->get_pgp_num_target();
2982 pgp_detail.push_back(ss.str());
2983 }
2984 int average_objects_per_pg = pg_sum.stats.sum.num_objects / pg_stat.size();
2985 if (average_objects_per_pg > 0 &&
2986 pg_sum.stats.sum.num_objects >= mon_pg_warn_min_objects &&
2987 p->second.stats.sum.num_objects >= mon_pg_warn_min_pool_objects) {
2988 int objects_per_pg = p->second.stats.sum.num_objects /
2989 pi->get_pg_num_target();
2990 float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
2991 if (mon_pg_warn_max_object_skew > 0 &&
2992 ratio > mon_pg_warn_max_object_skew) {
2993 ostringstream ss;
2994 ss << "pool " << name << " objects per pg ("
2995 << objects_per_pg << ") is more than " << ratio
2996 << " times cluster average ("
2997 << average_objects_per_pg << ")";
2998 many_detail.push_back(ss.str());
2999 }
3000 }
3001 }
3002 if (!pgp_detail.empty()) {
3003 ostringstream ss;
3004 ss << pgp_detail.size() << " pools have pg_num > pgp_num";
3005 auto& d = checks->add("SMALLER_PGP_NUM", HEALTH_WARN, ss.str(),
3006 pgp_detail.size());
3007 d.detail.swap(pgp_detail);
3008 }
3009 if (!many_detail.empty()) {
3010 ostringstream ss;
3011 ss << many_detail.size() << " pools have many more objects per pg than"
3012 << " average";
3013 auto& d = checks->add("MANY_OBJECTS_PER_PG", HEALTH_WARN, ss.str(),
3014 many_detail.size());
3015 d.detail.swap(many_detail);
3016 }
3017 }
3018
3019 // POOL_FULL
3020 // POOL_NEAR_FULL
3021 {
3022 float warn_threshold = (float)g_conf().get_val<int64_t>("mon_pool_quota_warn_threshold")/100;
3023 float crit_threshold = (float)g_conf().get_val<int64_t>("mon_pool_quota_crit_threshold")/100;
3024 list<string> full_detail, nearfull_detail;
3025 unsigned full_pools = 0, nearfull_pools = 0;
3026 for (auto it : pools) {
3027 auto it2 = pg_pool_sum.find(it.first);
3028 if (it2 == pg_pool_sum.end()) {
3029 continue;
3030 }
3031 const pool_stat_t *pstat = &it2->second;
3032 const object_stat_sum_t& sum = pstat->stats.sum;
3033 const string& pool_name = osdmap.get_pool_name(it.first);
3034 const pg_pool_t &pool = it.second;
3035 bool full = false, nearfull = false;
3036 if (pool.quota_max_objects > 0) {
3037 stringstream ss;
3038 if ((uint64_t)sum.num_objects >= pool.quota_max_objects) {
3039 } else if (crit_threshold > 0 &&
3040 sum.num_objects >= pool.quota_max_objects*crit_threshold) {
3041 ss << "pool '" << pool_name
3042 << "' has " << sum.num_objects << " objects"
3043 << " (max " << pool.quota_max_objects << ")";
3044 full_detail.push_back(ss.str());
3045 full = true;
3046 } else if (warn_threshold > 0 &&
3047 sum.num_objects >= pool.quota_max_objects*warn_threshold) {
3048 ss << "pool '" << pool_name
3049 << "' has " << sum.num_objects << " objects"
3050 << " (max " << pool.quota_max_objects << ")";
3051 nearfull_detail.push_back(ss.str());
3052 nearfull = true;
3053 }
3054 }
3055 if (pool.quota_max_bytes > 0) {
3056 stringstream ss;
3057 if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
3058 } else if (crit_threshold > 0 &&
3059 sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
3060 ss << "pool '" << pool_name
3061 << "' has " << byte_u_t(sum.num_bytes)
3062 << " (max " << byte_u_t(pool.quota_max_bytes) << ")";
3063 full_detail.push_back(ss.str());
3064 full = true;
3065 } else if (warn_threshold > 0 &&
3066 sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
3067 ss << "pool '" << pool_name
3068 << "' has " << byte_u_t(sum.num_bytes)
3069 << " (max " << byte_u_t(pool.quota_max_bytes) << ")";
3070 nearfull_detail.push_back(ss.str());
3071 nearfull = true;
3072 }
3073 }
3074 if (full) {
3075 ++full_pools;
3076 }
3077 if (nearfull) {
3078 ++nearfull_pools;
3079 }
3080 }
3081 if (full_pools) {
3082 ostringstream ss;
3083 ss << full_pools << " pools full";
3084 auto& d = checks->add("POOL_FULL", HEALTH_ERR, ss.str(), full_pools);
3085 d.detail.swap(full_detail);
3086 }
3087 if (nearfull_pools) {
3088 ostringstream ss;
3089 ss << nearfull_pools << " pools nearfull";
3090 auto& d = checks->add("POOL_NEAR_FULL", HEALTH_WARN, ss.str(), nearfull_pools);
3091 d.detail.swap(nearfull_detail);
3092 }
3093 }
3094
3095 // OBJECT_MISPLACED
3096 if (pg_sum.stats.sum.num_objects_misplaced &&
3097 pg_sum.stats.sum.num_object_copies > 0 &&
3098 cct->_conf->mon_warn_on_misplaced) {
3099 double pc = (double)pg_sum.stats.sum.num_objects_misplaced /
3100 (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
3101 char b[20];
3102 snprintf(b, sizeof(b), "%.3lf", pc);
3103 ostringstream ss;
3104 ss << pg_sum.stats.sum.num_objects_misplaced
3105 << "/" << pg_sum.stats.sum.num_object_copies << " objects misplaced ("
3106 << b << "%)";
3107 checks->add("OBJECT_MISPLACED", HEALTH_WARN, ss.str(),
3108 pg_sum.stats.sum.num_objects_misplaced);
3109 }
3110
3111 // OBJECT_UNFOUND
3112 if (pg_sum.stats.sum.num_objects_unfound &&
3113 pg_sum.stats.sum.num_objects) {
3114 double pc = (double)pg_sum.stats.sum.num_objects_unfound /
3115 (double)pg_sum.stats.sum.num_objects * (double)100.0;
3116 char b[20];
3117 snprintf(b, sizeof(b), "%.3lf", pc);
3118 ostringstream ss;
3119 ss << pg_sum.stats.sum.num_objects_unfound
3120 << "/" << pg_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
3121 auto& d = checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str(),
3122 pg_sum.stats.sum.num_objects_unfound);
3123
3124 for (auto& p : pg_stat) {
3125 if (p.second.stats.sum.num_objects_unfound) {
3126 ostringstream ss;
3127 ss << "pg " << p.first
3128 << " has " << p.second.stats.sum.num_objects_unfound
3129 << " unfound objects";
3130 d.detail.push_back(ss.str());
3131 if (d.detail.size() > max) {
3132 d.detail.push_back("(additional pgs left out for brevity)");
3133 break;
3134 }
3135 }
3136 }
3137 }
3138
3139 // REQUEST_SLOW
3140 // REQUEST_STUCK
3141 // SLOW_OPS unifies them in mimic.
3142 if (osdmap.require_osd_release < ceph_release_t::mimic &&
3143 cct->_conf->mon_osd_warn_op_age > 0 &&
3144 !osd_sum.op_queue_age_hist.h.empty() &&
3145 osd_sum.op_queue_age_hist.upper_bound() / 1000.0 >
3146 cct->_conf->mon_osd_warn_op_age) {
3147 list<string> warn_detail, error_detail;
3148 unsigned warn = 0, error = 0;
3149 float err_age =
3150 cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio;
3151 const pow2_hist_t& h = osd_sum.op_queue_age_hist;
3152 for (unsigned i = h.h.size() - 1; i > 0; --i) {
3153 float ub = (float)(1 << i) / 1000.0;
3154 if (ub < cct->_conf->mon_osd_warn_op_age)
3155 break;
3156 if (h.h[i]) {
3157 ostringstream ss;
3158 ss << h.h[i] << " ops are blocked > " << ub << " sec";
3159 if (ub > err_age) {
3160 error += h.h[i];
3161 error_detail.push_back(ss.str());
3162 } else {
3163 warn += h.h[i];
3164 warn_detail.push_back(ss.str());
3165 }
3166 }
3167 }
3168
3169 map<float,set<int>> warn_osd_by_max; // max -> osds
3170 map<float,set<int>> error_osd_by_max; // max -> osds
3171 if (!warn_detail.empty() || !error_detail.empty()) {
3172 for (auto& p : osd_stat) {
3173 const pow2_hist_t& h = p.second.op_queue_age_hist;
3174 for (unsigned i = h.h.size() - 1; i > 0; --i) {
3175 float ub = (float)(1 << i) / 1000.0;
3176 if (ub < cct->_conf->mon_osd_warn_op_age)
3177 break;
3178 if (h.h[i]) {
3179 if (ub > err_age) {
3180 error_osd_by_max[ub].insert(p.first);
3181 } else {
3182 warn_osd_by_max[ub].insert(p.first);
3183 }
3184 break;
3185 }
3186 }
3187 }
3188 }
3189
3190 if (!warn_detail.empty()) {
3191 ostringstream ss;
3192 ss << warn << " slow requests are blocked > "
3193 << cct->_conf->mon_osd_warn_op_age << " sec";
3194 auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str(), warn);
3195 d.detail.swap(warn_detail);
3196 int left = max;
3197 for (auto& p : warn_osd_by_max) {
3198 ostringstream ss;
3199 if (p.second.size() > 1) {
3200 ss << "osds " << p.second
3201 << " have blocked requests > " << p.first << " sec";
3202 } else {
3203 ss << "osd." << *p.second.begin()
3204 << " has blocked requests > " << p.first << " sec";
3205 }
3206 d.detail.push_back(ss.str());
3207 if (--left == 0) {
3208 break;
3209 }
3210 }
3211 }
3212 if (!error_detail.empty()) {
3213 ostringstream ss;
3214 ss << error << " stuck requests are blocked > "
3215 << err_age << " sec";
3216 auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str(), error);
3217 d.detail.swap(error_detail);
3218 int left = max;
3219 for (auto& p : error_osd_by_max) {
3220 ostringstream ss;
3221 if (p.second.size() > 1) {
3222 ss << "osds " << p.second
3223 << " have stuck requests > " << p.first << " sec";
3224 } else {
3225 ss << "osd." << *p.second.begin()
3226 << " has stuck requests > " << p.first << " sec";
3227 }
3228 d.detail.push_back(ss.str());
3229 if (--left == 0) {
3230 break;
3231 }
3232 }
3233 }
3234 }
3235
3236 // OBJECT_STORE_WARN
3237 if (osd_sum.os_alerts.size()) {
3238 map<string, pair<size_t, list<string>>> os_alerts_sum;
3239
3240 for (auto& a : osd_sum.os_alerts) {
3241 int left = max;
3242 string s0 = " osd.";
3243 s0 += stringify(a.first);
3244 for (auto& aa : a.second) {
3245 string s(s0);
3246 s += " ";
3247 s += aa.second;
3248 auto it = os_alerts_sum.find(aa.first);
3249 if (it == os_alerts_sum.end()) {
3250 list<string> d;
3251 d.emplace_back(s);
3252 os_alerts_sum.emplace(aa.first, std::make_pair(1, d));
3253 } else {
3254 auto& p = it->second;
3255 ++p.first;
3256 p.second.emplace_back(s);
3257 }
3258 if (--left == 0) {
3259 break;
3260 }
3261 }
3262 }
3263
3264 for (auto& asum : os_alerts_sum) {
3265 string summary = stringify(asum.second.first) + " OSD(s)";
3266 if (asum.first == "BLUEFS_SPILLOVER") {
3267 summary += " experiencing BlueFS spillover";
3268 } else if (asum.first == "BLUESTORE_NO_COMPRESSION") {
3269 summary += " have broken BlueStore compression";
3270 } else if (asum.first == "BLUESTORE_LEGACY_STATFS") {
3271 summary += " reporting legacy (not per-pool) BlueStore stats";
3272 } else if (asum.first == "BLUESTORE_DISK_SIZE_MISMATCH") {
3273 summary += " have dangerous mismatch between BlueStore block device and free list sizes";
3274 } else if (asum.first == "BLUESTORE_NO_PER_PG_OMAP") {
3275 summary += " reporting legacy (not per-pg) BlueStore omap";
3276 } else if (asum.first == "BLUESTORE_NO_PER_POOL_OMAP") {
3277 summary += " reporting legacy (not per-pool) BlueStore omap usage stats";
3278 } else if (asum.first == "BLUESTORE_SPURIOUS_READ_ERRORS") {
3279 summary += " have spurious read errors";
3280 }
3281
3282 auto& d = checks->add(asum.first, HEALTH_WARN, summary, asum.second.first);
3283 for (auto& s : asum.second.second) {
3284 d.detail.push_back(s);
3285 }
3286 }
3287 }
3288 // PG_NOT_SCRUBBED
3289 // PG_NOT_DEEP_SCRUBBED
3290 if (cct->_conf->mon_warn_pg_not_scrubbed_ratio ||
3291 cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
3292 list<string> detail, deep_detail;
3293 int detail_max = max, deep_detail_max = max;
3294 int detail_more = 0, deep_detail_more = 0;
3295 int detail_total = 0, deep_detail_total = 0;
3296 for (auto& p : pg_stat) {
3297 int64_t pnum = p.first.pool();
3298 auto pool = osdmap.get_pg_pool(pnum);
3299 if (!pool)
3300 continue;
3301 if (cct->_conf->mon_warn_pg_not_scrubbed_ratio) {
3302 double scrub_max_interval = 0;
3303 pool->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
3304 if (scrub_max_interval <= 0) {
3305 scrub_max_interval = cct->_conf->osd_scrub_max_interval;
3306 }
3307 const double age = (cct->_conf->mon_warn_pg_not_scrubbed_ratio * scrub_max_interval) +
3308 scrub_max_interval;
3309 utime_t cutoff = now;
3310 cutoff -= age;
3311 if (p.second.last_scrub_stamp < cutoff) {
3312 if (detail_max > 0) {
3313 ostringstream ss;
3314 ss << "pg " << p.first << " not scrubbed since "
3315 << p.second.last_scrub_stamp;
3316 detail.push_back(ss.str());
3317 --detail_max;
3318 } else {
3319 ++detail_more;
3320 }
3321 ++detail_total;
3322 }
3323 }
3324 if (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
3325 double deep_scrub_interval = 0;
3326 pool->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
3327 if (deep_scrub_interval <= 0) {
3328 deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
3329 }
3330 double deep_age = (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio * deep_scrub_interval) +
3331 deep_scrub_interval;
3332 utime_t deep_cutoff = now;
3333 deep_cutoff -= deep_age;
3334 if (p.second.last_deep_scrub_stamp < deep_cutoff) {
3335 if (deep_detail_max > 0) {
3336 ostringstream ss;
3337 ss << "pg " << p.first << " not deep-scrubbed since "
3338 << p.second.last_deep_scrub_stamp;
3339 deep_detail.push_back(ss.str());
3340 --deep_detail_max;
3341 } else {
3342 ++deep_detail_more;
3343 }
3344 ++deep_detail_total;
3345 }
3346 }
3347 }
3348 if (detail_total) {
3349 ostringstream ss;
3350 ss << detail_total << " pgs not scrubbed in time";
3351 auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str(), detail_total);
3352
3353 if (!detail.empty()) {
3354 d.detail.swap(detail);
3355
3356 if (detail_more) {
3357 ostringstream ss;
3358 ss << detail_more << " more pgs... ";
3359 d.detail.push_back(ss.str());
3360 }
3361 }
3362 }
3363 if (deep_detail_total) {
3364 ostringstream ss;
3365 ss << deep_detail_total << " pgs not deep-scrubbed in time";
3366 auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str(),
3367 deep_detail_total);
3368
3369 if (!deep_detail.empty()) {
3370 d.detail.swap(deep_detail);
3371
3372 if (deep_detail_more) {
3373 ostringstream ss;
3374 ss << deep_detail_more << " more pgs... ";
3375 d.detail.push_back(ss.str());
3376 }
3377 }
3378 }
3379 }
3380
3381 // POOL_APP
3382 if (g_conf().get_val<bool>("mon_warn_on_pool_no_app")) {
3383 list<string> detail;
3384 for (auto &it : pools) {
3385 const pg_pool_t &pool = it.second;
3386 const string& pool_name = osdmap.get_pool_name(it.first);
3387 auto it2 = pg_pool_sum.find(it.first);
3388 if (it2 == pg_pool_sum.end()) {
3389 continue;
3390 }
3391 const pool_stat_t *pstat = &it2->second;
3392 if (pstat == nullptr) {
3393 continue;
3394 }
3395 const object_stat_sum_t& sum = pstat->stats.sum;
3396 // application metadata is not encoded until luminous is minimum
3397 // required release
3398 if (sum.num_objects > 0 && pool.application_metadata.empty() &&
3399 !pool.is_tier()) {
3400 stringstream ss;
3401 ss << "application not enabled on pool '" << pool_name << "'";
3402 detail.push_back(ss.str());
3403 }
3404 }
3405 if (!detail.empty()) {
3406 ostringstream ss;
3407 ss << detail.size() << " pool(s) do not have an application enabled";
3408 auto& d = checks->add("POOL_APP_NOT_ENABLED", HEALTH_WARN, ss.str(),
3409 detail.size());
3410 stringstream tip;
3411 tip << "use 'ceph osd pool application enable <pool-name> "
3412 << "<app-name>', where <app-name> is 'cephfs', 'rbd', 'rgw', "
3413 << "or freeform for custom applications.";
3414 detail.push_back(tip.str());
3415 d.detail.swap(detail);
3416 }
3417 }
3418
3419 // PG_SLOW_SNAP_TRIMMING
3420 if (!pg_stat.empty() && cct->_conf->mon_osd_snap_trim_queue_warn_on > 0) {
3421 uint32_t snapthreshold = cct->_conf->mon_osd_snap_trim_queue_warn_on;
3422 uint64_t snaptrimq_exceeded = 0;
3423 uint32_t longest_queue = 0;
3424 const pg_t* longest_q_pg = nullptr;
3425 list<string> detail;
3426
3427 for (auto& i: pg_stat) {
3428 uint32_t current_len = i.second.snaptrimq_len;
3429 if (current_len >= snapthreshold) {
3430 snaptrimq_exceeded++;
3431 if (longest_queue <= current_len) {
3432 longest_q_pg = &i.first;
3433 longest_queue = current_len;
3434 }
3435 if (detail.size() < max - 1) {
3436 stringstream ss;
3437 ss << "snap trim queue for pg " << i.first << " at " << current_len;
3438 detail.push_back(ss.str());
3439 continue;
3440 }
3441 if (detail.size() < max) {
3442 detail.push_back("...more pgs affected");
3443 continue;
3444 }
3445 }
3446 }
3447
3448 if (snaptrimq_exceeded) {
3449 {
3450 ostringstream ss;
3451 ss << "longest queue on pg " << *longest_q_pg << " at " << longest_queue;
3452 detail.push_back(ss.str());
3453 }
3454
3455 stringstream ss;
3456 ss << "snap trim queue for " << snaptrimq_exceeded << " pg(s) >= " << snapthreshold << " (mon_osd_snap_trim_queue_warn_on)";
3457 auto& d = checks->add("PG_SLOW_SNAP_TRIMMING", HEALTH_WARN, ss.str(),
3458 snaptrimq_exceeded);
3459 detail.push_back("try decreasing \"osd snap trim sleep\" and/or increasing \"osd pg max concurrent snap trims\".");
3460 d.detail.swap(detail);
3461 }
3462 }
3463 }
3464
3465 void PGMap::print_summary(ceph::Formatter *f, ostream *out) const
3466 {
3467 if (f) {
3468 f->open_array_section("pgs_by_pool_state");
3469 for (auto& i: num_pg_by_pool_state) {
3470 f->open_object_section("per_pool_pgs_by_state");
3471 f->dump_int("pool_id", i.first);
3472 f->open_array_section("pg_state_counts");
3473 for (auto& j : i.second) {
3474 f->open_object_section("pg_state_count");
3475 f->dump_string("state_name", pg_state_string(j.first));
3476 f->dump_int("count", j.second);
3477 f->close_section();
3478 }
3479 f->close_section();
3480 f->close_section();
3481 }
3482 f->close_section();
3483 }
3484 PGMapDigest::print_summary(f, out);
3485 }
3486
3487 int process_pg_map_command(
3488 const string& orig_prefix,
3489 const cmdmap_t& orig_cmdmap,
3490 const PGMap& pg_map,
3491 const OSDMap& osdmap,
3492 ceph::Formatter *f,
3493 stringstream *ss,
3494 bufferlist *odata)
3495 {
3496 string prefix = orig_prefix;
3497 auto cmdmap = orig_cmdmap;
3498
3499 string omap_stats_note =
3500 "\n* NOTE: Omap statistics are gathered during deep scrub and "
3501 "may be inaccurate soon afterwards depending on utilization. See "
3502 "http://docs.ceph.com/en/latest/dev/placement-group/#omap-statistics "
3503 "for further details.\n";
3504 bool omap_stats_note_required = false;
3505
3506 // perhaps these would be better in the parsing, but it's weird
3507 bool primary = false;
3508 if (prefix == "pg dump_json") {
3509 vector<string> v;
3510 v.push_back(string("all"));
3511 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3512 prefix = "pg dump";
3513 } else if (prefix == "pg dump_pools_json") {
3514 vector<string> v;
3515 v.push_back(string("pools"));
3516 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3517 prefix = "pg dump";
3518 } else if (prefix == "pg ls-by-primary") {
3519 primary = true;
3520 prefix = "pg ls";
3521 } else if (prefix == "pg ls-by-osd") {
3522 prefix = "pg ls";
3523 } else if (prefix == "pg ls-by-pool") {
3524 prefix = "pg ls";
3525 string poolstr;
3526 cmd_getval(cmdmap, "poolstr", poolstr);
3527 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
3528 if (pool < 0) {
3529 *ss << "pool " << poolstr << " does not exist";
3530 return -ENOENT;
3531 }
3532 cmd_putval(g_ceph_context, cmdmap, "pool", pool);
3533 }
3534
3535 stringstream ds;
3536 if (prefix == "pg stat") {
3537 if (f) {
3538 f->open_object_section("pg_summary");
3539 pg_map.print_oneline_summary(f, NULL);
3540 f->close_section();
3541 f->flush(ds);
3542 } else {
3543 ds << pg_map;
3544 }
3545 odata->append(ds);
3546 return 0;
3547 }
3548
3549 if (prefix == "pg getmap") {
3550 pg_map.encode(*odata);
3551 *ss << "got pgmap version " << pg_map.version;
3552 return 0;
3553 }
3554
3555 if (prefix == "pg dump") {
3556 string val;
3557 vector<string> dumpcontents;
3558 set<string> what;
3559 if (cmd_getval(cmdmap, "dumpcontents", dumpcontents)) {
3560 copy(dumpcontents.begin(), dumpcontents.end(),
3561 inserter(what, what.end()));
3562 }
3563 if (what.empty())
3564 what.insert("all");
3565 if (f) {
3566 if (what.count("all")) {
3567 f->open_object_section("pg_map");
3568 pg_map.dump(f);
3569 f->close_section();
3570 } else if (what.count("summary") || what.count("sum")) {
3571 f->open_object_section("pg_map");
3572 pg_map.dump_basic(f);
3573 f->close_section();
3574 } else {
3575 if (what.count("pools")) {
3576 pg_map.dump_pool_stats(f);
3577 }
3578 if (what.count("osds")) {
3579 pg_map.dump_osd_stats(f);
3580 }
3581 if (what.count("pgs")) {
3582 pg_map.dump_pg_stats(f, false);
3583 }
3584 if (what.count("pgs_brief")) {
3585 pg_map.dump_pg_stats(f, true);
3586 }
3587 if (what.count("delta")) {
3588 f->open_object_section("delta");
3589 pg_map.dump_delta(f);
3590 f->close_section();
3591 }
3592 }
3593 f->flush(*odata);
3594 } else {
3595 if (what.count("all")) {
3596 pg_map.dump(ds);
3597 omap_stats_note_required = true;
3598 } else if (what.count("summary") || what.count("sum")) {
3599 pg_map.dump_basic(ds);
3600 pg_map.dump_pg_sum_stats(ds, true);
3601 pg_map.dump_osd_sum_stats(ds);
3602 omap_stats_note_required = true;
3603 } else {
3604 if (what.count("pgs_brief")) {
3605 pg_map.dump_pg_stats(ds, true);
3606 }
3607 bool header = true;
3608 if (what.count("pgs")) {
3609 pg_map.dump_pg_stats(ds, false);
3610 header = false;
3611 omap_stats_note_required = true;
3612 }
3613 if (what.count("pools")) {
3614 pg_map.dump_pool_stats(ds, header);
3615 omap_stats_note_required = true;
3616 }
3617 if (what.count("osds")) {
3618 pg_map.dump_osd_stats(ds);
3619 }
3620 }
3621 odata->append(ds);
3622 if (omap_stats_note_required) {
3623 odata->append(omap_stats_note);
3624 }
3625 }
3626 *ss << "dumped " << what;
3627 return 0;
3628 }
3629
3630 if (prefix == "pg ls") {
3631 int64_t osd = -1;
3632 int64_t pool = -1;
3633 vector<string>states;
3634 set<pg_t> pgs;
3635 cmd_getval(cmdmap, "pool", pool);
3636 cmd_getval(cmdmap, "osd", osd);
3637 cmd_getval(cmdmap, "states", states);
3638 if (pool >= 0 && !osdmap.have_pg_pool(pool)) {
3639 *ss << "pool " << pool << " does not exist";
3640 return -ENOENT;
3641 }
3642 if (osd >= 0 && !osdmap.is_up(osd)) {
3643 *ss << "osd " << osd << " is not up";
3644 return -EAGAIN;
3645 }
3646 if (states.empty())
3647 states.push_back("all");
3648
3649 uint64_t state = 0;
3650
3651 while (!states.empty()) {
3652 string state_str = states.back();
3653
3654 if (state_str == "all") {
3655 state = -1;
3656 break;
3657 } else {
3658 auto filter = pg_string_state(state_str);
3659 if (!filter) {
3660 *ss << "'" << state_str << "' is not a valid pg state,"
3661 << " available choices: " << pg_state_string(0xFFFFFFFF);
3662 return -EINVAL;
3663 }
3664 state |= *filter;
3665 }
3666
3667 states.pop_back();
3668 }
3669
3670 pg_map.get_filtered_pg_stats(state, pool, osd, primary, pgs);
3671
3672 if (f && !pgs.empty()) {
3673 pg_map.dump_filtered_pg_stats(f, pgs);
3674 f->flush(*odata);
3675 } else if (!pgs.empty()) {
3676 pg_map.dump_filtered_pg_stats(ds, pgs);
3677 odata->append(ds);
3678 odata->append(omap_stats_note);
3679 }
3680 return 0;
3681 }
3682
3683 if (prefix == "pg dump_stuck") {
3684 vector<string> stuckop_vec;
3685 cmd_getval(cmdmap, "stuckops", stuckop_vec);
3686 if (stuckop_vec.empty())
3687 stuckop_vec.push_back("unclean");
3688 int64_t threshold;
3689 cmd_getval(cmdmap, "threshold", threshold,
3690 g_conf().get_val<int64_t>("mon_pg_stuck_threshold"));
3691
3692 if (pg_map.dump_stuck_pg_stats(ds, f, (int)threshold, stuckop_vec) < 0) {
3693 *ss << "failed";
3694 } else {
3695 *ss << "ok";
3696 }
3697 odata->append(ds);
3698 return 0;
3699 }
3700
3701 if (prefix == "pg debug") {
3702 string debugop;
3703 cmd_getval(cmdmap, "debugop", debugop,
3704 string("unfound_objects_exist"));
3705 if (debugop == "unfound_objects_exist") {
3706 bool unfound_objects_exist = false;
3707 for (const auto& p : pg_map.pg_stat) {
3708 if (p.second.stats.sum.num_objects_unfound > 0) {
3709 unfound_objects_exist = true;
3710 break;
3711 }
3712 }
3713 if (unfound_objects_exist)
3714 ds << "TRUE";
3715 else
3716 ds << "FALSE";
3717 odata->append(ds);
3718 return 0;
3719 }
3720 if (debugop == "degraded_pgs_exist") {
3721 bool degraded_pgs_exist = false;
3722 for (const auto& p : pg_map.pg_stat) {
3723 if (p.second.stats.sum.num_objects_degraded > 0) {
3724 degraded_pgs_exist = true;
3725 break;
3726 }
3727 }
3728 if (degraded_pgs_exist)
3729 ds << "TRUE";
3730 else
3731 ds << "FALSE";
3732 odata->append(ds);
3733 return 0;
3734 }
3735 }
3736
3737 if (prefix == "osd perf") {
3738 if (f) {
3739 f->open_object_section("osdstats");
3740 pg_map.dump_osd_perf_stats(f);
3741 f->close_section();
3742 f->flush(ds);
3743 } else {
3744 pg_map.print_osd_perf_stats(&ds);
3745 }
3746 odata->append(ds);
3747 return 0;
3748 }
3749
3750 if (prefix == "osd blocked-by") {
3751 if (f) {
3752 f->open_object_section("osd_blocked_by");
3753 pg_map.dump_osd_blocked_by_stats(f);
3754 f->close_section();
3755 f->flush(ds);
3756 } else {
3757 pg_map.print_osd_blocked_by_stats(&ds);
3758 }
3759 odata->append(ds);
3760 return 0;
3761 }
3762
3763 return -EOPNOTSUPP;
3764 }
3765
3766 void PGMapUpdater::check_osd_map(
3767 CephContext *cct,
3768 const OSDMap& osdmap,
3769 const PGMap& pgmap,
3770 PGMap::Incremental *pending_inc)
3771 {
3772 for (auto& p : pgmap.osd_stat) {
3773 if (!osdmap.exists(p.first)) {
3774 // remove osd_stat
3775 pending_inc->rm_stat(p.first);
3776 } else if (osdmap.is_out(p.first)) {
3777 // zero osd_stat
3778 if (p.second.statfs.total != 0) {
3779 pending_inc->stat_osd_out(p.first);
3780 }
3781 } else if (!osdmap.is_up(p.first)) {
3782 // zero the op_queue_age_hist
3783 if (!p.second.op_queue_age_hist.empty()) {
3784 pending_inc->stat_osd_down_up(p.first, pgmap);
3785 }
3786 }
3787 }
3788
3789 // deleted pgs (pools)?
3790 for (auto& p : pgmap.pg_pool_sum) {
3791 if (!osdmap.have_pg_pool(p.first)) {
3792 ldout(cct, 10) << __func__ << " pool " << p.first << " gone, removing pgs"
3793 << dendl;
3794 for (auto& q : pgmap.pg_stat) {
3795 if (q.first.pool() == p.first) {
3796 pending_inc->pg_remove.insert(q.first);
3797 }
3798 }
3799 auto q = pending_inc->pg_stat_updates.begin();
3800 while (q != pending_inc->pg_stat_updates.end()) {
3801 if (q->first.pool() == p.first) {
3802 q = pending_inc->pg_stat_updates.erase(q);
3803 } else {
3804 ++q;
3805 }
3806 }
3807 }
3808 }
3809
3810 // new (split or new pool) or merged pgs?
3811 map<int64_t,unsigned> new_pg_num;
3812 for (auto& p : osdmap.get_pools()) {
3813 int64_t poolid = p.first;
3814 const pg_pool_t& pi = p.second;
3815 auto q = pgmap.num_pg_by_pool.find(poolid);
3816 unsigned my_pg_num = 0;
3817 if (q != pgmap.num_pg_by_pool.end())
3818 my_pg_num = q->second;
3819 unsigned pg_num = pi.get_pg_num();
3820 new_pg_num[poolid] = pg_num;
3821 if (my_pg_num < pg_num) {
3822 ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
3823 << " > my pg_num " << my_pg_num << dendl;
3824 for (unsigned ps = my_pg_num; ps < pg_num; ++ps) {
3825 pg_t pgid(ps, poolid);
3826 if (pending_inc->pg_stat_updates.count(pgid) == 0) {
3827 ldout(cct,20) << __func__ << " adding " << pgid << dendl;
3828 pg_stat_t &stats = pending_inc->pg_stat_updates[pgid];
3829 stats.last_fresh = osdmap.get_modified();
3830 stats.last_active = osdmap.get_modified();
3831 stats.last_change = osdmap.get_modified();
3832 stats.last_peered = osdmap.get_modified();
3833 stats.last_clean = osdmap.get_modified();
3834 stats.last_unstale = osdmap.get_modified();
3835 stats.last_undegraded = osdmap.get_modified();
3836 stats.last_fullsized = osdmap.get_modified();
3837 stats.last_scrub_stamp = osdmap.get_modified();
3838 stats.last_deep_scrub_stamp = osdmap.get_modified();
3839 stats.last_clean_scrub_stamp = osdmap.get_modified();
3840 }
3841 }
3842 } else if (my_pg_num > pg_num) {
3843 ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
3844 << " < my pg_num " << my_pg_num << dendl;
3845 for (unsigned i = pg_num; i < my_pg_num; ++i) {
3846 pg_t pgid(i, poolid);
3847 ldout(cct,20) << __func__ << " removing merged " << pgid << dendl;
3848 if (pgmap.pg_stat.count(pgid)) {
3849 pending_inc->pg_remove.insert(pgid);
3850 }
3851 pending_inc->pg_stat_updates.erase(pgid);
3852 }
3853 }
3854 }
3855 auto i = pending_inc->pg_stat_updates.begin();
3856 while (i != pending_inc->pg_stat_updates.end()) {
3857 auto j = new_pg_num.find(i->first.pool());
3858 if (j == new_pg_num.end() ||
3859 i->first.ps() >= j->second) {
3860 ldout(cct,20) << __func__ << " removing pending update to old "
3861 << i->first << dendl;
3862 i = pending_inc->pg_stat_updates.erase(i);
3863 } else {
3864 ++i;
3865 }
3866 }
3867 }
3868
3869 static void _try_mark_pg_stale(
3870 const OSDMap& osdmap,
3871 pg_t pgid,
3872 const pg_stat_t& cur,
3873 PGMap::Incremental *pending_inc)
3874 {
3875 if ((cur.state & PG_STATE_STALE) == 0 &&
3876 cur.acting_primary != -1 &&
3877 osdmap.is_down(cur.acting_primary)) {
3878 pg_stat_t *newstat;
3879 auto q = pending_inc->pg_stat_updates.find(pgid);
3880 if (q != pending_inc->pg_stat_updates.end()) {
3881 if ((q->second.acting_primary == cur.acting_primary) ||
3882 ((q->second.state & PG_STATE_STALE) == 0 &&
3883 q->second.acting_primary != -1 &&
3884 osdmap.is_down(q->second.acting_primary))) {
3885 newstat = &q->second;
3886 } else {
3887 // pending update is no longer down or already stale
3888 return;
3889 }
3890 } else {
3891 newstat = &pending_inc->pg_stat_updates[pgid];
3892 *newstat = cur;
3893 }
3894 dout(10) << __func__ << " marking pg " << pgid
3895 << " stale (acting_primary " << newstat->acting_primary
3896 << ")" << dendl;
3897 newstat->state |= PG_STATE_STALE;
3898 newstat->last_unstale = ceph_clock_now();
3899 }
3900 }
3901
3902 void PGMapUpdater::check_down_pgs(
3903 const OSDMap &osdmap,
3904 const PGMap &pg_map,
3905 bool check_all,
3906 const set<int>& need_check_down_pg_osds,
3907 PGMap::Incremental *pending_inc)
3908 {
3909 // if a large number of osds changed state, just iterate over the whole
3910 // pg map.
3911 if (need_check_down_pg_osds.size() > (unsigned)osdmap.get_num_osds() *
3912 g_conf().get_val<double>("mon_pg_check_down_all_threshold")) {
3913 check_all = true;
3914 }
3915
3916 if (check_all) {
3917 for (const auto& p : pg_map.pg_stat) {
3918 _try_mark_pg_stale(osdmap, p.first, p.second, pending_inc);
3919 }
3920 } else {
3921 for (auto osd : need_check_down_pg_osds) {
3922 if (osdmap.is_down(osd)) {
3923 auto p = pg_map.pg_by_osd.find(osd);
3924 if (p == pg_map.pg_by_osd.end()) {
3925 continue;
3926 }
3927 for (auto pgid : p->second) {
3928 const pg_stat_t &stat = pg_map.pg_stat.at(pgid);
3929 ceph_assert(stat.acting_primary == osd);
3930 _try_mark_pg_stale(osdmap, pgid, stat, pending_inc);
3931 }
3932 }
3933 }
3934 }
3935 }
3936
3937 int reweight::by_utilization(
3938 const OSDMap &osdmap,
3939 const PGMap &pgm,
3940 int oload,
3941 double max_changef,
3942 int max_osds,
3943 bool by_pg, const set<int64_t> *pools,
3944 bool no_increasing,
3945 mempool::osdmap::map<int32_t, uint32_t>* new_weights,
3946 std::stringstream *ss,
3947 std::string *out_str,
3948 ceph::Formatter *f)
3949 {
3950 if (oload <= 100) {
3951 *ss << "You must give a percentage higher than 100. "
3952 "The reweighting threshold will be calculated as <average-utilization> "
3953 "times <input-percentage>. For example, an argument of 200 would "
3954 "reweight OSDs which are twice as utilized as the average OSD.\n";
3955 return -EINVAL;
3956 }
3957
3958 vector<int> pgs_by_osd(osdmap.get_max_osd());
3959
3960 // Avoid putting a small number (or 0) in the denominator when calculating
3961 // average_util
3962 double average_util;
3963 if (by_pg) {
3964 // by pg mapping
3965 double weight_sum = 0.0; // sum up the crush weights
3966 unsigned num_pg_copies = 0;
3967 int num_osds = 0;
3968 for (const auto& pg : pgm.pg_stat) {
3969 if (pools && pools->count(pg.first.pool()) == 0)
3970 continue;
3971 for (const auto acting : pg.second.acting) {
3972 if (!osdmap.exists(acting)) {
3973 continue;
3974 }
3975 if (acting >= (int)pgs_by_osd.size())
3976 pgs_by_osd.resize(acting);
3977 if (pgs_by_osd[acting] == 0) {
3978 if (osdmap.crush->get_item_weightf(acting) <= 0) {
3979 //skip if we currently can not identify item
3980 continue;
3981 }
3982 weight_sum += osdmap.crush->get_item_weightf(acting);
3983 ++num_osds;
3984 }
3985 ++pgs_by_osd[acting];
3986 ++num_pg_copies;
3987 }
3988 }
3989
3990 if (!num_osds || (num_pg_copies / num_osds < g_conf()->mon_reweight_min_pgs_per_osd)) {
3991 *ss << "Refusing to reweight: we only have " << num_pg_copies
3992 << " PGs across " << num_osds << " osds!\n";
3993 return -EDOM;
3994 }
3995
3996 average_util = (double)num_pg_copies / weight_sum;
3997 } else {
3998 // by osd utilization
3999 int num_osd = std::max<size_t>(1, pgm.osd_stat.size());
4000 if ((uint64_t)pgm.osd_sum.statfs.total / num_osd
4001 < g_conf()->mon_reweight_min_bytes_per_osd) {
4002 *ss << "Refusing to reweight: we only have " << pgm.osd_sum.statfs.kb()
4003 << " kb across all osds!\n";
4004 return -EDOM;
4005 }
4006 if ((uint64_t)pgm.osd_sum.statfs.get_used_raw() / num_osd
4007 < g_conf()->mon_reweight_min_bytes_per_osd) {
4008 *ss << "Refusing to reweight: we only have "
4009 << pgm.osd_sum.statfs.kb_used_raw()
4010 << " kb used across all osds!\n";
4011 return -EDOM;
4012 }
4013
4014 average_util = (double)pgm.osd_sum.statfs.get_used_raw() /
4015 (double)pgm.osd_sum.statfs.total;
4016 }
4017
4018 // adjust down only if we are above the threshold
4019 const double overload_util = average_util * (double)oload / 100.0;
4020
4021 // but aggressively adjust weights up whenever possible.
4022 const double underload_util = average_util;
4023
4024 const unsigned max_change = (unsigned)(max_changef * (double)0x10000);
4025
4026 ostringstream oss;
4027 if (f) {
4028 f->open_object_section("reweight_by_utilization");
4029 f->dump_int("overload_min", oload);
4030 f->dump_float("max_change", max_changef);
4031 f->dump_int("max_change_osds", max_osds);
4032 f->dump_float("average_utilization", average_util);
4033 f->dump_float("overload_utilization", overload_util);
4034 } else {
4035 oss << "oload " << oload << "\n";
4036 oss << "max_change " << max_changef << "\n";
4037 oss << "max_change_osds " << max_osds << "\n";
4038 oss.precision(4);
4039 oss << "average_utilization " << std::fixed << average_util << "\n";
4040 oss << "overload_utilization " << overload_util << "\n";
4041 }
4042 int num_changed = 0;
4043
4044 // precompute util for each OSD
4045 std::vector<std::pair<int, float> > util_by_osd;
4046 for (const auto& p : pgm.osd_stat) {
4047 std::pair<int, float> osd_util;
4048 osd_util.first = p.first;
4049 if (by_pg) {
4050 if (p.first >= (int)pgs_by_osd.size() ||
4051 pgs_by_osd[p.first] == 0) {
4052 // skip if this OSD does not contain any pg
4053 // belonging to the specified pool(s).
4054 continue;
4055 }
4056
4057 if (osdmap.crush->get_item_weightf(p.first) <= 0) {
4058 // skip if we are unable to locate item.
4059 continue;
4060 }
4061
4062 osd_util.second =
4063 pgs_by_osd[p.first] / osdmap.crush->get_item_weightf(p.first);
4064 } else {
4065 osd_util.second =
4066 (double)p.second.statfs.get_used_raw() / (double)p.second.statfs.total;
4067 }
4068 util_by_osd.push_back(osd_util);
4069 }
4070
4071 // sort by absolute deviation from the mean utilization,
4072 // in descending order.
4073 std::sort(util_by_osd.begin(), util_by_osd.end(),
4074 [average_util](std::pair<int, float> l, std::pair<int, float> r) {
4075 return abs(l.second - average_util) > abs(r.second - average_util);
4076 }
4077 );
4078
4079 if (f)
4080 f->open_array_section("reweights");
4081
4082 for (const auto& p : util_by_osd) {
4083 unsigned weight = osdmap.get_weight(p.first);
4084 if (weight == 0) {
4085 // skip if OSD is currently out
4086 continue;
4087 }
4088 float util = p.second;
4089
4090 if (util >= overload_util) {
4091 // Assign a lower weight to overloaded OSDs. The current weight
4092 // is a factor to take into account the original weights,
4093 // to represent e.g. differing storage capacities
4094 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
4095 if (weight > max_change)
4096 new_weight = std::max(new_weight, weight - max_change);
4097 new_weights->insert({p.first, new_weight});
4098 if (f) {
4099 f->open_object_section("osd");
4100 f->dump_int("osd", p.first);
4101 f->dump_float("weight", (float)weight / (float)0x10000);
4102 f->dump_float("new_weight", (float)new_weight / (float)0x10000);
4103 f->close_section();
4104 } else {
4105 oss << "osd." << p.first << " weight "
4106 << (float)weight / (float)0x10000 << " -> "
4107 << (float)new_weight / (float)0x10000 << "\n";
4108 }
4109 if (++num_changed >= max_osds)
4110 break;
4111 }
4112 if (!no_increasing && util <= underload_util) {
4113 // assign a higher weight.. if we can.
4114 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
4115 new_weight = std::min(new_weight, weight + max_change);
4116 if (new_weight > 0x10000)
4117 new_weight = 0x10000;
4118 if (new_weight > weight) {
4119 new_weights->insert({p.first, new_weight});
4120 oss << "osd." << p.first << " weight "
4121 << (float)weight / (float)0x10000 << " -> "
4122 << (float)new_weight / (float)0x10000 << "\n";
4123 if (++num_changed >= max_osds)
4124 break;
4125 }
4126 }
4127 }
4128 if (f) {
4129 f->close_section();
4130 }
4131
4132 OSDMap newmap;
4133 newmap.deepish_copy_from(osdmap);
4134 OSDMap::Incremental newinc;
4135 newinc.fsid = newmap.get_fsid();
4136 newinc.epoch = newmap.get_epoch() + 1;
4137 newinc.new_weight = *new_weights;
4138 newmap.apply_incremental(newinc);
4139
4140 osdmap.summarize_mapping_stats(&newmap, pools, out_str, f);
4141
4142 if (f) {
4143 f->close_section();
4144 } else {
4145 *out_str += "\n";
4146 *out_str += oss.str();
4147 }
4148 return num_changed;
4149 }