]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/PGMap.cc
import ceph 14.2.5
[ceph.git] / ceph / src / mon / PGMap.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include <boost/algorithm/string.hpp>
5
6 #include "PGMap.h"
7
8 #define dout_subsys ceph_subsys_mon
9 #include "common/debug.h"
10 #include "common/Clock.h"
11 #include "common/Formatter.h"
12 #include "global/global_context.h"
13 #include "include/ceph_features.h"
14 #include "include/stringify.h"
15
16 #include "osd/osd_types.h"
17 #include "osd/OSDMap.h"
18 #include <boost/range/adaptor/reversed.hpp>
19
20 #define dout_context g_ceph_context
21
// Mempool object-factory definitions for PGMapDigest, PGMap and
// PGMap::Incremental; all three name `pgmap` as their last argument.
// NOTE(review): the exact expansion lives in the mempool headers, which are
// not visible here -- presumably the last argument selects the pool; confirm.
22 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMapDigest, pgmap_digest, pgmap);
23 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap, pgmap, pgmap);
24 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap::Incremental, pgmap_inc, pgmap);
25
26
27 // ---------------------
28 // PGMapDigest
29
30 void PGMapDigest::encode(bufferlist& bl, uint64_t features) const
31 {
32 // NOTE: see PGMap::encode_digest
33 uint8_t v = 4;
34 if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
35 v = 1;
36 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
37 v = 3;
38 }
39 ENCODE_START(v, 1, bl);
40 encode(num_pg, bl);
41 encode(num_pg_active, bl);
42 encode(num_pg_unknown, bl);
43 encode(num_osd, bl);
44 encode(pg_pool_sum, bl, features);
45 encode(pg_sum, bl, features);
46 encode(osd_sum, bl, features);
47 if (v >= 2) {
48 encode(num_pg_by_state, bl);
49 } else {
50 uint32_t n = num_pg_by_state.size();
51 encode(n, bl);
52 for (auto p : num_pg_by_state) {
53 encode((uint32_t)p.first, bl);
54 encode(p.second, bl);
55 }
56 }
57 encode(num_pg_by_osd, bl);
58 encode(num_pg_by_pool, bl);
59 encode(osd_last_seq, bl);
60 encode(per_pool_sum_delta, bl, features);
61 encode(per_pool_sum_deltas_stamps, bl);
62 encode(pg_sum_delta, bl, features);
63 encode(stamp_delta, bl);
64 encode(avail_space_by_rule, bl);
65 if (struct_v >= 3) {
66 encode(purged_snaps, bl);
67 }
68 if (struct_v >= 4) {
69 encode(osd_sum_by_class, bl, features);
70 }
71 ENCODE_FINISH(bl);
72 }
73
74 void PGMapDigest::decode(bufferlist::const_iterator& p)
75 {
76 DECODE_START(4, p);
77 decode(num_pg, p);
78 decode(num_pg_active, p);
79 decode(num_pg_unknown, p);
80 decode(num_osd, p);
81 decode(pg_pool_sum, p);
82 decode(pg_sum, p);
83 decode(osd_sum, p);
84 if (struct_v >= 2) {
85 decode(num_pg_by_state, p);
86 } else {
87 map<int32_t, int32_t> nps;
88 decode(nps, p);
89 num_pg_by_state.clear();
90 for (auto i : nps) {
91 num_pg_by_state[i.first] = i.second;
92 }
93 }
94 decode(num_pg_by_osd, p);
95 decode(num_pg_by_pool, p);
96 decode(osd_last_seq, p);
97 decode(per_pool_sum_delta, p);
98 decode(per_pool_sum_deltas_stamps, p);
99 decode(pg_sum_delta, p);
100 decode(stamp_delta, p);
101 decode(avail_space_by_rule, p);
102 if (struct_v >= 3) {
103 decode(purged_snaps, p);
104 }
105 if (struct_v >= 4) {
106 decode(osd_sum_by_class, p);
107 }
108 DECODE_FINISH(p);
109 }
110
111 void PGMapDigest::dump(Formatter *f) const
112 {
113 f->dump_unsigned("num_pg", num_pg);
114 f->dump_unsigned("num_pg_active", num_pg_active);
115 f->dump_unsigned("num_pg_unknown", num_pg_unknown);
116 f->dump_unsigned("num_osd", num_osd);
117 f->dump_object("pool_sum", pg_sum);
118 f->dump_object("osd_sum", osd_sum);
119
120 f->open_object_section("osd_sum_by_class");
121 for (auto& i : osd_sum_by_class) {
122 f->dump_object(i.first.c_str(), i.second);
123 }
124 f->close_section();
125
126 f->open_array_section("pool_stats");
127 for (auto& p : pg_pool_sum) {
128 f->open_object_section("pool_stat");
129 f->dump_int("poolid", p.first);
130 auto q = num_pg_by_pool.find(p.first);
131 if (q != num_pg_by_pool.end())
132 f->dump_unsigned("num_pg", q->second);
133 p.second.dump(f);
134 f->close_section();
135 }
136 f->close_section();
137 f->open_array_section("osd_stats");
138 int i = 0;
139 // TODO: this isn't really correct since we can dump non-existent OSDs
140 // I dunno what osd_last_seq is set to in that case...
141 for (auto& p : osd_last_seq) {
142 f->open_object_section("osd_stat");
143 f->dump_int("osd", i);
144 f->dump_unsigned("seq", p);
145 f->close_section();
146 ++i;
147 }
148 f->close_section();
149 f->open_array_section("num_pg_by_state");
150 for (auto& p : num_pg_by_state) {
151 f->open_object_section("count");
152 f->dump_string("state", pg_state_string(p.first));
153 f->dump_unsigned("num", p.second);
154 f->close_section();
155 }
156 f->close_section();
157 f->open_array_section("num_pg_by_osd");
158 for (auto& p : num_pg_by_osd) {
159 f->open_object_section("count");
160 f->dump_unsigned("osd", p.first);
161 f->dump_unsigned("num_primary_pg", p.second.primary);
162 f->dump_unsigned("num_acting_pg", p.second.acting);
163 f->dump_unsigned("num_up_not_acting_pg", p.second.up_not_acting);
164 f->close_section();
165 }
166 f->close_section();
167 f->open_array_section("purged_snaps");
168 for (auto& j : purged_snaps) {
169 f->open_object_section("pool");
170 f->dump_int("pool", j.first);
171 f->open_object_section("purged_snaps");
172 for (auto i = j.second.begin(); i != j.second.end(); ++i) {
173 f->open_object_section("interval");
174 f->dump_stream("start") << i.get_start();
175 f->dump_stream("length") << i.get_len();
176 f->close_section();
177 }
178 f->close_section();
179 f->close_section();
180 }
181 f->close_section();
182 }
183
184 void PGMapDigest::generate_test_instances(list<PGMapDigest*>& ls)
185 {
186 ls.push_back(new PGMapDigest);
187 }
188
// Format `a` for a percentage column: anything below 0.01 (including
// negative values) is shown as "0", everything else with two fixed decimals.
inline std::string percentify(const float& a) {
  if (a < 0.01)
    return "0";

  std::ostringstream formatted;
  formatted << std::fixed << std::setprecision(2) << a;
  return formatted.str();
}
197
198 void PGMapDigest::print_summary(Formatter *f, ostream *out) const
199 {
200 if (f)
201 f->open_array_section("pgs_by_state");
202
203 // list is descending numeric order (by count)
204 multimap<int,int> state_by_count; // count -> state
205 for (auto p = num_pg_by_state.begin();
206 p != num_pg_by_state.end();
207 ++p) {
208 state_by_count.insert(make_pair(p->second, p->first));
209 }
210 if (f) {
211 for (auto p = state_by_count.rbegin();
212 p != state_by_count.rend();
213 ++p)
214 {
215 f->open_object_section("pgs_by_state_element");
216 f->dump_string("state_name", pg_state_string(p->second));
217 f->dump_unsigned("count", p->first);
218 f->close_section();
219 }
220 }
221 if (f)
222 f->close_section();
223
224 if (f) {
225 f->dump_unsigned("num_pgs", num_pg);
226 f->dump_unsigned("num_pools", pg_pool_sum.size());
227 f->dump_unsigned("num_objects", pg_sum.stats.sum.num_objects);
228 f->dump_unsigned("data_bytes", pg_sum.stats.sum.num_bytes);
229 f->dump_unsigned("bytes_used", osd_sum.statfs.get_used_raw());
230 f->dump_unsigned("bytes_avail", osd_sum.statfs.available);
231 f->dump_unsigned("bytes_total", osd_sum.statfs.total);
232 } else {
233 *out << " pools: " << pg_pool_sum.size() << " pools, "
234 << num_pg << " pgs\n";
235 *out << " objects: " << si_u_t(pg_sum.stats.sum.num_objects) << " objects, "
236 << byte_u_t(pg_sum.stats.sum.num_bytes) << "\n";
237 *out << " usage: "
238 << byte_u_t(osd_sum.statfs.get_used_raw()) << " used, "
239 << byte_u_t(osd_sum.statfs.available) << " / "
240 << byte_u_t(osd_sum.statfs.total) << " avail\n";
241 *out << " pgs: ";
242 }
243
244 bool pad = false;
245
246 if (num_pg_unknown > 0) {
247 float p = (float)num_pg_unknown / (float)num_pg;
248 if (f) {
249 f->dump_float("unknown_pgs_ratio", p);
250 } else {
251 char b[20];
252 snprintf(b, sizeof(b), "%.3lf", p * 100.0);
253 *out << b << "% pgs unknown\n";
254 pad = true;
255 }
256 }
257
258 int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
259 if (num_pg_inactive > 0) {
260 float p = (float)num_pg_inactive / (float)num_pg;
261 if (f) {
262 f->dump_float("inactive_pgs_ratio", p);
263 } else {
264 if (pad) {
265 *out << " ";
266 }
267 char b[20];
268 snprintf(b, sizeof(b), "%.3f", p * 100.0);
269 *out << b << "% pgs not active\n";
270 pad = true;
271 }
272 }
273
274 list<string> sl;
275 overall_recovery_summary(f, &sl);
276 if (!f && !sl.empty()) {
277 for (auto p = sl.begin(); p != sl.end(); ++p) {
278 if (pad) {
279 *out << " ";
280 }
281 *out << *p << "\n";
282 pad = true;
283 }
284 }
285 sl.clear();
286
287 if (!f) {
288 unsigned max_width = 1;
289 for (multimap<int,int>::reverse_iterator p = state_by_count.rbegin();
290 p != state_by_count.rend();
291 ++p)
292 {
293 std::stringstream ss;
294 ss << p->first;
295 max_width = std::max<size_t>(ss.str().size(), max_width);
296 }
297
298 for (multimap<int,int>::reverse_iterator p = state_by_count.rbegin();
299 p != state_by_count.rend();
300 ++p)
301 {
302 if (pad) {
303 *out << " ";
304 }
305 pad = true;
306 out->setf(std::ios::left);
307 *out << std::setw(max_width) << p->first
308 << " " << pg_state_string(p->second) << "\n";
309 out->unsetf(std::ios::left);
310 }
311 }
312
313 ostringstream ss_rec_io;
314 overall_recovery_rate_summary(f, &ss_rec_io);
315 ostringstream ss_client_io;
316 overall_client_io_rate_summary(f, &ss_client_io);
317 ostringstream ss_cache_io;
318 overall_cache_io_rate_summary(f, &ss_cache_io);
319
320 if (!f && (ss_client_io.str().length() || ss_rec_io.str().length()
321 || ss_cache_io.str().length())) {
322 *out << "\n \n";
323 *out << " io:\n";
324 }
325
326 if (!f && ss_client_io.str().length())
327 *out << " client: " << ss_client_io.str() << "\n";
328 if (!f && ss_rec_io.str().length())
329 *out << " recovery: " << ss_rec_io.str() << "\n";
330 if (!f && ss_cache_io.str().length())
331 *out << " cache: " << ss_cache_io.str() << "\n";
332 }
333
334 void PGMapDigest::print_oneline_summary(Formatter *f, ostream *out) const
335 {
336 std::stringstream ss;
337
338 if (f)
339 f->open_array_section("num_pg_by_state");
340 for (auto p = num_pg_by_state.begin();
341 p != num_pg_by_state.end();
342 ++p) {
343 if (f) {
344 f->open_object_section("state");
345 f->dump_string("name", pg_state_string(p->first));
346 f->dump_unsigned("num", p->second);
347 f->close_section();
348 }
349 if (p != num_pg_by_state.begin())
350 ss << ", ";
351 ss << p->second << " " << pg_state_string(p->first);
352 }
353 if (f)
354 f->close_section();
355
356 string states = ss.str();
357 if (out)
358 *out << num_pg << " pgs: "
359 << states << "; "
360 << byte_u_t(pg_sum.stats.sum.num_bytes) << " data, "
361 << byte_u_t(osd_sum.statfs.get_used()) << " used, "
362 << byte_u_t(osd_sum.statfs.available) << " / "
363 << byte_u_t(osd_sum.statfs.total) << " avail";
364 if (f) {
365 f->dump_unsigned("num_pgs", num_pg);
366 f->dump_unsigned("num_bytes", pg_sum.stats.sum.num_bytes);
367 f->dump_int("total_bytes", osd_sum.statfs.total);
368 f->dump_int("total_avail_bytes", osd_sum.statfs.available);
369 f->dump_int("total_used_bytes", osd_sum.statfs.get_used());
370 f->dump_int("total_used_raw_bytes", osd_sum.statfs.get_used_raw());
371 }
372
373 // make non-negative; we can get negative values if osds send
374 // uncommitted stats and then "go backward" or if they are just
375 // buggy/wrong.
376 pool_stat_t pos_delta = pg_sum_delta;
377 pos_delta.floor(0);
378 if (pos_delta.stats.sum.num_rd ||
379 pos_delta.stats.sum.num_wr) {
380 if (out)
381 *out << "; ";
382 if (pos_delta.stats.sum.num_rd) {
383 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)stamp_delta;
384 if (out)
385 *out << byte_u_t(rd) << "/s rd, ";
386 if (f)
387 f->dump_unsigned("read_bytes_sec", rd);
388 }
389 if (pos_delta.stats.sum.num_wr) {
390 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)stamp_delta;
391 if (out)
392 *out << byte_u_t(wr) << "/s wr, ";
393 if (f)
394 f->dump_unsigned("write_bytes_sec", wr);
395 }
396 int64_t iops = (pos_delta.stats.sum.num_rd + pos_delta.stats.sum.num_wr) / (double)stamp_delta;
397 if (out)
398 *out << si_u_t(iops) << " op/s";
399 if (f)
400 f->dump_unsigned("io_sec", iops);
401 }
402
403 list<string> sl;
404 overall_recovery_summary(f, &sl);
405 if (out)
406 for (auto p = sl.begin(); p != sl.end(); ++p)
407 *out << "; " << *p;
408 std::stringstream ssr;
409 overall_recovery_rate_summary(f, &ssr);
410 if (out && ssr.str().length())
411 *out << "; " << ssr.str() << " recovering";
412 }
413
414 void PGMapDigest::get_recovery_stats(
415 double *misplaced_ratio,
416 double *degraded_ratio,
417 double *inactive_pgs_ratio,
418 double *unknown_pgs_ratio) const
419 {
420 if (pg_sum.stats.sum.num_objects_degraded &&
421 pg_sum.stats.sum.num_object_copies > 0) {
422 *degraded_ratio = (double)pg_sum.stats.sum.num_objects_degraded /
423 (double)pg_sum.stats.sum.num_object_copies;
424 } else {
425 *degraded_ratio = 0;
426 }
427 if (pg_sum.stats.sum.num_objects_misplaced &&
428 pg_sum.stats.sum.num_object_copies > 0) {
429 *misplaced_ratio = (double)pg_sum.stats.sum.num_objects_misplaced /
430 (double)pg_sum.stats.sum.num_object_copies;
431 } else {
432 *misplaced_ratio = 0;
433 }
434 if (num_pg > 0) {
435 int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
436 *inactive_pgs_ratio = (double)num_pg_inactive / (double)num_pg;
437 *unknown_pgs_ratio = (double)num_pg_unknown / (double)num_pg;
438 } else {
439 *inactive_pgs_ratio = 0;
440 *unknown_pgs_ratio = 0;
441 }
442 }
443
444 void PGMapDigest::recovery_summary(Formatter *f, list<string> *psl,
445 const pool_stat_t& pool_sum) const
446 {
447 if (pool_sum.stats.sum.num_objects_degraded && pool_sum.stats.sum.num_object_copies > 0) {
448 double pc = (double)pool_sum.stats.sum.num_objects_degraded /
449 (double)pool_sum.stats.sum.num_object_copies * (double)100.0;
450 char b[20];
451 snprintf(b, sizeof(b), "%.3lf", pc);
452 if (f) {
453 f->dump_unsigned("degraded_objects", pool_sum.stats.sum.num_objects_degraded);
454 f->dump_unsigned("degraded_total", pool_sum.stats.sum.num_object_copies);
455 f->dump_float("degraded_ratio", pc / 100.0);
456 } else {
457 ostringstream ss;
458 ss << pool_sum.stats.sum.num_objects_degraded
459 << "/" << pool_sum.stats.sum.num_object_copies << " objects degraded (" << b << "%)";
460 psl->push_back(ss.str());
461 }
462 }
463 if (pool_sum.stats.sum.num_objects_misplaced && pool_sum.stats.sum.num_object_copies > 0) {
464 double pc = (double)pool_sum.stats.sum.num_objects_misplaced /
465 (double)pool_sum.stats.sum.num_object_copies * (double)100.0;
466 char b[20];
467 snprintf(b, sizeof(b), "%.3lf", pc);
468 if (f) {
469 f->dump_unsigned("misplaced_objects", pool_sum.stats.sum.num_objects_misplaced);
470 f->dump_unsigned("misplaced_total", pool_sum.stats.sum.num_object_copies);
471 f->dump_float("misplaced_ratio", pc / 100.0);
472 } else {
473 ostringstream ss;
474 ss << pool_sum.stats.sum.num_objects_misplaced
475 << "/" << pool_sum.stats.sum.num_object_copies << " objects misplaced (" << b << "%)";
476 psl->push_back(ss.str());
477 }
478 }
479 if (pool_sum.stats.sum.num_objects_unfound && pool_sum.stats.sum.num_objects) {
480 double pc = (double)pool_sum.stats.sum.num_objects_unfound /
481 (double)pool_sum.stats.sum.num_objects * (double)100.0;
482 char b[20];
483 snprintf(b, sizeof(b), "%.3lf", pc);
484 if (f) {
485 f->dump_unsigned("unfound_objects", pool_sum.stats.sum.num_objects_unfound);
486 f->dump_unsigned("unfound_total", pool_sum.stats.sum.num_objects);
487 f->dump_float("unfound_ratio", pc / 100.0);
488 } else {
489 ostringstream ss;
490 ss << pool_sum.stats.sum.num_objects_unfound
491 << "/" << pool_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
492 psl->push_back(ss.str());
493 }
494 }
495 }
496
497 void PGMapDigest::recovery_rate_summary(Formatter *f, ostream *out,
498 const pool_stat_t& delta_sum,
499 utime_t delta_stamp) const
500 {
501 // make non-negative; we can get negative values if osds send
502 // uncommitted stats and then "go backward" or if they are just
503 // buggy/wrong.
504 pool_stat_t pos_delta = delta_sum;
505 pos_delta.floor(0);
506 if (pos_delta.stats.sum.num_objects_recovered ||
507 pos_delta.stats.sum.num_bytes_recovered ||
508 pos_delta.stats.sum.num_keys_recovered) {
509 int64_t objps = pos_delta.stats.sum.num_objects_recovered / (double)delta_stamp;
510 int64_t bps = pos_delta.stats.sum.num_bytes_recovered / (double)delta_stamp;
511 int64_t kps = pos_delta.stats.sum.num_keys_recovered / (double)delta_stamp;
512 if (f) {
513 f->dump_int("recovering_objects_per_sec", objps);
514 f->dump_int("recovering_bytes_per_sec", bps);
515 f->dump_int("recovering_keys_per_sec", kps);
516 f->dump_int("num_objects_recovered", pos_delta.stats.sum.num_objects_recovered);
517 f->dump_int("num_bytes_recovered", pos_delta.stats.sum.num_bytes_recovered);
518 f->dump_int("num_keys_recovered", pos_delta.stats.sum.num_keys_recovered);
519 } else {
520 *out << byte_u_t(bps) << "/s";
521 if (pos_delta.stats.sum.num_keys_recovered)
522 *out << ", " << si_u_t(kps) << " keys/s";
523 *out << ", " << si_u_t(objps) << " objects/s";
524 }
525 }
526 }
527
528 void PGMapDigest::overall_recovery_rate_summary(Formatter *f, ostream *out) const
529 {
530 recovery_rate_summary(f, out, pg_sum_delta, stamp_delta);
531 }
532
533 void PGMapDigest::overall_recovery_summary(Formatter *f, list<string> *psl) const
534 {
535 recovery_summary(f, psl, pg_sum);
536 }
537
538 void PGMapDigest::pool_recovery_rate_summary(Formatter *f, ostream *out,
539 uint64_t poolid) const
540 {
541 auto p = per_pool_sum_delta.find(poolid);
542 if (p == per_pool_sum_delta.end())
543 return;
544
545 auto ts = per_pool_sum_deltas_stamps.find(p->first);
546 ceph_assert(ts != per_pool_sum_deltas_stamps.end());
547 recovery_rate_summary(f, out, p->second.first, ts->second);
548 }
549
550 void PGMapDigest::pool_recovery_summary(Formatter *f, list<string> *psl,
551 uint64_t poolid) const
552 {
553 auto p = pg_pool_sum.find(poolid);
554 if (p == pg_pool_sum.end())
555 return;
556
557 recovery_summary(f, psl, p->second);
558 }
559
560 void PGMapDigest::client_io_rate_summary(Formatter *f, ostream *out,
561 const pool_stat_t& delta_sum,
562 utime_t delta_stamp) const
563 {
564 pool_stat_t pos_delta = delta_sum;
565 pos_delta.floor(0);
566 if (pos_delta.stats.sum.num_rd ||
567 pos_delta.stats.sum.num_wr) {
568 if (pos_delta.stats.sum.num_rd) {
569 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)delta_stamp;
570 if (f) {
571 f->dump_int("read_bytes_sec", rd);
572 } else {
573 *out << byte_u_t(rd) << "/s rd, ";
574 }
575 }
576 if (pos_delta.stats.sum.num_wr) {
577 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)delta_stamp;
578 if (f) {
579 f->dump_int("write_bytes_sec", wr);
580 } else {
581 *out << byte_u_t(wr) << "/s wr, ";
582 }
583 }
584 int64_t iops_rd = pos_delta.stats.sum.num_rd / (double)delta_stamp;
585 int64_t iops_wr = pos_delta.stats.sum.num_wr / (double)delta_stamp;
586 if (f) {
587 f->dump_int("read_op_per_sec", iops_rd);
588 f->dump_int("write_op_per_sec", iops_wr);
589 } else {
590 *out << si_u_t(iops_rd) << " op/s rd, " << si_u_t(iops_wr) << " op/s wr";
591 }
592 }
593 }
594
595 void PGMapDigest::overall_client_io_rate_summary(Formatter *f, ostream *out) const
596 {
597 client_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
598 }
599
600 void PGMapDigest::pool_client_io_rate_summary(Formatter *f, ostream *out,
601 uint64_t poolid) const
602 {
603 auto p = per_pool_sum_delta.find(poolid);
604 if (p == per_pool_sum_delta.end())
605 return;
606
607 auto ts = per_pool_sum_deltas_stamps.find(p->first);
608 ceph_assert(ts != per_pool_sum_deltas_stamps.end());
609 client_io_rate_summary(f, out, p->second.first, ts->second);
610 }
611
612 void PGMapDigest::cache_io_rate_summary(Formatter *f, ostream *out,
613 const pool_stat_t& delta_sum,
614 utime_t delta_stamp) const
615 {
616 pool_stat_t pos_delta = delta_sum;
617 pos_delta.floor(0);
618 bool have_output = false;
619
620 if (pos_delta.stats.sum.num_flush) {
621 int64_t flush = (pos_delta.stats.sum.num_flush_kb << 10) / (double)delta_stamp;
622 if (f) {
623 f->dump_int("flush_bytes_sec", flush);
624 } else {
625 *out << byte_u_t(flush) << "/s flush";
626 have_output = true;
627 }
628 }
629 if (pos_delta.stats.sum.num_evict) {
630 int64_t evict = (pos_delta.stats.sum.num_evict_kb << 10) / (double)delta_stamp;
631 if (f) {
632 f->dump_int("evict_bytes_sec", evict);
633 } else {
634 if (have_output)
635 *out << ", ";
636 *out << byte_u_t(evict) << "/s evict";
637 have_output = true;
638 }
639 }
640 if (pos_delta.stats.sum.num_promote) {
641 int64_t promote = pos_delta.stats.sum.num_promote / (double)delta_stamp;
642 if (f) {
643 f->dump_int("promote_op_per_sec", promote);
644 } else {
645 if (have_output)
646 *out << ", ";
647 *out << si_u_t(promote) << " op/s promote";
648 have_output = true;
649 }
650 }
651 if (pos_delta.stats.sum.num_flush_mode_low) {
652 if (f) {
653 f->dump_int("num_flush_mode_low", pos_delta.stats.sum.num_flush_mode_low);
654 } else {
655 if (have_output)
656 *out << ", ";
657 *out << si_u_t(pos_delta.stats.sum.num_flush_mode_low) << " PGs flushing";
658 have_output = true;
659 }
660 }
661 if (pos_delta.stats.sum.num_flush_mode_high) {
662 if (f) {
663 f->dump_int("num_flush_mode_high", pos_delta.stats.sum.num_flush_mode_high);
664 } else {
665 if (have_output)
666 *out << ", ";
667 *out << si_u_t(pos_delta.stats.sum.num_flush_mode_high) << " PGs flushing (high)";
668 have_output = true;
669 }
670 }
671 if (pos_delta.stats.sum.num_evict_mode_some) {
672 if (f) {
673 f->dump_int("num_evict_mode_some", pos_delta.stats.sum.num_evict_mode_some);
674 } else {
675 if (have_output)
676 *out << ", ";
677 *out << si_u_t(pos_delta.stats.sum.num_evict_mode_some) << " PGs evicting";
678 have_output = true;
679 }
680 }
681 if (pos_delta.stats.sum.num_evict_mode_full) {
682 if (f) {
683 f->dump_int("num_evict_mode_full", pos_delta.stats.sum.num_evict_mode_full);
684 } else {
685 if (have_output)
686 *out << ", ";
687 *out << si_u_t(pos_delta.stats.sum.num_evict_mode_full) << " PGs evicting (full)";
688 }
689 }
690 }
691
692 void PGMapDigest::overall_cache_io_rate_summary(Formatter *f, ostream *out) const
693 {
694 cache_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
695 }
696
697 void PGMapDigest::pool_cache_io_rate_summary(Formatter *f, ostream *out,
698 uint64_t poolid) const
699 {
700 auto p = per_pool_sum_delta.find(poolid);
701 if (p == per_pool_sum_delta.end())
702 return;
703
704 auto ts = per_pool_sum_deltas_stamps.find(p->first);
705 ceph_assert(ts != per_pool_sum_deltas_stamps.end());
706 cache_io_rate_summary(f, out, p->second.first, ts->second);
707 }
708
709 ceph_statfs PGMapDigest::get_statfs(OSDMap &osdmap,
710 boost::optional<int64_t> data_pool) const
711 {
712 ceph_statfs statfs;
713 bool filter = false;
714 object_stat_sum_t sum;
715
716 if (data_pool) {
717 auto i = pg_pool_sum.find(*data_pool);
718 if (i != pg_pool_sum.end()) {
719 sum = i->second.stats.sum;
720 filter = true;
721 }
722 }
723
724 if (filter) {
725 statfs.kb_used = (sum.num_bytes >> 10);
726 statfs.kb_avail = get_pool_free_space(osdmap, *data_pool) >> 10;
727 statfs.num_objects = sum.num_objects;
728 statfs.kb = statfs.kb_used + statfs.kb_avail;
729 } else {
730 // these are in KB.
731 statfs.kb = osd_sum.statfs.kb();
732 statfs.kb_used = osd_sum.statfs.kb_used_raw();
733 statfs.kb_avail = osd_sum.statfs.kb_avail();
734 statfs.num_objects = pg_sum.stats.sum.num_objects;
735 }
736
737 return statfs;
738 }
739
740 void PGMapDigest::dump_pool_stats_full(
741 const OSDMap &osd_map,
742 stringstream *ss,
743 Formatter *f,
744 bool verbose) const
745 {
746 TextTable tbl;
747
748 if (f) {
749 f->open_array_section("pools");
750 } else {
751 tbl.define_column("POOL", TextTable::LEFT, TextTable::LEFT);
752 tbl.define_column("ID", TextTable::LEFT, TextTable::RIGHT);
753 tbl.define_column("STORED", TextTable::LEFT, TextTable::RIGHT);
754 tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
755 tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
756 tbl.define_column("%USED", TextTable::LEFT, TextTable::RIGHT);
757 tbl.define_column("MAX AVAIL", TextTable::LEFT, TextTable::RIGHT);
758
759 if (verbose) {
760 tbl.define_column("QUOTA OBJECTS", TextTable::LEFT, TextTable::LEFT);
761 tbl.define_column("QUOTA BYTES", TextTable::LEFT, TextTable::LEFT);
762 tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT);
763 tbl.define_column("USED COMPR", TextTable::LEFT, TextTable::RIGHT);
764 tbl.define_column("UNDER COMPR", TextTable::LEFT, TextTable::RIGHT);
765 }
766 }
767
768 map<int,uint64_t> avail_by_rule;
769 for (auto p = osd_map.get_pools().begin();
770 p != osd_map.get_pools().end(); ++p) {
771 int64_t pool_id = p->first;
772 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
773 continue;
774
775 const string& pool_name = osd_map.get_pool_name(pool_id);
776 const pool_stat_t &stat = pg_pool_sum.at(pool_id);
777
778 const pg_pool_t *pool = osd_map.get_pg_pool(pool_id);
779 int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
780 pool->get_type(),
781 pool->get_size());
782 int64_t avail;
783 if (avail_by_rule.count(ruleno) == 0) {
784 // FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked
785 avail = get_rule_avail(ruleno);
786 if (avail < 0)
787 avail = 0;
788 avail_by_rule[ruleno] = avail;
789 } else {
790 avail = avail_by_rule[ruleno];
791 }
792 if (f) {
793 f->open_object_section("pool");
794 f->dump_string("name", pool_name);
795 f->dump_int("id", pool_id);
796 f->open_object_section("stats");
797 } else {
798 tbl << pool_name
799 << pool_id;
800 }
801 float raw_used_rate = osd_map.pool_raw_used_rate(pool_id);
802 bool per_pool = use_per_pool_stats();
803 dump_object_stat_sum(tbl, f, stat, avail, raw_used_rate, verbose, per_pool,
804 pool);
805 if (f) {
806 f->close_section(); // stats
807 f->close_section(); // pool
808 } else {
809 tbl << TextTable::endrow;
810 }
811 }
812 if (f)
813 f->close_section();
814 else {
815 ceph_assert(ss != nullptr);
816 *ss << "POOLS:\n";
817 tbl.set_indent(4);
818 *ss << tbl;
819 }
820 }
821
822 void PGMapDigest::dump_cluster_stats(stringstream *ss,
823 Formatter *f,
824 bool verbose) const
825 {
826 if (f) {
827 f->open_object_section("stats");
828 f->dump_int("total_bytes", osd_sum.statfs.total);
829 f->dump_int("total_avail_bytes", osd_sum.statfs.available);
830 f->dump_int("total_used_bytes", osd_sum.statfs.get_used());
831 f->dump_int("total_used_raw_bytes", osd_sum.statfs.get_used_raw());
832 f->dump_float("total_used_raw_ratio", osd_sum.statfs.get_used_raw_ratio());
833 f->dump_unsigned("num_osds", osd_sum.num_osds);
834 f->dump_unsigned("num_per_pool_osds", osd_sum.num_per_pool_osds);
835 f->close_section();
836 f->open_object_section("stats_by_class");
837 for (auto& i : osd_sum_by_class) {
838 f->open_object_section(i.first.c_str());
839 f->dump_int("total_bytes", i.second.statfs.total);
840 f->dump_int("total_avail_bytes", i.second.statfs.available);
841 f->dump_int("total_used_bytes", i.second.statfs.get_used());
842 f->dump_int("total_used_raw_bytes", i.second.statfs.get_used_raw());
843 f->dump_float("total_used_raw_ratio",
844 i.second.statfs.get_used_raw_ratio());
845 f->close_section();
846 }
847 f->close_section();
848 } else {
849 ceph_assert(ss != nullptr);
850 TextTable tbl;
851 tbl.define_column("CLASS", TextTable::LEFT, TextTable::LEFT);
852 tbl.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
853 tbl.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
854 tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
855 tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
856 tbl.define_column("%RAW USED", TextTable::LEFT, TextTable::RIGHT);
857
858
859 for (auto& i : osd_sum_by_class) {
860 tbl << i.first;
861 tbl << stringify(byte_u_t(i.second.statfs.total))
862 << stringify(byte_u_t(i.second.statfs.available))
863 << stringify(byte_u_t(i.second.statfs.get_used()))
864 << stringify(byte_u_t(i.second.statfs.get_used_raw()))
865 << percentify(i.second.statfs.get_used_raw_ratio()*100.0)
866 << TextTable::endrow;
867 }
868 tbl << "TOTAL";
869 tbl << stringify(byte_u_t(osd_sum.statfs.total))
870 << stringify(byte_u_t(osd_sum.statfs.available))
871 << stringify(byte_u_t(osd_sum.statfs.get_used()))
872 << stringify(byte_u_t(osd_sum.statfs.get_used_raw()))
873 << percentify(osd_sum.statfs.get_used_raw_ratio()*100.0)
874 << TextTable::endrow;
875
876 *ss << "RAW STORAGE:\n";
877 tbl.set_indent(4);
878 *ss << tbl;
879 }
880 }
881
882 void PGMapDigest::dump_object_stat_sum(
883 TextTable &tbl, Formatter *f,
884 const pool_stat_t &pool_stat, uint64_t avail,
885 float raw_used_rate, bool verbose, bool per_pool,
886 const pg_pool_t *pool)
887 {
888 const object_stat_sum_t &sum = pool_stat.stats.sum;
889 const store_statfs_t statfs = pool_stat.store_stats;
890
891 if (sum.num_object_copies > 0) {
892 raw_used_rate *= (float)(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies;
893 }
894
895 uint64_t used_bytes = pool_stat.get_allocated_bytes(per_pool);
896
897 float used = 0.0;
898 // note avail passed in is raw_avail, calc raw_used here.
899 if (avail) {
900 used = used_bytes;
901 used /= used + avail;
902 } else if (used_bytes) {
903 used = 1.0;
904 }
905 auto avail_res = raw_used_rate ? avail / raw_used_rate : 0;
906 // an approximation for actually stored user data
907 auto stored_normalized = pool_stat.get_user_bytes(raw_used_rate, per_pool);
908 if (f) {
909 f->dump_int("stored", stored_normalized);
910 f->dump_int("objects", sum.num_objects);
911 f->dump_int("kb_used", shift_round_up(used_bytes, 10));
912 f->dump_int("bytes_used", used_bytes);
913 f->dump_float("percent_used", used);
914 f->dump_unsigned("max_avail", avail_res);
915 if (verbose) {
916 f->dump_int("quota_objects", pool->quota_max_objects);
917 f->dump_int("quota_bytes", pool->quota_max_bytes);
918 f->dump_int("dirty", sum.num_objects_dirty);
919 f->dump_int("rd", sum.num_rd);
920 f->dump_int("rd_bytes", sum.num_rd_kb * 1024ull);
921 f->dump_int("wr", sum.num_wr);
922 f->dump_int("wr_bytes", sum.num_wr_kb * 1024ull);
923 f->dump_int("compress_bytes_used", statfs.data_compressed_allocated);
924 f->dump_int("compress_under_bytes", statfs.data_compressed_original);
925 // Stored by user amplified by replication
926 f->dump_int("stored_raw", pool_stat.get_user_bytes(1.0, per_pool));
927 }
928 } else {
929 tbl << stringify(byte_u_t(stored_normalized));
930 tbl << stringify(si_u_t(sum.num_objects));
931 tbl << stringify(byte_u_t(used_bytes));
932 tbl << percentify(used*100);
933 tbl << stringify(byte_u_t(avail_res));
934 if (verbose) {
935 if (pool->quota_max_objects == 0)
936 tbl << "N/A";
937 else
938 tbl << stringify(si_u_t(pool->quota_max_objects));
939
940 if (pool->quota_max_bytes == 0)
941 tbl << "N/A";
942 else
943 tbl << stringify(byte_u_t(pool->quota_max_bytes));
944
945 tbl << stringify(si_u_t(sum.num_objects_dirty))
946 << stringify(byte_u_t(statfs.data_compressed_allocated))
947 << stringify(byte_u_t(statfs.data_compressed_original))
948 ;
949 }
950 }
951 }
952
953 int64_t PGMapDigest::get_pool_free_space(const OSDMap &osd_map,
954 int64_t poolid) const
955 {
956 const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
957 int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
958 pool->get_type(),
959 pool->get_size());
960 int64_t avail;
961 avail = get_rule_avail(ruleno);
962 if (avail < 0)
963 avail = 0;
964
965 return avail / osd_map.pool_raw_used_rate(poolid);
966 }
967
968 int64_t PGMap::get_rule_avail(const OSDMap& osdmap, int ruleno) const
969 {
970 map<int,float> wm;
971 int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm);
972 if (r < 0) {
973 return r;
974 }
975 if (wm.empty()) {
976 return 0;
977 }
978
979 float fratio = osdmap.get_full_ratio();
980
981 int64_t min = -1;
982 for (auto p = wm.begin(); p != wm.end(); ++p) {
983 auto osd_info = osd_stat.find(p->first);
984 if (osd_info != osd_stat.end()) {
985 if (osd_info->second.statfs.total == 0 || p->second == 0) {
986 // osd must be out, hence its stats have been zeroed
987 // (unless we somehow managed to have a disk with size 0...)
988 //
989 // (p->second == 0), if osd weight is 0, no need to
990 // calculate proj below.
991 continue;
992 }
993 double unusable = (double)osd_info->second.statfs.kb() *
994 (1.0 - fratio);
995 double avail = std::max(0.0, (double)osd_info->second.statfs.kb_avail() - unusable);
996 avail *= 1024.0;
997 int64_t proj = (int64_t)(avail / (double)p->second);
998 if (min < 0 || proj < min) {
999 min = proj;
1000 }
1001 } else {
1002 if (osdmap.is_up(p->first)) {
1003 // This is a level 4 rather than an error, because we might have
1004 // only just started, and not received the first stats message yet.
1005 dout(4) << "OSD " << p->first << " is up, but has no stats" << dendl;
1006 }
1007 }
1008 }
1009 return min;
1010 }
1011
1012 void PGMap::get_rules_avail(const OSDMap& osdmap,
1013 std::map<int,int64_t> *avail_map) const
1014 {
1015 avail_map->clear();
1016 for (auto p : osdmap.get_pools()) {
1017 int64_t pool_id = p.first;
1018 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
1019 continue;
1020 const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
1021 int ruleno = osdmap.crush->find_rule(pool->get_crush_rule(),
1022 pool->get_type(),
1023 pool->get_size());
1024 if (avail_map->count(ruleno) == 0)
1025 (*avail_map)[ruleno] = get_rule_avail(osdmap, ruleno);
1026 }
1027 }
1028
1029 // ---------------------
1030 // PGMap
1031
1032 void PGMap::Incremental::dump(Formatter *f) const
1033 {
1034 f->dump_unsigned("version", version);
1035 f->dump_stream("stamp") << stamp;
1036 f->dump_unsigned("osdmap_epoch", osdmap_epoch);
1037 f->dump_unsigned("pg_scan_epoch", pg_scan);
1038
1039 f->open_array_section("pg_stat_updates");
1040 for (auto p = pg_stat_updates.begin(); p != pg_stat_updates.end(); ++p) {
1041 f->open_object_section("pg_stat");
1042 f->dump_stream("pgid") << p->first;
1043 p->second.dump(f);
1044 f->close_section();
1045 }
1046 f->close_section();
1047
1048 f->open_array_section("osd_stat_updates");
1049 for (auto p = osd_stat_updates.begin(); p != osd_stat_updates.end(); ++p) {
1050 f->open_object_section("osd_stat");
1051 f->dump_int("osd", p->first);
1052 p->second.dump(f);
1053 f->close_section();
1054 }
1055 f->close_section();
1056 f->open_array_section("pool_statfs_updates");
1057 for (auto p = pool_statfs_updates.begin(); p != pool_statfs_updates.end(); ++p) {
1058 f->open_object_section("pool_statfs");
1059 f->dump_stream("poolid/osd") << p->first;
1060 p->second.dump(f);
1061 f->close_section();
1062 }
1063 f->close_section();
1064
1065 f->open_array_section("osd_stat_removals");
1066 for (auto p = osd_stat_rm.begin(); p != osd_stat_rm.end(); ++p)
1067 f->dump_int("osd", *p);
1068 f->close_section();
1069
1070 f->open_array_section("pg_removals");
1071 for (auto p = pg_remove.begin(); p != pg_remove.end(); ++p)
1072 f->dump_stream("pgid") << *p;
1073 f->close_section();
1074 }
1075
1076 void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
1077 {
1078 o.push_back(new Incremental);
1079 o.push_back(new Incremental);
1080 o.back()->version = 1;
1081 o.back()->stamp = utime_t(123,345);
1082 o.push_back(new Incremental);
1083 o.back()->version = 2;
1084 o.back()->pg_stat_updates[pg_t(1,2)] = pg_stat_t();
1085 o.back()->osd_stat_updates[5] = osd_stat_t();
1086 o.push_back(new Incremental);
1087 o.back()->version = 3;
1088 o.back()->osdmap_epoch = 1;
1089 o.back()->pg_scan = 2;
1090 o.back()->pg_stat_updates[pg_t(4,5)] = pg_stat_t();
1091 o.back()->osd_stat_updates[6] = osd_stat_t();
1092 o.back()->pg_remove.insert(pg_t(1,2));
1093 o.back()->osd_stat_rm.insert(5);
1094 o.back()->pool_statfs_updates[std::make_pair(1234,4)] = store_statfs_t();
1095 }
1096
1097 // --
1098
1099 void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
1100 {
1101 ceph_assert(inc.version == version+1);
1102 version++;
1103
1104 pool_stat_t pg_sum_old = pg_sum;
1105 mempool::pgmap::unordered_map<int32_t, pool_stat_t> pg_pool_sum_old;
1106 pg_pool_sum_old = pg_pool_sum;
1107
1108 for (auto p = inc.pg_stat_updates.begin();
1109 p != inc.pg_stat_updates.end();
1110 ++p) {
1111 const pg_t &update_pg(p->first);
1112 auto update_pool = update_pg.pool();
1113 const pg_stat_t &update_stat(p->second);
1114
1115 auto pg_stat_iter = pg_stat.find(update_pg);
1116 pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool];
1117 if (pg_stat_iter == pg_stat.end()) {
1118 pg_stat.insert(make_pair(update_pg, update_stat));
1119 } else {
1120 stat_pg_sub(update_pg, pg_stat_iter->second);
1121 pool_sum_ref.sub(pg_stat_iter->second);
1122 pg_stat_iter->second = update_stat;
1123 }
1124 stat_pg_add(update_pg, update_stat);
1125 pool_sum_ref.add(update_stat);
1126 }
1127
1128 for (auto p = inc.pool_statfs_updates.begin();
1129 p != inc.pool_statfs_updates.end();
1130 ++p) {
1131 auto update_pool = p->first.first;
1132 auto update_osd = p->first.second;
1133 auto& statfs_inc = p->second;
1134
1135 auto pool_statfs_iter =
1136 pool_statfs.find(std::make_pair(update_pool, update_osd));
1137 if (pg_pool_sum.count(update_pool)) {
1138 pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool];
1139 if (pool_statfs_iter == pool_statfs.end()) {
1140 pool_statfs.emplace(std::make_pair(update_pool, update_osd), statfs_inc);
1141 } else {
1142 pool_sum_ref.sub(pool_statfs_iter->second);
1143 pool_statfs_iter->second = statfs_inc;
1144 }
1145 pool_sum_ref.add(statfs_inc);
1146 }
1147 }
1148
1149 for (auto p = inc.get_osd_stat_updates().begin();
1150 p != inc.get_osd_stat_updates().end();
1151 ++p) {
1152 int osd = p->first;
1153 const osd_stat_t &new_stats(p->second);
1154
1155 auto t = osd_stat.find(osd);
1156 if (t == osd_stat.end()) {
1157 osd_stat.insert(make_pair(osd, new_stats));
1158 } else {
1159 stat_osd_sub(t->first, t->second);
1160 t->second = new_stats;
1161 }
1162 stat_osd_add(osd, new_stats);
1163 }
1164 set<int64_t> deleted_pools;
1165 for (auto p = inc.pg_remove.begin();
1166 p != inc.pg_remove.end();
1167 ++p) {
1168 const pg_t &removed_pg(*p);
1169 auto s = pg_stat.find(removed_pg);
1170 bool pool_erased = false;
1171 if (s != pg_stat.end()) {
1172 pool_erased = stat_pg_sub(removed_pg, s->second);
1173 pg_stat.erase(s);
1174 if (pool_erased) {
1175 deleted_pools.insert(removed_pg.pool());
1176 }
1177 }
1178 }
1179
1180 for (auto p = inc.get_osd_stat_rm().begin();
1181 p != inc.get_osd_stat_rm().end();
1182 ++p) {
1183 auto t = osd_stat.find(*p);
1184 if (t != osd_stat.end()) {
1185 stat_osd_sub(t->first, t->second);
1186 osd_stat.erase(t);
1187 }
1188 for (auto i = pool_statfs.begin(); i != pool_statfs.end(); ++i) {
1189 if (i->first.second == *p) {
1190 pg_pool_sum[i->first.first].sub(i->second);
1191 pool_statfs.erase(i);
1192 }
1193 }
1194 }
1195
1196 // skip calculating delta while sum was not synchronized
1197 if (!stamp.is_zero() && !pg_sum_old.stats.sum.is_zero()) {
1198 utime_t delta_t;
1199 delta_t = inc.stamp;
1200 delta_t -= stamp;
1201 // calculate a delta, and average over the last 2 deltas.
1202 pool_stat_t d = pg_sum;
1203 d.stats.sub(pg_sum_old.stats);
1204 pg_sum_deltas.push_back(make_pair(d, delta_t));
1205 stamp_delta += delta_t;
1206 pg_sum_delta.stats.add(d.stats);
1207 auto smooth_intervals =
1208 cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
1209 while (pg_sum_deltas.size() > smooth_intervals) {
1210 pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats);
1211 stamp_delta -= pg_sum_deltas.front().second;
1212 pg_sum_deltas.pop_front();
1213 }
1214 }
1215 stamp = inc.stamp;
1216
1217 update_pool_deltas(cct, inc.stamp, pg_pool_sum_old);
1218
1219 for (auto p : deleted_pools) {
1220 if (cct)
1221 dout(20) << " deleted pool " << p << dendl;
1222 deleted_pool(p);
1223 }
1224
1225 if (inc.osdmap_epoch)
1226 last_osdmap_epoch = inc.osdmap_epoch;
1227 if (inc.pg_scan)
1228 last_pg_scan = inc.pg_scan;
1229 }
1230
1231 void PGMap::calc_stats()
1232 {
1233 num_pg = 0;
1234 num_pg_active = 0;
1235 num_pg_unknown = 0;
1236 num_osd = 0;
1237 pg_pool_sum.clear();
1238 num_pg_by_pool.clear();
1239 pg_by_osd.clear();
1240 pg_sum = pool_stat_t();
1241 osd_sum = osd_stat_t();
1242 osd_sum_by_class.clear();
1243 num_pg_by_state.clear();
1244 num_pg_by_pool_state.clear();
1245 num_pg_by_osd.clear();
1246
1247 for (auto p = pg_stat.begin();
1248 p != pg_stat.end();
1249 ++p) {
1250 auto pg = p->first;
1251 stat_pg_add(pg, p->second);
1252 pg_pool_sum[pg.pool()].add(p->second);
1253 }
1254 for (auto p = pool_statfs.begin();
1255 p != pool_statfs.end();
1256 ++p) {
1257 auto pool = p->first.first;
1258 pg_pool_sum[pool].add(p->second);
1259 }
1260 for (auto p = osd_stat.begin();
1261 p != osd_stat.end();
1262 ++p)
1263 stat_osd_add(p->first, p->second);
1264 }
1265
1266 void PGMap::stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
1267 bool sameosds)
1268 {
1269 auto pool = pgid.pool();
1270 pg_sum.add(s);
1271
1272 num_pg++;
1273 num_pg_by_state[s.state]++;
1274 num_pg_by_pool_state[pgid.pool()][s.state]++;
1275 num_pg_by_pool[pool]++;
1276
1277 if ((s.state & PG_STATE_CREATING) &&
1278 s.parent_split_bits == 0) {
1279 creating_pgs.insert(pgid);
1280 if (s.acting_primary >= 0) {
1281 creating_pgs_by_osd_epoch[s.acting_primary][s.mapping_epoch].insert(pgid);
1282 }
1283 }
1284
1285 if (s.state & PG_STATE_ACTIVE) {
1286 ++num_pg_active;
1287 }
1288 if (s.state == 0) {
1289 ++num_pg_unknown;
1290 }
1291
1292 if (sameosds)
1293 return;
1294
1295 for (auto p = s.blocked_by.begin();
1296 p != s.blocked_by.end();
1297 ++p) {
1298 ++blocked_by_sum[*p];
1299 }
1300
1301 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
1302 pg_by_osd[*p].insert(pgid);
1303 num_pg_by_osd[*p].acting++;
1304 }
1305 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
1306 auto& t = pg_by_osd[*p];
1307 if (t.find(pgid) == t.end()) {
1308 t.insert(pgid);
1309 num_pg_by_osd[*p].up_not_acting++;
1310 }
1311 }
1312
1313 if (s.up_primary >= 0) {
1314 num_pg_by_osd[s.up_primary].primary++;
1315 }
1316 }
1317
1318 bool PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
1319 bool sameosds)
1320 {
1321 bool pool_erased = false;
1322 pg_sum.sub(s);
1323
1324 num_pg--;
1325 int end = --num_pg_by_state[s.state];
1326 ceph_assert(end >= 0);
1327 if (end == 0)
1328 num_pg_by_state.erase(s.state);
1329 if (--num_pg_by_pool_state[pgid.pool()][s.state] == 0) {
1330 num_pg_by_pool_state[pgid.pool()].erase(s.state);
1331 }
1332 end = --num_pg_by_pool[pgid.pool()];
1333 if (end == 0) {
1334 pool_erased = true;
1335 }
1336
1337 if ((s.state & PG_STATE_CREATING) &&
1338 s.parent_split_bits == 0) {
1339 creating_pgs.erase(pgid);
1340 if (s.acting_primary >= 0) {
1341 map<epoch_t,set<pg_t> >& r = creating_pgs_by_osd_epoch[s.acting_primary];
1342 r[s.mapping_epoch].erase(pgid);
1343 if (r[s.mapping_epoch].empty())
1344 r.erase(s.mapping_epoch);
1345 if (r.empty())
1346 creating_pgs_by_osd_epoch.erase(s.acting_primary);
1347 }
1348 }
1349
1350 if (s.state & PG_STATE_ACTIVE) {
1351 --num_pg_active;
1352 }
1353 if (s.state == 0) {
1354 --num_pg_unknown;
1355 }
1356
1357 if (sameosds)
1358 return pool_erased;
1359
1360 for (auto p = s.blocked_by.begin();
1361 p != s.blocked_by.end();
1362 ++p) {
1363 auto q = blocked_by_sum.find(*p);
1364 ceph_assert(q != blocked_by_sum.end());
1365 --q->second;
1366 if (q->second == 0)
1367 blocked_by_sum.erase(q);
1368 }
1369
1370 set<int32_t> actingset;
1371 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
1372 actingset.insert(*p);
1373 auto& oset = pg_by_osd[*p];
1374 oset.erase(pgid);
1375 if (oset.empty())
1376 pg_by_osd.erase(*p);
1377 auto it = num_pg_by_osd.find(*p);
1378 if (it != num_pg_by_osd.end() && it->second.acting > 0)
1379 it->second.acting--;
1380 }
1381 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
1382 auto& oset = pg_by_osd[*p];
1383 oset.erase(pgid);
1384 if (oset.empty())
1385 pg_by_osd.erase(*p);
1386 if (actingset.count(*p))
1387 continue;
1388 auto it = num_pg_by_osd.find(*p);
1389 if (it != num_pg_by_osd.end() && it->second.up_not_acting > 0)
1390 it->second.up_not_acting--;
1391 }
1392
1393 if (s.up_primary >= 0) {
1394 auto it = num_pg_by_osd.find(s.up_primary);
1395 if (it != num_pg_by_osd.end() && it->second.primary > 0)
1396 it->second.primary--;
1397 }
1398 return pool_erased;
1399 }
1400
1401 void PGMap::calc_purged_snaps()
1402 {
1403 purged_snaps.clear();
1404 set<int64_t> unknown;
1405 for (auto& i : pg_stat) {
1406 if (i.second.state == 0) {
1407 unknown.insert(i.first.pool());
1408 purged_snaps.erase(i.first.pool());
1409 continue;
1410 } else if (unknown.count(i.first.pool())) {
1411 continue;
1412 }
1413 auto j = purged_snaps.find(i.first.pool());
1414 if (j == purged_snaps.end()) {
1415 // base case
1416 purged_snaps[i.first.pool()] = i.second.purged_snaps;
1417 } else {
1418 j->second.intersection_of(i.second.purged_snaps);
1419 }
1420 }
1421 }
1422
1423 void PGMap::calc_osd_sum_by_class(const OSDMap& osdmap)
1424 {
1425 osd_sum_by_class.clear();
1426 for (auto& i : osd_stat) {
1427 const char *class_name = osdmap.crush->get_item_class(i.first);
1428 if (class_name) {
1429 osd_sum_by_class[class_name].add(i.second);
1430 }
1431 }
1432 }
1433
1434 void PGMap::stat_osd_add(int osd, const osd_stat_t &s)
1435 {
1436 num_osd++;
1437 osd_sum.add(s);
1438 if (osd >= (int)osd_last_seq.size()) {
1439 osd_last_seq.resize(osd + 1);
1440 }
1441 osd_last_seq[osd] = s.seq;
1442 }
1443
1444 void PGMap::stat_osd_sub(int osd, const osd_stat_t &s)
1445 {
1446 num_osd--;
1447 osd_sum.sub(s);
1448 ceph_assert(osd < (int)osd_last_seq.size());
1449 osd_last_seq[osd] = 0;
1450 }
1451
/**
 * Refresh the derived digest fields and encode the PGMapDigest part of
 * this map.
 *
 * @param osdmap OSDMap used to recompute the per-rule available space and
 *               the per-class OSD sums
 * @param bl buffer receiving the encoding
 * @param features feature bits passed through to PGMapDigest::encode()
 */
void PGMap::encode_digest(const OSDMap& osdmap,
                          bufferlist& bl, uint64_t features)
{
  // recompute the derived fields before they are encoded
  get_rules_avail(osdmap, &avail_space_by_rule);
  calc_osd_sum_by_class(osdmap);
  calc_purged_snaps();
  PGMapDigest::encode(bl, features);
}
1460
/**
 * Encode the persistent part of the map. The field order here must match
 * PGMap::decode().
 *
 * @param bl buffer receiving the encoding
 * @param features feature bits passed to the osd_stat and pool_statfs
 *                 encoders
 */
void PGMap::encode(bufferlist &bl, uint64_t features) const
{
  ENCODE_START(8, 8, bl);
  encode(version, bl);
  encode(pg_stat, bl);
  encode(osd_stat, bl, features);
  encode(last_osdmap_epoch, bl);
  encode(last_pg_scan, bl);
  encode(stamp, bl);
  encode(pool_statfs, bl, features);
  ENCODE_FINISH(bl);
}
1473
/**
 * Decode the persistent part of the map (same field order as
 * PGMap::encode()) and rebuild the derived statistics.
 *
 * @param bl iterator positioned at the start of an encoded PGMap
 */
void PGMap::decode(bufferlist::const_iterator &bl)
{
  DECODE_START(8, bl);
  decode(version, bl);
  decode(pg_stat, bl);
  decode(osd_stat, bl);
  decode(last_osdmap_epoch, bl);
  decode(last_pg_scan, bl);
  decode(stamp, bl);
  decode(pool_statfs, bl);
  DECODE_FINISH(bl);

  // the aggregates are not part of the encoding; recompute them
  calc_stats();
}
1488
/**
 * Dump the whole map to a Formatter: basic fields, full (non-brief) PG
 * stats, pool stats and OSD stats, in that order.
 *
 * @param f formatter receiving the output
 */
void PGMap::dump(Formatter *f) const
{
  dump_basic(f);
  dump_pg_stats(f, false);
  dump_pool_stats(f);
  dump_osd_stats(f);
}
1496
/**
 * Dump the scalar header fields, the PG and OSD stat sums and the current
 * delta to a Formatter.
 *
 * @param f formatter receiving the output
 */
void PGMap::dump_basic(Formatter *f) const
{
  f->dump_unsigned("version", version);
  f->dump_stream("stamp") << stamp;
  f->dump_unsigned("last_osdmap_epoch", last_osdmap_epoch);
  f->dump_unsigned("last_pg_scan", last_pg_scan);

  // sum over all PGs
  f->open_object_section("pg_stats_sum");
  pg_sum.dump(f);
  f->close_section();

  // sum over all OSDs
  f->open_object_section("osd_stats_sum");
  osd_sum.dump(f);
  f->close_section();

  dump_delta(f);
}
1514
/**
 * Dump pg_sum_delta together with stamp_delta as a "pg_stats_delta"
 * object section.
 *
 * @param f formatter receiving the output
 */
void PGMap::dump_delta(Formatter *f) const
{
  f->open_object_section("pg_stats_delta");
  pg_sum_delta.dump(f);
  f->dump_stream("stamp_delta") << stamp_delta;
  f->close_section();
}
1522
1523 void PGMap::dump_pg_stats(Formatter *f, bool brief) const
1524 {
1525 f->open_array_section("pg_stats");
1526 for (auto i = pg_stat.begin();
1527 i != pg_stat.end();
1528 ++i) {
1529 f->open_object_section("pg_stat");
1530 f->dump_stream("pgid") << i->first;
1531 if (brief)
1532 i->second.dump_brief(f);
1533 else
1534 i->second.dump(f);
1535 f->close_section();
1536 }
1537 f->close_section();
1538 }
1539
1540 void PGMap::dump_pool_stats(Formatter *f) const
1541 {
1542 f->open_array_section("pool_stats");
1543 for (auto p = pg_pool_sum.begin();
1544 p != pg_pool_sum.end();
1545 ++p) {
1546 f->open_object_section("pool_stat");
1547 f->dump_int("poolid", p->first);
1548 auto q = num_pg_by_pool.find(p->first);
1549 if (q != num_pg_by_pool.end())
1550 f->dump_unsigned("num_pg", q->second);
1551 p->second.dump(f);
1552 f->close_section();
1553 }
1554 f->close_section();
1555 }
1556
1557 void PGMap::dump_osd_stats(Formatter *f) const
1558 {
1559 f->open_array_section("osd_stats");
1560 for (auto q = osd_stat.begin();
1561 q != osd_stat.end();
1562 ++q) {
1563 f->open_object_section("osd_stat");
1564 f->dump_int("osd", q->first);
1565 q->second.dump(f);
1566 f->close_section();
1567 }
1568 f->close_section();
1569 }
1570
1571 void PGMap::dump_pg_stats_plain(
1572 ostream& ss,
1573 const mempool::pgmap::unordered_map<pg_t, pg_stat_t>& pg_stats,
1574 bool brief) const
1575 {
1576 TextTable tab;
1577
1578 if (brief){
1579 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1580 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1581 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1582 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1583 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1584 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1585 }
1586 else {
1587 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1588 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1589 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1590 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1591 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1592 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1593 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1594 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
1595 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
1596 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1597 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1598 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1599 tab.define_column("STATE_STAMP", TextTable::LEFT, TextTable::RIGHT);
1600 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
1601 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
1602 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1603 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1604 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1605 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1606 tab.define_column("LAST_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1607 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1608 tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1609 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1610 tab.define_column("SNAPTRIMQ_LEN", TextTable::LEFT, TextTable::RIGHT);
1611 }
1612
1613 for (auto i = pg_stats.begin();
1614 i != pg_stats.end(); ++i) {
1615 const pg_stat_t &st(i->second);
1616 if (brief) {
1617 tab << i->first
1618 << pg_state_string(st.state)
1619 << st.up
1620 << st.up_primary
1621 << st.acting
1622 << st.acting_primary
1623 << TextTable::endrow;
1624 } else {
1625 ostringstream reported;
1626 reported << st.reported_epoch << ":" << st.reported_seq;
1627
1628 tab << i->first
1629 << st.stats.sum.num_objects
1630 << st.stats.sum.num_objects_missing_on_primary
1631 << st.stats.sum.num_objects_degraded
1632 << st.stats.sum.num_objects_misplaced
1633 << st.stats.sum.num_objects_unfound
1634 << st.stats.sum.num_bytes
1635 << st.stats.sum.num_omap_bytes
1636 << st.stats.sum.num_omap_keys
1637 << st.log_size
1638 << st.ondisk_log_size
1639 << pg_state_string(st.state)
1640 << st.last_change
1641 << st.version
1642 << reported.str()
1643 << pg_vector_string(st.up)
1644 << st.up_primary
1645 << pg_vector_string(st.acting)
1646 << st.acting_primary
1647 << st.last_scrub
1648 << st.last_scrub_stamp
1649 << st.last_deep_scrub
1650 << st.last_deep_scrub_stamp
1651 << st.snaptrimq_len
1652 << TextTable::endrow;
1653 }
1654 }
1655
1656 ss << tab;
1657 }
1658
/**
 * Plain-text dump of the whole map: basic fields, full PG stats, pool
 * stats and PG sum (both without a header row), then OSD stats.
 *
 * @param ss output stream
 */
void PGMap::dump(ostream& ss) const
{
  dump_basic(ss);
  dump_pg_stats(ss, false);
  dump_pool_stats(ss, false);
  dump_pg_sum_stats(ss, false);
  dump_osd_stats(ss);
}
1667
1668 void PGMap::dump_basic(ostream& ss) const
1669 {
1670 ss << "version " << version << std::endl;
1671 ss << "stamp " << stamp << std::endl;
1672 ss << "last_osdmap_epoch " << last_osdmap_epoch << std::endl;
1673 ss << "last_pg_scan " << last_pg_scan << std::endl;
1674 }
1675
/**
 * Print the stats of every PG in this map as a plain-text table.
 *
 * @param ss output stream
 * @param brief forwarded to dump_pg_stats_plain()
 */
void PGMap::dump_pg_stats(ostream& ss, bool brief) const
{
  dump_pg_stats_plain(ss, pg_stat, brief);
}
1680
1681 void PGMap::dump_pool_stats(ostream& ss, bool header) const
1682 {
1683 TextTable tab;
1684
1685 if (header) {
1686 tab.define_column("POOLID", TextTable::LEFT, TextTable::LEFT);
1687 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1688 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1689 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1690 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1691 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1692 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1693 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
1694 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
1695 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1696 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1697 } else {
1698 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1699 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1700 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1701 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1702 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1703 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1704 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1705 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1706 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1707 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1708 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1709 }
1710
1711 for (auto p = pg_pool_sum.begin();
1712 p != pg_pool_sum.end();
1713 ++p) {
1714 tab << p->first
1715 << p->second.stats.sum.num_objects
1716 << p->second.stats.sum.num_objects_missing_on_primary
1717 << p->second.stats.sum.num_objects_degraded
1718 << p->second.stats.sum.num_objects_misplaced
1719 << p->second.stats.sum.num_objects_unfound
1720 << p->second.stats.sum.num_bytes
1721 << p->second.stats.sum.num_omap_bytes
1722 << p->second.stats.sum.num_omap_keys
1723 << p->second.log_size
1724 << p->second.ondisk_log_size
1725 << TextTable::endrow;
1726 }
1727
1728 ss << tab;
1729 }
1730
1731 void PGMap::dump_pg_sum_stats(ostream& ss, bool header) const
1732 {
1733 TextTable tab;
1734
1735 if (header) {
1736 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1737 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1738 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1739 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1740 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1741 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1742 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1743 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
1744 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
1745 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1746 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1747 } else {
1748 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1749 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1750 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1751 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1752 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1753 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1754 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1755 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1756 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1757 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1758 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1759 };
1760
1761 tab << "sum"
1762 << pg_sum.stats.sum.num_objects
1763 << pg_sum.stats.sum.num_objects_missing_on_primary
1764 << pg_sum.stats.sum.num_objects_degraded
1765 << pg_sum.stats.sum.num_objects_misplaced
1766 << pg_sum.stats.sum.num_objects_unfound
1767 << pg_sum.stats.sum.num_bytes
1768 << pg_sum.stats.sum.num_omap_bytes
1769 << pg_sum.stats.sum.num_omap_keys
1770 << pg_sum.log_size
1771 << pg_sum.ondisk_log_size
1772 << TextTable::endrow;
1773
1774 ss << tab;
1775 }
1776
1777 void PGMap::dump_osd_stats(ostream& ss) const
1778 {
1779 TextTable tab;
1780
1781 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1782 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1783 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
1784 tab.define_column("USED_RAW", TextTable::LEFT, TextTable::RIGHT);
1785 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
1786 tab.define_column("HB_PEERS", TextTable::LEFT, TextTable::RIGHT);
1787 tab.define_column("PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1788 tab.define_column("PRIMARY_PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1789
1790 for (auto p = osd_stat.begin();
1791 p != osd_stat.end();
1792 ++p) {
1793 tab << p->first
1794 << byte_u_t(p->second.statfs.get_used())
1795 << byte_u_t(p->second.statfs.available)
1796 << byte_u_t(p->second.statfs.get_used_raw())
1797 << byte_u_t(p->second.statfs.total)
1798 << p->second.hb_peers
1799 << get_num_pg_by_osd(p->first)
1800 << get_num_primary_pg_by_osd(p->first)
1801 << TextTable::endrow;
1802 }
1803
1804 tab << "sum"
1805 << byte_u_t(osd_sum.statfs.get_used())
1806 << byte_u_t(osd_sum.statfs.available)
1807 << byte_u_t(osd_sum.statfs.get_used_raw())
1808 << byte_u_t(osd_sum.statfs.total)
1809 << TextTable::endrow;
1810
1811 ss << tab;
1812 }
1813
1814 void PGMap::dump_osd_sum_stats(ostream& ss) const
1815 {
1816 TextTable tab;
1817
1818 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1819 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1820 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
1821 tab.define_column("USED_RAW", TextTable::LEFT, TextTable::RIGHT);
1822 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
1823
1824 tab << "sum"
1825 << byte_u_t(osd_sum.statfs.get_used())
1826 << byte_u_t(osd_sum.statfs.available)
1827 << byte_u_t(osd_sum.statfs.get_used_raw())
1828 << byte_u_t(osd_sum.statfs.total)
1829 << TextTable::endrow;
1830
1831 ss << tab;
1832 }
1833
1834 void PGMap::get_stuck_stats(
1835 int types, const utime_t cutoff,
1836 mempool::pgmap::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const
1837 {
1838 ceph_assert(types != 0);
1839 for (auto i = pg_stat.begin();
1840 i != pg_stat.end();
1841 ++i) {
1842 utime_t val = cutoff; // don't care about >= cutoff so that is infinity
1843
1844 if ((types & STUCK_INACTIVE) && !(i->second.state & PG_STATE_ACTIVE)) {
1845 if (i->second.last_active < val)
1846 val = i->second.last_active;
1847 }
1848
1849 if ((types & STUCK_UNCLEAN) && !(i->second.state & PG_STATE_CLEAN)) {
1850 if (i->second.last_clean < val)
1851 val = i->second.last_clean;
1852 }
1853
1854 if ((types & STUCK_DEGRADED) && (i->second.state & PG_STATE_DEGRADED)) {
1855 if (i->second.last_undegraded < val)
1856 val = i->second.last_undegraded;
1857 }
1858
1859 if ((types & STUCK_UNDERSIZED) && (i->second.state & PG_STATE_UNDERSIZED)) {
1860 if (i->second.last_fullsized < val)
1861 val = i->second.last_fullsized;
1862 }
1863
1864 if ((types & STUCK_STALE) && (i->second.state & PG_STATE_STALE)) {
1865 if (i->second.last_unstale < val)
1866 val = i->second.last_unstale;
1867 }
1868
1869 // val is now the earliest any of the requested stuck states began
1870 if (val < cutoff) {
1871 stuck_pgs[i->first] = i->second;
1872 }
1873 }
1874 }
1875
1876 bool PGMap::get_stuck_counts(const utime_t cutoff, map<string, int>& note) const
1877 {
1878 int inactive = 0;
1879 int unclean = 0;
1880 int degraded = 0;
1881 int undersized = 0;
1882 int stale = 0;
1883
1884 for (auto i = pg_stat.begin();
1885 i != pg_stat.end();
1886 ++i) {
1887 if (! (i->second.state & PG_STATE_ACTIVE)) {
1888 if (i->second.last_active < cutoff)
1889 ++inactive;
1890 }
1891 if (! (i->second.state & PG_STATE_CLEAN)) {
1892 if (i->second.last_clean < cutoff)
1893 ++unclean;
1894 }
1895 if (i->second.state & PG_STATE_DEGRADED) {
1896 if (i->second.last_undegraded < cutoff)
1897 ++degraded;
1898 }
1899 if (i->second.state & PG_STATE_UNDERSIZED) {
1900 if (i->second.last_fullsized < cutoff)
1901 ++undersized;
1902 }
1903 if (i->second.state & PG_STATE_STALE) {
1904 if (i->second.last_unstale < cutoff)
1905 ++stale;
1906 }
1907 }
1908
1909 if (inactive)
1910 note["stuck inactive"] = inactive;
1911
1912 if (unclean)
1913 note["stuck unclean"] = unclean;
1914
1915 if (undersized)
1916 note["stuck undersized"] = undersized;
1917
1918 if (degraded)
1919 note["stuck degraded"] = degraded;
1920
1921 if (stale)
1922 note["stuck stale"] = stale;
1923
1924 return inactive || unclean || undersized || degraded || stale;
1925 }
1926
1927 void PGMap::dump_stuck(Formatter *f, int types, utime_t cutoff) const
1928 {
1929 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
1930 get_stuck_stats(types, cutoff, stuck_pg_stats);
1931 f->open_array_section("stuck_pg_stats");
1932 for (auto i = stuck_pg_stats.begin();
1933 i != stuck_pg_stats.end();
1934 ++i) {
1935 f->open_object_section("pg_stat");
1936 f->dump_stream("pgid") << i->first;
1937 i->second.dump(f);
1938 f->close_section();
1939 }
1940 f->close_section();
1941 }
1942
1943 void PGMap::dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const
1944 {
1945 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
1946 get_stuck_stats(types, cutoff, stuck_pg_stats);
1947 if (!stuck_pg_stats.empty())
1948 dump_pg_stats_plain(ss, stuck_pg_stats, true);
1949 }
1950
1951 int PGMap::dump_stuck_pg_stats(
1952 stringstream &ds,
1953 Formatter *f,
1954 int threshold,
1955 vector<string>& args) const
1956 {
1957 int stuck_types = 0;
1958
1959 for (auto i = args.begin(); i != args.end(); ++i) {
1960 if (*i == "inactive")
1961 stuck_types |= PGMap::STUCK_INACTIVE;
1962 else if (*i == "unclean")
1963 stuck_types |= PGMap::STUCK_UNCLEAN;
1964 else if (*i == "undersized")
1965 stuck_types |= PGMap::STUCK_UNDERSIZED;
1966 else if (*i == "degraded")
1967 stuck_types |= PGMap::STUCK_DEGRADED;
1968 else if (*i == "stale")
1969 stuck_types |= PGMap::STUCK_STALE;
1970 else {
1971 ds << "Unknown type: " << *i << std::endl;
1972 return -EINVAL;
1973 }
1974 }
1975
1976 utime_t now(ceph_clock_now());
1977 utime_t cutoff = now - utime_t(threshold, 0);
1978
1979 if (!f) {
1980 dump_stuck_plain(ds, stuck_types, cutoff);
1981 } else {
1982 dump_stuck(f, stuck_types, cutoff);
1983 f->flush(ds);
1984 }
1985
1986 return 0;
1987 }
1988
1989 void PGMap::dump_osd_perf_stats(Formatter *f) const
1990 {
1991 f->open_array_section("osd_perf_infos");
1992 for (auto i = osd_stat.begin();
1993 i != osd_stat.end();
1994 ++i) {
1995 f->open_object_section("osd");
1996 f->dump_int("id", i->first);
1997 {
1998 f->open_object_section("perf_stats");
1999 i->second.os_perf_stat.dump(f);
2000 f->close_section();
2001 }
2002 f->close_section();
2003 }
2004 f->close_section();
2005 }
2006 void PGMap::print_osd_perf_stats(std::ostream *ss) const
2007 {
2008 TextTable tab;
2009 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2010 tab.define_column("commit_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2011 tab.define_column("apply_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2012 for (auto i = osd_stat.begin();
2013 i != osd_stat.end();
2014 ++i) {
2015 tab << i->first;
2016 tab << i->second.os_perf_stat.os_commit_latency_ns / 1000000ull;
2017 tab << i->second.os_perf_stat.os_apply_latency_ns / 1000000ull;
2018 tab << TextTable::endrow;
2019 }
2020 (*ss) << tab;
2021 }
2022
2023 void PGMap::dump_osd_blocked_by_stats(Formatter *f) const
2024 {
2025 f->open_array_section("osd_blocked_by_infos");
2026 for (auto i = blocked_by_sum.begin();
2027 i != blocked_by_sum.end();
2028 ++i) {
2029 f->open_object_section("osd");
2030 f->dump_int("id", i->first);
2031 f->dump_int("num_blocked", i->second);
2032 f->close_section();
2033 }
2034 f->close_section();
2035 }
2036 void PGMap::print_osd_blocked_by_stats(std::ostream *ss) const
2037 {
2038 TextTable tab;
2039 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2040 tab.define_column("num_blocked", TextTable::LEFT, TextTable::RIGHT);
2041 for (auto i = blocked_by_sum.begin();
2042 i != blocked_by_sum.end();
2043 ++i) {
2044 tab << i->first;
2045 tab << i->second;
2046 tab << TextTable::endrow;
2047 }
2048 (*ss) << tab;
2049 }
2050
2051
2052 /**
2053 * update aggregated delta
2054 *
2055 * @param cct ceph context
2056 * @param ts Timestamp for the stats being delta'ed
2057 * @param old_pool_sum Previous stats sum
2058 * @param last_ts Last timestamp for pool
2059 * @param result_pool_sum Resulting stats
2060 * @param result_pool_delta Resulting pool delta
2061 * @param result_ts_delta Resulting timestamp delta
2062 * @param delta_avg_list List of last N computed deltas, used to average
2063 */
2064 void PGMap::update_delta(
2065 CephContext *cct,
2066 const utime_t ts,
2067 const pool_stat_t& old_pool_sum,
2068 utime_t *last_ts,
2069 const pool_stat_t& current_pool_sum,
2070 pool_stat_t *result_pool_delta,
2071 utime_t *result_ts_delta,
2072 mempool::pgmap::list<pair<pool_stat_t,utime_t> > *delta_avg_list)
2073 {
2074 /* @p ts is the timestamp we want to associate with the data
2075 * in @p old_pool_sum, and on which we will base ourselves to
2076 * calculate the delta, stored in 'delta_t'.
2077 */
2078 utime_t delta_t;
2079 delta_t = ts; // start with the provided timestamp
2080 delta_t -= *last_ts; // take the last timestamp we saw
2081 *last_ts = ts; // @p ts becomes the last timestamp we saw
2082
2083 // adjust delta_t, quick start if there is no update in a long period
2084 delta_t = std::min(delta_t,
2085 utime_t(2 * (cct ? cct->_conf->mon_delta_reset_interval : 10), 0));
2086
2087 // calculate a delta, and average over the last 6 deltas by default.
2088 /* start by taking a copy of our current @p result_pool_sum, and by
2089 * taking out the stats from @p old_pool_sum. This generates a stats
2090 * delta. Stash this stats delta in @p delta_avg_list, along with the
2091 * timestamp delta for these results.
2092 */
2093 pool_stat_t d = current_pool_sum;
2094 d.stats.sub(old_pool_sum.stats);
2095
2096 /* Aggregate current delta, and take out the last seen delta (if any) to
2097 * average it out.
2098 * Skip calculating delta while sum was not synchronized.
2099 */
2100 if(!old_pool_sum.stats.sum.is_zero()) {
2101 delta_avg_list->push_back(make_pair(d,delta_t));
2102 *result_ts_delta += delta_t;
2103 result_pool_delta->stats.add(d.stats);
2104 }
2105 size_t s = cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
2106 while (delta_avg_list->size() > s) {
2107 result_pool_delta->stats.sub(delta_avg_list->front().first.stats);
2108 *result_ts_delta -= delta_avg_list->front().second;
2109 delta_avg_list->pop_front();
2110 }
2111 }
2112
2113 /**
2114 * Update a given pool's deltas
2115 *
2116 * @param cct Ceph Context
2117 * @param ts Timestamp for the stats being delta'ed
2118 * @param pool Pool's id
2119 * @param old_pool_sum Previous stats sum
2120 */
2121 void PGMap::update_one_pool_delta(
2122 CephContext *cct,
2123 const utime_t ts,
2124 const int64_t pool,
2125 const pool_stat_t& old_pool_sum)
2126 {
2127 if (per_pool_sum_deltas.count(pool) == 0) {
2128 ceph_assert(per_pool_sum_deltas_stamps.count(pool) == 0);
2129 ceph_assert(per_pool_sum_delta.count(pool) == 0);
2130 }
2131
2132 auto& sum_delta = per_pool_sum_delta[pool];
2133
2134 update_delta(cct, ts, old_pool_sum, &sum_delta.second, pg_pool_sum[pool],
2135 &sum_delta.first, &per_pool_sum_deltas_stamps[pool],
2136 &per_pool_sum_deltas[pool]);
2137 }
2138
2139 /**
2140 * Update pools' deltas
2141 *
2142 * @param cct CephContext
2143 * @param ts Timestamp for the stats being delta'ed
2144 * @param pg_pool_sum_old Map of pool stats for delta calcs.
2145 */
2146 void PGMap::update_pool_deltas(
2147 CephContext *cct, const utime_t ts,
2148 const mempool::pgmap::unordered_map<int32_t,pool_stat_t>& pg_pool_sum_old)
2149 {
2150 for (auto it = pg_pool_sum_old.begin();
2151 it != pg_pool_sum_old.end(); ++it) {
2152 update_one_pool_delta(cct, ts, it->first, it->second);
2153 }
2154 }
2155
2156 void PGMap::clear_delta()
2157 {
2158 pg_sum_delta = pool_stat_t();
2159 pg_sum_deltas.clear();
2160 stamp_delta = utime_t();
2161 }
2162
2163 void PGMap::generate_test_instances(list<PGMap*>& o)
2164 {
2165 o.push_back(new PGMap);
2166 list<Incremental*> inc;
2167 Incremental::generate_test_instances(inc);
2168 delete inc.front();
2169 inc.pop_front();
2170 while (!inc.empty()) {
2171 PGMap *pmp = new PGMap();
2172 *pmp = *o.back();
2173 o.push_back(pmp);
2174 o.back()->apply_incremental(NULL, *inc.front());
2175 delete inc.front();
2176 inc.pop_front();
2177 }
2178 }
2179
2180 void PGMap::get_filtered_pg_stats(uint64_t state, int64_t poolid, int64_t osdid,
2181 bool primary, set<pg_t>& pgs) const
2182 {
2183 for (auto i = pg_stat.begin();
2184 i != pg_stat.end();
2185 ++i) {
2186 if ((poolid >= 0) && (poolid != i->first.pool()))
2187 continue;
2188 if ((osdid >= 0) && !(i->second.is_acting_osd(osdid,primary)))
2189 continue;
2190 if (state == (uint64_t)-1 || // "all"
2191 (i->second.state & state) || // matches a state bit
2192 (state == 0 && i->second.state == 0)) { // matches "unknown" (== 0)
2193 pgs.insert(i->first);
2194 }
2195 }
2196 }
2197
2198 void PGMap::dump_filtered_pg_stats(Formatter *f, set<pg_t>& pgs) const
2199 {
2200 f->open_array_section("pg_stats");
2201 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
2202 const pg_stat_t& st = pg_stat.at(*i);
2203 f->open_object_section("pg_stat");
2204 f->dump_stream("pgid") << *i;
2205 st.dump(f);
2206 f->close_section();
2207 }
2208 f->close_section();
2209 }
2210
2211 void PGMap::dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const
2212 {
2213 TextTable tab;
2214 utime_t now = ceph_clock_now();
2215
2216 tab.define_column("PG", TextTable::LEFT, TextTable::LEFT);
2217 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
2218 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
2219 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
2220 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
2221 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
2222 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
2223 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
2224 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
2225 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
2226 tab.define_column("SINCE", TextTable::LEFT, TextTable::RIGHT);
2227 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
2228 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
2229 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
2230 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
2231 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
2232 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
2233
2234 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
2235 const pg_stat_t& st = pg_stat.at(*i);
2236
2237 ostringstream reported;
2238 reported << st.reported_epoch << ":" << st.reported_seq;
2239
2240 ostringstream upstr, actingstr;
2241 upstr << st.up << 'p' << st.up_primary;
2242 actingstr << st.acting << 'p' << st.acting_primary;
2243 tab << *i
2244 << st.stats.sum.num_objects
2245 << st.stats.sum.num_objects_degraded
2246 << st.stats.sum.num_objects_misplaced
2247 << st.stats.sum.num_objects_unfound
2248 << st.stats.sum.num_bytes
2249 << st.stats.sum.num_omap_bytes
2250 << st.stats.sum.num_omap_keys
2251 << st.log_size
2252 << pg_state_string(st.state)
2253 << utimespan_str(now - st.last_change)
2254 << st.version
2255 << reported.str()
2256 << upstr.str()
2257 << actingstr.str()
2258 << st.last_scrub_stamp
2259 << st.last_deep_scrub_stamp
2260 << TextTable::endrow;
2261 }
2262
2263 ss << tab;
2264 }
2265
2266 void PGMap::dump_pool_stats_and_io_rate(int64_t poolid, const OSDMap &osd_map,
2267 Formatter *f,
2268 stringstream *rs) const {
2269 string pool_name = osd_map.get_pool_name(poolid);
2270 if (f) {
2271 f->open_object_section("pool");
2272 f->dump_string("pool_name", pool_name.c_str());
2273 f->dump_int("pool_id", poolid);
2274 f->open_object_section("recovery");
2275 }
2276 list<string> sl;
2277 stringstream tss;
2278 pool_recovery_summary(f, &sl, poolid);
2279 if (!f && !sl.empty()) {
2280 for (auto &p : sl)
2281 tss << " " << p << "\n";
2282 }
2283 if (f) {
2284 f->close_section(); // object section recovery
2285 f->open_object_section("recovery_rate");
2286 }
2287 ostringstream rss;
2288 pool_recovery_rate_summary(f, &rss, poolid);
2289 if (!f && !rss.str().empty())
2290 tss << " recovery io " << rss.str() << "\n";
2291 if (f) {
2292 f->close_section(); // object section recovery_rate
2293 f->open_object_section("client_io_rate");
2294 }
2295 rss.clear();
2296 rss.str("");
2297 pool_client_io_rate_summary(f, &rss, poolid);
2298 if (!f && !rss.str().empty())
2299 tss << " client io " << rss.str() << "\n";
2300 // dump cache tier IO rate for cache pool
2301 const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
2302 if (pool->is_tier()) {
2303 if (f) {
2304 f->close_section(); // object section client_io_rate
2305 f->open_object_section("cache_io_rate");
2306 }
2307 rss.clear();
2308 rss.str("");
2309 pool_cache_io_rate_summary(f, &rss, poolid);
2310 if (!f && !rss.str().empty())
2311 tss << " cache tier io " << rss.str() << "\n";
2312 }
2313 if (f) {
2314 f->close_section(); // object section cache_io_rate
2315 f->close_section(); // object section pool
2316 } else {
2317 *rs << "pool " << pool_name << " id " << poolid << "\n";
2318 if (!tss.str().empty())
2319 *rs << tss.str() << "\n";
2320 else
2321 *rs << " nothing is going on\n\n";
2322 }
2323 }
2324
2325 void PGMap::get_health_checks(
2326 CephContext *cct,
2327 const OSDMap& osdmap,
2328 health_check_map_t *checks) const
2329 {
2330 utime_t now = ceph_clock_now();
2331 const auto max = cct->_conf.get_val<uint64_t>("mon_health_max_detail");
2332 const auto& pools = osdmap.get_pools();
2333
2334 typedef enum pg_consequence_t {
2335 UNAVAILABLE = 1, // Client IO to the pool may block
2336 DEGRADED = 2, // Fewer than the requested number of replicas are present
2337 BACKFILL_FULL = 3, // Backfill is blocked for space considerations
2338 // This may or may not be a deadlock condition.
2339 DAMAGED = 4, // The data may be missing or inconsistent on disk and
2340 // requires repair
2341 RECOVERY_FULL = 5 // Recovery is blocked because OSDs are full
2342 } pg_consequence_t;
2343
2344 // For a given PG state, how should it be reported at the pool level?
2345 class PgStateResponse {
2346 public:
2347 pg_consequence_t consequence;
2348 typedef std::function< utime_t(const pg_stat_t&) > stuck_cb;
2349 stuck_cb stuck_since;
2350 bool invert;
2351
2352 PgStateResponse(const pg_consequence_t& c, stuck_cb&& s)
2353 : consequence(c), stuck_since(std::move(s)), invert(false)
2354 {
2355 }
2356
2357 PgStateResponse(const pg_consequence_t& c, stuck_cb&& s, bool i)
2358 : consequence(c), stuck_since(std::move(s)), invert(i)
2359 {
2360 }
2361 };
2362
2363 // Record the PG state counts that contributed to a reported pool state
2364 class PgCauses {
2365 public:
2366 // Map of PG_STATE_* to number of pgs in that state.
2367 std::map<unsigned, unsigned> states;
2368
2369 // List of all PG IDs that had a state contributing
2370 // to this health condition.
2371 std::set<pg_t> pgs;
2372
2373 std::map<pg_t, std::string> pg_messages;
2374 };
2375
2376 // Map of PG state to how to respond to it
2377 std::map<unsigned, PgStateResponse> state_to_response = {
2378 // Immediate reports
2379 { PG_STATE_INCONSISTENT, {DAMAGED, {}} },
2380 { PG_STATE_INCOMPLETE, {UNAVAILABLE, {}} },
2381 { PG_STATE_SNAPTRIM_ERROR, {DAMAGED, {}} },
2382 { PG_STATE_RECOVERY_UNFOUND, {DAMAGED, {}} },
2383 { PG_STATE_BACKFILL_UNFOUND, {DAMAGED, {}} },
2384 { PG_STATE_BACKFILL_TOOFULL, {BACKFILL_FULL, {}} },
2385 { PG_STATE_RECOVERY_TOOFULL, {RECOVERY_FULL, {}} },
2386 { PG_STATE_DEGRADED, {DEGRADED, {}} },
2387 { PG_STATE_DOWN, {UNAVAILABLE, {}} },
2388 // Delayed (wait until stuck) reports
2389 { PG_STATE_PEERING, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_peered;} } },
2390 { PG_STATE_UNDERSIZED, {DEGRADED, [](const pg_stat_t &p){return p.last_fullsized;} } },
2391 { PG_STATE_STALE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_unstale;} } },
2392 // Delayed and inverted reports
2393 { PG_STATE_ACTIVE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_active;}, true} }
2394 };
2395
2396 // Specialized state printer that takes account of inversion of
2397 // ACTIVE, CLEAN checks.
2398 auto state_name = [](const uint64_t &state) {
2399 // Special cases for the states that are inverted checks
2400 if (state == PG_STATE_CLEAN) {
2401 return std::string("unclean");
2402 } else if (state == PG_STATE_ACTIVE) {
2403 return std::string("inactive");
2404 } else {
2405 return pg_state_string(state);
2406 }
2407 };
2408
2409 // Map of what is wrong to information about why, implicitly also stores
2410 // the list of what is wrong.
2411 std::map<pg_consequence_t, PgCauses> detected;
2412
2413 // Optimisation: trim down the number of checks to apply based on
2414 // the summary counters
2415 std::map<unsigned, PgStateResponse> possible_responses;
2416 for (const auto &i : num_pg_by_state) {
2417 for (const auto &j : state_to_response) {
2418 if (!j.second.invert) {
2419 // Check for normal tests by seeing if any pgs have the flag
2420 if (i.first & j.first) {
2421 possible_responses.insert(j);
2422 }
2423 }
2424 }
2425 }
2426
2427 for (const auto &j : state_to_response) {
2428 if (j.second.invert) {
2429 // Check for inverted tests by seeing if not-all pgs have the flag
2430 const auto &found = num_pg_by_state.find(j.first);
2431 if (found == num_pg_by_state.end() || found->second != num_pg) {
2432 possible_responses.insert(j);
2433 }
2434 }
2435 }
2436
2437 utime_t cutoff = now - utime_t(cct->_conf.get_val<int64_t>("mon_pg_stuck_threshold"), 0);
2438 // Loop over all PGs, if there are any possibly-unhealthy states in there
2439 if (!possible_responses.empty()) {
2440 for (const auto& i : pg_stat) {
2441 const auto &pg_id = i.first;
2442 const auto &pg_info = i.second;
2443
2444 for (const auto &j : state_to_response) {
2445 const auto &pg_response_state = j.first;
2446 const auto &pg_response = j.second;
2447
2448 // Apply the state test
2449 if (!(bool(pg_info.state & pg_response_state) != pg_response.invert)) {
2450 continue;
2451 }
2452
2453 // Apply stuckness test if needed
2454 if (pg_response.stuck_since) {
2455 // Delayed response, check for stuckness
2456 utime_t last_whatever = pg_response.stuck_since(pg_info);
2457 if (last_whatever >= cutoff) {
2458 // Not stuck enough, ignore.
2459 continue;
2460 } else {
2461
2462 }
2463 }
2464
2465 auto &causes = detected[pg_response.consequence];
2466 causes.states[pg_response_state]++;
2467 causes.pgs.insert(pg_id);
2468
2469 // Don't bother composing detail string if we have already recorded
2470 // too many
2471 if (causes.pg_messages.size() > max) {
2472 continue;
2473 }
2474
2475 std::ostringstream ss;
2476 if (pg_response.stuck_since) {
2477 utime_t since = pg_response.stuck_since(pg_info);
2478 ss << "pg " << pg_id << " is stuck " << state_name(pg_response_state);
2479 if (since == utime_t()) {
2480 ss << " since forever";
2481 } else {
2482 utime_t dur = now - since;
2483 ss << " for " << dur;
2484 }
2485 ss << ", current state " << pg_state_string(pg_info.state)
2486 << ", last acting " << pg_info.acting;
2487 } else {
2488 ss << "pg " << pg_id << " is "
2489 << pg_state_string(pg_info.state);
2490 ss << ", acting " << pg_info.acting;
2491 if (pg_info.stats.sum.num_objects_unfound) {
2492 ss << ", " << pg_info.stats.sum.num_objects_unfound
2493 << " unfound";
2494 }
2495 }
2496
2497 if (pg_info.state & PG_STATE_INCOMPLETE) {
2498 const pg_pool_t *pi = osdmap.get_pg_pool(pg_id.pool());
2499 if (pi && pi->min_size > 1) {
2500 ss << " (reducing pool "
2501 << osdmap.get_pool_name(pg_id.pool())
2502 << " min_size from " << (int)pi->min_size
2503 << " may help; search ceph.com/docs for 'incomplete')";
2504 }
2505 }
2506
2507 causes.pg_messages[pg_id] = ss.str();
2508 }
2509 }
2510 } else {
2511 dout(10) << __func__ << " skipping loop over PGs: counters look OK" << dendl;
2512 }
2513
2514 for (const auto &i : detected) {
2515 std::string health_code;
2516 health_status_t sev;
2517 std::string summary;
2518 switch(i.first) {
2519 case UNAVAILABLE:
2520 health_code = "PG_AVAILABILITY";
2521 sev = HEALTH_WARN;
2522 summary = "Reduced data availability: ";
2523 break;
2524 case DEGRADED:
2525 health_code = "PG_DEGRADED";
2526 summary = "Degraded data redundancy: ";
2527 sev = HEALTH_WARN;
2528 break;
2529 case BACKFILL_FULL:
2530 health_code = "PG_BACKFILL_FULL";
2531 summary = "Low space hindering backfill (add storage if this doesn't resolve itself): ";
2532 sev = HEALTH_WARN;
2533 break;
2534 case DAMAGED:
2535 health_code = "PG_DAMAGED";
2536 summary = "Possible data damage: ";
2537 sev = HEALTH_ERR;
2538 break;
2539 case RECOVERY_FULL:
2540 health_code = "PG_RECOVERY_FULL";
2541 summary = "Full OSDs blocking recovery: ";
2542 sev = HEALTH_ERR;
2543 break;
2544 default:
2545 ceph_abort();
2546 }
2547
2548 if (i.first == DEGRADED) {
2549 if (pg_sum.stats.sum.num_objects_degraded &&
2550 pg_sum.stats.sum.num_object_copies > 0) {
2551 double pc = (double)pg_sum.stats.sum.num_objects_degraded /
2552 (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
2553 char b[20];
2554 snprintf(b, sizeof(b), "%.3lf", pc);
2555 ostringstream ss;
2556 ss << pg_sum.stats.sum.num_objects_degraded
2557 << "/" << pg_sum.stats.sum.num_object_copies << " objects degraded ("
2558 << b << "%)";
2559
2560 // Throw in a comma for the benefit of the following PG counts
2561 summary += ss.str() + ", ";
2562 }
2563 }
2564
2565 // Compose summary message saying how many PGs in what states led
2566 // to this health check failing
2567 std::vector<std::string> pg_msgs;
2568 for (const auto &j : i.second.states) {
2569 std::ostringstream msg;
2570 msg << j.second << (j.second > 1 ? " pgs " : " pg ") << state_name(j.first);
2571 pg_msgs.push_back(msg.str());
2572 }
2573 summary += joinify(pg_msgs.begin(), pg_msgs.end(), std::string(", "));
2574
2575
2576
2577 health_check_t *check = &checks->add(
2578 health_code,
2579 sev,
2580 summary);
2581
2582 // Compose list of PGs contributing to this health check failing
2583 for (const auto &j : i.second.pg_messages) {
2584 check->detail.push_back(j.second);
2585 }
2586 }
2587
2588 // OSD_SCRUB_ERRORS
2589 if (pg_sum.stats.sum.num_scrub_errors) {
2590 ostringstream ss;
2591 ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors";
2592 checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str());
2593 }
2594
2595 // LARGE_OMAP_OBJECTS
2596 if (pg_sum.stats.sum.num_large_omap_objects) {
2597 list<string> detail;
2598 for (auto &pool : pools) {
2599 const string& pool_name = osdmap.get_pool_name(pool.first);
2600 auto it2 = pg_pool_sum.find(pool.first);
2601 if (it2 == pg_pool_sum.end()) {
2602 continue;
2603 }
2604 const pool_stat_t *pstat = &it2->second;
2605 if (pstat == nullptr) {
2606 continue;
2607 }
2608 const object_stat_sum_t& sum = pstat->stats.sum;
2609 if (sum.num_large_omap_objects) {
2610 stringstream ss;
2611 ss << sum.num_large_omap_objects << " large objects found in pool "
2612 << "'" << pool_name << "'";
2613 detail.push_back(ss.str());
2614 }
2615 }
2616 if (!detail.empty()) {
2617 ostringstream ss;
2618 ss << pg_sum.stats.sum.num_large_omap_objects << " large omap objects";
2619 auto& d = checks->add("LARGE_OMAP_OBJECTS", HEALTH_WARN, ss.str());
2620 stringstream tip;
2621 tip << "Search the cluster log for 'Large omap object found' for more "
2622 << "details.";
2623 detail.push_back(tip.str());
2624 d.detail.swap(detail);
2625 }
2626 }
2627
2628 // CACHE_POOL_NEAR_FULL
2629 {
2630 list<string> detail;
2631 unsigned num_pools = 0;
2632 for (auto& p : pools) {
2633 if ((!p.second.target_max_objects && !p.second.target_max_bytes) ||
2634 !pg_pool_sum.count(p.first)) {
2635 continue;
2636 }
2637 bool nearfull = false;
2638 const string& name = osdmap.get_pool_name(p.first);
2639 const pool_stat_t& st = get_pg_pool_sum_stat(p.first);
2640 uint64_t ratio = p.second.cache_target_full_ratio_micro +
2641 ((1000000 - p.second.cache_target_full_ratio_micro) *
2642 cct->_conf->mon_cache_target_full_warn_ratio);
2643 if (p.second.target_max_objects &&
2644 (uint64_t)(st.stats.sum.num_objects -
2645 st.stats.sum.num_objects_hit_set_archive) >
2646 p.second.target_max_objects * (ratio / 1000000.0)) {
2647 ostringstream ss;
2648 ss << "cache pool '" << name << "' with "
2649 << si_u_t(st.stats.sum.num_objects)
2650 << " objects at/near target max "
2651 << si_u_t(p.second.target_max_objects) << " objects";
2652 detail.push_back(ss.str());
2653 nearfull = true;
2654 }
2655 if (p.second.target_max_bytes &&
2656 (uint64_t)(st.stats.sum.num_bytes -
2657 st.stats.sum.num_bytes_hit_set_archive) >
2658 p.second.target_max_bytes * (ratio / 1000000.0)) {
2659 ostringstream ss;
2660 ss << "cache pool '" << name
2661 << "' with " << byte_u_t(st.stats.sum.num_bytes)
2662 << " at/near target max "
2663 << byte_u_t(p.second.target_max_bytes);
2664 detail.push_back(ss.str());
2665 nearfull = true;
2666 }
2667 if (nearfull) {
2668 ++num_pools;
2669 }
2670 }
2671 if (!detail.empty()) {
2672 ostringstream ss;
2673 ss << num_pools << " cache pools at or near target size";
2674 auto& d = checks->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN, ss.str());
2675 d.detail.swap(detail);
2676 }
2677 }
2678
2679 // TOO_FEW_PGS
2680 unsigned num_in = osdmap.get_num_in_osds();
2681 auto sum_pg_up = std::max(static_cast<size_t>(pg_sum.up), pg_stat.size());
2682 const auto min_pg_per_osd =
2683 cct->_conf.get_val<uint64_t>("mon_pg_warn_min_per_osd");
2684 if (num_in && min_pg_per_osd > 0 && osdmap.get_pools().size() > 0) {
2685 auto per = sum_pg_up / num_in;
2686 if (per < min_pg_per_osd && per) {
2687 ostringstream ss;
2688 ss << "too few PGs per OSD (" << per
2689 << " < min " << min_pg_per_osd << ")";
2690 checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str());
2691 }
2692 }
2693
2694 // TOO_MANY_PGS
2695 auto max_pg_per_osd = cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd");
2696 if (num_in && max_pg_per_osd > 0) {
2697 auto per = sum_pg_up / num_in;
2698 if (per > max_pg_per_osd) {
2699 ostringstream ss;
2700 ss << "too many PGs per OSD (" << per
2701 << " > max " << max_pg_per_osd << ")";
2702 checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str());
2703 }
2704 }
2705
2706 // TOO_FEW_OSDS
2707 auto warn_too_few_osds = cct->_conf.get_val<bool>("mon_warn_on_too_few_osds");
2708 auto osd_pool_default_size = cct->_conf.get_val<uint64_t>("osd_pool_default_size");
2709 if (warn_too_few_osds && osdmap.get_num_osds() < osd_pool_default_size) {
2710 ostringstream ss;
2711 ss << "OSD count " << osdmap.get_num_osds()
2712 << " < osd_pool_default_size " << osd_pool_default_size;
2713 checks->add("TOO_FEW_OSDS", HEALTH_WARN, ss.str());
2714 }
2715
2716 // SLOW_PING_TIME
2717 // Convert milliseconds to microseconds
2718 auto warn_slow_ping_time = cct->_conf.get_val<double>("mon_warn_on_slow_ping_time") * 1000;
2719 auto grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
2720 if (warn_slow_ping_time == 0) {
2721 double ratio = cct->_conf.get_val<double>("mon_warn_on_slow_ping_ratio");
2722 warn_slow_ping_time = grace;
2723 warn_slow_ping_time *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
2724 }
2725 if (warn_slow_ping_time > 0) {
2726
2727 struct mon_ping_item_t {
2728 uint32_t pingtime;
2729 int from;
2730 int to;
2731 bool improving;
2732
2733 bool operator<(const mon_ping_item_t& rhs) const {
2734 if (pingtime < rhs.pingtime)
2735 return true;
2736 if (pingtime > rhs.pingtime)
2737 return false;
2738 if (from < rhs.from)
2739 return true;
2740 if (from > rhs.from)
2741 return false;
2742 return to < rhs.to;
2743 }
2744 };
2745
2746 list<string> detail_back;
2747 list<string> detail_front;
2748 set<mon_ping_item_t> back_sorted, front_sorted;
2749 for (auto i : osd_stat) {
2750 for (auto j : i.second.hb_pingtime) {
2751
2752 // Maybe source info is old
2753 if (now.sec() - j.second.last_update > grace * 60)
2754 continue;
2755
2756 mon_ping_item_t back;
2757 back.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
2758 back.pingtime = std::max(back.pingtime, j.second.back_pingtime[2]);
2759 back.from = i.first;
2760 back.to = j.first;
2761 if (back.pingtime > warn_slow_ping_time) {
2762 back.improving = (j.second.back_pingtime[0] < j.second.back_pingtime[1]
2763 && j.second.back_pingtime[1] < j.second.back_pingtime[2]);
2764 back_sorted.emplace(back);
2765 }
2766
2767 mon_ping_item_t front;
2768 front.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
2769 front.pingtime = std::max(front.pingtime, j.second.front_pingtime[2]);
2770 front.from = i.first;
2771 front.to = j.first;
2772 if (front.pingtime > warn_slow_ping_time) {
2773 front.improving = (j.second.front_pingtime[0] < j.second.front_pingtime[1]
2774 && j.second.front_pingtime[1] < j.second.back_pingtime[2]);
2775 front_sorted.emplace(front);
2776 }
2777 }
2778 }
2779 int max_detail = 10;
2780 for (auto &sback : boost::adaptors::reverse(back_sorted)) {
2781 ostringstream ss;
2782 if (max_detail == 0) {
2783 ss << "Truncated long network list. Use ceph daemon mgr.# dump_osd_network for more information";
2784 detail_back.push_back(ss.str());
2785 break;
2786 }
2787 max_detail--;
2788 ss << "Slow heartbeat ping on back interface from osd." << sback.from
2789 << (osdmap.is_down(sback.from) ? " (down)" : "")
2790 << " to osd." << sback.to
2791 << (osdmap.is_down(sback.to) ? " (down)" : "")
2792 << " " << fixed_u_to_string(sback.pingtime, 3) << " msec"
2793 << (sback.improving ? " possibly improving" : "");
2794 detail_back.push_back(ss.str());
2795 }
2796 max_detail = 10;
2797 for (auto &sfront : boost::adaptors::reverse(front_sorted)) {
2798 ostringstream ss;
2799 if (max_detail == 0) {
2800 ss << "Truncated long network list. Use ceph daemon mgr.# dump_osd_network for more information";
2801 detail_front.push_back(ss.str());
2802 break;
2803 }
2804 max_detail--;
2805 ss << "Slow heartbeat ping on front interface from osd." << sfront.from
2806 << (osdmap.is_down(sfront.from) ? " (down)" : "")
2807 << " to osd." << sfront.to
2808 << (osdmap.is_down(sfront.to) ? " (down)" : "")
2809 << " " << fixed_u_to_string(sfront.pingtime, 3) << " msec"
2810 << (sfront.improving ? " possibly improving" : "");
2811 detail_front.push_back(ss.str());
2812 }
2813 if (detail_back.size() != 0) {
2814 ostringstream ss;
2815 ss << "Long heartbeat ping times on back interface seen, longest is "
2816 << fixed_u_to_string(back_sorted.rbegin()->pingtime, 3) << " msec";
2817 auto& d = checks->add("OSD_SLOW_PING_TIME_BACK", HEALTH_WARN, ss.str());
2818 d.detail.swap(detail_back);
2819 }
2820 if (detail_front.size() != 0) {
2821 ostringstream ss;
2822 ss << "Long heartbeat ping times on front interface seen, longest is "
2823 << fixed_u_to_string(front_sorted.rbegin()->pingtime, 3) << " msec";
2824 auto& d = checks->add("OSD_SLOW_PING_TIME_FRONT", HEALTH_WARN, ss.str());
2825 d.detail.swap(detail_front);
2826 }
2827 }
2828
2829 // SMALLER_PGP_NUM
2830 // MANY_OBJECTS_PER_PG
2831 if (!pg_stat.empty()) {
2832 list<string> pgp_detail, many_detail;
2833 const auto mon_pg_warn_min_objects =
2834 cct->_conf.get_val<int64_t>("mon_pg_warn_min_objects");
2835 const auto mon_pg_warn_min_pool_objects =
2836 cct->_conf.get_val<int64_t>("mon_pg_warn_min_pool_objects");
2837 const auto mon_pg_warn_max_object_skew =
2838 cct->_conf.get_val<double>("mon_pg_warn_max_object_skew");
2839 for (auto p = pg_pool_sum.begin();
2840 p != pg_pool_sum.end();
2841 ++p) {
2842 const pg_pool_t *pi = osdmap.get_pg_pool(p->first);
2843 if (!pi)
2844 continue; // in case osdmap changes haven't propagated to PGMap yet
2845 const string& name = osdmap.get_pool_name(p->first);
2846 // NOTE: we use pg_num_target and pgp_num_target for the purposes of
2847 // the warnings. If the cluster is failing to converge on the target
2848 // values that is a separate issue!
2849 if (pi->get_pg_num_target() > pi->get_pgp_num_target() &&
2850 !(name.find(".DELETED") != string::npos &&
2851 cct->_conf->mon_fake_pool_delete)) {
2852 ostringstream ss;
2853 ss << "pool " << name << " pg_num "
2854 << pi->get_pg_num_target()
2855 << " > pgp_num " << pi->get_pgp_num_target();
2856 pgp_detail.push_back(ss.str());
2857 }
2858 int average_objects_per_pg = pg_sum.stats.sum.num_objects / pg_stat.size();
2859 if (average_objects_per_pg > 0 &&
2860 pg_sum.stats.sum.num_objects >= mon_pg_warn_min_objects &&
2861 p->second.stats.sum.num_objects >= mon_pg_warn_min_pool_objects) {
2862 int objects_per_pg = p->second.stats.sum.num_objects /
2863 pi->get_pg_num_target();
2864 float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
2865 if (mon_pg_warn_max_object_skew > 0 &&
2866 ratio > mon_pg_warn_max_object_skew) {
2867 ostringstream ss;
2868 ss << "pool " << name << " objects per pg ("
2869 << objects_per_pg << ") is more than " << ratio
2870 << " times cluster average ("
2871 << average_objects_per_pg << ")";
2872 many_detail.push_back(ss.str());
2873 }
2874 }
2875 }
2876 if (!pgp_detail.empty()) {
2877 ostringstream ss;
2878 ss << pgp_detail.size() << " pools have pg_num > pgp_num";
2879 auto& d = checks->add("SMALLER_PGP_NUM", HEALTH_WARN, ss.str());
2880 d.detail.swap(pgp_detail);
2881 }
2882 if (!many_detail.empty()) {
2883 ostringstream ss;
2884 ss << many_detail.size() << " pools have many more objects per pg than"
2885 << " average";
2886 auto& d = checks->add("MANY_OBJECTS_PER_PG", HEALTH_WARN, ss.str());
2887 d.detail.swap(many_detail);
2888 }
2889 }
2890
2891 // POOL_FULL
2892 // POOL_NEAR_FULL
2893 {
2894 float warn_threshold = (float)g_conf().get_val<int64_t>("mon_pool_quota_warn_threshold")/100;
2895 float crit_threshold = (float)g_conf().get_val<int64_t>("mon_pool_quota_crit_threshold")/100;
2896 list<string> full_detail, nearfull_detail;
2897 unsigned full_pools = 0, nearfull_pools = 0;
2898 for (auto it : pools) {
2899 auto it2 = pg_pool_sum.find(it.first);
2900 if (it2 == pg_pool_sum.end()) {
2901 continue;
2902 }
2903 const pool_stat_t *pstat = &it2->second;
2904 const object_stat_sum_t& sum = pstat->stats.sum;
2905 const string& pool_name = osdmap.get_pool_name(it.first);
2906 const pg_pool_t &pool = it.second;
2907 bool full = false, nearfull = false;
2908 if (pool.quota_max_objects > 0) {
2909 stringstream ss;
2910 if ((uint64_t)sum.num_objects >= pool.quota_max_objects) {
2911 } else if (crit_threshold > 0 &&
2912 sum.num_objects >= pool.quota_max_objects*crit_threshold) {
2913 ss << "pool '" << pool_name
2914 << "' has " << sum.num_objects << " objects"
2915 << " (max " << pool.quota_max_objects << ")";
2916 full_detail.push_back(ss.str());
2917 full = true;
2918 } else if (warn_threshold > 0 &&
2919 sum.num_objects >= pool.quota_max_objects*warn_threshold) {
2920 ss << "pool '" << pool_name
2921 << "' has " << sum.num_objects << " objects"
2922 << " (max " << pool.quota_max_objects << ")";
2923 nearfull_detail.push_back(ss.str());
2924 nearfull = true;
2925 }
2926 }
2927 if (pool.quota_max_bytes > 0) {
2928 stringstream ss;
2929 if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
2930 } else if (crit_threshold > 0 &&
2931 sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
2932 ss << "pool '" << pool_name
2933 << "' has " << byte_u_t(sum.num_bytes)
2934 << " (max " << byte_u_t(pool.quota_max_bytes) << ")";
2935 full_detail.push_back(ss.str());
2936 full = true;
2937 } else if (warn_threshold > 0 &&
2938 sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
2939 ss << "pool '" << pool_name
2940 << "' has " << byte_u_t(sum.num_bytes)
2941 << " (max " << byte_u_t(pool.quota_max_bytes) << ")";
2942 nearfull_detail.push_back(ss.str());
2943 nearfull = true;
2944 }
2945 }
2946 if (full) {
2947 ++full_pools;
2948 }
2949 if (nearfull) {
2950 ++nearfull_pools;
2951 }
2952 }
2953 if (full_pools) {
2954 ostringstream ss;
2955 ss << full_pools << " pools full";
2956 auto& d = checks->add("POOL_FULL", HEALTH_ERR, ss.str());
2957 d.detail.swap(full_detail);
2958 }
2959 if (nearfull_pools) {
2960 ostringstream ss;
2961 ss << nearfull_pools << " pools nearfull";
2962 auto& d = checks->add("POOL_NEAR_FULL", HEALTH_WARN, ss.str());
2963 d.detail.swap(nearfull_detail);
2964 }
2965 }
2966
2967 // OBJECT_MISPLACED
2968 if (pg_sum.stats.sum.num_objects_misplaced &&
2969 pg_sum.stats.sum.num_object_copies > 0 &&
2970 cct->_conf->mon_warn_on_misplaced) {
2971 double pc = (double)pg_sum.stats.sum.num_objects_misplaced /
2972 (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
2973 char b[20];
2974 snprintf(b, sizeof(b), "%.3lf", pc);
2975 ostringstream ss;
2976 ss << pg_sum.stats.sum.num_objects_misplaced
2977 << "/" << pg_sum.stats.sum.num_object_copies << " objects misplaced ("
2978 << b << "%)";
2979 checks->add("OBJECT_MISPLACED", HEALTH_WARN, ss.str());
2980 }
2981
2982 // OBJECT_UNFOUND
2983 if (pg_sum.stats.sum.num_objects_unfound &&
2984 pg_sum.stats.sum.num_objects) {
2985 double pc = (double)pg_sum.stats.sum.num_objects_unfound /
2986 (double)pg_sum.stats.sum.num_objects * (double)100.0;
2987 char b[20];
2988 snprintf(b, sizeof(b), "%.3lf", pc);
2989 ostringstream ss;
2990 ss << pg_sum.stats.sum.num_objects_unfound
2991 << "/" << pg_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
2992 auto& d = checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str());
2993
2994 for (auto& p : pg_stat) {
2995 if (p.second.stats.sum.num_objects_unfound) {
2996 ostringstream ss;
2997 ss << "pg " << p.first
2998 << " has " << p.second.stats.sum.num_objects_unfound
2999 << " unfound objects";
3000 d.detail.push_back(ss.str());
3001 if (d.detail.size() > max) {
3002 d.detail.push_back("(additional pgs left out for brevity)");
3003 break;
3004 }
3005 }
3006 }
3007 }
3008
3009 // REQUEST_SLOW
3010 // REQUEST_STUCK
3011 // SLOW_OPS unifies them in mimic.
3012 if (osdmap.require_osd_release < CEPH_RELEASE_MIMIC &&
3013 cct->_conf->mon_osd_warn_op_age > 0 &&
3014 !osd_sum.op_queue_age_hist.h.empty() &&
3015 osd_sum.op_queue_age_hist.upper_bound() / 1000.0 >
3016 cct->_conf->mon_osd_warn_op_age) {
3017 list<string> warn_detail, error_detail;
3018 unsigned warn = 0, error = 0;
3019 float err_age =
3020 cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio;
3021 const pow2_hist_t& h = osd_sum.op_queue_age_hist;
3022 for (unsigned i = h.h.size() - 1; i > 0; --i) {
3023 float ub = (float)(1 << i) / 1000.0;
3024 if (ub < cct->_conf->mon_osd_warn_op_age)
3025 break;
3026 if (h.h[i]) {
3027 ostringstream ss;
3028 ss << h.h[i] << " ops are blocked > " << ub << " sec";
3029 if (ub > err_age) {
3030 error += h.h[i];
3031 error_detail.push_back(ss.str());
3032 } else {
3033 warn += h.h[i];
3034 warn_detail.push_back(ss.str());
3035 }
3036 }
3037 }
3038
3039 map<float,set<int>> warn_osd_by_max; // max -> osds
3040 map<float,set<int>> error_osd_by_max; // max -> osds
3041 if (!warn_detail.empty() || !error_detail.empty()) {
3042 for (auto& p : osd_stat) {
3043 const pow2_hist_t& h = p.second.op_queue_age_hist;
3044 for (unsigned i = h.h.size() - 1; i > 0; --i) {
3045 float ub = (float)(1 << i) / 1000.0;
3046 if (ub < cct->_conf->mon_osd_warn_op_age)
3047 break;
3048 if (h.h[i]) {
3049 if (ub > err_age) {
3050 error_osd_by_max[ub].insert(p.first);
3051 } else {
3052 warn_osd_by_max[ub].insert(p.first);
3053 }
3054 break;
3055 }
3056 }
3057 }
3058 }
3059
3060 if (!warn_detail.empty()) {
3061 ostringstream ss;
3062 ss << warn << " slow requests are blocked > "
3063 << cct->_conf->mon_osd_warn_op_age << " sec";
3064 auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str());
3065 d.detail.swap(warn_detail);
3066 int left = max;
3067 for (auto& p : warn_osd_by_max) {
3068 ostringstream ss;
3069 if (p.second.size() > 1) {
3070 ss << "osds " << p.second
3071 << " have blocked requests > " << p.first << " sec";
3072 } else {
3073 ss << "osd." << *p.second.begin()
3074 << " has blocked requests > " << p.first << " sec";
3075 }
3076 d.detail.push_back(ss.str());
3077 if (--left == 0) {
3078 break;
3079 }
3080 }
3081 }
3082 if (!error_detail.empty()) {
3083 ostringstream ss;
3084 ss << error << " stuck requests are blocked > "
3085 << err_age << " sec";
3086 auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str());
3087 d.detail.swap(error_detail);
3088 int left = max;
3089 for (auto& p : error_osd_by_max) {
3090 ostringstream ss;
3091 if (p.second.size() > 1) {
3092 ss << "osds " << p.second
3093 << " have stuck requests > " << p.first << " sec";
3094 } else {
3095 ss << "osd." << *p.second.begin()
3096 << " has stuck requests > " << p.first << " sec";
3097 }
3098 d.detail.push_back(ss.str());
3099 if (--left == 0) {
3100 break;
3101 }
3102 }
3103 }
3104 }
3105
3106 // OBJECT_STORE_WARN
3107 if (osd_sum.os_alerts.size()) {
3108 map<string, pair<size_t, list<string>>> os_alerts_sum;
3109
3110 for (auto& a : osd_sum.os_alerts) {
3111 int left = max;
3112 string s0 = " osd.";
3113 s0 += stringify(a.first);
3114 for (auto& aa : a.second) {
3115 string s(s0);
3116 s += " ";
3117 s += aa.second;
3118 auto it = os_alerts_sum.find(aa.first);
3119 if (it == os_alerts_sum.end()) {
3120 list<string> d;
3121 d.emplace_back(s);
3122 os_alerts_sum.emplace(aa.first, std::make_pair(1, d));
3123 } else {
3124 auto& p = it->second;
3125 ++p.first;
3126 p.second.emplace_back(s);
3127 }
3128 if (--left == 0) {
3129 break;
3130 }
3131 }
3132 }
3133
3134 for (auto& asum : os_alerts_sum) {
3135 string summary;
3136 if (asum.first == "BLUEFS_SPILLOVER") {
3137 summary = "BlueFS spillover detected";
3138 } else if (asum.first == "BLUESTORE_NO_COMPRESSION") {
3139 summary = "BlueStore compression broken";
3140 } else if (asum.first == "BLUESTORE_LEGACY_STATFS") {
3141 summary = "Legacy BlueStore stats reporting detected";
3142 } else if (asum.first == "BLUESTORE_DISK_SIZE_MISMATCH") {
3143 summary = "BlueStore has dangerous mismatch between block device and free list sizes";
3144 }
3145 summary += " on ";
3146 summary += stringify(asum.second.first);
3147 summary += " OSD(s)";
3148 auto& d = checks->add(asum.first, HEALTH_WARN, summary);
3149 for (auto& s : asum.second.second) {
3150 d.detail.push_back(s);
3151 }
3152 }
3153 }
3154 // PG_NOT_SCRUBBED
3155 // PG_NOT_DEEP_SCRUBBED
3156 if (cct->_conf->mon_warn_pg_not_scrubbed_ratio ||
3157 cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
3158 list<string> detail, deep_detail;
3159 int detail_max = max, deep_detail_max = max;
3160 int detail_more = 0, deep_detail_more = 0;
3161 int detail_total = 0, deep_detail_total = 0;
3162 for (auto& p : pg_stat) {
3163 int64_t pnum = p.first.pool();
3164 auto pool = osdmap.get_pg_pool(pnum);
3165 if (!pool)
3166 continue;
3167 if (cct->_conf->mon_warn_pg_not_scrubbed_ratio) {
3168 double scrub_max_interval = 0;
3169 pool->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
3170 if (scrub_max_interval <= 0) {
3171 scrub_max_interval = cct->_conf->osd_scrub_max_interval;
3172 }
3173 const double age = (cct->_conf->mon_warn_pg_not_scrubbed_ratio * scrub_max_interval) +
3174 scrub_max_interval;
3175 utime_t cutoff = now;
3176 cutoff -= age;
3177 if (p.second.last_scrub_stamp < cutoff) {
3178 if (detail_max > 0) {
3179 ostringstream ss;
3180 ss << "pg " << p.first << " not scrubbed since "
3181 << p.second.last_scrub_stamp;
3182 detail.push_back(ss.str());
3183 --detail_max;
3184 } else {
3185 ++detail_more;
3186 }
3187 ++detail_total;
3188 }
3189 }
3190 if (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
3191 double deep_scrub_interval = 0;
3192 pool->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
3193 if (deep_scrub_interval <= 0) {
3194 deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
3195 }
3196 double deep_age = (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio * deep_scrub_interval) +
3197 deep_scrub_interval;
3198 utime_t deep_cutoff = now;
3199 deep_cutoff -= deep_age;
3200 if (p.second.last_deep_scrub_stamp < deep_cutoff) {
3201 if (deep_detail_max > 0) {
3202 ostringstream ss;
3203 ss << "pg " << p.first << " not deep-scrubbed since "
3204 << p.second.last_deep_scrub_stamp;
3205 deep_detail.push_back(ss.str());
3206 --deep_detail_max;
3207 } else {
3208 ++deep_detail_more;
3209 }
3210 ++deep_detail_total;
3211 }
3212 }
3213 }
3214 if (detail_total) {
3215 ostringstream ss;
3216 ss << detail_total << " pgs not scrubbed in time";
3217 auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str());
3218
3219 if (!detail.empty()) {
3220 d.detail.swap(detail);
3221
3222 if (detail_more) {
3223 ostringstream ss;
3224 ss << detail_more << " more pgs... ";
3225 d.detail.push_back(ss.str());
3226 }
3227 }
3228 }
3229 if (deep_detail_total) {
3230 ostringstream ss;
3231 ss << deep_detail_total << " pgs not deep-scrubbed in time";
3232 auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str());
3233
3234 if (!deep_detail.empty()) {
3235 d.detail.swap(deep_detail);
3236
3237 if (deep_detail_more) {
3238 ostringstream ss;
3239 ss << deep_detail_more << " more pgs... ";
3240 d.detail.push_back(ss.str());
3241 }
3242 }
3243 }
3244 }
3245
3246 // POOL_APP
3247 if (g_conf().get_val<bool>("mon_warn_on_pool_no_app")) {
3248 list<string> detail;
3249 for (auto &it : pools) {
3250 const pg_pool_t &pool = it.second;
3251 const string& pool_name = osdmap.get_pool_name(it.first);
3252 auto it2 = pg_pool_sum.find(it.first);
3253 if (it2 == pg_pool_sum.end()) {
3254 continue;
3255 }
3256 const pool_stat_t *pstat = &it2->second;
3257 if (pstat == nullptr) {
3258 continue;
3259 }
3260 const object_stat_sum_t& sum = pstat->stats.sum;
3261 // application metadata is not encoded until luminous is minimum
3262 // required release
3263 if (sum.num_objects > 0 && pool.application_metadata.empty() &&
3264 !pool.is_tier()) {
3265 stringstream ss;
3266 ss << "application not enabled on pool '" << pool_name << "'";
3267 detail.push_back(ss.str());
3268 }
3269 }
3270 if (!detail.empty()) {
3271 ostringstream ss;
3272 ss << "application not enabled on " << detail.size() << " pool(s)";
3273 auto& d = checks->add("POOL_APP_NOT_ENABLED", HEALTH_WARN, ss.str());
3274 stringstream tip;
3275 tip << "use 'ceph osd pool application enable <pool-name> "
3276 << "<app-name>', where <app-name> is 'cephfs', 'rbd', 'rgw', "
3277 << "or freeform for custom applications.";
3278 detail.push_back(tip.str());
3279 d.detail.swap(detail);
3280 }
3281 }
3282
3283 // PG_SLOW_SNAP_TRIMMING
3284 if (!pg_stat.empty() && cct->_conf->mon_osd_snap_trim_queue_warn_on > 0) {
3285 uint32_t snapthreshold = cct->_conf->mon_osd_snap_trim_queue_warn_on;
3286 uint64_t snaptrimq_exceeded = 0;
3287 uint32_t longest_queue = 0;
3288 const pg_t* longest_q_pg = nullptr;
3289 list<string> detail;
3290
3291 for (auto& i: pg_stat) {
3292 uint32_t current_len = i.second.snaptrimq_len;
3293 if (current_len >= snapthreshold) {
3294 snaptrimq_exceeded++;
3295 if (longest_queue <= current_len) {
3296 longest_q_pg = &i.first;
3297 longest_queue = current_len;
3298 }
3299 if (detail.size() < max - 1) {
3300 stringstream ss;
3301 ss << "snap trim queue for pg " << i.first << " at " << current_len;
3302 detail.push_back(ss.str());
3303 continue;
3304 }
3305 if (detail.size() < max) {
3306 detail.push_back("...more pgs affected");
3307 continue;
3308 }
3309 }
3310 }
3311
3312 if (snaptrimq_exceeded) {
3313 {
3314 ostringstream ss;
3315 ss << "longest queue on pg " << *longest_q_pg << " at " << longest_queue;
3316 detail.push_back(ss.str());
3317 }
3318
3319 stringstream ss;
3320 ss << "snap trim queue for " << snaptrimq_exceeded << " pg(s) >= " << snapthreshold << " (mon_osd_snap_trim_queue_warn_on)";
3321 auto& d = checks->add("PG_SLOW_SNAP_TRIMMING", HEALTH_WARN, ss.str());
3322 detail.push_back("try decreasing \"osd snap trim sleep\" and/or increasing \"osd pg max concurrent snap trims\".");
3323 d.detail.swap(detail);
3324 }
3325 }
3326 }
3327
3328 int process_pg_map_command(
3329 const string& orig_prefix,
3330 const cmdmap_t& orig_cmdmap,
3331 const PGMap& pg_map,
3332 const OSDMap& osdmap,
3333 Formatter *f,
3334 stringstream *ss,
3335 bufferlist *odata)
3336 {
3337 string prefix = orig_prefix;
3338 auto cmdmap = orig_cmdmap;
3339
3340 string omap_stats_note =
3341 "\n* NOTE: Omap statistics are gathered during deep scrub and "
3342 "may be inaccurate soon afterwards depending on utilisation. See "
3343 "http://docs.ceph.com/docs/master/dev/placement-group/#omap-statistics "
3344 "for further details.\n";
3345 bool omap_stats_note_required = false;
3346
3347 // perhaps these would be better in the parsing, but it's weird
3348 bool primary = false;
3349 if (prefix == "pg dump_json") {
3350 vector<string> v;
3351 v.push_back(string("all"));
3352 cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
3353 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3354 prefix = "pg dump";
3355 } else if (prefix == "pg dump_pools_json") {
3356 vector<string> v;
3357 v.push_back(string("pools"));
3358 cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
3359 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3360 prefix = "pg dump";
3361 } else if (prefix == "pg ls-by-primary") {
3362 primary = true;
3363 prefix = "pg ls";
3364 } else if (prefix == "pg ls-by-osd") {
3365 prefix = "pg ls";
3366 } else if (prefix == "pg ls-by-pool") {
3367 prefix = "pg ls";
3368 string poolstr;
3369 cmd_getval(g_ceph_context, cmdmap, "poolstr", poolstr);
3370 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
3371 if (pool < 0) {
3372 *ss << "pool " << poolstr << " does not exist";
3373 return -ENOENT;
3374 }
3375 cmd_putval(g_ceph_context, cmdmap, "pool", pool);
3376 }
3377
3378 stringstream ds;
3379 if (prefix == "pg stat") {
3380 if (f) {
3381 f->open_object_section("pg_summary");
3382 pg_map.print_oneline_summary(f, NULL);
3383 f->close_section();
3384 f->flush(ds);
3385 } else {
3386 ds << pg_map;
3387 }
3388 odata->append(ds);
3389 return 0;
3390 }
3391
3392 if (prefix == "pg getmap") {
3393 pg_map.encode(*odata);
3394 *ss << "got pgmap version " << pg_map.version;
3395 return 0;
3396 }
3397
3398 if (prefix == "pg dump") {
3399 string val;
3400 vector<string> dumpcontents;
3401 set<string> what;
3402 if (cmd_getval(g_ceph_context, cmdmap, "dumpcontents", dumpcontents)) {
3403 copy(dumpcontents.begin(), dumpcontents.end(),
3404 inserter(what, what.end()));
3405 }
3406 if (what.empty())
3407 what.insert("all");
3408 if (f) {
3409 if (what.count("all")) {
3410 f->open_object_section("pg_map");
3411 pg_map.dump(f);
3412 f->close_section();
3413 } else if (what.count("summary") || what.count("sum")) {
3414 f->open_object_section("pg_map");
3415 pg_map.dump_basic(f);
3416 f->close_section();
3417 } else {
3418 if (what.count("pools")) {
3419 pg_map.dump_pool_stats(f);
3420 }
3421 if (what.count("osds")) {
3422 pg_map.dump_osd_stats(f);
3423 }
3424 if (what.count("pgs")) {
3425 pg_map.dump_pg_stats(f, false);
3426 }
3427 if (what.count("pgs_brief")) {
3428 pg_map.dump_pg_stats(f, true);
3429 }
3430 if (what.count("delta")) {
3431 f->open_object_section("delta");
3432 pg_map.dump_delta(f);
3433 f->close_section();
3434 }
3435 }
3436 f->flush(*odata);
3437 } else {
3438 if (what.count("all")) {
3439 pg_map.dump(ds);
3440 omap_stats_note_required = true;
3441 } else if (what.count("summary") || what.count("sum")) {
3442 pg_map.dump_basic(ds);
3443 pg_map.dump_pg_sum_stats(ds, true);
3444 pg_map.dump_osd_sum_stats(ds);
3445 omap_stats_note_required = true;
3446 } else {
3447 if (what.count("pgs_brief")) {
3448 pg_map.dump_pg_stats(ds, true);
3449 }
3450 bool header = true;
3451 if (what.count("pgs")) {
3452 pg_map.dump_pg_stats(ds, false);
3453 header = false;
3454 omap_stats_note_required = true;
3455 }
3456 if (what.count("pools")) {
3457 pg_map.dump_pool_stats(ds, header);
3458 omap_stats_note_required = true;
3459 }
3460 if (what.count("osds")) {
3461 pg_map.dump_osd_stats(ds);
3462 }
3463 }
3464 odata->append(ds);
3465 if (omap_stats_note_required) {
3466 odata->append(omap_stats_note);
3467 }
3468 }
3469 *ss << "dumped " << what;
3470 return 0;
3471 }
3472
3473 if (prefix == "pg ls") {
3474 int64_t osd = -1;
3475 int64_t pool = -1;
3476 vector<string>states;
3477 set<pg_t> pgs;
3478 cmd_getval(g_ceph_context, cmdmap, "pool", pool);
3479 cmd_getval(g_ceph_context, cmdmap, "osd", osd);
3480 cmd_getval(g_ceph_context, cmdmap, "states", states);
3481 if (pool >= 0 && !osdmap.have_pg_pool(pool)) {
3482 *ss << "pool " << pool << " does not exist";
3483 return -ENOENT;
3484 }
3485 if (osd >= 0 && !osdmap.is_up(osd)) {
3486 *ss << "osd " << osd << " is not up";
3487 return -EAGAIN;
3488 }
3489 if (states.empty())
3490 states.push_back("all");
3491
3492 uint64_t state = 0;
3493
3494 while (!states.empty()) {
3495 string state_str = states.back();
3496
3497 if (state_str == "all") {
3498 state = -1;
3499 break;
3500 } else {
3501 auto filter = pg_string_state(state_str);
3502 if (!filter) {
3503 *ss << "'" << state_str << "' is not a valid pg state,"
3504 << " available choices: " << pg_state_string(0xFFFFFFFF);
3505 return -EINVAL;
3506 }
3507 state |= *filter;
3508 }
3509
3510 states.pop_back();
3511 }
3512
3513 pg_map.get_filtered_pg_stats(state, pool, osd, primary, pgs);
3514
3515 if (f && !pgs.empty()) {
3516 pg_map.dump_filtered_pg_stats(f, pgs);
3517 f->flush(*odata);
3518 } else if (!pgs.empty()) {
3519 pg_map.dump_filtered_pg_stats(ds, pgs);
3520 odata->append(ds);
3521 odata->append(omap_stats_note);
3522 }
3523 return 0;
3524 }
3525
3526 if (prefix == "pg dump_stuck") {
3527 vector<string> stuckop_vec;
3528 cmd_getval(g_ceph_context, cmdmap, "stuckops", stuckop_vec);
3529 if (stuckop_vec.empty())
3530 stuckop_vec.push_back("unclean");
3531 int64_t threshold;
3532 cmd_getval(g_ceph_context, cmdmap, "threshold", threshold,
3533 g_conf().get_val<int64_t>("mon_pg_stuck_threshold"));
3534
3535 if (pg_map.dump_stuck_pg_stats(ds, f, (int)threshold, stuckop_vec) < 0) {
3536 *ss << "failed";
3537 } else {
3538 *ss << "ok";
3539 }
3540 odata->append(ds);
3541 return 0;
3542 }
3543
3544 if (prefix == "pg debug") {
3545 string debugop;
3546 cmd_getval(g_ceph_context, cmdmap, "debugop", debugop,
3547 string("unfound_objects_exist"));
3548 if (debugop == "unfound_objects_exist") {
3549 bool unfound_objects_exist = false;
3550 for (const auto& p : pg_map.pg_stat) {
3551 if (p.second.stats.sum.num_objects_unfound > 0) {
3552 unfound_objects_exist = true;
3553 break;
3554 }
3555 }
3556 if (unfound_objects_exist)
3557 ds << "TRUE";
3558 else
3559 ds << "FALSE";
3560 odata->append(ds);
3561 return 0;
3562 }
3563 if (debugop == "degraded_pgs_exist") {
3564 bool degraded_pgs_exist = false;
3565 for (const auto& p : pg_map.pg_stat) {
3566 if (p.second.stats.sum.num_objects_degraded > 0) {
3567 degraded_pgs_exist = true;
3568 break;
3569 }
3570 }
3571 if (degraded_pgs_exist)
3572 ds << "TRUE";
3573 else
3574 ds << "FALSE";
3575 odata->append(ds);
3576 return 0;
3577 }
3578 }
3579
3580 if (prefix == "osd perf") {
3581 if (f) {
3582 f->open_object_section("osdstats");
3583 pg_map.dump_osd_perf_stats(f);
3584 f->close_section();
3585 f->flush(ds);
3586 } else {
3587 pg_map.print_osd_perf_stats(&ds);
3588 }
3589 odata->append(ds);
3590 return 0;
3591 }
3592
3593 if (prefix == "osd blocked-by") {
3594 if (f) {
3595 f->open_object_section("osd_blocked_by");
3596 pg_map.dump_osd_blocked_by_stats(f);
3597 f->close_section();
3598 f->flush(ds);
3599 } else {
3600 pg_map.print_osd_blocked_by_stats(&ds);
3601 }
3602 odata->append(ds);
3603 return 0;
3604 }
3605
3606 return -EOPNOTSUPP;
3607 }
3608
3609 void PGMapUpdater::check_osd_map(
3610 CephContext *cct,
3611 const OSDMap& osdmap,
3612 const PGMap& pgmap,
3613 PGMap::Incremental *pending_inc)
3614 {
3615 for (auto& p : pgmap.osd_stat) {
3616 if (!osdmap.exists(p.first)) {
3617 // remove osd_stat
3618 pending_inc->rm_stat(p.first);
3619 } else if (osdmap.is_out(p.first)) {
3620 // zero osd_stat
3621 if (p.second.statfs.total != 0) {
3622 pending_inc->stat_osd_out(p.first);
3623 }
3624 } else if (!osdmap.is_up(p.first)) {
3625 // zero the op_queue_age_hist
3626 if (!p.second.op_queue_age_hist.empty()) {
3627 pending_inc->stat_osd_down_up(p.first, pgmap);
3628 }
3629 }
3630 }
3631
3632 // deleted pgs (pools)?
3633 for (auto& p : pgmap.pg_pool_sum) {
3634 if (!osdmap.have_pg_pool(p.first)) {
3635 ldout(cct, 10) << __func__ << " pool " << p.first << " gone, removing pgs"
3636 << dendl;
3637 for (auto& q : pgmap.pg_stat) {
3638 if (q.first.pool() == p.first) {
3639 pending_inc->pg_remove.insert(q.first);
3640 }
3641 }
3642 auto q = pending_inc->pg_stat_updates.begin();
3643 while (q != pending_inc->pg_stat_updates.end()) {
3644 if (q->first.pool() == p.first) {
3645 q = pending_inc->pg_stat_updates.erase(q);
3646 } else {
3647 ++q;
3648 }
3649 }
3650 }
3651 }
3652
3653 // new (split or new pool) or merged pgs?
3654 map<int64_t,unsigned> new_pg_num;
3655 for (auto& p : osdmap.get_pools()) {
3656 int64_t poolid = p.first;
3657 const pg_pool_t& pi = p.second;
3658 auto q = pgmap.num_pg_by_pool.find(poolid);
3659 unsigned my_pg_num = 0;
3660 if (q != pgmap.num_pg_by_pool.end())
3661 my_pg_num = q->second;
3662 unsigned pg_num = pi.get_pg_num();
3663 new_pg_num[poolid] = pg_num;
3664 if (my_pg_num < pg_num) {
3665 ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
3666 << " > my pg_num " << my_pg_num << dendl;
3667 for (unsigned ps = my_pg_num; ps < pg_num; ++ps) {
3668 pg_t pgid(ps, poolid);
3669 if (pending_inc->pg_stat_updates.count(pgid) == 0) {
3670 ldout(cct,20) << __func__ << " adding " << pgid << dendl;
3671 pg_stat_t &stats = pending_inc->pg_stat_updates[pgid];
3672 stats.last_fresh = osdmap.get_modified();
3673 stats.last_active = osdmap.get_modified();
3674 stats.last_change = osdmap.get_modified();
3675 stats.last_peered = osdmap.get_modified();
3676 stats.last_clean = osdmap.get_modified();
3677 stats.last_unstale = osdmap.get_modified();
3678 stats.last_undegraded = osdmap.get_modified();
3679 stats.last_fullsized = osdmap.get_modified();
3680 stats.last_scrub_stamp = osdmap.get_modified();
3681 stats.last_deep_scrub_stamp = osdmap.get_modified();
3682 stats.last_clean_scrub_stamp = osdmap.get_modified();
3683 }
3684 }
3685 } else if (my_pg_num > pg_num) {
3686 ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
3687 << " < my pg_num " << my_pg_num << dendl;
3688 for (unsigned i = pg_num; i < my_pg_num; ++i) {
3689 pg_t pgid(i, poolid);
3690 ldout(cct,20) << __func__ << " removing merged " << pgid << dendl;
3691 if (pgmap.pg_stat.count(pgid)) {
3692 pending_inc->pg_remove.insert(pgid);
3693 }
3694 pending_inc->pg_stat_updates.erase(pgid);
3695 }
3696 }
3697 }
3698 auto i = pending_inc->pg_stat_updates.begin();
3699 while (i != pending_inc->pg_stat_updates.end()) {
3700 auto j = new_pg_num.find(i->first.pool());
3701 if (j == new_pg_num.end() ||
3702 i->first.ps() >= j->second) {
3703 ldout(cct,20) << __func__ << " removing pending update to old "
3704 << i->first << dendl;
3705 i = pending_inc->pg_stat_updates.erase(i);
3706 } else {
3707 ++i;
3708 }
3709 }
3710 }
3711
3712 static void _try_mark_pg_stale(
3713 const OSDMap& osdmap,
3714 pg_t pgid,
3715 const pg_stat_t& cur,
3716 PGMap::Incremental *pending_inc)
3717 {
3718 if ((cur.state & PG_STATE_STALE) == 0 &&
3719 cur.acting_primary != -1 &&
3720 osdmap.is_down(cur.acting_primary)) {
3721 pg_stat_t *newstat;
3722 auto q = pending_inc->pg_stat_updates.find(pgid);
3723 if (q != pending_inc->pg_stat_updates.end()) {
3724 if ((q->second.acting_primary == cur.acting_primary) ||
3725 ((q->second.state & PG_STATE_STALE) == 0 &&
3726 q->second.acting_primary != -1 &&
3727 osdmap.is_down(q->second.acting_primary))) {
3728 newstat = &q->second;
3729 } else {
3730 // pending update is no longer down or already stale
3731 return;
3732 }
3733 } else {
3734 newstat = &pending_inc->pg_stat_updates[pgid];
3735 *newstat = cur;
3736 }
3737 dout(10) << __func__ << " marking pg " << pgid
3738 << " stale (acting_primary " << newstat->acting_primary
3739 << ")" << dendl;
3740 newstat->state |= PG_STATE_STALE;
3741 newstat->last_unstale = ceph_clock_now();
3742 }
3743 }
3744
3745 void PGMapUpdater::check_down_pgs(
3746 const OSDMap &osdmap,
3747 const PGMap &pg_map,
3748 bool check_all,
3749 const set<int>& need_check_down_pg_osds,
3750 PGMap::Incremental *pending_inc)
3751 {
3752 // if a large number of osds changed state, just iterate over the whole
3753 // pg map.
3754 if (need_check_down_pg_osds.size() > (unsigned)osdmap.get_num_osds() *
3755 g_conf().get_val<double>("mon_pg_check_down_all_threshold")) {
3756 check_all = true;
3757 }
3758
3759 if (check_all) {
3760 for (const auto& p : pg_map.pg_stat) {
3761 _try_mark_pg_stale(osdmap, p.first, p.second, pending_inc);
3762 }
3763 } else {
3764 for (auto osd : need_check_down_pg_osds) {
3765 if (osdmap.is_down(osd)) {
3766 auto p = pg_map.pg_by_osd.find(osd);
3767 if (p == pg_map.pg_by_osd.end()) {
3768 continue;
3769 }
3770 for (auto pgid : p->second) {
3771 const pg_stat_t &stat = pg_map.pg_stat.at(pgid);
3772 ceph_assert(stat.acting_primary == osd);
3773 _try_mark_pg_stale(osdmap, pgid, stat, pending_inc);
3774 }
3775 }
3776 }
3777 }
3778 }
3779
3780 int reweight::by_utilization(
3781 const OSDMap &osdmap,
3782 const PGMap &pgm,
3783 int oload,
3784 double max_changef,
3785 int max_osds,
3786 bool by_pg, const set<int64_t> *pools,
3787 bool no_increasing,
3788 mempool::osdmap::map<int32_t, uint32_t>* new_weights,
3789 std::stringstream *ss,
3790 std::string *out_str,
3791 Formatter *f)
3792 {
3793 if (oload <= 100) {
3794 *ss << "You must give a percentage higher than 100. "
3795 "The reweighting threshold will be calculated as <average-utilization> "
3796 "times <input-percentage>. For example, an argument of 200 would "
3797 "reweight OSDs which are twice as utilized as the average OSD.\n";
3798 return -EINVAL;
3799 }
3800
3801 vector<int> pgs_by_osd(osdmap.get_max_osd());
3802
3803 // Avoid putting a small number (or 0) in the denominator when calculating
3804 // average_util
3805 double average_util;
3806 if (by_pg) {
3807 // by pg mapping
3808 double weight_sum = 0.0; // sum up the crush weights
3809 unsigned num_pg_copies = 0;
3810 int num_osds = 0;
3811 for (const auto& pg : pgm.pg_stat) {
3812 if (pools && pools->count(pg.first.pool()) == 0)
3813 continue;
3814 for (const auto acting : pg.second.acting) {
3815 if (!osdmap.exists(acting)) {
3816 continue;
3817 }
3818 if (acting >= (int)pgs_by_osd.size())
3819 pgs_by_osd.resize(acting);
3820 if (pgs_by_osd[acting] == 0) {
3821 if (osdmap.crush->get_item_weightf(acting) <= 0) {
3822 //skip if we currently can not identify item
3823 continue;
3824 }
3825 weight_sum += osdmap.crush->get_item_weightf(acting);
3826 ++num_osds;
3827 }
3828 ++pgs_by_osd[acting];
3829 ++num_pg_copies;
3830 }
3831 }
3832
3833 if (!num_osds || (num_pg_copies / num_osds < g_conf()->mon_reweight_min_pgs_per_osd)) {
3834 *ss << "Refusing to reweight: we only have " << num_pg_copies
3835 << " PGs across " << num_osds << " osds!\n";
3836 return -EDOM;
3837 }
3838
3839 average_util = (double)num_pg_copies / weight_sum;
3840 } else {
3841 // by osd utilization
3842 int num_osd = std::max<size_t>(1, pgm.osd_stat.size());
3843 if ((uint64_t)pgm.osd_sum.statfs.total / num_osd
3844 < g_conf()->mon_reweight_min_bytes_per_osd) {
3845 *ss << "Refusing to reweight: we only have " << pgm.osd_sum.statfs.kb()
3846 << " kb across all osds!\n";
3847 return -EDOM;
3848 }
3849 if ((uint64_t)pgm.osd_sum.statfs.get_used_raw() / num_osd
3850 < g_conf()->mon_reweight_min_bytes_per_osd) {
3851 *ss << "Refusing to reweight: we only have "
3852 << pgm.osd_sum.statfs.kb_used_raw()
3853 << " kb used across all osds!\n";
3854 return -EDOM;
3855 }
3856
3857 average_util = (double)pgm.osd_sum.statfs.get_used_raw() /
3858 (double)pgm.osd_sum.statfs.total;
3859 }
3860
3861 // adjust down only if we are above the threshold
3862 const double overload_util = average_util * (double)oload / 100.0;
3863
3864 // but aggressively adjust weights up whenever possible.
3865 const double underload_util = average_util;
3866
3867 const unsigned max_change = (unsigned)(max_changef * (double)0x10000);
3868
3869 ostringstream oss;
3870 if (f) {
3871 f->open_object_section("reweight_by_utilization");
3872 f->dump_int("overload_min", oload);
3873 f->dump_float("max_change", max_changef);
3874 f->dump_int("max_change_osds", max_osds);
3875 f->dump_float("average_utilization", average_util);
3876 f->dump_float("overload_utilization", overload_util);
3877 } else {
3878 oss << "oload " << oload << "\n";
3879 oss << "max_change " << max_changef << "\n";
3880 oss << "max_change_osds " << max_osds << "\n";
3881 oss.precision(4);
3882 oss << "average_utilization " << std::fixed << average_util << "\n";
3883 oss << "overload_utilization " << overload_util << "\n";
3884 }
3885 int num_changed = 0;
3886
3887 // precompute util for each OSD
3888 std::vector<std::pair<int, float> > util_by_osd;
3889 for (const auto& p : pgm.osd_stat) {
3890 std::pair<int, float> osd_util;
3891 osd_util.first = p.first;
3892 if (by_pg) {
3893 if (p.first >= (int)pgs_by_osd.size() ||
3894 pgs_by_osd[p.first] == 0) {
3895 // skip if this OSD does not contain any pg
3896 // belonging to the specified pool(s).
3897 continue;
3898 }
3899
3900 if (osdmap.crush->get_item_weightf(p.first) <= 0) {
3901 // skip if we are unable to locate item.
3902 continue;
3903 }
3904
3905 osd_util.second =
3906 pgs_by_osd[p.first] / osdmap.crush->get_item_weightf(p.first);
3907 } else {
3908 osd_util.second =
3909 (double)p.second.statfs.get_used_raw() / (double)p.second.statfs.total;
3910 }
3911 util_by_osd.push_back(osd_util);
3912 }
3913
3914 // sort by absolute deviation from the mean utilization,
3915 // in descending order.
3916 std::sort(util_by_osd.begin(), util_by_osd.end(),
3917 [average_util](std::pair<int, float> l, std::pair<int, float> r) {
3918 return abs(l.second - average_util) > abs(r.second - average_util);
3919 }
3920 );
3921
3922 if (f)
3923 f->open_array_section("reweights");
3924
3925 for (const auto& p : util_by_osd) {
3926 unsigned weight = osdmap.get_weight(p.first);
3927 if (weight == 0) {
3928 // skip if OSD is currently out
3929 continue;
3930 }
3931 float util = p.second;
3932
3933 if (util >= overload_util) {
3934 // Assign a lower weight to overloaded OSDs. The current weight
3935 // is a factor to take into account the original weights,
3936 // to represent e.g. differing storage capacities
3937 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
3938 if (weight > max_change)
3939 new_weight = std::max(new_weight, weight - max_change);
3940 new_weights->insert({p.first, new_weight});
3941 if (f) {
3942 f->open_object_section("osd");
3943 f->dump_int("osd", p.first);
3944 f->dump_float("weight", (float)weight / (float)0x10000);
3945 f->dump_float("new_weight", (float)new_weight / (float)0x10000);
3946 f->close_section();
3947 } else {
3948 oss << "osd." << p.first << " weight "
3949 << (float)weight / (float)0x10000 << " -> "
3950 << (float)new_weight / (float)0x10000 << "\n";
3951 }
3952 if (++num_changed >= max_osds)
3953 break;
3954 }
3955 if (!no_increasing && util <= underload_util) {
3956 // assign a higher weight.. if we can.
3957 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
3958 new_weight = std::min(new_weight, weight + max_change);
3959 if (new_weight > 0x10000)
3960 new_weight = 0x10000;
3961 if (new_weight > weight) {
3962 new_weights->insert({p.first, new_weight});
3963 oss << "osd." << p.first << " weight "
3964 << (float)weight / (float)0x10000 << " -> "
3965 << (float)new_weight / (float)0x10000 << "\n";
3966 if (++num_changed >= max_osds)
3967 break;
3968 }
3969 }
3970 }
3971 if (f) {
3972 f->close_section();
3973 }
3974
3975 OSDMap newmap;
3976 newmap.deepish_copy_from(osdmap);
3977 OSDMap::Incremental newinc;
3978 newinc.fsid = newmap.get_fsid();
3979 newinc.epoch = newmap.get_epoch() + 1;
3980 newinc.new_weight = *new_weights;
3981 newmap.apply_incremental(newinc);
3982
3983 osdmap.summarize_mapping_stats(&newmap, pools, out_str, f);
3984
3985 if (f) {
3986 f->close_section();
3987 } else {
3988 *out_str += "\n";
3989 *out_str += oss.str();
3990 }
3991 return num_changed;
3992 }