]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/PGMap.cc
update sources to v12.1.1
[ceph.git] / ceph / src / mon / PGMap.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include <boost/algorithm/string.hpp>
5
6 #include "PGMap.h"
7
8 #define dout_subsys ceph_subsys_mon
9 #include "common/debug.h"
10 #include "common/Formatter.h"
11 #include "include/ceph_features.h"
12 #include "include/stringify.h"
13
14 #include "osd/osd_types.h"
15 #include "osd/OSDMap.h"
16
17 #define dout_context g_ceph_context
18
19 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMapDigest, pgmap_digest, pgmap);
20 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap, pgmap, pgmap);
21 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap::Incremental, pgmap_inc, pgmap);
22
23
24 // ---------------------
25 // PGMapDigest
26
27 void PGMapDigest::encode(bufferlist& bl, uint64_t features) const
28 {
29 // NOTE: see PGMap::encode_digest
30 ENCODE_START(1, 1, bl);
31 ::encode(num_pg, bl);
32 ::encode(num_pg_active, bl);
33 ::encode(num_pg_unknown, bl);
34 ::encode(num_osd, bl);
35 ::encode(pg_pool_sum, bl, features);
36 ::encode(pg_sum, bl, features);
37 ::encode(osd_sum, bl);
38 ::encode(num_pg_by_state, bl);
39 ::encode(num_pg_by_osd, bl);
40 ::encode(num_pg_by_pool, bl);
41 ::encode(osd_last_seq, bl);
42 ::encode(per_pool_sum_delta, bl, features);
43 ::encode(per_pool_sum_deltas_stamps, bl);
44 ::encode(pg_sum_delta, bl, features);
45 ::encode(stamp_delta, bl);
46 ::encode(avail_space_by_rule, bl);
47 ENCODE_FINISH(bl);
48 }
49
50 void PGMapDigest::decode(bufferlist::iterator& p)
51 {
52 DECODE_START(1, p);
53 ::decode(num_pg, p);
54 ::decode(num_pg_active, p);
55 ::decode(num_pg_unknown, p);
56 ::decode(num_osd, p);
57 ::decode(pg_pool_sum, p);
58 ::decode(pg_sum, p);
59 ::decode(osd_sum, p);
60 ::decode(num_pg_by_state, p);
61 ::decode(num_pg_by_osd, p);
62 ::decode(num_pg_by_pool, p);
63 ::decode(osd_last_seq, p);
64 ::decode(per_pool_sum_delta, p);
65 ::decode(per_pool_sum_deltas_stamps, p);
66 ::decode(pg_sum_delta, p);
67 ::decode(stamp_delta, p);
68 ::decode(avail_space_by_rule, p);
69 DECODE_FINISH(p);
70 }
71
72 void PGMapDigest::dump(Formatter *f) const
73 {
74 f->dump_unsigned("num_pg", num_pg);
75 f->dump_unsigned("num_pg_active", num_pg_active);
76 f->dump_unsigned("num_pg_unknown", num_pg_unknown);
77 f->dump_unsigned("num_osd", num_osd);
78 f->dump_object("pool_sum", pg_sum);
79 f->dump_object("osd_sum", osd_sum);
80 f->open_array_section("pool_stats");
81 for (auto& p : pg_pool_sum) {
82 f->open_object_section("pool_stat");
83 f->dump_int("poolid", p.first);
84 auto q = num_pg_by_pool.find(p.first);
85 if (q != num_pg_by_pool.end())
86 f->dump_unsigned("num_pg", q->second);
87 p.second.dump(f);
88 f->close_section();
89 }
90 f->close_section();
91 f->open_array_section("osd_stats");
92 int i = 0;
93 // TODO: this isn't really correct since we can dump non-existent OSDs
94 // I dunno what osd_last_seq is set to in that case...
95 for (auto& p : osd_last_seq) {
96 f->open_object_section("osd_stat");
97 f->dump_int("osd", i);
98 f->dump_unsigned("seq", p);
99 f->close_section();
100 ++i;
101 }
102 f->close_section();
103 f->open_array_section("num_pg_by_state");
104 for (auto& p : num_pg_by_state) {
105 f->open_object_section("count");
106 f->dump_string("state", pg_state_string(p.first));
107 f->dump_unsigned("num", p.second);
108 f->close_section();
109 }
110 f->close_section();
111 f->open_array_section("num_pg_by_osd");
112 for (auto& p : num_pg_by_osd) {
113 f->open_object_section("count");
114 f->dump_unsigned("osd", p.first);
115 f->dump_unsigned("num_primary_pg", p.second.primary);
116 f->dump_unsigned("num_acting_pg", p.second.acting);
117 f->dump_unsigned("num_up_pg", p.second.up);
118 f->close_section();
119 }
120 f->close_section();
121 }
122
123 void PGMapDigest::generate_test_instances(list<PGMapDigest*>& ls)
124 {
125 ls.push_back(new PGMapDigest);
126 }
127
// Render a percentage value for table output: anything below 0.01 is shown
// as a bare "0", everything else in fixed notation with two decimals.
inline std::string percentify(const float& a) {
  if (a < 0.01)
    return "0";

  std::stringstream formatted;
  formatted << std::fixed << std::setprecision(2) << a;
  return formatted.str();
}
136
137 void PGMapDigest::print_summary(Formatter *f, ostream *out) const
138 {
139 if (f)
140 f->open_array_section("pgs_by_state");
141
142 // list is descending numeric order (by count)
143 multimap<int,int> state_by_count; // count -> state
144 for (auto p = num_pg_by_state.begin();
145 p != num_pg_by_state.end();
146 ++p) {
147 state_by_count.insert(make_pair(p->second, p->first));
148 }
149 if (f) {
150 for (auto p = state_by_count.rbegin();
151 p != state_by_count.rend();
152 ++p)
153 {
154 f->open_object_section("pgs_by_state_element");
155 f->dump_string("state_name", pg_state_string(p->second));
156 f->dump_unsigned("count", p->first);
157 f->close_section();
158 }
159 }
160 if (f)
161 f->close_section();
162
163 if (f) {
164 f->dump_unsigned("num_pgs", num_pg);
165 f->dump_unsigned("num_pools", pg_pool_sum.size());
166 f->dump_unsigned("num_objects", pg_sum.stats.sum.num_objects);
167 f->dump_unsigned("data_bytes", pg_sum.stats.sum.num_bytes);
168 f->dump_unsigned("bytes_used", osd_sum.kb_used * 1024ull);
169 f->dump_unsigned("bytes_avail", osd_sum.kb_avail * 1024ull);
170 f->dump_unsigned("bytes_total", osd_sum.kb * 1024ull);
171 } else {
172 *out << " pools: " << pg_pool_sum.size() << " pools, "
173 << num_pg << " pgs\n";
174 *out << " objects: " << si_t(pg_sum.stats.sum.num_objects) << " objects, "
175 << prettybyte_t(pg_sum.stats.sum.num_bytes) << "\n";
176 *out << " usage: "
177 << kb_t(osd_sum.kb_used) << " used, "
178 << kb_t(osd_sum.kb_avail) << " / "
179 << kb_t(osd_sum.kb) << " avail\n";
180 *out << " pgs: ";
181 }
182
183 bool pad = false;
184
185 if (num_pg_unknown > 0) {
186 float p = (float)num_pg_unknown / (float)num_pg;
187 if (f) {
188 f->dump_float("unknown_pgs_ratio", p);
189 } else {
190 char b[20];
191 snprintf(b, sizeof(b), "%.3lf", p * 100.0);
192 *out << b << "% pgs unknown\n";
193 pad = true;
194 }
195 }
196
197 int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
198 if (num_pg_inactive > 0) {
199 float p = (float)num_pg_inactive / (float)num_pg;
200 if (f) {
201 f->dump_float("inactive_pgs_ratio", p);
202 } else {
203 if (pad) {
204 *out << " ";
205 }
206 char b[20];
207 snprintf(b, sizeof(b), "%.3f", p * 100.0);
208 *out << b << "% pgs not active\n";
209 pad = true;
210 }
211 }
212
213 list<string> sl;
214 overall_recovery_summary(f, &sl);
215 if (!f && !sl.empty()) {
216 for (auto p = sl.begin(); p != sl.end(); ++p) {
217 if (pad) {
218 *out << " ";
219 }
220 *out << *p << "\n";
221 pad = true;
222 }
223 }
224 sl.clear();
225
226 if (!f) {
227 unsigned max_width = 1;
228 for (multimap<int,int>::reverse_iterator p = state_by_count.rbegin();
229 p != state_by_count.rend();
230 ++p)
231 {
232 std::stringstream ss;
233 ss << p->first;
234 max_width = MAX(ss.str().size(), max_width);
235 }
236
237 for (multimap<int,int>::reverse_iterator p = state_by_count.rbegin();
238 p != state_by_count.rend();
239 ++p)
240 {
241 if (pad) {
242 *out << " ";
243 }
244 pad = true;
245 out->setf(std::ios::left);
246 *out << std::setw(max_width) << p->first
247 << " " << pg_state_string(p->second) << "\n";
248 out->unsetf(std::ios::left);
249 }
250 }
251
252 ostringstream ss_rec_io;
253 overall_recovery_rate_summary(f, &ss_rec_io);
254 ostringstream ss_client_io;
255 overall_client_io_rate_summary(f, &ss_client_io);
256 ostringstream ss_cache_io;
257 overall_cache_io_rate_summary(f, &ss_cache_io);
258
259 if (!f && (ss_client_io.str().length() || ss_rec_io.str().length()
260 || ss_cache_io.str().length())) {
261 *out << "\n \n";
262 *out << " io:\n";
263 }
264
265 if (!f && ss_client_io.str().length())
266 *out << " client: " << ss_client_io.str() << "\n";
267 if (!f && ss_rec_io.str().length())
268 *out << " recovery: " << ss_rec_io.str() << "\n";
269 if (!f && ss_cache_io.str().length())
270 *out << " cache: " << ss_cache_io.str() << "\n";
271 }
272
273 void PGMapDigest::print_oneline_summary(Formatter *f, ostream *out) const
274 {
275 std::stringstream ss;
276
277 if (f)
278 f->open_array_section("num_pg_by_state");
279 for (auto p = num_pg_by_state.begin();
280 p != num_pg_by_state.end();
281 ++p) {
282 if (f) {
283 f->open_object_section("state");
284 f->dump_string("name", pg_state_string(p->first));
285 f->dump_unsigned("num", p->second);
286 f->close_section();
287 }
288 if (p != num_pg_by_state.begin())
289 ss << ", ";
290 ss << p->second << " " << pg_state_string(p->first);
291 }
292 if (f)
293 f->close_section();
294
295 string states = ss.str();
296 if (out)
297 *out << num_pg << " pgs: "
298 << states << "; "
299 << prettybyte_t(pg_sum.stats.sum.num_bytes) << " data, "
300 << kb_t(osd_sum.kb_used) << " used, "
301 << kb_t(osd_sum.kb_avail) << " / "
302 << kb_t(osd_sum.kb) << " avail";
303 if (f) {
304 f->dump_unsigned("num_pgs", num_pg);
305 f->dump_unsigned("num_bytes", pg_sum.stats.sum.num_bytes);
306 f->dump_unsigned("raw_bytes_used", osd_sum.kb_used << 10);
307 f->dump_unsigned("raw_bytes_avail", osd_sum.kb_avail << 10);
308 f->dump_unsigned("raw_bytes", osd_sum.kb << 10);
309 }
310
311 // make non-negative; we can get negative values if osds send
312 // uncommitted stats and then "go backward" or if they are just
313 // buggy/wrong.
314 pool_stat_t pos_delta = pg_sum_delta;
315 pos_delta.floor(0);
316 if (pos_delta.stats.sum.num_rd ||
317 pos_delta.stats.sum.num_wr) {
318 if (out)
319 *out << "; ";
320 if (pos_delta.stats.sum.num_rd) {
321 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)stamp_delta;
322 if (out)
323 *out << pretty_si_t(rd) << "B/s rd, ";
324 if (f)
325 f->dump_unsigned("read_bytes_sec", rd);
326 }
327 if (pos_delta.stats.sum.num_wr) {
328 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)stamp_delta;
329 if (out)
330 *out << pretty_si_t(wr) << "B/s wr, ";
331 if (f)
332 f->dump_unsigned("write_bytes_sec", wr);
333 }
334 int64_t iops = (pos_delta.stats.sum.num_rd + pos_delta.stats.sum.num_wr) / (double)stamp_delta;
335 if (out)
336 *out << pretty_si_t(iops) << "op/s";
337 if (f)
338 f->dump_unsigned("io_sec", iops);
339 }
340
341 list<string> sl;
342 overall_recovery_summary(f, &sl);
343 if (out)
344 for (auto p = sl.begin(); p != sl.end(); ++p)
345 *out << "; " << *p;
346 std::stringstream ssr;
347 overall_recovery_rate_summary(f, &ssr);
348 if (out && ssr.str().length())
349 *out << "; " << ssr.str() << " recovering";
350 }
351
352 void PGMapDigest::recovery_summary(Formatter *f, list<string> *psl,
353 const pool_stat_t& delta_sum) const
354 {
355 if (delta_sum.stats.sum.num_objects_degraded && delta_sum.stats.sum.num_object_copies > 0) {
356 double pc = (double)delta_sum.stats.sum.num_objects_degraded /
357 (double)delta_sum.stats.sum.num_object_copies * (double)100.0;
358 char b[20];
359 snprintf(b, sizeof(b), "%.3lf", pc);
360 if (f) {
361 f->dump_unsigned("degraded_objects", delta_sum.stats.sum.num_objects_degraded);
362 f->dump_unsigned("degraded_total", delta_sum.stats.sum.num_object_copies);
363 f->dump_float("degraded_ratio", pc / 100.0);
364 } else {
365 ostringstream ss;
366 ss << delta_sum.stats.sum.num_objects_degraded
367 << "/" << delta_sum.stats.sum.num_object_copies << " objects degraded (" << b << "%)";
368 psl->push_back(ss.str());
369 }
370 }
371 if (delta_sum.stats.sum.num_objects_misplaced && delta_sum.stats.sum.num_object_copies > 0) {
372 double pc = (double)delta_sum.stats.sum.num_objects_misplaced /
373 (double)delta_sum.stats.sum.num_object_copies * (double)100.0;
374 char b[20];
375 snprintf(b, sizeof(b), "%.3lf", pc);
376 if (f) {
377 f->dump_unsigned("misplaced_objects", delta_sum.stats.sum.num_objects_misplaced);
378 f->dump_unsigned("misplaced_total", delta_sum.stats.sum.num_object_copies);
379 f->dump_float("misplaced_ratio", pc / 100.0);
380 } else {
381 ostringstream ss;
382 ss << delta_sum.stats.sum.num_objects_misplaced
383 << "/" << delta_sum.stats.sum.num_object_copies << " objects misplaced (" << b << "%)";
384 psl->push_back(ss.str());
385 }
386 }
387 if (delta_sum.stats.sum.num_objects_unfound && delta_sum.stats.sum.num_objects) {
388 double pc = (double)delta_sum.stats.sum.num_objects_unfound /
389 (double)delta_sum.stats.sum.num_objects * (double)100.0;
390 char b[20];
391 snprintf(b, sizeof(b), "%.3lf", pc);
392 if (f) {
393 f->dump_unsigned("unfound_objects", delta_sum.stats.sum.num_objects_unfound);
394 f->dump_unsigned("unfound_total", delta_sum.stats.sum.num_objects);
395 f->dump_float("unfound_ratio", pc / 100.0);
396 } else {
397 ostringstream ss;
398 ss << delta_sum.stats.sum.num_objects_unfound
399 << "/" << delta_sum.stats.sum.num_objects << " unfound (" << b << "%)";
400 psl->push_back(ss.str());
401 }
402 }
403 }
404
405 void PGMapDigest::recovery_rate_summary(Formatter *f, ostream *out,
406 const pool_stat_t& delta_sum,
407 utime_t delta_stamp) const
408 {
409 // make non-negative; we can get negative values if osds send
410 // uncommitted stats and then "go backward" or if they are just
411 // buggy/wrong.
412 pool_stat_t pos_delta = delta_sum;
413 pos_delta.floor(0);
414 if (pos_delta.stats.sum.num_objects_recovered ||
415 pos_delta.stats.sum.num_bytes_recovered ||
416 pos_delta.stats.sum.num_keys_recovered) {
417 int64_t objps = pos_delta.stats.sum.num_objects_recovered / (double)delta_stamp;
418 int64_t bps = pos_delta.stats.sum.num_bytes_recovered / (double)delta_stamp;
419 int64_t kps = pos_delta.stats.sum.num_keys_recovered / (double)delta_stamp;
420 if (f) {
421 f->dump_int("recovering_objects_per_sec", objps);
422 f->dump_int("recovering_bytes_per_sec", bps);
423 f->dump_int("recovering_keys_per_sec", kps);
424 f->dump_int("num_objects_recovered", pos_delta.stats.sum.num_objects_recovered);
425 f->dump_int("num_bytes_recovered", pos_delta.stats.sum.num_bytes_recovered);
426 f->dump_int("num_keys_recovered", pos_delta.stats.sum.num_keys_recovered);
427 } else {
428 *out << pretty_si_t(bps) << "B/s";
429 if (pos_delta.stats.sum.num_keys_recovered)
430 *out << ", " << pretty_si_t(kps) << "keys/s";
431 *out << ", " << pretty_si_t(objps) << "objects/s";
432 }
433 }
434 }
435
436 void PGMapDigest::overall_recovery_rate_summary(Formatter *f, ostream *out) const
437 {
438 recovery_rate_summary(f, out, pg_sum_delta, stamp_delta);
439 }
440
441 void PGMapDigest::overall_recovery_summary(Formatter *f, list<string> *psl) const
442 {
443 recovery_summary(f, psl, pg_sum);
444 }
445
446 void PGMapDigest::pool_recovery_rate_summary(Formatter *f, ostream *out,
447 uint64_t poolid) const
448 {
449 auto p = per_pool_sum_delta.find(poolid);
450 if (p == per_pool_sum_delta.end())
451 return;
452
453 auto ts = per_pool_sum_deltas_stamps.find(p->first);
454 assert(ts != per_pool_sum_deltas_stamps.end());
455 recovery_rate_summary(f, out, p->second.first, ts->second);
456 }
457
458 void PGMapDigest::pool_recovery_summary(Formatter *f, list<string> *psl,
459 uint64_t poolid) const
460 {
461 auto p = per_pool_sum_delta.find(poolid);
462 if (p == per_pool_sum_delta.end())
463 return;
464
465 recovery_summary(f, psl, p->second.first);
466 }
467
468 void PGMapDigest::client_io_rate_summary(Formatter *f, ostream *out,
469 const pool_stat_t& delta_sum,
470 utime_t delta_stamp) const
471 {
472 pool_stat_t pos_delta = delta_sum;
473 pos_delta.floor(0);
474 if (pos_delta.stats.sum.num_rd ||
475 pos_delta.stats.sum.num_wr) {
476 if (pos_delta.stats.sum.num_rd) {
477 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)delta_stamp;
478 if (f) {
479 f->dump_int("read_bytes_sec", rd);
480 } else {
481 *out << pretty_si_t(rd) << "B/s rd, ";
482 }
483 }
484 if (pos_delta.stats.sum.num_wr) {
485 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)delta_stamp;
486 if (f) {
487 f->dump_int("write_bytes_sec", wr);
488 } else {
489 *out << pretty_si_t(wr) << "B/s wr, ";
490 }
491 }
492 int64_t iops_rd = pos_delta.stats.sum.num_rd / (double)delta_stamp;
493 int64_t iops_wr = pos_delta.stats.sum.num_wr / (double)delta_stamp;
494 if (f) {
495 f->dump_int("read_op_per_sec", iops_rd);
496 f->dump_int("write_op_per_sec", iops_wr);
497 } else {
498 *out << pretty_si_t(iops_rd) << "op/s rd, " << pretty_si_t(iops_wr) << "op/s wr";
499 }
500 }
501 }
502
503 void PGMapDigest::overall_client_io_rate_summary(Formatter *f, ostream *out) const
504 {
505 client_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
506 }
507
508 void PGMapDigest::pool_client_io_rate_summary(Formatter *f, ostream *out,
509 uint64_t poolid) const
510 {
511 auto p = per_pool_sum_delta.find(poolid);
512 if (p == per_pool_sum_delta.end())
513 return;
514
515 auto ts = per_pool_sum_deltas_stamps.find(p->first);
516 assert(ts != per_pool_sum_deltas_stamps.end());
517 client_io_rate_summary(f, out, p->second.first, ts->second);
518 }
519
520 void PGMapDigest::cache_io_rate_summary(Formatter *f, ostream *out,
521 const pool_stat_t& delta_sum,
522 utime_t delta_stamp) const
523 {
524 pool_stat_t pos_delta = delta_sum;
525 pos_delta.floor(0);
526 bool have_output = false;
527
528 if (pos_delta.stats.sum.num_flush) {
529 int64_t flush = (pos_delta.stats.sum.num_flush_kb << 10) / (double)delta_stamp;
530 if (f) {
531 f->dump_int("flush_bytes_sec", flush);
532 } else {
533 *out << pretty_si_t(flush) << "B/s flush";
534 have_output = true;
535 }
536 }
537 if (pos_delta.stats.sum.num_evict) {
538 int64_t evict = (pos_delta.stats.sum.num_evict_kb << 10) / (double)delta_stamp;
539 if (f) {
540 f->dump_int("evict_bytes_sec", evict);
541 } else {
542 if (have_output)
543 *out << ", ";
544 *out << pretty_si_t(evict) << "B/s evict";
545 have_output = true;
546 }
547 }
548 if (pos_delta.stats.sum.num_promote) {
549 int64_t promote = pos_delta.stats.sum.num_promote / (double)delta_stamp;
550 if (f) {
551 f->dump_int("promote_op_per_sec", promote);
552 } else {
553 if (have_output)
554 *out << ", ";
555 *out << pretty_si_t(promote) << "op/s promote";
556 have_output = true;
557 }
558 }
559 if (pos_delta.stats.sum.num_flush_mode_low) {
560 if (f) {
561 f->dump_int("num_flush_mode_low", pos_delta.stats.sum.num_flush_mode_low);
562 } else {
563 if (have_output)
564 *out << ", ";
565 *out << pretty_si_t(pos_delta.stats.sum.num_flush_mode_low) << "PG(s) flushing";
566 have_output = true;
567 }
568 }
569 if (pos_delta.stats.sum.num_flush_mode_high) {
570 if (f) {
571 f->dump_int("num_flush_mode_high", pos_delta.stats.sum.num_flush_mode_high);
572 } else {
573 if (have_output)
574 *out << ", ";
575 *out << pretty_si_t(pos_delta.stats.sum.num_flush_mode_high) << "PG(s) flushing (high)";
576 have_output = true;
577 }
578 }
579 if (pos_delta.stats.sum.num_evict_mode_some) {
580 if (f) {
581 f->dump_int("num_evict_mode_some", pos_delta.stats.sum.num_evict_mode_some);
582 } else {
583 if (have_output)
584 *out << ", ";
585 *out << pretty_si_t(pos_delta.stats.sum.num_evict_mode_some) << "PG(s) evicting";
586 have_output = true;
587 }
588 }
589 if (pos_delta.stats.sum.num_evict_mode_full) {
590 if (f) {
591 f->dump_int("num_evict_mode_full", pos_delta.stats.sum.num_evict_mode_full);
592 } else {
593 if (have_output)
594 *out << ", ";
595 *out << pretty_si_t(pos_delta.stats.sum.num_evict_mode_full) << "PG(s) evicting (full)";
596 }
597 }
598 }
599
600 void PGMapDigest::overall_cache_io_rate_summary(Formatter *f, ostream *out) const
601 {
602 cache_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
603 }
604
605 void PGMapDigest::pool_cache_io_rate_summary(Formatter *f, ostream *out,
606 uint64_t poolid) const
607 {
608 auto p = per_pool_sum_delta.find(poolid);
609 if (p == per_pool_sum_delta.end())
610 return;
611
612 auto ts = per_pool_sum_deltas_stamps.find(p->first);
613 assert(ts != per_pool_sum_deltas_stamps.end());
614 cache_io_rate_summary(f, out, p->second.first, ts->second);
615 }
616
617 void PGMapDigest::dump_pool_stats_full(
618 const OSDMap &osd_map,
619 stringstream *ss,
620 Formatter *f,
621 bool verbose) const
622 {
623 TextTable tbl;
624
625 if (f) {
626 f->open_array_section("pools");
627 } else {
628 tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
629 tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT);
630 if (verbose) {
631 tbl.define_column("QUOTA OBJECTS", TextTable::LEFT, TextTable::LEFT);
632 tbl.define_column("QUOTA BYTES", TextTable::LEFT, TextTable::LEFT);
633 }
634
635 tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
636 tbl.define_column("%USED", TextTable::LEFT, TextTable::RIGHT);
637 tbl.define_column("MAX AVAIL", TextTable::LEFT, TextTable::RIGHT);
638 tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
639 if (verbose) {
640 tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT);
641 tbl.define_column("READ", TextTable::LEFT, TextTable::RIGHT);
642 tbl.define_column("WRITE", TextTable::LEFT, TextTable::RIGHT);
643 tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
644 }
645 }
646
647 map<int,uint64_t> avail_by_rule;
648 for (auto p = osd_map.get_pools().begin();
649 p != osd_map.get_pools().end(); ++p) {
650 int64_t pool_id = p->first;
651 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
652 continue;
653 const string& pool_name = osd_map.get_pool_name(pool_id);
654 const pool_stat_t &stat = pg_pool_sum.at(pool_id);
655
656 const pg_pool_t *pool = osd_map.get_pg_pool(pool_id);
657 int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
658 pool->get_type(),
659 pool->get_size());
660 int64_t avail;
661 float raw_used_rate;
662 if (avail_by_rule.count(ruleno) == 0) {
663 // FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked
664 avail = get_rule_avail(ruleno);
665 if (avail < 0)
666 avail = 0;
667 avail_by_rule[ruleno] = avail;
668 } else {
669 avail = avail_by_rule[ruleno];
670 }
671 switch (pool->get_type()) {
672 case pg_pool_t::TYPE_REPLICATED:
673 avail /= pool->get_size();
674 raw_used_rate = pool->get_size();
675 break;
676 case pg_pool_t::TYPE_ERASURE:
677 {
678 auto& ecp =
679 osd_map.get_erasure_code_profile(pool->erasure_code_profile);
680 auto pm = ecp.find("m");
681 auto pk = ecp.find("k");
682 if (pm != ecp.end() && pk != ecp.end()) {
683 int k = atoi(pk->second.c_str());
684 int m = atoi(pm->second.c_str());
685 int mk = m + k;
686 assert(mk != 0);
687 avail = avail * k / mk;
688 assert(k != 0);
689 raw_used_rate = (float)mk / k;
690 } else {
691 raw_used_rate = 0.0;
692 }
693 }
694 break;
695 default:
696 assert(0 == "unrecognized pool type");
697 }
698
699 if (f) {
700 f->open_object_section("pool");
701 f->dump_string("name", pool_name);
702 f->dump_int("id", pool_id);
703 f->open_object_section("stats");
704 } else {
705 tbl << pool_name
706 << pool_id;
707 if (verbose) {
708 if (pool->quota_max_objects == 0)
709 tbl << "N/A";
710 else
711 tbl << si_t(pool->quota_max_objects);
712
713 if (pool->quota_max_bytes == 0)
714 tbl << "N/A";
715 else
716 tbl << si_t(pool->quota_max_bytes);
717 }
718
719 }
720 dump_object_stat_sum(tbl, f, stat.stats.sum, avail, raw_used_rate, verbose, pool);
721 if (f)
722 f->close_section(); // stats
723 else
724 tbl << TextTable::endrow;
725
726 if (f)
727 f->close_section(); // pool
728 }
729 if (f)
730 f->close_section();
731 else {
732 assert(ss != nullptr);
733 *ss << "POOLS:\n";
734 tbl.set_indent(4);
735 *ss << tbl;
736 }
737 }
738
739 void PGMapDigest::dump_fs_stats(stringstream *ss, Formatter *f, bool verbose) const
740 {
741 if (f) {
742 f->open_object_section("stats");
743 f->dump_int("total_bytes", osd_sum.kb * 1024ull);
744 f->dump_int("total_used_bytes", osd_sum.kb_used * 1024ull);
745 f->dump_int("total_avail_bytes", osd_sum.kb_avail * 1024ull);
746 if (verbose) {
747 f->dump_int("total_objects", pg_sum.stats.sum.num_objects);
748 }
749 f->close_section();
750 } else {
751 assert(ss != nullptr);
752 TextTable tbl;
753 tbl.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
754 tbl.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
755 tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
756 tbl.define_column("%RAW USED", TextTable::LEFT, TextTable::RIGHT);
757 if (verbose) {
758 tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
759 }
760 tbl << stringify(si_t(osd_sum.kb*1024))
761 << stringify(si_t(osd_sum.kb_avail*1024))
762 << stringify(si_t(osd_sum.kb_used*1024));
763 float used = 0.0;
764 if (osd_sum.kb > 0) {
765 used = ((float)osd_sum.kb_used / osd_sum.kb);
766 }
767 tbl << percentify(used*100);
768 if (verbose) {
769 tbl << stringify(si_t(pg_sum.stats.sum.num_objects));
770 }
771 tbl << TextTable::endrow;
772
773 *ss << "GLOBAL:\n";
774 tbl.set_indent(4);
775 *ss << tbl;
776 }
777 }
778
779 void PGMapDigest::dump_object_stat_sum(
780 TextTable &tbl, Formatter *f,
781 const object_stat_sum_t &sum, uint64_t avail,
782 float raw_used_rate, bool verbose,
783 const pg_pool_t *pool)
784 {
785 float curr_object_copies_rate = 0.0;
786 if (sum.num_object_copies > 0)
787 curr_object_copies_rate = (float)(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies;
788
789 float used = 0.0;
790 if (avail) {
791 used = sum.num_bytes * curr_object_copies_rate;
792 used /= used + avail;
793 } else if (sum.num_bytes) {
794 used = 1.0;
795 }
796
797 if (f) {
798 f->dump_int("kb_used", SHIFT_ROUND_UP(sum.num_bytes, 10));
799 f->dump_int("bytes_used", sum.num_bytes);
800 f->dump_format_unquoted("percent_used", "%.2f", (used*100));
801 f->dump_unsigned("max_avail", avail);
802 f->dump_int("objects", sum.num_objects);
803 if (verbose) {
804 f->dump_int("quota_objects", pool->quota_max_objects);
805 f->dump_int("quota_bytes", pool->quota_max_bytes);
806 f->dump_int("dirty", sum.num_objects_dirty);
807 f->dump_int("rd", sum.num_rd);
808 f->dump_int("rd_bytes", sum.num_rd_kb * 1024ull);
809 f->dump_int("wr", sum.num_wr);
810 f->dump_int("wr_bytes", sum.num_wr_kb * 1024ull);
811 f->dump_int("raw_bytes_used", sum.num_bytes * raw_used_rate * curr_object_copies_rate);
812 }
813 } else {
814 tbl << stringify(si_t(sum.num_bytes));
815 tbl << percentify(used*100);
816 tbl << si_t(avail);
817 tbl << sum.num_objects;
818 if (verbose) {
819 tbl << stringify(si_t(sum.num_objects_dirty))
820 << stringify(si_t(sum.num_rd))
821 << stringify(si_t(sum.num_wr))
822 << stringify(si_t(sum.num_bytes * raw_used_rate * curr_object_copies_rate));
823 }
824 }
825 }
826
827 int64_t PGMap::get_rule_avail(const OSDMap& osdmap, int ruleno) const
828 {
829 map<int,float> wm;
830 int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm);
831 if (r < 0) {
832 return r;
833 }
834 if (wm.empty()) {
835 return 0;
836 }
837
838 float fratio;
839 if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
840 osdmap.get_full_ratio() > 0) {
841 fratio = osdmap.get_full_ratio();
842 } else {
843 fratio = get_fallback_full_ratio();
844 }
845
846 int64_t min = -1;
847 for (auto p = wm.begin(); p != wm.end(); ++p) {
848 auto osd_info = osd_stat.find(p->first);
849 if (osd_info != osd_stat.end()) {
850 if (osd_info->second.kb == 0 || p->second == 0) {
851 // osd must be out, hence its stats have been zeroed
852 // (unless we somehow managed to have a disk with size 0...)
853 //
854 // (p->second == 0), if osd weight is 0, no need to
855 // calculate proj below.
856 continue;
857 }
858 double unusable = (double)osd_info->second.kb *
859 (1.0 - fratio);
860 double avail = MAX(0.0, (double)osd_info->second.kb_avail - unusable);
861 avail *= 1024.0;
862 int64_t proj = (int64_t)(avail / (double)p->second);
863 if (min < 0 || proj < min) {
864 min = proj;
865 }
866 } else {
867 dout(0) << "Cannot get stat of OSD " << p->first << dendl;
868 }
869 }
870 return min;
871 }
872
873 void PGMap::get_rules_avail(const OSDMap& osdmap,
874 std::map<int,int64_t> *avail_map) const
875 {
876 avail_map->clear();
877 for (auto p : osdmap.get_pools()) {
878 int64_t pool_id = p.first;
879 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
880 continue;
881 const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
882 int ruleno = osdmap.crush->find_rule(pool->get_crush_rule(),
883 pool->get_type(),
884 pool->get_size());
885 if (avail_map->count(ruleno) == 0)
886 (*avail_map)[ruleno] = get_rule_avail(osdmap, ruleno);
887 }
888 }
889
890 // ---------------------
891 // PGMap
892
893 void PGMap::Incremental::encode(bufferlist &bl, uint64_t features) const
894 {
895 if ((features & CEPH_FEATURE_MONENC) == 0) {
896 __u8 v = 4;
897 ::encode(v, bl);
898 ::encode(version, bl);
899 ::encode(pg_stat_updates, bl);
900 ::encode(osd_stat_updates, bl);
901 ::encode(osd_stat_rm, bl);
902 ::encode(osdmap_epoch, bl);
903 ::encode(pg_scan, bl);
904 ::encode(full_ratio, bl);
905 ::encode(nearfull_ratio, bl);
906 ::encode(pg_remove, bl);
907 return;
908 }
909
910 ENCODE_START(7, 5, bl);
911 ::encode(version, bl);
912 ::encode(pg_stat_updates, bl);
913 ::encode(osd_stat_updates, bl);
914 ::encode(osd_stat_rm, bl);
915 ::encode(osdmap_epoch, bl);
916 ::encode(pg_scan, bl);
917 ::encode(full_ratio, bl);
918 ::encode(nearfull_ratio, bl);
919 ::encode(pg_remove, bl);
920 ::encode(stamp, bl);
921 ::encode(osd_epochs, bl);
922 ENCODE_FINISH(bl);
923 }
924
925 void PGMap::Incremental::decode(bufferlist::iterator &bl)
926 {
927 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
928 ::decode(version, bl);
929 if (struct_v < 3) {
930 pg_stat_updates.clear();
931 __u32 n;
932 ::decode(n, bl);
933 while (n--) {
934 old_pg_t opgid;
935 ::decode(opgid, bl);
936 pg_t pgid = opgid;
937 ::decode(pg_stat_updates[pgid], bl);
938 }
939 } else {
940 ::decode(pg_stat_updates, bl);
941 }
942 ::decode(osd_stat_updates, bl);
943 ::decode(osd_stat_rm, bl);
944 ::decode(osdmap_epoch, bl);
945 ::decode(pg_scan, bl);
946 if (struct_v >= 2) {
947 ::decode(full_ratio, bl);
948 ::decode(nearfull_ratio, bl);
949 }
950 if (struct_v < 3) {
951 pg_remove.clear();
952 __u32 n;
953 ::decode(n, bl);
954 while (n--) {
955 old_pg_t opgid;
956 ::decode(opgid, bl);
957 pg_remove.insert(pg_t(opgid));
958 }
959 } else {
960 ::decode(pg_remove, bl);
961 }
962 if (struct_v < 4 && full_ratio == 0) {
963 full_ratio = -1;
964 }
965 if (struct_v < 4 && nearfull_ratio == 0) {
966 nearfull_ratio = -1;
967 }
968 if (struct_v >= 6)
969 ::decode(stamp, bl);
970 if (struct_v >= 7) {
971 ::decode(osd_epochs, bl);
972 } else {
973 for (auto i = osd_stat_updates.begin();
974 i != osd_stat_updates.end();
975 ++i) {
976 // This isn't accurate, but will cause trimming to behave like
977 // previously.
978 osd_epochs.insert(make_pair(i->first, osdmap_epoch));
979 }
980 }
981 DECODE_FINISH(bl);
982 }
983
984 void PGMap::Incremental::dump(Formatter *f) const
985 {
986 f->dump_unsigned("version", version);
987 f->dump_stream("stamp") << stamp;
988 f->dump_unsigned("osdmap_epoch", osdmap_epoch);
989 f->dump_unsigned("pg_scan_epoch", pg_scan);
990 f->dump_float("full_ratio", full_ratio);
991 f->dump_float("nearfull_ratio", nearfull_ratio);
992
993 f->open_array_section("pg_stat_updates");
994 for (auto p = pg_stat_updates.begin(); p != pg_stat_updates.end(); ++p) {
995 f->open_object_section("pg_stat");
996 f->dump_stream("pgid") << p->first;
997 p->second.dump(f);
998 f->close_section();
999 }
1000 f->close_section();
1001
1002 f->open_array_section("osd_stat_updates");
1003 for (auto p = osd_stat_updates.begin(); p != osd_stat_updates.end(); ++p) {
1004 f->open_object_section("osd_stat");
1005 f->dump_int("osd", p->first);
1006 p->second.dump(f);
1007 f->close_section();
1008 }
1009 f->close_section();
1010
1011 f->open_array_section("osd_stat_removals");
1012 for (auto p = osd_stat_rm.begin(); p != osd_stat_rm.end(); ++p)
1013 f->dump_int("osd", *p);
1014 f->close_section();
1015
1016 f->open_array_section("pg_removals");
1017 for (auto p = pg_remove.begin(); p != pg_remove.end(); ++p)
1018 f->dump_stream("pgid") << *p;
1019 f->close_section();
1020 }
1021
1022 void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
1023 {
1024 o.push_back(new Incremental);
1025 o.push_back(new Incremental);
1026 o.back()->version = 1;
1027 o.back()->stamp = utime_t(123,345);
1028 o.push_back(new Incremental);
1029 o.back()->version = 2;
1030 o.back()->pg_stat_updates[pg_t(1,2,3)] = pg_stat_t();
1031 o.back()->osd_stat_updates[5] = osd_stat_t();
1032 o.back()->osd_epochs[5] = 12;
1033 o.push_back(new Incremental);
1034 o.back()->version = 3;
1035 o.back()->osdmap_epoch = 1;
1036 o.back()->pg_scan = 2;
1037 o.back()->full_ratio = .2;
1038 o.back()->nearfull_ratio = .3;
1039 o.back()->pg_stat_updates[pg_t(4,5,6)] = pg_stat_t();
1040 o.back()->osd_stat_updates[6] = osd_stat_t();
1041 o.back()->osd_epochs[6] = 12;
1042 o.back()->pg_remove.insert(pg_t(1,2,3));
1043 o.back()->osd_stat_rm.insert(5);
1044 }
1045
1046
1047 // --
1048
1049 void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
1050 {
1051 assert(inc.version == version+1);
1052 version++;
1053
1054 utime_t delta_t;
1055 delta_t = inc.stamp;
1056 delta_t -= stamp;
1057 stamp = inc.stamp;
1058
1059 pool_stat_t pg_sum_old = pg_sum;
1060 mempool::pgmap::unordered_map<uint64_t, pool_stat_t> pg_pool_sum_old;
1061
1062 bool ratios_changed = false;
1063 if (inc.full_ratio != full_ratio && inc.full_ratio != -1) {
1064 full_ratio = inc.full_ratio;
1065 ratios_changed = true;
1066 }
1067 if (inc.nearfull_ratio != nearfull_ratio && inc.nearfull_ratio != -1) {
1068 nearfull_ratio = inc.nearfull_ratio;
1069 ratios_changed = true;
1070 }
1071 if (ratios_changed)
1072 redo_full_sets();
1073
1074 for (auto p = inc.pg_stat_updates.begin();
1075 p != inc.pg_stat_updates.end();
1076 ++p) {
1077 const pg_t &update_pg(p->first);
1078 const pg_stat_t &update_stat(p->second);
1079
1080 if (pg_pool_sum_old.count(update_pg.pool()) == 0)
1081 pg_pool_sum_old[update_pg.pool()] = pg_pool_sum[update_pg.pool()];
1082
1083 auto t = pg_stat.find(update_pg);
1084 if (t == pg_stat.end()) {
1085 pg_stat.insert(make_pair(update_pg, update_stat));
1086 } else {
1087 stat_pg_sub(update_pg, t->second);
1088 t->second = update_stat;
1089 }
1090 stat_pg_add(update_pg, update_stat);
1091 }
1092 assert(osd_stat.size() == osd_epochs.size());
1093 for (auto p = inc.get_osd_stat_updates().begin();
1094 p != inc.get_osd_stat_updates().end();
1095 ++p) {
1096 int osd = p->first;
1097 const osd_stat_t &new_stats(p->second);
1098
1099 auto t = osd_stat.find(osd);
1100 if (t == osd_stat.end()) {
1101 osd_stat.insert(make_pair(osd, new_stats));
1102 } else {
1103 stat_osd_sub(t->first, t->second);
1104 t->second = new_stats;
1105 }
1106 auto i = osd_epochs.find(osd);
1107 auto j = inc.get_osd_epochs().find(osd);
1108 assert(j != inc.get_osd_epochs().end());
1109
1110 if (i == osd_epochs.end())
1111 osd_epochs.insert(*j);
1112 else
1113 i->second = j->second;
1114
1115 stat_osd_add(osd, new_stats);
1116
1117 // adjust [near]full status
1118 register_nearfull_status(osd, new_stats);
1119 }
1120 set<int64_t> deleted_pools;
1121 for (auto p = inc.pg_remove.begin();
1122 p != inc.pg_remove.end();
1123 ++p) {
1124 const pg_t &removed_pg(*p);
1125 auto s = pg_stat.find(removed_pg);
1126 if (s != pg_stat.end()) {
1127 stat_pg_sub(removed_pg, s->second);
1128 pg_stat.erase(s);
1129 }
1130 deleted_pools.insert(removed_pg.pool());
1131 }
1132
1133 for (auto p = inc.get_osd_stat_rm().begin();
1134 p != inc.get_osd_stat_rm().end();
1135 ++p) {
1136 auto t = osd_stat.find(*p);
1137 if (t != osd_stat.end()) {
1138 stat_osd_sub(t->first, t->second);
1139 osd_stat.erase(t);
1140 osd_epochs.erase(*p);
1141 }
1142
1143 // remove these old osds from full/nearfull set(s), too
1144 nearfull_osds.erase(*p);
1145 full_osds.erase(*p);
1146 }
1147
1148 // calculate a delta, and average over the last 2 deltas.
1149 pool_stat_t d = pg_sum;
1150 d.stats.sub(pg_sum_old.stats);
1151 pg_sum_deltas.push_back(make_pair(d, delta_t));
1152 stamp_delta += delta_t;
1153
1154 pg_sum_delta.stats.add(d.stats);
1155 if (pg_sum_deltas.size() > (unsigned)MAX(1, cct ? cct->_conf->mon_stat_smooth_intervals : 1)) {
1156 pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats);
1157 stamp_delta -= pg_sum_deltas.front().second;
1158 pg_sum_deltas.pop_front();
1159 }
1160
1161 update_pool_deltas(cct, inc.stamp, pg_pool_sum_old);
1162
1163 for (auto p : deleted_pools) {
1164 if (cct)
1165 dout(20) << " deleted pool " << p << dendl;
1166 deleted_pool(p);
1167 }
1168
1169 if (inc.osdmap_epoch)
1170 last_osdmap_epoch = inc.osdmap_epoch;
1171 if (inc.pg_scan)
1172 last_pg_scan = inc.pg_scan;
1173
1174 min_last_epoch_clean = 0; // invalidate
1175 }
1176
1177 void PGMap::redo_full_sets()
1178 {
1179 full_osds.clear();
1180 nearfull_osds.clear();
1181 for (auto i = osd_stat.begin();
1182 i != osd_stat.end();
1183 ++i) {
1184 register_nearfull_status(i->first, i->second);
1185 }
1186 }
1187
1188 void PGMap::register_nearfull_status(int osd, const osd_stat_t& s)
1189 {
1190 float ratio = ((float)s.kb_used) / ((float)s.kb);
1191
1192 if (full_ratio > 0 && ratio > full_ratio) {
1193 // full
1194 full_osds.insert(osd);
1195 nearfull_osds.erase(osd);
1196 } else if (nearfull_ratio > 0 && ratio > nearfull_ratio) {
1197 // nearfull
1198 full_osds.erase(osd);
1199 nearfull_osds.insert(osd);
1200 } else {
1201 // ok
1202 full_osds.erase(osd);
1203 nearfull_osds.erase(osd);
1204 }
1205 }
1206
1207 void PGMap::calc_stats()
1208 {
1209 num_pg = 0;
1210 num_pg_active = 0;
1211 num_pg_unknown = 0;
1212 num_osd = 0;
1213 pg_pool_sum.clear();
1214 num_pg_by_pool.clear();
1215 pg_by_osd.clear();
1216 pg_sum = pool_stat_t();
1217 osd_sum = osd_stat_t();
1218 num_pg_by_state.clear();
1219 num_pg_by_osd.clear();
1220
1221 for (auto p = pg_stat.begin();
1222 p != pg_stat.end();
1223 ++p) {
1224 stat_pg_add(p->first, p->second);
1225 }
1226 for (auto p = osd_stat.begin();
1227 p != osd_stat.end();
1228 ++p)
1229 stat_osd_add(p->first, p->second);
1230
1231 redo_full_sets();
1232
1233 min_last_epoch_clean = calc_min_last_epoch_clean();
1234 }
1235
1236 void PGMap::update_pg(pg_t pgid, bufferlist& bl)
1237 {
1238 bufferlist::iterator p = bl.begin();
1239 auto s = pg_stat.find(pgid);
1240 epoch_t old_lec = 0, lec;
1241 if (s != pg_stat.end()) {
1242 old_lec = s->second.get_effective_last_epoch_clean();
1243 stat_pg_update(pgid, s->second, p);
1244 lec = s->second.get_effective_last_epoch_clean();
1245 } else {
1246 pg_stat_t& r = pg_stat[pgid];
1247 ::decode(r, p);
1248 stat_pg_add(pgid, r);
1249 lec = r.get_effective_last_epoch_clean();
1250 }
1251
1252 if (min_last_epoch_clean &&
1253 (lec < min_last_epoch_clean || // we did
1254 (lec > min_last_epoch_clean && // we might
1255 old_lec == min_last_epoch_clean)
1256 ))
1257 min_last_epoch_clean = 0;
1258 }
1259
1260 void PGMap::remove_pg(pg_t pgid)
1261 {
1262 auto s = pg_stat.find(pgid);
1263 if (s != pg_stat.end()) {
1264 if (min_last_epoch_clean &&
1265 s->second.get_effective_last_epoch_clean() == min_last_epoch_clean)
1266 min_last_epoch_clean = 0;
1267 stat_pg_sub(pgid, s->second);
1268 pg_stat.erase(s);
1269 }
1270 }
1271
1272 void PGMap::update_osd(int osd, bufferlist& bl)
1273 {
1274 bufferlist::iterator p = bl.begin();
1275 auto o = osd_stat.find(osd);
1276 epoch_t old_lec = 0;
1277 if (o != osd_stat.end()) {
1278 auto i = osd_epochs.find(osd);
1279 if (i != osd_epochs.end())
1280 old_lec = i->second;
1281 stat_osd_sub(osd, o->second);
1282 }
1283 osd_stat_t& r = osd_stat[osd];
1284 ::decode(r, p);
1285 stat_osd_add(osd, r);
1286
1287 // adjust [near]full status
1288 register_nearfull_status(osd, r);
1289
1290 // epoch?
1291 if (!p.end()) {
1292 epoch_t e;
1293 ::decode(e, p);
1294
1295 if (e < min_last_epoch_clean ||
1296 (e > min_last_epoch_clean &&
1297 old_lec == min_last_epoch_clean))
1298 min_last_epoch_clean = 0;
1299 } else {
1300 // WARNING: we are not refreshing min_last_epoch_clean! must be old store
1301 // or old mon running.
1302 }
1303 }
1304
1305 void PGMap::remove_osd(int osd)
1306 {
1307 auto o = osd_stat.find(osd);
1308 if (o != osd_stat.end()) {
1309 stat_osd_sub(osd, o->second);
1310 osd_stat.erase(o);
1311
1312 // remove these old osds from full/nearfull set(s), too
1313 nearfull_osds.erase(osd);
1314 full_osds.erase(osd);
1315 }
1316 }
1317
1318 void PGMap::stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
1319 bool sameosds)
1320 {
1321 pg_pool_sum[pgid.pool()].add(s);
1322 pg_sum.add(s);
1323
1324 num_pg++;
1325 num_pg_by_state[s.state]++;
1326 num_pg_by_pool[pgid.pool()]++;
1327
1328 if ((s.state & PG_STATE_CREATING) &&
1329 s.parent_split_bits == 0) {
1330 creating_pgs.insert(pgid);
1331 if (s.acting_primary >= 0) {
1332 creating_pgs_by_osd_epoch[s.acting_primary][s.mapping_epoch].insert(pgid);
1333 }
1334 }
1335
1336 if (s.state & PG_STATE_ACTIVE) {
1337 ++num_pg_active;
1338 }
1339 if (s.state == 0) {
1340 ++num_pg_unknown;
1341 }
1342
1343 if (sameosds)
1344 return;
1345
1346 for (auto p = s.blocked_by.begin();
1347 p != s.blocked_by.end();
1348 ++p) {
1349 ++blocked_by_sum[*p];
1350 }
1351
1352 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
1353 pg_by_osd[*p].insert(pgid);
1354 num_pg_by_osd[*p].acting++;
1355 }
1356 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
1357 pg_by_osd[*p].insert(pgid);
1358 num_pg_by_osd[*p].up++;
1359 }
1360
1361 if (s.up_primary >= 0) {
1362 num_pg_by_osd[s.up_primary].primary++;
1363 }
1364 }
1365
1366 void PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
1367 bool sameosds)
1368 {
1369 pool_stat_t& ps = pg_pool_sum[pgid.pool()];
1370 ps.sub(s);
1371 pg_sum.sub(s);
1372
1373 num_pg--;
1374 int end = --num_pg_by_state[s.state];
1375 assert(end >= 0);
1376 if (end == 0)
1377 num_pg_by_state.erase(s.state);
1378 end = --num_pg_by_pool[pgid.pool()];
1379 if (end == 0) {
1380 num_pg_by_pool.erase(pgid.pool());
1381 pg_pool_sum.erase(pgid.pool());
1382 }
1383
1384 if ((s.state & PG_STATE_CREATING) &&
1385 s.parent_split_bits == 0) {
1386 creating_pgs.erase(pgid);
1387 if (s.acting_primary >= 0) {
1388 map<epoch_t,set<pg_t> >& r = creating_pgs_by_osd_epoch[s.acting_primary];
1389 r[s.mapping_epoch].erase(pgid);
1390 if (r[s.mapping_epoch].empty())
1391 r.erase(s.mapping_epoch);
1392 if (r.empty())
1393 creating_pgs_by_osd_epoch.erase(s.acting_primary);
1394 }
1395 }
1396
1397 if (s.state & PG_STATE_ACTIVE) {
1398 --num_pg_active;
1399 }
1400 if (s.state == 0) {
1401 --num_pg_unknown;
1402 }
1403
1404 if (sameosds)
1405 return;
1406
1407 for (auto p = s.blocked_by.begin();
1408 p != s.blocked_by.end();
1409 ++p) {
1410 auto q = blocked_by_sum.find(*p);
1411 assert(q != blocked_by_sum.end());
1412 --q->second;
1413 if (q->second == 0)
1414 blocked_by_sum.erase(q);
1415 }
1416
1417 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
1418 auto& oset = pg_by_osd[*p];
1419 oset.erase(pgid);
1420 if (oset.empty())
1421 pg_by_osd.erase(*p);
1422 auto it = num_pg_by_osd.find(*p);
1423 if (it != num_pg_by_osd.end() && it->second.acting > 0)
1424 it->second.acting--;
1425 }
1426 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
1427 auto& oset = pg_by_osd[*p];
1428 oset.erase(pgid);
1429 if (oset.empty())
1430 pg_by_osd.erase(*p);
1431 auto it = num_pg_by_osd.find(*p);
1432 if (it != num_pg_by_osd.end() && it->second.up > 0)
1433 it->second.up--;
1434 }
1435
1436 if (s.up_primary >= 0) {
1437 auto it = num_pg_by_osd.find(s.up_primary);
1438 if (it != num_pg_by_osd.end() && it->second.primary > 0)
1439 it->second.primary--;
1440 }
1441 }
1442
1443 void PGMap::stat_pg_update(const pg_t pgid, pg_stat_t& s,
1444 bufferlist::iterator& blp)
1445 {
1446 pg_stat_t n;
1447 ::decode(n, blp);
1448
1449 bool sameosds =
1450 s.acting == n.acting &&
1451 s.up == n.up &&
1452 s.blocked_by == n.blocked_by;
1453
1454 stat_pg_sub(pgid, s, sameosds);
1455
1456 // if acting_primary has shift to an just restored osd, and pg yet to finish
1457 // peering, many attributes in current stats remain stale. others seem don't
1458 // mater much while faulty last_active will make "pg stuck in" check unhappy.
1459 if (!(n.state & (PG_STATE_ACTIVE | PG_STATE_PEERED)) &&
1460 n.last_active < s.last_active)
1461 n.last_active = s.last_active;
1462 s = n;
1463 stat_pg_add(pgid, n, sameosds);
1464 }
1465
1466 void PGMap::stat_osd_add(int osd, const osd_stat_t &s)
1467 {
1468 num_osd++;
1469 osd_sum.add(s);
1470 if (osd >= (int)osd_last_seq.size()) {
1471 osd_last_seq.resize(osd + 1);
1472 }
1473 osd_last_seq[osd] = s.seq;
1474 }
1475
1476 void PGMap::stat_osd_sub(int osd, const osd_stat_t &s)
1477 {
1478 num_osd--;
1479 osd_sum.sub(s);
1480 assert(osd < (int)osd_last_seq.size());
1481 osd_last_seq[osd] = 0;
1482 }
1483
1484 epoch_t PGMap::calc_min_last_epoch_clean() const
1485 {
1486 if (pg_stat.empty())
1487 return 0;
1488
1489 auto p = pg_stat.begin();
1490 epoch_t min = p->second.get_effective_last_epoch_clean();
1491 for (++p; p != pg_stat.end(); ++p) {
1492 epoch_t lec = p->second.get_effective_last_epoch_clean();
1493 if (lec < min)
1494 min = lec;
1495 }
1496 // also scan osd epochs
1497 // don't trim past the oldest reported osd epoch
1498 for (auto i = osd_epochs.begin();
1499 i != osd_epochs.end();
1500 ++i) {
1501 if (i->second < min)
1502 min = i->second;
1503 }
1504 return min;
1505 }
1506
1507 void PGMap::encode_digest(const OSDMap& osdmap,
1508 bufferlist& bl, uint64_t features) const
1509 {
1510 get_rules_avail(osdmap, &avail_space_by_rule);
1511 PGMapDigest::encode(bl, features);
1512 }
1513
1514 void PGMap::encode(bufferlist &bl, uint64_t features) const
1515 {
1516 if ((features & CEPH_FEATURE_MONENC) == 0) {
1517 __u8 v = 3;
1518 ::encode(v, bl);
1519 ::encode(version, bl);
1520 ::encode(pg_stat, bl);
1521 ::encode(osd_stat, bl);
1522 ::encode(last_osdmap_epoch, bl);
1523 ::encode(last_pg_scan, bl);
1524 ::encode(full_ratio, bl);
1525 ::encode(nearfull_ratio, bl);
1526 return;
1527 }
1528
1529 ENCODE_START(6, 4, bl);
1530 ::encode(version, bl);
1531 ::encode(pg_stat, bl);
1532 ::encode(osd_stat, bl);
1533 ::encode(last_osdmap_epoch, bl);
1534 ::encode(last_pg_scan, bl);
1535 ::encode(full_ratio, bl);
1536 ::encode(nearfull_ratio, bl);
1537 ::encode(stamp, bl);
1538 ::encode(osd_epochs, bl);
1539 ENCODE_FINISH(bl);
1540 }
1541
1542 void PGMap::decode(bufferlist::iterator &bl)
1543 {
1544 DECODE_START_LEGACY_COMPAT_LEN(6, 4, 4, bl);
1545 ::decode(version, bl);
1546 if (struct_v < 3) {
1547 pg_stat.clear();
1548 __u32 n;
1549 ::decode(n, bl);
1550 while (n--) {
1551 old_pg_t opgid;
1552 ::decode(opgid, bl);
1553 pg_t pgid = opgid;
1554 ::decode(pg_stat[pgid], bl);
1555 }
1556 } else {
1557 ::decode(pg_stat, bl);
1558 }
1559 ::decode(osd_stat, bl);
1560 ::decode(last_osdmap_epoch, bl);
1561 ::decode(last_pg_scan, bl);
1562 if (struct_v >= 2) {
1563 ::decode(full_ratio, bl);
1564 ::decode(nearfull_ratio, bl);
1565 }
1566 if (struct_v >= 5)
1567 ::decode(stamp, bl);
1568 if (struct_v >= 6) {
1569 ::decode(osd_epochs, bl);
1570 } else {
1571 for (auto i = osd_stat.begin();
1572 i != osd_stat.end();
1573 ++i) {
1574 // This isn't accurate, but will cause trimming to behave like
1575 // previously.
1576 osd_epochs.insert(make_pair(i->first, last_osdmap_epoch));
1577 }
1578 }
1579 DECODE_FINISH(bl);
1580
1581 calc_stats();
1582 }
1583
1584 void PGMap::dirty_all(Incremental& inc)
1585 {
1586 inc.osdmap_epoch = last_osdmap_epoch;
1587 inc.pg_scan = last_pg_scan;
1588 inc.full_ratio = full_ratio;
1589 inc.nearfull_ratio = nearfull_ratio;
1590
1591 for (auto p = pg_stat.begin(); p != pg_stat.end(); ++p) {
1592 inc.pg_stat_updates[p->first] = p->second;
1593 }
1594 for (auto p = osd_stat.begin(); p != osd_stat.end(); ++p) {
1595 assert(osd_epochs.count(p->first));
1596 inc.update_stat(p->first,
1597 inc.get_osd_epochs().find(p->first)->second,
1598 p->second);
1599 }
1600 }
1601
1602 void PGMap::dump(Formatter *f) const
1603 {
1604 dump_basic(f);
1605 dump_pg_stats(f, false);
1606 dump_pool_stats(f);
1607 dump_osd_stats(f);
1608 }
1609
1610 void PGMap::dump_basic(Formatter *f) const
1611 {
1612 f->dump_unsigned("version", version);
1613 f->dump_stream("stamp") << stamp;
1614 f->dump_unsigned("last_osdmap_epoch", last_osdmap_epoch);
1615 f->dump_unsigned("last_pg_scan", last_pg_scan);
1616 f->dump_unsigned("min_last_epoch_clean", min_last_epoch_clean);
1617 f->dump_float("full_ratio", full_ratio);
1618 f->dump_float("near_full_ratio", nearfull_ratio);
1619
1620 f->open_object_section("pg_stats_sum");
1621 pg_sum.dump(f);
1622 f->close_section();
1623
1624 f->open_object_section("osd_stats_sum");
1625 osd_sum.dump(f);
1626 f->close_section();
1627
1628 f->open_array_section("osd_epochs");
1629 for (auto p = osd_epochs.begin(); p != osd_epochs.end(); ++p) {
1630 f->open_object_section("osd");
1631 f->dump_unsigned("osd", p->first);
1632 f->dump_unsigned("epoch", p->second);
1633 f->close_section();
1634 }
1635 f->close_section();
1636
1637 dump_delta(f);
1638 }
1639
1640 void PGMap::dump_delta(Formatter *f) const
1641 {
1642 f->open_object_section("pg_stats_delta");
1643 pg_sum_delta.dump(f);
1644 f->close_section();
1645 }
1646
1647 void PGMap::dump_pg_stats(Formatter *f, bool brief) const
1648 {
1649 f->open_array_section("pg_stats");
1650 for (auto i = pg_stat.begin();
1651 i != pg_stat.end();
1652 ++i) {
1653 f->open_object_section("pg_stat");
1654 f->dump_stream("pgid") << i->first;
1655 if (brief)
1656 i->second.dump_brief(f);
1657 else
1658 i->second.dump(f);
1659 f->close_section();
1660 }
1661 f->close_section();
1662 }
1663
1664 void PGMap::dump_pool_stats(Formatter *f) const
1665 {
1666 f->open_array_section("pool_stats");
1667 for (auto p = pg_pool_sum.begin();
1668 p != pg_pool_sum.end();
1669 ++p) {
1670 f->open_object_section("pool_stat");
1671 f->dump_int("poolid", p->first);
1672 auto q = num_pg_by_pool.find(p->first);
1673 if (q != num_pg_by_pool.end())
1674 f->dump_unsigned("num_pg", q->second);
1675 p->second.dump(f);
1676 f->close_section();
1677 }
1678 f->close_section();
1679 }
1680
1681 void PGMap::dump_osd_stats(Formatter *f) const
1682 {
1683 f->open_array_section("osd_stats");
1684 for (auto q = osd_stat.begin();
1685 q != osd_stat.end();
1686 ++q) {
1687 f->open_object_section("osd_stat");
1688 f->dump_int("osd", q->first);
1689 q->second.dump(f);
1690 f->close_section();
1691 }
1692 f->close_section();
1693 }
1694
1695 void PGMap::dump_pg_stats_plain(
1696 ostream& ss,
1697 const mempool::pgmap::unordered_map<pg_t, pg_stat_t>& pg_stats,
1698 bool brief) const
1699 {
1700 TextTable tab;
1701
1702 if (brief){
1703 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1704 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1705 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1706 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1707 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1708 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1709 }
1710 else {
1711 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1712 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1713 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1714 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1715 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1716 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1717 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1718 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1719 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1720 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1721 tab.define_column("STATE_STAMP", TextTable::LEFT, TextTable::RIGHT);
1722 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
1723 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
1724 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1725 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1726 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1727 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1728 tab.define_column("LAST_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1729 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1730 tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1731 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1732 }
1733
1734 for (auto i = pg_stats.begin();
1735 i != pg_stats.end(); ++i) {
1736 const pg_stat_t &st(i->second);
1737 if (brief) {
1738 tab << i->first
1739 << pg_state_string(st.state)
1740 << st.up
1741 << st.up_primary
1742 << st.acting
1743 << st.acting_primary
1744 << TextTable::endrow;
1745 } else {
1746 ostringstream reported;
1747 reported << st.reported_epoch << ":" << st.reported_seq;
1748
1749 tab << i->first
1750 << st.stats.sum.num_objects
1751 << st.stats.sum.num_objects_missing_on_primary
1752 << st.stats.sum.num_objects_degraded
1753 << st.stats.sum.num_objects_misplaced
1754 << st.stats.sum.num_objects_unfound
1755 << st.stats.sum.num_bytes
1756 << st.log_size
1757 << st.ondisk_log_size
1758 << pg_state_string(st.state)
1759 << st.last_change
1760 << st.version
1761 << reported.str()
1762 << pg_vector_string(st.up)
1763 << st.up_primary
1764 << pg_vector_string(st.acting)
1765 << st.acting_primary
1766 << st.last_scrub
1767 << st.last_scrub_stamp
1768 << st.last_deep_scrub
1769 << st.last_deep_scrub_stamp
1770 << TextTable::endrow;
1771 }
1772 }
1773
1774 ss << tab;
1775 }
1776
1777 void PGMap::dump(ostream& ss) const
1778 {
1779 dump_basic(ss);
1780 dump_pg_stats(ss, false);
1781 dump_pool_stats(ss, false);
1782 dump_pg_sum_stats(ss, false);
1783 dump_osd_stats(ss);
1784 }
1785
1786 void PGMap::dump_basic(ostream& ss) const
1787 {
1788 ss << "version " << version << std::endl;
1789 ss << "stamp " << stamp << std::endl;
1790 ss << "last_osdmap_epoch " << last_osdmap_epoch << std::endl;
1791 ss << "last_pg_scan " << last_pg_scan << std::endl;
1792 ss << "full_ratio " << full_ratio << std::endl;
1793 ss << "nearfull_ratio " << nearfull_ratio << std::endl;
1794 }
1795
1796 void PGMap::dump_pg_stats(ostream& ss, bool brief) const
1797 {
1798 dump_pg_stats_plain(ss, pg_stat, brief);
1799 }
1800
1801 void PGMap::dump_pool_stats(ostream& ss, bool header) const
1802 {
1803 TextTable tab;
1804
1805 if (header) {
1806 tab.define_column("POOLID", TextTable::LEFT, TextTable::LEFT);
1807 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1808 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1809 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1810 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1811 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1812 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1813 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1814 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1815 } else {
1816 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1817 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1818 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1819 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1820 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1821 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1822 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1823 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1824 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1825 }
1826
1827 for (auto p = pg_pool_sum.begin();
1828 p != pg_pool_sum.end();
1829 ++p) {
1830 tab << p->first
1831 << p->second.stats.sum.num_objects
1832 << p->second.stats.sum.num_objects_missing_on_primary
1833 << p->second.stats.sum.num_objects_degraded
1834 << p->second.stats.sum.num_objects_misplaced
1835 << p->second.stats.sum.num_objects_unfound
1836 << p->second.stats.sum.num_bytes
1837 << p->second.log_size
1838 << p->second.ondisk_log_size
1839 << TextTable::endrow;
1840 }
1841
1842 ss << tab;
1843 }
1844
1845 void PGMap::dump_pg_sum_stats(ostream& ss, bool header) const
1846 {
1847 TextTable tab;
1848
1849 if (header) {
1850 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1851 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1852 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1853 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1854 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1855 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1856 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1857 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1858 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1859 } else {
1860 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1861 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1862 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1863 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1864 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1865 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1866 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1867 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1868 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1869 };
1870
1871 tab << "sum"
1872 << pg_sum.stats.sum.num_objects
1873 << pg_sum.stats.sum.num_objects_missing_on_primary
1874 << pg_sum.stats.sum.num_objects_degraded
1875 << pg_sum.stats.sum.num_objects_misplaced
1876 << pg_sum.stats.sum.num_objects_unfound
1877 << pg_sum.stats.sum.num_bytes
1878 << pg_sum.log_size
1879 << pg_sum.ondisk_log_size
1880 << TextTable::endrow;
1881
1882 ss << tab;
1883 }
1884
1885 void PGMap::dump_osd_stats(ostream& ss) const
1886 {
1887 TextTable tab;
1888
1889 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1890 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1891 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
1892 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
1893 tab.define_column("HB_PEERS", TextTable::LEFT, TextTable::RIGHT);
1894 tab.define_column("PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1895 tab.define_column("PRIMARY_PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1896
1897 for (auto p = osd_stat.begin();
1898 p != osd_stat.end();
1899 ++p) {
1900 tab << p->first
1901 << si_t(p->second.kb_used << 10)
1902 << si_t(p->second.kb_avail << 10)
1903 << si_t(p->second.kb << 10)
1904 << p->second.hb_peers
1905 << get_num_pg_by_osd(p->first)
1906 << get_num_primary_pg_by_osd(p->first)
1907 << TextTable::endrow;
1908 }
1909
1910 tab << "sum"
1911 << si_t(osd_sum.kb_used << 10)
1912 << si_t(osd_sum.kb_avail << 10)
1913 << si_t(osd_sum.kb << 10)
1914 << TextTable::endrow;
1915
1916 ss << tab;
1917 }
1918
1919 void PGMap::dump_osd_sum_stats(ostream& ss) const
1920 {
1921 TextTable tab;
1922
1923 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1924 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1925 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
1926 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
1927
1928 tab << "sum"
1929 << si_t(osd_sum.kb_used << 10)
1930 << si_t(osd_sum.kb_avail << 10)
1931 << si_t(osd_sum.kb << 10)
1932 << TextTable::endrow;
1933
1934 ss << tab;
1935 }
1936
1937 void PGMap::get_stuck_stats(
1938 int types, const utime_t cutoff,
1939 mempool::pgmap::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const
1940 {
1941 assert(types != 0);
1942 for (auto i = pg_stat.begin();
1943 i != pg_stat.end();
1944 ++i) {
1945 utime_t val = cutoff; // don't care about >= cutoff so that is infinity
1946
1947 if ((types & STUCK_INACTIVE) && !(i->second.state & PG_STATE_ACTIVE)) {
1948 if (i->second.last_active < val)
1949 val = i->second.last_active;
1950 }
1951
1952 if ((types & STUCK_UNCLEAN) && !(i->second.state & PG_STATE_CLEAN)) {
1953 if (i->second.last_clean < val)
1954 val = i->second.last_clean;
1955 }
1956
1957 if ((types & STUCK_DEGRADED) && (i->second.state & PG_STATE_DEGRADED)) {
1958 if (i->second.last_undegraded < val)
1959 val = i->second.last_undegraded;
1960 }
1961
1962 if ((types & STUCK_UNDERSIZED) && (i->second.state & PG_STATE_UNDERSIZED)) {
1963 if (i->second.last_fullsized < val)
1964 val = i->second.last_fullsized;
1965 }
1966
1967 if ((types & STUCK_STALE) && (i->second.state & PG_STATE_STALE)) {
1968 if (i->second.last_unstale < val)
1969 val = i->second.last_unstale;
1970 }
1971
1972 // val is now the earliest any of the requested stuck states began
1973 if (val < cutoff) {
1974 stuck_pgs[i->first] = i->second;
1975 }
1976 }
1977 }
1978
1979 bool PGMap::get_stuck_counts(const utime_t cutoff, map<string, int>& note) const
1980 {
1981 int inactive = 0;
1982 int unclean = 0;
1983 int degraded = 0;
1984 int undersized = 0;
1985 int stale = 0;
1986
1987 for (auto i = pg_stat.begin();
1988 i != pg_stat.end();
1989 ++i) {
1990 if (! (i->second.state & PG_STATE_ACTIVE)) {
1991 if (i->second.last_active < cutoff)
1992 ++inactive;
1993 }
1994 if (! (i->second.state & PG_STATE_CLEAN)) {
1995 if (i->second.last_clean < cutoff)
1996 ++unclean;
1997 }
1998 if (i->second.state & PG_STATE_DEGRADED) {
1999 if (i->second.last_undegraded < cutoff)
2000 ++degraded;
2001 }
2002 if (i->second.state & PG_STATE_UNDERSIZED) {
2003 if (i->second.last_fullsized < cutoff)
2004 ++undersized;
2005 }
2006 if (i->second.state & PG_STATE_STALE) {
2007 if (i->second.last_unstale < cutoff)
2008 ++stale;
2009 }
2010 }
2011
2012 if (inactive)
2013 note["stuck inactive"] = inactive;
2014
2015 if (unclean)
2016 note["stuck unclean"] = unclean;
2017
2018 if (undersized)
2019 note["stuck undersized"] = undersized;
2020
2021 if (degraded)
2022 note["stuck degraded"] = degraded;
2023
2024 if (stale)
2025 note["stuck stale"] = stale;
2026
2027 return inactive || unclean || undersized || degraded || stale;
2028 }
2029
2030 void PGMap::dump_stuck(Formatter *f, int types, utime_t cutoff) const
2031 {
2032 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
2033 get_stuck_stats(types, cutoff, stuck_pg_stats);
2034 f->open_array_section("stuck_pg_stats");
2035 for (auto i = stuck_pg_stats.begin();
2036 i != stuck_pg_stats.end();
2037 ++i) {
2038 f->open_object_section("pg_stat");
2039 f->dump_stream("pgid") << i->first;
2040 i->second.dump(f);
2041 f->close_section();
2042 }
2043 f->close_section();
2044 }
2045
2046 void PGMap::dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const
2047 {
2048 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
2049 get_stuck_stats(types, cutoff, stuck_pg_stats);
2050 if (!stuck_pg_stats.empty())
2051 dump_pg_stats_plain(ss, stuck_pg_stats, true);
2052 }
2053
2054 int PGMap::dump_stuck_pg_stats(
2055 stringstream &ds,
2056 Formatter *f,
2057 int threshold,
2058 vector<string>& args) const
2059 {
2060 int stuck_types = 0;
2061
2062 for (auto i = args.begin(); i != args.end(); ++i) {
2063 if (*i == "inactive")
2064 stuck_types |= PGMap::STUCK_INACTIVE;
2065 else if (*i == "unclean")
2066 stuck_types |= PGMap::STUCK_UNCLEAN;
2067 else if (*i == "undersized")
2068 stuck_types |= PGMap::STUCK_UNDERSIZED;
2069 else if (*i == "degraded")
2070 stuck_types |= PGMap::STUCK_DEGRADED;
2071 else if (*i == "stale")
2072 stuck_types |= PGMap::STUCK_STALE;
2073 else {
2074 ds << "Unknown type: " << *i << std::endl;
2075 return -EINVAL;
2076 }
2077 }
2078
2079 utime_t now(ceph_clock_now());
2080 utime_t cutoff = now - utime_t(threshold, 0);
2081
2082 if (!f) {
2083 dump_stuck_plain(ds, stuck_types, cutoff);
2084 } else {
2085 dump_stuck(f, stuck_types, cutoff);
2086 f->flush(ds);
2087 }
2088
2089 return 0;
2090 }
2091
2092 void PGMap::dump_osd_perf_stats(Formatter *f) const
2093 {
2094 f->open_array_section("osd_perf_infos");
2095 for (auto i = osd_stat.begin();
2096 i != osd_stat.end();
2097 ++i) {
2098 f->open_object_section("osd");
2099 f->dump_int("id", i->first);
2100 {
2101 f->open_object_section("perf_stats");
2102 i->second.os_perf_stat.dump(f);
2103 f->close_section();
2104 }
2105 f->close_section();
2106 }
2107 f->close_section();
2108 }
2109 void PGMap::print_osd_perf_stats(std::ostream *ss) const
2110 {
2111 TextTable tab;
2112 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2113 tab.define_column("commit_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2114 tab.define_column("apply_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2115 for (auto i = osd_stat.begin();
2116 i != osd_stat.end();
2117 ++i) {
2118 tab << i->first;
2119 tab << i->second.os_perf_stat.os_commit_latency;
2120 tab << i->second.os_perf_stat.os_apply_latency;
2121 tab << TextTable::endrow;
2122 }
2123 (*ss) << tab;
2124 }
2125
2126 void PGMap::dump_osd_blocked_by_stats(Formatter *f) const
2127 {
2128 f->open_array_section("osd_blocked_by_infos");
2129 for (auto i = blocked_by_sum.begin();
2130 i != blocked_by_sum.end();
2131 ++i) {
2132 f->open_object_section("osd");
2133 f->dump_int("id", i->first);
2134 f->dump_int("num_blocked", i->second);
2135 f->close_section();
2136 }
2137 f->close_section();
2138 }
2139 void PGMap::print_osd_blocked_by_stats(std::ostream *ss) const
2140 {
2141 TextTable tab;
2142 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2143 tab.define_column("num_blocked", TextTable::LEFT, TextTable::RIGHT);
2144 for (auto i = blocked_by_sum.begin();
2145 i != blocked_by_sum.end();
2146 ++i) {
2147 tab << i->first;
2148 tab << i->second;
2149 tab << TextTable::endrow;
2150 }
2151 (*ss) << tab;
2152 }
2153
2154
2155 /**
2156 * update aggregated delta
2157 *
2158 * @param cct ceph context
2159 * @param ts Timestamp for the stats being delta'ed
2160 * @param old_pool_sum Previous stats sum
2161 * @param last_ts Last timestamp for pool
2162 * @param result_pool_sum Resulting stats
2163 * @param result_pool_delta Resulting pool delta
2164 * @param result_ts_delta Resulting timestamp delta
2165 * @param delta_avg_list List of last N computed deltas, used to average
2166 */
2167 void PGMap::update_delta(
2168 CephContext *cct,
2169 const utime_t ts,
2170 const pool_stat_t& old_pool_sum,
2171 utime_t *last_ts,
2172 const pool_stat_t& current_pool_sum,
2173 pool_stat_t *result_pool_delta,
2174 utime_t *result_ts_delta,
2175 mempool::pgmap::list<pair<pool_stat_t,utime_t> > *delta_avg_list)
2176 {
2177 /* @p ts is the timestamp we want to associate with the data
2178 * in @p old_pool_sum, and on which we will base ourselves to
2179 * calculate the delta, stored in 'delta_t'.
2180 */
2181 utime_t delta_t;
2182 delta_t = ts; // start with the provided timestamp
2183 delta_t -= *last_ts; // take the last timestamp we saw
2184 *last_ts = ts; // @p ts becomes the last timestamp we saw
2185
2186 // adjust delta_t, quick start if there is no update in a long period
2187 delta_t = std::min(delta_t,
2188 utime_t(2 * (cct ? cct->_conf->mon_delta_reset_interval : 10), 0));
2189
2190 // calculate a delta, and average over the last 6 deltas by default.
2191 /* start by taking a copy of our current @p result_pool_sum, and by
2192 * taking out the stats from @p old_pool_sum. This generates a stats
2193 * delta. Stash this stats delta in @p delta_avg_list, along with the
2194 * timestamp delta for these results.
2195 */
2196 pool_stat_t d = current_pool_sum;
2197 d.stats.sub(old_pool_sum.stats);
2198 delta_avg_list->push_back(make_pair(d,delta_t));
2199 *result_ts_delta += delta_t;
2200
2201 /* Aggregate current delta, and take out the last seen delta (if any) to
2202 * average it out.
2203 */
2204 result_pool_delta->stats.add(d.stats);
2205 size_t s = MAX(1, cct ? cct->_conf->mon_stat_smooth_intervals : 1);
2206 if (delta_avg_list->size() > s) {
2207 result_pool_delta->stats.sub(delta_avg_list->front().first.stats);
2208 *result_ts_delta -= delta_avg_list->front().second;
2209 delta_avg_list->pop_front();
2210 }
2211 }
2212
2213 /**
2214 * update aggregated delta
2215 *
2216 * @param cct ceph context
2217 * @param ts Timestamp
2218 * @param pg_sum_old Old pg_sum
2219 */
2220 void PGMap::update_global_delta(CephContext *cct,
2221 const utime_t ts, const pool_stat_t& pg_sum_old)
2222 {
2223 update_delta(cct, ts, pg_sum_old, &stamp, pg_sum, &pg_sum_delta,
2224 &stamp_delta, &pg_sum_deltas);
2225 }
2226
2227 /**
2228 * Update a given pool's deltas
2229 *
2230 * @param cct Ceph Context
2231 * @param ts Timestamp for the stats being delta'ed
2232 * @param pool Pool's id
2233 * @param old_pool_sum Previous stats sum
2234 */
2235 void PGMap::update_one_pool_delta(
2236 CephContext *cct,
2237 const utime_t ts,
2238 const uint64_t pool,
2239 const pool_stat_t& old_pool_sum)
2240 {
2241 if (per_pool_sum_deltas.count(pool) == 0) {
2242 assert(per_pool_sum_deltas_stamps.count(pool) == 0);
2243 assert(per_pool_sum_delta.count(pool) == 0);
2244 }
2245
2246 auto& sum_delta = per_pool_sum_delta[pool];
2247
2248 update_delta(cct, ts, old_pool_sum, &sum_delta.second, pg_pool_sum[pool],
2249 &sum_delta.first, &per_pool_sum_deltas_stamps[pool],
2250 &per_pool_sum_deltas[pool]);
2251 }
2252
2253 /**
2254 * Update pools' deltas
2255 *
2256 * @param cct CephContext
2257 * @param ts Timestamp for the stats being delta'ed
2258 * @param pg_pool_sum_old Map of pool stats for delta calcs.
2259 */
2260 void PGMap::update_pool_deltas(
2261 CephContext *cct, const utime_t ts,
2262 const mempool::pgmap::unordered_map<uint64_t,pool_stat_t>& pg_pool_sum_old)
2263 {
2264 for (auto it = pg_pool_sum_old.begin();
2265 it != pg_pool_sum_old.end(); ++it) {
2266 update_one_pool_delta(cct, ts, it->first, it->second);
2267 }
2268 }
2269
2270 void PGMap::clear_delta()
2271 {
2272 pg_sum_delta = pool_stat_t();
2273 pg_sum_deltas.clear();
2274 stamp_delta = utime_t();
2275 }
2276
2277 void PGMap::generate_test_instances(list<PGMap*>& o)
2278 {
2279 o.push_back(new PGMap);
2280 list<Incremental*> inc;
2281 Incremental::generate_test_instances(inc);
2282 delete inc.front();
2283 inc.pop_front();
2284 while (!inc.empty()) {
2285 PGMap *pmp = new PGMap();
2286 *pmp = *o.back();
2287 o.push_back(pmp);
2288 o.back()->apply_incremental(NULL, *inc.front());
2289 delete inc.front();
2290 inc.pop_front();
2291 }
2292 }
2293
2294 void PGMap::get_filtered_pg_stats(uint32_t state, int64_t poolid, int64_t osdid,
2295 bool primary, set<pg_t>& pgs) const
2296 {
2297 for (auto i = pg_stat.begin();
2298 i != pg_stat.end();
2299 ++i) {
2300 if ((poolid >= 0) && (uint64_t(poolid) != i->first.pool()))
2301 continue;
2302 if ((osdid >= 0) && !(i->second.is_acting_osd(osdid,primary)))
2303 continue;
2304 if (!(i->second.state & state))
2305 continue;
2306 pgs.insert(i->first);
2307 }
2308 }
2309
2310 void PGMap::dump_filtered_pg_stats(Formatter *f, set<pg_t>& pgs) const
2311 {
2312 f->open_array_section("pg_stats");
2313 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
2314 const pg_stat_t& st = pg_stat.at(*i);
2315 f->open_object_section("pg_stat");
2316 f->dump_stream("pgid") << *i;
2317 st.dump(f);
2318 f->close_section();
2319 }
2320 f->close_section();
2321 }
2322
2323 void PGMap::dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const
2324 {
2325 TextTable tab;
2326
2327 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
2328 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
2329 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
2330 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
2331 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
2332 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
2333 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
2334 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
2335 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
2336 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
2337 tab.define_column("STATE_STAMP", TextTable::LEFT, TextTable::RIGHT);
2338 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
2339 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
2340 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
2341 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
2342 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
2343 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
2344 tab.define_column("LAST_SCRUB", TextTable::LEFT, TextTable::RIGHT);
2345 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
2346 tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT);
2347 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
2348
2349 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
2350 const pg_stat_t& st = pg_stat.at(*i);
2351
2352 ostringstream reported;
2353 reported << st.reported_epoch << ":" << st.reported_seq;
2354
2355 tab << *i
2356 << st.stats.sum.num_objects
2357 << st.stats.sum.num_objects_missing_on_primary
2358 << st.stats.sum.num_objects_degraded
2359 << st.stats.sum.num_objects_misplaced
2360 << st.stats.sum.num_objects_unfound
2361 << st.stats.sum.num_bytes
2362 << st.log_size
2363 << st.ondisk_log_size
2364 << pg_state_string(st.state)
2365 << st.last_change
2366 << st.version
2367 << reported.str()
2368 << st.up
2369 << st.up_primary
2370 << st.acting
2371 << st.acting_primary
2372 << st.last_scrub
2373 << st.last_scrub_stamp
2374 << st.last_deep_scrub
2375 << st.last_deep_scrub_stamp
2376 << TextTable::endrow;
2377 }
2378
2379 ss << tab;
2380 }
2381
2382
2383
2384 // Only called with a single bit set in "what"
2385 static void note_stuck_detail(
2386 int what,
2387 mempool::pgmap::unordered_map<pg_t,pg_stat_t>& stuck_pgs,
2388 int max_detail,
2389 list<pair<health_status_t,string> > *detail)
2390 {
2391 int n = 0;
2392 for (auto p = stuck_pgs.begin();
2393 p != stuck_pgs.end();
2394 ++p) {
2395 ostringstream ss;
2396 utime_t since;
2397 const char *whatname = 0;
2398 switch (what) {
2399 case PGMap::STUCK_INACTIVE:
2400 since = p->second.last_active;
2401 whatname = "inactive";
2402 break;
2403 case PGMap::STUCK_UNCLEAN:
2404 since = p->second.last_clean;
2405 whatname = "unclean";
2406 break;
2407 case PGMap::STUCK_DEGRADED:
2408 since = p->second.last_undegraded;
2409 whatname = "degraded";
2410 break;
2411 case PGMap::STUCK_UNDERSIZED:
2412 since = p->second.last_fullsized;
2413 whatname = "undersized";
2414 break;
2415 case PGMap::STUCK_STALE:
2416 since = p->second.last_unstale;
2417 whatname = "stale";
2418 break;
2419 default:
2420 ceph_abort();
2421 }
2422 if (--max_detail == 0) {
2423 ostringstream ss;
2424 ss << (stuck_pgs.size() - n) << " more pgs are also stuck " << whatname;
2425 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
2426 break;
2427 }
2428 ++n;
2429 ss << "pg " << p->first << " is stuck " << whatname;
2430 if (since == utime_t()) {
2431 ss << " since forever";
2432 } else {
2433 utime_t dur = ceph_clock_now() - since;
2434 ss << " for " << dur;
2435 }
2436 ss << ", current state " << pg_state_string(p->second.state)
2437 << ", last acting " << p->second.acting;
2438 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
2439 }
2440 }
2441
2442 static pair<int,int> _warn_slow_request_histogram(
2443 CephContext *cct,
2444 const pow2_hist_t& h,
2445 string suffix,
2446 list<pair<health_status_t,string> >& summary,
2447 list<pair<health_status_t,string> > *detail)
2448 {
2449 if (h.h.empty())
2450 return make_pair(0, 0);
2451
2452 unsigned warn = 0, error = 0;
2453 float err_age =
2454 cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio;
2455 for (unsigned i = h.h.size() - 1; i > 0; --i) {
2456 float ub = (float)(1 << i) / 1000.0;
2457 if (ub < cct->_conf->mon_osd_warn_op_age)
2458 break;
2459 if (h.h[i]) {
2460 auto sev = HEALTH_WARN;
2461 if (ub > err_age) {
2462 sev = HEALTH_ERR;
2463 error += h.h[i];
2464 } else {
2465 warn += h.h[i];
2466 }
2467 if (detail) {
2468 ostringstream ss;
2469 ss << h.h[i] << " ops are blocked > " << ub << " sec" << suffix;
2470 detail->push_back(make_pair(sev, ss.str()));
2471 }
2472 }
2473 }
2474 return make_pair(warn, error);
2475 }
2476
2477 namespace {
2478 enum class scrubbed_or_deepscrubbed_t { SCRUBBED, DEEPSCRUBBED };
2479
2480 void print_unscrubbed_detailed(
2481 const std::pair<const pg_t,pg_stat_t> &pg_entry,
2482 list<pair<health_status_t,string> > *detail,
2483 scrubbed_or_deepscrubbed_t how_scrubbed)
2484 {
2485 std::stringstream ss;
2486 const auto& pg_stat(pg_entry.second);
2487
2488 ss << "pg " << pg_entry.first << " is not ";
2489 if (how_scrubbed == scrubbed_or_deepscrubbed_t::SCRUBBED) {
2490 ss << "scrubbed, last_scrub_stamp "
2491 << pg_stat.last_scrub_stamp;
2492 } else if (how_scrubbed == scrubbed_or_deepscrubbed_t::DEEPSCRUBBED) {
2493 ss << "deep-scrubbed, last_deep_scrub_stamp "
2494 << pg_stat.last_deep_scrub_stamp;
2495 }
2496
2497 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
2498 }
2499
2500 using pg_stat_map_t = const mempool::pgmap::unordered_map<pg_t,pg_stat_t>;
2501
2502 void print_unscrubbed_pgs(
2503 pg_stat_map_t& pg_stats,
2504 list<pair<health_status_t,string> > &summary,
2505 list<pair<health_status_t,string> > *detail,
2506 const CephContext* cct)
2507 {
2508 if (cct->_conf->mon_warn_not_scrubbed == 0 &&
2509 cct->_conf->mon_warn_not_deep_scrubbed == 0)
2510 return;
2511
2512 int pgs_count = 0;
2513 const utime_t now = ceph_clock_now();
2514 for (const auto& pg_entry : pg_stats) {
2515 const auto& pg_stat(pg_entry.second);
2516 const utime_t time_since_ls = now - pg_stat.last_scrub_stamp;
2517 const utime_t time_since_lds = now - pg_stat.last_deep_scrub_stamp;
2518
2519 const int mon_warn_not_scrubbed =
2520 cct->_conf->mon_warn_not_scrubbed + cct->_conf->mon_scrub_interval;
2521
2522 const int mon_warn_not_deep_scrubbed =
2523 cct->_conf->mon_warn_not_deep_scrubbed + cct->_conf->osd_deep_scrub_interval;
2524
2525 bool not_scrubbed = (time_since_ls >= mon_warn_not_scrubbed &&
2526 cct->_conf->mon_warn_not_scrubbed != 0);
2527
2528 bool not_deep_scrubbed = (time_since_lds >= mon_warn_not_deep_scrubbed &&
2529 cct->_conf->mon_warn_not_deep_scrubbed != 0);
2530
2531 if (detail != nullptr) {
2532 if (not_scrubbed) {
2533 print_unscrubbed_detailed(pg_entry,
2534 detail,
2535 scrubbed_or_deepscrubbed_t::SCRUBBED);
2536 }
2537 if (not_deep_scrubbed) {
2538 print_unscrubbed_detailed(pg_entry,
2539 detail,
2540 scrubbed_or_deepscrubbed_t::DEEPSCRUBBED);
2541 }
2542 }
2543 if (not_scrubbed || not_deep_scrubbed) {
2544 ++pgs_count;
2545 }
2546 }
2547
2548 if (pgs_count > 0) {
2549 std::stringstream ss;
2550 ss << pgs_count << " unscrubbed pgs";
2551 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
2552 }
2553 }
2554 }
2555
2556 void PGMap::get_health_checks(
2557 CephContext *cct,
2558 const OSDMap& osdmap,
2559 health_check_map_t *checks) const
2560 {
2561 utime_t now = ceph_clock_now();
2562 const unsigned max = cct->_conf->mon_health_max_detail;
2563 const auto& pools = osdmap.get_pools();
2564
2565 checks->clear();
2566
2567 typedef enum pg_consequence_t {
2568 UNAVAILABLE = 1, // Client IO to the pool may block
2569 DEGRADED = 2, // Fewer than the requested number of replicas are present
2570 DEGRADED_FULL = 3, // Fewer than the request number of replicas may be present
2571 // and insufficiet resources are present to fix this
2572 DAMAGED = 4 // The data may be missing or inconsistent on disk and
2573 // requires repair
2574 } pg_consequence_t;
2575
2576 // For a given PG state, how should it be reported at the pool level?
2577 class PgStateResponse {
2578 public:
2579 pg_consequence_t consequence;
2580 typedef std::function< utime_t(const pg_stat_t&) > stuck_cb;
2581 stuck_cb stuck_since;
2582 bool invert;
2583
2584 PgStateResponse(const pg_consequence_t &c, stuck_cb s)
2585 : consequence(c), stuck_since(s), invert(false)
2586 {
2587 }
2588
2589 PgStateResponse(const pg_consequence_t &c, stuck_cb s, bool i)
2590 : consequence(c), stuck_since(s), invert(i)
2591 {
2592 }
2593 };
2594
2595 // Record the PG state counts that contributed to a reported pool state
2596 class PgCauses {
2597 public:
2598 // Map of PG_STATE_* to number of pgs in that state.
2599 std::map<unsigned, unsigned> states;
2600
2601 // List of all PG IDs that had a state contributing
2602 // to this health condition.
2603 std::set<pg_t> pgs;
2604
2605 std::map<pg_t, std::string> pg_messages;
2606 };
2607
2608 // Map of PG state to how to respond to it
2609 std::map<unsigned, PgStateResponse> state_to_response = {
2610 // Immediate reports
2611 { PG_STATE_INCONSISTENT, {DAMAGED, {}} },
2612 { PG_STATE_INCOMPLETE, {DEGRADED, {}} },
2613 { PG_STATE_REPAIR, {DAMAGED, {}} },
2614 { PG_STATE_SNAPTRIM_ERROR, {DAMAGED, {}} },
2615 { PG_STATE_BACKFILL_TOOFULL, {DEGRADED, {}} },
2616 { PG_STATE_RECOVERY_TOOFULL, {DEGRADED, {}} },
2617 { PG_STATE_DEGRADED, {DEGRADED, {}} },
2618 { PG_STATE_DOWN, {UNAVAILABLE, {}} },
2619 // Delayed (wait until stuck) reports
2620 { PG_STATE_PEERING, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_peered;} } },
2621 { PG_STATE_UNDERSIZED, {DEGRADED, [](const pg_stat_t &p){return p.last_fullsized;} } },
2622 { PG_STATE_STALE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_unstale;} } },
2623 // Delayed and inverted reports
2624 { PG_STATE_ACTIVE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_active;}, true} },
2625 { PG_STATE_CLEAN, {DEGRADED, [](const pg_stat_t &p){return p.last_clean;}, true} }
2626 };
2627
2628 // Specialized state printer that takes account of inversion of
2629 // ACTIVE, CLEAN checks.
2630 auto state_name = [](const uint32_t &state) {
2631 // Special cases for the states that are inverted checks
2632 if (state == PG_STATE_CLEAN) {
2633 return std::string("unclean");
2634 } else if (state == PG_STATE_ACTIVE) {
2635 return std::string("inactive");
2636 } else {
2637 return pg_state_string(state);
2638 }
2639 };
2640
2641 // Map of what is wrong to information about why, implicitly also stores
2642 // the list of what is wrong.
2643 std::map<pg_consequence_t, PgCauses> detected;
2644
2645 // Optimisation: trim down the number of checks to apply based on
2646 // the summary counters
2647 std::map<unsigned, PgStateResponse> possible_responses;
2648 for (const auto &i : num_pg_by_state) {
2649 for (const auto &j : state_to_response) {
2650 if (!j.second.invert) {
2651 // Check for normal tests by seeing if any pgs have the flag
2652 if (i.first & j.first) {
2653 possible_responses.insert(j);
2654 }
2655 }
2656 }
2657 }
2658
2659 for (const auto &j : state_to_response) {
2660 if (j.second.invert) {
2661 // Check for inverted tests by seeing if not-all pgs have the flag
2662 const auto &found = num_pg_by_state.find(j.first);
2663 if (found == num_pg_by_state.end() || found->second != num_pg) {
2664 possible_responses.insert(j);
2665 }
2666 }
2667 }
2668
2669 utime_t cutoff = now - utime_t(cct->_conf->mon_pg_stuck_threshold, 0);
2670 // Loop over all PGs, if there are any possibly-unhealthy states in there
2671 if (!possible_responses.empty()) {
2672 for (const auto& i : pg_stat) {
2673 const auto &pg_id = i.first;
2674 const auto &pg_info = i.second;
2675
2676 for (const auto &j : state_to_response) {
2677 const auto &pg_response_state = j.first;
2678 const auto &pg_response = j.second;
2679
2680 // Apply the state test
2681 if (!(bool(pg_info.state & pg_response_state) != pg_response.invert)) {
2682 continue;
2683 }
2684
2685 // Apply stuckness test if needed
2686 if (pg_response.stuck_since) {
2687 // Delayed response, check for stuckness
2688 utime_t last_whatever = pg_response.stuck_since(pg_info);
2689 if (last_whatever >= cutoff) {
2690 // Not stuck enough, ignore.
2691 continue;
2692 } else {
2693
2694 }
2695 }
2696
2697 auto &causes = detected[pg_response.consequence];
2698 causes.states[pg_response_state]++;
2699 causes.pgs.insert(pg_id);
2700
2701 // Don't bother composing detail string if we have already recorded
2702 // too many
2703 if (causes.pg_messages.size() > max) {
2704 continue;
2705 }
2706
2707 std::ostringstream ss;
2708 if (pg_response.stuck_since) {
2709 utime_t since = pg_response.stuck_since(pg_info);
2710 ss << "pg " << pg_id << " is stuck " << state_name(pg_response_state);
2711 if (since == utime_t()) {
2712 ss << " since forever";
2713 } else {
2714 utime_t dur = now - since;
2715 ss << " for " << dur;
2716 }
2717 ss << ", current state " << pg_state_string(pg_info.state)
2718 << ", last acting " << pg_info.acting;
2719 } else {
2720 ss << "pg " << pg_id << " is "
2721 << pg_state_string(pg_info.state);
2722 ss << ", acting " << pg_info.acting;
2723 if (pg_info.stats.sum.num_objects_unfound) {
2724 ss << ", " << pg_info.stats.sum.num_objects_unfound
2725 << " unfound";
2726 }
2727 }
2728
2729 if (pg_info.state & PG_STATE_INCOMPLETE) {
2730 const pg_pool_t *pi = osdmap.get_pg_pool(pg_id.pool());
2731 if (pi && pi->min_size > 1) {
2732 ss << " (reducing pool "
2733 << osdmap.get_pool_name(pg_id.pool())
2734 << " min_size from " << (int)pi->min_size
2735 << " may help; search ceph.com/docs for 'incomplete')";
2736 }
2737 }
2738
2739 causes.pg_messages[pg_id] = ss.str();
2740 }
2741 }
2742 } else {
2743 dout(10) << __func__ << " skipping loop over PGs: counters look OK" << dendl;
2744 }
2745
2746 for (const auto &i : detected) {
2747 std::string health_code;
2748 health_status_t sev;
2749 std::string summary;
2750 switch(i.first) {
2751 case UNAVAILABLE:
2752 health_code = "PG_AVAILABILITY";
2753 sev = HEALTH_WARN;
2754 summary = "Reduced data availability: ";
2755 break;
2756 case DEGRADED:
2757 health_code = "PG_DEGRADED";
2758 summary = "Degraded data redundancy: ";
2759 sev = HEALTH_WARN;
2760 break;
2761 case DEGRADED_FULL:
2762 health_code = "PG_DEGRADED_FULL";
2763 summary = "Degraded data redundancy (low space): ";
2764 sev = HEALTH_ERR;
2765 break;
2766 case DAMAGED:
2767 health_code = "PG_DAMAGED";
2768 summary = "Possible data damage: ";
2769 sev = HEALTH_ERR;
2770 break;
2771 default:
2772 assert(false);
2773 }
2774
2775 if (i.first == DEGRADED) {
2776 if (pg_sum.stats.sum.num_objects_degraded &&
2777 pg_sum.stats.sum.num_object_copies > 0) {
2778 double pc = (double)pg_sum.stats.sum.num_objects_degraded /
2779 (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
2780 char b[20];
2781 snprintf(b, sizeof(b), "%.3lf", pc);
2782 ostringstream ss;
2783 ss << pg_sum.stats.sum.num_objects_degraded
2784 << "/" << pg_sum.stats.sum.num_object_copies << " objects degraded ("
2785 << b << "%)";
2786
2787 // Throw in a comma for the benefit of the following PG counts
2788 summary += ss.str() + ", ";
2789 }
2790 }
2791
2792 // Compose summary message saying how many PGs in what states led
2793 // to this health check failing
2794 std::vector<std::string> pg_msgs;
2795 for (const auto &j : i.second.states) {
2796 std::ostringstream msg;
2797 msg << j.second << (j.second > 1 ? " pgs " : " pg ") << state_name(j.first);
2798 pg_msgs.push_back(msg.str());
2799 }
2800 summary += joinify(pg_msgs.begin(), pg_msgs.end(), std::string(", "));
2801
2802
2803
2804 health_check_t *check = &checks->add(
2805 health_code,
2806 sev,
2807 summary);
2808
2809 // Compose list of PGs contributing to this health check failing
2810 for (const auto &j : i.second.pg_messages) {
2811 check->detail.push_back(j.second);
2812 }
2813 }
2814
2815 // OSD_SKEWED_USAGE
2816 if (cct->_conf->mon_warn_osd_usage_min_max_delta) {
2817 int max_osd = -1, min_osd = -1;
2818 float max_osd_usage = 0.0, min_osd_usage = 1.0;
2819 for (auto p = osd_stat.begin(); p != osd_stat.end(); ++p) {
2820 // kb should never be 0, but avoid divide by zero in case of corruption
2821 if (p->second.kb <= 0)
2822 continue;
2823 float usage = ((float)p->second.kb_used) / ((float)p->second.kb);
2824 if (usage > max_osd_usage) {
2825 max_osd_usage = usage;
2826 max_osd = p->first;
2827 }
2828 if (usage < min_osd_usage) {
2829 min_osd_usage = usage;
2830 min_osd = p->first;
2831 }
2832 }
2833 float diff = max_osd_usage - min_osd_usage;
2834 if (diff > cct->_conf->mon_warn_osd_usage_min_max_delta) {
2835 auto& d = checks->add("OSD_SKEWED_USAGE", HEALTH_WARN,
2836 "skewed osd utilization");
2837 ostringstream ss;
2838 ss << "difference between min (osd." << min_osd << " at "
2839 << roundf(min_osd_usage*1000.0)/100.0
2840 << "%) and max (osd." << max_osd << " at "
2841 << roundf(max_osd_usage*1000.0)/100.0
2842 << "%) osd usage " << roundf(diff*1000.0)/100.0 << "% > "
2843 << roundf(cct->_conf->mon_warn_osd_usage_min_max_delta*1000.0)/100.0
2844 << " (mon_warn_osd_usage_min_max_delta)";
2845 d.detail.push_back(ss.str());
2846 }
2847 }
2848
2849 // OSD_SCRUB_ERRORS
2850 if (pg_sum.stats.sum.num_scrub_errors) {
2851 ostringstream ss;
2852 ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors";
2853 checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str());
2854 }
2855
2856 // CACHE_POOL_NEAR_FULL
2857 {
2858 list<string> detail;
2859 unsigned num_pools = 0;
2860 for (auto& p : pools) {
2861 if ((!p.second.target_max_objects && !p.second.target_max_bytes) ||
2862 !pg_pool_sum.count(p.first)) {
2863 continue;
2864 }
2865 bool nearfull = false;
2866 const string& name = osdmap.get_pool_name(p.first);
2867 const pool_stat_t& st = get_pg_pool_sum_stat(p.first);
2868 uint64_t ratio = p.second.cache_target_full_ratio_micro +
2869 ((1000000 - p.second.cache_target_full_ratio_micro) *
2870 cct->_conf->mon_cache_target_full_warn_ratio);
2871 if (p.second.target_max_objects &&
2872 (uint64_t)(st.stats.sum.num_objects -
2873 st.stats.sum.num_objects_hit_set_archive) >
2874 p.second.target_max_objects * (ratio / 1000000.0)) {
2875 ostringstream ss;
2876 ss << "cache pool '" << name << "' with "
2877 << si_t(st.stats.sum.num_objects)
2878 << " objects at/near target max "
2879 << si_t(p.second.target_max_objects) << " objects";
2880 detail.push_back(ss.str());
2881 nearfull = true;
2882 }
2883 if (p.second.target_max_bytes &&
2884 (uint64_t)(st.stats.sum.num_bytes -
2885 st.stats.sum.num_bytes_hit_set_archive) >
2886 p.second.target_max_bytes * (ratio / 1000000.0)) {
2887 ostringstream ss;
2888 ss << "cache pool '" << name
2889 << "' with " << si_t(st.stats.sum.num_bytes)
2890 << "B at/near target max "
2891 << si_t(p.second.target_max_bytes) << "B";
2892 detail.push_back(ss.str());
2893 nearfull = true;
2894 }
2895 if (nearfull) {
2896 ++num_pools;
2897 }
2898 }
2899 if (!detail.empty()) {
2900 ostringstream ss;
2901 ss << num_pools << " cache pools at or near target size";
2902 auto& d = checks->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN, ss.str());
2903 d.detail.swap(detail);
2904 }
2905 }
2906
2907 // TOO_FEW_PGS
2908 int num_in = osdmap.get_num_in_osds();
2909 int sum_pg_up = MAX(pg_sum.up, static_cast<int32_t>(pg_stat.size()));
2910 if (num_in &&
2911 cct->_conf->mon_pg_warn_min_per_osd > 0 &&
2912 osdmap.get_pools().size() > 0) {
2913 int per = sum_pg_up / num_in;
2914 if (per < cct->_conf->mon_pg_warn_min_per_osd && per) {
2915 ostringstream ss;
2916 ss << "too few PGs per OSD (" << per
2917 << " < min " << cct->_conf->mon_pg_warn_min_per_osd << ")";
2918 checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str());
2919 }
2920 }
2921
2922 // TOO_MANY_PGS
2923 if (num_in && cct->_conf->mon_pg_warn_max_per_osd > 0) {
2924 int per = sum_pg_up / num_in;
2925 if (per > cct->_conf->mon_pg_warn_max_per_osd) {
2926 ostringstream ss;
2927 ss << "too many PGs per OSD (" << per
2928 << " > max " << cct->_conf->mon_pg_warn_max_per_osd << ")";
2929 checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str());
2930 }
2931 }
2932
2933 // SMALLER_PGP_NUM
2934 // MANY_OBJECTS_PER_PG
2935 if (!pg_stat.empty()) {
2936 list<string> pgp_detail, many_detail;
2937 for (auto p = pg_pool_sum.begin();
2938 p != pg_pool_sum.end();
2939 ++p) {
2940 const pg_pool_t *pi = osdmap.get_pg_pool(p->first);
2941 if (!pi)
2942 continue; // in case osdmap changes haven't propagated to PGMap yet
2943 const string& name = osdmap.get_pool_name(p->first);
2944 if (pi->get_pg_num() > pi->get_pgp_num() &&
2945 !(name.find(".DELETED") != string::npos &&
2946 cct->_conf->mon_fake_pool_delete)) {
2947 ostringstream ss;
2948 ss << "pool " << name << " pg_num "
2949 << pi->get_pg_num() << " > pgp_num " << pi->get_pgp_num();
2950 pgp_detail.push_back(ss.str());
2951 }
2952 int average_objects_per_pg = pg_sum.stats.sum.num_objects / pg_stat.size();
2953 if (average_objects_per_pg > 0 &&
2954 pg_sum.stats.sum.num_objects >= cct->_conf->mon_pg_warn_min_objects &&
2955 p->second.stats.sum.num_objects >=
2956 cct->_conf->mon_pg_warn_min_pool_objects) {
2957 int objects_per_pg = p->second.stats.sum.num_objects / pi->get_pg_num();
2958 float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
2959 if (cct->_conf->mon_pg_warn_max_object_skew > 0 &&
2960 ratio > cct->_conf->mon_pg_warn_max_object_skew) {
2961 ostringstream ss;
2962 ss << "pool " << name << " objects per pg ("
2963 << objects_per_pg << ") is more than " << ratio
2964 << " times cluster average ("
2965 << average_objects_per_pg << ")";
2966 many_detail.push_back(ss.str());
2967 }
2968 }
2969 }
2970 if (!pgp_detail.empty()) {
2971 ostringstream ss;
2972 ss << pgp_detail.size() << " pools have pg_num > pgp_num";
2973 auto& d = checks->add("SMALLER_PGP_NUM", HEALTH_WARN, ss.str());
2974 d.detail.swap(pgp_detail);
2975 }
2976 if (!many_detail.empty()) {
2977 ostringstream ss;
2978 ss << many_detail.size() << " pools have many more objects per pg than"
2979 << " average";
2980 auto& d = checks->add("MANY_OBJECTS_PER_PG", HEALTH_WARN, ss.str());
2981 d.detail.swap(many_detail);
2982 }
2983 }
2984
2985 // POOL_FULL
2986 // POOL_NEAR_FULL
2987 {
2988 float warn_threshold = (float)g_conf->mon_pool_quota_warn_threshold/100;
2989 float crit_threshold = (float)g_conf->mon_pool_quota_crit_threshold/100;
2990 list<string> full_detail, nearfull_detail;
2991 unsigned full_pools = 0, nearfull_pools = 0;
2992 for (auto it : pools) {
2993 auto it2 = pg_pool_sum.find(it.first);
2994 if (it2 == pg_pool_sum.end()) {
2995 continue;
2996 }
2997 const pool_stat_t *pstat = &it2->second;
2998 const object_stat_sum_t& sum = pstat->stats.sum;
2999 const string& pool_name = osdmap.get_pool_name(it.first);
3000 const pg_pool_t &pool = it.second;
3001 bool full = false, nearfull = false;
3002 if (pool.quota_max_objects > 0) {
3003 stringstream ss;
3004 if ((uint64_t)sum.num_objects >= pool.quota_max_objects) {
3005 } else if (crit_threshold > 0 &&
3006 sum.num_objects >= pool.quota_max_objects*crit_threshold) {
3007 ss << "pool '" << pool_name
3008 << "' has " << sum.num_objects << " objects"
3009 << " (max " << pool.quota_max_objects << ")";
3010 full_detail.push_back(ss.str());
3011 full = true;
3012 } else if (warn_threshold > 0 &&
3013 sum.num_objects >= pool.quota_max_objects*warn_threshold) {
3014 ss << "pool '" << pool_name
3015 << "' has " << sum.num_objects << " objects"
3016 << " (max " << pool.quota_max_objects << ")";
3017 nearfull_detail.push_back(ss.str());
3018 nearfull = true;
3019 }
3020 }
3021 if (pool.quota_max_bytes > 0) {
3022 stringstream ss;
3023 if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
3024 } else if (crit_threshold > 0 &&
3025 sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
3026 ss << "pool '" << pool_name
3027 << "' has " << si_t(sum.num_bytes) << " bytes"
3028 << " (max " << si_t(pool.quota_max_bytes) << ")";
3029 full_detail.push_back(ss.str());
3030 full = true;
3031 } else if (warn_threshold > 0 &&
3032 sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
3033 ss << "pool '" << pool_name
3034 << "' has " << si_t(sum.num_bytes) << " bytes"
3035 << " (max " << si_t(pool.quota_max_bytes) << ")";
3036 nearfull_detail.push_back(ss.str());
3037 nearfull = true;
3038 }
3039 }
3040 if (full) {
3041 ++full_pools;
3042 }
3043 if (nearfull) {
3044 ++nearfull_pools;
3045 }
3046 }
3047 if (full_pools) {
3048 ostringstream ss;
3049 ss << full_pools << " pools full";
3050 auto& d = checks->add("POOL_FULL", HEALTH_ERR, ss.str());
3051 d.detail.swap(full_detail);
3052 }
3053 if (nearfull_pools) {
3054 ostringstream ss;
3055 ss << nearfull_pools << " pools full";
3056 auto& d = checks->add("POOL_NEAR_FULL", HEALTH_WARN, ss.str());
3057 d.detail.swap(nearfull_detail);
3058 }
3059 }
3060
3061 // OBJECT_MISPLACED
3062 if (pg_sum.stats.sum.num_objects_misplaced &&
3063 pg_sum.stats.sum.num_object_copies > 0) {
3064 double pc = (double)pg_sum.stats.sum.num_objects_misplaced /
3065 (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
3066 char b[20];
3067 snprintf(b, sizeof(b), "%.3lf", pc);
3068 ostringstream ss;
3069 ss << pg_sum.stats.sum.num_objects_misplaced
3070 << "/" << pg_sum.stats.sum.num_object_copies << " objects misplaced ("
3071 << b << "%)";
3072 checks->add("OBJECT_MISPLACED", HEALTH_WARN, ss.str());
3073 }
3074
3075 // OBJECT_UNFOUND
3076 if (pg_sum.stats.sum.num_objects_unfound &&
3077 pg_sum.stats.sum.num_objects) {
3078 double pc = (double)pg_sum.stats.sum.num_objects_unfound /
3079 (double)pg_sum.stats.sum.num_objects * (double)100.0;
3080 char b[20];
3081 snprintf(b, sizeof(b), "%.3lf", pc);
3082 ostringstream ss;
3083 ss << pg_sum.stats.sum.num_objects_unfound
3084 << "/" << pg_sum.stats.sum.num_objects << " unfound (" << b << "%)";
3085 checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str());
3086 }
3087
3088 // REQUEST_SLOW
3089 // REQUEST_STUCK
3090 if (cct->_conf->mon_osd_warn_op_age > 0 &&
3091 osd_sum.op_queue_age_hist.upper_bound() >
3092 cct->_conf->mon_osd_warn_op_age) {
3093 list<string> warn_detail, error_detail;
3094 unsigned warn = 0, error = 0;
3095 float err_age =
3096 cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio;
3097 const pow2_hist_t& h = osd_sum.op_queue_age_hist;
3098 for (unsigned i = h.h.size() - 1; i > 0; --i) {
3099 float ub = (float)(1 << i) / 1000.0;
3100 if (ub < cct->_conf->mon_osd_warn_op_age)
3101 break;
3102 if (h.h[i]) {
3103 ostringstream ss;
3104 ss << h.h[i] << " ops are blocked > " << ub << " sec";
3105 if (ub > err_age) {
3106 error += h.h[i];
3107 error_detail.push_back(ss.str());
3108 } else {
3109 warn += h.h[i];
3110 warn_detail.push_back(ss.str());
3111 }
3112 }
3113 }
3114
3115 map<float,set<int>> warn_osd_by_max; // max -> osds
3116 map<float,set<int>> error_osd_by_max; // max -> osds
3117 if (!warn_detail.empty() || !error_detail.empty()) {
3118 for (auto& p : osd_stat) {
3119 const pow2_hist_t& h = p.second.op_queue_age_hist;
3120 for (unsigned i = h.h.size() - 1; i > 0; --i) {
3121 float ub = (float)(1 << i) / 1000.0;
3122 if (ub < cct->_conf->mon_osd_warn_op_age)
3123 break;
3124 if (h.h[i]) {
3125 if (ub > err_age) {
3126 error_osd_by_max[ub].insert(p.first);
3127 } else {
3128 warn_osd_by_max[ub].insert(p.first);
3129 }
3130 break;
3131 }
3132 }
3133 }
3134 }
3135
3136 if (!warn_detail.empty()) {
3137 ostringstream ss;
3138 ss << warn << " slow requests are blocked > "
3139 << cct->_conf->mon_osd_warn_op_age << " sec";
3140 auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str());
3141 d.detail.swap(warn_detail);
3142 int left = max;
3143 for (auto& p : warn_osd_by_max) {
3144 ostringstream ss;
3145 if (p.second.size() > 1) {
3146 ss << "osds " << p.second;
3147 } else {
3148 ss << "osd." << *p.second.begin();
3149 }
3150 ss << " have blocked requests > " << p.first << " sec";
3151 d.detail.push_back(ss.str());
3152 if (--left == 0) {
3153 break;
3154 }
3155 }
3156 }
3157 if (!error_detail.empty()) {
3158 ostringstream ss;
3159 ss << warn << " stuck requests are blocked > "
3160 << err_age << " sec";
3161 auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str());
3162 d.detail.swap(error_detail);
3163 int left = max;
3164 for (auto& p : error_osd_by_max) {
3165 ostringstream ss;
3166 if (p.second.size() > 1) {
3167 ss << "osds " << p.second;
3168 } else {
3169 ss << "osd." << *p.second.begin();
3170 }
3171 ss << " have stuck requests > " << p.first << " sec";
3172 d.detail.push_back(ss.str());
3173 if (--left == 0) {
3174 break;
3175 }
3176 }
3177 }
3178 }
3179
3180 // PG_NOT_SCRUBBED
3181 // PG_NOT_DEEP_SCRUBBED
3182 {
3183 list<string> detail, deep_detail;
3184 const double age = cct->_conf->mon_warn_not_scrubbed +
3185 cct->_conf->mon_scrub_interval;
3186 utime_t cutoff = now;
3187 cutoff -= age;
3188 const double deep_age = cct->_conf->mon_warn_not_deep_scrubbed +
3189 cct->_conf->osd_deep_scrub_interval;
3190 utime_t deep_cutoff = now;
3191 deep_cutoff -= deep_age;
3192 for (auto& p : pg_stat) {
3193 if (p.second.last_scrub_stamp < cutoff) {
3194 ostringstream ss;
3195 ss << "pg " << p.first << " not scrubbed since "
3196 << p.second.last_scrub_stamp;
3197 detail.push_back(ss.str());
3198 }
3199 if (p.second.last_deep_scrub_stamp < deep_cutoff) {
3200 ostringstream ss;
3201 ss << "pg " << p.first << " not deep-scrubbed since "
3202 << p.second.last_deep_scrub_stamp;
3203 deep_detail.push_back(ss.str());
3204 }
3205 }
3206 if (!detail.empty()) {
3207 ostringstream ss;
3208 ss << detail.size() << " pgs not scrubbed for " << age;
3209 auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str());
3210 d.detail.swap(detail);
3211 }
3212 if (!deep_detail.empty()) {
3213 ostringstream ss;
3214 ss << deep_detail.size() << " pgs not deep-scrubbed for " << age;
3215 auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str());
3216 d.detail.swap(deep_detail);
3217 }
3218 }
3219 }
3220
3221 void PGMap::get_health(
3222 CephContext *cct,
3223 const OSDMap& osdmap,
3224 list<pair<health_status_t,string> >& summary,
3225 list<pair<health_status_t,string> > *detail) const
3226 {
3227 map<string,int> note;
3228 auto p = num_pg_by_state.begin();
3229 auto p_end = num_pg_by_state.end();
3230 for (; p != p_end; ++p) {
3231 if (p->first & PG_STATE_STALE)
3232 note["stale"] += p->second;
3233 if (p->first & PG_STATE_DOWN)
3234 note["down"] += p->second;
3235 if (p->first & PG_STATE_UNDERSIZED)
3236 note["undersized"] += p->second;
3237 if (p->first & PG_STATE_DEGRADED)
3238 note["degraded"] += p->second;
3239 if (p->first & PG_STATE_INCONSISTENT)
3240 note["inconsistent"] += p->second;
3241 if (p->first & PG_STATE_PEERING)
3242 note["peering"] += p->second;
3243 if (p->first & PG_STATE_REPAIR)
3244 note["repair"] += p->second;
3245 if (p->first & PG_STATE_RECOVERING)
3246 note["recovering"] += p->second;
3247 if (p->first & PG_STATE_RECOVERY_WAIT)
3248 note["recovery_wait"] += p->second;
3249 if (p->first & PG_STATE_INCOMPLETE)
3250 note["incomplete"] += p->second;
3251 if (p->first & PG_STATE_BACKFILL_WAIT)
3252 note["backfill_wait"] += p->second;
3253 if (p->first & PG_STATE_BACKFILL)
3254 note["backfilling"] += p->second;
3255 if (p->first & PG_STATE_BACKFILL_TOOFULL)
3256 note["backfill_toofull"] += p->second;
3257 if (p->first & PG_STATE_RECOVERY_TOOFULL)
3258 note["recovery_toofull"] += p->second;
3259 if (p->first & PG_STATE_SNAPTRIM_ERROR)
3260 note["snaptrim_error"] += p->second;
3261 }
3262
3263 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pgs;
3264 utime_t now(ceph_clock_now());
3265 utime_t cutoff = now - utime_t(cct->_conf->mon_pg_stuck_threshold, 0);
3266 uint64_t num_inactive_pgs = 0;
3267
3268 if (detail) {
3269 // we need to collect details of stuck pgs, first do a quick check
3270 // whether this will yield any results
3271 if (get_stuck_counts(cutoff, note)) {
3272
3273 // there are stuck pgs. gather details for specified statuses
3274 // only if we know that there are pgs stuck in that status
3275
3276 if (note.find("stuck inactive") != note.end()) {
3277 get_stuck_stats(PGMap::STUCK_INACTIVE, cutoff, stuck_pgs);
3278 note["stuck inactive"] = stuck_pgs.size();
3279 num_inactive_pgs += stuck_pgs.size();
3280 note_stuck_detail(PGMap::STUCK_INACTIVE, stuck_pgs,
3281 cct->_conf->mon_health_max_detail, detail);
3282 stuck_pgs.clear();
3283 }
3284
3285 if (note.find("stuck unclean") != note.end()) {
3286 get_stuck_stats(PGMap::STUCK_UNCLEAN, cutoff, stuck_pgs);
3287 note["stuck unclean"] = stuck_pgs.size();
3288 note_stuck_detail(PGMap::STUCK_UNCLEAN, stuck_pgs,
3289 cct->_conf->mon_health_max_detail, detail);
3290 stuck_pgs.clear();
3291 }
3292
3293 if (note.find("stuck undersized") != note.end()) {
3294 get_stuck_stats(PGMap::STUCK_UNDERSIZED, cutoff, stuck_pgs);
3295 note["stuck undersized"] = stuck_pgs.size();
3296 note_stuck_detail(PGMap::STUCK_UNDERSIZED, stuck_pgs,
3297 cct->_conf->mon_health_max_detail, detail);
3298 stuck_pgs.clear();
3299 }
3300
3301 if (note.find("stuck degraded") != note.end()) {
3302 get_stuck_stats(PGMap::STUCK_DEGRADED, cutoff, stuck_pgs);
3303 note["stuck degraded"] = stuck_pgs.size();
3304 note_stuck_detail(PGMap::STUCK_DEGRADED, stuck_pgs,
3305 cct->_conf->mon_health_max_detail, detail);
3306 stuck_pgs.clear();
3307 }
3308
3309 if (note.find("stuck stale") != note.end()) {
3310 get_stuck_stats(PGMap::STUCK_STALE, cutoff, stuck_pgs);
3311 note["stuck stale"] = stuck_pgs.size();
3312 num_inactive_pgs += stuck_pgs.size();
3313 note_stuck_detail(PGMap::STUCK_STALE, stuck_pgs,
3314 cct->_conf->mon_health_max_detail, detail);
3315 }
3316 }
3317 } else {
3318 get_stuck_counts(cutoff, note);
3319 auto p = note.find("stuck inactive");
3320 if (p != note.end())
3321 num_inactive_pgs += p->second;
3322 p = note.find("stuck stale");
3323 if (p != note.end())
3324 num_inactive_pgs += p->second;
3325 }
3326
3327 if (cct->_conf->mon_pg_min_inactive > 0 &&
3328 num_inactive_pgs >= cct->_conf->mon_pg_min_inactive) {
3329 ostringstream ss;
3330 ss << num_inactive_pgs << " pgs are stuck inactive for more than " << cct->_conf->mon_pg_stuck_threshold << " seconds";
3331 summary.push_back(make_pair(HEALTH_ERR, ss.str()));
3332 }
3333
3334 if (!note.empty()) {
3335 for (auto p = note.begin(); p != note.end(); ++p) {
3336 ostringstream ss;
3337 ss << p->second << " pgs " << p->first;
3338 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3339 }
3340 if (detail) {
3341 int n = 0, more = 0;
3342 int max = cct->_conf->mon_health_max_detail;
3343 for (auto p = pg_stat.begin();
3344 p != pg_stat.end();
3345 ++p) {
3346 if ((p->second.state & (PG_STATE_STALE |
3347 PG_STATE_DOWN |
3348 PG_STATE_UNDERSIZED |
3349 PG_STATE_DEGRADED |
3350 PG_STATE_INCONSISTENT |
3351 PG_STATE_PEERING |
3352 PG_STATE_REPAIR |
3353 PG_STATE_RECOVERING |
3354 PG_STATE_RECOVERY_WAIT |
3355 PG_STATE_RECOVERY_TOOFULL |
3356 PG_STATE_INCOMPLETE |
3357 PG_STATE_BACKFILL_WAIT |
3358 PG_STATE_BACKFILL |
3359 PG_STATE_BACKFILL_TOOFULL)) &&
3360 stuck_pgs.count(p->first) == 0) {
3361 if (max > 0) {
3362 --max;
3363 } else {
3364 ++more;
3365 continue;
3366 }
3367 ++n;
3368 ostringstream ss;
3369 ss << "pg " << p->first << " is " << pg_state_string(p->second.state);
3370 ss << ", acting " << p->second.acting;
3371 if (p->second.stats.sum.num_objects_unfound)
3372 ss << ", " << p->second.stats.sum.num_objects_unfound << " unfound";
3373 if (p->second.state & PG_STATE_INCOMPLETE) {
3374 const pg_pool_t *pi = osdmap.get_pg_pool(p->first.pool());
3375 if (pi && pi->min_size > 1) {
3376 ss << " (reducing pool " << osdmap.get_pool_name(p->first.pool())
3377 << " min_size from " << (int)pi->min_size
3378 << " may help; search ceph.com/docs for 'incomplete')";
3379 }
3380 }
3381 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3382 }
3383 }
3384 if (more) {
3385 ostringstream ss;
3386 ss << more << " more pgs are also unhealthy";
3387 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3388 }
3389 }
3390 }
3391
3392 // slow requests
3393 if (cct->_conf->mon_osd_warn_op_age > 0 &&
3394 osd_sum.op_queue_age_hist.upper_bound() > cct->_conf->mon_osd_warn_op_age) {
3395 auto sum = _warn_slow_request_histogram(
3396 cct, osd_sum.op_queue_age_hist, "", summary, NULL);
3397 if (sum.first > 0 || sum.second > 0) {
3398 if (sum.first > 0) {
3399 ostringstream ss;
3400 ss << sum.first << " requests are blocked > "
3401 << cct->_conf->mon_osd_warn_op_age
3402 << " sec";
3403 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3404 }
3405 if (sum.second > 0) {
3406 ostringstream ss;
3407 ss << sum.first << " requests are blocked > "
3408 << (cct->_conf->mon_osd_warn_op_age *
3409 cct->_conf->mon_osd_err_op_age_ratio)
3410 << " sec";
3411 summary.push_back(make_pair(HEALTH_ERR, ss.str()));
3412 }
3413
3414 if (detail) {
3415 unsigned num_warn = 0, num_err = 0;
3416 // do per-osd warnings
3417 for (auto p = osd_stat.begin();
3418 p != osd_stat.end();
3419 ++p) {
3420 auto sum = _warn_slow_request_histogram(
3421 cct,
3422 p->second.op_queue_age_hist,
3423 string(" on osd.") + stringify(p->first),
3424 summary, detail);
3425 if (sum.second)
3426 ++num_err;
3427 else if (sum.first)
3428 ++num_warn;
3429 }
3430 if (num_err) {
3431 ostringstream ss2;
3432 ss2 << num_err << " osds have very slow requests";
3433 summary.push_back(make_pair(HEALTH_ERR, ss2.str()));
3434 detail->push_back(make_pair(HEALTH_ERR, ss2.str()));
3435 }
3436 if (num_warn) {
3437 ostringstream ss2;
3438 ss2 << num_warn << " osds have slow requests";
3439 summary.push_back(make_pair(HEALTH_WARN, ss2.str()));
3440 detail->push_back(make_pair(HEALTH_WARN, ss2.str()));
3441 }
3442 }
3443 }
3444 }
3445
3446 if (cct->_conf->mon_warn_osd_usage_min_max_delta) {
3447 float max_osd_usage = 0.0, min_osd_usage = 1.0;
3448 for (auto p = osd_stat.begin(); p != osd_stat.end(); ++p) {
3449 // kb should never be 0, but avoid divide by zero in case of corruption
3450 if (p->second.kb <= 0)
3451 continue;
3452 float usage = ((float)p->second.kb_used) / ((float)p->second.kb);
3453 if (usage > max_osd_usage)
3454 max_osd_usage = usage;
3455 if (usage < min_osd_usage)
3456 min_osd_usage = usage;
3457 }
3458 float diff = max_osd_usage - min_osd_usage;
3459 if (diff > cct->_conf->mon_warn_osd_usage_min_max_delta) {
3460 ostringstream ss;
3461 ss << "difference between min (" << roundf(min_osd_usage*1000.0)/10.0
3462 << "%) and max (" << roundf(max_osd_usage*1000.0)/10.0
3463 << "%) osd usage " << roundf(diff*1000.0)/10.0 << "% > "
3464 << roundf(cct->_conf->mon_warn_osd_usage_min_max_delta*1000.0)/10.0
3465 << "% (mon_warn_osd_usage_min_max_delta)";
3466 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3467 if (detail)
3468 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3469 }
3470 }
3471
3472 // recovery
3473 list<string> sl;
3474 overall_recovery_summary(NULL, &sl);
3475 for (auto p = sl.begin(); p != sl.end(); ++p) {
3476 summary.push_back(make_pair(HEALTH_WARN, "recovery " + *p));
3477 if (detail)
3478 detail->push_back(make_pair(HEALTH_WARN, "recovery " + *p));
3479 }
3480
3481 // near-target max pools
3482 auto& pools = osdmap.get_pools();
3483 for (auto p = pools.begin();
3484 p != pools.end(); ++p) {
3485 if ((!p->second.target_max_objects && !p->second.target_max_bytes) ||
3486 !pg_pool_sum.count(p->first))
3487 continue;
3488 bool nearfull = false;
3489 const string& name = osdmap.get_pool_name(p->first);
3490 const pool_stat_t& st = get_pg_pool_sum_stat(p->first);
3491 uint64_t ratio = p->second.cache_target_full_ratio_micro +
3492 ((1000000 - p->second.cache_target_full_ratio_micro) *
3493 cct->_conf->mon_cache_target_full_warn_ratio);
3494 if (p->second.target_max_objects &&
3495 (uint64_t)(st.stats.sum.num_objects -
3496 st.stats.sum.num_objects_hit_set_archive) >
3497 p->second.target_max_objects * (ratio / 1000000.0)) {
3498 nearfull = true;
3499 if (detail) {
3500 ostringstream ss;
3501 ss << "cache pool '" << name << "' with "
3502 << si_t(st.stats.sum.num_objects)
3503 << " objects at/near target max "
3504 << si_t(p->second.target_max_objects) << " objects";
3505 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3506 }
3507 }
3508 if (p->second.target_max_bytes &&
3509 (uint64_t)(st.stats.sum.num_bytes -
3510 st.stats.sum.num_bytes_hit_set_archive) >
3511 p->second.target_max_bytes * (ratio / 1000000.0)) {
3512 nearfull = true;
3513 if (detail) {
3514 ostringstream ss;
3515 ss << "cache pool '" << name
3516 << "' with " << si_t(st.stats.sum.num_bytes)
3517 << "B at/near target max "
3518 << si_t(p->second.target_max_bytes) << "B";
3519 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3520 }
3521 }
3522 if (nearfull) {
3523 ostringstream ss;
3524 ss << "'" << name << "' at/near target max";
3525 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3526 }
3527 }
3528
3529 // scrub
3530 if (pg_sum.stats.sum.num_scrub_errors) {
3531 ostringstream ss;
3532 ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors";
3533 summary.push_back(make_pair(HEALTH_ERR, ss.str()));
3534 if (detail) {
3535 detail->push_back(make_pair(HEALTH_ERR, ss.str()));
3536 }
3537 }
3538
3539 // pg skew
3540 int num_in = osdmap.get_num_in_osds();
3541 int sum_pg_up = MAX(pg_sum.up, static_cast<int32_t>(pg_stat.size()));
3542 int sum_objects = pg_sum.stats.sum.num_objects;
3543 if (sum_objects < cct->_conf->mon_pg_warn_min_objects) {
3544 return;
3545 }
3546 if (num_in && cct->_conf->mon_pg_warn_min_per_osd > 0) {
3547 int per = sum_pg_up / num_in;
3548 if (per < cct->_conf->mon_pg_warn_min_per_osd && per) {
3549 ostringstream ss;
3550 ss << "too few PGs per OSD (" << per << " < min " << cct->_conf->mon_pg_warn_min_per_osd << ")";
3551 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3552 if (detail)
3553 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3554 }
3555 }
3556 if (num_in && cct->_conf->mon_pg_warn_max_per_osd > 0) {
3557 int per = sum_pg_up / num_in;
3558 if (per > cct->_conf->mon_pg_warn_max_per_osd) {
3559 ostringstream ss;
3560 ss << "too many PGs per OSD (" << per << " > max " << cct->_conf->mon_pg_warn_max_per_osd << ")";
3561 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3562 if (detail)
3563 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3564 }
3565 }
3566 if (!pg_stat.empty()) {
3567 for (auto p = pg_pool_sum.begin();
3568 p != pg_pool_sum.end();
3569 ++p) {
3570 const pg_pool_t *pi = osdmap.get_pg_pool(p->first);
3571 if (!pi)
3572 continue; // in case osdmap changes haven't propagated to PGMap yet
3573 const string& name = osdmap.get_pool_name(p->first);
3574 if (pi->get_pg_num() > pi->get_pgp_num() &&
3575 !(name.find(".DELETED") != string::npos &&
3576 cct->_conf->mon_fake_pool_delete)) {
3577 ostringstream ss;
3578 ss << "pool " << name << " pg_num "
3579 << pi->get_pg_num() << " > pgp_num " << pi->get_pgp_num();
3580 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3581 if (detail)
3582 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3583 }
3584 int average_objects_per_pg = pg_sum.stats.sum.num_objects / pg_stat.size();
3585 if (average_objects_per_pg > 0 &&
3586 pg_sum.stats.sum.num_objects >= cct->_conf->mon_pg_warn_min_objects &&
3587 p->second.stats.sum.num_objects >= cct->_conf->mon_pg_warn_min_pool_objects) {
3588 int objects_per_pg = p->second.stats.sum.num_objects / pi->get_pg_num();
3589 float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
3590 if (cct->_conf->mon_pg_warn_max_object_skew > 0 &&
3591 ratio > cct->_conf->mon_pg_warn_max_object_skew) {
3592 ostringstream ss;
3593 ss << "pool " << name << " has many more objects per pg than average (too few pgs?)";
3594 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3595 if (detail) {
3596 ostringstream ss;
3597 ss << "pool " << name << " objects per pg ("
3598 << objects_per_pg << ") is more than " << ratio << " times cluster average ("
3599 << average_objects_per_pg << ")";
3600 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3601 }
3602 }
3603 }
3604 }
3605 }
3606
3607 for (auto it : pools) {
3608 auto it2 = pg_pool_sum.find(it.first);
3609 if (it2 == pg_pool_sum.end()) {
3610 continue;
3611 }
3612 const pool_stat_t *pstat = &it2->second;
3613 const object_stat_sum_t& sum = pstat->stats.sum;
3614 const string& pool_name = osdmap.get_pool_name(it.first);
3615 const pg_pool_t &pool = it.second;
3616
3617 float warn_threshold = (float)g_conf->mon_pool_quota_warn_threshold/100;
3618 float crit_threshold = (float)g_conf->mon_pool_quota_crit_threshold/100;
3619
3620 if (pool.quota_max_objects > 0) {
3621 stringstream ss;
3622 health_status_t status = HEALTH_OK;
3623 if ((uint64_t)sum.num_objects >= pool.quota_max_objects) {
3624 } else if (crit_threshold > 0 &&
3625 sum.num_objects >= pool.quota_max_objects*crit_threshold) {
3626 ss << "pool '" << pool_name
3627 << "' has " << sum.num_objects << " objects"
3628 << " (max " << pool.quota_max_objects << ")";
3629 status = HEALTH_ERR;
3630 } else if (warn_threshold > 0 &&
3631 sum.num_objects >= pool.quota_max_objects*warn_threshold) {
3632 ss << "pool '" << pool_name
3633 << "' has " << sum.num_objects << " objects"
3634 << " (max " << pool.quota_max_objects << ")";
3635 status = HEALTH_WARN;
3636 }
3637 if (status != HEALTH_OK) {
3638 pair<health_status_t,string> s(status, ss.str());
3639 summary.push_back(s);
3640 if (detail)
3641 detail->push_back(s);
3642 }
3643 }
3644
3645 if (pool.quota_max_bytes > 0) {
3646 health_status_t status = HEALTH_OK;
3647 stringstream ss;
3648 if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
3649 } else if (crit_threshold > 0 &&
3650 sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
3651 ss << "pool '" << pool_name
3652 << "' has " << si_t(sum.num_bytes) << " bytes"
3653 << " (max " << si_t(pool.quota_max_bytes) << ")";
3654 status = HEALTH_ERR;
3655 } else if (warn_threshold > 0 &&
3656 sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
3657 ss << "pool '" << pool_name
3658 << "' has " << si_t(sum.num_bytes) << " bytes"
3659 << " (max " << si_t(pool.quota_max_bytes) << ")";
3660 status = HEALTH_WARN;
3661 }
3662 if (status != HEALTH_OK) {
3663 pair<health_status_t,string> s(status, ss.str());
3664 summary.push_back(s);
3665 if (detail)
3666 detail->push_back(s);
3667 }
3668 }
3669 }
3670
3671 print_unscrubbed_pgs(pg_stat, summary, detail, cct);
3672 }
3673
3674 int process_pg_map_command(
3675 const string& orig_prefix,
3676 const map<string,cmd_vartype>& orig_cmdmap,
3677 const PGMap& pg_map,
3678 const OSDMap& osdmap,
3679 Formatter *f,
3680 stringstream *ss,
3681 bufferlist *odata)
3682 {
3683 string prefix = orig_prefix;
3684 map<string,cmd_vartype> cmdmap = orig_cmdmap;
3685
3686 // perhaps these would be better in the parsing, but it's weird
3687 bool primary = false;
3688 if (prefix == "pg dump_json") {
3689 vector<string> v;
3690 v.push_back(string("all"));
3691 cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
3692 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3693 prefix = "pg dump";
3694 } else if (prefix == "pg dump_pools_json") {
3695 vector<string> v;
3696 v.push_back(string("pools"));
3697 cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
3698 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3699 prefix = "pg dump";
3700 } else if (prefix == "pg ls-by-primary") {
3701 primary = true;
3702 prefix = "pg ls";
3703 } else if (prefix == "pg ls-by-osd") {
3704 prefix = "pg ls";
3705 } else if (prefix == "pg ls-by-pool") {
3706 prefix = "pg ls";
3707 string poolstr;
3708 cmd_getval(g_ceph_context, cmdmap, "poolstr", poolstr);
3709 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
3710 if (pool < 0) {
3711 *ss << "pool " << poolstr << " does not exist";
3712 return -ENOENT;
3713 }
3714 cmd_putval(g_ceph_context, cmdmap, "pool", pool);
3715 }
3716
3717 int r = 0;
3718 stringstream ds;
3719 if (prefix == "pg stat") {
3720 if (f) {
3721 f->open_object_section("pg_summary");
3722 pg_map.print_oneline_summary(f, NULL);
3723 f->close_section();
3724 f->flush(ds);
3725 } else {
3726 ds << pg_map;
3727 }
3728 odata->append(ds);
3729 return 0;
3730 }
3731
3732 if (prefix == "pg getmap") {
3733 pg_map.encode(*odata);
3734 *ss << "got pgmap version " << pg_map.version;
3735 return 0;
3736 }
3737
3738 if (prefix == "pg dump") {
3739 string val;
3740 vector<string> dumpcontents;
3741 set<string> what;
3742 if (cmd_getval(g_ceph_context, cmdmap, "dumpcontents", dumpcontents)) {
3743 copy(dumpcontents.begin(), dumpcontents.end(),
3744 inserter(what, what.end()));
3745 }
3746 if (what.empty())
3747 what.insert("all");
3748 if (f) {
3749 if (what.count("all")) {
3750 f->open_object_section("pg_map");
3751 pg_map.dump(f);
3752 f->close_section();
3753 } else if (what.count("summary") || what.count("sum")) {
3754 f->open_object_section("pg_map");
3755 pg_map.dump_basic(f);
3756 f->close_section();
3757 } else {
3758 if (what.count("pools")) {
3759 pg_map.dump_pool_stats(f);
3760 }
3761 if (what.count("osds")) {
3762 pg_map.dump_osd_stats(f);
3763 }
3764 if (what.count("pgs")) {
3765 pg_map.dump_pg_stats(f, false);
3766 }
3767 if (what.count("pgs_brief")) {
3768 pg_map.dump_pg_stats(f, true);
3769 }
3770 if (what.count("delta")) {
3771 f->open_object_section("delta");
3772 pg_map.dump_delta(f);
3773 f->close_section();
3774 }
3775 }
3776 f->flush(*odata);
3777 } else {
3778 if (what.count("all")) {
3779 pg_map.dump(ds);
3780 } else if (what.count("summary") || what.count("sum")) {
3781 pg_map.dump_basic(ds);
3782 pg_map.dump_pg_sum_stats(ds, true);
3783 pg_map.dump_osd_sum_stats(ds);
3784 } else {
3785 if (what.count("pgs_brief")) {
3786 pg_map.dump_pg_stats(ds, true);
3787 }
3788 bool header = true;
3789 if (what.count("pgs")) {
3790 pg_map.dump_pg_stats(ds, false);
3791 header = false;
3792 }
3793 if (what.count("pools")) {
3794 pg_map.dump_pool_stats(ds, header);
3795 }
3796 if (what.count("osds")) {
3797 pg_map.dump_osd_stats(ds);
3798 }
3799 }
3800 odata->append(ds);
3801 }
3802 *ss << "dumped " << what;
3803 return 0;
3804 }
3805
3806 if (prefix == "pg ls") {
3807 int64_t osd = -1;
3808 int64_t pool = -1;
3809 vector<string>states;
3810 set<pg_t> pgs;
3811 cmd_getval(g_ceph_context, cmdmap, "pool", pool);
3812 cmd_getval(g_ceph_context, cmdmap, "osd", osd);
3813 cmd_getval(g_ceph_context, cmdmap, "states", states);
3814 if (pool >= 0 && !osdmap.have_pg_pool(pool)) {
3815 *ss << "pool " << pool << " does not exist";
3816 return -ENOENT;
3817 }
3818 if (osd >= 0 && !osdmap.is_up(osd)) {
3819 *ss << "osd " << osd << " is not up";
3820 return -EAGAIN;
3821 }
3822 if (states.empty())
3823 states.push_back("all");
3824
3825 uint32_t state = 0;
3826
3827 while (!states.empty()) {
3828 string state_str = states.back();
3829
3830 if (state_str == "all") {
3831 state = -1;
3832 break;
3833 } else {
3834 int filter = pg_string_state(state_str);
3835 assert(filter != -1);
3836 state |= filter;
3837 }
3838
3839 states.pop_back();
3840 }
3841
3842 pg_map.get_filtered_pg_stats(state, pool, osd, primary, pgs);
3843
3844 if (f && !pgs.empty()) {
3845 pg_map.dump_filtered_pg_stats(f, pgs);
3846 f->flush(*odata);
3847 } else if (!pgs.empty()) {
3848 pg_map.dump_filtered_pg_stats(ds, pgs);
3849 odata->append(ds);
3850 }
3851 return 0;
3852 }
3853
3854 if (prefix == "pg dump_stuck") {
3855 vector<string> stuckop_vec;
3856 cmd_getval(g_ceph_context, cmdmap, "stuckops", stuckop_vec);
3857 if (stuckop_vec.empty())
3858 stuckop_vec.push_back("unclean");
3859 int64_t threshold;
3860 cmd_getval(g_ceph_context, cmdmap, "threshold", threshold,
3861 int64_t(g_conf->mon_pg_stuck_threshold));
3862
3863 r = pg_map.dump_stuck_pg_stats(ds, f, (int)threshold, stuckop_vec);
3864 odata->append(ds);
3865 if (r < 0)
3866 *ss << "failed";
3867 else
3868 *ss << "ok";
3869 return 0;
3870 }
3871
3872 if (prefix == "pg debug") {
3873 string debugop;
3874 cmd_getval(g_ceph_context, cmdmap, "debugop", debugop,
3875 string("unfound_objects_exist"));
3876 if (debugop == "unfound_objects_exist") {
3877 bool unfound_objects_exist = false;
3878 for (const auto& p : pg_map.pg_stat) {
3879 if (p.second.stats.sum.num_objects_unfound > 0) {
3880 unfound_objects_exist = true;
3881 break;
3882 }
3883 }
3884 if (unfound_objects_exist)
3885 ds << "TRUE";
3886 else
3887 ds << "FALSE";
3888 odata->append(ds);
3889 return 0;
3890 }
3891 if (debugop == "degraded_pgs_exist") {
3892 bool degraded_pgs_exist = false;
3893 for (const auto& p : pg_map.pg_stat) {
3894 if (p.second.stats.sum.num_objects_degraded > 0) {
3895 degraded_pgs_exist = true;
3896 break;
3897 }
3898 }
3899 if (degraded_pgs_exist)
3900 ds << "TRUE";
3901 else
3902 ds << "FALSE";
3903 odata->append(ds);
3904 return 0;
3905 }
3906 }
3907
3908 if (prefix == "osd perf") {
3909 if (f) {
3910 f->open_object_section("osdstats");
3911 pg_map.dump_osd_perf_stats(f);
3912 f->close_section();
3913 f->flush(ds);
3914 } else {
3915 pg_map.print_osd_perf_stats(&ds);
3916 }
3917 odata->append(ds);
3918 return 0;
3919 }
3920
3921 if (prefix == "osd blocked-by") {
3922 if (f) {
3923 f->open_object_section("osd_blocked_by");
3924 pg_map.dump_osd_blocked_by_stats(f);
3925 f->close_section();
3926 f->flush(ds);
3927 } else {
3928 pg_map.print_osd_blocked_by_stats(&ds);
3929 }
3930 odata->append(ds);
3931 return 0;
3932 }
3933
3934 if (prefix == "osd pool stats") {
3935 string pool_name;
3936 cmd_getval(g_ceph_context, cmdmap, "name", pool_name);
3937
3938 int64_t poolid = -ENOENT;
3939 bool one_pool = false;
3940 if (!pool_name.empty()) {
3941 poolid = osdmap.lookup_pg_pool_name(pool_name);
3942 if (poolid < 0) {
3943 assert(poolid == -ENOENT);
3944 *ss << "unrecognized pool '" << pool_name << "'";
3945 return -ENOENT;
3946 }
3947 one_pool = true;
3948 }
3949
3950 stringstream rs;
3951
3952 if (f)
3953 f->open_array_section("pool_stats");
3954 else {
3955 if (osdmap.get_pools().empty()) {
3956 *ss << "there are no pools!";
3957 goto stats_out;
3958 }
3959 }
3960
3961 for (auto& p : osdmap.get_pools()) {
3962 if (!one_pool)
3963 poolid = p.first;
3964
3965 pool_name = osdmap.get_pool_name(poolid);
3966
3967 if (f) {
3968 f->open_object_section("pool");
3969 f->dump_string("pool_name", pool_name.c_str());
3970 f->dump_int("pool_id", poolid);
3971 f->open_object_section("recovery");
3972 }
3973
3974 list<string> sl;
3975 stringstream tss;
3976 pg_map.pool_recovery_summary(f, &sl, poolid);
3977 if (!f && !sl.empty()) {
3978 for (auto& p : sl)
3979 tss << " " << p << "\n";
3980 }
3981
3982 if (f) {
3983 f->close_section();
3984 f->open_object_section("recovery_rate");
3985 }
3986
3987 ostringstream rss;
3988 pg_map.pool_recovery_rate_summary(f, &rss, poolid);
3989 if (!f && !rss.str().empty())
3990 tss << " recovery io " << rss.str() << "\n";
3991
3992 if (f) {
3993 f->close_section();
3994 f->open_object_section("client_io_rate");
3995 }
3996 rss.clear();
3997 rss.str("");
3998
3999 pg_map.pool_client_io_rate_summary(f, &rss, poolid);
4000 if (!f && !rss.str().empty())
4001 tss << " client io " << rss.str() << "\n";
4002
4003 // dump cache tier IO rate for cache pool
4004 const pg_pool_t *pool = osdmap.get_pg_pool(poolid);
4005 if (pool->is_tier()) {
4006 if (f) {
4007 f->close_section();
4008 f->open_object_section("cache_io_rate");
4009 }
4010 rss.clear();
4011 rss.str("");
4012
4013 pg_map.pool_cache_io_rate_summary(f, &rss, poolid);
4014 if (!f && !rss.str().empty())
4015 tss << " cache tier io " << rss.str() << "\n";
4016 }
4017 if (f) {
4018 f->close_section();
4019 f->close_section();
4020 } else {
4021 rs << "pool " << pool_name << " id " << poolid << "\n";
4022 if (!tss.str().empty())
4023 rs << tss.str() << "\n";
4024 else
4025 rs << " nothing is going on\n\n";
4026 }
4027 if (one_pool)
4028 break;
4029 }
4030
4031 stats_out:
4032 if (f) {
4033 f->close_section();
4034 f->flush(ds);
4035 odata->append(ds);
4036 } else {
4037 odata->append(rs.str());
4038 }
4039 return 0;
4040 }
4041
4042 return -EOPNOTSUPP;
4043 }
4044
4045 void PGMapUpdater::check_osd_map(const OSDMap::Incremental &osd_inc,
4046 std::set<int> *need_check_down_pg_osds,
4047 std::map<int,utime_t> *last_osd_report,
4048 PGMap *pg_map,
4049 PGMap::Incremental *pending_inc)
4050 {
4051 for (const auto &p : osd_inc.new_weight) {
4052 if (p.second == CEPH_OSD_OUT) {
4053 dout(10) << __func__ << " osd." << p.first << " went OUT" << dendl;
4054 auto j = pg_map->osd_epochs.find(p.first);
4055 if (j != pg_map->osd_epochs.end())
4056 pending_inc->stat_osd_out(p.first, j->second);
4057 }
4058 }
4059
4060 // this is conservative: we want to know if any osds (maybe) got marked down.
4061 for (const auto &p : osd_inc.new_state) {
4062 if (p.second & CEPH_OSD_UP) { // true if marked up OR down,
4063 // but we're too lazy to check
4064 // which
4065 need_check_down_pg_osds->insert(p.first);
4066
4067 // clear out the last_osd_report for this OSD
4068 auto report = last_osd_report->find(p.first);
4069 if (report != last_osd_report->end()) {
4070 last_osd_report->erase(report);
4071 }
4072
4073 // clear out osd_stat slow request histogram
4074 dout(20) << __func__ << " clearing osd." << p.first
4075 << " request histogram" << dendl;
4076 pending_inc->stat_osd_down_up(p.first, osd_inc.epoch, *pg_map);
4077 }
4078
4079 if (p.second & CEPH_OSD_EXISTS) {
4080 // whether it was created *or* destroyed, we can safely drop
4081 // it's osd_stat_t record.
4082 dout(10) << __func__ << " osd." << p.first
4083 << " created or destroyed" << dendl;
4084 pending_inc->rm_stat(p.first);
4085
4086 // and adjust full, nearfull set
4087 pg_map->nearfull_osds.erase(p.first);
4088 pg_map->full_osds.erase(p.first);
4089 }
4090 }
4091 }
4092
4093 void PGMapUpdater::check_osd_map(
4094 CephContext *cct,
4095 const OSDMap& osdmap,
4096 const PGMap& pgmap,
4097 PGMap::Incremental *pending_inc)
4098 {
4099 for (auto& p : pgmap.osd_stat) {
4100 if (!osdmap.exists(p.first)) {
4101 // remove osd_stat
4102 pending_inc->rm_stat(p.first);
4103 } else if (osdmap.is_out(p.first)) {
4104 // zero osd_stat
4105 if (p.second.kb != 0) {
4106 auto j = pgmap.osd_epochs.find(p.first);
4107 if (j != pgmap.osd_epochs.end()) {
4108 pending_inc->stat_osd_out(p.first, j->second);
4109 }
4110 }
4111 } else if (!osdmap.is_up(p.first)) {
4112 // zero the op_queue_age_hist
4113 if (!p.second.op_queue_age_hist.empty()) {
4114 pending_inc->stat_osd_down_up(p.first, osdmap.get_epoch(), pgmap);
4115 }
4116 }
4117 }
4118
4119 // deleted pgs (pools)?
4120 for (auto& p : pgmap.pg_pool_sum) {
4121 if (!osdmap.have_pg_pool(p.first)) {
4122 ldout(cct, 10) << __func__ << " pool " << p.first << " gone, removing pgs"
4123 << dendl;
4124 for (auto& q : pgmap.pg_stat) {
4125 if (q.first.pool() == (uint64_t)p.first) {
4126 pending_inc->pg_remove.insert(q.first);
4127 }
4128 }
4129 auto q = pending_inc->pg_stat_updates.begin();
4130 while (q != pending_inc->pg_stat_updates.end()) {
4131 if (q->first.pool() == (uint64_t)p.first) {
4132 q = pending_inc->pg_stat_updates.erase(q);
4133 } else {
4134 ++q;
4135 }
4136 }
4137 }
4138 }
4139
4140 // new pgs (split or new pool)?
4141 for (auto& p : osdmap.get_pools()) {
4142 int64_t poolid = p.first;
4143 const pg_pool_t& pi = p.second;
4144 auto q = pgmap.num_pg_by_pool.find(poolid);
4145 unsigned my_pg_num = 0;
4146 if (q != pgmap.num_pg_by_pool.end())
4147 my_pg_num = q->second;
4148 unsigned pg_num = pi.get_pg_num();
4149 if (my_pg_num != pg_num) {
4150 ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
4151 << " != my pg_num " << my_pg_num << dendl;
4152 for (unsigned ps = my_pg_num; ps < pg_num; ++ps) {
4153 pg_t pgid(ps, poolid);
4154 if (pending_inc->pg_stat_updates.count(pgid) == 0) {
4155 ldout(cct,20) << __func__ << " adding " << pgid << dendl;
4156 pg_stat_t &stats = pending_inc->pg_stat_updates[pgid];
4157 stats.last_fresh = osdmap.get_modified();
4158 stats.last_active = osdmap.get_modified();
4159 stats.last_change = osdmap.get_modified();
4160 stats.last_peered = osdmap.get_modified();
4161 stats.last_clean = osdmap.get_modified();
4162 stats.last_unstale = osdmap.get_modified();
4163 stats.last_undegraded = osdmap.get_modified();
4164 stats.last_fullsized = osdmap.get_modified();
4165 stats.last_scrub_stamp = osdmap.get_modified();
4166 stats.last_deep_scrub_stamp = osdmap.get_modified();
4167 stats.last_clean_scrub_stamp = osdmap.get_modified();
4168 }
4169 }
4170 }
4171 }
4172 }
4173
4174 void PGMapUpdater::register_pg(
4175 const OSDMap &osd_map,
4176 pg_t pgid, epoch_t epoch,
4177 bool new_pool,
4178 const PGMap &pg_map,
4179 PGMap::Incremental *pending_inc)
4180 {
4181 pg_t parent;
4182 int split_bits = 0;
4183 auto parent_stat = pg_map.pg_stat.end();
4184 if (!new_pool) {
4185 parent = pgid;
4186 while (1) {
4187 // remove most significant bit
4188 int msb = cbits(parent.ps());
4189 if (!msb)
4190 break;
4191 parent.set_ps(parent.ps() & ~(1<<(msb-1)));
4192 split_bits++;
4193 dout(30) << " is " << pgid << " parent " << parent << " ?" << dendl;
4194 parent_stat = pg_map.pg_stat.find(parent);
4195 if (parent_stat != pg_map.pg_stat.end() &&
4196 parent_stat->second.state != PG_STATE_CREATING) {
4197 dout(10) << " parent is " << parent << dendl;
4198 break;
4199 }
4200 }
4201 }
4202
4203 pg_stat_t &stats = pending_inc->pg_stat_updates[pgid];
4204 stats.state = PG_STATE_CREATING;
4205 stats.created = epoch;
4206 stats.parent = parent;
4207 stats.parent_split_bits = split_bits;
4208 stats.mapping_epoch = epoch;
4209
4210 if (parent_stat != pg_map.pg_stat.end()) {
4211 const pg_stat_t &ps = parent_stat->second;
4212 stats.last_fresh = ps.last_fresh;
4213 stats.last_active = ps.last_active;
4214 stats.last_change = ps.last_change;
4215 stats.last_peered = ps.last_peered;
4216 stats.last_clean = ps.last_clean;
4217 stats.last_unstale = ps.last_unstale;
4218 stats.last_undegraded = ps.last_undegraded;
4219 stats.last_fullsized = ps.last_fullsized;
4220 stats.last_scrub_stamp = ps.last_scrub_stamp;
4221 stats.last_deep_scrub_stamp = ps.last_deep_scrub_stamp;
4222 stats.last_clean_scrub_stamp = ps.last_clean_scrub_stamp;
4223 } else {
4224 utime_t now = osd_map.get_modified();
4225 stats.last_fresh = now;
4226 stats.last_active = now;
4227 stats.last_change = now;
4228 stats.last_peered = now;
4229 stats.last_clean = now;
4230 stats.last_unstale = now;
4231 stats.last_undegraded = now;
4232 stats.last_fullsized = now;
4233 stats.last_scrub_stamp = now;
4234 stats.last_deep_scrub_stamp = now;
4235 stats.last_clean_scrub_stamp = now;
4236 }
4237
4238 osd_map.pg_to_up_acting_osds(
4239 pgid,
4240 &stats.up,
4241 &stats.up_primary,
4242 &stats.acting,
4243 &stats.acting_primary);
4244
4245 if (split_bits == 0) {
4246 dout(10) << __func__ << " will create " << pgid
4247 << " primary " << stats.acting_primary
4248 << " acting " << stats.acting
4249 << dendl;
4250 } else {
4251 dout(10) << __func__ << " will create " << pgid
4252 << " primary " << stats.acting_primary
4253 << " acting " << stats.acting
4254 << " parent " << parent
4255 << " by " << split_bits << " bits"
4256 << dendl;
4257 }
4258 }
4259
4260 void PGMapUpdater::register_new_pgs(
4261 const OSDMap &osd_map,
4262 const PGMap &pg_map,
4263 PGMap::Incremental *pending_inc)
4264 {
4265 epoch_t epoch = osd_map.get_epoch();
4266 dout(10) << __func__ << " checking pg pools for osdmap epoch " << epoch
4267 << ", last_pg_scan " << pg_map.last_pg_scan << dendl;
4268
4269 int created = 0;
4270 const auto &pools = osd_map.get_pools();
4271
4272 for (const auto &p : pools) {
4273 int64_t poolid = p.first;
4274 const pg_pool_t &pool = p.second;
4275 int ruleno = osd_map.crush->find_rule(pool.get_crush_rule(),
4276 pool.get_type(), pool.get_size());
4277 if (ruleno < 0 || !osd_map.crush->rule_exists(ruleno))
4278 continue;
4279
4280 if (pool.get_last_change() <= pg_map.last_pg_scan ||
4281 pool.get_last_change() <= pending_inc->pg_scan) {
4282 dout(10) << " no change in pool " << poolid << " " << pool << dendl;
4283 continue;
4284 }
4285
4286 dout(10) << __func__ << " scanning pool " << poolid
4287 << " " << pool << dendl;
4288
4289 // first pgs in this pool
4290 bool new_pool = pg_map.pg_pool_sum.count(poolid) == 0;
4291
4292 for (ps_t ps = 0; ps < pool.get_pg_num(); ps++) {
4293 pg_t pgid(ps, poolid, -1);
4294 if (pg_map.pg_stat.count(pgid)) {
4295 dout(20) << "register_new_pgs have " << pgid << dendl;
4296 continue;
4297 }
4298 created++;
4299 register_pg(osd_map, pgid, pool.get_last_change(), new_pool,
4300 pg_map, pending_inc);
4301 }
4302 }
4303
4304 int removed = 0;
4305 for (const auto &p : pg_map.creating_pgs) {
4306 if (p.preferred() >= 0) {
4307 dout(20) << " removing creating_pg " << p
4308 << " because it is localized and obsolete" << dendl;
4309 pending_inc->pg_remove.insert(p);
4310 ++removed;
4311 } else if (!osd_map.have_pg_pool(p.pool())) {
4312 dout(20) << " removing creating_pg " << p
4313 << " because containing pool deleted" << dendl;
4314 pending_inc->pg_remove.insert(p);
4315 ++removed;
4316 }
4317 }
4318
4319 // deleted pools?
4320 for (const auto &p : pg_map.pg_stat) {
4321 if (!osd_map.have_pg_pool(p.first.pool())) {
4322 dout(20) << " removing pg_stat " << p.first << " because "
4323 << "containing pool deleted" << dendl;
4324 pending_inc->pg_remove.insert(p.first);
4325 ++removed;
4326 } else if (p.first.preferred() >= 0) {
4327 dout(20) << " removing localized pg " << p.first << dendl;
4328 pending_inc->pg_remove.insert(p.first);
4329 ++removed;
4330 }
4331 }
4332
4333 // we don't want to redo this work if we can avoid it.
4334 pending_inc->pg_scan = epoch;
4335
4336 dout(10) << "register_new_pgs registered " << created << " new pgs, removed "
4337 << removed << " uncreated pgs" << dendl;
4338 }
4339
4340
4341 void PGMapUpdater::update_creating_pgs(
4342 const OSDMap &osd_map,
4343 const PGMap &pg_map,
4344 PGMap::Incremental *pending_inc)
4345 {
4346 dout(10) << __func__ << " to " << pg_map.creating_pgs.size()
4347 << " pgs, osdmap epoch " << osd_map.get_epoch()
4348 << dendl;
4349
4350 unsigned changed = 0;
4351 for (auto p = pg_map.creating_pgs.begin();
4352 p != pg_map.creating_pgs.end();
4353 ++p) {
4354 pg_t pgid = *p;
4355 pg_t on = pgid;
4356 auto q = pg_map.pg_stat.find(pgid);
4357 assert(q != pg_map.pg_stat.end());
4358 const pg_stat_t *s = &q->second;
4359
4360 if (s->parent_split_bits)
4361 on = s->parent;
4362
4363 vector<int> up, acting;
4364 int up_primary, acting_primary;
4365 osd_map.pg_to_up_acting_osds(
4366 on,
4367 &up,
4368 &up_primary,
4369 &acting,
4370 &acting_primary);
4371
4372 if (up != s->up ||
4373 up_primary != s->up_primary ||
4374 acting != s->acting ||
4375 acting_primary != s->acting_primary) {
4376 pg_stat_t *ns = &pending_inc->pg_stat_updates[pgid];
4377 if (osd_map.get_epoch() > ns->reported_epoch) {
4378 dout(20) << __func__ << " " << pgid << " "
4379 << " acting_primary: " << s->acting_primary
4380 << " -> " << acting_primary
4381 << " acting: " << s->acting << " -> " << acting
4382 << " up_primary: " << s->up_primary << " -> " << up_primary
4383 << " up: " << s->up << " -> " << up
4384 << dendl;
4385
4386 // only initialize if it wasn't already a pending update
4387 if (ns->reported_epoch == 0)
4388 *ns = *s;
4389
4390 // note epoch if the target of the create message changed
4391 if (acting_primary != ns->acting_primary)
4392 ns->mapping_epoch = osd_map.get_epoch();
4393
4394 ns->up = up;
4395 ns->up_primary = up_primary;
4396 ns->acting = acting;
4397 ns->acting_primary = acting_primary;
4398
4399 ++changed;
4400 } else {
4401 dout(20) << __func__ << " " << pgid << " has pending update from newer"
4402 << " epoch " << ns->reported_epoch
4403 << dendl;
4404 }
4405 }
4406 }
4407 if (changed) {
4408 dout(10) << __func__ << " " << changed << " pgs changed primary" << dendl;
4409 }
4410 }
4411
4412 static void _try_mark_pg_stale(
4413 const OSDMap& osdmap,
4414 pg_t pgid,
4415 const pg_stat_t& cur,
4416 PGMap::Incremental *pending_inc)
4417 {
4418 if ((cur.state & PG_STATE_STALE) == 0 &&
4419 cur.acting_primary != -1 &&
4420 osdmap.is_down(cur.acting_primary)) {
4421 pg_stat_t *newstat;
4422 auto q = pending_inc->pg_stat_updates.find(pgid);
4423 if (q != pending_inc->pg_stat_updates.end()) {
4424 if ((q->second.acting_primary == cur.acting_primary) ||
4425 ((q->second.state & PG_STATE_STALE) == 0 &&
4426 q->second.acting_primary != -1 &&
4427 osdmap.is_down(q->second.acting_primary))) {
4428 newstat = &q->second;
4429 } else {
4430 // pending update is no longer down or already stale
4431 return;
4432 }
4433 } else {
4434 newstat = &pending_inc->pg_stat_updates[pgid];
4435 *newstat = cur;
4436 }
4437 dout(10) << __func__ << " marking pg " << pgid
4438 << " stale (acting_primary " << newstat->acting_primary
4439 << ")" << dendl;
4440 newstat->state |= PG_STATE_STALE;
4441 newstat->last_unstale = ceph_clock_now();
4442 }
4443 }
4444
4445 void PGMapUpdater::check_down_pgs(
4446 const OSDMap &osdmap,
4447 const PGMap &pg_map,
4448 bool check_all,
4449 const set<int>& need_check_down_pg_osds,
4450 PGMap::Incremental *pending_inc)
4451 {
4452 // if a large number of osds changed state, just iterate over the whole
4453 // pg map.
4454 if (need_check_down_pg_osds.size() > (unsigned)osdmap.get_num_osds() *
4455 g_conf->mon_pg_check_down_all_threshold) {
4456 check_all = true;
4457 }
4458
4459 if (check_all) {
4460 for (const auto& p : pg_map.pg_stat) {
4461 _try_mark_pg_stale(osdmap, p.first, p.second, pending_inc);
4462 }
4463 } else {
4464 for (auto osd : need_check_down_pg_osds) {
4465 if (osdmap.is_down(osd)) {
4466 auto p = pg_map.pg_by_osd.find(osd);
4467 if (p == pg_map.pg_by_osd.end()) {
4468 continue;
4469 }
4470 for (auto pgid : p->second) {
4471 const pg_stat_t &stat = pg_map.pg_stat.at(pgid);
4472 assert(stat.acting_primary == osd);
4473 _try_mark_pg_stale(osdmap, pgid, stat, pending_inc);
4474 }
4475 }
4476 }
4477 }
4478 }
4479
4480 int reweight::by_utilization(
4481 const OSDMap &osdmap,
4482 const PGMap &pgm,
4483 int oload,
4484 double max_changef,
4485 int max_osds,
4486 bool by_pg, const set<int64_t> *pools,
4487 bool no_increasing,
4488 mempool::osdmap::map<int32_t, uint32_t>* new_weights,
4489 std::stringstream *ss,
4490 std::string *out_str,
4491 Formatter *f)
4492 {
4493 if (oload <= 100) {
4494 *ss << "You must give a percentage higher than 100. "
4495 "The reweighting threshold will be calculated as <average-utilization> "
4496 "times <input-percentage>. For example, an argument of 200 would "
4497 "reweight OSDs which are twice as utilized as the average OSD.\n";
4498 return -EINVAL;
4499 }
4500
4501 vector<int> pgs_by_osd(osdmap.get_max_osd());
4502
4503 // Avoid putting a small number (or 0) in the denominator when calculating
4504 // average_util
4505 double average_util;
4506 if (by_pg) {
4507 // by pg mapping
4508 double weight_sum = 0.0; // sum up the crush weights
4509 unsigned num_pg_copies = 0;
4510 int num_osds = 0;
4511 for (const auto& pg : pgm.pg_stat) {
4512 if (pools && pools->count(pg.first.pool()) == 0)
4513 continue;
4514 for (const auto acting : pg.second.acting) {
4515 if (acting >= (int)pgs_by_osd.size())
4516 pgs_by_osd.resize(acting);
4517 if (pgs_by_osd[acting] == 0) {
4518 if (osdmap.crush->get_item_weightf(acting) <= 0) {
4519 //skip if we currently can not identify item
4520 continue;
4521 }
4522 weight_sum += osdmap.crush->get_item_weightf(acting);
4523 ++num_osds;
4524 }
4525 ++pgs_by_osd[acting];
4526 ++num_pg_copies;
4527 }
4528 }
4529
4530 if (!num_osds || (num_pg_copies / num_osds < g_conf->mon_reweight_min_pgs_per_osd)) {
4531 *ss << "Refusing to reweight: we only have " << num_pg_copies
4532 << " PGs across " << num_osds << " osds!\n";
4533 return -EDOM;
4534 }
4535
4536 average_util = (double)num_pg_copies / weight_sum;
4537 } else {
4538 // by osd utilization
4539 int num_osd = MAX(1, pgm.osd_stat.size());
4540 if ((uint64_t)pgm.osd_sum.kb * 1024 / num_osd
4541 < g_conf->mon_reweight_min_bytes_per_osd) {
4542 *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb
4543 << " kb across all osds!\n";
4544 return -EDOM;
4545 }
4546 if ((uint64_t)pgm.osd_sum.kb_used * 1024 / num_osd
4547 < g_conf->mon_reweight_min_bytes_per_osd) {
4548 *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb_used
4549 << " kb used across all osds!\n";
4550 return -EDOM;
4551 }
4552
4553 average_util = (double)pgm.osd_sum.kb_used / (double)pgm.osd_sum.kb;
4554 }
4555
4556 // adjust down only if we are above the threshold
4557 const double overload_util = average_util * (double)oload / 100.0;
4558
4559 // but aggressively adjust weights up whenever possible.
4560 const double underload_util = average_util;
4561
4562 const unsigned max_change = (unsigned)(max_changef * (double)0x10000);
4563
4564 ostringstream oss;
4565 if (f) {
4566 f->open_object_section("reweight_by_utilization");
4567 f->dump_int("overload_min", oload);
4568 f->dump_float("max_change", max_changef);
4569 f->dump_int("max_change_osds", max_osds);
4570 f->dump_float("average_utilization", average_util);
4571 f->dump_float("overload_utilization", overload_util);
4572 } else {
4573 oss << "oload " << oload << "\n";
4574 oss << "max_change " << max_changef << "\n";
4575 oss << "max_change_osds " << max_osds << "\n";
4576 oss.precision(4);
4577 oss << "average_utilization " << std::fixed << average_util << "\n";
4578 oss << "overload_utilization " << overload_util << "\n";
4579 }
4580 int num_changed = 0;
4581
4582 // precompute util for each OSD
4583 std::vector<std::pair<int, float> > util_by_osd;
4584 for (const auto& p : pgm.osd_stat) {
4585 std::pair<int, float> osd_util;
4586 osd_util.first = p.first;
4587 if (by_pg) {
4588 if (p.first >= (int)pgs_by_osd.size() ||
4589 pgs_by_osd[p.first] == 0) {
4590 // skip if this OSD does not contain any pg
4591 // belonging to the specified pool(s).
4592 continue;
4593 }
4594
4595 if (osdmap.crush->get_item_weightf(p.first) <= 0) {
4596 // skip if we are unable to locate item.
4597 continue;
4598 }
4599
4600 osd_util.second = pgs_by_osd[p.first] / osdmap.crush->get_item_weightf(p.first);
4601 } else {
4602 osd_util.second = (double)p.second.kb_used / (double)p.second.kb;
4603 }
4604 util_by_osd.push_back(osd_util);
4605 }
4606
4607 // sort by absolute deviation from the mean utilization,
4608 // in descending order.
4609 std::sort(util_by_osd.begin(), util_by_osd.end(),
4610 [average_util](std::pair<int, float> l, std::pair<int, float> r) {
4611 return abs(l.second - average_util) > abs(r.second - average_util);
4612 }
4613 );
4614
4615 if (f)
4616 f->open_array_section("reweights");
4617
4618 for (const auto& p : util_by_osd) {
4619 unsigned weight = osdmap.get_weight(p.first);
4620 if (weight == 0) {
4621 // skip if OSD is currently out
4622 continue;
4623 }
4624 float util = p.second;
4625
4626 if (util >= overload_util) {
4627 // Assign a lower weight to overloaded OSDs. The current weight
4628 // is a factor to take into account the original weights,
4629 // to represent e.g. differing storage capacities
4630 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
4631 if (weight > max_change)
4632 new_weight = MAX(new_weight, weight - max_change);
4633 new_weights->insert({p.first, new_weight});
4634 if (f) {
4635 f->open_object_section("osd");
4636 f->dump_int("osd", p.first);
4637 f->dump_float("weight", (float)weight / (float)0x10000);
4638 f->dump_float("new_weight", (float)new_weight / (float)0x10000);
4639 f->close_section();
4640 } else {
4641 oss << "osd." << p.first << " weight "
4642 << (float)weight / (float)0x10000 << " -> "
4643 << (float)new_weight / (float)0x10000 << "\n";
4644 }
4645 if (++num_changed >= max_osds)
4646 break;
4647 }
4648 if (!no_increasing && util <= underload_util) {
4649 // assign a higher weight.. if we can.
4650 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
4651 new_weight = MIN(new_weight, weight + max_change);
4652 if (new_weight > 0x10000)
4653 new_weight = 0x10000;
4654 if (new_weight > weight) {
4655 new_weights->insert({p.first, new_weight});
4656 oss << "osd." << p.first << " weight "
4657 << (float)weight / (float)0x10000 << " -> "
4658 << (float)new_weight / (float)0x10000 << "\n";
4659 if (++num_changed >= max_osds)
4660 break;
4661 }
4662 }
4663 }
4664 if (f) {
4665 f->close_section();
4666 }
4667
4668 OSDMap newmap;
4669 newmap.deepish_copy_from(osdmap);
4670 OSDMap::Incremental newinc;
4671 newinc.fsid = newmap.get_fsid();
4672 newinc.epoch = newmap.get_epoch() + 1;
4673 newinc.new_weight = *new_weights;
4674 newmap.apply_incremental(newinc);
4675
4676 osdmap.summarize_mapping_stats(&newmap, pools, out_str, f);
4677
4678 if (f) {
4679 f->close_section();
4680 } else {
4681 *out_str += "\n";
4682 *out_str += oss.str();
4683 }
4684 return num_changed;
4685 }