]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/PGMap.cc
update sources to 12.2.2
[ceph.git] / ceph / src / mon / PGMap.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include <boost/algorithm/string.hpp>
5
6 #include "PGMap.h"
7
8 #define dout_subsys ceph_subsys_mon
9 #include "common/debug.h"
10 #include "common/Formatter.h"
11 #include "include/ceph_features.h"
12 #include "include/stringify.h"
13
14 #include "osd/osd_types.h"
15 #include "osd/OSDMap.h"
16
17 #define dout_context g_ceph_context
18
19 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMapDigest, pgmap_digest, pgmap);
20 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap, pgmap, pgmap);
21 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap::Incremental, pgmap_inc, pgmap);
22
23
24 // ---------------------
25 // PGMapDigest
26
/**
 * Serialize the digest into @p bl as a version 1 (compat 1) structure.
 *
 * The fields are written in exactly the order PGMapDigest::decode() reads
 * them back, so any field added here needs a matching change there.
 *
 * @param bl       buffer the encoded bytes are appended to
 * @param features feature bits forwarded to the members whose encoders
 *                 take them (pg_pool_sum, pg_sum and the delta sums)
 */
void PGMapDigest::encode(bufferlist& bl, uint64_t features) const
{
  // NOTE: see PGMap::encode_digest
  ENCODE_START(1, 1, bl);
  // counters
  ::encode(num_pg, bl);
  ::encode(num_pg_active, bl);
  ::encode(num_pg_unknown, bl);
  ::encode(num_osd, bl);
  // per-pool, cluster-wide and OSD sums
  ::encode(pg_pool_sum, bl, features);
  ::encode(pg_sum, bl, features);
  ::encode(osd_sum, bl);
  // PG counts by state / OSD / pool, and the last sequence seen per OSD
  ::encode(num_pg_by_state, bl);
  ::encode(num_pg_by_osd, bl);
  ::encode(num_pg_by_pool, bl);
  ::encode(osd_last_seq, bl);
  // deltas and their time stamps (used by the *_rate_summary helpers)
  ::encode(per_pool_sum_delta, bl, features);
  ::encode(per_pool_sum_deltas_stamps, bl);
  ::encode(pg_sum_delta, bl, features);
  ::encode(stamp_delta, bl);
  // available space per CRUSH rule
  ::encode(avail_space_by_rule, bl);
  ENCODE_FINISH(bl);
}
49
/**
 * Deserialize a digest previously written by PGMapDigest::encode().
 *
 * The fields are read in exactly the order encode() wrote them.
 *
 * @param p iterator positioned at the start of the encoded digest; it is
 *          advanced as the fields are consumed
 */
void PGMapDigest::decode(bufferlist::iterator& p)
{
  DECODE_START(1, p);
  // counters
  ::decode(num_pg, p);
  ::decode(num_pg_active, p);
  ::decode(num_pg_unknown, p);
  ::decode(num_osd, p);
  // per-pool, cluster-wide and OSD sums
  ::decode(pg_pool_sum, p);
  ::decode(pg_sum, p);
  ::decode(osd_sum, p);
  // PG counts by state / OSD / pool, and the last sequence seen per OSD
  ::decode(num_pg_by_state, p);
  ::decode(num_pg_by_osd, p);
  ::decode(num_pg_by_pool, p);
  ::decode(osd_last_seq, p);
  // deltas and their time stamps
  ::decode(per_pool_sum_delta, p);
  ::decode(per_pool_sum_deltas_stamps, p);
  ::decode(pg_sum_delta, p);
  ::decode(stamp_delta, p);
  // available space per CRUSH rule
  ::decode(avail_space_by_rule, p);
  DECODE_FINISH(p);
}
71
72 void PGMapDigest::dump(Formatter *f) const
73 {
74 f->dump_unsigned("num_pg", num_pg);
75 f->dump_unsigned("num_pg_active", num_pg_active);
76 f->dump_unsigned("num_pg_unknown", num_pg_unknown);
77 f->dump_unsigned("num_osd", num_osd);
78 f->dump_object("pool_sum", pg_sum);
79 f->dump_object("osd_sum", osd_sum);
80 f->open_array_section("pool_stats");
81 for (auto& p : pg_pool_sum) {
82 f->open_object_section("pool_stat");
83 f->dump_int("poolid", p.first);
84 auto q = num_pg_by_pool.find(p.first);
85 if (q != num_pg_by_pool.end())
86 f->dump_unsigned("num_pg", q->second);
87 p.second.dump(f);
88 f->close_section();
89 }
90 f->close_section();
91 f->open_array_section("osd_stats");
92 int i = 0;
93 // TODO: this isn't really correct since we can dump non-existent OSDs
94 // I dunno what osd_last_seq is set to in that case...
95 for (auto& p : osd_last_seq) {
96 f->open_object_section("osd_stat");
97 f->dump_int("osd", i);
98 f->dump_unsigned("seq", p);
99 f->close_section();
100 ++i;
101 }
102 f->close_section();
103 f->open_array_section("num_pg_by_state");
104 for (auto& p : num_pg_by_state) {
105 f->open_object_section("count");
106 f->dump_string("state", pg_state_string(p.first));
107 f->dump_unsigned("num", p.second);
108 f->close_section();
109 }
110 f->close_section();
111 f->open_array_section("num_pg_by_osd");
112 for (auto& p : num_pg_by_osd) {
113 f->open_object_section("count");
114 f->dump_unsigned("osd", p.first);
115 f->dump_unsigned("num_primary_pg", p.second.primary);
116 f->dump_unsigned("num_acting_pg", p.second.acting);
117 f->dump_unsigned("num_up_pg", p.second.up);
118 f->close_section();
119 }
120 f->close_section();
121 }
122
123 void PGMapDigest::generate_test_instances(list<PGMapDigest*>& ls)
124 {
125 ls.push_back(new PGMapDigest);
126 }
127
/// Format @p a with two fixed decimals; anything below 0.01 (including
/// negative values) is rendered as a plain "0".
inline std::string percentify(const float& a) {
  if (a < 0.01) {
    return "0";
  }
  std::ostringstream formatted;
  formatted << std::fixed << std::setprecision(2) << a;
  return formatted.str();
}
136
137 void PGMapDigest::print_summary(Formatter *f, ostream *out) const
138 {
139 if (f)
140 f->open_array_section("pgs_by_state");
141
142 // list is descending numeric order (by count)
143 multimap<int,int> state_by_count; // count -> state
144 for (auto p = num_pg_by_state.begin();
145 p != num_pg_by_state.end();
146 ++p) {
147 state_by_count.insert(make_pair(p->second, p->first));
148 }
149 if (f) {
150 for (auto p = state_by_count.rbegin();
151 p != state_by_count.rend();
152 ++p)
153 {
154 f->open_object_section("pgs_by_state_element");
155 f->dump_string("state_name", pg_state_string(p->second));
156 f->dump_unsigned("count", p->first);
157 f->close_section();
158 }
159 }
160 if (f)
161 f->close_section();
162
163 if (f) {
164 f->dump_unsigned("num_pgs", num_pg);
165 f->dump_unsigned("num_pools", pg_pool_sum.size());
166 f->dump_unsigned("num_objects", pg_sum.stats.sum.num_objects);
167 f->dump_unsigned("data_bytes", pg_sum.stats.sum.num_bytes);
168 f->dump_unsigned("bytes_used", osd_sum.kb_used * 1024ull);
169 f->dump_unsigned("bytes_avail", osd_sum.kb_avail * 1024ull);
170 f->dump_unsigned("bytes_total", osd_sum.kb * 1024ull);
171 } else {
172 *out << " pools: " << pg_pool_sum.size() << " pools, "
173 << num_pg << " pgs\n";
174 *out << " objects: " << si_t(pg_sum.stats.sum.num_objects) << " objects, "
175 << prettybyte_t(pg_sum.stats.sum.num_bytes) << "\n";
176 *out << " usage: "
177 << kb_t(osd_sum.kb_used) << " used, "
178 << kb_t(osd_sum.kb_avail) << " / "
179 << kb_t(osd_sum.kb) << " avail\n";
180 *out << " pgs: ";
181 }
182
183 bool pad = false;
184
185 if (num_pg_unknown > 0) {
186 float p = (float)num_pg_unknown / (float)num_pg;
187 if (f) {
188 f->dump_float("unknown_pgs_ratio", p);
189 } else {
190 char b[20];
191 snprintf(b, sizeof(b), "%.3lf", p * 100.0);
192 *out << b << "% pgs unknown\n";
193 pad = true;
194 }
195 }
196
197 int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
198 if (num_pg_inactive > 0) {
199 float p = (float)num_pg_inactive / (float)num_pg;
200 if (f) {
201 f->dump_float("inactive_pgs_ratio", p);
202 } else {
203 if (pad) {
204 *out << " ";
205 }
206 char b[20];
207 snprintf(b, sizeof(b), "%.3f", p * 100.0);
208 *out << b << "% pgs not active\n";
209 pad = true;
210 }
211 }
212
213 list<string> sl;
214 overall_recovery_summary(f, &sl);
215 if (!f && !sl.empty()) {
216 for (auto p = sl.begin(); p != sl.end(); ++p) {
217 if (pad) {
218 *out << " ";
219 }
220 *out << *p << "\n";
221 pad = true;
222 }
223 }
224 sl.clear();
225
226 if (!f) {
227 unsigned max_width = 1;
228 for (multimap<int,int>::reverse_iterator p = state_by_count.rbegin();
229 p != state_by_count.rend();
230 ++p)
231 {
232 std::stringstream ss;
233 ss << p->first;
234 max_width = MAX(ss.str().size(), max_width);
235 }
236
237 for (multimap<int,int>::reverse_iterator p = state_by_count.rbegin();
238 p != state_by_count.rend();
239 ++p)
240 {
241 if (pad) {
242 *out << " ";
243 }
244 pad = true;
245 out->setf(std::ios::left);
246 *out << std::setw(max_width) << p->first
247 << " " << pg_state_string(p->second) << "\n";
248 out->unsetf(std::ios::left);
249 }
250 }
251
252 ostringstream ss_rec_io;
253 overall_recovery_rate_summary(f, &ss_rec_io);
254 ostringstream ss_client_io;
255 overall_client_io_rate_summary(f, &ss_client_io);
256 ostringstream ss_cache_io;
257 overall_cache_io_rate_summary(f, &ss_cache_io);
258
259 if (!f && (ss_client_io.str().length() || ss_rec_io.str().length()
260 || ss_cache_io.str().length())) {
261 *out << "\n \n";
262 *out << " io:\n";
263 }
264
265 if (!f && ss_client_io.str().length())
266 *out << " client: " << ss_client_io.str() << "\n";
267 if (!f && ss_rec_io.str().length())
268 *out << " recovery: " << ss_rec_io.str() << "\n";
269 if (!f && ss_cache_io.str().length())
270 *out << " cache: " << ss_cache_io.str() << "\n";
271 }
272
273 void PGMapDigest::print_oneline_summary(Formatter *f, ostream *out) const
274 {
275 std::stringstream ss;
276
277 if (f)
278 f->open_array_section("num_pg_by_state");
279 for (auto p = num_pg_by_state.begin();
280 p != num_pg_by_state.end();
281 ++p) {
282 if (f) {
283 f->open_object_section("state");
284 f->dump_string("name", pg_state_string(p->first));
285 f->dump_unsigned("num", p->second);
286 f->close_section();
287 }
288 if (p != num_pg_by_state.begin())
289 ss << ", ";
290 ss << p->second << " " << pg_state_string(p->first);
291 }
292 if (f)
293 f->close_section();
294
295 string states = ss.str();
296 if (out)
297 *out << num_pg << " pgs: "
298 << states << "; "
299 << prettybyte_t(pg_sum.stats.sum.num_bytes) << " data, "
300 << kb_t(osd_sum.kb_used) << " used, "
301 << kb_t(osd_sum.kb_avail) << " / "
302 << kb_t(osd_sum.kb) << " avail";
303 if (f) {
304 f->dump_unsigned("num_pgs", num_pg);
305 f->dump_unsigned("num_bytes", pg_sum.stats.sum.num_bytes);
306 f->dump_unsigned("raw_bytes_used", osd_sum.kb_used << 10);
307 f->dump_unsigned("raw_bytes_avail", osd_sum.kb_avail << 10);
308 f->dump_unsigned("raw_bytes", osd_sum.kb << 10);
309 }
310
311 // make non-negative; we can get negative values if osds send
312 // uncommitted stats and then "go backward" or if they are just
313 // buggy/wrong.
314 pool_stat_t pos_delta = pg_sum_delta;
315 pos_delta.floor(0);
316 if (pos_delta.stats.sum.num_rd ||
317 pos_delta.stats.sum.num_wr) {
318 if (out)
319 *out << "; ";
320 if (pos_delta.stats.sum.num_rd) {
321 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)stamp_delta;
322 if (out)
323 *out << pretty_si_t(rd) << "B/s rd, ";
324 if (f)
325 f->dump_unsigned("read_bytes_sec", rd);
326 }
327 if (pos_delta.stats.sum.num_wr) {
328 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)stamp_delta;
329 if (out)
330 *out << pretty_si_t(wr) << "B/s wr, ";
331 if (f)
332 f->dump_unsigned("write_bytes_sec", wr);
333 }
334 int64_t iops = (pos_delta.stats.sum.num_rd + pos_delta.stats.sum.num_wr) / (double)stamp_delta;
335 if (out)
336 *out << pretty_si_t(iops) << "op/s";
337 if (f)
338 f->dump_unsigned("io_sec", iops);
339 }
340
341 list<string> sl;
342 overall_recovery_summary(f, &sl);
343 if (out)
344 for (auto p = sl.begin(); p != sl.end(); ++p)
345 *out << "; " << *p;
346 std::stringstream ssr;
347 overall_recovery_rate_summary(f, &ssr);
348 if (out && ssr.str().length())
349 *out << "; " << ssr.str() << " recovering";
350 }
351
352 void PGMapDigest::recovery_summary(Formatter *f, list<string> *psl,
353 const pool_stat_t& delta_sum) const
354 {
355 if (delta_sum.stats.sum.num_objects_degraded && delta_sum.stats.sum.num_object_copies > 0) {
356 double pc = (double)delta_sum.stats.sum.num_objects_degraded /
357 (double)delta_sum.stats.sum.num_object_copies * (double)100.0;
358 char b[20];
359 snprintf(b, sizeof(b), "%.3lf", pc);
360 if (f) {
361 f->dump_unsigned("degraded_objects", delta_sum.stats.sum.num_objects_degraded);
362 f->dump_unsigned("degraded_total", delta_sum.stats.sum.num_object_copies);
363 f->dump_float("degraded_ratio", pc / 100.0);
364 } else {
365 ostringstream ss;
366 ss << delta_sum.stats.sum.num_objects_degraded
367 << "/" << delta_sum.stats.sum.num_object_copies << " objects degraded (" << b << "%)";
368 psl->push_back(ss.str());
369 }
370 }
371 if (delta_sum.stats.sum.num_objects_misplaced && delta_sum.stats.sum.num_object_copies > 0) {
372 double pc = (double)delta_sum.stats.sum.num_objects_misplaced /
373 (double)delta_sum.stats.sum.num_object_copies * (double)100.0;
374 char b[20];
375 snprintf(b, sizeof(b), "%.3lf", pc);
376 if (f) {
377 f->dump_unsigned("misplaced_objects", delta_sum.stats.sum.num_objects_misplaced);
378 f->dump_unsigned("misplaced_total", delta_sum.stats.sum.num_object_copies);
379 f->dump_float("misplaced_ratio", pc / 100.0);
380 } else {
381 ostringstream ss;
382 ss << delta_sum.stats.sum.num_objects_misplaced
383 << "/" << delta_sum.stats.sum.num_object_copies << " objects misplaced (" << b << "%)";
384 psl->push_back(ss.str());
385 }
386 }
387 if (delta_sum.stats.sum.num_objects_unfound && delta_sum.stats.sum.num_objects) {
388 double pc = (double)delta_sum.stats.sum.num_objects_unfound /
389 (double)delta_sum.stats.sum.num_objects * (double)100.0;
390 char b[20];
391 snprintf(b, sizeof(b), "%.3lf", pc);
392 if (f) {
393 f->dump_unsigned("unfound_objects", delta_sum.stats.sum.num_objects_unfound);
394 f->dump_unsigned("unfound_total", delta_sum.stats.sum.num_objects);
395 f->dump_float("unfound_ratio", pc / 100.0);
396 } else {
397 ostringstream ss;
398 ss << delta_sum.stats.sum.num_objects_unfound
399 << "/" << delta_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
400 psl->push_back(ss.str());
401 }
402 }
403 }
404
405 void PGMapDigest::recovery_rate_summary(Formatter *f, ostream *out,
406 const pool_stat_t& delta_sum,
407 utime_t delta_stamp) const
408 {
409 // make non-negative; we can get negative values if osds send
410 // uncommitted stats and then "go backward" or if they are just
411 // buggy/wrong.
412 pool_stat_t pos_delta = delta_sum;
413 pos_delta.floor(0);
414 if (pos_delta.stats.sum.num_objects_recovered ||
415 pos_delta.stats.sum.num_bytes_recovered ||
416 pos_delta.stats.sum.num_keys_recovered) {
417 int64_t objps = pos_delta.stats.sum.num_objects_recovered / (double)delta_stamp;
418 int64_t bps = pos_delta.stats.sum.num_bytes_recovered / (double)delta_stamp;
419 int64_t kps = pos_delta.stats.sum.num_keys_recovered / (double)delta_stamp;
420 if (f) {
421 f->dump_int("recovering_objects_per_sec", objps);
422 f->dump_int("recovering_bytes_per_sec", bps);
423 f->dump_int("recovering_keys_per_sec", kps);
424 f->dump_int("num_objects_recovered", pos_delta.stats.sum.num_objects_recovered);
425 f->dump_int("num_bytes_recovered", pos_delta.stats.sum.num_bytes_recovered);
426 f->dump_int("num_keys_recovered", pos_delta.stats.sum.num_keys_recovered);
427 } else {
428 *out << pretty_si_t(bps) << "B/s";
429 if (pos_delta.stats.sum.num_keys_recovered)
430 *out << ", " << pretty_si_t(kps) << "keys/s";
431 *out << ", " << pretty_si_t(objps) << "objects/s";
432 }
433 }
434 }
435
436 void PGMapDigest::overall_recovery_rate_summary(Formatter *f, ostream *out) const
437 {
438 recovery_rate_summary(f, out, pg_sum_delta, stamp_delta);
439 }
440
441 void PGMapDigest::overall_recovery_summary(Formatter *f, list<string> *psl) const
442 {
443 recovery_summary(f, psl, pg_sum);
444 }
445
446 void PGMapDigest::pool_recovery_rate_summary(Formatter *f, ostream *out,
447 uint64_t poolid) const
448 {
449 auto p = per_pool_sum_delta.find(poolid);
450 if (p == per_pool_sum_delta.end())
451 return;
452
453 auto ts = per_pool_sum_deltas_stamps.find(p->first);
454 assert(ts != per_pool_sum_deltas_stamps.end());
455 recovery_rate_summary(f, out, p->second.first, ts->second);
456 }
457
458 void PGMapDigest::pool_recovery_summary(Formatter *f, list<string> *psl,
459 uint64_t poolid) const
460 {
461 auto p = per_pool_sum_delta.find(poolid);
462 if (p == per_pool_sum_delta.end())
463 return;
464
465 recovery_summary(f, psl, p->second.first);
466 }
467
468 void PGMapDigest::client_io_rate_summary(Formatter *f, ostream *out,
469 const pool_stat_t& delta_sum,
470 utime_t delta_stamp) const
471 {
472 pool_stat_t pos_delta = delta_sum;
473 pos_delta.floor(0);
474 if (pos_delta.stats.sum.num_rd ||
475 pos_delta.stats.sum.num_wr) {
476 if (pos_delta.stats.sum.num_rd) {
477 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)delta_stamp;
478 if (f) {
479 f->dump_int("read_bytes_sec", rd);
480 } else {
481 *out << pretty_si_t(rd) << "B/s rd, ";
482 }
483 }
484 if (pos_delta.stats.sum.num_wr) {
485 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)delta_stamp;
486 if (f) {
487 f->dump_int("write_bytes_sec", wr);
488 } else {
489 *out << pretty_si_t(wr) << "B/s wr, ";
490 }
491 }
492 int64_t iops_rd = pos_delta.stats.sum.num_rd / (double)delta_stamp;
493 int64_t iops_wr = pos_delta.stats.sum.num_wr / (double)delta_stamp;
494 if (f) {
495 f->dump_int("read_op_per_sec", iops_rd);
496 f->dump_int("write_op_per_sec", iops_wr);
497 } else {
498 *out << pretty_si_t(iops_rd) << "op/s rd, " << pretty_si_t(iops_wr) << "op/s wr";
499 }
500 }
501 }
502
503 void PGMapDigest::overall_client_io_rate_summary(Formatter *f, ostream *out) const
504 {
505 client_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
506 }
507
508 void PGMapDigest::pool_client_io_rate_summary(Formatter *f, ostream *out,
509 uint64_t poolid) const
510 {
511 auto p = per_pool_sum_delta.find(poolid);
512 if (p == per_pool_sum_delta.end())
513 return;
514
515 auto ts = per_pool_sum_deltas_stamps.find(p->first);
516 assert(ts != per_pool_sum_deltas_stamps.end());
517 client_io_rate_summary(f, out, p->second.first, ts->second);
518 }
519
520 void PGMapDigest::cache_io_rate_summary(Formatter *f, ostream *out,
521 const pool_stat_t& delta_sum,
522 utime_t delta_stamp) const
523 {
524 pool_stat_t pos_delta = delta_sum;
525 pos_delta.floor(0);
526 bool have_output = false;
527
528 if (pos_delta.stats.sum.num_flush) {
529 int64_t flush = (pos_delta.stats.sum.num_flush_kb << 10) / (double)delta_stamp;
530 if (f) {
531 f->dump_int("flush_bytes_sec", flush);
532 } else {
533 *out << pretty_si_t(flush) << "B/s flush";
534 have_output = true;
535 }
536 }
537 if (pos_delta.stats.sum.num_evict) {
538 int64_t evict = (pos_delta.stats.sum.num_evict_kb << 10) / (double)delta_stamp;
539 if (f) {
540 f->dump_int("evict_bytes_sec", evict);
541 } else {
542 if (have_output)
543 *out << ", ";
544 *out << pretty_si_t(evict) << "B/s evict";
545 have_output = true;
546 }
547 }
548 if (pos_delta.stats.sum.num_promote) {
549 int64_t promote = pos_delta.stats.sum.num_promote / (double)delta_stamp;
550 if (f) {
551 f->dump_int("promote_op_per_sec", promote);
552 } else {
553 if (have_output)
554 *out << ", ";
555 *out << pretty_si_t(promote) << "op/s promote";
556 have_output = true;
557 }
558 }
559 if (pos_delta.stats.sum.num_flush_mode_low) {
560 if (f) {
561 f->dump_int("num_flush_mode_low", pos_delta.stats.sum.num_flush_mode_low);
562 } else {
563 if (have_output)
564 *out << ", ";
565 *out << pretty_si_t(pos_delta.stats.sum.num_flush_mode_low) << "PG(s) flushing";
566 have_output = true;
567 }
568 }
569 if (pos_delta.stats.sum.num_flush_mode_high) {
570 if (f) {
571 f->dump_int("num_flush_mode_high", pos_delta.stats.sum.num_flush_mode_high);
572 } else {
573 if (have_output)
574 *out << ", ";
575 *out << pretty_si_t(pos_delta.stats.sum.num_flush_mode_high) << "PG(s) flushing (high)";
576 have_output = true;
577 }
578 }
579 if (pos_delta.stats.sum.num_evict_mode_some) {
580 if (f) {
581 f->dump_int("num_evict_mode_some", pos_delta.stats.sum.num_evict_mode_some);
582 } else {
583 if (have_output)
584 *out << ", ";
585 *out << pretty_si_t(pos_delta.stats.sum.num_evict_mode_some) << "PG(s) evicting";
586 have_output = true;
587 }
588 }
589 if (pos_delta.stats.sum.num_evict_mode_full) {
590 if (f) {
591 f->dump_int("num_evict_mode_full", pos_delta.stats.sum.num_evict_mode_full);
592 } else {
593 if (have_output)
594 *out << ", ";
595 *out << pretty_si_t(pos_delta.stats.sum.num_evict_mode_full) << "PG(s) evicting (full)";
596 }
597 }
598 }
599
600 void PGMapDigest::overall_cache_io_rate_summary(Formatter *f, ostream *out) const
601 {
602 cache_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
603 }
604
605 void PGMapDigest::pool_cache_io_rate_summary(Formatter *f, ostream *out,
606 uint64_t poolid) const
607 {
608 auto p = per_pool_sum_delta.find(poolid);
609 if (p == per_pool_sum_delta.end())
610 return;
611
612 auto ts = per_pool_sum_deltas_stamps.find(p->first);
613 assert(ts != per_pool_sum_deltas_stamps.end());
614 cache_io_rate_summary(f, out, p->second.first, ts->second);
615 }
616
617 static float pool_raw_used_rate(const OSDMap &osd_map, int64_t poolid)
618 {
619 const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
620
621 switch (pool->get_type()) {
622 case pg_pool_t::TYPE_REPLICATED:
623 return pool->get_size();
624 break;
625 case pg_pool_t::TYPE_ERASURE:
626 {
627 auto& ecp =
628 osd_map.get_erasure_code_profile(pool->erasure_code_profile);
629 auto pm = ecp.find("m");
630 auto pk = ecp.find("k");
631 if (pm != ecp.end() && pk != ecp.end()) {
632 int k = atoi(pk->second.c_str());
633 int m = atoi(pm->second.c_str());
634 int mk = m + k;
635 assert(mk != 0);
636 assert(k != 0);
637 return (float)mk / k;
638 } else {
639 return 0.0;
640 }
641 }
642 break;
643 default:
644 assert(0 == "unrecognized pool type");
645 }
646 }
647
648 ceph_statfs PGMapDigest::get_statfs(OSDMap &osdmap,
649 boost::optional<int64_t> data_pool) const
650 {
651 ceph_statfs statfs;
652 bool filter = false;
653 object_stat_sum_t sum;
654
655 if (data_pool) {
656 auto i = pg_pool_sum.find(*data_pool);
657 if (i != pg_pool_sum.end()) {
658 sum = i->second.stats.sum;
659 filter = true;
660 }
661 }
662
663 if (filter) {
664 statfs.kb_used = (sum.num_bytes >> 10);
665 statfs.kb_avail = get_pool_free_space(osdmap, *data_pool) >> 10;
666 statfs.num_objects = sum.num_objects;
667 statfs.kb = statfs.kb_used + statfs.kb_avail;
668 } else {
669 // these are in KB.
670 statfs.kb = osd_sum.kb;
671 statfs.kb_used = osd_sum.kb_used;
672 statfs.kb_avail = osd_sum.kb_avail;
673 statfs.num_objects = pg_sum.stats.sum.num_objects;
674 }
675
676 return statfs;
677 }
678
679 void PGMapDigest::dump_pool_stats_full(
680 const OSDMap &osd_map,
681 stringstream *ss,
682 Formatter *f,
683 bool verbose) const
684 {
685 TextTable tbl;
686
687 if (f) {
688 f->open_array_section("pools");
689 } else {
690 tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
691 tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT);
692 if (verbose) {
693 tbl.define_column("QUOTA OBJECTS", TextTable::LEFT, TextTable::LEFT);
694 tbl.define_column("QUOTA BYTES", TextTable::LEFT, TextTable::LEFT);
695 }
696
697 tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
698 tbl.define_column("%USED", TextTable::LEFT, TextTable::RIGHT);
699 tbl.define_column("MAX AVAIL", TextTable::LEFT, TextTable::RIGHT);
700 tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
701 if (verbose) {
702 tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT);
703 tbl.define_column("READ", TextTable::LEFT, TextTable::RIGHT);
704 tbl.define_column("WRITE", TextTable::LEFT, TextTable::RIGHT);
705 tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
706 }
707 }
708
709 map<int,uint64_t> avail_by_rule;
710 for (auto p = osd_map.get_pools().begin();
711 p != osd_map.get_pools().end(); ++p) {
712 int64_t pool_id = p->first;
713 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
714 continue;
715 const string& pool_name = osd_map.get_pool_name(pool_id);
716 const pool_stat_t &stat = pg_pool_sum.at(pool_id);
717
718 const pg_pool_t *pool = osd_map.get_pg_pool(pool_id);
719 int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
720 pool->get_type(),
721 pool->get_size());
722 int64_t avail;
723 float raw_used_rate;
724 if (avail_by_rule.count(ruleno) == 0) {
725 // FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked
726 avail = get_rule_avail(ruleno);
727 if (avail < 0)
728 avail = 0;
729 avail_by_rule[ruleno] = avail;
730 } else {
731 avail = avail_by_rule[ruleno];
732 }
733
734 raw_used_rate = ::pool_raw_used_rate(osd_map, pool_id);
735
736 if (f) {
737 f->open_object_section("pool");
738 f->dump_string("name", pool_name);
739 f->dump_int("id", pool_id);
740 f->open_object_section("stats");
741 } else {
742 tbl << pool_name
743 << pool_id;
744 if (verbose) {
745 if (pool->quota_max_objects == 0)
746 tbl << "N/A";
747 else
748 tbl << si_t(pool->quota_max_objects);
749
750 if (pool->quota_max_bytes == 0)
751 tbl << "N/A";
752 else
753 tbl << si_t(pool->quota_max_bytes);
754 }
755
756 }
757 dump_object_stat_sum(tbl, f, stat.stats.sum, avail, raw_used_rate, verbose, pool);
758 if (f)
759 f->close_section(); // stats
760 else
761 tbl << TextTable::endrow;
762
763 if (f)
764 f->close_section(); // pool
765 }
766 if (f)
767 f->close_section();
768 else {
769 assert(ss != nullptr);
770 *ss << "POOLS:\n";
771 tbl.set_indent(4);
772 *ss << tbl;
773 }
774 }
775
776 void PGMapDigest::dump_fs_stats(stringstream *ss, Formatter *f, bool verbose) const
777 {
778 if (f) {
779 f->open_object_section("stats");
780 f->dump_int("total_bytes", osd_sum.kb * 1024ull);
781 f->dump_int("total_used_bytes", osd_sum.kb_used * 1024ull);
782 f->dump_int("total_avail_bytes", osd_sum.kb_avail * 1024ull);
783 if (verbose) {
784 f->dump_int("total_objects", pg_sum.stats.sum.num_objects);
785 }
786 f->close_section();
787 } else {
788 assert(ss != nullptr);
789 TextTable tbl;
790 tbl.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
791 tbl.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
792 tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
793 tbl.define_column("%RAW USED", TextTable::LEFT, TextTable::RIGHT);
794 if (verbose) {
795 tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
796 }
797 tbl << stringify(si_t(osd_sum.kb*1024))
798 << stringify(si_t(osd_sum.kb_avail*1024))
799 << stringify(si_t(osd_sum.kb_used*1024));
800 float used = 0.0;
801 if (osd_sum.kb > 0) {
802 used = ((float)osd_sum.kb_used / osd_sum.kb);
803 }
804 tbl << percentify(used*100);
805 if (verbose) {
806 tbl << stringify(si_t(pg_sum.stats.sum.num_objects));
807 }
808 tbl << TextTable::endrow;
809
810 *ss << "GLOBAL:\n";
811 tbl.set_indent(4);
812 *ss << tbl;
813 }
814 }
815
816 void PGMapDigest::dump_object_stat_sum(
817 TextTable &tbl, Formatter *f,
818 const object_stat_sum_t &sum, uint64_t avail,
819 float raw_used_rate, bool verbose,
820 const pg_pool_t *pool)
821 {
822 float curr_object_copies_rate = 0.0;
823 if (sum.num_object_copies > 0)
824 curr_object_copies_rate = (float)(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies;
825
826 float used = 0.0;
827 // note avail passed in is raw_avail, calc raw_used here.
828 if (avail) {
829 used = sum.num_bytes * raw_used_rate * curr_object_copies_rate;
830 used /= used + avail;
831 } else if (sum.num_bytes) {
832 used = 1.0;
833 }
834
835 if (f) {
836 f->dump_int("kb_used", SHIFT_ROUND_UP(sum.num_bytes, 10));
837 f->dump_int("bytes_used", sum.num_bytes);
838 f->dump_format_unquoted("percent_used", "%.2f", (used*100));
839 f->dump_unsigned("max_avail", avail / raw_used_rate);
840 f->dump_int("objects", sum.num_objects);
841 if (verbose) {
842 f->dump_int("quota_objects", pool->quota_max_objects);
843 f->dump_int("quota_bytes", pool->quota_max_bytes);
844 f->dump_int("dirty", sum.num_objects_dirty);
845 f->dump_int("rd", sum.num_rd);
846 f->dump_int("rd_bytes", sum.num_rd_kb * 1024ull);
847 f->dump_int("wr", sum.num_wr);
848 f->dump_int("wr_bytes", sum.num_wr_kb * 1024ull);
849 f->dump_int("raw_bytes_used", sum.num_bytes * raw_used_rate * curr_object_copies_rate);
850 }
851 } else {
852 tbl << stringify(si_t(sum.num_bytes));
853 tbl << percentify(used*100);
854 tbl << si_t(avail / raw_used_rate);
855 tbl << sum.num_objects;
856 if (verbose) {
857 tbl << stringify(si_t(sum.num_objects_dirty))
858 << stringify(si_t(sum.num_rd))
859 << stringify(si_t(sum.num_wr))
860 << stringify(si_t(sum.num_bytes * raw_used_rate * curr_object_copies_rate));
861 }
862 }
863 }
864
865 int64_t PGMapDigest::get_pool_free_space(const OSDMap &osd_map,
866 int64_t poolid) const
867 {
868 const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
869 int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
870 pool->get_type(),
871 pool->get_size());
872 int64_t avail;
873 avail = get_rule_avail(ruleno);
874 if (avail < 0)
875 avail = 0;
876
877 return avail / ::pool_raw_used_rate(osd_map, poolid);
878 }
879
880 int64_t PGMap::get_rule_avail(const OSDMap& osdmap, int ruleno) const
881 {
882 map<int,float> wm;
883 int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm);
884 if (r < 0) {
885 return r;
886 }
887 if (wm.empty()) {
888 return 0;
889 }
890
891 float fratio;
892 if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
893 osdmap.get_full_ratio() > 0) {
894 fratio = osdmap.get_full_ratio();
895 } else {
896 fratio = get_fallback_full_ratio();
897 }
898
899 int64_t min = -1;
900 for (auto p = wm.begin(); p != wm.end(); ++p) {
901 auto osd_info = osd_stat.find(p->first);
902 if (osd_info != osd_stat.end()) {
903 if (osd_info->second.kb == 0 || p->second == 0) {
904 // osd must be out, hence its stats have been zeroed
905 // (unless we somehow managed to have a disk with size 0...)
906 //
907 // (p->second == 0), if osd weight is 0, no need to
908 // calculate proj below.
909 continue;
910 }
911 double unusable = (double)osd_info->second.kb *
912 (1.0 - fratio);
913 double avail = MAX(0.0, (double)osd_info->second.kb_avail - unusable);
914 avail *= 1024.0;
915 int64_t proj = (int64_t)(avail / (double)p->second);
916 if (min < 0 || proj < min) {
917 min = proj;
918 }
919 } else {
920 dout(0) << "Cannot get stat of OSD " << p->first << dendl;
921 }
922 }
923 return min;
924 }
925
926 void PGMap::get_rules_avail(const OSDMap& osdmap,
927 std::map<int,int64_t> *avail_map) const
928 {
929 avail_map->clear();
930 for (auto p : osdmap.get_pools()) {
931 int64_t pool_id = p.first;
932 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
933 continue;
934 const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
935 int ruleno = osdmap.crush->find_rule(pool->get_crush_rule(),
936 pool->get_type(),
937 pool->get_size());
938 if (avail_map->count(ruleno) == 0)
939 (*avail_map)[ruleno] = get_rule_avail(osdmap, ruleno);
940 }
941 }
942
943 // ---------------------
944 // PGMap
945
946 void PGMap::Incremental::encode(bufferlist &bl, uint64_t features) const
947 {
948 if ((features & CEPH_FEATURE_MONENC) == 0) {
949 __u8 v = 4;
950 ::encode(v, bl);
951 ::encode(version, bl);
952 ::encode(pg_stat_updates, bl);
953 ::encode(osd_stat_updates, bl);
954 ::encode(osd_stat_rm, bl);
955 ::encode(osdmap_epoch, bl);
956 ::encode(pg_scan, bl);
957 ::encode(full_ratio, bl);
958 ::encode(nearfull_ratio, bl);
959 ::encode(pg_remove, bl);
960 return;
961 }
962
963 ENCODE_START(7, 5, bl);
964 ::encode(version, bl);
965 ::encode(pg_stat_updates, bl);
966 ::encode(osd_stat_updates, bl);
967 ::encode(osd_stat_rm, bl);
968 ::encode(osdmap_epoch, bl);
969 ::encode(pg_scan, bl);
970 ::encode(full_ratio, bl);
971 ::encode(nearfull_ratio, bl);
972 ::encode(pg_remove, bl);
973 ::encode(stamp, bl);
974 ::encode(osd_epochs, bl);
975 ENCODE_FINISH(bl);
976 }
977
978 void PGMap::Incremental::decode(bufferlist::iterator &bl)
979 {
980 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
981 ::decode(version, bl);
982 if (struct_v < 3) {
983 pg_stat_updates.clear();
984 __u32 n;
985 ::decode(n, bl);
986 while (n--) {
987 old_pg_t opgid;
988 ::decode(opgid, bl);
989 pg_t pgid = opgid;
990 ::decode(pg_stat_updates[pgid], bl);
991 }
992 } else {
993 ::decode(pg_stat_updates, bl);
994 }
995 ::decode(osd_stat_updates, bl);
996 ::decode(osd_stat_rm, bl);
997 ::decode(osdmap_epoch, bl);
998 ::decode(pg_scan, bl);
999 if (struct_v >= 2) {
1000 ::decode(full_ratio, bl);
1001 ::decode(nearfull_ratio, bl);
1002 }
1003 if (struct_v < 3) {
1004 pg_remove.clear();
1005 __u32 n;
1006 ::decode(n, bl);
1007 while (n--) {
1008 old_pg_t opgid;
1009 ::decode(opgid, bl);
1010 pg_remove.insert(pg_t(opgid));
1011 }
1012 } else {
1013 ::decode(pg_remove, bl);
1014 }
1015 if (struct_v < 4 && full_ratio == 0) {
1016 full_ratio = -1;
1017 }
1018 if (struct_v < 4 && nearfull_ratio == 0) {
1019 nearfull_ratio = -1;
1020 }
1021 if (struct_v >= 6)
1022 ::decode(stamp, bl);
1023 if (struct_v >= 7) {
1024 ::decode(osd_epochs, bl);
1025 } else {
1026 for (auto i = osd_stat_updates.begin();
1027 i != osd_stat_updates.end();
1028 ++i) {
1029 // This isn't accurate, but will cause trimming to behave like
1030 // previously.
1031 osd_epochs.insert(make_pair(i->first, osdmap_epoch));
1032 }
1033 }
1034 DECODE_FINISH(bl);
1035 }
1036
1037 void PGMap::Incremental::dump(Formatter *f) const
1038 {
1039 f->dump_unsigned("version", version);
1040 f->dump_stream("stamp") << stamp;
1041 f->dump_unsigned("osdmap_epoch", osdmap_epoch);
1042 f->dump_unsigned("pg_scan_epoch", pg_scan);
1043 f->dump_float("full_ratio", full_ratio);
1044 f->dump_float("nearfull_ratio", nearfull_ratio);
1045
1046 f->open_array_section("pg_stat_updates");
1047 for (auto p = pg_stat_updates.begin(); p != pg_stat_updates.end(); ++p) {
1048 f->open_object_section("pg_stat");
1049 f->dump_stream("pgid") << p->first;
1050 p->second.dump(f);
1051 f->close_section();
1052 }
1053 f->close_section();
1054
1055 f->open_array_section("osd_stat_updates");
1056 for (auto p = osd_stat_updates.begin(); p != osd_stat_updates.end(); ++p) {
1057 f->open_object_section("osd_stat");
1058 f->dump_int("osd", p->first);
1059 p->second.dump(f);
1060 f->close_section();
1061 }
1062 f->close_section();
1063
1064 f->open_array_section("osd_stat_removals");
1065 for (auto p = osd_stat_rm.begin(); p != osd_stat_rm.end(); ++p)
1066 f->dump_int("osd", *p);
1067 f->close_section();
1068
1069 f->open_array_section("pg_removals");
1070 for (auto p = pg_remove.begin(); p != pg_remove.end(); ++p)
1071 f->dump_stream("pgid") << *p;
1072 f->close_section();
1073 }
1074
1075 void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
1076 {
1077 o.push_back(new Incremental);
1078 o.push_back(new Incremental);
1079 o.back()->version = 1;
1080 o.back()->stamp = utime_t(123,345);
1081 o.push_back(new Incremental);
1082 o.back()->version = 2;
1083 o.back()->pg_stat_updates[pg_t(1,2,3)] = pg_stat_t();
1084 o.back()->osd_stat_updates[5] = osd_stat_t();
1085 o.back()->osd_epochs[5] = 12;
1086 o.push_back(new Incremental);
1087 o.back()->version = 3;
1088 o.back()->osdmap_epoch = 1;
1089 o.back()->pg_scan = 2;
1090 o.back()->full_ratio = .2;
1091 o.back()->nearfull_ratio = .3;
1092 o.back()->pg_stat_updates[pg_t(4,5,6)] = pg_stat_t();
1093 o.back()->osd_stat_updates[6] = osd_stat_t();
1094 o.back()->osd_epochs[6] = 12;
1095 o.back()->pg_remove.insert(pg_t(1,2,3));
1096 o.back()->osd_stat_rm.insert(5);
1097 }
1098
1099
1100 // --
1101
1102 void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
1103 {
1104 assert(inc.version == version+1);
1105 version++;
1106
1107 utime_t delta_t;
1108 delta_t = inc.stamp;
1109 delta_t -= stamp;
1110 stamp = inc.stamp;
1111
1112 pool_stat_t pg_sum_old = pg_sum;
1113 mempool::pgmap::unordered_map<uint64_t, pool_stat_t> pg_pool_sum_old;
1114
1115 bool ratios_changed = false;
1116 if (inc.full_ratio != full_ratio && inc.full_ratio != -1) {
1117 full_ratio = inc.full_ratio;
1118 ratios_changed = true;
1119 }
1120 if (inc.nearfull_ratio != nearfull_ratio && inc.nearfull_ratio != -1) {
1121 nearfull_ratio = inc.nearfull_ratio;
1122 ratios_changed = true;
1123 }
1124 if (ratios_changed)
1125 redo_full_sets();
1126
1127 for (auto p = inc.pg_stat_updates.begin();
1128 p != inc.pg_stat_updates.end();
1129 ++p) {
1130 const pg_t &update_pg(p->first);
1131 const pg_stat_t &update_stat(p->second);
1132
1133 if (pg_pool_sum_old.count(update_pg.pool()) == 0)
1134 pg_pool_sum_old[update_pg.pool()] = pg_pool_sum[update_pg.pool()];
1135
1136 auto t = pg_stat.find(update_pg);
1137 if (t == pg_stat.end()) {
1138 pg_stat.insert(make_pair(update_pg, update_stat));
1139 } else {
1140 stat_pg_sub(update_pg, t->second);
1141 t->second = update_stat;
1142 }
1143 stat_pg_add(update_pg, update_stat);
1144 }
1145 assert(osd_stat.size() == osd_epochs.size());
1146 for (auto p = inc.get_osd_stat_updates().begin();
1147 p != inc.get_osd_stat_updates().end();
1148 ++p) {
1149 int osd = p->first;
1150 const osd_stat_t &new_stats(p->second);
1151
1152 auto t = osd_stat.find(osd);
1153 if (t == osd_stat.end()) {
1154 osd_stat.insert(make_pair(osd, new_stats));
1155 } else {
1156 stat_osd_sub(t->first, t->second);
1157 t->second = new_stats;
1158 }
1159 auto i = osd_epochs.find(osd);
1160 auto j = inc.get_osd_epochs().find(osd);
1161 assert(j != inc.get_osd_epochs().end());
1162
1163 if (i == osd_epochs.end())
1164 osd_epochs.insert(*j);
1165 else
1166 i->second = j->second;
1167
1168 stat_osd_add(osd, new_stats);
1169
1170 // adjust [near]full status
1171 register_nearfull_status(osd, new_stats);
1172 }
1173 set<int64_t> deleted_pools;
1174 for (auto p = inc.pg_remove.begin();
1175 p != inc.pg_remove.end();
1176 ++p) {
1177 const pg_t &removed_pg(*p);
1178 auto s = pg_stat.find(removed_pg);
1179 if (s != pg_stat.end()) {
1180 stat_pg_sub(removed_pg, s->second);
1181 pg_stat.erase(s);
1182 }
1183 deleted_pools.insert(removed_pg.pool());
1184 }
1185
1186 for (auto p = inc.get_osd_stat_rm().begin();
1187 p != inc.get_osd_stat_rm().end();
1188 ++p) {
1189 auto t = osd_stat.find(*p);
1190 if (t != osd_stat.end()) {
1191 stat_osd_sub(t->first, t->second);
1192 osd_stat.erase(t);
1193 osd_epochs.erase(*p);
1194 }
1195
1196 // remove these old osds from full/nearfull set(s), too
1197 nearfull_osds.erase(*p);
1198 full_osds.erase(*p);
1199 }
1200
1201 // calculate a delta, and average over the last 2 deltas.
1202 pool_stat_t d = pg_sum;
1203 d.stats.sub(pg_sum_old.stats);
1204 pg_sum_deltas.push_back(make_pair(d, delta_t));
1205 stamp_delta += delta_t;
1206
1207 pg_sum_delta.stats.add(d.stats);
1208 if (pg_sum_deltas.size() > (unsigned)MAX(1, cct ? cct->_conf->mon_stat_smooth_intervals : 1)) {
1209 pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats);
1210 stamp_delta -= pg_sum_deltas.front().second;
1211 pg_sum_deltas.pop_front();
1212 }
1213
1214 update_pool_deltas(cct, inc.stamp, pg_pool_sum_old);
1215
1216 for (auto p : deleted_pools) {
1217 if (cct)
1218 dout(20) << " deleted pool " << p << dendl;
1219 deleted_pool(p);
1220 }
1221
1222 if (inc.osdmap_epoch)
1223 last_osdmap_epoch = inc.osdmap_epoch;
1224 if (inc.pg_scan)
1225 last_pg_scan = inc.pg_scan;
1226
1227 min_last_epoch_clean = 0; // invalidate
1228 }
1229
1230 void PGMap::redo_full_sets()
1231 {
1232 full_osds.clear();
1233 nearfull_osds.clear();
1234 for (auto i = osd_stat.begin();
1235 i != osd_stat.end();
1236 ++i) {
1237 register_nearfull_status(i->first, i->second);
1238 }
1239 }
1240
1241 void PGMap::register_nearfull_status(int osd, const osd_stat_t& s)
1242 {
1243 float ratio = ((float)s.kb_used) / ((float)s.kb);
1244
1245 if (full_ratio > 0 && ratio > full_ratio) {
1246 // full
1247 full_osds.insert(osd);
1248 nearfull_osds.erase(osd);
1249 } else if (nearfull_ratio > 0 && ratio > nearfull_ratio) {
1250 // nearfull
1251 full_osds.erase(osd);
1252 nearfull_osds.insert(osd);
1253 } else {
1254 // ok
1255 full_osds.erase(osd);
1256 nearfull_osds.erase(osd);
1257 }
1258 }
1259
1260 void PGMap::calc_stats()
1261 {
1262 num_pg = 0;
1263 num_pg_active = 0;
1264 num_pg_unknown = 0;
1265 num_osd = 0;
1266 pg_pool_sum.clear();
1267 num_pg_by_pool.clear();
1268 pg_by_osd.clear();
1269 pg_sum = pool_stat_t();
1270 osd_sum = osd_stat_t();
1271 num_pg_by_state.clear();
1272 num_pg_by_osd.clear();
1273
1274 for (auto p = pg_stat.begin();
1275 p != pg_stat.end();
1276 ++p) {
1277 stat_pg_add(p->first, p->second);
1278 }
1279 for (auto p = osd_stat.begin();
1280 p != osd_stat.end();
1281 ++p)
1282 stat_osd_add(p->first, p->second);
1283
1284 redo_full_sets();
1285
1286 min_last_epoch_clean = calc_min_last_epoch_clean();
1287 }
1288
1289 void PGMap::update_pg(pg_t pgid, bufferlist& bl)
1290 {
1291 bufferlist::iterator p = bl.begin();
1292 auto s = pg_stat.find(pgid);
1293 epoch_t old_lec = 0, lec;
1294 if (s != pg_stat.end()) {
1295 old_lec = s->second.get_effective_last_epoch_clean();
1296 stat_pg_update(pgid, s->second, p);
1297 lec = s->second.get_effective_last_epoch_clean();
1298 } else {
1299 pg_stat_t& r = pg_stat[pgid];
1300 ::decode(r, p);
1301 stat_pg_add(pgid, r);
1302 lec = r.get_effective_last_epoch_clean();
1303 }
1304
1305 if (min_last_epoch_clean &&
1306 (lec < min_last_epoch_clean || // we did
1307 (lec > min_last_epoch_clean && // we might
1308 old_lec == min_last_epoch_clean)
1309 ))
1310 min_last_epoch_clean = 0;
1311 }
1312
1313 void PGMap::remove_pg(pg_t pgid)
1314 {
1315 auto s = pg_stat.find(pgid);
1316 if (s != pg_stat.end()) {
1317 if (min_last_epoch_clean &&
1318 s->second.get_effective_last_epoch_clean() == min_last_epoch_clean)
1319 min_last_epoch_clean = 0;
1320 stat_pg_sub(pgid, s->second);
1321 pg_stat.erase(s);
1322 }
1323 }
1324
1325 void PGMap::update_osd(int osd, bufferlist& bl)
1326 {
1327 bufferlist::iterator p = bl.begin();
1328 auto o = osd_stat.find(osd);
1329 epoch_t old_lec = 0;
1330 if (o != osd_stat.end()) {
1331 auto i = osd_epochs.find(osd);
1332 if (i != osd_epochs.end())
1333 old_lec = i->second;
1334 stat_osd_sub(osd, o->second);
1335 }
1336 osd_stat_t& r = osd_stat[osd];
1337 ::decode(r, p);
1338 stat_osd_add(osd, r);
1339
1340 // adjust [near]full status
1341 register_nearfull_status(osd, r);
1342
1343 // epoch?
1344 if (!p.end()) {
1345 epoch_t e;
1346 ::decode(e, p);
1347
1348 if (e < min_last_epoch_clean ||
1349 (e > min_last_epoch_clean &&
1350 old_lec == min_last_epoch_clean))
1351 min_last_epoch_clean = 0;
1352 } else {
1353 // WARNING: we are not refreshing min_last_epoch_clean! must be old store
1354 // or old mon running.
1355 }
1356 }
1357
1358 void PGMap::remove_osd(int osd)
1359 {
1360 auto o = osd_stat.find(osd);
1361 if (o != osd_stat.end()) {
1362 stat_osd_sub(osd, o->second);
1363 osd_stat.erase(o);
1364
1365 // remove these old osds from full/nearfull set(s), too
1366 nearfull_osds.erase(osd);
1367 full_osds.erase(osd);
1368 }
1369 }
1370
1371 void PGMap::stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
1372 bool sameosds)
1373 {
1374 pg_pool_sum[pgid.pool()].add(s);
1375 pg_sum.add(s);
1376
1377 num_pg++;
1378 num_pg_by_state[s.state]++;
1379 num_pg_by_pool[pgid.pool()]++;
1380
1381 if ((s.state & PG_STATE_CREATING) &&
1382 s.parent_split_bits == 0) {
1383 creating_pgs.insert(pgid);
1384 if (s.acting_primary >= 0) {
1385 creating_pgs_by_osd_epoch[s.acting_primary][s.mapping_epoch].insert(pgid);
1386 }
1387 }
1388
1389 if (s.state & PG_STATE_ACTIVE) {
1390 ++num_pg_active;
1391 }
1392 if (s.state == 0) {
1393 ++num_pg_unknown;
1394 }
1395
1396 if (sameosds)
1397 return;
1398
1399 for (auto p = s.blocked_by.begin();
1400 p != s.blocked_by.end();
1401 ++p) {
1402 ++blocked_by_sum[*p];
1403 }
1404
1405 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
1406 pg_by_osd[*p].insert(pgid);
1407 num_pg_by_osd[*p].acting++;
1408 }
1409 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
1410 pg_by_osd[*p].insert(pgid);
1411 num_pg_by_osd[*p].up++;
1412 }
1413
1414 if (s.up_primary >= 0) {
1415 num_pg_by_osd[s.up_primary].primary++;
1416 }
1417 }
1418
1419 void PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
1420 bool sameosds)
1421 {
1422 pool_stat_t& ps = pg_pool_sum[pgid.pool()];
1423 ps.sub(s);
1424 pg_sum.sub(s);
1425
1426 num_pg--;
1427 int end = --num_pg_by_state[s.state];
1428 assert(end >= 0);
1429 if (end == 0)
1430 num_pg_by_state.erase(s.state);
1431 end = --num_pg_by_pool[pgid.pool()];
1432 if (end == 0) {
1433 num_pg_by_pool.erase(pgid.pool());
1434 pg_pool_sum.erase(pgid.pool());
1435 }
1436
1437 if ((s.state & PG_STATE_CREATING) &&
1438 s.parent_split_bits == 0) {
1439 creating_pgs.erase(pgid);
1440 if (s.acting_primary >= 0) {
1441 map<epoch_t,set<pg_t> >& r = creating_pgs_by_osd_epoch[s.acting_primary];
1442 r[s.mapping_epoch].erase(pgid);
1443 if (r[s.mapping_epoch].empty())
1444 r.erase(s.mapping_epoch);
1445 if (r.empty())
1446 creating_pgs_by_osd_epoch.erase(s.acting_primary);
1447 }
1448 }
1449
1450 if (s.state & PG_STATE_ACTIVE) {
1451 --num_pg_active;
1452 }
1453 if (s.state == 0) {
1454 --num_pg_unknown;
1455 }
1456
1457 if (sameosds)
1458 return;
1459
1460 for (auto p = s.blocked_by.begin();
1461 p != s.blocked_by.end();
1462 ++p) {
1463 auto q = blocked_by_sum.find(*p);
1464 assert(q != blocked_by_sum.end());
1465 --q->second;
1466 if (q->second == 0)
1467 blocked_by_sum.erase(q);
1468 }
1469
1470 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
1471 auto& oset = pg_by_osd[*p];
1472 oset.erase(pgid);
1473 if (oset.empty())
1474 pg_by_osd.erase(*p);
1475 auto it = num_pg_by_osd.find(*p);
1476 if (it != num_pg_by_osd.end() && it->second.acting > 0)
1477 it->second.acting--;
1478 }
1479 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
1480 auto& oset = pg_by_osd[*p];
1481 oset.erase(pgid);
1482 if (oset.empty())
1483 pg_by_osd.erase(*p);
1484 auto it = num_pg_by_osd.find(*p);
1485 if (it != num_pg_by_osd.end() && it->second.up > 0)
1486 it->second.up--;
1487 }
1488
1489 if (s.up_primary >= 0) {
1490 auto it = num_pg_by_osd.find(s.up_primary);
1491 if (it != num_pg_by_osd.end() && it->second.primary > 0)
1492 it->second.primary--;
1493 }
1494 }
1495
1496 void PGMap::stat_pg_update(const pg_t pgid, pg_stat_t& s,
1497 bufferlist::iterator& blp)
1498 {
1499 pg_stat_t n;
1500 ::decode(n, blp);
1501
1502 bool sameosds =
1503 s.acting == n.acting &&
1504 s.up == n.up &&
1505 s.blocked_by == n.blocked_by;
1506
1507 stat_pg_sub(pgid, s, sameosds);
1508
1509 // if acting_primary has shift to an just restored osd, and pg yet to finish
1510 // peering, many attributes in current stats remain stale. others seem don't
1511 // mater much while faulty last_active will make "pg stuck in" check unhappy.
1512 if (!(n.state & (PG_STATE_ACTIVE | PG_STATE_PEERED)) &&
1513 n.last_active < s.last_active)
1514 n.last_active = s.last_active;
1515 s = n;
1516 stat_pg_add(pgid, n, sameosds);
1517 }
1518
1519 void PGMap::stat_osd_add(int osd, const osd_stat_t &s)
1520 {
1521 num_osd++;
1522 osd_sum.add(s);
1523 if (osd >= (int)osd_last_seq.size()) {
1524 osd_last_seq.resize(osd + 1);
1525 }
1526 osd_last_seq[osd] = s.seq;
1527 }
1528
1529 void PGMap::stat_osd_sub(int osd, const osd_stat_t &s)
1530 {
1531 num_osd--;
1532 osd_sum.sub(s);
1533 assert(osd < (int)osd_last_seq.size());
1534 osd_last_seq[osd] = 0;
1535 }
1536
1537 epoch_t PGMap::calc_min_last_epoch_clean() const
1538 {
1539 if (pg_stat.empty())
1540 return 0;
1541
1542 auto p = pg_stat.begin();
1543 epoch_t min = p->second.get_effective_last_epoch_clean();
1544 for (++p; p != pg_stat.end(); ++p) {
1545 epoch_t lec = p->second.get_effective_last_epoch_clean();
1546 if (lec < min)
1547 min = lec;
1548 }
1549 // also scan osd epochs
1550 // don't trim past the oldest reported osd epoch
1551 for (auto i = osd_epochs.begin();
1552 i != osd_epochs.end();
1553 ++i) {
1554 if (i->second < min)
1555 min = i->second;
1556 }
1557 return min;
1558 }
1559
1560 void PGMap::encode_digest(const OSDMap& osdmap,
1561 bufferlist& bl, uint64_t features) const
1562 {
1563 get_rules_avail(osdmap, &avail_space_by_rule);
1564 PGMapDigest::encode(bl, features);
1565 }
1566
1567 void PGMap::encode(bufferlist &bl, uint64_t features) const
1568 {
1569 if ((features & CEPH_FEATURE_MONENC) == 0) {
1570 __u8 v = 3;
1571 ::encode(v, bl);
1572 ::encode(version, bl);
1573 ::encode(pg_stat, bl);
1574 ::encode(osd_stat, bl);
1575 ::encode(last_osdmap_epoch, bl);
1576 ::encode(last_pg_scan, bl);
1577 ::encode(full_ratio, bl);
1578 ::encode(nearfull_ratio, bl);
1579 return;
1580 }
1581
1582 ENCODE_START(6, 4, bl);
1583 ::encode(version, bl);
1584 ::encode(pg_stat, bl);
1585 ::encode(osd_stat, bl);
1586 ::encode(last_osdmap_epoch, bl);
1587 ::encode(last_pg_scan, bl);
1588 ::encode(full_ratio, bl);
1589 ::encode(nearfull_ratio, bl);
1590 ::encode(stamp, bl);
1591 ::encode(osd_epochs, bl);
1592 ENCODE_FINISH(bl);
1593 }
1594
1595 void PGMap::decode(bufferlist::iterator &bl)
1596 {
1597 DECODE_START_LEGACY_COMPAT_LEN(6, 4, 4, bl);
1598 ::decode(version, bl);
1599 if (struct_v < 3) {
1600 pg_stat.clear();
1601 __u32 n;
1602 ::decode(n, bl);
1603 while (n--) {
1604 old_pg_t opgid;
1605 ::decode(opgid, bl);
1606 pg_t pgid = opgid;
1607 ::decode(pg_stat[pgid], bl);
1608 }
1609 } else {
1610 ::decode(pg_stat, bl);
1611 }
1612 ::decode(osd_stat, bl);
1613 ::decode(last_osdmap_epoch, bl);
1614 ::decode(last_pg_scan, bl);
1615 if (struct_v >= 2) {
1616 ::decode(full_ratio, bl);
1617 ::decode(nearfull_ratio, bl);
1618 }
1619 if (struct_v >= 5)
1620 ::decode(stamp, bl);
1621 if (struct_v >= 6) {
1622 ::decode(osd_epochs, bl);
1623 } else {
1624 for (auto i = osd_stat.begin();
1625 i != osd_stat.end();
1626 ++i) {
1627 // This isn't accurate, but will cause trimming to behave like
1628 // previously.
1629 osd_epochs.insert(make_pair(i->first, last_osdmap_epoch));
1630 }
1631 }
1632 DECODE_FINISH(bl);
1633
1634 calc_stats();
1635 }
1636
1637 void PGMap::dirty_all(Incremental& inc)
1638 {
1639 inc.osdmap_epoch = last_osdmap_epoch;
1640 inc.pg_scan = last_pg_scan;
1641 inc.full_ratio = full_ratio;
1642 inc.nearfull_ratio = nearfull_ratio;
1643
1644 for (auto p = pg_stat.begin(); p != pg_stat.end(); ++p) {
1645 inc.pg_stat_updates[p->first] = p->second;
1646 }
1647 for (auto p = osd_stat.begin(); p != osd_stat.end(); ++p) {
1648 assert(osd_epochs.count(p->first));
1649 inc.update_stat(p->first,
1650 inc.get_osd_epochs().find(p->first)->second,
1651 p->second);
1652 }
1653 }
1654
1655 void PGMap::dump(Formatter *f) const
1656 {
1657 dump_basic(f);
1658 dump_pg_stats(f, false);
1659 dump_pool_stats(f);
1660 dump_osd_stats(f);
1661 }
1662
1663 void PGMap::dump_basic(Formatter *f) const
1664 {
1665 f->dump_unsigned("version", version);
1666 f->dump_stream("stamp") << stamp;
1667 f->dump_unsigned("last_osdmap_epoch", last_osdmap_epoch);
1668 f->dump_unsigned("last_pg_scan", last_pg_scan);
1669 f->dump_unsigned("min_last_epoch_clean", min_last_epoch_clean);
1670 f->dump_float("full_ratio", full_ratio);
1671 f->dump_float("near_full_ratio", nearfull_ratio);
1672
1673 f->open_object_section("pg_stats_sum");
1674 pg_sum.dump(f);
1675 f->close_section();
1676
1677 f->open_object_section("osd_stats_sum");
1678 osd_sum.dump(f);
1679 f->close_section();
1680
1681 f->open_array_section("osd_epochs");
1682 for (auto p = osd_epochs.begin(); p != osd_epochs.end(); ++p) {
1683 f->open_object_section("osd");
1684 f->dump_unsigned("osd", p->first);
1685 f->dump_unsigned("epoch", p->second);
1686 f->close_section();
1687 }
1688 f->close_section();
1689
1690 dump_delta(f);
1691 }
1692
1693 void PGMap::dump_delta(Formatter *f) const
1694 {
1695 f->open_object_section("pg_stats_delta");
1696 pg_sum_delta.dump(f);
1697 f->close_section();
1698 }
1699
1700 void PGMap::dump_pg_stats(Formatter *f, bool brief) const
1701 {
1702 f->open_array_section("pg_stats");
1703 for (auto i = pg_stat.begin();
1704 i != pg_stat.end();
1705 ++i) {
1706 f->open_object_section("pg_stat");
1707 f->dump_stream("pgid") << i->first;
1708 if (brief)
1709 i->second.dump_brief(f);
1710 else
1711 i->second.dump(f);
1712 f->close_section();
1713 }
1714 f->close_section();
1715 }
1716
1717 void PGMap::dump_pool_stats(Formatter *f) const
1718 {
1719 f->open_array_section("pool_stats");
1720 for (auto p = pg_pool_sum.begin();
1721 p != pg_pool_sum.end();
1722 ++p) {
1723 f->open_object_section("pool_stat");
1724 f->dump_int("poolid", p->first);
1725 auto q = num_pg_by_pool.find(p->first);
1726 if (q != num_pg_by_pool.end())
1727 f->dump_unsigned("num_pg", q->second);
1728 p->second.dump(f);
1729 f->close_section();
1730 }
1731 f->close_section();
1732 }
1733
1734 void PGMap::dump_osd_stats(Formatter *f) const
1735 {
1736 f->open_array_section("osd_stats");
1737 for (auto q = osd_stat.begin();
1738 q != osd_stat.end();
1739 ++q) {
1740 f->open_object_section("osd_stat");
1741 f->dump_int("osd", q->first);
1742 q->second.dump(f);
1743 f->close_section();
1744 }
1745 f->close_section();
1746 }
1747
1748 void PGMap::dump_pg_stats_plain(
1749 ostream& ss,
1750 const mempool::pgmap::unordered_map<pg_t, pg_stat_t>& pg_stats,
1751 bool brief) const
1752 {
1753 TextTable tab;
1754
1755 if (brief){
1756 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1757 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1758 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1759 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1760 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1761 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1762 }
1763 else {
1764 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1765 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1766 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1767 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1768 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1769 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1770 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1771 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1772 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1773 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1774 tab.define_column("STATE_STAMP", TextTable::LEFT, TextTable::RIGHT);
1775 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
1776 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
1777 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1778 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1779 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1780 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1781 tab.define_column("LAST_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1782 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1783 tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1784 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1785 }
1786
1787 for (auto i = pg_stats.begin();
1788 i != pg_stats.end(); ++i) {
1789 const pg_stat_t &st(i->second);
1790 if (brief) {
1791 tab << i->first
1792 << pg_state_string(st.state)
1793 << st.up
1794 << st.up_primary
1795 << st.acting
1796 << st.acting_primary
1797 << TextTable::endrow;
1798 } else {
1799 ostringstream reported;
1800 reported << st.reported_epoch << ":" << st.reported_seq;
1801
1802 tab << i->first
1803 << st.stats.sum.num_objects
1804 << st.stats.sum.num_objects_missing_on_primary
1805 << st.stats.sum.num_objects_degraded
1806 << st.stats.sum.num_objects_misplaced
1807 << st.stats.sum.num_objects_unfound
1808 << st.stats.sum.num_bytes
1809 << st.log_size
1810 << st.ondisk_log_size
1811 << pg_state_string(st.state)
1812 << st.last_change
1813 << st.version
1814 << reported.str()
1815 << pg_vector_string(st.up)
1816 << st.up_primary
1817 << pg_vector_string(st.acting)
1818 << st.acting_primary
1819 << st.last_scrub
1820 << st.last_scrub_stamp
1821 << st.last_deep_scrub
1822 << st.last_deep_scrub_stamp
1823 << TextTable::endrow;
1824 }
1825 }
1826
1827 ss << tab;
1828 }
1829
1830 void PGMap::dump(ostream& ss) const
1831 {
1832 dump_basic(ss);
1833 dump_pg_stats(ss, false);
1834 dump_pool_stats(ss, false);
1835 dump_pg_sum_stats(ss, false);
1836 dump_osd_stats(ss);
1837 }
1838
1839 void PGMap::dump_basic(ostream& ss) const
1840 {
1841 ss << "version " << version << std::endl;
1842 ss << "stamp " << stamp << std::endl;
1843 ss << "last_osdmap_epoch " << last_osdmap_epoch << std::endl;
1844 ss << "last_pg_scan " << last_pg_scan << std::endl;
1845 ss << "full_ratio " << full_ratio << std::endl;
1846 ss << "nearfull_ratio " << nearfull_ratio << std::endl;
1847 }
1848
1849 void PGMap::dump_pg_stats(ostream& ss, bool brief) const
1850 {
1851 dump_pg_stats_plain(ss, pg_stat, brief);
1852 }
1853
1854 void PGMap::dump_pool_stats(ostream& ss, bool header) const
1855 {
1856 TextTable tab;
1857
1858 if (header) {
1859 tab.define_column("POOLID", TextTable::LEFT, TextTable::LEFT);
1860 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1861 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1862 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1863 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1864 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1865 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1866 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1867 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1868 } else {
1869 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1870 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1871 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1872 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1873 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1874 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1875 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1876 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1877 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1878 }
1879
1880 for (auto p = pg_pool_sum.begin();
1881 p != pg_pool_sum.end();
1882 ++p) {
1883 tab << p->first
1884 << p->second.stats.sum.num_objects
1885 << p->second.stats.sum.num_objects_missing_on_primary
1886 << p->second.stats.sum.num_objects_degraded
1887 << p->second.stats.sum.num_objects_misplaced
1888 << p->second.stats.sum.num_objects_unfound
1889 << p->second.stats.sum.num_bytes
1890 << p->second.log_size
1891 << p->second.ondisk_log_size
1892 << TextTable::endrow;
1893 }
1894
1895 ss << tab;
1896 }
1897
1898 void PGMap::dump_pg_sum_stats(ostream& ss, bool header) const
1899 {
1900 TextTable tab;
1901
1902 if (header) {
1903 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1904 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1905 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1906 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1907 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1908 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1909 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1910 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1911 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1912 } else {
1913 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1914 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1915 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1916 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1917 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1918 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1919 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1920 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1921 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1922 };
1923
1924 tab << "sum"
1925 << pg_sum.stats.sum.num_objects
1926 << pg_sum.stats.sum.num_objects_missing_on_primary
1927 << pg_sum.stats.sum.num_objects_degraded
1928 << pg_sum.stats.sum.num_objects_misplaced
1929 << pg_sum.stats.sum.num_objects_unfound
1930 << pg_sum.stats.sum.num_bytes
1931 << pg_sum.log_size
1932 << pg_sum.ondisk_log_size
1933 << TextTable::endrow;
1934
1935 ss << tab;
1936 }
1937
1938 void PGMap::dump_osd_stats(ostream& ss) const
1939 {
1940 TextTable tab;
1941
1942 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1943 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1944 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
1945 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
1946 tab.define_column("HB_PEERS", TextTable::LEFT, TextTable::RIGHT);
1947 tab.define_column("PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1948 tab.define_column("PRIMARY_PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1949
1950 for (auto p = osd_stat.begin();
1951 p != osd_stat.end();
1952 ++p) {
1953 tab << p->first
1954 << si_t(p->second.kb_used << 10)
1955 << si_t(p->second.kb_avail << 10)
1956 << si_t(p->second.kb << 10)
1957 << p->second.hb_peers
1958 << get_num_pg_by_osd(p->first)
1959 << get_num_primary_pg_by_osd(p->first)
1960 << TextTable::endrow;
1961 }
1962
1963 tab << "sum"
1964 << si_t(osd_sum.kb_used << 10)
1965 << si_t(osd_sum.kb_avail << 10)
1966 << si_t(osd_sum.kb << 10)
1967 << TextTable::endrow;
1968
1969 ss << tab;
1970 }
1971
1972 void PGMap::dump_osd_sum_stats(ostream& ss) const
1973 {
1974 TextTable tab;
1975
1976 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1977 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1978 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
1979 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
1980
1981 tab << "sum"
1982 << si_t(osd_sum.kb_used << 10)
1983 << si_t(osd_sum.kb_avail << 10)
1984 << si_t(osd_sum.kb << 10)
1985 << TextTable::endrow;
1986
1987 ss << tab;
1988 }
1989
1990 void PGMap::get_stuck_stats(
1991 int types, const utime_t cutoff,
1992 mempool::pgmap::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const
1993 {
1994 assert(types != 0);
1995 for (auto i = pg_stat.begin();
1996 i != pg_stat.end();
1997 ++i) {
1998 utime_t val = cutoff; // don't care about >= cutoff so that is infinity
1999
2000 if ((types & STUCK_INACTIVE) && !(i->second.state & PG_STATE_ACTIVE)) {
2001 if (i->second.last_active < val)
2002 val = i->second.last_active;
2003 }
2004
2005 if ((types & STUCK_UNCLEAN) && !(i->second.state & PG_STATE_CLEAN)) {
2006 if (i->second.last_clean < val)
2007 val = i->second.last_clean;
2008 }
2009
2010 if ((types & STUCK_DEGRADED) && (i->second.state & PG_STATE_DEGRADED)) {
2011 if (i->second.last_undegraded < val)
2012 val = i->second.last_undegraded;
2013 }
2014
2015 if ((types & STUCK_UNDERSIZED) && (i->second.state & PG_STATE_UNDERSIZED)) {
2016 if (i->second.last_fullsized < val)
2017 val = i->second.last_fullsized;
2018 }
2019
2020 if ((types & STUCK_STALE) && (i->second.state & PG_STATE_STALE)) {
2021 if (i->second.last_unstale < val)
2022 val = i->second.last_unstale;
2023 }
2024
2025 // val is now the earliest any of the requested stuck states began
2026 if (val < cutoff) {
2027 stuck_pgs[i->first] = i->second;
2028 }
2029 }
2030 }
2031
2032 bool PGMap::get_stuck_counts(const utime_t cutoff, map<string, int>& note) const
2033 {
2034 int inactive = 0;
2035 int unclean = 0;
2036 int degraded = 0;
2037 int undersized = 0;
2038 int stale = 0;
2039
2040 for (auto i = pg_stat.begin();
2041 i != pg_stat.end();
2042 ++i) {
2043 if (! (i->second.state & PG_STATE_ACTIVE)) {
2044 if (i->second.last_active < cutoff)
2045 ++inactive;
2046 }
2047 if (! (i->second.state & PG_STATE_CLEAN)) {
2048 if (i->second.last_clean < cutoff)
2049 ++unclean;
2050 }
2051 if (i->second.state & PG_STATE_DEGRADED) {
2052 if (i->second.last_undegraded < cutoff)
2053 ++degraded;
2054 }
2055 if (i->second.state & PG_STATE_UNDERSIZED) {
2056 if (i->second.last_fullsized < cutoff)
2057 ++undersized;
2058 }
2059 if (i->second.state & PG_STATE_STALE) {
2060 if (i->second.last_unstale < cutoff)
2061 ++stale;
2062 }
2063 }
2064
2065 if (inactive)
2066 note["stuck inactive"] = inactive;
2067
2068 if (unclean)
2069 note["stuck unclean"] = unclean;
2070
2071 if (undersized)
2072 note["stuck undersized"] = undersized;
2073
2074 if (degraded)
2075 note["stuck degraded"] = degraded;
2076
2077 if (stale)
2078 note["stuck stale"] = stale;
2079
2080 return inactive || unclean || undersized || degraded || stale;
2081 }
2082
2083 void PGMap::dump_stuck(Formatter *f, int types, utime_t cutoff) const
2084 {
2085 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
2086 get_stuck_stats(types, cutoff, stuck_pg_stats);
2087 f->open_array_section("stuck_pg_stats");
2088 for (auto i = stuck_pg_stats.begin();
2089 i != stuck_pg_stats.end();
2090 ++i) {
2091 f->open_object_section("pg_stat");
2092 f->dump_stream("pgid") << i->first;
2093 i->second.dump(f);
2094 f->close_section();
2095 }
2096 f->close_section();
2097 }
2098
2099 void PGMap::dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const
2100 {
2101 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
2102 get_stuck_stats(types, cutoff, stuck_pg_stats);
2103 if (!stuck_pg_stats.empty())
2104 dump_pg_stats_plain(ss, stuck_pg_stats, true);
2105 }
2106
2107 int PGMap::dump_stuck_pg_stats(
2108 stringstream &ds,
2109 Formatter *f,
2110 int threshold,
2111 vector<string>& args) const
2112 {
2113 int stuck_types = 0;
2114
2115 for (auto i = args.begin(); i != args.end(); ++i) {
2116 if (*i == "inactive")
2117 stuck_types |= PGMap::STUCK_INACTIVE;
2118 else if (*i == "unclean")
2119 stuck_types |= PGMap::STUCK_UNCLEAN;
2120 else if (*i == "undersized")
2121 stuck_types |= PGMap::STUCK_UNDERSIZED;
2122 else if (*i == "degraded")
2123 stuck_types |= PGMap::STUCK_DEGRADED;
2124 else if (*i == "stale")
2125 stuck_types |= PGMap::STUCK_STALE;
2126 else {
2127 ds << "Unknown type: " << *i << std::endl;
2128 return -EINVAL;
2129 }
2130 }
2131
2132 utime_t now(ceph_clock_now());
2133 utime_t cutoff = now - utime_t(threshold, 0);
2134
2135 if (!f) {
2136 dump_stuck_plain(ds, stuck_types, cutoff);
2137 } else {
2138 dump_stuck(f, stuck_types, cutoff);
2139 f->flush(ds);
2140 }
2141
2142 return 0;
2143 }
2144
2145 void PGMap::dump_osd_perf_stats(Formatter *f) const
2146 {
2147 f->open_array_section("osd_perf_infos");
2148 for (auto i = osd_stat.begin();
2149 i != osd_stat.end();
2150 ++i) {
2151 f->open_object_section("osd");
2152 f->dump_int("id", i->first);
2153 {
2154 f->open_object_section("perf_stats");
2155 i->second.os_perf_stat.dump(f);
2156 f->close_section();
2157 }
2158 f->close_section();
2159 }
2160 f->close_section();
2161 }
2162 void PGMap::print_osd_perf_stats(std::ostream *ss) const
2163 {
2164 TextTable tab;
2165 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2166 tab.define_column("commit_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2167 tab.define_column("apply_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2168 for (auto i = osd_stat.begin();
2169 i != osd_stat.end();
2170 ++i) {
2171 tab << i->first;
2172 tab << i->second.os_perf_stat.os_commit_latency;
2173 tab << i->second.os_perf_stat.os_apply_latency;
2174 tab << TextTable::endrow;
2175 }
2176 (*ss) << tab;
2177 }
2178
2179 void PGMap::dump_osd_blocked_by_stats(Formatter *f) const
2180 {
2181 f->open_array_section("osd_blocked_by_infos");
2182 for (auto i = blocked_by_sum.begin();
2183 i != blocked_by_sum.end();
2184 ++i) {
2185 f->open_object_section("osd");
2186 f->dump_int("id", i->first);
2187 f->dump_int("num_blocked", i->second);
2188 f->close_section();
2189 }
2190 f->close_section();
2191 }
2192 void PGMap::print_osd_blocked_by_stats(std::ostream *ss) const
2193 {
2194 TextTable tab;
2195 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2196 tab.define_column("num_blocked", TextTable::LEFT, TextTable::RIGHT);
2197 for (auto i = blocked_by_sum.begin();
2198 i != blocked_by_sum.end();
2199 ++i) {
2200 tab << i->first;
2201 tab << i->second;
2202 tab << TextTable::endrow;
2203 }
2204 (*ss) << tab;
2205 }
2206
2207
2208 /**
2209 * update aggregated delta
2210 *
2211 * @param cct ceph context
2212 * @param ts Timestamp for the stats being delta'ed
2213 * @param old_pool_sum Previous stats sum
2214 * @param last_ts Last timestamp for pool
2215 * @param result_pool_sum Resulting stats
2216 * @param result_pool_delta Resulting pool delta
2217 * @param result_ts_delta Resulting timestamp delta
2218 * @param delta_avg_list List of last N computed deltas, used to average
2219 */
2220 void PGMap::update_delta(
2221 CephContext *cct,
2222 const utime_t ts,
2223 const pool_stat_t& old_pool_sum,
2224 utime_t *last_ts,
2225 const pool_stat_t& current_pool_sum,
2226 pool_stat_t *result_pool_delta,
2227 utime_t *result_ts_delta,
2228 mempool::pgmap::list<pair<pool_stat_t,utime_t> > *delta_avg_list)
2229 {
2230 /* @p ts is the timestamp we want to associate with the data
2231 * in @p old_pool_sum, and on which we will base ourselves to
2232 * calculate the delta, stored in 'delta_t'.
2233 */
2234 utime_t delta_t;
2235 delta_t = ts; // start with the provided timestamp
2236 delta_t -= *last_ts; // take the last timestamp we saw
2237 *last_ts = ts; // @p ts becomes the last timestamp we saw
2238
2239 // adjust delta_t, quick start if there is no update in a long period
2240 delta_t = std::min(delta_t,
2241 utime_t(2 * (cct ? cct->_conf->mon_delta_reset_interval : 10), 0));
2242
2243 // calculate a delta, and average over the last 6 deltas by default.
2244 /* start by taking a copy of our current @p result_pool_sum, and by
2245 * taking out the stats from @p old_pool_sum. This generates a stats
2246 * delta. Stash this stats delta in @p delta_avg_list, along with the
2247 * timestamp delta for these results.
2248 */
2249 pool_stat_t d = current_pool_sum;
2250 d.stats.sub(old_pool_sum.stats);
2251 delta_avg_list->push_back(make_pair(d,delta_t));
2252 *result_ts_delta += delta_t;
2253
2254 /* Aggregate current delta, and take out the last seen delta (if any) to
2255 * average it out.
2256 */
2257 result_pool_delta->stats.add(d.stats);
2258 size_t s = MAX(1, cct ? cct->_conf->mon_stat_smooth_intervals : 1);
2259 if (delta_avg_list->size() > s) {
2260 result_pool_delta->stats.sub(delta_avg_list->front().first.stats);
2261 *result_ts_delta -= delta_avg_list->front().second;
2262 delta_avg_list->pop_front();
2263 }
2264 }
2265
2266 /**
2267 * update aggregated delta
2268 *
2269 * @param cct ceph context
2270 * @param ts Timestamp
2271 * @param pg_sum_old Old pg_sum
2272 */
2273 void PGMap::update_global_delta(CephContext *cct,
2274 const utime_t ts, const pool_stat_t& pg_sum_old)
2275 {
2276 update_delta(cct, ts, pg_sum_old, &stamp, pg_sum, &pg_sum_delta,
2277 &stamp_delta, &pg_sum_deltas);
2278 }
2279
2280 /**
2281 * Update a given pool's deltas
2282 *
2283 * @param cct Ceph Context
2284 * @param ts Timestamp for the stats being delta'ed
2285 * @param pool Pool's id
2286 * @param old_pool_sum Previous stats sum
2287 */
2288 void PGMap::update_one_pool_delta(
2289 CephContext *cct,
2290 const utime_t ts,
2291 const uint64_t pool,
2292 const pool_stat_t& old_pool_sum)
2293 {
2294 if (per_pool_sum_deltas.count(pool) == 0) {
2295 assert(per_pool_sum_deltas_stamps.count(pool) == 0);
2296 assert(per_pool_sum_delta.count(pool) == 0);
2297 }
2298
2299 auto& sum_delta = per_pool_sum_delta[pool];
2300
2301 update_delta(cct, ts, old_pool_sum, &sum_delta.second, pg_pool_sum[pool],
2302 &sum_delta.first, &per_pool_sum_deltas_stamps[pool],
2303 &per_pool_sum_deltas[pool]);
2304 }
2305
2306 /**
2307 * Update pools' deltas
2308 *
2309 * @param cct CephContext
2310 * @param ts Timestamp for the stats being delta'ed
2311 * @param pg_pool_sum_old Map of pool stats for delta calcs.
2312 */
2313 void PGMap::update_pool_deltas(
2314 CephContext *cct, const utime_t ts,
2315 const mempool::pgmap::unordered_map<uint64_t,pool_stat_t>& pg_pool_sum_old)
2316 {
2317 for (auto it = pg_pool_sum_old.begin();
2318 it != pg_pool_sum_old.end(); ++it) {
2319 update_one_pool_delta(cct, ts, it->first, it->second);
2320 }
2321 }
2322
2323 void PGMap::clear_delta()
2324 {
2325 pg_sum_delta = pool_stat_t();
2326 pg_sum_deltas.clear();
2327 stamp_delta = utime_t();
2328 }
2329
2330 void PGMap::generate_test_instances(list<PGMap*>& o)
2331 {
2332 o.push_back(new PGMap);
2333 list<Incremental*> inc;
2334 Incremental::generate_test_instances(inc);
2335 delete inc.front();
2336 inc.pop_front();
2337 while (!inc.empty()) {
2338 PGMap *pmp = new PGMap();
2339 *pmp = *o.back();
2340 o.push_back(pmp);
2341 o.back()->apply_incremental(NULL, *inc.front());
2342 delete inc.front();
2343 inc.pop_front();
2344 }
2345 }
2346
2347 void PGMap::get_filtered_pg_stats(uint32_t state, int64_t poolid, int64_t osdid,
2348 bool primary, set<pg_t>& pgs) const
2349 {
2350 for (auto i = pg_stat.begin();
2351 i != pg_stat.end();
2352 ++i) {
2353 if ((poolid >= 0) && (uint64_t(poolid) != i->first.pool()))
2354 continue;
2355 if ((osdid >= 0) && !(i->second.is_acting_osd(osdid,primary)))
2356 continue;
2357 if (!(i->second.state & state))
2358 continue;
2359 pgs.insert(i->first);
2360 }
2361 }
2362
2363 void PGMap::dump_filtered_pg_stats(Formatter *f, set<pg_t>& pgs) const
2364 {
2365 f->open_array_section("pg_stats");
2366 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
2367 const pg_stat_t& st = pg_stat.at(*i);
2368 f->open_object_section("pg_stat");
2369 f->dump_stream("pgid") << *i;
2370 st.dump(f);
2371 f->close_section();
2372 }
2373 f->close_section();
2374 }
2375
2376 void PGMap::dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const
2377 {
2378 TextTable tab;
2379
2380 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
2381 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
2382 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
2383 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
2384 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
2385 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
2386 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
2387 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
2388 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
2389 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
2390 tab.define_column("STATE_STAMP", TextTable::LEFT, TextTable::RIGHT);
2391 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
2392 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
2393 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
2394 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
2395 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
2396 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
2397 tab.define_column("LAST_SCRUB", TextTable::LEFT, TextTable::RIGHT);
2398 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
2399 tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT);
2400 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
2401
2402 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
2403 const pg_stat_t& st = pg_stat.at(*i);
2404
2405 ostringstream reported;
2406 reported << st.reported_epoch << ":" << st.reported_seq;
2407
2408 tab << *i
2409 << st.stats.sum.num_objects
2410 << st.stats.sum.num_objects_missing_on_primary
2411 << st.stats.sum.num_objects_degraded
2412 << st.stats.sum.num_objects_misplaced
2413 << st.stats.sum.num_objects_unfound
2414 << st.stats.sum.num_bytes
2415 << st.log_size
2416 << st.ondisk_log_size
2417 << pg_state_string(st.state)
2418 << st.last_change
2419 << st.version
2420 << reported.str()
2421 << st.up
2422 << st.up_primary
2423 << st.acting
2424 << st.acting_primary
2425 << st.last_scrub
2426 << st.last_scrub_stamp
2427 << st.last_deep_scrub
2428 << st.last_deep_scrub_stamp
2429 << TextTable::endrow;
2430 }
2431
2432 ss << tab;
2433 }
2434
2435
2436
2437 // Only called with a single bit set in "what"
2438 static void note_stuck_detail(
2439 int what,
2440 mempool::pgmap::unordered_map<pg_t,pg_stat_t>& stuck_pgs,
2441 int max_detail,
2442 list<pair<health_status_t,string> > *detail)
2443 {
2444 int n = 0;
2445 for (auto p = stuck_pgs.begin();
2446 p != stuck_pgs.end();
2447 ++p) {
2448 ostringstream ss;
2449 utime_t since;
2450 const char *whatname = 0;
2451 switch (what) {
2452 case PGMap::STUCK_INACTIVE:
2453 since = p->second.last_active;
2454 whatname = "inactive";
2455 break;
2456 case PGMap::STUCK_UNCLEAN:
2457 since = p->second.last_clean;
2458 whatname = "unclean";
2459 break;
2460 case PGMap::STUCK_DEGRADED:
2461 since = p->second.last_undegraded;
2462 whatname = "degraded";
2463 break;
2464 case PGMap::STUCK_UNDERSIZED:
2465 since = p->second.last_fullsized;
2466 whatname = "undersized";
2467 break;
2468 case PGMap::STUCK_STALE:
2469 since = p->second.last_unstale;
2470 whatname = "stale";
2471 break;
2472 default:
2473 ceph_abort();
2474 }
2475 if (--max_detail == 0) {
2476 ostringstream ss;
2477 ss << (stuck_pgs.size() - n) << " more pgs are also stuck " << whatname;
2478 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
2479 break;
2480 }
2481 ++n;
2482 ss << "pg " << p->first << " is stuck " << whatname;
2483 if (since == utime_t()) {
2484 ss << " since forever";
2485 } else {
2486 utime_t dur = ceph_clock_now() - since;
2487 ss << " for " << dur;
2488 }
2489 ss << ", current state " << pg_state_string(p->second.state)
2490 << ", last acting " << p->second.acting;
2491 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
2492 }
2493 }
2494
2495 static pair<int,int> _warn_slow_request_histogram(
2496 CephContext *cct,
2497 const pow2_hist_t& h,
2498 string suffix,
2499 list<pair<health_status_t,string> >& summary,
2500 list<pair<health_status_t,string> > *detail)
2501 {
2502 if (h.h.empty())
2503 return make_pair(0, 0);
2504
2505 unsigned warn = 0, error = 0;
2506 float err_age =
2507 cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio;
2508 for (unsigned i = h.h.size() - 1; i > 0; --i) {
2509 float ub = (float)(1 << i) / 1000.0;
2510 if (ub < cct->_conf->mon_osd_warn_op_age)
2511 break;
2512 if (h.h[i]) {
2513 auto sev = HEALTH_WARN;
2514 if (ub > err_age) {
2515 sev = HEALTH_ERR;
2516 error += h.h[i];
2517 } else {
2518 warn += h.h[i];
2519 }
2520 if (detail) {
2521 ostringstream ss;
2522 ss << h.h[i] << " ops are blocked > " << ub << " sec" << suffix;
2523 detail->push_back(make_pair(sev, ss.str()));
2524 }
2525 }
2526 }
2527 return make_pair(warn, error);
2528 }
2529
2530 namespace {
2531 enum class scrubbed_or_deepscrubbed_t { SCRUBBED, DEEPSCRUBBED };
2532
2533 void print_unscrubbed_detailed(
2534 const std::pair<const pg_t,pg_stat_t> &pg_entry,
2535 list<pair<health_status_t,string> > *detail,
2536 scrubbed_or_deepscrubbed_t how_scrubbed)
2537 {
2538 std::stringstream ss;
2539 const auto& pg_stat(pg_entry.second);
2540
2541 ss << "pg " << pg_entry.first << " is not ";
2542 if (how_scrubbed == scrubbed_or_deepscrubbed_t::SCRUBBED) {
2543 ss << "scrubbed, last_scrub_stamp "
2544 << pg_stat.last_scrub_stamp;
2545 } else if (how_scrubbed == scrubbed_or_deepscrubbed_t::DEEPSCRUBBED) {
2546 ss << "deep-scrubbed, last_deep_scrub_stamp "
2547 << pg_stat.last_deep_scrub_stamp;
2548 }
2549
2550 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
2551 }
2552
2553 using pg_stat_map_t = const mempool::pgmap::unordered_map<pg_t,pg_stat_t>;
2554
2555 void print_unscrubbed_pgs(
2556 pg_stat_map_t& pg_stats,
2557 list<pair<health_status_t,string> > &summary,
2558 list<pair<health_status_t,string> > *detail,
2559 const CephContext* cct)
2560 {
2561 if (cct->_conf->mon_warn_not_scrubbed == 0 &&
2562 cct->_conf->mon_warn_not_deep_scrubbed == 0)
2563 return;
2564
2565 int pgs_count = 0;
2566 const utime_t now = ceph_clock_now();
2567 for (const auto& pg_entry : pg_stats) {
2568 const auto& pg_stat(pg_entry.second);
2569 const utime_t time_since_ls = now - pg_stat.last_scrub_stamp;
2570 const utime_t time_since_lds = now - pg_stat.last_deep_scrub_stamp;
2571
2572 const int mon_warn_not_scrubbed =
2573 cct->_conf->mon_warn_not_scrubbed + cct->_conf->mon_scrub_interval;
2574
2575 const int mon_warn_not_deep_scrubbed =
2576 cct->_conf->mon_warn_not_deep_scrubbed + cct->_conf->osd_deep_scrub_interval;
2577
2578 bool not_scrubbed = (time_since_ls >= mon_warn_not_scrubbed &&
2579 cct->_conf->mon_warn_not_scrubbed != 0);
2580
2581 bool not_deep_scrubbed = (time_since_lds >= mon_warn_not_deep_scrubbed &&
2582 cct->_conf->mon_warn_not_deep_scrubbed != 0);
2583
2584 if (detail != nullptr) {
2585 if (not_scrubbed) {
2586 print_unscrubbed_detailed(pg_entry,
2587 detail,
2588 scrubbed_or_deepscrubbed_t::SCRUBBED);
2589 }
2590 if (not_deep_scrubbed) {
2591 print_unscrubbed_detailed(pg_entry,
2592 detail,
2593 scrubbed_or_deepscrubbed_t::DEEPSCRUBBED);
2594 }
2595 }
2596 if (not_scrubbed || not_deep_scrubbed) {
2597 ++pgs_count;
2598 }
2599 }
2600
2601 if (pgs_count > 0) {
2602 std::stringstream ss;
2603 ss << pgs_count << " unscrubbed pgs";
2604 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
2605 }
2606 }
2607 }
2608
2609 void PGMap::get_health_checks(
2610 CephContext *cct,
2611 const OSDMap& osdmap,
2612 health_check_map_t *checks) const
2613 {
2614 utime_t now = ceph_clock_now();
2615 const unsigned max = cct->_conf->mon_health_max_detail;
2616 const auto& pools = osdmap.get_pools();
2617
2618 typedef enum pg_consequence_t {
2619 UNAVAILABLE = 1, // Client IO to the pool may block
2620 DEGRADED = 2, // Fewer than the requested number of replicas are present
2621 DEGRADED_FULL = 3, // Fewer than the request number of replicas may be present
2622 // and insufficiet resources are present to fix this
2623 DAMAGED = 4 // The data may be missing or inconsistent on disk and
2624 // requires repair
2625 } pg_consequence_t;
2626
2627 // For a given PG state, how should it be reported at the pool level?
2628 class PgStateResponse {
2629 public:
2630 pg_consequence_t consequence;
2631 typedef std::function< utime_t(const pg_stat_t&) > stuck_cb;
2632 stuck_cb stuck_since;
2633 bool invert;
2634
2635 PgStateResponse(const pg_consequence_t &c, stuck_cb s)
2636 : consequence(c), stuck_since(s), invert(false)
2637 {
2638 }
2639
2640 PgStateResponse(const pg_consequence_t &c, stuck_cb s, bool i)
2641 : consequence(c), stuck_since(s), invert(i)
2642 {
2643 }
2644 };
2645
2646 // Record the PG state counts that contributed to a reported pool state
2647 class PgCauses {
2648 public:
2649 // Map of PG_STATE_* to number of pgs in that state.
2650 std::map<unsigned, unsigned> states;
2651
2652 // List of all PG IDs that had a state contributing
2653 // to this health condition.
2654 std::set<pg_t> pgs;
2655
2656 std::map<pg_t, std::string> pg_messages;
2657 };
2658
2659 // Map of PG state to how to respond to it
2660 std::map<unsigned, PgStateResponse> state_to_response = {
2661 // Immediate reports
2662 { PG_STATE_INCONSISTENT, {DAMAGED, {}} },
2663 { PG_STATE_INCOMPLETE, {UNAVAILABLE, {}} },
2664 { PG_STATE_REPAIR, {DAMAGED, {}} },
2665 { PG_STATE_SNAPTRIM_ERROR, {DAMAGED, {}} },
2666 { PG_STATE_BACKFILL_TOOFULL, {DEGRADED_FULL, {}} },
2667 { PG_STATE_RECOVERY_TOOFULL, {DEGRADED_FULL, {}} },
2668 { PG_STATE_DEGRADED, {DEGRADED, {}} },
2669 { PG_STATE_DOWN, {UNAVAILABLE, {}} },
2670 // Delayed (wait until stuck) reports
2671 { PG_STATE_PEERING, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_peered;} } },
2672 { PG_STATE_UNDERSIZED, {DEGRADED, [](const pg_stat_t &p){return p.last_fullsized;} } },
2673 { PG_STATE_STALE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_unstale;} } },
2674 // Delayed and inverted reports
2675 { PG_STATE_ACTIVE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_active;}, true} },
2676 { PG_STATE_CLEAN, {DEGRADED, [](const pg_stat_t &p){return p.last_clean;}, true} }
2677 };
2678
2679 // Specialized state printer that takes account of inversion of
2680 // ACTIVE, CLEAN checks.
2681 auto state_name = [](const uint32_t &state) {
2682 // Special cases for the states that are inverted checks
2683 if (state == PG_STATE_CLEAN) {
2684 return std::string("unclean");
2685 } else if (state == PG_STATE_ACTIVE) {
2686 return std::string("inactive");
2687 } else {
2688 return pg_state_string(state);
2689 }
2690 };
2691
2692 // Map of what is wrong to information about why, implicitly also stores
2693 // the list of what is wrong.
2694 std::map<pg_consequence_t, PgCauses> detected;
2695
2696 // Optimisation: trim down the number of checks to apply based on
2697 // the summary counters
2698 std::map<unsigned, PgStateResponse> possible_responses;
2699 for (const auto &i : num_pg_by_state) {
2700 for (const auto &j : state_to_response) {
2701 if (!j.second.invert) {
2702 // Check for normal tests by seeing if any pgs have the flag
2703 if (i.first & j.first) {
2704 possible_responses.insert(j);
2705 }
2706 }
2707 }
2708 }
2709
2710 for (const auto &j : state_to_response) {
2711 if (j.second.invert) {
2712 // Check for inverted tests by seeing if not-all pgs have the flag
2713 const auto &found = num_pg_by_state.find(j.first);
2714 if (found == num_pg_by_state.end() || found->second != num_pg) {
2715 possible_responses.insert(j);
2716 }
2717 }
2718 }
2719
2720 utime_t cutoff = now - utime_t(cct->_conf->mon_pg_stuck_threshold, 0);
2721 // Loop over all PGs, if there are any possibly-unhealthy states in there
2722 if (!possible_responses.empty()) {
2723 for (const auto& i : pg_stat) {
2724 const auto &pg_id = i.first;
2725 const auto &pg_info = i.second;
2726
2727 for (const auto &j : state_to_response) {
2728 const auto &pg_response_state = j.first;
2729 const auto &pg_response = j.second;
2730
2731 // Apply the state test
2732 if (!(bool(pg_info.state & pg_response_state) != pg_response.invert)) {
2733 continue;
2734 }
2735
2736 // Apply stuckness test if needed
2737 if (pg_response.stuck_since) {
2738 // Delayed response, check for stuckness
2739 utime_t last_whatever = pg_response.stuck_since(pg_info);
2740 if (last_whatever >= cutoff) {
2741 // Not stuck enough, ignore.
2742 continue;
2743 } else {
2744
2745 }
2746 }
2747
2748 auto &causes = detected[pg_response.consequence];
2749 causes.states[pg_response_state]++;
2750 causes.pgs.insert(pg_id);
2751
2752 // Don't bother composing detail string if we have already recorded
2753 // too many
2754 if (causes.pg_messages.size() > max) {
2755 continue;
2756 }
2757
2758 std::ostringstream ss;
2759 if (pg_response.stuck_since) {
2760 utime_t since = pg_response.stuck_since(pg_info);
2761 ss << "pg " << pg_id << " is stuck " << state_name(pg_response_state);
2762 if (since == utime_t()) {
2763 ss << " since forever";
2764 } else {
2765 utime_t dur = now - since;
2766 ss << " for " << dur;
2767 }
2768 ss << ", current state " << pg_state_string(pg_info.state)
2769 << ", last acting " << pg_info.acting;
2770 } else {
2771 ss << "pg " << pg_id << " is "
2772 << pg_state_string(pg_info.state);
2773 ss << ", acting " << pg_info.acting;
2774 if (pg_info.stats.sum.num_objects_unfound) {
2775 ss << ", " << pg_info.stats.sum.num_objects_unfound
2776 << " unfound";
2777 }
2778 }
2779
2780 if (pg_info.state & PG_STATE_INCOMPLETE) {
2781 const pg_pool_t *pi = osdmap.get_pg_pool(pg_id.pool());
2782 if (pi && pi->min_size > 1) {
2783 ss << " (reducing pool "
2784 << osdmap.get_pool_name(pg_id.pool())
2785 << " min_size from " << (int)pi->min_size
2786 << " may help; search ceph.com/docs for 'incomplete')";
2787 }
2788 }
2789
2790 causes.pg_messages[pg_id] = ss.str();
2791 }
2792 }
2793 } else {
2794 dout(10) << __func__ << " skipping loop over PGs: counters look OK" << dendl;
2795 }
2796
2797 for (const auto &i : detected) {
2798 std::string health_code;
2799 health_status_t sev;
2800 std::string summary;
2801 switch(i.first) {
2802 case UNAVAILABLE:
2803 health_code = "PG_AVAILABILITY";
2804 sev = HEALTH_WARN;
2805 summary = "Reduced data availability: ";
2806 break;
2807 case DEGRADED:
2808 health_code = "PG_DEGRADED";
2809 summary = "Degraded data redundancy: ";
2810 sev = HEALTH_WARN;
2811 break;
2812 case DEGRADED_FULL:
2813 health_code = "PG_DEGRADED_FULL";
2814 summary = "Degraded data redundancy (low space): ";
2815 sev = HEALTH_ERR;
2816 break;
2817 case DAMAGED:
2818 health_code = "PG_DAMAGED";
2819 summary = "Possible data damage: ";
2820 sev = HEALTH_ERR;
2821 break;
2822 default:
2823 assert(false);
2824 }
2825
2826 if (i.first == DEGRADED) {
2827 if (pg_sum.stats.sum.num_objects_degraded &&
2828 pg_sum.stats.sum.num_object_copies > 0) {
2829 double pc = (double)pg_sum.stats.sum.num_objects_degraded /
2830 (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
2831 char b[20];
2832 snprintf(b, sizeof(b), "%.3lf", pc);
2833 ostringstream ss;
2834 ss << pg_sum.stats.sum.num_objects_degraded
2835 << "/" << pg_sum.stats.sum.num_object_copies << " objects degraded ("
2836 << b << "%)";
2837
2838 // Throw in a comma for the benefit of the following PG counts
2839 summary += ss.str() + ", ";
2840 }
2841 }
2842
2843 // Compose summary message saying how many PGs in what states led
2844 // to this health check failing
2845 std::vector<std::string> pg_msgs;
2846 for (const auto &j : i.second.states) {
2847 std::ostringstream msg;
2848 msg << j.second << (j.second > 1 ? " pgs " : " pg ") << state_name(j.first);
2849 pg_msgs.push_back(msg.str());
2850 }
2851 summary += joinify(pg_msgs.begin(), pg_msgs.end(), std::string(", "));
2852
2853
2854
2855 health_check_t *check = &checks->add(
2856 health_code,
2857 sev,
2858 summary);
2859
2860 // Compose list of PGs contributing to this health check failing
2861 for (const auto &j : i.second.pg_messages) {
2862 check->detail.push_back(j.second);
2863 }
2864 }
2865
2866 // OSD_SCRUB_ERRORS
2867 if (pg_sum.stats.sum.num_scrub_errors) {
2868 ostringstream ss;
2869 ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors";
2870 checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str());
2871 }
2872
2873 // CACHE_POOL_NEAR_FULL
2874 {
2875 list<string> detail;
2876 unsigned num_pools = 0;
2877 for (auto& p : pools) {
2878 if ((!p.second.target_max_objects && !p.second.target_max_bytes) ||
2879 !pg_pool_sum.count(p.first)) {
2880 continue;
2881 }
2882 bool nearfull = false;
2883 const string& name = osdmap.get_pool_name(p.first);
2884 const pool_stat_t& st = get_pg_pool_sum_stat(p.first);
2885 uint64_t ratio = p.second.cache_target_full_ratio_micro +
2886 ((1000000 - p.second.cache_target_full_ratio_micro) *
2887 cct->_conf->mon_cache_target_full_warn_ratio);
2888 if (p.second.target_max_objects &&
2889 (uint64_t)(st.stats.sum.num_objects -
2890 st.stats.sum.num_objects_hit_set_archive) >
2891 p.second.target_max_objects * (ratio / 1000000.0)) {
2892 ostringstream ss;
2893 ss << "cache pool '" << name << "' with "
2894 << si_t(st.stats.sum.num_objects)
2895 << " objects at/near target max "
2896 << si_t(p.second.target_max_objects) << " objects";
2897 detail.push_back(ss.str());
2898 nearfull = true;
2899 }
2900 if (p.second.target_max_bytes &&
2901 (uint64_t)(st.stats.sum.num_bytes -
2902 st.stats.sum.num_bytes_hit_set_archive) >
2903 p.second.target_max_bytes * (ratio / 1000000.0)) {
2904 ostringstream ss;
2905 ss << "cache pool '" << name
2906 << "' with " << si_t(st.stats.sum.num_bytes)
2907 << "B at/near target max "
2908 << si_t(p.second.target_max_bytes) << "B";
2909 detail.push_back(ss.str());
2910 nearfull = true;
2911 }
2912 if (nearfull) {
2913 ++num_pools;
2914 }
2915 }
2916 if (!detail.empty()) {
2917 ostringstream ss;
2918 ss << num_pools << " cache pools at or near target size";
2919 auto& d = checks->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN, ss.str());
2920 d.detail.swap(detail);
2921 }
2922 }
2923
2924 // TOO_FEW_PGS
2925 unsigned num_in = osdmap.get_num_in_osds();
2926 auto sum_pg_up = std::max(static_cast<size_t>(pg_sum.up), pg_stat.size());
2927 const auto min_pg_per_osd =
2928 cct->_conf->get_val<uint64_t>("mon_pg_warn_min_per_osd");
2929 if (num_in && min_pg_per_osd > 0 && osdmap.get_pools().size() > 0) {
2930 auto per = sum_pg_up / num_in;
2931 if (per < min_pg_per_osd && per) {
2932 ostringstream ss;
2933 ss << "too few PGs per OSD (" << per
2934 << " < min " << min_pg_per_osd << ")";
2935 checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str());
2936 }
2937 }
2938
2939 // TOO_MANY_PGS
2940 auto max_pg_per_osd = cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd");
2941 if (num_in && max_pg_per_osd > 0) {
2942 auto per = sum_pg_up / num_in;
2943 if (per > max_pg_per_osd) {
2944 ostringstream ss;
2945 ss << "too many PGs per OSD (" << per
2946 << " > max " << max_pg_per_osd << ")";
2947 checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str());
2948 }
2949 }
2950
2951 // SMALLER_PGP_NUM
2952 // MANY_OBJECTS_PER_PG
2953 if (!pg_stat.empty()) {
2954 list<string> pgp_detail, many_detail;
2955 for (auto p = pg_pool_sum.begin();
2956 p != pg_pool_sum.end();
2957 ++p) {
2958 const pg_pool_t *pi = osdmap.get_pg_pool(p->first);
2959 if (!pi)
2960 continue; // in case osdmap changes haven't propagated to PGMap yet
2961 const string& name = osdmap.get_pool_name(p->first);
2962 if (pi->get_pg_num() > pi->get_pgp_num() &&
2963 !(name.find(".DELETED") != string::npos &&
2964 cct->_conf->mon_fake_pool_delete)) {
2965 ostringstream ss;
2966 ss << "pool " << name << " pg_num "
2967 << pi->get_pg_num() << " > pgp_num " << pi->get_pgp_num();
2968 pgp_detail.push_back(ss.str());
2969 }
2970 int average_objects_per_pg = pg_sum.stats.sum.num_objects / pg_stat.size();
2971 if (average_objects_per_pg > 0 &&
2972 pg_sum.stats.sum.num_objects >= cct->_conf->mon_pg_warn_min_objects &&
2973 p->second.stats.sum.num_objects >=
2974 cct->_conf->mon_pg_warn_min_pool_objects) {
2975 int objects_per_pg = p->second.stats.sum.num_objects / pi->get_pg_num();
2976 float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
2977 if (cct->_conf->mon_pg_warn_max_object_skew > 0 &&
2978 ratio > cct->_conf->mon_pg_warn_max_object_skew) {
2979 ostringstream ss;
2980 ss << "pool " << name << " objects per pg ("
2981 << objects_per_pg << ") is more than " << ratio
2982 << " times cluster average ("
2983 << average_objects_per_pg << ")";
2984 many_detail.push_back(ss.str());
2985 }
2986 }
2987 }
2988 if (!pgp_detail.empty()) {
2989 ostringstream ss;
2990 ss << pgp_detail.size() << " pools have pg_num > pgp_num";
2991 auto& d = checks->add("SMALLER_PGP_NUM", HEALTH_WARN, ss.str());
2992 d.detail.swap(pgp_detail);
2993 }
2994 if (!many_detail.empty()) {
2995 ostringstream ss;
2996 ss << many_detail.size() << " pools have many more objects per pg than"
2997 << " average";
2998 auto& d = checks->add("MANY_OBJECTS_PER_PG", HEALTH_WARN, ss.str());
2999 d.detail.swap(many_detail);
3000 }
3001 }
3002
3003 // POOL_FULL
3004 // POOL_NEAR_FULL
3005 {
3006 float warn_threshold = (float)g_conf->mon_pool_quota_warn_threshold/100;
3007 float crit_threshold = (float)g_conf->mon_pool_quota_crit_threshold/100;
3008 list<string> full_detail, nearfull_detail;
3009 unsigned full_pools = 0, nearfull_pools = 0;
3010 for (auto it : pools) {
3011 auto it2 = pg_pool_sum.find(it.first);
3012 if (it2 == pg_pool_sum.end()) {
3013 continue;
3014 }
3015 const pool_stat_t *pstat = &it2->second;
3016 const object_stat_sum_t& sum = pstat->stats.sum;
3017 const string& pool_name = osdmap.get_pool_name(it.first);
3018 const pg_pool_t &pool = it.second;
3019 bool full = false, nearfull = false;
3020 if (pool.quota_max_objects > 0) {
3021 stringstream ss;
3022 if ((uint64_t)sum.num_objects >= pool.quota_max_objects) {
3023 } else if (crit_threshold > 0 &&
3024 sum.num_objects >= pool.quota_max_objects*crit_threshold) {
3025 ss << "pool '" << pool_name
3026 << "' has " << sum.num_objects << " objects"
3027 << " (max " << pool.quota_max_objects << ")";
3028 full_detail.push_back(ss.str());
3029 full = true;
3030 } else if (warn_threshold > 0 &&
3031 sum.num_objects >= pool.quota_max_objects*warn_threshold) {
3032 ss << "pool '" << pool_name
3033 << "' has " << sum.num_objects << " objects"
3034 << " (max " << pool.quota_max_objects << ")";
3035 nearfull_detail.push_back(ss.str());
3036 nearfull = true;
3037 }
3038 }
3039 if (pool.quota_max_bytes > 0) {
3040 stringstream ss;
3041 if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
3042 } else if (crit_threshold > 0 &&
3043 sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
3044 ss << "pool '" << pool_name
3045 << "' has " << si_t(sum.num_bytes) << " bytes"
3046 << " (max " << si_t(pool.quota_max_bytes) << ")";
3047 full_detail.push_back(ss.str());
3048 full = true;
3049 } else if (warn_threshold > 0 &&
3050 sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
3051 ss << "pool '" << pool_name
3052 << "' has " << si_t(sum.num_bytes) << " bytes"
3053 << " (max " << si_t(pool.quota_max_bytes) << ")";
3054 nearfull_detail.push_back(ss.str());
3055 nearfull = true;
3056 }
3057 }
3058 if (full) {
3059 ++full_pools;
3060 }
3061 if (nearfull) {
3062 ++nearfull_pools;
3063 }
3064 }
3065 if (full_pools) {
3066 ostringstream ss;
3067 ss << full_pools << " pools full";
3068 auto& d = checks->add("POOL_FULL", HEALTH_ERR, ss.str());
3069 d.detail.swap(full_detail);
3070 }
3071 if (nearfull_pools) {
3072 ostringstream ss;
3073 ss << nearfull_pools << " pools full";
3074 auto& d = checks->add("POOL_NEAR_FULL", HEALTH_WARN, ss.str());
3075 d.detail.swap(nearfull_detail);
3076 }
3077 }
3078
3079 // OBJECT_MISPLACED
3080 if (pg_sum.stats.sum.num_objects_misplaced &&
3081 pg_sum.stats.sum.num_object_copies > 0) {
3082 double pc = (double)pg_sum.stats.sum.num_objects_misplaced /
3083 (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
3084 char b[20];
3085 snprintf(b, sizeof(b), "%.3lf", pc);
3086 ostringstream ss;
3087 ss << pg_sum.stats.sum.num_objects_misplaced
3088 << "/" << pg_sum.stats.sum.num_object_copies << " objects misplaced ("
3089 << b << "%)";
3090 checks->add("OBJECT_MISPLACED", HEALTH_WARN, ss.str());
3091 }
3092
3093 // OBJECT_UNFOUND
3094 if (pg_sum.stats.sum.num_objects_unfound &&
3095 pg_sum.stats.sum.num_objects) {
3096 double pc = (double)pg_sum.stats.sum.num_objects_unfound /
3097 (double)pg_sum.stats.sum.num_objects * (double)100.0;
3098 char b[20];
3099 snprintf(b, sizeof(b), "%.3lf", pc);
3100 ostringstream ss;
3101 ss << pg_sum.stats.sum.num_objects_unfound
3102 << "/" << pg_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
3103 auto& d = checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str());
3104
3105 for (auto& p : pg_stat) {
3106 if (p.second.stats.sum.num_objects_unfound) {
3107 ostringstream ss;
3108 ss << "pg " << p.first
3109 << " has " << p.second.stats.sum.num_objects_unfound
3110 << " unfound objects";
3111 d.detail.push_back(ss.str());
3112 if (d.detail.size() > max) {
3113 d.detail.push_back("(additional pgs left out for brevity)");
3114 break;
3115 }
3116 }
3117 }
3118 }
3119
3120 // REQUEST_SLOW
3121 // REQUEST_STUCK
3122 if (cct->_conf->mon_osd_warn_op_age > 0 &&
3123 !osd_sum.op_queue_age_hist.h.empty() &&
3124 osd_sum.op_queue_age_hist.upper_bound() / 1000.0 >
3125 cct->_conf->mon_osd_warn_op_age) {
3126 list<string> warn_detail, error_detail;
3127 unsigned warn = 0, error = 0;
3128 float err_age =
3129 cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio;
3130 const pow2_hist_t& h = osd_sum.op_queue_age_hist;
3131 for (unsigned i = h.h.size() - 1; i > 0; --i) {
3132 float ub = (float)(1 << i) / 1000.0;
3133 if (ub < cct->_conf->mon_osd_warn_op_age)
3134 break;
3135 if (h.h[i]) {
3136 ostringstream ss;
3137 ss << h.h[i] << " ops are blocked > " << ub << " sec";
3138 if (ub > err_age) {
3139 error += h.h[i];
3140 error_detail.push_back(ss.str());
3141 } else {
3142 warn += h.h[i];
3143 warn_detail.push_back(ss.str());
3144 }
3145 }
3146 }
3147
3148 map<float,set<int>> warn_osd_by_max; // max -> osds
3149 map<float,set<int>> error_osd_by_max; // max -> osds
3150 if (!warn_detail.empty() || !error_detail.empty()) {
3151 for (auto& p : osd_stat) {
3152 const pow2_hist_t& h = p.second.op_queue_age_hist;
3153 for (unsigned i = h.h.size() - 1; i > 0; --i) {
3154 float ub = (float)(1 << i) / 1000.0;
3155 if (ub < cct->_conf->mon_osd_warn_op_age)
3156 break;
3157 if (h.h[i]) {
3158 if (ub > err_age) {
3159 error_osd_by_max[ub].insert(p.first);
3160 } else {
3161 warn_osd_by_max[ub].insert(p.first);
3162 }
3163 break;
3164 }
3165 }
3166 }
3167 }
3168
3169 if (!warn_detail.empty()) {
3170 ostringstream ss;
3171 ss << warn << " slow requests are blocked > "
3172 << cct->_conf->mon_osd_warn_op_age << " sec";
3173 auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str());
3174 d.detail.swap(warn_detail);
3175 int left = max;
3176 for (auto& p : warn_osd_by_max) {
3177 ostringstream ss;
3178 if (p.second.size() > 1) {
3179 ss << "osds " << p.second
3180 << " have blocked requests > " << p.first << " sec";
3181 } else {
3182 ss << "osd." << *p.second.begin()
3183 << " has blocked requests > " << p.first << " sec";
3184 }
3185 d.detail.push_back(ss.str());
3186 if (--left == 0) {
3187 break;
3188 }
3189 }
3190 }
3191 if (!error_detail.empty()) {
3192 ostringstream ss;
3193 ss << error << " stuck requests are blocked > "
3194 << err_age << " sec";
3195 auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str());
3196 d.detail.swap(error_detail);
3197 int left = max;
3198 for (auto& p : error_osd_by_max) {
3199 ostringstream ss;
3200 if (p.second.size() > 1) {
3201 ss << "osds " << p.second
3202 << " have stuck requests > " << p.first << " sec";
3203 } else {
3204 ss << "osd." << *p.second.begin()
3205 << " has stuck requests > " << p.first << " sec";
3206 }
3207 d.detail.push_back(ss.str());
3208 if (--left == 0) {
3209 break;
3210 }
3211 }
3212 }
3213 }
3214
3215 // PG_NOT_SCRUBBED
3216 // PG_NOT_DEEP_SCRUBBED
3217 {
3218 if (cct->_conf->mon_warn_not_scrubbed ||
3219 cct->_conf->mon_warn_not_deep_scrubbed) {
3220 list<string> detail, deep_detail;
3221 const double age = cct->_conf->mon_warn_not_scrubbed +
3222 cct->_conf->mon_scrub_interval;
3223 utime_t cutoff = now;
3224 cutoff -= age;
3225 const double deep_age = cct->_conf->mon_warn_not_deep_scrubbed +
3226 cct->_conf->osd_deep_scrub_interval;
3227 utime_t deep_cutoff = now;
3228 deep_cutoff -= deep_age;
3229 for (auto& p : pg_stat) {
3230 if (cct->_conf->mon_warn_not_scrubbed &&
3231 p.second.last_scrub_stamp < cutoff) {
3232 ostringstream ss;
3233 ss << "pg " << p.first << " not scrubbed since "
3234 << p.second.last_scrub_stamp;
3235 detail.push_back(ss.str());
3236 }
3237 if (cct->_conf->mon_warn_not_deep_scrubbed &&
3238 p.second.last_deep_scrub_stamp < deep_cutoff) {
3239 ostringstream ss;
3240 ss << "pg " << p.first << " not deep-scrubbed since "
3241 << p.second.last_deep_scrub_stamp;
3242 deep_detail.push_back(ss.str());
3243 }
3244 }
3245 if (!detail.empty()) {
3246 ostringstream ss;
3247 ss << detail.size() << " pgs not scrubbed for " << age;
3248 auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str());
3249 d.detail.swap(detail);
3250 }
3251 if (!deep_detail.empty()) {
3252 ostringstream ss;
3253 ss << deep_detail.size() << " pgs not deep-scrubbed for " << deep_age;
3254 auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str());
3255 d.detail.swap(deep_detail);
3256 }
3257 }
3258 }
3259
3260 // POOL_APP
3261 if (g_conf->get_val<bool>("mon_warn_on_pool_no_app")) {
3262 list<string> detail;
3263 for (auto &it : pools) {
3264 const pg_pool_t &pool = it.second;
3265 const string& pool_name = osdmap.get_pool_name(it.first);
3266 auto it2 = pg_pool_sum.find(it.first);
3267 if (it2 == pg_pool_sum.end()) {
3268 continue;
3269 }
3270 const pool_stat_t *pstat = &it2->second;
3271 if (pstat == nullptr) {
3272 continue;
3273 }
3274 const object_stat_sum_t& sum = pstat->stats.sum;
3275 // application metadata is not encoded until luminous is minimum
3276 // required release
3277 if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
3278 sum.num_objects > 0 && pool.application_metadata.empty() &&
3279 !pool.is_tier() && !g_conf->mon_debug_no_require_luminous) {
3280 stringstream ss;
3281 ss << "application not enabled on pool '" << pool_name << "'";
3282 detail.push_back(ss.str());
3283 }
3284 }
3285 if (!detail.empty()) {
3286 ostringstream ss;
3287 ss << "application not enabled on " << detail.size() << " pool(s)";
3288 auto& d = checks->add("POOL_APP_NOT_ENABLED", HEALTH_WARN, ss.str());
3289 stringstream tip;
3290 tip << "use 'ceph osd pool application enable <pool-name> "
3291 << "<app-name>', where <app-name> is 'cephfs', 'rbd', 'rgw', "
3292 << "or freeform for custom applications.";
3293 detail.push_back(tip.str());
3294 d.detail.swap(detail);
3295 }
3296 }
3297 }
3298
3299 void PGMap::get_health(
3300 CephContext *cct,
3301 const OSDMap& osdmap,
3302 list<pair<health_status_t,string> >& summary,
3303 list<pair<health_status_t,string> > *detail) const
3304 {
3305 map<string,int> note;
3306 auto p = num_pg_by_state.begin();
3307 auto p_end = num_pg_by_state.end();
3308 for (; p != p_end; ++p) {
3309 if (p->first & PG_STATE_STALE)
3310 note["stale"] += p->second;
3311 if (p->first & PG_STATE_DOWN)
3312 note["down"] += p->second;
3313 if (p->first & PG_STATE_UNDERSIZED)
3314 note["undersized"] += p->second;
3315 if (p->first & PG_STATE_DEGRADED)
3316 note["degraded"] += p->second;
3317 if (p->first & PG_STATE_INCONSISTENT)
3318 note["inconsistent"] += p->second;
3319 if (p->first & PG_STATE_PEERING)
3320 note["peering"] += p->second;
3321 if (p->first & PG_STATE_REPAIR)
3322 note["repair"] += p->second;
3323 if (p->first & PG_STATE_RECOVERING)
3324 note["recovering"] += p->second;
3325 if (p->first & PG_STATE_RECOVERY_WAIT)
3326 note["recovery_wait"] += p->second;
3327 if (p->first & PG_STATE_INCOMPLETE)
3328 note["incomplete"] += p->second;
3329 if (p->first & PG_STATE_BACKFILL_WAIT)
3330 note["backfill_wait"] += p->second;
3331 if (p->first & PG_STATE_BACKFILLING)
3332 note["backfilling"] += p->second;
3333 if (p->first & PG_STATE_BACKFILL_TOOFULL)
3334 note["backfill_toofull"] += p->second;
3335 if (p->first & PG_STATE_RECOVERY_TOOFULL)
3336 note["recovery_toofull"] += p->second;
3337 if (p->first & PG_STATE_SNAPTRIM_ERROR)
3338 note["snaptrim_error"] += p->second;
3339 }
3340
3341 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pgs;
3342 utime_t now(ceph_clock_now());
3343 utime_t cutoff = now - utime_t(cct->_conf->mon_pg_stuck_threshold, 0);
3344 uint64_t num_inactive_pgs = 0;
3345
3346 if (detail) {
3347 // we need to collect details of stuck pgs, first do a quick check
3348 // whether this will yield any results
3349 if (get_stuck_counts(cutoff, note)) {
3350
3351 // there are stuck pgs. gather details for specified statuses
3352 // only if we know that there are pgs stuck in that status
3353
3354 if (note.find("stuck inactive") != note.end()) {
3355 get_stuck_stats(PGMap::STUCK_INACTIVE, cutoff, stuck_pgs);
3356 note["stuck inactive"] = stuck_pgs.size();
3357 num_inactive_pgs += stuck_pgs.size();
3358 note_stuck_detail(PGMap::STUCK_INACTIVE, stuck_pgs,
3359 cct->_conf->mon_health_max_detail, detail);
3360 stuck_pgs.clear();
3361 }
3362
3363 if (note.find("stuck unclean") != note.end()) {
3364 get_stuck_stats(PGMap::STUCK_UNCLEAN, cutoff, stuck_pgs);
3365 note["stuck unclean"] = stuck_pgs.size();
3366 note_stuck_detail(PGMap::STUCK_UNCLEAN, stuck_pgs,
3367 cct->_conf->mon_health_max_detail, detail);
3368 stuck_pgs.clear();
3369 }
3370
3371 if (note.find("stuck undersized") != note.end()) {
3372 get_stuck_stats(PGMap::STUCK_UNDERSIZED, cutoff, stuck_pgs);
3373 note["stuck undersized"] = stuck_pgs.size();
3374 note_stuck_detail(PGMap::STUCK_UNDERSIZED, stuck_pgs,
3375 cct->_conf->mon_health_max_detail, detail);
3376 stuck_pgs.clear();
3377 }
3378
3379 if (note.find("stuck degraded") != note.end()) {
3380 get_stuck_stats(PGMap::STUCK_DEGRADED, cutoff, stuck_pgs);
3381 note["stuck degraded"] = stuck_pgs.size();
3382 note_stuck_detail(PGMap::STUCK_DEGRADED, stuck_pgs,
3383 cct->_conf->mon_health_max_detail, detail);
3384 stuck_pgs.clear();
3385 }
3386
3387 if (note.find("stuck stale") != note.end()) {
3388 get_stuck_stats(PGMap::STUCK_STALE, cutoff, stuck_pgs);
3389 note["stuck stale"] = stuck_pgs.size();
3390 num_inactive_pgs += stuck_pgs.size();
3391 note_stuck_detail(PGMap::STUCK_STALE, stuck_pgs,
3392 cct->_conf->mon_health_max_detail, detail);
3393 }
3394 }
3395 } else {
3396 get_stuck_counts(cutoff, note);
3397 auto p = note.find("stuck inactive");
3398 if (p != note.end())
3399 num_inactive_pgs += p->second;
3400 p = note.find("stuck stale");
3401 if (p != note.end())
3402 num_inactive_pgs += p->second;
3403 }
3404
3405 if (cct->_conf->mon_pg_min_inactive > 0 &&
3406 num_inactive_pgs >= cct->_conf->mon_pg_min_inactive) {
3407 ostringstream ss;
3408 ss << num_inactive_pgs << " pgs are stuck inactive for more than " << cct->_conf->mon_pg_stuck_threshold << " seconds";
3409 summary.push_back(make_pair(HEALTH_ERR, ss.str()));
3410 }
3411
3412 if (!note.empty()) {
3413 for (auto p = note.begin(); p != note.end(); ++p) {
3414 ostringstream ss;
3415 ss << p->second << " pgs " << p->first;
3416 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3417 }
3418 if (detail) {
3419 int n = 0, more = 0;
3420 int max = cct->_conf->mon_health_max_detail;
3421 for (auto p = pg_stat.begin();
3422 p != pg_stat.end();
3423 ++p) {
3424 if ((p->second.state & (PG_STATE_STALE |
3425 PG_STATE_DOWN |
3426 PG_STATE_UNDERSIZED |
3427 PG_STATE_DEGRADED |
3428 PG_STATE_INCONSISTENT |
3429 PG_STATE_PEERING |
3430 PG_STATE_REPAIR |
3431 PG_STATE_RECOVERING |
3432 PG_STATE_RECOVERY_WAIT |
3433 PG_STATE_RECOVERY_TOOFULL |
3434 PG_STATE_INCOMPLETE |
3435 PG_STATE_BACKFILL_WAIT |
3436 PG_STATE_BACKFILLING |
3437 PG_STATE_BACKFILL_TOOFULL)) &&
3438 stuck_pgs.count(p->first) == 0) {
3439 if (max > 0) {
3440 --max;
3441 } else {
3442 ++more;
3443 continue;
3444 }
3445 ++n;
3446 ostringstream ss;
3447 ss << "pg " << p->first << " is " << pg_state_string(p->second.state);
3448 ss << ", acting " << p->second.acting;
3449 if (p->second.stats.sum.num_objects_unfound)
3450 ss << ", " << p->second.stats.sum.num_objects_unfound << " unfound";
3451 if (p->second.state & PG_STATE_INCOMPLETE) {
3452 const pg_pool_t *pi = osdmap.get_pg_pool(p->first.pool());
3453 if (pi && pi->min_size > 1) {
3454 ss << " (reducing pool " << osdmap.get_pool_name(p->first.pool())
3455 << " min_size from " << (int)pi->min_size
3456 << " may help; search ceph.com/docs for 'incomplete')";
3457 }
3458 }
3459 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3460 }
3461 }
3462 if (more) {
3463 ostringstream ss;
3464 ss << more << " more pgs are also unhealthy";
3465 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3466 }
3467 }
3468 }
3469
3470 // slow requests
3471 if (cct->_conf->mon_osd_warn_op_age > 0 &&
3472 osd_sum.op_queue_age_hist.upper_bound() / 1000.0 >
3473 cct->_conf->mon_osd_warn_op_age) {
3474 auto sum = _warn_slow_request_histogram(
3475 cct, osd_sum.op_queue_age_hist, "", summary, NULL);
3476 if (sum.first > 0 || sum.second > 0) {
3477 if (sum.first > 0) {
3478 ostringstream ss;
3479 ss << sum.first << " requests are blocked > "
3480 << cct->_conf->mon_osd_warn_op_age
3481 << " sec";
3482 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3483 }
3484 if (sum.second > 0) {
3485 ostringstream ss;
3486 ss << sum.second << " requests are blocked > "
3487 << (cct->_conf->mon_osd_warn_op_age *
3488 cct->_conf->mon_osd_err_op_age_ratio)
3489 << " sec";
3490 summary.push_back(make_pair(HEALTH_ERR, ss.str()));
3491 }
3492
3493 if (detail) {
3494 unsigned num_warn = 0, num_err = 0;
3495 // do per-osd warnings
3496 for (auto p = osd_stat.begin();
3497 p != osd_stat.end();
3498 ++p) {
3499 auto sum = _warn_slow_request_histogram(
3500 cct,
3501 p->second.op_queue_age_hist,
3502 string(" on osd.") + stringify(p->first),
3503 summary, detail);
3504 if (sum.second)
3505 ++num_err;
3506 else if (sum.first)
3507 ++num_warn;
3508 }
3509 if (num_err) {
3510 ostringstream ss2;
3511 ss2 << num_err << " osds have very slow requests";
3512 summary.push_back(make_pair(HEALTH_ERR, ss2.str()));
3513 detail->push_back(make_pair(HEALTH_ERR, ss2.str()));
3514 }
3515 if (num_warn) {
3516 ostringstream ss2;
3517 ss2 << num_warn << " osds have slow requests";
3518 summary.push_back(make_pair(HEALTH_WARN, ss2.str()));
3519 detail->push_back(make_pair(HEALTH_WARN, ss2.str()));
3520 }
3521 }
3522 }
3523 }
3524
3525 // recovery
3526 list<string> sl;
3527 overall_recovery_summary(NULL, &sl);
3528 for (auto p = sl.begin(); p != sl.end(); ++p) {
3529 summary.push_back(make_pair(HEALTH_WARN, "recovery " + *p));
3530 if (detail)
3531 detail->push_back(make_pair(HEALTH_WARN, "recovery " + *p));
3532 }
3533
3534 // near-target max pools
3535 auto& pools = osdmap.get_pools();
3536 for (auto p = pools.begin();
3537 p != pools.end(); ++p) {
3538 if ((!p->second.target_max_objects && !p->second.target_max_bytes) ||
3539 !pg_pool_sum.count(p->first))
3540 continue;
3541 bool nearfull = false;
3542 const string& name = osdmap.get_pool_name(p->first);
3543 const pool_stat_t& st = get_pg_pool_sum_stat(p->first);
3544 uint64_t ratio = p->second.cache_target_full_ratio_micro +
3545 ((1000000 - p->second.cache_target_full_ratio_micro) *
3546 cct->_conf->mon_cache_target_full_warn_ratio);
3547 if (p->second.target_max_objects &&
3548 (uint64_t)(st.stats.sum.num_objects -
3549 st.stats.sum.num_objects_hit_set_archive) >
3550 p->second.target_max_objects * (ratio / 1000000.0)) {
3551 nearfull = true;
3552 if (detail) {
3553 ostringstream ss;
3554 ss << "cache pool '" << name << "' with "
3555 << si_t(st.stats.sum.num_objects)
3556 << " objects at/near target max "
3557 << si_t(p->second.target_max_objects) << " objects";
3558 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3559 }
3560 }
3561 if (p->second.target_max_bytes &&
3562 (uint64_t)(st.stats.sum.num_bytes -
3563 st.stats.sum.num_bytes_hit_set_archive) >
3564 p->second.target_max_bytes * (ratio / 1000000.0)) {
3565 nearfull = true;
3566 if (detail) {
3567 ostringstream ss;
3568 ss << "cache pool '" << name
3569 << "' with " << si_t(st.stats.sum.num_bytes)
3570 << "B at/near target max "
3571 << si_t(p->second.target_max_bytes) << "B";
3572 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3573 }
3574 }
3575 if (nearfull) {
3576 ostringstream ss;
3577 ss << "'" << name << "' at/near target max";
3578 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3579 }
3580 }
3581
3582 // scrub
3583 if (pg_sum.stats.sum.num_scrub_errors) {
3584 ostringstream ss;
3585 ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors";
3586 summary.push_back(make_pair(HEALTH_ERR, ss.str()));
3587 if (detail) {
3588 detail->push_back(make_pair(HEALTH_ERR, ss.str()));
3589 }
3590 }
3591
3592 // pg skew
3593 auto num_in = osdmap.get_num_in_osds();
3594 auto sum_pg_up = MAX(static_cast<unsigned>(pg_sum.up), pg_stat.size());
3595 int sum_objects = pg_sum.stats.sum.num_objects;
3596 if (sum_objects < cct->_conf->mon_pg_warn_min_objects) {
3597 return;
3598 }
3599 const auto min_pg_per_osd =
3600 cct->_conf->get_val<uint64_t>("mon_pg_warn_min_per_osd");
3601 if (num_in && min_pg_per_osd > 0) {
3602 auto per = sum_pg_up / num_in;
3603 if (per < min_pg_per_osd && per) {
3604 ostringstream ss;
3605 ss << "too few PGs per OSD (" << per << " < min " << min_pg_per_osd << ")";
3606 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3607 if (detail)
3608 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3609 }
3610 }
3611 int64_t max_pg_per_osd = cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd");
3612 if (num_in && max_pg_per_osd > 0) {
3613 int per = sum_pg_up / num_in;
3614 if (per > max_pg_per_osd) {
3615 ostringstream ss;
3616 ss << "too many PGs per OSD (" << per << " > max "
3617 << max_pg_per_osd << ")";
3618 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3619 if (detail)
3620 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3621 }
3622 }
3623 if (!pg_stat.empty()) {
3624 for (auto p = pg_pool_sum.begin();
3625 p != pg_pool_sum.end();
3626 ++p) {
3627 const pg_pool_t *pi = osdmap.get_pg_pool(p->first);
3628 if (!pi)
3629 continue; // in case osdmap changes haven't propagated to PGMap yet
3630 const string& name = osdmap.get_pool_name(p->first);
3631 if (pi->get_pg_num() > pi->get_pgp_num() &&
3632 !(name.find(".DELETED") != string::npos &&
3633 cct->_conf->mon_fake_pool_delete)) {
3634 ostringstream ss;
3635 ss << "pool " << name << " pg_num "
3636 << pi->get_pg_num() << " > pgp_num " << pi->get_pgp_num();
3637 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3638 if (detail)
3639 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3640 }
3641 int average_objects_per_pg = pg_sum.stats.sum.num_objects / pg_stat.size();
3642 if (average_objects_per_pg > 0 &&
3643 pg_sum.stats.sum.num_objects >= cct->_conf->mon_pg_warn_min_objects &&
3644 p->second.stats.sum.num_objects >= cct->_conf->mon_pg_warn_min_pool_objects) {
3645 int objects_per_pg = p->second.stats.sum.num_objects / pi->get_pg_num();
3646 float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
3647 if (cct->_conf->mon_pg_warn_max_object_skew > 0 &&
3648 ratio > cct->_conf->mon_pg_warn_max_object_skew) {
3649 ostringstream ss;
3650 ss << "pool " << name << " has many more objects per pg than average (too few pgs?)";
3651 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3652 if (detail) {
3653 ostringstream ss;
3654 ss << "pool " << name << " objects per pg ("
3655 << objects_per_pg << ") is more than " << ratio << " times cluster average ("
3656 << average_objects_per_pg << ")";
3657 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3658 }
3659 }
3660 }
3661 }
3662 }
3663
3664 for (auto it : pools) {
3665 auto it2 = pg_pool_sum.find(it.first);
3666 if (it2 == pg_pool_sum.end()) {
3667 continue;
3668 }
3669 const pool_stat_t *pstat = &it2->second;
3670 const object_stat_sum_t& sum = pstat->stats.sum;
3671 const string& pool_name = osdmap.get_pool_name(it.first);
3672 const pg_pool_t &pool = it.second;
3673
3674 float warn_threshold = (float)g_conf->mon_pool_quota_warn_threshold/100;
3675 float crit_threshold = (float)g_conf->mon_pool_quota_crit_threshold/100;
3676
3677 if (pool.quota_max_objects > 0) {
3678 stringstream ss;
3679 health_status_t status = HEALTH_OK;
3680 if ((uint64_t)sum.num_objects >= pool.quota_max_objects) {
3681 } else if (crit_threshold > 0 &&
3682 sum.num_objects >= pool.quota_max_objects*crit_threshold) {
3683 ss << "pool '" << pool_name
3684 << "' has " << sum.num_objects << " objects"
3685 << " (max " << pool.quota_max_objects << ")";
3686 status = HEALTH_ERR;
3687 } else if (warn_threshold > 0 &&
3688 sum.num_objects >= pool.quota_max_objects*warn_threshold) {
3689 ss << "pool '" << pool_name
3690 << "' has " << sum.num_objects << " objects"
3691 << " (max " << pool.quota_max_objects << ")";
3692 status = HEALTH_WARN;
3693 }
3694 if (status != HEALTH_OK) {
3695 pair<health_status_t,string> s(status, ss.str());
3696 summary.push_back(s);
3697 if (detail)
3698 detail->push_back(s);
3699 }
3700 }
3701
3702 if (pool.quota_max_bytes > 0) {
3703 health_status_t status = HEALTH_OK;
3704 stringstream ss;
3705 if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
3706 } else if (crit_threshold > 0 &&
3707 sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
3708 ss << "pool '" << pool_name
3709 << "' has " << si_t(sum.num_bytes) << " bytes"
3710 << " (max " << si_t(pool.quota_max_bytes) << ")";
3711 status = HEALTH_ERR;
3712 } else if (warn_threshold > 0 &&
3713 sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
3714 ss << "pool '" << pool_name
3715 << "' has " << si_t(sum.num_bytes) << " bytes"
3716 << " (max " << si_t(pool.quota_max_bytes) << ")";
3717 status = HEALTH_WARN;
3718 }
3719 if (status != HEALTH_OK) {
3720 pair<health_status_t,string> s(status, ss.str());
3721 summary.push_back(s);
3722 if (detail)
3723 detail->push_back(s);
3724 }
3725 }
3726 }
3727
3728 print_unscrubbed_pgs(pg_stat, summary, detail, cct);
3729 }
3730
3731 int process_pg_map_command(
3732 const string& orig_prefix,
3733 const map<string,cmd_vartype>& orig_cmdmap,
3734 const PGMap& pg_map,
3735 const OSDMap& osdmap,
3736 Formatter *f,
3737 stringstream *ss,
3738 bufferlist *odata)
3739 {
3740 string prefix = orig_prefix;
3741 map<string,cmd_vartype> cmdmap = orig_cmdmap;
3742
3743 // perhaps these would be better in the parsing, but it's weird
3744 bool primary = false;
3745 if (prefix == "pg dump_json") {
3746 vector<string> v;
3747 v.push_back(string("all"));
3748 cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
3749 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3750 prefix = "pg dump";
3751 } else if (prefix == "pg dump_pools_json") {
3752 vector<string> v;
3753 v.push_back(string("pools"));
3754 cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
3755 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3756 prefix = "pg dump";
3757 } else if (prefix == "pg ls-by-primary") {
3758 primary = true;
3759 prefix = "pg ls";
3760 } else if (prefix == "pg ls-by-osd") {
3761 prefix = "pg ls";
3762 } else if (prefix == "pg ls-by-pool") {
3763 prefix = "pg ls";
3764 string poolstr;
3765 cmd_getval(g_ceph_context, cmdmap, "poolstr", poolstr);
3766 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
3767 if (pool < 0) {
3768 *ss << "pool " << poolstr << " does not exist";
3769 return -ENOENT;
3770 }
3771 cmd_putval(g_ceph_context, cmdmap, "pool", pool);
3772 }
3773
3774 int r = 0;
3775 stringstream ds;
3776 if (prefix == "pg stat") {
3777 if (f) {
3778 f->open_object_section("pg_summary");
3779 pg_map.print_oneline_summary(f, NULL);
3780 f->close_section();
3781 f->flush(ds);
3782 } else {
3783 ds << pg_map;
3784 }
3785 odata->append(ds);
3786 return 0;
3787 }
3788
3789 if (prefix == "pg getmap") {
3790 pg_map.encode(*odata);
3791 *ss << "got pgmap version " << pg_map.version;
3792 return 0;
3793 }
3794
3795 if (prefix == "pg dump") {
3796 string val;
3797 vector<string> dumpcontents;
3798 set<string> what;
3799 if (cmd_getval(g_ceph_context, cmdmap, "dumpcontents", dumpcontents)) {
3800 copy(dumpcontents.begin(), dumpcontents.end(),
3801 inserter(what, what.end()));
3802 }
3803 if (what.empty())
3804 what.insert("all");
3805 if (f) {
3806 if (what.count("all")) {
3807 f->open_object_section("pg_map");
3808 pg_map.dump(f);
3809 f->close_section();
3810 } else if (what.count("summary") || what.count("sum")) {
3811 f->open_object_section("pg_map");
3812 pg_map.dump_basic(f);
3813 f->close_section();
3814 } else {
3815 if (what.count("pools")) {
3816 pg_map.dump_pool_stats(f);
3817 }
3818 if (what.count("osds")) {
3819 pg_map.dump_osd_stats(f);
3820 }
3821 if (what.count("pgs")) {
3822 pg_map.dump_pg_stats(f, false);
3823 }
3824 if (what.count("pgs_brief")) {
3825 pg_map.dump_pg_stats(f, true);
3826 }
3827 if (what.count("delta")) {
3828 f->open_object_section("delta");
3829 pg_map.dump_delta(f);
3830 f->close_section();
3831 }
3832 }
3833 f->flush(*odata);
3834 } else {
3835 if (what.count("all")) {
3836 pg_map.dump(ds);
3837 } else if (what.count("summary") || what.count("sum")) {
3838 pg_map.dump_basic(ds);
3839 pg_map.dump_pg_sum_stats(ds, true);
3840 pg_map.dump_osd_sum_stats(ds);
3841 } else {
3842 if (what.count("pgs_brief")) {
3843 pg_map.dump_pg_stats(ds, true);
3844 }
3845 bool header = true;
3846 if (what.count("pgs")) {
3847 pg_map.dump_pg_stats(ds, false);
3848 header = false;
3849 }
3850 if (what.count("pools")) {
3851 pg_map.dump_pool_stats(ds, header);
3852 }
3853 if (what.count("osds")) {
3854 pg_map.dump_osd_stats(ds);
3855 }
3856 }
3857 odata->append(ds);
3858 }
3859 *ss << "dumped " << what;
3860 return 0;
3861 }
3862
3863 if (prefix == "pg ls") {
3864 int64_t osd = -1;
3865 int64_t pool = -1;
3866 vector<string>states;
3867 set<pg_t> pgs;
3868 cmd_getval(g_ceph_context, cmdmap, "pool", pool);
3869 cmd_getval(g_ceph_context, cmdmap, "osd", osd);
3870 cmd_getval(g_ceph_context, cmdmap, "states", states);
3871 if (pool >= 0 && !osdmap.have_pg_pool(pool)) {
3872 *ss << "pool " << pool << " does not exist";
3873 return -ENOENT;
3874 }
3875 if (osd >= 0 && !osdmap.is_up(osd)) {
3876 *ss << "osd " << osd << " is not up";
3877 return -EAGAIN;
3878 }
3879 if (states.empty())
3880 states.push_back("all");
3881
3882 uint32_t state = 0;
3883
3884 while (!states.empty()) {
3885 string state_str = states.back();
3886
3887 if (state_str == "all") {
3888 state = -1;
3889 break;
3890 } else {
3891 auto filter = pg_string_state(state_str);
3892 if (!filter) {
3893 *ss << "'" << state_str << "' is not a valid pg state,"
3894 << " available choices: " << pg_state_string(0xFFFFFFFF);
3895 return -EINVAL;
3896 }
3897 state |= *filter;
3898 }
3899
3900 states.pop_back();
3901 }
3902
3903 pg_map.get_filtered_pg_stats(state, pool, osd, primary, pgs);
3904
3905 if (f && !pgs.empty()) {
3906 pg_map.dump_filtered_pg_stats(f, pgs);
3907 f->flush(*odata);
3908 } else if (!pgs.empty()) {
3909 pg_map.dump_filtered_pg_stats(ds, pgs);
3910 odata->append(ds);
3911 }
3912 return 0;
3913 }
3914
3915 if (prefix == "pg dump_stuck") {
3916 vector<string> stuckop_vec;
3917 cmd_getval(g_ceph_context, cmdmap, "stuckops", stuckop_vec);
3918 if (stuckop_vec.empty())
3919 stuckop_vec.push_back("unclean");
3920 int64_t threshold;
3921 cmd_getval(g_ceph_context, cmdmap, "threshold", threshold,
3922 int64_t(g_conf->mon_pg_stuck_threshold));
3923
3924 r = pg_map.dump_stuck_pg_stats(ds, f, (int)threshold, stuckop_vec);
3925 odata->append(ds);
3926 if (r < 0)
3927 *ss << "failed";
3928 else
3929 *ss << "ok";
3930 return 0;
3931 }
3932
3933 if (prefix == "pg debug") {
3934 string debugop;
3935 cmd_getval(g_ceph_context, cmdmap, "debugop", debugop,
3936 string("unfound_objects_exist"));
3937 if (debugop == "unfound_objects_exist") {
3938 bool unfound_objects_exist = false;
3939 for (const auto& p : pg_map.pg_stat) {
3940 if (p.second.stats.sum.num_objects_unfound > 0) {
3941 unfound_objects_exist = true;
3942 break;
3943 }
3944 }
3945 if (unfound_objects_exist)
3946 ds << "TRUE";
3947 else
3948 ds << "FALSE";
3949 odata->append(ds);
3950 return 0;
3951 }
3952 if (debugop == "degraded_pgs_exist") {
3953 bool degraded_pgs_exist = false;
3954 for (const auto& p : pg_map.pg_stat) {
3955 if (p.second.stats.sum.num_objects_degraded > 0) {
3956 degraded_pgs_exist = true;
3957 break;
3958 }
3959 }
3960 if (degraded_pgs_exist)
3961 ds << "TRUE";
3962 else
3963 ds << "FALSE";
3964 odata->append(ds);
3965 return 0;
3966 }
3967 }
3968
3969 if (prefix == "osd perf") {
3970 if (f) {
3971 f->open_object_section("osdstats");
3972 pg_map.dump_osd_perf_stats(f);
3973 f->close_section();
3974 f->flush(ds);
3975 } else {
3976 pg_map.print_osd_perf_stats(&ds);
3977 }
3978 odata->append(ds);
3979 return 0;
3980 }
3981
3982 if (prefix == "osd blocked-by") {
3983 if (f) {
3984 f->open_object_section("osd_blocked_by");
3985 pg_map.dump_osd_blocked_by_stats(f);
3986 f->close_section();
3987 f->flush(ds);
3988 } else {
3989 pg_map.print_osd_blocked_by_stats(&ds);
3990 }
3991 odata->append(ds);
3992 return 0;
3993 }
3994
3995 if (prefix == "osd pool stats") {
3996 string pool_name;
3997 cmd_getval(g_ceph_context, cmdmap, "name", pool_name);
3998
3999 int64_t poolid = -ENOENT;
4000 bool one_pool = false;
4001 if (!pool_name.empty()) {
4002 poolid = osdmap.lookup_pg_pool_name(pool_name);
4003 if (poolid < 0) {
4004 assert(poolid == -ENOENT);
4005 *ss << "unrecognized pool '" << pool_name << "'";
4006 return -ENOENT;
4007 }
4008 one_pool = true;
4009 }
4010
4011 stringstream rs;
4012
4013 if (f)
4014 f->open_array_section("pool_stats");
4015 else {
4016 if (osdmap.get_pools().empty()) {
4017 *ss << "there are no pools!";
4018 goto stats_out;
4019 }
4020 }
4021
4022 for (auto& p : osdmap.get_pools()) {
4023 if (!one_pool)
4024 poolid = p.first;
4025
4026 pool_name = osdmap.get_pool_name(poolid);
4027
4028 if (f) {
4029 f->open_object_section("pool");
4030 f->dump_string("pool_name", pool_name.c_str());
4031 f->dump_int("pool_id", poolid);
4032 f->open_object_section("recovery");
4033 }
4034
4035 list<string> sl;
4036 stringstream tss;
4037 pg_map.pool_recovery_summary(f, &sl, poolid);
4038 if (!f && !sl.empty()) {
4039 for (auto& p : sl)
4040 tss << " " << p << "\n";
4041 }
4042
4043 if (f) {
4044 f->close_section();
4045 f->open_object_section("recovery_rate");
4046 }
4047
4048 ostringstream rss;
4049 pg_map.pool_recovery_rate_summary(f, &rss, poolid);
4050 if (!f && !rss.str().empty())
4051 tss << " recovery io " << rss.str() << "\n";
4052
4053 if (f) {
4054 f->close_section();
4055 f->open_object_section("client_io_rate");
4056 }
4057 rss.clear();
4058 rss.str("");
4059
4060 pg_map.pool_client_io_rate_summary(f, &rss, poolid);
4061 if (!f && !rss.str().empty())
4062 tss << " client io " << rss.str() << "\n";
4063
4064 // dump cache tier IO rate for cache pool
4065 const pg_pool_t *pool = osdmap.get_pg_pool(poolid);
4066 if (pool->is_tier()) {
4067 if (f) {
4068 f->close_section();
4069 f->open_object_section("cache_io_rate");
4070 }
4071 rss.clear();
4072 rss.str("");
4073
4074 pg_map.pool_cache_io_rate_summary(f, &rss, poolid);
4075 if (!f && !rss.str().empty())
4076 tss << " cache tier io " << rss.str() << "\n";
4077 }
4078 if (f) {
4079 f->close_section();
4080 f->close_section();
4081 } else {
4082 rs << "pool " << pool_name << " id " << poolid << "\n";
4083 if (!tss.str().empty())
4084 rs << tss.str() << "\n";
4085 else
4086 rs << " nothing is going on\n\n";
4087 }
4088 if (one_pool)
4089 break;
4090 }
4091
4092 stats_out:
4093 if (f) {
4094 f->close_section();
4095 f->flush(ds);
4096 odata->append(ds);
4097 } else {
4098 odata->append(rs.str());
4099 }
4100 return 0;
4101 }
4102
4103 return -EOPNOTSUPP;
4104 }
4105
4106 void PGMapUpdater::check_osd_map(const OSDMap::Incremental &osd_inc,
4107 std::set<int> *need_check_down_pg_osds,
4108 std::map<int,utime_t> *last_osd_report,
4109 PGMap *pg_map,
4110 PGMap::Incremental *pending_inc)
4111 {
4112 for (const auto &p : osd_inc.new_weight) {
4113 if (p.second == CEPH_OSD_OUT) {
4114 dout(10) << __func__ << " osd." << p.first << " went OUT" << dendl;
4115 auto j = pg_map->osd_epochs.find(p.first);
4116 if (j != pg_map->osd_epochs.end())
4117 pending_inc->stat_osd_out(p.first, j->second);
4118 }
4119 }
4120
4121 // this is conservative: we want to know if any osds (maybe) got marked down.
4122 for (const auto &p : osd_inc.new_state) {
4123 if (p.second & CEPH_OSD_UP) { // true if marked up OR down,
4124 // but we're too lazy to check
4125 // which
4126 need_check_down_pg_osds->insert(p.first);
4127
4128 // clear out the last_osd_report for this OSD
4129 auto report = last_osd_report->find(p.first);
4130 if (report != last_osd_report->end()) {
4131 last_osd_report->erase(report);
4132 }
4133
4134 // clear out osd_stat slow request histogram
4135 dout(20) << __func__ << " clearing osd." << p.first
4136 << " request histogram" << dendl;
4137 pending_inc->stat_osd_down_up(p.first, osd_inc.epoch, *pg_map);
4138 }
4139
4140 if (p.second & CEPH_OSD_EXISTS) {
4141 // whether it was created *or* destroyed, we can safely drop
4142 // it's osd_stat_t record.
4143 dout(10) << __func__ << " osd." << p.first
4144 << " created or destroyed" << dendl;
4145 pending_inc->rm_stat(p.first);
4146
4147 // and adjust full, nearfull set
4148 pg_map->nearfull_osds.erase(p.first);
4149 pg_map->full_osds.erase(p.first);
4150 }
4151 }
4152 }
4153
4154 void PGMapUpdater::check_osd_map(
4155 CephContext *cct,
4156 const OSDMap& osdmap,
4157 const PGMap& pgmap,
4158 PGMap::Incremental *pending_inc)
4159 {
4160 for (auto& p : pgmap.osd_stat) {
4161 if (!osdmap.exists(p.first)) {
4162 // remove osd_stat
4163 pending_inc->rm_stat(p.first);
4164 } else if (osdmap.is_out(p.first)) {
4165 // zero osd_stat
4166 if (p.second.kb != 0) {
4167 auto j = pgmap.osd_epochs.find(p.first);
4168 if (j != pgmap.osd_epochs.end()) {
4169 pending_inc->stat_osd_out(p.first, j->second);
4170 }
4171 }
4172 } else if (!osdmap.is_up(p.first)) {
4173 // zero the op_queue_age_hist
4174 if (!p.second.op_queue_age_hist.empty()) {
4175 pending_inc->stat_osd_down_up(p.first, osdmap.get_epoch(), pgmap);
4176 }
4177 }
4178 }
4179
4180 // deleted pgs (pools)?
4181 for (auto& p : pgmap.pg_pool_sum) {
4182 if (!osdmap.have_pg_pool(p.first)) {
4183 ldout(cct, 10) << __func__ << " pool " << p.first << " gone, removing pgs"
4184 << dendl;
4185 for (auto& q : pgmap.pg_stat) {
4186 if (q.first.pool() == (uint64_t)p.first) {
4187 pending_inc->pg_remove.insert(q.first);
4188 }
4189 }
4190 auto q = pending_inc->pg_stat_updates.begin();
4191 while (q != pending_inc->pg_stat_updates.end()) {
4192 if (q->first.pool() == (uint64_t)p.first) {
4193 q = pending_inc->pg_stat_updates.erase(q);
4194 } else {
4195 ++q;
4196 }
4197 }
4198 }
4199 }
4200
4201 // new pgs (split or new pool)?
4202 for (auto& p : osdmap.get_pools()) {
4203 int64_t poolid = p.first;
4204 const pg_pool_t& pi = p.second;
4205 auto q = pgmap.num_pg_by_pool.find(poolid);
4206 unsigned my_pg_num = 0;
4207 if (q != pgmap.num_pg_by_pool.end())
4208 my_pg_num = q->second;
4209 unsigned pg_num = pi.get_pg_num();
4210 if (my_pg_num != pg_num) {
4211 ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
4212 << " != my pg_num " << my_pg_num << dendl;
4213 for (unsigned ps = my_pg_num; ps < pg_num; ++ps) {
4214 pg_t pgid(ps, poolid);
4215 if (pending_inc->pg_stat_updates.count(pgid) == 0) {
4216 ldout(cct,20) << __func__ << " adding " << pgid << dendl;
4217 pg_stat_t &stats = pending_inc->pg_stat_updates[pgid];
4218 stats.last_fresh = osdmap.get_modified();
4219 stats.last_active = osdmap.get_modified();
4220 stats.last_change = osdmap.get_modified();
4221 stats.last_peered = osdmap.get_modified();
4222 stats.last_clean = osdmap.get_modified();
4223 stats.last_unstale = osdmap.get_modified();
4224 stats.last_undegraded = osdmap.get_modified();
4225 stats.last_fullsized = osdmap.get_modified();
4226 stats.last_scrub_stamp = osdmap.get_modified();
4227 stats.last_deep_scrub_stamp = osdmap.get_modified();
4228 stats.last_clean_scrub_stamp = osdmap.get_modified();
4229 }
4230 }
4231 }
4232 }
4233 }
4234
4235 void PGMapUpdater::register_pg(
4236 const OSDMap &osd_map,
4237 pg_t pgid, epoch_t epoch,
4238 bool new_pool,
4239 const PGMap &pg_map,
4240 PGMap::Incremental *pending_inc)
4241 {
4242 pg_t parent;
4243 int split_bits = 0;
4244 auto parent_stat = pg_map.pg_stat.end();
4245 if (!new_pool) {
4246 parent = pgid;
4247 while (1) {
4248 // remove most significant bit
4249 int msb = cbits(parent.ps());
4250 if (!msb)
4251 break;
4252 parent.set_ps(parent.ps() & ~(1<<(msb-1)));
4253 split_bits++;
4254 dout(30) << " is " << pgid << " parent " << parent << " ?" << dendl;
4255 parent_stat = pg_map.pg_stat.find(parent);
4256 if (parent_stat != pg_map.pg_stat.end() &&
4257 parent_stat->second.state != PG_STATE_CREATING) {
4258 dout(10) << " parent is " << parent << dendl;
4259 break;
4260 }
4261 }
4262 }
4263
4264 pg_stat_t &stats = pending_inc->pg_stat_updates[pgid];
4265 stats.state = PG_STATE_CREATING;
4266 stats.created = epoch;
4267 stats.parent = parent;
4268 stats.parent_split_bits = split_bits;
4269 stats.mapping_epoch = epoch;
4270
4271 if (parent_stat != pg_map.pg_stat.end()) {
4272 const pg_stat_t &ps = parent_stat->second;
4273 stats.last_fresh = ps.last_fresh;
4274 stats.last_active = ps.last_active;
4275 stats.last_change = ps.last_change;
4276 stats.last_peered = ps.last_peered;
4277 stats.last_clean = ps.last_clean;
4278 stats.last_unstale = ps.last_unstale;
4279 stats.last_undegraded = ps.last_undegraded;
4280 stats.last_fullsized = ps.last_fullsized;
4281 stats.last_scrub_stamp = ps.last_scrub_stamp;
4282 stats.last_deep_scrub_stamp = ps.last_deep_scrub_stamp;
4283 stats.last_clean_scrub_stamp = ps.last_clean_scrub_stamp;
4284 } else {
4285 utime_t now = osd_map.get_modified();
4286 stats.last_fresh = now;
4287 stats.last_active = now;
4288 stats.last_change = now;
4289 stats.last_peered = now;
4290 stats.last_clean = now;
4291 stats.last_unstale = now;
4292 stats.last_undegraded = now;
4293 stats.last_fullsized = now;
4294 stats.last_scrub_stamp = now;
4295 stats.last_deep_scrub_stamp = now;
4296 stats.last_clean_scrub_stamp = now;
4297 }
4298
4299 osd_map.pg_to_up_acting_osds(
4300 pgid,
4301 &stats.up,
4302 &stats.up_primary,
4303 &stats.acting,
4304 &stats.acting_primary);
4305
4306 if (split_bits == 0) {
4307 dout(10) << __func__ << " will create " << pgid
4308 << " primary " << stats.acting_primary
4309 << " acting " << stats.acting
4310 << dendl;
4311 } else {
4312 dout(10) << __func__ << " will create " << pgid
4313 << " primary " << stats.acting_primary
4314 << " acting " << stats.acting
4315 << " parent " << parent
4316 << " by " << split_bits << " bits"
4317 << dendl;
4318 }
4319 }
4320
4321 void PGMapUpdater::register_new_pgs(
4322 const OSDMap &osd_map,
4323 const PGMap &pg_map,
4324 PGMap::Incremental *pending_inc)
4325 {
4326 epoch_t epoch = osd_map.get_epoch();
4327 dout(10) << __func__ << " checking pg pools for osdmap epoch " << epoch
4328 << ", last_pg_scan " << pg_map.last_pg_scan << dendl;
4329
4330 int created = 0;
4331 const auto &pools = osd_map.get_pools();
4332
4333 for (const auto &p : pools) {
4334 int64_t poolid = p.first;
4335 const pg_pool_t &pool = p.second;
4336 int ruleno = osd_map.crush->find_rule(pool.get_crush_rule(),
4337 pool.get_type(), pool.get_size());
4338 if (ruleno < 0 || !osd_map.crush->rule_exists(ruleno))
4339 continue;
4340
4341 if (pool.get_last_change() <= pg_map.last_pg_scan ||
4342 pool.get_last_change() <= pending_inc->pg_scan) {
4343 dout(10) << " no change in pool " << poolid << " " << pool << dendl;
4344 continue;
4345 }
4346
4347 dout(10) << __func__ << " scanning pool " << poolid
4348 << " " << pool << dendl;
4349
4350 // first pgs in this pool
4351 bool new_pool = pg_map.pg_pool_sum.count(poolid) == 0;
4352
4353 for (ps_t ps = 0; ps < pool.get_pg_num(); ps++) {
4354 pg_t pgid(ps, poolid, -1);
4355 if (pg_map.pg_stat.count(pgid)) {
4356 dout(20) << "register_new_pgs have " << pgid << dendl;
4357 continue;
4358 }
4359 created++;
4360 register_pg(osd_map, pgid, pool.get_last_change(), new_pool,
4361 pg_map, pending_inc);
4362 }
4363 }
4364
4365 int removed = 0;
4366 for (const auto &p : pg_map.creating_pgs) {
4367 if (p.preferred() >= 0) {
4368 dout(20) << " removing creating_pg " << p
4369 << " because it is localized and obsolete" << dendl;
4370 pending_inc->pg_remove.insert(p);
4371 ++removed;
4372 } else if (!osd_map.have_pg_pool(p.pool())) {
4373 dout(20) << " removing creating_pg " << p
4374 << " because containing pool deleted" << dendl;
4375 pending_inc->pg_remove.insert(p);
4376 ++removed;
4377 }
4378 }
4379
4380 // deleted pools?
4381 for (const auto &p : pg_map.pg_stat) {
4382 if (!osd_map.have_pg_pool(p.first.pool())) {
4383 dout(20) << " removing pg_stat " << p.first << " because "
4384 << "containing pool deleted" << dendl;
4385 pending_inc->pg_remove.insert(p.first);
4386 ++removed;
4387 } else if (p.first.preferred() >= 0) {
4388 dout(20) << " removing localized pg " << p.first << dendl;
4389 pending_inc->pg_remove.insert(p.first);
4390 ++removed;
4391 }
4392 }
4393
4394 // we don't want to redo this work if we can avoid it.
4395 pending_inc->pg_scan = epoch;
4396
4397 dout(10) << "register_new_pgs registered " << created << " new pgs, removed "
4398 << removed << " uncreated pgs" << dendl;
4399 }
4400
4401
4402 void PGMapUpdater::update_creating_pgs(
4403 const OSDMap &osd_map,
4404 const PGMap &pg_map,
4405 PGMap::Incremental *pending_inc)
4406 {
4407 dout(10) << __func__ << " to " << pg_map.creating_pgs.size()
4408 << " pgs, osdmap epoch " << osd_map.get_epoch()
4409 << dendl;
4410
4411 unsigned changed = 0;
4412 for (auto p = pg_map.creating_pgs.begin();
4413 p != pg_map.creating_pgs.end();
4414 ++p) {
4415 pg_t pgid = *p;
4416 pg_t on = pgid;
4417 auto q = pg_map.pg_stat.find(pgid);
4418 assert(q != pg_map.pg_stat.end());
4419 const pg_stat_t *s = &q->second;
4420
4421 if (s->parent_split_bits)
4422 on = s->parent;
4423
4424 vector<int> up, acting;
4425 int up_primary, acting_primary;
4426 osd_map.pg_to_up_acting_osds(
4427 on,
4428 &up,
4429 &up_primary,
4430 &acting,
4431 &acting_primary);
4432
4433 if (up != s->up ||
4434 up_primary != s->up_primary ||
4435 acting != s->acting ||
4436 acting_primary != s->acting_primary) {
4437 pg_stat_t *ns = &pending_inc->pg_stat_updates[pgid];
4438 if (osd_map.get_epoch() > ns->reported_epoch) {
4439 dout(20) << __func__ << " " << pgid << " "
4440 << " acting_primary: " << s->acting_primary
4441 << " -> " << acting_primary
4442 << " acting: " << s->acting << " -> " << acting
4443 << " up_primary: " << s->up_primary << " -> " << up_primary
4444 << " up: " << s->up << " -> " << up
4445 << dendl;
4446
4447 // only initialize if it wasn't already a pending update
4448 if (ns->reported_epoch == 0)
4449 *ns = *s;
4450
4451 // note epoch if the target of the create message changed
4452 if (acting_primary != ns->acting_primary)
4453 ns->mapping_epoch = osd_map.get_epoch();
4454
4455 ns->up = up;
4456 ns->up_primary = up_primary;
4457 ns->acting = acting;
4458 ns->acting_primary = acting_primary;
4459
4460 ++changed;
4461 } else {
4462 dout(20) << __func__ << " " << pgid << " has pending update from newer"
4463 << " epoch " << ns->reported_epoch
4464 << dendl;
4465 }
4466 }
4467 }
4468 if (changed) {
4469 dout(10) << __func__ << " " << changed << " pgs changed primary" << dendl;
4470 }
4471 }
4472
4473 static void _try_mark_pg_stale(
4474 const OSDMap& osdmap,
4475 pg_t pgid,
4476 const pg_stat_t& cur,
4477 PGMap::Incremental *pending_inc)
4478 {
4479 if ((cur.state & PG_STATE_STALE) == 0 &&
4480 cur.acting_primary != -1 &&
4481 osdmap.is_down(cur.acting_primary)) {
4482 pg_stat_t *newstat;
4483 auto q = pending_inc->pg_stat_updates.find(pgid);
4484 if (q != pending_inc->pg_stat_updates.end()) {
4485 if ((q->second.acting_primary == cur.acting_primary) ||
4486 ((q->second.state & PG_STATE_STALE) == 0 &&
4487 q->second.acting_primary != -1 &&
4488 osdmap.is_down(q->second.acting_primary))) {
4489 newstat = &q->second;
4490 } else {
4491 // pending update is no longer down or already stale
4492 return;
4493 }
4494 } else {
4495 newstat = &pending_inc->pg_stat_updates[pgid];
4496 *newstat = cur;
4497 }
4498 dout(10) << __func__ << " marking pg " << pgid
4499 << " stale (acting_primary " << newstat->acting_primary
4500 << ")" << dendl;
4501 newstat->state |= PG_STATE_STALE;
4502 newstat->last_unstale = ceph_clock_now();
4503 }
4504 }
4505
4506 void PGMapUpdater::check_down_pgs(
4507 const OSDMap &osdmap,
4508 const PGMap &pg_map,
4509 bool check_all,
4510 const set<int>& need_check_down_pg_osds,
4511 PGMap::Incremental *pending_inc)
4512 {
4513 // if a large number of osds changed state, just iterate over the whole
4514 // pg map.
4515 if (need_check_down_pg_osds.size() > (unsigned)osdmap.get_num_osds() *
4516 g_conf->mon_pg_check_down_all_threshold) {
4517 check_all = true;
4518 }
4519
4520 if (check_all) {
4521 for (const auto& p : pg_map.pg_stat) {
4522 _try_mark_pg_stale(osdmap, p.first, p.second, pending_inc);
4523 }
4524 } else {
4525 for (auto osd : need_check_down_pg_osds) {
4526 if (osdmap.is_down(osd)) {
4527 auto p = pg_map.pg_by_osd.find(osd);
4528 if (p == pg_map.pg_by_osd.end()) {
4529 continue;
4530 }
4531 for (auto pgid : p->second) {
4532 const pg_stat_t &stat = pg_map.pg_stat.at(pgid);
4533 assert(stat.acting_primary == osd);
4534 _try_mark_pg_stale(osdmap, pgid, stat, pending_inc);
4535 }
4536 }
4537 }
4538 }
4539 }
4540
4541 int reweight::by_utilization(
4542 const OSDMap &osdmap,
4543 const PGMap &pgm,
4544 int oload,
4545 double max_changef,
4546 int max_osds,
4547 bool by_pg, const set<int64_t> *pools,
4548 bool no_increasing,
4549 mempool::osdmap::map<int32_t, uint32_t>* new_weights,
4550 std::stringstream *ss,
4551 std::string *out_str,
4552 Formatter *f)
4553 {
4554 if (oload <= 100) {
4555 *ss << "You must give a percentage higher than 100. "
4556 "The reweighting threshold will be calculated as <average-utilization> "
4557 "times <input-percentage>. For example, an argument of 200 would "
4558 "reweight OSDs which are twice as utilized as the average OSD.\n";
4559 return -EINVAL;
4560 }
4561
4562 vector<int> pgs_by_osd(osdmap.get_max_osd());
4563
4564 // Avoid putting a small number (or 0) in the denominator when calculating
4565 // average_util
4566 double average_util;
4567 if (by_pg) {
4568 // by pg mapping
4569 double weight_sum = 0.0; // sum up the crush weights
4570 unsigned num_pg_copies = 0;
4571 int num_osds = 0;
4572 for (const auto& pg : pgm.pg_stat) {
4573 if (pools && pools->count(pg.first.pool()) == 0)
4574 continue;
4575 for (const auto acting : pg.second.acting) {
4576 if (!osdmap.exists(acting)) {
4577 continue;
4578 }
4579 if (acting >= (int)pgs_by_osd.size())
4580 pgs_by_osd.resize(acting);
4581 if (pgs_by_osd[acting] == 0) {
4582 if (osdmap.crush->get_item_weightf(acting) <= 0) {
4583 //skip if we currently can not identify item
4584 continue;
4585 }
4586 weight_sum += osdmap.crush->get_item_weightf(acting);
4587 ++num_osds;
4588 }
4589 ++pgs_by_osd[acting];
4590 ++num_pg_copies;
4591 }
4592 }
4593
4594 if (!num_osds || (num_pg_copies / num_osds < g_conf->mon_reweight_min_pgs_per_osd)) {
4595 *ss << "Refusing to reweight: we only have " << num_pg_copies
4596 << " PGs across " << num_osds << " osds!\n";
4597 return -EDOM;
4598 }
4599
4600 average_util = (double)num_pg_copies / weight_sum;
4601 } else {
4602 // by osd utilization
4603 int num_osd = MAX(1, pgm.osd_stat.size());
4604 if ((uint64_t)pgm.osd_sum.kb * 1024 / num_osd
4605 < g_conf->mon_reweight_min_bytes_per_osd) {
4606 *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb
4607 << " kb across all osds!\n";
4608 return -EDOM;
4609 }
4610 if ((uint64_t)pgm.osd_sum.kb_used * 1024 / num_osd
4611 < g_conf->mon_reweight_min_bytes_per_osd) {
4612 *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb_used
4613 << " kb used across all osds!\n";
4614 return -EDOM;
4615 }
4616
4617 average_util = (double)pgm.osd_sum.kb_used / (double)pgm.osd_sum.kb;
4618 }
4619
4620 // adjust down only if we are above the threshold
4621 const double overload_util = average_util * (double)oload / 100.0;
4622
4623 // but aggressively adjust weights up whenever possible.
4624 const double underload_util = average_util;
4625
4626 const unsigned max_change = (unsigned)(max_changef * (double)0x10000);
4627
4628 ostringstream oss;
4629 if (f) {
4630 f->open_object_section("reweight_by_utilization");
4631 f->dump_int("overload_min", oload);
4632 f->dump_float("max_change", max_changef);
4633 f->dump_int("max_change_osds", max_osds);
4634 f->dump_float("average_utilization", average_util);
4635 f->dump_float("overload_utilization", overload_util);
4636 } else {
4637 oss << "oload " << oload << "\n";
4638 oss << "max_change " << max_changef << "\n";
4639 oss << "max_change_osds " << max_osds << "\n";
4640 oss.precision(4);
4641 oss << "average_utilization " << std::fixed << average_util << "\n";
4642 oss << "overload_utilization " << overload_util << "\n";
4643 }
4644 int num_changed = 0;
4645
4646 // precompute util for each OSD
4647 std::vector<std::pair<int, float> > util_by_osd;
4648 for (const auto& p : pgm.osd_stat) {
4649 std::pair<int, float> osd_util;
4650 osd_util.first = p.first;
4651 if (by_pg) {
4652 if (p.first >= (int)pgs_by_osd.size() ||
4653 pgs_by_osd[p.first] == 0) {
4654 // skip if this OSD does not contain any pg
4655 // belonging to the specified pool(s).
4656 continue;
4657 }
4658
4659 if (osdmap.crush->get_item_weightf(p.first) <= 0) {
4660 // skip if we are unable to locate item.
4661 continue;
4662 }
4663
4664 osd_util.second = pgs_by_osd[p.first] / osdmap.crush->get_item_weightf(p.first);
4665 } else {
4666 osd_util.second = (double)p.second.kb_used / (double)p.second.kb;
4667 }
4668 util_by_osd.push_back(osd_util);
4669 }
4670
4671 // sort by absolute deviation from the mean utilization,
4672 // in descending order.
4673 std::sort(util_by_osd.begin(), util_by_osd.end(),
4674 [average_util](std::pair<int, float> l, std::pair<int, float> r) {
4675 return abs(l.second - average_util) > abs(r.second - average_util);
4676 }
4677 );
4678
4679 if (f)
4680 f->open_array_section("reweights");
4681
4682 for (const auto& p : util_by_osd) {
4683 unsigned weight = osdmap.get_weight(p.first);
4684 if (weight == 0) {
4685 // skip if OSD is currently out
4686 continue;
4687 }
4688 float util = p.second;
4689
4690 if (util >= overload_util) {
4691 // Assign a lower weight to overloaded OSDs. The current weight
4692 // is a factor to take into account the original weights,
4693 // to represent e.g. differing storage capacities
4694 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
4695 if (weight > max_change)
4696 new_weight = MAX(new_weight, weight - max_change);
4697 new_weights->insert({p.first, new_weight});
4698 if (f) {
4699 f->open_object_section("osd");
4700 f->dump_int("osd", p.first);
4701 f->dump_float("weight", (float)weight / (float)0x10000);
4702 f->dump_float("new_weight", (float)new_weight / (float)0x10000);
4703 f->close_section();
4704 } else {
4705 oss << "osd." << p.first << " weight "
4706 << (float)weight / (float)0x10000 << " -> "
4707 << (float)new_weight / (float)0x10000 << "\n";
4708 }
4709 if (++num_changed >= max_osds)
4710 break;
4711 }
4712 if (!no_increasing && util <= underload_util) {
4713 // assign a higher weight.. if we can.
4714 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
4715 new_weight = MIN(new_weight, weight + max_change);
4716 if (new_weight > 0x10000)
4717 new_weight = 0x10000;
4718 if (new_weight > weight) {
4719 new_weights->insert({p.first, new_weight});
4720 oss << "osd." << p.first << " weight "
4721 << (float)weight / (float)0x10000 << " -> "
4722 << (float)new_weight / (float)0x10000 << "\n";
4723 if (++num_changed >= max_osds)
4724 break;
4725 }
4726 }
4727 }
4728 if (f) {
4729 f->close_section();
4730 }
4731
4732 OSDMap newmap;
4733 newmap.deepish_copy_from(osdmap);
4734 OSDMap::Incremental newinc;
4735 newinc.fsid = newmap.get_fsid();
4736 newinc.epoch = newmap.get_epoch() + 1;
4737 newinc.new_weight = *new_weights;
4738 newmap.apply_incremental(newinc);
4739
4740 osdmap.summarize_mapping_stats(&newmap, pools, out_str, f);
4741
4742 if (f) {
4743 f->close_section();
4744 } else {
4745 *out_str += "\n";
4746 *out_str += oss.str();
4747 }
4748 return num_changed;
4749 }