]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/PGMap.cc
update sources to v12.2.3
[ceph.git] / ceph / src / mon / PGMap.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include <boost/algorithm/string.hpp>
5
6 #include "PGMap.h"
7
8 #define dout_subsys ceph_subsys_mon
9 #include "common/debug.h"
10 #include "common/Formatter.h"
11 #include "include/ceph_features.h"
12 #include "include/stringify.h"
13
14 #include "osd/osd_types.h"
15 #include "osd/OSDMap.h"
16
17 #define dout_context g_ceph_context
18
// Mempool object-factory definitions for PGMapDigest, PGMap and
// PGMap::Incremental; every invocation passes `pgmap` as its last argument.
// NOTE(review): presumably this routes allocations of these three types to
// the pgmap mempool -- confirm against the MEMPOOL_DEFINE_OBJECT_FACTORY macro.
19 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMapDigest, pgmap_digest, pgmap);
20 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap, pgmap, pgmap);
21 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap::Incremental, pgmap_inc, pgmap);
22
23
24 // ---------------------
25 // PGMapDigest
26
27 void PGMapDigest::encode(bufferlist& bl, uint64_t features) const
28 {
29 // NOTE: see PGMap::encode_digest
30 ENCODE_START(1, 1, bl);
31 ::encode(num_pg, bl);
32 ::encode(num_pg_active, bl);
33 ::encode(num_pg_unknown, bl);
34 ::encode(num_osd, bl);
35 ::encode(pg_pool_sum, bl, features);
36 ::encode(pg_sum, bl, features);
37 ::encode(osd_sum, bl);
38 ::encode(num_pg_by_state, bl);
39 ::encode(num_pg_by_osd, bl);
40 ::encode(num_pg_by_pool, bl);
41 ::encode(osd_last_seq, bl);
42 ::encode(per_pool_sum_delta, bl, features);
43 ::encode(per_pool_sum_deltas_stamps, bl);
44 ::encode(pg_sum_delta, bl, features);
45 ::encode(stamp_delta, bl);
46 ::encode(avail_space_by_rule, bl);
47 ENCODE_FINISH(bl);
48 }
49
50 void PGMapDigest::decode(bufferlist::iterator& p)
51 {
52 DECODE_START(1, p);
53 ::decode(num_pg, p);
54 ::decode(num_pg_active, p);
55 ::decode(num_pg_unknown, p);
56 ::decode(num_osd, p);
57 ::decode(pg_pool_sum, p);
58 ::decode(pg_sum, p);
59 ::decode(osd_sum, p);
60 ::decode(num_pg_by_state, p);
61 ::decode(num_pg_by_osd, p);
62 ::decode(num_pg_by_pool, p);
63 ::decode(osd_last_seq, p);
64 ::decode(per_pool_sum_delta, p);
65 ::decode(per_pool_sum_deltas_stamps, p);
66 ::decode(pg_sum_delta, p);
67 ::decode(stamp_delta, p);
68 ::decode(avail_space_by_rule, p);
69 DECODE_FINISH(p);
70 }
71
72 void PGMapDigest::dump(Formatter *f) const
73 {
74 f->dump_unsigned("num_pg", num_pg);
75 f->dump_unsigned("num_pg_active", num_pg_active);
76 f->dump_unsigned("num_pg_unknown", num_pg_unknown);
77 f->dump_unsigned("num_osd", num_osd);
78 f->dump_object("pool_sum", pg_sum);
79 f->dump_object("osd_sum", osd_sum);
80 f->open_array_section("pool_stats");
81 for (auto& p : pg_pool_sum) {
82 f->open_object_section("pool_stat");
83 f->dump_int("poolid", p.first);
84 auto q = num_pg_by_pool.find(p.first);
85 if (q != num_pg_by_pool.end())
86 f->dump_unsigned("num_pg", q->second);
87 p.second.dump(f);
88 f->close_section();
89 }
90 f->close_section();
91 f->open_array_section("osd_stats");
92 int i = 0;
93 // TODO: this isn't really correct since we can dump non-existent OSDs
94 // I dunno what osd_last_seq is set to in that case...
95 for (auto& p : osd_last_seq) {
96 f->open_object_section("osd_stat");
97 f->dump_int("osd", i);
98 f->dump_unsigned("seq", p);
99 f->close_section();
100 ++i;
101 }
102 f->close_section();
103 f->open_array_section("num_pg_by_state");
104 for (auto& p : num_pg_by_state) {
105 f->open_object_section("count");
106 f->dump_string("state", pg_state_string(p.first));
107 f->dump_unsigned("num", p.second);
108 f->close_section();
109 }
110 f->close_section();
111 f->open_array_section("num_pg_by_osd");
112 for (auto& p : num_pg_by_osd) {
113 f->open_object_section("count");
114 f->dump_unsigned("osd", p.first);
115 f->dump_unsigned("num_primary_pg", p.second.primary);
116 f->dump_unsigned("num_acting_pg", p.second.acting);
117 f->dump_unsigned("num_up_pg", p.second.up);
118 f->close_section();
119 }
120 f->close_section();
121 }
122
123 void PGMapDigest::generate_test_instances(list<PGMapDigest*>& ls)
124 {
125 ls.push_back(new PGMapDigest);
126 }
127
// Format `a` for table output: values below 0.01 collapse to the literal
// "0", everything else is printed in fixed notation with two decimals.
inline std::string percentify(const float& a) {
  if (a < 0.01) {
    return "0";
  }
  std::stringstream formatted;
  formatted << std::fixed << std::setprecision(2) << a;
  return formatted.str();
}
136
137 void PGMapDigest::print_summary(Formatter *f, ostream *out) const
138 {
139 if (f)
140 f->open_array_section("pgs_by_state");
141
142 // list is descending numeric order (by count)
143 multimap<int,int> state_by_count; // count -> state
144 for (auto p = num_pg_by_state.begin();
145 p != num_pg_by_state.end();
146 ++p) {
147 state_by_count.insert(make_pair(p->second, p->first));
148 }
149 if (f) {
150 for (auto p = state_by_count.rbegin();
151 p != state_by_count.rend();
152 ++p)
153 {
154 f->open_object_section("pgs_by_state_element");
155 f->dump_string("state_name", pg_state_string(p->second));
156 f->dump_unsigned("count", p->first);
157 f->close_section();
158 }
159 }
160 if (f)
161 f->close_section();
162
163 if (f) {
164 f->dump_unsigned("num_pgs", num_pg);
165 f->dump_unsigned("num_pools", pg_pool_sum.size());
166 f->dump_unsigned("num_objects", pg_sum.stats.sum.num_objects);
167 f->dump_unsigned("data_bytes", pg_sum.stats.sum.num_bytes);
168 f->dump_unsigned("bytes_used", osd_sum.kb_used * 1024ull);
169 f->dump_unsigned("bytes_avail", osd_sum.kb_avail * 1024ull);
170 f->dump_unsigned("bytes_total", osd_sum.kb * 1024ull);
171 } else {
172 *out << " pools: " << pg_pool_sum.size() << " pools, "
173 << num_pg << " pgs\n";
174 *out << " objects: " << si_t(pg_sum.stats.sum.num_objects) << " objects, "
175 << prettybyte_t(pg_sum.stats.sum.num_bytes) << "\n";
176 *out << " usage: "
177 << kb_t(osd_sum.kb_used) << " used, "
178 << kb_t(osd_sum.kb_avail) << " / "
179 << kb_t(osd_sum.kb) << " avail\n";
180 *out << " pgs: ";
181 }
182
183 bool pad = false;
184
185 if (num_pg_unknown > 0) {
186 float p = (float)num_pg_unknown / (float)num_pg;
187 if (f) {
188 f->dump_float("unknown_pgs_ratio", p);
189 } else {
190 char b[20];
191 snprintf(b, sizeof(b), "%.3lf", p * 100.0);
192 *out << b << "% pgs unknown\n";
193 pad = true;
194 }
195 }
196
197 int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
198 if (num_pg_inactive > 0) {
199 float p = (float)num_pg_inactive / (float)num_pg;
200 if (f) {
201 f->dump_float("inactive_pgs_ratio", p);
202 } else {
203 if (pad) {
204 *out << " ";
205 }
206 char b[20];
207 snprintf(b, sizeof(b), "%.3f", p * 100.0);
208 *out << b << "% pgs not active\n";
209 pad = true;
210 }
211 }
212
213 list<string> sl;
214 overall_recovery_summary(f, &sl);
215 if (!f && !sl.empty()) {
216 for (auto p = sl.begin(); p != sl.end(); ++p) {
217 if (pad) {
218 *out << " ";
219 }
220 *out << *p << "\n";
221 pad = true;
222 }
223 }
224 sl.clear();
225
226 if (!f) {
227 unsigned max_width = 1;
228 for (multimap<int,int>::reverse_iterator p = state_by_count.rbegin();
229 p != state_by_count.rend();
230 ++p)
231 {
232 std::stringstream ss;
233 ss << p->first;
234 max_width = MAX(ss.str().size(), max_width);
235 }
236
237 for (multimap<int,int>::reverse_iterator p = state_by_count.rbegin();
238 p != state_by_count.rend();
239 ++p)
240 {
241 if (pad) {
242 *out << " ";
243 }
244 pad = true;
245 out->setf(std::ios::left);
246 *out << std::setw(max_width) << p->first
247 << " " << pg_state_string(p->second) << "\n";
248 out->unsetf(std::ios::left);
249 }
250 }
251
252 ostringstream ss_rec_io;
253 overall_recovery_rate_summary(f, &ss_rec_io);
254 ostringstream ss_client_io;
255 overall_client_io_rate_summary(f, &ss_client_io);
256 ostringstream ss_cache_io;
257 overall_cache_io_rate_summary(f, &ss_cache_io);
258
259 if (!f && (ss_client_io.str().length() || ss_rec_io.str().length()
260 || ss_cache_io.str().length())) {
261 *out << "\n \n";
262 *out << " io:\n";
263 }
264
265 if (!f && ss_client_io.str().length())
266 *out << " client: " << ss_client_io.str() << "\n";
267 if (!f && ss_rec_io.str().length())
268 *out << " recovery: " << ss_rec_io.str() << "\n";
269 if (!f && ss_cache_io.str().length())
270 *out << " cache: " << ss_cache_io.str() << "\n";
271 }
272
273 void PGMapDigest::print_oneline_summary(Formatter *f, ostream *out) const
274 {
275 std::stringstream ss;
276
277 if (f)
278 f->open_array_section("num_pg_by_state");
279 for (auto p = num_pg_by_state.begin();
280 p != num_pg_by_state.end();
281 ++p) {
282 if (f) {
283 f->open_object_section("state");
284 f->dump_string("name", pg_state_string(p->first));
285 f->dump_unsigned("num", p->second);
286 f->close_section();
287 }
288 if (p != num_pg_by_state.begin())
289 ss << ", ";
290 ss << p->second << " " << pg_state_string(p->first);
291 }
292 if (f)
293 f->close_section();
294
295 string states = ss.str();
296 if (out)
297 *out << num_pg << " pgs: "
298 << states << "; "
299 << prettybyte_t(pg_sum.stats.sum.num_bytes) << " data, "
300 << kb_t(osd_sum.kb_used) << " used, "
301 << kb_t(osd_sum.kb_avail) << " / "
302 << kb_t(osd_sum.kb) << " avail";
303 if (f) {
304 f->dump_unsigned("num_pgs", num_pg);
305 f->dump_unsigned("num_bytes", pg_sum.stats.sum.num_bytes);
306 f->dump_unsigned("raw_bytes_used", osd_sum.kb_used << 10);
307 f->dump_unsigned("raw_bytes_avail", osd_sum.kb_avail << 10);
308 f->dump_unsigned("raw_bytes", osd_sum.kb << 10);
309 }
310
311 // make non-negative; we can get negative values if osds send
312 // uncommitted stats and then "go backward" or if they are just
313 // buggy/wrong.
314 pool_stat_t pos_delta = pg_sum_delta;
315 pos_delta.floor(0);
316 if (pos_delta.stats.sum.num_rd ||
317 pos_delta.stats.sum.num_wr) {
318 if (out)
319 *out << "; ";
320 if (pos_delta.stats.sum.num_rd) {
321 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)stamp_delta;
322 if (out)
323 *out << pretty_si_t(rd) << "B/s rd, ";
324 if (f)
325 f->dump_unsigned("read_bytes_sec", rd);
326 }
327 if (pos_delta.stats.sum.num_wr) {
328 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)stamp_delta;
329 if (out)
330 *out << pretty_si_t(wr) << "B/s wr, ";
331 if (f)
332 f->dump_unsigned("write_bytes_sec", wr);
333 }
334 int64_t iops = (pos_delta.stats.sum.num_rd + pos_delta.stats.sum.num_wr) / (double)stamp_delta;
335 if (out)
336 *out << pretty_si_t(iops) << "op/s";
337 if (f)
338 f->dump_unsigned("io_sec", iops);
339 }
340
341 list<string> sl;
342 overall_recovery_summary(f, &sl);
343 if (out)
344 for (auto p = sl.begin(); p != sl.end(); ++p)
345 *out << "; " << *p;
346 std::stringstream ssr;
347 overall_recovery_rate_summary(f, &ssr);
348 if (out && ssr.str().length())
349 *out << "; " << ssr.str() << " recovering";
350 }
351
352 void PGMapDigest::recovery_summary(Formatter *f, list<string> *psl,
353 const pool_stat_t& pool_sum) const
354 {
355 if (pool_sum.stats.sum.num_objects_degraded && pool_sum.stats.sum.num_object_copies > 0) {
356 double pc = (double)pool_sum.stats.sum.num_objects_degraded /
357 (double)pool_sum.stats.sum.num_object_copies * (double)100.0;
358 char b[20];
359 snprintf(b, sizeof(b), "%.3lf", pc);
360 if (f) {
361 f->dump_unsigned("degraded_objects", pool_sum.stats.sum.num_objects_degraded);
362 f->dump_unsigned("degraded_total", pool_sum.stats.sum.num_object_copies);
363 f->dump_float("degraded_ratio", pc / 100.0);
364 } else {
365 ostringstream ss;
366 ss << pool_sum.stats.sum.num_objects_degraded
367 << "/" << pool_sum.stats.sum.num_object_copies << " objects degraded (" << b << "%)";
368 psl->push_back(ss.str());
369 }
370 }
371 if (pool_sum.stats.sum.num_objects_misplaced && pool_sum.stats.sum.num_object_copies > 0) {
372 double pc = (double)pool_sum.stats.sum.num_objects_misplaced /
373 (double)pool_sum.stats.sum.num_object_copies * (double)100.0;
374 char b[20];
375 snprintf(b, sizeof(b), "%.3lf", pc);
376 if (f) {
377 f->dump_unsigned("misplaced_objects", pool_sum.stats.sum.num_objects_misplaced);
378 f->dump_unsigned("misplaced_total", pool_sum.stats.sum.num_object_copies);
379 f->dump_float("misplaced_ratio", pc / 100.0);
380 } else {
381 ostringstream ss;
382 ss << pool_sum.stats.sum.num_objects_misplaced
383 << "/" << pool_sum.stats.sum.num_object_copies << " objects misplaced (" << b << "%)";
384 psl->push_back(ss.str());
385 }
386 }
387 if (pool_sum.stats.sum.num_objects_unfound && pool_sum.stats.sum.num_objects) {
388 double pc = (double)pool_sum.stats.sum.num_objects_unfound /
389 (double)pool_sum.stats.sum.num_objects * (double)100.0;
390 char b[20];
391 snprintf(b, sizeof(b), "%.3lf", pc);
392 if (f) {
393 f->dump_unsigned("unfound_objects", pool_sum.stats.sum.num_objects_unfound);
394 f->dump_unsigned("unfound_total", pool_sum.stats.sum.num_objects);
395 f->dump_float("unfound_ratio", pc / 100.0);
396 } else {
397 ostringstream ss;
398 ss << pool_sum.stats.sum.num_objects_unfound
399 << "/" << pool_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
400 psl->push_back(ss.str());
401 }
402 }
403 }
404
405 void PGMapDigest::recovery_rate_summary(Formatter *f, ostream *out,
406 const pool_stat_t& delta_sum,
407 utime_t delta_stamp) const
408 {
409 // make non-negative; we can get negative values if osds send
410 // uncommitted stats and then "go backward" or if they are just
411 // buggy/wrong.
412 pool_stat_t pos_delta = delta_sum;
413 pos_delta.floor(0);
414 if (pos_delta.stats.sum.num_objects_recovered ||
415 pos_delta.stats.sum.num_bytes_recovered ||
416 pos_delta.stats.sum.num_keys_recovered) {
417 int64_t objps = pos_delta.stats.sum.num_objects_recovered / (double)delta_stamp;
418 int64_t bps = pos_delta.stats.sum.num_bytes_recovered / (double)delta_stamp;
419 int64_t kps = pos_delta.stats.sum.num_keys_recovered / (double)delta_stamp;
420 if (f) {
421 f->dump_int("recovering_objects_per_sec", objps);
422 f->dump_int("recovering_bytes_per_sec", bps);
423 f->dump_int("recovering_keys_per_sec", kps);
424 f->dump_int("num_objects_recovered", pos_delta.stats.sum.num_objects_recovered);
425 f->dump_int("num_bytes_recovered", pos_delta.stats.sum.num_bytes_recovered);
426 f->dump_int("num_keys_recovered", pos_delta.stats.sum.num_keys_recovered);
427 } else {
428 *out << pretty_si_t(bps) << "B/s";
429 if (pos_delta.stats.sum.num_keys_recovered)
430 *out << ", " << pretty_si_t(kps) << "keys/s";
431 *out << ", " << pretty_si_t(objps) << "objects/s";
432 }
433 }
434 }
435
436 void PGMapDigest::overall_recovery_rate_summary(Formatter *f, ostream *out) const
437 {
438 recovery_rate_summary(f, out, pg_sum_delta, stamp_delta);
439 }
440
441 void PGMapDigest::overall_recovery_summary(Formatter *f, list<string> *psl) const
442 {
443 recovery_summary(f, psl, pg_sum);
444 }
445
446 void PGMapDigest::pool_recovery_rate_summary(Formatter *f, ostream *out,
447 uint64_t poolid) const
448 {
449 auto p = per_pool_sum_delta.find(poolid);
450 if (p == per_pool_sum_delta.end())
451 return;
452
453 auto ts = per_pool_sum_deltas_stamps.find(p->first);
454 assert(ts != per_pool_sum_deltas_stamps.end());
455 recovery_rate_summary(f, out, p->second.first, ts->second);
456 }
457
458 void PGMapDigest::pool_recovery_summary(Formatter *f, list<string> *psl,
459 uint64_t poolid) const
460 {
461 auto p = pg_pool_sum.find(poolid);
462 if (p == pg_pool_sum.end())
463 return;
464
465 recovery_summary(f, psl, p->second);
466 }
467
468 void PGMapDigest::client_io_rate_summary(Formatter *f, ostream *out,
469 const pool_stat_t& delta_sum,
470 utime_t delta_stamp) const
471 {
472 pool_stat_t pos_delta = delta_sum;
473 pos_delta.floor(0);
474 if (pos_delta.stats.sum.num_rd ||
475 pos_delta.stats.sum.num_wr) {
476 if (pos_delta.stats.sum.num_rd) {
477 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)delta_stamp;
478 if (f) {
479 f->dump_int("read_bytes_sec", rd);
480 } else {
481 *out << pretty_si_t(rd) << "B/s rd, ";
482 }
483 }
484 if (pos_delta.stats.sum.num_wr) {
485 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)delta_stamp;
486 if (f) {
487 f->dump_int("write_bytes_sec", wr);
488 } else {
489 *out << pretty_si_t(wr) << "B/s wr, ";
490 }
491 }
492 int64_t iops_rd = pos_delta.stats.sum.num_rd / (double)delta_stamp;
493 int64_t iops_wr = pos_delta.stats.sum.num_wr / (double)delta_stamp;
494 if (f) {
495 f->dump_int("read_op_per_sec", iops_rd);
496 f->dump_int("write_op_per_sec", iops_wr);
497 } else {
498 *out << pretty_si_t(iops_rd) << "op/s rd, " << pretty_si_t(iops_wr) << "op/s wr";
499 }
500 }
501 }
502
503 void PGMapDigest::overall_client_io_rate_summary(Formatter *f, ostream *out) const
504 {
505 client_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
506 }
507
508 void PGMapDigest::pool_client_io_rate_summary(Formatter *f, ostream *out,
509 uint64_t poolid) const
510 {
511 auto p = per_pool_sum_delta.find(poolid);
512 if (p == per_pool_sum_delta.end())
513 return;
514
515 auto ts = per_pool_sum_deltas_stamps.find(p->first);
516 assert(ts != per_pool_sum_deltas_stamps.end());
517 client_io_rate_summary(f, out, p->second.first, ts->second);
518 }
519
520 void PGMapDigest::cache_io_rate_summary(Formatter *f, ostream *out,
521 const pool_stat_t& delta_sum,
522 utime_t delta_stamp) const
523 {
524 pool_stat_t pos_delta = delta_sum;
525 pos_delta.floor(0);
526 bool have_output = false;
527
528 if (pos_delta.stats.sum.num_flush) {
529 int64_t flush = (pos_delta.stats.sum.num_flush_kb << 10) / (double)delta_stamp;
530 if (f) {
531 f->dump_int("flush_bytes_sec", flush);
532 } else {
533 *out << pretty_si_t(flush) << "B/s flush";
534 have_output = true;
535 }
536 }
537 if (pos_delta.stats.sum.num_evict) {
538 int64_t evict = (pos_delta.stats.sum.num_evict_kb << 10) / (double)delta_stamp;
539 if (f) {
540 f->dump_int("evict_bytes_sec", evict);
541 } else {
542 if (have_output)
543 *out << ", ";
544 *out << pretty_si_t(evict) << "B/s evict";
545 have_output = true;
546 }
547 }
548 if (pos_delta.stats.sum.num_promote) {
549 int64_t promote = pos_delta.stats.sum.num_promote / (double)delta_stamp;
550 if (f) {
551 f->dump_int("promote_op_per_sec", promote);
552 } else {
553 if (have_output)
554 *out << ", ";
555 *out << pretty_si_t(promote) << "op/s promote";
556 have_output = true;
557 }
558 }
559 if (pos_delta.stats.sum.num_flush_mode_low) {
560 if (f) {
561 f->dump_int("num_flush_mode_low", pos_delta.stats.sum.num_flush_mode_low);
562 } else {
563 if (have_output)
564 *out << ", ";
565 *out << pretty_si_t(pos_delta.stats.sum.num_flush_mode_low) << "PG(s) flushing";
566 have_output = true;
567 }
568 }
569 if (pos_delta.stats.sum.num_flush_mode_high) {
570 if (f) {
571 f->dump_int("num_flush_mode_high", pos_delta.stats.sum.num_flush_mode_high);
572 } else {
573 if (have_output)
574 *out << ", ";
575 *out << pretty_si_t(pos_delta.stats.sum.num_flush_mode_high) << "PG(s) flushing (high)";
576 have_output = true;
577 }
578 }
579 if (pos_delta.stats.sum.num_evict_mode_some) {
580 if (f) {
581 f->dump_int("num_evict_mode_some", pos_delta.stats.sum.num_evict_mode_some);
582 } else {
583 if (have_output)
584 *out << ", ";
585 *out << pretty_si_t(pos_delta.stats.sum.num_evict_mode_some) << "PG(s) evicting";
586 have_output = true;
587 }
588 }
589 if (pos_delta.stats.sum.num_evict_mode_full) {
590 if (f) {
591 f->dump_int("num_evict_mode_full", pos_delta.stats.sum.num_evict_mode_full);
592 } else {
593 if (have_output)
594 *out << ", ";
595 *out << pretty_si_t(pos_delta.stats.sum.num_evict_mode_full) << "PG(s) evicting (full)";
596 }
597 }
598 }
599
600 void PGMapDigest::overall_cache_io_rate_summary(Formatter *f, ostream *out) const
601 {
602 cache_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
603 }
604
605 void PGMapDigest::pool_cache_io_rate_summary(Formatter *f, ostream *out,
606 uint64_t poolid) const
607 {
608 auto p = per_pool_sum_delta.find(poolid);
609 if (p == per_pool_sum_delta.end())
610 return;
611
612 auto ts = per_pool_sum_deltas_stamps.find(p->first);
613 assert(ts != per_pool_sum_deltas_stamps.end());
614 cache_io_rate_summary(f, out, p->second.first, ts->second);
615 }
616
617 static float pool_raw_used_rate(const OSDMap &osd_map, int64_t poolid)
618 {
619 const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
620
621 switch (pool->get_type()) {
622 case pg_pool_t::TYPE_REPLICATED:
623 return pool->get_size();
624 break;
625 case pg_pool_t::TYPE_ERASURE:
626 {
627 auto& ecp =
628 osd_map.get_erasure_code_profile(pool->erasure_code_profile);
629 auto pm = ecp.find("m");
630 auto pk = ecp.find("k");
631 if (pm != ecp.end() && pk != ecp.end()) {
632 int k = atoi(pk->second.c_str());
633 int m = atoi(pm->second.c_str());
634 int mk = m + k;
635 assert(mk != 0);
636 assert(k != 0);
637 return (float)mk / k;
638 } else {
639 return 0.0;
640 }
641 }
642 break;
643 default:
644 assert(0 == "unrecognized pool type");
645 }
646 }
647
648 ceph_statfs PGMapDigest::get_statfs(OSDMap &osdmap,
649 boost::optional<int64_t> data_pool) const
650 {
651 ceph_statfs statfs;
652 bool filter = false;
653 object_stat_sum_t sum;
654
655 if (data_pool) {
656 auto i = pg_pool_sum.find(*data_pool);
657 if (i != pg_pool_sum.end()) {
658 sum = i->second.stats.sum;
659 filter = true;
660 }
661 }
662
663 if (filter) {
664 statfs.kb_used = (sum.num_bytes >> 10);
665 statfs.kb_avail = get_pool_free_space(osdmap, *data_pool) >> 10;
666 statfs.num_objects = sum.num_objects;
667 statfs.kb = statfs.kb_used + statfs.kb_avail;
668 } else {
669 // these are in KB.
670 statfs.kb = osd_sum.kb;
671 statfs.kb_used = osd_sum.kb_used;
672 statfs.kb_avail = osd_sum.kb_avail;
673 statfs.num_objects = pg_sum.stats.sum.num_objects;
674 }
675
676 return statfs;
677 }
678
679 void PGMapDigest::dump_pool_stats_full(
680 const OSDMap &osd_map,
681 stringstream *ss,
682 Formatter *f,
683 bool verbose) const
684 {
685 TextTable tbl;
686
687 if (f) {
688 f->open_array_section("pools");
689 } else {
690 tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
691 tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT);
692 if (verbose) {
693 tbl.define_column("QUOTA OBJECTS", TextTable::LEFT, TextTable::LEFT);
694 tbl.define_column("QUOTA BYTES", TextTable::LEFT, TextTable::LEFT);
695 }
696
697 tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
698 tbl.define_column("%USED", TextTable::LEFT, TextTable::RIGHT);
699 tbl.define_column("MAX AVAIL", TextTable::LEFT, TextTable::RIGHT);
700 tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
701 if (verbose) {
702 tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT);
703 tbl.define_column("READ", TextTable::LEFT, TextTable::RIGHT);
704 tbl.define_column("WRITE", TextTable::LEFT, TextTable::RIGHT);
705 tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
706 }
707 }
708
709 map<int,uint64_t> avail_by_rule;
710 for (auto p = osd_map.get_pools().begin();
711 p != osd_map.get_pools().end(); ++p) {
712 int64_t pool_id = p->first;
713 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
714 continue;
715 const string& pool_name = osd_map.get_pool_name(pool_id);
716 const pool_stat_t &stat = pg_pool_sum.at(pool_id);
717
718 const pg_pool_t *pool = osd_map.get_pg_pool(pool_id);
719 int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
720 pool->get_type(),
721 pool->get_size());
722 int64_t avail;
723 float raw_used_rate;
724 if (avail_by_rule.count(ruleno) == 0) {
725 // FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked
726 avail = get_rule_avail(ruleno);
727 if (avail < 0)
728 avail = 0;
729 avail_by_rule[ruleno] = avail;
730 } else {
731 avail = avail_by_rule[ruleno];
732 }
733
734 raw_used_rate = ::pool_raw_used_rate(osd_map, pool_id);
735
736 if (f) {
737 f->open_object_section("pool");
738 f->dump_string("name", pool_name);
739 f->dump_int("id", pool_id);
740 f->open_object_section("stats");
741 } else {
742 tbl << pool_name
743 << pool_id;
744 if (verbose) {
745 if (pool->quota_max_objects == 0)
746 tbl << "N/A";
747 else
748 tbl << si_t(pool->quota_max_objects);
749
750 if (pool->quota_max_bytes == 0)
751 tbl << "N/A";
752 else
753 tbl << si_t(pool->quota_max_bytes);
754 }
755
756 }
757 dump_object_stat_sum(tbl, f, stat.stats.sum, avail, raw_used_rate, verbose, pool);
758 if (f)
759 f->close_section(); // stats
760 else
761 tbl << TextTable::endrow;
762
763 if (f)
764 f->close_section(); // pool
765 }
766 if (f)
767 f->close_section();
768 else {
769 assert(ss != nullptr);
770 *ss << "POOLS:\n";
771 tbl.set_indent(4);
772 *ss << tbl;
773 }
774 }
775
776 void PGMapDigest::dump_fs_stats(stringstream *ss, Formatter *f, bool verbose) const
777 {
778 if (f) {
779 f->open_object_section("stats");
780 f->dump_int("total_bytes", osd_sum.kb * 1024ull);
781 f->dump_int("total_used_bytes", osd_sum.kb_used * 1024ull);
782 f->dump_int("total_avail_bytes", osd_sum.kb_avail * 1024ull);
783 if (verbose) {
784 f->dump_int("total_objects", pg_sum.stats.sum.num_objects);
785 }
786 f->close_section();
787 } else {
788 assert(ss != nullptr);
789 TextTable tbl;
790 tbl.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
791 tbl.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
792 tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
793 tbl.define_column("%RAW USED", TextTable::LEFT, TextTable::RIGHT);
794 if (verbose) {
795 tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
796 }
797 tbl << stringify(si_t(osd_sum.kb*1024))
798 << stringify(si_t(osd_sum.kb_avail*1024))
799 << stringify(si_t(osd_sum.kb_used*1024));
800 float used = 0.0;
801 if (osd_sum.kb > 0) {
802 used = ((float)osd_sum.kb_used / osd_sum.kb);
803 }
804 tbl << percentify(used*100);
805 if (verbose) {
806 tbl << stringify(si_t(pg_sum.stats.sum.num_objects));
807 }
808 tbl << TextTable::endrow;
809
810 *ss << "GLOBAL:\n";
811 tbl.set_indent(4);
812 *ss << tbl;
813 }
814 }
815
816 void PGMapDigest::dump_object_stat_sum(
817 TextTable &tbl, Formatter *f,
818 const object_stat_sum_t &sum, uint64_t avail,
819 float raw_used_rate, bool verbose,
820 const pg_pool_t *pool)
821 {
822 float curr_object_copies_rate = 0.0;
823 if (sum.num_object_copies > 0)
824 curr_object_copies_rate = (float)(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies;
825
826 float used = 0.0;
827 // note avail passed in is raw_avail, calc raw_used here.
828 if (avail) {
829 used = sum.num_bytes * raw_used_rate * curr_object_copies_rate;
830 used /= used + avail;
831 } else if (sum.num_bytes) {
832 used = 1.0;
833 }
834
835 if (f) {
836 f->dump_int("kb_used", SHIFT_ROUND_UP(sum.num_bytes, 10));
837 f->dump_int("bytes_used", sum.num_bytes);
838 f->dump_format_unquoted("percent_used", "%.2f", (used*100));
839 f->dump_unsigned("max_avail", avail / raw_used_rate);
840 f->dump_int("objects", sum.num_objects);
841 if (verbose) {
842 f->dump_int("quota_objects", pool->quota_max_objects);
843 f->dump_int("quota_bytes", pool->quota_max_bytes);
844 f->dump_int("dirty", sum.num_objects_dirty);
845 f->dump_int("rd", sum.num_rd);
846 f->dump_int("rd_bytes", sum.num_rd_kb * 1024ull);
847 f->dump_int("wr", sum.num_wr);
848 f->dump_int("wr_bytes", sum.num_wr_kb * 1024ull);
849 f->dump_int("raw_bytes_used", sum.num_bytes * raw_used_rate * curr_object_copies_rate);
850 }
851 } else {
852 tbl << stringify(si_t(sum.num_bytes));
853 tbl << percentify(used*100);
854 tbl << si_t(avail / raw_used_rate);
855 tbl << sum.num_objects;
856 if (verbose) {
857 tbl << stringify(si_t(sum.num_objects_dirty))
858 << stringify(si_t(sum.num_rd))
859 << stringify(si_t(sum.num_wr))
860 << stringify(si_t(sum.num_bytes * raw_used_rate * curr_object_copies_rate));
861 }
862 }
863 }
864
865 int64_t PGMapDigest::get_pool_free_space(const OSDMap &osd_map,
866 int64_t poolid) const
867 {
868 const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
869 int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
870 pool->get_type(),
871 pool->get_size());
872 int64_t avail;
873 avail = get_rule_avail(ruleno);
874 if (avail < 0)
875 avail = 0;
876
877 return avail / ::pool_raw_used_rate(osd_map, poolid);
878 }
879
880 int64_t PGMap::get_rule_avail(const OSDMap& osdmap, int ruleno) const
881 {
882 map<int,float> wm;
883 int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm);
884 if (r < 0) {
885 return r;
886 }
887 if (wm.empty()) {
888 return 0;
889 }
890
891 float fratio;
892 if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
893 osdmap.get_full_ratio() > 0) {
894 fratio = osdmap.get_full_ratio();
895 } else {
896 fratio = get_fallback_full_ratio();
897 }
898
899 int64_t min = -1;
900 for (auto p = wm.begin(); p != wm.end(); ++p) {
901 auto osd_info = osd_stat.find(p->first);
902 if (osd_info != osd_stat.end()) {
903 if (osd_info->second.kb == 0 || p->second == 0) {
904 // osd must be out, hence its stats have been zeroed
905 // (unless we somehow managed to have a disk with size 0...)
906 //
907 // (p->second == 0), if osd weight is 0, no need to
908 // calculate proj below.
909 continue;
910 }
911 double unusable = (double)osd_info->second.kb *
912 (1.0 - fratio);
913 double avail = MAX(0.0, (double)osd_info->second.kb_avail - unusable);
914 avail *= 1024.0;
915 int64_t proj = (int64_t)(avail / (double)p->second);
916 if (min < 0 || proj < min) {
917 min = proj;
918 }
919 } else {
920 dout(0) << "Cannot get stat of OSD " << p->first << dendl;
921 }
922 }
923 return min;
924 }
925
926 void PGMap::get_rules_avail(const OSDMap& osdmap,
927 std::map<int,int64_t> *avail_map) const
928 {
929 avail_map->clear();
930 for (auto p : osdmap.get_pools()) {
931 int64_t pool_id = p.first;
932 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
933 continue;
934 const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
935 int ruleno = osdmap.crush->find_rule(pool->get_crush_rule(),
936 pool->get_type(),
937 pool->get_size());
938 if (avail_map->count(ruleno) == 0)
939 (*avail_map)[ruleno] = get_rule_avail(osdmap, ruleno);
940 }
941 }
942
943 // ---------------------
944 // PGMap
945
946 void PGMap::Incremental::encode(bufferlist &bl, uint64_t features) const
947 {
948 if ((features & CEPH_FEATURE_MONENC) == 0) {
949 __u8 v = 4;
950 ::encode(v, bl);
951 ::encode(version, bl);
952 ::encode(pg_stat_updates, bl);
953 ::encode(osd_stat_updates, bl);
954 ::encode(osd_stat_rm, bl);
955 ::encode(osdmap_epoch, bl);
956 ::encode(pg_scan, bl);
957 ::encode(full_ratio, bl);
958 ::encode(nearfull_ratio, bl);
959 ::encode(pg_remove, bl);
960 return;
961 }
962
963 ENCODE_START(7, 5, bl);
964 ::encode(version, bl);
965 ::encode(pg_stat_updates, bl);
966 ::encode(osd_stat_updates, bl);
967 ::encode(osd_stat_rm, bl);
968 ::encode(osdmap_epoch, bl);
969 ::encode(pg_scan, bl);
970 ::encode(full_ratio, bl);
971 ::encode(nearfull_ratio, bl);
972 ::encode(pg_remove, bl);
973 ::encode(stamp, bl);
974 ::encode(osd_epochs, bl);
975 ENCODE_FINISH(bl);
976 }
977
978 void PGMap::Incremental::decode(bufferlist::iterator &bl)
979 {
980 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
981 ::decode(version, bl);
982 if (struct_v < 3) {
983 pg_stat_updates.clear();
984 __u32 n;
985 ::decode(n, bl);
986 while (n--) {
987 old_pg_t opgid;
988 ::decode(opgid, bl);
989 pg_t pgid = opgid;
990 ::decode(pg_stat_updates[pgid], bl);
991 }
992 } else {
993 ::decode(pg_stat_updates, bl);
994 }
995 ::decode(osd_stat_updates, bl);
996 ::decode(osd_stat_rm, bl);
997 ::decode(osdmap_epoch, bl);
998 ::decode(pg_scan, bl);
999 if (struct_v >= 2) {
1000 ::decode(full_ratio, bl);
1001 ::decode(nearfull_ratio, bl);
1002 }
1003 if (struct_v < 3) {
1004 pg_remove.clear();
1005 __u32 n;
1006 ::decode(n, bl);
1007 while (n--) {
1008 old_pg_t opgid;
1009 ::decode(opgid, bl);
1010 pg_remove.insert(pg_t(opgid));
1011 }
1012 } else {
1013 ::decode(pg_remove, bl);
1014 }
1015 if (struct_v < 4 && full_ratio == 0) {
1016 full_ratio = -1;
1017 }
1018 if (struct_v < 4 && nearfull_ratio == 0) {
1019 nearfull_ratio = -1;
1020 }
1021 if (struct_v >= 6)
1022 ::decode(stamp, bl);
1023 if (struct_v >= 7) {
1024 ::decode(osd_epochs, bl);
1025 } else {
1026 for (auto i = osd_stat_updates.begin();
1027 i != osd_stat_updates.end();
1028 ++i) {
1029 // This isn't accurate, but will cause trimming to behave like
1030 // previously.
1031 osd_epochs.insert(make_pair(i->first, osdmap_epoch));
1032 }
1033 }
1034 DECODE_FINISH(bl);
1035 }
1036
1037 void PGMap::Incremental::dump(Formatter *f) const
1038 {
1039 f->dump_unsigned("version", version);
1040 f->dump_stream("stamp") << stamp;
1041 f->dump_unsigned("osdmap_epoch", osdmap_epoch);
1042 f->dump_unsigned("pg_scan_epoch", pg_scan);
1043 f->dump_float("full_ratio", full_ratio);
1044 f->dump_float("nearfull_ratio", nearfull_ratio);
1045
1046 f->open_array_section("pg_stat_updates");
1047 for (auto p = pg_stat_updates.begin(); p != pg_stat_updates.end(); ++p) {
1048 f->open_object_section("pg_stat");
1049 f->dump_stream("pgid") << p->first;
1050 p->second.dump(f);
1051 f->close_section();
1052 }
1053 f->close_section();
1054
1055 f->open_array_section("osd_stat_updates");
1056 for (auto p = osd_stat_updates.begin(); p != osd_stat_updates.end(); ++p) {
1057 f->open_object_section("osd_stat");
1058 f->dump_int("osd", p->first);
1059 p->second.dump(f);
1060 f->close_section();
1061 }
1062 f->close_section();
1063
1064 f->open_array_section("osd_stat_removals");
1065 for (auto p = osd_stat_rm.begin(); p != osd_stat_rm.end(); ++p)
1066 f->dump_int("osd", *p);
1067 f->close_section();
1068
1069 f->open_array_section("pg_removals");
1070 for (auto p = pg_remove.begin(); p != pg_remove.end(); ++p)
1071 f->dump_stream("pgid") << *p;
1072 f->close_section();
1073 }
1074
1075 void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
1076 {
1077 o.push_back(new Incremental);
1078 o.push_back(new Incremental);
1079 o.back()->version = 1;
1080 o.back()->stamp = utime_t(123,345);
1081 o.push_back(new Incremental);
1082 o.back()->version = 2;
1083 o.back()->pg_stat_updates[pg_t(1,2,3)] = pg_stat_t();
1084 o.back()->osd_stat_updates[5] = osd_stat_t();
1085 o.back()->osd_epochs[5] = 12;
1086 o.push_back(new Incremental);
1087 o.back()->version = 3;
1088 o.back()->osdmap_epoch = 1;
1089 o.back()->pg_scan = 2;
1090 o.back()->full_ratio = .2;
1091 o.back()->nearfull_ratio = .3;
1092 o.back()->pg_stat_updates[pg_t(4,5,6)] = pg_stat_t();
1093 o.back()->osd_stat_updates[6] = osd_stat_t();
1094 o.back()->osd_epochs[6] = 12;
1095 o.back()->pg_remove.insert(pg_t(1,2,3));
1096 o.back()->osd_stat_rm.insert(5);
1097 }
1098
1099
1100 // --
1101
1102 void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
1103 {
1104 assert(inc.version == version+1);
1105 version++;
1106
1107 pool_stat_t pg_sum_old = pg_sum;
1108 mempool::pgmap::unordered_map<uint64_t, pool_stat_t> pg_pool_sum_old;
1109
1110 bool ratios_changed = false;
1111 if (inc.full_ratio != full_ratio && inc.full_ratio != -1) {
1112 full_ratio = inc.full_ratio;
1113 ratios_changed = true;
1114 }
1115 if (inc.nearfull_ratio != nearfull_ratio && inc.nearfull_ratio != -1) {
1116 nearfull_ratio = inc.nearfull_ratio;
1117 ratios_changed = true;
1118 }
1119 if (ratios_changed)
1120 redo_full_sets();
1121
1122 for (auto p = inc.pg_stat_updates.begin();
1123 p != inc.pg_stat_updates.end();
1124 ++p) {
1125 const pg_t &update_pg(p->first);
1126 const pg_stat_t &update_stat(p->second);
1127
1128 if (pg_pool_sum_old.count(update_pg.pool()) == 0)
1129 pg_pool_sum_old[update_pg.pool()] = pg_pool_sum[update_pg.pool()];
1130
1131 auto t = pg_stat.find(update_pg);
1132 if (t == pg_stat.end()) {
1133 pg_stat.insert(make_pair(update_pg, update_stat));
1134 } else {
1135 stat_pg_sub(update_pg, t->second);
1136 t->second = update_stat;
1137 }
1138 stat_pg_add(update_pg, update_stat);
1139 }
1140 assert(osd_stat.size() == osd_epochs.size());
1141 for (auto p = inc.get_osd_stat_updates().begin();
1142 p != inc.get_osd_stat_updates().end();
1143 ++p) {
1144 int osd = p->first;
1145 const osd_stat_t &new_stats(p->second);
1146
1147 auto t = osd_stat.find(osd);
1148 if (t == osd_stat.end()) {
1149 osd_stat.insert(make_pair(osd, new_stats));
1150 } else {
1151 stat_osd_sub(t->first, t->second);
1152 t->second = new_stats;
1153 }
1154 auto i = osd_epochs.find(osd);
1155 auto j = inc.get_osd_epochs().find(osd);
1156 assert(j != inc.get_osd_epochs().end());
1157
1158 if (i == osd_epochs.end())
1159 osd_epochs.insert(*j);
1160 else
1161 i->second = j->second;
1162
1163 stat_osd_add(osd, new_stats);
1164
1165 // adjust [near]full status
1166 register_nearfull_status(osd, new_stats);
1167 }
1168 set<int64_t> deleted_pools;
1169 for (auto p = inc.pg_remove.begin();
1170 p != inc.pg_remove.end();
1171 ++p) {
1172 const pg_t &removed_pg(*p);
1173 auto s = pg_stat.find(removed_pg);
1174 if (s != pg_stat.end()) {
1175 stat_pg_sub(removed_pg, s->second);
1176 pg_stat.erase(s);
1177 }
1178 deleted_pools.insert(removed_pg.pool());
1179 }
1180
1181 for (auto p = inc.get_osd_stat_rm().begin();
1182 p != inc.get_osd_stat_rm().end();
1183 ++p) {
1184 auto t = osd_stat.find(*p);
1185 if (t != osd_stat.end()) {
1186 stat_osd_sub(t->first, t->second);
1187 osd_stat.erase(t);
1188 osd_epochs.erase(*p);
1189 }
1190
1191 // remove these old osds from full/nearfull set(s), too
1192 nearfull_osds.erase(*p);
1193 full_osds.erase(*p);
1194 }
1195
1196 // skip calculating delta while sum was not synchronized
1197 if (!stamp.is_zero() && !pg_sum_old.stats.sum.is_zero()) {
1198 utime_t delta_t;
1199 delta_t = inc.stamp;
1200 delta_t -= stamp;
1201 // calculate a delta, and average over the last 2 deltas.
1202 pool_stat_t d = pg_sum;
1203 d.stats.sub(pg_sum_old.stats);
1204 pg_sum_deltas.push_back(make_pair(d, delta_t));
1205 stamp_delta += delta_t;
1206 pg_sum_delta.stats.add(d.stats);
1207 auto smooth_intervals =
1208 cct ? cct->_conf->get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
1209 if (pg_sum_deltas.size() > smooth_intervals) {
1210 pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats);
1211 stamp_delta -= pg_sum_deltas.front().second;
1212 pg_sum_deltas.pop_front();
1213 }
1214 }
1215 stamp = inc.stamp;
1216
1217 update_pool_deltas(cct, inc.stamp, pg_pool_sum_old);
1218
1219 for (auto p : deleted_pools) {
1220 if (cct)
1221 dout(20) << " deleted pool " << p << dendl;
1222 deleted_pool(p);
1223 }
1224
1225 if (inc.osdmap_epoch)
1226 last_osdmap_epoch = inc.osdmap_epoch;
1227 if (inc.pg_scan)
1228 last_pg_scan = inc.pg_scan;
1229
1230 min_last_epoch_clean = 0; // invalidate
1231 }
1232
1233 void PGMap::redo_full_sets()
1234 {
1235 full_osds.clear();
1236 nearfull_osds.clear();
1237 for (auto i = osd_stat.begin();
1238 i != osd_stat.end();
1239 ++i) {
1240 register_nearfull_status(i->first, i->second);
1241 }
1242 }
1243
1244 void PGMap::register_nearfull_status(int osd, const osd_stat_t& s)
1245 {
1246 float ratio = ((float)s.kb_used) / ((float)s.kb);
1247
1248 if (full_ratio > 0 && ratio > full_ratio) {
1249 // full
1250 full_osds.insert(osd);
1251 nearfull_osds.erase(osd);
1252 } else if (nearfull_ratio > 0 && ratio > nearfull_ratio) {
1253 // nearfull
1254 full_osds.erase(osd);
1255 nearfull_osds.insert(osd);
1256 } else {
1257 // ok
1258 full_osds.erase(osd);
1259 nearfull_osds.erase(osd);
1260 }
1261 }
1262
1263 void PGMap::calc_stats()
1264 {
1265 num_pg = 0;
1266 num_pg_active = 0;
1267 num_pg_unknown = 0;
1268 num_osd = 0;
1269 pg_pool_sum.clear();
1270 num_pg_by_pool.clear();
1271 pg_by_osd.clear();
1272 pg_sum = pool_stat_t();
1273 osd_sum = osd_stat_t();
1274 num_pg_by_state.clear();
1275 num_pg_by_osd.clear();
1276
1277 for (auto p = pg_stat.begin();
1278 p != pg_stat.end();
1279 ++p) {
1280 stat_pg_add(p->first, p->second);
1281 }
1282 for (auto p = osd_stat.begin();
1283 p != osd_stat.end();
1284 ++p)
1285 stat_osd_add(p->first, p->second);
1286
1287 redo_full_sets();
1288
1289 min_last_epoch_clean = calc_min_last_epoch_clean();
1290 }
1291
1292 void PGMap::update_pg(pg_t pgid, bufferlist& bl)
1293 {
1294 bufferlist::iterator p = bl.begin();
1295 auto s = pg_stat.find(pgid);
1296 epoch_t old_lec = 0, lec;
1297 if (s != pg_stat.end()) {
1298 old_lec = s->second.get_effective_last_epoch_clean();
1299 stat_pg_update(pgid, s->second, p);
1300 lec = s->second.get_effective_last_epoch_clean();
1301 } else {
1302 pg_stat_t& r = pg_stat[pgid];
1303 ::decode(r, p);
1304 stat_pg_add(pgid, r);
1305 lec = r.get_effective_last_epoch_clean();
1306 }
1307
1308 if (min_last_epoch_clean &&
1309 (lec < min_last_epoch_clean || // we did
1310 (lec > min_last_epoch_clean && // we might
1311 old_lec == min_last_epoch_clean)
1312 ))
1313 min_last_epoch_clean = 0;
1314 }
1315
1316 void PGMap::remove_pg(pg_t pgid)
1317 {
1318 auto s = pg_stat.find(pgid);
1319 if (s != pg_stat.end()) {
1320 if (min_last_epoch_clean &&
1321 s->second.get_effective_last_epoch_clean() == min_last_epoch_clean)
1322 min_last_epoch_clean = 0;
1323 stat_pg_sub(pgid, s->second);
1324 pg_stat.erase(s);
1325 }
1326 }
1327
1328 void PGMap::update_osd(int osd, bufferlist& bl)
1329 {
1330 bufferlist::iterator p = bl.begin();
1331 auto o = osd_stat.find(osd);
1332 epoch_t old_lec = 0;
1333 if (o != osd_stat.end()) {
1334 auto i = osd_epochs.find(osd);
1335 if (i != osd_epochs.end())
1336 old_lec = i->second;
1337 stat_osd_sub(osd, o->second);
1338 }
1339 osd_stat_t& r = osd_stat[osd];
1340 ::decode(r, p);
1341 stat_osd_add(osd, r);
1342
1343 // adjust [near]full status
1344 register_nearfull_status(osd, r);
1345
1346 // epoch?
1347 if (!p.end()) {
1348 epoch_t e;
1349 ::decode(e, p);
1350
1351 if (e < min_last_epoch_clean ||
1352 (e > min_last_epoch_clean &&
1353 old_lec == min_last_epoch_clean))
1354 min_last_epoch_clean = 0;
1355 } else {
1356 // WARNING: we are not refreshing min_last_epoch_clean! must be old store
1357 // or old mon running.
1358 }
1359 }
1360
1361 void PGMap::remove_osd(int osd)
1362 {
1363 auto o = osd_stat.find(osd);
1364 if (o != osd_stat.end()) {
1365 stat_osd_sub(osd, o->second);
1366 osd_stat.erase(o);
1367
1368 // remove these old osds from full/nearfull set(s), too
1369 nearfull_osds.erase(osd);
1370 full_osds.erase(osd);
1371 }
1372 }
1373
1374 void PGMap::stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
1375 bool sameosds)
1376 {
1377 pg_pool_sum[pgid.pool()].add(s);
1378 pg_sum.add(s);
1379
1380 num_pg++;
1381 num_pg_by_state[s.state]++;
1382 num_pg_by_pool[pgid.pool()]++;
1383
1384 if ((s.state & PG_STATE_CREATING) &&
1385 s.parent_split_bits == 0) {
1386 creating_pgs.insert(pgid);
1387 if (s.acting_primary >= 0) {
1388 creating_pgs_by_osd_epoch[s.acting_primary][s.mapping_epoch].insert(pgid);
1389 }
1390 }
1391
1392 if (s.state & PG_STATE_ACTIVE) {
1393 ++num_pg_active;
1394 }
1395 if (s.state == 0) {
1396 ++num_pg_unknown;
1397 }
1398
1399 if (sameosds)
1400 return;
1401
1402 for (auto p = s.blocked_by.begin();
1403 p != s.blocked_by.end();
1404 ++p) {
1405 ++blocked_by_sum[*p];
1406 }
1407
1408 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
1409 pg_by_osd[*p].insert(pgid);
1410 num_pg_by_osd[*p].acting++;
1411 }
1412 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
1413 pg_by_osd[*p].insert(pgid);
1414 num_pg_by_osd[*p].up++;
1415 }
1416
1417 if (s.up_primary >= 0) {
1418 num_pg_by_osd[s.up_primary].primary++;
1419 }
1420 }
1421
1422 void PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
1423 bool sameosds)
1424 {
1425 pool_stat_t& ps = pg_pool_sum[pgid.pool()];
1426 ps.sub(s);
1427 pg_sum.sub(s);
1428
1429 num_pg--;
1430 int end = --num_pg_by_state[s.state];
1431 assert(end >= 0);
1432 if (end == 0)
1433 num_pg_by_state.erase(s.state);
1434 end = --num_pg_by_pool[pgid.pool()];
1435 if (end == 0) {
1436 num_pg_by_pool.erase(pgid.pool());
1437 pg_pool_sum.erase(pgid.pool());
1438 }
1439
1440 if ((s.state & PG_STATE_CREATING) &&
1441 s.parent_split_bits == 0) {
1442 creating_pgs.erase(pgid);
1443 if (s.acting_primary >= 0) {
1444 map<epoch_t,set<pg_t> >& r = creating_pgs_by_osd_epoch[s.acting_primary];
1445 r[s.mapping_epoch].erase(pgid);
1446 if (r[s.mapping_epoch].empty())
1447 r.erase(s.mapping_epoch);
1448 if (r.empty())
1449 creating_pgs_by_osd_epoch.erase(s.acting_primary);
1450 }
1451 }
1452
1453 if (s.state & PG_STATE_ACTIVE) {
1454 --num_pg_active;
1455 }
1456 if (s.state == 0) {
1457 --num_pg_unknown;
1458 }
1459
1460 if (sameosds)
1461 return;
1462
1463 for (auto p = s.blocked_by.begin();
1464 p != s.blocked_by.end();
1465 ++p) {
1466 auto q = blocked_by_sum.find(*p);
1467 assert(q != blocked_by_sum.end());
1468 --q->second;
1469 if (q->second == 0)
1470 blocked_by_sum.erase(q);
1471 }
1472
1473 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
1474 auto& oset = pg_by_osd[*p];
1475 oset.erase(pgid);
1476 if (oset.empty())
1477 pg_by_osd.erase(*p);
1478 auto it = num_pg_by_osd.find(*p);
1479 if (it != num_pg_by_osd.end() && it->second.acting > 0)
1480 it->second.acting--;
1481 }
1482 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
1483 auto& oset = pg_by_osd[*p];
1484 oset.erase(pgid);
1485 if (oset.empty())
1486 pg_by_osd.erase(*p);
1487 auto it = num_pg_by_osd.find(*p);
1488 if (it != num_pg_by_osd.end() && it->second.up > 0)
1489 it->second.up--;
1490 }
1491
1492 if (s.up_primary >= 0) {
1493 auto it = num_pg_by_osd.find(s.up_primary);
1494 if (it != num_pg_by_osd.end() && it->second.primary > 0)
1495 it->second.primary--;
1496 }
1497 }
1498
1499 void PGMap::stat_pg_update(const pg_t pgid, pg_stat_t& s,
1500 bufferlist::iterator& blp)
1501 {
1502 pg_stat_t n;
1503 ::decode(n, blp);
1504
1505 bool sameosds =
1506 s.acting == n.acting &&
1507 s.up == n.up &&
1508 s.blocked_by == n.blocked_by;
1509
1510 stat_pg_sub(pgid, s, sameosds);
1511
1512 // if acting_primary has shift to an just restored osd, and pg yet to finish
1513 // peering, many attributes in current stats remain stale. others seem don't
1514 // mater much while faulty last_active will make "pg stuck in" check unhappy.
1515 if (!(n.state & (PG_STATE_ACTIVE | PG_STATE_PEERED)) &&
1516 n.last_active < s.last_active)
1517 n.last_active = s.last_active;
1518 s = n;
1519 stat_pg_add(pgid, n, sameosds);
1520 }
1521
1522 void PGMap::stat_osd_add(int osd, const osd_stat_t &s)
1523 {
1524 num_osd++;
1525 osd_sum.add(s);
1526 if (osd >= (int)osd_last_seq.size()) {
1527 osd_last_seq.resize(osd + 1);
1528 }
1529 osd_last_seq[osd] = s.seq;
1530 }
1531
1532 void PGMap::stat_osd_sub(int osd, const osd_stat_t &s)
1533 {
1534 num_osd--;
1535 osd_sum.sub(s);
1536 assert(osd < (int)osd_last_seq.size());
1537 osd_last_seq[osd] = 0;
1538 }
1539
1540 epoch_t PGMap::calc_min_last_epoch_clean() const
1541 {
1542 if (pg_stat.empty())
1543 return 0;
1544
1545 auto p = pg_stat.begin();
1546 epoch_t min = p->second.get_effective_last_epoch_clean();
1547 for (++p; p != pg_stat.end(); ++p) {
1548 epoch_t lec = p->second.get_effective_last_epoch_clean();
1549 if (lec < min)
1550 min = lec;
1551 }
1552 // also scan osd epochs
1553 // don't trim past the oldest reported osd epoch
1554 for (auto i = osd_epochs.begin();
1555 i != osd_epochs.end();
1556 ++i) {
1557 if (i->second < min)
1558 min = i->second;
1559 }
1560 return min;
1561 }
1562
1563 void PGMap::encode_digest(const OSDMap& osdmap,
1564 bufferlist& bl, uint64_t features) const
1565 {
1566 get_rules_avail(osdmap, &avail_space_by_rule);
1567 PGMapDigest::encode(bl, features);
1568 }
1569
1570 void PGMap::encode(bufferlist &bl, uint64_t features) const
1571 {
1572 if ((features & CEPH_FEATURE_MONENC) == 0) {
1573 __u8 v = 3;
1574 ::encode(v, bl);
1575 ::encode(version, bl);
1576 ::encode(pg_stat, bl);
1577 ::encode(osd_stat, bl);
1578 ::encode(last_osdmap_epoch, bl);
1579 ::encode(last_pg_scan, bl);
1580 ::encode(full_ratio, bl);
1581 ::encode(nearfull_ratio, bl);
1582 return;
1583 }
1584
1585 ENCODE_START(6, 4, bl);
1586 ::encode(version, bl);
1587 ::encode(pg_stat, bl);
1588 ::encode(osd_stat, bl);
1589 ::encode(last_osdmap_epoch, bl);
1590 ::encode(last_pg_scan, bl);
1591 ::encode(full_ratio, bl);
1592 ::encode(nearfull_ratio, bl);
1593 ::encode(stamp, bl);
1594 ::encode(osd_epochs, bl);
1595 ENCODE_FINISH(bl);
1596 }
1597
1598 void PGMap::decode(bufferlist::iterator &bl)
1599 {
1600 DECODE_START_LEGACY_COMPAT_LEN(6, 4, 4, bl);
1601 ::decode(version, bl);
1602 if (struct_v < 3) {
1603 pg_stat.clear();
1604 __u32 n;
1605 ::decode(n, bl);
1606 while (n--) {
1607 old_pg_t opgid;
1608 ::decode(opgid, bl);
1609 pg_t pgid = opgid;
1610 ::decode(pg_stat[pgid], bl);
1611 }
1612 } else {
1613 ::decode(pg_stat, bl);
1614 }
1615 ::decode(osd_stat, bl);
1616 ::decode(last_osdmap_epoch, bl);
1617 ::decode(last_pg_scan, bl);
1618 if (struct_v >= 2) {
1619 ::decode(full_ratio, bl);
1620 ::decode(nearfull_ratio, bl);
1621 }
1622 if (struct_v >= 5)
1623 ::decode(stamp, bl);
1624 if (struct_v >= 6) {
1625 ::decode(osd_epochs, bl);
1626 } else {
1627 for (auto i = osd_stat.begin();
1628 i != osd_stat.end();
1629 ++i) {
1630 // This isn't accurate, but will cause trimming to behave like
1631 // previously.
1632 osd_epochs.insert(make_pair(i->first, last_osdmap_epoch));
1633 }
1634 }
1635 DECODE_FINISH(bl);
1636
1637 calc_stats();
1638 }
1639
1640 void PGMap::dirty_all(Incremental& inc)
1641 {
1642 inc.osdmap_epoch = last_osdmap_epoch;
1643 inc.pg_scan = last_pg_scan;
1644 inc.full_ratio = full_ratio;
1645 inc.nearfull_ratio = nearfull_ratio;
1646
1647 for (auto p = pg_stat.begin(); p != pg_stat.end(); ++p) {
1648 inc.pg_stat_updates[p->first] = p->second;
1649 }
1650 for (auto p = osd_stat.begin(); p != osd_stat.end(); ++p) {
1651 assert(osd_epochs.count(p->first));
1652 inc.update_stat(p->first,
1653 inc.get_osd_epochs().find(p->first)->second,
1654 p->second);
1655 }
1656 }
1657
1658 void PGMap::dump(Formatter *f) const
1659 {
1660 dump_basic(f);
1661 dump_pg_stats(f, false);
1662 dump_pool_stats(f);
1663 dump_osd_stats(f);
1664 }
1665
1666 void PGMap::dump_basic(Formatter *f) const
1667 {
1668 f->dump_unsigned("version", version);
1669 f->dump_stream("stamp") << stamp;
1670 f->dump_unsigned("last_osdmap_epoch", last_osdmap_epoch);
1671 f->dump_unsigned("last_pg_scan", last_pg_scan);
1672 f->dump_unsigned("min_last_epoch_clean", min_last_epoch_clean);
1673 f->dump_float("full_ratio", full_ratio);
1674 f->dump_float("near_full_ratio", nearfull_ratio);
1675
1676 f->open_object_section("pg_stats_sum");
1677 pg_sum.dump(f);
1678 f->close_section();
1679
1680 f->open_object_section("osd_stats_sum");
1681 osd_sum.dump(f);
1682 f->close_section();
1683
1684 f->open_array_section("osd_epochs");
1685 for (auto p = osd_epochs.begin(); p != osd_epochs.end(); ++p) {
1686 f->open_object_section("osd");
1687 f->dump_unsigned("osd", p->first);
1688 f->dump_unsigned("epoch", p->second);
1689 f->close_section();
1690 }
1691 f->close_section();
1692
1693 dump_delta(f);
1694 }
1695
1696 void PGMap::dump_delta(Formatter *f) const
1697 {
1698 f->open_object_section("pg_stats_delta");
1699 pg_sum_delta.dump(f);
1700 f->close_section();
1701 }
1702
1703 void PGMap::dump_pg_stats(Formatter *f, bool brief) const
1704 {
1705 f->open_array_section("pg_stats");
1706 for (auto i = pg_stat.begin();
1707 i != pg_stat.end();
1708 ++i) {
1709 f->open_object_section("pg_stat");
1710 f->dump_stream("pgid") << i->first;
1711 if (brief)
1712 i->second.dump_brief(f);
1713 else
1714 i->second.dump(f);
1715 f->close_section();
1716 }
1717 f->close_section();
1718 }
1719
1720 void PGMap::dump_pool_stats(Formatter *f) const
1721 {
1722 f->open_array_section("pool_stats");
1723 for (auto p = pg_pool_sum.begin();
1724 p != pg_pool_sum.end();
1725 ++p) {
1726 f->open_object_section("pool_stat");
1727 f->dump_int("poolid", p->first);
1728 auto q = num_pg_by_pool.find(p->first);
1729 if (q != num_pg_by_pool.end())
1730 f->dump_unsigned("num_pg", q->second);
1731 p->second.dump(f);
1732 f->close_section();
1733 }
1734 f->close_section();
1735 }
1736
1737 void PGMap::dump_osd_stats(Formatter *f) const
1738 {
1739 f->open_array_section("osd_stats");
1740 for (auto q = osd_stat.begin();
1741 q != osd_stat.end();
1742 ++q) {
1743 f->open_object_section("osd_stat");
1744 f->dump_int("osd", q->first);
1745 q->second.dump(f);
1746 f->close_section();
1747 }
1748 f->close_section();
1749 }
1750
1751 void PGMap::dump_pg_stats_plain(
1752 ostream& ss,
1753 const mempool::pgmap::unordered_map<pg_t, pg_stat_t>& pg_stats,
1754 bool brief) const
1755 {
1756 TextTable tab;
1757
1758 if (brief){
1759 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1760 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1761 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1762 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1763 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1764 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1765 }
1766 else {
1767 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1768 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1769 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1770 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1771 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1772 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1773 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1774 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1775 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1776 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1777 tab.define_column("STATE_STAMP", TextTable::LEFT, TextTable::RIGHT);
1778 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
1779 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
1780 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1781 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1782 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1783 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1784 tab.define_column("LAST_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1785 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1786 tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1787 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1788 tab.define_column("SNAPTRIMQ_LEN", TextTable::LEFT, TextTable::RIGHT);
1789 }
1790
1791 for (auto i = pg_stats.begin();
1792 i != pg_stats.end(); ++i) {
1793 const pg_stat_t &st(i->second);
1794 if (brief) {
1795 tab << i->first
1796 << pg_state_string(st.state)
1797 << st.up
1798 << st.up_primary
1799 << st.acting
1800 << st.acting_primary
1801 << TextTable::endrow;
1802 } else {
1803 ostringstream reported;
1804 reported << st.reported_epoch << ":" << st.reported_seq;
1805
1806 tab << i->first
1807 << st.stats.sum.num_objects
1808 << st.stats.sum.num_objects_missing_on_primary
1809 << st.stats.sum.num_objects_degraded
1810 << st.stats.sum.num_objects_misplaced
1811 << st.stats.sum.num_objects_unfound
1812 << st.stats.sum.num_bytes
1813 << st.log_size
1814 << st.ondisk_log_size
1815 << pg_state_string(st.state)
1816 << st.last_change
1817 << st.version
1818 << reported.str()
1819 << pg_vector_string(st.up)
1820 << st.up_primary
1821 << pg_vector_string(st.acting)
1822 << st.acting_primary
1823 << st.last_scrub
1824 << st.last_scrub_stamp
1825 << st.last_deep_scrub
1826 << st.last_deep_scrub_stamp
1827 << st.snaptrimq_len
1828 << TextTable::endrow;
1829 }
1830 }
1831
1832 ss << tab;
1833 }
1834
1835 void PGMap::dump(ostream& ss) const
1836 {
1837 dump_basic(ss);
1838 dump_pg_stats(ss, false);
1839 dump_pool_stats(ss, false);
1840 dump_pg_sum_stats(ss, false);
1841 dump_osd_stats(ss);
1842 }
1843
1844 void PGMap::dump_basic(ostream& ss) const
1845 {
1846 ss << "version " << version << std::endl;
1847 ss << "stamp " << stamp << std::endl;
1848 ss << "last_osdmap_epoch " << last_osdmap_epoch << std::endl;
1849 ss << "last_pg_scan " << last_pg_scan << std::endl;
1850 ss << "full_ratio " << full_ratio << std::endl;
1851 ss << "nearfull_ratio " << nearfull_ratio << std::endl;
1852 }
1853
1854 void PGMap::dump_pg_stats(ostream& ss, bool brief) const
1855 {
1856 dump_pg_stats_plain(ss, pg_stat, brief);
1857 }
1858
1859 void PGMap::dump_pool_stats(ostream& ss, bool header) const
1860 {
1861 TextTable tab;
1862
1863 if (header) {
1864 tab.define_column("POOLID", TextTable::LEFT, TextTable::LEFT);
1865 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1866 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1867 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1868 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1869 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1870 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1871 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1872 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1873 } else {
1874 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1875 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1876 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1877 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1878 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1879 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1880 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1881 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1882 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1883 }
1884
1885 for (auto p = pg_pool_sum.begin();
1886 p != pg_pool_sum.end();
1887 ++p) {
1888 tab << p->first
1889 << p->second.stats.sum.num_objects
1890 << p->second.stats.sum.num_objects_missing_on_primary
1891 << p->second.stats.sum.num_objects_degraded
1892 << p->second.stats.sum.num_objects_misplaced
1893 << p->second.stats.sum.num_objects_unfound
1894 << p->second.stats.sum.num_bytes
1895 << p->second.log_size
1896 << p->second.ondisk_log_size
1897 << TextTable::endrow;
1898 }
1899
1900 ss << tab;
1901 }
1902
1903 void PGMap::dump_pg_sum_stats(ostream& ss, bool header) const
1904 {
1905 TextTable tab;
1906
1907 if (header) {
1908 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1909 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1910 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1911 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1912 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1913 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1914 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1915 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1916 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1917 } else {
1918 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1919 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1920 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1921 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1922 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1923 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1924 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1925 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1926 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1927 };
1928
1929 tab << "sum"
1930 << pg_sum.stats.sum.num_objects
1931 << pg_sum.stats.sum.num_objects_missing_on_primary
1932 << pg_sum.stats.sum.num_objects_degraded
1933 << pg_sum.stats.sum.num_objects_misplaced
1934 << pg_sum.stats.sum.num_objects_unfound
1935 << pg_sum.stats.sum.num_bytes
1936 << pg_sum.log_size
1937 << pg_sum.ondisk_log_size
1938 << TextTable::endrow;
1939
1940 ss << tab;
1941 }
1942
1943 void PGMap::dump_osd_stats(ostream& ss) const
1944 {
1945 TextTable tab;
1946
1947 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1948 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1949 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
1950 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
1951 tab.define_column("HB_PEERS", TextTable::LEFT, TextTable::RIGHT);
1952 tab.define_column("PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1953 tab.define_column("PRIMARY_PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1954
1955 for (auto p = osd_stat.begin();
1956 p != osd_stat.end();
1957 ++p) {
1958 tab << p->first
1959 << si_t(p->second.kb_used << 10)
1960 << si_t(p->second.kb_avail << 10)
1961 << si_t(p->second.kb << 10)
1962 << p->second.hb_peers
1963 << get_num_pg_by_osd(p->first)
1964 << get_num_primary_pg_by_osd(p->first)
1965 << TextTable::endrow;
1966 }
1967
1968 tab << "sum"
1969 << si_t(osd_sum.kb_used << 10)
1970 << si_t(osd_sum.kb_avail << 10)
1971 << si_t(osd_sum.kb << 10)
1972 << TextTable::endrow;
1973
1974 ss << tab;
1975 }
1976
1977 void PGMap::dump_osd_sum_stats(ostream& ss) const
1978 {
1979 TextTable tab;
1980
1981 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1982 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1983 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
1984 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
1985
1986 tab << "sum"
1987 << si_t(osd_sum.kb_used << 10)
1988 << si_t(osd_sum.kb_avail << 10)
1989 << si_t(osd_sum.kb << 10)
1990 << TextTable::endrow;
1991
1992 ss << tab;
1993 }
1994
1995 void PGMap::get_stuck_stats(
1996 int types, const utime_t cutoff,
1997 mempool::pgmap::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const
1998 {
1999 assert(types != 0);
2000 for (auto i = pg_stat.begin();
2001 i != pg_stat.end();
2002 ++i) {
2003 utime_t val = cutoff; // don't care about >= cutoff so that is infinity
2004
2005 if ((types & STUCK_INACTIVE) && !(i->second.state & PG_STATE_ACTIVE)) {
2006 if (i->second.last_active < val)
2007 val = i->second.last_active;
2008 }
2009
2010 if ((types & STUCK_UNCLEAN) && !(i->second.state & PG_STATE_CLEAN)) {
2011 if (i->second.last_clean < val)
2012 val = i->second.last_clean;
2013 }
2014
2015 if ((types & STUCK_DEGRADED) && (i->second.state & PG_STATE_DEGRADED)) {
2016 if (i->second.last_undegraded < val)
2017 val = i->second.last_undegraded;
2018 }
2019
2020 if ((types & STUCK_UNDERSIZED) && (i->second.state & PG_STATE_UNDERSIZED)) {
2021 if (i->second.last_fullsized < val)
2022 val = i->second.last_fullsized;
2023 }
2024
2025 if ((types & STUCK_STALE) && (i->second.state & PG_STATE_STALE)) {
2026 if (i->second.last_unstale < val)
2027 val = i->second.last_unstale;
2028 }
2029
2030 // val is now the earliest any of the requested stuck states began
2031 if (val < cutoff) {
2032 stuck_pgs[i->first] = i->second;
2033 }
2034 }
2035 }
2036
2037 bool PGMap::get_stuck_counts(const utime_t cutoff, map<string, int>& note) const
2038 {
2039 int inactive = 0;
2040 int unclean = 0;
2041 int degraded = 0;
2042 int undersized = 0;
2043 int stale = 0;
2044
2045 for (auto i = pg_stat.begin();
2046 i != pg_stat.end();
2047 ++i) {
2048 if (! (i->second.state & PG_STATE_ACTIVE)) {
2049 if (i->second.last_active < cutoff)
2050 ++inactive;
2051 }
2052 if (! (i->second.state & PG_STATE_CLEAN)) {
2053 if (i->second.last_clean < cutoff)
2054 ++unclean;
2055 }
2056 if (i->second.state & PG_STATE_DEGRADED) {
2057 if (i->second.last_undegraded < cutoff)
2058 ++degraded;
2059 }
2060 if (i->second.state & PG_STATE_UNDERSIZED) {
2061 if (i->second.last_fullsized < cutoff)
2062 ++undersized;
2063 }
2064 if (i->second.state & PG_STATE_STALE) {
2065 if (i->second.last_unstale < cutoff)
2066 ++stale;
2067 }
2068 }
2069
2070 if (inactive)
2071 note["stuck inactive"] = inactive;
2072
2073 if (unclean)
2074 note["stuck unclean"] = unclean;
2075
2076 if (undersized)
2077 note["stuck undersized"] = undersized;
2078
2079 if (degraded)
2080 note["stuck degraded"] = degraded;
2081
2082 if (stale)
2083 note["stuck stale"] = stale;
2084
2085 return inactive || unclean || undersized || degraded || stale;
2086 }
2087
2088 void PGMap::dump_stuck(Formatter *f, int types, utime_t cutoff) const
2089 {
2090 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
2091 get_stuck_stats(types, cutoff, stuck_pg_stats);
2092 f->open_array_section("stuck_pg_stats");
2093 for (auto i = stuck_pg_stats.begin();
2094 i != stuck_pg_stats.end();
2095 ++i) {
2096 f->open_object_section("pg_stat");
2097 f->dump_stream("pgid") << i->first;
2098 i->second.dump(f);
2099 f->close_section();
2100 }
2101 f->close_section();
2102 }
2103
2104 void PGMap::dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const
2105 {
2106 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
2107 get_stuck_stats(types, cutoff, stuck_pg_stats);
2108 if (!stuck_pg_stats.empty())
2109 dump_pg_stats_plain(ss, stuck_pg_stats, true);
2110 }
2111
2112 int PGMap::dump_stuck_pg_stats(
2113 stringstream &ds,
2114 Formatter *f,
2115 int threshold,
2116 vector<string>& args) const
2117 {
2118 int stuck_types = 0;
2119
2120 for (auto i = args.begin(); i != args.end(); ++i) {
2121 if (*i == "inactive")
2122 stuck_types |= PGMap::STUCK_INACTIVE;
2123 else if (*i == "unclean")
2124 stuck_types |= PGMap::STUCK_UNCLEAN;
2125 else if (*i == "undersized")
2126 stuck_types |= PGMap::STUCK_UNDERSIZED;
2127 else if (*i == "degraded")
2128 stuck_types |= PGMap::STUCK_DEGRADED;
2129 else if (*i == "stale")
2130 stuck_types |= PGMap::STUCK_STALE;
2131 else {
2132 ds << "Unknown type: " << *i << std::endl;
2133 return -EINVAL;
2134 }
2135 }
2136
2137 utime_t now(ceph_clock_now());
2138 utime_t cutoff = now - utime_t(threshold, 0);
2139
2140 if (!f) {
2141 dump_stuck_plain(ds, stuck_types, cutoff);
2142 } else {
2143 dump_stuck(f, stuck_types, cutoff);
2144 f->flush(ds);
2145 }
2146
2147 return 0;
2148 }
2149
2150 void PGMap::dump_osd_perf_stats(Formatter *f) const
2151 {
2152 f->open_array_section("osd_perf_infos");
2153 for (auto i = osd_stat.begin();
2154 i != osd_stat.end();
2155 ++i) {
2156 f->open_object_section("osd");
2157 f->dump_int("id", i->first);
2158 {
2159 f->open_object_section("perf_stats");
2160 i->second.os_perf_stat.dump(f);
2161 f->close_section();
2162 }
2163 f->close_section();
2164 }
2165 f->close_section();
2166 }
2167 void PGMap::print_osd_perf_stats(std::ostream *ss) const
2168 {
2169 TextTable tab;
2170 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2171 tab.define_column("commit_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2172 tab.define_column("apply_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2173 for (auto i = osd_stat.begin();
2174 i != osd_stat.end();
2175 ++i) {
2176 tab << i->first;
2177 tab << i->second.os_perf_stat.os_commit_latency;
2178 tab << i->second.os_perf_stat.os_apply_latency;
2179 tab << TextTable::endrow;
2180 }
2181 (*ss) << tab;
2182 }
2183
2184 void PGMap::dump_osd_blocked_by_stats(Formatter *f) const
2185 {
2186 f->open_array_section("osd_blocked_by_infos");
2187 for (auto i = blocked_by_sum.begin();
2188 i != blocked_by_sum.end();
2189 ++i) {
2190 f->open_object_section("osd");
2191 f->dump_int("id", i->first);
2192 f->dump_int("num_blocked", i->second);
2193 f->close_section();
2194 }
2195 f->close_section();
2196 }
2197 void PGMap::print_osd_blocked_by_stats(std::ostream *ss) const
2198 {
2199 TextTable tab;
2200 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2201 tab.define_column("num_blocked", TextTable::LEFT, TextTable::RIGHT);
2202 for (auto i = blocked_by_sum.begin();
2203 i != blocked_by_sum.end();
2204 ++i) {
2205 tab << i->first;
2206 tab << i->second;
2207 tab << TextTable::endrow;
2208 }
2209 (*ss) << tab;
2210 }
2211
2212
2213 /**
2214 * update aggregated delta
2215 *
2216 * @param cct ceph context
2217 * @param ts Timestamp for the stats being delta'ed
2218 * @param old_pool_sum Previous stats sum
2219 * @param last_ts Last timestamp for pool
2220 * @param result_pool_sum Resulting stats
2221 * @param result_pool_delta Resulting pool delta
2222 * @param result_ts_delta Resulting timestamp delta
2223 * @param delta_avg_list List of last N computed deltas, used to average
2224 */
2225 void PGMap::update_delta(
2226 CephContext *cct,
2227 const utime_t ts,
2228 const pool_stat_t& old_pool_sum,
2229 utime_t *last_ts,
2230 const pool_stat_t& current_pool_sum,
2231 pool_stat_t *result_pool_delta,
2232 utime_t *result_ts_delta,
2233 mempool::pgmap::list<pair<pool_stat_t,utime_t> > *delta_avg_list)
2234 {
2235 /* @p ts is the timestamp we want to associate with the data
2236 * in @p old_pool_sum, and on which we will base ourselves to
2237 * calculate the delta, stored in 'delta_t'.
2238 */
2239 utime_t delta_t;
2240 delta_t = ts; // start with the provided timestamp
2241 delta_t -= *last_ts; // take the last timestamp we saw
2242 *last_ts = ts; // @p ts becomes the last timestamp we saw
2243
2244 // adjust delta_t, quick start if there is no update in a long period
2245 delta_t = std::min(delta_t,
2246 utime_t(2 * (cct ? cct->_conf->mon_delta_reset_interval : 10), 0));
2247
2248 // calculate a delta, and average over the last 6 deltas by default.
2249 /* start by taking a copy of our current @p result_pool_sum, and by
2250 * taking out the stats from @p old_pool_sum. This generates a stats
2251 * delta. Stash this stats delta in @p delta_avg_list, along with the
2252 * timestamp delta for these results.
2253 */
2254 pool_stat_t d = current_pool_sum;
2255 d.stats.sub(old_pool_sum.stats);
2256
2257 /* Aggregate current delta, and take out the last seen delta (if any) to
2258 * average it out.
2259 * Skip calculating delta while sum was not synchronized.
2260 */
2261 if(!old_pool_sum.stats.sum.is_zero()) {
2262 delta_avg_list->push_back(make_pair(d,delta_t));
2263 *result_ts_delta += delta_t;
2264 result_pool_delta->stats.add(d.stats);
2265 }
2266 size_t s = cct ? cct->_conf->get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
2267 if (delta_avg_list->size() > s) {
2268 result_pool_delta->stats.sub(delta_avg_list->front().first.stats);
2269 *result_ts_delta -= delta_avg_list->front().second;
2270 delta_avg_list->pop_front();
2271 }
2272 }
2273
2274 /**
2275 * update aggregated delta
2276 *
2277 * @param cct ceph context
2278 * @param ts Timestamp
2279 * @param pg_sum_old Old pg_sum
2280 */
2281 void PGMap::update_global_delta(CephContext *cct,
2282 const utime_t ts, const pool_stat_t& pg_sum_old)
2283 {
2284 update_delta(cct, ts, pg_sum_old, &stamp, pg_sum, &pg_sum_delta,
2285 &stamp_delta, &pg_sum_deltas);
2286 }
2287
2288 /**
2289 * Update a given pool's deltas
2290 *
2291 * @param cct Ceph Context
2292 * @param ts Timestamp for the stats being delta'ed
2293 * @param pool Pool's id
2294 * @param old_pool_sum Previous stats sum
2295 */
2296 void PGMap::update_one_pool_delta(
2297 CephContext *cct,
2298 const utime_t ts,
2299 const uint64_t pool,
2300 const pool_stat_t& old_pool_sum)
2301 {
2302 if (per_pool_sum_deltas.count(pool) == 0) {
2303 assert(per_pool_sum_deltas_stamps.count(pool) == 0);
2304 assert(per_pool_sum_delta.count(pool) == 0);
2305 }
2306
2307 auto& sum_delta = per_pool_sum_delta[pool];
2308
2309 update_delta(cct, ts, old_pool_sum, &sum_delta.second, pg_pool_sum[pool],
2310 &sum_delta.first, &per_pool_sum_deltas_stamps[pool],
2311 &per_pool_sum_deltas[pool]);
2312 }
2313
2314 /**
2315 * Update pools' deltas
2316 *
2317 * @param cct CephContext
2318 * @param ts Timestamp for the stats being delta'ed
2319 * @param pg_pool_sum_old Map of pool stats for delta calcs.
2320 */
2321 void PGMap::update_pool_deltas(
2322 CephContext *cct, const utime_t ts,
2323 const mempool::pgmap::unordered_map<uint64_t,pool_stat_t>& pg_pool_sum_old)
2324 {
2325 for (auto it = pg_pool_sum_old.begin();
2326 it != pg_pool_sum_old.end(); ++it) {
2327 update_one_pool_delta(cct, ts, it->first, it->second);
2328 }
2329 }
2330
2331 void PGMap::clear_delta()
2332 {
2333 pg_sum_delta = pool_stat_t();
2334 pg_sum_deltas.clear();
2335 stamp_delta = utime_t();
2336 }
2337
2338 void PGMap::generate_test_instances(list<PGMap*>& o)
2339 {
2340 o.push_back(new PGMap);
2341 list<Incremental*> inc;
2342 Incremental::generate_test_instances(inc);
2343 delete inc.front();
2344 inc.pop_front();
2345 while (!inc.empty()) {
2346 PGMap *pmp = new PGMap();
2347 *pmp = *o.back();
2348 o.push_back(pmp);
2349 o.back()->apply_incremental(NULL, *inc.front());
2350 delete inc.front();
2351 inc.pop_front();
2352 }
2353 }
2354
2355 void PGMap::get_filtered_pg_stats(uint32_t state, int64_t poolid, int64_t osdid,
2356 bool primary, set<pg_t>& pgs) const
2357 {
2358 for (auto i = pg_stat.begin();
2359 i != pg_stat.end();
2360 ++i) {
2361 if ((poolid >= 0) && (uint64_t(poolid) != i->first.pool()))
2362 continue;
2363 if ((osdid >= 0) && !(i->second.is_acting_osd(osdid,primary)))
2364 continue;
2365 if (!(i->second.state & state))
2366 continue;
2367 pgs.insert(i->first);
2368 }
2369 }
2370
2371 void PGMap::dump_filtered_pg_stats(Formatter *f, set<pg_t>& pgs) const
2372 {
2373 f->open_array_section("pg_stats");
2374 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
2375 const pg_stat_t& st = pg_stat.at(*i);
2376 f->open_object_section("pg_stat");
2377 f->dump_stream("pgid") << *i;
2378 st.dump(f);
2379 f->close_section();
2380 }
2381 f->close_section();
2382 }
2383
2384 void PGMap::dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const
2385 {
2386 TextTable tab;
2387
2388 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
2389 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
2390 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
2391 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
2392 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
2393 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
2394 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
2395 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
2396 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
2397 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
2398 tab.define_column("STATE_STAMP", TextTable::LEFT, TextTable::RIGHT);
2399 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
2400 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
2401 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
2402 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
2403 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
2404 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
2405 tab.define_column("LAST_SCRUB", TextTable::LEFT, TextTable::RIGHT);
2406 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
2407 tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT);
2408 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
2409
2410 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
2411 const pg_stat_t& st = pg_stat.at(*i);
2412
2413 ostringstream reported;
2414 reported << st.reported_epoch << ":" << st.reported_seq;
2415
2416 tab << *i
2417 << st.stats.sum.num_objects
2418 << st.stats.sum.num_objects_missing_on_primary
2419 << st.stats.sum.num_objects_degraded
2420 << st.stats.sum.num_objects_misplaced
2421 << st.stats.sum.num_objects_unfound
2422 << st.stats.sum.num_bytes
2423 << st.log_size
2424 << st.ondisk_log_size
2425 << pg_state_string(st.state)
2426 << st.last_change
2427 << st.version
2428 << reported.str()
2429 << st.up
2430 << st.up_primary
2431 << st.acting
2432 << st.acting_primary
2433 << st.last_scrub
2434 << st.last_scrub_stamp
2435 << st.last_deep_scrub
2436 << st.last_deep_scrub_stamp
2437 << TextTable::endrow;
2438 }
2439
2440 ss << tab;
2441 }
2442
2443
2444
2445 // Only called with a single bit set in "what"
2446 static void note_stuck_detail(
2447 int what,
2448 mempool::pgmap::unordered_map<pg_t,pg_stat_t>& stuck_pgs,
2449 int max_detail,
2450 list<pair<health_status_t,string> > *detail)
2451 {
2452 int n = 0;
2453 for (auto p = stuck_pgs.begin();
2454 p != stuck_pgs.end();
2455 ++p) {
2456 ostringstream ss;
2457 utime_t since;
2458 const char *whatname = 0;
2459 switch (what) {
2460 case PGMap::STUCK_INACTIVE:
2461 since = p->second.last_active;
2462 whatname = "inactive";
2463 break;
2464 case PGMap::STUCK_UNCLEAN:
2465 since = p->second.last_clean;
2466 whatname = "unclean";
2467 break;
2468 case PGMap::STUCK_DEGRADED:
2469 since = p->second.last_undegraded;
2470 whatname = "degraded";
2471 break;
2472 case PGMap::STUCK_UNDERSIZED:
2473 since = p->second.last_fullsized;
2474 whatname = "undersized";
2475 break;
2476 case PGMap::STUCK_STALE:
2477 since = p->second.last_unstale;
2478 whatname = "stale";
2479 break;
2480 default:
2481 ceph_abort();
2482 }
2483 if (--max_detail == 0) {
2484 ostringstream ss;
2485 ss << (stuck_pgs.size() - n) << " more pgs are also stuck " << whatname;
2486 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
2487 break;
2488 }
2489 ++n;
2490 ss << "pg " << p->first << " is stuck " << whatname;
2491 if (since == utime_t()) {
2492 ss << " since forever";
2493 } else {
2494 utime_t dur = ceph_clock_now() - since;
2495 ss << " for " << dur;
2496 }
2497 ss << ", current state " << pg_state_string(p->second.state)
2498 << ", last acting " << p->second.acting;
2499 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
2500 }
2501 }
2502
2503 static pair<int,int> _warn_slow_request_histogram(
2504 CephContext *cct,
2505 const pow2_hist_t& h,
2506 string suffix,
2507 list<pair<health_status_t,string> >& summary,
2508 list<pair<health_status_t,string> > *detail)
2509 {
2510 if (h.h.empty())
2511 return make_pair(0, 0);
2512
2513 unsigned warn = 0, error = 0;
2514 float err_age =
2515 cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio;
2516 for (unsigned i = h.h.size() - 1; i > 0; --i) {
2517 float ub = (float)(1 << i) / 1000.0;
2518 if (ub < cct->_conf->mon_osd_warn_op_age)
2519 break;
2520 if (h.h[i]) {
2521 auto sev = HEALTH_WARN;
2522 if (ub > err_age) {
2523 sev = HEALTH_ERR;
2524 error += h.h[i];
2525 } else {
2526 warn += h.h[i];
2527 }
2528 if (detail) {
2529 ostringstream ss;
2530 ss << h.h[i] << " ops are blocked > " << ub << " sec" << suffix;
2531 detail->push_back(make_pair(sev, ss.str()));
2532 }
2533 }
2534 }
2535 return make_pair(warn, error);
2536 }
2537
2538 namespace {
2539 enum class scrubbed_or_deepscrubbed_t { SCRUBBED, DEEPSCRUBBED };
2540
2541 void print_unscrubbed_detailed(
2542 const std::pair<const pg_t,pg_stat_t> &pg_entry,
2543 list<pair<health_status_t,string> > *detail,
2544 scrubbed_or_deepscrubbed_t how_scrubbed)
2545 {
2546 std::stringstream ss;
2547 const auto& pg_stat(pg_entry.second);
2548
2549 ss << "pg " << pg_entry.first << " is not ";
2550 if (how_scrubbed == scrubbed_or_deepscrubbed_t::SCRUBBED) {
2551 ss << "scrubbed, last_scrub_stamp "
2552 << pg_stat.last_scrub_stamp;
2553 } else if (how_scrubbed == scrubbed_or_deepscrubbed_t::DEEPSCRUBBED) {
2554 ss << "deep-scrubbed, last_deep_scrub_stamp "
2555 << pg_stat.last_deep_scrub_stamp;
2556 }
2557
2558 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
2559 }
2560
2561 using pg_stat_map_t = const mempool::pgmap::unordered_map<pg_t,pg_stat_t>;
2562
2563 void print_unscrubbed_pgs(
2564 pg_stat_map_t& pg_stats,
2565 list<pair<health_status_t,string> > &summary,
2566 list<pair<health_status_t,string> > *detail,
2567 const CephContext* cct)
2568 {
2569 if (cct->_conf->mon_warn_not_scrubbed == 0 &&
2570 cct->_conf->mon_warn_not_deep_scrubbed == 0)
2571 return;
2572
2573 int pgs_count = 0;
2574 const utime_t now = ceph_clock_now();
2575 for (const auto& pg_entry : pg_stats) {
2576 const auto& pg_stat(pg_entry.second);
2577 const utime_t time_since_ls = now - pg_stat.last_scrub_stamp;
2578 const utime_t time_since_lds = now - pg_stat.last_deep_scrub_stamp;
2579
2580 const int mon_warn_not_scrubbed =
2581 cct->_conf->mon_warn_not_scrubbed + cct->_conf->mon_scrub_interval;
2582
2583 const int mon_warn_not_deep_scrubbed =
2584 cct->_conf->mon_warn_not_deep_scrubbed + cct->_conf->osd_deep_scrub_interval;
2585
2586 bool not_scrubbed = (time_since_ls >= mon_warn_not_scrubbed &&
2587 cct->_conf->mon_warn_not_scrubbed != 0);
2588
2589 bool not_deep_scrubbed = (time_since_lds >= mon_warn_not_deep_scrubbed &&
2590 cct->_conf->mon_warn_not_deep_scrubbed != 0);
2591
2592 if (detail != nullptr) {
2593 if (not_scrubbed) {
2594 print_unscrubbed_detailed(pg_entry,
2595 detail,
2596 scrubbed_or_deepscrubbed_t::SCRUBBED);
2597 }
2598 if (not_deep_scrubbed) {
2599 print_unscrubbed_detailed(pg_entry,
2600 detail,
2601 scrubbed_or_deepscrubbed_t::DEEPSCRUBBED);
2602 }
2603 }
2604 if (not_scrubbed || not_deep_scrubbed) {
2605 ++pgs_count;
2606 }
2607 }
2608
2609 if (pgs_count > 0) {
2610 std::stringstream ss;
2611 ss << pgs_count << " unscrubbed pgs";
2612 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
2613 }
2614 }
2615 }
2616
2617 void PGMap::get_health_checks(
2618 CephContext *cct,
2619 const OSDMap& osdmap,
2620 health_check_map_t *checks) const
2621 {
2622 utime_t now = ceph_clock_now();
2623 const auto max = cct->_conf->get_val<uint64_t>("mon_health_max_detail");
2624 const auto& pools = osdmap.get_pools();
2625
2626 typedef enum pg_consequence_t {
2627 UNAVAILABLE = 1, // Client IO to the pool may block
2628 DEGRADED = 2, // Fewer than the requested number of replicas are present
2629 DEGRADED_FULL = 3, // Fewer than the request number of replicas may be present
2630 // and insufficient resources are present to fix this
2631 DAMAGED = 4 // The data may be missing or inconsistent on disk and
2632 // requires repair
2633 } pg_consequence_t;
2634
2635 // For a given PG state, how should it be reported at the pool level?
2636 class PgStateResponse {
2637 public:
2638 pg_consequence_t consequence;
2639 typedef std::function< utime_t(const pg_stat_t&) > stuck_cb;
2640 stuck_cb stuck_since;
2641 bool invert;
2642
2643 PgStateResponse(const pg_consequence_t &c, stuck_cb s)
2644 : consequence(c), stuck_since(s), invert(false)
2645 {
2646 }
2647
2648 PgStateResponse(const pg_consequence_t &c, stuck_cb s, bool i)
2649 : consequence(c), stuck_since(s), invert(i)
2650 {
2651 }
2652 };
2653
2654 // Record the PG state counts that contributed to a reported pool state
2655 class PgCauses {
2656 public:
2657 // Map of PG_STATE_* to number of pgs in that state.
2658 std::map<unsigned, unsigned> states;
2659
2660 // List of all PG IDs that had a state contributing
2661 // to this health condition.
2662 std::set<pg_t> pgs;
2663
2664 std::map<pg_t, std::string> pg_messages;
2665 };
2666
2667 // Map of PG state to how to respond to it
2668 std::map<unsigned, PgStateResponse> state_to_response = {
2669 // Immediate reports
2670 { PG_STATE_INCONSISTENT, {DAMAGED, {}} },
2671 { PG_STATE_INCOMPLETE, {UNAVAILABLE, {}} },
2672 { PG_STATE_REPAIR, {DAMAGED, {}} },
2673 { PG_STATE_SNAPTRIM_ERROR, {DAMAGED, {}} },
2674 { PG_STATE_RECOVERY_UNFOUND, {DAMAGED, {}} },
2675 { PG_STATE_BACKFILL_UNFOUND, {DAMAGED, {}} },
2676 { PG_STATE_BACKFILL_TOOFULL, {DEGRADED_FULL, {}} },
2677 { PG_STATE_RECOVERY_TOOFULL, {DEGRADED_FULL, {}} },
2678 { PG_STATE_DEGRADED, {DEGRADED, {}} },
2679 { PG_STATE_DOWN, {UNAVAILABLE, {}} },
2680 // Delayed (wait until stuck) reports
2681 { PG_STATE_PEERING, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_peered;} } },
2682 { PG_STATE_UNDERSIZED, {DEGRADED, [](const pg_stat_t &p){return p.last_fullsized;} } },
2683 { PG_STATE_STALE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_unstale;} } },
2684 // Delayed and inverted reports
2685 { PG_STATE_ACTIVE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_active;}, true} }
2686 };
2687
2688 // Specialized state printer that takes account of inversion of
2689 // ACTIVE, CLEAN checks.
2690 auto state_name = [](const uint32_t &state) {
2691 // Special cases for the states that are inverted checks
2692 if (state == PG_STATE_CLEAN) {
2693 return std::string("unclean");
2694 } else if (state == PG_STATE_ACTIVE) {
2695 return std::string("inactive");
2696 } else {
2697 return pg_state_string(state);
2698 }
2699 };
2700
2701 // Map of what is wrong to information about why, implicitly also stores
2702 // the list of what is wrong.
2703 std::map<pg_consequence_t, PgCauses> detected;
2704
2705 // Optimisation: trim down the number of checks to apply based on
2706 // the summary counters
2707 std::map<unsigned, PgStateResponse> possible_responses;
2708 for (const auto &i : num_pg_by_state) {
2709 for (const auto &j : state_to_response) {
2710 if (!j.second.invert) {
2711 // Check for normal tests by seeing if any pgs have the flag
2712 if (i.first & j.first) {
2713 possible_responses.insert(j);
2714 }
2715 }
2716 }
2717 }
2718
2719 for (const auto &j : state_to_response) {
2720 if (j.second.invert) {
2721 // Check for inverted tests by seeing if not-all pgs have the flag
2722 const auto &found = num_pg_by_state.find(j.first);
2723 if (found == num_pg_by_state.end() || found->second != num_pg) {
2724 possible_responses.insert(j);
2725 }
2726 }
2727 }
2728
2729 utime_t cutoff = now - utime_t(cct->_conf->get_val<int64_t>("mon_pg_stuck_threshold"), 0);
2730 // Loop over all PGs, if there are any possibly-unhealthy states in there
2731 if (!possible_responses.empty()) {
2732 for (const auto& i : pg_stat) {
2733 const auto &pg_id = i.first;
2734 const auto &pg_info = i.second;
2735
2736 for (const auto &j : state_to_response) {
2737 const auto &pg_response_state = j.first;
2738 const auto &pg_response = j.second;
2739
2740 // Apply the state test
2741 if (!(bool(pg_info.state & pg_response_state) != pg_response.invert)) {
2742 continue;
2743 }
2744
2745 // Apply stuckness test if needed
2746 if (pg_response.stuck_since) {
2747 // Delayed response, check for stuckness
2748 utime_t last_whatever = pg_response.stuck_since(pg_info);
2749 if (last_whatever >= cutoff) {
2750 // Not stuck enough, ignore.
2751 continue;
2752 } else {
2753
2754 }
2755 }
2756
2757 auto &causes = detected[pg_response.consequence];
2758 causes.states[pg_response_state]++;
2759 causes.pgs.insert(pg_id);
2760
2761 // Don't bother composing detail string if we have already recorded
2762 // too many
2763 if (causes.pg_messages.size() > max) {
2764 continue;
2765 }
2766
2767 std::ostringstream ss;
2768 if (pg_response.stuck_since) {
2769 utime_t since = pg_response.stuck_since(pg_info);
2770 ss << "pg " << pg_id << " is stuck " << state_name(pg_response_state);
2771 if (since == utime_t()) {
2772 ss << " since forever";
2773 } else {
2774 utime_t dur = now - since;
2775 ss << " for " << dur;
2776 }
2777 ss << ", current state " << pg_state_string(pg_info.state)
2778 << ", last acting " << pg_info.acting;
2779 } else {
2780 ss << "pg " << pg_id << " is "
2781 << pg_state_string(pg_info.state);
2782 ss << ", acting " << pg_info.acting;
2783 if (pg_info.stats.sum.num_objects_unfound) {
2784 ss << ", " << pg_info.stats.sum.num_objects_unfound
2785 << " unfound";
2786 }
2787 }
2788
2789 if (pg_info.state & PG_STATE_INCOMPLETE) {
2790 const pg_pool_t *pi = osdmap.get_pg_pool(pg_id.pool());
2791 if (pi && pi->min_size > 1) {
2792 ss << " (reducing pool "
2793 << osdmap.get_pool_name(pg_id.pool())
2794 << " min_size from " << (int)pi->min_size
2795 << " may help; search ceph.com/docs for 'incomplete')";
2796 }
2797 }
2798
2799 causes.pg_messages[pg_id] = ss.str();
2800 }
2801 }
2802 } else {
2803 dout(10) << __func__ << " skipping loop over PGs: counters look OK" << dendl;
2804 }
2805
2806 for (const auto &i : detected) {
2807 std::string health_code;
2808 health_status_t sev;
2809 std::string summary;
2810 switch(i.first) {
2811 case UNAVAILABLE:
2812 health_code = "PG_AVAILABILITY";
2813 sev = HEALTH_WARN;
2814 summary = "Reduced data availability: ";
2815 break;
2816 case DEGRADED:
2817 health_code = "PG_DEGRADED";
2818 summary = "Degraded data redundancy: ";
2819 sev = HEALTH_WARN;
2820 break;
2821 case DEGRADED_FULL:
2822 health_code = "PG_DEGRADED_FULL";
2823 summary = "Degraded data redundancy (low space): ";
2824 sev = HEALTH_ERR;
2825 break;
2826 case DAMAGED:
2827 health_code = "PG_DAMAGED";
2828 summary = "Possible data damage: ";
2829 sev = HEALTH_ERR;
2830 break;
2831 default:
2832 assert(false);
2833 }
2834
2835 if (i.first == DEGRADED) {
2836 if (pg_sum.stats.sum.num_objects_degraded &&
2837 pg_sum.stats.sum.num_object_copies > 0) {
2838 double pc = (double)pg_sum.stats.sum.num_objects_degraded /
2839 (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
2840 char b[20];
2841 snprintf(b, sizeof(b), "%.3lf", pc);
2842 ostringstream ss;
2843 ss << pg_sum.stats.sum.num_objects_degraded
2844 << "/" << pg_sum.stats.sum.num_object_copies << " objects degraded ("
2845 << b << "%)";
2846
2847 // Throw in a comma for the benefit of the following PG counts
2848 summary += ss.str() + ", ";
2849 }
2850 }
2851
2852 // Compose summary message saying how many PGs in what states led
2853 // to this health check failing
2854 std::vector<std::string> pg_msgs;
2855 for (const auto &j : i.second.states) {
2856 std::ostringstream msg;
2857 msg << j.second << (j.second > 1 ? " pgs " : " pg ") << state_name(j.first);
2858 pg_msgs.push_back(msg.str());
2859 }
2860 summary += joinify(pg_msgs.begin(), pg_msgs.end(), std::string(", "));
2861
2862
2863
2864 health_check_t *check = &checks->add(
2865 health_code,
2866 sev,
2867 summary);
2868
2869 // Compose list of PGs contributing to this health check failing
2870 for (const auto &j : i.second.pg_messages) {
2871 check->detail.push_back(j.second);
2872 }
2873 }
2874
2875 // OSD_SCRUB_ERRORS
2876 if (pg_sum.stats.sum.num_scrub_errors) {
2877 ostringstream ss;
2878 ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors";
2879 checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str());
2880 }
2881
2882 // CACHE_POOL_NEAR_FULL
2883 {
2884 list<string> detail;
2885 unsigned num_pools = 0;
2886 for (auto& p : pools) {
2887 if ((!p.second.target_max_objects && !p.second.target_max_bytes) ||
2888 !pg_pool_sum.count(p.first)) {
2889 continue;
2890 }
2891 bool nearfull = false;
2892 const string& name = osdmap.get_pool_name(p.first);
2893 const pool_stat_t& st = get_pg_pool_sum_stat(p.first);
2894 uint64_t ratio = p.second.cache_target_full_ratio_micro +
2895 ((1000000 - p.second.cache_target_full_ratio_micro) *
2896 cct->_conf->mon_cache_target_full_warn_ratio);
2897 if (p.second.target_max_objects &&
2898 (uint64_t)(st.stats.sum.num_objects -
2899 st.stats.sum.num_objects_hit_set_archive) >
2900 p.second.target_max_objects * (ratio / 1000000.0)) {
2901 ostringstream ss;
2902 ss << "cache pool '" << name << "' with "
2903 << si_t(st.stats.sum.num_objects)
2904 << " objects at/near target max "
2905 << si_t(p.second.target_max_objects) << " objects";
2906 detail.push_back(ss.str());
2907 nearfull = true;
2908 }
2909 if (p.second.target_max_bytes &&
2910 (uint64_t)(st.stats.sum.num_bytes -
2911 st.stats.sum.num_bytes_hit_set_archive) >
2912 p.second.target_max_bytes * (ratio / 1000000.0)) {
2913 ostringstream ss;
2914 ss << "cache pool '" << name
2915 << "' with " << si_t(st.stats.sum.num_bytes)
2916 << "B at/near target max "
2917 << si_t(p.second.target_max_bytes) << "B";
2918 detail.push_back(ss.str());
2919 nearfull = true;
2920 }
2921 if (nearfull) {
2922 ++num_pools;
2923 }
2924 }
2925 if (!detail.empty()) {
2926 ostringstream ss;
2927 ss << num_pools << " cache pools at or near target size";
2928 auto& d = checks->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN, ss.str());
2929 d.detail.swap(detail);
2930 }
2931 }
2932
2933 // TOO_FEW_PGS
2934 unsigned num_in = osdmap.get_num_in_osds();
2935 auto sum_pg_up = std::max(static_cast<size_t>(pg_sum.up), pg_stat.size());
2936 const auto min_pg_per_osd =
2937 cct->_conf->get_val<uint64_t>("mon_pg_warn_min_per_osd");
2938 if (num_in && min_pg_per_osd > 0 && osdmap.get_pools().size() > 0) {
2939 auto per = sum_pg_up / num_in;
2940 if (per < min_pg_per_osd && per) {
2941 ostringstream ss;
2942 ss << "too few PGs per OSD (" << per
2943 << " < min " << min_pg_per_osd << ")";
2944 checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str());
2945 }
2946 }
2947
2948 // TOO_MANY_PGS
2949 auto max_pg_per_osd = cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd");
2950 if (num_in && max_pg_per_osd > 0) {
2951 auto per = sum_pg_up / num_in;
2952 if (per > max_pg_per_osd) {
2953 ostringstream ss;
2954 ss << "too many PGs per OSD (" << per
2955 << " > max " << max_pg_per_osd << ")";
2956 checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str());
2957 }
2958 }
2959
2960 // SMALLER_PGP_NUM
2961 // MANY_OBJECTS_PER_PG
2962 if (!pg_stat.empty()) {
2963 list<string> pgp_detail, many_detail;
2964 const auto mon_pg_warn_min_objects =
2965 cct->_conf->get_val<int64_t>("mon_pg_warn_min_objects");
2966 const auto mon_pg_warn_min_pool_objects =
2967 cct->_conf->get_val<int64_t>("mon_pg_warn_min_pool_objects");
2968 const auto mon_pg_warn_max_object_skew =
2969 cct->_conf->get_val<double>("mon_pg_warn_max_object_skew");
2970 for (auto p = pg_pool_sum.begin();
2971 p != pg_pool_sum.end();
2972 ++p) {
2973 const pg_pool_t *pi = osdmap.get_pg_pool(p->first);
2974 if (!pi)
2975 continue; // in case osdmap changes haven't propagated to PGMap yet
2976 const string& name = osdmap.get_pool_name(p->first);
2977 if (pi->get_pg_num() > pi->get_pgp_num() &&
2978 !(name.find(".DELETED") != string::npos &&
2979 cct->_conf->mon_fake_pool_delete)) {
2980 ostringstream ss;
2981 ss << "pool " << name << " pg_num "
2982 << pi->get_pg_num() << " > pgp_num " << pi->get_pgp_num();
2983 pgp_detail.push_back(ss.str());
2984 }
2985 int average_objects_per_pg = pg_sum.stats.sum.num_objects / pg_stat.size();
2986 if (average_objects_per_pg > 0 &&
2987 pg_sum.stats.sum.num_objects >= mon_pg_warn_min_objects &&
2988 p->second.stats.sum.num_objects >= mon_pg_warn_min_pool_objects) {
2989 int objects_per_pg = p->second.stats.sum.num_objects / pi->get_pg_num();
2990 float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
2991 if (mon_pg_warn_max_object_skew > 0 &&
2992 ratio > mon_pg_warn_max_object_skew) {
2993 ostringstream ss;
2994 ss << "pool " << name << " objects per pg ("
2995 << objects_per_pg << ") is more than " << ratio
2996 << " times cluster average ("
2997 << average_objects_per_pg << ")";
2998 many_detail.push_back(ss.str());
2999 }
3000 }
3001 }
3002 if (!pgp_detail.empty()) {
3003 ostringstream ss;
3004 ss << pgp_detail.size() << " pools have pg_num > pgp_num";
3005 auto& d = checks->add("SMALLER_PGP_NUM", HEALTH_WARN, ss.str());
3006 d.detail.swap(pgp_detail);
3007 }
3008 if (!many_detail.empty()) {
3009 ostringstream ss;
3010 ss << many_detail.size() << " pools have many more objects per pg than"
3011 << " average";
3012 auto& d = checks->add("MANY_OBJECTS_PER_PG", HEALTH_WARN, ss.str());
3013 d.detail.swap(many_detail);
3014 }
3015 }
3016
3017 // POOL_FULL
3018 // POOL_NEAR_FULL
3019 {
3020 float warn_threshold = (float)g_conf->get_val<int64_t>("mon_pool_quota_warn_threshold")/100;
3021 float crit_threshold = (float)g_conf->get_val<int64_t>("mon_pool_quota_crit_threshold")/100;
3022 list<string> full_detail, nearfull_detail;
3023 unsigned full_pools = 0, nearfull_pools = 0;
3024 for (auto it : pools) {
3025 auto it2 = pg_pool_sum.find(it.first);
3026 if (it2 == pg_pool_sum.end()) {
3027 continue;
3028 }
3029 const pool_stat_t *pstat = &it2->second;
3030 const object_stat_sum_t& sum = pstat->stats.sum;
3031 const string& pool_name = osdmap.get_pool_name(it.first);
3032 const pg_pool_t &pool = it.second;
3033 bool full = false, nearfull = false;
3034 if (pool.quota_max_objects > 0) {
3035 stringstream ss;
3036 if ((uint64_t)sum.num_objects >= pool.quota_max_objects) {
3037 } else if (crit_threshold > 0 &&
3038 sum.num_objects >= pool.quota_max_objects*crit_threshold) {
3039 ss << "pool '" << pool_name
3040 << "' has " << sum.num_objects << " objects"
3041 << " (max " << pool.quota_max_objects << ")";
3042 full_detail.push_back(ss.str());
3043 full = true;
3044 } else if (warn_threshold > 0 &&
3045 sum.num_objects >= pool.quota_max_objects*warn_threshold) {
3046 ss << "pool '" << pool_name
3047 << "' has " << sum.num_objects << " objects"
3048 << " (max " << pool.quota_max_objects << ")";
3049 nearfull_detail.push_back(ss.str());
3050 nearfull = true;
3051 }
3052 }
3053 if (pool.quota_max_bytes > 0) {
3054 stringstream ss;
3055 if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
3056 } else if (crit_threshold > 0 &&
3057 sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
3058 ss << "pool '" << pool_name
3059 << "' has " << si_t(sum.num_bytes) << " bytes"
3060 << " (max " << si_t(pool.quota_max_bytes) << ")";
3061 full_detail.push_back(ss.str());
3062 full = true;
3063 } else if (warn_threshold > 0 &&
3064 sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
3065 ss << "pool '" << pool_name
3066 << "' has " << si_t(sum.num_bytes) << " bytes"
3067 << " (max " << si_t(pool.quota_max_bytes) << ")";
3068 nearfull_detail.push_back(ss.str());
3069 nearfull = true;
3070 }
3071 }
3072 if (full) {
3073 ++full_pools;
3074 }
3075 if (nearfull) {
3076 ++nearfull_pools;
3077 }
3078 }
3079 if (full_pools) {
3080 ostringstream ss;
3081 ss << full_pools << " pools full";
3082 auto& d = checks->add("POOL_FULL", HEALTH_ERR, ss.str());
3083 d.detail.swap(full_detail);
3084 }
3085 if (nearfull_pools) {
3086 ostringstream ss;
3087 ss << nearfull_pools << " pools full";
3088 auto& d = checks->add("POOL_NEAR_FULL", HEALTH_WARN, ss.str());
3089 d.detail.swap(nearfull_detail);
3090 }
3091 }
3092
3093 // OBJECT_MISPLACED
3094 if (pg_sum.stats.sum.num_objects_misplaced &&
3095 pg_sum.stats.sum.num_object_copies > 0) {
3096 double pc = (double)pg_sum.stats.sum.num_objects_misplaced /
3097 (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
3098 char b[20];
3099 snprintf(b, sizeof(b), "%.3lf", pc);
3100 ostringstream ss;
3101 ss << pg_sum.stats.sum.num_objects_misplaced
3102 << "/" << pg_sum.stats.sum.num_object_copies << " objects misplaced ("
3103 << b << "%)";
3104 checks->add("OBJECT_MISPLACED", HEALTH_WARN, ss.str());
3105 }
3106
3107 // OBJECT_UNFOUND
3108 if (pg_sum.stats.sum.num_objects_unfound &&
3109 pg_sum.stats.sum.num_objects) {
3110 double pc = (double)pg_sum.stats.sum.num_objects_unfound /
3111 (double)pg_sum.stats.sum.num_objects * (double)100.0;
3112 char b[20];
3113 snprintf(b, sizeof(b), "%.3lf", pc);
3114 ostringstream ss;
3115 ss << pg_sum.stats.sum.num_objects_unfound
3116 << "/" << pg_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
3117 auto& d = checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str());
3118
3119 for (auto& p : pg_stat) {
3120 if (p.second.stats.sum.num_objects_unfound) {
3121 ostringstream ss;
3122 ss << "pg " << p.first
3123 << " has " << p.second.stats.sum.num_objects_unfound
3124 << " unfound objects";
3125 d.detail.push_back(ss.str());
3126 if (d.detail.size() > max) {
3127 d.detail.push_back("(additional pgs left out for brevity)");
3128 break;
3129 }
3130 }
3131 }
3132 }
3133
3134 // REQUEST_SLOW
3135 // REQUEST_STUCK
3136 if (cct->_conf->mon_osd_warn_op_age > 0 &&
3137 !osd_sum.op_queue_age_hist.h.empty() &&
3138 osd_sum.op_queue_age_hist.upper_bound() / 1000.0 >
3139 cct->_conf->mon_osd_warn_op_age) {
3140 list<string> warn_detail, error_detail;
3141 unsigned warn = 0, error = 0;
3142 float err_age =
3143 cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio;
3144 const pow2_hist_t& h = osd_sum.op_queue_age_hist;
3145 for (unsigned i = h.h.size() - 1; i > 0; --i) {
3146 float ub = (float)(1 << i) / 1000.0;
3147 if (ub < cct->_conf->mon_osd_warn_op_age)
3148 break;
3149 if (h.h[i]) {
3150 ostringstream ss;
3151 ss << h.h[i] << " ops are blocked > " << ub << " sec";
3152 if (ub > err_age) {
3153 error += h.h[i];
3154 error_detail.push_back(ss.str());
3155 } else {
3156 warn += h.h[i];
3157 warn_detail.push_back(ss.str());
3158 }
3159 }
3160 }
3161
3162 map<float,set<int>> warn_osd_by_max; // max -> osds
3163 map<float,set<int>> error_osd_by_max; // max -> osds
3164 if (!warn_detail.empty() || !error_detail.empty()) {
3165 for (auto& p : osd_stat) {
3166 const pow2_hist_t& h = p.second.op_queue_age_hist;
3167 for (unsigned i = h.h.size() - 1; i > 0; --i) {
3168 float ub = (float)(1 << i) / 1000.0;
3169 if (ub < cct->_conf->mon_osd_warn_op_age)
3170 break;
3171 if (h.h[i]) {
3172 if (ub > err_age) {
3173 error_osd_by_max[ub].insert(p.first);
3174 } else {
3175 warn_osd_by_max[ub].insert(p.first);
3176 }
3177 break;
3178 }
3179 }
3180 }
3181 }
3182
3183 if (!warn_detail.empty()) {
3184 ostringstream ss;
3185 ss << warn << " slow requests are blocked > "
3186 << cct->_conf->mon_osd_warn_op_age << " sec";
3187 auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str());
3188 d.detail.swap(warn_detail);
3189 int left = max;
3190 for (auto& p : warn_osd_by_max) {
3191 ostringstream ss;
3192 if (p.second.size() > 1) {
3193 ss << "osds " << p.second
3194 << " have blocked requests > " << p.first << " sec";
3195 } else {
3196 ss << "osd." << *p.second.begin()
3197 << " has blocked requests > " << p.first << " sec";
3198 }
3199 d.detail.push_back(ss.str());
3200 if (--left == 0) {
3201 break;
3202 }
3203 }
3204 }
3205 if (!error_detail.empty()) {
3206 ostringstream ss;
3207 ss << error << " stuck requests are blocked > "
3208 << err_age << " sec";
3209 auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str());
3210 d.detail.swap(error_detail);
3211 int left = max;
3212 for (auto& p : error_osd_by_max) {
3213 ostringstream ss;
3214 if (p.second.size() > 1) {
3215 ss << "osds " << p.second
3216 << " have stuck requests > " << p.first << " sec";
3217 } else {
3218 ss << "osd." << *p.second.begin()
3219 << " has stuck requests > " << p.first << " sec";
3220 }
3221 d.detail.push_back(ss.str());
3222 if (--left == 0) {
3223 break;
3224 }
3225 }
3226 }
3227 }
3228
3229 // PG_NOT_SCRUBBED
3230 // PG_NOT_DEEP_SCRUBBED
3231 {
3232 if (cct->_conf->mon_warn_not_scrubbed ||
3233 cct->_conf->mon_warn_not_deep_scrubbed) {
3234 list<string> detail, deep_detail;
3235 const double age = cct->_conf->mon_warn_not_scrubbed +
3236 cct->_conf->mon_scrub_interval;
3237 utime_t cutoff = now;
3238 cutoff -= age;
3239 const double deep_age = cct->_conf->mon_warn_not_deep_scrubbed +
3240 cct->_conf->osd_deep_scrub_interval;
3241 utime_t deep_cutoff = now;
3242 deep_cutoff -= deep_age;
3243 for (auto& p : pg_stat) {
3244 if (cct->_conf->mon_warn_not_scrubbed &&
3245 p.second.last_scrub_stamp < cutoff) {
3246 ostringstream ss;
3247 ss << "pg " << p.first << " not scrubbed since "
3248 << p.second.last_scrub_stamp;
3249 detail.push_back(ss.str());
3250 }
3251 if (cct->_conf->mon_warn_not_deep_scrubbed &&
3252 p.second.last_deep_scrub_stamp < deep_cutoff) {
3253 ostringstream ss;
3254 ss << "pg " << p.first << " not deep-scrubbed since "
3255 << p.second.last_deep_scrub_stamp;
3256 deep_detail.push_back(ss.str());
3257 }
3258 }
3259 if (!detail.empty()) {
3260 ostringstream ss;
3261 ss << detail.size() << " pgs not scrubbed for " << age;
3262 auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str());
3263 d.detail.swap(detail);
3264 }
3265 if (!deep_detail.empty()) {
3266 ostringstream ss;
3267 ss << deep_detail.size() << " pgs not deep-scrubbed for " << deep_age;
3268 auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str());
3269 d.detail.swap(deep_detail);
3270 }
3271 }
3272 }
3273
3274 // POOL_APP
3275 if (g_conf->get_val<bool>("mon_warn_on_pool_no_app")) {
3276 list<string> detail;
3277 for (auto &it : pools) {
3278 const pg_pool_t &pool = it.second;
3279 const string& pool_name = osdmap.get_pool_name(it.first);
3280 auto it2 = pg_pool_sum.find(it.first);
3281 if (it2 == pg_pool_sum.end()) {
3282 continue;
3283 }
3284 const pool_stat_t *pstat = &it2->second;
3285 if (pstat == nullptr) {
3286 continue;
3287 }
3288 const object_stat_sum_t& sum = pstat->stats.sum;
3289 // application metadata is not encoded until luminous is minimum
3290 // required release
3291 if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
3292 sum.num_objects > 0 && pool.application_metadata.empty() &&
3293 !pool.is_tier() && !g_conf->mon_debug_no_require_luminous) {
3294 stringstream ss;
3295 ss << "application not enabled on pool '" << pool_name << "'";
3296 detail.push_back(ss.str());
3297 }
3298 }
3299 if (!detail.empty()) {
3300 ostringstream ss;
3301 ss << "application not enabled on " << detail.size() << " pool(s)";
3302 auto& d = checks->add("POOL_APP_NOT_ENABLED", HEALTH_WARN, ss.str());
3303 stringstream tip;
3304 tip << "use 'ceph osd pool application enable <pool-name> "
3305 << "<app-name>', where <app-name> is 'cephfs', 'rbd', 'rgw', "
3306 << "or freeform for custom applications.";
3307 detail.push_back(tip.str());
3308 d.detail.swap(detail);
3309 }
3310 }
3311
3312 // PG_SLOW_SNAP_TRIMMING
3313 if (!pg_stat.empty() && cct->_conf->mon_osd_snap_trim_queue_warn_on > 0) {
3314 uint32_t snapthreshold = cct->_conf->mon_osd_snap_trim_queue_warn_on;
3315 uint64_t snaptrimq_exceeded = 0;
3316 uint32_t longest_queue = 0;
3317 const pg_t* longest_q_pg = nullptr;
3318 list<string> detail;
3319
3320 for (auto& i: pg_stat) {
3321 uint32_t current_len = i.second.snaptrimq_len;
3322 if (current_len >= snapthreshold) {
3323 snaptrimq_exceeded++;
3324 if (longest_queue <= current_len) {
3325 longest_q_pg = &i.first;
3326 longest_queue = current_len;
3327 }
3328 if (detail.size() < max - 1) {
3329 stringstream ss;
3330 ss << "snap trim queue for pg " << i.first << " at " << current_len;
3331 detail.push_back(ss.str());
3332 continue;
3333 }
3334 if (detail.size() < max) {
3335 detail.push_back("...more pgs affected");
3336 continue;
3337 }
3338 }
3339 }
3340
3341 if (snaptrimq_exceeded) {
3342 {
3343 ostringstream ss;
3344 ss << "longest queue on pg " << *longest_q_pg << " at " << longest_queue;
3345 detail.push_back(ss.str());
3346 }
3347
3348 stringstream ss;
3349 ss << "snap trim queue for " << snaptrimq_exceeded << " pg(s) >= " << snapthreshold << " (mon_osd_snap_trim_queue_warn_on)";
3350 auto& d = checks->add("PG_SLOW_SNAP_TRIMMING", HEALTH_WARN, ss.str());
3351 detail.push_back("try decreasing \"osd snap trim sleep\" and/or increasing \"osd pg max concurrent snap trims\".");
3352 d.detail.swap(detail);
3353 }
3354 }
3355 }
3356
3357 void PGMap::get_health(
3358 CephContext *cct,
3359 const OSDMap& osdmap,
3360 list<pair<health_status_t,string> >& summary,
3361 list<pair<health_status_t,string> > *detail) const
3362 {
3363 map<string,int> note;
3364 auto p = num_pg_by_state.begin();
3365 auto p_end = num_pg_by_state.end();
3366 for (; p != p_end; ++p) {
3367 if (p->first & PG_STATE_STALE)
3368 note["stale"] += p->second;
3369 if (p->first & PG_STATE_DOWN)
3370 note["down"] += p->second;
3371 if (p->first & PG_STATE_UNDERSIZED)
3372 note["undersized"] += p->second;
3373 if (p->first & PG_STATE_DEGRADED)
3374 note["degraded"] += p->second;
3375 if (p->first & PG_STATE_INCONSISTENT)
3376 note["inconsistent"] += p->second;
3377 if (p->first & PG_STATE_PEERING)
3378 note["peering"] += p->second;
3379 if (p->first & PG_STATE_REPAIR)
3380 note["repair"] += p->second;
3381 if (p->first & PG_STATE_RECOVERING)
3382 note["recovering"] += p->second;
3383 if (p->first & PG_STATE_RECOVERY_WAIT)
3384 note["recovery_wait"] += p->second;
3385 if (p->first & PG_STATE_INCOMPLETE)
3386 note["incomplete"] += p->second;
3387 if (p->first & PG_STATE_BACKFILL_WAIT)
3388 note["backfill_wait"] += p->second;
3389 if (p->first & PG_STATE_BACKFILLING)
3390 note["backfilling"] += p->second;
3391 if (p->first & PG_STATE_BACKFILL_TOOFULL)
3392 note["backfill_toofull"] += p->second;
3393 if (p->first & PG_STATE_RECOVERY_TOOFULL)
3394 note["recovery_toofull"] += p->second;
3395 if (p->first & PG_STATE_SNAPTRIM_ERROR)
3396 note["snaptrim_error"] += p->second;
3397 }
3398
3399 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pgs;
3400 utime_t now(ceph_clock_now());
3401 utime_t cutoff = now - utime_t(g_conf->get_val<int64_t>("mon_pg_stuck_threshold"), 0);
3402 uint64_t num_inactive_pgs = 0;
3403
3404 if (detail) {
3405 // we need to collect details of stuck pgs, first do a quick check
3406 // whether this will yield any results
3407 if (get_stuck_counts(cutoff, note)) {
3408
3409 // there are stuck pgs. gather details for specified statuses
3410 // only if we know that there are pgs stuck in that status
3411
3412 if (note.find("stuck inactive") != note.end()) {
3413 get_stuck_stats(PGMap::STUCK_INACTIVE, cutoff, stuck_pgs);
3414 note["stuck inactive"] = stuck_pgs.size();
3415 num_inactive_pgs += stuck_pgs.size();
3416 note_stuck_detail(PGMap::STUCK_INACTIVE, stuck_pgs,
3417 cct->_conf->get_val<uint64_t>("mon_health_max_detail"), detail);
3418 stuck_pgs.clear();
3419 }
3420
3421 if (note.find("stuck unclean") != note.end()) {
3422 get_stuck_stats(PGMap::STUCK_UNCLEAN, cutoff, stuck_pgs);
3423 note["stuck unclean"] = stuck_pgs.size();
3424 note_stuck_detail(PGMap::STUCK_UNCLEAN, stuck_pgs,
3425 cct->_conf->get_val<uint64_t>("mon_health_max_detail"), detail);
3426 stuck_pgs.clear();
3427 }
3428
3429 if (note.find("stuck undersized") != note.end()) {
3430 get_stuck_stats(PGMap::STUCK_UNDERSIZED, cutoff, stuck_pgs);
3431 note["stuck undersized"] = stuck_pgs.size();
3432 note_stuck_detail(PGMap::STUCK_UNDERSIZED, stuck_pgs,
3433 cct->_conf->get_val<uint64_t>("mon_health_max_detail"), detail);
3434 stuck_pgs.clear();
3435 }
3436
3437 if (note.find("stuck degraded") != note.end()) {
3438 get_stuck_stats(PGMap::STUCK_DEGRADED, cutoff, stuck_pgs);
3439 note["stuck degraded"] = stuck_pgs.size();
3440 note_stuck_detail(PGMap::STUCK_DEGRADED, stuck_pgs,
3441 cct->_conf->get_val<uint64_t>("mon_health_max_detail"), detail);
3442 stuck_pgs.clear();
3443 }
3444
3445 if (note.find("stuck stale") != note.end()) {
3446 get_stuck_stats(PGMap::STUCK_STALE, cutoff, stuck_pgs);
3447 note["stuck stale"] = stuck_pgs.size();
3448 num_inactive_pgs += stuck_pgs.size();
3449 note_stuck_detail(PGMap::STUCK_STALE, stuck_pgs,
3450 cct->_conf->get_val<uint64_t>("mon_health_max_detail"), detail);
3451 }
3452 }
3453 } else {
3454 get_stuck_counts(cutoff, note);
3455 auto p = note.find("stuck inactive");
3456 if (p != note.end())
3457 num_inactive_pgs += p->second;
3458 p = note.find("stuck stale");
3459 if (p != note.end())
3460 num_inactive_pgs += p->second;
3461 }
3462
3463 if (cct->_conf->mon_pg_min_inactive > 0 &&
3464 num_inactive_pgs >= cct->_conf->mon_pg_min_inactive) {
3465 ostringstream ss;
3466 ss << num_inactive_pgs << " pgs are stuck inactive for more than " << g_conf->get_val<int64_t>("mon_pg_stuck_threshold") << " seconds";
3467 summary.push_back(make_pair(HEALTH_ERR, ss.str()));
3468 }
3469
3470 if (!note.empty()) {
3471 for (auto p = note.begin(); p != note.end(); ++p) {
3472 ostringstream ss;
3473 ss << p->second << " pgs " << p->first;
3474 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3475 }
3476 if (detail) {
3477 int n = 0, more = 0;
3478 int max = cct->_conf->get_val<uint64_t>("mon_health_max_detail");
3479 for (auto p = pg_stat.begin();
3480 p != pg_stat.end();
3481 ++p) {
3482 if ((p->second.state & (PG_STATE_STALE |
3483 PG_STATE_DOWN |
3484 PG_STATE_UNDERSIZED |
3485 PG_STATE_DEGRADED |
3486 PG_STATE_INCONSISTENT |
3487 PG_STATE_PEERING |
3488 PG_STATE_REPAIR |
3489 PG_STATE_RECOVERING |
3490 PG_STATE_RECOVERY_WAIT |
3491 PG_STATE_RECOVERY_TOOFULL |
3492 PG_STATE_INCOMPLETE |
3493 PG_STATE_BACKFILL_WAIT |
3494 PG_STATE_BACKFILLING |
3495 PG_STATE_BACKFILL_TOOFULL)) &&
3496 stuck_pgs.count(p->first) == 0) {
3497 if (max > 0) {
3498 --max;
3499 } else {
3500 ++more;
3501 continue;
3502 }
3503 ++n;
3504 ostringstream ss;
3505 ss << "pg " << p->first << " is " << pg_state_string(p->second.state);
3506 ss << ", acting " << p->second.acting;
3507 if (p->second.stats.sum.num_objects_unfound)
3508 ss << ", " << p->second.stats.sum.num_objects_unfound << " unfound";
3509 if (p->second.state & PG_STATE_INCOMPLETE) {
3510 const pg_pool_t *pi = osdmap.get_pg_pool(p->first.pool());
3511 if (pi && pi->min_size > 1) {
3512 ss << " (reducing pool " << osdmap.get_pool_name(p->first.pool())
3513 << " min_size from " << (int)pi->min_size
3514 << " may help; search ceph.com/docs for 'incomplete')";
3515 }
3516 }
3517 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3518 }
3519 }
3520 if (more) {
3521 ostringstream ss;
3522 ss << more << " more pgs are also unhealthy";
3523 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3524 }
3525 }
3526 }
3527
3528 // slow requests
3529 if (cct->_conf->mon_osd_warn_op_age > 0 &&
3530 osd_sum.op_queue_age_hist.upper_bound() / 1000.0 >
3531 cct->_conf->mon_osd_warn_op_age) {
3532 auto sum = _warn_slow_request_histogram(
3533 cct, osd_sum.op_queue_age_hist, "", summary, NULL);
3534 if (sum.first > 0 || sum.second > 0) {
3535 if (sum.first > 0) {
3536 ostringstream ss;
3537 ss << sum.first << " requests are blocked > "
3538 << cct->_conf->mon_osd_warn_op_age
3539 << " sec";
3540 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3541 }
3542 if (sum.second > 0) {
3543 ostringstream ss;
3544 ss << sum.second << " requests are blocked > "
3545 << (cct->_conf->mon_osd_warn_op_age *
3546 cct->_conf->mon_osd_err_op_age_ratio)
3547 << " sec";
3548 summary.push_back(make_pair(HEALTH_ERR, ss.str()));
3549 }
3550
3551 if (detail) {
3552 unsigned num_warn = 0, num_err = 0;
3553 // do per-osd warnings
3554 for (auto p = osd_stat.begin();
3555 p != osd_stat.end();
3556 ++p) {
3557 auto sum = _warn_slow_request_histogram(
3558 cct,
3559 p->second.op_queue_age_hist,
3560 string(" on osd.") + stringify(p->first),
3561 summary, detail);
3562 if (sum.second)
3563 ++num_err;
3564 else if (sum.first)
3565 ++num_warn;
3566 }
3567 if (num_err) {
3568 ostringstream ss2;
3569 ss2 << num_err << " osds have very slow requests";
3570 summary.push_back(make_pair(HEALTH_ERR, ss2.str()));
3571 detail->push_back(make_pair(HEALTH_ERR, ss2.str()));
3572 }
3573 if (num_warn) {
3574 ostringstream ss2;
3575 ss2 << num_warn << " osds have slow requests";
3576 summary.push_back(make_pair(HEALTH_WARN, ss2.str()));
3577 detail->push_back(make_pair(HEALTH_WARN, ss2.str()));
3578 }
3579 }
3580 }
3581 }
3582
3583 // recovery
3584 list<string> sl;
3585 overall_recovery_summary(NULL, &sl);
3586 for (auto p = sl.begin(); p != sl.end(); ++p) {
3587 summary.push_back(make_pair(HEALTH_WARN, "recovery " + *p));
3588 if (detail)
3589 detail->push_back(make_pair(HEALTH_WARN, "recovery " + *p));
3590 }
3591
3592 // near-target max pools
3593 auto& pools = osdmap.get_pools();
3594 for (auto p = pools.begin();
3595 p != pools.end(); ++p) {
3596 if ((!p->second.target_max_objects && !p->second.target_max_bytes) ||
3597 !pg_pool_sum.count(p->first))
3598 continue;
3599 bool nearfull = false;
3600 const string& name = osdmap.get_pool_name(p->first);
3601 const pool_stat_t& st = get_pg_pool_sum_stat(p->first);
3602 uint64_t ratio = p->second.cache_target_full_ratio_micro +
3603 ((1000000 - p->second.cache_target_full_ratio_micro) *
3604 cct->_conf->mon_cache_target_full_warn_ratio);
3605 if (p->second.target_max_objects &&
3606 (uint64_t)(st.stats.sum.num_objects -
3607 st.stats.sum.num_objects_hit_set_archive) >
3608 p->second.target_max_objects * (ratio / 1000000.0)) {
3609 nearfull = true;
3610 if (detail) {
3611 ostringstream ss;
3612 ss << "cache pool '" << name << "' with "
3613 << si_t(st.stats.sum.num_objects)
3614 << " objects at/near target max "
3615 << si_t(p->second.target_max_objects) << " objects";
3616 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3617 }
3618 }
3619 if (p->second.target_max_bytes &&
3620 (uint64_t)(st.stats.sum.num_bytes -
3621 st.stats.sum.num_bytes_hit_set_archive) >
3622 p->second.target_max_bytes * (ratio / 1000000.0)) {
3623 nearfull = true;
3624 if (detail) {
3625 ostringstream ss;
3626 ss << "cache pool '" << name
3627 << "' with " << si_t(st.stats.sum.num_bytes)
3628 << "B at/near target max "
3629 << si_t(p->second.target_max_bytes) << "B";
3630 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3631 }
3632 }
3633 if (nearfull) {
3634 ostringstream ss;
3635 ss << "'" << name << "' at/near target max";
3636 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3637 }
3638 }
3639
3640 // scrub
3641 if (pg_sum.stats.sum.num_scrub_errors) {
3642 ostringstream ss;
3643 ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors";
3644 summary.push_back(make_pair(HEALTH_ERR, ss.str()));
3645 if (detail) {
3646 detail->push_back(make_pair(HEALTH_ERR, ss.str()));
3647 }
3648 }
3649
3650 // pg skew
3651 auto num_in = osdmap.get_num_in_osds();
3652 auto sum_pg_up = MAX(static_cast<unsigned>(pg_sum.up), pg_stat.size());
3653 int sum_objects = pg_sum.stats.sum.num_objects;
3654 if (sum_objects < cct->_conf->mon_pg_warn_min_objects) {
3655 return;
3656 }
3657 const auto min_pg_per_osd =
3658 cct->_conf->get_val<uint64_t>("mon_pg_warn_min_per_osd");
3659 if (num_in && min_pg_per_osd > 0) {
3660 auto per = sum_pg_up / num_in;
3661 if (per < min_pg_per_osd && per) {
3662 ostringstream ss;
3663 ss << "too few PGs per OSD (" << per << " < min " << min_pg_per_osd << ")";
3664 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3665 if (detail)
3666 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3667 }
3668 }
3669 int64_t max_pg_per_osd = cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd");
3670 if (num_in && max_pg_per_osd > 0) {
3671 int per = sum_pg_up / num_in;
3672 if (per > max_pg_per_osd) {
3673 ostringstream ss;
3674 ss << "too many PGs per OSD (" << per << " > max "
3675 << max_pg_per_osd << ")";
3676 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3677 if (detail)
3678 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3679 }
3680 }
3681 if (!pg_stat.empty()) {
3682 for (auto p = pg_pool_sum.begin();
3683 p != pg_pool_sum.end();
3684 ++p) {
3685 const pg_pool_t *pi = osdmap.get_pg_pool(p->first);
3686 if (!pi)
3687 continue; // in case osdmap changes haven't propagated to PGMap yet
3688 const string& name = osdmap.get_pool_name(p->first);
3689 if (pi->get_pg_num() > pi->get_pgp_num() &&
3690 !(name.find(".DELETED") != string::npos &&
3691 cct->_conf->mon_fake_pool_delete)) {
3692 ostringstream ss;
3693 ss << "pool " << name << " pg_num "
3694 << pi->get_pg_num() << " > pgp_num " << pi->get_pgp_num();
3695 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3696 if (detail)
3697 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3698 }
3699 int average_objects_per_pg = pg_sum.stats.sum.num_objects / pg_stat.size();
3700 if (average_objects_per_pg > 0 &&
3701 pg_sum.stats.sum.num_objects >= cct->_conf->mon_pg_warn_min_objects &&
3702 p->second.stats.sum.num_objects >= cct->_conf->mon_pg_warn_min_pool_objects) {
3703 int objects_per_pg = p->second.stats.sum.num_objects / pi->get_pg_num();
3704 float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
3705 if (cct->_conf->mon_pg_warn_max_object_skew > 0 &&
3706 ratio > cct->_conf->mon_pg_warn_max_object_skew) {
3707 ostringstream ss;
3708 ss << "pool " << name << " has many more objects per pg than average (too few pgs?)";
3709 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3710 if (detail) {
3711 ostringstream ss;
3712 ss << "pool " << name << " objects per pg ("
3713 << objects_per_pg << ") is more than " << ratio << " times cluster average ("
3714 << average_objects_per_pg << ")";
3715 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3716 }
3717 }
3718 }
3719 }
3720 }
3721
3722 for (auto it : pools) {
3723 auto it2 = pg_pool_sum.find(it.first);
3724 if (it2 == pg_pool_sum.end()) {
3725 continue;
3726 }
3727 const pool_stat_t *pstat = &it2->second;
3728 const object_stat_sum_t& sum = pstat->stats.sum;
3729 const string& pool_name = osdmap.get_pool_name(it.first);
3730 const pg_pool_t &pool = it.second;
3731
3732 float warn_threshold = (float)g_conf->mon_pool_quota_warn_threshold/100;
3733 float crit_threshold = (float)g_conf->mon_pool_quota_crit_threshold/100;
3734
3735 if (pool.quota_max_objects > 0) {
3736 stringstream ss;
3737 health_status_t status = HEALTH_OK;
3738 if ((uint64_t)sum.num_objects >= pool.quota_max_objects) {
3739 } else if (crit_threshold > 0 &&
3740 sum.num_objects >= pool.quota_max_objects*crit_threshold) {
3741 ss << "pool '" << pool_name
3742 << "' has " << sum.num_objects << " objects"
3743 << " (max " << pool.quota_max_objects << ")";
3744 status = HEALTH_ERR;
3745 } else if (warn_threshold > 0 &&
3746 sum.num_objects >= pool.quota_max_objects*warn_threshold) {
3747 ss << "pool '" << pool_name
3748 << "' has " << sum.num_objects << " objects"
3749 << " (max " << pool.quota_max_objects << ")";
3750 status = HEALTH_WARN;
3751 }
3752 if (status != HEALTH_OK) {
3753 pair<health_status_t,string> s(status, ss.str());
3754 summary.push_back(s);
3755 if (detail)
3756 detail->push_back(s);
3757 }
3758 }
3759
3760 if (pool.quota_max_bytes > 0) {
3761 health_status_t status = HEALTH_OK;
3762 stringstream ss;
3763 if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
3764 } else if (crit_threshold > 0 &&
3765 sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
3766 ss << "pool '" << pool_name
3767 << "' has " << si_t(sum.num_bytes) << " bytes"
3768 << " (max " << si_t(pool.quota_max_bytes) << ")";
3769 status = HEALTH_ERR;
3770 } else if (warn_threshold > 0 &&
3771 sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
3772 ss << "pool '" << pool_name
3773 << "' has " << si_t(sum.num_bytes) << " bytes"
3774 << " (max " << si_t(pool.quota_max_bytes) << ")";
3775 status = HEALTH_WARN;
3776 }
3777 if (status != HEALTH_OK) {
3778 pair<health_status_t,string> s(status, ss.str());
3779 summary.push_back(s);
3780 if (detail)
3781 detail->push_back(s);
3782 }
3783 }
3784 }
3785
3786 print_unscrubbed_pgs(pg_stat, summary, detail, cct);
3787 }
3788
3789 int process_pg_map_command(
3790 const string& orig_prefix,
3791 const map<string,cmd_vartype>& orig_cmdmap,
3792 const PGMap& pg_map,
3793 const OSDMap& osdmap,
3794 Formatter *f,
3795 stringstream *ss,
3796 bufferlist *odata)
3797 {
3798 string prefix = orig_prefix;
3799 map<string,cmd_vartype> cmdmap = orig_cmdmap;
3800
3801 // perhaps these would be better in the parsing, but it's weird
3802 bool primary = false;
3803 if (prefix == "pg dump_json") {
3804 vector<string> v;
3805 v.push_back(string("all"));
3806 cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
3807 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3808 prefix = "pg dump";
3809 } else if (prefix == "pg dump_pools_json") {
3810 vector<string> v;
3811 v.push_back(string("pools"));
3812 cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
3813 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3814 prefix = "pg dump";
3815 } else if (prefix == "pg ls-by-primary") {
3816 primary = true;
3817 prefix = "pg ls";
3818 } else if (prefix == "pg ls-by-osd") {
3819 prefix = "pg ls";
3820 } else if (prefix == "pg ls-by-pool") {
3821 prefix = "pg ls";
3822 string poolstr;
3823 cmd_getval(g_ceph_context, cmdmap, "poolstr", poolstr);
3824 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
3825 if (pool < 0) {
3826 *ss << "pool " << poolstr << " does not exist";
3827 return -ENOENT;
3828 }
3829 cmd_putval(g_ceph_context, cmdmap, "pool", pool);
3830 }
3831
3832 int r = 0;
3833 stringstream ds;
3834 if (prefix == "pg stat") {
3835 if (f) {
3836 f->open_object_section("pg_summary");
3837 pg_map.print_oneline_summary(f, NULL);
3838 f->close_section();
3839 f->flush(ds);
3840 } else {
3841 ds << pg_map;
3842 }
3843 odata->append(ds);
3844 return 0;
3845 }
3846
3847 if (prefix == "pg getmap") {
3848 pg_map.encode(*odata);
3849 *ss << "got pgmap version " << pg_map.version;
3850 return 0;
3851 }
3852
3853 if (prefix == "pg dump") {
3854 string val;
3855 vector<string> dumpcontents;
3856 set<string> what;
3857 if (cmd_getval(g_ceph_context, cmdmap, "dumpcontents", dumpcontents)) {
3858 copy(dumpcontents.begin(), dumpcontents.end(),
3859 inserter(what, what.end()));
3860 }
3861 if (what.empty())
3862 what.insert("all");
3863 if (f) {
3864 if (what.count("all")) {
3865 f->open_object_section("pg_map");
3866 pg_map.dump(f);
3867 f->close_section();
3868 } else if (what.count("summary") || what.count("sum")) {
3869 f->open_object_section("pg_map");
3870 pg_map.dump_basic(f);
3871 f->close_section();
3872 } else {
3873 if (what.count("pools")) {
3874 pg_map.dump_pool_stats(f);
3875 }
3876 if (what.count("osds")) {
3877 pg_map.dump_osd_stats(f);
3878 }
3879 if (what.count("pgs")) {
3880 pg_map.dump_pg_stats(f, false);
3881 }
3882 if (what.count("pgs_brief")) {
3883 pg_map.dump_pg_stats(f, true);
3884 }
3885 if (what.count("delta")) {
3886 f->open_object_section("delta");
3887 pg_map.dump_delta(f);
3888 f->close_section();
3889 }
3890 }
3891 f->flush(*odata);
3892 } else {
3893 if (what.count("all")) {
3894 pg_map.dump(ds);
3895 } else if (what.count("summary") || what.count("sum")) {
3896 pg_map.dump_basic(ds);
3897 pg_map.dump_pg_sum_stats(ds, true);
3898 pg_map.dump_osd_sum_stats(ds);
3899 } else {
3900 if (what.count("pgs_brief")) {
3901 pg_map.dump_pg_stats(ds, true);
3902 }
3903 bool header = true;
3904 if (what.count("pgs")) {
3905 pg_map.dump_pg_stats(ds, false);
3906 header = false;
3907 }
3908 if (what.count("pools")) {
3909 pg_map.dump_pool_stats(ds, header);
3910 }
3911 if (what.count("osds")) {
3912 pg_map.dump_osd_stats(ds);
3913 }
3914 }
3915 odata->append(ds);
3916 }
3917 *ss << "dumped " << what;
3918 return 0;
3919 }
3920
3921 if (prefix == "pg ls") {
3922 int64_t osd = -1;
3923 int64_t pool = -1;
3924 vector<string>states;
3925 set<pg_t> pgs;
3926 cmd_getval(g_ceph_context, cmdmap, "pool", pool);
3927 cmd_getval(g_ceph_context, cmdmap, "osd", osd);
3928 cmd_getval(g_ceph_context, cmdmap, "states", states);
3929 if (pool >= 0 && !osdmap.have_pg_pool(pool)) {
3930 *ss << "pool " << pool << " does not exist";
3931 return -ENOENT;
3932 }
3933 if (osd >= 0 && !osdmap.is_up(osd)) {
3934 *ss << "osd " << osd << " is not up";
3935 return -EAGAIN;
3936 }
3937 if (states.empty())
3938 states.push_back("all");
3939
3940 uint32_t state = 0;
3941
3942 while (!states.empty()) {
3943 string state_str = states.back();
3944
3945 if (state_str == "all") {
3946 state = -1;
3947 break;
3948 } else {
3949 auto filter = pg_string_state(state_str);
3950 if (!filter) {
3951 *ss << "'" << state_str << "' is not a valid pg state,"
3952 << " available choices: " << pg_state_string(0xFFFFFFFF);
3953 return -EINVAL;
3954 }
3955 state |= *filter;
3956 }
3957
3958 states.pop_back();
3959 }
3960
3961 pg_map.get_filtered_pg_stats(state, pool, osd, primary, pgs);
3962
3963 if (f && !pgs.empty()) {
3964 pg_map.dump_filtered_pg_stats(f, pgs);
3965 f->flush(*odata);
3966 } else if (!pgs.empty()) {
3967 pg_map.dump_filtered_pg_stats(ds, pgs);
3968 odata->append(ds);
3969 }
3970 return 0;
3971 }
3972
3973 if (prefix == "pg dump_stuck") {
3974 vector<string> stuckop_vec;
3975 cmd_getval(g_ceph_context, cmdmap, "stuckops", stuckop_vec);
3976 if (stuckop_vec.empty())
3977 stuckop_vec.push_back("unclean");
3978 int64_t threshold;
3979 cmd_getval(g_ceph_context, cmdmap, "threshold", threshold,
3980 g_conf->get_val<int64_t>("mon_pg_stuck_threshold"));
3981
3982 r = pg_map.dump_stuck_pg_stats(ds, f, (int)threshold, stuckop_vec);
3983 odata->append(ds);
3984 if (r < 0)
3985 *ss << "failed";
3986 else
3987 *ss << "ok";
3988 return 0;
3989 }
3990
3991 if (prefix == "pg debug") {
3992 string debugop;
3993 cmd_getval(g_ceph_context, cmdmap, "debugop", debugop,
3994 string("unfound_objects_exist"));
3995 if (debugop == "unfound_objects_exist") {
3996 bool unfound_objects_exist = false;
3997 for (const auto& p : pg_map.pg_stat) {
3998 if (p.second.stats.sum.num_objects_unfound > 0) {
3999 unfound_objects_exist = true;
4000 break;
4001 }
4002 }
4003 if (unfound_objects_exist)
4004 ds << "TRUE";
4005 else
4006 ds << "FALSE";
4007 odata->append(ds);
4008 return 0;
4009 }
4010 if (debugop == "degraded_pgs_exist") {
4011 bool degraded_pgs_exist = false;
4012 for (const auto& p : pg_map.pg_stat) {
4013 if (p.second.stats.sum.num_objects_degraded > 0) {
4014 degraded_pgs_exist = true;
4015 break;
4016 }
4017 }
4018 if (degraded_pgs_exist)
4019 ds << "TRUE";
4020 else
4021 ds << "FALSE";
4022 odata->append(ds);
4023 return 0;
4024 }
4025 }
4026
4027 if (prefix == "osd perf") {
4028 if (f) {
4029 f->open_object_section("osdstats");
4030 pg_map.dump_osd_perf_stats(f);
4031 f->close_section();
4032 f->flush(ds);
4033 } else {
4034 pg_map.print_osd_perf_stats(&ds);
4035 }
4036 odata->append(ds);
4037 return 0;
4038 }
4039
4040 if (prefix == "osd blocked-by") {
4041 if (f) {
4042 f->open_object_section("osd_blocked_by");
4043 pg_map.dump_osd_blocked_by_stats(f);
4044 f->close_section();
4045 f->flush(ds);
4046 } else {
4047 pg_map.print_osd_blocked_by_stats(&ds);
4048 }
4049 odata->append(ds);
4050 return 0;
4051 }
4052
4053 if (prefix == "osd pool stats") {
4054 string pool_name;
4055 cmd_getval(g_ceph_context, cmdmap, "name", pool_name);
4056
4057 int64_t poolid = -ENOENT;
4058 bool one_pool = false;
4059 if (!pool_name.empty()) {
4060 poolid = osdmap.lookup_pg_pool_name(pool_name);
4061 if (poolid < 0) {
4062 assert(poolid == -ENOENT);
4063 *ss << "unrecognized pool '" << pool_name << "'";
4064 return -ENOENT;
4065 }
4066 one_pool = true;
4067 }
4068
4069 stringstream rs;
4070
4071 if (f)
4072 f->open_array_section("pool_stats");
4073 else {
4074 if (osdmap.get_pools().empty()) {
4075 *ss << "there are no pools!";
4076 goto stats_out;
4077 }
4078 }
4079
4080 for (auto& p : osdmap.get_pools()) {
4081 if (!one_pool)
4082 poolid = p.first;
4083
4084 pool_name = osdmap.get_pool_name(poolid);
4085
4086 if (f) {
4087 f->open_object_section("pool");
4088 f->dump_string("pool_name", pool_name.c_str());
4089 f->dump_int("pool_id", poolid);
4090 f->open_object_section("recovery");
4091 }
4092
4093 list<string> sl;
4094 stringstream tss;
4095 pg_map.pool_recovery_summary(f, &sl, poolid);
4096 if (!f && !sl.empty()) {
4097 for (auto& p : sl)
4098 tss << " " << p << "\n";
4099 }
4100
4101 if (f) {
4102 f->close_section();
4103 f->open_object_section("recovery_rate");
4104 }
4105
4106 ostringstream rss;
4107 pg_map.pool_recovery_rate_summary(f, &rss, poolid);
4108 if (!f && !rss.str().empty())
4109 tss << " recovery io " << rss.str() << "\n";
4110
4111 if (f) {
4112 f->close_section();
4113 f->open_object_section("client_io_rate");
4114 }
4115 rss.clear();
4116 rss.str("");
4117
4118 pg_map.pool_client_io_rate_summary(f, &rss, poolid);
4119 if (!f && !rss.str().empty())
4120 tss << " client io " << rss.str() << "\n";
4121
4122 // dump cache tier IO rate for cache pool
4123 const pg_pool_t *pool = osdmap.get_pg_pool(poolid);
4124 if (pool->is_tier()) {
4125 if (f) {
4126 f->close_section();
4127 f->open_object_section("cache_io_rate");
4128 }
4129 rss.clear();
4130 rss.str("");
4131
4132 pg_map.pool_cache_io_rate_summary(f, &rss, poolid);
4133 if (!f && !rss.str().empty())
4134 tss << " cache tier io " << rss.str() << "\n";
4135 }
4136 if (f) {
4137 f->close_section();
4138 f->close_section();
4139 } else {
4140 rs << "pool " << pool_name << " id " << poolid << "\n";
4141 if (!tss.str().empty())
4142 rs << tss.str() << "\n";
4143 else
4144 rs << " nothing is going on\n\n";
4145 }
4146 if (one_pool)
4147 break;
4148 }
4149
4150 stats_out:
4151 if (f) {
4152 f->close_section();
4153 f->flush(ds);
4154 odata->append(ds);
4155 } else {
4156 odata->append(rs.str());
4157 }
4158 return 0;
4159 }
4160
4161 return -EOPNOTSUPP;
4162 }
4163
4164 void PGMapUpdater::check_osd_map(const OSDMap::Incremental &osd_inc,
4165 std::set<int> *need_check_down_pg_osds,
4166 std::map<int,utime_t> *last_osd_report,
4167 PGMap *pg_map,
4168 PGMap::Incremental *pending_inc)
4169 {
4170 for (const auto &p : osd_inc.new_weight) {
4171 if (p.second == CEPH_OSD_OUT) {
4172 dout(10) << __func__ << " osd." << p.first << " went OUT" << dendl;
4173 auto j = pg_map->osd_epochs.find(p.first);
4174 if (j != pg_map->osd_epochs.end())
4175 pending_inc->stat_osd_out(p.first, j->second);
4176 }
4177 }
4178
4179 // this is conservative: we want to know if any osds (maybe) got marked down.
4180 for (const auto &p : osd_inc.new_state) {
4181 if (p.second & CEPH_OSD_UP) { // true if marked up OR down,
4182 // but we're too lazy to check
4183 // which
4184 need_check_down_pg_osds->insert(p.first);
4185
4186 // clear out the last_osd_report for this OSD
4187 auto report = last_osd_report->find(p.first);
4188 if (report != last_osd_report->end()) {
4189 last_osd_report->erase(report);
4190 }
4191
4192 // clear out osd_stat slow request histogram
4193 dout(20) << __func__ << " clearing osd." << p.first
4194 << " request histogram" << dendl;
4195 pending_inc->stat_osd_down_up(p.first, osd_inc.epoch, *pg_map);
4196 }
4197
4198 if (p.second & CEPH_OSD_EXISTS) {
4199 // whether it was created *or* destroyed, we can safely drop
4200 // it's osd_stat_t record.
4201 dout(10) << __func__ << " osd." << p.first
4202 << " created or destroyed" << dendl;
4203 pending_inc->rm_stat(p.first);
4204
4205 // and adjust full, nearfull set
4206 pg_map->nearfull_osds.erase(p.first);
4207 pg_map->full_osds.erase(p.first);
4208 }
4209 }
4210 }
4211
4212 void PGMapUpdater::check_osd_map(
4213 CephContext *cct,
4214 const OSDMap& osdmap,
4215 const PGMap& pgmap,
4216 PGMap::Incremental *pending_inc)
4217 {
4218 for (auto& p : pgmap.osd_stat) {
4219 if (!osdmap.exists(p.first)) {
4220 // remove osd_stat
4221 pending_inc->rm_stat(p.first);
4222 } else if (osdmap.is_out(p.first)) {
4223 // zero osd_stat
4224 if (p.second.kb != 0) {
4225 auto j = pgmap.osd_epochs.find(p.first);
4226 if (j != pgmap.osd_epochs.end()) {
4227 pending_inc->stat_osd_out(p.first, j->second);
4228 }
4229 }
4230 } else if (!osdmap.is_up(p.first)) {
4231 // zero the op_queue_age_hist
4232 if (!p.second.op_queue_age_hist.empty()) {
4233 pending_inc->stat_osd_down_up(p.first, osdmap.get_epoch(), pgmap);
4234 }
4235 }
4236 }
4237
4238 // deleted pgs (pools)?
4239 for (auto& p : pgmap.pg_pool_sum) {
4240 if (!osdmap.have_pg_pool(p.first)) {
4241 ldout(cct, 10) << __func__ << " pool " << p.first << " gone, removing pgs"
4242 << dendl;
4243 for (auto& q : pgmap.pg_stat) {
4244 if (q.first.pool() == (uint64_t)p.first) {
4245 pending_inc->pg_remove.insert(q.first);
4246 }
4247 }
4248 auto q = pending_inc->pg_stat_updates.begin();
4249 while (q != pending_inc->pg_stat_updates.end()) {
4250 if (q->first.pool() == (uint64_t)p.first) {
4251 q = pending_inc->pg_stat_updates.erase(q);
4252 } else {
4253 ++q;
4254 }
4255 }
4256 }
4257 }
4258
4259 // new pgs (split or new pool)?
4260 for (auto& p : osdmap.get_pools()) {
4261 int64_t poolid = p.first;
4262 const pg_pool_t& pi = p.second;
4263 auto q = pgmap.num_pg_by_pool.find(poolid);
4264 unsigned my_pg_num = 0;
4265 if (q != pgmap.num_pg_by_pool.end())
4266 my_pg_num = q->second;
4267 unsigned pg_num = pi.get_pg_num();
4268 if (my_pg_num != pg_num) {
4269 ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
4270 << " != my pg_num " << my_pg_num << dendl;
4271 for (unsigned ps = my_pg_num; ps < pg_num; ++ps) {
4272 pg_t pgid(ps, poolid);
4273 if (pending_inc->pg_stat_updates.count(pgid) == 0) {
4274 ldout(cct,20) << __func__ << " adding " << pgid << dendl;
4275 pg_stat_t &stats = pending_inc->pg_stat_updates[pgid];
4276 stats.last_fresh = osdmap.get_modified();
4277 stats.last_active = osdmap.get_modified();
4278 stats.last_change = osdmap.get_modified();
4279 stats.last_peered = osdmap.get_modified();
4280 stats.last_clean = osdmap.get_modified();
4281 stats.last_unstale = osdmap.get_modified();
4282 stats.last_undegraded = osdmap.get_modified();
4283 stats.last_fullsized = osdmap.get_modified();
4284 stats.last_scrub_stamp = osdmap.get_modified();
4285 stats.last_deep_scrub_stamp = osdmap.get_modified();
4286 stats.last_clean_scrub_stamp = osdmap.get_modified();
4287 }
4288 }
4289 }
4290 }
4291 }
4292
4293 void PGMapUpdater::register_pg(
4294 const OSDMap &osd_map,
4295 pg_t pgid, epoch_t epoch,
4296 bool new_pool,
4297 const PGMap &pg_map,
4298 PGMap::Incremental *pending_inc)
4299 {
4300 pg_t parent;
4301 int split_bits = 0;
4302 auto parent_stat = pg_map.pg_stat.end();
4303 if (!new_pool) {
4304 parent = pgid;
4305 while (1) {
4306 // remove most significant bit
4307 int msb = cbits(parent.ps());
4308 if (!msb)
4309 break;
4310 parent.set_ps(parent.ps() & ~(1<<(msb-1)));
4311 split_bits++;
4312 dout(30) << " is " << pgid << " parent " << parent << " ?" << dendl;
4313 parent_stat = pg_map.pg_stat.find(parent);
4314 if (parent_stat != pg_map.pg_stat.end() &&
4315 parent_stat->second.state != PG_STATE_CREATING) {
4316 dout(10) << " parent is " << parent << dendl;
4317 break;
4318 }
4319 }
4320 }
4321
4322 pg_stat_t &stats = pending_inc->pg_stat_updates[pgid];
4323 stats.state = PG_STATE_CREATING;
4324 stats.created = epoch;
4325 stats.parent = parent;
4326 stats.parent_split_bits = split_bits;
4327 stats.mapping_epoch = epoch;
4328
4329 if (parent_stat != pg_map.pg_stat.end()) {
4330 const pg_stat_t &ps = parent_stat->second;
4331 stats.last_fresh = ps.last_fresh;
4332 stats.last_active = ps.last_active;
4333 stats.last_change = ps.last_change;
4334 stats.last_peered = ps.last_peered;
4335 stats.last_clean = ps.last_clean;
4336 stats.last_unstale = ps.last_unstale;
4337 stats.last_undegraded = ps.last_undegraded;
4338 stats.last_fullsized = ps.last_fullsized;
4339 stats.last_scrub_stamp = ps.last_scrub_stamp;
4340 stats.last_deep_scrub_stamp = ps.last_deep_scrub_stamp;
4341 stats.last_clean_scrub_stamp = ps.last_clean_scrub_stamp;
4342 } else {
4343 utime_t now = osd_map.get_modified();
4344 stats.last_fresh = now;
4345 stats.last_active = now;
4346 stats.last_change = now;
4347 stats.last_peered = now;
4348 stats.last_clean = now;
4349 stats.last_unstale = now;
4350 stats.last_undegraded = now;
4351 stats.last_fullsized = now;
4352 stats.last_scrub_stamp = now;
4353 stats.last_deep_scrub_stamp = now;
4354 stats.last_clean_scrub_stamp = now;
4355 }
4356
4357 osd_map.pg_to_up_acting_osds(
4358 pgid,
4359 &stats.up,
4360 &stats.up_primary,
4361 &stats.acting,
4362 &stats.acting_primary);
4363
4364 if (split_bits == 0) {
4365 dout(10) << __func__ << " will create " << pgid
4366 << " primary " << stats.acting_primary
4367 << " acting " << stats.acting
4368 << dendl;
4369 } else {
4370 dout(10) << __func__ << " will create " << pgid
4371 << " primary " << stats.acting_primary
4372 << " acting " << stats.acting
4373 << " parent " << parent
4374 << " by " << split_bits << " bits"
4375 << dendl;
4376 }
4377 }
4378
4379 void PGMapUpdater::register_new_pgs(
4380 const OSDMap &osd_map,
4381 const PGMap &pg_map,
4382 PGMap::Incremental *pending_inc)
4383 {
4384 epoch_t epoch = osd_map.get_epoch();
4385 dout(10) << __func__ << " checking pg pools for osdmap epoch " << epoch
4386 << ", last_pg_scan " << pg_map.last_pg_scan << dendl;
4387
4388 int created = 0;
4389 const auto &pools = osd_map.get_pools();
4390
4391 for (const auto &p : pools) {
4392 int64_t poolid = p.first;
4393 const pg_pool_t &pool = p.second;
4394 int ruleno = osd_map.crush->find_rule(pool.get_crush_rule(),
4395 pool.get_type(), pool.get_size());
4396 if (ruleno < 0 || !osd_map.crush->rule_exists(ruleno))
4397 continue;
4398
4399 if (pool.get_last_change() <= pg_map.last_pg_scan ||
4400 pool.get_last_change() <= pending_inc->pg_scan) {
4401 dout(10) << " no change in pool " << poolid << " " << pool << dendl;
4402 continue;
4403 }
4404
4405 dout(10) << __func__ << " scanning pool " << poolid
4406 << " " << pool << dendl;
4407
4408 // first pgs in this pool
4409 bool new_pool = pg_map.pg_pool_sum.count(poolid) == 0;
4410
4411 for (ps_t ps = 0; ps < pool.get_pg_num(); ps++) {
4412 pg_t pgid(ps, poolid, -1);
4413 if (pg_map.pg_stat.count(pgid)) {
4414 dout(20) << "register_new_pgs have " << pgid << dendl;
4415 continue;
4416 }
4417 created++;
4418 register_pg(osd_map, pgid, pool.get_last_change(), new_pool,
4419 pg_map, pending_inc);
4420 }
4421 }
4422
4423 int removed = 0;
4424 for (const auto &p : pg_map.creating_pgs) {
4425 if (p.preferred() >= 0) {
4426 dout(20) << " removing creating_pg " << p
4427 << " because it is localized and obsolete" << dendl;
4428 pending_inc->pg_remove.insert(p);
4429 ++removed;
4430 } else if (!osd_map.have_pg_pool(p.pool())) {
4431 dout(20) << " removing creating_pg " << p
4432 << " because containing pool deleted" << dendl;
4433 pending_inc->pg_remove.insert(p);
4434 ++removed;
4435 }
4436 }
4437
4438 // deleted pools?
4439 for (const auto &p : pg_map.pg_stat) {
4440 if (!osd_map.have_pg_pool(p.first.pool())) {
4441 dout(20) << " removing pg_stat " << p.first << " because "
4442 << "containing pool deleted" << dendl;
4443 pending_inc->pg_remove.insert(p.first);
4444 ++removed;
4445 } else if (p.first.preferred() >= 0) {
4446 dout(20) << " removing localized pg " << p.first << dendl;
4447 pending_inc->pg_remove.insert(p.first);
4448 ++removed;
4449 }
4450 }
4451
4452 // we don't want to redo this work if we can avoid it.
4453 pending_inc->pg_scan = epoch;
4454
4455 dout(10) << "register_new_pgs registered " << created << " new pgs, removed "
4456 << removed << " uncreated pgs" << dendl;
4457 }
4458
4459
4460 void PGMapUpdater::update_creating_pgs(
4461 const OSDMap &osd_map,
4462 const PGMap &pg_map,
4463 PGMap::Incremental *pending_inc)
4464 {
4465 dout(10) << __func__ << " to " << pg_map.creating_pgs.size()
4466 << " pgs, osdmap epoch " << osd_map.get_epoch()
4467 << dendl;
4468
4469 unsigned changed = 0;
4470 for (auto p = pg_map.creating_pgs.begin();
4471 p != pg_map.creating_pgs.end();
4472 ++p) {
4473 pg_t pgid = *p;
4474 pg_t on = pgid;
4475 auto q = pg_map.pg_stat.find(pgid);
4476 assert(q != pg_map.pg_stat.end());
4477 const pg_stat_t *s = &q->second;
4478
4479 if (s->parent_split_bits)
4480 on = s->parent;
4481
4482 vector<int> up, acting;
4483 int up_primary, acting_primary;
4484 osd_map.pg_to_up_acting_osds(
4485 on,
4486 &up,
4487 &up_primary,
4488 &acting,
4489 &acting_primary);
4490
4491 if (up != s->up ||
4492 up_primary != s->up_primary ||
4493 acting != s->acting ||
4494 acting_primary != s->acting_primary) {
4495 pg_stat_t *ns = &pending_inc->pg_stat_updates[pgid];
4496 if (osd_map.get_epoch() > ns->reported_epoch) {
4497 dout(20) << __func__ << " " << pgid << " "
4498 << " acting_primary: " << s->acting_primary
4499 << " -> " << acting_primary
4500 << " acting: " << s->acting << " -> " << acting
4501 << " up_primary: " << s->up_primary << " -> " << up_primary
4502 << " up: " << s->up << " -> " << up
4503 << dendl;
4504
4505 // only initialize if it wasn't already a pending update
4506 if (ns->reported_epoch == 0)
4507 *ns = *s;
4508
4509 // note epoch if the target of the create message changed
4510 if (acting_primary != ns->acting_primary)
4511 ns->mapping_epoch = osd_map.get_epoch();
4512
4513 ns->up = up;
4514 ns->up_primary = up_primary;
4515 ns->acting = acting;
4516 ns->acting_primary = acting_primary;
4517
4518 ++changed;
4519 } else {
4520 dout(20) << __func__ << " " << pgid << " has pending update from newer"
4521 << " epoch " << ns->reported_epoch
4522 << dendl;
4523 }
4524 }
4525 }
4526 if (changed) {
4527 dout(10) << __func__ << " " << changed << " pgs changed primary" << dendl;
4528 }
4529 }
4530
4531 static void _try_mark_pg_stale(
4532 const OSDMap& osdmap,
4533 pg_t pgid,
4534 const pg_stat_t& cur,
4535 PGMap::Incremental *pending_inc)
4536 {
4537 if ((cur.state & PG_STATE_STALE) == 0 &&
4538 cur.acting_primary != -1 &&
4539 osdmap.is_down(cur.acting_primary)) {
4540 pg_stat_t *newstat;
4541 auto q = pending_inc->pg_stat_updates.find(pgid);
4542 if (q != pending_inc->pg_stat_updates.end()) {
4543 if ((q->second.acting_primary == cur.acting_primary) ||
4544 ((q->second.state & PG_STATE_STALE) == 0 &&
4545 q->second.acting_primary != -1 &&
4546 osdmap.is_down(q->second.acting_primary))) {
4547 newstat = &q->second;
4548 } else {
4549 // pending update is no longer down or already stale
4550 return;
4551 }
4552 } else {
4553 newstat = &pending_inc->pg_stat_updates[pgid];
4554 *newstat = cur;
4555 }
4556 dout(10) << __func__ << " marking pg " << pgid
4557 << " stale (acting_primary " << newstat->acting_primary
4558 << ")" << dendl;
4559 newstat->state |= PG_STATE_STALE;
4560 newstat->last_unstale = ceph_clock_now();
4561 }
4562 }
4563
4564 void PGMapUpdater::check_down_pgs(
4565 const OSDMap &osdmap,
4566 const PGMap &pg_map,
4567 bool check_all,
4568 const set<int>& need_check_down_pg_osds,
4569 PGMap::Incremental *pending_inc)
4570 {
4571 // if a large number of osds changed state, just iterate over the whole
4572 // pg map.
4573 if (need_check_down_pg_osds.size() > (unsigned)osdmap.get_num_osds() *
4574 g_conf->get_val<double>("mon_pg_check_down_all_threshold")) {
4575 check_all = true;
4576 }
4577
4578 if (check_all) {
4579 for (const auto& p : pg_map.pg_stat) {
4580 _try_mark_pg_stale(osdmap, p.first, p.second, pending_inc);
4581 }
4582 } else {
4583 for (auto osd : need_check_down_pg_osds) {
4584 if (osdmap.is_down(osd)) {
4585 auto p = pg_map.pg_by_osd.find(osd);
4586 if (p == pg_map.pg_by_osd.end()) {
4587 continue;
4588 }
4589 for (auto pgid : p->second) {
4590 const pg_stat_t &stat = pg_map.pg_stat.at(pgid);
4591 assert(stat.acting_primary == osd);
4592 _try_mark_pg_stale(osdmap, pgid, stat, pending_inc);
4593 }
4594 }
4595 }
4596 }
4597 }
4598
4599 int reweight::by_utilization(
4600 const OSDMap &osdmap,
4601 const PGMap &pgm,
4602 int oload,
4603 double max_changef,
4604 int max_osds,
4605 bool by_pg, const set<int64_t> *pools,
4606 bool no_increasing,
4607 mempool::osdmap::map<int32_t, uint32_t>* new_weights,
4608 std::stringstream *ss,
4609 std::string *out_str,
4610 Formatter *f)
4611 {
4612 if (oload <= 100) {
4613 *ss << "You must give a percentage higher than 100. "
4614 "The reweighting threshold will be calculated as <average-utilization> "
4615 "times <input-percentage>. For example, an argument of 200 would "
4616 "reweight OSDs which are twice as utilized as the average OSD.\n";
4617 return -EINVAL;
4618 }
4619
4620 vector<int> pgs_by_osd(osdmap.get_max_osd());
4621
4622 // Avoid putting a small number (or 0) in the denominator when calculating
4623 // average_util
4624 double average_util;
4625 if (by_pg) {
4626 // by pg mapping
4627 double weight_sum = 0.0; // sum up the crush weights
4628 unsigned num_pg_copies = 0;
4629 int num_osds = 0;
4630 for (const auto& pg : pgm.pg_stat) {
4631 if (pools && pools->count(pg.first.pool()) == 0)
4632 continue;
4633 for (const auto acting : pg.second.acting) {
4634 if (!osdmap.exists(acting)) {
4635 continue;
4636 }
4637 if (acting >= (int)pgs_by_osd.size())
4638 pgs_by_osd.resize(acting);
4639 if (pgs_by_osd[acting] == 0) {
4640 if (osdmap.crush->get_item_weightf(acting) <= 0) {
4641 //skip if we currently can not identify item
4642 continue;
4643 }
4644 weight_sum += osdmap.crush->get_item_weightf(acting);
4645 ++num_osds;
4646 }
4647 ++pgs_by_osd[acting];
4648 ++num_pg_copies;
4649 }
4650 }
4651
4652 if (!num_osds || (num_pg_copies / num_osds < g_conf->mon_reweight_min_pgs_per_osd)) {
4653 *ss << "Refusing to reweight: we only have " << num_pg_copies
4654 << " PGs across " << num_osds << " osds!\n";
4655 return -EDOM;
4656 }
4657
4658 average_util = (double)num_pg_copies / weight_sum;
4659 } else {
4660 // by osd utilization
4661 int num_osd = MAX(1, pgm.osd_stat.size());
4662 if ((uint64_t)pgm.osd_sum.kb * 1024 / num_osd
4663 < g_conf->mon_reweight_min_bytes_per_osd) {
4664 *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb
4665 << " kb across all osds!\n";
4666 return -EDOM;
4667 }
4668 if ((uint64_t)pgm.osd_sum.kb_used * 1024 / num_osd
4669 < g_conf->mon_reweight_min_bytes_per_osd) {
4670 *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb_used
4671 << " kb used across all osds!\n";
4672 return -EDOM;
4673 }
4674
4675 average_util = (double)pgm.osd_sum.kb_used / (double)pgm.osd_sum.kb;
4676 }
4677
4678 // adjust down only if we are above the threshold
4679 const double overload_util = average_util * (double)oload / 100.0;
4680
4681 // but aggressively adjust weights up whenever possible.
4682 const double underload_util = average_util;
4683
4684 const unsigned max_change = (unsigned)(max_changef * (double)0x10000);
4685
4686 ostringstream oss;
4687 if (f) {
4688 f->open_object_section("reweight_by_utilization");
4689 f->dump_int("overload_min", oload);
4690 f->dump_float("max_change", max_changef);
4691 f->dump_int("max_change_osds", max_osds);
4692 f->dump_float("average_utilization", average_util);
4693 f->dump_float("overload_utilization", overload_util);
4694 } else {
4695 oss << "oload " << oload << "\n";
4696 oss << "max_change " << max_changef << "\n";
4697 oss << "max_change_osds " << max_osds << "\n";
4698 oss.precision(4);
4699 oss << "average_utilization " << std::fixed << average_util << "\n";
4700 oss << "overload_utilization " << overload_util << "\n";
4701 }
4702 int num_changed = 0;
4703
4704 // precompute util for each OSD
4705 std::vector<std::pair<int, float> > util_by_osd;
4706 for (const auto& p : pgm.osd_stat) {
4707 std::pair<int, float> osd_util;
4708 osd_util.first = p.first;
4709 if (by_pg) {
4710 if (p.first >= (int)pgs_by_osd.size() ||
4711 pgs_by_osd[p.first] == 0) {
4712 // skip if this OSD does not contain any pg
4713 // belonging to the specified pool(s).
4714 continue;
4715 }
4716
4717 if (osdmap.crush->get_item_weightf(p.first) <= 0) {
4718 // skip if we are unable to locate item.
4719 continue;
4720 }
4721
4722 osd_util.second = pgs_by_osd[p.first] / osdmap.crush->get_item_weightf(p.first);
4723 } else {
4724 osd_util.second = (double)p.second.kb_used / (double)p.second.kb;
4725 }
4726 util_by_osd.push_back(osd_util);
4727 }
4728
4729 // sort by absolute deviation from the mean utilization,
4730 // in descending order.
4731 std::sort(util_by_osd.begin(), util_by_osd.end(),
4732 [average_util](std::pair<int, float> l, std::pair<int, float> r) {
4733 return abs(l.second - average_util) > abs(r.second - average_util);
4734 }
4735 );
4736
4737 if (f)
4738 f->open_array_section("reweights");
4739
4740 for (const auto& p : util_by_osd) {
4741 unsigned weight = osdmap.get_weight(p.first);
4742 if (weight == 0) {
4743 // skip if OSD is currently out
4744 continue;
4745 }
4746 float util = p.second;
4747
4748 if (util >= overload_util) {
4749 // Assign a lower weight to overloaded OSDs. The current weight
4750 // is a factor to take into account the original weights,
4751 // to represent e.g. differing storage capacities
4752 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
4753 if (weight > max_change)
4754 new_weight = MAX(new_weight, weight - max_change);
4755 new_weights->insert({p.first, new_weight});
4756 if (f) {
4757 f->open_object_section("osd");
4758 f->dump_int("osd", p.first);
4759 f->dump_float("weight", (float)weight / (float)0x10000);
4760 f->dump_float("new_weight", (float)new_weight / (float)0x10000);
4761 f->close_section();
4762 } else {
4763 oss << "osd." << p.first << " weight "
4764 << (float)weight / (float)0x10000 << " -> "
4765 << (float)new_weight / (float)0x10000 << "\n";
4766 }
4767 if (++num_changed >= max_osds)
4768 break;
4769 }
4770 if (!no_increasing && util <= underload_util) {
4771 // assign a higher weight.. if we can.
4772 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
4773 new_weight = MIN(new_weight, weight + max_change);
4774 if (new_weight > 0x10000)
4775 new_weight = 0x10000;
4776 if (new_weight > weight) {
4777 new_weights->insert({p.first, new_weight});
4778 oss << "osd." << p.first << " weight "
4779 << (float)weight / (float)0x10000 << " -> "
4780 << (float)new_weight / (float)0x10000 << "\n";
4781 if (++num_changed >= max_osds)
4782 break;
4783 }
4784 }
4785 }
4786 if (f) {
4787 f->close_section();
4788 }
4789
4790 OSDMap newmap;
4791 newmap.deepish_copy_from(osdmap);
4792 OSDMap::Incremental newinc;
4793 newinc.fsid = newmap.get_fsid();
4794 newinc.epoch = newmap.get_epoch() + 1;
4795 newinc.new_weight = *new_weights;
4796 newmap.apply_incremental(newinc);
4797
4798 osdmap.summarize_mapping_stats(&newmap, pools, out_str, f);
4799
4800 if (f) {
4801 f->close_section();
4802 } else {
4803 *out_str += "\n";
4804 *out_str += oss.str();
4805 }
4806 return num_changed;
4807 }