]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/PGMap.cc
c0277b7a518a235ce18162afcb856e59b934a0cd
[ceph.git] / ceph / src / mon / PGMap.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "PGMap.h"
5
6 #define dout_subsys ceph_subsys_mon
7 #include "common/debug.h"
8 #include "common/Formatter.h"
9 #include "include/ceph_features.h"
10 #include "include/stringify.h"
11
12 #include "osd/osd_types.h"
13 #include "osd/OSDMap.h"
14
15 #define dout_context g_ceph_context
16
17 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMapDigest, pgmap_digest, pgmap);
18 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap, pgmap, pgmap);
19 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap::Incremental, pgmap_inc, pgmap);
20
21
22 // ---------------------
23 // PGMapDigest
24
// Serialize this digest into bl inside the versioned envelope opened by
// ENCODE_START(1, 1, bl) and closed by ENCODE_FINISH(bl).
//
// The order of the ::encode calls is the on-wire field order;
// PGMapDigest::decode reads the fields back in exactly this sequence, so
// the two functions must be changed together.
//
// features is passed on only to the pool_stat_t-based members
// (pg_pool_sum, pg_sum, per_pool_sum_delta, pg_sum_delta); every other
// member is encoded without a feature mask.
25 void PGMapDigest::encode(bufferlist& bl, uint64_t features) const
26 {
27 // NOTE: see PGMap::encode_digest
28 ENCODE_START(1, 1, bl);
29 ::encode(num_pg, bl);
30 ::encode(num_pg_active, bl);
31 ::encode(num_pg_unknown, bl);
32 ::encode(num_osd, bl);
33 ::encode(pg_pool_sum, bl, features);
34 ::encode(pg_sum, bl, features);
35 ::encode(osd_sum, bl);
36 ::encode(num_pg_by_state, bl);
37 ::encode(num_pg_by_osd, bl);
38 ::encode(num_pg_by_pool, bl);
39 ::encode(osd_last_seq, bl);
40 ::encode(per_pool_sum_delta, bl, features);
41 ::encode(per_pool_sum_deltas_stamps, bl);
42 ::encode(pg_sum_delta, bl, features);
43 ::encode(stamp_delta, bl);
44 ::encode(avail_space_by_rule, bl);
45 ENCODE_FINISH(bl);
46 }
47
// Populate this digest from the iterator p, reading inside the versioned
// envelope handled by DECODE_START(1, p) / DECODE_FINISH(p).
//
// Fields are read in the same order PGMapDigest::encode writes them; the
// two functions must stay in lock-step.
48 void PGMapDigest::decode(bufferlist::iterator& p)
49 {
50 DECODE_START(1, p);
51 ::decode(num_pg, p);
52 ::decode(num_pg_active, p);
53 ::decode(num_pg_unknown, p);
54 ::decode(num_osd, p);
55 ::decode(pg_pool_sum, p);
56 ::decode(pg_sum, p);
57 ::decode(osd_sum, p);
58 ::decode(num_pg_by_state, p);
59 ::decode(num_pg_by_osd, p);
60 ::decode(num_pg_by_pool, p);
61 ::decode(osd_last_seq, p);
62 ::decode(per_pool_sum_delta, p);
63 ::decode(per_pool_sum_deltas_stamps, p);
64 ::decode(pg_sum_delta, p);
65 ::decode(stamp_delta, p);
66 ::decode(avail_space_by_rule, p);
67 DECODE_FINISH(p);
68 }
69
70 void PGMapDigest::dump(Formatter *f) const
71 {
72 f->dump_unsigned("num_pg", num_pg);
73 f->dump_unsigned("num_pg_active", num_pg_active);
74 f->dump_unsigned("num_pg_unknown", num_pg_unknown);
75 f->dump_unsigned("num_osd", num_osd);
76 f->dump_object("pool_sum", pg_sum);
77 f->dump_object("osd_sum", osd_sum);
78 f->open_array_section("pool_stats");
79 for (auto& p : pg_pool_sum) {
80 f->open_object_section("pool_stat");
81 f->dump_int("poolid", p.first);
82 auto q = num_pg_by_pool.find(p.first);
83 if (q != num_pg_by_pool.end())
84 f->dump_unsigned("num_pg", q->second);
85 p.second.dump(f);
86 f->close_section();
87 }
88 f->close_section();
89 f->open_array_section("osd_stats");
90 int i = 0;
91 // TODO: this isn't really correct since we can dump non-existent OSDs
92 // I dunno what osd_last_seq is set to in that case...
93 for (auto& p : osd_last_seq) {
94 f->open_object_section("osd_stat");
95 f->dump_int("osd", i);
96 f->dump_unsigned("seq", p);
97 f->close_section();
98 ++i;
99 }
100 f->close_section();
101 f->open_array_section("num_pg_by_state");
102 for (auto& p : num_pg_by_state) {
103 f->open_object_section("count");
104 f->dump_string("state", pg_state_string(p.first));
105 f->dump_unsigned("num", p.second);
106 f->close_section();
107 }
108 f->close_section();
109 f->open_array_section("num_pg_by_osd");
110 for (auto& p : num_pg_by_osd) {
111 f->open_object_section("count");
112 f->dump_unsigned("osd", p.first);
113 f->dump_unsigned("num_primary_pg", p.second.primary);
114 f->dump_unsigned("num_acting_pg", p.second.acting);
115 f->dump_unsigned("num_up_pg", p.second.up);
116 f->close_section();
117 }
118 f->close_section();
119 }
120
121 void PGMapDigest::generate_test_instances(list<PGMapDigest*>& ls)
122 {
123 ls.push_back(new PGMapDigest);
124 }
125
// Format a percentage for table output.
//
// Values below 0.01 (negative ones included) collapse to the bare string
// "0"; everything else is printed in fixed notation with two decimals.
// The threshold is compared as a double literal on purpose, matching the
// historical behaviour for inputs right at the boundary.
inline std::string percentify(const float& a) {
  if (a < 0.01)
    return "0";

  std::ostringstream text;
  text << std::fixed << std::setprecision(2) << a;
  return text.str();
}
134
135 void PGMapDigest::print_summary(Formatter *f, ostream *out) const
136 {
137 if (f)
138 f->open_array_section("pgs_by_state");
139
140 // list is descending numeric order (by count)
141 multimap<int,int> state_by_count; // count -> state
142 for (auto p = num_pg_by_state.begin();
143 p != num_pg_by_state.end();
144 ++p) {
145 state_by_count.insert(make_pair(p->second, p->first));
146 }
147 if (f) {
148 for (auto p = state_by_count.rbegin();
149 p != state_by_count.rend();
150 ++p)
151 {
152 f->open_object_section("pgs_by_state_element");
153 f->dump_string("state_name", pg_state_string(p->second));
154 f->dump_unsigned("count", p->first);
155 f->close_section();
156 }
157 }
158 if (f)
159 f->close_section();
160
161 if (f) {
162 f->dump_unsigned("num_pgs", num_pg);
163 f->dump_unsigned("num_pools", pg_pool_sum.size());
164 f->dump_unsigned("num_objects", pg_sum.stats.sum.num_objects);
165 f->dump_unsigned("data_bytes", pg_sum.stats.sum.num_bytes);
166 f->dump_unsigned("bytes_used", osd_sum.kb_used * 1024ull);
167 f->dump_unsigned("bytes_avail", osd_sum.kb_avail * 1024ull);
168 f->dump_unsigned("bytes_total", osd_sum.kb * 1024ull);
169 } else {
170 *out << " pools: " << pg_pool_sum.size() << " pools, "
171 << num_pg << " pgs\n";
172 *out << " objects: " << si_t(pg_sum.stats.sum.num_objects) << " objects, "
173 << prettybyte_t(pg_sum.stats.sum.num_bytes) << "\n";
174 *out << " usage: "
175 << kb_t(osd_sum.kb_used) << " used, "
176 << kb_t(osd_sum.kb_avail) << " / "
177 << kb_t(osd_sum.kb) << " avail\n";
178 *out << " pgs: ";
179 }
180
181 bool pad = false;
182
183 if (num_pg_unknown > 0) {
184 float p = (float)num_pg_unknown / (float)num_pg;
185 if (f) {
186 f->dump_float("unknown_pgs_ratio", p);
187 } else {
188 char b[20];
189 snprintf(b, sizeof(b), "%.3lf", p * 100.0);
190 *out << b << "% pgs unknown\n";
191 pad = true;
192 }
193 }
194
195 int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
196 if (num_pg_inactive > 0) {
197 float p = (float)num_pg_inactive / (float)num_pg;
198 if (f) {
199 f->dump_float("inactive_pgs_ratio", p);
200 } else {
201 if (pad) {
202 *out << " ";
203 }
204 char b[20];
205 snprintf(b, sizeof(b), "%.3f", p * 100.0);
206 *out << b << "% pgs not active\n";
207 pad = true;
208 }
209 }
210
211 list<string> sl;
212 overall_recovery_summary(f, &sl);
213 if (!f && !sl.empty()) {
214 for (auto p = sl.begin(); p != sl.end(); ++p) {
215 if (pad) {
216 *out << " ";
217 }
218 *out << *p << "\n";
219 pad = true;
220 }
221 }
222 sl.clear();
223
224 if (!f) {
225 unsigned max_width = 1;
226 for (multimap<int,int>::reverse_iterator p = state_by_count.rbegin();
227 p != state_by_count.rend();
228 ++p)
229 {
230 std::stringstream ss;
231 ss << p->first;
232 max_width = MAX(ss.str().size(), max_width);
233 }
234
235 for (multimap<int,int>::reverse_iterator p = state_by_count.rbegin();
236 p != state_by_count.rend();
237 ++p)
238 {
239 if (pad) {
240 *out << " ";
241 }
242 pad = true;
243 out->setf(std::ios::left);
244 *out << std::setw(max_width) << p->first
245 << " " << pg_state_string(p->second) << "\n";
246 out->unsetf(std::ios::left);
247 }
248 }
249
250 ostringstream ss_rec_io;
251 overall_recovery_rate_summary(f, &ss_rec_io);
252 ostringstream ss_client_io;
253 overall_client_io_rate_summary(f, &ss_client_io);
254 ostringstream ss_cache_io;
255 overall_cache_io_rate_summary(f, &ss_cache_io);
256
257 if (!f && (ss_client_io.str().length() || ss_rec_io.str().length()
258 || ss_cache_io.str().length())) {
259 *out << "\n \n";
260 *out << " io:\n";
261 }
262
263 if (!f && ss_client_io.str().length())
264 *out << " client: " << ss_client_io.str() << "\n";
265 if (!f && ss_rec_io.str().length())
266 *out << " recovery: " << ss_rec_io.str() << "\n";
267 if (!f && ss_cache_io.str().length())
268 *out << " cache: " << ss_cache_io.str() << "\n";
269 }
270
271 void PGMapDigest::print_oneline_summary(Formatter *f, ostream *out) const
272 {
273 std::stringstream ss;
274
275 if (f)
276 f->open_array_section("num_pg_by_state");
277 for (auto p = num_pg_by_state.begin();
278 p != num_pg_by_state.end();
279 ++p) {
280 if (f) {
281 f->open_object_section("state");
282 f->dump_string("name", pg_state_string(p->first));
283 f->dump_unsigned("num", p->second);
284 f->close_section();
285 }
286 if (p != num_pg_by_state.begin())
287 ss << ", ";
288 ss << p->second << " " << pg_state_string(p->first);
289 }
290 if (f)
291 f->close_section();
292
293 string states = ss.str();
294 if (out)
295 *out << num_pg << " pgs: "
296 << states << "; "
297 << prettybyte_t(pg_sum.stats.sum.num_bytes) << " data, "
298 << kb_t(osd_sum.kb_used) << " used, "
299 << kb_t(osd_sum.kb_avail) << " / "
300 << kb_t(osd_sum.kb) << " avail";
301 if (f) {
302 f->dump_unsigned("num_pgs", num_pg);
303 f->dump_unsigned("num_bytes", pg_sum.stats.sum.num_bytes);
304 f->dump_unsigned("raw_bytes_used", osd_sum.kb_used << 10);
305 f->dump_unsigned("raw_bytes_avail", osd_sum.kb_avail << 10);
306 f->dump_unsigned("raw_bytes", osd_sum.kb << 10);
307 }
308
309 // make non-negative; we can get negative values if osds send
310 // uncommitted stats and then "go backward" or if they are just
311 // buggy/wrong.
312 pool_stat_t pos_delta = pg_sum_delta;
313 pos_delta.floor(0);
314 if (pos_delta.stats.sum.num_rd ||
315 pos_delta.stats.sum.num_wr) {
316 if (out)
317 *out << "; ";
318 if (pos_delta.stats.sum.num_rd) {
319 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)stamp_delta;
320 if (out)
321 *out << pretty_si_t(rd) << "B/s rd, ";
322 if (f)
323 f->dump_unsigned("read_bytes_sec", rd);
324 }
325 if (pos_delta.stats.sum.num_wr) {
326 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)stamp_delta;
327 if (out)
328 *out << pretty_si_t(wr) << "B/s wr, ";
329 if (f)
330 f->dump_unsigned("write_bytes_sec", wr);
331 }
332 int64_t iops = (pos_delta.stats.sum.num_rd + pos_delta.stats.sum.num_wr) / (double)stamp_delta;
333 if (out)
334 *out << pretty_si_t(iops) << "op/s";
335 if (f)
336 f->dump_unsigned("io_sec", iops);
337 }
338
339 list<string> sl;
340 overall_recovery_summary(f, &sl);
341 if (out)
342 for (auto p = sl.begin(); p != sl.end(); ++p)
343 *out << "; " << *p;
344 std::stringstream ssr;
345 overall_recovery_rate_summary(f, &ssr);
346 if (out && ssr.str().length())
347 *out << "; " << ssr.str() << " recovering";
348 }
349
350 void PGMapDigest::recovery_summary(Formatter *f, list<string> *psl,
351 const pool_stat_t& delta_sum) const
352 {
353 if (delta_sum.stats.sum.num_objects_degraded && delta_sum.stats.sum.num_object_copies > 0) {
354 double pc = (double)delta_sum.stats.sum.num_objects_degraded /
355 (double)delta_sum.stats.sum.num_object_copies * (double)100.0;
356 char b[20];
357 snprintf(b, sizeof(b), "%.3lf", pc);
358 if (f) {
359 f->dump_unsigned("degraded_objects", delta_sum.stats.sum.num_objects_degraded);
360 f->dump_unsigned("degraded_total", delta_sum.stats.sum.num_object_copies);
361 f->dump_float("degraded_ratio", pc / 100.0);
362 } else {
363 ostringstream ss;
364 ss << delta_sum.stats.sum.num_objects_degraded
365 << "/" << delta_sum.stats.sum.num_object_copies << " objects degraded (" << b << "%)";
366 psl->push_back(ss.str());
367 }
368 }
369 if (delta_sum.stats.sum.num_objects_misplaced && delta_sum.stats.sum.num_object_copies > 0) {
370 double pc = (double)delta_sum.stats.sum.num_objects_misplaced /
371 (double)delta_sum.stats.sum.num_object_copies * (double)100.0;
372 char b[20];
373 snprintf(b, sizeof(b), "%.3lf", pc);
374 if (f) {
375 f->dump_unsigned("misplaced_objects", delta_sum.stats.sum.num_objects_misplaced);
376 f->dump_unsigned("misplaced_total", delta_sum.stats.sum.num_object_copies);
377 f->dump_float("misplaced_ratio", pc / 100.0);
378 } else {
379 ostringstream ss;
380 ss << delta_sum.stats.sum.num_objects_misplaced
381 << "/" << delta_sum.stats.sum.num_object_copies << " objects misplaced (" << b << "%)";
382 psl->push_back(ss.str());
383 }
384 }
385 if (delta_sum.stats.sum.num_objects_unfound && delta_sum.stats.sum.num_objects) {
386 double pc = (double)delta_sum.stats.sum.num_objects_unfound /
387 (double)delta_sum.stats.sum.num_objects * (double)100.0;
388 char b[20];
389 snprintf(b, sizeof(b), "%.3lf", pc);
390 if (f) {
391 f->dump_unsigned("unfound_objects", delta_sum.stats.sum.num_objects_unfound);
392 f->dump_unsigned("unfound_total", delta_sum.stats.sum.num_objects);
393 f->dump_float("unfound_ratio", pc / 100.0);
394 } else {
395 ostringstream ss;
396 ss << delta_sum.stats.sum.num_objects_unfound
397 << "/" << delta_sum.stats.sum.num_objects << " unfound (" << b << "%)";
398 psl->push_back(ss.str());
399 }
400 }
401 }
402
403 void PGMapDigest::recovery_rate_summary(Formatter *f, ostream *out,
404 const pool_stat_t& delta_sum,
405 utime_t delta_stamp) const
406 {
407 // make non-negative; we can get negative values if osds send
408 // uncommitted stats and then "go backward" or if they are just
409 // buggy/wrong.
410 pool_stat_t pos_delta = delta_sum;
411 pos_delta.floor(0);
412 if (pos_delta.stats.sum.num_objects_recovered ||
413 pos_delta.stats.sum.num_bytes_recovered ||
414 pos_delta.stats.sum.num_keys_recovered) {
415 int64_t objps = pos_delta.stats.sum.num_objects_recovered / (double)delta_stamp;
416 int64_t bps = pos_delta.stats.sum.num_bytes_recovered / (double)delta_stamp;
417 int64_t kps = pos_delta.stats.sum.num_keys_recovered / (double)delta_stamp;
418 if (f) {
419 f->dump_int("recovering_objects_per_sec", objps);
420 f->dump_int("recovering_bytes_per_sec", bps);
421 f->dump_int("recovering_keys_per_sec", kps);
422 f->dump_int("num_objects_recovered", pos_delta.stats.sum.num_objects_recovered);
423 f->dump_int("num_bytes_recovered", pos_delta.stats.sum.num_bytes_recovered);
424 f->dump_int("num_keys_recovered", pos_delta.stats.sum.num_keys_recovered);
425 } else {
426 *out << pretty_si_t(bps) << "B/s";
427 if (pos_delta.stats.sum.num_keys_recovered)
428 *out << ", " << pretty_si_t(kps) << "keys/s";
429 *out << ", " << pretty_si_t(objps) << "objects/s";
430 }
431 }
432 }
433
434 void PGMapDigest::overall_recovery_rate_summary(Formatter *f, ostream *out) const
435 {
436 recovery_rate_summary(f, out, pg_sum_delta, stamp_delta);
437 }
438
439 void PGMapDigest::overall_recovery_summary(Formatter *f, list<string> *psl) const
440 {
441 recovery_summary(f, psl, pg_sum);
442 }
443
444 void PGMapDigest::pool_recovery_rate_summary(Formatter *f, ostream *out,
445 uint64_t poolid) const
446 {
447 auto p = per_pool_sum_delta.find(poolid);
448 if (p == per_pool_sum_delta.end())
449 return;
450
451 auto ts = per_pool_sum_deltas_stamps.find(p->first);
452 assert(ts != per_pool_sum_deltas_stamps.end());
453 recovery_rate_summary(f, out, p->second.first, ts->second);
454 }
455
456 void PGMapDigest::pool_recovery_summary(Formatter *f, list<string> *psl,
457 uint64_t poolid) const
458 {
459 auto p = per_pool_sum_delta.find(poolid);
460 if (p == per_pool_sum_delta.end())
461 return;
462
463 recovery_summary(f, psl, p->second.first);
464 }
465
466 void PGMapDigest::client_io_rate_summary(Formatter *f, ostream *out,
467 const pool_stat_t& delta_sum,
468 utime_t delta_stamp) const
469 {
470 pool_stat_t pos_delta = delta_sum;
471 pos_delta.floor(0);
472 if (pos_delta.stats.sum.num_rd ||
473 pos_delta.stats.sum.num_wr) {
474 if (pos_delta.stats.sum.num_rd) {
475 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)delta_stamp;
476 if (f) {
477 f->dump_int("read_bytes_sec", rd);
478 } else {
479 *out << pretty_si_t(rd) << "B/s rd, ";
480 }
481 }
482 if (pos_delta.stats.sum.num_wr) {
483 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)delta_stamp;
484 if (f) {
485 f->dump_int("write_bytes_sec", wr);
486 } else {
487 *out << pretty_si_t(wr) << "B/s wr, ";
488 }
489 }
490 int64_t iops_rd = pos_delta.stats.sum.num_rd / (double)delta_stamp;
491 int64_t iops_wr = pos_delta.stats.sum.num_wr / (double)delta_stamp;
492 if (f) {
493 f->dump_int("read_op_per_sec", iops_rd);
494 f->dump_int("write_op_per_sec", iops_wr);
495 } else {
496 *out << pretty_si_t(iops_rd) << "op/s rd, " << pretty_si_t(iops_wr) << "op/s wr";
497 }
498 }
499 }
500
501 void PGMapDigest::overall_client_io_rate_summary(Formatter *f, ostream *out) const
502 {
503 client_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
504 }
505
506 void PGMapDigest::pool_client_io_rate_summary(Formatter *f, ostream *out,
507 uint64_t poolid) const
508 {
509 auto p = per_pool_sum_delta.find(poolid);
510 if (p == per_pool_sum_delta.end())
511 return;
512
513 auto ts = per_pool_sum_deltas_stamps.find(p->first);
514 assert(ts != per_pool_sum_deltas_stamps.end());
515 client_io_rate_summary(f, out, p->second.first, ts->second);
516 }
517
518 void PGMapDigest::cache_io_rate_summary(Formatter *f, ostream *out,
519 const pool_stat_t& delta_sum,
520 utime_t delta_stamp) const
521 {
522 pool_stat_t pos_delta = delta_sum;
523 pos_delta.floor(0);
524 bool have_output = false;
525
526 if (pos_delta.stats.sum.num_flush) {
527 int64_t flush = (pos_delta.stats.sum.num_flush_kb << 10) / (double)delta_stamp;
528 if (f) {
529 f->dump_int("flush_bytes_sec", flush);
530 } else {
531 *out << pretty_si_t(flush) << "B/s flush";
532 have_output = true;
533 }
534 }
535 if (pos_delta.stats.sum.num_evict) {
536 int64_t evict = (pos_delta.stats.sum.num_evict_kb << 10) / (double)delta_stamp;
537 if (f) {
538 f->dump_int("evict_bytes_sec", evict);
539 } else {
540 if (have_output)
541 *out << ", ";
542 *out << pretty_si_t(evict) << "B/s evict";
543 have_output = true;
544 }
545 }
546 if (pos_delta.stats.sum.num_promote) {
547 int64_t promote = pos_delta.stats.sum.num_promote / (double)delta_stamp;
548 if (f) {
549 f->dump_int("promote_op_per_sec", promote);
550 } else {
551 if (have_output)
552 *out << ", ";
553 *out << pretty_si_t(promote) << "op/s promote";
554 have_output = true;
555 }
556 }
557 if (pos_delta.stats.sum.num_flush_mode_low) {
558 if (f) {
559 f->dump_int("num_flush_mode_low", pos_delta.stats.sum.num_flush_mode_low);
560 } else {
561 if (have_output)
562 *out << ", ";
563 *out << pretty_si_t(pos_delta.stats.sum.num_flush_mode_low) << "PG(s) flushing";
564 have_output = true;
565 }
566 }
567 if (pos_delta.stats.sum.num_flush_mode_high) {
568 if (f) {
569 f->dump_int("num_flush_mode_high", pos_delta.stats.sum.num_flush_mode_high);
570 } else {
571 if (have_output)
572 *out << ", ";
573 *out << pretty_si_t(pos_delta.stats.sum.num_flush_mode_high) << "PG(s) flushing (high)";
574 have_output = true;
575 }
576 }
577 if (pos_delta.stats.sum.num_evict_mode_some) {
578 if (f) {
579 f->dump_int("num_evict_mode_some", pos_delta.stats.sum.num_evict_mode_some);
580 } else {
581 if (have_output)
582 *out << ", ";
583 *out << pretty_si_t(pos_delta.stats.sum.num_evict_mode_some) << "PG(s) evicting";
584 have_output = true;
585 }
586 }
587 if (pos_delta.stats.sum.num_evict_mode_full) {
588 if (f) {
589 f->dump_int("num_evict_mode_full", pos_delta.stats.sum.num_evict_mode_full);
590 } else {
591 if (have_output)
592 *out << ", ";
593 *out << pretty_si_t(pos_delta.stats.sum.num_evict_mode_full) << "PG(s) evicting (full)";
594 }
595 }
596 }
597
598 void PGMapDigest::overall_cache_io_rate_summary(Formatter *f, ostream *out) const
599 {
600 cache_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
601 }
602
603 void PGMapDigest::pool_cache_io_rate_summary(Formatter *f, ostream *out,
604 uint64_t poolid) const
605 {
606 auto p = per_pool_sum_delta.find(poolid);
607 if (p == per_pool_sum_delta.end())
608 return;
609
610 auto ts = per_pool_sum_deltas_stamps.find(p->first);
611 assert(ts != per_pool_sum_deltas_stamps.end());
612 cache_io_rate_summary(f, out, p->second.first, ts->second);
613 }
614
615 void PGMapDigest::dump_pool_stats_full(
616 const OSDMap &osd_map,
617 stringstream *ss,
618 Formatter *f,
619 bool verbose) const
620 {
621 TextTable tbl;
622
623 if (f) {
624 f->open_array_section("pools");
625 } else {
626 tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
627 tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT);
628 if (verbose) {
629 tbl.define_column("QUOTA OBJECTS", TextTable::LEFT, TextTable::LEFT);
630 tbl.define_column("QUOTA BYTES", TextTable::LEFT, TextTable::LEFT);
631 }
632
633 tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
634 tbl.define_column("%USED", TextTable::LEFT, TextTable::RIGHT);
635 tbl.define_column("MAX AVAIL", TextTable::LEFT, TextTable::RIGHT);
636 tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
637 if (verbose) {
638 tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT);
639 tbl.define_column("READ", TextTable::LEFT, TextTable::RIGHT);
640 tbl.define_column("WRITE", TextTable::LEFT, TextTable::RIGHT);
641 tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
642 }
643 }
644
645 map<int,uint64_t> avail_by_rule;
646 for (auto p = osd_map.get_pools().begin();
647 p != osd_map.get_pools().end(); ++p) {
648 int64_t pool_id = p->first;
649 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
650 continue;
651 const string& pool_name = osd_map.get_pool_name(pool_id);
652 const pool_stat_t &stat = pg_pool_sum.at(pool_id);
653
654 const pg_pool_t *pool = osd_map.get_pg_pool(pool_id);
655 int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
656 pool->get_type(),
657 pool->get_size());
658 int64_t avail;
659 float raw_used_rate;
660 if (avail_by_rule.count(ruleno) == 0) {
661 // FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked
662 avail = get_rule_avail(ruleno);
663 if (avail < 0)
664 avail = 0;
665 avail_by_rule[ruleno] = avail;
666 } else {
667 avail = avail_by_rule[ruleno];
668 }
669 switch (pool->get_type()) {
670 case pg_pool_t::TYPE_REPLICATED:
671 avail /= pool->get_size();
672 raw_used_rate = pool->get_size();
673 break;
674 case pg_pool_t::TYPE_ERASURE:
675 {
676 auto& ecp =
677 osd_map.get_erasure_code_profile(pool->erasure_code_profile);
678 auto pm = ecp.find("m");
679 auto pk = ecp.find("k");
680 if (pm != ecp.end() && pk != ecp.end()) {
681 int k = atoi(pk->second.c_str());
682 int m = atoi(pm->second.c_str());
683 int mk = m + k;
684 assert(mk != 0);
685 avail = avail * k / mk;
686 raw_used_rate = (float)mk / k;
687 } else {
688 raw_used_rate = 0.0;
689 }
690 }
691 break;
692 default:
693 assert(0 == "unrecognized pool type");
694 }
695
696 if (f) {
697 f->open_object_section("pool");
698 f->dump_string("name", pool_name);
699 f->dump_int("id", pool_id);
700 f->open_object_section("stats");
701 } else {
702 tbl << pool_name
703 << pool_id;
704 if (verbose) {
705 if (pool->quota_max_objects == 0)
706 tbl << "N/A";
707 else
708 tbl << si_t(pool->quota_max_objects);
709
710 if (pool->quota_max_bytes == 0)
711 tbl << "N/A";
712 else
713 tbl << si_t(pool->quota_max_bytes);
714 }
715
716 }
717 dump_object_stat_sum(tbl, f, stat.stats.sum, avail, raw_used_rate, verbose, pool);
718 if (f)
719 f->close_section(); // stats
720 else
721 tbl << TextTable::endrow;
722
723 if (f)
724 f->close_section(); // pool
725 }
726 if (f)
727 f->close_section();
728 else {
729 assert(ss != nullptr);
730 *ss << "POOLS:\n";
731 tbl.set_indent(4);
732 *ss << tbl;
733 }
734 }
735
736 void PGMapDigest::dump_fs_stats(stringstream *ss, Formatter *f, bool verbose) const
737 {
738 if (f) {
739 f->open_object_section("stats");
740 f->dump_int("total_bytes", osd_sum.kb * 1024ull);
741 f->dump_int("total_used_bytes", osd_sum.kb_used * 1024ull);
742 f->dump_int("total_avail_bytes", osd_sum.kb_avail * 1024ull);
743 if (verbose) {
744 f->dump_int("total_objects", pg_sum.stats.sum.num_objects);
745 }
746 f->close_section();
747 } else {
748 assert(ss != nullptr);
749 TextTable tbl;
750 tbl.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
751 tbl.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
752 tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
753 tbl.define_column("%RAW USED", TextTable::LEFT, TextTable::RIGHT);
754 if (verbose) {
755 tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
756 }
757 tbl << stringify(si_t(osd_sum.kb*1024))
758 << stringify(si_t(osd_sum.kb_avail*1024))
759 << stringify(si_t(osd_sum.kb_used*1024));
760 float used = 0.0;
761 if (osd_sum.kb > 0) {
762 used = ((float)osd_sum.kb_used / osd_sum.kb);
763 }
764 tbl << percentify(used*100);
765 if (verbose) {
766 tbl << stringify(si_t(pg_sum.stats.sum.num_objects));
767 }
768 tbl << TextTable::endrow;
769
770 *ss << "GLOBAL:\n";
771 tbl.set_indent(4);
772 *ss << tbl;
773 }
774 }
775
776 void PGMapDigest::dump_object_stat_sum(
777 TextTable &tbl, Formatter *f,
778 const object_stat_sum_t &sum, uint64_t avail,
779 float raw_used_rate, bool verbose,
780 const pg_pool_t *pool)
781 {
782 float curr_object_copies_rate = 0.0;
783 if (sum.num_object_copies > 0)
784 curr_object_copies_rate = (float)(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies;
785
786 float used = 0.0;
787 if (avail) {
788 used = sum.num_bytes * curr_object_copies_rate;
789 used /= used + avail;
790 } else if (sum.num_bytes) {
791 used = 1.0;
792 }
793
794 if (f) {
795 f->dump_int("kb_used", SHIFT_ROUND_UP(sum.num_bytes, 10));
796 f->dump_int("bytes_used", sum.num_bytes);
797 f->dump_format_unquoted("percent_used", "%.2f", (used*100));
798 f->dump_unsigned("max_avail", avail);
799 f->dump_int("objects", sum.num_objects);
800 if (verbose) {
801 f->dump_int("quota_objects", pool->quota_max_objects);
802 f->dump_int("quota_bytes", pool->quota_max_bytes);
803 f->dump_int("dirty", sum.num_objects_dirty);
804 f->dump_int("rd", sum.num_rd);
805 f->dump_int("rd_bytes", sum.num_rd_kb * 1024ull);
806 f->dump_int("wr", sum.num_wr);
807 f->dump_int("wr_bytes", sum.num_wr_kb * 1024ull);
808 f->dump_int("raw_bytes_used", sum.num_bytes * raw_used_rate * curr_object_copies_rate);
809 }
810 } else {
811 tbl << stringify(si_t(sum.num_bytes));
812 tbl << percentify(used*100);
813 tbl << si_t(avail);
814 tbl << sum.num_objects;
815 if (verbose) {
816 tbl << stringify(si_t(sum.num_objects_dirty))
817 << stringify(si_t(sum.num_rd))
818 << stringify(si_t(sum.num_wr))
819 << stringify(si_t(sum.num_bytes * raw_used_rate * curr_object_copies_rate));
820 }
821 }
822 }
823
824 int64_t PGMap::get_rule_avail(const OSDMap& osdmap, int ruleno) const
825 {
826 map<int,float> wm;
827 int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm);
828 if (r < 0) {
829 return r;
830 }
831 if (wm.empty()) {
832 return 0;
833 }
834
835 float fratio;
836 if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
837 osdmap.get_full_ratio() > 0) {
838 fratio = osdmap.get_full_ratio();
839 } else {
840 fratio = get_fallback_full_ratio();
841 }
842
843 int64_t min = -1;
844 for (auto p = wm.begin(); p != wm.end(); ++p) {
845 auto osd_info = osd_stat.find(p->first);
846 if (osd_info != osd_stat.end()) {
847 if (osd_info->second.kb == 0 || p->second == 0) {
848 // osd must be out, hence its stats have been zeroed
849 // (unless we somehow managed to have a disk with size 0...)
850 //
851 // (p->second == 0), if osd weight is 0, no need to
852 // calculate proj below.
853 continue;
854 }
855 double unusable = (double)osd_info->second.kb *
856 (1.0 - fratio);
857 double avail = MAX(0.0, (double)osd_info->second.kb_avail - unusable);
858 avail *= 1024.0;
859 int64_t proj = (int64_t)(avail / (double)p->second);
860 if (min < 0 || proj < min) {
861 min = proj;
862 }
863 } else {
864 dout(0) << "Cannot get stat of OSD " << p->first << dendl;
865 }
866 }
867 return min;
868 }
869
870 void PGMap::get_rules_avail(const OSDMap& osdmap,
871 std::map<int,int64_t> *avail_map) const
872 {
873 avail_map->clear();
874 for (auto p : osdmap.get_pools()) {
875 int64_t pool_id = p.first;
876 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
877 continue;
878 const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
879 int ruleno = osdmap.crush->find_rule(pool->get_crush_rule(),
880 pool->get_type(),
881 pool->get_size());
882 if (avail_map->count(ruleno) == 0)
883 (*avail_map)[ruleno] = get_rule_avail(osdmap, ruleno);
884 }
885 }
886
887 // ---------------------
888 // PGMap
889
// Serialize this incremental into bl, choosing the layout from features.
//
// Without CEPH_FEATURE_MONENC the legacy layout is written: a bare
// version byte of 4 followed by the fields, with no stamp and no
// osd_epochs.  With the feature bit set, the fields go inside the
// versioned envelope opened by ENCODE_START(7, 5, bl), which additionally
// carries stamp and osd_epochs.
//
// The order of the ::encode calls is the on-wire field order in both
// layouts and has to stay in step with PGMap::Incremental::decode.
890 void PGMap::Incremental::encode(bufferlist &bl, uint64_t features) const
891 {
892 if ((features & CEPH_FEATURE_MONENC) == 0) {
893 __u8 v = 4;
894 ::encode(v, bl);
895 ::encode(version, bl);
896 ::encode(pg_stat_updates, bl);
897 ::encode(osd_stat_updates, bl);
898 ::encode(osd_stat_rm, bl);
899 ::encode(osdmap_epoch, bl);
900 ::encode(pg_scan, bl);
901 ::encode(full_ratio, bl);
902 ::encode(nearfull_ratio, bl);
903 ::encode(pg_remove, bl);
904 return;
905 }
906
907 ENCODE_START(7, 5, bl);
908 ::encode(version, bl);
909 ::encode(pg_stat_updates, bl);
910 ::encode(osd_stat_updates, bl);
911 ::encode(osd_stat_rm, bl);
912 ::encode(osdmap_epoch, bl);
913 ::encode(pg_scan, bl);
914 ::encode(full_ratio, bl);
915 ::encode(nearfull_ratio, bl);
916 ::encode(pg_remove, bl);
917 ::encode(stamp, bl);
918 ::encode(osd_epochs, bl);
919 ENCODE_FINISH(bl);
920 }
921
922 void PGMap::Incremental::decode(bufferlist::iterator &bl)
923 {
924 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
925 ::decode(version, bl);
926 if (struct_v < 3) {
927 pg_stat_updates.clear();
928 __u32 n;
929 ::decode(n, bl);
930 while (n--) {
931 old_pg_t opgid;
932 ::decode(opgid, bl);
933 pg_t pgid = opgid;
934 ::decode(pg_stat_updates[pgid], bl);
935 }
936 } else {
937 ::decode(pg_stat_updates, bl);
938 }
939 ::decode(osd_stat_updates, bl);
940 ::decode(osd_stat_rm, bl);
941 ::decode(osdmap_epoch, bl);
942 ::decode(pg_scan, bl);
943 if (struct_v >= 2) {
944 ::decode(full_ratio, bl);
945 ::decode(nearfull_ratio, bl);
946 }
947 if (struct_v < 3) {
948 pg_remove.clear();
949 __u32 n;
950 ::decode(n, bl);
951 while (n--) {
952 old_pg_t opgid;
953 ::decode(opgid, bl);
954 pg_remove.insert(pg_t(opgid));
955 }
956 } else {
957 ::decode(pg_remove, bl);
958 }
959 if (struct_v < 4 && full_ratio == 0) {
960 full_ratio = -1;
961 }
962 if (struct_v < 4 && nearfull_ratio == 0) {
963 nearfull_ratio = -1;
964 }
965 if (struct_v >= 6)
966 ::decode(stamp, bl);
967 if (struct_v >= 7) {
968 ::decode(osd_epochs, bl);
969 } else {
970 for (auto i = osd_stat_updates.begin();
971 i != osd_stat_updates.end();
972 ++i) {
973 // This isn't accurate, but will cause trimming to behave like
974 // previously.
975 osd_epochs.insert(make_pair(i->first, osdmap_epoch));
976 }
977 }
978 DECODE_FINISH(bl);
979 }
980
981 void PGMap::Incremental::dump(Formatter *f) const
982 {
983 f->dump_unsigned("version", version);
984 f->dump_stream("stamp") << stamp;
985 f->dump_unsigned("osdmap_epoch", osdmap_epoch);
986 f->dump_unsigned("pg_scan_epoch", pg_scan);
987 f->dump_float("full_ratio", full_ratio);
988 f->dump_float("nearfull_ratio", nearfull_ratio);
989
990 f->open_array_section("pg_stat_updates");
991 for (auto p = pg_stat_updates.begin(); p != pg_stat_updates.end(); ++p) {
992 f->open_object_section("pg_stat");
993 f->dump_stream("pgid") << p->first;
994 p->second.dump(f);
995 f->close_section();
996 }
997 f->close_section();
998
999 f->open_array_section("osd_stat_updates");
1000 for (auto p = osd_stat_updates.begin(); p != osd_stat_updates.end(); ++p) {
1001 f->open_object_section("osd_stat");
1002 f->dump_int("osd", p->first);
1003 p->second.dump(f);
1004 f->close_section();
1005 }
1006 f->close_section();
1007
1008 f->open_array_section("osd_stat_removals");
1009 for (auto p = osd_stat_rm.begin(); p != osd_stat_rm.end(); ++p)
1010 f->dump_int("osd", *p);
1011 f->close_section();
1012
1013 f->open_array_section("pg_removals");
1014 for (auto p = pg_remove.begin(); p != pg_remove.end(); ++p)
1015 f->dump_stream("pgid") << *p;
1016 f->close_section();
1017 }
1018
1019 void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
1020 {
1021 o.push_back(new Incremental);
1022 o.push_back(new Incremental);
1023 o.back()->version = 1;
1024 o.back()->stamp = utime_t(123,345);
1025 o.push_back(new Incremental);
1026 o.back()->version = 2;
1027 o.back()->pg_stat_updates[pg_t(1,2,3)] = pg_stat_t();
1028 o.back()->osd_stat_updates[5] = osd_stat_t();
1029 o.back()->osd_epochs[5] = 12;
1030 o.push_back(new Incremental);
1031 o.back()->version = 3;
1032 o.back()->osdmap_epoch = 1;
1033 o.back()->pg_scan = 2;
1034 o.back()->full_ratio = .2;
1035 o.back()->nearfull_ratio = .3;
1036 o.back()->pg_stat_updates[pg_t(4,5,6)] = pg_stat_t();
1037 o.back()->osd_stat_updates[6] = osd_stat_t();
1038 o.back()->osd_epochs[6] = 12;
1039 o.back()->pg_remove.insert(pg_t(1,2,3));
1040 o.back()->osd_stat_rm.insert(5);
1041 }
1042
1043
1044 // --
1045
1046 void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
1047 {
1048 assert(inc.version == version+1);
1049 version++;
1050
1051 utime_t delta_t;
1052 delta_t = inc.stamp;
1053 delta_t -= stamp;
1054 stamp = inc.stamp;
1055
1056 pool_stat_t pg_sum_old = pg_sum;
1057 mempool::pgmap::unordered_map<uint64_t, pool_stat_t> pg_pool_sum_old;
1058
1059 bool ratios_changed = false;
1060 if (inc.full_ratio != full_ratio && inc.full_ratio != -1) {
1061 full_ratio = inc.full_ratio;
1062 ratios_changed = true;
1063 }
1064 if (inc.nearfull_ratio != nearfull_ratio && inc.nearfull_ratio != -1) {
1065 nearfull_ratio = inc.nearfull_ratio;
1066 ratios_changed = true;
1067 }
1068 if (ratios_changed)
1069 redo_full_sets();
1070
1071 for (auto p = inc.pg_stat_updates.begin();
1072 p != inc.pg_stat_updates.end();
1073 ++p) {
1074 const pg_t &update_pg(p->first);
1075 const pg_stat_t &update_stat(p->second);
1076
1077 if (pg_pool_sum_old.count(update_pg.pool()) == 0)
1078 pg_pool_sum_old[update_pg.pool()] = pg_pool_sum[update_pg.pool()];
1079
1080 auto t = pg_stat.find(update_pg);
1081 if (t == pg_stat.end()) {
1082 pg_stat.insert(make_pair(update_pg, update_stat));
1083 } else {
1084 stat_pg_sub(update_pg, t->second);
1085 t->second = update_stat;
1086 }
1087 stat_pg_add(update_pg, update_stat);
1088 }
1089 assert(osd_stat.size() == osd_epochs.size());
1090 for (auto p = inc.get_osd_stat_updates().begin();
1091 p != inc.get_osd_stat_updates().end();
1092 ++p) {
1093 int osd = p->first;
1094 const osd_stat_t &new_stats(p->second);
1095
1096 auto t = osd_stat.find(osd);
1097 if (t == osd_stat.end()) {
1098 osd_stat.insert(make_pair(osd, new_stats));
1099 } else {
1100 stat_osd_sub(t->first, t->second);
1101 t->second = new_stats;
1102 }
1103 auto i = osd_epochs.find(osd);
1104 auto j = inc.get_osd_epochs().find(osd);
1105 assert(j != inc.get_osd_epochs().end());
1106
1107 if (i == osd_epochs.end())
1108 osd_epochs.insert(*j);
1109 else
1110 i->second = j->second;
1111
1112 stat_osd_add(osd, new_stats);
1113
1114 // adjust [near]full status
1115 register_nearfull_status(osd, new_stats);
1116 }
1117 set<int64_t> deleted_pools;
1118 for (auto p = inc.pg_remove.begin();
1119 p != inc.pg_remove.end();
1120 ++p) {
1121 const pg_t &removed_pg(*p);
1122 auto s = pg_stat.find(removed_pg);
1123 if (s != pg_stat.end()) {
1124 stat_pg_sub(removed_pg, s->second);
1125 pg_stat.erase(s);
1126 }
1127 deleted_pools.insert(removed_pg.pool());
1128 }
1129
1130 for (auto p = inc.get_osd_stat_rm().begin();
1131 p != inc.get_osd_stat_rm().end();
1132 ++p) {
1133 auto t = osd_stat.find(*p);
1134 if (t != osd_stat.end()) {
1135 stat_osd_sub(t->first, t->second);
1136 osd_stat.erase(t);
1137 osd_epochs.erase(*p);
1138 }
1139
1140 // remove these old osds from full/nearfull set(s), too
1141 nearfull_osds.erase(*p);
1142 full_osds.erase(*p);
1143 }
1144
1145 // calculate a delta, and average over the last 2 deltas.
1146 pool_stat_t d = pg_sum;
1147 d.stats.sub(pg_sum_old.stats);
1148 pg_sum_deltas.push_back(make_pair(d, delta_t));
1149 stamp_delta += delta_t;
1150
1151 pg_sum_delta.stats.add(d.stats);
1152 if (pg_sum_deltas.size() > (unsigned)MAX(1, cct ? cct->_conf->mon_stat_smooth_intervals : 1)) {
1153 pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats);
1154 stamp_delta -= pg_sum_deltas.front().second;
1155 pg_sum_deltas.pop_front();
1156 }
1157
1158 update_pool_deltas(cct, inc.stamp, pg_pool_sum_old);
1159
1160 for (auto p : deleted_pools) {
1161 if (cct)
1162 dout(20) << " deleted pool " << p << dendl;
1163 deleted_pool(p);
1164 }
1165
1166 if (inc.osdmap_epoch)
1167 last_osdmap_epoch = inc.osdmap_epoch;
1168 if (inc.pg_scan)
1169 last_pg_scan = inc.pg_scan;
1170
1171 min_last_epoch_clean = 0; // invalidate
1172 }
1173
1174 void PGMap::redo_full_sets()
1175 {
1176 full_osds.clear();
1177 nearfull_osds.clear();
1178 for (auto i = osd_stat.begin();
1179 i != osd_stat.end();
1180 ++i) {
1181 register_nearfull_status(i->first, i->second);
1182 }
1183 }
1184
1185 void PGMap::register_nearfull_status(int osd, const osd_stat_t& s)
1186 {
1187 float ratio = ((float)s.kb_used) / ((float)s.kb);
1188
1189 if (full_ratio > 0 && ratio > full_ratio) {
1190 // full
1191 full_osds.insert(osd);
1192 nearfull_osds.erase(osd);
1193 } else if (nearfull_ratio > 0 && ratio > nearfull_ratio) {
1194 // nearfull
1195 full_osds.erase(osd);
1196 nearfull_osds.insert(osd);
1197 } else {
1198 // ok
1199 full_osds.erase(osd);
1200 nearfull_osds.erase(osd);
1201 }
1202 }
1203
1204 void PGMap::calc_stats()
1205 {
1206 num_pg = 0;
1207 num_pg_active = 0;
1208 num_pg_unknown = 0;
1209 num_osd = 0;
1210 pg_pool_sum.clear();
1211 num_pg_by_pool.clear();
1212 pg_by_osd.clear();
1213 pg_sum = pool_stat_t();
1214 osd_sum = osd_stat_t();
1215 num_pg_by_state.clear();
1216 num_pg_by_osd.clear();
1217
1218 for (auto p = pg_stat.begin();
1219 p != pg_stat.end();
1220 ++p) {
1221 stat_pg_add(p->first, p->second);
1222 }
1223 for (auto p = osd_stat.begin();
1224 p != osd_stat.end();
1225 ++p)
1226 stat_osd_add(p->first, p->second);
1227
1228 redo_full_sets();
1229
1230 min_last_epoch_clean = calc_min_last_epoch_clean();
1231 }
1232
1233 void PGMap::update_pg(pg_t pgid, bufferlist& bl)
1234 {
1235 bufferlist::iterator p = bl.begin();
1236 auto s = pg_stat.find(pgid);
1237 epoch_t old_lec = 0, lec;
1238 if (s != pg_stat.end()) {
1239 old_lec = s->second.get_effective_last_epoch_clean();
1240 stat_pg_update(pgid, s->second, p);
1241 lec = s->second.get_effective_last_epoch_clean();
1242 } else {
1243 pg_stat_t& r = pg_stat[pgid];
1244 ::decode(r, p);
1245 stat_pg_add(pgid, r);
1246 lec = r.get_effective_last_epoch_clean();
1247 }
1248
1249 if (min_last_epoch_clean &&
1250 (lec < min_last_epoch_clean || // we did
1251 (lec > min_last_epoch_clean && // we might
1252 old_lec == min_last_epoch_clean)
1253 ))
1254 min_last_epoch_clean = 0;
1255 }
1256
1257 void PGMap::remove_pg(pg_t pgid)
1258 {
1259 auto s = pg_stat.find(pgid);
1260 if (s != pg_stat.end()) {
1261 if (min_last_epoch_clean &&
1262 s->second.get_effective_last_epoch_clean() == min_last_epoch_clean)
1263 min_last_epoch_clean = 0;
1264 stat_pg_sub(pgid, s->second);
1265 pg_stat.erase(s);
1266 }
1267 }
1268
1269 void PGMap::update_osd(int osd, bufferlist& bl)
1270 {
1271 bufferlist::iterator p = bl.begin();
1272 auto o = osd_stat.find(osd);
1273 epoch_t old_lec = 0;
1274 if (o != osd_stat.end()) {
1275 auto i = osd_epochs.find(osd);
1276 if (i != osd_epochs.end())
1277 old_lec = i->second;
1278 stat_osd_sub(osd, o->second);
1279 }
1280 osd_stat_t& r = osd_stat[osd];
1281 ::decode(r, p);
1282 stat_osd_add(osd, r);
1283
1284 // adjust [near]full status
1285 register_nearfull_status(osd, r);
1286
1287 // epoch?
1288 if (!p.end()) {
1289 epoch_t e;
1290 ::decode(e, p);
1291
1292 if (e < min_last_epoch_clean ||
1293 (e > min_last_epoch_clean &&
1294 old_lec == min_last_epoch_clean))
1295 min_last_epoch_clean = 0;
1296 } else {
1297 // WARNING: we are not refreshing min_last_epoch_clean! must be old store
1298 // or old mon running.
1299 }
1300 }
1301
1302 void PGMap::remove_osd(int osd)
1303 {
1304 auto o = osd_stat.find(osd);
1305 if (o != osd_stat.end()) {
1306 stat_osd_sub(osd, o->second);
1307 osd_stat.erase(o);
1308
1309 // remove these old osds from full/nearfull set(s), too
1310 nearfull_osds.erase(osd);
1311 full_osds.erase(osd);
1312 }
1313 }
1314
1315 void PGMap::stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
1316 bool sameosds)
1317 {
1318 pg_pool_sum[pgid.pool()].add(s);
1319 pg_sum.add(s);
1320
1321 num_pg++;
1322 num_pg_by_state[s.state]++;
1323 num_pg_by_pool[pgid.pool()]++;
1324
1325 if ((s.state & PG_STATE_CREATING) &&
1326 s.parent_split_bits == 0) {
1327 creating_pgs.insert(pgid);
1328 if (s.acting_primary >= 0) {
1329 creating_pgs_by_osd_epoch[s.acting_primary][s.mapping_epoch].insert(pgid);
1330 }
1331 }
1332
1333 if (s.state & PG_STATE_ACTIVE) {
1334 ++num_pg_active;
1335 }
1336 if (s.state == 0) {
1337 ++num_pg_unknown;
1338 }
1339
1340 if (sameosds)
1341 return;
1342
1343 for (auto p = s.blocked_by.begin();
1344 p != s.blocked_by.end();
1345 ++p) {
1346 ++blocked_by_sum[*p];
1347 }
1348
1349 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
1350 pg_by_osd[*p].insert(pgid);
1351 num_pg_by_osd[*p].acting++;
1352 }
1353 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
1354 pg_by_osd[*p].insert(pgid);
1355 num_pg_by_osd[*p].up++;
1356 }
1357
1358 if (s.up_primary >= 0) {
1359 num_pg_by_osd[s.up_primary].primary++;
1360 }
1361 }
1362
1363 void PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
1364 bool sameosds)
1365 {
1366 pool_stat_t& ps = pg_pool_sum[pgid.pool()];
1367 ps.sub(s);
1368 pg_sum.sub(s);
1369
1370 num_pg--;
1371 int end = --num_pg_by_state[s.state];
1372 assert(end >= 0);
1373 if (end == 0)
1374 num_pg_by_state.erase(s.state);
1375 end = --num_pg_by_pool[pgid.pool()];
1376 if (end == 0) {
1377 num_pg_by_pool.erase(pgid.pool());
1378 pg_pool_sum.erase(pgid.pool());
1379 }
1380
1381 if ((s.state & PG_STATE_CREATING) &&
1382 s.parent_split_bits == 0) {
1383 creating_pgs.erase(pgid);
1384 if (s.acting_primary >= 0) {
1385 map<epoch_t,set<pg_t> >& r = creating_pgs_by_osd_epoch[s.acting_primary];
1386 r[s.mapping_epoch].erase(pgid);
1387 if (r[s.mapping_epoch].empty())
1388 r.erase(s.mapping_epoch);
1389 if (r.empty())
1390 creating_pgs_by_osd_epoch.erase(s.acting_primary);
1391 }
1392 }
1393
1394 if (s.state & PG_STATE_ACTIVE) {
1395 --num_pg_active;
1396 }
1397 if (s.state == 0) {
1398 --num_pg_unknown;
1399 }
1400
1401 if (sameosds)
1402 return;
1403
1404 for (auto p = s.blocked_by.begin();
1405 p != s.blocked_by.end();
1406 ++p) {
1407 auto q = blocked_by_sum.find(*p);
1408 assert(q != blocked_by_sum.end());
1409 --q->second;
1410 if (q->second == 0)
1411 blocked_by_sum.erase(q);
1412 }
1413
1414 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
1415 auto& oset = pg_by_osd[*p];
1416 oset.erase(pgid);
1417 if (oset.empty())
1418 pg_by_osd.erase(*p);
1419 auto it = num_pg_by_osd.find(*p);
1420 if (it != num_pg_by_osd.end() && it->second.acting > 0)
1421 it->second.acting--;
1422 }
1423 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
1424 auto& oset = pg_by_osd[*p];
1425 oset.erase(pgid);
1426 if (oset.empty())
1427 pg_by_osd.erase(*p);
1428 auto it = num_pg_by_osd.find(*p);
1429 if (it != num_pg_by_osd.end() && it->second.up > 0)
1430 it->second.up--;
1431 }
1432
1433 if (s.up_primary >= 0) {
1434 auto it = num_pg_by_osd.find(s.up_primary);
1435 if (it != num_pg_by_osd.end() && it->second.primary > 0)
1436 it->second.primary--;
1437 }
1438 }
1439
1440 void PGMap::stat_pg_update(const pg_t pgid, pg_stat_t& s,
1441 bufferlist::iterator& blp)
1442 {
1443 pg_stat_t n;
1444 ::decode(n, blp);
1445
1446 bool sameosds =
1447 s.acting == n.acting &&
1448 s.up == n.up &&
1449 s.blocked_by == n.blocked_by;
1450
1451 stat_pg_sub(pgid, s, sameosds);
1452
1453 // if acting_primary has shift to an just restored osd, and pg yet to finish
1454 // peering, many attributes in current stats remain stale. others seem don't
1455 // mater much while faulty last_active will make "pg stuck in" check unhappy.
1456 if (!(n.state & (PG_STATE_ACTIVE | PG_STATE_PEERED)) &&
1457 n.last_active < s.last_active)
1458 n.last_active = s.last_active;
1459 s = n;
1460 stat_pg_add(pgid, n, sameosds);
1461 }
1462
1463 void PGMap::stat_osd_add(int osd, const osd_stat_t &s)
1464 {
1465 num_osd++;
1466 osd_sum.add(s);
1467 if (osd >= (int)osd_last_seq.size()) {
1468 osd_last_seq.resize(osd + 1);
1469 }
1470 osd_last_seq[osd] = s.seq;
1471 }
1472
1473 void PGMap::stat_osd_sub(int osd, const osd_stat_t &s)
1474 {
1475 num_osd--;
1476 osd_sum.sub(s);
1477 assert(osd < (int)osd_last_seq.size());
1478 osd_last_seq[osd] = 0;
1479 }
1480
1481 epoch_t PGMap::calc_min_last_epoch_clean() const
1482 {
1483 if (pg_stat.empty())
1484 return 0;
1485
1486 auto p = pg_stat.begin();
1487 epoch_t min = p->second.get_effective_last_epoch_clean();
1488 for (++p; p != pg_stat.end(); ++p) {
1489 epoch_t lec = p->second.get_effective_last_epoch_clean();
1490 if (lec < min)
1491 min = lec;
1492 }
1493 // also scan osd epochs
1494 // don't trim past the oldest reported osd epoch
1495 for (auto i = osd_epochs.begin();
1496 i != osd_epochs.end();
1497 ++i) {
1498 if (i->second < min)
1499 min = i->second;
1500 }
1501 return min;
1502 }
1503
1504 void PGMap::encode_digest(const OSDMap& osdmap,
1505 bufferlist& bl, uint64_t features) const
1506 {
1507 get_rules_avail(osdmap, &avail_space_by_rule);
1508 PGMapDigest::encode(bl, features);
1509 }
1510
1511 void PGMap::encode(bufferlist &bl, uint64_t features) const
1512 {
1513 if ((features & CEPH_FEATURE_MONENC) == 0) {
1514 __u8 v = 3;
1515 ::encode(v, bl);
1516 ::encode(version, bl);
1517 ::encode(pg_stat, bl);
1518 ::encode(osd_stat, bl);
1519 ::encode(last_osdmap_epoch, bl);
1520 ::encode(last_pg_scan, bl);
1521 ::encode(full_ratio, bl);
1522 ::encode(nearfull_ratio, bl);
1523 return;
1524 }
1525
1526 ENCODE_START(6, 4, bl);
1527 ::encode(version, bl);
1528 ::encode(pg_stat, bl);
1529 ::encode(osd_stat, bl);
1530 ::encode(last_osdmap_epoch, bl);
1531 ::encode(last_pg_scan, bl);
1532 ::encode(full_ratio, bl);
1533 ::encode(nearfull_ratio, bl);
1534 ::encode(stamp, bl);
1535 ::encode(osd_epochs, bl);
1536 ENCODE_FINISH(bl);
1537 }
1538
1539 void PGMap::decode(bufferlist::iterator &bl)
1540 {
1541 DECODE_START_LEGACY_COMPAT_LEN(6, 4, 4, bl);
1542 ::decode(version, bl);
1543 if (struct_v < 3) {
1544 pg_stat.clear();
1545 __u32 n;
1546 ::decode(n, bl);
1547 while (n--) {
1548 old_pg_t opgid;
1549 ::decode(opgid, bl);
1550 pg_t pgid = opgid;
1551 ::decode(pg_stat[pgid], bl);
1552 }
1553 } else {
1554 ::decode(pg_stat, bl);
1555 }
1556 ::decode(osd_stat, bl);
1557 ::decode(last_osdmap_epoch, bl);
1558 ::decode(last_pg_scan, bl);
1559 if (struct_v >= 2) {
1560 ::decode(full_ratio, bl);
1561 ::decode(nearfull_ratio, bl);
1562 }
1563 if (struct_v >= 5)
1564 ::decode(stamp, bl);
1565 if (struct_v >= 6) {
1566 ::decode(osd_epochs, bl);
1567 } else {
1568 for (auto i = osd_stat.begin();
1569 i != osd_stat.end();
1570 ++i) {
1571 // This isn't accurate, but will cause trimming to behave like
1572 // previously.
1573 osd_epochs.insert(make_pair(i->first, last_osdmap_epoch));
1574 }
1575 }
1576 DECODE_FINISH(bl);
1577
1578 calc_stats();
1579 }
1580
1581 void PGMap::dirty_all(Incremental& inc)
1582 {
1583 inc.osdmap_epoch = last_osdmap_epoch;
1584 inc.pg_scan = last_pg_scan;
1585 inc.full_ratio = full_ratio;
1586 inc.nearfull_ratio = nearfull_ratio;
1587
1588 for (auto p = pg_stat.begin(); p != pg_stat.end(); ++p) {
1589 inc.pg_stat_updates[p->first] = p->second;
1590 }
1591 for (auto p = osd_stat.begin(); p != osd_stat.end(); ++p) {
1592 assert(osd_epochs.count(p->first));
1593 inc.update_stat(p->first,
1594 inc.get_osd_epochs().find(p->first)->second,
1595 p->second);
1596 }
1597 }
1598
1599 void PGMap::dump(Formatter *f) const
1600 {
1601 dump_basic(f);
1602 dump_pg_stats(f, false);
1603 dump_pool_stats(f);
1604 dump_osd_stats(f);
1605 }
1606
1607 void PGMap::dump_basic(Formatter *f) const
1608 {
1609 f->dump_unsigned("version", version);
1610 f->dump_stream("stamp") << stamp;
1611 f->dump_unsigned("last_osdmap_epoch", last_osdmap_epoch);
1612 f->dump_unsigned("last_pg_scan", last_pg_scan);
1613 f->dump_unsigned("min_last_epoch_clean", min_last_epoch_clean);
1614 f->dump_float("full_ratio", full_ratio);
1615 f->dump_float("near_full_ratio", nearfull_ratio);
1616
1617 f->open_object_section("pg_stats_sum");
1618 pg_sum.dump(f);
1619 f->close_section();
1620
1621 f->open_object_section("osd_stats_sum");
1622 osd_sum.dump(f);
1623 f->close_section();
1624
1625 f->open_array_section("osd_epochs");
1626 for (auto p = osd_epochs.begin(); p != osd_epochs.end(); ++p) {
1627 f->open_object_section("osd");
1628 f->dump_unsigned("osd", p->first);
1629 f->dump_unsigned("epoch", p->second);
1630 f->close_section();
1631 }
1632 f->close_section();
1633
1634 dump_delta(f);
1635 }
1636
1637 void PGMap::dump_delta(Formatter *f) const
1638 {
1639 f->open_object_section("pg_stats_delta");
1640 pg_sum_delta.dump(f);
1641 f->close_section();
1642 }
1643
1644 void PGMap::dump_pg_stats(Formatter *f, bool brief) const
1645 {
1646 f->open_array_section("pg_stats");
1647 for (auto i = pg_stat.begin();
1648 i != pg_stat.end();
1649 ++i) {
1650 f->open_object_section("pg_stat");
1651 f->dump_stream("pgid") << i->first;
1652 if (brief)
1653 i->second.dump_brief(f);
1654 else
1655 i->second.dump(f);
1656 f->close_section();
1657 }
1658 f->close_section();
1659 }
1660
1661 void PGMap::dump_pool_stats(Formatter *f) const
1662 {
1663 f->open_array_section("pool_stats");
1664 for (auto p = pg_pool_sum.begin();
1665 p != pg_pool_sum.end();
1666 ++p) {
1667 f->open_object_section("pool_stat");
1668 f->dump_int("poolid", p->first);
1669 auto q = num_pg_by_pool.find(p->first);
1670 if (q != num_pg_by_pool.end())
1671 f->dump_unsigned("num_pg", q->second);
1672 p->second.dump(f);
1673 f->close_section();
1674 }
1675 f->close_section();
1676 }
1677
1678 void PGMap::dump_osd_stats(Formatter *f) const
1679 {
1680 f->open_array_section("osd_stats");
1681 for (auto q = osd_stat.begin();
1682 q != osd_stat.end();
1683 ++q) {
1684 f->open_object_section("osd_stat");
1685 f->dump_int("osd", q->first);
1686 q->second.dump(f);
1687 f->close_section();
1688 }
1689 f->close_section();
1690 }
1691
1692 void PGMap::dump_pg_stats_plain(
1693 ostream& ss,
1694 const mempool::pgmap::unordered_map<pg_t, pg_stat_t>& pg_stats,
1695 bool brief) const
1696 {
1697 TextTable tab;
1698
1699 if (brief){
1700 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1701 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1702 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1703 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1704 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1705 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1706 }
1707 else {
1708 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1709 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1710 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1711 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1712 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1713 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1714 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1715 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1716 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1717 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1718 tab.define_column("STATE_STAMP", TextTable::LEFT, TextTable::RIGHT);
1719 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
1720 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
1721 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1722 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1723 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1724 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1725 tab.define_column("LAST_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1726 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1727 tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1728 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1729 }
1730
1731 for (auto i = pg_stats.begin();
1732 i != pg_stats.end(); ++i) {
1733 const pg_stat_t &st(i->second);
1734 if (brief) {
1735 tab << i->first
1736 << pg_state_string(st.state)
1737 << st.up
1738 << st.up_primary
1739 << st.acting
1740 << st.acting_primary
1741 << TextTable::endrow;
1742 } else {
1743 ostringstream reported;
1744 reported << st.reported_epoch << ":" << st.reported_seq;
1745
1746 tab << i->first
1747 << st.stats.sum.num_objects
1748 << st.stats.sum.num_objects_missing_on_primary
1749 << st.stats.sum.num_objects_degraded
1750 << st.stats.sum.num_objects_misplaced
1751 << st.stats.sum.num_objects_unfound
1752 << st.stats.sum.num_bytes
1753 << st.log_size
1754 << st.ondisk_log_size
1755 << pg_state_string(st.state)
1756 << st.last_change
1757 << st.version
1758 << reported.str()
1759 << pg_vector_string(st.up)
1760 << st.up_primary
1761 << pg_vector_string(st.acting)
1762 << st.acting_primary
1763 << st.last_scrub
1764 << st.last_scrub_stamp
1765 << st.last_deep_scrub
1766 << st.last_deep_scrub_stamp
1767 << TextTable::endrow;
1768 }
1769 }
1770
1771 ss << tab;
1772 }
1773
1774 void PGMap::dump(ostream& ss) const
1775 {
1776 dump_basic(ss);
1777 dump_pg_stats(ss, false);
1778 dump_pool_stats(ss, false);
1779 dump_pg_sum_stats(ss, false);
1780 dump_osd_stats(ss);
1781 }
1782
1783 void PGMap::dump_basic(ostream& ss) const
1784 {
1785 ss << "version " << version << std::endl;
1786 ss << "stamp " << stamp << std::endl;
1787 ss << "last_osdmap_epoch " << last_osdmap_epoch << std::endl;
1788 ss << "last_pg_scan " << last_pg_scan << std::endl;
1789 ss << "full_ratio " << full_ratio << std::endl;
1790 ss << "nearfull_ratio " << nearfull_ratio << std::endl;
1791 }
1792
1793 void PGMap::dump_pg_stats(ostream& ss, bool brief) const
1794 {
1795 dump_pg_stats_plain(ss, pg_stat, brief);
1796 }
1797
1798 void PGMap::dump_pool_stats(ostream& ss, bool header) const
1799 {
1800 TextTable tab;
1801
1802 if (header) {
1803 tab.define_column("POOLID", TextTable::LEFT, TextTable::LEFT);
1804 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1805 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1806 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1807 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1808 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1809 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1810 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1811 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1812 } else {
1813 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1814 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1815 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1816 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1817 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1818 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1819 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1820 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1821 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1822 }
1823
1824 for (auto p = pg_pool_sum.begin();
1825 p != pg_pool_sum.end();
1826 ++p) {
1827 tab << p->first
1828 << p->second.stats.sum.num_objects
1829 << p->second.stats.sum.num_objects_missing_on_primary
1830 << p->second.stats.sum.num_objects_degraded
1831 << p->second.stats.sum.num_objects_misplaced
1832 << p->second.stats.sum.num_objects_unfound
1833 << p->second.stats.sum.num_bytes
1834 << p->second.log_size
1835 << p->second.ondisk_log_size
1836 << TextTable::endrow;
1837 }
1838
1839 ss << tab;
1840 }
1841
1842 void PGMap::dump_pg_sum_stats(ostream& ss, bool header) const
1843 {
1844 TextTable tab;
1845
1846 if (header) {
1847 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1848 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1849 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1850 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1851 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1852 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1853 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1854 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1855 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1856 } else {
1857 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1858 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1859 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1860 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1861 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1862 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1863 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1864 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1865 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1866 };
1867
1868 tab << "sum"
1869 << pg_sum.stats.sum.num_objects
1870 << pg_sum.stats.sum.num_objects_missing_on_primary
1871 << pg_sum.stats.sum.num_objects_degraded
1872 << pg_sum.stats.sum.num_objects_misplaced
1873 << pg_sum.stats.sum.num_objects_unfound
1874 << pg_sum.stats.sum.num_bytes
1875 << pg_sum.log_size
1876 << pg_sum.ondisk_log_size
1877 << TextTable::endrow;
1878
1879 ss << tab;
1880 }
1881
1882 void PGMap::dump_osd_stats(ostream& ss) const
1883 {
1884 TextTable tab;
1885
1886 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1887 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1888 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
1889 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
1890 tab.define_column("HB_PEERS", TextTable::LEFT, TextTable::RIGHT);
1891 tab.define_column("PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1892 tab.define_column("PRIMARY_PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1893
1894 for (auto p = osd_stat.begin();
1895 p != osd_stat.end();
1896 ++p) {
1897 tab << p->first
1898 << si_t(p->second.kb_used << 10)
1899 << si_t(p->second.kb_avail << 10)
1900 << si_t(p->second.kb << 10)
1901 << p->second.hb_peers
1902 << get_num_pg_by_osd(p->first)
1903 << get_num_primary_pg_by_osd(p->first)
1904 << TextTable::endrow;
1905 }
1906
1907 tab << "sum"
1908 << si_t(osd_sum.kb_used << 10)
1909 << si_t(osd_sum.kb_avail << 10)
1910 << si_t(osd_sum.kb << 10)
1911 << TextTable::endrow;
1912
1913 ss << tab;
1914 }
1915
1916 void PGMap::dump_osd_sum_stats(ostream& ss) const
1917 {
1918 TextTable tab;
1919
1920 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1921 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1922 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
1923 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
1924
1925 tab << "sum"
1926 << si_t(osd_sum.kb_used << 10)
1927 << si_t(osd_sum.kb_avail << 10)
1928 << si_t(osd_sum.kb << 10)
1929 << TextTable::endrow;
1930
1931 ss << tab;
1932 }
1933
1934 void PGMap::get_stuck_stats(
1935 int types, const utime_t cutoff,
1936 mempool::pgmap::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const
1937 {
1938 assert(types != 0);
1939 for (auto i = pg_stat.begin();
1940 i != pg_stat.end();
1941 ++i) {
1942 utime_t val = cutoff; // don't care about >= cutoff so that is infinity
1943
1944 if ((types & STUCK_INACTIVE) && !(i->second.state & PG_STATE_ACTIVE)) {
1945 if (i->second.last_active < val)
1946 val = i->second.last_active;
1947 }
1948
1949 if ((types & STUCK_UNCLEAN) && !(i->second.state & PG_STATE_CLEAN)) {
1950 if (i->second.last_clean < val)
1951 val = i->second.last_clean;
1952 }
1953
1954 if ((types & STUCK_DEGRADED) && (i->second.state & PG_STATE_DEGRADED)) {
1955 if (i->second.last_undegraded < val)
1956 val = i->second.last_undegraded;
1957 }
1958
1959 if ((types & STUCK_UNDERSIZED) && (i->second.state & PG_STATE_UNDERSIZED)) {
1960 if (i->second.last_fullsized < val)
1961 val = i->second.last_fullsized;
1962 }
1963
1964 if ((types & STUCK_STALE) && (i->second.state & PG_STATE_STALE)) {
1965 if (i->second.last_unstale < val)
1966 val = i->second.last_unstale;
1967 }
1968
1969 // val is now the earliest any of the requested stuck states began
1970 if (val < cutoff) {
1971 stuck_pgs[i->first] = i->second;
1972 }
1973 }
1974 }
1975
1976 bool PGMap::get_stuck_counts(const utime_t cutoff, map<string, int>& note) const
1977 {
1978 int inactive = 0;
1979 int unclean = 0;
1980 int degraded = 0;
1981 int undersized = 0;
1982 int stale = 0;
1983
1984 for (auto i = pg_stat.begin();
1985 i != pg_stat.end();
1986 ++i) {
1987 if (! (i->second.state & PG_STATE_ACTIVE)) {
1988 if (i->second.last_active < cutoff)
1989 ++inactive;
1990 }
1991 if (! (i->second.state & PG_STATE_CLEAN)) {
1992 if (i->second.last_clean < cutoff)
1993 ++unclean;
1994 }
1995 if (i->second.state & PG_STATE_DEGRADED) {
1996 if (i->second.last_undegraded < cutoff)
1997 ++degraded;
1998 }
1999 if (i->second.state & PG_STATE_UNDERSIZED) {
2000 if (i->second.last_fullsized < cutoff)
2001 ++undersized;
2002 }
2003 if (i->second.state & PG_STATE_STALE) {
2004 if (i->second.last_unstale < cutoff)
2005 ++stale;
2006 }
2007 }
2008
2009 if (inactive)
2010 note["stuck inactive"] = inactive;
2011
2012 if (unclean)
2013 note["stuck unclean"] = unclean;
2014
2015 if (undersized)
2016 note["stuck undersized"] = undersized;
2017
2018 if (degraded)
2019 note["stuck degraded"] = degraded;
2020
2021 if (stale)
2022 note["stuck stale"] = stale;
2023
2024 return inactive || unclean || undersized || degraded || stale;
2025 }
2026
2027 void PGMap::dump_stuck(Formatter *f, int types, utime_t cutoff) const
2028 {
2029 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
2030 get_stuck_stats(types, cutoff, stuck_pg_stats);
2031 f->open_array_section("stuck_pg_stats");
2032 for (auto i = stuck_pg_stats.begin();
2033 i != stuck_pg_stats.end();
2034 ++i) {
2035 f->open_object_section("pg_stat");
2036 f->dump_stream("pgid") << i->first;
2037 i->second.dump(f);
2038 f->close_section();
2039 }
2040 f->close_section();
2041 }
2042
2043 void PGMap::dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const
2044 {
2045 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
2046 get_stuck_stats(types, cutoff, stuck_pg_stats);
2047 if (!stuck_pg_stats.empty())
2048 dump_pg_stats_plain(ss, stuck_pg_stats, true);
2049 }
2050
2051 int PGMap::dump_stuck_pg_stats(
2052 stringstream &ds,
2053 Formatter *f,
2054 int threshold,
2055 vector<string>& args) const
2056 {
2057 int stuck_types = 0;
2058
2059 for (auto i = args.begin(); i != args.end(); ++i) {
2060 if (*i == "inactive")
2061 stuck_types |= PGMap::STUCK_INACTIVE;
2062 else if (*i == "unclean")
2063 stuck_types |= PGMap::STUCK_UNCLEAN;
2064 else if (*i == "undersized")
2065 stuck_types |= PGMap::STUCK_UNDERSIZED;
2066 else if (*i == "degraded")
2067 stuck_types |= PGMap::STUCK_DEGRADED;
2068 else if (*i == "stale")
2069 stuck_types |= PGMap::STUCK_STALE;
2070 else {
2071 ds << "Unknown type: " << *i << std::endl;
2072 return -EINVAL;
2073 }
2074 }
2075
2076 utime_t now(ceph_clock_now());
2077 utime_t cutoff = now - utime_t(threshold, 0);
2078
2079 if (!f) {
2080 dump_stuck_plain(ds, stuck_types, cutoff);
2081 } else {
2082 dump_stuck(f, stuck_types, cutoff);
2083 f->flush(ds);
2084 }
2085
2086 return 0;
2087 }
2088
2089 void PGMap::dump_osd_perf_stats(Formatter *f) const
2090 {
2091 f->open_array_section("osd_perf_infos");
2092 for (auto i = osd_stat.begin();
2093 i != osd_stat.end();
2094 ++i) {
2095 f->open_object_section("osd");
2096 f->dump_int("id", i->first);
2097 {
2098 f->open_object_section("perf_stats");
2099 i->second.os_perf_stat.dump(f);
2100 f->close_section();
2101 }
2102 f->close_section();
2103 }
2104 f->close_section();
2105 }
2106 void PGMap::print_osd_perf_stats(std::ostream *ss) const
2107 {
2108 TextTable tab;
2109 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2110 tab.define_column("commit_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2111 tab.define_column("apply_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2112 for (auto i = osd_stat.begin();
2113 i != osd_stat.end();
2114 ++i) {
2115 tab << i->first;
2116 tab << i->second.os_perf_stat.os_commit_latency;
2117 tab << i->second.os_perf_stat.os_apply_latency;
2118 tab << TextTable::endrow;
2119 }
2120 (*ss) << tab;
2121 }
2122
2123 void PGMap::dump_osd_blocked_by_stats(Formatter *f) const
2124 {
2125 f->open_array_section("osd_blocked_by_infos");
2126 for (auto i = blocked_by_sum.begin();
2127 i != blocked_by_sum.end();
2128 ++i) {
2129 f->open_object_section("osd");
2130 f->dump_int("id", i->first);
2131 f->dump_int("num_blocked", i->second);
2132 f->close_section();
2133 }
2134 f->close_section();
2135 }
2136 void PGMap::print_osd_blocked_by_stats(std::ostream *ss) const
2137 {
2138 TextTable tab;
2139 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2140 tab.define_column("num_blocked", TextTable::LEFT, TextTable::RIGHT);
2141 for (auto i = blocked_by_sum.begin();
2142 i != blocked_by_sum.end();
2143 ++i) {
2144 tab << i->first;
2145 tab << i->second;
2146 tab << TextTable::endrow;
2147 }
2148 (*ss) << tab;
2149 }
2150
2151
2152 /**
2153 * update aggregated delta
2154 *
2155 * @param cct ceph context
2156 * @param ts Timestamp for the stats being delta'ed
2157 * @param old_pool_sum Previous stats sum
2158 * @param last_ts Last timestamp for pool
2159 * @param result_pool_sum Resulting stats
2160 * @param result_pool_delta Resulting pool delta
2161 * @param result_ts_delta Resulting timestamp delta
2162 * @param delta_avg_list List of last N computed deltas, used to average
2163 */
2164 void PGMap::update_delta(
2165 CephContext *cct,
2166 const utime_t ts,
2167 const pool_stat_t& old_pool_sum,
2168 utime_t *last_ts,
2169 const pool_stat_t& current_pool_sum,
2170 pool_stat_t *result_pool_delta,
2171 utime_t *result_ts_delta,
2172 mempool::pgmap::list<pair<pool_stat_t,utime_t> > *delta_avg_list)
2173 {
2174 /* @p ts is the timestamp we want to associate with the data
2175 * in @p old_pool_sum, and on which we will base ourselves to
2176 * calculate the delta, stored in 'delta_t'.
2177 */
2178 utime_t delta_t;
2179 delta_t = ts; // start with the provided timestamp
2180 delta_t -= *last_ts; // take the last timestamp we saw
2181 *last_ts = ts; // @p ts becomes the last timestamp we saw
2182
2183 // adjust delta_t, quick start if there is no update in a long period
2184 delta_t = std::min(delta_t,
2185 utime_t(2 * (cct ? cct->_conf->mon_delta_reset_interval : 10), 0));
2186
2187 // calculate a delta, and average over the last 6 deltas by default.
2188 /* start by taking a copy of our current @p result_pool_sum, and by
2189 * taking out the stats from @p old_pool_sum. This generates a stats
2190 * delta. Stash this stats delta in @p delta_avg_list, along with the
2191 * timestamp delta for these results.
2192 */
2193 pool_stat_t d = current_pool_sum;
2194 d.stats.sub(old_pool_sum.stats);
2195 delta_avg_list->push_back(make_pair(d,delta_t));
2196 *result_ts_delta += delta_t;
2197
2198 /* Aggregate current delta, and take out the last seen delta (if any) to
2199 * average it out.
2200 */
2201 result_pool_delta->stats.add(d.stats);
2202 size_t s = MAX(1, cct ? cct->_conf->mon_stat_smooth_intervals : 1);
2203 if (delta_avg_list->size() > s) {
2204 result_pool_delta->stats.sub(delta_avg_list->front().first.stats);
2205 *result_ts_delta -= delta_avg_list->front().second;
2206 delta_avg_list->pop_front();
2207 }
2208 }
2209
2210 /**
2211 * update aggregated delta
2212 *
2213 * @param cct ceph context
2214 * @param ts Timestamp
2215 * @param pg_sum_old Old pg_sum
2216 */
2217 void PGMap::update_global_delta(CephContext *cct,
2218 const utime_t ts, const pool_stat_t& pg_sum_old)
2219 {
2220 update_delta(cct, ts, pg_sum_old, &stamp, pg_sum, &pg_sum_delta,
2221 &stamp_delta, &pg_sum_deltas);
2222 }
2223
2224 /**
2225 * Update a given pool's deltas
2226 *
2227 * @param cct Ceph Context
2228 * @param ts Timestamp for the stats being delta'ed
2229 * @param pool Pool's id
2230 * @param old_pool_sum Previous stats sum
2231 */
2232 void PGMap::update_one_pool_delta(
2233 CephContext *cct,
2234 const utime_t ts,
2235 const uint64_t pool,
2236 const pool_stat_t& old_pool_sum)
2237 {
2238 if (per_pool_sum_deltas.count(pool) == 0) {
2239 assert(per_pool_sum_deltas_stamps.count(pool) == 0);
2240 assert(per_pool_sum_delta.count(pool) == 0);
2241 }
2242
2243 auto& sum_delta = per_pool_sum_delta[pool];
2244
2245 update_delta(cct, ts, old_pool_sum, &sum_delta.second, pg_pool_sum[pool],
2246 &sum_delta.first, &per_pool_sum_deltas_stamps[pool],
2247 &per_pool_sum_deltas[pool]);
2248 }
2249
2250 /**
2251 * Update pools' deltas
2252 *
2253 * @param cct CephContext
2254 * @param ts Timestamp for the stats being delta'ed
2255 * @param pg_pool_sum_old Map of pool stats for delta calcs.
2256 */
2257 void PGMap::update_pool_deltas(
2258 CephContext *cct, const utime_t ts,
2259 const mempool::pgmap::unordered_map<uint64_t,pool_stat_t>& pg_pool_sum_old)
2260 {
2261 for (auto it = pg_pool_sum_old.begin();
2262 it != pg_pool_sum_old.end(); ++it) {
2263 update_one_pool_delta(cct, ts, it->first, it->second);
2264 }
2265 }
2266
2267 void PGMap::clear_delta()
2268 {
2269 pg_sum_delta = pool_stat_t();
2270 pg_sum_deltas.clear();
2271 stamp_delta = utime_t();
2272 }
2273
2274 void PGMap::generate_test_instances(list<PGMap*>& o)
2275 {
2276 o.push_back(new PGMap);
2277 list<Incremental*> inc;
2278 Incremental::generate_test_instances(inc);
2279 delete inc.front();
2280 inc.pop_front();
2281 while (!inc.empty()) {
2282 PGMap *pmp = new PGMap();
2283 *pmp = *o.back();
2284 o.push_back(pmp);
2285 o.back()->apply_incremental(NULL, *inc.front());
2286 delete inc.front();
2287 inc.pop_front();
2288 }
2289 }
2290
2291 void PGMap::get_filtered_pg_stats(uint32_t state, int64_t poolid, int64_t osdid,
2292 bool primary, set<pg_t>& pgs) const
2293 {
2294 for (auto i = pg_stat.begin();
2295 i != pg_stat.end();
2296 ++i) {
2297 if ((poolid >= 0) && (uint64_t(poolid) != i->first.pool()))
2298 continue;
2299 if ((osdid >= 0) && !(i->second.is_acting_osd(osdid,primary)))
2300 continue;
2301 if (!(i->second.state & state))
2302 continue;
2303 pgs.insert(i->first);
2304 }
2305 }
2306
2307 void PGMap::dump_filtered_pg_stats(Formatter *f, set<pg_t>& pgs) const
2308 {
2309 f->open_array_section("pg_stats");
2310 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
2311 const pg_stat_t& st = pg_stat.at(*i);
2312 f->open_object_section("pg_stat");
2313 f->dump_stream("pgid") << *i;
2314 st.dump(f);
2315 f->close_section();
2316 }
2317 f->close_section();
2318 }
2319
2320 void PGMap::dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const
2321 {
2322 TextTable tab;
2323
2324 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
2325 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
2326 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
2327 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
2328 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
2329 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
2330 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
2331 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
2332 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
2333 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
2334 tab.define_column("STATE_STAMP", TextTable::LEFT, TextTable::RIGHT);
2335 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
2336 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
2337 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
2338 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
2339 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
2340 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
2341 tab.define_column("LAST_SCRUB", TextTable::LEFT, TextTable::RIGHT);
2342 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
2343 tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT);
2344 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
2345
2346 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
2347 const pg_stat_t& st = pg_stat.at(*i);
2348
2349 ostringstream reported;
2350 reported << st.reported_epoch << ":" << st.reported_seq;
2351
2352 tab << *i
2353 << st.stats.sum.num_objects
2354 << st.stats.sum.num_objects_missing_on_primary
2355 << st.stats.sum.num_objects_degraded
2356 << st.stats.sum.num_objects_misplaced
2357 << st.stats.sum.num_objects_unfound
2358 << st.stats.sum.num_bytes
2359 << st.log_size
2360 << st.ondisk_log_size
2361 << pg_state_string(st.state)
2362 << st.last_change
2363 << st.version
2364 << reported.str()
2365 << st.up
2366 << st.up_primary
2367 << st.acting
2368 << st.acting_primary
2369 << st.last_scrub
2370 << st.last_scrub_stamp
2371 << st.last_deep_scrub
2372 << st.last_deep_scrub_stamp
2373 << TextTable::endrow;
2374 }
2375
2376 ss << tab;
2377 }
2378
2379
2380
2381 // Only called with a single bit set in "what"
2382 static void note_stuck_detail(
2383 int what,
2384 mempool::pgmap::unordered_map<pg_t,pg_stat_t>& stuck_pgs,
2385 int max_detail,
2386 list<pair<health_status_t,string> > *detail)
2387 {
2388 int n = 0;
2389 for (auto p = stuck_pgs.begin();
2390 p != stuck_pgs.end();
2391 ++p) {
2392 ostringstream ss;
2393 utime_t since;
2394 const char *whatname = 0;
2395 switch (what) {
2396 case PGMap::STUCK_INACTIVE:
2397 since = p->second.last_active;
2398 whatname = "inactive";
2399 break;
2400 case PGMap::STUCK_UNCLEAN:
2401 since = p->second.last_clean;
2402 whatname = "unclean";
2403 break;
2404 case PGMap::STUCK_DEGRADED:
2405 since = p->second.last_undegraded;
2406 whatname = "degraded";
2407 break;
2408 case PGMap::STUCK_UNDERSIZED:
2409 since = p->second.last_fullsized;
2410 whatname = "undersized";
2411 break;
2412 case PGMap::STUCK_STALE:
2413 since = p->second.last_unstale;
2414 whatname = "stale";
2415 break;
2416 default:
2417 ceph_abort();
2418 }
2419 if (--max_detail == 0) {
2420 ostringstream ss;
2421 ss << (stuck_pgs.size() - n) << " more pgs are also stuck " << whatname;
2422 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
2423 break;
2424 }
2425 ++n;
2426 ss << "pg " << p->first << " is stuck " << whatname;
2427 if (since == utime_t()) {
2428 ss << " since forever";
2429 } else {
2430 utime_t dur = ceph_clock_now() - since;
2431 ss << " for " << dur;
2432 }
2433 ss << ", current state " << pg_state_string(p->second.state)
2434 << ", last acting " << p->second.acting;
2435 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
2436 }
2437 }
2438
2439 static pair<int,int> _warn_slow_request_histogram(
2440 CephContext *cct,
2441 const pow2_hist_t& h,
2442 string suffix,
2443 list<pair<health_status_t,string> >& summary,
2444 list<pair<health_status_t,string> > *detail)
2445 {
2446 if (h.h.empty())
2447 return make_pair(0, 0);
2448
2449 unsigned warn = 0, error = 0;
2450 float err_age =
2451 cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio;
2452 for (unsigned i = h.h.size() - 1; i > 0; --i) {
2453 float ub = (float)(1 << i) / 1000.0;
2454 if (ub < cct->_conf->mon_osd_warn_op_age)
2455 break;
2456 if (h.h[i]) {
2457 auto sev = HEALTH_WARN;
2458 if (ub > err_age) {
2459 sev = HEALTH_ERR;
2460 error += h.h[i];
2461 } else {
2462 warn += h.h[i];
2463 }
2464 if (detail) {
2465 ostringstream ss;
2466 ss << h.h[i] << " ops are blocked > " << ub << " sec" << suffix;
2467 detail->push_back(make_pair(sev, ss.str()));
2468 }
2469 }
2470 }
2471 return make_pair(warn, error);
2472 }
2473
2474 namespace {
2475 enum class scrubbed_or_deepscrubbed_t { SCRUBBED, DEEPSCRUBBED };
2476
2477 void print_unscrubbed_detailed(
2478 const std::pair<const pg_t,pg_stat_t> &pg_entry,
2479 list<pair<health_status_t,string> > *detail,
2480 scrubbed_or_deepscrubbed_t how_scrubbed)
2481 {
2482 std::stringstream ss;
2483 const auto& pg_stat(pg_entry.second);
2484
2485 ss << "pg " << pg_entry.first << " is not ";
2486 if (how_scrubbed == scrubbed_or_deepscrubbed_t::SCRUBBED) {
2487 ss << "scrubbed, last_scrub_stamp "
2488 << pg_stat.last_scrub_stamp;
2489 } else if (how_scrubbed == scrubbed_or_deepscrubbed_t::DEEPSCRUBBED) {
2490 ss << "deep-scrubbed, last_deep_scrub_stamp "
2491 << pg_stat.last_deep_scrub_stamp;
2492 }
2493
2494 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
2495 }
2496
2497 using pg_stat_map_t = const mempool::pgmap::unordered_map<pg_t,pg_stat_t>;
2498
2499 void print_unscrubbed_pgs(
2500 pg_stat_map_t& pg_stats,
2501 list<pair<health_status_t,string> > &summary,
2502 list<pair<health_status_t,string> > *detail,
2503 const CephContext* cct)
2504 {
2505 if (cct->_conf->mon_warn_not_scrubbed == 0 &&
2506 cct->_conf->mon_warn_not_deep_scrubbed == 0)
2507 return;
2508
2509 int pgs_count = 0;
2510 const utime_t now = ceph_clock_now();
2511 for (const auto& pg_entry : pg_stats) {
2512 const auto& pg_stat(pg_entry.second);
2513 const utime_t time_since_ls = now - pg_stat.last_scrub_stamp;
2514 const utime_t time_since_lds = now - pg_stat.last_deep_scrub_stamp;
2515
2516 const int mon_warn_not_scrubbed =
2517 cct->_conf->mon_warn_not_scrubbed + cct->_conf->mon_scrub_interval;
2518
2519 const int mon_warn_not_deep_scrubbed =
2520 cct->_conf->mon_warn_not_deep_scrubbed + cct->_conf->osd_deep_scrub_interval;
2521
2522 bool not_scrubbed = (time_since_ls >= mon_warn_not_scrubbed &&
2523 cct->_conf->mon_warn_not_scrubbed != 0);
2524
2525 bool not_deep_scrubbed = (time_since_lds >= mon_warn_not_deep_scrubbed &&
2526 cct->_conf->mon_warn_not_deep_scrubbed != 0);
2527
2528 if (detail != nullptr) {
2529 if (not_scrubbed) {
2530 print_unscrubbed_detailed(pg_entry,
2531 detail,
2532 scrubbed_or_deepscrubbed_t::SCRUBBED);
2533 }
2534 if (not_deep_scrubbed) {
2535 print_unscrubbed_detailed(pg_entry,
2536 detail,
2537 scrubbed_or_deepscrubbed_t::DEEPSCRUBBED);
2538 }
2539 }
2540 if (not_scrubbed || not_deep_scrubbed) {
2541 ++pgs_count;
2542 }
2543 }
2544
2545 if (pgs_count > 0) {
2546 std::stringstream ss;
2547 ss << pgs_count << " unscrubbed pgs";
2548 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
2549 }
2550
2551 }
2552 }
2553
2554 void PGMap::get_health(
2555 CephContext *cct,
2556 const OSDMap& osdmap,
2557 list<pair<health_status_t,string> >& summary,
2558 list<pair<health_status_t,string> > *detail) const
2559 {
2560 map<string,int> note;
2561 auto p = num_pg_by_state.begin();
2562 auto p_end = num_pg_by_state.end();
2563 for (; p != p_end; ++p) {
2564 if (p->first & PG_STATE_STALE)
2565 note["stale"] += p->second;
2566 if (p->first & PG_STATE_DOWN)
2567 note["down"] += p->second;
2568 if (p->first & PG_STATE_UNDERSIZED)
2569 note["undersized"] += p->second;
2570 if (p->first & PG_STATE_DEGRADED)
2571 note["degraded"] += p->second;
2572 if (p->first & PG_STATE_INCONSISTENT)
2573 note["inconsistent"] += p->second;
2574 if (p->first & PG_STATE_PEERING)
2575 note["peering"] += p->second;
2576 if (p->first & PG_STATE_REPAIR)
2577 note["repair"] += p->second;
2578 if (p->first & PG_STATE_RECOVERING)
2579 note["recovering"] += p->second;
2580 if (p->first & PG_STATE_RECOVERY_WAIT)
2581 note["recovery_wait"] += p->second;
2582 if (p->first & PG_STATE_INCOMPLETE)
2583 note["incomplete"] += p->second;
2584 if (p->first & PG_STATE_BACKFILL_WAIT)
2585 note["backfill_wait"] += p->second;
2586 if (p->first & PG_STATE_BACKFILL)
2587 note["backfilling"] += p->second;
2588 if (p->first & PG_STATE_BACKFILL_TOOFULL)
2589 note["backfill_toofull"] += p->second;
2590 if (p->first & PG_STATE_RECOVERY_TOOFULL)
2591 note["recovery_toofull"] += p->second;
2592 }
2593
2594 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pgs;
2595 utime_t now(ceph_clock_now());
2596 utime_t cutoff = now - utime_t(cct->_conf->mon_pg_stuck_threshold, 0);
2597 uint64_t num_inactive_pgs = 0;
2598
2599 if (detail) {
2600 // we need to collect details of stuck pgs, first do a quick check
2601 // whether this will yield any results
2602 if (get_stuck_counts(cutoff, note)) {
2603
2604 // there are stuck pgs. gather details for specified statuses
2605 // only if we know that there are pgs stuck in that status
2606
2607 if (note.find("stuck inactive") != note.end()) {
2608 get_stuck_stats(PGMap::STUCK_INACTIVE, cutoff, stuck_pgs);
2609 note["stuck inactive"] = stuck_pgs.size();
2610 num_inactive_pgs += stuck_pgs.size();
2611 note_stuck_detail(PGMap::STUCK_INACTIVE, stuck_pgs,
2612 cct->_conf->mon_health_max_detail, detail);
2613 stuck_pgs.clear();
2614 }
2615
2616 if (note.find("stuck unclean") != note.end()) {
2617 get_stuck_stats(PGMap::STUCK_UNCLEAN, cutoff, stuck_pgs);
2618 note["stuck unclean"] = stuck_pgs.size();
2619 note_stuck_detail(PGMap::STUCK_UNCLEAN, stuck_pgs,
2620 cct->_conf->mon_health_max_detail, detail);
2621 stuck_pgs.clear();
2622 }
2623
2624 if (note.find("stuck undersized") != note.end()) {
2625 get_stuck_stats(PGMap::STUCK_UNDERSIZED, cutoff, stuck_pgs);
2626 note["stuck undersized"] = stuck_pgs.size();
2627 note_stuck_detail(PGMap::STUCK_UNDERSIZED, stuck_pgs,
2628 cct->_conf->mon_health_max_detail, detail);
2629 stuck_pgs.clear();
2630 }
2631
2632 if (note.find("stuck degraded") != note.end()) {
2633 get_stuck_stats(PGMap::STUCK_DEGRADED, cutoff, stuck_pgs);
2634 note["stuck degraded"] = stuck_pgs.size();
2635 note_stuck_detail(PGMap::STUCK_DEGRADED, stuck_pgs,
2636 cct->_conf->mon_health_max_detail, detail);
2637 stuck_pgs.clear();
2638 }
2639
2640 if (note.find("stuck stale") != note.end()) {
2641 get_stuck_stats(PGMap::STUCK_STALE, cutoff, stuck_pgs);
2642 note["stuck stale"] = stuck_pgs.size();
2643 num_inactive_pgs += stuck_pgs.size();
2644 note_stuck_detail(PGMap::STUCK_STALE, stuck_pgs,
2645 cct->_conf->mon_health_max_detail, detail);
2646 }
2647 }
2648 } else {
2649 get_stuck_counts(cutoff, note);
2650 auto p = note.find("stuck inactive");
2651 if (p != note.end())
2652 num_inactive_pgs += p->second;
2653 p = note.find("stuck stale");
2654 if (p != note.end())
2655 num_inactive_pgs += p->second;
2656 }
2657
2658 if (cct->_conf->mon_pg_min_inactive > 0 &&
2659 num_inactive_pgs >= cct->_conf->mon_pg_min_inactive) {
2660 ostringstream ss;
2661 ss << num_inactive_pgs << " pgs are stuck inactive for more than " << cct->_conf->mon_pg_stuck_threshold << " seconds";
2662 summary.push_back(make_pair(HEALTH_ERR, ss.str()));
2663 }
2664
2665 if (!note.empty()) {
2666 for (auto p = note.begin(); p != note.end(); ++p) {
2667 ostringstream ss;
2668 ss << p->second << " pgs " << p->first;
2669 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
2670 }
2671 if (detail) {
2672 int n = 0, more = 0;
2673 int max = cct->_conf->mon_health_max_detail;
2674 for (auto p = pg_stat.begin();
2675 p != pg_stat.end();
2676 ++p) {
2677 if ((p->second.state & (PG_STATE_STALE |
2678 PG_STATE_DOWN |
2679 PG_STATE_UNDERSIZED |
2680 PG_STATE_DEGRADED |
2681 PG_STATE_INCONSISTENT |
2682 PG_STATE_PEERING |
2683 PG_STATE_REPAIR |
2684 PG_STATE_RECOVERING |
2685 PG_STATE_RECOVERY_WAIT |
2686 PG_STATE_RECOVERY_TOOFULL |
2687 PG_STATE_INCOMPLETE |
2688 PG_STATE_BACKFILL_WAIT |
2689 PG_STATE_BACKFILL |
2690 PG_STATE_BACKFILL_TOOFULL)) &&
2691 stuck_pgs.count(p->first) == 0) {
2692 if (max > 0) {
2693 --max;
2694 } else {
2695 ++more;
2696 continue;
2697 }
2698 ++n;
2699 ostringstream ss;
2700 ss << "pg " << p->first << " is " << pg_state_string(p->second.state);
2701 ss << ", acting " << p->second.acting;
2702 if (p->second.stats.sum.num_objects_unfound)
2703 ss << ", " << p->second.stats.sum.num_objects_unfound << " unfound";
2704 if (p->second.state & PG_STATE_INCOMPLETE) {
2705 const pg_pool_t *pi = osdmap.get_pg_pool(p->first.pool());
2706 if (pi && pi->min_size > 1) {
2707 ss << " (reducing pool " << osdmap.get_pool_name(p->first.pool())
2708 << " min_size from " << (int)pi->min_size
2709 << " may help; search ceph.com/docs for 'incomplete')";
2710 }
2711 }
2712 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
2713 }
2714 }
2715 if (more) {
2716 ostringstream ss;
2717 ss << more << " more pgs are also unhealthy";
2718 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
2719 }
2720 }
2721 }
2722
2723 // slow requests
2724 if (cct->_conf->mon_osd_warn_op_age > 0 &&
2725 osd_sum.op_queue_age_hist.upper_bound() > cct->_conf->mon_osd_warn_op_age) {
2726 auto sum = _warn_slow_request_histogram(
2727 cct, osd_sum.op_queue_age_hist, "", summary, NULL);
2728 if (sum.first > 0 || sum.second > 0) {
2729 if (sum.first > 0) {
2730 ostringstream ss;
2731 ss << sum.first << " requests are blocked > "
2732 << cct->_conf->mon_osd_warn_op_age
2733 << " sec";
2734 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
2735 }
2736 if (sum.second > 0) {
2737 ostringstream ss;
2738 ss << sum.first << " requests are blocked > "
2739 << (cct->_conf->mon_osd_warn_op_age *
2740 cct->_conf->mon_osd_err_op_age_ratio)
2741 << " sec";
2742 summary.push_back(make_pair(HEALTH_ERR, ss.str()));
2743 }
2744
2745 if (detail) {
2746 unsigned num_warn = 0, num_err = 0;
2747 // do per-osd warnings
2748 for (auto p = osd_stat.begin();
2749 p != osd_stat.end();
2750 ++p) {
2751 auto sum = _warn_slow_request_histogram(
2752 cct,
2753 p->second.op_queue_age_hist,
2754 string(" on osd.") + stringify(p->first),
2755 summary, detail);
2756 if (sum.second)
2757 ++num_err;
2758 else if (sum.first)
2759 ++num_warn;
2760 }
2761 if (num_err) {
2762 ostringstream ss2;
2763 ss2 << num_err << " osds have very slow requests";
2764 summary.push_back(make_pair(HEALTH_ERR, ss2.str()));
2765 detail->push_back(make_pair(HEALTH_ERR, ss2.str()));
2766 }
2767 if (num_warn) {
2768 ostringstream ss2;
2769 ss2 << num_warn << " osds have slow requests";
2770 summary.push_back(make_pair(HEALTH_WARN, ss2.str()));
2771 detail->push_back(make_pair(HEALTH_WARN, ss2.str()));
2772 }
2773 }
2774 }
2775 }
2776
2777 if (cct->_conf->mon_warn_osd_usage_min_max_delta) {
2778 float max_osd_usage = 0.0, min_osd_usage = 1.0;
2779 for (auto p = osd_stat.begin(); p != osd_stat.end(); ++p) {
2780 // kb should never be 0, but avoid divide by zero in case of corruption
2781 if (p->second.kb <= 0)
2782 continue;
2783 float usage = ((float)p->second.kb_used) / ((float)p->second.kb);
2784 if (usage > max_osd_usage)
2785 max_osd_usage = usage;
2786 if (usage < min_osd_usage)
2787 min_osd_usage = usage;
2788 }
2789 float diff = max_osd_usage - min_osd_usage;
2790 if (diff > cct->_conf->mon_warn_osd_usage_min_max_delta) {
2791 ostringstream ss;
2792 ss << "difference between min (" << roundf(min_osd_usage*1000.0)/100.0
2793 << "%) and max (" << roundf(max_osd_usage*1000.0)/100.0
2794 << "%) osd usage " << roundf(diff*1000.0)/100.0 << "% > "
2795 << roundf(cct->_conf->mon_warn_osd_usage_min_max_delta*1000.0)/100.0
2796 << " (mon_warn_osd_usage_min_max_delta)";
2797 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
2798 if (detail)
2799 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
2800 }
2801 }
2802
2803 // recovery
2804 list<string> sl;
2805 overall_recovery_summary(NULL, &sl);
2806 for (auto p = sl.begin(); p != sl.end(); ++p) {
2807 summary.push_back(make_pair(HEALTH_WARN, "recovery " + *p));
2808 if (detail)
2809 detail->push_back(make_pair(HEALTH_WARN, "recovery " + *p));
2810 }
2811
2812 // near-target max pools
2813 auto& pools = osdmap.get_pools();
2814 for (auto p = pools.begin();
2815 p != pools.end(); ++p) {
2816 if ((!p->second.target_max_objects && !p->second.target_max_bytes) ||
2817 !pg_pool_sum.count(p->first))
2818 continue;
2819 bool nearfull = false;
2820 const string& name = osdmap.get_pool_name(p->first);
2821 const pool_stat_t& st = get_pg_pool_sum_stat(p->first);
2822 uint64_t ratio = p->second.cache_target_full_ratio_micro +
2823 ((1000000 - p->second.cache_target_full_ratio_micro) *
2824 cct->_conf->mon_cache_target_full_warn_ratio);
2825 if (p->second.target_max_objects &&
2826 (uint64_t)(st.stats.sum.num_objects -
2827 st.stats.sum.num_objects_hit_set_archive) >
2828 p->second.target_max_objects * (ratio / 1000000.0)) {
2829 nearfull = true;
2830 if (detail) {
2831 ostringstream ss;
2832 ss << "cache pool '" << name << "' with "
2833 << si_t(st.stats.sum.num_objects)
2834 << " objects at/near target max "
2835 << si_t(p->second.target_max_objects) << " objects";
2836 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
2837 }
2838 }
2839 if (p->second.target_max_bytes &&
2840 (uint64_t)(st.stats.sum.num_bytes -
2841 st.stats.sum.num_bytes_hit_set_archive) >
2842 p->second.target_max_bytes * (ratio / 1000000.0)) {
2843 nearfull = true;
2844 if (detail) {
2845 ostringstream ss;
2846 ss << "cache pool '" << name
2847 << "' with " << si_t(st.stats.sum.num_bytes)
2848 << "B at/near target max "
2849 << si_t(p->second.target_max_bytes) << "B";
2850 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
2851 }
2852 }
2853 if (nearfull) {
2854 ostringstream ss;
2855 ss << "'" << name << "' at/near target max";
2856 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
2857 }
2858 }
2859
2860 // scrub
2861 if (pg_sum.stats.sum.num_scrub_errors) {
2862 ostringstream ss;
2863 ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors";
2864 summary.push_back(make_pair(HEALTH_ERR, ss.str()));
2865 if (detail) {
2866 detail->push_back(make_pair(HEALTH_ERR, ss.str()));
2867 }
2868 }
2869
2870 // pg skew
2871 int num_in = osdmap.get_num_in_osds();
2872 int sum_pg_up = MAX(pg_sum.up, static_cast<int32_t>(pg_stat.size()));
2873 if (num_in && cct->_conf->mon_pg_warn_min_per_osd > 0) {
2874 int per = sum_pg_up / num_in;
2875 if (per < cct->_conf->mon_pg_warn_min_per_osd && per) {
2876 ostringstream ss;
2877 ss << "too few PGs per OSD (" << per << " < min " << cct->_conf->mon_pg_warn_min_per_osd << ")";
2878 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
2879 if (detail)
2880 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
2881 }
2882 }
2883 if (num_in && cct->_conf->mon_pg_warn_max_per_osd > 0) {
2884 int per = sum_pg_up / num_in;
2885 if (per > cct->_conf->mon_pg_warn_max_per_osd) {
2886 ostringstream ss;
2887 ss << "too many PGs per OSD (" << per << " > max " << cct->_conf->mon_pg_warn_max_per_osd << ")";
2888 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
2889 if (detail)
2890 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
2891 }
2892 }
2893 if (!pg_stat.empty()) {
2894 for (auto p = pg_pool_sum.begin();
2895 p != pg_pool_sum.end();
2896 ++p) {
2897 const pg_pool_t *pi = osdmap.get_pg_pool(p->first);
2898 if (!pi)
2899 continue; // in case osdmap changes haven't propagated to PGMap yet
2900 const string& name = osdmap.get_pool_name(p->first);
2901 if (pi->get_pg_num() > pi->get_pgp_num() &&
2902 !(name.find(".DELETED") != string::npos &&
2903 cct->_conf->mon_fake_pool_delete)) {
2904 ostringstream ss;
2905 ss << "pool " << name << " pg_num "
2906 << pi->get_pg_num() << " > pgp_num " << pi->get_pgp_num();
2907 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
2908 if (detail)
2909 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
2910 }
2911 int average_objects_per_pg = pg_sum.stats.sum.num_objects / pg_stat.size();
2912 if (average_objects_per_pg > 0 &&
2913 pg_sum.stats.sum.num_objects >= cct->_conf->mon_pg_warn_min_objects &&
2914 p->second.stats.sum.num_objects >= cct->_conf->mon_pg_warn_min_pool_objects) {
2915 int objects_per_pg = p->second.stats.sum.num_objects / pi->get_pg_num();
2916 float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
2917 if (cct->_conf->mon_pg_warn_max_object_skew > 0 &&
2918 ratio > cct->_conf->mon_pg_warn_max_object_skew) {
2919 ostringstream ss;
2920 ss << "pool " << name << " has many more objects per pg than average (too few pgs?)";
2921 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
2922 if (detail) {
2923 ostringstream ss;
2924 ss << "pool " << name << " objects per pg ("
2925 << objects_per_pg << ") is more than " << ratio << " times cluster average ("
2926 << average_objects_per_pg << ")";
2927 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
2928 }
2929 }
2930 }
2931 }
2932 }
2933
2934 print_unscrubbed_pgs(pg_stat, summary, detail, cct);
2935 }
2936
2937 int process_pg_map_command(
2938 const string& orig_prefix,
2939 const map<string,cmd_vartype>& orig_cmdmap,
2940 const PGMap& pg_map,
2941 const OSDMap& osdmap,
2942 Formatter *f,
2943 stringstream *ss,
2944 bufferlist *odata)
2945 {
2946 string prefix = orig_prefix;
2947 map<string,cmd_vartype> cmdmap = orig_cmdmap;
2948
2949 // perhaps these would be better in the parsing, but it's weird
2950 bool primary = false;
2951 if (prefix == "pg dump_json") {
2952 vector<string> v;
2953 v.push_back(string("all"));
2954 cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
2955 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
2956 prefix = "pg dump";
2957 } else if (prefix == "pg dump_pools_json") {
2958 vector<string> v;
2959 v.push_back(string("pools"));
2960 cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
2961 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
2962 prefix = "pg dump";
2963 } else if (prefix == "pg ls-by-primary") {
2964 primary = true;
2965 prefix = "pg ls";
2966 } else if (prefix == "pg ls-by-osd") {
2967 prefix = "pg ls";
2968 } else if (prefix == "pg ls-by-pool") {
2969 prefix = "pg ls";
2970 string poolstr;
2971 cmd_getval(g_ceph_context, cmdmap, "poolstr", poolstr);
2972 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
2973 if (pool < 0) {
2974 *ss << "pool " << poolstr << " does not exist";
2975 return -ENOENT;
2976 }
2977 cmd_putval(g_ceph_context, cmdmap, "pool", pool);
2978 }
2979
2980 int r = 0;
2981 stringstream ds;
2982 if (prefix == "pg stat") {
2983 if (f) {
2984 f->open_object_section("pg_summary");
2985 pg_map.print_oneline_summary(f, NULL);
2986 f->close_section();
2987 f->flush(ds);
2988 } else {
2989 ds << pg_map;
2990 }
2991 odata->append(ds);
2992 return 0;
2993 }
2994
2995 if (prefix == "pg getmap") {
2996 pg_map.encode(*odata);
2997 *ss << "got pgmap version " << pg_map.version;
2998 return 0;
2999 }
3000
3001 if (prefix == "pg dump") {
3002 string val;
3003 vector<string> dumpcontents;
3004 set<string> what;
3005 if (cmd_getval(g_ceph_context, cmdmap, "dumpcontents", dumpcontents)) {
3006 copy(dumpcontents.begin(), dumpcontents.end(),
3007 inserter(what, what.end()));
3008 }
3009 if (what.empty())
3010 what.insert("all");
3011 if (f) {
3012 if (what.count("all")) {
3013 f->open_object_section("pg_map");
3014 pg_map.dump(f);
3015 f->close_section();
3016 } else if (what.count("summary") || what.count("sum")) {
3017 f->open_object_section("pg_map");
3018 pg_map.dump_basic(f);
3019 f->close_section();
3020 } else {
3021 if (what.count("pools")) {
3022 pg_map.dump_pool_stats(f);
3023 }
3024 if (what.count("osds")) {
3025 pg_map.dump_osd_stats(f);
3026 }
3027 if (what.count("pgs")) {
3028 pg_map.dump_pg_stats(f, false);
3029 }
3030 if (what.count("pgs_brief")) {
3031 pg_map.dump_pg_stats(f, true);
3032 }
3033 if (what.count("delta")) {
3034 f->open_object_section("delta");
3035 pg_map.dump_delta(f);
3036 f->close_section();
3037 }
3038 }
3039 f->flush(*odata);
3040 } else {
3041 if (what.count("all")) {
3042 pg_map.dump(ds);
3043 } else if (what.count("summary") || what.count("sum")) {
3044 pg_map.dump_basic(ds);
3045 pg_map.dump_pg_sum_stats(ds, true);
3046 pg_map.dump_osd_sum_stats(ds);
3047 } else {
3048 if (what.count("pgs_brief")) {
3049 pg_map.dump_pg_stats(ds, true);
3050 }
3051 bool header = true;
3052 if (what.count("pgs")) {
3053 pg_map.dump_pg_stats(ds, false);
3054 header = false;
3055 }
3056 if (what.count("pools")) {
3057 pg_map.dump_pool_stats(ds, header);
3058 }
3059 if (what.count("osds")) {
3060 pg_map.dump_osd_stats(ds);
3061 }
3062 }
3063 odata->append(ds);
3064 }
3065 *ss << "dumped " << what;
3066 return 0;
3067 }
3068
3069 if (prefix == "pg ls") {
3070 int64_t osd = -1;
3071 int64_t pool = -1;
3072 vector<string>states;
3073 set<pg_t> pgs;
3074 cmd_getval(g_ceph_context, cmdmap, "pool", pool);
3075 cmd_getval(g_ceph_context, cmdmap, "osd", osd);
3076 cmd_getval(g_ceph_context, cmdmap, "states", states);
3077 if (pool >= 0 && !osdmap.have_pg_pool(pool)) {
3078 *ss << "pool " << pool << " does not exist";
3079 return -ENOENT;
3080 }
3081 if (osd >= 0 && !osdmap.is_up(osd)) {
3082 *ss << "osd " << osd << " is not up";
3083 return -EAGAIN;
3084 }
3085 if (states.empty())
3086 states.push_back("all");
3087
3088 uint32_t state = 0;
3089
3090 while (!states.empty()) {
3091 string state_str = states.back();
3092
3093 if (state_str == "all") {
3094 state = -1;
3095 break;
3096 } else {
3097 int filter = pg_string_state(state_str);
3098 assert(filter != -1);
3099 state |= filter;
3100 }
3101
3102 states.pop_back();
3103 }
3104
3105 pg_map.get_filtered_pg_stats(state, pool, osd, primary, pgs);
3106
3107 if (f && !pgs.empty()) {
3108 pg_map.dump_filtered_pg_stats(f, pgs);
3109 f->flush(*odata);
3110 } else if (!pgs.empty()) {
3111 pg_map.dump_filtered_pg_stats(ds, pgs);
3112 odata->append(ds);
3113 }
3114 return 0;
3115 }
3116
3117 if (prefix == "pg dump_stuck") {
3118 vector<string> stuckop_vec;
3119 cmd_getval(g_ceph_context, cmdmap, "stuckops", stuckop_vec);
3120 if (stuckop_vec.empty())
3121 stuckop_vec.push_back("unclean");
3122 int64_t threshold;
3123 cmd_getval(g_ceph_context, cmdmap, "threshold", threshold,
3124 int64_t(g_conf->mon_pg_stuck_threshold));
3125
3126 r = pg_map.dump_stuck_pg_stats(ds, f, (int)threshold, stuckop_vec);
3127 odata->append(ds);
3128 if (r < 0)
3129 *ss << "failed";
3130 else
3131 *ss << "ok";
3132 return 0;
3133 }
3134
3135 if (prefix == "pg debug") {
3136 string debugop;
3137 cmd_getval(g_ceph_context, cmdmap, "debugop", debugop,
3138 string("unfound_objects_exist"));
3139 if (debugop == "unfound_objects_exist") {
3140 bool unfound_objects_exist = false;
3141 for (const auto& p : pg_map.pg_stat) {
3142 if (p.second.stats.sum.num_objects_unfound > 0) {
3143 unfound_objects_exist = true;
3144 break;
3145 }
3146 }
3147 if (unfound_objects_exist)
3148 ds << "TRUE";
3149 else
3150 ds << "FALSE";
3151 odata->append(ds);
3152 return 0;
3153 }
3154 if (debugop == "degraded_pgs_exist") {
3155 bool degraded_pgs_exist = false;
3156 for (const auto& p : pg_map.pg_stat) {
3157 if (p.second.stats.sum.num_objects_degraded > 0) {
3158 degraded_pgs_exist = true;
3159 break;
3160 }
3161 }
3162 if (degraded_pgs_exist)
3163 ds << "TRUE";
3164 else
3165 ds << "FALSE";
3166 odata->append(ds);
3167 return 0;
3168 }
3169 }
3170
3171 if (prefix == "osd perf") {
3172 if (f) {
3173 f->open_object_section("osdstats");
3174 pg_map.dump_osd_perf_stats(f);
3175 f->close_section();
3176 f->flush(ds);
3177 } else {
3178 pg_map.print_osd_perf_stats(&ds);
3179 }
3180 odata->append(ds);
3181 return 0;
3182 }
3183
3184 if (prefix == "osd blocked-by") {
3185 if (f) {
3186 f->open_object_section("osd_blocked_by");
3187 pg_map.dump_osd_blocked_by_stats(f);
3188 f->close_section();
3189 f->flush(ds);
3190 } else {
3191 pg_map.print_osd_blocked_by_stats(&ds);
3192 }
3193 odata->append(ds);
3194 return 0;
3195 }
3196
3197 if (prefix == "osd pool stats") {
3198 string pool_name;
3199 cmd_getval(g_ceph_context, cmdmap, "name", pool_name);
3200
3201 int64_t poolid = -ENOENT;
3202 bool one_pool = false;
3203 if (!pool_name.empty()) {
3204 poolid = osdmap.lookup_pg_pool_name(pool_name);
3205 if (poolid < 0) {
3206 assert(poolid == -ENOENT);
3207 *ss << "unrecognized pool '" << pool_name << "'";
3208 return -ENOENT;
3209 }
3210 one_pool = true;
3211 }
3212
3213 stringstream rs;
3214
3215 if (f)
3216 f->open_array_section("pool_stats");
3217 else {
3218 if (osdmap.get_pools().empty()) {
3219 *ss << "there are no pools!";
3220 goto stats_out;
3221 }
3222 }
3223
3224 for (auto& p : osdmap.get_pools()) {
3225 if (!one_pool)
3226 poolid = p.first;
3227
3228 pool_name = osdmap.get_pool_name(poolid);
3229
3230 if (f) {
3231 f->open_object_section("pool");
3232 f->dump_string("pool_name", pool_name.c_str());
3233 f->dump_int("pool_id", poolid);
3234 f->open_object_section("recovery");
3235 }
3236
3237 list<string> sl;
3238 stringstream tss;
3239 pg_map.pool_recovery_summary(f, &sl, poolid);
3240 if (!f && !sl.empty()) {
3241 for (auto& p : sl)
3242 tss << " " << p << "\n";
3243 }
3244
3245 if (f) {
3246 f->close_section();
3247 f->open_object_section("recovery_rate");
3248 }
3249
3250 ostringstream rss;
3251 pg_map.pool_recovery_rate_summary(f, &rss, poolid);
3252 if (!f && !rss.str().empty())
3253 tss << " recovery io " << rss.str() << "\n";
3254
3255 if (f) {
3256 f->close_section();
3257 f->open_object_section("client_io_rate");
3258 }
3259 rss.clear();
3260 rss.str("");
3261
3262 pg_map.pool_client_io_rate_summary(f, &rss, poolid);
3263 if (!f && !rss.str().empty())
3264 tss << " client io " << rss.str() << "\n";
3265
3266 // dump cache tier IO rate for cache pool
3267 const pg_pool_t *pool = osdmap.get_pg_pool(poolid);
3268 if (pool->is_tier()) {
3269 if (f) {
3270 f->close_section();
3271 f->open_object_section("cache_io_rate");
3272 }
3273 rss.clear();
3274 rss.str("");
3275
3276 pg_map.pool_cache_io_rate_summary(f, &rss, poolid);
3277 if (!f && !rss.str().empty())
3278 tss << " cache tier io " << rss.str() << "\n";
3279 }
3280 if (f) {
3281 f->close_section();
3282 f->close_section();
3283 } else {
3284 rs << "pool " << pool_name << " id " << poolid << "\n";
3285 if (!tss.str().empty())
3286 rs << tss.str() << "\n";
3287 else
3288 rs << " nothing is going on\n\n";
3289 }
3290 if (one_pool)
3291 break;
3292 }
3293
3294 stats_out:
3295 if (f) {
3296 f->close_section();
3297 f->flush(ds);
3298 odata->append(ds);
3299 } else {
3300 odata->append(rs.str());
3301 }
3302 return 0;
3303 }
3304
3305 return -EOPNOTSUPP;
3306 }
3307
3308 void PGMapUpdater::check_osd_map(const OSDMap::Incremental &osd_inc,
3309 std::set<int> *need_check_down_pg_osds,
3310 std::map<int,utime_t> *last_osd_report,
3311 PGMap *pg_map,
3312 PGMap::Incremental *pending_inc)
3313 {
3314 for (const auto &p : osd_inc.new_weight) {
3315 if (p.second == CEPH_OSD_OUT) {
3316 dout(10) << __func__ << " osd." << p.first << " went OUT" << dendl;
3317 auto j = pg_map->osd_epochs.find(p.first);
3318 if (j != pg_map->osd_epochs.end())
3319 pending_inc->stat_osd_out(p.first, j->second);
3320 }
3321 }
3322
3323 // this is conservative: we want to know if any osds (maybe) got marked down.
3324 for (const auto &p : osd_inc.new_state) {
3325 if (p.second & CEPH_OSD_UP) { // true if marked up OR down,
3326 // but we're too lazy to check
3327 // which
3328 need_check_down_pg_osds->insert(p.first);
3329
3330 // clear out the last_osd_report for this OSD
3331 auto report = last_osd_report->find(p.first);
3332 if (report != last_osd_report->end()) {
3333 last_osd_report->erase(report);
3334 }
3335
3336 // clear out osd_stat slow request histogram
3337 dout(20) << __func__ << " clearing osd." << p.first
3338 << " request histogram" << dendl;
3339 pending_inc->stat_osd_down_up(p.first, osd_inc.epoch, *pg_map);
3340 }
3341
3342 if (p.second & CEPH_OSD_EXISTS) {
3343 // whether it was created *or* destroyed, we can safely drop
3344 // it's osd_stat_t record.
3345 dout(10) << __func__ << " osd." << p.first
3346 << " created or destroyed" << dendl;
3347 pending_inc->rm_stat(p.first);
3348
3349 // and adjust full, nearfull set
3350 pg_map->nearfull_osds.erase(p.first);
3351 pg_map->full_osds.erase(p.first);
3352 }
3353 }
3354 }
3355
3356 void PGMapUpdater::check_osd_map(
3357 CephContext *cct,
3358 const OSDMap& osdmap,
3359 const PGMap& pgmap,
3360 PGMap::Incremental *pending_inc)
3361 {
3362 for (auto& p : pgmap.osd_stat) {
3363 if (!osdmap.exists(p.first)) {
3364 // remove osd_stat
3365 pending_inc->rm_stat(p.first);
3366 } else if (osdmap.is_out(p.first)) {
3367 // zero osd_stat
3368 if (p.second.kb != 0) {
3369 auto j = pgmap.osd_epochs.find(p.first);
3370 if (j != pgmap.osd_epochs.end()) {
3371 pending_inc->stat_osd_out(p.first, j->second);
3372 }
3373 }
3374 } else if (!osdmap.is_up(p.first)) {
3375 // zero the op_queue_age_hist
3376 if (!p.second.op_queue_age_hist.empty()) {
3377 pending_inc->stat_osd_down_up(p.first, osdmap.get_epoch(), pgmap);
3378 }
3379 }
3380 }
3381
3382 // deleted pgs (pools)?
3383 for (auto& p : pgmap.pg_pool_sum) {
3384 if (!osdmap.have_pg_pool(p.first)) {
3385 ldout(cct, 10) << __func__ << " pool " << p.first << " gone, removing pgs"
3386 << dendl;
3387 for (auto& q : pgmap.pg_stat) {
3388 if (q.first.pool() == (uint64_t)p.first) {
3389 pending_inc->pg_remove.insert(q.first);
3390 }
3391 }
3392 auto q = pending_inc->pg_stat_updates.begin();
3393 while (q != pending_inc->pg_stat_updates.end()) {
3394 if (q->first.pool() == (uint64_t)p.first) {
3395 q = pending_inc->pg_stat_updates.erase(q);
3396 } else {
3397 ++q;
3398 }
3399 }
3400 }
3401 }
3402
3403 // new pgs (split or new pool)?
3404 for (auto& p : osdmap.get_pools()) {
3405 int64_t poolid = p.first;
3406 const pg_pool_t& pi = p.second;
3407 auto q = pgmap.num_pg_by_pool.find(poolid);
3408 unsigned my_pg_num = 0;
3409 if (q != pgmap.num_pg_by_pool.end())
3410 my_pg_num = q->second;
3411 unsigned pg_num = pi.get_pg_num();
3412 if (my_pg_num != pg_num) {
3413 for (unsigned ps = my_pg_num; ps < pg_num; ++ps) {
3414 pg_t pgid(ps, poolid);
3415 if (pending_inc->pg_stat_updates.count(pgid) == 0) {
3416 pg_stat_t &stats = pending_inc->pg_stat_updates[pgid];
3417 stats.last_fresh = osdmap.get_modified();
3418 stats.last_active = osdmap.get_modified();
3419 stats.last_change = osdmap.get_modified();
3420 stats.last_peered = osdmap.get_modified();
3421 stats.last_clean = osdmap.get_modified();
3422 stats.last_unstale = osdmap.get_modified();
3423 stats.last_undegraded = osdmap.get_modified();
3424 stats.last_fullsized = osdmap.get_modified();
3425 stats.last_scrub_stamp = osdmap.get_modified();
3426 stats.last_deep_scrub_stamp = osdmap.get_modified();
3427 stats.last_clean_scrub_stamp = osdmap.get_modified();
3428 }
3429 }
3430 }
3431 }
3432 }
3433
3434 void PGMapUpdater::register_pg(
3435 const OSDMap &osd_map,
3436 pg_t pgid, epoch_t epoch,
3437 bool new_pool,
3438 const PGMap &pg_map,
3439 PGMap::Incremental *pending_inc)
3440 {
3441 pg_t parent;
3442 int split_bits = 0;
3443 auto parent_stat = pg_map.pg_stat.end();
3444 if (!new_pool) {
3445 parent = pgid;
3446 while (1) {
3447 // remove most significant bit
3448 int msb = cbits(parent.ps());
3449 if (!msb)
3450 break;
3451 parent.set_ps(parent.ps() & ~(1<<(msb-1)));
3452 split_bits++;
3453 dout(30) << " is " << pgid << " parent " << parent << " ?" << dendl;
3454 parent_stat = pg_map.pg_stat.find(parent);
3455 if (parent_stat != pg_map.pg_stat.end() &&
3456 parent_stat->second.state != PG_STATE_CREATING) {
3457 dout(10) << " parent is " << parent << dendl;
3458 break;
3459 }
3460 }
3461 }
3462
3463 pg_stat_t &stats = pending_inc->pg_stat_updates[pgid];
3464 stats.state = PG_STATE_CREATING;
3465 stats.created = epoch;
3466 stats.parent = parent;
3467 stats.parent_split_bits = split_bits;
3468 stats.mapping_epoch = epoch;
3469
3470 if (parent_stat != pg_map.pg_stat.end()) {
3471 const pg_stat_t &ps = parent_stat->second;
3472 stats.last_fresh = ps.last_fresh;
3473 stats.last_active = ps.last_active;
3474 stats.last_change = ps.last_change;
3475 stats.last_peered = ps.last_peered;
3476 stats.last_clean = ps.last_clean;
3477 stats.last_unstale = ps.last_unstale;
3478 stats.last_undegraded = ps.last_undegraded;
3479 stats.last_fullsized = ps.last_fullsized;
3480 stats.last_scrub_stamp = ps.last_scrub_stamp;
3481 stats.last_deep_scrub_stamp = ps.last_deep_scrub_stamp;
3482 stats.last_clean_scrub_stamp = ps.last_clean_scrub_stamp;
3483 } else {
3484 utime_t now = osd_map.get_modified();
3485 stats.last_fresh = now;
3486 stats.last_active = now;
3487 stats.last_change = now;
3488 stats.last_peered = now;
3489 stats.last_clean = now;
3490 stats.last_unstale = now;
3491 stats.last_undegraded = now;
3492 stats.last_fullsized = now;
3493 stats.last_scrub_stamp = now;
3494 stats.last_deep_scrub_stamp = now;
3495 stats.last_clean_scrub_stamp = now;
3496 }
3497
3498 osd_map.pg_to_up_acting_osds(
3499 pgid,
3500 &stats.up,
3501 &stats.up_primary,
3502 &stats.acting,
3503 &stats.acting_primary);
3504
3505 if (split_bits == 0) {
3506 dout(10) << __func__ << " will create " << pgid
3507 << " primary " << stats.acting_primary
3508 << " acting " << stats.acting
3509 << dendl;
3510 } else {
3511 dout(10) << __func__ << " will create " << pgid
3512 << " primary " << stats.acting_primary
3513 << " acting " << stats.acting
3514 << " parent " << parent
3515 << " by " << split_bits << " bits"
3516 << dendl;
3517 }
3518 }
3519
3520 void PGMapUpdater::register_new_pgs(
3521 const OSDMap &osd_map,
3522 const PGMap &pg_map,
3523 PGMap::Incremental *pending_inc)
3524 {
3525 epoch_t epoch = osd_map.get_epoch();
3526 dout(10) << __func__ << " checking pg pools for osdmap epoch " << epoch
3527 << ", last_pg_scan " << pg_map.last_pg_scan << dendl;
3528
3529 int created = 0;
3530 const auto &pools = osd_map.get_pools();
3531
3532 for (const auto &p : pools) {
3533 int64_t poolid = p.first;
3534 const pg_pool_t &pool = p.second;
3535 int ruleno = osd_map.crush->find_rule(pool.get_crush_rule(),
3536 pool.get_type(), pool.get_size());
3537 if (ruleno < 0 || !osd_map.crush->rule_exists(ruleno))
3538 continue;
3539
3540 if (pool.get_last_change() <= pg_map.last_pg_scan ||
3541 pool.get_last_change() <= pending_inc->pg_scan) {
3542 dout(10) << " no change in pool " << poolid << " " << pool << dendl;
3543 continue;
3544 }
3545
3546 dout(10) << __func__ << " scanning pool " << poolid
3547 << " " << pool << dendl;
3548
3549 // first pgs in this pool
3550 bool new_pool = pg_map.pg_pool_sum.count(poolid) == 0;
3551
3552 for (ps_t ps = 0; ps < pool.get_pg_num(); ps++) {
3553 pg_t pgid(ps, poolid, -1);
3554 if (pg_map.pg_stat.count(pgid)) {
3555 dout(20) << "register_new_pgs have " << pgid << dendl;
3556 continue;
3557 }
3558 created++;
3559 register_pg(osd_map, pgid, pool.get_last_change(), new_pool,
3560 pg_map, pending_inc);
3561 }
3562 }
3563
3564 int removed = 0;
3565 for (const auto &p : pg_map.creating_pgs) {
3566 if (p.preferred() >= 0) {
3567 dout(20) << " removing creating_pg " << p
3568 << " because it is localized and obsolete" << dendl;
3569 pending_inc->pg_remove.insert(p);
3570 ++removed;
3571 } else if (!osd_map.have_pg_pool(p.pool())) {
3572 dout(20) << " removing creating_pg " << p
3573 << " because containing pool deleted" << dendl;
3574 pending_inc->pg_remove.insert(p);
3575 ++removed;
3576 }
3577 }
3578
3579 // deleted pools?
3580 for (const auto &p : pg_map.pg_stat) {
3581 if (!osd_map.have_pg_pool(p.first.pool())) {
3582 dout(20) << " removing pg_stat " << p.first << " because "
3583 << "containing pool deleted" << dendl;
3584 pending_inc->pg_remove.insert(p.first);
3585 ++removed;
3586 } else if (p.first.preferred() >= 0) {
3587 dout(20) << " removing localized pg " << p.first << dendl;
3588 pending_inc->pg_remove.insert(p.first);
3589 ++removed;
3590 }
3591 }
3592
3593 // we don't want to redo this work if we can avoid it.
3594 pending_inc->pg_scan = epoch;
3595
3596 dout(10) << "register_new_pgs registered " << created << " new pgs, removed "
3597 << removed << " uncreated pgs" << dendl;
3598 }
3599
3600
3601 void PGMapUpdater::update_creating_pgs(
3602 const OSDMap &osd_map,
3603 const PGMap &pg_map,
3604 PGMap::Incremental *pending_inc)
3605 {
3606 dout(10) << __func__ << " to " << pg_map.creating_pgs.size()
3607 << " pgs, osdmap epoch " << osd_map.get_epoch()
3608 << dendl;
3609
3610 unsigned changed = 0;
3611 for (auto p = pg_map.creating_pgs.begin();
3612 p != pg_map.creating_pgs.end();
3613 ++p) {
3614 pg_t pgid = *p;
3615 pg_t on = pgid;
3616 auto q = pg_map.pg_stat.find(pgid);
3617 assert(q != pg_map.pg_stat.end());
3618 const pg_stat_t *s = &q->second;
3619
3620 if (s->parent_split_bits)
3621 on = s->parent;
3622
3623 vector<int> up, acting;
3624 int up_primary, acting_primary;
3625 osd_map.pg_to_up_acting_osds(
3626 on,
3627 &up,
3628 &up_primary,
3629 &acting,
3630 &acting_primary);
3631
3632 if (up != s->up ||
3633 up_primary != s->up_primary ||
3634 acting != s->acting ||
3635 acting_primary != s->acting_primary) {
3636 pg_stat_t *ns = &pending_inc->pg_stat_updates[pgid];
3637 if (osd_map.get_epoch() > ns->reported_epoch) {
3638 dout(20) << __func__ << " " << pgid << " "
3639 << " acting_primary: " << s->acting_primary
3640 << " -> " << acting_primary
3641 << " acting: " << s->acting << " -> " << acting
3642 << " up_primary: " << s->up_primary << " -> " << up_primary
3643 << " up: " << s->up << " -> " << up
3644 << dendl;
3645
3646 // only initialize if it wasn't already a pending update
3647 if (ns->reported_epoch == 0)
3648 *ns = *s;
3649
3650 // note epoch if the target of the create message changed
3651 if (acting_primary != ns->acting_primary)
3652 ns->mapping_epoch = osd_map.get_epoch();
3653
3654 ns->up = up;
3655 ns->up_primary = up_primary;
3656 ns->acting = acting;
3657 ns->acting_primary = acting_primary;
3658
3659 ++changed;
3660 } else {
3661 dout(20) << __func__ << " " << pgid << " has pending update from newer"
3662 << " epoch " << ns->reported_epoch
3663 << dendl;
3664 }
3665 }
3666 }
3667 if (changed) {
3668 dout(10) << __func__ << " " << changed << " pgs changed primary" << dendl;
3669 }
3670 }
3671
3672 static void _try_mark_pg_stale(
3673 const OSDMap& osdmap,
3674 pg_t pgid,
3675 const pg_stat_t& cur,
3676 PGMap::Incremental *pending_inc)
3677 {
3678 if ((cur.state & PG_STATE_STALE) == 0 &&
3679 cur.acting_primary != -1 &&
3680 osdmap.is_down(cur.acting_primary)) {
3681 pg_stat_t *newstat;
3682 auto q = pending_inc->pg_stat_updates.find(pgid);
3683 if (q != pending_inc->pg_stat_updates.end()) {
3684 if ((q->second.acting_primary == cur.acting_primary) ||
3685 ((q->second.state & PG_STATE_STALE) == 0 &&
3686 q->second.acting_primary != -1 &&
3687 osdmap.is_down(q->second.acting_primary))) {
3688 newstat = &q->second;
3689 } else {
3690 // pending update is no longer down or already stale
3691 return;
3692 }
3693 } else {
3694 newstat = &pending_inc->pg_stat_updates[pgid];
3695 *newstat = cur;
3696 }
3697 dout(10) << __func__ << " marking pg " << pgid
3698 << " stale (acting_primary " << newstat->acting_primary
3699 << ")" << dendl;
3700 newstat->state |= PG_STATE_STALE;
3701 newstat->last_unstale = ceph_clock_now();
3702 }
3703 }
3704
3705 void PGMapUpdater::check_down_pgs(
3706 const OSDMap &osdmap,
3707 const PGMap &pg_map,
3708 bool check_all,
3709 const set<int>& need_check_down_pg_osds,
3710 PGMap::Incremental *pending_inc)
3711 {
3712 // if a large number of osds changed state, just iterate over the whole
3713 // pg map.
3714 if (need_check_down_pg_osds.size() > (unsigned)osdmap.get_num_osds() *
3715 g_conf->mon_pg_check_down_all_threshold) {
3716 check_all = true;
3717 }
3718
3719 if (check_all) {
3720 for (const auto& p : pg_map.pg_stat) {
3721 _try_mark_pg_stale(osdmap, p.first, p.second, pending_inc);
3722 }
3723 } else {
3724 for (auto osd : need_check_down_pg_osds) {
3725 if (osdmap.is_down(osd)) {
3726 auto p = pg_map.pg_by_osd.find(osd);
3727 if (p == pg_map.pg_by_osd.end()) {
3728 continue;
3729 }
3730 for (auto pgid : p->second) {
3731 const pg_stat_t &stat = pg_map.pg_stat.at(pgid);
3732 assert(stat.acting_primary == osd);
3733 _try_mark_pg_stale(osdmap, pgid, stat, pending_inc);
3734 }
3735 }
3736 }
3737 }
3738 }
3739
3740 int reweight::by_utilization(
3741 const OSDMap &osdmap,
3742 const PGMap &pgm,
3743 int oload,
3744 double max_changef,
3745 int max_osds,
3746 bool by_pg, const set<int64_t> *pools,
3747 bool no_increasing,
3748 mempool::osdmap::map<int32_t, uint32_t>* new_weights,
3749 std::stringstream *ss,
3750 std::string *out_str,
3751 Formatter *f)
3752 {
3753 if (oload <= 100) {
3754 *ss << "You must give a percentage higher than 100. "
3755 "The reweighting threshold will be calculated as <average-utilization> "
3756 "times <input-percentage>. For example, an argument of 200 would "
3757 "reweight OSDs which are twice as utilized as the average OSD.\n";
3758 return -EINVAL;
3759 }
3760
3761 vector<int> pgs_by_osd(osdmap.get_max_osd());
3762
3763 // Avoid putting a small number (or 0) in the denominator when calculating
3764 // average_util
3765 double average_util;
3766 if (by_pg) {
3767 // by pg mapping
3768 double weight_sum = 0.0; // sum up the crush weights
3769 unsigned num_pg_copies = 0;
3770 int num_osds = 0;
3771 for (const auto& pg : pgm.pg_stat) {
3772 if (pools && pools->count(pg.first.pool()) == 0)
3773 continue;
3774 for (const auto acting : pg.second.acting) {
3775 if (acting >= (int)pgs_by_osd.size())
3776 pgs_by_osd.resize(acting);
3777 if (pgs_by_osd[acting] == 0) {
3778 if (osdmap.crush->get_item_weightf(acting) <= 0) {
3779 //skip if we currently can not identify item
3780 continue;
3781 }
3782 weight_sum += osdmap.crush->get_item_weightf(acting);
3783 ++num_osds;
3784 }
3785 ++pgs_by_osd[acting];
3786 ++num_pg_copies;
3787 }
3788 }
3789
3790 if (!num_osds || (num_pg_copies / num_osds < g_conf->mon_reweight_min_pgs_per_osd)) {
3791 *ss << "Refusing to reweight: we only have " << num_pg_copies
3792 << " PGs across " << num_osds << " osds!\n";
3793 return -EDOM;
3794 }
3795
3796 average_util = (double)num_pg_copies / weight_sum;
3797 } else {
3798 // by osd utilization
3799 int num_osd = MAX(1, pgm.osd_stat.size());
3800 if ((uint64_t)pgm.osd_sum.kb * 1024 / num_osd
3801 < g_conf->mon_reweight_min_bytes_per_osd) {
3802 *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb
3803 << " kb across all osds!\n";
3804 return -EDOM;
3805 }
3806 if ((uint64_t)pgm.osd_sum.kb_used * 1024 / num_osd
3807 < g_conf->mon_reweight_min_bytes_per_osd) {
3808 *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb_used
3809 << " kb used across all osds!\n";
3810 return -EDOM;
3811 }
3812
3813 average_util = (double)pgm.osd_sum.kb_used / (double)pgm.osd_sum.kb;
3814 }
3815
3816 // adjust down only if we are above the threshold
3817 const double overload_util = average_util * (double)oload / 100.0;
3818
3819 // but aggressively adjust weights up whenever possible.
3820 const double underload_util = average_util;
3821
3822 const unsigned max_change = (unsigned)(max_changef * (double)0x10000);
3823
3824 ostringstream oss;
3825 if (f) {
3826 f->open_object_section("reweight_by_utilization");
3827 f->dump_int("overload_min", oload);
3828 f->dump_float("max_change", max_changef);
3829 f->dump_int("max_change_osds", max_osds);
3830 f->dump_float("average_utilization", average_util);
3831 f->dump_float("overload_utilization", overload_util);
3832 } else {
3833 oss << "oload " << oload << "\n";
3834 oss << "max_change " << max_changef << "\n";
3835 oss << "max_change_osds " << max_osds << "\n";
3836 oss.precision(4);
3837 oss << "average_utilization " << std::fixed << average_util << "\n";
3838 oss << "overload_utilization " << overload_util << "\n";
3839 }
3840 int num_changed = 0;
3841
3842 // precompute util for each OSD
3843 std::vector<std::pair<int, float> > util_by_osd;
3844 for (const auto& p : pgm.osd_stat) {
3845 std::pair<int, float> osd_util;
3846 osd_util.first = p.first;
3847 if (by_pg) {
3848 if (p.first >= (int)pgs_by_osd.size() ||
3849 pgs_by_osd[p.first] == 0) {
3850 // skip if this OSD does not contain any pg
3851 // belonging to the specified pool(s).
3852 continue;
3853 }
3854
3855 if (osdmap.crush->get_item_weightf(p.first) <= 0) {
3856 // skip if we are unable to locate item.
3857 continue;
3858 }
3859
3860 osd_util.second = pgs_by_osd[p.first] / osdmap.crush->get_item_weightf(p.first);
3861 } else {
3862 osd_util.second = (double)p.second.kb_used / (double)p.second.kb;
3863 }
3864 util_by_osd.push_back(osd_util);
3865 }
3866
3867 // sort by absolute deviation from the mean utilization,
3868 // in descending order.
3869 std::sort(util_by_osd.begin(), util_by_osd.end(),
3870 [average_util](std::pair<int, float> l, std::pair<int, float> r) {
3871 return abs(l.second - average_util) > abs(r.second - average_util);
3872 }
3873 );
3874
3875 if (f)
3876 f->open_array_section("reweights");
3877
3878 for (const auto& p : util_by_osd) {
3879 unsigned weight = osdmap.get_weight(p.first);
3880 if (weight == 0) {
3881 // skip if OSD is currently out
3882 continue;
3883 }
3884 float util = p.second;
3885
3886 if (util >= overload_util) {
3887 // Assign a lower weight to overloaded OSDs. The current weight
3888 // is a factor to take into account the original weights,
3889 // to represent e.g. differing storage capacities
3890 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
3891 if (weight > max_change)
3892 new_weight = MAX(new_weight, weight - max_change);
3893 new_weights->insert({p.first, new_weight});
3894 if (f) {
3895 f->open_object_section("osd");
3896 f->dump_int("osd", p.first);
3897 f->dump_float("weight", (float)weight / (float)0x10000);
3898 f->dump_float("new_weight", (float)new_weight / (float)0x10000);
3899 f->close_section();
3900 } else {
3901 oss << "osd." << p.first << " weight "
3902 << (float)weight / (float)0x10000 << " -> "
3903 << (float)new_weight / (float)0x10000 << "\n";
3904 }
3905 if (++num_changed >= max_osds)
3906 break;
3907 }
3908 if (!no_increasing && util <= underload_util) {
3909 // assign a higher weight.. if we can.
3910 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
3911 new_weight = MIN(new_weight, weight + max_change);
3912 if (new_weight > 0x10000)
3913 new_weight = 0x10000;
3914 if (new_weight > weight) {
3915 new_weights->insert({p.first, new_weight});
3916 oss << "osd." << p.first << " weight "
3917 << (float)weight / (float)0x10000 << " -> "
3918 << (float)new_weight / (float)0x10000 << "\n";
3919 if (++num_changed >= max_osds)
3920 break;
3921 }
3922 }
3923 }
3924 if (f) {
3925 f->close_section();
3926 }
3927
3928 OSDMap newmap;
3929 newmap.deepish_copy_from(osdmap);
3930 OSDMap::Incremental newinc;
3931 newinc.fsid = newmap.get_fsid();
3932 newinc.epoch = newmap.get_epoch() + 1;
3933 newinc.new_weight = *new_weights;
3934 newmap.apply_incremental(newinc);
3935
3936 osdmap.summarize_mapping_stats(&newmap, pools, out_str, f);
3937
3938 if (f) {
3939 f->close_section();
3940 } else {
3941 *out_str += "\n";
3942 *out_str += oss.str();
3943 }
3944 return num_changed;
3945 }