]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/PGMap.cc
update sources to v12.1.3
[ceph.git] / ceph / src / mon / PGMap.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
224ce89b
WB
4#include <boost/algorithm/string.hpp>
5
7c673cae
FG
6#include "PGMap.h"
7
8#define dout_subsys ceph_subsys_mon
9#include "common/debug.h"
10#include "common/Formatter.h"
11#include "include/ceph_features.h"
12#include "include/stringify.h"
13
14#include "osd/osd_types.h"
15#include "osd/OSDMap.h"
16
17#define dout_context g_ceph_context
18
31f18b77
FG
19MEMPOOL_DEFINE_OBJECT_FACTORY(PGMapDigest, pgmap_digest, pgmap);
20MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap, pgmap, pgmap);
21MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap::Incremental, pgmap_inc, pgmap);
22
23
24// ---------------------
25// PGMapDigest
26
// Serialize the digest into `bl`.
//
// All fields are written between ENCODE_START(1, 1, bl) and
// ENCODE_FINISH(bl).  `features` is forwarded only to the pool-stat members
// (pg_pool_sum, pg_sum, per_pool_sum_delta, pg_sum_delta); every other
// field is encoded without it.
//
// The field order below is the same order PGMapDigest::decode() reads them
// back in; the two functions must be changed together.
27void PGMapDigest::encode(bufferlist& bl, uint64_t features) const
28{
29 // NOTE: see PGMap::encode_digest
30 ENCODE_START(1, 1, bl);
31 ::encode(num_pg, bl);
32 ::encode(num_pg_active, bl);
33 ::encode(num_pg_unknown, bl);
34 ::encode(num_osd, bl);
35 ::encode(pg_pool_sum, bl, features);
36 ::encode(pg_sum, bl, features);
37 ::encode(osd_sum, bl);
38 ::encode(num_pg_by_state, bl);
39 ::encode(num_pg_by_osd, bl);
40 ::encode(num_pg_by_pool, bl);
41 ::encode(osd_last_seq, bl);
42 ::encode(per_pool_sum_delta, bl, features);
43 ::encode(per_pool_sum_deltas_stamps, bl);
44 ::encode(pg_sum_delta, bl, features);
45 ::encode(stamp_delta, bl);
46 ::encode(avail_space_by_rule, bl);
7c673cae
FG
47 ENCODE_FINISH(bl);
48}
49
31f18b77
FG
// Deserialize the digest from the buffer iterator `p`.
//
// Reads the fields between DECODE_START(1, p) and DECODE_FINISH(p) in
// exactly the order PGMapDigest::encode() writes them; keep the two
// functions in sync when adding or reordering fields.
50void PGMapDigest::decode(bufferlist::iterator& p)
51{
52 DECODE_START(1, p);
53 ::decode(num_pg, p);
54 ::decode(num_pg_active, p);
55 ::decode(num_pg_unknown, p);
56 ::decode(num_osd, p);
57 ::decode(pg_pool_sum, p);
58 ::decode(pg_sum, p);
59 ::decode(osd_sum, p);
60 ::decode(num_pg_by_state, p);
61 ::decode(num_pg_by_osd, p);
62 ::decode(num_pg_by_pool, p);
63 ::decode(osd_last_seq, p);
64 ::decode(per_pool_sum_delta, p);
65 ::decode(per_pool_sum_deltas_stamps, p);
66 ::decode(pg_sum_delta, p);
67 ::decode(stamp_delta, p);
68 ::decode(avail_space_by_rule, p);
69 DECODE_FINISH(p);
70}
71
72void PGMapDigest::dump(Formatter *f) const
73{
74 f->dump_unsigned("num_pg", num_pg);
75 f->dump_unsigned("num_pg_active", num_pg_active);
76 f->dump_unsigned("num_pg_unknown", num_pg_unknown);
77 f->dump_unsigned("num_osd", num_osd);
78 f->dump_object("pool_sum", pg_sum);
79 f->dump_object("osd_sum", osd_sum);
80 f->open_array_section("pool_stats");
81 for (auto& p : pg_pool_sum) {
82 f->open_object_section("pool_stat");
83 f->dump_int("poolid", p.first);
84 auto q = num_pg_by_pool.find(p.first);
85 if (q != num_pg_by_pool.end())
86 f->dump_unsigned("num_pg", q->second);
87 p.second.dump(f);
7c673cae
FG
88 f->close_section();
89 }
90 f->close_section();
31f18b77
FG
91 f->open_array_section("osd_stats");
92 int i = 0;
93 // TODO: this isn't really correct since we can dump non-existent OSDs
94 // I dunno what osd_last_seq is set to in that case...
95 for (auto& p : osd_last_seq) {
7c673cae 96 f->open_object_section("osd_stat");
31f18b77
FG
97 f->dump_int("osd", i);
98 f->dump_unsigned("seq", p);
7c673cae 99 f->close_section();
31f18b77 100 ++i;
7c673cae
FG
101 }
102 f->close_section();
31f18b77
FG
103 f->open_array_section("num_pg_by_state");
104 for (auto& p : num_pg_by_state) {
105 f->open_object_section("count");
106 f->dump_string("state", pg_state_string(p.first));
107 f->dump_unsigned("num", p.second);
108 f->close_section();
109 }
7c673cae 110 f->close_section();
31f18b77
FG
111 f->open_array_section("num_pg_by_osd");
112 for (auto& p : num_pg_by_osd) {
113 f->open_object_section("count");
114 f->dump_unsigned("osd", p.first);
115 f->dump_unsigned("num_primary_pg", p.second.primary);
116 f->dump_unsigned("num_acting_pg", p.second.acting);
117 f->dump_unsigned("num_up_pg", p.second.up);
118 f->close_section();
119 }
7c673cae
FG
120 f->close_section();
121}
122
31f18b77 123void PGMapDigest::generate_test_instances(list<PGMapDigest*>& ls)
7c673cae 124{
31f18b77 125 ls.push_back(new PGMapDigest);
7c673cae
FG
126}
127
31f18b77
FG
// Format a percentage for table output: anything below 0.01 (including
// negative values) collapses to "0", everything else is printed in fixed
// notation with two decimals.
inline std::string percentify(const float& a) {
  if (a < 0.01) {
    return "0";
  }
  std::ostringstream formatted;
  formatted << std::fixed << std::setprecision(2) << a;
  return formatted.str();
}
7c673cae 136
31f18b77 137void PGMapDigest::print_summary(Formatter *f, ostream *out) const
7c673cae 138{
31f18b77
FG
139 if (f)
140 f->open_array_section("pgs_by_state");
7c673cae 141
31f18b77
FG
142 // list is descending numeric order (by count)
143 multimap<int,int> state_by_count; // count -> state
144 for (auto p = num_pg_by_state.begin();
145 p != num_pg_by_state.end();
146 ++p) {
147 state_by_count.insert(make_pair(p->second, p->first));
7c673cae 148 }
31f18b77
FG
149 if (f) {
150 for (auto p = state_by_count.rbegin();
151 p != state_by_count.rend();
152 ++p)
153 {
154 f->open_object_section("pgs_by_state_element");
155 f->dump_string("state_name", pg_state_string(p->second));
156 f->dump_unsigned("count", p->first);
157 f->close_section();
158 }
7c673cae 159 }
31f18b77
FG
160 if (f)
161 f->close_section();
7c673cae 162
31f18b77
FG
163 if (f) {
164 f->dump_unsigned("num_pgs", num_pg);
165 f->dump_unsigned("num_pools", pg_pool_sum.size());
166 f->dump_unsigned("num_objects", pg_sum.stats.sum.num_objects);
167 f->dump_unsigned("data_bytes", pg_sum.stats.sum.num_bytes);
168 f->dump_unsigned("bytes_used", osd_sum.kb_used * 1024ull);
169 f->dump_unsigned("bytes_avail", osd_sum.kb_avail * 1024ull);
170 f->dump_unsigned("bytes_total", osd_sum.kb * 1024ull);
171 } else {
172 *out << " pools: " << pg_pool_sum.size() << " pools, "
173 << num_pg << " pgs\n";
174 *out << " objects: " << si_t(pg_sum.stats.sum.num_objects) << " objects, "
175 << prettybyte_t(pg_sum.stats.sum.num_bytes) << "\n";
176 *out << " usage: "
177 << kb_t(osd_sum.kb_used) << " used, "
178 << kb_t(osd_sum.kb_avail) << " / "
179 << kb_t(osd_sum.kb) << " avail\n";
180 *out << " pgs: ";
181 }
7c673cae 182
31f18b77 183 bool pad = false;
7c673cae 184
31f18b77
FG
185 if (num_pg_unknown > 0) {
186 float p = (float)num_pg_unknown / (float)num_pg;
187 if (f) {
188 f->dump_float("unknown_pgs_ratio", p);
7c673cae 189 } else {
31f18b77
FG
190 char b[20];
191 snprintf(b, sizeof(b), "%.3lf", p * 100.0);
192 *out << b << "% pgs unknown\n";
193 pad = true;
7c673cae 194 }
7c673cae 195 }
7c673cae 196
31f18b77
FG
197 int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
198 if (num_pg_inactive > 0) {
199 float p = (float)num_pg_inactive / (float)num_pg;
200 if (f) {
201 f->dump_float("inactive_pgs_ratio", p);
7c673cae 202 } else {
31f18b77
FG
203 if (pad) {
204 *out << " ";
205 }
206 char b[20];
207 snprintf(b, sizeof(b), "%.3f", p * 100.0);
208 *out << b << "% pgs not active\n";
209 pad = true;
7c673cae 210 }
7c673cae 211 }
31f18b77
FG
212
213 list<string> sl;
214 overall_recovery_summary(f, &sl);
215 if (!f && !sl.empty()) {
216 for (auto p = sl.begin(); p != sl.end(); ++p) {
217 if (pad) {
218 *out << " ";
219 }
220 *out << *p << "\n";
221 pad = true;
7c673cae 222 }
7c673cae 223 }
31f18b77 224 sl.clear();
7c673cae 225
31f18b77
FG
226 if (!f) {
227 unsigned max_width = 1;
228 for (multimap<int,int>::reverse_iterator p = state_by_count.rbegin();
229 p != state_by_count.rend();
230 ++p)
231 {
232 std::stringstream ss;
233 ss << p->first;
234 max_width = MAX(ss.str().size(), max_width);
7c673cae
FG
235 }
236
31f18b77
FG
237 for (multimap<int,int>::reverse_iterator p = state_by_count.rbegin();
238 p != state_by_count.rend();
239 ++p)
240 {
241 if (pad) {
242 *out << " ";
243 }
244 pad = true;
245 out->setf(std::ios::left);
246 *out << std::setw(max_width) << p->first
247 << " " << pg_state_string(p->second) << "\n";
248 out->unsetf(std::ios::left);
249 }
7c673cae
FG
250 }
251
31f18b77
FG
252 ostringstream ss_rec_io;
253 overall_recovery_rate_summary(f, &ss_rec_io);
254 ostringstream ss_client_io;
255 overall_client_io_rate_summary(f, &ss_client_io);
256 ostringstream ss_cache_io;
257 overall_cache_io_rate_summary(f, &ss_cache_io);
7c673cae 258
31f18b77
FG
259 if (!f && (ss_client_io.str().length() || ss_rec_io.str().length()
260 || ss_cache_io.str().length())) {
261 *out << "\n \n";
262 *out << " io:\n";
7c673cae
FG
263 }
264
31f18b77
FG
265 if (!f && ss_client_io.str().length())
266 *out << " client: " << ss_client_io.str() << "\n";
267 if (!f && ss_rec_io.str().length())
268 *out << " recovery: " << ss_rec_io.str() << "\n";
269 if (!f && ss_cache_io.str().length())
270 *out << " cache: " << ss_cache_io.str() << "\n";
7c673cae
FG
271}
272
31f18b77 273void PGMapDigest::print_oneline_summary(Formatter *f, ostream *out) const
7c673cae 274{
31f18b77
FG
275 std::stringstream ss;
276
277 if (f)
278 f->open_array_section("num_pg_by_state");
279 for (auto p = num_pg_by_state.begin();
280 p != num_pg_by_state.end();
281 ++p) {
282 if (f) {
283 f->open_object_section("state");
284 f->dump_string("name", pg_state_string(p->first));
285 f->dump_unsigned("num", p->second);
286 f->close_section();
287 }
288 if (p != num_pg_by_state.begin())
289 ss << ", ";
290 ss << p->second << " " << pg_state_string(p->first);
7c673cae 291 }
31f18b77
FG
292 if (f)
293 f->close_section();
7c673cae 294
31f18b77
FG
295 string states = ss.str();
296 if (out)
297 *out << num_pg << " pgs: "
298 << states << "; "
299 << prettybyte_t(pg_sum.stats.sum.num_bytes) << " data, "
300 << kb_t(osd_sum.kb_used) << " used, "
301 << kb_t(osd_sum.kb_avail) << " / "
302 << kb_t(osd_sum.kb) << " avail";
303 if (f) {
304 f->dump_unsigned("num_pgs", num_pg);
305 f->dump_unsigned("num_bytes", pg_sum.stats.sum.num_bytes);
306 f->dump_unsigned("raw_bytes_used", osd_sum.kb_used << 10);
307 f->dump_unsigned("raw_bytes_avail", osd_sum.kb_avail << 10);
308 f->dump_unsigned("raw_bytes", osd_sum.kb << 10);
309 }
7c673cae 310
31f18b77
FG
311 // make non-negative; we can get negative values if osds send
312 // uncommitted stats and then "go backward" or if they are just
313 // buggy/wrong.
314 pool_stat_t pos_delta = pg_sum_delta;
315 pos_delta.floor(0);
316 if (pos_delta.stats.sum.num_rd ||
317 pos_delta.stats.sum.num_wr) {
318 if (out)
319 *out << "; ";
320 if (pos_delta.stats.sum.num_rd) {
321 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)stamp_delta;
322 if (out)
323 *out << pretty_si_t(rd) << "B/s rd, ";
324 if (f)
325 f->dump_unsigned("read_bytes_sec", rd);
326 }
327 if (pos_delta.stats.sum.num_wr) {
328 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)stamp_delta;
329 if (out)
330 *out << pretty_si_t(wr) << "B/s wr, ";
331 if (f)
332 f->dump_unsigned("write_bytes_sec", wr);
333 }
334 int64_t iops = (pos_delta.stats.sum.num_rd + pos_delta.stats.sum.num_wr) / (double)stamp_delta;
335 if (out)
336 *out << pretty_si_t(iops) << "op/s";
337 if (f)
338 f->dump_unsigned("io_sec", iops);
7c673cae 339 }
31f18b77
FG
340
341 list<string> sl;
342 overall_recovery_summary(f, &sl);
343 if (out)
344 for (auto p = sl.begin(); p != sl.end(); ++p)
345 *out << "; " << *p;
346 std::stringstream ssr;
347 overall_recovery_rate_summary(f, &ssr);
348 if (out && ssr.str().length())
349 *out << "; " << ssr.str() << " recovering";
7c673cae
FG
350}
351
31f18b77
FG
352void PGMapDigest::recovery_summary(Formatter *f, list<string> *psl,
353 const pool_stat_t& delta_sum) const
7c673cae 354{
31f18b77
FG
355 if (delta_sum.stats.sum.num_objects_degraded && delta_sum.stats.sum.num_object_copies > 0) {
356 double pc = (double)delta_sum.stats.sum.num_objects_degraded /
357 (double)delta_sum.stats.sum.num_object_copies * (double)100.0;
358 char b[20];
359 snprintf(b, sizeof(b), "%.3lf", pc);
360 if (f) {
361 f->dump_unsigned("degraded_objects", delta_sum.stats.sum.num_objects_degraded);
362 f->dump_unsigned("degraded_total", delta_sum.stats.sum.num_object_copies);
363 f->dump_float("degraded_ratio", pc / 100.0);
364 } else {
365 ostringstream ss;
366 ss << delta_sum.stats.sum.num_objects_degraded
367 << "/" << delta_sum.stats.sum.num_object_copies << " objects degraded (" << b << "%)";
368 psl->push_back(ss.str());
369 }
370 }
371 if (delta_sum.stats.sum.num_objects_misplaced && delta_sum.stats.sum.num_object_copies > 0) {
372 double pc = (double)delta_sum.stats.sum.num_objects_misplaced /
373 (double)delta_sum.stats.sum.num_object_copies * (double)100.0;
374 char b[20];
375 snprintf(b, sizeof(b), "%.3lf", pc);
376 if (f) {
377 f->dump_unsigned("misplaced_objects", delta_sum.stats.sum.num_objects_misplaced);
378 f->dump_unsigned("misplaced_total", delta_sum.stats.sum.num_object_copies);
379 f->dump_float("misplaced_ratio", pc / 100.0);
380 } else {
381 ostringstream ss;
382 ss << delta_sum.stats.sum.num_objects_misplaced
383 << "/" << delta_sum.stats.sum.num_object_copies << " objects misplaced (" << b << "%)";
384 psl->push_back(ss.str());
385 }
386 }
387 if (delta_sum.stats.sum.num_objects_unfound && delta_sum.stats.sum.num_objects) {
388 double pc = (double)delta_sum.stats.sum.num_objects_unfound /
389 (double)delta_sum.stats.sum.num_objects * (double)100.0;
390 char b[20];
391 snprintf(b, sizeof(b), "%.3lf", pc);
392 if (f) {
393 f->dump_unsigned("unfound_objects", delta_sum.stats.sum.num_objects_unfound);
394 f->dump_unsigned("unfound_total", delta_sum.stats.sum.num_objects);
395 f->dump_float("unfound_ratio", pc / 100.0);
396 } else {
397 ostringstream ss;
398 ss << delta_sum.stats.sum.num_objects_unfound
399 << "/" << delta_sum.stats.sum.num_objects << " unfound (" << b << "%)";
400 psl->push_back(ss.str());
401 }
7c673cae 402 }
7c673cae
FG
403}
404
31f18b77
FG
405void PGMapDigest::recovery_rate_summary(Formatter *f, ostream *out,
406 const pool_stat_t& delta_sum,
407 utime_t delta_stamp) const
7c673cae 408{
31f18b77
FG
409 // make non-negative; we can get negative values if osds send
410 // uncommitted stats and then "go backward" or if they are just
411 // buggy/wrong.
412 pool_stat_t pos_delta = delta_sum;
413 pos_delta.floor(0);
414 if (pos_delta.stats.sum.num_objects_recovered ||
415 pos_delta.stats.sum.num_bytes_recovered ||
416 pos_delta.stats.sum.num_keys_recovered) {
417 int64_t objps = pos_delta.stats.sum.num_objects_recovered / (double)delta_stamp;
418 int64_t bps = pos_delta.stats.sum.num_bytes_recovered / (double)delta_stamp;
419 int64_t kps = pos_delta.stats.sum.num_keys_recovered / (double)delta_stamp;
420 if (f) {
421 f->dump_int("recovering_objects_per_sec", objps);
422 f->dump_int("recovering_bytes_per_sec", bps);
423 f->dump_int("recovering_keys_per_sec", kps);
424 f->dump_int("num_objects_recovered", pos_delta.stats.sum.num_objects_recovered);
425 f->dump_int("num_bytes_recovered", pos_delta.stats.sum.num_bytes_recovered);
426 f->dump_int("num_keys_recovered", pos_delta.stats.sum.num_keys_recovered);
427 } else {
428 *out << pretty_si_t(bps) << "B/s";
429 if (pos_delta.stats.sum.num_keys_recovered)
430 *out << ", " << pretty_si_t(kps) << "keys/s";
431 *out << ", " << pretty_si_t(objps) << "objects/s";
432 }
7c673cae 433 }
31f18b77 434}
7c673cae 435
31f18b77
FG
436void PGMapDigest::overall_recovery_rate_summary(Formatter *f, ostream *out) const
437{
438 recovery_rate_summary(f, out, pg_sum_delta, stamp_delta);
7c673cae
FG
439}
440
31f18b77 441void PGMapDigest::overall_recovery_summary(Formatter *f, list<string> *psl) const
7c673cae 442{
31f18b77 443 recovery_summary(f, psl, pg_sum);
7c673cae
FG
444}
445
31f18b77
FG
446void PGMapDigest::pool_recovery_rate_summary(Formatter *f, ostream *out,
447 uint64_t poolid) const
7c673cae 448{
31f18b77
FG
449 auto p = per_pool_sum_delta.find(poolid);
450 if (p == per_pool_sum_delta.end())
451 return;
7c673cae 452
31f18b77
FG
453 auto ts = per_pool_sum_deltas_stamps.find(p->first);
454 assert(ts != per_pool_sum_deltas_stamps.end());
455 recovery_rate_summary(f, out, p->second.first, ts->second);
456}
7c673cae 457
31f18b77
FG
458void PGMapDigest::pool_recovery_summary(Formatter *f, list<string> *psl,
459 uint64_t poolid) const
460{
461 auto p = per_pool_sum_delta.find(poolid);
462 if (p == per_pool_sum_delta.end())
463 return;
7c673cae 464
31f18b77 465 recovery_summary(f, psl, p->second.first);
7c673cae
FG
466}
467
31f18b77
FG
468void PGMapDigest::client_io_rate_summary(Formatter *f, ostream *out,
469 const pool_stat_t& delta_sum,
470 utime_t delta_stamp) const
7c673cae 471{
31f18b77
FG
472 pool_stat_t pos_delta = delta_sum;
473 pos_delta.floor(0);
474 if (pos_delta.stats.sum.num_rd ||
475 pos_delta.stats.sum.num_wr) {
476 if (pos_delta.stats.sum.num_rd) {
477 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)delta_stamp;
478 if (f) {
479 f->dump_int("read_bytes_sec", rd);
480 } else {
481 *out << pretty_si_t(rd) << "B/s rd, ";
482 }
483 }
484 if (pos_delta.stats.sum.num_wr) {
485 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)delta_stamp;
486 if (f) {
487 f->dump_int("write_bytes_sec", wr);
488 } else {
489 *out << pretty_si_t(wr) << "B/s wr, ";
490 }
491 }
492 int64_t iops_rd = pos_delta.stats.sum.num_rd / (double)delta_stamp;
493 int64_t iops_wr = pos_delta.stats.sum.num_wr / (double)delta_stamp;
494 if (f) {
495 f->dump_int("read_op_per_sec", iops_rd);
496 f->dump_int("write_op_per_sec", iops_wr);
497 } else {
498 *out << pretty_si_t(iops_rd) << "op/s rd, " << pretty_si_t(iops_wr) << "op/s wr";
499 }
7c673cae
FG
500 }
501}
502
31f18b77 503void PGMapDigest::overall_client_io_rate_summary(Formatter *f, ostream *out) const
7c673cae 504{
31f18b77
FG
505 client_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
506}
7c673cae 507
31f18b77
FG
508void PGMapDigest::pool_client_io_rate_summary(Formatter *f, ostream *out,
509 uint64_t poolid) const
510{
511 auto p = per_pool_sum_delta.find(poolid);
512 if (p == per_pool_sum_delta.end())
7c673cae
FG
513 return;
514
31f18b77
FG
515 auto ts = per_pool_sum_deltas_stamps.find(p->first);
516 assert(ts != per_pool_sum_deltas_stamps.end());
517 client_io_rate_summary(f, out, p->second.first, ts->second);
7c673cae
FG
518}
519
31f18b77
FG
520void PGMapDigest::cache_io_rate_summary(Formatter *f, ostream *out,
521 const pool_stat_t& delta_sum,
522 utime_t delta_stamp) const
7c673cae 523{
31f18b77
FG
524 pool_stat_t pos_delta = delta_sum;
525 pos_delta.floor(0);
526 bool have_output = false;
7c673cae 527
31f18b77
FG
528 if (pos_delta.stats.sum.num_flush) {
529 int64_t flush = (pos_delta.stats.sum.num_flush_kb << 10) / (double)delta_stamp;
530 if (f) {
531 f->dump_int("flush_bytes_sec", flush);
532 } else {
533 *out << pretty_si_t(flush) << "B/s flush";
534 have_output = true;
7c673cae
FG
535 }
536 }
31f18b77
FG
537 if (pos_delta.stats.sum.num_evict) {
538 int64_t evict = (pos_delta.stats.sum.num_evict_kb << 10) / (double)delta_stamp;
539 if (f) {
540 f->dump_int("evict_bytes_sec", evict);
541 } else {
542 if (have_output)
543 *out << ", ";
544 *out << pretty_si_t(evict) << "B/s evict";
545 have_output = true;
546 }
7c673cae 547 }
31f18b77
FG
548 if (pos_delta.stats.sum.num_promote) {
549 int64_t promote = pos_delta.stats.sum.num_promote / (double)delta_stamp;
550 if (f) {
551 f->dump_int("promote_op_per_sec", promote);
552 } else {
553 if (have_output)
554 *out << ", ";
555 *out << pretty_si_t(promote) << "op/s promote";
556 have_output = true;
557 }
7c673cae 558 }
31f18b77
FG
559 if (pos_delta.stats.sum.num_flush_mode_low) {
560 if (f) {
561 f->dump_int("num_flush_mode_low", pos_delta.stats.sum.num_flush_mode_low);
562 } else {
563 if (have_output)
564 *out << ", ";
565 *out << pretty_si_t(pos_delta.stats.sum.num_flush_mode_low) << "PG(s) flushing";
566 have_output = true;
567 }
7c673cae 568 }
31f18b77
FG
569 if (pos_delta.stats.sum.num_flush_mode_high) {
570 if (f) {
571 f->dump_int("num_flush_mode_high", pos_delta.stats.sum.num_flush_mode_high);
572 } else {
573 if (have_output)
574 *out << ", ";
575 *out << pretty_si_t(pos_delta.stats.sum.num_flush_mode_high) << "PG(s) flushing (high)";
576 have_output = true;
577 }
7c673cae 578 }
31f18b77
FG
579 if (pos_delta.stats.sum.num_evict_mode_some) {
580 if (f) {
581 f->dump_int("num_evict_mode_some", pos_delta.stats.sum.num_evict_mode_some);
582 } else {
583 if (have_output)
584 *out << ", ";
585 *out << pretty_si_t(pos_delta.stats.sum.num_evict_mode_some) << "PG(s) evicting";
586 have_output = true;
587 }
588 }
589 if (pos_delta.stats.sum.num_evict_mode_full) {
590 if (f) {
591 f->dump_int("num_evict_mode_full", pos_delta.stats.sum.num_evict_mode_full);
592 } else {
593 if (have_output)
594 *out << ", ";
595 *out << pretty_si_t(pos_delta.stats.sum.num_evict_mode_full) << "PG(s) evicting (full)";
596 }
7c673cae
FG
597 }
598}
599
31f18b77 600void PGMapDigest::overall_cache_io_rate_summary(Formatter *f, ostream *out) const
7c673cae 601{
31f18b77 602 cache_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
7c673cae
FG
603}
604
31f18b77
FG
605void PGMapDigest::pool_cache_io_rate_summary(Formatter *f, ostream *out,
606 uint64_t poolid) const
7c673cae 607{
31f18b77
FG
608 auto p = per_pool_sum_delta.find(poolid);
609 if (p == per_pool_sum_delta.end())
610 return;
7c673cae 611
31f18b77
FG
612 auto ts = per_pool_sum_deltas_stamps.find(p->first);
613 assert(ts != per_pool_sum_deltas_stamps.end());
614 cache_io_rate_summary(f, out, p->second.first, ts->second);
7c673cae
FG
615}
616
d2e6a577
FG
617static float pool_raw_used_rate(const OSDMap &osd_map, int64_t poolid)
618{
619 const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
620
621 switch (pool->get_type()) {
622 case pg_pool_t::TYPE_REPLICATED:
623 return pool->get_size();
624 break;
625 case pg_pool_t::TYPE_ERASURE:
626 {
627 auto& ecp =
628 osd_map.get_erasure_code_profile(pool->erasure_code_profile);
629 auto pm = ecp.find("m");
630 auto pk = ecp.find("k");
631 if (pm != ecp.end() && pk != ecp.end()) {
632 int k = atoi(pk->second.c_str());
633 int m = atoi(pm->second.c_str());
634 int mk = m + k;
635 assert(mk != 0);
636 assert(k != 0);
637 return (float)mk / k;
638 } else {
639 return 0.0;
640 }
641 }
642 break;
643 default:
644 assert(0 == "unrecognized pool type");
645 }
646}
647
648ceph_statfs PGMapDigest::get_statfs(OSDMap &osdmap,
649 boost::optional<int64_t> data_pool) const
650{
651 ceph_statfs statfs;
652 bool filter = false;
653 object_stat_sum_t sum;
654
655 if (data_pool) {
656 auto i = pg_pool_sum.find(*data_pool);
657 if (i != pg_pool_sum.end()) {
658 sum = i->second.stats.sum;
659 filter = true;
660 }
661 }
662
663 if (filter) {
664 statfs.kb_used = (sum.num_bytes >> 10);
665 statfs.kb_avail = get_pool_free_space(osdmap, *data_pool) >> 10;
666 statfs.num_objects = sum.num_objects;
667 statfs.kb = statfs.kb_used + statfs.kb_avail;
668 } else {
669 // these are in KB.
670 statfs.kb = osd_sum.kb;
671 statfs.kb_used = osd_sum.kb_used;
672 statfs.kb_avail = osd_sum.kb_avail;
673 statfs.num_objects = pg_sum.stats.sum.num_objects;
674 }
675
676 return statfs;
677}
678
31f18b77
FG
679void PGMapDigest::dump_pool_stats_full(
680 const OSDMap &osd_map,
681 stringstream *ss,
682 Formatter *f,
683 bool verbose) const
7c673cae 684{
31f18b77 685 TextTable tbl;
7c673cae 686
31f18b77
FG
687 if (f) {
688 f->open_array_section("pools");
689 } else {
690 tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
691 tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT);
692 if (verbose) {
693 tbl.define_column("QUOTA OBJECTS", TextTable::LEFT, TextTable::LEFT);
694 tbl.define_column("QUOTA BYTES", TextTable::LEFT, TextTable::LEFT);
695 }
7c673cae 696
31f18b77
FG
697 tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
698 tbl.define_column("%USED", TextTable::LEFT, TextTable::RIGHT);
699 tbl.define_column("MAX AVAIL", TextTable::LEFT, TextTable::RIGHT);
700 tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
701 if (verbose) {
702 tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT);
703 tbl.define_column("READ", TextTable::LEFT, TextTable::RIGHT);
704 tbl.define_column("WRITE", TextTable::LEFT, TextTable::RIGHT);
705 tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
706 }
707 }
708
709 map<int,uint64_t> avail_by_rule;
710 for (auto p = osd_map.get_pools().begin();
711 p != osd_map.get_pools().end(); ++p) {
712 int64_t pool_id = p->first;
713 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
714 continue;
715 const string& pool_name = osd_map.get_pool_name(pool_id);
716 const pool_stat_t &stat = pg_pool_sum.at(pool_id);
717
718 const pg_pool_t *pool = osd_map.get_pg_pool(pool_id);
719 int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
720 pool->get_type(),
721 pool->get_size());
722 int64_t avail;
723 float raw_used_rate;
724 if (avail_by_rule.count(ruleno) == 0) {
725 // FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked
726 avail = get_rule_avail(ruleno);
727 if (avail < 0)
728 avail = 0;
729 avail_by_rule[ruleno] = avail;
730 } else {
731 avail = avail_by_rule[ruleno];
732 }
d2e6a577
FG
733
734 raw_used_rate = ::pool_raw_used_rate(osd_map, pool_id);
31f18b77
FG
735
736 if (f) {
737 f->open_object_section("pool");
738 f->dump_string("name", pool_name);
739 f->dump_int("id", pool_id);
740 f->open_object_section("stats");
741 } else {
742 tbl << pool_name
743 << pool_id;
744 if (verbose) {
745 if (pool->quota_max_objects == 0)
746 tbl << "N/A";
747 else
748 tbl << si_t(pool->quota_max_objects);
749
750 if (pool->quota_max_bytes == 0)
751 tbl << "N/A";
752 else
753 tbl << si_t(pool->quota_max_bytes);
754 }
755
756 }
757 dump_object_stat_sum(tbl, f, stat.stats.sum, avail, raw_used_rate, verbose, pool);
758 if (f)
759 f->close_section(); // stats
760 else
761 tbl << TextTable::endrow;
762
763 if (f)
764 f->close_section(); // pool
765 }
766 if (f)
767 f->close_section();
768 else {
769 assert(ss != nullptr);
770 *ss << "POOLS:\n";
771 tbl.set_indent(4);
772 *ss << tbl;
773 }
774}
775
776void PGMapDigest::dump_fs_stats(stringstream *ss, Formatter *f, bool verbose) const
777{
778 if (f) {
779 f->open_object_section("stats");
780 f->dump_int("total_bytes", osd_sum.kb * 1024ull);
781 f->dump_int("total_used_bytes", osd_sum.kb_used * 1024ull);
782 f->dump_int("total_avail_bytes", osd_sum.kb_avail * 1024ull);
783 if (verbose) {
784 f->dump_int("total_objects", pg_sum.stats.sum.num_objects);
785 }
786 f->close_section();
787 } else {
788 assert(ss != nullptr);
789 TextTable tbl;
790 tbl.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
791 tbl.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
792 tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
793 tbl.define_column("%RAW USED", TextTable::LEFT, TextTable::RIGHT);
794 if (verbose) {
795 tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
796 }
797 tbl << stringify(si_t(osd_sum.kb*1024))
798 << stringify(si_t(osd_sum.kb_avail*1024))
799 << stringify(si_t(osd_sum.kb_used*1024));
800 float used = 0.0;
801 if (osd_sum.kb > 0) {
802 used = ((float)osd_sum.kb_used / osd_sum.kb);
803 }
804 tbl << percentify(used*100);
805 if (verbose) {
806 tbl << stringify(si_t(pg_sum.stats.sum.num_objects));
807 }
808 tbl << TextTable::endrow;
809
810 *ss << "GLOBAL:\n";
811 tbl.set_indent(4);
812 *ss << tbl;
813 }
814}
815
816void PGMapDigest::dump_object_stat_sum(
817 TextTable &tbl, Formatter *f,
818 const object_stat_sum_t &sum, uint64_t avail,
819 float raw_used_rate, bool verbose,
820 const pg_pool_t *pool)
821{
822 float curr_object_copies_rate = 0.0;
823 if (sum.num_object_copies > 0)
824 curr_object_copies_rate = (float)(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies;
825
826 float used = 0.0;
827 if (avail) {
828 used = sum.num_bytes * curr_object_copies_rate;
829 used /= used + avail;
830 } else if (sum.num_bytes) {
831 used = 1.0;
832 }
833
834 if (f) {
835 f->dump_int("kb_used", SHIFT_ROUND_UP(sum.num_bytes, 10));
836 f->dump_int("bytes_used", sum.num_bytes);
837 f->dump_format_unquoted("percent_used", "%.2f", (used*100));
838 f->dump_unsigned("max_avail", avail);
839 f->dump_int("objects", sum.num_objects);
840 if (verbose) {
841 f->dump_int("quota_objects", pool->quota_max_objects);
842 f->dump_int("quota_bytes", pool->quota_max_bytes);
843 f->dump_int("dirty", sum.num_objects_dirty);
844 f->dump_int("rd", sum.num_rd);
845 f->dump_int("rd_bytes", sum.num_rd_kb * 1024ull);
846 f->dump_int("wr", sum.num_wr);
847 f->dump_int("wr_bytes", sum.num_wr_kb * 1024ull);
848 f->dump_int("raw_bytes_used", sum.num_bytes * raw_used_rate * curr_object_copies_rate);
849 }
850 } else {
851 tbl << stringify(si_t(sum.num_bytes));
852 tbl << percentify(used*100);
853 tbl << si_t(avail);
854 tbl << sum.num_objects;
855 if (verbose) {
856 tbl << stringify(si_t(sum.num_objects_dirty))
857 << stringify(si_t(sum.num_rd))
858 << stringify(si_t(sum.num_wr))
859 << stringify(si_t(sum.num_bytes * raw_used_rate * curr_object_copies_rate));
860 }
861 }
862}
863
d2e6a577
FG
864int64_t PGMapDigest::get_pool_free_space(const OSDMap &osd_map,
865 int64_t poolid) const
866{
867 const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
868 int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
869 pool->get_type(),
870 pool->get_size());
871 int64_t avail;
872 avail = get_rule_avail(ruleno);
873 if (avail < 0)
874 avail = 0;
875
876 return avail / ::pool_raw_used_rate(osd_map, poolid);
877}
878
31f18b77
FG
879int64_t PGMap::get_rule_avail(const OSDMap& osdmap, int ruleno) const
880{
881 map<int,float> wm;
882 int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm);
883 if (r < 0) {
884 return r;
885 }
886 if (wm.empty()) {
887 return 0;
888 }
889
890 float fratio;
891 if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
892 osdmap.get_full_ratio() > 0) {
893 fratio = osdmap.get_full_ratio();
894 } else {
895 fratio = get_fallback_full_ratio();
896 }
897
898 int64_t min = -1;
899 for (auto p = wm.begin(); p != wm.end(); ++p) {
900 auto osd_info = osd_stat.find(p->first);
901 if (osd_info != osd_stat.end()) {
902 if (osd_info->second.kb == 0 || p->second == 0) {
903 // osd must be out, hence its stats have been zeroed
904 // (unless we somehow managed to have a disk with size 0...)
905 //
906 // (p->second == 0), if osd weight is 0, no need to
907 // calculate proj below.
908 continue;
909 }
910 double unusable = (double)osd_info->second.kb *
911 (1.0 - fratio);
912 double avail = MAX(0.0, (double)osd_info->second.kb_avail - unusable);
913 avail *= 1024.0;
914 int64_t proj = (int64_t)(avail / (double)p->second);
915 if (min < 0 || proj < min) {
916 min = proj;
917 }
918 } else {
919 dout(0) << "Cannot get stat of OSD " << p->first << dendl;
920 }
921 }
922 return min;
923}
924
925void PGMap::get_rules_avail(const OSDMap& osdmap,
926 std::map<int,int64_t> *avail_map) const
927{
928 avail_map->clear();
929 for (auto p : osdmap.get_pools()) {
930 int64_t pool_id = p.first;
931 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
932 continue;
933 const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
934 int ruleno = osdmap.crush->find_rule(pool->get_crush_rule(),
935 pool->get_type(),
936 pool->get_size());
937 if (avail_map->count(ruleno) == 0)
938 (*avail_map)[ruleno] = get_rule_avail(osdmap, ruleno);
939 }
940}
941
942// ---------------------
943// PGMap
944
// Serialize this incremental into `bl`.
//
// Two layouts are produced, selected by the peer's feature bits:
//  - CEPH_FEATURE_MONENC absent: a bare leading version byte (4) followed
//    by the fields up to pg_remove; stamp and osd_epochs are not written.
//  - otherwise: an ENCODE_START(7, 5, bl) / ENCODE_FINISH(bl) envelope
//    carrying the same fields plus stamp and osd_epochs.
//
// The field order must match what PGMap::Incremental::decode() reads.
945void PGMap::Incremental::encode(bufferlist &bl, uint64_t features) const
946{
947 if ((features & CEPH_FEATURE_MONENC) == 0) {
948 __u8 v = 4;
7c673cae
FG
949 ::encode(v, bl);
950 ::encode(version, bl);
31f18b77
FG
951 ::encode(pg_stat_updates, bl);
952 ::encode(osd_stat_updates, bl);
953 ::encode(osd_stat_rm, bl);
954 ::encode(osdmap_epoch, bl);
955 ::encode(pg_scan, bl);
7c673cae
FG
956 ::encode(full_ratio, bl);
957 ::encode(nearfull_ratio, bl);
31f18b77 958 ::encode(pg_remove, bl);
7c673cae
FG
959 return;
960 }
961
31f18b77 962 ENCODE_START(7, 5, bl);
7c673cae 963 ::encode(version, bl);
31f18b77
FG
964 ::encode(pg_stat_updates, bl);
965 ::encode(osd_stat_updates, bl);
966 ::encode(osd_stat_rm, bl);
967 ::encode(osdmap_epoch, bl);
968 ::encode(pg_scan, bl);
7c673cae
FG
969 ::encode(full_ratio, bl);
970 ::encode(nearfull_ratio, bl);
31f18b77 971 ::encode(pg_remove, bl);
7c673cae
FG
972 ::encode(stamp, bl);
973 ::encode(osd_epochs, bl);
974 ENCODE_FINISH(bl);
975}
976
31f18b77 977void PGMap::Incremental::decode(bufferlist::iterator &bl)
7c673cae 978{
31f18b77 979 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
7c673cae
FG
980 ::decode(version, bl);
981 if (struct_v < 3) {
31f18b77 982 pg_stat_updates.clear();
7c673cae
FG
983 __u32 n;
984 ::decode(n, bl);
985 while (n--) {
986 old_pg_t opgid;
987 ::decode(opgid, bl);
988 pg_t pgid = opgid;
31f18b77 989 ::decode(pg_stat_updates[pgid], bl);
7c673cae
FG
990 }
991 } else {
31f18b77 992 ::decode(pg_stat_updates, bl);
7c673cae 993 }
31f18b77
FG
994 ::decode(osd_stat_updates, bl);
995 ::decode(osd_stat_rm, bl);
996 ::decode(osdmap_epoch, bl);
997 ::decode(pg_scan, bl);
7c673cae
FG
998 if (struct_v >= 2) {
999 ::decode(full_ratio, bl);
1000 ::decode(nearfull_ratio, bl);
1001 }
31f18b77
FG
1002 if (struct_v < 3) {
1003 pg_remove.clear();
1004 __u32 n;
1005 ::decode(n, bl);
1006 while (n--) {
1007 old_pg_t opgid;
1008 ::decode(opgid, bl);
1009 pg_remove.insert(pg_t(opgid));
1010 }
1011 } else {
1012 ::decode(pg_remove, bl);
1013 }
1014 if (struct_v < 4 && full_ratio == 0) {
1015 full_ratio = -1;
1016 }
1017 if (struct_v < 4 && nearfull_ratio == 0) {
1018 nearfull_ratio = -1;
1019 }
1020 if (struct_v >= 6)
7c673cae 1021 ::decode(stamp, bl);
31f18b77 1022 if (struct_v >= 7) {
7c673cae
FG
1023 ::decode(osd_epochs, bl);
1024 } else {
31f18b77
FG
1025 for (auto i = osd_stat_updates.begin();
1026 i != osd_stat_updates.end();
7c673cae
FG
1027 ++i) {
1028 // This isn't accurate, but will cause trimming to behave like
1029 // previously.
31f18b77 1030 osd_epochs.insert(make_pair(i->first, osdmap_epoch));
7c673cae
FG
1031 }
1032 }
1033 DECODE_FINISH(bl);
7c673cae
FG
1034}
1035
31f18b77 1036void PGMap::Incremental::dump(Formatter *f) const
7c673cae
FG
1037{
1038 f->dump_unsigned("version", version);
1039 f->dump_stream("stamp") << stamp;
31f18b77
FG
1040 f->dump_unsigned("osdmap_epoch", osdmap_epoch);
1041 f->dump_unsigned("pg_scan_epoch", pg_scan);
7c673cae 1042 f->dump_float("full_ratio", full_ratio);
31f18b77 1043 f->dump_float("nearfull_ratio", nearfull_ratio);
7c673cae 1044
31f18b77
FG
1045 f->open_array_section("pg_stat_updates");
1046 for (auto p = pg_stat_updates.begin(); p != pg_stat_updates.end(); ++p) {
1047 f->open_object_section("pg_stat");
1048 f->dump_stream("pgid") << p->first;
1049 p->second.dump(f);
1050 f->close_section();
1051 }
7c673cae
FG
1052 f->close_section();
1053
31f18b77
FG
1054 f->open_array_section("osd_stat_updates");
1055 for (auto p = osd_stat_updates.begin(); p != osd_stat_updates.end(); ++p) {
1056 f->open_object_section("osd_stat");
1057 f->dump_int("osd", p->first);
1058 p->second.dump(f);
7c673cae
FG
1059 f->close_section();
1060 }
1061 f->close_section();
1062
31f18b77
FG
1063 f->open_array_section("osd_stat_removals");
1064 for (auto p = osd_stat_rm.begin(); p != osd_stat_rm.end(); ++p)
1065 f->dump_int("osd", *p);
7c673cae 1066 f->close_section();
7c673cae 1067
31f18b77
FG
1068 f->open_array_section("pg_removals");
1069 for (auto p = pg_remove.begin(); p != pg_remove.end(); ++p)
1070 f->dump_stream("pgid") << *p;
7c673cae
FG
1071 f->close_section();
1072}
1073
31f18b77 1074void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
7c673cae 1075{
31f18b77
FG
1076 o.push_back(new Incremental);
1077 o.push_back(new Incremental);
1078 o.back()->version = 1;
1079 o.back()->stamp = utime_t(123,345);
1080 o.push_back(new Incremental);
1081 o.back()->version = 2;
1082 o.back()->pg_stat_updates[pg_t(1,2,3)] = pg_stat_t();
1083 o.back()->osd_stat_updates[5] = osd_stat_t();
1084 o.back()->osd_epochs[5] = 12;
1085 o.push_back(new Incremental);
1086 o.back()->version = 3;
1087 o.back()->osdmap_epoch = 1;
1088 o.back()->pg_scan = 2;
1089 o.back()->full_ratio = .2;
1090 o.back()->nearfull_ratio = .3;
1091 o.back()->pg_stat_updates[pg_t(4,5,6)] = pg_stat_t();
1092 o.back()->osd_stat_updates[6] = osd_stat_t();
1093 o.back()->osd_epochs[6] = 12;
1094 o.back()->pg_remove.insert(pg_t(1,2,3));
1095 o.back()->osd_stat_rm.insert(5);
7c673cae
FG
1096}
1097
7c673cae 1098
31f18b77
FG
1099// --
1100
1101void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
7c673cae 1102{
31f18b77
FG
1103 assert(inc.version == version+1);
1104 version++;
7c673cae 1105
31f18b77
FG
1106 utime_t delta_t;
1107 delta_t = inc.stamp;
1108 delta_t -= stamp;
1109 stamp = inc.stamp;
1110
1111 pool_stat_t pg_sum_old = pg_sum;
1112 mempool::pgmap::unordered_map<uint64_t, pool_stat_t> pg_pool_sum_old;
1113
1114 bool ratios_changed = false;
1115 if (inc.full_ratio != full_ratio && inc.full_ratio != -1) {
1116 full_ratio = inc.full_ratio;
1117 ratios_changed = true;
7c673cae 1118 }
31f18b77
FG
1119 if (inc.nearfull_ratio != nearfull_ratio && inc.nearfull_ratio != -1) {
1120 nearfull_ratio = inc.nearfull_ratio;
1121 ratios_changed = true;
7c673cae 1122 }
31f18b77
FG
1123 if (ratios_changed)
1124 redo_full_sets();
7c673cae 1125
31f18b77
FG
1126 for (auto p = inc.pg_stat_updates.begin();
1127 p != inc.pg_stat_updates.end();
1128 ++p) {
1129 const pg_t &update_pg(p->first);
1130 const pg_stat_t &update_stat(p->second);
7c673cae 1131
31f18b77
FG
1132 if (pg_pool_sum_old.count(update_pg.pool()) == 0)
1133 pg_pool_sum_old[update_pg.pool()] = pg_pool_sum[update_pg.pool()];
1134
1135 auto t = pg_stat.find(update_pg);
1136 if (t == pg_stat.end()) {
1137 pg_stat.insert(make_pair(update_pg, update_stat));
1138 } else {
1139 stat_pg_sub(update_pg, t->second);
1140 t->second = update_stat;
7c673cae 1141 }
31f18b77 1142 stat_pg_add(update_pg, update_stat);
7c673cae 1143 }
31f18b77
FG
1144 assert(osd_stat.size() == osd_epochs.size());
1145 for (auto p = inc.get_osd_stat_updates().begin();
1146 p != inc.get_osd_stat_updates().end();
1147 ++p) {
1148 int osd = p->first;
1149 const osd_stat_t &new_stats(p->second);
7c673cae 1150
31f18b77
FG
1151 auto t = osd_stat.find(osd);
1152 if (t == osd_stat.end()) {
1153 osd_stat.insert(make_pair(osd, new_stats));
1154 } else {
1155 stat_osd_sub(t->first, t->second);
1156 t->second = new_stats;
1157 }
1158 auto i = osd_epochs.find(osd);
1159 auto j = inc.get_osd_epochs().find(osd);
1160 assert(j != inc.get_osd_epochs().end());
7c673cae 1161
31f18b77
FG
1162 if (i == osd_epochs.end())
1163 osd_epochs.insert(*j);
1164 else
1165 i->second = j->second;
7c673cae 1166
31f18b77 1167 stat_osd_add(osd, new_stats);
7c673cae 1168
31f18b77
FG
1169 // adjust [near]full status
1170 register_nearfull_status(osd, new_stats);
1171 }
1172 set<int64_t> deleted_pools;
1173 for (auto p = inc.pg_remove.begin();
1174 p != inc.pg_remove.end();
1175 ++p) {
1176 const pg_t &removed_pg(*p);
1177 auto s = pg_stat.find(removed_pg);
1178 if (s != pg_stat.end()) {
1179 stat_pg_sub(removed_pg, s->second);
1180 pg_stat.erase(s);
1181 }
1182 deleted_pools.insert(removed_pg.pool());
7c673cae
FG
1183 }
1184
31f18b77
FG
1185 for (auto p = inc.get_osd_stat_rm().begin();
1186 p != inc.get_osd_stat_rm().end();
7c673cae 1187 ++p) {
31f18b77
FG
1188 auto t = osd_stat.find(*p);
1189 if (t != osd_stat.end()) {
1190 stat_osd_sub(t->first, t->second);
1191 osd_stat.erase(t);
1192 osd_epochs.erase(*p);
1193 }
1194
1195 // remove these old osds from full/nearfull set(s), too
1196 nearfull_osds.erase(*p);
1197 full_osds.erase(*p);
7c673cae
FG
1198 }
1199
31f18b77
FG
1200 // calculate a delta, and average over the last 2 deltas.
1201 pool_stat_t d = pg_sum;
1202 d.stats.sub(pg_sum_old.stats);
1203 pg_sum_deltas.push_back(make_pair(d, delta_t));
1204 stamp_delta += delta_t;
7c673cae 1205
31f18b77
FG
1206 pg_sum_delta.stats.add(d.stats);
1207 if (pg_sum_deltas.size() > (unsigned)MAX(1, cct ? cct->_conf->mon_stat_smooth_intervals : 1)) {
1208 pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats);
1209 stamp_delta -= pg_sum_deltas.front().second;
1210 pg_sum_deltas.pop_front();
1211 }
7c673cae 1212
31f18b77 1213 update_pool_deltas(cct, inc.stamp, pg_pool_sum_old);
7c673cae 1214
31f18b77
FG
1215 for (auto p : deleted_pools) {
1216 if (cct)
1217 dout(20) << " deleted pool " << p << dendl;
1218 deleted_pool(p);
1219 }
7c673cae 1220
31f18b77
FG
1221 if (inc.osdmap_epoch)
1222 last_osdmap_epoch = inc.osdmap_epoch;
1223 if (inc.pg_scan)
1224 last_pg_scan = inc.pg_scan;
1225
1226 min_last_epoch_clean = 0; // invalidate
7c673cae
FG
1227}
1228
31f18b77 1229void PGMap::redo_full_sets()
7c673cae 1230{
31f18b77
FG
1231 full_osds.clear();
1232 nearfull_osds.clear();
1233 for (auto i = osd_stat.begin();
1234 i != osd_stat.end();
1235 ++i) {
1236 register_nearfull_status(i->first, i->second);
7c673cae 1237 }
31f18b77 1238}
7c673cae 1239
31f18b77
FG
1240void PGMap::register_nearfull_status(int osd, const osd_stat_t& s)
1241{
1242 float ratio = ((float)s.kb_used) / ((float)s.kb);
7c673cae 1243
31f18b77
FG
1244 if (full_ratio > 0 && ratio > full_ratio) {
1245 // full
1246 full_osds.insert(osd);
1247 nearfull_osds.erase(osd);
1248 } else if (nearfull_ratio > 0 && ratio > nearfull_ratio) {
1249 // nearfull
1250 full_osds.erase(osd);
1251 nearfull_osds.insert(osd);
1252 } else {
1253 // ok
1254 full_osds.erase(osd);
1255 nearfull_osds.erase(osd);
1256 }
7c673cae
FG
1257}
1258
31f18b77 1259void PGMap::calc_stats()
7c673cae 1260{
31f18b77
FG
1261 num_pg = 0;
1262 num_pg_active = 0;
1263 num_pg_unknown = 0;
1264 num_osd = 0;
1265 pg_pool_sum.clear();
1266 num_pg_by_pool.clear();
1267 pg_by_osd.clear();
1268 pg_sum = pool_stat_t();
1269 osd_sum = osd_stat_t();
1270 num_pg_by_state.clear();
1271 num_pg_by_osd.clear();
7c673cae 1272
31f18b77
FG
1273 for (auto p = pg_stat.begin();
1274 p != pg_stat.end();
1275 ++p) {
1276 stat_pg_add(p->first, p->second);
1277 }
1278 for (auto p = osd_stat.begin();
1279 p != osd_stat.end();
1280 ++p)
1281 stat_osd_add(p->first, p->second);
7c673cae 1282
31f18b77 1283 redo_full_sets();
7c673cae 1284
31f18b77 1285 min_last_epoch_clean = calc_min_last_epoch_clean();
7c673cae
FG
1286}
1287
31f18b77 1288void PGMap::update_pg(pg_t pgid, bufferlist& bl)
7c673cae 1289{
31f18b77
FG
1290 bufferlist::iterator p = bl.begin();
1291 auto s = pg_stat.find(pgid);
1292 epoch_t old_lec = 0, lec;
1293 if (s != pg_stat.end()) {
1294 old_lec = s->second.get_effective_last_epoch_clean();
1295 stat_pg_update(pgid, s->second, p);
1296 lec = s->second.get_effective_last_epoch_clean();
1297 } else {
1298 pg_stat_t& r = pg_stat[pgid];
1299 ::decode(r, p);
1300 stat_pg_add(pgid, r);
1301 lec = r.get_effective_last_epoch_clean();
1302 }
7c673cae 1303
31f18b77
FG
1304 if (min_last_epoch_clean &&
1305 (lec < min_last_epoch_clean || // we did
1306 (lec > min_last_epoch_clean && // we might
1307 old_lec == min_last_epoch_clean)
1308 ))
1309 min_last_epoch_clean = 0;
1310}
7c673cae 1311
31f18b77
FG
1312void PGMap::remove_pg(pg_t pgid)
1313{
1314 auto s = pg_stat.find(pgid);
1315 if (s != pg_stat.end()) {
1316 if (min_last_epoch_clean &&
1317 s->second.get_effective_last_epoch_clean() == min_last_epoch_clean)
1318 min_last_epoch_clean = 0;
1319 stat_pg_sub(pgid, s->second);
1320 pg_stat.erase(s);
7c673cae
FG
1321 }
1322}
1323
31f18b77 1324void PGMap::update_osd(int osd, bufferlist& bl)
7c673cae 1325{
31f18b77
FG
1326 bufferlist::iterator p = bl.begin();
1327 auto o = osd_stat.find(osd);
1328 epoch_t old_lec = 0;
1329 if (o != osd_stat.end()) {
1330 auto i = osd_epochs.find(osd);
1331 if (i != osd_epochs.end())
1332 old_lec = i->second;
1333 stat_osd_sub(osd, o->second);
1334 }
1335 osd_stat_t& r = osd_stat[osd];
1336 ::decode(r, p);
1337 stat_osd_add(osd, r);
7c673cae 1338
31f18b77
FG
1339 // adjust [near]full status
1340 register_nearfull_status(osd, r);
1341
1342 // epoch?
1343 if (!p.end()) {
1344 epoch_t e;
1345 ::decode(e, p);
1346
1347 if (e < min_last_epoch_clean ||
1348 (e > min_last_epoch_clean &&
1349 old_lec == min_last_epoch_clean))
1350 min_last_epoch_clean = 0;
1351 } else {
1352 // WARNING: we are not refreshing min_last_epoch_clean! must be old store
1353 // or old mon running.
7c673cae 1354 }
7c673cae
FG
1355}
1356
31f18b77 1357void PGMap::remove_osd(int osd)
7c673cae 1358{
31f18b77
FG
1359 auto o = osd_stat.find(osd);
1360 if (o != osd_stat.end()) {
1361 stat_osd_sub(osd, o->second);
1362 osd_stat.erase(o);
1363
1364 // remove these old osds from full/nearfull set(s), too
1365 nearfull_osds.erase(osd);
1366 full_osds.erase(osd);
7c673cae 1367 }
7c673cae
FG
1368}
1369
31f18b77
FG
1370void PGMap::stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
1371 bool sameosds)
7c673cae 1372{
31f18b77
FG
1373 pg_pool_sum[pgid.pool()].add(s);
1374 pg_sum.add(s);
7c673cae 1375
31f18b77
FG
1376 num_pg++;
1377 num_pg_by_state[s.state]++;
1378 num_pg_by_pool[pgid.pool()]++;
7c673cae 1379
31f18b77
FG
1380 if ((s.state & PG_STATE_CREATING) &&
1381 s.parent_split_bits == 0) {
1382 creating_pgs.insert(pgid);
1383 if (s.acting_primary >= 0) {
1384 creating_pgs_by_osd_epoch[s.acting_primary][s.mapping_epoch].insert(pgid);
7c673cae
FG
1385 }
1386 }
1387
31f18b77
FG
1388 if (s.state & PG_STATE_ACTIVE) {
1389 ++num_pg_active;
1390 }
1391 if (s.state == 0) {
1392 ++num_pg_unknown;
7c673cae
FG
1393 }
1394
31f18b77
FG
1395 if (sameosds)
1396 return;
7c673cae 1397
31f18b77
FG
1398 for (auto p = s.blocked_by.begin();
1399 p != s.blocked_by.end();
1400 ++p) {
1401 ++blocked_by_sum[*p];
7c673cae 1402 }
31f18b77
FG
1403
1404 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
1405 pg_by_osd[*p].insert(pgid);
1406 num_pg_by_osd[*p].acting++;
1407 }
1408 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
1409 pg_by_osd[*p].insert(pgid);
1410 num_pg_by_osd[*p].up++;
7c673cae 1411 }
7c673cae 1412
31f18b77
FG
1413 if (s.up_primary >= 0) {
1414 num_pg_by_osd[s.up_primary].primary++;
7c673cae 1415 }
7c673cae 1416}
31f18b77
FG
1417
1418void PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
1419 bool sameosds)
7c673cae 1420{
31f18b77
FG
1421 pool_stat_t& ps = pg_pool_sum[pgid.pool()];
1422 ps.sub(s);
1423 pg_sum.sub(s);
1424
1425 num_pg--;
1426 int end = --num_pg_by_state[s.state];
1427 assert(end >= 0);
1428 if (end == 0)
1429 num_pg_by_state.erase(s.state);
1430 end = --num_pg_by_pool[pgid.pool()];
1431 if (end == 0) {
1432 num_pg_by_pool.erase(pgid.pool());
1433 pg_pool_sum.erase(pgid.pool());
7c673cae 1434 }
7c673cae 1435
31f18b77
FG
1436 if ((s.state & PG_STATE_CREATING) &&
1437 s.parent_split_bits == 0) {
1438 creating_pgs.erase(pgid);
1439 if (s.acting_primary >= 0) {
1440 map<epoch_t,set<pg_t> >& r = creating_pgs_by_osd_epoch[s.acting_primary];
1441 r[s.mapping_epoch].erase(pgid);
1442 if (r[s.mapping_epoch].empty())
1443 r.erase(s.mapping_epoch);
1444 if (r.empty())
1445 creating_pgs_by_osd_epoch.erase(s.acting_primary);
7c673cae
FG
1446 }
1447 }
31f18b77
FG
1448
1449 if (s.state & PG_STATE_ACTIVE) {
1450 --num_pg_active;
1451 }
1452 if (s.state == 0) {
1453 --num_pg_unknown;
1454 }
1455
1456 if (sameosds)
1457 return;
1458
1459 for (auto p = s.blocked_by.begin();
1460 p != s.blocked_by.end();
1461 ++p) {
1462 auto q = blocked_by_sum.find(*p);
1463 assert(q != blocked_by_sum.end());
1464 --q->second;
1465 if (q->second == 0)
1466 blocked_by_sum.erase(q);
1467 }
1468
1469 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
1470 auto& oset = pg_by_osd[*p];
1471 oset.erase(pgid);
1472 if (oset.empty())
1473 pg_by_osd.erase(*p);
1474 auto it = num_pg_by_osd.find(*p);
1475 if (it != num_pg_by_osd.end() && it->second.acting > 0)
1476 it->second.acting--;
1477 }
1478 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
1479 auto& oset = pg_by_osd[*p];
1480 oset.erase(pgid);
1481 if (oset.empty())
1482 pg_by_osd.erase(*p);
1483 auto it = num_pg_by_osd.find(*p);
1484 if (it != num_pg_by_osd.end() && it->second.up > 0)
1485 it->second.up--;
1486 }
1487
1488 if (s.up_primary >= 0) {
1489 auto it = num_pg_by_osd.find(s.up_primary);
1490 if (it != num_pg_by_osd.end() && it->second.primary > 0)
1491 it->second.primary--;
1492 }
1493}
1494
1495void PGMap::stat_pg_update(const pg_t pgid, pg_stat_t& s,
1496 bufferlist::iterator& blp)
1497{
1498 pg_stat_t n;
1499 ::decode(n, blp);
1500
1501 bool sameosds =
1502 s.acting == n.acting &&
1503 s.up == n.up &&
1504 s.blocked_by == n.blocked_by;
1505
1506 stat_pg_sub(pgid, s, sameosds);
1507
1508 // if acting_primary has shift to an just restored osd, and pg yet to finish
1509 // peering, many attributes in current stats remain stale. others seem don't
1510 // mater much while faulty last_active will make "pg stuck in" check unhappy.
1511 if (!(n.state & (PG_STATE_ACTIVE | PG_STATE_PEERED)) &&
1512 n.last_active < s.last_active)
1513 n.last_active = s.last_active;
1514 s = n;
1515 stat_pg_add(pgid, n, sameosds);
1516}
1517
1518void PGMap::stat_osd_add(int osd, const osd_stat_t &s)
1519{
1520 num_osd++;
1521 osd_sum.add(s);
1522 if (osd >= (int)osd_last_seq.size()) {
1523 osd_last_seq.resize(osd + 1);
1524 }
1525 osd_last_seq[osd] = s.seq;
1526}
1527
1528void PGMap::stat_osd_sub(int osd, const osd_stat_t &s)
1529{
1530 num_osd--;
1531 osd_sum.sub(s);
1532 assert(osd < (int)osd_last_seq.size());
1533 osd_last_seq[osd] = 0;
1534}
1535
1536epoch_t PGMap::calc_min_last_epoch_clean() const
1537{
1538 if (pg_stat.empty())
1539 return 0;
1540
1541 auto p = pg_stat.begin();
1542 epoch_t min = p->second.get_effective_last_epoch_clean();
1543 for (++p; p != pg_stat.end(); ++p) {
1544 epoch_t lec = p->second.get_effective_last_epoch_clean();
1545 if (lec < min)
1546 min = lec;
1547 }
1548 // also scan osd epochs
1549 // don't trim past the oldest reported osd epoch
1550 for (auto i = osd_epochs.begin();
1551 i != osd_epochs.end();
1552 ++i) {
1553 if (i->second < min)
1554 min = i->second;
1555 }
1556 return min;
1557}
1558
1559void PGMap::encode_digest(const OSDMap& osdmap,
1560 bufferlist& bl, uint64_t features) const
1561{
1562 get_rules_avail(osdmap, &avail_space_by_rule);
1563 PGMapDigest::encode(bl, features);
1564}
1565
1566void PGMap::encode(bufferlist &bl, uint64_t features) const
1567{
1568 if ((features & CEPH_FEATURE_MONENC) == 0) {
1569 __u8 v = 3;
1570 ::encode(v, bl);
1571 ::encode(version, bl);
1572 ::encode(pg_stat, bl);
1573 ::encode(osd_stat, bl);
1574 ::encode(last_osdmap_epoch, bl);
1575 ::encode(last_pg_scan, bl);
1576 ::encode(full_ratio, bl);
1577 ::encode(nearfull_ratio, bl);
1578 return;
1579 }
1580
1581 ENCODE_START(6, 4, bl);
1582 ::encode(version, bl);
1583 ::encode(pg_stat, bl);
1584 ::encode(osd_stat, bl);
1585 ::encode(last_osdmap_epoch, bl);
1586 ::encode(last_pg_scan, bl);
1587 ::encode(full_ratio, bl);
1588 ::encode(nearfull_ratio, bl);
1589 ::encode(stamp, bl);
1590 ::encode(osd_epochs, bl);
1591 ENCODE_FINISH(bl);
1592}
1593
1594void PGMap::decode(bufferlist::iterator &bl)
1595{
1596 DECODE_START_LEGACY_COMPAT_LEN(6, 4, 4, bl);
1597 ::decode(version, bl);
1598 if (struct_v < 3) {
1599 pg_stat.clear();
1600 __u32 n;
1601 ::decode(n, bl);
1602 while (n--) {
1603 old_pg_t opgid;
1604 ::decode(opgid, bl);
1605 pg_t pgid = opgid;
1606 ::decode(pg_stat[pgid], bl);
7c673cae 1607 }
31f18b77
FG
1608 } else {
1609 ::decode(pg_stat, bl);
7c673cae 1610 }
31f18b77
FG
1611 ::decode(osd_stat, bl);
1612 ::decode(last_osdmap_epoch, bl);
1613 ::decode(last_pg_scan, bl);
1614 if (struct_v >= 2) {
1615 ::decode(full_ratio, bl);
1616 ::decode(nearfull_ratio, bl);
1617 }
1618 if (struct_v >= 5)
1619 ::decode(stamp, bl);
1620 if (struct_v >= 6) {
1621 ::decode(osd_epochs, bl);
1622 } else {
1623 for (auto i = osd_stat.begin();
1624 i != osd_stat.end();
1625 ++i) {
1626 // This isn't accurate, but will cause trimming to behave like
1627 // previously.
1628 osd_epochs.insert(make_pair(i->first, last_osdmap_epoch));
7c673cae
FG
1629 }
1630 }
31f18b77
FG
1631 DECODE_FINISH(bl);
1632
1633 calc_stats();
7c673cae
FG
1634}
1635
31f18b77 1636void PGMap::dirty_all(Incremental& inc)
7c673cae 1637{
31f18b77
FG
1638 inc.osdmap_epoch = last_osdmap_epoch;
1639 inc.pg_scan = last_pg_scan;
1640 inc.full_ratio = full_ratio;
1641 inc.nearfull_ratio = nearfull_ratio;
1642
1643 for (auto p = pg_stat.begin(); p != pg_stat.end(); ++p) {
1644 inc.pg_stat_updates[p->first] = p->second;
1645 }
1646 for (auto p = osd_stat.begin(); p != osd_stat.end(); ++p) {
1647 assert(osd_epochs.count(p->first));
1648 inc.update_stat(p->first,
1649 inc.get_osd_epochs().find(p->first)->second,
1650 p->second);
1651 }
1652}
1653
1654void PGMap::dump(Formatter *f) const
1655{
1656 dump_basic(f);
1657 dump_pg_stats(f, false);
1658 dump_pool_stats(f);
1659 dump_osd_stats(f);
1660}
1661
1662void PGMap::dump_basic(Formatter *f) const
1663{
1664 f->dump_unsigned("version", version);
1665 f->dump_stream("stamp") << stamp;
1666 f->dump_unsigned("last_osdmap_epoch", last_osdmap_epoch);
1667 f->dump_unsigned("last_pg_scan", last_pg_scan);
1668 f->dump_unsigned("min_last_epoch_clean", min_last_epoch_clean);
1669 f->dump_float("full_ratio", full_ratio);
1670 f->dump_float("near_full_ratio", nearfull_ratio);
1671
1672 f->open_object_section("pg_stats_sum");
1673 pg_sum.dump(f);
1674 f->close_section();
1675
1676 f->open_object_section("osd_stats_sum");
1677 osd_sum.dump(f);
1678 f->close_section();
1679
1680 f->open_array_section("osd_epochs");
1681 for (auto p = osd_epochs.begin(); p != osd_epochs.end(); ++p) {
1682 f->open_object_section("osd");
1683 f->dump_unsigned("osd", p->first);
1684 f->dump_unsigned("epoch", p->second);
1685 f->close_section();
1686 }
1687 f->close_section();
1688
1689 dump_delta(f);
1690}
1691
1692void PGMap::dump_delta(Formatter *f) const
1693{
1694 f->open_object_section("pg_stats_delta");
1695 pg_sum_delta.dump(f);
1696 f->close_section();
1697}
1698
1699void PGMap::dump_pg_stats(Formatter *f, bool brief) const
1700{
1701 f->open_array_section("pg_stats");
1702 for (auto i = pg_stat.begin();
1703 i != pg_stat.end();
1704 ++i) {
1705 f->open_object_section("pg_stat");
1706 f->dump_stream("pgid") << i->first;
1707 if (brief)
1708 i->second.dump_brief(f);
1709 else
1710 i->second.dump(f);
1711 f->close_section();
1712 }
1713 f->close_section();
1714}
1715
1716void PGMap::dump_pool_stats(Formatter *f) const
1717{
1718 f->open_array_section("pool_stats");
1719 for (auto p = pg_pool_sum.begin();
1720 p != pg_pool_sum.end();
1721 ++p) {
1722 f->open_object_section("pool_stat");
1723 f->dump_int("poolid", p->first);
1724 auto q = num_pg_by_pool.find(p->first);
1725 if (q != num_pg_by_pool.end())
1726 f->dump_unsigned("num_pg", q->second);
1727 p->second.dump(f);
1728 f->close_section();
1729 }
1730 f->close_section();
1731}
1732
1733void PGMap::dump_osd_stats(Formatter *f) const
1734{
1735 f->open_array_section("osd_stats");
1736 for (auto q = osd_stat.begin();
1737 q != osd_stat.end();
1738 ++q) {
1739 f->open_object_section("osd_stat");
1740 f->dump_int("osd", q->first);
1741 q->second.dump(f);
1742 f->close_section();
1743 }
1744 f->close_section();
1745}
1746
1747void PGMap::dump_pg_stats_plain(
1748 ostream& ss,
1749 const mempool::pgmap::unordered_map<pg_t, pg_stat_t>& pg_stats,
1750 bool brief) const
1751{
1752 TextTable tab;
1753
1754 if (brief){
1755 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1756 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1757 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1758 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1759 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1760 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1761 }
1762 else {
1763 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1764 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1765 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1766 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1767 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1768 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1769 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1770 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1771 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1772 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1773 tab.define_column("STATE_STAMP", TextTable::LEFT, TextTable::RIGHT);
1774 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
1775 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
1776 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1777 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1778 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1779 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1780 tab.define_column("LAST_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1781 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1782 tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1783 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1784 }
1785
1786 for (auto i = pg_stats.begin();
1787 i != pg_stats.end(); ++i) {
1788 const pg_stat_t &st(i->second);
1789 if (brief) {
1790 tab << i->first
1791 << pg_state_string(st.state)
1792 << st.up
1793 << st.up_primary
1794 << st.acting
1795 << st.acting_primary
1796 << TextTable::endrow;
7c673cae 1797 } else {
31f18b77
FG
1798 ostringstream reported;
1799 reported << st.reported_epoch << ":" << st.reported_seq;
1800
1801 tab << i->first
1802 << st.stats.sum.num_objects
1803 << st.stats.sum.num_objects_missing_on_primary
1804 << st.stats.sum.num_objects_degraded
1805 << st.stats.sum.num_objects_misplaced
1806 << st.stats.sum.num_objects_unfound
1807 << st.stats.sum.num_bytes
1808 << st.log_size
1809 << st.ondisk_log_size
1810 << pg_state_string(st.state)
1811 << st.last_change
1812 << st.version
1813 << reported.str()
1814 << pg_vector_string(st.up)
1815 << st.up_primary
1816 << pg_vector_string(st.acting)
1817 << st.acting_primary
1818 << st.last_scrub
1819 << st.last_scrub_stamp
1820 << st.last_deep_scrub
1821 << st.last_deep_scrub_stamp
1822 << TextTable::endrow;
7c673cae
FG
1823 }
1824 }
7c673cae 1825
31f18b77
FG
1826 ss << tab;
1827}
1828
1829void PGMap::dump(ostream& ss) const
1830{
1831 dump_basic(ss);
1832 dump_pg_stats(ss, false);
1833 dump_pool_stats(ss, false);
1834 dump_pg_sum_stats(ss, false);
1835 dump_osd_stats(ss);
1836}
1837
1838void PGMap::dump_basic(ostream& ss) const
1839{
1840 ss << "version " << version << std::endl;
1841 ss << "stamp " << stamp << std::endl;
1842 ss << "last_osdmap_epoch " << last_osdmap_epoch << std::endl;
1843 ss << "last_pg_scan " << last_pg_scan << std::endl;
1844 ss << "full_ratio " << full_ratio << std::endl;
1845 ss << "nearfull_ratio " << nearfull_ratio << std::endl;
1846}
1847
1848void PGMap::dump_pg_stats(ostream& ss, bool brief) const
1849{
1850 dump_pg_stats_plain(ss, pg_stat, brief);
1851}
1852
1853void PGMap::dump_pool_stats(ostream& ss, bool header) const
1854{
1855 TextTable tab;
1856
1857 if (header) {
1858 tab.define_column("POOLID", TextTable::LEFT, TextTable::LEFT);
1859 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1860 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1861 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1862 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1863 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1864 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1865 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1866 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1867 } else {
1868 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1869 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1870 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1871 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1872 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1873 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1874 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1875 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1876 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1877 }
1878
1879 for (auto p = pg_pool_sum.begin();
1880 p != pg_pool_sum.end();
1881 ++p) {
1882 tab << p->first
1883 << p->second.stats.sum.num_objects
1884 << p->second.stats.sum.num_objects_missing_on_primary
1885 << p->second.stats.sum.num_objects_degraded
1886 << p->second.stats.sum.num_objects_misplaced
1887 << p->second.stats.sum.num_objects_unfound
1888 << p->second.stats.sum.num_bytes
1889 << p->second.log_size
1890 << p->second.ondisk_log_size
1891 << TextTable::endrow;
1892 }
1893
1894 ss << tab;
1895}
1896
1897void PGMap::dump_pg_sum_stats(ostream& ss, bool header) const
1898{
1899 TextTable tab;
1900
1901 if (header) {
1902 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1903 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1904 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1905 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1906 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1907 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1908 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1909 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1910 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1911 } else {
1912 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1913 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1914 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1915 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1916 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1917 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1918 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1919 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1920 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1921 };
1922
1923 tab << "sum"
1924 << pg_sum.stats.sum.num_objects
1925 << pg_sum.stats.sum.num_objects_missing_on_primary
1926 << pg_sum.stats.sum.num_objects_degraded
1927 << pg_sum.stats.sum.num_objects_misplaced
1928 << pg_sum.stats.sum.num_objects_unfound
1929 << pg_sum.stats.sum.num_bytes
1930 << pg_sum.log_size
1931 << pg_sum.ondisk_log_size
1932 << TextTable::endrow;
1933
1934 ss << tab;
1935}
1936
1937void PGMap::dump_osd_stats(ostream& ss) const
1938{
1939 TextTable tab;
1940
1941 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1942 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1943 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
1944 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
1945 tab.define_column("HB_PEERS", TextTable::LEFT, TextTable::RIGHT);
1946 tab.define_column("PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1947 tab.define_column("PRIMARY_PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1948
1949 for (auto p = osd_stat.begin();
1950 p != osd_stat.end();
1951 ++p) {
1952 tab << p->first
1953 << si_t(p->second.kb_used << 10)
1954 << si_t(p->second.kb_avail << 10)
1955 << si_t(p->second.kb << 10)
1956 << p->second.hb_peers
1957 << get_num_pg_by_osd(p->first)
1958 << get_num_primary_pg_by_osd(p->first)
1959 << TextTable::endrow;
1960 }
1961
1962 tab << "sum"
1963 << si_t(osd_sum.kb_used << 10)
1964 << si_t(osd_sum.kb_avail << 10)
1965 << si_t(osd_sum.kb << 10)
1966 << TextTable::endrow;
7c673cae 1967
31f18b77 1968 ss << tab;
7c673cae
FG
1969}
1970
31f18b77 1971void PGMap::dump_osd_sum_stats(ostream& ss) const
7c673cae 1972{
31f18b77 1973 TextTable tab;
7c673cae 1974
31f18b77
FG
1975 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1976 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1977 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
1978 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
7c673cae 1979
31f18b77
FG
1980 tab << "sum"
1981 << si_t(osd_sum.kb_used << 10)
1982 << si_t(osd_sum.kb_avail << 10)
1983 << si_t(osd_sum.kb << 10)
1984 << TextTable::endrow;
7c673cae 1985
31f18b77 1986 ss << tab;
7c673cae
FG
1987}
1988
31f18b77
FG
1989void PGMap::get_stuck_stats(
1990 int types, const utime_t cutoff,
1991 mempool::pgmap::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const
7c673cae 1992{
31f18b77
FG
1993 assert(types != 0);
1994 for (auto i = pg_stat.begin();
1995 i != pg_stat.end();
1996 ++i) {
1997 utime_t val = cutoff; // don't care about >= cutoff so that is infinity
1998
1999 if ((types & STUCK_INACTIVE) && !(i->second.state & PG_STATE_ACTIVE)) {
2000 if (i->second.last_active < val)
2001 val = i->second.last_active;
7c673cae 2002 }
31f18b77
FG
2003
2004 if ((types & STUCK_UNCLEAN) && !(i->second.state & PG_STATE_CLEAN)) {
2005 if (i->second.last_clean < val)
2006 val = i->second.last_clean;
7c673cae 2007 }
31f18b77
FG
2008
2009 if ((types & STUCK_DEGRADED) && (i->second.state & PG_STATE_DEGRADED)) {
2010 if (i->second.last_undegraded < val)
2011 val = i->second.last_undegraded;
7c673cae 2012 }
7c673cae 2013
31f18b77
FG
2014 if ((types & STUCK_UNDERSIZED) && (i->second.state & PG_STATE_UNDERSIZED)) {
2015 if (i->second.last_fullsized < val)
2016 val = i->second.last_fullsized;
2017 }
7c673cae 2018
31f18b77
FG
2019 if ((types & STUCK_STALE) && (i->second.state & PG_STATE_STALE)) {
2020 if (i->second.last_unstale < val)
2021 val = i->second.last_unstale;
2022 }
7c673cae 2023
31f18b77
FG
2024 // val is now the earliest any of the requested stuck states began
2025 if (val < cutoff) {
2026 stuck_pgs[i->first] = i->second;
2027 }
2028 }
7c673cae
FG
2029}
2030
31f18b77 2031bool PGMap::get_stuck_counts(const utime_t cutoff, map<string, int>& note) const
7c673cae 2032{
31f18b77
FG
2033 int inactive = 0;
2034 int unclean = 0;
2035 int degraded = 0;
2036 int undersized = 0;
2037 int stale = 0;
7c673cae 2038
31f18b77
FG
2039 for (auto i = pg_stat.begin();
2040 i != pg_stat.end();
2041 ++i) {
2042 if (! (i->second.state & PG_STATE_ACTIVE)) {
2043 if (i->second.last_active < cutoff)
2044 ++inactive;
7c673cae 2045 }
31f18b77
FG
2046 if (! (i->second.state & PG_STATE_CLEAN)) {
2047 if (i->second.last_clean < cutoff)
2048 ++unclean;
7c673cae 2049 }
31f18b77
FG
2050 if (i->second.state & PG_STATE_DEGRADED) {
2051 if (i->second.last_undegraded < cutoff)
2052 ++degraded;
7c673cae 2053 }
31f18b77
FG
2054 if (i->second.state & PG_STATE_UNDERSIZED) {
2055 if (i->second.last_fullsized < cutoff)
2056 ++undersized;
7c673cae 2057 }
31f18b77
FG
2058 if (i->second.state & PG_STATE_STALE) {
2059 if (i->second.last_unstale < cutoff)
2060 ++stale;
7c673cae
FG
2061 }
2062 }
31f18b77
FG
2063
2064 if (inactive)
2065 note["stuck inactive"] = inactive;
2066
2067 if (unclean)
2068 note["stuck unclean"] = unclean;
2069
2070 if (undersized)
2071 note["stuck undersized"] = undersized;
2072
2073 if (degraded)
2074 note["stuck degraded"] = degraded;
2075
2076 if (stale)
2077 note["stuck stale"] = stale;
2078
2079 return inactive || unclean || undersized || degraded || stale;
2080}
2081
2082void PGMap::dump_stuck(Formatter *f, int types, utime_t cutoff) const
2083{
2084 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
2085 get_stuck_stats(types, cutoff, stuck_pg_stats);
2086 f->open_array_section("stuck_pg_stats");
2087 for (auto i = stuck_pg_stats.begin();
2088 i != stuck_pg_stats.end();
2089 ++i) {
2090 f->open_object_section("pg_stat");
2091 f->dump_stream("pgid") << i->first;
2092 i->second.dump(f);
2093 f->close_section();
2094 }
2095 f->close_section();
2096}
2097
2098void PGMap::dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const
2099{
2100 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
2101 get_stuck_stats(types, cutoff, stuck_pg_stats);
2102 if (!stuck_pg_stats.empty())
2103 dump_pg_stats_plain(ss, stuck_pg_stats, true);
2104}
2105
2106int PGMap::dump_stuck_pg_stats(
2107 stringstream &ds,
2108 Formatter *f,
2109 int threshold,
2110 vector<string>& args) const
2111{
2112 int stuck_types = 0;
2113
2114 for (auto i = args.begin(); i != args.end(); ++i) {
2115 if (*i == "inactive")
2116 stuck_types |= PGMap::STUCK_INACTIVE;
2117 else if (*i == "unclean")
2118 stuck_types |= PGMap::STUCK_UNCLEAN;
2119 else if (*i == "undersized")
2120 stuck_types |= PGMap::STUCK_UNDERSIZED;
2121 else if (*i == "degraded")
2122 stuck_types |= PGMap::STUCK_DEGRADED;
2123 else if (*i == "stale")
2124 stuck_types |= PGMap::STUCK_STALE;
2125 else {
2126 ds << "Unknown type: " << *i << std::endl;
2127 return -EINVAL;
7c673cae
FG
2128 }
2129 }
31f18b77
FG
2130
2131 utime_t now(ceph_clock_now());
2132 utime_t cutoff = now - utime_t(threshold, 0);
2133
2134 if (!f) {
2135 dump_stuck_plain(ds, stuck_types, cutoff);
2136 } else {
2137 dump_stuck(f, stuck_types, cutoff);
2138 f->flush(ds);
7c673cae 2139 }
31f18b77
FG
2140
2141 return 0;
7c673cae
FG
2142}
2143
31f18b77 2144void PGMap::dump_osd_perf_stats(Formatter *f) const
7c673cae 2145{
31f18b77
FG
2146 f->open_array_section("osd_perf_infos");
2147 for (auto i = osd_stat.begin();
2148 i != osd_stat.end();
2149 ++i) {
2150 f->open_object_section("osd");
2151 f->dump_int("id", i->first);
2152 {
2153 f->open_object_section("perf_stats");
2154 i->second.os_perf_stat.dump(f);
2155 f->close_section();
2156 }
2157 f->close_section();
2158 }
2159 f->close_section();
7c673cae 2160}
31f18b77 2161void PGMap::print_osd_perf_stats(std::ostream *ss) const
7c673cae 2162{
31f18b77
FG
2163 TextTable tab;
2164 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2165 tab.define_column("commit_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2166 tab.define_column("apply_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2167 for (auto i = osd_stat.begin();
2168 i != osd_stat.end();
2169 ++i) {
2170 tab << i->first;
2171 tab << i->second.os_perf_stat.os_commit_latency;
2172 tab << i->second.os_perf_stat.os_apply_latency;
2173 tab << TextTable::endrow;
2174 }
2175 (*ss) << tab;
2176}
7c673cae 2177
31f18b77
FG
2178void PGMap::dump_osd_blocked_by_stats(Formatter *f) const
2179{
2180 f->open_array_section("osd_blocked_by_infos");
2181 for (auto i = blocked_by_sum.begin();
2182 i != blocked_by_sum.end();
2183 ++i) {
2184 f->open_object_section("osd");
2185 f->dump_int("id", i->first);
2186 f->dump_int("num_blocked", i->second);
2187 f->close_section();
2188 }
2189 f->close_section();
2190}
2191void PGMap::print_osd_blocked_by_stats(std::ostream *ss) const
2192{
2193 TextTable tab;
2194 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2195 tab.define_column("num_blocked", TextTable::LEFT, TextTable::RIGHT);
2196 for (auto i = blocked_by_sum.begin();
2197 i != blocked_by_sum.end();
2198 ++i) {
2199 tab << i->first;
2200 tab << i->second;
2201 tab << TextTable::endrow;
2202 }
2203 (*ss) << tab;
7c673cae
FG
2204}
2205
31f18b77 2206
7c673cae
FG
2207/**
2208 * update aggregated delta
2209 *
2210 * @param cct ceph context
2211 * @param ts Timestamp for the stats being delta'ed
2212 * @param old_pool_sum Previous stats sum
2213 * @param last_ts Last timestamp for pool
2214 * @param result_pool_sum Resulting stats
2215 * @param result_pool_delta Resulting pool delta
2216 * @param result_ts_delta Resulting timestamp delta
2217 * @param delta_avg_list List of last N computed deltas, used to average
2218 */
31f18b77
FG
2219void PGMap::update_delta(
2220 CephContext *cct,
2221 const utime_t ts,
2222 const pool_stat_t& old_pool_sum,
2223 utime_t *last_ts,
2224 const pool_stat_t& current_pool_sum,
2225 pool_stat_t *result_pool_delta,
2226 utime_t *result_ts_delta,
2227 mempool::pgmap::list<pair<pool_stat_t,utime_t> > *delta_avg_list)
7c673cae
FG
2228{
2229 /* @p ts is the timestamp we want to associate with the data
2230 * in @p old_pool_sum, and on which we will base ourselves to
2231 * calculate the delta, stored in 'delta_t'.
2232 */
2233 utime_t delta_t;
2234 delta_t = ts; // start with the provided timestamp
2235 delta_t -= *last_ts; // take the last timestamp we saw
2236 *last_ts = ts; // @p ts becomes the last timestamp we saw
2237
31f18b77
FG
2238 // adjust delta_t, quick start if there is no update in a long period
2239 delta_t = std::min(delta_t,
2240 utime_t(2 * (cct ? cct->_conf->mon_delta_reset_interval : 10), 0));
2241
2242 // calculate a delta, and average over the last 6 deltas by default.
7c673cae
FG
2243 /* start by taking a copy of our current @p result_pool_sum, and by
2244 * taking out the stats from @p old_pool_sum. This generates a stats
2245 * delta. Stash this stats delta in @p delta_avg_list, along with the
2246 * timestamp delta for these results.
2247 */
2248 pool_stat_t d = current_pool_sum;
2249 d.stats.sub(old_pool_sum.stats);
2250 delta_avg_list->push_back(make_pair(d,delta_t));
2251 *result_ts_delta += delta_t;
2252
2253 /* Aggregate current delta, and take out the last seen delta (if any) to
2254 * average it out.
2255 */
2256 result_pool_delta->stats.add(d.stats);
2257 size_t s = MAX(1, cct ? cct->_conf->mon_stat_smooth_intervals : 1);
2258 if (delta_avg_list->size() > s) {
2259 result_pool_delta->stats.sub(delta_avg_list->front().first.stats);
2260 *result_ts_delta -= delta_avg_list->front().second;
2261 delta_avg_list->pop_front();
2262 }
2263}
2264
2265/**
2266 * update aggregated delta
2267 *
2268 * @param cct ceph context
2269 * @param ts Timestamp
2270 * @param pg_sum_old Old pg_sum
2271 */
2272void PGMap::update_global_delta(CephContext *cct,
2273 const utime_t ts, const pool_stat_t& pg_sum_old)
2274{
2275 update_delta(cct, ts, pg_sum_old, &stamp, pg_sum, &pg_sum_delta,
2276 &stamp_delta, &pg_sum_deltas);
2277}
2278
2279/**
2280 * Update a given pool's deltas
2281 *
2282 * @param cct Ceph Context
2283 * @param ts Timestamp for the stats being delta'ed
2284 * @param pool Pool's id
2285 * @param old_pool_sum Previous stats sum
2286 */
31f18b77
FG
2287void PGMap::update_one_pool_delta(
2288 CephContext *cct,
2289 const utime_t ts,
2290 const uint64_t pool,
2291 const pool_stat_t& old_pool_sum)
7c673cae
FG
2292{
2293 if (per_pool_sum_deltas.count(pool) == 0) {
2294 assert(per_pool_sum_deltas_stamps.count(pool) == 0);
2295 assert(per_pool_sum_delta.count(pool) == 0);
2296 }
2297
31f18b77 2298 auto& sum_delta = per_pool_sum_delta[pool];
7c673cae
FG
2299
2300 update_delta(cct, ts, old_pool_sum, &sum_delta.second, pg_pool_sum[pool],
2301 &sum_delta.first, &per_pool_sum_deltas_stamps[pool],
2302 &per_pool_sum_deltas[pool]);
2303}
2304
2305/**
2306 * Update pools' deltas
2307 *
2308 * @param cct CephContext
2309 * @param ts Timestamp for the stats being delta'ed
2310 * @param pg_pool_sum_old Map of pool stats for delta calcs.
2311 */
31f18b77
FG
2312void PGMap::update_pool_deltas(
2313 CephContext *cct, const utime_t ts,
2314 const mempool::pgmap::unordered_map<uint64_t,pool_stat_t>& pg_pool_sum_old)
7c673cae 2315{
31f18b77 2316 for (auto it = pg_pool_sum_old.begin();
7c673cae
FG
2317 it != pg_pool_sum_old.end(); ++it) {
2318 update_one_pool_delta(cct, ts, it->first, it->second);
2319 }
2320}
2321
2322void PGMap::clear_delta()
2323{
2324 pg_sum_delta = pool_stat_t();
2325 pg_sum_deltas.clear();
2326 stamp_delta = utime_t();
2327}
2328
7c673cae
FG
2329void PGMap::generate_test_instances(list<PGMap*>& o)
2330{
2331 o.push_back(new PGMap);
2332 list<Incremental*> inc;
2333 Incremental::generate_test_instances(inc);
2334 delete inc.front();
2335 inc.pop_front();
2336 while (!inc.empty()) {
2337 PGMap *pmp = new PGMap();
2338 *pmp = *o.back();
2339 o.push_back(pmp);
2340 o.back()->apply_incremental(NULL, *inc.front());
2341 delete inc.front();
2342 inc.pop_front();
2343 }
2344}
2345
2346void PGMap::get_filtered_pg_stats(uint32_t state, int64_t poolid, int64_t osdid,
2347 bool primary, set<pg_t>& pgs) const
2348{
31f18b77 2349 for (auto i = pg_stat.begin();
7c673cae
FG
2350 i != pg_stat.end();
2351 ++i) {
2352 if ((poolid >= 0) && (uint64_t(poolid) != i->first.pool()))
2353 continue;
2354 if ((osdid >= 0) && !(i->second.is_acting_osd(osdid,primary)))
2355 continue;
2356 if (!(i->second.state & state))
2357 continue;
2358 pgs.insert(i->first);
2359 }
2360}
2361
2362void PGMap::dump_filtered_pg_stats(Formatter *f, set<pg_t>& pgs) const
2363{
2364 f->open_array_section("pg_stats");
31f18b77 2365 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
7c673cae
FG
2366 const pg_stat_t& st = pg_stat.at(*i);
2367 f->open_object_section("pg_stat");
2368 f->dump_stream("pgid") << *i;
2369 st.dump(f);
2370 f->close_section();
2371 }
2372 f->close_section();
2373}
2374
2375void PGMap::dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const
2376{
2377 TextTable tab;
2378
2379 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
2380 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
2381 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
2382 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
2383 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
2384 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
2385 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
2386 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
2387 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
2388 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
2389 tab.define_column("STATE_STAMP", TextTable::LEFT, TextTable::RIGHT);
2390 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
2391 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
2392 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
2393 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
2394 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
2395 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
2396 tab.define_column("LAST_SCRUB", TextTable::LEFT, TextTable::RIGHT);
2397 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
2398 tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT);
2399 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
2400
31f18b77 2401 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
7c673cae
FG
2402 const pg_stat_t& st = pg_stat.at(*i);
2403
2404 ostringstream reported;
2405 reported << st.reported_epoch << ":" << st.reported_seq;
2406
2407 tab << *i
2408 << st.stats.sum.num_objects
2409 << st.stats.sum.num_objects_missing_on_primary
2410 << st.stats.sum.num_objects_degraded
2411 << st.stats.sum.num_objects_misplaced
2412 << st.stats.sum.num_objects_unfound
2413 << st.stats.sum.num_bytes
2414 << st.log_size
2415 << st.ondisk_log_size
2416 << pg_state_string(st.state)
2417 << st.last_change
2418 << st.version
2419 << reported.str()
2420 << st.up
2421 << st.up_primary
2422 << st.acting
2423 << st.acting_primary
2424 << st.last_scrub
2425 << st.last_scrub_stamp
2426 << st.last_deep_scrub
2427 << st.last_deep_scrub_stamp
2428 << TextTable::endrow;
2429 }
2430
2431 ss << tab;
2432}
2433
7c673cae 2434
7c673cae 2435
31f18b77
FG
2436// Only called with a single bit set in "what"
2437static void note_stuck_detail(
2438 int what,
2439 mempool::pgmap::unordered_map<pg_t,pg_stat_t>& stuck_pgs,
2440 int max_detail,
2441 list<pair<health_status_t,string> > *detail)
2442{
2443 int n = 0;
2444 for (auto p = stuck_pgs.begin();
2445 p != stuck_pgs.end();
2446 ++p) {
2447 ostringstream ss;
2448 utime_t since;
2449 const char *whatname = 0;
2450 switch (what) {
2451 case PGMap::STUCK_INACTIVE:
2452 since = p->second.last_active;
2453 whatname = "inactive";
2454 break;
2455 case PGMap::STUCK_UNCLEAN:
2456 since = p->second.last_clean;
2457 whatname = "unclean";
2458 break;
2459 case PGMap::STUCK_DEGRADED:
2460 since = p->second.last_undegraded;
2461 whatname = "degraded";
2462 break;
2463 case PGMap::STUCK_UNDERSIZED:
2464 since = p->second.last_fullsized;
2465 whatname = "undersized";
2466 break;
2467 case PGMap::STUCK_STALE:
2468 since = p->second.last_unstale;
2469 whatname = "stale";
2470 break;
2471 default:
2472 ceph_abort();
2473 }
2474 if (--max_detail == 0) {
2475 ostringstream ss;
2476 ss << (stuck_pgs.size() - n) << " more pgs are also stuck " << whatname;
2477 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
2478 break;
2479 }
2480 ++n;
2481 ss << "pg " << p->first << " is stuck " << whatname;
2482 if (since == utime_t()) {
2483 ss << " since forever";
7c673cae 2484 } else {
31f18b77
FG
2485 utime_t dur = ceph_clock_now() - since;
2486 ss << " for " << dur;
7c673cae 2487 }
31f18b77
FG
2488 ss << ", current state " << pg_state_string(p->second.state)
2489 << ", last acting " << p->second.acting;
2490 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
7c673cae 2491 }
7c673cae
FG
2492}
2493
31f18b77
FG
2494static pair<int,int> _warn_slow_request_histogram(
2495 CephContext *cct,
2496 const pow2_hist_t& h,
2497 string suffix,
2498 list<pair<health_status_t,string> >& summary,
2499 list<pair<health_status_t,string> > *detail)
7c673cae 2500{
31f18b77
FG
2501 if (h.h.empty())
2502 return make_pair(0, 0);
7c673cae 2503
31f18b77
FG
2504 unsigned warn = 0, error = 0;
2505 float err_age =
2506 cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio;
2507 for (unsigned i = h.h.size() - 1; i > 0; --i) {
2508 float ub = (float)(1 << i) / 1000.0;
2509 if (ub < cct->_conf->mon_osd_warn_op_age)
2510 break;
2511 if (h.h[i]) {
2512 auto sev = HEALTH_WARN;
2513 if (ub > err_age) {
2514 sev = HEALTH_ERR;
2515 error += h.h[i];
2516 } else {
2517 warn += h.h[i];
2518 }
2519 if (detail) {
2520 ostringstream ss;
2521 ss << h.h[i] << " ops are blocked > " << ub << " sec" << suffix;
2522 detail->push_back(make_pair(sev, ss.str()));
2523 }
7c673cae 2524 }
31f18b77
FG
2525 }
2526 return make_pair(warn, error);
2527}
7c673cae 2528
31f18b77
FG
2529namespace {
2530 enum class scrubbed_or_deepscrubbed_t { SCRUBBED, DEEPSCRUBBED };
2531
2532 void print_unscrubbed_detailed(
2533 const std::pair<const pg_t,pg_stat_t> &pg_entry,
2534 list<pair<health_status_t,string> > *detail,
2535 scrubbed_or_deepscrubbed_t how_scrubbed)
2536 {
2537 std::stringstream ss;
2538 const auto& pg_stat(pg_entry.second);
2539
2540 ss << "pg " << pg_entry.first << " is not ";
2541 if (how_scrubbed == scrubbed_or_deepscrubbed_t::SCRUBBED) {
2542 ss << "scrubbed, last_scrub_stamp "
2543 << pg_stat.last_scrub_stamp;
2544 } else if (how_scrubbed == scrubbed_or_deepscrubbed_t::DEEPSCRUBBED) {
2545 ss << "deep-scrubbed, last_deep_scrub_stamp "
2546 << pg_stat.last_deep_scrub_stamp;
7c673cae 2547 }
31f18b77
FG
2548
2549 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
7c673cae
FG
2550 }
2551
31f18b77 2552 using pg_stat_map_t = const mempool::pgmap::unordered_map<pg_t,pg_stat_t>;
7c673cae 2553
31f18b77
FG
2554 void print_unscrubbed_pgs(
2555 pg_stat_map_t& pg_stats,
2556 list<pair<health_status_t,string> > &summary,
2557 list<pair<health_status_t,string> > *detail,
2558 const CephContext* cct)
2559 {
2560 if (cct->_conf->mon_warn_not_scrubbed == 0 &&
2561 cct->_conf->mon_warn_not_deep_scrubbed == 0)
2562 return;
2563
2564 int pgs_count = 0;
2565 const utime_t now = ceph_clock_now();
2566 for (const auto& pg_entry : pg_stats) {
2567 const auto& pg_stat(pg_entry.second);
2568 const utime_t time_since_ls = now - pg_stat.last_scrub_stamp;
2569 const utime_t time_since_lds = now - pg_stat.last_deep_scrub_stamp;
2570
2571 const int mon_warn_not_scrubbed =
2572 cct->_conf->mon_warn_not_scrubbed + cct->_conf->mon_scrub_interval;
2573
2574 const int mon_warn_not_deep_scrubbed =
2575 cct->_conf->mon_warn_not_deep_scrubbed + cct->_conf->osd_deep_scrub_interval;
2576
2577 bool not_scrubbed = (time_since_ls >= mon_warn_not_scrubbed &&
2578 cct->_conf->mon_warn_not_scrubbed != 0);
2579
2580 bool not_deep_scrubbed = (time_since_lds >= mon_warn_not_deep_scrubbed &&
2581 cct->_conf->mon_warn_not_deep_scrubbed != 0);
2582
2583 if (detail != nullptr) {
2584 if (not_scrubbed) {
2585 print_unscrubbed_detailed(pg_entry,
2586 detail,
2587 scrubbed_or_deepscrubbed_t::SCRUBBED);
2588 }
2589 if (not_deep_scrubbed) {
2590 print_unscrubbed_detailed(pg_entry,
2591 detail,
2592 scrubbed_or_deepscrubbed_t::DEEPSCRUBBED);
2593 }
2594 }
2595 if (not_scrubbed || not_deep_scrubbed) {
2596 ++pgs_count;
7c673cae
FG
2597 }
2598 }
31f18b77
FG
2599
2600 if (pgs_count > 0) {
2601 std::stringstream ss;
2602 ss << pgs_count << " unscrubbed pgs";
2603 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
7c673cae 2604 }
224ce89b
WB
2605 }
2606}
2607
2608void PGMap::get_health_checks(
2609 CephContext *cct,
2610 const OSDMap& osdmap,
2611 health_check_map_t *checks) const
2612{
2613 utime_t now = ceph_clock_now();
2614 const unsigned max = cct->_conf->mon_health_max_detail;
2615 const auto& pools = osdmap.get_pools();
2616
224ce89b
WB
2617 typedef enum pg_consequence_t {
2618 UNAVAILABLE = 1, // Client IO to the pool may block
2619 DEGRADED = 2, // Fewer than the requested number of replicas are present
2620 DEGRADED_FULL = 3, // Fewer than the request number of replicas may be present
2621 // and insufficiet resources are present to fix this
2622 DAMAGED = 4 // The data may be missing or inconsistent on disk and
2623 // requires repair
2624 } pg_consequence_t;
2625
2626 // For a given PG state, how should it be reported at the pool level?
2627 class PgStateResponse {
2628 public:
2629 pg_consequence_t consequence;
2630 typedef std::function< utime_t(const pg_stat_t&) > stuck_cb;
2631 stuck_cb stuck_since;
2632 bool invert;
2633
2634 PgStateResponse(const pg_consequence_t &c, stuck_cb s)
2635 : consequence(c), stuck_since(s), invert(false)
2636 {
2637 }
2638
2639 PgStateResponse(const pg_consequence_t &c, stuck_cb s, bool i)
2640 : consequence(c), stuck_since(s), invert(i)
2641 {
2642 }
2643 };
2644
2645 // Record the PG state counts that contributed to a reported pool state
2646 class PgCauses {
2647 public:
2648 // Map of PG_STATE_* to number of pgs in that state.
2649 std::map<unsigned, unsigned> states;
2650
2651 // List of all PG IDs that had a state contributing
2652 // to this health condition.
2653 std::set<pg_t> pgs;
2654
2655 std::map<pg_t, std::string> pg_messages;
2656 };
2657
2658 // Map of PG state to how to respond to it
2659 std::map<unsigned, PgStateResponse> state_to_response = {
2660 // Immediate reports
2661 { PG_STATE_INCONSISTENT, {DAMAGED, {}} },
c07f9fc5 2662 { PG_STATE_INCOMPLETE, {UNAVAILABLE, {}} },
224ce89b
WB
2663 { PG_STATE_REPAIR, {DAMAGED, {}} },
2664 { PG_STATE_SNAPTRIM_ERROR, {DAMAGED, {}} },
c07f9fc5
FG
2665 { PG_STATE_BACKFILL_TOOFULL, {DEGRADED_FULL, {}} },
2666 { PG_STATE_RECOVERY_TOOFULL, {DEGRADED_FULL, {}} },
224ce89b
WB
2667 { PG_STATE_DEGRADED, {DEGRADED, {}} },
2668 { PG_STATE_DOWN, {UNAVAILABLE, {}} },
2669 // Delayed (wait until stuck) reports
2670 { PG_STATE_PEERING, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_peered;} } },
2671 { PG_STATE_UNDERSIZED, {DEGRADED, [](const pg_stat_t &p){return p.last_fullsized;} } },
2672 { PG_STATE_STALE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_unstale;} } },
2673 // Delayed and inverted reports
2674 { PG_STATE_ACTIVE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_active;}, true} },
2675 { PG_STATE_CLEAN, {DEGRADED, [](const pg_stat_t &p){return p.last_clean;}, true} }
2676 };
2677
2678 // Specialized state printer that takes account of inversion of
2679 // ACTIVE, CLEAN checks.
2680 auto state_name = [](const uint32_t &state) {
2681 // Special cases for the states that are inverted checks
2682 if (state == PG_STATE_CLEAN) {
2683 return std::string("unclean");
2684 } else if (state == PG_STATE_ACTIVE) {
2685 return std::string("inactive");
2686 } else {
2687 return pg_state_string(state);
2688 }
2689 };
2690
2691 // Map of what is wrong to information about why, implicitly also stores
2692 // the list of what is wrong.
2693 std::map<pg_consequence_t, PgCauses> detected;
2694
2695 // Optimisation: trim down the number of checks to apply based on
2696 // the summary counters
2697 std::map<unsigned, PgStateResponse> possible_responses;
2698 for (const auto &i : num_pg_by_state) {
2699 for (const auto &j : state_to_response) {
2700 if (!j.second.invert) {
2701 // Check for normal tests by seeing if any pgs have the flag
2702 if (i.first & j.first) {
2703 possible_responses.insert(j);
2704 }
2705 }
2706 }
2707 }
2708
2709 for (const auto &j : state_to_response) {
2710 if (j.second.invert) {
2711 // Check for inverted tests by seeing if not-all pgs have the flag
2712 const auto &found = num_pg_by_state.find(j.first);
2713 if (found == num_pg_by_state.end() || found->second != num_pg) {
2714 possible_responses.insert(j);
2715 }
2716 }
2717 }
2718
2719 utime_t cutoff = now - utime_t(cct->_conf->mon_pg_stuck_threshold, 0);
2720 // Loop over all PGs, if there are any possibly-unhealthy states in there
2721 if (!possible_responses.empty()) {
2722 for (const auto& i : pg_stat) {
2723 const auto &pg_id = i.first;
2724 const auto &pg_info = i.second;
2725
2726 for (const auto &j : state_to_response) {
2727 const auto &pg_response_state = j.first;
2728 const auto &pg_response = j.second;
2729
2730 // Apply the state test
2731 if (!(bool(pg_info.state & pg_response_state) != pg_response.invert)) {
2732 continue;
2733 }
2734
2735 // Apply stuckness test if needed
2736 if (pg_response.stuck_since) {
2737 // Delayed response, check for stuckness
2738 utime_t last_whatever = pg_response.stuck_since(pg_info);
2739 if (last_whatever >= cutoff) {
2740 // Not stuck enough, ignore.
2741 continue;
2742 } else {
2743
2744 }
2745 }
2746
2747 auto &causes = detected[pg_response.consequence];
2748 causes.states[pg_response_state]++;
2749 causes.pgs.insert(pg_id);
2750
2751 // Don't bother composing detail string if we have already recorded
2752 // too many
2753 if (causes.pg_messages.size() > max) {
2754 continue;
2755 }
2756
2757 std::ostringstream ss;
2758 if (pg_response.stuck_since) {
2759 utime_t since = pg_response.stuck_since(pg_info);
2760 ss << "pg " << pg_id << " is stuck " << state_name(pg_response_state);
2761 if (since == utime_t()) {
2762 ss << " since forever";
2763 } else {
2764 utime_t dur = now - since;
2765 ss << " for " << dur;
2766 }
2767 ss << ", current state " << pg_state_string(pg_info.state)
2768 << ", last acting " << pg_info.acting;
2769 } else {
2770 ss << "pg " << pg_id << " is "
2771 << pg_state_string(pg_info.state);
2772 ss << ", acting " << pg_info.acting;
2773 if (pg_info.stats.sum.num_objects_unfound) {
2774 ss << ", " << pg_info.stats.sum.num_objects_unfound
2775 << " unfound";
2776 }
2777 }
2778
2779 if (pg_info.state & PG_STATE_INCOMPLETE) {
2780 const pg_pool_t *pi = osdmap.get_pg_pool(pg_id.pool());
2781 if (pi && pi->min_size > 1) {
2782 ss << " (reducing pool "
2783 << osdmap.get_pool_name(pg_id.pool())
2784 << " min_size from " << (int)pi->min_size
2785 << " may help; search ceph.com/docs for 'incomplete')";
2786 }
2787 }
2788
2789 causes.pg_messages[pg_id] = ss.str();
2790 }
2791 }
2792 } else {
2793 dout(10) << __func__ << " skipping loop over PGs: counters look OK" << dendl;
2794 }
2795
2796 for (const auto &i : detected) {
2797 std::string health_code;
2798 health_status_t sev;
2799 std::string summary;
2800 switch(i.first) {
2801 case UNAVAILABLE:
2802 health_code = "PG_AVAILABILITY";
2803 sev = HEALTH_WARN;
2804 summary = "Reduced data availability: ";
2805 break;
2806 case DEGRADED:
2807 health_code = "PG_DEGRADED";
2808 summary = "Degraded data redundancy: ";
2809 sev = HEALTH_WARN;
2810 break;
2811 case DEGRADED_FULL:
2812 health_code = "PG_DEGRADED_FULL";
2813 summary = "Degraded data redundancy (low space): ";
2814 sev = HEALTH_ERR;
2815 break;
2816 case DAMAGED:
2817 health_code = "PG_DAMAGED";
2818 summary = "Possible data damage: ";
2819 sev = HEALTH_ERR;
2820 break;
2821 default:
2822 assert(false);
2823 }
2824
2825 if (i.first == DEGRADED) {
2826 if (pg_sum.stats.sum.num_objects_degraded &&
2827 pg_sum.stats.sum.num_object_copies > 0) {
2828 double pc = (double)pg_sum.stats.sum.num_objects_degraded /
2829 (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
2830 char b[20];
2831 snprintf(b, sizeof(b), "%.3lf", pc);
2832 ostringstream ss;
2833 ss << pg_sum.stats.sum.num_objects_degraded
2834 << "/" << pg_sum.stats.sum.num_object_copies << " objects degraded ("
2835 << b << "%)";
2836
2837 // Throw in a comma for the benefit of the following PG counts
2838 summary += ss.str() + ", ";
2839 }
2840 }
2841
2842 // Compose summary message saying how many PGs in what states led
2843 // to this health check failing
2844 std::vector<std::string> pg_msgs;
2845 for (const auto &j : i.second.states) {
2846 std::ostringstream msg;
2847 msg << j.second << (j.second > 1 ? " pgs " : " pg ") << state_name(j.first);
2848 pg_msgs.push_back(msg.str());
2849 }
2850 summary += joinify(pg_msgs.begin(), pg_msgs.end(), std::string(", "));
2851
2852
2853
2854 health_check_t *check = &checks->add(
2855 health_code,
2856 sev,
2857 summary);
2858
2859 // Compose list of PGs contributing to this health check failing
2860 for (const auto &j : i.second.pg_messages) {
2861 check->detail.push_back(j.second);
2862 }
2863 }
2864
224ce89b
WB
2865 // OSD_SCRUB_ERRORS
2866 if (pg_sum.stats.sum.num_scrub_errors) {
2867 ostringstream ss;
2868 ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors";
2869 checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str());
2870 }
2871
2872 // CACHE_POOL_NEAR_FULL
2873 {
2874 list<string> detail;
2875 unsigned num_pools = 0;
2876 for (auto& p : pools) {
2877 if ((!p.second.target_max_objects && !p.second.target_max_bytes) ||
2878 !pg_pool_sum.count(p.first)) {
2879 continue;
2880 }
2881 bool nearfull = false;
2882 const string& name = osdmap.get_pool_name(p.first);
2883 const pool_stat_t& st = get_pg_pool_sum_stat(p.first);
2884 uint64_t ratio = p.second.cache_target_full_ratio_micro +
2885 ((1000000 - p.second.cache_target_full_ratio_micro) *
2886 cct->_conf->mon_cache_target_full_warn_ratio);
2887 if (p.second.target_max_objects &&
2888 (uint64_t)(st.stats.sum.num_objects -
2889 st.stats.sum.num_objects_hit_set_archive) >
2890 p.second.target_max_objects * (ratio / 1000000.0)) {
2891 ostringstream ss;
2892 ss << "cache pool '" << name << "' with "
2893 << si_t(st.stats.sum.num_objects)
2894 << " objects at/near target max "
2895 << si_t(p.second.target_max_objects) << " objects";
2896 detail.push_back(ss.str());
2897 nearfull = true;
2898 }
2899 if (p.second.target_max_bytes &&
2900 (uint64_t)(st.stats.sum.num_bytes -
2901 st.stats.sum.num_bytes_hit_set_archive) >
2902 p.second.target_max_bytes * (ratio / 1000000.0)) {
2903 ostringstream ss;
2904 ss << "cache pool '" << name
2905 << "' with " << si_t(st.stats.sum.num_bytes)
2906 << "B at/near target max "
2907 << si_t(p.second.target_max_bytes) << "B";
2908 detail.push_back(ss.str());
2909 nearfull = true;
2910 }
2911 if (nearfull) {
2912 ++num_pools;
2913 }
2914 }
2915 if (!detail.empty()) {
2916 ostringstream ss;
2917 ss << num_pools << " cache pools at or near target size";
2918 auto& d = checks->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN, ss.str());
2919 d.detail.swap(detail);
2920 }
2921 }
2922
2923 // TOO_FEW_PGS
2924 int num_in = osdmap.get_num_in_osds();
2925 int sum_pg_up = MAX(pg_sum.up, static_cast<int32_t>(pg_stat.size()));
2926 if (num_in &&
2927 cct->_conf->mon_pg_warn_min_per_osd > 0 &&
2928 osdmap.get_pools().size() > 0) {
2929 int per = sum_pg_up / num_in;
2930 if (per < cct->_conf->mon_pg_warn_min_per_osd && per) {
2931 ostringstream ss;
2932 ss << "too few PGs per OSD (" << per
2933 << " < min " << cct->_conf->mon_pg_warn_min_per_osd << ")";
2934 checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str());
2935 }
2936 }
2937
2938 // TOO_MANY_PGS
2939 if (num_in && cct->_conf->mon_pg_warn_max_per_osd > 0) {
2940 int per = sum_pg_up / num_in;
2941 if (per > cct->_conf->mon_pg_warn_max_per_osd) {
2942 ostringstream ss;
2943 ss << "too many PGs per OSD (" << per
2944 << " > max " << cct->_conf->mon_pg_warn_max_per_osd << ")";
2945 checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str());
2946 }
2947 }
2948
2949 // SMALLER_PGP_NUM
2950 // MANY_OBJECTS_PER_PG
2951 if (!pg_stat.empty()) {
2952 list<string> pgp_detail, many_detail;
2953 for (auto p = pg_pool_sum.begin();
2954 p != pg_pool_sum.end();
2955 ++p) {
2956 const pg_pool_t *pi = osdmap.get_pg_pool(p->first);
2957 if (!pi)
2958 continue; // in case osdmap changes haven't propagated to PGMap yet
2959 const string& name = osdmap.get_pool_name(p->first);
2960 if (pi->get_pg_num() > pi->get_pgp_num() &&
2961 !(name.find(".DELETED") != string::npos &&
2962 cct->_conf->mon_fake_pool_delete)) {
2963 ostringstream ss;
2964 ss << "pool " << name << " pg_num "
2965 << pi->get_pg_num() << " > pgp_num " << pi->get_pgp_num();
2966 pgp_detail.push_back(ss.str());
2967 }
2968 int average_objects_per_pg = pg_sum.stats.sum.num_objects / pg_stat.size();
2969 if (average_objects_per_pg > 0 &&
2970 pg_sum.stats.sum.num_objects >= cct->_conf->mon_pg_warn_min_objects &&
2971 p->second.stats.sum.num_objects >=
2972 cct->_conf->mon_pg_warn_min_pool_objects) {
2973 int objects_per_pg = p->second.stats.sum.num_objects / pi->get_pg_num();
2974 float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
2975 if (cct->_conf->mon_pg_warn_max_object_skew > 0 &&
2976 ratio > cct->_conf->mon_pg_warn_max_object_skew) {
2977 ostringstream ss;
2978 ss << "pool " << name << " objects per pg ("
2979 << objects_per_pg << ") is more than " << ratio
2980 << " times cluster average ("
2981 << average_objects_per_pg << ")";
2982 many_detail.push_back(ss.str());
2983 }
2984 }
2985 }
2986 if (!pgp_detail.empty()) {
2987 ostringstream ss;
2988 ss << pgp_detail.size() << " pools have pg_num > pgp_num";
2989 auto& d = checks->add("SMALLER_PGP_NUM", HEALTH_WARN, ss.str());
2990 d.detail.swap(pgp_detail);
2991 }
2992 if (!many_detail.empty()) {
2993 ostringstream ss;
2994 ss << many_detail.size() << " pools have many more objects per pg than"
2995 << " average";
2996 auto& d = checks->add("MANY_OBJECTS_PER_PG", HEALTH_WARN, ss.str());
2997 d.detail.swap(many_detail);
2998 }
2999 }
3000
3001 // POOL_FULL
3002 // POOL_NEAR_FULL
3003 {
3004 float warn_threshold = (float)g_conf->mon_pool_quota_warn_threshold/100;
3005 float crit_threshold = (float)g_conf->mon_pool_quota_crit_threshold/100;
3006 list<string> full_detail, nearfull_detail;
3007 unsigned full_pools = 0, nearfull_pools = 0;
3008 for (auto it : pools) {
3009 auto it2 = pg_pool_sum.find(it.first);
3010 if (it2 == pg_pool_sum.end()) {
3011 continue;
3012 }
3013 const pool_stat_t *pstat = &it2->second;
3014 const object_stat_sum_t& sum = pstat->stats.sum;
3015 const string& pool_name = osdmap.get_pool_name(it.first);
3016 const pg_pool_t &pool = it.second;
3017 bool full = false, nearfull = false;
3018 if (pool.quota_max_objects > 0) {
3019 stringstream ss;
3020 if ((uint64_t)sum.num_objects >= pool.quota_max_objects) {
3021 } else if (crit_threshold > 0 &&
3022 sum.num_objects >= pool.quota_max_objects*crit_threshold) {
3023 ss << "pool '" << pool_name
3024 << "' has " << sum.num_objects << " objects"
3025 << " (max " << pool.quota_max_objects << ")";
3026 full_detail.push_back(ss.str());
3027 full = true;
3028 } else if (warn_threshold > 0 &&
3029 sum.num_objects >= pool.quota_max_objects*warn_threshold) {
3030 ss << "pool '" << pool_name
3031 << "' has " << sum.num_objects << " objects"
3032 << " (max " << pool.quota_max_objects << ")";
3033 nearfull_detail.push_back(ss.str());
3034 nearfull = true;
3035 }
3036 }
3037 if (pool.quota_max_bytes > 0) {
3038 stringstream ss;
3039 if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
3040 } else if (crit_threshold > 0 &&
3041 sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
3042 ss << "pool '" << pool_name
3043 << "' has " << si_t(sum.num_bytes) << " bytes"
3044 << " (max " << si_t(pool.quota_max_bytes) << ")";
3045 full_detail.push_back(ss.str());
3046 full = true;
3047 } else if (warn_threshold > 0 &&
3048 sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
3049 ss << "pool '" << pool_name
3050 << "' has " << si_t(sum.num_bytes) << " bytes"
3051 << " (max " << si_t(pool.quota_max_bytes) << ")";
3052 nearfull_detail.push_back(ss.str());
3053 nearfull = true;
3054 }
3055 }
3056 if (full) {
3057 ++full_pools;
3058 }
3059 if (nearfull) {
3060 ++nearfull_pools;
3061 }
3062 }
3063 if (full_pools) {
3064 ostringstream ss;
3065 ss << full_pools << " pools full";
3066 auto& d = checks->add("POOL_FULL", HEALTH_ERR, ss.str());
3067 d.detail.swap(full_detail);
3068 }
3069 if (nearfull_pools) {
3070 ostringstream ss;
3071 ss << nearfull_pools << " pools full";
3072 auto& d = checks->add("POOL_NEAR_FULL", HEALTH_WARN, ss.str());
3073 d.detail.swap(nearfull_detail);
3074 }
3075 }
3076
3077 // OBJECT_MISPLACED
3078 if (pg_sum.stats.sum.num_objects_misplaced &&
3079 pg_sum.stats.sum.num_object_copies > 0) {
3080 double pc = (double)pg_sum.stats.sum.num_objects_misplaced /
3081 (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
3082 char b[20];
3083 snprintf(b, sizeof(b), "%.3lf", pc);
3084 ostringstream ss;
3085 ss << pg_sum.stats.sum.num_objects_misplaced
3086 << "/" << pg_sum.stats.sum.num_object_copies << " objects misplaced ("
3087 << b << "%)";
3088 checks->add("OBJECT_MISPLACED", HEALTH_WARN, ss.str());
3089 }
3090
3091 // OBJECT_UNFOUND
3092 if (pg_sum.stats.sum.num_objects_unfound &&
3093 pg_sum.stats.sum.num_objects) {
3094 double pc = (double)pg_sum.stats.sum.num_objects_unfound /
3095 (double)pg_sum.stats.sum.num_objects * (double)100.0;
3096 char b[20];
3097 snprintf(b, sizeof(b), "%.3lf", pc);
3098 ostringstream ss;
3099 ss << pg_sum.stats.sum.num_objects_unfound
3100 << "/" << pg_sum.stats.sum.num_objects << " unfound (" << b << "%)";
c07f9fc5
FG
3101 auto& d = checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str());
3102
3103 for (auto& p : pg_stat) {
3104 if (p.second.stats.sum.num_objects_unfound) {
3105 ostringstream ss;
3106 ss << "pg " << p.first
3107 << " has " << p.second.stats.sum.num_objects_unfound
3108 << " unfound objects";
3109 d.detail.push_back(ss.str());
3110 if (d.detail.size() > max) {
3111 d.detail.push_back("(additional pgs left out for brevity)");
3112 break;
3113 }
3114 }
3115 }
224ce89b
WB
3116 }
3117
3118 // REQUEST_SLOW
3119 // REQUEST_STUCK
3120 if (cct->_conf->mon_osd_warn_op_age > 0 &&
c07f9fc5
FG
3121 !osd_sum.op_queue_age_hist.h.empty() &&
3122 osd_sum.op_queue_age_hist.upper_bound() / 1000.0 >
224ce89b
WB
3123 cct->_conf->mon_osd_warn_op_age) {
3124 list<string> warn_detail, error_detail;
3125 unsigned warn = 0, error = 0;
3126 float err_age =
3127 cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio;
3128 const pow2_hist_t& h = osd_sum.op_queue_age_hist;
3129 for (unsigned i = h.h.size() - 1; i > 0; --i) {
3130 float ub = (float)(1 << i) / 1000.0;
3131 if (ub < cct->_conf->mon_osd_warn_op_age)
3132 break;
3133 if (h.h[i]) {
3134 ostringstream ss;
3135 ss << h.h[i] << " ops are blocked > " << ub << " sec";
3136 if (ub > err_age) {
3137 error += h.h[i];
3138 error_detail.push_back(ss.str());
3139 } else {
3140 warn += h.h[i];
3141 warn_detail.push_back(ss.str());
3142 }
3143 }
3144 }
3145
3146 map<float,set<int>> warn_osd_by_max; // max -> osds
3147 map<float,set<int>> error_osd_by_max; // max -> osds
3148 if (!warn_detail.empty() || !error_detail.empty()) {
3149 for (auto& p : osd_stat) {
3150 const pow2_hist_t& h = p.second.op_queue_age_hist;
3151 for (unsigned i = h.h.size() - 1; i > 0; --i) {
3152 float ub = (float)(1 << i) / 1000.0;
3153 if (ub < cct->_conf->mon_osd_warn_op_age)
3154 break;
3155 if (h.h[i]) {
3156 if (ub > err_age) {
3157 error_osd_by_max[ub].insert(p.first);
3158 } else {
3159 warn_osd_by_max[ub].insert(p.first);
3160 }
3161 break;
3162 }
3163 }
3164 }
3165 }
3166
3167 if (!warn_detail.empty()) {
3168 ostringstream ss;
3169 ss << warn << " slow requests are blocked > "
3170 << cct->_conf->mon_osd_warn_op_age << " sec";
3171 auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str());
3172 d.detail.swap(warn_detail);
3173 int left = max;
3174 for (auto& p : warn_osd_by_max) {
3175 ostringstream ss;
3176 if (p.second.size() > 1) {
c07f9fc5
FG
3177 ss << "osds " << p.second
3178 << " have blocked requests > " << p.first << " sec";
224ce89b 3179 } else {
c07f9fc5
FG
3180 ss << "osd." << *p.second.begin()
3181 << " has blocked requests > " << p.first << " sec";
224ce89b 3182 }
224ce89b
WB
3183 d.detail.push_back(ss.str());
3184 if (--left == 0) {
3185 break;
3186 }
3187 }
3188 }
3189 if (!error_detail.empty()) {
3190 ostringstream ss;
3191 ss << warn << " stuck requests are blocked > "
3192 << err_age << " sec";
3193 auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str());
3194 d.detail.swap(error_detail);
3195 int left = max;
3196 for (auto& p : error_osd_by_max) {
3197 ostringstream ss;
3198 if (p.second.size() > 1) {
c07f9fc5
FG
3199 ss << "osds " << p.second
3200 << " have stuck requests > " << p.first << " sec";
224ce89b 3201 } else {
c07f9fc5
FG
3202 ss << "osd." << *p.second.begin()
3203 << " has stuck requests > " << p.first << " sec";
224ce89b 3204 }
224ce89b
WB
3205 d.detail.push_back(ss.str());
3206 if (--left == 0) {
3207 break;
3208 }
3209 }
3210 }
3211 }
7c673cae 3212
224ce89b
WB
3213 // PG_NOT_SCRUBBED
3214 // PG_NOT_DEEP_SCRUBBED
3215 {
c07f9fc5
FG
3216 if (cct->_conf->mon_warn_not_scrubbed ||
3217 cct->_conf->mon_warn_not_deep_scrubbed) {
3218 list<string> detail, deep_detail;
3219 const double age = cct->_conf->mon_warn_not_scrubbed +
3220 cct->_conf->mon_scrub_interval;
3221 utime_t cutoff = now;
3222 cutoff -= age;
3223 const double deep_age = cct->_conf->mon_warn_not_deep_scrubbed +
3224 cct->_conf->osd_deep_scrub_interval;
3225 utime_t deep_cutoff = now;
3226 deep_cutoff -= deep_age;
3227 for (auto& p : pg_stat) {
3228 if (cct->_conf->mon_warn_not_scrubbed &&
3229 p.second.last_scrub_stamp < cutoff) {
3230 ostringstream ss;
3231 ss << "pg " << p.first << " not scrubbed since "
3232 << p.second.last_scrub_stamp;
3233 detail.push_back(ss.str());
3234 }
3235 if (cct->_conf->mon_warn_not_deep_scrubbed &&
3236 p.second.last_deep_scrub_stamp < deep_cutoff) {
3237 ostringstream ss;
3238 ss << "pg " << p.first << " not deep-scrubbed since "
3239 << p.second.last_deep_scrub_stamp;
3240 deep_detail.push_back(ss.str());
3241 }
224ce89b 3242 }
c07f9fc5
FG
3243 if (!detail.empty()) {
3244 ostringstream ss;
3245 ss << detail.size() << " pgs not scrubbed for " << age;
3246 auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str());
3247 d.detail.swap(detail);
3248 }
3249 if (!deep_detail.empty()) {
3250 ostringstream ss;
3251 ss << deep_detail.size() << " pgs not deep-scrubbed for " << deep_age;
3252 auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str());
3253 d.detail.swap(deep_detail);
3254 }
3255 }
3256 }
3257
3258 // POOL_APP
d2e6a577 3259 if (g_conf->get_val<bool>("mon_warn_on_pool_no_app")) {
c07f9fc5
FG
3260 list<string> detail;
3261 for (auto &it : pools) {
3262 const pg_pool_t &pool = it.second;
3263 const string& pool_name = osdmap.get_pool_name(it.first);
3264 auto it2 = pg_pool_sum.find(it.first);
3265 if (it2 == pg_pool_sum.end()) {
3266 continue;
3267 }
3268 const pool_stat_t *pstat = &it2->second;
3269 if (pstat == nullptr) {
3270 continue;
3271 }
3272 const object_stat_sum_t& sum = pstat->stats.sum;
3273 // application metadata is not encoded until luminous is minimum
3274 // required release
3275 if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
3276 sum.num_objects > 0 && pool.application_metadata.empty() &&
3277 !pool.is_tier() && !g_conf->mon_debug_no_require_luminous) {
3278 stringstream ss;
3279 ss << "application not enabled on pool '" << pool_name << "'";
3280 detail.push_back(ss.str());
224ce89b
WB
3281 }
3282 }
3283 if (!detail.empty()) {
3284 ostringstream ss;
c07f9fc5
FG
3285 ss << "application not enabled on " << detail.size() << " pool(s)";
3286 auto& d = checks->add("POOL_APP_NOT_ENABLED", HEALTH_WARN, ss.str());
3287 stringstream tip;
3288 tip << "use 'ceph osd pool application enable <pool-name> "
3289 << "<app-name>', where <app-name> is 'cephfs', 'rbd', 'rgw', "
3290 << "or freeform for custom applications.";
3291 detail.push_back(tip.str());
224ce89b
WB
3292 d.detail.swap(detail);
3293 }
31f18b77
FG
3294 }
3295}
7c673cae 3296
31f18b77
FG
3297void PGMap::get_health(
3298 CephContext *cct,
3299 const OSDMap& osdmap,
3300 list<pair<health_status_t,string> >& summary,
3301 list<pair<health_status_t,string> > *detail) const
3302{
3303 map<string,int> note;
3304 auto p = num_pg_by_state.begin();
3305 auto p_end = num_pg_by_state.end();
3306 for (; p != p_end; ++p) {
3307 if (p->first & PG_STATE_STALE)
3308 note["stale"] += p->second;
3309 if (p->first & PG_STATE_DOWN)
3310 note["down"] += p->second;
3311 if (p->first & PG_STATE_UNDERSIZED)
3312 note["undersized"] += p->second;
3313 if (p->first & PG_STATE_DEGRADED)
3314 note["degraded"] += p->second;
3315 if (p->first & PG_STATE_INCONSISTENT)
3316 note["inconsistent"] += p->second;
3317 if (p->first & PG_STATE_PEERING)
3318 note["peering"] += p->second;
3319 if (p->first & PG_STATE_REPAIR)
3320 note["repair"] += p->second;
3321 if (p->first & PG_STATE_RECOVERING)
3322 note["recovering"] += p->second;
3323 if (p->first & PG_STATE_RECOVERY_WAIT)
3324 note["recovery_wait"] += p->second;
3325 if (p->first & PG_STATE_INCOMPLETE)
3326 note["incomplete"] += p->second;
3327 if (p->first & PG_STATE_BACKFILL_WAIT)
3328 note["backfill_wait"] += p->second;
3329 if (p->first & PG_STATE_BACKFILL)
3330 note["backfilling"] += p->second;
3331 if (p->first & PG_STATE_BACKFILL_TOOFULL)
3332 note["backfill_toofull"] += p->second;
3333 if (p->first & PG_STATE_RECOVERY_TOOFULL)
3334 note["recovery_toofull"] += p->second;
224ce89b
WB
3335 if (p->first & PG_STATE_SNAPTRIM_ERROR)
3336 note["snaptrim_error"] += p->second;
31f18b77
FG
3337 }
3338
3339 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pgs;
3340 utime_t now(ceph_clock_now());
3341 utime_t cutoff = now - utime_t(cct->_conf->mon_pg_stuck_threshold, 0);
3342 uint64_t num_inactive_pgs = 0;
3343
3344 if (detail) {
3345 // we need to collect details of stuck pgs, first do a quick check
3346 // whether this will yield any results
3347 if (get_stuck_counts(cutoff, note)) {
3348
3349 // there are stuck pgs. gather details for specified statuses
3350 // only if we know that there are pgs stuck in that status
3351
3352 if (note.find("stuck inactive") != note.end()) {
3353 get_stuck_stats(PGMap::STUCK_INACTIVE, cutoff, stuck_pgs);
3354 note["stuck inactive"] = stuck_pgs.size();
3355 num_inactive_pgs += stuck_pgs.size();
3356 note_stuck_detail(PGMap::STUCK_INACTIVE, stuck_pgs,
3357 cct->_conf->mon_health_max_detail, detail);
3358 stuck_pgs.clear();
7c673cae
FG
3359 }
3360
31f18b77
FG
3361 if (note.find("stuck unclean") != note.end()) {
3362 get_stuck_stats(PGMap::STUCK_UNCLEAN, cutoff, stuck_pgs);
3363 note["stuck unclean"] = stuck_pgs.size();
3364 note_stuck_detail(PGMap::STUCK_UNCLEAN, stuck_pgs,
3365 cct->_conf->mon_health_max_detail, detail);
3366 stuck_pgs.clear();
3367 }
7c673cae 3368
31f18b77
FG
3369 if (note.find("stuck undersized") != note.end()) {
3370 get_stuck_stats(PGMap::STUCK_UNDERSIZED, cutoff, stuck_pgs);
3371 note["stuck undersized"] = stuck_pgs.size();
3372 note_stuck_detail(PGMap::STUCK_UNDERSIZED, stuck_pgs,
3373 cct->_conf->mon_health_max_detail, detail);
3374 stuck_pgs.clear();
3375 }
3376
3377 if (note.find("stuck degraded") != note.end()) {
3378 get_stuck_stats(PGMap::STUCK_DEGRADED, cutoff, stuck_pgs);
3379 note["stuck degraded"] = stuck_pgs.size();
3380 note_stuck_detail(PGMap::STUCK_DEGRADED, stuck_pgs,
3381 cct->_conf->mon_health_max_detail, detail);
3382 stuck_pgs.clear();
3383 }
3384
3385 if (note.find("stuck stale") != note.end()) {
3386 get_stuck_stats(PGMap::STUCK_STALE, cutoff, stuck_pgs);
3387 note["stuck stale"] = stuck_pgs.size();
3388 num_inactive_pgs += stuck_pgs.size();
3389 note_stuck_detail(PGMap::STUCK_STALE, stuck_pgs,
3390 cct->_conf->mon_health_max_detail, detail);
3391 }
3392 }
3393 } else {
3394 get_stuck_counts(cutoff, note);
3395 auto p = note.find("stuck inactive");
3396 if (p != note.end())
3397 num_inactive_pgs += p->second;
3398 p = note.find("stuck stale");
3399 if (p != note.end())
3400 num_inactive_pgs += p->second;
7c673cae 3401 }
31f18b77
FG
3402
3403 if (cct->_conf->mon_pg_min_inactive > 0 &&
3404 num_inactive_pgs >= cct->_conf->mon_pg_min_inactive) {
3405 ostringstream ss;
3406 ss << num_inactive_pgs << " pgs are stuck inactive for more than " << cct->_conf->mon_pg_stuck_threshold << " seconds";
3407 summary.push_back(make_pair(HEALTH_ERR, ss.str()));
7c673cae 3408 }
7c673cae 3409
31f18b77
FG
3410 if (!note.empty()) {
3411 for (auto p = note.begin(); p != note.end(); ++p) {
3412 ostringstream ss;
3413 ss << p->second << " pgs " << p->first;
3414 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3415 }
3416 if (detail) {
3417 int n = 0, more = 0;
3418 int max = cct->_conf->mon_health_max_detail;
3419 for (auto p = pg_stat.begin();
3420 p != pg_stat.end();
3421 ++p) {
3422 if ((p->second.state & (PG_STATE_STALE |
3423 PG_STATE_DOWN |
3424 PG_STATE_UNDERSIZED |
3425 PG_STATE_DEGRADED |
3426 PG_STATE_INCONSISTENT |
3427 PG_STATE_PEERING |
3428 PG_STATE_REPAIR |
3429 PG_STATE_RECOVERING |
3430 PG_STATE_RECOVERY_WAIT |
3431 PG_STATE_RECOVERY_TOOFULL |
3432 PG_STATE_INCOMPLETE |
3433 PG_STATE_BACKFILL_WAIT |
3434 PG_STATE_BACKFILL |
3435 PG_STATE_BACKFILL_TOOFULL)) &&
3436 stuck_pgs.count(p->first) == 0) {
3437 if (max > 0) {
3438 --max;
3439 } else {
3440 ++more;
3441 continue;
3442 }
3443 ++n;
3444 ostringstream ss;
3445 ss << "pg " << p->first << " is " << pg_state_string(p->second.state);
3446 ss << ", acting " << p->second.acting;
3447 if (p->second.stats.sum.num_objects_unfound)
3448 ss << ", " << p->second.stats.sum.num_objects_unfound << " unfound";
3449 if (p->second.state & PG_STATE_INCOMPLETE) {
3450 const pg_pool_t *pi = osdmap.get_pg_pool(p->first.pool());
3451 if (pi && pi->min_size > 1) {
3452 ss << " (reducing pool " << osdmap.get_pool_name(p->first.pool())
3453 << " min_size from " << (int)pi->min_size
3454 << " may help; search ceph.com/docs for 'incomplete')";
3455 }
3456 }
3457 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3458 }
3459 }
3460 if (more) {
3461 ostringstream ss;
3462 ss << more << " more pgs are also unhealthy";
3463 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3464 }
7c673cae 3465 }
31f18b77
FG
3466 }
3467
3468 // slow requests
3469 if (cct->_conf->mon_osd_warn_op_age > 0 &&
c07f9fc5
FG
3470 osd_sum.op_queue_age_hist.upper_bound() / 1000.0 >
3471 cct->_conf->mon_osd_warn_op_age) {
31f18b77
FG
3472 auto sum = _warn_slow_request_histogram(
3473 cct, osd_sum.op_queue_age_hist, "", summary, NULL);
3474 if (sum.first > 0 || sum.second > 0) {
3475 if (sum.first > 0) {
3476 ostringstream ss;
3477 ss << sum.first << " requests are blocked > "
3478 << cct->_conf->mon_osd_warn_op_age
3479 << " sec";
3480 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3481 }
3482 if (sum.second > 0) {
3483 ostringstream ss;
c07f9fc5 3484 ss << sum.second << " requests are blocked > "
31f18b77
FG
3485 << (cct->_conf->mon_osd_warn_op_age *
3486 cct->_conf->mon_osd_err_op_age_ratio)
3487 << " sec";
3488 summary.push_back(make_pair(HEALTH_ERR, ss.str()));
3489 }
3490
3491 if (detail) {
3492 unsigned num_warn = 0, num_err = 0;
3493 // do per-osd warnings
3494 for (auto p = osd_stat.begin();
3495 p != osd_stat.end();
3496 ++p) {
3497 auto sum = _warn_slow_request_histogram(
3498 cct,
3499 p->second.op_queue_age_hist,
3500 string(" on osd.") + stringify(p->first),
3501 summary, detail);
3502 if (sum.second)
3503 ++num_err;
3504 else if (sum.first)
3505 ++num_warn;
3506 }
3507 if (num_err) {
3508 ostringstream ss2;
3509 ss2 << num_err << " osds have very slow requests";
3510 summary.push_back(make_pair(HEALTH_ERR, ss2.str()));
3511 detail->push_back(make_pair(HEALTH_ERR, ss2.str()));
3512 }
3513 if (num_warn) {
3514 ostringstream ss2;
3515 ss2 << num_warn << " osds have slow requests";
3516 summary.push_back(make_pair(HEALTH_WARN, ss2.str()));
3517 detail->push_back(make_pair(HEALTH_WARN, ss2.str()));
3518 }
3519 }
7c673cae 3520 }
31f18b77
FG
3521 }
3522
31f18b77
FG
3523 // recovery
3524 list<string> sl;
3525 overall_recovery_summary(NULL, &sl);
3526 for (auto p = sl.begin(); p != sl.end(); ++p) {
3527 summary.push_back(make_pair(HEALTH_WARN, "recovery " + *p));
3528 if (detail)
3529 detail->push_back(make_pair(HEALTH_WARN, "recovery " + *p));
3530 }
3531
3532 // near-target max pools
3533 auto& pools = osdmap.get_pools();
3534 for (auto p = pools.begin();
3535 p != pools.end(); ++p) {
3536 if ((!p->second.target_max_objects && !p->second.target_max_bytes) ||
3537 !pg_pool_sum.count(p->first))
3538 continue;
3539 bool nearfull = false;
3540 const string& name = osdmap.get_pool_name(p->first);
3541 const pool_stat_t& st = get_pg_pool_sum_stat(p->first);
3542 uint64_t ratio = p->second.cache_target_full_ratio_micro +
3543 ((1000000 - p->second.cache_target_full_ratio_micro) *
3544 cct->_conf->mon_cache_target_full_warn_ratio);
3545 if (p->second.target_max_objects &&
3546 (uint64_t)(st.stats.sum.num_objects -
3547 st.stats.sum.num_objects_hit_set_archive) >
3548 p->second.target_max_objects * (ratio / 1000000.0)) {
3549 nearfull = true;
3550 if (detail) {
3551 ostringstream ss;
3552 ss << "cache pool '" << name << "' with "
3553 << si_t(st.stats.sum.num_objects)
3554 << " objects at/near target max "
3555 << si_t(p->second.target_max_objects) << " objects";
3556 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3557 }
3558 }
3559 if (p->second.target_max_bytes &&
3560 (uint64_t)(st.stats.sum.num_bytes -
3561 st.stats.sum.num_bytes_hit_set_archive) >
3562 p->second.target_max_bytes * (ratio / 1000000.0)) {
3563 nearfull = true;
3564 if (detail) {
3565 ostringstream ss;
3566 ss << "cache pool '" << name
3567 << "' with " << si_t(st.stats.sum.num_bytes)
3568 << "B at/near target max "
3569 << si_t(p->second.target_max_bytes) << "B";
3570 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3571 }
3572 }
3573 if (nearfull) {
3574 ostringstream ss;
3575 ss << "'" << name << "' at/near target max";
3576 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3577 }
3578 }
7c673cae 3579
31f18b77
FG
3580 // scrub
3581 if (pg_sum.stats.sum.num_scrub_errors) {
3582 ostringstream ss;
3583 ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors";
3584 summary.push_back(make_pair(HEALTH_ERR, ss.str()));
3585 if (detail) {
3586 detail->push_back(make_pair(HEALTH_ERR, ss.str()));
7c673cae 3587 }
31f18b77
FG
3588 }
3589
3590 // pg skew
3591 int num_in = osdmap.get_num_in_osds();
3592 int sum_pg_up = MAX(pg_sum.up, static_cast<int32_t>(pg_stat.size()));
224ce89b
WB
3593 int sum_objects = pg_sum.stats.sum.num_objects;
3594 if (sum_objects < cct->_conf->mon_pg_warn_min_objects) {
3595 return;
3596 }
31f18b77
FG
3597 if (num_in && cct->_conf->mon_pg_warn_min_per_osd > 0) {
3598 int per = sum_pg_up / num_in;
3599 if (per < cct->_conf->mon_pg_warn_min_per_osd && per) {
3600 ostringstream ss;
3601 ss << "too few PGs per OSD (" << per << " < min " << cct->_conf->mon_pg_warn_min_per_osd << ")";
3602 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3603 if (detail)
3604 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
7c673cae 3605 }
31f18b77
FG
3606 }
3607 if (num_in && cct->_conf->mon_pg_warn_max_per_osd > 0) {
3608 int per = sum_pg_up / num_in;
3609 if (per > cct->_conf->mon_pg_warn_max_per_osd) {
3610 ostringstream ss;
3611 ss << "too many PGs per OSD (" << per << " > max " << cct->_conf->mon_pg_warn_max_per_osd << ")";
3612 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3613 if (detail)
3614 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3615 }
3616 }
3617 if (!pg_stat.empty()) {
3618 for (auto p = pg_pool_sum.begin();
3619 p != pg_pool_sum.end();
3620 ++p) {
3621 const pg_pool_t *pi = osdmap.get_pg_pool(p->first);
3622 if (!pi)
3623 continue; // in case osdmap changes haven't propagated to PGMap yet
3624 const string& name = osdmap.get_pool_name(p->first);
3625 if (pi->get_pg_num() > pi->get_pgp_num() &&
3626 !(name.find(".DELETED") != string::npos &&
3627 cct->_conf->mon_fake_pool_delete)) {
3628 ostringstream ss;
3629 ss << "pool " << name << " pg_num "
3630 << pi->get_pg_num() << " > pgp_num " << pi->get_pgp_num();
3631 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3632 if (detail)
3633 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3634 }
3635 int average_objects_per_pg = pg_sum.stats.sum.num_objects / pg_stat.size();
3636 if (average_objects_per_pg > 0 &&
3637 pg_sum.stats.sum.num_objects >= cct->_conf->mon_pg_warn_min_objects &&
3638 p->second.stats.sum.num_objects >= cct->_conf->mon_pg_warn_min_pool_objects) {
3639 int objects_per_pg = p->second.stats.sum.num_objects / pi->get_pg_num();
3640 float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
3641 if (cct->_conf->mon_pg_warn_max_object_skew > 0 &&
3642 ratio > cct->_conf->mon_pg_warn_max_object_skew) {
3643 ostringstream ss;
3644 ss << "pool " << name << " has many more objects per pg than average (too few pgs?)";
3645 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3646 if (detail) {
3647 ostringstream ss;
3648 ss << "pool " << name << " objects per pg ("
3649 << objects_per_pg << ") is more than " << ratio << " times cluster average ("
3650 << average_objects_per_pg << ")";
3651 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3652 }
3653 }
3654 }
7c673cae
FG
3655 }
3656 }
7c673cae 3657
224ce89b
WB
3658 for (auto it : pools) {
3659 auto it2 = pg_pool_sum.find(it.first);
3660 if (it2 == pg_pool_sum.end()) {
3661 continue;
3662 }
3663 const pool_stat_t *pstat = &it2->second;
3664 const object_stat_sum_t& sum = pstat->stats.sum;
3665 const string& pool_name = osdmap.get_pool_name(it.first);
3666 const pg_pool_t &pool = it.second;
3667
3668 float warn_threshold = (float)g_conf->mon_pool_quota_warn_threshold/100;
3669 float crit_threshold = (float)g_conf->mon_pool_quota_crit_threshold/100;
3670
3671 if (pool.quota_max_objects > 0) {
3672 stringstream ss;
3673 health_status_t status = HEALTH_OK;
3674 if ((uint64_t)sum.num_objects >= pool.quota_max_objects) {
3675 } else if (crit_threshold > 0 &&
3676 sum.num_objects >= pool.quota_max_objects*crit_threshold) {
3677 ss << "pool '" << pool_name
3678 << "' has " << sum.num_objects << " objects"
3679 << " (max " << pool.quota_max_objects << ")";
3680 status = HEALTH_ERR;
3681 } else if (warn_threshold > 0 &&
3682 sum.num_objects >= pool.quota_max_objects*warn_threshold) {
3683 ss << "pool '" << pool_name
3684 << "' has " << sum.num_objects << " objects"
3685 << " (max " << pool.quota_max_objects << ")";
3686 status = HEALTH_WARN;
3687 }
3688 if (status != HEALTH_OK) {
3689 pair<health_status_t,string> s(status, ss.str());
3690 summary.push_back(s);
3691 if (detail)
3692 detail->push_back(s);
3693 }
3694 }
3695
3696 if (pool.quota_max_bytes > 0) {
3697 health_status_t status = HEALTH_OK;
3698 stringstream ss;
3699 if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
3700 } else if (crit_threshold > 0 &&
3701 sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
3702 ss << "pool '" << pool_name
3703 << "' has " << si_t(sum.num_bytes) << " bytes"
3704 << " (max " << si_t(pool.quota_max_bytes) << ")";
3705 status = HEALTH_ERR;
3706 } else if (warn_threshold > 0 &&
3707 sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
3708 ss << "pool '" << pool_name
3709 << "' has " << si_t(sum.num_bytes) << " bytes"
3710 << " (max " << si_t(pool.quota_max_bytes) << ")";
3711 status = HEALTH_WARN;
3712 }
3713 if (status != HEALTH_OK) {
3714 pair<health_status_t,string> s(status, ss.str());
3715 summary.push_back(s);
3716 if (detail)
3717 detail->push_back(s);
3718 }
3719 }
3720 }
3721
31f18b77
FG
3722 print_unscrubbed_pgs(pg_stat, summary, detail, cct);
3723}
7c673cae
FG
3724
3725int process_pg_map_command(
3726 const string& orig_prefix,
3727 const map<string,cmd_vartype>& orig_cmdmap,
3728 const PGMap& pg_map,
3729 const OSDMap& osdmap,
3730 Formatter *f,
3731 stringstream *ss,
3732 bufferlist *odata)
3733{
3734 string prefix = orig_prefix;
3735 map<string,cmd_vartype> cmdmap = orig_cmdmap;
3736
3737 // perhaps these would be better in the parsing, but it's weird
3738 bool primary = false;
3739 if (prefix == "pg dump_json") {
3740 vector<string> v;
3741 v.push_back(string("all"));
3742 cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
3743 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3744 prefix = "pg dump";
3745 } else if (prefix == "pg dump_pools_json") {
3746 vector<string> v;
3747 v.push_back(string("pools"));
3748 cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
3749 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3750 prefix = "pg dump";
3751 } else if (prefix == "pg ls-by-primary") {
3752 primary = true;
3753 prefix = "pg ls";
3754 } else if (prefix == "pg ls-by-osd") {
3755 prefix = "pg ls";
3756 } else if (prefix == "pg ls-by-pool") {
3757 prefix = "pg ls";
3758 string poolstr;
3759 cmd_getval(g_ceph_context, cmdmap, "poolstr", poolstr);
3760 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
3761 if (pool < 0) {
3762 *ss << "pool " << poolstr << " does not exist";
3763 return -ENOENT;
3764 }
3765 cmd_putval(g_ceph_context, cmdmap, "pool", pool);
3766 }
3767
3768 int r = 0;
3769 stringstream ds;
3770 if (prefix == "pg stat") {
3771 if (f) {
3772 f->open_object_section("pg_summary");
3773 pg_map.print_oneline_summary(f, NULL);
3774 f->close_section();
3775 f->flush(ds);
3776 } else {
3777 ds << pg_map;
3778 }
3779 odata->append(ds);
3780 return 0;
3781 }
3782
3783 if (prefix == "pg getmap") {
3784 pg_map.encode(*odata);
3785 *ss << "got pgmap version " << pg_map.version;
3786 return 0;
3787 }
3788
3789 if (prefix == "pg dump") {
3790 string val;
3791 vector<string> dumpcontents;
3792 set<string> what;
3793 if (cmd_getval(g_ceph_context, cmdmap, "dumpcontents", dumpcontents)) {
3794 copy(dumpcontents.begin(), dumpcontents.end(),
3795 inserter(what, what.end()));
3796 }
3797 if (what.empty())
3798 what.insert("all");
3799 if (f) {
3800 if (what.count("all")) {
3801 f->open_object_section("pg_map");
3802 pg_map.dump(f);
3803 f->close_section();
3804 } else if (what.count("summary") || what.count("sum")) {
3805 f->open_object_section("pg_map");
3806 pg_map.dump_basic(f);
3807 f->close_section();
3808 } else {
3809 if (what.count("pools")) {
3810 pg_map.dump_pool_stats(f);
3811 }
3812 if (what.count("osds")) {
3813 pg_map.dump_osd_stats(f);
3814 }
3815 if (what.count("pgs")) {
3816 pg_map.dump_pg_stats(f, false);
3817 }
3818 if (what.count("pgs_brief")) {
3819 pg_map.dump_pg_stats(f, true);
3820 }
3821 if (what.count("delta")) {
3822 f->open_object_section("delta");
3823 pg_map.dump_delta(f);
3824 f->close_section();
3825 }
3826 }
3827 f->flush(*odata);
3828 } else {
3829 if (what.count("all")) {
3830 pg_map.dump(ds);
3831 } else if (what.count("summary") || what.count("sum")) {
3832 pg_map.dump_basic(ds);
3833 pg_map.dump_pg_sum_stats(ds, true);
3834 pg_map.dump_osd_sum_stats(ds);
3835 } else {
3836 if (what.count("pgs_brief")) {
3837 pg_map.dump_pg_stats(ds, true);
3838 }
3839 bool header = true;
3840 if (what.count("pgs")) {
3841 pg_map.dump_pg_stats(ds, false);
3842 header = false;
3843 }
3844 if (what.count("pools")) {
3845 pg_map.dump_pool_stats(ds, header);
3846 }
3847 if (what.count("osds")) {
3848 pg_map.dump_osd_stats(ds);
3849 }
3850 }
3851 odata->append(ds);
3852 }
3853 *ss << "dumped " << what;
3854 return 0;
3855 }
3856
3857 if (prefix == "pg ls") {
3858 int64_t osd = -1;
3859 int64_t pool = -1;
3860 vector<string>states;
3861 set<pg_t> pgs;
3862 cmd_getval(g_ceph_context, cmdmap, "pool", pool);
3863 cmd_getval(g_ceph_context, cmdmap, "osd", osd);
3864 cmd_getval(g_ceph_context, cmdmap, "states", states);
3865 if (pool >= 0 && !osdmap.have_pg_pool(pool)) {
3866 *ss << "pool " << pool << " does not exist";
3867 return -ENOENT;
3868 }
3869 if (osd >= 0 && !osdmap.is_up(osd)) {
3870 *ss << "osd " << osd << " is not up";
3871 return -EAGAIN;
3872 }
3873 if (states.empty())
3874 states.push_back("all");
3875
3876 uint32_t state = 0;
3877
3878 while (!states.empty()) {
3879 string state_str = states.back();
3880
3881 if (state_str == "all") {
3882 state = -1;
3883 break;
3884 } else {
3885 int filter = pg_string_state(state_str);
c07f9fc5
FG
3886 if (filter < 0) {
3887 *ss << "'" << state_str << "' is not a valid pg state,"
3888 << " available choices: " << pg_state_string(0xFFFFFFFF);
3889 return -EINVAL;
3890 }
7c673cae
FG
3891 state |= filter;
3892 }
3893
3894 states.pop_back();
3895 }
3896
3897 pg_map.get_filtered_pg_stats(state, pool, osd, primary, pgs);
3898
3899 if (f && !pgs.empty()) {
3900 pg_map.dump_filtered_pg_stats(f, pgs);
3901 f->flush(*odata);
3902 } else if (!pgs.empty()) {
3903 pg_map.dump_filtered_pg_stats(ds, pgs);
3904 odata->append(ds);
3905 }
3906 return 0;
3907 }
3908
3909 if (prefix == "pg dump_stuck") {
3910 vector<string> stuckop_vec;
3911 cmd_getval(g_ceph_context, cmdmap, "stuckops", stuckop_vec);
3912 if (stuckop_vec.empty())
3913 stuckop_vec.push_back("unclean");
3914 int64_t threshold;
3915 cmd_getval(g_ceph_context, cmdmap, "threshold", threshold,
3916 int64_t(g_conf->mon_pg_stuck_threshold));
3917
3918 r = pg_map.dump_stuck_pg_stats(ds, f, (int)threshold, stuckop_vec);
3919 odata->append(ds);
3920 if (r < 0)
3921 *ss << "failed";
3922 else
3923 *ss << "ok";
3924 return 0;
3925 }
3926
3927 if (prefix == "pg debug") {
3928 string debugop;
3929 cmd_getval(g_ceph_context, cmdmap, "debugop", debugop,
3930 string("unfound_objects_exist"));
3931 if (debugop == "unfound_objects_exist") {
3932 bool unfound_objects_exist = false;
3933 for (const auto& p : pg_map.pg_stat) {
3934 if (p.second.stats.sum.num_objects_unfound > 0) {
3935 unfound_objects_exist = true;
3936 break;
3937 }
3938 }
3939 if (unfound_objects_exist)
3940 ds << "TRUE";
3941 else
3942 ds << "FALSE";
3943 odata->append(ds);
3944 return 0;
3945 }
3946 if (debugop == "degraded_pgs_exist") {
3947 bool degraded_pgs_exist = false;
3948 for (const auto& p : pg_map.pg_stat) {
3949 if (p.second.stats.sum.num_objects_degraded > 0) {
3950 degraded_pgs_exist = true;
3951 break;
3952 }
3953 }
3954 if (degraded_pgs_exist)
3955 ds << "TRUE";
3956 else
3957 ds << "FALSE";
3958 odata->append(ds);
3959 return 0;
3960 }
3961 }
3962
3963 if (prefix == "osd perf") {
3964 if (f) {
3965 f->open_object_section("osdstats");
3966 pg_map.dump_osd_perf_stats(f);
3967 f->close_section();
3968 f->flush(ds);
3969 } else {
3970 pg_map.print_osd_perf_stats(&ds);
3971 }
3972 odata->append(ds);
3973 return 0;
3974 }
3975
3976 if (prefix == "osd blocked-by") {
3977 if (f) {
3978 f->open_object_section("osd_blocked_by");
3979 pg_map.dump_osd_blocked_by_stats(f);
3980 f->close_section();
3981 f->flush(ds);
3982 } else {
3983 pg_map.print_osd_blocked_by_stats(&ds);
3984 }
3985 odata->append(ds);
3986 return 0;
3987 }
3988
3989 if (prefix == "osd pool stats") {
3990 string pool_name;
3991 cmd_getval(g_ceph_context, cmdmap, "name", pool_name);
3992
3993 int64_t poolid = -ENOENT;
3994 bool one_pool = false;
3995 if (!pool_name.empty()) {
3996 poolid = osdmap.lookup_pg_pool_name(pool_name);
3997 if (poolid < 0) {
3998 assert(poolid == -ENOENT);
3999 *ss << "unrecognized pool '" << pool_name << "'";
4000 return -ENOENT;
4001 }
4002 one_pool = true;
4003 }
4004
4005 stringstream rs;
4006
4007 if (f)
4008 f->open_array_section("pool_stats");
4009 else {
4010 if (osdmap.get_pools().empty()) {
4011 *ss << "there are no pools!";
4012 goto stats_out;
4013 }
4014 }
4015
4016 for (auto& p : osdmap.get_pools()) {
4017 if (!one_pool)
4018 poolid = p.first;
4019
4020 pool_name = osdmap.get_pool_name(poolid);
4021
4022 if (f) {
4023 f->open_object_section("pool");
4024 f->dump_string("pool_name", pool_name.c_str());
4025 f->dump_int("pool_id", poolid);
4026 f->open_object_section("recovery");
4027 }
4028
4029 list<string> sl;
4030 stringstream tss;
4031 pg_map.pool_recovery_summary(f, &sl, poolid);
4032 if (!f && !sl.empty()) {
4033 for (auto& p : sl)
4034 tss << " " << p << "\n";
4035 }
4036
4037 if (f) {
4038 f->close_section();
4039 f->open_object_section("recovery_rate");
4040 }
4041
4042 ostringstream rss;
4043 pg_map.pool_recovery_rate_summary(f, &rss, poolid);
4044 if (!f && !rss.str().empty())
4045 tss << " recovery io " << rss.str() << "\n";
4046
4047 if (f) {
4048 f->close_section();
4049 f->open_object_section("client_io_rate");
4050 }
4051 rss.clear();
4052 rss.str("");
4053
4054 pg_map.pool_client_io_rate_summary(f, &rss, poolid);
4055 if (!f && !rss.str().empty())
4056 tss << " client io " << rss.str() << "\n";
4057
4058 // dump cache tier IO rate for cache pool
4059 const pg_pool_t *pool = osdmap.get_pg_pool(poolid);
4060 if (pool->is_tier()) {
4061 if (f) {
4062 f->close_section();
4063 f->open_object_section("cache_io_rate");
4064 }
4065 rss.clear();
4066 rss.str("");
4067
4068 pg_map.pool_cache_io_rate_summary(f, &rss, poolid);
4069 if (!f && !rss.str().empty())
4070 tss << " cache tier io " << rss.str() << "\n";
4071 }
4072 if (f) {
4073 f->close_section();
4074 f->close_section();
4075 } else {
4076 rs << "pool " << pool_name << " id " << poolid << "\n";
4077 if (!tss.str().empty())
4078 rs << tss.str() << "\n";
4079 else
4080 rs << " nothing is going on\n\n";
4081 }
4082 if (one_pool)
4083 break;
4084 }
4085
4086stats_out:
4087 if (f) {
4088 f->close_section();
4089 f->flush(ds);
4090 odata->append(ds);
4091 } else {
4092 odata->append(rs.str());
4093 }
4094 return 0;
4095 }
4096
4097 return -EOPNOTSUPP;
4098}
4099
4100void PGMapUpdater::check_osd_map(const OSDMap::Incremental &osd_inc,
4101 std::set<int> *need_check_down_pg_osds,
4102 std::map<int,utime_t> *last_osd_report,
4103 PGMap *pg_map,
4104 PGMap::Incremental *pending_inc)
4105{
4106 for (const auto &p : osd_inc.new_weight) {
4107 if (p.second == CEPH_OSD_OUT) {
4108 dout(10) << __func__ << " osd." << p.first << " went OUT" << dendl;
31f18b77
FG
4109 auto j = pg_map->osd_epochs.find(p.first);
4110 if (j != pg_map->osd_epochs.end())
4111 pending_inc->stat_osd_out(p.first, j->second);
7c673cae
FG
4112 }
4113 }
4114
4115 // this is conservative: we want to know if any osds (maybe) got marked down.
4116 for (const auto &p : osd_inc.new_state) {
4117 if (p.second & CEPH_OSD_UP) { // true if marked up OR down,
4118 // but we're too lazy to check
4119 // which
4120 need_check_down_pg_osds->insert(p.first);
4121
4122 // clear out the last_osd_report for this OSD
31f18b77 4123 auto report = last_osd_report->find(p.first);
7c673cae
FG
4124 if (report != last_osd_report->end()) {
4125 last_osd_report->erase(report);
4126 }
4127
4128 // clear out osd_stat slow request histogram
4129 dout(20) << __func__ << " clearing osd." << p.first
4130 << " request histogram" << dendl;
31f18b77 4131 pending_inc->stat_osd_down_up(p.first, osd_inc.epoch, *pg_map);
7c673cae
FG
4132 }
4133
4134 if (p.second & CEPH_OSD_EXISTS) {
4135 // whether it was created *or* destroyed, we can safely drop
4136 // it's osd_stat_t record.
4137 dout(10) << __func__ << " osd." << p.first
4138 << " created or destroyed" << dendl;
4139 pending_inc->rm_stat(p.first);
4140
4141 // and adjust full, nearfull set
4142 pg_map->nearfull_osds.erase(p.first);
4143 pg_map->full_osds.erase(p.first);
4144 }
4145 }
4146}
4147
31f18b77
FG
4148void PGMapUpdater::check_osd_map(
4149 CephContext *cct,
4150 const OSDMap& osdmap,
4151 const PGMap& pgmap,
4152 PGMap::Incremental *pending_inc)
4153{
4154 for (auto& p : pgmap.osd_stat) {
4155 if (!osdmap.exists(p.first)) {
4156 // remove osd_stat
4157 pending_inc->rm_stat(p.first);
4158 } else if (osdmap.is_out(p.first)) {
4159 // zero osd_stat
4160 if (p.second.kb != 0) {
4161 auto j = pgmap.osd_epochs.find(p.first);
4162 if (j != pgmap.osd_epochs.end()) {
4163 pending_inc->stat_osd_out(p.first, j->second);
4164 }
4165 }
4166 } else if (!osdmap.is_up(p.first)) {
4167 // zero the op_queue_age_hist
4168 if (!p.second.op_queue_age_hist.empty()) {
4169 pending_inc->stat_osd_down_up(p.first, osdmap.get_epoch(), pgmap);
4170 }
4171 }
4172 }
4173
4174 // deleted pgs (pools)?
4175 for (auto& p : pgmap.pg_pool_sum) {
4176 if (!osdmap.have_pg_pool(p.first)) {
4177 ldout(cct, 10) << __func__ << " pool " << p.first << " gone, removing pgs"
4178 << dendl;
4179 for (auto& q : pgmap.pg_stat) {
4180 if (q.first.pool() == (uint64_t)p.first) {
4181 pending_inc->pg_remove.insert(q.first);
4182 }
4183 }
4184 auto q = pending_inc->pg_stat_updates.begin();
4185 while (q != pending_inc->pg_stat_updates.end()) {
4186 if (q->first.pool() == (uint64_t)p.first) {
4187 q = pending_inc->pg_stat_updates.erase(q);
4188 } else {
4189 ++q;
4190 }
4191 }
4192 }
4193 }
4194
4195 // new pgs (split or new pool)?
4196 for (auto& p : osdmap.get_pools()) {
4197 int64_t poolid = p.first;
4198 const pg_pool_t& pi = p.second;
4199 auto q = pgmap.num_pg_by_pool.find(poolid);
4200 unsigned my_pg_num = 0;
4201 if (q != pgmap.num_pg_by_pool.end())
4202 my_pg_num = q->second;
4203 unsigned pg_num = pi.get_pg_num();
4204 if (my_pg_num != pg_num) {
224ce89b
WB
4205 ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
4206 << " != my pg_num " << my_pg_num << dendl;
31f18b77
FG
4207 for (unsigned ps = my_pg_num; ps < pg_num; ++ps) {
4208 pg_t pgid(ps, poolid);
4209 if (pending_inc->pg_stat_updates.count(pgid) == 0) {
224ce89b 4210 ldout(cct,20) << __func__ << " adding " << pgid << dendl;
31f18b77
FG
4211 pg_stat_t &stats = pending_inc->pg_stat_updates[pgid];
4212 stats.last_fresh = osdmap.get_modified();
4213 stats.last_active = osdmap.get_modified();
4214 stats.last_change = osdmap.get_modified();
4215 stats.last_peered = osdmap.get_modified();
4216 stats.last_clean = osdmap.get_modified();
4217 stats.last_unstale = osdmap.get_modified();
4218 stats.last_undegraded = osdmap.get_modified();
4219 stats.last_fullsized = osdmap.get_modified();
4220 stats.last_scrub_stamp = osdmap.get_modified();
4221 stats.last_deep_scrub_stamp = osdmap.get_modified();
4222 stats.last_clean_scrub_stamp = osdmap.get_modified();
4223 }
4224 }
4225 }
4226 }
4227}
4228
7c673cae
FG
4229void PGMapUpdater::register_pg(
4230 const OSDMap &osd_map,
4231 pg_t pgid, epoch_t epoch,
4232 bool new_pool,
4233 const PGMap &pg_map,
4234 PGMap::Incremental *pending_inc)
4235{
4236 pg_t parent;
4237 int split_bits = 0;
4238 auto parent_stat = pg_map.pg_stat.end();
4239 if (!new_pool) {
4240 parent = pgid;
4241 while (1) {
4242 // remove most significant bit
4243 int msb = cbits(parent.ps());
4244 if (!msb)
4245 break;
4246 parent.set_ps(parent.ps() & ~(1<<(msb-1)));
4247 split_bits++;
4248 dout(30) << " is " << pgid << " parent " << parent << " ?" << dendl;
4249 parent_stat = pg_map.pg_stat.find(parent);
4250 if (parent_stat != pg_map.pg_stat.end() &&
4251 parent_stat->second.state != PG_STATE_CREATING) {
4252 dout(10) << " parent is " << parent << dendl;
4253 break;
4254 }
4255 }
4256 }
4257
4258 pg_stat_t &stats = pending_inc->pg_stat_updates[pgid];
4259 stats.state = PG_STATE_CREATING;
4260 stats.created = epoch;
4261 stats.parent = parent;
4262 stats.parent_split_bits = split_bits;
4263 stats.mapping_epoch = epoch;
4264
4265 if (parent_stat != pg_map.pg_stat.end()) {
4266 const pg_stat_t &ps = parent_stat->second;
4267 stats.last_fresh = ps.last_fresh;
4268 stats.last_active = ps.last_active;
4269 stats.last_change = ps.last_change;
4270 stats.last_peered = ps.last_peered;
4271 stats.last_clean = ps.last_clean;
4272 stats.last_unstale = ps.last_unstale;
4273 stats.last_undegraded = ps.last_undegraded;
4274 stats.last_fullsized = ps.last_fullsized;
4275 stats.last_scrub_stamp = ps.last_scrub_stamp;
4276 stats.last_deep_scrub_stamp = ps.last_deep_scrub_stamp;
4277 stats.last_clean_scrub_stamp = ps.last_clean_scrub_stamp;
4278 } else {
4279 utime_t now = osd_map.get_modified();
4280 stats.last_fresh = now;
4281 stats.last_active = now;
4282 stats.last_change = now;
4283 stats.last_peered = now;
4284 stats.last_clean = now;
4285 stats.last_unstale = now;
4286 stats.last_undegraded = now;
4287 stats.last_fullsized = now;
4288 stats.last_scrub_stamp = now;
4289 stats.last_deep_scrub_stamp = now;
4290 stats.last_clean_scrub_stamp = now;
4291 }
4292
4293 osd_map.pg_to_up_acting_osds(
4294 pgid,
4295 &stats.up,
4296 &stats.up_primary,
4297 &stats.acting,
4298 &stats.acting_primary);
4299
4300 if (split_bits == 0) {
4301 dout(10) << __func__ << " will create " << pgid
4302 << " primary " << stats.acting_primary
4303 << " acting " << stats.acting
4304 << dendl;
4305 } else {
4306 dout(10) << __func__ << " will create " << pgid
4307 << " primary " << stats.acting_primary
4308 << " acting " << stats.acting
4309 << " parent " << parent
4310 << " by " << split_bits << " bits"
4311 << dendl;
4312 }
4313}
4314
4315void PGMapUpdater::register_new_pgs(
4316 const OSDMap &osd_map,
4317 const PGMap &pg_map,
4318 PGMap::Incremental *pending_inc)
4319{
4320 epoch_t epoch = osd_map.get_epoch();
4321 dout(10) << __func__ << " checking pg pools for osdmap epoch " << epoch
4322 << ", last_pg_scan " << pg_map.last_pg_scan << dendl;
4323
4324 int created = 0;
4325 const auto &pools = osd_map.get_pools();
4326
4327 for (const auto &p : pools) {
4328 int64_t poolid = p.first;
4329 const pg_pool_t &pool = p.second;
31f18b77 4330 int ruleno = osd_map.crush->find_rule(pool.get_crush_rule(),
7c673cae
FG
4331 pool.get_type(), pool.get_size());
4332 if (ruleno < 0 || !osd_map.crush->rule_exists(ruleno))
4333 continue;
4334
4335 if (pool.get_last_change() <= pg_map.last_pg_scan ||
4336 pool.get_last_change() <= pending_inc->pg_scan) {
4337 dout(10) << " no change in pool " << poolid << " " << pool << dendl;
4338 continue;
4339 }
4340
4341 dout(10) << __func__ << " scanning pool " << poolid
4342 << " " << pool << dendl;
4343
4344 // first pgs in this pool
4345 bool new_pool = pg_map.pg_pool_sum.count(poolid) == 0;
4346
4347 for (ps_t ps = 0; ps < pool.get_pg_num(); ps++) {
4348 pg_t pgid(ps, poolid, -1);
4349 if (pg_map.pg_stat.count(pgid)) {
4350 dout(20) << "register_new_pgs have " << pgid << dendl;
4351 continue;
4352 }
4353 created++;
4354 register_pg(osd_map, pgid, pool.get_last_change(), new_pool,
4355 pg_map, pending_inc);
4356 }
4357 }
4358
4359 int removed = 0;
4360 for (const auto &p : pg_map.creating_pgs) {
4361 if (p.preferred() >= 0) {
4362 dout(20) << " removing creating_pg " << p
4363 << " because it is localized and obsolete" << dendl;
4364 pending_inc->pg_remove.insert(p);
4365 ++removed;
4366 } else if (!osd_map.have_pg_pool(p.pool())) {
4367 dout(20) << " removing creating_pg " << p
4368 << " because containing pool deleted" << dendl;
4369 pending_inc->pg_remove.insert(p);
4370 ++removed;
4371 }
4372 }
4373
4374 // deleted pools?
4375 for (const auto &p : pg_map.pg_stat) {
4376 if (!osd_map.have_pg_pool(p.first.pool())) {
4377 dout(20) << " removing pg_stat " << p.first << " because "
4378 << "containing pool deleted" << dendl;
4379 pending_inc->pg_remove.insert(p.first);
4380 ++removed;
4381 } else if (p.first.preferred() >= 0) {
4382 dout(20) << " removing localized pg " << p.first << dendl;
4383 pending_inc->pg_remove.insert(p.first);
4384 ++removed;
4385 }
4386 }
4387
4388 // we don't want to redo this work if we can avoid it.
4389 pending_inc->pg_scan = epoch;
4390
4391 dout(10) << "register_new_pgs registered " << created << " new pgs, removed "
4392 << removed << " uncreated pgs" << dendl;
4393}
4394
4395
4396void PGMapUpdater::update_creating_pgs(
4397 const OSDMap &osd_map,
4398 const PGMap &pg_map,
4399 PGMap::Incremental *pending_inc)
4400{
4401 dout(10) << __func__ << " to " << pg_map.creating_pgs.size()
4402 << " pgs, osdmap epoch " << osd_map.get_epoch()
4403 << dendl;
4404
4405 unsigned changed = 0;
31f18b77 4406 for (auto p = pg_map.creating_pgs.begin();
7c673cae
FG
4407 p != pg_map.creating_pgs.end();
4408 ++p) {
4409 pg_t pgid = *p;
4410 pg_t on = pgid;
31f18b77 4411 auto q = pg_map.pg_stat.find(pgid);
7c673cae
FG
4412 assert(q != pg_map.pg_stat.end());
4413 const pg_stat_t *s = &q->second;
4414
4415 if (s->parent_split_bits)
4416 on = s->parent;
4417
4418 vector<int> up, acting;
4419 int up_primary, acting_primary;
4420 osd_map.pg_to_up_acting_osds(
4421 on,
4422 &up,
4423 &up_primary,
4424 &acting,
4425 &acting_primary);
4426
4427 if (up != s->up ||
4428 up_primary != s->up_primary ||
4429 acting != s->acting ||
4430 acting_primary != s->acting_primary) {
4431 pg_stat_t *ns = &pending_inc->pg_stat_updates[pgid];
4432 if (osd_map.get_epoch() > ns->reported_epoch) {
4433 dout(20) << __func__ << " " << pgid << " "
4434 << " acting_primary: " << s->acting_primary
4435 << " -> " << acting_primary
4436 << " acting: " << s->acting << " -> " << acting
4437 << " up_primary: " << s->up_primary << " -> " << up_primary
4438 << " up: " << s->up << " -> " << up
4439 << dendl;
4440
4441 // only initialize if it wasn't already a pending update
4442 if (ns->reported_epoch == 0)
4443 *ns = *s;
4444
4445 // note epoch if the target of the create message changed
4446 if (acting_primary != ns->acting_primary)
4447 ns->mapping_epoch = osd_map.get_epoch();
4448
4449 ns->up = up;
4450 ns->up_primary = up_primary;
4451 ns->acting = acting;
4452 ns->acting_primary = acting_primary;
4453
4454 ++changed;
4455 } else {
4456 dout(20) << __func__ << " " << pgid << " has pending update from newer"
4457 << " epoch " << ns->reported_epoch
4458 << dendl;
4459 }
4460 }
4461 }
4462 if (changed) {
4463 dout(10) << __func__ << " " << changed << " pgs changed primary" << dendl;
4464 }
4465}
4466
4467static void _try_mark_pg_stale(
4468 const OSDMap& osdmap,
4469 pg_t pgid,
4470 const pg_stat_t& cur,
4471 PGMap::Incremental *pending_inc)
4472{
4473 if ((cur.state & PG_STATE_STALE) == 0 &&
4474 cur.acting_primary != -1 &&
4475 osdmap.is_down(cur.acting_primary)) {
4476 pg_stat_t *newstat;
4477 auto q = pending_inc->pg_stat_updates.find(pgid);
4478 if (q != pending_inc->pg_stat_updates.end()) {
4479 if ((q->second.acting_primary == cur.acting_primary) ||
4480 ((q->second.state & PG_STATE_STALE) == 0 &&
4481 q->second.acting_primary != -1 &&
4482 osdmap.is_down(q->second.acting_primary))) {
4483 newstat = &q->second;
4484 } else {
4485 // pending update is no longer down or already stale
4486 return;
4487 }
4488 } else {
4489 newstat = &pending_inc->pg_stat_updates[pgid];
4490 *newstat = cur;
4491 }
4492 dout(10) << __func__ << " marking pg " << pgid
4493 << " stale (acting_primary " << newstat->acting_primary
4494 << ")" << dendl;
4495 newstat->state |= PG_STATE_STALE;
4496 newstat->last_unstale = ceph_clock_now();
4497 }
4498}
4499
4500void PGMapUpdater::check_down_pgs(
4501 const OSDMap &osdmap,
4502 const PGMap &pg_map,
4503 bool check_all,
4504 const set<int>& need_check_down_pg_osds,
4505 PGMap::Incremental *pending_inc)
4506{
4507 // if a large number of osds changed state, just iterate over the whole
4508 // pg map.
4509 if (need_check_down_pg_osds.size() > (unsigned)osdmap.get_num_osds() *
4510 g_conf->mon_pg_check_down_all_threshold) {
4511 check_all = true;
4512 }
4513
4514 if (check_all) {
4515 for (const auto& p : pg_map.pg_stat) {
4516 _try_mark_pg_stale(osdmap, p.first, p.second, pending_inc);
4517 }
4518 } else {
4519 for (auto osd : need_check_down_pg_osds) {
4520 if (osdmap.is_down(osd)) {
4521 auto p = pg_map.pg_by_osd.find(osd);
4522 if (p == pg_map.pg_by_osd.end()) {
4523 continue;
4524 }
4525 for (auto pgid : p->second) {
4526 const pg_stat_t &stat = pg_map.pg_stat.at(pgid);
4527 assert(stat.acting_primary == osd);
4528 _try_mark_pg_stale(osdmap, pgid, stat, pending_inc);
4529 }
4530 }
4531 }
4532 }
4533}
4534
4535int reweight::by_utilization(
4536 const OSDMap &osdmap,
4537 const PGMap &pgm,
4538 int oload,
4539 double max_changef,
4540 int max_osds,
4541 bool by_pg, const set<int64_t> *pools,
4542 bool no_increasing,
4543 mempool::osdmap::map<int32_t, uint32_t>* new_weights,
4544 std::stringstream *ss,
4545 std::string *out_str,
4546 Formatter *f)
4547{
4548 if (oload <= 100) {
4549 *ss << "You must give a percentage higher than 100. "
4550 "The reweighting threshold will be calculated as <average-utilization> "
4551 "times <input-percentage>. For example, an argument of 200 would "
4552 "reweight OSDs which are twice as utilized as the average OSD.\n";
4553 return -EINVAL;
4554 }
4555
4556 vector<int> pgs_by_osd(osdmap.get_max_osd());
4557
4558 // Avoid putting a small number (or 0) in the denominator when calculating
4559 // average_util
4560 double average_util;
4561 if (by_pg) {
4562 // by pg mapping
4563 double weight_sum = 0.0; // sum up the crush weights
4564 unsigned num_pg_copies = 0;
4565 int num_osds = 0;
4566 for (const auto& pg : pgm.pg_stat) {
4567 if (pools && pools->count(pg.first.pool()) == 0)
4568 continue;
4569 for (const auto acting : pg.second.acting) {
4570 if (acting >= (int)pgs_by_osd.size())
4571 pgs_by_osd.resize(acting);
4572 if (pgs_by_osd[acting] == 0) {
4573 if (osdmap.crush->get_item_weightf(acting) <= 0) {
4574 //skip if we currently can not identify item
4575 continue;
4576 }
4577 weight_sum += osdmap.crush->get_item_weightf(acting);
4578 ++num_osds;
4579 }
4580 ++pgs_by_osd[acting];
4581 ++num_pg_copies;
4582 }
4583 }
4584
4585 if (!num_osds || (num_pg_copies / num_osds < g_conf->mon_reweight_min_pgs_per_osd)) {
4586 *ss << "Refusing to reweight: we only have " << num_pg_copies
4587 << " PGs across " << num_osds << " osds!\n";
4588 return -EDOM;
4589 }
4590
4591 average_util = (double)num_pg_copies / weight_sum;
4592 } else {
4593 // by osd utilization
4594 int num_osd = MAX(1, pgm.osd_stat.size());
4595 if ((uint64_t)pgm.osd_sum.kb * 1024 / num_osd
4596 < g_conf->mon_reweight_min_bytes_per_osd) {
4597 *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb
4598 << " kb across all osds!\n";
4599 return -EDOM;
4600 }
4601 if ((uint64_t)pgm.osd_sum.kb_used * 1024 / num_osd
4602 < g_conf->mon_reweight_min_bytes_per_osd) {
4603 *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb_used
4604 << " kb used across all osds!\n";
4605 return -EDOM;
4606 }
4607
4608 average_util = (double)pgm.osd_sum.kb_used / (double)pgm.osd_sum.kb;
4609 }
4610
4611 // adjust down only if we are above the threshold
4612 const double overload_util = average_util * (double)oload / 100.0;
4613
4614 // but aggressively adjust weights up whenever possible.
4615 const double underload_util = average_util;
4616
4617 const unsigned max_change = (unsigned)(max_changef * (double)0x10000);
4618
4619 ostringstream oss;
4620 if (f) {
4621 f->open_object_section("reweight_by_utilization");
4622 f->dump_int("overload_min", oload);
4623 f->dump_float("max_change", max_changef);
4624 f->dump_int("max_change_osds", max_osds);
4625 f->dump_float("average_utilization", average_util);
4626 f->dump_float("overload_utilization", overload_util);
4627 } else {
4628 oss << "oload " << oload << "\n";
4629 oss << "max_change " << max_changef << "\n";
4630 oss << "max_change_osds " << max_osds << "\n";
4631 oss.precision(4);
4632 oss << "average_utilization " << std::fixed << average_util << "\n";
4633 oss << "overload_utilization " << overload_util << "\n";
4634 }
4635 int num_changed = 0;
4636
4637 // precompute util for each OSD
4638 std::vector<std::pair<int, float> > util_by_osd;
4639 for (const auto& p : pgm.osd_stat) {
4640 std::pair<int, float> osd_util;
4641 osd_util.first = p.first;
4642 if (by_pg) {
4643 if (p.first >= (int)pgs_by_osd.size() ||
4644 pgs_by_osd[p.first] == 0) {
4645 // skip if this OSD does not contain any pg
4646 // belonging to the specified pool(s).
4647 continue;
4648 }
4649
4650 if (osdmap.crush->get_item_weightf(p.first) <= 0) {
4651 // skip if we are unable to locate item.
4652 continue;
4653 }
4654
4655 osd_util.second = pgs_by_osd[p.first] / osdmap.crush->get_item_weightf(p.first);
4656 } else {
4657 osd_util.second = (double)p.second.kb_used / (double)p.second.kb;
4658 }
4659 util_by_osd.push_back(osd_util);
4660 }
4661
4662 // sort by absolute deviation from the mean utilization,
4663 // in descending order.
4664 std::sort(util_by_osd.begin(), util_by_osd.end(),
4665 [average_util](std::pair<int, float> l, std::pair<int, float> r) {
4666 return abs(l.second - average_util) > abs(r.second - average_util);
4667 }
4668 );
4669
4670 if (f)
4671 f->open_array_section("reweights");
4672
4673 for (const auto& p : util_by_osd) {
4674 unsigned weight = osdmap.get_weight(p.first);
4675 if (weight == 0) {
4676 // skip if OSD is currently out
4677 continue;
4678 }
4679 float util = p.second;
4680
4681 if (util >= overload_util) {
4682 // Assign a lower weight to overloaded OSDs. The current weight
4683 // is a factor to take into account the original weights,
4684 // to represent e.g. differing storage capacities
4685 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
4686 if (weight > max_change)
4687 new_weight = MAX(new_weight, weight - max_change);
4688 new_weights->insert({p.first, new_weight});
4689 if (f) {
4690 f->open_object_section("osd");
4691 f->dump_int("osd", p.first);
4692 f->dump_float("weight", (float)weight / (float)0x10000);
4693 f->dump_float("new_weight", (float)new_weight / (float)0x10000);
4694 f->close_section();
4695 } else {
4696 oss << "osd." << p.first << " weight "
4697 << (float)weight / (float)0x10000 << " -> "
4698 << (float)new_weight / (float)0x10000 << "\n";
4699 }
4700 if (++num_changed >= max_osds)
4701 break;
4702 }
4703 if (!no_increasing && util <= underload_util) {
4704 // assign a higher weight.. if we can.
4705 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
4706 new_weight = MIN(new_weight, weight + max_change);
4707 if (new_weight > 0x10000)
4708 new_weight = 0x10000;
4709 if (new_weight > weight) {
4710 new_weights->insert({p.first, new_weight});
4711 oss << "osd." << p.first << " weight "
4712 << (float)weight / (float)0x10000 << " -> "
4713 << (float)new_weight / (float)0x10000 << "\n";
4714 if (++num_changed >= max_osds)
4715 break;
4716 }
4717 }
4718 }
4719 if (f) {
4720 f->close_section();
4721 }
4722
4723 OSDMap newmap;
4724 newmap.deepish_copy_from(osdmap);
4725 OSDMap::Incremental newinc;
4726 newinc.fsid = newmap.get_fsid();
4727 newinc.epoch = newmap.get_epoch() + 1;
4728 newinc.new_weight = *new_weights;
4729 newmap.apply_incremental(newinc);
4730
4731 osdmap.summarize_mapping_stats(&newmap, pools, out_str, f);
4732
4733 if (f) {
4734 f->close_section();
4735 } else {
4736 *out_str += "\n";
4737 *out_str += oss.str();
4738 }
4739 return num_changed;
4740}