]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/PGMap.cc
update sources to 12.2.7
[ceph.git] / ceph / src / mon / PGMap.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
224ce89b
WB
4#include <boost/algorithm/string.hpp>
5
7c673cae
FG
6#include "PGMap.h"
7
8#define dout_subsys ceph_subsys_mon
9#include "common/debug.h"
10#include "common/Formatter.h"
11#include "include/ceph_features.h"
12#include "include/stringify.h"
13
14#include "osd/osd_types.h"
15#include "osd/OSDMap.h"
16
17#define dout_context g_ceph_context
18
31f18b77
FG
19MEMPOOL_DEFINE_OBJECT_FACTORY(PGMapDigest, pgmap_digest, pgmap);
20MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap, pgmap, pgmap);
21MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap::Incremental, pgmap_inc, pgmap);
22
23
24// ---------------------
25// PGMapDigest
26
27void PGMapDigest::encode(bufferlist& bl, uint64_t features) const
28{
29 // NOTE: see PGMap::encode_digest
30 ENCODE_START(1, 1, bl);
31 ::encode(num_pg, bl);
32 ::encode(num_pg_active, bl);
33 ::encode(num_pg_unknown, bl);
34 ::encode(num_osd, bl);
35 ::encode(pg_pool_sum, bl, features);
36 ::encode(pg_sum, bl, features);
37 ::encode(osd_sum, bl);
38 ::encode(num_pg_by_state, bl);
39 ::encode(num_pg_by_osd, bl);
40 ::encode(num_pg_by_pool, bl);
41 ::encode(osd_last_seq, bl);
42 ::encode(per_pool_sum_delta, bl, features);
43 ::encode(per_pool_sum_deltas_stamps, bl);
44 ::encode(pg_sum_delta, bl, features);
45 ::encode(stamp_delta, bl);
46 ::encode(avail_space_by_rule, bl);
7c673cae
FG
47 ENCODE_FINISH(bl);
48}
49
31f18b77
FG
50void PGMapDigest::decode(bufferlist::iterator& p)
51{
52 DECODE_START(1, p);
53 ::decode(num_pg, p);
54 ::decode(num_pg_active, p);
55 ::decode(num_pg_unknown, p);
56 ::decode(num_osd, p);
57 ::decode(pg_pool_sum, p);
58 ::decode(pg_sum, p);
59 ::decode(osd_sum, p);
60 ::decode(num_pg_by_state, p);
61 ::decode(num_pg_by_osd, p);
62 ::decode(num_pg_by_pool, p);
63 ::decode(osd_last_seq, p);
64 ::decode(per_pool_sum_delta, p);
65 ::decode(per_pool_sum_deltas_stamps, p);
66 ::decode(pg_sum_delta, p);
67 ::decode(stamp_delta, p);
68 ::decode(avail_space_by_rule, p);
69 DECODE_FINISH(p);
70}
71
72void PGMapDigest::dump(Formatter *f) const
73{
74 f->dump_unsigned("num_pg", num_pg);
75 f->dump_unsigned("num_pg_active", num_pg_active);
76 f->dump_unsigned("num_pg_unknown", num_pg_unknown);
77 f->dump_unsigned("num_osd", num_osd);
78 f->dump_object("pool_sum", pg_sum);
79 f->dump_object("osd_sum", osd_sum);
80 f->open_array_section("pool_stats");
81 for (auto& p : pg_pool_sum) {
82 f->open_object_section("pool_stat");
83 f->dump_int("poolid", p.first);
84 auto q = num_pg_by_pool.find(p.first);
85 if (q != num_pg_by_pool.end())
86 f->dump_unsigned("num_pg", q->second);
87 p.second.dump(f);
7c673cae
FG
88 f->close_section();
89 }
90 f->close_section();
31f18b77
FG
91 f->open_array_section("osd_stats");
92 int i = 0;
93 // TODO: this isn't really correct since we can dump non-existent OSDs
94 // I dunno what osd_last_seq is set to in that case...
95 for (auto& p : osd_last_seq) {
7c673cae 96 f->open_object_section("osd_stat");
31f18b77
FG
97 f->dump_int("osd", i);
98 f->dump_unsigned("seq", p);
7c673cae 99 f->close_section();
31f18b77 100 ++i;
7c673cae
FG
101 }
102 f->close_section();
31f18b77
FG
103 f->open_array_section("num_pg_by_state");
104 for (auto& p : num_pg_by_state) {
105 f->open_object_section("count");
106 f->dump_string("state", pg_state_string(p.first));
107 f->dump_unsigned("num", p.second);
108 f->close_section();
109 }
7c673cae 110 f->close_section();
31f18b77
FG
111 f->open_array_section("num_pg_by_osd");
112 for (auto& p : num_pg_by_osd) {
113 f->open_object_section("count");
114 f->dump_unsigned("osd", p.first);
115 f->dump_unsigned("num_primary_pg", p.second.primary);
116 f->dump_unsigned("num_acting_pg", p.second.acting);
117 f->dump_unsigned("num_up_pg", p.second.up);
118 f->close_section();
119 }
7c673cae
FG
120 f->close_section();
121}
122
31f18b77 123void PGMapDigest::generate_test_instances(list<PGMapDigest*>& ls)
7c673cae 124{
31f18b77 125 ls.push_back(new PGMapDigest);
7c673cae
FG
126}
127
31f18b77
FG
// Format a value for a percentage column: anything below 0.01 (including
// negative values) is rendered as "0", everything else as fixed-point with
// two decimals.
inline std::string percentify(const float& a) {
  if (a < 0.01)
    return "0";

  std::ostringstream formatted;
  formatted << std::fixed << std::setprecision(2) << a;
  return formatted.str();
}
7c673cae 136
31f18b77 137void PGMapDigest::print_summary(Formatter *f, ostream *out) const
7c673cae 138{
31f18b77
FG
139 if (f)
140 f->open_array_section("pgs_by_state");
7c673cae 141
31f18b77
FG
142 // list is descending numeric order (by count)
143 multimap<int,int> state_by_count; // count -> state
144 for (auto p = num_pg_by_state.begin();
145 p != num_pg_by_state.end();
146 ++p) {
147 state_by_count.insert(make_pair(p->second, p->first));
7c673cae 148 }
31f18b77
FG
149 if (f) {
150 for (auto p = state_by_count.rbegin();
151 p != state_by_count.rend();
152 ++p)
153 {
154 f->open_object_section("pgs_by_state_element");
155 f->dump_string("state_name", pg_state_string(p->second));
156 f->dump_unsigned("count", p->first);
157 f->close_section();
158 }
7c673cae 159 }
31f18b77
FG
160 if (f)
161 f->close_section();
7c673cae 162
31f18b77
FG
163 if (f) {
164 f->dump_unsigned("num_pgs", num_pg);
165 f->dump_unsigned("num_pools", pg_pool_sum.size());
166 f->dump_unsigned("num_objects", pg_sum.stats.sum.num_objects);
167 f->dump_unsigned("data_bytes", pg_sum.stats.sum.num_bytes);
168 f->dump_unsigned("bytes_used", osd_sum.kb_used * 1024ull);
169 f->dump_unsigned("bytes_avail", osd_sum.kb_avail * 1024ull);
170 f->dump_unsigned("bytes_total", osd_sum.kb * 1024ull);
171 } else {
172 *out << " pools: " << pg_pool_sum.size() << " pools, "
173 << num_pg << " pgs\n";
174 *out << " objects: " << si_t(pg_sum.stats.sum.num_objects) << " objects, "
175 << prettybyte_t(pg_sum.stats.sum.num_bytes) << "\n";
176 *out << " usage: "
177 << kb_t(osd_sum.kb_used) << " used, "
178 << kb_t(osd_sum.kb_avail) << " / "
179 << kb_t(osd_sum.kb) << " avail\n";
180 *out << " pgs: ";
181 }
7c673cae 182
31f18b77 183 bool pad = false;
7c673cae 184
31f18b77
FG
185 if (num_pg_unknown > 0) {
186 float p = (float)num_pg_unknown / (float)num_pg;
187 if (f) {
188 f->dump_float("unknown_pgs_ratio", p);
7c673cae 189 } else {
31f18b77
FG
190 char b[20];
191 snprintf(b, sizeof(b), "%.3lf", p * 100.0);
192 *out << b << "% pgs unknown\n";
193 pad = true;
7c673cae 194 }
7c673cae 195 }
7c673cae 196
31f18b77
FG
197 int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
198 if (num_pg_inactive > 0) {
199 float p = (float)num_pg_inactive / (float)num_pg;
200 if (f) {
201 f->dump_float("inactive_pgs_ratio", p);
7c673cae 202 } else {
31f18b77
FG
203 if (pad) {
204 *out << " ";
205 }
206 char b[20];
207 snprintf(b, sizeof(b), "%.3f", p * 100.0);
208 *out << b << "% pgs not active\n";
209 pad = true;
7c673cae 210 }
7c673cae 211 }
31f18b77
FG
212
213 list<string> sl;
214 overall_recovery_summary(f, &sl);
215 if (!f && !sl.empty()) {
216 for (auto p = sl.begin(); p != sl.end(); ++p) {
217 if (pad) {
218 *out << " ";
219 }
220 *out << *p << "\n";
221 pad = true;
7c673cae 222 }
7c673cae 223 }
31f18b77 224 sl.clear();
7c673cae 225
31f18b77
FG
226 if (!f) {
227 unsigned max_width = 1;
228 for (multimap<int,int>::reverse_iterator p = state_by_count.rbegin();
229 p != state_by_count.rend();
230 ++p)
231 {
232 std::stringstream ss;
233 ss << p->first;
234 max_width = MAX(ss.str().size(), max_width);
7c673cae
FG
235 }
236
31f18b77
FG
237 for (multimap<int,int>::reverse_iterator p = state_by_count.rbegin();
238 p != state_by_count.rend();
239 ++p)
240 {
241 if (pad) {
242 *out << " ";
243 }
244 pad = true;
245 out->setf(std::ios::left);
246 *out << std::setw(max_width) << p->first
247 << " " << pg_state_string(p->second) << "\n";
248 out->unsetf(std::ios::left);
249 }
7c673cae
FG
250 }
251
31f18b77
FG
252 ostringstream ss_rec_io;
253 overall_recovery_rate_summary(f, &ss_rec_io);
254 ostringstream ss_client_io;
255 overall_client_io_rate_summary(f, &ss_client_io);
256 ostringstream ss_cache_io;
257 overall_cache_io_rate_summary(f, &ss_cache_io);
7c673cae 258
31f18b77
FG
259 if (!f && (ss_client_io.str().length() || ss_rec_io.str().length()
260 || ss_cache_io.str().length())) {
261 *out << "\n \n";
262 *out << " io:\n";
7c673cae
FG
263 }
264
31f18b77
FG
265 if (!f && ss_client_io.str().length())
266 *out << " client: " << ss_client_io.str() << "\n";
267 if (!f && ss_rec_io.str().length())
268 *out << " recovery: " << ss_rec_io.str() << "\n";
269 if (!f && ss_cache_io.str().length())
270 *out << " cache: " << ss_cache_io.str() << "\n";
7c673cae
FG
271}
272
31f18b77 273void PGMapDigest::print_oneline_summary(Formatter *f, ostream *out) const
7c673cae 274{
31f18b77
FG
275 std::stringstream ss;
276
277 if (f)
278 f->open_array_section("num_pg_by_state");
279 for (auto p = num_pg_by_state.begin();
280 p != num_pg_by_state.end();
281 ++p) {
282 if (f) {
283 f->open_object_section("state");
284 f->dump_string("name", pg_state_string(p->first));
285 f->dump_unsigned("num", p->second);
286 f->close_section();
287 }
288 if (p != num_pg_by_state.begin())
289 ss << ", ";
290 ss << p->second << " " << pg_state_string(p->first);
7c673cae 291 }
31f18b77
FG
292 if (f)
293 f->close_section();
7c673cae 294
31f18b77
FG
295 string states = ss.str();
296 if (out)
297 *out << num_pg << " pgs: "
298 << states << "; "
299 << prettybyte_t(pg_sum.stats.sum.num_bytes) << " data, "
300 << kb_t(osd_sum.kb_used) << " used, "
301 << kb_t(osd_sum.kb_avail) << " / "
302 << kb_t(osd_sum.kb) << " avail";
303 if (f) {
304 f->dump_unsigned("num_pgs", num_pg);
305 f->dump_unsigned("num_bytes", pg_sum.stats.sum.num_bytes);
306 f->dump_unsigned("raw_bytes_used", osd_sum.kb_used << 10);
307 f->dump_unsigned("raw_bytes_avail", osd_sum.kb_avail << 10);
308 f->dump_unsigned("raw_bytes", osd_sum.kb << 10);
309 }
7c673cae 310
31f18b77
FG
311 // make non-negative; we can get negative values if osds send
312 // uncommitted stats and then "go backward" or if they are just
313 // buggy/wrong.
314 pool_stat_t pos_delta = pg_sum_delta;
315 pos_delta.floor(0);
316 if (pos_delta.stats.sum.num_rd ||
317 pos_delta.stats.sum.num_wr) {
318 if (out)
319 *out << "; ";
320 if (pos_delta.stats.sum.num_rd) {
321 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)stamp_delta;
322 if (out)
323 *out << pretty_si_t(rd) << "B/s rd, ";
324 if (f)
325 f->dump_unsigned("read_bytes_sec", rd);
326 }
327 if (pos_delta.stats.sum.num_wr) {
328 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)stamp_delta;
329 if (out)
330 *out << pretty_si_t(wr) << "B/s wr, ";
331 if (f)
332 f->dump_unsigned("write_bytes_sec", wr);
333 }
334 int64_t iops = (pos_delta.stats.sum.num_rd + pos_delta.stats.sum.num_wr) / (double)stamp_delta;
335 if (out)
336 *out << pretty_si_t(iops) << "op/s";
337 if (f)
338 f->dump_unsigned("io_sec", iops);
7c673cae 339 }
31f18b77
FG
340
341 list<string> sl;
342 overall_recovery_summary(f, &sl);
343 if (out)
344 for (auto p = sl.begin(); p != sl.end(); ++p)
345 *out << "; " << *p;
346 std::stringstream ssr;
347 overall_recovery_rate_summary(f, &ssr);
348 if (out && ssr.str().length())
349 *out << "; " << ssr.str() << " recovering";
7c673cae
FG
350}
351
31f18b77 352void PGMapDigest::recovery_summary(Formatter *f, list<string> *psl,
b32b8144 353 const pool_stat_t& pool_sum) const
7c673cae 354{
b32b8144
FG
355 if (pool_sum.stats.sum.num_objects_degraded && pool_sum.stats.sum.num_object_copies > 0) {
356 double pc = (double)pool_sum.stats.sum.num_objects_degraded /
357 (double)pool_sum.stats.sum.num_object_copies * (double)100.0;
31f18b77
FG
358 char b[20];
359 snprintf(b, sizeof(b), "%.3lf", pc);
360 if (f) {
b32b8144
FG
361 f->dump_unsigned("degraded_objects", pool_sum.stats.sum.num_objects_degraded);
362 f->dump_unsigned("degraded_total", pool_sum.stats.sum.num_object_copies);
31f18b77
FG
363 f->dump_float("degraded_ratio", pc / 100.0);
364 } else {
365 ostringstream ss;
b32b8144
FG
366 ss << pool_sum.stats.sum.num_objects_degraded
367 << "/" << pool_sum.stats.sum.num_object_copies << " objects degraded (" << b << "%)";
31f18b77
FG
368 psl->push_back(ss.str());
369 }
370 }
b32b8144
FG
371 if (pool_sum.stats.sum.num_objects_misplaced && pool_sum.stats.sum.num_object_copies > 0) {
372 double pc = (double)pool_sum.stats.sum.num_objects_misplaced /
373 (double)pool_sum.stats.sum.num_object_copies * (double)100.0;
31f18b77
FG
374 char b[20];
375 snprintf(b, sizeof(b), "%.3lf", pc);
376 if (f) {
b32b8144
FG
377 f->dump_unsigned("misplaced_objects", pool_sum.stats.sum.num_objects_misplaced);
378 f->dump_unsigned("misplaced_total", pool_sum.stats.sum.num_object_copies);
31f18b77
FG
379 f->dump_float("misplaced_ratio", pc / 100.0);
380 } else {
381 ostringstream ss;
b32b8144
FG
382 ss << pool_sum.stats.sum.num_objects_misplaced
383 << "/" << pool_sum.stats.sum.num_object_copies << " objects misplaced (" << b << "%)";
31f18b77
FG
384 psl->push_back(ss.str());
385 }
386 }
b32b8144
FG
387 if (pool_sum.stats.sum.num_objects_unfound && pool_sum.stats.sum.num_objects) {
388 double pc = (double)pool_sum.stats.sum.num_objects_unfound /
389 (double)pool_sum.stats.sum.num_objects * (double)100.0;
31f18b77
FG
390 char b[20];
391 snprintf(b, sizeof(b), "%.3lf", pc);
392 if (f) {
b32b8144
FG
393 f->dump_unsigned("unfound_objects", pool_sum.stats.sum.num_objects_unfound);
394 f->dump_unsigned("unfound_total", pool_sum.stats.sum.num_objects);
31f18b77
FG
395 f->dump_float("unfound_ratio", pc / 100.0);
396 } else {
397 ostringstream ss;
b32b8144
FG
398 ss << pool_sum.stats.sum.num_objects_unfound
399 << "/" << pool_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
31f18b77
FG
400 psl->push_back(ss.str());
401 }
7c673cae 402 }
7c673cae
FG
403}
404
31f18b77
FG
405void PGMapDigest::recovery_rate_summary(Formatter *f, ostream *out,
406 const pool_stat_t& delta_sum,
407 utime_t delta_stamp) const
7c673cae 408{
31f18b77
FG
409 // make non-negative; we can get negative values if osds send
410 // uncommitted stats and then "go backward" or if they are just
411 // buggy/wrong.
412 pool_stat_t pos_delta = delta_sum;
413 pos_delta.floor(0);
414 if (pos_delta.stats.sum.num_objects_recovered ||
415 pos_delta.stats.sum.num_bytes_recovered ||
416 pos_delta.stats.sum.num_keys_recovered) {
417 int64_t objps = pos_delta.stats.sum.num_objects_recovered / (double)delta_stamp;
418 int64_t bps = pos_delta.stats.sum.num_bytes_recovered / (double)delta_stamp;
419 int64_t kps = pos_delta.stats.sum.num_keys_recovered / (double)delta_stamp;
420 if (f) {
421 f->dump_int("recovering_objects_per_sec", objps);
422 f->dump_int("recovering_bytes_per_sec", bps);
423 f->dump_int("recovering_keys_per_sec", kps);
424 f->dump_int("num_objects_recovered", pos_delta.stats.sum.num_objects_recovered);
425 f->dump_int("num_bytes_recovered", pos_delta.stats.sum.num_bytes_recovered);
426 f->dump_int("num_keys_recovered", pos_delta.stats.sum.num_keys_recovered);
427 } else {
428 *out << pretty_si_t(bps) << "B/s";
429 if (pos_delta.stats.sum.num_keys_recovered)
430 *out << ", " << pretty_si_t(kps) << "keys/s";
431 *out << ", " << pretty_si_t(objps) << "objects/s";
432 }
7c673cae 433 }
31f18b77 434}
7c673cae 435
31f18b77
FG
436void PGMapDigest::overall_recovery_rate_summary(Formatter *f, ostream *out) const
437{
438 recovery_rate_summary(f, out, pg_sum_delta, stamp_delta);
7c673cae
FG
439}
440
31f18b77 441void PGMapDigest::overall_recovery_summary(Formatter *f, list<string> *psl) const
7c673cae 442{
31f18b77 443 recovery_summary(f, psl, pg_sum);
7c673cae
FG
444}
445
31f18b77
FG
446void PGMapDigest::pool_recovery_rate_summary(Formatter *f, ostream *out,
447 uint64_t poolid) const
7c673cae 448{
31f18b77
FG
449 auto p = per_pool_sum_delta.find(poolid);
450 if (p == per_pool_sum_delta.end())
451 return;
7c673cae 452
31f18b77
FG
453 auto ts = per_pool_sum_deltas_stamps.find(p->first);
454 assert(ts != per_pool_sum_deltas_stamps.end());
455 recovery_rate_summary(f, out, p->second.first, ts->second);
456}
7c673cae 457
31f18b77
FG
458void PGMapDigest::pool_recovery_summary(Formatter *f, list<string> *psl,
459 uint64_t poolid) const
460{
b32b8144
FG
461 auto p = pg_pool_sum.find(poolid);
462 if (p == pg_pool_sum.end())
31f18b77 463 return;
7c673cae 464
b32b8144 465 recovery_summary(f, psl, p->second);
7c673cae
FG
466}
467
31f18b77
FG
468void PGMapDigest::client_io_rate_summary(Formatter *f, ostream *out,
469 const pool_stat_t& delta_sum,
470 utime_t delta_stamp) const
7c673cae 471{
31f18b77
FG
472 pool_stat_t pos_delta = delta_sum;
473 pos_delta.floor(0);
474 if (pos_delta.stats.sum.num_rd ||
475 pos_delta.stats.sum.num_wr) {
476 if (pos_delta.stats.sum.num_rd) {
477 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)delta_stamp;
478 if (f) {
479 f->dump_int("read_bytes_sec", rd);
480 } else {
481 *out << pretty_si_t(rd) << "B/s rd, ";
482 }
483 }
484 if (pos_delta.stats.sum.num_wr) {
485 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)delta_stamp;
486 if (f) {
487 f->dump_int("write_bytes_sec", wr);
488 } else {
489 *out << pretty_si_t(wr) << "B/s wr, ";
490 }
491 }
492 int64_t iops_rd = pos_delta.stats.sum.num_rd / (double)delta_stamp;
493 int64_t iops_wr = pos_delta.stats.sum.num_wr / (double)delta_stamp;
494 if (f) {
495 f->dump_int("read_op_per_sec", iops_rd);
496 f->dump_int("write_op_per_sec", iops_wr);
497 } else {
498 *out << pretty_si_t(iops_rd) << "op/s rd, " << pretty_si_t(iops_wr) << "op/s wr";
499 }
7c673cae
FG
500 }
501}
502
31f18b77 503void PGMapDigest::overall_client_io_rate_summary(Formatter *f, ostream *out) const
7c673cae 504{
31f18b77
FG
505 client_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
506}
7c673cae 507
31f18b77
FG
508void PGMapDigest::pool_client_io_rate_summary(Formatter *f, ostream *out,
509 uint64_t poolid) const
510{
511 auto p = per_pool_sum_delta.find(poolid);
512 if (p == per_pool_sum_delta.end())
7c673cae
FG
513 return;
514
31f18b77
FG
515 auto ts = per_pool_sum_deltas_stamps.find(p->first);
516 assert(ts != per_pool_sum_deltas_stamps.end());
517 client_io_rate_summary(f, out, p->second.first, ts->second);
7c673cae
FG
518}
519
31f18b77
FG
520void PGMapDigest::cache_io_rate_summary(Formatter *f, ostream *out,
521 const pool_stat_t& delta_sum,
522 utime_t delta_stamp) const
7c673cae 523{
31f18b77
FG
524 pool_stat_t pos_delta = delta_sum;
525 pos_delta.floor(0);
526 bool have_output = false;
7c673cae 527
31f18b77
FG
528 if (pos_delta.stats.sum.num_flush) {
529 int64_t flush = (pos_delta.stats.sum.num_flush_kb << 10) / (double)delta_stamp;
530 if (f) {
531 f->dump_int("flush_bytes_sec", flush);
532 } else {
533 *out << pretty_si_t(flush) << "B/s flush";
534 have_output = true;
7c673cae
FG
535 }
536 }
31f18b77
FG
537 if (pos_delta.stats.sum.num_evict) {
538 int64_t evict = (pos_delta.stats.sum.num_evict_kb << 10) / (double)delta_stamp;
539 if (f) {
540 f->dump_int("evict_bytes_sec", evict);
541 } else {
542 if (have_output)
543 *out << ", ";
544 *out << pretty_si_t(evict) << "B/s evict";
545 have_output = true;
546 }
7c673cae 547 }
31f18b77
FG
548 if (pos_delta.stats.sum.num_promote) {
549 int64_t promote = pos_delta.stats.sum.num_promote / (double)delta_stamp;
550 if (f) {
551 f->dump_int("promote_op_per_sec", promote);
552 } else {
553 if (have_output)
554 *out << ", ";
555 *out << pretty_si_t(promote) << "op/s promote";
556 have_output = true;
557 }
7c673cae 558 }
31f18b77
FG
559 if (pos_delta.stats.sum.num_flush_mode_low) {
560 if (f) {
561 f->dump_int("num_flush_mode_low", pos_delta.stats.sum.num_flush_mode_low);
562 } else {
563 if (have_output)
564 *out << ", ";
565 *out << pretty_si_t(pos_delta.stats.sum.num_flush_mode_low) << "PG(s) flushing";
566 have_output = true;
567 }
7c673cae 568 }
31f18b77
FG
569 if (pos_delta.stats.sum.num_flush_mode_high) {
570 if (f) {
571 f->dump_int("num_flush_mode_high", pos_delta.stats.sum.num_flush_mode_high);
572 } else {
573 if (have_output)
574 *out << ", ";
575 *out << pretty_si_t(pos_delta.stats.sum.num_flush_mode_high) << "PG(s) flushing (high)";
576 have_output = true;
577 }
7c673cae 578 }
31f18b77
FG
579 if (pos_delta.stats.sum.num_evict_mode_some) {
580 if (f) {
581 f->dump_int("num_evict_mode_some", pos_delta.stats.sum.num_evict_mode_some);
582 } else {
583 if (have_output)
584 *out << ", ";
585 *out << pretty_si_t(pos_delta.stats.sum.num_evict_mode_some) << "PG(s) evicting";
586 have_output = true;
587 }
588 }
589 if (pos_delta.stats.sum.num_evict_mode_full) {
590 if (f) {
591 f->dump_int("num_evict_mode_full", pos_delta.stats.sum.num_evict_mode_full);
592 } else {
593 if (have_output)
594 *out << ", ";
595 *out << pretty_si_t(pos_delta.stats.sum.num_evict_mode_full) << "PG(s) evicting (full)";
596 }
7c673cae
FG
597 }
598}
599
31f18b77 600void PGMapDigest::overall_cache_io_rate_summary(Formatter *f, ostream *out) const
7c673cae 601{
31f18b77 602 cache_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
7c673cae
FG
603}
604
31f18b77
FG
605void PGMapDigest::pool_cache_io_rate_summary(Formatter *f, ostream *out,
606 uint64_t poolid) const
7c673cae 607{
31f18b77
FG
608 auto p = per_pool_sum_delta.find(poolid);
609 if (p == per_pool_sum_delta.end())
610 return;
7c673cae 611
31f18b77
FG
612 auto ts = per_pool_sum_deltas_stamps.find(p->first);
613 assert(ts != per_pool_sum_deltas_stamps.end());
614 cache_io_rate_summary(f, out, p->second.first, ts->second);
7c673cae
FG
615}
616
d2e6a577
FG
617static float pool_raw_used_rate(const OSDMap &osd_map, int64_t poolid)
618{
619 const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
620
621 switch (pool->get_type()) {
622 case pg_pool_t::TYPE_REPLICATED:
623 return pool->get_size();
624 break;
625 case pg_pool_t::TYPE_ERASURE:
626 {
627 auto& ecp =
628 osd_map.get_erasure_code_profile(pool->erasure_code_profile);
629 auto pm = ecp.find("m");
630 auto pk = ecp.find("k");
631 if (pm != ecp.end() && pk != ecp.end()) {
632 int k = atoi(pk->second.c_str());
633 int m = atoi(pm->second.c_str());
634 int mk = m + k;
635 assert(mk != 0);
636 assert(k != 0);
637 return (float)mk / k;
638 } else {
639 return 0.0;
640 }
641 }
642 break;
643 default:
644 assert(0 == "unrecognized pool type");
645 }
646}
647
648ceph_statfs PGMapDigest::get_statfs(OSDMap &osdmap,
649 boost::optional<int64_t> data_pool) const
650{
651 ceph_statfs statfs;
652 bool filter = false;
653 object_stat_sum_t sum;
654
655 if (data_pool) {
656 auto i = pg_pool_sum.find(*data_pool);
657 if (i != pg_pool_sum.end()) {
658 sum = i->second.stats.sum;
659 filter = true;
660 }
661 }
662
663 if (filter) {
664 statfs.kb_used = (sum.num_bytes >> 10);
665 statfs.kb_avail = get_pool_free_space(osdmap, *data_pool) >> 10;
666 statfs.num_objects = sum.num_objects;
667 statfs.kb = statfs.kb_used + statfs.kb_avail;
668 } else {
669 // these are in KB.
670 statfs.kb = osd_sum.kb;
671 statfs.kb_used = osd_sum.kb_used;
672 statfs.kb_avail = osd_sum.kb_avail;
673 statfs.num_objects = pg_sum.stats.sum.num_objects;
674 }
675
676 return statfs;
677}
678
31f18b77
FG
679void PGMapDigest::dump_pool_stats_full(
680 const OSDMap &osd_map,
681 stringstream *ss,
682 Formatter *f,
683 bool verbose) const
7c673cae 684{
31f18b77 685 TextTable tbl;
7c673cae 686
31f18b77
FG
687 if (f) {
688 f->open_array_section("pools");
689 } else {
690 tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
691 tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT);
692 if (verbose) {
693 tbl.define_column("QUOTA OBJECTS", TextTable::LEFT, TextTable::LEFT);
694 tbl.define_column("QUOTA BYTES", TextTable::LEFT, TextTable::LEFT);
695 }
7c673cae 696
31f18b77
FG
697 tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
698 tbl.define_column("%USED", TextTable::LEFT, TextTable::RIGHT);
699 tbl.define_column("MAX AVAIL", TextTable::LEFT, TextTable::RIGHT);
700 tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
701 if (verbose) {
702 tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT);
703 tbl.define_column("READ", TextTable::LEFT, TextTable::RIGHT);
704 tbl.define_column("WRITE", TextTable::LEFT, TextTable::RIGHT);
705 tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
706 }
707 }
708
709 map<int,uint64_t> avail_by_rule;
710 for (auto p = osd_map.get_pools().begin();
711 p != osd_map.get_pools().end(); ++p) {
712 int64_t pool_id = p->first;
713 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
714 continue;
715 const string& pool_name = osd_map.get_pool_name(pool_id);
716 const pool_stat_t &stat = pg_pool_sum.at(pool_id);
717
718 const pg_pool_t *pool = osd_map.get_pg_pool(pool_id);
719 int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
720 pool->get_type(),
721 pool->get_size());
722 int64_t avail;
723 float raw_used_rate;
724 if (avail_by_rule.count(ruleno) == 0) {
725 // FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked
726 avail = get_rule_avail(ruleno);
727 if (avail < 0)
728 avail = 0;
729 avail_by_rule[ruleno] = avail;
730 } else {
731 avail = avail_by_rule[ruleno];
732 }
d2e6a577
FG
733
734 raw_used_rate = ::pool_raw_used_rate(osd_map, pool_id);
31f18b77
FG
735
736 if (f) {
737 f->open_object_section("pool");
738 f->dump_string("name", pool_name);
739 f->dump_int("id", pool_id);
740 f->open_object_section("stats");
741 } else {
742 tbl << pool_name
743 << pool_id;
744 if (verbose) {
745 if (pool->quota_max_objects == 0)
746 tbl << "N/A";
747 else
748 tbl << si_t(pool->quota_max_objects);
749
750 if (pool->quota_max_bytes == 0)
751 tbl << "N/A";
752 else
753 tbl << si_t(pool->quota_max_bytes);
754 }
755
756 }
757 dump_object_stat_sum(tbl, f, stat.stats.sum, avail, raw_used_rate, verbose, pool);
758 if (f)
759 f->close_section(); // stats
760 else
761 tbl << TextTable::endrow;
762
763 if (f)
764 f->close_section(); // pool
765 }
766 if (f)
767 f->close_section();
768 else {
769 assert(ss != nullptr);
770 *ss << "POOLS:\n";
771 tbl.set_indent(4);
772 *ss << tbl;
773 }
774}
775
776void PGMapDigest::dump_fs_stats(stringstream *ss, Formatter *f, bool verbose) const
777{
778 if (f) {
779 f->open_object_section("stats");
780 f->dump_int("total_bytes", osd_sum.kb * 1024ull);
781 f->dump_int("total_used_bytes", osd_sum.kb_used * 1024ull);
782 f->dump_int("total_avail_bytes", osd_sum.kb_avail * 1024ull);
783 if (verbose) {
784 f->dump_int("total_objects", pg_sum.stats.sum.num_objects);
785 }
786 f->close_section();
787 } else {
788 assert(ss != nullptr);
789 TextTable tbl;
790 tbl.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
791 tbl.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
792 tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
793 tbl.define_column("%RAW USED", TextTable::LEFT, TextTable::RIGHT);
794 if (verbose) {
795 tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
796 }
797 tbl << stringify(si_t(osd_sum.kb*1024))
798 << stringify(si_t(osd_sum.kb_avail*1024))
799 << stringify(si_t(osd_sum.kb_used*1024));
800 float used = 0.0;
801 if (osd_sum.kb > 0) {
802 used = ((float)osd_sum.kb_used / osd_sum.kb);
803 }
804 tbl << percentify(used*100);
805 if (verbose) {
806 tbl << stringify(si_t(pg_sum.stats.sum.num_objects));
807 }
808 tbl << TextTable::endrow;
809
810 *ss << "GLOBAL:\n";
811 tbl.set_indent(4);
812 *ss << tbl;
813 }
814}
815
816void PGMapDigest::dump_object_stat_sum(
817 TextTable &tbl, Formatter *f,
818 const object_stat_sum_t &sum, uint64_t avail,
819 float raw_used_rate, bool verbose,
820 const pg_pool_t *pool)
821{
822 float curr_object_copies_rate = 0.0;
823 if (sum.num_object_copies > 0)
824 curr_object_copies_rate = (float)(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies;
825
826 float used = 0.0;
3efd9988 827 // note avail passed in is raw_avail, calc raw_used here.
31f18b77 828 if (avail) {
3efd9988 829 used = sum.num_bytes * raw_used_rate * curr_object_copies_rate;
31f18b77
FG
830 used /= used + avail;
831 } else if (sum.num_bytes) {
832 used = 1.0;
833 }
834
835 if (f) {
836 f->dump_int("kb_used", SHIFT_ROUND_UP(sum.num_bytes, 10));
837 f->dump_int("bytes_used", sum.num_bytes);
838 f->dump_format_unquoted("percent_used", "%.2f", (used*100));
181888fb 839 f->dump_unsigned("max_avail", avail / raw_used_rate);
31f18b77
FG
840 f->dump_int("objects", sum.num_objects);
841 if (verbose) {
842 f->dump_int("quota_objects", pool->quota_max_objects);
843 f->dump_int("quota_bytes", pool->quota_max_bytes);
844 f->dump_int("dirty", sum.num_objects_dirty);
845 f->dump_int("rd", sum.num_rd);
846 f->dump_int("rd_bytes", sum.num_rd_kb * 1024ull);
847 f->dump_int("wr", sum.num_wr);
848 f->dump_int("wr_bytes", sum.num_wr_kb * 1024ull);
849 f->dump_int("raw_bytes_used", sum.num_bytes * raw_used_rate * curr_object_copies_rate);
850 }
851 } else {
852 tbl << stringify(si_t(sum.num_bytes));
853 tbl << percentify(used*100);
181888fb 854 tbl << si_t(avail / raw_used_rate);
31f18b77
FG
855 tbl << sum.num_objects;
856 if (verbose) {
857 tbl << stringify(si_t(sum.num_objects_dirty))
858 << stringify(si_t(sum.num_rd))
859 << stringify(si_t(sum.num_wr))
860 << stringify(si_t(sum.num_bytes * raw_used_rate * curr_object_copies_rate));
861 }
862 }
863}
864
d2e6a577
FG
865int64_t PGMapDigest::get_pool_free_space(const OSDMap &osd_map,
866 int64_t poolid) const
867{
868 const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
869 int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
870 pool->get_type(),
871 pool->get_size());
872 int64_t avail;
873 avail = get_rule_avail(ruleno);
874 if (avail < 0)
875 avail = 0;
876
877 return avail / ::pool_raw_used_rate(osd_map, poolid);
878}
879
31f18b77
FG
880int64_t PGMap::get_rule_avail(const OSDMap& osdmap, int ruleno) const
881{
882 map<int,float> wm;
883 int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm);
884 if (r < 0) {
885 return r;
886 }
887 if (wm.empty()) {
888 return 0;
889 }
890
891 float fratio;
892 if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
893 osdmap.get_full_ratio() > 0) {
894 fratio = osdmap.get_full_ratio();
895 } else {
896 fratio = get_fallback_full_ratio();
897 }
898
899 int64_t min = -1;
900 for (auto p = wm.begin(); p != wm.end(); ++p) {
901 auto osd_info = osd_stat.find(p->first);
902 if (osd_info != osd_stat.end()) {
903 if (osd_info->second.kb == 0 || p->second == 0) {
904 // osd must be out, hence its stats have been zeroed
905 // (unless we somehow managed to have a disk with size 0...)
906 //
907 // (p->second == 0), if osd weight is 0, no need to
908 // calculate proj below.
909 continue;
910 }
911 double unusable = (double)osd_info->second.kb *
912 (1.0 - fratio);
913 double avail = MAX(0.0, (double)osd_info->second.kb_avail - unusable);
914 avail *= 1024.0;
915 int64_t proj = (int64_t)(avail / (double)p->second);
916 if (min < 0 || proj < min) {
917 min = proj;
918 }
919 } else {
94b18763
FG
920 if (osdmap.is_up(p->first)) {
921 // This is a level 4 rather than an error, because we might have
922 // only just started, and not received the first stats message yet.
923 dout(4) << "OSD " << p->first << " is up, but has no stats" << dendl;
924 }
31f18b77
FG
925 }
926 }
927 return min;
928}
929
930void PGMap::get_rules_avail(const OSDMap& osdmap,
931 std::map<int,int64_t> *avail_map) const
932{
933 avail_map->clear();
934 for (auto p : osdmap.get_pools()) {
935 int64_t pool_id = p.first;
936 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
937 continue;
938 const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
939 int ruleno = osdmap.crush->find_rule(pool->get_crush_rule(),
940 pool->get_type(),
941 pool->get_size());
942 if (avail_map->count(ruleno) == 0)
943 (*avail_map)[ruleno] = get_rule_avail(osdmap, ruleno);
944 }
945}
946
947// ---------------------
948// PGMap
949
950void PGMap::Incremental::encode(bufferlist &bl, uint64_t features) const
951{
952 if ((features & CEPH_FEATURE_MONENC) == 0) {
953 __u8 v = 4;
7c673cae
FG
954 ::encode(v, bl);
955 ::encode(version, bl);
31f18b77
FG
956 ::encode(pg_stat_updates, bl);
957 ::encode(osd_stat_updates, bl);
958 ::encode(osd_stat_rm, bl);
959 ::encode(osdmap_epoch, bl);
960 ::encode(pg_scan, bl);
7c673cae
FG
961 ::encode(full_ratio, bl);
962 ::encode(nearfull_ratio, bl);
31f18b77 963 ::encode(pg_remove, bl);
7c673cae
FG
964 return;
965 }
966
31f18b77 967 ENCODE_START(7, 5, bl);
7c673cae 968 ::encode(version, bl);
31f18b77
FG
969 ::encode(pg_stat_updates, bl);
970 ::encode(osd_stat_updates, bl);
971 ::encode(osd_stat_rm, bl);
972 ::encode(osdmap_epoch, bl);
973 ::encode(pg_scan, bl);
7c673cae
FG
974 ::encode(full_ratio, bl);
975 ::encode(nearfull_ratio, bl);
31f18b77 976 ::encode(pg_remove, bl);
7c673cae
FG
977 ::encode(stamp, bl);
978 ::encode(osd_epochs, bl);
979 ENCODE_FINISH(bl);
980}
981
31f18b77 982void PGMap::Incremental::decode(bufferlist::iterator &bl)
7c673cae 983{
31f18b77 984 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
7c673cae
FG
985 ::decode(version, bl);
986 if (struct_v < 3) {
31f18b77 987 pg_stat_updates.clear();
7c673cae
FG
988 __u32 n;
989 ::decode(n, bl);
990 while (n--) {
991 old_pg_t opgid;
992 ::decode(opgid, bl);
993 pg_t pgid = opgid;
31f18b77 994 ::decode(pg_stat_updates[pgid], bl);
7c673cae
FG
995 }
996 } else {
31f18b77 997 ::decode(pg_stat_updates, bl);
7c673cae 998 }
31f18b77
FG
999 ::decode(osd_stat_updates, bl);
1000 ::decode(osd_stat_rm, bl);
1001 ::decode(osdmap_epoch, bl);
1002 ::decode(pg_scan, bl);
7c673cae
FG
1003 if (struct_v >= 2) {
1004 ::decode(full_ratio, bl);
1005 ::decode(nearfull_ratio, bl);
1006 }
31f18b77
FG
1007 if (struct_v < 3) {
1008 pg_remove.clear();
1009 __u32 n;
1010 ::decode(n, bl);
1011 while (n--) {
1012 old_pg_t opgid;
1013 ::decode(opgid, bl);
1014 pg_remove.insert(pg_t(opgid));
1015 }
1016 } else {
1017 ::decode(pg_remove, bl);
1018 }
1019 if (struct_v < 4 && full_ratio == 0) {
1020 full_ratio = -1;
1021 }
1022 if (struct_v < 4 && nearfull_ratio == 0) {
1023 nearfull_ratio = -1;
1024 }
1025 if (struct_v >= 6)
7c673cae 1026 ::decode(stamp, bl);
31f18b77 1027 if (struct_v >= 7) {
7c673cae
FG
1028 ::decode(osd_epochs, bl);
1029 } else {
31f18b77
FG
1030 for (auto i = osd_stat_updates.begin();
1031 i != osd_stat_updates.end();
7c673cae
FG
1032 ++i) {
1033 // This isn't accurate, but will cause trimming to behave like
1034 // previously.
31f18b77 1035 osd_epochs.insert(make_pair(i->first, osdmap_epoch));
7c673cae
FG
1036 }
1037 }
1038 DECODE_FINISH(bl);
7c673cae
FG
1039}
1040
31f18b77 1041void PGMap::Incremental::dump(Formatter *f) const
7c673cae
FG
1042{
1043 f->dump_unsigned("version", version);
1044 f->dump_stream("stamp") << stamp;
31f18b77
FG
1045 f->dump_unsigned("osdmap_epoch", osdmap_epoch);
1046 f->dump_unsigned("pg_scan_epoch", pg_scan);
7c673cae 1047 f->dump_float("full_ratio", full_ratio);
31f18b77 1048 f->dump_float("nearfull_ratio", nearfull_ratio);
7c673cae 1049
31f18b77
FG
1050 f->open_array_section("pg_stat_updates");
1051 for (auto p = pg_stat_updates.begin(); p != pg_stat_updates.end(); ++p) {
1052 f->open_object_section("pg_stat");
1053 f->dump_stream("pgid") << p->first;
1054 p->second.dump(f);
1055 f->close_section();
1056 }
7c673cae
FG
1057 f->close_section();
1058
31f18b77
FG
1059 f->open_array_section("osd_stat_updates");
1060 for (auto p = osd_stat_updates.begin(); p != osd_stat_updates.end(); ++p) {
1061 f->open_object_section("osd_stat");
1062 f->dump_int("osd", p->first);
1063 p->second.dump(f);
7c673cae
FG
1064 f->close_section();
1065 }
1066 f->close_section();
1067
31f18b77
FG
1068 f->open_array_section("osd_stat_removals");
1069 for (auto p = osd_stat_rm.begin(); p != osd_stat_rm.end(); ++p)
1070 f->dump_int("osd", *p);
7c673cae 1071 f->close_section();
7c673cae 1072
31f18b77
FG
1073 f->open_array_section("pg_removals");
1074 for (auto p = pg_remove.begin(); p != pg_remove.end(); ++p)
1075 f->dump_stream("pgid") << *p;
7c673cae
FG
1076 f->close_section();
1077}
1078
31f18b77 1079void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
7c673cae 1080{
31f18b77
FG
1081 o.push_back(new Incremental);
1082 o.push_back(new Incremental);
1083 o.back()->version = 1;
1084 o.back()->stamp = utime_t(123,345);
1085 o.push_back(new Incremental);
1086 o.back()->version = 2;
1087 o.back()->pg_stat_updates[pg_t(1,2,3)] = pg_stat_t();
1088 o.back()->osd_stat_updates[5] = osd_stat_t();
1089 o.back()->osd_epochs[5] = 12;
1090 o.push_back(new Incremental);
1091 o.back()->version = 3;
1092 o.back()->osdmap_epoch = 1;
1093 o.back()->pg_scan = 2;
1094 o.back()->full_ratio = .2;
1095 o.back()->nearfull_ratio = .3;
1096 o.back()->pg_stat_updates[pg_t(4,5,6)] = pg_stat_t();
1097 o.back()->osd_stat_updates[6] = osd_stat_t();
1098 o.back()->osd_epochs[6] = 12;
1099 o.back()->pg_remove.insert(pg_t(1,2,3));
1100 o.back()->osd_stat_rm.insert(5);
7c673cae
FG
1101}
1102
7c673cae 1103
31f18b77
FG
1104// --
1105
1106void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
7c673cae 1107{
31f18b77
FG
1108 assert(inc.version == version+1);
1109 version++;
7c673cae 1110
31f18b77
FG
1111 pool_stat_t pg_sum_old = pg_sum;
1112 mempool::pgmap::unordered_map<uint64_t, pool_stat_t> pg_pool_sum_old;
1113
1114 bool ratios_changed = false;
1115 if (inc.full_ratio != full_ratio && inc.full_ratio != -1) {
1116 full_ratio = inc.full_ratio;
1117 ratios_changed = true;
7c673cae 1118 }
31f18b77
FG
1119 if (inc.nearfull_ratio != nearfull_ratio && inc.nearfull_ratio != -1) {
1120 nearfull_ratio = inc.nearfull_ratio;
1121 ratios_changed = true;
7c673cae 1122 }
31f18b77
FG
1123 if (ratios_changed)
1124 redo_full_sets();
7c673cae 1125
31f18b77
FG
1126 for (auto p = inc.pg_stat_updates.begin();
1127 p != inc.pg_stat_updates.end();
1128 ++p) {
1129 const pg_t &update_pg(p->first);
1130 const pg_stat_t &update_stat(p->second);
7c673cae 1131
31f18b77
FG
1132 if (pg_pool_sum_old.count(update_pg.pool()) == 0)
1133 pg_pool_sum_old[update_pg.pool()] = pg_pool_sum[update_pg.pool()];
1134
1135 auto t = pg_stat.find(update_pg);
1136 if (t == pg_stat.end()) {
1137 pg_stat.insert(make_pair(update_pg, update_stat));
1138 } else {
1139 stat_pg_sub(update_pg, t->second);
1140 t->second = update_stat;
7c673cae 1141 }
31f18b77 1142 stat_pg_add(update_pg, update_stat);
7c673cae 1143 }
31f18b77
FG
1144 assert(osd_stat.size() == osd_epochs.size());
1145 for (auto p = inc.get_osd_stat_updates().begin();
1146 p != inc.get_osd_stat_updates().end();
1147 ++p) {
1148 int osd = p->first;
1149 const osd_stat_t &new_stats(p->second);
7c673cae 1150
31f18b77
FG
1151 auto t = osd_stat.find(osd);
1152 if (t == osd_stat.end()) {
1153 osd_stat.insert(make_pair(osd, new_stats));
1154 } else {
1155 stat_osd_sub(t->first, t->second);
1156 t->second = new_stats;
1157 }
1158 auto i = osd_epochs.find(osd);
1159 auto j = inc.get_osd_epochs().find(osd);
1160 assert(j != inc.get_osd_epochs().end());
7c673cae 1161
31f18b77
FG
1162 if (i == osd_epochs.end())
1163 osd_epochs.insert(*j);
1164 else
1165 i->second = j->second;
7c673cae 1166
31f18b77 1167 stat_osd_add(osd, new_stats);
7c673cae 1168
31f18b77
FG
1169 // adjust [near]full status
1170 register_nearfull_status(osd, new_stats);
1171 }
1172 set<int64_t> deleted_pools;
1173 for (auto p = inc.pg_remove.begin();
1174 p != inc.pg_remove.end();
1175 ++p) {
1176 const pg_t &removed_pg(*p);
1177 auto s = pg_stat.find(removed_pg);
1178 if (s != pg_stat.end()) {
1179 stat_pg_sub(removed_pg, s->second);
1180 pg_stat.erase(s);
1181 }
1182 deleted_pools.insert(removed_pg.pool());
7c673cae
FG
1183 }
1184
31f18b77
FG
1185 for (auto p = inc.get_osd_stat_rm().begin();
1186 p != inc.get_osd_stat_rm().end();
7c673cae 1187 ++p) {
31f18b77
FG
1188 auto t = osd_stat.find(*p);
1189 if (t != osd_stat.end()) {
1190 stat_osd_sub(t->first, t->second);
1191 osd_stat.erase(t);
1192 osd_epochs.erase(*p);
1193 }
1194
1195 // remove these old osds from full/nearfull set(s), too
1196 nearfull_osds.erase(*p);
1197 full_osds.erase(*p);
7c673cae
FG
1198 }
1199
b32b8144
FG
1200 // skip calculating delta while sum was not synchronized
1201 if (!stamp.is_zero() && !pg_sum_old.stats.sum.is_zero()) {
1202 utime_t delta_t;
1203 delta_t = inc.stamp;
1204 delta_t -= stamp;
1205 // calculate a delta, and average over the last 2 deltas.
1206 pool_stat_t d = pg_sum;
1207 d.stats.sub(pg_sum_old.stats);
1208 pg_sum_deltas.push_back(make_pair(d, delta_t));
1209 stamp_delta += delta_t;
1210 pg_sum_delta.stats.add(d.stats);
1211 auto smooth_intervals =
1212 cct ? cct->_conf->get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
1213 if (pg_sum_deltas.size() > smooth_intervals) {
1214 pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats);
1215 stamp_delta -= pg_sum_deltas.front().second;
1216 pg_sum_deltas.pop_front();
1217 }
31f18b77 1218 }
b32b8144 1219 stamp = inc.stamp;
7c673cae 1220
31f18b77 1221 update_pool_deltas(cct, inc.stamp, pg_pool_sum_old);
7c673cae 1222
31f18b77
FG
1223 for (auto p : deleted_pools) {
1224 if (cct)
1225 dout(20) << " deleted pool " << p << dendl;
1226 deleted_pool(p);
1227 }
7c673cae 1228
31f18b77
FG
1229 if (inc.osdmap_epoch)
1230 last_osdmap_epoch = inc.osdmap_epoch;
1231 if (inc.pg_scan)
1232 last_pg_scan = inc.pg_scan;
1233
1234 min_last_epoch_clean = 0; // invalidate
7c673cae
FG
1235}
1236
31f18b77 1237void PGMap::redo_full_sets()
7c673cae 1238{
31f18b77
FG
1239 full_osds.clear();
1240 nearfull_osds.clear();
1241 for (auto i = osd_stat.begin();
1242 i != osd_stat.end();
1243 ++i) {
1244 register_nearfull_status(i->first, i->second);
7c673cae 1245 }
31f18b77 1246}
7c673cae 1247
31f18b77
FG
1248void PGMap::register_nearfull_status(int osd, const osd_stat_t& s)
1249{
1250 float ratio = ((float)s.kb_used) / ((float)s.kb);
7c673cae 1251
31f18b77
FG
1252 if (full_ratio > 0 && ratio > full_ratio) {
1253 // full
1254 full_osds.insert(osd);
1255 nearfull_osds.erase(osd);
1256 } else if (nearfull_ratio > 0 && ratio > nearfull_ratio) {
1257 // nearfull
1258 full_osds.erase(osd);
1259 nearfull_osds.insert(osd);
1260 } else {
1261 // ok
1262 full_osds.erase(osd);
1263 nearfull_osds.erase(osd);
1264 }
7c673cae
FG
1265}
1266
31f18b77 1267void PGMap::calc_stats()
7c673cae 1268{
31f18b77
FG
1269 num_pg = 0;
1270 num_pg_active = 0;
1271 num_pg_unknown = 0;
1272 num_osd = 0;
1273 pg_pool_sum.clear();
1274 num_pg_by_pool.clear();
1275 pg_by_osd.clear();
1276 pg_sum = pool_stat_t();
1277 osd_sum = osd_stat_t();
1278 num_pg_by_state.clear();
1279 num_pg_by_osd.clear();
7c673cae 1280
31f18b77
FG
1281 for (auto p = pg_stat.begin();
1282 p != pg_stat.end();
1283 ++p) {
1284 stat_pg_add(p->first, p->second);
1285 }
1286 for (auto p = osd_stat.begin();
1287 p != osd_stat.end();
1288 ++p)
1289 stat_osd_add(p->first, p->second);
7c673cae 1290
31f18b77 1291 redo_full_sets();
7c673cae 1292
31f18b77 1293 min_last_epoch_clean = calc_min_last_epoch_clean();
7c673cae
FG
1294}
1295
31f18b77 1296void PGMap::update_pg(pg_t pgid, bufferlist& bl)
7c673cae 1297{
31f18b77
FG
1298 bufferlist::iterator p = bl.begin();
1299 auto s = pg_stat.find(pgid);
1300 epoch_t old_lec = 0, lec;
1301 if (s != pg_stat.end()) {
1302 old_lec = s->second.get_effective_last_epoch_clean();
1303 stat_pg_update(pgid, s->second, p);
1304 lec = s->second.get_effective_last_epoch_clean();
1305 } else {
1306 pg_stat_t& r = pg_stat[pgid];
1307 ::decode(r, p);
1308 stat_pg_add(pgid, r);
1309 lec = r.get_effective_last_epoch_clean();
1310 }
7c673cae 1311
31f18b77
FG
1312 if (min_last_epoch_clean &&
1313 (lec < min_last_epoch_clean || // we did
1314 (lec > min_last_epoch_clean && // we might
1315 old_lec == min_last_epoch_clean)
1316 ))
1317 min_last_epoch_clean = 0;
1318}
7c673cae 1319
31f18b77
FG
1320void PGMap::remove_pg(pg_t pgid)
1321{
1322 auto s = pg_stat.find(pgid);
1323 if (s != pg_stat.end()) {
1324 if (min_last_epoch_clean &&
1325 s->second.get_effective_last_epoch_clean() == min_last_epoch_clean)
1326 min_last_epoch_clean = 0;
1327 stat_pg_sub(pgid, s->second);
1328 pg_stat.erase(s);
7c673cae
FG
1329 }
1330}
1331
31f18b77 1332void PGMap::update_osd(int osd, bufferlist& bl)
7c673cae 1333{
31f18b77
FG
1334 bufferlist::iterator p = bl.begin();
1335 auto o = osd_stat.find(osd);
1336 epoch_t old_lec = 0;
1337 if (o != osd_stat.end()) {
1338 auto i = osd_epochs.find(osd);
1339 if (i != osd_epochs.end())
1340 old_lec = i->second;
1341 stat_osd_sub(osd, o->second);
1342 }
1343 osd_stat_t& r = osd_stat[osd];
1344 ::decode(r, p);
1345 stat_osd_add(osd, r);
7c673cae 1346
31f18b77
FG
1347 // adjust [near]full status
1348 register_nearfull_status(osd, r);
1349
1350 // epoch?
1351 if (!p.end()) {
1352 epoch_t e;
1353 ::decode(e, p);
1354
1355 if (e < min_last_epoch_clean ||
1356 (e > min_last_epoch_clean &&
1357 old_lec == min_last_epoch_clean))
1358 min_last_epoch_clean = 0;
1359 } else {
1360 // WARNING: we are not refreshing min_last_epoch_clean! must be old store
1361 // or old mon running.
7c673cae 1362 }
7c673cae
FG
1363}
1364
31f18b77 1365void PGMap::remove_osd(int osd)
7c673cae 1366{
31f18b77
FG
1367 auto o = osd_stat.find(osd);
1368 if (o != osd_stat.end()) {
1369 stat_osd_sub(osd, o->second);
1370 osd_stat.erase(o);
1371
1372 // remove these old osds from full/nearfull set(s), too
1373 nearfull_osds.erase(osd);
1374 full_osds.erase(osd);
7c673cae 1375 }
7c673cae
FG
1376}
1377
31f18b77
FG
1378void PGMap::stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
1379 bool sameosds)
7c673cae 1380{
31f18b77
FG
1381 pg_pool_sum[pgid.pool()].add(s);
1382 pg_sum.add(s);
7c673cae 1383
31f18b77
FG
1384 num_pg++;
1385 num_pg_by_state[s.state]++;
1386 num_pg_by_pool[pgid.pool()]++;
7c673cae 1387
31f18b77
FG
1388 if ((s.state & PG_STATE_CREATING) &&
1389 s.parent_split_bits == 0) {
1390 creating_pgs.insert(pgid);
1391 if (s.acting_primary >= 0) {
1392 creating_pgs_by_osd_epoch[s.acting_primary][s.mapping_epoch].insert(pgid);
7c673cae
FG
1393 }
1394 }
1395
31f18b77
FG
1396 if (s.state & PG_STATE_ACTIVE) {
1397 ++num_pg_active;
1398 }
1399 if (s.state == 0) {
1400 ++num_pg_unknown;
7c673cae
FG
1401 }
1402
31f18b77
FG
1403 if (sameosds)
1404 return;
7c673cae 1405
31f18b77
FG
1406 for (auto p = s.blocked_by.begin();
1407 p != s.blocked_by.end();
1408 ++p) {
1409 ++blocked_by_sum[*p];
7c673cae 1410 }
31f18b77
FG
1411
1412 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
1413 pg_by_osd[*p].insert(pgid);
1414 num_pg_by_osd[*p].acting++;
1415 }
1416 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
1417 pg_by_osd[*p].insert(pgid);
1418 num_pg_by_osd[*p].up++;
7c673cae 1419 }
7c673cae 1420
31f18b77
FG
1421 if (s.up_primary >= 0) {
1422 num_pg_by_osd[s.up_primary].primary++;
7c673cae 1423 }
7c673cae 1424}
31f18b77
FG
1425
1426void PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
1427 bool sameosds)
7c673cae 1428{
31f18b77
FG
1429 pool_stat_t& ps = pg_pool_sum[pgid.pool()];
1430 ps.sub(s);
1431 pg_sum.sub(s);
1432
1433 num_pg--;
1434 int end = --num_pg_by_state[s.state];
1435 assert(end >= 0);
1436 if (end == 0)
1437 num_pg_by_state.erase(s.state);
1438 end = --num_pg_by_pool[pgid.pool()];
1439 if (end == 0) {
1440 num_pg_by_pool.erase(pgid.pool());
1441 pg_pool_sum.erase(pgid.pool());
7c673cae 1442 }
7c673cae 1443
31f18b77
FG
1444 if ((s.state & PG_STATE_CREATING) &&
1445 s.parent_split_bits == 0) {
1446 creating_pgs.erase(pgid);
1447 if (s.acting_primary >= 0) {
1448 map<epoch_t,set<pg_t> >& r = creating_pgs_by_osd_epoch[s.acting_primary];
1449 r[s.mapping_epoch].erase(pgid);
1450 if (r[s.mapping_epoch].empty())
1451 r.erase(s.mapping_epoch);
1452 if (r.empty())
1453 creating_pgs_by_osd_epoch.erase(s.acting_primary);
7c673cae
FG
1454 }
1455 }
31f18b77
FG
1456
1457 if (s.state & PG_STATE_ACTIVE) {
1458 --num_pg_active;
1459 }
1460 if (s.state == 0) {
1461 --num_pg_unknown;
1462 }
1463
1464 if (sameosds)
1465 return;
1466
1467 for (auto p = s.blocked_by.begin();
1468 p != s.blocked_by.end();
1469 ++p) {
1470 auto q = blocked_by_sum.find(*p);
1471 assert(q != blocked_by_sum.end());
1472 --q->second;
1473 if (q->second == 0)
1474 blocked_by_sum.erase(q);
1475 }
1476
1477 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
1478 auto& oset = pg_by_osd[*p];
1479 oset.erase(pgid);
1480 if (oset.empty())
1481 pg_by_osd.erase(*p);
1482 auto it = num_pg_by_osd.find(*p);
1483 if (it != num_pg_by_osd.end() && it->second.acting > 0)
1484 it->second.acting--;
1485 }
1486 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
1487 auto& oset = pg_by_osd[*p];
1488 oset.erase(pgid);
1489 if (oset.empty())
1490 pg_by_osd.erase(*p);
1491 auto it = num_pg_by_osd.find(*p);
1492 if (it != num_pg_by_osd.end() && it->second.up > 0)
1493 it->second.up--;
1494 }
1495
1496 if (s.up_primary >= 0) {
1497 auto it = num_pg_by_osd.find(s.up_primary);
1498 if (it != num_pg_by_osd.end() && it->second.primary > 0)
1499 it->second.primary--;
1500 }
1501}
1502
1503void PGMap::stat_pg_update(const pg_t pgid, pg_stat_t& s,
1504 bufferlist::iterator& blp)
1505{
1506 pg_stat_t n;
1507 ::decode(n, blp);
1508
1509 bool sameosds =
1510 s.acting == n.acting &&
1511 s.up == n.up &&
1512 s.blocked_by == n.blocked_by;
1513
1514 stat_pg_sub(pgid, s, sameosds);
1515
1516 // if acting_primary has shift to an just restored osd, and pg yet to finish
1517 // peering, many attributes in current stats remain stale. others seem don't
1518 // mater much while faulty last_active will make "pg stuck in" check unhappy.
1519 if (!(n.state & (PG_STATE_ACTIVE | PG_STATE_PEERED)) &&
1520 n.last_active < s.last_active)
1521 n.last_active = s.last_active;
1522 s = n;
1523 stat_pg_add(pgid, n, sameosds);
1524}
1525
1526void PGMap::stat_osd_add(int osd, const osd_stat_t &s)
1527{
1528 num_osd++;
1529 osd_sum.add(s);
1530 if (osd >= (int)osd_last_seq.size()) {
1531 osd_last_seq.resize(osd + 1);
1532 }
1533 osd_last_seq[osd] = s.seq;
1534}
1535
1536void PGMap::stat_osd_sub(int osd, const osd_stat_t &s)
1537{
1538 num_osd--;
1539 osd_sum.sub(s);
1540 assert(osd < (int)osd_last_seq.size());
1541 osd_last_seq[osd] = 0;
1542}
1543
1544epoch_t PGMap::calc_min_last_epoch_clean() const
1545{
1546 if (pg_stat.empty())
1547 return 0;
1548
1549 auto p = pg_stat.begin();
1550 epoch_t min = p->second.get_effective_last_epoch_clean();
1551 for (++p; p != pg_stat.end(); ++p) {
1552 epoch_t lec = p->second.get_effective_last_epoch_clean();
1553 if (lec < min)
1554 min = lec;
1555 }
1556 // also scan osd epochs
1557 // don't trim past the oldest reported osd epoch
1558 for (auto i = osd_epochs.begin();
1559 i != osd_epochs.end();
1560 ++i) {
1561 if (i->second < min)
1562 min = i->second;
1563 }
1564 return min;
1565}
1566
1567void PGMap::encode_digest(const OSDMap& osdmap,
1568 bufferlist& bl, uint64_t features) const
1569{
1570 get_rules_avail(osdmap, &avail_space_by_rule);
1571 PGMapDigest::encode(bl, features);
1572}
1573
1574void PGMap::encode(bufferlist &bl, uint64_t features) const
1575{
1576 if ((features & CEPH_FEATURE_MONENC) == 0) {
1577 __u8 v = 3;
1578 ::encode(v, bl);
1579 ::encode(version, bl);
1580 ::encode(pg_stat, bl);
1581 ::encode(osd_stat, bl);
1582 ::encode(last_osdmap_epoch, bl);
1583 ::encode(last_pg_scan, bl);
1584 ::encode(full_ratio, bl);
1585 ::encode(nearfull_ratio, bl);
1586 return;
1587 }
1588
1589 ENCODE_START(6, 4, bl);
1590 ::encode(version, bl);
1591 ::encode(pg_stat, bl);
1592 ::encode(osd_stat, bl);
1593 ::encode(last_osdmap_epoch, bl);
1594 ::encode(last_pg_scan, bl);
1595 ::encode(full_ratio, bl);
1596 ::encode(nearfull_ratio, bl);
1597 ::encode(stamp, bl);
1598 ::encode(osd_epochs, bl);
1599 ENCODE_FINISH(bl);
1600}
1601
1602void PGMap::decode(bufferlist::iterator &bl)
1603{
1604 DECODE_START_LEGACY_COMPAT_LEN(6, 4, 4, bl);
1605 ::decode(version, bl);
1606 if (struct_v < 3) {
1607 pg_stat.clear();
1608 __u32 n;
1609 ::decode(n, bl);
1610 while (n--) {
1611 old_pg_t opgid;
1612 ::decode(opgid, bl);
1613 pg_t pgid = opgid;
1614 ::decode(pg_stat[pgid], bl);
7c673cae 1615 }
31f18b77
FG
1616 } else {
1617 ::decode(pg_stat, bl);
7c673cae 1618 }
31f18b77
FG
1619 ::decode(osd_stat, bl);
1620 ::decode(last_osdmap_epoch, bl);
1621 ::decode(last_pg_scan, bl);
1622 if (struct_v >= 2) {
1623 ::decode(full_ratio, bl);
1624 ::decode(nearfull_ratio, bl);
1625 }
1626 if (struct_v >= 5)
1627 ::decode(stamp, bl);
1628 if (struct_v >= 6) {
1629 ::decode(osd_epochs, bl);
1630 } else {
1631 for (auto i = osd_stat.begin();
1632 i != osd_stat.end();
1633 ++i) {
1634 // This isn't accurate, but will cause trimming to behave like
1635 // previously.
1636 osd_epochs.insert(make_pair(i->first, last_osdmap_epoch));
7c673cae
FG
1637 }
1638 }
31f18b77
FG
1639 DECODE_FINISH(bl);
1640
1641 calc_stats();
7c673cae
FG
1642}
1643
31f18b77 1644void PGMap::dirty_all(Incremental& inc)
7c673cae 1645{
31f18b77
FG
1646 inc.osdmap_epoch = last_osdmap_epoch;
1647 inc.pg_scan = last_pg_scan;
1648 inc.full_ratio = full_ratio;
1649 inc.nearfull_ratio = nearfull_ratio;
1650
1651 for (auto p = pg_stat.begin(); p != pg_stat.end(); ++p) {
1652 inc.pg_stat_updates[p->first] = p->second;
1653 }
1654 for (auto p = osd_stat.begin(); p != osd_stat.end(); ++p) {
1655 assert(osd_epochs.count(p->first));
1656 inc.update_stat(p->first,
1657 inc.get_osd_epochs().find(p->first)->second,
1658 p->second);
1659 }
1660}
1661
1662void PGMap::dump(Formatter *f) const
1663{
1664 dump_basic(f);
1665 dump_pg_stats(f, false);
1666 dump_pool_stats(f);
1667 dump_osd_stats(f);
1668}
1669
1670void PGMap::dump_basic(Formatter *f) const
1671{
1672 f->dump_unsigned("version", version);
1673 f->dump_stream("stamp") << stamp;
1674 f->dump_unsigned("last_osdmap_epoch", last_osdmap_epoch);
1675 f->dump_unsigned("last_pg_scan", last_pg_scan);
1676 f->dump_unsigned("min_last_epoch_clean", min_last_epoch_clean);
1677 f->dump_float("full_ratio", full_ratio);
1678 f->dump_float("near_full_ratio", nearfull_ratio);
1679
1680 f->open_object_section("pg_stats_sum");
1681 pg_sum.dump(f);
1682 f->close_section();
1683
1684 f->open_object_section("osd_stats_sum");
1685 osd_sum.dump(f);
1686 f->close_section();
1687
1688 f->open_array_section("osd_epochs");
1689 for (auto p = osd_epochs.begin(); p != osd_epochs.end(); ++p) {
1690 f->open_object_section("osd");
1691 f->dump_unsigned("osd", p->first);
1692 f->dump_unsigned("epoch", p->second);
1693 f->close_section();
1694 }
1695 f->close_section();
1696
1697 dump_delta(f);
1698}
1699
1700void PGMap::dump_delta(Formatter *f) const
1701{
1702 f->open_object_section("pg_stats_delta");
1703 pg_sum_delta.dump(f);
1704 f->close_section();
1705}
1706
1707void PGMap::dump_pg_stats(Formatter *f, bool brief) const
1708{
1709 f->open_array_section("pg_stats");
1710 for (auto i = pg_stat.begin();
1711 i != pg_stat.end();
1712 ++i) {
1713 f->open_object_section("pg_stat");
1714 f->dump_stream("pgid") << i->first;
1715 if (brief)
1716 i->second.dump_brief(f);
1717 else
1718 i->second.dump(f);
1719 f->close_section();
1720 }
1721 f->close_section();
1722}
1723
1724void PGMap::dump_pool_stats(Formatter *f) const
1725{
1726 f->open_array_section("pool_stats");
1727 for (auto p = pg_pool_sum.begin();
1728 p != pg_pool_sum.end();
1729 ++p) {
1730 f->open_object_section("pool_stat");
1731 f->dump_int("poolid", p->first);
1732 auto q = num_pg_by_pool.find(p->first);
1733 if (q != num_pg_by_pool.end())
1734 f->dump_unsigned("num_pg", q->second);
1735 p->second.dump(f);
1736 f->close_section();
1737 }
1738 f->close_section();
1739}
1740
1741void PGMap::dump_osd_stats(Formatter *f) const
1742{
1743 f->open_array_section("osd_stats");
1744 for (auto q = osd_stat.begin();
1745 q != osd_stat.end();
1746 ++q) {
1747 f->open_object_section("osd_stat");
1748 f->dump_int("osd", q->first);
1749 q->second.dump(f);
1750 f->close_section();
1751 }
1752 f->close_section();
1753}
1754
1755void PGMap::dump_pg_stats_plain(
1756 ostream& ss,
1757 const mempool::pgmap::unordered_map<pg_t, pg_stat_t>& pg_stats,
1758 bool brief) const
1759{
1760 TextTable tab;
1761
1762 if (brief){
1763 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1764 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1765 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1766 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1767 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1768 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1769 }
1770 else {
1771 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1772 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1773 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1774 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1775 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1776 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1777 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1778 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1779 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1780 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1781 tab.define_column("STATE_STAMP", TextTable::LEFT, TextTable::RIGHT);
1782 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
1783 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
1784 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1785 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1786 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1787 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1788 tab.define_column("LAST_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1789 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1790 tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1791 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
b32b8144 1792 tab.define_column("SNAPTRIMQ_LEN", TextTable::LEFT, TextTable::RIGHT);
31f18b77
FG
1793 }
1794
1795 for (auto i = pg_stats.begin();
1796 i != pg_stats.end(); ++i) {
1797 const pg_stat_t &st(i->second);
1798 if (brief) {
1799 tab << i->first
1800 << pg_state_string(st.state)
1801 << st.up
1802 << st.up_primary
1803 << st.acting
1804 << st.acting_primary
1805 << TextTable::endrow;
7c673cae 1806 } else {
31f18b77
FG
1807 ostringstream reported;
1808 reported << st.reported_epoch << ":" << st.reported_seq;
1809
1810 tab << i->first
1811 << st.stats.sum.num_objects
1812 << st.stats.sum.num_objects_missing_on_primary
1813 << st.stats.sum.num_objects_degraded
1814 << st.stats.sum.num_objects_misplaced
1815 << st.stats.sum.num_objects_unfound
1816 << st.stats.sum.num_bytes
1817 << st.log_size
1818 << st.ondisk_log_size
1819 << pg_state_string(st.state)
1820 << st.last_change
1821 << st.version
1822 << reported.str()
1823 << pg_vector_string(st.up)
1824 << st.up_primary
1825 << pg_vector_string(st.acting)
1826 << st.acting_primary
1827 << st.last_scrub
1828 << st.last_scrub_stamp
1829 << st.last_deep_scrub
1830 << st.last_deep_scrub_stamp
b32b8144 1831 << st.snaptrimq_len
31f18b77 1832 << TextTable::endrow;
7c673cae
FG
1833 }
1834 }
7c673cae 1835
31f18b77
FG
1836 ss << tab;
1837}
1838
1839void PGMap::dump(ostream& ss) const
1840{
1841 dump_basic(ss);
1842 dump_pg_stats(ss, false);
1843 dump_pool_stats(ss, false);
1844 dump_pg_sum_stats(ss, false);
1845 dump_osd_stats(ss);
1846}
1847
1848void PGMap::dump_basic(ostream& ss) const
1849{
1850 ss << "version " << version << std::endl;
1851 ss << "stamp " << stamp << std::endl;
1852 ss << "last_osdmap_epoch " << last_osdmap_epoch << std::endl;
1853 ss << "last_pg_scan " << last_pg_scan << std::endl;
1854 ss << "full_ratio " << full_ratio << std::endl;
1855 ss << "nearfull_ratio " << nearfull_ratio << std::endl;
1856}
1857
1858void PGMap::dump_pg_stats(ostream& ss, bool brief) const
1859{
1860 dump_pg_stats_plain(ss, pg_stat, brief);
1861}
1862
1863void PGMap::dump_pool_stats(ostream& ss, bool header) const
1864{
1865 TextTable tab;
1866
1867 if (header) {
1868 tab.define_column("POOLID", TextTable::LEFT, TextTable::LEFT);
1869 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1870 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1871 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1872 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1873 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1874 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1875 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1876 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1877 } else {
1878 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1879 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1880 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1881 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1882 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1883 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1884 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1885 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1886 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1887 }
1888
1889 for (auto p = pg_pool_sum.begin();
1890 p != pg_pool_sum.end();
1891 ++p) {
1892 tab << p->first
1893 << p->second.stats.sum.num_objects
1894 << p->second.stats.sum.num_objects_missing_on_primary
1895 << p->second.stats.sum.num_objects_degraded
1896 << p->second.stats.sum.num_objects_misplaced
1897 << p->second.stats.sum.num_objects_unfound
1898 << p->second.stats.sum.num_bytes
1899 << p->second.log_size
1900 << p->second.ondisk_log_size
1901 << TextTable::endrow;
1902 }
1903
1904 ss << tab;
1905}
1906
1907void PGMap::dump_pg_sum_stats(ostream& ss, bool header) const
1908{
1909 TextTable tab;
1910
1911 if (header) {
1912 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1913 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1914 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1915 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1916 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1917 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1918 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1919 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1920 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1921 } else {
1922 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1923 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1924 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1925 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1926 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1927 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1928 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1929 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1930 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1931 };
1932
1933 tab << "sum"
1934 << pg_sum.stats.sum.num_objects
1935 << pg_sum.stats.sum.num_objects_missing_on_primary
1936 << pg_sum.stats.sum.num_objects_degraded
1937 << pg_sum.stats.sum.num_objects_misplaced
1938 << pg_sum.stats.sum.num_objects_unfound
1939 << pg_sum.stats.sum.num_bytes
1940 << pg_sum.log_size
1941 << pg_sum.ondisk_log_size
1942 << TextTable::endrow;
1943
1944 ss << tab;
1945}
1946
1947void PGMap::dump_osd_stats(ostream& ss) const
1948{
1949 TextTable tab;
1950
1951 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1952 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1953 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
1954 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
1955 tab.define_column("HB_PEERS", TextTable::LEFT, TextTable::RIGHT);
1956 tab.define_column("PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1957 tab.define_column("PRIMARY_PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1958
1959 for (auto p = osd_stat.begin();
1960 p != osd_stat.end();
1961 ++p) {
1962 tab << p->first
1963 << si_t(p->second.kb_used << 10)
1964 << si_t(p->second.kb_avail << 10)
1965 << si_t(p->second.kb << 10)
1966 << p->second.hb_peers
1967 << get_num_pg_by_osd(p->first)
1968 << get_num_primary_pg_by_osd(p->first)
1969 << TextTable::endrow;
1970 }
1971
1972 tab << "sum"
1973 << si_t(osd_sum.kb_used << 10)
1974 << si_t(osd_sum.kb_avail << 10)
1975 << si_t(osd_sum.kb << 10)
1976 << TextTable::endrow;
7c673cae 1977
31f18b77 1978 ss << tab;
7c673cae
FG
1979}
1980
31f18b77 1981void PGMap::dump_osd_sum_stats(ostream& ss) const
7c673cae 1982{
31f18b77 1983 TextTable tab;
7c673cae 1984
31f18b77
FG
1985 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1986 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1987 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
1988 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
7c673cae 1989
31f18b77
FG
1990 tab << "sum"
1991 << si_t(osd_sum.kb_used << 10)
1992 << si_t(osd_sum.kb_avail << 10)
1993 << si_t(osd_sum.kb << 10)
1994 << TextTable::endrow;
7c673cae 1995
31f18b77 1996 ss << tab;
7c673cae
FG
1997}
1998
31f18b77
FG
1999void PGMap::get_stuck_stats(
2000 int types, const utime_t cutoff,
2001 mempool::pgmap::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const
7c673cae 2002{
31f18b77
FG
2003 assert(types != 0);
2004 for (auto i = pg_stat.begin();
2005 i != pg_stat.end();
2006 ++i) {
2007 utime_t val = cutoff; // don't care about >= cutoff so that is infinity
2008
2009 if ((types & STUCK_INACTIVE) && !(i->second.state & PG_STATE_ACTIVE)) {
2010 if (i->second.last_active < val)
2011 val = i->second.last_active;
7c673cae 2012 }
31f18b77
FG
2013
2014 if ((types & STUCK_UNCLEAN) && !(i->second.state & PG_STATE_CLEAN)) {
2015 if (i->second.last_clean < val)
2016 val = i->second.last_clean;
7c673cae 2017 }
31f18b77
FG
2018
2019 if ((types & STUCK_DEGRADED) && (i->second.state & PG_STATE_DEGRADED)) {
2020 if (i->second.last_undegraded < val)
2021 val = i->second.last_undegraded;
7c673cae 2022 }
7c673cae 2023
31f18b77
FG
2024 if ((types & STUCK_UNDERSIZED) && (i->second.state & PG_STATE_UNDERSIZED)) {
2025 if (i->second.last_fullsized < val)
2026 val = i->second.last_fullsized;
2027 }
7c673cae 2028
31f18b77
FG
2029 if ((types & STUCK_STALE) && (i->second.state & PG_STATE_STALE)) {
2030 if (i->second.last_unstale < val)
2031 val = i->second.last_unstale;
2032 }
7c673cae 2033
31f18b77
FG
2034 // val is now the earliest any of the requested stuck states began
2035 if (val < cutoff) {
2036 stuck_pgs[i->first] = i->second;
2037 }
2038 }
7c673cae
FG
2039}
2040
31f18b77 2041bool PGMap::get_stuck_counts(const utime_t cutoff, map<string, int>& note) const
7c673cae 2042{
31f18b77
FG
2043 int inactive = 0;
2044 int unclean = 0;
2045 int degraded = 0;
2046 int undersized = 0;
2047 int stale = 0;
7c673cae 2048
31f18b77
FG
2049 for (auto i = pg_stat.begin();
2050 i != pg_stat.end();
2051 ++i) {
2052 if (! (i->second.state & PG_STATE_ACTIVE)) {
2053 if (i->second.last_active < cutoff)
2054 ++inactive;
7c673cae 2055 }
31f18b77
FG
2056 if (! (i->second.state & PG_STATE_CLEAN)) {
2057 if (i->second.last_clean < cutoff)
2058 ++unclean;
7c673cae 2059 }
31f18b77
FG
2060 if (i->second.state & PG_STATE_DEGRADED) {
2061 if (i->second.last_undegraded < cutoff)
2062 ++degraded;
7c673cae 2063 }
31f18b77
FG
2064 if (i->second.state & PG_STATE_UNDERSIZED) {
2065 if (i->second.last_fullsized < cutoff)
2066 ++undersized;
7c673cae 2067 }
31f18b77
FG
2068 if (i->second.state & PG_STATE_STALE) {
2069 if (i->second.last_unstale < cutoff)
2070 ++stale;
7c673cae
FG
2071 }
2072 }
31f18b77
FG
2073
2074 if (inactive)
2075 note["stuck inactive"] = inactive;
2076
2077 if (unclean)
2078 note["stuck unclean"] = unclean;
2079
2080 if (undersized)
2081 note["stuck undersized"] = undersized;
2082
2083 if (degraded)
2084 note["stuck degraded"] = degraded;
2085
2086 if (stale)
2087 note["stuck stale"] = stale;
2088
2089 return inactive || unclean || undersized || degraded || stale;
2090}
2091
2092void PGMap::dump_stuck(Formatter *f, int types, utime_t cutoff) const
2093{
2094 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
2095 get_stuck_stats(types, cutoff, stuck_pg_stats);
2096 f->open_array_section("stuck_pg_stats");
2097 for (auto i = stuck_pg_stats.begin();
2098 i != stuck_pg_stats.end();
2099 ++i) {
2100 f->open_object_section("pg_stat");
2101 f->dump_stream("pgid") << i->first;
2102 i->second.dump(f);
2103 f->close_section();
2104 }
2105 f->close_section();
2106}
2107
2108void PGMap::dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const
2109{
2110 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
2111 get_stuck_stats(types, cutoff, stuck_pg_stats);
2112 if (!stuck_pg_stats.empty())
2113 dump_pg_stats_plain(ss, stuck_pg_stats, true);
2114}
2115
2116int PGMap::dump_stuck_pg_stats(
2117 stringstream &ds,
2118 Formatter *f,
2119 int threshold,
2120 vector<string>& args) const
2121{
2122 int stuck_types = 0;
2123
2124 for (auto i = args.begin(); i != args.end(); ++i) {
2125 if (*i == "inactive")
2126 stuck_types |= PGMap::STUCK_INACTIVE;
2127 else if (*i == "unclean")
2128 stuck_types |= PGMap::STUCK_UNCLEAN;
2129 else if (*i == "undersized")
2130 stuck_types |= PGMap::STUCK_UNDERSIZED;
2131 else if (*i == "degraded")
2132 stuck_types |= PGMap::STUCK_DEGRADED;
2133 else if (*i == "stale")
2134 stuck_types |= PGMap::STUCK_STALE;
2135 else {
2136 ds << "Unknown type: " << *i << std::endl;
2137 return -EINVAL;
7c673cae
FG
2138 }
2139 }
31f18b77
FG
2140
2141 utime_t now(ceph_clock_now());
2142 utime_t cutoff = now - utime_t(threshold, 0);
2143
2144 if (!f) {
2145 dump_stuck_plain(ds, stuck_types, cutoff);
2146 } else {
2147 dump_stuck(f, stuck_types, cutoff);
2148 f->flush(ds);
7c673cae 2149 }
31f18b77
FG
2150
2151 return 0;
7c673cae
FG
2152}
2153
31f18b77 2154void PGMap::dump_osd_perf_stats(Formatter *f) const
7c673cae 2155{
31f18b77
FG
2156 f->open_array_section("osd_perf_infos");
2157 for (auto i = osd_stat.begin();
2158 i != osd_stat.end();
2159 ++i) {
2160 f->open_object_section("osd");
2161 f->dump_int("id", i->first);
2162 {
2163 f->open_object_section("perf_stats");
2164 i->second.os_perf_stat.dump(f);
2165 f->close_section();
2166 }
2167 f->close_section();
2168 }
2169 f->close_section();
7c673cae 2170}
31f18b77 2171void PGMap::print_osd_perf_stats(std::ostream *ss) const
7c673cae 2172{
31f18b77
FG
2173 TextTable tab;
2174 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2175 tab.define_column("commit_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2176 tab.define_column("apply_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2177 for (auto i = osd_stat.begin();
2178 i != osd_stat.end();
2179 ++i) {
2180 tab << i->first;
2181 tab << i->second.os_perf_stat.os_commit_latency;
2182 tab << i->second.os_perf_stat.os_apply_latency;
2183 tab << TextTable::endrow;
2184 }
2185 (*ss) << tab;
2186}
7c673cae 2187
31f18b77
FG
2188void PGMap::dump_osd_blocked_by_stats(Formatter *f) const
2189{
2190 f->open_array_section("osd_blocked_by_infos");
2191 for (auto i = blocked_by_sum.begin();
2192 i != blocked_by_sum.end();
2193 ++i) {
2194 f->open_object_section("osd");
2195 f->dump_int("id", i->first);
2196 f->dump_int("num_blocked", i->second);
2197 f->close_section();
2198 }
2199 f->close_section();
2200}
2201void PGMap::print_osd_blocked_by_stats(std::ostream *ss) const
2202{
2203 TextTable tab;
2204 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2205 tab.define_column("num_blocked", TextTable::LEFT, TextTable::RIGHT);
2206 for (auto i = blocked_by_sum.begin();
2207 i != blocked_by_sum.end();
2208 ++i) {
2209 tab << i->first;
2210 tab << i->second;
2211 tab << TextTable::endrow;
2212 }
2213 (*ss) << tab;
7c673cae
FG
2214}
2215
31f18b77 2216
7c673cae
FG
2217/**
2218 * update aggregated delta
2219 *
2220 * @param cct ceph context
2221 * @param ts Timestamp for the stats being delta'ed
2222 * @param old_pool_sum Previous stats sum
2223 * @param last_ts Last timestamp for pool
2224 * @param result_pool_sum Resulting stats
2225 * @param result_pool_delta Resulting pool delta
2226 * @param result_ts_delta Resulting timestamp delta
2227 * @param delta_avg_list List of last N computed deltas, used to average
2228 */
31f18b77
FG
2229void PGMap::update_delta(
2230 CephContext *cct,
2231 const utime_t ts,
2232 const pool_stat_t& old_pool_sum,
2233 utime_t *last_ts,
2234 const pool_stat_t& current_pool_sum,
2235 pool_stat_t *result_pool_delta,
2236 utime_t *result_ts_delta,
2237 mempool::pgmap::list<pair<pool_stat_t,utime_t> > *delta_avg_list)
7c673cae
FG
2238{
2239 /* @p ts is the timestamp we want to associate with the data
2240 * in @p old_pool_sum, and on which we will base ourselves to
2241 * calculate the delta, stored in 'delta_t'.
2242 */
2243 utime_t delta_t;
2244 delta_t = ts; // start with the provided timestamp
2245 delta_t -= *last_ts; // take the last timestamp we saw
2246 *last_ts = ts; // @p ts becomes the last timestamp we saw
2247
31f18b77
FG
2248 // adjust delta_t, quick start if there is no update in a long period
2249 delta_t = std::min(delta_t,
2250 utime_t(2 * (cct ? cct->_conf->mon_delta_reset_interval : 10), 0));
2251
2252 // calculate a delta, and average over the last 6 deltas by default.
7c673cae
FG
2253 /* start by taking a copy of our current @p result_pool_sum, and by
2254 * taking out the stats from @p old_pool_sum. This generates a stats
2255 * delta. Stash this stats delta in @p delta_avg_list, along with the
2256 * timestamp delta for these results.
2257 */
2258 pool_stat_t d = current_pool_sum;
2259 d.stats.sub(old_pool_sum.stats);
7c673cae
FG
2260
2261 /* Aggregate current delta, and take out the last seen delta (if any) to
2262 * average it out.
b32b8144 2263 * Skip calculating delta while sum was not synchronized.
7c673cae 2264 */
b32b8144
FG
2265 if(!old_pool_sum.stats.sum.is_zero()) {
2266 delta_avg_list->push_back(make_pair(d,delta_t));
2267 *result_ts_delta += delta_t;
2268 result_pool_delta->stats.add(d.stats);
2269 }
2270 size_t s = cct ? cct->_conf->get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
7c673cae
FG
2271 if (delta_avg_list->size() > s) {
2272 result_pool_delta->stats.sub(delta_avg_list->front().first.stats);
2273 *result_ts_delta -= delta_avg_list->front().second;
2274 delta_avg_list->pop_front();
2275 }
2276}
2277
2278/**
2279 * update aggregated delta
2280 *
2281 * @param cct ceph context
2282 * @param ts Timestamp
2283 * @param pg_sum_old Old pg_sum
2284 */
2285void PGMap::update_global_delta(CephContext *cct,
2286 const utime_t ts, const pool_stat_t& pg_sum_old)
2287{
2288 update_delta(cct, ts, pg_sum_old, &stamp, pg_sum, &pg_sum_delta,
2289 &stamp_delta, &pg_sum_deltas);
2290}
2291
2292/**
2293 * Update a given pool's deltas
2294 *
2295 * @param cct Ceph Context
2296 * @param ts Timestamp for the stats being delta'ed
2297 * @param pool Pool's id
2298 * @param old_pool_sum Previous stats sum
2299 */
31f18b77
FG
2300void PGMap::update_one_pool_delta(
2301 CephContext *cct,
2302 const utime_t ts,
2303 const uint64_t pool,
2304 const pool_stat_t& old_pool_sum)
7c673cae
FG
2305{
2306 if (per_pool_sum_deltas.count(pool) == 0) {
2307 assert(per_pool_sum_deltas_stamps.count(pool) == 0);
2308 assert(per_pool_sum_delta.count(pool) == 0);
2309 }
2310
31f18b77 2311 auto& sum_delta = per_pool_sum_delta[pool];
7c673cae
FG
2312
2313 update_delta(cct, ts, old_pool_sum, &sum_delta.second, pg_pool_sum[pool],
2314 &sum_delta.first, &per_pool_sum_deltas_stamps[pool],
2315 &per_pool_sum_deltas[pool]);
2316}
2317
2318/**
2319 * Update pools' deltas
2320 *
2321 * @param cct CephContext
2322 * @param ts Timestamp for the stats being delta'ed
2323 * @param pg_pool_sum_old Map of pool stats for delta calcs.
2324 */
31f18b77
FG
2325void PGMap::update_pool_deltas(
2326 CephContext *cct, const utime_t ts,
2327 const mempool::pgmap::unordered_map<uint64_t,pool_stat_t>& pg_pool_sum_old)
7c673cae 2328{
31f18b77 2329 for (auto it = pg_pool_sum_old.begin();
7c673cae
FG
2330 it != pg_pool_sum_old.end(); ++it) {
2331 update_one_pool_delta(cct, ts, it->first, it->second);
2332 }
2333}
2334
2335void PGMap::clear_delta()
2336{
2337 pg_sum_delta = pool_stat_t();
2338 pg_sum_deltas.clear();
2339 stamp_delta = utime_t();
2340}
2341
7c673cae
FG
2342void PGMap::generate_test_instances(list<PGMap*>& o)
2343{
2344 o.push_back(new PGMap);
2345 list<Incremental*> inc;
2346 Incremental::generate_test_instances(inc);
2347 delete inc.front();
2348 inc.pop_front();
2349 while (!inc.empty()) {
2350 PGMap *pmp = new PGMap();
2351 *pmp = *o.back();
2352 o.push_back(pmp);
2353 o.back()->apply_incremental(NULL, *inc.front());
2354 delete inc.front();
2355 inc.pop_front();
2356 }
2357}
2358
2359void PGMap::get_filtered_pg_stats(uint32_t state, int64_t poolid, int64_t osdid,
2360 bool primary, set<pg_t>& pgs) const
2361{
31f18b77 2362 for (auto i = pg_stat.begin();
7c673cae
FG
2363 i != pg_stat.end();
2364 ++i) {
2365 if ((poolid >= 0) && (uint64_t(poolid) != i->first.pool()))
2366 continue;
2367 if ((osdid >= 0) && !(i->second.is_acting_osd(osdid,primary)))
2368 continue;
2369 if (!(i->second.state & state))
2370 continue;
2371 pgs.insert(i->first);
2372 }
2373}
2374
2375void PGMap::dump_filtered_pg_stats(Formatter *f, set<pg_t>& pgs) const
2376{
2377 f->open_array_section("pg_stats");
31f18b77 2378 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
7c673cae
FG
2379 const pg_stat_t& st = pg_stat.at(*i);
2380 f->open_object_section("pg_stat");
2381 f->dump_stream("pgid") << *i;
2382 st.dump(f);
2383 f->close_section();
2384 }
2385 f->close_section();
2386}
2387
2388void PGMap::dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const
2389{
2390 TextTable tab;
2391
2392 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
2393 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
2394 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
2395 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
2396 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
2397 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
2398 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
2399 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
2400 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
2401 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
2402 tab.define_column("STATE_STAMP", TextTable::LEFT, TextTable::RIGHT);
2403 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
2404 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
2405 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
2406 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
2407 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
2408 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
2409 tab.define_column("LAST_SCRUB", TextTable::LEFT, TextTable::RIGHT);
2410 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
2411 tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT);
2412 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
2413
31f18b77 2414 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
7c673cae
FG
2415 const pg_stat_t& st = pg_stat.at(*i);
2416
2417 ostringstream reported;
2418 reported << st.reported_epoch << ":" << st.reported_seq;
2419
2420 tab << *i
2421 << st.stats.sum.num_objects
2422 << st.stats.sum.num_objects_missing_on_primary
2423 << st.stats.sum.num_objects_degraded
2424 << st.stats.sum.num_objects_misplaced
2425 << st.stats.sum.num_objects_unfound
2426 << st.stats.sum.num_bytes
2427 << st.log_size
2428 << st.ondisk_log_size
2429 << pg_state_string(st.state)
2430 << st.last_change
2431 << st.version
2432 << reported.str()
2433 << st.up
2434 << st.up_primary
2435 << st.acting
2436 << st.acting_primary
2437 << st.last_scrub
2438 << st.last_scrub_stamp
2439 << st.last_deep_scrub
2440 << st.last_deep_scrub_stamp
2441 << TextTable::endrow;
2442 }
2443
2444 ss << tab;
2445}
2446
7c673cae 2447
7c673cae 2448
31f18b77
FG
2449// Only called with a single bit set in "what"
2450static void note_stuck_detail(
2451 int what,
2452 mempool::pgmap::unordered_map<pg_t,pg_stat_t>& stuck_pgs,
2453 int max_detail,
2454 list<pair<health_status_t,string> > *detail)
2455{
2456 int n = 0;
2457 for (auto p = stuck_pgs.begin();
2458 p != stuck_pgs.end();
2459 ++p) {
2460 ostringstream ss;
2461 utime_t since;
2462 const char *whatname = 0;
2463 switch (what) {
2464 case PGMap::STUCK_INACTIVE:
2465 since = p->second.last_active;
2466 whatname = "inactive";
2467 break;
2468 case PGMap::STUCK_UNCLEAN:
2469 since = p->second.last_clean;
2470 whatname = "unclean";
2471 break;
2472 case PGMap::STUCK_DEGRADED:
2473 since = p->second.last_undegraded;
2474 whatname = "degraded";
2475 break;
2476 case PGMap::STUCK_UNDERSIZED:
2477 since = p->second.last_fullsized;
2478 whatname = "undersized";
2479 break;
2480 case PGMap::STUCK_STALE:
2481 since = p->second.last_unstale;
2482 whatname = "stale";
2483 break;
2484 default:
2485 ceph_abort();
2486 }
2487 if (--max_detail == 0) {
2488 ostringstream ss;
2489 ss << (stuck_pgs.size() - n) << " more pgs are also stuck " << whatname;
2490 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
2491 break;
2492 }
2493 ++n;
2494 ss << "pg " << p->first << " is stuck " << whatname;
2495 if (since == utime_t()) {
2496 ss << " since forever";
7c673cae 2497 } else {
31f18b77
FG
2498 utime_t dur = ceph_clock_now() - since;
2499 ss << " for " << dur;
7c673cae 2500 }
31f18b77
FG
2501 ss << ", current state " << pg_state_string(p->second.state)
2502 << ", last acting " << p->second.acting;
2503 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
7c673cae 2504 }
7c673cae
FG
2505}
2506
31f18b77
FG
2507static pair<int,int> _warn_slow_request_histogram(
2508 CephContext *cct,
2509 const pow2_hist_t& h,
2510 string suffix,
2511 list<pair<health_status_t,string> >& summary,
2512 list<pair<health_status_t,string> > *detail)
7c673cae 2513{
31f18b77
FG
2514 if (h.h.empty())
2515 return make_pair(0, 0);
7c673cae 2516
31f18b77
FG
2517 unsigned warn = 0, error = 0;
2518 float err_age =
2519 cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio;
2520 for (unsigned i = h.h.size() - 1; i > 0; --i) {
2521 float ub = (float)(1 << i) / 1000.0;
2522 if (ub < cct->_conf->mon_osd_warn_op_age)
2523 break;
2524 if (h.h[i]) {
2525 auto sev = HEALTH_WARN;
2526 if (ub > err_age) {
2527 sev = HEALTH_ERR;
2528 error += h.h[i];
2529 } else {
2530 warn += h.h[i];
2531 }
2532 if (detail) {
2533 ostringstream ss;
2534 ss << h.h[i] << " ops are blocked > " << ub << " sec" << suffix;
2535 detail->push_back(make_pair(sev, ss.str()));
2536 }
7c673cae 2537 }
31f18b77
FG
2538 }
2539 return make_pair(warn, error);
2540}
7c673cae 2541
31f18b77
FG
2542namespace {
2543 enum class scrubbed_or_deepscrubbed_t { SCRUBBED, DEEPSCRUBBED };
2544
2545 void print_unscrubbed_detailed(
2546 const std::pair<const pg_t,pg_stat_t> &pg_entry,
2547 list<pair<health_status_t,string> > *detail,
2548 scrubbed_or_deepscrubbed_t how_scrubbed)
2549 {
2550 std::stringstream ss;
2551 const auto& pg_stat(pg_entry.second);
2552
2553 ss << "pg " << pg_entry.first << " is not ";
2554 if (how_scrubbed == scrubbed_or_deepscrubbed_t::SCRUBBED) {
2555 ss << "scrubbed, last_scrub_stamp "
2556 << pg_stat.last_scrub_stamp;
2557 } else if (how_scrubbed == scrubbed_or_deepscrubbed_t::DEEPSCRUBBED) {
2558 ss << "deep-scrubbed, last_deep_scrub_stamp "
2559 << pg_stat.last_deep_scrub_stamp;
7c673cae 2560 }
31f18b77
FG
2561
2562 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
7c673cae
FG
2563 }
2564
31f18b77 2565 using pg_stat_map_t = const mempool::pgmap::unordered_map<pg_t,pg_stat_t>;
7c673cae 2566
31f18b77
FG
2567 void print_unscrubbed_pgs(
2568 pg_stat_map_t& pg_stats,
2569 list<pair<health_status_t,string> > &summary,
2570 list<pair<health_status_t,string> > *detail,
2571 const CephContext* cct)
2572 {
2573 if (cct->_conf->mon_warn_not_scrubbed == 0 &&
2574 cct->_conf->mon_warn_not_deep_scrubbed == 0)
2575 return;
2576
2577 int pgs_count = 0;
2578 const utime_t now = ceph_clock_now();
2579 for (const auto& pg_entry : pg_stats) {
2580 const auto& pg_stat(pg_entry.second);
2581 const utime_t time_since_ls = now - pg_stat.last_scrub_stamp;
2582 const utime_t time_since_lds = now - pg_stat.last_deep_scrub_stamp;
2583
2584 const int mon_warn_not_scrubbed =
2585 cct->_conf->mon_warn_not_scrubbed + cct->_conf->mon_scrub_interval;
2586
2587 const int mon_warn_not_deep_scrubbed =
2588 cct->_conf->mon_warn_not_deep_scrubbed + cct->_conf->osd_deep_scrub_interval;
2589
2590 bool not_scrubbed = (time_since_ls >= mon_warn_not_scrubbed &&
2591 cct->_conf->mon_warn_not_scrubbed != 0);
2592
2593 bool not_deep_scrubbed = (time_since_lds >= mon_warn_not_deep_scrubbed &&
2594 cct->_conf->mon_warn_not_deep_scrubbed != 0);
2595
2596 if (detail != nullptr) {
2597 if (not_scrubbed) {
2598 print_unscrubbed_detailed(pg_entry,
2599 detail,
2600 scrubbed_or_deepscrubbed_t::SCRUBBED);
2601 }
2602 if (not_deep_scrubbed) {
2603 print_unscrubbed_detailed(pg_entry,
2604 detail,
2605 scrubbed_or_deepscrubbed_t::DEEPSCRUBBED);
2606 }
2607 }
2608 if (not_scrubbed || not_deep_scrubbed) {
2609 ++pgs_count;
7c673cae
FG
2610 }
2611 }
31f18b77
FG
2612
2613 if (pgs_count > 0) {
2614 std::stringstream ss;
2615 ss << pgs_count << " unscrubbed pgs";
2616 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
7c673cae 2617 }
224ce89b
WB
2618 }
2619}
2620
2621void PGMap::get_health_checks(
2622 CephContext *cct,
2623 const OSDMap& osdmap,
2624 health_check_map_t *checks) const
2625{
2626 utime_t now = ceph_clock_now();
b32b8144 2627 const auto max = cct->_conf->get_val<uint64_t>("mon_health_max_detail");
224ce89b
WB
2628 const auto& pools = osdmap.get_pools();
2629
224ce89b
WB
2630 typedef enum pg_consequence_t {
2631 UNAVAILABLE = 1, // Client IO to the pool may block
2632 DEGRADED = 2, // Fewer than the requested number of replicas are present
2633 DEGRADED_FULL = 3, // Fewer than the request number of replicas may be present
2634 // and insufficiet resources are present to fix this
2635 DAMAGED = 4 // The data may be missing or inconsistent on disk and
2636 // requires repair
2637 } pg_consequence_t;
2638
2639 // For a given PG state, how should it be reported at the pool level?
2640 class PgStateResponse {
2641 public:
2642 pg_consequence_t consequence;
2643 typedef std::function< utime_t(const pg_stat_t&) > stuck_cb;
2644 stuck_cb stuck_since;
2645 bool invert;
2646
2647 PgStateResponse(const pg_consequence_t &c, stuck_cb s)
2648 : consequence(c), stuck_since(s), invert(false)
2649 {
2650 }
2651
2652 PgStateResponse(const pg_consequence_t &c, stuck_cb s, bool i)
2653 : consequence(c), stuck_since(s), invert(i)
2654 {
2655 }
2656 };
2657
2658 // Record the PG state counts that contributed to a reported pool state
2659 class PgCauses {
2660 public:
2661 // Map of PG_STATE_* to number of pgs in that state.
2662 std::map<unsigned, unsigned> states;
2663
2664 // List of all PG IDs that had a state contributing
2665 // to this health condition.
2666 std::set<pg_t> pgs;
2667
2668 std::map<pg_t, std::string> pg_messages;
2669 };
2670
2671 // Map of PG state to how to respond to it
2672 std::map<unsigned, PgStateResponse> state_to_response = {
2673 // Immediate reports
2674 { PG_STATE_INCONSISTENT, {DAMAGED, {}} },
c07f9fc5 2675 { PG_STATE_INCOMPLETE, {UNAVAILABLE, {}} },
224ce89b
WB
2676 { PG_STATE_REPAIR, {DAMAGED, {}} },
2677 { PG_STATE_SNAPTRIM_ERROR, {DAMAGED, {}} },
b32b8144
FG
2678 { PG_STATE_RECOVERY_UNFOUND, {DAMAGED, {}} },
2679 { PG_STATE_BACKFILL_UNFOUND, {DAMAGED, {}} },
c07f9fc5
FG
2680 { PG_STATE_BACKFILL_TOOFULL, {DEGRADED_FULL, {}} },
2681 { PG_STATE_RECOVERY_TOOFULL, {DEGRADED_FULL, {}} },
224ce89b
WB
2682 { PG_STATE_DEGRADED, {DEGRADED, {}} },
2683 { PG_STATE_DOWN, {UNAVAILABLE, {}} },
2684 // Delayed (wait until stuck) reports
2685 { PG_STATE_PEERING, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_peered;} } },
2686 { PG_STATE_UNDERSIZED, {DEGRADED, [](const pg_stat_t &p){return p.last_fullsized;} } },
2687 { PG_STATE_STALE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_unstale;} } },
2688 // Delayed and inverted reports
b32b8144 2689 { PG_STATE_ACTIVE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_active;}, true} }
224ce89b
WB
2690 };
2691
2692 // Specialized state printer that takes account of inversion of
2693 // ACTIVE, CLEAN checks.
2694 auto state_name = [](const uint32_t &state) {
2695 // Special cases for the states that are inverted checks
2696 if (state == PG_STATE_CLEAN) {
2697 return std::string("unclean");
2698 } else if (state == PG_STATE_ACTIVE) {
2699 return std::string("inactive");
2700 } else {
2701 return pg_state_string(state);
2702 }
2703 };
2704
2705 // Map of what is wrong to information about why, implicitly also stores
2706 // the list of what is wrong.
2707 std::map<pg_consequence_t, PgCauses> detected;
2708
2709 // Optimisation: trim down the number of checks to apply based on
2710 // the summary counters
2711 std::map<unsigned, PgStateResponse> possible_responses;
2712 for (const auto &i : num_pg_by_state) {
2713 for (const auto &j : state_to_response) {
2714 if (!j.second.invert) {
2715 // Check for normal tests by seeing if any pgs have the flag
2716 if (i.first & j.first) {
2717 possible_responses.insert(j);
2718 }
2719 }
2720 }
2721 }
2722
2723 for (const auto &j : state_to_response) {
2724 if (j.second.invert) {
2725 // Check for inverted tests by seeing if not-all pgs have the flag
2726 const auto &found = num_pg_by_state.find(j.first);
2727 if (found == num_pg_by_state.end() || found->second != num_pg) {
2728 possible_responses.insert(j);
2729 }
2730 }
2731 }
2732
b32b8144 2733 utime_t cutoff = now - utime_t(cct->_conf->get_val<int64_t>("mon_pg_stuck_threshold"), 0);
224ce89b
WB
2734 // Loop over all PGs, if there are any possibly-unhealthy states in there
2735 if (!possible_responses.empty()) {
2736 for (const auto& i : pg_stat) {
2737 const auto &pg_id = i.first;
2738 const auto &pg_info = i.second;
2739
2740 for (const auto &j : state_to_response) {
2741 const auto &pg_response_state = j.first;
2742 const auto &pg_response = j.second;
2743
2744 // Apply the state test
2745 if (!(bool(pg_info.state & pg_response_state) != pg_response.invert)) {
2746 continue;
2747 }
2748
2749 // Apply stuckness test if needed
2750 if (pg_response.stuck_since) {
2751 // Delayed response, check for stuckness
2752 utime_t last_whatever = pg_response.stuck_since(pg_info);
2753 if (last_whatever >= cutoff) {
2754 // Not stuck enough, ignore.
2755 continue;
2756 } else {
2757
2758 }
2759 }
2760
2761 auto &causes = detected[pg_response.consequence];
2762 causes.states[pg_response_state]++;
2763 causes.pgs.insert(pg_id);
2764
2765 // Don't bother composing detail string if we have already recorded
2766 // too many
2767 if (causes.pg_messages.size() > max) {
2768 continue;
2769 }
2770
2771 std::ostringstream ss;
2772 if (pg_response.stuck_since) {
2773 utime_t since = pg_response.stuck_since(pg_info);
2774 ss << "pg " << pg_id << " is stuck " << state_name(pg_response_state);
2775 if (since == utime_t()) {
2776 ss << " since forever";
2777 } else {
2778 utime_t dur = now - since;
2779 ss << " for " << dur;
2780 }
2781 ss << ", current state " << pg_state_string(pg_info.state)
2782 << ", last acting " << pg_info.acting;
2783 } else {
2784 ss << "pg " << pg_id << " is "
2785 << pg_state_string(pg_info.state);
2786 ss << ", acting " << pg_info.acting;
2787 if (pg_info.stats.sum.num_objects_unfound) {
2788 ss << ", " << pg_info.stats.sum.num_objects_unfound
2789 << " unfound";
2790 }
2791 }
2792
2793 if (pg_info.state & PG_STATE_INCOMPLETE) {
2794 const pg_pool_t *pi = osdmap.get_pg_pool(pg_id.pool());
2795 if (pi && pi->min_size > 1) {
2796 ss << " (reducing pool "
2797 << osdmap.get_pool_name(pg_id.pool())
2798 << " min_size from " << (int)pi->min_size
2799 << " may help; search ceph.com/docs for 'incomplete')";
2800 }
2801 }
2802
2803 causes.pg_messages[pg_id] = ss.str();
2804 }
2805 }
2806 } else {
2807 dout(10) << __func__ << " skipping loop over PGs: counters look OK" << dendl;
2808 }
2809
2810 for (const auto &i : detected) {
2811 std::string health_code;
2812 health_status_t sev;
2813 std::string summary;
2814 switch(i.first) {
2815 case UNAVAILABLE:
2816 health_code = "PG_AVAILABILITY";
2817 sev = HEALTH_WARN;
2818 summary = "Reduced data availability: ";
2819 break;
2820 case DEGRADED:
2821 health_code = "PG_DEGRADED";
2822 summary = "Degraded data redundancy: ";
2823 sev = HEALTH_WARN;
2824 break;
2825 case DEGRADED_FULL:
2826 health_code = "PG_DEGRADED_FULL";
2827 summary = "Degraded data redundancy (low space): ";
2828 sev = HEALTH_ERR;
2829 break;
2830 case DAMAGED:
2831 health_code = "PG_DAMAGED";
2832 summary = "Possible data damage: ";
2833 sev = HEALTH_ERR;
2834 break;
2835 default:
2836 assert(false);
2837 }
2838
2839 if (i.first == DEGRADED) {
2840 if (pg_sum.stats.sum.num_objects_degraded &&
2841 pg_sum.stats.sum.num_object_copies > 0) {
2842 double pc = (double)pg_sum.stats.sum.num_objects_degraded /
2843 (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
2844 char b[20];
2845 snprintf(b, sizeof(b), "%.3lf", pc);
2846 ostringstream ss;
2847 ss << pg_sum.stats.sum.num_objects_degraded
2848 << "/" << pg_sum.stats.sum.num_object_copies << " objects degraded ("
2849 << b << "%)";
2850
2851 // Throw in a comma for the benefit of the following PG counts
2852 summary += ss.str() + ", ";
2853 }
2854 }
2855
2856 // Compose summary message saying how many PGs in what states led
2857 // to this health check failing
2858 std::vector<std::string> pg_msgs;
2859 for (const auto &j : i.second.states) {
2860 std::ostringstream msg;
2861 msg << j.second << (j.second > 1 ? " pgs " : " pg ") << state_name(j.first);
2862 pg_msgs.push_back(msg.str());
2863 }
2864 summary += joinify(pg_msgs.begin(), pg_msgs.end(), std::string(", "));
2865
2866
2867
2868 health_check_t *check = &checks->add(
2869 health_code,
2870 sev,
2871 summary);
2872
2873 // Compose list of PGs contributing to this health check failing
2874 for (const auto &j : i.second.pg_messages) {
2875 check->detail.push_back(j.second);
2876 }
2877 }
2878
224ce89b
WB
2879 // OSD_SCRUB_ERRORS
2880 if (pg_sum.stats.sum.num_scrub_errors) {
2881 ostringstream ss;
2882 ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors";
2883 checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str());
2884 }
2885
28e407b8
AA
2886 // LARGE_OMAP_OBJECTS
2887 if (pg_sum.stats.sum.num_large_omap_objects) {
2888 list<string> detail;
2889 for (auto &pool : pools) {
2890 const string& pool_name = osdmap.get_pool_name(pool.first);
2891 auto it2 = pg_pool_sum.find(pool.first);
2892 if (it2 == pg_pool_sum.end()) {
2893 continue;
2894 }
2895 const pool_stat_t *pstat = &it2->second;
2896 if (pstat == nullptr) {
2897 continue;
2898 }
2899 const object_stat_sum_t& sum = pstat->stats.sum;
2900 if (sum.num_large_omap_objects) {
2901 stringstream ss;
2902 ss << sum.num_large_omap_objects << " large objects found in pool "
2903 << "'" << pool_name << "'";
2904 detail.push_back(ss.str());
2905 }
2906 }
2907 if (!detail.empty()) {
2908 ostringstream ss;
2909 ss << pg_sum.stats.sum.num_large_omap_objects << " large omap objects";
2910 auto& d = checks->add("LARGE_OMAP_OBJECTS", HEALTH_WARN, ss.str());
2911 stringstream tip;
2912 tip << "Search the cluster log for 'Large omap object found' for more "
2913 << "details.";
2914 detail.push_back(tip.str());
2915 d.detail.swap(detail);
2916 }
2917 }
2918
224ce89b
WB
2919 // CACHE_POOL_NEAR_FULL
2920 {
2921 list<string> detail;
2922 unsigned num_pools = 0;
2923 for (auto& p : pools) {
2924 if ((!p.second.target_max_objects && !p.second.target_max_bytes) ||
2925 !pg_pool_sum.count(p.first)) {
2926 continue;
2927 }
2928 bool nearfull = false;
2929 const string& name = osdmap.get_pool_name(p.first);
2930 const pool_stat_t& st = get_pg_pool_sum_stat(p.first);
2931 uint64_t ratio = p.second.cache_target_full_ratio_micro +
2932 ((1000000 - p.second.cache_target_full_ratio_micro) *
2933 cct->_conf->mon_cache_target_full_warn_ratio);
2934 if (p.second.target_max_objects &&
2935 (uint64_t)(st.stats.sum.num_objects -
2936 st.stats.sum.num_objects_hit_set_archive) >
2937 p.second.target_max_objects * (ratio / 1000000.0)) {
2938 ostringstream ss;
2939 ss << "cache pool '" << name << "' with "
2940 << si_t(st.stats.sum.num_objects)
2941 << " objects at/near target max "
2942 << si_t(p.second.target_max_objects) << " objects";
2943 detail.push_back(ss.str());
2944 nearfull = true;
2945 }
2946 if (p.second.target_max_bytes &&
2947 (uint64_t)(st.stats.sum.num_bytes -
2948 st.stats.sum.num_bytes_hit_set_archive) >
2949 p.second.target_max_bytes * (ratio / 1000000.0)) {
2950 ostringstream ss;
2951 ss << "cache pool '" << name
2952 << "' with " << si_t(st.stats.sum.num_bytes)
2953 << "B at/near target max "
2954 << si_t(p.second.target_max_bytes) << "B";
2955 detail.push_back(ss.str());
2956 nearfull = true;
2957 }
2958 if (nearfull) {
2959 ++num_pools;
2960 }
2961 }
2962 if (!detail.empty()) {
2963 ostringstream ss;
2964 ss << num_pools << " cache pools at or near target size";
2965 auto& d = checks->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN, ss.str());
2966 d.detail.swap(detail);
2967 }
2968 }
2969
2970 // TOO_FEW_PGS
3efd9988
FG
2971 unsigned num_in = osdmap.get_num_in_osds();
2972 auto sum_pg_up = std::max(static_cast<size_t>(pg_sum.up), pg_stat.size());
2973 const auto min_pg_per_osd =
2974 cct->_conf->get_val<uint64_t>("mon_pg_warn_min_per_osd");
2975 if (num_in && min_pg_per_osd > 0 && osdmap.get_pools().size() > 0) {
2976 auto per = sum_pg_up / num_in;
2977 if (per < min_pg_per_osd && per) {
224ce89b
WB
2978 ostringstream ss;
2979 ss << "too few PGs per OSD (" << per
3efd9988 2980 << " < min " << min_pg_per_osd << ")";
224ce89b
WB
2981 checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str());
2982 }
2983 }
2984
2985 // TOO_MANY_PGS
3efd9988
FG
2986 auto max_pg_per_osd = cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd");
2987 if (num_in && max_pg_per_osd > 0) {
2988 auto per = sum_pg_up / num_in;
2989 if (per > max_pg_per_osd) {
224ce89b
WB
2990 ostringstream ss;
2991 ss << "too many PGs per OSD (" << per
3efd9988 2992 << " > max " << max_pg_per_osd << ")";
224ce89b
WB
2993 checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str());
2994 }
2995 }
2996
2997 // SMALLER_PGP_NUM
2998 // MANY_OBJECTS_PER_PG
2999 if (!pg_stat.empty()) {
3000 list<string> pgp_detail, many_detail;
b32b8144
FG
3001 const auto mon_pg_warn_min_objects =
3002 cct->_conf->get_val<int64_t>("mon_pg_warn_min_objects");
3003 const auto mon_pg_warn_min_pool_objects =
3004 cct->_conf->get_val<int64_t>("mon_pg_warn_min_pool_objects");
3005 const auto mon_pg_warn_max_object_skew =
3006 cct->_conf->get_val<double>("mon_pg_warn_max_object_skew");
224ce89b
WB
3007 for (auto p = pg_pool_sum.begin();
3008 p != pg_pool_sum.end();
3009 ++p) {
3010 const pg_pool_t *pi = osdmap.get_pg_pool(p->first);
3011 if (!pi)
3012 continue; // in case osdmap changes haven't propagated to PGMap yet
3013 const string& name = osdmap.get_pool_name(p->first);
3014 if (pi->get_pg_num() > pi->get_pgp_num() &&
3015 !(name.find(".DELETED") != string::npos &&
3016 cct->_conf->mon_fake_pool_delete)) {
3017 ostringstream ss;
3018 ss << "pool " << name << " pg_num "
3019 << pi->get_pg_num() << " > pgp_num " << pi->get_pgp_num();
3020 pgp_detail.push_back(ss.str());
3021 }
3022 int average_objects_per_pg = pg_sum.stats.sum.num_objects / pg_stat.size();
3023 if (average_objects_per_pg > 0 &&
b32b8144
FG
3024 pg_sum.stats.sum.num_objects >= mon_pg_warn_min_objects &&
3025 p->second.stats.sum.num_objects >= mon_pg_warn_min_pool_objects) {
224ce89b
WB
3026 int objects_per_pg = p->second.stats.sum.num_objects / pi->get_pg_num();
3027 float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
b32b8144
FG
3028 if (mon_pg_warn_max_object_skew > 0 &&
3029 ratio > mon_pg_warn_max_object_skew) {
224ce89b
WB
3030 ostringstream ss;
3031 ss << "pool " << name << " objects per pg ("
3032 << objects_per_pg << ") is more than " << ratio
3033 << " times cluster average ("
3034 << average_objects_per_pg << ")";
3035 many_detail.push_back(ss.str());
3036 }
3037 }
3038 }
3039 if (!pgp_detail.empty()) {
3040 ostringstream ss;
3041 ss << pgp_detail.size() << " pools have pg_num > pgp_num";
3042 auto& d = checks->add("SMALLER_PGP_NUM", HEALTH_WARN, ss.str());
3043 d.detail.swap(pgp_detail);
3044 }
3045 if (!many_detail.empty()) {
3046 ostringstream ss;
3047 ss << many_detail.size() << " pools have many more objects per pg than"
3048 << " average";
3049 auto& d = checks->add("MANY_OBJECTS_PER_PG", HEALTH_WARN, ss.str());
3050 d.detail.swap(many_detail);
3051 }
3052 }
3053
3054 // POOL_FULL
3055 // POOL_NEAR_FULL
3056 {
b32b8144
FG
3057 float warn_threshold = (float)g_conf->get_val<int64_t>("mon_pool_quota_warn_threshold")/100;
3058 float crit_threshold = (float)g_conf->get_val<int64_t>("mon_pool_quota_crit_threshold")/100;
224ce89b
WB
3059 list<string> full_detail, nearfull_detail;
3060 unsigned full_pools = 0, nearfull_pools = 0;
3061 for (auto it : pools) {
3062 auto it2 = pg_pool_sum.find(it.first);
3063 if (it2 == pg_pool_sum.end()) {
3064 continue;
3065 }
3066 const pool_stat_t *pstat = &it2->second;
3067 const object_stat_sum_t& sum = pstat->stats.sum;
3068 const string& pool_name = osdmap.get_pool_name(it.first);
3069 const pg_pool_t &pool = it.second;
3070 bool full = false, nearfull = false;
3071 if (pool.quota_max_objects > 0) {
3072 stringstream ss;
3073 if ((uint64_t)sum.num_objects >= pool.quota_max_objects) {
3074 } else if (crit_threshold > 0 &&
3075 sum.num_objects >= pool.quota_max_objects*crit_threshold) {
3076 ss << "pool '" << pool_name
3077 << "' has " << sum.num_objects << " objects"
3078 << " (max " << pool.quota_max_objects << ")";
3079 full_detail.push_back(ss.str());
3080 full = true;
3081 } else if (warn_threshold > 0 &&
3082 sum.num_objects >= pool.quota_max_objects*warn_threshold) {
3083 ss << "pool '" << pool_name
3084 << "' has " << sum.num_objects << " objects"
3085 << " (max " << pool.quota_max_objects << ")";
3086 nearfull_detail.push_back(ss.str());
3087 nearfull = true;
3088 }
3089 }
3090 if (pool.quota_max_bytes > 0) {
3091 stringstream ss;
3092 if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
3093 } else if (crit_threshold > 0 &&
3094 sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
3095 ss << "pool '" << pool_name
3096 << "' has " << si_t(sum.num_bytes) << " bytes"
3097 << " (max " << si_t(pool.quota_max_bytes) << ")";
3098 full_detail.push_back(ss.str());
3099 full = true;
3100 } else if (warn_threshold > 0 &&
3101 sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
3102 ss << "pool '" << pool_name
3103 << "' has " << si_t(sum.num_bytes) << " bytes"
3104 << " (max " << si_t(pool.quota_max_bytes) << ")";
3105 nearfull_detail.push_back(ss.str());
3106 nearfull = true;
3107 }
3108 }
3109 if (full) {
3110 ++full_pools;
3111 }
3112 if (nearfull) {
3113 ++nearfull_pools;
3114 }
3115 }
3116 if (full_pools) {
3117 ostringstream ss;
3118 ss << full_pools << " pools full";
3119 auto& d = checks->add("POOL_FULL", HEALTH_ERR, ss.str());
3120 d.detail.swap(full_detail);
3121 }
3122 if (nearfull_pools) {
3123 ostringstream ss;
3124 ss << nearfull_pools << " pools full";
3125 auto& d = checks->add("POOL_NEAR_FULL", HEALTH_WARN, ss.str());
3126 d.detail.swap(nearfull_detail);
3127 }
3128 }
3129
3130 // OBJECT_MISPLACED
3131 if (pg_sum.stats.sum.num_objects_misplaced &&
3132 pg_sum.stats.sum.num_object_copies > 0) {
3133 double pc = (double)pg_sum.stats.sum.num_objects_misplaced /
3134 (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
3135 char b[20];
3136 snprintf(b, sizeof(b), "%.3lf", pc);
3137 ostringstream ss;
3138 ss << pg_sum.stats.sum.num_objects_misplaced
3139 << "/" << pg_sum.stats.sum.num_object_copies << " objects misplaced ("
3140 << b << "%)";
3141 checks->add("OBJECT_MISPLACED", HEALTH_WARN, ss.str());
3142 }
3143
3144 // OBJECT_UNFOUND
3145 if (pg_sum.stats.sum.num_objects_unfound &&
3146 pg_sum.stats.sum.num_objects) {
3147 double pc = (double)pg_sum.stats.sum.num_objects_unfound /
3148 (double)pg_sum.stats.sum.num_objects * (double)100.0;
3149 char b[20];
3150 snprintf(b, sizeof(b), "%.3lf", pc);
3151 ostringstream ss;
3152 ss << pg_sum.stats.sum.num_objects_unfound
b5b8bbf5 3153 << "/" << pg_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
c07f9fc5
FG
3154 auto& d = checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str());
3155
3156 for (auto& p : pg_stat) {
3157 if (p.second.stats.sum.num_objects_unfound) {
3158 ostringstream ss;
3159 ss << "pg " << p.first
3160 << " has " << p.second.stats.sum.num_objects_unfound
3161 << " unfound objects";
3162 d.detail.push_back(ss.str());
3163 if (d.detail.size() > max) {
3164 d.detail.push_back("(additional pgs left out for brevity)");
3165 break;
3166 }
3167 }
3168 }
224ce89b
WB
3169 }
3170
3171 // REQUEST_SLOW
3172 // REQUEST_STUCK
3173 if (cct->_conf->mon_osd_warn_op_age > 0 &&
c07f9fc5
FG
3174 !osd_sum.op_queue_age_hist.h.empty() &&
3175 osd_sum.op_queue_age_hist.upper_bound() / 1000.0 >
224ce89b
WB
3176 cct->_conf->mon_osd_warn_op_age) {
3177 list<string> warn_detail, error_detail;
3178 unsigned warn = 0, error = 0;
3179 float err_age =
3180 cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio;
3181 const pow2_hist_t& h = osd_sum.op_queue_age_hist;
3182 for (unsigned i = h.h.size() - 1; i > 0; --i) {
3183 float ub = (float)(1 << i) / 1000.0;
3184 if (ub < cct->_conf->mon_osd_warn_op_age)
3185 break;
3186 if (h.h[i]) {
3187 ostringstream ss;
3188 ss << h.h[i] << " ops are blocked > " << ub << " sec";
3189 if (ub > err_age) {
3190 error += h.h[i];
3191 error_detail.push_back(ss.str());
3192 } else {
3193 warn += h.h[i];
3194 warn_detail.push_back(ss.str());
3195 }
3196 }
3197 }
3198
3199 map<float,set<int>> warn_osd_by_max; // max -> osds
3200 map<float,set<int>> error_osd_by_max; // max -> osds
3201 if (!warn_detail.empty() || !error_detail.empty()) {
3202 for (auto& p : osd_stat) {
3203 const pow2_hist_t& h = p.second.op_queue_age_hist;
3204 for (unsigned i = h.h.size() - 1; i > 0; --i) {
3205 float ub = (float)(1 << i) / 1000.0;
3206 if (ub < cct->_conf->mon_osd_warn_op_age)
3207 break;
3208 if (h.h[i]) {
3209 if (ub > err_age) {
3210 error_osd_by_max[ub].insert(p.first);
3211 } else {
3212 warn_osd_by_max[ub].insert(p.first);
3213 }
3214 break;
3215 }
3216 }
3217 }
3218 }
3219
3220 if (!warn_detail.empty()) {
224ce89b 3221 int left = max;
28e407b8 3222 set<int> implicated_osds;
224ce89b
WB
3223 for (auto& p : warn_osd_by_max) {
3224 ostringstream ss;
28e407b8 3225 implicated_osds.insert(p.second.begin(), p.second.end());
224ce89b 3226 if (p.second.size() > 1) {
c07f9fc5
FG
3227 ss << "osds " << p.second
3228 << " have blocked requests > " << p.first << " sec";
224ce89b 3229 } else {
c07f9fc5
FG
3230 ss << "osd." << *p.second.begin()
3231 << " has blocked requests > " << p.first << " sec";
224ce89b 3232 }
28e407b8 3233 warn_detail.push_back(ss.str());
224ce89b
WB
3234 if (--left == 0) {
3235 break;
3236 }
3237 }
28e407b8
AA
3238 ostringstream ss;
3239 ss << warn << " slow requests are blocked > "
3240 << cct->_conf->mon_osd_warn_op_age << " sec. Implicated osds "
3241 << implicated_osds;
3242 auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str());
3243 d.detail.swap(warn_detail);
224ce89b
WB
3244 }
3245 if (!error_detail.empty()) {
224ce89b 3246 int left = max;
28e407b8 3247 set<int> implicated_osds;
224ce89b
WB
3248 for (auto& p : error_osd_by_max) {
3249 ostringstream ss;
28e407b8 3250 implicated_osds.insert(p.second.begin(), p.second.end());
224ce89b 3251 if (p.second.size() > 1) {
c07f9fc5
FG
3252 ss << "osds " << p.second
3253 << " have stuck requests > " << p.first << " sec";
224ce89b 3254 } else {
c07f9fc5
FG
3255 ss << "osd." << *p.second.begin()
3256 << " has stuck requests > " << p.first << " sec";
224ce89b 3257 }
28e407b8 3258 error_detail.push_back(ss.str());
224ce89b
WB
3259 if (--left == 0) {
3260 break;
3261 }
3262 }
28e407b8
AA
3263 ostringstream ss;
3264 ss << error << " stuck requests are blocked > "
3265 << err_age << " sec. Implicated osds " << implicated_osds;
3266 auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str());
3267 d.detail.swap(error_detail);
224ce89b
WB
3268 }
3269 }
7c673cae 3270
224ce89b
WB
3271 // PG_NOT_SCRUBBED
3272 // PG_NOT_DEEP_SCRUBBED
3273 {
c07f9fc5
FG
3274 if (cct->_conf->mon_warn_not_scrubbed ||
3275 cct->_conf->mon_warn_not_deep_scrubbed) {
3276 list<string> detail, deep_detail;
3277 const double age = cct->_conf->mon_warn_not_scrubbed +
3278 cct->_conf->mon_scrub_interval;
3279 utime_t cutoff = now;
3280 cutoff -= age;
3281 const double deep_age = cct->_conf->mon_warn_not_deep_scrubbed +
3282 cct->_conf->osd_deep_scrub_interval;
3283 utime_t deep_cutoff = now;
3284 deep_cutoff -= deep_age;
3285 for (auto& p : pg_stat) {
3286 if (cct->_conf->mon_warn_not_scrubbed &&
3287 p.second.last_scrub_stamp < cutoff) {
3288 ostringstream ss;
3289 ss << "pg " << p.first << " not scrubbed since "
3290 << p.second.last_scrub_stamp;
3291 detail.push_back(ss.str());
3292 }
3293 if (cct->_conf->mon_warn_not_deep_scrubbed &&
3294 p.second.last_deep_scrub_stamp < deep_cutoff) {
3295 ostringstream ss;
3296 ss << "pg " << p.first << " not deep-scrubbed since "
3297 << p.second.last_deep_scrub_stamp;
3298 deep_detail.push_back(ss.str());
3299 }
224ce89b 3300 }
c07f9fc5
FG
3301 if (!detail.empty()) {
3302 ostringstream ss;
3303 ss << detail.size() << " pgs not scrubbed for " << age;
3304 auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str());
3305 d.detail.swap(detail);
3306 }
3307 if (!deep_detail.empty()) {
3308 ostringstream ss;
3309 ss << deep_detail.size() << " pgs not deep-scrubbed for " << deep_age;
3310 auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str());
3311 d.detail.swap(deep_detail);
3312 }
3313 }
3314 }
3315
3316 // POOL_APP
d2e6a577 3317 if (g_conf->get_val<bool>("mon_warn_on_pool_no_app")) {
c07f9fc5
FG
3318 list<string> detail;
3319 for (auto &it : pools) {
3320 const pg_pool_t &pool = it.second;
3321 const string& pool_name = osdmap.get_pool_name(it.first);
3322 auto it2 = pg_pool_sum.find(it.first);
3323 if (it2 == pg_pool_sum.end()) {
3324 continue;
3325 }
3326 const pool_stat_t *pstat = &it2->second;
3327 if (pstat == nullptr) {
3328 continue;
3329 }
3330 const object_stat_sum_t& sum = pstat->stats.sum;
3331 // application metadata is not encoded until luminous is minimum
3332 // required release
3333 if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
3334 sum.num_objects > 0 && pool.application_metadata.empty() &&
3335 !pool.is_tier() && !g_conf->mon_debug_no_require_luminous) {
3336 stringstream ss;
3337 ss << "application not enabled on pool '" << pool_name << "'";
3338 detail.push_back(ss.str());
224ce89b
WB
3339 }
3340 }
3341 if (!detail.empty()) {
3342 ostringstream ss;
c07f9fc5
FG
3343 ss << "application not enabled on " << detail.size() << " pool(s)";
3344 auto& d = checks->add("POOL_APP_NOT_ENABLED", HEALTH_WARN, ss.str());
3345 stringstream tip;
3346 tip << "use 'ceph osd pool application enable <pool-name> "
3347 << "<app-name>', where <app-name> is 'cephfs', 'rbd', 'rgw', "
3348 << "or freeform for custom applications.";
3349 detail.push_back(tip.str());
224ce89b
WB
3350 d.detail.swap(detail);
3351 }
31f18b77 3352 }
b32b8144
FG
3353
3354 // PG_SLOW_SNAP_TRIMMING
3355 if (!pg_stat.empty() && cct->_conf->mon_osd_snap_trim_queue_warn_on > 0) {
3356 uint32_t snapthreshold = cct->_conf->mon_osd_snap_trim_queue_warn_on;
3357 uint64_t snaptrimq_exceeded = 0;
3358 uint32_t longest_queue = 0;
3359 const pg_t* longest_q_pg = nullptr;
3360 list<string> detail;
3361
3362 for (auto& i: pg_stat) {
3363 uint32_t current_len = i.second.snaptrimq_len;
3364 if (current_len >= snapthreshold) {
3365 snaptrimq_exceeded++;
3366 if (longest_queue <= current_len) {
3367 longest_q_pg = &i.first;
3368 longest_queue = current_len;
3369 }
3370 if (detail.size() < max - 1) {
3371 stringstream ss;
3372 ss << "snap trim queue for pg " << i.first << " at " << current_len;
3373 detail.push_back(ss.str());
3374 continue;
3375 }
3376 if (detail.size() < max) {
3377 detail.push_back("...more pgs affected");
3378 continue;
3379 }
3380 }
3381 }
3382
3383 if (snaptrimq_exceeded) {
3384 {
3385 ostringstream ss;
3386 ss << "longest queue on pg " << *longest_q_pg << " at " << longest_queue;
3387 detail.push_back(ss.str());
3388 }
3389
3390 stringstream ss;
3391 ss << "snap trim queue for " << snaptrimq_exceeded << " pg(s) >= " << snapthreshold << " (mon_osd_snap_trim_queue_warn_on)";
3392 auto& d = checks->add("PG_SLOW_SNAP_TRIMMING", HEALTH_WARN, ss.str());
3393 detail.push_back("try decreasing \"osd snap trim sleep\" and/or increasing \"osd pg max concurrent snap trims\".");
3394 d.detail.swap(detail);
3395 }
3396 }
31f18b77 3397}
7c673cae 3398
31f18b77
FG
3399void PGMap::get_health(
3400 CephContext *cct,
3401 const OSDMap& osdmap,
3402 list<pair<health_status_t,string> >& summary,
3403 list<pair<health_status_t,string> > *detail) const
3404{
3405 map<string,int> note;
3406 auto p = num_pg_by_state.begin();
3407 auto p_end = num_pg_by_state.end();
3408 for (; p != p_end; ++p) {
3409 if (p->first & PG_STATE_STALE)
3410 note["stale"] += p->second;
3411 if (p->first & PG_STATE_DOWN)
3412 note["down"] += p->second;
3413 if (p->first & PG_STATE_UNDERSIZED)
3414 note["undersized"] += p->second;
3415 if (p->first & PG_STATE_DEGRADED)
3416 note["degraded"] += p->second;
3417 if (p->first & PG_STATE_INCONSISTENT)
3418 note["inconsistent"] += p->second;
3419 if (p->first & PG_STATE_PEERING)
3420 note["peering"] += p->second;
3421 if (p->first & PG_STATE_REPAIR)
3422 note["repair"] += p->second;
3423 if (p->first & PG_STATE_RECOVERING)
3424 note["recovering"] += p->second;
3425 if (p->first & PG_STATE_RECOVERY_WAIT)
3426 note["recovery_wait"] += p->second;
3427 if (p->first & PG_STATE_INCOMPLETE)
3428 note["incomplete"] += p->second;
3429 if (p->first & PG_STATE_BACKFILL_WAIT)
3430 note["backfill_wait"] += p->second;
3efd9988 3431 if (p->first & PG_STATE_BACKFILLING)
31f18b77
FG
3432 note["backfilling"] += p->second;
3433 if (p->first & PG_STATE_BACKFILL_TOOFULL)
3434 note["backfill_toofull"] += p->second;
3435 if (p->first & PG_STATE_RECOVERY_TOOFULL)
3436 note["recovery_toofull"] += p->second;
224ce89b
WB
3437 if (p->first & PG_STATE_SNAPTRIM_ERROR)
3438 note["snaptrim_error"] += p->second;
31f18b77
FG
3439 }
3440
3441 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pgs;
3442 utime_t now(ceph_clock_now());
b32b8144 3443 utime_t cutoff = now - utime_t(g_conf->get_val<int64_t>("mon_pg_stuck_threshold"), 0);
31f18b77
FG
3444 uint64_t num_inactive_pgs = 0;
3445
3446 if (detail) {
3447 // we need to collect details of stuck pgs, first do a quick check
3448 // whether this will yield any results
3449 if (get_stuck_counts(cutoff, note)) {
3450
3451 // there are stuck pgs. gather details for specified statuses
3452 // only if we know that there are pgs stuck in that status
3453
3454 if (note.find("stuck inactive") != note.end()) {
3455 get_stuck_stats(PGMap::STUCK_INACTIVE, cutoff, stuck_pgs);
3456 note["stuck inactive"] = stuck_pgs.size();
3457 num_inactive_pgs += stuck_pgs.size();
3458 note_stuck_detail(PGMap::STUCK_INACTIVE, stuck_pgs,
b32b8144 3459 cct->_conf->get_val<uint64_t>("mon_health_max_detail"), detail);
31f18b77 3460 stuck_pgs.clear();
7c673cae
FG
3461 }
3462
31f18b77
FG
3463 if (note.find("stuck unclean") != note.end()) {
3464 get_stuck_stats(PGMap::STUCK_UNCLEAN, cutoff, stuck_pgs);
3465 note["stuck unclean"] = stuck_pgs.size();
3466 note_stuck_detail(PGMap::STUCK_UNCLEAN, stuck_pgs,
b32b8144 3467 cct->_conf->get_val<uint64_t>("mon_health_max_detail"), detail);
31f18b77
FG
3468 stuck_pgs.clear();
3469 }
7c673cae 3470
31f18b77
FG
3471 if (note.find("stuck undersized") != note.end()) {
3472 get_stuck_stats(PGMap::STUCK_UNDERSIZED, cutoff, stuck_pgs);
3473 note["stuck undersized"] = stuck_pgs.size();
3474 note_stuck_detail(PGMap::STUCK_UNDERSIZED, stuck_pgs,
b32b8144 3475 cct->_conf->get_val<uint64_t>("mon_health_max_detail"), detail);
31f18b77
FG
3476 stuck_pgs.clear();
3477 }
3478
3479 if (note.find("stuck degraded") != note.end()) {
3480 get_stuck_stats(PGMap::STUCK_DEGRADED, cutoff, stuck_pgs);
3481 note["stuck degraded"] = stuck_pgs.size();
3482 note_stuck_detail(PGMap::STUCK_DEGRADED, stuck_pgs,
b32b8144 3483 cct->_conf->get_val<uint64_t>("mon_health_max_detail"), detail);
31f18b77
FG
3484 stuck_pgs.clear();
3485 }
3486
3487 if (note.find("stuck stale") != note.end()) {
3488 get_stuck_stats(PGMap::STUCK_STALE, cutoff, stuck_pgs);
3489 note["stuck stale"] = stuck_pgs.size();
3490 num_inactive_pgs += stuck_pgs.size();
3491 note_stuck_detail(PGMap::STUCK_STALE, stuck_pgs,
b32b8144 3492 cct->_conf->get_val<uint64_t>("mon_health_max_detail"), detail);
31f18b77
FG
3493 }
3494 }
3495 } else {
3496 get_stuck_counts(cutoff, note);
3497 auto p = note.find("stuck inactive");
3498 if (p != note.end())
3499 num_inactive_pgs += p->second;
3500 p = note.find("stuck stale");
3501 if (p != note.end())
3502 num_inactive_pgs += p->second;
7c673cae 3503 }
31f18b77
FG
3504
3505 if (cct->_conf->mon_pg_min_inactive > 0 &&
3506 num_inactive_pgs >= cct->_conf->mon_pg_min_inactive) {
3507 ostringstream ss;
b32b8144 3508 ss << num_inactive_pgs << " pgs are stuck inactive for more than " << g_conf->get_val<int64_t>("mon_pg_stuck_threshold") << " seconds";
31f18b77 3509 summary.push_back(make_pair(HEALTH_ERR, ss.str()));
7c673cae 3510 }
7c673cae 3511
31f18b77
FG
3512 if (!note.empty()) {
3513 for (auto p = note.begin(); p != note.end(); ++p) {
3514 ostringstream ss;
3515 ss << p->second << " pgs " << p->first;
3516 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3517 }
3518 if (detail) {
3519 int n = 0, more = 0;
b32b8144 3520 int max = cct->_conf->get_val<uint64_t>("mon_health_max_detail");
31f18b77
FG
3521 for (auto p = pg_stat.begin();
3522 p != pg_stat.end();
3523 ++p) {
3524 if ((p->second.state & (PG_STATE_STALE |
3525 PG_STATE_DOWN |
3526 PG_STATE_UNDERSIZED |
3527 PG_STATE_DEGRADED |
3528 PG_STATE_INCONSISTENT |
3529 PG_STATE_PEERING |
3530 PG_STATE_REPAIR |
3531 PG_STATE_RECOVERING |
3532 PG_STATE_RECOVERY_WAIT |
3533 PG_STATE_RECOVERY_TOOFULL |
3534 PG_STATE_INCOMPLETE |
3535 PG_STATE_BACKFILL_WAIT |
3efd9988 3536 PG_STATE_BACKFILLING |
31f18b77
FG
3537 PG_STATE_BACKFILL_TOOFULL)) &&
3538 stuck_pgs.count(p->first) == 0) {
3539 if (max > 0) {
3540 --max;
3541 } else {
3542 ++more;
3543 continue;
3544 }
3545 ++n;
3546 ostringstream ss;
3547 ss << "pg " << p->first << " is " << pg_state_string(p->second.state);
3548 ss << ", acting " << p->second.acting;
3549 if (p->second.stats.sum.num_objects_unfound)
3550 ss << ", " << p->second.stats.sum.num_objects_unfound << " unfound";
3551 if (p->second.state & PG_STATE_INCOMPLETE) {
3552 const pg_pool_t *pi = osdmap.get_pg_pool(p->first.pool());
3553 if (pi && pi->min_size > 1) {
3554 ss << " (reducing pool " << osdmap.get_pool_name(p->first.pool())
3555 << " min_size from " << (int)pi->min_size
3556 << " may help; search ceph.com/docs for 'incomplete')";
3557 }
3558 }
3559 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3560 }
3561 }
3562 if (more) {
3563 ostringstream ss;
3564 ss << more << " more pgs are also unhealthy";
3565 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3566 }
7c673cae 3567 }
31f18b77
FG
3568 }
3569
3570 // slow requests
3571 if (cct->_conf->mon_osd_warn_op_age > 0 &&
c07f9fc5
FG
3572 osd_sum.op_queue_age_hist.upper_bound() / 1000.0 >
3573 cct->_conf->mon_osd_warn_op_age) {
31f18b77
FG
3574 auto sum = _warn_slow_request_histogram(
3575 cct, osd_sum.op_queue_age_hist, "", summary, NULL);
3576 if (sum.first > 0 || sum.second > 0) {
3577 if (sum.first > 0) {
3578 ostringstream ss;
3579 ss << sum.first << " requests are blocked > "
3580 << cct->_conf->mon_osd_warn_op_age
3581 << " sec";
3582 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3583 }
3584 if (sum.second > 0) {
3585 ostringstream ss;
c07f9fc5 3586 ss << sum.second << " requests are blocked > "
31f18b77
FG
3587 << (cct->_conf->mon_osd_warn_op_age *
3588 cct->_conf->mon_osd_err_op_age_ratio)
3589 << " sec";
3590 summary.push_back(make_pair(HEALTH_ERR, ss.str()));
3591 }
3592
3593 if (detail) {
3594 unsigned num_warn = 0, num_err = 0;
3595 // do per-osd warnings
3596 for (auto p = osd_stat.begin();
3597 p != osd_stat.end();
3598 ++p) {
3599 auto sum = _warn_slow_request_histogram(
3600 cct,
3601 p->second.op_queue_age_hist,
3602 string(" on osd.") + stringify(p->first),
3603 summary, detail);
3604 if (sum.second)
3605 ++num_err;
3606 else if (sum.first)
3607 ++num_warn;
3608 }
3609 if (num_err) {
3610 ostringstream ss2;
3611 ss2 << num_err << " osds have very slow requests";
3612 summary.push_back(make_pair(HEALTH_ERR, ss2.str()));
3613 detail->push_back(make_pair(HEALTH_ERR, ss2.str()));
3614 }
3615 if (num_warn) {
3616 ostringstream ss2;
3617 ss2 << num_warn << " osds have slow requests";
3618 summary.push_back(make_pair(HEALTH_WARN, ss2.str()));
3619 detail->push_back(make_pair(HEALTH_WARN, ss2.str()));
3620 }
3621 }
7c673cae 3622 }
31f18b77
FG
3623 }
3624
31f18b77
FG
3625 // recovery
3626 list<string> sl;
3627 overall_recovery_summary(NULL, &sl);
3628 for (auto p = sl.begin(); p != sl.end(); ++p) {
3629 summary.push_back(make_pair(HEALTH_WARN, "recovery " + *p));
3630 if (detail)
3631 detail->push_back(make_pair(HEALTH_WARN, "recovery " + *p));
3632 }
3633
3634 // near-target max pools
3635 auto& pools = osdmap.get_pools();
3636 for (auto p = pools.begin();
3637 p != pools.end(); ++p) {
3638 if ((!p->second.target_max_objects && !p->second.target_max_bytes) ||
3639 !pg_pool_sum.count(p->first))
3640 continue;
3641 bool nearfull = false;
3642 const string& name = osdmap.get_pool_name(p->first);
3643 const pool_stat_t& st = get_pg_pool_sum_stat(p->first);
3644 uint64_t ratio = p->second.cache_target_full_ratio_micro +
3645 ((1000000 - p->second.cache_target_full_ratio_micro) *
3646 cct->_conf->mon_cache_target_full_warn_ratio);
3647 if (p->second.target_max_objects &&
3648 (uint64_t)(st.stats.sum.num_objects -
3649 st.stats.sum.num_objects_hit_set_archive) >
3650 p->second.target_max_objects * (ratio / 1000000.0)) {
3651 nearfull = true;
3652 if (detail) {
3653 ostringstream ss;
3654 ss << "cache pool '" << name << "' with "
3655 << si_t(st.stats.sum.num_objects)
3656 << " objects at/near target max "
3657 << si_t(p->second.target_max_objects) << " objects";
3658 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3659 }
3660 }
3661 if (p->second.target_max_bytes &&
3662 (uint64_t)(st.stats.sum.num_bytes -
3663 st.stats.sum.num_bytes_hit_set_archive) >
3664 p->second.target_max_bytes * (ratio / 1000000.0)) {
3665 nearfull = true;
3666 if (detail) {
3667 ostringstream ss;
3668 ss << "cache pool '" << name
3669 << "' with " << si_t(st.stats.sum.num_bytes)
3670 << "B at/near target max "
3671 << si_t(p->second.target_max_bytes) << "B";
3672 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3673 }
3674 }
3675 if (nearfull) {
3676 ostringstream ss;
3677 ss << "'" << name << "' at/near target max";
3678 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3679 }
3680 }
7c673cae 3681
31f18b77
FG
3682 // scrub
3683 if (pg_sum.stats.sum.num_scrub_errors) {
3684 ostringstream ss;
3685 ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors";
3686 summary.push_back(make_pair(HEALTH_ERR, ss.str()));
3687 if (detail) {
3688 detail->push_back(make_pair(HEALTH_ERR, ss.str()));
7c673cae 3689 }
31f18b77
FG
3690 }
3691
3692 // pg skew
3efd9988
FG
3693 auto num_in = osdmap.get_num_in_osds();
3694 auto sum_pg_up = MAX(static_cast<unsigned>(pg_sum.up), pg_stat.size());
224ce89b
WB
3695 int sum_objects = pg_sum.stats.sum.num_objects;
3696 if (sum_objects < cct->_conf->mon_pg_warn_min_objects) {
3697 return;
3698 }
3efd9988
FG
3699 const auto min_pg_per_osd =
3700 cct->_conf->get_val<uint64_t>("mon_pg_warn_min_per_osd");
3701 if (num_in && min_pg_per_osd > 0) {
3702 auto per = sum_pg_up / num_in;
3703 if (per < min_pg_per_osd && per) {
31f18b77 3704 ostringstream ss;
3efd9988 3705 ss << "too few PGs per OSD (" << per << " < min " << min_pg_per_osd << ")";
31f18b77
FG
3706 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3707 if (detail)
3708 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
7c673cae 3709 }
31f18b77 3710 }
3efd9988
FG
3711 int64_t max_pg_per_osd = cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd");
3712 if (num_in && max_pg_per_osd > 0) {
31f18b77 3713 int per = sum_pg_up / num_in;
3efd9988 3714 if (per > max_pg_per_osd) {
31f18b77 3715 ostringstream ss;
3efd9988
FG
3716 ss << "too many PGs per OSD (" << per << " > max "
3717 << max_pg_per_osd << ")";
31f18b77
FG
3718 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3719 if (detail)
3720 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3721 }
3722 }
3723 if (!pg_stat.empty()) {
3724 for (auto p = pg_pool_sum.begin();
3725 p != pg_pool_sum.end();
3726 ++p) {
3727 const pg_pool_t *pi = osdmap.get_pg_pool(p->first);
3728 if (!pi)
3729 continue; // in case osdmap changes haven't propagated to PGMap yet
3730 const string& name = osdmap.get_pool_name(p->first);
3731 if (pi->get_pg_num() > pi->get_pgp_num() &&
3732 !(name.find(".DELETED") != string::npos &&
3733 cct->_conf->mon_fake_pool_delete)) {
3734 ostringstream ss;
3735 ss << "pool " << name << " pg_num "
3736 << pi->get_pg_num() << " > pgp_num " << pi->get_pgp_num();
3737 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3738 if (detail)
3739 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3740 }
3741 int average_objects_per_pg = pg_sum.stats.sum.num_objects / pg_stat.size();
3742 if (average_objects_per_pg > 0 &&
3743 pg_sum.stats.sum.num_objects >= cct->_conf->mon_pg_warn_min_objects &&
3744 p->second.stats.sum.num_objects >= cct->_conf->mon_pg_warn_min_pool_objects) {
3745 int objects_per_pg = p->second.stats.sum.num_objects / pi->get_pg_num();
3746 float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
3747 if (cct->_conf->mon_pg_warn_max_object_skew > 0 &&
3748 ratio > cct->_conf->mon_pg_warn_max_object_skew) {
3749 ostringstream ss;
3750 ss << "pool " << name << " has many more objects per pg than average (too few pgs?)";
3751 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3752 if (detail) {
3753 ostringstream ss;
3754 ss << "pool " << name << " objects per pg ("
3755 << objects_per_pg << ") is more than " << ratio << " times cluster average ("
3756 << average_objects_per_pg << ")";
3757 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3758 }
3759 }
3760 }
7c673cae
FG
3761 }
3762 }
7c673cae 3763
224ce89b
WB
3764 for (auto it : pools) {
3765 auto it2 = pg_pool_sum.find(it.first);
3766 if (it2 == pg_pool_sum.end()) {
3767 continue;
3768 }
3769 const pool_stat_t *pstat = &it2->second;
3770 const object_stat_sum_t& sum = pstat->stats.sum;
3771 const string& pool_name = osdmap.get_pool_name(it.first);
3772 const pg_pool_t &pool = it.second;
3773
3774 float warn_threshold = (float)g_conf->mon_pool_quota_warn_threshold/100;
3775 float crit_threshold = (float)g_conf->mon_pool_quota_crit_threshold/100;
3776
3777 if (pool.quota_max_objects > 0) {
3778 stringstream ss;
3779 health_status_t status = HEALTH_OK;
3780 if ((uint64_t)sum.num_objects >= pool.quota_max_objects) {
3781 } else if (crit_threshold > 0 &&
3782 sum.num_objects >= pool.quota_max_objects*crit_threshold) {
3783 ss << "pool '" << pool_name
3784 << "' has " << sum.num_objects << " objects"
3785 << " (max " << pool.quota_max_objects << ")";
3786 status = HEALTH_ERR;
3787 } else if (warn_threshold > 0 &&
3788 sum.num_objects >= pool.quota_max_objects*warn_threshold) {
3789 ss << "pool '" << pool_name
3790 << "' has " << sum.num_objects << " objects"
3791 << " (max " << pool.quota_max_objects << ")";
3792 status = HEALTH_WARN;
3793 }
3794 if (status != HEALTH_OK) {
3795 pair<health_status_t,string> s(status, ss.str());
3796 summary.push_back(s);
3797 if (detail)
3798 detail->push_back(s);
3799 }
3800 }
3801
3802 if (pool.quota_max_bytes > 0) {
3803 health_status_t status = HEALTH_OK;
3804 stringstream ss;
3805 if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
3806 } else if (crit_threshold > 0 &&
3807 sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
3808 ss << "pool '" << pool_name
3809 << "' has " << si_t(sum.num_bytes) << " bytes"
3810 << " (max " << si_t(pool.quota_max_bytes) << ")";
3811 status = HEALTH_ERR;
3812 } else if (warn_threshold > 0 &&
3813 sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
3814 ss << "pool '" << pool_name
3815 << "' has " << si_t(sum.num_bytes) << " bytes"
3816 << " (max " << si_t(pool.quota_max_bytes) << ")";
3817 status = HEALTH_WARN;
3818 }
3819 if (status != HEALTH_OK) {
3820 pair<health_status_t,string> s(status, ss.str());
3821 summary.push_back(s);
3822 if (detail)
3823 detail->push_back(s);
3824 }
3825 }
3826 }
3827
31f18b77
FG
3828 print_unscrubbed_pgs(pg_stat, summary, detail, cct);
3829}
7c673cae
FG
3830
3831int process_pg_map_command(
3832 const string& orig_prefix,
3833 const map<string,cmd_vartype>& orig_cmdmap,
3834 const PGMap& pg_map,
3835 const OSDMap& osdmap,
3836 Formatter *f,
3837 stringstream *ss,
3838 bufferlist *odata)
3839{
3840 string prefix = orig_prefix;
3841 map<string,cmd_vartype> cmdmap = orig_cmdmap;
3842
3843 // perhaps these would be better in the parsing, but it's weird
3844 bool primary = false;
3845 if (prefix == "pg dump_json") {
3846 vector<string> v;
3847 v.push_back(string("all"));
3848 cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
3849 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3850 prefix = "pg dump";
3851 } else if (prefix == "pg dump_pools_json") {
3852 vector<string> v;
3853 v.push_back(string("pools"));
3854 cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
3855 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3856 prefix = "pg dump";
3857 } else if (prefix == "pg ls-by-primary") {
3858 primary = true;
3859 prefix = "pg ls";
3860 } else if (prefix == "pg ls-by-osd") {
3861 prefix = "pg ls";
3862 } else if (prefix == "pg ls-by-pool") {
3863 prefix = "pg ls";
3864 string poolstr;
3865 cmd_getval(g_ceph_context, cmdmap, "poolstr", poolstr);
3866 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
3867 if (pool < 0) {
3868 *ss << "pool " << poolstr << " does not exist";
3869 return -ENOENT;
3870 }
3871 cmd_putval(g_ceph_context, cmdmap, "pool", pool);
3872 }
3873
3874 int r = 0;
3875 stringstream ds;
3876 if (prefix == "pg stat") {
3877 if (f) {
3878 f->open_object_section("pg_summary");
3879 pg_map.print_oneline_summary(f, NULL);
3880 f->close_section();
3881 f->flush(ds);
3882 } else {
3883 ds << pg_map;
3884 }
3885 odata->append(ds);
3886 return 0;
3887 }
3888
3889 if (prefix == "pg getmap") {
3890 pg_map.encode(*odata);
3891 *ss << "got pgmap version " << pg_map.version;
3892 return 0;
3893 }
3894
3895 if (prefix == "pg dump") {
3896 string val;
3897 vector<string> dumpcontents;
3898 set<string> what;
3899 if (cmd_getval(g_ceph_context, cmdmap, "dumpcontents", dumpcontents)) {
3900 copy(dumpcontents.begin(), dumpcontents.end(),
3901 inserter(what, what.end()));
3902 }
3903 if (what.empty())
3904 what.insert("all");
3905 if (f) {
3906 if (what.count("all")) {
3907 f->open_object_section("pg_map");
3908 pg_map.dump(f);
3909 f->close_section();
3910 } else if (what.count("summary") || what.count("sum")) {
3911 f->open_object_section("pg_map");
3912 pg_map.dump_basic(f);
3913 f->close_section();
3914 } else {
3915 if (what.count("pools")) {
3916 pg_map.dump_pool_stats(f);
3917 }
3918 if (what.count("osds")) {
3919 pg_map.dump_osd_stats(f);
3920 }
3921 if (what.count("pgs")) {
3922 pg_map.dump_pg_stats(f, false);
3923 }
3924 if (what.count("pgs_brief")) {
3925 pg_map.dump_pg_stats(f, true);
3926 }
3927 if (what.count("delta")) {
3928 f->open_object_section("delta");
3929 pg_map.dump_delta(f);
3930 f->close_section();
3931 }
3932 }
3933 f->flush(*odata);
3934 } else {
3935 if (what.count("all")) {
3936 pg_map.dump(ds);
3937 } else if (what.count("summary") || what.count("sum")) {
3938 pg_map.dump_basic(ds);
3939 pg_map.dump_pg_sum_stats(ds, true);
3940 pg_map.dump_osd_sum_stats(ds);
3941 } else {
3942 if (what.count("pgs_brief")) {
3943 pg_map.dump_pg_stats(ds, true);
3944 }
3945 bool header = true;
3946 if (what.count("pgs")) {
3947 pg_map.dump_pg_stats(ds, false);
3948 header = false;
3949 }
3950 if (what.count("pools")) {
3951 pg_map.dump_pool_stats(ds, header);
3952 }
3953 if (what.count("osds")) {
3954 pg_map.dump_osd_stats(ds);
3955 }
3956 }
3957 odata->append(ds);
3958 }
3959 *ss << "dumped " << what;
3960 return 0;
3961 }
3962
3963 if (prefix == "pg ls") {
3964 int64_t osd = -1;
3965 int64_t pool = -1;
3966 vector<string>states;
3967 set<pg_t> pgs;
3968 cmd_getval(g_ceph_context, cmdmap, "pool", pool);
3969 cmd_getval(g_ceph_context, cmdmap, "osd", osd);
3970 cmd_getval(g_ceph_context, cmdmap, "states", states);
3971 if (pool >= 0 && !osdmap.have_pg_pool(pool)) {
3972 *ss << "pool " << pool << " does not exist";
3973 return -ENOENT;
3974 }
3975 if (osd >= 0 && !osdmap.is_up(osd)) {
3976 *ss << "osd " << osd << " is not up";
3977 return -EAGAIN;
3978 }
3979 if (states.empty())
3980 states.push_back("all");
3981
3982 uint32_t state = 0;
3983
3984 while (!states.empty()) {
3985 string state_str = states.back();
3986
3987 if (state_str == "all") {
3988 state = -1;
3989 break;
3990 } else {
3efd9988
FG
3991 auto filter = pg_string_state(state_str);
3992 if (!filter) {
c07f9fc5
FG
3993 *ss << "'" << state_str << "' is not a valid pg state,"
3994 << " available choices: " << pg_state_string(0xFFFFFFFF);
3995 return -EINVAL;
3996 }
3efd9988 3997 state |= *filter;
7c673cae
FG
3998 }
3999
4000 states.pop_back();
4001 }
4002
4003 pg_map.get_filtered_pg_stats(state, pool, osd, primary, pgs);
4004
4005 if (f && !pgs.empty()) {
4006 pg_map.dump_filtered_pg_stats(f, pgs);
4007 f->flush(*odata);
4008 } else if (!pgs.empty()) {
4009 pg_map.dump_filtered_pg_stats(ds, pgs);
4010 odata->append(ds);
4011 }
4012 return 0;
4013 }
4014
4015 if (prefix == "pg dump_stuck") {
4016 vector<string> stuckop_vec;
4017 cmd_getval(g_ceph_context, cmdmap, "stuckops", stuckop_vec);
4018 if (stuckop_vec.empty())
4019 stuckop_vec.push_back("unclean");
4020 int64_t threshold;
4021 cmd_getval(g_ceph_context, cmdmap, "threshold", threshold,
b32b8144 4022 g_conf->get_val<int64_t>("mon_pg_stuck_threshold"));
7c673cae
FG
4023
4024 r = pg_map.dump_stuck_pg_stats(ds, f, (int)threshold, stuckop_vec);
4025 odata->append(ds);
4026 if (r < 0)
4027 *ss << "failed";
4028 else
4029 *ss << "ok";
4030 return 0;
4031 }
4032
4033 if (prefix == "pg debug") {
4034 string debugop;
4035 cmd_getval(g_ceph_context, cmdmap, "debugop", debugop,
4036 string("unfound_objects_exist"));
4037 if (debugop == "unfound_objects_exist") {
4038 bool unfound_objects_exist = false;
4039 for (const auto& p : pg_map.pg_stat) {
4040 if (p.second.stats.sum.num_objects_unfound > 0) {
4041 unfound_objects_exist = true;
4042 break;
4043 }
4044 }
4045 if (unfound_objects_exist)
4046 ds << "TRUE";
4047 else
4048 ds << "FALSE";
4049 odata->append(ds);
4050 return 0;
4051 }
4052 if (debugop == "degraded_pgs_exist") {
4053 bool degraded_pgs_exist = false;
4054 for (const auto& p : pg_map.pg_stat) {
4055 if (p.second.stats.sum.num_objects_degraded > 0) {
4056 degraded_pgs_exist = true;
4057 break;
4058 }
4059 }
4060 if (degraded_pgs_exist)
4061 ds << "TRUE";
4062 else
4063 ds << "FALSE";
4064 odata->append(ds);
4065 return 0;
4066 }
4067 }
4068
4069 if (prefix == "osd perf") {
4070 if (f) {
4071 f->open_object_section("osdstats");
4072 pg_map.dump_osd_perf_stats(f);
4073 f->close_section();
4074 f->flush(ds);
4075 } else {
4076 pg_map.print_osd_perf_stats(&ds);
4077 }
4078 odata->append(ds);
4079 return 0;
4080 }
4081
4082 if (prefix == "osd blocked-by") {
4083 if (f) {
4084 f->open_object_section("osd_blocked_by");
4085 pg_map.dump_osd_blocked_by_stats(f);
4086 f->close_section();
4087 f->flush(ds);
4088 } else {
4089 pg_map.print_osd_blocked_by_stats(&ds);
4090 }
4091 odata->append(ds);
4092 return 0;
4093 }
4094
4095 if (prefix == "osd pool stats") {
4096 string pool_name;
4097 cmd_getval(g_ceph_context, cmdmap, "name", pool_name);
4098
4099 int64_t poolid = -ENOENT;
4100 bool one_pool = false;
4101 if (!pool_name.empty()) {
4102 poolid = osdmap.lookup_pg_pool_name(pool_name);
4103 if (poolid < 0) {
4104 assert(poolid == -ENOENT);
4105 *ss << "unrecognized pool '" << pool_name << "'";
4106 return -ENOENT;
4107 }
4108 one_pool = true;
4109 }
4110
4111 stringstream rs;
4112
4113 if (f)
4114 f->open_array_section("pool_stats");
4115 else {
4116 if (osdmap.get_pools().empty()) {
4117 *ss << "there are no pools!";
4118 goto stats_out;
4119 }
4120 }
4121
4122 for (auto& p : osdmap.get_pools()) {
4123 if (!one_pool)
4124 poolid = p.first;
4125
4126 pool_name = osdmap.get_pool_name(poolid);
4127
4128 if (f) {
4129 f->open_object_section("pool");
4130 f->dump_string("pool_name", pool_name.c_str());
4131 f->dump_int("pool_id", poolid);
4132 f->open_object_section("recovery");
4133 }
4134
4135 list<string> sl;
4136 stringstream tss;
4137 pg_map.pool_recovery_summary(f, &sl, poolid);
4138 if (!f && !sl.empty()) {
4139 for (auto& p : sl)
4140 tss << " " << p << "\n";
4141 }
4142
4143 if (f) {
4144 f->close_section();
4145 f->open_object_section("recovery_rate");
4146 }
4147
4148 ostringstream rss;
4149 pg_map.pool_recovery_rate_summary(f, &rss, poolid);
4150 if (!f && !rss.str().empty())
4151 tss << " recovery io " << rss.str() << "\n";
4152
4153 if (f) {
4154 f->close_section();
4155 f->open_object_section("client_io_rate");
4156 }
4157 rss.clear();
4158 rss.str("");
4159
4160 pg_map.pool_client_io_rate_summary(f, &rss, poolid);
4161 if (!f && !rss.str().empty())
4162 tss << " client io " << rss.str() << "\n";
4163
4164 // dump cache tier IO rate for cache pool
4165 const pg_pool_t *pool = osdmap.get_pg_pool(poolid);
4166 if (pool->is_tier()) {
4167 if (f) {
4168 f->close_section();
4169 f->open_object_section("cache_io_rate");
4170 }
4171 rss.clear();
4172 rss.str("");
4173
4174 pg_map.pool_cache_io_rate_summary(f, &rss, poolid);
4175 if (!f && !rss.str().empty())
4176 tss << " cache tier io " << rss.str() << "\n";
4177 }
4178 if (f) {
4179 f->close_section();
4180 f->close_section();
4181 } else {
4182 rs << "pool " << pool_name << " id " << poolid << "\n";
4183 if (!tss.str().empty())
4184 rs << tss.str() << "\n";
4185 else
4186 rs << " nothing is going on\n\n";
4187 }
4188 if (one_pool)
4189 break;
4190 }
4191
4192stats_out:
4193 if (f) {
4194 f->close_section();
4195 f->flush(ds);
4196 odata->append(ds);
4197 } else {
4198 odata->append(rs.str());
4199 }
4200 return 0;
4201 }
4202
4203 return -EOPNOTSUPP;
4204}
4205
4206void PGMapUpdater::check_osd_map(const OSDMap::Incremental &osd_inc,
4207 std::set<int> *need_check_down_pg_osds,
4208 std::map<int,utime_t> *last_osd_report,
4209 PGMap *pg_map,
4210 PGMap::Incremental *pending_inc)
4211{
4212 for (const auto &p : osd_inc.new_weight) {
4213 if (p.second == CEPH_OSD_OUT) {
4214 dout(10) << __func__ << " osd." << p.first << " went OUT" << dendl;
31f18b77
FG
4215 auto j = pg_map->osd_epochs.find(p.first);
4216 if (j != pg_map->osd_epochs.end())
4217 pending_inc->stat_osd_out(p.first, j->second);
7c673cae
FG
4218 }
4219 }
4220
4221 // this is conservative: we want to know if any osds (maybe) got marked down.
4222 for (const auto &p : osd_inc.new_state) {
4223 if (p.second & CEPH_OSD_UP) { // true if marked up OR down,
4224 // but we're too lazy to check
4225 // which
4226 need_check_down_pg_osds->insert(p.first);
4227
4228 // clear out the last_osd_report for this OSD
31f18b77 4229 auto report = last_osd_report->find(p.first);
7c673cae
FG
4230 if (report != last_osd_report->end()) {
4231 last_osd_report->erase(report);
4232 }
4233
4234 // clear out osd_stat slow request histogram
4235 dout(20) << __func__ << " clearing osd." << p.first
4236 << " request histogram" << dendl;
31f18b77 4237 pending_inc->stat_osd_down_up(p.first, osd_inc.epoch, *pg_map);
7c673cae
FG
4238 }
4239
4240 if (p.second & CEPH_OSD_EXISTS) {
4241 // whether it was created *or* destroyed, we can safely drop
4242 // it's osd_stat_t record.
4243 dout(10) << __func__ << " osd." << p.first
4244 << " created or destroyed" << dendl;
4245 pending_inc->rm_stat(p.first);
4246
4247 // and adjust full, nearfull set
4248 pg_map->nearfull_osds.erase(p.first);
4249 pg_map->full_osds.erase(p.first);
4250 }
4251 }
4252}
4253
31f18b77
FG
4254void PGMapUpdater::check_osd_map(
4255 CephContext *cct,
4256 const OSDMap& osdmap,
4257 const PGMap& pgmap,
4258 PGMap::Incremental *pending_inc)
4259{
4260 for (auto& p : pgmap.osd_stat) {
4261 if (!osdmap.exists(p.first)) {
4262 // remove osd_stat
4263 pending_inc->rm_stat(p.first);
4264 } else if (osdmap.is_out(p.first)) {
4265 // zero osd_stat
4266 if (p.second.kb != 0) {
4267 auto j = pgmap.osd_epochs.find(p.first);
4268 if (j != pgmap.osd_epochs.end()) {
4269 pending_inc->stat_osd_out(p.first, j->second);
4270 }
4271 }
4272 } else if (!osdmap.is_up(p.first)) {
4273 // zero the op_queue_age_hist
4274 if (!p.second.op_queue_age_hist.empty()) {
4275 pending_inc->stat_osd_down_up(p.first, osdmap.get_epoch(), pgmap);
4276 }
4277 }
4278 }
4279
4280 // deleted pgs (pools)?
4281 for (auto& p : pgmap.pg_pool_sum) {
4282 if (!osdmap.have_pg_pool(p.first)) {
4283 ldout(cct, 10) << __func__ << " pool " << p.first << " gone, removing pgs"
4284 << dendl;
4285 for (auto& q : pgmap.pg_stat) {
4286 if (q.first.pool() == (uint64_t)p.first) {
4287 pending_inc->pg_remove.insert(q.first);
4288 }
4289 }
4290 auto q = pending_inc->pg_stat_updates.begin();
4291 while (q != pending_inc->pg_stat_updates.end()) {
4292 if (q->first.pool() == (uint64_t)p.first) {
4293 q = pending_inc->pg_stat_updates.erase(q);
4294 } else {
4295 ++q;
4296 }
4297 }
4298 }
4299 }
4300
4301 // new pgs (split or new pool)?
4302 for (auto& p : osdmap.get_pools()) {
4303 int64_t poolid = p.first;
4304 const pg_pool_t& pi = p.second;
4305 auto q = pgmap.num_pg_by_pool.find(poolid);
4306 unsigned my_pg_num = 0;
4307 if (q != pgmap.num_pg_by_pool.end())
4308 my_pg_num = q->second;
4309 unsigned pg_num = pi.get_pg_num();
4310 if (my_pg_num != pg_num) {
224ce89b
WB
4311 ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
4312 << " != my pg_num " << my_pg_num << dendl;
31f18b77
FG
4313 for (unsigned ps = my_pg_num; ps < pg_num; ++ps) {
4314 pg_t pgid(ps, poolid);
4315 if (pending_inc->pg_stat_updates.count(pgid) == 0) {
224ce89b 4316 ldout(cct,20) << __func__ << " adding " << pgid << dendl;
31f18b77
FG
4317 pg_stat_t &stats = pending_inc->pg_stat_updates[pgid];
4318 stats.last_fresh = osdmap.get_modified();
4319 stats.last_active = osdmap.get_modified();
4320 stats.last_change = osdmap.get_modified();
4321 stats.last_peered = osdmap.get_modified();
4322 stats.last_clean = osdmap.get_modified();
4323 stats.last_unstale = osdmap.get_modified();
4324 stats.last_undegraded = osdmap.get_modified();
4325 stats.last_fullsized = osdmap.get_modified();
4326 stats.last_scrub_stamp = osdmap.get_modified();
4327 stats.last_deep_scrub_stamp = osdmap.get_modified();
4328 stats.last_clean_scrub_stamp = osdmap.get_modified();
4329 }
4330 }
4331 }
4332 }
4333}
4334
7c673cae
FG
4335void PGMapUpdater::register_pg(
4336 const OSDMap &osd_map,
4337 pg_t pgid, epoch_t epoch,
4338 bool new_pool,
4339 const PGMap &pg_map,
4340 PGMap::Incremental *pending_inc)
4341{
4342 pg_t parent;
4343 int split_bits = 0;
4344 auto parent_stat = pg_map.pg_stat.end();
4345 if (!new_pool) {
4346 parent = pgid;
4347 while (1) {
4348 // remove most significant bit
4349 int msb = cbits(parent.ps());
4350 if (!msb)
4351 break;
4352 parent.set_ps(parent.ps() & ~(1<<(msb-1)));
4353 split_bits++;
4354 dout(30) << " is " << pgid << " parent " << parent << " ?" << dendl;
4355 parent_stat = pg_map.pg_stat.find(parent);
4356 if (parent_stat != pg_map.pg_stat.end() &&
4357 parent_stat->second.state != PG_STATE_CREATING) {
4358 dout(10) << " parent is " << parent << dendl;
4359 break;
4360 }
4361 }
4362 }
4363
4364 pg_stat_t &stats = pending_inc->pg_stat_updates[pgid];
4365 stats.state = PG_STATE_CREATING;
4366 stats.created = epoch;
4367 stats.parent = parent;
4368 stats.parent_split_bits = split_bits;
4369 stats.mapping_epoch = epoch;
4370
4371 if (parent_stat != pg_map.pg_stat.end()) {
4372 const pg_stat_t &ps = parent_stat->second;
4373 stats.last_fresh = ps.last_fresh;
4374 stats.last_active = ps.last_active;
4375 stats.last_change = ps.last_change;
4376 stats.last_peered = ps.last_peered;
4377 stats.last_clean = ps.last_clean;
4378 stats.last_unstale = ps.last_unstale;
4379 stats.last_undegraded = ps.last_undegraded;
4380 stats.last_fullsized = ps.last_fullsized;
4381 stats.last_scrub_stamp = ps.last_scrub_stamp;
4382 stats.last_deep_scrub_stamp = ps.last_deep_scrub_stamp;
4383 stats.last_clean_scrub_stamp = ps.last_clean_scrub_stamp;
4384 } else {
4385 utime_t now = osd_map.get_modified();
4386 stats.last_fresh = now;
4387 stats.last_active = now;
4388 stats.last_change = now;
4389 stats.last_peered = now;
4390 stats.last_clean = now;
4391 stats.last_unstale = now;
4392 stats.last_undegraded = now;
4393 stats.last_fullsized = now;
4394 stats.last_scrub_stamp = now;
4395 stats.last_deep_scrub_stamp = now;
4396 stats.last_clean_scrub_stamp = now;
4397 }
4398
4399 osd_map.pg_to_up_acting_osds(
4400 pgid,
4401 &stats.up,
4402 &stats.up_primary,
4403 &stats.acting,
4404 &stats.acting_primary);
4405
4406 if (split_bits == 0) {
4407 dout(10) << __func__ << " will create " << pgid
4408 << " primary " << stats.acting_primary
4409 << " acting " << stats.acting
4410 << dendl;
4411 } else {
4412 dout(10) << __func__ << " will create " << pgid
4413 << " primary " << stats.acting_primary
4414 << " acting " << stats.acting
4415 << " parent " << parent
4416 << " by " << split_bits << " bits"
4417 << dendl;
4418 }
4419}
4420
4421void PGMapUpdater::register_new_pgs(
4422 const OSDMap &osd_map,
4423 const PGMap &pg_map,
4424 PGMap::Incremental *pending_inc)
4425{
4426 epoch_t epoch = osd_map.get_epoch();
4427 dout(10) << __func__ << " checking pg pools for osdmap epoch " << epoch
4428 << ", last_pg_scan " << pg_map.last_pg_scan << dendl;
4429
4430 int created = 0;
4431 const auto &pools = osd_map.get_pools();
4432
4433 for (const auto &p : pools) {
4434 int64_t poolid = p.first;
4435 const pg_pool_t &pool = p.second;
31f18b77 4436 int ruleno = osd_map.crush->find_rule(pool.get_crush_rule(),
7c673cae
FG
4437 pool.get_type(), pool.get_size());
4438 if (ruleno < 0 || !osd_map.crush->rule_exists(ruleno))
4439 continue;
4440
4441 if (pool.get_last_change() <= pg_map.last_pg_scan ||
4442 pool.get_last_change() <= pending_inc->pg_scan) {
4443 dout(10) << " no change in pool " << poolid << " " << pool << dendl;
4444 continue;
4445 }
4446
4447 dout(10) << __func__ << " scanning pool " << poolid
4448 << " " << pool << dendl;
4449
4450 // first pgs in this pool
4451 bool new_pool = pg_map.pg_pool_sum.count(poolid) == 0;
4452
4453 for (ps_t ps = 0; ps < pool.get_pg_num(); ps++) {
4454 pg_t pgid(ps, poolid, -1);
4455 if (pg_map.pg_stat.count(pgid)) {
4456 dout(20) << "register_new_pgs have " << pgid << dendl;
4457 continue;
4458 }
4459 created++;
4460 register_pg(osd_map, pgid, pool.get_last_change(), new_pool,
4461 pg_map, pending_inc);
4462 }
4463 }
4464
4465 int removed = 0;
4466 for (const auto &p : pg_map.creating_pgs) {
4467 if (p.preferred() >= 0) {
4468 dout(20) << " removing creating_pg " << p
4469 << " because it is localized and obsolete" << dendl;
4470 pending_inc->pg_remove.insert(p);
4471 ++removed;
4472 } else if (!osd_map.have_pg_pool(p.pool())) {
4473 dout(20) << " removing creating_pg " << p
4474 << " because containing pool deleted" << dendl;
4475 pending_inc->pg_remove.insert(p);
4476 ++removed;
4477 }
4478 }
4479
4480 // deleted pools?
4481 for (const auto &p : pg_map.pg_stat) {
4482 if (!osd_map.have_pg_pool(p.first.pool())) {
4483 dout(20) << " removing pg_stat " << p.first << " because "
4484 << "containing pool deleted" << dendl;
4485 pending_inc->pg_remove.insert(p.first);
4486 ++removed;
4487 } else if (p.first.preferred() >= 0) {
4488 dout(20) << " removing localized pg " << p.first << dendl;
4489 pending_inc->pg_remove.insert(p.first);
4490 ++removed;
4491 }
4492 }
4493
4494 // we don't want to redo this work if we can avoid it.
4495 pending_inc->pg_scan = epoch;
4496
4497 dout(10) << "register_new_pgs registered " << created << " new pgs, removed "
4498 << removed << " uncreated pgs" << dendl;
4499}
4500
4501
4502void PGMapUpdater::update_creating_pgs(
4503 const OSDMap &osd_map,
4504 const PGMap &pg_map,
4505 PGMap::Incremental *pending_inc)
4506{
4507 dout(10) << __func__ << " to " << pg_map.creating_pgs.size()
4508 << " pgs, osdmap epoch " << osd_map.get_epoch()
4509 << dendl;
4510
4511 unsigned changed = 0;
31f18b77 4512 for (auto p = pg_map.creating_pgs.begin();
7c673cae
FG
4513 p != pg_map.creating_pgs.end();
4514 ++p) {
4515 pg_t pgid = *p;
4516 pg_t on = pgid;
31f18b77 4517 auto q = pg_map.pg_stat.find(pgid);
7c673cae
FG
4518 assert(q != pg_map.pg_stat.end());
4519 const pg_stat_t *s = &q->second;
4520
4521 if (s->parent_split_bits)
4522 on = s->parent;
4523
4524 vector<int> up, acting;
4525 int up_primary, acting_primary;
4526 osd_map.pg_to_up_acting_osds(
4527 on,
4528 &up,
4529 &up_primary,
4530 &acting,
4531 &acting_primary);
4532
4533 if (up != s->up ||
4534 up_primary != s->up_primary ||
4535 acting != s->acting ||
4536 acting_primary != s->acting_primary) {
4537 pg_stat_t *ns = &pending_inc->pg_stat_updates[pgid];
4538 if (osd_map.get_epoch() > ns->reported_epoch) {
4539 dout(20) << __func__ << " " << pgid << " "
4540 << " acting_primary: " << s->acting_primary
4541 << " -> " << acting_primary
4542 << " acting: " << s->acting << " -> " << acting
4543 << " up_primary: " << s->up_primary << " -> " << up_primary
4544 << " up: " << s->up << " -> " << up
4545 << dendl;
4546
4547 // only initialize if it wasn't already a pending update
4548 if (ns->reported_epoch == 0)
4549 *ns = *s;
4550
4551 // note epoch if the target of the create message changed
4552 if (acting_primary != ns->acting_primary)
4553 ns->mapping_epoch = osd_map.get_epoch();
4554
4555 ns->up = up;
4556 ns->up_primary = up_primary;
4557 ns->acting = acting;
4558 ns->acting_primary = acting_primary;
4559
4560 ++changed;
4561 } else {
4562 dout(20) << __func__ << " " << pgid << " has pending update from newer"
4563 << " epoch " << ns->reported_epoch
4564 << dendl;
4565 }
4566 }
4567 }
4568 if (changed) {
4569 dout(10) << __func__ << " " << changed << " pgs changed primary" << dendl;
4570 }
4571}
4572
4573static void _try_mark_pg_stale(
4574 const OSDMap& osdmap,
4575 pg_t pgid,
4576 const pg_stat_t& cur,
4577 PGMap::Incremental *pending_inc)
4578{
4579 if ((cur.state & PG_STATE_STALE) == 0 &&
4580 cur.acting_primary != -1 &&
4581 osdmap.is_down(cur.acting_primary)) {
4582 pg_stat_t *newstat;
4583 auto q = pending_inc->pg_stat_updates.find(pgid);
4584 if (q != pending_inc->pg_stat_updates.end()) {
4585 if ((q->second.acting_primary == cur.acting_primary) ||
4586 ((q->second.state & PG_STATE_STALE) == 0 &&
4587 q->second.acting_primary != -1 &&
4588 osdmap.is_down(q->second.acting_primary))) {
4589 newstat = &q->second;
4590 } else {
4591 // pending update is no longer down or already stale
4592 return;
4593 }
4594 } else {
4595 newstat = &pending_inc->pg_stat_updates[pgid];
4596 *newstat = cur;
4597 }
4598 dout(10) << __func__ << " marking pg " << pgid
4599 << " stale (acting_primary " << newstat->acting_primary
4600 << ")" << dendl;
4601 newstat->state |= PG_STATE_STALE;
4602 newstat->last_unstale = ceph_clock_now();
4603 }
4604}
4605
4606void PGMapUpdater::check_down_pgs(
4607 const OSDMap &osdmap,
4608 const PGMap &pg_map,
4609 bool check_all,
4610 const set<int>& need_check_down_pg_osds,
4611 PGMap::Incremental *pending_inc)
4612{
4613 // if a large number of osds changed state, just iterate over the whole
4614 // pg map.
4615 if (need_check_down_pg_osds.size() > (unsigned)osdmap.get_num_osds() *
b32b8144 4616 g_conf->get_val<double>("mon_pg_check_down_all_threshold")) {
7c673cae
FG
4617 check_all = true;
4618 }
4619
4620 if (check_all) {
4621 for (const auto& p : pg_map.pg_stat) {
4622 _try_mark_pg_stale(osdmap, p.first, p.second, pending_inc);
4623 }
4624 } else {
4625 for (auto osd : need_check_down_pg_osds) {
4626 if (osdmap.is_down(osd)) {
4627 auto p = pg_map.pg_by_osd.find(osd);
4628 if (p == pg_map.pg_by_osd.end()) {
4629 continue;
4630 }
4631 for (auto pgid : p->second) {
4632 const pg_stat_t &stat = pg_map.pg_stat.at(pgid);
4633 assert(stat.acting_primary == osd);
4634 _try_mark_pg_stale(osdmap, pgid, stat, pending_inc);
4635 }
4636 }
4637 }
4638 }
4639}
4640
4641int reweight::by_utilization(
4642 const OSDMap &osdmap,
4643 const PGMap &pgm,
4644 int oload,
4645 double max_changef,
4646 int max_osds,
4647 bool by_pg, const set<int64_t> *pools,
4648 bool no_increasing,
4649 mempool::osdmap::map<int32_t, uint32_t>* new_weights,
4650 std::stringstream *ss,
4651 std::string *out_str,
4652 Formatter *f)
4653{
4654 if (oload <= 100) {
4655 *ss << "You must give a percentage higher than 100. "
4656 "The reweighting threshold will be calculated as <average-utilization> "
4657 "times <input-percentage>. For example, an argument of 200 would "
4658 "reweight OSDs which are twice as utilized as the average OSD.\n";
4659 return -EINVAL;
4660 }
4661
4662 vector<int> pgs_by_osd(osdmap.get_max_osd());
4663
4664 // Avoid putting a small number (or 0) in the denominator when calculating
4665 // average_util
4666 double average_util;
4667 if (by_pg) {
4668 // by pg mapping
4669 double weight_sum = 0.0; // sum up the crush weights
4670 unsigned num_pg_copies = 0;
4671 int num_osds = 0;
4672 for (const auto& pg : pgm.pg_stat) {
4673 if (pools && pools->count(pg.first.pool()) == 0)
4674 continue;
4675 for (const auto acting : pg.second.acting) {
b5b8bbf5
FG
4676 if (!osdmap.exists(acting)) {
4677 continue;
4678 }
7c673cae
FG
4679 if (acting >= (int)pgs_by_osd.size())
4680 pgs_by_osd.resize(acting);
4681 if (pgs_by_osd[acting] == 0) {
4682 if (osdmap.crush->get_item_weightf(acting) <= 0) {
4683 //skip if we currently can not identify item
4684 continue;
4685 }
4686 weight_sum += osdmap.crush->get_item_weightf(acting);
4687 ++num_osds;
4688 }
4689 ++pgs_by_osd[acting];
4690 ++num_pg_copies;
4691 }
4692 }
4693
4694 if (!num_osds || (num_pg_copies / num_osds < g_conf->mon_reweight_min_pgs_per_osd)) {
4695 *ss << "Refusing to reweight: we only have " << num_pg_copies
4696 << " PGs across " << num_osds << " osds!\n";
4697 return -EDOM;
4698 }
4699
4700 average_util = (double)num_pg_copies / weight_sum;
4701 } else {
4702 // by osd utilization
4703 int num_osd = MAX(1, pgm.osd_stat.size());
4704 if ((uint64_t)pgm.osd_sum.kb * 1024 / num_osd
4705 < g_conf->mon_reweight_min_bytes_per_osd) {
4706 *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb
4707 << " kb across all osds!\n";
4708 return -EDOM;
4709 }
4710 if ((uint64_t)pgm.osd_sum.kb_used * 1024 / num_osd
4711 < g_conf->mon_reweight_min_bytes_per_osd) {
4712 *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb_used
4713 << " kb used across all osds!\n";
4714 return -EDOM;
4715 }
4716
4717 average_util = (double)pgm.osd_sum.kb_used / (double)pgm.osd_sum.kb;
4718 }
4719
4720 // adjust down only if we are above the threshold
4721 const double overload_util = average_util * (double)oload / 100.0;
4722
4723 // but aggressively adjust weights up whenever possible.
4724 const double underload_util = average_util;
4725
4726 const unsigned max_change = (unsigned)(max_changef * (double)0x10000);
4727
4728 ostringstream oss;
4729 if (f) {
4730 f->open_object_section("reweight_by_utilization");
4731 f->dump_int("overload_min", oload);
4732 f->dump_float("max_change", max_changef);
4733 f->dump_int("max_change_osds", max_osds);
4734 f->dump_float("average_utilization", average_util);
4735 f->dump_float("overload_utilization", overload_util);
4736 } else {
4737 oss << "oload " << oload << "\n";
4738 oss << "max_change " << max_changef << "\n";
4739 oss << "max_change_osds " << max_osds << "\n";
4740 oss.precision(4);
4741 oss << "average_utilization " << std::fixed << average_util << "\n";
4742 oss << "overload_utilization " << overload_util << "\n";
4743 }
4744 int num_changed = 0;
4745
4746 // precompute util for each OSD
4747 std::vector<std::pair<int, float> > util_by_osd;
4748 for (const auto& p : pgm.osd_stat) {
4749 std::pair<int, float> osd_util;
4750 osd_util.first = p.first;
4751 if (by_pg) {
4752 if (p.first >= (int)pgs_by_osd.size() ||
4753 pgs_by_osd[p.first] == 0) {
4754 // skip if this OSD does not contain any pg
4755 // belonging to the specified pool(s).
4756 continue;
4757 }
4758
4759 if (osdmap.crush->get_item_weightf(p.first) <= 0) {
4760 // skip if we are unable to locate item.
4761 continue;
4762 }
4763
4764 osd_util.second = pgs_by_osd[p.first] / osdmap.crush->get_item_weightf(p.first);
4765 } else {
4766 osd_util.second = (double)p.second.kb_used / (double)p.second.kb;
4767 }
4768 util_by_osd.push_back(osd_util);
4769 }
4770
4771 // sort by absolute deviation from the mean utilization,
4772 // in descending order.
4773 std::sort(util_by_osd.begin(), util_by_osd.end(),
4774 [average_util](std::pair<int, float> l, std::pair<int, float> r) {
4775 return abs(l.second - average_util) > abs(r.second - average_util);
4776 }
4777 );
4778
4779 if (f)
4780 f->open_array_section("reweights");
4781
4782 for (const auto& p : util_by_osd) {
4783 unsigned weight = osdmap.get_weight(p.first);
4784 if (weight == 0) {
4785 // skip if OSD is currently out
4786 continue;
4787 }
4788 float util = p.second;
4789
4790 if (util >= overload_util) {
4791 // Assign a lower weight to overloaded OSDs. The current weight
4792 // is a factor to take into account the original weights,
4793 // to represent e.g. differing storage capacities
4794 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
4795 if (weight > max_change)
4796 new_weight = MAX(new_weight, weight - max_change);
4797 new_weights->insert({p.first, new_weight});
4798 if (f) {
4799 f->open_object_section("osd");
4800 f->dump_int("osd", p.first);
4801 f->dump_float("weight", (float)weight / (float)0x10000);
4802 f->dump_float("new_weight", (float)new_weight / (float)0x10000);
4803 f->close_section();
4804 } else {
4805 oss << "osd." << p.first << " weight "
4806 << (float)weight / (float)0x10000 << " -> "
4807 << (float)new_weight / (float)0x10000 << "\n";
4808 }
4809 if (++num_changed >= max_osds)
4810 break;
4811 }
4812 if (!no_increasing && util <= underload_util) {
4813 // assign a higher weight.. if we can.
4814 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
4815 new_weight = MIN(new_weight, weight + max_change);
4816 if (new_weight > 0x10000)
4817 new_weight = 0x10000;
4818 if (new_weight > weight) {
4819 new_weights->insert({p.first, new_weight});
4820 oss << "osd." << p.first << " weight "
4821 << (float)weight / (float)0x10000 << " -> "
4822 << (float)new_weight / (float)0x10000 << "\n";
4823 if (++num_changed >= max_osds)
4824 break;
4825 }
4826 }
4827 }
4828 if (f) {
4829 f->close_section();
4830 }
4831
4832 OSDMap newmap;
4833 newmap.deepish_copy_from(osdmap);
4834 OSDMap::Incremental newinc;
4835 newinc.fsid = newmap.get_fsid();
4836 newinc.epoch = newmap.get_epoch() + 1;
4837 newinc.new_weight = *new_weights;
4838 newmap.apply_incremental(newinc);
4839
4840 osdmap.summarize_mapping_stats(&newmap, pools, out_str, f);
4841
4842 if (f) {
4843 f->close_section();
4844 } else {
4845 *out_str += "\n";
4846 *out_str += oss.str();
4847 }
4848 return num_changed;
4849}