]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/PGMap.cc
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / mon / PGMap.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#include "PGMap.h"
5
6#define dout_subsys ceph_subsys_mon
7#include "common/debug.h"
8#include "common/Formatter.h"
9#include "include/ceph_features.h"
10#include "include/stringify.h"
11
12#include "osd/osd_types.h"
13#include "osd/OSDMap.h"
14
15#define dout_context g_ceph_context
16
17// --
18
19void PGMap::Incremental::encode(bufferlist &bl, uint64_t features) const
20{
21 if ((features & CEPH_FEATURE_MONENC) == 0) {
22 __u8 v = 4;
23 ::encode(v, bl);
24 ::encode(version, bl);
25 ::encode(pg_stat_updates, bl);
26 ::encode(osd_stat_updates, bl);
27 ::encode(osd_stat_rm, bl);
28 ::encode(osdmap_epoch, bl);
29 ::encode(pg_scan, bl);
30 ::encode(full_ratio, bl);
31 ::encode(nearfull_ratio, bl);
32 ::encode(pg_remove, bl);
33 return;
34 }
35
36 ENCODE_START(7, 5, bl);
37 ::encode(version, bl);
38 ::encode(pg_stat_updates, bl);
39 ::encode(osd_stat_updates, bl);
40 ::encode(osd_stat_rm, bl);
41 ::encode(osdmap_epoch, bl);
42 ::encode(pg_scan, bl);
43 ::encode(full_ratio, bl);
44 ::encode(nearfull_ratio, bl);
45 ::encode(pg_remove, bl);
46 ::encode(stamp, bl);
47 ::encode(osd_epochs, bl);
48 ENCODE_FINISH(bl);
49}
50
51void PGMap::Incremental::decode(bufferlist::iterator &bl)
52{
53 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
54 ::decode(version, bl);
55 if (struct_v < 3) {
56 pg_stat_updates.clear();
57 __u32 n;
58 ::decode(n, bl);
59 while (n--) {
60 old_pg_t opgid;
61 ::decode(opgid, bl);
62 pg_t pgid = opgid;
63 ::decode(pg_stat_updates[pgid], bl);
64 }
65 } else {
66 ::decode(pg_stat_updates, bl);
67 }
68 ::decode(osd_stat_updates, bl);
69 ::decode(osd_stat_rm, bl);
70 ::decode(osdmap_epoch, bl);
71 ::decode(pg_scan, bl);
72 if (struct_v >= 2) {
73 ::decode(full_ratio, bl);
74 ::decode(nearfull_ratio, bl);
75 }
76 if (struct_v < 3) {
77 pg_remove.clear();
78 __u32 n;
79 ::decode(n, bl);
80 while (n--) {
81 old_pg_t opgid;
82 ::decode(opgid, bl);
83 pg_remove.insert(pg_t(opgid));
84 }
85 } else {
86 ::decode(pg_remove, bl);
87 }
88 if (struct_v < 4 && full_ratio == 0) {
89 full_ratio = -1;
90 }
91 if (struct_v < 4 && nearfull_ratio == 0) {
92 nearfull_ratio = -1;
93 }
94 if (struct_v >= 6)
95 ::decode(stamp, bl);
96 if (struct_v >= 7) {
97 ::decode(osd_epochs, bl);
98 } else {
99 for (map<int32_t, osd_stat_t>::iterator i = osd_stat_updates.begin();
100 i != osd_stat_updates.end();
101 ++i) {
102 // This isn't accurate, but will cause trimming to behave like
103 // previously.
104 osd_epochs.insert(make_pair(i->first, osdmap_epoch));
105 }
106 }
107 DECODE_FINISH(bl);
108}
109
110void PGMap::Incremental::dump(Formatter *f) const
111{
112 f->dump_unsigned("version", version);
113 f->dump_stream("stamp") << stamp;
114 f->dump_unsigned("osdmap_epoch", osdmap_epoch);
115 f->dump_unsigned("pg_scan_epoch", pg_scan);
116 f->dump_float("full_ratio", full_ratio);
117 f->dump_float("nearfull_ratio", nearfull_ratio);
118
119 f->open_array_section("pg_stat_updates");
120 for (map<pg_t,pg_stat_t>::const_iterator p = pg_stat_updates.begin(); p != pg_stat_updates.end(); ++p) {
121 f->open_object_section("pg_stat");
122 f->dump_stream("pgid") << p->first;
123 p->second.dump(f);
124 f->close_section();
125 }
126 f->close_section();
127
128 f->open_array_section("osd_stat_updates");
129 for (map<int32_t,osd_stat_t>::const_iterator p = osd_stat_updates.begin(); p != osd_stat_updates.end(); ++p) {
130 f->open_object_section("osd_stat");
131 f->dump_int("osd", p->first);
132 p->second.dump(f);
133 f->close_section();
134 }
135 f->close_section();
136
137 f->open_array_section("osd_stat_removals");
138 for (set<int>::const_iterator p = osd_stat_rm.begin(); p != osd_stat_rm.end(); ++p)
139 f->dump_int("osd", *p);
140 f->close_section();
141
142 f->open_array_section("pg_removals");
143 for (set<pg_t>::const_iterator p = pg_remove.begin(); p != pg_remove.end(); ++p)
144 f->dump_stream("pgid") << *p;
145 f->close_section();
146}
147
148void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
149{
150 o.push_back(new Incremental);
151 o.push_back(new Incremental);
152 o.back()->version = 1;
153 o.back()->stamp = utime_t(123,345);
154 o.push_back(new Incremental);
155 o.back()->version = 2;
156 o.back()->pg_stat_updates[pg_t(1,2,3)] = pg_stat_t();
157 o.back()->osd_stat_updates[5] = osd_stat_t();
158 o.back()->osd_epochs[5] = 12;
159 o.push_back(new Incremental);
160 o.back()->version = 3;
161 o.back()->osdmap_epoch = 1;
162 o.back()->pg_scan = 2;
163 o.back()->full_ratio = .2;
164 o.back()->nearfull_ratio = .3;
165 o.back()->pg_stat_updates[pg_t(4,5,6)] = pg_stat_t();
166 o.back()->osd_stat_updates[6] = osd_stat_t();
167 o.back()->osd_epochs[6] = 12;
168 o.back()->pg_remove.insert(pg_t(1,2,3));
169 o.back()->osd_stat_rm.insert(5);
170}
171
172
173// --
174
175void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
176{
177 assert(inc.version == version+1);
178 version++;
179
180 utime_t delta_t;
181 delta_t = inc.stamp;
182 delta_t -= stamp;
183 stamp = inc.stamp;
184
185 pool_stat_t pg_sum_old = pg_sum;
186 ceph::unordered_map<uint64_t, pool_stat_t> pg_pool_sum_old;
187
188 bool ratios_changed = false;
189 if (inc.full_ratio != full_ratio && inc.full_ratio != -1) {
190 full_ratio = inc.full_ratio;
191 ratios_changed = true;
192 }
193 if (inc.nearfull_ratio != nearfull_ratio && inc.nearfull_ratio != -1) {
194 nearfull_ratio = inc.nearfull_ratio;
195 ratios_changed = true;
196 }
197 if (ratios_changed)
198 redo_full_sets();
199
200 for (map<pg_t,pg_stat_t>::const_iterator p = inc.pg_stat_updates.begin();
201 p != inc.pg_stat_updates.end();
202 ++p) {
203 const pg_t &update_pg(p->first);
204 const pg_stat_t &update_stat(p->second);
205
206 if (pg_pool_sum_old.count(update_pg.pool()) == 0)
207 pg_pool_sum_old[update_pg.pool()] = pg_pool_sum[update_pg.pool()];
208
209 ceph::unordered_map<pg_t,pg_stat_t>::iterator t = pg_stat.find(update_pg);
210 if (t == pg_stat.end()) {
211 ceph::unordered_map<pg_t,pg_stat_t>::value_type v(update_pg, update_stat);
212 pg_stat.insert(v);
213 } else {
214 stat_pg_sub(update_pg, t->second);
215 t->second = update_stat;
216 }
217 stat_pg_add(update_pg, update_stat);
218 }
219 assert(osd_stat.size() == osd_epochs.size());
220 for (map<int32_t,osd_stat_t>::const_iterator p =
221 inc.get_osd_stat_updates().begin();
222 p != inc.get_osd_stat_updates().end();
223 ++p) {
224 int osd = p->first;
225 const osd_stat_t &new_stats(p->second);
226
227 ceph::unordered_map<int32_t,osd_stat_t>::iterator t = osd_stat.find(osd);
228 if (t == osd_stat.end()) {
229 ceph::unordered_map<int32_t,osd_stat_t>::value_type v(osd, new_stats);
230 osd_stat.insert(v);
231 } else {
232 stat_osd_sub(t->second);
233 t->second = new_stats;
234 }
235 ceph::unordered_map<int32_t,epoch_t>::iterator i = osd_epochs.find(osd);
236 map<int32_t,epoch_t>::const_iterator j = inc.get_osd_epochs().find(osd);
237 assert(j != inc.get_osd_epochs().end());
238
239 if (i == osd_epochs.end())
240 osd_epochs.insert(*j);
241 else
242 i->second = j->second;
243
244 stat_osd_add(new_stats);
245
246 // adjust [near]full status
247 register_nearfull_status(osd, new_stats);
248 }
249 set<int64_t> deleted_pools;
250 for (set<pg_t>::const_iterator p = inc.pg_remove.begin();
251 p != inc.pg_remove.end();
252 ++p) {
253 const pg_t &removed_pg(*p);
254 ceph::unordered_map<pg_t,pg_stat_t>::iterator s = pg_stat.find(removed_pg);
255 if (s != pg_stat.end()) {
256 stat_pg_sub(removed_pg, s->second);
257 pg_stat.erase(s);
258 }
259 if (removed_pg.ps() == 0)
260 deleted_pools.insert(removed_pg.pool());
261 }
262 for (set<int64_t>::iterator p = deleted_pools.begin();
263 p != deleted_pools.end();
264 ++p) {
265 dout(20) << " deleted pool " << *p << dendl;
266 deleted_pool(*p);
267 }
268
269 for (set<int>::iterator p = inc.get_osd_stat_rm().begin();
270 p != inc.get_osd_stat_rm().end();
271 ++p) {
272 ceph::unordered_map<int32_t,osd_stat_t>::iterator t = osd_stat.find(*p);
273 if (t != osd_stat.end()) {
274 stat_osd_sub(t->second);
275 osd_stat.erase(t);
276 }
277
278 // remove these old osds from full/nearfull set(s), too
279 nearfull_osds.erase(*p);
280 full_osds.erase(*p);
281 }
282
283 // calculate a delta, and average over the last 2 deltas.
284 pool_stat_t d = pg_sum;
285 d.stats.sub(pg_sum_old.stats);
286 pg_sum_deltas.push_back(make_pair(d, delta_t));
287 stamp_delta += delta_t;
288
289 pg_sum_delta.stats.add(d.stats);
290 if (pg_sum_deltas.size() > (std::list< pair<pool_stat_t, utime_t> >::size_type)MAX(1, cct ? cct->_conf->mon_stat_smooth_intervals : 1)) {
291 pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats);
292 stamp_delta -= pg_sum_deltas.front().second;
293 pg_sum_deltas.pop_front();
294 }
295
296 update_pool_deltas(cct, inc.stamp, pg_pool_sum_old);
297
298 if (inc.osdmap_epoch)
299 last_osdmap_epoch = inc.osdmap_epoch;
300 if (inc.pg_scan)
301 last_pg_scan = inc.pg_scan;
302
303 min_last_epoch_clean = 0; // invalidate
304}
305
306void PGMap::redo_full_sets()
307{
308 full_osds.clear();
309 nearfull_osds.clear();
310 for (ceph::unordered_map<int32_t, osd_stat_t>::iterator i = osd_stat.begin();
311 i != osd_stat.end();
312 ++i) {
313 register_nearfull_status(i->first, i->second);
314 }
315}
316
317void PGMap::register_nearfull_status(int osd, const osd_stat_t& s)
318{
319 float ratio = ((float)s.kb_used) / ((float)s.kb);
320
321 if (full_ratio > 0 && ratio > full_ratio) {
322 // full
323 full_osds.insert(osd);
324 nearfull_osds.erase(osd);
325 } else if (nearfull_ratio > 0 && ratio > nearfull_ratio) {
326 // nearfull
327 full_osds.erase(osd);
328 nearfull_osds.insert(osd);
329 } else {
330 // ok
331 full_osds.erase(osd);
332 nearfull_osds.erase(osd);
333 }
334}
335
336void PGMap::calc_stats()
337{
338 num_pg_by_state.clear();
339 num_pg = 0;
340 num_pg_active = 0;
341 num_osd = 0;
342 pg_pool_sum.clear();
343 pg_sum = pool_stat_t();
344 osd_sum = osd_stat_t();
345 pg_by_osd.clear();
346 num_primary_pg_by_osd.clear();
347
348 for (ceph::unordered_map<pg_t,pg_stat_t>::iterator p = pg_stat.begin();
349 p != pg_stat.end();
350 ++p) {
351 stat_pg_add(p->first, p->second);
352 }
353 for (ceph::unordered_map<int32_t,osd_stat_t>::iterator p = osd_stat.begin();
354 p != osd_stat.end();
355 ++p)
356 stat_osd_add(p->second);
357
358 redo_full_sets();
359
360 min_last_epoch_clean = calc_min_last_epoch_clean();
361}
362
363void PGMap::update_pg(pg_t pgid, bufferlist& bl)
364{
365 bufferlist::iterator p = bl.begin();
366 ceph::unordered_map<pg_t,pg_stat_t>::iterator s = pg_stat.find(pgid);
367 epoch_t old_lec = 0, lec;
368 if (s != pg_stat.end()) {
369 old_lec = s->second.get_effective_last_epoch_clean();
370 stat_pg_update(pgid, s->second, p);
371 lec = s->second.get_effective_last_epoch_clean();
372 } else {
373 pg_stat_t& r = pg_stat[pgid];
374 ::decode(r, p);
375 stat_pg_add(pgid, r);
376 lec = r.get_effective_last_epoch_clean();
377 }
378
379 if (min_last_epoch_clean &&
380 (lec < min_last_epoch_clean || // we did
381 (lec > min_last_epoch_clean && // we might
382 old_lec == min_last_epoch_clean)
383 ))
384 min_last_epoch_clean = 0;
385}
386
387void PGMap::remove_pg(pg_t pgid)
388{
389 ceph::unordered_map<pg_t,pg_stat_t>::iterator s = pg_stat.find(pgid);
390 if (s != pg_stat.end()) {
391 if (min_last_epoch_clean &&
392 s->second.get_effective_last_epoch_clean() == min_last_epoch_clean)
393 min_last_epoch_clean = 0;
394 stat_pg_sub(pgid, s->second);
395 pg_stat.erase(s);
396 }
397}
398
399void PGMap::update_osd(int osd, bufferlist& bl)
400{
401 bufferlist::iterator p = bl.begin();
402 ceph::unordered_map<int32_t,osd_stat_t>::iterator o = osd_stat.find(osd);
403 epoch_t old_lec = 0;
404 if (o != osd_stat.end()) {
405 ceph::unordered_map<int32_t,epoch_t>::iterator i = osd_epochs.find(osd);
406 if (i != osd_epochs.end())
407 old_lec = i->second;
408 stat_osd_sub(o->second);
409 }
410 osd_stat_t& r = osd_stat[osd];
411 ::decode(r, p);
412 stat_osd_add(r);
413
414 // adjust [near]full status
415 register_nearfull_status(osd, r);
416
417 // epoch?
418 if (!p.end()) {
419 epoch_t e;
420 ::decode(e, p);
421
422 if (e < min_last_epoch_clean ||
423 (e > min_last_epoch_clean &&
424 old_lec == min_last_epoch_clean))
425 min_last_epoch_clean = 0;
426 } else {
427 // WARNING: we are not refreshing min_last_epoch_clean! must be old store
428 // or old mon running.
429 }
430}
431
432void PGMap::remove_osd(int osd)
433{
434 ceph::unordered_map<int32_t,osd_stat_t>::iterator o = osd_stat.find(osd);
435 if (o != osd_stat.end()) {
436 stat_osd_sub(o->second);
437 osd_stat.erase(o);
438
439 // remove these old osds from full/nearfull set(s), too
440 nearfull_osds.erase(osd);
441 full_osds.erase(osd);
442 }
443}
444
445void PGMap::stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
446 bool sameosds)
447{
448 pg_pool_sum[pgid.pool()].add(s);
449 pg_sum.add(s);
450
451 num_pg++;
452 num_pg_by_state[s.state]++;
453
454 if ((s.state & PG_STATE_CREATING) &&
455 s.parent_split_bits == 0) {
456 creating_pgs.insert(pgid);
457 if (s.acting_primary >= 0) {
458 creating_pgs_by_osd_epoch[s.acting_primary][s.mapping_epoch].insert(pgid);
459 }
460 }
461
462 if (s.state & PG_STATE_ACTIVE) {
463 ++num_pg_active;
464 }
465
466 if (sameosds)
467 return;
468
469 for (vector<int>::const_iterator p = s.blocked_by.begin();
470 p != s.blocked_by.end();
471 ++p) {
472 ++blocked_by_sum[*p];
473 }
474
475 for (vector<int>::const_iterator p = s.acting.begin(); p != s.acting.end(); ++p)
476 pg_by_osd[*p].insert(pgid);
477 for (vector<int>::const_iterator p = s.up.begin(); p != s.up.end(); ++p)
478 pg_by_osd[*p].insert(pgid);
479
480 if (s.up_primary >= 0)
481 num_primary_pg_by_osd[s.up_primary]++;
482}
483
484void PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
485 bool sameosds)
486{
487 pool_stat_t& ps = pg_pool_sum[pgid.pool()];
488 ps.sub(s);
489 if (ps.is_zero())
490 pg_pool_sum.erase(pgid.pool());
491 pg_sum.sub(s);
492
493 num_pg--;
494 int end = --num_pg_by_state[s.state];
495 assert(end >= 0);
496 if (end == 0)
497 num_pg_by_state.erase(s.state);
498
499 if ((s.state & PG_STATE_CREATING) &&
500 s.parent_split_bits == 0) {
501 creating_pgs.erase(pgid);
502 if (s.acting_primary >= 0) {
503 map<epoch_t,set<pg_t> >& r = creating_pgs_by_osd_epoch[s.acting_primary];
504 r[s.mapping_epoch].erase(pgid);
505 if (r[s.mapping_epoch].empty())
506 r.erase(s.mapping_epoch);
507 if (r.empty())
508 creating_pgs_by_osd_epoch.erase(s.acting_primary);
509 }
510 }
511
512 if (s.state & PG_STATE_ACTIVE) {
513 --num_pg_active;
514 }
515
516 if (sameosds)
517 return;
518
519 for (vector<int>::const_iterator p = s.blocked_by.begin();
520 p != s.blocked_by.end();
521 ++p) {
522 ceph::unordered_map<int,int>::iterator q = blocked_by_sum.find(*p);
523 assert(q != blocked_by_sum.end());
524 --q->second;
525 if (q->second == 0)
526 blocked_by_sum.erase(q);
527 }
528
529 for (vector<int>::const_iterator p = s.acting.begin(); p != s.acting.end(); ++p) {
530 set<pg_t>& oset = pg_by_osd[*p];
531 oset.erase(pgid);
532 if (oset.empty())
533 pg_by_osd.erase(*p);
534 }
535 for (vector<int>::const_iterator p = s.up.begin(); p != s.up.end(); ++p) {
536 set<pg_t>& oset = pg_by_osd[*p];
537 oset.erase(pgid);
538 if (oset.empty())
539 pg_by_osd.erase(*p);
540 }
541
542 if (s.up_primary >= 0) {
543 auto it = num_primary_pg_by_osd.find(s.up_primary);
544 if (it != num_primary_pg_by_osd.end() && it->second > 0)
545 it->second--;
546 }
547}
548
549void PGMap::stat_pg_update(const pg_t pgid, pg_stat_t& s,
550 bufferlist::iterator& blp)
551{
552 pg_stat_t n;
553 ::decode(n, blp);
554
555 bool sameosds =
556 s.acting == n.acting &&
557 s.up == n.up &&
558 s.blocked_by == n.blocked_by;
559
560 stat_pg_sub(pgid, s, sameosds);
561
562 // if acting_primary has shift to an just restored osd, and pg yet to finish
563 // peering, many attributes in current stats remain stale. others seem don't
564 // mater much while faulty last_active will make "pg stuck in" check unhappy.
565 if (!(n.state & (PG_STATE_ACTIVE | PG_STATE_PEERED)) &&
566 n.last_active < s.last_active)
567 n.last_active = s.last_active;
568 s = n;
569 stat_pg_add(pgid, n, sameosds);
570}
571
572void PGMap::stat_osd_add(const osd_stat_t &s)
573{
574 num_osd++;
575 osd_sum.add(s);
576}
577
578void PGMap::stat_osd_sub(const osd_stat_t &s)
579{
580 num_osd--;
581 osd_sum.sub(s);
582}
583
584epoch_t PGMap::calc_min_last_epoch_clean() const
585{
586 if (pg_stat.empty())
587 return 0;
588
589 ceph::unordered_map<pg_t,pg_stat_t>::const_iterator p = pg_stat.begin();
590 epoch_t min = p->second.get_effective_last_epoch_clean();
591 for (++p; p != pg_stat.end(); ++p) {
592 epoch_t lec = p->second.get_effective_last_epoch_clean();
593 if (lec < min)
594 min = lec;
595 }
596 // also scan osd epochs
597 // don't trim past the oldest reported osd epoch
598 for (ceph::unordered_map<int32_t, epoch_t>::const_iterator i = osd_epochs.begin();
599 i != osd_epochs.end();
600 ++i) {
601 if (i->second < min)
602 min = i->second;
603 }
604 return min;
605}
606
607void PGMap::encode(bufferlist &bl, uint64_t features) const
608{
609 if ((features & CEPH_FEATURE_MONENC) == 0) {
610 __u8 v = 3;
611 ::encode(v, bl);
612 ::encode(version, bl);
613 ::encode(pg_stat, bl);
614 ::encode(osd_stat, bl);
615 ::encode(last_osdmap_epoch, bl);
616 ::encode(last_pg_scan, bl);
617 ::encode(full_ratio, bl);
618 ::encode(nearfull_ratio, bl);
619 return;
620 }
621
622 ENCODE_START(6, 4, bl);
623 ::encode(version, bl);
624 ::encode(pg_stat, bl);
625 ::encode(osd_stat, bl);
626 ::encode(last_osdmap_epoch, bl);
627 ::encode(last_pg_scan, bl);
628 ::encode(full_ratio, bl);
629 ::encode(nearfull_ratio, bl);
630 ::encode(stamp, bl);
631 ::encode(osd_epochs, bl);
632 ENCODE_FINISH(bl);
633}
634
635void PGMap::decode(bufferlist::iterator &bl)
636{
637 DECODE_START_LEGACY_COMPAT_LEN(6, 4, 4, bl);
638 ::decode(version, bl);
639 if (struct_v < 3) {
640 pg_stat.clear();
641 __u32 n;
642 ::decode(n, bl);
643 while (n--) {
644 old_pg_t opgid;
645 ::decode(opgid, bl);
646 pg_t pgid = opgid;
647 ::decode(pg_stat[pgid], bl);
648 }
649 } else {
650 ::decode(pg_stat, bl);
651 }
652 ::decode(osd_stat, bl);
653 ::decode(last_osdmap_epoch, bl);
654 ::decode(last_pg_scan, bl);
655 if (struct_v >= 2) {
656 ::decode(full_ratio, bl);
657 ::decode(nearfull_ratio, bl);
658 }
659 if (struct_v >= 5)
660 ::decode(stamp, bl);
661 if (struct_v >= 6) {
662 ::decode(osd_epochs, bl);
663 } else {
664 for (ceph::unordered_map<int32_t, osd_stat_t>::iterator i = osd_stat.begin();
665 i != osd_stat.end();
666 ++i) {
667 // This isn't accurate, but will cause trimming to behave like
668 // previously.
669 osd_epochs.insert(make_pair(i->first, last_osdmap_epoch));
670 }
671 }
672 DECODE_FINISH(bl);
673
674 calc_stats();
675}
676
677void PGMap::dirty_all(Incremental& inc)
678{
679 inc.osdmap_epoch = last_osdmap_epoch;
680 inc.pg_scan = last_pg_scan;
681 inc.full_ratio = full_ratio;
682 inc.nearfull_ratio = nearfull_ratio;
683
684 for (ceph::unordered_map<pg_t,pg_stat_t>::const_iterator p = pg_stat.begin(); p != pg_stat.end(); ++p) {
685 inc.pg_stat_updates[p->first] = p->second;
686 }
687 for (ceph::unordered_map<int32_t, osd_stat_t>::const_iterator p = osd_stat.begin(); p != osd_stat.end(); ++p) {
688 assert(osd_epochs.count(p->first));
689 inc.update_stat(p->first,
690 inc.get_osd_epochs().find(p->first)->second,
691 p->second);
692 }
693}
694
695void PGMap::dump(Formatter *f) const
696{
697 dump_basic(f);
698 dump_pg_stats(f, false);
699 dump_pool_stats(f);
700 dump_osd_stats(f);
701}
702
703void PGMap::dump_basic(Formatter *f) const
704{
705 f->dump_unsigned("version", version);
706 f->dump_stream("stamp") << stamp;
707 f->dump_unsigned("last_osdmap_epoch", last_osdmap_epoch);
708 f->dump_unsigned("last_pg_scan", last_pg_scan);
709 f->dump_unsigned("min_last_epoch_clean", min_last_epoch_clean);
710 f->dump_float("full_ratio", full_ratio);
711 f->dump_float("near_full_ratio", nearfull_ratio);
712
713 f->open_object_section("pg_stats_sum");
714 pg_sum.dump(f);
715 f->close_section();
716
717 f->open_object_section("osd_stats_sum");
718 osd_sum.dump(f);
719 f->close_section();
720
721 f->open_array_section("osd_epochs");
722 for (ceph::unordered_map<int32_t,epoch_t>::const_iterator p =
723 osd_epochs.begin(); p != osd_epochs.end(); ++p) {
724 f->open_object_section("osd");
725 f->dump_unsigned("osd", p->first);
726 f->dump_unsigned("epoch", p->second);
727 f->close_section();
728 }
729 f->close_section();
730
731 dump_delta(f);
732}
733
734void PGMap::dump_delta(Formatter *f) const
735{
736 f->open_object_section("pg_stats_delta");
737 pg_sum_delta.dump(f);
738 f->close_section();
739}
740
741void PGMap::dump_pg_stats(Formatter *f, bool brief) const
742{
743 f->open_array_section("pg_stats");
744 for (ceph::unordered_map<pg_t,pg_stat_t>::const_iterator i = pg_stat.begin();
745 i != pg_stat.end();
746 ++i) {
747 f->open_object_section("pg_stat");
748 f->dump_stream("pgid") << i->first;
749 if (brief)
750 i->second.dump_brief(f);
751 else
752 i->second.dump(f);
753 f->close_section();
754 }
755 f->close_section();
756}
757
758void PGMap::dump_pool_stats(Formatter *f) const
759{
760 f->open_array_section("pool_stats");
761 for (ceph::unordered_map<int,pool_stat_t>::const_iterator p = pg_pool_sum.begin();
762 p != pg_pool_sum.end();
763 ++p) {
764 f->open_object_section("pool_stat");
765 f->dump_int("poolid", p->first);
766 p->second.dump(f);
767 f->close_section();
768 }
769 f->close_section();
770}
771
772void PGMap::dump_osd_stats(Formatter *f) const
773{
774 f->open_array_section("osd_stats");
775 for (ceph::unordered_map<int32_t,osd_stat_t>::const_iterator q = osd_stat.begin();
776 q != osd_stat.end();
777 ++q) {
778 f->open_object_section("osd_stat");
779 f->dump_int("osd", q->first);
780 q->second.dump(f);
781 f->close_section();
782 }
783 f->close_section();
784}
785
786void PGMap::dump_pg_stats_plain(ostream& ss,
787 const ceph::unordered_map<pg_t, pg_stat_t>& pg_stats,
788 bool brief) const
789{
790 TextTable tab;
791
792 if (brief){
793 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
794 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
795 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
796 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
797 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
798 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
799 }
800 else {
801 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
802 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
803 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
804 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
805 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
806 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
807 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
808 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
809 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
810 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
811 tab.define_column("STATE_STAMP", TextTable::LEFT, TextTable::RIGHT);
812 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
813 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
814 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
815 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
816 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
817 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
818 tab.define_column("LAST_SCRUB", TextTable::LEFT, TextTable::RIGHT);
819 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
820 tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT);
821 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
822 }
823
824 for (ceph::unordered_map<pg_t, pg_stat_t>::const_iterator i = pg_stats.begin();
825 i != pg_stats.end(); ++i) {
826 const pg_stat_t &st(i->second);
827 if (brief) {
828 tab << i->first
829 << pg_state_string(st.state)
830 << st.up
831 << st.up_primary
832 << st.acting
833 << st.acting_primary
834 << TextTable::endrow;
835 } else {
836 ostringstream reported;
837 reported << st.reported_epoch << ":" << st.reported_seq;
838
839 tab << i->first
840 << st.stats.sum.num_objects
841 << st.stats.sum.num_objects_missing_on_primary
842 << st.stats.sum.num_objects_degraded
843 << st.stats.sum.num_objects_misplaced
844 << st.stats.sum.num_objects_unfound
845 << st.stats.sum.num_bytes
846 << st.log_size
847 << st.ondisk_log_size
848 << pg_state_string(st.state)
849 << st.last_change
850 << st.version
851 << reported.str()
852 << pg_vector_string(st.up)
853 << st.up_primary
854 << pg_vector_string(st.acting)
855 << st.acting_primary
856 << st.last_scrub
857 << st.last_scrub_stamp
858 << st.last_deep_scrub
859 << st.last_deep_scrub_stamp
860 << TextTable::endrow;
861 }
862 }
863
864 ss << tab;
865}
866
867void PGMap::dump(ostream& ss) const
868{
869 dump_basic(ss);
870 dump_pg_stats(ss, false);
871 dump_pool_stats(ss, false);
872 dump_pg_sum_stats(ss, false);
873 dump_osd_stats(ss);
874}
875
876void PGMap::dump_basic(ostream& ss) const
877{
878 ss << "version " << version << std::endl;
879 ss << "stamp " << stamp << std::endl;
880 ss << "last_osdmap_epoch " << last_osdmap_epoch << std::endl;
881 ss << "last_pg_scan " << last_pg_scan << std::endl;
882 ss << "full_ratio " << full_ratio << std::endl;
883 ss << "nearfull_ratio " << nearfull_ratio << std::endl;
884}
885
886void PGMap::dump_pg_stats(ostream& ss, bool brief) const
887{
888 dump_pg_stats_plain(ss, pg_stat, brief);
889}
890
891void PGMap::dump_pool_stats(ostream& ss, bool header) const
892{
893 TextTable tab;
894
895 if (header) {
896 tab.define_column("POOLID", TextTable::LEFT, TextTable::LEFT);
897 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
898 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
899 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
900 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
901 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
902 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
903 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
904 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
905 } else {
906 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
907 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
908 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
909 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
910 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
911 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
912 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
913 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
914 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
915 }
916
917 for (ceph::unordered_map<int,pool_stat_t>::const_iterator p = pg_pool_sum.begin();
918 p != pg_pool_sum.end();
919 ++p) {
920 tab << p->first
921 << p->second.stats.sum.num_objects
922 << p->second.stats.sum.num_objects_missing_on_primary
923 << p->second.stats.sum.num_objects_degraded
924 << p->second.stats.sum.num_objects_misplaced
925 << p->second.stats.sum.num_objects_unfound
926 << p->second.stats.sum.num_bytes
927 << p->second.log_size
928 << p->second.ondisk_log_size
929 << TextTable::endrow;
930 }
931
932 ss << tab;
933}
934
935void PGMap::dump_pg_sum_stats(ostream& ss, bool header) const
936{
937 TextTable tab;
938
939 if (header) {
940 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
941 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
942 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
943 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
944 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
945 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
946 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
947 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
948 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
949 } else {
950 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
951 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
952 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
953 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
954 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
955 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
956 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
957 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
958 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
959 };
960
961 tab << "sum"
962 << pg_sum.stats.sum.num_objects
963 << pg_sum.stats.sum.num_objects_missing_on_primary
964 << pg_sum.stats.sum.num_objects_degraded
965 << pg_sum.stats.sum.num_objects_misplaced
966 << pg_sum.stats.sum.num_objects_unfound
967 << pg_sum.stats.sum.num_bytes
968 << pg_sum.log_size
969 << pg_sum.ondisk_log_size
970 << TextTable::endrow;
971
972 ss << tab;
973}
974
975void PGMap::dump_osd_stats(ostream& ss) const
976{
977 TextTable tab;
978
979 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
980 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
981 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
982 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
983 tab.define_column("HB_PEERS", TextTable::LEFT, TextTable::RIGHT);
984 tab.define_column("PG_SUM", TextTable::LEFT, TextTable::RIGHT);
985 tab.define_column("PRIMARY_PG_SUM", TextTable::LEFT, TextTable::RIGHT);
986
987 for (ceph::unordered_map<int32_t,osd_stat_t>::const_iterator p = osd_stat.begin();
988 p != osd_stat.end();
989 ++p) {
990 tab << p->first
991 << si_t(p->second.kb_used << 10)
992 << si_t(p->second.kb_avail << 10)
993 << si_t(p->second.kb << 10)
994 << p->second.hb_peers
995 << get_num_pg_by_osd(p->first)
996 << get_num_primary_pg_by_osd(p->first)
997 << TextTable::endrow;
998 }
999
1000 tab << "sum"
1001 << si_t(osd_sum.kb_used << 10)
1002 << si_t(osd_sum.kb_avail << 10)
1003 << si_t(osd_sum.kb << 10)
1004 << TextTable::endrow;
1005
1006 ss << tab;
1007}
1008
1009void PGMap::dump_osd_sum_stats(ostream& ss) const
1010{
1011 TextTable tab;
1012
1013 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1014 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1015 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
1016 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
1017
1018 tab << "sum"
1019 << si_t(osd_sum.kb_used << 10)
1020 << si_t(osd_sum.kb_avail << 10)
1021 << si_t(osd_sum.kb << 10)
1022 << TextTable::endrow;
1023
1024 ss << tab;
1025}
1026
1027void PGMap::get_stuck_stats(int types, const utime_t cutoff,
1028 ceph::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const
1029{
1030 assert(types != 0);
1031 for (ceph::unordered_map<pg_t, pg_stat_t>::const_iterator i = pg_stat.begin();
1032 i != pg_stat.end();
1033 ++i) {
1034 utime_t val = cutoff; // don't care about >= cutoff so that is infinity
1035
1036 if ((types & STUCK_INACTIVE) && !(i->second.state & PG_STATE_ACTIVE)) {
1037 if (i->second.last_active < val)
1038 val = i->second.last_active;
1039 }
1040
1041 if ((types & STUCK_UNCLEAN) && !(i->second.state & PG_STATE_CLEAN)) {
1042 if (i->second.last_clean < val)
1043 val = i->second.last_clean;
1044 }
1045
1046 if ((types & STUCK_DEGRADED) && (i->second.state & PG_STATE_DEGRADED)) {
1047 if (i->second.last_undegraded < val)
1048 val = i->second.last_undegraded;
1049 }
1050
1051 if ((types & STUCK_UNDERSIZED) && (i->second.state & PG_STATE_UNDERSIZED)) {
1052 if (i->second.last_fullsized < val)
1053 val = i->second.last_fullsized;
1054 }
1055
1056 if ((types & STUCK_STALE) && (i->second.state & PG_STATE_STALE)) {
1057 if (i->second.last_unstale < val)
1058 val = i->second.last_unstale;
1059 }
1060
1061 // val is now the earliest any of the requested stuck states began
1062 if (val < cutoff) {
1063 stuck_pgs[i->first] = i->second;
1064 }
1065 }
1066}
1067
1068bool PGMap::get_stuck_counts(const utime_t cutoff, map<string, int>& note) const
1069{
1070 int inactive = 0;
1071 int unclean = 0;
1072 int degraded = 0;
1073 int undersized = 0;
1074 int stale = 0;
1075
1076 for (ceph::unordered_map<pg_t, pg_stat_t>::const_iterator i = pg_stat.begin();
1077 i != pg_stat.end();
1078 ++i) {
1079 if (! (i->second.state & PG_STATE_ACTIVE)) {
1080 if (i->second.last_active < cutoff)
1081 ++inactive;
1082 }
1083 if (! (i->second.state & PG_STATE_CLEAN)) {
1084 if (i->second.last_clean < cutoff)
1085 ++unclean;
1086 }
1087 if (i->second.state & PG_STATE_DEGRADED) {
1088 if (i->second.last_undegraded < cutoff)
1089 ++degraded;
1090 }
1091 if (i->second.state & PG_STATE_UNDERSIZED) {
1092 if (i->second.last_fullsized < cutoff)
1093 ++undersized;
1094 }
1095 if (i->second.state & PG_STATE_STALE) {
1096 if (i->second.last_unstale < cutoff)
1097 ++stale;
1098 }
1099 }
1100
1101 if (inactive)
1102 note["stuck inactive"] = inactive;
1103
1104 if (unclean)
1105 note["stuck unclean"] = unclean;
1106
1107 if (undersized)
1108 note["stuck undersized"] = undersized;
1109
1110 if (degraded)
1111 note["stuck degraded"] = degraded;
1112
1113 if (stale)
1114 note["stuck stale"] = stale;
1115
1116 return inactive || unclean || undersized || degraded || stale;
1117}
1118
1119void PGMap::dump_stuck(Formatter *f, int types, utime_t cutoff) const
1120{
1121 ceph::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
1122 get_stuck_stats(types, cutoff, stuck_pg_stats);
1123 f->open_array_section("stuck_pg_stats");
1124 for (ceph::unordered_map<pg_t,pg_stat_t>::const_iterator i = stuck_pg_stats.begin();
1125 i != stuck_pg_stats.end();
1126 ++i) {
1127 f->open_object_section("pg_stat");
1128 f->dump_stream("pgid") << i->first;
1129 i->second.dump(f);
1130 f->close_section();
1131 }
1132 f->close_section();
1133}
1134
1135void PGMap::dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const
1136{
1137 ceph::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
1138 get_stuck_stats(types, cutoff, stuck_pg_stats);
1139 if (!stuck_pg_stats.empty())
1140 dump_pg_stats_plain(ss, stuck_pg_stats, true);
1141}
1142
1143int PGMap::dump_stuck_pg_stats(
1144 stringstream &ds,
1145 Formatter *f,
1146 int threshold,
1147 vector<string>& args) const
1148{
1149 int stuck_types = 0;
1150
1151 for (vector<string>::iterator i = args.begin(); i != args.end(); ++i) {
1152 if (*i == "inactive")
1153 stuck_types |= PGMap::STUCK_INACTIVE;
1154 else if (*i == "unclean")
1155 stuck_types |= PGMap::STUCK_UNCLEAN;
1156 else if (*i == "undersized")
1157 stuck_types |= PGMap::STUCK_UNDERSIZED;
1158 else if (*i == "degraded")
1159 stuck_types |= PGMap::STUCK_DEGRADED;
1160 else if (*i == "stale")
1161 stuck_types |= PGMap::STUCK_STALE;
1162 else {
1163 ds << "Unknown type: " << *i << std::endl;
1164 return -EINVAL;
1165 }
1166 }
1167
1168 utime_t now(ceph_clock_now());
1169 utime_t cutoff = now - utime_t(threshold, 0);
1170
1171 if (!f) {
1172 dump_stuck_plain(ds, stuck_types, cutoff);
1173 } else {
1174 dump_stuck(f, stuck_types, cutoff);
1175 f->flush(ds);
1176 }
1177
1178 return 0;
1179}
1180
1181void PGMap::dump_osd_perf_stats(Formatter *f) const
1182{
1183 f->open_array_section("osd_perf_infos");
1184 for (ceph::unordered_map<int32_t, osd_stat_t>::const_iterator i = osd_stat.begin();
1185 i != osd_stat.end();
1186 ++i) {
1187 f->open_object_section("osd");
1188 f->dump_int("id", i->first);
1189 {
1190 f->open_object_section("perf_stats");
1191 i->second.os_perf_stat.dump(f);
1192 f->close_section();
1193 }
1194 f->close_section();
1195 }
1196 f->close_section();
1197}
1198void PGMap::print_osd_perf_stats(std::ostream *ss) const
1199{
1200 TextTable tab;
1201 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
1202 tab.define_column("commit_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
1203 tab.define_column("apply_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
1204 for (ceph::unordered_map<int32_t, osd_stat_t>::const_iterator i = osd_stat.begin();
1205 i != osd_stat.end();
1206 ++i) {
1207 tab << i->first;
1208 tab << i->second.os_perf_stat.os_commit_latency;
1209 tab << i->second.os_perf_stat.os_apply_latency;
1210 tab << TextTable::endrow;
1211 }
1212 (*ss) << tab;
1213}
1214
1215void PGMap::dump_osd_blocked_by_stats(Formatter *f) const
1216{
1217 f->open_array_section("osd_blocked_by_infos");
1218 for (ceph::unordered_map<int,int>::const_iterator i = blocked_by_sum.begin();
1219 i != blocked_by_sum.end();
1220 ++i) {
1221 f->open_object_section("osd");
1222 f->dump_int("id", i->first);
1223 f->dump_int("num_blocked", i->second);
1224 f->close_section();
1225 }
1226 f->close_section();
1227}
1228void PGMap::print_osd_blocked_by_stats(std::ostream *ss) const
1229{
1230 TextTable tab;
1231 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
1232 tab.define_column("num_blocked", TextTable::LEFT, TextTable::RIGHT);
1233 for (ceph::unordered_map<int,int>::const_iterator i = blocked_by_sum.begin();
1234 i != blocked_by_sum.end();
1235 ++i) {
1236 tab << i->first;
1237 tab << i->second;
1238 tab << TextTable::endrow;
1239 }
1240 (*ss) << tab;
1241}
1242
1243void PGMap::recovery_summary(Formatter *f, list<string> *psl,
1244 const pool_stat_t& delta_sum) const
1245{
1246 if (delta_sum.stats.sum.num_objects_degraded && delta_sum.stats.sum.num_object_copies > 0) {
1247 double pc = (double)delta_sum.stats.sum.num_objects_degraded /
1248 (double)delta_sum.stats.sum.num_object_copies * (double)100.0;
1249 char b[20];
1250 snprintf(b, sizeof(b), "%.3lf", pc);
1251 if (f) {
1252 f->dump_unsigned("degraded_objects", delta_sum.stats.sum.num_objects_degraded);
1253 f->dump_unsigned("degraded_total", delta_sum.stats.sum.num_object_copies);
1254 f->dump_float("degraded_ratio", pc / 100.0);
1255 } else {
1256 ostringstream ss;
1257 ss << delta_sum.stats.sum.num_objects_degraded
1258 << "/" << delta_sum.stats.sum.num_object_copies << " objects degraded (" << b << "%)";
1259 psl->push_back(ss.str());
1260 }
1261 }
1262 if (delta_sum.stats.sum.num_objects_misplaced && delta_sum.stats.sum.num_object_copies > 0) {
1263 double pc = (double)delta_sum.stats.sum.num_objects_misplaced /
1264 (double)delta_sum.stats.sum.num_object_copies * (double)100.0;
1265 char b[20];
1266 snprintf(b, sizeof(b), "%.3lf", pc);
1267 if (f) {
1268 f->dump_unsigned("misplaced_objects", delta_sum.stats.sum.num_objects_misplaced);
1269 f->dump_unsigned("misplaced_total", delta_sum.stats.sum.num_object_copies);
1270 f->dump_float("misplaced_ratio", pc / 100.0);
1271 } else {
1272 ostringstream ss;
1273 ss << delta_sum.stats.sum.num_objects_misplaced
1274 << "/" << delta_sum.stats.sum.num_object_copies << " objects misplaced (" << b << "%)";
1275 psl->push_back(ss.str());
1276 }
1277 }
1278 if (delta_sum.stats.sum.num_objects_unfound && delta_sum.stats.sum.num_objects) {
1279 double pc = (double)delta_sum.stats.sum.num_objects_unfound /
1280 (double)delta_sum.stats.sum.num_objects * (double)100.0;
1281 char b[20];
1282 snprintf(b, sizeof(b), "%.3lf", pc);
1283 if (f) {
1284 f->dump_unsigned("unfound_objects", delta_sum.stats.sum.num_objects_unfound);
1285 f->dump_unsigned("unfound_total", delta_sum.stats.sum.num_objects);
1286 f->dump_float("unfound_ratio", pc / 100.0);
1287 } else {
1288 ostringstream ss;
1289 ss << delta_sum.stats.sum.num_objects_unfound
1290 << "/" << delta_sum.stats.sum.num_objects << " unfound (" << b << "%)";
1291 psl->push_back(ss.str());
1292 }
1293 }
1294}
1295
1296void PGMap::recovery_rate_summary(Formatter *f, ostream *out,
1297 const pool_stat_t& delta_sum,
1298 utime_t delta_stamp) const
1299{
1300 // make non-negative; we can get negative values if osds send
1301 // uncommitted stats and then "go backward" or if they are just
1302 // buggy/wrong.
1303 pool_stat_t pos_delta = delta_sum;
1304 pos_delta.floor(0);
1305 if (pos_delta.stats.sum.num_objects_recovered ||
1306 pos_delta.stats.sum.num_bytes_recovered ||
1307 pos_delta.stats.sum.num_keys_recovered) {
1308 int64_t objps = pos_delta.stats.sum.num_objects_recovered / (double)delta_stamp;
1309 int64_t bps = pos_delta.stats.sum.num_bytes_recovered / (double)delta_stamp;
1310 int64_t kps = pos_delta.stats.sum.num_keys_recovered / (double)delta_stamp;
1311 if (f) {
1312 f->dump_int("recovering_objects_per_sec", objps);
1313 f->dump_int("recovering_bytes_per_sec", bps);
1314 f->dump_int("recovering_keys_per_sec", kps);
1315 f->dump_int("num_objects_recovered", pos_delta.stats.sum.num_objects_recovered);
1316 f->dump_int("num_bytes_recovered", pos_delta.stats.sum.num_bytes_recovered);
1317 f->dump_int("num_keys_recovered", pos_delta.stats.sum.num_keys_recovered);
1318 } else {
1319 *out << pretty_si_t(bps) << "B/s";
1320 if (pos_delta.stats.sum.num_keys_recovered)
1321 *out << ", " << pretty_si_t(kps) << "keys/s";
1322 *out << ", " << pretty_si_t(objps) << "objects/s";
1323 }
1324 }
1325}
1326
1327void PGMap::overall_recovery_rate_summary(Formatter *f, ostream *out) const
1328{
1329 recovery_rate_summary(f, out, pg_sum_delta, stamp_delta);
1330}
1331
1332void PGMap::overall_recovery_summary(Formatter *f, list<string> *psl) const
1333{
1334 recovery_summary(f, psl, pg_sum);
1335}
1336
1337void PGMap::pool_recovery_rate_summary(Formatter *f, ostream *out,
1338 uint64_t poolid) const
1339{
1340 ceph::unordered_map<uint64_t,pair<pool_stat_t,utime_t> >::const_iterator p =
1341 per_pool_sum_delta.find(poolid);
1342 if (p == per_pool_sum_delta.end())
1343 return;
1344
1345 ceph::unordered_map<uint64_t,utime_t>::const_iterator ts =
1346 per_pool_sum_deltas_stamps.find(p->first);
1347 assert(ts != per_pool_sum_deltas_stamps.end());
1348 recovery_rate_summary(f, out, p->second.first, ts->second);
1349}
1350
1351void PGMap::pool_recovery_summary(Formatter *f, list<string> *psl,
1352 uint64_t poolid) const
1353{
1354 ceph::unordered_map<uint64_t,pair<pool_stat_t,utime_t> >::const_iterator p =
1355 per_pool_sum_delta.find(poolid);
1356 if (p == per_pool_sum_delta.end())
1357 return;
1358
1359 recovery_summary(f, psl, p->second.first);
1360}
1361
1362void PGMap::client_io_rate_summary(Formatter *f, ostream *out,
1363 const pool_stat_t& delta_sum,
1364 utime_t delta_stamp) const
1365{
1366 pool_stat_t pos_delta = delta_sum;
1367 pos_delta.floor(0);
1368 if (pos_delta.stats.sum.num_rd ||
1369 pos_delta.stats.sum.num_wr) {
1370 if (pos_delta.stats.sum.num_rd) {
1371 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)delta_stamp;
1372 if (f) {
1373 f->dump_int("read_bytes_sec", rd);
1374 } else {
1375 *out << pretty_si_t(rd) << "B/s rd, ";
1376 }
1377 }
1378 if (pos_delta.stats.sum.num_wr) {
1379 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)delta_stamp;
1380 if (f) {
1381 f->dump_int("write_bytes_sec", wr);
1382 } else {
1383 *out << pretty_si_t(wr) << "B/s wr, ";
1384 }
1385 }
1386 int64_t iops_rd = pos_delta.stats.sum.num_rd / (double)delta_stamp;
1387 int64_t iops_wr = pos_delta.stats.sum.num_wr / (double)delta_stamp;
1388 if (f) {
1389 f->dump_int("read_op_per_sec", iops_rd);
1390 f->dump_int("write_op_per_sec", iops_wr);
1391 } else {
1392 *out << pretty_si_t(iops_rd) << "op/s rd, " << pretty_si_t(iops_wr) << "op/s wr";
1393 }
1394 }
1395}
1396
1397void PGMap::overall_client_io_rate_summary(Formatter *f, ostream *out) const
1398{
1399 client_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
1400}
1401
1402void PGMap::pool_client_io_rate_summary(Formatter *f, ostream *out,
1403 uint64_t poolid) const
1404{
1405 ceph::unordered_map<uint64_t,pair<pool_stat_t,utime_t> >::const_iterator p =
1406 per_pool_sum_delta.find(poolid);
1407 if (p == per_pool_sum_delta.end())
1408 return;
1409
1410 ceph::unordered_map<uint64_t,utime_t>::const_iterator ts =
1411 per_pool_sum_deltas_stamps.find(p->first);
1412 assert(ts != per_pool_sum_deltas_stamps.end());
1413 client_io_rate_summary(f, out, p->second.first, ts->second);
1414}
1415
1416void PGMap::cache_io_rate_summary(Formatter *f, ostream *out,
1417 const pool_stat_t& delta_sum,
1418 utime_t delta_stamp) const
1419{
1420 pool_stat_t pos_delta = delta_sum;
1421 pos_delta.floor(0);
1422 bool have_output = false;
1423
1424 if (pos_delta.stats.sum.num_flush) {
1425 int64_t flush = (pos_delta.stats.sum.num_flush_kb << 10) / (double)delta_stamp;
1426 if (f) {
1427 f->dump_int("flush_bytes_sec", flush);
1428 } else {
1429 *out << pretty_si_t(flush) << "B/s flush";
1430 have_output = true;
1431 }
1432 }
1433 if (pos_delta.stats.sum.num_evict) {
1434 int64_t evict = (pos_delta.stats.sum.num_evict_kb << 10) / (double)delta_stamp;
1435 if (f) {
1436 f->dump_int("evict_bytes_sec", evict);
1437 } else {
1438 if (have_output)
1439 *out << ", ";
1440 *out << pretty_si_t(evict) << "B/s evict";
1441 have_output = true;
1442 }
1443 }
1444 if (pos_delta.stats.sum.num_promote) {
1445 int64_t promote = pos_delta.stats.sum.num_promote / (double)delta_stamp;
1446 if (f) {
1447 f->dump_int("promote_op_per_sec", promote);
1448 } else {
1449 if (have_output)
1450 *out << ", ";
1451 *out << pretty_si_t(promote) << "op/s promote";
1452 have_output = true;
1453 }
1454 }
1455 if (pos_delta.stats.sum.num_flush_mode_low) {
1456 if (f) {
1457 f->dump_int("num_flush_mode_low", pos_delta.stats.sum.num_flush_mode_low);
1458 } else {
1459 if (have_output)
1460 *out << ", ";
1461 *out << pretty_si_t(pos_delta.stats.sum.num_flush_mode_low) << "PG(s) flushing";
1462 have_output = true;
1463 }
1464 }
1465 if (pos_delta.stats.sum.num_flush_mode_high) {
1466 if (f) {
1467 f->dump_int("num_flush_mode_high", pos_delta.stats.sum.num_flush_mode_high);
1468 } else {
1469 if (have_output)
1470 *out << ", ";
1471 *out << pretty_si_t(pos_delta.stats.sum.num_flush_mode_high) << "PG(s) flushing (high)";
1472 have_output = true;
1473 }
1474 }
1475 if (pos_delta.stats.sum.num_evict_mode_some) {
1476 if (f) {
1477 f->dump_int("num_evict_mode_some", pos_delta.stats.sum.num_evict_mode_some);
1478 } else {
1479 if (have_output)
1480 *out << ", ";
1481 *out << pretty_si_t(pos_delta.stats.sum.num_evict_mode_some) << "PG(s) evicting";
1482 have_output = true;
1483 }
1484 }
1485 if (pos_delta.stats.sum.num_evict_mode_full) {
1486 if (f) {
1487 f->dump_int("num_evict_mode_full", pos_delta.stats.sum.num_evict_mode_full);
1488 } else {
1489 if (have_output)
1490 *out << ", ";
1491 *out << pretty_si_t(pos_delta.stats.sum.num_evict_mode_full) << "PG(s) evicting (full)";
1492 }
1493 }
1494}
1495
1496void PGMap::overall_cache_io_rate_summary(Formatter *f, ostream *out) const
1497{
1498 cache_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
1499}
1500
1501void PGMap::pool_cache_io_rate_summary(Formatter *f, ostream *out,
1502 uint64_t poolid) const
1503{
1504 ceph::unordered_map<uint64_t,pair<pool_stat_t,utime_t> >::const_iterator p =
1505 per_pool_sum_delta.find(poolid);
1506 if (p == per_pool_sum_delta.end())
1507 return;
1508
1509 ceph::unordered_map<uint64_t,utime_t>::const_iterator ts =
1510 per_pool_sum_deltas_stamps.find(p->first);
1511 assert(ts != per_pool_sum_deltas_stamps.end());
1512 cache_io_rate_summary(f, out, p->second.first, ts->second);
1513}
1514
1515/**
1516 * update aggregated delta
1517 *
1518 * @param cct ceph context
1519 * @param ts Timestamp for the stats being delta'ed
1520 * @param old_pool_sum Previous stats sum
1521 * @param last_ts Last timestamp for pool
1522 * @param result_pool_sum Resulting stats
1523 * @param result_pool_delta Resulting pool delta
1524 * @param result_ts_delta Resulting timestamp delta
1525 * @param delta_avg_list List of last N computed deltas, used to average
1526 */
1527void PGMap::update_delta(CephContext *cct,
1528 const utime_t ts,
1529 const pool_stat_t& old_pool_sum,
1530 utime_t *last_ts,
1531 const pool_stat_t& current_pool_sum,
1532 pool_stat_t *result_pool_delta,
1533 utime_t *result_ts_delta,
1534 list<pair<pool_stat_t,utime_t> > *delta_avg_list)
1535{
1536 /* @p ts is the timestamp we want to associate with the data
1537 * in @p old_pool_sum, and on which we will base ourselves to
1538 * calculate the delta, stored in 'delta_t'.
1539 */
1540 utime_t delta_t;
1541 delta_t = ts; // start with the provided timestamp
1542 delta_t -= *last_ts; // take the last timestamp we saw
1543 *last_ts = ts; // @p ts becomes the last timestamp we saw
1544
1545 // calculate a delta, and average over the last 2 deltas.
1546 /* start by taking a copy of our current @p result_pool_sum, and by
1547 * taking out the stats from @p old_pool_sum. This generates a stats
1548 * delta. Stash this stats delta in @p delta_avg_list, along with the
1549 * timestamp delta for these results.
1550 */
1551 pool_stat_t d = current_pool_sum;
1552 d.stats.sub(old_pool_sum.stats);
1553 delta_avg_list->push_back(make_pair(d,delta_t));
1554 *result_ts_delta += delta_t;
1555
1556 /* Aggregate current delta, and take out the last seen delta (if any) to
1557 * average it out.
1558 */
1559 result_pool_delta->stats.add(d.stats);
1560 size_t s = MAX(1, cct ? cct->_conf->mon_stat_smooth_intervals : 1);
1561 if (delta_avg_list->size() > s) {
1562 result_pool_delta->stats.sub(delta_avg_list->front().first.stats);
1563 *result_ts_delta -= delta_avg_list->front().second;
1564 delta_avg_list->pop_front();
1565 }
1566}
1567
1568/**
1569 * update aggregated delta
1570 *
1571 * @param cct ceph context
1572 * @param ts Timestamp
1573 * @param pg_sum_old Old pg_sum
1574 */
1575void PGMap::update_global_delta(CephContext *cct,
1576 const utime_t ts, const pool_stat_t& pg_sum_old)
1577{
1578 update_delta(cct, ts, pg_sum_old, &stamp, pg_sum, &pg_sum_delta,
1579 &stamp_delta, &pg_sum_deltas);
1580}
1581
1582/**
1583 * Update a given pool's deltas
1584 *
1585 * @param cct Ceph Context
1586 * @param ts Timestamp for the stats being delta'ed
1587 * @param pool Pool's id
1588 * @param old_pool_sum Previous stats sum
1589 */
1590void PGMap::update_one_pool_delta(CephContext *cct,
1591 const utime_t ts,
1592 const uint64_t pool,
1593 const pool_stat_t& old_pool_sum)
1594{
1595 if (per_pool_sum_deltas.count(pool) == 0) {
1596 assert(per_pool_sum_deltas_stamps.count(pool) == 0);
1597 assert(per_pool_sum_delta.count(pool) == 0);
1598 }
1599
1600 pair<pool_stat_t,utime_t>& sum_delta = per_pool_sum_delta[pool];
1601
1602 update_delta(cct, ts, old_pool_sum, &sum_delta.second, pg_pool_sum[pool],
1603 &sum_delta.first, &per_pool_sum_deltas_stamps[pool],
1604 &per_pool_sum_deltas[pool]);
1605}
1606
1607/**
1608 * Update pools' deltas
1609 *
1610 * @param cct CephContext
1611 * @param ts Timestamp for the stats being delta'ed
1612 * @param pg_pool_sum_old Map of pool stats for delta calcs.
1613 */
1614void PGMap::update_pool_deltas(CephContext *cct, const utime_t ts,
1615 const ceph::unordered_map<uint64_t,pool_stat_t>& pg_pool_sum_old)
1616{
1617 for (ceph::unordered_map<uint64_t,pool_stat_t>::const_iterator it = pg_pool_sum_old.begin();
1618 it != pg_pool_sum_old.end(); ++it) {
1619 update_one_pool_delta(cct, ts, it->first, it->second);
1620 }
1621}
1622
1623void PGMap::clear_delta()
1624{
1625 pg_sum_delta = pool_stat_t();
1626 pg_sum_deltas.clear();
1627 stamp_delta = utime_t();
1628}
1629
1630void PGMap::print_summary(Formatter *f, ostream *out) const
1631{
1632 std::stringstream ss;
1633 if (f)
1634 f->open_array_section("pgs_by_state");
1635
1636 // list is descending numeric order (by count)
1637 multimap<int,int> state_by_count; // count -> state
1638 for (ceph::unordered_map<int,int>::const_iterator p = num_pg_by_state.begin();
1639 p != num_pg_by_state.end();
1640 ++p) {
1641 state_by_count.insert(make_pair(p->second, p->first));
1642 }
1643 for (multimap<int,int>::reverse_iterator p = state_by_count.rbegin();
1644 p != state_by_count.rend();
1645 ++p) {
1646 if (f) {
1647 f->open_object_section("pgs_by_state_element");
1648 f->dump_string("state_name", pg_state_string(p->second));
1649 f->dump_unsigned("count", p->first);
1650 f->close_section();
1651 } else {
1652 ss.setf(std::ios::right);
1653 ss << " " << std::setw(7) << p->first
1654 << " " << pg_state_string(p->second) << "\n";
1655 ss.unsetf(std::ios::right);
1656 }
1657 }
1658 if (f)
1659 f->close_section();
1660
1661 if (f) {
1662 f->dump_unsigned("version", version);
1663 f->dump_unsigned("num_pgs", pg_stat.size());
1664 f->dump_unsigned("num_pools", pg_pool_sum.size());
1665 f->dump_unsigned("num_objects", pg_sum.stats.sum.num_objects);
1666 f->dump_unsigned("data_bytes", pg_sum.stats.sum.num_bytes);
1667 f->dump_unsigned("bytes_used", osd_sum.kb_used * 1024ull);
1668 f->dump_unsigned("bytes_avail", osd_sum.kb_avail * 1024ull);
1669 f->dump_unsigned("bytes_total", osd_sum.kb * 1024ull);
1670 } else {
1671 *out << " pgmap v" << version << ": "
1672 << pg_stat.size() << " pgs, " << pg_pool_sum.size() << " pools, "
1673 << prettybyte_t(pg_sum.stats.sum.num_bytes) << " data, "
1674 << si_t(pg_sum.stats.sum.num_objects) << " objects\n";
1675 *out << " "
1676 << kb_t(osd_sum.kb_used) << " used, "
1677 << kb_t(osd_sum.kb_avail) << " / "
1678 << kb_t(osd_sum.kb) << " avail\n";
1679 }
1680
1681
1682 if (num_pg_active < num_pg) {
1683 float p = (float)num_pg_active / (float)num_pg;
1684 if (f) {
1685 f->dump_float("active_pgs_ratio", p);
1686 } else {
1687 char b[20];
1688 snprintf(b, sizeof(b), "%.3lf", (1.0 - p) * 100.0);
1689 *out << " " << b << "% pgs inactive\n";
1690 }
1691 }
1692
1693 list<string> sl;
1694 overall_recovery_summary(f, &sl);
1695 if (!f && !sl.empty()) {
1696 for (list<string>::iterator p = sl.begin(); p != sl.end(); ++p)
1697 *out << " " << *p << "\n";
1698 }
1699 sl.clear();
1700
1701 if (!f)
1702 *out << ss.str(); // pgs by state
1703
1704 ostringstream ssr;
1705 overall_recovery_rate_summary(f, &ssr);
1706 if (!f && ssr.str().length())
1707 *out << "recovery io " << ssr.str() << "\n";
1708
1709 ssr.clear();
1710 ssr.str("");
1711
1712 overall_client_io_rate_summary(f, &ssr);
1713 if (!f && ssr.str().length())
1714 *out << " client io " << ssr.str() << "\n";
1715
1716 ssr.clear();
1717 ssr.str("");
1718
1719 overall_cache_io_rate_summary(f, &ssr);
1720 if (!f && ssr.str().length())
1721 *out << " cache io " << ssr.str() << "\n";
1722}
1723
1724void PGMap::print_oneline_summary(Formatter *f, ostream *out) const
1725{
1726 std::stringstream ss;
1727
1728 if (f)
1729 f->open_array_section("num_pg_by_state");
1730 for (ceph::unordered_map<int,int>::const_iterator p = num_pg_by_state.begin();
1731 p != num_pg_by_state.end();
1732 ++p) {
1733 if (f) {
1734 f->open_object_section("state");
1735 f->dump_string("name", pg_state_string(p->first));
1736 f->dump_unsigned("num", p->second);
1737 f->close_section();
1738 }
1739 if (p != num_pg_by_state.begin())
1740 ss << ", ";
1741 ss << p->second << " " << pg_state_string(p->first);
1742 }
1743 if (f)
1744 f->close_section();
1745
1746 string states = ss.str();
1747 if (out)
1748 *out << "v" << version << ": "
1749 << pg_stat.size() << " pgs: "
1750 << states << "; "
1751 << prettybyte_t(pg_sum.stats.sum.num_bytes) << " data, "
1752 << kb_t(osd_sum.kb_used) << " used, "
1753 << kb_t(osd_sum.kb_avail) << " / "
1754 << kb_t(osd_sum.kb) << " avail";
1755 if (f) {
1756 f->dump_unsigned("version", version);
1757 f->dump_unsigned("num_pgs", pg_stat.size());
1758 f->dump_unsigned("num_bytes", pg_sum.stats.sum.num_bytes);
1759 f->dump_unsigned("raw_bytes_used", osd_sum.kb_used << 10);
1760 f->dump_unsigned("raw_bytes_avail", osd_sum.kb_avail << 10);
1761 f->dump_unsigned("raw_bytes", osd_sum.kb << 10);
1762 }
1763
1764 // make non-negative; we can get negative values if osds send
1765 // uncommitted stats and then "go backward" or if they are just
1766 // buggy/wrong.
1767 pool_stat_t pos_delta = pg_sum_delta;
1768 pos_delta.floor(0);
1769 if (pos_delta.stats.sum.num_rd ||
1770 pos_delta.stats.sum.num_wr) {
1771 if (out)
1772 *out << "; ";
1773 if (pos_delta.stats.sum.num_rd) {
1774 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)stamp_delta;
1775 if (out)
1776 *out << pretty_si_t(rd) << "B/s rd, ";
1777 if (f)
1778 f->dump_unsigned("read_bytes_sec", rd);
1779 }
1780 if (pos_delta.stats.sum.num_wr) {
1781 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)stamp_delta;
1782 if (out)
1783 *out << pretty_si_t(wr) << "B/s wr, ";
1784 if (f)
1785 f->dump_unsigned("write_bytes_sec", wr);
1786 }
1787 int64_t iops = (pos_delta.stats.sum.num_rd + pos_delta.stats.sum.num_wr) / (double)stamp_delta;
1788 if (out)
1789 *out << pretty_si_t(iops) << "op/s";
1790 if (f)
1791 f->dump_unsigned("io_sec", iops);
1792 }
1793
1794 list<string> sl;
1795 overall_recovery_summary(f, &sl);
1796 if (out)
1797 for (list<string>::iterator p = sl.begin(); p != sl.end(); ++p)
1798 *out << "; " << *p;
1799 std::stringstream ssr;
1800 overall_recovery_rate_summary(f, &ssr);
1801 if (out && ssr.str().length())
1802 *out << "; " << ssr.str() << " recovering";
1803}
1804
1805void PGMap::generate_test_instances(list<PGMap*>& o)
1806{
1807 o.push_back(new PGMap);
1808 list<Incremental*> inc;
1809 Incremental::generate_test_instances(inc);
1810 delete inc.front();
1811 inc.pop_front();
1812 while (!inc.empty()) {
1813 PGMap *pmp = new PGMap();
1814 *pmp = *o.back();
1815 o.push_back(pmp);
1816 o.back()->apply_incremental(NULL, *inc.front());
1817 delete inc.front();
1818 inc.pop_front();
1819 }
1820}
1821
1822void PGMap::get_filtered_pg_stats(uint32_t state, int64_t poolid, int64_t osdid,
1823 bool primary, set<pg_t>& pgs) const
1824{
1825 for (ceph::unordered_map<pg_t, pg_stat_t>::const_iterator i = pg_stat.begin();
1826 i != pg_stat.end();
1827 ++i) {
1828 if ((poolid >= 0) && (uint64_t(poolid) != i->first.pool()))
1829 continue;
1830 if ((osdid >= 0) && !(i->second.is_acting_osd(osdid,primary)))
1831 continue;
1832 if (!(i->second.state & state))
1833 continue;
1834 pgs.insert(i->first);
1835 }
1836}
1837
1838void PGMap::dump_filtered_pg_stats(Formatter *f, set<pg_t>& pgs) const
1839{
1840 f->open_array_section("pg_stats");
1841 for (set<pg_t>::iterator i = pgs.begin(); i != pgs.end(); ++i) {
1842 const pg_stat_t& st = pg_stat.at(*i);
1843 f->open_object_section("pg_stat");
1844 f->dump_stream("pgid") << *i;
1845 st.dump(f);
1846 f->close_section();
1847 }
1848 f->close_section();
1849}
1850
1851void PGMap::dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const
1852{
1853 TextTable tab;
1854
1855 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1856 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1857 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1858 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1859 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1860 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1861 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1862 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1863 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1864 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1865 tab.define_column("STATE_STAMP", TextTable::LEFT, TextTable::RIGHT);
1866 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
1867 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
1868 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1869 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1870 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1871 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1872 tab.define_column("LAST_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1873 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1874 tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1875 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1876
1877 for (set<pg_t>::iterator i = pgs.begin(); i != pgs.end(); ++i) {
1878 const pg_stat_t& st = pg_stat.at(*i);
1879
1880 ostringstream reported;
1881 reported << st.reported_epoch << ":" << st.reported_seq;
1882
1883 tab << *i
1884 << st.stats.sum.num_objects
1885 << st.stats.sum.num_objects_missing_on_primary
1886 << st.stats.sum.num_objects_degraded
1887 << st.stats.sum.num_objects_misplaced
1888 << st.stats.sum.num_objects_unfound
1889 << st.stats.sum.num_bytes
1890 << st.log_size
1891 << st.ondisk_log_size
1892 << pg_state_string(st.state)
1893 << st.last_change
1894 << st.version
1895 << reported.str()
1896 << st.up
1897 << st.up_primary
1898 << st.acting
1899 << st.acting_primary
1900 << st.last_scrub
1901 << st.last_scrub_stamp
1902 << st.last_deep_scrub
1903 << st.last_deep_scrub_stamp
1904 << TextTable::endrow;
1905 }
1906
1907 ss << tab;
1908}
1909
1910int64_t PGMap::get_rule_avail(const OSDMap& osdmap, int ruleno) const
1911{
1912 map<int,float> wm;
1913 int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm);
1914 if (r < 0) {
1915 return r;
1916 }
1917 if (wm.empty()) {
1918 return 0;
1919 }
1920
1921 float fratio;
1922 if (osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS) && osdmap.get_full_ratio() > 0) {
1923 fratio = osdmap.get_full_ratio();
1924 } else if (full_ratio > 0) {
1925 fratio = full_ratio;
1926 } else {
1927 // this shouldn't really happen
1928 fratio = g_conf->mon_osd_full_ratio;
1929 if (fratio > 1.0) fratio /= 100;
1930 }
1931
1932 int64_t min = -1;
1933 for (map<int,float>::iterator p = wm.begin(); p != wm.end(); ++p) {
1934 ceph::unordered_map<int32_t,osd_stat_t>::const_iterator osd_info =
1935 osd_stat.find(p->first);
1936 if (osd_info != osd_stat.end()) {
1937 if (osd_info->second.kb == 0 || p->second == 0) {
1938 // osd must be out, hence its stats have been zeroed
1939 // (unless we somehow managed to have a disk with size 0...)
1940 //
1941 // (p->second == 0), if osd weight is 0, no need to
1942 // calculate proj below.
1943 continue;
1944 }
1945 double unusable = (double)osd_info->second.kb *
1946 (1.0 - fratio);
1947 double avail = MAX(0.0, (double)osd_info->second.kb_avail - unusable);
1948 avail *= 1024.0;
1949 int64_t proj = (int64_t)(avail / (double)p->second);
1950 if (min < 0 || proj < min) {
1951 min = proj;
1952 }
1953 } else {
1954 dout(0) << "Cannot get stat of OSD " << p->first << dendl;
1955 }
1956 }
1957 return min;
1958}
1959
/**
 * Format a percentage value for display.
 *
 * @param a value to format
 * @return "0" for anything below 0.01 (including negative values),
 *         otherwise the value in fixed notation with two decimals
 */
inline std::string percentify(const float& a) {
  if (a < 0.01) {
    return "0";
  }
  std::ostringstream formatted;
  formatted << std::fixed << std::setprecision(2) << a;
  return formatted.str();
}
1968
1969void PGMap::dump_pool_stats(const OSDMap &osd_map, stringstream *ss,
1970 Formatter *f, bool verbose) const
1971{
1972 TextTable tbl;
1973
1974 if (f) {
1975 f->open_array_section("pools");
1976 } else {
1977 tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
1978 tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT);
1979 if (verbose) {
1980 tbl.define_column("QUOTA OBJECTS", TextTable::LEFT, TextTable::LEFT);
1981 tbl.define_column("QUOTA BYTES", TextTable::LEFT, TextTable::LEFT);
1982 }
1983
1984 tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1985 tbl.define_column("%USED", TextTable::LEFT, TextTable::RIGHT);
1986 tbl.define_column("MAX AVAIL", TextTable::LEFT, TextTable::RIGHT);
1987 tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1988 if (verbose) {
1989 tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT);
1990 tbl.define_column("READ", TextTable::LEFT, TextTable::RIGHT);
1991 tbl.define_column("WRITE", TextTable::LEFT, TextTable::RIGHT);
1992 tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
1993 }
1994 }
1995
1996 map<int,uint64_t> avail_by_rule;
1997 for (map<int64_t,pg_pool_t>::const_iterator p = osd_map.get_pools().begin();
1998 p != osd_map.get_pools().end(); ++p) {
1999 int64_t pool_id = p->first;
2000 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
2001 continue;
2002 const string& pool_name = osd_map.get_pool_name(pool_id);
2003 const pool_stat_t &stat = pg_pool_sum.at(pool_id);
2004
2005 const pg_pool_t *pool = osd_map.get_pg_pool(pool_id);
2006 int ruleno = osd_map.crush->find_rule(pool->get_crush_ruleset(),
2007 pool->get_type(),
2008 pool->get_size());
2009 int64_t avail;
2010 float raw_used_rate;
2011 if (avail_by_rule.count(ruleno) == 0) {
2012 avail = get_rule_avail(osd_map, ruleno);
2013 if (avail < 0)
2014 avail = 0;
2015 avail_by_rule[ruleno] = avail;
2016 } else {
2017 avail = avail_by_rule[ruleno];
2018 }
2019 switch (pool->get_type()) {
2020 case pg_pool_t::TYPE_REPLICATED:
2021 avail /= pool->get_size();
2022 raw_used_rate = pool->get_size();
2023 break;
2024 case pg_pool_t::TYPE_ERASURE:
2025 {
2026 auto& ecp =
2027 osd_map.get_erasure_code_profile(pool->erasure_code_profile);
2028 auto pm = ecp.find("m");
2029 auto pk = ecp.find("k");
2030 if (pm != ecp.end() && pk != ecp.end()) {
2031 int k = atoi(pk->second.c_str());
2032 int m = atoi(pm->second.c_str());
2033 avail = avail * k / (m + k);
2034 raw_used_rate = (float)(m + k) / k;
2035 } else {
2036 raw_used_rate = 0.0;
2037 }
2038 }
2039 break;
2040 default:
2041 assert(0 == "unrecognized pool type");
2042 }
2043
2044 if (f) {
2045 f->open_object_section("pool");
2046 f->dump_string("name", pool_name);
2047 f->dump_int("id", pool_id);
2048 f->open_object_section("stats");
2049 } else {
2050 tbl << pool_name
2051 << pool_id;
2052 if (verbose) {
2053 if (pool->quota_max_objects == 0)
2054 tbl << "N/A";
2055 else
2056 tbl << si_t(pool->quota_max_objects);
2057
2058 if (pool->quota_max_bytes == 0)
2059 tbl << "N/A";
2060 else
2061 tbl << si_t(pool->quota_max_bytes);
2062 }
2063
2064 }
2065 dump_object_stat_sum(tbl, f, stat.stats.sum, avail, raw_used_rate, verbose, pool);
2066 if (f)
2067 f->close_section(); // stats
2068 else
2069 tbl << TextTable::endrow;
2070
2071 if (f)
2072 f->close_section(); // pool
2073 }
2074 if (f)
2075 f->close_section();
2076 else {
2077 assert(ss != nullptr);
2078 *ss << "POOLS:\n";
2079 tbl.set_indent(4);
2080 *ss << tbl;
2081 }
2082}
2083
2084void PGMap::dump_fs_stats(
2085 stringstream *ss, Formatter *f, bool verbose) const
2086{
2087 if (f) {
2088 f->open_object_section("stats");
2089 f->dump_int("total_bytes", osd_sum.kb * 1024ull);
2090 f->dump_int("total_used_bytes", osd_sum.kb_used * 1024ull);
2091 f->dump_int("total_avail_bytes", osd_sum.kb_avail * 1024ull);
2092 if (verbose) {
2093 f->dump_int("total_objects", pg_sum.stats.sum.num_objects);
2094 }
2095 f->close_section();
2096 } else {
2097 assert(ss != nullptr);
2098 TextTable tbl;
2099 tbl.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
2100 tbl.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
2101 tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
2102 tbl.define_column("%RAW USED", TextTable::LEFT, TextTable::RIGHT);
2103 if (verbose) {
2104 tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
2105 }
2106 tbl << stringify(si_t(osd_sum.kb*1024))
2107 << stringify(si_t(osd_sum.kb_avail*1024))
2108 << stringify(si_t(osd_sum.kb_used*1024));
2109 float used = 0.0;
2110 if (osd_sum.kb > 0) {
2111 used = ((float)osd_sum.kb_used / osd_sum.kb);
2112 }
2113 tbl << percentify(used*100);
2114 if (verbose) {
2115 tbl << stringify(si_t(pg_sum.stats.sum.num_objects));
2116 }
2117 tbl << TextTable::endrow;
2118
2119 *ss << "GLOBAL:\n";
2120 tbl.set_indent(4);
2121 *ss << tbl;
2122 }
2123}
2124
2125void PGMap::dump_object_stat_sum(TextTable &tbl, Formatter *f,
2126 const object_stat_sum_t &sum, uint64_t avail,
2127 float raw_used_rate, bool verbose,
2128 const pg_pool_t *pool)
2129{
2130 float curr_object_copies_rate = 0.0;
2131 if (sum.num_object_copies > 0)
2132 curr_object_copies_rate = (float)(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies;
2133
2134 if (f) {
2135 f->dump_int("kb_used", SHIFT_ROUND_UP(sum.num_bytes, 10));
2136 f->dump_int("bytes_used", sum.num_bytes);
2137 f->dump_unsigned("max_avail", avail);
2138 f->dump_int("objects", sum.num_objects);
2139 if (verbose) {
2140 f->dump_int("quota_objects", pool->quota_max_objects);
2141 f->dump_int("quota_bytes", pool->quota_max_bytes);
2142 f->dump_int("dirty", sum.num_objects_dirty);
2143 f->dump_int("rd", sum.num_rd);
2144 f->dump_int("rd_bytes", sum.num_rd_kb * 1024ull);
2145 f->dump_int("wr", sum.num_wr);
2146 f->dump_int("wr_bytes", sum.num_wr_kb * 1024ull);
2147 f->dump_int("raw_bytes_used", sum.num_bytes * raw_used_rate * curr_object_copies_rate);
2148 }
2149 } else {
2150 tbl << stringify(si_t(sum.num_bytes));
2151 float used = 0.0;
2152 if (avail) {
2153 used = sum.num_bytes * curr_object_copies_rate;
2154 used /= used + avail;
2155 } else if (sum.num_bytes) {
2156 used = 1.0;
2157 }
2158 tbl << percentify(used*100);
2159 tbl << si_t(avail);
2160 tbl << sum.num_objects;
2161 if (verbose) {
2162 tbl << stringify(si_t(sum.num_objects_dirty))
2163 << stringify(si_t(sum.num_rd))
2164 << stringify(si_t(sum.num_wr))
2165 << stringify(si_t(sum.num_bytes * raw_used_rate * curr_object_copies_rate));
2166 }
2167 }
2168}
2169
2170
2171int process_pg_map_command(
2172 const string& orig_prefix,
2173 const map<string,cmd_vartype>& orig_cmdmap,
2174 const PGMap& pg_map,
2175 const OSDMap& osdmap,
2176 Formatter *f,
2177 stringstream *ss,
2178 bufferlist *odata)
2179{
2180 string prefix = orig_prefix;
2181 map<string,cmd_vartype> cmdmap = orig_cmdmap;
2182
2183 // perhaps these would be better in the parsing, but it's weird
2184 bool primary = false;
2185 if (prefix == "pg dump_json") {
2186 vector<string> v;
2187 v.push_back(string("all"));
2188 cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
2189 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
2190 prefix = "pg dump";
2191 } else if (prefix == "pg dump_pools_json") {
2192 vector<string> v;
2193 v.push_back(string("pools"));
2194 cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
2195 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
2196 prefix = "pg dump";
2197 } else if (prefix == "pg ls-by-primary") {
2198 primary = true;
2199 prefix = "pg ls";
2200 } else if (prefix == "pg ls-by-osd") {
2201 prefix = "pg ls";
2202 } else if (prefix == "pg ls-by-pool") {
2203 prefix = "pg ls";
2204 string poolstr;
2205 cmd_getval(g_ceph_context, cmdmap, "poolstr", poolstr);
2206 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
2207 if (pool < 0) {
2208 *ss << "pool " << poolstr << " does not exist";
2209 return -ENOENT;
2210 }
2211 cmd_putval(g_ceph_context, cmdmap, "pool", pool);
2212 }
2213
2214 int r = 0;
2215 stringstream ds;
2216 if (prefix == "pg stat") {
2217 if (f) {
2218 f->open_object_section("pg_summary");
2219 pg_map.print_oneline_summary(f, NULL);
2220 f->close_section();
2221 f->flush(ds);
2222 } else {
2223 ds << pg_map;
2224 }
2225 odata->append(ds);
2226 return 0;
2227 }
2228
2229 if (prefix == "pg getmap") {
2230 pg_map.encode(*odata);
2231 *ss << "got pgmap version " << pg_map.version;
2232 return 0;
2233 }
2234
2235 if (prefix == "pg dump") {
2236 string val;
2237 vector<string> dumpcontents;
2238 set<string> what;
2239 if (cmd_getval(g_ceph_context, cmdmap, "dumpcontents", dumpcontents)) {
2240 copy(dumpcontents.begin(), dumpcontents.end(),
2241 inserter(what, what.end()));
2242 }
2243 if (what.empty())
2244 what.insert("all");
2245 if (f) {
2246 if (what.count("all")) {
2247 f->open_object_section("pg_map");
2248 pg_map.dump(f);
2249 f->close_section();
2250 } else if (what.count("summary") || what.count("sum")) {
2251 f->open_object_section("pg_map");
2252 pg_map.dump_basic(f);
2253 f->close_section();
2254 } else {
2255 if (what.count("pools")) {
2256 pg_map.dump_pool_stats(f);
2257 }
2258 if (what.count("osds")) {
2259 pg_map.dump_osd_stats(f);
2260 }
2261 if (what.count("pgs")) {
2262 pg_map.dump_pg_stats(f, false);
2263 }
2264 if (what.count("pgs_brief")) {
2265 pg_map.dump_pg_stats(f, true);
2266 }
2267 if (what.count("delta")) {
2268 f->open_object_section("delta");
2269 pg_map.dump_delta(f);
2270 f->close_section();
2271 }
2272 }
2273 f->flush(*odata);
2274 } else {
2275 if (what.count("all")) {
2276 pg_map.dump(ds);
2277 } else if (what.count("summary") || what.count("sum")) {
2278 pg_map.dump_basic(ds);
2279 pg_map.dump_pg_sum_stats(ds, true);
2280 pg_map.dump_osd_sum_stats(ds);
2281 } else {
2282 if (what.count("pgs_brief")) {
2283 pg_map.dump_pg_stats(ds, true);
2284 }
2285 bool header = true;
2286 if (what.count("pgs")) {
2287 pg_map.dump_pg_stats(ds, false);
2288 header = false;
2289 }
2290 if (what.count("pools")) {
2291 pg_map.dump_pool_stats(ds, header);
2292 }
2293 if (what.count("osds")) {
2294 pg_map.dump_osd_stats(ds);
2295 }
2296 }
2297 odata->append(ds);
2298 }
2299 *ss << "dumped " << what;
2300 return 0;
2301 }
2302
2303 if (prefix == "pg ls") {
2304 int64_t osd = -1;
2305 int64_t pool = -1;
2306 vector<string>states;
2307 set<pg_t> pgs;
2308 cmd_getval(g_ceph_context, cmdmap, "pool", pool);
2309 cmd_getval(g_ceph_context, cmdmap, "osd", osd);
2310 cmd_getval(g_ceph_context, cmdmap, "states", states);
2311 if (pool >= 0 && !osdmap.have_pg_pool(pool)) {
2312 *ss << "pool " << pool << " does not exist";
2313 return -ENOENT;
2314 }
2315 if (osd >= 0 && !osdmap.is_up(osd)) {
2316 *ss << "osd " << osd << " is not up";
2317 return -EAGAIN;
2318 }
2319 if (states.empty())
2320 states.push_back("all");
2321
2322 uint32_t state = 0;
2323
2324 while (!states.empty()) {
2325 string state_str = states.back();
2326
2327 if (state_str == "all") {
2328 state = -1;
2329 break;
2330 } else {
2331 int filter = pg_string_state(state_str);
2332 assert(filter != -1);
2333 state |= filter;
2334 }
2335
2336 states.pop_back();
2337 }
2338
2339 pg_map.get_filtered_pg_stats(state, pool, osd, primary, pgs);
2340
2341 if (f && !pgs.empty()) {
2342 pg_map.dump_filtered_pg_stats(f, pgs);
2343 f->flush(*odata);
2344 } else if (!pgs.empty()) {
2345 pg_map.dump_filtered_pg_stats(ds, pgs);
2346 odata->append(ds);
2347 }
2348 return 0;
2349 }
2350
2351 if (prefix == "pg dump_stuck") {
2352 vector<string> stuckop_vec;
2353 cmd_getval(g_ceph_context, cmdmap, "stuckops", stuckop_vec);
2354 if (stuckop_vec.empty())
2355 stuckop_vec.push_back("unclean");
2356 int64_t threshold;
2357 cmd_getval(g_ceph_context, cmdmap, "threshold", threshold,
2358 int64_t(g_conf->mon_pg_stuck_threshold));
2359
2360 r = pg_map.dump_stuck_pg_stats(ds, f, (int)threshold, stuckop_vec);
2361 odata->append(ds);
2362 if (r < 0)
2363 *ss << "failed";
2364 else
2365 *ss << "ok";
2366 return 0;
2367 }
2368
2369 if (prefix == "pg debug") {
2370 string debugop;
2371 cmd_getval(g_ceph_context, cmdmap, "debugop", debugop,
2372 string("unfound_objects_exist"));
2373 if (debugop == "unfound_objects_exist") {
2374 bool unfound_objects_exist = false;
2375 for (const auto& p : pg_map.pg_stat) {
2376 if (p.second.stats.sum.num_objects_unfound > 0) {
2377 unfound_objects_exist = true;
2378 break;
2379 }
2380 }
2381 if (unfound_objects_exist)
2382 ds << "TRUE";
2383 else
2384 ds << "FALSE";
2385 odata->append(ds);
2386 return 0;
2387 }
2388 if (debugop == "degraded_pgs_exist") {
2389 bool degraded_pgs_exist = false;
2390 for (const auto& p : pg_map.pg_stat) {
2391 if (p.second.stats.sum.num_objects_degraded > 0) {
2392 degraded_pgs_exist = true;
2393 break;
2394 }
2395 }
2396 if (degraded_pgs_exist)
2397 ds << "TRUE";
2398 else
2399 ds << "FALSE";
2400 odata->append(ds);
2401 return 0;
2402 }
2403 }
2404
2405 if (prefix == "osd perf") {
2406 if (f) {
2407 f->open_object_section("osdstats");
2408 pg_map.dump_osd_perf_stats(f);
2409 f->close_section();
2410 f->flush(ds);
2411 } else {
2412 pg_map.print_osd_perf_stats(&ds);
2413 }
2414 odata->append(ds);
2415 return 0;
2416 }
2417
2418 if (prefix == "osd blocked-by") {
2419 if (f) {
2420 f->open_object_section("osd_blocked_by");
2421 pg_map.dump_osd_blocked_by_stats(f);
2422 f->close_section();
2423 f->flush(ds);
2424 } else {
2425 pg_map.print_osd_blocked_by_stats(&ds);
2426 }
2427 odata->append(ds);
2428 return 0;
2429 }
2430
2431 if (prefix == "osd pool stats") {
2432 string pool_name;
2433 cmd_getval(g_ceph_context, cmdmap, "name", pool_name);
2434
2435 int64_t poolid = -ENOENT;
2436 bool one_pool = false;
2437 if (!pool_name.empty()) {
2438 poolid = osdmap.lookup_pg_pool_name(pool_name);
2439 if (poolid < 0) {
2440 assert(poolid == -ENOENT);
2441 *ss << "unrecognized pool '" << pool_name << "'";
2442 return -ENOENT;
2443 }
2444 one_pool = true;
2445 }
2446
2447 stringstream rs;
2448
2449 if (f)
2450 f->open_array_section("pool_stats");
2451 else {
2452 if (osdmap.get_pools().empty()) {
2453 *ss << "there are no pools!";
2454 goto stats_out;
2455 }
2456 }
2457
2458 for (auto& p : osdmap.get_pools()) {
2459 if (!one_pool)
2460 poolid = p.first;
2461
2462 pool_name = osdmap.get_pool_name(poolid);
2463
2464 if (f) {
2465 f->open_object_section("pool");
2466 f->dump_string("pool_name", pool_name.c_str());
2467 f->dump_int("pool_id", poolid);
2468 f->open_object_section("recovery");
2469 }
2470
2471 list<string> sl;
2472 stringstream tss;
2473 pg_map.pool_recovery_summary(f, &sl, poolid);
2474 if (!f && !sl.empty()) {
2475 for (auto& p : sl)
2476 tss << " " << p << "\n";
2477 }
2478
2479 if (f) {
2480 f->close_section();
2481 f->open_object_section("recovery_rate");
2482 }
2483
2484 ostringstream rss;
2485 pg_map.pool_recovery_rate_summary(f, &rss, poolid);
2486 if (!f && !rss.str().empty())
2487 tss << " recovery io " << rss.str() << "\n";
2488
2489 if (f) {
2490 f->close_section();
2491 f->open_object_section("client_io_rate");
2492 }
2493 rss.clear();
2494 rss.str("");
2495
2496 pg_map.pool_client_io_rate_summary(f, &rss, poolid);
2497 if (!f && !rss.str().empty())
2498 tss << " client io " << rss.str() << "\n";
2499
2500 // dump cache tier IO rate for cache pool
2501 const pg_pool_t *pool = osdmap.get_pg_pool(poolid);
2502 if (pool->is_tier()) {
2503 if (f) {
2504 f->close_section();
2505 f->open_object_section("cache_io_rate");
2506 }
2507 rss.clear();
2508 rss.str("");
2509
2510 pg_map.pool_cache_io_rate_summary(f, &rss, poolid);
2511 if (!f && !rss.str().empty())
2512 tss << " cache tier io " << rss.str() << "\n";
2513 }
2514 if (f) {
2515 f->close_section();
2516 f->close_section();
2517 } else {
2518 rs << "pool " << pool_name << " id " << poolid << "\n";
2519 if (!tss.str().empty())
2520 rs << tss.str() << "\n";
2521 else
2522 rs << " nothing is going on\n\n";
2523 }
2524 if (one_pool)
2525 break;
2526 }
2527
2528stats_out:
2529 if (f) {
2530 f->close_section();
2531 f->flush(ds);
2532 odata->append(ds);
2533 } else {
2534 odata->append(rs.str());
2535 }
2536 return 0;
2537 }
2538
2539 return -EOPNOTSUPP;
2540}
2541
2542void PGMapUpdater::check_osd_map(const OSDMap::Incremental &osd_inc,
2543 std::set<int> *need_check_down_pg_osds,
2544 std::map<int,utime_t> *last_osd_report,
2545 PGMap *pg_map,
2546 PGMap::Incremental *pending_inc)
2547{
2548 for (const auto &p : osd_inc.new_weight) {
2549 if (p.second == CEPH_OSD_OUT) {
2550 dout(10) << __func__ << " osd." << p.first << " went OUT" << dendl;
2551 pending_inc->stat_osd_out(p.first);
2552 }
2553 }
2554
2555 // this is conservative: we want to know if any osds (maybe) got marked down.
2556 for (const auto &p : osd_inc.new_state) {
2557 if (p.second & CEPH_OSD_UP) { // true if marked up OR down,
2558 // but we're too lazy to check
2559 // which
2560 need_check_down_pg_osds->insert(p.first);
2561
2562 // clear out the last_osd_report for this OSD
2563 map<int, utime_t>::iterator report = last_osd_report->find(p.first);
2564 if (report != last_osd_report->end()) {
2565 last_osd_report->erase(report);
2566 }
2567
2568 // clear out osd_stat slow request histogram
2569 dout(20) << __func__ << " clearing osd." << p.first
2570 << " request histogram" << dendl;
2571 pending_inc->stat_osd_down_up(p.first, *pg_map);
2572 }
2573
2574 if (p.second & CEPH_OSD_EXISTS) {
2575 // whether it was created *or* destroyed, we can safely drop
2576 // it's osd_stat_t record.
2577 dout(10) << __func__ << " osd." << p.first
2578 << " created or destroyed" << dendl;
2579 pending_inc->rm_stat(p.first);
2580
2581 // and adjust full, nearfull set
2582 pg_map->nearfull_osds.erase(p.first);
2583 pg_map->full_osds.erase(p.first);
2584 }
2585 }
2586}
2587
2588void PGMapUpdater::register_pg(
2589 const OSDMap &osd_map,
2590 pg_t pgid, epoch_t epoch,
2591 bool new_pool,
2592 const PGMap &pg_map,
2593 PGMap::Incremental *pending_inc)
2594{
2595 pg_t parent;
2596 int split_bits = 0;
2597 auto parent_stat = pg_map.pg_stat.end();
2598 if (!new_pool) {
2599 parent = pgid;
2600 while (1) {
2601 // remove most significant bit
2602 int msb = cbits(parent.ps());
2603 if (!msb)
2604 break;
2605 parent.set_ps(parent.ps() & ~(1<<(msb-1)));
2606 split_bits++;
2607 dout(30) << " is " << pgid << " parent " << parent << " ?" << dendl;
2608 parent_stat = pg_map.pg_stat.find(parent);
2609 if (parent_stat != pg_map.pg_stat.end() &&
2610 parent_stat->second.state != PG_STATE_CREATING) {
2611 dout(10) << " parent is " << parent << dendl;
2612 break;
2613 }
2614 }
2615 }
2616
2617 pg_stat_t &stats = pending_inc->pg_stat_updates[pgid];
2618 stats.state = PG_STATE_CREATING;
2619 stats.created = epoch;
2620 stats.parent = parent;
2621 stats.parent_split_bits = split_bits;
2622 stats.mapping_epoch = epoch;
2623
2624 if (parent_stat != pg_map.pg_stat.end()) {
2625 const pg_stat_t &ps = parent_stat->second;
2626 stats.last_fresh = ps.last_fresh;
2627 stats.last_active = ps.last_active;
2628 stats.last_change = ps.last_change;
2629 stats.last_peered = ps.last_peered;
2630 stats.last_clean = ps.last_clean;
2631 stats.last_unstale = ps.last_unstale;
2632 stats.last_undegraded = ps.last_undegraded;
2633 stats.last_fullsized = ps.last_fullsized;
2634 stats.last_scrub_stamp = ps.last_scrub_stamp;
2635 stats.last_deep_scrub_stamp = ps.last_deep_scrub_stamp;
2636 stats.last_clean_scrub_stamp = ps.last_clean_scrub_stamp;
2637 } else {
2638 utime_t now = osd_map.get_modified();
2639 stats.last_fresh = now;
2640 stats.last_active = now;
2641 stats.last_change = now;
2642 stats.last_peered = now;
2643 stats.last_clean = now;
2644 stats.last_unstale = now;
2645 stats.last_undegraded = now;
2646 stats.last_fullsized = now;
2647 stats.last_scrub_stamp = now;
2648 stats.last_deep_scrub_stamp = now;
2649 stats.last_clean_scrub_stamp = now;
2650 }
2651
2652 osd_map.pg_to_up_acting_osds(
2653 pgid,
2654 &stats.up,
2655 &stats.up_primary,
2656 &stats.acting,
2657 &stats.acting_primary);
2658
2659 if (split_bits == 0) {
2660 dout(10) << __func__ << " will create " << pgid
2661 << " primary " << stats.acting_primary
2662 << " acting " << stats.acting
2663 << dendl;
2664 } else {
2665 dout(10) << __func__ << " will create " << pgid
2666 << " primary " << stats.acting_primary
2667 << " acting " << stats.acting
2668 << " parent " << parent
2669 << " by " << split_bits << " bits"
2670 << dendl;
2671 }
2672}
2673
2674void PGMapUpdater::register_new_pgs(
2675 const OSDMap &osd_map,
2676 const PGMap &pg_map,
2677 PGMap::Incremental *pending_inc)
2678{
2679 epoch_t epoch = osd_map.get_epoch();
2680 dout(10) << __func__ << " checking pg pools for osdmap epoch " << epoch
2681 << ", last_pg_scan " << pg_map.last_pg_scan << dendl;
2682
2683 int created = 0;
2684 const auto &pools = osd_map.get_pools();
2685
2686 for (const auto &p : pools) {
2687 int64_t poolid = p.first;
2688 const pg_pool_t &pool = p.second;
2689 int ruleno = osd_map.crush->find_rule(pool.get_crush_ruleset(),
2690 pool.get_type(), pool.get_size());
2691 if (ruleno < 0 || !osd_map.crush->rule_exists(ruleno))
2692 continue;
2693
2694 if (pool.get_last_change() <= pg_map.last_pg_scan ||
2695 pool.get_last_change() <= pending_inc->pg_scan) {
2696 dout(10) << " no change in pool " << poolid << " " << pool << dendl;
2697 continue;
2698 }
2699
2700 dout(10) << __func__ << " scanning pool " << poolid
2701 << " " << pool << dendl;
2702
2703 // first pgs in this pool
2704 bool new_pool = pg_map.pg_pool_sum.count(poolid) == 0;
2705
2706 for (ps_t ps = 0; ps < pool.get_pg_num(); ps++) {
2707 pg_t pgid(ps, poolid, -1);
2708 if (pg_map.pg_stat.count(pgid)) {
2709 dout(20) << "register_new_pgs have " << pgid << dendl;
2710 continue;
2711 }
2712 created++;
2713 register_pg(osd_map, pgid, pool.get_last_change(), new_pool,
2714 pg_map, pending_inc);
2715 }
2716 }
2717
2718 int removed = 0;
2719 for (const auto &p : pg_map.creating_pgs) {
2720 if (p.preferred() >= 0) {
2721 dout(20) << " removing creating_pg " << p
2722 << " because it is localized and obsolete" << dendl;
2723 pending_inc->pg_remove.insert(p);
2724 ++removed;
2725 } else if (!osd_map.have_pg_pool(p.pool())) {
2726 dout(20) << " removing creating_pg " << p
2727 << " because containing pool deleted" << dendl;
2728 pending_inc->pg_remove.insert(p);
2729 ++removed;
2730 }
2731 }
2732
2733 // deleted pools?
2734 for (const auto &p : pg_map.pg_stat) {
2735 if (!osd_map.have_pg_pool(p.first.pool())) {
2736 dout(20) << " removing pg_stat " << p.first << " because "
2737 << "containing pool deleted" << dendl;
2738 pending_inc->pg_remove.insert(p.first);
2739 ++removed;
2740 } else if (p.first.preferred() >= 0) {
2741 dout(20) << " removing localized pg " << p.first << dendl;
2742 pending_inc->pg_remove.insert(p.first);
2743 ++removed;
2744 }
2745 }
2746
2747 // we don't want to redo this work if we can avoid it.
2748 pending_inc->pg_scan = epoch;
2749
2750 dout(10) << "register_new_pgs registered " << created << " new pgs, removed "
2751 << removed << " uncreated pgs" << dendl;
2752}
2753
2754
2755void PGMapUpdater::update_creating_pgs(
2756 const OSDMap &osd_map,
2757 const PGMap &pg_map,
2758 PGMap::Incremental *pending_inc)
2759{
2760 dout(10) << __func__ << " to " << pg_map.creating_pgs.size()
2761 << " pgs, osdmap epoch " << osd_map.get_epoch()
2762 << dendl;
2763
2764 unsigned changed = 0;
2765 for (set<pg_t>::const_iterator p = pg_map.creating_pgs.begin();
2766 p != pg_map.creating_pgs.end();
2767 ++p) {
2768 pg_t pgid = *p;
2769 pg_t on = pgid;
2770 ceph::unordered_map<pg_t,pg_stat_t>::const_iterator q =
2771 pg_map.pg_stat.find(pgid);
2772 assert(q != pg_map.pg_stat.end());
2773 const pg_stat_t *s = &q->second;
2774
2775 if (s->parent_split_bits)
2776 on = s->parent;
2777
2778 vector<int> up, acting;
2779 int up_primary, acting_primary;
2780 osd_map.pg_to_up_acting_osds(
2781 on,
2782 &up,
2783 &up_primary,
2784 &acting,
2785 &acting_primary);
2786
2787 if (up != s->up ||
2788 up_primary != s->up_primary ||
2789 acting != s->acting ||
2790 acting_primary != s->acting_primary) {
2791 pg_stat_t *ns = &pending_inc->pg_stat_updates[pgid];
2792 if (osd_map.get_epoch() > ns->reported_epoch) {
2793 dout(20) << __func__ << " " << pgid << " "
2794 << " acting_primary: " << s->acting_primary
2795 << " -> " << acting_primary
2796 << " acting: " << s->acting << " -> " << acting
2797 << " up_primary: " << s->up_primary << " -> " << up_primary
2798 << " up: " << s->up << " -> " << up
2799 << dendl;
2800
2801 // only initialize if it wasn't already a pending update
2802 if (ns->reported_epoch == 0)
2803 *ns = *s;
2804
2805 // note epoch if the target of the create message changed
2806 if (acting_primary != ns->acting_primary)
2807 ns->mapping_epoch = osd_map.get_epoch();
2808
2809 ns->up = up;
2810 ns->up_primary = up_primary;
2811 ns->acting = acting;
2812 ns->acting_primary = acting_primary;
2813
2814 ++changed;
2815 } else {
2816 dout(20) << __func__ << " " << pgid << " has pending update from newer"
2817 << " epoch " << ns->reported_epoch
2818 << dendl;
2819 }
2820 }
2821 }
2822 if (changed) {
2823 dout(10) << __func__ << " " << changed << " pgs changed primary" << dendl;
2824 }
2825}
2826
2827static void _try_mark_pg_stale(
2828 const OSDMap& osdmap,
2829 pg_t pgid,
2830 const pg_stat_t& cur,
2831 PGMap::Incremental *pending_inc)
2832{
2833 if ((cur.state & PG_STATE_STALE) == 0 &&
2834 cur.acting_primary != -1 &&
2835 osdmap.is_down(cur.acting_primary)) {
2836 pg_stat_t *newstat;
2837 auto q = pending_inc->pg_stat_updates.find(pgid);
2838 if (q != pending_inc->pg_stat_updates.end()) {
2839 if ((q->second.acting_primary == cur.acting_primary) ||
2840 ((q->second.state & PG_STATE_STALE) == 0 &&
2841 q->second.acting_primary != -1 &&
2842 osdmap.is_down(q->second.acting_primary))) {
2843 newstat = &q->second;
2844 } else {
2845 // pending update is no longer down or already stale
2846 return;
2847 }
2848 } else {
2849 newstat = &pending_inc->pg_stat_updates[pgid];
2850 *newstat = cur;
2851 }
2852 dout(10) << __func__ << " marking pg " << pgid
2853 << " stale (acting_primary " << newstat->acting_primary
2854 << ")" << dendl;
2855 newstat->state |= PG_STATE_STALE;
2856 newstat->last_unstale = ceph_clock_now();
2857 }
2858}
2859
2860void PGMapUpdater::check_down_pgs(
2861 const OSDMap &osdmap,
2862 const PGMap &pg_map,
2863 bool check_all,
2864 const set<int>& need_check_down_pg_osds,
2865 PGMap::Incremental *pending_inc)
2866{
2867 // if a large number of osds changed state, just iterate over the whole
2868 // pg map.
2869 if (need_check_down_pg_osds.size() > (unsigned)osdmap.get_num_osds() *
2870 g_conf->mon_pg_check_down_all_threshold) {
2871 check_all = true;
2872 }
2873
2874 if (check_all) {
2875 for (const auto& p : pg_map.pg_stat) {
2876 _try_mark_pg_stale(osdmap, p.first, p.second, pending_inc);
2877 }
2878 } else {
2879 for (auto osd : need_check_down_pg_osds) {
2880 if (osdmap.is_down(osd)) {
2881 auto p = pg_map.pg_by_osd.find(osd);
2882 if (p == pg_map.pg_by_osd.end()) {
2883 continue;
2884 }
2885 for (auto pgid : p->second) {
2886 const pg_stat_t &stat = pg_map.pg_stat.at(pgid);
2887 assert(stat.acting_primary == osd);
2888 _try_mark_pg_stale(osdmap, pgid, stat, pending_inc);
2889 }
2890 }
2891 }
2892 }
2893}
2894
2895int reweight::by_utilization(
2896 const OSDMap &osdmap,
2897 const PGMap &pgm,
2898 int oload,
2899 double max_changef,
2900 int max_osds,
2901 bool by_pg, const set<int64_t> *pools,
2902 bool no_increasing,
2903 mempool::osdmap::map<int32_t, uint32_t>* new_weights,
2904 std::stringstream *ss,
2905 std::string *out_str,
2906 Formatter *f)
2907{
2908 if (oload <= 100) {
2909 *ss << "You must give a percentage higher than 100. "
2910 "The reweighting threshold will be calculated as <average-utilization> "
2911 "times <input-percentage>. For example, an argument of 200 would "
2912 "reweight OSDs which are twice as utilized as the average OSD.\n";
2913 return -EINVAL;
2914 }
2915
2916 vector<int> pgs_by_osd(osdmap.get_max_osd());
2917
2918 // Avoid putting a small number (or 0) in the denominator when calculating
2919 // average_util
2920 double average_util;
2921 if (by_pg) {
2922 // by pg mapping
2923 double weight_sum = 0.0; // sum up the crush weights
2924 unsigned num_pg_copies = 0;
2925 int num_osds = 0;
2926 for (const auto& pg : pgm.pg_stat) {
2927 if (pools && pools->count(pg.first.pool()) == 0)
2928 continue;
2929 for (const auto acting : pg.second.acting) {
2930 if (acting >= (int)pgs_by_osd.size())
2931 pgs_by_osd.resize(acting);
2932 if (pgs_by_osd[acting] == 0) {
2933 if (osdmap.crush->get_item_weightf(acting) <= 0) {
2934 //skip if we currently can not identify item
2935 continue;
2936 }
2937 weight_sum += osdmap.crush->get_item_weightf(acting);
2938 ++num_osds;
2939 }
2940 ++pgs_by_osd[acting];
2941 ++num_pg_copies;
2942 }
2943 }
2944
2945 if (!num_osds || (num_pg_copies / num_osds < g_conf->mon_reweight_min_pgs_per_osd)) {
2946 *ss << "Refusing to reweight: we only have " << num_pg_copies
2947 << " PGs across " << num_osds << " osds!\n";
2948 return -EDOM;
2949 }
2950
2951 average_util = (double)num_pg_copies / weight_sum;
2952 } else {
2953 // by osd utilization
2954 int num_osd = MAX(1, pgm.osd_stat.size());
2955 if ((uint64_t)pgm.osd_sum.kb * 1024 / num_osd
2956 < g_conf->mon_reweight_min_bytes_per_osd) {
2957 *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb
2958 << " kb across all osds!\n";
2959 return -EDOM;
2960 }
2961 if ((uint64_t)pgm.osd_sum.kb_used * 1024 / num_osd
2962 < g_conf->mon_reweight_min_bytes_per_osd) {
2963 *ss << "Refusing to reweight: we only have " << pgm.osd_sum.kb_used
2964 << " kb used across all osds!\n";
2965 return -EDOM;
2966 }
2967
2968 average_util = (double)pgm.osd_sum.kb_used / (double)pgm.osd_sum.kb;
2969 }
2970
2971 // adjust down only if we are above the threshold
2972 const double overload_util = average_util * (double)oload / 100.0;
2973
2974 // but aggressively adjust weights up whenever possible.
2975 const double underload_util = average_util;
2976
2977 const unsigned max_change = (unsigned)(max_changef * (double)0x10000);
2978
2979 ostringstream oss;
2980 if (f) {
2981 f->open_object_section("reweight_by_utilization");
2982 f->dump_int("overload_min", oload);
2983 f->dump_float("max_change", max_changef);
2984 f->dump_int("max_change_osds", max_osds);
2985 f->dump_float("average_utilization", average_util);
2986 f->dump_float("overload_utilization", overload_util);
2987 } else {
2988 oss << "oload " << oload << "\n";
2989 oss << "max_change " << max_changef << "\n";
2990 oss << "max_change_osds " << max_osds << "\n";
2991 oss.precision(4);
2992 oss << "average_utilization " << std::fixed << average_util << "\n";
2993 oss << "overload_utilization " << overload_util << "\n";
2994 }
2995 int num_changed = 0;
2996
2997 // precompute util for each OSD
2998 std::vector<std::pair<int, float> > util_by_osd;
2999 for (const auto& p : pgm.osd_stat) {
3000 std::pair<int, float> osd_util;
3001 osd_util.first = p.first;
3002 if (by_pg) {
3003 if (p.first >= (int)pgs_by_osd.size() ||
3004 pgs_by_osd[p.first] == 0) {
3005 // skip if this OSD does not contain any pg
3006 // belonging to the specified pool(s).
3007 continue;
3008 }
3009
3010 if (osdmap.crush->get_item_weightf(p.first) <= 0) {
3011 // skip if we are unable to locate item.
3012 continue;
3013 }
3014
3015 osd_util.second = pgs_by_osd[p.first] / osdmap.crush->get_item_weightf(p.first);
3016 } else {
3017 osd_util.second = (double)p.second.kb_used / (double)p.second.kb;
3018 }
3019 util_by_osd.push_back(osd_util);
3020 }
3021
3022 // sort by absolute deviation from the mean utilization,
3023 // in descending order.
3024 std::sort(util_by_osd.begin(), util_by_osd.end(),
3025 [average_util](std::pair<int, float> l, std::pair<int, float> r) {
3026 return abs(l.second - average_util) > abs(r.second - average_util);
3027 }
3028 );
3029
3030 if (f)
3031 f->open_array_section("reweights");
3032
3033 for (const auto& p : util_by_osd) {
3034 unsigned weight = osdmap.get_weight(p.first);
3035 if (weight == 0) {
3036 // skip if OSD is currently out
3037 continue;
3038 }
3039 float util = p.second;
3040
3041 if (util >= overload_util) {
3042 // Assign a lower weight to overloaded OSDs. The current weight
3043 // is a factor to take into account the original weights,
3044 // to represent e.g. differing storage capacities
3045 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
3046 if (weight > max_change)
3047 new_weight = MAX(new_weight, weight - max_change);
3048 new_weights->insert({p.first, new_weight});
3049 if (f) {
3050 f->open_object_section("osd");
3051 f->dump_int("osd", p.first);
3052 f->dump_float("weight", (float)weight / (float)0x10000);
3053 f->dump_float("new_weight", (float)new_weight / (float)0x10000);
3054 f->close_section();
3055 } else {
3056 oss << "osd." << p.first << " weight "
3057 << (float)weight / (float)0x10000 << " -> "
3058 << (float)new_weight / (float)0x10000 << "\n";
3059 }
3060 if (++num_changed >= max_osds)
3061 break;
3062 }
3063 if (!no_increasing && util <= underload_util) {
3064 // assign a higher weight.. if we can.
3065 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
3066 new_weight = MIN(new_weight, weight + max_change);
3067 if (new_weight > 0x10000)
3068 new_weight = 0x10000;
3069 if (new_weight > weight) {
3070 new_weights->insert({p.first, new_weight});
3071 oss << "osd." << p.first << " weight "
3072 << (float)weight / (float)0x10000 << " -> "
3073 << (float)new_weight / (float)0x10000 << "\n";
3074 if (++num_changed >= max_osds)
3075 break;
3076 }
3077 }
3078 }
3079 if (f) {
3080 f->close_section();
3081 }
3082
3083 OSDMap newmap;
3084 newmap.deepish_copy_from(osdmap);
3085 OSDMap::Incremental newinc;
3086 newinc.fsid = newmap.get_fsid();
3087 newinc.epoch = newmap.get_epoch() + 1;
3088 newinc.new_weight = *new_weights;
3089 newmap.apply_incremental(newinc);
3090
3091 osdmap.summarize_mapping_stats(&newmap, pools, out_str, f);
3092
3093 if (f) {
3094 f->close_section();
3095 } else {
3096 *out_str += "\n";
3097 *out_str += oss.str();
3098 }
3099 return num_changed;
3100}