1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
20 #include "common/config.h"
21 #include "common/Formatter.h"
22 #include "common/TextTable.h"
23 #include "include/ceph_features.h"
24 #include "include/str_map.h"
26 #include "common/code_environment.h"
28 #include "crush/CrushTreeDumper.h"
29 #include "common/Clock.h"
30 #include "mon/PGStatService.h"
32 #define dout_subsys ceph_subsys_osd
34 MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap
, osdmap
, osdmap
);
35 MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental
, osdmap_inc
, osdmap
);
38 // ----------------------------------
41 void osd_info_t::dump(Formatter
*f
) const
43 f
->dump_int("last_clean_begin", last_clean_begin
);
44 f
->dump_int("last_clean_end", last_clean_end
);
45 f
->dump_int("up_from", up_from
);
46 f
->dump_int("up_thru", up_thru
);
47 f
->dump_int("down_at", down_at
);
48 f
->dump_int("lost_at", lost_at
);
51 void osd_info_t::encode(bufferlist
& bl
) const
54 ::encode(struct_v
, bl
);
55 ::encode(last_clean_begin
, bl
);
56 ::encode(last_clean_end
, bl
);
57 ::encode(up_from
, bl
);
58 ::encode(up_thru
, bl
);
59 ::encode(down_at
, bl
);
60 ::encode(lost_at
, bl
);
63 void osd_info_t::decode(bufferlist::iterator
& bl
)
66 ::decode(struct_v
, bl
);
67 ::decode(last_clean_begin
, bl
);
68 ::decode(last_clean_end
, bl
);
69 ::decode(up_from
, bl
);
70 ::decode(up_thru
, bl
);
71 ::decode(down_at
, bl
);
72 ::decode(lost_at
, bl
);
75 void osd_info_t::generate_test_instances(list
<osd_info_t
*>& o
)
77 o
.push_back(new osd_info_t
);
78 o
.push_back(new osd_info_t
);
79 o
.back()->last_clean_begin
= 1;
80 o
.back()->last_clean_end
= 2;
81 o
.back()->up_from
= 30;
82 o
.back()->up_thru
= 40;
83 o
.back()->down_at
= 5;
84 o
.back()->lost_at
= 6;
87 ostream
& operator<<(ostream
& out
, const osd_info_t
& info
)
89 out
<< "up_from " << info
.up_from
90 << " up_thru " << info
.up_thru
91 << " down_at " << info
.down_at
92 << " last_clean_interval [" << info
.last_clean_begin
<< "," << info
.last_clean_end
<< ")";
94 out
<< " lost_at " << info
.lost_at
;
98 // ----------------------------------
101 void osd_xinfo_t::dump(Formatter
*f
) const
103 f
->dump_stream("down_stamp") << down_stamp
;
104 f
->dump_float("laggy_probability", laggy_probability
);
105 f
->dump_int("laggy_interval", laggy_interval
);
106 f
->dump_int("features", features
);
107 f
->dump_unsigned("old_weight", old_weight
);
110 void osd_xinfo_t::encode(bufferlist
& bl
) const
112 ENCODE_START(3, 1, bl
);
113 ::encode(down_stamp
, bl
);
114 __u32 lp
= laggy_probability
* 0xfffffffful
;
116 ::encode(laggy_interval
, bl
);
117 ::encode(features
, bl
);
118 ::encode(old_weight
, bl
);
122 void osd_xinfo_t::decode(bufferlist::iterator
& bl
)
125 ::decode(down_stamp
, bl
);
128 laggy_probability
= (float)lp
/ (float)0xffffffff;
129 ::decode(laggy_interval
, bl
);
131 ::decode(features
, bl
);
135 ::decode(old_weight
, bl
);
141 void osd_xinfo_t::generate_test_instances(list
<osd_xinfo_t
*>& o
)
143 o
.push_back(new osd_xinfo_t
);
144 o
.push_back(new osd_xinfo_t
);
145 o
.back()->down_stamp
= utime_t(2, 3);
146 o
.back()->laggy_probability
= .123;
147 o
.back()->laggy_interval
= 123456;
148 o
.back()->old_weight
= 0x7fff;
151 ostream
& operator<<(ostream
& out
, const osd_xinfo_t
& xi
)
153 return out
<< "down_stamp " << xi
.down_stamp
154 << " laggy_probability " << xi
.laggy_probability
155 << " laggy_interval " << xi
.laggy_interval
156 << " old_weight " << xi
.old_weight
;
159 // ----------------------------------
160 // OSDMap::Incremental
162 int OSDMap::Incremental::get_net_marked_out(const OSDMap
*previous
) const
165 for (auto &weight
: new_weight
) {
166 if (weight
.second
== CEPH_OSD_OUT
&& !previous
->is_out(weight
.first
))
168 else if (weight
.second
!= CEPH_OSD_OUT
&& previous
->is_out(weight
.first
))
174 int OSDMap::Incremental::get_net_marked_down(const OSDMap
*previous
) const
177 for (auto &state
: new_state
) { //
178 if (state
.second
& CEPH_OSD_UP
) {
179 if (previous
->is_up(state
.first
))
188 int OSDMap::Incremental::identify_osd(uuid_d u
) const
190 for (auto &uuid
: new_uuid
)
191 if (uuid
.second
== u
)
196 int OSDMap::Incremental::propagate_snaps_to_tiers(CephContext
*cct
,
197 const OSDMap
& osdmap
)
199 assert(epoch
== osdmap
.get_epoch() + 1);
201 for (auto &new_pool
: new_pools
) {
202 if (!new_pool
.second
.tiers
.empty()) {
203 pg_pool_t
& base
= new_pool
.second
;
205 for (const auto &tier_pool
: base
.tiers
) {
206 const auto &r
= new_pools
.find(tier_pool
);
208 if (r
== new_pools
.end()) {
209 const pg_pool_t
*orig
= osdmap
.get_pg_pool(tier_pool
);
211 lderr(cct
) << __func__
<< " no pool " << tier_pool
<< dendl
;
214 tier
= get_new_pool(tier_pool
, orig
);
218 if (tier
->tier_of
!= new_pool
.first
) {
219 lderr(cct
) << __func__
<< " " << r
->first
<< " tier_of != " << new_pool
.first
<< dendl
;
223 ldout(cct
, 10) << __func__
<< " from " << new_pool
.first
<< " to "
224 << tier_pool
<< dendl
;
225 tier
->snap_seq
= base
.snap_seq
;
226 tier
->snap_epoch
= base
.snap_epoch
;
227 tier
->snaps
= base
.snaps
;
228 tier
->removed_snaps
= base
.removed_snaps
;
236 bool OSDMap::subtree_is_down(int id
, set
<int> *down_cache
) const
242 down_cache
->count(id
)) {
247 crush
->get_children(id
, &children
);
248 for (const auto &child
: children
) {
249 if (!subtree_is_down(child
, down_cache
)) {
254 down_cache
->insert(id
);
259 bool OSDMap::containing_subtree_is_down(CephContext
*cct
, int id
, int subtree_type
, set
<int> *down_cache
) const
261 // use a stack-local down_cache if we didn't get one from the
262 // caller. then at least this particular call will avoid duplicated
264 set
<int> local_down_cache
;
266 down_cache
= &local_down_cache
;
275 type
= crush
->get_bucket_type(current
);
279 if (!subtree_is_down(current
, down_cache
)) {
280 ldout(cct
, 30) << "containing_subtree_is_down(" << id
<< ") = false" << dendl
;
284 // is this a big enough subtree to be marked as down?
285 if (type
>= subtree_type
) {
286 ldout(cct
, 30) << "containing_subtree_is_down(" << id
<< ") = true ... " << type
<< " >= " << subtree_type
<< dendl
;
290 int r
= crush
->get_immediate_parent_id(current
, ¤t
);
297 bool OSDMap::subtree_type_is_down(CephContext
*cct
, int id
, int subtree_type
, set
<int> *down_in_osds
, set
<int> *up_in_osds
,
298 set
<int> *subtree_up
, unordered_map
<int, set
<int> > *subtree_type_down
) const
301 bool is_down_ret
= is_down(id
);
304 down_in_osds
->insert(id
);
306 up_in_osds
->insert(id
);
312 if (subtree_type_down
&&
313 (*subtree_type_down
)[subtree_type
].count(id
)) {
318 crush
->get_children(id
, &children
);
319 for (const auto &child
: children
) {
320 if (!subtree_type_is_down(cct
, child
, crush
->get_bucket_type(child
), down_in_osds
, up_in_osds
, subtree_up
, subtree_type_down
)) {
321 subtree_up
->insert(id
);
325 if (subtree_type_down
) {
326 (*subtree_type_down
)[subtree_type
].insert(id
);
331 void OSDMap::Incremental::encode_client_old(bufferlist
& bl
) const
337 ::encode(modified
, bl
);
338 int32_t new_t
= new_pool_max
;
340 ::encode(new_flags
, bl
);
341 ::encode(fullmap
, bl
);
344 ::encode(new_max_osd
, bl
);
345 // for ::encode(new_pools, bl);
346 __u32 n
= new_pools
.size();
348 for (const auto &new_pool
: new_pools
) {
351 ::encode(new_pool
.second
, bl
, 0);
353 // for ::encode(new_pool_names, bl);
354 n
= new_pool_names
.size();
357 for (const auto &new_pool_name
: new_pool_names
) {
358 n
= new_pool_name
.first
;
360 ::encode(new_pool_name
.second
, bl
);
362 // for ::encode(old_pools, bl);
363 n
= old_pools
.size();
365 for (auto &old_pool
: old_pools
) {
369 ::encode(new_up_client
, bl
, 0);
371 // legacy is map<int32_t,uint8_t>
372 uint32_t n
= new_state
.size();
374 for (auto p
: new_state
) {
375 ::encode(p
.first
, bl
);
376 ::encode((uint8_t)p
.second
, bl
);
379 ::encode(new_weight
, bl
);
380 // for ::encode(new_pg_temp, bl);
381 n
= new_pg_temp
.size();
384 for (const auto &pg_temp
: new_pg_temp
) {
385 old_pg_t opg
= pg_temp
.first
.get_old_pg();
387 ::encode(pg_temp
.second
, bl
);
391 void OSDMap::Incremental::encode_classic(bufferlist
& bl
, uint64_t features
) const
393 if ((features
& CEPH_FEATURE_PGID64
) == 0) {
394 encode_client_old(bl
);
403 ::encode(modified
, bl
);
404 ::encode(new_pool_max
, bl
);
405 ::encode(new_flags
, bl
);
406 ::encode(fullmap
, bl
);
409 ::encode(new_max_osd
, bl
);
410 ::encode(new_pools
, bl
, features
);
411 ::encode(new_pool_names
, bl
);
412 ::encode(old_pools
, bl
);
413 ::encode(new_up_client
, bl
, features
);
415 uint32_t n
= new_state
.size();
417 for (auto p
: new_state
) {
418 ::encode(p
.first
, bl
);
419 ::encode((uint8_t)p
.second
, bl
);
422 ::encode(new_weight
, bl
);
423 ::encode(new_pg_temp
, bl
);
428 ::encode(new_hb_back_up
, bl
, features
);
429 ::encode(new_up_thru
, bl
);
430 ::encode(new_last_clean_interval
, bl
);
431 ::encode(new_lost
, bl
);
432 ::encode(new_blacklist
, bl
, features
);
433 ::encode(old_blacklist
, bl
, features
);
434 ::encode(new_up_cluster
, bl
, features
);
435 ::encode(cluster_snapshot
, bl
);
436 ::encode(new_uuid
, bl
);
437 ::encode(new_xinfo
, bl
);
438 ::encode(new_hb_front_up
, bl
, features
);
441 void OSDMap::Incremental::encode(bufferlist
& bl
, uint64_t features
) const
443 if ((features
& CEPH_FEATURE_OSDMAP_ENC
) == 0) {
444 encode_classic(bl
, features
);
448 // only a select set of callers should *ever* be encoding new
449 // OSDMaps. others should be passing around the canonical encoded
450 // buffers from on high. select out those callers by passing in an
451 // "impossible" feature bit.
452 assert(features
& CEPH_FEATURE_RESERVED
);
453 features
&= ~CEPH_FEATURE_RESERVED
;
455 size_t start_offset
= bl
.length();
457 buffer::list::iterator crc_it
;
459 // meta-encoding: how we include client-used and osd-specific data
460 ENCODE_START(8, 7, bl
);
464 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
467 ENCODE_START(v
, 1, bl
); // client-usable data
470 ::encode(modified
, bl
);
471 ::encode(new_pool_max
, bl
);
472 ::encode(new_flags
, bl
);
473 ::encode(fullmap
, bl
);
476 ::encode(new_max_osd
, bl
);
477 ::encode(new_pools
, bl
, features
);
478 ::encode(new_pool_names
, bl
);
479 ::encode(old_pools
, bl
);
480 ::encode(new_up_client
, bl
, features
);
482 ::encode(new_state
, bl
);
484 uint32_t n
= new_state
.size();
486 for (auto p
: new_state
) {
487 ::encode(p
.first
, bl
);
488 ::encode((uint8_t)p
.second
, bl
);
491 ::encode(new_weight
, bl
);
492 ::encode(new_pg_temp
, bl
);
493 ::encode(new_primary_temp
, bl
);
494 ::encode(new_primary_affinity
, bl
);
495 ::encode(new_erasure_code_profiles
, bl
);
496 ::encode(old_erasure_code_profiles
, bl
);
498 ::encode(new_pg_upmap
, bl
);
499 ::encode(old_pg_upmap
, bl
);
500 ::encode(new_pg_upmap_items
, bl
);
501 ::encode(old_pg_upmap_items
, bl
);
503 ENCODE_FINISH(bl
); // client-usable data
507 uint8_t target_v
= 6;
508 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
511 ENCODE_START(target_v
, 1, bl
); // extended, osd-only data
512 ::encode(new_hb_back_up
, bl
, features
);
513 ::encode(new_up_thru
, bl
);
514 ::encode(new_last_clean_interval
, bl
);
515 ::encode(new_lost
, bl
);
516 ::encode(new_blacklist
, bl
, features
);
517 ::encode(old_blacklist
, bl
, features
);
518 ::encode(new_up_cluster
, bl
, features
);
519 ::encode(cluster_snapshot
, bl
);
520 ::encode(new_uuid
, bl
);
521 ::encode(new_xinfo
, bl
);
522 ::encode(new_hb_front_up
, bl
, features
);
523 ::encode(features
, bl
); // NOTE: features arg, not the member
525 ::encode(new_nearfull_ratio
, bl
);
526 ::encode(new_full_ratio
, bl
);
527 ::encode(new_backfillfull_ratio
, bl
);
529 // 5 was string-based new_require_min_compat_client
531 ::encode(new_require_min_compat_client
, bl
);
532 ::encode(new_require_osd_release
, bl
);
534 ENCODE_FINISH(bl
); // osd-only data
537 ::encode((uint32_t)0, bl
); // dummy inc_crc
540 tail_offset
= bl
.length();
542 ::encode(full_crc
, bl
);
544 ENCODE_FINISH(bl
); // meta-encoding wrapper
548 front
.substr_of(bl
, start_offset
, crc_it
.get_off() - start_offset
);
549 inc_crc
= front
.crc32c(-1);
551 tail
.substr_of(bl
, tail_offset
, bl
.length() - tail_offset
);
552 inc_crc
= tail
.crc32c(inc_crc
);
555 crc_it
.copy_in(4, (char*)&crc_le
);
559 void OSDMap::Incremental::decode_classic(bufferlist::iterator
&p
)
567 ::decode(modified
, p
);
568 if (v
== 4 || v
== 5) {
572 ::decode(new_pool_max
, p
);
573 ::decode(new_flags
, p
);
574 ::decode(fullmap
, p
);
577 ::decode(new_max_osd
, p
);
583 ::decode(new_pools
[t
], p
);
586 ::decode(new_pools
, p
);
589 new_pool_names
.clear();
593 ::decode(new_pool_names
[t
], p
);
596 ::decode(new_pool_names
, p
);
606 ::decode(old_pools
, p
);
608 ::decode(new_up_client
, p
);
610 map
<int32_t,uint8_t> ns
;
613 new_state
[q
.first
] = q
.second
;
616 ::decode(new_weight
, p
);
623 ::decode_raw(opg
, p
);
624 ::decode(new_pg_temp
[pg_t(opg
)], p
);
627 ::decode(new_pg_temp
, p
);
630 // decode short map, too.
631 if (v
== 5 && p
.end())
638 ::decode(new_hb_back_up
, p
);
640 ::decode(new_pool_names
, p
);
641 ::decode(new_up_thru
, p
);
642 ::decode(new_last_clean_interval
, p
);
643 ::decode(new_lost
, p
);
644 ::decode(new_blacklist
, p
);
645 ::decode(old_blacklist
, p
);
647 ::decode(new_up_cluster
, p
);
649 ::decode(cluster_snapshot
, p
);
651 ::decode(new_uuid
, p
);
653 ::decode(new_xinfo
, p
);
655 ::decode(new_hb_front_up
, p
);
658 void OSDMap::Incremental::decode(bufferlist::iterator
& bl
)
661 * Older encodings of the Incremental had a single struct_v which
662 * covered the whole encoding, and was prior to our modern
663 * stuff which includes a compatv and a size. So if we see
664 * a struct_v < 7, we must rewind to the beginning and use our
667 size_t start_offset
= bl
.get_off();
668 size_t tail_offset
= 0;
669 bufferlist crc_front
, crc_tail
;
671 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl
); // wrapper
673 int struct_v_size
= sizeof(struct_v
);
674 bl
.advance(-struct_v_size
);
678 encode_features
= CEPH_FEATURE_PGID64
;
684 DECODE_START(5, bl
); // client-usable data
687 ::decode(modified
, bl
);
688 ::decode(new_pool_max
, bl
);
689 ::decode(new_flags
, bl
);
690 ::decode(fullmap
, bl
);
693 ::decode(new_max_osd
, bl
);
694 ::decode(new_pools
, bl
);
695 ::decode(new_pool_names
, bl
);
696 ::decode(old_pools
, bl
);
697 ::decode(new_up_client
, bl
);
699 ::decode(new_state
, bl
);
701 map
<int32_t,uint8_t> ns
;
704 new_state
[q
.first
] = q
.second
;
707 ::decode(new_weight
, bl
);
708 ::decode(new_pg_temp
, bl
);
709 ::decode(new_primary_temp
, bl
);
711 ::decode(new_primary_affinity
, bl
);
713 new_primary_affinity
.clear();
715 ::decode(new_erasure_code_profiles
, bl
);
716 ::decode(old_erasure_code_profiles
, bl
);
718 new_erasure_code_profiles
.clear();
719 old_erasure_code_profiles
.clear();
722 ::decode(new_pg_upmap
, bl
);
723 ::decode(old_pg_upmap
, bl
);
724 ::decode(new_pg_upmap_items
, bl
);
725 ::decode(old_pg_upmap_items
, bl
);
727 DECODE_FINISH(bl
); // client-usable data
731 DECODE_START(6, bl
); // extended, osd-only data
732 ::decode(new_hb_back_up
, bl
);
733 ::decode(new_up_thru
, bl
);
734 ::decode(new_last_clean_interval
, bl
);
735 ::decode(new_lost
, bl
);
736 ::decode(new_blacklist
, bl
);
737 ::decode(old_blacklist
, bl
);
738 ::decode(new_up_cluster
, bl
);
739 ::decode(cluster_snapshot
, bl
);
740 ::decode(new_uuid
, bl
);
741 ::decode(new_xinfo
, bl
);
742 ::decode(new_hb_front_up
, bl
);
744 ::decode(encode_features
, bl
);
746 encode_features
= CEPH_FEATURE_PGID64
| CEPH_FEATURE_OSDMAP_ENC
;
748 ::decode(new_nearfull_ratio
, bl
);
749 ::decode(new_full_ratio
, bl
);
751 new_nearfull_ratio
= -1;
755 ::decode(new_backfillfull_ratio
, bl
);
757 new_backfillfull_ratio
= -1;
763 new_require_min_compat_client
= ceph_release_from_name(r
.c_str());
767 ::decode(new_require_min_compat_client
, bl
);
768 ::decode(new_require_osd_release
, bl
);
770 if (new_flags
>= 0 && (new_flags
& CEPH_OSDMAP_REQUIRE_LUMINOUS
)) {
771 // only for compat with post-kraken pre-luminous test clusters
772 new_require_osd_release
= CEPH_RELEASE_LUMINOUS
;
773 new_flags
&= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS
);
774 } else if (new_flags
>= 0 && (new_flags
& CEPH_OSDMAP_REQUIRE_KRAKEN
)) {
775 new_require_osd_release
= CEPH_RELEASE_KRAKEN
;
776 } else if (new_flags
>= 0 && (new_flags
& CEPH_OSDMAP_REQUIRE_JEWEL
)) {
777 new_require_osd_release
= CEPH_RELEASE_JEWEL
;
779 new_require_osd_release
= -1;
782 DECODE_FINISH(bl
); // osd-only data
787 crc_front
.substr_of(bl
.get_bl(), start_offset
, bl
.get_off() - start_offset
);
788 ::decode(inc_crc
, bl
);
789 tail_offset
= bl
.get_off();
790 ::decode(full_crc
, bl
);
797 DECODE_FINISH(bl
); // wrapper
801 uint32_t actual
= crc_front
.crc32c(-1);
802 if (tail_offset
< bl
.get_off()) {
804 tail
.substr_of(bl
.get_bl(), tail_offset
, bl
.get_off() - tail_offset
);
805 actual
= tail
.crc32c(actual
);
807 if (inc_crc
!= actual
) {
809 ss
<< "bad crc, actual " << actual
<< " != expected " << inc_crc
;
811 throw buffer::malformed_input(s
.c_str());
816 void OSDMap::Incremental::dump(Formatter
*f
) const
818 f
->dump_int("epoch", epoch
);
819 f
->dump_stream("fsid") << fsid
;
820 f
->dump_stream("modified") << modified
;
821 f
->dump_int("new_pool_max", new_pool_max
);
822 f
->dump_int("new_flags", new_flags
);
823 f
->dump_float("new_full_ratio", new_full_ratio
);
824 f
->dump_float("new_nearfull_ratio", new_nearfull_ratio
);
825 f
->dump_float("new_backfillfull_ratio", new_backfillfull_ratio
);
826 f
->dump_int("new_require_min_compat_client", new_require_min_compat_client
);
827 f
->dump_int("new_require_osd_release", new_require_osd_release
);
829 if (fullmap
.length()) {
830 f
->open_object_section("full_map");
832 bufferlist fbl
= fullmap
; // kludge around constness.
833 auto p
= fbl
.begin();
838 if (crush
.length()) {
839 f
->open_object_section("crush");
841 bufferlist tbl
= crush
; // kludge around constness.
842 auto p
= tbl
.begin();
848 f
->dump_int("new_max_osd", new_max_osd
);
850 f
->open_array_section("new_pools");
852 for (const auto &new_pool
: new_pools
) {
853 f
->open_object_section("pool");
854 f
->dump_int("pool", new_pool
.first
);
855 new_pool
.second
.dump(f
);
859 f
->open_array_section("new_pool_names");
861 for (const auto &new_pool_name
: new_pool_names
) {
862 f
->open_object_section("pool_name");
863 f
->dump_int("pool", new_pool_name
.first
);
864 f
->dump_string("name", new_pool_name
.second
);
868 f
->open_array_section("old_pools");
870 for (const auto &old_pool
: old_pools
)
871 f
->dump_int("pool", old_pool
);
874 f
->open_array_section("new_up_osds");
876 for (const auto &upclient
: new_up_client
) {
877 f
->open_object_section("osd");
878 f
->dump_int("osd", upclient
.first
);
879 f
->dump_stream("public_addr") << upclient
.second
;
880 f
->dump_stream("cluster_addr") << new_up_cluster
.find(upclient
.first
)->second
;
881 f
->dump_stream("heartbeat_back_addr") << new_hb_back_up
.find(upclient
.first
)->second
;
882 map
<int32_t, entity_addr_t
>::const_iterator q
;
883 if ((q
= new_hb_front_up
.find(upclient
.first
)) != new_hb_front_up
.end())
884 f
->dump_stream("heartbeat_front_addr") << q
->second
;
889 f
->open_array_section("new_weight");
891 for (const auto &weight
: new_weight
) {
892 f
->open_object_section("osd");
893 f
->dump_int("osd", weight
.first
);
894 f
->dump_int("weight", weight
.second
);
899 f
->open_array_section("osd_state_xor");
900 for (const auto &ns
: new_state
) {
901 f
->open_object_section("osd");
902 f
->dump_int("osd", ns
.first
);
904 calc_state_set(new_state
.find(ns
.first
)->second
, st
);
905 f
->open_array_section("state_xor");
906 for (auto &state
: st
)
907 f
->dump_string("state", state
);
912 f
->open_array_section("new_pg_temp");
914 for (const auto &pg_temp
: new_pg_temp
) {
915 f
->open_object_section("pg");
916 f
->dump_stream("pgid") << pg_temp
.first
;
917 f
->open_array_section("osds");
919 for (const auto &osd
: pg_temp
.second
)
920 f
->dump_int("osd", osd
);
926 f
->open_array_section("primary_temp");
928 for (const auto &primary_temp
: new_primary_temp
) {
929 f
->dump_stream("pgid") << primary_temp
.first
;
930 f
->dump_int("osd", primary_temp
.second
);
932 f
->close_section(); // primary_temp
934 f
->open_array_section("new_pg_upmap");
935 for (auto& i
: new_pg_upmap
) {
936 f
->open_object_section("mapping");
937 f
->dump_stream("pgid") << i
.first
;
938 f
->open_array_section("osds");
939 for (auto osd
: i
.second
) {
940 f
->dump_int("osd", osd
);
946 f
->open_array_section("old_pg_upmap");
947 for (auto& i
: old_pg_upmap
) {
948 f
->dump_stream("pgid") << i
;
952 f
->open_array_section("new_pg_upmap_items");
953 for (auto& i
: new_pg_upmap_items
) {
954 f
->open_object_section("mapping");
955 f
->dump_stream("pgid") << i
.first
;
956 f
->open_array_section("mappings");
957 for (auto& p
: i
.second
) {
958 f
->open_object_section("mapping");
959 f
->dump_int("from", p
.first
);
960 f
->dump_int("to", p
.second
);
967 f
->open_array_section("old_pg_upmap_items");
968 for (auto& i
: old_pg_upmap_items
) {
969 f
->dump_stream("pgid") << i
;
973 f
->open_array_section("new_up_thru");
975 for (const auto &up_thru
: new_up_thru
) {
976 f
->open_object_section("osd");
977 f
->dump_int("osd", up_thru
.first
);
978 f
->dump_int("up_thru", up_thru
.second
);
983 f
->open_array_section("new_lost");
985 for (const auto &lost
: new_lost
) {
986 f
->open_object_section("osd");
987 f
->dump_int("osd", lost
.first
);
988 f
->dump_int("epoch_lost", lost
.second
);
993 f
->open_array_section("new_last_clean_interval");
995 for (const auto &last_clean_interval
: new_last_clean_interval
) {
996 f
->open_object_section("osd");
997 f
->dump_int("osd", last_clean_interval
.first
);
998 f
->dump_int("first", last_clean_interval
.second
.first
);
999 f
->dump_int("last", last_clean_interval
.second
.second
);
1004 f
->open_array_section("new_blacklist");
1005 for (const auto &blist
: new_blacklist
) {
1008 f
->dump_stream(ss
.str().c_str()) << blist
.second
;
1011 f
->open_array_section("old_blacklist");
1012 for (const auto &blist
: old_blacklist
)
1013 f
->dump_stream("addr") << blist
;
1016 f
->open_array_section("new_xinfo");
1017 for (const auto &xinfo
: new_xinfo
) {
1018 f
->open_object_section("xinfo");
1019 f
->dump_int("osd", xinfo
.first
);
1020 xinfo
.second
.dump(f
);
1025 if (cluster_snapshot
.size())
1026 f
->dump_string("cluster_snapshot", cluster_snapshot
);
1028 f
->open_array_section("new_uuid");
1029 for (const auto &uuid
: new_uuid
) {
1030 f
->open_object_section("osd");
1031 f
->dump_int("osd", uuid
.first
);
1032 f
->dump_stream("uuid") << uuid
.second
;
1037 OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles
, f
);
1038 f
->open_array_section("old_erasure_code_profiles");
1039 for (const auto &erasure_code_profile
: old_erasure_code_profiles
) {
1040 f
->dump_string("old", erasure_code_profile
.c_str());
1045 void OSDMap::Incremental::generate_test_instances(list
<Incremental
*>& o
)
1047 o
.push_back(new Incremental
);
1050 // ----------------------------------
1053 void OSDMap::set_epoch(epoch_t e
)
1056 for (auto &pool
: pools
)
1057 pool
.second
.last_change
= e
;
1060 bool OSDMap::is_blacklisted(const entity_addr_t
& a
) const
1062 if (blacklist
.empty())
1065 // this specific instance?
1066 if (blacklist
.count(a
))
1069 // is entire ip blacklisted?
1071 entity_addr_t b
= a
;
1074 if (blacklist
.count(b
)) {
1082 void OSDMap::get_blacklist(list
<pair
<entity_addr_t
,utime_t
> > *bl
) const
1084 std::copy(blacklist
.begin(), blacklist
.end(), std::back_inserter(*bl
));
1087 void OSDMap::get_blacklist(std::set
<entity_addr_t
> *bl
) const
1089 for (const auto &i
: blacklist
) {
1090 bl
->insert(i
.first
);
1094 void OSDMap::set_max_osd(int m
)
1098 osd_state
.resize(m
);
1099 osd_weight
.resize(m
);
1100 for (; o
<max_osd
; o
++) {
1102 osd_weight
[o
] = CEPH_OSD_OUT
;
1105 osd_xinfo
.resize(m
);
1106 osd_addrs
->client_addr
.resize(m
);
1107 osd_addrs
->cluster_addr
.resize(m
);
1108 osd_addrs
->hb_back_addr
.resize(m
);
1109 osd_addrs
->hb_front_addr
.resize(m
);
1110 osd_uuid
->resize(m
);
1111 if (osd_primary_affinity
)
1112 osd_primary_affinity
->resize(m
, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
);
1117 int OSDMap::calc_num_osds()
1122 for (int i
=0; i
<max_osd
; i
++) {
1123 if (osd_state
[i
] & CEPH_OSD_EXISTS
) {
1125 if (osd_state
[i
] & CEPH_OSD_UP
) {
1128 if (get_weight(i
) != CEPH_OSD_OUT
) {
1136 void OSDMap::count_full_nearfull_osds(int *full
, int *backfill
, int *nearfull
) const
1141 for (int i
= 0; i
< max_osd
; ++i
) {
1142 if (exists(i
) && is_up(i
) && is_in(i
)) {
1143 if (osd_state
[i
] & CEPH_OSD_FULL
)
1145 else if (osd_state
[i
] & CEPH_OSD_BACKFILLFULL
)
1147 else if (osd_state
[i
] & CEPH_OSD_NEARFULL
)
1153 static bool get_osd_utilization(
1154 const mempool::pgmap::unordered_map
<int32_t,osd_stat_t
> &osd_stat
,
1155 int id
, int64_t* kb
, int64_t* kb_used
, int64_t* kb_avail
)
1157 auto p
= osd_stat
.find(id
);
1158 if (p
== osd_stat
.end())
1161 *kb_used
= p
->second
.kb_used
;
1162 *kb_avail
= p
->second
.kb_avail
;
1166 void OSDMap::get_full_osd_util(
1167 const mempool::pgmap::unordered_map
<int32_t,osd_stat_t
> &osd_stat
,
1168 map
<int, float> *full
, map
<int, float> *backfill
, map
<int, float> *nearfull
) const
1173 for (int i
= 0; i
< max_osd
; ++i
) {
1174 if (exists(i
) && is_up(i
) && is_in(i
)) {
1175 int64_t kb
, kb_used
, kb_avail
;
1176 if (osd_state
[i
] & CEPH_OSD_FULL
) {
1177 if (get_osd_utilization(osd_stat
, i
, &kb
, &kb_used
, &kb_avail
))
1178 full
->emplace(i
, (float)kb_used
/ (float)kb
);
1179 } else if (osd_state
[i
] & CEPH_OSD_BACKFILLFULL
) {
1180 if (get_osd_utilization(osd_stat
, i
, &kb
, &kb_used
, &kb_avail
))
1181 backfill
->emplace(i
, (float)kb_used
/ (float)kb
);
1182 } else if (osd_state
[i
] & CEPH_OSD_NEARFULL
) {
1183 if (get_osd_utilization(osd_stat
, i
, &kb
, &kb_used
, &kb_avail
))
1184 nearfull
->emplace(i
, (float)kb_used
/ (float)kb
);
1190 void OSDMap::get_full_osd_counts(set
<int> *full
, set
<int> *backfill
,
1191 set
<int> *nearfull
) const
1196 for (int i
= 0; i
< max_osd
; ++i
) {
1197 if (exists(i
) && is_up(i
) && is_in(i
)) {
1198 if (osd_state
[i
] & CEPH_OSD_FULL
)
1200 else if (osd_state
[i
] & CEPH_OSD_BACKFILLFULL
)
1201 backfill
->emplace(i
);
1202 else if (osd_state
[i
] & CEPH_OSD_NEARFULL
)
1203 nearfull
->emplace(i
);
1208 void OSDMap::get_all_osds(set
<int32_t>& ls
) const
1210 for (int i
=0; i
<max_osd
; i
++)
1215 void OSDMap::get_up_osds(set
<int32_t>& ls
) const
1217 for (int i
= 0; i
< max_osd
; i
++) {
1223 void OSDMap::get_out_osds(set
<int32_t>& ls
) const
1225 for (int i
= 0; i
< max_osd
; i
++) {
1231 void OSDMap::calc_state_set(int state
, set
<string
>& st
)
1234 for (unsigned s
= 1; t
; s
<<= 1) {
1237 st
.insert(ceph_osd_state_name(s
));
1242 void OSDMap::adjust_osd_weights(const map
<int,double>& weights
, Incremental
& inc
) const
1245 for (const auto &weight
: weights
) {
1246 if (weight
.second
> max
)
1247 max
= weight
.second
;
1250 for (const auto &weight
: weights
) {
1251 inc
.new_weight
[weight
.first
] = (unsigned)((weight
.second
/ max
) * CEPH_OSD_IN
);
1255 int OSDMap::identify_osd(const entity_addr_t
& addr
) const
1257 for (int i
=0; i
<max_osd
; i
++)
1258 if (exists(i
) && (get_addr(i
) == addr
|| get_cluster_addr(i
) == addr
))
1263 int OSDMap::identify_osd(const uuid_d
& u
) const
1265 for (int i
=0; i
<max_osd
; i
++)
1266 if (exists(i
) && get_uuid(i
) == u
)
1271 int OSDMap::identify_osd_on_all_channels(const entity_addr_t
& addr
) const
1273 for (int i
=0; i
<max_osd
; i
++)
1274 if (exists(i
) && (get_addr(i
) == addr
|| get_cluster_addr(i
) == addr
||
1275 get_hb_back_addr(i
) == addr
|| get_hb_front_addr(i
) == addr
))
1280 int OSDMap::find_osd_on_ip(const entity_addr_t
& ip
) const
1282 for (int i
=0; i
<max_osd
; i
++)
1283 if (exists(i
) && (get_addr(i
).is_same_host(ip
) || get_cluster_addr(i
).is_same_host(ip
)))
1289 uint64_t OSDMap::get_features(int entity_type
, uint64_t *pmask
) const
1291 uint64_t features
= 0; // things we actually have
1292 uint64_t mask
= 0; // things we could have
1294 if (crush
->has_nondefault_tunables())
1295 features
|= CEPH_FEATURE_CRUSH_TUNABLES
;
1296 if (crush
->has_nondefault_tunables2())
1297 features
|= CEPH_FEATURE_CRUSH_TUNABLES2
;
1298 if (crush
->has_nondefault_tunables3())
1299 features
|= CEPH_FEATURE_CRUSH_TUNABLES3
;
1300 if (crush
->has_v4_buckets())
1301 features
|= CEPH_FEATURE_CRUSH_V4
;
1302 if (crush
->has_nondefault_tunables5())
1303 features
|= CEPH_FEATURE_CRUSH_TUNABLES5
;
1304 if (crush
->has_incompat_choose_args())
1305 features
|= CEPH_FEATURE_CRUSH_CHOOSE_ARGS
;
1306 mask
|= CEPH_FEATURES_CRUSH
;
1308 if (!pg_upmap
.empty() || !pg_upmap_items
.empty())
1309 features
|= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
;
1310 mask
|= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
;
1312 for (auto &pool
: pools
) {
1313 if (pool
.second
.has_flag(pg_pool_t::FLAG_HASHPSPOOL
)) {
1314 features
|= CEPH_FEATURE_OSDHASHPSPOOL
;
1316 if (pool
.second
.is_erasure() &&
1317 entity_type
!= CEPH_ENTITY_TYPE_CLIENT
) { // not for clients
1318 features
|= CEPH_FEATURE_OSD_ERASURE_CODES
;
1320 if (!pool
.second
.tiers
.empty() ||
1321 pool
.second
.is_tier()) {
1322 features
|= CEPH_FEATURE_OSD_CACHEPOOL
;
1324 int ruleid
= crush
->find_rule(pool
.second
.get_crush_rule(),
1325 pool
.second
.get_type(),
1326 pool
.second
.get_size());
1328 if (crush
->is_v2_rule(ruleid
))
1329 features
|= CEPH_FEATURE_CRUSH_V2
;
1330 if (crush
->is_v3_rule(ruleid
))
1331 features
|= CEPH_FEATURE_CRUSH_TUNABLES3
;
1332 if (crush
->is_v5_rule(ruleid
))
1333 features
|= CEPH_FEATURE_CRUSH_TUNABLES5
;
1336 if (entity_type
== CEPH_ENTITY_TYPE_OSD
) {
1337 for (auto &erasure_code_profile
: erasure_code_profiles
) {
1338 auto& profile
= erasure_code_profile
.second
;
1339 const auto& plugin
= profile
.find("plugin");
1340 if (plugin
!= profile
.end()) {
1341 if (plugin
->second
== "isa" || plugin
->second
== "lrc")
1342 features
|= CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2
;
1343 if (plugin
->second
== "shec")
1344 features
|= CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3
;
1348 mask
|= CEPH_FEATURE_OSDHASHPSPOOL
| CEPH_FEATURE_OSD_CACHEPOOL
;
1349 if (entity_type
!= CEPH_ENTITY_TYPE_CLIENT
)
1350 mask
|= CEPH_FEATURE_OSD_ERASURE_CODES
;
1352 if (osd_primary_affinity
) {
1353 for (int i
= 0; i
< max_osd
; ++i
) {
1354 if ((*osd_primary_affinity
)[i
] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
) {
1355 features
|= CEPH_FEATURE_OSD_PRIMARY_AFFINITY
;
1360 mask
|= CEPH_FEATURE_OSD_PRIMARY_AFFINITY
;
1362 if (entity_type
== CEPH_ENTITY_TYPE_OSD
) {
1363 const uint64_t jewel_features
= CEPH_FEATURE_SERVER_JEWEL
;
1364 if (require_osd_release
>= CEPH_RELEASE_JEWEL
) {
1365 features
|= jewel_features
;
1367 mask
|= jewel_features
;
1369 const uint64_t kraken_features
= CEPH_FEATUREMASK_SERVER_KRAKEN
1370 | CEPH_FEATURE_MSG_ADDR2
;
1371 if (require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
1372 features
|= kraken_features
;
1374 mask
|= kraken_features
;
1382 uint8_t OSDMap::get_min_compat_client() const
1384 uint64_t f
= get_features(CEPH_ENTITY_TYPE_CLIENT
, nullptr);
1386 if (HAVE_FEATURE(f
, OSDMAP_PG_UPMAP
) || // v12.0.0-1733-g27d6f43
1387 HAVE_FEATURE(f
, CRUSH_CHOOSE_ARGS
)) { // v12.0.1-2172-gef1ef28
1388 return CEPH_RELEASE_LUMINOUS
; // v12.2.0
1390 if (HAVE_FEATURE(f
, CRUSH_TUNABLES5
)) { // v10.0.0-612-g043a737
1391 return CEPH_RELEASE_JEWEL
; // v10.2.0
1393 if (HAVE_FEATURE(f
, CRUSH_V4
)) { // v0.91-678-g325fc56
1394 return CEPH_RELEASE_HAMMER
; // v0.94.0
1396 if (HAVE_FEATURE(f
, OSD_PRIMARY_AFFINITY
) || // v0.76-553-gf825624
1397 HAVE_FEATURE(f
, CRUSH_TUNABLES3
) || // v0.76-395-ge20a55d
1398 HAVE_FEATURE(f
, OSD_ERASURE_CODES
) || // v0.73-498-gbfc86a8
1399 HAVE_FEATURE(f
, OSD_CACHEPOOL
)) { // v0.67-401-gb91c1c5
1400 return CEPH_RELEASE_FIREFLY
; // v0.80.0
1402 if (HAVE_FEATURE(f
, CRUSH_TUNABLES2
) || // v0.54-684-g0cc47ff
1403 HAVE_FEATURE(f
, OSDHASHPSPOOL
)) { // v0.57-398-g8cc2b0f
1404 return CEPH_RELEASE_DUMPLING
; // v0.67.0
1406 if (HAVE_FEATURE(f
, CRUSH_TUNABLES
)) { // v0.48argonaut-206-g6f381af
1407 return CEPH_RELEASE_ARGONAUT
; // v0.48argonaut-206-g6f381af
1409 return CEPH_RELEASE_ARGONAUT
; // v0.48argonaut-206-g6f381af
// Rebuild cached_up_osd_features: reset it to 0, then visit every osd id
// below max_osd and fold that osd's osd_xinfo_t::features into the cache.
// NOTE(review): lines of this function are missing from this extract; the
// conditions that choose between the plain assignment and the "&=" below
// (and any filter on which osds are visited) are not visible here --
// presumably only up osds count and the first one seeds the mask; confirm
// against the full source.
1412 void OSDMap::_calc_up_osd_features()
1415 cached_up_osd_features
= 0;
1416 for (int osd
= 0; osd
< max_osd
; ++osd
) {
1419 const osd_xinfo_t
&xi
= get_xinfo(osd
);
// take this osd's feature bits as the mask
1421 cached_up_osd_features
= xi
.features
;
// keep only the bits this osd has as well
1424 cached_up_osd_features
&= xi
.features
;
1429 uint64_t OSDMap::get_up_osd_features() const
1431 return cached_up_osd_features
;
// Share storage between two maps, *o and *n.
//
// For each component that compares equal in both maps, n's handle is
// reassigned to o's so that both maps refer to the same object.  The
// components visited are: the per-osd client / cluster / heartbeat-back /
// heartbeat-front addresses, the osd_addrs container as a whole, the crush
// map, pg_temp, primary_temp and osd_uuid.
//
// NOTE(review): parts of this function are missing from this extract (the
// body of the equal-epoch check, the bookkeeping that decides whether
// osd_addrs can be shared wholesale, and the declarations of oc / nc).
1434 void OSDMap::dedup(const OSDMap
*o
, OSDMap
*n
)
// same epoch: the consequent is not visible here (presumably an early return)
1436 if (o
->epoch
== n
->epoch
)
1442 if (o
->max_osd
!= n
->max_osd
)
// walk the osd ids present in both maps
1444 for (int i
= 0; i
< o
->max_osd
&& i
< n
->max_osd
; i
++) {
// client address: both set and equal by value -> point n at o's object
1445 if ( n
->osd_addrs
->client_addr
[i
] && o
->osd_addrs
->client_addr
[i
] &&
1446 *n
->osd_addrs
->client_addr
[i
] == *o
->osd_addrs
->client_addr
[i
])
1447 n
->osd_addrs
->client_addr
[i
] = o
->osd_addrs
->client_addr
[i
];
// cluster address, same treatment
1450 if ( n
->osd_addrs
->cluster_addr
[i
] && o
->osd_addrs
->cluster_addr
[i
] &&
1451 *n
->osd_addrs
->cluster_addr
[i
] == *o
->osd_addrs
->cluster_addr
[i
])
1452 n
->osd_addrs
->cluster_addr
[i
] = o
->osd_addrs
->cluster_addr
[i
];
// heartbeat (back) address, same treatment
1455 if ( n
->osd_addrs
->hb_back_addr
[i
] && o
->osd_addrs
->hb_back_addr
[i
] &&
1456 *n
->osd_addrs
->hb_back_addr
[i
] == *o
->osd_addrs
->hb_back_addr
[i
])
1457 n
->osd_addrs
->hb_back_addr
[i
] = o
->osd_addrs
->hb_back_addr
[i
];
// heartbeat (front) address, same treatment
1460 if ( n
->osd_addrs
->hb_front_addr
[i
] && o
->osd_addrs
->hb_front_addr
[i
] &&
1461 *n
->osd_addrs
->hb_front_addr
[i
] == *o
->osd_addrs
->hb_front_addr
[i
])
1462 n
->osd_addrs
->hb_front_addr
[i
] = o
->osd_addrs
->hb_front_addr
[i
];
1467 // zoinks, no differences at all!
1468 n
->osd_addrs
= o
->osd_addrs
;
1471 // does crush match?
// compared by encoding both crush maps with the same feature set and
// checking the resulting buffers for equal contents
1473 ::encode(*o
->crush
, oc
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1474 ::encode(*n
->crush
, nc
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1475 if (oc
.contents_equal(nc
)) {
1476 n
->crush
= o
->crush
;
1479 // does pg_temp match?
1480 if (*o
->pg_temp
== *n
->pg_temp
)
1481 n
->pg_temp
= o
->pg_temp
;
1483 // does primary_temp match?
// size is compared first, before the full content comparison
1484 if (o
->primary_temp
->size() == n
->primary_temp
->size()) {
1485 if (*o
->primary_temp
== *n
->primary_temp
)
1486 n
->primary_temp
= o
->primary_temp
;
// osd_uuid: same size-then-content comparison
1490 if (o
->osd_uuid
->size() == n
->osd_uuid
->size() &&
1491 *o
->osd_uuid
== *n
->osd_uuid
)
1492 n
->osd_uuid
= o
->osd_uuid
;
// Prune stale or redundant pg_temp / primary_temp entries.
//
// Works on tmpmap, which is a copy of osdmap with *pending_inc applied, and
// records every removal in *pending_inc: an empty vector in new_pg_temp or
// -1 in new_primary_temp (the values apply_incremental() treats as "erase").
//
// @param cct          context used for debug logging
// @param osdmap       the current (base) map
// @param pending_inc  incremental being built; modified in place
//
// NOTE(review): some lines are missing from this extract (declaration of
// tmpmap / raw_up / primary, loop "continue"s, the num_up bookkeeping and
// the "else" keywords between paired statements).
1495 void OSDMap::clean_temps(CephContext
*cct
,
1496 const OSDMap
& osdmap
, Incremental
*pending_inc
)
1498 ldout(cct
, 10) << __func__
<< dendl
;
1500 tmpmap
.deepish_copy_from(osdmap
);
1501 tmpmap
.apply_incremental(*pending_inc
);
1503 for (auto pg
: *tmpmap
.pg_temp
) {
1504 // if pool does not exist, remove any existing pg_temps associated with
1505 // it. we don't care about pg_temps on the pending_inc either; if there
1506 // are new_pg_temp entries on the pending, clear them out just as well.
1507 if (!osdmap
.have_pg_pool(pg
.first
.pool())) {
1508 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
1509 << " for nonexistent pool " << pg
.first
.pool() << dendl
;
1510 pending_inc
->new_pg_temp
[pg
.first
].clear();
// count members of this pg_temp that tmpmap does not report as down
// (the increment and the test on num_up are not visible in this extract)
1514 unsigned num_up
= 0;
1515 for (auto o
: pg
.second
) {
1516 if (!tmpmap
.is_down(o
)) {
1522 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
1523 << " with all down osds" << pg
.second
<< dendl
;
1524 pending_inc
->new_pg_temp
[pg
.first
].clear();
1527 // redundant pg_temp?
1530 tmpmap
.pg_to_raw_up(pg
.first
, &raw_up
, &primary
);
1531 if (vectors_equal(raw_up
, pg
.second
)) {
1532 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
<< " "
1533 << pg
.second
<< " that matches raw_up mapping" << dendl
;
// entry exists in the base map: queue an explicit removal; the following
// erase() drops the pending entry instead (the keyword joining the two
// statements is not visible -- presumably "else")
1534 if (osdmap
.pg_temp
->count(pg
.first
))
1535 pending_inc
->new_pg_temp
[pg
.first
].clear();
1537 pending_inc
->new_pg_temp
.erase(pg
.first
);
1541 for (auto &pg
: *tmpmap
.primary_temp
) {
// primary_temp pointing at a down osd
1543 if (tmpmap
.is_down(pg
.second
)) {
1544 ldout(cct
, 10) << __func__
<< " removing primary_temp " << pg
.first
1545 << " to down " << pg
.second
<< dendl
;
1546 pending_inc
->new_primary_temp
[pg
.first
] = -1;
1549 // redundant primary_temp?
1550 vector
<int> real_up
, templess_up
;
1551 int real_primary
, templess_primary
;
1552 pg_t pgid
= pg
.first
;
1553 tmpmap
.pg_to_acting_osds(pgid
, &real_up
, &real_primary
);
1554 tmpmap
.pg_to_raw_up(pgid
, &templess_up
, &templess_primary
);
1555 if (real_primary
== templess_primary
){
1556 ldout(cct
, 10) << __func__
<< " removing primary_temp "
1557 << pgid
<< " -> " << real_primary
1558 << " (unnecessary/redundant)" << dendl
;
// same pattern as for pg_temp above
1559 if (osdmap
.primary_temp
->count(pgid
))
1560 pending_inc
->new_primary_temp
[pgid
] = -1;
1562 pending_inc
->new_primary_temp
.erase(pgid
);
// Apply the changes carried by inc to this map.
//
// The visible steps, in order: fsid / epoch checks, optional full-map
// replacement, flags and legacy require_* handling, max_osd, pools and pool
// names, weights, primary affinity, erasure-code profiles, osd state bits,
// newly-up osds and their addresses, up_thru / last-clean / lost / xinfo /
// uuid updates, pg_temp, primary_temp, pg_upmap(+items), blacklist, cluster
// snapshot, fullness ratios, require_min_compat_client /
// require_osd_release, the crush map (last), and finally
// _calc_up_osd_features().
//
// @param inc  incremental to apply; asserted to carry epoch == epoch+1
// @return int; the return statements are not visible in this extract
//
// NOTE(review): many lines of this function are missing from this extract
// (returns, "else" keywords, closing braces, some statements).
1567 int OSDMap::apply_incremental(const Incremental
&inc
)
1569 new_blacklist_entries
= false;
1572 else if (inc
.fsid
!= fsid
)
1575 assert(inc
.epoch
== epoch
+1);
1578 modified
= inc
.modified
;
// a non-empty fullmap is copied into bl; what is done with bl is not
// visible in this extract
1581 if (inc
.fullmap
.length()) {
1582 bufferlist
bl(inc
.fullmap
);
1587 // nope, incremental.
1588 if (inc
.new_flags
>= 0) {
1589 flags
= inc
.new_flags
;
1590 // the below is just to cover a newly-upgraded luminous mon
1591 // cluster that has to set require_jewel_osds or
1592 // require_kraken_osds before the osds can be upgraded to
// raise (never lower) require_osd_release to match the legacy flag
1594 if (flags
& CEPH_OSDMAP_REQUIRE_KRAKEN
) {
1595 if (require_osd_release
< CEPH_RELEASE_KRAKEN
) {
1596 require_osd_release
= CEPH_RELEASE_KRAKEN
;
1598 } else if (flags
& CEPH_OSDMAP_REQUIRE_JEWEL
) {
1599 if (require_osd_release
< CEPH_RELEASE_JEWEL
) {
1600 require_osd_release
= CEPH_RELEASE_JEWEL
;
1605 if (inc
.new_max_osd
>= 0)
1606 set_max_osd(inc
.new_max_osd
);
1608 if (inc
.new_pool_max
!= -1)
1609 pool_max
= inc
.new_pool_max
;
// new or updated pools are stamped with the current epoch as last_change
1611 for (const auto &pool
: inc
.new_pools
) {
1612 pools
[pool
.first
] = pool
.second
;
1613 pools
[pool
.first
].last_change
= epoch
;
// keep pool_name (id -> name) and name_pool (name -> id) in step; when a
// pool id already has a name, the old reverse mapping is erased first
1616 for (const auto &pname
: inc
.new_pool_names
) {
1617 auto pool_name_entry
= pool_name
.find(pname
.first
);
1618 if (pool_name_entry
!= pool_name
.end()) {
1619 name_pool
.erase(pool_name_entry
->second
);
1620 pool_name_entry
->second
= pname
.second
;
1622 pool_name
[pname
.first
] = pname
.second
;
1624 name_pool
[pname
.second
] = pname
.first
;
// removed pools: drop both name mappings (other removals on the missing
// line 1628 are not visible here)
1627 for (const auto &pool
: inc
.old_pools
) {
1629 name_pool
.erase(pool_name
[pool
]);
1630 pool_name
.erase(pool
);
1633 for (const auto &weight
: inc
.new_weight
) {
1634 set_weight(weight
.first
, weight
.second
);
1636 // if we are marking in, clear the AUTOOUT and NEW bits, and clear
1637 // xinfo old_weight.
1638 if (weight
.second
) {
1639 osd_state
[weight
.first
] &= ~(CEPH_OSD_AUTOOUT
| CEPH_OSD_NEW
);
1640 osd_xinfo
[weight
.first
].old_weight
= 0;
1644 for (const auto &primary_affinity
: inc
.new_primary_affinity
) {
1645 set_primary_affinity(primary_affinity
.first
, primary_affinity
.second
);
1648 // erasure_code_profiles
1649 for (const auto &profile
: inc
.old_erasure_code_profiles
)
1650 erasure_code_profiles
.erase(profile
);
1652 for (const auto &profile
: inc
.new_erasure_code_profiles
) {
1653 set_erasure_code_profile(profile
.first
, profile
.second
);
// new_state holds bits to toggle (applied with ^= below); a value of 0 is
// treated as CEPH_OSD_UP
1657 for (const auto &state
: inc
.new_state
) {
1658 const auto osd
= state
.first
;
1659 int s
= state
.second
? state
.second
: CEPH_OSD_UP
;
// UP is currently set and is being toggled: record when the osd went down
1660 if ((osd_state
[osd
] & CEPH_OSD_UP
) &&
1661 (s
& CEPH_OSD_UP
)) {
1662 osd_info
[osd
].down_at
= epoch
;
1663 osd_xinfo
[osd
].down_stamp
= modified
;
// EXISTS is currently set and is being toggled
1665 if ((osd_state
[osd
] & CEPH_OSD_EXISTS
) &&
1666 (s
& CEPH_OSD_EXISTS
)) {
1667 // osd is destroyed; clear out anything interesting.
1668 (*osd_uuid
)[osd
] = uuid_d();
1669 osd_info
[osd
] = osd_info_t();
1670 osd_xinfo
[osd
] = osd_xinfo_t();
1671 set_primary_affinity(osd
, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
);
1672 osd_addrs
->client_addr
[osd
].reset(new entity_addr_t());
1673 osd_addrs
->cluster_addr
[osd
].reset(new entity_addr_t());
1674 osd_addrs
->hb_front_addr
[osd
].reset(new entity_addr_t());
1675 osd_addrs
->hb_back_addr
[osd
].reset(new entity_addr_t());
1678 osd_state
[osd
] ^= s
;
// osds coming up: mark EXISTS|UP, install their addresses, set up_from
1682 for (const auto &client
: inc
.new_up_client
) {
1683 osd_state
[client
.first
] |= CEPH_OSD_EXISTS
| CEPH_OSD_UP
;
1684 osd_addrs
->client_addr
[client
.first
].reset(new entity_addr_t(client
.second
));
1685 if (inc
.new_hb_back_up
.empty())
1686 osd_addrs
->hb_back_addr
[client
.first
].reset(new entity_addr_t(client
.second
)); //this is a backward-compatibility hack
// NOTE(review): find() result is dereferenced without an end() check in the
// visible code -- presumably every new_up_client id is also present in
// new_hb_back_up when that map is non-empty; confirm.
1688 osd_addrs
->hb_back_addr
[client
.first
].reset(
1689 new entity_addr_t(inc
.new_hb_back_up
.find(client
.first
)->second
));
1690 const auto j
= inc
.new_hb_front_up
.find(client
.first
);
1691 if (j
!= inc
.new_hb_front_up
.end())
1692 osd_addrs
->hb_front_addr
[client
.first
].reset(new entity_addr_t(j
->second
));
1694 osd_addrs
->hb_front_addr
[client
.first
].reset();
1696 osd_info
[client
.first
].up_from
= epoch
;
1699 for (const auto &cluster
: inc
.new_up_cluster
)
1700 osd_addrs
->cluster_addr
[cluster
.first
].reset(new entity_addr_t(cluster
.second
));
1703 for (const auto &thru
: inc
.new_up_thru
)
1704 osd_info
[thru
.first
].up_thru
= thru
.second
;
1706 for (const auto &interval
: inc
.new_last_clean_interval
) {
1707 osd_info
[interval
.first
].last_clean_begin
= interval
.second
.first
;
1708 osd_info
[interval
.first
].last_clean_end
= interval
.second
.second
;
1711 for (const auto &lost
: inc
.new_lost
)
1712 osd_info
[lost
.first
].lost_at
= lost
.second
;
1715 for (const auto &xinfo
: inc
.new_xinfo
)
1716 osd_xinfo
[xinfo
.first
] = xinfo
.second
;
1719 for (const auto &uuid
: inc
.new_uuid
)
1720 (*osd_uuid
)[uuid
.first
] = uuid
.second
;
// pg_temp: an empty vector means "erase", anything else replaces the entry
1723 for (const auto &pg
: inc
.new_pg_temp
) {
1724 if (pg
.second
.empty())
1725 pg_temp
->erase(pg
.first
);
1727 pg_temp
->set(pg
.first
, pg
.second
);
1729 if (!inc
.new_pg_temp
.empty()) {
1730 // make sure pg_temp is efficiently stored
// primary_temp: -1 means "erase", anything else replaces the entry
1734 for (const auto &pg
: inc
.new_primary_temp
) {
1735 if (pg
.second
== -1)
1736 primary_temp
->erase(pg
.first
);
1738 (*primary_temp
)[pg
.first
] = pg
.second
;
1741 for (auto& p
: inc
.new_pg_upmap
) {
1742 pg_upmap
[p
.first
] = p
.second
;
// body of this loop is not visible in this extract (presumably erases pg
// from pg_upmap, mirroring old_pg_upmap_items below)
1744 for (auto& pg
: inc
.old_pg_upmap
) {
1747 for (auto& p
: inc
.new_pg_upmap_items
) {
1748 pg_upmap_items
[p
.first
] = p
.second
;
1750 for (auto& pg
: inc
.old_pg_upmap_items
) {
1751 pg_upmap_items
.erase(pg
);
1755 if (!inc
.new_blacklist
.empty()) {
1756 blacklist
.insert(inc
.new_blacklist
.begin(),inc
.new_blacklist
.end());
1757 new_blacklist_entries
= true;
1759 for (const auto &addr
: inc
.old_blacklist
)
1760 blacklist
.erase(addr
);
1762 // cluster snapshot?
1763 if (inc
.cluster_snapshot
.length()) {
1764 cluster_snapshot
= inc
.cluster_snapshot
;
1765 cluster_snapshot_epoch
= inc
.epoch
;
1767 cluster_snapshot
.clear();
1768 cluster_snapshot_epoch
= 0;
// ratios are only taken when the incremental carries a non-negative value
1771 if (inc
.new_nearfull_ratio
>= 0) {
1772 nearfull_ratio
= inc
.new_nearfull_ratio
;
1774 if (inc
.new_backfillfull_ratio
>= 0) {
1775 backfillfull_ratio
= inc
.new_backfillfull_ratio
;
1777 if (inc
.new_full_ratio
>= 0) {
1778 full_ratio
= inc
.new_full_ratio
;
1780 if (inc
.new_require_min_compat_client
> 0) {
1781 require_min_compat_client
= inc
.new_require_min_compat_client
;
1783 if (inc
.new_require_osd_release
>= 0) {
1784 require_osd_release
= inc
.new_require_osd_release
;
// from luminous on, the legacy require_* flag bits are cleared
1785 if (require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
1786 flags
&= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS
);
1790 // do new crush map last (after up/down stuff)
1791 if (inc
.crush
.length()) {
1792 bufferlist
bl(inc
.crush
);
1793 auto blp
= bl
.begin();
1794 crush
.reset(new CrushWrapper
);
1796 if (require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
1797 // only increment if this is a luminous-encoded osdmap, lest
1798 // the mon's crush_version diverge from what the osds or others
1799 // are decoding and applying on their end. if we won't encode
1800 // it in the canonical version, don't change it.
1806 _calc_up_osd_features();
// Map an object (by name, or by locator key) in pool poolid to a pg_t.
//
// The pool is looked up with get_pg_pool(poolid); the placement seed is
// pool->hash_key() of either key or name together with nspace, and the
// result is stored as *pg = pg_t(ps, poolid).
//
// NOTE(review): most of the parameter list, the missing-pool handling, the
// condition choosing key vs. name and the return statements are not visible
// in this extract (presumably key is used when non-empty; confirm).
1811 int OSDMap::map_to_pg(
1815 const string
& nspace
,
1818 // calculate ps (placement seed)
1819 const pg_pool_t
*pool
= get_pg_pool(poolid
);
1824 ps
= pool
->hash_key(key
, nspace
);
1826 ps
= pool
->hash_key(name
, nspace
);
1827 *pg
= pg_t(ps
, poolid
);
// Resolve (oid, loc) to a placement group.
//
// When loc.hash is non-negative it is used directly as the placement seed:
// pg = pg_t(loc.hash, loc.get_pool()), after checking that the pool exists.
// Otherwise the work is delegated to map_to_pg() with the object name, the
// locator key and the namespace.
//
// @param oid  object id; only oid.name is read here
// @param loc  object locator (pool, key, nspace, hash)
// @param pg   output placement group
// @return int; the values returned on the explicit-hash path and for a
//         missing pool are not visible in this extract
1831 int OSDMap::object_locator_to_pg(
1832 const object_t
& oid
, const object_locator_t
& loc
, pg_t
&pg
) const
1834 if (loc
.hash
>= 0) {
1835 if (!get_pg_pool(loc
.get_pool())) {
1838 pg
= pg_t(loc
.hash
, loc
.get_pool());
1841 return map_to_pg(loc
.get_pool(), oid
.name
, loc
.key
, loc
.nspace
, &pg
);
// Build a ceph_object_layout for oid in pool pg_pool / namespace nspace.
//
// A locator is constructed from (pg_pool, nspace), resolved to a pg via
// object_locator_to_pg(oid, loc), and the pg's old-style value
// (get_old_pg().v) is stored in ol_pgid; ol_stripe_unit is set to 0.
//
// NOTE(review): the return statement and any other field assignments are
// not visible in this extract.
1844 ceph_object_layout
OSDMap::make_object_layout(
1845 object_t oid
, int pg_pool
, string nspace
) const
1847 object_locator_t
loc(pg_pool
, nspace
);
1849 ceph_object_layout ol
;
1850 pg_t pgid
= object_locator_to_pg(oid
, loc
);
1851 ol
.ol_pgid
= pgid
.get_old_pg().v
;
1852 ol
.ol_stripe_unit
= 0;
// Filter osds in place according to the pool's shifting policy.
//
// If pool.can_shift_osds(): entries are compacted towards the front
// (osds[i - removed] = osds[i]) and the vector is shrunk by `removed`.
// Otherwise: entries are overwritten with CRUSH_ITEM_NONE so positions are
// preserved.
//
// NOTE(review): several lines are missing from this extract: the increment
// of `removed`, the "else" between the two strategies, and the condition
// guarding the CRUSH_ITEM_NONE assignment (presumably !exists(osd)).
1856 void OSDMap::_remove_nonexistent_osds(const pg_pool_t
& pool
,
1857 vector
<int>& osds
) const
1859 if (pool
.can_shift_osds()) {
1860 unsigned removed
= 0;
1861 for (unsigned i
= 0; i
< osds
.size(); i
++) {
1862 if (!exists(osds
[i
])) {
// slide the current entry left over the slots freed so far
1867 osds
[i
- removed
] = osds
[i
];
1871 osds
.resize(osds
.size() - removed
);
1873 for (auto& osd
: osds
) {
1875 osd
= CRUSH_ITEM_NONE
;
// Compute the raw CRUSH mapping for pg in pool.
//
// Steps visible here: derive the placement seed pps with
// pool.raw_pg_to_pps(pg), look up the rule with crush->find_rule(crush rule,
// pool type, size), run crush->do_rule(...) into *osds using osd_weight, then
// pass the result through _remove_nonexistent_osds().
//
// NOTE(review): part of the parameter list (the osds output and, judging by
// the callers, a pps output pointer) and the handling of a failed rule
// lookup are not visible in this extract.
1880 void OSDMap::_pg_to_raw_osds(
1881 const pg_pool_t
& pool
, pg_t pg
,
1886 ps_t pps
= pool
.raw_pg_to_pps(pg
); // placement ps
1887 unsigned size
= pool
.get_size();
1890 int ruleno
= crush
->find_rule(pool
.get_crush_rule(), pool
.get_type(), size
);
1892 crush
->do_rule(ruleno
, pps
, *osds
, size
, osd_weight
, pg
.pool());
1894 _remove_nonexistent_osds(pool
, *osds
);
// Scan osds in order looking for an entry that is not CRUSH_ITEM_NONE.
// NOTE(review): the return statements are not visible in this extract;
// presumably the first such osd is returned and -1 when there is none
// (callers compare the primary against -1) -- confirm.
1900 int OSDMap::_pick_primary(const vector
<int>& osds
) const
1902 for (auto osd
: osds
) {
1903 if (osd
!= CRUSH_ITEM_NONE
) {
// Apply explicit upmap overrides to a raw mapping.
//
// raw_pg is first normalised with pi.raw_pg_to_pg().  Two tables are
// consulted:
//  - pg_upmap: a full replacement vector for the pg.  Its targets are
//    checked against osd_weight (weight 0 == marked out) before *raw is
//    replaced.
//  - pg_upmap_items: (from, to) pairs; an occurrence of r.first in *raw is
//    replaced by r.second, unless r.second already appears in *raw or is a
//    valid osd id whose weight is 0.
//
// NOTE(review): the statements executed when a check fires (returns/breaks,
// assignments to `exists` and `pos`, the declaration of `pos`) are not
// visible in this extract.
1910 void OSDMap::_apply_remap(const pg_pool_t
& pi
, pg_t raw_pg
, vector
<int> *raw
) const
1912 pg_t pg
= pi
.raw_pg_to_pg(raw_pg
);
1913 auto p
= pg_upmap
.find(pg
);
1914 if (p
!= pg_upmap
.end()) {
1915 // make sure targets aren't marked out
1916 for (auto osd
: p
->second
) {
1917 if (osd
!= CRUSH_ITEM_NONE
&& osd
< max_osd
&& osd_weight
[osd
] == 0) {
1918 // reject/ignore the explicit mapping
1922 *raw
= vector
<int>(p
->second
.begin(), p
->second
.end());
1926 auto q
= pg_upmap_items
.find(pg
);
1927 if (q
!= pg_upmap_items
.end()) {
1928 // NOTE: this approach does not allow a bidirectional swap,
1929 // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
1930 for (auto& r
: q
->second
) {
1931 // make sure the replacement value doesn't already appear
1932 bool exists
= false;
1934 for (unsigned i
= 0; i
< raw
->size(); ++i
) {
1935 int osd
= (*raw
)[i
];
1936 if (osd
== r
.second
) {
1940 // ignore mapping if target is marked out (or invalid osd id)
1941 if (osd
== r
.first
&&
1943 !(r
.second
!= CRUSH_ITEM_NONE
&& r
.second
< max_osd
&&
1944 osd_weight
[r
.second
] == 0)) {
// substitute only when the target is not already present and a source
// position was found
1948 if (!exists
&& pos
>= 0) {
1949 (*raw
)[pos
] = r
.second
;
1956 // pg -> (up osd list)
// Derive the "up" set from a raw mapping.
//
// For pools where can_shift_osds() is true, *up receives the entries of raw
// in order, with a test on !exists() || is_down() applied to each (the
// statement taken when the test fires is not visible -- presumably the
// entry is skipped).  Otherwise *up is resized to raw.size() and entries
// that do not exist or are down are set to CRUSH_ITEM_NONE, keeping
// positions stable.
//
// NOTE(review): several lines (clearing of *up, "continue", the "else"
// branches that copy surviving entries) are not visible in this extract.
1957 void OSDMap::_raw_to_up_osds(const pg_pool_t
& pool
, const vector
<int>& raw
,
1958 vector
<int> *up
) const
1960 if (pool
.can_shift_osds()) {
1963 up
->reserve(raw
.size());
1964 for (unsigned i
=0; i
<raw
.size(); i
++) {
1965 if (!exists(raw
[i
]) || is_down(raw
[i
]))
1967 up
->push_back(raw
[i
]);
1970 // set down/dne devices to NONE
1971 up
->resize(raw
.size());
1972 for (int i
= raw
.size() - 1; i
>= 0; --i
) {
1973 if (!exists(raw
[i
]) || is_down(raw
[i
])) {
1974 (*up
)[i
] = CRUSH_ITEM_NONE
;
// Re-select the primary among *osds taking osd_primary_affinity into
// account.
//
// A first pass looks for any osd in *osds whose affinity differs from
// CEPH_OSD_DEFAULT_PRIMARY_AFFINITY.  The selection pass then walks *osds:
// for candidate o with affinity a, if a < CEPH_OSD_MAX_PRIMARY_AFFINITY and
// (crush_hash32_2(CRUSH_HASH_RJENKINS1, seed, o) >> 16) >= a the candidate
// is passed over (remembered only as a fallback, per the comment below).
// Finally *primary = (*osds)[pos]; for pools where can_shift_osds() and
// pos > 0 the chosen osd is moved to index 0 and the entries before it are
// shifted right by one.
//
// NOTE(review): part of the parameter list, the early returns, the
// declaration/updates of `pos` and `o`, and the loop exits are not visible
// in this extract.
1982 void OSDMap::_apply_primary_affinity(ps_t seed
,
1983 const pg_pool_t
& pool
,
1987 // do we have any non-default primary_affinity values for these osds?
1988 if (!osd_primary_affinity
)
1992 for (const auto osd
: *osds
) {
1993 if (osd
!= CRUSH_ITEM_NONE
&&
1994 (*osd_primary_affinity
)[osd
] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
) {
2002 // pick the primary. feed both the seed (for the pg) and the osd
2003 // into the hash/rng so that a proportional fraction of an osd's pgs
2004 // get rejected as primary.
2006 for (unsigned i
= 0; i
< osds
->size(); ++i
) {
2008 if (o
== CRUSH_ITEM_NONE
)
2010 unsigned a
= (*osd_primary_affinity
)[o
];
2011 if (a
< CEPH_OSD_MAX_PRIMARY_AFFINITY
&&
2012 (crush_hash32_2(CRUSH_HASH_RJENKINS1
,
2013 seed
, o
) >> 16) >= a
) {
2014 // we chose not to use this primary. note it anyway as a
2015 // fallback in case we don't pick anyone else, but keep looking.
2026 *primary
= (*osds
)[pos
];
2028 if (pool
.can_shift_osds() && pos
> 0) {
2029 // move the new primary to the front.
2030 for (int i
= pos
; i
> 0; --i
) {
2031 (*osds
)[i
] = (*osds
)[i
-1];
2033 (*osds
)[0] = *primary
;
// Fetch the pg_temp / primary_temp overrides for pg.
//
// pg is normalised with pool.raw_pg_to_pg().  If pg_temp has an entry, its
// osds are appended to *temp_pg; an osd that does not exist or is down is
// handled per pool type: for can_shift_osds() pools the statement is not
// visible here (presumably the osd is skipped), otherwise CRUSH_ITEM_NONE
// is pushed in its place.  *temp_primary comes from primary_temp when an
// entry exists, else from the first element of *temp_pg that is not
// CRUSH_ITEM_NONE.
//
// NOTE(review): initialisation of the outputs, "continue"/"break" and
// "else" lines are not visible in this extract.
2037 void OSDMap::_get_temp_osds(const pg_pool_t
& pool
, pg_t pg
,
2038 vector
<int> *temp_pg
, int *temp_primary
) const
2040 pg
= pool
.raw_pg_to_pg(pg
);
2041 const auto p
= pg_temp
->find(pg
);
2043 if (p
!= pg_temp
->end()) {
2044 for (unsigned i
=0; i
<p
->second
.size(); i
++) {
2045 if (!exists(p
->second
[i
]) || is_down(p
->second
[i
])) {
2046 if (pool
.can_shift_osds()) {
2049 temp_pg
->push_back(CRUSH_ITEM_NONE
);
2052 temp_pg
->push_back(p
->second
[i
]);
2056 const auto &pp
= primary_temp
->find(pg
);
2058 if (pp
!= primary_temp
->end()) {
2059 *temp_primary
= pp
->second
;
2060 } else if (!temp_pg
->empty()) { // apply pg_temp's primary
2061 for (unsigned i
= 0; i
< temp_pg
->size(); ++i
) {
2062 if ((*temp_pg
)[i
] != CRUSH_ITEM_NONE
) {
2063 *temp_primary
= (*temp_pg
)[i
];
// Compute the raw CRUSH mapping of pg and a primary picked from it.
//
// @param pg       placement group; its pool is looked up via get_pg_pool()
// @param raw      output: result of _pg_to_raw_osds() (no pps requested)
// @param primary  output: _pick_primary(*raw)
//
// NOTE(review): the handling of a missing pool and any null checks on the
// output pointers are not visible in this extract.
2070 void OSDMap::pg_to_raw_osds(pg_t pg
, vector
<int> *raw
, int *primary
) const
2074 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
2077 _pg_to_raw_osds(*pool
, pg
, raw
, NULL
);
2079 *primary
= _pick_primary(*raw
);
// Compute the up set for pg without applying pg_temp / primary_temp.
//
// Pipeline: _pg_to_raw_osds() -> _apply_remap() -> _raw_to_up_osds() into
// *up, then *primary = _pick_primary(raw) and finally
// _apply_primary_affinity(pps, ...) on *up / *primary.
//
// NOTE(review): the primary is picked from `raw`, not from *up, before
// affinity is applied; _pg_to_up_acting_osds() picks from the up vector.
// The missing-pool handling and the declarations of raw / pps are not
// visible in this extract.
2082 void OSDMap::pg_to_raw_up(pg_t pg
, vector
<int> *up
, int *primary
) const
2084 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
2094 _pg_to_raw_osds(*pool
, pg
, &raw
, &pps
);
2095 _apply_remap(*pool
, pg
, &raw
);
2096 _raw_to_up_osds(*pool
, raw
, up
);
2097 *primary
= _pick_primary(raw
);
2098 _apply_primary_affinity(pps
, *pool
, up
, primary
);
// Compute the up and acting sets (and their primaries) for pg.
//
// The temp overrides are fetched first with _get_temp_osds().  The CRUSH
// pipeline (_pg_to_raw_osds -> _apply_remap -> _raw_to_up_osds ->
// _pick_primary -> _apply_primary_affinity) is run only when there is no
// pg_temp override or when the caller asked for up / up_primary.  When the
// acting set is empty, an acting primary of -1 is replaced by the up
// primary.  Results are handed back through the output pointers.
//
// @param raw_pg_to_pg  when false, a pg whose ps() is >= the pool's pg_num
//                      takes the early-out path together with the pool
//                      check (the first half of that condition and the
//                      body are only partly visible here)
//
// NOTE(review): many lines are missing from this extract (null checks on
// the output pointers, declarations of _up / _up_primary / raw / pps,
// returns, and the statement that fills _acting from _up).
2101 void OSDMap::_pg_to_up_acting_osds(
2102 const pg_t
& pg
, vector
<int> *up
, int *up_primary
,
2103 vector
<int> *acting
, int *acting_primary
,
2104 bool raw_pg_to_pg
) const
2106 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
2108 (!raw_pg_to_pg
&& pg
.ps() >= pool
->get_pg_num())) {
2116 *acting_primary
= -1;
2121 vector
<int> _acting
;
2123 int _acting_primary
;
2125 _get_temp_osds(*pool
, pg
, &_acting
, &_acting_primary
);
2126 if (_acting
.empty() || up
|| up_primary
) {
2127 _pg_to_raw_osds(*pool
, pg
, &raw
, &pps
);
2128 _apply_remap(*pool
, pg
, &raw
);
2129 _raw_to_up_osds(*pool
, raw
, &_up
);
2130 _up_primary
= _pick_primary(_up
);
2131 _apply_primary_affinity(pps
, *pool
, &_up
, &_up_primary
);
2132 if (_acting
.empty()) {
2134 if (_acting_primary
== -1) {
2135 _acting_primary
= _up_primary
;
2142 *up_primary
= _up_primary
;
2146 acting
->swap(_acting
);
2148 *acting_primary
= _acting_primary
;
// Search the first nrep entries of acting for osd.
//
// NOTE(review): the guard in front of "nrep = acting.size()" and both
// return statements are not visible in this extract; presumably nrep == 0
// means "use the whole vector", the matching index is returned, and -1 is
// returned when osd is not found -- confirm.  Also note that acting[i] is
// indexed up to nrep with no visible bound check against acting.size().
2151 int OSDMap::calc_pg_rank(int osd
, const vector
<int>& acting
, int nrep
)
2154 nrep
= acting
.size();
2155 for (int i
=0; i
<nrep
; i
++)
2156 if (acting
[i
] == osd
)
2161 int OSDMap::calc_pg_role(int osd
, const vector
<int>& acting
, int nrep
)
2163 return calc_pg_rank(osd
, acting
, nrep
);
// Decide whether a PG's primary changed between two acting sets.
//
// Visible rules, in order:
//  - both acting sets empty          -> false
//  - exactly one acting set empty    -> true
//  - oldprimary != newprimary        -> true
//  - rank of the primary within its acting set differs -> (result line not
//    visible in this extract; presumably true)
//  - otherwise                       -> false
//
// NOTE(review): the oldprimary / newprimary parameter declarations are on
// lines missing from this extract.
2166 bool OSDMap::primary_changed(
2168 const vector
<int> &oldacting
,
2170 const vector
<int> &newacting
)
2172 if (oldacting
.empty() && newacting
.empty())
2173 return false; // both still empty
2174 if (oldacting
.empty() ^ newacting
.empty())
2175 return true; // was empty, now not, or vice versa
2176 if (oldprimary
!= newprimary
)
2177 return true; // primary changed
2178 if (calc_pg_rank(oldprimary
, oldacting
) !=
2179 calc_pg_rank(newprimary
, newacting
))
2181 return false; // same primary (tho replicas may have changed)
2185 // serialize, unserialize
// Encode the client-visible part of the map in the oldest format (called
// from encode_classic() when the peer lacks CEPH_FEATURE_PGID64).
//
// Containers keyed by newer types are written element by element (see the
// "for ::encode(...)" comments) and pg ids are converted with get_old_pg().
// Pools, client addresses and the crush map are encoded with a feature
// mask of 0.
//
// NOTE(review): several lines are missing from this extract (version
// header, the per-element key encodes, count encodes, the crush buffer
// declaration and its append to bl).
2186 void OSDMap::encode_client_old(bufferlist
& bl
) const
2193 ::encode(epoch
, bl
);
2194 ::encode(created
, bl
);
2195 ::encode(modified
, bl
);
2197 // for ::encode(pools, bl);
2198 __u32 n
= pools
.size();
2201 for (const auto &pool
: pools
) {
2204 ::encode(pool
.second
, bl
, 0);
2206 // for ::encode(pool_name, bl);
2207 n
= pool_name
.size();
2209 for (const auto &pname
: pool_name
) {
2212 ::encode(pname
.second
, bl
);
2214 // for ::encode(pool_max, bl);
2218 ::encode(flags
, bl
);
2220 ::encode(max_osd
, bl
);
// osd_state is written as one uint8_t per osd
2222 uint32_t n
= osd_state
.size();
2224 for (auto s
: osd_state
) {
2225 ::encode((uint8_t)s
, bl
);
2228 ::encode(osd_weight
, bl
);
2229 ::encode(osd_addrs
->client_addr
, bl
, 0);
2231 // for ::encode(pg_temp, bl);
2232 n
= pg_temp
->size();
2234 for (const auto pg
: *pg_temp
) {
2235 old_pg_t opg
= pg
.first
.get_old_pg();
2237 ::encode(pg
.second
, bl
);
2242 crush
->encode(cbl
, 0 /* legacy (no) features */);
// Encode the map in the pre-OSDMAP_ENC ("classic") format.
//
// Peers without CEPH_FEATURE_PGID64 are handed off to encode_client_old().
// Otherwise the client-visible fields are written first, followed by the
// extended fields (heartbeat/cluster addresses, osd_info, blacklist,
// cluster snapshot, uuids, xinfo).  The crush map is encoded with a
// feature mask of 0.
//
// @param features  peer feature bits; forwarded to the feature-dependent
//                  encoders (pools, addresses, blacklist)
//
// NOTE(review): version headers, the return after encode_client_old(), the
// osd_state count encode and the crush buffer handling are on lines missing
// from this extract.
2246 void OSDMap::encode_classic(bufferlist
& bl
, uint64_t features
) const
2248 if ((features
& CEPH_FEATURE_PGID64
) == 0) {
2249 encode_client_old(bl
);
2258 ::encode(epoch
, bl
);
2259 ::encode(created
, bl
);
2260 ::encode(modified
, bl
);
2262 ::encode(pools
, bl
, features
);
2263 ::encode(pool_name
, bl
);
2264 ::encode(pool_max
, bl
);
2266 ::encode(flags
, bl
);
2268 ::encode(max_osd
, bl
);
// osd_state is written as one uint8_t per osd
2270 uint32_t n
= osd_state
.size();
2272 for (auto s
: osd_state
) {
2273 ::encode((uint8_t)s
, bl
);
2276 ::encode(osd_weight
, bl
);
2277 ::encode(osd_addrs
->client_addr
, bl
, features
);
2279 ::encode(*pg_temp
, bl
);
2283 crush
->encode(cbl
, 0 /* legacy (no) features */);
// extended fields
2289 ::encode(osd_addrs
->hb_back_addr
, bl
, features
);
2290 ::encode(osd_info
, bl
);
2291 ::encode(blacklist
, bl
, features
);
2292 ::encode(osd_addrs
->cluster_addr
, bl
, features
);
2293 ::encode(cluster_snapshot_epoch
, bl
);
2294 ::encode(cluster_snapshot
, bl
);
2295 ::encode(*osd_uuid
, bl
);
2296 ::encode(osd_xinfo
, bl
);
2297 ::encode(osd_addrs
->hb_front_addr
, bl
, features
);
// Encode the map in the current (wrapped, checksummed) format.
//
// Peers without CEPH_FEATURE_OSDMAP_ENC get encode_classic().  Otherwise
// the caller must pass CEPH_FEATURE_RESERVED in features (asserted, then
// masked off).  The output is a wrapper section containing a client-usable
// section, an osd-only section and a 32-bit crc.  The crc is first written
// as 0, then overwritten with crc32c(-1) over the bytes from start_offset
// up to the crc field, continued over any bytes after tail_offset.
//
// @param bl        buffer appended to
// @param features  peer feature bits; select sub-struct versions and are
//                  forwarded to feature-dependent encoders
//
// NOTE(review): many lines are missing from this extract (the return after
// encode_classic(), version selection for `v` and `target_v`, the "else"
// keywords between alternative encodings, capture of crc_it, declaration
// of front / tail / crc_le).
2300 void OSDMap::encode(bufferlist
& bl
, uint64_t features
) const
2302 if ((features
& CEPH_FEATURE_OSDMAP_ENC
) == 0) {
2303 encode_classic(bl
, features
);
2307 // only a select set of callers should *ever* be encoding new
2308 // OSDMaps. others should be passing around the canonical encoded
2309 // buffers from on high. select out those callers by passing in an
2310 // "impossible" feature bit.
2311 assert(features
& CEPH_FEATURE_RESERVED
);
2312 features
&= ~CEPH_FEATURE_RESERVED
;
// remember where this encoding starts; the crc covers bytes from here
2314 size_t start_offset
= bl
.length();
2316 buffer::list::iterator crc_it
;
2318 // meta-encoding: how we include client-used and osd-specific data
2319 ENCODE_START(8, 7, bl
);
2323 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
2326 ENCODE_START(v
, 1, bl
); // client-usable data
2329 ::encode(epoch
, bl
);
2330 ::encode(created
, bl
);
2331 ::encode(modified
, bl
);
2333 ::encode(pools
, bl
, features
);
2334 ::encode(pool_name
, bl
);
2335 ::encode(pool_max
, bl
);
// f is a copy of flags with the legacy require_* bit matching
// require_osd_release added
// NOTE(review): the visible encode below writes `flags`, not `f`; the line
// that consumes f is not visible in this extract -- confirm which one is
// encoded on which path.
2338 decltype(flags
) f
= flags
;
2339 if (require_osd_release
>= CEPH_RELEASE_LUMINOUS
)
2340 f
|= CEPH_OSDMAP_REQUIRE_LUMINOUS
;
2341 else if (require_osd_release
== CEPH_RELEASE_KRAKEN
)
2342 f
|= CEPH_OSDMAP_REQUIRE_KRAKEN
;
2343 else if (require_osd_release
== CEPH_RELEASE_JEWEL
)
2344 f
|= CEPH_OSDMAP_REQUIRE_JEWEL
;
2347 ::encode(flags
, bl
);
2350 ::encode(max_osd
, bl
);
// osd_state: encoded whole, or as one uint8_t per osd; the condition
// selecting between the two forms is not visible in this extract
2352 ::encode(osd_state
, bl
);
2354 uint32_t n
= osd_state
.size();
2356 for (auto s
: osd_state
) {
2357 ::encode((uint8_t)s
, bl
);
2360 ::encode(osd_weight
, bl
);
2361 ::encode(osd_addrs
->client_addr
, bl
, features
);
2363 ::encode(*pg_temp
, bl
);
2364 ::encode(*primary_temp
, bl
);
2365 if (osd_primary_affinity
) {
2366 ::encode(*osd_primary_affinity
, bl
);
2374 crush
->encode(cbl
, features
);
2376 ::encode(erasure_code_profiles
, bl
);
// upmap tables are either encoded or asserted empty; the condition is not
// visible in this extract
2379 ::encode(pg_upmap
, bl
);
2380 ::encode(pg_upmap_items
, bl
);
2382 assert(pg_upmap
.empty());
2383 assert(pg_upmap_items
.empty());
2386 ::encode(crush_version
, bl
);
2388 ENCODE_FINISH(bl
); // client-usable data
2392 uint8_t target_v
= 5;
2393 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
2396 ENCODE_START(target_v
, 1, bl
); // extended, osd-only data
2397 ::encode(osd_addrs
->hb_back_addr
, bl
, features
);
2398 ::encode(osd_info
, bl
);
2400 // put this in a sorted, ordered map<> so that we encode in a
2401 // deterministic order.
2402 map
<entity_addr_t
,utime_t
> blacklist_map
;
2403 for (const auto &addr
: blacklist
)
2404 blacklist_map
.insert(make_pair(addr
.first
, addr
.second
));
2405 ::encode(blacklist_map
, bl
, features
);
2407 ::encode(osd_addrs
->cluster_addr
, bl
, features
);
2408 ::encode(cluster_snapshot_epoch
, bl
);
2409 ::encode(cluster_snapshot
, bl
);
2410 ::encode(*osd_uuid
, bl
);
2411 ::encode(osd_xinfo
, bl
);
2412 ::encode(osd_addrs
->hb_front_addr
, bl
, features
);
2413 if (target_v
>= 2) {
2414 ::encode(nearfull_ratio
, bl
);
2415 ::encode(full_ratio
, bl
);
2416 ::encode(backfillfull_ratio
, bl
);
2418 // 4 was string-based new_require_min_compat_client
2419 if (target_v
>= 5) {
2420 ::encode(require_min_compat_client
, bl
);
2421 ::encode(require_osd_release
, bl
);
2423 ENCODE_FINISH(bl
); // osd-only data
2426 ::encode((uint32_t)0, bl
); // dummy crc
2429 tail_offset
= bl
.length();
2431 ENCODE_FINISH(bl
); // meta-encoding wrapper
// checksum everything before the crc field, then anything after it
2435 front
.substr_of(bl
, start_offset
, crc_it
.get_off() - start_offset
);
2436 crc
= front
.crc32c(-1);
2437 if (tail_offset
< bl
.length()) {
2439 tail
.substr_of(bl
, tail_offset
, bl
.length() - tail_offset
);
2440 crc
= tail
.crc32c(crc
);
// overwrite the 4-byte placeholder with the computed crc
2444 crc_it
.copy_in(4, (char*)&crc_le
);
// Convenience overload taking a whole bufferlist: obtains an iterator at
// bl.begin().
// NOTE(review): the statement that uses `p` is not visible in this extract
// (presumably it forwards to the iterator-based decode()).
2448 void OSDMap::decode(bufferlist
& bl
)
2450 auto p
= bl
.begin();
// Decode a map written in one of the pre-OSDMAP_ENC formats.
//
// Field layout depends on the leading version, so several fields have
// alternative decode paths (element-by-element for old encodings, whole
// container for newer ones).  Containers that older encodings do not carry
// are resized to match a related container (cluster_addr to client_addr,
// osd_uuid / osd_xinfo to max_osd, hb_front_addr to hb_back_addr);
// osd_primary_affinity is reset.
//
// NOTE(review): a large part of this function is missing from this extract
// (version decode, most version checks, loop headers, declarations of
// t / os / opg / cbl, and whatever follows the final reset).
2454 void OSDMap::decode_classic(bufferlist::iterator
& p
)
2463 ::decode(created
, p
);
2464 ::decode(modified
, p
);
// older encodings stored pool_max as a 32-bit value
2468 int32_t max_pools
= 0;
2469 ::decode(max_pools
, p
);
2470 pool_max
= max_pools
;
2476 ::decode(pools
[t
], p
);
2481 } else if (v
== 5) {
2486 ::decode(pool_name
[t
], p
);
2493 ::decode(pool_name
, p
);
2494 ::decode(pool_max
, p
);
2496 // kludge around some old bug that zeroed out pool_max (#2307)
2497 if (pools
.size() && pool_max
< pools
.rbegin()->first
) {
2498 pool_max
= pools
.rbegin()->first
;
2503 ::decode(max_osd
, p
);
// copy the decoded per-osd state values (os) into osd_state
2507 osd_state
.resize(os
.size());
2508 for (unsigned i
= 0; i
< os
.size(); ++i
) {
2509 osd_state
[i
] = os
[i
];
2512 ::decode(osd_weight
, p
);
2513 ::decode(osd_addrs
->client_addr
, p
);
// old-style pg ids are decoded raw and converted to pg_t
2519 ::decode_raw(opg
, p
);
2520 mempool::osdmap::vector
<int32_t> v
;
2522 pg_temp
->set(pg_t(opg
), v
);
2525 ::decode(*pg_temp
, p
);
2531 auto cblp
= cbl
.begin();
2532 crush
->decode(cblp
);
2538 ::decode(osd_addrs
->hb_back_addr
, p
);
2539 ::decode(osd_info
, p
);
2541 ::decode(pool_name
, p
);
2543 ::decode(blacklist
, p
);
2545 ::decode(osd_addrs
->cluster_addr
, p
);
2547 osd_addrs
->cluster_addr
.resize(osd_addrs
->client_addr
.size());
2550 ::decode(cluster_snapshot_epoch
, p
);
2551 ::decode(cluster_snapshot
, p
);
2555 ::decode(*osd_uuid
, p
);
2557 osd_uuid
->resize(max_osd
);
2560 ::decode(osd_xinfo
, p
);
2562 osd_xinfo
.resize(max_osd
);
2565 ::decode(osd_addrs
->hb_front_addr
, p
);
2567 osd_addrs
->hb_front_addr
.resize(osd_addrs
->hb_back_addr
.size());
2569 osd_primary_affinity
.reset();
// Decode a map from bl, accepting both the wrapped format and (by
// rewinding) the older single-version format.
//
// Layout of the wrapped format, as read here: wrapper header, client-usable
// section, osd-only section, then (wrapper struct_v >= 8) a crc.  Fields
// added in later sub-versions are decoded conditionally on struct_v and
// given defaults otherwise.  For osd-only struct_v < 5, require_osd_release
// is derived from the legacy flag bits.  When a crc is present it is
// recomputed with crc32c(-1) over the bytes before the crc field and the
// bytes after it, and a mismatch throws buffer::malformed_input.
//
// NOTE(review): many lines are missing from this extract (the struct_v < 7
// test and the call made after rewinding, the per-osd state decode for
// struct_v < 5, crush buffer decode, "else" keywords, the crc decode
// itself, and the trailing call(s) after verification).
2574 void OSDMap::decode(bufferlist::iterator
& bl
)
2577 * Older encodings of the OSDMap had a single struct_v which
2578 * covered the whole encoding, and was prior to our modern
2579 * stuff which includes a compatv and a size. So if we see
2580 * a struct_v < 7, we must rewind to the beginning and use our
2583 size_t start_offset
= bl
.get_off();
2584 size_t tail_offset
= 0;
2585 bufferlist crc_front
, crc_tail
;
2587 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl
); // wrapper
// step back over the struct_v that was just consumed
2589 int struct_v_size
= sizeof(struct_v
);
2590 bl
.advance(-struct_v_size
);
2595 * Since we made it past that hurdle, we can use our normal paths.
2598 DECODE_START(6, bl
); // client-usable data
2601 ::decode(epoch
, bl
);
2602 ::decode(created
, bl
);
2603 ::decode(modified
, bl
);
2605 ::decode(pools
, bl
);
2606 ::decode(pool_name
, bl
);
2607 ::decode(pool_max
, bl
);
2609 ::decode(flags
, bl
);
2611 ::decode(max_osd
, bl
);
// struct_v >= 5 stores osd_state directly; older encodings go through an
// intermediate container `os` (its decode is not visible here)
2612 if (struct_v
>= 5) {
2613 ::decode(osd_state
, bl
);
2617 osd_state
.resize(os
.size());
2618 for (unsigned i
= 0; i
< os
.size(); ++i
) {
2619 osd_state
[i
] = os
[i
];
2622 ::decode(osd_weight
, bl
);
2623 ::decode(osd_addrs
->client_addr
, bl
);
2625 ::decode(*pg_temp
, bl
);
2626 ::decode(*primary_temp
, bl
);
// an empty decoded affinity vector is normalised to "unset"
2627 if (struct_v
>= 2) {
2628 osd_primary_affinity
.reset(new mempool::osdmap::vector
<__u32
>);
2629 ::decode(*osd_primary_affinity
, bl
);
2630 if (osd_primary_affinity
->empty())
2631 osd_primary_affinity
.reset();
2633 osd_primary_affinity
.reset();
2639 auto cblp
= cbl
.begin();
2640 crush
->decode(cblp
);
2641 if (struct_v
>= 3) {
2642 ::decode(erasure_code_profiles
, bl
);
2644 erasure_code_profiles
.clear();
2646 if (struct_v
>= 4) {
2647 ::decode(pg_upmap
, bl
);
2648 ::decode(pg_upmap_items
, bl
);
2651 pg_upmap_items
.clear();
2653 if (struct_v
>= 6) {
2654 ::decode(crush_version
, bl
);
2656 DECODE_FINISH(bl
); // client-usable data
2660 DECODE_START(5, bl
); // extended, osd-only data
2661 ::decode(osd_addrs
->hb_back_addr
, bl
);
2662 ::decode(osd_info
, bl
);
2663 ::decode(blacklist
, bl
);
2664 ::decode(osd_addrs
->cluster_addr
, bl
);
2665 ::decode(cluster_snapshot_epoch
, bl
);
2666 ::decode(cluster_snapshot
, bl
);
2667 ::decode(*osd_uuid
, bl
);
2668 ::decode(osd_xinfo
, bl
);
2669 ::decode(osd_addrs
->hb_front_addr
, bl
);
2670 if (struct_v
>= 2) {
2671 ::decode(nearfull_ratio
, bl
);
2672 ::decode(full_ratio
, bl
);
2677 if (struct_v
>= 3) {
2678 ::decode(backfillfull_ratio
, bl
);
2680 backfillfull_ratio
= 0;
// version 4 carried require_min_compat_client as a release-name string
2682 if (struct_v
== 4) {
2686 require_min_compat_client
= ceph_release_from_name(r
.c_str());
2688 if (struct_v
>= 5) {
2689 ::decode(require_min_compat_client
, bl
);
2690 ::decode(require_osd_release
, bl
);
2691 if (require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
2692 flags
&= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS
);
// older encodings: derive require_osd_release from the legacy flag bits
2695 if (flags
& CEPH_OSDMAP_REQUIRE_LUMINOUS
) {
2696 // only for compat with post-kraken pre-luminous test clusters
2697 require_osd_release
= CEPH_RELEASE_LUMINOUS
;
2698 flags
&= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS
);
2699 } else if (flags
& CEPH_OSDMAP_REQUIRE_KRAKEN
) {
2700 require_osd_release
= CEPH_RELEASE_KRAKEN
;
2701 } else if (flags
& CEPH_OSDMAP_REQUIRE_JEWEL
) {
2702 require_osd_release
= CEPH_RELEASE_JEWEL
;
2704 require_osd_release
= 0;
2707 DECODE_FINISH(bl
); // osd-only data
// wrapper struct_v >= 8 carries a crc; capture the bytes that precede it
2710 if (struct_v
>= 8) {
2711 crc_front
.substr_of(bl
.get_bl(), start_offset
, bl
.get_off() - start_offset
);
2713 tail_offset
= bl
.get_off();
2716 crc_defined
= false;
2720 DECODE_FINISH(bl
); // wrapper
// recompute the crc over the front bytes plus any bytes after the crc field
2724 uint32_t actual
= crc_front
.crc32c(-1);
2725 if (tail_offset
< bl
.get_off()) {
2727 tail
.substr_of(bl
.get_bl(), tail_offset
, bl
.get_off() - tail_offset
);
2728 actual
= tail
.crc32c(actual
);
2730 if (crc
!= actual
) {
2732 ss
<< "bad crc, actual " << actual
<< " != expected " << crc
;
2733 string s
= ss
.str();
2734 throw buffer::malformed_input(s
.c_str());
// Rebuild derived state after a decode: fill the reverse pool lookup
// name_pool (name -> id) from pool_name (id -> name) and refresh the cached
// up-osd feature mask.
// NOTE(review): lines are missing from this extract; whether name_pool is
// cleared first and what else runs here is not visible.
2741 void OSDMap::post_decode()
2745 for (const auto &pname
: pool_name
) {
2746 name_pool
[pname
.second
] = pname
.first
;
2750 _calc_up_osd_features();
// Dump a set of erasure-code profiles to a Formatter.
//
// Output shape: an object section "erasure_code_profiles" containing one
// object section per profile (named after the profile), each holding the
// profile's key/value pairs as strings.
//
// @param profiles  profile name -> (key -> value) map
//
// NOTE(review): the Formatter parameter `f` and the matching
// close_section() calls are on lines missing from this extract.
2753 void OSDMap::dump_erasure_code_profiles(
2754 const mempool::osdmap::map
<string
,map
<string
,string
>>& profiles
,
2757 f
->open_object_section("erasure_code_profiles");
2758 for (const auto &profile
: profiles
) {
2759 f
->open_object_section(profile
.first
.c_str());
2760 for (const auto &profm
: profile
.second
) {
2761 f
->dump_string(profm
.first
.c_str(), profm
.second
.c_str());
2768 void OSDMap::dump(Formatter
*f
) const
2770 f
->dump_int("epoch", get_epoch());
2771 f
->dump_stream("fsid") << get_fsid();
2772 f
->dump_stream("created") << get_created();
2773 f
->dump_stream("modified") << get_modified();
2774 f
->dump_string("flags", get_flag_string());
2775 f
->dump_unsigned("crush_version", get_crush_version());
2776 f
->dump_float("full_ratio", full_ratio
);
2777 f
->dump_float("backfillfull_ratio", backfillfull_ratio
);
2778 f
->dump_float("nearfull_ratio", nearfull_ratio
);
2779 f
->dump_string("cluster_snapshot", get_cluster_snapshot());
2780 f
->dump_int("pool_max", get_pool_max());
2781 f
->dump_int("max_osd", get_max_osd());
2782 f
->dump_string("require_min_compat_client",
2783 ceph_release_name(require_min_compat_client
));
2784 f
->dump_string("min_compat_client",
2785 ceph_release_name(get_min_compat_client()));
2786 f
->dump_string("require_osd_release",
2787 ceph_release_name(require_osd_release
));
2789 f
->open_array_section("pools");
2790 for (const auto &pool
: pools
) {
2791 std::string
name("<unknown>");
2792 const auto &pni
= pool_name
.find(pool
.first
);
2793 if (pni
!= pool_name
.end())
2795 f
->open_object_section("pool");
2796 f
->dump_int("pool", pool
.first
);
2797 f
->dump_string("pool_name", name
);
2798 pool
.second
.dump(f
);
2803 f
->open_array_section("osds");
2804 for (int i
=0; i
<get_max_osd(); i
++)
2806 f
->open_object_section("osd_info");
2807 f
->dump_int("osd", i
);
2808 f
->dump_stream("uuid") << get_uuid(i
);
2809 f
->dump_int("up", is_up(i
));
2810 f
->dump_int("in", is_in(i
));
2811 f
->dump_float("weight", get_weightf(i
));
2812 f
->dump_float("primary_affinity", get_primary_affinityf(i
));
2813 get_info(i
).dump(f
);
2814 f
->dump_stream("public_addr") << get_addr(i
);
2815 f
->dump_stream("cluster_addr") << get_cluster_addr(i
);
2816 f
->dump_stream("heartbeat_back_addr") << get_hb_back_addr(i
);
2817 f
->dump_stream("heartbeat_front_addr") << get_hb_front_addr(i
);
2821 f
->open_array_section("state");
2822 for (const auto &state
: st
)
2823 f
->dump_string("state", state
);
2830 f
->open_array_section("osd_xinfo");
2831 for (int i
=0; i
<get_max_osd(); i
++) {
2833 f
->open_object_section("xinfo");
2834 f
->dump_int("osd", i
);
2835 osd_xinfo
[i
].dump(f
);
2841 f
->open_array_section("pg_upmap");
2842 for (auto& p
: pg_upmap
) {
2843 f
->open_object_section("mapping");
2844 f
->dump_stream("pgid") << p
.first
;
2845 f
->open_array_section("osds");
2846 for (auto q
: p
.second
) {
2847 f
->dump_int("osd", q
);
2853 f
->open_array_section("pg_upmap_items");
2854 for (auto& p
: pg_upmap_items
) {
2855 f
->open_object_section("mapping");
2856 f
->dump_stream("pgid") << p
.first
;
2857 f
->open_array_section("mappings");
2858 for (auto& q
: p
.second
) {
2859 f
->open_object_section("mapping");
2860 f
->dump_int("from", q
.first
);
2861 f
->dump_int("to", q
.second
);
2868 f
->open_array_section("pg_temp");
2872 f
->open_array_section("primary_temp");
2873 for (const auto &pg
: *primary_temp
) {
2874 f
->dump_stream("pgid") << pg
.first
;
2875 f
->dump_int("osd", pg
.second
);
2877 f
->close_section(); // primary_temp
2879 f
->open_object_section("blacklist");
2880 for (const auto &addr
: blacklist
) {
2883 f
->dump_stream(ss
.str().c_str()) << addr
.second
;
2887 dump_erasure_code_profiles(erasure_code_profiles
, f
);
// Append sample OSDMap instances to `o`.
// Two heap-allocated maps are pushed: a default-constructed one, and a second
// one that is populated with build_simple(cct, 1, fsid, 16, 7, 8), i.e.
// epoch 1, 16 osds, pg_bits 7, pgp_bits 8 (argument order per build_simple's
// signature in this file). The second map then has created/modified pinned to
// utime_t(1, 2) and one blacklist entry for a default entity_addr_t.
// The CephContext is allocated with `new` and no release is visible here.
// NOTE(review): the declaration of `fsid` is not visible in this listing.
2890 void OSDMap::generate_test_instances(list
<OSDMap
*>& o
)
2892 o
.push_back(new OSDMap
);
2894 CephContext
*cct
= new CephContext(CODE_ENVIRONMENT_UTILITY
);
2895 o
.push_back(new OSDMap
);
2897 o
.back()->build_simple(cct
, 1, fsid
, 16, 7, 8);
2898 o
.back()->created
= o
.back()->modified
= utime_t(1, 2); // fix timestamp
2899 o
.back()->blacklist
[entity_addr_t()] = utime_t(5, 6);
2903 string
OSDMap::get_flag_string(unsigned f
)
2906 if ( f
& CEPH_OSDMAP_NEARFULL
)
2908 if (f
& CEPH_OSDMAP_FULL
)
2910 if (f
& CEPH_OSDMAP_PAUSERD
)
2912 if (f
& CEPH_OSDMAP_PAUSEWR
)
2914 if (f
& CEPH_OSDMAP_PAUSEREC
)
2916 if (f
& CEPH_OSDMAP_NOUP
)
2918 if (f
& CEPH_OSDMAP_NODOWN
)
2920 if (f
& CEPH_OSDMAP_NOOUT
)
2922 if (f
& CEPH_OSDMAP_NOIN
)
2924 if (f
& CEPH_OSDMAP_NOBACKFILL
)
2926 if (f
& CEPH_OSDMAP_NOREBALANCE
)
2927 s
+= ",norebalance";
2928 if (f
& CEPH_OSDMAP_NORECOVER
)
2930 if (f
& CEPH_OSDMAP_NOSCRUB
)
2932 if (f
& CEPH_OSDMAP_NODEEP_SCRUB
)
2933 s
+= ",nodeep-scrub";
2934 if (f
& CEPH_OSDMAP_NOTIERAGENT
)
2935 s
+= ",notieragent";
2936 if (f
& CEPH_OSDMAP_SORTBITWISE
)
2937 s
+= ",sortbitwise";
2938 if (f
& CEPH_OSDMAP_REQUIRE_JEWEL
)
2939 s
+= ",require_jewel_osds";
2940 if (f
& CEPH_OSDMAP_REQUIRE_KRAKEN
)
2941 s
+= ",require_kraken_osds";
2942 if (f
& CEPH_OSDMAP_REQUIRE_LUMINOUS
)
2943 s
+= ",require_luminous_osds";
2949 string
OSDMap::get_flag_string() const
2951 return get_flag_string(flags
);
2958 qi() : item(0), depth(0), weight(0) {}
2959 qi(int i
, int d
, float w
) : item(i
), depth(d
), weight(w
) {}
2962 void OSDMap::print_pools(ostream
& out
) const
2964 for (const auto &pool
: pools
) {
2965 std::string
name("<unknown>");
2966 const auto &pni
= pool_name
.find(pool
.first
);
2967 if (pni
!= pool_name
.end())
2969 out
<< "pool " << pool
.first
2971 << "' " << pool
.second
<< "\n";
2973 for (const auto &snap
: pool
.second
.snaps
)
2974 out
<< "\tsnap " << snap
.second
.snapid
<< " '" << snap
.second
.name
<< "' " << snap
.second
.stamp
<< "\n";
2976 if (!pool
.second
.removed_snaps
.empty())
2977 out
<< "\tremoved_snaps " << pool
.second
.removed_snaps
<< "\n";
2982 void OSDMap::print(ostream
& out
) const
2984 out
<< "epoch " << get_epoch() << "\n"
2985 << "fsid " << get_fsid() << "\n"
2986 << "created " << get_created() << "\n"
2987 << "modified " << get_modified() << "\n";
2989 out
<< "flags " << get_flag_string() << "\n";
2990 out
<< "crush_version " << get_crush_version() << "\n";
2991 out
<< "full_ratio " << full_ratio
<< "\n";
2992 out
<< "backfillfull_ratio " << backfillfull_ratio
<< "\n";
2993 out
<< "nearfull_ratio " << nearfull_ratio
<< "\n";
2994 if (require_min_compat_client
> 0) {
2995 out
<< "require_min_compat_client "
2996 << ceph_release_name(require_min_compat_client
) << "\n";
2998 out
<< "min_compat_client " << ceph_release_name(get_min_compat_client())
3000 if (get_cluster_snapshot().length())
3001 out
<< "cluster_snapshot " << get_cluster_snapshot() << "\n";
3006 out
<< "max_osd " << get_max_osd() << "\n";
3007 for (int i
=0; i
<get_max_osd(); i
++) {
3010 out
<< (is_up(i
) ? " up ":" down");
3011 out
<< (is_in(i
) ? " in ":" out");
3012 out
<< " weight " << get_weightf(i
);
3013 if (get_primary_affinity(i
) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
)
3014 out
<< " primary_affinity " << get_primary_affinityf(i
);
3015 const osd_info_t
& info(get_info(i
));
3017 out
<< " " << get_addr(i
) << " " << get_cluster_addr(i
) << " " << get_hb_back_addr(i
)
3018 << " " << get_hb_front_addr(i
);
3022 if (!get_uuid(i
).is_zero())
3023 out
<< " " << get_uuid(i
);
3029 for (auto& p
: pg_upmap
) {
3030 out
<< "pg_upmap " << p
.first
<< " " << p
.second
<< "\n";
3032 for (auto& p
: pg_upmap_items
) {
3033 out
<< "pg_upmap_items " << p
.first
<< " " << p
.second
<< "\n";
3036 for (const auto pg
: *pg_temp
)
3037 out
<< "pg_temp " << pg
.first
<< " " << pg
.second
<< "\n";
3039 for (const auto pg
: *primary_temp
)
3040 out
<< "primary_temp " << pg
.first
<< " " << pg
.second
<< "\n";
3042 for (const auto &addr
: blacklist
)
3043 out
<< "blacklist " << addr
.first
<< " expires " << addr
.second
<< "\n";
3045 // ignore pg_swap_primary
3048 class OSDTreePlainDumper
: public CrushTreeDumper::Dumper
<TextTable
> {
3050 typedef CrushTreeDumper::Dumper
<TextTable
> Parent
;
3052 OSDTreePlainDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap_
,
3054 : Parent(crush
), osdmap(osdmap_
), filter(f
) { }
3056 bool should_dump_leaf(int i
) const override
{
3057 if (((filter
& OSDMap::DUMP_UP
) && !osdmap
->is_up(i
)) ||
3058 ((filter
& OSDMap::DUMP_DOWN
) && !osdmap
->is_down(i
)) ||
3059 ((filter
& OSDMap::DUMP_IN
) && !osdmap
->is_in(i
)) ||
3060 ((filter
& OSDMap::DUMP_OUT
) && !osdmap
->is_out(i
))) {
3066 bool should_dump_empty_bucket() const override
{
3070 void dump(TextTable
*tbl
) {
3071 tbl
->define_column("ID", TextTable::LEFT
, TextTable::RIGHT
);
3072 tbl
->define_column("WEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
3073 tbl
->define_column("TYPE NAME", TextTable::LEFT
, TextTable::LEFT
);
3074 tbl
->define_column("UP/DOWN", TextTable::LEFT
, TextTable::RIGHT
);
3075 tbl
->define_column("REWEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
3076 tbl
->define_column("PRIMARY-AFFINITY", TextTable::LEFT
, TextTable::RIGHT
);
3080 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
3081 if (osdmap
->exists(i
) && !is_touched(i
) && should_dump_leaf(i
)) {
3082 dump_item(CrushTreeDumper::Item(i
, 0, 0), tbl
);
3088 void dump_item(const CrushTreeDumper::Item
&qi
, TextTable
*tbl
) override
{
3091 << weightf_t(qi
.weight
);
3094 for (int k
= 0; k
< qi
.depth
; k
++)
3096 if (qi
.is_bucket()) {
3097 name
<< crush
->get_type_name(crush
->get_bucket_type(qi
.id
)) << " "
3098 << crush
->get_item_name(qi
.id
);
3100 name
<< "osd." << qi
.id
;
3104 if (!qi
.is_bucket()) {
3105 if (!osdmap
->exists(qi
.id
)) {
3109 *tbl
<< (osdmap
->is_up(qi
.id
) ? "up" : "down")
3110 << weightf_t(osdmap
->get_weightf(qi
.id
))
3111 << weightf_t(osdmap
->get_primary_affinityf(qi
.id
));
3114 *tbl
<< TextTable::endrow
;
3118 const OSDMap
*osdmap
;
3119 const unsigned filter
;
3122 class OSDTreeFormattingDumper
: public CrushTreeDumper::FormattingDumper
{
3124 typedef CrushTreeDumper::FormattingDumper Parent
;
3126 OSDTreeFormattingDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap_
,
3128 : Parent(crush
), osdmap(osdmap_
), filter(f
) { }
3130 bool should_dump_leaf(int i
) const override
{
3131 if (((filter
& OSDMap::DUMP_UP
) && !osdmap
->is_up(i
)) ||
3132 ((filter
& OSDMap::DUMP_DOWN
) && !osdmap
->is_down(i
)) ||
3133 ((filter
& OSDMap::DUMP_IN
) && !osdmap
->is_in(i
)) ||
3134 ((filter
& OSDMap::DUMP_OUT
) && !osdmap
->is_out(i
))) {
3140 bool should_dump_empty_bucket() const override
{
3144 void dump(Formatter
*f
) {
3145 f
->open_array_section("nodes");
3148 f
->open_array_section("stray");
3149 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
3150 if (osdmap
->exists(i
) && !is_touched(i
) && should_dump_leaf(i
))
3151 dump_item(CrushTreeDumper::Item(i
, 0, 0), f
);
3157 void dump_item_fields(const CrushTreeDumper::Item
&qi
, Formatter
*f
) override
{
3158 Parent::dump_item_fields(qi
, f
);
3159 if (!qi
.is_bucket())
3161 f
->dump_unsigned("exists", (int)osdmap
->exists(qi
.id
));
3162 f
->dump_string("status", osdmap
->is_up(qi
.id
) ? "up" : "down");
3163 f
->dump_float("reweight", osdmap
->get_weightf(qi
.id
));
3164 f
->dump_float("primary_affinity", osdmap
->get_primary_affinityf(qi
.id
));
3169 const OSDMap
*osdmap
;
3170 const unsigned filter
;
// Print the OSD tree, restricted by `filter`, in one of two forms:
//  - via OSDTreeFormattingDumper, dumping into the Formatter `f`, or
//  - via OSDTreePlainDumper, dumping into a TextTable `tbl`.
// Both dumpers are built from crush.get(), this map and `filter`.
// NOTE(review): the condition selecting between the two paths and the code
// that writes `tbl` to `out` are not visible in this listing; presumably the
// formatter path is taken when `f` is non-null -- confirm.
3173 void OSDMap::print_tree(Formatter
*f
, ostream
*out
, unsigned filter
) const
3176 OSDTreeFormattingDumper(crush
.get(), this, filter
).dump(f
);
3180 OSDTreePlainDumper(crush
.get(), this, filter
).dump(&tbl
);
3185 void OSDMap::print_summary(Formatter
*f
, ostream
& out
) const
3188 f
->open_object_section("osdmap");
3189 f
->dump_int("epoch", get_epoch());
3190 f
->dump_int("num_osds", get_num_osds());
3191 f
->dump_int("num_up_osds", get_num_up_osds());
3192 f
->dump_int("num_in_osds", get_num_in_osds());
3193 f
->dump_bool("full", test_flag(CEPH_OSDMAP_FULL
) ? true : false);
3194 f
->dump_bool("nearfull", test_flag(CEPH_OSDMAP_NEARFULL
) ? true : false);
3195 f
->dump_unsigned("num_remapped_pgs", get_num_pg_temp());
3198 out
<< get_num_osds() << " osds: "
3199 << get_num_up_osds() << " up, "
3200 << get_num_in_osds() << " in";
3201 if (get_num_pg_temp())
3202 out
<< "; " << get_num_pg_temp() << " remapped pgs";
3204 uint64_t important_flags
= flags
& ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS
;
3205 if (important_flags
)
3206 out
<< " flags " << get_flag_string(important_flags
) << "\n";
// Write a single-line summary of the map to `out`, in the form
//   e<epoch>: <num_osds> total, <num_up_osds> up, <num_in_osds> in
// and then test CEPH_OSDMAP_FULL first, falling back to
// CEPH_OSDMAP_NEARFULL only when FULL is not set.
// NOTE(review): the statements executed in those two branches are not
// visible in this listing.
3210 void OSDMap::print_oneline_summary(ostream
& out
) const
3212 out
<< "e" << get_epoch() << ": "
3213 << get_num_osds() << " total, "
3214 << get_num_up_osds() << " up, "
3215 << get_num_in_osds() << " in";
3216 if (test_flag(CEPH_OSDMAP_FULL
))
3218 else if (test_flag(CEPH_OSDMAP_NEARFULL
))
3222 bool OSDMap::crush_ruleset_in_use(int ruleset
) const
3224 for (const auto &pool
: pools
) {
3225 if (pool
.second
.crush_rule
== ruleset
)
3231 int OSDMap::build_simple(CephContext
*cct
, epoch_t e
, uuid_d
&fsid
,
3232 int nosd
, int pg_bits
, int pgp_bits
)
3234 ldout(cct
, 10) << "build_simple on " << num_osd
3235 << " osds with " << pg_bits
<< " pg bits per osd, "
3239 created
= modified
= ceph_clock_now();
3246 const md_config_t
*conf
= cct
->_conf
;
3247 vector
<string
> sections
;
3248 conf
->get_all_sections(sections
);
3250 for (auto §ion
: sections
) {
3251 if (section
.find("osd.") != 0)
3254 const char *begin
= section
.c_str() + 4;
3255 char *end
= (char*)begin
;
3256 int o
= strtol(begin
, &end
, 10);
3260 if (o
> cct
->_conf
->mon_max_osd
) {
3261 lderr(cct
) << "[osd." << o
<< "] in config has id > mon_max_osd " << cct
->_conf
->mon_max_osd
<< dendl
;
3269 set_max_osd(maxosd
+ 1);
3272 // pgp_num <= pg_num
3273 if (pgp_bits
> pg_bits
)
3276 vector
<string
> pool_names
;
3277 pool_names
.push_back("rbd");
3282 r
= build_simple_crush_map(cct
, *crush
, nosd
, &ss
);
3284 r
= build_simple_crush_map_from_conf(cct
, *crush
, &ss
);
3287 int poolbase
= get_max_osd() ? get_max_osd() : 1;
3289 int const default_replicated_rule
=
3290 crush
->get_osd_pool_default_crush_replicated_ruleset(cct
);
3291 assert(default_replicated_rule
>= 0);
3293 for (auto &plname
: pool_names
) {
3294 int64_t pool
= ++pool_max
;
3295 pools
[pool
].type
= pg_pool_t::TYPE_REPLICATED
;
3296 pools
[pool
].flags
= cct
->_conf
->osd_pool_default_flags
;
3297 if (cct
->_conf
->osd_pool_default_flag_hashpspool
)
3298 pools
[pool
].set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
3299 if (cct
->_conf
->osd_pool_default_flag_nodelete
)
3300 pools
[pool
].set_flag(pg_pool_t::FLAG_NODELETE
);
3301 if (cct
->_conf
->osd_pool_default_flag_nopgchange
)
3302 pools
[pool
].set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
3303 if (cct
->_conf
->osd_pool_default_flag_nosizechange
)
3304 pools
[pool
].set_flag(pg_pool_t::FLAG_NOSIZECHANGE
);
3305 pools
[pool
].size
= cct
->_conf
->osd_pool_default_size
;
3306 pools
[pool
].min_size
= cct
->_conf
->get_osd_pool_default_min_size();
3307 pools
[pool
].crush_rule
= default_replicated_rule
;
3308 pools
[pool
].object_hash
= CEPH_STR_HASH_RJENKINS
;
3309 pools
[pool
].set_pg_num(poolbase
<< pg_bits
);
3310 pools
[pool
].set_pgp_num(poolbase
<< pgp_bits
);
3311 pools
[pool
].last_change
= epoch
;
3312 pool_name
[pool
] = plname
;
3313 name_pool
[plname
] = pool
;
3316 for (int i
=0; i
<get_max_osd(); i
++) {
3318 set_weight(i
, CEPH_OSD_OUT
);
3321 map
<string
,string
> profile_map
;
3322 r
= get_erasure_code_profile_default(cct
, profile_map
, &ss
);
3324 lderr(cct
) << ss
.str() << dendl
;
3327 set_erasure_code_profile("default", profile_map
);
// Obtain the default erasure code profile from configuration.
// The config value cct->_conf->osd_pool_default_erasure_code_profile is
// handed to get_json_str_map() and its int result is kept in `r`.
// NOTE(review): the remaining parameters of this function, the other
// arguments passed to get_json_str_map() and the return statement are not
// visible in this listing; presumably `profile_map` receives the parsed
// key/value pairs and `r` is returned -- confirm.
3331 int OSDMap::get_erasure_code_profile_default(CephContext
*cct
,
3332 map
<string
,string
> &profile_map
,
3335 int r
= get_json_str_map(cct
->_conf
->osd_pool_default_erasure_code_profile
,
3341 int OSDMap::_build_crush_types(CrushWrapper
& crush
)
3343 crush
.set_type_name(0, "osd");
3344 crush
.set_type_name(1, "host");
3345 crush
.set_type_name(2, "chassis");
3346 crush
.set_type_name(3, "rack");
3347 crush
.set_type_name(4, "row");
3348 crush
.set_type_name(5, "pdu");
3349 crush
.set_type_name(6, "pod");
3350 crush
.set_type_name(7, "room");
3351 crush
.set_type_name(8, "datacenter");
3352 crush
.set_type_name(9, "region");
3353 crush
.set_type_name(10, "root");
3357 int OSDMap::build_simple_crush_map(CephContext
*cct
, CrushWrapper
& crush
,
3358 int nosd
, ostream
*ss
)
3363 int root_type
= _build_crush_types(crush
);
3365 int r
= crush
.add_bucket(0, 0, CRUSH_HASH_DEFAULT
,
3366 root_type
, 0, NULL
, NULL
, &rootid
);
3368 crush
.set_item_name(rootid
, "default");
3370 for (int o
=0; o
<nosd
; o
++) {
3371 map
<string
,string
> loc
;
3372 loc
["host"] = "localhost";
3373 loc
["rack"] = "localrack";
3374 loc
["root"] = "default";
3375 ldout(cct
, 10) << " adding osd." << o
<< " at " << loc
<< dendl
;
3377 snprintf(name
, sizeof(name
), "osd.%d", o
);
3378 crush
.insert_item(cct
, o
, 1.0, name
, loc
);
3381 build_simple_crush_rules(cct
, crush
, "default", ss
);
3388 int OSDMap::build_simple_crush_map_from_conf(CephContext
*cct
,
3389 CrushWrapper
& crush
,
3392 const md_config_t
*conf
= cct
->_conf
;
3397 int root_type
= _build_crush_types(crush
);
3399 int r
= crush
.add_bucket(0, 0,
3401 root_type
, 0, NULL
, NULL
, &rootid
);
3403 crush
.set_item_name(rootid
, "default");
3406 vector
<string
> sections
;
3407 conf
->get_all_sections(sections
);
3409 for (auto §ion
: sections
) {
3410 if (section
.find("osd.") != 0)
3413 const char *begin
= section
.c_str() + 4;
3414 char *end
= (char*)begin
;
3415 int o
= strtol(begin
, &end
, 10);
3419 string host
, rack
, row
, room
, dc
, pool
;
3420 vector
<string
> sectiontmp
;
3421 sectiontmp
.push_back("osd");
3422 sectiontmp
.push_back(section
);
3423 conf
->get_val_from_conf_file(sectiontmp
, "host", host
, false);
3424 conf
->get_val_from_conf_file(sectiontmp
, "rack", rack
, false);
3425 conf
->get_val_from_conf_file(sectiontmp
, "row", row
, false);
3426 conf
->get_val_from_conf_file(sectiontmp
, "room", room
, false);
3427 conf
->get_val_from_conf_file(sectiontmp
, "datacenter", dc
, false);
3428 conf
->get_val_from_conf_file(sectiontmp
, "root", pool
, false);
3430 if (host
.length() == 0)
3431 host
= "unknownhost";
3432 if (rack
.length() == 0)
3433 rack
= "unknownrack";
3435 map
<string
,string
> loc
;
3443 loc
["datacenter"] = dc
;
3444 loc
["root"] = "default";
3446 ldout(cct
, 5) << " adding osd." << o
<< " at " << loc
<< dendl
;
3447 crush
.insert_item(cct
, o
, 1.0, section
, loc
);
3450 build_simple_crush_rules(cct
, crush
, "default", ss
);
3458 int OSDMap::build_simple_crush_rules(
3460 CrushWrapper
& crush
,
3464 int crush_rule
= crush
.get_osd_pool_default_crush_replicated_ruleset(cct
);
3465 string failure_domain
=
3466 crush
.get_type_name(cct
->_conf
->osd_crush_chooseleaf_type
);
3469 r
= crush
.add_simple_rule_at(
3470 "replicated_rule", root
, failure_domain
,
3471 "firstn", pg_pool_t::TYPE_REPLICATED
,
3475 // do not add an erasure rule by default or else we will implicitly
3476 // require the crush_v2 feature of clients
3480 int OSDMap::summarize_mapping_stats(
3482 const set
<int64_t> *pools
,
3490 for (auto &p
: get_pools())
3494 unsigned total_pg
= 0;
3495 unsigned moved_pg
= 0;
3496 vector
<unsigned> base_by_osd(get_max_osd(), 0);
3497 vector
<unsigned> new_by_osd(get_max_osd(), 0);
3498 for (int64_t pool_id
: ls
) {
3499 const pg_pool_t
*pi
= get_pg_pool(pool_id
);
3500 vector
<int> up
, up2
;
3502 for (unsigned ps
= 0; ps
< pi
->get_pg_num(); ++ps
) {
3503 pg_t
pgid(ps
, pool_id
, -1);
3504 total_pg
+= pi
->get_size();
3505 pg_to_up_acting_osds(pgid
, &up
, &up_primary
, nullptr, nullptr);
3506 for (int osd
: up
) {
3507 if (osd
>= 0 && osd
< get_max_osd())
3511 newmap
->pg_to_up_acting_osds(pgid
, &up2
, &up_primary
, nullptr, nullptr);
3512 for (int osd
: up2
) {
3513 if (osd
>= 0 && osd
< get_max_osd())
3516 if (pi
->type
== pg_pool_t::TYPE_ERASURE
) {
3517 for (unsigned i
=0; i
<up
.size(); ++i
) {
3518 if (up
[i
] != up2
[i
]) {
3522 } else if (pi
->type
== pg_pool_t::TYPE_REPLICATED
) {
3523 for (int osd
: up
) {
3524 if (std::find(up2
.begin(), up2
.end(), osd
) == up2
.end()) {
3529 assert(0 == "unhandled pool type");
3535 unsigned num_up_in
= 0;
3536 for (int osd
= 0; osd
< get_max_osd(); ++osd
) {
3537 if (is_up(osd
) && is_in(osd
))
3544 float avg_pg
= (float)total_pg
/ (float)num_up_in
;
3545 float base_stddev
= 0, new_stddev
= 0;
3546 int min
= -1, max
= -1;
3547 unsigned min_base_pg
= 0, max_base_pg
= 0;
3548 unsigned min_new_pg
= 0, max_new_pg
= 0;
3549 for (int osd
= 0; osd
< get_max_osd(); ++osd
) {
3550 if (is_up(osd
) && is_in(osd
)) {
3551 float base_diff
= (float)base_by_osd
[osd
] - avg_pg
;
3552 base_stddev
+= base_diff
* base_diff
;
3553 float new_diff
= (float)new_by_osd
[osd
] - avg_pg
;
3554 new_stddev
+= new_diff
* new_diff
;
3555 if (min
< 0 || base_by_osd
[osd
] < min_base_pg
) {
3557 min_base_pg
= base_by_osd
[osd
];
3558 min_new_pg
= new_by_osd
[osd
];
3560 if (max
< 0 || base_by_osd
[osd
] > max_base_pg
) {
3562 max_base_pg
= base_by_osd
[osd
];
3563 max_new_pg
= new_by_osd
[osd
];
3567 base_stddev
= sqrt(base_stddev
/ num_up_in
);
3568 new_stddev
= sqrt(new_stddev
/ num_up_in
);
3570 float edev
= sqrt(avg_pg
* (1.0 - (1.0 / (double)num_up_in
)));
3574 f
->open_object_section("utilization");
3577 f
->dump_unsigned("moved_pgs", moved_pg
);
3578 f
->dump_unsigned("total_pgs", total_pg
);
3582 percent
= (float)moved_pg
* 100.0 / (float)total_pg
;
3583 ss
<< "moved " << moved_pg
<< " / " << total_pg
3584 << " (" << percent
<< "%)\n";
3588 f
->dump_float("avg_pgs", avg_pg
);
3589 f
->dump_float("std_dev", base_stddev
);
3590 f
->dump_float("expected_baseline_std_dev", edev
);
3592 f
->dump_float("new_std_dev", new_stddev
);
3594 ss
<< "avg " << avg_pg
<< "\n";
3595 ss
<< "stddev " << base_stddev
;
3597 ss
<< " -> " << new_stddev
;
3598 ss
<< " (expected baseline " << edev
<< ")\n";
3602 f
->dump_unsigned("min_osd", min
);
3603 f
->dump_unsigned("min_osd_pgs", min_base_pg
);
3605 f
->dump_unsigned("new_min_osd_pgs", min_new_pg
);
3607 ss
<< "min osd." << min
<< " with " << min_base_pg
;
3609 ss
<< " -> " << min_new_pg
;
3610 ss
<< " pgs (" << (float)min_base_pg
/ avg_pg
;
3612 ss
<< " -> " << (float)min_new_pg
/ avg_pg
;
3618 f
->dump_unsigned("max_osd", max
);
3619 f
->dump_unsigned("max_osd_pgs", max_base_pg
);
3621 f
->dump_unsigned("new_max_osd_pgs", max_new_pg
);
3623 ss
<< "max osd." << max
<< " with " << max_base_pg
;
3625 ss
<< " -> " << max_new_pg
;
3626 ss
<< " pgs (" << (float)max_base_pg
/ avg_pg
;
3628 ss
<< " -> " << (float)max_new_pg
/ avg_pg
;
3640 int OSDMap::clean_pg_upmaps(
3642 Incremental
*pending_inc
)
3644 ldout(cct
, 10) << __func__
<< dendl
;
3646 for (auto& p
: pg_upmap
) {
3649 pg_to_raw_osds(p
.first
, &raw
, &primary
);
3650 if (vectors_equal(raw
, p
.second
)) {
3651 ldout(cct
, 10) << " removing redundant pg_upmap " << p
.first
<< " "
3652 << p
.second
<< dendl
;
3653 pending_inc
->old_pg_upmap
.insert(p
.first
);
3657 for (auto& p
: pg_upmap_items
) {
3660 pg_to_raw_osds(p
.first
, &raw
, &primary
);
3661 mempool::osdmap::vector
<pair
<int,int>> newmap
;
3662 for (auto& q
: p
.second
) {
3663 if (std::find(raw
.begin(), raw
.end(), q
.first
) != raw
.end()) {
3664 newmap
.push_back(q
);
3667 if (newmap
.empty()) {
3668 ldout(cct
, 10) << " removing no-op pg_upmap_items " << p
.first
<< " "
3669 << p
.second
<< dendl
;
3670 pending_inc
->old_pg_upmap_items
.insert(p
.first
);
3672 } else if (newmap
!= p
.second
) {
3673 ldout(cct
, 10) << " simplifying partially no-op pg_upmap_items "
3674 << p
.first
<< " " << p
.second
<< " -> " << newmap
<< dendl
;
3675 pending_inc
->new_pg_upmap_items
[p
.first
] = newmap
;
3682 bool OSDMap::try_pg_upmap(
3684 pg_t pg
, ///< pg to potentially remap
3685 const set
<int>& overfull
, ///< osds we'd want to evacuate
3686 const vector
<int>& underfull
, ///< osds to move to, in order of preference
3688 vector
<int> *out
) ///< resulting alternative mapping
3690 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
3693 int rule
= crush
->find_rule(pool
->get_crush_rule(), pool
->get_type(),
3698 // get original mapping
3699 _pg_to_raw_osds(*pool
, pg
, orig
, NULL
);
3701 // make sure there is something there to remap
3703 for (auto osd
: *orig
) {
3704 if (overfull
.count(osd
)) {
3713 int r
= crush
->try_remap_rule(
3717 overfull
, underfull
,
3727 int OSDMap::calc_pg_upmaps(
3729 float max_deviation_ratio
,
3731 const set
<int64_t>& only_pools_orig
,
3732 OSDMap::Incremental
*pending_inc
)
3734 set
<int64_t> only_pools
;
3735 if (only_pools_orig
.empty()) {
3736 for (auto& i
: pools
) {
3737 only_pools
.insert(i
.first
);
3740 only_pools
= only_pools_orig
;
3743 tmp
.deepish_copy_from(*this);
3744 float start_deviation
= 0;
3745 float end_deviation
= 0;
3746 int num_changed
= 0;
3748 map
<int,set
<pg_t
>> pgs_by_osd
;
3750 float osd_weight_total
= 0;
3751 map
<int,float> osd_weight
;
3752 for (auto& i
: pools
) {
3753 if (!only_pools
.empty() && !only_pools
.count(i
.first
))
3755 for (unsigned ps
= 0; ps
< i
.second
.get_pg_num(); ++ps
) {
3756 pg_t
pg(ps
, i
.first
);
3758 tmp
.pg_to_up_acting_osds(pg
, &up
, nullptr, nullptr, nullptr);
3759 for (auto osd
: up
) {
3760 if (osd
!= CRUSH_ITEM_NONE
)
3761 pgs_by_osd
[osd
].insert(pg
);
3764 total_pgs
+= i
.second
.get_size() * i
.second
.get_pg_num();
3766 map
<int,float> pmap
;
3767 int ruleno
= tmp
.crush
->find_rule(i
.second
.get_crush_rule(),
3768 i
.second
.get_type(),
3769 i
.second
.get_size());
3770 tmp
.crush
->get_rule_weight_osd_map(ruleno
, &pmap
);
3771 ldout(cct
,30) << __func__
<< " pool " << i
.first
<< " ruleno " << ruleno
<< dendl
;
3772 for (auto p
: pmap
) {
3773 osd_weight
[p
.first
] += p
.second
;
3774 osd_weight_total
+= p
.second
;
3777 for (auto& i
: osd_weight
) {
3779 auto p
= pgs_by_osd
.find(i
.first
);
3780 if (p
!= pgs_by_osd
.end())
3781 pgs
= p
->second
.size();
3783 pgs_by_osd
.emplace(i
.first
, set
<pg_t
>());
3784 ldout(cct
, 20) << " osd." << i
.first
<< " weight " << i
.second
3785 << " pgs " << pgs
<< dendl
;
3788 float pgs_per_weight
= total_pgs
/ osd_weight_total
;
3789 ldout(cct
, 10) << " osd_weight_total " << osd_weight_total
<< dendl
;
3790 ldout(cct
, 10) << " pgs_per_weight " << pgs_per_weight
<< dendl
;
3793 float total_deviation
= 0;
3794 map
<int,float> osd_deviation
; // osd, deviation(pgs)
3795 multimap
<float,int> deviation_osd
; // deviation(pgs), osd
3797 for (auto& i
: pgs_by_osd
) {
3798 float target
= osd_weight
[i
.first
] * pgs_per_weight
;
3799 float deviation
= (float)i
.second
.size() - target
;
3800 ldout(cct
, 20) << " osd." << i
.first
3801 << "\tpgs " << i
.second
.size()
3802 << "\ttarget " << target
3803 << "\tdeviation " << deviation
3805 osd_deviation
[i
.first
] = deviation
;
3806 deviation_osd
.insert(make_pair(deviation
, i
.first
));
3807 if (deviation
>= 1.0)
3808 overfull
.insert(i
.first
);
3809 total_deviation
+= abs(deviation
);
3811 if (num_changed
== 0) {
3812 start_deviation
= total_deviation
;
3814 end_deviation
= total_deviation
;
3816 // build underfull, sorted from least-full to most-average
3817 vector
<int> underfull
;
3818 for (auto i
= deviation_osd
.begin();
3819 i
!= deviation_osd
.end();
3821 if (i
->first
>= -.999)
3823 underfull
.push_back(i
->second
);
3825 ldout(cct
, 10) << " total_deviation " << total_deviation
3826 << " overfull " << overfull
3827 << " underfull " << underfull
<< dendl
;
3828 if (overfull
.empty() || underfull
.empty())
3832 bool restart
= false;
3833 for (auto p
= deviation_osd
.rbegin(); p
!= deviation_osd
.rend(); ++p
) {
3834 int osd
= p
->second
;
3835 float deviation
= p
->first
;
3836 float target
= osd_weight
[osd
] * pgs_per_weight
;
3837 if (deviation
/target
< max_deviation_ratio
) {
3838 ldout(cct
, 10) << " osd." << osd
3839 << " target " << target
3840 << " deviation " << deviation
3841 << " -> ratio " << deviation
/target
3842 << " < max ratio " << max_deviation_ratio
<< dendl
;
3845 int num_to_move
= deviation
;
3846 ldout(cct
, 10) << " osd." << osd
<< " move " << num_to_move
<< dendl
;
3847 if (num_to_move
< 1)
3850 set
<pg_t
>& pgs
= pgs_by_osd
[osd
];
3852 // look for remaps we can un-remap
3853 for (auto pg
: pgs
) {
3854 auto p
= tmp
.pg_upmap_items
.find(pg
);
3855 if (p
!= tmp
.pg_upmap_items
.end()) {
3856 for (auto q
: p
->second
) {
3857 if (q
.second
== osd
) {
3858 ldout(cct
, 10) << " dropping pg_upmap_items " << pg
3859 << " " << p
->second
<< dendl
;
3860 tmp
.pg_upmap_items
.erase(p
);
3861 pending_inc
->old_pg_upmap_items
.insert(pg
);
3873 for (auto pg
: pgs
) {
3874 if (tmp
.pg_upmap
.count(pg
) ||
3875 tmp
.pg_upmap_items
.count(pg
)) {
3876 ldout(cct
, 20) << " already remapped " << pg
<< dendl
;
3879 ldout(cct
, 10) << " trying " << pg
<< dendl
;
3880 vector
<int> orig
, out
;
3881 if (!try_pg_upmap(cct
, pg
, overfull
, underfull
, &orig
, &out
)) {
3884 ldout(cct
, 10) << " " << pg
<< " " << orig
<< " -> " << out
<< dendl
;
3885 if (orig
.size() != out
.size()) {
3888 assert(orig
!= out
);
3889 auto& rmi
= tmp
.pg_upmap_items
[pg
];
3890 for (unsigned i
= 0; i
< out
.size(); ++i
) {
3891 if (orig
[i
] != out
[i
]) {
3892 rmi
.push_back(make_pair(orig
[i
], out
[i
]));
3895 pending_inc
->new_pg_upmap_items
[pg
] = rmi
;
3896 ldout(cct
, 10) << " " << pg
<< " pg_upmap_items " << rmi
<< dendl
;
3906 ldout(cct
, 10) << " failed to find any changes to make" << dendl
;
3910 ldout(cct
, 10) << " hit max iterations, stopping" << dendl
;
3914 ldout(cct
, 10) << " start deviation " << start_deviation
<< dendl
;
3915 ldout(cct
, 10) << " end deviation " << end_deviation
<< dendl
;
3919 int OSDMap::get_osds_by_bucket_name(const string
&name
, set
<int> *osds
) const
3921 return crush
->get_leaves(name
, osds
);
3924 template <typename F
>
3925 class OSDUtilizationDumper
: public CrushTreeDumper::Dumper
<F
> {
3927 typedef CrushTreeDumper::Dumper
<F
> Parent
;
3929 OSDUtilizationDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap_
,
3930 const PGStatService
*pgs_
, bool tree_
) :
3935 average_util(average_utilization()),
3943 void dump_stray(F
*f
) {
3944 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
3945 if (osdmap
->exists(i
) && !this->is_touched(i
))
3946 dump_item(CrushTreeDumper::Item(i
, 0, 0), f
);
3950 void dump_item(const CrushTreeDumper::Item
&qi
, F
*f
) override
{
3951 if (!tree
&& qi
.is_bucket())
3954 float reweight
= qi
.is_bucket() ? -1 : osdmap
->get_weightf(qi
.id
);
3955 int64_t kb
= 0, kb_used
= 0, kb_avail
= 0;
3957 if (get_bucket_utilization(qi
.id
, &kb
, &kb_used
, &kb_avail
))
3959 util
= 100.0 * (double)kb_used
/ (double)kb
;
3963 var
= util
/ average_util
;
3965 size_t num_pgs
= qi
.is_bucket() ? 0 : pgs
->get_num_pg_by_osd(qi
.id
);
3967 dump_item(qi
, reweight
, kb
, kb_used
, kb_avail
, util
, var
, num_pgs
, f
);
3969 if (!qi
.is_bucket() && reweight
> 0) {
3970 if (min_var
< 0 || var
< min_var
)
3972 if (max_var
< 0 || var
> max_var
)
3975 double dev
= util
- average_util
;
3977 stddev
+= reweight
* dev
;
3982 virtual void dump_item(const CrushTreeDumper::Item
&qi
,
3989 const size_t num_pgs
,
3993 return sum
> 0 ? sqrt(stddev
/ sum
) : 0;
3996 double average_utilization() {
3997 int64_t kb
= 0, kb_used
= 0;
3998 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
3999 if (!osdmap
->exists(i
) || osdmap
->get_weight(i
) == 0)
4001 int64_t kb_i
, kb_used_i
, kb_avail_i
;
4002 if (get_osd_utilization(i
, &kb_i
, &kb_used_i
, &kb_avail_i
)) {
4004 kb_used
+= kb_used_i
;
4007 return kb
> 0 ? 100.0 * (double)kb_used
/ (double)kb
: 0;
4010 bool get_osd_utilization(int id
, int64_t* kb
, int64_t* kb_used
,
4011 int64_t* kb_avail
) const {
4012 const osd_stat_t
*p
= pgs
->get_osd_stat(id
);
4013 if (!p
) return false;
4015 *kb_used
= p
->kb_used
;
4016 *kb_avail
= p
->kb_avail
;
4020 bool get_bucket_utilization(int id
, int64_t* kb
, int64_t* kb_used
,
4021 int64_t* kb_avail
) const {
4023 if (osdmap
->is_out(id
)) {
4029 return get_osd_utilization(id
, kb
, kb_used
, kb_avail
);
4036 for (int k
= osdmap
->crush
->get_bucket_size(id
) - 1; k
>= 0; k
--) {
4037 int item
= osdmap
->crush
->get_bucket_item(id
, k
);
4038 int64_t kb_i
= 0, kb_used_i
= 0, kb_avail_i
= 0;
4039 if (!get_bucket_utilization(item
, &kb_i
, &kb_used_i
, &kb_avail_i
))
4042 *kb_used
+= kb_used_i
;
4043 *kb_avail
+= kb_avail_i
;
4049 const OSDMap
*osdmap
;
4050 const PGStatService
*pgs
;
4052 double average_util
;
4060 class OSDUtilizationPlainDumper
: public OSDUtilizationDumper
<TextTable
> {
4062 typedef OSDUtilizationDumper
<TextTable
> Parent
;
4064 OSDUtilizationPlainDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap
,
4065 const PGStatService
*pgs
, bool tree
) :
4066 Parent(crush
, osdmap
, pgs
, tree
) {}
4068 void dump(TextTable
*tbl
) {
4069 tbl
->define_column("ID", TextTable::LEFT
, TextTable::RIGHT
);
4070 tbl
->define_column("WEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
4071 tbl
->define_column("REWEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
4072 tbl
->define_column("SIZE", TextTable::LEFT
, TextTable::RIGHT
);
4073 tbl
->define_column("USE", TextTable::LEFT
, TextTable::RIGHT
);
4074 tbl
->define_column("AVAIL", TextTable::LEFT
, TextTable::RIGHT
);
4075 tbl
->define_column("%USE", TextTable::LEFT
, TextTable::RIGHT
);
4076 tbl
->define_column("VAR", TextTable::LEFT
, TextTable::RIGHT
);
4077 tbl
->define_column("PGS", TextTable::LEFT
, TextTable::RIGHT
);
4079 tbl
->define_column("TYPE NAME", TextTable::LEFT
, TextTable::LEFT
);
4085 *tbl
<< "" << "" << "TOTAL"
4086 << si_t(pgs
->get_osd_sum().kb
<< 10)
4087 << si_t(pgs
->get_osd_sum().kb_used
<< 10)
4088 << si_t(pgs
->get_osd_sum().kb_avail
<< 10)
4089 << lowprecision_t(average_util
)
4091 << TextTable::endrow
;
4095 struct lowprecision_t
{
4097 explicit lowprecision_t(float _v
) : v(_v
) {}
4099 friend std::ostream
&operator<<(ostream
& out
, const lowprecision_t
& v
);
4101 using OSDUtilizationDumper
<TextTable
>::dump_item
;
4102 void dump_item(const CrushTreeDumper::Item
&qi
,
4109 const size_t num_pgs
,
4110 TextTable
*tbl
) override
{
4112 << weightf_t(qi
.weight
)
4113 << weightf_t(reweight
)
4115 << si_t(kb_used
<< 10)
4116 << si_t(kb_avail
<< 10)
4117 << lowprecision_t(util
)
4118 << lowprecision_t(var
);
4120 if (qi
.is_bucket()) {
4128 for (int k
= 0; k
< qi
.depth
; k
++)
4130 if (qi
.is_bucket()) {
4131 int type
= crush
->get_bucket_type(qi
.id
);
4132 name
<< crush
->get_type_name(type
) << " "
4133 << crush
->get_item_name(qi
.id
);
4135 name
<< "osd." << qi
.id
;
4140 *tbl
<< TextTable::endrow
;
4146 out
<< "MIN/MAX VAR: " << lowprecision_t(min_var
)
4147 << "/" << lowprecision_t(max_var
) << " "
4148 << "STDDEV: " << lowprecision_t(dev());
4153 ostream
& operator<<(ostream
& out
,
4154 const OSDUtilizationPlainDumper::lowprecision_t
& v
)
4158 } else if (v
.v
< 0.001) {
4161 std::streamsize p
= out
.precision();
4162 return out
<< std::fixed
<< std::setprecision(2) << v
.v
<< std::setprecision(p
);
4166 class OSDUtilizationFormatDumper
: public OSDUtilizationDumper
<Formatter
> {
4168 typedef OSDUtilizationDumper
<Formatter
> Parent
;
4170 OSDUtilizationFormatDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap
,
4171 const PGStatService
*pgs
, bool tree
) :
4172 Parent(crush
, osdmap
, pgs
, tree
) {}
4174 void dump(Formatter
*f
) {
4175 f
->open_array_section("nodes");
4179 f
->open_array_section("stray");
4185 using OSDUtilizationDumper
<Formatter
>::dump_item
;
4186 void dump_item(const CrushTreeDumper::Item
&qi
,
4193 const size_t num_pgs
,
4194 Formatter
*f
) override
{
4195 f
->open_object_section("item");
4196 CrushTreeDumper::dump_item_fields(crush
, qi
, f
);
4197 f
->dump_float("reweight", reweight
);
4198 f
->dump_int("kb", kb
);
4199 f
->dump_int("kb_used", kb_used
);
4200 f
->dump_int("kb_avail", kb_avail
);
4201 f
->dump_float("utilization", util
);
4202 f
->dump_float("var", var
);
4203 f
->dump_unsigned("pgs", num_pgs
);
4204 CrushTreeDumper::dump_bucket_children(crush
, qi
, f
);
4209 void summary(Formatter
*f
) {
4210 f
->open_object_section("summary");
4211 f
->dump_int("total_kb", pgs
->get_osd_sum().kb
);
4212 f
->dump_int("total_kb_used", pgs
->get_osd_sum().kb_used
);
4213 f
->dump_int("total_kb_avail", pgs
->get_osd_sum().kb_avail
);
4214 f
->dump_float("average_utilization", average_util
);
4215 f
->dump_float("min_var", min_var
);
4216 f
->dump_float("max_var", max_var
);
4217 f
->dump_float("dev", dev());
4222 void print_osd_utilization(const OSDMap
& osdmap
,
4223 const PGStatService
*pgstat
,
4228 const CrushWrapper
*crush
= osdmap
.crush
.get();
4230 f
->open_object_section("df");
4231 OSDUtilizationFormatDumper
d(crush
, &osdmap
, pgstat
, tree
);
4237 OSDUtilizationPlainDumper
d(crush
, &osdmap
, pgstat
, tree
);
4240 out
<< tbl
<< d
.summary() << "\n";