1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
22 #include <fmt/format.h>
24 #include <boost/algorithm/string.hpp>
27 #include "common/config.h"
28 #include "common/errno.h"
29 #include "common/Formatter.h"
30 #include "common/TextTable.h"
31 #include "include/ceph_features.h"
32 #include "include/common_fwd.h"
33 #include "include/str_map.h"
35 #include "common/code_environment.h"
36 #include "mon/health_check.h"
38 #include "crush/CrushTreeDumper.h"
39 #include "common/Clock.h"
40 #include "mon/PGMap.h"
47 using std::ostringstream
;
51 using std::stringstream
;
52 using std::unordered_map
;
57 using ceph::Formatter
;
59 #define dout_subsys ceph_subsys_osd
61 MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap
, osdmap
, osdmap
);
62 MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental
, osdmap_inc
, osdmap
);
65 // ----------------------------------
68 void osd_info_t::dump(Formatter
*f
) const
70 f
->dump_int("last_clean_begin", last_clean_begin
);
71 f
->dump_int("last_clean_end", last_clean_end
);
72 f
->dump_int("up_from", up_from
);
73 f
->dump_int("up_thru", up_thru
);
74 f
->dump_int("down_at", down_at
);
75 f
->dump_int("lost_at", lost_at
);
78 void osd_info_t::encode(ceph::buffer::list
& bl
) const
83 encode(last_clean_begin
, bl
);
84 encode(last_clean_end
, bl
);
91 void osd_info_t::decode(ceph::buffer::list::const_iterator
& bl
)
96 decode(last_clean_begin
, bl
);
97 decode(last_clean_end
, bl
);
104 void osd_info_t::generate_test_instances(list
<osd_info_t
*>& o
)
106 o
.push_back(new osd_info_t
);
107 o
.push_back(new osd_info_t
);
108 o
.back()->last_clean_begin
= 1;
109 o
.back()->last_clean_end
= 2;
110 o
.back()->up_from
= 30;
111 o
.back()->up_thru
= 40;
112 o
.back()->down_at
= 5;
113 o
.back()->lost_at
= 6;
116 ostream
& operator<<(ostream
& out
, const osd_info_t
& info
)
118 out
<< "up_from " << info
.up_from
119 << " up_thru " << info
.up_thru
120 << " down_at " << info
.down_at
121 << " last_clean_interval [" << info
.last_clean_begin
<< "," << info
.last_clean_end
<< ")";
123 out
<< " lost_at " << info
.lost_at
;
127 // ----------------------------------
130 void osd_xinfo_t::dump(Formatter
*f
) const
132 f
->dump_stream("down_stamp") << down_stamp
;
133 f
->dump_float("laggy_probability", laggy_probability
);
134 f
->dump_int("laggy_interval", laggy_interval
);
135 f
->dump_int("features", features
);
136 f
->dump_unsigned("old_weight", old_weight
);
137 f
->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub
;
138 f
->dump_int("dead_epoch", dead_epoch
);
141 void osd_xinfo_t::encode(ceph::buffer::list
& bl
, uint64_t enc_features
) const
144 if (!HAVE_FEATURE(enc_features
, SERVER_OCTOPUS
)) {
147 ENCODE_START(v
, 1, bl
);
148 encode(down_stamp
, bl
);
149 __u32 lp
= laggy_probability
* float(0xfffffffful
);
151 encode(laggy_interval
, bl
);
152 encode(features
, bl
);
153 encode(old_weight
, bl
);
155 encode(last_purged_snaps_scrub
, bl
);
156 encode(dead_epoch
, bl
);
161 void osd_xinfo_t::decode(ceph::buffer::list::const_iterator
& bl
)
164 decode(down_stamp
, bl
);
167 laggy_probability
= (float)lp
/ (float)0xffffffff;
168 decode(laggy_interval
, bl
);
170 decode(features
, bl
);
174 decode(old_weight
, bl
);
178 decode(last_purged_snaps_scrub
, bl
);
179 decode(dead_epoch
, bl
);
186 void osd_xinfo_t::generate_test_instances(list
<osd_xinfo_t
*>& o
)
188 o
.push_back(new osd_xinfo_t
);
189 o
.push_back(new osd_xinfo_t
);
190 o
.back()->down_stamp
= utime_t(2, 3);
191 o
.back()->laggy_probability
= .123;
192 o
.back()->laggy_interval
= 123456;
193 o
.back()->old_weight
= 0x7fff;
196 ostream
& operator<<(ostream
& out
, const osd_xinfo_t
& xi
)
198 return out
<< "down_stamp " << xi
.down_stamp
199 << " laggy_probability " << xi
.laggy_probability
200 << " laggy_interval " << xi
.laggy_interval
201 << " old_weight " << xi
.old_weight
202 << " last_purged_snaps_scrub " << xi
.last_purged_snaps_scrub
203 << " dead_epoch " << xi
.dead_epoch
;
206 // ----------------------------------
207 // OSDMap::Incremental
209 int OSDMap::Incremental::get_net_marked_out(const OSDMap
*previous
) const
212 for (auto &weight
: new_weight
) {
213 if (weight
.second
== CEPH_OSD_OUT
&& !previous
->is_out(weight
.first
))
215 else if (weight
.second
!= CEPH_OSD_OUT
&& previous
->is_out(weight
.first
))
221 int OSDMap::Incremental::get_net_marked_down(const OSDMap
*previous
) const
224 for (auto &state
: new_state
) { //
225 if (state
.second
& CEPH_OSD_UP
) {
226 if (previous
->is_up(state
.first
))
235 int OSDMap::Incremental::identify_osd(uuid_d u
) const
237 for (auto &uuid
: new_uuid
)
238 if (uuid
.second
== u
)
243 int OSDMap::Incremental::propagate_base_properties_to_tiers(CephContext
*cct
,
244 const OSDMap
& osdmap
)
246 ceph_assert(epoch
== osdmap
.get_epoch() + 1);
248 for (auto &new_pool
: new_pools
) {
249 if (!new_pool
.second
.tiers
.empty()) {
250 pg_pool_t
& base
= new_pool
.second
;
252 auto new_rem_it
= new_removed_snaps
.find(new_pool
.first
);
254 for (const auto &tier_pool
: base
.tiers
) {
255 const auto &r
= new_pools
.find(tier_pool
);
257 if (r
== new_pools
.end()) {
258 const pg_pool_t
*orig
= osdmap
.get_pg_pool(tier_pool
);
260 lderr(cct
) << __func__
<< " no pool " << tier_pool
<< dendl
;
263 tier
= get_new_pool(tier_pool
, orig
);
267 if (tier
->tier_of
!= new_pool
.first
) {
268 lderr(cct
) << __func__
<< " " << r
->first
<< " tier_of != " << new_pool
.first
<< dendl
;
272 ldout(cct
, 10) << __func__
<< " from " << new_pool
.first
<< " to "
273 << tier_pool
<< dendl
;
274 tier
->snap_seq
= base
.snap_seq
;
275 tier
->snap_epoch
= base
.snap_epoch
;
276 tier
->snaps
= base
.snaps
;
277 tier
->removed_snaps
= base
.removed_snaps
;
278 tier
->flags
|= base
.flags
& (pg_pool_t::FLAG_SELFMANAGED_SNAPS
|
279 pg_pool_t::FLAG_POOL_SNAPS
);
281 if (new_rem_it
!= new_removed_snaps
.end()) {
282 new_removed_snaps
[tier_pool
] = new_rem_it
->second
;
285 tier
->application_metadata
= base
.application_metadata
;
292 // ----------------------------------
295 bool OSDMap::subtree_is_down(int id
, set
<int> *down_cache
) const
301 down_cache
->count(id
)) {
306 crush
->get_children(id
, &children
);
307 for (const auto &child
: children
) {
308 if (!subtree_is_down(child
, down_cache
)) {
313 down_cache
->insert(id
);
318 bool OSDMap::containing_subtree_is_down(CephContext
*cct
, int id
, int subtree_type
, set
<int> *down_cache
) const
320 // use a stack-local down_cache if we didn't get one from the
321 // caller. then at least this particular call will avoid duplicated
323 set
<int> local_down_cache
;
325 down_cache
= &local_down_cache
;
334 type
= crush
->get_bucket_type(current
);
336 ceph_assert(type
>= 0);
338 if (!subtree_is_down(current
, down_cache
)) {
339 ldout(cct
, 30) << "containing_subtree_is_down(" << id
<< ") = false" << dendl
;
343 // is this a big enough subtree to be marked as down?
344 if (type
>= subtree_type
) {
345 ldout(cct
, 30) << "containing_subtree_is_down(" << id
<< ") = true ... " << type
<< " >= " << subtree_type
<< dendl
;
349 int r
= crush
->get_immediate_parent_id(current
, ¤t
);
356 bool OSDMap::subtree_type_is_down(
360 set
<int> *down_in_osds
,
361 set
<int> *up_in_osds
,
362 set
<int> *subtree_up
,
363 unordered_map
<int, set
<int> > *subtree_type_down
) const
366 bool is_down_ret
= is_down(id
);
369 down_in_osds
->insert(id
);
371 up_in_osds
->insert(id
);
377 if (subtree_type_down
&&
378 (*subtree_type_down
)[subtree_type
].count(id
)) {
383 crush
->get_children(id
, &children
);
384 for (const auto &child
: children
) {
385 if (!subtree_type_is_down(
386 cct
, child
, crush
->get_bucket_type(child
),
387 down_in_osds
, up_in_osds
, subtree_up
, subtree_type_down
)) {
388 subtree_up
->insert(id
);
392 if (subtree_type_down
) {
393 (*subtree_type_down
)[subtree_type
].insert(id
);
398 void OSDMap::Incremental::encode_client_old(ceph::buffer::list
& bl
) const
405 encode(modified
, bl
);
406 int32_t new_t
= new_pool_max
;
408 encode(new_flags
, bl
);
412 encode(new_max_osd
, bl
);
413 // for encode(new_pools, bl);
414 __u32 n
= new_pools
.size();
416 for (const auto &new_pool
: new_pools
) {
419 encode(new_pool
.second
, bl
, 0);
421 // for encode(new_pool_names, bl);
422 n
= new_pool_names
.size();
425 for (const auto &new_pool_name
: new_pool_names
) {
426 n
= new_pool_name
.first
;
428 encode(new_pool_name
.second
, bl
);
430 // for encode(old_pools, bl);
431 n
= old_pools
.size();
433 for (auto &old_pool
: old_pools
) {
437 encode(new_up_client
, bl
, 0);
439 // legacy is map<int32_t,uint8_t>
440 map
<int32_t, uint8_t> os
;
441 for (auto p
: new_state
) {
442 // new_state may only include some new flags (e.g., CEPH_OSD_NOOUT)
443 // that an old client could not understand.
445 uint8_t s
= p
.second
;
446 if (p
.second
!= 0 && s
== 0)
450 uint32_t n
= os
.size();
454 encode(p
.second
, bl
);
457 encode(new_weight
, bl
);
458 // for encode(new_pg_temp, bl);
459 n
= new_pg_temp
.size();
462 for (const auto &pg_temp
: new_pg_temp
) {
463 old_pg_t opg
= pg_temp
.first
.get_old_pg();
465 encode(pg_temp
.second
, bl
);
469 void OSDMap::Incremental::encode_classic(ceph::buffer::list
& bl
, uint64_t features
) const
472 if ((features
& CEPH_FEATURE_PGID64
) == 0) {
473 encode_client_old(bl
);
482 encode(modified
, bl
);
483 encode(new_pool_max
, bl
);
484 encode(new_flags
, bl
);
488 encode(new_max_osd
, bl
);
489 encode(new_pools
, bl
, features
);
490 encode(new_pool_names
, bl
);
491 encode(old_pools
, bl
);
492 encode(new_up_client
, bl
, features
);
494 map
<int32_t, uint8_t> os
;
495 for (auto p
: new_state
) {
496 // new_state may only include some new flags (e.g., CEPH_OSD_NOOUT)
497 // that an old client could not understand.
499 uint8_t s
= p
.second
;
500 if (p
.second
!= 0 && s
== 0)
504 uint32_t n
= os
.size();
508 encode(p
.second
, bl
);
511 encode(new_weight
, bl
);
512 encode(new_pg_temp
, bl
);
517 encode(new_hb_back_up
, bl
, features
);
518 encode(new_up_thru
, bl
);
519 encode(new_last_clean_interval
, bl
);
520 encode(new_lost
, bl
);
521 encode(new_blocklist
, bl
, features
);
522 encode(old_blocklist
, bl
, features
);
523 encode(new_up_cluster
, bl
, features
);
524 encode(cluster_snapshot
, bl
);
525 encode(new_uuid
, bl
);
526 encode(new_xinfo
, bl
, features
);
527 encode(new_hb_front_up
, bl
, features
);
531 static void encode_addrvec_map_as_addr(const T
& m
, ceph::buffer::list
& bl
, uint64_t f
)
533 uint32_t n
= m
.size();
537 encode(i
.second
.legacy_addr(), bl
, f
);
542 static void encode_addrvec_pvec_as_addr(const T
& m
, ceph::buffer::list
& bl
, uint64_t f
)
544 uint32_t n
= m
.size();
548 encode(i
->legacy_addr(), bl
, f
);
550 encode(entity_addr_t(), bl
, f
);
555 /* for a description of osdmap incremental versions, and when they were
556 * introduced, please refer to
557 * doc/dev/osd_internals/osdmap_versions.txt
559 void OSDMap::Incremental::encode(ceph::buffer::list
& bl
, uint64_t features
) const
562 if ((features
& CEPH_FEATURE_OSDMAP_ENC
) == 0) {
563 encode_classic(bl
, features
);
567 // only a select set of callers should *ever* be encoding new
568 // OSDMaps. others should be passing around the canonical encoded
569 // buffers from on high. select out those callers by passing in an
570 // "impossible" feature bit.
571 ceph_assert(features
& CEPH_FEATURE_RESERVED
);
572 features
&= ~CEPH_FEATURE_RESERVED
;
574 size_t start_offset
= bl
.length();
577 std::optional
<ceph::buffer::list::contiguous_filler
> crc_filler
;
579 // meta-encoding: how we include client-used and osd-specific data
580 ENCODE_START(8, 7, bl
);
584 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
586 } else if (!HAVE_FEATURE(features
, SERVER_MIMIC
)) {
588 } else if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
590 } /* else if (!HAVE_FEATURE(features, SERVER_REEF)) {
593 ENCODE_START(v
, 1, bl
); // client-usable data
596 encode(modified
, bl
);
597 encode(new_pool_max
, bl
);
598 encode(new_flags
, bl
);
602 encode(new_max_osd
, bl
);
603 encode(new_pools
, bl
, features
);
604 encode(new_pool_names
, bl
);
605 encode(old_pools
, bl
);
607 encode(new_up_client
, bl
, features
);
609 encode_addrvec_map_as_addr(new_up_client
, bl
, features
);
612 encode(new_state
, bl
);
614 map
<int32_t, uint8_t> os
;
615 for (auto p
: new_state
) {
616 // new_state may only include some new flags (e.g., CEPH_OSD_NOOUT)
617 // that an old client could not understand.
619 uint8_t s
= p
.second
;
620 if (p
.second
!= 0 && s
== 0)
624 uint32_t n
= os
.size();
628 encode(p
.second
, bl
);
631 encode(new_weight
, bl
);
632 encode(new_pg_temp
, bl
);
633 encode(new_primary_temp
, bl
);
634 encode(new_primary_affinity
, bl
);
635 encode(new_erasure_code_profiles
, bl
);
636 encode(old_erasure_code_profiles
, bl
);
638 encode(new_pg_upmap
, bl
);
639 encode(old_pg_upmap
, bl
);
640 encode(new_pg_upmap_items
, bl
);
641 encode(old_pg_upmap_items
, bl
);
644 encode(new_removed_snaps
, bl
);
645 encode(new_purged_snaps
, bl
);
648 encode(new_last_up_change
, bl
);
649 encode(new_last_in_change
, bl
);
652 encode(new_pg_upmap_primary
, bl
);
653 encode(old_pg_upmap_primary
, bl
);
655 ENCODE_FINISH(bl
); // client-usable data
659 uint8_t target_v
= 9; // if bumping this, be aware of allow_crimson 12
660 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
662 } else if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
665 if (change_stretch_mode
) {
666 target_v
= std::max((uint8_t)10, target_v
);
668 if (!new_range_blocklist
.empty() ||
669 !old_range_blocklist
.empty()) {
670 target_v
= std::max((uint8_t)11, target_v
);
672 if (mutate_allow_crimson
!= mutate_allow_crimson_t::NONE
) {
673 target_v
= std::max((uint8_t)12, target_v
);
675 ENCODE_START(target_v
, 1, bl
); // extended, osd-only data
677 encode_addrvec_map_as_addr(new_hb_back_up
, bl
, features
);
679 encode(new_hb_back_up
, bl
, features
);
681 encode(new_up_thru
, bl
);
682 encode(new_last_clean_interval
, bl
);
683 encode(new_lost
, bl
);
684 encode(new_blocklist
, bl
, features
);
685 encode(old_blocklist
, bl
, features
);
687 encode_addrvec_map_as_addr(new_up_cluster
, bl
, features
);
689 encode(new_up_cluster
, bl
, features
);
691 encode(cluster_snapshot
, bl
);
692 encode(new_uuid
, bl
);
693 encode(new_xinfo
, bl
, features
);
695 encode_addrvec_map_as_addr(new_hb_front_up
, bl
, features
);
697 encode(new_hb_front_up
, bl
, features
);
699 encode(features
, bl
); // NOTE: features arg, not the member
701 encode(new_nearfull_ratio
, bl
);
702 encode(new_full_ratio
, bl
);
703 encode(new_backfillfull_ratio
, bl
);
705 // 5 was string-based new_require_min_compat_client
707 encode(new_require_min_compat_client
, bl
);
708 encode(new_require_osd_release
, bl
);
711 encode(new_crush_node_flags
, bl
);
714 encode(new_device_class_flags
, bl
);
716 if (target_v
>= 10) {
717 encode(change_stretch_mode
, bl
);
718 encode(new_stretch_bucket_count
, bl
);
719 encode(new_degraded_stretch_mode
, bl
);
720 encode(new_recovering_stretch_mode
, bl
);
721 encode(new_stretch_mode_bucket
, bl
);
722 encode(stretch_mode_enabled
, bl
);
724 if (target_v
>= 11) {
725 encode(new_range_blocklist
, bl
, features
);
726 encode(old_range_blocklist
, bl
, features
);
728 if (target_v
>= 12) {
729 encode(mutate_allow_crimson
, bl
);
731 ENCODE_FINISH(bl
); // osd-only data
734 crc_offset
= bl
.length();
735 crc_filler
= bl
.append_hole(sizeof(uint32_t));
736 tail_offset
= bl
.length();
738 encode(full_crc
, bl
);
740 ENCODE_FINISH(bl
); // meta-encoding wrapper
743 ceph::buffer::list front
;
744 front
.substr_of(bl
, start_offset
, crc_offset
- start_offset
);
745 inc_crc
= front
.crc32c(-1);
746 ceph::buffer::list tail
;
747 tail
.substr_of(bl
, tail_offset
, bl
.length() - tail_offset
);
748 inc_crc
= tail
.crc32c(inc_crc
);
751 crc_filler
->copy_in(4u, (char*)&crc_le
);
755 void OSDMap::Incremental::decode_classic(ceph::buffer::list::const_iterator
&p
)
765 if (v
== 4 || v
== 5) {
769 decode(new_pool_max
, p
);
770 decode(new_flags
, p
);
774 decode(new_max_osd
, p
);
780 decode(new_pools
[t
], p
);
783 decode(new_pools
, p
);
786 new_pool_names
.clear();
790 decode(new_pool_names
[t
], p
);
793 decode(new_pool_names
, p
);
803 decode(old_pools
, p
);
805 decode(new_up_client
, p
);
807 map
<int32_t,uint8_t> ns
;
810 new_state
[q
.first
] = q
.second
;
813 decode(new_weight
, p
);
820 ceph::decode_raw(opg
, p
);
821 decode(new_pg_temp
[pg_t(opg
)], p
);
824 decode(new_pg_temp
, p
);
827 // decode short map, too.
828 if (v
== 5 && p
.end())
835 decode(new_hb_back_up
, p
);
837 decode(new_pool_names
, p
);
838 decode(new_up_thru
, p
);
839 decode(new_last_clean_interval
, p
);
841 decode(new_blocklist
, p
);
842 decode(old_blocklist
, p
);
844 decode(new_up_cluster
, p
);
846 decode(cluster_snapshot
, p
);
850 decode(new_xinfo
, p
);
852 decode(new_hb_front_up
, p
);
855 /* for a description of osdmap incremental versions, and when they were
856 * introduced, please refer to
857 * doc/dev/osd_internals/osdmap_versions.txt
859 void OSDMap::Incremental::decode(ceph::buffer::list::const_iterator
& bl
)
863 * Older encodings of the Incremental had a single struct_v which
864 * covered the whole encoding, and was prior to our modern
865 * stuff which includes a compatv and a size. So if we see
866 * a struct_v < 7, we must rewind to the beginning and use our
869 size_t start_offset
= bl
.get_off();
870 size_t tail_offset
= 0;
871 ceph::buffer::list crc_front
, crc_tail
;
873 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl
); // wrapper
875 bl
.seek(start_offset
);
879 encode_features
= CEPH_FEATURE_PGID64
;
885 DECODE_START(8, bl
); // client-usable data
888 decode(modified
, bl
);
889 decode(new_pool_max
, bl
);
890 decode(new_flags
, bl
);
894 decode(new_max_osd
, bl
);
895 decode(new_pools
, bl
);
896 decode(new_pool_names
, bl
);
897 decode(old_pools
, bl
);
898 decode(new_up_client
, bl
);
900 decode(new_state
, bl
);
902 map
<int32_t,uint8_t> ns
;
905 new_state
[q
.first
] = q
.second
;
908 decode(new_weight
, bl
);
909 decode(new_pg_temp
, bl
);
910 decode(new_primary_temp
, bl
);
912 decode(new_primary_affinity
, bl
);
914 new_primary_affinity
.clear();
916 decode(new_erasure_code_profiles
, bl
);
917 decode(old_erasure_code_profiles
, bl
);
919 new_erasure_code_profiles
.clear();
920 old_erasure_code_profiles
.clear();
923 decode(new_pg_upmap
, bl
);
924 decode(old_pg_upmap
, bl
);
925 decode(new_pg_upmap_items
, bl
);
926 decode(old_pg_upmap_items
, bl
);
929 decode(new_removed_snaps
, bl
);
930 decode(new_purged_snaps
, bl
);
933 decode(new_last_up_change
, bl
);
934 decode(new_last_in_change
, bl
);
936 DECODE_FINISH(bl
); // client-usable data
940 DECODE_START(10, bl
); // extended, osd-only data
941 decode(new_hb_back_up
, bl
);
942 decode(new_up_thru
, bl
);
943 decode(new_last_clean_interval
, bl
);
944 decode(new_lost
, bl
);
945 decode(new_blocklist
, bl
);
946 decode(old_blocklist
, bl
);
947 decode(new_up_cluster
, bl
);
948 decode(cluster_snapshot
, bl
);
949 decode(new_uuid
, bl
);
950 decode(new_xinfo
, bl
);
951 decode(new_hb_front_up
, bl
);
953 decode(encode_features
, bl
);
955 encode_features
= CEPH_FEATURE_PGID64
| CEPH_FEATURE_OSDMAP_ENC
;
957 decode(new_nearfull_ratio
, bl
);
958 decode(new_full_ratio
, bl
);
960 new_nearfull_ratio
= -1;
964 decode(new_backfillfull_ratio
, bl
);
966 new_backfillfull_ratio
= -1;
972 new_require_min_compat_client
= ceph_release_from_name(r
);
976 decode(new_require_min_compat_client
, bl
);
977 decode(new_require_osd_release
, bl
);
979 if (new_flags
>= 0 && (new_flags
& CEPH_OSDMAP_REQUIRE_LUMINOUS
)) {
980 // only for compat with post-kraken pre-luminous test clusters
981 new_require_osd_release
= ceph_release_t::luminous
;
982 new_flags
&= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS
);
983 } else if (new_flags
>= 0 && (new_flags
& CEPH_OSDMAP_REQUIRE_KRAKEN
)) {
984 new_require_osd_release
= ceph_release_t::kraken
;
985 } else if (new_flags
>= 0 && (new_flags
& CEPH_OSDMAP_REQUIRE_JEWEL
)) {
986 new_require_osd_release
= ceph_release_t::jewel
;
988 new_require_osd_release
= ceph_release_t::unknown
;
992 decode(new_crush_node_flags
, bl
);
995 decode(new_device_class_flags
, bl
);
997 if (struct_v
>= 10) {
998 decode(change_stretch_mode
, bl
);
999 decode(new_stretch_bucket_count
, bl
);
1000 decode(new_degraded_stretch_mode
, bl
);
1001 decode(new_recovering_stretch_mode
, bl
);
1002 decode(new_stretch_mode_bucket
, bl
);
1003 decode(stretch_mode_enabled
, bl
);
1005 if (struct_v
>= 11) {
1006 decode(new_range_blocklist
, bl
);
1007 decode(old_range_blocklist
, bl
);
1009 if (struct_v
>= 12) {
1010 decode(mutate_allow_crimson
, bl
);
1012 DECODE_FINISH(bl
); // osd-only data
1015 if (struct_v
>= 8) {
1017 crc_front
.substr_of(bl
.get_bl(), start_offset
, bl
.get_off() - start_offset
);
1018 decode(inc_crc
, bl
);
1019 tail_offset
= bl
.get_off();
1020 decode(full_crc
, bl
);
1027 DECODE_FINISH(bl
); // wrapper
1031 uint32_t actual
= crc_front
.crc32c(-1);
1032 if (tail_offset
< bl
.get_off()) {
1033 ceph::buffer::list tail
;
1034 tail
.substr_of(bl
.get_bl(), tail_offset
, bl
.get_off() - tail_offset
);
1035 actual
= tail
.crc32c(actual
);
1037 if (inc_crc
!= actual
) {
1039 ss
<< "bad crc, actual " << actual
<< " != expected " << inc_crc
;
1040 string s
= ss
.str();
1041 throw ceph::buffer::malformed_input(s
.c_str());
1046 void OSDMap::Incremental::dump(Formatter
*f
) const
1048 f
->dump_int("epoch", epoch
);
1049 f
->dump_stream("fsid") << fsid
;
1050 f
->dump_stream("modified") << modified
;
1051 f
->dump_stream("new_last_up_change") << new_last_up_change
;
1052 f
->dump_stream("new_last_in_change") << new_last_in_change
;
1053 f
->dump_int("new_pool_max", new_pool_max
);
1054 f
->dump_int("new_flags", new_flags
);
1055 f
->dump_float("new_full_ratio", new_full_ratio
);
1056 f
->dump_float("new_nearfull_ratio", new_nearfull_ratio
);
1057 f
->dump_float("new_backfillfull_ratio", new_backfillfull_ratio
);
1058 f
->dump_int("new_require_min_compat_client", to_integer
<int>(new_require_min_compat_client
));
1059 f
->dump_int("new_require_osd_release", to_integer
<int>(new_require_osd_release
));
1060 f
->dump_unsigned("mutate_allow_crimson", static_cast<unsigned>(mutate_allow_crimson
));
1062 if (fullmap
.length()) {
1063 f
->open_object_section("full_map");
1065 ceph::buffer::list fbl
= fullmap
; // kludge around constness.
1066 auto p
= fbl
.cbegin();
1071 if (crush
.length()) {
1072 f
->open_object_section("crush");
1074 ceph::buffer::list tbl
= crush
; // kludge around constness.
1075 auto p
= tbl
.cbegin();
1081 f
->dump_int("new_max_osd", new_max_osd
);
1083 f
->open_array_section("new_pools");
1085 for (const auto &new_pool
: new_pools
) {
1086 f
->open_object_section("pool");
1087 f
->dump_int("pool", new_pool
.first
);
1088 new_pool
.second
.dump(f
);
1092 f
->open_array_section("new_pool_names");
1094 for (const auto &new_pool_name
: new_pool_names
) {
1095 f
->open_object_section("pool_name");
1096 f
->dump_int("pool", new_pool_name
.first
);
1097 f
->dump_string("name", new_pool_name
.second
);
1101 f
->open_array_section("old_pools");
1103 for (const auto &old_pool
: old_pools
)
1104 f
->dump_int("pool", old_pool
);
1107 f
->open_array_section("new_up_osds");
1109 for (const auto &upclient
: new_up_client
) {
1110 f
->open_object_section("osd");
1111 f
->dump_int("osd", upclient
.first
);
1112 f
->dump_stream("public_addr") << upclient
.second
.legacy_addr();
1113 f
->dump_object("public_addrs", upclient
.second
);
1114 if (auto p
= new_up_cluster
.find(upclient
.first
);
1115 p
!= new_up_cluster
.end()) {
1116 f
->dump_stream("cluster_addr") << p
->second
.legacy_addr();
1117 f
->dump_object("cluster_addrs", p
->second
);
1119 if (auto p
= new_hb_back_up
.find(upclient
.first
);
1120 p
!= new_hb_back_up
.end()) {
1121 f
->dump_object("heartbeat_back_addrs", p
->second
);
1123 if (auto p
= new_hb_front_up
.find(upclient
.first
);
1124 p
!= new_hb_front_up
.end()) {
1125 f
->dump_object("heartbeat_front_addrs", p
->second
);
1131 f
->open_array_section("new_weight");
1133 for (const auto &weight
: new_weight
) {
1134 f
->open_object_section("osd");
1135 f
->dump_int("osd", weight
.first
);
1136 f
->dump_int("weight", weight
.second
);
1141 f
->open_array_section("osd_state_xor");
1142 for (const auto &ns
: new_state
) {
1143 f
->open_object_section("osd");
1144 f
->dump_int("osd", ns
.first
);
1146 calc_state_set(new_state
.find(ns
.first
)->second
, st
);
1147 f
->open_array_section("state_xor");
1148 for (auto &state
: st
)
1149 f
->dump_string("state", state
);
1155 f
->open_array_section("new_pg_temp");
1157 for (const auto &pg_temp
: new_pg_temp
) {
1158 f
->open_object_section("pg");
1159 f
->dump_stream("pgid") << pg_temp
.first
;
1160 f
->open_array_section("osds");
1162 for (const auto &osd
: pg_temp
.second
)
1163 f
->dump_int("osd", osd
);
1169 f
->open_array_section("primary_temp");
1171 for (const auto &primary_temp
: new_primary_temp
) {
1172 f
->dump_stream("pgid") << primary_temp
.first
;
1173 f
->dump_int("osd", primary_temp
.second
);
1175 f
->close_section(); // primary_temp
1177 f
->open_array_section("new_pg_upmap");
1178 for (auto& i
: new_pg_upmap
) {
1179 f
->open_object_section("mapping");
1180 f
->dump_stream("pgid") << i
.first
;
1181 f
->open_array_section("osds");
1182 for (auto osd
: i
.second
) {
1183 f
->dump_int("osd", osd
);
1189 f
->open_array_section("old_pg_upmap");
1190 for (auto& i
: old_pg_upmap
) {
1191 f
->dump_stream("pgid") << i
;
1195 f
->open_array_section("new_pg_upmap_items");
1196 for (auto& i
: new_pg_upmap_items
) {
1197 f
->open_object_section("mapping");
1198 f
->dump_stream("pgid") << i
.first
;
1199 f
->open_array_section("mappings");
1200 for (auto& p
: i
.second
) {
1201 f
->open_object_section("mapping");
1202 f
->dump_int("from", p
.first
);
1203 f
->dump_int("to", p
.second
);
1210 f
->open_array_section("old_pg_upmap_items");
1211 for (auto& i
: old_pg_upmap_items
) {
1212 f
->dump_stream("pgid") << i
;
1216 // dump upmap_primaries
1217 f
->open_array_section("new_pg_upmap_primaries");
1218 for (auto& [pg
, osd
] : new_pg_upmap_primary
) {
1219 f
->open_object_section("primary_mapping");
1220 f
->dump_stream("pgid") << pg
;
1221 f
->dump_int("primary_osd", osd
);
1224 f
->close_section(); // new_pg_upmap_primaries
1226 // dump old_pg_upmap_primaries (removed primary mappings)
1227 f
->open_array_section("old_pg_upmap_primaries");
1228 for (auto& pg
: old_pg_upmap_primary
) {
1229 f
->dump_stream("pgid") << pg
;
1231 f
->close_section(); // old_pg_upmap_primaries
1233 f
->open_array_section("new_up_thru");
1235 for (const auto &up_thru
: new_up_thru
) {
1236 f
->open_object_section("osd");
1237 f
->dump_int("osd", up_thru
.first
);
1238 f
->dump_int("up_thru", up_thru
.second
);
1243 f
->open_array_section("new_lost");
1245 for (const auto &lost
: new_lost
) {
1246 f
->open_object_section("osd");
1247 f
->dump_int("osd", lost
.first
);
1248 f
->dump_int("epoch_lost", lost
.second
);
1253 f
->open_array_section("new_last_clean_interval");
1255 for (const auto &last_clean_interval
: new_last_clean_interval
) {
1256 f
->open_object_section("osd");
1257 f
->dump_int("osd", last_clean_interval
.first
);
1258 f
->dump_int("first", last_clean_interval
.second
.first
);
1259 f
->dump_int("last", last_clean_interval
.second
.second
);
1264 f
->open_array_section("new_blocklist");
1265 for (const auto &blist
: new_blocklist
) {
1268 f
->dump_stream(ss
.str().c_str()) << blist
.second
;
1271 f
->open_array_section("old_blocklist");
1272 for (const auto &blist
: old_blocklist
)
1273 f
->dump_stream("addr") << blist
;
1275 f
->open_array_section("new_range_blocklist");
1276 for (const auto &blist
: new_range_blocklist
) {
1279 f
->dump_stream(ss
.str().c_str()) << blist
.second
;
1282 f
->open_array_section("old_range_blocklist");
1283 for (const auto &blist
: old_range_blocklist
)
1284 f
->dump_stream("addr") << blist
;
1287 f
->open_array_section("new_xinfo");
1288 for (const auto &xinfo
: new_xinfo
) {
1289 f
->open_object_section("xinfo");
1290 f
->dump_int("osd", xinfo
.first
);
1291 xinfo
.second
.dump(f
);
1296 if (cluster_snapshot
.size())
1297 f
->dump_string("cluster_snapshot", cluster_snapshot
);
1299 f
->open_array_section("new_uuid");
1300 for (const auto &uuid
: new_uuid
) {
1301 f
->open_object_section("osd");
1302 f
->dump_int("osd", uuid
.first
);
1303 f
->dump_stream("uuid") << uuid
.second
;
1308 OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles
, f
);
1309 f
->open_array_section("old_erasure_code_profiles");
1310 for (const auto &erasure_code_profile
: old_erasure_code_profiles
) {
1311 f
->dump_string("old", erasure_code_profile
);
1315 f
->open_array_section("new_removed_snaps");
1316 for (auto& p
: new_removed_snaps
) {
1317 f
->open_object_section("pool");
1318 f
->dump_int("pool", p
.first
);
1319 f
->open_array_section("snaps");
1320 for (auto q
= p
.second
.begin(); q
!= p
.second
.end(); ++q
) {
1321 f
->open_object_section("interval");
1322 f
->dump_unsigned("begin", q
.get_start());
1323 f
->dump_unsigned("length", q
.get_len());
1330 f
->open_array_section("new_purged_snaps");
1331 for (auto& p
: new_purged_snaps
) {
1332 f
->open_object_section("pool");
1333 f
->dump_int("pool", p
.first
);
1334 f
->open_array_section("snaps");
1335 for (auto q
= p
.second
.begin(); q
!= p
.second
.end(); ++q
) {
1336 f
->open_object_section("interval");
1337 f
->dump_unsigned("begin", q
.get_start());
1338 f
->dump_unsigned("length", q
.get_len());
1344 f
->open_array_section("new_crush_node_flags");
1345 for (auto& i
: new_crush_node_flags
) {
1346 f
->open_object_section("node");
1347 f
->dump_int("id", i
.first
);
1349 calc_state_set(i
.second
, st
);
1350 for (auto& j
: st
) {
1351 f
->dump_string("flag", j
);
1356 f
->open_array_section("new_device_class_flags");
1357 for (auto& i
: new_device_class_flags
) {
1358 f
->open_object_section("device_class");
1359 f
->dump_int("id", i
.first
);
1361 calc_state_set(i
.second
, st
);
1362 for (auto& j
: st
) {
1363 f
->dump_string("flag", j
);
1368 f
->open_object_section("stretch_mode");
1370 f
->dump_bool("change_stretch_mode", change_stretch_mode
);
1371 f
->dump_bool("stretch_mode_enabled", stretch_mode_enabled
);
1372 f
->dump_unsigned("new_stretch_bucket_count", new_stretch_bucket_count
);
1373 f
->dump_unsigned("new_degraded_stretch_mode", new_degraded_stretch_mode
);
1374 f
->dump_unsigned("new_recovering_stretch_mode", new_recovering_stretch_mode
);
1375 f
->dump_int("new_stretch_mode_bucket", new_stretch_mode_bucket
);
1381 void OSDMap::Incremental::generate_test_instances(list
<Incremental
*>& o
)
1383 o
.push_back(new Incremental
);
1386 // ----------------------------------
1389 void OSDMap::set_epoch(epoch_t e
)
1392 for (auto &pool
: pools
)
1393 pool
.second
.last_change
= e
;
1396 OSDMap::range_bits::range_bits() : ipv6(false) {
1397 memset(&bits
, 0, sizeof(bits
));
1400 OSDMap::range_bits::range_bits(const entity_addr_t
& addr
) : ipv6(false) {
1401 memset(&bits
, 0, sizeof(bits
));
1405 void OSDMap::range_bits::get_ipv6_bytes(unsigned const char *addr
,
1406 uint64_t *upper
, uint64_t *lower
)
1408 *upper
= ((uint64_t)(ntohl(*(uint32_t*)(addr
)))) << 32 |
1409 ((uint64_t)(ntohl(*(uint32_t*)(&addr
[4]))));
1410 *lower
= ((uint64_t)(ntohl(*(uint32_t*)(&addr
[8])))) << 32 |
1411 ((uint64_t)(ntohl(*(uint32_t*)(&addr
[12]))));
1414 void OSDMap::range_bits::parse(const entity_addr_t
& addr
) {
1415 // parse it into meaningful data
1416 if (addr
.is_ipv6()) {
1417 get_ipv6_bytes(addr
.in6_addr().sin6_addr
.s6_addr
,
1418 &bits
.ipv6
.upper_64_bits
, &bits
.ipv6
.lower_64_bits
);
1419 int32_t lower_shift
= std::min(128-
1420 static_cast<int32_t>(addr
.get_nonce()), 64);
1421 int32_t upper_shift
= std::max(64- //(128-b.first.get_nonce())-64
1422 static_cast<int32_t>(addr
.get_nonce()), 0);
1424 auto get_mask
= [](int32_t shift
) -> uint64_t {
1425 if (shift
>= 0 && shift
< 64) {
1426 return UINT64_MAX
<< shift
;
1431 bits
.ipv6
.lower_mask
= get_mask(lower_shift
);
1432 bits
.ipv6
.upper_mask
= get_mask(upper_shift
);
1434 } else if (addr
.is_ipv4()) {
1435 bits
.ipv4
.ip_32_bits
= ntohl(addr
.in4_addr().sin_addr
.s_addr
);
1436 if (addr
.get_nonce() > 0) {
1437 bits
.ipv4
.mask
= UINT32_MAX
<< (32-addr
.get_nonce());
1446 bool OSDMap::range_bits::matches(const entity_addr_t
& addr
) const {
1447 if (addr
.is_ipv4() && !ipv6
) {
1448 return ((ntohl(addr
.in4_addr().sin_addr
.s_addr
) & bits
.ipv4
.mask
) ==
1449 (bits
.ipv4
.ip_32_bits
& bits
.ipv4
.mask
));
1450 } else if (addr
.is_ipv6() && ipv6
) {
1451 uint64_t upper_64
, lower_64
;
1452 get_ipv6_bytes(addr
.in6_addr().sin6_addr
.s6_addr
, &upper_64
, &lower_64
);
1453 return (((upper_64
& bits
.ipv6
.upper_mask
) ==
1454 (bits
.ipv6
.upper_64_bits
& bits
.ipv6
.upper_mask
)) &&
1455 ((lower_64
& bits
.ipv6
.lower_mask
) ==
1456 (bits
.ipv6
.lower_64_bits
& bits
.ipv6
.lower_mask
)));
1461 bool OSDMap::is_blocklisted(const entity_addr_t
& orig
, CephContext
*cct
) const
1463 if (cct
) ldout(cct
, 25) << "is_blocklisted: " << orig
<< dendl
;
1464 if (blocklist
.empty() && range_blocklist
.empty()) {
1465 if (cct
) ldout(cct
, 30) << "not blocklisted: " << orig
<< dendl
;
1469 // all blocklist entries are type ANY for nautilus+
1470 // FIXME: avoid this copy!
1471 entity_addr_t a
= orig
;
1472 if (require_osd_release
< ceph_release_t::nautilus
) {
1473 a
.set_type(entity_addr_t::TYPE_LEGACY
);
1475 a
.set_type(entity_addr_t::TYPE_ANY
);
1478 // this specific instance?
1479 if (blocklist
.count(a
)) {
1480 if (cct
) ldout(cct
, 20) << "blocklist contains " << a
<< dendl
;
1484 // is entire ip blocklisted?
1488 if (blocklist
.count(a
)) {
1489 if (cct
) ldout(cct
, 20) << "blocklist contains " << a
<< dendl
;
1494 // is it in a blocklisted range?
1495 for (const auto& i
: calculated_ranges
) {
1496 bool blocked
= i
.second
.matches(a
);
1498 if (cct
) ldout(cct
, 20) << "range_blocklist contains " << a
<< dendl
;
1503 if (cct
) ldout(cct
, 25) << "not blocklisted: " << orig
<< dendl
;
1507 bool OSDMap::is_blocklisted(const entity_addrvec_t
& av
, CephContext
*cct
) const
1509 if (blocklist
.empty() && range_blocklist
.empty())
1512 for (auto& a
: av
.v
) {
1513 if (is_blocklisted(a
, cct
)) {
1521 void OSDMap::get_blocklist(list
<pair
<entity_addr_t
,utime_t
> > *bl
,
1522 std::list
<std::pair
<entity_addr_t
,utime_t
> > *rl
) const
1524 std::copy(blocklist
.begin(), blocklist
.end(), std::back_inserter(*bl
));
1525 std::copy(range_blocklist
.begin(), range_blocklist
.end(),
1526 std::back_inserter(*rl
));
1529 void OSDMap::get_blocklist(std::set
<entity_addr_t
> *bl
,
1530 std::set
<entity_addr_t
> *rl
) const
1532 for (const auto &i
: blocklist
) {
1533 bl
->insert(i
.first
);
1535 for (const auto &i
: range_blocklist
) {
1536 rl
->insert(i
.first
);
1540 void OSDMap::set_max_osd(int m
)
1543 osd_state
.resize(max_osd
, 0);
1544 osd_weight
.resize(max_osd
, CEPH_OSD_OUT
);
1545 osd_info
.resize(max_osd
);
1546 osd_xinfo
.resize(max_osd
);
1547 osd_addrs
->client_addrs
.resize(max_osd
);
1548 osd_addrs
->cluster_addrs
.resize(max_osd
);
1549 osd_addrs
->hb_back_addrs
.resize(max_osd
);
1550 osd_addrs
->hb_front_addrs
.resize(max_osd
);
1551 osd_uuid
->resize(max_osd
);
1552 if (osd_primary_affinity
)
1553 osd_primary_affinity
->resize(max_osd
, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
);
1558 int OSDMap::calc_num_osds()
1563 for (int i
=0; i
<max_osd
; i
++) {
1564 if (osd_state
[i
] & CEPH_OSD_EXISTS
) {
1566 if (osd_state
[i
] & CEPH_OSD_UP
) {
1569 if (get_weight(i
) != CEPH_OSD_OUT
) {
1577 void OSDMap::get_full_pools(CephContext
*cct
,
1579 set
<int64_t> *backfillfull
,
1580 set
<int64_t> *nearfull
) const
1583 ceph_assert(backfillfull
);
1584 ceph_assert(nearfull
);
1586 backfillfull
->clear();
1589 vector
<int> full_osds
;
1590 vector
<int> backfillfull_osds
;
1591 vector
<int> nearfull_osds
;
1592 for (int i
= 0; i
< max_osd
; ++i
) {
1593 if (exists(i
) && is_up(i
) && is_in(i
)) {
1594 if (osd_state
[i
] & CEPH_OSD_FULL
)
1595 full_osds
.push_back(i
);
1596 else if (osd_state
[i
] & CEPH_OSD_BACKFILLFULL
)
1597 backfillfull_osds
.push_back(i
);
1598 else if (osd_state
[i
] & CEPH_OSD_NEARFULL
)
1599 nearfull_osds
.push_back(i
);
1603 for (auto i
: full_osds
) {
1604 get_pool_ids_by_osd(cct
, i
, full
);
1606 for (auto i
: backfillfull_osds
) {
1607 get_pool_ids_by_osd(cct
, i
, backfillfull
);
1609 for (auto i
: nearfull_osds
) {
1610 get_pool_ids_by_osd(cct
, i
, nearfull
);
1614 void OSDMap::get_full_osd_counts(set
<int> *full
, set
<int> *backfill
,
1615 set
<int> *nearfull
) const
1620 for (int i
= 0; i
< max_osd
; ++i
) {
1621 if (exists(i
) && is_up(i
) && is_in(i
)) {
1622 if (osd_state
[i
] & CEPH_OSD_FULL
)
1624 else if (osd_state
[i
] & CEPH_OSD_BACKFILLFULL
)
1625 backfill
->emplace(i
);
1626 else if (osd_state
[i
] & CEPH_OSD_NEARFULL
)
1627 nearfull
->emplace(i
);
1632 void OSDMap::get_all_osds(set
<int32_t>& ls
) const
1634 for (int i
=0; i
<max_osd
; i
++)
1639 void OSDMap::get_up_osds(set
<int32_t>& ls
) const
1641 for (int i
= 0; i
< max_osd
; i
++) {
1647 void OSDMap::get_out_existing_osds(set
<int32_t>& ls
) const
1649 for (int i
= 0; i
< max_osd
; i
++) {
1650 if (exists(i
) && get_weight(i
) == CEPH_OSD_OUT
)
1655 void OSDMap::get_flag_set(set
<string
> *flagset
) const
1657 for (unsigned i
= 0; i
< sizeof(flags
) * 8; ++i
) {
1658 if (flags
& (1<<i
)) {
1659 flagset
->insert(get_flag_string(flags
& (1<<i
)));
1664 void OSDMap::calc_state_set(int state
, set
<string
>& st
)
1667 for (unsigned s
= 1; t
; s
<<= 1) {
1670 st
.insert(ceph_osd_state_name(s
));
1675 void OSDMap::adjust_osd_weights(const map
<int,double>& weights
, Incremental
& inc
) const
1678 for (const auto &weight
: weights
) {
1679 if (weight
.second
> max
)
1680 max
= weight
.second
;
1683 for (const auto &weight
: weights
) {
1684 inc
.new_weight
[weight
.first
] = (unsigned)((weight
.second
/ max
) * CEPH_OSD_IN
);
1688 int OSDMap::identify_osd(const entity_addr_t
& addr
) const
1690 for (int i
=0; i
<max_osd
; i
++)
1691 if (exists(i
) && (get_addrs(i
).contains(addr
) ||
1692 get_cluster_addrs(i
).contains(addr
)))
1697 int OSDMap::identify_osd(const uuid_d
& u
) const
1699 for (int i
=0; i
<max_osd
; i
++)
1700 if (exists(i
) && get_uuid(i
) == u
)
1705 int OSDMap::identify_osd_on_all_channels(const entity_addr_t
& addr
) const
1707 for (int i
=0; i
<max_osd
; i
++)
1708 if (exists(i
) && (get_addrs(i
).contains(addr
) ||
1709 get_cluster_addrs(i
).contains(addr
) ||
1710 get_hb_back_addrs(i
).contains(addr
) ||
1711 get_hb_front_addrs(i
).contains(addr
)))
1716 int OSDMap::find_osd_on_ip(const entity_addr_t
& ip
) const
1718 for (int i
=0; i
<max_osd
; i
++)
1719 if (exists(i
) && (get_addrs(i
).is_same_host(ip
) ||
1720 get_cluster_addrs(i
).is_same_host(ip
)))
1726 uint64_t OSDMap::get_features(int entity_type
, uint64_t *pmask
) const
1728 uint64_t features
= 0; // things we actually have
1729 uint64_t mask
= 0; // things we could have
1731 if (crush
->has_nondefault_tunables())
1732 features
|= CEPH_FEATURE_CRUSH_TUNABLES
;
1733 if (crush
->has_nondefault_tunables2())
1734 features
|= CEPH_FEATURE_CRUSH_TUNABLES2
;
1735 if (crush
->has_nondefault_tunables3())
1736 features
|= CEPH_FEATURE_CRUSH_TUNABLES3
;
1737 if (crush
->has_v4_buckets())
1738 features
|= CEPH_FEATURE_CRUSH_V4
;
1739 if (crush
->has_nondefault_tunables5())
1740 features
|= CEPH_FEATURE_CRUSH_TUNABLES5
;
1741 if (crush
->has_incompat_choose_args()) {
1742 features
|= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS
;
1744 mask
|= CEPH_FEATURES_CRUSH
;
1746 if (!pg_upmap
.empty() || !pg_upmap_items
.empty() || !pg_upmap_primaries
.empty())
1747 features
|= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
;
1748 mask
|= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
;
1750 for (auto &pool
: pools
) {
1751 if (pool
.second
.has_flag(pg_pool_t::FLAG_HASHPSPOOL
)) {
1752 features
|= CEPH_FEATURE_OSDHASHPSPOOL
;
1754 if (!pool
.second
.tiers
.empty() ||
1755 pool
.second
.is_tier()) {
1756 features
|= CEPH_FEATURE_OSD_CACHEPOOL
;
1758 int ruleid
= pool
.second
.get_crush_rule();
1760 if (crush
->is_v2_rule(ruleid
))
1761 features
|= CEPH_FEATURE_CRUSH_V2
;
1762 if (crush
->is_v3_rule(ruleid
))
1763 features
|= CEPH_FEATURE_CRUSH_TUNABLES3
;
1764 if (crush
->is_v5_rule(ruleid
))
1765 features
|= CEPH_FEATURE_CRUSH_TUNABLES5
;
1768 mask
|= CEPH_FEATURE_OSDHASHPSPOOL
| CEPH_FEATURE_OSD_CACHEPOOL
;
1770 if (osd_primary_affinity
) {
1771 for (int i
= 0; i
< max_osd
; ++i
) {
1772 if ((*osd_primary_affinity
)[i
] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
) {
1773 features
|= CEPH_FEATURE_OSD_PRIMARY_AFFINITY
;
1778 mask
|= CEPH_FEATURE_OSD_PRIMARY_AFFINITY
;
1780 if (entity_type
== CEPH_ENTITY_TYPE_OSD
) {
1781 const uint64_t jewel_features
= CEPH_FEATURE_SERVER_JEWEL
;
1782 if (require_osd_release
>= ceph_release_t::jewel
) {
1783 features
|= jewel_features
;
1785 mask
|= jewel_features
;
1787 const uint64_t kraken_features
= CEPH_FEATUREMASK_SERVER_KRAKEN
1788 | CEPH_FEATURE_MSG_ADDR2
;
1789 if (require_osd_release
>= ceph_release_t::kraken
) {
1790 features
|= kraken_features
;
1792 mask
|= kraken_features
;
1794 if (stretch_mode_enabled
) {
1795 features
|= CEPH_FEATUREMASK_STRETCH_MODE
;
1796 mask
|= CEPH_FEATUREMASK_STRETCH_MODE
;
1800 if (require_min_compat_client
>= ceph_release_t::nautilus
) {
1801 // if min_compat_client is >= nautilus, require v2 cephx signatures
1803 features
|= CEPH_FEATUREMASK_CEPHX_V2
;
1804 } else if (require_osd_release
>= ceph_release_t::nautilus
&&
1805 entity_type
== CEPH_ENTITY_TYPE_OSD
) {
1806 // if osds are >= nautilus, at least require the signatures from them
1807 features
|= CEPH_FEATUREMASK_CEPHX_V2
;
1809 mask
|= CEPH_FEATUREMASK_CEPHX_V2
;
1816 ceph_release_t
OSDMap::get_min_compat_client() const
1818 uint64_t f
= get_features(CEPH_ENTITY_TYPE_CLIENT
, nullptr);
1820 if (HAVE_FEATURE(f
, OSDMAP_PG_UPMAP
) || // v12.0.0-1733-g27d6f43
1821 HAVE_FEATURE(f
, CRUSH_CHOOSE_ARGS
)) { // v12.0.1-2172-gef1ef28
1822 return ceph_release_t::luminous
; // v12.2.0
1824 if (HAVE_FEATURE(f
, CRUSH_TUNABLES5
)) { // v10.0.0-612-g043a737
1825 return ceph_release_t::jewel
; // v10.2.0
1827 if (HAVE_FEATURE(f
, CRUSH_V4
)) { // v0.91-678-g325fc56
1828 return ceph_release_t::hammer
; // v0.94.0
1830 if (HAVE_FEATURE(f
, OSD_PRIMARY_AFFINITY
) || // v0.76-553-gf825624
1831 HAVE_FEATURE(f
, CRUSH_TUNABLES3
) || // v0.76-395-ge20a55d
1832 HAVE_FEATURE(f
, OSD_CACHEPOOL
)) { // v0.67-401-gb91c1c5
1833 return ceph_release_t::firefly
; // v0.80.0
1835 if (HAVE_FEATURE(f
, CRUSH_TUNABLES2
) || // v0.54-684-g0cc47ff
1836 HAVE_FEATURE(f
, OSDHASHPSPOOL
)) { // v0.57-398-g8cc2b0f
1837 return ceph_release_t::dumpling
; // v0.67.0
1839 if (HAVE_FEATURE(f
, CRUSH_TUNABLES
)) { // v0.48argonaut-206-g6f381af
1840 return ceph_release_t::argonaut
; // v0.48argonaut-206-g6f381af
1842 return ceph_release_t::argonaut
; // v0.48argonaut-206-g6f381af
1845 ceph_release_t
OSDMap::get_require_min_compat_client() const
1847 return require_min_compat_client
;
1850 void OSDMap::_calc_up_osd_features()
1853 cached_up_osd_features
= 0;
1854 for (int osd
= 0; osd
< max_osd
; ++osd
) {
1857 const osd_xinfo_t
&xi
= get_xinfo(osd
);
1858 if (xi
.features
== 0)
1859 continue; // bogus xinfo, maybe #20751 or similar, skipping
1861 cached_up_osd_features
= xi
.features
;
1864 cached_up_osd_features
&= xi
.features
;
1869 uint64_t OSDMap::get_up_osd_features() const
1871 return cached_up_osd_features
;
1874 void OSDMap::dedup(const OSDMap
*o
, OSDMap
*n
)
1877 if (o
->epoch
== n
->epoch
)
1883 if (o
->max_osd
!= n
->max_osd
)
1885 for (int i
= 0; i
< o
->max_osd
&& i
< n
->max_osd
; i
++) {
1886 if ( n
->osd_addrs
->client_addrs
[i
] && o
->osd_addrs
->client_addrs
[i
] &&
1887 *n
->osd_addrs
->client_addrs
[i
] == *o
->osd_addrs
->client_addrs
[i
])
1888 n
->osd_addrs
->client_addrs
[i
] = o
->osd_addrs
->client_addrs
[i
];
1891 if ( n
->osd_addrs
->cluster_addrs
[i
] && o
->osd_addrs
->cluster_addrs
[i
] &&
1892 *n
->osd_addrs
->cluster_addrs
[i
] == *o
->osd_addrs
->cluster_addrs
[i
])
1893 n
->osd_addrs
->cluster_addrs
[i
] = o
->osd_addrs
->cluster_addrs
[i
];
1896 if ( n
->osd_addrs
->hb_back_addrs
[i
] && o
->osd_addrs
->hb_back_addrs
[i
] &&
1897 *n
->osd_addrs
->hb_back_addrs
[i
] == *o
->osd_addrs
->hb_back_addrs
[i
])
1898 n
->osd_addrs
->hb_back_addrs
[i
] = o
->osd_addrs
->hb_back_addrs
[i
];
1901 if ( n
->osd_addrs
->hb_front_addrs
[i
] && o
->osd_addrs
->hb_front_addrs
[i
] &&
1902 *n
->osd_addrs
->hb_front_addrs
[i
] == *o
->osd_addrs
->hb_front_addrs
[i
])
1903 n
->osd_addrs
->hb_front_addrs
[i
] = o
->osd_addrs
->hb_front_addrs
[i
];
1908 // zoinks, no differences at all!
1909 n
->osd_addrs
= o
->osd_addrs
;
1912 // does crush match?
1913 ceph::buffer::list oc
, nc
;
1914 encode(*o
->crush
, oc
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1915 encode(*n
->crush
, nc
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1916 if (oc
.contents_equal(nc
)) {
1917 n
->crush
= o
->crush
;
1920 // does pg_temp match?
1921 if (*o
->pg_temp
== *n
->pg_temp
)
1922 n
->pg_temp
= o
->pg_temp
;
1924 // does primary_temp match?
1925 if (o
->primary_temp
->size() == n
->primary_temp
->size()) {
1926 if (*o
->primary_temp
== *n
->primary_temp
)
1927 n
->primary_temp
= o
->primary_temp
;
1931 if (o
->osd_uuid
->size() == n
->osd_uuid
->size() &&
1932 *o
->osd_uuid
== *n
->osd_uuid
)
1933 n
->osd_uuid
= o
->osd_uuid
;
1936 void OSDMap::clean_temps(CephContext
*cct
,
1937 const OSDMap
& oldmap
,
1938 const OSDMap
& nextmap
,
1939 Incremental
*pending_inc
)
1941 ldout(cct
, 10) << __func__
<< dendl
;
1943 for (auto pg
: *nextmap
.pg_temp
) {
1944 // if pool does not exist, remove any existing pg_temps associated with
1945 // it. we don't care about pg_temps on the pending_inc either; if there
1946 // are new_pg_temp entries on the pending, clear them out just as well.
1947 if (!nextmap
.have_pg_pool(pg
.first
.pool())) {
1948 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
1949 << " for nonexistent pool " << pg
.first
.pool() << dendl
;
1950 pending_inc
->new_pg_temp
[pg
.first
].clear();
1953 if (!nextmap
.pg_exists(pg
.first
)) {
1954 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
1955 << " for nonexistent pg " << dendl
;
1956 pending_inc
->new_pg_temp
[pg
.first
].clear();
1960 unsigned num_up
= 0;
1961 for (auto o
: pg
.second
) {
1962 if (!nextmap
.is_down(o
)) {
1968 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
1969 << " with all down osds" << pg
.second
<< dendl
;
1970 pending_inc
->new_pg_temp
[pg
.first
].clear();
1973 // redundant pg_temp?
1976 nextmap
.pg_to_raw_up(pg
.first
, &raw_up
, &primary
);
1977 bool remove
= false;
1978 if (raw_up
== pg
.second
) {
1979 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
<< " "
1980 << pg
.second
<< " that matches raw_up mapping" << dendl
;
1983 // oversized pg_temp?
1984 if (pg
.second
.size() > nextmap
.get_pg_pool(pg
.first
.pool())->get_size()) {
1985 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
<< " "
1986 << pg
.second
<< " exceeds pool size" << dendl
;
1990 if (oldmap
.pg_temp
->count(pg
.first
))
1991 pending_inc
->new_pg_temp
[pg
.first
].clear();
1993 pending_inc
->new_pg_temp
.erase(pg
.first
);
1997 for (auto &pg
: *nextmap
.primary_temp
) {
1999 if (nextmap
.is_down(pg
.second
)) {
2000 ldout(cct
, 10) << __func__
<< " removing primary_temp " << pg
.first
2001 << " to down " << pg
.second
<< dendl
;
2002 pending_inc
->new_primary_temp
[pg
.first
] = -1;
2005 // redundant primary_temp?
2006 vector
<int> real_up
, templess_up
;
2007 int real_primary
, templess_primary
;
2008 pg_t pgid
= pg
.first
;
2009 nextmap
.pg_to_acting_osds(pgid
, &real_up
, &real_primary
);
2010 nextmap
.pg_to_raw_up(pgid
, &templess_up
, &templess_primary
);
2011 if (real_primary
== templess_primary
){
2012 ldout(cct
, 10) << __func__
<< " removing primary_temp "
2013 << pgid
<< " -> " << real_primary
2014 << " (unnecessary/redundant)" << dendl
;
2015 if (oldmap
.primary_temp
->count(pgid
))
2016 pending_inc
->new_primary_temp
[pgid
] = -1;
2018 pending_inc
->new_primary_temp
.erase(pgid
);
2023 void OSDMap::get_upmap_pgs(vector
<pg_t
> *upmap_pgs
) const
2025 upmap_pgs
->reserve(pg_upmap
.size() + pg_upmap_items
.size());
2026 for (auto& p
: pg_upmap
)
2027 upmap_pgs
->push_back(p
.first
);
2028 for (auto& p
: pg_upmap_items
)
2029 upmap_pgs
->push_back(p
.first
);
2032 bool OSDMap::check_pg_upmaps(
2034 const vector
<pg_t
>& to_check
,
2035 vector
<pg_t
> *to_cancel
,
2036 map
<pg_t
, mempool::osdmap::vector
<pair
<int,int>>> *to_remap
) const
2038 bool any_change
= false;
2039 map
<int, map
<int, float>> rule_weight_map
;
2040 for (auto& pg
: to_check
) {
2041 const pg_pool_t
*pi
= get_pg_pool(pg
.pool());
2042 if (!pi
|| pg
.ps() >= pi
->get_pg_num_pending()) {
2043 ldout(cct
, 0) << __func__
<< " pg " << pg
<< " is gone or merge source"
2045 to_cancel
->push_back(pg
);
2048 if (pi
->is_pending_merge(pg
, nullptr)) {
2049 ldout(cct
, 0) << __func__
<< " pg " << pg
<< " is pending merge"
2051 to_cancel
->push_back(pg
);
2054 vector
<int> raw
, up
;
2055 pg_to_raw_upmap(pg
, &raw
, &up
);
2056 auto crush_rule
= get_pg_pool_crush_rule(pg
);
2057 auto r
= crush
->verify_upmap(cct
,
2059 get_pg_pool_size(pg
),
2062 ldout(cct
, 0) << __func__
<< " verify_upmap of pg " << pg
2063 << " returning " << r
2065 to_cancel
->push_back(pg
);
2068 // below we check against crush-topology changing..
2069 map
<int, float> weight_map
;
2070 auto it
= rule_weight_map
.find(crush_rule
);
2071 if (it
== rule_weight_map
.end()) {
2072 auto r
= crush
->get_rule_weight_osd_map(crush_rule
, &weight_map
);
2074 lderr(cct
) << __func__
<< " unable to get crush weight_map for "
2075 << "crush_rule " << crush_rule
2079 rule_weight_map
[crush_rule
] = weight_map
;
2081 weight_map
= it
->second
;
2083 ldout(cct
, 10) << __func__
<< " pg " << pg
2084 << " weight_map " << weight_map
2086 for (auto osd
: up
) {
2087 auto it
= weight_map
.find(osd
);
2088 if (it
== weight_map
.end()) {
2089 ldout(cct
, 10) << __func__
<< " pg " << pg
<< ": osd " << osd
<< " is gone or has "
2090 << "been moved out of the specific crush-tree"
2092 to_cancel
->push_back(pg
);
2095 auto adjusted_weight
= get_weightf(it
->first
) * it
->second
;
2096 if (adjusted_weight
== 0) {
2097 ldout(cct
, 10) << __func__
<< " pg " << pg
<< ": osd " << osd
2098 << " is out/crush-out"
2100 to_cancel
->push_back(pg
);
2104 if (!to_cancel
->empty() && to_cancel
->back() == pg
)
2106 // okay, upmap is valid
2107 // continue to check if it is still necessary
2108 auto i
= pg_upmap
.find(pg
);
2109 if (i
!= pg_upmap
.end()) {
2110 if (i
->second
== raw
) {
2111 ldout(cct
, 10) << __func__
<< "removing redundant pg_upmap " << i
->first
<< " "
2112 << i
->second
<< dendl
;
2113 to_cancel
->push_back(pg
);
2116 if ((int)i
->second
.size() != get_pg_pool_size(pg
)) {
2117 ldout(cct
, 10) << __func__
<< "removing pg_upmap " << i
->first
<< " "
2118 << i
->second
<< " != pool size " << get_pg_pool_size(pg
)
2120 to_cancel
->push_back(pg
);
2124 auto j
= pg_upmap_items
.find(pg
);
2125 if (j
!= pg_upmap_items
.end()) {
2126 mempool::osdmap::vector
<pair
<int,int>> newmap
;
2127 for (auto& p
: j
->second
) {
2128 auto osd_from
= p
.first
;
2129 auto osd_to
= p
.second
;
2130 if (std::find(raw
.begin(), raw
.end(), osd_from
) == raw
.end()) {
2131 // cancel mapping if source osd does not exist anymore
2132 ldout(cct
, 20) << __func__
<< " pg_upmap_items (source osd does not exist) " << pg_upmap_items
<< dendl
;
2135 if (osd_to
!= CRUSH_ITEM_NONE
&& osd_to
< max_osd
&&
2136 osd_to
>= 0 && osd_weight
[osd_to
] == 0) {
2137 // cancel mapping if target osd is out
2138 ldout(cct
, 20) << __func__
<< " pg_upmap_items (target osd is out) " << pg_upmap_items
<< dendl
;
2141 newmap
.push_back(p
);
2143 if (newmap
.empty()) {
2144 ldout(cct
, 10) << __func__
<< " removing no-op pg_upmap_items "
2145 << j
->first
<< " " << j
->second
2147 to_cancel
->push_back(pg
);
2149 //Josh--check partial no-op here.
2150 ldout(cct
, 10) << __func__
<< " simplifying partially no-op pg_upmap_items "
2151 << j
->first
<< " " << j
->second
2154 to_remap
->insert({pg
, newmap
});
2159 any_change
= any_change
|| !to_cancel
->empty();
2163 void OSDMap::clean_pg_upmaps(
2165 Incremental
*pending_inc
,
2166 const vector
<pg_t
>& to_cancel
,
2167 const map
<pg_t
, mempool::osdmap::vector
<pair
<int,int>>>& to_remap
) const
2169 for (auto &pg
: to_cancel
) {
2170 auto i
= pending_inc
->new_pg_upmap
.find(pg
);
2171 if (i
!= pending_inc
->new_pg_upmap
.end()) {
2172 ldout(cct
, 10) << __func__
<< " cancel invalid pending "
2173 << "pg_upmap entry "
2174 << i
->first
<< "->" << i
->second
2176 pending_inc
->new_pg_upmap
.erase(i
);
2178 auto j
= pg_upmap
.find(pg
);
2179 if (j
!= pg_upmap
.end()) {
2180 ldout(cct
, 10) << __func__
<< " cancel invalid pg_upmap entry "
2181 << j
->first
<< "->" << j
->second
2183 pending_inc
->old_pg_upmap
.insert(pg
);
2185 auto p
= pending_inc
->new_pg_upmap_items
.find(pg
);
2186 if (p
!= pending_inc
->new_pg_upmap_items
.end()) {
2187 ldout(cct
, 10) << __func__
<< " cancel invalid pending "
2188 << "pg_upmap_items entry "
2189 << p
->first
<< "->" << p
->second
2191 pending_inc
->new_pg_upmap_items
.erase(p
);
2193 auto q
= pg_upmap_items
.find(pg
);
2194 if (q
!= pg_upmap_items
.end()) {
2195 ldout(cct
, 10) << __func__
<< " cancel invalid "
2196 << "pg_upmap_items entry "
2197 << q
->first
<< "->" << q
->second
2199 pending_inc
->old_pg_upmap_items
.insert(pg
);
2202 for (auto& i
: to_remap
)
2203 pending_inc
->new_pg_upmap_items
[i
.first
] = i
.second
;
2206 bool OSDMap::clean_pg_upmaps(
2208 Incremental
*pending_inc
) const
2210 ldout(cct
, 10) << __func__
<< dendl
;
2211 vector
<pg_t
> to_check
;
2212 vector
<pg_t
> to_cancel
;
2213 map
<pg_t
, mempool::osdmap::vector
<pair
<int,int>>> to_remap
;
2215 get_upmap_pgs(&to_check
);
2216 auto any_change
= check_pg_upmaps(cct
, to_check
, &to_cancel
, &to_remap
);
2217 clean_pg_upmaps(cct
, pending_inc
, to_cancel
, to_remap
);
2218 //TODO: Create these 3 functions for pg_upmap_primaries and so they can be checked
2219 // and cleaned in the same way as pg_upmap. This is not critical since invalid
2220 // pg_upmap_primaries are never applied, (the final check is in _apply_upmap).
2224 int OSDMap::apply_incremental(const Incremental
&inc
)
2226 new_blocklist_entries
= false;
2229 else if (inc
.fsid
!= fsid
)
2232 ceph_assert(inc
.epoch
== epoch
+1);
2235 modified
= inc
.modified
;
2238 if (inc
.fullmap
.length()) {
2239 ceph::buffer::list
bl(inc
.fullmap
);
2244 // nope, incremental.
2245 if (inc
.new_flags
>= 0) {
2246 flags
= inc
.new_flags
;
2247 // the below is just to cover a newly-upgraded luminous mon
2248 // cluster that has to set require_jewel_osds or
2249 // require_kraken_osds before the osds can be upgraded to
2251 if (flags
& CEPH_OSDMAP_REQUIRE_KRAKEN
) {
2252 if (require_osd_release
< ceph_release_t::kraken
) {
2253 require_osd_release
= ceph_release_t::kraken
;
2255 } else if (flags
& CEPH_OSDMAP_REQUIRE_JEWEL
) {
2256 if (require_osd_release
< ceph_release_t::jewel
) {
2257 require_osd_release
= ceph_release_t::jewel
;
2262 if (inc
.new_max_osd
>= 0)
2263 set_max_osd(inc
.new_max_osd
);
2265 if (inc
.new_pool_max
!= -1)
2266 pool_max
= inc
.new_pool_max
;
2268 for (const auto &pool
: inc
.new_pools
) {
2269 pools
[pool
.first
] = pool
.second
;
2270 pools
[pool
.first
].last_change
= epoch
;
2273 new_removed_snaps
= inc
.new_removed_snaps
;
2274 new_purged_snaps
= inc
.new_purged_snaps
;
2275 for (auto p
= new_removed_snaps
.begin();
2276 p
!= new_removed_snaps
.end();
2278 removed_snaps_queue
[p
->first
].union_of(p
->second
);
2280 for (auto p
= new_purged_snaps
.begin();
2281 p
!= new_purged_snaps
.end();
2283 auto q
= removed_snaps_queue
.find(p
->first
);
2284 ceph_assert(q
!= removed_snaps_queue
.end());
2285 q
->second
.subtract(p
->second
);
2286 if (q
->second
.empty()) {
2287 removed_snaps_queue
.erase(q
);
2291 if (inc
.new_last_up_change
!= utime_t()) {
2292 last_up_change
= inc
.new_last_up_change
;
2294 if (inc
.new_last_in_change
!= utime_t()) {
2295 last_in_change
= inc
.new_last_in_change
;
2298 for (const auto &pname
: inc
.new_pool_names
) {
2299 auto pool_name_entry
= pool_name
.find(pname
.first
);
2300 if (pool_name_entry
!= pool_name
.end()) {
2301 name_pool
.erase(pool_name_entry
->second
);
2302 pool_name_entry
->second
= pname
.second
;
2304 pool_name
[pname
.first
] = pname
.second
;
2306 name_pool
[pname
.second
] = pname
.first
;
2309 for (const auto &pool
: inc
.old_pools
) {
2311 name_pool
.erase(pool_name
[pool
]);
2312 pool_name
.erase(pool
);
2315 for (const auto &weight
: inc
.new_weight
) {
2316 set_weight(weight
.first
, weight
.second
);
2318 // if we are marking in, clear the AUTOOUT and NEW bits, and clear
2319 // xinfo old_weight.
2320 if (weight
.second
) {
2321 osd_state
[weight
.first
] &= ~(CEPH_OSD_AUTOOUT
| CEPH_OSD_NEW
);
2322 osd_xinfo
[weight
.first
].old_weight
= 0;
2326 for (const auto &primary_affinity
: inc
.new_primary_affinity
) {
2327 set_primary_affinity(primary_affinity
.first
, primary_affinity
.second
);
2330 // erasure_code_profiles
2331 for (const auto &profile
: inc
.old_erasure_code_profiles
)
2332 erasure_code_profiles
.erase(profile
);
2334 for (const auto &profile
: inc
.new_erasure_code_profiles
) {
2335 set_erasure_code_profile(profile
.first
, profile
.second
);
2339 for (const auto &state
: inc
.new_state
) {
2340 const auto osd
= state
.first
;
2341 int s
= state
.second
? state
.second
: CEPH_OSD_UP
;
2342 if ((osd_state
[osd
] & CEPH_OSD_UP
) &&
2343 (s
& CEPH_OSD_UP
)) {
2344 osd_info
[osd
].down_at
= epoch
;
2345 osd_xinfo
[osd
].down_stamp
= modified
;
2347 if ((osd_state
[osd
] & CEPH_OSD_EXISTS
) &&
2348 (s
& CEPH_OSD_EXISTS
)) {
2349 // osd is destroyed; clear out anything interesting.
2350 (*osd_uuid
)[osd
] = uuid_d();
2351 osd_info
[osd
] = osd_info_t();
2352 osd_xinfo
[osd
] = osd_xinfo_t();
2353 set_primary_affinity(osd
, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
);
2354 osd_addrs
->client_addrs
[osd
].reset(new entity_addrvec_t());
2355 osd_addrs
->cluster_addrs
[osd
].reset(new entity_addrvec_t());
2356 osd_addrs
->hb_front_addrs
[osd
].reset(new entity_addrvec_t());
2357 osd_addrs
->hb_back_addrs
[osd
].reset(new entity_addrvec_t());
2360 osd_state
[osd
] ^= s
;
2364 for (const auto &client
: inc
.new_up_client
) {
2365 osd_state
[client
.first
] |= CEPH_OSD_EXISTS
| CEPH_OSD_UP
;
2366 osd_state
[client
.first
] &= ~CEPH_OSD_STOP
; // if any
2367 osd_addrs
->client_addrs
[client
.first
].reset(
2368 new entity_addrvec_t(client
.second
));
2369 osd_addrs
->hb_back_addrs
[client
.first
].reset(
2370 new entity_addrvec_t(inc
.new_hb_back_up
.find(client
.first
)->second
));
2371 osd_addrs
->hb_front_addrs
[client
.first
].reset(
2372 new entity_addrvec_t(inc
.new_hb_front_up
.find(client
.first
)->second
));
2374 osd_info
[client
.first
].up_from
= epoch
;
2377 for (const auto &cluster
: inc
.new_up_cluster
)
2378 osd_addrs
->cluster_addrs
[cluster
.first
].reset(
2379 new entity_addrvec_t(cluster
.second
));
2382 for (const auto &thru
: inc
.new_up_thru
)
2383 osd_info
[thru
.first
].up_thru
= thru
.second
;
2385 for (const auto &interval
: inc
.new_last_clean_interval
) {
2386 osd_info
[interval
.first
].last_clean_begin
= interval
.second
.first
;
2387 osd_info
[interval
.first
].last_clean_end
= interval
.second
.second
;
2390 for (const auto &lost
: inc
.new_lost
)
2391 osd_info
[lost
.first
].lost_at
= lost
.second
;
2394 for (const auto &xinfo
: inc
.new_xinfo
)
2395 osd_xinfo
[xinfo
.first
] = xinfo
.second
;
2398 for (const auto &uuid
: inc
.new_uuid
)
2399 (*osd_uuid
)[uuid
.first
] = uuid
.second
;
2402 for (const auto &pg
: inc
.new_pg_temp
) {
2403 if (pg
.second
.empty())
2404 pg_temp
->erase(pg
.first
);
2406 pg_temp
->set(pg
.first
, pg
.second
);
2408 if (!inc
.new_pg_temp
.empty()) {
2409 // make sure pg_temp is efficiently stored
2413 for (const auto &pg
: inc
.new_primary_temp
) {
2414 if (pg
.second
== -1)
2415 primary_temp
->erase(pg
.first
);
2417 (*primary_temp
)[pg
.first
] = pg
.second
;
2420 for (auto& p
: inc
.new_pg_upmap
) {
2421 pg_upmap
[p
.first
] = p
.second
;
2423 for (auto& pg
: inc
.old_pg_upmap
) {
2426 for (auto& p
: inc
.new_pg_upmap_items
) {
2427 pg_upmap_items
[p
.first
] = p
.second
;
2429 for (auto& pg
: inc
.old_pg_upmap_items
) {
2430 pg_upmap_items
.erase(pg
);
2433 for (auto& [pg
, prim
] : inc
.new_pg_upmap_primary
) {
2434 pg_upmap_primaries
[pg
] = prim
;
2436 for (auto& pg
: inc
.old_pg_upmap_primary
) {
2437 pg_upmap_primaries
.erase(pg
);
2441 if (!inc
.new_blocklist
.empty()) {
2442 blocklist
.insert(inc
.new_blocklist
.begin(),inc
.new_blocklist
.end());
2443 new_blocklist_entries
= true;
2445 for (const auto &addr
: inc
.old_blocklist
)
2446 blocklist
.erase(addr
);
2448 for (const auto& addr_p
: inc
.new_range_blocklist
) {
2449 range_blocklist
.insert(addr_p
);
2450 calculated_ranges
.emplace(addr_p
.first
, addr_p
.first
);
2451 new_blocklist_entries
= true;
2453 for (const auto &addr
: inc
.old_range_blocklist
) {
2454 calculated_ranges
.erase(addr
);
2455 range_blocklist
.erase(addr
);
2458 for (auto& i
: inc
.new_crush_node_flags
) {
2460 crush_node_flags
[i
.first
] = i
.second
;
2462 crush_node_flags
.erase(i
.first
);
2466 for (auto& i
: inc
.new_device_class_flags
) {
2468 device_class_flags
[i
.first
] = i
.second
;
2470 device_class_flags
.erase(i
.first
);
2474 // cluster snapshot?
2475 if (inc
.cluster_snapshot
.length()) {
2476 cluster_snapshot
= inc
.cluster_snapshot
;
2477 cluster_snapshot_epoch
= inc
.epoch
;
2479 cluster_snapshot
.clear();
2480 cluster_snapshot_epoch
= 0;
2483 if (inc
.new_nearfull_ratio
>= 0) {
2484 nearfull_ratio
= inc
.new_nearfull_ratio
;
2486 if (inc
.new_backfillfull_ratio
>= 0) {
2487 backfillfull_ratio
= inc
.new_backfillfull_ratio
;
2489 if (inc
.new_full_ratio
>= 0) {
2490 full_ratio
= inc
.new_full_ratio
;
2492 if (inc
.new_require_min_compat_client
> ceph_release_t::unknown
) {
2493 require_min_compat_client
= inc
.new_require_min_compat_client
;
2495 if (inc
.new_require_osd_release
>= ceph_release_t::unknown
) {
2496 require_osd_release
= inc
.new_require_osd_release
;
2497 if (require_osd_release
>= ceph_release_t::luminous
) {
2498 flags
&= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS
);
2499 flags
|= CEPH_OSDMAP_RECOVERY_DELETES
;
2503 if (inc
.new_require_osd_release
>= ceph_release_t::unknown
) {
2504 require_osd_release
= inc
.new_require_osd_release
;
2505 if (require_osd_release
>= ceph_release_t::nautilus
) {
2506 flags
|= CEPH_OSDMAP_PGLOG_HARDLIMIT
;
2509 // do new crush map last (after up/down stuff)
2510 if (inc
.crush
.length()) {
2511 ceph::buffer::list
bl(inc
.crush
);
2512 auto blp
= bl
.cbegin();
2513 crush
.reset(new CrushWrapper
);
2515 if (require_osd_release
>= ceph_release_t::luminous
) {
2516 // only increment if this is a luminous-encoded osdmap, lest
2517 // the mon's crush_version diverge from what the osds or others
2518 // are decoding and applying on their end. if we won't encode
2519 // it in the canonical version, don't change it.
2522 for (auto it
= device_class_flags
.begin();
2523 it
!= device_class_flags
.end();) {
2524 const char* class_name
= crush
->get_class_name(it
->first
);
2525 if (!class_name
) // device class is gone
2526 it
= device_class_flags
.erase(it
);
2532 if (inc
.change_stretch_mode
) {
2533 stretch_mode_enabled
= inc
.stretch_mode_enabled
;
2534 stretch_bucket_count
= inc
.new_stretch_bucket_count
;
2535 degraded_stretch_mode
= inc
.new_degraded_stretch_mode
;
2536 recovering_stretch_mode
= inc
.new_recovering_stretch_mode
;
2537 stretch_mode_bucket
= inc
.new_stretch_mode_bucket
;
2540 switch (inc
.mutate_allow_crimson
) {
2541 case Incremental::mutate_allow_crimson_t::NONE
:
2543 case Incremental::mutate_allow_crimson_t::SET
:
2544 allow_crimson
= true;
2546 case Incremental::mutate_allow_crimson_t::CLEAR
:
2547 allow_crimson
= false;
2552 _calc_up_osd_features();
2557 int OSDMap::map_to_pg(
2561 const string
& nspace
,
2564 // calculate ps (placement seed)
2565 const pg_pool_t
*pool
= get_pg_pool(poolid
);
2570 ps
= pool
->hash_key(key
, nspace
);
2572 ps
= pool
->hash_key(name
, nspace
);
2573 *pg
= pg_t(ps
, poolid
);
2577 int OSDMap::object_locator_to_pg(
2578 const object_t
& oid
, const object_locator_t
& loc
, pg_t
&pg
) const
2580 if (loc
.hash
>= 0) {
2581 if (!get_pg_pool(loc
.get_pool())) {
2584 pg
= pg_t(loc
.hash
, loc
.get_pool());
2587 return map_to_pg(loc
.get_pool(), oid
.name
, loc
.key
, loc
.nspace
, &pg
);
2590 ceph_object_layout
OSDMap::make_object_layout(
2591 object_t oid
, int pg_pool
, string nspace
) const
2593 object_locator_t
loc(pg_pool
, nspace
);
2595 ceph_object_layout ol
;
2596 pg_t pgid
= object_locator_to_pg(oid
, loc
);
2597 ol
.ol_pgid
= pgid
.get_old_pg().v
;
2598 ol
.ol_stripe_unit
= 0;
2602 void OSDMap::_remove_nonexistent_osds(const pg_pool_t
& pool
,
2603 vector
<int>& osds
) const
2605 if (pool
.can_shift_osds()) {
2606 unsigned removed
= 0;
2607 for (unsigned i
= 0; i
< osds
.size(); i
++) {
2608 if (!exists(osds
[i
])) {
2613 osds
[i
- removed
] = osds
[i
];
2617 osds
.resize(osds
.size() - removed
);
2619 for (auto& osd
: osds
) {
2621 osd
= CRUSH_ITEM_NONE
;
2626 void OSDMap::_pg_to_raw_osds(
2627 const pg_pool_t
& pool
, pg_t pg
,
2632 ps_t pps
= pool
.raw_pg_to_pps(pg
); // placement ps
2633 unsigned size
= pool
.get_size();
2636 int ruleno
= pool
.get_crush_rule();
2638 crush
->do_rule(ruleno
, pps
, *osds
, size
, osd_weight
, pg
.pool());
2640 _remove_nonexistent_osds(pool
, *osds
);
2646 int OSDMap::_pick_primary(const vector
<int>& osds
) const
2648 for (auto osd
: osds
) {
2649 if (osd
!= CRUSH_ITEM_NONE
) {
2656 void OSDMap::_apply_upmap(const pg_pool_t
& pi
, pg_t raw_pg
, vector
<int> *raw
) const
2658 pg_t pg
= pi
.raw_pg_to_pg(raw_pg
);
2659 auto p
= pg_upmap
.find(pg
);
2660 if (p
!= pg_upmap
.end()) {
2661 // make sure targets aren't marked out
2662 for (auto osd
: p
->second
) {
2663 if (osd
!= CRUSH_ITEM_NONE
&& osd
< max_osd
&& osd
>= 0 &&
2664 osd_weight
[osd
] == 0) {
2665 // reject/ignore the explicit mapping
2669 *raw
= vector
<int>(p
->second
.begin(), p
->second
.end());
2670 // continue to check and apply pg_upmap_items if any
2673 auto q
= pg_upmap_items
.find(pg
);
2674 if (q
!= pg_upmap_items
.end()) {
2675 // NOTE: this approach does not allow a bidirectional swap,
2676 // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
2677 for (auto& [osd_from
, osd_to
] : q
->second
) {
2678 // A capacity change upmap (replace osd in the pg with osd not in the pg)
2679 // make sure the replacement value doesn't already appear
2680 bool exists
= false;
2682 for (unsigned i
= 0; i
< raw
->size(); ++i
) {
2683 int osd
= (*raw
)[i
];
2684 if (osd
== osd_to
) {
2688 // ignore mapping if target is marked out (or invalid osd id)
2689 if (osd
== osd_from
&&
2691 !(osd_to
!= CRUSH_ITEM_NONE
&& osd_to
< max_osd
&&
2692 osd_to
>= 0 && osd_weight
[osd_to
] == 0)) {
2696 if (!exists
&& pos
>= 0) {
2697 (*raw
)[pos
] = osd_to
;
2701 auto r
= pg_upmap_primaries
.find(pg
);
2702 if (r
!= pg_upmap_primaries
.end()) {
2703 auto new_prim
= r
->second
;
2704 // Apply mapping only if new primary is not marked out and valid osd id
2705 if (new_prim
!= CRUSH_ITEM_NONE
&& new_prim
< max_osd
&& new_prim
>= 0 &&
2706 osd_weight
[new_prim
] != 0) {
2707 int new_prim_idx
= 0;
2708 for (int i
= 1 ; i
< (int)raw
->size(); i
++) { // start from 1 on purpose
2709 if ((*raw
)[i
] == new_prim
) {
2714 if (new_prim_idx
> 0) {
2716 (*raw
)[new_prim_idx
] = (*raw
)[0];
2717 (*raw
)[0] = new_prim
;
2723 // pg -> (up osd list)
2724 void OSDMap::_raw_to_up_osds(const pg_pool_t
& pool
, const vector
<int>& raw
,
2725 vector
<int> *up
) const
2727 if (pool
.can_shift_osds()) {
2730 up
->reserve(raw
.size());
2731 for (unsigned i
=0; i
<raw
.size(); i
++) {
2732 if (!exists(raw
[i
]) || is_down(raw
[i
]))
2734 up
->push_back(raw
[i
]);
2737 // set down/dne devices to NONE
2738 up
->resize(raw
.size());
2739 for (int i
= raw
.size() - 1; i
>= 0; --i
) {
2740 if (!exists(raw
[i
]) || is_down(raw
[i
])) {
2741 (*up
)[i
] = CRUSH_ITEM_NONE
;
2749 void OSDMap::_apply_primary_affinity(ps_t seed
,
2750 const pg_pool_t
& pool
,
2754 // do we have any non-default primary_affinity values for these osds?
2755 if (!osd_primary_affinity
)
2759 for (const auto osd
: *osds
) {
2760 if (osd
!= CRUSH_ITEM_NONE
&&
2761 (*osd_primary_affinity
)[osd
] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
) {
2769 // pick the primary. feed both the seed (for the pg) and the osd
2770 // into the hash/rng so that a proportional fraction of an osd's pgs
2771 // get rejected as primary.
2773 for (unsigned i
= 0; i
< osds
->size(); ++i
) {
2775 if (o
== CRUSH_ITEM_NONE
)
2777 unsigned a
= (*osd_primary_affinity
)[o
];
2778 if (a
< CEPH_OSD_MAX_PRIMARY_AFFINITY
&&
2779 (crush_hash32_2(CRUSH_HASH_RJENKINS1
,
2780 seed
, o
) >> 16) >= a
) {
2781 // we chose not to use this primary. note it anyway as a
2782 // fallback in case we don't pick anyone else, but keep looking.
2793 *primary
= (*osds
)[pos
];
2795 if (pool
.can_shift_osds() && pos
> 0) {
2796 // move the new primary to the front.
2797 for (int i
= pos
; i
> 0; --i
) {
2798 (*osds
)[i
] = (*osds
)[i
-1];
2800 (*osds
)[0] = *primary
;
2804 void OSDMap::_get_temp_osds(const pg_pool_t
& pool
, pg_t pg
,
2805 vector
<int> *temp_pg
, int *temp_primary
) const
2807 pg
= pool
.raw_pg_to_pg(pg
);
2808 const auto p
= pg_temp
->find(pg
);
2810 if (p
!= pg_temp
->end()) {
2811 for (unsigned i
=0; i
<p
->second
.size(); i
++) {
2812 if (!exists(p
->second
[i
]) || is_down(p
->second
[i
])) {
2813 if (pool
.can_shift_osds()) {
2816 temp_pg
->push_back(CRUSH_ITEM_NONE
);
2819 temp_pg
->push_back(p
->second
[i
]);
2823 const auto &pp
= primary_temp
->find(pg
);
2825 if (pp
!= primary_temp
->end()) {
2826 *temp_primary
= pp
->second
;
2827 } else if (!temp_pg
->empty()) { // apply pg_temp's primary
2828 for (unsigned i
= 0; i
< temp_pg
->size(); ++i
) {
2829 if ((*temp_pg
)[i
] != CRUSH_ITEM_NONE
) {
2830 *temp_primary
= (*temp_pg
)[i
];
2837 void OSDMap::pg_to_raw_osds(pg_t pg
, vector
<int> *raw
, int *primary
) const
2839 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
2845 _pg_to_raw_osds(*pool
, pg
, raw
, NULL
);
2846 *primary
= _pick_primary(*raw
);
2849 void OSDMap::pg_to_raw_upmap(pg_t pg
, vector
<int>*raw
,
2850 vector
<int> *raw_upmap
) const
2852 auto pool
= get_pg_pool(pg
.pool());
2857 _pg_to_raw_osds(*pool
, pg
, raw
, NULL
);
2859 _apply_upmap(*pool
, pg
, raw_upmap
);
2862 void OSDMap::pg_to_raw_up(pg_t pg
, vector
<int> *up
, int *primary
) const
2864 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
2872 _pg_to_raw_osds(*pool
, pg
, &raw
, &pps
);
2873 _apply_upmap(*pool
, pg
, &raw
);
2874 _raw_to_up_osds(*pool
, raw
, up
);
2875 *primary
= _pick_primary(raw
);
2876 _apply_primary_affinity(pps
, *pool
, up
, primary
);
2879 void OSDMap::_pg_to_up_acting_osds(
2880 const pg_t
& pg
, vector
<int> *up
, int *up_primary
,
2881 vector
<int> *acting
, int *acting_primary
,
2882 bool raw_pg_to_pg
) const
2884 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
2886 (!raw_pg_to_pg
&& pg
.ps() >= pool
->get_pg_num())) {
2894 *acting_primary
= -1;
2899 vector
<int> _acting
;
2901 int _acting_primary
;
2903 _get_temp_osds(*pool
, pg
, &_acting
, &_acting_primary
);
2904 if (_acting
.empty() || up
|| up_primary
) {
2905 _pg_to_raw_osds(*pool
, pg
, &raw
, &pps
);
2906 _apply_upmap(*pool
, pg
, &raw
);
2907 _raw_to_up_osds(*pool
, raw
, &_up
);
2908 _up_primary
= _pick_primary(_up
);
2909 _apply_primary_affinity(pps
, *pool
, &_up
, &_up_primary
);
2910 if (_acting
.empty()) {
2912 if (_acting_primary
== -1) {
2913 _acting_primary
= _up_primary
;
2920 *up_primary
= _up_primary
;
2924 acting
->swap(_acting
);
2926 *acting_primary
= _acting_primary
;
2929 int OSDMap::calc_pg_role_broken(int osd
, const vector
<int>& acting
, int nrep
)
2931 // This implementation is broken for EC PGs since the osd may appear
2932 // multiple times in the acting set. See
2933 // https://tracker.ceph.com/issues/43213
2935 nrep
= acting
.size();
2936 for (int i
=0; i
<nrep
; i
++)
2937 if (acting
[i
] == osd
)
2942 int OSDMap::calc_pg_role(pg_shard_t who
, const vector
<int>& acting
)
2944 int nrep
= acting
.size();
2945 if (who
.shard
== shard_id_t::NO_SHARD
) {
2946 for (int i
=0; i
<nrep
; i
++) {
2947 if (acting
[i
] == who
.osd
) {
2952 if (who
.shard
< nrep
&& acting
[who
.shard
] == who
.osd
) {
2959 bool OSDMap::primary_changed_broken(
2961 const vector
<int> &oldacting
,
2963 const vector
<int> &newacting
)
2965 if (oldacting
.empty() && newacting
.empty())
2966 return false; // both still empty
2967 if (oldacting
.empty() ^ newacting
.empty())
2968 return true; // was empty, now not, or vice versa
2969 if (oldprimary
!= newprimary
)
2970 return true; // primary changed
2971 if (calc_pg_role_broken(oldprimary
, oldacting
) !=
2972 calc_pg_role_broken(newprimary
, newacting
))
2974 return false; // same primary (tho replicas may have changed)
2977 uint64_t OSDMap::get_encoding_features() const
2979 uint64_t f
= SIGNIFICANT_FEATURES
;
2980 if (require_osd_release
< ceph_release_t::octopus
) {
2981 f
&= ~CEPH_FEATURE_SERVER_OCTOPUS
;
2983 if (require_osd_release
< ceph_release_t::nautilus
) {
2984 f
&= ~CEPH_FEATURE_SERVER_NAUTILUS
;
2986 if (require_osd_release
< ceph_release_t::mimic
) {
2987 f
&= ~CEPH_FEATURE_SERVER_MIMIC
;
2989 if (require_osd_release
< ceph_release_t::luminous
) {
2990 f
&= ~(CEPH_FEATURE_SERVER_LUMINOUS
|
2991 CEPH_FEATURE_CRUSH_CHOOSE_ARGS
);
2993 if (require_osd_release
< ceph_release_t::kraken
) {
2994 f
&= ~(CEPH_FEATURE_SERVER_KRAKEN
|
2995 CEPH_FEATURE_MSG_ADDR2
);
2997 if (require_osd_release
< ceph_release_t::jewel
) {
2998 f
&= ~(CEPH_FEATURE_SERVER_JEWEL
|
2999 CEPH_FEATURE_NEW_OSDOP_ENCODING
|
3000 CEPH_FEATURE_CRUSH_TUNABLES5
);
3005 // serialize, unserialize
3006 void OSDMap::encode_client_old(ceph::buffer::list
& bl
) const
3015 encode(created
, bl
);
3016 encode(modified
, bl
);
3018 // for encode(pools, bl);
3019 __u32 n
= pools
.size();
3022 for (const auto &pool
: pools
) {
3025 encode(pool
.second
, bl
, 0);
3027 // for encode(pool_name, bl);
3028 n
= pool_name
.size();
3030 for (const auto &pname
: pool_name
) {
3033 encode(pname
.second
, bl
);
3035 // for encode(pool_max, bl);
3041 encode(max_osd
, bl
);
3043 uint32_t n
= osd_state
.size();
3045 for (auto s
: osd_state
) {
3046 encode((uint8_t)s
, bl
);
3049 encode(osd_weight
, bl
);
3050 encode(osd_addrs
->client_addrs
, bl
, 0);
3052 // for encode(pg_temp, bl);
3053 n
= pg_temp
->size();
3055 for (const auto& pg
: *pg_temp
) {
3056 old_pg_t opg
= pg
.first
.get_old_pg();
3058 encode(pg
.second
, bl
);
3062 ceph::buffer::list cbl
;
3063 crush
->encode(cbl
, 0 /* legacy (no) features */);
3067 void OSDMap::encode_classic(ceph::buffer::list
& bl
, uint64_t features
) const
3070 if ((features
& CEPH_FEATURE_PGID64
) == 0) {
3071 encode_client_old(bl
);
3081 encode(created
, bl
);
3082 encode(modified
, bl
);
3084 encode(pools
, bl
, features
);
3085 encode(pool_name
, bl
);
3086 encode(pool_max
, bl
);
3090 encode(max_osd
, bl
);
3092 uint32_t n
= osd_state
.size();
3094 for (auto s
: osd_state
) {
3095 encode((uint8_t)s
, bl
);
3098 encode(osd_weight
, bl
);
3099 encode(osd_addrs
->client_addrs
, bl
, features
);
3101 encode(*pg_temp
, bl
);
3104 ceph::buffer::list cbl
;
3105 crush
->encode(cbl
, 0 /* legacy (no) features */);
3111 encode(osd_addrs
->hb_back_addrs
, bl
, features
);
3112 encode(osd_info
, bl
);
3113 encode(blocklist
, bl
, features
);
3114 encode(osd_addrs
->cluster_addrs
, bl
, features
);
3115 encode(cluster_snapshot_epoch
, bl
);
3116 encode(cluster_snapshot
, bl
);
3117 encode(*osd_uuid
, bl
);
3118 encode(osd_xinfo
, bl
, features
);
3119 encode(osd_addrs
->hb_front_addrs
, bl
, features
);
3122 /* for a description of osdmap versions, and when they were introduced, please
3124 * doc/dev/osd_internals/osdmap_versions.txt
3126 void OSDMap::encode(ceph::buffer::list
& bl
, uint64_t features
) const
3129 if ((features
& CEPH_FEATURE_OSDMAP_ENC
) == 0) {
3130 encode_classic(bl
, features
);
3134 // only a select set of callers should *ever* be encoding new
3135 // OSDMaps. others should be passing around the canonical encoded
3136 // buffers from on high. select out those callers by passing in an
3137 // "impossible" feature bit.
3138 ceph_assert(features
& CEPH_FEATURE_RESERVED
);
3139 features
&= ~CEPH_FEATURE_RESERVED
;
3141 size_t start_offset
= bl
.length();
3144 std::optional
<ceph::buffer::list::contiguous_filler
> crc_filler
;
3146 // meta-encoding: how we include client-used and osd-specific data
3147 ENCODE_START(8, 7, bl
);
3150 // NOTE: any new encoding dependencies must be reflected by
3151 // SIGNIFICANT_FEATURES
3153 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
3155 } else if (!HAVE_FEATURE(features
, SERVER_MIMIC
)) {
3157 } else if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
3159 } /* else if (!HAVE_FEATURE(features, SERVER_REEF)) {
3162 ENCODE_START(v
, 1, bl
); // client-usable data
3166 encode(created
, bl
);
3167 encode(modified
, bl
);
3169 encode(pools
, bl
, features
);
3170 encode(pool_name
, bl
);
3171 encode(pool_max
, bl
);
3174 decltype(flags
) f
= flags
;
3175 if (require_osd_release
>= ceph_release_t::luminous
)
3176 f
|= CEPH_OSDMAP_REQUIRE_LUMINOUS
| CEPH_OSDMAP_RECOVERY_DELETES
;
3177 else if (require_osd_release
== ceph_release_t::kraken
)
3178 f
|= CEPH_OSDMAP_REQUIRE_KRAKEN
;
3179 else if (require_osd_release
== ceph_release_t::jewel
)
3180 f
|= CEPH_OSDMAP_REQUIRE_JEWEL
;
3186 encode(max_osd
, bl
);
3188 encode(osd_state
, bl
);
3190 uint32_t n
= osd_state
.size();
3192 for (auto s
: osd_state
) {
3193 encode((uint8_t)s
, bl
);
3196 encode(osd_weight
, bl
);
3198 encode(osd_addrs
->client_addrs
, bl
, features
);
3200 encode_addrvec_pvec_as_addr(osd_addrs
->client_addrs
, bl
, features
);
3203 encode(*pg_temp
, bl
);
3204 encode(*primary_temp
, bl
);
3205 if (osd_primary_affinity
) {
3206 encode(*osd_primary_affinity
, bl
);
3213 ceph::buffer::list cbl
;
3214 crush
->encode(cbl
, features
);
3216 encode(erasure_code_profiles
, bl
);
3219 encode(pg_upmap
, bl
);
3220 encode(pg_upmap_items
, bl
);
3222 ceph_assert(pg_upmap
.empty());
3223 ceph_assert(pg_upmap_items
.empty());
3226 encode(crush_version
, bl
);
3229 encode(new_removed_snaps
, bl
);
3230 encode(new_purged_snaps
, bl
);
3233 encode(last_up_change
, bl
);
3234 encode(last_in_change
, bl
);
3237 encode(pg_upmap_primaries
, bl
);
3239 ceph_assert(pg_upmap_primaries
.empty());
3241 ENCODE_FINISH(bl
); // client-usable data
3245 // NOTE: any new encoding dependencies must be reflected by
3246 // SIGNIFICANT_FEATURES
3247 uint8_t target_v
= 9; // when bumping this, be aware of allow_crimson
3248 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
3250 } else if (!HAVE_FEATURE(features
, SERVER_MIMIC
)) {
3252 } else if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
3255 if (stretch_mode_enabled
) {
3256 target_v
= std::max((uint8_t)10, target_v
);
3258 if (!range_blocklist
.empty()) {
3259 target_v
= std::max((uint8_t)11, target_v
);
3261 if (allow_crimson
) {
3262 target_v
= std::max((uint8_t)12, target_v
);
3264 ENCODE_START(target_v
, 1, bl
); // extended, osd-only data
3266 encode_addrvec_pvec_as_addr(osd_addrs
->hb_back_addrs
, bl
, features
);
3268 encode(osd_addrs
->hb_back_addrs
, bl
, features
);
3270 encode(osd_info
, bl
);
3272 // put this in a sorted, ordered map<> so that we encode in a
3273 // deterministic order.
3274 map
<entity_addr_t
,utime_t
> blocklist_map
;
3275 for (const auto &addr
: blocklist
)
3276 blocklist_map
.insert(make_pair(addr
.first
, addr
.second
));
3277 encode(blocklist_map
, bl
, features
);
3280 encode_addrvec_pvec_as_addr(osd_addrs
->cluster_addrs
, bl
, features
);
3282 encode(osd_addrs
->cluster_addrs
, bl
, features
);
3284 encode(cluster_snapshot_epoch
, bl
);
3285 encode(cluster_snapshot
, bl
);
3286 encode(*osd_uuid
, bl
);
3287 encode(osd_xinfo
, bl
, features
);
3289 encode_addrvec_pvec_as_addr(osd_addrs
->hb_front_addrs
, bl
, features
);
3291 encode(osd_addrs
->hb_front_addrs
, bl
, features
);
3293 if (target_v
>= 2) {
3294 encode(nearfull_ratio
, bl
);
3295 encode(full_ratio
, bl
);
3296 encode(backfillfull_ratio
, bl
);
3298 // 4 was string-based new_require_min_compat_client
3299 if (target_v
>= 5) {
3300 encode(require_min_compat_client
, bl
);
3301 encode(require_osd_release
, bl
);
3303 if (target_v
>= 6) {
3304 encode(removed_snaps_queue
, bl
);
3306 if (target_v
>= 8) {
3307 encode(crush_node_flags
, bl
);
3309 if (target_v
>= 9) {
3310 encode(device_class_flags
, bl
);
3312 if (target_v
>= 10) {
3313 encode(stretch_mode_enabled
, bl
);
3314 encode(stretch_bucket_count
, bl
);
3315 encode(degraded_stretch_mode
, bl
);
3316 encode(recovering_stretch_mode
, bl
);
3317 encode(stretch_mode_bucket
, bl
);
3319 if (target_v
>= 11) {
3320 ::encode(range_blocklist
, bl
, features
);
3322 if (target_v
>= 12) {
3323 ::encode(allow_crimson
, bl
);
3325 ENCODE_FINISH(bl
); // osd-only data
3328 crc_offset
= bl
.length();
3329 crc_filler
= bl
.append_hole(sizeof(uint32_t));
3330 tail_offset
= bl
.length();
3332 ENCODE_FINISH(bl
); // meta-encoding wrapper
3335 ceph::buffer::list front
;
3336 front
.substr_of(bl
, start_offset
, crc_offset
- start_offset
);
3337 crc
= front
.crc32c(-1);
3338 if (tail_offset
< bl
.length()) {
3339 ceph::buffer::list tail
;
3340 tail
.substr_of(bl
, tail_offset
, bl
.length() - tail_offset
);
3341 crc
= tail
.crc32c(crc
);
3345 crc_filler
->copy_in(4, (char*)&crc_le
);
3349 /* for a description of osdmap versions, and when they were introduced, please
3351 * doc/dev/osd_internals/osdmap_versions.txt
3353 void OSDMap::decode(ceph::buffer::list
& bl
)
3355 auto p
= bl
.cbegin();
3359 void OSDMap::decode_classic(ceph::buffer::list::const_iterator
& p
)
3370 decode(modified
, p
);
3374 int32_t max_pools
= 0;
3375 decode(max_pools
, p
);
3376 pool_max
= max_pools
;
3382 decode(pools
[t
], p
);
3387 } else if (v
== 5) {
3392 decode(pool_name
[t
], p
);
3399 decode(pool_name
, p
);
3400 decode(pool_max
, p
);
3402 // kludge around some old bug that zeroed out pool_max (#2307)
3403 if (pools
.size() && pool_max
< pools
.rbegin()->first
) {
3404 pool_max
= pools
.rbegin()->first
;
3413 osd_state
.resize(os
.size());
3414 for (unsigned i
= 0; i
< os
.size(); ++i
) {
3415 osd_state
[i
] = os
[i
];
3418 decode(osd_weight
, p
);
3419 decode(osd_addrs
->client_addrs
, p
);
3425 ceph::decode_raw(opg
, p
);
3426 mempool::osdmap::vector
<int32_t> v
;
3428 pg_temp
->set(pg_t(opg
), v
);
3431 decode(*pg_temp
, p
);
3435 ceph::buffer::list cbl
;
3437 auto cblp
= cbl
.cbegin();
3438 crush
->decode(cblp
);
3444 decode(osd_addrs
->hb_back_addrs
, p
);
3445 decode(osd_info
, p
);
3447 decode(pool_name
, p
);
3449 decode(blocklist
, p
);
3451 decode(osd_addrs
->cluster_addrs
, p
);
3453 osd_addrs
->cluster_addrs
.resize(osd_addrs
->client_addrs
.size());
3456 decode(cluster_snapshot_epoch
, p
);
3457 decode(cluster_snapshot
, p
);
3461 decode(*osd_uuid
, p
);
3463 osd_uuid
->resize(max_osd
);
3466 decode(osd_xinfo
, p
);
3468 osd_xinfo
.resize(max_osd
);
3471 decode(osd_addrs
->hb_front_addrs
, p
);
3473 osd_addrs
->hb_front_addrs
.resize(osd_addrs
->hb_back_addrs
.size());
3475 osd_primary_affinity
.reset();
3480 void OSDMap::decode(ceph::buffer::list::const_iterator
& bl
)
3484 * Older encodings of the OSDMap had a single struct_v which
3485 * covered the whole encoding, and was prior to our modern
3486 * stuff which includes a compatv and a size. So if we see
3487 * a struct_v < 7, we must rewind to the beginning and use our
3490 size_t start_offset
= bl
.get_off();
3491 size_t tail_offset
= 0;
3492 ceph::buffer::list crc_front
, crc_tail
;
3494 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl
); // wrapper
3496 bl
.seek(start_offset
);
3501 * Since we made it past that hurdle, we can use our normal paths.
3504 DECODE_START(9, bl
); // client-usable data
3508 decode(created
, bl
);
3509 decode(modified
, bl
);
3512 decode(pool_name
, bl
);
3513 decode(pool_max
, bl
);
3517 decode(max_osd
, bl
);
3518 if (struct_v
>= 5) {
3519 decode(osd_state
, bl
);
3523 osd_state
.resize(os
.size());
3524 for (unsigned i
= 0; i
< os
.size(); ++i
) {
3525 osd_state
[i
] = os
[i
];
3528 decode(osd_weight
, bl
);
3529 decode(osd_addrs
->client_addrs
, bl
);
3531 decode(*pg_temp
, bl
);
3532 decode(*primary_temp
, bl
);
3533 // dates back to firefly. version increased from 2 to 3 still in firefly.
3534 // do we really still need to keep this around? even for old clients?
3535 if (struct_v
>= 2) {
3536 osd_primary_affinity
.reset(new mempool::osdmap::vector
<__u32
>);
3537 decode(*osd_primary_affinity
, bl
);
3538 if (osd_primary_affinity
->empty())
3539 osd_primary_affinity
.reset();
3541 osd_primary_affinity
.reset();
3545 ceph::buffer::list cbl
;
3547 auto cblp
= cbl
.cbegin();
3548 crush
->decode(cblp
);
3549 // added in firefly; version increased in luminous, so it affects
3550 // giant, hammer, infernalis, jewel, and kraken. probably should be left
3551 // alone until we require clients to be all luminous?
3552 if (struct_v
>= 3) {
3553 decode(erasure_code_profiles
, bl
);
3555 erasure_code_profiles
.clear();
3557 // version increased from 3 to 4 still in luminous, so same as above
3559 if (struct_v
>= 4) {
3560 decode(pg_upmap
, bl
);
3561 decode(pg_upmap_items
, bl
);
3564 pg_upmap_items
.clear();
3566 // again, version increased from 5 to 6 still in luminous, so above
3568 if (struct_v
>= 6) {
3569 decode(crush_version
, bl
);
3571 // version increase from 6 to 7 in mimic
3572 if (struct_v
>= 7) {
3573 decode(new_removed_snaps
, bl
);
3574 decode(new_purged_snaps
, bl
);
3576 // version increase from 7 to 8, 8 to 9, in nautilus.
3577 if (struct_v
>= 9) {
3578 decode(last_up_change
, bl
);
3579 decode(last_in_change
, bl
);
3581 if (struct_v
>= 10) {
3582 decode(pg_upmap_primaries
, bl
);
3584 pg_upmap_primaries
.clear();
3586 DECODE_FINISH(bl
); // client-usable data
3590 DECODE_START(10, bl
); // extended, osd-only data
3591 decode(osd_addrs
->hb_back_addrs
, bl
);
3592 decode(osd_info
, bl
);
3593 decode(blocklist
, bl
);
3594 decode(osd_addrs
->cluster_addrs
, bl
);
3595 decode(cluster_snapshot_epoch
, bl
);
3596 decode(cluster_snapshot
, bl
);
3597 decode(*osd_uuid
, bl
);
3598 decode(osd_xinfo
, bl
);
3599 decode(osd_addrs
->hb_front_addrs
, bl
);
3601 if (struct_v
>= 2) {
3602 decode(nearfull_ratio
, bl
);
3603 decode(full_ratio
, bl
);
3608 if (struct_v
>= 3) {
3609 decode(backfillfull_ratio
, bl
);
3611 backfillfull_ratio
= 0;
3613 if (struct_v
== 4) {
3617 require_min_compat_client
= ceph_release_from_name(r
.c_str());
3619 if (struct_v
>= 5) {
3620 decode(require_min_compat_client
, bl
);
3621 decode(require_osd_release
, bl
);
3622 if (require_osd_release
>= ceph_release_t::nautilus
) {
3623 flags
|= CEPH_OSDMAP_PGLOG_HARDLIMIT
;
3625 if (require_osd_release
>= ceph_release_t::luminous
) {
3626 flags
&= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS
);
3627 flags
|= CEPH_OSDMAP_RECOVERY_DELETES
;
3630 if (flags
& CEPH_OSDMAP_REQUIRE_LUMINOUS
) {
3631 // only for compat with post-kraken pre-luminous test clusters
3632 require_osd_release
= ceph_release_t::luminous
;
3633 flags
&= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS
);
3634 flags
|= CEPH_OSDMAP_RECOVERY_DELETES
;
3635 } else if (flags
& CEPH_OSDMAP_REQUIRE_KRAKEN
) {
3636 require_osd_release
= ceph_release_t::kraken
;
3637 } else if (flags
& CEPH_OSDMAP_REQUIRE_JEWEL
) {
3638 require_osd_release
= ceph_release_t::jewel
;
3640 require_osd_release
= ceph_release_t::unknown
;
3643 if (struct_v
>= 6) {
3644 decode(removed_snaps_queue
, bl
);
3646 if (struct_v
>= 8) {
3647 decode(crush_node_flags
, bl
);
3649 crush_node_flags
.clear();
3651 if (struct_v
>= 9) {
3652 decode(device_class_flags
, bl
);
3654 device_class_flags
.clear();
3656 if (struct_v
>= 10) {
3657 decode(stretch_mode_enabled
, bl
);
3658 decode(stretch_bucket_count
, bl
);
3659 decode(degraded_stretch_mode
, bl
);
3660 decode(recovering_stretch_mode
, bl
);
3661 decode(stretch_mode_bucket
, bl
);
3663 stretch_mode_enabled
= false;
3664 stretch_bucket_count
= 0;
3665 degraded_stretch_mode
= 0;
3666 recovering_stretch_mode
= 0;
3667 stretch_mode_bucket
= 0;
3669 if (struct_v
>= 11) {
3670 decode(range_blocklist
, bl
);
3671 calculated_ranges
.clear();
3672 for (const auto& i
: range_blocklist
) {
3673 calculated_ranges
.emplace(i
.first
, i
.first
);
3676 if (struct_v
>= 12) {
3677 decode(allow_crimson
, bl
);
3679 DECODE_FINISH(bl
); // osd-only data
3682 if (struct_v
>= 8) {
3683 crc_front
.substr_of(bl
.get_bl(), start_offset
, bl
.get_off() - start_offset
);
3685 tail_offset
= bl
.get_off();
3688 crc_defined
= false;
3692 DECODE_FINISH(bl
); // wrapper
3696 uint32_t actual
= crc_front
.crc32c(-1);
3697 if (tail_offset
< bl
.get_off()) {
3698 ceph::buffer::list tail
;
3699 tail
.substr_of(bl
.get_bl(), tail_offset
, bl
.get_off() - tail_offset
);
3700 actual
= tail
.crc32c(actual
);
3702 if (crc
!= actual
) {
3704 ss
<< "bad crc, actual " << actual
<< " != expected " << crc
;
3705 string s
= ss
.str();
3706 throw ceph::buffer::malformed_input(s
.c_str());
3713 void OSDMap::post_decode()
3717 for (const auto &pname
: pool_name
) {
3718 name_pool
[pname
.second
] = pname
.first
;
3722 _calc_up_osd_features();
3725 void OSDMap::dump_erasure_code_profiles(
3726 const mempool::osdmap::map
<string
,map
<string
,string
>>& profiles
,
3729 f
->open_object_section("erasure_code_profiles");
3730 for (const auto &profile
: profiles
) {
3731 f
->open_object_section(profile
.first
.c_str());
3732 for (const auto &profm
: profile
.second
) {
3733 f
->dump_string(profm
.first
.c_str(), profm
.second
);
3740 void OSDMap::dump_osds(Formatter
*f
) const
3742 f
->open_array_section("osds");
3743 for (int i
=0; i
<get_max_osd(); i
++) {
3751 void OSDMap::dump_osd(int id
, Formatter
*f
) const
3753 ceph_assert(f
!= nullptr);
3758 f
->open_object_section("osd_info");
3759 f
->dump_int("osd", id
);
3760 f
->dump_stream("uuid") << get_uuid(id
);
3761 f
->dump_int("up", is_up(id
));
3762 f
->dump_int("in", is_in(id
));
3763 f
->dump_float("weight", get_weightf(id
));
3764 f
->dump_float("primary_affinity", get_primary_affinityf(id
));
3765 get_info(id
).dump(f
);
3766 f
->dump_object("public_addrs", get_addrs(id
));
3767 f
->dump_object("cluster_addrs", get_cluster_addrs(id
));
3768 f
->dump_object("heartbeat_back_addrs", get_hb_back_addrs(id
));
3769 f
->dump_object("heartbeat_front_addrs", get_hb_front_addrs(id
));
3771 f
->dump_stream("public_addr") << get_addrs(id
).get_legacy_str();
3772 f
->dump_stream("cluster_addr") << get_cluster_addrs(id
).get_legacy_str();
3773 f
->dump_stream("heartbeat_back_addr")
3774 << get_hb_back_addrs(id
).get_legacy_str();
3775 f
->dump_stream("heartbeat_front_addr")
3776 << get_hb_front_addrs(id
).get_legacy_str();
3780 f
->open_array_section("state");
3781 for (const auto &state
: st
)
3782 f
->dump_string("state", state
);
3788 void OSDMap::dump_pool(CephContext
*cct
,
3790 const pg_pool_t
&pdata
,
3791 ceph::Formatter
*f
) const
3793 std::string
name("<unknown>");
3794 const auto &pni
= pool_name
.find(pid
);
3795 if (pni
!= pool_name
.end())
3797 f
->open_object_section("pool");
3798 f
->dump_int("pool", pid
);
3799 f
->dump_string("pool_name", name
);
3801 dump_read_balance_score(cct
, pid
, pdata
, f
);
3802 f
->close_section(); // pool
3805 void OSDMap::dump_read_balance_score(CephContext
*cct
,
3807 const pg_pool_t
&pdata
,
3808 ceph::Formatter
*f
) const
3810 if (pdata
.is_replicated()) {
3811 // Add rb section with values for score, optimal score, raw score
3812 // and primary_affinity average
3813 OSDMap::read_balance_info_t rb_info
;
3814 auto rc
= calc_read_balance_score(cct
, pid
, &rb_info
);
3816 f
->open_object_section("read_balance");
3817 f
->dump_float("score_acting", rb_info
.acting_adj_score
);
3818 f
->dump_float("score_stable", rb_info
.adjusted_score
);
3819 f
->dump_float("optimal_score", rb_info
.optimal_score
);
3820 f
->dump_float("raw_score_acting", rb_info
.acting_raw_score
);
3821 f
->dump_float("raw_score_stable", rb_info
.raw_score
);
3822 f
->dump_float("primary_affinity_weighted", rb_info
.pa_weighted
);
3823 f
->dump_float("average_primary_affinity", rb_info
.pa_avg
);
3824 f
->dump_float("average_primary_affinity_weighted", rb_info
.pa_weighted_avg
);
3825 if (rb_info
.err_msg
.length() > 0) {
3826 f
->dump_string("error_message", rb_info
.err_msg
);
3828 f
->close_section(); // read_balance
3831 if (rb_info
.err_msg
.length() > 0) {
3832 f
->open_object_section("read_balance");
3833 f
->dump_string("error_message", rb_info
.err_msg
);
3834 f
->dump_float("score_acting", rb_info
.acting_adj_score
);
3835 f
->dump_float("score_stable", rb_info
.adjusted_score
);
3836 f
->close_section(); // read_balance
3842 void OSDMap::dump(Formatter
*f
, CephContext
*cct
) const
3844 f
->dump_int("epoch", get_epoch());
3845 f
->dump_stream("fsid") << get_fsid();
3846 f
->dump_stream("created") << get_created();
3847 f
->dump_stream("modified") << get_modified();
3848 f
->dump_stream("last_up_change") << last_up_change
;
3849 f
->dump_stream("last_in_change") << last_in_change
;
3850 f
->dump_string("flags", get_flag_string());
3851 f
->dump_unsigned("flags_num", flags
);
3852 f
->open_array_section("flags_set");
3853 set
<string
> flagset
;
3854 get_flag_set(&flagset
);
3855 for (auto p
: flagset
) {
3856 f
->dump_string("flag", p
);
3859 f
->dump_unsigned("crush_version", get_crush_version());
3860 f
->dump_float("full_ratio", full_ratio
);
3861 f
->dump_float("backfillfull_ratio", backfillfull_ratio
);
3862 f
->dump_float("nearfull_ratio", nearfull_ratio
);
3863 f
->dump_string("cluster_snapshot", get_cluster_snapshot());
3864 f
->dump_int("pool_max", get_pool_max());
3865 f
->dump_int("max_osd", get_max_osd());
3866 f
->dump_string("require_min_compat_client",
3867 to_string(require_min_compat_client
));
3868 f
->dump_string("min_compat_client",
3869 to_string(get_min_compat_client()));
3870 f
->dump_string("require_osd_release",
3871 to_string(require_osd_release
));
3873 f
->dump_bool("allow_crimson", allow_crimson
);
3874 f
->open_array_section("pools");
3875 for (const auto &[pid
, pdata
] : pools
) {
3876 dump_pool(cct
, pid
, pdata
, f
);
3882 f
->open_array_section("osd_xinfo");
3883 for (int i
=0; i
<get_max_osd(); i
++) {
3885 f
->open_object_section("xinfo");
3886 f
->dump_int("osd", i
);
3887 osd_xinfo
[i
].dump(f
);
3893 f
->open_array_section("pg_upmap");
3894 for (auto& p
: pg_upmap
) {
3895 f
->open_object_section("mapping");
3896 f
->dump_stream("pgid") << p
.first
;
3897 f
->open_array_section("osds");
3898 for (auto q
: p
.second
) {
3899 f
->dump_int("osd", q
);
3906 f
->open_array_section("pg_upmap_items");
3907 for (auto& [pgid
, mappings
] : pg_upmap_items
) {
3908 f
->open_object_section("mapping");
3909 f
->dump_stream("pgid") << pgid
;
3910 f
->open_array_section("mappings");
3911 for (auto& [from
, to
] : mappings
) {
3912 f
->open_object_section("mapping");
3913 f
->dump_int("from", from
);
3914 f
->dump_int("to", to
);
3922 f
->open_array_section("pg_upmap_primaries");
3923 for (const auto& [pg
, osd
] : pg_upmap_primaries
) {
3924 f
->open_object_section("primary_mapping");
3925 f
->dump_stream("pgid") << pg
;
3926 f
->dump_int("primary_osd", osd
);
3929 f
->close_section(); // pg_upmap_primaries
3931 f
->open_array_section("pg_temp");
3935 f
->open_array_section("primary_temp");
3936 for (const auto &pg
: *primary_temp
) {
3937 f
->dump_stream("pgid") << pg
.first
;
3938 f
->dump_int("osd", pg
.second
);
3940 f
->close_section(); // primary_temp
3942 f
->open_object_section("blocklist");
3943 for (const auto &addr
: blocklist
) {
3946 f
->dump_stream(ss
.str().c_str()) << addr
.second
;
3949 f
->open_object_section("range_blocklist");
3950 for (const auto &addr
: range_blocklist
) {
3953 f
->dump_stream(ss
.str().c_str()) << addr
.second
;
3957 dump_erasure_code_profiles(erasure_code_profiles
, f
);
3959 f
->open_array_section("removed_snaps_queue");
3960 for (auto& p
: removed_snaps_queue
) {
3961 f
->open_object_section("pool");
3962 f
->dump_int("pool", p
.first
);
3963 f
->open_array_section("snaps");
3964 for (auto q
= p
.second
.begin(); q
!= p
.second
.end(); ++q
) {
3965 f
->open_object_section("interval");
3966 f
->dump_unsigned("begin", q
.get_start());
3967 f
->dump_unsigned("length", q
.get_len());
3974 f
->open_array_section("new_removed_snaps");
3975 for (auto& p
: new_removed_snaps
) {
3976 f
->open_object_section("pool");
3977 f
->dump_int("pool", p
.first
);
3978 f
->open_array_section("snaps");
3979 for (auto q
= p
.second
.begin(); q
!= p
.second
.end(); ++q
) {
3980 f
->open_object_section("interval");
3981 f
->dump_unsigned("begin", q
.get_start());
3982 f
->dump_unsigned("length", q
.get_len());
3989 f
->open_array_section("new_purged_snaps");
3990 for (auto& p
: new_purged_snaps
) {
3991 f
->open_object_section("pool");
3992 f
->dump_int("pool", p
.first
);
3993 f
->open_array_section("snaps");
3994 for (auto q
= p
.second
.begin(); q
!= p
.second
.end(); ++q
) {
3995 f
->open_object_section("interval");
3996 f
->dump_unsigned("begin", q
.get_start());
3997 f
->dump_unsigned("length", q
.get_len());
4004 f
->open_object_section("crush_node_flags");
4005 for (auto& i
: crush_node_flags
) {
4006 string s
= crush
->item_exists(i
.first
) ? crush
->get_item_name(i
.first
)
4007 : stringify(i
.first
);
4008 f
->open_array_section(s
.c_str());
4010 calc_state_set(i
.second
, st
);
4011 for (auto& j
: st
) {
4012 f
->dump_string("flag", j
);
4017 f
->open_object_section("device_class_flags");
4018 for (auto& i
: device_class_flags
) {
4019 const char* class_name
= crush
->get_class_name(i
.first
);
4020 string s
= class_name
? class_name
: stringify(i
.first
);
4021 f
->open_array_section(s
.c_str());
4023 calc_state_set(i
.second
, st
);
4024 for (auto& j
: st
) {
4025 f
->dump_string("flag", j
);
4030 f
->open_object_section("stretch_mode");
4032 f
->dump_bool("stretch_mode_enabled", stretch_mode_enabled
);
4033 f
->dump_unsigned("stretch_bucket_count", stretch_bucket_count
);
4034 f
->dump_unsigned("degraded_stretch_mode", degraded_stretch_mode
);
4035 f
->dump_unsigned("recovering_stretch_mode", recovering_stretch_mode
);
4036 f
->dump_int("stretch_mode_bucket", stretch_mode_bucket
);
4041 void OSDMap::generate_test_instances(list
<OSDMap
*>& o
)
4043 o
.push_back(new OSDMap
);
4045 CephContext
*cct
= new CephContext(CODE_ENVIRONMENT_UTILITY
);
4046 o
.push_back(new OSDMap
);
4048 o
.back()->build_simple(cct
, 1, fsid
, 16);
4049 o
.back()->created
= o
.back()->modified
= utime_t(1, 2); // fix timestamp
4050 o
.back()->blocklist
[entity_addr_t()] = utime_t(5, 6);
4054 string
OSDMap::get_flag_string(unsigned f
)
4057 if (f
& CEPH_OSDMAP_PAUSERD
)
4059 if (f
& CEPH_OSDMAP_PAUSEWR
)
4061 if (f
& CEPH_OSDMAP_PAUSEREC
)
4063 if (f
& CEPH_OSDMAP_NOUP
)
4065 if (f
& CEPH_OSDMAP_NODOWN
)
4067 if (f
& CEPH_OSDMAP_NOOUT
)
4069 if (f
& CEPH_OSDMAP_NOIN
)
4071 if (f
& CEPH_OSDMAP_NOBACKFILL
)
4073 if (f
& CEPH_OSDMAP_NOREBALANCE
)
4074 s
+= ",norebalance";
4075 if (f
& CEPH_OSDMAP_NORECOVER
)
4077 if (f
& CEPH_OSDMAP_NOSCRUB
)
4079 if (f
& CEPH_OSDMAP_NODEEP_SCRUB
)
4080 s
+= ",nodeep-scrub";
4081 if (f
& CEPH_OSDMAP_NOTIERAGENT
)
4082 s
+= ",notieragent";
4083 if (f
& CEPH_OSDMAP_NOSNAPTRIM
)
4085 if (f
& CEPH_OSDMAP_SORTBITWISE
)
4086 s
+= ",sortbitwise";
4087 if (f
& CEPH_OSDMAP_REQUIRE_JEWEL
)
4088 s
+= ",require_jewel_osds";
4089 if (f
& CEPH_OSDMAP_REQUIRE_KRAKEN
)
4090 s
+= ",require_kraken_osds";
4091 if (f
& CEPH_OSDMAP_REQUIRE_LUMINOUS
)
4092 s
+= ",require_luminous_osds";
4093 if (f
& CEPH_OSDMAP_RECOVERY_DELETES
)
4094 s
+= ",recovery_deletes";
4095 if (f
& CEPH_OSDMAP_PURGED_SNAPDIRS
)
4096 s
+= ",purged_snapdirs";
4097 if (f
& CEPH_OSDMAP_PGLOG_HARDLIMIT
)
4098 s
+= ",pglog_hardlimit";
4104 string
OSDMap::get_flag_string() const
4106 return get_flag_string(flags
);
4109 void OSDMap::print_pools(CephContext
*cct
, ostream
& out
) const
4111 for (const auto &[pid
, pdata
] : pools
) {
4112 std::string
name("<unknown>");
4113 const auto &pni
= pool_name
.find(pid
);
4114 if (pni
!= pool_name
.end())
4116 char rb_score_str
[32] = "";
4118 read_balance_info_t rb_info
;
4119 if (pdata
.is_replicated()) {
4120 rc
= calc_read_balance_score(cct
, pid
, &rb_info
);
4122 snprintf (rb_score_str
, sizeof(rb_score_str
),
4123 " read_balance_score %.2f", rb_info
.acting_adj_score
);
4126 out
<< "pool " << pid
4129 << rb_score_str
<< "\n";
4130 if (rb_info
.err_msg
.length() > 0) {
4131 out
<< (rc
< 0 ? " ERROR: " : " Warning: ") << rb_info
.err_msg
<< "\n";
4134 //TODO - print error messages here.
4136 for (const auto &snap
: pdata
.snaps
)
4137 out
<< "\tsnap " << snap
.second
.snapid
<< " '" << snap
.second
.name
<< "' " << snap
.second
.stamp
<< "\n";
4139 if (!pdata
.removed_snaps
.empty())
4140 out
<< "\tremoved_snaps " << pdata
.removed_snaps
<< "\n";
4141 auto p
= removed_snaps_queue
.find(pid
);
4142 if (p
!= removed_snaps_queue
.end()) {
4143 out
<< "\tremoved_snaps_queue " << p
->second
<< "\n";
4149 void OSDMap::print_osds(ostream
& out
) const
4151 for (int i
=0; i
<get_max_osd(); i
++) {
4157 void OSDMap::print_osd(int id
, ostream
& out
) const
4163 out
<< "osd." << id
;
4164 out
<< (is_up(id
) ? " up ":" down");
4165 out
<< (is_in(id
) ? " in ":" out");
4166 out
<< " weight " << get_weightf(id
);
4167 if (get_primary_affinity(id
) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
) {
4168 out
<< " primary_affinity " << get_primary_affinityf(id
);
4170 const osd_info_t
& info(get_info(id
));
4172 out
<< " " << get_addrs(id
) << " " << get_cluster_addrs(id
);
4176 if (!get_uuid(id
).is_zero()) {
4177 out
<< " " << get_uuid(id
);
4182 void OSDMap::print(CephContext
*cct
, ostream
& out
) const
4184 out
<< "epoch " << get_epoch() << "\n"
4185 << "fsid " << get_fsid() << "\n"
4186 << "created " << get_created() << "\n"
4187 << "modified " << get_modified() << "\n";
4189 out
<< "flags " << get_flag_string() << "\n";
4190 out
<< "crush_version " << get_crush_version() << "\n";
4191 out
<< "full_ratio " << full_ratio
<< "\n";
4192 out
<< "backfillfull_ratio " << backfillfull_ratio
<< "\n";
4193 out
<< "nearfull_ratio " << nearfull_ratio
<< "\n";
4194 if (require_min_compat_client
!= ceph_release_t::unknown
) {
4195 out
<< "require_min_compat_client "
4196 << require_min_compat_client
<< "\n";
4198 out
<< "min_compat_client " << get_min_compat_client()
4200 if (require_osd_release
> ceph_release_t::unknown
) {
4201 out
<< "require_osd_release " << require_osd_release
4204 out
<< "stretch_mode_enabled " << (stretch_mode_enabled
? "true" : "false") << "\n";
4205 if (stretch_mode_enabled
) {
4206 out
<< "stretch_bucket_count " << stretch_bucket_count
<< "\n";
4207 out
<< "degraded_stretch_mode " << degraded_stretch_mode
<< "\n";
4208 out
<< "recovering_stretch_mode " << recovering_stretch_mode
<< "\n";
4209 out
<< "stretch_mode_bucket " << stretch_mode_bucket
<< "\n";
4211 if (get_cluster_snapshot().length())
4212 out
<< "cluster_snapshot " << get_cluster_snapshot() << "\n";
4213 if (allow_crimson
) {
4214 out
<< "allow_crimson=true\n";
4218 print_pools(cct
, out
);
4220 out
<< "max_osd " << get_max_osd() << "\n";
4224 for (auto& p
: pg_upmap
) {
4225 out
<< "pg_upmap " << p
.first
<< " " << p
.second
<< "\n";
4227 for (auto& p
: pg_upmap_items
) {
4228 out
<< "pg_upmap_items " << p
.first
<< " " << p
.second
<< "\n";
4231 for (auto& [pg
, osd
] : pg_upmap_primaries
) {
4232 out
<< "pg_upmap_primary " << pg
<< " " << osd
<< "\n";
4235 for (const auto& pg
: *pg_temp
)
4236 out
<< "pg_temp " << pg
.first
<< " " << pg
.second
<< "\n";
4238 for (const auto& pg
: *primary_temp
)
4239 out
<< "primary_temp " << pg
.first
<< " " << pg
.second
<< "\n";
4241 for (const auto &addr
: blocklist
)
4242 out
<< "blocklist " << addr
.first
<< " expires " << addr
.second
<< "\n";
4243 for (const auto &addr
: range_blocklist
)
4244 out
<< "range blocklist " << addr
.first
<< " expires " << addr
.second
<< "\n";
4247 class OSDTreePlainDumper
: public CrushTreeDumper::Dumper
<TextTable
> {
4249 typedef CrushTreeDumper::Dumper
<TextTable
> Parent
;
4251 OSDTreePlainDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap_
,
4253 : Parent(crush
, osdmap_
->get_pool_names()), osdmap(osdmap_
), filter(f
) { }
4255 bool should_dump_leaf(int i
) const override
{
4257 return true; // normal case
4259 if (((filter
& OSDMap::DUMP_UP
) && osdmap
->is_up(i
)) ||
4260 ((filter
& OSDMap::DUMP_DOWN
) && osdmap
->is_down(i
)) ||
4261 ((filter
& OSDMap::DUMP_IN
) && osdmap
->is_in(i
)) ||
4262 ((filter
& OSDMap::DUMP_OUT
) && osdmap
->is_out(i
)) ||
4263 ((filter
& OSDMap::DUMP_DESTROYED
) && osdmap
->is_destroyed(i
))) {
4269 bool should_dump_empty_bucket() const override
{
4273 void init_table(TextTable
*tbl
) {
4274 tbl
->define_column("ID", TextTable::LEFT
, TextTable::RIGHT
);
4275 tbl
->define_column("CLASS", TextTable::LEFT
, TextTable::RIGHT
);
4276 tbl
->define_column("WEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
4277 tbl
->define_column("TYPE NAME", TextTable::LEFT
, TextTable::LEFT
);
4278 tbl
->define_column("STATUS", TextTable::LEFT
, TextTable::RIGHT
);
4279 tbl
->define_column("REWEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
4280 tbl
->define_column("PRI-AFF", TextTable::LEFT
, TextTable::RIGHT
);
4282 void dump(TextTable
*tbl
, string
& bucket
) {
4285 if (!bucket
.empty()) {
4290 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
4291 if (osdmap
->exists(i
) && !is_touched(i
) && should_dump_leaf(i
)) {
4292 dump_item(CrushTreeDumper::Item(i
, 0, 0, 0), tbl
);
4299 void dump_item(const CrushTreeDumper::Item
&qi
, TextTable
*tbl
) override
{
4300 const char *c
= crush
->get_item_class(qi
.id
);
4305 << weightf_t(qi
.weight
);
4308 for (int k
= 0; k
< qi
.depth
; k
++)
4310 if (qi
.is_bucket()) {
4311 name
<< crush
->get_type_name(crush
->get_bucket_type(qi
.id
)) << " "
4312 << crush
->get_item_name(qi
.id
);
4314 name
<< "osd." << qi
.id
;
4318 if (!qi
.is_bucket()) {
4319 if (!osdmap
->exists(qi
.id
)) {
4324 if (osdmap
->is_up(qi
.id
)) {
4326 } else if (osdmap
->is_destroyed(qi
.id
)) {
4332 << weightf_t(osdmap
->get_weightf(qi
.id
))
4333 << weightf_t(osdmap
->get_primary_affinityf(qi
.id
));
4336 *tbl
<< TextTable::endrow
;
4340 const OSDMap
*osdmap
;
4341 const unsigned filter
;
4344 class OSDTreeFormattingDumper
: public CrushTreeDumper::FormattingDumper
{
4346 typedef CrushTreeDumper::FormattingDumper Parent
;
4348 OSDTreeFormattingDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap_
,
4350 : Parent(crush
, osdmap_
->get_pool_names()), osdmap(osdmap_
), filter(f
) { }
4352 bool should_dump_leaf(int i
) const override
{
4354 return true; // normal case
4356 if (((filter
& OSDMap::DUMP_UP
) && osdmap
->is_up(i
)) ||
4357 ((filter
& OSDMap::DUMP_DOWN
) && osdmap
->is_down(i
)) ||
4358 ((filter
& OSDMap::DUMP_IN
) && osdmap
->is_in(i
)) ||
4359 ((filter
& OSDMap::DUMP_OUT
) && osdmap
->is_out(i
)) ||
4360 ((filter
& OSDMap::DUMP_DESTROYED
) && osdmap
->is_destroyed(i
))) {
4366 bool should_dump_empty_bucket() const override
{
4370 void dump(Formatter
*f
, string
& bucket
) {
4371 if (!bucket
.empty()) {
4373 f
->open_array_section("nodes");
4377 f
->open_array_section("nodes");
4380 f
->open_array_section("stray");
4381 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
4382 if (osdmap
->exists(i
) && !is_touched(i
) && should_dump_leaf(i
))
4383 dump_item(CrushTreeDumper::Item(i
, 0, 0, 0), f
);
4390 void dump_item_fields(const CrushTreeDumper::Item
&qi
, Formatter
*f
) override
{
4391 Parent::dump_item_fields(qi
, f
);
4392 if (!qi
.is_bucket())
4395 if (osdmap
->is_up(qi
.id
)) {
4397 } else if (osdmap
->is_destroyed(qi
.id
)) {
4402 f
->dump_unsigned("exists", (int)osdmap
->exists(qi
.id
));
4403 f
->dump_string("status", s
);
4404 f
->dump_float("reweight", osdmap
->get_weightf(qi
.id
));
4405 f
->dump_float("primary_affinity", osdmap
->get_primary_affinityf(qi
.id
));
4410 const OSDMap
*osdmap
;
4411 const unsigned filter
;
4414 void OSDMap::print_tree(Formatter
*f
, ostream
*out
, unsigned filter
, string bucket
) const
4417 OSDTreeFormattingDumper(crush
.get(), this, filter
).dump(f
, bucket
);
4421 OSDTreePlainDumper(crush
.get(), this, filter
).dump(&tbl
, bucket
);
4426 void OSDMap::print_summary(Formatter
*f
, ostream
& out
,
4427 const string
& prefix
, bool extra
) const
4430 f
->dump_int("epoch", get_epoch());
4431 f
->dump_int("num_osds", get_num_osds());
4432 f
->dump_int("num_up_osds", get_num_up_osds());
4433 f
->dump_int("osd_up_since", last_up_change
.to_msec() / 1000);
4434 f
->dump_int("num_in_osds", get_num_in_osds());
4435 f
->dump_int("osd_in_since", last_in_change
.to_msec() / 1000);
4436 f
->dump_unsigned("num_remapped_pgs", get_num_pg_temp());
4438 utime_t now
= ceph_clock_now();
4439 out
<< get_num_osds() << " osds: "
4440 << get_num_up_osds() << " up";
4441 if (last_up_change
!= utime_t()) {
4442 out
<< " (since " << utimespan_str(now
- last_up_change
) << ")";
4444 out
<< ", " << get_num_in_osds() << " in";
4445 if (last_in_change
!= utime_t()) {
4446 out
<< " (since " << utimespan_str(now
- last_in_change
) << ")";
4449 out
<< "; epoch: e" << get_epoch();
4450 if (get_num_pg_temp())
4451 out
<< "; " << get_num_pg_temp() << " remapped pgs";
4453 uint64_t important_flags
= flags
& ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS
;
4454 if (important_flags
)
4455 out
<< prefix
<< "flags " << get_flag_string(important_flags
) << "\n";
4459 void OSDMap::print_oneline_summary(ostream
& out
) const
4461 out
<< "e" << get_epoch() << ": "
4462 << get_num_osds() << " total, "
4463 << get_num_up_osds() << " up, "
4464 << get_num_in_osds() << " in";
4467 bool OSDMap::crush_rule_in_use(int rule_id
) const
4469 for (const auto &pool
: pools
) {
4470 if (pool
.second
.crush_rule
== rule_id
)
4476 int OSDMap::validate_crush_rules(CrushWrapper
*newcrush
,
4479 for (auto& i
: pools
) {
4480 auto& pool
= i
.second
;
4481 int ruleno
= pool
.get_crush_rule();
4482 if (!newcrush
->rule_exists(ruleno
)) {
4483 *ss
<< "pool " << i
.first
<< " references crush_rule " << ruleno
4484 << " but it is not present";
4487 if (newcrush
->get_rule_type(ruleno
) != (int)pool
.get_type()) {
4488 *ss
<< "pool " << i
.first
<< " type does not match rule " << ruleno
;
4495 int OSDMap::build_simple_optioned(CephContext
*cct
, epoch_t e
, uuid_d
&fsid
,
4496 int nosd
, int pg_bits
, int pgp_bits
,
4499 ldout(cct
, 10) << "build_simple on " << nosd
4500 << " osds" << dendl
;
4503 created
= modified
= ceph_clock_now();
4510 const auto& conf
= cct
->_conf
;
4511 vector
<string
> sections
;
4512 conf
.get_all_sections(sections
);
4514 for (auto §ion
: sections
) {
4515 if (section
.find("osd.") != 0)
4518 const char *begin
= section
.c_str() + 4;
4519 char *end
= (char*)begin
;
4520 int o
= strtol(begin
, &end
, 10);
4524 if (o
> cct
->_conf
->mon_max_osd
) {
4525 lderr(cct
) << "[osd." << o
<< "] in config has id > mon_max_osd " << cct
->_conf
->mon_max_osd
<< dendl
;
4533 set_max_osd(maxosd
+ 1);
4540 r
= build_simple_crush_map(cct
, *crush
, nosd
, &ss
);
4542 r
= build_simple_crush_map_from_conf(cct
, *crush
, &ss
);
4543 ceph_assert(r
== 0);
4545 int poolbase
= get_max_osd() ? get_max_osd() : 1;
4547 const int default_replicated_rule
= crush
->get_osd_pool_default_crush_replicated_rule(cct
);
4548 ceph_assert(default_replicated_rule
>= 0);
4551 // pgp_num <= pg_num
4552 if (pgp_bits
> pg_bits
)
4555 vector
<string
> pool_names
;
4556 pool_names
.push_back("rbd");
4557 for (auto &plname
: pool_names
) {
4558 int64_t pool
= ++pool_max
;
4559 pools
[pool
].type
= pg_pool_t::TYPE_REPLICATED
;
4560 pools
[pool
].flags
= cct
->_conf
->osd_pool_default_flags
;
4561 if (cct
->_conf
->osd_pool_default_flag_hashpspool
)
4562 pools
[pool
].set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
4563 if (cct
->_conf
->osd_pool_default_flag_nodelete
)
4564 pools
[pool
].set_flag(pg_pool_t::FLAG_NODELETE
);
4565 if (cct
->_conf
->osd_pool_default_flag_nopgchange
)
4566 pools
[pool
].set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
4567 if (cct
->_conf
->osd_pool_default_flag_nosizechange
)
4568 pools
[pool
].set_flag(pg_pool_t::FLAG_NOSIZECHANGE
);
4569 if (cct
->_conf
->osd_pool_default_flag_bulk
)
4570 pools
[pool
].set_flag(pg_pool_t::FLAG_BULK
);
4571 pools
[pool
].size
= cct
->_conf
.get_val
<uint64_t>("osd_pool_default_size");
4572 pools
[pool
].min_size
= cct
->_conf
.get_osd_pool_default_min_size(
4574 pools
[pool
].crush_rule
= default_replicated_rule
;
4575 pools
[pool
].object_hash
= CEPH_STR_HASH_RJENKINS
;
4576 pools
[pool
].set_pg_num(poolbase
<< pg_bits
);
4577 pools
[pool
].set_pgp_num(poolbase
<< pgp_bits
);
4578 pools
[pool
].set_pg_num_target(poolbase
<< pg_bits
);
4579 pools
[pool
].set_pgp_num_target(poolbase
<< pgp_bits
);
4580 pools
[pool
].last_change
= epoch
;
4581 pools
[pool
].application_metadata
.insert(
4582 {pg_pool_t::APPLICATION_NAME_RBD
, {}});
4583 if (auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
4584 cct
->_conf
.get_val
<string
>("osd_pool_default_pg_autoscale_mode"));
4585 m
!= pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
4586 pools
[pool
].pg_autoscale_mode
= m
;
4588 pools
[pool
].pg_autoscale_mode
= pg_pool_t::pg_autoscale_mode_t::OFF
;
4590 pool_name
[pool
] = plname
;
4591 name_pool
[plname
] = pool
;
4595 map
<string
,string
> profile_map
;
4596 r
= get_erasure_code_profile_default(cct
, profile_map
, &ss
);
4598 lderr(cct
) << ss
.str() << dendl
;
4601 set_erasure_code_profile("default", profile_map
);
4605 int OSDMap::get_erasure_code_profile_default(CephContext
*cct
,
4606 map
<string
,string
> &profile_map
,
4609 int r
= get_json_str_map(cct
->_conf
.get_val
<string
>("osd_pool_default_erasure_code_profile"),
4615 int OSDMap::_build_crush_types(CrushWrapper
& crush
)
4617 crush
.set_type_name(0, "osd");
4618 crush
.set_type_name(1, "host");
4619 crush
.set_type_name(2, "chassis");
4620 crush
.set_type_name(3, "rack");
4621 crush
.set_type_name(4, "row");
4622 crush
.set_type_name(5, "pdu");
4623 crush
.set_type_name(6, "pod");
4624 crush
.set_type_name(7, "room");
4625 crush
.set_type_name(8, "datacenter");
4626 crush
.set_type_name(9, "zone");
4627 crush
.set_type_name(10, "region");
4628 crush
.set_type_name(11, "root");
4632 int OSDMap::build_simple_crush_map(CephContext
*cct
, CrushWrapper
& crush
,
4633 int nosd
, ostream
*ss
)
4638 int root_type
= _build_crush_types(crush
);
4640 int r
= crush
.add_bucket(0, 0, CRUSH_HASH_DEFAULT
,
4641 root_type
, 0, NULL
, NULL
, &rootid
);
4642 ceph_assert(r
== 0);
4643 crush
.set_item_name(rootid
, "default");
4645 map
<string
,string
> loc
{
4646 {"host", "localhost"},
4647 {"rack", "localrack"},
4650 for (int o
=0; o
<nosd
; o
++) {
4651 ldout(cct
, 10) << " adding osd." << o
<< " at " << loc
<< dendl
;
4653 snprintf(name
, sizeof(name
), "osd.%d", o
);
4654 crush
.insert_item(cct
, o
, 1.0, name
, loc
);
4657 build_simple_crush_rules(cct
, crush
, "default", ss
);
4664 int OSDMap::build_simple_crush_map_from_conf(CephContext
*cct
,
4665 CrushWrapper
& crush
,
4668 const auto& conf
= cct
->_conf
;
4673 int root_type
= _build_crush_types(crush
);
4675 int r
= crush
.add_bucket(0, 0,
4677 root_type
, 0, NULL
, NULL
, &rootid
);
4678 ceph_assert(r
== 0);
4679 crush
.set_item_name(rootid
, "default");
4682 vector
<string
> sections
;
4683 conf
.get_all_sections(sections
);
4685 for (auto §ion
: sections
) {
4686 if (section
.find("osd.") != 0)
4689 const char *begin
= section
.c_str() + 4;
4690 char *end
= (char*)begin
;
4691 int o
= strtol(begin
, &end
, 10);
4695 string host
, rack
, row
, room
, dc
, pool
;
4696 vector
<string
> sectiontmp
;
4697 sectiontmp
.push_back("osd");
4698 sectiontmp
.push_back(section
);
4699 conf
.get_val_from_conf_file(sectiontmp
, "host", host
, false);
4700 conf
.get_val_from_conf_file(sectiontmp
, "rack", rack
, false);
4701 conf
.get_val_from_conf_file(sectiontmp
, "row", row
, false);
4702 conf
.get_val_from_conf_file(sectiontmp
, "room", room
, false);
4703 conf
.get_val_from_conf_file(sectiontmp
, "datacenter", dc
, false);
4704 conf
.get_val_from_conf_file(sectiontmp
, "root", pool
, false);
4706 if (host
.length() == 0)
4707 host
= "unknownhost";
4708 if (rack
.length() == 0)
4709 rack
= "unknownrack";
4711 map
<string
,string
> loc
;
4719 loc
["datacenter"] = dc
;
4720 loc
["root"] = "default";
4722 ldout(cct
, 5) << " adding osd." << o
<< " at " << loc
<< dendl
;
4723 crush
.insert_item(cct
, o
, 1.0, section
, loc
);
4726 build_simple_crush_rules(cct
, crush
, "default", ss
);
4734 int OSDMap::build_simple_crush_rules(
4736 CrushWrapper
& crush
,
4740 int crush_rule
= crush
.get_osd_pool_default_crush_replicated_rule(cct
);
4741 string failure_domain
=
4742 crush
.get_type_name(cct
->_conf
->osd_crush_chooseleaf_type
);
4745 r
= crush
.add_simple_rule_at(
4746 "replicated_rule", root
, failure_domain
, "",
4747 "firstn", pg_pool_t::TYPE_REPLICATED
,
4751 // do not add an erasure rule by default or else we will implicitly
4752 // require the crush_v2 feature of clients
4756 int OSDMap::summarize_mapping_stats(
4758 const set
<int64_t> *pools
,
4766 for (auto &p
: get_pools())
4770 unsigned total_pg
= 0;
4771 unsigned moved_pg
= 0;
4772 vector
<unsigned> base_by_osd(get_max_osd(), 0);
4773 vector
<unsigned> new_by_osd(get_max_osd(), 0);
4774 for (int64_t pool_id
: ls
) {
4775 const pg_pool_t
*pi
= get_pg_pool(pool_id
);
4776 vector
<int> up
, up2
;
4778 for (unsigned ps
= 0; ps
< pi
->get_pg_num(); ++ps
) {
4779 pg_t
pgid(ps
, pool_id
);
4780 total_pg
+= pi
->get_size();
4781 pg_to_up_acting_osds(pgid
, &up
, &up_primary
, nullptr, nullptr);
4782 for (int osd
: up
) {
4783 if (osd
>= 0 && osd
< get_max_osd())
4787 newmap
->pg_to_up_acting_osds(pgid
, &up2
, &up_primary
, nullptr, nullptr);
4788 for (int osd
: up2
) {
4789 if (osd
>= 0 && osd
< get_max_osd())
4792 if (pi
->is_erasure()) {
4793 for (unsigned i
=0; i
<up
.size(); ++i
) {
4794 if (up
[i
] != up2
[i
]) {
4798 } else if (pi
->is_replicated()) {
4799 for (int osd
: up
) {
4800 if (std::find(up2
.begin(), up2
.end(), osd
) == up2
.end()) {
4805 ceph_abort_msg("unhandled pool type");
4811 unsigned num_up_in
= 0;
4812 for (int osd
= 0; osd
< get_max_osd(); ++osd
) {
4813 if (is_up(osd
) && is_in(osd
))
4820 float avg_pg
= (float)total_pg
/ (float)num_up_in
;
4821 float base_stddev
= 0, new_stddev
= 0;
4822 int min
= -1, max
= -1;
4823 unsigned min_base_pg
= 0, max_base_pg
= 0;
4824 unsigned min_new_pg
= 0, max_new_pg
= 0;
4825 for (int osd
= 0; osd
< get_max_osd(); ++osd
) {
4826 if (is_up(osd
) && is_in(osd
)) {
4827 float base_diff
= (float)base_by_osd
[osd
] - avg_pg
;
4828 base_stddev
+= base_diff
* base_diff
;
4829 float new_diff
= (float)new_by_osd
[osd
] - avg_pg
;
4830 new_stddev
+= new_diff
* new_diff
;
4831 if (min
< 0 || base_by_osd
[osd
] < min_base_pg
) {
4833 min_base_pg
= base_by_osd
[osd
];
4834 min_new_pg
= new_by_osd
[osd
];
4836 if (max
< 0 || base_by_osd
[osd
] > max_base_pg
) {
4838 max_base_pg
= base_by_osd
[osd
];
4839 max_new_pg
= new_by_osd
[osd
];
4843 base_stddev
= sqrt(base_stddev
/ num_up_in
);
4844 new_stddev
= sqrt(new_stddev
/ num_up_in
);
4846 float edev
= sqrt(avg_pg
* (1.0 - (1.0 / (double)num_up_in
)));
4850 f
->open_object_section("utilization");
4853 f
->dump_unsigned("moved_pgs", moved_pg
);
4854 f
->dump_unsigned("total_pgs", total_pg
);
4858 percent
= (float)moved_pg
* 100.0 / (float)total_pg
;
4859 ss
<< "moved " << moved_pg
<< " / " << total_pg
4860 << " (" << percent
<< "%)\n";
4864 f
->dump_float("avg_pgs", avg_pg
);
4865 f
->dump_float("std_dev", base_stddev
);
4866 f
->dump_float("expected_baseline_std_dev", edev
);
4868 f
->dump_float("new_std_dev", new_stddev
);
4870 ss
<< "avg " << avg_pg
<< "\n";
4871 ss
<< "stddev " << base_stddev
;
4873 ss
<< " -> " << new_stddev
;
4874 ss
<< " (expected baseline " << edev
<< ")\n";
4878 f
->dump_unsigned("min_osd", min
);
4879 f
->dump_unsigned("min_osd_pgs", min_base_pg
);
4881 f
->dump_unsigned("new_min_osd_pgs", min_new_pg
);
4883 ss
<< "min osd." << min
<< " with " << min_base_pg
;
4885 ss
<< " -> " << min_new_pg
;
4886 ss
<< " pgs (" << (float)min_base_pg
/ avg_pg
;
4888 ss
<< " -> " << (float)min_new_pg
/ avg_pg
;
4894 f
->dump_unsigned("max_osd", max
);
4895 f
->dump_unsigned("max_osd_pgs", max_base_pg
);
4897 f
->dump_unsigned("new_max_osd_pgs", max_new_pg
);
4899 ss
<< "max osd." << max
<< " with " << max_base_pg
;
4901 ss
<< " -> " << max_new_pg
;
4902 ss
<< " pgs (" << (float)max_base_pg
/ avg_pg
;
4904 ss
<< " -> " << (float)max_new_pg
/ avg_pg
;
4915 bool OSDMap::try_pg_upmap(
4917 pg_t pg
, ///< pg to potentially remap
4918 const set
<int>& overfull
, ///< osds we'd want to evacuate
4919 const vector
<int>& underfull
, ///< osds to move to, in order of preference
4920 const vector
<int>& more_underfull
, ///< more osds only slightly underfull
4922 vector
<int> *out
) ///< resulting alternative mapping
4924 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
4927 int rule
= pool
->get_crush_rule();
4931 // make sure there is something there to remap
4933 for (auto osd
: *orig
) {
4934 if (overfull
.count(osd
)) {
4943 int r
= crush
->try_remap_rule(
4947 overfull
, underfull
,
4959 int OSDMap::balance_primaries(
4962 OSDMap::Incremental
*pending_inc
,
4963 OSDMap
& tmp_osd_map
) const
4965 // This function only handles replicated pools.
4966 const pg_pool_t
* pool
= get_pg_pool(pid
);
4967 if (! pool
->is_replicated()) {
4968 ldout(cct
, 10) << __func__
<< " skipping erasure pool "
4969 << get_pool_name(pid
) << dendl
;
4973 // Info to be used in verify_upmap
4974 int pool_size
= pool
->get_size();
4975 int crush_rule
= pool
->get_crush_rule();
4977 // Get pgs by osd (map of osd -> pgs)
4978 // Get primaries by osd (map of osd -> primary)
4979 map
<uint64_t,set
<pg_t
>> pgs_by_osd
;
4980 map
<uint64_t,set
<pg_t
>> prim_pgs_by_osd
;
4981 map
<uint64_t,set
<pg_t
>> acting_prims_by_osd
;
4982 pgs_by_osd
= tmp_osd_map
.get_pgs_by_osd(cct
, pid
, &prim_pgs_by_osd
, &acting_prims_by_osd
);
4984 // Transfer pgs into a map, `pgs_to_check`. This will tell us the total num_changes after all
4985 // calculations have been finalized.
4986 // Transfer osds into a set, `osds_to_check`.
4987 // This is to avoid poor runtime when we loop through the pgs and to set up
4988 // our call to calc_desired_primary_distribution.
4989 map
<pg_t
,bool> prim_pgs_to_check
;
4990 vector
<uint64_t> osds_to_check
;
4991 for (const auto & [osd
, pgs
] : prim_pgs_by_osd
) {
4992 osds_to_check
.push_back(osd
);
4993 for (const auto & pg
: pgs
) {
4994 prim_pgs_to_check
.insert({pg
, false});
4998 // calculate desired primary distribution for each osd
4999 map
<uint64_t,float> desired_prim_dist
;
5001 rc
= calc_desired_primary_distribution(cct
, pid
, osds_to_check
, desired_prim_dist
);
5003 ldout(cct
, 10) << __func__
<< " Error in calculating desired primary distribution" << dendl
;
5006 map
<uint64_t,float> prim_dist_scores
;
5009 for (auto osd
: osds_to_check
) {
5010 actual
= prim_pgs_by_osd
[osd
].size();
5011 desired
= desired_prim_dist
[osd
];
5012 prim_dist_scores
[osd
] = actual
- desired
;
5013 ldout(cct
, 10) << __func__
<< " desired distribution for osd." << osd
<< " " << desired
<< dendl
;
5016 // get read balance score before balancing
5017 float read_balance_score_before
= 0.0;
5018 read_balance_info_t rb_info
;
5019 rc
= tmp_osd_map
.calc_read_balance_score(cct
, pid
, &rb_info
);
5021 read_balance_score_before
= rb_info
.adjusted_score
;
5023 if (rb_info
.err_msg
.length() > 0) {
5024 ldout(cct
, 10) << __func__
<< (rc
< 0 ? " ERROR: " : " Warning: ") << rb_info
.err_msg
<< dendl
;
5028 // get ready to swap pgs
5030 int curr_num_changes
= 0;
5031 vector
<int> up_osds
;
5032 vector
<int> acting_osds
;
5033 int up_primary
, acting_primary
;
5034 for (const auto & [pg
, mapped
] : prim_pgs_to_check
) {
5035 // fill in the up, up primary, acting, and acting primary for the current PG
5036 tmp_osd_map
.pg_to_up_acting_osds(pg
, &up_osds
, &up_primary
,
5037 &acting_osds
, &acting_primary
);
5039 // find the OSD that would make the best swap based on its score
5040 // We start by first testing the OSD that is currently primary for the PG we are checking.
5041 uint64_t curr_best_osd
= up_primary
;
5042 float prim_score
= prim_dist_scores
[up_primary
];
5043 for (auto potential_osd
: up_osds
) {
5044 float potential_score
= prim_dist_scores
[potential_osd
];
5045 if ((prim_score
> 0) && // taking 1 pg from the prim would not make its score worse
5046 (potential_score
< 0) && // adding 1 pg to the potential would not make its score worse
5047 ((prim_score
- potential_score
) > 1) && // swapping a pg would not just keep the scores the same
5048 (desired_prim_dist
[potential_osd
] > 0)) // the potential is not off limits (the primary affinity is above 0)
5050 curr_best_osd
= potential_osd
;
5054 // Make the swap only if:
5055 // 1. The swap is legal
5056 // 2. The balancer has chosen a new primary
5057 auto legal_swap
= crush
->verify_upmap(cct
,
5060 {(int)curr_best_osd
});
5061 if (legal_swap
>= 0 &&
5062 ((int)curr_best_osd
!= up_primary
)) {
5063 // Update prim_dist_scores
5064 prim_dist_scores
[curr_best_osd
] += 1;
5065 prim_dist_scores
[up_primary
] -= 1;
5067 // Update the mappings
5068 pending_inc
->new_pg_upmap_primary
[pg
] = curr_best_osd
;
5069 tmp_osd_map
.pg_upmap_primaries
[pg
] = curr_best_osd
;
5070 prim_pgs_to_check
[pg
] = true; // mark that this pg changed mappings
5074 ldout(cct
, 20) << __func__
<< " curr_num_changes: " << curr_num_changes
<< dendl
;
5076 // If there are no changes after one pass through the pgs, then no further optimizations can be made.
5077 if (curr_num_changes
== 0) {
5078 ldout(cct
, 20) << __func__
<< " curr_num_changes is 0; no further optimizations can be made." << dendl
;
5083 // get read balance score after balancing
5084 float read_balance_score_after
= 0.0;
5085 rc
= tmp_osd_map
.calc_read_balance_score(cct
, pid
, &rb_info
);
5087 read_balance_score_after
= rb_info
.adjusted_score
;
5089 if (rb_info
.err_msg
.length() > 0) {
5090 ldout(cct
, 10) << __func__
<< (rc
< 0 ? " ERROR: " : " Warning: ") << rb_info
.err_msg
<< dendl
;
5094 // Tally total number of changes
5095 int num_changes
= 0;
5096 if (read_balance_score_after
< read_balance_score_before
) {
5097 for (auto [pg
, mapped
] : prim_pgs_to_check
) {
5104 ldout(cct
, 10) << __func__
<< " num_changes " << num_changes
<< dendl
;
5108 int OSDMap::calc_desired_primary_distribution(
5111 const vector
<uint64_t> &osds
,
5112 std::map
<uint64_t, float>& desired_primary_distribution
) const
5114 // will return a perfect distribution of floats
5115 // without calculating the floor of each value
5117 // This function only handles replicated pools.
5118 const pg_pool_t
* pool
= get_pg_pool(pid
);
5119 if (pool
->is_replicated()) {
5120 ldout(cct
, 20) << __func__
<< " calculating distribution for replicated pool "
5121 << get_pool_name(pid
) << dendl
;
5122 uint64_t replica_count
= pool
->get_size();
5124 map
<uint64_t,set
<pg_t
>> pgs_by_osd
;
5125 pgs_by_osd
= get_pgs_by_osd(cct
, pid
);
5127 // First calculate the distribution using primary affinity and tally up the sum
5128 auto distribution_sum
= 0.0;
5129 for (const auto & osd
: osds
) {
5130 float osd_primary_count
= ((float)pgs_by_osd
[osd
].size() / (float)replica_count
) * get_primary_affinityf(osd
);
5131 desired_primary_distribution
.insert({osd
, osd_primary_count
});
5132 distribution_sum
+= osd_primary_count
;
5134 if (distribution_sum
<= 0) {
5135 ldout(cct
, 10) << __func__
<< " Unable to calculate primary distribution, likely because primary affinity is"
5136 << " set to 0 on all OSDs." << dendl
;
5140 // Then, stretch the value (necessary when primary affinity is smaller than 1)
5141 float factor
= (float)pool
->get_pg_num() / (float)distribution_sum
;
5142 float distribution_sum_desired
= 0.0;
5144 ceph_assert(factor
>= 1.0);
5145 for (const auto & [osd
, osd_primary_count
] : desired_primary_distribution
) {
5146 desired_primary_distribution
[osd
] *= factor
;
5147 distribution_sum_desired
+= desired_primary_distribution
[osd
];
5149 ceph_assert(fabs(distribution_sum_desired
- pool
->get_pg_num()) < 0.01);
5151 ldout(cct
, 10) << __func__
<<" skipping erasure pool "
5152 << get_pool_name(pid
) << dendl
;
5159 int OSDMap::calc_pg_upmaps(
5161 uint32_t max_deviation
,
5163 const set
<int64_t>& only_pools
,
5164 OSDMap::Incremental
*pending_inc
,
5165 std::random_device::result_type
*p_seed
)
5167 ldout(cct
, 10) << __func__
<< " pools " << only_pools
<< dendl
;
5169 // Can't be less than 1 pg
5170 if (max_deviation
< 1)
5172 tmp_osd_map
.deepish_copy_from(*this);
5173 int num_changed
= 0;
5174 map
<int,set
<pg_t
>> pgs_by_osd
;
5176 float osd_weight_total
= 0;
5177 map
<int,float> osd_weight
;
5180 lderr(cct
) << __func__
<< " abort due to max <= 0" << dendl
;
5184 osd_weight_total
= build_pool_pgs_info(cct
, only_pools
, tmp_osd_map
,
5185 total_pgs
, pgs_by_osd
, osd_weight
);
5186 if (osd_weight_total
== 0) {
5187 lderr(cct
) << __func__
<< " abort due to osd_weight_total == 0" << dendl
;
5191 float pgs_per_weight
= total_pgs
/ osd_weight_total
;
5192 ldout(cct
, 10) << " osd_weight_total " << osd_weight_total
<< dendl
;
5193 ldout(cct
, 10) << " pgs_per_weight " << pgs_per_weight
<< dendl
;
5196 map
<int,float> osd_deviation
; // osd, deviation(pgs)
5197 multimap
<float,int> deviation_osd
; // deviation(pgs), osd
5198 float cur_max_deviation
= calc_deviations(cct
, pgs_by_osd
, osd_weight
, pgs_per_weight
,
5199 osd_deviation
, deviation_osd
, stddev
);
5201 ldout(cct
, 20) << " stdev " << stddev
<< " max_deviation " << cur_max_deviation
<< dendl
;
5202 if (cur_max_deviation
<= max_deviation
) {
5203 ldout(cct
, 10) << __func__
<< " distribution is almost perfect"
5208 bool skip_overfull
= false;
5210 cct
->_conf
.get_val
<bool>("osd_calc_pg_upmaps_aggressively");
5211 auto fast_aggressive
= aggressive
&&
5212 cct
->_conf
.get_val
<bool>("osd_calc_pg_upmaps_aggressively_fast");
5213 auto local_fallback_retries
=
5214 cct
->_conf
.get_val
<uint64_t>("osd_calc_pg_upmaps_local_fallback_retries");
5217 ldout(cct
, 30) << "Top of loop #" << max
+1 << dendl
;
5218 // build overfull and underfull
5220 set
<int> more_overfull
;
5221 bool using_more_overfull
= false;
5222 vector
<int> underfull
;
5223 vector
<int> more_underfull
;
5224 fill_overfull_underfull(cct
, deviation_osd
, max_deviation
,
5225 overfull
, more_overfull
,
5226 underfull
, more_underfull
);
5228 if (underfull
.empty() && overfull
.empty()) {
5229 ldout(cct
, 20) << __func__
<< " failed to build overfull and underfull" << dendl
;
5232 if (overfull
.empty() && !underfull
.empty()) {
5233 ldout(cct
, 20) << __func__
<< " Using more_overfull since we still have underfull" << dendl
;
5234 overfull
= more_overfull
;
5235 using_more_overfull
= true;
5238 ldout(cct
, 10) << " overfull " << overfull
5239 << " underfull " << underfull
5242 uint64_t local_fallback_retried
= 0;
5244 // Used to prevent some of the unsuccessful loop iterations (save runtime)
5245 // If we can't find a change per OSD we skip further iterations for this OSD
5246 uint n_changes
= 0, prev_n_changes
= 0;
5247 set
<int> osd_to_skip
;
5252 map
<pg_t
, mempool::osdmap::vector
<pair
<int32_t,int32_t>>> to_upmap
;
5253 auto temp_pgs_by_osd
= pgs_by_osd
;
5254 // always start with fullest, break if we find any changes to make
5255 for (auto p
= deviation_osd
.rbegin(); p
!= deviation_osd
.rend(); ++p
) {
5256 if (skip_overfull
&& !underfull
.empty()) {
5257 ldout(cct
, 10) << " skipping overfull " << dendl
;
5258 break; // fall through to check underfull
5260 int osd
= p
->second
;
5261 float deviation
= p
->first
;
5262 if (fast_aggressive
&& osd_to_skip
.count(osd
)) {
5263 ldout(cct
, 20) << " Fast aggressive mode: skipping osd " << osd
5264 << " osd_to_skip size = " << osd_to_skip
.size() << dendl
;
5268 if (deviation
< 0) {
5269 ldout(cct
, 10) << " hitting underfull osds now"
5270 << " when trying to remap overfull osds"
5274 float target
= osd_weight
[osd
] * pgs_per_weight
;
5275 ldout(cct
, 10) << " Overfull search osd." << osd
5276 << " target " << target
5277 << " deviation " << deviation
5279 ceph_assert(target
> 0);
5280 if (!using_more_overfull
&& deviation
<= max_deviation
) {
5281 ldout(cct
, 10) << " osd." << osd
5282 << " target " << target
5283 << " deviation " << deviation
5284 << " < max deviation " << max_deviation
5290 pgs
.reserve(pgs_by_osd
[osd
].size());
5291 for (auto& pg
: pgs_by_osd
[osd
]) {
5292 if (to_skip
.count(pg
))
5297 // shuffle PG list so they all get equal (in)attention
5298 std::shuffle(pgs
.begin(), pgs
.end(), get_random_engine(cct
, p_seed
));
5300 // look for remaps we can un-remap
5301 if (try_drop_remap_overfull(cct
, pgs
, tmp_osd_map
, osd
,
5302 temp_pgs_by_osd
, to_unmap
, to_upmap
))
5306 for (auto pg
: pgs
) {
5307 auto temp_it
= tmp_osd_map
.pg_upmap
.find(pg
);
5308 if (temp_it
!= tmp_osd_map
.pg_upmap
.end()) {
5309 // leave pg_upmap alone
5310 // it must be specified by admin since balancer does not
5311 // support pg_upmap yet
5312 ldout(cct
, 10) << " " << pg
<< " already has pg_upmap "
5313 << temp_it
->second
<< ", skipping"
5317 auto pg_pool_size
= tmp_osd_map
.get_pg_pool_size(pg
);
5318 mempool::osdmap::vector
<pair
<int32_t,int32_t>> new_upmap_items
;
5320 auto it
= tmp_osd_map
.pg_upmap_items
.find(pg
);
5321 if (it
!= tmp_osd_map
.pg_upmap_items
.end()) {
5322 auto& um_items
= it
->second
;
5323 if (um_items
.size() >= (size_t)pg_pool_size
) {
5324 ldout(cct
, 10) << " " << pg
<< " already has full-size pg_upmap_items "
5325 << um_items
<< ", skipping"
5329 ldout(cct
, 10) << " " << pg
<< " already has pg_upmap_items "
5332 new_upmap_items
= um_items
;
5333 // build existing too (for dedup)
5334 for (auto [um_from
, um_to
] : um_items
) {
5335 existing
.insert(um_from
);
5336 existing
.insert(um_to
);
5340 // to see if we can append more remapping pairs
5342 ldout(cct
, 10) << " trying " << pg
<< dendl
;
5343 vector
<int> raw
, orig
, out
;
5344 tmp_osd_map
.pg_to_raw_upmap(pg
, &raw
, &orig
); // including existing upmaps too
5345 if (!try_pg_upmap(cct
, pg
, overfull
, underfull
, more_underfull
, &orig
, &out
)) {
5348 ldout(cct
, 10) << " " << pg
<< " " << orig
<< " -> " << out
<< dendl
;
5349 if (orig
.size() != out
.size()) {
5352 ceph_assert(orig
!= out
);
5353 int pos
= find_best_remap(cct
, orig
, out
, existing
, osd_deviation
);
5355 // append new remapping pairs slowly
5356 // This way we can make sure that each tiny change will
5357 // definitely make distribution of PGs converging to
5358 // the perfect status.
5359 add_remap_pair(cct
, orig
[pos
], out
[pos
], pg
, (size_t)pg_pool_size
,
5360 osd
, existing
, temp_pgs_by_osd
,
5361 new_upmap_items
, to_upmap
);
5365 if (fast_aggressive
) {
5366 if (prev_n_changes
== n_changes
) { // no changes for prev OSD
5367 osd_to_skip
.insert(osd
);
5370 prev_n_changes
= n_changes
;
5376 ceph_assert(!(to_unmap
.size() || to_upmap
.size()));
5377 ldout(cct
, 10) << " failed to find any changes for overfull osds"
5379 for (auto& [deviation
, osd
] : deviation_osd
) {
5380 if (std::find(underfull
.begin(), underfull
.end(), osd
) ==
5383 float target
= osd_weight
[osd
] * pgs_per_weight
;
5384 ceph_assert(target
> 0);
5385 if (fabsf(deviation
) < max_deviation
) {
5386 // respect max_deviation too
5387 ldout(cct
, 10) << " osd." << osd
5388 << " target " << target
5389 << " deviation " << deviation
5390 << " -> absolute " << fabsf(deviation
)
5391 << " < max " << max_deviation
5395 // look for remaps we can un-remap
5396 candidates_t candidates
= build_candidates(cct
, tmp_osd_map
, to_skip
,
5397 only_pools
, aggressive
, p_seed
);
5398 if (try_drop_remap_underfull(cct
, candidates
, osd
, temp_pgs_by_osd
,
5399 to_unmap
, to_upmap
)) {
5404 ceph_assert(!(to_unmap
.size() || to_upmap
.size()));
5405 ldout(cct
, 10) << " failed to find any changes for underfull osds"
5408 ldout(cct
, 10) << " break due to aggressive mode not enabled" << dendl
;
5410 } else if (!skip_overfull
) {
5411 // safe to quit because below here we know
5412 // we've done checking both overfull and underfull osds..
5413 ldout(cct
, 10) << " break due to not being able to find any"
5414 << " further optimizations"
5418 // restart with fullest and do exhaustive searching
5419 skip_overfull
= false;
5424 // test change, apply if change is good
5425 ceph_assert(to_unmap
.size() || to_upmap
.size());
5426 float new_stddev
= 0;
5427 map
<int,float> temp_osd_deviation
;
5428 multimap
<float,int> temp_deviation_osd
;
5429 float cur_max_deviation
= calc_deviations(cct
, temp_pgs_by_osd
, osd_weight
,
5430 pgs_per_weight
, temp_osd_deviation
,
5431 temp_deviation_osd
, new_stddev
);
5432 ldout(cct
, 10) << " stddev " << stddev
<< " -> " << new_stddev
<< dendl
;
5433 if (new_stddev
>= stddev
) {
5435 ldout(cct
, 10) << " break because stddev is not decreasing"
5436 << " and aggressive mode is not enabled"
5440 local_fallback_retried
++;
5441 if (local_fallback_retried
>= local_fallback_retries
) {
5442 // does not make progress
5443 // flip *skip_overfull* so both overfull and underfull
5444 // get equal (in)attention
5445 skip_overfull
= !skip_overfull
;
5446 ldout(cct
, 10) << " hit local_fallback_retries "
5447 << local_fallback_retries
5451 for (auto& i
: to_unmap
)
5453 for (auto& i
: to_upmap
)
5454 to_skip
.insert(i
.first
);
5455 ldout(cct
, 20) << " local_fallback_retried " << local_fallback_retried
5456 << " to_skip " << to_skip
5462 ceph_assert(new_stddev
< stddev
);
5463 stddev
= new_stddev
;
5464 pgs_by_osd
= temp_pgs_by_osd
;
5465 osd_deviation
= temp_osd_deviation
;
5466 deviation_osd
= temp_deviation_osd
;
5470 num_changed
+= pack_upmap_results(cct
, to_unmap
, to_upmap
, tmp_osd_map
, pending_inc
);
5472 ldout(cct
, 20) << " stdev " << stddev
<< " max_deviation " << cur_max_deviation
<< dendl
;
5473 if (cur_max_deviation
<= max_deviation
) {
5474 ldout(cct
, 10) << __func__
<< " Optimization plan is almost perfect"
5479 ldout(cct
, 10) << " num_changed = " << num_changed
<< dendl
;
5483 map
<uint64_t,set
<pg_t
>> OSDMap::get_pgs_by_osd(
5486 map
<uint64_t, set
<pg_t
>> *p_primaries_by_osd
,
5487 map
<uint64_t, set
<pg_t
>> *p_acting_primaries_by_osd
) const
5489 // Set up the OSDMap
5491 tmp_osd_map
.deepish_copy_from(*this);
5493 // Get the pool from the provided pool id
5494 const pg_pool_t
* pool
= get_pg_pool(pid
);
5496 // build array of pgs from the pool
5497 map
<uint64_t,set
<pg_t
>> pgs_by_osd
;
5498 for (unsigned ps
= 0; ps
< pool
->get_pg_num(); ++ps
) {
5503 tmp_osd_map
.pg_to_up_acting_osds(pg
, &up
, &primary
, nullptr, &acting_prim
);
5505 ldout(cct
, 20) << __func__
<< " " << pg
5507 << " primary " << primary
5508 << " acting_primary " << acting_prim
5511 if (!up
.empty()) { // up can be empty is test generated files
5512 // in this case, we return empty result
5513 for (auto osd
: up
) {
5514 if (osd
!= CRUSH_ITEM_NONE
)
5515 pgs_by_osd
[osd
].insert(pg
);
5517 if (p_primaries_by_osd
!= nullptr) {
5518 if (primary
!= CRUSH_ITEM_NONE
)
5519 (*p_primaries_by_osd
)[primary
].insert(pg
);
5521 if (p_acting_primaries_by_osd
!= nullptr) {
5522 if (acting_prim
!= CRUSH_ITEM_NONE
)
5523 (*p_acting_primaries_by_osd
)[acting_prim
].insert(pg
);
5530 float OSDMap::get_osds_weight(
5532 const OSDMap
& tmp_osd_map
,
5534 map
<int,float>& osds_weight
) const
5536 map
<int,float> pmap
;
5537 ceph_assert(pools
.count(pid
));
5538 int ruleno
= pools
.at(pid
).get_crush_rule();
5539 tmp_osd_map
.crush
->get_rule_weight_osd_map(ruleno
, &pmap
);
5540 ldout(cct
,20) << __func__
<< " pool " << pid
5541 << " ruleno " << ruleno
5542 << " weight-map " << pmap
5544 float osds_weight_total
= 0;
5545 for (auto [oid
, oweight
] : pmap
) {
5546 auto adjusted_weight
= tmp_osd_map
.get_weightf(oid
) * oweight
;
5547 if (adjusted_weight
!= 0) {
5548 osds_weight
[oid
] += adjusted_weight
;
5549 osds_weight_total
+= adjusted_weight
;
5552 return osds_weight_total
;
5555 float OSDMap::build_pool_pgs_info (
5557 const std::set
<int64_t>& only_pools
, ///< [optional] restrict to pool
5558 const OSDMap
& tmp_osd_map
,
5560 map
<int,set
<pg_t
>>& pgs_by_osd
,
5561 map
<int,float>& osds_weight
)
5564 // This function builds some data structures that are used by calc_pg_upmaps.
5565 // Specifically it builds pgs_by_osd and osd_weight maps, updates total_pgs
5566 // and returns the osd_weight_total
5568 float osds_weight_total
= 0.0;
5569 for (auto& [pid
, pdata
] : pools
) {
5570 if (!only_pools
.empty() && !only_pools
.count(pid
))
5572 for (unsigned ps
= 0; ps
< pdata
.get_pg_num(); ++ps
) {
5575 tmp_osd_map
.pg_to_up_acting_osds(pg
, &up
, nullptr, nullptr, nullptr);
5576 ldout(cct
, 20) << __func__
<< " " << pg
<< " up " << up
<< dendl
;
5577 for (auto osd
: up
) {
5578 if (osd
!= CRUSH_ITEM_NONE
)
5579 pgs_by_osd
[osd
].insert(pg
);
5582 total_pgs
+= pdata
.get_size() * pdata
.get_pg_num();
5584 osds_weight_total
= get_osds_weight(cct
, tmp_osd_map
, pid
, osds_weight
);
5586 for (auto& [oid
, oweight
] : osds_weight
) {
5588 auto p
= pgs_by_osd
.find(oid
);
5589 if (p
!= pgs_by_osd
.end())
5590 pgs
= p
->second
.size();
5592 pgs_by_osd
.emplace(oid
, set
<pg_t
>());
5593 ldout(cct
, 20) << " osd." << oid
<< " weight " << oweight
5594 << " pgs " << pgs
<< dendl
;
5596 return osds_weight_total
;
5598 } // return total weight of all OSDs
5600 float OSDMap::calc_deviations (
5602 const map
<int,set
<pg_t
>>& pgs_by_osd
,
5603 const map
<int,float>& osd_weight
,
5604 float pgs_per_weight
,
5605 map
<int,float>& osd_deviation
,
5606 multimap
<float,int>& deviation_osd
,
5607 float& stddev
) // return current max deviation
5610 // This function calculates the 2 maps osd_deviation and deviation_osd which
5611 // hold the deviation between the current number of PGs which map to an OSD
5612 // and the optimal number. Ot also calculates the stddev of the deviations and
5613 // returns the current max deviation.
5614 // NOTE - the calculation is not exactly stddev it is actually sttdev^2 but as
5615 // long as it is monotonic with stddev (and it is), it is sufficient for
5616 // the balancer code.
5618 float cur_max_deviation
= 0.0;
5620 for (auto& [oid
, opgs
] : pgs_by_osd
) {
5621 // make sure osd is still there (belongs to this crush-tree)
5622 ceph_assert(osd_weight
.count(oid
));
5623 float target
= osd_weight
.at(oid
) * pgs_per_weight
;
5624 float deviation
= (float)opgs
.size() - target
;
5625 ldout(cct
, 20) << " osd." << oid
5626 << "\tpgs " << opgs
.size()
5627 << "\ttarget " << target
5628 << "\tdeviation " << deviation
5630 osd_deviation
[oid
] = deviation
;
5631 deviation_osd
.insert(make_pair(deviation
, oid
));
5632 stddev
+= deviation
* deviation
;
5633 if (fabsf(deviation
) > cur_max_deviation
)
5634 cur_max_deviation
= fabsf(deviation
);
5636 return cur_max_deviation
;
5639 void OSDMap::fill_overfull_underfull (
5641 const std::multimap
<float,int>& deviation_osd
,
5643 std::set
<int>& overfull
,
5644 std::set
<int>& more_overfull
,
5645 std::vector
<int>& underfull
,
5646 std::vector
<int>& more_underfull
)
5649 // This function just fills the overfull and underfull data structures for the
5650 // use of calc_pg_upmaps
5652 for (auto i
= deviation_osd
.rbegin(); i
!= deviation_osd
.rend(); i
++) {
5653 auto& odev
= i
->first
;
5654 auto& oid
= i
->second
;
5655 ldout(cct
, 30) << " check " << odev
<< " <= " << max_deviation
<< dendl
;
5658 if (odev
> max_deviation
) {
5659 ldout(cct
, 30) << " add overfull osd." << oid
<< dendl
;
5660 overfull
.insert(oid
);
5662 more_overfull
.insert(oid
);
5666 for (auto i
= deviation_osd
.begin(); i
!= deviation_osd
.end(); i
++) {
5667 auto& odev
= i
->first
;
5668 auto& oid
= i
->second
;
5669 ldout(cct
, 30) << " check " << odev
<< " >= " << -(int)max_deviation
<< dendl
;
5672 if (odev
< -(int)max_deviation
) {
5673 ldout(cct
, 30) << " add underfull osd." << oid
<< dendl
;
5674 underfull
.push_back(oid
);
5676 more_underfull
.push_back(oid
);
5681 int OSDMap::pack_upmap_results(
5683 const std::set
<pg_t
>& to_unmap
,
5684 const std::map
<pg_t
, mempool::osdmap::vector
<std::pair
<int, int>>>& to_upmap
,
5685 OSDMap
& tmp_osd_map
,
5686 OSDMap::Incremental
*pending_inc
)
5689 // This function takes the input from the local variables to_unmap and to_upmap
5690 // and updates tmp_osd_map (so that another iteration can run) and pending_inc
5691 // (so that the results are visible outside calc_pg_upmaps)
5693 int num_changed
= 0;
5694 for (auto& i
: to_unmap
) {
5695 ldout(cct
, 10) << " unmap pg " << i
<< dendl
;
5696 ceph_assert(tmp_osd_map
.pg_upmap_items
.count(i
));
5697 tmp_osd_map
.pg_upmap_items
.erase(i
);
5698 pending_inc
->old_pg_upmap_items
.insert(i
);
5701 for (auto& [pg
, um_items
] : to_upmap
) {
5702 ldout(cct
, 10) << " upmap pg " << pg
5703 << " new pg_upmap_items " << um_items
5705 tmp_osd_map
.pg_upmap_items
[pg
] = um_items
;
5706 pending_inc
->new_pg_upmap_items
[pg
] = um_items
;
5713 std::default_random_engine
OSDMap::get_random_engine(
5715 std::random_device::result_type
*p_seed
)
5718 // This function creates a random_engine to be used for shuffling.
5719 // When p_seed == nullptr it generates random engine with a seed from /dev/random
5720 // when p_seed is not null, it uses (*p_seed + seed_set) as the seed and
5721 // increments seed_set. This is used in order to craete regression test without
5722 // random effect on the results.
5724 static std::random_device::result_type seed_set
= 0;
5725 std::random_device::result_type seed
;
5726 if (p_seed
== nullptr) {
5727 std::random_device rd
;
5731 seed
= *p_seed
+ seed_set
;
5732 ldout(cct
, 30) << " Starting random engine with seed "
5736 return std::default_random_engine
{seed
};
5739 bool OSDMap::try_drop_remap_overfull(
5741 const std::vector
<pg_t
>& pgs
,
5742 const OSDMap
& tmp_osd_map
,
5744 map
<int,std::set
<pg_t
>>& temp_pgs_by_osd
,
5745 set
<pg_t
>& to_unmap
,
5746 map
<pg_t
, mempool::osdmap::vector
<pair
<int32_t,int32_t>>>& to_upmap
)
5749 // This function tries to drop existimg upmap items which map data to overfull
5750 // OSDs. It updates temp_pgs_by_osd, to_unmap and to_upmap and rerturns true
5751 // if it found an item that can be dropped, false if not.
5753 for (auto pg
: pgs
) {
5754 auto p
= tmp_osd_map
.pg_upmap_items
.find(pg
);
5755 if (p
== tmp_osd_map
.pg_upmap_items
.end())
5757 mempool::osdmap::vector
<pair
<int32_t,int32_t>> new_upmap_items
;
5758 auto& pg_upmap_items
= p
->second
;
5759 for (auto um_pair
: pg_upmap_items
) {
5760 auto& um_from
= um_pair
.first
;
5761 auto& um_to
= um_pair
.second
;
5763 ldout(cct
, 10) << " will try dropping existing"
5764 << " remapping pair "
5765 << um_from
<< " -> " << um_to
5766 << " which remapped " << pg
5767 << " into overfull osd." << osd
5769 temp_pgs_by_osd
[um_to
].erase(pg
);
5770 temp_pgs_by_osd
[um_from
].insert(pg
);
5772 new_upmap_items
.push_back(um_pair
);
5775 if (new_upmap_items
.empty()) {
5777 ldout(cct
, 10) << " existing pg_upmap_items " << pg_upmap_items
5778 << " remapped " << pg
<< " into overfull osd." << osd
5779 << ", will try cancelling it entirely"
5781 to_unmap
.insert(pg
);
5783 } else if (new_upmap_items
.size() != pg_upmap_items
.size()) {
5784 // drop single remapping pair, updating
5785 ceph_assert(new_upmap_items
.size() < pg_upmap_items
.size());
5786 ldout(cct
, 10) << " existing pg_upmap_items " << pg_upmap_items
5787 << " remapped " << pg
<< " into overfull osd." << osd
5788 << ", new_pg_upmap_items now " << new_upmap_items
5790 to_upmap
[pg
] = new_upmap_items
;
5797 bool OSDMap::try_drop_remap_underfull(
5799 const candidates_t
& candidates
,
5801 map
<int,std::set
<pg_t
>>& temp_pgs_by_osd
,
5802 set
<pg_t
>& to_unmap
,
5803 map
<pg_t
, mempool::osdmap::vector
<std::pair
<int32_t,int32_t>>>& to_upmap
)
5806 // This function tries to drop existimg upmap items which map data from underfull
5807 // OSDs. It updates temp_pgs_by_osd, to_unmap and to_upmap and rerturns true
5808 // if it found an item that can be dropped, false if not.
5810 for (auto& [pg
, um_pairs
] : candidates
) {
5811 mempool::osdmap::vector
<pair
<int32_t,int32_t>> new_upmap_items
;
5812 for (auto& ump
: um_pairs
) {
5813 auto& um_from
= ump
.first
;
5814 auto& um_to
= ump
.second
;
5815 if (um_from
== osd
) {
5816 ldout(cct
, 10) << " will try dropping existing"
5817 << " remapping pair "
5818 << um_from
<< " -> " << um_to
5819 << " which remapped " << pg
5820 << " out from underfull osd." << osd
5822 temp_pgs_by_osd
[um_to
].erase(pg
);
5823 temp_pgs_by_osd
[um_from
].insert(pg
);
5825 new_upmap_items
.push_back(ump
);
5828 if (new_upmap_items
.empty()) {
5830 ldout(cct
, 10) << " existing pg_upmap_items " << um_pairs
5831 << " remapped " << pg
5832 << " out from underfull osd." << osd
5833 << ", will try cancelling it entirely"
5835 to_unmap
.insert(pg
);
5837 } else if (new_upmap_items
.size() != um_pairs
.size()) {
5838 // drop single remapping pair, updating
5839 ceph_assert(new_upmap_items
.size() < um_pairs
.size());
5840 ldout(cct
, 10) << " existing pg_upmap_items " << um_pairs
5841 << " remapped " << pg
5842 << " out from underfull osd." << osd
5843 << ", new_pg_upmap_items now " << new_upmap_items
5845 to_upmap
[pg
] = new_upmap_items
;
5852 void OSDMap::add_remap_pair(
5857 size_t pg_pool_size
,
5860 map
<int,set
<pg_t
>>& temp_pgs_by_osd
,
5861 mempool::osdmap::vector
<pair
<int32_t,int32_t>> new_upmap_items
,
5862 map
<pg_t
, mempool::osdmap::vector
<pair
<int32_t,int32_t>>>& to_upmap
)
5865 // add a single remap pair (in pg <pg> remap osd from <orig> to <out>) to all
5866 // the relevant data structures
5868 ldout(cct
, 10) << " will try adding new remapping pair "
5869 << orig
<< " -> " << out
<< " for " << pg
5870 << (orig
!= osd
? " NOT selected osd" : "")
5872 existing
.insert(orig
);
5873 existing
.insert(out
);
5874 temp_pgs_by_osd
[orig
].erase(pg
);
5875 temp_pgs_by_osd
[out
].insert(pg
);
5876 ceph_assert(new_upmap_items
.size() < pg_pool_size
);
5877 new_upmap_items
.push_back(make_pair(orig
, out
));
5878 // append new remapping pairs slowly
5879 // This way we can make sure that each tiny change will
5880 // definitely make distribution of PGs converging to
5881 // the perfect status.
5882 to_upmap
[pg
] = new_upmap_items
;
5886 int OSDMap::find_best_remap (
5888 const vector
<int>& orig
,
5889 const vector
<int>& out
,
5890 const set
<int>& existing
,
5891 const map
<int,float> osd_deviation
)
5894 // Find the best remap from the suggestions in orig and out - the best remap
5895 // is the one which maps from the OSD with the largest deviatoion (from the
5896 // OSDs which are part of orig)
5900 for (unsigned i
= 0; i
< out
.size(); ++i
) {
5901 if (orig
[i
] == out
[i
])
5902 continue; // skip invalid remappings
5903 if (existing
.count(orig
[i
]) || existing
.count(out
[i
]))
5904 continue; // we want new remappings only!
5905 if (osd_deviation
.at(orig
[i
]) > max_dev
) {
5906 max_dev
= osd_deviation
.at(orig
[i
]);
5908 ldout(cct
, 30) << "Max osd." << orig
[i
] << " pos " << i
<< " dev " << osd_deviation
.at(orig
[i
]) << dendl
;
5914 OSDMap::candidates_t
OSDMap::build_candidates(
5916 const OSDMap
& tmp_osd_map
,
5917 const set
<pg_t
> to_skip
,
5918 const set
<int64_t>& only_pools
,
5920 std::random_device::result_type
*p_seed
)
5923 // build the candidates data structure
5925 candidates_t candidates
;
5926 candidates
.reserve(tmp_osd_map
.pg_upmap_items
.size());
5927 for (auto& [pg
, um_pair
] : tmp_osd_map
.pg_upmap_items
) {
5928 if (to_skip
.count(pg
))
5930 if (!only_pools
.empty() && !only_pools
.count(pg
.pool()))
5932 candidates
.push_back(make_pair(pg
, um_pair
));
5935 // shuffle candidates so they all get equal (in)attention
5936 std::shuffle(candidates
.begin(), candidates
.end(), get_random_engine(cct
, p_seed
));
5941 // return -1 if all PGs are OK, else the first PG which includes only zero PA OSDs
5942 int64_t OSDMap::has_zero_pa_pgs(CephContext
*cct
, int64_t pool_id
) const
5944 const pg_pool_t
* pool
= get_pg_pool(pool_id
);
5945 for (unsigned ps
= 0; ps
< pool
->get_pg_num(); ++ps
) {
5946 pg_t
pg(ps
, pool_id
);
5948 pg_to_up_acting_osds(pg
, nullptr, nullptr, &acting
, nullptr);
5949 if (cct
!= nullptr) {
5950 ldout(cct
, 30) << __func__
<< " " << pg
<< " acting " << acting
<< dendl
;
5952 bool pg_zero_pa
= true;
5953 for (auto osd
: acting
) {
5954 if (get_primary_affinityf(osd
) != 0) {
5960 if (cct
!= nullptr) {
5961 ldout(cct
, 20) << __func__
<< " " << pg
<< " - maps only to OSDs with primiary affinity 0" << dendl
;
5969 void OSDMap::zero_rbi(read_balance_info_t
&rbi
) const {
5971 rbi
.pa_weighted
= 0.;
5972 rbi
.pa_weighted_avg
= 0.;
5974 rbi
.optimal_score
= 0.;
5975 rbi
.adjusted_score
= 0.;
5976 rbi
.acting_raw_score
= 0.;
5977 rbi
.acting_adj_score
= 0.;
5981 int OSDMap::set_rbi(
5983 read_balance_info_t
&rbi
,
5989 float total_osd_weight
,
5990 uint max_prims_per_osd
,
5991 uint max_acting_prims_per_osd
,
5992 float avg_prims_per_osd
,
5993 bool prim_on_zero_pa
,
5994 bool acting_on_zero_pa
,
5995 float max_osd_score
) const
5997 // put all the ugly code here, so rest of code is nicer.
5998 const pg_pool_t
* pool
= get_pg_pool(pool_id
);
6001 if (total_w_pa
/ total_osd_weight
< 1. / float(pool
->get_size())) {
6002 ldout(cct
, 20) << __func__
<< " pool " << pool_id
<< " average primary affinity is lower than"
6003 << 1. / float(pool
->get_size()) << dendl
;
6004 rbi
.err_msg
= fmt::format(
6005 "pool {} average primary affinity is lower than {:.2f}, read balance score is not reliable",
6006 pool_id
, 1. / float(pool
->get_size()));
6009 rbi
.pa_weighted
= total_w_pa
;
6011 // weighted_prim_affinity_avg
6012 rbi
.pa_weighted_avg
= rbi_round(rbi
.pa_weighted
/ total_osd_weight
); // in [0..1]
6013 // p_rbi->pa_weighted / osd_pa_count; // in [0..1]
6015 rbi
.raw_score
= rbi_round((float)max_prims_per_osd
/ avg_prims_per_osd
); // >=1
6016 if (acting_on_zero_pa
) {
6017 rbi
.acting_raw_score
= rbi_round(max_osd_score
);
6018 rbi
.err_msg
= fmt::format(
6019 "pool {} has acting primaries on OSD(s) with primary affinity 0, read balance score is not accurate",
6022 rbi
.acting_raw_score
= rbi_round((float)max_acting_prims_per_osd
/ avg_prims_per_osd
);
6025 if (osd_pa_count
!= 0) {
6026 // this implies that pa_sum > 0
6027 rbi
.pa_avg
= rbi_round(pa_sum
/ osd_pa_count
); // in [0..1]
6032 if (rbi
.pa_avg
!= 0.) {
6034 if ((zpg
= has_zero_pa_pgs(cct
, pool_id
)) >= 0) {
6035 pg_t
pg(zpg
, pool_id
);
6036 std::stringstream ss
;
6038 ldout(cct
, 10) << __func__
<< " pool " << pool_id
<< " has some PGs where all OSDs are with primary_affinity 0 (" << pg
<< ",...)" << dendl
;
6039 rbi
.err_msg
= fmt::format(
6040 "pool {} has some PGs where all OSDs are with primary_affinity 0 (at least pg {}), read balance score may not be reliable",
6044 rbi
.optimal_score
= rbi_round(float(num_osds
) / float(osd_pa_count
)); // >= 1
6045 // adjust the score to the primary affinity setting (if prim affinity is set
6046 // the raw score can't be 1 and the optimal (perfect) score is hifgher than 1)
6047 // When total system primary affinity is too low (average < 1 / pool replica count)
6048 // the score is negative in order to grab the user's attention.
6049 rbi
.adjusted_score
= rbi_round(rbi
.raw_score
/ rbi
.optimal_score
); // >= 1 if PA is not low
6050 rbi
.acting_adj_score
= rbi_round(rbi
.acting_raw_score
/ rbi
.optimal_score
); // >= 1 if PA is not low
6053 // We should never get here - this condition is checked before calling this function - this is just sanity check code.
6054 rbi
.err_msg
= fmt::format(
6055 "pool {} all OSDs have zero primary affinity, can't calculate a reliable read balance score",
6063 int OSDMap::calc_read_balance_score(CephContext
*cct
, int64_t pool_id
,
6064 read_balance_info_t
*p_rbi
) const
6066 //BUG: wrong score with one PG replica 3 and 4 OSDs
6068 ldout(cct
,20) << __func__
<< " pool " << get_pool_name(pool_id
) << dendl
;
6071 tmp_osd_map
.deepish_copy_from(*this);
6072 if (p_rbi
== nullptr) {
6073 // The only case where error message is not set - this is not tested in the unit test.
6075 ldout(cct
,30) << __func__
<< " p_rbi is nullptr." << dendl
;
6079 if (tmp_osd_map
.pools
.count(pool_id
) == 0) {
6081 ldout(cct
,30) << __func__
<< " pool " << pool_id
<< " not found." << dendl
;
6083 p_rbi
->err_msg
= fmt::format("pool {} not found", pool_id
);
6087 const pg_pool_t
* pool
= tmp_osd_map
.get_pg_pool(pool_id
);
6088 auto num_pgs
= pool
->get_pg_num();
6090 map
<uint64_t,set
<pg_t
>> pgs_by_osd
;
6091 map
<uint64_t,set
<pg_t
>> prim_pgs_by_osd
;
6092 map
<uint64_t,set
<pg_t
>> acting_prims_by_osd
;
6094 pgs_by_osd
= tmp_osd_map
.get_pgs_by_osd(cct
, pool_id
, &prim_pgs_by_osd
, &acting_prims_by_osd
);
6097 ldout(cct
,30) << __func__
<< " Primaries for pool: "
6098 << prim_pgs_by_osd
<< dendl
;
6100 if (pgs_by_osd
.empty()) {
6101 //p_rbi->err_msg = fmt::format("pool {} has no PGs mapped to OSDs", pool_id);
6104 if (cct
!= nullptr) {
6105 for (auto& [osd
,pgs
] : prim_pgs_by_osd
) {
6106 ldout(cct
,20) << __func__
<< " Pool " << pool_id
<< " OSD." << osd
6107 << " has " << pgs
.size() << " primary PGs, "
6108 << acting_prims_by_osd
[osd
].size() << " acting primaries."
6113 auto num_osds
= pgs_by_osd
.size();
6115 float avg_prims_per_osd
= (float)num_pgs
/ (float)num_osds
;
6116 uint64_t max_prims_per_osd
= 0;
6117 uint64_t max_acting_prims_per_osd
= 0;
6118 float max_osd_score
= 0.;
6119 bool prim_on_zero_pa
= false;
6120 bool acting_on_zero_pa
= false;
6122 float prim_affinity_sum
= 0.;
6123 float total_osd_weight
= 0.;
6124 float total_weighted_pa
= 0.;
6126 map
<int,float> osds_crush_weight
;
6127 // Set up the OSDMap
6128 int ruleno
= tmp_osd_map
.pools
.at(pool_id
).get_crush_rule();
6129 tmp_osd_map
.crush
->get_rule_weight_osd_map(ruleno
, &osds_crush_weight
);
6131 if (cct
!= nullptr) {
6132 ldout(cct
,20) << __func__
<< " pool " << pool_id
6133 << " ruleno " << ruleno
6134 << " weight-map " << osds_crush_weight
6137 uint osd_pa_count
= 0;
6139 for (auto [osd
, oweight
] : osds_crush_weight
) { // loop over all OSDs
6140 total_osd_weight
+= oweight
;
6141 float osd_pa
= tmp_osd_map
.get_primary_affinityf(osd
);
6142 total_weighted_pa
+= oweight
* osd_pa
;
6146 if (prim_pgs_by_osd
.count(osd
)) {
6147 auto n_prims
= prim_pgs_by_osd
.at(osd
).size();
6148 max_prims_per_osd
= std::max(max_prims_per_osd
, n_prims
);
6150 prim_on_zero_pa
= true;
6153 if (acting_prims_by_osd
.count(osd
)) {
6154 auto n_aprims
= acting_prims_by_osd
.at(osd
).size();
6155 max_acting_prims_per_osd
= std::max(max_acting_prims_per_osd
, n_aprims
);
6157 max_osd_score
= std::max(max_osd_score
, float(n_aprims
) / osd_pa
);
6160 acting_on_zero_pa
= true;
6164 prim_affinity_sum
+= osd_pa
;
6165 if (cct
!= nullptr) {
6166 auto np
= prim_pgs_by_osd
.count(osd
) ? prim_pgs_by_osd
.at(osd
).size() : 0;
6167 auto nap
= acting_prims_by_osd
.count(osd
) ? acting_prims_by_osd
.at(osd
).size() : 0;
6168 auto wt
= osds_crush_weight
.count(osd
) ? osds_crush_weight
.at(osd
) : 0.;
6169 ldout(cct
,30) << __func__
<< " OSD." << osd
<< " info: "
6170 << " num_primaries " << np
6171 << " num_acting_prims " << nap
6172 << " prim_affinity " << tmp_osd_map
.get_primary_affinityf(osd
)
6177 if (cct
!= nullptr) {
6178 ldout(cct
,30) << __func__
<< " pool " << pool_id
6179 << " total_osd_weight " << total_osd_weight
6180 << " total_weighted_pa " << total_weighted_pa
6184 if (prim_affinity_sum
== 0.0) {
6185 if (cct
!= nullptr) {
6186 ldout(cct
, 10) << __func__
<< " pool " << pool_id
6187 << " has primary_affinity set to zero on all OSDs" << dendl
;
6190 p_rbi
->err_msg
= fmt::format("pool {} has primary_affinity set to zero on all OSDs", pool_id
);
6192 return -ERANGE
; // score has a different meaning now.
6195 max_osd_score
*= prim_affinity_sum
/ num_osds
;
6198 rc
= tmp_osd_map
.set_rbi(cct
, *p_rbi
, pool_id
, total_weighted_pa
,
6199 prim_affinity_sum
, num_osds
, osd_pa_count
,
6200 total_osd_weight
, max_prims_per_osd
,
6201 max_acting_prims_per_osd
, avg_prims_per_osd
,
6202 prim_on_zero_pa
, acting_on_zero_pa
, max_osd_score
);
6204 if (cct
!= nullptr) {
6205 ldout(cct
,30) << __func__
<< " pool " << get_pool_name(pool_id
)
6206 << " pa_avg " << p_rbi
->pa_avg
6207 << " pa_weighted " << p_rbi
->pa_weighted
6208 << " pa_weighted_avg " << p_rbi
->pa_weighted_avg
6209 << " optimal_score " << p_rbi
->optimal_score
6210 << " adjusted_score " << p_rbi
->adjusted_score
6211 << " acting_adj_score " << p_rbi
->acting_adj_score
6213 ldout(cct
,20) << __func__
<< " pool " << get_pool_name(pool_id
)
6214 << " raw_score: " << p_rbi
->raw_score
6215 << " acting_raw_score: " << p_rbi
->acting_raw_score
6217 ldout(cct
,10) << __func__
<< " pool " << get_pool_name(pool_id
)
6218 << " wl_score: " << p_rbi
->acting_adj_score
<< dendl
;
6224 int OSDMap::get_osds_by_bucket_name(const string
&name
, set
<int> *osds
) const
6226 return crush
->get_leaves(name
, osds
);
6229 // get pools whose crush rules might reference the given osd
6230 void OSDMap::get_pool_ids_by_osd(CephContext
*cct
,
6232 set
<int64_t> *pool_ids
) const
6234 ceph_assert(pool_ids
);
6236 int r
= crush
->get_rules_by_osd(osd
, &raw_rules
);
6238 lderr(cct
) << __func__
<< " get_rules_by_osd failed: " << cpp_strerror(r
)
6240 ceph_assert(r
>= 0);
6243 for (auto &i
: raw_rules
) {
6244 // exclude any dead rule
6245 if (crush_rule_in_use(i
)) {
6249 for (auto &r
: rules
) {
6250 get_pool_ids_by_rule(r
, pool_ids
);
6254 template <typename F
>
6255 class OSDUtilizationDumper
: public CrushTreeDumper::Dumper
<F
> {
6257 typedef CrushTreeDumper::Dumper
<F
> Parent
;
6259 OSDUtilizationDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap_
,
6260 const PGMap
& pgmap_
, bool tree_
,
6261 const string
& filter
) :
6262 Parent(crush
, osdmap_
->get_pool_names()),
6270 if (osdmap
->crush
->name_exists(filter
)) {
6271 // filter by crush node
6272 auto item_id
= osdmap
->crush
->get_item_id(filter
);
6273 allowed
.insert(item_id
);
6274 osdmap
->crush
->get_all_children(item_id
, &allowed
);
6275 } else if (osdmap
->crush
->class_exists(filter
)) {
6276 // filter by device class
6277 class_id
= osdmap
->crush
->get_class_id(filter
);
6278 } else if (auto pool_id
= osdmap
->lookup_pg_pool_name(filter
);
6281 auto crush_rule
= osdmap
->get_pool_crush_rule(pool_id
);
6283 osdmap
->crush
->find_takes_by_rule(crush_rule
, &roots
);
6285 for (auto r
: roots
)
6286 osdmap
->crush
->get_all_children(r
, &allowed
);
6288 average_util
= average_utilization();
6293 bool should_dump(int id
) const {
6294 if (!allowed
.empty() && !allowed
.count(id
)) // filter by name
6296 if (id
>= 0 && class_id
>= 0) {
6297 auto item_class_id
= osdmap
->crush
->get_item_class_id(id
);
6298 if (item_class_id
< 0 || // not bound to a class yet
6299 item_class_id
!= class_id
) // or already bound to a different class
6305 set
<int> get_dumped_osds() {
6306 if (allowed
.empty() && class_id
< 0) {
6313 void dump_stray(F
*f
) {
6314 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
6315 if (osdmap
->exists(i
) && !this->is_touched(i
))
6316 dump_item(CrushTreeDumper::Item(i
, 0, 0, 0), f
);
6320 void dump_item(const CrushTreeDumper::Item
&qi
, F
*f
) override
{
6321 if (!tree
&& (qi
.is_bucket() || dumped_osds
.count(qi
.id
)))
6323 if (!should_dump(qi
.id
))
6326 if (!qi
.is_bucket())
6327 dumped_osds
.insert(qi
.id
);
6328 float reweight
= qi
.is_bucket() ? -1 : osdmap
->get_weightf(qi
.id
);
6329 int64_t kb
= 0, kb_used
= 0, kb_used_data
= 0, kb_used_omap
= 0,
6330 kb_used_meta
= 0, kb_avail
= 0;
6332 if (get_bucket_utilization(qi
.id
, &kb
, &kb_used
, &kb_used_data
,
6333 &kb_used_omap
, &kb_used_meta
, &kb_avail
))
6335 util
= 100.0 * (double)kb_used
/ (double)kb
;
6339 var
= util
/ average_util
;
6341 size_t num_pgs
= qi
.is_bucket() ? 0 : pgmap
.get_num_pg_by_osd(qi
.id
);
6343 dump_item(qi
, reweight
, kb
, kb_used
,
6344 kb_used_data
, kb_used_omap
, kb_used_meta
,
6345 kb_avail
, util
, var
, num_pgs
, f
);
6347 if (!qi
.is_bucket() && reweight
> 0) {
6348 if (min_var
< 0 || var
< min_var
)
6350 if (max_var
< 0 || var
> max_var
)
6353 double dev
= util
- average_util
;
6355 stddev
+= reweight
* dev
;
6360 virtual void dump_item(const CrushTreeDumper::Item
&qi
,
6364 int64_t kb_used_data
,
6365 int64_t kb_used_omap
,
6366 int64_t kb_used_meta
,
6370 const size_t num_pgs
,
6374 return sum
> 0 ? sqrt(stddev
/ sum
) : 0;
6377 double average_utilization() {
6378 int64_t kb
= 0, kb_used
= 0;
6379 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
6380 if (!osdmap
->exists(i
) ||
6381 osdmap
->get_weight(i
) == 0 ||
6384 int64_t kb_i
, kb_used_i
, kb_used_data_i
, kb_used_omap_i
, kb_used_meta_i
,
6386 if (get_osd_utilization(i
, &kb_i
, &kb_used_i
, &kb_used_data_i
,
6387 &kb_used_omap_i
, &kb_used_meta_i
, &kb_avail_i
)) {
6389 kb_used
+= kb_used_i
;
6392 return kb
> 0 ? 100.0 * (double)kb_used
/ (double)kb
: 0;
6395 bool get_osd_utilization(int id
, int64_t* kb
, int64_t* kb_used
,
6396 int64_t* kb_used_data
,
6397 int64_t* kb_used_omap
,
6398 int64_t* kb_used_meta
,
6399 int64_t* kb_avail
) const {
6400 const osd_stat_t
*p
= pgmap
.get_osd_stat(id
);
6401 if (!p
) return false;
6402 *kb
= p
->statfs
.kb();
6403 *kb_used
= p
->statfs
.kb_used_raw();
6404 *kb_used_data
= p
->statfs
.kb_used_data();
6405 *kb_used_omap
= p
->statfs
.kb_used_omap();
6406 *kb_used_meta
= p
->statfs
.kb_used_internal_metadata();
6407 *kb_avail
= p
->statfs
.kb_avail();
6412 bool get_bucket_utilization(int id
, int64_t* kb
, int64_t* kb_used
,
6413 int64_t* kb_used_data
,
6414 int64_t* kb_used_omap
,
6415 int64_t* kb_used_meta
,
6416 int64_t* kb_avail
) const {
6418 if (osdmap
->is_out(id
) || !should_dump(id
)) {
6427 return get_osd_utilization(id
, kb
, kb_used
, kb_used_data
,
6428 kb_used_omap
, kb_used_meta
, kb_avail
);
6438 for (int k
= osdmap
->crush
->get_bucket_size(id
) - 1; k
>= 0; k
--) {
6439 int item
= osdmap
->crush
->get_bucket_item(id
, k
);
6440 int64_t kb_i
= 0, kb_used_i
= 0, kb_used_data_i
= 0,
6441 kb_used_omap_i
= 0, kb_used_meta_i
= 0, kb_avail_i
= 0;
6442 if (!get_bucket_utilization(item
, &kb_i
, &kb_used_i
,
6443 &kb_used_data_i
, &kb_used_omap_i
,
6444 &kb_used_meta_i
, &kb_avail_i
))
6447 *kb_used
+= kb_used_i
;
6448 *kb_used_data
+= kb_used_data_i
;
6449 *kb_used_omap
+= kb_used_omap_i
;
6450 *kb_used_meta
+= kb_used_meta_i
;
6451 *kb_avail
+= kb_avail_i
;
6457 const OSDMap
*osdmap
;
6460 double average_util
;
6467 set
<int> dumped_osds
;
// Plain-text flavour of OSDUtilizationDumper: rows are written into a
// TextTable and a one-line MIN/MAX VAR / STDDEV text is produced afterwards.
6471 class OSDUtilizationPlainDumper
: public OSDUtilizationDumper
<TextTable
> {
6473 typedef OSDUtilizationDumper
<TextTable
> Parent
;
// All arguments are forwarded unchanged to the OSDUtilizationDumper base.
6475 OSDUtilizationPlainDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap
,
6476 const PGMap
& pgmap
, bool tree
,
6477 const string
& filter
) :
6478 Parent(crush
, osdmap
, pgmap
, tree
, filter
) {}
// Declares the table columns (ID ... STATUS, plus TYPE NAME) on tbl and
// appends a totals row computed by pgmap.get_osd_sum() over
// get_dumped_osds(), ending with the average utilization.
6480 void dump(TextTable
*tbl
) {
6481 tbl
->define_column("ID", TextTable::LEFT
, TextTable::RIGHT
);
6482 tbl
->define_column("CLASS", TextTable::LEFT
, TextTable::RIGHT
);
6483 tbl
->define_column("WEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
6484 tbl
->define_column("REWEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
6485 tbl
->define_column("SIZE", TextTable::LEFT
, TextTable::RIGHT
);
6486 tbl
->define_column("RAW USE", TextTable::LEFT
, TextTable::RIGHT
);
6487 tbl
->define_column("DATA", TextTable::LEFT
, TextTable::RIGHT
);
6488 tbl
->define_column("OMAP", TextTable::LEFT
, TextTable::RIGHT
);
6489 tbl
->define_column("META", TextTable::LEFT
, TextTable::RIGHT
);
6490 tbl
->define_column("AVAIL", TextTable::LEFT
, TextTable::RIGHT
);
6491 tbl
->define_column("%USE", TextTable::LEFT
, TextTable::RIGHT
);
6492 tbl
->define_column("VAR", TextTable::LEFT
, TextTable::RIGHT
);
6493 tbl
->define_column("PGS", TextTable::LEFT
, TextTable::RIGHT
);
6494 tbl
->define_column("STATUS", TextTable::LEFT
, TextTable::RIGHT
);
6496 tbl
->define_column("TYPE NAME", TextTable::LEFT
, TextTable::LEFT
);
6502 auto sum
= pgmap
.get_osd_sum(get_dumped_osds());
6506 << byte_u_t(sum
.statfs
.total
)
6507 << byte_u_t(sum
.statfs
.get_used_raw())
6508 << byte_u_t(sum
.statfs
.allocated
)
6509 << byte_u_t(sum
.statfs
.omap_allocated
)
6510 << byte_u_t(sum
.statfs
.internal_metadata
)
6511 << byte_u_t(sum
.statfs
.available
)
6512 << lowprecision_t(average_util
)
6514 << TextTable::endrow
;
// Wrapper constructed explicitly from a float; it exists so that such
// values are streamed through the friend operator<< declared below.
6518 struct lowprecision_t
{
6520 explicit lowprecision_t(float _v
) : v(_v
) {}
6522 friend std::ostream
&operator<<(ostream
& out
, const lowprecision_t
& v
);
// Keep the base-class dump_item overloads visible next to the override.
6524 using OSDUtilizationDumper
<TextTable
>::dump_item
;
// Writes one table row for qi: item class, weight and reweight, the six
// space figures (each kb value is shifted left by 10, i.e. multiplied by
// 1024, before being wrapped in byte_u_t), then utilization and var.
// For non-buckets the up / destroyed state is looked up in the osdmap.
// The name cell is the bucket's type name and item name, or "osd.<id>".
6525 void dump_item(const CrushTreeDumper::Item
&qi
,
6529 int64_t kb_used_data
,
6530 int64_t kb_used_omap
,
6531 int64_t kb_used_meta
,
6535 const size_t num_pgs
,
6536 TextTable
*tbl
) override
{
6537 const char *c
= crush
->get_item_class(qi
.id
);
6542 << weightf_t(qi
.weight
)
6543 << weightf_t(reweight
)
6544 << byte_u_t(kb
<< 10)
6545 << byte_u_t(kb_used
<< 10)
6546 << byte_u_t(kb_used_data
<< 10)
6547 << byte_u_t(kb_used_omap
<< 10)
6548 << byte_u_t(kb_used_meta
<< 10)
6549 << byte_u_t(kb_avail
<< 10)
6550 << lowprecision_t(util
)
6551 << lowprecision_t(var
);
6553 if (qi
.is_bucket()) {
6558 if (osdmap
->is_up(qi
.id
)) {
6560 } else if (osdmap
->is_destroyed(qi
.id
)) {
6561 *tbl
<< "destroyed";
// One iteration per level of qi.depth.
6569 for (int k
= 0; k
< qi
.depth
; k
++)
6571 if (qi
.is_bucket()) {
6572 int type
= crush
->get_bucket_type(qi
.id
);
6573 name
<< crush
->get_type_name(type
) << " "
6574 << crush
->get_item_name(qi
.id
);
6576 name
<< "osd." << qi
.id
;
6581 *tbl
<< TextTable::endrow
;
// Footer text: min_var, max_var and dev(), each formatted via lowprecision_t.
6587 out
<< "MIN/MAX VAR: " << lowprecision_t(min_var
)
6588 << "/" << lowprecision_t(max_var
) << " "
6589 << "STDDEV: " << lowprecision_t(dev());
// Stream formatter for OSDUtilizationPlainDumper::lowprecision_t.
// Values below 0.001 are handled by an earlier branch. Everything else is
// written in fixed notation with two decimals; the stream's precision is
// captured beforehand and put back afterwards.
// NOTE(review): std::fixed stays set on `out` after this call, only the
// precision is restored. Confirm no caller depends on the previous float
// format of the stream.
6594 ostream
& operator<<(ostream
& out
,
6595 const OSDUtilizationPlainDumper::lowprecision_t
& v
)
6599 } else if (v
.v
< 0.001) {
6602 std::streamsize p
= out
.precision();
6603 return out
<< std::fixed
<< std::setprecision(2) << v
.v
<< std::setprecision(p
);
6607 class OSDUtilizationFormatDumper
: public OSDUtilizationDumper
<Formatter
> {
6609 typedef OSDUtilizationDumper
<Formatter
> Parent
;
6611 OSDUtilizationFormatDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap
,
6612 const PGMap
& pgmap
, bool tree
,
6613 const string
& filter
) :
6614 Parent(crush
, osdmap
, pgmap
, tree
, filter
) {}
6616 void dump(Formatter
*f
) {
6617 f
->open_array_section("nodes");
6621 f
->open_array_section("stray");
6627 using OSDUtilizationDumper
<Formatter
>::dump_item
;
6628 void dump_item(const CrushTreeDumper::Item
&qi
,
6632 int64_t kb_used_data
,
6633 int64_t kb_used_omap
,
6634 int64_t kb_used_meta
,
6638 const size_t num_pgs
,
6639 Formatter
*f
) override
{
6640 f
->open_object_section("item");
6641 CrushTreeDumper::dump_item_fields(crush
, weight_set_names
, qi
, f
);
6642 f
->dump_float("reweight", reweight
);
6643 f
->dump_int("kb", kb
);
6644 f
->dump_int("kb_used", kb_used
);
6645 f
->dump_int("kb_used_data", kb_used_data
);
6646 f
->dump_int("kb_used_omap", kb_used_omap
);
6647 f
->dump_int("kb_used_meta", kb_used_meta
);
6648 f
->dump_int("kb_avail", kb_avail
);
6649 f
->dump_float("utilization", util
);
6650 f
->dump_float("var", var
);
6651 f
->dump_unsigned("pgs", num_pgs
);
6652 if (!qi
.is_bucket()) {
6653 if (osdmap
->is_up(qi
.id
)) {
6654 f
->dump_string("status", "up");
6655 } else if (osdmap
->is_destroyed(qi
.id
)) {
6656 f
->dump_string("status", "destroyed");
6658 f
->dump_string("status", "down");
6661 CrushTreeDumper::dump_bucket_children(crush
, qi
, f
);
6666 void summary(Formatter
*f
) {
6667 f
->open_object_section("summary");
6668 auto sum
= pgmap
.get_osd_sum(get_dumped_osds());
6669 auto& s
= sum
.statfs
;
6671 f
->dump_int("total_kb", s
.kb());
6672 f
->dump_int("total_kb_used", s
.kb_used_raw());
6673 f
->dump_int("total_kb_used_data", s
.kb_used_data());
6674 f
->dump_int("total_kb_used_omap", s
.kb_used_omap());
6675 f
->dump_int("total_kb_used_meta", s
.kb_used_internal_metadata());
6676 f
->dump_int("total_kb_avail", s
.kb_avail());
6677 f
->dump_float("average_utilization", average_util
);
6678 f
->dump_float("min_var", min_var
);
6679 f
->dump_float("max_var", max_var
);
6680 f
->dump_float("dev", dev());
6685 void print_osd_utilization(const OSDMap
& osdmap
,
6690 const string
& filter
)
6692 const CrushWrapper
*crush
= osdmap
.crush
.get();
6694 f
->open_object_section("df");
6695 OSDUtilizationFormatDumper
d(crush
, &osdmap
, pgmap
, tree
, filter
);
6701 OSDUtilizationPlainDumper
d(crush
, &osdmap
, pgmap
, tree
, filter
);
6704 out
<< tbl
<< d
.summary() << "\n";
// Populate *checks with the health checks that can be derived from this
// OSDMap. Options are read from cct->_conf. Check codes added by this
// function:
//   OSD_<TYPE>_DOWN (one per crush type name, upper-cased), OSD_DOWN,
//   OSD_ORPHAN, POOL_SCRUB_FLAGS, OSD_OUT_OF_ORDER_FULL, OSD_FULL,
//   OSD_BACKFILLFULL, OSD_NEARFULL, OSDMAP_FLAGS, OSD_FLAGS,
//   OLD_CRUSH_TUNABLES, OLD_CRUSH_STRAW_CALC_VERSION, CACHE_POOL_NO_HIT_SET,
//   OSD_NO_SORTBITWISE, OSD_UPGRADE_FINISHED, POOL_FULL, POOL_BACKFILLFULL,
//   POOL_NEARFULL, POOL_PG_NUM_NOT_POWER_OF_TWO, POOL_NO_REDUNDANCY,
//   RECOVERING_STRETCH_MODE, DEGRADED_STRETCH_MODE.
6708 void OSDMap::check_health(CephContext
*cct
,
6709 health_check_map_t
*checks
) const
6711 int num_osds
= get_num_osds();
6714 // OSD_$subtree_DOWN
// NOTE(review): num_osds is the result of get_num_osds(); unless that can be
// negative this test is always true. Confirm whether "> 0" was intended.
6716 if (num_osds
>= 0) {
6717 int num_in_osds
= 0;
6718 int num_down_in_osds
= 0;
6720 set
<int> down_in_osds
;
6721 set
<int> up_in_osds
;
6722 set
<int> subtree_up
;
6723 unordered_map
<int, set
<int> > subtree_type_down
;
6724 unordered_map
<int, int> num_osds_subtree
;
6725 int max_type
= crush
->get_max_type_id();
6727 for (int i
= 0; i
< get_max_osd(); i
++) {
6729 if (crush
->item_exists(i
)) {
6734 if (is_out(i
) || (osd_state
[i
] & CEPH_OSD_NEW
))
6737 if (down_in_osds
.count(i
) || up_in_osds
.count(i
))
6740 down_in_osds
.insert(i
);
6743 for (int type
= 0; type
<= max_type
; type
++) {
6744 if (!crush
->get_type_name(type
))
6746 int r
= crush
->get_immediate_parent_id(current
, &parent_id
);
6749 // break early if this parent is already marked as up
6750 if (subtree_up
.count(parent_id
))
6752 type
= crush
->get_bucket_type(parent_id
);
6753 if (!subtree_type_is_down(
6754 cct
, parent_id
, type
,
6755 &down_in_osds
, &up_in_osds
, &subtree_up
, &subtree_type_down
))
6757 current
= parent_id
;
6762 // calculate the number of down osds in each down subtree and
6763 // store it in num_osds_subtree
6764 for (int type
= 1; type
<= max_type
; type
++) {
6765 if (!crush
->get_type_name(type
))
6767 for (auto j
= subtree_type_down
[type
].begin();
6768 j
!= subtree_type_down
[type
].end();
6772 int num_children
= crush
->get_children(*j
, &children
);
6773 if (num_children
== 0)
6775 for (auto l
= children
.begin(); l
!= children
.end(); ++l
) {
6778 } else if (num_osds_subtree
[*l
] > 0) {
6779 num
= num
+ num_osds_subtree
[*l
];
6782 num_osds_subtree
[*j
] = num
;
6785 num_down_in_osds
= down_in_osds
.size();
6786 ceph_assert(num_down_in_osds
<= num_in_osds
);
6787 if (num_down_in_osds
> 0) {
6788 // summary of down subtree types and osds
6789 for (int type
= max_type
; type
> 0; type
--) {
6790 if (!crush
->get_type_name(type
))
6792 if (subtree_type_down
[type
].size() > 0) {
6794 ss
<< subtree_type_down
[type
].size() << " "
6795 << crush
->get_type_name(type
);
6796 if (subtree_type_down
[type
].size() > 1) {
6799 int sum_down_osds
= 0;
6800 for (auto j
= subtree_type_down
[type
].begin();
6801 j
!= subtree_type_down
[type
].end();
6803 sum_down_osds
= sum_down_osds
+ num_osds_subtree
[*j
];
6805 ss
<< " (" << sum_down_osds
<< " osds) down";
6806 string err
= string("OSD_") +
6807 string(crush
->get_type_name(type
)) + "_DOWN";
6808 boost::to_upper(err
);
6809 auto& d
= checks
->add(err
, HEALTH_WARN
, ss
.str(),
6810 subtree_type_down
[type
].size());
6811 for (auto j
= subtree_type_down
[type
].rbegin();
6812 j
!= subtree_type_down
[type
].rend();
6815 ss
<< crush
->get_type_name(type
);
6817 ss
<< crush
->get_item_name(*j
);
6818 // at the top level, do not print location
6819 if (type
!= max_type
) {
6821 ss
<< crush
->get_full_location_ordered_string(*j
);
6824 int num
= num_osds_subtree
[*j
];
6825 ss
<< " (" << num
<< " osds)";
6827 d
.detail
.push_back(ss
.str());
6832 ss
<< down_in_osds
.size() << " osds down";
6833 auto& d
= checks
->add("OSD_DOWN", HEALTH_WARN
, ss
.str(),
6834 down_in_osds
.size());
6835 for (auto it
= down_in_osds
.begin(); it
!= down_in_osds
.end(); ++it
) {
6837 ss
<< "osd." << *it
<< " (";
6838 ss
<< crush
->get_full_location_ordered_string(*it
);
6840 d
.detail
.push_back(ss
.str());
6844 if (!osds
.empty()) {
6846 ss
<< osds
.size() << " osds exist in the crush map but not in the osdmap";
6847 auto& d
= checks
->add("OSD_ORPHAN", HEALTH_WARN
, ss
.str(),
6849 for (auto osd
: osds
) {
6851 ss
<< "osd." << osd
<< " exists in crush map but not in osdmap";
6852 d
.detail
.push_back(ss
.str());
6857 std::list
<std::string
> scrub_messages
;
6858 bool noscrub
= false, nodeepscrub
= false;
6859 for (const auto &p
: pools
) {
6860 if (p
.second
.flags
& pg_pool_t::FLAG_NOSCRUB
) {
6862 ss
<< "Pool " << get_pool_name(p
.first
) << " has noscrub flag";
6863 scrub_messages
.push_back(ss
.str());
6866 if (p
.second
.flags
& pg_pool_t::FLAG_NODEEP_SCRUB
) {
6868 ss
<< "Pool " << get_pool_name(p
.first
) << " has nodeep-scrub flag";
6869 scrub_messages
.push_back(ss
.str());
6873 if (noscrub
|| nodeepscrub
) {
6875 out
+= noscrub
? string("noscrub") + (nodeepscrub
? ", " : "") : "";
6876 out
+= nodeepscrub
? "nodeep-scrub" : "";
// NOTE(review): this is the only check in the function registered with
// HEALTH_OK; all others use HEALTH_WARN or HEALTH_ERR. Confirm intentional.
6877 auto& d
= checks
->add("POOL_SCRUB_FLAGS", HEALTH_OK
,
6878 "Some pool(s) have the " + out
+ " flag(s) set", 0);
6879 d
.detail
.splice(d
.detail
.end(), scrub_messages
);
6882 // OSD_OUT_OF_ORDER_FULL
6884 // An osd could configure failsafe ratio, to something different
6885 // but for now assume it is the same here.
6886 float fsr
= cct
->_conf
->osd_failsafe_full_ratio
;
6887 if (fsr
> 1.0) fsr
/= 100;
6888 float fr
= get_full_ratio();
6889 float br
= get_backfillfull_ratio();
6890 float nr
= get_nearfull_ratio();
6892 list
<string
> detail
;
6893 // These checks correspond to how OSDService::check_full_status() in an OSD
6894 // handles the improper setting of these values.
6897 ss
<< "backfillfull_ratio (" << br
6898 << ") < nearfull_ratio (" << nr
<< "), increased";
6899 detail
.push_back(ss
.str());
6904 ss
<< "full_ratio (" << fr
<< ") < backfillfull_ratio (" << br
6906 detail
.push_back(ss
.str());
6911 ss
<< "osd_failsafe_full_ratio (" << fsr
<< ") < full_ratio (" << fr
6913 detail
.push_back(ss
.str());
6915 if (!detail
.empty()) {
6916 auto& d
= checks
->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR
,
6917 "full ratio(s) out of order", 0);
6918 d
.detail
.swap(detail
);
6925 // OSD_FAILSAFE_FULL
6927 set
<int> full
, backfillfull
, nearfull
;
6928 get_full_osd_counts(&full
, &backfillfull
, &nearfull
);
6931 ss
<< full
.size() << " full osd(s)";
6932 auto& d
= checks
->add("OSD_FULL", HEALTH_ERR
, ss
.str(), full
.size());
6933 for (auto& i
: full
) {
6935 ss
<< "osd." << i
<< " is full";
6936 d
.detail
.push_back(ss
.str());
6939 if (backfillfull
.size()) {
6941 ss
<< backfillfull
.size() << " backfillfull osd(s)";
6942 auto& d
= checks
->add("OSD_BACKFILLFULL", HEALTH_WARN
, ss
.str(),
6943 backfillfull
.size());
6944 for (auto& i
: backfillfull
) {
6946 ss
<< "osd." << i
<< " is backfill full";
6947 d
.detail
.push_back(ss
.str());
6950 if (nearfull
.size()) {
6952 ss
<< nearfull
.size() << " nearfull osd(s)";
6953 auto& d
= checks
->add("OSD_NEARFULL", HEALTH_WARN
, ss
.str(), nearfull
.size());
6954 for (auto& i
: nearfull
) {
6956 ss
<< "osd." << i
<< " is near full";
6957 d
.detail
.push_back(ss
.str());
6965 uint64_t warn_flags
=
6966 CEPH_OSDMAP_PAUSERD
|
6967 CEPH_OSDMAP_PAUSEWR
|
6968 CEPH_OSDMAP_PAUSEREC
|
6970 CEPH_OSDMAP_NODOWN
|
6973 CEPH_OSDMAP_NOBACKFILL
|
6974 CEPH_OSDMAP_NORECOVER
|
6975 CEPH_OSDMAP_NOSCRUB
|
6976 CEPH_OSDMAP_NODEEP_SCRUB
|
6977 CEPH_OSDMAP_NOTIERAGENT
|
6978 CEPH_OSDMAP_NOSNAPTRIM
|
6979 CEPH_OSDMAP_NOREBALANCE
;
6980 if (test_flag(warn_flags
)) {
6982 string s
= get_flag_string(get_flags() & warn_flags
);
6983 ss
<< s
<< " flag(s) set";
6984 checks
->add("OSDMAP_FLAGS", HEALTH_WARN
, ss
.str(),
6985 s
.size() /* kludgey but sufficient */);
6991 list
<string
> detail
;
6992 const unsigned flags
=
6997 for (int i
= 0; i
< max_osd
; ++i
) {
6998 if (osd_state
[i
] & flags
) {
7001 OSDMap::calc_state_set(osd_state
[i
] & flags
, states
);
7002 ss
<< "osd." << i
<< " has flags " << states
;
7003 detail
.push_back(ss
.str());
7006 for (auto& i
: crush_node_flags
) {
7007 if (i
.second
&& crush
->item_exists(i
.first
)) {
7010 OSDMap::calc_state_set(i
.second
, states
);
7011 int t
= i
.first
>= 0 ? 0 : crush
->get_bucket_type(i
.first
);
7012 const char *tn
= crush
->get_type_name(t
);
7013 ss
<< (tn
? tn
: "node") << " "
7014 << crush
->get_item_name(i
.first
) << " has flags " << states
;
7015 detail
.push_back(ss
.str());
7018 for (auto& i
: device_class_flags
) {
7019 const char* class_name
= crush
->get_class_name(i
.first
);
7020 if (i
.second
&& class_name
) {
7023 OSDMap::calc_state_set(i
.second
, states
);
7024 ss
<< "device class '" << class_name
<< "' has flags " << states
;
7025 detail
.push_back(ss
.str());
7028 if (!detail
.empty()) {
7030 ss
<< detail
.size() << " OSDs or CRUSH {nodes, device-classes} have {NOUP,NODOWN,NOIN,NOOUT} flags set";
7031 auto& d
= checks
->add("OSD_FLAGS", HEALTH_WARN
, ss
.str(), detail
.size());
7032 d
.detail
.swap(detail
);
7036 // OLD_CRUSH_TUNABLES
7037 if (cct
->_conf
->mon_warn_on_legacy_crush_tunables
) {
7038 string min
= crush
->get_min_required_version();
7039 if (min
< cct
->_conf
->mon_crush_min_required_version
) {
7041 ss
<< "crush map has legacy tunables (require " << min
7042 << ", min is " << cct
->_conf
->mon_crush_min_required_version
<< ")";
7043 auto& d
= checks
->add("OLD_CRUSH_TUNABLES", HEALTH_WARN
, ss
.str(), 0);
7044 d
.detail
.push_back("see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables");
7048 // OLD_CRUSH_STRAW_CALC_VERSION
7049 if (cct
->_conf
->mon_warn_on_crush_straw_calc_version_zero
) {
7050 if (crush
->get_straw_calc_version() == 0) {
7052 ss
<< "crush map has straw_calc_version=0";
7053 auto& d
= checks
->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN
, ss
.str(), 0);
7055 "see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables");
7059 // CACHE_POOL_NO_HIT_SET
7060 if (cct
->_conf
->mon_warn_on_cache_pools_without_hit_sets
) {
7061 list
<string
> detail
;
7062 for (auto p
= pools
.cbegin(); p
!= pools
.cend(); ++p
) {
7063 const pg_pool_t
& info
= p
->second
;
7064 if (info
.cache_mode_requires_hit_set() &&
7065 info
.hit_set_params
.get_type() == HitSet::TYPE_NONE
) {
7067 ss
<< "pool '" << get_pool_name(p
->first
)
7068 << "' with cache_mode " << info
.get_cache_mode_name()
7069 << " needs hit_set_type to be set but it is not";
7070 detail
.push_back(ss
.str());
7073 if (!detail
.empty()) {
7075 ss
<< detail
.size() << " cache pools are missing hit_sets";
7076 auto& d
= checks
->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN
, ss
.str(),
7078 d
.detail
.swap(detail
);
7082 // OSD_NO_SORTBITWISE
7083 if (!test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
7085 ss
<< "'sortbitwise' flag is not set";
7086 checks
->add("OSD_NO_SORTBITWISE", HEALTH_WARN
, ss
.str(), 0);
7089 // OSD_UPGRADE_FINISHED
7090 if (auto require_release
= pending_require_osd_release()) {
7092 ss
<< "all OSDs are running " << *require_release
<< " or later but"
7093 << " require_osd_release < " << *require_release
;
// NOTE(review): the detail entry pushed on the next statement is the same
// ss.str() already used as the summary, so the text appears twice.
7094 auto& d
= checks
->add("OSD_UPGRADE_FINISHED", HEALTH_WARN
, ss
.str(), 0);
7095 d
.detail
.push_back(ss
.str());
7098 // POOL_NEARFULL/BACKFILLFULL/FULL
7100 list
<string
> full_detail
, backfillfull_detail
, nearfull_detail
;
// NOTE(review): "auto it" iterates by value here and in the two pool loops
// further down, copying each element of get_pools(); "const auto&" would
// avoid the copy.
7101 for (auto it
: get_pools()) {
7102 const pg_pool_t
&pool
= it
.second
;
7103 const string
& pool_name
= get_pool_name(it
.first
);
7104 if (pool
.has_flag(pg_pool_t::FLAG_FULL
)) {
7106 if (pool
.has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
7107 // may run out of space too,
7108 // but we want EQUOTA taking precedence
7109 ss
<< "pool '" << pool_name
<< "' is full (running out of quota)";
7111 ss
<< "pool '" << pool_name
<< "' is full (no space)";
7113 full_detail
.push_back(ss
.str());
7114 } else if (pool
.has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
7116 ss
<< "pool '" << pool_name
<< "' is backfillfull";
7117 backfillfull_detail
.push_back(ss
.str());
7118 } else if (pool
.has_flag(pg_pool_t::FLAG_NEARFULL
)) {
7120 ss
<< "pool '" << pool_name
<< "' is nearfull";
7121 nearfull_detail
.push_back(ss
.str());
7124 if (!full_detail
.empty()) {
7126 ss
<< full_detail
.size() << " pool(s) full";
7127 auto& d
= checks
->add("POOL_FULL", HEALTH_WARN
, ss
.str(), full_detail
.size());
7128 d
.detail
.swap(full_detail
);
7130 if (!backfillfull_detail
.empty()) {
7132 ss
<< backfillfull_detail
.size() << " pool(s) backfillfull";
7133 auto& d
= checks
->add("POOL_BACKFILLFULL", HEALTH_WARN
, ss
.str(),
7134 backfillfull_detail
.size());
7135 d
.detail
.swap(backfillfull_detail
);
7137 if (!nearfull_detail
.empty()) {
7139 ss
<< nearfull_detail
.size() << " pool(s) nearfull";
7140 auto& d
= checks
->add("POOL_NEARFULL", HEALTH_WARN
, ss
.str(),
7141 nearfull_detail
.size());
7142 d
.detail
.swap(nearfull_detail
);
7146 // POOL_PG_NUM_NOT_POWER_OF_TWO
7147 if (cct
->_conf
.get_val
<bool>("mon_warn_on_pool_pg_num_not_power_of_two")) {
7148 list
<string
> detail
;
7149 for (auto it
: get_pools()) {
7150 if (!std::has_single_bit(it
.second
.get_pg_num_target())) {
7152 ss
<< "pool '" << get_pool_name(it
.first
)
7153 << "' pg_num " << it
.second
.get_pg_num_target()
7154 << " is not a power of two";
7155 detail
.push_back(ss
.str());
7158 if (!detail
.empty()) {
7160 ss
<< detail
.size() << " pool(s) have non-power-of-two pg_num";
7161 auto& d
= checks
->add("POOL_PG_NUM_NOT_POWER_OF_TWO", HEALTH_WARN
,
7162 ss
.str(), detail
.size());
7163 d
.detail
.swap(detail
);
7167 // POOL_NO_REDUNDANCY
7168 if (cct
->_conf
.get_val
<bool>("mon_warn_on_pool_no_redundancy"))
7170 list
<string
> detail
;
7171 for (auto it
: get_pools()) {
7172 if (it
.second
.get_size() == 1) {
7174 ss
<< "pool '" << get_pool_name(it
.first
)
7175 << "' has no replicas configured";
7176 detail
.push_back(ss
.str());
7179 if (!detail
.empty()) {
7181 ss
<< detail
.size() << " pool(s) have no replicas configured";
7182 auto& d
= checks
->add("POOL_NO_REDUNDANCY", HEALTH_WARN
,
7183 ss
.str(), detail
.size());
7184 d
.detail
.swap(detail
);
7188 // DEGRADED STRETCH MODE
7189 if (cct
->_conf
.get_val
<bool>("mon_warn_on_degraded_stretch_mode")) {
7190 if (recovering_stretch_mode
) {
7192 ss
<< "We are recovering stretch mode buckets, only requiring "
7193 << degraded_stretch_mode
<< " of " << stretch_bucket_count
<< " buckets to peer" ;
7194 checks
->add("RECOVERING_STRETCH_MODE", HEALTH_WARN
,
7196 } else if (degraded_stretch_mode
) {
7198 ss
<< "We are missing stretch mode buckets, only requiring "
7199 << degraded_stretch_mode
<< " of " << stretch_bucket_count
<< " buckets to peer" ;
7200 checks
->add("DEGRADED_STRETCH_MODE", HEALTH_WARN
,
7206 int OSDMap::parse_osd_id_list(const vector
<string
>& ls
, set
<int> *out
,
7210 for (auto i
= ls
.begin(); i
!= ls
.end(); ++i
) {
7211 if (i
== ls
.begin() &&
7212 (*i
== "any" || *i
== "all" || *i
== "*")) {
7216 long osd
= ceph::common::parse_osd_id(i
->c_str(), ss
);
7218 *ss
<< "invalid osd id '" << *i
<< "'";
7226 void OSDMap::get_random_up_osds_by_subtree(int n
, // whoami
7228 int limit
, // how many
7230 set
<int> *want
) const {
7233 int subtree_type
= crush
->get_type_id(subtree
);
7234 if (subtree_type
< 1)
7236 vector
<int> subtrees
;
7237 crush
->get_subtree_of_type(subtree_type
, &subtrees
);
7238 std::random_device rd
;
7239 std::default_random_engine rng
{rd()};
7240 std::shuffle(subtrees
.begin(), subtrees
.end(), rng
);
7241 for (auto s
: subtrees
) {
7244 if (crush
->subtree_contains(s
, n
))
7247 crush
->get_children_of_type(s
, 0, &osds
);
7250 vector
<int> up_osds
;
7251 for (auto o
: osds
) {
7252 if (is_up(o
) && !skip
.count(o
))
7253 up_osds
.push_back(o
);
7255 if (up_osds
.empty())
7257 auto it
= up_osds
.begin();
7258 std::advance(it
, (n
% up_osds
.size()));
7264 float OSDMap::pool_raw_used_rate(int64_t poolid
) const
7266 const pg_pool_t
*pool
= get_pg_pool(poolid
);
7267 assert(pool
!= nullptr);
7269 switch (pool
->get_type()) {
7270 case pg_pool_t::TYPE_REPLICATED
:
7271 return pool
->get_size();
7272 case pg_pool_t::TYPE_ERASURE
:
7275 get_erasure_code_profile(pool
->erasure_code_profile
);
7276 auto pm
= ecp
.find("m");
7277 auto pk
= ecp
.find("k");
7278 if (pm
!= ecp
.end() && pk
!= ecp
.end()) {
7279 int k
= atoi(pk
->second
.c_str());
7280 int m
= atoi(pm
->second
.c_str());
7282 ceph_assert(mk
!= 0);
7283 ceph_assert(k
!= 0);
7284 return (float)mk
/ k
;
7291 ceph_abort_msg("unrecognized pool type");
7295 unsigned OSDMap::get_osd_crush_node_flags(int osd
) const
7298 if (!crush_node_flags
.empty()) {
7299 // the map will contain type -> name
7300 std::map
<std::string
,std::string
> ploc
= crush
->get_full_location(osd
);
7301 for (auto& i
: ploc
) {
7302 int id
= crush
->get_item_id(i
.second
);
7303 auto p
= crush_node_flags
.find(id
);
7304 if (p
!= crush_node_flags
.end()) {
7312 unsigned OSDMap::get_crush_node_flags(int id
) const
7315 auto it
= crush_node_flags
.find(id
);
7316 if (it
!= crush_node_flags
.end())
7321 unsigned OSDMap::get_device_class_flags(int id
) const
7324 auto it
= device_class_flags
.find(id
);
7325 if (it
!= device_class_flags
.end())
7330 std::optional
<std::string
> OSDMap::pending_require_osd_release() const
7332 if (HAVE_FEATURE(get_up_osd_features(), SERVER_QUINCY
) &&
7333 require_osd_release
< ceph_release_t::quincy
) {
7336 if (HAVE_FEATURE(get_up_osd_features(), SERVER_PACIFIC
) &&
7337 require_osd_release
< ceph_release_t::pacific
) {
7340 if (HAVE_FEATURE(get_up_osd_features(), SERVER_OCTOPUS
) &&
7341 require_osd_release
< ceph_release_t::octopus
) {
7344 if (HAVE_FEATURE(get_up_osd_features(), SERVER_NAUTILUS
) &&
7345 require_osd_release
< ceph_release_t::nautilus
) {
7349 return std::nullopt
;