1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
22 #include <fmt/format.h>
24 #include <boost/algorithm/string.hpp>
27 #include "common/config.h"
28 #include "common/errno.h"
29 #include "common/Formatter.h"
30 #include "common/TextTable.h"
31 #include "include/ceph_features.h"
32 #include "include/common_fwd.h"
33 #include "include/str_map.h"
35 #include "common/code_environment.h"
36 #include "mon/health_check.h"
38 #include "crush/CrushTreeDumper.h"
39 #include "common/Clock.h"
40 #include "mon/PGMap.h"
47 using std::ostringstream
;
51 using std::stringstream
;
52 using std::unordered_map
;
57 using ceph::Formatter
;
59 #define dout_subsys ceph_subsys_osd
61 MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap
, osdmap
, osdmap
);
62 MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental
, osdmap_inc
, osdmap
);
65 // ----------------------------------
68 void osd_info_t::dump(Formatter
*f
) const
70 f
->dump_int("last_clean_begin", last_clean_begin
);
71 f
->dump_int("last_clean_end", last_clean_end
);
72 f
->dump_int("up_from", up_from
);
73 f
->dump_int("up_thru", up_thru
);
74 f
->dump_int("down_at", down_at
);
75 f
->dump_int("lost_at", lost_at
);
78 void osd_info_t::encode(ceph::buffer::list
& bl
) const
83 encode(last_clean_begin
, bl
);
84 encode(last_clean_end
, bl
);
91 void osd_info_t::decode(ceph::buffer::list::const_iterator
& bl
)
96 decode(last_clean_begin
, bl
);
97 decode(last_clean_end
, bl
);
104 void osd_info_t::generate_test_instances(list
<osd_info_t
*>& o
)
106 o
.push_back(new osd_info_t
);
107 o
.push_back(new osd_info_t
);
108 o
.back()->last_clean_begin
= 1;
109 o
.back()->last_clean_end
= 2;
110 o
.back()->up_from
= 30;
111 o
.back()->up_thru
= 40;
112 o
.back()->down_at
= 5;
113 o
.back()->lost_at
= 6;
// Stream a one-line, human-readable summary of an osd_info_t to out:
// up_from, up_thru, down_at, the clean interval rendered as
// "[last_clean_begin,last_clean_end)", followed by lost_at.
// NOTE(review): the braces, whatever sits between the two output statements
// (presumably a guard on the lost_at output), and the return statement are
// not visible in this view -- confirm against the full file.
116 ostream
& operator<<(ostream
& out
, const osd_info_t
& info
)
118 out
<< "up_from " << info
.up_from
119 << " up_thru " << info
.up_thru
120 << " down_at " << info
.down_at
121 << " last_clean_interval [" << info
.last_clean_begin
<< "," << info
.last_clean_end
<< ")";
123 out
<< " lost_at " << info
.lost_at
;
127 // ----------------------------------
130 void osd_xinfo_t::dump(Formatter
*f
) const
132 f
->dump_stream("down_stamp") << down_stamp
;
133 f
->dump_float("laggy_probability", laggy_probability
);
134 f
->dump_int("laggy_interval", laggy_interval
);
135 f
->dump_int("features", features
);
136 f
->dump_unsigned("old_weight", old_weight
);
137 f
->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub
;
138 f
->dump_int("dead_epoch", dead_epoch
);
141 void osd_xinfo_t::encode(ceph::buffer::list
& bl
, uint64_t enc_features
) const
144 if (!HAVE_FEATURE(enc_features
, SERVER_OCTOPUS
)) {
147 ENCODE_START(v
, 1, bl
);
148 encode(down_stamp
, bl
);
149 __u32 lp
= laggy_probability
* float(0xfffffffful
);
151 encode(laggy_interval
, bl
);
152 encode(features
, bl
);
153 encode(old_weight
, bl
);
155 encode(last_purged_snaps_scrub
, bl
);
156 encode(dead_epoch
, bl
);
161 void osd_xinfo_t::decode(ceph::buffer::list::const_iterator
& bl
)
164 decode(down_stamp
, bl
);
167 laggy_probability
= (float)lp
/ (float)0xffffffff;
168 decode(laggy_interval
, bl
);
170 decode(features
, bl
);
174 decode(old_weight
, bl
);
178 decode(last_purged_snaps_scrub
, bl
);
179 decode(dead_epoch
, bl
);
186 void osd_xinfo_t::generate_test_instances(list
<osd_xinfo_t
*>& o
)
188 o
.push_back(new osd_xinfo_t
);
189 o
.push_back(new osd_xinfo_t
);
190 o
.back()->down_stamp
= utime_t(2, 3);
191 o
.back()->laggy_probability
= .123;
192 o
.back()->laggy_interval
= 123456;
193 o
.back()->old_weight
= 0x7fff;
196 ostream
& operator<<(ostream
& out
, const osd_xinfo_t
& xi
)
198 return out
<< "down_stamp " << xi
.down_stamp
199 << " laggy_probability " << xi
.laggy_probability
200 << " laggy_interval " << xi
.laggy_interval
201 << " old_weight " << xi
.old_weight
202 << " last_purged_snaps_scrub " << xi
.last_purged_snaps_scrub
203 << " dead_epoch " << xi
.dead_epoch
;
206 // ----------------------------------
207 // OSDMap::Incremental
209 int OSDMap::Incremental::get_net_marked_out(const OSDMap
*previous
) const
212 for (auto &weight
: new_weight
) {
213 if (weight
.second
== CEPH_OSD_OUT
&& !previous
->is_out(weight
.first
))
215 else if (weight
.second
!= CEPH_OSD_OUT
&& previous
->is_out(weight
.first
))
221 int OSDMap::Incremental::get_net_marked_down(const OSDMap
*previous
) const
224 for (auto &state
: new_state
) { //
225 if (state
.second
& CEPH_OSD_UP
) {
226 if (previous
->is_up(state
.first
))
235 int OSDMap::Incremental::identify_osd(uuid_d u
) const
237 for (auto &uuid
: new_uuid
)
238 if (uuid
.second
== u
)
243 int OSDMap::Incremental::propagate_base_properties_to_tiers(CephContext
*cct
,
244 const OSDMap
& osdmap
)
246 ceph_assert(epoch
== osdmap
.get_epoch() + 1);
248 for (auto &new_pool
: new_pools
) {
249 if (!new_pool
.second
.tiers
.empty()) {
250 pg_pool_t
& base
= new_pool
.second
;
252 auto new_rem_it
= new_removed_snaps
.find(new_pool
.first
);
254 for (const auto &tier_pool
: base
.tiers
) {
255 const auto &r
= new_pools
.find(tier_pool
);
257 if (r
== new_pools
.end()) {
258 const pg_pool_t
*orig
= osdmap
.get_pg_pool(tier_pool
);
260 lderr(cct
) << __func__
<< " no pool " << tier_pool
<< dendl
;
263 tier
= get_new_pool(tier_pool
, orig
);
267 if (tier
->tier_of
!= new_pool
.first
) {
268 lderr(cct
) << __func__
<< " " << r
->first
<< " tier_of != " << new_pool
.first
<< dendl
;
272 ldout(cct
, 10) << __func__
<< " from " << new_pool
.first
<< " to "
273 << tier_pool
<< dendl
;
274 tier
->snap_seq
= base
.snap_seq
;
275 tier
->snap_epoch
= base
.snap_epoch
;
276 tier
->snaps
= base
.snaps
;
277 tier
->removed_snaps
= base
.removed_snaps
;
278 tier
->flags
|= base
.flags
& (pg_pool_t::FLAG_SELFMANAGED_SNAPS
|
279 pg_pool_t::FLAG_POOL_SNAPS
);
281 if (new_rem_it
!= new_removed_snaps
.end()) {
282 new_removed_snaps
[tier_pool
] = new_rem_it
->second
;
285 tier
->application_metadata
= base
.application_metadata
;
292 // ----------------------------------
295 bool OSDMap::subtree_is_down(int id
, set
<int> *down_cache
) const
301 down_cache
->count(id
)) {
306 crush
->get_children(id
, &children
);
307 for (const auto &child
: children
) {
308 if (!subtree_is_down(child
, down_cache
)) {
313 down_cache
->insert(id
);
318 bool OSDMap::containing_subtree_is_down(CephContext
*cct
, int id
, int subtree_type
, set
<int> *down_cache
) const
320 // use a stack-local down_cache if we didn't get one from the
321 // caller. then at least this particular call will avoid duplicated
323 set
<int> local_down_cache
;
325 down_cache
= &local_down_cache
;
334 type
= crush
->get_bucket_type(current
);
336 ceph_assert(type
>= 0);
338 if (!subtree_is_down(current
, down_cache
)) {
339 ldout(cct
, 30) << "containing_subtree_is_down(" << id
<< ") = false" << dendl
;
343 // is this a big enough subtree to be marked as down?
344 if (type
>= subtree_type
) {
345 ldout(cct
, 30) << "containing_subtree_is_down(" << id
<< ") = true ... " << type
<< " >= " << subtree_type
<< dendl
;
349 int r
= crush
->get_immediate_parent_id(current
, ¤t
);
356 bool OSDMap::subtree_type_is_down(
360 set
<int> *down_in_osds
,
361 set
<int> *up_in_osds
,
362 set
<int> *subtree_up
,
363 unordered_map
<int, set
<int> > *subtree_type_down
) const
366 bool is_down_ret
= is_down(id
);
369 down_in_osds
->insert(id
);
371 up_in_osds
->insert(id
);
377 if (subtree_type_down
&&
378 (*subtree_type_down
)[subtree_type
].count(id
)) {
383 crush
->get_children(id
, &children
);
384 for (const auto &child
: children
) {
385 if (!subtree_type_is_down(
386 cct
, child
, crush
->get_bucket_type(child
),
387 down_in_osds
, up_in_osds
, subtree_up
, subtree_type_down
)) {
388 subtree_up
->insert(id
);
392 if (subtree_type_down
) {
393 (*subtree_type_down
)[subtree_type
].insert(id
);
398 void OSDMap::Incremental::encode_client_old(ceph::buffer::list
& bl
) const
405 encode(modified
, bl
);
406 int32_t new_t
= new_pool_max
;
408 encode(new_flags
, bl
);
412 encode(new_max_osd
, bl
);
413 // for encode(new_pools, bl);
414 __u32 n
= new_pools
.size();
416 for (const auto &new_pool
: new_pools
) {
419 encode(new_pool
.second
, bl
, 0);
421 // for encode(new_pool_names, bl);
422 n
= new_pool_names
.size();
425 for (const auto &new_pool_name
: new_pool_names
) {
426 n
= new_pool_name
.first
;
428 encode(new_pool_name
.second
, bl
);
430 // for encode(old_pools, bl);
431 n
= old_pools
.size();
433 for (auto &old_pool
: old_pools
) {
437 encode(new_up_client
, bl
, 0);
439 // legacy is map<int32_t,uint8_t>
440 map
<int32_t, uint8_t> os
;
441 for (auto p
: new_state
) {
442 // new_state may only include some new flags (e.g., CEPH_OSD_NOOUT)
443 // that an old client could not understand.
445 uint8_t s
= p
.second
;
446 if (p
.second
!= 0 && s
== 0)
450 uint32_t n
= os
.size();
454 encode(p
.second
, bl
);
457 encode(new_weight
, bl
);
458 // for encode(new_pg_temp, bl);
459 n
= new_pg_temp
.size();
462 for (const auto &pg_temp
: new_pg_temp
) {
463 old_pg_t opg
= pg_temp
.first
.get_old_pg();
465 encode(pg_temp
.second
, bl
);
469 void OSDMap::Incremental::encode_classic(ceph::buffer::list
& bl
, uint64_t features
) const
472 if ((features
& CEPH_FEATURE_PGID64
) == 0) {
473 encode_client_old(bl
);
482 encode(modified
, bl
);
483 encode(new_pool_max
, bl
);
484 encode(new_flags
, bl
);
488 encode(new_max_osd
, bl
);
489 encode(new_pools
, bl
, features
);
490 encode(new_pool_names
, bl
);
491 encode(old_pools
, bl
);
492 encode(new_up_client
, bl
, features
);
494 map
<int32_t, uint8_t> os
;
495 for (auto p
: new_state
) {
496 // new_state may only include some new flags (e.g., CEPH_OSD_NOOUT)
497 // that an old client could not understand.
499 uint8_t s
= p
.second
;
500 if (p
.second
!= 0 && s
== 0)
504 uint32_t n
= os
.size();
508 encode(p
.second
, bl
);
511 encode(new_weight
, bl
);
512 encode(new_pg_temp
, bl
);
517 encode(new_hb_back_up
, bl
, features
);
518 encode(new_up_thru
, bl
);
519 encode(new_last_clean_interval
, bl
);
520 encode(new_lost
, bl
);
521 encode(new_blocklist
, bl
, features
);
522 encode(old_blocklist
, bl
, features
);
523 encode(new_up_cluster
, bl
, features
);
524 encode(cluster_snapshot
, bl
);
525 encode(new_uuid
, bl
);
526 encode(new_xinfo
, bl
, features
);
527 encode(new_hb_front_up
, bl
, features
);
531 static void encode_addrvec_map_as_addr(const T
& m
, ceph::buffer::list
& bl
, uint64_t f
)
533 uint32_t n
= m
.size();
537 encode(i
.second
.legacy_addr(), bl
, f
);
542 static void encode_addrvec_pvec_as_addr(const T
& m
, ceph::buffer::list
& bl
, uint64_t f
)
544 uint32_t n
= m
.size();
548 encode(i
->legacy_addr(), bl
, f
);
550 encode(entity_addr_t(), bl
, f
);
555 /* for a description of osdmap incremental versions, and when they were
556 * introduced, please refer to
557 * doc/dev/osd_internals/osdmap_versions.txt
559 void OSDMap::Incremental::encode(ceph::buffer::list
& bl
, uint64_t features
) const
562 if ((features
& CEPH_FEATURE_OSDMAP_ENC
) == 0) {
563 encode_classic(bl
, features
);
567 // only a select set of callers should *ever* be encoding new
568 // OSDMaps. others should be passing around the canonical encoded
569 // buffers from on high. select out those callers by passing in an
570 // "impossible" feature bit.
571 ceph_assert(features
& CEPH_FEATURE_RESERVED
);
572 features
&= ~CEPH_FEATURE_RESERVED
;
574 size_t start_offset
= bl
.length();
577 std::optional
<ceph::buffer::list::contiguous_filler
> crc_filler
;
579 // meta-encoding: how we include client-used and osd-specific data
580 ENCODE_START(8, 7, bl
);
584 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
586 } else if (!HAVE_FEATURE(features
, SERVER_MIMIC
)) {
588 } else if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
590 } /* else if (!HAVE_FEATURE(features, SERVER_REEF)) {
593 ENCODE_START(v
, 1, bl
); // client-usable data
596 encode(modified
, bl
);
597 encode(new_pool_max
, bl
);
598 encode(new_flags
, bl
);
602 encode(new_max_osd
, bl
);
603 encode(new_pools
, bl
, features
);
604 encode(new_pool_names
, bl
);
605 encode(old_pools
, bl
);
607 encode(new_up_client
, bl
, features
);
609 encode_addrvec_map_as_addr(new_up_client
, bl
, features
);
612 encode(new_state
, bl
);
614 map
<int32_t, uint8_t> os
;
615 for (auto p
: new_state
) {
616 // new_state may only include some new flags (e.g., CEPH_OSD_NOOUT)
617 // that an old client could not understand.
619 uint8_t s
= p
.second
;
620 if (p
.second
!= 0 && s
== 0)
624 uint32_t n
= os
.size();
628 encode(p
.second
, bl
);
631 encode(new_weight
, bl
);
632 encode(new_pg_temp
, bl
);
633 encode(new_primary_temp
, bl
);
634 encode(new_primary_affinity
, bl
);
635 encode(new_erasure_code_profiles
, bl
);
636 encode(old_erasure_code_profiles
, bl
);
638 encode(new_pg_upmap
, bl
);
639 encode(old_pg_upmap
, bl
);
640 encode(new_pg_upmap_items
, bl
);
641 encode(old_pg_upmap_items
, bl
);
644 encode(new_removed_snaps
, bl
);
645 encode(new_purged_snaps
, bl
);
648 encode(new_last_up_change
, bl
);
649 encode(new_last_in_change
, bl
);
652 encode(new_pg_upmap_primary
, bl
);
653 encode(old_pg_upmap_primary
, bl
);
655 ENCODE_FINISH(bl
); // client-usable data
659 uint8_t target_v
= 9; // if bumping this, be aware of allow_crimson 12
660 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
662 } else if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
665 if (change_stretch_mode
) {
666 target_v
= std::max((uint8_t)10, target_v
);
668 if (!new_range_blocklist
.empty() ||
669 !old_range_blocklist
.empty()) {
670 target_v
= std::max((uint8_t)11, target_v
);
672 if (mutate_allow_crimson
!= mutate_allow_crimson_t::NONE
) {
673 target_v
= std::max((uint8_t)12, target_v
);
675 ENCODE_START(target_v
, 1, bl
); // extended, osd-only data
677 encode_addrvec_map_as_addr(new_hb_back_up
, bl
, features
);
679 encode(new_hb_back_up
, bl
, features
);
681 encode(new_up_thru
, bl
);
682 encode(new_last_clean_interval
, bl
);
683 encode(new_lost
, bl
);
684 encode(new_blocklist
, bl
, features
);
685 encode(old_blocklist
, bl
, features
);
687 encode_addrvec_map_as_addr(new_up_cluster
, bl
, features
);
689 encode(new_up_cluster
, bl
, features
);
691 encode(cluster_snapshot
, bl
);
692 encode(new_uuid
, bl
);
693 encode(new_xinfo
, bl
, features
);
695 encode_addrvec_map_as_addr(new_hb_front_up
, bl
, features
);
697 encode(new_hb_front_up
, bl
, features
);
699 encode(features
, bl
); // NOTE: features arg, not the member
701 encode(new_nearfull_ratio
, bl
);
702 encode(new_full_ratio
, bl
);
703 encode(new_backfillfull_ratio
, bl
);
705 // 5 was string-based new_require_min_compat_client
707 encode(new_require_min_compat_client
, bl
);
708 encode(new_require_osd_release
, bl
);
711 encode(new_crush_node_flags
, bl
);
714 encode(new_device_class_flags
, bl
);
716 if (target_v
>= 10) {
717 encode(change_stretch_mode
, bl
);
718 encode(new_stretch_bucket_count
, bl
);
719 encode(new_degraded_stretch_mode
, bl
);
720 encode(new_recovering_stretch_mode
, bl
);
721 encode(new_stretch_mode_bucket
, bl
);
722 encode(stretch_mode_enabled
, bl
);
724 if (target_v
>= 11) {
725 encode(new_range_blocklist
, bl
, features
);
726 encode(old_range_blocklist
, bl
, features
);
728 if (target_v
>= 12) {
729 encode(mutate_allow_crimson
, bl
);
731 ENCODE_FINISH(bl
); // osd-only data
734 crc_offset
= bl
.length();
735 crc_filler
= bl
.append_hole(sizeof(uint32_t));
736 tail_offset
= bl
.length();
738 encode(full_crc
, bl
);
740 ENCODE_FINISH(bl
); // meta-encoding wrapper
743 ceph::buffer::list front
;
744 front
.substr_of(bl
, start_offset
, crc_offset
- start_offset
);
745 inc_crc
= front
.crc32c(-1);
746 ceph::buffer::list tail
;
747 tail
.substr_of(bl
, tail_offset
, bl
.length() - tail_offset
);
748 inc_crc
= tail
.crc32c(inc_crc
);
751 crc_filler
->copy_in(4u, (char*)&crc_le
);
755 void OSDMap::Incremental::decode_classic(ceph::buffer::list::const_iterator
&p
)
765 if (v
== 4 || v
== 5) {
769 decode(new_pool_max
, p
);
770 decode(new_flags
, p
);
774 decode(new_max_osd
, p
);
780 decode(new_pools
[t
], p
);
783 decode(new_pools
, p
);
786 new_pool_names
.clear();
790 decode(new_pool_names
[t
], p
);
793 decode(new_pool_names
, p
);
803 decode(old_pools
, p
);
805 decode(new_up_client
, p
);
807 map
<int32_t,uint8_t> ns
;
810 new_state
[q
.first
] = q
.second
;
813 decode(new_weight
, p
);
820 ceph::decode_raw(opg
, p
);
821 decode(new_pg_temp
[pg_t(opg
)], p
);
824 decode(new_pg_temp
, p
);
827 // decode short map, too.
828 if (v
== 5 && p
.end())
835 decode(new_hb_back_up
, p
);
837 decode(new_pool_names
, p
);
838 decode(new_up_thru
, p
);
839 decode(new_last_clean_interval
, p
);
841 decode(new_blocklist
, p
);
842 decode(old_blocklist
, p
);
844 decode(new_up_cluster
, p
);
846 decode(cluster_snapshot
, p
);
850 decode(new_xinfo
, p
);
852 decode(new_hb_front_up
, p
);
855 /* for a description of osdmap incremental versions, and when they were
856 * introduced, please refer to
857 * doc/dev/osd_internals/osdmap_versions.txt
859 void OSDMap::Incremental::decode(ceph::buffer::list::const_iterator
& bl
)
863 * Older encodings of the Incremental had a single struct_v which
864 * covered the whole encoding, and was prior to our modern
865 * stuff which includes a compatv and a size. So if we see
866 * a struct_v < 7, we must rewind to the beginning and use our
869 size_t start_offset
= bl
.get_off();
870 size_t tail_offset
= 0;
871 ceph::buffer::list crc_front
, crc_tail
;
873 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl
); // wrapper
875 bl
.seek(start_offset
);
879 encode_features
= CEPH_FEATURE_PGID64
;
885 DECODE_START(8, bl
); // client-usable data
888 decode(modified
, bl
);
889 decode(new_pool_max
, bl
);
890 decode(new_flags
, bl
);
894 decode(new_max_osd
, bl
);
895 decode(new_pools
, bl
);
896 decode(new_pool_names
, bl
);
897 decode(old_pools
, bl
);
898 decode(new_up_client
, bl
);
900 decode(new_state
, bl
);
902 map
<int32_t,uint8_t> ns
;
905 new_state
[q
.first
] = q
.second
;
908 decode(new_weight
, bl
);
909 decode(new_pg_temp
, bl
);
910 decode(new_primary_temp
, bl
);
912 decode(new_primary_affinity
, bl
);
914 new_primary_affinity
.clear();
916 decode(new_erasure_code_profiles
, bl
);
917 decode(old_erasure_code_profiles
, bl
);
919 new_erasure_code_profiles
.clear();
920 old_erasure_code_profiles
.clear();
923 decode(new_pg_upmap
, bl
);
924 decode(old_pg_upmap
, bl
);
925 decode(new_pg_upmap_items
, bl
);
926 decode(old_pg_upmap_items
, bl
);
929 decode(new_removed_snaps
, bl
);
930 decode(new_purged_snaps
, bl
);
933 decode(new_last_up_change
, bl
);
934 decode(new_last_in_change
, bl
);
936 DECODE_FINISH(bl
); // client-usable data
940 DECODE_START(10, bl
); // extended, osd-only data
941 decode(new_hb_back_up
, bl
);
942 decode(new_up_thru
, bl
);
943 decode(new_last_clean_interval
, bl
);
944 decode(new_lost
, bl
);
945 decode(new_blocklist
, bl
);
946 decode(old_blocklist
, bl
);
947 decode(new_up_cluster
, bl
);
948 decode(cluster_snapshot
, bl
);
949 decode(new_uuid
, bl
);
950 decode(new_xinfo
, bl
);
951 decode(new_hb_front_up
, bl
);
953 decode(encode_features
, bl
);
955 encode_features
= CEPH_FEATURE_PGID64
| CEPH_FEATURE_OSDMAP_ENC
;
957 decode(new_nearfull_ratio
, bl
);
958 decode(new_full_ratio
, bl
);
960 new_nearfull_ratio
= -1;
964 decode(new_backfillfull_ratio
, bl
);
966 new_backfillfull_ratio
= -1;
972 new_require_min_compat_client
= ceph_release_from_name(r
);
976 decode(new_require_min_compat_client
, bl
);
977 decode(new_require_osd_release
, bl
);
979 if (new_flags
>= 0 && (new_flags
& CEPH_OSDMAP_REQUIRE_LUMINOUS
)) {
980 // only for compat with post-kraken pre-luminous test clusters
981 new_require_osd_release
= ceph_release_t::luminous
;
982 new_flags
&= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS
);
983 } else if (new_flags
>= 0 && (new_flags
& CEPH_OSDMAP_REQUIRE_KRAKEN
)) {
984 new_require_osd_release
= ceph_release_t::kraken
;
985 } else if (new_flags
>= 0 && (new_flags
& CEPH_OSDMAP_REQUIRE_JEWEL
)) {
986 new_require_osd_release
= ceph_release_t::jewel
;
988 new_require_osd_release
= ceph_release_t::unknown
;
992 decode(new_crush_node_flags
, bl
);
995 decode(new_device_class_flags
, bl
);
997 if (struct_v
>= 10) {
998 decode(change_stretch_mode
, bl
);
999 decode(new_stretch_bucket_count
, bl
);
1000 decode(new_degraded_stretch_mode
, bl
);
1001 decode(new_recovering_stretch_mode
, bl
);
1002 decode(new_stretch_mode_bucket
, bl
);
1003 decode(stretch_mode_enabled
, bl
);
1005 if (struct_v
>= 11) {
1006 decode(new_range_blocklist
, bl
);
1007 decode(old_range_blocklist
, bl
);
1009 if (struct_v
>= 12) {
1010 decode(mutate_allow_crimson
, bl
);
1012 DECODE_FINISH(bl
); // osd-only data
1015 if (struct_v
>= 8) {
1017 crc_front
.substr_of(bl
.get_bl(), start_offset
, bl
.get_off() - start_offset
);
1018 decode(inc_crc
, bl
);
1019 tail_offset
= bl
.get_off();
1020 decode(full_crc
, bl
);
1027 DECODE_FINISH(bl
); // wrapper
1031 uint32_t actual
= crc_front
.crc32c(-1);
1032 if (tail_offset
< bl
.get_off()) {
1033 ceph::buffer::list tail
;
1034 tail
.substr_of(bl
.get_bl(), tail_offset
, bl
.get_off() - tail_offset
);
1035 actual
= tail
.crc32c(actual
);
1037 if (inc_crc
!= actual
) {
1039 ss
<< "bad crc, actual " << actual
<< " != expected " << inc_crc
;
1040 string s
= ss
.str();
1041 throw ceph::buffer::malformed_input(s
.c_str());
1046 void OSDMap::Incremental::dump(Formatter
*f
) const
1048 f
->dump_int("epoch", epoch
);
1049 f
->dump_stream("fsid") << fsid
;
1050 f
->dump_stream("modified") << modified
;
1051 f
->dump_stream("new_last_up_change") << new_last_up_change
;
1052 f
->dump_stream("new_last_in_change") << new_last_in_change
;
1053 f
->dump_int("new_pool_max", new_pool_max
);
1054 f
->dump_int("new_flags", new_flags
);
1055 f
->dump_float("new_full_ratio", new_full_ratio
);
1056 f
->dump_float("new_nearfull_ratio", new_nearfull_ratio
);
1057 f
->dump_float("new_backfillfull_ratio", new_backfillfull_ratio
);
1058 f
->dump_int("new_require_min_compat_client", to_integer
<int>(new_require_min_compat_client
));
1059 f
->dump_int("new_require_osd_release", to_integer
<int>(new_require_osd_release
));
1060 f
->dump_unsigned("mutate_allow_crimson", static_cast<unsigned>(mutate_allow_crimson
));
1062 if (fullmap
.length()) {
1063 f
->open_object_section("full_map");
1065 ceph::buffer::list fbl
= fullmap
; // kludge around constness.
1066 auto p
= fbl
.cbegin();
1071 if (crush
.length()) {
1072 f
->open_object_section("crush");
1074 ceph::buffer::list tbl
= crush
; // kludge around constness.
1075 auto p
= tbl
.cbegin();
1081 f
->dump_int("new_max_osd", new_max_osd
);
1083 f
->open_array_section("new_pools");
1085 for (const auto &new_pool
: new_pools
) {
1086 f
->open_object_section("pool");
1087 f
->dump_int("pool", new_pool
.first
);
1088 new_pool
.second
.dump(f
);
1092 f
->open_array_section("new_pool_names");
1094 for (const auto &new_pool_name
: new_pool_names
) {
1095 f
->open_object_section("pool_name");
1096 f
->dump_int("pool", new_pool_name
.first
);
1097 f
->dump_string("name", new_pool_name
.second
);
1101 f
->open_array_section("old_pools");
1103 for (const auto &old_pool
: old_pools
)
1104 f
->dump_int("pool", old_pool
);
1107 f
->open_array_section("new_up_osds");
1109 for (const auto &upclient
: new_up_client
) {
1110 f
->open_object_section("osd");
1111 f
->dump_int("osd", upclient
.first
);
1112 f
->dump_stream("public_addr") << upclient
.second
.legacy_addr();
1113 f
->dump_object("public_addrs", upclient
.second
);
1114 if (auto p
= new_up_cluster
.find(upclient
.first
);
1115 p
!= new_up_cluster
.end()) {
1116 f
->dump_stream("cluster_addr") << p
->second
.legacy_addr();
1117 f
->dump_object("cluster_addrs", p
->second
);
1119 if (auto p
= new_hb_back_up
.find(upclient
.first
);
1120 p
!= new_hb_back_up
.end()) {
1121 f
->dump_object("heartbeat_back_addrs", p
->second
);
1123 if (auto p
= new_hb_front_up
.find(upclient
.first
);
1124 p
!= new_hb_front_up
.end()) {
1125 f
->dump_object("heartbeat_front_addrs", p
->second
);
1131 f
->open_array_section("new_weight");
1133 for (const auto &weight
: new_weight
) {
1134 f
->open_object_section("osd");
1135 f
->dump_int("osd", weight
.first
);
1136 f
->dump_int("weight", weight
.second
);
1141 f
->open_array_section("osd_state_xor");
1142 for (const auto &ns
: new_state
) {
1143 f
->open_object_section("osd");
1144 f
->dump_int("osd", ns
.first
);
1146 calc_state_set(new_state
.find(ns
.first
)->second
, st
);
1147 f
->open_array_section("state_xor");
1148 for (auto &state
: st
)
1149 f
->dump_string("state", state
);
1155 f
->open_array_section("new_pg_temp");
1157 for (const auto &pg_temp
: new_pg_temp
) {
1158 f
->open_object_section("pg");
1159 f
->dump_stream("pgid") << pg_temp
.first
;
1160 f
->open_array_section("osds");
1162 for (const auto &osd
: pg_temp
.second
)
1163 f
->dump_int("osd", osd
);
1169 f
->open_array_section("primary_temp");
1171 for (const auto &primary_temp
: new_primary_temp
) {
1172 f
->dump_stream("pgid") << primary_temp
.first
;
1173 f
->dump_int("osd", primary_temp
.second
);
1175 f
->close_section(); // primary_temp
1177 f
->open_array_section("new_pg_upmap");
1178 for (auto& i
: new_pg_upmap
) {
1179 f
->open_object_section("mapping");
1180 f
->dump_stream("pgid") << i
.first
;
1181 f
->open_array_section("osds");
1182 for (auto osd
: i
.second
) {
1183 f
->dump_int("osd", osd
);
1189 f
->open_array_section("old_pg_upmap");
1190 for (auto& i
: old_pg_upmap
) {
1191 f
->dump_stream("pgid") << i
;
1195 f
->open_array_section("new_pg_upmap_items");
1196 for (auto& i
: new_pg_upmap_items
) {
1197 f
->open_object_section("mapping");
1198 f
->dump_stream("pgid") << i
.first
;
1199 f
->open_array_section("mappings");
1200 for (auto& p
: i
.second
) {
1201 f
->open_object_section("mapping");
1202 f
->dump_int("from", p
.first
);
1203 f
->dump_int("to", p
.second
);
1210 f
->open_array_section("old_pg_upmap_items");
1211 for (auto& i
: old_pg_upmap_items
) {
1212 f
->dump_stream("pgid") << i
;
1216 // dump upmap_primaries
1217 f
->open_array_section("new_pg_upmap_primaries");
1218 for (auto& [pg
, osd
] : new_pg_upmap_primary
) {
1219 f
->open_object_section("primary_mapping");
1220 f
->dump_stream("pgid") << pg
;
1221 f
->dump_int("primary_osd", osd
);
1224 f
->close_section(); // new_pg_upmap_primaries
1226 // dump old_pg_upmap_primaries (removed primary mappings)
1227 f
->open_array_section("old_pg_upmap_primaries");
1228 for (auto& pg
: old_pg_upmap_primary
) {
1229 f
->dump_stream("pgid") << pg
;
1231 f
->close_section(); // old_pg_upmap_primaries
1233 f
->open_array_section("new_up_thru");
1235 for (const auto &up_thru
: new_up_thru
) {
1236 f
->open_object_section("osd");
1237 f
->dump_int("osd", up_thru
.first
);
1238 f
->dump_int("up_thru", up_thru
.second
);
1243 f
->open_array_section("new_lost");
1245 for (const auto &lost
: new_lost
) {
1246 f
->open_object_section("osd");
1247 f
->dump_int("osd", lost
.first
);
1248 f
->dump_int("epoch_lost", lost
.second
);
1253 f
->open_array_section("new_last_clean_interval");
1255 for (const auto &last_clean_interval
: new_last_clean_interval
) {
1256 f
->open_object_section("osd");
1257 f
->dump_int("osd", last_clean_interval
.first
);
1258 f
->dump_int("first", last_clean_interval
.second
.first
);
1259 f
->dump_int("last", last_clean_interval
.second
.second
);
1264 f
->open_array_section("new_blocklist");
1265 for (const auto &blist
: new_blocklist
) {
1268 f
->dump_stream(ss
.str().c_str()) << blist
.second
;
1271 f
->open_array_section("old_blocklist");
1272 for (const auto &blist
: old_blocklist
)
1273 f
->dump_stream("addr") << blist
;
1275 f
->open_array_section("new_range_blocklist");
1276 for (const auto &blist
: new_range_blocklist
) {
1279 f
->dump_stream(ss
.str().c_str()) << blist
.second
;
1282 f
->open_array_section("old_range_blocklist");
1283 for (const auto &blist
: old_range_blocklist
)
1284 f
->dump_stream("addr") << blist
;
1287 f
->open_array_section("new_xinfo");
1288 for (const auto &xinfo
: new_xinfo
) {
1289 f
->open_object_section("xinfo");
1290 f
->dump_int("osd", xinfo
.first
);
1291 xinfo
.second
.dump(f
);
1296 if (cluster_snapshot
.size())
1297 f
->dump_string("cluster_snapshot", cluster_snapshot
);
1299 f
->open_array_section("new_uuid");
1300 for (const auto &uuid
: new_uuid
) {
1301 f
->open_object_section("osd");
1302 f
->dump_int("osd", uuid
.first
);
1303 f
->dump_stream("uuid") << uuid
.second
;
1308 OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles
, f
);
1309 f
->open_array_section("old_erasure_code_profiles");
1310 for (const auto &erasure_code_profile
: old_erasure_code_profiles
) {
1311 f
->dump_string("old", erasure_code_profile
);
1315 f
->open_array_section("new_removed_snaps");
1316 for (auto& p
: new_removed_snaps
) {
1317 f
->open_object_section("pool");
1318 f
->dump_int("pool", p
.first
);
1319 f
->open_array_section("snaps");
1320 for (auto q
= p
.second
.begin(); q
!= p
.second
.end(); ++q
) {
1321 f
->open_object_section("interval");
1322 f
->dump_unsigned("begin", q
.get_start());
1323 f
->dump_unsigned("length", q
.get_len());
1330 f
->open_array_section("new_purged_snaps");
1331 for (auto& p
: new_purged_snaps
) {
1332 f
->open_object_section("pool");
1333 f
->dump_int("pool", p
.first
);
1334 f
->open_array_section("snaps");
1335 for (auto q
= p
.second
.begin(); q
!= p
.second
.end(); ++q
) {
1336 f
->open_object_section("interval");
1337 f
->dump_unsigned("begin", q
.get_start());
1338 f
->dump_unsigned("length", q
.get_len());
1344 f
->open_array_section("new_crush_node_flags");
1345 for (auto& i
: new_crush_node_flags
) {
1346 f
->open_object_section("node");
1347 f
->dump_int("id", i
.first
);
1349 calc_state_set(i
.second
, st
);
1350 for (auto& j
: st
) {
1351 f
->dump_string("flag", j
);
1356 f
->open_array_section("new_device_class_flags");
1357 for (auto& i
: new_device_class_flags
) {
1358 f
->open_object_section("device_class");
1359 f
->dump_int("id", i
.first
);
1361 calc_state_set(i
.second
, st
);
1362 for (auto& j
: st
) {
1363 f
->dump_string("flag", j
);
1368 f
->open_object_section("stretch_mode");
1370 f
->dump_bool("change_stretch_mode", change_stretch_mode
);
1371 f
->dump_bool("stretch_mode_enabled", stretch_mode_enabled
);
1372 f
->dump_unsigned("new_stretch_bucket_count", new_stretch_bucket_count
);
1373 f
->dump_unsigned("new_degraded_stretch_mode", new_degraded_stretch_mode
);
1374 f
->dump_unsigned("new_recovering_stretch_mode", new_recovering_stretch_mode
);
1375 f
->dump_int("new_stretch_mode_bucket", new_stretch_mode_bucket
);
1381 void OSDMap::Incremental::generate_test_instances(list
<Incremental
*>& o
)
1383 o
.push_back(new Incremental
);
1386 // ----------------------------------
// Stamp every entry of pools with last_change = e.
// NOTE(review): the lines between the signature and the loop are not visible
// in this view; presumably the map's own epoch is assigned there as well --
// confirm against the full file.
1389 void OSDMap::set_epoch(epoch_t e
)
1392 for (auto &pool
: pools
)
1393 pool
.second
.last_change
= e
;
1396 OSDMap::range_bits::range_bits() : ipv6(false) {
1397 memset(&bits
, 0, sizeof(bits
));
// Construct from an address: start out as a non-IPv6 range with bits zeroed.
// NOTE(review): the rest of the body is not visible in this view; presumably
// addr is parsed into bits there -- confirm against the full file.
1400 OSDMap::range_bits::range_bits(const entity_addr_t
& addr
) : ipv6(false) {
1401 memset(&bits
, 0, sizeof(bits
));
1405 void OSDMap::range_bits::get_ipv6_bytes(unsigned const char *addr
,
1406 uint64_t *upper
, uint64_t *lower
)
1408 *upper
= ((uint64_t)(ntohl(*(uint32_t*)(addr
)))) << 32 |
1409 ((uint64_t)(ntohl(*(uint32_t*)(&addr
[4]))));
1410 *lower
= ((uint64_t)(ntohl(*(uint32_t*)(&addr
[8])))) << 32 |
1411 ((uint64_t)(ntohl(*(uint32_t*)(&addr
[12]))));
1414 void OSDMap::range_bits::parse(const entity_addr_t
& addr
) {
1415 // parse it into meaningful data
1416 if (addr
.is_ipv6()) {
1417 get_ipv6_bytes(addr
.in6_addr().sin6_addr
.s6_addr
,
1418 &bits
.ipv6
.upper_64_bits
, &bits
.ipv6
.lower_64_bits
);
1419 int32_t lower_shift
= std::min(128-
1420 static_cast<int32_t>(addr
.get_nonce()), 64);
1421 int32_t upper_shift
= std::max(64- //(128-b.first.get_nonce())-64
1422 static_cast<int32_t>(addr
.get_nonce()), 0);
1424 auto get_mask
= [](int32_t shift
) -> uint64_t {
1425 if (shift
>= 0 && shift
< 64) {
1426 return UINT64_MAX
<< shift
;
1431 bits
.ipv6
.lower_mask
= get_mask(lower_shift
);
1432 bits
.ipv6
.upper_mask
= get_mask(upper_shift
);
1434 } else if (addr
.is_ipv4()) {
1435 bits
.ipv4
.ip_32_bits
= ntohl(addr
.in4_addr().sin_addr
.s_addr
);
1436 if (addr
.get_nonce() > 0) {
1437 bits
.ipv4
.mask
= UINT32_MAX
<< (32-addr
.get_nonce());
1446 bool OSDMap::range_bits::matches(const entity_addr_t
& addr
) const {
1447 if (addr
.is_ipv4() && !ipv6
) {
1448 return ((ntohl(addr
.in4_addr().sin_addr
.s_addr
) & bits
.ipv4
.mask
) ==
1449 (bits
.ipv4
.ip_32_bits
& bits
.ipv4
.mask
));
1450 } else if (addr
.is_ipv6() && ipv6
) {
1451 uint64_t upper_64
, lower_64
;
1452 get_ipv6_bytes(addr
.in6_addr().sin6_addr
.s6_addr
, &upper_64
, &lower_64
);
1453 return (((upper_64
& bits
.ipv6
.upper_mask
) ==
1454 (bits
.ipv6
.upper_64_bits
& bits
.ipv6
.upper_mask
)) &&
1455 ((lower_64
& bits
.ipv6
.lower_mask
) ==
1456 (bits
.ipv6
.lower_64_bits
& bits
.ipv6
.lower_mask
)));
1461 bool OSDMap::is_blocklisted(const entity_addr_t
& orig
, CephContext
*cct
) const
1463 if (cct
) ldout(cct
, 25) << "is_blocklisted: " << orig
<< dendl
;
1464 if (blocklist
.empty() && range_blocklist
.empty()) {
1465 if (cct
) ldout(cct
, 30) << "not blocklisted: " << orig
<< dendl
;
1469 // all blocklist entries are type ANY for nautilus+
1470 // FIXME: avoid this copy!
1471 entity_addr_t a
= orig
;
1472 if (require_osd_release
< ceph_release_t::nautilus
) {
1473 a
.set_type(entity_addr_t::TYPE_LEGACY
);
1475 a
.set_type(entity_addr_t::TYPE_ANY
);
1478 // this specific instance?
1479 if (blocklist
.count(a
)) {
1480 if (cct
) ldout(cct
, 20) << "blocklist contains " << a
<< dendl
;
1484 // is entire ip blocklisted?
1488 if (blocklist
.count(a
)) {
1489 if (cct
) ldout(cct
, 20) << "blocklist contains " << a
<< dendl
;
1494 // is it in a blocklisted range?
1495 for (const auto& i
: calculated_ranges
) {
1496 bool blocked
= i
.second
.matches(a
);
1498 if (cct
) ldout(cct
, 20) << "range_blocklist contains " << a
<< dendl
;
1503 if (cct
) ldout(cct
, 25) << "not blocklisted: " << orig
<< dendl
;
1507 bool OSDMap::is_blocklisted(const entity_addrvec_t
& av
, CephContext
*cct
) const
1509 if (blocklist
.empty() && range_blocklist
.empty())
1512 for (auto& a
: av
.v
) {
1513 if (is_blocklisted(a
, cct
)) {
1521 void OSDMap::get_blocklist(list
<pair
<entity_addr_t
,utime_t
> > *bl
,
1522 std::list
<std::pair
<entity_addr_t
,utime_t
> > *rl
) const
1524 std::copy(blocklist
.begin(), blocklist
.end(), std::back_inserter(*bl
));
1525 std::copy(range_blocklist
.begin(), range_blocklist
.end(),
1526 std::back_inserter(*rl
));
1529 void OSDMap::get_blocklist(std::set
<entity_addr_t
> *bl
,
1530 std::set
<entity_addr_t
> *rl
) const
1532 for (const auto &i
: blocklist
) {
1533 bl
->insert(i
.first
);
1535 for (const auto &i
: range_blocklist
) {
1536 rl
->insert(i
.first
);
1540 void OSDMap::set_max_osd(int m
)
1543 osd_state
.resize(max_osd
, 0);
1544 osd_weight
.resize(max_osd
, CEPH_OSD_OUT
);
1545 osd_info
.resize(max_osd
);
1546 osd_xinfo
.resize(max_osd
);
1547 osd_addrs
->client_addrs
.resize(max_osd
);
1548 osd_addrs
->cluster_addrs
.resize(max_osd
);
1549 osd_addrs
->hb_back_addrs
.resize(max_osd
);
1550 osd_addrs
->hb_front_addrs
.resize(max_osd
);
1551 osd_uuid
->resize(max_osd
);
1552 if (osd_primary_affinity
)
1553 osd_primary_affinity
->resize(max_osd
, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
);
1558 int OSDMap::calc_num_osds()
1563 for (int i
=0; i
<max_osd
; i
++) {
1564 if (osd_state
[i
] & CEPH_OSD_EXISTS
) {
1566 if (osd_state
[i
] & CEPH_OSD_UP
) {
1569 if (get_weight(i
) != CEPH_OSD_OUT
) {
1577 void OSDMap::get_full_pools(CephContext
*cct
,
1579 set
<int64_t> *backfillfull
,
1580 set
<int64_t> *nearfull
) const
1583 ceph_assert(backfillfull
);
1584 ceph_assert(nearfull
);
1586 backfillfull
->clear();
1589 vector
<int> full_osds
;
1590 vector
<int> backfillfull_osds
;
1591 vector
<int> nearfull_osds
;
1592 for (int i
= 0; i
< max_osd
; ++i
) {
1593 if (exists(i
) && is_up(i
) && is_in(i
)) {
1594 if (osd_state
[i
] & CEPH_OSD_FULL
)
1595 full_osds
.push_back(i
);
1596 else if (osd_state
[i
] & CEPH_OSD_BACKFILLFULL
)
1597 backfillfull_osds
.push_back(i
);
1598 else if (osd_state
[i
] & CEPH_OSD_NEARFULL
)
1599 nearfull_osds
.push_back(i
);
1603 for (auto i
: full_osds
) {
1604 get_pool_ids_by_osd(cct
, i
, full
);
1606 for (auto i
: backfillfull_osds
) {
1607 get_pool_ids_by_osd(cct
, i
, backfillfull
);
1609 for (auto i
: nearfull_osds
) {
1610 get_pool_ids_by_osd(cct
, i
, nearfull
);
1614 void OSDMap::get_full_osd_counts(set
<int> *full
, set
<int> *backfill
,
1615 set
<int> *nearfull
) const
1620 for (int i
= 0; i
< max_osd
; ++i
) {
1621 if (exists(i
) && is_up(i
) && is_in(i
)) {
1622 if (osd_state
[i
] & CEPH_OSD_FULL
)
1624 else if (osd_state
[i
] & CEPH_OSD_BACKFILLFULL
)
1625 backfill
->emplace(i
);
1626 else if (osd_state
[i
] & CEPH_OSD_NEARFULL
)
1627 nearfull
->emplace(i
);
1632 void OSDMap::get_all_osds(set
<int32_t>& ls
) const
1634 for (int i
=0; i
<max_osd
; i
++)
1639 void OSDMap::get_up_osds(set
<int32_t>& ls
) const
1641 for (int i
= 0; i
< max_osd
; i
++) {
1647 void OSDMap::get_out_existing_osds(set
<int32_t>& ls
) const
1649 for (int i
= 0; i
< max_osd
; i
++) {
1650 if (exists(i
) && get_weight(i
) == CEPH_OSD_OUT
)
1655 void OSDMap::get_flag_set(set
<string
> *flagset
) const
1657 for (unsigned i
= 0; i
< sizeof(flags
) * 8; ++i
) {
1658 if (flags
& (1<<i
)) {
1659 flagset
->insert(get_flag_string(flags
& (1<<i
)));
1664 void OSDMap::calc_state_set(int state
, set
<string
>& st
)
1667 for (unsigned s
= 1; t
; s
<<= 1) {
1670 st
.insert(ceph_osd_state_name(s
));
1675 void OSDMap::adjust_osd_weights(const map
<int,double>& weights
, Incremental
& inc
) const
1678 for (const auto &weight
: weights
) {
1679 if (weight
.second
> max
)
1680 max
= weight
.second
;
1683 for (const auto &weight
: weights
) {
1684 inc
.new_weight
[weight
.first
] = (unsigned)((weight
.second
/ max
) * CEPH_OSD_IN
);
1688 int OSDMap::identify_osd(const entity_addr_t
& addr
) const
1690 for (int i
=0; i
<max_osd
; i
++)
1691 if (exists(i
) && (get_addrs(i
).contains(addr
) ||
1692 get_cluster_addrs(i
).contains(addr
)))
1697 int OSDMap::identify_osd(const uuid_d
& u
) const
1699 for (int i
=0; i
<max_osd
; i
++)
1700 if (exists(i
) && get_uuid(i
) == u
)
1705 int OSDMap::identify_osd_on_all_channels(const entity_addr_t
& addr
) const
1707 for (int i
=0; i
<max_osd
; i
++)
1708 if (exists(i
) && (get_addrs(i
).contains(addr
) ||
1709 get_cluster_addrs(i
).contains(addr
) ||
1710 get_hb_back_addrs(i
).contains(addr
) ||
1711 get_hb_front_addrs(i
).contains(addr
)))
1716 int OSDMap::find_osd_on_ip(const entity_addr_t
& ip
) const
1718 for (int i
=0; i
<max_osd
; i
++)
1719 if (exists(i
) && (get_addrs(i
).is_same_host(ip
) ||
1720 get_cluster_addrs(i
).is_same_host(ip
)))
1726 uint64_t OSDMap::get_features(int entity_type
, uint64_t *pmask
) const
1728 uint64_t features
= 0; // things we actually have
1729 uint64_t mask
= 0; // things we could have
1731 if (crush
->has_nondefault_tunables())
1732 features
|= CEPH_FEATURE_CRUSH_TUNABLES
;
1733 if (crush
->has_nondefault_tunables2())
1734 features
|= CEPH_FEATURE_CRUSH_TUNABLES2
;
1735 if (crush
->has_nondefault_tunables3())
1736 features
|= CEPH_FEATURE_CRUSH_TUNABLES3
;
1737 if (crush
->has_v4_buckets())
1738 features
|= CEPH_FEATURE_CRUSH_V4
;
1739 if (crush
->has_nondefault_tunables5())
1740 features
|= CEPH_FEATURE_CRUSH_TUNABLES5
;
1741 if (crush
->has_incompat_choose_args()) {
1742 features
|= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS
;
1744 mask
|= CEPH_FEATURES_CRUSH
;
1746 if (!pg_upmap
.empty() || !pg_upmap_items
.empty() || !pg_upmap_primaries
.empty())
1747 features
|= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
;
1748 mask
|= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
;
1750 for (auto &pool
: pools
) {
1751 if (pool
.second
.has_flag(pg_pool_t::FLAG_HASHPSPOOL
)) {
1752 features
|= CEPH_FEATURE_OSDHASHPSPOOL
;
1754 if (!pool
.second
.tiers
.empty() ||
1755 pool
.second
.is_tier()) {
1756 features
|= CEPH_FEATURE_OSD_CACHEPOOL
;
1758 int ruleid
= pool
.second
.get_crush_rule();
1760 if (crush
->is_v2_rule(ruleid
))
1761 features
|= CEPH_FEATURE_CRUSH_V2
;
1762 if (crush
->is_v3_rule(ruleid
))
1763 features
|= CEPH_FEATURE_CRUSH_TUNABLES3
;
1764 if (crush
->is_v5_rule(ruleid
))
1765 features
|= CEPH_FEATURE_CRUSH_TUNABLES5
;
1768 mask
|= CEPH_FEATURE_OSDHASHPSPOOL
| CEPH_FEATURE_OSD_CACHEPOOL
;
1770 if (osd_primary_affinity
) {
1771 for (int i
= 0; i
< max_osd
; ++i
) {
1772 if ((*osd_primary_affinity
)[i
] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
) {
1773 features
|= CEPH_FEATURE_OSD_PRIMARY_AFFINITY
;
1778 mask
|= CEPH_FEATURE_OSD_PRIMARY_AFFINITY
;
1780 if (entity_type
== CEPH_ENTITY_TYPE_OSD
) {
1781 const uint64_t jewel_features
= CEPH_FEATURE_SERVER_JEWEL
;
1782 if (require_osd_release
>= ceph_release_t::jewel
) {
1783 features
|= jewel_features
;
1785 mask
|= jewel_features
;
1787 const uint64_t kraken_features
= CEPH_FEATUREMASK_SERVER_KRAKEN
1788 | CEPH_FEATURE_MSG_ADDR2
;
1789 if (require_osd_release
>= ceph_release_t::kraken
) {
1790 features
|= kraken_features
;
1792 mask
|= kraken_features
;
1794 if (stretch_mode_enabled
) {
1795 features
|= CEPH_FEATUREMASK_STRETCH_MODE
;
1796 mask
|= CEPH_FEATUREMASK_STRETCH_MODE
;
1800 if (require_min_compat_client
>= ceph_release_t::nautilus
) {
1801 // if min_compat_client is >= nautilus, require v2 cephx signatures
1803 features
|= CEPH_FEATUREMASK_CEPHX_V2
;
1804 } else if (require_osd_release
>= ceph_release_t::nautilus
&&
1805 entity_type
== CEPH_ENTITY_TYPE_OSD
) {
1806 // if osds are >= nautilus, at least require the signatures from them
1807 features
|= CEPH_FEATUREMASK_CEPHX_V2
;
1809 mask
|= CEPH_FEATUREMASK_CEPHX_V2
;
1816 ceph_release_t
OSDMap::get_min_compat_client() const
1818 uint64_t f
= get_features(CEPH_ENTITY_TYPE_CLIENT
, nullptr);
1820 if (HAVE_FEATURE(f
, OSDMAP_PG_UPMAP
) || // v12.0.0-1733-g27d6f43
1821 HAVE_FEATURE(f
, CRUSH_CHOOSE_ARGS
)) { // v12.0.1-2172-gef1ef28
1822 return ceph_release_t::luminous
; // v12.2.0
1824 if (HAVE_FEATURE(f
, CRUSH_TUNABLES5
)) { // v10.0.0-612-g043a737
1825 return ceph_release_t::jewel
; // v10.2.0
1827 if (HAVE_FEATURE(f
, CRUSH_V4
)) { // v0.91-678-g325fc56
1828 return ceph_release_t::hammer
; // v0.94.0
1830 if (HAVE_FEATURE(f
, OSD_PRIMARY_AFFINITY
) || // v0.76-553-gf825624
1831 HAVE_FEATURE(f
, CRUSH_TUNABLES3
) || // v0.76-395-ge20a55d
1832 HAVE_FEATURE(f
, OSD_CACHEPOOL
)) { // v0.67-401-gb91c1c5
1833 return ceph_release_t::firefly
; // v0.80.0
1835 if (HAVE_FEATURE(f
, CRUSH_TUNABLES2
) || // v0.54-684-g0cc47ff
1836 HAVE_FEATURE(f
, OSDHASHPSPOOL
)) { // v0.57-398-g8cc2b0f
1837 return ceph_release_t::dumpling
; // v0.67.0
1839 if (HAVE_FEATURE(f
, CRUSH_TUNABLES
)) { // v0.48argonaut-206-g6f381af
1840 return ceph_release_t::argonaut
; // v0.48argonaut-206-g6f381af
1842 return ceph_release_t::argonaut
; // v0.48argonaut-206-g6f381af
// Return the stored require_min_compat_client value as-is (unlike
// get_min_compat_client(), nothing is derived from feature bits here).
ceph_release_t OSDMap::get_require_min_compat_client() const
{
  return require_min_compat_client;
}
1850 void OSDMap::_calc_up_osd_features()
1853 cached_up_osd_features
= 0;
1854 for (int osd
= 0; osd
< max_osd
; ++osd
) {
1857 const osd_xinfo_t
&xi
= get_xinfo(osd
);
1858 if (xi
.features
== 0)
1859 continue; // bogus xinfo, maybe #20751 or similar, skipping
1861 cached_up_osd_features
= xi
.features
;
1864 cached_up_osd_features
&= xi
.features
;
// Return the cached feature mask; this accessor does not recompute it.
uint64_t OSDMap::get_up_osd_features() const
{
  return cached_up_osd_features;
}
1874 void OSDMap::dedup(const OSDMap
*o
, OSDMap
*n
)
1877 if (o
->epoch
== n
->epoch
)
1883 if (o
->max_osd
!= n
->max_osd
)
1885 for (int i
= 0; i
< o
->max_osd
&& i
< n
->max_osd
; i
++) {
1886 if ( n
->osd_addrs
->client_addrs
[i
] && o
->osd_addrs
->client_addrs
[i
] &&
1887 *n
->osd_addrs
->client_addrs
[i
] == *o
->osd_addrs
->client_addrs
[i
])
1888 n
->osd_addrs
->client_addrs
[i
] = o
->osd_addrs
->client_addrs
[i
];
1891 if ( n
->osd_addrs
->cluster_addrs
[i
] && o
->osd_addrs
->cluster_addrs
[i
] &&
1892 *n
->osd_addrs
->cluster_addrs
[i
] == *o
->osd_addrs
->cluster_addrs
[i
])
1893 n
->osd_addrs
->cluster_addrs
[i
] = o
->osd_addrs
->cluster_addrs
[i
];
1896 if ( n
->osd_addrs
->hb_back_addrs
[i
] && o
->osd_addrs
->hb_back_addrs
[i
] &&
1897 *n
->osd_addrs
->hb_back_addrs
[i
] == *o
->osd_addrs
->hb_back_addrs
[i
])
1898 n
->osd_addrs
->hb_back_addrs
[i
] = o
->osd_addrs
->hb_back_addrs
[i
];
1901 if ( n
->osd_addrs
->hb_front_addrs
[i
] && o
->osd_addrs
->hb_front_addrs
[i
] &&
1902 *n
->osd_addrs
->hb_front_addrs
[i
] == *o
->osd_addrs
->hb_front_addrs
[i
])
1903 n
->osd_addrs
->hb_front_addrs
[i
] = o
->osd_addrs
->hb_front_addrs
[i
];
1908 // zoinks, no differences at all!
1909 n
->osd_addrs
= o
->osd_addrs
;
1912 // does crush match?
1913 ceph::buffer::list oc
, nc
;
1914 encode(*o
->crush
, oc
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1915 encode(*n
->crush
, nc
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1916 if (oc
.contents_equal(nc
)) {
1917 n
->crush
= o
->crush
;
1920 // does pg_temp match?
1921 if (*o
->pg_temp
== *n
->pg_temp
)
1922 n
->pg_temp
= o
->pg_temp
;
1924 // does primary_temp match?
1925 if (o
->primary_temp
->size() == n
->primary_temp
->size()) {
1926 if (*o
->primary_temp
== *n
->primary_temp
)
1927 n
->primary_temp
= o
->primary_temp
;
1931 if (o
->osd_uuid
->size() == n
->osd_uuid
->size() &&
1932 *o
->osd_uuid
== *n
->osd_uuid
)
1933 n
->osd_uuid
= o
->osd_uuid
;
1936 void OSDMap::clean_temps(CephContext
*cct
,
1937 const OSDMap
& oldmap
,
1938 const OSDMap
& nextmap
,
1939 Incremental
*pending_inc
)
1941 ldout(cct
, 10) << __func__
<< dendl
;
1943 for (auto pg
: *nextmap
.pg_temp
) {
1944 // if pool does not exist, remove any existing pg_temps associated with
1945 // it. we don't care about pg_temps on the pending_inc either; if there
1946 // are new_pg_temp entries on the pending, clear them out just as well.
1947 if (!nextmap
.have_pg_pool(pg
.first
.pool())) {
1948 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
1949 << " for nonexistent pool " << pg
.first
.pool() << dendl
;
1950 pending_inc
->new_pg_temp
[pg
.first
].clear();
1953 if (!nextmap
.pg_exists(pg
.first
)) {
1954 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
1955 << " for nonexistent pg " << dendl
;
1956 pending_inc
->new_pg_temp
[pg
.first
].clear();
1960 unsigned num_up
= 0;
1961 for (auto o
: pg
.second
) {
1962 if (!nextmap
.is_down(o
)) {
1968 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
1969 << " with all down osds" << pg
.second
<< dendl
;
1970 pending_inc
->new_pg_temp
[pg
.first
].clear();
1973 // redundant pg_temp?
1976 nextmap
.pg_to_raw_up(pg
.first
, &raw_up
, &primary
);
1977 bool remove
= false;
1978 if (raw_up
== pg
.second
) {
1979 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
<< " "
1980 << pg
.second
<< " that matches raw_up mapping" << dendl
;
1983 // oversized pg_temp?
1984 if (pg
.second
.size() > nextmap
.get_pg_pool(pg
.first
.pool())->get_size()) {
1985 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
<< " "
1986 << pg
.second
<< " exceeds pool size" << dendl
;
1990 if (oldmap
.pg_temp
->count(pg
.first
))
1991 pending_inc
->new_pg_temp
[pg
.first
].clear();
1993 pending_inc
->new_pg_temp
.erase(pg
.first
);
1997 for (auto &pg
: *nextmap
.primary_temp
) {
1999 if (nextmap
.is_down(pg
.second
)) {
2000 ldout(cct
, 10) << __func__
<< " removing primary_temp " << pg
.first
2001 << " to down " << pg
.second
<< dendl
;
2002 pending_inc
->new_primary_temp
[pg
.first
] = -1;
2005 // redundant primary_temp?
2006 vector
<int> real_up
, templess_up
;
2007 int real_primary
, templess_primary
;
2008 pg_t pgid
= pg
.first
;
2009 nextmap
.pg_to_acting_osds(pgid
, &real_up
, &real_primary
);
2010 nextmap
.pg_to_raw_up(pgid
, &templess_up
, &templess_primary
);
2011 if (real_primary
== templess_primary
){
2012 ldout(cct
, 10) << __func__
<< " removing primary_temp "
2013 << pgid
<< " -> " << real_primary
2014 << " (unnecessary/redundant)" << dendl
;
2015 if (oldmap
.primary_temp
->count(pgid
))
2016 pending_inc
->new_primary_temp
[pgid
] = -1;
2018 pending_inc
->new_primary_temp
.erase(pgid
);
2023 void OSDMap::get_upmap_pgs(vector
<pg_t
> *upmap_pgs
) const
2025 upmap_pgs
->reserve(pg_upmap
.size() + pg_upmap_items
.size());
2026 for (auto& p
: pg_upmap
)
2027 upmap_pgs
->push_back(p
.first
);
2028 for (auto& p
: pg_upmap_items
)
2029 upmap_pgs
->push_back(p
.first
);
2032 bool OSDMap::check_pg_upmaps(
2034 const vector
<pg_t
>& to_check
,
2035 vector
<pg_t
> *to_cancel
,
2036 map
<pg_t
, mempool::osdmap::vector
<pair
<int,int>>> *to_remap
) const
2038 bool any_change
= false;
2039 map
<int, map
<int, float>> rule_weight_map
;
2040 for (auto& pg
: to_check
) {
2041 const pg_pool_t
*pi
= get_pg_pool(pg
.pool());
2042 if (!pi
|| pg
.ps() >= pi
->get_pg_num_pending()) {
2043 ldout(cct
, 0) << __func__
<< " pg " << pg
<< " is gone or merge source"
2045 to_cancel
->push_back(pg
);
2048 if (pi
->is_pending_merge(pg
, nullptr)) {
2049 ldout(cct
, 0) << __func__
<< " pg " << pg
<< " is pending merge"
2051 to_cancel
->push_back(pg
);
2054 vector
<int> raw
, up
;
2055 pg_to_raw_upmap(pg
, &raw
, &up
);
2056 auto crush_rule
= get_pg_pool_crush_rule(pg
);
2057 auto r
= crush
->verify_upmap(cct
,
2059 get_pg_pool_size(pg
),
2062 ldout(cct
, 0) << __func__
<< " verify_upmap of pg " << pg
2063 << " returning " << r
2065 to_cancel
->push_back(pg
);
2068 // below we check against crush-topology changing..
2069 map
<int, float> weight_map
;
2070 auto it
= rule_weight_map
.find(crush_rule
);
2071 if (it
== rule_weight_map
.end()) {
2072 auto r
= crush
->get_rule_weight_osd_map(crush_rule
, &weight_map
);
2074 lderr(cct
) << __func__
<< " unable to get crush weight_map for "
2075 << "crush_rule " << crush_rule
2079 rule_weight_map
[crush_rule
] = weight_map
;
2081 weight_map
= it
->second
;
2083 ldout(cct
, 10) << __func__
<< " pg " << pg
2084 << " weight_map " << weight_map
2086 for (auto osd
: up
) {
2087 auto it
= weight_map
.find(osd
);
2088 if (it
== weight_map
.end()) {
2089 ldout(cct
, 10) << __func__
<< " pg " << pg
<< ": osd " << osd
<< " is gone or has "
2090 << "been moved out of the specific crush-tree"
2092 to_cancel
->push_back(pg
);
2095 auto adjusted_weight
= get_weightf(it
->first
) * it
->second
;
2096 if (adjusted_weight
== 0) {
2097 ldout(cct
, 10) << __func__
<< " pg " << pg
<< ": osd " << osd
2098 << " is out/crush-out"
2100 to_cancel
->push_back(pg
);
2104 if (!to_cancel
->empty() && to_cancel
->back() == pg
)
2106 // okay, upmap is valid
2107 // continue to check if it is still necessary
2108 auto i
= pg_upmap
.find(pg
);
2109 if (i
!= pg_upmap
.end()) {
2110 if (i
->second
== raw
) {
2111 ldout(cct
, 10) << __func__
<< "removing redundant pg_upmap " << i
->first
<< " "
2112 << i
->second
<< dendl
;
2113 to_cancel
->push_back(pg
);
2116 if ((int)i
->second
.size() != get_pg_pool_size(pg
)) {
2117 ldout(cct
, 10) << __func__
<< "removing pg_upmap " << i
->first
<< " "
2118 << i
->second
<< " != pool size " << get_pg_pool_size(pg
)
2120 to_cancel
->push_back(pg
);
2124 auto j
= pg_upmap_items
.find(pg
);
2125 if (j
!= pg_upmap_items
.end()) {
2126 mempool::osdmap::vector
<pair
<int,int>> newmap
;
2127 for (auto& p
: j
->second
) {
2128 auto osd_from
= p
.first
;
2129 auto osd_to
= p
.second
;
2130 if (std::find(raw
.begin(), raw
.end(), osd_from
) == raw
.end()) {
2131 // cancel mapping if source osd does not exist anymore
2132 ldout(cct
, 20) << __func__
<< " pg_upmap_items (source osd does not exist) " << pg_upmap_items
<< dendl
;
2135 if (osd_to
!= CRUSH_ITEM_NONE
&& osd_to
< max_osd
&&
2136 osd_to
>= 0 && osd_weight
[osd_to
] == 0) {
2137 // cancel mapping if target osd is out
2138 ldout(cct
, 20) << __func__
<< " pg_upmap_items (target osd is out) " << pg_upmap_items
<< dendl
;
2141 newmap
.push_back(p
);
2143 if (newmap
.empty()) {
2144 ldout(cct
, 10) << __func__
<< " removing no-op pg_upmap_items "
2145 << j
->first
<< " " << j
->second
2147 to_cancel
->push_back(pg
);
2148 } else if (newmap
!= j
->second
) {
2149 // check partial no-op here.
2150 ldout(cct
, 10) << __func__
<< " simplifying partially no-op pg_upmap_items "
2151 << j
->first
<< " " << j
->second
2154 to_remap
->insert({pg
, newmap
});
2159 any_change
= any_change
|| !to_cancel
->empty();
2163 void OSDMap::clean_pg_upmaps(
2165 Incremental
*pending_inc
,
2166 const vector
<pg_t
>& to_cancel
,
2167 const map
<pg_t
, mempool::osdmap::vector
<pair
<int,int>>>& to_remap
) const
2169 for (auto &pg
: to_cancel
) {
2170 auto i
= pending_inc
->new_pg_upmap
.find(pg
);
2171 if (i
!= pending_inc
->new_pg_upmap
.end()) {
2172 ldout(cct
, 10) << __func__
<< " cancel invalid pending "
2173 << "pg_upmap entry "
2174 << i
->first
<< "->" << i
->second
2176 pending_inc
->new_pg_upmap
.erase(i
);
2178 auto j
= pg_upmap
.find(pg
);
2179 if (j
!= pg_upmap
.end()) {
2180 ldout(cct
, 10) << __func__
<< " cancel invalid pg_upmap entry "
2181 << j
->first
<< "->" << j
->second
2183 pending_inc
->old_pg_upmap
.insert(pg
);
2185 auto p
= pending_inc
->new_pg_upmap_items
.find(pg
);
2186 if (p
!= pending_inc
->new_pg_upmap_items
.end()) {
2187 ldout(cct
, 10) << __func__
<< " cancel invalid pending "
2188 << "pg_upmap_items entry "
2189 << p
->first
<< "->" << p
->second
2191 pending_inc
->new_pg_upmap_items
.erase(p
);
2193 auto q
= pg_upmap_items
.find(pg
);
2194 if (q
!= pg_upmap_items
.end()) {
2195 ldout(cct
, 10) << __func__
<< " cancel invalid "
2196 << "pg_upmap_items entry "
2197 << q
->first
<< "->" << q
->second
2199 pending_inc
->old_pg_upmap_items
.insert(pg
);
2202 for (auto& i
: to_remap
)
2203 pending_inc
->new_pg_upmap_items
[i
.first
] = i
.second
;
// Convenience wrapper that runs the whole upmap clean-up pipeline:
//   1. get_upmap_pgs() collects the pgs to inspect,
//   2. check_pg_upmaps() sorts them into entries to cancel and to remap,
//   3. the four-argument clean_pg_upmaps() records the result in
//      *pending_inc.
// Returns whatever check_pg_upmaps() reported as "any change".
bool OSDMap::clean_pg_upmaps(
  CephContext *cct,
  Incremental *pending_inc) const
{
  ldout(cct, 10) << __func__ << dendl;
  vector<pg_t> to_check;
  vector<pg_t> to_cancel;
  map<pg_t, mempool::osdmap::vector<pair<int,int>>> to_remap;

  get_upmap_pgs(&to_check);
  auto any_change = check_pg_upmaps(cct, to_check, &to_cancel, &to_remap);
  clean_pg_upmaps(cct, pending_inc, to_cancel, to_remap);
  //TODO: Create these 3 functions for pg_upmap_primaries and so they can be checked
  //      and cleaned in the same way as pg_upmap. This is not critical since invalid
  //      pg_upmap_primaries are never applied, (the final check is in _apply_upmap).
  return any_change;
}
2224 int OSDMap::apply_incremental(const Incremental
&inc
)
2226 new_blocklist_entries
= false;
2229 else if (inc
.fsid
!= fsid
)
2232 ceph_assert(inc
.epoch
== epoch
+1);
2235 modified
= inc
.modified
;
2238 if (inc
.fullmap
.length()) {
2239 ceph::buffer::list
bl(inc
.fullmap
);
2244 // nope, incremental.
2245 if (inc
.new_flags
>= 0) {
2246 flags
= inc
.new_flags
;
2247 // the below is just to cover a newly-upgraded luminous mon
2248 // cluster that has to set require_jewel_osds or
2249 // require_kraken_osds before the osds can be upgraded to
2251 if (flags
& CEPH_OSDMAP_REQUIRE_KRAKEN
) {
2252 if (require_osd_release
< ceph_release_t::kraken
) {
2253 require_osd_release
= ceph_release_t::kraken
;
2255 } else if (flags
& CEPH_OSDMAP_REQUIRE_JEWEL
) {
2256 if (require_osd_release
< ceph_release_t::jewel
) {
2257 require_osd_release
= ceph_release_t::jewel
;
2262 if (inc
.new_max_osd
>= 0)
2263 set_max_osd(inc
.new_max_osd
);
2265 if (inc
.new_pool_max
!= -1)
2266 pool_max
= inc
.new_pool_max
;
2268 for (const auto &pool
: inc
.new_pools
) {
2269 pools
[pool
.first
] = pool
.second
;
2270 pools
[pool
.first
].last_change
= epoch
;
2273 new_removed_snaps
= inc
.new_removed_snaps
;
2274 new_purged_snaps
= inc
.new_purged_snaps
;
2275 for (auto p
= new_removed_snaps
.begin();
2276 p
!= new_removed_snaps
.end();
2278 removed_snaps_queue
[p
->first
].union_of(p
->second
);
2280 for (auto p
= new_purged_snaps
.begin();
2281 p
!= new_purged_snaps
.end();
2283 auto q
= removed_snaps_queue
.find(p
->first
);
2284 ceph_assert(q
!= removed_snaps_queue
.end());
2285 q
->second
.subtract(p
->second
);
2286 if (q
->second
.empty()) {
2287 removed_snaps_queue
.erase(q
);
2291 if (inc
.new_last_up_change
!= utime_t()) {
2292 last_up_change
= inc
.new_last_up_change
;
2294 if (inc
.new_last_in_change
!= utime_t()) {
2295 last_in_change
= inc
.new_last_in_change
;
2298 for (const auto &pname
: inc
.new_pool_names
) {
2299 auto pool_name_entry
= pool_name
.find(pname
.first
);
2300 if (pool_name_entry
!= pool_name
.end()) {
2301 name_pool
.erase(pool_name_entry
->second
);
2302 pool_name_entry
->second
= pname
.second
;
2304 pool_name
[pname
.first
] = pname
.second
;
2306 name_pool
[pname
.second
] = pname
.first
;
2309 for (const auto &pool
: inc
.old_pools
) {
2311 name_pool
.erase(pool_name
[pool
]);
2312 pool_name
.erase(pool
);
2315 for (const auto &weight
: inc
.new_weight
) {
2316 set_weight(weight
.first
, weight
.second
);
2318 // if we are marking in, clear the AUTOOUT and NEW bits, and clear
2319 // xinfo old_weight.
2320 if (weight
.second
) {
2321 osd_state
[weight
.first
] &= ~(CEPH_OSD_AUTOOUT
| CEPH_OSD_NEW
);
2322 osd_xinfo
[weight
.first
].old_weight
= 0;
2326 for (const auto &primary_affinity
: inc
.new_primary_affinity
) {
2327 set_primary_affinity(primary_affinity
.first
, primary_affinity
.second
);
2330 // erasure_code_profiles
2331 for (const auto &profile
: inc
.old_erasure_code_profiles
)
2332 erasure_code_profiles
.erase(profile
);
2334 for (const auto &profile
: inc
.new_erasure_code_profiles
) {
2335 set_erasure_code_profile(profile
.first
, profile
.second
);
2339 for (const auto &state
: inc
.new_state
) {
2340 const auto osd
= state
.first
;
2341 int s
= state
.second
? state
.second
: CEPH_OSD_UP
;
2342 if ((osd_state
[osd
] & CEPH_OSD_UP
) &&
2343 (s
& CEPH_OSD_UP
)) {
2344 osd_info
[osd
].down_at
= epoch
;
2345 osd_xinfo
[osd
].down_stamp
= modified
;
2347 if ((osd_state
[osd
] & CEPH_OSD_EXISTS
) &&
2348 (s
& CEPH_OSD_EXISTS
)) {
2349 // osd is destroyed; clear out anything interesting.
2350 (*osd_uuid
)[osd
] = uuid_d();
2351 osd_info
[osd
] = osd_info_t();
2352 osd_xinfo
[osd
] = osd_xinfo_t();
2353 set_primary_affinity(osd
, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
);
2354 osd_addrs
->client_addrs
[osd
].reset(new entity_addrvec_t());
2355 osd_addrs
->cluster_addrs
[osd
].reset(new entity_addrvec_t());
2356 osd_addrs
->hb_front_addrs
[osd
].reset(new entity_addrvec_t());
2357 osd_addrs
->hb_back_addrs
[osd
].reset(new entity_addrvec_t());
2360 osd_state
[osd
] ^= s
;
2364 for (const auto &client
: inc
.new_up_client
) {
2365 osd_state
[client
.first
] |= CEPH_OSD_EXISTS
| CEPH_OSD_UP
;
2366 osd_state
[client
.first
] &= ~CEPH_OSD_STOP
; // if any
2367 osd_addrs
->client_addrs
[client
.first
].reset(
2368 new entity_addrvec_t(client
.second
));
2369 osd_addrs
->hb_back_addrs
[client
.first
].reset(
2370 new entity_addrvec_t(inc
.new_hb_back_up
.find(client
.first
)->second
));
2371 osd_addrs
->hb_front_addrs
[client
.first
].reset(
2372 new entity_addrvec_t(inc
.new_hb_front_up
.find(client
.first
)->second
));
2374 osd_info
[client
.first
].up_from
= epoch
;
2377 for (const auto &cluster
: inc
.new_up_cluster
)
2378 osd_addrs
->cluster_addrs
[cluster
.first
].reset(
2379 new entity_addrvec_t(cluster
.second
));
2382 for (const auto &thru
: inc
.new_up_thru
)
2383 osd_info
[thru
.first
].up_thru
= thru
.second
;
2385 for (const auto &interval
: inc
.new_last_clean_interval
) {
2386 osd_info
[interval
.first
].last_clean_begin
= interval
.second
.first
;
2387 osd_info
[interval
.first
].last_clean_end
= interval
.second
.second
;
2390 for (const auto &lost
: inc
.new_lost
)
2391 osd_info
[lost
.first
].lost_at
= lost
.second
;
2394 for (const auto &xinfo
: inc
.new_xinfo
)
2395 osd_xinfo
[xinfo
.first
] = xinfo
.second
;
2398 for (const auto &uuid
: inc
.new_uuid
)
2399 (*osd_uuid
)[uuid
.first
] = uuid
.second
;
2402 for (const auto &pg
: inc
.new_pg_temp
) {
2403 if (pg
.second
.empty())
2404 pg_temp
->erase(pg
.first
);
2406 pg_temp
->set(pg
.first
, pg
.second
);
2408 if (!inc
.new_pg_temp
.empty()) {
2409 // make sure pg_temp is efficiently stored
2413 for (const auto &pg
: inc
.new_primary_temp
) {
2414 if (pg
.second
== -1)
2415 primary_temp
->erase(pg
.first
);
2417 (*primary_temp
)[pg
.first
] = pg
.second
;
2420 for (auto& p
: inc
.new_pg_upmap
) {
2421 pg_upmap
[p
.first
] = p
.second
;
2423 for (auto& pg
: inc
.old_pg_upmap
) {
2426 for (auto& p
: inc
.new_pg_upmap_items
) {
2427 pg_upmap_items
[p
.first
] = p
.second
;
2429 for (auto& pg
: inc
.old_pg_upmap_items
) {
2430 pg_upmap_items
.erase(pg
);
2433 for (auto& [pg
, prim
] : inc
.new_pg_upmap_primary
) {
2434 pg_upmap_primaries
[pg
] = prim
;
2436 for (auto& pg
: inc
.old_pg_upmap_primary
) {
2437 pg_upmap_primaries
.erase(pg
);
2441 if (!inc
.new_blocklist
.empty()) {
2442 blocklist
.insert(inc
.new_blocklist
.begin(),inc
.new_blocklist
.end());
2443 new_blocklist_entries
= true;
2445 for (const auto &addr
: inc
.old_blocklist
)
2446 blocklist
.erase(addr
);
2448 for (const auto& addr_p
: inc
.new_range_blocklist
) {
2449 range_blocklist
.insert(addr_p
);
2450 calculated_ranges
.emplace(addr_p
.first
, addr_p
.first
);
2451 new_blocklist_entries
= true;
2453 for (const auto &addr
: inc
.old_range_blocklist
) {
2454 calculated_ranges
.erase(addr
);
2455 range_blocklist
.erase(addr
);
2458 for (auto& i
: inc
.new_crush_node_flags
) {
2460 crush_node_flags
[i
.first
] = i
.second
;
2462 crush_node_flags
.erase(i
.first
);
2466 for (auto& i
: inc
.new_device_class_flags
) {
2468 device_class_flags
[i
.first
] = i
.second
;
2470 device_class_flags
.erase(i
.first
);
2474 // cluster snapshot?
2475 if (inc
.cluster_snapshot
.length()) {
2476 cluster_snapshot
= inc
.cluster_snapshot
;
2477 cluster_snapshot_epoch
= inc
.epoch
;
2479 cluster_snapshot
.clear();
2480 cluster_snapshot_epoch
= 0;
2483 if (inc
.new_nearfull_ratio
>= 0) {
2484 nearfull_ratio
= inc
.new_nearfull_ratio
;
2486 if (inc
.new_backfillfull_ratio
>= 0) {
2487 backfillfull_ratio
= inc
.new_backfillfull_ratio
;
2489 if (inc
.new_full_ratio
>= 0) {
2490 full_ratio
= inc
.new_full_ratio
;
2492 if (inc
.new_require_min_compat_client
> ceph_release_t::unknown
) {
2493 require_min_compat_client
= inc
.new_require_min_compat_client
;
2495 if (inc
.new_require_osd_release
>= ceph_release_t::unknown
) {
2496 require_osd_release
= inc
.new_require_osd_release
;
2497 if (require_osd_release
>= ceph_release_t::luminous
) {
2498 flags
&= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS
);
2499 flags
|= CEPH_OSDMAP_RECOVERY_DELETES
;
2503 if (inc
.new_require_osd_release
>= ceph_release_t::unknown
) {
2504 require_osd_release
= inc
.new_require_osd_release
;
2505 if (require_osd_release
>= ceph_release_t::nautilus
) {
2506 flags
|= CEPH_OSDMAP_PGLOG_HARDLIMIT
;
2509 // do new crush map last (after up/down stuff)
2510 if (inc
.crush
.length()) {
2511 ceph::buffer::list
bl(inc
.crush
);
2512 auto blp
= bl
.cbegin();
2513 crush
.reset(new CrushWrapper
);
2515 if (require_osd_release
>= ceph_release_t::luminous
) {
2516 // only increment if this is a luminous-encoded osdmap, lest
2517 // the mon's crush_version diverge from what the osds or others
2518 // are decoding and applying on their end. if we won't encode
2519 // it in the canonical version, don't change it.
2522 for (auto it
= device_class_flags
.begin();
2523 it
!= device_class_flags
.end();) {
2524 const char* class_name
= crush
->get_class_name(it
->first
);
2525 if (!class_name
) // device class is gone
2526 it
= device_class_flags
.erase(it
);
2532 if (inc
.change_stretch_mode
) {
2533 stretch_mode_enabled
= inc
.stretch_mode_enabled
;
2534 stretch_bucket_count
= inc
.new_stretch_bucket_count
;
2535 degraded_stretch_mode
= inc
.new_degraded_stretch_mode
;
2536 recovering_stretch_mode
= inc
.new_recovering_stretch_mode
;
2537 stretch_mode_bucket
= inc
.new_stretch_mode_bucket
;
2540 switch (inc
.mutate_allow_crimson
) {
2541 case Incremental::mutate_allow_crimson_t::NONE
:
2543 case Incremental::mutate_allow_crimson_t::SET
:
2544 allow_crimson
= true;
2546 case Incremental::mutate_allow_crimson_t::CLEAR
:
2547 allow_crimson
= false;
2552 _calc_up_osd_features();
2557 int OSDMap::map_to_pg(
2561 const string
& nspace
,
2564 // calculate ps (placement seed)
2565 const pg_pool_t
*pool
= get_pg_pool(poolid
);
2570 ps
= pool
->hash_key(key
, nspace
);
2572 ps
= pool
->hash_key(name
, nspace
);
2573 *pg
= pg_t(ps
, poolid
);
2577 int OSDMap::object_locator_to_pg(
2578 const object_t
& oid
, const object_locator_t
& loc
, pg_t
&pg
) const
2580 if (loc
.hash
>= 0) {
2581 if (!get_pg_pool(loc
.get_pool())) {
2584 pg
= pg_t(loc
.hash
, loc
.get_pool());
2587 return map_to_pg(loc
.get_pool(), oid
.name
, loc
.key
, loc
.nspace
, &pg
);
2590 ceph_object_layout
OSDMap::make_object_layout(
2591 object_t oid
, int pg_pool
, string nspace
) const
2593 object_locator_t
loc(pg_pool
, nspace
);
2595 ceph_object_layout ol
;
2596 pg_t pgid
= object_locator_to_pg(oid
, loc
);
2597 ol
.ol_pgid
= pgid
.get_old_pg().v
;
2598 ol
.ol_stripe_unit
= 0;
2602 void OSDMap::_remove_nonexistent_osds(const pg_pool_t
& pool
,
2603 vector
<int>& osds
) const
2605 if (pool
.can_shift_osds()) {
2606 unsigned removed
= 0;
2607 for (unsigned i
= 0; i
< osds
.size(); i
++) {
2608 if (!exists(osds
[i
])) {
2613 osds
[i
- removed
] = osds
[i
];
2617 osds
.resize(osds
.size() - removed
);
2619 for (auto& osd
: osds
) {
2621 osd
= CRUSH_ITEM_NONE
;
2626 void OSDMap::_pg_to_raw_osds(
2627 const pg_pool_t
& pool
, pg_t pg
,
2632 ps_t pps
= pool
.raw_pg_to_pps(pg
); // placement ps
2633 unsigned size
= pool
.get_size();
2636 int ruleno
= pool
.get_crush_rule();
2638 crush
->do_rule(ruleno
, pps
, *osds
, size
, osd_weight
, pg
.pool());
2640 _remove_nonexistent_osds(pool
, *osds
);
2646 int OSDMap::_pick_primary(const vector
<int>& osds
) const
2648 for (auto osd
: osds
) {
2649 if (osd
!= CRUSH_ITEM_NONE
) {
2656 void OSDMap::_apply_upmap(const pg_pool_t
& pi
, pg_t raw_pg
, vector
<int> *raw
) const
2658 pg_t pg
= pi
.raw_pg_to_pg(raw_pg
);
2659 auto p
= pg_upmap
.find(pg
);
2660 if (p
!= pg_upmap
.end()) {
2661 // make sure targets aren't marked out
2662 for (auto osd
: p
->second
) {
2663 if (osd
!= CRUSH_ITEM_NONE
&& osd
< max_osd
&& osd
>= 0 &&
2664 osd_weight
[osd
] == 0) {
2665 // reject/ignore the explicit mapping
2669 *raw
= vector
<int>(p
->second
.begin(), p
->second
.end());
2670 // continue to check and apply pg_upmap_items if any
2673 auto q
= pg_upmap_items
.find(pg
);
2674 if (q
!= pg_upmap_items
.end()) {
2675 // NOTE: this approach does not allow a bidirectional swap,
2676 // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
2677 for (auto& [osd_from
, osd_to
] : q
->second
) {
2678 // A capacity change upmap (replace osd in the pg with osd not in the pg)
2679 // make sure the replacement value doesn't already appear
2680 bool exists
= false;
2682 for (unsigned i
= 0; i
< raw
->size(); ++i
) {
2683 int osd
= (*raw
)[i
];
2684 if (osd
== osd_to
) {
2688 // ignore mapping if target is marked out (or invalid osd id)
2689 if (osd
== osd_from
&&
2691 !(osd_to
!= CRUSH_ITEM_NONE
&& osd_to
< max_osd
&&
2692 osd_to
>= 0 && osd_weight
[osd_to
] == 0)) {
2696 if (!exists
&& pos
>= 0) {
2697 (*raw
)[pos
] = osd_to
;
2701 auto r
= pg_upmap_primaries
.find(pg
);
2702 if (r
!= pg_upmap_primaries
.end()) {
2703 auto new_prim
= r
->second
;
2704 // Apply mapping only if new primary is not marked out and valid osd id
2705 if (new_prim
!= CRUSH_ITEM_NONE
&& new_prim
< max_osd
&& new_prim
>= 0 &&
2706 osd_weight
[new_prim
] != 0) {
2707 int new_prim_idx
= 0;
2708 for (int i
= 1 ; i
< (int)raw
->size(); i
++) { // start from 1 on purpose
2709 if ((*raw
)[i
] == new_prim
) {
2714 if (new_prim_idx
> 0) {
2716 (*raw
)[new_prim_idx
] = (*raw
)[0];
2717 (*raw
)[0] = new_prim
;
2723 // pg -> (up osd list)
2724 void OSDMap::_raw_to_up_osds(const pg_pool_t
& pool
, const vector
<int>& raw
,
2725 vector
<int> *up
) const
2727 if (pool
.can_shift_osds()) {
2730 up
->reserve(raw
.size());
2731 for (unsigned i
=0; i
<raw
.size(); i
++) {
2732 if (!exists(raw
[i
]) || is_down(raw
[i
]))
2734 up
->push_back(raw
[i
]);
2737 // set down/dne devices to NONE
2738 up
->resize(raw
.size());
2739 for (int i
= raw
.size() - 1; i
>= 0; --i
) {
2740 if (!exists(raw
[i
]) || is_down(raw
[i
])) {
2741 (*up
)[i
] = CRUSH_ITEM_NONE
;
2749 void OSDMap::_apply_primary_affinity(ps_t seed
,
2750 const pg_pool_t
& pool
,
2754 // do we have any non-default primary_affinity values for these osds?
2755 if (!osd_primary_affinity
)
2759 for (const auto osd
: *osds
) {
2760 if (osd
!= CRUSH_ITEM_NONE
&&
2761 (*osd_primary_affinity
)[osd
] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
) {
2769 // pick the primary. feed both the seed (for the pg) and the osd
2770 // into the hash/rng so that a proportional fraction of an osd's pgs
2771 // get rejected as primary.
2773 for (unsigned i
= 0; i
< osds
->size(); ++i
) {
2775 if (o
== CRUSH_ITEM_NONE
)
2777 unsigned a
= (*osd_primary_affinity
)[o
];
2778 if (a
< CEPH_OSD_MAX_PRIMARY_AFFINITY
&&
2779 (crush_hash32_2(CRUSH_HASH_RJENKINS1
,
2780 seed
, o
) >> 16) >= a
) {
2781 // we chose not to use this primary. note it anyway as a
2782 // fallback in case we don't pick anyone else, but keep looking.
2793 *primary
= (*osds
)[pos
];
2795 if (pool
.can_shift_osds() && pos
> 0) {
2796 // move the new primary to the front.
2797 for (int i
= pos
; i
> 0; --i
) {
2798 (*osds
)[i
] = (*osds
)[i
-1];
2800 (*osds
)[0] = *primary
;
2804 void OSDMap::_get_temp_osds(const pg_pool_t
& pool
, pg_t pg
,
2805 vector
<int> *temp_pg
, int *temp_primary
) const
2807 pg
= pool
.raw_pg_to_pg(pg
);
2808 const auto p
= pg_temp
->find(pg
);
2810 if (p
!= pg_temp
->end()) {
2811 for (unsigned i
=0; i
<p
->second
.size(); i
++) {
2812 if (!exists(p
->second
[i
]) || is_down(p
->second
[i
])) {
2813 if (pool
.can_shift_osds()) {
2816 temp_pg
->push_back(CRUSH_ITEM_NONE
);
2819 temp_pg
->push_back(p
->second
[i
]);
2823 const auto &pp
= primary_temp
->find(pg
);
2825 if (pp
!= primary_temp
->end()) {
2826 *temp_primary
= pp
->second
;
2827 } else if (!temp_pg
->empty()) { // apply pg_temp's primary
2828 for (unsigned i
= 0; i
< temp_pg
->size(); ++i
) {
2829 if ((*temp_pg
)[i
] != CRUSH_ITEM_NONE
) {
2830 *temp_primary
= (*temp_pg
)[i
];
2837 void OSDMap::pg_to_raw_osds(pg_t pg
, vector
<int> *raw
, int *primary
) const
2839 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
2845 _pg_to_raw_osds(*pool
, pg
, raw
, NULL
);
2846 *primary
= _pick_primary(*raw
);
2849 void OSDMap::pg_to_raw_upmap(pg_t pg
, vector
<int>*raw
,
2850 vector
<int> *raw_upmap
) const
2852 auto pool
= get_pg_pool(pg
.pool());
2857 _pg_to_raw_osds(*pool
, pg
, raw
, NULL
);
2859 _apply_upmap(*pool
, pg
, raw_upmap
);
2862 void OSDMap::pg_to_raw_up(pg_t pg
, vector
<int> *up
, int *primary
) const
2864 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
2872 _pg_to_raw_osds(*pool
, pg
, &raw
, &pps
);
2873 _apply_upmap(*pool
, pg
, &raw
);
2874 _raw_to_up_osds(*pool
, raw
, up
);
2875 *primary
= _pick_primary(raw
);
2876 _apply_primary_affinity(pps
, *pool
, up
, primary
);
2879 void OSDMap::_pg_to_up_acting_osds(
2880 const pg_t
& pg
, vector
<int> *up
, int *up_primary
,
2881 vector
<int> *acting
, int *acting_primary
,
2882 bool raw_pg_to_pg
) const
2884 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
2886 (!raw_pg_to_pg
&& pg
.ps() >= pool
->get_pg_num())) {
2894 *acting_primary
= -1;
2899 vector
<int> _acting
;
2901 int _acting_primary
;
2903 _get_temp_osds(*pool
, pg
, &_acting
, &_acting_primary
);
2904 if (_acting
.empty() || up
|| up_primary
) {
2905 _pg_to_raw_osds(*pool
, pg
, &raw
, &pps
);
2906 _apply_upmap(*pool
, pg
, &raw
);
2907 _raw_to_up_osds(*pool
, raw
, &_up
);
2908 _up_primary
= _pick_primary(_up
);
2909 _apply_primary_affinity(pps
, *pool
, &_up
, &_up_primary
);
2910 if (_acting
.empty()) {
2912 if (_acting_primary
== -1) {
2913 _acting_primary
= _up_primary
;
2920 *up_primary
= _up_primary
;
2924 acting
->swap(_acting
);
2926 *acting_primary
= _acting_primary
;
2929 int OSDMap::calc_pg_role_broken(int osd
, const vector
<int>& acting
, int nrep
)
2931 // This implementation is broken for EC PGs since the osd may appear
2932 // multiple times in the acting set. See
2933 // https://tracker.ceph.com/issues/43213
2935 nrep
= acting
.size();
2936 for (int i
=0; i
<nrep
; i
++)
2937 if (acting
[i
] == osd
)
2942 int OSDMap::calc_pg_role(pg_shard_t who
, const vector
<int>& acting
)
2944 int nrep
= acting
.size();
2945 if (who
.shard
== shard_id_t::NO_SHARD
) {
2946 for (int i
=0; i
<nrep
; i
++) {
2947 if (acting
[i
] == who
.osd
) {
2952 if (who
.shard
< nrep
&& acting
[who
.shard
] == who
.osd
) {
2959 bool OSDMap::primary_changed_broken(
2961 const vector
<int> &oldacting
,
2963 const vector
<int> &newacting
)
2965 if (oldacting
.empty() && newacting
.empty())
2966 return false; // both still empty
2967 if (oldacting
.empty() ^ newacting
.empty())
2968 return true; // was empty, now not, or vice versa
2969 if (oldprimary
!= newprimary
)
2970 return true; // primary changed
2971 if (calc_pg_role_broken(oldprimary
, oldacting
) !=
2972 calc_pg_role_broken(newprimary
, newacting
))
2974 return false; // same primary (tho replicas may have changed)
2977 uint64_t OSDMap::get_encoding_features() const
2979 uint64_t f
= SIGNIFICANT_FEATURES
;
2980 if (require_osd_release
< ceph_release_t::octopus
) {
2981 f
&= ~CEPH_FEATURE_SERVER_OCTOPUS
;
2983 if (require_osd_release
< ceph_release_t::nautilus
) {
2984 f
&= ~CEPH_FEATURE_SERVER_NAUTILUS
;
2986 if (require_osd_release
< ceph_release_t::mimic
) {
2987 f
&= ~CEPH_FEATURE_SERVER_MIMIC
;
2989 if (require_osd_release
< ceph_release_t::luminous
) {
2990 f
&= ~(CEPH_FEATURE_SERVER_LUMINOUS
|
2991 CEPH_FEATURE_CRUSH_CHOOSE_ARGS
);
2993 if (require_osd_release
< ceph_release_t::kraken
) {
2994 f
&= ~(CEPH_FEATURE_SERVER_KRAKEN
|
2995 CEPH_FEATURE_MSG_ADDR2
);
2997 if (require_osd_release
< ceph_release_t::jewel
) {
2998 f
&= ~(CEPH_FEATURE_SERVER_JEWEL
|
2999 CEPH_FEATURE_NEW_OSDOP_ENCODING
|
3000 CEPH_FEATURE_CRUSH_TUNABLES5
);
3005 // serialize, unserialize
3006 void OSDMap::encode_client_old(ceph::buffer::list
& bl
) const
3015 encode(created
, bl
);
3016 encode(modified
, bl
);
3018 // for encode(pools, bl);
3019 __u32 n
= pools
.size();
3022 for (const auto &pool
: pools
) {
3025 encode(pool
.second
, bl
, 0);
3027 // for encode(pool_name, bl);
3028 n
= pool_name
.size();
3030 for (const auto &pname
: pool_name
) {
3033 encode(pname
.second
, bl
);
3035 // for encode(pool_max, bl);
3041 encode(max_osd
, bl
);
3043 uint32_t n
= osd_state
.size();
3045 for (auto s
: osd_state
) {
3046 encode((uint8_t)s
, bl
);
3049 encode(osd_weight
, bl
);
3050 encode(osd_addrs
->client_addrs
, bl
, 0);
3052 // for encode(pg_temp, bl);
3053 n
= pg_temp
->size();
3055 for (const auto& pg
: *pg_temp
) {
3056 old_pg_t opg
= pg
.first
.get_old_pg();
3058 encode(pg
.second
, bl
);
3062 ceph::buffer::list cbl
;
3063 crush
->encode(cbl
, 0 /* legacy (no) features */);
3067 void OSDMap::encode_classic(ceph::buffer::list
& bl
, uint64_t features
) const
3070 if ((features
& CEPH_FEATURE_PGID64
) == 0) {
3071 encode_client_old(bl
);
3081 encode(created
, bl
);
3082 encode(modified
, bl
);
3084 encode(pools
, bl
, features
);
3085 encode(pool_name
, bl
);
3086 encode(pool_max
, bl
);
3090 encode(max_osd
, bl
);
3092 uint32_t n
= osd_state
.size();
3094 for (auto s
: osd_state
) {
3095 encode((uint8_t)s
, bl
);
3098 encode(osd_weight
, bl
);
3099 encode(osd_addrs
->client_addrs
, bl
, features
);
3101 encode(*pg_temp
, bl
);
3104 ceph::buffer::list cbl
;
3105 crush
->encode(cbl
, 0 /* legacy (no) features */);
3111 encode(osd_addrs
->hb_back_addrs
, bl
, features
);
3112 encode(osd_info
, bl
);
3113 encode(blocklist
, bl
, features
);
3114 encode(osd_addrs
->cluster_addrs
, bl
, features
);
3115 encode(cluster_snapshot_epoch
, bl
);
3116 encode(cluster_snapshot
, bl
);
3117 encode(*osd_uuid
, bl
);
3118 encode(osd_xinfo
, bl
, features
);
3119 encode(osd_addrs
->hb_front_addrs
, bl
, features
);
3122 /* for a description of osdmap versions, and when they were introduced, please
3124 * doc/dev/osd_internals/osdmap_versions.txt
3126 void OSDMap::encode(ceph::buffer::list
& bl
, uint64_t features
) const
3129 if ((features
& CEPH_FEATURE_OSDMAP_ENC
) == 0) {
3130 encode_classic(bl
, features
);
3134 // only a select set of callers should *ever* be encoding new
3135 // OSDMaps. others should be passing around the canonical encoded
3136 // buffers from on high. select out those callers by passing in an
3137 // "impossible" feature bit.
3138 ceph_assert(features
& CEPH_FEATURE_RESERVED
);
3139 features
&= ~CEPH_FEATURE_RESERVED
;
3141 size_t start_offset
= bl
.length();
3144 std::optional
<ceph::buffer::list::contiguous_filler
> crc_filler
;
3146 // meta-encoding: how we include client-used and osd-specific data
3147 ENCODE_START(8, 7, bl
);
3150 // NOTE: any new encoding dependencies must be reflected by
3151 // SIGNIFICANT_FEATURES
3153 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
3155 } else if (!HAVE_FEATURE(features
, SERVER_MIMIC
)) {
3157 } else if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
3159 } /* else if (!HAVE_FEATURE(features, SERVER_REEF)) {
3162 ENCODE_START(v
, 1, bl
); // client-usable data
3166 encode(created
, bl
);
3167 encode(modified
, bl
);
3169 encode(pools
, bl
, features
);
3170 encode(pool_name
, bl
);
3171 encode(pool_max
, bl
);
3174 decltype(flags
) f
= flags
;
3175 if (require_osd_release
>= ceph_release_t::luminous
)
3176 f
|= CEPH_OSDMAP_REQUIRE_LUMINOUS
| CEPH_OSDMAP_RECOVERY_DELETES
;
3177 else if (require_osd_release
== ceph_release_t::kraken
)
3178 f
|= CEPH_OSDMAP_REQUIRE_KRAKEN
;
3179 else if (require_osd_release
== ceph_release_t::jewel
)
3180 f
|= CEPH_OSDMAP_REQUIRE_JEWEL
;
3186 encode(max_osd
, bl
);
3188 encode(osd_state
, bl
);
3190 uint32_t n
= osd_state
.size();
3192 for (auto s
: osd_state
) {
3193 encode((uint8_t)s
, bl
);
3196 encode(osd_weight
, bl
);
3198 encode(osd_addrs
->client_addrs
, bl
, features
);
3200 encode_addrvec_pvec_as_addr(osd_addrs
->client_addrs
, bl
, features
);
3203 encode(*pg_temp
, bl
);
3204 encode(*primary_temp
, bl
);
3205 if (osd_primary_affinity
) {
3206 encode(*osd_primary_affinity
, bl
);
3213 ceph::buffer::list cbl
;
3214 crush
->encode(cbl
, features
);
3216 encode(erasure_code_profiles
, bl
);
3219 encode(pg_upmap
, bl
);
3220 encode(pg_upmap_items
, bl
);
3222 ceph_assert(pg_upmap
.empty());
3223 ceph_assert(pg_upmap_items
.empty());
3226 encode(crush_version
, bl
);
3229 encode(new_removed_snaps
, bl
);
3230 encode(new_purged_snaps
, bl
);
3233 encode(last_up_change
, bl
);
3234 encode(last_in_change
, bl
);
3237 encode(pg_upmap_primaries
, bl
);
3239 ceph_assert(pg_upmap_primaries
.empty());
3241 ENCODE_FINISH(bl
); // client-usable data
3245 // NOTE: any new encoding dependencies must be reflected by
3246 // SIGNIFICANT_FEATURES
3247 uint8_t target_v
= 9; // when bumping this, be aware of allow_crimson
3248 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
3250 } else if (!HAVE_FEATURE(features
, SERVER_MIMIC
)) {
3252 } else if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
3255 if (stretch_mode_enabled
) {
3256 target_v
= std::max((uint8_t)10, target_v
);
3258 if (!range_blocklist
.empty()) {
3259 target_v
= std::max((uint8_t)11, target_v
);
3261 if (allow_crimson
) {
3262 target_v
= std::max((uint8_t)12, target_v
);
3264 ENCODE_START(target_v
, 1, bl
); // extended, osd-only data
3266 encode_addrvec_pvec_as_addr(osd_addrs
->hb_back_addrs
, bl
, features
);
3268 encode(osd_addrs
->hb_back_addrs
, bl
, features
);
3270 encode(osd_info
, bl
);
3272 // put this in a sorted, ordered map<> so that we encode in a
3273 // deterministic order.
3274 map
<entity_addr_t
,utime_t
> blocklist_map
;
3275 for (const auto &addr
: blocklist
)
3276 blocklist_map
.insert(make_pair(addr
.first
, addr
.second
));
3277 encode(blocklist_map
, bl
, features
);
3280 encode_addrvec_pvec_as_addr(osd_addrs
->cluster_addrs
, bl
, features
);
3282 encode(osd_addrs
->cluster_addrs
, bl
, features
);
3284 encode(cluster_snapshot_epoch
, bl
);
3285 encode(cluster_snapshot
, bl
);
3286 encode(*osd_uuid
, bl
);
3287 encode(osd_xinfo
, bl
, features
);
3289 encode_addrvec_pvec_as_addr(osd_addrs
->hb_front_addrs
, bl
, features
);
3291 encode(osd_addrs
->hb_front_addrs
, bl
, features
);
3293 if (target_v
>= 2) {
3294 encode(nearfull_ratio
, bl
);
3295 encode(full_ratio
, bl
);
3296 encode(backfillfull_ratio
, bl
);
3298 // 4 was string-based new_require_min_compat_client
3299 if (target_v
>= 5) {
3300 encode(require_min_compat_client
, bl
);
3301 encode(require_osd_release
, bl
);
3303 if (target_v
>= 6) {
3304 encode(removed_snaps_queue
, bl
);
3306 if (target_v
>= 8) {
3307 encode(crush_node_flags
, bl
);
3309 if (target_v
>= 9) {
3310 encode(device_class_flags
, bl
);
3312 if (target_v
>= 10) {
3313 encode(stretch_mode_enabled
, bl
);
3314 encode(stretch_bucket_count
, bl
);
3315 encode(degraded_stretch_mode
, bl
);
3316 encode(recovering_stretch_mode
, bl
);
3317 encode(stretch_mode_bucket
, bl
);
3319 if (target_v
>= 11) {
3320 ::encode(range_blocklist
, bl
, features
);
3322 if (target_v
>= 12) {
3323 ::encode(allow_crimson
, bl
);
3325 ENCODE_FINISH(bl
); // osd-only data
3328 crc_offset
= bl
.length();
3329 crc_filler
= bl
.append_hole(sizeof(uint32_t));
3330 tail_offset
= bl
.length();
3332 ENCODE_FINISH(bl
); // meta-encoding wrapper
3335 ceph::buffer::list front
;
3336 front
.substr_of(bl
, start_offset
, crc_offset
- start_offset
);
3337 crc
= front
.crc32c(-1);
3338 if (tail_offset
< bl
.length()) {
3339 ceph::buffer::list tail
;
3340 tail
.substr_of(bl
, tail_offset
, bl
.length() - tail_offset
);
3341 crc
= tail
.crc32c(crc
);
3345 crc_filler
->copy_in(4, (char*)&crc_le
);
3349 /* for a description of osdmap versions, and when they were introduced, please
3351 * doc/dev/osd_internals/osdmap_versions.txt
3353 void OSDMap::decode(ceph::buffer::list
& bl
)
3355 auto p
= bl
.cbegin();
3359 void OSDMap::decode_classic(ceph::buffer::list::const_iterator
& p
)
3370 decode(modified
, p
);
3374 int32_t max_pools
= 0;
3375 decode(max_pools
, p
);
3376 pool_max
= max_pools
;
3382 decode(pools
[t
], p
);
3387 } else if (v
== 5) {
3392 decode(pool_name
[t
], p
);
3399 decode(pool_name
, p
);
3400 decode(pool_max
, p
);
3402 // kludge around some old bug that zeroed out pool_max (#2307)
3403 if (pools
.size() && pool_max
< pools
.rbegin()->first
) {
3404 pool_max
= pools
.rbegin()->first
;
3413 osd_state
.resize(os
.size());
3414 for (unsigned i
= 0; i
< os
.size(); ++i
) {
3415 osd_state
[i
] = os
[i
];
3418 decode(osd_weight
, p
);
3419 decode(osd_addrs
->client_addrs
, p
);
3425 ceph::decode_raw(opg
, p
);
3426 mempool::osdmap::vector
<int32_t> v
;
3428 pg_temp
->set(pg_t(opg
), v
);
3431 decode(*pg_temp
, p
);
3435 ceph::buffer::list cbl
;
3437 auto cblp
= cbl
.cbegin();
3438 crush
->decode(cblp
);
3444 decode(osd_addrs
->hb_back_addrs
, p
);
3445 decode(osd_info
, p
);
3447 decode(pool_name
, p
);
3449 decode(blocklist
, p
);
3451 decode(osd_addrs
->cluster_addrs
, p
);
3453 osd_addrs
->cluster_addrs
.resize(osd_addrs
->client_addrs
.size());
3456 decode(cluster_snapshot_epoch
, p
);
3457 decode(cluster_snapshot
, p
);
3461 decode(*osd_uuid
, p
);
3463 osd_uuid
->resize(max_osd
);
3466 decode(osd_xinfo
, p
);
3468 osd_xinfo
.resize(max_osd
);
3471 decode(osd_addrs
->hb_front_addrs
, p
);
3473 osd_addrs
->hb_front_addrs
.resize(osd_addrs
->hb_back_addrs
.size());
3475 osd_primary_affinity
.reset();
3480 void OSDMap::decode(ceph::buffer::list::const_iterator
& bl
)
3484 * Older encodings of the OSDMap had a single struct_v which
3485 * covered the whole encoding, and was prior to our modern
3486 * stuff which includes a compatv and a size. So if we see
3487 * a struct_v < 7, we must rewind to the beginning and use our
3490 size_t start_offset
= bl
.get_off();
3491 size_t tail_offset
= 0;
3492 ceph::buffer::list crc_front
, crc_tail
;
3494 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl
); // wrapper
3496 bl
.seek(start_offset
);
3501 * Since we made it past that hurdle, we can use our normal paths.
3504 DECODE_START(9, bl
); // client-usable data
3508 decode(created
, bl
);
3509 decode(modified
, bl
);
3512 decode(pool_name
, bl
);
3513 decode(pool_max
, bl
);
3517 decode(max_osd
, bl
);
3518 if (struct_v
>= 5) {
3519 decode(osd_state
, bl
);
3523 osd_state
.resize(os
.size());
3524 for (unsigned i
= 0; i
< os
.size(); ++i
) {
3525 osd_state
[i
] = os
[i
];
3528 decode(osd_weight
, bl
);
3529 decode(osd_addrs
->client_addrs
, bl
);
3531 decode(*pg_temp
, bl
);
3532 decode(*primary_temp
, bl
);
3533 // dates back to firefly. version increased from 2 to 3 still in firefly.
3534 // do we really still need to keep this around? even for old clients?
3535 if (struct_v
>= 2) {
3536 osd_primary_affinity
.reset(new mempool::osdmap::vector
<__u32
>);
3537 decode(*osd_primary_affinity
, bl
);
3538 if (osd_primary_affinity
->empty())
3539 osd_primary_affinity
.reset();
3541 osd_primary_affinity
.reset();
3545 ceph::buffer::list cbl
;
3547 auto cblp
= cbl
.cbegin();
3548 crush
->decode(cblp
);
3549 // added in firefly; version increased in luminous, so it affects
3550 // giant, hammer, infernalis, jewel, and kraken. probably should be left
3551 // alone until we require clients to be all luminous?
3552 if (struct_v
>= 3) {
3553 decode(erasure_code_profiles
, bl
);
3555 erasure_code_profiles
.clear();
3557 // version increased from 3 to 4 still in luminous, so same as above
3559 if (struct_v
>= 4) {
3560 decode(pg_upmap
, bl
);
3561 decode(pg_upmap_items
, bl
);
3564 pg_upmap_items
.clear();
3566 // again, version increased from 5 to 6 still in luminous, so above
3568 if (struct_v
>= 6) {
3569 decode(crush_version
, bl
);
3571 // version increase from 6 to 7 in mimic
3572 if (struct_v
>= 7) {
3573 decode(new_removed_snaps
, bl
);
3574 decode(new_purged_snaps
, bl
);
3576 // version increase from 7 to 8, 8 to 9, in nautilus.
3577 if (struct_v
>= 9) {
3578 decode(last_up_change
, bl
);
3579 decode(last_in_change
, bl
);
3581 if (struct_v
>= 10) {
3582 decode(pg_upmap_primaries
, bl
);
3584 pg_upmap_primaries
.clear();
3586 DECODE_FINISH(bl
); // client-usable data
3590 DECODE_START(10, bl
); // extended, osd-only data
3591 decode(osd_addrs
->hb_back_addrs
, bl
);
3592 decode(osd_info
, bl
);
3593 decode(blocklist
, bl
);
3594 decode(osd_addrs
->cluster_addrs
, bl
);
3595 decode(cluster_snapshot_epoch
, bl
);
3596 decode(cluster_snapshot
, bl
);
3597 decode(*osd_uuid
, bl
);
3598 decode(osd_xinfo
, bl
);
3599 decode(osd_addrs
->hb_front_addrs
, bl
);
3601 if (struct_v
>= 2) {
3602 decode(nearfull_ratio
, bl
);
3603 decode(full_ratio
, bl
);
3608 if (struct_v
>= 3) {
3609 decode(backfillfull_ratio
, bl
);
3611 backfillfull_ratio
= 0;
3613 if (struct_v
== 4) {
3617 require_min_compat_client
= ceph_release_from_name(r
.c_str());
3619 if (struct_v
>= 5) {
3620 decode(require_min_compat_client
, bl
);
3621 decode(require_osd_release
, bl
);
3622 if (require_osd_release
>= ceph_release_t::nautilus
) {
3623 flags
|= CEPH_OSDMAP_PGLOG_HARDLIMIT
;
3625 if (require_osd_release
>= ceph_release_t::luminous
) {
3626 flags
&= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS
);
3627 flags
|= CEPH_OSDMAP_RECOVERY_DELETES
;
3630 if (flags
& CEPH_OSDMAP_REQUIRE_LUMINOUS
) {
3631 // only for compat with post-kraken pre-luminous test clusters
3632 require_osd_release
= ceph_release_t::luminous
;
3633 flags
&= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS
);
3634 flags
|= CEPH_OSDMAP_RECOVERY_DELETES
;
3635 } else if (flags
& CEPH_OSDMAP_REQUIRE_KRAKEN
) {
3636 require_osd_release
= ceph_release_t::kraken
;
3637 } else if (flags
& CEPH_OSDMAP_REQUIRE_JEWEL
) {
3638 require_osd_release
= ceph_release_t::jewel
;
3640 require_osd_release
= ceph_release_t::unknown
;
3643 if (struct_v
>= 6) {
3644 decode(removed_snaps_queue
, bl
);
3646 if (struct_v
>= 8) {
3647 decode(crush_node_flags
, bl
);
3649 crush_node_flags
.clear();
3651 if (struct_v
>= 9) {
3652 decode(device_class_flags
, bl
);
3654 device_class_flags
.clear();
3656 if (struct_v
>= 10) {
3657 decode(stretch_mode_enabled
, bl
);
3658 decode(stretch_bucket_count
, bl
);
3659 decode(degraded_stretch_mode
, bl
);
3660 decode(recovering_stretch_mode
, bl
);
3661 decode(stretch_mode_bucket
, bl
);
3663 stretch_mode_enabled
= false;
3664 stretch_bucket_count
= 0;
3665 degraded_stretch_mode
= 0;
3666 recovering_stretch_mode
= 0;
3667 stretch_mode_bucket
= 0;
3669 if (struct_v
>= 11) {
3670 decode(range_blocklist
, bl
);
3671 calculated_ranges
.clear();
3672 for (const auto& i
: range_blocklist
) {
3673 calculated_ranges
.emplace(i
.first
, i
.first
);
3676 if (struct_v
>= 12) {
3677 decode(allow_crimson
, bl
);
3679 DECODE_FINISH(bl
); // osd-only data
3682 if (struct_v
>= 8) {
3683 crc_front
.substr_of(bl
.get_bl(), start_offset
, bl
.get_off() - start_offset
);
3685 tail_offset
= bl
.get_off();
3688 crc_defined
= false;
3692 DECODE_FINISH(bl
); // wrapper
3696 uint32_t actual
= crc_front
.crc32c(-1);
3697 if (tail_offset
< bl
.get_off()) {
3698 ceph::buffer::list tail
;
3699 tail
.substr_of(bl
.get_bl(), tail_offset
, bl
.get_off() - tail_offset
);
3700 actual
= tail
.crc32c(actual
);
3702 if (crc
!= actual
) {
3704 ss
<< "bad crc, actual " << actual
<< " != expected " << crc
;
3705 string s
= ss
.str();
3706 throw ceph::buffer::malformed_input(s
.c_str());
3713 void OSDMap::post_decode()
3717 for (const auto &pname
: pool_name
) {
3718 name_pool
[pname
.second
] = pname
.first
;
3722 _calc_up_osd_features();
3725 void OSDMap::dump_erasure_code_profiles(
3726 const mempool::osdmap::map
<string
,map
<string
,string
>>& profiles
,
3729 f
->open_object_section("erasure_code_profiles");
3730 for (const auto &profile
: profiles
) {
3731 f
->open_object_section(profile
.first
.c_str());
3732 for (const auto &profm
: profile
.second
) {
3733 f
->dump_string(profm
.first
.c_str(), profm
.second
);
3740 void OSDMap::dump_osds(Formatter
*f
) const
3742 f
->open_array_section("osds");
3743 for (int i
=0; i
<get_max_osd(); i
++) {
3751 void OSDMap::dump_osd(int id
, Formatter
*f
) const
3753 ceph_assert(f
!= nullptr);
3758 f
->open_object_section("osd_info");
3759 f
->dump_int("osd", id
);
3760 f
->dump_stream("uuid") << get_uuid(id
);
3761 f
->dump_int("up", is_up(id
));
3762 f
->dump_int("in", is_in(id
));
3763 f
->dump_float("weight", get_weightf(id
));
3764 f
->dump_float("primary_affinity", get_primary_affinityf(id
));
3765 get_info(id
).dump(f
);
3766 f
->dump_object("public_addrs", get_addrs(id
));
3767 f
->dump_object("cluster_addrs", get_cluster_addrs(id
));
3768 f
->dump_object("heartbeat_back_addrs", get_hb_back_addrs(id
));
3769 f
->dump_object("heartbeat_front_addrs", get_hb_front_addrs(id
));
3771 f
->dump_stream("public_addr") << get_addrs(id
).get_legacy_str();
3772 f
->dump_stream("cluster_addr") << get_cluster_addrs(id
).get_legacy_str();
3773 f
->dump_stream("heartbeat_back_addr")
3774 << get_hb_back_addrs(id
).get_legacy_str();
3775 f
->dump_stream("heartbeat_front_addr")
3776 << get_hb_front_addrs(id
).get_legacy_str();
3780 f
->open_array_section("state");
3781 for (const auto &state
: st
)
3782 f
->dump_string("state", state
);
3788 void OSDMap::dump_pool(CephContext
*cct
,
3790 const pg_pool_t
&pdata
,
3791 ceph::Formatter
*f
) const
3793 std::string
name("<unknown>");
3794 const auto &pni
= pool_name
.find(pid
);
3795 if (pni
!= pool_name
.end())
3797 f
->open_object_section("pool");
3798 f
->dump_int("pool", pid
);
3799 f
->dump_string("pool_name", name
);
3801 dump_read_balance_score(cct
, pid
, pdata
, f
);
3802 f
->close_section(); // pool
3805 void OSDMap::dump_read_balance_score(CephContext
*cct
,
3807 const pg_pool_t
&pdata
,
3808 ceph::Formatter
*f
) const
3810 if (pdata
.is_replicated()) {
3811 // Add rb section with values for score, optimal score, raw score
3812 // and primary_affinity average
3813 OSDMap::read_balance_info_t rb_info
;
3814 auto rc
= calc_read_balance_score(cct
, pid
, &rb_info
);
3816 f
->open_object_section("read_balance");
3817 f
->dump_float("score_acting", rb_info
.acting_adj_score
);
3818 f
->dump_float("score_stable", rb_info
.adjusted_score
);
3819 f
->dump_float("optimal_score", rb_info
.optimal_score
);
3820 f
->dump_float("raw_score_acting", rb_info
.acting_raw_score
);
3821 f
->dump_float("raw_score_stable", rb_info
.raw_score
);
3822 f
->dump_float("primary_affinity_weighted", rb_info
.pa_weighted
);
3823 f
->dump_float("average_primary_affinity", rb_info
.pa_avg
);
3824 f
->dump_float("average_primary_affinity_weighted", rb_info
.pa_weighted_avg
);
3825 if (rb_info
.err_msg
.length() > 0) {
3826 f
->dump_string("error_message", rb_info
.err_msg
);
3828 f
->close_section(); // read_balance
3831 if (rb_info
.err_msg
.length() > 0) {
3832 f
->open_object_section("read_balance");
3833 f
->dump_string("error_message", rb_info
.err_msg
);
3834 f
->dump_float("score_acting", rb_info
.acting_adj_score
);
3835 f
->dump_float("score_stable", rb_info
.adjusted_score
);
3836 f
->close_section(); // read_balance
3842 void OSDMap::dump(Formatter
*f
, CephContext
*cct
) const
3844 f
->dump_int("epoch", get_epoch());
3845 f
->dump_stream("fsid") << get_fsid();
3846 f
->dump_stream("created") << get_created();
3847 f
->dump_stream("modified") << get_modified();
3848 f
->dump_stream("last_up_change") << last_up_change
;
3849 f
->dump_stream("last_in_change") << last_in_change
;
3850 f
->dump_string("flags", get_flag_string());
3851 f
->dump_unsigned("flags_num", flags
);
3852 f
->open_array_section("flags_set");
3853 set
<string
> flagset
;
3854 get_flag_set(&flagset
);
3855 for (auto p
: flagset
) {
3856 f
->dump_string("flag", p
);
3859 f
->dump_unsigned("crush_version", get_crush_version());
3860 f
->dump_float("full_ratio", full_ratio
);
3861 f
->dump_float("backfillfull_ratio", backfillfull_ratio
);
3862 f
->dump_float("nearfull_ratio", nearfull_ratio
);
3863 f
->dump_string("cluster_snapshot", get_cluster_snapshot());
3864 f
->dump_int("pool_max", get_pool_max());
3865 f
->dump_int("max_osd", get_max_osd());
3866 f
->dump_string("require_min_compat_client",
3867 to_string(require_min_compat_client
));
3868 f
->dump_string("min_compat_client",
3869 to_string(get_min_compat_client()));
3870 f
->dump_string("require_osd_release",
3871 to_string(require_osd_release
));
3873 f
->dump_bool("allow_crimson", allow_crimson
);
3874 f
->open_array_section("pools");
3875 for (const auto &[pid
, pdata
] : pools
) {
3876 dump_pool(cct
, pid
, pdata
, f
);
3882 f
->open_array_section("osd_xinfo");
3883 for (int i
=0; i
<get_max_osd(); i
++) {
3885 f
->open_object_section("xinfo");
3886 f
->dump_int("osd", i
);
3887 osd_xinfo
[i
].dump(f
);
3893 f
->open_array_section("pg_upmap");
3894 for (auto& p
: pg_upmap
) {
3895 f
->open_object_section("mapping");
3896 f
->dump_stream("pgid") << p
.first
;
3897 f
->open_array_section("osds");
3898 for (auto q
: p
.second
) {
3899 f
->dump_int("osd", q
);
3906 f
->open_array_section("pg_upmap_items");
3907 for (auto& [pgid
, mappings
] : pg_upmap_items
) {
3908 f
->open_object_section("mapping");
3909 f
->dump_stream("pgid") << pgid
;
3910 f
->open_array_section("mappings");
3911 for (auto& [from
, to
] : mappings
) {
3912 f
->open_object_section("mapping");
3913 f
->dump_int("from", from
);
3914 f
->dump_int("to", to
);
3922 f
->open_array_section("pg_upmap_primaries");
3923 for (const auto& [pg
, osd
] : pg_upmap_primaries
) {
3924 f
->open_object_section("primary_mapping");
3925 f
->dump_stream("pgid") << pg
;
3926 f
->dump_int("primary_osd", osd
);
3929 f
->close_section(); // pg_upmap_primaries
3931 f
->open_array_section("pg_temp");
3935 f
->open_array_section("primary_temp");
3936 for (const auto &pg
: *primary_temp
) {
3937 f
->dump_stream("pgid") << pg
.first
;
3938 f
->dump_int("osd", pg
.second
);
3940 f
->close_section(); // primary_temp
3942 f
->open_object_section("blocklist");
3943 for (const auto &addr
: blocklist
) {
3946 f
->dump_stream(ss
.str().c_str()) << addr
.second
;
3949 f
->open_object_section("range_blocklist");
3950 for (const auto &addr
: range_blocklist
) {
3953 f
->dump_stream(ss
.str().c_str()) << addr
.second
;
3957 dump_erasure_code_profiles(erasure_code_profiles
, f
);
3959 f
->open_array_section("removed_snaps_queue");
3960 for (auto& p
: removed_snaps_queue
) {
3961 f
->open_object_section("pool");
3962 f
->dump_int("pool", p
.first
);
3963 f
->open_array_section("snaps");
3964 for (auto q
= p
.second
.begin(); q
!= p
.second
.end(); ++q
) {
3965 f
->open_object_section("interval");
3966 f
->dump_unsigned("begin", q
.get_start());
3967 f
->dump_unsigned("length", q
.get_len());
3974 f
->open_array_section("new_removed_snaps");
3975 for (auto& p
: new_removed_snaps
) {
3976 f
->open_object_section("pool");
3977 f
->dump_int("pool", p
.first
);
3978 f
->open_array_section("snaps");
3979 for (auto q
= p
.second
.begin(); q
!= p
.second
.end(); ++q
) {
3980 f
->open_object_section("interval");
3981 f
->dump_unsigned("begin", q
.get_start());
3982 f
->dump_unsigned("length", q
.get_len());
3989 f
->open_array_section("new_purged_snaps");
3990 for (auto& p
: new_purged_snaps
) {
3991 f
->open_object_section("pool");
3992 f
->dump_int("pool", p
.first
);
3993 f
->open_array_section("snaps");
3994 for (auto q
= p
.second
.begin(); q
!= p
.second
.end(); ++q
) {
3995 f
->open_object_section("interval");
3996 f
->dump_unsigned("begin", q
.get_start());
3997 f
->dump_unsigned("length", q
.get_len());
4004 f
->open_object_section("crush_node_flags");
4005 for (auto& i
: crush_node_flags
) {
4006 string s
= crush
->item_exists(i
.first
) ? crush
->get_item_name(i
.first
)
4007 : stringify(i
.first
);
4008 f
->open_array_section(s
.c_str());
4010 calc_state_set(i
.second
, st
);
4011 for (auto& j
: st
) {
4012 f
->dump_string("flag", j
);
4017 f
->open_object_section("device_class_flags");
4018 for (auto& i
: device_class_flags
) {
4019 const char* class_name
= crush
->get_class_name(i
.first
);
4020 string s
= class_name
? class_name
: stringify(i
.first
);
4021 f
->open_array_section(s
.c_str());
4023 calc_state_set(i
.second
, st
);
4024 for (auto& j
: st
) {
4025 f
->dump_string("flag", j
);
4030 f
->open_object_section("stretch_mode");
4032 f
->dump_bool("stretch_mode_enabled", stretch_mode_enabled
);
4033 f
->dump_unsigned("stretch_bucket_count", stretch_bucket_count
);
4034 f
->dump_unsigned("degraded_stretch_mode", degraded_stretch_mode
);
4035 f
->dump_unsigned("recovering_stretch_mode", recovering_stretch_mode
);
4036 f
->dump_int("stretch_mode_bucket", stretch_mode_bucket
);
// Appends sample OSDMap instances to the caller-supplied list `o`.
// Instances are heap-allocated with `new` and pushed as raw pointers.
4041 void OSDMap::generate_test_instances(list
<OSDMap
*>& o
)
// First sample: a default-constructed map.
4043 o
.push_back(new OSDMap
);
// A utility-environment CephContext is created for build_simple() below.
// NOTE(review): its release is not visible in this excerpt -- confirm it is put().
4045 CephContext
*cct
= new CephContext(CODE_ENVIRONMENT_UTILITY
);
// Second sample: populated through build_simple().
4046 o
.push_back(new OSDMap
);
4048 o
.back()->build_simple(cct
, 1, fsid
, 16);
// Pin created/modified to a constant value instead of whatever
// build_simple() stored there.
4049 o
.back()->created
= o
.back()->modified
= utime_t(1, 2); // fix timestamp
// One blocklist entry keyed by a default-constructed address.
4050 o
.back()->blocklist
[entity_addr_t()] = utime_t(5, 6);
// Renders the CEPH_OSDMAP_* bits set in `f` as text.
// Each tested bit appends a comma-prefixed flag name to `s` (see the
// visible ",norebalance", ",nodeep-scrub", ... appends below).
// NOTE(review): several append statements and the final return are not
// visible in this excerpt; presumably the leading comma is stripped
// before returning -- confirm.
4054 string
OSDMap::get_flag_string(unsigned f
)
4057 if (f
& CEPH_OSDMAP_PAUSERD
)
4059 if (f
& CEPH_OSDMAP_PAUSEWR
)
4061 if (f
& CEPH_OSDMAP_PAUSEREC
)
4063 if (f
& CEPH_OSDMAP_NOUP
)
4065 if (f
& CEPH_OSDMAP_NODOWN
)
4067 if (f
& CEPH_OSDMAP_NOOUT
)
4069 if (f
& CEPH_OSDMAP_NOIN
)
4071 if (f
& CEPH_OSDMAP_NOBACKFILL
)
4073 if (f
& CEPH_OSDMAP_NOREBALANCE
)
4074 s
+= ",norebalance";
4075 if (f
& CEPH_OSDMAP_NORECOVER
)
4077 if (f
& CEPH_OSDMAP_NOSCRUB
)
4079 if (f
& CEPH_OSDMAP_NODEEP_SCRUB
)
4080 s
+= ",nodeep-scrub";
4081 if (f
& CEPH_OSDMAP_NOTIERAGENT
)
4082 s
+= ",notieragent";
4083 if (f
& CEPH_OSDMAP_NOSNAPTRIM
)
4085 if (f
& CEPH_OSDMAP_SORTBITWISE
)
4086 s
+= ",sortbitwise";
// The REQUIRE_* bits are reported with a "require_<release>_osds" name.
4087 if (f
& CEPH_OSDMAP_REQUIRE_JEWEL
)
4088 s
+= ",require_jewel_osds";
4089 if (f
& CEPH_OSDMAP_REQUIRE_KRAKEN
)
4090 s
+= ",require_kraken_osds";
4091 if (f
& CEPH_OSDMAP_REQUIRE_LUMINOUS
)
4092 s
+= ",require_luminous_osds";
4093 if (f
& CEPH_OSDMAP_RECOVERY_DELETES
)
4094 s
+= ",recovery_deletes";
4095 if (f
& CEPH_OSDMAP_PURGED_SNAPDIRS
)
4096 s
+= ",purged_snapdirs";
4097 if (f
& CEPH_OSDMAP_PGLOG_HARDLIMIT
)
4098 s
+= ",pglog_hardlimit";
4099 if (f
& CEPH_OSDMAP_NOAUTOSCALE
)
4100 s
+= ",noautoscale";
4106 string
OSDMap::get_flag_string() const
4108 return get_flag_string(flags
);
// Writes a human-readable description of every pool in `pools` to `out`:
// a "pool <id> ..." line, an optional read-balance message, then the
// pool's snapshots and removed-snapshot information.
4111 void OSDMap::print_pools(CephContext
*cct
, ostream
& out
) const
4113 for (const auto &[pid
, pdata
] : pools
) {
// Placeholder label; pool_name is consulted for this pid just below.
4114 std::string
name("<unknown>");
4115 const auto &pni
= pool_name
.find(pid
);
4116 if (pni
!= pool_name
.end())
// Stays an empty string unless a score is formatted below.
4118 char rb_score_str
[32] = "";
4120 read_balance_info_t rb_info
;
// Read-balance scoring is only attempted for replicated pools.
4121 if (pdata
.is_replicated()) {
4122 rc
= calc_read_balance_score(cct
, pid
, &rb_info
);
// Score is rendered with two decimals; snprintf is bounded by the
// buffer size.
4124 snprintf (rb_score_str
, sizeof(rb_score_str
),
4125 " read_balance_score %.2f", rb_info
.acting_adj_score
);
4128 out
<< "pool " << pid
4131 << rb_score_str
<< "\n";
// A negative rc labels the message as an error, otherwise a warning.
4132 if (rb_info
.err_msg
.length() > 0) {
4133 out
<< (rc
< 0 ? " ERROR: " : " Warning: ") << rb_info
.err_msg
<< "\n";
4136 //TODO - print error messages here.
// One tab-indented line per pool snapshot: id, quoted name, timestamp.
4138 for (const auto &snap
: pdata
.snaps
)
4139 out
<< "\tsnap " << snap
.second
.snapid
<< " '" << snap
.second
.name
<< "' " << snap
.second
.stamp
<< "\n";
// removed_snaps is printed only when it is non-empty.
4141 if (!pdata
.removed_snaps
.empty())
4142 out
<< "\tremoved_snaps " << pdata
.removed_snaps
<< "\n";
// Map-level removal queue entry for this pool, if one exists.
4143 auto p
= removed_snaps_queue
.find(pid
);
4144 if (p
!= removed_snaps_queue
.end()) {
4145 out
<< "\tremoved_snaps_queue " << p
->second
<< "\n";
// Walks every OSD id in [0, get_max_osd()).
// NOTE(review): the loop body is not visible in this excerpt; presumably
// it forwards each id to print_osd(i, out) -- confirm.
4151 void OSDMap::print_osds(ostream
& out
) const
4153 for (int i
=0; i
<get_max_osd(); i
++) {
// Writes a one-line description of OSD `id` to `out`: up/down and in/out
// state, weight, optional primary affinity, addresses and optional uuid.
// NOTE(review): some statements between the visible ones are missing from
// this excerpt (e.g. how `info` is used).
4159 void OSDMap::print_osd(int id
, ostream
& out
) const
4165 out
<< "osd." << id
;
4166 out
<< (is_up(id
) ? " up ":" down");
4167 out
<< (is_in(id
) ? " in ":" out");
4168 out
<< " weight " << get_weightf(id
);
// Primary affinity is only mentioned when it differs from the default.
4169 if (get_primary_affinity(id
) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
) {
4170 out
<< " primary_affinity " << get_primary_affinityf(id
);
4172 const osd_info_t
& info(get_info(id
));
// Public addresses first, then cluster addresses.
4174 out
<< " " << get_addrs(id
) << " " << get_cluster_addrs(id
);
// The uuid is omitted when it is all zeros.
4178 if (!get_uuid(id
).is_zero()) {
4179 out
<< " " << get_uuid(id
);
// Writes a plain-text dump of the whole map to `out`: header fields,
// flags and ratios, release requirements, stretch-mode state, pools,
// max_osd, then the upmap / temp / blocklist tables, one entry per line.
4184 void OSDMap::print(CephContext
*cct
, ostream
& out
) const
4186 out
<< "epoch " << get_epoch() << "\n"
4187 << "fsid " << get_fsid() << "\n"
4188 << "created " << get_created() << "\n"
4189 << "modified " << get_modified() << "\n";
4191 out
<< "flags " << get_flag_string() << "\n";
4192 out
<< "crush_version " << get_crush_version() << "\n";
4193 out
<< "full_ratio " << full_ratio
<< "\n";
4194 out
<< "backfillfull_ratio " << backfillfull_ratio
<< "\n";
4195 out
<< "nearfull_ratio " << nearfull_ratio
<< "\n";
// The explicit client requirement is printed only when it is set.
4196 if (require_min_compat_client
!= ceph_release_t::unknown
) {
4197 out
<< "require_min_compat_client "
4198 << require_min_compat_client
<< "\n";
4200 out
<< "min_compat_client " << get_min_compat_client()
// Likewise for the required OSD release.
4202 if (require_osd_release
> ceph_release_t::unknown
) {
4203 out
<< "require_osd_release " << require_osd_release
4206 out
<< "stretch_mode_enabled " << (stretch_mode_enabled
? "true" : "false") << "\n";
// Stretch-mode details are only shown while stretch mode is enabled.
4207 if (stretch_mode_enabled
) {
4208 out
<< "stretch_bucket_count " << stretch_bucket_count
<< "\n";
4209 out
<< "degraded_stretch_mode " << degraded_stretch_mode
<< "\n";
4210 out
<< "recovering_stretch_mode " << recovering_stretch_mode
<< "\n";
4211 out
<< "stretch_mode_bucket " << stretch_mode_bucket
<< "\n";
// cluster_snapshot is skipped when the string is empty.
4213 if (get_cluster_snapshot().length())
4214 out
<< "cluster_snapshot " << get_cluster_snapshot() << "\n";
4215 if (allow_crimson
) {
4216 out
<< "allow_crimson=true\n";
4220 print_pools(cct
, out
);
4222 out
<< "max_osd " << get_max_osd() << "\n";
// Explicit PG remappings, one "<table> <pgid> <value>" line each.
4226 for (auto& p
: pg_upmap
) {
4227 out
<< "pg_upmap " << p
.first
<< " " << p
.second
<< "\n";
4229 for (auto& p
: pg_upmap_items
) {
4230 out
<< "pg_upmap_items " << p
.first
<< " " << p
.second
<< "\n";
4233 for (auto& [pg
, osd
] : pg_upmap_primaries
) {
4234 out
<< "pg_upmap_primary " << pg
<< " " << osd
<< "\n";
// pg_temp and primary_temp are held through pointers, hence the dereference.
4237 for (const auto& pg
: *pg_temp
)
4238 out
<< "pg_temp " << pg
.first
<< " " << pg
.second
<< "\n";
4240 for (const auto& pg
: *primary_temp
)
4241 out
<< "primary_temp " << pg
.first
<< " " << pg
.second
<< "\n";
// Blocklist entries are printed as address plus expiry value.
4243 for (const auto &addr
: blocklist
)
4244 out
<< "blocklist " << addr
.first
<< " expires " << addr
.second
<< "\n";
4245 for (const auto &addr
: range_blocklist
)
4246 out
<< "range blocklist " << addr
.first
<< " expires " << addr
.second
<< "\n";
// CRUSH tree dumper that renders the OSD tree as rows of a TextTable.
// `filter` is a mask of OSDMap::DUMP_* bits used to select which leaf
// OSDs are shown; `osdmap` supplies the per-OSD state.
4249 class OSDTreePlainDumper
: public CrushTreeDumper::Dumper
<TextTable
> {
4251 typedef CrushTreeDumper::Dumper
<TextTable
> Parent
;
4253 OSDTreePlainDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap_
,
4255 : Parent(crush
, osdmap_
->get_pool_names()), osdmap(osdmap_
), filter(f
) { }
// Decides whether leaf OSD `i` should be shown for the current filter.
// NOTE(review): the condition guarding the early "normal case" return and
// the return statements after the filter test are not visible here.
4257 bool should_dump_leaf(int i
) const override
{
4259 return true; // normal case
// Matches when any requested DUMP_* bit agrees with the OSD's state.
4261 if (((filter
& OSDMap::DUMP_UP
) && osdmap
->is_up(i
)) ||
4262 ((filter
& OSDMap::DUMP_DOWN
) && osdmap
->is_down(i
)) ||
4263 ((filter
& OSDMap::DUMP_IN
) && osdmap
->is_in(i
)) ||
4264 ((filter
& OSDMap::DUMP_OUT
) && osdmap
->is_out(i
)) ||
4265 ((filter
& OSDMap::DUMP_DESTROYED
) && osdmap
->is_destroyed(i
))) {
// NOTE(review): body not visible in this excerpt.
4271 bool should_dump_empty_bucket() const override
{
// Declares the seven output columns of the tree table.
4275 void init_table(TextTable
*tbl
) {
4276 tbl
->define_column("ID", TextTable::LEFT
, TextTable::RIGHT
);
4277 tbl
->define_column("CLASS", TextTable::LEFT
, TextTable::RIGHT
);
4278 tbl
->define_column("WEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
4279 tbl
->define_column("TYPE NAME", TextTable::LEFT
, TextTable::LEFT
);
4280 tbl
->define_column("STATUS", TextTable::LEFT
, TextTable::RIGHT
);
4281 tbl
->define_column("REWEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
4282 tbl
->define_column("PRI-AFF", TextTable::LEFT
, TextTable::RIGHT
);
// Fills `tbl`; a non-empty `bucket` gets separate handling (not visible
// here).
4284 void dump(TextTable
*tbl
, string
& bucket
) {
4287 if (!bucket
.empty()) {
// OSDs that exist but were not reached by the tree walk are emitted as
// stand-alone items (depth 0, weight 0), subject to the filter.
4292 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
4293 if (osdmap
->exists(i
) && !is_touched(i
) && should_dump_leaf(i
)) {
4294 dump_item(CrushTreeDumper::Item(i
, 0, 0, 0), tbl
);
// Emits one table row for a bucket or OSD item.
4301 void dump_item(const CrushTreeDumper::Item
&qi
, TextTable
*tbl
) override
{
4302 const char *c
= crush
->get_item_class(qi
.id
);
4307 << weightf_t(qi
.weight
);
// Loops once per tree depth level; the body is not visible here
// (presumably indentation of the name column).
4310 for (int k
= 0; k
< qi
.depth
; k
++)
// Buckets are labelled "<type> <name>", OSDs "osd.<id>".
4312 if (qi
.is_bucket()) {
4313 name
<< crush
->get_type_name(crush
->get_bucket_type(qi
.id
)) << " "
4314 << crush
->get_item_name(qi
.id
);
4316 name
<< "osd." << qi
.id
;
// Status, reweight and primary-affinity columns apply to OSD rows only.
4320 if (!qi
.is_bucket()) {
4321 if (!osdmap
->exists(qi
.id
)) {
4326 if (osdmap
->is_up(qi
.id
)) {
4328 } else if (osdmap
->is_destroyed(qi
.id
)) {
4334 << weightf_t(osdmap
->get_weightf(qi
.id
))
4335 << weightf_t(osdmap
->get_primary_affinityf(qi
.id
));
4338 *tbl
<< TextTable::endrow
;
4342 const OSDMap
*osdmap
;
4343 const unsigned filter
;
// CRUSH tree dumper that emits the OSD tree through a Formatter.
// `filter` is a mask of OSDMap::DUMP_* bits used to select which leaf
// OSDs are shown; `osdmap` supplies the per-OSD state.
4346 class OSDTreeFormattingDumper
: public CrushTreeDumper::FormattingDumper
{
4348 typedef CrushTreeDumper::FormattingDumper Parent
;
4350 OSDTreeFormattingDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap_
,
4352 : Parent(crush
, osdmap_
->get_pool_names()), osdmap(osdmap_
), filter(f
) { }
// Decides whether leaf OSD `i` should be shown for the current filter.
// NOTE(review): the condition guarding the early "normal case" return and
// the return statements after the filter test are not visible here.
4354 bool should_dump_leaf(int i
) const override
{
4356 return true; // normal case
// Matches when any requested DUMP_* bit agrees with the OSD's state.
4358 if (((filter
& OSDMap::DUMP_UP
) && osdmap
->is_up(i
)) ||
4359 ((filter
& OSDMap::DUMP_DOWN
) && osdmap
->is_down(i
)) ||
4360 ((filter
& OSDMap::DUMP_IN
) && osdmap
->is_in(i
)) ||
4361 ((filter
& OSDMap::DUMP_OUT
) && osdmap
->is_out(i
)) ||
4362 ((filter
& OSDMap::DUMP_DESTROYED
) && osdmap
->is_destroyed(i
))) {
// NOTE(review): body not visible in this excerpt.
4368 bool should_dump_empty_bucket() const override
{
// Emits a "nodes" array followed by a "stray" array of OSDs that exist
// but were not reached by the tree walk.
4372 void dump(Formatter
*f
, string
& bucket
) {
// Both the named-bucket path and the other path open a "nodes" array.
4373 if (!bucket
.empty()) {
4375 f
->open_array_section("nodes");
4379 f
->open_array_section("nodes");
4382 f
->open_array_section("stray");
4383 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
4384 if (osdmap
->exists(i
) && !is_touched(i
) && should_dump_leaf(i
))
4385 dump_item(CrushTreeDumper::Item(i
, 0, 0, 0), f
);
// Adds OSD-specific fields on top of the generic item fields written by
// the parent class.
4392 void dump_item_fields(const CrushTreeDumper::Item
&qi
, Formatter
*f
) override
{
4393 Parent::dump_item_fields(qi
, f
);
4394 if (!qi
.is_bucket())
// `s` holds the status text chosen from the OSD's up/destroyed state
// (the assignments themselves are not visible here).
4397 if (osdmap
->is_up(qi
.id
)) {
4399 } else if (osdmap
->is_destroyed(qi
.id
)) {
4404 f
->dump_unsigned("exists", (int)osdmap
->exists(qi
.id
));
4405 f
->dump_string("status", s
);
4406 f
->dump_float("reweight", osdmap
->get_weightf(qi
.id
));
4407 f
->dump_float("primary_affinity", osdmap
->get_primary_affinityf(qi
.id
));
4412 const OSDMap
*osdmap
;
4413 const unsigned filter
;
// Dumps the OSD tree either through Formatter `f` or as a text table.
// `filter` is passed to the dumper to select which OSDs are shown and
// `bucket` is forwarded to its dump() call.
// NOTE(review): the branch choosing between the two paths and the code
// that writes `tbl` to `out` are not visible in this excerpt.
4416 void OSDMap::print_tree(Formatter
*f
, ostream
*out
, unsigned filter
, string bucket
) const
// Structured output path.
4419 OSDTreeFormattingDumper(crush
.get(), this, filter
).dump(f
, bucket
);
// Plain-text path: rows are collected in the local table `tbl`.
4423 OSDTreePlainDumper(crush
.get(), this, filter
).dump(&tbl
, bucket
);
// Emits a short OSD summary, either as Formatter fields on `f` or as text
// on `out`.
// NOTE(review): the branch selecting between the two forms and the use of
// `extra` are not visible in this excerpt.
4428 void OSDMap::print_summary(Formatter
*f
, ostream
& out
,
4429 const string
& prefix
, bool extra
) const
4432 f
->dump_int("epoch", get_epoch());
4433 f
->dump_int("num_osds", get_num_osds());
4434 f
->dump_int("num_up_osds", get_num_up_osds());
// to_msec() / 1000 turns the change timestamps into whole seconds.
4435 f
->dump_int("osd_up_since", last_up_change
.to_msec() / 1000);
4436 f
->dump_int("num_in_osds", get_num_in_osds());
4437 f
->dump_int("osd_in_since", last_in_change
.to_msec() / 1000);
4438 f
->dump_unsigned("num_remapped_pgs", get_num_pg_temp());
// Text form: counts, each with an optional "(since <span>)" suffix that is
// skipped while the corresponding timestamp is still the default utime_t().
4440 utime_t now
= ceph_clock_now();
4441 out
<< get_num_osds() << " osds: "
4442 << get_num_up_osds() << " up";
4443 if (last_up_change
!= utime_t()) {
4444 out
<< " (since " << utimespan_str(now
- last_up_change
) << ")";
4446 out
<< ", " << get_num_in_osds() << " in";
4447 if (last_in_change
!= utime_t()) {
4448 out
<< " (since " << utimespan_str(now
- last_in_change
) << ")";
4451 out
<< "; epoch: e" << get_epoch();
4452 if (get_num_pg_temp())
4453 out
<< "; " << get_num_pg_temp() << " remapped pgs";
// Flags in CEPH_OSDMAP_SEMIHIDDEN_FLAGS are masked out; the flags line is
// printed only if something remains.
4455 uint64_t important_flags
= flags
& ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS
;
4456 if (important_flags
)
4457 out
<< prefix
<< "flags " << get_flag_string(important_flags
) << "\n";
4461 void OSDMap::print_oneline_summary(ostream
& out
) const
4463 out
<< "e" << get_epoch() << ": "
4464 << get_num_osds() << " total, "
4465 << get_num_up_osds() << " up, "
4466 << get_num_in_osds() << " in";
4469 bool OSDMap::crush_rule_in_use(int rule_id
) const
4471 for (const auto &pool
: pools
) {
4472 if (pool
.second
.crush_rule
== rule_id
)
// Checks that every pool's crush rule is usable with the candidate CRUSH
// map `newcrush`; problems are described on the stream `ss`.
// NOTE(review): the return statements are not visible in this excerpt.
4478 int OSDMap::validate_crush_rules(CrushWrapper
*newcrush
,
4481 for (auto& i
: pools
) {
4482 auto& pool
= i
.second
;
4483 int ruleno
= pool
.get_crush_rule();
// The rule referenced by the pool must exist in the new map...
4484 if (!newcrush
->rule_exists(ruleno
)) {
4485 *ss
<< "pool " << i
.first
<< " references crush_rule " << ruleno
4486 << " but it is not present";
// ...and its rule type must equal the pool's type.
4489 if (newcrush
->get_rule_type(ruleno
) != (int)pool
.get_type()) {
4490 *ss
<< "pool " << i
.first
<< " type does not match rule " << ruleno
;
// Initialises this map as a simple cluster: sets timestamps and max_osd,
// builds a CRUSH map, creates a default replicated "rbd" pool from the
// osd_pool_default_* settings, and installs the default erasure-code
// profile.
// NOTE(review): several statements (including the branch choosing between
// `nosd` and config-file discovery, and the returns) are not visible in
// this excerpt.
4497 int OSDMap::build_simple_optioned(CephContext
*cct
, epoch_t e
, uuid_d
&fsid
,
4498 int nosd
, int pg_bits
, int pgp_bits
,
4501 ldout(cct
, 10) << "build_simple on " << nosd
4502 << " osds" << dendl
;
4505 created
= modified
= ceph_clock_now();
// Scan the configuration sections for entries named "osd.<N>".
4512 const auto& conf
= cct
->_conf
;
4513 vector
<string
> sections
;
4514 conf
.get_all_sections(sections
);
4516 for (auto §ion
: sections
) {
// Only section names that begin with "osd." are considered.
4517 if (section
.find("osd.") != 0)
// Parse the decimal id that follows the 4-character "osd." prefix.
4520 const char *begin
= section
.c_str() + 4;
4521 char *end
= (char*)begin
;
4522 int o
= strtol(begin
, &end
, 10);
// Ids above mon_max_osd are reported as an error.
4526 if (o
> cct
->_conf
->mon_max_osd
) {
4527 lderr(cct
) << "[osd." << o
<< "] in config has id > mon_max_osd " << cct
->_conf
->mon_max_osd
<< dendl
;
4535 set_max_osd(maxosd
+ 1);
// CRUSH map: either generated for `nosd` OSDs or derived from the config
// sections; either way the result is asserted to be 0.
4542 r
= build_simple_crush_map(cct
, *crush
, nosd
, &ss
);
4544 r
= build_simple_crush_map_from_conf(cct
, *crush
, &ss
);
4545 ceph_assert(r
== 0);
// Base multiplier for pg counts: max_osd, or 1 when max_osd is 0.
4547 int poolbase
= get_max_osd() ? get_max_osd() : 1;
4549 const int default_replicated_rule
= crush
->get_osd_pool_default_crush_replicated_rule(cct
);
4550 ceph_assert(default_replicated_rule
>= 0);
4553 // pgp_num <= pg_num
4554 if (pgp_bits
> pg_bits
)
// Create the default pool(s); only "rbd" is listed.
4557 vector
<string
> pool_names
;
4558 pool_names
.push_back("rbd");
4559 for (auto &plname
: pool_names
) {
4560 int64_t pool
= ++pool_max
;
4561 pools
[pool
].type
= pg_pool_t::TYPE_REPLICATED
;
4562 pools
[pool
].flags
= cct
->_conf
->osd_pool_default_flags
;
// Each osd_pool_default_flag_* option turns on the matching pool flag.
4563 if (cct
->_conf
->osd_pool_default_flag_hashpspool
)
4564 pools
[pool
].set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
4565 if (cct
->_conf
->osd_pool_default_flag_nodelete
)
4566 pools
[pool
].set_flag(pg_pool_t::FLAG_NODELETE
);
4567 if (cct
->_conf
->osd_pool_default_flag_nopgchange
)
4568 pools
[pool
].set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
4569 if (cct
->_conf
->osd_pool_default_flag_nosizechange
)
4570 pools
[pool
].set_flag(pg_pool_t::FLAG_NOSIZECHANGE
);
4571 if (cct
->_conf
->osd_pool_default_flag_bulk
)
4572 pools
[pool
].set_flag(pg_pool_t::FLAG_BULK
);
4573 pools
[pool
].size
= cct
->_conf
.get_val
<uint64_t>("osd_pool_default_size");
4574 pools
[pool
].min_size
= cct
->_conf
.get_osd_pool_default_min_size(
4576 pools
[pool
].crush_rule
= default_replicated_rule
;
4577 pools
[pool
].object_hash
= CEPH_STR_HASH_RJENKINS
;
// pg/pgp counts (and their targets) are poolbase shifted left by the
// requested number of bits.
4578 pools
[pool
].set_pg_num(poolbase
<< pg_bits
);
4579 pools
[pool
].set_pgp_num(poolbase
<< pgp_bits
);
4580 pools
[pool
].set_pg_num_target(poolbase
<< pg_bits
);
4581 pools
[pool
].set_pgp_num_target(poolbase
<< pgp_bits
);
4582 pools
[pool
].last_change
= epoch
;
4583 pools
[pool
].application_metadata
.insert(
4584 {pg_pool_t::APPLICATION_NAME_RBD
, {}});
// Autoscale mode comes from configuration; a name that maps to UNKNOWN
// falls back to OFF.
4585 if (auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
4586 cct
->_conf
.get_val
<string
>("osd_pool_default_pg_autoscale_mode"));
4587 m
!= pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
4588 pools
[pool
].pg_autoscale_mode
= m
;
4590 pools
[pool
].pg_autoscale_mode
= pg_pool_t::pg_autoscale_mode_t::OFF
;
// Keep both name<->id lookup tables in sync.
4592 pool_name
[pool
] = plname
;
4593 name_pool
[plname
] = pool
;
// Install the configured default erasure-code profile as "default".
4597 map
<string
,string
> profile_map
;
4598 r
= get_erasure_code_profile_default(cct
, profile_map
, &ss
);
4600 lderr(cct
) << ss
.str() << dendl
;
4603 set_erasure_code_profile("default", profile_map
);
// Parses the "osd_pool_default_erasure_code_profile" configuration value
// with get_json_str_map(); `r` holds that call's result.
// NOTE(review): the remaining arguments of the call and the return
// statement are not visible in this excerpt.
4607 int OSDMap::get_erasure_code_profile_default(CephContext
*cct
,
4608 map
<string
,string
> &profile_map
,
4611 int r
= get_json_str_map(cct
->_conf
.get_val
<string
>("osd_pool_default_erasure_code_profile"),
4617 int OSDMap::_build_crush_types(CrushWrapper
& crush
)
4619 crush
.set_type_name(0, "osd");
4620 crush
.set_type_name(1, "host");
4621 crush
.set_type_name(2, "chassis");
4622 crush
.set_type_name(3, "rack");
4623 crush
.set_type_name(4, "row");
4624 crush
.set_type_name(5, "pdu");
4625 crush
.set_type_name(6, "pod");
4626 crush
.set_type_name(7, "room");
4627 crush
.set_type_name(8, "datacenter");
4628 crush
.set_type_name(9, "zone");
4629 crush
.set_type_name(10, "region");
4630 crush
.set_type_name(11, "root");
// Populates `crush` with a minimal hierarchy: a root bucket named
// "default" plus `nosd` OSDs, all inserted at one fixed location, then
// adds the default rules via build_simple_crush_rules().
// NOTE(review): some statements (e.g. remaining `loc` entries and the
// return) are not visible in this excerpt.
4634 int OSDMap::build_simple_crush_map(CephContext
*cct
, CrushWrapper
& crush
,
4635 int nosd
, ostream
*ss
)
// The value returned by _build_crush_types() is used as the type of the
// root bucket.
4640 int root_type
= _build_crush_types(crush
);
4642 int r
= crush
.add_bucket(0, 0, CRUSH_HASH_DEFAULT
,
4643 root_type
, 0, NULL
, NULL
, &rootid
);
4644 ceph_assert(r
== 0);
4645 crush
.set_item_name(rootid
, "default");
// Location shared by every OSD added below.
4647 map
<string
,string
> loc
{
4648 {"host", "localhost"},
4649 {"rack", "localrack"},
// Insert osd.0 .. osd.(nosd-1), each with weight 1.0.
4652 for (int o
=0; o
<nosd
; o
++) {
4653 ldout(cct
, 10) << " adding osd." << o
<< " at " << loc
<< dendl
;
4655 snprintf(name
, sizeof(name
), "osd.%d", o
);
4656 crush
.insert_item(cct
, o
, 1.0, name
, loc
);
4659 build_simple_crush_rules(cct
, crush
, "default", ss
);
// Populates `crush` from the configuration: creates a root bucket named
// "default", then inserts one item per "osd.<N>" config section at the
// location described by that section's host/rack/row/room/datacenter
// values, and finally adds the default rules.
// NOTE(review): some statements (e.g. part of the `loc` assignments and
// the return) are not visible in this excerpt.
4666 int OSDMap::build_simple_crush_map_from_conf(CephContext
*cct
,
4667 CrushWrapper
& crush
,
4670 const auto& conf
= cct
->_conf
;
// The value returned by _build_crush_types() is used as the type of the
// root bucket.
4675 int root_type
= _build_crush_types(crush
);
4677 int r
= crush
.add_bucket(0, 0,
4679 root_type
, 0, NULL
, NULL
, &rootid
);
4680 ceph_assert(r
== 0);
4681 crush
.set_item_name(rootid
, "default");
4684 vector
<string
> sections
;
4685 conf
.get_all_sections(sections
);
4687 for (auto §ion
: sections
) {
// Only section names that begin with "osd." are considered.
4688 if (section
.find("osd.") != 0)
// Parse the decimal id that follows the 4-character "osd." prefix.
4691 const char *begin
= section
.c_str() + 4;
4692 char *end
= (char*)begin
;
4693 int o
= strtol(begin
, &end
, 10);
// Location keys are looked up in the sections "osd" and this OSD's own
// section.
4697 string host
, rack
, row
, room
, dc
, pool
;
4698 vector
<string
> sectiontmp
;
4699 sectiontmp
.push_back("osd");
4700 sectiontmp
.push_back(section
);
4701 conf
.get_val_from_conf_file(sectiontmp
, "host", host
, false);
4702 conf
.get_val_from_conf_file(sectiontmp
, "rack", rack
, false);
4703 conf
.get_val_from_conf_file(sectiontmp
, "row", row
, false);
4704 conf
.get_val_from_conf_file(sectiontmp
, "room", room
, false);
4705 conf
.get_val_from_conf_file(sectiontmp
, "datacenter", dc
, false);
4706 conf
.get_val_from_conf_file(sectiontmp
, "root", pool
, false);
// Host and rack get placeholder names when they are empty.
4708 if (host
.length() == 0)
4709 host
= "unknownhost";
4710 if (rack
.length() == 0)
4711 rack
= "unknownrack";
4713 map
<string
,string
> loc
;
4721 loc
["datacenter"] = dc
;
// The root location is always "default"; the "root" value read into
// `pool` above is not used for it.
4722 loc
["root"] = "default";
4724 ldout(cct
, 5) << " adding osd." << o
<< " at " << loc
<< dendl
;
// The item is named after its config section and given weight 1.0.
4725 crush
.insert_item(cct
, o
, 1.0, section
, loc
);
4728 build_simple_crush_rules(cct
, crush
, "default", ss
);
// Adds the default replicated rule ("replicated_rule", mode "firstn") to
// `crush`, rooted at `root`, using the failure domain named by the
// osd_crush_chooseleaf_type option.
// NOTE(review): the remaining add_simple_rule_at() arguments and the
// return handling are not visible in this excerpt.
4736 int OSDMap::build_simple_crush_rules(
4738 CrushWrapper
& crush
,
4742 int crush_rule
= crush
.get_osd_pool_default_crush_replicated_rule(cct
);
// The configured chooseleaf type id is translated to its type name.
4743 string failure_domain
=
4744 crush
.get_type_name(cct
->_conf
->osd_crush_chooseleaf_type
);
4747 r
= crush
.add_simple_rule_at(
4748 "replicated_rule", root
, failure_domain
, "",
4749 "firstn", pg_pool_t::TYPE_REPLICATED
,
4753 // do not add an erasure rule by default or else we will implicitly
4754 // require the crush_v2 feature of clients
// Compares PG placement under this map with placement under `newmap` and
// reports: how many PG replicas moved, the average PGs per up+in OSD, the
// standard deviation before/after, and the least/most loaded OSDs.
// Results go to Formatter `f` and/or the text stream `ss`.
// NOTE(review): many statements (counter increments, branches choosing
// the output form, returns) are not visible in this excerpt.
4758 int OSDMap::summarize_mapping_stats(
4760 const set
<int64_t> *pools
,
4768 for (auto &p
: get_pools())
4772 unsigned total_pg
= 0;
4773 unsigned moved_pg
= 0;
// Per-OSD tallies indexed by OSD id, for the current and the new mapping.
4774 vector
<unsigned> base_by_osd(get_max_osd(), 0);
4775 vector
<unsigned> new_by_osd(get_max_osd(), 0);
4776 for (int64_t pool_id
: ls
) {
4777 const pg_pool_t
*pi
= get_pg_pool(pool_id
);
4778 vector
<int> up
, up2
;
4780 for (unsigned ps
= 0; ps
< pi
->get_pg_num(); ++ps
) {
4781 pg_t
pgid(ps
, pool_id
);
// total_pg grows by the pool size for every PG.
4782 total_pg
+= pi
->get_size();
// `up` is the up set under this map.
4783 pg_to_up_acting_osds(pgid
, &up
, &up_primary
, nullptr, nullptr);
4784 for (int osd
: up
) {
// Only ids inside [0, max_osd) can index the tally vectors.
4785 if (osd
>= 0 && osd
< get_max_osd())
// `up2` is the up set under newmap.
4789 newmap
->pg_to_up_acting_osds(pgid
, &up2
, &up_primary
, nullptr, nullptr);
4790 for (int osd
: up2
) {
4791 if (osd
>= 0 && osd
< get_max_osd())
// Erasure pools compare the two up sets position by position.
// NOTE(review): up2 is indexed with up's indices; assumes
// up2.size() >= up.size() -- confirm.
4794 if (pi
->is_erasure()) {
4795 for (unsigned i
=0; i
<up
.size(); ++i
) {
4796 if (up
[i
] != up2
[i
]) {
// Replicated pools look for OSDs of `up` that are absent from `up2`.
4800 } else if (pi
->is_replicated()) {
4801 for (int osd
: up
) {
4802 if (std::find(up2
.begin(), up2
.end(), osd
) == up2
.end()) {
4807 ceph_abort_msg("unhandled pool type");
// Count the OSDs that are both up and in.
4813 unsigned num_up_in
= 0;
4814 for (int osd
= 0; osd
< get_max_osd(); ++osd
) {
4815 if (is_up(osd
) && is_in(osd
))
// NOTE(review): divides by num_up_in; a zero guard is not visible here.
4822 float avg_pg
= (float)total_pg
/ (float)num_up_in
;
4823 float base_stddev
= 0, new_stddev
= 0;
// min/max hold OSD ids; -1 means "not chosen yet".
4824 int min
= -1, max
= -1;
4825 unsigned min_base_pg
= 0, max_base_pg
= 0;
4826 unsigned min_new_pg
= 0, max_new_pg
= 0;
4827 for (int osd
= 0; osd
< get_max_osd(); ++osd
) {
4828 if (is_up(osd
) && is_in(osd
)) {
// Accumulate squared deviations from the average for both mappings.
4829 float base_diff
= (float)base_by_osd
[osd
] - avg_pg
;
4830 base_stddev
+= base_diff
* base_diff
;
4831 float new_diff
= (float)new_by_osd
[osd
] - avg_pg
;
4832 new_stddev
+= new_diff
* new_diff
;
// The extreme OSDs are selected by their count under the current
// mapping; the new-mapping count of that same OSD is recorded alongside.
4833 if (min
< 0 || base_by_osd
[osd
] < min_base_pg
) {
4835 min_base_pg
= base_by_osd
[osd
];
4836 min_new_pg
= new_by_osd
[osd
];
4838 if (max
< 0 || base_by_osd
[osd
] > max_base_pg
) {
4840 max_base_pg
= base_by_osd
[osd
];
4841 max_new_pg
= new_by_osd
[osd
];
4845 base_stddev
= sqrt(base_stddev
/ num_up_in
);
4846 new_stddev
= sqrt(new_stddev
/ num_up_in
);
// Expected baseline deviation: sqrt(avg * (1 - 1/num_up_in)).
4848 float edev
= sqrt(avg_pg
* (1.0 - (1.0 / (double)num_up_in
)));
4852 f
->open_object_section("utilization");
4855 f
->dump_unsigned("moved_pgs", moved_pg
);
4856 f
->dump_unsigned("total_pgs", total_pg
);
4860 percent
= (float)moved_pg
* 100.0 / (float)total_pg
;
4861 ss
<< "moved " << moved_pg
<< " / " << total_pg
4862 << " (" << percent
<< "%)\n";
4866 f
->dump_float("avg_pgs", avg_pg
);
4867 f
->dump_float("std_dev", base_stddev
);
4868 f
->dump_float("expected_baseline_std_dev", edev
);
4870 f
->dump_float("new_std_dev", new_stddev
);
4872 ss
<< "avg " << avg_pg
<< "\n";
4873 ss
<< "stddev " << base_stddev
;
4875 ss
<< " -> " << new_stddev
;
4876 ss
<< " (expected baseline " << edev
<< ")\n";
// Least-loaded OSD.
4880 f
->dump_unsigned("min_osd", min
);
4881 f
->dump_unsigned("min_osd_pgs", min_base_pg
);
4883 f
->dump_unsigned("new_min_osd_pgs", min_new_pg
);
4885 ss
<< "min osd." << min
<< " with " << min_base_pg
;
4887 ss
<< " -> " << min_new_pg
;
4888 ss
<< " pgs (" << (float)min_base_pg
/ avg_pg
;
4890 ss
<< " -> " << (float)min_new_pg
/ avg_pg
;
// Most-loaded OSD.
4896 f
->dump_unsigned("max_osd", max
);
4897 f
->dump_unsigned("max_osd_pgs", max_base_pg
);
4899 f
->dump_unsigned("new_max_osd_pgs", max_new_pg
);
4901 ss
<< "max osd." << max
<< " with " << max_base_pg
;
4903 ss
<< " -> " << max_new_pg
;
4904 ss
<< " pgs (" << (float)max_base_pg
/ avg_pg
;
4906 ss
<< " -> " << (float)max_new_pg
/ avg_pg
;
// Attempts to compute an alternative mapping for `pg` that moves it off
// overfull OSDs, using the pool's crush rule and
// CrushWrapper::try_remap_rule(); the result is written to `out`.
// NOTE(review): several parameters, the early exits and the return
// statements are not visible in this excerpt.
4917 bool OSDMap::try_pg_upmap(
4919 pg_t pg
, ///< pg to potentially remap
4920 const set
<int>& overfull
, ///< osds we'd want to evacuate
4921 const vector
<int>& underfull
, ///< osds to move to, in order of preference
4922 const vector
<int>& more_underfull
, ///< more osds only slightly underfull
4924 vector
<int> *out
) ///< resulting alternative mapping
4926 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
4929 int rule
= pool
->get_crush_rule();
4933 // make sure there is something there to remap
// Looks for a member of the original mapping that is in `overfull`.
4935 for (auto osd
: *orig
) {
4936 if (overfull
.count(osd
)) {
4945 int r
= crush
->try_remap_rule(
4949 overfull
, underfull
,
// Tries to even out the number of primary PGs per OSD for replicated
// pool `pid`. Candidate changes are applied to `tmp_osd_map` and recorded
// in `pending_inc->new_pg_upmap_primary`; the read balance score is
// computed before and after to judge the result.
// NOTE(review): several statements (loop framing, counter updates,
// returns) are not visible in this excerpt.
4961 int OSDMap::balance_primaries(
4964 OSDMap::Incremental
*pending_inc
,
4965 OSDMap
& tmp_osd_map
) const
4967 // This function only handles replicated pools.
// NOTE(review): `pool` is dereferenced without a visible null check --
// assumes `pid` names an existing pool.
4968 const pg_pool_t
* pool
= get_pg_pool(pid
);
4969 if (! pool
->is_replicated()) {
4970 ldout(cct
, 10) << __func__
<< " skipping erasure pool "
4971 << get_pool_name(pid
) << dendl
;
4975 // Info to be used in verify_upmap
4976 int pool_size
= pool
->get_size();
4977 int crush_rule
= pool
->get_crush_rule();
4979 // Get pgs by osd (map of osd -> pgs)
4980 // Get primaries by osd (map of osd -> primary)
4981 map
<uint64_t,set
<pg_t
>> pgs_by_osd
;
4982 map
<uint64_t,set
<pg_t
>> prim_pgs_by_osd
;
4983 map
<uint64_t,set
<pg_t
>> acting_prims_by_osd
;
4984 pgs_by_osd
= tmp_osd_map
.get_pgs_by_osd(cct
, pid
, &prim_pgs_by_osd
, &acting_prims_by_osd
);
4986 // Construct information about the pgs and osds we will consider in new primary mappings,
4987 // as well as a map of all pgs and their original primary osds.
// prim_pgs_to_check maps each primary PG to "was remapped" (initially
// false).
4988 map
<pg_t
,bool> prim_pgs_to_check
;
4989 vector
<uint64_t> osds_to_check
;
4990 map
<pg_t
, uint64_t> orig_prims
;
4991 for (const auto & [osd
, pgs
] : prim_pgs_by_osd
) {
4992 osds_to_check
.push_back(osd
);
4993 for (const auto & pg
: pgs
) {
4994 prim_pgs_to_check
.insert({pg
, false});
4995 orig_prims
.insert({pg
, osd
});
4999 // calculate desired primary distribution for each osd
5000 map
<uint64_t,float> desired_prim_dist
;
5002 rc
= calc_desired_primary_distribution(cct
, pid
, osds_to_check
, desired_prim_dist
);
5004 ldout(cct
, 10) << __func__
<< " Error in calculating desired primary distribution" << dendl
;
// Score per OSD = actual primaries - desired primaries; positive means
// the OSD holds more primaries than desired.
5007 map
<uint64_t,float> prim_dist_scores
;
5010 for (auto osd
: osds_to_check
) {
5011 actual
= prim_pgs_by_osd
[osd
].size();
5012 desired
= desired_prim_dist
[osd
];
5013 prim_dist_scores
[osd
] = actual
- desired
;
5014 ldout(cct
, 10) << __func__
<< " desired distribution for osd." << osd
<< " " << desired
<< dendl
;
5017 // get read balance score before balancing
5018 float read_balance_score_before
= 0.0;
5019 read_balance_info_t rb_info
;
5020 rc
= tmp_osd_map
.calc_read_balance_score(cct
, pid
, &rb_info
);
5022 read_balance_score_before
= rb_info
.adjusted_score
;
5024 if (rb_info
.err_msg
.length() > 0) {
5025 ldout(cct
, 10) << __func__
<< (rc
< 0 ? " ERROR: " : " Warning: ") << rb_info
.err_msg
<< dendl
;
5029 // get ready to swap pgs
5031 int curr_num_changes
= 0;
5032 vector
<int> up_osds
;
5033 vector
<int> acting_osds
;
5034 int up_primary
, acting_primary
;
5035 for (const auto & [pg
, mapped
] : prim_pgs_to_check
) {
5036 // fill in the up, up primary, acting, and acting primary for the current PG
5037 tmp_osd_map
.pg_to_up_acting_osds(pg
, &up_osds
, &up_primary
,
5038 &acting_osds
, &acting_primary
);
5040 // find the OSD that would make the best swap based on its score
5041 // We start by first testing the OSD that is currently primary for the PG we are checking.
5042 uint64_t curr_best_osd
= up_primary
;
5043 float prim_score
= prim_dist_scores
[up_primary
];
5044 for (auto potential_osd
: up_osds
) {
5045 float potential_score
= prim_dist_scores
[potential_osd
];
5046 if ((prim_score
> 0) && // taking 1 pg from the prim would not make its score worse
5047 (potential_score
< 0) && // adding 1 pg to the potential would not make its score worse
5048 ((prim_score
- potential_score
) > 1) && // swapping a pg would not just keep the scores the same
5049 (desired_prim_dist
[potential_osd
] > 0)) // the potential is not off limits (the primary affinity is above 0)
5051 curr_best_osd
= potential_osd
;
5055 // Make the swap only if:
5056 // 1. The swap is legal
5057 // 2. The balancer has chosen a new primary
5058 auto legal_swap
= crush
->verify_upmap(cct
,
5061 {(int)curr_best_osd
});
5062 if (legal_swap
>= 0 &&
5063 ((int)curr_best_osd
!= up_primary
)) {
5064 // Update prim_dist_scores
5065 prim_dist_scores
[curr_best_osd
] += 1;
5066 prim_dist_scores
[up_primary
] -= 1;
5068 // Update the mappings
5069 tmp_osd_map
.pg_upmap_primaries
[pg
] = curr_best_osd
;
// Moving a PG back to its original primary removes the pending entry
// instead of recording a new one.
5070 if (curr_best_osd
== orig_prims
[pg
]) {
5071 pending_inc
->new_pg_upmap_primary
.erase(pg
);
5072 prim_pgs_to_check
[pg
] = false;
5074 pending_inc
->new_pg_upmap_primary
[pg
] = curr_best_osd
;
5075 prim_pgs_to_check
[pg
] = true; // mark that this pg changed mappings
5080 ldout(cct
, 20) << __func__
<< " curr_num_changes: " << curr_num_changes
<< dendl
;
5082 // If there are no changes after one pass through the pgs, then no further optimizations can be made.
5083 if (curr_num_changes
== 0) {
5084 ldout(cct
, 20) << __func__
<< " curr_num_changes is 0; no further optimizations can be made." << dendl
;
5089 // get read balance score after balancing
5090 float read_balance_score_after
= 0.0;
5091 rc
= tmp_osd_map
.calc_read_balance_score(cct
, pid
, &rb_info
);
5093 read_balance_score_after
= rb_info
.adjusted_score
;
5095 if (rb_info
.err_msg
.length() > 0) {
5096 ldout(cct
, 10) << __func__
<< (rc
< 0 ? " ERROR: " : " Warning: ") << rb_info
.err_msg
<< dendl
;
5100 // Tally total number of changes
5101 int num_changes
= 0;
// Changes are tallied only when the score after balancing is lower than
// the score before.
5102 if (read_balance_score_after
< read_balance_score_before
) {
5103 for (auto [pg
, mapped
] : prim_pgs_to_check
) {
5110 ldout(cct
, 10) << __func__
<< " num_changes " << num_changes
<< dendl
;
// Computes, for replicated pool `pid`, the desired (fractional) number of
// primary PGs for each OSD in `osds`, weighted by primary affinity, and
// stores it in `desired_primary_distribution`. The values are scaled so
// that they sum to the pool's pg_num.
// NOTE(review): the return statements are not visible in this excerpt.
5114 int OSDMap::calc_desired_primary_distribution(
5117 const vector
<uint64_t> &osds
,
5118 std::map
<uint64_t, float>& desired_primary_distribution
) const
5120 // will return a perfect distribution of floats
5121 // without calculating the floor of each value
5123 // This function only handles replicated pools.
// NOTE(review): `pool` is dereferenced without a visible null check --
// assumes `pid` names an existing pool.
5124 const pg_pool_t
* pool
= get_pg_pool(pid
);
5125 if (pool
->is_replicated()) {
5126 ldout(cct
, 20) << __func__
<< " calculating distribution for replicated pool "
5127 << get_pool_name(pid
) << dendl
;
5128 uint64_t replica_count
= pool
->get_size();
5130 map
<uint64_t,set
<pg_t
>> pgs_by_osd
;
5131 pgs_by_osd
= get_pgs_by_osd(cct
, pid
);
5133 // First calculate the distribution using primary affinity and tally up the sum
5134 auto distribution_sum
= 0.0;
5135 for (const auto & osd
: osds
) {
// Share for this OSD: (PGs on the OSD / replica count) * primary affinity.
5136 float osd_primary_count
= ((float)pgs_by_osd
[osd
].size() / (float)replica_count
) * get_primary_affinityf(osd
);
5137 desired_primary_distribution
.insert({osd
, osd_primary_count
});
5138 distribution_sum
+= osd_primary_count
;
// A non-positive sum cannot be scaled (it is the divisor below).
5140 if (distribution_sum
<= 0) {
5141 ldout(cct
, 10) << __func__
<< " Unable to calculate primary distribution, likely because primary affinity is"
5142 << " set to 0 on all OSDs." << dendl
;
5146 // Then, stretch the value (necessary when primary affinity is smaller than 1)
5147 float factor
= (float)pool
->get_pg_num() / (float)distribution_sum
;
5148 float distribution_sum_desired
= 0.0;
5150 ceph_assert(factor
>= 1.0);
5151 for (const auto & [osd
, osd_primary_count
] : desired_primary_distribution
) {
5152 desired_primary_distribution
[osd
] *= factor
;
5153 distribution_sum_desired
+= desired_primary_distribution
[osd
];
// Sanity check: the scaled values must add up to pg_num (within 0.01).
5155 ceph_assert(fabs(distribution_sum_desired
- pool
->get_pg_num()) < 0.01);
5157 ldout(cct
, 10) << __func__
<<" skipping erasure pool "
5158 << get_pool_name(pid
) << dendl
;
5165 int OSDMap::calc_pg_upmaps(
5167 uint32_t max_deviation
,
5169 const set
<int64_t>& only_pools
,
5170 OSDMap::Incremental
*pending_inc
,
5171 std::random_device::result_type
*p_seed
)
5173 ldout(cct
, 10) << __func__
<< " pools " << only_pools
<< dendl
;
5175 // Can't be less than 1 pg
5176 if (max_deviation
< 1)
5178 tmp_osd_map
.deepish_copy_from(*this);
5179 int num_changed
= 0;
5180 map
<int,set
<pg_t
>> pgs_by_osd
;
5182 float osd_weight_total
= 0;
5183 map
<int,float> osd_weight
;
5186 lderr(cct
) << __func__
<< " abort due to max <= 0" << dendl
;
5190 osd_weight_total
= build_pool_pgs_info(cct
, only_pools
, tmp_osd_map
,
5191 total_pgs
, pgs_by_osd
, osd_weight
);
5192 if (osd_weight_total
== 0) {
5193 lderr(cct
) << __func__
<< " abort due to osd_weight_total == 0" << dendl
;
5197 float pgs_per_weight
= total_pgs
/ osd_weight_total
;
5198 ldout(cct
, 10) << " osd_weight_total " << osd_weight_total
<< dendl
;
5199 ldout(cct
, 10) << " pgs_per_weight " << pgs_per_weight
<< dendl
;
5202 map
<int,float> osd_deviation
; // osd, deviation(pgs)
5203 multimap
<float,int> deviation_osd
; // deviation(pgs), osd
5204 float cur_max_deviation
= calc_deviations(cct
, pgs_by_osd
, osd_weight
, pgs_per_weight
,
5205 osd_deviation
, deviation_osd
, stddev
);
5207 ldout(cct
, 20) << " stdev " << stddev
<< " max_deviation " << cur_max_deviation
<< dendl
;
5208 if (cur_max_deviation
<= max_deviation
) {
5209 ldout(cct
, 10) << __func__
<< " distribution is almost perfect"
5214 bool skip_overfull
= false;
5216 cct
->_conf
.get_val
<bool>("osd_calc_pg_upmaps_aggressively");
5217 auto fast_aggressive
= aggressive
&&
5218 cct
->_conf
.get_val
<bool>("osd_calc_pg_upmaps_aggressively_fast");
5219 auto local_fallback_retries
=
5220 cct
->_conf
.get_val
<uint64_t>("osd_calc_pg_upmaps_local_fallback_retries");
5223 ldout(cct
, 30) << "Top of loop #" << max
+1 << dendl
;
5224 // build overfull and underfull
5226 set
<int> more_overfull
;
5227 bool using_more_overfull
= false;
5228 vector
<int> underfull
;
5229 vector
<int> more_underfull
;
5230 fill_overfull_underfull(cct
, deviation_osd
, max_deviation
,
5231 overfull
, more_overfull
,
5232 underfull
, more_underfull
);
5234 if (underfull
.empty() && overfull
.empty()) {
5235 ldout(cct
, 20) << __func__
<< " failed to build overfull and underfull" << dendl
;
5238 if (overfull
.empty() && !underfull
.empty()) {
5239 ldout(cct
, 20) << __func__
<< " Using more_overfull since we still have underfull" << dendl
;
5240 overfull
= more_overfull
;
5241 using_more_overfull
= true;
5244 ldout(cct
, 10) << " overfull " << overfull
5245 << " underfull " << underfull
5248 uint64_t local_fallback_retried
= 0;
5250 // Used to prevent some of the unsuccessful loop iterations (save runtime)
5251 // If we can't find a change per OSD we skip further iterations for this OSD
5252 uint n_changes
= 0, prev_n_changes
= 0;
5253 set
<int> osd_to_skip
;
5258 map
<pg_t
, mempool::osdmap::vector
<pair
<int32_t,int32_t>>> to_upmap
;
5259 auto temp_pgs_by_osd
= pgs_by_osd
;
5260 // always start with fullest, break if we find any changes to make
5261 for (auto p
= deviation_osd
.rbegin(); p
!= deviation_osd
.rend(); ++p
) {
5262 if (skip_overfull
&& !underfull
.empty()) {
5263 ldout(cct
, 10) << " skipping overfull " << dendl
;
5264 break; // fall through to check underfull
5266 int osd
= p
->second
;
5267 float deviation
= p
->first
;
5268 if (fast_aggressive
&& osd_to_skip
.count(osd
)) {
5269 ldout(cct
, 20) << " Fast aggressive mode: skipping osd " << osd
5270 << " osd_to_skip size = " << osd_to_skip
.size() << dendl
;
5274 if (deviation
< 0) {
5275 ldout(cct
, 10) << " hitting underfull osds now"
5276 << " when trying to remap overfull osds"
5280 float target
= osd_weight
[osd
] * pgs_per_weight
;
5281 ldout(cct
, 10) << " Overfull search osd." << osd
5282 << " target " << target
5283 << " deviation " << deviation
5285 ceph_assert(target
> 0);
5286 if (!using_more_overfull
&& deviation
<= max_deviation
) {
5287 ldout(cct
, 10) << " osd." << osd
5288 << " target " << target
5289 << " deviation " << deviation
5290 << " < max deviation " << max_deviation
5296 pgs
.reserve(pgs_by_osd
[osd
].size());
5297 for (auto& pg
: pgs_by_osd
[osd
]) {
5298 if (to_skip
.count(pg
))
5303 // shuffle PG list so they all get equal (in)attention
5304 std::shuffle(pgs
.begin(), pgs
.end(), get_random_engine(cct
, p_seed
));
5306 // look for remaps we can un-remap
5307 if (try_drop_remap_overfull(cct
, pgs
, tmp_osd_map
, osd
,
5308 temp_pgs_by_osd
, to_unmap
, to_upmap
))
5312 for (auto pg
: pgs
) {
5313 auto temp_it
= tmp_osd_map
.pg_upmap
.find(pg
);
5314 if (temp_it
!= tmp_osd_map
.pg_upmap
.end()) {
5315 // leave pg_upmap alone
5316 // it must be specified by admin since balancer does not
5317 // support pg_upmap yet
5318 ldout(cct
, 10) << " " << pg
<< " already has pg_upmap "
5319 << temp_it
->second
<< ", skipping"
5323 auto pg_pool_size
= tmp_osd_map
.get_pg_pool_size(pg
);
5324 mempool::osdmap::vector
<pair
<int32_t,int32_t>> new_upmap_items
;
5326 auto it
= tmp_osd_map
.pg_upmap_items
.find(pg
);
5327 if (it
!= tmp_osd_map
.pg_upmap_items
.end()) {
5328 auto& um_items
= it
->second
;
5329 if (um_items
.size() >= (size_t)pg_pool_size
) {
5330 ldout(cct
, 10) << " " << pg
<< " already has full-size pg_upmap_items "
5331 << um_items
<< ", skipping"
5335 ldout(cct
, 10) << " " << pg
<< " already has pg_upmap_items "
5338 new_upmap_items
= um_items
;
5339 // build existing too (for dedup)
5340 for (auto [um_from
, um_to
] : um_items
) {
5341 existing
.insert(um_from
);
5342 existing
.insert(um_to
);
5346 // to see if we can append more remapping pairs
5348 ldout(cct
, 10) << " trying " << pg
<< dendl
;
5349 vector
<int> raw
, orig
, out
;
5350 tmp_osd_map
.pg_to_raw_upmap(pg
, &raw
, &orig
); // including existing upmaps too
5351 if (!try_pg_upmap(cct
, pg
, overfull
, underfull
, more_underfull
, &orig
, &out
)) {
5354 ldout(cct
, 10) << " " << pg
<< " " << orig
<< " -> " << out
<< dendl
;
5355 if (orig
.size() != out
.size()) {
5358 ceph_assert(orig
!= out
);
5359 int pos
= find_best_remap(cct
, orig
, out
, existing
, osd_deviation
);
5361 // append new remapping pairs slowly
5362 // This way we can make sure that each tiny change will
5363 // definitely make distribution of PGs converging to
5364 // the perfect status.
5365 add_remap_pair(cct
, orig
[pos
], out
[pos
], pg
, (size_t)pg_pool_size
,
5366 osd
, existing
, temp_pgs_by_osd
,
5367 new_upmap_items
, to_upmap
);
5371 if (fast_aggressive
) {
5372 if (prev_n_changes
== n_changes
) { // no changes for prev OSD
5373 osd_to_skip
.insert(osd
);
5376 prev_n_changes
= n_changes
;
5382 ceph_assert(!(to_unmap
.size() || to_upmap
.size()));
5383 ldout(cct
, 10) << " failed to find any changes for overfull osds"
5385 for (auto& [deviation
, osd
] : deviation_osd
) {
5386 if (std::find(underfull
.begin(), underfull
.end(), osd
) ==
5389 float target
= osd_weight
[osd
] * pgs_per_weight
;
5390 ceph_assert(target
> 0);
5391 if (fabsf(deviation
) < max_deviation
) {
5392 // respect max_deviation too
5393 ldout(cct
, 10) << " osd." << osd
5394 << " target " << target
5395 << " deviation " << deviation
5396 << " -> absolute " << fabsf(deviation
)
5397 << " < max " << max_deviation
5401 // look for remaps we can un-remap
5402 candidates_t candidates
= build_candidates(cct
, tmp_osd_map
, to_skip
,
5403 only_pools
, aggressive
, p_seed
);
5404 if (try_drop_remap_underfull(cct
, candidates
, osd
, temp_pgs_by_osd
,
5405 to_unmap
, to_upmap
)) {
5410 ceph_assert(!(to_unmap
.size() || to_upmap
.size()));
5411 ldout(cct
, 10) << " failed to find any changes for underfull osds"
5414 ldout(cct
, 10) << " break due to aggressive mode not enabled" << dendl
;
5416 } else if (!skip_overfull
) {
5417 // safe to quit because below here we know
5418 // we've done checking both overfull and underfull osds..
5419 ldout(cct
, 10) << " break due to not being able to find any"
5420 << " further optimizations"
5424 // restart with fullest and do exhaustive searching
5425 skip_overfull
= false;
5430 // test change, apply if change is good
5431 ceph_assert(to_unmap
.size() || to_upmap
.size());
5432 float new_stddev
= 0;
5433 map
<int,float> temp_osd_deviation
;
5434 multimap
<float,int> temp_deviation_osd
;
5435 float cur_max_deviation
= calc_deviations(cct
, temp_pgs_by_osd
, osd_weight
,
5436 pgs_per_weight
, temp_osd_deviation
,
5437 temp_deviation_osd
, new_stddev
);
5438 ldout(cct
, 10) << " stddev " << stddev
<< " -> " << new_stddev
<< dendl
;
5439 if (new_stddev
>= stddev
) {
5441 ldout(cct
, 10) << " break because stddev is not decreasing"
5442 << " and aggressive mode is not enabled"
5446 local_fallback_retried
++;
5447 if (local_fallback_retried
>= local_fallback_retries
) {
5448 // does not make progress
5449 // flip *skip_overfull* so both overfull and underfull
5450 // get equal (in)attention
5451 skip_overfull
= !skip_overfull
;
5452 ldout(cct
, 10) << " hit local_fallback_retries "
5453 << local_fallback_retries
5457 for (auto& i
: to_unmap
)
5459 for (auto& i
: to_upmap
)
5460 to_skip
.insert(i
.first
);
5461 ldout(cct
, 20) << " local_fallback_retried " << local_fallback_retried
5462 << " to_skip " << to_skip
5468 ceph_assert(new_stddev
< stddev
);
5469 stddev
= new_stddev
;
5470 pgs_by_osd
= temp_pgs_by_osd
;
5471 osd_deviation
= temp_osd_deviation
;
5472 deviation_osd
= temp_deviation_osd
;
5476 num_changed
+= pack_upmap_results(cct
, to_unmap
, to_upmap
, tmp_osd_map
, pending_inc
);
5478 ldout(cct
, 20) << " stdev " << stddev
<< " max_deviation " << cur_max_deviation
<< dendl
;
5479 if (cur_max_deviation
<= max_deviation
) {
5480 ldout(cct
, 10) << __func__
<< " Optimization plan is almost perfect"
5485 ldout(cct
, 10) << " num_changed = " << num_changed
<< dendl
;
5489 map
<uint64_t,set
<pg_t
>> OSDMap::get_pgs_by_osd(
5492 map
<uint64_t, set
<pg_t
>> *p_primaries_by_osd
,
5493 map
<uint64_t, set
<pg_t
>> *p_acting_primaries_by_osd
) const
5495 // Set up the OSDMap
5497 tmp_osd_map
.deepish_copy_from(*this);
5499 // Get the pool from the provided pool id
5500 const pg_pool_t
* pool
= get_pg_pool(pid
);
5502 // build array of pgs from the pool
5503 map
<uint64_t,set
<pg_t
>> pgs_by_osd
;
5504 for (unsigned ps
= 0; ps
< pool
->get_pg_num(); ++ps
) {
5509 tmp_osd_map
.pg_to_up_acting_osds(pg
, &up
, &primary
, nullptr, &acting_prim
);
5511 ldout(cct
, 20) << __func__
<< " " << pg
5513 << " primary " << primary
5514 << " acting_primary " << acting_prim
5517 if (!up
.empty()) { // up can be empty is test generated files
5518 // in this case, we return empty result
5519 for (auto osd
: up
) {
5520 if (osd
!= CRUSH_ITEM_NONE
)
5521 pgs_by_osd
[osd
].insert(pg
);
5523 if (p_primaries_by_osd
!= nullptr) {
5524 if (primary
!= CRUSH_ITEM_NONE
)
5525 (*p_primaries_by_osd
)[primary
].insert(pg
);
5527 if (p_acting_primaries_by_osd
!= nullptr) {
5528 if (acting_prim
!= CRUSH_ITEM_NONE
)
5529 (*p_acting_primaries_by_osd
)[acting_prim
].insert(pg
);
5536 float OSDMap::get_osds_weight(
5538 const OSDMap
& tmp_osd_map
,
5540 map
<int,float>& osds_weight
) const
5542 map
<int,float> pmap
;
5543 ceph_assert(pools
.count(pid
));
5544 int ruleno
= pools
.at(pid
).get_crush_rule();
5545 tmp_osd_map
.crush
->get_rule_weight_osd_map(ruleno
, &pmap
);
5546 ldout(cct
,20) << __func__
<< " pool " << pid
5547 << " ruleno " << ruleno
5548 << " weight-map " << pmap
5550 float osds_weight_total
= 0;
5551 for (auto [oid
, oweight
] : pmap
) {
5552 auto adjusted_weight
= tmp_osd_map
.get_weightf(oid
) * oweight
;
5553 if (adjusted_weight
!= 0) {
5554 osds_weight
[oid
] += adjusted_weight
;
5555 osds_weight_total
+= adjusted_weight
;
5558 return osds_weight_total
;
5561 float OSDMap::build_pool_pgs_info (
5563 const std::set
<int64_t>& only_pools
, ///< [optional] restrict to pool
5564 const OSDMap
& tmp_osd_map
,
5566 map
<int,set
<pg_t
>>& pgs_by_osd
,
5567 map
<int,float>& osds_weight
)
5570 // This function builds some data structures that are used by calc_pg_upmaps.
5571 // Specifically it builds pgs_by_osd and osd_weight maps, updates total_pgs
5572 // and returns the osd_weight_total
5574 float osds_weight_total
= 0.0;
5575 for (auto& [pid
, pdata
] : pools
) {
5576 if (!only_pools
.empty() && !only_pools
.count(pid
))
5578 for (unsigned ps
= 0; ps
< pdata
.get_pg_num(); ++ps
) {
5581 tmp_osd_map
.pg_to_up_acting_osds(pg
, &up
, nullptr, nullptr, nullptr);
5582 ldout(cct
, 20) << __func__
<< " " << pg
<< " up " << up
<< dendl
;
5583 for (auto osd
: up
) {
5584 if (osd
!= CRUSH_ITEM_NONE
)
5585 pgs_by_osd
[osd
].insert(pg
);
5588 total_pgs
+= pdata
.get_size() * pdata
.get_pg_num();
5590 osds_weight_total
= get_osds_weight(cct
, tmp_osd_map
, pid
, osds_weight
);
5592 for (auto& [oid
, oweight
] : osds_weight
) {
5594 auto p
= pgs_by_osd
.find(oid
);
5595 if (p
!= pgs_by_osd
.end())
5596 pgs
= p
->second
.size();
5598 pgs_by_osd
.emplace(oid
, set
<pg_t
>());
5599 ldout(cct
, 20) << " osd." << oid
<< " weight " << oweight
5600 << " pgs " << pgs
<< dendl
;
5602 return osds_weight_total
;
5604 } // return total weight of all OSDs
5606 float OSDMap::calc_deviations (
5608 const map
<int,set
<pg_t
>>& pgs_by_osd
,
5609 const map
<int,float>& osd_weight
,
5610 float pgs_per_weight
,
5611 map
<int,float>& osd_deviation
,
5612 multimap
<float,int>& deviation_osd
,
5613 float& stddev
) // return current max deviation
5616 // This function calculates the 2 maps osd_deviation and deviation_osd which
5617 // hold the deviation between the current number of PGs which map to an OSD
5618 // and the optimal number. It also calculates the stddev of the deviations and
5619 // returns the current max deviation.
5620 // NOTE - the calculation is not exactly stddev, it is actually stddev^2, but as
5621 // long as it is monotonic with stddev (and it is), it is sufficient for
5622 // the balancer code.
5624 float cur_max_deviation
= 0.0;
5626 for (auto& [oid
, opgs
] : pgs_by_osd
) {
5627 // make sure osd is still there (belongs to this crush-tree)
5628 ceph_assert(osd_weight
.count(oid
));
5629 float target
= osd_weight
.at(oid
) * pgs_per_weight
;
5630 float deviation
= (float)opgs
.size() - target
;
5631 ldout(cct
, 20) << " osd." << oid
5632 << "\tpgs " << opgs
.size()
5633 << "\ttarget " << target
5634 << "\tdeviation " << deviation
5636 osd_deviation
[oid
] = deviation
;
5637 deviation_osd
.insert(make_pair(deviation
, oid
));
5638 stddev
+= deviation
* deviation
;
5639 if (fabsf(deviation
) > cur_max_deviation
)
5640 cur_max_deviation
= fabsf(deviation
);
5642 return cur_max_deviation
;
5645 void OSDMap::fill_overfull_underfull (
5647 const std::multimap
<float,int>& deviation_osd
,
5649 std::set
<int>& overfull
,
5650 std::set
<int>& more_overfull
,
5651 std::vector
<int>& underfull
,
5652 std::vector
<int>& more_underfull
)
5655 // This function just fills the overfull and underfull data structures for the
5656 // use of calc_pg_upmaps
5658 for (auto i
= deviation_osd
.rbegin(); i
!= deviation_osd
.rend(); i
++) {
5659 auto& odev
= i
->first
;
5660 auto& oid
= i
->second
;
5661 ldout(cct
, 30) << " check " << odev
<< " <= " << max_deviation
<< dendl
;
5664 if (odev
> max_deviation
) {
5665 ldout(cct
, 30) << " add overfull osd." << oid
<< dendl
;
5666 overfull
.insert(oid
);
5668 more_overfull
.insert(oid
);
5672 for (auto i
= deviation_osd
.begin(); i
!= deviation_osd
.end(); i
++) {
5673 auto& odev
= i
->first
;
5674 auto& oid
= i
->second
;
5675 ldout(cct
, 30) << " check " << odev
<< " >= " << -(int)max_deviation
<< dendl
;
5678 if (odev
< -(int)max_deviation
) {
5679 ldout(cct
, 30) << " add underfull osd." << oid
<< dendl
;
5680 underfull
.push_back(oid
);
5682 more_underfull
.push_back(oid
);
5687 int OSDMap::pack_upmap_results(
5689 const std::set
<pg_t
>& to_unmap
,
5690 const std::map
<pg_t
, mempool::osdmap::vector
<std::pair
<int, int>>>& to_upmap
,
5691 OSDMap
& tmp_osd_map
,
5692 OSDMap::Incremental
*pending_inc
)
5695 // This function takes the input from the local variables to_unmap and to_upmap
5696 // and updates tmp_osd_map (so that another iteration can run) and pending_inc
5697 // (so that the results are visible outside calc_pg_upmaps)
5699 int num_changed
= 0;
5700 for (auto& i
: to_unmap
) {
5701 ldout(cct
, 10) << " unmap pg " << i
<< dendl
;
5702 ceph_assert(tmp_osd_map
.pg_upmap_items
.count(i
));
5703 tmp_osd_map
.pg_upmap_items
.erase(i
);
5704 pending_inc
->old_pg_upmap_items
.insert(i
);
5707 for (auto& [pg
, um_items
] : to_upmap
) {
5708 ldout(cct
, 10) << " upmap pg " << pg
5709 << " new pg_upmap_items " << um_items
5711 tmp_osd_map
.pg_upmap_items
[pg
] = um_items
;
5712 pending_inc
->new_pg_upmap_items
[pg
] = um_items
;
5719 std::default_random_engine
OSDMap::get_random_engine(
5721 std::random_device::result_type
*p_seed
)
5724 // This function creates a random_engine to be used for shuffling.
5725 // When p_seed == nullptr it generates random engine with a seed from /dev/random
5726 // when p_seed is not null, it uses (*p_seed + seed_set) as the seed and
5727 // increments seed_set. This is used in order to craete regression test without
5728 // random effect on the results.
5730 static std::random_device::result_type seed_set
= 0;
5731 std::random_device::result_type seed
;
5732 if (p_seed
== nullptr) {
5733 std::random_device rd
;
5737 seed
= *p_seed
+ seed_set
;
5738 ldout(cct
, 30) << " Starting random engine with seed "
5742 return std::default_random_engine
{seed
};
5745 bool OSDMap::try_drop_remap_overfull(
5747 const std::vector
<pg_t
>& pgs
,
5748 const OSDMap
& tmp_osd_map
,
5750 map
<int,std::set
<pg_t
>>& temp_pgs_by_osd
,
5751 set
<pg_t
>& to_unmap
,
5752 map
<pg_t
, mempool::osdmap::vector
<pair
<int32_t,int32_t>>>& to_upmap
)
5755 // This function tries to drop existing upmap items which map data to overfull
5756 // OSDs. It updates temp_pgs_by_osd, to_unmap and to_upmap and returns true
5757 // if it found an item that can be dropped, false if not.
5759 for (auto pg
: pgs
) {
5760 auto p
= tmp_osd_map
.pg_upmap_items
.find(pg
);
5761 if (p
== tmp_osd_map
.pg_upmap_items
.end())
5763 mempool::osdmap::vector
<pair
<int32_t,int32_t>> new_upmap_items
;
5764 auto& pg_upmap_items
= p
->second
;
5765 for (auto um_pair
: pg_upmap_items
) {
5766 auto& um_from
= um_pair
.first
;
5767 auto& um_to
= um_pair
.second
;
5769 ldout(cct
, 10) << " will try dropping existing"
5770 << " remapping pair "
5771 << um_from
<< " -> " << um_to
5772 << " which remapped " << pg
5773 << " into overfull osd." << osd
5775 temp_pgs_by_osd
[um_to
].erase(pg
);
5776 temp_pgs_by_osd
[um_from
].insert(pg
);
5778 new_upmap_items
.push_back(um_pair
);
5781 if (new_upmap_items
.empty()) {
5783 ldout(cct
, 10) << " existing pg_upmap_items " << pg_upmap_items
5784 << " remapped " << pg
<< " into overfull osd." << osd
5785 << ", will try cancelling it entirely"
5787 to_unmap
.insert(pg
);
5789 } else if (new_upmap_items
.size() != pg_upmap_items
.size()) {
5790 // drop single remapping pair, updating
5791 ceph_assert(new_upmap_items
.size() < pg_upmap_items
.size());
5792 ldout(cct
, 10) << " existing pg_upmap_items " << pg_upmap_items
5793 << " remapped " << pg
<< " into overfull osd." << osd
5794 << ", new_pg_upmap_items now " << new_upmap_items
5796 to_upmap
[pg
] = new_upmap_items
;
5803 bool OSDMap::try_drop_remap_underfull(
5805 const candidates_t
& candidates
,
5807 map
<int,std::set
<pg_t
>>& temp_pgs_by_osd
,
5808 set
<pg_t
>& to_unmap
,
5809 map
<pg_t
, mempool::osdmap::vector
<std::pair
<int32_t,int32_t>>>& to_upmap
)
5812 // This function tries to drop existing upmap items which map data from underfull
5813 // OSDs. It updates temp_pgs_by_osd, to_unmap and to_upmap and returns true
5814 // if it found an item that can be dropped, false if not.
5816 for (auto& [pg
, um_pairs
] : candidates
) {
5817 mempool::osdmap::vector
<pair
<int32_t,int32_t>> new_upmap_items
;
5818 for (auto& ump
: um_pairs
) {
5819 auto& um_from
= ump
.first
;
5820 auto& um_to
= ump
.second
;
5821 if (um_from
== osd
) {
5822 ldout(cct
, 10) << " will try dropping existing"
5823 << " remapping pair "
5824 << um_from
<< " -> " << um_to
5825 << " which remapped " << pg
5826 << " out from underfull osd." << osd
5828 temp_pgs_by_osd
[um_to
].erase(pg
);
5829 temp_pgs_by_osd
[um_from
].insert(pg
);
5831 new_upmap_items
.push_back(ump
);
5834 if (new_upmap_items
.empty()) {
5836 ldout(cct
, 10) << " existing pg_upmap_items " << um_pairs
5837 << " remapped " << pg
5838 << " out from underfull osd." << osd
5839 << ", will try cancelling it entirely"
5841 to_unmap
.insert(pg
);
5843 } else if (new_upmap_items
.size() != um_pairs
.size()) {
5844 // drop single remapping pair, updating
5845 ceph_assert(new_upmap_items
.size() < um_pairs
.size());
5846 ldout(cct
, 10) << " existing pg_upmap_items " << um_pairs
5847 << " remapped " << pg
5848 << " out from underfull osd." << osd
5849 << ", new_pg_upmap_items now " << new_upmap_items
5851 to_upmap
[pg
] = new_upmap_items
;
5858 void OSDMap::add_remap_pair(
5863 size_t pg_pool_size
,
5866 map
<int,set
<pg_t
>>& temp_pgs_by_osd
,
5867 mempool::osdmap::vector
<pair
<int32_t,int32_t>> new_upmap_items
,
5868 map
<pg_t
, mempool::osdmap::vector
<pair
<int32_t,int32_t>>>& to_upmap
)
5871 // add a single remap pair (in pg <pg> remap osd from <orig> to <out>) to all
5872 // the relevant data structures
5874 ldout(cct
, 10) << " will try adding new remapping pair "
5875 << orig
<< " -> " << out
<< " for " << pg
5876 << (orig
!= osd
? " NOT selected osd" : "")
5878 existing
.insert(orig
);
5879 existing
.insert(out
);
5880 temp_pgs_by_osd
[orig
].erase(pg
);
5881 temp_pgs_by_osd
[out
].insert(pg
);
5882 ceph_assert(new_upmap_items
.size() < pg_pool_size
);
5883 new_upmap_items
.push_back(make_pair(orig
, out
));
5884 // append new remapping pairs slowly
5885 // This way we can make sure that each tiny change will
5886 // definitely make distribution of PGs converging to
5887 // the perfect status.
5888 to_upmap
[pg
] = new_upmap_items
;
5892 int OSDMap::find_best_remap (
5894 const vector
<int>& orig
,
5895 const vector
<int>& out
,
5896 const set
<int>& existing
,
5897 const map
<int,float> osd_deviation
)
5900 // Find the best remap from the suggestions in orig and out - the best remap
5901 // is the one which maps from the OSD with the largest deviation (from the
5902 // OSDs which are part of orig)
5906 for (unsigned i
= 0; i
< out
.size(); ++i
) {
5907 if (orig
[i
] == out
[i
])
5908 continue; // skip invalid remappings
5909 if (existing
.count(orig
[i
]) || existing
.count(out
[i
]))
5910 continue; // we want new remappings only!
5911 if (osd_deviation
.at(orig
[i
]) > max_dev
) {
5912 max_dev
= osd_deviation
.at(orig
[i
]);
5914 ldout(cct
, 30) << "Max osd." << orig
[i
] << " pos " << i
<< " dev " << osd_deviation
.at(orig
[i
]) << dendl
;
5920 OSDMap::candidates_t
OSDMap::build_candidates(
5922 const OSDMap
& tmp_osd_map
,
5923 const set
<pg_t
> to_skip
,
5924 const set
<int64_t>& only_pools
,
5926 std::random_device::result_type
*p_seed
)
5929 // build the candidates data structure
5931 candidates_t candidates
;
5932 candidates
.reserve(tmp_osd_map
.pg_upmap_items
.size());
5933 for (auto& [pg
, um_pair
] : tmp_osd_map
.pg_upmap_items
) {
5934 if (to_skip
.count(pg
))
5936 if (!only_pools
.empty() && !only_pools
.count(pg
.pool()))
5938 candidates
.push_back(make_pair(pg
, um_pair
));
5941 // shuffle candidates so they all get equal (in)attention
5942 std::shuffle(candidates
.begin(), candidates
.end(), get_random_engine(cct
, p_seed
));
5947 // return -1 if all PGs are OK, else the first PG which includes only zero PA OSDs
5948 int64_t OSDMap::has_zero_pa_pgs(CephContext
*cct
, int64_t pool_id
) const
5950 const pg_pool_t
* pool
= get_pg_pool(pool_id
);
5951 for (unsigned ps
= 0; ps
< pool
->get_pg_num(); ++ps
) {
5952 pg_t
pg(ps
, pool_id
);
5954 pg_to_up_acting_osds(pg
, nullptr, nullptr, &acting
, nullptr);
5955 if (cct
!= nullptr) {
5956 ldout(cct
, 30) << __func__
<< " " << pg
<< " acting " << acting
<< dendl
;
5958 bool pg_zero_pa
= true;
5959 for (auto osd
: acting
) {
5960 if (get_primary_affinityf(osd
) != 0) {
5966 if (cct
!= nullptr) {
5967 ldout(cct
, 20) << __func__
<< " " << pg
<< " - maps only to OSDs with primiary affinity 0" << dendl
;
5975 void OSDMap::zero_rbi(read_balance_info_t
&rbi
) const {
5977 rbi
.pa_weighted
= 0.;
5978 rbi
.pa_weighted_avg
= 0.;
5980 rbi
.optimal_score
= 0.;
5981 rbi
.adjusted_score
= 0.;
5982 rbi
.acting_raw_score
= 0.;
5983 rbi
.acting_adj_score
= 0.;
5987 int OSDMap::set_rbi(
5989 read_balance_info_t
&rbi
,
5995 float total_osd_weight
,
5996 uint max_prims_per_osd
,
5997 uint max_acting_prims_per_osd
,
5998 float avg_prims_per_osd
,
5999 bool prim_on_zero_pa
,
6000 bool acting_on_zero_pa
,
6001 float max_osd_score
) const
6003 // put all the ugly code here, so rest of code is nicer.
6004 const pg_pool_t
* pool
= get_pg_pool(pool_id
);
6007 if (total_w_pa
/ total_osd_weight
< 1. / float(pool
->get_size())) {
6008 ldout(cct
, 20) << __func__
<< " pool " << pool_id
<< " average primary affinity is lower than"
6009 << 1. / float(pool
->get_size()) << dendl
;
6010 rbi
.err_msg
= fmt::format(
6011 "pool {} average primary affinity is lower than {:.2f}, read balance score is not reliable",
6012 pool_id
, 1. / float(pool
->get_size()));
6015 rbi
.pa_weighted
= total_w_pa
;
6017 // weighted_prim_affinity_avg
6018 rbi
.pa_weighted_avg
= rbi_round(rbi
.pa_weighted
/ total_osd_weight
); // in [0..1]
6019 // p_rbi->pa_weighted / osd_pa_count; // in [0..1]
6021 rbi
.raw_score
= rbi_round((float)max_prims_per_osd
/ avg_prims_per_osd
); // >=1
6022 if (acting_on_zero_pa
) {
6023 rbi
.acting_raw_score
= rbi_round(max_osd_score
);
6024 rbi
.err_msg
= fmt::format(
6025 "pool {} has acting primaries on OSD(s) with primary affinity 0, read balance score is not accurate",
6028 rbi
.acting_raw_score
= rbi_round((float)max_acting_prims_per_osd
/ avg_prims_per_osd
);
6031 if (osd_pa_count
!= 0) {
6032 // this implies that pa_sum > 0
6033 rbi
.pa_avg
= rbi_round(pa_sum
/ osd_pa_count
); // in [0..1]
6038 if (rbi
.pa_avg
!= 0.) {
6040 if ((zpg
= has_zero_pa_pgs(cct
, pool_id
)) >= 0) {
6041 pg_t
pg(zpg
, pool_id
);
6042 std::stringstream ss
;
6044 ldout(cct
, 10) << __func__
<< " pool " << pool_id
<< " has some PGs where all OSDs are with primary_affinity 0 (" << pg
<< ",...)" << dendl
;
6045 rbi
.err_msg
= fmt::format(
6046 "pool {} has some PGs where all OSDs are with primary_affinity 0 (at least pg {}), read balance score may not be reliable",
6050 rbi
.optimal_score
= rbi_round(float(num_osds
) / float(osd_pa_count
)); // >= 1
6051 // adjust the score to the primary affinity setting (if prim affinity is set
6052 // the raw score can't be 1 and the optimal (perfect) score is higher than 1)
6053 // When total system primary affinity is too low (average < 1 / pool replica count)
6054 // the score is negative in order to grab the user's attention.
6055 rbi
.adjusted_score
= rbi_round(rbi
.raw_score
/ rbi
.optimal_score
); // >= 1 if PA is not low
6056 rbi
.acting_adj_score
= rbi_round(rbi
.acting_raw_score
/ rbi
.optimal_score
); // >= 1 if PA is not low
6059 // We should never get here - this condition is checked before calling this function - this is just sanity check code.
6060 rbi
.err_msg
= fmt::format(
6061 "pool {} all OSDs have zero primary affinity, can't calculate a reliable read balance score",
6069 int OSDMap::calc_read_balance_score(CephContext
*cct
, int64_t pool_id
,
6070 read_balance_info_t
*p_rbi
) const
6072 //BUG: wrong score with one PG replica 3 and 4 OSDs
6074 ldout(cct
,20) << __func__
<< " pool " << get_pool_name(pool_id
) << dendl
;
6077 tmp_osd_map
.deepish_copy_from(*this);
6078 if (p_rbi
== nullptr) {
6079 // The only case where error message is not set - this is not tested in the unit test.
6081 ldout(cct
,30) << __func__
<< " p_rbi is nullptr." << dendl
;
6085 if (tmp_osd_map
.pools
.count(pool_id
) == 0) {
6087 ldout(cct
,30) << __func__
<< " pool " << pool_id
<< " not found." << dendl
;
6089 p_rbi
->err_msg
= fmt::format("pool {} not found", pool_id
);
6093 const pg_pool_t
* pool
= tmp_osd_map
.get_pg_pool(pool_id
);
6094 auto num_pgs
= pool
->get_pg_num();
6096 map
<uint64_t,set
<pg_t
>> pgs_by_osd
;
6097 map
<uint64_t,set
<pg_t
>> prim_pgs_by_osd
;
6098 map
<uint64_t,set
<pg_t
>> acting_prims_by_osd
;
6100 pgs_by_osd
= tmp_osd_map
.get_pgs_by_osd(cct
, pool_id
, &prim_pgs_by_osd
, &acting_prims_by_osd
);
6103 ldout(cct
,30) << __func__
<< " Primaries for pool: "
6104 << prim_pgs_by_osd
<< dendl
;
6106 if (pgs_by_osd
.empty()) {
6107 //p_rbi->err_msg = fmt::format("pool {} has no PGs mapped to OSDs", pool_id);
6110 if (cct
!= nullptr) {
6111 for (auto& [osd
,pgs
] : prim_pgs_by_osd
) {
6112 ldout(cct
,20) << __func__
<< " Pool " << pool_id
<< " OSD." << osd
6113 << " has " << pgs
.size() << " primary PGs, "
6114 << acting_prims_by_osd
[osd
].size() << " acting primaries."
6119 auto num_osds
= pgs_by_osd
.size();
6121 float avg_prims_per_osd
= (float)num_pgs
/ (float)num_osds
;
6122 uint64_t max_prims_per_osd
= 0;
6123 uint64_t max_acting_prims_per_osd
= 0;
6124 float max_osd_score
= 0.;
6125 bool prim_on_zero_pa
= false;
6126 bool acting_on_zero_pa
= false;
6128 float prim_affinity_sum
= 0.;
6129 float total_osd_weight
= 0.;
6130 float total_weighted_pa
= 0.;
6132 map
<int,float> osds_crush_weight
;
6133 // Set up the OSDMap
6134 int ruleno
= tmp_osd_map
.pools
.at(pool_id
).get_crush_rule();
6135 tmp_osd_map
.crush
->get_rule_weight_osd_map(ruleno
, &osds_crush_weight
);
6137 if (cct
!= nullptr) {
6138 ldout(cct
,20) << __func__
<< " pool " << pool_id
6139 << " ruleno " << ruleno
6140 << " weight-map " << osds_crush_weight
6143 uint osd_pa_count
= 0;
6145 for (auto [osd
, oweight
] : osds_crush_weight
) { // loop over all OSDs
6146 total_osd_weight
+= oweight
;
6147 float osd_pa
= tmp_osd_map
.get_primary_affinityf(osd
);
6148 total_weighted_pa
+= oweight
* osd_pa
;
6152 if (prim_pgs_by_osd
.count(osd
)) {
6153 auto n_prims
= prim_pgs_by_osd
.at(osd
).size();
6154 max_prims_per_osd
= std::max(max_prims_per_osd
, n_prims
);
6156 prim_on_zero_pa
= true;
6159 if (acting_prims_by_osd
.count(osd
)) {
6160 auto n_aprims
= acting_prims_by_osd
.at(osd
).size();
6161 max_acting_prims_per_osd
= std::max(max_acting_prims_per_osd
, n_aprims
);
6163 max_osd_score
= std::max(max_osd_score
, float(n_aprims
) / osd_pa
);
6166 acting_on_zero_pa
= true;
6170 prim_affinity_sum
+= osd_pa
;
6171 if (cct
!= nullptr) {
6172 auto np
= prim_pgs_by_osd
.count(osd
) ? prim_pgs_by_osd
.at(osd
).size() : 0;
6173 auto nap
= acting_prims_by_osd
.count(osd
) ? acting_prims_by_osd
.at(osd
).size() : 0;
6174 auto wt
= osds_crush_weight
.count(osd
) ? osds_crush_weight
.at(osd
) : 0.;
6175 ldout(cct
,30) << __func__
<< " OSD." << osd
<< " info: "
6176 << " num_primaries " << np
6177 << " num_acting_prims " << nap
6178 << " prim_affinity " << tmp_osd_map
.get_primary_affinityf(osd
)
6183 if (cct
!= nullptr) {
6184 ldout(cct
,30) << __func__
<< " pool " << pool_id
6185 << " total_osd_weight " << total_osd_weight
6186 << " total_weighted_pa " << total_weighted_pa
6190 if (prim_affinity_sum
== 0.0) {
6191 if (cct
!= nullptr) {
6192 ldout(cct
, 10) << __func__
<< " pool " << pool_id
6193 << " has primary_affinity set to zero on all OSDs" << dendl
;
6196 p_rbi
->err_msg
= fmt::format("pool {} has primary_affinity set to zero on all OSDs", pool_id
);
6198 return -ERANGE
; // score has a different meaning now.
6201 max_osd_score
*= prim_affinity_sum
/ num_osds
;
6204 rc
= tmp_osd_map
.set_rbi(cct
, *p_rbi
, pool_id
, total_weighted_pa
,
6205 prim_affinity_sum
, num_osds
, osd_pa_count
,
6206 total_osd_weight
, max_prims_per_osd
,
6207 max_acting_prims_per_osd
, avg_prims_per_osd
,
6208 prim_on_zero_pa
, acting_on_zero_pa
, max_osd_score
);
6210 if (cct
!= nullptr) {
6211 ldout(cct
,30) << __func__
<< " pool " << get_pool_name(pool_id
)
6212 << " pa_avg " << p_rbi
->pa_avg
6213 << " pa_weighted " << p_rbi
->pa_weighted
6214 << " pa_weighted_avg " << p_rbi
->pa_weighted_avg
6215 << " optimal_score " << p_rbi
->optimal_score
6216 << " adjusted_score " << p_rbi
->adjusted_score
6217 << " acting_adj_score " << p_rbi
->acting_adj_score
6219 ldout(cct
,20) << __func__
<< " pool " << get_pool_name(pool_id
)
6220 << " raw_score: " << p_rbi
->raw_score
6221 << " acting_raw_score: " << p_rbi
->acting_raw_score
6223 ldout(cct
,10) << __func__
<< " pool " << get_pool_name(pool_id
)
6224 << " wl_score: " << p_rbi
->acting_adj_score
<< dendl
;
6230 int OSDMap::get_osds_by_bucket_name(const string
&name
, set
<int> *osds
) const
6232 return crush
->get_leaves(name
, osds
);
6235 // get pools whose crush rules might reference the given osd
6236 void OSDMap::get_pool_ids_by_osd(CephContext
*cct
,
6238 set
<int64_t> *pool_ids
) const
6240 ceph_assert(pool_ids
);
6242 int r
= crush
->get_rules_by_osd(osd
, &raw_rules
);
6244 lderr(cct
) << __func__
<< " get_rules_by_osd failed: " << cpp_strerror(r
)
6246 ceph_assert(r
>= 0);
6249 for (auto &i
: raw_rules
) {
6250 // exclude any dead rule
6251 if (crush_rule_in_use(i
)) {
6255 for (auto &r
: rules
) {
6256 get_pool_ids_by_rule(r
, pool_ids
);
6260 template <typename F
>
6261 class OSDUtilizationDumper
: public CrushTreeDumper::Dumper
<F
> {
6263 typedef CrushTreeDumper::Dumper
<F
> Parent
;
6265 OSDUtilizationDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap_
,
6266 const PGMap
& pgmap_
, bool tree_
,
6267 const string
& filter
) :
6268 Parent(crush
, osdmap_
->get_pool_names()),
6276 if (osdmap
->crush
->name_exists(filter
)) {
6277 // filter by crush node
6278 auto item_id
= osdmap
->crush
->get_item_id(filter
);
6279 allowed
.insert(item_id
);
6280 osdmap
->crush
->get_all_children(item_id
, &allowed
);
6281 } else if (osdmap
->crush
->class_exists(filter
)) {
6282 // filter by device class
6283 class_id
= osdmap
->crush
->get_class_id(filter
);
6284 } else if (auto pool_id
= osdmap
->lookup_pg_pool_name(filter
);
6287 auto crush_rule
= osdmap
->get_pool_crush_rule(pool_id
);
6289 osdmap
->crush
->find_takes_by_rule(crush_rule
, &roots
);
6291 for (auto r
: roots
)
6292 osdmap
->crush
->get_all_children(r
, &allowed
);
6294 average_util
= average_utilization();
6299 bool should_dump(int id
) const {
6300 if (!allowed
.empty() && !allowed
.count(id
)) // filter by name
6302 if (id
>= 0 && class_id
>= 0) {
6303 auto item_class_id
= osdmap
->crush
->get_item_class_id(id
);
6304 if (item_class_id
< 0 || // not bound to a class yet
6305 item_class_id
!= class_id
) // or already bound to a different class
6311 set
<int> get_dumped_osds() {
6312 if (allowed
.empty() && class_id
< 0) {
6319 void dump_stray(F
*f
) {
6320 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
6321 if (osdmap
->exists(i
) && !this->is_touched(i
))
6322 dump_item(CrushTreeDumper::Item(i
, 0, 0, 0), f
);
6326 void dump_item(const CrushTreeDumper::Item
&qi
, F
*f
) override
{
6327 if (!tree
&& (qi
.is_bucket() || dumped_osds
.count(qi
.id
)))
6329 if (!should_dump(qi
.id
))
6332 if (!qi
.is_bucket())
6333 dumped_osds
.insert(qi
.id
);
6334 float reweight
= qi
.is_bucket() ? -1 : osdmap
->get_weightf(qi
.id
);
6335 int64_t kb
= 0, kb_used
= 0, kb_used_data
= 0, kb_used_omap
= 0,
6336 kb_used_meta
= 0, kb_avail
= 0;
6338 if (get_bucket_utilization(qi
.id
, &kb
, &kb_used
, &kb_used_data
,
6339 &kb_used_omap
, &kb_used_meta
, &kb_avail
))
6341 util
= 100.0 * (double)kb_used
/ (double)kb
;
6345 var
= util
/ average_util
;
6347 size_t num_pgs
= qi
.is_bucket() ? 0 : pgmap
.get_num_pg_by_osd(qi
.id
);
6349 dump_item(qi
, reweight
, kb
, kb_used
,
6350 kb_used_data
, kb_used_omap
, kb_used_meta
,
6351 kb_avail
, util
, var
, num_pgs
, f
);
6353 if (!qi
.is_bucket() && reweight
> 0) {
6354 if (min_var
< 0 || var
< min_var
)
6356 if (max_var
< 0 || var
> max_var
)
6359 double dev
= util
- average_util
;
6361 stddev
+= reweight
* dev
;
6366 virtual void dump_item(const CrushTreeDumper::Item
&qi
,
6370 int64_t kb_used_data
,
6371 int64_t kb_used_omap
,
6372 int64_t kb_used_meta
,
6376 const size_t num_pgs
,
6380 return sum
> 0 ? sqrt(stddev
/ sum
) : 0;
6383 double average_utilization() {
6384 int64_t kb
= 0, kb_used
= 0;
6385 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
6386 if (!osdmap
->exists(i
) ||
6387 osdmap
->get_weight(i
) == 0 ||
6390 int64_t kb_i
, kb_used_i
, kb_used_data_i
, kb_used_omap_i
, kb_used_meta_i
,
6392 if (get_osd_utilization(i
, &kb_i
, &kb_used_i
, &kb_used_data_i
,
6393 &kb_used_omap_i
, &kb_used_meta_i
, &kb_avail_i
)) {
6395 kb_used
+= kb_used_i
;
6398 return kb
> 0 ? 100.0 * (double)kb_used
/ (double)kb
: 0;
6401 bool get_osd_utilization(int id
, int64_t* kb
, int64_t* kb_used
,
6402 int64_t* kb_used_data
,
6403 int64_t* kb_used_omap
,
6404 int64_t* kb_used_meta
,
6405 int64_t* kb_avail
) const {
6406 const osd_stat_t
*p
= pgmap
.get_osd_stat(id
);
6407 if (!p
) return false;
6408 *kb
= p
->statfs
.kb();
6409 *kb_used
= p
->statfs
.kb_used_raw();
6410 *kb_used_data
= p
->statfs
.kb_used_data();
6411 *kb_used_omap
= p
->statfs
.kb_used_omap();
6412 *kb_used_meta
= p
->statfs
.kb_used_internal_metadata();
6413 *kb_avail
= p
->statfs
.kb_avail();
6418 bool get_bucket_utilization(int id
, int64_t* kb
, int64_t* kb_used
,
6419 int64_t* kb_used_data
,
6420 int64_t* kb_used_omap
,
6421 int64_t* kb_used_meta
,
6422 int64_t* kb_avail
) const {
6424 if (osdmap
->is_out(id
) || !should_dump(id
)) {
6433 return get_osd_utilization(id
, kb
, kb_used
, kb_used_data
,
6434 kb_used_omap
, kb_used_meta
, kb_avail
);
6444 for (int k
= osdmap
->crush
->get_bucket_size(id
) - 1; k
>= 0; k
--) {
6445 int item
= osdmap
->crush
->get_bucket_item(id
, k
);
6446 int64_t kb_i
= 0, kb_used_i
= 0, kb_used_data_i
= 0,
6447 kb_used_omap_i
= 0, kb_used_meta_i
= 0, kb_avail_i
= 0;
6448 if (!get_bucket_utilization(item
, &kb_i
, &kb_used_i
,
6449 &kb_used_data_i
, &kb_used_omap_i
,
6450 &kb_used_meta_i
, &kb_avail_i
))
6453 *kb_used
+= kb_used_i
;
6454 *kb_used_data
+= kb_used_data_i
;
6455 *kb_used_omap
+= kb_used_omap_i
;
6456 *kb_used_meta
+= kb_used_meta_i
;
6457 *kb_avail
+= kb_avail_i
;
6463 const OSDMap
*osdmap
;
6466 double average_util
;
6473 set
<int> dumped_osds
;
6477 class OSDUtilizationPlainDumper
: public OSDUtilizationDumper
<TextTable
> {
6479 typedef OSDUtilizationDumper
<TextTable
> Parent
;
6481 OSDUtilizationPlainDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap
,
6482 const PGMap
& pgmap
, bool tree
,
6483 const string
& filter
) :
6484 Parent(crush
, osdmap
, pgmap
, tree
, filter
) {}
6486 void dump(TextTable
*tbl
) {
6487 tbl
->define_column("ID", TextTable::LEFT
, TextTable::RIGHT
);
6488 tbl
->define_column("CLASS", TextTable::LEFT
, TextTable::RIGHT
);
6489 tbl
->define_column("WEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
6490 tbl
->define_column("REWEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
6491 tbl
->define_column("SIZE", TextTable::LEFT
, TextTable::RIGHT
);
6492 tbl
->define_column("RAW USE", TextTable::LEFT
, TextTable::RIGHT
);
6493 tbl
->define_column("DATA", TextTable::LEFT
, TextTable::RIGHT
);
6494 tbl
->define_column("OMAP", TextTable::LEFT
, TextTable::RIGHT
);
6495 tbl
->define_column("META", TextTable::LEFT
, TextTable::RIGHT
);
6496 tbl
->define_column("AVAIL", TextTable::LEFT
, TextTable::RIGHT
);
6497 tbl
->define_column("%USE", TextTable::LEFT
, TextTable::RIGHT
);
6498 tbl
->define_column("VAR", TextTable::LEFT
, TextTable::RIGHT
);
6499 tbl
->define_column("PGS", TextTable::LEFT
, TextTable::RIGHT
);
6500 tbl
->define_column("STATUS", TextTable::LEFT
, TextTable::RIGHT
);
6502 tbl
->define_column("TYPE NAME", TextTable::LEFT
, TextTable::LEFT
);
6508 auto sum
= pgmap
.get_osd_sum(get_dumped_osds());
6512 << byte_u_t(sum
.statfs
.total
)
6513 << byte_u_t(sum
.statfs
.get_used_raw())
6514 << byte_u_t(sum
.statfs
.allocated
)
6515 << byte_u_t(sum
.statfs
.omap_allocated
)
6516 << byte_u_t(sum
.statfs
.internal_metadata
)
6517 << byte_u_t(sum
.statfs
.available
)
6518 << lowprecision_t(average_util
)
6520 << TextTable::endrow
;
6524 struct lowprecision_t
{
6526 explicit lowprecision_t(float _v
) : v(_v
) {}
6528 friend std::ostream
&operator<<(ostream
& out
, const lowprecision_t
& v
);
6530 using OSDUtilizationDumper
<TextTable
>::dump_item
;
6531 void dump_item(const CrushTreeDumper::Item
&qi
,
6535 int64_t kb_used_data
,
6536 int64_t kb_used_omap
,
6537 int64_t kb_used_meta
,
6541 const size_t num_pgs
,
6542 TextTable
*tbl
) override
{
6543 const char *c
= crush
->get_item_class(qi
.id
);
6548 << weightf_t(qi
.weight
)
6549 << weightf_t(reweight
)
6550 << byte_u_t(kb
<< 10)
6551 << byte_u_t(kb_used
<< 10)
6552 << byte_u_t(kb_used_data
<< 10)
6553 << byte_u_t(kb_used_omap
<< 10)
6554 << byte_u_t(kb_used_meta
<< 10)
6555 << byte_u_t(kb_avail
<< 10)
6556 << lowprecision_t(util
)
6557 << lowprecision_t(var
);
6559 if (qi
.is_bucket()) {
6564 if (osdmap
->is_up(qi
.id
)) {
6566 } else if (osdmap
->is_destroyed(qi
.id
)) {
6567 *tbl
<< "destroyed";
6575 for (int k
= 0; k
< qi
.depth
; k
++)
6577 if (qi
.is_bucket()) {
6578 int type
= crush
->get_bucket_type(qi
.id
);
6579 name
<< crush
->get_type_name(type
) << " "
6580 << crush
->get_item_name(qi
.id
);
6582 name
<< "osd." << qi
.id
;
6587 *tbl
<< TextTable::endrow
;
6593 out
<< "MIN/MAX VAR: " << lowprecision_t(min_var
)
6594 << "/" << lowprecision_t(max_var
) << " "
6595 << "STDDEV: " << lowprecision_t(dev());
6600 ostream
& operator<<(ostream
& out
,
6601 const OSDUtilizationPlainDumper::lowprecision_t
& v
)
6605 } else if (v
.v
< 0.001) {
6608 std::streamsize p
= out
.precision();
6609 return out
<< std::fixed
<< std::setprecision(2) << v
.v
<< std::setprecision(p
);
6613 class OSDUtilizationFormatDumper
: public OSDUtilizationDumper
<Formatter
> {
6615 typedef OSDUtilizationDumper
<Formatter
> Parent
;
6617 OSDUtilizationFormatDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap
,
6618 const PGMap
& pgmap
, bool tree
,
6619 const string
& filter
) :
6620 Parent(crush
, osdmap
, pgmap
, tree
, filter
) {}
6622 void dump(Formatter
*f
) {
6623 f
->open_array_section("nodes");
6627 f
->open_array_section("stray");
6633 using OSDUtilizationDumper
<Formatter
>::dump_item
;
6634 void dump_item(const CrushTreeDumper::Item
&qi
,
6638 int64_t kb_used_data
,
6639 int64_t kb_used_omap
,
6640 int64_t kb_used_meta
,
6644 const size_t num_pgs
,
6645 Formatter
*f
) override
{
6646 f
->open_object_section("item");
6647 CrushTreeDumper::dump_item_fields(crush
, weight_set_names
, qi
, f
);
6648 f
->dump_float("reweight", reweight
);
6649 f
->dump_int("kb", kb
);
6650 f
->dump_int("kb_used", kb_used
);
6651 f
->dump_int("kb_used_data", kb_used_data
);
6652 f
->dump_int("kb_used_omap", kb_used_omap
);
6653 f
->dump_int("kb_used_meta", kb_used_meta
);
6654 f
->dump_int("kb_avail", kb_avail
);
6655 f
->dump_float("utilization", util
);
6656 f
->dump_float("var", var
);
6657 f
->dump_unsigned("pgs", num_pgs
);
6658 if (!qi
.is_bucket()) {
6659 if (osdmap
->is_up(qi
.id
)) {
6660 f
->dump_string("status", "up");
6661 } else if (osdmap
->is_destroyed(qi
.id
)) {
6662 f
->dump_string("status", "destroyed");
6664 f
->dump_string("status", "down");
6667 CrushTreeDumper::dump_bucket_children(crush
, qi
, f
);
6672 void summary(Formatter
*f
) {
6673 f
->open_object_section("summary");
6674 auto sum
= pgmap
.get_osd_sum(get_dumped_osds());
6675 auto& s
= sum
.statfs
;
6677 f
->dump_int("total_kb", s
.kb());
6678 f
->dump_int("total_kb_used", s
.kb_used_raw());
6679 f
->dump_int("total_kb_used_data", s
.kb_used_data());
6680 f
->dump_int("total_kb_used_omap", s
.kb_used_omap());
6681 f
->dump_int("total_kb_used_meta", s
.kb_used_internal_metadata());
6682 f
->dump_int("total_kb_avail", s
.kb_avail());
6683 f
->dump_float("average_utilization", average_util
);
6684 f
->dump_float("min_var", min_var
);
6685 f
->dump_float("max_var", max_var
);
6686 f
->dump_float("dev", dev());
6691 void print_osd_utilization(const OSDMap
& osdmap
,
6696 const string
& filter
)
6698 const CrushWrapper
*crush
= osdmap
.crush
.get();
6700 f
->open_object_section("df");
6701 OSDUtilizationFormatDumper
d(crush
, &osdmap
, pgmap
, tree
, filter
);
6707 OSDUtilizationPlainDumper
d(crush
, &osdmap
, pgmap
, tree
, filter
);
6710 out
<< tbl
<< d
.summary() << "\n";
6714 void OSDMap::check_health(CephContext
*cct
,
6715 health_check_map_t
*checks
) const
6717 int num_osds
= get_num_osds();
6720 // OSD_$subtree_DOWN
6722 if (num_osds
>= 0) {
6723 int num_in_osds
= 0;
6724 int num_down_in_osds
= 0;
6726 set
<int> down_in_osds
;
6727 set
<int> up_in_osds
;
6728 set
<int> subtree_up
;
6729 unordered_map
<int, set
<int> > subtree_type_down
;
6730 unordered_map
<int, int> num_osds_subtree
;
6731 int max_type
= crush
->get_max_type_id();
6733 for (int i
= 0; i
< get_max_osd(); i
++) {
6735 if (crush
->item_exists(i
)) {
6740 if (is_out(i
) || (osd_state
[i
] & CEPH_OSD_NEW
))
6743 if (down_in_osds
.count(i
) || up_in_osds
.count(i
))
6746 down_in_osds
.insert(i
);
6749 for (int type
= 0; type
<= max_type
; type
++) {
6750 if (!crush
->get_type_name(type
))
6752 int r
= crush
->get_immediate_parent_id(current
, &parent_id
);
6755 // break early if this parent is already marked as up
6756 if (subtree_up
.count(parent_id
))
6758 type
= crush
->get_bucket_type(parent_id
);
6759 if (!subtree_type_is_down(
6760 cct
, parent_id
, type
,
6761 &down_in_osds
, &up_in_osds
, &subtree_up
, &subtree_type_down
))
6763 current
= parent_id
;
6768 // calculate the number of down osds in each down subtree and
6769 // store it in num_osds_subtree
6770 for (int type
= 1; type
<= max_type
; type
++) {
6771 if (!crush
->get_type_name(type
))
6773 for (auto j
= subtree_type_down
[type
].begin();
6774 j
!= subtree_type_down
[type
].end();
6778 int num_children
= crush
->get_children(*j
, &children
);
6779 if (num_children
== 0)
6781 for (auto l
= children
.begin(); l
!= children
.end(); ++l
) {
6784 } else if (num_osds_subtree
[*l
] > 0) {
6785 num
= num
+ num_osds_subtree
[*l
];
6788 num_osds_subtree
[*j
] = num
;
6791 num_down_in_osds
= down_in_osds
.size();
6792 ceph_assert(num_down_in_osds
<= num_in_osds
);
6793 if (num_down_in_osds
> 0) {
6794 // summary of down subtree types and osds
6795 for (int type
= max_type
; type
> 0; type
--) {
6796 if (!crush
->get_type_name(type
))
6798 if (subtree_type_down
[type
].size() > 0) {
6800 ss
<< subtree_type_down
[type
].size() << " "
6801 << crush
->get_type_name(type
);
6802 if (subtree_type_down
[type
].size() > 1) {
6805 int sum_down_osds
= 0;
6806 for (auto j
= subtree_type_down
[type
].begin();
6807 j
!= subtree_type_down
[type
].end();
6809 sum_down_osds
= sum_down_osds
+ num_osds_subtree
[*j
];
6811 ss
<< " (" << sum_down_osds
<< " osds) down";
6812 string err
= string("OSD_") +
6813 string(crush
->get_type_name(type
)) + "_DOWN";
6814 boost::to_upper(err
);
6815 auto& d
= checks
->add(err
, HEALTH_WARN
, ss
.str(),
6816 subtree_type_down
[type
].size());
6817 for (auto j
= subtree_type_down
[type
].rbegin();
6818 j
!= subtree_type_down
[type
].rend();
6821 ss
<< crush
->get_type_name(type
);
6823 ss
<< crush
->get_item_name(*j
);
6824 // at the top level, do not print location
6825 if (type
!= max_type
) {
6827 ss
<< crush
->get_full_location_ordered_string(*j
);
6830 int num
= num_osds_subtree
[*j
];
6831 ss
<< " (" << num
<< " osds)";
6833 d
.detail
.push_back(ss
.str());
6838 ss
<< down_in_osds
.size() << " osds down";
6839 auto& d
= checks
->add("OSD_DOWN", HEALTH_WARN
, ss
.str(),
6840 down_in_osds
.size());
6841 for (auto it
= down_in_osds
.begin(); it
!= down_in_osds
.end(); ++it
) {
6843 ss
<< "osd." << *it
<< " (";
6844 ss
<< crush
->get_full_location_ordered_string(*it
);
6846 d
.detail
.push_back(ss
.str());
6850 if (!osds
.empty()) {
6852 ss
<< osds
.size() << " osds exist in the crush map but not in the osdmap";
6853 auto& d
= checks
->add("OSD_ORPHAN", HEALTH_WARN
, ss
.str(),
6855 for (auto osd
: osds
) {
6857 ss
<< "osd." << osd
<< " exists in crush map but not in osdmap";
6858 d
.detail
.push_back(ss
.str());
6863 std::list
<std::string
> scrub_messages
;
6864 bool noscrub
= false, nodeepscrub
= false;
6865 for (const auto &p
: pools
) {
6866 if (p
.second
.flags
& pg_pool_t::FLAG_NOSCRUB
) {
6868 ss
<< "Pool " << get_pool_name(p
.first
) << " has noscrub flag";
6869 scrub_messages
.push_back(ss
.str());
6872 if (p
.second
.flags
& pg_pool_t::FLAG_NODEEP_SCRUB
) {
6874 ss
<< "Pool " << get_pool_name(p
.first
) << " has nodeep-scrub flag";
6875 scrub_messages
.push_back(ss
.str());
6879 if (noscrub
|| nodeepscrub
) {
6881 out
+= noscrub
? string("noscrub") + (nodeepscrub
? ", " : "") : "";
6882 out
+= nodeepscrub
? "nodeep-scrub" : "";
6883 auto& d
= checks
->add("POOL_SCRUB_FLAGS", HEALTH_OK
,
6884 "Some pool(s) have the " + out
+ " flag(s) set", 0);
6885 d
.detail
.splice(d
.detail
.end(), scrub_messages
);
6888 // OSD_OUT_OF_ORDER_FULL
6890 // An osd could configure failsafe ratio, to something different
6891 // but for now assume it is the same here.
6892 float fsr
= cct
->_conf
->osd_failsafe_full_ratio
;
6893 if (fsr
> 1.0) fsr
/= 100;
6894 float fr
= get_full_ratio();
6895 float br
= get_backfillfull_ratio();
6896 float nr
= get_nearfull_ratio();
6898 list
<string
> detail
;
6899 // These checks correspond to how OSDService::check_full_status() in an OSD
6900 // handles the improper setting of these values.
6903 ss
<< "backfillfull_ratio (" << br
6904 << ") < nearfull_ratio (" << nr
<< "), increased";
6905 detail
.push_back(ss
.str());
6910 ss
<< "full_ratio (" << fr
<< ") < backfillfull_ratio (" << br
6912 detail
.push_back(ss
.str());
6917 ss
<< "osd_failsafe_full_ratio (" << fsr
<< ") < full_ratio (" << fr
6919 detail
.push_back(ss
.str());
6921 if (!detail
.empty()) {
6922 auto& d
= checks
->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR
,
6923 "full ratio(s) out of order", 0);
6924 d
.detail
.swap(detail
);
6931 // OSD_FAILSAFE_FULL
6933 set
<int> full
, backfillfull
, nearfull
;
6934 get_full_osd_counts(&full
, &backfillfull
, &nearfull
);
6937 ss
<< full
.size() << " full osd(s)";
6938 auto& d
= checks
->add("OSD_FULL", HEALTH_ERR
, ss
.str(), full
.size());
6939 for (auto& i
: full
) {
6941 ss
<< "osd." << i
<< " is full";
6942 d
.detail
.push_back(ss
.str());
6945 if (backfillfull
.size()) {
6947 ss
<< backfillfull
.size() << " backfillfull osd(s)";
6948 auto& d
= checks
->add("OSD_BACKFILLFULL", HEALTH_WARN
, ss
.str(),
6949 backfillfull
.size());
6950 for (auto& i
: backfillfull
) {
6952 ss
<< "osd." << i
<< " is backfill full";
6953 d
.detail
.push_back(ss
.str());
6956 if (nearfull
.size()) {
6958 ss
<< nearfull
.size() << " nearfull osd(s)";
6959 auto& d
= checks
->add("OSD_NEARFULL", HEALTH_WARN
, ss
.str(), nearfull
.size());
6960 for (auto& i
: nearfull
) {
6962 ss
<< "osd." << i
<< " is near full";
6963 d
.detail
.push_back(ss
.str());
6971 uint64_t warn_flags
=
6972 CEPH_OSDMAP_PAUSERD
|
6973 CEPH_OSDMAP_PAUSEWR
|
6974 CEPH_OSDMAP_PAUSEREC
|
6976 CEPH_OSDMAP_NODOWN
|
6979 CEPH_OSDMAP_NOBACKFILL
|
6980 CEPH_OSDMAP_NORECOVER
|
6981 CEPH_OSDMAP_NOSCRUB
|
6982 CEPH_OSDMAP_NODEEP_SCRUB
|
6983 CEPH_OSDMAP_NOTIERAGENT
|
6984 CEPH_OSDMAP_NOSNAPTRIM
|
6985 CEPH_OSDMAP_NOREBALANCE
;
6986 if (test_flag(warn_flags
)) {
6988 string s
= get_flag_string(get_flags() & warn_flags
);
6989 ss
<< s
<< " flag(s) set";
6990 checks
->add("OSDMAP_FLAGS", HEALTH_WARN
, ss
.str(),
6991 s
.size() /* kludgey but sufficient */);
6997 list
<string
> detail
;
6998 const unsigned flags
=
7003 for (int i
= 0; i
< max_osd
; ++i
) {
7004 if (osd_state
[i
] & flags
) {
7007 OSDMap::calc_state_set(osd_state
[i
] & flags
, states
);
7008 ss
<< "osd." << i
<< " has flags " << states
;
7009 detail
.push_back(ss
.str());
7012 for (auto& i
: crush_node_flags
) {
7013 if (i
.second
&& crush
->item_exists(i
.first
)) {
7016 OSDMap::calc_state_set(i
.second
, states
);
7017 int t
= i
.first
>= 0 ? 0 : crush
->get_bucket_type(i
.first
);
7018 const char *tn
= crush
->get_type_name(t
);
7019 ss
<< (tn
? tn
: "node") << " "
7020 << crush
->get_item_name(i
.first
) << " has flags " << states
;
7021 detail
.push_back(ss
.str());
7024 for (auto& i
: device_class_flags
) {
7025 const char* class_name
= crush
->get_class_name(i
.first
);
7026 if (i
.second
&& class_name
) {
7029 OSDMap::calc_state_set(i
.second
, states
);
7030 ss
<< "device class '" << class_name
<< "' has flags " << states
;
7031 detail
.push_back(ss
.str());
7034 if (!detail
.empty()) {
7036 ss
<< detail
.size() << " OSDs or CRUSH {nodes, device-classes} have {NOUP,NODOWN,NOIN,NOOUT} flags set";
7037 auto& d
= checks
->add("OSD_FLAGS", HEALTH_WARN
, ss
.str(), detail
.size());
7038 d
.detail
.swap(detail
);
7042 // OLD_CRUSH_TUNABLES
7043 if (cct
->_conf
->mon_warn_on_legacy_crush_tunables
) {
7044 string min
= crush
->get_min_required_version();
7045 if (min
< cct
->_conf
->mon_crush_min_required_version
) {
7047 ss
<< "crush map has legacy tunables (require " << min
7048 << ", min is " << cct
->_conf
->mon_crush_min_required_version
<< ")";
7049 auto& d
= checks
->add("OLD_CRUSH_TUNABLES", HEALTH_WARN
, ss
.str(), 0);
7050 d
.detail
.push_back("see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables");
7054 // OLD_CRUSH_STRAW_CALC_VERSION
7055 if (cct
->_conf
->mon_warn_on_crush_straw_calc_version_zero
) {
7056 if (crush
->get_straw_calc_version() == 0) {
7058 ss
<< "crush map has straw_calc_version=0";
7059 auto& d
= checks
->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN
, ss
.str(), 0);
7061 "see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables");
7065 // CACHE_POOL_NO_HIT_SET
7066 if (cct
->_conf
->mon_warn_on_cache_pools_without_hit_sets
) {
7067 list
<string
> detail
;
7068 for (auto p
= pools
.cbegin(); p
!= pools
.cend(); ++p
) {
7069 const pg_pool_t
& info
= p
->second
;
7070 if (info
.cache_mode_requires_hit_set() &&
7071 info
.hit_set_params
.get_type() == HitSet::TYPE_NONE
) {
7073 ss
<< "pool '" << get_pool_name(p
->first
)
7074 << "' with cache_mode " << info
.get_cache_mode_name()
7075 << " needs hit_set_type to be set but it is not";
7076 detail
.push_back(ss
.str());
7079 if (!detail
.empty()) {
7081 ss
<< detail
.size() << " cache pools are missing hit_sets";
7082 auto& d
= checks
->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN
, ss
.str(),
7084 d
.detail
.swap(detail
);
7088 // OSD_NO_SORTBITWISE
7089 if (!test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
7091 ss
<< "'sortbitwise' flag is not set";
7092 checks
->add("OSD_NO_SORTBITWISE", HEALTH_WARN
, ss
.str(), 0);
7095 // OSD_UPGRADE_FINISHED
7096 if (auto require_release
= pending_require_osd_release()) {
7098 ss
<< "all OSDs are running " << *require_release
<< " or later but"
7099 << " require_osd_release < " << *require_release
;
7100 auto& d
= checks
->add("OSD_UPGRADE_FINISHED", HEALTH_WARN
, ss
.str(), 0);
7101 d
.detail
.push_back(ss
.str());
7104 // POOL_NEARFULL/BACKFILLFULL/FULL
7106 list
<string
> full_detail
, backfillfull_detail
, nearfull_detail
;
7107 for (auto it
: get_pools()) {
7108 const pg_pool_t
&pool
= it
.second
;
7109 const string
& pool_name
= get_pool_name(it
.first
);
7110 if (pool
.has_flag(pg_pool_t::FLAG_FULL
)) {
7112 if (pool
.has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
7113 // may run out of space too,
7114 // but we want EQUOTA taking precedence
7115 ss
<< "pool '" << pool_name
<< "' is full (running out of quota)";
7117 ss
<< "pool '" << pool_name
<< "' is full (no space)";
7119 full_detail
.push_back(ss
.str());
7120 } else if (pool
.has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
7122 ss
<< "pool '" << pool_name
<< "' is backfillfull";
7123 backfillfull_detail
.push_back(ss
.str());
7124 } else if (pool
.has_flag(pg_pool_t::FLAG_NEARFULL
)) {
7126 ss
<< "pool '" << pool_name
<< "' is nearfull";
7127 nearfull_detail
.push_back(ss
.str());
7130 if (!full_detail
.empty()) {
7132 ss
<< full_detail
.size() << " pool(s) full";
7133 auto& d
= checks
->add("POOL_FULL", HEALTH_WARN
, ss
.str(), full_detail
.size());
7134 d
.detail
.swap(full_detail
);
7136 if (!backfillfull_detail
.empty()) {
7138 ss
<< backfillfull_detail
.size() << " pool(s) backfillfull";
7139 auto& d
= checks
->add("POOL_BACKFILLFULL", HEALTH_WARN
, ss
.str(),
7140 backfillfull_detail
.size());
7141 d
.detail
.swap(backfillfull_detail
);
7143 if (!nearfull_detail
.empty()) {
7145 ss
<< nearfull_detail
.size() << " pool(s) nearfull";
7146 auto& d
= checks
->add("POOL_NEARFULL", HEALTH_WARN
, ss
.str(),
7147 nearfull_detail
.size());
7148 d
.detail
.swap(nearfull_detail
);
7152 // POOL_PG_NUM_NOT_POWER_OF_TWO
7153 if (cct
->_conf
.get_val
<bool>("mon_warn_on_pool_pg_num_not_power_of_two")) {
7154 list
<string
> detail
;
7155 for (auto it
: get_pools()) {
7156 if (!std::has_single_bit(it
.second
.get_pg_num_target())) {
7158 ss
<< "pool '" << get_pool_name(it
.first
)
7159 << "' pg_num " << it
.second
.get_pg_num_target()
7160 << " is not a power of two";
7161 detail
.push_back(ss
.str());
7164 if (!detail
.empty()) {
7166 ss
<< detail
.size() << " pool(s) have non-power-of-two pg_num";
7167 auto& d
= checks
->add("POOL_PG_NUM_NOT_POWER_OF_TWO", HEALTH_WARN
,
7168 ss
.str(), detail
.size());
7169 d
.detail
.swap(detail
);
7173 // POOL_NO_REDUNDANCY
7174 if (cct
->_conf
.get_val
<bool>("mon_warn_on_pool_no_redundancy"))
7176 list
<string
> detail
;
7177 for (auto it
: get_pools()) {
7178 if (it
.second
.get_size() == 1) {
7180 ss
<< "pool '" << get_pool_name(it
.first
)
7181 << "' has no replicas configured";
7182 detail
.push_back(ss
.str());
7185 if (!detail
.empty()) {
7187 ss
<< detail
.size() << " pool(s) have no replicas configured";
7188 auto& d
= checks
->add("POOL_NO_REDUNDANCY", HEALTH_WARN
,
7189 ss
.str(), detail
.size());
7190 d
.detail
.swap(detail
);
7194 // DEGRADED STRETCH MODE
7195 if (cct
->_conf
.get_val
<bool>("mon_warn_on_degraded_stretch_mode")) {
7196 if (recovering_stretch_mode
) {
7198 ss
<< "We are recovering stretch mode buckets, only requiring "
7199 << degraded_stretch_mode
<< " of " << stretch_bucket_count
<< " buckets to peer" ;
7200 checks
->add("RECOVERING_STRETCH_MODE", HEALTH_WARN
,
7202 } else if (degraded_stretch_mode
) {
7204 ss
<< "We are missing stretch mode buckets, only requiring "
7205 << degraded_stretch_mode
<< " of " << stretch_bucket_count
<< " buckets to peer" ;
7206 checks
->add("DEGRADED_STRETCH_MODE", HEALTH_WARN
,
7211 if (stretch_mode_enabled
) {
7212 vector
<int> subtrees
;
7213 crush
->get_subtree_of_type(stretch_mode_bucket
, &subtrees
);
7214 if (subtrees
.size() != 2) {
7216 ss
<< "Stretch mode buckets != 2";
7217 checks
->add("INCORRECT_NUM_BUCKETS_STRETCH_MODE", HEALTH_WARN
, ss
.str(), 0);
7220 int weight1
= crush
->get_item_weight(subtrees
[0]);
7221 int weight2
= crush
->get_item_weight(subtrees
[1]);
7223 if (weight1
!= weight2
) {
7224 ss
<< "Stretch mode buckets have different weights!";
7225 checks
->add("UNEVEN_WEIGHTS_STRETCH_MODE", HEALTH_WARN
, ss
.str(), 0);
7230 int OSDMap::parse_osd_id_list(const vector
<string
>& ls
, set
<int> *out
,
7234 for (auto i
= ls
.begin(); i
!= ls
.end(); ++i
) {
7235 if (i
== ls
.begin() &&
7236 (*i
== "any" || *i
== "all" || *i
== "*")) {
7240 long osd
= ceph::common::parse_osd_id(i
->c_str(), ss
);
7242 *ss
<< "invalid osd id '" << *i
<< "'";
7250 void OSDMap::get_random_up_osds_by_subtree(int n
, // whoami
7252 int limit
, // how many
7254 set
<int> *want
) const {
7257 int subtree_type
= crush
->get_type_id(subtree
);
7258 if (subtree_type
< 1)
7260 vector
<int> subtrees
;
7261 crush
->get_subtree_of_type(subtree_type
, &subtrees
);
7262 std::random_device rd
;
7263 std::default_random_engine rng
{rd()};
7264 std::shuffle(subtrees
.begin(), subtrees
.end(), rng
);
7265 for (auto s
: subtrees
) {
7268 if (crush
->subtree_contains(s
, n
))
7271 crush
->get_children_of_type(s
, 0, &osds
);
7274 vector
<int> up_osds
;
7275 for (auto o
: osds
) {
7276 if (is_up(o
) && !skip
.count(o
))
7277 up_osds
.push_back(o
);
7279 if (up_osds
.empty())
7281 auto it
= up_osds
.begin();
7282 std::advance(it
, (n
% up_osds
.size()));
7288 float OSDMap::pool_raw_used_rate(int64_t poolid
) const
7290 const pg_pool_t
*pool
= get_pg_pool(poolid
);
7291 assert(pool
!= nullptr);
7293 switch (pool
->get_type()) {
7294 case pg_pool_t::TYPE_REPLICATED
:
7295 return pool
->get_size();
7296 case pg_pool_t::TYPE_ERASURE
:
7299 get_erasure_code_profile(pool
->erasure_code_profile
);
7300 auto pm
= ecp
.find("m");
7301 auto pk
= ecp
.find("k");
7302 if (pm
!= ecp
.end() && pk
!= ecp
.end()) {
7303 int k
= atoi(pk
->second
.c_str());
7304 int m
= atoi(pm
->second
.c_str());
7306 ceph_assert(mk
!= 0);
7307 ceph_assert(k
!= 0);
7308 return (float)mk
/ k
;
7315 ceph_abort_msg("unrecognized pool type");
7319 unsigned OSDMap::get_osd_crush_node_flags(int osd
) const
7322 if (!crush_node_flags
.empty()) {
7323 // the map will contain type -> name
7324 std::map
<std::string
,std::string
> ploc
= crush
->get_full_location(osd
);
7325 for (auto& i
: ploc
) {
7326 int id
= crush
->get_item_id(i
.second
);
7327 auto p
= crush_node_flags
.find(id
);
7328 if (p
!= crush_node_flags
.end()) {
7336 unsigned OSDMap::get_crush_node_flags(int id
) const
7339 auto it
= crush_node_flags
.find(id
);
7340 if (it
!= crush_node_flags
.end())
7345 unsigned OSDMap::get_device_class_flags(int id
) const
7348 auto it
= device_class_flags
.find(id
);
7349 if (it
!= device_class_flags
.end())
7354 std::optional
<std::string
> OSDMap::pending_require_osd_release() const
7356 if (HAVE_FEATURE(get_up_osd_features(), SERVER_QUINCY
) &&
7357 require_osd_release
< ceph_release_t::quincy
) {
7360 if (HAVE_FEATURE(get_up_osd_features(), SERVER_PACIFIC
) &&
7361 require_osd_release
< ceph_release_t::pacific
) {
7364 if (HAVE_FEATURE(get_up_osd_features(), SERVER_OCTOPUS
) &&
7365 require_osd_release
< ceph_release_t::octopus
) {
7368 if (HAVE_FEATURE(get_up_osd_features(), SERVER_NAUTILUS
) &&
7369 require_osd_release
< ceph_release_t::nautilus
) {
7373 return std::nullopt
;