1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
22 #include <boost/algorithm/string.hpp>
25 #include "common/config.h"
26 #include "common/errno.h"
27 #include "common/Formatter.h"
28 #include "common/TextTable.h"
29 #include "include/ceph_features.h"
30 #include "include/common_fwd.h"
31 #include "include/str_map.h"
33 #include "common/code_environment.h"
34 #include "mon/health_check.h"
36 #include "crush/CrushTreeDumper.h"
37 #include "common/Clock.h"
38 #include "mon/PGMap.h"
45 using std::ostringstream
;
49 using std::stringstream
;
50 using std::unordered_map
;
55 using ceph::Formatter
;
57 #define dout_subsys ceph_subsys_osd
// Invoke MEMPOOL_DEFINE_OBJECT_FACTORY once for OSDMap and once for
// OSDMap::Incremental; both invocations pass `osdmap` as the last argument.
59 MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap
, osdmap
, osdmap
);
60 MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental
, osdmap_inc
, osdmap
);
63 // ----------------------------------
// Writes the six osd_info_t fields to the Formatter `f` as integers,
// each under a key equal to the member's name.
66 void osd_info_t::dump(Formatter
*f
) const
68 f
->dump_int("last_clean_begin", last_clean_begin
);
69 f
->dump_int("last_clean_end", last_clean_end
);
70 f
->dump_int("up_from", up_from
);
71 f
->dump_int("up_thru", up_thru
);
72 f
->dump_int("down_at", down_at
);
73 f
->dump_int("lost_at", lost_at
);
// Serializes this osd_info_t into `bl`. The statements visible here encode
// last_clean_begin followed by last_clean_end.
// NOTE(review): other lines of this body are not visible in this extract —
// confirm the full field order against the complete file.
76 void osd_info_t::encode(ceph::buffer::list
& bl
) const
81 encode(last_clean_begin
, bl
);
82 encode(last_clean_end
, bl
);
// Reads an osd_info_t from the buffer iterator `bl`. The statements visible
// here decode last_clean_begin followed by last_clean_end, mirroring the
// order used by encode().
// NOTE(review): other lines of this body are not visible in this extract.
89 void osd_info_t::decode(ceph::buffer::list::const_iterator
& bl
)
94 decode(last_clean_begin
, bl
);
95 decode(last_clean_end
, bl
);
// Appends two heap-allocated osd_info_t objects to `o`: the first is left
// default-constructed, the second has every field overwritten below.
102 void osd_info_t::generate_test_instances(list
<osd_info_t
*>& o
)
104 o
.push_back(new osd_info_t
);
105 o
.push_back(new osd_info_t
);
// o.back() is the second instance from here on.
106 o
.back()->last_clean_begin
= 1;
107 o
.back()->last_clean_end
= 2;
108 o
.back()->up_from
= 30;
109 o
.back()->up_thru
= 40;
110 o
.back()->down_at
= 5;
111 o
.back()->lost_at
= 6;
// Stream inserter for osd_info_t: prints up_from, up_thru, down_at and the
// last clean interval rendered as "[begin,end)", and appends lost_at.
// NOTE(review): whether the lost_at output is conditional, and the return
// statement, are on lines not visible in this extract.
114 ostream
& operator<<(ostream
& out
, const osd_info_t
& info
)
116 out
<< "up_from " << info
.up_from
117 << " up_thru " << info
.up_thru
118 << " down_at " << info
.down_at
119 << " last_clean_interval [" << info
.last_clean_begin
<< "," << info
.last_clean_end
<< ")";
121 out
<< " lost_at " << info
.lost_at
;
125 // ----------------------------------
// Writes the osd_xinfo_t fields to the Formatter `f`. The two timestamps
// (down_stamp, last_purged_snaps_scrub) go through dump_stream;
// laggy_probability is dumped as a float, old_weight as unsigned, and the
// remaining fields as ints.
128 void osd_xinfo_t::dump(Formatter
*f
) const
130 f
->dump_stream("down_stamp") << down_stamp
;
131 f
->dump_float("laggy_probability", laggy_probability
);
132 f
->dump_int("laggy_interval", laggy_interval
);
133 f
->dump_int("features", features
);
134 f
->dump_unsigned("old_weight", old_weight
);
135 f
->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub
;
136 f
->dump_int("dead_epoch", dead_epoch
);
// Serializes this osd_xinfo_t into `bl` inside an ENCODE_START(v, 1, bl)
// section. `enc_features` is checked for SERVER_OCTOPUS before the section
// is opened.
// NOTE(review): the declaration of `v`, the body of the SERVER_OCTOPUS
// branch, the encode of `lp`, and the section close are on lines not visible
// in this extract — presumably `v` is lowered for pre-octopus peers; confirm.
139 void osd_xinfo_t::encode(ceph::buffer::list
& bl
, uint64_t enc_features
) const
142 if (!HAVE_FEATURE(enc_features
, SERVER_OCTOPUS
)) {
145 ENCODE_START(v
, 1, bl
);
146 encode(down_stamp
, bl
);
// laggy_probability is scaled by 0xffffffff into a 32-bit integer;
// decode() divides by the same constant to recover it.
147 __u32 lp
= laggy_probability
* float(0xfffffffful
);
149 encode(laggy_interval
, bl
);
150 encode(features
, bl
);
151 encode(old_weight
, bl
);
153 encode(last_purged_snaps_scrub
, bl
);
154 encode(dead_epoch
, bl
);
// Reads an osd_xinfo_t from the buffer iterator `bl`, in the same field
// order that encode() writes.
// NOTE(review): the section start/finish, the decode of `lp`, and any
// version guards around the later fields are on lines not visible here.
159 void osd_xinfo_t::decode(ceph::buffer::list::const_iterator
& bl
)
162 decode(down_stamp
, bl
);
// Inverse of the scaling applied in encode(): map the 32-bit integer back
// to a float.
165 laggy_probability
= (float)lp
/ (float)0xffffffff;
166 decode(laggy_interval
, bl
);
168 decode(features
, bl
);
172 decode(old_weight
, bl
);
176 decode(last_purged_snaps_scrub
, bl
);
177 decode(dead_epoch
, bl
);
// Appends two heap-allocated osd_xinfo_t objects to `o`: the first is left
// default-constructed, the second has four fields overwritten below.
184 void osd_xinfo_t::generate_test_instances(list
<osd_xinfo_t
*>& o
)
186 o
.push_back(new osd_xinfo_t
);
187 o
.push_back(new osd_xinfo_t
);
// o.back() is the second instance from here on.
188 o
.back()->down_stamp
= utime_t(2, 3);
189 o
.back()->laggy_probability
= .123;
190 o
.back()->laggy_interval
= 123456;
191 o
.back()->old_weight
= 0x7fff;
// Stream inserter for osd_xinfo_t: prints down_stamp, laggy_probability,
// laggy_interval, old_weight, last_purged_snaps_scrub and dead_epoch, each
// preceded by its label, and returns the stream.
194 ostream
& operator<<(ostream
& out
, const osd_xinfo_t
& xi
)
196 return out
<< "down_stamp " << xi
.down_stamp
197 << " laggy_probability " << xi
.laggy_probability
198 << " laggy_interval " << xi
.laggy_interval
199 << " old_weight " << xi
.old_weight
200 << " last_purged_snaps_scrub " << xi
.last_purged_snaps_scrub
201 << " dead_epoch " << xi
.dead_epoch
;
204 // ----------------------------------
205 // OSDMap::Incremental
// Walks new_weight and compares each entry against `previous`:
//  - first branch: the new weight is CEPH_OSD_OUT and `previous` does not
//    report that OSD as out;
//  - second branch: the new weight is not CEPH_OSD_OUT and `previous` does
//    report that OSD as out.
// NOTE(review): the statements under each branch and the return are not
// visible in this extract — presumably a counter is incremented/decremented
// and returned; confirm.
207 int OSDMap::Incremental::get_net_marked_out(const OSDMap
*previous
) const
210 for (auto &weight
: new_weight
) {
211 if (weight
.second
== CEPH_OSD_OUT
&& !previous
->is_out(weight
.first
))
213 else if (weight
.second
!= CEPH_OSD_OUT
&& previous
->is_out(weight
.first
))
// Walks new_state; for entries whose value has the CEPH_OSD_UP bit set, it
// checks whether `previous` reports that OSD as up.
// NOTE(review): the statements under the inner branch, any else branch, and
// the return are not visible in this extract.
219 int OSDMap::Incremental::get_net_marked_down(const OSDMap
*previous
) const
222 for (auto &state
: new_state
) { //
223 if (state
.second
& CEPH_OSD_UP
) {
224 if (previous
->is_up(state
.first
))
// Scans new_uuid for an entry whose uuid equals `u`.
// NOTE(review): the return statements are not visible in this extract —
// presumably the matching key is returned on a hit; confirm the miss value.
233 int OSDMap::Incremental::identify_osd(uuid_d u
) const
235 for (auto &uuid
: new_uuid
)
236 if (uuid
.second
== u
)
// For every pool in new_pools that has a non-empty `tiers` set, copies the
// base pool's snapshot-related state and application metadata onto each of
// its tier pools.
//   cct     context used for logging (lderr/ldout)
//   osdmap  the map this incremental applies to; must be exactly one epoch
//           behind this incremental (asserted below)
// NOTE(review): the declaration of `tier`, the branch taken when the tier is
// already in new_pools, and the return statements are on lines not visible in
// this extract.
241 int OSDMap::Incremental::propagate_base_properties_to_tiers(CephContext
*cct
,
242 const OSDMap
& osdmap
)
244 ceph_assert(epoch
== osdmap
.get_epoch() + 1);
246 for (auto &new_pool
: new_pools
) {
247 if (!new_pool
.second
.tiers
.empty()) {
248 pg_pool_t
& base
= new_pool
.second
;
// Look up the base pool's entry in new_removed_snaps once; it is copied to
// each tier further down if present.
250 auto new_rem_it
= new_removed_snaps
.find(new_pool
.first
);
252 for (const auto &tier_pool
: base
.tiers
) {
253 const auto &r
= new_pools
.find(tier_pool
);
// Tier not yet part of this incremental: fetch it from the existing map and
// add it via get_new_pool().
255 if (r
== new_pools
.end()) {
256 const pg_pool_t
*orig
= osdmap
.get_pg_pool(tier_pool
);
258 lderr(cct
) << __func__
<< " no pool " << tier_pool
<< dendl
;
261 tier
= get_new_pool(tier_pool
, orig
);
// Sanity check: the tier must point back at this base pool.
265 if (tier
->tier_of
!= new_pool
.first
) {
266 lderr(cct
) << __func__
<< " " << r
->first
<< " tier_of != " << new_pool
.first
<< dendl
;
270 ldout(cct
, 10) << __func__
<< " from " << new_pool
.first
<< " to "
271 << tier_pool
<< dendl
;
272 tier
->snap_seq
= base
.snap_seq
;
273 tier
->snap_epoch
= base
.snap_epoch
;
274 tier
->snaps
= base
.snaps
;
275 tier
->removed_snaps
= base
.removed_snaps
;
// Only the two snapshot-mode flags are OR-ed in from the base pool.
276 tier
->flags
|= base
.flags
& (pg_pool_t::FLAG_SELFMANAGED_SNAPS
|
277 pg_pool_t::FLAG_POOL_SNAPS
);
279 if (new_rem_it
!= new_removed_snaps
.end()) {
280 new_removed_snaps
[tier_pool
] = new_rem_it
->second
;
283 tier
->application_metadata
= base
.application_metadata
;
290 // ----------------------------------
// Recursive check over the crush hierarchy rooted at `id`: consults
// `down_cache` for `id`, fetches the children of `id` from crush, recurses
// into each child, and records `id` in `down_cache`.
// NOTE(review): the leaf (OSD) case, the null check on down_cache, and the
// return statements are on lines not visible in this extract.
293 bool OSDMap::subtree_is_down(int id
, set
<int> *down_cache
) const
299 down_cache
->count(id
)) {
304 crush
->get_children(id
, &children
);
305 for (const auto &child
: children
) {
306 if (!subtree_is_down(child
, down_cache
)) {
311 down_cache
->insert(id
);
// Climbs from `id` towards the crush root via get_immediate_parent_id(),
// calling subtree_is_down() on each node visited and comparing the node's
// bucket type against `subtree_type`. Logs at level 30 when a node is found
// not down ("= false") or when a down node's type is >= subtree_type
// ("= true").
// NOTE(review): the loop construct, the declaration of `current`/`type`, and
// the return statements are on lines not visible in this extract.
316 bool OSDMap::containing_subtree_is_down(CephContext
*cct
, int id
, int subtree_type
, set
<int> *down_cache
) const
318 // use a stack-local down_cache if we didn't get one from the
319 // caller. then at least this particular call will avoid duplicated
321 set
<int> local_down_cache
;
323 down_cache
= &local_down_cache
;
332 type
= crush
->get_bucket_type(current
);
334 ceph_assert(type
>= 0);
336 if (!subtree_is_down(current
, down_cache
)) {
337 ldout(cct
, 30) << "containing_subtree_is_down(" << id
<< ") = false" << dendl
;
341 // is this a big enough subtree to be marked as down?
342 if (type
>= subtree_type
) {
343 ldout(cct
, 30) << "containing_subtree_is_down(" << id
<< ") = true ... " << type
<< " >= " << subtree_type
<< dendl
;
// NOTE(review): the second argument below appears mis-encoded in this
// extract (looks like an HTML-entity artifact of `&current`) — verify.
347 int r
= crush
->get_immediate_parent_id(current
, ¤t
);
// Recursive down-check that also fills the caller's bookkeeping sets:
//   down_in_osds / up_in_osds  receive `id` in the OSD-level part of the body
//   subtree_up                 receives `id` when some child is not down
//   subtree_type_down          per-bucket-type cache of ids found down;
//                              consulted before recursing, updated after
// NOTE(review): the leading parameters (cct, id, subtree_type — all used
// below), the conditions selecting down_in_osds vs up_in_osds, and the
// return statements are on lines not visible in this extract.
354 bool OSDMap::subtree_type_is_down(
358 set
<int> *down_in_osds
,
359 set
<int> *up_in_osds
,
360 set
<int> *subtree_up
,
361 unordered_map
<int, set
<int> > *subtree_type_down
) const
364 bool is_down_ret
= is_down(id
);
367 down_in_osds
->insert(id
);
369 up_in_osds
->insert(id
);
// Cache probe: has this id already been recorded as down for this type?
375 if (subtree_type_down
&&
376 (*subtree_type_down
)[subtree_type
].count(id
)) {
381 crush
->get_children(id
, &children
);
382 for (const auto &child
: children
) {
383 if (!subtree_type_is_down(
384 cct
, child
, crush
->get_bucket_type(child
),
385 down_in_osds
, up_in_osds
, subtree_up
, subtree_type_down
)) {
386 subtree_up
->insert(id
);
390 if (subtree_type_down
) {
391 (*subtree_type_down
)[subtree_type
].insert(id
);
// Encodes this Incremental into `bl` in the oldest client format; it is
// called from encode_classic() when CEPH_FEATURE_PGID64 is absent. Several
// maps are written by hand (element count `n`, then each entry) rather than
// through the generic container encoder, and pool/addr encoders are called
// with a feature mask of 0.
// NOTE(review): the version header, the per-entry key encodes, and the
// encode(n, bl) calls are on lines not visible in this extract.
396 void OSDMap::Incremental::encode_client_old(ceph::buffer::list
& bl
) const
403 encode(modified
, bl
);
// new_pool_max is narrowed to 32 bits for this format.
404 int32_t new_t
= new_pool_max
;
406 encode(new_flags
, bl
);
410 encode(new_max_osd
, bl
);
411 // for encode(new_pools, bl);
412 __u32 n
= new_pools
.size();
414 for (const auto &new_pool
: new_pools
) {
417 encode(new_pool
.second
, bl
, 0);
419 // for encode(new_pool_names, bl);
420 n
= new_pool_names
.size();
423 for (const auto &new_pool_name
: new_pool_names
) {
// `n` is reused here to hold the pool id (the map key) for this entry.
424 n
= new_pool_name
.first
;
426 encode(new_pool_name
.second
, bl
);
428 // for encode(old_pools, bl);
429 n
= old_pools
.size();
431 for (auto &old_pool
: old_pools
) {
435 encode(new_up_client
, bl
, 0);
437 // legacy is map<int32_t,uint8_t>
438 map
<int32_t, uint8_t> os
;
439 for (auto p
: new_state
) {
440 // new_state may only include some new flags (e.g., CEPH_OSD_NOOUT)
441 // that an old client could not understand.
// Truncate the state to 8 bits; the test below detects a nonzero state
// whose low 8 bits are all zero.
443 uint8_t s
= p
.second
;
444 if (p
.second
!= 0 && s
== 0)
448 uint32_t n
= os
.size();
452 encode(p
.second
, bl
);
455 encode(new_weight
, bl
);
456 // for encode(new_pg_temp, bl);
457 n
= new_pg_temp
.size();
460 for (const auto &pg_temp
: new_pg_temp
) {
// Keys are converted to the old pg id representation for this format.
461 old_pg_t opg
= pg_temp
.first
.get_old_pg();
463 encode(pg_temp
.second
, bl
);
// Encodes this Incremental into `bl` in the pre-OSDMAP_ENC ("classic")
// layout. If `features` lacks CEPH_FEATURE_PGID64 the work is delegated to
// encode_client_old(). Otherwise the base fields are written first, followed
// by the extended fields starting at new_hb_back_up.
// NOTE(review): the version headers, the early return after the delegation,
// and the hand-written encode of the `os` map are on lines not visible in
// this extract.
467 void OSDMap::Incremental::encode_classic(ceph::buffer::list
& bl
, uint64_t features
) const
470 if ((features
& CEPH_FEATURE_PGID64
) == 0) {
471 encode_client_old(bl
);
480 encode(modified
, bl
);
481 encode(new_pool_max
, bl
);
482 encode(new_flags
, bl
);
486 encode(new_max_osd
, bl
);
487 encode(new_pools
, bl
, features
);
488 encode(new_pool_names
, bl
);
489 encode(old_pools
, bl
);
490 encode(new_up_client
, bl
, features
);
// Legacy state map with 8-bit values, built from new_state.
492 map
<int32_t, uint8_t> os
;
493 for (auto p
: new_state
) {
494 // new_state may only include some new flags (e.g., CEPH_OSD_NOOUT)
495 // that an old client could not understand.
// Truncate the state to 8 bits; the test below detects a nonzero state
// whose low 8 bits are all zero.
497 uint8_t s
= p
.second
;
498 if (p
.second
!= 0 && s
== 0)
502 uint32_t n
= os
.size();
506 encode(p
.second
, bl
);
509 encode(new_weight
, bl
);
510 encode(new_pg_temp
, bl
);
// Extended fields follow.
515 encode(new_hb_back_up
, bl
, features
);
516 encode(new_up_thru
, bl
);
517 encode(new_last_clean_interval
, bl
);
518 encode(new_lost
, bl
);
519 encode(new_blocklist
, bl
, features
);
520 encode(old_blocklist
, bl
, features
);
521 encode(new_up_cluster
, bl
, features
);
522 encode(cluster_snapshot
, bl
);
523 encode(new_uuid
, bl
);
524 encode(new_xinfo
, bl
, features
);
525 encode(new_hb_front_up
, bl
, features
);
// File-local helper: encodes a map `m` whose values expose legacy_addr() by
// writing, for each entry `i`, i.second.legacy_addr() with feature mask `f`.
// `n` holds the element count.
// NOTE(review): the template header, the encode of `n` and of each key, and
// the loop header are on lines not visible in this extract.
529 static void encode_addrvec_map_as_addr(const T
& m
, ceph::buffer::list
& bl
, uint64_t f
)
531 uint32_t n
= m
.size();
535 encode(i
.second
.legacy_addr(), bl
, f
);
// File-local helper: encodes a sequence `m` of pointer-like elements. Two
// encodes are visible: the element's legacy_addr(), and a default-constructed
// entity_addr_t. `n` holds the element count.
// NOTE(review): the template header, loop header, and the condition choosing
// between the two encodes are on lines not visible in this extract —
// presumably the empty address is written for null elements; confirm.
540 static void encode_addrvec_pvec_as_addr(const T
& m
, ceph::buffer::list
& bl
, uint64_t f
)
542 uint32_t n
= m
.size();
546 encode(i
->legacy_addr(), bl
, f
);
548 encode(entity_addr_t(), bl
, f
);
553 /* for a description of osdmap incremental versions, and when they were
554 * introduced, please refer to
555 * doc/dev/osd_internals/osdmap_versions.txt
// Encodes this Incremental into `bl`.
//  - Without CEPH_FEATURE_OSDMAP_ENC in `features`, delegates to
//    encode_classic().
//  - Otherwise the caller must have set CEPH_FEATURE_RESERVED (asserted, then
//    cleared), and the output is a wrapper section (8, 7) containing:
//      1. a client-usable section (version `v`),
//      2. an osd-only section (version `target_v`),
//      3. a uint32-sized hole reserved for the incremental's crc,
//      4. full_crc.
//    After the wrapper is closed, inc_crc is computed over the bytes before
//    and after the hole and copied into the hole.
// NOTE(review): the declaration/assignment of `v`, the version-gated `if`
// conditions around many of the encodes below, and the declarations of
// crc_offset/tail_offset/crc_le are on lines not visible in this extract.
557 void OSDMap::Incremental::encode(ceph::buffer::list
& bl
, uint64_t features
) const
560 if ((features
& CEPH_FEATURE_OSDMAP_ENC
) == 0) {
561 encode_classic(bl
, features
);
565 // only a select set of callers should *ever* be encoding new
566 // OSDMaps. others should be passing around the canonical encoded
567 // buffers from on high. select out those callers by passing in an
568 // "impossible" feature bit.
569 ceph_assert(features
& CEPH_FEATURE_RESERVED
);
570 features
&= ~CEPH_FEATURE_RESERVED
;
// Remember where this encoding starts so the crc can cover exactly our bytes.
572 size_t start_offset
= bl
.length();
575 std::optional
<ceph::buffer::list::contiguous_filler
> crc_filler
;
577 // meta-encoding: how we include client-used and osd-specific data
578 ENCODE_START(8, 7, bl
);
// Feature ladder (LUMINOUS / MIMIC / NAUTILUS) checked before the client
// section is opened with version `v`.
582 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
584 } else if (!HAVE_FEATURE(features
, SERVER_MIMIC
)) {
586 } else if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
589 ENCODE_START(v
, 1, bl
); // client-usable data
592 encode(modified
, bl
);
593 encode(new_pool_max
, bl
);
594 encode(new_flags
, bl
);
598 encode(new_max_osd
, bl
);
599 encode(new_pools
, bl
, features
);
600 encode(new_pool_names
, bl
);
601 encode(old_pools
, bl
);
// new_up_client has two visible encodings: full addrvecs, or legacy
// single addresses via encode_addrvec_map_as_addr.
603 encode(new_up_client
, bl
, features
);
605 encode_addrvec_map_as_addr(new_up_client
, bl
, features
);
608 encode(new_state
, bl
);
// Alternative legacy encoding of new_state with 8-bit values.
610 map
<int32_t, uint8_t> os
;
611 for (auto p
: new_state
) {
612 // new_state may only include some new flags (e.g., CEPH_OSD_NOOUT)
613 // that an old client could not understand.
615 uint8_t s
= p
.second
;
616 if (p
.second
!= 0 && s
== 0)
620 uint32_t n
= os
.size();
624 encode(p
.second
, bl
);
627 encode(new_weight
, bl
);
628 encode(new_pg_temp
, bl
);
629 encode(new_primary_temp
, bl
);
630 encode(new_primary_affinity
, bl
);
631 encode(new_erasure_code_profiles
, bl
);
632 encode(old_erasure_code_profiles
, bl
);
634 encode(new_pg_upmap
, bl
);
635 encode(old_pg_upmap
, bl
);
636 encode(new_pg_upmap_items
, bl
);
637 encode(old_pg_upmap_items
, bl
);
640 encode(new_removed_snaps
, bl
);
641 encode(new_purged_snaps
, bl
);
644 encode(new_last_up_change
, bl
);
645 encode(new_last_in_change
, bl
);
647 ENCODE_FINISH(bl
); // client-usable data
651 uint8_t target_v
= 9; // if bumping this, be aware of stretch_mode target_v 10!
652 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
654 } else if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
// A stretch-mode change forces the osd-only section to at least version 10.
657 if (change_stretch_mode
) {
658 target_v
= std::max((uint8_t)10, target_v
);
660 ENCODE_START(target_v
, 1, bl
); // extended, osd-only data
662 encode_addrvec_map_as_addr(new_hb_back_up
, bl
, features
);
664 encode(new_hb_back_up
, bl
, features
);
666 encode(new_up_thru
, bl
);
667 encode(new_last_clean_interval
, bl
);
668 encode(new_lost
, bl
);
669 encode(new_blocklist
, bl
, features
);
670 encode(old_blocklist
, bl
, features
);
672 encode_addrvec_map_as_addr(new_up_cluster
, bl
, features
);
674 encode(new_up_cluster
, bl
, features
);
676 encode(cluster_snapshot
, bl
);
677 encode(new_uuid
, bl
);
678 encode(new_xinfo
, bl
, features
);
680 encode_addrvec_map_as_addr(new_hb_front_up
, bl
, features
);
682 encode(new_hb_front_up
, bl
, features
);
684 encode(features
, bl
); // NOTE: features arg, not the member
686 encode(new_nearfull_ratio
, bl
);
687 encode(new_full_ratio
, bl
);
688 encode(new_backfillfull_ratio
, bl
);
690 // 5 was string-based new_require_min_compat_client
692 encode(new_require_min_compat_client
, bl
);
693 encode(new_require_osd_release
, bl
);
696 encode(new_crush_node_flags
, bl
);
699 encode(new_device_class_flags
, bl
);
// Stretch-mode fields exist only from osd-only section version 10 on.
701 if (target_v
>= 10) {
702 encode(change_stretch_mode
, bl
);
703 encode(new_stretch_bucket_count
, bl
);
704 encode(new_degraded_stretch_mode
, bl
);
705 encode(new_recovering_stretch_mode
, bl
);
706 encode(new_stretch_mode_bucket
, bl
);
707 encode(stretch_mode_enabled
, bl
);
709 ENCODE_FINISH(bl
); // osd-only data
// Reserve 4 bytes for the crc; it is filled in after the wrapper is closed.
712 crc_offset
= bl
.length();
713 crc_filler
= bl
.append_hole(sizeof(uint32_t));
714 tail_offset
= bl
.length();
716 encode(full_crc
, bl
);
718 ENCODE_FINISH(bl
); // meta-encoding wrapper
// crc32c over [start_offset, crc_offset) seeded with -1, then continued over
// everything after the hole, so the hole itself is excluded.
721 ceph::buffer::list front
;
722 front
.substr_of(bl
, start_offset
, crc_offset
- start_offset
);
723 inc_crc
= front
.crc32c(-1);
724 ceph::buffer::list tail
;
725 tail
.substr_of(bl
, tail_offset
, bl
.length() - tail_offset
);
726 inc_crc
= tail
.crc32c(inc_crc
);
729 crc_filler
->copy_in(4u, (char*)&crc_le
);
// Decodes the pre-OSDMAP_ENC ("classic") Incremental layout from `p`.
// Several fields have two visible decode forms (an element-by-element form
// indexed by a temporary key, and a whole-container form).
// NOTE(review): the version read into `v`, the conditions selecting between
// the two forms, the loop headers, and the version guards around the
// extended fields are on lines not visible in this extract.
733 void OSDMap::Incremental::decode_classic(ceph::buffer::list::const_iterator
&p
)
743 if (v
== 4 || v
== 5) {
747 decode(new_pool_max
, p
);
748 decode(new_flags
, p
);
752 decode(new_max_osd
, p
);
758 decode(new_pools
[t
], p
);
761 decode(new_pools
, p
);
764 new_pool_names
.clear();
768 decode(new_pool_names
[t
], p
);
771 decode(new_pool_names
, p
);
781 decode(old_pools
, p
);
783 decode(new_up_client
, p
);
// The legacy 8-bit state map is decoded into `ns` and copied entry by entry
// into new_state.
785 map
<int32_t,uint8_t> ns
;
788 new_state
[q
.first
] = q
.second
;
791 decode(new_weight
, p
);
// Old-format pg ids are read raw and converted to pg_t keys.
798 ceph::decode_raw(opg
, p
);
799 decode(new_pg_temp
[pg_t(opg
)], p
);
802 decode(new_pg_temp
, p
);
805 // decode short map, too.
806 if (v
== 5 && p
.end())
813 decode(new_hb_back_up
, p
);
815 decode(new_pool_names
, p
);
816 decode(new_up_thru
, p
);
817 decode(new_last_clean_interval
, p
);
819 decode(new_blocklist
, p
);
820 decode(old_blocklist
, p
);
822 decode(new_up_cluster
, p
);
824 decode(cluster_snapshot
, p
);
828 decode(new_xinfo
, p
);
830 decode(new_hb_front_up
, p
);
833 /* for a description of osdmap incremental versions, and when they were
834 * introduced, please refer to
835 * doc/dev/osd_internals/osdmap_versions.txt
// Decodes an Incremental from `bl`. The modern layout is a wrapper section
// containing a client-usable section, an osd-only section, the incremental's
// crc and full_crc. After decoding, a crc32c is recomputed over the decoded
// bytes (excluding the crc field itself) and compared with inc_crc; a
// mismatch throws ceph::buffer::malformed_input.
// NOTE(review): the struct_v guards around most field decodes, the fallback
// to the legacy decoder after the rewind, and the decode of inc_crc are on
// lines not visible in this extract.
837 void OSDMap::Incremental::decode(ceph::buffer::list::const_iterator
& bl
)
841 * Older encodings of the Incremental had a single struct_v which
842 * covered the whole encoding, and was prior to our modern
843 * stuff which includes a compatv and a size. So if we see
844 * a struct_v < 7, we must rewind to the beginning and use our
847 size_t start_offset
= bl
.get_off();
848 size_t tail_offset
= 0;
849 ceph::buffer::list crc_front
, crc_tail
;
851 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl
); // wrapper
// Rewind to where this Incremental started (legacy path described above).
853 bl
.seek(start_offset
);
857 encode_features
= CEPH_FEATURE_PGID64
;
863 DECODE_START(8, bl
); // client-usable data
866 decode(modified
, bl
);
867 decode(new_pool_max
, bl
);
868 decode(new_flags
, bl
);
872 decode(new_max_osd
, bl
);
873 decode(new_pools
, bl
);
874 decode(new_pool_names
, bl
);
875 decode(old_pools
, bl
);
876 decode(new_up_client
, bl
);
// new_state has two visible forms: decoded directly, or via the legacy
// 8-bit map `ns` copied entry by entry.
878 decode(new_state
, bl
);
880 map
<int32_t,uint8_t> ns
;
883 new_state
[q
.first
] = q
.second
;
886 decode(new_weight
, bl
);
887 decode(new_pg_temp
, bl
);
888 decode(new_primary_temp
, bl
);
// Fields below are either decoded or reset to empty.
890 decode(new_primary_affinity
, bl
);
892 new_primary_affinity
.clear();
894 decode(new_erasure_code_profiles
, bl
);
895 decode(old_erasure_code_profiles
, bl
);
897 new_erasure_code_profiles
.clear();
898 old_erasure_code_profiles
.clear();
901 decode(new_pg_upmap
, bl
);
902 decode(old_pg_upmap
, bl
);
903 decode(new_pg_upmap_items
, bl
);
904 decode(old_pg_upmap_items
, bl
);
907 decode(new_removed_snaps
, bl
);
908 decode(new_purged_snaps
, bl
);
911 decode(new_last_up_change
, bl
);
912 decode(new_last_in_change
, bl
);
914 DECODE_FINISH(bl
); // client-usable data
918 DECODE_START(10, bl
); // extended, osd-only data
919 decode(new_hb_back_up
, bl
);
920 decode(new_up_thru
, bl
);
921 decode(new_last_clean_interval
, bl
);
922 decode(new_lost
, bl
);
923 decode(new_blocklist
, bl
);
924 decode(old_blocklist
, bl
);
925 decode(new_up_cluster
, bl
);
926 decode(cluster_snapshot
, bl
);
927 decode(new_uuid
, bl
);
928 decode(new_xinfo
, bl
);
929 decode(new_hb_front_up
, bl
);
// encode_features is either read from the stream or set to a fixed default.
931 decode(encode_features
, bl
);
933 encode_features
= CEPH_FEATURE_PGID64
| CEPH_FEATURE_OSDMAP_ENC
;
// Ratios are either decoded or set to -1.
935 decode(new_nearfull_ratio
, bl
);
936 decode(new_full_ratio
, bl
);
938 new_nearfull_ratio
= -1;
942 decode(new_backfillfull_ratio
, bl
);
944 new_backfillfull_ratio
= -1;
// String-based form of new_require_min_compat_client, converted by name.
950 new_require_min_compat_client
= ceph_release_from_name(r
);
954 decode(new_require_min_compat_client
, bl
);
955 decode(new_require_osd_release
, bl
);
// Fallback: derive new_require_osd_release from legacy REQUIRE_* bits in
// new_flags (only when new_flags is non-negative).
957 if (new_flags
>= 0 && (new_flags
& CEPH_OSDMAP_REQUIRE_LUMINOUS
)) {
958 // only for compat with post-kraken pre-luminous test clusters
959 new_require_osd_release
= ceph_release_t::luminous
;
960 new_flags
&= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS
);
961 } else if (new_flags
>= 0 && (new_flags
& CEPH_OSDMAP_REQUIRE_KRAKEN
)) {
962 new_require_osd_release
= ceph_release_t::kraken
;
963 } else if (new_flags
>= 0 && (new_flags
& CEPH_OSDMAP_REQUIRE_JEWEL
)) {
964 new_require_osd_release
= ceph_release_t::jewel
;
966 new_require_osd_release
= ceph_release_t::unknown
;
970 decode(new_crush_node_flags
, bl
);
973 decode(new_device_class_flags
, bl
);
// Stretch-mode fields exist only from osd-only section version 10 on.
975 if (struct_v
>= 10) {
976 decode(change_stretch_mode
, bl
);
977 decode(new_stretch_bucket_count
, bl
);
978 decode(new_degraded_stretch_mode
, bl
);
979 decode(new_recovering_stretch_mode
, bl
);
980 decode(new_stretch_mode_bucket
, bl
);
981 decode(stretch_mode_enabled
, bl
);
984 DECODE_FINISH(bl
); // osd-only data
// Capture the bytes decoded so far for crc verification.
989 crc_front
.substr_of(bl
.get_bl(), start_offset
, bl
.get_off() - start_offset
);
991 tail_offset
= bl
.get_off();
992 decode(full_crc
, bl
);
999 DECODE_FINISH(bl
); // wrapper
// Recompute the crc: front part seeded with -1, then continued over any
// bytes decoded after tail_offset.
1003 uint32_t actual
= crc_front
.crc32c(-1);
1004 if (tail_offset
< bl
.get_off()) {
1005 ceph::buffer::list tail
;
1006 tail
.substr_of(bl
.get_bl(), tail_offset
, bl
.get_off() - tail_offset
);
1007 actual
= tail
.crc32c(actual
);
1009 if (inc_crc
!= actual
) {
1011 ss
<< "bad crc, actual " << actual
<< " != expected " << inc_crc
;
1012 string s
= ss
.str();
1013 throw ceph::buffer::malformed_input(s
.c_str());
// Writes this Incremental to the Formatter `f`: scalar fields first, then
// the optional embedded full map and crush map, then one array section per
// container member (pools, up OSDs, weights, state changes, pg_temp, upmaps,
// blocklist, xinfo, uuids, erasure-code profiles, snaps, flags), and finally
// a "stretch_mode" object.
// NOTE(review): the close_section() calls (all but one), the decoding and
// dumping of the embedded full/crush maps, and a few local declarations
// (`st`, `ss`) are on lines not visible in this extract.
1018 void OSDMap::Incremental::dump(Formatter
*f
) const
1020 f
->dump_int("epoch", epoch
);
1021 f
->dump_stream("fsid") << fsid
;
1022 f
->dump_stream("modified") << modified
;
1023 f
->dump_stream("new_last_up_change") << new_last_up_change
;
1024 f
->dump_stream("new_last_in_change") << new_last_in_change
;
1025 f
->dump_int("new_pool_max", new_pool_max
);
1026 f
->dump_int("new_flags", new_flags
);
1027 f
->dump_float("new_full_ratio", new_full_ratio
);
1028 f
->dump_float("new_nearfull_ratio", new_nearfull_ratio
);
1029 f
->dump_float("new_backfillfull_ratio", new_backfillfull_ratio
);
1030 f
->dump_int("new_require_min_compat_client", to_integer
<int>(new_require_min_compat_client
));
1031 f
->dump_int("new_require_osd_release", to_integer
<int>(new_require_osd_release
));
// Embedded full map, only when present. A non-const copy of the buffer is
// taken because this method is const.
1033 if (fullmap
.length()) {
1034 f
->open_object_section("full_map");
1036 ceph::buffer::list fbl
= fullmap
; // kludge around constness.
1037 auto p
= fbl
.cbegin();
// Embedded crush map, handled the same way.
1042 if (crush
.length()) {
1043 f
->open_object_section("crush");
1045 ceph::buffer::list tbl
= crush
; // kludge around constness.
1046 auto p
= tbl
.cbegin();
1052 f
->dump_int("new_max_osd", new_max_osd
);
1054 f
->open_array_section("new_pools");
1056 for (const auto &new_pool
: new_pools
) {
1057 f
->open_object_section("pool");
1058 f
->dump_int("pool", new_pool
.first
);
1059 new_pool
.second
.dump(f
);
1063 f
->open_array_section("new_pool_names");
1065 for (const auto &new_pool_name
: new_pool_names
) {
1066 f
->open_object_section("pool_name");
1067 f
->dump_int("pool", new_pool_name
.first
);
1068 f
->dump_string("name", new_pool_name
.second
);
1072 f
->open_array_section("old_pools");
1074 for (const auto &old_pool
: old_pools
)
1075 f
->dump_int("pool", old_pool
);
// For each newly-up OSD, dump its public addresses and, when the OSD also
// appears in the cluster / heartbeat maps, those addresses too.
1078 f
->open_array_section("new_up_osds");
1080 for (const auto &upclient
: new_up_client
) {
1081 f
->open_object_section("osd");
1082 f
->dump_int("osd", upclient
.first
);
1083 f
->dump_stream("public_addr") << upclient
.second
.legacy_addr();
1084 f
->dump_object("public_addrs", upclient
.second
);
1085 if (auto p
= new_up_cluster
.find(upclient
.first
);
1086 p
!= new_up_cluster
.end()) {
1087 f
->dump_stream("cluster_addr") << p
->second
.legacy_addr();
1088 f
->dump_object("cluster_addrs", p
->second
);
1090 if (auto p
= new_hb_back_up
.find(upclient
.first
);
1091 p
!= new_hb_back_up
.end()) {
1092 f
->dump_object("heartbeat_back_addrs", p
->second
);
1094 if (auto p
= new_hb_front_up
.find(upclient
.first
);
1095 p
!= new_hb_front_up
.end()) {
1096 f
->dump_object("heartbeat_front_addrs", p
->second
);
1102 f
->open_array_section("new_weight");
1104 for (const auto &weight
: new_weight
) {
1105 f
->open_object_section("osd");
1106 f
->dump_int("osd", weight
.first
);
1107 f
->dump_int("weight", weight
.second
);
// new_state is dumped under "osd_state_xor"; each value is expanded into a
// set of state names via calc_state_set().
1112 f
->open_array_section("osd_state_xor");
1113 for (const auto &ns
: new_state
) {
1114 f
->open_object_section("osd");
1115 f
->dump_int("osd", ns
.first
);
1117 calc_state_set(new_state
.find(ns
.first
)->second
, st
);
1118 f
->open_array_section("state_xor");
1119 for (auto &state
: st
)
1120 f
->dump_string("state", state
);
1126 f
->open_array_section("new_pg_temp");
1128 for (const auto &pg_temp
: new_pg_temp
) {
1129 f
->open_object_section("pg");
1130 f
->dump_stream("pgid") << pg_temp
.first
;
1131 f
->open_array_section("osds");
1133 for (const auto &osd
: pg_temp
.second
)
1134 f
->dump_int("osd", osd
);
1140 f
->open_array_section("primary_temp");
1142 for (const auto &primary_temp
: new_primary_temp
) {
1143 f
->dump_stream("pgid") << primary_temp
.first
;
1144 f
->dump_int("osd", primary_temp
.second
);
1146 f
->close_section(); // primary_temp
1148 f
->open_array_section("new_pg_upmap");
1149 for (auto& i
: new_pg_upmap
) {
1150 f
->open_object_section("mapping");
1151 f
->dump_stream("pgid") << i
.first
;
1152 f
->open_array_section("osds");
1153 for (auto osd
: i
.second
) {
1154 f
->dump_int("osd", osd
);
1160 f
->open_array_section("old_pg_upmap");
1161 for (auto& i
: old_pg_upmap
) {
1162 f
->dump_stream("pgid") << i
;
1166 f
->open_array_section("new_pg_upmap_items");
1167 for (auto& i
: new_pg_upmap_items
) {
1168 f
->open_object_section("mapping");
1169 f
->dump_stream("pgid") << i
.first
;
1170 f
->open_array_section("mappings");
1171 for (auto& p
: i
.second
) {
1172 f
->open_object_section("mapping");
1173 f
->dump_int("from", p
.first
);
1174 f
->dump_int("to", p
.second
);
1181 f
->open_array_section("old_pg_upmap_items");
1182 for (auto& i
: old_pg_upmap_items
) {
1183 f
->dump_stream("pgid") << i
;
1187 f
->open_array_section("new_up_thru");
1189 for (const auto &up_thru
: new_up_thru
) {
1190 f
->open_object_section("osd");
1191 f
->dump_int("osd", up_thru
.first
);
1192 f
->dump_int("up_thru", up_thru
.second
);
1197 f
->open_array_section("new_lost");
1199 for (const auto &lost
: new_lost
) {
1200 f
->open_object_section("osd");
1201 f
->dump_int("osd", lost
.first
);
1202 f
->dump_int("epoch_lost", lost
.second
);
1207 f
->open_array_section("new_last_clean_interval");
1209 for (const auto &last_clean_interval
: new_last_clean_interval
) {
1210 f
->open_object_section("osd");
1211 f
->dump_int("osd", last_clean_interval
.first
);
1212 f
->dump_int("first", last_clean_interval
.second
.first
);
1213 f
->dump_int("last", last_clean_interval
.second
.second
);
// Each new blocklist entry is dumped with a key built in the stringstream
// `ss` and the entry's mapped value as the content.
1218 f
->open_array_section("new_blocklist");
1219 for (const auto &blist
: new_blocklist
) {
1222 f
->dump_stream(ss
.str().c_str()) << blist
.second
;
1225 f
->open_array_section("old_blocklist");
1226 for (const auto &blist
: old_blocklist
)
1227 f
->dump_stream("addr") << blist
;
1230 f
->open_array_section("new_xinfo");
1231 for (const auto &xinfo
: new_xinfo
) {
1232 f
->open_object_section("xinfo");
1233 f
->dump_int("osd", xinfo
.first
);
1234 xinfo
.second
.dump(f
);
1239 if (cluster_snapshot
.size())
1240 f
->dump_string("cluster_snapshot", cluster_snapshot
);
1242 f
->open_array_section("new_uuid");
1243 for (const auto &uuid
: new_uuid
) {
1244 f
->open_object_section("osd");
1245 f
->dump_int("osd", uuid
.first
);
1246 f
->dump_stream("uuid") << uuid
.second
;
1251 OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles
, f
);
1252 f
->open_array_section("old_erasure_code_profiles");
1253 for (const auto &erasure_code_profile
: old_erasure_code_profiles
) {
1254 f
->dump_string("old", erasure_code_profile
);
// Removed / purged snaps are dumped per pool as (begin, length) intervals.
1258 f
->open_array_section("new_removed_snaps");
1259 for (auto& p
: new_removed_snaps
) {
1260 f
->open_object_section("pool");
1261 f
->dump_int("pool", p
.first
);
1262 f
->open_array_section("snaps");
1263 for (auto q
= p
.second
.begin(); q
!= p
.second
.end(); ++q
) {
1264 f
->open_object_section("interval");
1265 f
->dump_unsigned("begin", q
.get_start());
1266 f
->dump_unsigned("length", q
.get_len());
1273 f
->open_array_section("new_purged_snaps");
1274 for (auto& p
: new_purged_snaps
) {
1275 f
->open_object_section("pool");
1276 f
->dump_int("pool", p
.first
);
1277 f
->open_array_section("snaps");
1278 for (auto q
= p
.second
.begin(); q
!= p
.second
.end(); ++q
) {
1279 f
->open_object_section("interval");
1280 f
->dump_unsigned("begin", q
.get_start());
1281 f
->dump_unsigned("length", q
.get_len());
// Crush-node and device-class flag words are expanded into name sets via
// calc_state_set() and dumped one string per flag.
1287 f
->open_array_section("new_crush_node_flags");
1288 for (auto& i
: new_crush_node_flags
) {
1289 f
->open_object_section("node");
1290 f
->dump_int("id", i
.first
);
1292 calc_state_set(i
.second
, st
);
1293 for (auto& j
: st
) {
1294 f
->dump_string("flag", j
);
1299 f
->open_array_section("new_device_class_flags");
1300 for (auto& i
: new_device_class_flags
) {
1301 f
->open_object_section("device_class");
1302 f
->dump_int("id", i
.first
);
1304 calc_state_set(i
.second
, st
);
1305 for (auto& j
: st
) {
1306 f
->dump_string("flag", j
);
1311 f
->open_object_section("stretch_mode");
1313 f
->dump_bool("change_stretch_mode", change_stretch_mode
);
1314 f
->dump_bool("stretch_mode_enabled", stretch_mode_enabled
);
1315 f
->dump_unsigned("new_stretch_bucket_count", new_stretch_bucket_count
);
1316 f
->dump_unsigned("new_degraded_stretch_mode", new_degraded_stretch_mode
);
1317 f
->dump_unsigned("new_recovering_stretch_mode", new_recovering_stretch_mode
);
1318 f
->dump_int("new_stretch_mode_bucket", new_stretch_mode_bucket
);
// Appends a single heap-allocated, default-constructed Incremental to `o`.
1324 void OSDMap::Incremental::generate_test_instances(list
<Incremental
*>& o
)
1326 o
.push_back(new Incremental
);
1329 // ----------------------------------
// Stamps every pool in `pools` with last_change = e.
// NOTE(review): the assignment of the map's own epoch is presumably on a
// line not visible in this extract — confirm.
1332 void OSDMap::set_epoch(epoch_t e
)
1335 for (auto &pool
: pools
)
1336 pool
.second
.last_change
= e
;
// Checks whether address `orig` is blocklisted. Works on a copy `a` whose
// type is normalised first (TYPE_LEGACY when require_osd_release is older
// than nautilus, TYPE_ANY otherwise), then probes `blocklist` twice: once
// for the specific instance and once for the whole IP.
// NOTE(review): the statements that widen `a` to an IP-level key between the
// two probes, and all return statements, are on lines not visible in this
// extract.
1339 bool OSDMap::is_blocklisted(const entity_addr_t
& orig
) const
// Fast path when nothing is blocklisted.
1341 if (blocklist
.empty()) {
1345 // all blocklist entries are type ANY for nautilus+
1346 // FIXME: avoid this copy!
1347 entity_addr_t a
= orig
;
1348 if (require_osd_release
< ceph_release_t::nautilus
) {
1349 a
.set_type(entity_addr_t::TYPE_LEGACY
);
1351 a
.set_type(entity_addr_t::TYPE_ANY
);
1354 // this specific instance?
1355 if (blocklist
.count(a
)) {
1359 // is entire ip blocklisted?
1363 if (blocklist
.count(a
)) {
// Address-vector overload: runs the single-address is_blocklisted() on each
// element of av.v, after an early check for an empty blocklist.
// NOTE(review): the return statements are not visible in this extract —
// presumably true on the first blocklisted address; confirm.
1371 bool OSDMap::is_blocklisted(const entity_addrvec_t
& av
) const
1373 if (blocklist
.empty())
1376 for (auto& a
: av
.v
) {
1377 if (is_blocklisted(a
)) {
// Appends every (address, utime_t) entry of `blocklist` to the list `*bl`
// via std::copy with a back_inserter; existing contents of `*bl` are kept.
1385 void OSDMap::get_blocklist(list
<pair
<entity_addr_t
,utime_t
> > *bl
) const
1387 std::copy(blocklist
.begin(), blocklist
.end(), std::back_inserter(*bl
));
// Inserts the key (address) of every `blocklist` entry into the set `*bl`;
// the mapped values are ignored.
1390 void OSDMap::get_blocklist(std::set
<entity_addr_t
> *bl
) const
1392 for (const auto &i
: blocklist
) {
1393 bl
->insert(i
.first
);
// Resizes every per-OSD container to max_osd entries. New osd_state slots
// are 0, new osd_weight slots are CEPH_OSD_OUT, and — only if the primary
// affinity vector exists — new affinity slots are
// CEPH_OSD_DEFAULT_PRIMARY_AFFINITY.
// NOTE(review): the assignment of max_osd from `m` is presumably on a line
// not visible in this extract — confirm.
1397 void OSDMap::set_max_osd(int m
)
1400 osd_state
.resize(max_osd
, 0);
1401 osd_weight
.resize(max_osd
, CEPH_OSD_OUT
);
1402 osd_info
.resize(max_osd
);
1403 osd_xinfo
.resize(max_osd
);
1404 osd_addrs
->client_addrs
.resize(max_osd
);
1405 osd_addrs
->cluster_addrs
.resize(max_osd
);
1406 osd_addrs
->hb_back_addrs
.resize(max_osd
);
1407 osd_addrs
->hb_front_addrs
.resize(max_osd
);
1408 osd_uuid
->resize(max_osd
);
1409 if (osd_primary_affinity
)
1410 osd_primary_affinity
->resize(max_osd
, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
);
// Scans OSD ids [0, max_osd). For each OSD with CEPH_OSD_EXISTS set it
// additionally tests the CEPH_OSD_UP bit and whether its weight differs from
// CEPH_OSD_OUT.
// NOTE(review): the counters updated under each test and the return value
// are on lines not visible in this extract.
1415 int OSDMap::calc_num_osds()
1420 for (int i
=0; i
<max_osd
; i
++) {
1421 if (osd_state
[i
] & CEPH_OSD_EXISTS
) {
1423 if (osd_state
[i
] & CEPH_OSD_UP
) {
1426 if (get_weight(i
) != CEPH_OSD_OUT
) {
// Computes which pools are affected by full, backfillfull and nearfull OSDs.
// Step 1: bucket every OSD that exists, is up and is in into exactly one of
//         three lists by its most severe state bit (FULL, then BACKFILLFULL,
//         then NEARFULL).
// Step 2: for each bucket, add the pool ids of its OSDs to the matching
//         output set via get_pool_ids_by_osd().
// The output pointers must be non-null (asserted).
// NOTE(review): the `full` parameter declaration and the clearing of `full`
// and `nearfull` are on lines not visible in this extract.
1434 void OSDMap::get_full_pools(CephContext
*cct
,
1436 set
<int64_t> *backfillfull
,
1437 set
<int64_t> *nearfull
) const
1440 ceph_assert(backfillfull
);
1441 ceph_assert(nearfull
);
1443 backfillfull
->clear();
1446 vector
<int> full_osds
;
1447 vector
<int> backfillfull_osds
;
1448 vector
<int> nearfull_osds
;
1449 for (int i
= 0; i
< max_osd
; ++i
) {
1450 if (exists(i
) && is_up(i
) && is_in(i
)) {
// else-if chain: an OSD lands in the list for its most severe state only.
1451 if (osd_state
[i
] & CEPH_OSD_FULL
)
1452 full_osds
.push_back(i
);
1453 else if (osd_state
[i
] & CEPH_OSD_BACKFILLFULL
)
1454 backfillfull_osds
.push_back(i
);
1455 else if (osd_state
[i
] & CEPH_OSD_NEARFULL
)
1456 nearfull_osds
.push_back(i
);
1460 for (auto i
: full_osds
) {
1461 get_pool_ids_by_osd(cct
, i
, full
);
1463 for (auto i
: backfillfull_osds
) {
1464 get_pool_ids_by_osd(cct
, i
, backfillfull
);
1466 for (auto i
: nearfull_osds
) {
1467 get_pool_ids_by_osd(cct
, i
, nearfull
);
1471 void OSDMap::get_full_osd_counts(set
<int> *full
, set
<int> *backfill
,
1472 set
<int> *nearfull
) const
1477 for (int i
= 0; i
< max_osd
; ++i
) {
1478 if (exists(i
) && is_up(i
) && is_in(i
)) {
1479 if (osd_state
[i
] & CEPH_OSD_FULL
)
1481 else if (osd_state
[i
] & CEPH_OSD_BACKFILLFULL
)
1482 backfill
->emplace(i
);
1483 else if (osd_state
[i
] & CEPH_OSD_NEARFULL
)
1484 nearfull
->emplace(i
);
1489 void OSDMap::get_all_osds(set
<int32_t>& ls
) const
1491 for (int i
=0; i
<max_osd
; i
++)
1496 void OSDMap::get_up_osds(set
<int32_t>& ls
) const
1498 for (int i
= 0; i
< max_osd
; i
++) {
1504 void OSDMap::get_out_existing_osds(set
<int32_t>& ls
) const
1506 for (int i
= 0; i
< max_osd
; i
++) {
1507 if (exists(i
) && get_weight(i
) == CEPH_OSD_OUT
)
1512 void OSDMap::get_flag_set(set
<string
> *flagset
) const
1514 for (unsigned i
= 0; i
< sizeof(flags
) * 8; ++i
) {
1515 if (flags
& (1<<i
)) {
1516 flagset
->insert(get_flag_string(flags
& (1<<i
)));
1521 void OSDMap::calc_state_set(int state
, set
<string
>& st
)
1524 for (unsigned s
= 1; t
; s
<<= 1) {
1527 st
.insert(ceph_osd_state_name(s
));
1532 void OSDMap::adjust_osd_weights(const map
<int,double>& weights
, Incremental
& inc
) const
1535 for (const auto &weight
: weights
) {
1536 if (weight
.second
> max
)
1537 max
= weight
.second
;
1540 for (const auto &weight
: weights
) {
1541 inc
.new_weight
[weight
.first
] = (unsigned)((weight
.second
/ max
) * CEPH_OSD_IN
);
1545 int OSDMap::identify_osd(const entity_addr_t
& addr
) const
1547 for (int i
=0; i
<max_osd
; i
++)
1548 if (exists(i
) && (get_addrs(i
).contains(addr
) ||
1549 get_cluster_addrs(i
).contains(addr
)))
1554 int OSDMap::identify_osd(const uuid_d
& u
) const
1556 for (int i
=0; i
<max_osd
; i
++)
1557 if (exists(i
) && get_uuid(i
) == u
)
1562 int OSDMap::identify_osd_on_all_channels(const entity_addr_t
& addr
) const
1564 for (int i
=0; i
<max_osd
; i
++)
1565 if (exists(i
) && (get_addrs(i
).contains(addr
) ||
1566 get_cluster_addrs(i
).contains(addr
) ||
1567 get_hb_back_addrs(i
).contains(addr
) ||
1568 get_hb_front_addrs(i
).contains(addr
)))
1573 int OSDMap::find_osd_on_ip(const entity_addr_t
& ip
) const
1575 for (int i
=0; i
<max_osd
; i
++)
1576 if (exists(i
) && (get_addrs(i
).is_same_host(ip
) ||
1577 get_cluster_addrs(i
).is_same_host(ip
)))
1583 uint64_t OSDMap::get_features(int entity_type
, uint64_t *pmask
) const
1585 uint64_t features
= 0; // things we actually have
1586 uint64_t mask
= 0; // things we could have
1588 if (crush
->has_nondefault_tunables())
1589 features
|= CEPH_FEATURE_CRUSH_TUNABLES
;
1590 if (crush
->has_nondefault_tunables2())
1591 features
|= CEPH_FEATURE_CRUSH_TUNABLES2
;
1592 if (crush
->has_nondefault_tunables3())
1593 features
|= CEPH_FEATURE_CRUSH_TUNABLES3
;
1594 if (crush
->has_v4_buckets())
1595 features
|= CEPH_FEATURE_CRUSH_V4
;
1596 if (crush
->has_nondefault_tunables5())
1597 features
|= CEPH_FEATURE_CRUSH_TUNABLES5
;
1598 if (crush
->has_incompat_choose_args()) {
1599 features
|= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS
;
1601 mask
|= CEPH_FEATURES_CRUSH
;
1603 if (!pg_upmap
.empty() || !pg_upmap_items
.empty())
1604 features
|= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
;
1605 mask
|= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
;
1607 for (auto &pool
: pools
) {
1608 if (pool
.second
.has_flag(pg_pool_t::FLAG_HASHPSPOOL
)) {
1609 features
|= CEPH_FEATURE_OSDHASHPSPOOL
;
1611 if (!pool
.second
.tiers
.empty() ||
1612 pool
.second
.is_tier()) {
1613 features
|= CEPH_FEATURE_OSD_CACHEPOOL
;
1615 int ruleid
= crush
->find_rule(pool
.second
.get_crush_rule(),
1616 pool
.second
.get_type(),
1617 pool
.second
.get_size());
1619 if (crush
->is_v2_rule(ruleid
))
1620 features
|= CEPH_FEATURE_CRUSH_V2
;
1621 if (crush
->is_v3_rule(ruleid
))
1622 features
|= CEPH_FEATURE_CRUSH_TUNABLES3
;
1623 if (crush
->is_v5_rule(ruleid
))
1624 features
|= CEPH_FEATURE_CRUSH_TUNABLES5
;
1627 mask
|= CEPH_FEATURE_OSDHASHPSPOOL
| CEPH_FEATURE_OSD_CACHEPOOL
;
1629 if (osd_primary_affinity
) {
1630 for (int i
= 0; i
< max_osd
; ++i
) {
1631 if ((*osd_primary_affinity
)[i
] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
) {
1632 features
|= CEPH_FEATURE_OSD_PRIMARY_AFFINITY
;
1637 mask
|= CEPH_FEATURE_OSD_PRIMARY_AFFINITY
;
1639 if (entity_type
== CEPH_ENTITY_TYPE_OSD
) {
1640 const uint64_t jewel_features
= CEPH_FEATURE_SERVER_JEWEL
;
1641 if (require_osd_release
>= ceph_release_t::jewel
) {
1642 features
|= jewel_features
;
1644 mask
|= jewel_features
;
1646 const uint64_t kraken_features
= CEPH_FEATUREMASK_SERVER_KRAKEN
1647 | CEPH_FEATURE_MSG_ADDR2
;
1648 if (require_osd_release
>= ceph_release_t::kraken
) {
1649 features
|= kraken_features
;
1651 mask
|= kraken_features
;
1653 if (stretch_mode_enabled
) {
1654 features
|= CEPH_FEATUREMASK_STRETCH_MODE
;
1655 mask
|= CEPH_FEATUREMASK_STRETCH_MODE
;
1659 if (require_min_compat_client
>= ceph_release_t::nautilus
) {
1660 // if min_compat_client is >= nautilus, require v2 cephx signatures
1662 features
|= CEPH_FEATUREMASK_CEPHX_V2
;
1663 } else if (require_osd_release
>= ceph_release_t::nautilus
&&
1664 entity_type
== CEPH_ENTITY_TYPE_OSD
) {
1665 // if osds are >= nautilus, at least require the signatures from them
1666 features
|= CEPH_FEATUREMASK_CEPHX_V2
;
1668 mask
|= CEPH_FEATUREMASK_CEPHX_V2
;
1675 ceph_release_t
OSDMap::get_min_compat_client() const
1677 uint64_t f
= get_features(CEPH_ENTITY_TYPE_CLIENT
, nullptr);
1679 if (HAVE_FEATURE(f
, OSDMAP_PG_UPMAP
) || // v12.0.0-1733-g27d6f43
1680 HAVE_FEATURE(f
, CRUSH_CHOOSE_ARGS
)) { // v12.0.1-2172-gef1ef28
1681 return ceph_release_t::luminous
; // v12.2.0
1683 if (HAVE_FEATURE(f
, CRUSH_TUNABLES5
)) { // v10.0.0-612-g043a737
1684 return ceph_release_t::jewel
; // v10.2.0
1686 if (HAVE_FEATURE(f
, CRUSH_V4
)) { // v0.91-678-g325fc56
1687 return ceph_release_t::hammer
; // v0.94.0
1689 if (HAVE_FEATURE(f
, OSD_PRIMARY_AFFINITY
) || // v0.76-553-gf825624
1690 HAVE_FEATURE(f
, CRUSH_TUNABLES3
) || // v0.76-395-ge20a55d
1691 HAVE_FEATURE(f
, OSD_CACHEPOOL
)) { // v0.67-401-gb91c1c5
1692 return ceph_release_t::firefly
; // v0.80.0
1694 if (HAVE_FEATURE(f
, CRUSH_TUNABLES2
) || // v0.54-684-g0cc47ff
1695 HAVE_FEATURE(f
, OSDHASHPSPOOL
)) { // v0.57-398-g8cc2b0f
1696 return ceph_release_t::dumpling
; // v0.67.0
1698 if (HAVE_FEATURE(f
, CRUSH_TUNABLES
)) { // v0.48argonaut-206-g6f381af
1699 return ceph_release_t::argonaut
; // v0.48argonaut-206-g6f381af
1701 return ceph_release_t::argonaut
; // v0.48argonaut-206-g6f381af
1704 ceph_release_t
OSDMap::get_require_min_compat_client() const
1706 return require_min_compat_client
;
1709 void OSDMap::_calc_up_osd_features()
1712 cached_up_osd_features
= 0;
1713 for (int osd
= 0; osd
< max_osd
; ++osd
) {
1716 const osd_xinfo_t
&xi
= get_xinfo(osd
);
1717 if (xi
.features
== 0)
1718 continue; // bogus xinfo, maybe #20751 or similar, skipping
1720 cached_up_osd_features
= xi
.features
;
1723 cached_up_osd_features
&= xi
.features
;
1728 uint64_t OSDMap::get_up_osd_features() const
1730 return cached_up_osd_features
;
1733 void OSDMap::dedup(const OSDMap
*o
, OSDMap
*n
)
1736 if (o
->epoch
== n
->epoch
)
1742 if (o
->max_osd
!= n
->max_osd
)
1744 for (int i
= 0; i
< o
->max_osd
&& i
< n
->max_osd
; i
++) {
1745 if ( n
->osd_addrs
->client_addrs
[i
] && o
->osd_addrs
->client_addrs
[i
] &&
1746 *n
->osd_addrs
->client_addrs
[i
] == *o
->osd_addrs
->client_addrs
[i
])
1747 n
->osd_addrs
->client_addrs
[i
] = o
->osd_addrs
->client_addrs
[i
];
1750 if ( n
->osd_addrs
->cluster_addrs
[i
] && o
->osd_addrs
->cluster_addrs
[i
] &&
1751 *n
->osd_addrs
->cluster_addrs
[i
] == *o
->osd_addrs
->cluster_addrs
[i
])
1752 n
->osd_addrs
->cluster_addrs
[i
] = o
->osd_addrs
->cluster_addrs
[i
];
1755 if ( n
->osd_addrs
->hb_back_addrs
[i
] && o
->osd_addrs
->hb_back_addrs
[i
] &&
1756 *n
->osd_addrs
->hb_back_addrs
[i
] == *o
->osd_addrs
->hb_back_addrs
[i
])
1757 n
->osd_addrs
->hb_back_addrs
[i
] = o
->osd_addrs
->hb_back_addrs
[i
];
1760 if ( n
->osd_addrs
->hb_front_addrs
[i
] && o
->osd_addrs
->hb_front_addrs
[i
] &&
1761 *n
->osd_addrs
->hb_front_addrs
[i
] == *o
->osd_addrs
->hb_front_addrs
[i
])
1762 n
->osd_addrs
->hb_front_addrs
[i
] = o
->osd_addrs
->hb_front_addrs
[i
];
1767 // zoinks, no differences at all!
1768 n
->osd_addrs
= o
->osd_addrs
;
1771 // does crush match?
1772 ceph::buffer::list oc
, nc
;
1773 encode(*o
->crush
, oc
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1774 encode(*n
->crush
, nc
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1775 if (oc
.contents_equal(nc
)) {
1776 n
->crush
= o
->crush
;
1779 // does pg_temp match?
1780 if (*o
->pg_temp
== *n
->pg_temp
)
1781 n
->pg_temp
= o
->pg_temp
;
1783 // does primary_temp match?
1784 if (o
->primary_temp
->size() == n
->primary_temp
->size()) {
1785 if (*o
->primary_temp
== *n
->primary_temp
)
1786 n
->primary_temp
= o
->primary_temp
;
1790 if (o
->osd_uuid
->size() == n
->osd_uuid
->size() &&
1791 *o
->osd_uuid
== *n
->osd_uuid
)
1792 n
->osd_uuid
= o
->osd_uuid
;
1795 void OSDMap::clean_temps(CephContext
*cct
,
1796 const OSDMap
& oldmap
,
1797 const OSDMap
& nextmap
,
1798 Incremental
*pending_inc
)
1800 ldout(cct
, 10) << __func__
<< dendl
;
1802 for (auto pg
: *nextmap
.pg_temp
) {
1803 // if pool does not exist, remove any existing pg_temps associated with
1804 // it. we don't care about pg_temps on the pending_inc either; if there
1805 // are new_pg_temp entries on the pending, clear them out just as well.
1806 if (!nextmap
.have_pg_pool(pg
.first
.pool())) {
1807 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
1808 << " for nonexistent pool " << pg
.first
.pool() << dendl
;
1809 pending_inc
->new_pg_temp
[pg
.first
].clear();
1813 unsigned num_up
= 0;
1814 for (auto o
: pg
.second
) {
1815 if (!nextmap
.is_down(o
)) {
1821 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
1822 << " with all down osds" << pg
.second
<< dendl
;
1823 pending_inc
->new_pg_temp
[pg
.first
].clear();
1826 // redundant pg_temp?
1829 nextmap
.pg_to_raw_up(pg
.first
, &raw_up
, &primary
);
1830 bool remove
= false;
1831 if (raw_up
== pg
.second
) {
1832 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
<< " "
1833 << pg
.second
<< " that matches raw_up mapping" << dendl
;
1836 // oversized pg_temp?
1837 if (pg
.second
.size() > nextmap
.get_pg_pool(pg
.first
.pool())->get_size()) {
1838 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
<< " "
1839 << pg
.second
<< " exceeds pool size" << dendl
;
1843 if (oldmap
.pg_temp
->count(pg
.first
))
1844 pending_inc
->new_pg_temp
[pg
.first
].clear();
1846 pending_inc
->new_pg_temp
.erase(pg
.first
);
1850 for (auto &pg
: *nextmap
.primary_temp
) {
1852 if (nextmap
.is_down(pg
.second
)) {
1853 ldout(cct
, 10) << __func__
<< " removing primary_temp " << pg
.first
1854 << " to down " << pg
.second
<< dendl
;
1855 pending_inc
->new_primary_temp
[pg
.first
] = -1;
1858 // redundant primary_temp?
1859 vector
<int> real_up
, templess_up
;
1860 int real_primary
, templess_primary
;
1861 pg_t pgid
= pg
.first
;
1862 nextmap
.pg_to_acting_osds(pgid
, &real_up
, &real_primary
);
1863 nextmap
.pg_to_raw_up(pgid
, &templess_up
, &templess_primary
);
1864 if (real_primary
== templess_primary
){
1865 ldout(cct
, 10) << __func__
<< " removing primary_temp "
1866 << pgid
<< " -> " << real_primary
1867 << " (unnecessary/redundant)" << dendl
;
1868 if (oldmap
.primary_temp
->count(pgid
))
1869 pending_inc
->new_primary_temp
[pgid
] = -1;
1871 pending_inc
->new_primary_temp
.erase(pgid
);
1876 void OSDMap::get_upmap_pgs(vector
<pg_t
> *upmap_pgs
) const
1878 upmap_pgs
->reserve(pg_upmap
.size() + pg_upmap_items
.size());
1879 for (auto& p
: pg_upmap
)
1880 upmap_pgs
->push_back(p
.first
);
1881 for (auto& p
: pg_upmap_items
)
1882 upmap_pgs
->push_back(p
.first
);
1885 bool OSDMap::check_pg_upmaps(
1887 const vector
<pg_t
>& to_check
,
1888 vector
<pg_t
> *to_cancel
,
1889 map
<pg_t
, mempool::osdmap::vector
<pair
<int,int>>> *to_remap
) const
1891 bool any_change
= false;
1892 map
<int, map
<int, float>> rule_weight_map
;
1893 for (auto& pg
: to_check
) {
1894 const pg_pool_t
*pi
= get_pg_pool(pg
.pool());
1895 if (!pi
|| pg
.ps() >= pi
->get_pg_num_pending()) {
1896 ldout(cct
, 0) << __func__
<< " pg " << pg
<< " is gone or merge source"
1898 to_cancel
->push_back(pg
);
1901 if (pi
->is_pending_merge(pg
, nullptr)) {
1902 ldout(cct
, 0) << __func__
<< " pg " << pg
<< " is pending merge"
1904 to_cancel
->push_back(pg
);
1907 vector
<int> raw
, up
;
1908 pg_to_raw_upmap(pg
, &raw
, &up
);
1909 auto crush_rule
= get_pg_pool_crush_rule(pg
);
1910 auto r
= crush
->verify_upmap(cct
,
1912 get_pg_pool_size(pg
),
1915 ldout(cct
, 0) << __func__
<< " verify_upmap of pg " << pg
1916 << " returning " << r
1918 to_cancel
->push_back(pg
);
1921 // below we check against crush-topology changing..
1922 map
<int, float> weight_map
;
1923 auto it
= rule_weight_map
.find(crush_rule
);
1924 if (it
== rule_weight_map
.end()) {
1925 auto r
= crush
->get_rule_weight_osd_map(crush_rule
, &weight_map
);
1927 lderr(cct
) << __func__
<< " unable to get crush weight_map for "
1928 << "crush_rule " << crush_rule
1932 rule_weight_map
[crush_rule
] = weight_map
;
1934 weight_map
= it
->second
;
1936 ldout(cct
, 10) << __func__
<< " pg " << pg
1937 << " weight_map " << weight_map
1939 for (auto osd
: up
) {
1940 auto it
= weight_map
.find(osd
);
1941 if (it
== weight_map
.end()) {
1942 ldout(cct
, 10) << __func__
<< " pg " << pg
<< ": osd " << osd
<< " is gone or has "
1943 << "been moved out of the specific crush-tree"
1945 to_cancel
->push_back(pg
);
1948 auto adjusted_weight
= get_weightf(it
->first
) * it
->second
;
1949 if (adjusted_weight
== 0) {
1950 ldout(cct
, 10) << __func__
<< " pg " << pg
<< ": osd " << osd
1951 << " is out/crush-out"
1953 to_cancel
->push_back(pg
);
1957 if (!to_cancel
->empty() && to_cancel
->back() == pg
)
1959 // okay, upmap is valid
1960 // continue to check if it is still necessary
1961 auto i
= pg_upmap
.find(pg
);
1962 if (i
!= pg_upmap
.end()) {
1963 if (i
->second
== raw
) {
1964 ldout(cct
, 10) << "removing redundant pg_upmap " << i
->first
<< " "
1965 << i
->second
<< dendl
;
1966 to_cancel
->push_back(pg
);
1969 if ((int)i
->second
.size() != get_pg_pool_size(pg
)) {
1970 ldout(cct
, 10) << "removing pg_upmap " << i
->first
<< " "
1971 << i
->second
<< " != pool size " << get_pg_pool_size(pg
)
1973 to_cancel
->push_back(pg
);
1977 auto j
= pg_upmap_items
.find(pg
);
1978 if (j
!= pg_upmap_items
.end()) {
1979 mempool::osdmap::vector
<pair
<int,int>> newmap
;
1980 for (auto& p
: j
->second
) {
1981 if (std::find(raw
.begin(), raw
.end(), p
.first
) == raw
.end()) {
1982 // cancel mapping if source osd does not exist anymore
1985 if (p
.second
!= CRUSH_ITEM_NONE
&& p
.second
< max_osd
&&
1986 p
.second
>= 0 && osd_weight
[p
.second
] == 0) {
1987 // cancel mapping if target osd is out
1990 newmap
.push_back(p
);
1992 if (newmap
.empty()) {
1993 ldout(cct
, 10) << " removing no-op pg_upmap_items "
1994 << j
->first
<< " " << j
->second
1996 to_cancel
->push_back(pg
);
1997 } else if (newmap
!= j
->second
) {
1998 ldout(cct
, 10) << " simplifying partially no-op pg_upmap_items "
1999 << j
->first
<< " " << j
->second
2002 to_remap
->insert({pg
, newmap
});
2007 any_change
= any_change
|| !to_cancel
->empty();
2011 void OSDMap::clean_pg_upmaps(
2013 Incremental
*pending_inc
,
2014 const vector
<pg_t
>& to_cancel
,
2015 const map
<pg_t
, mempool::osdmap::vector
<pair
<int,int>>>& to_remap
) const
2017 for (auto &pg
: to_cancel
) {
2018 auto i
= pending_inc
->new_pg_upmap
.find(pg
);
2019 if (i
!= pending_inc
->new_pg_upmap
.end()) {
2020 ldout(cct
, 10) << __func__
<< " cancel invalid pending "
2021 << "pg_upmap entry "
2022 << i
->first
<< "->" << i
->second
2024 pending_inc
->new_pg_upmap
.erase(i
);
2026 auto j
= pg_upmap
.find(pg
);
2027 if (j
!= pg_upmap
.end()) {
2028 ldout(cct
, 10) << __func__
<< " cancel invalid pg_upmap entry "
2029 << j
->first
<< "->" << j
->second
2031 pending_inc
->old_pg_upmap
.insert(pg
);
2033 auto p
= pending_inc
->new_pg_upmap_items
.find(pg
);
2034 if (p
!= pending_inc
->new_pg_upmap_items
.end()) {
2035 ldout(cct
, 10) << __func__
<< " cancel invalid pending "
2036 << "pg_upmap_items entry "
2037 << p
->first
<< "->" << p
->second
2039 pending_inc
->new_pg_upmap_items
.erase(p
);
2041 auto q
= pg_upmap_items
.find(pg
);
2042 if (q
!= pg_upmap_items
.end()) {
2043 ldout(cct
, 10) << __func__
<< " cancel invalid "
2044 << "pg_upmap_items entry "
2045 << q
->first
<< "->" << q
->second
2047 pending_inc
->old_pg_upmap_items
.insert(pg
);
2050 for (auto& i
: to_remap
)
2051 pending_inc
->new_pg_upmap_items
[i
.first
] = i
.second
;
2054 bool OSDMap::clean_pg_upmaps(
2056 Incremental
*pending_inc
) const
2058 ldout(cct
, 10) << __func__
<< dendl
;
2059 vector
<pg_t
> to_check
;
2060 vector
<pg_t
> to_cancel
;
2061 map
<pg_t
, mempool::osdmap::vector
<pair
<int,int>>> to_remap
;
2063 get_upmap_pgs(&to_check
);
2064 auto any_change
= check_pg_upmaps(cct
, to_check
, &to_cancel
, &to_remap
);
2065 clean_pg_upmaps(cct
, pending_inc
, to_cancel
, to_remap
);
2069 int OSDMap::apply_incremental(const Incremental
&inc
)
2071 new_blocklist_entries
= false;
2074 else if (inc
.fsid
!= fsid
)
2077 ceph_assert(inc
.epoch
== epoch
+1);
2080 modified
= inc
.modified
;
2083 if (inc
.fullmap
.length()) {
2084 ceph::buffer::list
bl(inc
.fullmap
);
2089 // nope, incremental.
2090 if (inc
.new_flags
>= 0) {
2091 flags
= inc
.new_flags
;
2092 // the below is just to cover a newly-upgraded luminous mon
2093 // cluster that has to set require_jewel_osds or
2094 // require_kraken_osds before the osds can be upgraded to
2096 if (flags
& CEPH_OSDMAP_REQUIRE_KRAKEN
) {
2097 if (require_osd_release
< ceph_release_t::kraken
) {
2098 require_osd_release
= ceph_release_t::kraken
;
2100 } else if (flags
& CEPH_OSDMAP_REQUIRE_JEWEL
) {
2101 if (require_osd_release
< ceph_release_t::jewel
) {
2102 require_osd_release
= ceph_release_t::jewel
;
2107 if (inc
.new_max_osd
>= 0)
2108 set_max_osd(inc
.new_max_osd
);
2110 if (inc
.new_pool_max
!= -1)
2111 pool_max
= inc
.new_pool_max
;
2113 for (const auto &pool
: inc
.new_pools
) {
2114 pools
[pool
.first
] = pool
.second
;
2115 pools
[pool
.first
].last_change
= epoch
;
2118 new_removed_snaps
= inc
.new_removed_snaps
;
2119 new_purged_snaps
= inc
.new_purged_snaps
;
2120 for (auto p
= new_removed_snaps
.begin();
2121 p
!= new_removed_snaps
.end();
2123 removed_snaps_queue
[p
->first
].union_of(p
->second
);
2125 for (auto p
= new_purged_snaps
.begin();
2126 p
!= new_purged_snaps
.end();
2128 auto q
= removed_snaps_queue
.find(p
->first
);
2129 ceph_assert(q
!= removed_snaps_queue
.end());
2130 q
->second
.subtract(p
->second
);
2131 if (q
->second
.empty()) {
2132 removed_snaps_queue
.erase(q
);
2136 if (inc
.new_last_up_change
!= utime_t()) {
2137 last_up_change
= inc
.new_last_up_change
;
2139 if (inc
.new_last_in_change
!= utime_t()) {
2140 last_in_change
= inc
.new_last_in_change
;
2143 for (const auto &pname
: inc
.new_pool_names
) {
2144 auto pool_name_entry
= pool_name
.find(pname
.first
);
2145 if (pool_name_entry
!= pool_name
.end()) {
2146 name_pool
.erase(pool_name_entry
->second
);
2147 pool_name_entry
->second
= pname
.second
;
2149 pool_name
[pname
.first
] = pname
.second
;
2151 name_pool
[pname
.second
] = pname
.first
;
2154 for (const auto &pool
: inc
.old_pools
) {
2156 name_pool
.erase(pool_name
[pool
]);
2157 pool_name
.erase(pool
);
2160 for (const auto &weight
: inc
.new_weight
) {
2161 set_weight(weight
.first
, weight
.second
);
2163 // if we are marking in, clear the AUTOOUT and NEW bits, and clear
2164 // xinfo old_weight.
2165 if (weight
.second
) {
2166 osd_state
[weight
.first
] &= ~(CEPH_OSD_AUTOOUT
| CEPH_OSD_NEW
);
2167 osd_xinfo
[weight
.first
].old_weight
= 0;
2171 for (const auto &primary_affinity
: inc
.new_primary_affinity
) {
2172 set_primary_affinity(primary_affinity
.first
, primary_affinity
.second
);
2175 // erasure_code_profiles
2176 for (const auto &profile
: inc
.old_erasure_code_profiles
)
2177 erasure_code_profiles
.erase(profile
);
2179 for (const auto &profile
: inc
.new_erasure_code_profiles
) {
2180 set_erasure_code_profile(profile
.first
, profile
.second
);
2184 for (const auto &state
: inc
.new_state
) {
2185 const auto osd
= state
.first
;
2186 int s
= state
.second
? state
.second
: CEPH_OSD_UP
;
2187 if ((osd_state
[osd
] & CEPH_OSD_UP
) &&
2188 (s
& CEPH_OSD_UP
)) {
2189 osd_info
[osd
].down_at
= epoch
;
2190 osd_xinfo
[osd
].down_stamp
= modified
;
2192 if ((osd_state
[osd
] & CEPH_OSD_EXISTS
) &&
2193 (s
& CEPH_OSD_EXISTS
)) {
2194 // osd is destroyed; clear out anything interesting.
2195 (*osd_uuid
)[osd
] = uuid_d();
2196 osd_info
[osd
] = osd_info_t();
2197 osd_xinfo
[osd
] = osd_xinfo_t();
2198 set_primary_affinity(osd
, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
);
2199 osd_addrs
->client_addrs
[osd
].reset(new entity_addrvec_t());
2200 osd_addrs
->cluster_addrs
[osd
].reset(new entity_addrvec_t());
2201 osd_addrs
->hb_front_addrs
[osd
].reset(new entity_addrvec_t());
2202 osd_addrs
->hb_back_addrs
[osd
].reset(new entity_addrvec_t());
2205 osd_state
[osd
] ^= s
;
2209 for (const auto &client
: inc
.new_up_client
) {
2210 osd_state
[client
.first
] |= CEPH_OSD_EXISTS
| CEPH_OSD_UP
;
2211 osd_state
[client
.first
] &= ~CEPH_OSD_STOP
; // if any
2212 osd_addrs
->client_addrs
[client
.first
].reset(
2213 new entity_addrvec_t(client
.second
));
2214 osd_addrs
->hb_back_addrs
[client
.first
].reset(
2215 new entity_addrvec_t(inc
.new_hb_back_up
.find(client
.first
)->second
));
2216 osd_addrs
->hb_front_addrs
[client
.first
].reset(
2217 new entity_addrvec_t(inc
.new_hb_front_up
.find(client
.first
)->second
));
2219 osd_info
[client
.first
].up_from
= epoch
;
2222 for (const auto &cluster
: inc
.new_up_cluster
)
2223 osd_addrs
->cluster_addrs
[cluster
.first
].reset(
2224 new entity_addrvec_t(cluster
.second
));
2227 for (const auto &thru
: inc
.new_up_thru
)
2228 osd_info
[thru
.first
].up_thru
= thru
.second
;
2230 for (const auto &interval
: inc
.new_last_clean_interval
) {
2231 osd_info
[interval
.first
].last_clean_begin
= interval
.second
.first
;
2232 osd_info
[interval
.first
].last_clean_end
= interval
.second
.second
;
2235 for (const auto &lost
: inc
.new_lost
)
2236 osd_info
[lost
.first
].lost_at
= lost
.second
;
2239 for (const auto &xinfo
: inc
.new_xinfo
)
2240 osd_xinfo
[xinfo
.first
] = xinfo
.second
;
2243 for (const auto &uuid
: inc
.new_uuid
)
2244 (*osd_uuid
)[uuid
.first
] = uuid
.second
;
2247 for (const auto &pg
: inc
.new_pg_temp
) {
2248 if (pg
.second
.empty())
2249 pg_temp
->erase(pg
.first
);
2251 pg_temp
->set(pg
.first
, pg
.second
);
2253 if (!inc
.new_pg_temp
.empty()) {
2254 // make sure pg_temp is efficiently stored
2258 for (const auto &pg
: inc
.new_primary_temp
) {
2259 if (pg
.second
== -1)
2260 primary_temp
->erase(pg
.first
);
2262 (*primary_temp
)[pg
.first
] = pg
.second
;
2265 for (auto& p
: inc
.new_pg_upmap
) {
2266 pg_upmap
[p
.first
] = p
.second
;
2268 for (auto& pg
: inc
.old_pg_upmap
) {
2271 for (auto& p
: inc
.new_pg_upmap_items
) {
2272 pg_upmap_items
[p
.first
] = p
.second
;
2274 for (auto& pg
: inc
.old_pg_upmap_items
) {
2275 pg_upmap_items
.erase(pg
);
2279 if (!inc
.new_blocklist
.empty()) {
2280 blocklist
.insert(inc
.new_blocklist
.begin(),inc
.new_blocklist
.end());
2281 new_blocklist_entries
= true;
2283 for (const auto &addr
: inc
.old_blocklist
)
2284 blocklist
.erase(addr
);
2286 for (auto& i
: inc
.new_crush_node_flags
) {
2288 crush_node_flags
[i
.first
] = i
.second
;
2290 crush_node_flags
.erase(i
.first
);
2294 for (auto& i
: inc
.new_device_class_flags
) {
2296 device_class_flags
[i
.first
] = i
.second
;
2298 device_class_flags
.erase(i
.first
);
2302 // cluster snapshot?
2303 if (inc
.cluster_snapshot
.length()) {
2304 cluster_snapshot
= inc
.cluster_snapshot
;
2305 cluster_snapshot_epoch
= inc
.epoch
;
2307 cluster_snapshot
.clear();
2308 cluster_snapshot_epoch
= 0;
2311 if (inc
.new_nearfull_ratio
>= 0) {
2312 nearfull_ratio
= inc
.new_nearfull_ratio
;
2314 if (inc
.new_backfillfull_ratio
>= 0) {
2315 backfillfull_ratio
= inc
.new_backfillfull_ratio
;
2317 if (inc
.new_full_ratio
>= 0) {
2318 full_ratio
= inc
.new_full_ratio
;
2320 if (inc
.new_require_min_compat_client
> ceph_release_t::unknown
) {
2321 require_min_compat_client
= inc
.new_require_min_compat_client
;
2323 if (inc
.new_require_osd_release
>= ceph_release_t::unknown
) {
2324 require_osd_release
= inc
.new_require_osd_release
;
2325 if (require_osd_release
>= ceph_release_t::luminous
) {
2326 flags
&= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS
);
2327 flags
|= CEPH_OSDMAP_RECOVERY_DELETES
;
2331 if (inc
.new_require_osd_release
>= ceph_release_t::unknown
) {
2332 require_osd_release
= inc
.new_require_osd_release
;
2333 if (require_osd_release
>= ceph_release_t::nautilus
) {
2334 flags
|= CEPH_OSDMAP_PGLOG_HARDLIMIT
;
2337 // do new crush map last (after up/down stuff)
2338 if (inc
.crush
.length()) {
2339 ceph::buffer::list
bl(inc
.crush
);
2340 auto blp
= bl
.cbegin();
2341 crush
.reset(new CrushWrapper
);
2343 if (require_osd_release
>= ceph_release_t::luminous
) {
2344 // only increment if this is a luminous-encoded osdmap, lest
2345 // the mon's crush_version diverge from what the osds or others
2346 // are decoding and applying on their end. if we won't encode
2347 // it in the canonical version, don't change it.
2350 for (auto it
= device_class_flags
.begin();
2351 it
!= device_class_flags
.end();) {
2352 const char* class_name
= crush
->get_class_name(it
->first
);
2353 if (!class_name
) // device class is gone
2354 it
= device_class_flags
.erase(it
);
2360 if (inc
.change_stretch_mode
) {
2361 stretch_mode_enabled
= inc
.stretch_mode_enabled
;
2362 stretch_bucket_count
= inc
.new_stretch_bucket_count
;
2363 degraded_stretch_mode
= inc
.new_degraded_stretch_mode
;
2364 recovering_stretch_mode
= inc
.new_recovering_stretch_mode
;
2365 stretch_mode_bucket
= inc
.new_stretch_mode_bucket
;
2369 _calc_up_osd_features();
2374 int OSDMap::map_to_pg(
2378 const string
& nspace
,
2381 // calculate ps (placement seed)
2382 const pg_pool_t
*pool
= get_pg_pool(poolid
);
2387 ps
= pool
->hash_key(key
, nspace
);
2389 ps
= pool
->hash_key(name
, nspace
);
2390 *pg
= pg_t(ps
, poolid
);
2394 int OSDMap::object_locator_to_pg(
2395 const object_t
& oid
, const object_locator_t
& loc
, pg_t
&pg
) const
2397 if (loc
.hash
>= 0) {
2398 if (!get_pg_pool(loc
.get_pool())) {
2401 pg
= pg_t(loc
.hash
, loc
.get_pool());
2404 return map_to_pg(loc
.get_pool(), oid
.name
, loc
.key
, loc
.nspace
, &pg
);
2407 ceph_object_layout
OSDMap::make_object_layout(
2408 object_t oid
, int pg_pool
, string nspace
) const
2410 object_locator_t
loc(pg_pool
, nspace
);
2412 ceph_object_layout ol
;
2413 pg_t pgid
= object_locator_to_pg(oid
, loc
);
2414 ol
.ol_pgid
= pgid
.get_old_pg().v
;
2415 ol
.ol_stripe_unit
= 0;
2419 void OSDMap::_remove_nonexistent_osds(const pg_pool_t
& pool
,
2420 vector
<int>& osds
) const
2422 if (pool
.can_shift_osds()) {
2423 unsigned removed
= 0;
2424 for (unsigned i
= 0; i
< osds
.size(); i
++) {
2425 if (!exists(osds
[i
])) {
2430 osds
[i
- removed
] = osds
[i
];
2434 osds
.resize(osds
.size() - removed
);
2436 for (auto& osd
: osds
) {
2438 osd
= CRUSH_ITEM_NONE
;
2443 void OSDMap::_pg_to_raw_osds(
2444 const pg_pool_t
& pool
, pg_t pg
,
2449 ps_t pps
= pool
.raw_pg_to_pps(pg
); // placement ps
2450 unsigned size
= pool
.get_size();
2453 int ruleno
= crush
->find_rule(pool
.get_crush_rule(), pool
.get_type(), size
);
2455 crush
->do_rule(ruleno
, pps
, *osds
, size
, osd_weight
, pg
.pool());
2457 _remove_nonexistent_osds(pool
, *osds
);
2463 int OSDMap::_pick_primary(const vector
<int>& osds
) const
2465 for (auto osd
: osds
) {
2466 if (osd
!= CRUSH_ITEM_NONE
) {
2473 void OSDMap::_apply_upmap(const pg_pool_t
& pi
, pg_t raw_pg
, vector
<int> *raw
) const
2475 pg_t pg
= pi
.raw_pg_to_pg(raw_pg
);
2476 auto p
= pg_upmap
.find(pg
);
2477 if (p
!= pg_upmap
.end()) {
2478 // make sure targets aren't marked out
2479 for (auto osd
: p
->second
) {
2480 if (osd
!= CRUSH_ITEM_NONE
&& osd
< max_osd
&& osd
>= 0 &&
2481 osd_weight
[osd
] == 0) {
2482 // reject/ignore the explicit mapping
2486 *raw
= vector
<int>(p
->second
.begin(), p
->second
.end());
2487 // continue to check and apply pg_upmap_items if any
2490 auto q
= pg_upmap_items
.find(pg
);
2491 if (q
!= pg_upmap_items
.end()) {
2492 // NOTE: this approach does not allow a bidirectional swap,
2493 // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
2494 for (auto& r
: q
->second
) {
2495 // make sure the replacement value doesn't already appear
2496 bool exists
= false;
2498 for (unsigned i
= 0; i
< raw
->size(); ++i
) {
2499 int osd
= (*raw
)[i
];
2500 if (osd
== r
.second
) {
2504 // ignore mapping if target is marked out (or invalid osd id)
2505 if (osd
== r
.first
&&
2507 !(r
.second
!= CRUSH_ITEM_NONE
&& r
.second
< max_osd
&&
2508 r
.second
>= 0 && osd_weight
[r
.second
] == 0)) {
2512 if (!exists
&& pos
>= 0) {
2513 (*raw
)[pos
] = r
.second
;
2519 // pg -> (up osd list)
2520 void OSDMap::_raw_to_up_osds(const pg_pool_t
& pool
, const vector
<int>& raw
,
2521 vector
<int> *up
) const
2523 if (pool
.can_shift_osds()) {
2526 up
->reserve(raw
.size());
2527 for (unsigned i
=0; i
<raw
.size(); i
++) {
2528 if (!exists(raw
[i
]) || is_down(raw
[i
]))
2530 up
->push_back(raw
[i
]);
2533 // set down/dne devices to NONE
2534 up
->resize(raw
.size());
2535 for (int i
= raw
.size() - 1; i
>= 0; --i
) {
2536 if (!exists(raw
[i
]) || is_down(raw
[i
])) {
2537 (*up
)[i
] = CRUSH_ITEM_NONE
;
2545 void OSDMap::_apply_primary_affinity(ps_t seed
,
2546 const pg_pool_t
& pool
,
2550 // do we have any non-default primary_affinity values for these osds?
2551 if (!osd_primary_affinity
)
2555 for (const auto osd
: *osds
) {
2556 if (osd
!= CRUSH_ITEM_NONE
&&
2557 (*osd_primary_affinity
)[osd
] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
) {
2565 // pick the primary. feed both the seed (for the pg) and the osd
2566 // into the hash/rng so that a proportional fraction of an osd's pgs
2567 // get rejected as primary.
2569 for (unsigned i
= 0; i
< osds
->size(); ++i
) {
2571 if (o
== CRUSH_ITEM_NONE
)
2573 unsigned a
= (*osd_primary_affinity
)[o
];
2574 if (a
< CEPH_OSD_MAX_PRIMARY_AFFINITY
&&
2575 (crush_hash32_2(CRUSH_HASH_RJENKINS1
,
2576 seed
, o
) >> 16) >= a
) {
2577 // we chose not to use this primary. note it anyway as a
2578 // fallback in case we don't pick anyone else, but keep looking.
2589 *primary
= (*osds
)[pos
];
2591 if (pool
.can_shift_osds() && pos
> 0) {
2592 // move the new primary to the front.
2593 for (int i
= pos
; i
> 0; --i
) {
2594 (*osds
)[i
] = (*osds
)[i
-1];
2596 (*osds
)[0] = *primary
;
2600 void OSDMap::_get_temp_osds(const pg_pool_t
& pool
, pg_t pg
,
2601 vector
<int> *temp_pg
, int *temp_primary
) const
2603 pg
= pool
.raw_pg_to_pg(pg
);
2604 const auto p
= pg_temp
->find(pg
);
2606 if (p
!= pg_temp
->end()) {
2607 for (unsigned i
=0; i
<p
->second
.size(); i
++) {
2608 if (!exists(p
->second
[i
]) || is_down(p
->second
[i
])) {
2609 if (pool
.can_shift_osds()) {
2612 temp_pg
->push_back(CRUSH_ITEM_NONE
);
2615 temp_pg
->push_back(p
->second
[i
]);
2619 const auto &pp
= primary_temp
->find(pg
);
2621 if (pp
!= primary_temp
->end()) {
2622 *temp_primary
= pp
->second
;
2623 } else if (!temp_pg
->empty()) { // apply pg_temp's primary
2624 for (unsigned i
= 0; i
< temp_pg
->size(); ++i
) {
2625 if ((*temp_pg
)[i
] != CRUSH_ITEM_NONE
) {
2626 *temp_primary
= (*temp_pg
)[i
];
2633 void OSDMap::pg_to_raw_osds(pg_t pg
, vector
<int> *raw
, int *primary
) const
2635 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
2641 _pg_to_raw_osds(*pool
, pg
, raw
, NULL
);
2642 *primary
= _pick_primary(*raw
);
2645 void OSDMap::pg_to_raw_upmap(pg_t pg
, vector
<int>*raw
,
2646 vector
<int> *raw_upmap
) const
2648 auto pool
= get_pg_pool(pg
.pool());
2653 _pg_to_raw_osds(*pool
, pg
, raw
, NULL
);
2655 _apply_upmap(*pool
, pg
, raw_upmap
);
2658 void OSDMap::pg_to_raw_up(pg_t pg
, vector
<int> *up
, int *primary
) const
2660 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
2668 _pg_to_raw_osds(*pool
, pg
, &raw
, &pps
);
2669 _apply_upmap(*pool
, pg
, &raw
);
2670 _raw_to_up_osds(*pool
, raw
, up
);
2671 *primary
= _pick_primary(raw
);
2672 _apply_primary_affinity(pps
, *pool
, up
, primary
);
2675 void OSDMap::_pg_to_up_acting_osds(
2676 const pg_t
& pg
, vector
<int> *up
, int *up_primary
,
2677 vector
<int> *acting
, int *acting_primary
,
2678 bool raw_pg_to_pg
) const
2680 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
2682 (!raw_pg_to_pg
&& pg
.ps() >= pool
->get_pg_num())) {
2690 *acting_primary
= -1;
2695 vector
<int> _acting
;
2697 int _acting_primary
;
2699 _get_temp_osds(*pool
, pg
, &_acting
, &_acting_primary
);
2700 if (_acting
.empty() || up
|| up_primary
) {
2701 _pg_to_raw_osds(*pool
, pg
, &raw
, &pps
);
2702 _apply_upmap(*pool
, pg
, &raw
);
2703 _raw_to_up_osds(*pool
, raw
, &_up
);
2704 _up_primary
= _pick_primary(_up
);
2705 _apply_primary_affinity(pps
, *pool
, &_up
, &_up_primary
);
2706 if (_acting
.empty()) {
2708 if (_acting_primary
== -1) {
2709 _acting_primary
= _up_primary
;
2716 *up_primary
= _up_primary
;
2720 acting
->swap(_acting
);
2722 *acting_primary
= _acting_primary
;
2725 int OSDMap::calc_pg_role_broken(int osd
, const vector
<int>& acting
, int nrep
)
2727 // This implementation is broken for EC PGs since the osd may appear
2728 // multiple times in the acting set. See
2729 // https://tracker.ceph.com/issues/43213
2731 nrep
= acting
.size();
2732 for (int i
=0; i
<nrep
; i
++)
2733 if (acting
[i
] == osd
)
2738 int OSDMap::calc_pg_role(pg_shard_t who
, const vector
<int>& acting
)
2740 int nrep
= acting
.size();
2741 if (who
.shard
== shard_id_t::NO_SHARD
) {
2742 for (int i
=0; i
<nrep
; i
++) {
2743 if (acting
[i
] == who
.osd
) {
2748 if (who
.shard
< nrep
&& acting
[who
.shard
] == who
.osd
) {
2755 bool OSDMap::primary_changed_broken(
2757 const vector
<int> &oldacting
,
2759 const vector
<int> &newacting
)
2761 if (oldacting
.empty() && newacting
.empty())
2762 return false; // both still empty
2763 if (oldacting
.empty() ^ newacting
.empty())
2764 return true; // was empty, now not, or vice versa
2765 if (oldprimary
!= newprimary
)
2766 return true; // primary changed
2767 if (calc_pg_role_broken(oldprimary
, oldacting
) !=
2768 calc_pg_role_broken(newprimary
, newacting
))
2770 return false; // same primary (tho replicas may have changed)
2773 uint64_t OSDMap::get_encoding_features() const
2775 uint64_t f
= SIGNIFICANT_FEATURES
;
2776 if (require_osd_release
< ceph_release_t::octopus
) {
2777 f
&= ~CEPH_FEATURE_SERVER_OCTOPUS
;
2779 if (require_osd_release
< ceph_release_t::nautilus
) {
2780 f
&= ~CEPH_FEATURE_SERVER_NAUTILUS
;
2782 if (require_osd_release
< ceph_release_t::mimic
) {
2783 f
&= ~CEPH_FEATURE_SERVER_MIMIC
;
2785 if (require_osd_release
< ceph_release_t::luminous
) {
2786 f
&= ~(CEPH_FEATURE_SERVER_LUMINOUS
|
2787 CEPH_FEATURE_CRUSH_CHOOSE_ARGS
);
2789 if (require_osd_release
< ceph_release_t::kraken
) {
2790 f
&= ~(CEPH_FEATURE_SERVER_KRAKEN
|
2791 CEPH_FEATURE_MSG_ADDR2
);
2793 if (require_osd_release
< ceph_release_t::jewel
) {
2794 f
&= ~(CEPH_FEATURE_SERVER_JEWEL
|
2795 CEPH_FEATURE_NEW_OSDOP_ENCODING
|
2796 CEPH_FEATURE_CRUSH_TUNABLES5
);
2801 // serialize, unserialize
// Encode the map into bl in the oldest client format.
//
// Containers are written element by element behind an explicit 32-bit
// count (see the "for encode(...)" markers) rather than through the generic
// container encoders.  Pools and client addresses are encoded with a
// features value of 0, and crush is encoded with no features.
//
// NOTE(review): this listing has dropped lines (gaps in the embedded line
// numbers), so not every encode() call of the original is visible here.
2802 void OSDMap::encode_client_old(ceph::buffer::list
& bl
) const
2811 encode(created
, bl
);
2812 encode(modified
, bl
);
2814 // for encode(pools, bl);
2815 __u32 n
= pools
.size();
2818 for (const auto &pool
: pools
) {
2821 encode(pool
.second
, bl
, 0);
2823 // for encode(pool_name, bl);
2824 n
= pool_name
.size();
2826 for (const auto &pname
: pool_name
) {
2829 encode(pname
.second
, bl
);
2831 // for encode(pool_max, bl);
2837 encode(max_osd
, bl
);
// osd_state is written as a count followed by one uint8_t per entry
2839 uint32_t n
= osd_state
.size();
2841 for (auto s
: osd_state
) {
2842 encode((uint8_t)s
, bl
);
2845 encode(osd_weight
, bl
);
2846 encode(osd_addrs
->client_addrs
, bl
, 0);
2848 // for encode(pg_temp, bl);
2849 n
= pg_temp
->size();
2851 for (const auto& pg
: *pg_temp
) {
// pg ids are converted to the old pg type for this format
2852 old_pg_t opg
= pg
.first
.get_old_pg();
2854 encode(pg
.second
, bl
);
// crush goes into its own buffer first
2858 ceph::buffer::list cbl
;
2859 crush
->encode(cbl
, 0 /* legacy (no) features */);
// Encode the map into bl in the classic (pre-OSDMAP_ENC) format.
//
// If features lacks CEPH_FEATURE_PGID64 the work is handed to
// encode_client_old().  Otherwise the client-visible fields are written,
// followed by the extended OSD fields (heartbeat/cluster addresses,
// osd_info, blocklist, cluster snapshot, uuids, xinfo).  crush is encoded
// with no features.
//
// NOTE(review): this listing has dropped lines (gaps in the embedded line
// numbers); the version header and some encode() calls are not visible.
2863 void OSDMap::encode_classic(ceph::buffer::list
& bl
, uint64_t features
) const
2866 if ((features
& CEPH_FEATURE_PGID64
) == 0) {
2867 encode_client_old(bl
);
2877 encode(created
, bl
);
2878 encode(modified
, bl
);
2880 encode(pools
, bl
, features
);
2881 encode(pool_name
, bl
);
2882 encode(pool_max
, bl
);
2886 encode(max_osd
, bl
);
// osd_state is written as a count followed by one uint8_t per entry
2888 uint32_t n
= osd_state
.size();
2890 for (auto s
: osd_state
) {
2891 encode((uint8_t)s
, bl
);
2894 encode(osd_weight
, bl
);
2895 encode(osd_addrs
->client_addrs
, bl
, features
);
2897 encode(*pg_temp
, bl
);
2900 ceph::buffer::list cbl
;
2901 crush
->encode(cbl
, 0 /* legacy (no) features */);
// extended, osd-side fields
2907 encode(osd_addrs
->hb_back_addrs
, bl
, features
);
2908 encode(osd_info
, bl
);
2909 encode(blocklist
, bl
, features
);
2910 encode(osd_addrs
->cluster_addrs
, bl
, features
);
2911 encode(cluster_snapshot_epoch
, bl
);
2912 encode(cluster_snapshot
, bl
);
2913 encode(*osd_uuid
, bl
);
2914 encode(osd_xinfo
, bl
, features
);
2915 encode(osd_addrs
->hb_front_addrs
, bl
, features
);
/* for a description of osdmap versions, and when they were introduced, please
 * refer to doc/dev/osd_internals/osdmap_versions.txt
 */
// Encode the map into bl using the modern, sectioned format.
//
// - Without CEPH_FEATURE_OSDMAP_ENC in features, encode_classic() is used.
// - Callers must pass CEPH_FEATURE_RESERVED (asserted); it is stripped
//   before features is used for encoding.
// - Layout: a wrapper ENCODE_START(8, 7), containing a client-usable
//   section, an osd-only section, and a 4-byte hole that is later filled
//   with the crc.
// - The crc is crc32c over the bytes from start_offset up to the hole,
//   continued over any bytes after the hole.
//
// NOTE(review): this listing has dropped lines (gaps in the embedded line
// numbers).  The version values assigned inside the HAVE_FEATURE branches
// and the conditions selecting between alternative encodings are not
// visible here.
2922 void OSDMap::encode(ceph::buffer::list
& bl
, uint64_t features
) const
2925 if ((features
& CEPH_FEATURE_OSDMAP_ENC
) == 0) {
2926 encode_classic(bl
, features
);
2930 // only a select set of callers should *ever* be encoding new
2931 // OSDMaps. others should be passing around the canonical encoded
2932 // buffers from on high. select out those callers by passing in an
2933 // "impossible" feature bit.
2934 ceph_assert(features
& CEPH_FEATURE_RESERVED
);
2935 features
&= ~CEPH_FEATURE_RESERVED
;
// remember where this encoding begins so the crc can cover it
2937 size_t start_offset
= bl
.length();
2940 std::optional
<ceph::buffer::list::contiguous_filler
> crc_filler
;
2942 // meta-encoding: how we include client-used and osd-specific data
2943 ENCODE_START(8, 7, bl
);
2946 // NOTE: any new encoding dependencies must be reflected by
2947 // SIGNIFICANT_FEATURES
2949 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
2951 } else if (!HAVE_FEATURE(features
, SERVER_MIMIC
)) {
2953 } else if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
2956 ENCODE_START(v
, 1, bl
); // client-usable data
2960 encode(created
, bl
);
2961 encode(modified
, bl
);
2963 encode(pools
, bl
, features
);
2964 encode(pool_name
, bl
);
2965 encode(pool_max
, bl
);
// f is a copy of flags with the legacy REQUIRE_* bits derived from
// require_osd_release OR-ed in
2968 decltype(flags
) f
= flags
;
2969 if (require_osd_release
>= ceph_release_t::luminous
)
2970 f
|= CEPH_OSDMAP_REQUIRE_LUMINOUS
| CEPH_OSDMAP_RECOVERY_DELETES
;
2971 else if (require_osd_release
== ceph_release_t::kraken
)
2972 f
|= CEPH_OSDMAP_REQUIRE_KRAKEN
;
2973 else if (require_osd_release
== ceph_release_t::jewel
)
2974 f
|= CEPH_OSDMAP_REQUIRE_JEWEL
;
2980 encode(max_osd
, bl
);
2982 encode(osd_state
, bl
);
2984 uint32_t n
= osd_state
.size();
2986 for (auto s
: osd_state
) {
2987 encode((uint8_t)s
, bl
);
2990 encode(osd_weight
, bl
);
2992 encode(osd_addrs
->client_addrs
, bl
, features
);
2994 encode_addrvec_pvec_as_addr(osd_addrs
->client_addrs
, bl
, features
);
2997 encode(*pg_temp
, bl
);
2998 encode(*primary_temp
, bl
);
2999 if (osd_primary_affinity
) {
3000 encode(*osd_primary_affinity
, bl
);
3007 ceph::buffer::list cbl
;
3008 crush
->encode(cbl
, features
);
3010 encode(erasure_code_profiles
, bl
);
3013 encode(pg_upmap
, bl
);
3014 encode(pg_upmap_items
, bl
);
3016 ceph_assert(pg_upmap
.empty());
3017 ceph_assert(pg_upmap_items
.empty());
3020 encode(crush_version
, bl
);
3023 encode(new_removed_snaps
, bl
);
3024 encode(new_purged_snaps
, bl
);
3027 encode(last_up_change
, bl
);
3028 encode(last_in_change
, bl
);
3030 ENCODE_FINISH(bl
); // client-usable data
3034 // NOTE: any new encoding dependencies must be reflected by
3035 // SIGNIFICANT_FEATURES
3036 uint8_t target_v
= 9; // when bumping this, be aware of stretch_mode target_v 10!
3037 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
3039 } else if (!HAVE_FEATURE(features
, SERVER_MIMIC
)) {
3041 } else if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
// stretch mode forces the osd-only section to at least version 10
3044 if (stretch_mode_enabled
) {
3045 target_v
= std::max((uint8_t)10, target_v
);
3047 ENCODE_START(target_v
, 1, bl
); // extended, osd-only data
3049 encode_addrvec_pvec_as_addr(osd_addrs
->hb_back_addrs
, bl
, features
);
3051 encode(osd_addrs
->hb_back_addrs
, bl
, features
);
3053 encode(osd_info
, bl
);
3055 // put this in a sorted, ordered map<> so that we encode in a
3056 // deterministic order.
3057 map
<entity_addr_t
,utime_t
> blocklist_map
;
3058 for (const auto &addr
: blocklist
)
3059 blocklist_map
.insert(make_pair(addr
.first
, addr
.second
));
3060 encode(blocklist_map
, bl
, features
);
3063 encode_addrvec_pvec_as_addr(osd_addrs
->cluster_addrs
, bl
, features
);
3065 encode(osd_addrs
->cluster_addrs
, bl
, features
);
3067 encode(cluster_snapshot_epoch
, bl
);
3068 encode(cluster_snapshot
, bl
);
3069 encode(*osd_uuid
, bl
);
3070 encode(osd_xinfo
, bl
, features
);
3072 encode_addrvec_pvec_as_addr(osd_addrs
->hb_front_addrs
, bl
, features
);
3074 encode(osd_addrs
->hb_front_addrs
, bl
, features
);
// the remaining fields are appended only from the section version that
// introduced them
3076 if (target_v
>= 2) {
3077 encode(nearfull_ratio
, bl
);
3078 encode(full_ratio
, bl
);
3079 encode(backfillfull_ratio
, bl
);
3081 // 4 was string-based new_require_min_compat_client
3082 if (target_v
>= 5) {
3083 encode(require_min_compat_client
, bl
);
3084 encode(require_osd_release
, bl
);
3086 if (target_v
>= 6) {
3087 encode(removed_snaps_queue
, bl
);
3089 if (target_v
>= 8) {
3090 encode(crush_node_flags
, bl
);
3092 if (target_v
>= 9) {
3093 encode(device_class_flags
, bl
);
3095 if (target_v
>= 10) {
3096 encode(stretch_mode_enabled
, bl
);
3097 encode(stretch_bucket_count
, bl
);
3098 encode(degraded_stretch_mode
, bl
);
3099 encode(recovering_stretch_mode
, bl
);
3100 encode(stretch_mode_bucket
, bl
);
3102 ENCODE_FINISH(bl
); // osd-only data
// reserve a uint32_t-sized hole for the crc and note what follows it
3105 crc_offset
= bl
.length();
3106 crc_filler
= bl
.append_hole(sizeof(uint32_t));
3107 tail_offset
= bl
.length();
3109 ENCODE_FINISH(bl
); // meta-encoding wrapper
// crc covers [start_offset, crc_offset) and then anything after the hole
3112 ceph::buffer::list front
;
3113 front
.substr_of(bl
, start_offset
, crc_offset
- start_offset
);
3114 crc
= front
.crc32c(-1);
3115 if (tail_offset
< bl
.length()) {
3116 ceph::buffer::list tail
;
3117 tail
.substr_of(bl
, tail_offset
, bl
.length() - tail_offset
);
3118 crc
= tail
.crc32c(crc
);
// fill the reserved hole with the 4 crc bytes
3122 crc_filler
->copy_in(4, (char*)&crc_le
);
/* for a description of osdmap versions, and when they were introduced, please
 * refer to doc/dev/osd_internals/osdmap_versions.txt
 */
3130 void OSDMap::decode(ceph::buffer::list
& bl
)
3132 auto p
= bl
.cbegin();
// Decode a map written in the classic (pre-sectioned) format from p.
//
// Older versions carry pools / pool names / pg_temp in per-element legacy
// layouts; fields that are not decoded on a given path are instead resized
// to match a related vector, and osd_primary_affinity is reset at the end.
//
// NOTE(review): this listing has dropped lines (gaps in the embedded line
// numbers); the version checks guarding most of these statements are not
// visible here.
3136 void OSDMap::decode_classic(ceph::buffer::list::const_iterator
& p
)
3147 decode(modified
, p
);
// pool_max is decoded through a 32-bit temporary on this path
3151 int32_t max_pools
= 0;
3152 decode(max_pools
, p
);
3153 pool_max
= max_pools
;
3159 decode(pools
[t
], p
);
3164 } else if (v
== 5) {
3169 decode(pool_name
[t
], p
);
3176 decode(pool_name
, p
);
3177 decode(pool_max
, p
);
3179 // kludge around some old bug that zeroed out pool_max (#2307)
3180 if (pools
.size() && pool_max
< pools
.rbegin()->first
) {
3181 pool_max
= pools
.rbegin()->first
;
// copy the decoded per-osd state values into osd_state
3190 osd_state
.resize(os
.size());
3191 for (unsigned i
= 0; i
< os
.size(); ++i
) {
3192 osd_state
[i
] = os
[i
];
3195 decode(osd_weight
, p
);
3196 decode(osd_addrs
->client_addrs
, p
);
// legacy pg_temp entries are keyed by the old pg type
3202 ceph::decode_raw(opg
, p
);
3203 mempool::osdmap::vector
<int32_t> v
;
3205 pg_temp
->set(pg_t(opg
), v
);
3208 decode(*pg_temp
, p
);
3212 ceph::buffer::list cbl
;
3214 auto cblp
= cbl
.cbegin();
3215 crush
->decode(cblp
);
3221 decode(osd_addrs
->hb_back_addrs
, p
);
3222 decode(osd_info
, p
);
3224 decode(pool_name
, p
);
3226 decode(blocklist
, p
);
3228 decode(osd_addrs
->cluster_addrs
, p
);
3230 osd_addrs
->cluster_addrs
.resize(osd_addrs
->client_addrs
.size());
3233 decode(cluster_snapshot_epoch
, p
);
3234 decode(cluster_snapshot
, p
);
3238 decode(*osd_uuid
, p
);
3240 osd_uuid
->resize(max_osd
);
3243 decode(osd_xinfo
, p
);
3245 osd_xinfo
.resize(max_osd
);
3248 decode(osd_addrs
->hb_front_addrs
, p
);
3250 osd_addrs
->hb_front_addrs
.resize(osd_addrs
->hb_back_addrs
.size());
3252 osd_primary_affinity
.reset();
// Decode the map from bl in the modern, sectioned format.
//
// Reads a wrapper section, then a client-usable section and an osd-only
// section.  Fields introduced in later section versions are decoded only
// when struct_v is high enough and are otherwise reset or cleared.  When
// the wrapper version is >= 8 the crc32c of the encoded bytes (excluding
// the stored crc itself) is recomputed and compared against the stored
// crc; a mismatch throws ceph::buffer::malformed_input.
//
// NOTE(review): this listing has dropped lines (gaps in the embedded line
// numbers); the fallback to the classic decoder and several else-branches
// are only partially visible here.
3257 void OSDMap::decode(ceph::buffer::list::const_iterator
& bl
)
3261 * Older encodings of the OSDMap had a single struct_v which
3262 * covered the whole encoding, and was prior to our modern
3263 * stuff which includes a compatv and a size. So if we see
3264 * a struct_v < 7, we must rewind to the beginning and use our
// remember where this encoding begins for rewinding and for the crc
3267 size_t start_offset
= bl
.get_off();
3268 size_t tail_offset
= 0;
3269 ceph::buffer::list crc_front
, crc_tail
;
3271 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl
); // wrapper
3273 bl
.seek(start_offset
);
3278 * Since we made it past that hurdle, we can use our normal paths.
3281 DECODE_START(9, bl
); // client-usable data
3285 decode(created
, bl
);
3286 decode(modified
, bl
);
3289 decode(pool_name
, bl
);
3290 decode(pool_max
, bl
);
3294 decode(max_osd
, bl
);
3295 if (struct_v
>= 5) {
3296 decode(osd_state
, bl
);
3300 osd_state
.resize(os
.size());
3301 for (unsigned i
= 0; i
< os
.size(); ++i
) {
3302 osd_state
[i
] = os
[i
];
3305 decode(osd_weight
, bl
);
3306 decode(osd_addrs
->client_addrs
, bl
);
3308 decode(*pg_temp
, bl
);
3309 decode(*primary_temp
, bl
);
3310 // dates back to firefly. version increased from 2 to 3 still in firefly.
3311 // do we really still need to keep this around? even for old clients?
3312 if (struct_v
>= 2) {
3313 osd_primary_affinity
.reset(new mempool::osdmap::vector
<__u32
>);
3314 decode(*osd_primary_affinity
, bl
);
// an empty affinity vector is represented as an unset pointer
3315 if (osd_primary_affinity
->empty())
3316 osd_primary_affinity
.reset();
3318 osd_primary_affinity
.reset();
3322 ceph::buffer::list cbl
;
3324 auto cblp
= cbl
.cbegin();
3325 crush
->decode(cblp
);
3326 // added in firefly; version increased in luminous, so it affects
3327 // giant, hammer, infernallis, jewel, and kraken. probably should be left
3328 // alone until we require clients to be all luminous?
3329 if (struct_v
>= 3) {
3330 decode(erasure_code_profiles
, bl
);
3332 erasure_code_profiles
.clear();
3334 // version increased from 3 to 4 still in luminous, so same as above
3336 if (struct_v
>= 4) {
3337 decode(pg_upmap
, bl
);
3338 decode(pg_upmap_items
, bl
);
3341 pg_upmap_items
.clear();
3343 // again, version increased from 5 to 6 still in luminous, so above
3345 if (struct_v
>= 6) {
3346 decode(crush_version
, bl
);
3348 // version increase from 6 to 7 in mimic
3349 if (struct_v
>= 7) {
3350 decode(new_removed_snaps
, bl
);
3351 decode(new_purged_snaps
, bl
);
3353 // version increase from 7 to 8, 8 to 9, in nautilus.
3354 if (struct_v
>= 9) {
3355 decode(last_up_change
, bl
);
3356 decode(last_in_change
, bl
);
3358 DECODE_FINISH(bl
); // client-usable data
3362 DECODE_START(10, bl
); // extended, osd-only data
3363 decode(osd_addrs
->hb_back_addrs
, bl
);
3364 decode(osd_info
, bl
);
3365 decode(blocklist
, bl
);
3366 decode(osd_addrs
->cluster_addrs
, bl
);
3367 decode(cluster_snapshot_epoch
, bl
);
3368 decode(cluster_snapshot
, bl
);
3369 decode(*osd_uuid
, bl
);
3370 decode(osd_xinfo
, bl
);
3371 decode(osd_addrs
->hb_front_addrs
, bl
);
3373 if (struct_v
>= 2) {
3374 decode(nearfull_ratio
, bl
);
3375 decode(full_ratio
, bl
);
3380 if (struct_v
>= 3) {
3381 decode(backfillfull_ratio
, bl
);
3383 backfillfull_ratio
= 0;
// version 4 stored require_min_compat_client as a release name string
3385 if (struct_v
== 4) {
3389 require_min_compat_client
= ceph_release_from_name(r
.c_str());
3391 if (struct_v
>= 5) {
3392 decode(require_min_compat_client
, bl
);
3393 decode(require_osd_release
, bl
);
3394 if (require_osd_release
>= ceph_release_t::nautilus
) {
3395 flags
|= CEPH_OSDMAP_PGLOG_HARDLIMIT
;
3397 if (require_osd_release
>= ceph_release_t::luminous
) {
3398 flags
&= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS
);
3399 flags
|= CEPH_OSDMAP_RECOVERY_DELETES
;
// derive require_osd_release from the legacy REQUIRE_* flag bits
3402 if (flags
& CEPH_OSDMAP_REQUIRE_LUMINOUS
) {
3403 // only for compat with post-kraken pre-luminous test clusters
3404 require_osd_release
= ceph_release_t::luminous
;
3405 flags
&= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS
);
3406 flags
|= CEPH_OSDMAP_RECOVERY_DELETES
;
3407 } else if (flags
& CEPH_OSDMAP_REQUIRE_KRAKEN
) {
3408 require_osd_release
= ceph_release_t::kraken
;
3409 } else if (flags
& CEPH_OSDMAP_REQUIRE_JEWEL
) {
3410 require_osd_release
= ceph_release_t::jewel
;
3412 require_osd_release
= ceph_release_t::unknown
;
3415 if (struct_v
>= 6) {
3416 decode(removed_snaps_queue
, bl
);
3418 if (struct_v
>= 8) {
3419 decode(crush_node_flags
, bl
);
3421 crush_node_flags
.clear();
3423 if (struct_v
>= 9) {
3424 decode(device_class_flags
, bl
);
3426 device_class_flags
.clear();
3428 if (struct_v
>= 10) {
3429 decode(stretch_mode_enabled
, bl
);
3430 decode(stretch_bucket_count
, bl
);
3431 decode(degraded_stretch_mode
, bl
);
3432 decode(recovering_stretch_mode
, bl
);
3433 decode(stretch_mode_bucket
, bl
);
3435 stretch_mode_enabled
= false;
3436 stretch_bucket_count
= 0;
3437 degraded_stretch_mode
= 0;
3438 recovering_stretch_mode
= 0;
3439 stretch_mode_bucket
= 0;
3441 DECODE_FINISH(bl
); // osd-only data
// wrapper version 8 and later carries a crc after the two sections
3444 if (struct_v
>= 8) {
3445 crc_front
.substr_of(bl
.get_bl(), start_offset
, bl
.get_off() - start_offset
);
3447 tail_offset
= bl
.get_off();
3450 crc_defined
= false;
3454 DECODE_FINISH(bl
); // wrapper
// recompute the crc over the bytes before and after the stored value
3458 uint32_t actual
= crc_front
.crc32c(-1);
3459 if (tail_offset
< bl
.get_off()) {
3460 ceph::buffer::list tail
;
3461 tail
.substr_of(bl
.get_bl(), tail_offset
, bl
.get_off() - tail_offset
);
3462 actual
= tail
.crc32c(actual
);
3464 if (crc
!= actual
) {
3466 ss
<< "bad crc, actual " << actual
<< " != expected " << crc
;
3467 string s
= ss
.str();
3468 throw ceph::buffer::malformed_input(s
.c_str());
3475 void OSDMap::post_decode()
3479 for (const auto &pname
: pool_name
) {
3480 name_pool
[pname
.second
] = pname
.first
;
3484 _calc_up_osd_features();
3487 void OSDMap::dump_erasure_code_profiles(
3488 const mempool::osdmap::map
<string
,map
<string
,string
>>& profiles
,
3491 f
->open_object_section("erasure_code_profiles");
3492 for (const auto &profile
: profiles
) {
3493 f
->open_object_section(profile
.first
.c_str());
3494 for (const auto &profm
: profile
.second
) {
3495 f
->dump_string(profm
.first
.c_str(), profm
.second
);
3502 void OSDMap::dump_osds(Formatter
*f
) const
3504 f
->open_array_section("osds");
3505 for (int i
=0; i
<get_max_osd(); i
++) {
3513 void OSDMap::dump_osd(int id
, Formatter
*f
) const
3515 ceph_assert(f
!= nullptr);
3520 f
->open_object_section("osd_info");
3521 f
->dump_int("osd", id
);
3522 f
->dump_stream("uuid") << get_uuid(id
);
3523 f
->dump_int("up", is_up(id
));
3524 f
->dump_int("in", is_in(id
));
3525 f
->dump_float("weight", get_weightf(id
));
3526 f
->dump_float("primary_affinity", get_primary_affinityf(id
));
3527 get_info(id
).dump(f
);
3528 f
->dump_object("public_addrs", get_addrs(id
));
3529 f
->dump_object("cluster_addrs", get_cluster_addrs(id
));
3530 f
->dump_object("heartbeat_back_addrs", get_hb_back_addrs(id
));
3531 f
->dump_object("heartbeat_front_addrs", get_hb_front_addrs(id
));
3533 f
->dump_stream("public_addr") << get_addrs(id
).get_legacy_str();
3534 f
->dump_stream("cluster_addr") << get_cluster_addrs(id
).get_legacy_str();
3535 f
->dump_stream("heartbeat_back_addr")
3536 << get_hb_back_addrs(id
).get_legacy_str();
3537 f
->dump_stream("heartbeat_front_addr")
3538 << get_hb_front_addrs(id
).get_legacy_str();
3542 f
->open_array_section("state");
3543 for (const auto &state
: st
)
3544 f
->dump_string("state", state
);
// Dump the whole map through a Formatter.
//
// Sections written, in order: scalar header fields (epoch, fsid,
// timestamps, flags, ratios, release requirements), "pools", "osd_xinfo",
// "pg_upmap", "pg_upmap_items", "pg_temp", "primary_temp", "blocklist",
// erasure code profiles, the three snap-interval queues,
// "crush_node_flags", "device_class_flags" and "stretch_mode".
//
// NOTE(review): this listing has dropped lines (gaps in the embedded line
// numbers); most close_section() calls and a few statements between the
// sections are not visible here.
3550 void OSDMap::dump(Formatter
*f
) const
3552 f
->dump_int("epoch", get_epoch());
3553 f
->dump_stream("fsid") << get_fsid();
3554 f
->dump_stream("created") << get_created();
3555 f
->dump_stream("modified") << get_modified();
3556 f
->dump_stream("last_up_change") << last_up_change
;
3557 f
->dump_stream("last_in_change") << last_in_change
;
// flags are dumped three ways: as a string, as a number, and as a list
3558 f
->dump_string("flags", get_flag_string());
3559 f
->dump_unsigned("flags_num", flags
);
3560 f
->open_array_section("flags_set");
3561 set
<string
> flagset
;
3562 get_flag_set(&flagset
);
3563 for (auto p
: flagset
) {
3564 f
->dump_string("flag", p
);
3567 f
->dump_unsigned("crush_version", get_crush_version());
3568 f
->dump_float("full_ratio", full_ratio
);
3569 f
->dump_float("backfillfull_ratio", backfillfull_ratio
);
3570 f
->dump_float("nearfull_ratio", nearfull_ratio
);
3571 f
->dump_string("cluster_snapshot", get_cluster_snapshot());
3572 f
->dump_int("pool_max", get_pool_max());
3573 f
->dump_int("max_osd", get_max_osd());
3574 f
->dump_string("require_min_compat_client",
3575 to_string(require_min_compat_client
));
3576 f
->dump_string("min_compat_client",
3577 to_string(get_min_compat_client()));
3578 f
->dump_string("require_osd_release",
3579 to_string(require_osd_release
));
3581 f
->open_array_section("pools");
3582 for (const auto &pool
: pools
) {
// pools without an entry in pool_name keep the "<unknown>" placeholder
3583 std::string
name("<unknown>");
3584 const auto &pni
= pool_name
.find(pool
.first
);
3585 if (pni
!= pool_name
.end())
3587 f
->open_object_section("pool");
3588 f
->dump_int("pool", pool
.first
);
3589 f
->dump_string("pool_name", name
);
3590 pool
.second
.dump(f
);
3597 f
->open_array_section("osd_xinfo");
3598 for (int i
=0; i
<get_max_osd(); i
++) {
3600 f
->open_object_section("xinfo");
3601 f
->dump_int("osd", i
);
3602 osd_xinfo
[i
].dump(f
);
3608 f
->open_array_section("pg_upmap");
3609 for (auto& p
: pg_upmap
) {
3610 f
->open_object_section("mapping");
3611 f
->dump_stream("pgid") << p
.first
;
3612 f
->open_array_section("osds");
3613 for (auto q
: p
.second
) {
3614 f
->dump_int("osd", q
);
3620 f
->open_array_section("pg_upmap_items");
3621 for (auto& p
: pg_upmap_items
) {
3622 f
->open_object_section("mapping");
3623 f
->dump_stream("pgid") << p
.first
;
3624 f
->open_array_section("mappings");
3625 for (auto& q
: p
.second
) {
3626 f
->open_object_section("mapping");
3627 f
->dump_int("from", q
.first
);
3628 f
->dump_int("to", q
.second
);
3635 f
->open_array_section("pg_temp");
3639 f
->open_array_section("primary_temp");
3640 for (const auto &pg
: *primary_temp
) {
3641 f
->dump_stream("pgid") << pg
.first
;
3642 f
->dump_int("osd", pg
.second
);
3644 f
->close_section(); // primary_temp
3646 f
->open_object_section("blocklist");
3647 for (const auto &addr
: blocklist
) {
3650 f
->dump_stream(ss
.str().c_str()) << addr
.second
;
3654 dump_erasure_code_profiles(erasure_code_profiles
, f
);
// the next three sections share one layout: per pool, a "snaps" array of
// {begin, length} intervals
3656 f
->open_array_section("removed_snaps_queue");
3657 for (auto& p
: removed_snaps_queue
) {
3658 f
->open_object_section("pool");
3659 f
->dump_int("pool", p
.first
);
3660 f
->open_array_section("snaps");
3661 for (auto q
= p
.second
.begin(); q
!= p
.second
.end(); ++q
) {
3662 f
->open_object_section("interval");
3663 f
->dump_unsigned("begin", q
.get_start());
3664 f
->dump_unsigned("length", q
.get_len());
3671 f
->open_array_section("new_removed_snaps");
3672 for (auto& p
: new_removed_snaps
) {
3673 f
->open_object_section("pool");
3674 f
->dump_int("pool", p
.first
);
3675 f
->open_array_section("snaps");
3676 for (auto q
= p
.second
.begin(); q
!= p
.second
.end(); ++q
) {
3677 f
->open_object_section("interval");
3678 f
->dump_unsigned("begin", q
.get_start());
3679 f
->dump_unsigned("length", q
.get_len());
3686 f
->open_array_section("new_purged_snaps");
3687 for (auto& p
: new_purged_snaps
) {
3688 f
->open_object_section("pool");
3689 f
->dump_int("pool", p
.first
);
3690 f
->open_array_section("snaps");
3691 for (auto q
= p
.second
.begin(); q
!= p
.second
.end(); ++q
) {
3692 f
->open_object_section("interval");
3693 f
->dump_unsigned("begin", q
.get_start());
3694 f
->dump_unsigned("length", q
.get_len());
3701 f
->open_object_section("crush_node_flags");
3702 for (auto& i
: crush_node_flags
) {
// use the crush item name when the item exists, else the numeric id
3703 string s
= crush
->item_exists(i
.first
) ? crush
->get_item_name(i
.first
)
3704 : stringify(i
.first
);
3705 f
->open_array_section(s
.c_str());
3707 calc_state_set(i
.second
, st
);
3708 for (auto& j
: st
) {
3709 f
->dump_string("flag", j
);
3714 f
->open_object_section("device_class_flags");
3715 for (auto& i
: device_class_flags
) {
// use the class name when crush knows it, else the numeric id
3716 const char* class_name
= crush
->get_class_name(i
.first
);
3717 string s
= class_name
? class_name
: stringify(i
.first
);
3718 f
->open_array_section(s
.c_str());
3720 calc_state_set(i
.second
, st
);
3721 for (auto& j
: st
) {
3722 f
->dump_string("flag", j
);
3727 f
->open_object_section("stretch_mode");
3729 f
->dump_bool("stretch_mode_enabled", stretch_mode_enabled
);
3730 f
->dump_unsigned("stretch_bucket_count", stretch_bucket_count
);
3731 f
->dump_unsigned("degraded_stretch_mode", degraded_stretch_mode
);
3732 f
->dump_unsigned("recovering_stretch_mode", recovering_stretch_mode
);
3733 f
->dump_int("stretch_mode_bucket", stretch_mode_bucket
);
3738 void OSDMap::generate_test_instances(list
<OSDMap
*>& o
)
3740 o
.push_back(new OSDMap
);
3742 CephContext
*cct
= new CephContext(CODE_ENVIRONMENT_UTILITY
);
3743 o
.push_back(new OSDMap
);
3745 o
.back()->build_simple(cct
, 1, fsid
, 16);
3746 o
.back()->created
= o
.back()->modified
= utime_t(1, 2); // fix timestamp
3747 o
.back()->blocklist
[entity_addr_t()] = utime_t(5, 6);
// Build a string describing the CEPH_OSDMAP_* bits set in f.
//
// Each recognised bit appends a comma-prefixed name to s (the visible
// literals all start with ',').
//
// NOTE(review): this listing has dropped lines (gaps in the embedded line
// numbers); the declaration of s, most of the appended literals, and the
// tail of the function (presumably stripping the leading comma and
// returning s; confirm against the full file) are not visible here.
3751 string
OSDMap::get_flag_string(unsigned f
)
3754 if (f
& CEPH_OSDMAP_PAUSERD
)
3756 if (f
& CEPH_OSDMAP_PAUSEWR
)
3758 if (f
& CEPH_OSDMAP_PAUSEREC
)
3760 if (f
& CEPH_OSDMAP_NOUP
)
3762 if (f
& CEPH_OSDMAP_NODOWN
)
3764 if (f
& CEPH_OSDMAP_NOOUT
)
3766 if (f
& CEPH_OSDMAP_NOIN
)
3768 if (f
& CEPH_OSDMAP_NOBACKFILL
)
3770 if (f
& CEPH_OSDMAP_NOREBALANCE
)
3771 s
+= ",norebalance";
3772 if (f
& CEPH_OSDMAP_NORECOVER
)
3774 if (f
& CEPH_OSDMAP_NOSCRUB
)
3776 if (f
& CEPH_OSDMAP_NODEEP_SCRUB
)
3777 s
+= ",nodeep-scrub";
3778 if (f
& CEPH_OSDMAP_NOTIERAGENT
)
3779 s
+= ",notieragent";
3780 if (f
& CEPH_OSDMAP_NOSNAPTRIM
)
3782 if (f
& CEPH_OSDMAP_SORTBITWISE
)
3783 s
+= ",sortbitwise";
3784 if (f
& CEPH_OSDMAP_REQUIRE_JEWEL
)
3785 s
+= ",require_jewel_osds";
3786 if (f
& CEPH_OSDMAP_REQUIRE_KRAKEN
)
3787 s
+= ",require_kraken_osds";
3788 if (f
& CEPH_OSDMAP_REQUIRE_LUMINOUS
)
3789 s
+= ",require_luminous_osds";
3790 if (f
& CEPH_OSDMAP_RECOVERY_DELETES
)
3791 s
+= ",recovery_deletes";
3792 if (f
& CEPH_OSDMAP_PURGED_SNAPDIRS
)
3793 s
+= ",purged_snapdirs";
3794 if (f
& CEPH_OSDMAP_PGLOG_HARDLIMIT
)
3795 s
+= ",pglog_hardlimit";
3801 string
OSDMap::get_flag_string() const
3803 return get_flag_string(flags
);
// Print one "pool ..." line per pool to out, followed by one "\tsnap ..."
// line per pool snapshot, a "\tremoved_snaps ..." line when the pool's
// removed_snaps is non-empty, and a "\tremoved_snaps_queue ..." line when
// the pool has an entry in removed_snaps_queue.
//
// NOTE(review): this listing has dropped lines (gaps in the embedded line
// numbers); part of the "pool" line and the function tail are not visible.
3806 void OSDMap::print_pools(ostream
& out
) const
3808 for (const auto &pool
: pools
) {
// pools without an entry in pool_name keep the "<unknown>" placeholder
3809 std::string
name("<unknown>");
3810 const auto &pni
= pool_name
.find(pool
.first
);
3811 if (pni
!= pool_name
.end())
3813 out
<< "pool " << pool
.first
3815 << "' " << pool
.second
<< "\n";
3817 for (const auto &snap
: pool
.second
.snaps
)
3818 out
<< "\tsnap " << snap
.second
.snapid
<< " '" << snap
.second
.name
<< "' " << snap
.second
.stamp
<< "\n";
3820 if (!pool
.second
.removed_snaps
.empty())
3821 out
<< "\tremoved_snaps " << pool
.second
.removed_snaps
<< "\n";
3822 auto p
= removed_snaps_queue
.find(pool
.first
);
3823 if (p
!= removed_snaps_queue
.end()) {
3824 out
<< "\tremoved_snaps_queue " << p
->second
<< "\n";
3830 void OSDMap::print_osds(ostream
& out
) const
3832 for (int i
=0; i
<get_max_osd(); i
++) {
// Print a one-line summary of OSD id to out: "osd.<id>", up/down, in/out,
// weight, primary affinity (only when it differs from the default), the
// public and cluster addresses, and the uuid when it is non-zero.
//
// NOTE(review): this listing has dropped lines (gaps in the embedded line
// numbers); how `info` is used and the end of the line are not visible,
// and whitespace inside the string literals may have been collapsed.
3838 void OSDMap::print_osd(int id
, ostream
& out
) const
3844 out
<< "osd." << id
;
3845 out
<< (is_up(id
) ? " up ":" down");
3846 out
<< (is_in(id
) ? " in ":" out");
3847 out
<< " weight " << get_weightf(id
);
3848 if (get_primary_affinity(id
) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
) {
3849 out
<< " primary_affinity " << get_primary_affinityf(id
);
3851 const osd_info_t
& info(get_info(id
));
3853 out
<< " " << get_addrs(id
) << " " << get_cluster_addrs(id
);
3857 if (!get_uuid(id
).is_zero()) {
3858 out
<< " " << get_uuid(id
);
// Print a plain-text summary of the map to out.
//
// Writes the header fields one per line, then optional lines
// (require_min_compat_client when known, require_osd_release when known,
// stretch-mode details when enabled, cluster_snapshot when non-empty),
// max_osd, and finally the pg_upmap, pg_upmap_items, pg_temp, primary_temp
// and blocklist entries.
//
// NOTE(review): this listing has dropped lines (gaps in the embedded line
// numbers); the calls that presumably print pools and OSDs between these
// parts are not visible here; confirm against the full file.
3863 void OSDMap::print(ostream
& out
) const
3865 out
<< "epoch " << get_epoch() << "\n"
3866 << "fsid " << get_fsid() << "\n"
3867 << "created " << get_created() << "\n"
3868 << "modified " << get_modified() << "\n";
3870 out
<< "flags " << get_flag_string() << "\n";
3871 out
<< "crush_version " << get_crush_version() << "\n";
3872 out
<< "full_ratio " << full_ratio
<< "\n";
3873 out
<< "backfillfull_ratio " << backfillfull_ratio
<< "\n";
3874 out
<< "nearfull_ratio " << nearfull_ratio
<< "\n";
3875 if (require_min_compat_client
!= ceph_release_t::unknown
) {
3876 out
<< "require_min_compat_client "
3877 << require_min_compat_client
<< "\n";
3879 out
<< "min_compat_client " << get_min_compat_client()
3881 if (require_osd_release
> ceph_release_t::unknown
) {
3882 out
<< "require_osd_release " << require_osd_release
3885 out
<< "stretch_mode_enabled " << (stretch_mode_enabled
? "true" : "false") << "\n";
// the remaining stretch-mode fields are only printed when it is enabled
3886 if (stretch_mode_enabled
) {
3887 out
<< "stretch_bucket_count " << stretch_bucket_count
<< "\n";
3888 out
<< "degraded_stretch_mode " << degraded_stretch_mode
<< "\n";
3889 out
<< "recovering_stretch_mode " << recovering_stretch_mode
<< "\n";
3890 out
<< "stretch_mode_bucket " << stretch_mode_bucket
<< "\n";
3892 if (get_cluster_snapshot().length())
3893 out
<< "cluster_snapshot " << get_cluster_snapshot() << "\n";
3898 out
<< "max_osd " << get_max_osd() << "\n";
3902 for (auto& p
: pg_upmap
) {
3903 out
<< "pg_upmap " << p
.first
<< " " << p
.second
<< "\n";
3905 for (auto& p
: pg_upmap_items
) {
3906 out
<< "pg_upmap_items " << p
.first
<< " " << p
.second
<< "\n";
3909 for (const auto& pg
: *pg_temp
)
3910 out
<< "pg_temp " << pg
.first
<< " " << pg
.second
<< "\n";
3912 for (const auto& pg
: *primary_temp
)
3913 out
<< "primary_temp " << pg
.first
<< " " << pg
.second
<< "\n";
3915 for (const auto &addr
: blocklist
)
3916 out
<< "blocklist " << addr
.first
<< " expires " << addr
.second
<< "\n";
// Plain-text OSD tree dumper: a CrushTreeDumper::Dumper specialised for
// TextTable output.
//
// It holds the OSDMap being dumped and a filter bitmask built from
// OSDMap::DUMP_* values.  Leaf OSDs are selected by should_dump_leaf();
// each dumped item becomes one table row.
//
// NOTE(review): this listing has dropped lines (gaps in the embedded line
// numbers); access specifiers, several return statements and most of the
// row-building code in dump_item() are not visible here.
3919 class OSDTreePlainDumper
: public CrushTreeDumper::Dumper
<TextTable
> {
3921 typedef CrushTreeDumper::Dumper
<TextTable
> Parent
;
3923 OSDTreePlainDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap_
,
3925 : Parent(crush
, osdmap_
->get_pool_names()), osdmap(osdmap_
), filter(f
) { }
// An OSD leaf passes if any filter bit matches its state
// (up / down / in / out / destroyed).
3927 bool should_dump_leaf(int i
) const override
{
3929 return true; // normal case
3931 if (((filter
& OSDMap::DUMP_UP
) && osdmap
->is_up(i
)) ||
3932 ((filter
& OSDMap::DUMP_DOWN
) && osdmap
->is_down(i
)) ||
3933 ((filter
& OSDMap::DUMP_IN
) && osdmap
->is_in(i
)) ||
3934 ((filter
& OSDMap::DUMP_OUT
) && osdmap
->is_out(i
)) ||
3935 ((filter
& OSDMap::DUMP_DESTROYED
) && osdmap
->is_destroyed(i
))) {
3941 bool should_dump_empty_bucket() const override
{
// Define the table columns, in output order.
3945 void init_table(TextTable
*tbl
) {
3946 tbl
->define_column("ID", TextTable::LEFT
, TextTable::RIGHT
);
3947 tbl
->define_column("CLASS", TextTable::LEFT
, TextTable::RIGHT
);
3948 tbl
->define_column("WEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
3949 tbl
->define_column("TYPE NAME", TextTable::LEFT
, TextTable::LEFT
);
3950 tbl
->define_column("STATUS", TextTable::LEFT
, TextTable::RIGHT
);
3951 tbl
->define_column("REWEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
3952 tbl
->define_column("PRI-AFF", TextTable::LEFT
, TextTable::RIGHT
);
// Dump the tree into tbl.  After the tree itself, existing OSDs that were
// not touched while walking it (and pass the filter) are dumped as
// depth-0 items.
3954 void dump(TextTable
*tbl
, string
& bucket
) {
3957 if (!bucket
.empty()) {
3962 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
3963 if (osdmap
->exists(i
) && !is_touched(i
) && should_dump_leaf(i
)) {
3964 dump_item(CrushTreeDumper::Item(i
, 0, 0, 0), tbl
);
// Emit one table row for a bucket or an OSD.
3971 void dump_item(const CrushTreeDumper::Item
&qi
, TextTable
*tbl
) override
{
3972 const char *c
= crush
->get_item_class(qi
.id
);
3977 << weightf_t(qi
.weight
);
3980 for (int k
= 0; k
< qi
.depth
; k
++)
// buckets are labelled "<type name> <item name>", OSDs "osd.<id>"
3982 if (qi
.is_bucket()) {
3983 name
<< crush
->get_type_name(crush
->get_bucket_type(qi
.id
)) << " "
3984 << crush
->get_item_name(qi
.id
);
3986 name
<< "osd." << qi
.id
;
3990 if (!qi
.is_bucket()) {
3991 if (!osdmap
->exists(qi
.id
)) {
3996 if (osdmap
->is_up(qi
.id
)) {
3998 } else if (osdmap
->is_destroyed(qi
.id
)) {
4004 << weightf_t(osdmap
->get_weightf(qi
.id
))
4005 << weightf_t(osdmap
->get_primary_affinityf(qi
.id
));
4008 *tbl
<< TextTable::endrow
;
// map being dumped, and the OSDMap::DUMP_* filter bitmask
4012 const OSDMap
*osdmap
;
4013 const unsigned filter
;
// Formatter-based counterpart of the plain-text tree dumper: walks the crush
// tree through CrushTreeDumper::FormattingDumper and adds OSDMap-specific
// fields (exists, status, reweight, primary_affinity) for leaf items.
// Leaves are filtered with the same DUMP_* bit tests on `filter`.
4016 class OSDTreeFormattingDumper
: public CrushTreeDumper::FormattingDumper
{
4018 typedef CrushTreeDumper::FormattingDumper Parent
;
// Constructor: forwards crush and the map's pool names to Parent and stores
// the map pointer and filter value (`f` is declared on a line not shown).
4020 OSDTreeFormattingDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap_
,
4022 : Parent(crush
, osdmap_
->get_pool_names()), osdmap(osdmap_
), filter(f
) { }
// Returns true outright on one path (guard not visible); otherwise tests the
// leaf against each DUMP_* bit set in `filter`.
4024 bool should_dump_leaf(int i
) const override
{
4026 return true; // normal case
4028 if (((filter
& OSDMap::DUMP_UP
) && osdmap
->is_up(i
)) ||
4029 ((filter
& OSDMap::DUMP_DOWN
) && osdmap
->is_down(i
)) ||
4030 ((filter
& OSDMap::DUMP_IN
) && osdmap
->is_in(i
)) ||
4031 ((filter
& OSDMap::DUMP_OUT
) && osdmap
->is_out(i
)) ||
4032 ((filter
& OSDMap::DUMP_DESTROYED
) && osdmap
->is_destroyed(i
))) {
// NOTE(review): body not visible in this excerpt.
4038 bool should_dump_empty_bucket() const override
{
// Emits a "nodes" array (opened on both the bucket-restricted and the
// unrestricted path) followed by a "stray" array holding every existing OSD
// that the tree walk did not touch and that passes should_dump_leaf().
4042 void dump(Formatter
*f
, string
& bucket
) {
4043 if (!bucket
.empty()) {
4045 f
->open_array_section("nodes");
4049 f
->open_array_section("nodes");
4052 f
->open_array_section("stray");
4053 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
4054 if (osdmap
->exists(i
) && !is_touched(i
) && should_dump_leaf(i
))
4055 dump_item(CrushTreeDumper::Item(i
, 0, 0, 0), f
);
// Adds the Parent's generic fields, then the OSD-only fields for leaves.
// The status string `s` is chosen from the up/destroyed checks; its
// declaration and the assigned values are on lines not shown.
4062 void dump_item_fields(const CrushTreeDumper::Item
&qi
, Formatter
*f
) override
{
4063 Parent::dump_item_fields(qi
, f
);
4064 if (!qi
.is_bucket())
4067 if (osdmap
->is_up(qi
.id
)) {
4069 } else if (osdmap
->is_destroyed(qi
.id
)) {
4074 f
->dump_unsigned("exists", (int)osdmap
->exists(qi
.id
));
4075 f
->dump_string("status", s
);
4076 f
->dump_float("reweight", osdmap
->get_weightf(qi
.id
));
4077 f
->dump_float("primary_affinity", osdmap
->get_primary_affinityf(qi
.id
));
// Non-owning pointer to the map being dumped, and the DUMP_* filter bits.
4082 const OSDMap
*osdmap
;
4083 const unsigned filter
;
// Prints the OSD tree, restricted by `filter` (DUMP_* bits) and optionally by
// `bucket`. Two output paths are visible: a structured dump into Formatter
// `f` via OSDTreeFormattingDumper, and a plain-text dump into a TextTable via
// OSDTreePlainDumper.
// NOTE(review): the condition selecting between them, and how `tbl` reaches
// `out`, are on lines not shown.
4086 void OSDMap::print_tree(Formatter
*f
, ostream
*out
, unsigned filter
, string bucket
) const
4089 OSDTreeFormattingDumper(crush
.get(), this, filter
).dump(f
, bucket
);
4093 OSDTreePlainDumper(crush
.get(), this, filter
).dump(&tbl
, bucket
);
// Summarizes the map either into Formatter `f` or as text on `out`.
// Formatter fields: epoch, num_osds, num_up_osds, osd_up_since, num_in_osds,
// osd_in_since, num_remapped_pgs. The two *_since values are the change
// timestamps converted from milliseconds to seconds.
// Text form: "<n> osds: <n> up (since ...), <n> in (since ...)", then the
// epoch, the remapped-pg count when non-zero, and (prefixed by `prefix`) any
// flags remaining after masking out CEPH_OSDMAP_SEMIHIDDEN_FLAGS.
// NOTE(review): the branch choosing between `f` and `out`, and the use of
// `extra`, are on lines not shown.
4098 void OSDMap::print_summary(Formatter
*f
, ostream
& out
,
4099 const string
& prefix
, bool extra
) const
4102 f
->dump_int("epoch", get_epoch());
4103 f
->dump_int("num_osds", get_num_osds());
4104 f
->dump_int("num_up_osds", get_num_up_osds());
4105 f
->dump_int("osd_up_since", last_up_change
.to_msec() / 1000);
4106 f
->dump_int("num_in_osds", get_num_in_osds());
4107 f
->dump_int("osd_in_since", last_in_change
.to_msec() / 1000);
4108 f
->dump_unsigned("num_remapped_pgs", get_num_pg_temp());
// Plain-text path: "since" spans are relative to the current clock.
4110 utime_t now
= ceph_clock_now();
4111 out
<< get_num_osds() << " osds: "
4112 << get_num_up_osds() << " up";
// A default-constructed utime_t means "never recorded"; skip the span then.
4113 if (last_up_change
!= utime_t()) {
4114 out
<< " (since " << utimespan_str(now
- last_up_change
) << ")";
4116 out
<< ", " << get_num_in_osds() << " in";
4117 if (last_in_change
!= utime_t()) {
4118 out
<< " (since " << utimespan_str(now
- last_in_change
) << ")";
4121 out
<< "; epoch: e" << get_epoch();
4122 if (get_num_pg_temp())
4123 out
<< "; " << get_num_pg_temp() << " remapped pgs";
4125 uint64_t important_flags
= flags
& ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS
;
4126 if (important_flags
)
4127 out
<< prefix
<< "flags " << get_flag_string(important_flags
) << "\n";
// Writes a single-line summary to `out` in the form
// "e<epoch>: <n> total, <n> up, <n> in" (no trailing newline in the visible
// code).
4131 void OSDMap::print_oneline_summary(ostream
& out
) const
4133 out
<< "e" << get_epoch() << ": "
4134 << get_num_osds() << " total, "
4135 << get_num_up_osds() << " up, "
4136 << get_num_in_osds() << " in";
// Scans every pool and compares its crush_rule against `rule_id`.
// NOTE(review): the return statements are on lines not shown; presumably
// true on the first match and false otherwise.
4139 bool OSDMap::crush_rule_in_use(int rule_id
) const
4141 for (const auto &pool
: pools
) {
4142 if (pool
.second
.crush_rule
== rule_id
)
// Checks every pool of this map against the candidate crush map `newcrush`.
// For each pool the visible code verifies, in order:
//   1. the pool's crush rule exists in `newcrush`;
//   2. the rule's mask ruleset equals the rule id;
//   3. the rule's mask type equals the pool type;
//   4. the pool size lies within the rule's [min_size, max_size].
// A human-readable reason is streamed into `*ss` for each failed check.
// NOTE(review): the `ss` parameter declaration and the return values are on
// lines not shown.
4148 int OSDMap::validate_crush_rules(CrushWrapper
*newcrush
,
4151 for (auto& i
: pools
) {
4152 auto& pool
= i
.second
;
4153 int ruleno
= pool
.get_crush_rule();
4154 if (!newcrush
->rule_exists(ruleno
)) {
4155 *ss
<< "pool " << i
.first
<< " references crush_rule " << ruleno
4156 << " but it is not present";
4159 if (newcrush
->get_rule_mask_ruleset(ruleno
) != ruleno
) {
4160 *ss
<< "rule " << ruleno
<< " mask ruleset does not match rule id";
4163 if (newcrush
->get_rule_mask_type(ruleno
) != (int)pool
.get_type()) {
4164 *ss
<< "pool " << i
.first
<< " type does not match rule " << ruleno
;
4167 int poolsize
= pool
.get_size();
4168 if (poolsize
< newcrush
->get_rule_mask_min_size(ruleno
) ||
4169 poolsize
> newcrush
->get_rule_mask_max_size(ruleno
)) {
4170 *ss
<< "pool " << i
.first
<< " size " << poolsize
<< " does not"
4171 << " fall within rule " << ruleno
4172 << " min_size " << newcrush
->get_rule_mask_min_size(ruleno
)
4173 << " and max_size " << newcrush
->get_rule_mask_max_size(ruleno
);
// Builds a minimal initial OSDMap.
// Visible steps:
//   - stamp `created`/`modified` with the current time;
//   - scan the config sections named "osd.<N>", parse <N> with strtol and
//     log an error if it exceeds mon_max_osd, then size the map with
//     set_max_osd(maxosd + 1);
//   - build the crush map, either for `nosd` OSDs or from the config
//     (asserting success after the from-conf call);
//   - create an "rbd" replicated pool from the osd_pool_default_* options;
//   - install the default erasure-code profile under the name "default".
// NOTE(review): several parameters, the branches choosing between the two
// crush builders, and the return values are on lines not shown.
4180 int OSDMap::build_simple_optioned(CephContext
*cct
, epoch_t e
, uuid_d
&fsid
,
4181 int nosd
, int pg_bits
, int pgp_bits
,
4184 ldout(cct
, 10) << "build_simple on " << nosd
4185 << " osds" << dendl
;
4188 created
= modified
= ceph_clock_now();
// Discover OSD ids from the configuration's section names.
4195 const auto& conf
= cct
->_conf
;
4196 vector
<string
> sections
;
4197 conf
.get_all_sections(sections
);
4199 for (auto §ion
: sections
) {
// Only sections whose name starts with "osd." are considered.
4200 if (section
.find("osd.") != 0)
// Parse the numeric id that follows the 4-character "osd." prefix.
4203 const char *begin
= section
.c_str() + 4;
4204 char *end
= (char*)begin
;
4205 int o
= strtol(begin
, &end
, 10);
4209 if (o
> cct
->_conf
->mon_max_osd
) {
4210 lderr(cct
) << "[osd." << o
<< "] in config has id > mon_max_osd " << cct
->_conf
->mon_max_osd
<< dendl
;
4218 set_max_osd(maxosd
+ 1);
4225 r
= build_simple_crush_map(cct
, *crush
, nosd
, &ss
);
4227 r
= build_simple_crush_map_from_conf(cct
, *crush
, &ss
);
4228 ceph_assert(r
== 0);
// Base multiplier for pg counts: number of OSD slots, or 1 if there are none.
4230 int poolbase
= get_max_osd() ? get_max_osd() : 1;
4232 const int default_replicated_rule
= crush
->get_osd_pool_default_crush_replicated_ruleset(cct
);
4233 ceph_assert(default_replicated_rule
>= 0);
4236 // pgp_num <= pg_num
4237 if (pgp_bits
> pg_bits
)
4240 vector
<string
> pool_names
;
4241 pool_names
.push_back("rbd");
// Create each default pool with a fresh id and the configured defaults.
4242 for (auto &plname
: pool_names
) {
4243 int64_t pool
= ++pool_max
;
4244 pools
[pool
].type
= pg_pool_t::TYPE_REPLICATED
;
4245 pools
[pool
].flags
= cct
->_conf
->osd_pool_default_flags
;
4246 if (cct
->_conf
->osd_pool_default_flag_hashpspool
)
4247 pools
[pool
].set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
4248 if (cct
->_conf
->osd_pool_default_flag_nodelete
)
4249 pools
[pool
].set_flag(pg_pool_t::FLAG_NODELETE
);
4250 if (cct
->_conf
->osd_pool_default_flag_nopgchange
)
4251 pools
[pool
].set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
4252 if (cct
->_conf
->osd_pool_default_flag_nosizechange
)
4253 pools
[pool
].set_flag(pg_pool_t::FLAG_NOSIZECHANGE
);
4254 pools
[pool
].size
= cct
->_conf
.get_val
<uint64_t>("osd_pool_default_size");
4255 pools
[pool
].min_size
= cct
->_conf
.get_osd_pool_default_min_size(
4257 pools
[pool
].crush_rule
= default_replicated_rule
;
4258 pools
[pool
].object_hash
= CEPH_STR_HASH_RJENKINS
;
// pg/pgp counts (and their targets) are poolbase shifted by the bit counts.
4259 pools
[pool
].set_pg_num(poolbase
<< pg_bits
);
4260 pools
[pool
].set_pgp_num(poolbase
<< pgp_bits
);
4261 pools
[pool
].set_pg_num_target(poolbase
<< pg_bits
);
4262 pools
[pool
].set_pgp_num_target(poolbase
<< pgp_bits
);
4263 pools
[pool
].last_change
= epoch
;
4264 pools
[pool
].application_metadata
.insert(
4265 {pg_pool_t::APPLICATION_NAME_RBD
, {}});
// Use the configured autoscale mode when its name is recognized; the other
// visible assignment sets the mode to OFF.
4266 if (auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
4267 cct
->_conf
.get_val
<string
>("osd_pool_default_pg_autoscale_mode"));
4268 m
!= pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
4269 pools
[pool
].pg_autoscale_mode
= m
;
4271 pools
[pool
].pg_autoscale_mode
= pg_pool_t::pg_autoscale_mode_t::OFF
;
// Keep the id->name and name->id indexes in step.
4273 pool_name
[pool
] = plname
;
4274 name_pool
[plname
] = pool
;
4278 map
<string
,string
> profile_map
;
4279 r
= get_erasure_code_profile_default(cct
, profile_map
, &ss
);
4281 lderr(cct
) << ss
.str() << dendl
;
4284 set_erasure_code_profile("default", profile_map
);
// Fills `profile_map` from the "osd_pool_default_erasure_code_profile"
// configuration value by handing that string to get_json_str_map().
// NOTE(review): the remaining arguments of that call and the return
// statement are on lines not shown; presumably `r` is returned.
4288 int OSDMap::get_erasure_code_profile_default(CephContext
*cct
,
4289 map
<string
,string
> &profile_map
,
4292 int r
= get_json_str_map(cct
->_conf
.get_val
<string
>("osd_pool_default_erasure_code_profile"),
// Registers the default crush type names with ids 0..11, from "osd" (0)
// up to "root" (11).
// NOTE(review): the return statement is not shown; callers in this file
// store the result as `root_type`, so presumably the id of "root" (11) is
// returned -- confirm.
4298 int OSDMap::_build_crush_types(CrushWrapper
& crush
)
4300 crush
.set_type_name(0, "osd");
4301 crush
.set_type_name(1, "host");
4302 crush
.set_type_name(2, "chassis");
4303 crush
.set_type_name(3, "rack");
4304 crush
.set_type_name(4, "row");
4305 crush
.set_type_name(5, "pdu");
4306 crush
.set_type_name(6, "pod");
4307 crush
.set_type_name(7, "room");
4308 crush
.set_type_name(8, "datacenter");
4309 crush
.set_type_name(9, "zone");
4310 crush
.set_type_name(10, "region");
4311 crush
.set_type_name(11, "root");
// Builds a trivial crush map for `nosd` OSDs.
// Visible steps: register the type names, add a root bucket named "default",
// insert osd.0 .. osd.(nosd-1) with weight 1.0 at a fixed location (host
// "localhost", rack "localrack", plus any entries on lines not shown), then
// create the simple rules rooted at "default".
// NOTE(review): the return statement is on a line not shown.
4315 int OSDMap::build_simple_crush_map(CephContext
*cct
, CrushWrapper
& crush
,
4316 int nosd
, ostream
*ss
)
4321 int root_type
= _build_crush_types(crush
);
4323 int r
= crush
.add_bucket(0, 0, CRUSH_HASH_DEFAULT
,
4324 root_type
, 0, NULL
, NULL
, &rootid
);
4325 ceph_assert(r
== 0);
4326 crush
.set_item_name(rootid
, "default");
// Every OSD is placed at the same synthetic location.
4328 map
<string
,string
> loc
{
4329 {"host", "localhost"},
4330 {"rack", "localrack"},
4333 for (int o
=0; o
<nosd
; o
++) {
4334 ldout(cct
, 10) << " adding osd." << o
<< " at " << loc
<< dendl
;
4336 snprintf(name
, sizeof(name
), "osd.%d", o
);
4337 crush
.insert_item(cct
, o
, 1.0, name
, loc
);
4340 build_simple_crush_rules(cct
, crush
, "default", ss
);
// Builds a crush map from the "osd.<N>" sections of the configuration.
// Visible steps: register the type names, add a root bucket named "default",
// then for each "osd.<N>" section parse <N>, read the host/rack/row/room/
// datacenter/root keys (looked up in section "osd" and in the OSD's own
// section), default missing host/rack to "unknownhost"/"unknownrack", and
// insert the OSD with weight 1.0 under root "default". Finally the simple
// rules are created.
// NOTE(review): how row/room and the configured "root" value are used, and
// the return statement, are on lines not shown.
4347 int OSDMap::build_simple_crush_map_from_conf(CephContext
*cct
,
4348 CrushWrapper
& crush
,
4351 const auto& conf
= cct
->_conf
;
4356 int root_type
= _build_crush_types(crush
);
4358 int r
= crush
.add_bucket(0, 0,
4360 root_type
, 0, NULL
, NULL
, &rootid
);
4361 ceph_assert(r
== 0);
4362 crush
.set_item_name(rootid
, "default");
4365 vector
<string
> sections
;
4366 conf
.get_all_sections(sections
);
4368 for (auto §ion
: sections
) {
// Only sections whose name starts with "osd." are considered.
4369 if (section
.find("osd.") != 0)
// Parse the numeric id that follows the 4-character "osd." prefix.
4372 const char *begin
= section
.c_str() + 4;
4373 char *end
= (char*)begin
;
4374 int o
= strtol(begin
, &end
, 10);
4378 string host
, rack
, row
, room
, dc
, pool
;
// Sections searched for each key: the generic "osd" and this OSD's own.
4379 vector
<string
> sectiontmp
;
4380 sectiontmp
.push_back("osd");
4381 sectiontmp
.push_back(section
);
4382 conf
.get_val_from_conf_file(sectiontmp
, "host", host
, false);
4383 conf
.get_val_from_conf_file(sectiontmp
, "rack", rack
, false);
4384 conf
.get_val_from_conf_file(sectiontmp
, "row", row
, false);
4385 conf
.get_val_from_conf_file(sectiontmp
, "room", room
, false);
4386 conf
.get_val_from_conf_file(sectiontmp
, "datacenter", dc
, false);
4387 conf
.get_val_from_conf_file(sectiontmp
, "root", pool
, false);
4389 if (host
.length() == 0)
4390 host
= "unknownhost";
4391 if (rack
.length() == 0)
4392 rack
= "unknownrack";
4394 map
<string
,string
> loc
;
4402 loc
["datacenter"] = dc
;
// The root is always "default" here, regardless of the value read above.
4403 loc
["root"] = "default";
4405 ldout(cct
, 5) << " adding osd." << o
<< " at " << loc
<< dendl
;
4406 crush
.insert_item(cct
, o
, 1.0, section
, loc
);
4409 build_simple_crush_rules(cct
, crush
, "default", ss
);
// Adds the default replicated rule ("replicated_rule", mode "firstn") to
// `crush` at the configured default ruleset id.
// The failure domain is the type name selected by osd_crush_chooseleaf_type.
// NOTE(review): part of the parameter list, the remaining add_simple_rule_at
// arguments and the return statements are on lines not shown.
4417 int OSDMap::build_simple_crush_rules(
4419 CrushWrapper
& crush
,
4423 int crush_rule
= crush
.get_osd_pool_default_crush_replicated_ruleset(cct
);
4424 string failure_domain
=
4425 crush
.get_type_name(cct
->_conf
->osd_crush_chooseleaf_type
);
4428 r
= crush
.add_simple_rule_at(
4429 "replicated_rule", root
, failure_domain
, "",
4430 "firstn", pg_pool_t::TYPE_REPLICATED
,
4434 // do not add an erasure rule by default or else we will implicitly
4435 // require the crush_v2 feature of clients
// Reports how PG placement is distributed over OSDs in this map and,
// through `newmap`, how it would change under another map.
// Visible steps:
//   - for every PG of the selected pools, compute the up set in both maps
//     and count per-OSD placements (base_by_osd / new_by_osd);
//   - count moved placements: position-wise for erasure pools, by set
//     membership for replicated pools; any other pool type aborts;
//   - over OSDs that are both up and in, compute the average, the standard
//     deviations and the min/max loaded OSD;
//   - emit the result into Formatter `f` ("utilization" section) or as text
//     into `ss`.
// NOTE(review): most parameters, the null checks around `newmap`/`f` and the
// return values are on lines not shown.
4439 int OSDMap::summarize_mapping_stats(
4441 const set
<int64_t> *pools
,
4449 for (auto &p
: get_pools())
4453 unsigned total_pg
= 0;
4454 unsigned moved_pg
= 0;
4455 vector
<unsigned> base_by_osd(get_max_osd(), 0);
4456 vector
<unsigned> new_by_osd(get_max_osd(), 0);
4457 for (int64_t pool_id
: ls
) {
4458 const pg_pool_t
*pi
= get_pg_pool(pool_id
);
4459 vector
<int> up
, up2
;
4461 for (unsigned ps
= 0; ps
< pi
->get_pg_num(); ++ps
) {
4462 pg_t
pgid(ps
, pool_id
);
// total_pg counts placements (pool size per PG), not PGs.
4463 total_pg
+= pi
->get_size();
4464 pg_to_up_acting_osds(pgid
, &up
, &up_primary
, nullptr, nullptr);
4465 for (int osd
: up
) {
4466 if (osd
>= 0 && osd
< get_max_osd())
4470 newmap
->pg_to_up_acting_osds(pgid
, &up2
, &up_primary
, nullptr, nullptr);
4471 for (int osd
: up2
) {
4472 if (osd
>= 0 && osd
< get_max_osd())
// Erasure pools: a placement moved if the OSD at the same position differs.
// NOTE(review): up2 is indexed with i bounded only by up.size() in the
// visible code -- confirm both vectors always have equal length here.
4475 if (pi
->type
== pg_pool_t::TYPE_ERASURE
) {
4476 for (unsigned i
=0; i
<up
.size(); ++i
) {
4477 if (up
[i
] != up2
[i
]) {
// Replicated pools: a placement moved if the old OSD is absent from up2.
4481 } else if (pi
->type
== pg_pool_t::TYPE_REPLICATED
) {
4482 for (int osd
: up
) {
4483 if (std::find(up2
.begin(), up2
.end(), osd
) == up2
.end()) {
4488 ceph_abort_msg("unhandled pool type");
4494 unsigned num_up_in
= 0;
4495 for (int osd
= 0; osd
< get_max_osd(); ++osd
) {
4496 if (is_up(osd
) && is_in(osd
))
// NOTE(review): divides by num_up_in; a zero guard may exist on the lines
// not shown between here and the counting loop -- confirm.
4503 float avg_pg
= (float)total_pg
/ (float)num_up_in
;
4504 float base_stddev
= 0, new_stddev
= 0;
4505 int min
= -1, max
= -1;
4506 unsigned min_base_pg
= 0, max_base_pg
= 0;
4507 unsigned min_new_pg
= 0, max_new_pg
= 0;
4508 for (int osd
= 0; osd
< get_max_osd(); ++osd
) {
4509 if (is_up(osd
) && is_in(osd
)) {
4510 float base_diff
= (float)base_by_osd
[osd
] - avg_pg
;
4511 base_stddev
+= base_diff
* base_diff
;
4512 float new_diff
= (float)new_by_osd
[osd
] - avg_pg
;
4513 new_stddev
+= new_diff
* new_diff
;
// min/max are selected on the base counts; the matching new counts are
// recorded for the same OSD.
4514 if (min
< 0 || base_by_osd
[osd
] < min_base_pg
) {
4516 min_base_pg
= base_by_osd
[osd
];
4517 min_new_pg
= new_by_osd
[osd
];
4519 if (max
< 0 || base_by_osd
[osd
] > max_base_pg
) {
4521 max_base_pg
= base_by_osd
[osd
];
4522 max_new_pg
= new_by_osd
[osd
];
4526 base_stddev
= sqrt(base_stddev
/ num_up_in
);
4527 new_stddev
= sqrt(new_stddev
/ num_up_in
);
// Reported below as the "expected baseline" standard deviation.
4529 float edev
= sqrt(avg_pg
* (1.0 - (1.0 / (double)num_up_in
)));
4533 f
->open_object_section("utilization");
4536 f
->dump_unsigned("moved_pgs", moved_pg
);
4537 f
->dump_unsigned("total_pgs", total_pg
);
4541 percent
= (float)moved_pg
* 100.0 / (float)total_pg
;
4542 ss
<< "moved " << moved_pg
<< " / " << total_pg
4543 << " (" << percent
<< "%)\n";
4547 f
->dump_float("avg_pgs", avg_pg
);
4548 f
->dump_float("std_dev", base_stddev
);
4549 f
->dump_float("expected_baseline_std_dev", edev
);
4551 f
->dump_float("new_std_dev", new_stddev
);
4553 ss
<< "avg " << avg_pg
<< "\n";
4554 ss
<< "stddev " << base_stddev
;
4556 ss
<< " -> " << new_stddev
;
4557 ss
<< " (expected baseline " << edev
<< ")\n";
4561 f
->dump_unsigned("min_osd", min
);
4562 f
->dump_unsigned("min_osd_pgs", min_base_pg
);
4564 f
->dump_unsigned("new_min_osd_pgs", min_new_pg
);
4566 ss
<< "min osd." << min
<< " with " << min_base_pg
;
4568 ss
<< " -> " << min_new_pg
;
4569 ss
<< " pgs (" << (float)min_base_pg
/ avg_pg
;
4571 ss
<< " -> " << (float)min_new_pg
/ avg_pg
;
4577 f
->dump_unsigned("max_osd", max
);
4578 f
->dump_unsigned("max_osd_pgs", max_base_pg
);
4580 f
->dump_unsigned("new_max_osd_pgs", max_new_pg
);
4582 ss
<< "max osd." << max
<< " with " << max_base_pg
;
4584 ss
<< " -> " << max_new_pg
;
4585 ss
<< " pgs (" << (float)max_base_pg
/ avg_pg
;
4587 ss
<< " -> " << (float)max_new_pg
/ avg_pg
;
// Tries to compute an alternative mapping for `pg` that moves it off
// overfull OSDs.
// Visible steps: look up the pg's pool and its crush rule, check whether the
// current mapping (`*orig`) contains any OSD from `overfull`, then ask crush
// (try_remap_rule) for a remapped result using the overfull/underfull sets.
// NOTE(review): some parameters, the early-exit conditions and the return
// values are on lines not shown.
4598 bool OSDMap::try_pg_upmap(
4600 pg_t pg
, ///< pg to potentially remap
4601 const set
<int>& overfull
, ///< osds we'd want to evacuate
4602 const vector
<int>& underfull
, ///< osds to move to, in order of preference
4603 const vector
<int>& more_underfull
, ///< more osds only slightly underfull
4605 vector
<int> *out
) ///< resulting alternative mapping
4607 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
4610 int rule
= crush
->find_rule(pool
->get_crush_rule(), pool
->get_type(),
4615 // make sure there is something there to remap
4617 for (auto osd
: *orig
) {
4618 if (overfull
.count(osd
)) {
4627 int r
= crush
->try_remap_rule(
4631 overfull
, underfull
,
// Searches for pg_upmap_items changes that even out the number of PGs per
// OSD, recording them in `pending_inc`.
// Outline of the visible code:
//   1. Work on `tmp`, a copy of this map. For the selected pools, collect
//      the PGs on each OSD and each OSD's adjusted weight (reweight times
//      the crush rule weight).
//   2. Derive each OSD's target PG count from its weight and compute its
//      deviation; stop early if the largest deviation is already within
//      `max_deviation`.
//   3. Repeatedly classify OSDs as overfull/underfull, then look for a
//      change: first by dropping or adding remap pairs for overfull OSDs,
//      then by dropping pairs that moved PGs away from underfull OSDs.
//   4. Re-evaluate the deviation with the candidate change; keep it only if
//      the squared-deviation sum decreases, otherwise skip those PGs and
//      retry (subject to the aggressive / local-fallback options).
// NOTE(review): several parameters, the loop header, many branches and the
// return value are on lines not shown in this excerpt.
4642 int OSDMap::calc_pg_upmaps(
4644 uint32_t max_deviation
,
4646 const set
<int64_t>& only_pools
,
4647 OSDMap::Incremental
*pending_inc
)
4649 ldout(cct
, 10) << __func__
<< " pools " << only_pools
<< dendl
;
4651 // Can't be less than 1 pg
4652 if (max_deviation
< 1)
4654 tmp
.deepish_copy_from(*this);
4655 int num_changed
= 0;
4656 map
<int,set
<pg_t
>> pgs_by_osd
;
4658 float osd_weight_total
= 0;
4659 map
<int,float> osd_weight
;
// Phase 1: gather per-OSD PG sets and weights over the selected pools.
4660 for (auto& i
: pools
) {
// An empty only_pools means "all pools".
4661 if (!only_pools
.empty() && !only_pools
.count(i
.first
))
4663 for (unsigned ps
= 0; ps
< i
.second
.get_pg_num(); ++ps
) {
4664 pg_t
pg(ps
, i
.first
);
4666 tmp
.pg_to_up_acting_osds(pg
, &up
, nullptr, nullptr, nullptr);
4667 ldout(cct
, 20) << __func__
<< " " << pg
<< " up " << up
<< dendl
;
4668 for (auto osd
: up
) {
4669 if (osd
!= CRUSH_ITEM_NONE
)
4670 pgs_by_osd
[osd
].insert(pg
);
// total_pgs counts placements: pool size times pg count.
4673 total_pgs
+= i
.second
.get_size() * i
.second
.get_pg_num();
4675 map
<int,float> pmap
;
4676 int ruleno
= tmp
.crush
->find_rule(i
.second
.get_crush_rule(),
4677 i
.second
.get_type(),
4678 i
.second
.get_size());
4679 tmp
.crush
->get_rule_weight_osd_map(ruleno
, &pmap
);
4680 ldout(cct
,20) << __func__
<< " pool " << i
.first
4681 << " ruleno " << ruleno
4682 << " weight-map " << pmap
4684 for (auto p
: pmap
) {
// Scale the rule weight by the OSD's reweight value.
4685 auto adjusted_weight
= tmp
.get_weightf(p
.first
) * p
.second
;
4686 if (adjusted_weight
== 0) {
4689 osd_weight
[p
.first
] += adjusted_weight
;
4690 osd_weight_total
+= adjusted_weight
;
// Make sure every weighted OSD has an entry in pgs_by_osd, even if empty.
4693 for (auto& i
: osd_weight
) {
4695 auto p
= pgs_by_osd
.find(i
.first
);
4696 if (p
!= pgs_by_osd
.end())
4697 pgs
= p
->second
.size();
4699 pgs_by_osd
.emplace(i
.first
, set
<pg_t
>());
4700 ldout(cct
, 20) << " osd." << i
.first
<< " weight " << i
.second
4701 << " pgs " << pgs
<< dendl
;
4703 if (osd_weight_total
== 0) {
4704 lderr(cct
) << __func__
<< " abort due to osd_weight_total == 0" << dendl
;
4707 float pgs_per_weight
= total_pgs
/ osd_weight_total
;
4708 ldout(cct
, 10) << " osd_weight_total " << osd_weight_total
<< dendl
;
4709 ldout(cct
, 10) << " pgs_per_weight " << pgs_per_weight
<< dendl
;
4712 lderr(cct
) << __func__
<< " abort due to max <= 0" << dendl
;
// Phase 2: per-OSD deviation from the weight-proportional target.
// `stddev` accumulates the sum of squared deviations; no square root is
// taken in the visible code.
4716 map
<int,float> osd_deviation
; // osd, deviation(pgs)
4717 multimap
<float,int> deviation_osd
; // deviation(pgs), osd
4718 float cur_max_deviation
= 0;
4719 for (auto& i
: pgs_by_osd
) {
4720 // make sure osd is still there (belongs to this crush-tree)
4721 ceph_assert(osd_weight
.count(i
.first
));
4722 float target
= osd_weight
[i
.first
] * pgs_per_weight
;
4723 float deviation
= (float)i
.second
.size() - target
;
4724 ldout(cct
, 20) << " osd." << i
.first
4725 << "\tpgs " << i
.second
.size()
4726 << "\ttarget " << target
4727 << "\tdeviation " << deviation
4729 osd_deviation
[i
.first
] = deviation
;
4730 deviation_osd
.insert(make_pair(deviation
, i
.first
));
4731 stddev
+= deviation
* deviation
;
4732 if (fabsf(deviation
) > cur_max_deviation
)
4733 cur_max_deviation
= fabsf(deviation
);
4735 ldout(cct
, 20) << " stdev " << stddev
<< " max_deviation " << cur_max_deviation
<< dendl
;
4736 if (cur_max_deviation
<= max_deviation
) {
4737 ldout(cct
, 10) << __func__
<< " distribution is almost perfect"
4741 bool skip_overfull
= false;
4743 cct
->_conf
.get_val
<bool>("osd_calc_pg_upmaps_aggressively");
4744 auto local_fallback_retries
=
4745 cct
->_conf
.get_val
<uint64_t>("osd_calc_pg_upmaps_local_fallback_retries");
// Phase 3: main optimization loop (loop header is on a line not shown).
4747 ldout(cct
, 30) << "Top of loop #" << max
+1 << dendl
;
4748 // build overfull and underfull
4750 set
<int> more_overfull
;
4751 bool using_more_overfull
= false;
4752 vector
<int> underfull
;
4753 vector
<int> more_underfull
;
// Walk from the highest deviation downwards to classify overfull OSDs.
4754 for (auto i
= deviation_osd
.rbegin(); i
!= deviation_osd
.rend(); i
++) {
4755 ldout(cct
, 30) << " check " << i
->first
<< " <= " << max_deviation
<< dendl
;
4758 if (i
->first
> max_deviation
) {
4759 ldout(cct
, 30) << " add overfull osd." << i
->second
<< dendl
;
4760 overfull
.insert(i
->second
);
4762 more_overfull
.insert(i
->second
);
// Walk from the lowest deviation upwards to classify underfull OSDs.
4766 for (auto i
= deviation_osd
.begin(); i
!= deviation_osd
.end(); i
++) {
4767 ldout(cct
, 30) << " check " << i
->first
<< " >= " << -(int)max_deviation
<< dendl
;
4770 if (i
->first
< -(int)max_deviation
) {
4771 ldout(cct
, 30) << " add underfull osd." << i
->second
<< dendl
;
4772 underfull
.push_back(i
->second
);
4774 more_underfull
.push_back(i
->second
);
4777 if (underfull
.empty() && overfull
.empty()) {
4778 ldout(cct
, 20) << __func__
<< " failed to build overfull and underfull" << dendl
;
// With no strictly overfull OSD but some underfull ones, fall back to the
// looser more_overfull set.
4781 if (overfull
.empty() && !underfull
.empty()) {
4782 ldout(cct
, 20) << __func__
<< " Using more_overfull since we still have underfull" << dendl
;
4783 overfull
= more_overfull
;
4784 using_more_overfull
= true;
4787 ldout(cct
, 10) << " overfull " << overfull
4788 << " underfull " << underfull
4791 uint64_t local_fallback_retried
= 0;
// Candidate changes are staged in to_upmap/to_unmap and applied to a copy
// of pgs_by_osd so they can be evaluated before being committed.
4796 map
<pg_t
, mempool::osdmap::vector
<pair
<int32_t,int32_t>>> to_upmap
;
4797 auto temp_pgs_by_osd
= pgs_by_osd
;
4798 // always start with fullest, break if we find any changes to make
4799 for (auto p
= deviation_osd
.rbegin(); p
!= deviation_osd
.rend(); ++p
) {
4800 if (skip_overfull
&& !underfull
.empty()) {
4801 ldout(cct
, 10) << " skipping overfull " << dendl
;
4802 break; // fall through to check underfull
4804 int osd
= p
->second
;
4805 float deviation
= p
->first
;
4806 if (deviation
< 0) {
4807 ldout(cct
, 10) << " hitting underfull osds now"
4808 << " when trying to remap overfull osds"
4812 float target
= osd_weight
[osd
] * pgs_per_weight
;
4813 ldout(cct
, 10) << " Overfull search osd." << osd
4814 << " target " << target
4815 << " deviation " << deviation
4817 ceph_assert(target
> 0);
4818 if (!using_more_overfull
&& deviation
<= max_deviation
) {
4819 ldout(cct
, 10) << " osd." << osd
4820 << " target " << target
4821 << " deviation " << deviation
4822 << " < max deviation " << max_deviation
// Collect this OSD's PGs, leaving out those already marked to skip.
4828 pgs
.reserve(pgs_by_osd
[osd
].size());
4829 for (auto& pg
: pgs_by_osd
[osd
]) {
4830 if (to_skip
.count(pg
))
4835 // shuffle PG list so they all get equal (in)attention
4836 std::random_device rd
;
4837 std::default_random_engine rng
{rd()};
4838 std::shuffle(pgs
.begin(), pgs
.end(), rng
);
4840 // look for remaps we can un-remap
4841 for (auto pg
: pgs
) {
4842 auto p
= tmp
.pg_upmap_items
.find(pg
);
4843 if (p
== tmp
.pg_upmap_items
.end())
4845 mempool::osdmap::vector
<pair
<int32_t,int32_t>> new_upmap_items
;
4846 for (auto q
: p
->second
) {
// A pair whose destination is this overfull OSD is a candidate to drop.
4847 if (q
.second
== osd
) {
4848 ldout(cct
, 10) << " will try dropping existing"
4849 << " remapping pair "
4850 << q
.first
<< " -> " << q
.second
4851 << " which remapped " << pg
4852 << " into overfull osd." << osd
4854 temp_pgs_by_osd
[q
.second
].erase(pg
);
4855 temp_pgs_by_osd
[q
.first
].insert(pg
);
4857 new_upmap_items
.push_back(q
);
4860 if (new_upmap_items
.empty()) {
4862 ldout(cct
, 10) << " existing pg_upmap_items " << p
->second
4863 << " remapped " << pg
<< " into overfull osd." << osd
4864 << ", will try cancelling it entirely"
4866 to_unmap
.insert(pg
);
4868 } else if (new_upmap_items
.size() != p
->second
.size()) {
4869 // drop single remapping pair, updating
4870 ceph_assert(new_upmap_items
.size() < p
->second
.size());
4871 ldout(cct
, 10) << " existing pg_upmap_items " << p
->second
4872 << " remapped " << pg
<< " into overfull osd." << osd
4873 << ", new_pg_upmap_items now " << new_upmap_items
4875 to_upmap
[pg
] = new_upmap_items
;
// Next: try adding new remap pairs for this OSD's PGs.
4881 for (auto pg
: pgs
) {
4882 auto temp_it
= tmp
.pg_upmap
.find(pg
);
4883 if (temp_it
!= tmp
.pg_upmap
.end()) {
4884 // leave pg_upmap alone
4885 // it must be specified by admin since balancer does not
4886 // support pg_upmap yet
4887 ldout(cct
, 10) << " " << pg
<< " already has pg_upmap "
4888 << temp_it
->second
<< ", skipping"
4892 auto pg_pool_size
= tmp
.get_pg_pool_size(pg
);
4893 mempool::osdmap::vector
<pair
<int32_t,int32_t>> new_upmap_items
;
4895 auto it
= tmp
.pg_upmap_items
.find(pg
);
4896 if (it
!= tmp
.pg_upmap_items
.end() &&
4897 it
->second
.size() >= (size_t)pg_pool_size
) {
4898 ldout(cct
, 10) << " " << pg
<< " already has full-size pg_upmap_items "
4899 << it
->second
<< ", skipping"
4902 } else if (it
!= tmp
.pg_upmap_items
.end()) {
4903 ldout(cct
, 10) << " " << pg
<< " already has pg_upmap_items "
4906 new_upmap_items
= it
->second
;
4907 // build existing too (for dedup)
4908 for (auto i
: it
->second
) {
4909 existing
.insert(i
.first
);
4910 existing
.insert(i
.second
);
4913 // to see if we can append more remapping pairs
4915 ldout(cct
, 10) << " trying " << pg
<< dendl
;
4916 vector
<int> raw
, orig
, out
;
4917 tmp
.pg_to_raw_upmap(pg
, &raw
, &orig
); // including existing upmaps too
4918 if (!try_pg_upmap(cct
, pg
, overfull
, underfull
, more_underfull
, &orig
, &out
)) {
4921 ldout(cct
, 10) << " " << pg
<< " " << orig
<< " -> " << out
<< dendl
;
4922 if (orig
.size() != out
.size()) {
4925 ceph_assert(orig
!= out
);
// Among the changed positions, track the source OSD with the largest
// deviation (the selected position is recorded on lines not shown).
4928 for (unsigned i
= 0; i
< out
.size(); ++i
) {
4929 if (orig
[i
] == out
[i
])
4930 continue; // skip invalid remappings
4931 if (existing
.count(orig
[i
]) || existing
.count(out
[i
]))
4932 continue; // we want new remappings only!
4933 if (osd_deviation
[orig
[i
]] > max_dev
) {
4934 max_dev
= osd_deviation
[orig
[i
]];
4936 ldout(cct
, 30) << "Max osd." << orig
[i
] << " pos " << i
<< " dev " << osd_deviation
[orig
[i
]] << dendl
;
4941 ldout(cct
, 10) << " will try adding new remapping pair "
4942 << orig
[i
] << " -> " << out
[i
] << " for " << pg
4943 << (orig
[i
] != osd
? " NOT selected osd" : "")
4945 existing
.insert(orig
[i
]);
4946 existing
.insert(out
[i
]);
4947 temp_pgs_by_osd
[orig
[i
]].erase(pg
);
4948 temp_pgs_by_osd
[out
[i
]].insert(pg
);
4949 ceph_assert(new_upmap_items
.size() < (size_t)pg_pool_size
);
4950 new_upmap_items
.push_back(make_pair(orig
[i
], out
[i
]));
4951 // append new remapping pairs slowly
4952 // This way we can make sure that each tiny change will
4953 // definitely make distribution of PGs converging to
4954 // the perfect status.
4955 to_upmap
[pg
] = new_upmap_items
;
// Reaching here means the overfull pass staged nothing.
4961 ceph_assert(!(to_unmap
.size() || to_upmap
.size()));
4962 ldout(cct
, 10) << " failed to find any changes for overfull osds"
// Underfull pass: iterate OSDs from lowest deviation, restricted to those
// listed in `underfull`.
4964 for (auto& p
: deviation_osd
) {
4965 if (std::find(underfull
.begin(), underfull
.end(), p
.second
) ==
4969 float deviation
= p
.first
;
4970 float target
= osd_weight
[osd
] * pgs_per_weight
;
4971 ceph_assert(target
> 0);
4972 if (fabsf(deviation
) < max_deviation
) {
4973 // respect max_deviation too
4974 ldout(cct
, 10) << " osd." << osd
4975 << " target " << target
4976 << " deviation " << deviation
4977 << " -> absolute " << fabsf(deviation
)
4978 << " < max " << max_deviation
4982 // look for remaps we can un-remap
4984 mempool::osdmap::vector
<pair
<int32_t,int32_t>>>> candidates
;
4985 candidates
.reserve(tmp
.pg_upmap_items
.size());
4986 for (auto& i
: tmp
.pg_upmap_items
) {
4987 if (to_skip
.count(i
.first
))
4989 if (!only_pools
.empty() && !only_pools
.count(i
.first
.pool()))
4991 candidates
.push_back(make_pair(i
.first
, i
.second
));
4994 // shuffle candidates so they all get equal (in)attention
4995 std::random_device rd
;
4996 std::default_random_engine rng
{rd()};
4997 std::shuffle(candidates
.begin(), candidates
.end(), rng
);
4999 for (auto& i
: candidates
) {
5001 mempool::osdmap::vector
<pair
<int32_t,int32_t>> new_upmap_items
;
5002 for (auto& j
: i
.second
) {
// A pair whose source is this underfull OSD is a candidate to drop.
5003 if (j
.first
== osd
) {
5004 ldout(cct
, 10) << " will try dropping existing"
5005 << " remapping pair "
5006 << j
.first
<< " -> " << j
.second
5007 << " which remapped " << pg
5008 << " out from underfull osd." << osd
5010 temp_pgs_by_osd
[j
.second
].erase(pg
);
5011 temp_pgs_by_osd
[j
.first
].insert(pg
);
5013 new_upmap_items
.push_back(j
);
5016 if (new_upmap_items
.empty()) {
5018 ldout(cct
, 10) << " existing pg_upmap_items " << i
.second
5019 << " remapped " << pg
5020 << " out from underfull osd." << osd
5021 << ", will try cancelling it entirely"
5023 to_unmap
.insert(pg
);
5025 } else if (new_upmap_items
.size() != i
.second
.size()) {
5026 // drop single remapping pair, updating
5027 ceph_assert(new_upmap_items
.size() < i
.second
.size());
5028 ldout(cct
, 10) << " existing pg_upmap_items " << i
.second
5029 << " remapped " << pg
5030 << " out from underfull osd." << osd
5031 << ", new_pg_upmap_items now " << new_upmap_items
5033 to_upmap
[pg
] = new_upmap_items
;
5039 ceph_assert(!(to_unmap
.size() || to_upmap
.size()));
5040 ldout(cct
, 10) << " failed to find any changes for underfull osds"
5043 ldout(cct
, 10) << " break due to aggressive mode not enabled" << dendl
;
5045 } else if (!skip_overfull
) {
5046 // safe to quit because below here we know
5047 // we've done checking both overfull and underfull osds..
5048 ldout(cct
, 10) << " break due to not being able to find any"
5049 << " further optimizations"
5053 // restart with fullest and do exhaustive searching
5054 skip_overfull
= false;
// Phase 4: evaluate the staged change on the temporary PG distribution.
5059 // test change, apply if change is good
5060 ceph_assert(to_unmap
.size() || to_upmap
.size());
5061 float new_stddev
= 0;
5062 map
<int,float> temp_osd_deviation
;
5063 multimap
<float,int> temp_deviation_osd
;
5064 float cur_max_deviation
= 0;
5065 for (auto& i
: temp_pgs_by_osd
) {
5066 // make sure osd is still there (belongs to this crush-tree)
5067 ceph_assert(osd_weight
.count(i
.first
));
5068 float target
= osd_weight
[i
.first
] * pgs_per_weight
;
5069 float deviation
= (float)i
.second
.size() - target
;
5070 ldout(cct
, 20) << " osd." << i
.first
5071 << "\tpgs " << i
.second
.size()
5072 << "\ttarget " << target
5073 << "\tdeviation " << deviation
5075 temp_osd_deviation
[i
.first
] = deviation
;
5076 temp_deviation_osd
.insert(make_pair(deviation
, i
.first
));
5077 new_stddev
+= deviation
* deviation
;
5078 if (fabsf(deviation
) > cur_max_deviation
)
5079 cur_max_deviation
= fabsf(deviation
);
5081 ldout(cct
, 10) << " stddev " << stddev
<< " -> " << new_stddev
<< dendl
;
// No improvement: either stop (non-aggressive) or retry while skipping the
// PGs that were just tried.
5082 if (new_stddev
>= stddev
) {
5084 ldout(cct
, 10) << " break because stddev is not decreasing"
5085 << " and aggressive mode is not enabled"
5089 local_fallback_retried
++;
5090 if (local_fallback_retried
>= local_fallback_retries
) {
5091 // does not make progress
5092 // flip *skip_overfull* so both overfull and underfull
5093 // get equal (in)attention
5094 skip_overfull
= !skip_overfull
;
5095 ldout(cct
, 10) << " hit local_fallback_retries "
5096 << local_fallback_retries
5100 for (auto& i
: to_unmap
)
5102 for (auto& i
: to_upmap
)
5103 to_skip
.insert(i
.first
);
5104 ldout(cct
, 20) << " local_fallback_retried " << local_fallback_retried
5105 << " to_skip " << to_skip
// Improvement: commit the staged state and record it in tmp and pending_inc.
5111 ceph_assert(new_stddev
< stddev
);
5112 stddev
= new_stddev
;
5113 pgs_by_osd
= temp_pgs_by_osd
;
5114 osd_deviation
= temp_osd_deviation
;
5115 deviation_osd
= temp_deviation_osd
;
5116 for (auto& i
: to_unmap
) {
5117 ldout(cct
, 10) << " unmap pg " << i
<< dendl
;
5118 ceph_assert(tmp
.pg_upmap_items
.count(i
));
5119 tmp
.pg_upmap_items
.erase(i
);
5120 pending_inc
->old_pg_upmap_items
.insert(i
);
5123 for (auto& i
: to_upmap
) {
5124 ldout(cct
, 10) << " upmap pg " << i
.first
5125 << " new pg_upmap_items " << i
.second
5127 tmp
.pg_upmap_items
[i
.first
] = i
.second
;
5128 pending_inc
->new_pg_upmap_items
[i
.first
] = i
.second
;
5131 ldout(cct
, 20) << " stdev " << stddev
<< " max_deviation " << cur_max_deviation
<< dendl
;
5132 if (cur_max_deviation
<= max_deviation
) {
5133 ldout(cct
, 10) << __func__
<< " Optimization plan is almost perfect"
5138 ldout(cct
, 10) << " num_changed = " << num_changed
<< dendl
;
// Collects into `*osds` the leaf items under the crush node called `name`
// by delegating to CrushWrapper::get_leaves(), whose result code is returned
// unchanged.
5142 int OSDMap::get_osds_by_bucket_name(const string
&name
, set
<int> *osds
) const
5144 return crush
->get_leaves(name
, osds
);
// Collects into `*pool_ids` the pools whose crush rules might reference the
// given osd.
// Visible steps: ask crush for the rules touching the osd, log and assert on
// failure, keep only the rules that some pool actually uses
// (crush_rule_in_use), then add the pools of each remaining rule.
// `pool_ids` must be non-null (asserted).
// NOTE(review): the `osd` parameter declaration and the `rules` container
// are on lines not shown.
5147 // get pools whose crush rules might reference the given osd
5148 void OSDMap::get_pool_ids_by_osd(CephContext
*cct
,
5150 set
<int64_t> *pool_ids
) const
5152 ceph_assert(pool_ids
);
5154 int r
= crush
->get_rules_by_osd(osd
, &raw_rules
);
5156 lderr(cct
) << __func__
<< " get_rules_by_osd failed: " << cpp_strerror(r
)
5158 ceph_assert(r
>= 0);
5161 for (auto &i
: raw_rules
) {
5162 // exclude any dead rule
5163 if (crush_rule_in_use(i
)) {
5167 for (auto &r
: rules
) {
5168 get_pool_ids_by_rule(r
, pool_ids
);
5172 template <typename F
>
5173 class OSDUtilizationDumper
: public CrushTreeDumper::Dumper
<F
> {
5175 typedef CrushTreeDumper::Dumper
<F
> Parent
;
5177 OSDUtilizationDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap_
,
5178 const PGMap
& pgmap_
, bool tree_
,
5179 const string
& filter
) :
5180 Parent(crush
, osdmap_
->get_pool_names()),
5188 if (osdmap
->crush
->name_exists(filter
)) {
5189 // filter by crush node
5190 auto item_id
= osdmap
->crush
->get_item_id(filter
);
5191 allowed
.insert(item_id
);
5192 osdmap
->crush
->get_all_children(item_id
, &allowed
);
5193 } else if (osdmap
->crush
->class_exists(filter
)) {
5194 // filter by device class
5195 class_id
= osdmap
->crush
->get_class_id(filter
);
5196 } else if (auto pool_id
= osdmap
->lookup_pg_pool_name(filter
);
5199 auto crush_rule
= osdmap
->get_pool_crush_rule(pool_id
);
5201 osdmap
->crush
->find_takes_by_rule(crush_rule
, &roots
);
5203 for (auto r
: roots
)
5204 osdmap
->crush
->get_all_children(r
, &allowed
);
5206 average_util
= average_utilization();
5211 bool should_dump(int id
) const {
5212 if (!allowed
.empty() && !allowed
.count(id
)) // filter by name
5214 if (id
>= 0 && class_id
>= 0) {
5215 auto item_class_id
= osdmap
->crush
->get_item_class_id(id
);
5216 if (item_class_id
< 0 || // not bound to a class yet
5217 item_class_id
!= class_id
) // or already bound to a different class
5223 set
<int> get_dumped_osds() {
5224 if (allowed
.empty() && class_id
< 0) {
5231 void dump_stray(F
*f
) {
5232 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
5233 if (osdmap
->exists(i
) && !this->is_touched(i
))
5234 dump_item(CrushTreeDumper::Item(i
, 0, 0, 0), f
);
5238 void dump_item(const CrushTreeDumper::Item
&qi
, F
*f
) override
{
5239 if (!tree
&& (qi
.is_bucket() || dumped_osds
.count(qi
.id
)))
5241 if (!should_dump(qi
.id
))
5244 if (!qi
.is_bucket())
5245 dumped_osds
.insert(qi
.id
);
5246 float reweight
= qi
.is_bucket() ? -1 : osdmap
->get_weightf(qi
.id
);
5247 int64_t kb
= 0, kb_used
= 0, kb_used_data
= 0, kb_used_omap
= 0,
5248 kb_used_meta
= 0, kb_avail
= 0;
5250 if (get_bucket_utilization(qi
.id
, &kb
, &kb_used
, &kb_used_data
,
5251 &kb_used_omap
, &kb_used_meta
, &kb_avail
))
5253 util
= 100.0 * (double)kb_used
/ (double)kb
;
5257 var
= util
/ average_util
;
5259 size_t num_pgs
= qi
.is_bucket() ? 0 : pgmap
.get_num_pg_by_osd(qi
.id
);
5261 dump_item(qi
, reweight
, kb
, kb_used
,
5262 kb_used_data
, kb_used_omap
, kb_used_meta
,
5263 kb_avail
, util
, var
, num_pgs
, f
);
5265 if (!qi
.is_bucket() && reweight
> 0) {
5266 if (min_var
< 0 || var
< min_var
)
5268 if (max_var
< 0 || var
> max_var
)
5271 double dev
= util
- average_util
;
5273 stddev
+= reweight
* dev
;
5278 virtual void dump_item(const CrushTreeDumper::Item
&qi
,
5282 int64_t kb_used_data
,
5283 int64_t kb_used_omap
,
5284 int64_t kb_used_meta
,
5288 const size_t num_pgs
,
5292 return sum
> 0 ? sqrt(stddev
/ sum
) : 0;
5295 double average_utilization() {
5296 int64_t kb
= 0, kb_used
= 0;
5297 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
5298 if (!osdmap
->exists(i
) ||
5299 osdmap
->get_weight(i
) == 0 ||
5302 int64_t kb_i
, kb_used_i
, kb_used_data_i
, kb_used_omap_i
, kb_used_meta_i
,
5304 if (get_osd_utilization(i
, &kb_i
, &kb_used_i
, &kb_used_data_i
,
5305 &kb_used_omap_i
, &kb_used_meta_i
, &kb_avail_i
)) {
5307 kb_used
+= kb_used_i
;
5310 return kb
> 0 ? 100.0 * (double)kb_used
/ (double)kb
: 0;
5313 bool get_osd_utilization(int id
, int64_t* kb
, int64_t* kb_used
,
5314 int64_t* kb_used_data
,
5315 int64_t* kb_used_omap
,
5316 int64_t* kb_used_meta
,
5317 int64_t* kb_avail
) const {
5318 const osd_stat_t
*p
= pgmap
.get_osd_stat(id
);
5319 if (!p
) return false;
5320 *kb
= p
->statfs
.kb();
5321 *kb_used
= p
->statfs
.kb_used_raw();
5322 *kb_used_data
= p
->statfs
.kb_used_data();
5323 *kb_used_omap
= p
->statfs
.kb_used_omap();
5324 *kb_used_meta
= p
->statfs
.kb_used_internal_metadata();
5325 *kb_avail
= p
->statfs
.kb_avail();
5330 bool get_bucket_utilization(int id
, int64_t* kb
, int64_t* kb_used
,
5331 int64_t* kb_used_data
,
5332 int64_t* kb_used_omap
,
5333 int64_t* kb_used_meta
,
5334 int64_t* kb_avail
) const {
5336 if (osdmap
->is_out(id
) || !should_dump(id
)) {
5345 return get_osd_utilization(id
, kb
, kb_used
, kb_used_data
,
5346 kb_used_omap
, kb_used_meta
, kb_avail
);
5356 for (int k
= osdmap
->crush
->get_bucket_size(id
) - 1; k
>= 0; k
--) {
5357 int item
= osdmap
->crush
->get_bucket_item(id
, k
);
5358 int64_t kb_i
= 0, kb_used_i
= 0, kb_used_data_i
= 0,
5359 kb_used_omap_i
= 0, kb_used_meta_i
= 0, kb_avail_i
= 0;
5360 if (!get_bucket_utilization(item
, &kb_i
, &kb_used_i
,
5361 &kb_used_data_i
, &kb_used_omap_i
,
5362 &kb_used_meta_i
, &kb_avail_i
))
5365 *kb_used
+= kb_used_i
;
5366 *kb_used_data
+= kb_used_data_i
;
5367 *kb_used_omap
+= kb_used_omap_i
;
5368 *kb_used_meta
+= kb_used_meta_i
;
5369 *kb_avail
+= kb_avail_i
;
5375 const OSDMap
*osdmap
;
5378 double average_util
;
5385 set
<int> dumped_osds
;
5389 class OSDUtilizationPlainDumper
: public OSDUtilizationDumper
<TextTable
> {
5391 typedef OSDUtilizationDumper
<TextTable
> Parent
;
5393 OSDUtilizationPlainDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap
,
5394 const PGMap
& pgmap
, bool tree
,
5395 const string
& filter
) :
5396 Parent(crush
, osdmap
, pgmap
, tree
, filter
) {}
5398 void dump(TextTable
*tbl
) {
5399 tbl
->define_column("ID", TextTable::LEFT
, TextTable::RIGHT
);
5400 tbl
->define_column("CLASS", TextTable::LEFT
, TextTable::RIGHT
);
5401 tbl
->define_column("WEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
5402 tbl
->define_column("REWEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
5403 tbl
->define_column("SIZE", TextTable::LEFT
, TextTable::RIGHT
);
5404 tbl
->define_column("RAW USE", TextTable::LEFT
, TextTable::RIGHT
);
5405 tbl
->define_column("DATA", TextTable::LEFT
, TextTable::RIGHT
);
5406 tbl
->define_column("OMAP", TextTable::LEFT
, TextTable::RIGHT
);
5407 tbl
->define_column("META", TextTable::LEFT
, TextTable::RIGHT
);
5408 tbl
->define_column("AVAIL", TextTable::LEFT
, TextTable::RIGHT
);
5409 tbl
->define_column("%USE", TextTable::LEFT
, TextTable::RIGHT
);
5410 tbl
->define_column("VAR", TextTable::LEFT
, TextTable::RIGHT
);
5411 tbl
->define_column("PGS", TextTable::LEFT
, TextTable::RIGHT
);
5412 tbl
->define_column("STATUS", TextTable::LEFT
, TextTable::RIGHT
);
5414 tbl
->define_column("TYPE NAME", TextTable::LEFT
, TextTable::LEFT
);
5420 auto sum
= pgmap
.get_osd_sum(get_dumped_osds());
5424 << byte_u_t(sum
.statfs
.total
)
5425 << byte_u_t(sum
.statfs
.get_used_raw())
5426 << byte_u_t(sum
.statfs
.allocated
)
5427 << byte_u_t(sum
.statfs
.omap_allocated
)
5428 << byte_u_t(sum
.statfs
.internal_metadata
)
5429 << byte_u_t(sum
.statfs
.available
)
5430 << lowprecision_t(average_util
)
5432 << TextTable::endrow
;
5436 struct lowprecision_t
{
5438 explicit lowprecision_t(float _v
) : v(_v
) {}
5440 friend std::ostream
&operator<<(ostream
& out
, const lowprecision_t
& v
);
5442 using OSDUtilizationDumper
<TextTable
>::dump_item
;
5443 void dump_item(const CrushTreeDumper::Item
&qi
,
5447 int64_t kb_used_data
,
5448 int64_t kb_used_omap
,
5449 int64_t kb_used_meta
,
5453 const size_t num_pgs
,
5454 TextTable
*tbl
) override
{
5455 const char *c
= crush
->get_item_class(qi
.id
);
5460 << weightf_t(qi
.weight
)
5461 << weightf_t(reweight
)
5462 << byte_u_t(kb
<< 10)
5463 << byte_u_t(kb_used
<< 10)
5464 << byte_u_t(kb_used_data
<< 10)
5465 << byte_u_t(kb_used_omap
<< 10)
5466 << byte_u_t(kb_used_meta
<< 10)
5467 << byte_u_t(kb_avail
<< 10)
5468 << lowprecision_t(util
)
5469 << lowprecision_t(var
);
5471 if (qi
.is_bucket()) {
5476 if (osdmap
->is_up(qi
.id
)) {
5478 } else if (osdmap
->is_destroyed(qi
.id
)) {
5479 *tbl
<< "destroyed";
5487 for (int k
= 0; k
< qi
.depth
; k
++)
5489 if (qi
.is_bucket()) {
5490 int type
= crush
->get_bucket_type(qi
.id
);
5491 name
<< crush
->get_type_name(type
) << " "
5492 << crush
->get_item_name(qi
.id
);
5494 name
<< "osd." << qi
.id
;
5499 *tbl
<< TextTable::endrow
;
5505 out
<< "MIN/MAX VAR: " << lowprecision_t(min_var
)
5506 << "/" << lowprecision_t(max_var
) << " "
5507 << "STDDEV: " << lowprecision_t(dev());
5512 ostream
& operator<<(ostream
& out
,
5513 const OSDUtilizationPlainDumper::lowprecision_t
& v
)
5517 } else if (v
.v
< 0.001) {
5520 std::streamsize p
= out
.precision();
5521 return out
<< std::fixed
<< std::setprecision(2) << v
.v
<< std::setprecision(p
);
5525 class OSDUtilizationFormatDumper
: public OSDUtilizationDumper
<Formatter
> {
5527 typedef OSDUtilizationDumper
<Formatter
> Parent
;
5529 OSDUtilizationFormatDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap
,
5530 const PGMap
& pgmap
, bool tree
,
5531 const string
& filter
) :
5532 Parent(crush
, osdmap
, pgmap
, tree
, filter
) {}
5534 void dump(Formatter
*f
) {
5535 f
->open_array_section("nodes");
5539 f
->open_array_section("stray");
5545 using OSDUtilizationDumper
<Formatter
>::dump_item
;
5546 void dump_item(const CrushTreeDumper::Item
&qi
,
5550 int64_t kb_used_data
,
5551 int64_t kb_used_omap
,
5552 int64_t kb_used_meta
,
5556 const size_t num_pgs
,
5557 Formatter
*f
) override
{
5558 f
->open_object_section("item");
5559 CrushTreeDumper::dump_item_fields(crush
, weight_set_names
, qi
, f
);
5560 f
->dump_float("reweight", reweight
);
5561 f
->dump_int("kb", kb
);
5562 f
->dump_int("kb_used", kb_used
);
5563 f
->dump_int("kb_used_data", kb_used_data
);
5564 f
->dump_int("kb_used_omap", kb_used_omap
);
5565 f
->dump_int("kb_used_meta", kb_used_meta
);
5566 f
->dump_int("kb_avail", kb_avail
);
5567 f
->dump_float("utilization", util
);
5568 f
->dump_float("var", var
);
5569 f
->dump_unsigned("pgs", num_pgs
);
5570 if (!qi
.is_bucket()) {
5571 if (osdmap
->is_up(qi
.id
)) {
5572 f
->dump_string("status", "up");
5573 } else if (osdmap
->is_destroyed(qi
.id
)) {
5574 f
->dump_string("status", "destroyed");
5576 f
->dump_string("status", "down");
5579 CrushTreeDumper::dump_bucket_children(crush
, qi
, f
);
5584 void summary(Formatter
*f
) {
5585 f
->open_object_section("summary");
5586 auto sum
= pgmap
.get_osd_sum(get_dumped_osds());
5587 auto& s
= sum
.statfs
;
5589 f
->dump_int("total_kb", s
.kb());
5590 f
->dump_int("total_kb_used", s
.kb_used_raw());
5591 f
->dump_int("total_kb_used_data", s
.kb_used_data());
5592 f
->dump_int("total_kb_used_omap", s
.kb_used_omap());
5593 f
->dump_int("total_kb_used_meta", s
.kb_used_internal_metadata());
5594 f
->dump_int("total_kb_avail", s
.kb_avail());
5595 f
->dump_float("average_utilization", average_util
);
5596 f
->dump_float("min_var", min_var
);
5597 f
->dump_float("max_var", max_var
);
5598 f
->dump_float("dev", dev());
5603 void print_osd_utilization(const OSDMap
& osdmap
,
5608 const string
& filter
)
5610 const CrushWrapper
*crush
= osdmap
.crush
.get();
5612 f
->open_object_section("df");
5613 OSDUtilizationFormatDumper
d(crush
, &osdmap
, pgmap
, tree
, filter
);
5619 OSDUtilizationPlainDumper
d(crush
, &osdmap
, pgmap
, tree
, filter
);
5622 out
<< tbl
<< d
.summary() << "\n";
5626 void OSDMap::check_health(CephContext
*cct
,
5627 health_check_map_t
*checks
) const
5629 int num_osds
= get_num_osds();
5632 // OSD_$subtree_DOWN
5634 if (num_osds
>= 0) {
5635 int num_in_osds
= 0;
5636 int num_down_in_osds
= 0;
5638 set
<int> down_in_osds
;
5639 set
<int> up_in_osds
;
5640 set
<int> subtree_up
;
5641 unordered_map
<int, set
<int> > subtree_type_down
;
5642 unordered_map
<int, int> num_osds_subtree
;
5643 int max_type
= crush
->get_max_type_id();
5645 for (int i
= 0; i
< get_max_osd(); i
++) {
5647 if (crush
->item_exists(i
)) {
5652 if (is_out(i
) || (osd_state
[i
] & CEPH_OSD_NEW
))
5655 if (down_in_osds
.count(i
) || up_in_osds
.count(i
))
5658 down_in_osds
.insert(i
);
5661 for (int type
= 0; type
<= max_type
; type
++) {
5662 if (!crush
->get_type_name(type
))
5664 int r
= crush
->get_immediate_parent_id(current
, &parent_id
);
5667 // break early if this parent is already marked as up
5668 if (subtree_up
.count(parent_id
))
5670 type
= crush
->get_bucket_type(parent_id
);
5671 if (!subtree_type_is_down(
5672 cct
, parent_id
, type
,
5673 &down_in_osds
, &up_in_osds
, &subtree_up
, &subtree_type_down
))
5675 current
= parent_id
;
5680 // calculate the number of down osds in each down subtree and
5681 // store it in num_osds_subtree
5682 for (int type
= 1; type
<= max_type
; type
++) {
5683 if (!crush
->get_type_name(type
))
5685 for (auto j
= subtree_type_down
[type
].begin();
5686 j
!= subtree_type_down
[type
].end();
5690 int num_children
= crush
->get_children(*j
, &children
);
5691 if (num_children
== 0)
5693 for (auto l
= children
.begin(); l
!= children
.end(); ++l
) {
5696 } else if (num_osds_subtree
[*l
] > 0) {
5697 num
= num
+ num_osds_subtree
[*l
];
5700 num_osds_subtree
[*j
] = num
;
5703 num_down_in_osds
= down_in_osds
.size();
5704 ceph_assert(num_down_in_osds
<= num_in_osds
);
5705 if (num_down_in_osds
> 0) {
5706 // summary of down subtree types and osds
5707 for (int type
= max_type
; type
> 0; type
--) {
5708 if (!crush
->get_type_name(type
))
5710 if (subtree_type_down
[type
].size() > 0) {
5712 ss
<< subtree_type_down
[type
].size() << " "
5713 << crush
->get_type_name(type
);
5714 if (subtree_type_down
[type
].size() > 1) {
5717 int sum_down_osds
= 0;
5718 for (auto j
= subtree_type_down
[type
].begin();
5719 j
!= subtree_type_down
[type
].end();
5721 sum_down_osds
= sum_down_osds
+ num_osds_subtree
[*j
];
5723 ss
<< " (" << sum_down_osds
<< " osds) down";
5724 string err
= string("OSD_") +
5725 string(crush
->get_type_name(type
)) + "_DOWN";
5726 boost::to_upper(err
);
5727 auto& d
= checks
->add(err
, HEALTH_WARN
, ss
.str(),
5728 subtree_type_down
[type
].size());
5729 for (auto j
= subtree_type_down
[type
].rbegin();
5730 j
!= subtree_type_down
[type
].rend();
5733 ss
<< crush
->get_type_name(type
);
5735 ss
<< crush
->get_item_name(*j
);
5736 // at the top level, do not print location
5737 if (type
!= max_type
) {
5739 ss
<< crush
->get_full_location_ordered_string(*j
);
5742 int num
= num_osds_subtree
[*j
];
5743 ss
<< " (" << num
<< " osds)";
5745 d
.detail
.push_back(ss
.str());
5750 ss
<< down_in_osds
.size() << " osds down";
5751 auto& d
= checks
->add("OSD_DOWN", HEALTH_WARN
, ss
.str(),
5752 down_in_osds
.size());
5753 for (auto it
= down_in_osds
.begin(); it
!= down_in_osds
.end(); ++it
) {
5755 ss
<< "osd." << *it
<< " (";
5756 ss
<< crush
->get_full_location_ordered_string(*it
);
5758 d
.detail
.push_back(ss
.str());
5762 if (!osds
.empty()) {
5764 ss
<< osds
.size() << " osds exist in the crush map but not in the osdmap";
5765 auto& d
= checks
->add("OSD_ORPHAN", HEALTH_WARN
, ss
.str(),
5767 for (auto osd
: osds
) {
5769 ss
<< "osd." << osd
<< " exists in crush map but not in osdmap";
5770 d
.detail
.push_back(ss
.str());
5775 std::list
<std::string
> scrub_messages
;
5776 bool noscrub
= false, nodeepscrub
= false;
5777 for (const auto &p
: pools
) {
5778 if (p
.second
.flags
& pg_pool_t::FLAG_NOSCRUB
) {
5780 ss
<< "Pool " << get_pool_name(p
.first
) << " has noscrub flag";
5781 scrub_messages
.push_back(ss
.str());
5784 if (p
.second
.flags
& pg_pool_t::FLAG_NODEEP_SCRUB
) {
5786 ss
<< "Pool " << get_pool_name(p
.first
) << " has nodeep-scrub flag";
5787 scrub_messages
.push_back(ss
.str());
5791 if (noscrub
|| nodeepscrub
) {
5793 out
+= noscrub
? string("noscrub") + (nodeepscrub
? ", " : "") : "";
5794 out
+= nodeepscrub
? "nodeep-scrub" : "";
5795 auto& d
= checks
->add("POOL_SCRUB_FLAGS", HEALTH_OK
,
5796 "Some pool(s) have the " + out
+ " flag(s) set", 0);
5797 d
.detail
.splice(d
.detail
.end(), scrub_messages
);
5800 // OSD_OUT_OF_ORDER_FULL
5802 // An osd could configure failsafe ratio, to something different
5803 // but for now assume it is the same here.
5804 float fsr
= cct
->_conf
->osd_failsafe_full_ratio
;
5805 if (fsr
> 1.0) fsr
/= 100;
5806 float fr
= get_full_ratio();
5807 float br
= get_backfillfull_ratio();
5808 float nr
= get_nearfull_ratio();
5810 list
<string
> detail
;
5811 // These checks correspond to how OSDService::check_full_status() in an OSD
5812 // handles the improper setting of these values.
5815 ss
<< "backfillfull_ratio (" << br
5816 << ") < nearfull_ratio (" << nr
<< "), increased";
5817 detail
.push_back(ss
.str());
5822 ss
<< "full_ratio (" << fr
<< ") < backfillfull_ratio (" << br
5824 detail
.push_back(ss
.str());
5829 ss
<< "osd_failsafe_full_ratio (" << fsr
<< ") < full_ratio (" << fr
5831 detail
.push_back(ss
.str());
5833 if (!detail
.empty()) {
5834 auto& d
= checks
->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR
,
5835 "full ratio(s) out of order", 0);
5836 d
.detail
.swap(detail
);
5843 // OSD_FAILSAFE_FULL
5845 set
<int> full
, backfillfull
, nearfull
;
5846 get_full_osd_counts(&full
, &backfillfull
, &nearfull
);
5849 ss
<< full
.size() << " full osd(s)";
5850 auto& d
= checks
->add("OSD_FULL", HEALTH_ERR
, ss
.str(), full
.size());
5851 for (auto& i
: full
) {
5853 ss
<< "osd." << i
<< " is full";
5854 d
.detail
.push_back(ss
.str());
5857 if (backfillfull
.size()) {
5859 ss
<< backfillfull
.size() << " backfillfull osd(s)";
5860 auto& d
= checks
->add("OSD_BACKFILLFULL", HEALTH_WARN
, ss
.str(),
5861 backfillfull
.size());
5862 for (auto& i
: backfillfull
) {
5864 ss
<< "osd." << i
<< " is backfill full";
5865 d
.detail
.push_back(ss
.str());
5868 if (nearfull
.size()) {
5870 ss
<< nearfull
.size() << " nearfull osd(s)";
5871 auto& d
= checks
->add("OSD_NEARFULL", HEALTH_WARN
, ss
.str(), nearfull
.size());
5872 for (auto& i
: nearfull
) {
5874 ss
<< "osd." << i
<< " is near full";
5875 d
.detail
.push_back(ss
.str());
5883 uint64_t warn_flags
=
5884 CEPH_OSDMAP_PAUSERD
|
5885 CEPH_OSDMAP_PAUSEWR
|
5886 CEPH_OSDMAP_PAUSEREC
|
5888 CEPH_OSDMAP_NODOWN
|
5891 CEPH_OSDMAP_NOBACKFILL
|
5892 CEPH_OSDMAP_NORECOVER
|
5893 CEPH_OSDMAP_NOSCRUB
|
5894 CEPH_OSDMAP_NODEEP_SCRUB
|
5895 CEPH_OSDMAP_NOTIERAGENT
|
5896 CEPH_OSDMAP_NOSNAPTRIM
|
5897 CEPH_OSDMAP_NOREBALANCE
;
5898 if (test_flag(warn_flags
)) {
5900 string s
= get_flag_string(get_flags() & warn_flags
);
5901 ss
<< s
<< " flag(s) set";
5902 checks
->add("OSDMAP_FLAGS", HEALTH_WARN
, ss
.str(),
5903 s
.size() /* kludgey but sufficient */);
5909 list
<string
> detail
;
5910 const unsigned flags
=
5915 for (int i
= 0; i
< max_osd
; ++i
) {
5916 if (osd_state
[i
] & flags
) {
5919 OSDMap::calc_state_set(osd_state
[i
] & flags
, states
);
5920 ss
<< "osd." << i
<< " has flags " << states
;
5921 detail
.push_back(ss
.str());
5924 for (auto& i
: crush_node_flags
) {
5925 if (i
.second
&& crush
->item_exists(i
.first
)) {
5928 OSDMap::calc_state_set(i
.second
, states
);
5929 int t
= i
.first
>= 0 ? 0 : crush
->get_bucket_type(i
.first
);
5930 const char *tn
= crush
->get_type_name(t
);
5931 ss
<< (tn
? tn
: "node") << " "
5932 << crush
->get_item_name(i
.first
) << " has flags " << states
;
5933 detail
.push_back(ss
.str());
5936 for (auto& i
: device_class_flags
) {
5937 const char* class_name
= crush
->get_class_name(i
.first
);
5938 if (i
.second
&& class_name
) {
5941 OSDMap::calc_state_set(i
.second
, states
);
5942 ss
<< "device class '" << class_name
<< "' has flags " << states
;
5943 detail
.push_back(ss
.str());
5946 if (!detail
.empty()) {
5948 ss
<< detail
.size() << " OSDs or CRUSH {nodes, device-classes} have {NOUP,NODOWN,NOIN,NOOUT} flags set";
5949 auto& d
= checks
->add("OSD_FLAGS", HEALTH_WARN
, ss
.str(), detail
.size());
5950 d
.detail
.swap(detail
);
5954 // OLD_CRUSH_TUNABLES
5955 if (cct
->_conf
->mon_warn_on_legacy_crush_tunables
) {
5956 string min
= crush
->get_min_required_version();
5957 if (min
< cct
->_conf
->mon_crush_min_required_version
) {
5959 ss
<< "crush map has legacy tunables (require " << min
5960 << ", min is " << cct
->_conf
->mon_crush_min_required_version
<< ")";
5961 auto& d
= checks
->add("OLD_CRUSH_TUNABLES", HEALTH_WARN
, ss
.str(), 0);
5962 d
.detail
.push_back("see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables");
5966 // OLD_CRUSH_STRAW_CALC_VERSION
5967 if (cct
->_conf
->mon_warn_on_crush_straw_calc_version_zero
) {
5968 if (crush
->get_straw_calc_version() == 0) {
5970 ss
<< "crush map has straw_calc_version=0";
5971 auto& d
= checks
->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN
, ss
.str(), 0);
5973 "see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables");
5977 // CACHE_POOL_NO_HIT_SET
5978 if (cct
->_conf
->mon_warn_on_cache_pools_without_hit_sets
) {
5979 list
<string
> detail
;
5980 for (auto p
= pools
.cbegin(); p
!= pools
.cend(); ++p
) {
5981 const pg_pool_t
& info
= p
->second
;
5982 if (info
.cache_mode_requires_hit_set() &&
5983 info
.hit_set_params
.get_type() == HitSet::TYPE_NONE
) {
5985 ss
<< "pool '" << get_pool_name(p
->first
)
5986 << "' with cache_mode " << info
.get_cache_mode_name()
5987 << " needs hit_set_type to be set but it is not";
5988 detail
.push_back(ss
.str());
5991 if (!detail
.empty()) {
5993 ss
<< detail
.size() << " cache pools are missing hit_sets";
5994 auto& d
= checks
->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN
, ss
.str(),
5996 d
.detail
.swap(detail
);
6000 // OSD_NO_SORTBITWISE
6001 if (!test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
6003 ss
<< "'sortbitwise' flag is not set";
6004 checks
->add("OSD_NO_SORTBITWISE", HEALTH_WARN
, ss
.str(), 0);
6007 // OSD_UPGRADE_FINISHED
6008 // none of these (yet) since we don't run until luminous upgrade is done.
6010 // POOL_NEARFULL/BACKFILLFULL/FULL
6012 list
<string
> full_detail
, backfillfull_detail
, nearfull_detail
;
6013 for (auto it
: get_pools()) {
6014 const pg_pool_t
&pool
= it
.second
;
6015 const string
& pool_name
= get_pool_name(it
.first
);
6016 if (pool
.has_flag(pg_pool_t::FLAG_FULL
)) {
6018 if (pool
.has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
6019 // may run out of space too,
6020 // but we want EQUOTA taking precedence
6021 ss
<< "pool '" << pool_name
<< "' is full (running out of quota)";
6023 ss
<< "pool '" << pool_name
<< "' is full (no space)";
6025 full_detail
.push_back(ss
.str());
6026 } else if (pool
.has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
6028 ss
<< "pool '" << pool_name
<< "' is backfillfull";
6029 backfillfull_detail
.push_back(ss
.str());
6030 } else if (pool
.has_flag(pg_pool_t::FLAG_NEARFULL
)) {
6032 ss
<< "pool '" << pool_name
<< "' is nearfull";
6033 nearfull_detail
.push_back(ss
.str());
6036 if (!full_detail
.empty()) {
6038 ss
<< full_detail
.size() << " pool(s) full";
6039 auto& d
= checks
->add("POOL_FULL", HEALTH_WARN
, ss
.str(), full_detail
.size());
6040 d
.detail
.swap(full_detail
);
6042 if (!backfillfull_detail
.empty()) {
6044 ss
<< backfillfull_detail
.size() << " pool(s) backfillfull";
6045 auto& d
= checks
->add("POOL_BACKFILLFULL", HEALTH_WARN
, ss
.str(),
6046 backfillfull_detail
.size());
6047 d
.detail
.swap(backfillfull_detail
);
6049 if (!nearfull_detail
.empty()) {
6051 ss
<< nearfull_detail
.size() << " pool(s) nearfull";
6052 auto& d
= checks
->add("POOL_NEARFULL", HEALTH_WARN
, ss
.str(),
6053 nearfull_detail
.size());
6054 d
.detail
.swap(nearfull_detail
);
6058 // POOL_PG_NUM_NOT_POWER_OF_TWO
6059 if (cct
->_conf
.get_val
<bool>("mon_warn_on_pool_pg_num_not_power_of_two")) {
6060 list
<string
> detail
;
6061 for (auto it
: get_pools()) {
6062 if (!isp2(it
.second
.get_pg_num_target())) {
6064 ss
<< "pool '" << get_pool_name(it
.first
)
6065 << "' pg_num " << it
.second
.get_pg_num_target()
6066 << " is not a power of two";
6067 detail
.push_back(ss
.str());
6070 if (!detail
.empty()) {
6072 ss
<< detail
.size() << " pool(s) have non-power-of-two pg_num";
6073 auto& d
= checks
->add("POOL_PG_NUM_NOT_POWER_OF_TWO", HEALTH_WARN
,
6074 ss
.str(), detail
.size());
6075 d
.detail
.swap(detail
);
6079 // POOL_NO_REDUNDANCY
6080 if (cct
->_conf
.get_val
<bool>("mon_warn_on_pool_no_redundancy"))
6082 list
<string
> detail
;
6083 for (auto it
: get_pools()) {
6084 if (it
.second
.get_size() == 1) {
6086 ss
<< "pool '" << get_pool_name(it
.first
)
6087 << "' has no replicas configured";
6088 detail
.push_back(ss
.str());
6091 if (!detail
.empty()) {
6093 ss
<< detail
.size() << " pool(s) have no replicas configured";
6094 auto& d
= checks
->add("POOL_NO_REDUNDANCY", HEALTH_WARN
,
6095 ss
.str(), detail
.size());
6096 d
.detail
.swap(detail
);
6100 // DEGRADED STRETCH MODE
6101 if (cct
->_conf
.get_val
<bool>("mon_warn_on_degraded_stretch_mode")) {
6102 if (recovering_stretch_mode
) {
6104 ss
<< "We are recovering stretch mode buckets, only requiring "
6105 << degraded_stretch_mode
<< " of " << stretch_bucket_count
<< " buckets to peer" ;
6106 checks
->add("RECOVERING_STRETCH_MODE", HEALTH_WARN
,
6108 } else if (degraded_stretch_mode
) {
6110 ss
<< "We are missing stretch mode buckets, only requiring "
6111 << degraded_stretch_mode
<< " of " << stretch_bucket_count
<< " buckets to peer" ;
6112 checks
->add("DEGRADED_STRETCH_MODE", HEALTH_WARN
,
6118 int OSDMap::parse_osd_id_list(const vector
<string
>& ls
, set
<int> *out
,
6122 for (auto i
= ls
.begin(); i
!= ls
.end(); ++i
) {
6123 if (i
== ls
.begin() &&
6124 (*i
== "any" || *i
== "all" || *i
== "*")) {
6128 long osd
= TOPNSPC::common::parse_osd_id(i
->c_str(), ss
);
6130 *ss
<< "invalid osd id '" << *i
<< "'";
6138 void OSDMap::get_random_up_osds_by_subtree(int n
, // whoami
6140 int limit
, // how many
6142 set
<int> *want
) const {
6145 int subtree_type
= crush
->get_type_id(subtree
);
6146 if (subtree_type
< 1)
6148 vector
<int> subtrees
;
6149 crush
->get_subtree_of_type(subtree_type
, &subtrees
);
6150 std::random_device rd
;
6151 std::default_random_engine rng
{rd()};
6152 std::shuffle(subtrees
.begin(), subtrees
.end(), rng
);
6153 for (auto s
: subtrees
) {
6156 if (crush
->subtree_contains(s
, n
))
6159 crush
->get_children_of_type(s
, 0, &osds
);
6162 vector
<int> up_osds
;
6163 for (auto o
: osds
) {
6164 if (is_up(o
) && !skip
.count(o
))
6165 up_osds
.push_back(o
);
6167 if (up_osds
.empty())
6169 auto it
= up_osds
.begin();
6170 std::advance(it
, (n
% up_osds
.size()));
6176 float OSDMap::pool_raw_used_rate(int64_t poolid
) const
6178 const pg_pool_t
*pool
= get_pg_pool(poolid
);
6179 assert(pool
!= nullptr);
6181 switch (pool
->get_type()) {
6182 case pg_pool_t::TYPE_REPLICATED
:
6183 return pool
->get_size();
6184 case pg_pool_t::TYPE_ERASURE
:
6187 get_erasure_code_profile(pool
->erasure_code_profile
);
6188 auto pm
= ecp
.find("m");
6189 auto pk
= ecp
.find("k");
6190 if (pm
!= ecp
.end() && pk
!= ecp
.end()) {
6191 int k
= atoi(pk
->second
.c_str());
6192 int m
= atoi(pm
->second
.c_str());
6194 ceph_assert(mk
!= 0);
6195 ceph_assert(k
!= 0);
6196 return (float)mk
/ k
;
6203 ceph_abort_msg("unrecognized pool type");
6207 unsigned OSDMap::get_osd_crush_node_flags(int osd
) const
6210 if (!crush_node_flags
.empty()) {
6211 // the map will contain type -> name
6212 std::map
<std::string
,std::string
> ploc
= crush
->get_full_location(osd
);
6213 for (auto& i
: ploc
) {
6214 int id
= crush
->get_item_id(i
.second
);
6215 auto p
= crush_node_flags
.find(id
);
6216 if (p
!= crush_node_flags
.end()) {
6224 unsigned OSDMap::get_crush_node_flags(int id
) const
6227 auto it
= crush_node_flags
.find(id
);
6228 if (it
!= crush_node_flags
.end())
6233 unsigned OSDMap::get_device_class_flags(int id
) const
6236 auto it
= device_class_flags
.find(id
);
6237 if (it
!= device_class_flags
.end())