1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
22 #include <boost/algorithm/string.hpp>
25 #include "common/config.h"
26 #include "common/errno.h"
27 #include "common/Formatter.h"
28 #include "common/TextTable.h"
29 #include "global/global_context.h"
30 #include "include/ceph_features.h"
31 #include "include/str_map.h"
33 #include "common/code_environment.h"
34 #include "mon/health_check.h"
36 #include "crush/CrushTreeDumper.h"
37 #include "common/Clock.h"
38 #include "mon/PGMap.h"
40 #define dout_subsys ceph_subsys_osd
42 MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap
, osdmap
, osdmap
);
43 MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental
, osdmap_inc
, osdmap
);
46 // ----------------------------------
49 void osd_info_t::dump(Formatter
*f
) const
51 f
->dump_int("last_clean_begin", last_clean_begin
);
52 f
->dump_int("last_clean_end", last_clean_end
);
53 f
->dump_int("up_from", up_from
);
54 f
->dump_int("up_thru", up_thru
);
55 f
->dump_int("down_at", down_at
);
56 f
->dump_int("lost_at", lost_at
);
59 void osd_info_t::encode(bufferlist
& bl
) const
64 encode(last_clean_begin
, bl
);
65 encode(last_clean_end
, bl
);
72 void osd_info_t::decode(bufferlist::const_iterator
& bl
)
77 decode(last_clean_begin
, bl
);
78 decode(last_clean_end
, bl
);
85 void osd_info_t::generate_test_instances(list
<osd_info_t
*>& o
)
87 o
.push_back(new osd_info_t
);
88 o
.push_back(new osd_info_t
);
89 o
.back()->last_clean_begin
= 1;
90 o
.back()->last_clean_end
= 2;
91 o
.back()->up_from
= 30;
92 o
.back()->up_thru
= 40;
93 o
.back()->down_at
= 5;
94 o
.back()->lost_at
= 6;
97 ostream
& operator<<(ostream
& out
, const osd_info_t
& info
)
99 out
<< "up_from " << info
.up_from
100 << " up_thru " << info
.up_thru
101 << " down_at " << info
.down_at
102 << " last_clean_interval [" << info
.last_clean_begin
<< "," << info
.last_clean_end
<< ")";
104 out
<< " lost_at " << info
.lost_at
;
108 // ----------------------------------
111 void osd_xinfo_t::dump(Formatter
*f
) const
113 f
->dump_stream("down_stamp") << down_stamp
;
114 f
->dump_float("laggy_probability", laggy_probability
);
115 f
->dump_int("laggy_interval", laggy_interval
);
116 f
->dump_int("features", features
);
117 f
->dump_unsigned("old_weight", old_weight
);
120 void osd_xinfo_t::encode(bufferlist
& bl
) const
122 ENCODE_START(3, 1, bl
);
123 encode(down_stamp
, bl
);
124 __u32 lp
= laggy_probability
* 0xfffffffful
;
126 encode(laggy_interval
, bl
);
127 encode(features
, bl
);
128 encode(old_weight
, bl
);
132 void osd_xinfo_t::decode(bufferlist::const_iterator
& bl
)
135 decode(down_stamp
, bl
);
138 laggy_probability
= (float)lp
/ (float)0xffffffff;
139 decode(laggy_interval
, bl
);
141 decode(features
, bl
);
145 decode(old_weight
, bl
);
151 void osd_xinfo_t::generate_test_instances(list
<osd_xinfo_t
*>& o
)
153 o
.push_back(new osd_xinfo_t
);
154 o
.push_back(new osd_xinfo_t
);
155 o
.back()->down_stamp
= utime_t(2, 3);
156 o
.back()->laggy_probability
= .123;
157 o
.back()->laggy_interval
= 123456;
158 o
.back()->old_weight
= 0x7fff;
161 ostream
& operator<<(ostream
& out
, const osd_xinfo_t
& xi
)
163 return out
<< "down_stamp " << xi
.down_stamp
164 << " laggy_probability " << xi
.laggy_probability
165 << " laggy_interval " << xi
.laggy_interval
166 << " old_weight " << xi
.old_weight
;
169 // ----------------------------------
170 // OSDMap::Incremental
172 int OSDMap::Incremental::get_net_marked_out(const OSDMap
*previous
) const
175 for (auto &weight
: new_weight
) {
176 if (weight
.second
== CEPH_OSD_OUT
&& !previous
->is_out(weight
.first
))
178 else if (weight
.second
!= CEPH_OSD_OUT
&& previous
->is_out(weight
.first
))
184 int OSDMap::Incremental::get_net_marked_down(const OSDMap
*previous
) const
187 for (auto &state
: new_state
) { //
188 if (state
.second
& CEPH_OSD_UP
) {
189 if (previous
->is_up(state
.first
))
198 int OSDMap::Incremental::identify_osd(uuid_d u
) const
200 for (auto &uuid
: new_uuid
)
201 if (uuid
.second
== u
)
206 int OSDMap::Incremental::propagate_snaps_to_tiers(CephContext
*cct
,
207 const OSDMap
& osdmap
)
209 ceph_assert(epoch
== osdmap
.get_epoch() + 1);
211 for (auto &new_pool
: new_pools
) {
212 if (!new_pool
.second
.tiers
.empty()) {
213 pg_pool_t
& base
= new_pool
.second
;
215 auto new_rem_it
= new_removed_snaps
.find(new_pool
.first
);
217 for (const auto &tier_pool
: base
.tiers
) {
218 const auto &r
= new_pools
.find(tier_pool
);
220 if (r
== new_pools
.end()) {
221 const pg_pool_t
*orig
= osdmap
.get_pg_pool(tier_pool
);
223 lderr(cct
) << __func__
<< " no pool " << tier_pool
<< dendl
;
226 tier
= get_new_pool(tier_pool
, orig
);
230 if (tier
->tier_of
!= new_pool
.first
) {
231 lderr(cct
) << __func__
<< " " << r
->first
<< " tier_of != " << new_pool
.first
<< dendl
;
235 ldout(cct
, 10) << __func__
<< " from " << new_pool
.first
<< " to "
236 << tier_pool
<< dendl
;
237 tier
->snap_seq
= base
.snap_seq
;
238 tier
->snap_epoch
= base
.snap_epoch
;
239 tier
->snaps
= base
.snaps
;
240 tier
->removed_snaps
= base
.removed_snaps
;
241 tier
->flags
|= base
.flags
& (pg_pool_t::FLAG_SELFMANAGED_SNAPS
|
242 pg_pool_t::FLAG_POOL_SNAPS
);
244 if (new_rem_it
!= new_removed_snaps
.end()) {
245 new_removed_snaps
[tier_pool
] = new_rem_it
->second
;
253 // ----------------------------------
256 bool OSDMap::subtree_is_down(int id
, set
<int> *down_cache
) const
262 down_cache
->count(id
)) {
267 crush
->get_children(id
, &children
);
268 for (const auto &child
: children
) {
269 if (!subtree_is_down(child
, down_cache
)) {
274 down_cache
->insert(id
);
279 bool OSDMap::containing_subtree_is_down(CephContext
*cct
, int id
, int subtree_type
, set
<int> *down_cache
) const
281 // use a stack-local down_cache if we didn't get one from the
282 // caller. then at least this particular call will avoid duplicated
284 set
<int> local_down_cache
;
286 down_cache
= &local_down_cache
;
295 type
= crush
->get_bucket_type(current
);
297 ceph_assert(type
>= 0);
299 if (!subtree_is_down(current
, down_cache
)) {
300 ldout(cct
, 30) << "containing_subtree_is_down(" << id
<< ") = false" << dendl
;
304 // is this a big enough subtree to be marked as down?
305 if (type
>= subtree_type
) {
306 ldout(cct
, 30) << "containing_subtree_is_down(" << id
<< ") = true ... " << type
<< " >= " << subtree_type
<< dendl
;
310 int r
= crush
->get_immediate_parent_id(current
, ¤t
);
317 bool OSDMap::subtree_type_is_down(
321 set
<int> *down_in_osds
,
322 set
<int> *up_in_osds
,
323 set
<int> *subtree_up
,
324 unordered_map
<int, set
<int> > *subtree_type_down
) const
327 bool is_down_ret
= is_down(id
);
330 down_in_osds
->insert(id
);
332 up_in_osds
->insert(id
);
338 if (subtree_type_down
&&
339 (*subtree_type_down
)[subtree_type
].count(id
)) {
344 crush
->get_children(id
, &children
);
345 for (const auto &child
: children
) {
346 if (!subtree_type_is_down(
347 cct
, child
, crush
->get_bucket_type(child
),
348 down_in_osds
, up_in_osds
, subtree_up
, subtree_type_down
)) {
349 subtree_up
->insert(id
);
353 if (subtree_type_down
) {
354 (*subtree_type_down
)[subtree_type
].insert(id
);
359 void OSDMap::Incremental::encode_client_old(bufferlist
& bl
) const
366 encode(modified
, bl
);
367 int32_t new_t
= new_pool_max
;
369 encode(new_flags
, bl
);
373 encode(new_max_osd
, bl
);
374 // for encode(new_pools, bl);
375 __u32 n
= new_pools
.size();
377 for (const auto &new_pool
: new_pools
) {
380 encode(new_pool
.second
, bl
, 0);
382 // for encode(new_pool_names, bl);
383 n
= new_pool_names
.size();
386 for (const auto &new_pool_name
: new_pool_names
) {
387 n
= new_pool_name
.first
;
389 encode(new_pool_name
.second
, bl
);
391 // for encode(old_pools, bl);
392 n
= old_pools
.size();
394 for (auto &old_pool
: old_pools
) {
398 encode(new_up_client
, bl
, 0);
400 // legacy is map<int32_t,uint8_t>
401 uint32_t n
= new_state
.size();
403 for (auto p
: new_state
) {
405 encode((uint8_t)p
.second
, bl
);
408 encode(new_weight
, bl
);
409 // for encode(new_pg_temp, bl);
410 n
= new_pg_temp
.size();
413 for (const auto &pg_temp
: new_pg_temp
) {
414 old_pg_t opg
= pg_temp
.first
.get_old_pg();
416 encode(pg_temp
.second
, bl
);
420 void OSDMap::Incremental::encode_classic(bufferlist
& bl
, uint64_t features
) const
423 if ((features
& CEPH_FEATURE_PGID64
) == 0) {
424 encode_client_old(bl
);
433 encode(modified
, bl
);
434 encode(new_pool_max
, bl
);
435 encode(new_flags
, bl
);
439 encode(new_max_osd
, bl
);
440 encode(new_pools
, bl
, features
);
441 encode(new_pool_names
, bl
);
442 encode(old_pools
, bl
);
443 encode(new_up_client
, bl
, features
);
445 uint32_t n
= new_state
.size();
447 for (auto p
: new_state
) {
449 encode((uint8_t)p
.second
, bl
);
452 encode(new_weight
, bl
);
453 encode(new_pg_temp
, bl
);
458 encode(new_hb_back_up
, bl
, features
);
459 encode(new_up_thru
, bl
);
460 encode(new_last_clean_interval
, bl
);
461 encode(new_lost
, bl
);
462 encode(new_blacklist
, bl
, features
);
463 encode(old_blacklist
, bl
, features
);
464 encode(new_up_cluster
, bl
, features
);
465 encode(cluster_snapshot
, bl
);
466 encode(new_uuid
, bl
);
467 encode(new_xinfo
, bl
);
468 encode(new_hb_front_up
, bl
, features
);
472 static void encode_addrvec_map_as_addr(const T
& m
, bufferlist
& bl
, uint64_t f
)
474 uint32_t n
= m
.size();
478 encode(i
.second
.legacy_addr(), bl
, f
);
483 static void encode_addrvec_pvec_as_addr(const T
& m
, bufferlist
& bl
, uint64_t f
)
485 uint32_t n
= m
.size();
489 encode(i
->legacy_addr(), bl
, f
);
491 encode(entity_addr_t(), bl
, f
);
496 /* for a description of osdmap incremental versions, and when they were
497 * introduced, please refer to
498 * doc/dev/osd_internals/osdmap_versions.txt
500 void OSDMap::Incremental::encode(bufferlist
& bl
, uint64_t features
) const
503 if ((features
& CEPH_FEATURE_OSDMAP_ENC
) == 0) {
504 encode_classic(bl
, features
);
508 // only a select set of callers should *ever* be encoding new
509 // OSDMaps. others should be passing around the canonical encoded
510 // buffers from on high. select out those callers by passing in an
511 // "impossible" feature bit.
512 ceph_assert(features
& CEPH_FEATURE_RESERVED
);
513 features
&= ~CEPH_FEATURE_RESERVED
;
515 size_t start_offset
= bl
.length();
518 std::optional
<buffer::list::contiguous_filler
> crc_filler
;
520 // meta-encoding: how we include client-used and osd-specific data
521 ENCODE_START(8, 7, bl
);
525 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
527 } else if (!HAVE_FEATURE(features
, SERVER_MIMIC
)) {
529 } else if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
532 ENCODE_START(v
, 1, bl
); // client-usable data
535 encode(modified
, bl
);
536 encode(new_pool_max
, bl
);
537 encode(new_flags
, bl
);
541 encode(new_max_osd
, bl
);
542 encode(new_pools
, bl
, features
);
543 encode(new_pool_names
, bl
);
544 encode(old_pools
, bl
);
546 encode(new_up_client
, bl
, features
);
548 encode_addrvec_map_as_addr(new_up_client
, bl
, features
);
551 encode(new_state
, bl
);
553 uint32_t n
= new_state
.size();
555 for (auto p
: new_state
) {
557 encode((uint8_t)p
.second
, bl
);
560 encode(new_weight
, bl
);
561 encode(new_pg_temp
, bl
);
562 encode(new_primary_temp
, bl
);
563 encode(new_primary_affinity
, bl
);
564 encode(new_erasure_code_profiles
, bl
);
565 encode(old_erasure_code_profiles
, bl
);
567 encode(new_pg_upmap
, bl
);
568 encode(old_pg_upmap
, bl
);
569 encode(new_pg_upmap_items
, bl
);
570 encode(old_pg_upmap_items
, bl
);
573 encode(new_removed_snaps
, bl
);
574 encode(new_purged_snaps
, bl
);
577 encode(new_last_up_change
, bl
);
578 encode(new_last_in_change
, bl
);
580 ENCODE_FINISH(bl
); // client-usable data
584 uint8_t target_v
= 9;
585 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
587 } else if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
590 ENCODE_START(target_v
, 1, bl
); // extended, osd-only data
592 encode_addrvec_map_as_addr(new_hb_back_up
, bl
, features
);
594 encode(new_hb_back_up
, bl
, features
);
596 encode(new_up_thru
, bl
);
597 encode(new_last_clean_interval
, bl
);
598 encode(new_lost
, bl
);
599 encode(new_blacklist
, bl
, features
);
600 encode(old_blacklist
, bl
, features
);
602 encode_addrvec_map_as_addr(new_up_cluster
, bl
, features
);
604 encode(new_up_cluster
, bl
, features
);
606 encode(cluster_snapshot
, bl
);
607 encode(new_uuid
, bl
);
608 encode(new_xinfo
, bl
);
610 encode_addrvec_map_as_addr(new_hb_front_up
, bl
, features
);
612 encode(new_hb_front_up
, bl
, features
);
614 encode(features
, bl
); // NOTE: features arg, not the member
616 encode(new_nearfull_ratio
, bl
);
617 encode(new_full_ratio
, bl
);
618 encode(new_backfillfull_ratio
, bl
);
620 // 5 was string-based new_require_min_compat_client
622 encode(new_require_min_compat_client
, bl
);
623 encode(new_require_osd_release
, bl
);
626 encode(new_crush_node_flags
, bl
);
629 encode(new_device_class_flags
, bl
);
631 ENCODE_FINISH(bl
); // osd-only data
634 crc_offset
= bl
.length();
635 crc_filler
= bl
.append_hole(sizeof(uint32_t));
636 tail_offset
= bl
.length();
638 encode(full_crc
, bl
);
640 ENCODE_FINISH(bl
); // meta-encoding wrapper
644 front
.substr_of(bl
, start_offset
, crc_offset
- start_offset
);
645 inc_crc
= front
.crc32c(-1);
647 tail
.substr_of(bl
, tail_offset
, bl
.length() - tail_offset
);
648 inc_crc
= tail
.crc32c(inc_crc
);
651 crc_filler
->copy_in(4u, (char*)&crc_le
);
655 void OSDMap::Incremental::decode_classic(bufferlist::const_iterator
&p
)
665 if (v
== 4 || v
== 5) {
669 decode(new_pool_max
, p
);
670 decode(new_flags
, p
);
674 decode(new_max_osd
, p
);
680 decode(new_pools
[t
], p
);
683 decode(new_pools
, p
);
686 new_pool_names
.clear();
690 decode(new_pool_names
[t
], p
);
693 decode(new_pool_names
, p
);
703 decode(old_pools
, p
);
705 decode(new_up_client
, p
);
707 map
<int32_t,uint8_t> ns
;
710 new_state
[q
.first
] = q
.second
;
713 decode(new_weight
, p
);
720 ::decode_raw(opg
, p
);
721 decode(new_pg_temp
[pg_t(opg
)], p
);
724 decode(new_pg_temp
, p
);
727 // decode short map, too.
728 if (v
== 5 && p
.end())
735 decode(new_hb_back_up
, p
);
737 decode(new_pool_names
, p
);
738 decode(new_up_thru
, p
);
739 decode(new_last_clean_interval
, p
);
741 decode(new_blacklist
, p
);
742 decode(old_blacklist
, p
);
744 decode(new_up_cluster
, p
);
746 decode(cluster_snapshot
, p
);
750 decode(new_xinfo
, p
);
752 decode(new_hb_front_up
, p
);
755 /* for a description of osdmap incremental versions, and when they were
756 * introduced, please refer to
757 * doc/dev/osd_internals/osdmap_versions.txt
759 void OSDMap::Incremental::decode(bufferlist::const_iterator
& bl
)
763 * Older encodings of the Incremental had a single struct_v which
764 * covered the whole encoding, and was prior to our modern
765 * stuff which includes a compatv and a size. So if we see
766 * a struct_v < 7, we must rewind to the beginning and use our
769 size_t start_offset
= bl
.get_off();
770 size_t tail_offset
= 0;
771 bufferlist crc_front
, crc_tail
;
773 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl
); // wrapper
775 bl
.seek(start_offset
);
779 encode_features
= CEPH_FEATURE_PGID64
;
785 DECODE_START(8, bl
); // client-usable data
788 decode(modified
, bl
);
789 decode(new_pool_max
, bl
);
790 decode(new_flags
, bl
);
794 decode(new_max_osd
, bl
);
795 decode(new_pools
, bl
);
796 decode(new_pool_names
, bl
);
797 decode(old_pools
, bl
);
798 decode(new_up_client
, bl
);
800 decode(new_state
, bl
);
802 map
<int32_t,uint8_t> ns
;
805 new_state
[q
.first
] = q
.second
;
808 decode(new_weight
, bl
);
809 decode(new_pg_temp
, bl
);
810 decode(new_primary_temp
, bl
);
812 decode(new_primary_affinity
, bl
);
814 new_primary_affinity
.clear();
816 decode(new_erasure_code_profiles
, bl
);
817 decode(old_erasure_code_profiles
, bl
);
819 new_erasure_code_profiles
.clear();
820 old_erasure_code_profiles
.clear();
823 decode(new_pg_upmap
, bl
);
824 decode(old_pg_upmap
, bl
);
825 decode(new_pg_upmap_items
, bl
);
826 decode(old_pg_upmap_items
, bl
);
829 decode(new_removed_snaps
, bl
);
830 decode(new_purged_snaps
, bl
);
833 decode(new_last_up_change
, bl
);
834 decode(new_last_in_change
, bl
);
836 DECODE_FINISH(bl
); // client-usable data
840 DECODE_START(9, bl
); // extended, osd-only data
841 decode(new_hb_back_up
, bl
);
842 decode(new_up_thru
, bl
);
843 decode(new_last_clean_interval
, bl
);
844 decode(new_lost
, bl
);
845 decode(new_blacklist
, bl
);
846 decode(old_blacklist
, bl
);
847 decode(new_up_cluster
, bl
);
848 decode(cluster_snapshot
, bl
);
849 decode(new_uuid
, bl
);
850 decode(new_xinfo
, bl
);
851 decode(new_hb_front_up
, bl
);
853 decode(encode_features
, bl
);
855 encode_features
= CEPH_FEATURE_PGID64
| CEPH_FEATURE_OSDMAP_ENC
;
857 decode(new_nearfull_ratio
, bl
);
858 decode(new_full_ratio
, bl
);
860 new_nearfull_ratio
= -1;
864 decode(new_backfillfull_ratio
, bl
);
866 new_backfillfull_ratio
= -1;
872 new_require_min_compat_client
= ceph_release_from_name(r
.c_str());
876 decode(new_require_min_compat_client
, bl
);
877 decode(new_require_osd_release
, bl
);
879 if (new_flags
>= 0 && (new_flags
& CEPH_OSDMAP_REQUIRE_LUMINOUS
)) {
880 // only for compat with post-kraken pre-luminous test clusters
881 new_require_osd_release
= CEPH_RELEASE_LUMINOUS
;
882 new_flags
&= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS
);
883 } else if (new_flags
>= 0 && (new_flags
& CEPH_OSDMAP_REQUIRE_KRAKEN
)) {
884 new_require_osd_release
= CEPH_RELEASE_KRAKEN
;
885 } else if (new_flags
>= 0 && (new_flags
& CEPH_OSDMAP_REQUIRE_JEWEL
)) {
886 new_require_osd_release
= CEPH_RELEASE_JEWEL
;
888 new_require_osd_release
= -1;
892 decode(new_crush_node_flags
, bl
);
895 decode(new_device_class_flags
, bl
);
897 DECODE_FINISH(bl
); // osd-only data
902 crc_front
.substr_of(bl
.get_bl(), start_offset
, bl
.get_off() - start_offset
);
904 tail_offset
= bl
.get_off();
905 decode(full_crc
, bl
);
912 DECODE_FINISH(bl
); // wrapper
916 uint32_t actual
= crc_front
.crc32c(-1);
917 if (tail_offset
< bl
.get_off()) {
919 tail
.substr_of(bl
.get_bl(), tail_offset
, bl
.get_off() - tail_offset
);
920 actual
= tail
.crc32c(actual
);
922 if (inc_crc
!= actual
) {
924 ss
<< "bad crc, actual " << actual
<< " != expected " << inc_crc
;
926 throw buffer::malformed_input(s
.c_str());
931 void OSDMap::Incremental::dump(Formatter
*f
) const
933 f
->dump_int("epoch", epoch
);
934 f
->dump_stream("fsid") << fsid
;
935 f
->dump_stream("modified") << modified
;
936 f
->dump_stream("new_last_up_change") << new_last_up_change
;
937 f
->dump_stream("new_last_in_change") << new_last_in_change
;
938 f
->dump_int("new_pool_max", new_pool_max
);
939 f
->dump_int("new_flags", new_flags
);
940 f
->dump_float("new_full_ratio", new_full_ratio
);
941 f
->dump_float("new_nearfull_ratio", new_nearfull_ratio
);
942 f
->dump_float("new_backfillfull_ratio", new_backfillfull_ratio
);
943 f
->dump_int("new_require_min_compat_client", new_require_min_compat_client
);
944 f
->dump_int("new_require_osd_release", new_require_osd_release
);
946 if (fullmap
.length()) {
947 f
->open_object_section("full_map");
949 bufferlist fbl
= fullmap
; // kludge around constness.
950 auto p
= fbl
.cbegin();
955 if (crush
.length()) {
956 f
->open_object_section("crush");
958 bufferlist tbl
= crush
; // kludge around constness.
959 auto p
= tbl
.cbegin();
965 f
->dump_int("new_max_osd", new_max_osd
);
967 f
->open_array_section("new_pools");
969 for (const auto &new_pool
: new_pools
) {
970 f
->open_object_section("pool");
971 f
->dump_int("pool", new_pool
.first
);
972 new_pool
.second
.dump(f
);
976 f
->open_array_section("new_pool_names");
978 for (const auto &new_pool_name
: new_pool_names
) {
979 f
->open_object_section("pool_name");
980 f
->dump_int("pool", new_pool_name
.first
);
981 f
->dump_string("name", new_pool_name
.second
);
985 f
->open_array_section("old_pools");
987 for (const auto &old_pool
: old_pools
)
988 f
->dump_int("pool", old_pool
);
991 f
->open_array_section("new_up_osds");
993 for (const auto &upclient
: new_up_client
) {
994 f
->open_object_section("osd");
995 f
->dump_int("osd", upclient
.first
);
996 f
->dump_stream("public_addr") << upclient
.second
.legacy_addr();
997 f
->dump_object("public_addrs", upclient
.second
);
998 if (auto p
= new_up_cluster
.find(upclient
.first
);
999 p
!= new_up_cluster
.end()) {
1000 f
->dump_stream("cluster_addr") << p
->second
.legacy_addr();
1001 f
->dump_object("cluster_addrs", p
->second
);
1003 if (auto p
= new_hb_back_up
.find(upclient
.first
);
1004 p
!= new_hb_back_up
.end()) {
1005 f
->dump_object("heartbeat_back_addrs", p
->second
);
1007 if (auto p
= new_hb_front_up
.find(upclient
.first
);
1008 p
!= new_hb_front_up
.end()) {
1009 f
->dump_object("heartbeat_front_addrs", p
->second
);
1015 f
->open_array_section("new_weight");
1017 for (const auto &weight
: new_weight
) {
1018 f
->open_object_section("osd");
1019 f
->dump_int("osd", weight
.first
);
1020 f
->dump_int("weight", weight
.second
);
1025 f
->open_array_section("osd_state_xor");
1026 for (const auto &ns
: new_state
) {
1027 f
->open_object_section("osd");
1028 f
->dump_int("osd", ns
.first
);
1030 calc_state_set(new_state
.find(ns
.first
)->second
, st
);
1031 f
->open_array_section("state_xor");
1032 for (auto &state
: st
)
1033 f
->dump_string("state", state
);
1039 f
->open_array_section("new_pg_temp");
1041 for (const auto &pg_temp
: new_pg_temp
) {
1042 f
->open_object_section("pg");
1043 f
->dump_stream("pgid") << pg_temp
.first
;
1044 f
->open_array_section("osds");
1046 for (const auto &osd
: pg_temp
.second
)
1047 f
->dump_int("osd", osd
);
1053 f
->open_array_section("primary_temp");
1055 for (const auto &primary_temp
: new_primary_temp
) {
1056 f
->dump_stream("pgid") << primary_temp
.first
;
1057 f
->dump_int("osd", primary_temp
.second
);
1059 f
->close_section(); // primary_temp
1061 f
->open_array_section("new_pg_upmap");
1062 for (auto& i
: new_pg_upmap
) {
1063 f
->open_object_section("mapping");
1064 f
->dump_stream("pgid") << i
.first
;
1065 f
->open_array_section("osds");
1066 for (auto osd
: i
.second
) {
1067 f
->dump_int("osd", osd
);
1073 f
->open_array_section("old_pg_upmap");
1074 for (auto& i
: old_pg_upmap
) {
1075 f
->dump_stream("pgid") << i
;
1079 f
->open_array_section("new_pg_upmap_items");
1080 for (auto& i
: new_pg_upmap_items
) {
1081 f
->open_object_section("mapping");
1082 f
->dump_stream("pgid") << i
.first
;
1083 f
->open_array_section("mappings");
1084 for (auto& p
: i
.second
) {
1085 f
->open_object_section("mapping");
1086 f
->dump_int("from", p
.first
);
1087 f
->dump_int("to", p
.second
);
1094 f
->open_array_section("old_pg_upmap_items");
1095 for (auto& i
: old_pg_upmap_items
) {
1096 f
->dump_stream("pgid") << i
;
1100 f
->open_array_section("new_up_thru");
1102 for (const auto &up_thru
: new_up_thru
) {
1103 f
->open_object_section("osd");
1104 f
->dump_int("osd", up_thru
.first
);
1105 f
->dump_int("up_thru", up_thru
.second
);
1110 f
->open_array_section("new_lost");
1112 for (const auto &lost
: new_lost
) {
1113 f
->open_object_section("osd");
1114 f
->dump_int("osd", lost
.first
);
1115 f
->dump_int("epoch_lost", lost
.second
);
1120 f
->open_array_section("new_last_clean_interval");
1122 for (const auto &last_clean_interval
: new_last_clean_interval
) {
1123 f
->open_object_section("osd");
1124 f
->dump_int("osd", last_clean_interval
.first
);
1125 f
->dump_int("first", last_clean_interval
.second
.first
);
1126 f
->dump_int("last", last_clean_interval
.second
.second
);
1131 f
->open_array_section("new_blacklist");
1132 for (const auto &blist
: new_blacklist
) {
1135 f
->dump_stream(ss
.str().c_str()) << blist
.second
;
1138 f
->open_array_section("old_blacklist");
1139 for (const auto &blist
: old_blacklist
)
1140 f
->dump_stream("addr") << blist
;
1143 f
->open_array_section("new_xinfo");
1144 for (const auto &xinfo
: new_xinfo
) {
1145 f
->open_object_section("xinfo");
1146 f
->dump_int("osd", xinfo
.first
);
1147 xinfo
.second
.dump(f
);
1152 if (cluster_snapshot
.size())
1153 f
->dump_string("cluster_snapshot", cluster_snapshot
);
1155 f
->open_array_section("new_uuid");
1156 for (const auto &uuid
: new_uuid
) {
1157 f
->open_object_section("osd");
1158 f
->dump_int("osd", uuid
.first
);
1159 f
->dump_stream("uuid") << uuid
.second
;
1164 OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles
, f
);
1165 f
->open_array_section("old_erasure_code_profiles");
1166 for (const auto &erasure_code_profile
: old_erasure_code_profiles
) {
1167 f
->dump_string("old", erasure_code_profile
.c_str());
1171 f
->open_array_section("new_removed_snaps");
1172 for (auto& p
: new_removed_snaps
) {
1173 f
->open_object_section("pool");
1174 f
->dump_int("pool", p
.first
);
1175 f
->open_array_section("snaps");
1176 for (auto q
= p
.second
.begin(); q
!= p
.second
.end(); ++q
) {
1177 f
->open_object_section("interval");
1178 f
->dump_unsigned("begin", q
.get_start());
1179 f
->dump_unsigned("length", q
.get_len());
1186 f
->open_array_section("new_purged_snaps");
1187 for (auto& p
: new_purged_snaps
) {
1188 f
->open_object_section("pool");
1189 f
->dump_int("pool", p
.first
);
1190 f
->open_array_section("snaps");
1191 for (auto q
= p
.second
.begin(); q
!= p
.second
.end(); ++q
) {
1192 f
->open_object_section("interval");
1193 f
->dump_unsigned("begin", q
.get_start());
1194 f
->dump_unsigned("length", q
.get_len());
1200 f
->open_array_section("new_crush_node_flags");
1201 for (auto& i
: new_crush_node_flags
) {
1202 f
->open_object_section("node");
1203 f
->dump_int("id", i
.first
);
1205 calc_state_set(i
.second
, st
);
1206 for (auto& j
: st
) {
1207 f
->dump_string("flag", j
);
1212 f
->open_array_section("new_device_class_flags");
1213 for (auto& i
: new_device_class_flags
) {
1214 f
->open_object_section("device_class");
1215 f
->dump_int("id", i
.first
);
1217 calc_state_set(i
.second
, st
);
1218 for (auto& j
: st
) {
1219 f
->dump_string("flag", j
);
1227 void OSDMap::Incremental::generate_test_instances(list
<Incremental
*>& o
)
1229 o
.push_back(new Incremental
);
1232 // ----------------------------------
1235 void OSDMap::set_epoch(epoch_t e
)
1238 for (auto &pool
: pools
)
1239 pool
.second
.last_change
= e
;
1242 bool OSDMap::is_blacklisted(const entity_addr_t
& orig
) const
1244 if (blacklist
.empty()) {
1248 // all blacklist entries are type ANY for nautilus+
1249 // FIXME: avoid this copy!
1250 entity_addr_t a
= orig
;
1251 if (require_osd_release
< CEPH_RELEASE_NAUTILUS
) {
1252 a
.set_type(entity_addr_t::TYPE_LEGACY
);
1254 a
.set_type(entity_addr_t::TYPE_ANY
);
1257 // this specific instance?
1258 if (blacklist
.count(a
)) {
1262 // is entire ip blacklisted?
1266 if (blacklist
.count(a
)) {
1274 bool OSDMap::is_blacklisted(const entity_addrvec_t
& av
) const
1276 if (blacklist
.empty())
1279 for (auto& a
: av
.v
) {
1280 if (is_blacklisted(a
)) {
1288 void OSDMap::get_blacklist(list
<pair
<entity_addr_t
,utime_t
> > *bl
) const
1290 std::copy(blacklist
.begin(), blacklist
.end(), std::back_inserter(*bl
));
1293 void OSDMap::get_blacklist(std::set
<entity_addr_t
> *bl
) const
1295 for (const auto &i
: blacklist
) {
1296 bl
->insert(i
.first
);
1300 void OSDMap::set_max_osd(int m
)
1304 osd_state
.resize(m
);
1305 osd_weight
.resize(m
);
1306 for (; o
<max_osd
; o
++) {
1308 osd_weight
[o
] = CEPH_OSD_OUT
;
1311 osd_xinfo
.resize(m
);
1312 osd_addrs
->client_addrs
.resize(m
);
1313 osd_addrs
->cluster_addrs
.resize(m
);
1314 osd_addrs
->hb_back_addrs
.resize(m
);
1315 osd_addrs
->hb_front_addrs
.resize(m
);
1316 osd_uuid
->resize(m
);
1317 if (osd_primary_affinity
)
1318 osd_primary_affinity
->resize(m
, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
);
1323 int OSDMap::calc_num_osds()
1328 for (int i
=0; i
<max_osd
; i
++) {
1329 if (osd_state
[i
] & CEPH_OSD_EXISTS
) {
1331 if (osd_state
[i
] & CEPH_OSD_UP
) {
1334 if (get_weight(i
) != CEPH_OSD_OUT
) {
1342 void OSDMap::get_full_pools(CephContext
*cct
,
1344 set
<int64_t> *backfillfull
,
1345 set
<int64_t> *nearfull
) const
1348 ceph_assert(backfillfull
);
1349 ceph_assert(nearfull
);
1351 backfillfull
->clear();
1354 vector
<int> full_osds
;
1355 vector
<int> backfillfull_osds
;
1356 vector
<int> nearfull_osds
;
1357 for (int i
= 0; i
< max_osd
; ++i
) {
1358 if (exists(i
) && is_up(i
) && is_in(i
)) {
1359 if (osd_state
[i
] & CEPH_OSD_FULL
)
1360 full_osds
.push_back(i
);
1361 else if (osd_state
[i
] & CEPH_OSD_BACKFILLFULL
)
1362 backfillfull_osds
.push_back(i
);
1363 else if (osd_state
[i
] & CEPH_OSD_NEARFULL
)
1364 nearfull_osds
.push_back(i
);
1368 for (auto i
: full_osds
) {
1369 get_pool_ids_by_osd(cct
, i
, full
);
1371 for (auto i
: backfillfull_osds
) {
1372 get_pool_ids_by_osd(cct
, i
, backfillfull
);
1374 for (auto i
: nearfull_osds
) {
1375 get_pool_ids_by_osd(cct
, i
, nearfull
);
1379 void OSDMap::get_full_osd_counts(set
<int> *full
, set
<int> *backfill
,
1380 set
<int> *nearfull
) const
1385 for (int i
= 0; i
< max_osd
; ++i
) {
1386 if (exists(i
) && is_up(i
) && is_in(i
)) {
1387 if (osd_state
[i
] & CEPH_OSD_FULL
)
1389 else if (osd_state
[i
] & CEPH_OSD_BACKFILLFULL
)
1390 backfill
->emplace(i
);
1391 else if (osd_state
[i
] & CEPH_OSD_NEARFULL
)
1392 nearfull
->emplace(i
);
1397 void OSDMap::get_all_osds(set
<int32_t>& ls
) const
1399 for (int i
=0; i
<max_osd
; i
++)
1404 void OSDMap::get_up_osds(set
<int32_t>& ls
) const
1406 for (int i
= 0; i
< max_osd
; i
++) {
1412 void OSDMap::get_out_existing_osds(set
<int32_t>& ls
) const
1414 for (int i
= 0; i
< max_osd
; i
++) {
1415 if (exists(i
) && get_weight(i
) == CEPH_OSD_OUT
)
1420 void OSDMap::get_flag_set(set
<string
> *flagset
) const
1422 for (unsigned i
= 0; i
< sizeof(flags
) * 8; ++i
) {
1423 if (flags
& (1<<i
)) {
1424 flagset
->insert(get_flag_string(flags
& (1<<i
)));
1429 void OSDMap::calc_state_set(int state
, set
<string
>& st
)
1432 for (unsigned s
= 1; t
; s
<<= 1) {
1435 st
.insert(ceph_osd_state_name(s
));
1440 void OSDMap::adjust_osd_weights(const map
<int,double>& weights
, Incremental
& inc
) const
1443 for (const auto &weight
: weights
) {
1444 if (weight
.second
> max
)
1445 max
= weight
.second
;
1448 for (const auto &weight
: weights
) {
1449 inc
.new_weight
[weight
.first
] = (unsigned)((weight
.second
/ max
) * CEPH_OSD_IN
);
1453 int OSDMap::identify_osd(const entity_addr_t
& addr
) const
1455 for (int i
=0; i
<max_osd
; i
++)
1456 if (exists(i
) && (get_addrs(i
).contains(addr
) ||
1457 get_cluster_addrs(i
).contains(addr
)))
1462 int OSDMap::identify_osd(const uuid_d
& u
) const
1464 for (int i
=0; i
<max_osd
; i
++)
1465 if (exists(i
) && get_uuid(i
) == u
)
1470 int OSDMap::identify_osd_on_all_channels(const entity_addr_t
& addr
) const
1472 for (int i
=0; i
<max_osd
; i
++)
1473 if (exists(i
) && (get_addrs(i
).contains(addr
) ||
1474 get_cluster_addrs(i
).contains(addr
) ||
1475 get_hb_back_addrs(i
).contains(addr
) ||
1476 get_hb_front_addrs(i
).contains(addr
)))
1481 int OSDMap::find_osd_on_ip(const entity_addr_t
& ip
) const
1483 for (int i
=0; i
<max_osd
; i
++)
1484 if (exists(i
) && (get_addrs(i
).is_same_host(ip
) ||
1485 get_cluster_addrs(i
).is_same_host(ip
)))
1491 uint64_t OSDMap::get_features(int entity_type
, uint64_t *pmask
) const
1493 uint64_t features
= 0; // things we actually have
1494 uint64_t mask
= 0; // things we could have
1496 if (crush
->has_nondefault_tunables())
1497 features
|= CEPH_FEATURE_CRUSH_TUNABLES
;
1498 if (crush
->has_nondefault_tunables2())
1499 features
|= CEPH_FEATURE_CRUSH_TUNABLES2
;
1500 if (crush
->has_nondefault_tunables3())
1501 features
|= CEPH_FEATURE_CRUSH_TUNABLES3
;
1502 if (crush
->has_v4_buckets())
1503 features
|= CEPH_FEATURE_CRUSH_V4
;
1504 if (crush
->has_nondefault_tunables5())
1505 features
|= CEPH_FEATURE_CRUSH_TUNABLES5
;
1506 if (crush
->has_incompat_choose_args()) {
1507 features
|= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS
;
1509 mask
|= CEPH_FEATURES_CRUSH
;
1511 if (!pg_upmap
.empty() || !pg_upmap_items
.empty())
1512 features
|= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
;
1513 mask
|= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
;
1515 for (auto &pool
: pools
) {
1516 if (pool
.second
.has_flag(pg_pool_t::FLAG_HASHPSPOOL
)) {
1517 features
|= CEPH_FEATURE_OSDHASHPSPOOL
;
1519 if (!pool
.second
.tiers
.empty() ||
1520 pool
.second
.is_tier()) {
1521 features
|= CEPH_FEATURE_OSD_CACHEPOOL
;
1523 int ruleid
= crush
->find_rule(pool
.second
.get_crush_rule(),
1524 pool
.second
.get_type(),
1525 pool
.second
.get_size());
1527 if (crush
->is_v2_rule(ruleid
))
1528 features
|= CEPH_FEATURE_CRUSH_V2
;
1529 if (crush
->is_v3_rule(ruleid
))
1530 features
|= CEPH_FEATURE_CRUSH_TUNABLES3
;
1531 if (crush
->is_v5_rule(ruleid
))
1532 features
|= CEPH_FEATURE_CRUSH_TUNABLES5
;
1535 mask
|= CEPH_FEATURE_OSDHASHPSPOOL
| CEPH_FEATURE_OSD_CACHEPOOL
;
1537 if (osd_primary_affinity
) {
1538 for (int i
= 0; i
< max_osd
; ++i
) {
1539 if ((*osd_primary_affinity
)[i
] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
) {
1540 features
|= CEPH_FEATURE_OSD_PRIMARY_AFFINITY
;
1545 mask
|= CEPH_FEATURE_OSD_PRIMARY_AFFINITY
;
1547 if (entity_type
== CEPH_ENTITY_TYPE_OSD
) {
1548 const uint64_t jewel_features
= CEPH_FEATURE_SERVER_JEWEL
;
1549 if (require_osd_release
>= CEPH_RELEASE_JEWEL
) {
1550 features
|= jewel_features
;
1552 mask
|= jewel_features
;
1554 const uint64_t kraken_features
= CEPH_FEATUREMASK_SERVER_KRAKEN
1555 | CEPH_FEATURE_MSG_ADDR2
;
1556 if (require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
1557 features
|= kraken_features
;
1559 mask
|= kraken_features
;
1562 if (require_min_compat_client
>= CEPH_RELEASE_NAUTILUS
) {
1563 // if min_compat_client is >= nautilus, require v2 cephx signatures
1565 features
|= CEPH_FEATUREMASK_CEPHX_V2
;
1566 } else if (require_osd_release
>= CEPH_RELEASE_NAUTILUS
&&
1567 entity_type
== CEPH_ENTITY_TYPE_OSD
) {
1568 // if osds are >= nautilus, at least require the signatures from them
1569 features
|= CEPH_FEATUREMASK_CEPHX_V2
;
1571 mask
|= CEPH_FEATUREMASK_CEPHX_V2
;
1578 uint8_t OSDMap::get_min_compat_client() const
1580 uint64_t f
= get_features(CEPH_ENTITY_TYPE_CLIENT
, nullptr);
1582 if (HAVE_FEATURE(f
, OSDMAP_PG_UPMAP
) || // v12.0.0-1733-g27d6f43
1583 HAVE_FEATURE(f
, CRUSH_CHOOSE_ARGS
)) { // v12.0.1-2172-gef1ef28
1584 return CEPH_RELEASE_LUMINOUS
; // v12.2.0
1586 if (HAVE_FEATURE(f
, CRUSH_TUNABLES5
)) { // v10.0.0-612-g043a737
1587 return CEPH_RELEASE_JEWEL
; // v10.2.0
1589 if (HAVE_FEATURE(f
, CRUSH_V4
)) { // v0.91-678-g325fc56
1590 return CEPH_RELEASE_HAMMER
; // v0.94.0
1592 if (HAVE_FEATURE(f
, OSD_PRIMARY_AFFINITY
) || // v0.76-553-gf825624
1593 HAVE_FEATURE(f
, CRUSH_TUNABLES3
) || // v0.76-395-ge20a55d
1594 HAVE_FEATURE(f
, OSD_CACHEPOOL
)) { // v0.67-401-gb91c1c5
1595 return CEPH_RELEASE_FIREFLY
; // v0.80.0
1597 if (HAVE_FEATURE(f
, CRUSH_TUNABLES2
) || // v0.54-684-g0cc47ff
1598 HAVE_FEATURE(f
, OSDHASHPSPOOL
)) { // v0.57-398-g8cc2b0f
1599 return CEPH_RELEASE_DUMPLING
; // v0.67.0
1601 if (HAVE_FEATURE(f
, CRUSH_TUNABLES
)) { // v0.48argonaut-206-g6f381af
1602 return CEPH_RELEASE_ARGONAUT
; // v0.48argonaut-206-g6f381af
1604 return CEPH_RELEASE_ARGONAUT
; // v0.48argonaut-206-g6f381af
1607 uint8_t OSDMap::get_require_min_compat_client() const
1609 return require_min_compat_client
;
1612 void OSDMap::_calc_up_osd_features()
1615 cached_up_osd_features
= 0;
1616 for (int osd
= 0; osd
< max_osd
; ++osd
) {
1619 const osd_xinfo_t
&xi
= get_xinfo(osd
);
1620 if (xi
.features
== 0)
1621 continue; // bogus xinfo, maybe #20751 or similar, skipping
1623 cached_up_osd_features
= xi
.features
;
1626 cached_up_osd_features
&= xi
.features
;
1631 uint64_t OSDMap::get_up_osd_features() const
1633 return cached_up_osd_features
;
1636 void OSDMap::dedup(const OSDMap
*o
, OSDMap
*n
)
1639 if (o
->epoch
== n
->epoch
)
1645 if (o
->max_osd
!= n
->max_osd
)
1647 for (int i
= 0; i
< o
->max_osd
&& i
< n
->max_osd
; i
++) {
1648 if ( n
->osd_addrs
->client_addrs
[i
] && o
->osd_addrs
->client_addrs
[i
] &&
1649 *n
->osd_addrs
->client_addrs
[i
] == *o
->osd_addrs
->client_addrs
[i
])
1650 n
->osd_addrs
->client_addrs
[i
] = o
->osd_addrs
->client_addrs
[i
];
1653 if ( n
->osd_addrs
->cluster_addrs
[i
] && o
->osd_addrs
->cluster_addrs
[i
] &&
1654 *n
->osd_addrs
->cluster_addrs
[i
] == *o
->osd_addrs
->cluster_addrs
[i
])
1655 n
->osd_addrs
->cluster_addrs
[i
] = o
->osd_addrs
->cluster_addrs
[i
];
1658 if ( n
->osd_addrs
->hb_back_addrs
[i
] && o
->osd_addrs
->hb_back_addrs
[i
] &&
1659 *n
->osd_addrs
->hb_back_addrs
[i
] == *o
->osd_addrs
->hb_back_addrs
[i
])
1660 n
->osd_addrs
->hb_back_addrs
[i
] = o
->osd_addrs
->hb_back_addrs
[i
];
1663 if ( n
->osd_addrs
->hb_front_addrs
[i
] && o
->osd_addrs
->hb_front_addrs
[i
] &&
1664 *n
->osd_addrs
->hb_front_addrs
[i
] == *o
->osd_addrs
->hb_front_addrs
[i
])
1665 n
->osd_addrs
->hb_front_addrs
[i
] = o
->osd_addrs
->hb_front_addrs
[i
];
1670 // zoinks, no differences at all!
1671 n
->osd_addrs
= o
->osd_addrs
;
1674 // does crush match?
1676 encode(*o
->crush
, oc
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1677 encode(*n
->crush
, nc
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1678 if (oc
.contents_equal(nc
)) {
1679 n
->crush
= o
->crush
;
1682 // does pg_temp match?
1683 if (*o
->pg_temp
== *n
->pg_temp
)
1684 n
->pg_temp
= o
->pg_temp
;
1686 // does primary_temp match?
1687 if (o
->primary_temp
->size() == n
->primary_temp
->size()) {
1688 if (*o
->primary_temp
== *n
->primary_temp
)
1689 n
->primary_temp
= o
->primary_temp
;
1693 if (o
->osd_uuid
->size() == n
->osd_uuid
->size() &&
1694 *o
->osd_uuid
== *n
->osd_uuid
)
1695 n
->osd_uuid
= o
->osd_uuid
;
1698 void OSDMap::clean_temps(CephContext
*cct
,
1699 const OSDMap
& oldmap
,
1700 const OSDMap
& nextmap
,
1701 Incremental
*pending_inc
)
1703 ldout(cct
, 10) << __func__
<< dendl
;
1705 for (auto pg
: *nextmap
.pg_temp
) {
1706 // if pool does not exist, remove any existing pg_temps associated with
1707 // it. we don't care about pg_temps on the pending_inc either; if there
1708 // are new_pg_temp entries on the pending, clear them out just as well.
1709 if (!nextmap
.have_pg_pool(pg
.first
.pool())) {
1710 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
1711 << " for nonexistent pool " << pg
.first
.pool() << dendl
;
1712 pending_inc
->new_pg_temp
[pg
.first
].clear();
1716 unsigned num_up
= 0;
1717 for (auto o
: pg
.second
) {
1718 if (!nextmap
.is_down(o
)) {
1724 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
1725 << " with all down osds" << pg
.second
<< dendl
;
1726 pending_inc
->new_pg_temp
[pg
.first
].clear();
1729 // redundant pg_temp?
1732 nextmap
.pg_to_raw_up(pg
.first
, &raw_up
, &primary
);
1733 bool remove
= false;
1734 if (raw_up
== pg
.second
) {
1735 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
<< " "
1736 << pg
.second
<< " that matches raw_up mapping" << dendl
;
1739 // oversized pg_temp?
1740 if (pg
.second
.size() > nextmap
.get_pg_pool(pg
.first
.pool())->get_size()) {
1741 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
<< " "
1742 << pg
.second
<< " exceeds pool size" << dendl
;
1746 if (oldmap
.pg_temp
->count(pg
.first
))
1747 pending_inc
->new_pg_temp
[pg
.first
].clear();
1749 pending_inc
->new_pg_temp
.erase(pg
.first
);
1753 for (auto &pg
: *nextmap
.primary_temp
) {
1755 if (nextmap
.is_down(pg
.second
)) {
1756 ldout(cct
, 10) << __func__
<< " removing primary_temp " << pg
.first
1757 << " to down " << pg
.second
<< dendl
;
1758 pending_inc
->new_primary_temp
[pg
.first
] = -1;
1761 // redundant primary_temp?
1762 vector
<int> real_up
, templess_up
;
1763 int real_primary
, templess_primary
;
1764 pg_t pgid
= pg
.first
;
1765 nextmap
.pg_to_acting_osds(pgid
, &real_up
, &real_primary
);
1766 nextmap
.pg_to_raw_up(pgid
, &templess_up
, &templess_primary
);
1767 if (real_primary
== templess_primary
){
1768 ldout(cct
, 10) << __func__
<< " removing primary_temp "
1769 << pgid
<< " -> " << real_primary
1770 << " (unnecessary/redundant)" << dendl
;
1771 if (oldmap
.primary_temp
->count(pgid
))
1772 pending_inc
->new_primary_temp
[pgid
] = -1;
1774 pending_inc
->new_primary_temp
.erase(pgid
);
1779 void OSDMap::get_upmap_pgs(vector
<pg_t
> *upmap_pgs
) const
1781 upmap_pgs
->reserve(pg_upmap
.size() + pg_upmap_items
.size());
1782 for (auto& p
: pg_upmap
)
1783 upmap_pgs
->push_back(p
.first
);
1784 for (auto& p
: pg_upmap_items
)
1785 upmap_pgs
->push_back(p
.first
);
1788 bool OSDMap::check_pg_upmaps(
1790 const vector
<pg_t
>& to_check
,
1791 vector
<pg_t
> *to_cancel
,
1792 map
<pg_t
, mempool::osdmap::vector
<pair
<int,int>>> *to_remap
) const
1794 bool any_change
= false;
1795 map
<int, map
<int, float>> rule_weight_map
;
1796 for (auto& pg
: to_check
) {
1797 const pg_pool_t
*pi
= get_pg_pool(pg
.pool());
1798 if (!pi
|| pg
.ps() >= pi
->get_pg_num_pending()) {
1799 ldout(cct
, 0) << __func__
<< " pg " << pg
<< " is gone or merge source"
1801 to_cancel
->push_back(pg
);
1804 if (pi
->is_pending_merge(pg
, nullptr)) {
1805 ldout(cct
, 0) << __func__
<< " pg " << pg
<< " is pending merge"
1807 to_cancel
->push_back(pg
);
1810 vector
<int> raw
, up
;
1811 pg_to_raw_upmap(pg
, &raw
, &up
);
1812 auto crush_rule
= get_pg_pool_crush_rule(pg
);
1813 auto r
= crush
->verify_upmap(cct
,
1815 get_pg_pool_size(pg
),
1818 ldout(cct
, 0) << __func__
<< " verify_upmap of pg " << pg
1819 << " returning " << r
1821 to_cancel
->push_back(pg
);
1824 // below we check against crush-topology changing..
1825 map
<int, float> weight_map
;
1826 auto it
= rule_weight_map
.find(crush_rule
);
1827 if (it
== rule_weight_map
.end()) {
1828 auto r
= crush
->get_rule_weight_osd_map(crush_rule
, &weight_map
);
1830 lderr(cct
) << __func__
<< " unable to get crush weight_map for "
1831 << "crush_rule " << crush_rule
1835 rule_weight_map
[crush_rule
] = weight_map
;
1837 weight_map
= it
->second
;
1839 ldout(cct
, 10) << __func__
<< " pg " << pg
1840 << " weight_map " << weight_map
1842 for (auto osd
: up
) {
1843 auto it
= weight_map
.find(osd
);
1844 if (it
== weight_map
.end()) {
1845 // osd is gone or has been moved out of the specific crush-tree
1846 to_cancel
->push_back(pg
);
1849 auto adjusted_weight
= get_weightf(it
->first
) * it
->second
;
1850 if (adjusted_weight
== 0) {
1851 // osd is out/crush-out
1852 to_cancel
->push_back(pg
);
1856 if (!to_cancel
->empty() && to_cancel
->back() == pg
)
1858 // okay, upmap is valid
1859 // continue to check if it is still necessary
1860 auto i
= pg_upmap
.find(pg
);
1861 if (i
!= pg_upmap
.end() && raw
== i
->second
) {
1862 ldout(cct
, 10) << " removing redundant pg_upmap "
1863 << i
->first
<< " " << i
->second
1865 to_cancel
->push_back(pg
);
1868 auto j
= pg_upmap_items
.find(pg
);
1869 if (j
!= pg_upmap_items
.end()) {
1870 mempool::osdmap::vector
<pair
<int,int>> newmap
;
1871 for (auto& p
: j
->second
) {
1872 if (std::find(raw
.begin(), raw
.end(), p
.first
) == raw
.end()) {
1873 // cancel mapping if source osd does not exist anymore
1876 if (p
.second
!= CRUSH_ITEM_NONE
&& p
.second
< max_osd
&&
1877 p
.second
>= 0 && osd_weight
[p
.second
] == 0) {
1878 // cancel mapping if target osd is out
1881 newmap
.push_back(p
);
1883 if (newmap
.empty()) {
1884 ldout(cct
, 10) << " removing no-op pg_upmap_items "
1885 << j
->first
<< " " << j
->second
1887 to_cancel
->push_back(pg
);
1888 } else if (newmap
!= j
->second
) {
1889 ldout(cct
, 10) << " simplifying partially no-op pg_upmap_items "
1890 << j
->first
<< " " << j
->second
1893 to_remap
->insert({pg
, newmap
});
1898 any_change
= any_change
|| !to_cancel
->empty();
1902 void OSDMap::clean_pg_upmaps(
1904 Incremental
*pending_inc
,
1905 const vector
<pg_t
>& to_cancel
,
1906 const map
<pg_t
, mempool::osdmap::vector
<pair
<int,int>>>& to_remap
) const
1908 for (auto &pg
: to_cancel
) {
1909 auto i
= pending_inc
->new_pg_upmap
.find(pg
);
1910 if (i
!= pending_inc
->new_pg_upmap
.end()) {
1911 ldout(cct
, 10) << __func__
<< " cancel invalid pending "
1912 << "pg_upmap entry "
1913 << i
->first
<< "->" << i
->second
1915 pending_inc
->new_pg_upmap
.erase(i
);
1917 auto j
= pg_upmap
.find(pg
);
1918 if (j
!= pg_upmap
.end()) {
1919 ldout(cct
, 10) << __func__
<< " cancel invalid pg_upmap entry "
1920 << j
->first
<< "->" << j
->second
1922 pending_inc
->old_pg_upmap
.insert(pg
);
1924 auto p
= pending_inc
->new_pg_upmap_items
.find(pg
);
1925 if (p
!= pending_inc
->new_pg_upmap_items
.end()) {
1926 ldout(cct
, 10) << __func__
<< " cancel invalid pending "
1927 << "pg_upmap_items entry "
1928 << p
->first
<< "->" << p
->second
1930 pending_inc
->new_pg_upmap_items
.erase(p
);
1932 auto q
= pg_upmap_items
.find(pg
);
1933 if (q
!= pg_upmap_items
.end()) {
1934 ldout(cct
, 10) << __func__
<< " cancel invalid "
1935 << "pg_upmap_items entry "
1936 << q
->first
<< "->" << q
->second
1938 pending_inc
->old_pg_upmap_items
.insert(pg
);
1941 for (auto& i
: to_remap
)
1942 pending_inc
->new_pg_upmap_items
[i
.first
] = i
.second
;
1945 bool OSDMap::clean_pg_upmaps(
1947 Incremental
*pending_inc
) const
1949 ldout(cct
, 10) << __func__
<< dendl
;
1950 vector
<pg_t
> to_check
;
1951 vector
<pg_t
> to_cancel
;
1952 map
<pg_t
, mempool::osdmap::vector
<pair
<int,int>>> to_remap
;
1954 get_upmap_pgs(&to_check
);
1955 auto any_change
= check_pg_upmaps(cct
, to_check
, &to_cancel
, &to_remap
);
1956 clean_pg_upmaps(cct
, pending_inc
, to_cancel
, to_remap
);
1960 int OSDMap::apply_incremental(const Incremental
&inc
)
1962 new_blacklist_entries
= false;
1965 else if (inc
.fsid
!= fsid
)
1968 ceph_assert(inc
.epoch
== epoch
+1);
1971 modified
= inc
.modified
;
1974 if (inc
.fullmap
.length()) {
1975 bufferlist
bl(inc
.fullmap
);
1980 // nope, incremental.
1981 if (inc
.new_flags
>= 0) {
1982 flags
= inc
.new_flags
;
1983 // the below is just to cover a newly-upgraded luminous mon
1984 // cluster that has to set require_jewel_osds or
1985 // require_kraken_osds before the osds can be upgraded to
1987 if (flags
& CEPH_OSDMAP_REQUIRE_KRAKEN
) {
1988 if (require_osd_release
< CEPH_RELEASE_KRAKEN
) {
1989 require_osd_release
= CEPH_RELEASE_KRAKEN
;
1991 } else if (flags
& CEPH_OSDMAP_REQUIRE_JEWEL
) {
1992 if (require_osd_release
< CEPH_RELEASE_JEWEL
) {
1993 require_osd_release
= CEPH_RELEASE_JEWEL
;
1998 if (inc
.new_max_osd
>= 0)
1999 set_max_osd(inc
.new_max_osd
);
2001 if (inc
.new_pool_max
!= -1)
2002 pool_max
= inc
.new_pool_max
;
2004 for (const auto &pool
: inc
.new_pools
) {
2005 pools
[pool
.first
] = pool
.second
;
2006 pools
[pool
.first
].last_change
= epoch
;
2009 new_removed_snaps
= inc
.new_removed_snaps
;
2010 new_purged_snaps
= inc
.new_purged_snaps
;
2011 for (auto p
= new_removed_snaps
.begin();
2012 p
!= new_removed_snaps
.end();
2014 removed_snaps_queue
[p
->first
].union_of(p
->second
);
2016 for (auto p
= new_purged_snaps
.begin();
2017 p
!= new_purged_snaps
.end();
2019 auto q
= removed_snaps_queue
.find(p
->first
);
2020 ceph_assert(q
!= removed_snaps_queue
.end());
2021 q
->second
.subtract(p
->second
);
2022 if (q
->second
.empty()) {
2023 removed_snaps_queue
.erase(q
);
2027 if (inc
.new_last_up_change
!= utime_t()) {
2028 last_up_change
= inc
.new_last_up_change
;
2030 if (inc
.new_last_in_change
!= utime_t()) {
2031 last_in_change
= inc
.new_last_in_change
;
2034 for (const auto &pname
: inc
.new_pool_names
) {
2035 auto pool_name_entry
= pool_name
.find(pname
.first
);
2036 if (pool_name_entry
!= pool_name
.end()) {
2037 name_pool
.erase(pool_name_entry
->second
);
2038 pool_name_entry
->second
= pname
.second
;
2040 pool_name
[pname
.first
] = pname
.second
;
2042 name_pool
[pname
.second
] = pname
.first
;
2045 for (const auto &pool
: inc
.old_pools
) {
2047 name_pool
.erase(pool_name
[pool
]);
2048 pool_name
.erase(pool
);
2051 for (const auto &weight
: inc
.new_weight
) {
2052 set_weight(weight
.first
, weight
.second
);
2054 // if we are marking in, clear the AUTOOUT and NEW bits, and clear
2055 // xinfo old_weight.
2056 if (weight
.second
) {
2057 osd_state
[weight
.first
] &= ~(CEPH_OSD_AUTOOUT
| CEPH_OSD_NEW
);
2058 osd_xinfo
[weight
.first
].old_weight
= 0;
2062 for (const auto &primary_affinity
: inc
.new_primary_affinity
) {
2063 set_primary_affinity(primary_affinity
.first
, primary_affinity
.second
);
2066 // erasure_code_profiles
2067 for (const auto &profile
: inc
.old_erasure_code_profiles
)
2068 erasure_code_profiles
.erase(profile
);
2070 for (const auto &profile
: inc
.new_erasure_code_profiles
) {
2071 set_erasure_code_profile(profile
.first
, profile
.second
);
2075 for (const auto &state
: inc
.new_state
) {
2076 const auto osd
= state
.first
;
2077 int s
= state
.second
? state
.second
: CEPH_OSD_UP
;
2078 if ((osd_state
[osd
] & CEPH_OSD_UP
) &&
2079 (s
& CEPH_OSD_UP
)) {
2080 osd_info
[osd
].down_at
= epoch
;
2081 osd_xinfo
[osd
].down_stamp
= modified
;
2083 if ((osd_state
[osd
] & CEPH_OSD_EXISTS
) &&
2084 (s
& CEPH_OSD_EXISTS
)) {
2085 // osd is destroyed; clear out anything interesting.
2086 (*osd_uuid
)[osd
] = uuid_d();
2087 osd_info
[osd
] = osd_info_t();
2088 osd_xinfo
[osd
] = osd_xinfo_t();
2089 set_primary_affinity(osd
, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
);
2090 osd_addrs
->client_addrs
[osd
].reset(new entity_addrvec_t());
2091 osd_addrs
->cluster_addrs
[osd
].reset(new entity_addrvec_t());
2092 osd_addrs
->hb_front_addrs
[osd
].reset(new entity_addrvec_t());
2093 osd_addrs
->hb_back_addrs
[osd
].reset(new entity_addrvec_t());
2096 osd_state
[osd
] ^= s
;
2100 for (const auto &client
: inc
.new_up_client
) {
2101 osd_state
[client
.first
] |= CEPH_OSD_EXISTS
| CEPH_OSD_UP
;
2102 osd_addrs
->client_addrs
[client
.first
].reset(
2103 new entity_addrvec_t(client
.second
));
2104 osd_addrs
->hb_back_addrs
[client
.first
].reset(
2105 new entity_addrvec_t(inc
.new_hb_back_up
.find(client
.first
)->second
));
2106 osd_addrs
->hb_front_addrs
[client
.first
].reset(
2107 new entity_addrvec_t(inc
.new_hb_front_up
.find(client
.first
)->second
));
2109 osd_info
[client
.first
].up_from
= epoch
;
2112 for (const auto &cluster
: inc
.new_up_cluster
)
2113 osd_addrs
->cluster_addrs
[cluster
.first
].reset(
2114 new entity_addrvec_t(cluster
.second
));
2117 for (const auto &thru
: inc
.new_up_thru
)
2118 osd_info
[thru
.first
].up_thru
= thru
.second
;
2120 for (const auto &interval
: inc
.new_last_clean_interval
) {
2121 osd_info
[interval
.first
].last_clean_begin
= interval
.second
.first
;
2122 osd_info
[interval
.first
].last_clean_end
= interval
.second
.second
;
2125 for (const auto &lost
: inc
.new_lost
)
2126 osd_info
[lost
.first
].lost_at
= lost
.second
;
2129 for (const auto &xinfo
: inc
.new_xinfo
)
2130 osd_xinfo
[xinfo
.first
] = xinfo
.second
;
2133 for (const auto &uuid
: inc
.new_uuid
)
2134 (*osd_uuid
)[uuid
.first
] = uuid
.second
;
2137 for (const auto &pg
: inc
.new_pg_temp
) {
2138 if (pg
.second
.empty())
2139 pg_temp
->erase(pg
.first
);
2141 pg_temp
->set(pg
.first
, pg
.second
);
2143 if (!inc
.new_pg_temp
.empty()) {
2144 // make sure pg_temp is efficiently stored
2148 for (const auto &pg
: inc
.new_primary_temp
) {
2149 if (pg
.second
== -1)
2150 primary_temp
->erase(pg
.first
);
2152 (*primary_temp
)[pg
.first
] = pg
.second
;
2155 for (auto& p
: inc
.new_pg_upmap
) {
2156 pg_upmap
[p
.first
] = p
.second
;
2158 for (auto& pg
: inc
.old_pg_upmap
) {
2161 for (auto& p
: inc
.new_pg_upmap_items
) {
2162 pg_upmap_items
[p
.first
] = p
.second
;
2164 for (auto& pg
: inc
.old_pg_upmap_items
) {
2165 pg_upmap_items
.erase(pg
);
2169 if (!inc
.new_blacklist
.empty()) {
2170 blacklist
.insert(inc
.new_blacklist
.begin(),inc
.new_blacklist
.end());
2171 new_blacklist_entries
= true;
2173 for (const auto &addr
: inc
.old_blacklist
)
2174 blacklist
.erase(addr
);
2176 for (auto& i
: inc
.new_crush_node_flags
) {
2178 crush_node_flags
[i
.first
] = i
.second
;
2180 crush_node_flags
.erase(i
.first
);
2184 for (auto& i
: inc
.new_device_class_flags
) {
2186 device_class_flags
[i
.first
] = i
.second
;
2188 device_class_flags
.erase(i
.first
);
2192 // cluster snapshot?
2193 if (inc
.cluster_snapshot
.length()) {
2194 cluster_snapshot
= inc
.cluster_snapshot
;
2195 cluster_snapshot_epoch
= inc
.epoch
;
2197 cluster_snapshot
.clear();
2198 cluster_snapshot_epoch
= 0;
2201 if (inc
.new_nearfull_ratio
>= 0) {
2202 nearfull_ratio
= inc
.new_nearfull_ratio
;
2204 if (inc
.new_backfillfull_ratio
>= 0) {
2205 backfillfull_ratio
= inc
.new_backfillfull_ratio
;
2207 if (inc
.new_full_ratio
>= 0) {
2208 full_ratio
= inc
.new_full_ratio
;
2210 if (inc
.new_require_min_compat_client
> 0) {
2211 require_min_compat_client
= inc
.new_require_min_compat_client
;
2213 if (inc
.new_require_osd_release
>= 0) {
2214 require_osd_release
= inc
.new_require_osd_release
;
2215 if (require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
2216 flags
&= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS
);
2217 flags
|= CEPH_OSDMAP_RECOVERY_DELETES
;
2221 if (inc
.new_require_osd_release
>= 0) {
2222 require_osd_release
= inc
.new_require_osd_release
;
2223 if (require_osd_release
>= CEPH_RELEASE_NAUTILUS
) {
2224 flags
|= CEPH_OSDMAP_PGLOG_HARDLIMIT
;
2227 // do new crush map last (after up/down stuff)
2228 if (inc
.crush
.length()) {
2229 bufferlist
bl(inc
.crush
);
2230 auto blp
= bl
.cbegin();
2231 crush
.reset(new CrushWrapper
);
2233 if (require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
2234 // only increment if this is a luminous-encoded osdmap, lest
2235 // the mon's crush_version diverge from what the osds or others
2236 // are decoding and applying on their end. if we won't encode
2237 // it in the canonical version, don't change it.
2240 for (auto it
= device_class_flags
.begin();
2241 it
!= device_class_flags
.end();) {
2242 const char* class_name
= crush
->get_class_name(it
->first
);
2243 if (!class_name
) // device class is gone
2244 it
= device_class_flags
.erase(it
);
2251 _calc_up_osd_features();
2256 int OSDMap::map_to_pg(
2260 const string
& nspace
,
2263 // calculate ps (placement seed)
2264 const pg_pool_t
*pool
= get_pg_pool(poolid
);
2269 ps
= pool
->hash_key(key
, nspace
);
2271 ps
= pool
->hash_key(name
, nspace
);
2272 *pg
= pg_t(ps
, poolid
);
2276 int OSDMap::object_locator_to_pg(
2277 const object_t
& oid
, const object_locator_t
& loc
, pg_t
&pg
) const
2279 if (loc
.hash
>= 0) {
2280 if (!get_pg_pool(loc
.get_pool())) {
2283 pg
= pg_t(loc
.hash
, loc
.get_pool());
2286 return map_to_pg(loc
.get_pool(), oid
.name
, loc
.key
, loc
.nspace
, &pg
);
2289 ceph_object_layout
OSDMap::make_object_layout(
2290 object_t oid
, int pg_pool
, string nspace
) const
2292 object_locator_t
loc(pg_pool
, nspace
);
2294 ceph_object_layout ol
;
2295 pg_t pgid
= object_locator_to_pg(oid
, loc
);
2296 ol
.ol_pgid
= pgid
.get_old_pg().v
;
2297 ol
.ol_stripe_unit
= 0;
2301 void OSDMap::_remove_nonexistent_osds(const pg_pool_t
& pool
,
2302 vector
<int>& osds
) const
2304 if (pool
.can_shift_osds()) {
2305 unsigned removed
= 0;
2306 for (unsigned i
= 0; i
< osds
.size(); i
++) {
2307 if (!exists(osds
[i
])) {
2312 osds
[i
- removed
] = osds
[i
];
2316 osds
.resize(osds
.size() - removed
);
2318 for (auto& osd
: osds
) {
2320 osd
= CRUSH_ITEM_NONE
;
2325 void OSDMap::_pg_to_raw_osds(
2326 const pg_pool_t
& pool
, pg_t pg
,
2331 ps_t pps
= pool
.raw_pg_to_pps(pg
); // placement ps
2332 unsigned size
= pool
.get_size();
2335 int ruleno
= crush
->find_rule(pool
.get_crush_rule(), pool
.get_type(), size
);
2337 crush
->do_rule(ruleno
, pps
, *osds
, size
, osd_weight
, pg
.pool());
2339 _remove_nonexistent_osds(pool
, *osds
);
2345 int OSDMap::_pick_primary(const vector
<int>& osds
) const
2347 for (auto osd
: osds
) {
2348 if (osd
!= CRUSH_ITEM_NONE
) {
2355 void OSDMap::_apply_upmap(const pg_pool_t
& pi
, pg_t raw_pg
, vector
<int> *raw
) const
2357 pg_t pg
= pi
.raw_pg_to_pg(raw_pg
);
2358 auto p
= pg_upmap
.find(pg
);
2359 if (p
!= pg_upmap
.end()) {
2360 // make sure targets aren't marked out
2361 for (auto osd
: p
->second
) {
2362 if (osd
!= CRUSH_ITEM_NONE
&& osd
< max_osd
&& osd
>= 0 &&
2363 osd_weight
[osd
] == 0) {
2364 // reject/ignore the explicit mapping
2368 *raw
= vector
<int>(p
->second
.begin(), p
->second
.end());
2369 // continue to check and apply pg_upmap_items if any
2372 auto q
= pg_upmap_items
.find(pg
);
2373 if (q
!= pg_upmap_items
.end()) {
2374 // NOTE: this approach does not allow a bidirectional swap,
2375 // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
2376 for (auto& r
: q
->second
) {
2377 // make sure the replacement value doesn't already appear
2378 bool exists
= false;
2380 for (unsigned i
= 0; i
< raw
->size(); ++i
) {
2381 int osd
= (*raw
)[i
];
2382 if (osd
== r
.second
) {
2386 // ignore mapping if target is marked out (or invalid osd id)
2387 if (osd
== r
.first
&&
2389 !(r
.second
!= CRUSH_ITEM_NONE
&& r
.second
< max_osd
&&
2390 r
.second
>= 0 && osd_weight
[r
.second
] == 0)) {
2394 if (!exists
&& pos
>= 0) {
2395 (*raw
)[pos
] = r
.second
;
2401 // pg -> (up osd list)
2402 void OSDMap::_raw_to_up_osds(const pg_pool_t
& pool
, const vector
<int>& raw
,
2403 vector
<int> *up
) const
2405 if (pool
.can_shift_osds()) {
2408 up
->reserve(raw
.size());
2409 for (unsigned i
=0; i
<raw
.size(); i
++) {
2410 if (!exists(raw
[i
]) || is_down(raw
[i
]))
2412 up
->push_back(raw
[i
]);
2415 // set down/dne devices to NONE
2416 up
->resize(raw
.size());
2417 for (int i
= raw
.size() - 1; i
>= 0; --i
) {
2418 if (!exists(raw
[i
]) || is_down(raw
[i
])) {
2419 (*up
)[i
] = CRUSH_ITEM_NONE
;
2427 void OSDMap::_apply_primary_affinity(ps_t seed
,
2428 const pg_pool_t
& pool
,
2432 // do we have any non-default primary_affinity values for these osds?
2433 if (!osd_primary_affinity
)
2437 for (const auto osd
: *osds
) {
2438 if (osd
!= CRUSH_ITEM_NONE
&&
2439 (*osd_primary_affinity
)[osd
] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
) {
2447 // pick the primary. feed both the seed (for the pg) and the osd
2448 // into the hash/rng so that a proportional fraction of an osd's pgs
2449 // get rejected as primary.
2451 for (unsigned i
= 0; i
< osds
->size(); ++i
) {
2453 if (o
== CRUSH_ITEM_NONE
)
2455 unsigned a
= (*osd_primary_affinity
)[o
];
2456 if (a
< CEPH_OSD_MAX_PRIMARY_AFFINITY
&&
2457 (crush_hash32_2(CRUSH_HASH_RJENKINS1
,
2458 seed
, o
) >> 16) >= a
) {
2459 // we chose not to use this primary. note it anyway as a
2460 // fallback in case we don't pick anyone else, but keep looking.
2471 *primary
= (*osds
)[pos
];
2473 if (pool
.can_shift_osds() && pos
> 0) {
2474 // move the new primary to the front.
2475 for (int i
= pos
; i
> 0; --i
) {
2476 (*osds
)[i
] = (*osds
)[i
-1];
2478 (*osds
)[0] = *primary
;
2482 void OSDMap::_get_temp_osds(const pg_pool_t
& pool
, pg_t pg
,
2483 vector
<int> *temp_pg
, int *temp_primary
) const
2485 pg
= pool
.raw_pg_to_pg(pg
);
2486 const auto p
= pg_temp
->find(pg
);
2488 if (p
!= pg_temp
->end()) {
2489 for (unsigned i
=0; i
<p
->second
.size(); i
++) {
2490 if (!exists(p
->second
[i
]) || is_down(p
->second
[i
])) {
2491 if (pool
.can_shift_osds()) {
2494 temp_pg
->push_back(CRUSH_ITEM_NONE
);
2497 temp_pg
->push_back(p
->second
[i
]);
2501 const auto &pp
= primary_temp
->find(pg
);
2503 if (pp
!= primary_temp
->end()) {
2504 *temp_primary
= pp
->second
;
2505 } else if (!temp_pg
->empty()) { // apply pg_temp's primary
2506 for (unsigned i
= 0; i
< temp_pg
->size(); ++i
) {
2507 if ((*temp_pg
)[i
] != CRUSH_ITEM_NONE
) {
2508 *temp_primary
= (*temp_pg
)[i
];
2515 void OSDMap::pg_to_raw_osds(pg_t pg
, vector
<int> *raw
, int *primary
) const
2517 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
2523 _pg_to_raw_osds(*pool
, pg
, raw
, NULL
);
2524 *primary
= _pick_primary(*raw
);
2527 void OSDMap::pg_to_raw_upmap(pg_t pg
, vector
<int>*raw
,
2528 vector
<int> *raw_upmap
) const
2530 auto pool
= get_pg_pool(pg
.pool());
2535 _pg_to_raw_osds(*pool
, pg
, raw
, NULL
);
2537 _apply_upmap(*pool
, pg
, raw_upmap
);
2540 void OSDMap::pg_to_raw_up(pg_t pg
, vector
<int> *up
, int *primary
) const
2542 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
2550 _pg_to_raw_osds(*pool
, pg
, &raw
, &pps
);
2551 _apply_upmap(*pool
, pg
, &raw
);
2552 _raw_to_up_osds(*pool
, raw
, up
);
2553 *primary
= _pick_primary(raw
);
2554 _apply_primary_affinity(pps
, *pool
, up
, primary
);
2557 void OSDMap::_pg_to_up_acting_osds(
2558 const pg_t
& pg
, vector
<int> *up
, int *up_primary
,
2559 vector
<int> *acting
, int *acting_primary
,
2560 bool raw_pg_to_pg
) const
2562 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
2564 (!raw_pg_to_pg
&& pg
.ps() >= pool
->get_pg_num())) {
2572 *acting_primary
= -1;
2577 vector
<int> _acting
;
2579 int _acting_primary
;
2581 _get_temp_osds(*pool
, pg
, &_acting
, &_acting_primary
);
2582 if (_acting
.empty() || up
|| up_primary
) {
2583 _pg_to_raw_osds(*pool
, pg
, &raw
, &pps
);
2584 _apply_upmap(*pool
, pg
, &raw
);
2585 _raw_to_up_osds(*pool
, raw
, &_up
);
2586 _up_primary
= _pick_primary(_up
);
2587 _apply_primary_affinity(pps
, *pool
, &_up
, &_up_primary
);
2588 if (_acting
.empty()) {
2590 if (_acting_primary
== -1) {
2591 _acting_primary
= _up_primary
;
2598 *up_primary
= _up_primary
;
2602 acting
->swap(_acting
);
2604 *acting_primary
= _acting_primary
;
2607 int OSDMap::calc_pg_rank(int osd
, const vector
<int>& acting
, int nrep
)
2610 nrep
= acting
.size();
2611 for (int i
=0; i
<nrep
; i
++)
2612 if (acting
[i
] == osd
)
2617 int OSDMap::calc_pg_role(int osd
, const vector
<int>& acting
, int nrep
)
2619 return calc_pg_rank(osd
, acting
, nrep
);
2622 bool OSDMap::primary_changed(
2624 const vector
<int> &oldacting
,
2626 const vector
<int> &newacting
)
2628 if (oldacting
.empty() && newacting
.empty())
2629 return false; // both still empty
2630 if (oldacting
.empty() ^ newacting
.empty())
2631 return true; // was empty, now not, or vice versa
2632 if (oldprimary
!= newprimary
)
2633 return true; // primary changed
2634 if (calc_pg_rank(oldprimary
, oldacting
) !=
2635 calc_pg_rank(newprimary
, newacting
))
2637 return false; // same primary (tho replicas may have changed)
2640 uint64_t OSDMap::get_encoding_features() const
2642 uint64_t f
= SIGNIFICANT_FEATURES
;
2643 if (require_osd_release
< CEPH_RELEASE_NAUTILUS
) {
2644 f
&= ~CEPH_FEATURE_SERVER_NAUTILUS
;
2646 if (require_osd_release
< CEPH_RELEASE_MIMIC
) {
2647 f
&= ~CEPH_FEATURE_SERVER_MIMIC
;
2649 if (require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
2650 f
&= ~(CEPH_FEATURE_SERVER_LUMINOUS
|
2651 CEPH_FEATURE_CRUSH_CHOOSE_ARGS
);
2653 if (require_osd_release
< CEPH_RELEASE_KRAKEN
) {
2654 f
&= ~(CEPH_FEATURE_SERVER_KRAKEN
|
2655 CEPH_FEATURE_MSG_ADDR2
);
2657 if (require_osd_release
< CEPH_RELEASE_JEWEL
) {
2658 f
&= ~(CEPH_FEATURE_SERVER_JEWEL
|
2659 CEPH_FEATURE_NEW_OSDOP_ENCODING
|
2660 CEPH_FEATURE_CRUSH_TUNABLES5
);
2665 // serialize, unserialize
2666 void OSDMap::encode_client_old(bufferlist
& bl
) const
2675 encode(created
, bl
);
2676 encode(modified
, bl
);
2678 // for encode(pools, bl);
2679 __u32 n
= pools
.size();
2682 for (const auto &pool
: pools
) {
2685 encode(pool
.second
, bl
, 0);
2687 // for encode(pool_name, bl);
2688 n
= pool_name
.size();
2690 for (const auto &pname
: pool_name
) {
2693 encode(pname
.second
, bl
);
2695 // for encode(pool_max, bl);
2701 encode(max_osd
, bl
);
2703 uint32_t n
= osd_state
.size();
2705 for (auto s
: osd_state
) {
2706 encode((uint8_t)s
, bl
);
2709 encode(osd_weight
, bl
);
2710 encode(osd_addrs
->client_addrs
, bl
, 0);
2712 // for encode(pg_temp, bl);
2713 n
= pg_temp
->size();
2715 for (const auto pg
: *pg_temp
) {
2716 old_pg_t opg
= pg
.first
.get_old_pg();
2718 encode(pg
.second
, bl
);
2723 crush
->encode(cbl
, 0 /* legacy (no) features */);
// Classic (pre-OSDMAP_ENC) encoder.  When features lacks
// CEPH_FEATURE_PGID64 it calls encode_client_old(bl); the remaining
// visible statements write the client fields and then the extended
// fields, passing features to the encoders that take a feature mask.
// The crush map is still encoded with a feature mask of 0.
// NOTE(review): the listing skips some original lines (see the gaps in
// the embedded numbering); only visible statements are described.
2727 void OSDMap::encode_classic(bufferlist
& bl
, uint64_t features
) const
2730 if ((features
& CEPH_FEATURE_PGID64
) == 0) {
2731 encode_client_old(bl
);
2741 encode(created
, bl
);
2742 encode(modified
, bl
);
2744 encode(pools
, bl
, features
);
2745 encode(pool_name
, bl
);
2746 encode(pool_max
, bl
);
2750 encode(max_osd
, bl
);
// osd_state entries are written one at a time, each narrowed to uint8_t
2752 uint32_t n
= osd_state
.size();
2754 for (auto s
: osd_state
) {
2755 encode((uint8_t)s
, bl
);
2758 encode(osd_weight
, bl
);
2759 encode(osd_addrs
->client_addrs
, bl
, features
);
2761 encode(*pg_temp
, bl
);
2765 crush
->encode(cbl
, 0 /* legacy (no) features */);
// extended fields follow
2771 encode(osd_addrs
->hb_back_addrs
, bl
, features
);
2772 encode(osd_info
, bl
);
2773 encode(blacklist
, bl
, features
);
2774 encode(osd_addrs
->cluster_addrs
, bl
, features
);
2775 encode(cluster_snapshot_epoch
, bl
);
2776 encode(cluster_snapshot
, bl
);
2777 encode(*osd_uuid
, bl
);
2778 encode(osd_xinfo
, bl
);
2779 encode(osd_addrs
->hb_front_addrs
, bl
, features
);
/* for a description of osdmap versions, and when they were introduced, please
 * see doc/dev/osd_internals/osdmap_versions.txt
 */
// Encode this map into bl for a peer with the given features.
//
// Visible flow:
//  - without CEPH_FEATURE_OSDMAP_ENC, encode_classic() is called;
//  - the caller must have set CEPH_FEATURE_RESERVED (asserted), which is
//    then cleared from features;
//  - a wrapper section (ENCODE_START(8, 7, bl)) encloses a "client-usable
//    data" section and an "extended, osd-only data" section;
//  - a uint32_t-sized hole is reserved for the crc, which is computed
//    over the bytes before the hole and any bytes after it, and then
//    copied into the hole.
// NOTE(review): the listing skips some original lines (e.g. the bodies
// of the HAVE_FEATURE branches that pick the section versions); only
// visible statements are described.
2786 void OSDMap::encode(bufferlist
& bl
, uint64_t features
) const
2789 if ((features
& CEPH_FEATURE_OSDMAP_ENC
) == 0) {
2790 encode_classic(bl
, features
);
2794 // only a select set of callers should *ever* be encoding new
2795 // OSDMaps. others should be passing around the canonical encoded
2796 // buffers from on high. select out those callers by passing in an
2797 // "impossible" feature bit.
2798 ceph_assert(features
& CEPH_FEATURE_RESERVED
);
2799 features
&= ~CEPH_FEATURE_RESERVED
;
// remember where this encoding begins in bl; used later as the start of
// the crc'd front region
2801 size_t start_offset
= bl
.length();
2804 std::optional
<buffer::list::contiguous_filler
> crc_filler
;
2806 // meta-encoding: how we include client-used and osd-specific data
2807 ENCODE_START(8, 7, bl
);
2810 // NOTE: any new encoding dependencies must be reflected by
2811 // SIGNIFICANT_FEATURES
2813 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
2815 } else if (!HAVE_FEATURE(features
, SERVER_MIMIC
)) {
2817 } else if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
2820 ENCODE_START(v
, 1, bl
); // client-usable data
2824 encode(created
, bl
);
2825 encode(modified
, bl
);
2827 encode(pools
, bl
, features
);
2828 encode(pool_name
, bl
);
2829 encode(pool_max
, bl
);
// work on a copy of flags and add the legacy REQUIRE_* bits that
// correspond to require_osd_release
2832 decltype(flags
) f
= flags
;
2833 if (require_osd_release
>= CEPH_RELEASE_LUMINOUS
)
2834 f
|= CEPH_OSDMAP_REQUIRE_LUMINOUS
| CEPH_OSDMAP_RECOVERY_DELETES
;
2835 else if (require_osd_release
== CEPH_RELEASE_KRAKEN
)
2836 f
|= CEPH_OSDMAP_REQUIRE_KRAKEN
;
2837 else if (require_osd_release
== CEPH_RELEASE_JEWEL
)
2838 f
|= CEPH_OSDMAP_REQUIRE_JEWEL
;
2844 encode(max_osd
, bl
);
// NOTE(review): osd_state is encoded both as a whole container and entry
// by entry as uint8_t below; presumably these are alternative branches
// selected by the section version - the condition is not visible here.
2846 encode(osd_state
, bl
);
2848 uint32_t n
= osd_state
.size();
2850 for (auto s
: osd_state
) {
2851 encode((uint8_t)s
, bl
);
2854 encode(osd_weight
, bl
);
// NOTE(review): two encodings of client_addrs follow (addrvec form and
// "as addr" form); the branch choosing between them is not visible here.
2856 encode(osd_addrs
->client_addrs
, bl
, features
);
2858 encode_addrvec_pvec_as_addr(osd_addrs
->client_addrs
, bl
, features
);
2861 encode(*pg_temp
, bl
);
2862 encode(*primary_temp
, bl
);
2863 if (osd_primary_affinity
) {
2864 encode(*osd_primary_affinity
, bl
);
2872 crush
->encode(cbl
, features
);
2874 encode(erasure_code_profiles
, bl
);
2877 encode(pg_upmap
, bl
);
2878 encode(pg_upmap_items
, bl
);
// NOTE(review): presumably the branch for encodings that cannot carry
// upmap tables, where both tables are required to be empty - confirm.
2880 ceph_assert(pg_upmap
.empty());
2881 ceph_assert(pg_upmap_items
.empty());
2884 encode(crush_version
, bl
);
2887 encode(new_removed_snaps
, bl
);
2888 encode(new_purged_snaps
, bl
);
2891 encode(last_up_change
, bl
);
2892 encode(last_in_change
, bl
);
2894 ENCODE_FINISH(bl
); // client-usable data
2898 // NOTE: any new encoding dependencies must be reflected by
2899 // SIGNIFICANT_FEATURES
// target_v is the version of the osd-only section; it starts at 9 and the
// feature checks below may adjust it (their bodies are not visible here)
2900 uint8_t target_v
= 9;
2901 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
2903 } else if (!HAVE_FEATURE(features
, SERVER_MIMIC
)) {
2905 } else if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
2908 ENCODE_START(target_v
, 1, bl
); // extended, osd-only data
2910 encode_addrvec_pvec_as_addr(osd_addrs
->hb_back_addrs
, bl
, features
);
2912 encode(osd_addrs
->hb_back_addrs
, bl
, features
);
2914 encode(osd_info
, bl
);
2916 // put this in a sorted, ordered map<> so that we encode in a
2917 // deterministic order.
2918 map
<entity_addr_t
,utime_t
> blacklist_map
;
2919 for (const auto &addr
: blacklist
)
2920 blacklist_map
.insert(make_pair(addr
.first
, addr
.second
));
2921 encode(blacklist_map
, bl
, features
);
2924 encode_addrvec_pvec_as_addr(osd_addrs
->cluster_addrs
, bl
, features
);
2926 encode(osd_addrs
->cluster_addrs
, bl
, features
);
2928 encode(cluster_snapshot_epoch
, bl
);
2929 encode(cluster_snapshot
, bl
);
2930 encode(*osd_uuid
, bl
);
2931 encode(osd_xinfo
, bl
);
2933 encode_addrvec_pvec_as_addr(osd_addrs
->hb_front_addrs
, bl
, features
);
2935 encode(osd_addrs
->hb_front_addrs
, bl
, features
);
// fields below are gated on the osd-only section version
2937 if (target_v
>= 2) {
2938 encode(nearfull_ratio
, bl
);
2939 encode(full_ratio
, bl
);
2940 encode(backfillfull_ratio
, bl
);
2942 // 4 was string-based new_require_min_compat_client
2943 if (target_v
>= 5) {
2944 encode(require_min_compat_client
, bl
);
2945 encode(require_osd_release
, bl
);
2947 if (target_v
>= 6) {
2948 encode(removed_snaps_queue
, bl
);
2950 if (target_v
>= 8) {
2951 encode(crush_node_flags
, bl
);
2953 if (target_v
>= 9) {
2954 encode(device_class_flags
, bl
);
2956 ENCODE_FINISH(bl
); // osd-only data
// reserve a uint32_t-sized hole for the crc and record the offsets on
// either side of it
2959 crc_offset
= bl
.length();
2960 crc_filler
= bl
.append_hole(sizeof(uint32_t));
2961 tail_offset
= bl
.length();
2963 ENCODE_FINISH(bl
); // meta-encoding wrapper
// crc covers the bytes from start_offset up to the hole ...
2967 front
.substr_of(bl
, start_offset
, crc_offset
- start_offset
);
2968 crc
= front
.crc32c(-1);
// ... continued over any bytes that follow the hole
2969 if (tail_offset
< bl
.length()) {
2971 tail
.substr_of(bl
, tail_offset
, bl
.length() - tail_offset
);
2972 crc
= tail
.crc32c(crc
);
// copy the 4-byte crc value into the reserved hole
2976 crc_filler
->copy_in(4, (char*)&crc_le
);
/* for a description of osdmap versions, and when they were introduced, please
 * see doc/dev/osd_internals/osdmap_versions.txt
 */
2984 void OSDMap::decode(bufferlist
& bl
)
2986 auto p
= bl
.cbegin();
// Decoder for the classic (pre-wrapper) encoding, reading from p.
// The visible statements decode the client fields, the crush map and the
// extended fields, and resize several per-osd containers so that they
// line up with related containers or with max_osd.
// NOTE(review): the listing skips many original lines here (version
// checks, loop headers, local declarations); only visible statements are
// described, and the conditions guarding them are mostly not visible.
2990 void OSDMap::decode_classic(bufferlist::const_iterator
& p
)
3001 decode(modified
, p
);
// an int32_t pool count is decoded and stored into pool_max
3005 int32_t max_pools
= 0;
3006 decode(max_pools
, p
);
3007 pool_max
= max_pools
;
3013 decode(pools
[t
], p
);
3018 } else if (v
== 5) {
3023 decode(pool_name
[t
], p
);
3030 decode(pool_name
, p
);
3031 decode(pool_max
, p
);
3033 // kludge around some old bug that zeroed out pool_max (#2307)
3034 if (pools
.size() && pool_max
< pools
.rbegin()->first
) {
3035 pool_max
= pools
.rbegin()->first
;
// copy the per-osd state values from 'os' into osd_state element by element
3044 osd_state
.resize(os
.size());
3045 for (unsigned i
= 0; i
< os
.size(); ++i
) {
3046 osd_state
[i
] = os
[i
];
3049 decode(osd_weight
, p
);
3050 decode(osd_addrs
->client_addrs
, p
);
// old-style pg ids are read raw and stored under the converted pg_t key
3056 ::decode_raw(opg
, p
);
3057 mempool::osdmap::vector
<int32_t> v
;
3059 pg_temp
->set(pg_t(opg
), v
);
3062 decode(*pg_temp
, p
);
3068 auto cblp
= cbl
.cbegin();
3069 crush
->decode(cblp
);
3075 decode(osd_addrs
->hb_back_addrs
, p
);
3076 decode(osd_info
, p
);
3078 decode(pool_name
, p
);
3080 decode(blacklist
, p
);
3082 decode(osd_addrs
->cluster_addrs
, p
);
// size cluster_addrs to match client_addrs
3084 osd_addrs
->cluster_addrs
.resize(osd_addrs
->client_addrs
.size());
3087 decode(cluster_snapshot_epoch
, p
);
3088 decode(cluster_snapshot
, p
);
3092 decode(*osd_uuid
, p
);
3094 osd_uuid
->resize(max_osd
);
3097 decode(osd_xinfo
, p
);
3099 osd_xinfo
.resize(max_osd
);
3102 decode(osd_addrs
->hb_front_addrs
, p
);
// size hb_front_addrs to match hb_back_addrs
3104 osd_addrs
->hb_front_addrs
.resize(osd_addrs
->hb_back_addrs
.size());
// the classic path leaves no primary affinity vector set
3106 osd_primary_affinity
.reset();
// Decode this map from the iterator bl.
//
// Visible flow:
//  - a wrapper section is opened with DECODE_START_LEGACY_COMPAT_LEN;
//    the iterator can be rewound to start_offset (per the comment below,
//    for struct_v < 7 encodings);
//  - the "client-usable data" section is decoded, with newer fields
//    gated on struct_v and older encodings getting cleared/reset values;
//  - the "extended, osd-only data" section is decoded the same way, and
//    require_osd_release / flags are reconciled with each other;
//  - the crc is recomputed over the bytes around the stored crc and a
//    mismatch is thrown as buffer::malformed_input.
// NOTE(review): the listing skips many original lines (else-branches,
// local declarations, the crc read itself); only visible statements are
// described.
3111 void OSDMap::decode(bufferlist::const_iterator
& bl
)
3115 * Older encodings of the OSDMap had a single struct_v which
3116 * covered the whole encoding, and was prior to our modern
3117 * stuff which includes a compatv and a size. So if we see
3118 * a struct_v < 7, we must rewind to the beginning and use our
// offsets used later to delimit the crc'd regions
3121 size_t start_offset
= bl
.get_off();
3122 size_t tail_offset
= 0;
3123 bufferlist crc_front
, crc_tail
;
3125 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl
); // wrapper
// rewind to where this encoding started
3127 bl
.seek(start_offset
);
3132 * Since we made it past that hurdle, we can use our normal paths.
3135 DECODE_START(9, bl
); // client-usable data
3139 decode(created
, bl
);
3140 decode(modified
, bl
);
3143 decode(pool_name
, bl
);
3144 decode(pool_max
, bl
);
3148 decode(max_osd
, bl
);
3149 if (struct_v
>= 5) {
3150 decode(osd_state
, bl
);
// otherwise the state values come from 'os' and are copied element by element
3154 osd_state
.resize(os
.size());
3155 for (unsigned i
= 0; i
< os
.size(); ++i
) {
3156 osd_state
[i
] = os
[i
];
3159 decode(osd_weight
, bl
);
3160 decode(osd_addrs
->client_addrs
, bl
);
3162 decode(*pg_temp
, bl
);
3163 decode(*primary_temp
, bl
);
3164 // dates back to firefly. version increased from 2 to 3 still in firefly.
3165 // do we really still need to keep this around? even for old clients?
3166 if (struct_v
>= 2) {
// an empty decoded affinity vector is dropped again
3167 osd_primary_affinity
.reset(new mempool::osdmap::vector
<__u32
>);
3168 decode(*osd_primary_affinity
, bl
);
3169 if (osd_primary_affinity
->empty())
3170 osd_primary_affinity
.reset();
3172 osd_primary_affinity
.reset();
3178 auto cblp
= cbl
.cbegin();
3179 crush
->decode(cblp
);
3180 // added in firefly; version increased in luminous, so it affects
3181 // giant, hammer, infernallis, jewel, and kraken. probably should be left
3182 // alone until we require clients to be all luminous?
3183 if (struct_v
>= 3) {
3184 decode(erasure_code_profiles
, bl
);
3186 erasure_code_profiles
.clear();
3188 // version increased from 3 to 4 still in luminous, so same as above
3190 if (struct_v
>= 4) {
3191 decode(pg_upmap
, bl
);
3192 decode(pg_upmap_items
, bl
);
3195 pg_upmap_items
.clear();
3197 // again, version increased from 5 to 6 still in luminous, so above
3199 if (struct_v
>= 6) {
3200 decode(crush_version
, bl
);
3202 // version increase from 6 to 7 in mimic
3203 if (struct_v
>= 7) {
3204 decode(new_removed_snaps
, bl
);
3205 decode(new_purged_snaps
, bl
);
3207 // version increase from 7 to 8, 8 to 9, in nautilus.
3208 if (struct_v
>= 9) {
3209 decode(last_up_change
, bl
);
3210 decode(last_in_change
, bl
);
3212 DECODE_FINISH(bl
); // client-usable data
3216 DECODE_START(9, bl
); // extended, osd-only data
3217 decode(osd_addrs
->hb_back_addrs
, bl
);
3218 decode(osd_info
, bl
);
3219 decode(blacklist
, bl
);
3220 decode(osd_addrs
->cluster_addrs
, bl
);
3221 decode(cluster_snapshot_epoch
, bl
);
3222 decode(cluster_snapshot
, bl
);
3223 decode(*osd_uuid
, bl
);
3224 decode(osd_xinfo
, bl
);
3225 decode(osd_addrs
->hb_front_addrs
, bl
);
3227 if (struct_v
>= 2) {
3228 decode(nearfull_ratio
, bl
);
3229 decode(full_ratio
, bl
);
3234 if (struct_v
>= 3) {
3235 decode(backfillfull_ratio
, bl
);
3237 backfillfull_ratio
= 0;
// version 4 carried require_min_compat_client as a release name string
3239 if (struct_v
== 4) {
3243 require_min_compat_client
= ceph_release_from_name(r
.c_str());
3245 if (struct_v
>= 5) {
3246 decode(require_min_compat_client
, bl
);
3247 decode(require_osd_release
, bl
);
// derive flag bits from the decoded require_osd_release
3248 if (require_osd_release
>= CEPH_RELEASE_NAUTILUS
) {
3249 flags
|= CEPH_OSDMAP_PGLOG_HARDLIMIT
;
3251 if (require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
3252 flags
&= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS
);
3253 flags
|= CEPH_OSDMAP_RECOVERY_DELETES
;
// conversely, derive require_osd_release from the legacy REQUIRE_* flags
3256 if (flags
& CEPH_OSDMAP_REQUIRE_LUMINOUS
) {
3257 // only for compat with post-kraken pre-luminous test clusters
3258 require_osd_release
= CEPH_RELEASE_LUMINOUS
;
3259 flags
&= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS
);
3260 flags
|= CEPH_OSDMAP_RECOVERY_DELETES
;
3261 } else if (flags
& CEPH_OSDMAP_REQUIRE_KRAKEN
) {
3262 require_osd_release
= CEPH_RELEASE_KRAKEN
;
3263 } else if (flags
& CEPH_OSDMAP_REQUIRE_JEWEL
) {
3264 require_osd_release
= CEPH_RELEASE_JEWEL
;
3266 require_osd_release
= 0;
3269 if (struct_v
>= 6) {
3270 decode(removed_snaps_queue
, bl
);
3272 if (struct_v
>= 8) {
3273 decode(crush_node_flags
, bl
);
3275 crush_node_flags
.clear();
3277 if (struct_v
>= 9) {
3278 decode(device_class_flags
, bl
);
3280 device_class_flags
.clear();
3282 DECODE_FINISH(bl
); // osd-only data
// wrapper version >= 8: capture the bytes decoded so far as the crc'd
// front region and note where the tail region begins
3285 if (struct_v
>= 8) {
3286 crc_front
.substr_of(bl
.get_bl(), start_offset
, bl
.get_off() - start_offset
);
3288 tail_offset
= bl
.get_off();
3291 crc_defined
= false;
3295 DECODE_FINISH(bl
); // wrapper
// recompute the crc over the front region and any tail region
3299 uint32_t actual
= crc_front
.crc32c(-1);
3300 if (tail_offset
< bl
.get_off()) {
3302 tail
.substr_of(bl
.get_bl(), tail_offset
, bl
.get_off() - tail_offset
);
3303 actual
= tail
.crc32c(actual
);
// a mismatch with the stored crc is reported as malformed input
3305 if (crc
!= actual
) {
3307 ss
<< "bad crc, actual " << actual
<< " != expected " << crc
;
3308 string s
= ss
.str();
3309 throw buffer::malformed_input(s
.c_str());
// Rebuild derived state after decoding: fills the name -> pool id index
// (name_pool) from pool_name and calls _calc_up_osd_features().
// NOTE(review): the listing skips some original lines in this function;
// only visible statements are described.
3316 void OSDMap::post_decode()
// invert pool_name (id -> name) into name_pool (name -> id)
3320 for (const auto &pname
: pool_name
) {
3321 name_pool
[pname
.second
] = pname
.first
;
3325 _calc_up_osd_features();
// Dump the given erasure code profiles through a formatter 'f': an
// "erasure_code_profiles" object section containing one object section
// per profile (named after the profile), each holding the profile's
// key/value pairs as strings.
// NOTE(review): the listing omits the declaration of 'f' and the
// section-closing calls; only visible statements are described.
3328 void OSDMap::dump_erasure_code_profiles(
3329 const mempool::osdmap::map
<string
,map
<string
,string
>>& profiles
,
3332 f
->open_object_section("erasure_code_profiles");
3333 for (const auto &profile
: profiles
) {
3334 f
->open_object_section(profile
.first
.c_str());
3335 for (const auto &profm
: profile
.second
) {
3336 f
->dump_string(profm
.first
.c_str(), profm
.second
.c_str());
// Dump the whole map through Formatter f.
//
// Visible output, in order: scalar header fields (epoch, fsid, times,
// flags, ratios, release requirements), then the sections "pools",
// "osds", "osd_xinfo", "pg_upmap", "pg_upmap_items", "pg_temp",
// "primary_temp", "blacklist", the erasure code profiles,
// "removed_snaps_queue", "new_removed_snaps", "new_purged_snaps",
// "crush_node_flags" and "device_class_flags".
// NOTE(review): the listing omits most close_section() calls and some
// other lines; only visible statements are described.
3343 void OSDMap::dump(Formatter
*f
) const
3345 f
->dump_int("epoch", get_epoch());
3346 f
->dump_stream("fsid") << get_fsid();
3347 f
->dump_stream("created") << get_created();
3348 f
->dump_stream("modified") << get_modified();
3349 f
->dump_stream("last_up_change") << last_up_change
;
3350 f
->dump_stream("last_in_change") << last_in_change
;
// flags are emitted three ways: as a string, as a number, and as an array
3351 f
->dump_string("flags", get_flag_string());
3352 f
->dump_unsigned("flags_num", flags
);
3353 f
->open_array_section("flags_set");
3354 set
<string
> flagset
;
3355 get_flag_set(&flagset
);
// NOTE(review): 'auto p' copies each flag string; a const reference would
// avoid the copy - confirm nothing relies on it.
3356 for (auto p
: flagset
) {
3357 f
->dump_string("flag", p
);
3360 f
->dump_unsigned("crush_version", get_crush_version());
3361 f
->dump_float("full_ratio", full_ratio
);
3362 f
->dump_float("backfillfull_ratio", backfillfull_ratio
);
3363 f
->dump_float("nearfull_ratio", nearfull_ratio
);
3364 f
->dump_string("cluster_snapshot", get_cluster_snapshot());
3365 f
->dump_int("pool_max", get_pool_max());
3366 f
->dump_int("max_osd", get_max_osd());
3367 f
->dump_string("require_min_compat_client",
3368 ceph_release_name(require_min_compat_client
));
3369 f
->dump_string("min_compat_client",
3370 ceph_release_name(get_min_compat_client()));
3371 f
->dump_string("require_osd_release",
3372 ceph_release_name(require_osd_release
));
// one "pool" object per pool; the name defaults to "<unknown>" and is
// looked up in pool_name
3374 f
->open_array_section("pools");
3375 for (const auto &pool
: pools
) {
3376 std::string
name("<unknown>");
3377 const auto &pni
= pool_name
.find(pool
.first
);
3378 if (pni
!= pool_name
.end())
3380 f
->open_object_section("pool");
3381 f
->dump_int("pool", pool
.first
);
3382 f
->dump_string("pool_name", name
);
3383 pool
.second
.dump(f
);
// one "osd_info" object per osd id below max_osd (any filter on the id is
// not visible here)
3388 f
->open_array_section("osds");
3389 for (int i
=0; i
<get_max_osd(); i
++)
3391 f
->open_object_section("osd_info");
3392 f
->dump_int("osd", i
);
3393 f
->dump_stream("uuid") << get_uuid(i
);
3394 f
->dump_int("up", is_up(i
));
3395 f
->dump_int("in", is_in(i
));
3396 f
->dump_float("weight", get_weightf(i
));
3397 f
->dump_float("primary_affinity", get_primary_affinityf(i
));
3398 get_info(i
).dump(f
);
3399 f
->dump_object("public_addrs", get_addrs(i
));
3400 f
->dump_object("cluster_addrs", get_cluster_addrs(i
));
3401 f
->dump_object("heartbeat_back_addrs", get_hb_back_addrs(i
));
3402 f
->dump_object("heartbeat_front_addrs", get_hb_front_addrs(i
));
// the same four addresses again, in their legacy string form
3404 f
->dump_stream("public_addr") << get_addrs(i
).get_legacy_str();
3405 f
->dump_stream("cluster_addr") << get_cluster_addrs(i
).get_legacy_str();
3406 f
->dump_stream("heartbeat_back_addr")
3407 << get_hb_back_addrs(i
).get_legacy_str();
3408 f
->dump_stream("heartbeat_front_addr")
3409 << get_hb_front_addrs(i
).get_legacy_str();
3413 f
->open_array_section("state");
3414 for (const auto &state
: st
)
3415 f
->dump_string("state", state
);
3422 f
->open_array_section("osd_xinfo");
3423 for (int i
=0; i
<get_max_osd(); i
++) {
3425 f
->open_object_section("xinfo");
3426 f
->dump_int("osd", i
);
3427 osd_xinfo
[i
].dump(f
);
3433 f
->open_array_section("pg_upmap");
3434 for (auto& p
: pg_upmap
) {
3435 f
->open_object_section("mapping");
3436 f
->dump_stream("pgid") << p
.first
;
3437 f
->open_array_section("osds");
3438 for (auto q
: p
.second
) {
3439 f
->dump_int("osd", q
);
3445 f
->open_array_section("pg_upmap_items");
3446 for (auto& p
: pg_upmap_items
) {
3447 f
->open_object_section("mapping");
3448 f
->dump_stream("pgid") << p
.first
;
3449 f
->open_array_section("mappings");
3450 for (auto& q
: p
.second
) {
3451 f
->open_object_section("mapping");
3452 f
->dump_int("from", q
.first
);
3453 f
->dump_int("to", q
.second
);
3460 f
->open_array_section("pg_temp");
3464 f
->open_array_section("primary_temp");
3465 for (const auto &pg
: *primary_temp
) {
3466 f
->dump_stream("pgid") << pg
.first
;
3467 f
->dump_int("osd", pg
.second
);
3469 f
->close_section(); // primary_temp
// blacklist entries are keyed by a string built in 'ss' (its construction
// is not visible here) with the entry's second member as the value
3471 f
->open_object_section("blacklist");
3472 for (const auto &addr
: blacklist
) {
3475 f
->dump_stream(ss
.str().c_str()) << addr
.second
;
3479 dump_erasure_code_profiles(erasure_code_profiles
, f
);
// the next three sections share one shape: per pool, an array of
// {begin, length} intervals
3481 f
->open_array_section("removed_snaps_queue");
3482 for (auto& p
: removed_snaps_queue
) {
3483 f
->open_object_section("pool");
3484 f
->dump_int("pool", p
.first
);
3485 f
->open_array_section("snaps");
3486 for (auto q
= p
.second
.begin(); q
!= p
.second
.end(); ++q
) {
3487 f
->open_object_section("interval");
3488 f
->dump_unsigned("begin", q
.get_start());
3489 f
->dump_unsigned("length", q
.get_len());
3496 f
->open_array_section("new_removed_snaps");
3497 for (auto& p
: new_removed_snaps
) {
3498 f
->open_object_section("pool");
3499 f
->dump_int("pool", p
.first
);
3500 f
->open_array_section("snaps");
3501 for (auto q
= p
.second
.begin(); q
!= p
.second
.end(); ++q
) {
3502 f
->open_object_section("interval");
3503 f
->dump_unsigned("begin", q
.get_start());
3504 f
->dump_unsigned("length", q
.get_len());
3511 f
->open_array_section("new_purged_snaps");
3512 for (auto& p
: new_purged_snaps
) {
3513 f
->open_object_section("pool");
3514 f
->dump_int("pool", p
.first
);
3515 f
->open_array_section("snaps");
3516 for (auto q
= p
.second
.begin(); q
!= p
.second
.end(); ++q
) {
3517 f
->open_object_section("interval");
3518 f
->dump_unsigned("begin", q
.get_start());
3519 f
->dump_unsigned("length", q
.get_len());
// per crush item: an array of flag names, keyed by the item name when the
// item exists in crush and by the stringified id otherwise
3526 f
->open_object_section("crush_node_flags");
3527 for (auto& i
: crush_node_flags
) {
3528 string s
= crush
->item_exists(i
.first
) ? crush
->get_item_name(i
.first
)
3529 : stringify(i
.first
);
3530 f
->open_array_section(s
.c_str());
3532 calc_state_set(i
.second
, st
);
3533 for (auto& j
: st
) {
3534 f
->dump_string("flag", j
);
// per device class: same shape, keyed by class name or stringified id
3539 f
->open_object_section("device_class_flags");
3540 for (auto& i
: device_class_flags
) {
3541 const char* class_name
= crush
->get_class_name(i
.first
);
3542 string s
= class_name
? class_name
: stringify(i
.first
);
3543 f
->open_array_section(s
.c_str());
3545 calc_state_set(i
.second
, st
);
3546 for (auto& j
: st
) {
3547 f
->dump_string("flag", j
);
// Append test instances to o: a default-constructed OSDMap, and a second
// one built with build_simple() whose created/modified stamps are pinned
// to a fixed value and which carries one blacklist entry.
// NOTE(review): the listing skips some original lines (e.g. the
// declaration of 'fsid' and whatever happens to 'cct' afterwards); the
// CephContext is allocated with a raw 'new' here, so confirm it is
// released in the lines not shown.
3554 void OSDMap::generate_test_instances(list
<OSDMap
*>& o
)
3556 o
.push_back(new OSDMap
);
3558 CephContext
*cct
= new CephContext(CODE_ENVIRONMENT_UTILITY
);
3559 o
.push_back(new OSDMap
);
3561 o
.back()->build_simple(cct
, 1, fsid
, 16);
3562 o
.back()->created
= o
.back()->modified
= utime_t(1, 2); // fix timestamp
3563 o
.back()->blacklist
[entity_addr_t()] = utime_t(5, 6);
// Render the CEPH_OSDMAP_* bits set in f as text.  Each visible append
// adds a comma-prefixed flag name to 's' (e.g. ",norebalance").
// NOTE(review): the listing omits the declaration of 's', most of the
// append statements, and the return; how the leading comma is handled is
// not visible here.
3567 string
OSDMap::get_flag_string(unsigned f
)
3570 if ( f
& CEPH_OSDMAP_NEARFULL
)
3572 if (f
& CEPH_OSDMAP_FULL
)
3574 if (f
& CEPH_OSDMAP_PAUSERD
)
3576 if (f
& CEPH_OSDMAP_PAUSEWR
)
3578 if (f
& CEPH_OSDMAP_PAUSEREC
)
3580 if (f
& CEPH_OSDMAP_NOUP
)
3582 if (f
& CEPH_OSDMAP_NODOWN
)
3584 if (f
& CEPH_OSDMAP_NOOUT
)
3586 if (f
& CEPH_OSDMAP_NOIN
)
3588 if (f
& CEPH_OSDMAP_NOBACKFILL
)
3590 if (f
& CEPH_OSDMAP_NOREBALANCE
)
3591 s
+= ",norebalance";
3592 if (f
& CEPH_OSDMAP_NORECOVER
)
3594 if (f
& CEPH_OSDMAP_NOSCRUB
)
3596 if (f
& CEPH_OSDMAP_NODEEP_SCRUB
)
3597 s
+= ",nodeep-scrub";
3598 if (f
& CEPH_OSDMAP_NOTIERAGENT
)
3599 s
+= ",notieragent";
3600 if (f
& CEPH_OSDMAP_NOSNAPTRIM
)
3602 if (f
& CEPH_OSDMAP_SORTBITWISE
)
3603 s
+= ",sortbitwise";
3604 if (f
& CEPH_OSDMAP_REQUIRE_JEWEL
)
3605 s
+= ",require_jewel_osds";
3606 if (f
& CEPH_OSDMAP_REQUIRE_KRAKEN
)
3607 s
+= ",require_kraken_osds";
3608 if (f
& CEPH_OSDMAP_REQUIRE_LUMINOUS
)
3609 s
+= ",require_luminous_osds";
3610 if (f
& CEPH_OSDMAP_RECOVERY_DELETES
)
3611 s
+= ",recovery_deletes";
3612 if (f
& CEPH_OSDMAP_PURGED_SNAPDIRS
)
3613 s
+= ",purged_snapdirs";
3614 if (f
& CEPH_OSDMAP_PGLOG_HARDLIMIT
)
3615 s
+= ",pglog_hardlimit";
3621 string
OSDMap::get_flag_string() const
3623 return get_flag_string(flags
);
// Write a plain-text description of every pool to out: a "pool <id> ..."
// line, one "\tsnap" line per pool snapshot, a "\tremoved_snaps" line
// when the pool's removed_snaps set is non-empty, and a
// "\tremoved_snaps_queue" line when the pool has an entry in
// removed_snaps_queue.
// NOTE(review): the listing skips some original lines (e.g. the name
// assignment and part of the pool line); only visible statements are
// described.
3626 void OSDMap::print_pools(ostream
& out
) const
3628 for (const auto &pool
: pools
) {
// the name defaults to "<unknown>" and is looked up in pool_name
3629 std::string
name("<unknown>");
3630 const auto &pni
= pool_name
.find(pool
.first
);
3631 if (pni
!= pool_name
.end())
3633 out
<< "pool " << pool
.first
3635 << "' " << pool
.second
<< "\n";
3637 for (const auto &snap
: pool
.second
.snaps
)
3638 out
<< "\tsnap " << snap
.second
.snapid
<< " '" << snap
.second
.name
<< "' " << snap
.second
.stamp
<< "\n";
3640 if (!pool
.second
.removed_snaps
.empty())
3641 out
<< "\tremoved_snaps " << pool
.second
.removed_snaps
<< "\n";
3642 auto p
= removed_snaps_queue
.find(pool
.first
);
3643 if (p
!= removed_snaps_queue
.end()) {
3644 out
<< "\tremoved_snaps_queue " << p
->second
<< "\n";
// Write a plain-text rendering of the map to out: header fields (epoch,
// fsid, times, flags, crush_version, ratios, release requirements), the
// per-osd lines, and then one line each for pg_upmap, pg_upmap_items,
// pg_temp, primary_temp and blacklist entries.
// NOTE(review): the listing skips some original lines (e.g. the start of
// each osd line and the trailing parts of some stream expressions); only
// visible statements are described.
3650 void OSDMap::print(ostream
& out
) const
3652 out
<< "epoch " << get_epoch() << "\n"
3653 << "fsid " << get_fsid() << "\n"
3654 << "created " << get_created() << "\n"
3655 << "modified " << get_modified() << "\n";
3657 out
<< "flags " << get_flag_string() << "\n";
3658 out
<< "crush_version " << get_crush_version() << "\n";
3659 out
<< "full_ratio " << full_ratio
<< "\n";
3660 out
<< "backfillfull_ratio " << backfillfull_ratio
<< "\n";
3661 out
<< "nearfull_ratio " << nearfull_ratio
<< "\n";
// release requirements are printed only when set (> 0)
3662 if (require_min_compat_client
> 0) {
3663 out
<< "require_min_compat_client "
3664 << ceph_release_name(require_min_compat_client
) << "\n";
3666 out
<< "min_compat_client " << ceph_release_name(get_min_compat_client())
3668 if (require_osd_release
> 0) {
3669 out
<< "require_osd_release " << ceph_release_name(require_osd_release
)
3672 if (get_cluster_snapshot().length())
3673 out
<< "cluster_snapshot " << get_cluster_snapshot() << "\n";
3678 out
<< "max_osd " << get_max_osd() << "\n";
3679 for (int i
=0; i
<get_max_osd(); i
++) {
3682 out
<< (is_up(i
) ? " up ":" down");
3683 out
<< (is_in(i
) ? " in ":" out");
3684 out
<< " weight " << get_weightf(i
);
// primary affinity is printed only when it differs from the default
3685 if (get_primary_affinity(i
) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
)
3686 out
<< " primary_affinity " << get_primary_affinityf(i
);
3687 const osd_info_t
& info(get_info(i
));
3689 out
<< " " << get_addrs(i
) << " " << get_cluster_addrs(i
);
// the uuid is printed only when it is not all-zero
3693 if (!get_uuid(i
).is_zero())
3694 out
<< " " << get_uuid(i
);
3700 for (auto& p
: pg_upmap
) {
3701 out
<< "pg_upmap " << p
.first
<< " " << p
.second
<< "\n";
3703 for (auto& p
: pg_upmap_items
) {
3704 out
<< "pg_upmap_items " << p
.first
<< " " << p
.second
<< "\n";
// NOTE(review): 'const auto pg' takes each entry by value in the next two
// loops; confirm whether binding by reference would be safe here.
3707 for (const auto pg
: *pg_temp
)
3708 out
<< "pg_temp " << pg
.first
<< " " << pg
.second
<< "\n";
3710 for (const auto pg
: *primary_temp
)
3711 out
<< "primary_temp " << pg
.first
<< " " << pg
.second
<< "\n";
3713 for (const auto &addr
: blacklist
)
3714 out
<< "blacklist " << addr
.first
<< " expires " << addr
.second
<< "\n";
// Plain-text osd tree dumper: a CrushTreeDumper::Dumper<TextTable> that
// renders the crush hierarchy into a TextTable with the columns ID,
// CLASS, WEIGHT, TYPE NAME, STATUS, REWEIGHT and PRI-AFF, and then adds
// rows for osds that exist in the map but were not touched by the tree
// walk.  'filter' holds OSDMap::DUMP_* bits that restrict which leaves
// are shown.
// NOTE(review): the listing skips many original lines (access
// specifiers, the constructor's filter parameter, return statements and
// most of the row output); only visible statements are described.
3717 class OSDTreePlainDumper
: public CrushTreeDumper::Dumper
<TextTable
> {
3719 typedef CrushTreeDumper::Dumper
<TextTable
> Parent
;
3721 OSDTreePlainDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap_
,
3723 : Parent(crush
, osdmap_
->get_pool_names()), osdmap(osdmap_
), filter(f
) { }
// a leaf is shown in the "normal case" or when its state matches one of
// the DUMP_* bits set in filter
3725 bool should_dump_leaf(int i
) const override
{
3727 return true; // normal case
3729 if (((filter
& OSDMap::DUMP_UP
) && osdmap
->is_up(i
)) ||
3730 ((filter
& OSDMap::DUMP_DOWN
) && osdmap
->is_down(i
)) ||
3731 ((filter
& OSDMap::DUMP_IN
) && osdmap
->is_in(i
)) ||
3732 ((filter
& OSDMap::DUMP_OUT
) && osdmap
->is_out(i
)) ||
3733 ((filter
& OSDMap::DUMP_DESTROYED
) && osdmap
->is_destroyed(i
))) {
3739 bool should_dump_empty_bucket() const override
{
// declare the table columns
3743 void init_table(TextTable
*tbl
) {
3744 tbl
->define_column("ID", TextTable::LEFT
, TextTable::RIGHT
);
3745 tbl
->define_column("CLASS", TextTable::LEFT
, TextTable::RIGHT
);
3746 tbl
->define_column("WEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
3747 tbl
->define_column("TYPE NAME", TextTable::LEFT
, TextTable::LEFT
);
3748 tbl
->define_column("STATUS", TextTable::LEFT
, TextTable::RIGHT
);
3749 tbl
->define_column("REWEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
3750 tbl
->define_column("PRI-AFF", TextTable::LEFT
, TextTable::RIGHT
);
// dump the tree (optionally rooted at 'bucket'), then osds not yet touched
3752 void dump(TextTable
*tbl
, string
& bucket
) {
3755 if (!bucket
.empty()) {
3760 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
3761 if (osdmap
->exists(i
) && !is_touched(i
) && should_dump_leaf(i
)) {
3762 dump_item(CrushTreeDumper::Item(i
, 0, 0, 0), tbl
);
// emit one table row for a bucket or an osd
3769 void dump_item(const CrushTreeDumper::Item
&qi
, TextTable
*tbl
) override
{
3770 const char *c
= crush
->get_item_class(qi
.id
);
3775 << weightf_t(qi
.weight
);
// the loop runs once per depth level (its body is not visible here)
3778 for (int k
= 0; k
< qi
.depth
; k
++)
3780 if (qi
.is_bucket()) {
3781 name
<< crush
->get_type_name(crush
->get_bucket_type(qi
.id
)) << " "
3782 << crush
->get_item_name(qi
.id
);
3784 name
<< "osd." << qi
.id
;
// status / reweight / primary affinity columns apply to osds only
3788 if (!qi
.is_bucket()) {
3789 if (!osdmap
->exists(qi
.id
)) {
3794 if (osdmap
->is_up(qi
.id
)) {
3796 } else if (osdmap
->is_destroyed(qi
.id
)) {
3802 << weightf_t(osdmap
->get_weightf(qi
.id
))
3803 << weightf_t(osdmap
->get_primary_affinityf(qi
.id
));
3806 *tbl
<< TextTable::endrow
;
3810 const OSDMap
*osdmap
;
3811 const unsigned filter
;
// Structured osd tree dumper: a CrushTreeDumper::FormattingDumper that
// writes the crush hierarchy into a "nodes" array and osds that exist in
// the map but were not touched by the tree walk into a "stray" array.
// For osd items it adds "exists", "status", "reweight" and
// "primary_affinity" to the fields written by the parent class.
// 'filter' holds OSDMap::DUMP_* bits that restrict which leaves are shown.
// NOTE(review): the listing skips many original lines (access
// specifiers, the constructor's filter parameter, return statements, the
// status string assignments); only visible statements are described.
3814 class OSDTreeFormattingDumper
: public CrushTreeDumper::FormattingDumper
{
3816 typedef CrushTreeDumper::FormattingDumper Parent
;
3818 OSDTreeFormattingDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap_
,
3820 : Parent(crush
, osdmap_
->get_pool_names()), osdmap(osdmap_
), filter(f
) { }
// a leaf is shown in the "normal case" or when its state matches one of
// the DUMP_* bits set in filter
3822 bool should_dump_leaf(int i
) const override
{
3824 return true; // normal case
3826 if (((filter
& OSDMap::DUMP_UP
) && osdmap
->is_up(i
)) ||
3827 ((filter
& OSDMap::DUMP_DOWN
) && osdmap
->is_down(i
)) ||
3828 ((filter
& OSDMap::DUMP_IN
) && osdmap
->is_in(i
)) ||
3829 ((filter
& OSDMap::DUMP_OUT
) && osdmap
->is_out(i
)) ||
3830 ((filter
& OSDMap::DUMP_DESTROYED
) && osdmap
->is_destroyed(i
))) {
3836 bool should_dump_empty_bucket() const override
{
// NOTE(review): "nodes" is opened on two visible lines; presumably one per
// branch of the bucket.empty() check - confirm.
3840 void dump(Formatter
*f
, string
& bucket
) {
3841 if (!bucket
.empty()) {
3843 f
->open_array_section("nodes");
3847 f
->open_array_section("nodes");
3850 f
->open_array_section("stray");
3851 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
3852 if (osdmap
->exists(i
) && !is_touched(i
) && should_dump_leaf(i
))
3853 dump_item(CrushTreeDumper::Item(i
, 0, 0, 0), f
);
// parent fields first, then the osd-only fields
3860 void dump_item_fields(const CrushTreeDumper::Item
&qi
, Formatter
*f
) override
{
3861 Parent::dump_item_fields(qi
, f
);
3862 if (!qi
.is_bucket())
3865 if (osdmap
->is_up(qi
.id
)) {
3867 } else if (osdmap
->is_destroyed(qi
.id
)) {
3872 f
->dump_unsigned("exists", (int)osdmap
->exists(qi
.id
));
3873 f
->dump_string("status", s
);
3874 f
->dump_float("reweight", osdmap
->get_weightf(qi
.id
));
3875 f
->dump_float("primary_affinity", osdmap
->get_primary_affinityf(qi
.id
));
3880 const OSDMap
*osdmap
;
3881 const unsigned filter
;
// Print the osd tree, restricted by 'filter' and optionally rooted at
// 'bucket', using either OSDTreeFormattingDumper (into f) or
// OSDTreePlainDumper (into a TextTable 'tbl').
// NOTE(review): the listing omits the branch that chooses between the two
// dumpers and how 'tbl' reaches 'out'; only visible statements are
// described.
3884 void OSDMap::print_tree(Formatter
*f
, ostream
*out
, unsigned filter
, string bucket
) const
3887 OSDTreeFormattingDumper(crush
.get(), this, filter
).dump(f
, bucket
);
3891 OSDTreePlainDumper(crush
.get(), this, filter
).dump(&tbl
, bucket
);
// Summarize the map.  The visible statements produce two renderings: an
// "osdmap" object section written through f (epoch, osd counts,
// remapped pg count), and a text line written to out with the osd
// counts, "since" ages for the last up/in changes when those stamps are
// set, the epoch, the remapped pg count when non-zero, and a
// prefix-led "flags" line for flags outside
// CEPH_OSDMAP_SEMIHIDDEN_FLAGS.
// NOTE(review): the listing omits the branch selecting between the two
// renderings and any use of 'extra'; only visible statements are
// described.
3896 void OSDMap::print_summary(Formatter
*f
, ostream
& out
,
3897 const string
& prefix
, bool extra
) const
3900 f
->open_object_section("osdmap");
3901 f
->dump_int("epoch", get_epoch());
3902 f
->dump_int("num_osds", get_num_osds());
3903 f
->dump_int("num_up_osds", get_num_up_osds());
3904 f
->dump_int("num_in_osds", get_num_in_osds());
3905 f
->dump_unsigned("num_remapped_pgs", get_num_pg_temp());
// 'now' is the reference point for the "since" ages below
3908 utime_t now
= ceph_clock_now();
3909 out
<< get_num_osds() << " osds: "
3910 << get_num_up_osds() << " up";
// a default-constructed utime_t means the stamp was never set
3911 if (last_up_change
!= utime_t()) {
3912 out
<< " (since " << utimespan_str(now
- last_up_change
) << ")";
3914 out
<< ", " << get_num_in_osds() << " in";
3915 if (last_in_change
!= utime_t()) {
3916 out
<< " (since " << utimespan_str(now
- last_in_change
) << ")";
3919 out
<< "; epoch: e" << get_epoch();
3920 if (get_num_pg_temp())
3921 out
<< "; " << get_num_pg_temp() << " remapped pgs";
// only flags outside the semi-hidden set are reported
3923 uint64_t important_flags
= flags
& ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS
;
3924 if (important_flags
)
3925 out
<< prefix
<< "flags " << get_flag_string(important_flags
) << "\n";
3929 void OSDMap::print_oneline_summary(ostream
& out
) const
3931 out
<< "e" << get_epoch() << ": "
3932 << get_num_osds() << " total, "
3933 << get_num_up_osds() << " up, "
3934 << get_num_in_osds() << " in";
3937 bool OSDMap::crush_rule_in_use(int rule_id
) const
3939 for (const auto &pool
: pools
) {
3940 if (pool
.second
.crush_rule
== rule_id
)
// Check every pool in this map against the candidate crush map newcrush.
// For each pool's crush rule the visible checks are: the rule exists,
// the rule's mask ruleset equals the rule id, the rule's mask type
// equals the pool type, and the pool size lies within the rule's mask
// min_size..max_size.  Each failed check writes an explanation to *ss.
// NOTE(review): the listing omits the declaration of 'ss' and every
// return statement, so the returned codes are not described here.
3946 int OSDMap::validate_crush_rules(CrushWrapper
*newcrush
,
3949 for (auto& i
: pools
) {
3950 auto& pool
= i
.second
;
3951 int ruleno
= pool
.get_crush_rule();
3952 if (!newcrush
->rule_exists(ruleno
)) {
3953 *ss
<< "pool " << i
.first
<< " references crush_rule " << ruleno
3954 << " but it is not present";
3957 if (newcrush
->get_rule_mask_ruleset(ruleno
) != ruleno
) {
3958 *ss
<< "rule " << ruleno
<< " mask ruleset does not match rule id";
3961 if (newcrush
->get_rule_mask_type(ruleno
) != (int)pool
.get_type()) {
3962 *ss
<< "pool " << i
.first
<< " type does not match rule " << ruleno
;
3965 int poolsize
= pool
.get_size();
3966 if (poolsize
< newcrush
->get_rule_mask_min_size(ruleno
) ||
3967 poolsize
> newcrush
->get_rule_mask_max_size(ruleno
)) {
3968 *ss
<< "pool " << i
.first
<< " size " << poolsize
<< " does not"
3969 << " fall within rule " << ruleno
3970 << " min_size " << newcrush
->get_rule_mask_min_size(ruleno
)
3971 << " and max_size " << newcrush
->get_rule_mask_max_size(ruleno
);
// Build a simple initial map.
//
// Visible steps: log the request, stamp created/modified with the
// current time, scan the config sections named "osd.<N>" for osd ids,
// set max_osd, build a crush map (either from nosd or from the config),
// create an "rbd" replicated pool from the osd_pool_default_* settings,
// set the weight of every osd id below max_osd to CEPH_OSD_OUT, and
// install the default erasure code profile as "default".
// NOTE(review): the listing skips many original lines (the remaining
// parameters, the conditions choosing between the nosd and config
// paths, error returns); only visible statements are described.
3978 int OSDMap::build_simple_optioned(CephContext
*cct
, epoch_t e
, uuid_d
&fsid
,
3979 int nosd
, int pg_bits
, int pgp_bits
,
3982 ldout(cct
, 10) << "build_simple on " << nosd
3983 << " osds" << dendl
;
3986 created
= modified
= ceph_clock_now();
3993 const auto& conf
= cct
->_conf
;
3994 vector
<string
> sections
;
3995 conf
.get_all_sections(sections
);
// NOTE(review): '§ion' below looks like a mis-encoded '&section'
// (HTML entity damage) - confirm against the upstream file.
3997 for (auto §ion
: sections
) {
// only sections whose name starts with "osd." are considered
3998 if (section
.find("osd.") != 0)
// parse the decimal id that follows the "osd." prefix
4001 const char *begin
= section
.c_str() + 4;
4002 char *end
= (char*)begin
;
4003 int o
= strtol(begin
, &end
, 10);
4007 if (o
> cct
->_conf
->mon_max_osd
) {
4008 lderr(cct
) << "[osd." << o
<< "] in config has id > mon_max_osd " << cct
->_conf
->mon_max_osd
<< dendl
;
4016 set_max_osd(maxosd
+ 1);
// NOTE(review): two crush builders are called on visible lines;
// presumably alternative branches - the condition is not visible here.
4023 r
= build_simple_crush_map(cct
, *crush
, nosd
, &ss
);
4025 r
= build_simple_crush_map_from_conf(cct
, *crush
, &ss
);
4026 ceph_assert(r
== 0);
// poolbase scales the pg counts below; it is max_osd, or 1 when max_osd is 0
4028 int poolbase
= get_max_osd() ? get_max_osd() : 1;
4030 const int default_replicated_rule
= crush
->get_osd_pool_default_crush_replicated_ruleset(cct
);
4031 ceph_assert(default_replicated_rule
>= 0);
4034 // pgp_num <= pg_num
4035 if (pgp_bits
> pg_bits
)
4038 vector
<string
> pool_names
;
4039 pool_names
.push_back("rbd");
4040 for (auto &plname
: pool_names
) {
// allocate the next pool id and fill the pool in from config defaults
4041 int64_t pool
= ++pool_max
;
4042 pools
[pool
].type
= pg_pool_t::TYPE_REPLICATED
;
4043 pools
[pool
].flags
= cct
->_conf
->osd_pool_default_flags
;
4044 if (cct
->_conf
->osd_pool_default_flag_hashpspool
)
4045 pools
[pool
].set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
4046 if (cct
->_conf
->osd_pool_default_flag_nodelete
)
4047 pools
[pool
].set_flag(pg_pool_t::FLAG_NODELETE
);
4048 if (cct
->_conf
->osd_pool_default_flag_nopgchange
)
4049 pools
[pool
].set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
4050 if (cct
->_conf
->osd_pool_default_flag_nosizechange
)
4051 pools
[pool
].set_flag(pg_pool_t::FLAG_NOSIZECHANGE
);
4052 pools
[pool
].size
= cct
->_conf
.get_val
<uint64_t>("osd_pool_default_size");
4053 pools
[pool
].min_size
= cct
->_conf
.get_osd_pool_default_min_size(
4055 pools
[pool
].crush_rule
= default_replicated_rule
;
4056 pools
[pool
].object_hash
= CEPH_STR_HASH_RJENKINS
;
// pg and pgp counts (and their targets) are poolbase shifted by the bit counts
4057 pools
[pool
].set_pg_num(poolbase
<< pg_bits
);
4058 pools
[pool
].set_pgp_num(poolbase
<< pgp_bits
);
4059 pools
[pool
].set_pg_num_target(poolbase
<< pg_bits
);
4060 pools
[pool
].set_pgp_num_target(poolbase
<< pgp_bits
);
4061 pools
[pool
].last_change
= epoch
;
4062 pools
[pool
].application_metadata
.insert(
4063 {pg_pool_t::APPLICATION_NAME_RBD
, {}});
// a negative lookup result for the autoscale mode name falls back to 0
4064 auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
4065 cct
->_conf
.get_val
<string
>("osd_pool_default_pg_autoscale_mode"));
4066 pools
[pool
].pg_autoscale_mode
= m
>= 0 ? m
: 0;
// keep both name indexes in step
4067 pool_name
[pool
] = plname
;
4068 name_pool
[plname
] = pool
;
4072 for (int i
=0; i
<get_max_osd(); i
++) {
4074 set_weight(i
, CEPH_OSD_OUT
);
4077 map
<string
,string
> profile_map
;
4078 r
= get_erasure_code_profile_default(cct
, profile_map
, &ss
);
4080 lderr(cct
) << ss
.str() << dendl
;
4083 set_erasure_code_profile("default", profile_map
);
// OSDMap::get_erasure_code_profile_default
//
// Reads the string configuration option
// "osd_pool_default_erasure_code_profile" and passes it to
// get_json_str_map(), keeping the result code in r.
//
// NOTE(review): the remaining parameters, the rest of the get_json_str_map()
// argument list and the return statement are on lines missing from this
// listing; presumably profile_map receives the parsed key/value pairs —
// confirm against the complete source.
4087 int OSDMap::get_erasure_code_profile_default(CephContext
*cct
,
4088 map
<string
,string
> &profile_map
,
4091 int r
= get_json_str_map(cct
->_conf
.get_val
<string
>("osd_pool_default_erasure_code_profile"),
// OSDMap::_build_crush_types
//
// Registers the CRUSH type names on the given CrushWrapper, using
// consecutive type ids 0..11:
//   0 osd, 1 host, 2 chassis, 3 rack, 4 row, 5 pdu, 6 pod, 7 room,
//   8 datacenter, 9 zone, 10 region, 11 root.
//
// NOTE(review): the return statement is not visible in this listing. Callers
// in this file store the result in a variable named root_type, so it
// presumably returns the id of the "root" type — confirm.
4097 int OSDMap::_build_crush_types(CrushWrapper
& crush
)
4099 crush
.set_type_name(0, "osd");
4100 crush
.set_type_name(1, "host");
4101 crush
.set_type_name(2, "chassis");
4102 crush
.set_type_name(3, "rack");
4103 crush
.set_type_name(4, "row");
4104 crush
.set_type_name(5, "pdu");
4105 crush
.set_type_name(6, "pod");
4106 crush
.set_type_name(7, "room");
4107 crush
.set_type_name(8, "datacenter");
4108 crush
.set_type_name(9, "zone");
4109 crush
.set_type_name(10, "region");
4110 crush
.set_type_name(11, "root");
// OSDMap::build_simple_crush_map
//
// Populates `crush` with a minimal hierarchy for `nosd` osds:
//   - registers the type names via _build_crush_types();
//   - adds one bucket of the returned type, named "default";
//   - inserts osd.0 .. osd.(nosd-1), each with weight 1.0, at
//     host=localhost / rack=localrack / root=default;
//   - calls build_simple_crush_rules() for the "default" root, passing `ss`.
//
// NOTE(review): the declarations of rootid and name, and the function's
// return statement, are on lines missing from this listing.
4114 int OSDMap::build_simple_crush_map(CephContext
*cct
, CrushWrapper
& crush
,
4115 int nosd
, ostream
*ss
)
4120 int root_type
= _build_crush_types(crush
);
// Create the top-level bucket; its id is written to rootid.
4122 int r
= crush
.add_bucket(0, 0, CRUSH_HASH_DEFAULT
,
4123 root_type
, 0, NULL
, NULL
, &rootid
);
4124 ceph_assert(r
== 0);
4125 crush
.set_item_name(rootid
, "default");
// Place every osd at the same fixed location.
4127 for (int o
=0; o
<nosd
; o
++) {
4128 map
<string
,string
> loc
;
4129 loc
["host"] = "localhost";
4130 loc
["rack"] = "localrack";
4131 loc
["root"] = "default";
4132 ldout(cct
, 10) << " adding osd." << o
<< " at " << loc
<< dendl
;
// Item name is "osd.<id>".
4134 snprintf(name
, sizeof(name
), "osd.%d", o
);
4135 crush
.insert_item(cct
, o
, 1.0, name
, loc
);
4138 build_simple_crush_rules(cct
, crush
, "default", ss
);
// OSDMap::build_simple_crush_map_from_conf
//
// Populates `crush` from the "osd.N" sections of the configuration:
//   - registers the type names via _build_crush_types() and adds one bucket
//     of the returned type, named "default";
//   - for each section whose name starts with "osd.", parses the id and
//     looks up host/rack/row/room/datacenter/root in the config file;
//   - substitutes "unknownhost" / "unknownrack" for an empty host / rack;
//   - inserts the osd with weight 1.0, using the section name as item name;
//   - calls build_simple_crush_rules() for the "default" root.
//
// NOTE(review): several original lines are missing from this listing
// (e.g. 4147-4148, 4193-4199), including the rest of the parameter list,
// most of the loc[...] assignments and the return statement.
4145 int OSDMap::build_simple_crush_map_from_conf(CephContext
*cct
,
4146 CrushWrapper
& crush
,
4149 const auto& conf
= cct
->_conf
;
4154 int root_type
= _build_crush_types(crush
);
// Create the top-level bucket; its id is written to rootid.
4156 int r
= crush
.add_bucket(0, 0,
4158 root_type
, 0, NULL
, NULL
, &rootid
);
4159 ceph_assert(r
== 0);
4160 crush
.set_item_name(rootid
, "default");
4163 vector
<string
> sections
;
4164 conf
.get_all_sections(sections
);
// NOTE(review): "§ion" below looks like a mis-decoded "&section"
// (HTML entity "&sect" + "ion"); the loop body refers to `section`.
4166 for (auto §ion
: sections
) {
// Only sections whose name begins with "osd." are of interest.
4167 if (section
.find("osd.") != 0)
// Parse the decimal id that follows the 4-character "osd." prefix.
4170 const char *begin
= section
.c_str() + 4;
4171 char *end
= (char*)begin
;
4172 int o
= strtol(begin
, &end
, 10);
// Look each location key up in the sections "osd" and then "osd.N".
4176 string host
, rack
, row
, room
, dc
, pool
;
4177 vector
<string
> sectiontmp
;
4178 sectiontmp
.push_back("osd");
4179 sectiontmp
.push_back(section
);
4180 conf
.get_val_from_conf_file(sectiontmp
, "host", host
, false);
4181 conf
.get_val_from_conf_file(sectiontmp
, "rack", rack
, false);
4182 conf
.get_val_from_conf_file(sectiontmp
, "row", row
, false);
4183 conf
.get_val_from_conf_file(sectiontmp
, "room", room
, false);
4184 conf
.get_val_from_conf_file(sectiontmp
, "datacenter", dc
, false);
4185 conf
.get_val_from_conf_file(sectiontmp
, "root", pool
, false);
// Placeholders for osds with no configured host or rack.
4187 if (host
.length() == 0)
4188 host
= "unknownhost";
4189 if (rack
.length() == 0)
4190 rack
= "unknownrack";
4192 map
<string
,string
> loc
;
4200 loc
["datacenter"] = dc
;
// The root is always "default" here; the configured "root" value read into
// `pool` above is not used on any visible line.
4201 loc
["root"] = "default";
4203 ldout(cct
, 5) << " adding osd." << o
<< " at " << loc
<< dendl
;
4204 crush
.insert_item(cct
, o
, 1.0, section
, loc
);
4207 build_simple_crush_rules(cct
, crush
, "default", ss
);
// OSDMap::build_simple_crush_rules
//
// Adds the default replicated CRUSH rule to `crush`:
//   - the rule id comes from get_osd_pool_default_crush_replicated_ruleset();
//   - the failure domain is the type name for the configured
//     osd_crush_chooseleaf_type;
//   - the rule is added with add_simple_rule_at() under the name
//     "replicated_rule", mode "firstn", type pg_pool_t::TYPE_REPLICATED.
//
// NOTE(review): most of the parameter list, the trailing arguments of
// add_simple_rule_at() and the return statement are on lines missing from
// this listing.
4215 int OSDMap::build_simple_crush_rules(
4217 CrushWrapper
& crush
,
4221 int crush_rule
= crush
.get_osd_pool_default_crush_replicated_ruleset(cct
);
4222 string failure_domain
=
4223 crush
.get_type_name(cct
->_conf
->osd_crush_chooseleaf_type
);
4226 r
= crush
.add_simple_rule_at(
4227 "replicated_rule", root
, failure_domain
, "",
4228 "firstn", pg_pool_t::TYPE_REPLICATED
,
4232 // do not add an erasure rule by default or else we will implicitly
4233 // require the crush_v2 feature of clients
4237 int OSDMap::summarize_mapping_stats(
4239 const set
<int64_t> *pools
,
4247 for (auto &p
: get_pools())
4251 unsigned total_pg
= 0;
4252 unsigned moved_pg
= 0;
4253 vector
<unsigned> base_by_osd(get_max_osd(), 0);
4254 vector
<unsigned> new_by_osd(get_max_osd(), 0);
4255 for (int64_t pool_id
: ls
) {
4256 const pg_pool_t
*pi
= get_pg_pool(pool_id
);
4257 vector
<int> up
, up2
;
4259 for (unsigned ps
= 0; ps
< pi
->get_pg_num(); ++ps
) {
4260 pg_t
pgid(ps
, pool_id
);
4261 total_pg
+= pi
->get_size();
4262 pg_to_up_acting_osds(pgid
, &up
, &up_primary
, nullptr, nullptr);
4263 for (int osd
: up
) {
4264 if (osd
>= 0 && osd
< get_max_osd())
4268 newmap
->pg_to_up_acting_osds(pgid
, &up2
, &up_primary
, nullptr, nullptr);
4269 for (int osd
: up2
) {
4270 if (osd
>= 0 && osd
< get_max_osd())
4273 if (pi
->type
== pg_pool_t::TYPE_ERASURE
) {
4274 for (unsigned i
=0; i
<up
.size(); ++i
) {
4275 if (up
[i
] != up2
[i
]) {
4279 } else if (pi
->type
== pg_pool_t::TYPE_REPLICATED
) {
4280 for (int osd
: up
) {
4281 if (std::find(up2
.begin(), up2
.end(), osd
) == up2
.end()) {
4286 ceph_abort_msg("unhandled pool type");
4292 unsigned num_up_in
= 0;
4293 for (int osd
= 0; osd
< get_max_osd(); ++osd
) {
4294 if (is_up(osd
) && is_in(osd
))
4301 float avg_pg
= (float)total_pg
/ (float)num_up_in
;
4302 float base_stddev
= 0, new_stddev
= 0;
4303 int min
= -1, max
= -1;
4304 unsigned min_base_pg
= 0, max_base_pg
= 0;
4305 unsigned min_new_pg
= 0, max_new_pg
= 0;
4306 for (int osd
= 0; osd
< get_max_osd(); ++osd
) {
4307 if (is_up(osd
) && is_in(osd
)) {
4308 float base_diff
= (float)base_by_osd
[osd
] - avg_pg
;
4309 base_stddev
+= base_diff
* base_diff
;
4310 float new_diff
= (float)new_by_osd
[osd
] - avg_pg
;
4311 new_stddev
+= new_diff
* new_diff
;
4312 if (min
< 0 || base_by_osd
[osd
] < min_base_pg
) {
4314 min_base_pg
= base_by_osd
[osd
];
4315 min_new_pg
= new_by_osd
[osd
];
4317 if (max
< 0 || base_by_osd
[osd
] > max_base_pg
) {
4319 max_base_pg
= base_by_osd
[osd
];
4320 max_new_pg
= new_by_osd
[osd
];
4324 base_stddev
= sqrt(base_stddev
/ num_up_in
);
4325 new_stddev
= sqrt(new_stddev
/ num_up_in
);
4327 float edev
= sqrt(avg_pg
* (1.0 - (1.0 / (double)num_up_in
)));
4331 f
->open_object_section("utilization");
4334 f
->dump_unsigned("moved_pgs", moved_pg
);
4335 f
->dump_unsigned("total_pgs", total_pg
);
4339 percent
= (float)moved_pg
* 100.0 / (float)total_pg
;
4340 ss
<< "moved " << moved_pg
<< " / " << total_pg
4341 << " (" << percent
<< "%)\n";
4345 f
->dump_float("avg_pgs", avg_pg
);
4346 f
->dump_float("std_dev", base_stddev
);
4347 f
->dump_float("expected_baseline_std_dev", edev
);
4349 f
->dump_float("new_std_dev", new_stddev
);
4351 ss
<< "avg " << avg_pg
<< "\n";
4352 ss
<< "stddev " << base_stddev
;
4354 ss
<< " -> " << new_stddev
;
4355 ss
<< " (expected baseline " << edev
<< ")\n";
4359 f
->dump_unsigned("min_osd", min
);
4360 f
->dump_unsigned("min_osd_pgs", min_base_pg
);
4362 f
->dump_unsigned("new_min_osd_pgs", min_new_pg
);
4364 ss
<< "min osd." << min
<< " with " << min_base_pg
;
4366 ss
<< " -> " << min_new_pg
;
4367 ss
<< " pgs (" << (float)min_base_pg
/ avg_pg
;
4369 ss
<< " -> " << (float)min_new_pg
/ avg_pg
;
4375 f
->dump_unsigned("max_osd", max
);
4376 f
->dump_unsigned("max_osd_pgs", max_base_pg
);
4378 f
->dump_unsigned("new_max_osd_pgs", max_new_pg
);
4380 ss
<< "max osd." << max
<< " with " << max_base_pg
;
4382 ss
<< " -> " << max_new_pg
;
4383 ss
<< " pgs (" << (float)max_base_pg
/ avg_pg
;
4385 ss
<< " -> " << (float)max_new_pg
/ avg_pg
;
// OSDMap::try_pg_upmap
//
// Attempts to compute an alternative mapping for one pg. From the visible
// lines it:
//   - looks up the pg's pool and resolves its CRUSH rule with find_rule();
//   - scans *orig for an osd that is a member of `overfull`;
//   - calls crush->try_remap_rule() with the overfull/underfull sets.
//
// NOTE(review): some parameters (including the one named orig), the early
// exits and the return statements are on lines missing from this listing.
4396 bool OSDMap::try_pg_upmap(
4398 pg_t pg
, ///< pg to potentially remap
4399 const set
<int>& overfull
, ///< osds we'd want to evacuate
4400 const vector
<int>& underfull
, ///< osds to move to, in order of preference
4402 vector
<int> *out
) ///< resulting alternative mapping
4404 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
// Resolve the rule from the pool's crush rule and type (further arguments
// are on a missing line).
4407 int rule
= crush
->find_rule(pool
->get_crush_rule(), pool
->get_type(),
4412 // make sure there is something there to remap
4414 for (auto osd
: *orig
) {
4415 if (overfull
.count(osd
)) {
4424 int r
= crush
->try_remap_rule(
4428 overfull
, underfull
,
4438 int OSDMap::calc_pg_upmaps(
4440 float max_deviation_ratio
,
4442 const set
<int64_t>& only_pools
,
4443 OSDMap::Incremental
*pending_inc
)
4445 ldout(cct
, 10) << __func__
<< " pools " << only_pools
<< dendl
;
4447 tmp
.deepish_copy_from(*this);
4448 int num_changed
= 0;
4449 map
<int,set
<pg_t
>> pgs_by_osd
;
4451 float osd_weight_total
= 0;
4452 map
<int,float> osd_weight
;
4453 for (auto& i
: pools
) {
4454 if (!only_pools
.empty() && !only_pools
.count(i
.first
))
4456 for (unsigned ps
= 0; ps
< i
.second
.get_pg_num(); ++ps
) {
4457 pg_t
pg(ps
, i
.first
);
4459 tmp
.pg_to_up_acting_osds(pg
, &up
, nullptr, nullptr, nullptr);
4460 ldout(cct
, 20) << __func__
<< " " << pg
<< " up " << up
<< dendl
;
4461 for (auto osd
: up
) {
4462 if (osd
!= CRUSH_ITEM_NONE
)
4463 pgs_by_osd
[osd
].insert(pg
);
4466 total_pgs
+= i
.second
.get_size() * i
.second
.get_pg_num();
4468 map
<int,float> pmap
;
4469 int ruleno
= tmp
.crush
->find_rule(i
.second
.get_crush_rule(),
4470 i
.second
.get_type(),
4471 i
.second
.get_size());
4472 tmp
.crush
->get_rule_weight_osd_map(ruleno
, &pmap
);
4473 ldout(cct
,20) << __func__
<< " pool " << i
.first
4474 << " ruleno " << ruleno
4475 << " weight-map " << pmap
4477 for (auto p
: pmap
) {
4478 auto adjusted_weight
= tmp
.get_weightf(p
.first
) * p
.second
;
4479 if (adjusted_weight
== 0) {
4482 osd_weight
[p
.first
] += adjusted_weight
;
4483 osd_weight_total
+= adjusted_weight
;
4486 for (auto& i
: osd_weight
) {
4488 auto p
= pgs_by_osd
.find(i
.first
);
4489 if (p
!= pgs_by_osd
.end())
4490 pgs
= p
->second
.size();
4492 pgs_by_osd
.emplace(i
.first
, set
<pg_t
>());
4493 ldout(cct
, 20) << " osd." << i
.first
<< " weight " << i
.second
4494 << " pgs " << pgs
<< dendl
;
4496 if (osd_weight_total
== 0) {
4497 lderr(cct
) << __func__
<< " abort due to osd_weight_total == 0" << dendl
;
4500 float pgs_per_weight
= total_pgs
/ osd_weight_total
;
4501 ldout(cct
, 10) << " osd_weight_total " << osd_weight_total
<< dendl
;
4502 ldout(cct
, 10) << " pgs_per_weight " << pgs_per_weight
<< dendl
;
4505 lderr(cct
) << __func__
<< " abort due to max <= 0" << dendl
;
4508 float decay_factor
= 1.0 / float(max
);
4510 map
<int,float> osd_deviation
; // osd, deviation(pgs)
4511 multimap
<float,int> deviation_osd
; // deviation(pgs), osd
4512 for (auto& i
: pgs_by_osd
) {
4513 // make sure osd is still there (belongs to this crush-tree)
4514 ceph_assert(osd_weight
.count(i
.first
));
4515 float target
= osd_weight
[i
.first
] * pgs_per_weight
;
4516 float deviation
= (float)i
.second
.size() - target
;
4517 ldout(cct
, 20) << " osd." << i
.first
4518 << "\tpgs " << i
.second
.size()
4519 << "\ttarget " << target
4520 << "\tdeviation " << deviation
4522 osd_deviation
[i
.first
] = deviation
;
4523 deviation_osd
.insert(make_pair(deviation
, i
.first
));
4524 stddev
+= deviation
* deviation
;
4526 if (stddev
<= cct
->_conf
.get_val
<double>("osd_calc_pg_upmaps_max_stddev")) {
4527 ldout(cct
, 10) << __func__
<< " distribution is almost perfect"
4531 bool skip_overfull
= false;
4533 cct
->_conf
.get_val
<bool>("osd_calc_pg_upmaps_aggressively");
4534 auto local_fallback_retries
=
4535 cct
->_conf
.get_val
<uint64_t>("osd_calc_pg_upmaps_local_fallback_retries");
4537 // build overfull and underfull
4539 vector
<int> underfull
;
4541 int decay_count
= 0;
4542 while (overfull
.empty()) {
4543 for (auto i
= deviation_osd
.rbegin(); i
!= deviation_osd
.rend(); i
++) {
4544 if (i
->first
>= (1.0 - decay
))
4545 overfull
.insert(i
->second
);
4547 if (!overfull
.empty())
4550 decay
= decay_factor
* decay_count
;
4553 ldout(cct
, 30) << " decay_factor = " << decay_factor
4554 << " decay_count = " << decay_count
4555 << " decay (overfull) = " << decay
4558 if (overfull
.empty()) {
4559 lderr(cct
) << __func__
<< " failed to build overfull" << dendl
;
4565 while (underfull
.empty()) {
4566 for (auto i
= deviation_osd
.begin(); i
!= deviation_osd
.end(); i
++) {
4567 if (i
->first
>= (-.999 + decay
))
4569 underfull
.push_back(i
->second
);
4571 if (!underfull
.empty())
4574 decay
= decay_factor
* decay_count
;
4577 ldout(cct
, 30) << " decay_factor = " << decay_factor
4578 << " decay_count = " << decay_count
4579 << " decay (underfull) = " << decay
4582 if (underfull
.empty()) {
4583 lderr(cct
) << __func__
<< " failed to build underfull" << dendl
;
4587 ldout(cct
, 10) << " overfull " << overfull
4588 << " underfull " << underfull
4591 uint64_t local_fallback_retried
= 0;
4596 map
<pg_t
, mempool::osdmap::vector
<pair
<int32_t,int32_t>>> to_upmap
;
4597 auto temp_pgs_by_osd
= pgs_by_osd
;
4598 // always start with fullest, break if we find any changes to make
4599 for (auto p
= deviation_osd
.rbegin(); p
!= deviation_osd
.rend(); ++p
) {
4600 if (skip_overfull
) {
4601 ldout(cct
, 10) << " skipping overfull " << dendl
;
4602 break; // fall through to check underfull
4604 int osd
= p
->second
;
4605 float deviation
= p
->first
;
4606 float target
= osd_weight
[osd
] * pgs_per_weight
;
4607 ceph_assert(target
> 0);
4608 float deviation_ratio
= deviation
/ target
;
4609 if (deviation_ratio
< max_deviation_ratio
) {
4610 ldout(cct
, 10) << " osd." << osd
4611 << " target " << target
4612 << " deviation " << deviation
4613 << " -> ratio " << deviation_ratio
4614 << " < max ratio " << max_deviation_ratio
4620 pgs
.reserve(pgs_by_osd
[osd
].size());
4621 for (auto& pg
: pgs_by_osd
[osd
]) {
4622 if (to_skip
.count(pg
))
4627 // shuffle PG list so they all get equal (in)attention
4628 std::random_device rd
;
4629 std::default_random_engine rng
{rd()};
4630 std::shuffle(pgs
.begin(), pgs
.end(), rng
);
4632 // look for remaps we can un-remap
4633 for (auto pg
: pgs
) {
4634 auto p
= tmp
.pg_upmap_items
.find(pg
);
4635 if (p
== tmp
.pg_upmap_items
.end())
4637 mempool::osdmap::vector
<pair
<int32_t,int32_t>> new_upmap_items
;
4638 for (auto q
: p
->second
) {
4639 if (q
.second
== osd
) {
4640 ldout(cct
, 10) << " will try dropping existing"
4641 << " remapping pair "
4642 << q
.first
<< " -> " << q
.second
4643 << " which remapped " << pg
4644 << " into overfull osd." << osd
4646 temp_pgs_by_osd
[q
.second
].erase(pg
);
4647 temp_pgs_by_osd
[q
.first
].insert(pg
);
4649 new_upmap_items
.push_back(q
);
4652 if (new_upmap_items
.empty()) {
4654 ldout(cct
, 10) << " existing pg_upmap_items " << p
->second
4655 << " remapped " << pg
<< " into overfull osd." << osd
4656 << ", will try cancelling it entirely"
4658 to_unmap
.insert(pg
);
4660 } else if (new_upmap_items
.size() != p
->second
.size()) {
4661 // drop single remapping pair, updating
4662 ceph_assert(new_upmap_items
.size() < p
->second
.size());
4663 ldout(cct
, 10) << " existing pg_upmap_items " << p
->second
4664 << " remapped " << pg
<< " into overfull osd." << osd
4665 << ", new_pg_upmap_items now " << new_upmap_items
4667 to_upmap
[pg
] = new_upmap_items
;
4673 for (auto pg
: pgs
) {
4674 auto temp_it
= tmp
.pg_upmap
.find(pg
);
4675 if (temp_it
!= tmp
.pg_upmap
.end()) {
4676 // leave pg_upmap alone
4677 // it must be specified by admin since balancer does not
4678 // support pg_upmap yet
4679 ldout(cct
, 10) << " " << pg
<< " already has pg_upmap "
4680 << temp_it
->second
<< ", skipping"
4684 auto pg_pool_size
= tmp
.get_pg_pool_size(pg
);
4685 mempool::osdmap::vector
<pair
<int32_t,int32_t>> new_upmap_items
;
4687 auto it
= tmp
.pg_upmap_items
.find(pg
);
4688 if (it
!= tmp
.pg_upmap_items
.end() &&
4689 it
->second
.size() >= (size_t)pg_pool_size
) {
4690 ldout(cct
, 10) << " " << pg
<< " already has full-size pg_upmap_items "
4691 << it
->second
<< ", skipping"
4694 } else if (it
!= tmp
.pg_upmap_items
.end()) {
4695 ldout(cct
, 10) << " " << pg
<< " already has pg_upmap_items "
4698 new_upmap_items
= it
->second
;
4699 // build existing too (for dedup)
4700 for (auto i
: it
->second
) {
4701 existing
.insert(i
.first
);
4702 existing
.insert(i
.second
);
4705 // to see if we can append more remapping pairs
4707 ldout(cct
, 10) << " trying " << pg
<< dendl
;
4708 vector
<int> raw
, orig
, out
;
4709 tmp
.pg_to_raw_upmap(pg
, &raw
, &orig
); // including existing upmaps too
4710 if (!try_pg_upmap(cct
, pg
, overfull
, underfull
, &orig
, &out
)) {
4713 ldout(cct
, 10) << " " << pg
<< " " << orig
<< " -> " << out
<< dendl
;
4714 if (orig
.size() != out
.size()) {
4717 ceph_assert(orig
!= out
);
4718 for (unsigned i
= 0; i
< out
.size(); ++i
) {
4719 if (orig
[i
] == out
[i
])
4720 continue; // skip invalid remappings
4721 if (existing
.count(orig
[i
]) || existing
.count(out
[i
]))
4722 continue; // we want new remappings only!
4723 ldout(cct
, 10) << " will try adding new remapping pair "
4724 << orig
[i
] << " -> " << out
[i
] << " for " << pg
4726 existing
.insert(orig
[i
]);
4727 existing
.insert(out
[i
]);
4728 temp_pgs_by_osd
[orig
[i
]].erase(pg
);
4729 temp_pgs_by_osd
[out
[i
]].insert(pg
);
4730 ceph_assert(new_upmap_items
.size() < (size_t)pg_pool_size
);
4731 new_upmap_items
.push_back(make_pair(orig
[i
], out
[i
]));
4732 // append new remapping pairs slowly
4733 // This way we can make sure that each tiny change will
4734 // definitely make distribution of PGs converging to
4735 // the perfect status.
4736 to_upmap
[pg
] = new_upmap_items
;
4742 ceph_assert(!(to_unmap
.size() || to_upmap
.size()));
4743 ldout(cct
, 10) << " failed to find any changes for overfull osds"
4745 for (auto& p
: deviation_osd
) {
4746 if (std::find(underfull
.begin(), underfull
.end(), p
.second
) ==
4750 float deviation
= p
.first
;
4751 float target
= osd_weight
[osd
] * pgs_per_weight
;
4752 ceph_assert(target
> 0);
4753 float deviation_ratio
= abs(deviation
/ target
);
4754 if (deviation_ratio
< max_deviation_ratio
) {
4755 // respect max_deviation_ratio too
4756 ldout(cct
, 10) << " osd." << osd
4757 << " target " << target
4758 << " deviation " << deviation
4759 << " -> absolute ratio " << deviation_ratio
4760 << " < max ratio " << max_deviation_ratio
4764 // look for remaps we can un-remap
4766 mempool::osdmap::vector
<pair
<int32_t,int32_t>>>> candidates
;
4767 candidates
.reserve(tmp
.pg_upmap_items
.size());
4768 for (auto& i
: tmp
.pg_upmap_items
) {
4769 if (to_skip
.count(i
.first
))
4771 if (!only_pools
.empty() && !only_pools
.count(i
.first
.pool()))
4773 candidates
.push_back(make_pair(i
.first
, i
.second
));
4776 // shuffle candidates so they all get equal (in)attention
4777 std::random_device rd
;
4778 std::default_random_engine rng
{rd()};
4779 std::shuffle(candidates
.begin(), candidates
.end(), rng
);
4781 for (auto& i
: candidates
) {
4783 mempool::osdmap::vector
<pair
<int32_t,int32_t>> new_upmap_items
;
4784 for (auto& j
: i
.second
) {
4785 if (j
.first
== osd
) {
4786 ldout(cct
, 10) << " will try dropping existing"
4787 << " remapping pair "
4788 << j
.first
<< " -> " << j
.second
4789 << " which remapped " << pg
4790 << " out from underfull osd." << osd
4792 temp_pgs_by_osd
[j
.second
].erase(pg
);
4793 temp_pgs_by_osd
[j
.first
].insert(pg
);
4795 new_upmap_items
.push_back(j
);
4798 if (new_upmap_items
.empty()) {
4800 ldout(cct
, 10) << " existing pg_upmap_items " << i
.second
4801 << " remapped " << pg
4802 << " out from underfull osd." << osd
4803 << ", will try cancelling it entirely"
4805 to_unmap
.insert(pg
);
4807 } else if (new_upmap_items
.size() != i
.second
.size()) {
4808 // drop single remapping pair, updating
4809 ceph_assert(new_upmap_items
.size() < i
.second
.size());
4810 ldout(cct
, 10) << " existing pg_upmap_items " << i
.second
4811 << " remapped " << pg
4812 << " out from underfull osd." << osd
4813 << ", new_pg_upmap_items now " << new_upmap_items
4815 to_upmap
[pg
] = new_upmap_items
;
4821 ceph_assert(!(to_unmap
.size() || to_upmap
.size()));
4822 ldout(cct
, 10) << " failed to find any changes for underfull osds"
4825 ldout(cct
, 10) << " break due to aggressive mode not enabled" << dendl
;
4827 } else if (!skip_overfull
) {
4828 // safe to quit because below here we know
4829 // we've done checking both overfull and underfull osds..
4830 ldout(cct
, 10) << " break due to not being able to find any"
4831 << " further optimizations"
4835 // restart with fullest and do exhaustive searching
4836 skip_overfull
= false;
4841 // test change, apply if change is good
4842 ceph_assert(to_unmap
.size() || to_upmap
.size());
4843 float new_stddev
= 0;
4844 map
<int,float> temp_osd_deviation
;
4845 multimap
<float,int> temp_deviation_osd
;
4846 for (auto& i
: temp_pgs_by_osd
) {
4847 // make sure osd is still there (belongs to this crush-tree)
4848 ceph_assert(osd_weight
.count(i
.first
));
4849 float target
= osd_weight
[i
.first
] * pgs_per_weight
;
4850 float deviation
= (float)i
.second
.size() - target
;
4851 ldout(cct
, 20) << " osd." << i
.first
4852 << "\tpgs " << i
.second
.size()
4853 << "\ttarget " << target
4854 << "\tdeviation " << deviation
4856 temp_osd_deviation
[i
.first
] = deviation
;
4857 temp_deviation_osd
.insert(make_pair(deviation
, i
.first
));
4858 new_stddev
+= deviation
* deviation
;
4860 ldout(cct
, 10) << " stddev " << stddev
<< " -> " << new_stddev
<< dendl
;
4861 if (new_stddev
>= stddev
) {
4863 ldout(cct
, 10) << " break because stddev is not decreasing"
4864 << " and aggressive mode is not enabled"
4868 local_fallback_retried
++;
4869 if (local_fallback_retried
>= local_fallback_retries
) {
4870 // does not make progress
4871 // flip *skip_overfull* so both overfull and underfull
4872 // get equal (in)attention
4873 skip_overfull
= !skip_overfull
;
4874 ldout(cct
, 10) << " hit local_fallback_retries "
4875 << local_fallback_retries
4879 for (auto& i
: to_unmap
)
4881 for (auto& i
: to_upmap
)
4882 to_skip
.insert(i
.first
);
4883 ldout(cct
, 20) << " local_fallback_retried " << local_fallback_retried
4884 << " to_skip " << to_skip
4890 ceph_assert(new_stddev
< stddev
);
4891 stddev
= new_stddev
;
4892 pgs_by_osd
= temp_pgs_by_osd
;
4893 osd_deviation
= temp_osd_deviation
;
4894 deviation_osd
= temp_deviation_osd
;
4895 for (auto& i
: to_unmap
) {
4896 ldout(cct
, 10) << " unmap pg " << i
<< dendl
;
4897 ceph_assert(tmp
.pg_upmap_items
.count(i
));
4898 tmp
.pg_upmap_items
.erase(i
);
4899 pending_inc
->old_pg_upmap_items
.insert(i
);
4902 for (auto& i
: to_upmap
) {
4903 ldout(cct
, 10) << " upmap pg " << i
.first
4904 << " new pg_upmap_items " << i
.second
4906 tmp
.pg_upmap_items
[i
.first
] = i
.second
;
4907 pending_inc
->new_pg_upmap_items
[i
.first
] = i
.second
;
4911 ldout(cct
, 10) << " num_changed = " << num_changed
<< dendl
;
// OSDMap::get_osds_by_bucket_name
//
// Thin forwarder: passes the bucket name and the output set to
// crush->get_leaves() and returns that call's result unchanged.
4915 int OSDMap::get_osds_by_bucket_name(const string
&name
, set
<int> *osds
) const
4917 return crush
->get_leaves(name
, osds
);
// OSDMap::get_pool_ids_by_osd
//
// Visible steps:
//   - asserts that pool_ids is non-null;
//   - asks crush for the rules that involve `osd` (get_rules_by_osd);
//   - a failure message with cpp_strerror(r) is written through lderr, and
//     r >= 0 is asserted;
//   - filters raw_rules with crush_rule_in_use();
//   - calls get_pool_ids_by_rule() for each entry of `rules`.
//
// NOTE(review): the `osd` parameter, the declarations of raw_rules and rules,
// and the statement that fills `rules` are on lines missing from this listing.
4920 // get pools whose crush rules might reference the given osd
4921 void OSDMap::get_pool_ids_by_osd(CephContext
*cct
,
4923 set
<int64_t> *pool_ids
) const
4925 ceph_assert(pool_ids
);
4927 int r
= crush
->get_rules_by_osd(osd
, &raw_rules
);
4929 lderr(cct
) << __func__
<< " get_rules_by_osd failed: " << cpp_strerror(r
)
4931 ceph_assert(r
>= 0);
4934 for (auto &i
: raw_rules
) {
4935 // exclude any dead rule
4936 if (crush_rule_in_use(i
)) {
// Collect pool ids for each rule in `rules`.
4940 for (auto &r
: rules
) {
4941 get_pool_ids_by_rule(r
, pool_ids
);
4945 template <typename F
>
4946 class OSDUtilizationDumper
: public CrushTreeDumper::Dumper
<F
> {
4948 typedef CrushTreeDumper::Dumper
<F
> Parent
;
4950 OSDUtilizationDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap_
,
4951 const PGMap
& pgmap_
, bool tree_
,
4952 const string
& class_name_
,
4953 const string
& item_name_
) :
4954 Parent(crush
, osdmap_
->get_pool_names()),
4958 class_name(class_name_
),
4959 item_name(item_name_
),
4964 if (osdmap
->crush
->name_exists(item_name
)) {
4965 // filter out items we are allowed to dump
4966 auto item_id
= osdmap
->crush
->get_item_id(item_name
);
4967 allowed
.insert(item_id
);
4968 osdmap
->crush
->get_all_children(item_id
, &allowed
);
4970 average_util
= average_utilization();
4975 bool should_dump(int id
) const {
4976 if (!allowed
.empty() && !allowed
.count(id
)) // filter by name
4978 if (id
>= 0 && !class_name
.empty()) {
4979 const char* item_class_name
= osdmap
->crush
->get_item_class(id
);
4980 if (!item_class_name
|| // not bound to a class yet
4981 item_class_name
!= class_name
) // or already bound to
4982 // a different class
4988 set
<int> get_dumped_osds() {
4989 if (class_name
.empty() && item_name
.empty()) {
4996 void dump_stray(F
*f
) {
4997 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
4998 if (osdmap
->exists(i
) && !this->is_touched(i
))
4999 dump_item(CrushTreeDumper::Item(i
, 0, 0, 0), f
);
5003 void dump_item(const CrushTreeDumper::Item
&qi
, F
*f
) override
{
5004 if (!tree
&& qi
.is_bucket())
5006 if (!should_dump(qi
.id
))
5009 if (!qi
.is_bucket())
5010 dumped_osds
.insert(qi
.id
);
5011 float reweight
= qi
.is_bucket() ? -1 : osdmap
->get_weightf(qi
.id
);
5012 int64_t kb
= 0, kb_used
= 0, kb_used_data
= 0, kb_used_omap
= 0,
5013 kb_used_meta
= 0, kb_avail
= 0;
5015 if (get_bucket_utilization(qi
.id
, &kb
, &kb_used
, &kb_used_data
,
5016 &kb_used_omap
, &kb_used_meta
, &kb_avail
))
5018 util
= 100.0 * (double)kb_used
/ (double)kb
;
5022 var
= util
/ average_util
;
5024 size_t num_pgs
= qi
.is_bucket() ? 0 : pgmap
.get_num_pg_by_osd(qi
.id
);
5026 dump_item(qi
, reweight
, kb
, kb_used
,
5027 kb_used_data
, kb_used_omap
, kb_used_meta
,
5028 kb_avail
, util
, var
, num_pgs
, f
);
5030 if (!qi
.is_bucket() && reweight
> 0) {
5031 if (min_var
< 0 || var
< min_var
)
5033 if (max_var
< 0 || var
> max_var
)
5036 double dev
= util
- average_util
;
5038 stddev
+= reweight
* dev
;
5043 virtual void dump_item(const CrushTreeDumper::Item
&qi
,
5047 int64_t kb_used_data
,
5048 int64_t kb_used_omap
,
5049 int64_t kb_used_meta
,
5053 const size_t num_pgs
,
5057 return sum
> 0 ? sqrt(stddev
/ sum
) : 0;
5060 double average_utilization() {
5061 int64_t kb
= 0, kb_used
= 0;
5062 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
5063 if (!osdmap
->exists(i
) ||
5064 osdmap
->get_weight(i
) == 0 ||
5067 int64_t kb_i
, kb_used_i
, kb_used_data_i
, kb_used_omap_i
, kb_used_meta_i
,
5069 if (get_osd_utilization(i
, &kb_i
, &kb_used_i
, &kb_used_data_i
,
5070 &kb_used_omap_i
, &kb_used_meta_i
, &kb_avail_i
)) {
5072 kb_used
+= kb_used_i
;
5075 return kb
> 0 ? 100.0 * (double)kb_used
/ (double)kb
: 0;
5078 bool get_osd_utilization(int id
, int64_t* kb
, int64_t* kb_used
,
5079 int64_t* kb_used_data
,
5080 int64_t* kb_used_omap
,
5081 int64_t* kb_used_meta
,
5082 int64_t* kb_avail
) const {
5083 const osd_stat_t
*p
= pgmap
.get_osd_stat(id
);
5084 if (!p
) return false;
5085 *kb
= p
->statfs
.kb();
5086 *kb_used
= p
->statfs
.kb_used_raw();
5087 *kb_used_data
= p
->statfs
.kb_used_data();
5088 *kb_used_omap
= p
->statfs
.kb_used_omap();
5089 *kb_used_meta
= p
->statfs
.kb_used_internal_metadata();
5090 *kb_avail
= p
->statfs
.kb_avail();
5095 bool get_bucket_utilization(int id
, int64_t* kb
, int64_t* kb_used
,
5096 int64_t* kb_used_data
,
5097 int64_t* kb_used_omap
,
5098 int64_t* kb_used_meta
,
5099 int64_t* kb_avail
) const {
5101 if (osdmap
->is_out(id
) || !should_dump(id
)) {
5110 return get_osd_utilization(id
, kb
, kb_used
, kb_used_data
,
5111 kb_used_omap
, kb_used_meta
, kb_avail
);
5121 for (int k
= osdmap
->crush
->get_bucket_size(id
) - 1; k
>= 0; k
--) {
5122 int item
= osdmap
->crush
->get_bucket_item(id
, k
);
5123 int64_t kb_i
= 0, kb_used_i
= 0, kb_used_data_i
= 0,
5124 kb_used_omap_i
= 0, kb_used_meta_i
= 0, kb_avail_i
= 0;
5125 if (!get_bucket_utilization(item
, &kb_i
, &kb_used_i
,
5126 &kb_used_data_i
, &kb_used_omap_i
,
5127 &kb_used_meta_i
, &kb_avail_i
))
5130 *kb_used
+= kb_used_i
;
5131 *kb_used_data
+= kb_used_data_i
;
5132 *kb_used_omap
+= kb_used_omap_i
;
5133 *kb_used_meta
+= kb_used_meta_i
;
5134 *kb_avail
+= kb_avail_i
;
5140 const OSDMap
*osdmap
;
5143 const string class_name
;
5144 const string item_name
;
5145 double average_util
;
5151 set
<int> dumped_osds
;
5155 class OSDUtilizationPlainDumper
: public OSDUtilizationDumper
<TextTable
> {
5157 typedef OSDUtilizationDumper
<TextTable
> Parent
;
5159 OSDUtilizationPlainDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap
,
5160 const PGMap
& pgmap
, bool tree
,
5161 const string
& class_name
,
5162 const string
& item_name
) :
5163 Parent(crush
, osdmap
, pgmap
, tree
, class_name
, item_name
) {}
5165 void dump(TextTable
*tbl
) {
5166 tbl
->define_column("ID", TextTable::LEFT
, TextTable::RIGHT
);
5167 tbl
->define_column("CLASS", TextTable::LEFT
, TextTable::RIGHT
);
5168 tbl
->define_column("WEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
5169 tbl
->define_column("REWEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
5170 tbl
->define_column("SIZE", TextTable::LEFT
, TextTable::RIGHT
);
5171 tbl
->define_column("RAW USE", TextTable::LEFT
, TextTable::RIGHT
);
5172 tbl
->define_column("DATA", TextTable::LEFT
, TextTable::RIGHT
);
5173 tbl
->define_column("OMAP", TextTable::LEFT
, TextTable::RIGHT
);
5174 tbl
->define_column("META", TextTable::LEFT
, TextTable::RIGHT
);
5175 tbl
->define_column("AVAIL", TextTable::LEFT
, TextTable::RIGHT
);
5176 tbl
->define_column("%USE", TextTable::LEFT
, TextTable::RIGHT
);
5177 tbl
->define_column("VAR", TextTable::LEFT
, TextTable::RIGHT
);
5178 tbl
->define_column("PGS", TextTable::LEFT
, TextTable::RIGHT
);
5179 tbl
->define_column("STATUS", TextTable::LEFT
, TextTable::RIGHT
);
5181 tbl
->define_column("TYPE NAME", TextTable::LEFT
, TextTable::LEFT
);
5187 auto sum
= pgmap
.get_osd_sum(get_dumped_osds());
5191 << byte_u_t(sum
.statfs
.total
)
5192 << byte_u_t(sum
.statfs
.get_used_raw())
5193 << byte_u_t(sum
.statfs
.allocated
)
5194 << byte_u_t(sum
.statfs
.omap_allocated
)
5195 << byte_u_t(sum
.statfs
.internal_metadata
)
5196 << byte_u_t(sum
.statfs
.available
)
5197 << lowprecision_t(average_util
)
5199 << TextTable::endrow
;
5203 struct lowprecision_t
{
5205 explicit lowprecision_t(float _v
) : v(_v
) {}
5207 friend std::ostream
&operator<<(ostream
& out
, const lowprecision_t
& v
);
5209 using OSDUtilizationDumper
<TextTable
>::dump_item
;
5210 void dump_item(const CrushTreeDumper::Item
&qi
,
5214 int64_t kb_used_data
,
5215 int64_t kb_used_omap
,
5216 int64_t kb_used_meta
,
5220 const size_t num_pgs
,
5221 TextTable
*tbl
) override
{
5222 const char *c
= crush
->get_item_class(qi
.id
);
5227 << weightf_t(qi
.weight
)
5228 << weightf_t(reweight
)
5229 << byte_u_t(kb
<< 10)
5230 << byte_u_t(kb_used
<< 10)
5231 << byte_u_t(kb_used_data
<< 10)
5232 << byte_u_t(kb_used_omap
<< 10)
5233 << byte_u_t(kb_used_meta
<< 10)
5234 << byte_u_t(kb_avail
<< 10)
5235 << lowprecision_t(util
)
5236 << lowprecision_t(var
);
5238 if (qi
.is_bucket()) {
5243 if (osdmap
->is_up(qi
.id
)) {
5245 } else if (osdmap
->is_destroyed(qi
.id
)) {
5246 *tbl
<< "destroyed";
5254 for (int k
= 0; k
< qi
.depth
; k
++)
5256 if (qi
.is_bucket()) {
5257 int type
= crush
->get_bucket_type(qi
.id
);
5258 name
<< crush
->get_type_name(type
) << " "
5259 << crush
->get_item_name(qi
.id
);
5261 name
<< "osd." << qi
.id
;
5266 *tbl
<< TextTable::endrow
;
5272 out
<< "MIN/MAX VAR: " << lowprecision_t(min_var
)
5273 << "/" << lowprecision_t(max_var
) << " "
5274 << "STDDEV: " << lowprecision_t(dev());
// Stream inserter for OSDUtilizationPlainDumper::lowprecision_t.
//
// On the visible final path the wrapped value v.v is written in std::fixed
// notation with 2 decimal places, after which the stream's previous
// precision is restored.
//
// NOTE(review): the first branch condition and the bodies of both branches
// (including the one for v.v < 0.001) are on lines missing from this listing.
5279 ostream
& operator<<(ostream
& out
,
5280 const OSDUtilizationPlainDumper::lowprecision_t
& v
)
5284 } else if (v
.v
< 0.001) {
// Save the current precision so it can be put back after printing.
5287 std::streamsize p
= out
.precision();
5288 return out
<< std::fixed
<< std::setprecision(2) << v
.v
<< std::setprecision(p
);
5292 class OSDUtilizationFormatDumper
: public OSDUtilizationDumper
<Formatter
> {
5294 typedef OSDUtilizationDumper
<Formatter
> Parent
;
5296 OSDUtilizationFormatDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap
,
5297 const PGMap
& pgmap
, bool tree
,
5298 const string
& class_name
,
5299 const string
& item_name
) :
5300 Parent(crush
, osdmap
, pgmap
, tree
, class_name
, item_name
) {}
5302 void dump(Formatter
*f
) {
5303 f
->open_array_section("nodes");
5307 f
->open_array_section("stray");
5313 using OSDUtilizationDumper
<Formatter
>::dump_item
;
5314 void dump_item(const CrushTreeDumper::Item
&qi
,
5318 int64_t kb_used_data
,
5319 int64_t kb_used_omap
,
5320 int64_t kb_used_meta
,
5324 const size_t num_pgs
,
5325 Formatter
*f
) override
{
5326 f
->open_object_section("item");
5327 CrushTreeDumper::dump_item_fields(crush
, weight_set_names
, qi
, f
);
5328 f
->dump_float("reweight", reweight
);
5329 f
->dump_int("kb", kb
);
5330 f
->dump_int("kb_used", kb_used
);
5331 f
->dump_int("kb_used_data", kb_used_data
);
5332 f
->dump_int("kb_used_omap", kb_used_omap
);
5333 f
->dump_int("kb_used_meta", kb_used_meta
);
5334 f
->dump_int("kb_avail", kb_avail
);
5335 f
->dump_float("utilization", util
);
5336 f
->dump_float("var", var
);
5337 f
->dump_unsigned("pgs", num_pgs
);
5338 if (!qi
.is_bucket()) {
5339 if (osdmap
->is_up(qi
.id
)) {
5340 f
->dump_string("status", "up");
5341 } else if (osdmap
->is_destroyed(qi
.id
)) {
5342 f
->dump_string("status", "destroyed");
5344 f
->dump_string("status", "down");
5347 CrushTreeDumper::dump_bucket_children(crush
, qi
, f
);
5352 void summary(Formatter
*f
) {
5353 f
->open_object_section("summary");
5354 auto sum
= pgmap
.get_osd_sum(get_dumped_osds());
5355 auto& s
= sum
.statfs
;
5357 f
->dump_int("total_kb", s
.kb());
5358 f
->dump_int("total_kb_used", s
.kb_used_raw());
5359 f
->dump_int("total_kb_used_data", s
.kb_used_data());
5360 f
->dump_int("total_kb_used_omap", s
.kb_used_omap());
5361 f
->dump_int("total_kb_used_meta", s
.kb_used_internal_metadata());
5362 f
->dump_int("total_kb_avail", s
.kb_avail());
5363 f
->dump_float("average_utilization", average_util
);
5364 f
->dump_float("min_var", min_var
);
5365 f
->dump_float("max_var", max_var
);
5366 f
->dump_float("dev", dev());
5371 void print_osd_utilization(const OSDMap
& osdmap
,
5376 const string
& class_name
,
5377 const string
& item_name
)
5379 const CrushWrapper
*crush
= osdmap
.crush
.get();
5381 f
->open_object_section("df");
5382 OSDUtilizationFormatDumper
d(crush
, &osdmap
, pgmap
, tree
,
5383 class_name
, item_name
);
5389 OSDUtilizationPlainDumper
d(crush
, &osdmap
, pgmap
, tree
,
5390 class_name
, item_name
);
5393 out
<< tbl
<< d
.summary() << "\n";
5397 void OSDMap::check_health(health_check_map_t
*checks
) const
5399 int num_osds
= get_num_osds();
5402 // OSD_$subtree_DOWN
5404 if (num_osds
>= 0) {
5405 int num_in_osds
= 0;
5406 int num_down_in_osds
= 0;
5408 set
<int> down_in_osds
;
5409 set
<int> up_in_osds
;
5410 set
<int> subtree_up
;
5411 unordered_map
<int, set
<int> > subtree_type_down
;
5412 unordered_map
<int, int> num_osds_subtree
;
5413 int max_type
= crush
->get_max_type_id();
5415 for (int i
= 0; i
< get_max_osd(); i
++) {
5417 if (crush
->item_exists(i
)) {
5425 if (down_in_osds
.count(i
) || up_in_osds
.count(i
))
5428 down_in_osds
.insert(i
);
5431 for (int type
= 0; type
<= max_type
; type
++) {
5432 if (!crush
->get_type_name(type
))
5434 int r
= crush
->get_immediate_parent_id(current
, &parent_id
);
5437 // break early if this parent is already marked as up
5438 if (subtree_up
.count(parent_id
))
5440 type
= crush
->get_bucket_type(parent_id
);
5441 if (!subtree_type_is_down(
5442 g_ceph_context
, parent_id
, type
,
5443 &down_in_osds
, &up_in_osds
, &subtree_up
, &subtree_type_down
))
5445 current
= parent_id
;
5450 // calculate the number of down osds in each down subtree and
5451 // store it in num_osds_subtree
5452 for (int type
= 1; type
<= max_type
; type
++) {
5453 if (!crush
->get_type_name(type
))
5455 for (auto j
= subtree_type_down
[type
].begin();
5456 j
!= subtree_type_down
[type
].end();
5460 int num_children
= crush
->get_children(*j
, &children
);
5461 if (num_children
== 0)
5463 for (auto l
= children
.begin(); l
!= children
.end(); ++l
) {
5466 } else if (num_osds_subtree
[*l
] > 0) {
5467 num
= num
+ num_osds_subtree
[*l
];
5470 num_osds_subtree
[*j
] = num
;
5473 num_down_in_osds
= down_in_osds
.size();
5474 ceph_assert(num_down_in_osds
<= num_in_osds
);
5475 if (num_down_in_osds
> 0) {
5476 // summary of down subtree types and osds
5477 for (int type
= max_type
; type
> 0; type
--) {
5478 if (!crush
->get_type_name(type
))
5480 if (subtree_type_down
[type
].size() > 0) {
5482 ss
<< subtree_type_down
[type
].size() << " "
5483 << crush
->get_type_name(type
);
5484 if (subtree_type_down
[type
].size() > 1) {
5487 int sum_down_osds
= 0;
5488 for (auto j
= subtree_type_down
[type
].begin();
5489 j
!= subtree_type_down
[type
].end();
5491 sum_down_osds
= sum_down_osds
+ num_osds_subtree
[*j
];
5493 ss
<< " (" << sum_down_osds
<< " osds) down";
5494 string err
= string("OSD_") +
5495 string(crush
->get_type_name(type
)) + "_DOWN";
5496 boost::to_upper(err
);
5497 auto& d
= checks
->add(err
, HEALTH_WARN
, ss
.str());
5498 for (auto j
= subtree_type_down
[type
].rbegin();
5499 j
!= subtree_type_down
[type
].rend();
5502 ss
<< crush
->get_type_name(type
);
5504 ss
<< crush
->get_item_name(*j
);
5505 // at the top level, do not print location
5506 if (type
!= max_type
) {
5508 ss
<< crush
->get_full_location_ordered_string(*j
);
5511 int num
= num_osds_subtree
[*j
];
5512 ss
<< " (" << num
<< " osds)";
5514 d
.detail
.push_back(ss
.str());
5519 ss
<< down_in_osds
.size() << " osds down";
5520 auto& d
= checks
->add("OSD_DOWN", HEALTH_WARN
, ss
.str());
5521 for (auto it
= down_in_osds
.begin(); it
!= down_in_osds
.end(); ++it
) {
5523 ss
<< "osd." << *it
<< " (";
5524 ss
<< crush
->get_full_location_ordered_string(*it
);
5526 d
.detail
.push_back(ss
.str());
5530 if (!osds
.empty()) {
5532 ss
<< osds
.size() << " osds exist in the crush map but not in the osdmap";
5533 auto& d
= checks
->add("OSD_ORPHAN", HEALTH_WARN
, ss
.str());
5534 for (auto osd
: osds
) {
5536 ss
<< "osd." << osd
<< " exists in crush map but not in osdmap";
5537 d
.detail
.push_back(ss
.str());
5542 std::list
<std::string
> scrub_messages
;
5543 bool noscrub
= false, nodeepscrub
= false;
5544 for (const auto &p
: pools
) {
5545 if (p
.second
.flags
& pg_pool_t::FLAG_NOSCRUB
) {
5547 ss
<< "Pool " << get_pool_name(p
.first
) << " has noscrub flag";
5548 scrub_messages
.push_back(ss
.str());
5551 if (p
.second
.flags
& pg_pool_t::FLAG_NODEEP_SCRUB
) {
5553 ss
<< "Pool " << get_pool_name(p
.first
) << " has nodeep-scrub flag";
5554 scrub_messages
.push_back(ss
.str());
5558 if (noscrub
|| nodeepscrub
) {
5560 out
+= noscrub
? string("noscrub") + (nodeepscrub
? ", " : "") : "";
5561 out
+= nodeepscrub
? "nodeep-scrub" : "";
5562 auto& d
= checks
->add("POOL_SCRUB_FLAGS", HEALTH_OK
,
5563 "Some pool(s) have the " + out
+ " flag(s) set");
5564 d
.detail
.splice(d
.detail
.end(), scrub_messages
);
5567 // OSD_OUT_OF_ORDER_FULL
5569 // An osd could configure failsafe ratio, to something different
5570 // but for now assume it is the same here.
5571 float fsr
= g_conf()->osd_failsafe_full_ratio
;
5572 if (fsr
> 1.0) fsr
/= 100;
5573 float fr
= get_full_ratio();
5574 float br
= get_backfillfull_ratio();
5575 float nr
= get_nearfull_ratio();
5577 list
<string
> detail
;
5578 // These checks correspond to how OSDService::check_full_status() in an OSD
5579 // handles the improper setting of these values.
5582 ss
<< "backfillfull_ratio (" << br
5583 << ") < nearfull_ratio (" << nr
<< "), increased";
5584 detail
.push_back(ss
.str());
5589 ss
<< "full_ratio (" << fr
<< ") < backfillfull_ratio (" << br
5591 detail
.push_back(ss
.str());
5596 ss
<< "osd_failsafe_full_ratio (" << fsr
<< ") < full_ratio (" << fr
5598 detail
.push_back(ss
.str());
5600 if (!detail
.empty()) {
5601 auto& d
= checks
->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR
,
5602 "full ratio(s) out of order");
5603 d
.detail
.swap(detail
);
5610 // OSD_FAILSAFE_FULL
5612 set
<int> full
, backfillfull
, nearfull
;
5613 get_full_osd_counts(&full
, &backfillfull
, &nearfull
);
5616 ss
<< full
.size() << " full osd(s)";
5617 auto& d
= checks
->add("OSD_FULL", HEALTH_ERR
, ss
.str());
5618 for (auto& i
: full
) {
5620 ss
<< "osd." << i
<< " is full";
5621 d
.detail
.push_back(ss
.str());
5624 if (backfillfull
.size()) {
5626 ss
<< backfillfull
.size() << " backfillfull osd(s)";
5627 auto& d
= checks
->add("OSD_BACKFILLFULL", HEALTH_WARN
, ss
.str());
5628 for (auto& i
: backfillfull
) {
5630 ss
<< "osd." << i
<< " is backfill full";
5631 d
.detail
.push_back(ss
.str());
5634 if (nearfull
.size()) {
5636 ss
<< nearfull
.size() << " nearfull osd(s)";
5637 auto& d
= checks
->add("OSD_NEARFULL", HEALTH_WARN
, ss
.str());
5638 for (auto& i
: nearfull
) {
5640 ss
<< "osd." << i
<< " is near full";
5641 d
.detail
.push_back(ss
.str());
5649 uint64_t warn_flags
=
5650 CEPH_OSDMAP_NEARFULL
|
5652 CEPH_OSDMAP_PAUSERD
|
5653 CEPH_OSDMAP_PAUSEWR
|
5654 CEPH_OSDMAP_PAUSEREC
|
5656 CEPH_OSDMAP_NODOWN
|
5659 CEPH_OSDMAP_NOBACKFILL
|
5660 CEPH_OSDMAP_NORECOVER
|
5661 CEPH_OSDMAP_NOSCRUB
|
5662 CEPH_OSDMAP_NODEEP_SCRUB
|
5663 CEPH_OSDMAP_NOTIERAGENT
|
5664 CEPH_OSDMAP_NOSNAPTRIM
|
5665 CEPH_OSDMAP_NOREBALANCE
;
5666 if (test_flag(warn_flags
)) {
5668 ss
<< get_flag_string(get_flags() & warn_flags
)
5670 checks
->add("OSDMAP_FLAGS", HEALTH_WARN
, ss
.str());
5676 list
<string
> detail
;
5677 const unsigned flags
=
5682 for (int i
= 0; i
< max_osd
; ++i
) {
5683 if (osd_state
[i
] & flags
) {
5686 OSDMap::calc_state_set(osd_state
[i
] & flags
, states
);
5687 ss
<< "osd." << i
<< " has flags " << states
;
5688 detail
.push_back(ss
.str());
5691 for (auto& i
: crush_node_flags
) {
5692 if (i
.second
&& crush
->item_exists(i
.first
)) {
5695 OSDMap::calc_state_set(i
.second
, states
);
5696 int t
= i
.first
>= 0 ? 0 : crush
->get_bucket_type(i
.first
);
5697 const char *tn
= crush
->get_type_name(t
);
5698 ss
<< (tn
? tn
: "node") << " "
5699 << crush
->get_item_name(i
.first
) << " has flags " << states
;
5700 detail
.push_back(ss
.str());
5703 for (auto& i
: device_class_flags
) {
5704 const char* class_name
= crush
->get_class_name(i
.first
);
5705 if (i
.second
&& class_name
) {
5708 OSDMap::calc_state_set(i
.second
, states
);
5709 ss
<< "device class '" << class_name
<< "' has flags " << states
;
5710 detail
.push_back(ss
.str());
5713 if (!detail
.empty()) {
5715 ss
<< detail
.size() << " OSDs or CRUSH {nodes, device-classes} have {NOUP,NODOWN,NOIN,NOOUT} flags set";
5716 auto& d
= checks
->add("OSD_FLAGS", HEALTH_WARN
, ss
.str());
5717 d
.detail
.swap(detail
);
5721 // OLD_CRUSH_TUNABLES
5722 if (g_conf()->mon_warn_on_legacy_crush_tunables
) {
5723 string min
= crush
->get_min_required_version();
5724 if (min
< g_conf()->mon_crush_min_required_version
) {
5726 ss
<< "crush map has legacy tunables (require " << min
5727 << ", min is " << g_conf()->mon_crush_min_required_version
<< ")";
5728 auto& d
= checks
->add("OLD_CRUSH_TUNABLES", HEALTH_WARN
, ss
.str());
5729 d
.detail
.push_back("see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables");
5733 // OLD_CRUSH_STRAW_CALC_VERSION
5734 if (g_conf()->mon_warn_on_crush_straw_calc_version_zero
) {
5735 if (crush
->get_straw_calc_version() == 0) {
5737 ss
<< "crush map has straw_calc_version=0";
5738 auto& d
= checks
->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN
, ss
.str());
5740 "see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables");
5744 // CACHE_POOL_NO_HIT_SET
5745 if (g_conf()->mon_warn_on_cache_pools_without_hit_sets
) {
5746 list
<string
> detail
;
5747 for (map
<int64_t, pg_pool_t
>::const_iterator p
= pools
.begin();
5750 const pg_pool_t
& info
= p
->second
;
5751 if (info
.cache_mode_requires_hit_set() &&
5752 info
.hit_set_params
.get_type() == HitSet::TYPE_NONE
) {
5754 ss
<< "pool '" << get_pool_name(p
->first
)
5755 << "' with cache_mode " << info
.get_cache_mode_name()
5756 << " needs hit_set_type to be set but it is not";
5757 detail
.push_back(ss
.str());
5760 if (!detail
.empty()) {
5762 ss
<< detail
.size() << " cache pools are missing hit_sets";
5763 auto& d
= checks
->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN
, ss
.str());
5764 d
.detail
.swap(detail
);
5768 // OSD_NO_SORTBITWISE
5769 if (!test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
5771 ss
<< "'sortbitwise' flag is not set";
5772 checks
->add("OSD_NO_SORTBITWISE", HEALTH_WARN
, ss
.str());
5775 // OSD_UPGRADE_FINISHED
5776 // none of these (yet) since we don't run until luminous upgrade is done.
5778 // POOL_NEARFULL/BACKFILLFULL/FULL
5780 list
<string
> full_detail
, backfillfull_detail
, nearfull_detail
;
5781 for (auto it
: get_pools()) {
5782 const pg_pool_t
&pool
= it
.second
;
5783 const string
& pool_name
= get_pool_name(it
.first
);
5784 if (pool
.has_flag(pg_pool_t::FLAG_FULL
)) {
5786 if (pool
.has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
5787 // may run out of space too,
5788 // but we want EQUOTA taking precedence
5789 ss
<< "pool '" << pool_name
<< "' is full (running out of quota)";
5791 ss
<< "pool '" << pool_name
<< "' is full (no space)";
5793 full_detail
.push_back(ss
.str());
5794 } else if (pool
.has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
5796 ss
<< "pool '" << pool_name
<< "' is backfillfull";
5797 backfillfull_detail
.push_back(ss
.str());
5798 } else if (pool
.has_flag(pg_pool_t::FLAG_NEARFULL
)) {
5800 ss
<< "pool '" << pool_name
<< "' is nearfull";
5801 nearfull_detail
.push_back(ss
.str());
5804 if (!full_detail
.empty()) {
5806 ss
<< full_detail
.size() << " pool(s) full";
5807 auto& d
= checks
->add("POOL_FULL", HEALTH_WARN
, ss
.str());
5808 d
.detail
.swap(full_detail
);
5810 if (!backfillfull_detail
.empty()) {
5812 ss
<< backfillfull_detail
.size() << " pool(s) backfillfull";
5813 auto& d
= checks
->add("POOL_BACKFILLFULL", HEALTH_WARN
, ss
.str());
5814 d
.detail
.swap(backfillfull_detail
);
5816 if (!nearfull_detail
.empty()) {
5818 ss
<< nearfull_detail
.size() << " pool(s) nearfull";
5819 auto& d
= checks
->add("POOL_NEARFULL", HEALTH_WARN
, ss
.str());
5820 d
.detail
.swap(nearfull_detail
);
5825 int OSDMap::parse_osd_id_list(const vector
<string
>& ls
, set
<int> *out
,
5829 for (auto i
= ls
.begin(); i
!= ls
.end(); ++i
) {
5830 if (i
== ls
.begin() &&
5831 (*i
== "any" || *i
== "all" || *i
== "*")) {
5835 long osd
= parse_osd_id(i
->c_str(), ss
);
5837 *ss
<< "invalid osd id '" << *i
<< "'";
5845 void OSDMap::get_random_up_osds_by_subtree(int n
, // whoami
5847 int limit
, // how many
5849 set
<int> *want
) const {
5852 int subtree_type
= crush
->get_type_id(subtree
);
5853 if (subtree_type
< 1)
5855 vector
<int> subtrees
;
5856 crush
->get_subtree_of_type(subtree_type
, &subtrees
);
5857 std::random_device rd
;
5858 std::default_random_engine rng
{rd()};
5859 std::shuffle(subtrees
.begin(), subtrees
.end(), rng
);
5860 for (auto s
: subtrees
) {
5863 if (crush
->subtree_contains(s
, n
))
5866 crush
->get_children_of_type(s
, 0, &osds
);
5869 vector
<int> up_osds
;
5870 for (auto o
: osds
) {
5871 if (is_up(o
) && !skip
.count(o
))
5872 up_osds
.push_back(o
);
5874 if (up_osds
.empty())
5876 auto it
= up_osds
.begin();
5877 std::advance(it
, (n
% up_osds
.size()));
5883 float OSDMap::pool_raw_used_rate(int64_t poolid
) const
5885 const pg_pool_t
*pool
= get_pg_pool(poolid
);
5886 assert(pool
!= nullptr);
5888 switch (pool
->get_type()) {
5889 case pg_pool_t::TYPE_REPLICATED
:
5890 return pool
->get_size();
5892 case pg_pool_t::TYPE_ERASURE
:
5895 get_erasure_code_profile(pool
->erasure_code_profile
);
5896 auto pm
= ecp
.find("m");
5897 auto pk
= ecp
.find("k");
5898 if (pm
!= ecp
.end() && pk
!= ecp
.end()) {
5899 int k
= atoi(pk
->second
.c_str());
5900 int m
= atoi(pm
->second
.c_str());
5902 ceph_assert(mk
!= 0);
5903 ceph_assert(k
!= 0);
5904 return (float)mk
/ k
;
5911 ceph_abort_msg("unrecognized pool type");
5915 unsigned OSDMap::get_osd_crush_node_flags(int osd
) const
5918 if (!crush_node_flags
.empty()) {
5919 // the map will contain type -> name
5920 std::map
<std::string
,std::string
> ploc
= crush
->get_full_location(osd
);
5921 for (auto& i
: ploc
) {
5922 int id
= crush
->get_item_id(i
.second
);
5923 auto p
= crush_node_flags
.find(id
);
5924 if (p
!= crush_node_flags
.end()) {
5932 unsigned OSDMap::get_crush_node_flags(int id
) const
5935 auto it
= crush_node_flags
.find(id
);
5936 if (it
!= crush_node_flags
.end())
5941 unsigned OSDMap::get_device_class_flags(int id
) const
5944 auto it
= device_class_flags
.find(id
);
5945 if (it
!= device_class_flags
.end())