1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
18 #include <boost/algorithm/string.hpp>
22 #include "common/config.h"
23 #include "common/errno.h"
24 #include "common/Formatter.h"
25 #include "common/TextTable.h"
26 #include "include/ceph_features.h"
27 #include "include/str_map.h"
29 #include "common/code_environment.h"
30 #include "mon/health_check.h"
32 #include "crush/CrushTreeDumper.h"
33 #include "common/Clock.h"
34 #include "mon/PGStatService.h"
36 #define dout_subsys ceph_subsys_osd
38 MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap
, osdmap
, osdmap
);
39 MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental
, osdmap_inc
, osdmap
);
42 // ----------------------------------
45 void osd_info_t::dump(Formatter
*f
) const
47 f
->dump_int("last_clean_begin", last_clean_begin
);
48 f
->dump_int("last_clean_end", last_clean_end
);
49 f
->dump_int("up_from", up_from
);
50 f
->dump_int("up_thru", up_thru
);
51 f
->dump_int("down_at", down_at
);
52 f
->dump_int("lost_at", lost_at
);
55 void osd_info_t::encode(bufferlist
& bl
) const
58 ::encode(struct_v
, bl
);
59 ::encode(last_clean_begin
, bl
);
60 ::encode(last_clean_end
, bl
);
61 ::encode(up_from
, bl
);
62 ::encode(up_thru
, bl
);
63 ::encode(down_at
, bl
);
64 ::encode(lost_at
, bl
);
67 void osd_info_t::decode(bufferlist::iterator
& bl
)
70 ::decode(struct_v
, bl
);
71 ::decode(last_clean_begin
, bl
);
72 ::decode(last_clean_end
, bl
);
73 ::decode(up_from
, bl
);
74 ::decode(up_thru
, bl
);
75 ::decode(down_at
, bl
);
76 ::decode(lost_at
, bl
);
79 void osd_info_t::generate_test_instances(list
<osd_info_t
*>& o
)
81 o
.push_back(new osd_info_t
);
82 o
.push_back(new osd_info_t
);
83 o
.back()->last_clean_begin
= 1;
84 o
.back()->last_clean_end
= 2;
85 o
.back()->up_from
= 30;
86 o
.back()->up_thru
= 40;
87 o
.back()->down_at
= 5;
88 o
.back()->lost_at
= 6;
91 ostream
& operator<<(ostream
& out
, const osd_info_t
& info
)
93 out
<< "up_from " << info
.up_from
94 << " up_thru " << info
.up_thru
95 << " down_at " << info
.down_at
96 << " last_clean_interval [" << info
.last_clean_begin
<< "," << info
.last_clean_end
<< ")";
98 out
<< " lost_at " << info
.lost_at
;
102 // ----------------------------------
105 void osd_xinfo_t::dump(Formatter
*f
) const
107 f
->dump_stream("down_stamp") << down_stamp
;
108 f
->dump_float("laggy_probability", laggy_probability
);
109 f
->dump_int("laggy_interval", laggy_interval
);
110 f
->dump_int("features", features
);
111 f
->dump_unsigned("old_weight", old_weight
);
114 void osd_xinfo_t::encode(bufferlist
& bl
) const
116 ENCODE_START(3, 1, bl
);
117 ::encode(down_stamp
, bl
);
118 __u32 lp
= laggy_probability
* 0xfffffffful
;
120 ::encode(laggy_interval
, bl
);
121 ::encode(features
, bl
);
122 ::encode(old_weight
, bl
);
126 void osd_xinfo_t::decode(bufferlist::iterator
& bl
)
129 ::decode(down_stamp
, bl
);
132 laggy_probability
= (float)lp
/ (float)0xffffffff;
133 ::decode(laggy_interval
, bl
);
135 ::decode(features
, bl
);
139 ::decode(old_weight
, bl
);
145 void osd_xinfo_t::generate_test_instances(list
<osd_xinfo_t
*>& o
)
147 o
.push_back(new osd_xinfo_t
);
148 o
.push_back(new osd_xinfo_t
);
149 o
.back()->down_stamp
= utime_t(2, 3);
150 o
.back()->laggy_probability
= .123;
151 o
.back()->laggy_interval
= 123456;
152 o
.back()->old_weight
= 0x7fff;
155 ostream
& operator<<(ostream
& out
, const osd_xinfo_t
& xi
)
157 return out
<< "down_stamp " << xi
.down_stamp
158 << " laggy_probability " << xi
.laggy_probability
159 << " laggy_interval " << xi
.laggy_interval
160 << " old_weight " << xi
.old_weight
;
163 // ----------------------------------
164 // OSDMap::Incremental
166 int OSDMap::Incremental::get_net_marked_out(const OSDMap
*previous
) const
169 for (auto &weight
: new_weight
) {
170 if (weight
.second
== CEPH_OSD_OUT
&& !previous
->is_out(weight
.first
))
172 else if (weight
.second
!= CEPH_OSD_OUT
&& previous
->is_out(weight
.first
))
178 int OSDMap::Incremental::get_net_marked_down(const OSDMap
*previous
) const
181 for (auto &state
: new_state
) { //
182 if (state
.second
& CEPH_OSD_UP
) {
183 if (previous
->is_up(state
.first
))
192 int OSDMap::Incremental::identify_osd(uuid_d u
) const
194 for (auto &uuid
: new_uuid
)
195 if (uuid
.second
== u
)
200 int OSDMap::Incremental::propagate_snaps_to_tiers(CephContext
*cct
,
201 const OSDMap
& osdmap
)
203 assert(epoch
== osdmap
.get_epoch() + 1);
205 for (auto &new_pool
: new_pools
) {
206 if (!new_pool
.second
.tiers
.empty()) {
207 pg_pool_t
& base
= new_pool
.second
;
209 for (const auto &tier_pool
: base
.tiers
) {
210 const auto &r
= new_pools
.find(tier_pool
);
212 if (r
== new_pools
.end()) {
213 const pg_pool_t
*orig
= osdmap
.get_pg_pool(tier_pool
);
215 lderr(cct
) << __func__
<< " no pool " << tier_pool
<< dendl
;
218 tier
= get_new_pool(tier_pool
, orig
);
222 if (tier
->tier_of
!= new_pool
.first
) {
223 lderr(cct
) << __func__
<< " " << r
->first
<< " tier_of != " << new_pool
.first
<< dendl
;
227 ldout(cct
, 10) << __func__
<< " from " << new_pool
.first
<< " to "
228 << tier_pool
<< dendl
;
229 tier
->snap_seq
= base
.snap_seq
;
230 tier
->snap_epoch
= base
.snap_epoch
;
231 tier
->snaps
= base
.snaps
;
232 tier
->removed_snaps
= base
.removed_snaps
;
239 // ----------------------------------
242 bool OSDMap::subtree_is_down(int id
, set
<int> *down_cache
) const
248 down_cache
->count(id
)) {
253 crush
->get_children(id
, &children
);
254 for (const auto &child
: children
) {
255 if (!subtree_is_down(child
, down_cache
)) {
260 down_cache
->insert(id
);
265 bool OSDMap::containing_subtree_is_down(CephContext
*cct
, int id
, int subtree_type
, set
<int> *down_cache
) const
267 // use a stack-local down_cache if we didn't get one from the
268 // caller. then at least this particular call will avoid duplicated
270 set
<int> local_down_cache
;
272 down_cache
= &local_down_cache
;
281 type
= crush
->get_bucket_type(current
);
285 if (!subtree_is_down(current
, down_cache
)) {
286 ldout(cct
, 30) << "containing_subtree_is_down(" << id
<< ") = false" << dendl
;
290 // is this a big enough subtree to be marked as down?
291 if (type
>= subtree_type
) {
292 ldout(cct
, 30) << "containing_subtree_is_down(" << id
<< ") = true ... " << type
<< " >= " << subtree_type
<< dendl
;
296 int r
= crush
->get_immediate_parent_id(current
, ¤t
);
303 bool OSDMap::subtree_type_is_down(
307 set
<int> *down_in_osds
,
308 set
<int> *up_in_osds
,
309 set
<int> *subtree_up
,
310 unordered_map
<int, set
<int> > *subtree_type_down
) const
313 bool is_down_ret
= is_down(id
);
316 down_in_osds
->insert(id
);
318 up_in_osds
->insert(id
);
324 if (subtree_type_down
&&
325 (*subtree_type_down
)[subtree_type
].count(id
)) {
330 crush
->get_children(id
, &children
);
331 for (const auto &child
: children
) {
332 if (!subtree_type_is_down(
333 cct
, child
, crush
->get_bucket_type(child
),
334 down_in_osds
, up_in_osds
, subtree_up
, subtree_type_down
)) {
335 subtree_up
->insert(id
);
339 if (subtree_type_down
) {
340 (*subtree_type_down
)[subtree_type
].insert(id
);
345 void OSDMap::Incremental::encode_client_old(bufferlist
& bl
) const
351 ::encode(modified
, bl
);
352 int32_t new_t
= new_pool_max
;
354 ::encode(new_flags
, bl
);
355 ::encode(fullmap
, bl
);
358 ::encode(new_max_osd
, bl
);
359 // for ::encode(new_pools, bl);
360 __u32 n
= new_pools
.size();
362 for (const auto &new_pool
: new_pools
) {
365 ::encode(new_pool
.second
, bl
, 0);
367 // for ::encode(new_pool_names, bl);
368 n
= new_pool_names
.size();
371 for (const auto &new_pool_name
: new_pool_names
) {
372 n
= new_pool_name
.first
;
374 ::encode(new_pool_name
.second
, bl
);
376 // for ::encode(old_pools, bl);
377 n
= old_pools
.size();
379 for (auto &old_pool
: old_pools
) {
383 ::encode(new_up_client
, bl
, 0);
385 // legacy is map<int32_t,uint8_t>
386 uint32_t n
= new_state
.size();
388 for (auto p
: new_state
) {
389 ::encode(p
.first
, bl
);
390 ::encode((uint8_t)p
.second
, bl
);
393 ::encode(new_weight
, bl
);
394 // for ::encode(new_pg_temp, bl);
395 n
= new_pg_temp
.size();
398 for (const auto &pg_temp
: new_pg_temp
) {
399 old_pg_t opg
= pg_temp
.first
.get_old_pg();
401 ::encode(pg_temp
.second
, bl
);
405 void OSDMap::Incremental::encode_classic(bufferlist
& bl
, uint64_t features
) const
407 if ((features
& CEPH_FEATURE_PGID64
) == 0) {
408 encode_client_old(bl
);
417 ::encode(modified
, bl
);
418 ::encode(new_pool_max
, bl
);
419 ::encode(new_flags
, bl
);
420 ::encode(fullmap
, bl
);
423 ::encode(new_max_osd
, bl
);
424 ::encode(new_pools
, bl
, features
);
425 ::encode(new_pool_names
, bl
);
426 ::encode(old_pools
, bl
);
427 ::encode(new_up_client
, bl
, features
);
429 uint32_t n
= new_state
.size();
431 for (auto p
: new_state
) {
432 ::encode(p
.first
, bl
);
433 ::encode((uint8_t)p
.second
, bl
);
436 ::encode(new_weight
, bl
);
437 ::encode(new_pg_temp
, bl
);
442 ::encode(new_hb_back_up
, bl
, features
);
443 ::encode(new_up_thru
, bl
);
444 ::encode(new_last_clean_interval
, bl
);
445 ::encode(new_lost
, bl
);
446 ::encode(new_blacklist
, bl
, features
);
447 ::encode(old_blacklist
, bl
, features
);
448 ::encode(new_up_cluster
, bl
, features
);
449 ::encode(cluster_snapshot
, bl
);
450 ::encode(new_uuid
, bl
);
451 ::encode(new_xinfo
, bl
);
452 ::encode(new_hb_front_up
, bl
, features
);
455 void OSDMap::Incremental::encode(bufferlist
& bl
, uint64_t features
) const
457 if ((features
& CEPH_FEATURE_OSDMAP_ENC
) == 0) {
458 encode_classic(bl
, features
);
462 // only a select set of callers should *ever* be encoding new
463 // OSDMaps. others should be passing around the canonical encoded
464 // buffers from on high. select out those callers by passing in an
465 // "impossible" feature bit.
466 assert(features
& CEPH_FEATURE_RESERVED
);
467 features
&= ~CEPH_FEATURE_RESERVED
;
469 size_t start_offset
= bl
.length();
471 buffer::list::iterator crc_it
;
473 // meta-encoding: how we include client-used and osd-specific data
474 ENCODE_START(8, 7, bl
);
478 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
481 ENCODE_START(v
, 1, bl
); // client-usable data
484 ::encode(modified
, bl
);
485 ::encode(new_pool_max
, bl
);
486 ::encode(new_flags
, bl
);
487 ::encode(fullmap
, bl
);
490 ::encode(new_max_osd
, bl
);
491 ::encode(new_pools
, bl
, features
);
492 ::encode(new_pool_names
, bl
);
493 ::encode(old_pools
, bl
);
494 ::encode(new_up_client
, bl
, features
);
496 ::encode(new_state
, bl
);
498 uint32_t n
= new_state
.size();
500 for (auto p
: new_state
) {
501 ::encode(p
.first
, bl
);
502 ::encode((uint8_t)p
.second
, bl
);
505 ::encode(new_weight
, bl
);
506 ::encode(new_pg_temp
, bl
);
507 ::encode(new_primary_temp
, bl
);
508 ::encode(new_primary_affinity
, bl
);
509 ::encode(new_erasure_code_profiles
, bl
);
510 ::encode(old_erasure_code_profiles
, bl
);
512 ::encode(new_pg_upmap
, bl
);
513 ::encode(old_pg_upmap
, bl
);
514 ::encode(new_pg_upmap_items
, bl
);
515 ::encode(old_pg_upmap_items
, bl
);
517 ENCODE_FINISH(bl
); // client-usable data
521 uint8_t target_v
= 6;
522 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
525 ENCODE_START(target_v
, 1, bl
); // extended, osd-only data
526 ::encode(new_hb_back_up
, bl
, features
);
527 ::encode(new_up_thru
, bl
);
528 ::encode(new_last_clean_interval
, bl
);
529 ::encode(new_lost
, bl
);
530 ::encode(new_blacklist
, bl
, features
);
531 ::encode(old_blacklist
, bl
, features
);
532 ::encode(new_up_cluster
, bl
, features
);
533 ::encode(cluster_snapshot
, bl
);
534 ::encode(new_uuid
, bl
);
535 ::encode(new_xinfo
, bl
);
536 ::encode(new_hb_front_up
, bl
, features
);
537 ::encode(features
, bl
); // NOTE: features arg, not the member
539 ::encode(new_nearfull_ratio
, bl
);
540 ::encode(new_full_ratio
, bl
);
541 ::encode(new_backfillfull_ratio
, bl
);
543 // 5 was string-based new_require_min_compat_client
545 ::encode(new_require_min_compat_client
, bl
);
546 ::encode(new_require_osd_release
, bl
);
548 ENCODE_FINISH(bl
); // osd-only data
551 ::encode((uint32_t)0, bl
); // dummy inc_crc
554 tail_offset
= bl
.length();
556 ::encode(full_crc
, bl
);
558 ENCODE_FINISH(bl
); // meta-encoding wrapper
562 front
.substr_of(bl
, start_offset
, crc_it
.get_off() - start_offset
);
563 inc_crc
= front
.crc32c(-1);
565 tail
.substr_of(bl
, tail_offset
, bl
.length() - tail_offset
);
566 inc_crc
= tail
.crc32c(inc_crc
);
569 crc_it
.copy_in(4, (char*)&crc_le
);
573 void OSDMap::Incremental::decode_classic(bufferlist::iterator
&p
)
581 ::decode(modified
, p
);
582 if (v
== 4 || v
== 5) {
586 ::decode(new_pool_max
, p
);
587 ::decode(new_flags
, p
);
588 ::decode(fullmap
, p
);
591 ::decode(new_max_osd
, p
);
597 ::decode(new_pools
[t
], p
);
600 ::decode(new_pools
, p
);
603 new_pool_names
.clear();
607 ::decode(new_pool_names
[t
], p
);
610 ::decode(new_pool_names
, p
);
620 ::decode(old_pools
, p
);
622 ::decode(new_up_client
, p
);
624 map
<int32_t,uint8_t> ns
;
627 new_state
[q
.first
] = q
.second
;
630 ::decode(new_weight
, p
);
637 ::decode_raw(opg
, p
);
638 ::decode(new_pg_temp
[pg_t(opg
)], p
);
641 ::decode(new_pg_temp
, p
);
644 // decode short map, too.
645 if (v
== 5 && p
.end())
652 ::decode(new_hb_back_up
, p
);
654 ::decode(new_pool_names
, p
);
655 ::decode(new_up_thru
, p
);
656 ::decode(new_last_clean_interval
, p
);
657 ::decode(new_lost
, p
);
658 ::decode(new_blacklist
, p
);
659 ::decode(old_blacklist
, p
);
661 ::decode(new_up_cluster
, p
);
663 ::decode(cluster_snapshot
, p
);
665 ::decode(new_uuid
, p
);
667 ::decode(new_xinfo
, p
);
669 ::decode(new_hb_front_up
, p
);
672 void OSDMap::Incremental::decode(bufferlist::iterator
& bl
)
675 * Older encodings of the Incremental had a single struct_v which
676 * covered the whole encoding, and was prior to our modern
677 * stuff which includes a compatv and a size. So if we see
678 * a struct_v < 7, we must rewind to the beginning and use our
681 size_t start_offset
= bl
.get_off();
682 size_t tail_offset
= 0;
683 bufferlist crc_front
, crc_tail
;
685 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl
); // wrapper
687 int struct_v_size
= sizeof(struct_v
);
688 bl
.advance(-struct_v_size
);
692 encode_features
= CEPH_FEATURE_PGID64
;
698 DECODE_START(5, bl
); // client-usable data
701 ::decode(modified
, bl
);
702 ::decode(new_pool_max
, bl
);
703 ::decode(new_flags
, bl
);
704 ::decode(fullmap
, bl
);
707 ::decode(new_max_osd
, bl
);
708 ::decode(new_pools
, bl
);
709 ::decode(new_pool_names
, bl
);
710 ::decode(old_pools
, bl
);
711 ::decode(new_up_client
, bl
);
713 ::decode(new_state
, bl
);
715 map
<int32_t,uint8_t> ns
;
718 new_state
[q
.first
] = q
.second
;
721 ::decode(new_weight
, bl
);
722 ::decode(new_pg_temp
, bl
);
723 ::decode(new_primary_temp
, bl
);
725 ::decode(new_primary_affinity
, bl
);
727 new_primary_affinity
.clear();
729 ::decode(new_erasure_code_profiles
, bl
);
730 ::decode(old_erasure_code_profiles
, bl
);
732 new_erasure_code_profiles
.clear();
733 old_erasure_code_profiles
.clear();
736 ::decode(new_pg_upmap
, bl
);
737 ::decode(old_pg_upmap
, bl
);
738 ::decode(new_pg_upmap_items
, bl
);
739 ::decode(old_pg_upmap_items
, bl
);
741 DECODE_FINISH(bl
); // client-usable data
745 DECODE_START(6, bl
); // extended, osd-only data
746 ::decode(new_hb_back_up
, bl
);
747 ::decode(new_up_thru
, bl
);
748 ::decode(new_last_clean_interval
, bl
);
749 ::decode(new_lost
, bl
);
750 ::decode(new_blacklist
, bl
);
751 ::decode(old_blacklist
, bl
);
752 ::decode(new_up_cluster
, bl
);
753 ::decode(cluster_snapshot
, bl
);
754 ::decode(new_uuid
, bl
);
755 ::decode(new_xinfo
, bl
);
756 ::decode(new_hb_front_up
, bl
);
758 ::decode(encode_features
, bl
);
760 encode_features
= CEPH_FEATURE_PGID64
| CEPH_FEATURE_OSDMAP_ENC
;
762 ::decode(new_nearfull_ratio
, bl
);
763 ::decode(new_full_ratio
, bl
);
765 new_nearfull_ratio
= -1;
769 ::decode(new_backfillfull_ratio
, bl
);
771 new_backfillfull_ratio
= -1;
777 new_require_min_compat_client
= ceph_release_from_name(r
.c_str());
781 ::decode(new_require_min_compat_client
, bl
);
782 ::decode(new_require_osd_release
, bl
);
784 if (new_flags
>= 0 && (new_flags
& CEPH_OSDMAP_REQUIRE_LUMINOUS
)) {
785 // only for compat with post-kraken pre-luminous test clusters
786 new_require_osd_release
= CEPH_RELEASE_LUMINOUS
;
787 new_flags
&= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS
);
788 } else if (new_flags
>= 0 && (new_flags
& CEPH_OSDMAP_REQUIRE_KRAKEN
)) {
789 new_require_osd_release
= CEPH_RELEASE_KRAKEN
;
790 } else if (new_flags
>= 0 && (new_flags
& CEPH_OSDMAP_REQUIRE_JEWEL
)) {
791 new_require_osd_release
= CEPH_RELEASE_JEWEL
;
793 new_require_osd_release
= -1;
796 DECODE_FINISH(bl
); // osd-only data
801 crc_front
.substr_of(bl
.get_bl(), start_offset
, bl
.get_off() - start_offset
);
802 ::decode(inc_crc
, bl
);
803 tail_offset
= bl
.get_off();
804 ::decode(full_crc
, bl
);
811 DECODE_FINISH(bl
); // wrapper
815 uint32_t actual
= crc_front
.crc32c(-1);
816 if (tail_offset
< bl
.get_off()) {
818 tail
.substr_of(bl
.get_bl(), tail_offset
, bl
.get_off() - tail_offset
);
819 actual
= tail
.crc32c(actual
);
821 if (inc_crc
!= actual
) {
823 ss
<< "bad crc, actual " << actual
<< " != expected " << inc_crc
;
825 throw buffer::malformed_input(s
.c_str());
830 void OSDMap::Incremental::dump(Formatter
*f
) const
832 f
->dump_int("epoch", epoch
);
833 f
->dump_stream("fsid") << fsid
;
834 f
->dump_stream("modified") << modified
;
835 f
->dump_int("new_pool_max", new_pool_max
);
836 f
->dump_int("new_flags", new_flags
);
837 f
->dump_float("new_full_ratio", new_full_ratio
);
838 f
->dump_float("new_nearfull_ratio", new_nearfull_ratio
);
839 f
->dump_float("new_backfillfull_ratio", new_backfillfull_ratio
);
840 f
->dump_int("new_require_min_compat_client", new_require_min_compat_client
);
841 f
->dump_int("new_require_osd_release", new_require_osd_release
);
843 if (fullmap
.length()) {
844 f
->open_object_section("full_map");
846 bufferlist fbl
= fullmap
; // kludge around constness.
847 auto p
= fbl
.begin();
852 if (crush
.length()) {
853 f
->open_object_section("crush");
855 bufferlist tbl
= crush
; // kludge around constness.
856 auto p
= tbl
.begin();
862 f
->dump_int("new_max_osd", new_max_osd
);
864 f
->open_array_section("new_pools");
866 for (const auto &new_pool
: new_pools
) {
867 f
->open_object_section("pool");
868 f
->dump_int("pool", new_pool
.first
);
869 new_pool
.second
.dump(f
);
873 f
->open_array_section("new_pool_names");
875 for (const auto &new_pool_name
: new_pool_names
) {
876 f
->open_object_section("pool_name");
877 f
->dump_int("pool", new_pool_name
.first
);
878 f
->dump_string("name", new_pool_name
.second
);
882 f
->open_array_section("old_pools");
884 for (const auto &old_pool
: old_pools
)
885 f
->dump_int("pool", old_pool
);
888 f
->open_array_section("new_up_osds");
890 for (const auto &upclient
: new_up_client
) {
891 f
->open_object_section("osd");
892 f
->dump_int("osd", upclient
.first
);
893 f
->dump_stream("public_addr") << upclient
.second
;
894 f
->dump_stream("cluster_addr") << new_up_cluster
.find(upclient
.first
)->second
;
895 f
->dump_stream("heartbeat_back_addr") << new_hb_back_up
.find(upclient
.first
)->second
;
896 map
<int32_t, entity_addr_t
>::const_iterator q
;
897 if ((q
= new_hb_front_up
.find(upclient
.first
)) != new_hb_front_up
.end())
898 f
->dump_stream("heartbeat_front_addr") << q
->second
;
903 f
->open_array_section("new_weight");
905 for (const auto &weight
: new_weight
) {
906 f
->open_object_section("osd");
907 f
->dump_int("osd", weight
.first
);
908 f
->dump_int("weight", weight
.second
);
913 f
->open_array_section("osd_state_xor");
914 for (const auto &ns
: new_state
) {
915 f
->open_object_section("osd");
916 f
->dump_int("osd", ns
.first
);
918 calc_state_set(new_state
.find(ns
.first
)->second
, st
);
919 f
->open_array_section("state_xor");
920 for (auto &state
: st
)
921 f
->dump_string("state", state
);
927 f
->open_array_section("new_pg_temp");
929 for (const auto &pg_temp
: new_pg_temp
) {
930 f
->open_object_section("pg");
931 f
->dump_stream("pgid") << pg_temp
.first
;
932 f
->open_array_section("osds");
934 for (const auto &osd
: pg_temp
.second
)
935 f
->dump_int("osd", osd
);
941 f
->open_array_section("primary_temp");
943 for (const auto &primary_temp
: new_primary_temp
) {
944 f
->dump_stream("pgid") << primary_temp
.first
;
945 f
->dump_int("osd", primary_temp
.second
);
947 f
->close_section(); // primary_temp
949 f
->open_array_section("new_pg_upmap");
950 for (auto& i
: new_pg_upmap
) {
951 f
->open_object_section("mapping");
952 f
->dump_stream("pgid") << i
.first
;
953 f
->open_array_section("osds");
954 for (auto osd
: i
.second
) {
955 f
->dump_int("osd", osd
);
961 f
->open_array_section("old_pg_upmap");
962 for (auto& i
: old_pg_upmap
) {
963 f
->dump_stream("pgid") << i
;
967 f
->open_array_section("new_pg_upmap_items");
968 for (auto& i
: new_pg_upmap_items
) {
969 f
->open_object_section("mapping");
970 f
->dump_stream("pgid") << i
.first
;
971 f
->open_array_section("mappings");
972 for (auto& p
: i
.second
) {
973 f
->open_object_section("mapping");
974 f
->dump_int("from", p
.first
);
975 f
->dump_int("to", p
.second
);
982 f
->open_array_section("old_pg_upmap_items");
983 for (auto& i
: old_pg_upmap_items
) {
984 f
->dump_stream("pgid") << i
;
988 f
->open_array_section("new_up_thru");
990 for (const auto &up_thru
: new_up_thru
) {
991 f
->open_object_section("osd");
992 f
->dump_int("osd", up_thru
.first
);
993 f
->dump_int("up_thru", up_thru
.second
);
998 f
->open_array_section("new_lost");
1000 for (const auto &lost
: new_lost
) {
1001 f
->open_object_section("osd");
1002 f
->dump_int("osd", lost
.first
);
1003 f
->dump_int("epoch_lost", lost
.second
);
1008 f
->open_array_section("new_last_clean_interval");
1010 for (const auto &last_clean_interval
: new_last_clean_interval
) {
1011 f
->open_object_section("osd");
1012 f
->dump_int("osd", last_clean_interval
.first
);
1013 f
->dump_int("first", last_clean_interval
.second
.first
);
1014 f
->dump_int("last", last_clean_interval
.second
.second
);
1019 f
->open_array_section("new_blacklist");
1020 for (const auto &blist
: new_blacklist
) {
1023 f
->dump_stream(ss
.str().c_str()) << blist
.second
;
1026 f
->open_array_section("old_blacklist");
1027 for (const auto &blist
: old_blacklist
)
1028 f
->dump_stream("addr") << blist
;
1031 f
->open_array_section("new_xinfo");
1032 for (const auto &xinfo
: new_xinfo
) {
1033 f
->open_object_section("xinfo");
1034 f
->dump_int("osd", xinfo
.first
);
1035 xinfo
.second
.dump(f
);
1040 if (cluster_snapshot
.size())
1041 f
->dump_string("cluster_snapshot", cluster_snapshot
);
1043 f
->open_array_section("new_uuid");
1044 for (const auto &uuid
: new_uuid
) {
1045 f
->open_object_section("osd");
1046 f
->dump_int("osd", uuid
.first
);
1047 f
->dump_stream("uuid") << uuid
.second
;
1052 OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles
, f
);
1053 f
->open_array_section("old_erasure_code_profiles");
1054 for (const auto &erasure_code_profile
: old_erasure_code_profiles
) {
1055 f
->dump_string("old", erasure_code_profile
.c_str());
1060 void OSDMap::Incremental::generate_test_instances(list
<Incremental
*>& o
)
1062 o
.push_back(new Incremental
);
1065 // ----------------------------------
1068 void OSDMap::set_epoch(epoch_t e
)
1071 for (auto &pool
: pools
)
1072 pool
.second
.last_change
= e
;
1075 bool OSDMap::is_blacklisted(const entity_addr_t
& a
) const
1077 if (blacklist
.empty())
1080 // this specific instance?
1081 if (blacklist
.count(a
))
1084 // is entire ip blacklisted?
1086 entity_addr_t b
= a
;
1089 if (blacklist
.count(b
)) {
1097 void OSDMap::get_blacklist(list
<pair
<entity_addr_t
,utime_t
> > *bl
) const
1099 std::copy(blacklist
.begin(), blacklist
.end(), std::back_inserter(*bl
));
1102 void OSDMap::get_blacklist(std::set
<entity_addr_t
> *bl
) const
1104 for (const auto &i
: blacklist
) {
1105 bl
->insert(i
.first
);
1109 void OSDMap::set_max_osd(int m
)
1113 osd_state
.resize(m
);
1114 osd_weight
.resize(m
);
1115 for (; o
<max_osd
; o
++) {
1117 osd_weight
[o
] = CEPH_OSD_OUT
;
1120 osd_xinfo
.resize(m
);
1121 osd_addrs
->client_addr
.resize(m
);
1122 osd_addrs
->cluster_addr
.resize(m
);
1123 osd_addrs
->hb_back_addr
.resize(m
);
1124 osd_addrs
->hb_front_addr
.resize(m
);
1125 osd_uuid
->resize(m
);
1126 if (osd_primary_affinity
)
1127 osd_primary_affinity
->resize(m
, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
);
1132 int OSDMap::calc_num_osds()
1137 for (int i
=0; i
<max_osd
; i
++) {
1138 if (osd_state
[i
] & CEPH_OSD_EXISTS
) {
1140 if (osd_state
[i
] & CEPH_OSD_UP
) {
1143 if (get_weight(i
) != CEPH_OSD_OUT
) {
1151 void OSDMap::get_full_pools(CephContext
*cct
,
1153 set
<int64_t> *backfillfull
,
1154 set
<int64_t> *nearfull
) const
1157 assert(backfillfull
);
1160 backfillfull
->clear();
1163 vector
<int> full_osds
;
1164 vector
<int> backfillfull_osds
;
1165 vector
<int> nearfull_osds
;
1166 for (int i
= 0; i
< max_osd
; ++i
) {
1167 if (exists(i
) && is_up(i
) && is_in(i
)) {
1168 if (osd_state
[i
] & CEPH_OSD_FULL
)
1169 full_osds
.push_back(i
);
1170 else if (osd_state
[i
] & CEPH_OSD_BACKFILLFULL
)
1171 backfillfull_osds
.push_back(i
);
1172 else if (osd_state
[i
] & CEPH_OSD_NEARFULL
)
1173 nearfull_osds
.push_back(i
);
1177 for (auto i
: full_osds
) {
1178 get_pool_ids_by_osd(cct
, i
, full
);
1180 for (auto i
: backfillfull_osds
) {
1181 get_pool_ids_by_osd(cct
, i
, backfillfull
);
1183 for (auto i
: nearfull_osds
) {
1184 get_pool_ids_by_osd(cct
, i
, nearfull
);
1188 static bool get_osd_utilization(
1189 const mempool::pgmap::unordered_map
<int32_t,osd_stat_t
> &osd_stat
,
1190 int id
, int64_t* kb
, int64_t* kb_used
, int64_t* kb_avail
)
1192 auto p
= osd_stat
.find(id
);
1193 if (p
== osd_stat
.end())
1196 *kb_used
= p
->second
.kb_used
;
1197 *kb_avail
= p
->second
.kb_avail
;
1201 void OSDMap::get_full_osd_util(
1202 const mempool::pgmap::unordered_map
<int32_t,osd_stat_t
> &osd_stat
,
1203 map
<int, float> *full
, map
<int, float> *backfill
, map
<int, float> *nearfull
) const
1208 for (int i
= 0; i
< max_osd
; ++i
) {
1209 if (exists(i
) && is_up(i
) && is_in(i
)) {
1210 int64_t kb
, kb_used
, kb_avail
;
1211 if (osd_state
[i
] & CEPH_OSD_FULL
) {
1212 if (get_osd_utilization(osd_stat
, i
, &kb
, &kb_used
, &kb_avail
))
1213 full
->emplace(i
, (float)kb_used
/ (float)kb
);
1214 } else if (osd_state
[i
] & CEPH_OSD_BACKFILLFULL
) {
1215 if (get_osd_utilization(osd_stat
, i
, &kb
, &kb_used
, &kb_avail
))
1216 backfill
->emplace(i
, (float)kb_used
/ (float)kb
);
1217 } else if (osd_state
[i
] & CEPH_OSD_NEARFULL
) {
1218 if (get_osd_utilization(osd_stat
, i
, &kb
, &kb_used
, &kb_avail
))
1219 nearfull
->emplace(i
, (float)kb_used
/ (float)kb
);
1225 void OSDMap::get_full_osd_counts(set
<int> *full
, set
<int> *backfill
,
1226 set
<int> *nearfull
) const
1231 for (int i
= 0; i
< max_osd
; ++i
) {
1232 if (exists(i
) && is_up(i
) && is_in(i
)) {
1233 if (osd_state
[i
] & CEPH_OSD_FULL
)
1235 else if (osd_state
[i
] & CEPH_OSD_BACKFILLFULL
)
1236 backfill
->emplace(i
);
1237 else if (osd_state
[i
] & CEPH_OSD_NEARFULL
)
1238 nearfull
->emplace(i
);
1243 void OSDMap::get_all_osds(set
<int32_t>& ls
) const
1245 for (int i
=0; i
<max_osd
; i
++)
1250 void OSDMap::get_up_osds(set
<int32_t>& ls
) const
1252 for (int i
= 0; i
< max_osd
; i
++) {
1258 void OSDMap::get_out_osds(set
<int32_t>& ls
) const
1260 for (int i
= 0; i
< max_osd
; i
++) {
1266 void OSDMap::calc_state_set(int state
, set
<string
>& st
)
1269 for (unsigned s
= 1; t
; s
<<= 1) {
1272 st
.insert(ceph_osd_state_name(s
));
1277 void OSDMap::adjust_osd_weights(const map
<int,double>& weights
, Incremental
& inc
) const
1280 for (const auto &weight
: weights
) {
1281 if (weight
.second
> max
)
1282 max
= weight
.second
;
1285 for (const auto &weight
: weights
) {
1286 inc
.new_weight
[weight
.first
] = (unsigned)((weight
.second
/ max
) * CEPH_OSD_IN
);
1290 int OSDMap::identify_osd(const entity_addr_t
& addr
) const
1292 for (int i
=0; i
<max_osd
; i
++)
1293 if (exists(i
) && (get_addr(i
) == addr
|| get_cluster_addr(i
) == addr
))
1298 int OSDMap::identify_osd(const uuid_d
& u
) const
1300 for (int i
=0; i
<max_osd
; i
++)
1301 if (exists(i
) && get_uuid(i
) == u
)
1306 int OSDMap::identify_osd_on_all_channels(const entity_addr_t
& addr
) const
1308 for (int i
=0; i
<max_osd
; i
++)
1309 if (exists(i
) && (get_addr(i
) == addr
|| get_cluster_addr(i
) == addr
||
1310 get_hb_back_addr(i
) == addr
|| get_hb_front_addr(i
) == addr
))
1315 int OSDMap::find_osd_on_ip(const entity_addr_t
& ip
) const
1317 for (int i
=0; i
<max_osd
; i
++)
1318 if (exists(i
) && (get_addr(i
).is_same_host(ip
) || get_cluster_addr(i
).is_same_host(ip
)))
1324 uint64_t OSDMap::get_features(int entity_type
, uint64_t *pmask
) const
1326 uint64_t features
= 0; // things we actually have
1327 uint64_t mask
= 0; // things we could have
1329 if (crush
->has_nondefault_tunables())
1330 features
|= CEPH_FEATURE_CRUSH_TUNABLES
;
1331 if (crush
->has_nondefault_tunables2())
1332 features
|= CEPH_FEATURE_CRUSH_TUNABLES2
;
1333 if (crush
->has_nondefault_tunables3())
1334 features
|= CEPH_FEATURE_CRUSH_TUNABLES3
;
1335 if (crush
->has_v4_buckets())
1336 features
|= CEPH_FEATURE_CRUSH_V4
;
1337 if (crush
->has_nondefault_tunables5())
1338 features
|= CEPH_FEATURE_CRUSH_TUNABLES5
;
1339 if (crush
->has_incompat_choose_args()) {
1340 features
|= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS
;
1342 mask
|= CEPH_FEATURES_CRUSH
;
1344 if (!pg_upmap
.empty() || !pg_upmap_items
.empty())
1345 features
|= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
;
1346 mask
|= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
;
1348 for (auto &pool
: pools
) {
1349 if (pool
.second
.has_flag(pg_pool_t::FLAG_HASHPSPOOL
)) {
1350 features
|= CEPH_FEATURE_OSDHASHPSPOOL
;
1352 if (pool
.second
.is_erasure() &&
1353 entity_type
!= CEPH_ENTITY_TYPE_CLIENT
) { // not for clients
1354 features
|= CEPH_FEATURE_OSD_ERASURE_CODES
;
1356 if (!pool
.second
.tiers
.empty() ||
1357 pool
.second
.is_tier()) {
1358 features
|= CEPH_FEATURE_OSD_CACHEPOOL
;
1360 int ruleid
= crush
->find_rule(pool
.second
.get_crush_rule(),
1361 pool
.second
.get_type(),
1362 pool
.second
.get_size());
1364 if (crush
->is_v2_rule(ruleid
))
1365 features
|= CEPH_FEATURE_CRUSH_V2
;
1366 if (crush
->is_v3_rule(ruleid
))
1367 features
|= CEPH_FEATURE_CRUSH_TUNABLES3
;
1368 if (crush
->is_v5_rule(ruleid
))
1369 features
|= CEPH_FEATURE_CRUSH_TUNABLES5
;
1372 if (entity_type
== CEPH_ENTITY_TYPE_OSD
) {
1373 for (auto &erasure_code_profile
: erasure_code_profiles
) {
1374 auto& profile
= erasure_code_profile
.second
;
1375 const auto& plugin
= profile
.find("plugin");
1376 if (plugin
!= profile
.end()) {
1377 if (plugin
->second
== "isa" || plugin
->second
== "lrc")
1378 features
|= CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2
;
1379 if (plugin
->second
== "shec")
1380 features
|= CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3
;
1384 mask
|= CEPH_FEATURE_OSDHASHPSPOOL
| CEPH_FEATURE_OSD_CACHEPOOL
;
1385 if (entity_type
!= CEPH_ENTITY_TYPE_CLIENT
)
1386 mask
|= CEPH_FEATURE_OSD_ERASURE_CODES
;
1388 if (osd_primary_affinity
) {
1389 for (int i
= 0; i
< max_osd
; ++i
) {
1390 if ((*osd_primary_affinity
)[i
] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
) {
1391 features
|= CEPH_FEATURE_OSD_PRIMARY_AFFINITY
;
1396 mask
|= CEPH_FEATURE_OSD_PRIMARY_AFFINITY
;
1398 if (entity_type
== CEPH_ENTITY_TYPE_OSD
) {
1399 const uint64_t jewel_features
= CEPH_FEATURE_SERVER_JEWEL
;
1400 if (require_osd_release
>= CEPH_RELEASE_JEWEL
) {
1401 features
|= jewel_features
;
1403 mask
|= jewel_features
;
1405 const uint64_t kraken_features
= CEPH_FEATUREMASK_SERVER_KRAKEN
1406 | CEPH_FEATURE_MSG_ADDR2
;
1407 if (require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
1408 features
|= kraken_features
;
1410 mask
|= kraken_features
;
1418 uint8_t OSDMap::get_min_compat_client() const
1420 uint64_t f
= get_features(CEPH_ENTITY_TYPE_CLIENT
, nullptr);
1422 if (HAVE_FEATURE(f
, OSDMAP_PG_UPMAP
) || // v12.0.0-1733-g27d6f43
1423 HAVE_FEATURE(f
, CRUSH_CHOOSE_ARGS
)) { // v12.0.1-2172-gef1ef28
1424 return CEPH_RELEASE_LUMINOUS
; // v12.2.0
1426 if (HAVE_FEATURE(f
, CRUSH_TUNABLES5
)) { // v10.0.0-612-g043a737
1427 return CEPH_RELEASE_JEWEL
; // v10.2.0
1429 if (HAVE_FEATURE(f
, CRUSH_V4
)) { // v0.91-678-g325fc56
1430 return CEPH_RELEASE_HAMMER
; // v0.94.0
1432 if (HAVE_FEATURE(f
, OSD_PRIMARY_AFFINITY
) || // v0.76-553-gf825624
1433 HAVE_FEATURE(f
, CRUSH_TUNABLES3
) || // v0.76-395-ge20a55d
1434 HAVE_FEATURE(f
, OSD_ERASURE_CODES
) || // v0.73-498-gbfc86a8
1435 HAVE_FEATURE(f
, OSD_CACHEPOOL
)) { // v0.67-401-gb91c1c5
1436 return CEPH_RELEASE_FIREFLY
; // v0.80.0
1438 if (HAVE_FEATURE(f
, CRUSH_TUNABLES2
) || // v0.54-684-g0cc47ff
1439 HAVE_FEATURE(f
, OSDHASHPSPOOL
)) { // v0.57-398-g8cc2b0f
1440 return CEPH_RELEASE_DUMPLING
; // v0.67.0
1442 if (HAVE_FEATURE(f
, CRUSH_TUNABLES
)) { // v0.48argonaut-206-g6f381af
1443 return CEPH_RELEASE_ARGONAUT
; // v0.48argonaut-206-g6f381af
1445 return CEPH_RELEASE_ARGONAUT
; // v0.48argonaut-206-g6f381af
// Recompute cached_up_osd_features from the per-osd xinfo feature words.
// The cache is reset to 0 and osds 0..max_osd-1 are visited; an xinfo whose
// features field is 0 is skipped, and the remaining feature words are folded
// into the cache by plain assignment or by bitwise AND.
// NOTE(review): the embedded line numbers skip 1449-1450, 1453-1454, 1458
// and 1460-1461, so the conditions that select which osds are visited and
// whether "=" or "&=" is used are not present in this listing.
1448 void OSDMap::_calc_up_osd_features()
1451 cached_up_osd_features
= 0;
1452 for (int osd
= 0; osd
< max_osd
; ++osd
) {
1455 const osd_xinfo_t
&xi
= get_xinfo(osd
);
1456 if (xi
.features
== 0)
1457 continue; // bogus xinfo, maybe #20751 or similar, skipping
1459 cached_up_osd_features
= xi
.features
;
1462 cached_up_osd_features
&= xi
.features
;
1467 uint64_t OSDMap::get_up_osd_features() const
1469 return cached_up_osd_features
;
// Make map *n share storage with map *o wherever both hold equal data.
// Visible steps: compare the epochs and max_osd; for each osd index common
// to both maps, point n's client/cluster/hb_back/hb_front address entry at
// o's entry when both are set and compare equal; share the whole osd_addrs
// table in the "no differences at all" case; share crush when the two maps'
// encoded crush buffers have equal contents; share pg_temp, primary_temp
// and osd_uuid when they compare equal.
// NOTE(review): the embedded line numbers are not contiguous (e.g.
// 1475-1479, 1481, 1486-1487, 1501-1504, 1510). The statements on those
// lines -- what happens on equal epochs, how differences are counted, and
// the declaration of oc/nc -- are missing from this listing.
1472 void OSDMap::dedup(const OSDMap
*o
, OSDMap
*n
)
1474 if (o
->epoch
== n
->epoch
)
1480 if (o
->max_osd
!= n
->max_osd
)
1482 for (int i
= 0; i
< o
->max_osd
&& i
< n
->max_osd
; i
++) {
1483 if ( n
->osd_addrs
->client_addr
[i
] && o
->osd_addrs
->client_addr
[i
] &&
1484 *n
->osd_addrs
->client_addr
[i
] == *o
->osd_addrs
->client_addr
[i
])
1485 n
->osd_addrs
->client_addr
[i
] = o
->osd_addrs
->client_addr
[i
];
1488 if ( n
->osd_addrs
->cluster_addr
[i
] && o
->osd_addrs
->cluster_addr
[i
] &&
1489 *n
->osd_addrs
->cluster_addr
[i
] == *o
->osd_addrs
->cluster_addr
[i
])
1490 n
->osd_addrs
->cluster_addr
[i
] = o
->osd_addrs
->cluster_addr
[i
];
1493 if ( n
->osd_addrs
->hb_back_addr
[i
] && o
->osd_addrs
->hb_back_addr
[i
] &&
1494 *n
->osd_addrs
->hb_back_addr
[i
] == *o
->osd_addrs
->hb_back_addr
[i
])
1495 n
->osd_addrs
->hb_back_addr
[i
] = o
->osd_addrs
->hb_back_addr
[i
];
1498 if ( n
->osd_addrs
->hb_front_addr
[i
] && o
->osd_addrs
->hb_front_addr
[i
] &&
1499 *n
->osd_addrs
->hb_front_addr
[i
] == *o
->osd_addrs
->hb_front_addr
[i
])
1500 n
->osd_addrs
->hb_front_addr
[i
] = o
->osd_addrs
->hb_front_addr
[i
];
1505 // zoinks, no differences at all!
1506 n
->osd_addrs
= o
->osd_addrs
;
1509 // does crush match?
1511 ::encode(*o
->crush
, oc
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1512 ::encode(*n
->crush
, nc
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1513 if (oc
.contents_equal(nc
)) {
1514 n
->crush
= o
->crush
;
1517 // does pg_temp match?
1518 if (*o
->pg_temp
== *n
->pg_temp
)
1519 n
->pg_temp
= o
->pg_temp
;
1521 // does primary_temp match?
1522 if (o
->primary_temp
->size() == n
->primary_temp
->size()) {
1523 if (*o
->primary_temp
== *n
->primary_temp
)
1524 n
->primary_temp
= o
->primary_temp
;
1528 if (o
->osd_uuid
->size() == n
->osd_uuid
->size() &&
1529 *o
->osd_uuid
== *n
->osd_uuid
)
1530 n
->osd_uuid
= o
->osd_uuid
;
// Queue removals in *pending_inc for pg_temp and primary_temp entries that
// are no longer useful. The checks run against tmpmap, a copy of osdmap
// with *pending_inc applied.
//  - pg_temp: an entry is cleared (empty vector in new_pg_temp) when its
//    pool is absent from osdmap, in the "all down osds" case, or when it
//    equals the raw up mapping; in the last case the pending entry is
//    cleared if osdmap holds the pg_temp, and erased from new_pg_temp by
//    the other visible statement.
//  - primary_temp: an entry is set to -1 in new_primary_temp when it points
//    at a down osd, or when the acting primary equals the primary computed
//    by pg_to_raw_up(); again the pending entry is either set to -1 or
//    erased.
// NOTE(review): the embedded line numbers are not contiguous (e.g. 1537,
// 1549-1551, 1555-1559, 1563-1567, 1574, 1585, 1599). The declaration of
// tmpmap / raw_up / primary, the up-osd counting and the else/continue
// branches on those lines are missing from this listing.
1533 void OSDMap::clean_temps(CephContext
*cct
,
1534 const OSDMap
& osdmap
, Incremental
*pending_inc
)
1536 ldout(cct
, 10) << __func__
<< dendl
;
1538 tmpmap
.deepish_copy_from(osdmap
);
1539 tmpmap
.apply_incremental(*pending_inc
);
1541 for (auto pg
: *tmpmap
.pg_temp
) {
1542 // if pool does not exist, remove any existing pg_temps associated with
1543 // it. we don't care about pg_temps on the pending_inc either; if there
1544 // are new_pg_temp entries on the pending, clear them out just as well.
1545 if (!osdmap
.have_pg_pool(pg
.first
.pool())) {
1546 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
1547 << " for nonexistent pool " << pg
.first
.pool() << dendl
;
1548 pending_inc
->new_pg_temp
[pg
.first
].clear();
1552 unsigned num_up
= 0;
1553 for (auto o
: pg
.second
) {
1554 if (!tmpmap
.is_down(o
)) {
1560 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
1561 << " with all down osds" << pg
.second
<< dendl
;
1562 pending_inc
->new_pg_temp
[pg
.first
].clear();
1565 // redundant pg_temp?
1568 tmpmap
.pg_to_raw_up(pg
.first
, &raw_up
, &primary
);
1569 if (vectors_equal(raw_up
, pg
.second
)) {
1570 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
<< " "
1571 << pg
.second
<< " that matches raw_up mapping" << dendl
;
1572 if (osdmap
.pg_temp
->count(pg
.first
))
1573 pending_inc
->new_pg_temp
[pg
.first
].clear();
1575 pending_inc
->new_pg_temp
.erase(pg
.first
);
1579 for (auto &pg
: *tmpmap
.primary_temp
) {
1581 if (tmpmap
.is_down(pg
.second
)) {
1582 ldout(cct
, 10) << __func__
<< " removing primary_temp " << pg
.first
1583 << " to down " << pg
.second
<< dendl
;
1584 pending_inc
->new_primary_temp
[pg
.first
] = -1;
1587 // redundant primary_temp?
1588 vector
<int> real_up
, templess_up
;
1589 int real_primary
, templess_primary
;
1590 pg_t pgid
= pg
.first
;
1591 tmpmap
.pg_to_acting_osds(pgid
, &real_up
, &real_primary
);
1592 tmpmap
.pg_to_raw_up(pgid
, &templess_up
, &templess_primary
);
1593 if (real_primary
== templess_primary
){
1594 ldout(cct
, 10) << __func__
<< " removing primary_temp "
1595 << pgid
<< " -> " << real_primary
1596 << " (unnecessary/redundant)" << dendl
;
1597 if (osdmap
.primary_temp
->count(pgid
))
1598 pending_inc
->new_primary_temp
[pgid
] = -1;
1600 pending_inc
->new_primary_temp
.erase(pgid
);
// Cancel pg_upmap / pg_upmap_items entries that are no longer valid.
// The checks run against tmpmap, a copy of osdmap with *pending_inc
// applied. Every pg named in tmpmap's pg_upmap / pg_upmap_items or in the
// pending new_pg_upmap / new_pg_upmap_items is collected in to_check. For
// each one the crush rule, that rule's osd weight map (cached per rule in
// rule_weight_map) and the rule's failure-domain type are looked up, and
// the raw up set is walked; the pg goes into to_cancel when
//  - two up-set osds come from the same parent of the failure-domain type,
//  - an osd is missing from the rule's weight map, or
//  - get_weightf(osd) * crush weight is 0.
// For each cancelled pg the pending new_* entry (if any) is erased, and if
// osdmap itself holds an entry the pg is inserted into old_pg_upmap /
// old_pg_upmap_items.
// NOTE(review): the embedded line numbers are not contiguous (e.g. 1610,
// 1616, 1633-1635, 1640, 1643-1644, 1650, 1652-1654, 1659-1661, 1663,
// 1665, 1667, 1670-1672, 1674). Declarations of tmpmap / to_check / raw /
// primary / parents, the error-path tests and the continue/break
// statements on those lines are missing from this listing.
1605 void OSDMap::maybe_remove_pg_upmaps(CephContext
*cct
,
1606 const OSDMap
& osdmap
,
1607 Incremental
*pending_inc
)
1609 ldout(cct
, 10) << __func__
<< dendl
;
1611 tmpmap
.deepish_copy_from(osdmap
);
1612 tmpmap
.apply_incremental(*pending_inc
);
1614 set
<pg_t
> to_cancel
;
1615 map
<int, map
<int, float>> rule_weight_map
;
1617 for (auto& p
: tmpmap
.pg_upmap
) {
1618 to_check
.insert(p
.first
);
1620 for (auto& p
: tmpmap
.pg_upmap_items
) {
1621 to_check
.insert(p
.first
);
1623 for (auto& p
: pending_inc
->new_pg_upmap
) {
1624 to_check
.insert(p
.first
);
1626 for (auto& p
: pending_inc
->new_pg_upmap_items
) {
1627 to_check
.insert(p
.first
);
1629 for (auto& pg
: to_check
) {
1630 auto crush_rule
= tmpmap
.get_pg_pool_crush_rule(pg
);
1631 if (crush_rule
< 0) {
1632 lderr(cct
) << __func__
<< " unable to load crush-rule of pg "
1636 map
<int, float> weight_map
;
1637 auto it
= rule_weight_map
.find(crush_rule
);
1638 if (it
== rule_weight_map
.end()) {
1639 auto r
= tmpmap
.crush
->get_rule_weight_osd_map(crush_rule
, &weight_map
);
1641 lderr(cct
) << __func__
<< " unable to get crush weight_map for "
1642 << "crush_rule " << crush_rule
<< dendl
;
1645 rule_weight_map
[crush_rule
] = weight_map
;
1647 weight_map
= it
->second
;
1649 auto type
= tmpmap
.crush
->get_rule_failure_domain(crush_rule
);
1651 lderr(cct
) << __func__
<< " unable to load failure-domain-type of pg "
1655 ldout(cct
, 10) << __func__
<< " pg " << pg
1656 << " crush-rule-id " << crush_rule
1657 << " weight_map " << weight_map
1658 << " failure-domain-type " << type
1662 tmpmap
.pg_to_raw_up(pg
, &raw
, &primary
);
1664 for (auto osd
: raw
) {
1666 auto parent
= tmpmap
.crush
->get_parent_of_type(osd
, type
, crush_rule
);
1668 lderr(cct
) << __func__
<< " unable to get parent of raw osd."
1669 << osd
<< " of pg " << pg
1673 auto r
= parents
.insert(parent
);
1675 // two up-set osds come from same parent
1676 to_cancel
.insert(pg
);
1680 // the above check validates collision only
1681 // below we continue to check against crush-topology changing..
1682 auto it
= weight_map
.find(osd
);
1683 if (it
== weight_map
.end()) {
1684 // osd is gone or has been moved out of the specific crush-tree
1685 to_cancel
.insert(pg
);
1688 auto adjusted_weight
= tmpmap
.get_weightf(it
->first
) * it
->second
;
1689 if (adjusted_weight
== 0) {
1690 // osd is out/crush-out
1691 to_cancel
.insert(pg
);
1696 for (auto &pg
: to_cancel
) {
1698 auto it
= pending_inc
->new_pg_upmap
.find(pg
);
1699 if (it
!= pending_inc
->new_pg_upmap
.end()) {
1700 ldout(cct
, 10) << __func__
<< " cancel invalid pending "
1701 << "pg_upmap entry "
1702 << it
->first
<< "->" << it
->second
1704 pending_inc
->new_pg_upmap
.erase(it
);
1706 if (osdmap
.pg_upmap
.count(pg
)) {
1707 ldout(cct
, 10) << __func__
<< " cancel invalid pg_upmap entry "
1708 << osdmap
.pg_upmap
.find(pg
)->first
<< "->"
1709 << osdmap
.pg_upmap
.find(pg
)->second
1711 pending_inc
->old_pg_upmap
.insert(pg
);
1715 auto it
= pending_inc
->new_pg_upmap_items
.find(pg
);
1716 if (it
!= pending_inc
->new_pg_upmap_items
.end()) {
1717 ldout(cct
, 10) << __func__
<< " cancel invalid pending "
1718 << "pg_upmap_items entry "
1719 << it
->first
<< "->" << it
->second
1721 pending_inc
->new_pg_upmap_items
.erase(it
);
1723 if (osdmap
.pg_upmap_items
.count(pg
)) {
1724 ldout(cct
, 10) << __func__
<< " cancel invalid "
1725 << "pg_upmap_items entry "
1726 << osdmap
.pg_upmap_items
.find(pg
)->first
<< "->"
1727 << osdmap
.pg_upmap_items
.find(pg
)->second
1729 pending_inc
->old_pg_upmap_items
.insert(pg
);
// Apply one Incremental to this map, in place.
// Visible steps, in order: reset new_blacklist_entries; compare inc.fsid
// with fsid; assert inc.epoch == epoch+1; take inc.modified; handle an
// embedded full map (inc.fullmap); otherwise apply new flags (raising
// require_osd_release for the legacy REQUIRE_KRAKEN / REQUIRE_JEWEL flags),
// max_osd, pool_max, new/renamed/removed pools, weights (marking an osd in
// clears AUTOOUT|NEW and xinfo old_weight), primary affinity, erasure code
// profiles, osd state bits (state is XORed in; a 0 state means
// CEPH_OSD_UP), new up client/heartbeat/cluster addresses, up_thru, last
// clean intervals, lost epochs, xinfo, uuids, pg_temp, primary_temp,
// pg_upmap, pg_upmap_items, blacklist, cluster snapshot, the three full
// ratios, require_min_compat_client and require_osd_release. A new crush
// map is handled last, and _calc_up_osd_features() is called at the end.
// NOTE(review): the embedded line numbers are not contiguous (e.g.
// 1738-1739, 1741-1742, 1744-1745, 1751-1754, 1796, 1894, 1899-1900,
// 1913, 1964, 1970-1974, 1976-1977). The return statements, several
// else branches and the bodies on those lines are missing from this
// listing, so the return value contract cannot be stated from here.
1735 int OSDMap::apply_incremental(const Incremental
&inc
)
1737 new_blacklist_entries
= false;
1740 else if (inc
.fsid
!= fsid
)
1743 assert(inc
.epoch
== epoch
+1);
1746 modified
= inc
.modified
;
1749 if (inc
.fullmap
.length()) {
1750 bufferlist
bl(inc
.fullmap
);
1755 // nope, incremental.
1756 if (inc
.new_flags
>= 0) {
1757 flags
= inc
.new_flags
;
1758 // the below is just to cover a newly-upgraded luminous mon
1759 // cluster that has to set require_jewel_osds or
1760 // require_kraken_osds before the osds can be upgraded to
1762 if (flags
& CEPH_OSDMAP_REQUIRE_KRAKEN
) {
1763 if (require_osd_release
< CEPH_RELEASE_KRAKEN
) {
1764 require_osd_release
= CEPH_RELEASE_KRAKEN
;
1766 } else if (flags
& CEPH_OSDMAP_REQUIRE_JEWEL
) {
1767 if (require_osd_release
< CEPH_RELEASE_JEWEL
) {
1768 require_osd_release
= CEPH_RELEASE_JEWEL
;
1773 if (inc
.new_max_osd
>= 0)
1774 set_max_osd(inc
.new_max_osd
);
1776 if (inc
.new_pool_max
!= -1)
1777 pool_max
= inc
.new_pool_max
;
1779 for (const auto &pool
: inc
.new_pools
) {
1780 pools
[pool
.first
] = pool
.second
;
1781 pools
[pool
.first
].last_change
= epoch
;
1784 for (const auto &pname
: inc
.new_pool_names
) {
1785 auto pool_name_entry
= pool_name
.find(pname
.first
);
1786 if (pool_name_entry
!= pool_name
.end()) {
1787 name_pool
.erase(pool_name_entry
->second
);
1788 pool_name_entry
->second
= pname
.second
;
1790 pool_name
[pname
.first
] = pname
.second
;
1792 name_pool
[pname
.second
] = pname
.first
;
1795 for (const auto &pool
: inc
.old_pools
) {
1797 name_pool
.erase(pool_name
[pool
]);
1798 pool_name
.erase(pool
);
1801 for (const auto &weight
: inc
.new_weight
) {
1802 set_weight(weight
.first
, weight
.second
);
1804 // if we are marking in, clear the AUTOOUT and NEW bits, and clear
1805 // xinfo old_weight.
1806 if (weight
.second
) {
1807 osd_state
[weight
.first
] &= ~(CEPH_OSD_AUTOOUT
| CEPH_OSD_NEW
);
1808 osd_xinfo
[weight
.first
].old_weight
= 0;
1812 for (const auto &primary_affinity
: inc
.new_primary_affinity
) {
1813 set_primary_affinity(primary_affinity
.first
, primary_affinity
.second
);
1816 // erasure_code_profiles
1817 for (const auto &profile
: inc
.old_erasure_code_profiles
)
1818 erasure_code_profiles
.erase(profile
);
1820 for (const auto &profile
: inc
.new_erasure_code_profiles
) {
1821 set_erasure_code_profile(profile
.first
, profile
.second
);
1825 for (const auto &state
: inc
.new_state
) {
1826 const auto osd
= state
.first
;
1827 int s
= state
.second
? state
.second
: CEPH_OSD_UP
;
1828 if ((osd_state
[osd
] & CEPH_OSD_UP
) &&
1829 (s
& CEPH_OSD_UP
)) {
1830 osd_info
[osd
].down_at
= epoch
;
1831 osd_xinfo
[osd
].down_stamp
= modified
;
1833 if ((osd_state
[osd
] & CEPH_OSD_EXISTS
) &&
1834 (s
& CEPH_OSD_EXISTS
)) {
1835 // osd is destroyed; clear out anything interesting.
1836 (*osd_uuid
)[osd
] = uuid_d();
1837 osd_info
[osd
] = osd_info_t();
1838 osd_xinfo
[osd
] = osd_xinfo_t();
1839 set_primary_affinity(osd
, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
);
1840 osd_addrs
->client_addr
[osd
].reset(new entity_addr_t());
1841 osd_addrs
->cluster_addr
[osd
].reset(new entity_addr_t());
1842 osd_addrs
->hb_front_addr
[osd
].reset(new entity_addr_t());
1843 osd_addrs
->hb_back_addr
[osd
].reset(new entity_addr_t());
1846 osd_state
[osd
] ^= s
;
1850 for (const auto &client
: inc
.new_up_client
) {
1851 osd_state
[client
.first
] |= CEPH_OSD_EXISTS
| CEPH_OSD_UP
;
1852 osd_addrs
->client_addr
[client
.first
].reset(new entity_addr_t(client
.second
));
1853 if (inc
.new_hb_back_up
.empty())
1854 osd_addrs
->hb_back_addr
[client
.first
].reset(new entity_addr_t(client
.second
)); //this is a backward-compatibility hack
1856 osd_addrs
->hb_back_addr
[client
.first
].reset(
1857 new entity_addr_t(inc
.new_hb_back_up
.find(client
.first
)->second
));
1858 const auto j
= inc
.new_hb_front_up
.find(client
.first
);
1859 if (j
!= inc
.new_hb_front_up
.end())
1860 osd_addrs
->hb_front_addr
[client
.first
].reset(new entity_addr_t(j
->second
));
1862 osd_addrs
->hb_front_addr
[client
.first
].reset();
1864 osd_info
[client
.first
].up_from
= epoch
;
1867 for (const auto &cluster
: inc
.new_up_cluster
)
1868 osd_addrs
->cluster_addr
[cluster
.first
].reset(new entity_addr_t(cluster
.second
));
1871 for (const auto &thru
: inc
.new_up_thru
)
1872 osd_info
[thru
.first
].up_thru
= thru
.second
;
1874 for (const auto &interval
: inc
.new_last_clean_interval
) {
1875 osd_info
[interval
.first
].last_clean_begin
= interval
.second
.first
;
1876 osd_info
[interval
.first
].last_clean_end
= interval
.second
.second
;
1879 for (const auto &lost
: inc
.new_lost
)
1880 osd_info
[lost
.first
].lost_at
= lost
.second
;
1883 for (const auto &xinfo
: inc
.new_xinfo
)
1884 osd_xinfo
[xinfo
.first
] = xinfo
.second
;
1887 for (const auto &uuid
: inc
.new_uuid
)
1888 (*osd_uuid
)[uuid
.first
] = uuid
.second
;
1891 for (const auto &pg
: inc
.new_pg_temp
) {
1892 if (pg
.second
.empty())
1893 pg_temp
->erase(pg
.first
);
1895 pg_temp
->set(pg
.first
, pg
.second
);
1897 if (!inc
.new_pg_temp
.empty()) {
1898 // make sure pg_temp is efficiently stored
1902 for (const auto &pg
: inc
.new_primary_temp
) {
1903 if (pg
.second
== -1)
1904 primary_temp
->erase(pg
.first
);
1906 (*primary_temp
)[pg
.first
] = pg
.second
;
1909 for (auto& p
: inc
.new_pg_upmap
) {
1910 pg_upmap
[p
.first
] = p
.second
;
1912 for (auto& pg
: inc
.old_pg_upmap
) {
1915 for (auto& p
: inc
.new_pg_upmap_items
) {
1916 pg_upmap_items
[p
.first
] = p
.second
;
1918 for (auto& pg
: inc
.old_pg_upmap_items
) {
1919 pg_upmap_items
.erase(pg
);
1923 if (!inc
.new_blacklist
.empty()) {
1924 blacklist
.insert(inc
.new_blacklist
.begin(),inc
.new_blacklist
.end());
1925 new_blacklist_entries
= true;
1927 for (const auto &addr
: inc
.old_blacklist
)
1928 blacklist
.erase(addr
);
1930 // cluster snapshot?
1931 if (inc
.cluster_snapshot
.length()) {
1932 cluster_snapshot
= inc
.cluster_snapshot
;
1933 cluster_snapshot_epoch
= inc
.epoch
;
1935 cluster_snapshot
.clear();
1936 cluster_snapshot_epoch
= 0;
1939 if (inc
.new_nearfull_ratio
>= 0) {
1940 nearfull_ratio
= inc
.new_nearfull_ratio
;
1942 if (inc
.new_backfillfull_ratio
>= 0) {
1943 backfillfull_ratio
= inc
.new_backfillfull_ratio
;
1945 if (inc
.new_full_ratio
>= 0) {
1946 full_ratio
= inc
.new_full_ratio
;
1948 if (inc
.new_require_min_compat_client
> 0) {
1949 require_min_compat_client
= inc
.new_require_min_compat_client
;
1951 if (inc
.new_require_osd_release
>= 0) {
1952 require_osd_release
= inc
.new_require_osd_release
;
1953 if (require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
1954 flags
&= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS
);
1955 flags
|= CEPH_OSDMAP_RECOVERY_DELETES
;
1959 // do new crush map last (after up/down stuff)
1960 if (inc
.crush
.length()) {
1961 bufferlist
bl(inc
.crush
);
1962 auto blp
= bl
.begin();
1963 crush
.reset(new CrushWrapper
);
1965 if (require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
1966 // only increment if this is a luminous-encoded osdmap, lest
1967 // the mon's crush_version diverge from what the osds or others
1968 // are decoding and applying on their end. if we won't encode
1969 // it in the canonical version, don't change it.
1975 _calc_up_osd_features();
// Map an object to a pg in pool poolid.
// The pool is looked up with get_pg_pool(); the placement seed ps comes
// from pool->hash_key(), fed either key or name together with nspace; the
// result is stored through pg as pg_t(ps, poolid).
// NOTE(review): the embedded line numbers skip 1981-1983, 1985-1986,
// 1989-1992, 1994 and 1997-1998, so most of the parameter list, the test
// that picks key over name, and the return statements are missing from
// this listing.
1980 int OSDMap::map_to_pg(
1984 const string
& nspace
,
1987 // calculate ps (placement seed)
1988 const pg_pool_t
*pool
= get_pg_pool(poolid
);
1993 ps
= pool
->hash_key(key
, nspace
);
1995 ps
= pool
->hash_key(name
, nspace
);
1996 *pg
= pg_t(ps
, poolid
);
// Resolve an object id plus locator to a pg.
// When loc.hash >= 0 the pool is checked with get_pg_pool() and pg is
// built directly as pg_t(loc.hash, loc.get_pool()); the last statement
// delegates to map_to_pg() with the object name, locator key and namespace.
// NOTE(review): the embedded line numbers skip 2002, 2005-2006 and
// 2008-2009, so the return statements inside the loc.hash branch are
// missing from this listing.
2000 int OSDMap::object_locator_to_pg(
2001 const object_t
& oid
, const object_locator_t
& loc
, pg_t
&pg
) const
2003 if (loc
.hash
>= 0) {
2004 if (!get_pg_pool(loc
.get_pool())) {
2007 pg
= pg_t(loc
.hash
, loc
.get_pool());
2010 return map_to_pg(loc
.get_pool(), oid
.name
, loc
.key
, loc
.nspace
, &pg
);
2013 ceph_object_layout
OSDMap::make_object_layout(
2014 object_t oid
, int pg_pool
, string nspace
) const
2016 object_locator_t
loc(pg_pool
, nspace
);
2018 ceph_object_layout ol
;
2019 pg_t pgid
= object_locator_to_pg(oid
, loc
);
2020 ol
.ol_pgid
= pgid
.get_old_pg().v
;
2021 ol
.ol_stripe_unit
= 0;
// Remove osds that fail exists() from a mapping.
// For pools where can_shift_osds() holds, entries are copied toward the
// front (osds[i - removed] = osds[i]) and the vector is shrunk by the
// number removed; the second loop overwrites entries with CRUSH_ITEM_NONE
// in place.
// NOTE(review): the embedded line numbers skip 2027, 2032-2035, 2037-2039,
// 2041, 2043 and 2045-2047, so the increment of removed, the else branch
// and the test guarding the CRUSH_ITEM_NONE assignment are missing from
// this listing.
2025 void OSDMap::_remove_nonexistent_osds(const pg_pool_t
& pool
,
2026 vector
<int>& osds
) const
2028 if (pool
.can_shift_osds()) {
2029 unsigned removed
= 0;
2030 for (unsigned i
= 0; i
< osds
.size(); i
++) {
2031 if (!exists(osds
[i
])) {
2036 osds
[i
- removed
] = osds
[i
];
2040 osds
.resize(osds
.size() - removed
);
2042 for (auto& osd
: osds
) {
2044 osd
= CRUSH_ITEM_NONE
;
// Compute the raw crush mapping for a pg.
// The placement seed is pool.raw_pg_to_pps(pg); the rule number comes from
// crush->find_rule() using the pool's crush rule, type and size;
// crush->do_rule() fills *osds using osd_weight; finally nonexistent osds
// are removed with _remove_nonexistent_osds().
// NOTE(review): the embedded line numbers skip 2051-2054, 2057-2058, 2060,
// 2062 and 2064-2067, so the remaining parameters and any guards around
// do_rule() are missing from this listing.
2049 void OSDMap::_pg_to_raw_osds(
2050 const pg_pool_t
& pool
, pg_t pg
,
2055 ps_t pps
= pool
.raw_pg_to_pps(pg
); // placement ps
2056 unsigned size
= pool
.get_size();
2059 int ruleno
= crush
->find_rule(pool
.get_crush_rule(), pool
.get_type(), size
);
2061 crush
->do_rule(ruleno
, pps
, *osds
, size
, osd_weight
, pg
.pool());
2063 _remove_nonexistent_osds(pool
, *osds
);
// Scan osds in order, testing each entry against CRUSH_ITEM_NONE.
// NOTE(review): the embedded line numbers skip 2070 and 2073-2077, so the
// return statements (what is returned for a match and for no match) are
// missing from this listing.
2069 int OSDMap::_pick_primary(const vector
<int>& osds
) const
2071 for (auto osd
: osds
) {
2072 if (osd
!= CRUSH_ITEM_NONE
) {
// Apply pg_upmap and pg_upmap_items overrides to a raw mapping.
// raw_pg is first converted with pi.raw_pg_to_pg().
//  - pg_upmap: the explicit mapping replaces *raw, but is rejected when any
//    of its targets is a valid osd id (not CRUSH_ITEM_NONE, < max_osd)
//    whose osd_weight is 0.
//  - pg_upmap_items: for each pair r, *raw is scanned; the pair is dropped
//    when r.second already appears, and a position holding r.first is
//    remembered unless r.second is a valid osd id with weight 0; the
//    remembered position is then overwritten with r.second.
// NOTE(review): the embedded line numbers skip 2080, 2088-2090, 2093-2094,
// 2102, 2106-2108, 2111 and 2114-2116, so the return on rejection, the
// declaration of pos and the loop's break statements are missing from this
// listing.
2079 void OSDMap::_apply_upmap(const pg_pool_t
& pi
, pg_t raw_pg
, vector
<int> *raw
) const
2081 pg_t pg
= pi
.raw_pg_to_pg(raw_pg
);
2082 auto p
= pg_upmap
.find(pg
);
2083 if (p
!= pg_upmap
.end()) {
2084 // make sure targets aren't marked out
2085 for (auto osd
: p
->second
) {
2086 if (osd
!= CRUSH_ITEM_NONE
&& osd
< max_osd
&& osd_weight
[osd
] == 0) {
2087 // reject/ignore the explicit mapping
2091 *raw
= vector
<int>(p
->second
.begin(), p
->second
.end());
2092 // continue to check and apply pg_upmap_items if any
2095 auto q
= pg_upmap_items
.find(pg
);
2096 if (q
!= pg_upmap_items
.end()) {
2097 // NOTE: this approach does not allow a bidirectional swap,
2098 // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
2099 for (auto& r
: q
->second
) {
2100 // make sure the replacement value doesn't already appear
2101 bool exists
= false;
2103 for (unsigned i
= 0; i
< raw
->size(); ++i
) {
2104 int osd
= (*raw
)[i
];
2105 if (osd
== r
.second
) {
2109 // ignore mapping if target is marked out (or invalid osd id)
2110 if (osd
== r
.first
&&
2112 !(r
.second
!= CRUSH_ITEM_NONE
&& r
.second
< max_osd
&&
2113 osd_weight
[r
.second
] == 0)) {
2117 if (!exists
&& pos
>= 0) {
2118 (*raw
)[pos
] = r
.second
;
2124 // pg -> (up osd list)
// Derive the up set from a raw mapping.
// For pools where can_shift_osds() holds, *up is built by appending raw
// entries, with a !exists() || is_down() test applied to each; the other
// visible path sizes *up like raw and stores CRUSH_ITEM_NONE in slots whose
// osd does not exist or is down.
// NOTE(review): the embedded line numbers skip 2127, 2129-2130, 2134,
// 2136-2137 and 2143-2148, so the clearing of *up, the continue/else
// statements and the copy of surviving entries in the fixed-position path
// are missing from this listing.
2125 void OSDMap::_raw_to_up_osds(const pg_pool_t
& pool
, const vector
<int>& raw
,
2126 vector
<int> *up
) const
2128 if (pool
.can_shift_osds()) {
2131 up
->reserve(raw
.size());
2132 for (unsigned i
=0; i
<raw
.size(); i
++) {
2133 if (!exists(raw
[i
]) || is_down(raw
[i
]))
2135 up
->push_back(raw
[i
]);
2138 // set down/dne devices to NONE
2139 up
->resize(raw
.size());
2140 for (int i
= raw
.size() - 1; i
>= 0; --i
) {
2141 if (!exists(raw
[i
]) || is_down(raw
[i
])) {
2142 (*up
)[i
] = CRUSH_ITEM_NONE
;
// Re-pick the primary among *osds according to osd_primary_affinity.
// Nothing is checked when no affinity table exists. The osds are first
// scanned for any entry with a non-default affinity. Then each candidate o
// (skipping CRUSH_ITEM_NONE) with affinity a below
// CEPH_OSD_MAX_PRIMARY_AFFINITY is tested with
//   (crush_hash32_2(CRUSH_HASH_RJENKINS1, seed, o) >> 16) >= a
// which, per the original comments, marks it as not chosen but kept as a
// fallback. The selected entry is written to *primary and, for pools that
// can shift osds, moved to the front of *osds by shifting earlier entries
// back one slot.
// NOTE(review): the embedded line numbers skip 2152-2154, 2157-2159,
// 2163-2169, 2173, 2175, 2177, 2184-2193 and 2195, so the remaining
// parameters, the declaration of pos / o and the fallback bookkeeping are
// missing from this listing.
2150 void OSDMap::_apply_primary_affinity(ps_t seed
,
2151 const pg_pool_t
& pool
,
2155 // do we have any non-default primary_affinity values for these osds?
2156 if (!osd_primary_affinity
)
2160 for (const auto osd
: *osds
) {
2161 if (osd
!= CRUSH_ITEM_NONE
&&
2162 (*osd_primary_affinity
)[osd
] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
) {
2170 // pick the primary. feed both the seed (for the pg) and the osd
2171 // into the hash/rng so that a proportional fraction of an osd's pgs
2172 // get rejected as primary.
2174 for (unsigned i
= 0; i
< osds
->size(); ++i
) {
2176 if (o
== CRUSH_ITEM_NONE
)
2178 unsigned a
= (*osd_primary_affinity
)[o
];
2179 if (a
< CEPH_OSD_MAX_PRIMARY_AFFINITY
&&
2180 (crush_hash32_2(CRUSH_HASH_RJENKINS1
,
2181 seed
, o
) >> 16) >= a
) {
2182 // we chose not to use this primary. note it anyway as a
2183 // fallback in case we don't pick anyone else, but keep looking.
2194 *primary
= (*osds
)[pos
];
2196 if (pool
.can_shift_osds() && pos
> 0) {
2197 // move the new primary to the front.
2198 for (int i
= pos
; i
> 0; --i
) {
2199 (*osds
)[i
] = (*osds
)[i
-1];
2201 (*osds
)[0] = *primary
;
// Fetch the pg_temp and primary_temp overrides for a pg.
// pg is normalised with pool.raw_pg_to_pg() before both lookups. Members of
// a pg_temp entry are appended to *temp_pg; a member that does not exist or
// is down is handled according to pool.can_shift_osds(), with
// CRUSH_ITEM_NONE pushed on one of the two paths. *temp_primary is taken
// from primary_temp when an entry exists; otherwise, if *temp_pg is
// non-empty, it is taken from a member of *temp_pg that is not
// CRUSH_ITEM_NONE.
// NOTE(review): the embedded line numbers skip 2207, 2210, 2215-2216,
// 2218-2219, 2221-2223, 2225 and 2232-2236, so the initialisation of the
// outputs and the continue/else/break statements are missing from this
// listing.
2205 void OSDMap::_get_temp_osds(const pg_pool_t
& pool
, pg_t pg
,
2206 vector
<int> *temp_pg
, int *temp_primary
) const
2208 pg
= pool
.raw_pg_to_pg(pg
);
2209 const auto p
= pg_temp
->find(pg
);
2211 if (p
!= pg_temp
->end()) {
2212 for (unsigned i
=0; i
<p
->second
.size(); i
++) {
2213 if (!exists(p
->second
[i
]) || is_down(p
->second
[i
])) {
2214 if (pool
.can_shift_osds()) {
2217 temp_pg
->push_back(CRUSH_ITEM_NONE
);
2220 temp_pg
->push_back(p
->second
[i
]);
2224 const auto &pp
= primary_temp
->find(pg
);
2226 if (pp
!= primary_temp
->end()) {
2227 *temp_primary
= pp
->second
;
2228 } else if (!temp_pg
->empty()) { // apply pg_temp's primary
2229 for (unsigned i
= 0; i
< temp_pg
->size(); ++i
) {
2230 if ((*temp_pg
)[i
] != CRUSH_ITEM_NONE
) {
2231 *temp_primary
= (*temp_pg
)[i
];
// Compute the raw osd list for pg and pick a primary from it.
// The pool is looked up with get_pg_pool(pg.pool()); *raw is filled by
// _pg_to_raw_osds() (no placement seed requested) and *primary is set from
// _pick_primary(*raw).
// NOTE(review): the embedded line numbers skip 2239-2241, 2243-2244, 2246
// and 2248, so the initialisation of the outputs and the null checks are
// missing from this listing.
2238 void OSDMap::pg_to_raw_osds(pg_t pg
, vector
<int> *raw
, int *primary
) const
2242 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
2245 _pg_to_raw_osds(*pool
, pg
, raw
, NULL
);
2247 *primary
= _pick_primary(*raw
);
// Compute the up set and primary for pg.
// Pipeline: _pg_to_raw_osds() -> _apply_upmap() -> _raw_to_up_osds() ->
// _pick_primary() on the raw vector -> _apply_primary_affinity() on *up.
// NOTE(review): the embedded line numbers skip 2251 and 2253-2261, so the
// handling of a missing pool and the declarations of raw / pps are missing
// from this listing.
2250 void OSDMap::pg_to_raw_up(pg_t pg
, vector
<int> *up
, int *primary
) const
2252 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
2262 _pg_to_raw_osds(*pool
, pg
, &raw
, &pps
);
2263 _apply_upmap(*pool
, pg
, &raw
);
2264 _raw_to_up_osds(*pool
, raw
, up
);
2265 *primary
= _pick_primary(raw
);
2266 _apply_primary_affinity(pps
, *pool
, up
, primary
);
// Compute the up and acting sets (and their primaries) for pg.
// The temp overrides are fetched first with _get_temp_osds(). The up set is
// computed (raw -> upmap -> up -> primary -> primary affinity) when there
// is no acting override or when the caller asked for up / up_primary. With
// an empty _acting, and an unset (-1) _acting_primary, the up primary is
// used as the acting primary. Results are handed back through the output
// pointers; acting is filled by swapping with the local vector.
// NOTE(review): the embedded line numbers skip 2273, 2275, 2277-2283,
// 2285-2288, 2290, 2292, 2301, 2304-2309, 2311-2313 and 2315, so the
// early-out for an unknown pool or out-of-range ps, the declarations of
// raw / _up / _up_primary / pps and the null checks on the output
// pointers are missing from this listing.
2269 void OSDMap::_pg_to_up_acting_osds(
2270 const pg_t
& pg
, vector
<int> *up
, int *up_primary
,
2271 vector
<int> *acting
, int *acting_primary
,
2272 bool raw_pg_to_pg
) const
2274 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
2276 (!raw_pg_to_pg
&& pg
.ps() >= pool
->get_pg_num())) {
2284 *acting_primary
= -1;
2289 vector
<int> _acting
;
2291 int _acting_primary
;
2293 _get_temp_osds(*pool
, pg
, &_acting
, &_acting_primary
);
2294 if (_acting
.empty() || up
|| up_primary
) {
2295 _pg_to_raw_osds(*pool
, pg
, &raw
, &pps
);
2296 _apply_upmap(*pool
, pg
, &raw
);
2297 _raw_to_up_osds(*pool
, raw
, &_up
);
2298 _up_primary
= _pick_primary(_up
);
2299 _apply_primary_affinity(pps
, *pool
, &_up
, &_up_primary
);
2300 if (_acting
.empty()) {
2302 if (_acting_primary
== -1) {
2303 _acting_primary
= _up_primary
;
2310 *up_primary
= _up_primary
;
2314 acting
->swap(_acting
);
2316 *acting_primary
= _acting_primary
;
// Search the first nrep entries of acting for osd.
// nrep may be replaced by acting.size() before the search.
// NOTE(review): the embedded line numbers skip 2320-2321 and 2325-2327, so
// the condition under which nrep is replaced and the return statements are
// missing from this listing.
2319 int OSDMap::calc_pg_rank(int osd
, const vector
<int>& acting
, int nrep
)
2322 nrep
= acting
.size();
2323 for (int i
=0; i
<nrep
; i
++)
2324 if (acting
[i
] == osd
)
2329 int OSDMap::calc_pg_role(int osd
, const vector
<int>& acting
, int nrep
)
2331 return calc_pg_rank(osd
, acting
, nrep
);
// Decide whether the primary changed between an old and a new acting set.
// Visible outcomes: false when both acting sets are empty; true when
// exactly one is empty; true when oldprimary != newprimary; the ranks of
// the two primaries (calc_pg_rank) are then compared; false otherwise.
// NOTE(review): the embedded line numbers skip 2335, 2337, 2339 and 2348,
// so the declarations of oldprimary / newprimary and the statement taken
// when the ranks differ are missing from this listing.
2334 bool OSDMap::primary_changed(
2336 const vector
<int> &oldacting
,
2338 const vector
<int> &newacting
)
2340 if (oldacting
.empty() && newacting
.empty())
2341 return false; // both still empty
2342 if (oldacting
.empty() ^ newacting
.empty())
2343 return true; // was empty, now not, or vice versa
2344 if (oldprimary
!= newprimary
)
2345 return true; // primary changed
2346 if (calc_pg_rank(oldprimary
, oldacting
) !=
2347 calc_pg_rank(newprimary
, newacting
))
2349 return false; // same primary (tho replicas may have changed)
2352 uint64_t OSDMap::get_encoding_features() const
2354 uint64_t f
= SIGNIFICANT_FEATURES
;
2355 if (require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
2356 f
&= ~(CEPH_FEATURE_SERVER_LUMINOUS
|
2357 CEPH_FEATURE_CRUSH_CHOOSE_ARGS
);
2359 if (require_osd_release
< CEPH_RELEASE_KRAKEN
) {
2360 f
&= ~(CEPH_FEATURE_SERVER_KRAKEN
|
2361 CEPH_FEATURE_MSG_ADDR2
|
2362 CEPH_FEATURE_CRUSH_TUNABLES5
);
2364 if (require_osd_release
< CEPH_RELEASE_JEWEL
) {
2365 f
&= ~(CEPH_FEATURE_SERVER_JEWEL
|
2366 CEPH_FEATURE_NEW_OSDOP_ENCODING
);
2371 // serialize, unserialize
// Encode the map in the oldest client format.
// Visible field order: epoch, created, modified, pools (each pool encoded
// with feature mask 0), pool_name, flags, max_osd, osd_state (each state
// narrowed to uint8_t), osd_weight, client addresses (feature mask 0),
// pg_temp keyed by old_pg_t, then the crush map encoded with no features.
// NOTE(review): the embedded line numbers are not contiguous (e.g.
// 2373-2378, 2385-2386, 2388-2389, 2394, 2396-2397, 2401-2403, 2419,
// 2422, 2424-2427, 2429-2430). The struct version, the element counts and
// per-entry keys, the pool_max handling and the crush buffer declaration
// on those lines are missing from this listing.
2372 void OSDMap::encode_client_old(bufferlist
& bl
) const
2379 ::encode(epoch
, bl
);
2380 ::encode(created
, bl
);
2381 ::encode(modified
, bl
);
2383 // for ::encode(pools, bl);
2384 __u32 n
= pools
.size();
2387 for (const auto &pool
: pools
) {
2390 ::encode(pool
.second
, bl
, 0);
2392 // for ::encode(pool_name, bl);
2393 n
= pool_name
.size();
2395 for (const auto &pname
: pool_name
) {
2398 ::encode(pname
.second
, bl
);
2400 // for ::encode(pool_max, bl);
2404 ::encode(flags
, bl
);
2406 ::encode(max_osd
, bl
);
2408 uint32_t n
= osd_state
.size();
2410 for (auto s
: osd_state
) {
2411 ::encode((uint8_t)s
, bl
);
2414 ::encode(osd_weight
, bl
);
2415 ::encode(osd_addrs
->client_addr
, bl
, 0);
2417 // for ::encode(pg_temp, bl);
2418 n
= pg_temp
->size();
2420 for (const auto pg
: *pg_temp
) {
2421 old_pg_t opg
= pg
.first
.get_old_pg();
2423 ::encode(pg
.second
, bl
);
2428 crush
->encode(cbl
, 0 /* legacy (no) features */);
// Encode the map in the classic (pre-OSDMAP_ENC) format.
// Peers without CEPH_FEATURE_PGID64 are handed to encode_client_old().
// Otherwise the visible field order is: epoch, created, modified, pools,
// pool_name, pool_max, flags, max_osd, osd_state (as uint8_t values),
// osd_weight, client addresses, pg_temp, the crush map (no features), then
// the extended section: hb_back addresses, osd_info, blacklist, cluster
// addresses, cluster snapshot epoch and name, osd_uuid, osd_xinfo and
// hb_front addresses.
// NOTE(review): the embedded line numbers are not contiguous (e.g. 2433,
// 2436-2443, 2447, 2457, 2466-2468, 2470-2474). The return after
// encode_client_old(), the version fields and the crush buffer handling on
// those lines are missing from this listing.
2432 void OSDMap::encode_classic(bufferlist
& bl
, uint64_t features
) const
2434 if ((features
& CEPH_FEATURE_PGID64
) == 0) {
2435 encode_client_old(bl
);
2444 ::encode(epoch
, bl
);
2445 ::encode(created
, bl
);
2446 ::encode(modified
, bl
);
2448 ::encode(pools
, bl
, features
);
2449 ::encode(pool_name
, bl
);
2450 ::encode(pool_max
, bl
);
2452 ::encode(flags
, bl
);
2454 ::encode(max_osd
, bl
);
2456 uint32_t n
= osd_state
.size();
2458 for (auto s
: osd_state
) {
2459 ::encode((uint8_t)s
, bl
);
2462 ::encode(osd_weight
, bl
);
2463 ::encode(osd_addrs
->client_addr
, bl
, features
);
2465 ::encode(*pg_temp
, bl
);
2469 crush
->encode(cbl
, 0 /* legacy (no) features */);
2475 ::encode(osd_addrs
->hb_back_addr
, bl
, features
);
2476 ::encode(osd_info
, bl
);
2477 ::encode(blacklist
, bl
, features
);
2478 ::encode(osd_addrs
->cluster_addr
, bl
, features
);
2479 ::encode(cluster_snapshot_epoch
, bl
);
2480 ::encode(cluster_snapshot
, bl
);
2481 ::encode(*osd_uuid
, bl
);
2482 ::encode(osd_xinfo
, bl
);
2483 ::encode(osd_addrs
->hb_front_addr
, bl
, features
);
// Encode the map in the current format.
// Peers without CEPH_FEATURE_OSDMAP_ENC are handed to encode_classic().
// Callers must pass CEPH_FEATURE_RESERVED (asserted, then stripped from
// features). Layout, as far as visible:
//  - meta wrapper: ENCODE_START(8, 7, bl)
//  - client-usable section: fsid/epoch fields, pools, pool names, flags
//    (a local copy f gets legacy REQUIRE_* bits derived from
//    require_osd_release), max_osd, osd_state, osd_weight, client
//    addresses, pg_temp, primary_temp, primary affinity, crush,
//    erasure-code profiles, pg_upmap / pg_upmap_items (asserted empty on
//    one path) and crush_version
//  - osd-only section (target_v starts at 5): heartbeat/cluster addresses,
//    osd_info, blacklist copied into an ordered map for deterministic
//    output, cluster snapshot, uuids, xinfo, full ratios,
//    require_min_compat_client and require_osd_release
//  - a placeholder crc, later overwritten with the crc32c of everything
//    before it plus any bytes after tail_offset.
// NOTE(review): the embedded line numbers are not contiguous (e.g.
// 2490-2492, 2501, 2503, 2506-2507, 2510, 2512-2513, 2515-2516, 2533-2534,
// 2536-2537, 2539, 2541, 2543, 2555-2561, 2563, 2565-2566, 2569, 2572-2573,
// 2584-2585, 2614-2615, 2617-2618, 2631-2633). The version selection for
// non-luminous peers, the branch conditions around osd_state / pg_upmap
// and several declarations on those lines are missing from this listing;
// note in particular that line 2535 as shown encodes flags, not the local f.
2486 void OSDMap::encode(bufferlist
& bl
, uint64_t features
) const
2488 if ((features
& CEPH_FEATURE_OSDMAP_ENC
) == 0) {
2489 encode_classic(bl
, features
);
2493 // only a select set of callers should *ever* be encoding new
2494 // OSDMaps. others should be passing around the canonical encoded
2495 // buffers from on high. select out those callers by passing in an
2496 // "impossible" feature bit.
2497 assert(features
& CEPH_FEATURE_RESERVED
);
2498 features
&= ~CEPH_FEATURE_RESERVED
;
2500 size_t start_offset
= bl
.length();
2502 buffer::list::iterator crc_it
;
2504 // meta-encoding: how we include client-used and osd-specific data
2505 ENCODE_START(8, 7, bl
);
2508 // NOTE: any new encoding dependencies must be reflected by
2509 // SIGNIFICANT_FEATURES
2511 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
2514 ENCODE_START(v
, 1, bl
); // client-usable data
2517 ::encode(epoch
, bl
);
2518 ::encode(created
, bl
);
2519 ::encode(modified
, bl
);
2521 ::encode(pools
, bl
, features
);
2522 ::encode(pool_name
, bl
);
2523 ::encode(pool_max
, bl
);
2526 decltype(flags
) f
= flags
;
2527 if (require_osd_release
>= CEPH_RELEASE_LUMINOUS
)
2528 f
|= CEPH_OSDMAP_REQUIRE_LUMINOUS
| CEPH_OSDMAP_RECOVERY_DELETES
;
2529 else if (require_osd_release
== CEPH_RELEASE_KRAKEN
)
2530 f
|= CEPH_OSDMAP_REQUIRE_KRAKEN
;
2531 else if (require_osd_release
== CEPH_RELEASE_JEWEL
)
2532 f
|= CEPH_OSDMAP_REQUIRE_JEWEL
;
2535 ::encode(flags
, bl
);
2538 ::encode(max_osd
, bl
);
2540 ::encode(osd_state
, bl
);
2542 uint32_t n
= osd_state
.size();
2544 for (auto s
: osd_state
) {
2545 ::encode((uint8_t)s
, bl
);
2548 ::encode(osd_weight
, bl
);
2549 ::encode(osd_addrs
->client_addr
, bl
, features
);
2551 ::encode(*pg_temp
, bl
);
2552 ::encode(*primary_temp
, bl
);
2553 if (osd_primary_affinity
) {
2554 ::encode(*osd_primary_affinity
, bl
);
2562 crush
->encode(cbl
, features
);
2564 ::encode(erasure_code_profiles
, bl
);
2567 ::encode(pg_upmap
, bl
);
2568 ::encode(pg_upmap_items
, bl
);
2570 assert(pg_upmap
.empty());
2571 assert(pg_upmap_items
.empty());
2574 ::encode(crush_version
, bl
);
2576 ENCODE_FINISH(bl
); // client-usable data
2580 // NOTE: any new encoding dependencies must be reflected by
2581 // SIGNIFICANT_FEATURES
2582 uint8_t target_v
= 5;
2583 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
2586 ENCODE_START(target_v
, 1, bl
); // extended, osd-only data
2587 ::encode(osd_addrs
->hb_back_addr
, bl
, features
);
2588 ::encode(osd_info
, bl
);
2590 // put this in a sorted, ordered map<> so that we encode in a
2591 // deterministic order.
2592 map
<entity_addr_t
,utime_t
> blacklist_map
;
2593 for (const auto &addr
: blacklist
)
2594 blacklist_map
.insert(make_pair(addr
.first
, addr
.second
));
2595 ::encode(blacklist_map
, bl
, features
);
2597 ::encode(osd_addrs
->cluster_addr
, bl
, features
);
2598 ::encode(cluster_snapshot_epoch
, bl
);
2599 ::encode(cluster_snapshot
, bl
);
2600 ::encode(*osd_uuid
, bl
);
2601 ::encode(osd_xinfo
, bl
);
2602 ::encode(osd_addrs
->hb_front_addr
, bl
, features
);
2603 if (target_v
>= 2) {
2604 ::encode(nearfull_ratio
, bl
);
2605 ::encode(full_ratio
, bl
);
2606 ::encode(backfillfull_ratio
, bl
);
2608 // 4 was string-based new_require_min_compat_client
2609 if (target_v
>= 5) {
2610 ::encode(require_min_compat_client
, bl
);
2611 ::encode(require_osd_release
, bl
);
2613 ENCODE_FINISH(bl
); // osd-only data
2616 ::encode((uint32_t)0, bl
); // dummy crc
2619 tail_offset
= bl
.length();
2621 ENCODE_FINISH(bl
); // meta-encoding wrapper
2625 front
.substr_of(bl
, start_offset
, crc_it
.get_off() - start_offset
);
2626 crc
= front
.crc32c(-1);
2627 if (tail_offset
< bl
.length()) {
2629 tail
.substr_of(bl
, tail_offset
, bl
.length() - tail_offset
);
2630 crc
= tail
.crc32c(crc
);
2634 crc_it
.copy_in(4, (char*)&crc_le
);
// Decode convenience overload taking a whole bufferlist: an iterator is
// positioned at the start of bl.
// NOTE(review): the embedded line numbers skip 2639 and 2641-2642, so the
// statement that consumes the iterator is missing from this listing.
2638 void OSDMap::decode(bufferlist
& bl
)
2640 auto p
= bl
.begin();
// Decode a map written in the classic (pre-OSDMAP_ENC) format.
// Visible steps: created/modified; pools and pool names, with
// version-dependent layouts (a v == 5 branch is visible) and pool_max read
// either as an int32_t or directly; a fix-up raising pool_max to the
// highest pool id (#2307); max_osd; osd_state copied element by element
// from a temporary os; osd_weight; client addresses; pg_temp, either keyed
// by old pg ids (decode_raw) or decoded whole; crush; then the extended
// fields (hb_back addresses, osd_info, pool_name, blacklist, cluster
// addresses, cluster snapshot, uuids, xinfo, hb_front addresses), each of
// which has a visible resize() counterpart. osd_primary_affinity is reset
// at the end.
// NOTE(review): the embedded line numbers are not contiguous (e.g.
// 2645-2652, 2655-2657, 2661-2665, 2667-2670, 2672-2675, 2677-2682,
// 2689-2692, 2694-2696, 2704-2708, 2711, 2713-2714, 2716-2720,
// 2723-2727). The version reads and the conditions selecting between each
// decode and its resize() fallback are missing from this listing.
2644 void OSDMap::decode_classic(bufferlist::iterator
& p
)
2653 ::decode(created
, p
);
2654 ::decode(modified
, p
);
2658 int32_t max_pools
= 0;
2659 ::decode(max_pools
, p
);
2660 pool_max
= max_pools
;
2666 ::decode(pools
[t
], p
);
2671 } else if (v
== 5) {
2676 ::decode(pool_name
[t
], p
);
2683 ::decode(pool_name
, p
);
2684 ::decode(pool_max
, p
);
2686 // kludge around some old bug that zeroed out pool_max (#2307)
2687 if (pools
.size() && pool_max
< pools
.rbegin()->first
) {
2688 pool_max
= pools
.rbegin()->first
;
2693 ::decode(max_osd
, p
);
2697 osd_state
.resize(os
.size());
2698 for (unsigned i
= 0; i
< os
.size(); ++i
) {
2699 osd_state
[i
] = os
[i
];
2702 ::decode(osd_weight
, p
);
2703 ::decode(osd_addrs
->client_addr
, p
);
2709 ::decode_raw(opg
, p
);
2710 mempool::osdmap::vector
<int32_t> v
;
2712 pg_temp
->set(pg_t(opg
), v
);
2715 ::decode(*pg_temp
, p
);
2721 auto cblp
= cbl
.begin();
2722 crush
->decode(cblp
);
2728 ::decode(osd_addrs
->hb_back_addr
, p
);
2729 ::decode(osd_info
, p
);
2731 ::decode(pool_name
, p
);
2733 ::decode(blacklist
, p
);
2735 ::decode(osd_addrs
->cluster_addr
, p
);
2737 osd_addrs
->cluster_addr
.resize(osd_addrs
->client_addr
.size());
2740 ::decode(cluster_snapshot_epoch
, p
);
2741 ::decode(cluster_snapshot
, p
);
2745 ::decode(*osd_uuid
, p
);
2747 osd_uuid
->resize(max_osd
);
2750 ::decode(osd_xinfo
, p
);
2752 osd_xinfo
.resize(max_osd
);
2755 ::decode(osd_addrs
->hb_front_addr
, p
);
2757 osd_addrs
->hb_front_addr
.resize(osd_addrs
->hb_back_addr
.size());
2759 osd_primary_affinity
.reset();
2764 void OSDMap::decode(bufferlist::iterator
& bl
)
2767 * Older encodings of the OSDMap had a single struct_v which
2768 * covered the whole encoding, and was prior to our modern
2769 * stuff which includes a compatv and a size. So if we see
2770 * a struct_v < 7, we must rewind to the beginning and use our
2773 size_t start_offset
= bl
.get_off();
2774 size_t tail_offset
= 0;
2775 bufferlist crc_front
, crc_tail
;
2777 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl
); // wrapper
2779 int struct_v_size
= sizeof(struct_v
);
2780 bl
.advance(-struct_v_size
);
2785 * Since we made it past that hurdle, we can use our normal paths.
2788 DECODE_START(6, bl
); // client-usable data
2791 ::decode(epoch
, bl
);
2792 ::decode(created
, bl
);
2793 ::decode(modified
, bl
);
2795 ::decode(pools
, bl
);
2796 ::decode(pool_name
, bl
);
2797 ::decode(pool_max
, bl
);
2799 ::decode(flags
, bl
);
2801 ::decode(max_osd
, bl
);
2802 if (struct_v
>= 5) {
2803 ::decode(osd_state
, bl
);
2807 osd_state
.resize(os
.size());
2808 for (unsigned i
= 0; i
< os
.size(); ++i
) {
2809 osd_state
[i
] = os
[i
];
2812 ::decode(osd_weight
, bl
);
2813 ::decode(osd_addrs
->client_addr
, bl
);
2815 ::decode(*pg_temp
, bl
);
2816 ::decode(*primary_temp
, bl
);
2817 if (struct_v
>= 2) {
2818 osd_primary_affinity
.reset(new mempool::osdmap::vector
<__u32
>);
2819 ::decode(*osd_primary_affinity
, bl
);
2820 if (osd_primary_affinity
->empty())
2821 osd_primary_affinity
.reset();
2823 osd_primary_affinity
.reset();
2829 auto cblp
= cbl
.begin();
2830 crush
->decode(cblp
);
2831 if (struct_v
>= 3) {
2832 ::decode(erasure_code_profiles
, bl
);
2834 erasure_code_profiles
.clear();
2836 if (struct_v
>= 4) {
2837 ::decode(pg_upmap
, bl
);
2838 ::decode(pg_upmap_items
, bl
);
2841 pg_upmap_items
.clear();
2843 if (struct_v
>= 6) {
2844 ::decode(crush_version
, bl
);
2846 DECODE_FINISH(bl
); // client-usable data
2850 DECODE_START(5, bl
); // extended, osd-only data
2851 ::decode(osd_addrs
->hb_back_addr
, bl
);
2852 ::decode(osd_info
, bl
);
2853 ::decode(blacklist
, bl
);
2854 ::decode(osd_addrs
->cluster_addr
, bl
);
2855 ::decode(cluster_snapshot_epoch
, bl
);
2856 ::decode(cluster_snapshot
, bl
);
2857 ::decode(*osd_uuid
, bl
);
2858 ::decode(osd_xinfo
, bl
);
2859 ::decode(osd_addrs
->hb_front_addr
, bl
);
2860 if (struct_v
>= 2) {
2861 ::decode(nearfull_ratio
, bl
);
2862 ::decode(full_ratio
, bl
);
2867 if (struct_v
>= 3) {
2868 ::decode(backfillfull_ratio
, bl
);
2870 backfillfull_ratio
= 0;
2872 if (struct_v
== 4) {
2876 require_min_compat_client
= ceph_release_from_name(r
.c_str());
2878 if (struct_v
>= 5) {
2879 ::decode(require_min_compat_client
, bl
);
2880 ::decode(require_osd_release
, bl
);
2881 if (require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
2882 flags
&= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS
);
2883 flags
|= CEPH_OSDMAP_RECOVERY_DELETES
;
2886 if (flags
& CEPH_OSDMAP_REQUIRE_LUMINOUS
) {
2887 // only for compat with post-kraken pre-luminous test clusters
2888 require_osd_release
= CEPH_RELEASE_LUMINOUS
;
2889 flags
&= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS
);
2890 flags
|= CEPH_OSDMAP_RECOVERY_DELETES
;
2891 } else if (flags
& CEPH_OSDMAP_REQUIRE_KRAKEN
) {
2892 require_osd_release
= CEPH_RELEASE_KRAKEN
;
2893 } else if (flags
& CEPH_OSDMAP_REQUIRE_JEWEL
) {
2894 require_osd_release
= CEPH_RELEASE_JEWEL
;
2896 require_osd_release
= 0;
2899 DECODE_FINISH(bl
); // osd-only data
2902 if (struct_v
>= 8) {
2903 crc_front
.substr_of(bl
.get_bl(), start_offset
, bl
.get_off() - start_offset
);
2905 tail_offset
= bl
.get_off();
2908 crc_defined
= false;
2912 DECODE_FINISH(bl
); // wrapper
2916 uint32_t actual
= crc_front
.crc32c(-1);
2917 if (tail_offset
< bl
.get_off()) {
2919 tail
.substr_of(bl
.get_bl(), tail_offset
, bl
.get_off() - tail_offset
);
2920 actual
= tail
.crc32c(actual
);
2922 if (crc
!= actual
) {
2924 ss
<< "bad crc, actual " << actual
<< " != expected " << crc
;
2925 string s
= ss
.str();
2926 throw buffer::malformed_input(s
.c_str());
2933 void OSDMap::post_decode()
2937 for (const auto &pname
: pool_name
) {
2938 name_pool
[pname
.second
] = pname
.first
;
2942 _calc_up_osd_features();
2945 void OSDMap::dump_erasure_code_profiles(
2946 const mempool::osdmap::map
<string
,map
<string
,string
>>& profiles
,
2949 f
->open_object_section("erasure_code_profiles");
2950 for (const auto &profile
: profiles
) {
2951 f
->open_object_section(profile
.first
.c_str());
2952 for (const auto &profm
: profile
.second
) {
2953 f
->dump_string(profm
.first
.c_str(), profm
.second
.c_str());
2960 void OSDMap::dump(Formatter
*f
) const
2962 f
->dump_int("epoch", get_epoch());
2963 f
->dump_stream("fsid") << get_fsid();
2964 f
->dump_stream("created") << get_created();
2965 f
->dump_stream("modified") << get_modified();
2966 f
->dump_string("flags", get_flag_string());
2967 f
->dump_unsigned("crush_version", get_crush_version());
2968 f
->dump_float("full_ratio", full_ratio
);
2969 f
->dump_float("backfillfull_ratio", backfillfull_ratio
);
2970 f
->dump_float("nearfull_ratio", nearfull_ratio
);
2971 f
->dump_string("cluster_snapshot", get_cluster_snapshot());
2972 f
->dump_int("pool_max", get_pool_max());
2973 f
->dump_int("max_osd", get_max_osd());
2974 f
->dump_string("require_min_compat_client",
2975 ceph_release_name(require_min_compat_client
));
2976 f
->dump_string("min_compat_client",
2977 ceph_release_name(get_min_compat_client()));
2978 f
->dump_string("require_osd_release",
2979 ceph_release_name(require_osd_release
));
2981 f
->open_array_section("pools");
2982 for (const auto &pool
: pools
) {
2983 std::string
name("<unknown>");
2984 const auto &pni
= pool_name
.find(pool
.first
);
2985 if (pni
!= pool_name
.end())
2987 f
->open_object_section("pool");
2988 f
->dump_int("pool", pool
.first
);
2989 f
->dump_string("pool_name", name
);
2990 pool
.second
.dump(f
);
2995 f
->open_array_section("osds");
2996 for (int i
=0; i
<get_max_osd(); i
++)
2998 f
->open_object_section("osd_info");
2999 f
->dump_int("osd", i
);
3000 f
->dump_stream("uuid") << get_uuid(i
);
3001 f
->dump_int("up", is_up(i
));
3002 f
->dump_int("in", is_in(i
));
3003 f
->dump_float("weight", get_weightf(i
));
3004 f
->dump_float("primary_affinity", get_primary_affinityf(i
));
3005 get_info(i
).dump(f
);
3006 f
->dump_stream("public_addr") << get_addr(i
);
3007 f
->dump_stream("cluster_addr") << get_cluster_addr(i
);
3008 f
->dump_stream("heartbeat_back_addr") << get_hb_back_addr(i
);
3009 f
->dump_stream("heartbeat_front_addr") << get_hb_front_addr(i
);
3013 f
->open_array_section("state");
3014 for (const auto &state
: st
)
3015 f
->dump_string("state", state
);
3022 f
->open_array_section("osd_xinfo");
3023 for (int i
=0; i
<get_max_osd(); i
++) {
3025 f
->open_object_section("xinfo");
3026 f
->dump_int("osd", i
);
3027 osd_xinfo
[i
].dump(f
);
3033 f
->open_array_section("pg_upmap");
3034 for (auto& p
: pg_upmap
) {
3035 f
->open_object_section("mapping");
3036 f
->dump_stream("pgid") << p
.first
;
3037 f
->open_array_section("osds");
3038 for (auto q
: p
.second
) {
3039 f
->dump_int("osd", q
);
3045 f
->open_array_section("pg_upmap_items");
3046 for (auto& p
: pg_upmap_items
) {
3047 f
->open_object_section("mapping");
3048 f
->dump_stream("pgid") << p
.first
;
3049 f
->open_array_section("mappings");
3050 for (auto& q
: p
.second
) {
3051 f
->open_object_section("mapping");
3052 f
->dump_int("from", q
.first
);
3053 f
->dump_int("to", q
.second
);
3060 f
->open_array_section("pg_temp");
3064 f
->open_array_section("primary_temp");
3065 for (const auto &pg
: *primary_temp
) {
3066 f
->dump_stream("pgid") << pg
.first
;
3067 f
->dump_int("osd", pg
.second
);
3069 f
->close_section(); // primary_temp
3071 f
->open_object_section("blacklist");
3072 for (const auto &addr
: blacklist
) {
3075 f
->dump_stream(ss
.str().c_str()) << addr
.second
;
3079 dump_erasure_code_profiles(erasure_code_profiles
, f
);
3082 void OSDMap::generate_test_instances(list
<OSDMap
*>& o
)
3084 o
.push_back(new OSDMap
);
3086 CephContext
*cct
= new CephContext(CODE_ENVIRONMENT_UTILITY
);
3087 o
.push_back(new OSDMap
);
3089 o
.back()->build_simple(cct
, 1, fsid
, 16);
3090 o
.back()->created
= o
.back()->modified
= utime_t(1, 2); // fix timestamp
3091 o
.back()->blacklist
[entity_addr_t()] = utime_t(5, 6);
3095 string
OSDMap::get_flag_string(unsigned f
)
3098 if ( f
& CEPH_OSDMAP_NEARFULL
)
3100 if (f
& CEPH_OSDMAP_FULL
)
3102 if (f
& CEPH_OSDMAP_PAUSERD
)
3104 if (f
& CEPH_OSDMAP_PAUSEWR
)
3106 if (f
& CEPH_OSDMAP_PAUSEREC
)
3108 if (f
& CEPH_OSDMAP_NOUP
)
3110 if (f
& CEPH_OSDMAP_NODOWN
)
3112 if (f
& CEPH_OSDMAP_NOOUT
)
3114 if (f
& CEPH_OSDMAP_NOIN
)
3116 if (f
& CEPH_OSDMAP_NOBACKFILL
)
3118 if (f
& CEPH_OSDMAP_NOREBALANCE
)
3119 s
+= ",norebalance";
3120 if (f
& CEPH_OSDMAP_NORECOVER
)
3122 if (f
& CEPH_OSDMAP_NOSCRUB
)
3124 if (f
& CEPH_OSDMAP_NODEEP_SCRUB
)
3125 s
+= ",nodeep-scrub";
3126 if (f
& CEPH_OSDMAP_NOTIERAGENT
)
3127 s
+= ",notieragent";
3128 if (f
& CEPH_OSDMAP_SORTBITWISE
)
3129 s
+= ",sortbitwise";
3130 if (f
& CEPH_OSDMAP_REQUIRE_JEWEL
)
3131 s
+= ",require_jewel_osds";
3132 if (f
& CEPH_OSDMAP_REQUIRE_KRAKEN
)
3133 s
+= ",require_kraken_osds";
3134 if (f
& CEPH_OSDMAP_REQUIRE_LUMINOUS
)
3135 s
+= ",require_luminous_osds";
3136 if (f
& CEPH_OSDMAP_RECOVERY_DELETES
)
3137 s
+= ",recovery_deletes";
3138 if (f
& CEPH_OSDMAP_PURGED_SNAPDIRS
)
3139 s
+= ",purged_snapdirs";
3145 string
OSDMap::get_flag_string() const
3147 return get_flag_string(flags
);
3150 void OSDMap::print_pools(ostream
& out
) const
3152 for (const auto &pool
: pools
) {
3153 std::string
name("<unknown>");
3154 const auto &pni
= pool_name
.find(pool
.first
);
3155 if (pni
!= pool_name
.end())
3157 out
<< "pool " << pool
.first
3159 << "' " << pool
.second
<< "\n";
3161 for (const auto &snap
: pool
.second
.snaps
)
3162 out
<< "\tsnap " << snap
.second
.snapid
<< " '" << snap
.second
.name
<< "' " << snap
.second
.stamp
<< "\n";
3164 if (!pool
.second
.removed_snaps
.empty())
3165 out
<< "\tremoved_snaps " << pool
.second
.removed_snaps
<< "\n";
3170 void OSDMap::print(ostream
& out
) const
3172 out
<< "epoch " << get_epoch() << "\n"
3173 << "fsid " << get_fsid() << "\n"
3174 << "created " << get_created() << "\n"
3175 << "modified " << get_modified() << "\n";
3177 out
<< "flags " << get_flag_string() << "\n";
3178 out
<< "crush_version " << get_crush_version() << "\n";
3179 out
<< "full_ratio " << full_ratio
<< "\n";
3180 out
<< "backfillfull_ratio " << backfillfull_ratio
<< "\n";
3181 out
<< "nearfull_ratio " << nearfull_ratio
<< "\n";
3182 if (require_min_compat_client
> 0) {
3183 out
<< "require_min_compat_client "
3184 << ceph_release_name(require_min_compat_client
) << "\n";
3186 out
<< "min_compat_client " << ceph_release_name(get_min_compat_client())
3188 if (require_osd_release
> 0) {
3189 out
<< "require_osd_release " << ceph_release_name(require_osd_release
)
3192 if (get_cluster_snapshot().length())
3193 out
<< "cluster_snapshot " << get_cluster_snapshot() << "\n";
3198 out
<< "max_osd " << get_max_osd() << "\n";
3199 for (int i
=0; i
<get_max_osd(); i
++) {
3202 out
<< (is_up(i
) ? " up ":" down");
3203 out
<< (is_in(i
) ? " in ":" out");
3204 out
<< " weight " << get_weightf(i
);
3205 if (get_primary_affinity(i
) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
)
3206 out
<< " primary_affinity " << get_primary_affinityf(i
);
3207 const osd_info_t
& info(get_info(i
));
3209 out
<< " " << get_addr(i
) << " " << get_cluster_addr(i
) << " " << get_hb_back_addr(i
)
3210 << " " << get_hb_front_addr(i
);
3214 if (!get_uuid(i
).is_zero())
3215 out
<< " " << get_uuid(i
);
3221 for (auto& p
: pg_upmap
) {
3222 out
<< "pg_upmap " << p
.first
<< " " << p
.second
<< "\n";
3224 for (auto& p
: pg_upmap_items
) {
3225 out
<< "pg_upmap_items " << p
.first
<< " " << p
.second
<< "\n";
3228 for (const auto pg
: *pg_temp
)
3229 out
<< "pg_temp " << pg
.first
<< " " << pg
.second
<< "\n";
3231 for (const auto pg
: *primary_temp
)
3232 out
<< "primary_temp " << pg
.first
<< " " << pg
.second
<< "\n";
3234 for (const auto &addr
: blacklist
)
3235 out
<< "blacklist " << addr
.first
<< " expires " << addr
.second
<< "\n";
3237 // ignore pg_swap_primary
3240 class OSDTreePlainDumper
: public CrushTreeDumper::Dumper
<TextTable
> {
3242 typedef CrushTreeDumper::Dumper
<TextTable
> Parent
;
3244 OSDTreePlainDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap_
,
3246 : Parent(crush
, osdmap_
->get_pool_names()), osdmap(osdmap_
), filter(f
) { }
3248 bool should_dump_leaf(int i
) const override
{
3250 return true; // normal case
3252 if (((filter
& OSDMap::DUMP_UP
) && osdmap
->is_up(i
)) ||
3253 ((filter
& OSDMap::DUMP_DOWN
) && osdmap
->is_down(i
)) ||
3254 ((filter
& OSDMap::DUMP_IN
) && osdmap
->is_in(i
)) ||
3255 ((filter
& OSDMap::DUMP_OUT
) && osdmap
->is_out(i
)) ||
3256 ((filter
& OSDMap::DUMP_DESTROYED
) && osdmap
->is_destroyed(i
))) {
3262 bool should_dump_empty_bucket() const override
{
3266 void dump(TextTable
*tbl
) {
3267 tbl
->define_column("ID", TextTable::LEFT
, TextTable::RIGHT
);
3268 tbl
->define_column("CLASS", TextTable::LEFT
, TextTable::RIGHT
);
3269 tbl
->define_column("WEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
3270 tbl
->define_column("TYPE NAME", TextTable::LEFT
, TextTable::LEFT
);
3271 tbl
->define_column("STATUS", TextTable::LEFT
, TextTable::RIGHT
);
3272 tbl
->define_column("REWEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
3273 tbl
->define_column("PRI-AFF", TextTable::LEFT
, TextTable::RIGHT
);
3277 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
3278 if (osdmap
->exists(i
) && !is_touched(i
) && should_dump_leaf(i
)) {
3279 dump_item(CrushTreeDumper::Item(i
, 0, 0, 0), tbl
);
3285 void dump_item(const CrushTreeDumper::Item
&qi
, TextTable
*tbl
) override
{
3286 const char *c
= crush
->get_item_class(qi
.id
);
3291 << weightf_t(qi
.weight
);
3294 for (int k
= 0; k
< qi
.depth
; k
++)
3296 if (qi
.is_bucket()) {
3297 name
<< crush
->get_type_name(crush
->get_bucket_type(qi
.id
)) << " "
3298 << crush
->get_item_name(qi
.id
);
3300 name
<< "osd." << qi
.id
;
3304 if (!qi
.is_bucket()) {
3305 if (!osdmap
->exists(qi
.id
)) {
3310 if (osdmap
->is_up(qi
.id
)) {
3312 } else if (osdmap
->is_destroyed(qi
.id
)) {
3318 << weightf_t(osdmap
->get_weightf(qi
.id
))
3319 << weightf_t(osdmap
->get_primary_affinityf(qi
.id
));
3322 *tbl
<< TextTable::endrow
;
3326 const OSDMap
*osdmap
;
3327 const unsigned filter
;
3330 class OSDTreeFormattingDumper
: public CrushTreeDumper::FormattingDumper
{
3332 typedef CrushTreeDumper::FormattingDumper Parent
;
3334 OSDTreeFormattingDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap_
,
3336 : Parent(crush
, osdmap_
->get_pool_names()), osdmap(osdmap_
), filter(f
) { }
3338 bool should_dump_leaf(int i
) const override
{
3340 return true; // normal case
3342 if (((filter
& OSDMap::DUMP_UP
) && osdmap
->is_up(i
)) ||
3343 ((filter
& OSDMap::DUMP_DOWN
) && osdmap
->is_down(i
)) ||
3344 ((filter
& OSDMap::DUMP_IN
) && osdmap
->is_in(i
)) ||
3345 ((filter
& OSDMap::DUMP_OUT
) && osdmap
->is_out(i
)) ||
3346 ((filter
& OSDMap::DUMP_DESTROYED
) && osdmap
->is_destroyed(i
))) {
3352 bool should_dump_empty_bucket() const override
{
3356 void dump(Formatter
*f
) {
3357 f
->open_array_section("nodes");
3360 f
->open_array_section("stray");
3361 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
3362 if (osdmap
->exists(i
) && !is_touched(i
) && should_dump_leaf(i
))
3363 dump_item(CrushTreeDumper::Item(i
, 0, 0, 0), f
);
3369 void dump_item_fields(const CrushTreeDumper::Item
&qi
, Formatter
*f
) override
{
3370 Parent::dump_item_fields(qi
, f
);
3371 if (!qi
.is_bucket())
3374 if (osdmap
->is_up(qi
.id
)) {
3376 } else if (osdmap
->is_destroyed(qi
.id
)) {
3381 f
->dump_unsigned("exists", (int)osdmap
->exists(qi
.id
));
3382 f
->dump_string("status", s
);
3383 f
->dump_float("reweight", osdmap
->get_weightf(qi
.id
));
3384 f
->dump_float("primary_affinity", osdmap
->get_primary_affinityf(qi
.id
));
3389 const OSDMap
*osdmap
;
3390 const unsigned filter
;
3393 void OSDMap::print_tree(Formatter
*f
, ostream
*out
, unsigned filter
) const
3396 OSDTreeFormattingDumper(crush
.get(), this, filter
).dump(f
);
3400 OSDTreePlainDumper(crush
.get(), this, filter
).dump(&tbl
);
3405 void OSDMap::print_summary(Formatter
*f
, ostream
& out
,
3406 const string
& prefix
) const
3409 f
->open_object_section("osdmap");
3410 f
->dump_int("epoch", get_epoch());
3411 f
->dump_int("num_osds", get_num_osds());
3412 f
->dump_int("num_up_osds", get_num_up_osds());
3413 f
->dump_int("num_in_osds", get_num_in_osds());
3414 f
->dump_bool("full", test_flag(CEPH_OSDMAP_FULL
) ? true : false);
3415 f
->dump_bool("nearfull", test_flag(CEPH_OSDMAP_NEARFULL
) ? true : false);
3416 f
->dump_unsigned("num_remapped_pgs", get_num_pg_temp());
3419 out
<< get_num_osds() << " osds: "
3420 << get_num_up_osds() << " up, "
3421 << get_num_in_osds() << " in";
3422 if (get_num_pg_temp())
3423 out
<< "; " << get_num_pg_temp() << " remapped pgs";
3425 uint64_t important_flags
= flags
& ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS
;
3426 if (important_flags
)
3427 out
<< prefix
<< "flags " << get_flag_string(important_flags
) << "\n";
3431 void OSDMap::print_oneline_summary(ostream
& out
) const
3433 out
<< "e" << get_epoch() << ": "
3434 << get_num_osds() << " total, "
3435 << get_num_up_osds() << " up, "
3436 << get_num_in_osds() << " in";
3437 if (test_flag(CEPH_OSDMAP_FULL
))
3439 else if (test_flag(CEPH_OSDMAP_NEARFULL
))
3443 bool OSDMap::crush_rule_in_use(int rule_id
) const
3445 for (const auto &pool
: pools
) {
3446 if (pool
.second
.crush_rule
== rule_id
)
3452 int OSDMap::validate_crush_rules(CrushWrapper
*newcrush
,
3455 for (auto& i
: pools
) {
3456 auto& pool
= i
.second
;
3457 int ruleno
= pool
.get_crush_rule();
3458 if (!newcrush
->rule_exists(ruleno
)) {
3459 *ss
<< "pool " << i
.first
<< " references crush_rule " << ruleno
3460 << " but it is not present";
3463 if (newcrush
->get_rule_mask_ruleset(ruleno
) != ruleno
) {
3464 *ss
<< "rule " << ruleno
<< " mask ruleset does not match rule id";
3467 if (newcrush
->get_rule_mask_type(ruleno
) != (int)pool
.get_type()) {
3468 *ss
<< "pool " << i
.first
<< " type does not match rule " << ruleno
;
3471 if (pool
.get_size() < (int)newcrush
->get_rule_mask_min_size(ruleno
) ||
3472 pool
.get_size() > (int)newcrush
->get_rule_mask_max_size(ruleno
)) {
3473 *ss
<< "pool " << i
.first
<< " size " << pool
.get_size() << " does not"
3474 << " fall within rule " << ruleno
3475 << " min_size " << newcrush
->get_rule_mask_min_size(ruleno
)
3476 << " and max_size " << newcrush
->get_rule_mask_max_size(ruleno
);
3483 int OSDMap::build_simple_optioned(CephContext
*cct
, epoch_t e
, uuid_d
&fsid
,
3484 int nosd
, int pg_bits
, int pgp_bits
,
3487 ldout(cct
, 10) << "build_simple on " << nosd
3488 << " osds" << dendl
;
3491 created
= modified
= ceph_clock_now();
3498 const md_config_t
*conf
= cct
->_conf
;
3499 vector
<string
> sections
;
3500 conf
->get_all_sections(sections
);
3502 for (auto §ion
: sections
) {
3503 if (section
.find("osd.") != 0)
3506 const char *begin
= section
.c_str() + 4;
3507 char *end
= (char*)begin
;
3508 int o
= strtol(begin
, &end
, 10);
3512 if (o
> cct
->_conf
->mon_max_osd
) {
3513 lderr(cct
) << "[osd." << o
<< "] in config has id > mon_max_osd " << cct
->_conf
->mon_max_osd
<< dendl
;
3521 set_max_osd(maxosd
+ 1);
3528 r
= build_simple_crush_map(cct
, *crush
, nosd
, &ss
);
3530 r
= build_simple_crush_map_from_conf(cct
, *crush
, &ss
);
3533 int poolbase
= get_max_osd() ? get_max_osd() : 1;
3535 const int default_replicated_rule
= crush
->get_osd_pool_default_crush_replicated_ruleset(cct
);
3536 assert(default_replicated_rule
>= 0);
3539 // pgp_num <= pg_num
3540 if (pgp_bits
> pg_bits
)
3543 vector
<string
> pool_names
;
3544 pool_names
.push_back("rbd");
3545 for (auto &plname
: pool_names
) {
3546 int64_t pool
= ++pool_max
;
3547 pools
[pool
].type
= pg_pool_t::TYPE_REPLICATED
;
3548 pools
[pool
].flags
= cct
->_conf
->osd_pool_default_flags
;
3549 if (cct
->_conf
->osd_pool_default_flag_hashpspool
)
3550 pools
[pool
].set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
3551 if (cct
->_conf
->osd_pool_default_flag_nodelete
)
3552 pools
[pool
].set_flag(pg_pool_t::FLAG_NODELETE
);
3553 if (cct
->_conf
->osd_pool_default_flag_nopgchange
)
3554 pools
[pool
].set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
3555 if (cct
->_conf
->osd_pool_default_flag_nosizechange
)
3556 pools
[pool
].set_flag(pg_pool_t::FLAG_NOSIZECHANGE
);
3557 pools
[pool
].size
= cct
->_conf
->osd_pool_default_size
;
3558 pools
[pool
].min_size
= cct
->_conf
->get_osd_pool_default_min_size();
3559 pools
[pool
].crush_rule
= default_replicated_rule
;
3560 pools
[pool
].object_hash
= CEPH_STR_HASH_RJENKINS
;
3561 pools
[pool
].set_pg_num(poolbase
<< pg_bits
);
3562 pools
[pool
].set_pgp_num(poolbase
<< pgp_bits
);
3563 pools
[pool
].last_change
= epoch
;
3564 pools
[pool
].application_metadata
.insert(
3565 {pg_pool_t::APPLICATION_NAME_RBD
, {}});
3566 pool_name
[pool
] = plname
;
3567 name_pool
[plname
] = pool
;
3571 for (int i
=0; i
<get_max_osd(); i
++) {
3573 set_weight(i
, CEPH_OSD_OUT
);
3576 map
<string
,string
> profile_map
;
3577 r
= get_erasure_code_profile_default(cct
, profile_map
, &ss
);
3579 lderr(cct
) << ss
.str() << dendl
;
3582 set_erasure_code_profile("default", profile_map
);
3586 int OSDMap::get_erasure_code_profile_default(CephContext
*cct
,
3587 map
<string
,string
> &profile_map
,
3590 int r
= get_json_str_map(cct
->_conf
->osd_pool_default_erasure_code_profile
,
3596 int OSDMap::_build_crush_types(CrushWrapper
& crush
)
3598 crush
.set_type_name(0, "osd");
3599 crush
.set_type_name(1, "host");
3600 crush
.set_type_name(2, "chassis");
3601 crush
.set_type_name(3, "rack");
3602 crush
.set_type_name(4, "row");
3603 crush
.set_type_name(5, "pdu");
3604 crush
.set_type_name(6, "pod");
3605 crush
.set_type_name(7, "room");
3606 crush
.set_type_name(8, "datacenter");
3607 crush
.set_type_name(9, "region");
3608 crush
.set_type_name(10, "root");
3612 int OSDMap::build_simple_crush_map(CephContext
*cct
, CrushWrapper
& crush
,
3613 int nosd
, ostream
*ss
)
3618 int root_type
= _build_crush_types(crush
);
3620 int r
= crush
.add_bucket(0, 0, CRUSH_HASH_DEFAULT
,
3621 root_type
, 0, NULL
, NULL
, &rootid
);
3623 crush
.set_item_name(rootid
, "default");
3625 for (int o
=0; o
<nosd
; o
++) {
3626 map
<string
,string
> loc
;
3627 loc
["host"] = "localhost";
3628 loc
["rack"] = "localrack";
3629 loc
["root"] = "default";
3630 ldout(cct
, 10) << " adding osd." << o
<< " at " << loc
<< dendl
;
3632 snprintf(name
, sizeof(name
), "osd.%d", o
);
3633 crush
.insert_item(cct
, o
, 1.0, name
, loc
);
3636 build_simple_crush_rules(cct
, crush
, "default", ss
);
3643 int OSDMap::build_simple_crush_map_from_conf(CephContext
*cct
,
3644 CrushWrapper
& crush
,
3647 const md_config_t
*conf
= cct
->_conf
;
3652 int root_type
= _build_crush_types(crush
);
3654 int r
= crush
.add_bucket(0, 0,
3656 root_type
, 0, NULL
, NULL
, &rootid
);
3658 crush
.set_item_name(rootid
, "default");
3661 vector
<string
> sections
;
3662 conf
->get_all_sections(sections
);
3664 for (auto §ion
: sections
) {
3665 if (section
.find("osd.") != 0)
3668 const char *begin
= section
.c_str() + 4;
3669 char *end
= (char*)begin
;
3670 int o
= strtol(begin
, &end
, 10);
3674 string host
, rack
, row
, room
, dc
, pool
;
3675 vector
<string
> sectiontmp
;
3676 sectiontmp
.push_back("osd");
3677 sectiontmp
.push_back(section
);
3678 conf
->get_val_from_conf_file(sectiontmp
, "host", host
, false);
3679 conf
->get_val_from_conf_file(sectiontmp
, "rack", rack
, false);
3680 conf
->get_val_from_conf_file(sectiontmp
, "row", row
, false);
3681 conf
->get_val_from_conf_file(sectiontmp
, "room", room
, false);
3682 conf
->get_val_from_conf_file(sectiontmp
, "datacenter", dc
, false);
3683 conf
->get_val_from_conf_file(sectiontmp
, "root", pool
, false);
3685 if (host
.length() == 0)
3686 host
= "unknownhost";
3687 if (rack
.length() == 0)
3688 rack
= "unknownrack";
3690 map
<string
,string
> loc
;
3698 loc
["datacenter"] = dc
;
3699 loc
["root"] = "default";
3701 ldout(cct
, 5) << " adding osd." << o
<< " at " << loc
<< dendl
;
3702 crush
.insert_item(cct
, o
, 1.0, section
, loc
);
3705 build_simple_crush_rules(cct
, crush
, "default", ss
);
3713 int OSDMap::build_simple_crush_rules(
3715 CrushWrapper
& crush
,
3719 int crush_rule
= crush
.get_osd_pool_default_crush_replicated_ruleset(cct
);
3720 string failure_domain
=
3721 crush
.get_type_name(cct
->_conf
->osd_crush_chooseleaf_type
);
3724 r
= crush
.add_simple_rule_at(
3725 "replicated_rule", root
, failure_domain
, "",
3726 "firstn", pg_pool_t::TYPE_REPLICATED
,
3730 // do not add an erasure rule by default or else we will implicitly
3731 // require the crush_v2 feature of clients
3735 int OSDMap::summarize_mapping_stats(
3737 const set
<int64_t> *pools
,
3745 for (auto &p
: get_pools())
3749 unsigned total_pg
= 0;
3750 unsigned moved_pg
= 0;
3751 vector
<unsigned> base_by_osd(get_max_osd(), 0);
3752 vector
<unsigned> new_by_osd(get_max_osd(), 0);
3753 for (int64_t pool_id
: ls
) {
3754 const pg_pool_t
*pi
= get_pg_pool(pool_id
);
3755 vector
<int> up
, up2
;
3757 for (unsigned ps
= 0; ps
< pi
->get_pg_num(); ++ps
) {
3758 pg_t
pgid(ps
, pool_id
, -1);
3759 total_pg
+= pi
->get_size();
3760 pg_to_up_acting_osds(pgid
, &up
, &up_primary
, nullptr, nullptr);
3761 for (int osd
: up
) {
3762 if (osd
>= 0 && osd
< get_max_osd())
3766 newmap
->pg_to_up_acting_osds(pgid
, &up2
, &up_primary
, nullptr, nullptr);
3767 for (int osd
: up2
) {
3768 if (osd
>= 0 && osd
< get_max_osd())
3771 if (pi
->type
== pg_pool_t::TYPE_ERASURE
) {
3772 for (unsigned i
=0; i
<up
.size(); ++i
) {
3773 if (up
[i
] != up2
[i
]) {
3777 } else if (pi
->type
== pg_pool_t::TYPE_REPLICATED
) {
3778 for (int osd
: up
) {
3779 if (std::find(up2
.begin(), up2
.end(), osd
) == up2
.end()) {
3784 assert(0 == "unhandled pool type");
3790 unsigned num_up_in
= 0;
3791 for (int osd
= 0; osd
< get_max_osd(); ++osd
) {
3792 if (is_up(osd
) && is_in(osd
))
3799 float avg_pg
= (float)total_pg
/ (float)num_up_in
;
3800 float base_stddev
= 0, new_stddev
= 0;
3801 int min
= -1, max
= -1;
3802 unsigned min_base_pg
= 0, max_base_pg
= 0;
3803 unsigned min_new_pg
= 0, max_new_pg
= 0;
3804 for (int osd
= 0; osd
< get_max_osd(); ++osd
) {
3805 if (is_up(osd
) && is_in(osd
)) {
3806 float base_diff
= (float)base_by_osd
[osd
] - avg_pg
;
3807 base_stddev
+= base_diff
* base_diff
;
3808 float new_diff
= (float)new_by_osd
[osd
] - avg_pg
;
3809 new_stddev
+= new_diff
* new_diff
;
3810 if (min
< 0 || base_by_osd
[osd
] < min_base_pg
) {
3812 min_base_pg
= base_by_osd
[osd
];
3813 min_new_pg
= new_by_osd
[osd
];
3815 if (max
< 0 || base_by_osd
[osd
] > max_base_pg
) {
3817 max_base_pg
= base_by_osd
[osd
];
3818 max_new_pg
= new_by_osd
[osd
];
3822 base_stddev
= sqrt(base_stddev
/ num_up_in
);
3823 new_stddev
= sqrt(new_stddev
/ num_up_in
);
3825 float edev
= sqrt(avg_pg
* (1.0 - (1.0 / (double)num_up_in
)));
3829 f
->open_object_section("utilization");
3832 f
->dump_unsigned("moved_pgs", moved_pg
);
3833 f
->dump_unsigned("total_pgs", total_pg
);
3837 percent
= (float)moved_pg
* 100.0 / (float)total_pg
;
3838 ss
<< "moved " << moved_pg
<< " / " << total_pg
3839 << " (" << percent
<< "%)\n";
3843 f
->dump_float("avg_pgs", avg_pg
);
3844 f
->dump_float("std_dev", base_stddev
);
3845 f
->dump_float("expected_baseline_std_dev", edev
);
3847 f
->dump_float("new_std_dev", new_stddev
);
3849 ss
<< "avg " << avg_pg
<< "\n";
3850 ss
<< "stddev " << base_stddev
;
3852 ss
<< " -> " << new_stddev
;
3853 ss
<< " (expected baseline " << edev
<< ")\n";
3857 f
->dump_unsigned("min_osd", min
);
3858 f
->dump_unsigned("min_osd_pgs", min_base_pg
);
3860 f
->dump_unsigned("new_min_osd_pgs", min_new_pg
);
3862 ss
<< "min osd." << min
<< " with " << min_base_pg
;
3864 ss
<< " -> " << min_new_pg
;
3865 ss
<< " pgs (" << (float)min_base_pg
/ avg_pg
;
3867 ss
<< " -> " << (float)min_new_pg
/ avg_pg
;
3873 f
->dump_unsigned("max_osd", max
);
3874 f
->dump_unsigned("max_osd_pgs", max_base_pg
);
3876 f
->dump_unsigned("new_max_osd_pgs", max_new_pg
);
3878 ss
<< "max osd." << max
<< " with " << max_base_pg
;
3880 ss
<< " -> " << max_new_pg
;
3881 ss
<< " pgs (" << (float)max_base_pg
/ avg_pg
;
3883 ss
<< " -> " << (float)max_new_pg
/ avg_pg
;
3895 int OSDMap::clean_pg_upmaps(
3897 Incremental
*pending_inc
)
3899 ldout(cct
, 10) << __func__
<< dendl
;
3901 for (auto& p
: pg_upmap
) {
3904 pg_to_raw_osds(p
.first
, &raw
, &primary
);
3905 if (vectors_equal(raw
, p
.second
)) {
3906 ldout(cct
, 10) << " removing redundant pg_upmap " << p
.first
<< " "
3907 << p
.second
<< dendl
;
3908 pending_inc
->old_pg_upmap
.insert(p
.first
);
3912 for (auto& p
: pg_upmap_items
) {
3915 pg_to_raw_osds(p
.first
, &raw
, &primary
);
3916 mempool::osdmap::vector
<pair
<int,int>> newmap
;
3917 for (auto& q
: p
.second
) {
3918 if (std::find(raw
.begin(), raw
.end(), q
.first
) != raw
.end()) {
3919 newmap
.push_back(q
);
3922 if (newmap
.empty()) {
3923 ldout(cct
, 10) << " removing no-op pg_upmap_items " << p
.first
<< " "
3924 << p
.second
<< dendl
;
3925 pending_inc
->old_pg_upmap_items
.insert(p
.first
);
3927 } else if (newmap
!= p
.second
) {
3928 ldout(cct
, 10) << " simplifying partially no-op pg_upmap_items "
3929 << p
.first
<< " " << p
.second
<< " -> " << newmap
<< dendl
;
3930 pending_inc
->new_pg_upmap_items
[p
.first
] = newmap
;
3937 bool OSDMap::try_pg_upmap(
3939 pg_t pg
, ///< pg to potentially remap
3940 const set
<int>& overfull
, ///< osds we'd want to evacuate
3941 const vector
<int>& underfull
, ///< osds to move to, in order of preference
3943 vector
<int> *out
) ///< resulting alternative mapping
3945 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
3948 int rule
= crush
->find_rule(pool
->get_crush_rule(), pool
->get_type(),
3953 // get original mapping
3954 _pg_to_raw_osds(*pool
, pg
, orig
, NULL
);
3956 // make sure there is something there to remap
3958 for (auto osd
: *orig
) {
3959 if (overfull
.count(osd
)) {
3968 int r
= crush
->try_remap_rule(
3972 overfull
, underfull
,
// Compute pg_upmap_items changes that even out the number of pgs per osd
// relative to each osd's weight, recording them in pending_inc
// (new_pg_upmap_items / old_pg_upmap_items).
// NOTE(review): many lines of this definition (some parameters, the outer
// loop header, braces, else/continue/break/return statements) are not visible
// in this view; the comments below only describe the visible statements.
3982 int OSDMap::calc_pg_upmaps(
3984 float max_deviation_ratio
,
3986 const set
<int64_t>& only_pools_orig
,
3987 OSDMap::Incremental
*pending_inc
)
// Pools to consider: every pool when the caller passed an empty set,
// otherwise the caller's set.
3989 set
<int64_t> only_pools
;
3990 if (only_pools_orig
.empty()) {
3991 for (auto& i
: pools
) {
3992 only_pools
.insert(i
.first
);
3995 only_pools
= only_pools_orig
;
// tmp is a working copy of this map; candidate upmap changes are applied to
// tmp.pg_upmap_items below.
3998 tmp
.deepish_copy_from(*this);
3999 float start_deviation
= 0;
4000 float end_deviation
= 0;
4001 int num_changed
= 0;
// osd -> set of pgs whose up set (as computed on tmp) contains that osd
4003 map
<int,set
<pg_t
>> pgs_by_osd
;
4005 float osd_weight_total
= 0;
4006 map
<int,float> osd_weight
;
4007 for (auto& i
: pools
) {
// filter on only_pools (the statement guarded by this test is not visible)
4008 if (!only_pools
.empty() && !only_pools
.count(i
.first
))
4010 for (unsigned ps
= 0; ps
< i
.second
.get_pg_num(); ++ps
) {
4011 pg_t
pg(ps
, i
.first
);
4013 tmp
.pg_to_up_acting_osds(pg
, &up
, nullptr, nullptr, nullptr);
4014 for (auto osd
: up
) {
4015 if (osd
!= CRUSH_ITEM_NONE
)
4016 pgs_by_osd
[osd
].insert(pg
);
// total pg instances for this pool: pool size * pg_num
4019 total_pgs
+= i
.second
.get_size() * i
.second
.get_pg_num();
// per-osd weights for the pool's crush rule, each scaled by the osd's
// weight from tmp.get_weightf()
4021 map
<int,float> pmap
;
4022 int ruleno
= tmp
.crush
->find_rule(i
.second
.get_crush_rule(),
4023 i
.second
.get_type(),
4024 i
.second
.get_size());
4025 tmp
.crush
->get_rule_weight_osd_map(ruleno
, &pmap
);
4026 ldout(cct
,30) << __func__
<< " pool " << i
.first
<< " ruleno " << ruleno
<< dendl
;
// NOTE(review): `auto p` copies each map element; `const auto&` would avoid it.
4027 for (auto p
: pmap
) {
4028 auto adjusted_weight
= tmp
.get_weightf(p
.first
) * p
.second
;
// zero adjusted weight is special-cased (the guarded statement is not
// visible here -- presumably the osd is skipped; confirm)
4029 if (adjusted_weight
== 0) {
4032 osd_weight
[p
.first
] += adjusted_weight
;
4033 osd_weight_total
+= adjusted_weight
;
// Log weight and pg count for every weighted osd and make sure each one has
// an entry in pgs_by_osd (emplace leaves an existing entry untouched).
4036 for (auto& i
: osd_weight
) {
4038 auto p
= pgs_by_osd
.find(i
.first
);
4039 if (p
!= pgs_by_osd
.end())
4040 pgs
= p
->second
.size();
4042 pgs_by_osd
.emplace(i
.first
, set
<pg_t
>());
4043 ldout(cct
, 20) << " osd." << i
.first
<< " weight " << i
.second
4044 << " pgs " << pgs
<< dendl
;
// Without any weight the pgs_per_weight division below is meaningless.
4047 if (osd_weight_total
== 0) {
4048 lderr(cct
) << __func__
<< " abort due to osd_weight_total == 0" << dendl
;
// number of pg instances each unit of weight should carry
4051 float pgs_per_weight
= total_pgs
/ osd_weight_total
;
4052 ldout(cct
, 10) << " osd_weight_total " << osd_weight_total
<< dendl
;
4053 ldout(cct
, 10) << " pgs_per_weight " << pgs_per_weight
<< dendl
;
// Per-osd deviation = actual pg count - target pg count, where
// target = osd weight * pgs_per_weight.
4056 float total_deviation
= 0;
4057 map
<int,float> osd_deviation
; // osd, deviation(pgs)
4058 multimap
<float,int> deviation_osd
; // deviation(pgs), osd
4060 for (auto& i
: pgs_by_osd
) {
4061 float target
= osd_weight
[i
.first
] * pgs_per_weight
;
4062 float deviation
= (float)i
.second
.size() - target
;
4063 ldout(cct
, 20) << " osd." << i
.first
4064 << "\tpgs " << i
.second
.size()
4065 << "\ttarget " << target
4066 << "\tdeviation " << deviation
4068 osd_deviation
[i
.first
] = deviation
;
4069 deviation_osd
.insert(make_pair(deviation
, i
.first
));
// an osd holding at least one pg more than its target counts as overfull
4070 if (deviation
>= 1.0)
4071 overfull
.insert(i
.first
);
// NOTE(review): `deviation` is a float; confirm that this unqualified abs()
// resolves to a floating-point overload and not to the integer abs(int),
// which would truncate the value.
4072 total_deviation
+= abs(deviation
);
// start_deviation is taken while nothing has been changed yet (num_changed
// == 0); end_deviation tracks the latest total.
4074 if (num_changed
== 0) {
4075 start_deviation
= total_deviation
;
4077 end_deviation
= total_deviation
;
4079 // build underfull, sorted from least-full to most-average
// (deviation_osd is keyed by deviation, so forward iteration is ascending)
4080 vector
<int> underfull
;
4081 for (auto i
= deviation_osd
.begin();
4082 i
!= deviation_osd
.end();
// -.999 is the cut-off between "underfull" and "close enough to target"
// (the statement guarded by this test is not visible here)
4084 if (i
->first
>= -.999)
4086 underfull
.push_back(i
->second
);
4088 ldout(cct
, 10) << " total_deviation " << total_deviation
4089 << " overfull " << overfull
4090 << " underfull " << underfull
<< dendl
;
// nothing can be moved unless there is both a source and a destination
4091 if (overfull
.empty() || underfull
.empty())
// Visit osds from the largest deviation downwards (reverse iteration).
4095 bool restart
= false;
4096 for (auto p
= deviation_osd
.rbegin(); p
!= deviation_osd
.rend(); ++p
) {
4097 int osd
= p
->second
;
4098 float deviation
= p
->first
;
4099 // make sure osd is still there (belongs to this crush-tree)
4100 assert(osd_weight
.count(osd
));
4101 float target
= osd_weight
[osd
] * pgs_per_weight
;
// relative deviation below the caller's threshold: log it (the action taken
// after the log message is not visible here)
4103 if (deviation
/target
< max_deviation_ratio
) {
4104 ldout(cct
, 10) << " osd." << osd
4105 << " target " << target
4106 << " deviation " << deviation
4107 << " -> ratio " << deviation
/target
4108 << " < max ratio " << max_deviation_ratio
<< dendl
;
// float -> int conversion truncates toward zero
4111 int num_to_move
= deviation
;
4112 ldout(cct
, 10) << " osd." << osd
<< " move " << num_to_move
<< dendl
;
4113 if (num_to_move
< 1)
4116 set
<pg_t
>& pgs
= pgs_by_osd
[osd
];
4118 // look for remaps we can un-remap
// i.e. existing pg_upmap_items entries whose target (q.second) is this osd
4119 for (auto pg
: pgs
) {
4120 auto p
= tmp
.pg_upmap_items
.find(pg
);
4121 if (p
!= tmp
.pg_upmap_items
.end()) {
4122 for (auto q
: p
->second
) {
4123 if (q
.second
== osd
) {
4124 ldout(cct
, 10) << " dropping pg_upmap_items " << pg
4125 << " " << p
->second
<< dendl
;
// NOTE(review): p is erased while the range-for over p->second is still in
// progress; confirm that the loop is left immediately afterwards (the
// following lines are not visible in this view).
4126 tmp
.pg_upmap_items
.erase(p
);
4127 pending_inc
->old_pg_upmap_items
.insert(pg
);
// Second pass over this osd's pgs: try to create a new remapping for pgs
// that have no pg_upmap / pg_upmap_items entry yet.
4139 for (auto pg
: pgs
) {
4140 if (tmp
.pg_upmap
.count(pg
) ||
4141 tmp
.pg_upmap_items
.count(pg
)) {
4142 ldout(cct
, 20) << " already remapped " << pg
<< dendl
;
4145 ldout(cct
, 10) << " trying " << pg
<< dendl
;
4146 vector
<int> orig
, out
;
4147 if (!try_pg_upmap(cct
, pg
, overfull
, underfull
, &orig
, &out
)) {
4150 ldout(cct
, 10) << " " << pg
<< " " << orig
<< " -> " << out
<< dendl
;
// the element-wise comparison below needs both vectors to be the same length
4151 if (orig
.size() != out
.size()) {
4154 assert(orig
!= out
);
// Record every position where the mapping changed as an (orig, out) pair,
// both in tmp and in the pending incremental.
4155 auto& rmi
= tmp
.pg_upmap_items
[pg
];
4156 for (unsigned i
= 0; i
< out
.size(); ++i
) {
4157 if (orig
[i
] != out
[i
]) {
4158 rmi
.push_back(make_pair(orig
[i
], out
[i
]));
4161 pending_inc
->new_pg_upmap_items
[pg
] = rmi
;
4162 ldout(cct
, 10) << " " << pg
<< " pg_upmap_items " << rmi
<< dendl
;
4172 ldout(cct
, 10) << " failed to find any changes to make" << dendl
;
4176 ldout(cct
, 10) << " hit max iterations, stopping" << dendl
;
4180 ldout(cct
, 10) << " start deviation " << start_deviation
<< dendl
;
4181 ldout(cct
, 10) << " end deviation " << end_deviation
<< dendl
;
// Collect into *osds the leaves that crush reports under the item called
// `name`; the return value is passed through from crush->get_leaves().
4185 int OSDMap::get_osds_by_bucket_name(const string
&name
, set
<int> *osds
) const
4187 return crush
->get_leaves(name
, osds
);
4190 // get pools whose crush rules might reference the given osd
// The pool ids are added to *pool_ids via get_pool_ids_by_rule().
// NOTE(review): some lines of this definition (a parameter, local
// declarations, braces) are not visible in this view.
4191 void OSDMap::get_pool_ids_by_osd(CephContext
*cct
,
4193 set
<int64_t> *pool_ids
) const
// ask crush which rules can reach this osd
4197 int r
= crush
->get_rules_by_osd(osd
, &raw_rules
);
// failure path: r is rendered as an errno string in the log message
4199 lderr(cct
) << __func__
<< " get_rules_by_osd failed: " << cpp_strerror(r
)
4204 for (auto &i
: raw_rules
) {
4205 // exclude any dead rule
4206 if (crush_rule_in_use(i
)) {
// translate the remaining rules into pool ids
4210 for (auto &r
: rules
) {
4211 get_pool_ids_by_rule(r
, pool_ids
);
// Base class for the osd utilization dumpers: walks crush items through
// CrushTreeDumper::Dumper<F> and, for each item, computes size / used /
// available space, utilization percentage, variance against the average
// utilization and pg count, then hands them to a format-specific dump_item().
// NOTE(review): many lines of this class (access specifiers, member
// initializers, braces, some statements) are not visible in this view.
4215 template <typename F
>
4216 class OSDUtilizationDumper
: public CrushTreeDumper::Dumper
<F
> {
4218 typedef CrushTreeDumper::Dumper
<F
> Parent
;
4220 OSDUtilizationDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap_
,
4221 const PGStatService
*pgs_
, bool tree_
) :
4222 Parent(crush
, osdmap_
->get_pool_names()),
4226 average_util(average_utilization()),
// Dump osds that exist in the osdmap but were not touched by the tree walk.
4234 void dump_stray(F
*f
) {
4235 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
4236 if (osdmap
->exists(i
) && !this->is_touched(i
))
4237 dump_item(CrushTreeDumper::Item(i
, 0, 0, 0), f
);
// Per-item callback: gather the statistics for qi and forward them to the
// format-specific dump_item() overload.
4241 void dump_item(const CrushTreeDumper::Item
&qi
, F
*f
) override
{
// buckets are only reported in tree mode (the guarded statement is not
// visible here)
4242 if (!tree
&& qi
.is_bucket())
// buckets have no reweight of their own; -1 is used as the placeholder
4245 float reweight
= qi
.is_bucket() ? -1 : osdmap
->get_weightf(qi
.id
);
4246 int64_t kb
= 0, kb_used
= 0, kb_avail
= 0;
4248 if (get_bucket_utilization(qi
.id
, &kb
, &kb_used
, &kb_avail
))
// utilization as a percentage of total space
4250 util
= 100.0 * (double)kb_used
/ (double)kb
;
// utilization relative to the average utilization
4254 var
= util
/ average_util
;
4256 size_t num_pgs
= qi
.is_bucket() ? 0 : pgs
->get_num_pg_by_osd(qi
.id
);
4258 dump_item(qi
, reweight
, kb
, kb_used
, kb_avail
, util
, var
, num_pgs
, f
);
// only osds with a positive reweight contribute to min/max var and stddev
4260 if (!qi
.is_bucket() && reweight
> 0) {
4261 if (min_var
< 0 || var
< min_var
)
4263 if (max_var
< 0 || var
> max_var
)
4266 double dev
= util
- average_util
;
// NOTE(review): as visible here, the unsquared difference is accumulated;
// a weighted variance needs dev squared -- confirm that the line between
// these two statements (not visible in this view) squares dev.
4268 stddev
+= reweight
* dev
;
// format-specific overload that receives the computed values
4273 virtual void dump_item(const CrushTreeDumper::Item
&qi
,
4280 const size_t num_pgs
,
// square root of the accumulated stddev divided by the accumulated sum;
// 0 when sum is not positive
4284 return sum
> 0 ? sqrt(stddev
/ sum
) : 0;
// Percentage of used space over total space, summed over osds that exist
// and have non-zero weight; 0 when no space is known.
4287 double average_utilization() {
4288 int64_t kb
= 0, kb_used
= 0;
4289 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
4290 if (!osdmap
->exists(i
) || osdmap
->get_weight(i
) == 0)
4292 int64_t kb_i
, kb_used_i
, kb_avail_i
;
4293 if (get_osd_utilization(i
, &kb_i
, &kb_used_i
, &kb_avail_i
)) {
4295 kb_used
+= kb_used_i
;
4298 return kb
> 0 ? 100.0 * (double)kb_used
/ (double)kb
: 0;
// Fetch the stats of one osd from the pg stat service; returns false when
// no stats are available for that id.
4301 bool get_osd_utilization(int id
, int64_t* kb
, int64_t* kb_used
,
4302 int64_t* kb_avail
) const {
4303 const osd_stat_t
*p
= pgs
->get_osd_stat(id
);
4304 if (!p
) return false;
4306 *kb_used
= p
->kb_used
;
4307 *kb_avail
= p
->kb_avail
;
// Utilization of a crush item: an osd is answered by get_osd_utilization(),
// a bucket by recursively summing its children.
// NOTE(review): the test that separates the osd case from the bucket case is
// not visible in this view.
4311 bool get_bucket_utilization(int id
, int64_t* kb
, int64_t* kb_used
,
4312 int64_t* kb_avail
) const {
// osds that are out are special-cased (the guarded statements are not
// visible here)
4314 if (osdmap
->is_out(id
)) {
4320 return get_osd_utilization(id
, kb
, kb_used
, kb_avail
);
// walk the bucket's items from last to first and add up their values
4327 for (int k
= osdmap
->crush
->get_bucket_size(id
) - 1; k
>= 0; k
--) {
4328 int item
= osdmap
->crush
->get_bucket_item(id
, k
);
4329 int64_t kb_i
= 0, kb_used_i
= 0, kb_avail_i
= 0;
4330 if (!get_bucket_utilization(item
, &kb_i
, &kb_used_i
, &kb_avail_i
))
4333 *kb_used
+= kb_used_i
;
4334 *kb_avail
+= kb_avail_i
;
4340 const OSDMap
*osdmap
;
4341 const PGStatService
*pgs
;
// average utilization percentage, computed once in the constructor
4343 double average_util
;
// Plain-text flavour of the utilization dumper: renders one TextTable row per
// crush item plus a totals row and a summary string.
// NOTE(review): many lines of this class (access specifiers, braces, parts of
// the row-building expressions) are not visible in this view.
4351 class OSDUtilizationPlainDumper
: public OSDUtilizationDumper
<TextTable
> {
4353 typedef OSDUtilizationDumper
<TextTable
> Parent
;
4355 OSDUtilizationPlainDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap
,
4356 const PGStatService
*pgs
, bool tree
) :
4357 Parent(crush
, osdmap
, pgs
, tree
) {}
// Define the table columns, then emit the rows.
4359 void dump(TextTable
*tbl
) {
4360 tbl
->define_column("ID", TextTable::LEFT
, TextTable::RIGHT
);
4361 tbl
->define_column("CLASS", TextTable::LEFT
, TextTable::RIGHT
);
4362 tbl
->define_column("WEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
4363 tbl
->define_column("REWEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
4364 tbl
->define_column("SIZE", TextTable::LEFT
, TextTable::RIGHT
);
4365 tbl
->define_column("USE", TextTable::LEFT
, TextTable::RIGHT
);
4366 tbl
->define_column("AVAIL", TextTable::LEFT
, TextTable::RIGHT
);
4367 tbl
->define_column("%USE", TextTable::LEFT
, TextTable::RIGHT
);
4368 tbl
->define_column("VAR", TextTable::LEFT
, TextTable::RIGHT
);
4369 tbl
->define_column("PGS", TextTable::LEFT
, TextTable::RIGHT
);
// the name column is the only one whose second alignment argument is LEFT
4371 tbl
->define_column("TYPE NAME", TextTable::LEFT
, TextTable::LEFT
);
// totals row: cluster-wide sums from the pg stat service; the kb values are
// shifted left by 10 before formatting (KiB to bytes, judging by the field
// names)
4380 << si_t(pgs
->get_osd_sum().kb
<< 10)
4381 << si_t(pgs
->get_osd_sum().kb_used
<< 10)
4382 << si_t(pgs
->get_osd_sum().kb_avail
<< 10)
4383 << lowprecision_t(average_util
)
4385 << TextTable::endrow
;
// Wrapper around a float whose stream output is defined by the friend
// operator<< declared below.
4389 struct lowprecision_t
{
4391 explicit lowprecision_t(float _v
) : v(_v
) {}
4393 friend std::ostream
&operator<<(ostream
& out
, const lowprecision_t
& v
);
// keep the base-class dump_item overloads visible next to the override below
4395 using OSDUtilizationDumper
<TextTable
>::dump_item
;
// Emit one table row for qi.
4396 void dump_item(const CrushTreeDumper::Item
&qi
,
4403 const size_t num_pgs
,
4404 TextTable
*tbl
) override
{
4405 const char *c
= crush
->get_item_class(qi
.id
);
4410 << weightf_t(qi
.weight
)
4411 << weightf_t(reweight
)
4413 << si_t(kb_used
<< 10)
4414 << si_t(kb_avail
<< 10)
4415 << lowprecision_t(util
)
4416 << lowprecision_t(var
);
4418 if (qi
.is_bucket()) {
// one step per tree depth level (the loop body is not visible here)
4426 for (int k
= 0; k
< qi
.depth
; k
++)
// buckets are labelled "<type name> <item name>", osds "osd.<id>"
4428 if (qi
.is_bucket()) {
4429 int type
= crush
->get_bucket_type(qi
.id
);
4430 name
<< crush
->get_type_name(type
) << " "
4431 << crush
->get_item_name(qi
.id
);
4433 name
<< "osd." << qi
.id
;
4438 *tbl
<< TextTable::endrow
;
// summary text: smallest/largest variance and the deviation figure
4444 out
<< "MIN/MAX VAR: " << lowprecision_t(min_var
)
4445 << "/" << lowprecision_t(max_var
) << " "
4446 << "STDDEV: " << lowprecision_t(dev());
// Stream a lowprecision_t. In the branch visible here the value is written in
// fixed notation with two decimals.
// NOTE(review): the first branch and the values written for small inputs are
// not visible in this view.
4451 ostream
& operator<<(ostream
& out
,
4452 const OSDUtilizationPlainDumper::lowprecision_t
& v
)
4456 } else if (v
.v
< 0.001) {
// remember the caller's precision so it can be put back after the value
4459 std::streamsize p
= out
.precision();
// NOTE(review): the precision is restored, but std::fixed stays set on the
// stream after this call; confirm callers do not depend on the previous
// float format.
4460 return out
<< std::fixed
<< std::setprecision(2) << v
.v
<< std::setprecision(p
);
// Formatter flavour of the utilization dumper: emits a "nodes" array, a
// "stray" array and a "summary" object through a Formatter.
// NOTE(review): many lines of this class (access specifiers, braces,
// close_section calls, some parameters) are not visible in this view.
4464 class OSDUtilizationFormatDumper
: public OSDUtilizationDumper
<Formatter
> {
4466 typedef OSDUtilizationDumper
<Formatter
> Parent
;
4468 OSDUtilizationFormatDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap
,
4469 const PGStatService
*pgs
, bool tree
) :
4470 Parent(crush
, osdmap
, pgs
, tree
) {}
// Open the "nodes" and "stray" array sections (the statements that fill and
// close them are not visible here).
4472 void dump(Formatter
*f
) {
4473 f
->open_array_section("nodes");
4477 f
->open_array_section("stray");
// keep the base-class dump_item overloads visible next to the override below
4483 using OSDUtilizationDumper
<Formatter
>::dump_item
;
// Emit one "item" object: the generic crush item fields followed by the
// utilization figures and, for buckets, the children.
4484 void dump_item(const CrushTreeDumper::Item
&qi
,
4491 const size_t num_pgs
,
4492 Formatter
*f
) override
{
4493 f
->open_object_section("item");
4494 CrushTreeDumper::dump_item_fields(crush
, weight_set_names
, qi
, f
);
4495 f
->dump_float("reweight", reweight
);
4496 f
->dump_int("kb", kb
);
4497 f
->dump_int("kb_used", kb_used
);
4498 f
->dump_int("kb_avail", kb_avail
);
4499 f
->dump_float("utilization", util
);
4500 f
->dump_float("var", var
);
4501 f
->dump_unsigned("pgs", num_pgs
);
4502 CrushTreeDumper::dump_bucket_children(crush
, qi
, f
);
// Emit the "summary" object: cluster totals from the pg stat service plus
// average utilization, min/max variance and the deviation figure.
4507 void summary(Formatter
*f
) {
4508 f
->open_object_section("summary");
4509 f
->dump_int("total_kb", pgs
->get_osd_sum().kb
);
4510 f
->dump_int("total_kb_used", pgs
->get_osd_sum().kb_used
);
4511 f
->dump_int("total_kb_avail", pgs
->get_osd_sum().kb_avail
);
4512 f
->dump_float("average_utilization", average_util
);
4513 f
->dump_float("min_var", min_var
);
4514 f
->dump_float("max_var", max_var
);
4515 f
->dump_float("dev", dev());
// Report osd utilization either through a Formatter (inside a "df" object
// section) or as a plain text table followed by the summary line.
// NOTE(review): several lines of this definition (remaining parameters, the
// test that selects the output flavour, braces) are not visible in this view.
4520 void print_osd_utilization(const OSDMap
& osdmap
,
4521 const PGStatService
*pgstat
,
4526 const CrushWrapper
*crush
= osdmap
.crush
.get();
// structured output
4528 f
->open_object_section("df");
4529 OSDUtilizationFormatDumper
d(crush
, &osdmap
, pgstat
, tree
);
// plain-text output
4535 OSDUtilizationPlainDumper
d(crush
, &osdmap
, pgstat
, tree
);
4538 out
<< tbl
<< d
.summary() << "\n";
// Fill *checks with the health checks that can be derived from this map:
// down osds and down crush subtrees, osds known to crush but not to the
// osdmap, full-ratio ordering, full/backfillfull/nearfull osds, cluster-wide
// and per-osd flags, legacy crush settings, cache pools without hit sets,
// the sortbitwise flag, and pool fullness flags.
// NOTE(review): many lines of this definition (local declarations such as the
// string streams, braces, else/continue/break statements, several conditions)
// are not visible in this view; the comments below only describe the visible
// statements.
4542 void OSDMap::check_health(health_check_map_t
*checks
) const
4544 int num_osds
= get_num_osds();
4547 // OSD_$subtree_DOWN
// NOTE(review): `>= 0` also holds for zero osds -- confirm whether `> 0` was
// intended or the test is deliberately always true.
4549 if (num_osds
>= 0) {
4550 int num_in_osds
= 0;
4551 int num_down_in_osds
= 0;
4553 set
<int> down_in_osds
;
4554 set
<int> up_in_osds
;
4555 set
<int> subtree_up
;
// crush type id -> ids of the buckets of that type considered down
4556 unordered_map
<int, set
<int> > subtree_type_down
;
// bucket id -> number of down osds underneath it
4557 unordered_map
<int, int> num_osds_subtree
;
4558 int max_type
= crush
->get_max_type_id();
4560 for (int i
= 0; i
< get_max_osd(); i
++) {
4562 if (crush
->item_exists(i
)) {
// osds already classified need no further work (the guarded statement is
// not visible here)
4570 if (down_in_osds
.count(i
) || up_in_osds
.count(i
))
4573 down_in_osds
.insert(i
);
// Walk up the crush hierarchy from the osd; `type` is overwritten below with
// the bucket type of each parent that is visited.
4576 for (int type
= 0; type
<= max_type
; type
++) {
4577 if (!crush
->get_type_name(type
))
4579 int r
= crush
->get_immediate_parent_id(current
, &parent_id
);
4582 // break early if this parent is already marked as up
4583 if (subtree_up
.count(parent_id
))
4585 type
= crush
->get_bucket_type(parent_id
);
4586 if (!subtree_type_is_down(
4587 g_ceph_context
, parent_id
, type
,
4588 &down_in_osds
, &up_in_osds
, &subtree_up
, &subtree_type_down
))
4590 current
= parent_id
;
4595 // calculate the number of down osds in each down subtree and
4596 // store it in num_osds_subtree
// types are processed in increasing order starting at 1, so the counts of a
// bucket's children are available when the bucket itself is processed
4597 for (int type
= 1; type
<= max_type
; type
++) {
4598 if (!crush
->get_type_name(type
))
4600 for (auto j
= subtree_type_down
[type
].begin();
4601 j
!= subtree_type_down
[type
].end();
4605 int num_children
= crush
->get_children(*j
, &children
);
4606 if (num_children
== 0)
4608 for (auto l
= children
.begin(); l
!= children
.end(); ++l
) {
// child is a down subtree with a known count: add that count
4611 } else if (num_osds_subtree
[*l
] > 0) {
4612 num
= num
+ num_osds_subtree
[*l
];
4615 num_osds_subtree
[*j
] = num
;
4618 num_down_in_osds
= down_in_osds
.size();
4619 assert(num_down_in_osds
<= num_in_osds
);
4620 if (num_down_in_osds
> 0) {
4621 // summary of down subtree types and osds
// from the highest type id down to 1, one check per type that has down
// buckets
4622 for (int type
= max_type
; type
> 0; type
--) {
4623 if (!crush
->get_type_name(type
))
4625 if (subtree_type_down
[type
].size() > 0) {
4627 ss
<< subtree_type_down
[type
].size() << " "
4628 << crush
->get_type_name(type
);
// more than one bucket of this type (the guarded statement is not visible
// here -- presumably a plural suffix; confirm)
4629 if (subtree_type_down
[type
].size() > 1) {
4632 int sum_down_osds
= 0;
4633 for (auto j
= subtree_type_down
[type
].begin();
4634 j
!= subtree_type_down
[type
].end();
4636 sum_down_osds
= sum_down_osds
+ num_osds_subtree
[*j
];
4638 ss
<< " (" << sum_down_osds
<< " osds) down";
// check code is OSD_<TYPE NAME>_DOWN, upper-cased
4639 string err
= string("OSD_") +
4640 string(crush
->get_type_name(type
)) + "_DOWN";
4641 boost::to_upper(err
);
4642 auto& d
= checks
->add(err
, HEALTH_WARN
, ss
.str());
// one detail line per down bucket, visited in reverse set order
4643 for (auto j
= subtree_type_down
[type
].rbegin();
4644 j
!= subtree_type_down
[type
].rend();
4647 ss
<< crush
->get_type_name(type
);
4649 ss
<< crush
->get_item_name(*j
);
4650 // at the top level, do not print location
4651 if (type
!= max_type
) {
4653 ss
<< crush
->get_full_location_ordered_string(*j
);
4656 int num
= num_osds_subtree
[*j
];
4657 ss
<< " (" << num
<< " osds)";
4659 d
.detail
.push_back(ss
.str());
// OSD_DOWN: total count, plus one detail line per down osd with its crush
// location
4664 ss
<< down_in_osds
.size() << " osds down";
4665 auto& d
= checks
->add("OSD_DOWN", HEALTH_WARN
, ss
.str());
4666 for (auto it
= down_in_osds
.begin(); it
!= down_in_osds
.end(); ++it
) {
4668 ss
<< "osd." << *it
<< " (";
4669 ss
<< crush
->get_full_location_ordered_string(*it
);
4671 d
.detail
.push_back(ss
.str());
// OSD_ORPHAN: osds present in the crush map but missing from the osdmap
4675 if (!osds
.empty()) {
4677 ss
<< osds
.size() << " osds exist in the crush map but not in the osdmap";
4678 auto& d
= checks
->add("OSD_ORPHAN", HEALTH_WARN
, ss
.str());
4679 for (auto osd
: osds
) {
4681 ss
<< "osd." << osd
<< " exists in crush map but not in osdmap";
4682 d
.detail
.push_back(ss
.str());
4687 // OSD_OUT_OF_ORDER_FULL
4689 // An osd could configure failsafe ratio, to something different
4690 // but for now assume it is the same here.
4691 float fsr
= g_conf
->osd_failsafe_full_ratio
;
// a value above 1.0 is taken to be a percentage and scaled down to a ratio
4692 if (fsr
> 1.0) fsr
/= 100;
4693 float fr
= get_full_ratio();
4694 float br
= get_backfillfull_ratio();
4695 float nr
= get_nearfull_ratio();
4697 list
<string
> detail
;
4698 // These checks correspond to how OSDService::check_full_status() in an OSD
4699 // handles the improper setting of these values.
// (the conditions guarding the three messages below are not visible here)
4702 ss
<< "backfillfull_ratio (" << br
4703 << ") < nearfull_ratio (" << nr
<< "), increased";
4704 detail
.push_back(ss
.str());
4709 ss
<< "full_ratio (" << fr
<< ") < backfillfull_ratio (" << br
4711 detail
.push_back(ss
.str());
4716 ss
<< "osd_failsafe_full_ratio (" << fsr
<< ") < full_ratio (" << fr
4718 detail
.push_back(ss
.str());
4720 if (!detail
.empty()) {
4721 auto& d
= checks
->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR
,
4722 "full ratio(s) out of order");
4723 d
.detail
.swap(detail
);
4730 // OSD_FAILSAFE_FULL
// the three sets are filled by get_full_osd_counts(); each non-empty set
// yields one check with a detail line per osd
4732 set
<int> full
, backfillfull
, nearfull
;
4733 get_full_osd_counts(&full
, &backfillfull
, &nearfull
);
4736 ss
<< full
.size() << " full osd(s)";
4737 auto& d
= checks
->add("OSD_FULL", HEALTH_ERR
, ss
.str());
4738 for (auto& i
: full
) {
4740 ss
<< "osd." << i
<< " is full";
4741 d
.detail
.push_back(ss
.str());
4744 if (backfillfull
.size()) {
4746 ss
<< backfillfull
.size() << " backfillfull osd(s)";
4747 auto& d
= checks
->add("OSD_BACKFILLFULL", HEALTH_WARN
, ss
.str());
4748 for (auto& i
: backfillfull
) {
4750 ss
<< "osd." << i
<< " is backfill full";
4751 d
.detail
.push_back(ss
.str());
4754 if (nearfull
.size()) {
4756 ss
<< nearfull
.size() << " nearfull osd(s)";
4757 auto& d
= checks
->add("OSD_NEARFULL", HEALTH_WARN
, ss
.str());
4758 for (auto& i
: nearfull
) {
4760 ss
<< "osd." << i
<< " is near full";
4761 d
.detail
.push_back(ss
.str());
// OSDMAP_FLAGS: cluster-wide flags that are worth a warning when set
// (some members of the mask are not visible in this view)
4769 uint64_t warn_flags
=
4770 CEPH_OSDMAP_NEARFULL
|
4772 CEPH_OSDMAP_PAUSERD
|
4773 CEPH_OSDMAP_PAUSEWR
|
4774 CEPH_OSDMAP_PAUSEREC
|
4776 CEPH_OSDMAP_NODOWN
|
4779 CEPH_OSDMAP_NOBACKFILL
|
4780 CEPH_OSDMAP_NORECOVER
|
4781 CEPH_OSDMAP_NOSCRUB
|
4782 CEPH_OSDMAP_NODEEP_SCRUB
|
4783 CEPH_OSDMAP_NOTIERAGENT
|
4784 CEPH_OSDMAP_NOREBALANCE
;
4785 if (test_flag(warn_flags
)) {
// only the flags that are both set and in the warning mask are printed
4787 ss
<< get_flag_string(get_flags() & warn_flags
)
4789 checks
->add("OSDMAP_FLAGS", HEALTH_WARN
, ss
.str());
// OSD_FLAGS: per-osd state bits selected by `flags` (the mask value itself is
// not visible in this view)
4795 list
<string
> detail
;
4796 const unsigned flags
=
4801 for (int i
= 0; i
< max_osd
; ++i
) {
4802 if (osd_state
[i
] & flags
) {
4805 OSDMap::calc_state_set(osd_state
[i
] & flags
, states
);
4806 ss
<< "osd." << i
<< " has flags " << states
;
4807 detail
.push_back(ss
.str());
4810 if (!detail
.empty()) {
4812 ss
<< detail
.size() << " osd(s) have {NOUP,NODOWN,NOIN,NOOUT} flags set";
4813 auto& d
= checks
->add("OSD_FLAGS", HEALTH_WARN
, ss
.str());
4814 d
.detail
.swap(detail
);
4818 // OLD_CRUSH_TUNABLES
4819 if (g_conf
->mon_warn_on_legacy_crush_tunables
) {
4820 string min
= crush
->get_min_required_version();
// NOTE(review): `min` is a string, so this looks like a string comparison of
// version names -- confirm that the ordering of the names matches the
// intended version ordering.
4821 if (min
< g_conf
->mon_crush_min_required_version
) {
4823 ss
<< "crush map has legacy tunables (require " << min
4824 << ", min is " << g_conf
->mon_crush_min_required_version
<< ")";
4825 auto& d
= checks
->add("OLD_CRUSH_TUNABLES", HEALTH_WARN
, ss
.str());
4826 d
.detail
.push_back("see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables");
4830 // OLD_CRUSH_STRAW_CALC_VERSION
4831 if (g_conf
->mon_warn_on_crush_straw_calc_version_zero
) {
4832 if (crush
->get_straw_calc_version() == 0) {
4834 ss
<< "crush map has straw_calc_version=0";
4835 auto& d
= checks
->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN
, ss
.str());
4837 "see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables");
4841 // CACHE_POOL_NO_HIT_SET
4842 if (g_conf
->mon_warn_on_cache_pools_without_hit_sets
) {
4843 list
<string
> detail
;
4844 for (map
<int64_t, pg_pool_t
>::const_iterator p
= pools
.begin();
4847 const pg_pool_t
& info
= p
->second
;
// the cache mode needs a hit set, yet the hit set type is still NONE
4848 if (info
.cache_mode_requires_hit_set() &&
4849 info
.hit_set_params
.get_type() == HitSet::TYPE_NONE
) {
4851 ss
<< "pool '" << get_pool_name(p
->first
)
4852 << "' with cache_mode " << info
.get_cache_mode_name()
4853 << " needs hit_set_type to be set but it is not";
4854 detail
.push_back(ss
.str());
4857 if (!detail
.empty()) {
4859 ss
<< detail
.size() << " cache pools are missing hit_sets";
4860 auto& d
= checks
->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN
, ss
.str());
4861 d
.detail
.swap(detail
);
4865 // OSD_NO_SORTBITWISE
// warn when the flag is unset although the up osds advertise the feature
4866 if (!test_flag(CEPH_OSDMAP_SORTBITWISE
) &&
4867 (get_up_osd_features() &
4868 CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT
)) {
4870 ss
<< "no legacy OSD present but 'sortbitwise' flag is not set";
4871 checks
->add("OSD_NO_SORTBITWISE", HEALTH_WARN
, ss
.str());
4874 // OSD_UPGRADE_FINISHED
4875 // none of these (yet) since we don't run until luminous upgrade is done.
4877 // POOL_NEARFULL/BACKFILLFULL/FULL
// each pool lands in at most one of the three lists: the flags are tested in
// the order full, backfillfull, nearfull
4879 list
<string
> full_detail
, backfillfull_detail
, nearfull_detail
;
// NOTE(review): `auto it` iterates by value, so every element of get_pools()
// is copied; `const auto&` would avoid the copies.
4880 for (auto it
: get_pools()) {
4881 const pg_pool_t
&pool
= it
.second
;
4882 const string
& pool_name
= get_pool_name(it
.first
);
4883 if (pool
.has_flag(pg_pool_t::FLAG_FULL
)) {
4885 if (pool
.has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA
)) {
4886 // may run out of space too,
4887 // but we want EQUOTA taking precedence
4888 ss
<< "pool '" << pool_name
<< "' is full (no quota)";
4890 ss
<< "pool '" << pool_name
<< "' is full (no space)";
4892 full_detail
.push_back(ss
.str());
4893 } else if (pool
.has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
4895 ss
<< "pool '" << pool_name
<< "' is backfillfull";
4896 backfillfull_detail
.push_back(ss
.str());
4897 } else if (pool
.has_flag(pg_pool_t::FLAG_NEARFULL
)) {
4899 ss
<< "pool '" << pool_name
<< "' is nearfull";
4900 nearfull_detail
.push_back(ss
.str());
// one check per non-empty list; the per-pool messages become its detail
4903 if (!full_detail
.empty()) {
4905 ss
<< full_detail
.size() << " pool(s) full";
4906 auto& d
= checks
->add("POOL_FULL", HEALTH_WARN
, ss
.str());
4907 d
.detail
.swap(full_detail
);
4909 if (!backfillfull_detail
.empty()) {
4911 ss
<< backfillfull_detail
.size() << " pool(s) backfillfull";
4912 auto& d
= checks
->add("POOL_BACKFILLFULL", HEALTH_WARN
, ss
.str());
4913 d
.detail
.swap(backfillfull_detail
);
4915 if (!nearfull_detail
.empty()) {
4917 ss
<< nearfull_detail
.size() << " pool(s) nearfull";
4918 auto& d
= checks
->add("POOL_NEARFULL", HEALTH_WARN
, ss
.str());
4919 d
.detail
.swap(nearfull_detail
);
4924 int OSDMap::parse_osd_id_list(const vector
<string
>& ls
, set
<int> *out
,
4928 for (auto i
= ls
.begin(); i
!= ls
.end(); ++i
) {
4929 if (i
== ls
.begin() &&
4930 (*i
== "any" || *i
== "all" || *i
== "*")) {
4934 long osd
= parse_osd_id(i
->c_str(), ss
);
4936 *ss
<< "invalid osd id '" << *i
<< "'";