1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
20 #include "common/config.h"
21 #include "common/Formatter.h"
22 #include "common/TextTable.h"
23 #include "include/ceph_features.h"
24 #include "include/str_map.h"
26 #include "common/code_environment.h"
28 #include "crush/CrushTreeDumper.h"
29 #include "common/Clock.h"
31 #define dout_subsys ceph_subsys_osd
// Mempool object-factory definitions for the two map types implemented in
// this file (the full map and its incremental).
// NOTE(review): the second and third macro arguments look like the factory
// name and the pool name respectively -- confirm against the macro definition.
MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap, osdmap, osdmap);
MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental, osdmap_inc, osdmap);
37 // ----------------------------------
40 void osd_info_t::dump(Formatter
*f
) const
42 f
->dump_int("last_clean_begin", last_clean_begin
);
43 f
->dump_int("last_clean_end", last_clean_end
);
44 f
->dump_int("up_from", up_from
);
45 f
->dump_int("up_thru", up_thru
);
46 f
->dump_int("down_at", down_at
);
47 f
->dump_int("lost_at", lost_at
);
50 void osd_info_t::encode(bufferlist
& bl
) const
53 ::encode(struct_v
, bl
);
54 ::encode(last_clean_begin
, bl
);
55 ::encode(last_clean_end
, bl
);
56 ::encode(up_from
, bl
);
57 ::encode(up_thru
, bl
);
58 ::encode(down_at
, bl
);
59 ::encode(lost_at
, bl
);
62 void osd_info_t::decode(bufferlist::iterator
& bl
)
65 ::decode(struct_v
, bl
);
66 ::decode(last_clean_begin
, bl
);
67 ::decode(last_clean_end
, bl
);
68 ::decode(up_from
, bl
);
69 ::decode(up_thru
, bl
);
70 ::decode(down_at
, bl
);
71 ::decode(lost_at
, bl
);
74 void osd_info_t::generate_test_instances(list
<osd_info_t
*>& o
)
76 o
.push_back(new osd_info_t
);
77 o
.push_back(new osd_info_t
);
78 o
.back()->last_clean_begin
= 1;
79 o
.back()->last_clean_end
= 2;
80 o
.back()->up_from
= 30;
81 o
.back()->up_thru
= 40;
82 o
.back()->down_at
= 5;
83 o
.back()->lost_at
= 6;
86 ostream
& operator<<(ostream
& out
, const osd_info_t
& info
)
88 out
<< "up_from " << info
.up_from
89 << " up_thru " << info
.up_thru
90 << " down_at " << info
.down_at
91 << " last_clean_interval [" << info
.last_clean_begin
<< "," << info
.last_clean_end
<< ")";
93 out
<< " lost_at " << info
.lost_at
;
97 // ----------------------------------
100 void osd_xinfo_t::dump(Formatter
*f
) const
102 f
->dump_stream("down_stamp") << down_stamp
;
103 f
->dump_float("laggy_probability", laggy_probability
);
104 f
->dump_int("laggy_interval", laggy_interval
);
105 f
->dump_int("features", features
);
106 f
->dump_unsigned("old_weight", old_weight
);
109 void osd_xinfo_t::encode(bufferlist
& bl
) const
111 ENCODE_START(3, 1, bl
);
112 ::encode(down_stamp
, bl
);
113 __u32 lp
= laggy_probability
* 0xfffffffful
;
115 ::encode(laggy_interval
, bl
);
116 ::encode(features
, bl
);
117 ::encode(old_weight
, bl
);
121 void osd_xinfo_t::decode(bufferlist::iterator
& bl
)
124 ::decode(down_stamp
, bl
);
127 laggy_probability
= (float)lp
/ (float)0xffffffff;
128 ::decode(laggy_interval
, bl
);
130 ::decode(features
, bl
);
134 ::decode(old_weight
, bl
);
140 void osd_xinfo_t::generate_test_instances(list
<osd_xinfo_t
*>& o
)
142 o
.push_back(new osd_xinfo_t
);
143 o
.push_back(new osd_xinfo_t
);
144 o
.back()->down_stamp
= utime_t(2, 3);
145 o
.back()->laggy_probability
= .123;
146 o
.back()->laggy_interval
= 123456;
147 o
.back()->old_weight
= 0x7fff;
150 ostream
& operator<<(ostream
& out
, const osd_xinfo_t
& xi
)
152 return out
<< "down_stamp " << xi
.down_stamp
153 << " laggy_probability " << xi
.laggy_probability
154 << " laggy_interval " << xi
.laggy_interval
155 << " old_weight " << xi
.old_weight
;
158 // ----------------------------------
159 // OSDMap::Incremental
161 int OSDMap::Incremental::get_net_marked_out(const OSDMap
*previous
) const
164 for (auto &weight
: new_weight
) {
165 if (weight
.second
== CEPH_OSD_OUT
&& !previous
->is_out(weight
.first
))
167 else if (weight
.second
!= CEPH_OSD_OUT
&& previous
->is_out(weight
.first
))
173 int OSDMap::Incremental::get_net_marked_down(const OSDMap
*previous
) const
176 for (auto &state
: new_state
) { //
177 if (state
.second
& CEPH_OSD_UP
) {
178 if (previous
->is_up(state
.first
))
187 int OSDMap::Incremental::identify_osd(uuid_d u
) const
189 for (auto &uuid
: new_uuid
)
190 if (uuid
.second
== u
)
195 int OSDMap::Incremental::propagate_snaps_to_tiers(CephContext
*cct
,
196 const OSDMap
& osdmap
)
198 assert(epoch
== osdmap
.get_epoch() + 1);
200 for (auto &new_pool
: new_pools
) {
201 if (!new_pool
.second
.tiers
.empty()) {
202 pg_pool_t
& base
= new_pool
.second
;
204 for (const auto &tier_pool
: base
.tiers
) {
205 const auto &r
= new_pools
.find(tier_pool
);
207 if (r
== new_pools
.end()) {
208 const pg_pool_t
*orig
= osdmap
.get_pg_pool(tier_pool
);
210 lderr(cct
) << __func__
<< " no pool " << tier_pool
<< dendl
;
213 tier
= get_new_pool(tier_pool
, orig
);
217 if (tier
->tier_of
!= new_pool
.first
) {
218 lderr(cct
) << __func__
<< " " << r
->first
<< " tier_of != " << new_pool
.first
<< dendl
;
222 ldout(cct
, 10) << __func__
<< " from " << new_pool
.first
<< " to "
223 << tier_pool
<< dendl
;
224 tier
->snap_seq
= base
.snap_seq
;
225 tier
->snap_epoch
= base
.snap_epoch
;
226 tier
->snaps
= base
.snaps
;
227 tier
->removed_snaps
= base
.removed_snaps
;
235 bool OSDMap::subtree_is_down(int id
, set
<int> *down_cache
) const
241 down_cache
->count(id
)) {
246 crush
->get_children(id
, &children
);
247 for (const auto &child
: children
) {
248 if (!subtree_is_down(child
, down_cache
)) {
253 down_cache
->insert(id
);
258 bool OSDMap::containing_subtree_is_down(CephContext
*cct
, int id
, int subtree_type
, set
<int> *down_cache
) const
260 // use a stack-local down_cache if we didn't get one from the
261 // caller. then at least this particular call will avoid duplicated
263 set
<int> local_down_cache
;
265 down_cache
= &local_down_cache
;
274 type
= crush
->get_bucket_type(current
);
278 if (!subtree_is_down(current
, down_cache
)) {
279 ldout(cct
, 30) << "containing_subtree_is_down(" << id
<< ") = false" << dendl
;
283 // is this a big enough subtree to be marked as down?
284 if (type
>= subtree_type
) {
285 ldout(cct
, 30) << "containing_subtree_is_down(" << id
<< ") = true ... " << type
<< " >= " << subtree_type
<< dendl
;
289 int r
= crush
->get_immediate_parent_id(current
, ¤t
);
296 void OSDMap::Incremental::encode_client_old(bufferlist
& bl
) const
302 ::encode(modified
, bl
);
303 int32_t new_t
= new_pool_max
;
305 ::encode(new_flags
, bl
);
306 ::encode(fullmap
, bl
);
309 ::encode(new_max_osd
, bl
);
310 // for ::encode(new_pools, bl);
311 __u32 n
= new_pools
.size();
313 for (const auto &new_pool
: new_pools
) {
316 ::encode(new_pool
.second
, bl
, 0);
318 // for ::encode(new_pool_names, bl);
319 n
= new_pool_names
.size();
322 for (const auto &new_pool_name
: new_pool_names
) {
323 n
= new_pool_name
.first
;
325 ::encode(new_pool_name
.second
, bl
);
327 // for ::encode(old_pools, bl);
328 n
= old_pools
.size();
330 for (auto &old_pool
: old_pools
) {
334 ::encode(new_up_client
, bl
, 0);
335 ::encode(new_state
, bl
);
336 ::encode(new_weight
, bl
);
337 // for ::encode(new_pg_temp, bl);
338 n
= new_pg_temp
.size();
341 for (const auto &pg_temp
: new_pg_temp
) {
342 old_pg_t opg
= pg_temp
.first
.get_old_pg();
344 ::encode(pg_temp
.second
, bl
);
348 void OSDMap::Incremental::encode_classic(bufferlist
& bl
, uint64_t features
) const
350 if ((features
& CEPH_FEATURE_PGID64
) == 0) {
351 encode_client_old(bl
);
360 ::encode(modified
, bl
);
361 ::encode(new_pool_max
, bl
);
362 ::encode(new_flags
, bl
);
363 ::encode(fullmap
, bl
);
366 ::encode(new_max_osd
, bl
);
367 ::encode(new_pools
, bl
, features
);
368 ::encode(new_pool_names
, bl
);
369 ::encode(old_pools
, bl
);
370 ::encode(new_up_client
, bl
, features
);
371 ::encode(new_state
, bl
);
372 ::encode(new_weight
, bl
);
373 ::encode(new_pg_temp
, bl
);
378 ::encode(new_hb_back_up
, bl
, features
);
379 ::encode(new_up_thru
, bl
);
380 ::encode(new_last_clean_interval
, bl
);
381 ::encode(new_lost
, bl
);
382 ::encode(new_blacklist
, bl
, features
);
383 ::encode(old_blacklist
, bl
, features
);
384 ::encode(new_up_cluster
, bl
, features
);
385 ::encode(cluster_snapshot
, bl
);
386 ::encode(new_uuid
, bl
);
387 ::encode(new_xinfo
, bl
);
388 ::encode(new_hb_front_up
, bl
, features
);
391 void OSDMap::Incremental::encode(bufferlist
& bl
, uint64_t features
) const
393 if ((features
& CEPH_FEATURE_OSDMAP_ENC
) == 0) {
394 encode_classic(bl
, features
);
398 // only a select set of callers should *ever* be encoding new
399 // OSDMaps. others should be passing around the canonical encoded
400 // buffers from on high. select out those callers by passing in an
401 // "impossible" feature bit.
402 assert(features
& CEPH_FEATURE_RESERVED
);
403 features
&= ~CEPH_FEATURE_RESERVED
;
405 size_t start_offset
= bl
.length();
407 buffer::list::iterator crc_it
;
409 // meta-encoding: how we include client-used and osd-specific data
410 ENCODE_START(8, 7, bl
);
414 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
417 ENCODE_START(v
, 1, bl
); // client-usable data
420 ::encode(modified
, bl
);
421 ::encode(new_pool_max
, bl
);
422 ::encode(new_flags
, bl
);
423 ::encode(fullmap
, bl
);
426 ::encode(new_max_osd
, bl
);
427 ::encode(new_pools
, bl
, features
);
428 ::encode(new_pool_names
, bl
);
429 ::encode(old_pools
, bl
);
430 ::encode(new_up_client
, bl
, features
);
431 ::encode(new_state
, bl
);
432 ::encode(new_weight
, bl
);
433 ::encode(new_pg_temp
, bl
);
434 ::encode(new_primary_temp
, bl
);
435 ::encode(new_primary_affinity
, bl
);
436 ::encode(new_erasure_code_profiles
, bl
);
437 ::encode(old_erasure_code_profiles
, bl
);
439 ::encode(new_pg_upmap
, bl
);
440 ::encode(old_pg_upmap
, bl
);
441 ::encode(new_pg_upmap_items
, bl
);
442 ::encode(old_pg_upmap_items
, bl
);
444 ENCODE_FINISH(bl
); // client-usable data
448 uint8_t target_v
= 5;
449 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
452 ENCODE_START(target_v
, 1, bl
); // extended, osd-only data
453 ::encode(new_hb_back_up
, bl
, features
);
454 ::encode(new_up_thru
, bl
);
455 ::encode(new_last_clean_interval
, bl
);
456 ::encode(new_lost
, bl
);
457 ::encode(new_blacklist
, bl
, features
);
458 ::encode(old_blacklist
, bl
, features
);
459 ::encode(new_up_cluster
, bl
, features
);
460 ::encode(cluster_snapshot
, bl
);
461 ::encode(new_uuid
, bl
);
462 ::encode(new_xinfo
, bl
);
463 ::encode(new_hb_front_up
, bl
, features
);
464 ::encode(features
, bl
); // NOTE: features arg, not the member
466 ::encode(new_nearfull_ratio
, bl
);
467 ::encode(new_full_ratio
, bl
);
468 ::encode(new_backfillfull_ratio
, bl
);
469 ::encode(new_require_min_compat_client
, bl
);
471 ENCODE_FINISH(bl
); // osd-only data
474 ::encode((uint32_t)0, bl
); // dummy inc_crc
477 tail_offset
= bl
.length();
479 ::encode(full_crc
, bl
);
481 ENCODE_FINISH(bl
); // meta-encoding wrapper
485 front
.substr_of(bl
, start_offset
, crc_it
.get_off() - start_offset
);
486 inc_crc
= front
.crc32c(-1);
488 tail
.substr_of(bl
, tail_offset
, bl
.length() - tail_offset
);
489 inc_crc
= tail
.crc32c(inc_crc
);
492 crc_it
.copy_in(4, (char*)&crc_le
);
496 void OSDMap::Incremental::decode_classic(bufferlist::iterator
&p
)
504 ::decode(modified
, p
);
505 if (v
== 4 || v
== 5) {
509 ::decode(new_pool_max
, p
);
510 ::decode(new_flags
, p
);
511 ::decode(fullmap
, p
);
514 ::decode(new_max_osd
, p
);
520 ::decode(new_pools
[t
], p
);
523 ::decode(new_pools
, p
);
526 new_pool_names
.clear();
530 ::decode(new_pool_names
[t
], p
);
533 ::decode(new_pool_names
, p
);
543 ::decode(old_pools
, p
);
545 ::decode(new_up_client
, p
);
546 ::decode(new_state
, p
);
547 ::decode(new_weight
, p
);
554 ::decode_raw(opg
, p
);
555 ::decode(new_pg_temp
[pg_t(opg
)], p
);
558 ::decode(new_pg_temp
, p
);
561 // decode short map, too.
562 if (v
== 5 && p
.end())
569 ::decode(new_hb_back_up
, p
);
571 ::decode(new_pool_names
, p
);
572 ::decode(new_up_thru
, p
);
573 ::decode(new_last_clean_interval
, p
);
574 ::decode(new_lost
, p
);
575 ::decode(new_blacklist
, p
);
576 ::decode(old_blacklist
, p
);
578 ::decode(new_up_cluster
, p
);
580 ::decode(cluster_snapshot
, p
);
582 ::decode(new_uuid
, p
);
584 ::decode(new_xinfo
, p
);
586 ::decode(new_hb_front_up
, p
);
589 void OSDMap::Incremental::decode(bufferlist::iterator
& bl
)
592 * Older encodings of the Incremental had a single struct_v which
593 * covered the whole encoding, and was prior to our modern
594 * stuff which includes a compatv and a size. So if we see
595 * a struct_v < 7, we must rewind to the beginning and use our
598 size_t start_offset
= bl
.get_off();
599 size_t tail_offset
= 0;
600 bufferlist crc_front
, crc_tail
;
602 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl
); // wrapper
604 int struct_v_size
= sizeof(struct_v
);
605 bl
.advance(-struct_v_size
);
609 encode_features
= CEPH_FEATURE_PGID64
;
615 DECODE_START(4, bl
); // client-usable data
618 ::decode(modified
, bl
);
619 ::decode(new_pool_max
, bl
);
620 ::decode(new_flags
, bl
);
621 ::decode(fullmap
, bl
);
624 ::decode(new_max_osd
, bl
);
625 ::decode(new_pools
, bl
);
626 ::decode(new_pool_names
, bl
);
627 ::decode(old_pools
, bl
);
628 ::decode(new_up_client
, bl
);
629 ::decode(new_state
, bl
);
630 ::decode(new_weight
, bl
);
631 ::decode(new_pg_temp
, bl
);
632 ::decode(new_primary_temp
, bl
);
634 ::decode(new_primary_affinity
, bl
);
636 new_primary_affinity
.clear();
638 ::decode(new_erasure_code_profiles
, bl
);
639 ::decode(old_erasure_code_profiles
, bl
);
641 new_erasure_code_profiles
.clear();
642 old_erasure_code_profiles
.clear();
645 ::decode(new_pg_upmap
, bl
);
646 ::decode(old_pg_upmap
, bl
);
647 ::decode(new_pg_upmap_items
, bl
);
648 ::decode(old_pg_upmap_items
, bl
);
650 DECODE_FINISH(bl
); // client-usable data
654 DECODE_START(5, bl
); // extended, osd-only data
655 ::decode(new_hb_back_up
, bl
);
656 ::decode(new_up_thru
, bl
);
657 ::decode(new_last_clean_interval
, bl
);
658 ::decode(new_lost
, bl
);
659 ::decode(new_blacklist
, bl
);
660 ::decode(old_blacklist
, bl
);
661 ::decode(new_up_cluster
, bl
);
662 ::decode(cluster_snapshot
, bl
);
663 ::decode(new_uuid
, bl
);
664 ::decode(new_xinfo
, bl
);
665 ::decode(new_hb_front_up
, bl
);
667 ::decode(encode_features
, bl
);
669 encode_features
= CEPH_FEATURE_PGID64
| CEPH_FEATURE_OSDMAP_ENC
;
671 ::decode(new_nearfull_ratio
, bl
);
672 ::decode(new_full_ratio
, bl
);
674 new_nearfull_ratio
= -1;
678 ::decode(new_backfillfull_ratio
, bl
);
680 new_backfillfull_ratio
= -1;
683 ::decode(new_require_min_compat_client
, bl
);
684 DECODE_FINISH(bl
); // osd-only data
689 crc_front
.substr_of(bl
.get_bl(), start_offset
, bl
.get_off() - start_offset
);
690 ::decode(inc_crc
, bl
);
691 tail_offset
= bl
.get_off();
692 ::decode(full_crc
, bl
);
699 DECODE_FINISH(bl
); // wrapper
703 uint32_t actual
= crc_front
.crc32c(-1);
704 if (tail_offset
< bl
.get_off()) {
706 tail
.substr_of(bl
.get_bl(), tail_offset
, bl
.get_off() - tail_offset
);
707 actual
= tail
.crc32c(actual
);
709 if (inc_crc
!= actual
) {
711 ss
<< "bad crc, actual " << actual
<< " != expected " << inc_crc
;
713 throw buffer::malformed_input(s
.c_str());
718 void OSDMap::Incremental::dump(Formatter
*f
) const
720 f
->dump_int("epoch", epoch
);
721 f
->dump_stream("fsid") << fsid
;
722 f
->dump_stream("modified") << modified
;
723 f
->dump_int("new_pool_max", new_pool_max
);
724 f
->dump_int("new_flags", new_flags
);
725 f
->dump_float("new_full_ratio", new_full_ratio
);
726 f
->dump_float("new_nearfull_ratio", new_nearfull_ratio
);
727 f
->dump_float("new_backfillfull_ratio", new_backfillfull_ratio
);
728 f
->dump_string("new_require_min_compat_client", new_require_min_compat_client
);
730 if (fullmap
.length()) {
731 f
->open_object_section("full_map");
733 bufferlist fbl
= fullmap
; // kludge around constness.
734 auto p
= fbl
.begin();
739 if (crush
.length()) {
740 f
->open_object_section("crush");
742 bufferlist tbl
= crush
; // kludge around constness.
743 auto p
= tbl
.begin();
749 f
->dump_int("new_max_osd", new_max_osd
);
751 f
->open_array_section("new_pools");
753 for (const auto &new_pool
: new_pools
) {
754 f
->open_object_section("pool");
755 f
->dump_int("pool", new_pool
.first
);
756 new_pool
.second
.dump(f
);
760 f
->open_array_section("new_pool_names");
762 for (const auto &new_pool_name
: new_pool_names
) {
763 f
->open_object_section("pool_name");
764 f
->dump_int("pool", new_pool_name
.first
);
765 f
->dump_string("name", new_pool_name
.second
);
769 f
->open_array_section("old_pools");
771 for (const auto &old_pool
: old_pools
)
772 f
->dump_int("pool", old_pool
);
775 f
->open_array_section("new_up_osds");
777 for (const auto &upclient
: new_up_client
) {
778 f
->open_object_section("osd");
779 f
->dump_int("osd", upclient
.first
);
780 f
->dump_stream("public_addr") << upclient
.second
;
781 f
->dump_stream("cluster_addr") << new_up_cluster
.find(upclient
.first
)->second
;
782 f
->dump_stream("heartbeat_back_addr") << new_hb_back_up
.find(upclient
.first
)->second
;
783 map
<int32_t, entity_addr_t
>::const_iterator q
;
784 if ((q
= new_hb_front_up
.find(upclient
.first
)) != new_hb_front_up
.end())
785 f
->dump_stream("heartbeat_front_addr") << q
->second
;
790 f
->open_array_section("new_weight");
792 for (const auto &weight
: new_weight
) {
793 f
->open_object_section("osd");
794 f
->dump_int("osd", weight
.first
);
795 f
->dump_int("weight", weight
.second
);
800 f
->open_array_section("osd_state_xor");
801 for (const auto &ns
: new_state
) {
802 f
->open_object_section("osd");
803 f
->dump_int("osd", ns
.first
);
805 calc_state_set(new_state
.find(ns
.first
)->second
, st
);
806 f
->open_array_section("state_xor");
807 for (auto &state
: st
)
808 f
->dump_string("state", state
);
813 f
->open_array_section("new_pg_temp");
815 for (const auto &pg_temp
: new_pg_temp
) {
816 f
->open_object_section("pg");
817 f
->dump_stream("pgid") << pg_temp
.first
;
818 f
->open_array_section("osds");
820 for (const auto &osd
: pg_temp
.second
)
821 f
->dump_int("osd", osd
);
827 f
->open_array_section("primary_temp");
829 for (const auto &primary_temp
: new_primary_temp
) {
830 f
->dump_stream("pgid") << primary_temp
.first
;
831 f
->dump_int("osd", primary_temp
.second
);
833 f
->close_section(); // primary_temp
835 f
->open_array_section("new_pg_upmap");
836 for (auto& i
: new_pg_upmap
) {
837 f
->open_object_section("mapping");
838 f
->dump_stream("pgid") << i
.first
;
839 f
->open_array_section("osds");
840 for (auto osd
: i
.second
) {
841 f
->dump_int("osd", osd
);
847 f
->open_array_section("old_pg_upmap");
848 for (auto& i
: old_pg_upmap
) {
849 f
->dump_stream("pgid") << i
;
853 f
->open_array_section("new_pg_upmap_items");
854 for (auto& i
: new_pg_upmap_items
) {
855 f
->open_object_section("mapping");
856 f
->dump_stream("pgid") << i
.first
;
857 f
->open_array_section("mappings");
858 for (auto& p
: i
.second
) {
859 f
->open_object_section("mapping");
860 f
->dump_int("from", p
.first
);
861 f
->dump_int("to", p
.second
);
868 f
->open_array_section("old_pg_upmap_items");
869 for (auto& i
: old_pg_upmap_items
) {
870 f
->dump_stream("pgid") << i
;
874 f
->open_array_section("new_up_thru");
876 for (const auto &up_thru
: new_up_thru
) {
877 f
->open_object_section("osd");
878 f
->dump_int("osd", up_thru
.first
);
879 f
->dump_int("up_thru", up_thru
.second
);
884 f
->open_array_section("new_lost");
886 for (const auto &lost
: new_lost
) {
887 f
->open_object_section("osd");
888 f
->dump_int("osd", lost
.first
);
889 f
->dump_int("epoch_lost", lost
.second
);
894 f
->open_array_section("new_last_clean_interval");
896 for (const auto &last_clean_interval
: new_last_clean_interval
) {
897 f
->open_object_section("osd");
898 f
->dump_int("osd", last_clean_interval
.first
);
899 f
->dump_int("first", last_clean_interval
.second
.first
);
900 f
->dump_int("last", last_clean_interval
.second
.second
);
905 f
->open_array_section("new_blacklist");
906 for (const auto &blist
: new_blacklist
) {
909 f
->dump_stream(ss
.str().c_str()) << blist
.second
;
912 f
->open_array_section("old_blacklist");
913 for (const auto &blist
: old_blacklist
)
914 f
->dump_stream("addr") << blist
;
917 f
->open_array_section("new_xinfo");
918 for (const auto &xinfo
: new_xinfo
) {
919 f
->open_object_section("xinfo");
920 f
->dump_int("osd", xinfo
.first
);
921 xinfo
.second
.dump(f
);
926 if (cluster_snapshot
.size())
927 f
->dump_string("cluster_snapshot", cluster_snapshot
);
929 f
->open_array_section("new_uuid");
930 for (const auto &uuid
: new_uuid
) {
931 f
->open_object_section("osd");
932 f
->dump_int("osd", uuid
.first
);
933 f
->dump_stream("uuid") << uuid
.second
;
938 OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles
, f
);
939 f
->open_array_section("old_erasure_code_profiles");
940 for (const auto &erasure_code_profile
: old_erasure_code_profiles
) {
941 f
->dump_string("old", erasure_code_profile
.c_str());
946 void OSDMap::Incremental::generate_test_instances(list
<Incremental
*>& o
)
948 o
.push_back(new Incremental
);
951 // ----------------------------------
954 void OSDMap::set_epoch(epoch_t e
)
957 for (auto &pool
: pools
)
958 pool
.second
.last_change
= e
;
961 bool OSDMap::is_blacklisted(const entity_addr_t
& a
) const
963 if (blacklist
.empty())
966 // this specific instance?
967 if (blacklist
.count(a
))
970 // is entire ip blacklisted?
975 if (blacklist
.count(b
)) {
983 void OSDMap::get_blacklist(list
<pair
<entity_addr_t
,utime_t
> > *bl
) const
985 std::copy(blacklist
.begin(), blacklist
.end(), std::back_inserter(*bl
));
988 void OSDMap::set_max_osd(int m
)
993 osd_weight
.resize(m
);
994 for (; o
<max_osd
; o
++) {
996 osd_weight
[o
] = CEPH_OSD_OUT
;
1000 osd_addrs
->client_addr
.resize(m
);
1001 osd_addrs
->cluster_addr
.resize(m
);
1002 osd_addrs
->hb_back_addr
.resize(m
);
1003 osd_addrs
->hb_front_addr
.resize(m
);
1004 osd_uuid
->resize(m
);
1005 if (osd_primary_affinity
)
1006 osd_primary_affinity
->resize(m
, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
);
1011 int OSDMap::calc_num_osds()
1016 for (int i
=0; i
<max_osd
; i
++) {
1017 if (osd_state
[i
] & CEPH_OSD_EXISTS
) {
1019 if (osd_state
[i
] & CEPH_OSD_UP
) {
1022 if (get_weight(i
) != CEPH_OSD_OUT
) {
1030 void OSDMap::count_full_nearfull_osds(int *full
, int *backfill
, int *nearfull
) const
1035 for (int i
= 0; i
< max_osd
; ++i
) {
1036 if (exists(i
) && is_up(i
) && is_in(i
)) {
1037 if (osd_state
[i
] & CEPH_OSD_FULL
)
1039 else if (osd_state
[i
] & CEPH_OSD_BACKFILLFULL
)
1041 else if (osd_state
[i
] & CEPH_OSD_NEARFULL
)
1047 static bool get_osd_utilization(const ceph::unordered_map
<int32_t,osd_stat_t
> &osd_stat
,
1048 int id
, int64_t* kb
, int64_t* kb_used
, int64_t* kb_avail
) {
1049 auto p
= osd_stat
.find(id
);
1050 if (p
== osd_stat
.end())
1053 *kb_used
= p
->second
.kb_used
;
1054 *kb_avail
= p
->second
.kb_avail
;
1058 void OSDMap::get_full_osd_util(const ceph::unordered_map
<int32_t,osd_stat_t
> &osd_stat
,
1059 map
<int, float> *full
, map
<int, float> *backfill
, map
<int, float> *nearfull
) const
1064 for (int i
= 0; i
< max_osd
; ++i
) {
1065 if (exists(i
) && is_up(i
) && is_in(i
)) {
1066 int64_t kb
, kb_used
, kb_avail
;
1067 if (osd_state
[i
] & CEPH_OSD_FULL
) {
1068 if (get_osd_utilization(osd_stat
, i
, &kb
, &kb_used
, &kb_avail
))
1069 full
->emplace(i
, (float)kb_used
/ (float)kb
);
1070 } else if (osd_state
[i
] & CEPH_OSD_BACKFILLFULL
) {
1071 if (get_osd_utilization(osd_stat
, i
, &kb
, &kb_used
, &kb_avail
))
1072 backfill
->emplace(i
, (float)kb_used
/ (float)kb
);
1073 } else if (osd_state
[i
] & CEPH_OSD_NEARFULL
) {
1074 if (get_osd_utilization(osd_stat
, i
, &kb
, &kb_used
, &kb_avail
))
1075 nearfull
->emplace(i
, (float)kb_used
/ (float)kb
);
1081 void OSDMap::get_all_osds(set
<int32_t>& ls
) const
1083 for (int i
=0; i
<max_osd
; i
++)
1088 void OSDMap::get_up_osds(set
<int32_t>& ls
) const
1090 for (int i
= 0; i
< max_osd
; i
++) {
1096 void OSDMap::calc_state_set(int state
, set
<string
>& st
)
1099 for (unsigned s
= 1; t
; s
<<= 1) {
1102 st
.insert(ceph_osd_state_name(s
));
1107 void OSDMap::adjust_osd_weights(const map
<int,double>& weights
, Incremental
& inc
) const
1110 for (const auto &weight
: weights
) {
1111 if (weight
.second
> max
)
1112 max
= weight
.second
;
1115 for (const auto &weight
: weights
) {
1116 inc
.new_weight
[weight
.first
] = (unsigned)((weight
.second
/ max
) * CEPH_OSD_IN
);
1120 int OSDMap::identify_osd(const entity_addr_t
& addr
) const
1122 for (int i
=0; i
<max_osd
; i
++)
1123 if (exists(i
) && (get_addr(i
) == addr
|| get_cluster_addr(i
) == addr
))
1128 int OSDMap::identify_osd(const uuid_d
& u
) const
1130 for (int i
=0; i
<max_osd
; i
++)
1131 if (exists(i
) && get_uuid(i
) == u
)
1136 int OSDMap::identify_osd_on_all_channels(const entity_addr_t
& addr
) const
1138 for (int i
=0; i
<max_osd
; i
++)
1139 if (exists(i
) && (get_addr(i
) == addr
|| get_cluster_addr(i
) == addr
||
1140 get_hb_back_addr(i
) == addr
|| get_hb_front_addr(i
) == addr
))
1145 int OSDMap::find_osd_on_ip(const entity_addr_t
& ip
) const
1147 for (int i
=0; i
<max_osd
; i
++)
1148 if (exists(i
) && (get_addr(i
).is_same_host(ip
) || get_cluster_addr(i
).is_same_host(ip
)))
1154 uint64_t OSDMap::get_features(int entity_type
, uint64_t *pmask
) const
1156 uint64_t features
= 0; // things we actually have
1157 uint64_t mask
= 0; // things we could have
1159 if (crush
->has_nondefault_tunables())
1160 features
|= CEPH_FEATURE_CRUSH_TUNABLES
;
1161 if (crush
->has_nondefault_tunables2())
1162 features
|= CEPH_FEATURE_CRUSH_TUNABLES2
;
1163 if (crush
->has_nondefault_tunables3())
1164 features
|= CEPH_FEATURE_CRUSH_TUNABLES3
;
1165 if (crush
->has_v4_buckets())
1166 features
|= CEPH_FEATURE_CRUSH_V4
;
1167 if (crush
->has_nondefault_tunables5())
1168 features
|= CEPH_FEATURE_CRUSH_TUNABLES5
;
1169 if (crush
->has_incompat_chooseargs())
1170 features
|= CEPH_FEATURE_CRUSH_CHOOSEARGS
;
1171 mask
|= CEPH_FEATURES_CRUSH
;
1173 if (!pg_upmap
.empty() || !pg_upmap_items
.empty())
1174 features
|= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
;
1175 mask
|= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
;
1177 for (auto &pool
: pools
) {
1178 if (pool
.second
.has_flag(pg_pool_t::FLAG_HASHPSPOOL
)) {
1179 features
|= CEPH_FEATURE_OSDHASHPSPOOL
;
1181 if (pool
.second
.is_erasure() &&
1182 entity_type
!= CEPH_ENTITY_TYPE_CLIENT
) { // not for clients
1183 features
|= CEPH_FEATURE_OSD_ERASURE_CODES
;
1185 if (!pool
.second
.tiers
.empty() ||
1186 pool
.second
.is_tier()) {
1187 features
|= CEPH_FEATURE_OSD_CACHEPOOL
;
1189 int ruleid
= crush
->find_rule(pool
.second
.get_crush_ruleset(),
1190 pool
.second
.get_type(),
1191 pool
.second
.get_size());
1193 if (crush
->is_v2_rule(ruleid
))
1194 features
|= CEPH_FEATURE_CRUSH_V2
;
1195 if (crush
->is_v3_rule(ruleid
))
1196 features
|= CEPH_FEATURE_CRUSH_TUNABLES3
;
1197 if (crush
->is_v5_rule(ruleid
))
1198 features
|= CEPH_FEATURE_CRUSH_TUNABLES5
;
1201 if (entity_type
== CEPH_ENTITY_TYPE_OSD
) {
1202 for (auto &erasure_code_profile
: erasure_code_profiles
) {
1203 auto& profile
= erasure_code_profile
.second
;
1204 const auto& plugin
= profile
.find("plugin");
1205 if (plugin
!= profile
.end()) {
1206 if (plugin
->second
== "isa" || plugin
->second
== "lrc")
1207 features
|= CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2
;
1208 if (plugin
->second
== "shec")
1209 features
|= CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3
;
1213 mask
|= CEPH_FEATURE_OSDHASHPSPOOL
| CEPH_FEATURE_OSD_CACHEPOOL
;
1214 if (entity_type
!= CEPH_ENTITY_TYPE_CLIENT
)
1215 mask
|= CEPH_FEATURE_OSD_ERASURE_CODES
;
1217 if (osd_primary_affinity
) {
1218 for (int i
= 0; i
< max_osd
; ++i
) {
1219 if ((*osd_primary_affinity
)[i
] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
) {
1220 features
|= CEPH_FEATURE_OSD_PRIMARY_AFFINITY
;
1225 mask
|= CEPH_FEATURE_OSD_PRIMARY_AFFINITY
;
1227 if (entity_type
== CEPH_ENTITY_TYPE_OSD
) {
1228 const uint64_t jewel_features
= CEPH_FEATURE_SERVER_JEWEL
;
1229 if (test_flag(CEPH_OSDMAP_REQUIRE_JEWEL
)) {
1230 features
|= jewel_features
;
1232 mask
|= jewel_features
;
1234 const uint64_t kraken_features
= CEPH_FEATUREMASK_SERVER_KRAKEN
1235 | CEPH_FEATURE_MSG_ADDR2
;
1236 if (test_flag(CEPH_OSDMAP_REQUIRE_KRAKEN
)) {
1237 features
|= kraken_features
;
1239 mask
|= kraken_features
;
1247 pair
<string
,string
> OSDMap::get_min_compat_client() const
1249 uint64_t f
= get_features(CEPH_ENTITY_TYPE_CLIENT
, nullptr);
1251 if (HAVE_FEATURE(f
, OSDMAP_PG_UPMAP
) || // v12.0.0-1733-g27d6f43
1252 HAVE_FEATURE(f
, CRUSH_CHOOSEARGS
)) { // v12.0.1-2172-gef1ef28
1253 return make_pair("luminous", "12.2.0");
1255 if (HAVE_FEATURE(f
, CRUSH_TUNABLES5
)) { // v10.0.0-612-g043a737
1256 return make_pair("jewel", "10.2.0");
1258 if (HAVE_FEATURE(f
, CRUSH_V4
)) { // v0.91-678-g325fc56
1259 return make_pair("hammer", "0.94");
1261 if (HAVE_FEATURE(f
, OSD_PRIMARY_AFFINITY
) || // v0.76-553-gf825624
1262 HAVE_FEATURE(f
, CRUSH_TUNABLES3
) || // v0.76-395-ge20a55d
1263 HAVE_FEATURE(f
, OSD_ERASURE_CODES
) || // v0.73-498-gbfc86a8
1264 HAVE_FEATURE(f
, OSD_CACHEPOOL
)) { // v0.67-401-gb91c1c5
1265 return make_pair("firefly", "0.80");
1267 if (HAVE_FEATURE(f
, CRUSH_TUNABLES2
) || // v0.54-684-g0cc47ff
1268 HAVE_FEATURE(f
, OSDHASHPSPOOL
)) { // v0.57-398-g8cc2b0f
1269 return make_pair("dumpling", "0.67");
1271 if (HAVE_FEATURE(f
, CRUSH_TUNABLES
)) { // v0.48argonaut-206-g6f381af
1272 return make_pair("argonaut", "0.48argonaut-207");
1274 return make_pair("argonaut", "0.48");
1277 void OSDMap::_calc_up_osd_features()
1280 cached_up_osd_features
= 0;
1281 for (int osd
= 0; osd
< max_osd
; ++osd
) {
1284 const osd_xinfo_t
&xi
= get_xinfo(osd
);
1286 cached_up_osd_features
= xi
.features
;
1289 cached_up_osd_features
&= xi
.features
;
1294 uint64_t OSDMap::get_up_osd_features() const
1296 return cached_up_osd_features
;
// Make the newer map `n` reuse sub-structures of the older map `o` wherever
// the two hold equal contents.  For every piece that compares equal, n's
// handle is overwritten with o's handle: individual osd_addrs entries, the
// whole osd_addrs object, crush, pg_temp, primary_temp and osd_uuid.
//
// @param o  reference map; only read here
// @param n  map whose members are redirected to o's where contents match
//
// NOTE(review): the statements guarded by the epoch and max_osd tests, and
// the condition guarding the whole-osd_addrs assignment, are not visible in
// this excerpt.
1299 void OSDMap::dedup(const OSDMap
*o
, OSDMap
*n
)
1301 if (o
->epoch
== n
->epoch
)
1307 if (o
->max_osd
!= n
->max_osd
)
// walk only the osd ids that are valid in both maps
1309 for (int i
= 0; i
< o
->max_osd
&& i
< n
->max_osd
; i
++) {
// both client_addr entries are set and equal by value: reuse o's entry
1310 if ( n
->osd_addrs
->client_addr
[i
] && o
->osd_addrs
->client_addr
[i
] &&
1311 *n
->osd_addrs
->client_addr
[i
] == *o
->osd_addrs
->client_addr
[i
])
1312 n
->osd_addrs
->client_addr
[i
] = o
->osd_addrs
->client_addr
[i
];
// same treatment for cluster_addr
1315 if ( n
->osd_addrs
->cluster_addr
[i
] && o
->osd_addrs
->cluster_addr
[i
] &&
1316 *n
->osd_addrs
->cluster_addr
[i
] == *o
->osd_addrs
->cluster_addr
[i
])
1317 n
->osd_addrs
->cluster_addr
[i
] = o
->osd_addrs
->cluster_addr
[i
];
// same treatment for hb_back_addr
1320 if ( n
->osd_addrs
->hb_back_addr
[i
] && o
->osd_addrs
->hb_back_addr
[i
] &&
1321 *n
->osd_addrs
->hb_back_addr
[i
] == *o
->osd_addrs
->hb_back_addr
[i
])
1322 n
->osd_addrs
->hb_back_addr
[i
] = o
->osd_addrs
->hb_back_addr
[i
];
// same treatment for hb_front_addr
1325 if ( n
->osd_addrs
->hb_front_addr
[i
] && o
->osd_addrs
->hb_front_addr
[i
] &&
1326 *n
->osd_addrs
->hb_front_addr
[i
] == *o
->osd_addrs
->hb_front_addr
[i
])
1327 n
->osd_addrs
->hb_front_addr
[i
] = o
->osd_addrs
->hb_front_addr
[i
];
1332 // zoinks, no differences at all!
1333 n
->osd_addrs
= o
->osd_addrs
;
1336 // does crush match?
// the two crush maps are compared through their encoded bytes (oc vs nc)
1338 ::encode(*o
->crush
, oc
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1339 ::encode(*n
->crush
, nc
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1340 if (oc
.contents_equal(nc
)) {
1341 n
->crush
= o
->crush
;
1344 // does pg_temp match?
// size check first, then a full value comparison
1345 if (o
->pg_temp
->size() == n
->pg_temp
->size()) {
1346 if (*o
->pg_temp
== *n
->pg_temp
)
1347 n
->pg_temp
= o
->pg_temp
;
1350 // does primary_temp match?
1351 if (o
->primary_temp
->size() == n
->primary_temp
->size()) {
1352 if (*o
->primary_temp
== *n
->primary_temp
)
1353 n
->primary_temp
= o
->primary_temp
;
// osd_uuid: same size and equal contents -> reuse o's
1357 if (o
->osd_uuid
->size() == n
->osd_uuid
->size() &&
1358 *o
->osd_uuid
== *n
->osd_uuid
)
1359 n
->osd_uuid
= o
->osd_uuid
;
// Queue removals in `pending_inc` for pg_temp and primary_temp entries that
// are no longer useful.
//
// A scratch map (tmpmap) is built from `osdmap` with `pending_inc` applied,
// and its pg_temp / primary_temp entries are examined.  The visible removal
// reasons, matching the log messages below, are:
//   - pg_temp for a pool that does not exist in `osdmap`
//   - pg_temp "with all down osds"
//   - pg_temp equal to the raw up mapping computed by tmpmap
//   - primary_temp pointing at an osd that tmpmap reports as down
//   - primary_temp that does not change the primary ("unnecessary/redundant")
//
// Removal encoding: an empty vector in new_pg_temp and -1 in
// new_primary_temp are what apply_incremental() treats as "erase".
//
// @param cct          context used for logging
// @param osdmap       committed map
// @param pending_inc  incremental being built; modified in place
//
// NOTE(review): several lines (tmpmap's declaration, the num_up bookkeeping,
// continue/else statements) are not visible in this excerpt.
1362 void OSDMap::clean_temps(CephContext
*cct
,
1363 const OSDMap
& osdmap
, Incremental
*pending_inc
)
1365 ldout(cct
, 10) << __func__
<< dendl
;
// tmpmap = osdmap with pending_inc applied
1367 tmpmap
.deepish_copy_from(osdmap
);
1368 tmpmap
.apply_incremental(*pending_inc
);
1370 for (auto pg
: *tmpmap
.pg_temp
) {
1371 // if pool does not exist, remove any existing pg_temps associated with
1372 // it. we don't care about pg_temps on the pending_inc either; if there
1373 // are new_pg_temp entries on the pending, clear them out just as well.
1374 if (!osdmap
.have_pg_pool(pg
.first
.pool())) {
1375 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
1376 << " for nonexistent pool " << pg
.first
.pool() << dendl
;
1377 pending_inc
->new_pg_temp
[pg
.first
].clear();
// presumably counts the members of this pg_temp that are not down; the
// increment and the test on num_up are not visible here
1381 unsigned num_up
= 0;
1382 for (auto o
: pg
.second
) {
1383 if (!tmpmap
.is_down(o
)) {
1389 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
1390 << " with all down osds" << pg
.second
<< dendl
;
1391 pending_inc
->new_pg_temp
[pg
.first
].clear();
1394 // redundant pg_temp?
1397 tmpmap
.pg_to_raw_up(pg
.first
, &raw_up
, &primary
);
1398 if (vectors_equal(raw_up
, pg
.second
)) {
1399 ldout(cct
, 10) << __func__
<< " removing pg_temp " << pg
.first
<< " "
1400 << pg
.second
<< " that matches raw_up mapping" << dendl
;
// entry exists in the committed map: queue an explicit removal.
// NOTE(review): the erase() below is presumably the else-branch (entry only
// exists in pending_inc); the `else` line is not visible here.
1401 if (osdmap
.pg_temp
->count(pg
.first
))
1402 pending_inc
->new_pg_temp
[pg
.first
].clear();
1404 pending_inc
->new_pg_temp
.erase(pg
.first
);
1408 for (auto &pg
: *tmpmap
.primary_temp
) {
1410 if (tmpmap
.is_down(pg
.second
)) {
1411 ldout(cct
, 10) << __func__
<< " removing primary_temp " << pg
.first
1412 << " to down " << pg
.second
<< dendl
;
1413 pending_inc
->new_primary_temp
[pg
.first
] = -1;
1416 // redundant primary_temp?
1417 vector
<int> real_up
, templess_up
;
1418 int real_primary
, templess_primary
;
1419 pg_t pgid
= pg
.first
;
// real_* comes from pg_to_acting_osds(), templess_* from pg_to_raw_up()
1420 tmpmap
.pg_to_acting_osds(pgid
, &real_up
, &real_primary
);
1421 tmpmap
.pg_to_raw_up(pgid
, &templess_up
, &templess_primary
);
1422 if (real_primary
== templess_primary
){
1423 ldout(cct
, 10) << __func__
<< " removing primary_temp "
1424 << pgid
<< " -> " << real_primary
1425 << " (unnecessary/redundant)" << dendl
;
// same pattern as for pg_temp above: -1 queues a removal of a committed
// entry; the erase() is presumably the else-branch (else line not visible)
1426 if (osdmap
.primary_temp
->count(pgid
))
1427 pending_inc
->new_primary_temp
[pgid
] = -1;
1429 pending_inc
->new_primary_temp
.erase(pgid
);
// Apply one Incremental to this map, in place.
//
// Visible sequence:
//   1. reset new_blacklist_entries, compare inc.fsid with fsid, assert that
//      inc.epoch == epoch+1, and take inc.modified
//   2. if inc.fullmap is non-empty, work from a copy of that buffer (the
//      handling itself is not visible here)
//   3. otherwise apply, in this order: flags, max_osd, pool_max, new pools,
//      pool names, removed pools, weights, primary affinity, erasure code
//      profiles, osd state bits, up clients and their addresses, cluster
//      addresses, up_thru, last-clean intervals, lost, xinfo, uuid, pg_temp,
//      primary_temp, pg_upmap, pg_upmap_items, blacklist, cluster snapshot,
//      the three fullness ratios and require_min_compat_client
//   4. install a new crush map last, then call _calc_up_osd_features()
//
// @param inc  the incremental to apply
// @return     int; the return statements are not visible in this excerpt
1434 int OSDMap::apply_incremental(const Incremental
&inc
)
1436 new_blacklist_entries
= false;
1439 else if (inc
.fsid
!= fsid
)
1442 assert(inc
.epoch
== epoch
+1);
1445 modified
= inc
.modified
;
1448 if (inc
.fullmap
.length()) {
1449 bufferlist
bl(inc
.fullmap
);
1454 // nope, incremental.
// negative new_flags / new_max_osd and new_pool_max == -1 mean "unchanged"
1455 if (inc
.new_flags
>= 0)
1456 flags
= inc
.new_flags
;
1458 if (inc
.new_max_osd
>= 0)
1459 set_max_osd(inc
.new_max_osd
);
1461 if (inc
.new_pool_max
!= -1)
1462 pool_max
= inc
.new_pool_max
;
// new or updated pools are stamped with the current epoch
1464 for (const auto &pool
: inc
.new_pools
) {
1465 pools
[pool
.first
] = pool
.second
;
1466 pools
[pool
.first
].last_change
= epoch
;
// keep pool_name (id -> name) and name_pool (name -> id) in step; for an
// id that already has a name, the old name is dropped from name_pool first
1469 for (const auto &pname
: inc
.new_pool_names
) {
1470 auto pool_name_entry
= pool_name
.find(pname
.first
);
1471 if (pool_name_entry
!= pool_name
.end()) {
1472 name_pool
.erase(pool_name_entry
->second
);
1473 pool_name_entry
->second
= pname
.second
;
1475 pool_name
[pname
.first
] = pname
.second
;
1477 name_pool
[pname
.second
] = pname
.first
;
// removed pools: drop both name mappings (removal from `pools` itself is
// not visible here)
1480 for (const auto &pool
: inc
.old_pools
) {
1482 name_pool
.erase(pool_name
[pool
]);
1483 pool_name
.erase(pool
);
1486 for (const auto &weight
: inc
.new_weight
) {
1487 set_weight(weight
.first
, weight
.second
);
1489 // if we are marking in, clear the AUTOOUT and NEW bits, and clear
1490 // xinfo old_weight.
1491 if (weight
.second
) {
1492 osd_state
[weight
.first
] &= ~(CEPH_OSD_AUTOOUT
| CEPH_OSD_NEW
);
1493 osd_xinfo
[weight
.first
].old_weight
= 0;
1497 for (const auto &primary_affinity
: inc
.new_primary_affinity
) {
1498 set_primary_affinity(primary_affinity
.first
, primary_affinity
.second
);
1501 // erasure_code_profiles
1502 for (const auto &profile
: inc
.old_erasure_code_profiles
)
1503 erasure_code_profiles
.erase(profile
);
1505 for (const auto &profile
: inc
.new_erasure_code_profiles
) {
1506 set_erasure_code_profile(profile
.first
, profile
.second
);
// osd state changes: each entry is a bit mask that is XOR-ed into osd_state
1510 for (const auto &state
: inc
.new_state
) {
1511 const auto osd
= state
.first
;
// a zero mask stands for CEPH_OSD_UP
1512 int s
= state
.second
? state
.second
: CEPH_OSD_UP
;
// UP is set now and is being toggled: record when the osd went down
1513 if ((osd_state
[osd
] & CEPH_OSD_UP
) &&
1514 (s
& CEPH_OSD_UP
)) {
1515 osd_info
[osd
].down_at
= epoch
;
1516 osd_xinfo
[osd
].down_stamp
= modified
;
1518 if ((osd_state
[osd
] & CEPH_OSD_EXISTS
) &&
1519 (s
& CEPH_OSD_EXISTS
)) {
1520 // osd is destroyed; clear out anything interesting.
1521 (*osd_uuid
)[osd
] = uuid_d();
1522 osd_info
[osd
] = osd_info_t();
1523 osd_xinfo
[osd
] = osd_xinfo_t();
1524 set_primary_affinity(osd
, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
);
1525 osd_addrs
->client_addr
[osd
].reset(new entity_addr_t());
1526 osd_addrs
->cluster_addr
[osd
].reset(new entity_addr_t());
1527 osd_addrs
->hb_front_addr
[osd
].reset(new entity_addr_t());
1528 osd_addrs
->hb_back_addr
[osd
].reset(new entity_addr_t());
1531 osd_state
[osd
] ^= s
;
// osds coming up: set EXISTS|UP, record addresses and the up_from epoch
1535 for (const auto &client
: inc
.new_up_client
) {
1536 osd_state
[client
.first
] |= CEPH_OSD_EXISTS
| CEPH_OSD_UP
;
1537 osd_addrs
->client_addr
[client
.first
].reset(new entity_addr_t(client
.second
));
1538 if (inc
.new_hb_back_up
.empty())
1539 osd_addrs
->hb_back_addr
[client
.first
].reset(new entity_addr_t(client
.second
)); //this is a backward-compatibility hack
1541 osd_addrs
->hb_back_addr
[client
.first
].reset(
1542 new entity_addr_t(inc
.new_hb_back_up
.find(client
.first
)->second
));
// hb_front_addr is taken from new_hb_front_up when an entry exists;
// the reset() to empty below is presumably the else-branch (else line not
// visible here)
1543 const auto j
= inc
.new_hb_front_up
.find(client
.first
);
1544 if (j
!= inc
.new_hb_front_up
.end())
1545 osd_addrs
->hb_front_addr
[client
.first
].reset(new entity_addr_t(j
->second
));
1547 osd_addrs
->hb_front_addr
[client
.first
].reset();
1549 osd_info
[client
.first
].up_from
= epoch
;
1552 for (const auto &cluster
: inc
.new_up_cluster
)
1553 osd_addrs
->cluster_addr
[cluster
.first
].reset(new entity_addr_t(cluster
.second
));
1556 for (const auto &thru
: inc
.new_up_thru
)
1557 osd_info
[thru
.first
].up_thru
= thru
.second
;
1559 for (const auto &interval
: inc
.new_last_clean_interval
) {
1560 osd_info
[interval
.first
].last_clean_begin
= interval
.second
.first
;
1561 osd_info
[interval
.first
].last_clean_end
= interval
.second
.second
;
1564 for (const auto &lost
: inc
.new_lost
)
1565 osd_info
[lost
.first
].lost_at
= lost
.second
;
1568 for (const auto &xinfo
: inc
.new_xinfo
)
1569 osd_xinfo
[xinfo
.first
] = xinfo
.second
;
1572 for (const auto &uuid
: inc
.new_uuid
)
1573 (*osd_uuid
)[uuid
.first
] = uuid
.second
;
// pg_temp: an empty mapping erases the entry, anything else is stored
1576 for (const auto &pg
: inc
.new_pg_temp
) {
1577 if (pg
.second
.empty())
1578 pg_temp
->erase(pg
.first
);
1580 (*pg_temp
)[pg
.first
] = pg
.second
;
// primary_temp: -1 erases the entry, anything else is stored
1583 for (const auto &pg
: inc
.new_primary_temp
) {
1584 if (pg
.second
== -1)
1585 primary_temp
->erase(pg
.first
);
1587 (*primary_temp
)[pg
.first
] = pg
.second
;
1590 for (auto& p
: inc
.new_pg_upmap
) {
1591 pg_upmap
[p
.first
] = p
.second
;
// NOTE(review): the body of this loop is not visible in this excerpt
1593 for (auto& pg
: inc
.old_pg_upmap
) {
1596 for (auto& p
: inc
.new_pg_upmap_items
) {
1597 pg_upmap_items
[p
.first
] = p
.second
;
1599 for (auto& pg
: inc
.old_pg_upmap_items
) {
1600 pg_upmap_items
.erase(pg
);
// blacklist additions also raise the new_blacklist_entries flag
1604 if (!inc
.new_blacklist
.empty()) {
1605 blacklist
.insert(inc
.new_blacklist
.begin(),inc
.new_blacklist
.end());
1606 new_blacklist_entries
= true;
1608 for (const auto &addr
: inc
.old_blacklist
)
1609 blacklist
.erase(addr
);
1611 // cluster snapshot?
1612 if (inc
.cluster_snapshot
.length()) {
1613 cluster_snapshot
= inc
.cluster_snapshot
;
1614 cluster_snapshot_epoch
= inc
.epoch
;
1616 cluster_snapshot
.clear();
1617 cluster_snapshot_epoch
= 0;
// negative ratios mean "unchanged"
1620 if (inc
.new_nearfull_ratio
>= 0) {
1621 nearfull_ratio
= inc
.new_nearfull_ratio
;
1623 if (inc
.new_backfillfull_ratio
>= 0) {
1624 backfillfull_ratio
= inc
.new_backfillfull_ratio
;
1626 if (inc
.new_full_ratio
>= 0) {
1627 full_ratio
= inc
.new_full_ratio
;
1629 if (inc
.new_require_min_compat_client
.length()) {
1630 require_min_compat_client
= inc
.new_require_min_compat_client
;
1633 // do new crush map last (after up/down stuff)
1634 if (inc
.crush
.length()) {
1635 bufferlist
bl(inc
.crush
);
1636 auto blp
= bl
.begin();
1637 crush
.reset(new CrushWrapper
);
1642 _calc_up_osd_features();
// Compute the pg for an object in pool `poolid` and store it in *pg.
//
// The pool is looked up with get_pg_pool(poolid); the placement seed comes
// from pool->hash_key() using `nspace` together with either `key` or `name`,
// and the result is pg_t(ps, poolid).
//
// NOTE(review): most of the parameter list, the test that chooses between
// `key` and `name`, the handling of a missing pool and the return values are
// not visible in this excerpt.
1647 int OSDMap::map_to_pg(
1651 const string
& nspace
,
1654 // calculate ps (placement seed)
1655 const pg_pool_t
*pool
= get_pg_pool(poolid
);
1660 ps
= pool
->hash_key(key
, nspace
);
1662 ps
= pool
->hash_key(name
, nspace
);
1663 *pg
= pg_t(ps
, poolid
);
// Resolve (oid, loc) to a pg, written to `pg`.
//
// When loc.hash is non-negative it is used directly as the placement seed,
// after checking that the pool exists via get_pg_pool().  Otherwise the work
// is delegated to map_to_pg() with the object name, locator key and
// namespace.
//
// @return the value from map_to_pg() on the delegated path; the return
//         statements of the explicit-hash path are not visible here.
1667 int OSDMap::object_locator_to_pg(
1668 const object_t
& oid
, const object_locator_t
& loc
, pg_t
&pg
) const
1670 if (loc
.hash
>= 0) {
1671 if (!get_pg_pool(loc
.get_pool())) {
1674 pg
= pg_t(loc
.hash
, loc
.get_pool());
1677 return map_to_pg(loc
.get_pool(), oid
.name
, loc
.key
, loc
.nspace
, &pg
);
// Build a ceph_object_layout for `oid` in pool `pg_pool` / namespace
// `nspace`.
//
// The pg is obtained from object_locator_to_pg() using a locator built from
// (pg_pool, nspace); ol_pgid receives the old-style pg value
// (get_old_pg().v) and ol_stripe_unit is set to 0.
// The return statement is not visible in this excerpt.
1680 ceph_object_layout
OSDMap::make_object_layout(
1681 object_t oid
, int pg_pool
, string nspace
) const
1683 object_locator_t
loc(pg_pool
, nspace
);
1685 ceph_object_layout ol
;
1686 pg_t pgid
= object_locator_to_pg(oid
, loc
);
1687 ol
.ol_pgid
= pgid
.get_old_pg().v
;
1688 ol
.ol_stripe_unit
= 0;
// Filter osds that do not exist out of `osds`, in place.
//
// If pool.can_shift_osds(): surviving entries are moved left over the gaps
// (osds[i - removed] = osds[i]) and the vector is shrunk by `removed`.
// Otherwise positions are kept and entries are overwritten with
// CRUSH_ITEM_NONE.
//
// NOTE(review): the increment of `removed` and the per-entry test in the
// second loop are not visible in this excerpt; presumably both key off
// !exists(osd).
1692 void OSDMap::_remove_nonexistent_osds(const pg_pool_t
& pool
,
1693 vector
<int>& osds
) const
1695 if (pool
.can_shift_osds()) {
1696 unsigned removed
= 0;
1697 for (unsigned i
= 0; i
< osds
.size(); i
++) {
1698 if (!exists(osds
[i
])) {
1703 osds
[i
- removed
] = osds
[i
];
1707 osds
.resize(osds
.size() - removed
);
1709 for (auto& osd
: osds
) {
1711 osd
= CRUSH_ITEM_NONE
;
// Map a pg to its raw osd list using crush.
//
// Steps visible here: derive the placement seed with pool.raw_pg_to_pps(),
// find the rule for (crush ruleset, pool type, size), run crush->do_rule()
// into *osds with osd_weight, then drop nonexistent osds with
// _remove_nonexistent_osds().
//
// @return osds->size()
//
// NOTE(review): part of the parameter list and any handling of the rule
// lookup result are not visible in this excerpt.
1716 int OSDMap::_pg_to_raw_osds(
1717 const pg_pool_t
& pool
, pg_t pg
,
1722 ps_t pps
= pool
.raw_pg_to_pps(pg
); // placement ps
1723 unsigned size
= pool
.get_size();
1726 int ruleno
= crush
->find_rule(pool
.get_crush_ruleset(), pool
.get_type(), size
);
1728 crush
->do_rule(ruleno
, pps
, *osds
, size
, osd_weight
, pg
.pool());
1730 _remove_nonexistent_osds(pool
, *osds
);
1735 return osds
->size();
// Scan `osds` in order, testing each entry against CRUSH_ITEM_NONE.
// NOTE(review): the return statements are not visible in this excerpt;
// presumably the first entry that is not CRUSH_ITEM_NONE is returned —
// confirm, including the value returned when there is none.
1738 int OSDMap::_pick_primary(const vector
<int>& osds
) const
1740 for (auto osd
: osds
) {
1741 if (osd
!= CRUSH_ITEM_NONE
) {
// Apply explicit upmap overrides for `raw_pg` to the raw osd list *raw.
//
// Two tables are consulted, both keyed by pi.raw_pg_to_pg(raw_pg):
//   - pg_upmap: a full replacement list.  It is checked for targets whose
//     osd_weight is 0 (marked out); otherwise *raw is replaced by the list.
//   - pg_upmap_items: (from, to) pairs.  For each pair, *raw is scanned for
//     `from`; the replacement is applied at that position only if `to` does
//     not already appear in *raw and `to` is not marked out.
//
// NOTE(review): the statements that reject a pg_upmap entry, the declaration
// and assignment of `pos`, and the `exists = true` assignment are not
// visible in this excerpt.
1748 void OSDMap::_apply_remap(const pg_pool_t
& pi
, pg_t raw_pg
, vector
<int> *raw
) const
1750 pg_t pg
= pi
.raw_pg_to_pg(raw_pg
);
1751 auto p
= pg_upmap
.find(pg
);
1752 if (p
!= pg_upmap
.end()) {
1753 // make sure targets aren't marked out
1754 for (auto osd
: p
->second
) {
1755 if (osd
!= CRUSH_ITEM_NONE
&& osd
< max_osd
&& osd_weight
[osd
] == 0) {
1756 // reject/ignore the explicit mapping
1760 *raw
= vector
<int>(p
->second
.begin(), p
->second
.end());
1764 auto q
= pg_upmap_items
.find(pg
);
1765 if (q
!= pg_upmap_items
.end()) {
1766 // NOTE: this approach does not allow a bidirectional swap,
1767 // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
1768 for (auto& r
: q
->second
) {
1769 // make sure the replacement value doesn't already appear
1770 bool exists
= false;
1772 for (unsigned i
= 0; i
< raw
->size(); ++i
) {
1773 int osd
= (*raw
)[i
];
1774 if (osd
== r
.second
) {
1778 // ignore mapping if target is marked out (or invalid osd id)
1779 if (osd
== r
.first
&&
1781 !(r
.second
!= CRUSH_ITEM_NONE
&& r
.second
< max_osd
&&
1782 osd_weight
[r
.second
] == 0)) {
1786 if (!exists
&& pos
>= 0) {
1787 (*raw
)[pos
] = r
.second
;
1794 // pg -> (up osd list)
// Derive the "up" list *up from the raw list `raw`.
//
// If pool.can_shift_osds(): entries that do not exist or are down are
// skipped and the remaining ones are appended in order.
// Otherwise *up keeps raw's length and such entries become CRUSH_ITEM_NONE.
//
// NOTE(review): the `continue`, the else-branch of the second loop and any
// clearing of *up are not visible in this excerpt.
1795 void OSDMap::_raw_to_up_osds(const pg_pool_t
& pool
, const vector
<int>& raw
,
1796 vector
<int> *up
) const
1798 if (pool
.can_shift_osds()) {
1801 up
->reserve(raw
.size());
1802 for (unsigned i
=0; i
<raw
.size(); i
++) {
1803 if (!exists(raw
[i
]) || is_down(raw
[i
]))
1805 up
->push_back(raw
[i
]);
1808 // set down/dne devices to NONE
1809 up
->resize(raw
.size());
1810 for (int i
= raw
.size() - 1; i
>= 0; --i
) {
1811 if (!exists(raw
[i
]) || is_down(raw
[i
])) {
1812 (*up
)[i
] = CRUSH_ITEM_NONE
;
// Re-pick the primary among *osds according to per-osd primary affinity.
//
// Visible logic:
//   - test whether osd_primary_affinity is set at all, then look for any
//     listed osd whose affinity differs from
//     CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
//   - for each non-NONE osd `o`, hash (seed, o) with crush_hash32_2; when the
//     affinity is below CEPH_OSD_MAX_PRIMARY_AFFINITY and the top 16 bits of
//     the hash are >= the affinity, the osd is passed over but remembered as
//     a fallback
//   - *primary is set to (*osds)[pos]; for pools that can shift osds the
//     chosen entry is also moved to the front, shifting the others right
//
// NOTE(review): part of the parameter list, the early returns and the
// bookkeeping for `pos` are not visible in this excerpt.
1820 void OSDMap::_apply_primary_affinity(ps_t seed
,
1821 const pg_pool_t
& pool
,
1825 // do we have any non-default primary_affinity values for these osds?
1826 if (!osd_primary_affinity
)
1830 for (const auto osd
: *osds
) {
1831 if (osd
!= CRUSH_ITEM_NONE
&&
1832 (*osd_primary_affinity
)[osd
] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
) {
1840 // pick the primary. feed both the seed (for the pg) and the osd
1841 // into the hash/rng so that a proportional fraction of an osd's pgs
1842 // get rejected as primary.
1844 for (unsigned i
= 0; i
< osds
->size(); ++i
) {
1846 if (o
== CRUSH_ITEM_NONE
)
1848 unsigned a
= (*osd_primary_affinity
)[o
];
1849 if (a
< CEPH_OSD_MAX_PRIMARY_AFFINITY
&&
1850 (crush_hash32_2(CRUSH_HASH_RJENKINS1
,
1851 seed
, o
) >> 16) >= a
) {
1852 // we chose not to use this primary. note it anyway as a
1853 // fallback in case we don't pick anyone else, but keep looking.
1864 *primary
= (*osds
)[pos
];
1866 if (pool
.can_shift_osds() && pos
> 0) {
1867 // move the new primary to the front.
1868 for (int i
= pos
; i
> 0; --i
) {
1869 (*osds
)[i
] = (*osds
)[i
-1];
1871 (*osds
)[0] = *primary
;
// Fetch the pg_temp / primary_temp overrides for `pg`.
//
// `pg` is first normalised with pool.raw_pg_to_pg().  If a pg_temp entry
// exists, its members are copied to *temp_pg; members that do not exist or
// are down are handled separately depending on pool.can_shift_osds() (the
// non-shifting case pushes CRUSH_ITEM_NONE to keep positions).
// *temp_primary is taken from primary_temp when present; otherwise, if
// *temp_pg is non-empty, from a non-NONE member of *temp_pg.
//
// NOTE(review): initialisation/clearing of the outputs, the action in the
// can_shift_osds() branch and the loop exit after picking the primary are
// not visible in this excerpt.
1875 void OSDMap::_get_temp_osds(const pg_pool_t
& pool
, pg_t pg
,
1876 vector
<int> *temp_pg
, int *temp_primary
) const
1878 pg
= pool
.raw_pg_to_pg(pg
);
1879 const auto p
= pg_temp
->find(pg
);
1881 if (p
!= pg_temp
->end()) {
1882 for (unsigned i
=0; i
<p
->second
.size(); i
++) {
1883 if (!exists(p
->second
[i
]) || is_down(p
->second
[i
])) {
1884 if (pool
.can_shift_osds()) {
1887 temp_pg
->push_back(CRUSH_ITEM_NONE
);
1890 temp_pg
->push_back(p
->second
[i
]);
1894 const auto &pp
= primary_temp
->find(pg
);
1896 if (pp
!= primary_temp
->end()) {
1897 *temp_primary
= pp
->second
;
1898 } else if (!temp_pg
->empty()) { // apply pg_temp's primary
1899 for (unsigned i
= 0; i
< temp_pg
->size(); ++i
) {
1900 if ((*temp_pg
)[i
] != CRUSH_ITEM_NONE
) {
1901 *temp_primary
= (*temp_pg
)[i
];
// Compute the raw osd list for `pg` into *raw and pick a primary from it.
//
// Looks the pool up with get_pg_pool(pg.pool()), calls _pg_to_raw_osds()
// (passing NULL for its last argument) and sets *primary from
// _pick_primary(*raw).
//
// NOTE(review): the missing-pool handling and the return statement are not
// visible in this excerpt; `r` holds the result of _pg_to_raw_osds().
1908 int OSDMap::pg_to_raw_osds(pg_t pg
, vector
<int> *raw
, int *primary
) const
1912 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
1915 int r
= _pg_to_raw_osds(*pool
, pg
, raw
, NULL
);
1917 *primary
= _pick_primary(*raw
);
// Compute the up set and primary for `pg` without consulting pg_temp or
// primary_temp (no call to _get_temp_osds() appears here).
//
// Pipeline: _pg_to_raw_osds() -> _apply_remap() -> _raw_to_up_osds() ->
// _pick_primary() -> _apply_primary_affinity().
// Note that the initial primary is picked from `raw`, not from *up, before
// primary affinity is applied.
//
// NOTE(review): the missing-pool handling and the declarations of `raw` and
// `pps` are not visible in this excerpt.
1921 void OSDMap::pg_to_raw_up(pg_t pg
, vector
<int> *up
, int *primary
) const
1923 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
1933 _pg_to_raw_osds(*pool
, pg
, &raw
, &pps
);
1934 _apply_remap(*pool
, pg
, &raw
);
1935 _raw_to_up_osds(*pool
, raw
, up
);
1936 *primary
= _pick_primary(raw
);
1937 _apply_primary_affinity(pps
, *pool
, up
, primary
);
// Compute the up and acting sets (and their primaries) for `pg`.
//
// The temp overrides are fetched first with _get_temp_osds().  The crush
// based up set is computed only when it is needed: when there is no acting
// override, or when the caller asked for `up` or `up_primary`.  In that
// path the up primary is picked from `_up` and then adjusted by primary
// affinity; if no acting primary was supplied by the overrides (-1), the up
// primary is used.
//
// Results are handed back through the output pointers (acting via swap).
//
// @param raw_pg_to_pg  when false, a pg whose ps() is >= the pool's pg_num
//                      takes the early-exit path at the top
//
// NOTE(review): the first half of the validity test, the null checks on the
// output pointers and several declarations are not visible in this excerpt.
1940 void OSDMap::_pg_to_up_acting_osds(
1941 const pg_t
& pg
, vector
<int> *up
, int *up_primary
,
1942 vector
<int> *acting
, int *acting_primary
,
1943 bool raw_pg_to_pg
) const
1945 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
1947 (!raw_pg_to_pg
&& pg
.ps() >= pool
->get_pg_num())) {
1955 *acting_primary
= -1;
1960 vector
<int> _acting
;
1962 int _acting_primary
;
1964 _get_temp_osds(*pool
, pg
, &_acting
, &_acting_primary
);
1965 if (_acting
.empty() || up
|| up_primary
) {
1966 _pg_to_raw_osds(*pool
, pg
, &raw
, &pps
);
1967 _apply_remap(*pool
, pg
, &raw
);
1968 _raw_to_up_osds(*pool
, raw
, &_up
);
1969 _up_primary
= _pick_primary(_up
);
1970 _apply_primary_affinity(pps
, *pool
, &_up
, &_up_primary
);
1971 if (_acting
.empty()) {
1973 if (_acting_primary
== -1) {
1974 _acting_primary
= _up_primary
;
1981 *up_primary
= _up_primary
;
1985 acting
->swap(_acting
);
1987 *acting_primary
= _acting_primary
;
// Search the first `nrep` entries of `acting` for `osd`.
//
// `nrep` may be replaced by acting.size(); the condition for that is not
// visible in this excerpt.
// NOTE(review): the return statements are not visible either; presumably the
// matching index is returned, with a sentinel when `osd` is absent.
1990 int OSDMap::calc_pg_rank(int osd
, const vector
<int>& acting
, int nrep
)
1993 nrep
= acting
.size();
1994 for (int i
=0; i
<nrep
; i
++)
1995 if (acting
[i
] == osd
)
// Thin wrapper: forwards all three arguments to calc_pg_rank() and returns
// its result unchanged.
2000 int OSDMap::calc_pg_role(int osd
, const vector
<int>& acting
, int nrep
)
2002 return calc_pg_rank(osd
, acting
, nrep
);
// Decide whether the primary of a pg changed between two acting sets.
//
// Checks, in order:
//   - both acting sets empty            -> false
//   - exactly one of them empty         -> true
//   - different primary osd             -> true
//   - the primary's rank within the acting set differs (result not visible
//     in this excerpt)
//   - otherwise                         -> false
//
// NOTE(review): the `oldprimary` / `newprimary` parameter lines are not
// visible in this excerpt.
2005 bool OSDMap::primary_changed(
2007 const vector
<int> &oldacting
,
2009 const vector
<int> &newacting
)
2011 if (oldacting
.empty() && newacting
.empty())
2012 return false; // both still empty
2013 if (oldacting
.empty() ^ newacting
.empty())
2014 return true; // was empty, now not, or vice versa
2015 if (oldprimary
!= newprimary
)
2016 return true; // primary changed
2017 if (calc_pg_rank(oldprimary
, oldacting
) !=
2018 calc_pg_rank(newprimary
, newacting
))
2020 return false; // same primary (tho replicas may have changed)
2024 // serialize, unserialize
// Encode the map in the layout used when the peer lacks CEPH_FEATURE_PGID64
// (encode_classic() dispatches here in that case).
//
// Several containers are written by hand (a count followed by per-element
// fields) in place of the generic ::encode, as the "for ::encode(...)"
// comments indicate; pg_temp keys are converted with get_old_pg().  Pools,
// client addresses and the crush map are encoded with a feature mask of 0.
// The field order is part of the wire format and must not be changed.
//
// NOTE(review): the version header, the fsid and the hand-written count/key
// encodes are not visible in this excerpt.
2025 void OSDMap::encode_client_old(bufferlist
& bl
) const
2032 ::encode(epoch
, bl
);
2033 ::encode(created
, bl
);
2034 ::encode(modified
, bl
);
2036 // for ::encode(pools, bl);
2037 __u32 n
= pools
.size();
2040 for (const auto &pool
: pools
) {
2043 ::encode(pool
.second
, bl
, 0);
2045 // for ::encode(pool_name, bl);
2046 n
= pool_name
.size();
2048 for (const auto &pname
: pool_name
) {
2051 ::encode(pname
.second
, bl
);
2053 // for ::encode(pool_max, bl);
2057 ::encode(flags
, bl
);
2059 ::encode(max_osd
, bl
);
2060 ::encode(osd_state
, bl
);
2061 ::encode(osd_weight
, bl
);
2062 ::encode(osd_addrs
->client_addr
, bl
, 0);
2064 // for ::encode(pg_temp, bl);
2065 n
= pg_temp
->size();
2067 for (const auto pg
: *pg_temp
) {
2068 old_pg_t opg
= pg
.first
.get_old_pg();
2070 ::encode(pg
.second
, bl
);
2075 crush
->encode(cbl
, 0 /* legacy (no) features */);
// Encode the map in the pre-OSDMAP_ENC ("classic") layout.
//
// Peers without CEPH_FEATURE_PGID64 get encode_client_old() instead.
// Otherwise the base fields are written first (epoch .. pg_temp, crush with
// no features), followed by the extended fields (hb_back_addr, osd_info,
// blacklist, cluster_addr, cluster snapshot, osd_uuid, osd_xinfo,
// hb_front_addr).  The field order is part of the wire format.
//
// @param bl        output buffer, appended to
// @param features  feature bits of the peer; forwarded to the encoders that
//                  take them
//
// NOTE(review): the version headers and the fsid encode are not visible in
// this excerpt.
2079 void OSDMap::encode_classic(bufferlist
& bl
, uint64_t features
) const
2081 if ((features
& CEPH_FEATURE_PGID64
) == 0) {
2082 encode_client_old(bl
);
2091 ::encode(epoch
, bl
);
2092 ::encode(created
, bl
);
2093 ::encode(modified
, bl
);
2095 ::encode(pools
, bl
, features
);
2096 ::encode(pool_name
, bl
);
2097 ::encode(pool_max
, bl
);
2099 ::encode(flags
, bl
);
2101 ::encode(max_osd
, bl
);
2102 ::encode(osd_state
, bl
);
2103 ::encode(osd_weight
, bl
);
2104 ::encode(osd_addrs
->client_addr
, bl
, features
);
2106 ::encode(*pg_temp
, bl
);
2110 crush
->encode(cbl
, 0 /* legacy (no) features */);
2116 ::encode(osd_addrs
->hb_back_addr
, bl
, features
);
2117 ::encode(osd_info
, bl
);
2118 ::encode(blacklist
, bl
, features
);
2119 ::encode(osd_addrs
->cluster_addr
, bl
, features
);
2120 ::encode(cluster_snapshot_epoch
, bl
);
2121 ::encode(cluster_snapshot
, bl
);
2122 ::encode(*osd_uuid
, bl
);
2123 ::encode(osd_xinfo
, bl
);
2124 ::encode(osd_addrs
->hb_front_addr
, bl
, features
);
// Encode the map into `bl` for a peer with the given feature bits.
//
// Peers without CEPH_FEATURE_OSDMAP_ENC get encode_classic().  Otherwise the
// caller must have set CEPH_FEATURE_RESERVED (asserted, then stripped) and
// the output is:
//   wrapper (ENCODE_START 8/7)
//     client-usable section
//     extended osd-only section (version target_v)
//     4-byte crc placeholder
// After the wrapper is closed, a crc32c is computed over the bytes before
// the placeholder and, if any, the bytes after it, and copied into the
// placeholder through crc_it.
//
// @param bl        output buffer, appended to starting at start_offset
// @param features  peer feature bits; also select section versions
//
// NOTE(review): the version selection inside the two HAVE_FEATURE tests, the
// fsid encode, the else-branches and the crc_it assignment are not visible
// in this excerpt.
2127 void OSDMap::encode(bufferlist
& bl
, uint64_t features
) const
2129 if ((features
& CEPH_FEATURE_OSDMAP_ENC
) == 0) {
2130 encode_classic(bl
, features
);
2134 // only a select set of callers should *ever* be encoding new
2135 // OSDMaps. others should be passing around the canonical encoded
2136 // buffers from on high. select out those callers by passing in an
2137 // "impossible" feature bit.
2138 assert(features
& CEPH_FEATURE_RESERVED
);
2139 features
&= ~CEPH_FEATURE_RESERVED
;
// remember where this encoding starts; the crc covers bytes from here
2141 size_t start_offset
= bl
.length();
2143 buffer::list::iterator crc_it
;
2145 // meta-encoding: how we include client-used and osd-specific data
2146 ENCODE_START(8, 7, bl
);
2150 if (!HAVE_FEATURE(features
, OSDMAP_PG_UPMAP
)) {
2153 ENCODE_START(v
, 1, bl
); // client-usable data
2156 ::encode(epoch
, bl
);
2157 ::encode(created
, bl
);
2158 ::encode(modified
, bl
);
2160 ::encode(pools
, bl
, features
);
2161 ::encode(pool_name
, bl
);
2162 ::encode(pool_max
, bl
);
2164 ::encode(flags
, bl
);
2166 ::encode(max_osd
, bl
);
2167 ::encode(osd_state
, bl
);
2168 ::encode(osd_weight
, bl
);
2169 ::encode(osd_addrs
->client_addr
, bl
, features
);
2171 ::encode(*pg_temp
, bl
);
2172 ::encode(*primary_temp
, bl
);
2173 if (osd_primary_affinity
) {
2174 ::encode(*osd_primary_affinity
, bl
);
2182 crush
->encode(cbl
, features
);
2184 ::encode(erasure_code_profiles
, bl
);
// pg_upmap tables are either encoded or asserted to be empty; the condition
// choosing between the two is not visible here
2187 ::encode(pg_upmap
, bl
);
2188 ::encode(pg_upmap_items
, bl
);
2190 assert(pg_upmap
.empty());
2191 assert(pg_upmap_items
.empty());
2193 ENCODE_FINISH(bl
); // client-usable data
2197 uint8_t target_v
= 4;
2198 if (!HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
2201 ENCODE_START(target_v
, 1, bl
); // extended, osd-only data
2202 ::encode(osd_addrs
->hb_back_addr
, bl
, features
);
2203 ::encode(osd_info
, bl
);
2205 // put this in a sorted, ordered map<> so that we encode in a
2206 // deterministic order.
2207 map
<entity_addr_t
,utime_t
> blacklist_map
;
2208 for (const auto &addr
: blacklist
)
2209 blacklist_map
.insert(make_pair(addr
.first
, addr
.second
));
2210 ::encode(blacklist_map
, bl
, features
);
2212 ::encode(osd_addrs
->cluster_addr
, bl
, features
);
2213 ::encode(cluster_snapshot_epoch
, bl
);
2214 ::encode(cluster_snapshot
, bl
);
2215 ::encode(*osd_uuid
, bl
);
2216 ::encode(osd_xinfo
, bl
);
2217 ::encode(osd_addrs
->hb_front_addr
, bl
, features
);
2218 if (target_v
>= 2) {
2219 ::encode(nearfull_ratio
, bl
);
2220 ::encode(full_ratio
, bl
);
2221 ::encode(backfillfull_ratio
, bl
);
2222 ::encode(require_min_compat_client
, bl
);
2224 ENCODE_FINISH(bl
); // osd-only data
2227 ::encode((uint32_t)0, bl
); // dummy crc
2230 tail_offset
= bl
.length();
2232 ENCODE_FINISH(bl
); // meta-encoding wrapper
// crc over [start_offset, crc placeholder), then continued over any bytes
// from tail_offset to the end of bl
2236 front
.substr_of(bl
, start_offset
, crc_it
.get_off() - start_offset
);
2237 crc
= front
.crc32c(-1);
2238 if (tail_offset
< bl
.length()) {
2240 tail
.substr_of(bl
, tail_offset
, bl
.length() - tail_offset
);
2241 crc
= tail
.crc32c(crc
);
// overwrite the 4-byte placeholder with the computed value (crc_le)
2245 crc_it
.copy_in(4, (char*)&crc_le
);
// Convenience overload taking a whole bufferlist: obtains an iterator at the
// start of `bl`.
// NOTE(review): the use of `p` is not visible in this excerpt; presumably it
// is passed to the iterator-based decode().
2249 void OSDMap::decode(bufferlist
& bl
)
2251 auto p
= bl
.begin();
// Decode the "classic" layout from `p` (counterpart of encode_classic() and
// encode_client_old()).
//
// Layout differences between old versions are handled through a version
// value `v` (one branch tests v == 5): pools, pool names and pg_temp each
// have a hand-written per-element decode path next to the generic one, and
// old pg ids are read with decode_raw() and converted via pg_t(opg).
// Containers absent from older encodings are resized instead of decoded
// (cluster_addr, osd_uuid, osd_xinfo, hb_front_addr), and
// osd_primary_affinity is reset.
//
// NOTE(review): the version reads and nearly all of the version tests are
// not visible in this excerpt, so which branch applies to which version
// cannot be stated from here.
2255 void OSDMap::decode_classic(bufferlist::iterator
& p
)
2264 ::decode(created
, p
);
2265 ::decode(modified
, p
);
// pool_max is read through a 32-bit temporary in this branch
2269 int32_t max_pools
= 0;
2270 ::decode(max_pools
, p
);
2271 pool_max
= max_pools
;
2277 ::decode(pools
[t
], p
);
2282 } else if (v
== 5) {
2287 ::decode(pool_name
[t
], p
);
2294 ::decode(pool_name
, p
);
2295 ::decode(pool_max
, p
);
2297 // kludge around some old bug that zeroed out pool_max (#2307)
2298 if (pools
.size() && pool_max
< pools
.rbegin()->first
) {
2299 pool_max
= pools
.rbegin()->first
;
2304 ::decode(max_osd
, p
);
2305 ::decode(osd_state
, p
);
2306 ::decode(osd_weight
, p
);
2307 ::decode(osd_addrs
->client_addr
, p
);
2313 ::decode_raw(opg
, p
);
2314 ::decode((*pg_temp
)[pg_t(opg
)], p
);
2317 ::decode(*pg_temp
, p
);
2323 auto cblp
= cbl
.begin();
2324 crush
->decode(cblp
);
2330 ::decode(osd_addrs
->hb_back_addr
, p
);
2331 ::decode(osd_info
, p
);
2333 ::decode(pool_name
, p
);
2335 ::decode(blacklist
, p
);
2337 ::decode(osd_addrs
->cluster_addr
, p
);
2339 osd_addrs
->cluster_addr
.resize(osd_addrs
->client_addr
.size());
2342 ::decode(cluster_snapshot_epoch
, p
);
2343 ::decode(cluster_snapshot
, p
);
2347 ::decode(*osd_uuid
, p
);
2349 osd_uuid
->resize(max_osd
);
2352 ::decode(osd_xinfo
, p
);
2354 osd_xinfo
.resize(max_osd
);
2357 ::decode(osd_addrs
->hb_front_addr
, p
);
2359 osd_addrs
->hb_front_addr
.resize(osd_addrs
->hb_back_addr
.size());
2361 osd_primary_affinity
.reset();
// Decode a map from `bl` (counterpart of encode()).
//
// The wrapper version is read first; per the comment below, encodings with
// struct_v < 7 predate the wrapper, so the iterator is rewound by
// sizeof(struct_v) for the legacy path.  Otherwise the client-usable
// section and the osd-only section are decoded, with fields introduced in
// later section versions guarded by struct_v tests and reset/cleared when
// absent.
//
// For wrapper struct_v >= 8 a crc32c is recomputed over the bytes before
// the stored crc and the bytes after it; on mismatch a
// buffer::malformed_input("bad crc, ...") is thrown.  For older wrappers
// crc_defined is set to false.
//
// NOTE(review): the struct_v < 7 test and the call into the legacy decoder,
// the fsid decode, the read of the stored crc and various else-lines are not
// visible in this excerpt.
2366 void OSDMap::decode(bufferlist::iterator
& bl
)
2369 * Older encodings of the OSDMap had a single struct_v which
2370 * covered the whole encoding, and was prior to our modern
2371 * stuff which includes a compatv and a size. So if we see
2372 * a struct_v < 7, we must rewind to the beginning and use our
2375 size_t start_offset
= bl
.get_off();
2376 size_t tail_offset
= 0;
2377 bufferlist crc_front
, crc_tail
;
2379 DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl
); // wrapper
// step back over the struct_v that was just consumed
2381 int struct_v_size
= sizeof(struct_v
);
2382 bl
.advance(-struct_v_size
);
2387 * Since we made it past that hurdle, we can use our normal paths.
2390 DECODE_START(4, bl
); // client-usable data
2393 ::decode(epoch
, bl
);
2394 ::decode(created
, bl
);
2395 ::decode(modified
, bl
);
2397 ::decode(pools
, bl
);
2398 ::decode(pool_name
, bl
);
2399 ::decode(pool_max
, bl
);
2401 ::decode(flags
, bl
);
2403 ::decode(max_osd
, bl
);
2404 ::decode(osd_state
, bl
);
2405 ::decode(osd_weight
, bl
);
2406 ::decode(osd_addrs
->client_addr
, bl
);
2408 ::decode(*pg_temp
, bl
);
2409 ::decode(*primary_temp
, bl
);
// an empty decoded affinity vector is stored as "not set"
2410 if (struct_v
>= 2) {
2411 osd_primary_affinity
.reset(new mempool::osdmap::vector
<__u32
>);
2412 ::decode(*osd_primary_affinity
, bl
);
2413 if (osd_primary_affinity
->empty())
2414 osd_primary_affinity
.reset();
2416 osd_primary_affinity
.reset();
2422 auto cblp
= cbl
.begin();
2423 crush
->decode(cblp
);
2424 if (struct_v
>= 3) {
2425 ::decode(erasure_code_profiles
, bl
);
2427 erasure_code_profiles
.clear();
2429 if (struct_v
>= 4) {
2430 ::decode(pg_upmap
, bl
);
2431 ::decode(pg_upmap_items
, bl
);
2434 pg_upmap_items
.clear();
2436 DECODE_FINISH(bl
); // client-usable data
2440 DECODE_START(4, bl
); // extended, osd-only data
2441 ::decode(osd_addrs
->hb_back_addr
, bl
);
2442 ::decode(osd_info
, bl
);
2443 ::decode(blacklist
, bl
);
2444 ::decode(osd_addrs
->cluster_addr
, bl
);
2445 ::decode(cluster_snapshot_epoch
, bl
);
2446 ::decode(cluster_snapshot
, bl
);
2447 ::decode(*osd_uuid
, bl
);
2448 ::decode(osd_xinfo
, bl
);
2449 ::decode(osd_addrs
->hb_front_addr
, bl
);
2450 if (struct_v
>= 2) {
2451 ::decode(nearfull_ratio
, bl
);
2452 ::decode(full_ratio
, bl
);
2457 if (struct_v
>= 3) {
2458 ::decode(backfillfull_ratio
, bl
);
2460 backfillfull_ratio
= 0;
2463 ::decode(require_min_compat_client
, bl
);
2464 DECODE_FINISH(bl
); // osd-only data
// wrapper v8+: capture the bytes preceding the stored crc for verification
2467 if (struct_v
>= 8) {
2468 crc_front
.substr_of(bl
.get_bl(), start_offset
, bl
.get_off() - start_offset
);
2470 tail_offset
= bl
.get_off();
2473 crc_defined
= false;
2477 DECODE_FINISH(bl
); // wrapper
// recompute the crc over front + tail and compare with the stored value
2481 uint32_t actual
= crc_front
.crc32c(-1);
2482 if (tail_offset
< bl
.get_off()) {
2484 tail
.substr_of(bl
.get_bl(), tail_offset
, bl
.get_off() - tail_offset
);
2485 actual
= tail
.crc32c(actual
);
2487 if (crc
!= actual
) {
2489 ss
<< "bad crc, actual " << actual
<< " != expected " << crc
;
2490 string s
= ss
.str();
2491 throw buffer::malformed_input(s
.c_str());
// Rebuild state that is derived from decoded fields: fill name_pool (pool
// name -> pool id) from pool_name (pool id -> pool name), then recompute the
// cached up-OSD feature mask.
// NOTE(review): whether name_pool is cleared first is not visible in this
// excerpt.
2498 void OSDMap::post_decode()
2502 for (const auto &pname
: pool_name
) {
2503 name_pool
[pname
.second
] = pname
.first
;
2507 _calc_up_osd_features();
// Dump `profiles` to formatter `f` as an "erasure_code_profiles" object.
// Each profile becomes an object section named after the profile, holding
// one string field per key/value pair.
// NOTE(review): the Formatter parameter line and the close_section() calls
// are not visible in this excerpt.
2510 void OSDMap::dump_erasure_code_profiles(
2511 const mempool::osdmap::map
<string
,map
<string
,string
>>& profiles
,
2514 f
->open_object_section("erasure_code_profiles");
2515 for (const auto &profile
: profiles
) {
2516 f
->open_object_section(profile
.first
.c_str());
2517 for (const auto &profm
: profile
.second
) {
2518 f
->dump_string(profm
.first
.c_str(), profm
.second
.c_str());
// Dump the whole map to formatter `f`.
//
// Output order: scalar fields (epoch, fsid, created, modified, flags, the
// three fullness ratios, cluster_snapshot, pool_max, max_osd,
// require_min_compat_client, min_compat_client and its version), then the
// sections "pools", "osds", "osd_xinfo", "pg_upmap", "pg_upmap_items",
// "pg_temp", "primary_temp", "blacklist", and finally the erasure code
// profiles.  Field and section names are part of the emitted output.
//
// NOTE(review): most close_section() calls, the per-osd existence tests and
// a few declarations (`st`, `ss`) are not visible in this excerpt.
2525 void OSDMap::dump(Formatter
*f
) const
2527 f
->dump_int("epoch", get_epoch());
2528 f
->dump_stream("fsid") << get_fsid();
2529 f
->dump_stream("created") << get_created();
2530 f
->dump_stream("modified") << get_modified();
2531 f
->dump_string("flags", get_flag_string());
2532 f
->dump_float("full_ratio", full_ratio
);
2533 f
->dump_float("backfillfull_ratio", backfillfull_ratio
);
2534 f
->dump_float("nearfull_ratio", nearfull_ratio
);
2535 f
->dump_string("cluster_snapshot", get_cluster_snapshot());
2536 f
->dump_int("pool_max", get_pool_max());
2537 f
->dump_int("max_osd", get_max_osd());
2538 f
->dump_string("require_min_compat_client", require_min_compat_client
);
2539 auto mv
= get_min_compat_client();
2540 f
->dump_string("min_compat_client", mv
.first
);
2541 f
->dump_string("min_compat_client_version", mv
.second
);
2543 f
->open_array_section("pools");
2544 for (const auto &pool
: pools
) {
// pools without a pool_name entry are reported as "<unknown>"
2545 std::string
name("<unknown>");
2546 const auto &pni
= pool_name
.find(pool
.first
);
2547 if (pni
!= pool_name
.end())
2549 f
->open_object_section("pool");
2550 f
->dump_int("pool", pool
.first
);
2551 f
->dump_string("pool_name", name
);
2552 pool
.second
.dump(f
);
2557 f
->open_array_section("osds");
2558 for (int i
=0; i
<get_max_osd(); i
++)
2560 f
->open_object_section("osd_info");
2561 f
->dump_int("osd", i
);
2562 f
->dump_stream("uuid") << get_uuid(i
);
2563 f
->dump_int("up", is_up(i
));
2564 f
->dump_int("in", is_in(i
));
2565 f
->dump_float("weight", get_weightf(i
));
2566 f
->dump_float("primary_affinity", get_primary_affinityf(i
));
2567 get_info(i
).dump(f
);
2568 f
->dump_stream("public_addr") << get_addr(i
);
2569 f
->dump_stream("cluster_addr") << get_cluster_addr(i
);
2570 f
->dump_stream("heartbeat_back_addr") << get_hb_back_addr(i
);
2571 f
->dump_stream("heartbeat_front_addr") << get_hb_front_addr(i
);
2575 f
->open_array_section("state");
2576 for (const auto &state
: st
)
2577 f
->dump_string("state", state
);
2584 f
->open_array_section("osd_xinfo");
2585 for (int i
=0; i
<get_max_osd(); i
++) {
2587 f
->open_object_section("xinfo");
2588 f
->dump_int("osd", i
);
2589 osd_xinfo
[i
].dump(f
);
2595 f
->open_array_section("pg_upmap");
2596 for (auto& p
: pg_upmap
) {
2597 f
->open_object_section("mapping");
2598 f
->dump_stream("pgid") << p
.first
;
2599 f
->open_array_section("osds");
2600 for (auto q
: p
.second
) {
2601 f
->dump_int("osd", q
);
2607 f
->open_array_section("pg_upmap_items");
2608 for (auto& p
: pg_upmap_items
) {
2609 f
->open_object_section("mapping");
2610 f
->dump_stream("pgid") << p
.first
;
2611 f
->open_array_section("mappings");
2612 for (auto& q
: p
.second
) {
2613 f
->open_object_section("mapping");
2614 f
->dump_int("from", q
.first
);
2615 f
->dump_int("to", q
.second
);
2622 f
->open_array_section("pg_temp");
2623 for (const auto &pg
: *pg_temp
) {
2624 f
->open_object_section("osds");
2625 f
->dump_stream("pgid") << pg
.first
;
2626 f
->open_array_section("osds");
2627 for (const auto osd
: pg
.second
)
2628 f
->dump_int("osd", osd
);
2634 f
->open_array_section("primary_temp");
2635 for (const auto &pg
: *primary_temp
) {
2636 f
->dump_stream("pgid") << pg
.first
;
2637 f
->dump_int("osd", pg
.second
);
2639 f
->close_section(); // primary_temp
// blacklist entries: the field name is built in `ss` (not visible here) and
// the value is the entry's second member
2641 f
->open_object_section("blacklist");
2642 for (const auto &addr
: blacklist
) {
2645 f
->dump_stream(ss
.str().c_str()) << addr
.second
;
2649 dump_erasure_code_profiles(erasure_code_profiles
, f
);
// Append test instances to `o`: a default-constructed OSDMap, and a second
// one built with build_simple() whose created/modified timestamps are pinned
// to utime_t(1, 2) and which carries one blacklist entry for a
// default-constructed address.
// The instances and the CephContext are allocated with `new` here; no
// release of either is visible in this excerpt.
// NOTE(review): the declaration of `fsid` is not visible in this excerpt.
2652 void OSDMap::generate_test_instances(list
<OSDMap
*>& o
)
2654 o
.push_back(new OSDMap
);
2656 CephContext
*cct
= new CephContext(CODE_ENVIRONMENT_UTILITY
);
2657 o
.push_back(new OSDMap
);
2659 o
.back()->build_simple(cct
, 1, fsid
, 16, 7, 8);
2660 o
.back()->created
= o
.back()->modified
= utime_t(1, 2); // fix timestamp
2661 o
.back()->blacklist
[entity_addr_t()] = utime_t(5, 6);
// Render the CEPH_OSDMAP_* bits set in `f` as text.
//
// Each flag that is set contributes a ",<name>" fragment to `s`; the names
// visible here are norebalance, nodeep-scrub, notieragent, sortbitwise,
// require_jewel_osds, require_kraken_osds and require_luminous_osds.
//
// NOTE(review): the fragments for the other flags, the declaration of `s`
// and the return statement (including how the leading comma is handled) are
// not visible in this excerpt.
2665 string
OSDMap::get_flag_string(unsigned f
)
2668 if ( f
& CEPH_OSDMAP_NEARFULL
)
2670 if (f
& CEPH_OSDMAP_FULL
)
2672 if (f
& CEPH_OSDMAP_PAUSERD
)
2674 if (f
& CEPH_OSDMAP_PAUSEWR
)
2676 if (f
& CEPH_OSDMAP_PAUSEREC
)
2678 if (f
& CEPH_OSDMAP_NOUP
)
2680 if (f
& CEPH_OSDMAP_NODOWN
)
2682 if (f
& CEPH_OSDMAP_NOOUT
)
2684 if (f
& CEPH_OSDMAP_NOIN
)
2686 if (f
& CEPH_OSDMAP_NOBACKFILL
)
2688 if (f
& CEPH_OSDMAP_NOREBALANCE
)
2689 s
+= ",norebalance";
2690 if (f
& CEPH_OSDMAP_NORECOVER
)
2692 if (f
& CEPH_OSDMAP_NOSCRUB
)
2694 if (f
& CEPH_OSDMAP_NODEEP_SCRUB
)
2695 s
+= ",nodeep-scrub";
2696 if (f
& CEPH_OSDMAP_NOTIERAGENT
)
2697 s
+= ",notieragent";
2698 if (f
& CEPH_OSDMAP_SORTBITWISE
)
2699 s
+= ",sortbitwise";
2700 if (f
& CEPH_OSDMAP_REQUIRE_JEWEL
)
2701 s
+= ",require_jewel_osds";
2702 if (f
& CEPH_OSDMAP_REQUIRE_KRAKEN
)
2703 s
+= ",require_kraken_osds";
2704 if (f
& CEPH_OSDMAP_REQUIRE_LUMINOUS
)
2705 s
+= ",require_luminous_osds";
// Convenience overload: renders this map's own `flags` by forwarding to
// get_flag_string(unsigned).
2711 string
OSDMap::get_flag_string() const
2713 return get_flag_string(flags
);
// Constructors of `qi`, which holds an item id, a depth and a weight.
// The default constructor zeroes all three; the second one sets them from
// its arguments.
// NOTE(review): the enclosing declaration and the member declarations are
// not visible in this excerpt.
2720 qi() : item(0), depth(0), weight(0) {}
2721 qi(int i
, int d
, float w
) : item(i
), depth(d
), weight(w
) {}
// Print one line per pool to `out`, followed by one "\tsnap ..." line per
// pool snapshot and, when the pool has any, a "\tremoved_snaps ..." line.
// Pools without a pool_name entry start out with the name "<unknown>".
// NOTE(review): the assignment of the looked-up name and part of the first
// output statement are not visible in this excerpt.
2724 void OSDMap::print_pools(ostream
& out
) const
2726 for (const auto &pool
: pools
) {
2727 std::string
name("<unknown>");
2728 const auto &pni
= pool_name
.find(pool
.first
);
2729 if (pni
!= pool_name
.end())
2731 out
<< "pool " << pool
.first
2733 << "' " << pool
.second
<< "\n";
2735 for (const auto &snap
: pool
.second
.snaps
)
2736 out
<< "\tsnap " << snap
.second
.snapid
<< " '" << snap
.second
.name
<< "' " << snap
.second
.stamp
<< "\n";
2738 if (!pool
.second
.removed_snaps
.empty())
2739 out
<< "\tremoved_snaps " << pool
.second
.removed_snaps
<< "\n";
// Print a plain-text rendering of the map to `out`.
//
// Output order: header fields (epoch, fsid, created, modified, flags, the
// three fullness ratios, require_min_compat_client if set,
// min_compat_client, cluster_snapshot if set), then max_osd and one entry
// per osd id below max_osd (up/down, in/out, weight, non-default primary
// affinity, addresses, non-zero uuid), then pg_upmap, pg_upmap_items,
// pg_temp, primary_temp and blacklist entries.
//
// NOTE(review): the pool listing, the per-osd existence test and parts of
// the per-osd line are not visible in this excerpt.
2744 void OSDMap::print(ostream
& out
) const
2746 out
<< "epoch " << get_epoch() << "\n"
2747 << "fsid " << get_fsid() << "\n"
2748 << "created " << get_created() << "\n"
2749 << "modified " << get_modified() << "\n";
2751 out
<< "flags " << get_flag_string() << "\n";
2752 out
<< "full_ratio " << full_ratio
<< "\n";
2753 out
<< "backfillfull_ratio " << backfillfull_ratio
<< "\n";
2754 out
<< "nearfull_ratio " << nearfull_ratio
<< "\n";
2755 if (require_min_compat_client
.length()) {
2756 out
<< "require_min_compat_client " << require_min_compat_client
<< "\n";
2758 auto mv
= get_min_compat_client();
2759 out
<< "min_compat_client " << mv
.first
<< " " << mv
.second
<< "\n";
2760 if (get_cluster_snapshot().length())
2761 out
<< "cluster_snapshot " << get_cluster_snapshot() << "\n";
2766 out
<< "max_osd " << get_max_osd() << "\n";
2767 for (int i
=0; i
<get_max_osd(); i
++) {
2770 out
<< (is_up(i
) ? " up ":" down");
2771 out
<< (is_in(i
) ? " in ":" out");
2772 out
<< " weight " << get_weightf(i
);
// primary affinity is printed only when it differs from the default
2773 if (get_primary_affinity(i
) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
)
2774 out
<< " primary_affinity " << get_primary_affinityf(i
);
2775 const osd_info_t
& info(get_info(i
));
2777 out
<< " " << get_addr(i
) << " " << get_cluster_addr(i
) << " " << get_hb_back_addr(i
)
2778 << " " << get_hb_front_addr(i
);
2782 if (!get_uuid(i
).is_zero())
2783 out
<< " " << get_uuid(i
);
2789 for (auto& p
: pg_upmap
) {
2790 out
<< "pg_upmap " << p
.first
<< " " << p
.second
<< "\n";
2792 for (auto& p
: pg_upmap_items
) {
2793 out
<< "pg_upmap_items " << p
.first
<< " " << p
.second
<< "\n";
2796 for (const auto pg
: *pg_temp
)
2797 out
<< "pg_temp " << pg
.first
<< " " << pg
.second
<< "\n";
2799 for (const auto pg
: *primary_temp
)
2800 out
<< "primary_temp " << pg
.first
<< " " << pg
.second
<< "\n";
2802 for (const auto &addr
: blacklist
)
2803 out
<< "blacklist " << addr
.first
<< " expires " << addr
.second
<< "\n";
2805 // ignore pg_swap_primary
// CrushTreeDumper specialisation that renders the CRUSH hierarchy, plus
// per-OSD status taken from an OSDMap, into a TextTable.
2808 class OSDTreePlainDumper
: public CrushTreeDumper::Dumper
<TextTable
> {
2810 typedef CrushTreeDumper::Dumper
<TextTable
> Parent
;
// crush is handed to the base dumper; osdmap_ is kept as a non-owning
// pointer for the per-OSD columns.
2811 OSDTreePlainDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap_
)
2812 : Parent(crush
), osdmap(osdmap_
) {}
// Define the six table columns, then emit every OSD that exists in the
// map but was not touched (per is_touched) as a depth-0, weight-0 item.
// NOTE(review): the lines between the column setup and this loop are not
// visible in this listing -- presumably the base-class tree walk runs
// there; confirm.
2814 void dump(TextTable
*tbl
) {
2815 tbl
->define_column("ID", TextTable::LEFT
, TextTable::RIGHT
);
2816 tbl
->define_column("WEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
2817 tbl
->define_column("TYPE NAME", TextTable::LEFT
, TextTable::LEFT
);
2818 tbl
->define_column("UP/DOWN", TextTable::LEFT
, TextTable::RIGHT
);
2819 tbl
->define_column("REWEIGHT", TextTable::LEFT
, TextTable::RIGHT
);
2820 tbl
->define_column("PRIMARY-AFFINITY", TextTable::LEFT
, TextTable::RIGHT
);
2824 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
2825 if (osdmap
->exists(i
) && !is_touched(i
))
2826 dump_item(CrushTreeDumper::Item(i
, 0, 0), tbl
);
// Emit one table row for qi: its weight, a name built in a loop over
// qi.depth (body not visible here), and for OSDs the up/down state,
// reweight and primary affinity.
2831 void dump_item(const CrushTreeDumper::Item
&qi
, TextTable
*tbl
) override
{
2834 << weightf_t(qi
.weight
);
2837 for (int k
= 0; k
< qi
.depth
; k
++)
// Buckets are labelled "<type name> <item name>"; leaves are "osd.<id>".
2839 if (qi
.is_bucket()) {
2840 name
<< crush
->get_type_name(crush
->get_bucket_type(qi
.id
)) << " "
2841 << crush
->get_item_name(qi
.id
);
2843 name
<< "osd." << qi
.id
;
// NOTE(review): the handling of an OSD that does not exist in the map is
// not visible in this listing.
2847 if (!qi
.is_bucket()) {
2848 if (!osdmap
->exists(qi
.id
)) {
2852 *tbl
<< (osdmap
->is_up(qi
.id
) ? "up" : "down")
2853 << weightf_t(osdmap
->get_weightf(qi
.id
))
2854 << weightf_t(osdmap
->get_primary_affinityf(qi
.id
));
2857 *tbl
<< TextTable::endrow
;
// Non-owning pointer to the map queried for OSD state.
2861 const OSDMap
*osdmap
;
// CrushTreeDumper specialisation that writes the CRUSH hierarchy, plus
// per-OSD status taken from an OSDMap, through a Formatter.
2864 class OSDTreeFormattingDumper
: public CrushTreeDumper::FormattingDumper
{
2866 typedef CrushTreeDumper::FormattingDumper Parent
;
// crush is handed to the base dumper; osdmap_ is kept as a non-owning
// pointer.
2868 OSDTreeFormattingDumper(const CrushWrapper
*crush
, const OSDMap
*osdmap_
)
2869 : Parent(crush
), osdmap(osdmap_
) {}
// Emit a "nodes" array and then a "stray" array; the stray array lists
// every OSD that exists in the map but was not touched (per is_touched).
// NOTE(review): the lines that fill and close "nodes" are not visible in
// this listing.
2871 void dump(Formatter
*f
) {
2872 f
->open_array_section("nodes");
2875 f
->open_array_section("stray");
2876 for (int i
= 0; i
< osdmap
->get_max_osd(); i
++) {
2877 if (osdmap
->exists(i
) && !is_touched(i
))
2878 dump_item(CrushTreeDumper::Item(i
, 0, 0), f
);
// Extend the base item fields with OSD status for non-bucket items:
// exists, status ("up"/"down"), reweight and primary_affinity.
2884 void dump_item_fields(const CrushTreeDumper::Item
&qi
, Formatter
*f
) override
{
2885 Parent::dump_item_fields(qi
, f
);
2886 if (!qi
.is_bucket())
2888 f
->dump_unsigned("exists", (int)osdmap
->exists(qi
.id
));
2889 f
->dump_string("status", osdmap
->is_up(qi
.id
) ? "up" : "down");
2890 f
->dump_float("reweight", osdmap
->get_weightf(qi
.id
));
2891 f
->dump_float("primary_affinity", osdmap
->get_primary_affinityf(qi
.id
));
// Non-owning pointer to the map queried for OSD state.
2896 const OSDMap
*osdmap
;
// Dump the OSD tree either through Formatter f (OSDTreeFormattingDumper)
// or as a text table (OSDTreePlainDumper filling a local tbl).
// NOTE(review): the condition selecting between the two paths, the
// declaration of tbl and the write of tbl to *out are not visible in this
// listing -- presumably f selects the formatter path; confirm.
2899 void OSDMap::print_tree(Formatter
*f
, ostream
*out
) const
2902 OSDTreeFormattingDumper(crush
.get(), this).dump(f
);
2906 OSDTreePlainDumper(crush
.get(), this).dump(&tbl
);
// Summarise the map either as an "osdmap" object on Formatter f (epoch,
// OSD counts, full/nearfull flags, remapped PG count) or as text on out.
// NOTE(review): the branch selecting formatter vs. text output is not
// visible in this listing.
2911 void OSDMap::print_summary(Formatter
*f
, ostream
& out
) const
2914 f
->open_object_section("osdmap");
2915 f
->dump_int("epoch", get_epoch());
2916 f
->dump_int("num_osds", get_num_osds());
2917 f
->dump_int("num_up_osds", get_num_up_osds());
2918 f
->dump_int("num_in_osds", get_num_in_osds());
2919 f
->dump_bool("full", test_flag(CEPH_OSDMAP_FULL
) ? true : false);
2920 f
->dump_bool("nearfull", test_flag(CEPH_OSDMAP_NEARFULL
) ? true : false);
2921 f
->dump_unsigned("num_remapped_pgs", get_num_pg_temp());
// Text form: epoch and OSD counts, plus the remapped PG count when
// get_num_pg_temp() is non-zero.
2924 out
<< " osdmap e" << get_epoch() << ": "
2925 << get_num_osds() << " osds: "
2926 << get_num_up_osds() << " up, "
2927 << get_num_in_osds() << " in";
2928 if (get_num_pg_temp())
2929 out
<< "; " << get_num_pg_temp() << " remapped pgs";
// Mask out the CEPH_OSDMAP_SEMIHIDDEN_FLAGS bits so only the remaining
// flags are reported.
2931 uint64_t important_flags
= flags
& ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS
;
2932 if (important_flags
)
2933 out
<< " flags " << get_flag_string(important_flags
) << "\n";
// Write a single-line summary (epoch and total/up/in OSD counts) to out,
// then test the FULL flag and, failing that, the NEARFULL flag.
// NOTE(review): what is printed in the FULL / NEARFULL branches is not
// visible in this listing.
2937 void OSDMap::print_oneline_summary(ostream
& out
) const
2939 out
<< "e" << get_epoch() << ": "
2940 << get_num_osds() << " osds: "
2941 << get_num_up_osds() << " up, "
2942 << get_num_in_osds() << " in";
2943 if (test_flag(CEPH_OSDMAP_FULL
))
2945 else if (test_flag(CEPH_OSDMAP_NEARFULL
))
// Scan every pool and compare its crush_ruleset with `ruleset`.
// NOTE(review): the return statements are not visible in this listing --
// presumably true on the first match and false otherwise; confirm.
2949 bool OSDMap::crush_ruleset_in_use(int ruleset
) const
2951 for (const auto &pool
: pools
) {
2952 if (pool
.second
.crush_ruleset
== ruleset
)
// Populate this map with a simple initial configuration: timestamps, the
// OSD id range, a CRUSH map, one replicated pool per entry of pool_names
// (only "rbd" is visible here), all OSDs weighted out, and the "default"
// erasure-code profile.
// The visible code builds the CRUSH map either from nosd
// (build_simple_crush_map) or from the [osd.N] config sections
// (build_simple_crush_map_from_conf).
// NOTE(review): many lines are not visible in this listing (the branch
// choosing between the two paths, error returns, the maxosd bookkeeping)
// -- confirm details against the full file.
2958 int OSDMap::build_simple(CephContext
*cct
, epoch_t e
, uuid_d
&fsid
,
2959 int nosd
, int pg_bits
, int pgp_bits
)
// NOTE(review): this logs `num_osd`, which is not the `nosd` parameter and
// is not declared in the visible code -- confirm that is the intended value.
2961 ldout(cct
, 10) << "build_simple on " << num_osd
2962 << " osds with " << pg_bits
<< " pg bits per osd, "
2966 created
= modified
= ceph_clock_now();
// Walk the config sections and parse the numeric id out of each section
// whose name starts with "osd.".
2973 const md_config_t
*conf
= cct
->_conf
;
2974 vector
<string
> sections
;
2975 conf
->get_all_sections(sections
);
// NOTE(review): "§ion" looks like an HTML-entity corruption of
// "&section" (the loop body uses `section`).
2977 for (auto §ion
: sections
) {
2978 if (section
.find("osd.") != 0)
// Skip the 4-character "osd." prefix and parse the rest as a decimal id.
2981 const char *begin
= section
.c_str() + 4;
2982 char *end
= (char*)begin
;
2983 int o
= strtol(begin
, &end
, 10);
2987 if (o
> cct
->_conf
->mon_max_osd
) {
2988 lderr(cct
) << "[osd." << o
<< "] in config has id > mon_max_osd " << cct
->_conf
->mon_max_osd
<< dendl
;
2996 set_max_osd(maxosd
+ 1);
2999 // pgp_num <= pg_num
3000 if (pgp_bits
> pg_bits
)
3003 vector
<string
> pool_names
;
3004 pool_names
.push_back("rbd");
3009 r
= build_simple_crush_map(cct
, *crush
, nosd
, &ss
);
3011 r
= build_simple_crush_map_from_conf(cct
, *crush
, &ss
);
// pg_num / pgp_num are poolbase shifted left by pg_bits / pgp_bits;
// poolbase falls back to 1 when there are no OSDs.
3014 int poolbase
= get_max_osd() ? get_max_osd() : 1;
3016 int const default_replicated_ruleset
= crush
->get_osd_pool_default_crush_replicated_ruleset(cct
);
3017 assert(default_replicated_ruleset
>= 0);
// Create one replicated pool per name, taking flags, size and min_size
// from the osd_pool_default_* configuration values.
3019 for (auto &plname
: pool_names
) {
3020 int64_t pool
= ++pool_max
;
3021 pools
[pool
].type
= pg_pool_t::TYPE_REPLICATED
;
3022 pools
[pool
].flags
= cct
->_conf
->osd_pool_default_flags
;
3023 if (cct
->_conf
->osd_pool_default_flag_hashpspool
)
3024 pools
[pool
].set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
3025 if (cct
->_conf
->osd_pool_default_flag_nodelete
)
3026 pools
[pool
].set_flag(pg_pool_t::FLAG_NODELETE
);
3027 if (cct
->_conf
->osd_pool_default_flag_nopgchange
)
3028 pools
[pool
].set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
3029 if (cct
->_conf
->osd_pool_default_flag_nosizechange
)
3030 pools
[pool
].set_flag(pg_pool_t::FLAG_NOSIZECHANGE
);
3031 pools
[pool
].size
= cct
->_conf
->osd_pool_default_size
;
3032 pools
[pool
].min_size
= cct
->_conf
->get_osd_pool_default_min_size();
3033 pools
[pool
].crush_ruleset
= default_replicated_ruleset
;
3034 pools
[pool
].object_hash
= CEPH_STR_HASH_RJENKINS
;
3035 pools
[pool
].set_pg_num(poolbase
<< pg_bits
);
3036 pools
[pool
].set_pgp_num(poolbase
<< pgp_bits
);
3037 pools
[pool
].last_change
= epoch
;
// Keep the id->name and name->id indexes in sync.
3038 pool_name
[pool
] = plname
;
3039 name_pool
[plname
] = pool
;
// Every OSD slot starts with weight CEPH_OSD_OUT.
3042 for (int i
=0; i
<get_max_osd(); i
++) {
3044 set_weight(i
, CEPH_OSD_OUT
);
// Install the configured default erasure-code profile as "default".
3047 map
<string
,string
> profile_map
;
3048 r
= get_erasure_code_profile_default(cct
, profile_map
, &ss
);
3050 lderr(cct
) << ss
.str() << dendl
;
3053 set_erasure_code_profile("default", profile_map
);
// Parse the osd_pool_default_erasure_code_profile configuration value with
// get_json_str_map.
// NOTE(review): the remaining parameters of this function and of the
// get_json_str_map call, and the return, are not visible in this listing
// -- presumably the result lands in profile_map and r is returned; confirm.
3057 int OSDMap::get_erasure_code_profile_default(CephContext
*cct
,
3058 map
<string
,string
> &profile_map
,
3061 int r
= get_json_str_map(cct
->_conf
->osd_pool_default_erasure_code_profile
,
// Register the CRUSH type names on `crush`: ids 0 through 10, from "osd"
// (0) up to "root" (10).
// NOTE(review): the return statement is not visible in this listing;
// callers store the result as root_type, so presumably it is the id of the
// "root" type -- confirm.
3067 int OSDMap::_build_crush_types(CrushWrapper
& crush
)
3069 crush
.set_type_name(0, "osd");
3070 crush
.set_type_name(1, "host");
3071 crush
.set_type_name(2, "chassis");
3072 crush
.set_type_name(3, "rack");
3073 crush
.set_type_name(4, "row");
3074 crush
.set_type_name(5, "pdu");
3075 crush
.set_type_name(6, "pod");
3076 crush
.set_type_name(7, "room");
3077 crush
.set_type_name(8, "datacenter");
3078 crush
.set_type_name(9, "region");
3079 crush
.set_type_name(10, "root");
// Build a minimal CRUSH map in `crush`: register the type names, create a
// root bucket named "default", insert OSDs 0..nosd-1 under
// host=localhost / rack=localrack / root=default with weight 1.0, then add
// the default rulesets.
// NOTE(review): error handling and the return value are not visible in
// this listing.
3083 int OSDMap::build_simple_crush_map(CephContext
*cct
, CrushWrapper
& crush
,
3084 int nosd
, ostream
*ss
)
3089 int root_type
= _build_crush_types(crush
);
// Create the root bucket; add_bucket writes the new id into rootid.
3091 int r
= crush
.add_bucket(0, 0, CRUSH_HASH_DEFAULT
,
3092 root_type
, 0, NULL
, NULL
, &rootid
);
3094 crush
.set_item_name(rootid
, "default");
3096 for (int o
=0; o
<nosd
; o
++) {
3097 map
<string
,string
> loc
;
3098 loc
["host"] = "localhost";
3099 loc
["rack"] = "localrack";
3100 loc
["root"] = "default";
3101 ldout(cct
, 10) << " adding osd." << o
<< " at " << loc
<< dendl
;
// Item name is "osd.<id>".
3103 snprintf(name
, sizeof(name
), "osd.%d", o
);
3104 crush
.insert_item(cct
, o
, 1.0, name
, loc
);
3107 build_simple_crush_rulesets(cct
, crush
, "default", ss
);
// Build a CRUSH map in `crush` from the configuration: register the type
// names, create a root bucket named "default", then insert one item per
// [osd.N] config section with weight 1.0, using the section name as the
// item name.  The location is taken from that section's host / rack / row
// / room / datacenter settings.
// NOTE(review): several lines are not visible in this listing (remaining
// parameters, most of the `loc` assignments, error handling, the return).
3114 int OSDMap::build_simple_crush_map_from_conf(CephContext
*cct
,
3115 CrushWrapper
& crush
,
3118 const md_config_t
*conf
= cct
->_conf
;
3123 int root_type
= _build_crush_types(crush
);
3125 int r
= crush
.add_bucket(0, 0,
3127 root_type
, 0, NULL
, NULL
, &rootid
);
3129 crush
.set_item_name(rootid
, "default");
3132 vector
<string
> sections
;
3133 conf
->get_all_sections(sections
);
// NOTE(review): "§ion" looks like an HTML-entity corruption of
// "&section" (the loop body uses `section`).
3135 for (auto §ion
: sections
) {
3136 if (section
.find("osd.") != 0)
// Skip the 4-character "osd." prefix and parse the rest as a decimal id.
3139 const char *begin
= section
.c_str() + 4;
3140 char *end
= (char*)begin
;
3141 int o
= strtol(begin
, &end
, 10);
// Look each location key up in the sections listed in sectiontmp
// ("osd", then this OSD's own section).
3145 string host
, rack
, row
, room
, dc
, pool
;
3146 vector
<string
> sectiontmp
;
3147 sectiontmp
.push_back("osd");
3148 sectiontmp
.push_back(section
);
3149 conf
->get_val_from_conf_file(sectiontmp
, "host", host
, false);
3150 conf
->get_val_from_conf_file(sectiontmp
, "rack", rack
, false);
3151 conf
->get_val_from_conf_file(sectiontmp
, "row", row
, false);
3152 conf
->get_val_from_conf_file(sectiontmp
, "room", room
, false);
3153 conf
->get_val_from_conf_file(sectiontmp
, "datacenter", dc
, false);
3154 conf
->get_val_from_conf_file(sectiontmp
, "root", pool
, false);
// host and rack fall back to placeholder names when unset.
3156 if (host
.length() == 0)
3157 host
= "unknownhost";
3158 if (rack
.length() == 0)
3159 rack
= "unknownrack";
3161 map
<string
,string
> loc
;
3169 loc
["datacenter"] = dc
;
// The visible code always sets root to "default"; no use of the "root"
// value read into `pool` above is visible in this listing.
3170 loc
["root"] = "default";
3172 ldout(cct
, 5) << " adding osd." << o
<< " at " << loc
<< dendl
;
3173 crush
.insert_item(cct
, o
, 1.0, section
, loc
);
3176 build_simple_crush_rulesets(cct
, crush
, "default", ss
);
// Add the default replicated ruleset ("replicated_ruleset", mode "firstn")
// to `crush`, using the CRUSH type named by osd_crush_chooseleaf_type as
// the failure domain.
// NOTE(review): the remaining parameters, the declaration of crush_ruleset
// and r, the tail of the add_simple_ruleset_at call and the return are not
// visible in this listing.
3184 int OSDMap::build_simple_crush_rulesets(CephContext
*cct
,
3185 CrushWrapper
& crush
,
3190 crush
._get_osd_pool_default_crush_replicated_ruleset(cct
, true);
3191 string failure_domain
=
3192 crush
.get_type_name(cct
->_conf
->osd_crush_chooseleaf_type
);
3194 if (crush_ruleset
== CEPH_DEFAULT_CRUSH_REPLICATED_RULESET
)
3195 crush_ruleset
= -1; // create ruleset 0 by default
3198 r
= crush
.add_simple_ruleset_at("replicated_ruleset", root
, failure_domain
,
3199 "firstn", pg_pool_t::TYPE_REPLICATED
,
3203 // do not add an erasure rule by default or else we will implicitly
3204 // require the crush_v2 feature of clients
// Compute PG placement statistics for this map and, via newmap, for a
// second map: how many PG replicas differ between the two `up` sets, the
// average PGs per up+in OSD, the standard deviation of per-OSD PG counts
// before/after, and the OSDs holding the fewest and most PGs.  Results are
// written through Formatter f or the text stream ss.
// NOTE(review): many lines are not visible in this listing (most
// parameters, the selection of pools into `ls`, the increments of the
// counters, the handling of num_up_in == 0, the f-vs-ss branches and the
// return) -- confirm details against the full file.
3208 int OSDMap::summarize_mapping_stats(
3210 const set
<int64_t> *pools
,
3218 for (auto &p
: get_pools())
3222 unsigned total_pg
= 0;
3223 unsigned moved_pg
= 0;
// Per-OSD PG counts under this map (base) and under newmap (new).
3224 vector
<unsigned> base_by_osd(get_max_osd(), 0);
3225 vector
<unsigned> new_by_osd(get_max_osd(), 0);
3226 for (int64_t pool_id
: ls
) {
3227 const pg_pool_t
*pi
= get_pg_pool(pool_id
);
3228 vector
<int> up
, up2
, acting
;
3229 int up_primary
, acting_primary
;
3230 for (unsigned ps
= 0; ps
< pi
->get_pg_num(); ++ps
) {
3231 pg_t
pgid(ps
, pool_id
, -1);
// total_pg counts replicas (pool size per PG), not PGs.
3232 total_pg
+= pi
->get_size();
3233 pg_to_up_acting_osds(pgid
, &up
, &up_primary
,
3234 &acting
, &acting_primary
);
3235 for (int osd
: up
) {
3236 if (osd
>= 0 && osd
< get_max_osd())
3240 newmap
->pg_to_up_acting_osds(pgid
, &up2
, &up_primary
,
3241 &acting
, &acting_primary
);
3242 for (int osd
: up2
) {
3243 if (osd
>= 0 && osd
< get_max_osd())
// Erasure pools are compared position by position; replicated pools by
// membership of each old OSD in the new set.
// NOTE(review): the erasure loop indexes up2 with i < up.size(), which
// assumes up2 is at least as long as up -- confirm that always holds.
3246 if (pi
->type
== pg_pool_t::TYPE_ERASURE
) {
3247 for (unsigned i
=0; i
<up
.size(); ++i
) {
3248 if (up
[i
] != up2
[i
]) {
3252 } else if (pi
->type
== pg_pool_t::TYPE_REPLICATED
) {
3253 for (int osd
: up
) {
3254 if (std::find(up2
.begin(), up2
.end(), osd
) == up2
.end()) {
3259 assert(0 == "unhandled pool type");
3265 unsigned num_up_in
= 0;
3266 for (int osd
= 0; osd
< get_max_osd(); ++osd
) {
3267 if (is_up(osd
) && is_in(osd
))
// Average replicas per up+in OSD; both standard deviations below are
// measured against this same average.
3274 float avg_pg
= (float)total_pg
/ (float)num_up_in
;
3275 float base_stddev
= 0, new_stddev
= 0;
3276 int min
= -1, max
= -1;
3277 unsigned min_base_pg
= 0, max_base_pg
= 0;
3278 unsigned min_new_pg
= 0, max_new_pg
= 0;
3279 for (int osd
= 0; osd
< get_max_osd(); ++osd
) {
3280 if (is_up(osd
) && is_in(osd
)) {
3281 float base_diff
= (float)base_by_osd
[osd
] - avg_pg
;
3282 base_stddev
+= base_diff
* base_diff
;
3283 float new_diff
= (float)new_by_osd
[osd
] - avg_pg
;
3284 new_stddev
+= new_diff
* new_diff
;
// min/max are selected on the base counts; the matching new count of the
// same OSD is recorded alongside.
3285 if (min
< 0 || base_by_osd
[osd
] < min_base_pg
) {
3287 min_base_pg
= base_by_osd
[osd
];
3288 min_new_pg
= new_by_osd
[osd
];
3290 if (max
< 0 || base_by_osd
[osd
] > max_base_pg
) {
3292 max_base_pg
= base_by_osd
[osd
];
3293 max_new_pg
= new_by_osd
[osd
];
3297 base_stddev
= sqrt(base_stddev
/ num_up_in
);
3298 new_stddev
= sqrt(new_stddev
/ num_up_in
);
// Reported below as the "expected baseline" deviation:
// sqrt(avg_pg * (1 - 1/num_up_in)).
3300 float edev
= sqrt(avg_pg
* (1.0 - (1.0 / (double)num_up_in
)));
3304 f
->open_object_section("utilization");
3307 f
->dump_unsigned("moved_pgs", moved_pg
);
3308 f
->dump_unsigned("total_pgs", total_pg
);
3312 percent
= (float)moved_pg
* 100.0 / (float)total_pg
;
3313 ss
<< "moved " << moved_pg
<< " / " << total_pg
3314 << " (" << percent
<< "%)\n";
3318 f
->dump_float("avg_pgs", avg_pg
);
3319 f
->dump_float("std_dev", base_stddev
);
3320 f
->dump_float("expected_baseline_std_dev", edev
);
3322 f
->dump_float("new_std_dev", new_stddev
);
3324 ss
<< "avg " << avg_pg
<< "\n";
3325 ss
<< "stddev " << base_stddev
;
3327 ss
<< " -> " << new_stddev
;
3328 ss
<< " (expected baseline " << edev
<< ")\n";
3332 f
->dump_unsigned("min_osd", min
);
3333 f
->dump_unsigned("min_osd_pgs", min_base_pg
);
3335 f
->dump_unsigned("new_min_osd_pgs", min_new_pg
);
3337 ss
<< "min osd." << min
<< " with " << min_base_pg
;
3339 ss
<< " -> " << min_new_pg
;
3340 ss
<< " pgs (" << (float)min_base_pg
/ avg_pg
;
3342 ss
<< " -> " << (float)min_new_pg
/ avg_pg
;
3348 f
->dump_unsigned("max_osd", max
);
3349 f
->dump_unsigned("max_osd_pgs", max_base_pg
);
3351 f
->dump_unsigned("new_max_osd_pgs", max_new_pg
);
3353 ss
<< "max osd." << max
<< " with " << max_base_pg
;
3355 ss
<< " -> " << max_new_pg
;
3356 ss
<< " pgs (" << (float)max_base_pg
/ avg_pg
;
3358 ss
<< " -> " << (float)max_new_pg
/ avg_pg
;
// Queue removal or simplification of upmap entries in pending_inc:
//  - a pg_upmap entry equal to the raw mapping is added to old_pg_upmap;
//  - a pg_upmap_items entry is filtered down to the pairs whose source OSD
//    (q.first) appears in the raw mapping; it is added to
//    old_pg_upmap_items when nothing remains, or replaced in
//    new_pg_upmap_items when only some pairs remain.
// This map itself is not modified by the visible code.
// NOTE(review): the first parameter, the declarations of raw/primary and
// the return are not visible in this listing.
3370 int OSDMap::clean_pg_upmaps(
3372 Incremental
*pending_inc
)
3374 ldout(cct
, 10) << __func__
<< dendl
;
3376 for (auto& p
: pg_upmap
) {
3379 pg_to_raw_osds(p
.first
, &raw
, &primary
);
3380 if (vectors_equal(raw
, p
.second
)) {
3381 ldout(cct
, 10) << " removing redundant pg_upmap " << p
.first
<< " "
3382 << p
.second
<< dendl
;
3383 pending_inc
->old_pg_upmap
.insert(p
.first
);
3387 for (auto& p
: pg_upmap_items
) {
3390 pg_to_raw_osds(p
.first
, &raw
, &primary
);
// Keep only the pairs whose source OSD is in the raw mapping.
3391 mempool::osdmap::vector
<pair
<int,int>> newmap
;
3392 for (auto& q
: p
.second
) {
3393 if (std::find(raw
.begin(), raw
.end(), q
.first
) != raw
.end()) {
3394 newmap
.push_back(q
);
3397 if (newmap
.empty()) {
3398 ldout(cct
, 10) << " removing no-op pg_upmap_items " << p
.first
<< " "
3399 << p
.second
<< dendl
;
3400 pending_inc
->old_pg_upmap_items
.insert(p
.first
);
3402 } else if (newmap
!= p
.second
) {
3403 ldout(cct
, 10) << " simplifying partially no-op pg_upmap_items "
3404 << p
.first
<< " " << p
.second
<< " -> " << newmap
<< dendl
;
3405 pending_inc
->new_pg_upmap_items
[p
.first
] = newmap
;
// Try to compute an alternative mapping for `pg`.  The visible code looks
// up the pool and its CRUSH rule, fetches the original raw mapping into
// *orig, checks whether any OSD of that mapping is in `overfull`, and
// calls crush->try_remap_rule with the overfull / underfull sets.
// NOTE(review): several lines are not visible in this listing (remaining
// parameters, early returns, the other try_remap_rule arguments and the
// final return) -- confirm the exact success criteria against the full
// file.
3412 bool OSDMap::try_pg_upmap(
3414 pg_t pg
, ///< pg to potentially remap
3415 const set
<int>& overfull
, ///< osds we'd want to evacuate
3416 const vector
<int>& underfull
, ///< osds to move to, in order of preference
3418 vector
<int> *out
) ///< resulting alternative mapping
3420 const pg_pool_t
*pool
= get_pg_pool(pg
.pool());
3423 int rule
= crush
->find_rule(pool
->get_crush_ruleset(), pool
->get_type(),
3428 // get original mapping
3429 _pg_to_raw_osds(*pool
, pg
, orig
, NULL
);
3431 // make sure there is something there to remap
3433 for (auto osd
: *orig
) {
3434 if (overfull
.count(osd
)) {
3443 int r
= crush
->try_remap_rule(
3447 overfull
, underfull
,
3457 int OSDMap::calc_pg_upmaps(
3459 float max_deviation
,
3461 const set
<int64_t>& only_pools
,
3462 OSDMap::Incremental
*pending_inc
)
3465 tmp
.deepish_copy_from(*this);
3466 int num_changed
= 0;
3468 map
<int,set
<pg_t
>> pgs_by_osd
;
3470 for (auto& i
: pools
) {
3471 if (!only_pools
.empty() && !only_pools
.count(i
.first
))
3473 for (unsigned ps
= 0; ps
< i
.second
.get_pg_num(); ++ps
) {
3474 pg_t
pg(ps
, i
.first
);
3476 tmp
.pg_to_up_acting_osds(pg
, &up
, nullptr, nullptr, nullptr);
3477 for (auto osd
: up
) {
3478 if (osd
!= CRUSH_ITEM_NONE
)
3479 pgs_by_osd
[osd
].insert(pg
);
3482 total_pgs
+= i
.second
.get_size() * i
.second
.get_pg_num();
3484 float osd_weight_total
= 0;
3485 map
<int,float> osd_weight
;
3486 for (auto& i
: pgs_by_osd
) {
3487 float w
= crush
->get_item_weightf(i
.first
);
3488 osd_weight
[i
.first
] = w
;
3489 osd_weight_total
+= w
;
3490 ldout(cct
, 20) << " osd." << i
.first
<< " weight " << w
3491 << " pgs " << i
.second
.size() << dendl
;
3494 // NOTE: we assume we touch all osds with CRUSH!
3495 float pgs_per_weight
= total_pgs
/ osd_weight_total
;
3496 ldout(cct
, 10) << " osd_weight_total " << osd_weight_total
<< dendl
;
3497 ldout(cct
, 10) << " pgs_per_weight " << pgs_per_weight
<< dendl
;
3500 map
<int,float> osd_deviation
; // osd, deviation(pgs)
3501 multimap
<float,int> deviation_osd
; // deviation(pgs), osd
3503 for (auto& i
: pgs_by_osd
) {
3504 float target
= osd_weight
[i
.first
] * pgs_per_weight
;
3505 float deviation
= (float)i
.second
.size() - target
;
3506 ldout(cct
, 20) << " osd." << i
.first
3507 << "\tpgs " << i
.second
.size()
3508 << "\ttarget " << target
3509 << "\tdeviation " << deviation
3511 osd_deviation
[i
.first
] = deviation
;
3512 deviation_osd
.insert(make_pair(deviation
, i
.first
));
3514 overfull
.insert(i
.first
);
3517 // build underfull, sorted from least-full to most-average
3518 vector
<int> underfull
;
3519 for (auto i
= deviation_osd
.begin();
3520 i
!= deviation_osd
.end();
3522 if (i
->first
>= -.999)
3524 underfull
.push_back(i
->second
);
3526 ldout(cct
, 10) << " overfull " << overfull
3527 << " underfull " << underfull
<< dendl
;
3528 if (overfull
.empty() || underfull
.empty())
3532 bool restart
= false;
3533 for (auto p
= deviation_osd
.rbegin(); p
!= deviation_osd
.rend(); ++p
) {
3534 int osd
= p
->second
;
3535 float target
= osd_weight
[osd
] * pgs_per_weight
;
3536 float deviation
= deviation_osd
.rbegin()->first
;
3537 if (deviation
/target
< max_deviation
) {
3538 ldout(cct
, 10) << " osd." << osd
3539 << " target " << target
3540 << " deviation " << deviation
3541 << " -> " << deviation
/target
3542 << " < max " << max_deviation
<< dendl
;
3545 int num_to_move
= deviation
;
3546 ldout(cct
, 10) << " osd." << osd
<< " move " << num_to_move
<< dendl
;
3547 if (num_to_move
< 1)
3550 set
<pg_t
>& pgs
= pgs_by_osd
[osd
];
3552 // look for remaps we can un-remap
3553 for (auto pg
: pgs
) {
3554 auto p
= tmp
.pg_upmap_items
.find(pg
);
3555 if (p
!= tmp
.pg_upmap_items
.end()) {
3556 for (auto q
: p
->second
) {
3557 if (q
.second
== osd
) {
3558 ldout(cct
, 10) << " dropping pg_upmap_items " << pg
3559 << " " << p
->second
<< dendl
;
3560 tmp
.pg_upmap_items
.erase(p
);
3561 pending_inc
->old_pg_upmap_items
.insert(pg
);
3573 for (auto pg
: pgs
) {
3574 if (tmp
.pg_upmap
.count(pg
) ||
3575 tmp
.pg_upmap_items
.count(pg
)) {
3576 ldout(cct
, 20) << " already remapped " << pg
<< dendl
;
3579 ldout(cct
, 10) << " trying " << pg
<< dendl
;
3580 vector
<int> orig
, out
;
3581 if (!try_pg_upmap(cct
, pg
, overfull
, underfull
, &orig
, &out
)) {
3584 ldout(cct
, 10) << " " << pg
<< " " << orig
<< " -> " << out
<< dendl
;
3585 if (orig
.size() != out
.size()) {
3588 assert(orig
!= out
);
3589 auto& rmi
= tmp
.pg_upmap_items
[pg
];
3590 for (unsigned i
= 0; i
< out
.size(); ++i
) {
3591 if (orig
[i
] != out
[i
]) {
3592 rmi
.push_back(make_pair(orig
[i
], out
[i
]));
3595 pending_inc
->new_pg_upmap_items
[pg
] = rmi
;
3596 ldout(cct
, 10) << " " << pg
<< " pg_upmap_items " << rmi
<< dendl
;
3606 ldout(cct
, 10) << " failed to find any changes to make" << dendl
;
3610 ldout(cct
, 10) << " hit max iterations, stopping" << dendl
;