1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 */
23 * describe properties of the OSD cluster.
24 * disks, disk groups, total # osds,
33 #include <boost/smart_ptr/local_shared_ptr.hpp>
34 #include "include/btree_map.h"
35 #include "include/common_fwd.h"
36 #include "include/types.h"
37 #include "common/ceph_releases.h"
38 #include "osd_types.h"
40 //#include "include/ceph_features.h"
41 #include "crush/CrushWrapper.h"
43 // forward declaration
45 class health_check_map_t
;
48 * we track up to two intervals during which the osd was alive and
49 * healthy. the most recent is [up_from,up_thru), where up_thru is
50 * the last epoch the osd is known to have _started_. i.e., a lower
51 * bound on the actual osd death. down_at (if it is > up_from) is an
52 * upper bound on the actual osd death.
54 * the second is the last_clean interval [begin,end). in that case,
55 * the last interval is the last epoch known to have been either
56 * _finished_, or during which the osd cleanly shut down. when
57 * possible, we push this forward to the epoch the osd was eventually
60 * the lost_at is used to allow build_prior to proceed without waiting
61 * for an osd to recover. In certain cases, progress may be blocked
62 * because an osd is down that may contain updates (i.e., a pg may have
63 * gone rw during an interval). If the osd can't be brought online, we
64 * can force things to proceed knowing that we _might_ be losing some
65 * acked writes. If the osd comes back to life later, that's fine too,
66 * but those writes will still be lost (the divergent objects will be
70 epoch_t last_clean_begin
; // last interval that ended with a clean osd shutdown
71 epoch_t last_clean_end
;
72 epoch_t up_from
; // epoch osd marked up
73 epoch_t up_thru
; // lower bound on actual osd death (if > up_from)
74 epoch_t down_at
; // upper bound on actual osd death (if > up_from)
75 epoch_t lost_at
; // last epoch we decided data was "lost"
77 osd_info_t() : last_clean_begin(0), last_clean_end(0),
78 up_from(0), up_thru(0), down_at(0), lost_at(0) {}
80 void dump(ceph::Formatter
*f
) const;
81 void encode(ceph::buffer::list
& bl
) const;
82 void decode(ceph::buffer::list::const_iterator
& bl
);
83 static void generate_test_instances(std::list
<osd_info_t
*>& o
);
85 WRITE_CLASS_ENCODER(osd_info_t
)
87 std::ostream
& operator<<(std::ostream
& out
, const osd_info_t
& info
);
90 utime_t down_stamp
; ///< timestamp when we were last marked down
91 float laggy_probability
; ///< encoded as __u32: 0 = definitely not laggy, 0xffffffff definitely laggy
92 __u32 laggy_interval
; ///< average interval between being marked laggy and recovering
93 uint64_t features
; ///< features supported by this osd we should know about
94 __u32 old_weight
; ///< weight prior to being auto marked out
95 utime_t last_purged_snaps_scrub
; ///< last scrub of purged_snaps
96 epoch_t dead_epoch
= 0; ///< last epoch we were confirmed dead (not just down)
98 osd_xinfo_t() : laggy_probability(0), laggy_interval(0),
99 features(0), old_weight(0) {}
101 void dump(ceph::Formatter
*f
) const;
102 void encode(ceph::buffer::list
& bl
, uint64_t features
) const;
103 void decode(ceph::buffer::list::const_iterator
& bl
);
104 static void generate_test_instances(std::list
<osd_xinfo_t
*>& o
);
106 WRITE_CLASS_ENCODER_FEATURES(osd_xinfo_t
)
108 std::ostream
& operator<<(std::ostream
& out
, const osd_xinfo_t
& xi
);
113 ceph::buffer::list data
;
114 typedef btree::btree_map
<pg_t
,ceph_le32
*> map_t
;
117 void encode(ceph::buffer::list
& bl
) const {
119 uint32_t n
= map
.size();
121 for (auto &p
: map
) {
123 bl
.append((char*)p
.second
, (*p
.second
+ 1) * sizeof(ceph_le32
));
126 void decode(ceph::buffer::list::const_iterator
& p
) {
135 size_t start_off
= pstart
.get_off();
136 std::vector
<std::pair
<pg_t
,size_t>> offsets
;
138 for (unsigned i
=0; i
<n
; ++i
) {
141 offsets
[i
].first
= pgid
;
142 offsets
[i
].second
= p
.get_off() - start_off
;
145 p
+= vn
* sizeof(int32_t);
147 size_t len
= p
.get_off() - start_off
;
148 pstart
.copy(len
, data
);
149 if (data
.get_num_buffers() > 1) {
153 char *start
= data
.c_str();
154 for (auto i
: offsets
) {
155 map
.insert(map
.end(), std::make_pair(i
.first
, (ceph_le32
*)(start
+ i
.second
)));
159 ceph::buffer::list bl
;
161 auto p
= std::cbegin(bl
);
164 friend bool operator==(const PGTempMap
& l
, const PGTempMap
& r
) {
166 l
.map
.size() == r
.map
.size() &&
167 l
.data
.contents_equal(r
.data
);
171 map_t::const_iterator it
;
172 map_t::const_iterator end
;
173 std::pair
<pg_t
,std::vector
<int32_t>> current
;
174 void init_current() {
176 current
.first
= it
->first
;
177 ceph_assert(it
->second
);
178 current
.second
.resize(*it
->second
);
179 ceph_le32
*p
= it
->second
+ 1;
180 for (uint32_t n
= 0; n
< *it
->second
; ++n
, ++p
) {
181 current
.second
[n
] = *p
;
186 iterator(map_t::const_iterator p
,
187 map_t::const_iterator e
)
192 const std::pair
<pg_t
,std::vector
<int32_t>>& operator*() const {
195 const std::pair
<pg_t
,std::vector
<int32_t>>* operator->() const {
198 friend bool operator==(const iterator
& l
, const iterator
& r
) {
201 friend bool operator!=(const iterator
& l
, const iterator
& r
) {
204 iterator
& operator++() {
210 iterator
operator++(int) {
218 iterator
begin() const {
219 return iterator(map
.begin(), map
.end());
221 iterator
end() const {
222 return iterator(map
.end(), map
.end());
224 iterator
find(pg_t pgid
) const {
225 return iterator(map
.find(pgid
), map
.end());
227 size_t size() const {
230 size_t count(pg_t pgid
) const {
231 return map
.count(pgid
);
233 void erase(pg_t pgid
) {
240 void set(pg_t pgid
, const mempool::osdmap::vector
<int32_t>& v
) {
242 size_t need
= sizeof(ceph_le32
) * (1 + v
.size());
243 if (need
< data
.get_append_buffer_unused_tail_length()) {
244 ceph::buffer::ptr
z(data
.get_append_buffer_unused_tail_length());
246 data
.append(z
.c_str(), z
.length());
249 map
[pgid
] = (ceph_le32
*)(data
.back().end_c_str()) - (1 + v
.size());
251 mempool::osdmap::vector
<int32_t> get(pg_t pgid
) {
252 mempool::osdmap::vector
<int32_t> v
;
253 ceph_le32
*p
= map
[pgid
];
256 for (size_t i
= 0; i
< n
; ++i
, ++p
) {
262 // trivial implementation
263 mempool::osdmap::map
<pg_t
,mempool::osdmap::vector
<int32_t> > pg_temp
;
265 void encode(ceph::buffer::list
& bl
) const {
268 void decode(ceph::buffer::list::const_iterator
& p
) {
271 friend bool operator==(const PGTempMap
& l
, const PGTempMap
& r
) {
273 l
.pg_temp
.size() == r
.pg_temp
.size() &&
274 l
.pg_temp
== r
.pg_temp
;
278 mempool::osdmap::map
<pg_t
,mempool::osdmap::vector
<int32_t> >::const_iterator it
;
280 iterator(mempool::osdmap::map
<pg_t
,
281 mempool::osdmap::vector
<int32_t> >::const_iterator p
)
284 std::pair
<pg_t
,const mempool::osdmap::vector
<int32_t>&> operator*() const {
287 const std::pair
<const pg_t
,mempool::osdmap::vector
<int32_t>>* operator->() const {
290 friend bool operator==(const iterator
& l
, const iterator
& r
) {
293 friend bool operator!=(const iterator
& l
, const iterator
& r
) {
296 iterator
& operator++() {
300 iterator
operator++(int) {
306 iterator
begin() const {
307 return iterator(pg_temp
.cbegin());
309 iterator
end() const {
310 return iterator(pg_temp
.cend());
312 iterator
find(pg_t pgid
) const {
313 return iterator(pg_temp
.find(pgid
));
315 size_t size() const {
316 return pg_temp
.size();
318 size_t count(pg_t pgid
) const {
319 return pg_temp
.count(pgid
);
321 void erase(pg_t pgid
) {
327 void set(pg_t pgid
, const mempool::osdmap::vector
<int32_t>& v
) {
330 const mempool::osdmap::vector
<int32_t>& get(pg_t pgid
) {
331 return pg_temp
.at(pgid
);
334 void dump(ceph::Formatter
*f
) const {
335 for (const auto &pg
: *this) {
336 f
->open_object_section("osds");
337 f
->dump_stream("pgid") << pg
.first
;
338 f
->open_array_section("osds");
339 for (const auto osd
: pg
.second
)
340 f
->dump_int("osd", osd
);
346 WRITE_CLASS_ENCODER(PGTempMap
)
352 MEMPOOL_CLASS_HELPERS();
356 MEMPOOL_CLASS_HELPERS();
358 /// feature bits we were encoded with. the subsequent OSDMap
359 /// encoding should match.
360 uint64_t encode_features
;
362 epoch_t epoch
; // new epoch; we are a diff from epoch-1 to epoch
364 int64_t new_pool_max
; //incremented by the OSDMonitor on each pool create
366 ceph_release_t new_require_osd_release
{0xff};
367 uint32_t new_stretch_bucket_count
{0};
368 uint32_t new_degraded_stretch_mode
{0};
369 uint32_t new_recovering_stretch_mode
{0};
370 int32_t new_stretch_mode_bucket
{0};
371 bool stretch_mode_enabled
{false};
372 bool change_stretch_mode
{false};
374 enum class mutate_allow_crimson_t
: uint8_t {
377 // Monitor won't allow CLEAR to be set currently, but we may allow it later
379 } mutate_allow_crimson
= mutate_allow_crimson_t::NONE
;
382 ceph::buffer::list fullmap
; // in lieu of below.
383 ceph::buffer::list crush
;
387 mempool::osdmap::map
<int64_t,pg_pool_t
> new_pools
;
388 mempool::osdmap::map
<int64_t,std::string
> new_pool_names
;
389 mempool::osdmap::set
<int64_t> old_pools
;
390 mempool::osdmap::map
<std::string
,std::map
<std::string
,std::string
> > new_erasure_code_profiles
;
391 mempool::osdmap::vector
<std::string
> old_erasure_code_profiles
;
392 mempool::osdmap::map
<int32_t,entity_addrvec_t
> new_up_client
;
393 mempool::osdmap::map
<int32_t,entity_addrvec_t
> new_up_cluster
;
394 mempool::osdmap::map
<int32_t,uint32_t> new_state
; // XORed onto previous state.
395 mempool::osdmap::map
<int32_t,uint32_t> new_weight
;
396 mempool::osdmap::map
<pg_t
,mempool::osdmap::vector
<int32_t> > new_pg_temp
; // [] to remove
397 mempool::osdmap::map
<pg_t
, int32_t> new_primary_temp
; // [-1] to remove
398 mempool::osdmap::map
<int32_t,uint32_t> new_primary_affinity
;
399 mempool::osdmap::map
<int32_t,epoch_t
> new_up_thru
;
400 mempool::osdmap::map
<int32_t,std::pair
<epoch_t
,epoch_t
> > new_last_clean_interval
;
401 mempool::osdmap::map
<int32_t,epoch_t
> new_lost
;
402 mempool::osdmap::map
<int32_t,uuid_d
> new_uuid
;
403 mempool::osdmap::map
<int32_t,osd_xinfo_t
> new_xinfo
;
405 mempool::osdmap::map
<entity_addr_t
,utime_t
> new_blocklist
;
406 mempool::osdmap::vector
<entity_addr_t
> old_blocklist
;
407 mempool::osdmap::map
<entity_addr_t
,utime_t
> new_range_blocklist
;
408 mempool::osdmap::vector
<entity_addr_t
> old_range_blocklist
;
409 mempool::osdmap::map
<int32_t, entity_addrvec_t
> new_hb_back_up
;
410 mempool::osdmap::map
<int32_t, entity_addrvec_t
> new_hb_front_up
;
412 mempool::osdmap::map
<pg_t
,mempool::osdmap::vector
<int32_t>> new_pg_upmap
;
413 mempool::osdmap::map
<pg_t
,mempool::osdmap::vector
<std::pair
<int32_t,int32_t>>> new_pg_upmap_items
;
414 mempool::osdmap::map
<pg_t
, int32_t> new_pg_upmap_primary
;
415 mempool::osdmap::set
<pg_t
> old_pg_upmap
, old_pg_upmap_items
, old_pg_upmap_primary
;
416 mempool::osdmap::map
<int64_t, snap_interval_set_t
> new_removed_snaps
;
417 mempool::osdmap::map
<int64_t, snap_interval_set_t
> new_purged_snaps
;
419 mempool::osdmap::map
<int32_t,uint32_t> new_crush_node_flags
;
420 mempool::osdmap::map
<int32_t,uint32_t> new_device_class_flags
;
422 std::string cluster_snapshot
;
424 float new_nearfull_ratio
= -1;
425 float new_backfillfull_ratio
= -1;
426 float new_full_ratio
= -1;
428 ceph_release_t new_require_min_compat_client
{0xff};
430 utime_t new_last_up_change
, new_last_in_change
;
432 mutable bool have_crc
; ///< crc values are defined
433 uint32_t full_crc
; ///< crc of the resulting OSDMap
434 mutable uint32_t inc_crc
; ///< crc of this incremental
436 int get_net_marked_out(const OSDMap
*previous
) const;
437 int get_net_marked_down(const OSDMap
*previous
) const;
438 int identify_osd(uuid_d u
) const;
440 void encode_client_old(ceph::buffer::list
& bl
) const;
441 void encode_classic(ceph::buffer::list
& bl
, uint64_t features
) const;
442 void encode(ceph::buffer::list
& bl
, uint64_t features
=CEPH_FEATURES_ALL
) const;
443 void decode_classic(ceph::buffer::list::const_iterator
&p
);
444 void decode(ceph::buffer::list::const_iterator
&bl
);
445 void dump(ceph::Formatter
*f
) const;
446 static void generate_test_instances(std::list
<Incremental
*>& o
);
448 explicit Incremental(epoch_t e
=0) :
450 epoch(e
), new_pool_max(-1), new_flags(-1), new_max_osd(-1),
451 have_crc(false), full_crc(0), inc_crc(0) {
453 explicit Incremental(ceph::buffer::list
&bl
) {
454 auto p
= std::cbegin(bl
);
457 explicit Incremental(ceph::buffer::list::const_iterator
&p
) {
461 pg_pool_t
*get_new_pool(int64_t pool
, const pg_pool_t
*orig
) {
462 if (new_pools
.count(pool
) == 0)
463 new_pools
[pool
] = *orig
;
464 return &new_pools
[pool
];
466 bool has_erasure_code_profile(const std::string
&name
) const {
467 auto i
= new_erasure_code_profiles
.find(name
);
468 return i
!= new_erasure_code_profiles
.end();
470 void set_erasure_code_profile(const std::string
&name
,
471 const std::map
<std::string
,std::string
>& profile
) {
472 new_erasure_code_profiles
[name
] = profile
;
474 mempool::osdmap::map
<std::string
,std::map
<std::string
,std::string
>> get_erasure_code_profiles() const {
475 return new_erasure_code_profiles
;
478 /// propagate update pools' (snap and other) metadata to any of their tiers
479 int propagate_base_properties_to_tiers(CephContext
*cct
, const OSDMap
&base
);
481 /// filter out osds with any pending state changing
482 size_t get_pending_state_osds(std::vector
<int> *osds
) {
486 for (auto &p
: new_state
) {
487 osds
->push_back(p
.first
);
493 bool pending_osd_has_state(int osd
, unsigned state
) {
494 return new_state
.count(osd
) && (new_state
[osd
] & state
) != 0;
497 bool pending_osd_state_set(int osd
, unsigned state
) {
498 if (pending_osd_has_state(osd
, state
))
500 new_state
[osd
] |= state
;
504 // cancel the specified pending osd state if there is any
505 // return ture on success, false otherwise.
506 bool pending_osd_state_clear(int osd
, unsigned state
) {
507 if (!pending_osd_has_state(osd
, state
)) {
508 // never has been set or already has been cancelled.
512 new_state
[osd
] &= ~state
;
513 if (!new_state
[osd
]) {
515 new_state
.erase(osd
);
520 bool in_new_removed_snaps(int64_t pool
, snapid_t snap
) const {
521 auto p
= new_removed_snaps
.find(pool
);
522 if (p
== new_removed_snaps
.end()) {
525 return p
->second
.contains(snap
);
528 void set_allow_crimson() { mutate_allow_crimson
= mutate_allow_crimson_t::SET
; }
533 epoch_t epoch
; // what epoch of the osd cluster descriptor is this
534 utime_t created
, modified
; // epoch start time
535 int32_t pool_max
; // the largest pool num, ever
539 int num_osd
; // not saved; see calc_num_osds
540 int num_up_osd
; // not saved; see calc_num_osds
541 int num_in_osd
; // not saved; see calc_num_osds
544 std::vector
<uint32_t> osd_state
;
546 mempool::osdmap::map
<int32_t,uint32_t> crush_node_flags
; // crush node -> CEPH_OSD_* flags
547 mempool::osdmap::map
<int32_t,uint32_t> device_class_flags
; // device class -> CEPH_OSD_* flags
549 utime_t last_up_change
, last_in_change
;
551 // These features affect OSDMap[::Incremental] encoding, or the
552 // encoding of some type embedded therein (CrushWrapper, something
553 // from osd_types, etc.).
554 static constexpr uint64_t SIGNIFICANT_FEATURES
=
555 CEPH_FEATUREMASK_PGID64
|
556 CEPH_FEATUREMASK_PGPOOL3
|
557 CEPH_FEATUREMASK_OSDENC
|
558 CEPH_FEATUREMASK_OSDMAP_ENC
|
559 CEPH_FEATUREMASK_OSD_POOLRESEND
|
560 CEPH_FEATUREMASK_NEW_OSDOP_ENCODING
|
561 CEPH_FEATUREMASK_MSG_ADDR2
|
562 CEPH_FEATUREMASK_CRUSH_TUNABLES5
|
563 CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS
|
564 CEPH_FEATUREMASK_SERVER_LUMINOUS
|
565 CEPH_FEATUREMASK_SERVER_MIMIC
|
566 CEPH_FEATUREMASK_SERVER_NAUTILUS
|
567 CEPH_FEATUREMASK_SERVER_OCTOPUS
|
568 CEPH_FEATUREMASK_SERVER_REEF
;
571 mempool::osdmap::vector
<std::shared_ptr
<entity_addrvec_t
> > client_addrs
;
572 mempool::osdmap::vector
<std::shared_ptr
<entity_addrvec_t
> > cluster_addrs
;
573 mempool::osdmap::vector
<std::shared_ptr
<entity_addrvec_t
> > hb_back_addrs
;
574 mempool::osdmap::vector
<std::shared_ptr
<entity_addrvec_t
> > hb_front_addrs
;
576 std::shared_ptr
<addrs_s
> osd_addrs
;
578 entity_addrvec_t _blank_addrvec
;
580 mempool::osdmap::vector
<__u32
> osd_weight
; // 16.16 fixed point, 0x10000 = "in", 0 = "out"
581 mempool::osdmap::vector
<osd_info_t
> osd_info
;
582 std::shared_ptr
<PGTempMap
> pg_temp
; // temp pg mapping (e.g. while we rebuild)
583 std::shared_ptr
< mempool::osdmap::map
<pg_t
,int32_t > > primary_temp
; // temp primary mapping (e.g. while we rebuild)
584 std::shared_ptr
< mempool::osdmap::vector
<__u32
> > osd_primary_affinity
; ///< 16.16 fixed point, 0x10000 = baseline
586 // remap (post-CRUSH, pre-up)
587 mempool::osdmap::map
<pg_t
,mempool::osdmap::vector
<int32_t>> pg_upmap
; ///< remap pg
588 mempool::osdmap::map
<pg_t
,mempool::osdmap::vector
<std::pair
<int32_t,int32_t>>> pg_upmap_items
; ///< remap osds in up set
589 mempool::osdmap::map
<pg_t
, int32_t> pg_upmap_primaries
; ///< remap primary of a pg
591 mempool::osdmap::map
<int64_t,pg_pool_t
> pools
;
592 mempool::osdmap::map
<int64_t,std::string
> pool_name
;
593 mempool::osdmap::map
<std::string
, std::map
<std::string
,std::string
>> erasure_code_profiles
;
594 mempool::osdmap::map
<std::string
,int64_t, std::less
<>> name_pool
;
596 std::shared_ptr
< mempool::osdmap::vector
<uuid_d
> > osd_uuid
;
597 mempool::osdmap::vector
<osd_xinfo_t
> osd_xinfo
;
601 uint64_t upper_64_bits
, lower_64_bits
;
602 uint64_t upper_mask
, lower_mask
;
613 static void get_ipv6_bytes(unsigned const char *addr
,
614 uint64_t *upper
, uint64_t *lower
);
617 range_bits(const entity_addr_t
& addr
);
618 void parse(const entity_addr_t
& addr
);
619 bool matches(const entity_addr_t
& addr
) const;
621 mempool::osdmap::unordered_map
<entity_addr_t
,utime_t
> blocklist
;
622 mempool::osdmap::map
<entity_addr_t
,utime_t
> range_blocklist
;
623 mempool::osdmap::map
<entity_addr_t
,range_bits
> calculated_ranges
;
625 /// queue of snaps to remove
626 mempool::osdmap::map
<int64_t, snap_interval_set_t
> removed_snaps_queue
;
628 /// removed_snaps additions this epoch
629 mempool::osdmap::map
<int64_t, snap_interval_set_t
> new_removed_snaps
;
631 /// removed_snaps removals this epoch
632 mempool::osdmap::map
<int64_t, snap_interval_set_t
> new_purged_snaps
;
634 epoch_t cluster_snapshot_epoch
;
635 std::string cluster_snapshot
;
636 bool new_blocklist_entries
;
638 float full_ratio
= 0, backfillfull_ratio
= 0, nearfull_ratio
= 0;
640 /// min compat client we want to support
641 ceph_release_t require_min_compat_client
{ceph_release_t::unknown
};
644 /// require osds to run at least this release
645 ceph_release_t require_osd_release
{ceph_release_t::unknown
};
648 mutable uint64_t cached_up_osd_features
;
650 mutable bool crc_defined
;
651 mutable uint32_t crc
;
653 void _calc_up_osd_features();
656 bool have_crc() const { return crc_defined
; }
657 uint32_t get_crc() const { return crc
; }
659 std::shared_ptr
<CrushWrapper
> crush
; // hierarchical map
660 bool stretch_mode_enabled
; // we are in stretch mode, requiring multiple sites
661 uint32_t stretch_bucket_count
; // number of sites we expect to be in
662 uint32_t degraded_stretch_mode
; // 0 if not degraded; else count of up sites
663 uint32_t recovering_stretch_mode
; // 0 if not recovering; else 1
664 int32_t stretch_mode_bucket
; // the bucket type we're stretched across
665 bool allow_crimson
{false};
667 uint32_t crush_version
= 1;
669 friend class OSDMonitor
;
675 num_osd(0), num_up_osd(0), num_in_osd(0),
677 osd_addrs(std::make_shared
<addrs_s
>()),
678 pg_temp(std::make_shared
<PGTempMap
>()),
679 primary_temp(std::make_shared
<mempool::osdmap::map
<pg_t
,int32_t>>()),
680 osd_uuid(std::make_shared
<mempool::osdmap::vector
<uuid_d
>>()),
681 cluster_snapshot_epoch(0),
682 new_blocklist_entries(false),
683 cached_up_osd_features(0),
684 crc_defined(false), crc(0),
685 crush(std::make_shared
<CrushWrapper
>()),
686 stretch_mode_enabled(false), stretch_bucket_count(0),
687 degraded_stretch_mode(0), recovering_stretch_mode(0), stretch_mode_bucket(0) {
691 OSDMap(const OSDMap
& other
) = default;
692 OSDMap
& operator=(const OSDMap
& other
) = default;
695 /// return feature mask subset that is relevant to OSDMap encoding
696 static uint64_t get_significant_features(uint64_t features
) {
697 return SIGNIFICANT_FEATURES
& features
;
700 uint64_t get_encoding_features() const;
702 void deepish_copy_from(const OSDMap
& o
) {
704 primary_temp
.reset(new mempool::osdmap::map
<pg_t
,int32_t>(*o
.primary_temp
));
705 pg_temp
.reset(new PGTempMap(*o
.pg_temp
));
706 osd_uuid
.reset(new mempool::osdmap::vector
<uuid_d
>(*o
.osd_uuid
));
708 if (o
.osd_primary_affinity
)
709 osd_primary_affinity
.reset(new mempool::osdmap::vector
<__u32
>(*o
.osd_primary_affinity
));
711 // NOTE: this still references shared entity_addrvec_t's.
712 osd_addrs
.reset(new addrs_s(*o
.osd_addrs
));
714 // NOTE: we do not copy crush. note that apply_incremental will
715 // allocate a new CrushWrapper, though.
719 const uuid_d
& get_fsid() const { return fsid
; }
720 void set_fsid(uuid_d
& f
) { fsid
= f
; }
722 epoch_t
get_epoch() const { return epoch
; }
723 void inc_epoch() { epoch
++; }
725 void set_epoch(epoch_t e
);
727 uint32_t get_crush_version() const {
728 return crush_version
;
732 const utime_t
& get_created() const { return created
; }
733 const utime_t
& get_modified() const { return modified
; }
735 bool is_blocklisted(const entity_addr_t
& a
, CephContext
*cct
=nullptr) const;
736 bool is_blocklisted(const entity_addrvec_t
& a
, CephContext
*cct
=nullptr) const;
737 void get_blocklist(std::list
<std::pair
<entity_addr_t
,utime_t
> > *bl
,
738 std::list
<std::pair
<entity_addr_t
,utime_t
> > *rl
) const;
739 void get_blocklist(std::set
<entity_addr_t
> *bl
,
740 std::set
<entity_addr_t
> *rl
) const;
742 std::string
get_cluster_snapshot() const {
743 if (cluster_snapshot_epoch
== epoch
)
744 return cluster_snapshot
;
745 return std::string();
748 float get_full_ratio() const {
751 float get_backfillfull_ratio() const {
752 return backfillfull_ratio
;
754 float get_nearfull_ratio() const {
755 return nearfull_ratio
;
757 void get_full_pools(CephContext
*cct
,
758 std::set
<int64_t> *full
,
759 std::set
<int64_t> *backfillfull
,
760 std::set
<int64_t> *nearfull
) const;
761 void get_full_osd_counts(std::set
<int> *full
, std::set
<int> *backfill
,
762 std::set
<int> *nearfull
) const;
765 /***** cluster state *****/
767 int get_max_osd() const { return max_osd
; }
768 void set_max_osd(int m
);
770 unsigned get_num_osds() const {
773 unsigned get_num_up_osds() const {
776 unsigned get_num_in_osds() const {
779 /// recalculate cached values for get_num{,_up,_in}_osds
782 void get_all_osds(std::set
<int32_t>& ls
) const;
783 void get_up_osds(std::set
<int32_t>& ls
) const;
784 void get_out_existing_osds(std::set
<int32_t>& ls
) const;
785 unsigned get_num_pg_temp() const {
786 return pg_temp
->size();
789 int get_flags() const { return flags
; }
790 bool test_flag(int f
) const { return flags
& f
; }
791 void set_flag(int f
) { flags
|= f
; }
792 void clear_flag(int f
) { flags
&= ~f
; }
794 void get_flag_set(std::set
<std::string
> *flagset
) const;
796 static void calc_state_set(int state
, std::set
<std::string
>& st
);
798 int get_state(int o
) const {
799 ceph_assert(o
< max_osd
);
802 int get_state(int o
, std::set
<std::string
>& st
) const {
803 ceph_assert(o
< max_osd
);
804 unsigned t
= osd_state
[o
];
805 calc_state_set(t
, st
);
808 void set_state(int o
, unsigned s
) {
809 ceph_assert(o
< max_osd
);
812 void set_weight(int o
, unsigned w
) {
813 ceph_assert(o
< max_osd
);
816 osd_state
[o
] |= CEPH_OSD_EXISTS
;
818 unsigned get_weight(int o
) const {
819 ceph_assert(o
< max_osd
);
820 return osd_weight
[o
];
822 float get_weightf(int o
) const {
823 return (float)get_weight(o
) / (float)CEPH_OSD_IN
;
825 void adjust_osd_weights(const std::map
<int,double>& weights
, Incremental
& inc
) const;
827 void set_primary_affinity(int o
, int w
) {
828 ceph_assert(o
< max_osd
);
829 if (!osd_primary_affinity
)
830 osd_primary_affinity
.reset(
831 new mempool::osdmap::vector
<__u32
>(
832 max_osd
, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
));
833 (*osd_primary_affinity
)[o
] = w
;
835 unsigned get_primary_affinity(int o
) const {
836 ceph_assert(o
< max_osd
);
837 if (!osd_primary_affinity
)
838 return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
;
839 return (*osd_primary_affinity
)[o
];
841 float get_primary_affinityf(int o
) const {
842 return (float)get_primary_affinity(o
) / (float)CEPH_OSD_MAX_PRIMARY_AFFINITY
;
845 bool has_erasure_code_profile(const std::string
&name
) const {
846 auto i
= erasure_code_profiles
.find(name
);
847 return i
!= erasure_code_profiles
.end();
849 int get_erasure_code_profile_default(CephContext
*cct
,
850 std::map
<std::string
,std::string
> &profile_map
,
852 void set_erasure_code_profile(const std::string
&name
,
853 const std::map
<std::string
,std::string
>& profile
) {
854 erasure_code_profiles
[name
] = profile
;
856 const std::map
<std::string
,std::string
> &get_erasure_code_profile(
857 const std::string
&name
) const {
858 static std::map
<std::string
,std::string
> empty
;
859 auto i
= erasure_code_profiles
.find(name
);
860 if (i
== erasure_code_profiles
.end())
865 const mempool::osdmap::map
<std::string
,std::map
<std::string
,std::string
>> &get_erasure_code_profiles() const {
866 return erasure_code_profiles
;
869 bool get_allow_crimson() const {
870 return allow_crimson
;
873 bool exists(int osd
) const {
875 return osd
>= 0 && osd
< max_osd
&& (osd_state
[osd
] & CEPH_OSD_EXISTS
);
878 bool is_destroyed(int osd
) const {
879 return exists(osd
) && (osd_state
[osd
] & CEPH_OSD_DESTROYED
);
882 bool is_up(int osd
) const {
883 return exists(osd
) && (osd_state
[osd
] & CEPH_OSD_UP
);
886 bool has_been_up_since(int osd
, epoch_t epoch
) const {
887 return is_up(osd
) && get_up_from(osd
) <= epoch
;
890 bool is_down(int osd
) const {
894 bool is_stop(int osd
) const {
895 return exists(osd
) && is_down(osd
) &&
896 (osd_state
[osd
] & CEPH_OSD_STOP
);
899 bool is_out(int osd
) const {
900 return !exists(osd
) || get_weight(osd
) == CEPH_OSD_OUT
;
903 bool is_in(int osd
) const {
907 bool is_dead(int osd
) const {
909 return false; // unclear if they know they are removed from map
911 return get_xinfo(osd
).dead_epoch
> get_info(osd
).up_from
;
914 unsigned get_osd_crush_node_flags(int osd
) const;
915 unsigned get_crush_node_flags(int id
) const;
916 unsigned get_device_class_flags(int id
) const;
918 bool is_noup_by_osd(int osd
) const {
919 return exists(osd
) && (osd_state
[osd
] & CEPH_OSD_NOUP
);
922 bool is_nodown_by_osd(int osd
) const {
923 return exists(osd
) && (osd_state
[osd
] & CEPH_OSD_NODOWN
);
926 bool is_noin_by_osd(int osd
) const {
927 return exists(osd
) && (osd_state
[osd
] & CEPH_OSD_NOIN
);
930 bool is_noout_by_osd(int osd
) const {
931 return exists(osd
) && (osd_state
[osd
] & CEPH_OSD_NOOUT
);
934 bool is_noup(int osd
) const {
935 if (test_flag(CEPH_OSDMAP_NOUP
)) // global?
937 if (is_noup_by_osd(osd
)) // by osd?
939 if (get_osd_crush_node_flags(osd
) & CEPH_OSD_NOUP
) // by crush-node?
941 if (auto class_id
= crush
->get_item_class_id(osd
); class_id
>= 0 &&
942 get_device_class_flags(class_id
) & CEPH_OSD_NOUP
) // by device-class?
947 bool is_nodown(int osd
) const {
948 if (test_flag(CEPH_OSDMAP_NODOWN
))
950 if (is_nodown_by_osd(osd
))
952 if (get_osd_crush_node_flags(osd
) & CEPH_OSD_NODOWN
)
954 if (auto class_id
= crush
->get_item_class_id(osd
); class_id
>= 0 &&
955 get_device_class_flags(class_id
) & CEPH_OSD_NODOWN
)
960 bool is_noin(int osd
) const {
961 if (test_flag(CEPH_OSDMAP_NOIN
))
963 if (is_noin_by_osd(osd
))
965 if (get_osd_crush_node_flags(osd
) & CEPH_OSD_NOIN
)
967 if (auto class_id
= crush
->get_item_class_id(osd
); class_id
>= 0 &&
968 get_device_class_flags(class_id
) & CEPH_OSD_NOIN
)
973 bool is_noout(int osd
) const {
974 if (test_flag(CEPH_OSDMAP_NOOUT
))
976 if (is_noout_by_osd(osd
))
978 if (get_osd_crush_node_flags(osd
) & CEPH_OSD_NOOUT
)
980 if (auto class_id
= crush
->get_item_class_id(osd
); class_id
>= 0 &&
981 get_device_class_flags(class_id
) & CEPH_OSD_NOOUT
)
987 * check if an entire crush subtree is down
989 bool subtree_is_down(int id
, std::set
<int> *down_cache
) const;
990 bool containing_subtree_is_down(CephContext
*cct
, int osd
, int subtree_type
, std::set
<int> *down_cache
) const;
992 bool subtree_type_is_down(CephContext
*cct
, int id
, int subtree_type
, std::set
<int> *down_in_osds
, std::set
<int> *up_in_osds
,
993 std::set
<int> *subtree_up
, std::unordered_map
<int, std::set
<int> > *subtree_type_down
) const;
995 int identify_osd(const entity_addr_t
& addr
) const;
996 int identify_osd(const uuid_d
& u
) const;
997 int identify_osd_on_all_channels(const entity_addr_t
& addr
) const;
999 bool have_addr(const entity_addr_t
& addr
) const {
1000 return identify_osd(addr
) >= 0;
1002 int find_osd_on_ip(const entity_addr_t
& ip
) const;
1004 const entity_addrvec_t
& get_addrs(int osd
) const {
1005 ceph_assert(exists(osd
));
1006 return osd_addrs
->client_addrs
[osd
] ?
1007 *osd_addrs
->client_addrs
[osd
] : _blank_addrvec
;
1009 const entity_addrvec_t
& get_most_recent_addrs(int osd
) const {
1010 return get_addrs(osd
);
1012 const entity_addrvec_t
&get_cluster_addrs(int osd
) const {
1013 ceph_assert(exists(osd
));
1014 return osd_addrs
->cluster_addrs
[osd
] ?
1015 *osd_addrs
->cluster_addrs
[osd
] : _blank_addrvec
;
1017 const entity_addrvec_t
&get_hb_back_addrs(int osd
) const {
1018 ceph_assert(exists(osd
));
1019 return osd_addrs
->hb_back_addrs
[osd
] ?
1020 *osd_addrs
->hb_back_addrs
[osd
] : _blank_addrvec
;
1022 const entity_addrvec_t
&get_hb_front_addrs(int osd
) const {
1023 ceph_assert(exists(osd
));
1024 return osd_addrs
->hb_front_addrs
[osd
] ?
1025 *osd_addrs
->hb_front_addrs
[osd
] : _blank_addrvec
;
1028 const uuid_d
& get_uuid(int osd
) const {
1029 ceph_assert(exists(osd
));
1030 return (*osd_uuid
)[osd
];
1033 const epoch_t
& get_up_from(int osd
) const {
1034 ceph_assert(exists(osd
));
1035 return osd_info
[osd
].up_from
;
1037 const epoch_t
& get_up_thru(int osd
) const {
1038 ceph_assert(exists(osd
));
1039 return osd_info
[osd
].up_thru
;
1041 const epoch_t
& get_down_at(int osd
) const {
1042 ceph_assert(exists(osd
));
1043 return osd_info
[osd
].down_at
;
1045 const osd_info_t
& get_info(int osd
) const {
1046 ceph_assert(osd
< max_osd
);
1047 return osd_info
[osd
];
1050 const osd_xinfo_t
& get_xinfo(int osd
) const {
1051 ceph_assert(osd
< max_osd
);
1052 return osd_xinfo
[osd
];
1055 int get_next_up_osd_after(int n
) const {
1056 if (get_max_osd() == 0)
1058 for (int i
= n
+ 1; i
!= n
; ++i
) {
1059 if (i
>= get_max_osd())
1069 int get_previous_up_osd_before(int n
) const {
1070 if (get_max_osd() == 0)
1072 for (int i
= n
- 1; i
!= n
; --i
) {
1074 i
= get_max_osd() - 1;
1084 void get_random_up_osds_by_subtree(int n
, // whoami
1085 std::string
&subtree
,
1086 int limit
, // how many
1088 std::set
<int> *want
) const;
1091 * get feature bits required by the current structure
1093 * @param entity_type [in] what entity type we are asking about
1094 * @param mask [out] std::set of all possible map-related features we could std::set
1095 * @return feature bits used by this map
1097 uint64_t get_features(int entity_type
, uint64_t *mask
) const;
1100 * get oldest *client* version (firefly, hammer, etc.) that can connect given
1101 * the feature bits required (according to get_features()).
1103 ceph_release_t
get_min_compat_client() const;
1106 * gets the required minimum *client* version that can connect to the cluster.
1108 ceph_release_t
get_require_min_compat_client() const;
1111 * get intersection of features supported by up osds
1113 uint64_t get_up_osd_features() const;
1115 void get_upmap_pgs(std::vector
<pg_t
> *upmap_pgs
) const;
1116 bool check_pg_upmaps(
1118 const std::vector
<pg_t
>& to_check
,
1119 std::vector
<pg_t
> *to_cancel
,
1120 std::map
<pg_t
, mempool::osdmap::vector
<std::pair
<int,int>>> *to_remap
) const;
1121 void clean_pg_upmaps(
1123 Incremental
*pending_inc
,
1124 const std::vector
<pg_t
>& to_cancel
,
1125 const std::map
<pg_t
, mempool::osdmap::vector
<std::pair
<int,int>>>& to_remap
) const;
1126 bool clean_pg_upmaps(CephContext
*cct
, Incremental
*pending_inc
) const;
1128 int apply_incremental(const Incremental
&inc
);
1130 /// try to re-use/reference addrs in oldmap from newmap
1131 static void dedup(const OSDMap
*oldmap
, OSDMap
*newmap
);
1133 static void clean_temps(CephContext
*cct
,
1134 const OSDMap
& oldmap
,
1135 const OSDMap
& nextmap
,
1136 Incremental
*pending_inc
);
1138 // serialize, unserialize
1140 void encode_client_old(ceph::buffer::list
& bl
) const;
1141 void encode_classic(ceph::buffer::list
& bl
, uint64_t features
) const;
1142 void decode_classic(ceph::buffer::list::const_iterator
& p
);
1145 void encode(ceph::buffer::list
& bl
, uint64_t features
=CEPH_FEATURES_ALL
) const;
1146 void decode(ceph::buffer::list
& bl
);
1147 void decode(ceph::buffer::list::const_iterator
& bl
);
1150 /**** mapping facilities ****/
1153 const std::string
& name
,
1154 const std::string
& key
,
1155 const std::string
& nspace
,
1157 int object_locator_to_pg(const object_t
& oid
, const object_locator_t
& loc
,
1159 pg_t
object_locator_to_pg(const object_t
& oid
,
1160 const object_locator_t
& loc
) const {
1162 int ret
= object_locator_to_pg(oid
, loc
, pg
);
1163 ceph_assert(ret
== 0);
1168 static object_locator_t
file_to_object_locator(const file_layout_t
& layout
) {
1169 return object_locator_t(layout
.pool_id
, layout
.pool_ns
);
1172 ceph_object_layout
file_to_object_layout(object_t oid
,
1173 file_layout_t
& layout
) const {
1174 return make_object_layout(oid
, layout
.pool_id
, layout
.pool_ns
);
1177 ceph_object_layout
make_object_layout(object_t oid
, int pg_pool
,
1178 std::string nspace
) const;
1180 int get_pg_num(int pg_pool
) const
1182 const pg_pool_t
*pool
= get_pg_pool(pg_pool
);
1183 ceph_assert(NULL
!= pool
);
1184 return pool
->get_pg_num();
1187 bool pg_exists(pg_t pgid
) const {
1188 const pg_pool_t
*p
= get_pg_pool(pgid
.pool());
1189 return p
&& pgid
.ps() < p
->get_pg_num();
1192 int get_pg_pool_min_size(pg_t pgid
) const {
1193 if (!pg_exists(pgid
)) {
1196 const pg_pool_t
*p
= get_pg_pool(pgid
.pool());
1198 return p
->get_min_size();
1201 int get_pg_pool_size(pg_t pgid
) const {
1202 if (!pg_exists(pgid
)) {
1205 const pg_pool_t
*p
= get_pg_pool(pgid
.pool());
1207 return p
->get_size();
1210 int get_pg_pool_crush_rule(pg_t pgid
) const {
1211 if (!pg_exists(pgid
)) {
1214 const pg_pool_t
*p
= get_pg_pool(pgid
.pool());
1216 return p
->get_crush_rule();
1220 /// pg -> (raw osd std::list)
1221 void _pg_to_raw_osds(
1222 const pg_pool_t
& pool
, pg_t pg
,
1223 std::vector
<int> *osds
,
1225 int _pick_primary(const std::vector
<int>& osds
) const;
1226 void _remove_nonexistent_osds(const pg_pool_t
& pool
, std::vector
<int>& osds
) const;
1228 void _apply_primary_affinity(ps_t seed
, const pg_pool_t
& pool
,
1229 std::vector
<int> *osds
, int *primary
) const;
1231 /// apply pg_upmap[_items] mappings
1232 void _apply_upmap(const pg_pool_t
& pi
, pg_t pg
, std::vector
<int> *raw
) const;
1234 /// pg -> (up osd std::list)
1235 void _raw_to_up_osds(const pg_pool_t
& pool
, const std::vector
<int>& raw
,
1236 std::vector
<int> *up
) const;
1240 * Get the pg and primary temp, if they are specified.
1241 * @param temp_pg [out] Will be empty or contain the temp PG mapping on return
1242 * @param temp_primary [out] Will be the value in primary_temp, or a value derived
1243 * from the pg_temp (if specified), or -1 if you should use the calculated (up_)primary.
1245 void _get_temp_osds(const pg_pool_t
& pool
, pg_t pg
,
1246 std::vector
<int> *temp_pg
, int *temp_primary
) const;
1249 * map to up and acting. Fills in whatever fields are non-NULL.
1251 void _pg_to_up_acting_osds(const pg_t
& pg
, std::vector
<int> *up
, int *up_primary
,
1252 std::vector
<int> *acting
, int *acting_primary
,
1253 bool raw_pg_to_pg
= true) const;
1257 * This is suitable only for looking at raw CRUSH outputs. It skips
1258 * applying the temp and up checks and should not be used
1259 * by anybody for data mapping purposes.
1260 * raw and primary must be non-NULL
1262 void pg_to_raw_osds(pg_t pg
, std::vector
<int> *raw
, int *primary
) const;
1263 void pg_to_raw_upmap(pg_t pg
, std::vector
<int> *raw
,
1264 std::vector
<int> *raw_upmap
) const;
1265 /// map a pg to its acting set. @return acting set size
1266 void pg_to_acting_osds(const pg_t
& pg
, std::vector
<int> *acting
,
1267 int *acting_primary
) const {
1268 _pg_to_up_acting_osds(pg
, NULL
, NULL
, acting
, acting_primary
);
1270 void pg_to_acting_osds(pg_t pg
, std::vector
<int>& acting
) const {
1271 return pg_to_acting_osds(pg
, &acting
, NULL
);
1274 * This does not apply temp overrides and should not be used
1275 * by anybody for data mapping purposes. Specify both pointers.
1277 void pg_to_raw_up(pg_t pg
, std::vector
<int> *up
, int *primary
) const;
1279 * map a pg to its acting set as well as its up set. You must use
1280 * the acting set for data mapping purposes, but some users will
1281 * also find the up set useful for things like deciding what to
1283 * Each of these pointers must be non-NULL.
1285 void pg_to_up_acting_osds(pg_t pg
, std::vector
<int> *up
, int *up_primary
,
1286 std::vector
<int> *acting
, int *acting_primary
) const {
1287 _pg_to_up_acting_osds(pg
, up
, up_primary
, acting
, acting_primary
);
1289 void pg_to_up_acting_osds(pg_t pg
, std::vector
<int>& up
, std::vector
<int>& acting
) const {
1290 int up_primary
, acting_primary
;
1291 pg_to_up_acting_osds(pg
, &up
, &up_primary
, &acting
, &acting_primary
);
1293 bool pg_is_ec(pg_t pg
) const {
1294 auto i
= pools
.find(pg
.pool());
1295 ceph_assert(i
!= pools
.end());
1296 return i
->second
.is_erasure();
1298 bool get_primary_shard(const pg_t
& pgid
, spg_t
*out
) const {
1299 auto i
= get_pools().find(pgid
.pool());
1300 if (i
== get_pools().end()) {
1303 if (!i
->second
.is_erasure()) {
1308 std::vector
<int> acting
;
1309 pg_to_acting_osds(pgid
, &acting
, &primary
);
1310 for (uint8_t i
= 0; i
< acting
.size(); ++i
) {
1311 if (acting
[i
] == primary
) {
1312 *out
= spg_t(pgid
, shard_id_t(i
));
1318 bool get_primary_shard(const pg_t
& pgid
, int *primary
, spg_t
*out
) const {
1319 auto i
= get_pools().find(pgid
.pool());
1320 if (i
== get_pools().end()) {
1323 std::vector
<int> acting
;
1324 pg_to_acting_osds(pgid
, &acting
, primary
);
1325 if (i
->second
.is_erasure()) {
1326 for (uint8_t i
= 0; i
< acting
.size(); ++i
) {
1327 if (acting
[i
] == *primary
) {
1328 *out
= spg_t(pgid
, shard_id_t(i
));
1339 bool in_removed_snaps_queue(int64_t pool
, snapid_t snap
) const {
1340 auto p
= removed_snaps_queue
.find(pool
);
1341 if (p
== removed_snaps_queue
.end()) {
1344 return p
->second
.contains(snap
);
1347 const mempool::osdmap::map
<int64_t,snap_interval_set_t
>&
1348 get_removed_snaps_queue() const {
1349 return removed_snaps_queue
;
1351 const mempool::osdmap::map
<int64_t,snap_interval_set_t
>&
1352 get_new_removed_snaps() const {
1353 return new_removed_snaps
;
1355 const mempool::osdmap::map
<int64_t,snap_interval_set_t
>&
1356 get_new_purged_snaps() const {
1357 return new_purged_snaps
;
1360 int64_t lookup_pg_pool_name(std::string_view name
) const {
1361 auto p
= name_pool
.find(name
);
1362 if (p
== name_pool
.end())
1367 int64_t get_pool_max() const {
1370 const mempool::osdmap::map
<int64_t,pg_pool_t
>& get_pools() const {
1373 mempool::osdmap::map
<int64_t,pg_pool_t
>& get_pools() {
1376 void get_pool_ids_by_rule(int rule_id
, std::set
<int64_t> *pool_ids
) const {
1377 ceph_assert(pool_ids
);
1378 for (auto &p
: pools
) {
1379 if (p
.second
.get_crush_rule() == rule_id
) {
1380 pool_ids
->insert(p
.first
);
1384 void get_pool_ids_by_osd(CephContext
*cct
,
1386 std::set
<int64_t> *pool_ids
) const;
1387 const std::string
& get_pool_name(int64_t p
) const {
1388 auto i
= pool_name
.find(p
);
1389 ceph_assert(i
!= pool_name
.end());
1392 const mempool::osdmap::map
<int64_t,std::string
>& get_pool_names() const {
1395 bool have_pg_pool(int64_t p
) const {
1396 return pools
.count(p
);
1398 const pg_pool_t
* get_pg_pool(int64_t p
) const {
1399 auto i
= pools
.find(p
);
1400 if (i
!= pools
.end())
1404 unsigned get_pg_size(pg_t pg
) const {
1405 auto p
= pools
.find(pg
.pool());
1406 ceph_assert(p
!= pools
.end());
1407 return p
->second
.get_size();
1409 int get_pg_type(pg_t pg
) const {
1410 auto p
= pools
.find(pg
.pool());
1411 ceph_assert(p
!= pools
.end());
1412 return p
->second
.get_type();
1414 int get_pool_crush_rule(int64_t pool_id
) const {
1415 auto pool
= get_pg_pool(pool_id
);
1418 return pool
->get_crush_rule();
1422 pg_t
raw_pg_to_pg(pg_t pg
) const {
1423 auto p
= pools
.find(pg
.pool());
1424 ceph_assert(p
!= pools
.end());
1425 return p
->second
.raw_pg_to_pg(pg
);
1428 // pg -> acting primary osd
1429 int get_pg_acting_primary(pg_t pg
) const {
1431 _pg_to_up_acting_osds(pg
, nullptr, nullptr, nullptr, &primary
);
1436 * check whether an spg_t maps to a particular osd
1438 bool is_up_acting_osd_shard(spg_t pg
, int osd
) const {
1439 std::vector
<int> up
, acting
;
1440 _pg_to_up_acting_osds(pg
.pgid
, &up
, NULL
, &acting
, NULL
, false);
1441 if (calc_pg_role(pg_shard_t(osd
, pg
.shard
), acting
) >= 0 ||
1442 calc_pg_role(pg_shard_t(osd
, pg
.shard
), up
) >= 0) {
1449 static int calc_pg_role_broken(int osd
, const std::vector
<int>& acting
, int nrep
=0);
1450 static int calc_pg_role(pg_shard_t who
, const std::vector
<int>& acting
);
1451 static bool primary_changed_broken(
1453 const std::vector
<int> &oldacting
,
1455 const std::vector
<int> &newacting
);
1457 /* rank is -1 (stray), 0 (primary), 1,2,3,... (replica) */
1458 int get_pg_acting_role(spg_t pg
, int osd
) const {
1459 std::vector
<int> group
;
1460 pg_to_acting_osds(pg
.pgid
, group
);
1461 return calc_pg_role(pg_shard_t(osd
, pg
.shard
), group
);
1466 pg_t pg
, ///< pg to potentially remap
1467 const std::set
<int>& overfull
, ///< osds we'd want to evacuate
1468 const std::vector
<int>& underfull
, ///< osds to move to, in order of preference
1469 const std::vector
<int>& more_underfull
, ///< less full osds to move to, in order of preference
1470 std::vector
<int> *orig
,
1471 std::vector
<int> *out
); ///< resulting alternative mapping
1473 int balance_primaries(
1476 Incremental
*pending_inc
,
1477 OSDMap
& tmp_osd_map
) const;
1479 int calc_desired_primary_distribution(
1481 int64_t pid
, // pool id
1482 const std::vector
<uint64_t> &osds
,
1483 std::map
<uint64_t, float>& desired_primary_distribution
) const; // vector of osd ids
1487 uint32_t max_deviation
, ///< max deviation from target (value >= 1)
1488 int max_iterations
, ///< max iterations to run
1489 const std::set
<int64_t>& pools
, ///< [optional] restrict to pool
1490 Incremental
*pending_inc
,
1491 std::random_device::result_type
*p_seed
= nullptr ///< [optional] for regression tests
1494 std::map
<uint64_t,std::set
<pg_t
>> get_pgs_by_osd(
1497 std::map
<uint64_t, std::set
<pg_t
>> *p_primaries_by_osd
= nullptr,
1498 std::map
<uint64_t, std::set
<pg_t
>> *p_acting_primaries_by_osd
= nullptr
1499 ) const; // used in calc_desired_primary_distribution()
1501 private: // Bunch of internal functions used only by calc_pg_upmaps (result of code refactoring)
1503 float get_osds_weight(
1505 const OSDMap
& tmp_osd_map
,
1507 std::map
<int,float>& osds_weight
1510 float build_pool_pgs_info (
1512 const std::set
<int64_t>& pools
, ///< [optional] restrict to pool
1513 const OSDMap
& tmp_osd_map
,
1515 std::map
<int, std::set
<pg_t
>>& pgs_by_osd
,
1516 std::map
<int,float>& osds_weight
1517 ); // return total weight of all OSDs
1519 float calc_deviations (
1521 const std::map
<int,std::set
<pg_t
>>& pgs_by_osd
,
1522 const std::map
<int,float>& osd_weight
,
1523 float pgs_per_weight
,
1524 std::map
<int,float>& osd_deviation
,
1525 std::multimap
<float,int>& deviation_osd
,
1527 ); // return current max deviation
1529 void fill_overfull_underfull (
1531 const std::multimap
<float,int>& deviation_osd
,
1533 std::set
<int>& overfull
,
1534 std::set
<int>& more_overfull
,
1535 std::vector
<int>& underfull
,
1536 std::vector
<int>& more_underfull
1539 int pack_upmap_results(
1541 const std::set
<pg_t
>& to_unmap
,
1542 const std::map
<pg_t
, mempool::osdmap::vector
<std::pair
<int, int>>>& to_upmap
,
1543 OSDMap
& tmp_osd_map
,
1544 OSDMap::Incremental
*pending_inc
1547 std::default_random_engine
get_random_engine(
1549 std::random_device::result_type
*p_seed
1552 bool try_drop_remap_overfull(
1554 const std::vector
<pg_t
>& pgs
,
1555 const OSDMap
& tmp_osd_map
,
1557 std::map
<int,std::set
<pg_t
>>& temp_pgs_by_osd
,
1558 std::set
<pg_t
>& to_unmap
,
1559 std::map
<pg_t
, mempool::osdmap::vector
<std::pair
<int32_t,int32_t>>>& to_upmap
1562 typedef std::vector
<std::pair
<pg_t
, mempool::osdmap::vector
<std::pair
<int, int>>>>
1565 bool try_drop_remap_underfull(
1567 const candidates_t
& candidates
,
1569 std::map
<int,std::set
<pg_t
>>& temp_pgs_by_osd
,
1570 std::set
<pg_t
>& to_unmap
,
1571 std::map
<pg_t
, mempool::osdmap::vector
<std::pair
<int32_t,int32_t>>>& to_upmap
1574 void add_remap_pair(
1579 size_t pg_pool_size
,
1581 std::set
<int>& existing
,
1582 std::map
<int,std::set
<pg_t
>>& temp_pgs_by_osd
,
1583 mempool::osdmap::vector
<std::pair
<int32_t,int32_t>> new_upmap_items
,
1584 std::map
<pg_t
, mempool::osdmap::vector
<std::pair
<int32_t,int32_t>>>& to_upmap
1587 int find_best_remap (
1589 const std::vector
<int>& orig
,
1590 const std::vector
<int>& out
,
1591 const std::set
<int>& existing
,
1592 const std::map
<int,float> osd_deviation
1595 candidates_t
build_candidates(
1597 const OSDMap
& tmp_osd_map
,
1598 const std::set
<pg_t
> to_skip
,
1599 const std::set
<int64_t>& only_pools
,
1601 std::random_device::result_type
*p_seed
1608 float pa_weighted_avg
;
1610 float optimal_score
; // based on primary_affinity values
1611 float adjusted_score
; // based on raw_score and pa_avg 1 is optimal
1612 float acting_raw_score
; // based on active_primaries (temporary)
1613 float acting_adj_score
; // based on raw_active_score and pa_avg 1 is optimal
1614 std::string err_msg
;
1615 } read_balance_info_t
;
1617 // This function calculates scores about the cluster read balance state
1618 // p_rb_info->acting_adj_score is the current read balance score (acting)
1619 // p_rb_info->adjusted_score is the stable read balance score
1620 // Return value of 0 is OK, negative means an error (may happen with
1621 // some arifically generated osamap files)
1623 int calc_read_balance_score(
1626 read_balance_info_t
*p_rb_info
) const;
1629 float rbi_round(float f
) const {
1630 return (f
> 0.0) ? floor(f
* 100 + 0.5) / 100 : ceil(f
* 100 - 0.5) / 100;
1633 int64_t has_zero_pa_pgs(
1635 int64_t pool_id
) const;
1638 read_balance_info_t
&rbi
1643 read_balance_info_t
&rbi
,
1649 float total_osd_weight
,
1650 uint max_prims_per_osd
,
1651 uint max_acting_prims_per_osd
,
1652 float avg_prims_per_osd
,
1653 bool prim_on_zero_pa
,
1654 bool acting_on_zero_pa
,
1655 float max_osd_score
) const;
1658 int get_osds_by_bucket_name(const std::string
&name
, std::set
<int> *osds
) const;
1660 bool have_pg_upmaps(pg_t pg
) const {
1661 return pg_upmap
.count(pg
) ||
1662 pg_upmap_items
.count(pg
);
1665 bool check_full(const std::set
<pg_shard_t
> &missing_on
) const {
1666 for (auto shard
: missing_on
) {
1667 if (get_state(shard
.osd
) & CEPH_OSD_FULL
)
1674 * handy helpers to build simple maps...
1677 * Build an OSD map suitable for basic usage. If **num_osd** is >= 0
1678 * it will be initialized with the specified number of OSDs in a
1679 * single host. If **num_osd** is < 0 the layout of the OSD map will
1680 * be built by reading the content of the configuration file.
1682 * @param cct [in] in core ceph context
1683 * @param e [in] initial epoch
1684 * @param fsid [in] id of the cluster
1685 * @param num_osd [in] number of OSDs if >= 0 or read from conf if < 0
1686 * @return **0** on success, negative errno on error.
1689 int build_simple_optioned(CephContext
*cct
, epoch_t e
, uuid_d
&fsid
,
1690 int num_osd
, int pg_bits
, int pgp_bits
,
1693 int build_simple(CephContext
*cct
, epoch_t e
, uuid_d
&fsid
,
1695 return build_simple_optioned(cct
, e
, fsid
, num_osd
, 0, 0, false);
1697 int build_simple_with_pool(CephContext
*cct
, epoch_t e
, uuid_d
&fsid
,
1698 int num_osd
, int pg_bits
, int pgp_bits
) {
1699 return build_simple_optioned(cct
, e
, fsid
, num_osd
,
1700 pg_bits
, pgp_bits
, true);
1702 static int _build_crush_types(CrushWrapper
& crush
);
1703 static int build_simple_crush_map(CephContext
*cct
, CrushWrapper
& crush
,
1704 int num_osd
, std::ostream
*ss
);
1705 static int build_simple_crush_map_from_conf(CephContext
*cct
,
1706 CrushWrapper
& crush
,
1708 static int build_simple_crush_rules(
1709 CephContext
*cct
, CrushWrapper
& crush
,
1710 const std::string
& root
,
1713 bool crush_rule_in_use(int rule_id
) const;
1715 int validate_crush_rules(CrushWrapper
*crush
, std::ostream
*ss
) const;
1719 primary_temp
->clear();
1723 void print_osd_line(int cur
, std::ostream
*out
, ceph::Formatter
*f
) const;
1725 void print(CephContext
*cct
, std::ostream
& out
) const;
1726 void print_osd(int id
, std::ostream
& out
) const;
1727 void print_osds(std::ostream
& out
) const;
1728 void print_pools(CephContext
*cct
, std::ostream
& out
) const;
1729 void print_summary(ceph::Formatter
*f
, std::ostream
& out
,
1730 const std::string
& prefix
, bool extra
=false) const;
1731 void print_oneline_summary(std::ostream
& out
) const;
1734 DUMP_IN
= 1, // only 'in' osds
1735 DUMP_OUT
= 2, // only 'out' osds
1736 DUMP_UP
= 4, // only 'up' osds
1737 DUMP_DOWN
= 8, // only 'down' osds
1738 DUMP_DESTROYED
= 16, // only 'destroyed' osds
1740 void print_tree(ceph::Formatter
*f
, std::ostream
*out
,
1741 unsigned dump_flags
=0, std::string bucket
="") const;
1743 int summarize_mapping_stats(
1745 const std::set
<int64_t> *pools
,
1747 ceph::Formatter
*f
) const;
1749 std::string
get_flag_string() const;
1750 static std::string
get_flag_string(unsigned flags
);
1751 static void dump_erasure_code_profiles(
1752 const mempool::osdmap::map
<std::string
,std::map
<std::string
,std::string
> > &profiles
,
1753 ceph::Formatter
*f
);
1754 void dump(ceph::Formatter
*f
, CephContext
*cct
= nullptr) const;
1755 void dump_osd(int id
, ceph::Formatter
*f
) const;
1756 void dump_osds(ceph::Formatter
*f
) const;
1757 void dump_pool(CephContext
*cct
, int64_t pid
, const pg_pool_t
&pdata
, ceph::Formatter
*f
) const;
1758 void dump_read_balance_score(CephContext
*cct
, int64_t pid
, const pg_pool_t
&pdata
, ceph::Formatter
*f
) const;
1759 static void generate_test_instances(std::list
<OSDMap
*>& o
);
1760 bool check_new_blocklist_entries() const { return new_blocklist_entries
; }
1762 void check_health(CephContext
*cct
, health_check_map_t
*checks
) const;
1764 int parse_osd_id_list(const std::vector
<std::string
>& ls
,
1766 std::ostream
*ss
) const;
1768 float pool_raw_used_rate(int64_t poolid
) const;
1769 std::optional
<std::string
> pending_require_osd_release() const;
1772 WRITE_CLASS_ENCODER_FEATURES(OSDMap
)
1773 WRITE_CLASS_ENCODER_FEATURES(OSDMap::Incremental
)
1776 #include "crimson/common/local_shared_foreign_ptr.h"
1777 using LocalOSDMapRef
= boost::local_shared_ptr
<const OSDMap
>;
1778 using OSDMapRef
= crimson::local_shared_foreign_ptr
<LocalOSDMapRef
>;
1780 using OSDMapRef
= std::shared_ptr
<const OSDMap
>;
1784 inline std::ostream
& operator<<(std::ostream
& out
, const OSDMap
& m
) {
1785 m
.print_oneline_summary(out
);
1791 void print_osd_utilization(const OSDMap
& osdmap
,
1796 const std::string
& filter
);