X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=ceph%2Fsrc%2Fosd%2FOSDMap.h;h=3a3e6155ee9933569c29a8dd647f898abdace0fb;hb=df9f7d3d5c0b91cdb889467b50e5170245f33d23;hp=1dc67090d4446a4691763d89c6b46c775b530bb7;hpb=7c673caec407dd16107e56e4b51a6d00f021315c;p=ceph.git

diff --git a/ceph/src/osd/OSDMap.h b/ceph/src/osd/OSDMap.h
index 1dc67090d..3a3e6155e 100644
--- a/ceph/src/osd/OSDMap.h
+++ b/ceph/src/osd/OSDMap.h
@@ -24,33 +24,25 @@
  * disks, disk groups, total # osds,
  *
  */
+#include <vector>
+#include <list>
+#include <set>
+#include <map>
+#include <memory>
+
+#include <boost/smart_ptr/local_shared_ptr.hpp>
+#include "include/btree_map.h"
+#include "include/common_fwd.h"
 #include "include/types.h"
+#include "common/ceph_releases.h"
 #include "osd_types.h"
 //#include "include/ceph_features.h"
 #include "crush/CrushWrapper.h"
-#include <vector>
-#include <list>
-#include <set>
-#include <map>
-#include "include/memory.h"
-using namespace std;
 
 // forward declaration
-class CephContext;
 class CrushWrapper;
-
-// FIXME C++11 does not have std::equal for two differently-typed containers.
-// use this until we move to c++14
-template<typename A, typename B>
-bool vectors_equal(A a, B b)
-{
-  return
-    a.size() == b.size() &&
-    (a.empty() ||
-     memcmp((char*)&a[0], (char*)&b[0], sizeof(a[0]) * a.size()) == 0);
-}
-
+class health_check_map_t;
 
 /*
  * we track up to two intervals during which the osd was alive and
@@ -59,7 +51,7 @@ bool vectors_equal(A a, B b)
  * bound on the actual osd death.  down_at (if it is > up_from) is an
  * upper bound on the actual osd death.
  *
- * the second is the last_clean interval [first,last].  in that case,
+ * the second is the last_clean interval [begin,end).  in that case,
  * the last interval is the last epoch known to have been either
  * _finished_, or during which the osd cleanly shut down.  when
  * possible, we push this forward to the epoch the osd was eventually
@@ -85,14 +77,14 @@ struct osd_info_t {
   osd_info_t() : last_clean_begin(0), last_clean_end(0),
                  up_from(0), up_thru(0), down_at(0), lost_at(0) {}
 
-  void dump(Formatter *f) const;
-  void encode(bufferlist& bl) const;
-  void decode(bufferlist::iterator& bl);
-  static void generate_test_instances(list<osd_info_t*>& o);
+  void dump(ceph::Formatter *f) const;
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::buffer::list::const_iterator& bl);
+  static void generate_test_instances(std::list<osd_info_t*>& o);
 };
 WRITE_CLASS_ENCODER(osd_info_t)
 
-ostream& operator<<(ostream& out, const osd_info_t& info);
+std::ostream& operator<<(std::ostream& out, const osd_info_t& info);
 
 struct osd_xinfo_t {
   utime_t down_stamp;      ///< timestamp when we were last marked down
@@ -100,19 +92,258 @@ struct osd_xinfo_t {
   __u32 laggy_interval;    ///< average interval between being marked laggy and recovering
   uint64_t features;       ///< features supported by this osd we should know about
   __u32 old_weight;        ///< weight prior to being auto marked out
+  utime_t last_purged_snaps_scrub; ///< last scrub of purged_snaps
+  epoch_t dead_epoch = 0;  ///< last epoch we were confirmed dead (not just down)
 
   osd_xinfo_t() : laggy_probability(0), laggy_interval(0),
                   features(0), old_weight(0) {}
 
-  void dump(Formatter *f) const;
-  void encode(bufferlist& bl) const;
-  void decode(bufferlist::iterator& bl);
-  static void generate_test_instances(list<osd_xinfo_t*>& o);
+  void dump(ceph::Formatter *f) const;
+  void encode(ceph::buffer::list& bl, uint64_t features) const;
+  void decode(ceph::buffer::list::const_iterator& bl);
+  static void generate_test_instances(std::list<osd_xinfo_t*>& o);
 };
-WRITE_CLASS_ENCODER(osd_xinfo_t)
+WRITE_CLASS_ENCODER_FEATURES(osd_xinfo_t)
+
+std::ostream& operator<<(std::ostream& out, const osd_xinfo_t& xi);
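The declarations above follow Ceph's encode/decode convention: encode appends a fixed little-endian layout to a buffer, and decode reads it back through a const iterator, which is why these signatures moved from bufferlist::iterator to ceph::buffer::list::const_iterator in this diff. A minimal standalone sketch of that round trip, with std::vector<uint8_t> standing in for ceph::buffer::list and fields mirroring osd_info_t (the model_osd_info name and the byte layout are illustrative, not the real wire format, which also carries version headers):

// model_osd_info.cc -- standalone model of the encode/decode round trip;
// std::vector<uint8_t> stands in for ceph::buffer::list.
#include <cassert>
#include <cstdint>
#include <vector>

using epoch_t = uint32_t;

struct model_osd_info {
  epoch_t last_clean_begin = 0;  // inclusive, per the [begin,end) comment above
  epoch_t last_clean_end = 0;    // exclusive
  epoch_t up_from = 0, up_thru = 0, down_at = 0, lost_at = 0;

  void encode(std::vector<uint8_t>& bl) const {
    for (epoch_t v : {last_clean_begin, last_clean_end,
                      up_from, up_thru, down_at, lost_at})
      for (int i = 0; i < 4; ++i)              // little-endian, 4 bytes per field
        bl.push_back((v >> (8 * i)) & 0xff);
  }
  void decode(std::vector<uint8_t>::const_iterator& p) {
    for (epoch_t* f : {&last_clean_begin, &last_clean_end,
                       &up_from, &up_thru, &down_at, &lost_at}) {
      epoch_t v = 0;
      for (int i = 0; i < 4; ++i)
        v |= epoch_t(*p++) << (8 * i);
      *f = v;
    }
  }
};

int main() {
  model_osd_info in;
  in.last_clean_begin = 10;
  in.last_clean_end = 25;          // epochs [10,25): clean through epoch 24
  std::vector<uint8_t> bl;
  in.encode(bl);
  model_osd_info out;
  auto p = bl.cbegin();            // decode consumes via a const_iterator
  out.decode(p);
  assert(out.last_clean_end == 25 && p == bl.cend());
}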
+
+
+struct PGTempMap {
+#if 1
+  ceph::buffer::list data;
+  typedef btree::btree_map<pg_t,ceph_le32*> map_t;
+  map_t map;
+
+  void encode(ceph::buffer::list& bl) const {
+    using ceph::encode;
+    uint32_t n = map.size();
+    encode(n, bl);
+    for (auto &p : map) {
+      encode(p.first, bl);
+      bl.append((char*)p.second, (*p.second + 1) * sizeof(ceph_le32));
+    }
+  }
+  void decode(ceph::buffer::list::const_iterator& p) {
+    using ceph::decode;
+    data.clear();
+    map.clear();
+    uint32_t n;
+    decode(n, p);
+    if (!n)
+      return;
+    auto pstart = p;
+    size_t start_off = pstart.get_off();
+    std::vector<std::pair<pg_t,size_t>> offsets;
+    offsets.resize(n);
+    for (unsigned i=0; i<n; ++i) {
+      pg_t pgid;
+      decode(pgid, p);
+      offsets[i].first = pgid;
+      offsets[i].second = p.get_off() - start_off;
+      uint32_t vn;
+      decode(vn, p);
+      p += sizeof(ceph_le32) * vn;
+    }
+    size_t len = p.get_off() - start_off;
+    pstart.copy(len, data);
+    if (data.get_num_buffers() > 1) {
+      data.rebuild();
+    }
+    //map.reserve(n);
+    char *start = data.c_str();
+    for (auto i : offsets) {
+      map.insert(map.end(), std::make_pair(i.first, (ceph_le32*)(start + i.second)));
+    }
+  }
+  void rebuild() {
+    ceph::buffer::list bl;
+    encode(bl);
+    auto p = std::cbegin(bl);
+    decode(p);
+  }
+  friend bool operator==(const PGTempMap& l, const PGTempMap& r) {
+    return
+      l.map.size() == r.map.size() &&
+      l.data.contents_equal(r.data);
+  }
+
+  class iterator {
+    map_t::const_iterator it;
+    map_t::const_iterator end;
+    std::pair<pg_t,std::vector<int32_t>> current;
+    void init_current() {
+      if (it != end) {
+        current.first = it->first;
+        ceph_assert(it->second);
+        current.second.resize(*it->second);
+        ceph_le32 *p = it->second + 1;
+        for (uint32_t n = 0; n < *it->second; ++n, ++p) {
+          current.second[n] = *p;
+        }
+      }
+    }
+  public:
+    iterator(map_t::const_iterator p,
+             map_t::const_iterator e)
+      : it(p), end(e) {
+      init_current();
+    }
+
+    const std::pair<pg_t,std::vector<int32_t>>& operator*() const {
+      return current;
+    }
+    const std::pair<pg_t,std::vector<int32_t>>* operator->() const {
+      return &current;
+    }
+    friend bool operator==(const iterator& l, const iterator& r) {
+      return l.it == r.it;
+    }
+    friend bool operator!=(const iterator& l, const iterator& r) {
+      return l.it != r.it;
+    }
+    iterator& operator++() {
+      ++it;
+      if (it != end)
+        init_current();
+      return *this;
+    }
+    iterator operator++(int) {
+      iterator r = *this;
+      ++it;
+      if (it != end)
+        init_current();
+      return r;
+    }
+  };
+  iterator begin() const {
+    return iterator(map.begin(), map.end());
+  }
+  iterator end() const {
+    return iterator(map.end(), map.end());
+  }
+  iterator find(pg_t pgid) const {
+    return iterator(map.find(pgid), map.end());
+  }
+  size_t size() const {
+    return map.size();
+  }
+  size_t count(pg_t pgid) const {
+    return map.count(pgid);
+  }
+  void erase(pg_t pgid) {
+    map.erase(pgid);
+  }
+  void clear() {
+    map.clear();
+    data.clear();
+  }
+  void set(pg_t pgid, const mempool::osdmap::vector<int32_t>& v) {
+    using ceph::encode;
+    size_t need = sizeof(ceph_le32) * (1 + v.size());
+    if (need < data.get_append_buffer_unused_tail_length()) {
+      ceph::buffer::ptr z(data.get_append_buffer_unused_tail_length());
+      z.zero();
+      data.append(z.c_str(), z.length());
+    }
+    encode(v, data);
+    map[pgid] = (ceph_le32*)(data.back().end_c_str()) - (1 + v.size());
+  }
+  mempool::osdmap::vector<int32_t> get(pg_t pgid) {
+    mempool::osdmap::vector<int32_t> v;
+    ceph_le32 *p = map[pgid];
+    size_t n = *p++;
+    v.resize(n);
+    for (size_t i = 0; i < n; ++i, ++p) {
+      v[i] = *p;
+    }
+    return v;
+  }
+#else
+  // trivial implementation
+  mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> > pg_temp;
-ostream& operator<<(ostream& out, const osd_xinfo_t& xi);
 
+  void encode(ceph::buffer::list& bl) const {
+    encode(pg_temp, bl);
+  }
+  void decode(ceph::buffer::list::const_iterator& p) {
+    decode(pg_temp, p);
+  }
+  friend bool operator==(const PGTempMap& l, const PGTempMap& r) {
+    return
+      l.pg_temp.size() == r.pg_temp.size() &&
+      l.pg_temp == r.pg_temp;
+  }
+
+  class iterator {
+    mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> >::const_iterator it;
+  public:
+    iterator(mempool::osdmap::map<pg_t,
+             mempool::osdmap::vector<int32_t> >::const_iterator p)
+      : it(p) {}
+
+    std::pair<const pg_t,const mempool::osdmap::vector<int32_t>&> operator*() const {
+      return *it;
+    }
+    const std::pair<const pg_t,const mempool::osdmap::vector<int32_t>>* operator->() const {
+      return &*it;
+    }
+    friend bool operator==(const iterator& l, const iterator& r) {
+      return l.it == r.it;
+    }
+    friend bool operator!=(const iterator& l, const iterator& r) {
+      return l.it != r.it;
+    }
+    iterator& operator++() {
+      ++it;
+      return *this;
+    }
+    iterator operator++(int) {
+      iterator r = *this;
+      ++it;
+      return r;
+    }
+  };
+  iterator begin() const {
+    return iterator(pg_temp.cbegin());
+  }
+  iterator end() const {
+    return iterator(pg_temp.cend());
+  }
+  iterator find(pg_t pgid) const {
+    return iterator(pg_temp.find(pgid));
+  }
+  size_t size() const {
+    return pg_temp.size();
+  }
+  size_t count(pg_t pgid) const {
+    return pg_temp.count(pgid);
+  }
+  void erase(pg_t pgid) {
+    pg_temp.erase(pgid);
+  }
+  void clear() {
+    pg_temp.clear();
+  }
+  void set(pg_t pgid, const mempool::osdmap::vector<int32_t>& v) {
+    pg_temp[pgid] = v;
+  }
+  const mempool::osdmap::vector<int32_t>& get(pg_t pgid) {
+    return pg_temp.at(pgid);
+  }
+#endif
+  void dump(ceph::Formatter *f) const {
+    for (const auto &pg : *this) {
+      f->open_object_section("osds");
+      f->dump_stream("pgid") << pg.first;
+      f->open_array_section("osds");
+      for (const auto osd : pg.second)
+        f->dump_int("osd", osd);
+      f->close_section();
+      f->close_section();
+    }
+  }
+};
+WRITE_CLASS_ENCODER(PGTempMap)
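PGTempMap above avoids one heap-allocated vector per pg_temp entry: every entry lives in a single buffer as a [count, osd...] record, and the btree map only stores pointers into that buffer, so decode is one contiguous copy plus pointer fixups. A standalone sketch of that layout, with std::vector<uint32_t> standing in for the buffer of ceph_le32 and offsets standing in for raw pointers (FlatPGTempMap and its members are illustrative names, not the real API):

// flat_pg_temp.cc -- standalone model of the PGTempMap layout: entries are
// packed as [osd_count, osd0, osd1, ...] in one flat buffer, with a side map
// from pg id to the offset of the entry's count word.
#include <cassert>
#include <cstdint>
#include <map>
#include <vector>

struct FlatPGTempMap {
  std::vector<uint32_t> data;        // [n, osd...][n, osd...]...
  std::map<uint64_t, size_t> index;  // pg id -> offset of the count word

  void set(uint64_t pgid, const std::vector<int32_t>& osds) {
    index[pgid] = data.size();
    data.push_back(osds.size());
    for (int32_t o : osds)
      data.push_back(static_cast<uint32_t>(o));
  }
  std::vector<int32_t> get(uint64_t pgid) const {
    size_t off = index.at(pgid);
    uint32_t n = data[off];
    std::vector<int32_t> v(n);
    for (uint32_t i = 0; i < n; ++i)
      v[i] = static_cast<int32_t>(data[off + 1 + i]);
    return v;
  }
};

int main() {
  FlatPGTempMap m;
  m.set(1, {4, 2, 7});   // pg 1 temporarily mapped to osds 4,2,7
  m.set(2, {0, 5});
  assert((m.get(1) == std::vector<int32_t>{4, 2, 7}));
  assert(m.get(2).size() == 2);
}

The real structure additionally exposes rebuild(), which re-encodes and re-decodes itself to compact the buffer after erasures; the sketch skips that.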
 
 /** OSDMap
  */
@@ -132,47 +363,71 @@ public:
     utime_t modified;
     int64_t new_pool_max; //incremented by the OSDMonitor on each pool create
     int32_t new_flags;
+    ceph_release_t new_require_osd_release{0xff};
+    uint32_t new_stretch_bucket_count{0};
+    uint32_t new_degraded_stretch_mode{0};
+    uint32_t new_recovering_stretch_mode{0};
+    int32_t new_stretch_mode_bucket{0};
+    bool stretch_mode_enabled{false};
+    bool change_stretch_mode{false};
+
+    enum class mutate_allow_crimson_t : uint8_t {
+      NONE = 0,
+      SET = 1,
+      // Monitor won't allow CLEAR to be set currently, but we may allow it later
+      CLEAR = 2
+    } mutate_allow_crimson = mutate_allow_crimson_t::NONE;
 
     // full (rare)
-    bufferlist fullmap;  // in lieu of below.
-    bufferlist crush;
+    ceph::buffer::list fullmap;  // in lieu of below.
+    ceph::buffer::list crush;
 
     // incremental
     int32_t new_max_osd;
     mempool::osdmap::map<int64_t,pg_pool_t> new_pools;
-    mempool::osdmap::map<int64_t,string> new_pool_names;
+    mempool::osdmap::map<int64_t,std::string> new_pool_names;
     mempool::osdmap::set<int64_t> old_pools;
-    mempool::osdmap::map<string,map<string,string> > new_erasure_code_profiles;
-    mempool::osdmap::vector<string> old_erasure_code_profiles;
-    mempool::osdmap::map<int32_t,entity_addr_t> new_up_client;
-    mempool::osdmap::map<int32_t,entity_addr_t> new_up_cluster;
-    mempool::osdmap::map<int32_t,uint8_t> new_state;             // XORed onto previous state.
+    mempool::osdmap::map<std::string,std::map<std::string,std::string> > new_erasure_code_profiles;
+    mempool::osdmap::vector<std::string> old_erasure_code_profiles;
+    mempool::osdmap::map<int32_t,entity_addrvec_t> new_up_client;
+    mempool::osdmap::map<int32_t,entity_addrvec_t> new_up_cluster;
+    mempool::osdmap::map<int32_t,uint32_t> new_state;             // XORed onto previous state.
     mempool::osdmap::map<int32_t,uint32_t> new_weight;
     mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> > new_pg_temp;     // [] to remove
     mempool::osdmap::map<pg_t, int32_t> new_primary_temp;            // [-1] to remove
     mempool::osdmap::map<int32_t,uint32_t> new_primary_affinity;
     mempool::osdmap::map<int32_t,epoch_t> new_up_thru;
-    mempool::osdmap::map<int32_t,pair<epoch_t,epoch_t> > new_last_clean_interval;
+    mempool::osdmap::map<int32_t,std::pair<epoch_t,epoch_t> > new_last_clean_interval;
     mempool::osdmap::map<int32_t,epoch_t> new_lost;
     mempool::osdmap::map<int32_t,uuid_d> new_uuid;
     mempool::osdmap::map<int32_t,osd_xinfo_t> new_xinfo;
-    mempool::osdmap::map<entity_addr_t,utime_t> new_blacklist;
-    mempool::osdmap::vector<entity_addr_t> old_blacklist;
-    mempool::osdmap::map<int32_t, entity_addr_t> new_hb_back_up;
-    mempool::osdmap::map<int32_t, entity_addr_t> new_hb_front_up;
+    mempool::osdmap::map<entity_addr_t,utime_t> new_blocklist;
+    mempool::osdmap::vector<entity_addr_t> old_blocklist;
+    mempool::osdmap::map<entity_addr_t,utime_t> new_range_blocklist;
+    mempool::osdmap::vector<entity_addr_t> old_range_blocklist;
+    mempool::osdmap::map<int32_t, entity_addrvec_t> new_hb_back_up;
+    mempool::osdmap::map<int32_t, entity_addrvec_t> new_hb_front_up;
 
     mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t>> new_pg_upmap;
-    mempool::osdmap::map<pg_t,mempool::osdmap::vector<pair<int32_t,int32_t>>> new_pg_upmap_items;
-    mempool::osdmap::set<pg_t> old_pg_upmap, old_pg_upmap_items;
+    mempool::osdmap::map<pg_t,mempool::osdmap::vector<std::pair<int32_t,int32_t>>> new_pg_upmap_items;
+    mempool::osdmap::map<pg_t, int32_t> new_pg_upmap_primary;
+    mempool::osdmap::set<pg_t> old_pg_upmap, old_pg_upmap_items, old_pg_upmap_primary;
+    mempool::osdmap::map<int64_t, snap_interval_set_t> new_removed_snaps;
+    mempool::osdmap::map<int64_t, snap_interval_set_t> new_purged_snaps;
+
+    mempool::osdmap::map<int32_t,uint32_t> new_crush_node_flags;
+    mempool::osdmap::map<int32_t,uint32_t> new_device_class_flags;
 
-    string cluster_snapshot;
+    std::string cluster_snapshot;
 
     float new_nearfull_ratio = -1;
     float new_backfillfull_ratio = -1;
     float new_full_ratio = -1;
 
-    string new_require_min_compat_client;
+    ceph_release_t new_require_min_compat_client{0xff};
+
+    utime_t new_last_up_change, new_last_in_change;
 
     mutable bool have_crc;      ///< crc values are defined
     uint32_t full_crc;  ///< crc of the resulting OSDMap
@@ -182,25 +437,24 @@ public:
     int get_net_marked_out(const OSDMap *previous) const;
     int get_net_marked_down(const OSDMap *previous) const;
     int identify_osd(uuid_d u) const;
 
-    void encode_client_old(bufferlist& bl) const;
-    void encode_classic(bufferlist& bl, uint64_t features) const;
-    void encode(bufferlist& bl, uint64_t features=CEPH_FEATURES_ALL) const;
-    void decode_classic(bufferlist::iterator &p);
-    void decode(bufferlist::iterator &bl);
-    void dump(Formatter *f) const;
-    static void generate_test_instances(list<Incremental*>& o);
+    void encode_client_old(ceph::buffer::list& bl) const;
+    void encode_classic(ceph::buffer::list& bl, uint64_t features) const;
+    void encode(ceph::buffer::list& bl, uint64_t features=CEPH_FEATURES_ALL) const;
+    void decode_classic(ceph::buffer::list::const_iterator &p);
+    void decode(ceph::buffer::list::const_iterator &bl);
+    void dump(ceph::Formatter *f) const;
+    static void generate_test_instances(std::list<Incremental*>& o);
 
     explicit Incremental(epoch_t e=0) :
       encode_features(0),
       epoch(e), new_pool_max(-1), new_flags(-1), new_max_osd(-1),
       have_crc(false), full_crc(0), inc_crc(0) {
-      memset(&fsid, 0, sizeof(fsid));
     }
-    explicit Incremental(bufferlist &bl) {
-      bufferlist::iterator p = bl.begin();
+    explicit Incremental(ceph::buffer::list &bl) {
+      auto p = std::cbegin(bl);
       decode(p);
     }
-    explicit Incremental(bufferlist::iterator &p) {
+    explicit Incremental(ceph::buffer::list::const_iterator &p) {
       decode(p);
     }
 
@@ -209,17 +463,69 @@ public:
       new_pools[pool] = *orig;
       return &new_pools[pool];
     }
-    bool has_erasure_code_profile(const string &name) const {
+    bool has_erasure_code_profile(const std::string &name) const {
       auto i = new_erasure_code_profiles.find(name);
       return i != new_erasure_code_profiles.end();
     }
-    void set_erasure_code_profile(const string &name,
-                                  const map<string,string>& profile) {
+    void set_erasure_code_profile(const std::string &name,
                                   const std::map<std::string,std::string>& profile) {
new_erasure_code_profiles[name] = profile; } + mempool::osdmap::map> get_erasure_code_profiles() const { + return new_erasure_code_profiles; + } + + /// propagate update pools' (snap and other) metadata to any of their tiers + int propagate_base_properties_to_tiers(CephContext *cct, const OSDMap &base); + + /// filter out osds with any pending state changing + size_t get_pending_state_osds(std::vector *osds) { + ceph_assert(osds); + osds->clear(); + + for (auto &p : new_state) { + osds->push_back(p.first); + } + + return osds->size(); + } + + bool pending_osd_has_state(int osd, unsigned state) { + return new_state.count(osd) && (new_state[osd] & state) != 0; + } - /// propage update pools' snap metadata to any of their tiers - int propagate_snaps_to_tiers(CephContext *cct, const OSDMap &base); + bool pending_osd_state_set(int osd, unsigned state) { + if (pending_osd_has_state(osd, state)) + return false; + new_state[osd] |= state; + return true; + } + + // cancel the specified pending osd state if there is any + // return ture on success, false otherwise. + bool pending_osd_state_clear(int osd, unsigned state) { + if (!pending_osd_has_state(osd, state)) { + // never has been set or already has been cancelled. + return false; + } + + new_state[osd] &= ~state; + if (!new_state[osd]) { + // all flags cleared + new_state.erase(osd); + } + return true; + } + + bool in_new_removed_snaps(int64_t pool, snapid_t snap) const { + auto p = new_removed_snaps.find(pool); + if (p == new_removed_snaps.end()) { + return false; + } + return p->second.contains(snap); + } + + void set_allow_crimson() { mutate_allow_crimson = mutate_allow_crimson_t::SET; } }; private: @@ -235,46 +541,110 @@ private: int num_in_osd; // not saved; see calc_num_osds int32_t max_osd; - vector osd_state; + std::vector osd_state; + + mempool::osdmap::map crush_node_flags; // crush node -> CEPH_OSD_* flags + mempool::osdmap::map device_class_flags; // device class -> CEPH_OSD_* flags + + utime_t last_up_change, last_in_change; + + // These features affect OSDMap[::Incremental] encoding, or the + // encoding of some type embedded therein (CrushWrapper, something + // from osd_types, etc.). + static constexpr uint64_t SIGNIFICANT_FEATURES = + CEPH_FEATUREMASK_PGID64 | + CEPH_FEATUREMASK_PGPOOL3 | + CEPH_FEATUREMASK_OSDENC | + CEPH_FEATUREMASK_OSDMAP_ENC | + CEPH_FEATUREMASK_OSD_POOLRESEND | + CEPH_FEATUREMASK_NEW_OSDOP_ENCODING | + CEPH_FEATUREMASK_MSG_ADDR2 | + CEPH_FEATUREMASK_CRUSH_TUNABLES5 | + CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS | + CEPH_FEATUREMASK_SERVER_LUMINOUS | + CEPH_FEATUREMASK_SERVER_MIMIC | + CEPH_FEATUREMASK_SERVER_NAUTILUS | + CEPH_FEATUREMASK_SERVER_OCTOPUS | + CEPH_FEATUREMASK_SERVER_REEF; struct addrs_s { - mempool::osdmap::vector > client_addr; - mempool::osdmap::vector > cluster_addr; - mempool::osdmap::vector > hb_back_addr; - mempool::osdmap::vector > hb_front_addr; - entity_addr_t blank; + mempool::osdmap::vector > client_addrs; + mempool::osdmap::vector > cluster_addrs; + mempool::osdmap::vector > hb_back_addrs; + mempool::osdmap::vector > hb_front_addrs; }; - ceph::shared_ptr osd_addrs; + std::shared_ptr osd_addrs; + + entity_addrvec_t _blank_addrvec; mempool::osdmap::vector<__u32> osd_weight; // 16.16 fixed point, 0x10000 = "in", 0 = "out" mempool::osdmap::vector osd_info; - ceph::shared_ptr< mempool::osdmap::map > > pg_temp; // temp pg mapping (e.g. while we rebuild) - ceph::shared_ptr< mempool::osdmap::map > primary_temp; // temp primary mapping (e.g. 
while we rebuild) - ceph::shared_ptr< mempool::osdmap::vector<__u32> > osd_primary_affinity; ///< 16.16 fixed point, 0x10000 = baseline + std::shared_ptr pg_temp; // temp pg mapping (e.g. while we rebuild) + std::shared_ptr< mempool::osdmap::map > primary_temp; // temp primary mapping (e.g. while we rebuild) + std::shared_ptr< mempool::osdmap::vector<__u32> > osd_primary_affinity; ///< 16.16 fixed point, 0x10000 = baseline // remap (post-CRUSH, pre-up) mempool::osdmap::map> pg_upmap; ///< remap pg - mempool::osdmap::map>> pg_upmap_items; ///< remap osds in up set + mempool::osdmap::map>> pg_upmap_items; ///< remap osds in up set + mempool::osdmap::map pg_upmap_primaries; ///< remap primary of a pg mempool::osdmap::map pools; - mempool::osdmap::map pool_name; - mempool::osdmap::map > erasure_code_profiles; - mempool::osdmap::map name_pool; + mempool::osdmap::map pool_name; + mempool::osdmap::map> erasure_code_profiles; + mempool::osdmap::map> name_pool; - ceph::shared_ptr< mempool::osdmap::vector > osd_uuid; + std::shared_ptr< mempool::osdmap::vector > osd_uuid; mempool::osdmap::vector osd_xinfo; - mempool::osdmap::unordered_map blacklist; + class range_bits { + struct ip6 { + uint64_t upper_64_bits, lower_64_bits; + uint64_t upper_mask, lower_mask; + }; + struct ip4 { + uint32_t ip_32_bits; + uint32_t mask; + }; + union { + ip6 ipv6; + ip4 ipv4; + } bits; + bool ipv6; + static void get_ipv6_bytes(unsigned const char *addr, + uint64_t *upper, uint64_t *lower); + public: + range_bits(); + range_bits(const entity_addr_t& addr); + void parse(const entity_addr_t& addr); + bool matches(const entity_addr_t& addr) const; + }; + mempool::osdmap::unordered_map blocklist; + mempool::osdmap::map range_blocklist; + mempool::osdmap::map calculated_ranges; + + /// queue of snaps to remove + mempool::osdmap::map removed_snaps_queue; + + /// removed_snaps additions this epoch + mempool::osdmap::map new_removed_snaps; + + /// removed_snaps removals this epoch + mempool::osdmap::map new_purged_snaps; epoch_t cluster_snapshot_epoch; - string cluster_snapshot; - bool new_blacklist_entries; + std::string cluster_snapshot; + bool new_blocklist_entries; float full_ratio = 0, backfillfull_ratio = 0, nearfull_ratio = 0; /// min compat client we want to support - string require_min_compat_client; + ceph_release_t require_min_compat_client{ceph_release_t::unknown}; + +public: + /// require osds to run at least this release + ceph_release_t require_osd_release{ceph_release_t::unknown}; +private: mutable uint64_t cached_up_osd_features; mutable bool crc_defined; @@ -286,44 +656,59 @@ private: bool have_crc() const { return crc_defined; } uint32_t get_crc() const { return crc; } - ceph::shared_ptr crush; // hierarchical map + std::shared_ptr crush; // hierarchical map + bool stretch_mode_enabled; // we are in stretch mode, requiring multiple sites + uint32_t stretch_bucket_count; // number of sites we expect to be in + uint32_t degraded_stretch_mode; // 0 if not degraded; else count of up sites + uint32_t recovering_stretch_mode; // 0 if not recovering; else 1 + int32_t stretch_mode_bucket; // the bucket type we're stretched across + bool allow_crimson{false}; +private: + uint32_t crush_version = 1; friend class OSDMonitor; public: OSDMap() : epoch(0), - pool_max(-1), + pool_max(0), flags(0), num_osd(0), num_up_osd(0), num_in_osd(0), max_osd(0), osd_addrs(std::make_shared()), - pg_temp(std::make_shared>>()), + pg_temp(std::make_shared()), primary_temp(std::make_shared>()), osd_uuid(std::make_shared>()), 
cluster_snapshot_epoch(0), - new_blacklist_entries(false), + new_blocklist_entries(false), cached_up_osd_features(0), crc_defined(false), crc(0), - crush(std::make_shared()) { - memset(&fsid, 0, sizeof(fsid)); + crush(std::make_shared()), + stretch_mode_enabled(false), stretch_bucket_count(0), + degraded_stretch_mode(0), recovering_stretch_mode(0), stretch_mode_bucket(0) { } - // no copying private: OSDMap(const OSDMap& other) = default; OSDMap& operator=(const OSDMap& other) = default; public: + /// return feature mask subset that is relevant to OSDMap encoding + static uint64_t get_significant_features(uint64_t features) { + return SIGNIFICANT_FEATURES & features; + } + + uint64_t get_encoding_features() const; + void deepish_copy_from(const OSDMap& o) { *this = o; primary_temp.reset(new mempool::osdmap::map(*o.primary_temp)); - pg_temp.reset(new mempool::osdmap::map >(*o.pg_temp)); + pg_temp.reset(new PGTempMap(*o.pg_temp)); osd_uuid.reset(new mempool::osdmap::vector(*o.osd_uuid)); if (o.osd_primary_affinity) osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>(*o.osd_primary_affinity)); - // NOTE: this still references shared entity_addr_t's. + // NOTE: this still references shared entity_addrvec_t's. osd_addrs.reset(new addrs_s(*o.osd_addrs)); // NOTE: we do not copy crush. note that apply_incremental will @@ -339,17 +724,25 @@ public: void set_epoch(epoch_t e); + uint32_t get_crush_version() const { + return crush_version; + } + /* stamps etc */ const utime_t& get_created() const { return created; } const utime_t& get_modified() const { return modified; } - bool is_blacklisted(const entity_addr_t& a) const; - void get_blacklist(list > *bl) const; + bool is_blocklisted(const entity_addr_t& a, CephContext *cct=nullptr) const; + bool is_blocklisted(const entity_addrvec_t& a, CephContext *cct=nullptr) const; + void get_blocklist(std::list > *bl, + std::list > *rl) const; + void get_blocklist(std::set *bl, + std::set *rl) const; - string get_cluster_snapshot() const { + std::string get_cluster_snapshot() const { if (cluster_snapshot_epoch == epoch) return cluster_snapshot; - return string(); + return std::string(); } float get_full_ratio() const { @@ -361,12 +754,13 @@ public: float get_nearfull_ratio() const { return nearfull_ratio; } - void count_full_nearfull_osds(int *full, int *backfill, int *nearfull) const; - void get_full_osd_util( - const ceph::unordered_map &osd_stat, - map *full, - map *backfill, - map *nearfull) const; + void get_full_pools(CephContext *cct, + std::set *full, + std::set *backfillfull, + std::set *nearfull) const; + void get_full_osd_counts(std::set *full, std::set *backfill, + std::set *nearfull) const; + /***** cluster state *****/ /* osds */ @@ -385,8 +779,9 @@ public: /// recalculate cached values for get_num{,_up,_in}_osds int calc_num_osds(); - void get_all_osds(set& ls) const; - void get_up_osds(set& ls) const; + void get_all_osds(std::set& ls) const; + void get_up_osds(std::set& ls) const; + void get_out_existing_osds(std::set& ls) const; unsigned get_num_pg_temp() const { return pg_temp->size(); } @@ -396,39 +791,41 @@ public: void set_flag(int f) { flags |= f; } void clear_flag(int f) { flags &= ~f; } - static void calc_state_set(int state, set& st); + void get_flag_set(std::set *flagset) const; + + static void calc_state_set(int state, std::set& st); int get_state(int o) const { - assert(o < max_osd); + ceph_assert(o < max_osd); return osd_state[o]; } - int get_state(int o, set& st) const { - assert(o < max_osd); + int get_state(int o, 
std::set& st) const { + ceph_assert(o < max_osd); unsigned t = osd_state[o]; calc_state_set(t, st); return osd_state[o]; } void set_state(int o, unsigned s) { - assert(o < max_osd); + ceph_assert(o < max_osd); osd_state[o] = s; } void set_weight(int o, unsigned w) { - assert(o < max_osd); + ceph_assert(o < max_osd); osd_weight[o] = w; if (w) osd_state[o] |= CEPH_OSD_EXISTS; } unsigned get_weight(int o) const { - assert(o < max_osd); + ceph_assert(o < max_osd); return osd_weight[o]; } float get_weightf(int o) const { return (float)get_weight(o) / (float)CEPH_OSD_IN; } - void adjust_osd_weights(const map& weights, Incremental& inc) const; + void adjust_osd_weights(const std::map& weights, Incremental& inc) const; void set_primary_affinity(int o, int w) { - assert(o < max_osd); + ceph_assert(o < max_osd); if (!osd_primary_affinity) osd_primary_affinity.reset( new mempool::osdmap::vector<__u32>( @@ -436,7 +833,7 @@ public: (*osd_primary_affinity)[o] = w; } unsigned get_primary_affinity(int o) const { - assert(o < max_osd); + ceph_assert(o < max_osd); if (!osd_primary_affinity) return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; return (*osd_primary_affinity)[o]; @@ -445,35 +842,43 @@ public: return (float)get_primary_affinity(o) / (float)CEPH_OSD_MAX_PRIMARY_AFFINITY; } - bool has_erasure_code_profile(const string &name) const { + bool has_erasure_code_profile(const std::string &name) const { auto i = erasure_code_profiles.find(name); return i != erasure_code_profiles.end(); } int get_erasure_code_profile_default(CephContext *cct, - map &profile_map, - ostream *ss); - void set_erasure_code_profile(const string &name, - const map& profile) { + std::map &profile_map, + std::ostream *ss); + void set_erasure_code_profile(const std::string &name, + const std::map& profile) { erasure_code_profiles[name] = profile; } - const map &get_erasure_code_profile( - const string &name) const { - static map empty; + const std::map &get_erasure_code_profile( + const std::string &name) const { + static std::map empty; auto i = erasure_code_profiles.find(name); if (i == erasure_code_profiles.end()) return empty; else return i->second; } - const mempool::osdmap::map > &get_erasure_code_profiles() const { + const mempool::osdmap::map> &get_erasure_code_profiles() const { return erasure_code_profiles; } + bool get_allow_crimson() const { + return allow_crimson; + } + bool exists(int osd) const { //assert(osd >= 0); return osd >= 0 && osd < max_osd && (osd_state[osd] & CEPH_OSD_EXISTS); } + bool is_destroyed(int osd) const { + return exists(osd) && (osd_state[osd] & CEPH_OSD_DESTROYED); + } + bool is_up(int osd) const { return exists(osd) && (osd_state[osd] & CEPH_OSD_UP); } @@ -486,6 +891,11 @@ public: return !is_up(osd); } + bool is_stop(int osd) const { + return exists(osd) && is_down(osd) && + (osd_state[osd] & CEPH_OSD_STOP); + } + bool is_out(int osd) const { return !exists(osd) || get_weight(osd) == CEPH_OSD_OUT; } @@ -494,12 +904,94 @@ public: return !is_out(osd); } + bool is_dead(int osd) const { + if (!exists(osd)) { + return false; // unclear if they know they are removed from map + } + return get_xinfo(osd).dead_epoch > get_info(osd).up_from; + } + + unsigned get_osd_crush_node_flags(int osd) const; + unsigned get_crush_node_flags(int id) const; + unsigned get_device_class_flags(int id) const; + + bool is_noup_by_osd(int osd) const { + return exists(osd) && (osd_state[osd] & CEPH_OSD_NOUP); + } + + bool is_nodown_by_osd(int osd) const { + return exists(osd) && (osd_state[osd] & CEPH_OSD_NODOWN); + } + + bool 
is_noin_by_osd(int osd) const { + return exists(osd) && (osd_state[osd] & CEPH_OSD_NOIN); + } + + bool is_noout_by_osd(int osd) const { + return exists(osd) && (osd_state[osd] & CEPH_OSD_NOOUT); + } + + bool is_noup(int osd) const { + if (test_flag(CEPH_OSDMAP_NOUP)) // global? + return true; + if (is_noup_by_osd(osd)) // by osd? + return true; + if (get_osd_crush_node_flags(osd) & CEPH_OSD_NOUP) // by crush-node? + return true; + if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 && + get_device_class_flags(class_id) & CEPH_OSD_NOUP) // by device-class? + return true; + return false; + } + + bool is_nodown(int osd) const { + if (test_flag(CEPH_OSDMAP_NODOWN)) + return true; + if (is_nodown_by_osd(osd)) + return true; + if (get_osd_crush_node_flags(osd) & CEPH_OSD_NODOWN) + return true; + if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 && + get_device_class_flags(class_id) & CEPH_OSD_NODOWN) + return true; + return false; + } + + bool is_noin(int osd) const { + if (test_flag(CEPH_OSDMAP_NOIN)) + return true; + if (is_noin_by_osd(osd)) + return true; + if (get_osd_crush_node_flags(osd) & CEPH_OSD_NOIN) + return true; + if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 && + get_device_class_flags(class_id) & CEPH_OSD_NOIN) + return true; + return false; + } + + bool is_noout(int osd) const { + if (test_flag(CEPH_OSDMAP_NOOUT)) + return true; + if (is_noout_by_osd(osd)) + return true; + if (get_osd_crush_node_flags(osd) & CEPH_OSD_NOOUT) + return true; + if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 && + get_device_class_flags(class_id) & CEPH_OSD_NOOUT) + return true; + return false; + } + /** * check if an entire crush subtree is down */ - bool subtree_is_down(int id, set *down_cache) const; - bool containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, set *down_cache) const; - + bool subtree_is_down(int id, std::set *down_cache) const; + bool containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, std::set *down_cache) const; + + bool subtree_type_is_down(CephContext *cct, int id, int subtree_type, std::set *down_in_osds, std::set *up_in_osds, + std::set *subtree_up, std::unordered_map > *subtree_type_down) const; + int identify_osd(const entity_addr_t& addr) const; int identify_osd(const uuid_d& u) const; int identify_osd_on_all_channels(const entity_addr_t& addr) const; @@ -508,69 +1000,55 @@ public: return identify_osd(addr) >= 0; } int find_osd_on_ip(const entity_addr_t& ip) const; - const entity_addr_t &get_addr(int osd) const { - assert(exists(osd)); - return osd_addrs->client_addr[osd] ? *osd_addrs->client_addr[osd] : osd_addrs->blank; - } - const entity_addr_t &get_cluster_addr(int osd) const { - assert(exists(osd)); - if (!osd_addrs->cluster_addr[osd] || *osd_addrs->cluster_addr[osd] == entity_addr_t()) - return get_addr(osd); - return *osd_addrs->cluster_addr[osd]; - } - const entity_addr_t &get_hb_back_addr(int osd) const { - assert(exists(osd)); - return osd_addrs->hb_back_addr[osd] ? *osd_addrs->hb_back_addr[osd] : osd_addrs->blank; - } - const entity_addr_t &get_hb_front_addr(int osd) const { - assert(exists(osd)); - return osd_addrs->hb_front_addr[osd] ? *osd_addrs->hb_front_addr[osd] : osd_addrs->blank; - } - entity_inst_t get_most_recent_inst(int osd) const { - assert(exists(osd)); - return entity_inst_t(entity_name_t::OSD(osd), get_addr(osd)); + + const entity_addrvec_t& get_addrs(int osd) const { + ceph_assert(exists(osd)); + return osd_addrs->client_addrs[osd] ? 
+ *osd_addrs->client_addrs[osd] : _blank_addrvec; } - entity_inst_t get_inst(int osd) const { - assert(is_up(osd)); - return get_most_recent_inst(osd); + const entity_addrvec_t& get_most_recent_addrs(int osd) const { + return get_addrs(osd); } - entity_inst_t get_cluster_inst(int osd) const { - assert(is_up(osd)); - return entity_inst_t(entity_name_t::OSD(osd), get_cluster_addr(osd)); + const entity_addrvec_t &get_cluster_addrs(int osd) const { + ceph_assert(exists(osd)); + return osd_addrs->cluster_addrs[osd] ? + *osd_addrs->cluster_addrs[osd] : _blank_addrvec; } - entity_inst_t get_hb_back_inst(int osd) const { - assert(is_up(osd)); - return entity_inst_t(entity_name_t::OSD(osd), get_hb_back_addr(osd)); + const entity_addrvec_t &get_hb_back_addrs(int osd) const { + ceph_assert(exists(osd)); + return osd_addrs->hb_back_addrs[osd] ? + *osd_addrs->hb_back_addrs[osd] : _blank_addrvec; } - entity_inst_t get_hb_front_inst(int osd) const { - assert(is_up(osd)); - return entity_inst_t(entity_name_t::OSD(osd), get_hb_front_addr(osd)); + const entity_addrvec_t &get_hb_front_addrs(int osd) const { + ceph_assert(exists(osd)); + return osd_addrs->hb_front_addrs[osd] ? + *osd_addrs->hb_front_addrs[osd] : _blank_addrvec; } const uuid_d& get_uuid(int osd) const { - assert(exists(osd)); + ceph_assert(exists(osd)); return (*osd_uuid)[osd]; } const epoch_t& get_up_from(int osd) const { - assert(exists(osd)); + ceph_assert(exists(osd)); return osd_info[osd].up_from; } const epoch_t& get_up_thru(int osd) const { - assert(exists(osd)); + ceph_assert(exists(osd)); return osd_info[osd].up_thru; } const epoch_t& get_down_at(int osd) const { - assert(exists(osd)); + ceph_assert(exists(osd)); return osd_info[osd].down_at; } const osd_info_t& get_info(int osd) const { - assert(osd < max_osd); + ceph_assert(osd < max_osd); return osd_info[osd]; } const osd_xinfo_t& get_xinfo(int osd) const { - assert(osd < max_osd); + ceph_assert(osd < max_osd); return osd_xinfo[osd]; } @@ -602,11 +1080,18 @@ public: return -1; } + + void get_random_up_osds_by_subtree(int n, // whoami + std::string &subtree, + int limit, // how many + std::set skip, + std::set *want) const; + /** * get feature bits required by the current structure * * @param entity_type [in] what entity type we are asking about - * @param mask [out] set of all possible map-related features we could set + * @param mask [out] std::set of all possible map-related features we could std::set * @return feature bits used by this map */ uint64_t get_features(int entity_type, uint64_t *mask) const; @@ -615,39 +1100,59 @@ public: * get oldest *client* version (firefly, hammer, etc.) that can connect given * the feature bits required (according to get_features()). */ - pair get_min_compat_client() const; + ceph_release_t get_min_compat_client() const; + + /** + * gets the required minimum *client* version that can connect to the cluster. 
+ */ + ceph_release_t get_require_min_compat_client() const; /** * get intersection of features supported by up osds */ uint64_t get_up_osd_features() const; + void get_upmap_pgs(std::vector *upmap_pgs) const; + bool check_pg_upmaps( + CephContext *cct, + const std::vector& to_check, + std::vector *to_cancel, + std::map>> *to_remap) const; + void clean_pg_upmaps( + CephContext *cct, + Incremental *pending_inc, + const std::vector& to_cancel, + const std::map>>& to_remap) const; + bool clean_pg_upmaps(CephContext *cct, Incremental *pending_inc) const; + int apply_incremental(const Incremental &inc); /// try to re-use/reference addrs in oldmap from newmap static void dedup(const OSDMap *oldmap, OSDMap *newmap); - static void clean_temps(CephContext *cct, const OSDMap& osdmap, + static void clean_temps(CephContext *cct, + const OSDMap& oldmap, + const OSDMap& nextmap, Incremental *pending_inc); // serialize, unserialize private: - void encode_client_old(bufferlist& bl) const; - void encode_classic(bufferlist& bl, uint64_t features) const; - void decode_classic(bufferlist::iterator& p); + void encode_client_old(ceph::buffer::list& bl) const; + void encode_classic(ceph::buffer::list& bl, uint64_t features) const; + void decode_classic(ceph::buffer::list::const_iterator& p); void post_decode(); public: - void encode(bufferlist& bl, uint64_t features=CEPH_FEATURES_ALL) const; - void decode(bufferlist& bl); - void decode(bufferlist::iterator& bl); + void encode(ceph::buffer::list& bl, uint64_t features=CEPH_FEATURES_ALL) const; + void decode(ceph::buffer::list& bl); + void decode(ceph::buffer::list::const_iterator& bl); /**** mapping facilities ****/ int map_to_pg( int64_t pool, - const string& name, - const string& key, - const string& nspace, + const std::string& name, + const std::string& key, + const std::string& nspace, pg_t *pg) const; int object_locator_to_pg(const object_t& oid, const object_locator_t& loc, pg_t &pg) const; @@ -655,7 +1160,7 @@ public: const object_locator_t& loc) const { pg_t pg; int ret = object_locator_to_pg(oid, loc, pg); - assert(ret == 0); + ceph_assert(ret == 0); return pg; } @@ -670,12 +1175,12 @@ public: } ceph_object_layout make_object_layout(object_t oid, int pg_pool, - string nspace) const; + std::string nspace) const; int get_pg_num(int pg_pool) const { const pg_pool_t *pool = get_pg_pool(pg_pool); - assert(NULL != pool); + ceph_assert(NULL != pool); return pool->get_pg_num(); } @@ -684,24 +1189,51 @@ public: return p && pgid.ps() < p->get_pg_num(); } + int get_pg_pool_min_size(pg_t pgid) const { + if (!pg_exists(pgid)) { + return -ENOENT; + } + const pg_pool_t *p = get_pg_pool(pgid.pool()); + ceph_assert(p); + return p->get_min_size(); + } + + int get_pg_pool_size(pg_t pgid) const { + if (!pg_exists(pgid)) { + return -ENOENT; + } + const pg_pool_t *p = get_pg_pool(pgid.pool()); + ceph_assert(p); + return p->get_size(); + } + + int get_pg_pool_crush_rule(pg_t pgid) const { + if (!pg_exists(pgid)) { + return -ENOENT; + } + const pg_pool_t *p = get_pg_pool(pgid.pool()); + ceph_assert(p); + return p->get_crush_rule(); + } + private: - /// pg -> (raw osd list) - int _pg_to_raw_osds( + /// pg -> (raw osd std::list) + void _pg_to_raw_osds( const pg_pool_t& pool, pg_t pg, - vector *osds, + std::vector *osds, ps_t *ppps) const; - int _pick_primary(const vector& osds) const; - void _remove_nonexistent_osds(const pg_pool_t& pool, vector& osds) const; + int _pick_primary(const std::vector& osds) const; + void _remove_nonexistent_osds(const pg_pool_t& pool, std::vector& 
osds) const; void _apply_primary_affinity(ps_t seed, const pg_pool_t& pool, - vector *osds, int *primary) const; + std::vector *osds, int *primary) const; /// apply pg_upmap[_items] mappings - void _apply_remap(const pg_pool_t& pi, pg_t pg, vector *raw) const; + void _apply_upmap(const pg_pool_t& pi, pg_t pg, std::vector *raw) const; - /// pg -> (up osd list) - void _raw_to_up_osds(const pg_pool_t& pool, const vector& raw, - vector *up) const; + /// pg -> (up osd std::list) + void _raw_to_up_osds(const pg_pool_t& pool, const std::vector& raw, + std::vector *up) const; /** @@ -711,13 +1243,13 @@ private: * from the pg_temp (if specified), or -1 if you should use the calculated (up_)primary. */ void _get_temp_osds(const pg_pool_t& pool, pg_t pg, - vector *temp_pg, int *temp_primary) const; + std::vector *temp_pg, int *temp_primary) const; /** * map to up and acting. Fills in whatever fields are non-NULL. */ - void _pg_to_up_acting_osds(const pg_t& pg, vector *up, int *up_primary, - vector *acting, int *acting_primary, + void _pg_to_up_acting_osds(const pg_t& pg, std::vector *up, int *up_primary, + std::vector *acting, int *acting_primary, bool raw_pg_to_pg = true) const; public: @@ -727,21 +1259,22 @@ public: * by anybody for data mapping purposes. * raw and primary must be non-NULL */ - int pg_to_raw_osds(pg_t pg, vector *raw, int *primary) const; + void pg_to_raw_osds(pg_t pg, std::vector *raw, int *primary) const; + void pg_to_raw_upmap(pg_t pg, std::vector *raw, + std::vector *raw_upmap) const; /// map a pg to its acting set. @return acting set size - int pg_to_acting_osds(const pg_t& pg, vector *acting, + void pg_to_acting_osds(const pg_t& pg, std::vector *acting, int *acting_primary) const { _pg_to_up_acting_osds(pg, NULL, NULL, acting, acting_primary); - return acting->size(); } - int pg_to_acting_osds(pg_t pg, vector& acting) const { + void pg_to_acting_osds(pg_t pg, std::vector& acting) const { return pg_to_acting_osds(pg, &acting, NULL); } /** * This does not apply temp overrides and should not be used * by anybody for data mapping purposes. Specify both pointers. */ - void pg_to_raw_up(pg_t pg, vector *up, int *primary) const; + void pg_to_raw_up(pg_t pg, std::vector *up, int *primary) const; /** * map a pg to its acting set as well as its up set. You must use * the acting set for data mapping purposes, but some users will @@ -749,30 +1282,30 @@ public: * set as pg_temp. * Each of these pointers must be non-NULL. 
*/ - void pg_to_up_acting_osds(pg_t pg, vector *up, int *up_primary, - vector *acting, int *acting_primary) const { + void pg_to_up_acting_osds(pg_t pg, std::vector *up, int *up_primary, + std::vector *acting, int *acting_primary) const { _pg_to_up_acting_osds(pg, up, up_primary, acting, acting_primary); } - void pg_to_up_acting_osds(pg_t pg, vector& up, vector& acting) const { + void pg_to_up_acting_osds(pg_t pg, std::vector& up, std::vector& acting) const { int up_primary, acting_primary; pg_to_up_acting_osds(pg, &up, &up_primary, &acting, &acting_primary); } bool pg_is_ec(pg_t pg) const { auto i = pools.find(pg.pool()); - assert(i != pools.end()); - return i->second.ec_pool(); + ceph_assert(i != pools.end()); + return i->second.is_erasure(); } bool get_primary_shard(const pg_t& pgid, spg_t *out) const { auto i = get_pools().find(pgid.pool()); if (i == get_pools().end()) { return false; } - if (!i->second.ec_pool()) { + if (!i->second.is_erasure()) { *out = spg_t(pgid); return true; } int primary; - vector acting; + std::vector acting; pg_to_acting_osds(pgid, &acting, &primary); for (uint8_t i = 0; i < acting.size(); ++i) { if (acting[i] == primary) { @@ -782,8 +1315,49 @@ public: } return false; } + bool get_primary_shard(const pg_t& pgid, int *primary, spg_t *out) const { + auto i = get_pools().find(pgid.pool()); + if (i == get_pools().end()) { + return false; + } + std::vector acting; + pg_to_acting_osds(pgid, &acting, primary); + if (i->second.is_erasure()) { + for (uint8_t i = 0; i < acting.size(); ++i) { + if (acting[i] == *primary) { + *out = spg_t(pgid, shard_id_t(i)); + return true; + } + } + } else { + *out = spg_t(pgid); + return true; + } + return false; + } - int64_t lookup_pg_pool_name(const string& name) const { + bool in_removed_snaps_queue(int64_t pool, snapid_t snap) const { + auto p = removed_snaps_queue.find(pool); + if (p == removed_snaps_queue.end()) { + return false; + } + return p->second.contains(snap); + } + + const mempool::osdmap::map& + get_removed_snaps_queue() const { + return removed_snaps_queue; + } + const mempool::osdmap::map& + get_new_removed_snaps() const { + return new_removed_snaps; + } + const mempool::osdmap::map& + get_new_purged_snaps() const { + return new_purged_snaps; + } + + int64_t lookup_pg_pool_name(std::string_view name) const { auto p = name_pool.find(name); if (p == name_pool.end()) return -ENOENT; @@ -799,11 +1373,25 @@ public: mempool::osdmap::map& get_pools() { return pools; } - const string& get_pool_name(int64_t p) const { + void get_pool_ids_by_rule(int rule_id, std::set *pool_ids) const { + ceph_assert(pool_ids); + for (auto &p: pools) { + if (p.second.get_crush_rule() == rule_id) { + pool_ids->insert(p.first); + } + } + } + void get_pool_ids_by_osd(CephContext *cct, + int osd, + std::set *pool_ids) const; + const std::string& get_pool_name(int64_t p) const { auto i = pool_name.find(p); - assert(i != pool_name.end()); + ceph_assert(i != pool_name.end()); return i->second; } + const mempool::osdmap::map& get_pool_names() const { + return pool_name; + } bool have_pg_pool(int64_t p) const { return pools.count(p); } @@ -815,19 +1403,25 @@ public: } unsigned get_pg_size(pg_t pg) const { auto p = pools.find(pg.pool()); - assert(p != pools.end()); + ceph_assert(p != pools.end()); return p->second.get_size(); } int get_pg_type(pg_t pg) const { auto p = pools.find(pg.pool()); - assert(p != pools.end()); + ceph_assert(p != pools.end()); return p->second.get_type(); } + int get_pool_crush_rule(int64_t pool_id) const { + auto pool = 
get_pg_pool(pool_id); + if (!pool) + return -ENOENT; + return pool->get_crush_rule(); + } pg_t raw_pg_to_pg(pg_t pg) const { auto p = pools.find(pg.pool()); - assert(p != pools.end()); + ceph_assert(p != pools.end()); return p->second.raw_pg_to_pg(pg); } @@ -842,76 +1436,240 @@ public: * check whether an spg_t maps to a particular osd */ bool is_up_acting_osd_shard(spg_t pg, int osd) const { - vector up, acting; + std::vector up, acting; _pg_to_up_acting_osds(pg.pgid, &up, NULL, &acting, NULL, false); - if (pg.shard == shard_id_t::NO_SHARD) { - if (calc_pg_role(osd, acting, acting.size()) >= 0 || - calc_pg_role(osd, up, up.size()) >= 0) - return true; - } else { - if (pg.shard < (int)acting.size() && acting[pg.shard] == osd) - return true; - if (pg.shard < (int)up.size() && up[pg.shard] == osd) - return true; + if (calc_pg_role(pg_shard_t(osd, pg.shard), acting) >= 0 || + calc_pg_role(pg_shard_t(osd, pg.shard), up) >= 0) { + return true; } return false; } - /* what replica # is a given osd? 0 primary, -1 for none. */ - static int calc_pg_rank(int osd, const vector& acting, int nrep=0); - static int calc_pg_role(int osd, const vector& acting, int nrep=0); - static bool primary_changed( + static int calc_pg_role_broken(int osd, const std::vector& acting, int nrep=0); + static int calc_pg_role(pg_shard_t who, const std::vector& acting); + static bool primary_changed_broken( int oldprimary, - const vector &oldacting, + const std::vector &oldacting, int newprimary, - const vector &newacting); + const std::vector &newacting); /* rank is -1 (stray), 0 (primary), 1,2,3,... (replica) */ - int get_pg_acting_rank(pg_t pg, int osd) const { - vector group; - int nrep = pg_to_acting_osds(pg, group); - return calc_pg_rank(osd, group, nrep); - } - /* role is -1 (stray), 0 (primary), 1 (replica) */ - int get_pg_acting_role(const pg_t& pg, int osd) const { - vector group; - int nrep = pg_to_acting_osds(pg, group); - return calc_pg_role(osd, group, nrep); + int get_pg_acting_role(spg_t pg, int osd) const { + std::vector group; + pg_to_acting_osds(pg.pgid, group); + return calc_pg_role(pg_shard_t(osd, pg.shard), group); } - bool osd_is_valid_op_target(pg_t pg, int osd) const { - int primary; - vector group; - int nrep = pg_to_acting_osds(pg, &group, &primary); - if (osd == primary) - return true; - if (pg_is_ec(pg)) - return false; - - return calc_pg_role(osd, group, nrep) >= 0; - } + bool try_pg_upmap( + CephContext *cct, + pg_t pg, ///< pg to potentially remap + const std::set& overfull, ///< osds we'd want to evacuate + const std::vector& underfull, ///< osds to move to, in order of preference + const std::vector& more_underfull, ///< less full osds to move to, in order of preference + std::vector *orig, + std::vector *out); ///< resulting alternative mapping - int clean_pg_upmaps( + int balance_primaries( CephContext *cct, - Incremental *pending_inc); + int64_t pid, + Incremental *pending_inc, + OSDMap& tmp_osd_map) const; - bool try_pg_upmap( + int calc_desired_primary_distribution( CephContext *cct, - pg_t pg, ///< pg to potentially remap - const set& overfull, ///< osds we'd want to evacuate - const vector& underfull, ///< osds to move to, in order of preference - vector *orig, - vector *out); ///< resulting alternative mapping + int64_t pid, // pool id + const std::vector &osds, + std::map& desired_primary_distribution) const; // vector of osd ids int calc_pg_upmaps( CephContext *cct, - float max_deviation, ///< max deviation from target (value < 1.0) + uint32_t max_deviation, ///< max deviation from 
target (value >= 1) int max_iterations, ///< max iterations to run - const set& pools, ///< [optional] restrict to pool - Incremental *pending_inc + const std::set& pools, ///< [optional] restrict to pool + Incremental *pending_inc, + std::random_device::result_type *p_seed = nullptr ///< [optional] for regression tests ); + std::map> get_pgs_by_osd( + CephContext *cct, + int64_t pid, + std::map> *p_primaries_by_osd = nullptr, + std::map> *p_acting_primaries_by_osd = nullptr + ) const; // used in calc_desired_primary_distribution() + +private: // Bunch of internal functions used only by calc_pg_upmaps (result of code refactoring) + + float get_osds_weight( + CephContext *cct, + const OSDMap& tmp_osd_map, + int64_t pid, + std::map& osds_weight + ) const; + + float build_pool_pgs_info ( + CephContext *cct, + const std::set& pools, ///< [optional] restrict to pool + const OSDMap& tmp_osd_map, + int& total_pgs, + std::map>& pgs_by_osd, + std::map& osds_weight + ); // return total weight of all OSDs + + float calc_deviations ( + CephContext *cct, + const std::map>& pgs_by_osd, + const std::map& osd_weight, + float pgs_per_weight, + std::map& osd_deviation, + std::multimap& deviation_osd, + float& stddev + ); // return current max deviation + + void fill_overfull_underfull ( + CephContext *cct, + const std::multimap& deviation_osd, + int max_deviation, + std::set& overfull, + std::set& more_overfull, + std::vector& underfull, + std::vector& more_underfull + ); + + int pack_upmap_results( + CephContext *cct, + const std::set& to_unmap, + const std::map>>& to_upmap, + OSDMap& tmp_osd_map, + OSDMap::Incremental *pending_inc + ); + + std::default_random_engine get_random_engine( + CephContext *cct, + std::random_device::result_type *p_seed + ); + + bool try_drop_remap_overfull( + CephContext *cct, + const std::vector& pgs, + const OSDMap& tmp_osd_map, + int osd, + std::map>& temp_pgs_by_osd, + std::set& to_unmap, + std::map>>& to_upmap + ); + +typedef std::vector>>> + candidates_t; + +bool try_drop_remap_underfull( + CephContext *cct, + const candidates_t& candidates, + int osd, + std::map>& temp_pgs_by_osd, + std::set& to_unmap, + std::map>>& to_upmap + ); + + void add_remap_pair( + CephContext *cct, + int orig, + int out, + pg_t pg, + size_t pg_pool_size, + int osd, + std::set& existing, + std::map>& temp_pgs_by_osd, + mempool::osdmap::vector> new_upmap_items, + std::map>>& to_upmap + ); + + int find_best_remap ( + CephContext *cct, + const std::vector& orig, + const std::vector& out, + const std::set& existing, + const std::map osd_deviation + ); + + candidates_t build_candidates( + CephContext *cct, + const OSDMap& tmp_osd_map, + const std::set to_skip, + const std::set& only_pools, + bool aggressive, + std::random_device::result_type *p_seed + ); + +public: + typedef struct { + float pa_avg; + float pa_weighted; + float pa_weighted_avg; + float raw_score; + float optimal_score; // based on primary_affinity values + float adjusted_score; // based on raw_score and pa_avg 1 is optimal + float acting_raw_score; // based on active_primaries (temporary) + float acting_adj_score; // based on raw_active_score and pa_avg 1 is optimal + std::string err_msg; + } read_balance_info_t; + // + // This function calculates scores about the cluster read balance state + // p_rb_info->acting_adj_score is the current read balance score (acting) + // p_rb_info->adjusted_score is the stable read balance score + // Return value of 0 is OK, negative means an error (may happen with + // some arifically generated osamap 
files) + // + int calc_read_balance_score( + CephContext *cct, + int64_t pool_id, + read_balance_info_t *p_rb_info) const; + +private: + float rbi_round(float f) const { + return (f > 0.0) ? floor(f * 100 + 0.5) / 100 : ceil(f * 100 - 0.5) / 100; + } + + int64_t has_zero_pa_pgs( + CephContext *cct, + int64_t pool_id) const; + + void zero_rbi( + read_balance_info_t &rbi + ) const; + + int set_rbi( + CephContext *cct, + read_balance_info_t &rbi, + int64_t pool_id, + float total_w_pa, + float pa_sum, + int num_osds, + int osd_pa_count, + float total_osd_weight, + uint max_prims_per_osd, + uint max_acting_prims_per_osd, + float avg_prims_per_osd, + bool prim_on_zero_pa, + bool acting_on_zero_pa, + float max_osd_score) const; + +public: + int get_osds_by_bucket_name(const std::string &name, std::set *osds) const; + + bool have_pg_upmaps(pg_t pg) const { + return pg_upmap.count(pg) || + pg_upmap_items.count(pg); + } + + bool check_full(const std::set &missing_on) const { + for (auto shard : missing_on) { + if (get_state(shard.osd) & CEPH_OSD_FULL) + return true; + } + return false; + } + /* * handy helpers to build simple maps... */ @@ -927,19 +1685,34 @@ public: * @param num_osd [in] number of OSDs if >= 0 or read from conf if < 0 * @return **0** on success, negative errno on error. */ +private: + int build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid, + int num_osd, int pg_bits, int pgp_bits, + bool default_pool); +public: int build_simple(CephContext *cct, epoch_t e, uuid_d &fsid, - int num_osd, int pg_bits, int pgp_bits); + int num_osd) { + return build_simple_optioned(cct, e, fsid, num_osd, 0, 0, false); + } + int build_simple_with_pool(CephContext *cct, epoch_t e, uuid_d &fsid, + int num_osd, int pg_bits, int pgp_bits) { + return build_simple_optioned(cct, e, fsid, num_osd, + pg_bits, pgp_bits, true); + } static int _build_crush_types(CrushWrapper& crush); static int build_simple_crush_map(CephContext *cct, CrushWrapper& crush, - int num_osd, ostream *ss); + int num_osd, std::ostream *ss); static int build_simple_crush_map_from_conf(CephContext *cct, CrushWrapper& crush, - ostream *ss); - static int build_simple_crush_rulesets(CephContext *cct, CrushWrapper& crush, - const string& root, - ostream *ss); + std::ostream *ss); + static int build_simple_crush_rules( + CephContext *cct, CrushWrapper& crush, + const std::string& root, + std::ostream *ss); + + bool crush_rule_in_use(int rule_id) const; - bool crush_ruleset_in_use(int ruleset) const; + int validate_crush_rules(CrushWrapper *crush, std::ostream *ss) const; void clear_temp() { pg_temp->clear(); @@ -947,38 +1720,79 @@ public: } private: - void print_osd_line(int cur, ostream *out, Formatter *f) const; + void print_osd_line(int cur, std::ostream *out, ceph::Formatter *f) const; public: - void print(ostream& out) const; - void print_pools(ostream& out) const; - void print_summary(Formatter *f, ostream& out) const; - void print_oneline_summary(ostream& out) const; - void print_tree(Formatter *f, ostream *out) const; + void print(CephContext *cct, std::ostream& out) const; + void print_osd(int id, std::ostream& out) const; + void print_osds(std::ostream& out) const; + void print_pools(CephContext *cct, std::ostream& out) const; + void print_summary(ceph::Formatter *f, std::ostream& out, + const std::string& prefix, bool extra=false) const; + void print_oneline_summary(std::ostream& out) const; + + enum { + DUMP_IN = 1, // only 'in' osds + DUMP_OUT = 2, // only 'out' osds + DUMP_UP = 4, // only 'up' osds + DUMP_DOWN = 8, // 
only 'down' osds + DUMP_DESTROYED = 16, // only 'destroyed' osds + }; + void print_tree(ceph::Formatter *f, std::ostream *out, + unsigned dump_flags=0, std::string bucket="") const; int summarize_mapping_stats( OSDMap *newmap, - const set *pools, + const std::set *pools, std::string *out, - Formatter *f) const; + ceph::Formatter *f) const; - string get_flag_string() const; - static string get_flag_string(unsigned flags); + std::string get_flag_string() const; + static std::string get_flag_string(unsigned flags); static void dump_erasure_code_profiles( - const mempool::osdmap::map > &profiles, - Formatter *f); - void dump(Formatter *f) const; - static void generate_test_instances(list& o); - bool check_new_blacklist_entries() const { return new_blacklist_entries; } + const mempool::osdmap::map > &profiles, + ceph::Formatter *f); + void dump(ceph::Formatter *f, CephContext *cct = nullptr) const; + void dump_osd(int id, ceph::Formatter *f) const; + void dump_osds(ceph::Formatter *f) const; + void dump_pool(CephContext *cct, int64_t pid, const pg_pool_t &pdata, ceph::Formatter *f) const; + void dump_read_balance_score(CephContext *cct, int64_t pid, const pg_pool_t &pdata, ceph::Formatter *f) const; + static void generate_test_instances(std::list& o); + bool check_new_blocklist_entries() const { return new_blocklist_entries; } + + void check_health(CephContext *cct, health_check_map_t *checks) const; + + int parse_osd_id_list(const std::vector& ls, + std::set *out, + std::ostream *ss) const; + + float pool_raw_used_rate(int64_t poolid) const; + std::optional pending_require_osd_release() const; + }; WRITE_CLASS_ENCODER_FEATURES(OSDMap) WRITE_CLASS_ENCODER_FEATURES(OSDMap::Incremental) -typedef ceph::shared_ptr OSDMapRef; +#ifdef WITH_SEASTAR +#include "crimson/common/local_shared_foreign_ptr.h" +using LocalOSDMapRef = boost::local_shared_ptr; +using OSDMapRef = crimson::local_shared_foreign_ptr; +#else +using OSDMapRef = std::shared_ptr; +#endif + -inline ostream& operator<<(ostream& out, const OSDMap& m) { +inline std::ostream& operator<<(std::ostream& out, const OSDMap& m) { m.print_oneline_summary(out); return out; } +class PGMap; + +void print_osd_utilization(const OSDMap& osdmap, + const PGMap& pgmap, + std::ostream& out, + ceph::Formatter *f, + bool tree, + const std::string& filter); #endif
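For orientation, the mapping methods declared above apply their overrides in a fixed order: CRUSH produces the raw mapping, pg_upmap/pg_upmap_items rewrite it (post-CRUSH, pre-up, per the "remap" comments), and pg_temp replaces the whole acting set while data is rebuilt. A standalone toy model of that pipeline (the ToyMap type and its modulo "crush" are illustrative inventions; only the override order reflects the header's comments):

// mapping_pipeline.cc -- toy model of the override pipeline behind
// OSDMap::_pg_to_up_acting_osds: raw mapping, then pg_upmap_items swaps,
// then pg_temp as a full acting-set override.
#include <cassert>
#include <cstdint>
#include <map>
#include <utility>
#include <vector>

using pg = uint64_t;

struct ToyMap {
  std::map<pg, std::vector<std::pair<int,int>>> pg_upmap_items; // from->to swaps
  std::map<pg, std::vector<int>> pg_temp;                       // full override

  std::vector<int> raw(pg p) const {           // stand-in for CRUSH
    int base = static_cast<int>(p % 5);
    return {base, (base + 1) % 5, (base + 2) % 5};
  }
  std::vector<int> up(pg p) const {            // raw + upmap remaps
    auto osds = raw(p);
    if (auto it = pg_upmap_items.find(p); it != pg_upmap_items.end())
      for (auto [from, to] : it->second)
        for (int& o : osds)
          if (o == from) o = to;
    return osds;
  }
  std::vector<int> acting(pg p) const {        // up unless pg_temp overrides
    if (auto it = pg_temp.find(p); it != pg_temp.end() && !it->second.empty())
      return it->second;
    return up(p);
  }
};

int main() {
  ToyMap m;
  assert((m.up(1) == std::vector<int>{1, 2, 3}));
  m.pg_upmap_items[1] = {{2, 4}};              // evacuate osd.2 for pg 1
  assert((m.up(1) == std::vector<int>{1, 4, 3}));
  m.pg_temp[1] = {3, 4, 1};                    // temp mapping while backfilling
  assert((m.acting(1) == std::vector<int>{3, 4, 1}));
}

This is also why clients must use the acting set for data placement, as the pg_to_up_acting_osds comment above insists: the up set alone ignores any pg_temp still in force.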