#ifndef CEPH_OSDMAP_H
#define CEPH_OSDMAP_H
+#include "include/cpp-btree/btree_map.h"
+
/*
* describe properties of the OSD cluster.
* disks, disk groups, total # osds,
// forward declaration
class CephContext;
class CrushWrapper;
+class health_check_map_t;
// FIXME C++11 does not have std::equal for two differently-typed containers.
// use this until we move to c++14
ostream& operator<<(ostream& out, const osd_xinfo_t& xi);
+/**
+ * PGTempMap
+ *
+ * Compact in-memory representation of the OSDMap pg_temp table
+ * (pg -> temporary osd list, e.g. while a pg rebuilds).
+ *
+ * The #if 1 variant packs every entry into one bufferlist ('data') as
+ * consecutive length-prefixed runs of int32_t:
+ *
+ *   [count][osd0][osd1]...[osd(count-1)]
+ *
+ * and keeps a btree index ('map') of raw pointers to each run's count
+ * word.  Those pointers reference memory owned by 'data', so anything
+ * that rebuilds 'data' must re-point 'map' (decode() and rebuild() do
+ * this), and each run must live in a single contiguous buffer (see
+ * set()).
+ *
+ * The #else variant is the trivial map-of-vectors implementation kept
+ * for reference/debugging; both expose the same interface.
+ */
+struct PGTempMap {
+#if 1
+  bufferlist data;
+  typedef btree::btree_map<pg_t,int32_t*> map_t;
+  map_t map;
+
+  void encode(bufferlist& bl) const {
+    // entry count, then per entry: pg_t key followed by the raw
+    // [count][values...] run copied verbatim out of 'data'
+    uint32_t n = map.size();
+    ::encode(n, bl);
+    for (auto &p : map) {
+      ::encode(p.first, bl);
+      bl.append((char*)p.second, (*p.second + 1) * sizeof(int32_t));
+    }
+  }
+  void decode(bufferlist::iterator& p) {
+    data.clear();
+    map.clear();
+    uint32_t n;
+    ::decode(n, p);
+    if (!n)
+      return;
+    bufferlist::iterator pstart = p;
+    size_t start_off = pstart.get_off();
+    // first pass: record the offset of each entry's count word
+    // (offsets are relative to the start of the encoded region)
+    vector<pair<pg_t,size_t>> offsets;
+    offsets.resize(n);
+    for (unsigned i=0; i<n; ++i) {
+      pg_t pgid;
+      ::decode(pgid, p);
+      offsets[i].first = pgid;
+      offsets[i].second = p.get_off() - start_off;
+      uint32_t vn;
+      ::decode(vn, p);
+      p.advance(vn * sizeof(int32_t));
+    }
+    // grab the whole encoded region in one piece...
+    size_t len = p.get_off() - start_off;
+    pstart.copy(len, data);
+    if (data.get_num_buffers() > 1) {
+      data.rebuild();  // the pointers below require contiguous memory
+    }
+    // ...then point the index at the runs inside it
+    //map.reserve(n);
+    char *start = data.c_str();
+    for (auto i : offsets) {
+      map.insert(map.end(), make_pair(i.first, (int32_t*)(start + i.second)));
+    }
+  }
+  void rebuild() {
+    // round-trip through encode/decode to compact 'data' (dropping any
+    // padding or erased entries) and refresh the pointers in 'map'
+    bufferlist bl;
+    encode(bl);
+    auto p = bl.begin();
+    decode(p);
+  }
+  friend bool operator==(const PGTempMap& l, const PGTempMap& r) {
+    // NOTE(review): this compares the raw blobs; padding introduced by
+    // set() can make logically-equal maps compare unequal until
+    // rebuild() has normalized both sides — confirm callers expect this.
+    return
+      l.map.size() == r.map.size() &&
+      l.data.contents_equal(r.data);
+  }
+
+  /// const forward iterator; materializes each entry as a
+  /// pair<pg_t,vector<int32_t>> in 'current'
+  class iterator {
+    map_t::const_iterator it;
+    map_t::const_iterator end;
+    pair<pg_t,vector<int32_t>> current;
+    void init_current() {
+      if (it != end) {
+        current.first = it->first;
+        assert(it->second);
+        current.second.resize(*it->second);
+        int32_t *p = it->second + 1;
+        for (int n = 0; n < *it->second; ++n, ++p) {
+          current.second[n] = *p;
+        }
+      }
+    }
+  public:
+    iterator(map_t::const_iterator p,
+             map_t::const_iterator e)
+      : it(p), end(e) {
+      init_current();
+    }
+
+    const pair<pg_t,vector<int32_t>>& operator*() const {
+      return current;
+    }
+    const pair<pg_t,vector<int32_t>>* operator->() const {
+      return &current;
+    }
+    friend bool operator==(const iterator& l, const iterator& r) {
+      return l.it == r.it;
+    }
+    friend bool operator!=(const iterator& l, const iterator& r) {
+      return l.it != r.it;
+    }
+    iterator& operator++() {
+      ++it;
+      if (it != end)
+        init_current();
+      return *this;
+    }
+    iterator operator++(int) {
+      iterator r = *this;
+      ++it;
+      if (it != end)
+        init_current();
+      return r;
+    }
+  };
+  iterator begin() const {
+    return iterator(map.begin(), map.end());
+  }
+  iterator end() const {
+    return iterator(map.end(), map.end());
+  }
+  iterator find(pg_t pgid) const {
+    return iterator(map.find(pgid), map.end());
+  }
+  size_t size() const {
+    return map.size();
+  }
+  size_t count(pg_t pgid) const {
+    return map.count(pgid);
+  }
+  void erase(pg_t pgid) {
+    // note: only drops the index entry; the run stays in 'data' until
+    // the next rebuild()
+    map.erase(pgid);
+  }
+  void clear() {
+    map.clear();
+    data.clear();
+  }
+  void set(pg_t pgid, const mempool::osdmap::vector<int32_t>& v) {
+    size_t need = sizeof(int32_t) * (1 + v.size());
+    size_t tail = data.get_append_buffer_unused_tail_length();
+    // The pointer stored below assumes the whole [count][values] run
+    // lands in one contiguous buffer.  If the current append buffer's
+    // tail cannot hold it, zero-fill the tail so the encode starts in
+    // a fresh buffer.  (Was 'need < tail', which padded when there
+    // *was* room — wasting the tail on every call — and let an
+    // oversized entry span two buffers, corrupting the stored pointer.)
+    if (need > tail) {
+      bufferptr z(tail);
+      z.zero();
+      data.append(z.c_str(), z.length());
+    }
+    ::encode(v, data);
+    map[pgid] = (int32_t*)(data.back().end_c_str()) - (1 + v.size());
+  }
+  mempool::osdmap::vector<int32_t> get(pg_t pgid) {
+    // precondition: pgid is present (mirrors the #else variant's
+    // pg_temp.at(), which would throw)
+    mempool::osdmap::vector<int32_t> v;
+    int32_t *p = map[pgid];
+    size_t n = *p++;
+    v.resize(n);
+    for (size_t i = 0; i < n; ++i, ++p) {
+      v[i] = *p;
+    }
+    return v;
+  }
+#else
+  // trivial implementation
+  mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> > pg_temp;
+
+  void encode(bufferlist& bl) const {
+    ::encode(pg_temp, bl);
+  }
+  void decode(bufferlist::iterator& p) {
+    ::decode(pg_temp, p);
+  }
+  friend bool operator==(const PGTempMap& l, const PGTempMap& r) {
+    return
+      l.pg_temp.size() == r.pg_temp.size() &&
+      l.pg_temp == r.pg_temp;
+  }
+
+  class iterator {
+    mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> >::const_iterator it;
+  public:
+    iterator(mempool::osdmap::map<pg_t,
+             mempool::osdmap::vector<int32_t> >::const_iterator p)
+      : it(p) {}
+
+    pair<pg_t,const mempool::osdmap::vector<int32_t>&> operator*() const {
+      return *it;
+    }
+    const pair<const pg_t,mempool::osdmap::vector<int32_t>>* operator->() const {
+      return &*it;
+    }
+    friend bool operator==(const iterator& l, const iterator& r) {
+      return l.it == r.it;
+    }
+    friend bool operator!=(const iterator& l, const iterator& r) {
+      return l.it != r.it;
+    }
+    iterator& operator++() {
+      ++it;
+      return *this;
+    }
+    iterator operator++(int) {
+      iterator r = *this;
+      ++it;
+      return r;
+    }
+  };
+  iterator begin() const {
+    return iterator(pg_temp.cbegin());
+  }
+  iterator end() const {
+    return iterator(pg_temp.cend());
+  }
+  iterator find(pg_t pgid) const {
+    return iterator(pg_temp.find(pgid));
+  }
+  size_t size() const {
+    return pg_temp.size();
+  }
+  size_t count(pg_t pgid) const {
+    return pg_temp.count(pgid);
+  }
+  void erase(pg_t pgid) {
+    pg_temp.erase(pgid);
+  }
+  void clear() {
+    pg_temp.clear();
+  }
+  void set(pg_t pgid, const mempool::osdmap::vector<int32_t>& v) {
+    pg_temp[pgid] = v;
+  }
+  const mempool::osdmap::vector<int32_t>& get(pg_t pgid) {
+    return pg_temp.at(pgid);
+  }
+#endif
+  /// dump all entries ("pgid" + "osds" array each) to the formatter
+  void dump(Formatter *f) const {
+    for (const auto &pg : *this) {
+      f->open_object_section("osds");
+      f->dump_stream("pgid") << pg.first;
+      f->open_array_section("osds");
+      for (const auto osd : pg.second)
+        f->dump_int("osd", osd);
+      f->close_section();
+      f->close_section();
+    }
+  }
+};
+WRITE_CLASS_ENCODER(PGTempMap)
+
/** OSDMap
*/
class OSDMap {
utime_t modified;
int64_t new_pool_max; //incremented by the OSDMonitor on each pool create
int32_t new_flags;
+ int8_t new_require_osd_release = -1;
// full (rare)
bufferlist fullmap; // in lieu of below.
mempool::osdmap::vector<string> old_erasure_code_profiles;
mempool::osdmap::map<int32_t,entity_addr_t> new_up_client;
mempool::osdmap::map<int32_t,entity_addr_t> new_up_cluster;
- mempool::osdmap::map<int32_t,uint8_t> new_state; // XORed onto previous state.
+ mempool::osdmap::map<int32_t,uint32_t> new_state; // XORed onto previous state.
mempool::osdmap::map<int32_t,uint32_t> new_weight;
mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> > new_pg_temp; // [] to remove
mempool::osdmap::map<pg_t, int32_t> new_primary_temp; // [-1] to remove
float new_backfillfull_ratio = -1;
float new_full_ratio = -1;
- string new_require_min_compat_client;
+ int8_t new_require_min_compat_client = -1;
mutable bool have_crc; ///< crc values are defined
uint32_t full_crc; ///< crc of the resulting OSDMap
/// propage update pools' snap metadata to any of their tiers
int propagate_snaps_to_tiers(CephContext *cct, const OSDMap &base);
+
+ /// collect into *osds the id of every osd with any pending state
+ /// change in new_state (cleared first); returns the number collected
+ size_t get_pending_state_osds(vector<int> *osds) {
+ assert(osds);
+ osds->clear();
+
+ for (auto &p : new_state) {
+ osds->push_back(p.first);
+ }
+
+ return osds->size();
+ }
+
+ /// true if any of the given state bit(s) are pending for this osd
+ bool pending_osd_has_state(int osd, unsigned state) {
+ return new_state.count(osd) && (new_state[osd] & state) != 0;
+ }
+
+ /// mark the given state bit(s) pending for this osd
+ void pending_osd_state_set(int osd, unsigned state) {
+ new_state[osd] |= state;
+ }
+
+ // cancel the specified pending osd state if there is any.
+ // return true on success, false otherwise.
+ bool pending_osd_state_clear(int osd, unsigned state) {
+ if (!pending_osd_has_state(osd, state)) {
+ // never has been set or already has been cancelled.
+ return false;
+ }
+
+ new_state[osd] &= ~state;
+ return true;
+ }
+
};
private:
int num_in_osd; // not saved; see calc_num_osds
int32_t max_osd;
- vector<uint8_t> osd_state;
+ vector<uint32_t> osd_state;
struct addrs_s {
mempool::osdmap::vector<ceph::shared_ptr<entity_addr_t> > client_addr;
mempool::osdmap::vector<__u32> osd_weight; // 16.16 fixed point, 0x10000 = "in", 0 = "out"
mempool::osdmap::vector<osd_info_t> osd_info;
- ceph::shared_ptr< mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> > > pg_temp; // temp pg mapping (e.g. while we rebuild)
+ ceph::shared_ptr<PGTempMap> pg_temp; // temp pg mapping (e.g. while we rebuild)
ceph::shared_ptr< mempool::osdmap::map<pg_t,int32_t > > primary_temp; // temp primary mapping (e.g. while we rebuild)
ceph::shared_ptr< mempool::osdmap::vector<__u32> > osd_primary_affinity; ///< 16.16 fixed point, 0x10000 = baseline
float full_ratio = 0, backfillfull_ratio = 0, nearfull_ratio = 0;
/// min compat client we want to support
- string require_min_compat_client;
+ uint8_t require_min_compat_client = 0; // CEPH_RELEASE_*
+
+public:
+ /// require osds to run at least this release
+ uint8_t require_osd_release = 0; // CEPH_RELEASE_*
+private:
mutable uint64_t cached_up_osd_features;
mutable bool crc_defined;
uint32_t get_crc() const { return crc; }
ceph::shared_ptr<CrushWrapper> crush; // hierarchical map
+private:
+ uint32_t crush_version = 1;
friend class OSDMonitor;
public:
OSDMap() : epoch(0),
- pool_max(-1),
+ pool_max(0),
flags(0),
num_osd(0), num_up_osd(0), num_in_osd(0),
max_osd(0),
osd_addrs(std::make_shared<addrs_s>()),
- pg_temp(std::make_shared<mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t>>>()),
+ pg_temp(std::make_shared<PGTempMap>()),
primary_temp(std::make_shared<mempool::osdmap::map<pg_t,int32_t>>()),
osd_uuid(std::make_shared<mempool::osdmap::vector<uuid_d>>()),
cluster_snapshot_epoch(0),
void deepish_copy_from(const OSDMap& o) {
*this = o;
primary_temp.reset(new mempool::osdmap::map<pg_t,int32_t>(*o.primary_temp));
- pg_temp.reset(new mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> >(*o.pg_temp));
+ pg_temp.reset(new PGTempMap(*o.pg_temp));
osd_uuid.reset(new mempool::osdmap::vector<uuid_d>(*o.osd_uuid));
if (o.osd_primary_affinity)
void set_epoch(epoch_t e);
+ /// version of the embedded crush map (starts at 1)
+ uint32_t get_crush_version() const {
+ return crush_version;
+ }
+
/* stamps etc */
const utime_t& get_created() const { return created; }
const utime_t& get_modified() const { return modified; }
bool is_blacklisted(const entity_addr_t& a) const;
void get_blacklist(list<pair<entity_addr_t,utime_t > > *bl) const;
+ void get_blacklist(std::set<entity_addr_t> *bl) const;
string get_cluster_snapshot() const {
if (cluster_snapshot_epoch == epoch)
}
void count_full_nearfull_osds(int *full, int *backfill, int *nearfull) const;
void get_full_osd_util(
- const ceph::unordered_map<int32_t,osd_stat_t> &osd_stat,
+ const mempool::pgmap::unordered_map<int32_t,osd_stat_t> &osd_stat,
map<int, float> *full,
map<int, float> *backfill,
map<int, float> *nearfull) const;
+ void get_full_osd_counts(set<int> *full, set<int> *backfill,
+ set<int> *nearfull) const;
+
+
/***** cluster state *****/
/* osds */
int get_max_osd() const { return max_osd; }
void get_all_osds(set<int32_t>& ls) const;
void get_up_osds(set<int32_t>& ls) const;
+ void get_out_osds(set<int32_t>& ls) const;
unsigned get_num_pg_temp() const {
return pg_temp->size();
}
return osd >= 0 && osd < max_osd && (osd_state[osd] & CEPH_OSD_EXISTS);
}
+ /// true if the osd exists and has the DESTROYED state bit set
+ bool is_destroyed(int osd) const {
+ return exists(osd) && (osd_state[osd] & CEPH_OSD_DESTROYED);
+ }
+
bool is_up(int osd) const {
return exists(osd) && (osd_state[osd] & CEPH_OSD_UP);
}
return !is_out(osd);
}
+ /// true if the osd exists and has the NOUP flag set
+ bool is_noup(int osd) const {
+ return exists(osd) && (osd_state[osd] & CEPH_OSD_NOUP);
+ }
+
+ /// true if the osd exists and has the NODOWN flag set
+ bool is_nodown(int osd) const {
+ return exists(osd) && (osd_state[osd] & CEPH_OSD_NODOWN);
+ }
+
+ /// true if the osd exists and has the NOIN flag set
+ bool is_noin(int osd) const {
+ return exists(osd) && (osd_state[osd] & CEPH_OSD_NOIN);
+ }
+
+ /// true if the osd exists and has the NOOUT flag set
+ bool is_noout(int osd) const {
+ return exists(osd) && (osd_state[osd] & CEPH_OSD_NOOUT);
+ }
+
+  /// collect into *osds (cleared first) every osd id in [0, max_osd)
+  /// satisfying pred; shared helper for the get_no*_osds() accessors
+  template<typename Pred>
+  void _get_flagged_osds(vector<int> *osds, Pred pred) const {
+    assert(osds);
+    osds->clear();
+
+    for (int i = 0; i < max_osd; i++) {
+      if (pred(i)) {
+        osds->push_back(i);
+      }
+    }
+  }
+
+  /// collect all osds with the NOUP flag set
+  void get_noup_osds(vector<int> *osds) const {
+    _get_flagged_osds(osds, [this](int i) { return is_noup(i); });
+  }
+
+  /// collect all osds with the NODOWN flag set
+  void get_nodown_osds(vector<int> *osds) const {
+    _get_flagged_osds(osds, [this](int i) { return is_nodown(i); });
+  }
+
+  /// collect all osds with the NOIN flag set
+  void get_noin_osds(vector<int> *osds) const {
+    _get_flagged_osds(osds, [this](int i) { return is_noin(i); });
+  }
+
+  /// collect all osds with the NOOUT flag set
+  void get_noout_osds(vector<int> *osds) const {
+    _get_flagged_osds(osds, [this](int i) { return is_noout(i); });
+  }
+
/**
* check if an entire crush subtree is down
*/
bool subtree_is_down(int id, set<int> *down_cache) const;
bool containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, set<int> *down_cache) const;
+ bool subtree_type_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_in_osds, set<int> *up_in_osds,
+ set<int> *subtree_up, unordered_map<int, set<int> > *subtree_type_down) const;
+
int identify_osd(const entity_addr_t& addr) const;
int identify_osd(const uuid_d& u) const;
int identify_osd_on_all_channels(const entity_addr_t& addr) const;
* get oldest *client* version (firefly, hammer, etc.) that can connect given
* the feature bits required (according to get_features()).
*/
- pair<string,string> get_min_compat_client() const;
+ uint8_t get_min_compat_client() const;
/**
* get intersection of features supported by up osds
return p && pgid.ps() < p->get_pg_num();
}
+ /// min_size of the pool containing pgid, or -ENOENT if the pg
+ /// does not exist
+ int get_pg_pool_min_size(pg_t pgid) const {
+ if (!pg_exists(pgid)) {
+ return -ENOENT;
+ }
+ const pg_pool_t *p = get_pg_pool(pgid.pool());
+ assert(p);
+ return p->get_min_size();
+ }
+
+ /// size of the pool containing pgid, or -ENOENT if the pg
+ /// does not exist
+ int get_pg_pool_size(pg_t pgid) const {
+ if (!pg_exists(pgid)) {
+ return -ENOENT;
+ }
+ const pg_pool_t *p = get_pg_pool(pgid.pool());
+ assert(p);
+ return p->get_size();
+ }
+
private:
/// pg -> (raw osd list)
- int _pg_to_raw_osds(
+ void _pg_to_raw_osds(
const pg_pool_t& pool, pg_t pg,
vector<int> *osds,
ps_t *ppps) const;
vector<int> *osds, int *primary) const;
/// apply pg_upmap[_items] mappings
- void _apply_remap(const pg_pool_t& pi, pg_t pg, vector<int> *raw) const;
+ void _apply_upmap(const pg_pool_t& pi, pg_t pg, vector<int> *raw) const;
/// pg -> (up osd list)
void _raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw,
* by anybody for data mapping purposes.
* raw and primary must be non-NULL
*/
- int pg_to_raw_osds(pg_t pg, vector<int> *raw, int *primary) const;
+ void pg_to_raw_osds(pg_t pg, vector<int> *raw, int *primary) const;
/// map a pg to its acting set. @return acting set size
- int pg_to_acting_osds(const pg_t& pg, vector<int> *acting,
+ void pg_to_acting_osds(const pg_t& pg, vector<int> *acting,
int *acting_primary) const {
_pg_to_up_acting_osds(pg, NULL, NULL, acting, acting_primary);
- return acting->size();
}
- int pg_to_acting_osds(pg_t pg, vector<int>& acting) const {
+ void pg_to_acting_osds(pg_t pg, vector<int>& acting) const {
return pg_to_acting_osds(pg, &acting, NULL);
}
/**
/* rank is -1 (stray), 0 (primary), 1,2,3,... (replica) */
int get_pg_acting_rank(pg_t pg, int osd) const {
vector<int> group;
- int nrep = pg_to_acting_osds(pg, group);
- return calc_pg_rank(osd, group, nrep);
+ pg_to_acting_osds(pg, group);
+ return calc_pg_rank(osd, group, group.size());
}
/* role is -1 (stray), 0 (primary), 1 (replica) */
int get_pg_acting_role(const pg_t& pg, int osd) const {
vector<int> group;
- int nrep = pg_to_acting_osds(pg, group);
- return calc_pg_role(osd, group, nrep);
+ pg_to_acting_osds(pg, group);
+ return calc_pg_role(osd, group, group.size());
}
bool osd_is_valid_op_target(pg_t pg, int osd) const {
int primary;
vector<int> group;
- int nrep = pg_to_acting_osds(pg, &group, &primary);
+ pg_to_acting_osds(pg, &group, &primary);
if (osd == primary)
return true;
if (pg_is_ec(pg))
return false;
- return calc_pg_role(osd, group, nrep) >= 0;
+ return calc_pg_role(osd, group, group.size()) >= 0;
}
int clean_pg_upmaps(
Incremental *pending_inc
);
+ int get_osds_by_bucket_name(const string &name, set<int> *osds) const;
+
/*
* handy helpers to build simple maps...
*/
* @param num_osd [in] number of OSDs if >= 0 or read from conf if < 0
* @return **0** on success, negative errno on error.
*/
+private:
+ int build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
+ int num_osd, int pg_bits, int pgp_bits,
+ bool default_pool);
+public:
int build_simple(CephContext *cct, epoch_t e, uuid_d &fsid,
- int num_osd, int pg_bits, int pgp_bits);
+ int num_osd) {
+ return build_simple_optioned(cct, e, fsid, num_osd, 0, 0, false);
+ }
+ int build_simple_with_pool(CephContext *cct, epoch_t e, uuid_d &fsid,
+ int num_osd, int pg_bits, int pgp_bits) {
+ return build_simple_optioned(cct, e, fsid, num_osd,
+ pg_bits, pgp_bits, true);
+ }
static int _build_crush_types(CrushWrapper& crush);
static int build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
int num_osd, ostream *ss);
static int build_simple_crush_map_from_conf(CephContext *cct,
CrushWrapper& crush,
ostream *ss);
- static int build_simple_crush_rulesets(CephContext *cct, CrushWrapper& crush,
- const string& root,
- ostream *ss);
+ static int build_simple_crush_rules(
+ CephContext *cct, CrushWrapper& crush,
+ const string& root,
+ ostream *ss);
bool crush_ruleset_in_use(int ruleset) const;
public:
void print(ostream& out) const;
void print_pools(ostream& out) const;
- void print_summary(Formatter *f, ostream& out) const;
+ void print_summary(Formatter *f, ostream& out, const string& prefix) const;
void print_oneline_summary(ostream& out) const;
- void print_tree(Formatter *f, ostream *out) const;
+
+ enum {
+ DUMP_IN = 1, // only 'in' osds
+ DUMP_OUT = 2, // only 'out' osds
+ DUMP_UP = 4, // only 'up' osds
+ DUMP_DOWN = 8, // only 'down' osds
+ };
+ void print_tree(Formatter *f, ostream *out, unsigned dump_flags=0) const;
int summarize_mapping_stats(
OSDMap *newmap,
void dump(Formatter *f) const;
static void generate_test_instances(list<OSDMap*>& o);
bool check_new_blacklist_entries() const { return new_blacklist_entries; }
+
+ void check_health(health_check_map_t *checks) const;
};
WRITE_CLASS_ENCODER_FEATURES(OSDMap)
WRITE_CLASS_ENCODER_FEATURES(OSDMap::Incremental)
return out;
}
+class PGStatService;
+
+void print_osd_utilization(const OSDMap& osdmap,
+ const PGStatService *pgstat,
+ ostream& out,
+ Formatter *f,
+ bool tree);
#endif