}
// balance?
- if (last_heartbeat == utime_t())
- last_heartbeat = now;
if (mds->get_nodeid() == 0 &&
g_conf->mds_bal_interval > 0 &&
(num_bal_times ||
dout(20) << "get_load no root, no load" << dendl;
}
- load.req_rate = mds->get_req_rate();
+ uint64_t num_requests = mds->get_num_requests();
+ bool new_req_rate = false;
+ if (last_get_load != utime_t() &&
+ now > last_get_load &&
+ num_requests >= last_num_requests) {
+ utime_t el = now;
+ el -= last_get_load;
+ if (el.sec() >= 1) {
+ load.req_rate = (num_requests - last_num_requests) / (double)el;
+ new_req_rate = true;
+ }
+ }
+ if (!new_req_rate) {
+ auto p = mds_load.find(mds->get_nodeid());
+ if (p != mds_load.end())
+ load.req_rate = p->second.req_rate;
+ }
+ last_get_load = now;
+ last_num_requests = num_requests;
+
load.queue_len = messenger->get_dispatch_queue_len();
ifstream cpu(PROCPREFIX "/proc/loadavg");
if (mds->get_nodeid() == 0) {
beat_epoch++;
-
mds_load.clear();
}
// my load
mds_load_t load = get_load(now);
- map<mds_rank_t, mds_load_t>::value_type val(mds->get_nodeid(), load);
- mds_load.insert(val);
+ mds->logger->set(l_mds_load_cent, 100 * load.mds_load());
+ mds->logger->set(l_mds_dispatch_queue_len, load.queue_len);
+
+ mds_load[mds->get_nodeid()] = load;
// import_map -- how much do i import from whom
map<mds_rank_t, float> import_map;
/* This function DOES put the passed message before returning */
void MDBalancer::handle_heartbeat(MHeartbeat *m)
{
- typedef map<mds_rank_t, mds_load_t> mds_load_map_t;
-
mds_rank_t who = mds_rank_t(m->get_source().num());
dout(25) << "=== got heartbeat " << m->get_beat() << " from " << m->get_source().num() << " " << m->get_load() << dendl;
if (who == 0) {
dout(20) << " from mds0, new epoch " << m->get_beat() << dendl;
if (beat_epoch != m->get_beat()) {
+ beat_epoch = m->get_beat();
mds_load.clear();
}
- beat_epoch = m->get_beat();
+
send_heartbeat();
mds->mdcache->show_subtrees();
- }
-
- {
- // set mds_load[who]
- mds_load_map_t::value_type val(who, m->get_load());
- pair < mds_load_map_t::iterator, bool > rval (mds_load.insert(val));
- if (!rval.second) {
- rval.first->second = val.second;
+ } else if (mds->get_nodeid() == 0) {
+ if (beat_epoch != m->get_beat()) {
+ dout(10) << " old heartbeat epoch, ignoring" << dendl;
+ goto out;
}
}
- mds_import_map[ who ] = m->get_import_map();
+
+ mds_load[who] = m->get_load();
+ mds_import_map[who] = m->get_import_map();
{
unsigned cluster_size = mds->get_mds_map()->get_num_in_mds();
m->put();
}
-
-void MDBalancer::export_empties()
-{
- dout(5) << "export_empties checking for empty imports" << dendl;
-
- std::set<CDir *> subtrees;
- mds->mdcache->get_fullauth_subtrees(subtrees);
- for (auto &dir : subtrees) {
- if (dir->is_freezing() || dir->is_frozen())
- continue;
-
- if (!dir->inode->is_base() &&
- !dir->inode->is_stray() &&
- dir->get_num_head_items() == 0)
- mds->mdcache->migrator->export_empty_import(dir);
- }
-}
-
-
-
double MDBalancer::try_match(balance_state_t& state, mds_rank_t ex, double& maxex,
mds_rank_t im, double& maxim)
{
<< dendl;
}
+ mds_meta_load.clear();
+
double total_load = 0.0;
multimap<double,mds_rank_t> load_map;
for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
- map<mds_rank_t, mds_load_t>::value_type val(i, mds_load_t(ceph_clock_now()));
- std::pair < map<mds_rank_t, mds_load_t>::iterator, bool > r(mds_load.insert(val));
- mds_load_t &load(r.first->second);
+ mds_load_t& load = mds_load.at(i);
double l = load.mds_load() * load_fac;
mds_meta_load[i] = l;
<< dendl;
// under or over?
- if (my_load < target_load * (1.0 + g_conf->mds_bal_min_rebalance)) {
+ for (auto p : load_map) {
+ if (p.first < target_load * (1.0 + g_conf->mds_bal_min_rebalance)) {
+ dout(5) << " mds." << p.second << " is underloaded or barely overloaded." << dendl;
+ mds_last_epoch_under_map[p.second] = beat_epoch;
+ }
+ }
+
+ int last_epoch_under = mds_last_epoch_under_map[whoami];
+ if (last_epoch_under == beat_epoch) {
dout(5) << " i am underloaded or barely overloaded, doing nothing." << dendl;
- last_epoch_under = beat_epoch;
- mds->mdcache->show_subtrees();
return;
}
-
// am i over long enough?
if (last_epoch_under && beat_epoch - last_epoch_under < 2) {
dout(5) << " i am overloaded, but only for " << (beat_epoch - last_epoch_under) << " epochs" << dendl;
importers.insert(pair<double,mds_rank_t>(it->first,it->second));
importer_set.insert(it->second);
} else {
- dout(15) << " mds." << it->second << " is exporter" << dendl;
- exporters.insert(pair<double,mds_rank_t>(it->first,it->second));
- exporter_set.insert(it->second);
+ int mds_last_epoch_under = mds_last_epoch_under_map[it->second];
+ if (!(mds_last_epoch_under && beat_epoch - mds_last_epoch_under < 2)) {
+ dout(15) << " mds." << it->second << " is exporter" << dendl;
+ exporters.insert(pair<double,mds_rank_t>(it->first,it->second));
+ exporter_set.insert(it->second);
+ }
}
}
/* fill in the metrics for each mds by grabbing load struct */
vector < map<string, double> > metrics (cluster_size);
- for (mds_rank_t i=mds_rank_t(0);
- i < mds_rank_t(cluster_size);
- i++) {
- map<mds_rank_t, mds_load_t>::value_type val(i, mds_load_t(ceph_clock_now()));
- std::pair < map<mds_rank_t, mds_load_t>::iterator, bool > r(mds_load.insert(val));
- mds_load_t &load(r.first->second);
+ for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
+ mds_load_t& load = mds_load.at(i);
metrics[i] = {{"auth.meta_load", load.auth.meta_load()},
{"all.meta_load", load.all.meta_load()},
}
// make a sorted list of my imports
- map<double,CDir*> import_pop_map;
- multimap<mds_rank_t,CDir*> import_from_map;
+ multimap<double, CDir*> import_pop_map;
+ multimap<mds_rank_t, pair<CDir*, double> > import_from_map;
set<CDir*> fullauthsubs;
mds->mdcache->get_fullauth_subtrees(fullauthsubs);
- for (set<CDir*>::iterator it = fullauthsubs.begin();
- it != fullauthsubs.end();
- ++it) {
- CDir *im = *it;
- if (im->get_inode()->is_stray()) continue;
+ for (auto dir : fullauthsubs) {
+ CInode *diri = dir->get_inode();
+ if (diri->is_mdsdir())
+ continue;
+ if (diri->get_export_pin(false) != MDS_RANK_NONE)
+ continue;
+ if (dir->is_freezing() || dir->is_frozen())
+ continue; // export pbly already in progress
- double pop = im->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate);
+ mds_rank_t from = diri->authority().first;
+ double pop = dir->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate);
if (g_conf->mds_bal_idle_threshold > 0 &&
pop < g_conf->mds_bal_idle_threshold &&
- im->inode != mds->mdcache->get_root() &&
- im->inode->authority().first != mds->get_nodeid()) {
- dout(5) << " exporting idle (" << pop << ") import " << *im
- << " back to mds." << im->inode->authority().first
- << dendl;
- mds->mdcache->migrator->export_dir_nicely(im, im->inode->authority().first);
+ diri != mds->mdcache->get_root() &&
+ from != mds->get_nodeid()) {
+ dout(5) << " exporting idle (" << pop << ") import " << *dir
+ << " back to mds." << from << dendl;
+ mds->mdcache->migrator->export_dir_nicely(dir, from);
continue;
}
- import_pop_map[ pop ] = im;
- mds_rank_t from = im->inode->authority().first;
- dout(15) << " map: i imported " << *im << " from " << from << dendl;
- import_from_map.insert(pair<mds_rank_t,CDir*>(from, im));
+ dout(15) << " map: i imported " << *dir << " from " << from << dendl;
+ import_pop_map.insert(make_pair(pop, dir));
+ import_from_map.insert(make_pair(from, make_pair(dir, pop)));
}
-
-
// do my exports!
- set<CDir*> already_exporting;
+ map<mds_rank_t, double> export_pop_map;
for (auto &it : state.targets) {
mds_rank_t target = it.first;
double amount = it.second;
- if (amount < MIN_OFFLOAD) continue;
- if (amount / target_load < .2) continue;
+ if (amount / target_load < .2)
+ continue;
+ if (amount < MIN_OFFLOAD)
+ continue;
dout(5) << "want to send " << amount << " to mds." << target
//<< " .. " << (*it).second << " * " << load_fac
<< " -> " << amount
<< dendl;//" .. fudge is " << fudge << dendl;
- double have = 0.0;
+ double& have = export_pop_map[target];
mds->mdcache->show_subtrees();
// search imports from target
if (import_from_map.count(target)) {
dout(5) << " aha, looking through imports from target mds." << target << dendl;
- pair<multimap<mds_rank_t,CDir*>::iterator, multimap<mds_rank_t,CDir*>::iterator> p =
- import_from_map.equal_range(target);
- while (p.first != p.second) {
- CDir *dir = (*p.first).second;
+ for (auto p = import_from_map.equal_range(target);
+ p.first != p.second; ) {
+ CDir *dir = p.first->second.first;
+ double pop = p.first->second.second;
dout(5) << "considering " << *dir << " from " << (*p.first).first << dendl;
- multimap<mds_rank_t,CDir*>::iterator plast = p.first++;
+ auto plast = p.first++;
- if (dir->inode->is_base() ||
- dir->inode->is_stray())
+ if (dir->inode->is_base())
continue;
- if (dir->is_freezing() || dir->is_frozen()) continue; // export pbly already in progress
- double pop = dir->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate);
assert(dir->inode->authority().first == target); // cuz that's how i put it in the map, dummy
if (pop <= amount-have) {
- dout(5) << "reexporting " << *dir
- << " pop " << pop
+ dout(5) << "reexporting " << *dir << " pop " << pop
<< " back to mds." << target << dendl;
mds->mdcache->migrator->export_dir_nicely(dir, target);
have += pop;
import_from_map.erase(plast);
- import_pop_map.erase(pop);
+ for (auto q = import_pop_map.equal_range(pop);
+ q.first != q.second; ) {
+ if (q.first->second == dir) {
+ import_pop_map.erase(q.first);
+ break;
+ }
+ q.first++;
+ }
} else {
dout(5) << "can't reexport " << *dir << ", too big " << pop << dendl;
}
- if (amount-have < MIN_OFFLOAD) break;
+ if (amount-have < MIN_OFFLOAD)
+ break;
}
}
- if (amount-have < MIN_OFFLOAD) {
+ }
+
+ // any other imports
+ for (auto &it : state.targets) {
+ mds_rank_t target = it.first;
+ double amount = it.second;
+
+ if (!export_pop_map.count(target))
+ continue;
+ double& have = export_pop_map[target];
+ if (amount-have < MIN_OFFLOAD)
continue;
- }
- // any other imports
- if (false)
- for (map<double,CDir*>::iterator import = import_pop_map.begin();
- import != import_pop_map.end();
- import++) {
- CDir *imp = (*import).second;
- if (imp->inode->is_base() ||
- imp->inode->is_stray())
- continue;
+ for (auto p = import_pop_map.begin();
+ p != import_pop_map.end(); ) {
+ CDir *dir = p->second;
+ if (dir->inode->is_base()) {
+ ++p;
+ continue;
+ }
- double pop = (*import).first;
- if (pop < amount-have || pop < MIN_REEXPORT) {
- dout(5) << "reexporting " << *imp
- << " pop " << pop
- << " back to mds." << imp->inode->authority()
- << dendl;
- have += pop;
- mds->mdcache->migrator->export_dir_nicely(imp, imp->inode->authority().first);
- }
- if (amount-have < MIN_OFFLOAD) break;
+ double pop = p->first;
+ if (pop <= amount-have && pop > MIN_REEXPORT) {
+ dout(0) << "reexporting " << *dir << " pop " << pop
+ << " to mds." << target << dendl;
+ have += pop;
+ mds->mdcache->migrator->export_dir_nicely(dir, target);
+ import_pop_map.erase(p++);
+ } else {
+ ++p;
}
- if (amount-have < MIN_OFFLOAD) {
- //fudge = amount-have;
- continue;
+ if (amount-have < MIN_OFFLOAD)
+ break;
}
+ }
- // okay, search for fragments of my workload
- set<CDir*> candidates;
- mds->mdcache->get_fullauth_subtrees(candidates);
+ set<CDir*> already_exporting;
+
+ for (auto &it : state.targets) {
+ mds_rank_t target = it.first;
+ double amount = it.second;
+ if (!export_pop_map.count(target))
+ continue;
+ double& have = export_pop_map[target];
+ if (amount-have < MIN_OFFLOAD)
+ continue;
+
+ // okay, search for fragments of my workload
list<CDir*> exports;
- for (set<CDir*>::iterator pot = candidates.begin();
- pot != candidates.end();
- ++pot) {
- if ((*pot)->get_inode()->is_stray()) continue;
- find_exports(*pot, amount, exports, have, already_exporting);
- if (have > amount-MIN_OFFLOAD)
+ for (auto p = import_pop_map.rbegin();
+ p != import_pop_map.rend();
+ ++p) {
+ CDir *dir = p->second;
+ find_exports(dir, amount, exports, have, already_exporting);
+ if (amount-have < MIN_OFFLOAD)
break;
}
//fudge = amount - have;
- for (list<CDir*>::iterator it = exports.begin(); it != exports.end(); ++it) {
- dout(5) << " - exporting "
- << (*it)->pop_auth_subtree
- << " "
- << (*it)->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate)
- << " to mds." << target
- << " " << **it
- << dendl;
- mds->mdcache->migrator->export_dir_nicely(*it, target);
+ for (auto dir : exports) {
+ dout(5) << " - exporting " << dir->pop_auth_subtree
+ << " " << dir->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate)
+ << " to mds." << target << " " << *dir << dendl;
+ mds->mdcache->migrator->export_dir_nicely(dir, target);
}
}
double& have,
set<CDir*>& already_exporting)
{
+ utime_t now = ceph_clock_now();
+ if ((double)(now - rebalance_time) > 0.1) {
+ derr << " balancer runs too long" << dendl_impl;
+ have = amount;
+ return;
+ }
+
+ assert(dir->is_auth());
+
double need = amount - have;
if (need < amount * g_conf->mds_bal_min_start)
return; // good enough!
+
double needmax = need * g_conf->mds_bal_need_max;
double needmin = need * g_conf->mds_bal_need_min;
double midchunk = need * g_conf->mds_bal_midchunk;
dout(7) << " find_exports in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << dendl;
double subdir_sum = 0;
- for (auto it = dir->begin(); it != dir->end(); ++it) {
- CInode *in = it->second->get_linkage()->get_inode();
- if (!in) continue;
- if (!in->is_dir()) continue;
+ for (elist<CInode*>::iterator it = dir->pop_lru_subdirs.begin_use_current();
+ !it.end(); ) {
+ CInode *in = *it;
+ ++it;
+
+ assert(in->is_dir());
+ assert(in->get_parent_dir() == dir);
list<CDir*> dfls;
- in->get_dirfrags(dfls);
+ in->get_nested_dirfrags(dfls);
+
+ size_t num_idle_frags = 0;
for (list<CDir*>::iterator p = dfls.begin();
p != dfls.end();
++p) {
CDir *subdir = *p;
- if (!subdir->is_auth()) continue;
- if (already_exporting.count(subdir)) continue;
+ if (already_exporting.count(subdir))
+ continue;
- if (subdir->is_frozen()) continue; // can't export this right now!
+ // we know all ancestor dirfrags up to subtree root are not freezing or frozen.
+ // It's more efficient to use CDir::is_{freezing,frozen}_tree_root()
+ if (subdir->is_frozen_dir() || subdir->is_frozen_tree_root() ||
+ subdir->is_freezing_dir() || subdir->is_freezing_tree_root())
+ continue; // can't export this right now!
// how popular?
double pop = subdir->pop_auth_subtree.meta_load(rebalance_time, mds->mdcache->decayrate);
subdir_sum += pop;
dout(15) << " subdir pop " << pop << " " << *subdir << dendl;
- if (pop < minchunk) continue;
+ if (pop < minchunk) {
+ num_idle_frags++;
+ continue;
+ }
// lucky find?
if (pop > needmin && pop < needmax) {
} else
smaller.insert(pair<double,CDir*>(pop, subdir));
}
+ if (dfls.size() == num_idle_frags)
+ in->item_pop_lru.remove_myself();
}
dout(15) << " sum " << subdir_sum << " / " << dir_pop << dendl;
if (have > needmin)
return;
}
-
}
-void MDBalancer::hit_inode(utime_t now, CInode *in, int type, int who)
+void MDBalancer::hit_inode(const utime_t& now, CInode *in, int type, int who)
{
// hit inode
in->pop.get(type).hit(now, mds->mdcache->decayrate);
}
}
-void MDBalancer::hit_dir(utime_t now, CDir *dir, int type, int who, double amount)
+void MDBalancer::hit_dir(const utime_t& now, CDir *dir, int type, int who, double amount)
{
// hit me
double v = dir->pop_me.get(type).hit(now, mds->mdcache->decayrate, amount);
bool hit_subtree_nested = dir->is_auth(); // all nested auth subtrees
while (true) {
+ CDir *pdir = dir->inode->get_parent_dir();
dir->pop_nested.get(type).hit(now, mds->mdcache->decayrate, amount);
if (rd_adj != 0.0)
dir->pop_nested.get(META_POP_IRD).adjust(now, mds->mdcache->decayrate, rd_adj);
if (hit_subtree) {
dir->pop_auth_subtree.get(type).hit(now, mds->mdcache->decayrate, amount);
+
if (rd_adj != 0.0)
dir->pop_auth_subtree.get(META_POP_IRD).adjust(now, mds->mdcache->decayrate, rd_adj);
+
+ if (dir->is_subtree_root())
+ hit_subtree = false; // end of auth domain, stop hitting auth counters.
+ else if (pdir)
+ pdir->pop_lru_subdirs.push_front(&dir->get_inode()->item_pop_lru);
}
if (hit_subtree_nested) {
if (rd_adj != 0.0)
dir->pop_auth_subtree_nested.get(META_POP_IRD).adjust(now, mds->mdcache->decayrate, rd_adj);
}
-
- if (dir->is_subtree_root())
- hit_subtree = false; // end of auth domain, stop hitting auth counters.
-
- if (dir->inode->get_parent_dn() == 0) break;
- dir = dir->inode->get_parent_dn()->get_dir();
+ if (!pdir) break;
+ dir = pdir;
}
}
}
}
+/*
+ * Fold dir's popularity counters into (inc == true) or back out of
+ * (inc == false) pdir and every ancestor dirfrag reached by following
+ * inode->get_parent_dir() upwards.  Presumably called around a rename that
+ * moves dir under / away from pdir -- confirm against the callers.
+ *
+ *  - pop_nested is adjusted on every ancestor.
+ *  - pop_auth_subtree_nested is adjusted on every ancestor when dir is auth.
+ *  - pop_auth_subtree is adjusted only while still inside dir's auth subtree:
+ *    dir must be auth and not itself a subtree root, and the adjustment stops
+ *    for ancestors above the first subtree root encountered.  When adding,
+ *    the child directory's inode is also moved to the front of that
+ *    ancestor's pop_lru_subdirs list.
+ *
+ * pdir is dereferenced unconditionally on the first pass, so it must be
+ * non-null.
+ */
+void MDBalancer::adjust_pop_for_rename(CDir *pdir, CDir *dir, utime_t now, bool inc)
+{
+  DecayRate& decay = mds->mdcache->decayrate;
+
+  const bool touch_nested_auth = dir->is_auth();
+  bool touch_auth_subtree = touch_nested_auth && !dir->is_subtree_root();
+
+  // 'child' is always the dirfrag directly below the ancestor being adjusted.
+  CDir *child = dir;
+  do {
+    if (!inc) {
+      pdir->pop_nested.sub(now, decay, dir->pop_nested);
+      if (touch_auth_subtree)
+        pdir->pop_auth_subtree.sub(now, decay, dir->pop_auth_subtree);
+      if (touch_nested_auth)
+        pdir->pop_auth_subtree_nested.sub(now, decay, dir->pop_auth_subtree_nested);
+    } else {
+      pdir->pop_nested.add(now, decay, dir->pop_nested);
+      if (touch_auth_subtree) {
+        pdir->pop_auth_subtree.add(now, decay, dir->pop_auth_subtree);
+        pdir->pop_lru_subdirs.push_front(&child->get_inode()->item_pop_lru);
+      }
+      if (touch_nested_auth)
+        pdir->pop_auth_subtree_nested.add(now, decay, dir->pop_auth_subtree_nested);
+    }
+
+    // Ancestors above a subtree root are outside this auth subtree.
+    if (pdir->is_subtree_root())
+      touch_auth_subtree = false;
+
+    child = pdir;
+    pdir = pdir->inode->get_parent_dir();
+  } while (pdir);
+}
+
void MDBalancer::handle_mds_failure(mds_rank_t who)
{
if (0 == who) {
- last_epoch_under = 0;
+ mds_last_epoch_under_map.clear(); // forget every rank's recorded "last underloaded" epoch; NOTE(review): presumably because mds.0 is the rank that advances beat_epoch -- confirm
}
}
+
+/*
+ * Dump the balancer's current view of load into f as a single "loads" object:
+ *
+ *   "dirfrags"       array with one "dir" object per visited dirfrag
+ *   "mds_load"       one object per entry of mds_load, keyed "mds.<rank>"
+ *   "mds_meta_load"  one float per entry of mds_meta_load, keyed "mds.<rank>"
+ *   "mds_import_map" per rank, an array of {"mds.<rank>": amount} objects
+ *
+ * f must be non-null: it is used unconditionally from the first section on.
+ * Always returns 0.
+ */
+int MDBalancer::dump_loads(Formatter *f)
+{
+  utime_t now = ceph_clock_now();
+  DecayRate& decayrate = mds->mdcache->decayrate;
+
+  // Seed the walk with the root's dirfrags.  Without a root the "dirfrags"
+  // array is simply emitted empty.
+  list<CDir*> dfs;
+  if (mds->mdcache->get_root()) {
+    mds->mdcache->get_root()->get_dirfrags(dfs);
+  } else {
+    dout(5) << "dump_load no root" << dendl;
+  }
+
+  f->open_object_section("loads");
+
+  // Breadth-first walk (FIFO queue) over the directory tree.
+  f->open_array_section("dirfrags");
+  while (!dfs.empty()) {
+    CDir *dir = dfs.front();
+    dfs.pop_front();
+
+    f->open_object_section("dir");
+    dir->dump_load(f, now, decayrate);
+    f->close_section();
+
+    for (auto it = dir->begin(); it != dir->end(); ++it) {
+      CInode *in = it->second->get_linkage()->get_inode();
+      if (!in || !in->is_dir())
+        continue;
+
+      list<CDir*> ls;
+      in->get_dirfrags(ls);
+      for (auto subdir : ls) {
+        // Dirfrags with negligible nested load are neither dumped nor
+        // descended into.
+        if (subdir->pop_nested.meta_load() < .001)
+          continue;
+        dfs.push_back(subdir);
+      }
+    }
+  }
+  f->close_section(); // dirfrags array
+
+  f->open_object_section("mds_load");
+  {
+
+    auto dump_mds_load = [f, now](mds_load_t& load) {
+      f->dump_float("request_rate", load.req_rate);
+      f->dump_float("cache_hit_rate", load.cache_hit_rate);
+      f->dump_float("queue_length", load.queue_len);
+      f->dump_float("cpu_load", load.cpu_load_avg);
+      f->dump_float("mds_load", load.mds_load());
+
+      DecayRate rate; // no decay
+      f->open_object_section("auth_dirfrags");
+      load.auth.dump(f, now, rate);
+      f->close_section();
+      f->open_object_section("all_dirfrags");
+      load.all.dump(f, now, rate);
+      f->close_section();
+    };
+
+    // NOTE(review): iterating by value is kept on purpose.  The lambda takes
+    // a non-const reference, so the per-entry copy guarantees that dumping
+    // cannot modify the stored loads -- confirm mds_load()/dump() are
+    // side-effect free before switching this to a reference.
+    for (auto p : mds_load) {
+      stringstream name;
+      name << "mds." << p.first;
+      f->open_object_section(name.str().c_str());
+      dump_mds_load(p.second);
+      f->close_section();
+    }
+  }
+  f->close_section(); // mds_load
+
+  f->open_object_section("mds_meta_load");
+  for (const auto& p : mds_meta_load) {
+    stringstream name;
+    name << "mds." << p.first;
+    f->dump_float(name.str().c_str(), p.second);
+  }
+  f->close_section(); // mds_meta_load
+
+  f->open_object_section("mds_import_map");
+  // Bind by const reference: each entry holds a whole inner map, which a
+  // by-value loop variable would copy on every iteration.
+  for (const auto& p : mds_import_map) {
+    stringstream name1;
+    name1 << "mds." << p.first;
+    f->open_array_section(name1.str().c_str());
+    for (const auto& q : p.second) {
+      f->open_object_section("from");
+      stringstream name2;
+      name2 << "mds." << q.first;
+      f->dump_float(name2.str().c_str(), q.second);
+      f->close_section();
+    }
+    f->close_section(); // mds.? array
+  }
+  f->close_section(); // mds_import_map
+
+  f->close_section(); // loads
+  return 0;
+}