assert(!root->is_auth());
CDir *rootdir = root->get_dirfrag(frag_t());
if (!rootdir) {
- discover_dir_frag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
+ open_remote_dirfrag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
return;
}
}
* merge with parent and/or child subtrees, if it is appropriate.
* merge can ONLY happen if both parent and child have unambiguous auth.
*/
-void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool do_eval)
+void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth)
{
dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
<< " on " << *dir << dendl;
- if (mds->is_any_replay() || mds->is_resolve())
- do_eval = false;
-
show_subtrees();
CDir *root;
p = p->inode->get_parent_dir();
}
}
-
- if (do_eval)
- eval_subtree_root(dir->get_inode());
}
show_subtrees();
assert(subtrees.count(dir));
set<CDir*> oldbounds = subtrees[dir];
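+ // collect subtree roots whose locks should be re-evaluated once all merges are done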
+ set<CInode*> to_eval;
// try merge at my root
- try_subtree_merge_at(dir);
+ try_subtree_merge_at(dir, &to_eval);
// try merge at my old bounds
- for (set<CDir*>::iterator p = oldbounds.begin();
- p != oldbounds.end();
- ++p)
- try_subtree_merge_at(*p);
+ for (auto bound : oldbounds)
+ try_subtree_merge_at(bound, &to_eval);
+
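+ // re-evaluate the collected subtree roots, unless this rank is still replaying or resolving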
+ if (!(mds->is_any_replay() || mds->is_resolve())) {
+ for (auto in : to_eval)
+ eval_subtree_root(in);
+ }
}
class C_MDC_SubtreeMergeWB : public MDCacheLogContext {
}
};
-void MDCache::try_subtree_merge_at(CDir *dir, bool do_eval)
+void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval)
{
dout(10) << "try_subtree_merge_at " << *dir << dendl;
assert(subtrees.count(dir));
- if (mds->is_any_replay() || mds->is_resolve())
- do_eval = false;
-
// merge with parent?
CDir *parent = dir;
if (!dir->inode->is_base())
}
}
- if (do_eval)
- eval_subtree_root(dir->get_inode());
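+ // instead of evaluating here, hand the subtree root back to the caller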
+ if (to_eval && dir->get_inode()->is_auth())
+ to_eval->insert(dir->get_inode());
}
show_subtrees(15);
{
// evaluate subtree inode filelock?
// (we should scatter the filelock on subtree bounds)
- if (diri->is_auth())
- mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
+ assert(diri->is_auth());
+ mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
}
root = dir;
}
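+ // subtree roots to re-evaluate once all bounds are adjusted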
+ set<CInode*> to_eval;
+
// verify/adjust bounds.
// - these may be new, or
// - beneath existing ambiguous bounds (which will be collapsed),
t = get_subtree_root(t->get_parent_dir());
dout(10) << " swallowing intervening subtree at " << *t << dendl;
adjust_subtree_auth(t, auth);
- try_subtree_merge_at(t);
+ try_subtree_merge_at(t, &to_eval);
t = get_subtree_root(bound->get_parent_dir());
if (t == dir) break;
}
CDir *stray = *p;
dout(10) << " swallowing extra subtree at " << *stray << dendl;
adjust_subtree_auth(stray, auth);
- try_subtree_merge_at(stray);
+ try_subtree_merge_at(stray, &to_eval);
}
}
// swallowing subtree may add new subtree bounds
verify_subtree_bounds(dir, bounds);
show_subtrees();
+
+ if (!(mds->is_any_replay() || mds->is_resolve())) {
+ for (auto in : to_eval)
+ eval_subtree_root(in);
+ }
}
projected_subtree_renames[diri].push_back(pair<CDir*,CDir*>(olddir, newdir));
}
-void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir,
- bool pop, bool imported)
+void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
{
dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;
subtrees[oldparent].erase(dir);
assert(subtrees.count(newparent));
subtrees[newparent].insert(dir);
- try_subtree_merge_at(dir, !imported);
+ // caller is responsible for 'eval diri'
+ try_subtree_merge_at(dir, NULL);
} else {
// mid-subtree.
// did auth change?
if (oldparent->authority() != newparent->authority()) {
- adjust_subtree_auth(dir, oldparent->authority(), !imported); // caller is responsible for *diri.
- try_subtree_merge_at(dir, !imported);
+ adjust_subtree_auth(dir, oldparent->authority());
+ // caller is responsible for 'eval diri'
+ try_subtree_merge_at(dir, NULL);
}
}
}
// tell the migrator too.
migrator->handle_mds_failure_or_stop(who);
+ // tell the balancer too.
+ mds->balancer->handle_mds_failure(who);
+
// clean up any requests slave to/from this node
list<MDRequestRef> finish;
for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
in = dir->get_inode();
if (in->is_replica(p->first))
break;
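+ // this ancestor inode is now replicated on that node too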
+ in->add_replica(p->first);
if (in->is_base())
break;
}
rejoin_imported_caps.clear();
}
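+// Context that retries cap evaluation/issue for an inode once it is
+// unfrozen; the PIN_PTRWAITER ref keeps the inode alive while we wait.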
+class C_MDC_ReIssueCaps : public MDCacheContext {
+ CInode *in;
+public:
+ C_MDC_ReIssueCaps(MDCache *mdc, CInode *i) :
+ MDCacheContext(mdc), in(i)
+ {
+ in->get(CInode::PIN_PTRWAITER);
+ }
+ void finish(int r) override {
+ if (!mdcache->mds->locker->eval(in, CEPH_CAP_LOCKS))
+ mdcache->mds->locker->issue_caps(in);
+ in->put(CInode::PIN_PTRWAITER);
+ }
+};
void MDCache::reissue_all_caps()
{
++p) {
CInode *in = p->second;
if (in->is_head() && in->is_any_caps()) {
+ // called by MDSRank::active_start(). There shouldn't be any frozen subtree, but an individual inode may still be frozen.
+ if (in->is_frozen_inode()) {
+ in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in));
+ continue;
+ }
if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
mds->locker->issue_caps(in);
}
<< ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
<< dendl;
+ mds->update_mlogger();
mds->mlogger->set(l_mdm_rss, last.get_rss());
mds->mlogger->set(l_mdm_heap, last.get_heap());
assert(!migrator->is_exporting());
assert(!migrator->is_importing());
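+ // can't finish shutdown while our base objects are still auth-pinned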
+ if ((myin && myin->is_auth_pinned()) ||
+ (mydir && mydir->is_auth_pinned())) {
+ dout(7) << "still have auth pinned objects" << dendl;
+ return false;
+ }
// flush what we can from the log
mds->mdlog->trim(0);
void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSInternalContextBase *fin)
{
dout(10) << "open_remote_dir on " << *diri << dendl;
-
assert(diri->is_dir());
assert(!diri->is_auth());
assert(diri->get_dirfrag(approxfg) == 0);
- mds_rank_t auth = diri->authority().first;
-
- if (!mds->is_cluster_degraded() ||
- mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) {
- discover_dir_frag(diri, approxfg, fin);
- } else {
- // mds is down or recovering. forge a replica!
- forge_replica_dir(diri, approxfg, auth);
- if (fin)
- mds->queue_waiter(fin);
- }
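+ // always discover from the auth mds; forging replica dirfrags for down/recovering peers has been removed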
+ discover_dir_frag(diri, approxfg, fin);
}
{
if (err < 0 && err != -EAGAIN) {
info.checked.clear();
- info.checked.insert(mds->get_nodeid());
info.checking = MDS_RANK_NONE;
info.check_peers = true;
info.fetch_backtrace = true;
info.last_err = err;
}
- if (info.check_peers) {
+ if (info.check_peers || info.discover) {
+ if (info.discover) {
+ // got backtrace from peer, but failed to find inode. re-check peers
+ info.discover = false;
+ info.ancestors.clear();
+ info.checked.clear();
+ }
info.check_peers = false;
info.checking = MDS_RANK_NONE;
do_open_ino_peer(ino, info);
info.fetch_backtrace = false;
info.checking = mds->get_nodeid();
info.checked.clear();
- info.checked.insert(mds->get_nodeid());
C_IO_MDC_OpenInoBacktraceFetched *fin =
new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
fetch_backtrace(ino, info.pool, fin->bl,
}
}
if (peer < 0) {
- if (all.size() > active.size() && all != info.checked) {
+ all.erase(mds->get_nodeid());
+ if (all != info.checked) {
dout(10) << " waiting for more peers to be active" << dendl;
} else {
dout(10) << " all MDS peers have been checked " << dendl;
info.waiters.push_back(fin);
} else {
open_ino_info_t& info = opening_inodes[ino];
- info.checked.insert(mds->get_nodeid());
info.want_replica = want_replica;
info.want_xlocked = want_xlocked;
info.tid = ++open_ino_last_tid;
fip.tid = tid;
fip.fin = c;
fip.hint = hint;
- fip.checked.insert(mds->get_nodeid());
_do_find_ino_peer(fip);
}
}
}
if (m == MDS_RANK_NONE) {
- if (all.size() > active.size()) {
+ all.erase(mds->get_nodeid());
+ if (all != fip.checked) {
dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl;
} else {
dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl;
return;
}
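+ // account the completed internal request in the perf counters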
+ switch (mdr->internal_op) {
+ case CEPH_MDS_OP_FRAGMENTDIR:
+ logger->inc(l_mdss_ireq_fragmentdir);
+ break;
+ case CEPH_MDS_OP_EXPORTDIR:
+ logger->inc(l_mdss_ireq_exportdir);
+ break;
+ case CEPH_MDS_OP_ENQUEUE_SCRUB:
+ logger->inc(l_mdss_ireq_enqueue_scrub);
+ break;
+ case CEPH_MDS_OP_FLUSH:
+ logger->inc(l_mdss_ireq_flush);
+ break;
+ case CEPH_MDS_OP_REPAIR_FRAGSTATS:
+ logger->inc(l_mdss_ireq_fragstats);
+ break;
+ case CEPH_MDS_OP_REPAIR_INODESTATS:
+ logger->inc(l_mdss_ireq_inodestats);
+ break;
+ }
+
request_cleanup(mdr);
}
if (mds->get_state() <= MDSMap::STATE_REJOIN) {
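+ // drop the discover unless this rank at least wants to reach rejoin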
if (mds->get_state() < MDSMap::STATE_REJOIN &&
- mds->get_want_state() != CEPH_MDS_STATE_REJOIN) {
+ mds->get_want_state() < CEPH_MDS_STATE_REJOIN) {
dis->put();
return;
}
return dir;
}
-CDir *MDCache::forge_replica_dir(CInode *diri, frag_t fg, mds_rank_t from)
-{
- assert(mds->mdsmap->get_state(from) < MDSMap::STATE_REJOIN);
-
- // forge a replica.
- CDir *dir = diri->add_dirfrag( new CDir(diri, fg, this, false) );
-
- // i'm assuming this is a subtree root.
- adjust_subtree_auth(dir, from);
-
- dout(7) << "forge_replica_dir added " << *dir << " while mds." << from << " is down" << dendl;
-
- return dir;
-}
-
CDentry *MDCache::add_replica_dentry(bufferlist::iterator& p, CDir *dir, list<MDSInternalContextBase*>& finished)
{
string name;
/* This function DOES put the passed message before returning */
void MDCache::handle_dir_update(MDirUpdate *m)
{
- CDir *dir = get_dirfrag(m->get_dirfrag());
+ dirfrag_t df = m->get_dirfrag();
+ CDir *dir = get_dirfrag(df);
if (!dir) {
- dout(5) << "dir_update on " << m->get_dirfrag() << ", don't have it" << dendl;
+ dout(5) << "dir_update on " << df << ", don't have it" << dendl;
// discover it?
if (m->should_discover()) {
// only try once!
// this is key to avoid a fragtree update race, among other things.
- m->tried_discover();
+ m->inc_tried_discover();
vector<CDentry*> trace;
CInode *in;
filepath path = m->get_path();
int r = path_traverse(null_ref, m, NULL, path, &trace, &in, MDS_TRAVERSE_DISCOVER);
if (r > 0)
return;
- assert(r == 0);
- open_remote_dirfrag(in, m->get_dirfrag().frag,
- new C_MDS_RetryMessage(mds, m));
- return;
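+ // only open the dirfrag if traverse found the right inode and the frag is still missing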
+ if (r == 0 &&
+ in->ino() == df.ino &&
+ in->get_approx_dirfrag(df.frag) == NULL) {
+ open_remote_dirfrag(in, df.frag, new C_MDS_RetryMessage(mds, m));
+ return;
+ }
}
m->put();
return;
}
- // update
- dout(5) << "dir_update on " << *dir << dendl;
- dir->dir_rep = m->get_dir_rep();
- dir->dir_rep_by = m->get_dir_rep_by();
-
+ if (!m->has_tried_discover()) {
+ // Update only if the dir already existed; otherwise it was just updated by the discover reply.
+ dout(5) << "dir_update on " << *dir << dendl;
+ dir->dir_rep = m->get_dir_rep();
+ dir->dir_rep_by = m->get_dir_rep_by();
+ }
+
// done
m->put();
}
}
if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) {
+ dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));
+
mds->locker->drop_locks(mdr.get());
mdr->drop_local_auth_pins();
- dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));
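+ // remote auth pins are dropped asynchronously; poke the freeze waiter so the freeze isn't stalled on us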
+ if (!mdr->remote_auth_pins.empty())
+ mds->locker->notify_freeze_waiter(dir);
return;
}
/* Stray/purge statistics */
pcb.add_u64(l_mdc_num_strays, "num_strays",
- "Stray dentries", "stry");
+ "Stray dentries", "stry", PerfCountersBuilder::PRIO_INTERESTING);
pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed", "Stray dentries delayed");
pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing", "Stray dentries enqueuing for purge");
/* Recovery queue statistics */
pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing", "Files currently being recovered");
pcb.add_u64(l_mdc_num_recovering_enqueued, "num_recovering_enqueued",
- "Files waiting for recovery", "recy");
+ "Files waiting for recovery", "recy", PerfCountersBuilder::PRIO_INTERESTING);
pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized", "Files waiting for recovery with elevated priority");
pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started", "File recoveries started");
pcb.add_u64_counter(l_mdc_recovery_completed, "recovery_completed",
- "File recoveries completed", "recd");
+ "File recoveries completed", "recd", PerfCountersBuilder::PRIO_INTERESTING);
+
+ pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub",
+ "Internal Request type enqueue scrub");
+ pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir",
+ "Internal Request type export dir");
+ pcb.add_u64_counter(l_mdss_ireq_flush, "ireq_flush",
+ "Internal Request type flush");
+ pcb.add_u64_counter(l_mdss_ireq_fragmentdir, "ireq_fragmentdir",
+ "Internal Request type fragmentdir");
+ pcb.add_u64_counter(l_mdss_ireq_fragstats, "ireq_fragstats",
+ "Internal Request type frag stats");
+ pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats",
+ "Internal Request type inode stats");
logger.reset(pcb.create_perf_counters());
g_ceph_context->get_perfcounters_collection()->add(logger.get());
* away.
*/
void MDCache::maybe_eval_stray(CInode *in, bool delay) {
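+ // defer stray eval until rejoin completes (this also covers standby-replay)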
- if (in->inode.nlink > 0 || in->is_base() || is_readonly() || mds->is_standby_replay())
+ if (in->inode.nlink > 0 || in->is_base() || is_readonly() ||
+ mds->get_state() <= MDSMap::STATE_REJOIN)
return;
+
CDentry *dn = in->get_projected_parent_dn();
if (dn->state_test(CDentry::STATE_PURGING)) {