#undef dout_prefix
#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
-
class ServerContext : public MDSInternalContextBase {
protected:
Server *server;
{
PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);
plb.add_u64_counter(l_mdss_handle_client_request,"handle_client_request",
- "Client requests", "hcr");
+ "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
plb.add_u64_counter(l_mdss_handle_slave_request, "handle_slave_request",
- "Slave requests", "hsr");
+ "Slave requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
plb.add_u64_counter(l_mdss_handle_client_session, "handle_client_session",
- "Client session messages", "hcs");
+ "Client session messages", "hcs", PerfCountersBuilder::PRIO_INTERESTING);
plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request", "Client requests dispatched");
plb.add_u64_counter(l_mdss_dispatch_slave_request, "dispatch_server_request", "Server requests dispatched");
plb.add_u64_counter(l_mdss_req_lookuphash, "req_lookuphash",
is_full(false),
reconnect_done(NULL),
failed_reconnects(0),
+ reconnect_evicting(false),
terminating_sessions(false)
{
}
void Server::handle_client_session(MClientSession *m)
{
version_t pv;
+ bool blacklisted = false;
Session *session = get_session(m);
dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
assert(session->is_closed() ||
session->is_closing());
+ blacklisted = mds->objecter->with_osdmap(
+ [session](const OSDMap &osd_map) -> bool {
+ return osd_map.is_blacklisted(session->info.inst.addr);
+ });
+
+ if (blacklisted) {
+ dout(10) << "ignoring blacklisted client " << session->info.inst.addr << dendl;
+ m->put();
+ return;
+ }
+
session->set_client_metadata(m->client_meta);
dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN "
<< session->info.client_metadata.size() << " metadata entries:" << dendl;
finish_flush_session(session, m->get_seq());
break;
+ case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
+ mdlog->flush();
+ break;
+
default:
ceph_abort();
}
return;
}
- while (1) {
- Session *session = mds->sessionmap.get_oldest_session(Session::STATE_STALE);
- if (!session)
- break;
+ // Collect a list of sessions exceeding the autoclose threshold
+ std::vector<Session *> to_evict;
+ const auto sessions_p = mds->sessionmap.by_state.find(Session::STATE_STALE);
+ if (sessions_p == mds->sessionmap.by_state.end() || sessions_p->second->empty()) {
+ return;
+ }
+ const auto &stale_sessions = sessions_p->second;
+ assert(stale_sessions != nullptr);
+
+ for (const auto &session: *stale_sessions) {
if (session->is_importing()) {
dout(10) << "stopping at importing session " << session->info.inst << dendl;
break;
<< session->last_cap_renew << ")" << dendl;
break;
}
-
+
+ to_evict.push_back(session);
+ }
+
+ for (const auto &session: to_evict) {
utime_t age = now;
age -= session->last_cap_renew;
- mds->clog->info() << "closing stale session " << session->info.inst
- << " after " << age;
- dout(10) << "autoclosing stale session " << session->info.inst << " last " << session->last_cap_renew << dendl;
- kill_session(session, NULL);
+ mds->clog->warn() << "evicting unresponsive client " << *session
+ << ", after " << age << " seconds";
+ dout(10) << "autoclosing stale session " << session->info.inst << " last "
+ << session->last_cap_renew << dendl;
+
+ if (g_conf->mds_session_blacklist_on_timeout) {
+ std::stringstream ss;
+ mds->evict_client(session->info.inst.name.num(), false, true,
+ ss, nullptr);
+ } else {
+ kill_session(session, NULL);
+ }
}
}
*/
void Server::kill_session(Session *session, Context *on_safe)
{
+ assert(mds->mds_lock.is_locked_by_me());
+
if ((session->is_opening() ||
session->is_open() ||
session->is_stale()) &&
}
}
+size_t Server::apply_blacklist(const std::set<entity_addr_t> &blacklist)
+{
+ std::list<Session*> victims;
+ const auto sessions = mds->sessionmap.get_sessions();
+ for (const auto p : sessions) {
+ if (!p.first.is_client()) {
+ // Do not apply OSDMap blacklist to MDS daemons, we find out
+ // about their death via MDSMap.
+ continue;
+ }
+
+ Session *s = p.second;
+ if (blacklist.count(s->info.inst.addr)) {
+ victims.push_back(s);
+ }
+ }
+
+ for (const auto s : victims) {
+ kill_session(s, nullptr);
+ }
+
+ dout(10) << "apply_blacklist: killed " << victims.size() << dendl;
+
+ return victims.size();
+}
+
void Server::journal_close_session(Session *session, int state, Context *on_safe)
{
uint64_t sseq = mds->sessionmap.set_state(session, state);
void Server::reconnect_tick()
{
+ if (reconnect_evicting) {
+ dout(4) << "reconnect_tick: waiting for evictions" << dendl;
+ return;
+ }
+
utime_t reconnect_end = reconnect_start;
reconnect_end += g_conf->mds_reconnect_timeout;
if (ceph_clock_now() >= reconnect_end &&
!client_reconnect_gather.empty()) {
dout(10) << "reconnect timed out" << dendl;
+
+ // If we're doing blacklist evictions, use this to wait for them before
+ // proceeding to reconnect_gather_finish
+ MDSGatherBuilder gather(g_ceph_context);
+
for (set<client_t>::iterator p = client_reconnect_gather.begin();
p != client_reconnect_gather.end();
++p) {
Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
assert(session);
dout(1) << "reconnect gave up on " << session->info.inst << dendl;
- kill_session(session, NULL);
+
+ mds->clog->warn() << "evicting unresponsive client " << *session
+ << ", after waiting " << g_conf->mds_reconnect_timeout
+ << " seconds during MDS startup";
+
+ if (g_conf->mds_session_blacklist_on_timeout) {
+ std::stringstream ss;
+ mds->evict_client(session->info.inst.name.num(), false, true, ss,
+ gather.new_sub());
+ } else {
+ kill_session(session, NULL);
+ }
+
failed_reconnects++;
}
client_reconnect_gather.clear();
- reconnect_gather_finish();
+
+ if (gather.has_subs()) {
+ dout(1) << "reconnect will complete once clients are evicted" << dendl;
+ gather.set_finisher(new MDSInternalContextWrapper(mds, new FunctionContext(
+ [this](int r){reconnect_gather_finish();})));
+ gather.activate();
+ reconnect_evicting = true;
+ } else {
+ reconnect_gather_finish();
+ }
}
}
(*p)->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
mdr->drop_local_auth_pins();
- CDir *dir = NULL;
- if (CInode *in = dynamic_cast<CInode*>(*p)) {
- if (!in->is_root())
- dir = in->get_parent_dir();
- } else if (CDentry *dn = dynamic_cast<CDentry*>(*p)) {
- dir = dn->get_dir();
- } else {
- ceph_abort();
- }
- if (dir) {
- if (dir->is_freezing_dir())
- mdcache->fragment_freeze_inc_num_waiters(dir);
- if (dir->is_freezing_tree()) {
- while (!dir->is_freezing_tree_root())
- dir = dir->get_parent_dir();
- mdcache->migrator->export_freeze_inc_num_waiters(dir);
- }
- }
+ mds->locker->notify_freeze_waiter(*p);
return;
}
}
*/
mds->locker->drop_locks(mdr.get(), NULL);
mdr->drop_local_auth_pins();
+ if (!mdr->remote_auth_pins.empty())
+ mds->locker->notify_freeze_waiter(ref);
return 0;
}
CDir *dir = traverse_to_auth_dir(mdr, mdr->dn[n], refpath);
if (!dir) return 0;
- dout(10) << "rdlock_path_xlock_dentry dir " << *dir << dendl;
-
- // make sure we can auth_pin (or have already authpinned) dir
- if (dir->is_frozen()) {
- dout(7) << "waiting for !frozen/authpinnable on " << *dir << dendl;
- dir->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
- return 0;
- }
CInode *diri = dir->get_inode();
if (!mdr->reqid.name.is_mds()) {
if (!dir && diri->is_frozen()) {
dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
assert(diri->get_parent_dir());
- diri->get_parent_dir()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
+ diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
return 0;
}
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
return;
- // need read access to directory inode
- if (!check_access(mdr, diri, MAY_READ))
- return;
+ if (diri != NULL) {
+ // need read access to directory inode
+ if (!check_access(mdr, diri, MAY_READ))
+ return;
+ }
}
if (want_parent) {
return;
}
- if (!(req->head.args.open.flags & CEPH_O_EXCL)) {
+ bool excl = req->head.args.open.flags & CEPH_O_EXCL;
+
+ if (!excl) {
int r = mdcache->path_traverse(mdr, NULL, NULL, req->get_filepath(),
&mdr->dn[0], NULL, MDS_TRAVERSE_FORWARD);
if (r > 0) return;
}
return;
}
- // r == -ENOENT
}
- bool excl = (req->head.args.open.flags & CEPH_O_EXCL);
set<SimpleLock*> rdlocks, wrlocks, xlocks;
file_layout_t *dir_layout = NULL;
CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks,
return;
}
+ // created null dn.
CDir *dir = dn->get_dir();
CInode *diri = dir->get_inode();
rdlocks.insert(&diri->authlock);
return;
}
- // created null dn.
-
// create inode.
SnapRealm *realm = diri->find_snaprealm(); // use directory's realm; inode isn't attached yet.
snapid_t follows = realm->get_newest_seq();
dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
// link and unlock the NEW dentry
- dn->pop_projected_linkage();
+ CDentry::linkage_t *dnl = dn->pop_projected_linkage();
+ if (!dnl->get_inode())
+ dn->link_remote(dnl, targeti);
dn->mark_dirty(dnpv, mdr->ls);
// target inode
mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
le->metablob.add_null_dentry(dn, true);
+ dn->push_projected_linkage();
}
journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
if (inc) {
// link the new dentry
- dn->pop_projected_linkage();
+ CDentry::linkage_t *dnl = dn->pop_projected_linkage();
+ if (!dnl->get_inode())
+ dn->link_remote(dnl, targeti);
dn->mark_dirty(dpv, mdr->ls);
} else {
// unlink main dentry
dn->get_dir()->unlink_inode(dn);
+ dn->pop_projected_linkage();
dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry
}
dn->pre_dirty();
inode_t *pi = in->project_inode();
- dn->make_path_string(pi->stray_prior_path);
+ dn->make_path_string(pi->stray_prior_path, true);
mdr->add_projected_inode(in); // do this _after_ my dn->pre_dirty().. we apply that one manually.
pi->version = in->pre_dirty();
pi->ctime = mdr->get_op_stamp();
struct C_MDS_SlaveRmdirCommit : public ServerContext {
MDRequestRef mdr;
- C_MDS_SlaveRmdirCommit(Server *s, MDRequestRef& r)
- : ServerContext(s), mdr(r) { }
+ CDentry *straydn;
+ C_MDS_SlaveRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
+ : ServerContext(s), mdr(r), straydn(sd) { }
void finish(int r) override {
- server->_commit_slave_rmdir(mdr, r);
+ server->_commit_slave_rmdir(mdr, r, straydn);
}
};
dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
// set up commit waiter
- mdr->more()->slave_commit = new C_MDS_SlaveRmdirCommit(this, mdr);
+ mdr->more()->slave_commit = new C_MDS_SlaveRmdirCommit(this, mdr, straydn);
if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
}
-void Server::_commit_slave_rmdir(MDRequestRef& mdr, int r)
+void Server::_commit_slave_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
{
dout(10) << "_commit_slave_rmdir " << *mdr << " r=" << r << dendl;
if (r == 0) {
+ if (mdr->more()->slave_update_journaled) {
+ CInode *strayin = straydn->get_projected_linkage()->get_inode();
+ if (strayin && !strayin->snaprealm)
+ mdcache->clear_dirty_bits_for_stray(strayin);
+ }
+
mdr->cleanup();
if (mdr->more()->slave_update_journaled) {
req->destdnpath.push_dentry(dn->name);
if (straydn)
mdcache->replicate_stray(straydn, who, req->stray);
+
+ req->srcdn_auth = mdr->more()->srcdn_auth_mds;
// srcdn auth will verify our current witness list is sufficient
req->witnesses = witnesse;
if (tpi) {
tpi->ctime = mdr->get_op_stamp();
tpi->change_attr++;
- destdn->make_path_string(tpi->stray_prior_path);
+ destdn->make_path_string(tpi->stray_prior_path, true);
tpi->nlink--;
if (tpi->nlink == 0)
oldin->state_set(CInode::STATE_ORPHAN);
if (destdn->is_auth() && !destdnl->is_null()) {
mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
(destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
- if (destdnl->is_primary())
+ if (destdnl->is_primary()) {
+ assert(straydn);
mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
+ }
}
// move srcdn
// target inode
if (!linkmerge) {
if (destdnl->is_primary()) {
+ assert(straydn);
if (destdn->is_auth()) {
// project snaprealm, too
if (oldin->snaprealm || dest_realm->get_newest_seq() + 1 > oldin->get_oldest_snap())
if (srcdnl->is_primary() && destdn->is_auth())
srci->first = destdn->first;
- if (oldin && oldin->is_dir())
+ if (oldin && oldin->is_dir()) {
+ assert(straydn);
mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
+ }
if (srci->is_dir())
mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
CDentry::linkage_t *destdnl = destdn->get_linkage();
CInode *oldin = destdnl->get_inode();
-
- bool imported_inode = false;
// primary+remote link merge?
bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
if (destdnl->is_primary()) {
assert(straydn);
dout(10) << "straydn is " << *straydn << dendl;
- destdn->get_dir()->unlink_inode(destdn);
+ destdn->get_dir()->unlink_inode(destdn, false);
straydn->pop_projected_linkage();
if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
//oldin->open_snaprealm(); might be sufficient..
}
} else if (destdnl->is_remote()) {
- destdn->get_dir()->unlink_inode(destdn);
+ destdn->get_dir()->unlink_inode(destdn, false);
if (oldin->is_auth())
oldin->pop_and_dirty_projected_inode(mdr->ls);
}
} else { // primary
if (linkmerge) {
dout(10) << "merging primary onto remote link" << dendl;
- destdn->get_dir()->unlink_inode(destdn);
+ destdn->get_dir()->unlink_inode(destdn, false);
}
destdnl = destdn->pop_projected_linkage();
if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
// hack: fix auth bit
in->state_set(CInode::STATE_AUTH);
- imported_inode = true;
mdr->clear_ambiguous_auth();
}
// update subtree map?
if (destdnl->is_primary() && in->is_dir())
- mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true, imported_inode);
+ mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);
if (straydn && oldin->is_dir())
mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
<< " " << mdr->slave_request->srcdnpath
<< " to " << mdr->slave_request->destdnpath
<< dendl;
-
+
+ if (mdr->slave_request->is_interrupted()) {
+ dout(10) << " slave request interrupted, sending noop reply" << dendl;
+ MMDSSlaveRequest *reply= new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
+ reply->mark_interrupted();
+ mds->send_message_mds(reply, mdr->slave_to_mds);
+ mdr->slave_request->put();
+ mdr->slave_request = 0;
+ return;
+ }
+
// discover destdn
filepath destpath(mdr->slave_request->destdnpath);
dout(10) << " dest " << destpath << dendl;
mdr->more()->is_ambiguous_auth = false;
}
+ if (straydn && mdr->more()->slave_update_journaled) {
+ CInode *strayin = straydn->get_projected_linkage()->get_inode();
+ if (strayin && !strayin->snaprealm)
+ mdcache->clear_dirty_bits_for_stray(strayin);
+ }
mds->queue_waiters(finished);
mdr->cleanup();
// witnessed? or add extra witnesses?
assert(mdr->more()->witnessed.count(from) == 0);
- if (ack->witnesses.empty()) {
+ if (ack->is_interrupted()) {
+ dout(10) << " slave request interrupted, noop" << dendl;
+ } else if (ack->witnesses.empty()) {
mdr->more()->witnessed.insert(from);
if (!ack->is_not_journaled())
mdr->more()->has_journaled_slaves = true;