*/
#include <boost/lexical_cast.hpp>
-#include "include/assert.h" // lexical_cast includes system assert.h
+#include "include/ceph_assert.h" // lexical_cast includes system assert.h
#include <boost/config/warning_disable.hpp>
#include <boost/fusion/include/std_pair.hpp>
+#include <boost/range/adaptor/reversed.hpp>
#include "MDSRank.h"
#include "Server.h"
#include "InoTable.h"
#include "SnapClient.h"
#include "Mutation.h"
+#include "cephfs_features.h"
#include "msg/Messenger.h"
#include "osdc/Objecter.h"
-#include "messages/MClientSession.h"
-#include "messages/MClientRequest.h"
-#include "messages/MClientReply.h"
-#include "messages/MClientReconnect.h"
-#include "messages/MClientCaps.h"
-#include "messages/MClientSnap.h"
-
-#include "messages/MMDSSlaveRequest.h"
-
-#include "messages/MLock.h"
-
#include "events/EUpdate.h"
#include "events/ESlaveUpdate.h"
#include "events/ESession.h"
#include "events/EOpen.h"
#include "events/ECommitted.h"
+#include "events/EPurged.h"
+#include "include/stringify.h"
#include "include/filepath.h"
#include "common/errno.h"
#include "common/Timer.h"
#include "osd/OSDMap.h"
#include <errno.h>
+#include <math.h>
#include <list>
#include <iostream>
-#include <boost/utility/string_view.hpp>
-using namespace std;
+#include <string_view>
#include "common/config.h"
#undef dout_prefix
#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
-class ServerContext : public MDSInternalContextBase {
+// Async-callback context bound to a Server instance; this patch renames the
+// base class MDSInternalContextBase -> MDSContext. (get_mds() body elided by
+// the diff.)
+class ServerContext : public MDSContext {
protected:
Server *server;
MDSRank *get_mds() override
public:
explicit ServerContext(Server *s) : server(s) {
-    assert(server != NULL);
+    ceph_assert(server != NULL);
+  }
+};
+
+// Batches concurrent getattr/lookup client requests behind one in-flight
+// "front" request (mdr), so a single MDS operation can answer all of them.
+class Batch_Getattr_Lookup : public BatchOp {
+protected:
+  Server* server;
+  ceph::ref_t<MDRequestImpl> mdr;
+  MDCache* mdcache;
+  int res = 0;
+public:
+  Batch_Getattr_Lookup(Server* s, ceph::ref_t<MDRequestImpl> r, MDCache* mdc) : server(s), mdr(std::move(r)), mdcache(mdc) {}
+  // Queue another request behind the in-flight front request.
+  void add_request(const ceph::ref_t<MDRequestImpl>& m) override {
+    mdr->batch_reqs.push_back(m);
+  }
+  // Replace the front request.
+  void set_request(const ceph::ref_t<MDRequestImpl>& m) override {
+    mdr = m;
+  }
+  // Forward the front request and every live batched follower to rank t.
+  void _forward(mds_rank_t t) override {
+    mdcache->mds->forward_message_mds(mdr->release_client_request(), t);
+    mdr->set_mds_stamp(ceph_clock_now());
+    for (auto& m : mdr->batch_reqs) {
+      if (!m->killed)
+        mdcache->request_forward(m, t);
+    }
+    mdr->batch_reqs.clear();
+  }
+  // Reply to every live batched follower with result r, copying the front
+  // request's trace (tracei/tracedn) into each, then reply to the front.
+  void _respond(int r) override {
+    mdr->set_mds_stamp(ceph_clock_now());
+    for (auto& m : mdr->batch_reqs) {
+      if (!m->killed) {
+        m->tracei = mdr->tracei;
+        m->tracedn = mdr->tracedn;
+        server->respond_to_request(m, r);
+      }
+    }
+    mdr->batch_reqs.clear();
+    server->reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
+  }
+  void print(std::ostream& o) {
+    o << "[batch front=" << *mdr << "]";
  }
};
}
public:
+  // NOTE(review): fragment of ServerLogContext (class header elided by the
+  // diff); the only change in this hunk is the assert -> ceph_assert migration.
explicit ServerLogContext(Server *s) : server(s) {
-    assert(server != NULL);
+    ceph_assert(server != NULL);
  }
explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
-    assert(server != NULL);
+    ceph_assert(server != NULL);
  }
};
void Server::create_logger()
{
PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);
-  plb.add_u64_counter(l_mdss_handle_client_request,"handle_client_request",
-      "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
+
+  // Message-level counters stay at PRIO_INTERESTING so they appear in
+  // default perf dumps.
+  plb.add_u64_counter(l_mdss_handle_client_request, "handle_client_request",
+                      "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
plb.add_u64_counter(l_mdss_handle_slave_request, "handle_slave_request",
-      "Slave requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
-  plb.add_u64_counter(l_mdss_handle_client_session, "handle_client_session",
-      "Client session messages", "hcs", PerfCountersBuilder::PRIO_INTERESTING);
-  plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request", "Client requests dispatched");
-  plb.add_u64_counter(l_mdss_dispatch_slave_request, "dispatch_server_request", "Server requests dispatched");
-  plb.add_u64_counter(l_mdss_req_lookuphash, "req_lookuphash",
-      "Request type lookup hash of inode");
-  plb.add_u64_counter(l_mdss_req_lookupino, "req_lookupino",
-      "Request type lookup inode");
-  plb.add_u64_counter(l_mdss_req_lookupparent, "req_lookupparent",
-      "Request type lookup parent");
-  plb.add_u64_counter(l_mdss_req_lookupname, "req_lookupname",
-      "Request type lookup name");
-  plb.add_u64_counter(l_mdss_req_lookup, "req_lookup",
-      "Request type lookup");
-  plb.add_u64_counter(l_mdss_req_lookupsnap, "req_lookupsnap",
-      "Request type lookup snapshot");
-  plb.add_u64_counter(l_mdss_req_getattr, "req_getattr",
-      "Request type get attribute");
-  plb.add_u64_counter(l_mdss_req_setattr, "req_setattr",
-      "Request type set attribute");
-  plb.add_u64_counter(l_mdss_req_setlayout, "req_setlayout",
-      "Request type set file layout");
-  plb.add_u64_counter(l_mdss_req_setdirlayout, "req_setdirlayout",
-      "Request type set directory layout");
-  plb.add_u64_counter(l_mdss_req_setxattr, "req_setxattr",
-      "Request type set extended attribute");
-  plb.add_u64_counter(l_mdss_req_rmxattr, "req_rmxattr",
-      "Request type remove extended attribute");
-  plb.add_u64_counter(l_mdss_req_readdir, "req_readdir",
-      "Request type read directory");
-  plb.add_u64_counter(l_mdss_req_setfilelock, "req_setfilelock",
-      "Request type set file lock");
-  plb.add_u64_counter(l_mdss_req_getfilelock, "req_getfilelock",
-      "Request type get file lock");
-  plb.add_u64_counter(l_mdss_req_create, "req_create",
-      "Request type create");
-  plb.add_u64_counter(l_mdss_req_open, "req_open",
-      "Request type open");
-  plb.add_u64_counter(l_mdss_req_mknod, "req_mknod",
-      "Request type make node");
-  plb.add_u64_counter(l_mdss_req_link, "req_link",
-      "Request type link");
-  plb.add_u64_counter(l_mdss_req_unlink, "req_unlink",
-      "Request type unlink");
-  plb.add_u64_counter(l_mdss_req_rmdir, "req_rmdir",
-      "Request type remove directory");
-  plb.add_u64_counter(l_mdss_req_rename, "req_rename",
-      "Request type rename");
-  plb.add_u64_counter(l_mdss_req_mkdir, "req_mkdir",
-      "Request type make directory");
-  plb.add_u64_counter(l_mdss_req_symlink, "req_symlink",
-      "Request type symbolic link");
-  plb.add_u64_counter(l_mdss_req_lssnap, "req_lssnap",
-      "Request type list snapshot");
-  plb.add_u64_counter(l_mdss_req_mksnap, "req_mksnap",
-      "Request type make snapshot");
-  plb.add_u64_counter(l_mdss_req_rmsnap, "req_rmsnap",
-      "Request type remove snapshot");
-  plb.add_u64_counter(l_mdss_req_renamesnap, "req_renamesnap",
-      "Request type rename snapshot");
+                      "Slave requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
+  plb.add_u64_counter(l_mdss_handle_client_session,
+                      "handle_client_session", "Client session messages", "hcs",
+                      PerfCountersBuilder::PRIO_INTERESTING);
+  plb.add_u64_counter(l_mdss_cap_revoke_eviction, "cap_revoke_eviction",
+                      "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING);
+
+  // fop latencies are useful
+  // The patch replaces the per-op u64 counters above with time_avg latency
+  // counters at PRIO_USEFUL.
+  plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+  plb.add_time_avg(l_mdss_req_lookuphash_latency, "req_lookuphash_latency",
+                   "Request type lookup hash of inode latency");
+  plb.add_time_avg(l_mdss_req_lookupino_latency, "req_lookupino_latency",
+                   "Request type lookup inode latency");
+  plb.add_time_avg(l_mdss_req_lookupparent_latency, "req_lookupparent_latency",
+                   "Request type lookup parent latency");
+  plb.add_time_avg(l_mdss_req_lookupname_latency, "req_lookupname_latency",
+                   "Request type lookup name latency");
+  plb.add_time_avg(l_mdss_req_lookup_latency, "req_lookup_latency",
+                   "Request type lookup latency");
+  plb.add_time_avg(l_mdss_req_lookupsnap_latency, "req_lookupsnap_latency",
+                   "Request type lookup snapshot latency");
+  plb.add_time_avg(l_mdss_req_getattr_latency, "req_getattr_latency",
+                   "Request type get attribute latency");
+  plb.add_time_avg(l_mdss_req_setattr_latency, "req_setattr_latency",
+                   "Request type set attribute latency");
+  plb.add_time_avg(l_mdss_req_setlayout_latency, "req_setlayout_latency",
+                   "Request type set file layout latency");
+  plb.add_time_avg(l_mdss_req_setdirlayout_latency, "req_setdirlayout_latency",
+                   "Request type set directory layout latency");
+  plb.add_time_avg(l_mdss_req_setxattr_latency, "req_setxattr_latency",
+                   "Request type set extended attribute latency");
+  plb.add_time_avg(l_mdss_req_rmxattr_latency, "req_rmxattr_latency",
+                   "Request type remove extended attribute latency");
+  plb.add_time_avg(l_mdss_req_readdir_latency, "req_readdir_latency",
+                   "Request type read directory latency");
+  plb.add_time_avg(l_mdss_req_setfilelock_latency, "req_setfilelock_latency",
+                   "Request type set file lock latency");
+  plb.add_time_avg(l_mdss_req_getfilelock_latency, "req_getfilelock_latency",
+                   "Request type get file lock latency");
+  plb.add_time_avg(l_mdss_req_create_latency, "req_create_latency",
+                   "Request type create latency");
+  plb.add_time_avg(l_mdss_req_open_latency, "req_open_latency",
+                   "Request type open latency");
+  plb.add_time_avg(l_mdss_req_mknod_latency, "req_mknod_latency",
+                   "Request type make node latency");
+  plb.add_time_avg(l_mdss_req_link_latency, "req_link_latency",
+                   "Request type link latency");
+  plb.add_time_avg(l_mdss_req_unlink_latency, "req_unlink_latency",
+                   "Request type unlink latency");
+  plb.add_time_avg(l_mdss_req_rmdir_latency, "req_rmdir_latency",
+                   "Request type remove directory latency");
+  plb.add_time_avg(l_mdss_req_rename_latency, "req_rename_latency",
+                   "Request type rename latency");
+  plb.add_time_avg(l_mdss_req_mkdir_latency, "req_mkdir_latency",
+                   "Request type make directory latency");
+  plb.add_time_avg(l_mdss_req_symlink_latency, "req_symlink_latency",
+                   "Request type symbolic link latency");
+  plb.add_time_avg(l_mdss_req_lssnap_latency, "req_lssnap_latency",
+                   "Request type list snapshot latency");
+  plb.add_time_avg(l_mdss_req_mksnap_latency, "req_mksnap_latency",
+                   "Request type make snapshot latency");
+  plb.add_time_avg(l_mdss_req_rmsnap_latency, "req_rmsnap_latency",
+                   "Request type remove snapshot latency");
+  plb.add_time_avg(l_mdss_req_renamesnap_latency, "req_renamesnap_latency",
+                   "Request type rename snapshot latency");
+
+  // Dispatch counters are demoted to debug-only priority.
+  plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
+  plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request",
+                      "Client requests dispatched");
+  plb.add_u64_counter(l_mdss_dispatch_slave_request, "dispatch_server_request",
+                      "Server requests dispatched");
+
logger = plb.create_perf_counters();
g_ceph_context->get_perfcounters_collection()->add(logger);
}
Server::Server(MDSRank *m) :
mds(m),
mdcache(mds->mdcache), mdlog(mds->mdlog),
-  logger(0),
-  is_full(false),
-  reconnect_done(NULL),
-  failed_reconnects(0),
-  reconnect_evicting(false),
-  terminating_sessions(false)
+  recall_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate"))
{
+  // Cache configuration values at construction time; presumably refreshed via
+  // handle_conf_change elsewhere -- TODO confirm (not visible in this hunk).
+  replay_unsafe_with_closed_session = g_conf().get_val<bool>("mds_replay_unsafe_with_closed_session");
+  cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
+  max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
+  delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
+  supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
}
-
-/* This function DOES put the passed message before returning*/
-void Server::dispatch(Message *m)
+// Top-level dispatcher for messages delivered to the Server subsystem.
+// Messages are now reference-counted (cref_t), so no explicit put() is needed.
+void Server::dispatch(const cref_t<Message> &m)
{
switch (m->get_type()) {
case CEPH_MSG_CLIENT_RECONNECT:
-    handle_client_reconnect(static_cast<MClientReconnect*>(m));
+    handle_client_reconnect(ref_cast<MClientReconnect>(m));
return;
}
+/*
+ * In the reconnect phase, clients may send unsafe requests to the MDS before
+ * their reconnect message. Setting sessionclosed_isok handles scenarios like:
+ *
+ * 1. In the reconnect phase, a client sent unsafe requests to the MDS.
+ * 2. The reconnect timeout was reached. All sessions that did not send a
+ *    reconnect msg in time -- some of which may have sent unsafe requests --
+ *    are marked closed. (Another situation is #31668, which denies all client
+ *    reconnect msgs to speed up reboot.)
+ * 3. Those unsafe requests from sessions that missed the reconnect window (or
+ *    were denied) can then still be handled in the clientreplay phase.
+ */
+  bool sessionclosed_isok = replay_unsafe_with_closed_session;
// active?
// handle_slave_request()/handle_client_session() will wait if necessary
if (m->get_type() == CEPH_MSG_CLIENT_REQUEST && !mds->is_active()) {
-    MClientRequest *req = static_cast<MClientRequest*>(m);
+    const auto &req = ref_cast<MClientRequest>(m);
if (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
Session *session = mds->get_session(req);
-      if (!session || session->is_closed()) {
+      if (!session || (!session->is_open() && !sessionclosed_isok)) {
dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
-        req->put();
return;
}
bool queue_replay = false;
-      if (req->is_replay()) {
+      if (req->is_replay() || req->is_async()) {
dout(3) << "queuing replayed op" << dendl;
queue_replay = true;
+        if (req->head.ino &&
+            !session->have_completed_request(req->get_reqid().tid, nullptr)) {
+          // Pre-reserve the ino used by an uncompleted replayed create.
+          mdcache->add_replay_ino_alloc(inodeno_t(req->head.ino));
+        }
} else if (req->get_retry_attempt()) {
// process completed request in clientreplay stage. The completed request
// might have created new file/directorie. This guarantees MDS sends a reply
switch (m->get_type()) {
case CEPH_MSG_CLIENT_SESSION:
-    handle_client_session(static_cast<MClientSession*>(m));
+    handle_client_session(ref_cast<MClientSession>(m));
return;
case CEPH_MSG_CLIENT_REQUEST:
-    handle_client_request(static_cast<MClientRequest*>(m));
+    handle_client_request(ref_cast<MClientRequest>(m));
+    return;
+  case CEPH_MSG_CLIENT_RECLAIM:
+    handle_client_reclaim(ref_cast<MClientReclaim>(m));
return;
case MSG_MDS_SLAVE_REQUEST:
-    handle_slave_request(static_cast<MMDSSlaveRequest*>(m));
+    handle_slave_request(ref_cast<MMDSSlaveRequest>(m));
return;
default:
derr << "server unknown message " << m->get_type() << dendl;
-    assert(0 == "server unknown message");
+    ceph_abort_msg("server unknown message");
}
}
version_t cmapv;
interval_set<inodeno_t> inos;
version_t inotablev;
+  // NOTE(review): fragment of C_MDS_session_finish (class header elided by the
+  // diff). Adds purge_inos/ls so _session_logged can schedule inode purges.
+  interval_set<inodeno_t> purge_inos;
+  LogSegment *ls = nullptr;
Context *fin;
public:
C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = NULL) :
ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
-  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, interval_set<inodeno_t>& i, version_t iv, Context *fin_ = NULL) :
-    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inos(i), inotablev(iv), fin(fin_) { }
+  // Interval sets are now taken by value and moved, instead of by reference.
+  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, interval_set<inodeno_t> i, version_t iv, Context *fin_ = NULL) :
+    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inos(std::move(i)), inotablev(iv), fin(fin_) { }
+  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, interval_set<inodeno_t> i, version_t iv,
+		       interval_set<inodeno_t> _purge_inos, LogSegment *_ls, Context *fin_ = NULL) :
+    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inos(std::move(i)), inotablev(iv), purge_inos(std::move(_purge_inos)), ls(_ls), fin(fin_){}
void finish(int r) override {
-    assert(r == 0);
-    server->_session_logged(session, state_seq, open, cmapv, inos, inotablev);
+    ceph_assert(r == 0);
+    server->_session_logged(session, state_seq, open, cmapv, inos, inotablev, purge_inos, ls);
if (fin) {
fin->complete(r);
}
}
};
-/* This function DOES put the passed message before returning*/
-void Server::handle_client_session(MClientSession *m)
+// Find the session whose client-supplied "uuid" metadata matches.  If two
+// sessions share the uuid (one reclaiming the other), return the session that
+// is being reclaimed from.
+Session* Server::find_session_by_uuid(std::string_view uuid)
+{
+  Session* session = nullptr;
+  for (auto& it : mds->sessionmap.get_sessions()) {
+    auto& metadata = it.second->info.client_metadata;
+
+    auto p = metadata.find("uuid");
+    if (p == metadata.end() || p->second != uuid)
+      continue;
+
+    if (!session) {
+      session = it.second;
+    } else if (!session->reclaiming_from) {
+      // At most two sessions may match: the reclaimer and its target.
+      assert(it.second->reclaiming_from == session);
+      session = it.second;
+    } else {
+      assert(session->reclaiming_from == it.second);
+    }
+  }
+  return session;
+}
+
+// Handle a client's MClientReclaim: locate the old session by uuid, verify the
+// reclaimer has the same auth identity, and (for CEPH_RECLAIM_RESET, the only
+// supported mode) evict/kill the old session via finish_reclaim_session().
+void Server::reclaim_session(Session *session, const cref_t<MClientReclaim> &m)
+{
+  if (!session->is_open() && !session->is_stale()) {
+    dout(10) << "session not open, dropping this req" << dendl;
+    return;
+  }
+
+  auto reply = make_message<MClientReclaimReply>(0);
+  if (m->get_uuid().empty()) {
+    dout(10) << __func__ << " invalid message (no uuid)" << dendl;
+    reply->set_result(-EINVAL);
+    mds->send_message_client(reply, session);
+    return;
+  }
+
+  unsigned flags = m->get_flags();
+  if (flags != CEPH_RECLAIM_RESET) { // currently only support reset
+    dout(10) << __func__ << " unsupported flags" << dendl;
+    reply->set_result(-EOPNOTSUPP);
+    mds->send_message_client(reply, session);
+    return;
+  }
+
+  Session* target = find_session_by_uuid(m->get_uuid());
+  if (target) {
+    // Only the same authenticated entity may reclaim a session.
+    // NOTE(review): the -EPERM branch sends a reply but does not return, so it
+    // falls through to the asserts below -- TODO confirm intended.
+    if (session->info.auth_name != target->info.auth_name) {
+      dout(10) << __func__ << " session auth_name " << session->info.auth_name
+	       << " != target auth_name " << target->info.auth_name << dendl;
+      reply->set_result(-EPERM);
+      mds->send_message_client(reply, session);
+    }
+
+    assert(!target->reclaiming_from);
+    assert(!session->reclaiming_from);
+    session->reclaiming_from = target;
+    reply->set_addrs(entity_addrvec_t(target->info.inst.addr));
+  }
+
+  if (flags & CEPH_RECLAIM_RESET) {
+    finish_reclaim_session(session, reply);
+    return;
+  }
+
+  ceph_abort();
+}
+
+// Tear down the session being reclaimed from (kill it, or evict+blacklist per
+// config), then send the reply once teardown completes.  The reply callback
+// re-looks-up the session by id because the pointer may not survive teardown.
+void Server::finish_reclaim_session(Session *session, const ref_t<MClientReclaimReply> &reply)
+{
+  Session *target = session->reclaiming_from;
+  if (target) {
+    session->reclaiming_from = nullptr;
+
+    Context *send_reply;
+    if (reply) {
+      int64_t session_id = session->get_client().v;
+      send_reply = new LambdaContext([this, session_id, reply](int r) {
+	assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
+	Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(session_id));
+	if (!session) {
+	  return;
+	}
+	auto epoch = mds->objecter->with_osdmap([](const OSDMap &map){ return map.get_epoch(); });
+	reply->set_epoch(epoch);
+	mds->send_message_client(reply, session);
+      });
+    } else {
+      send_reply = nullptr;
+    }
+
+    // If the target is already blacklisted (or blacklisting on evict is
+    // disabled) a plain kill suffices; otherwise evict with blacklist.
+    bool blacklisted = mds->objecter->with_osdmap([target](const OSDMap &map) {
+      return map.is_blacklisted(target->info.inst.addr);
+    });
+
+    if (blacklisted || !g_conf()->mds_session_blacklist_on_evict) {
+      kill_session(target, send_reply);
+    } else {
+      std::stringstream ss;
+      mds->evict_client(target->get_client().v, false, true, ss, send_reply);
+    }
+  } else if (reply) {
+    mds->send_message_client(reply, session);
+  }
+}
+
+// Entry point for CEPH_MSG_CLIENT_RECLAIM.  Defers until clientreplay if the
+// MDS is not ready; FLAG_FINISH completes a reclaim, otherwise start one.
+void Server::handle_client_reclaim(const cref_t<MClientReclaim> &m)
+{
+  Session *session = mds->get_session(m);
+  dout(3) << __func__ << " " << *m << " from " << m->get_source() << dendl;
+  assert(m->get_source().is_client()); // should _not_ come from an mds!
+
+  if (!session) {
+    dout(0) << " ignoring sessionless msg " << *m << dendl;
+    return;
+  }
+
+  if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
+    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  if (m->get_flags() & MClientReclaim::FLAG_FINISH) {
+    finish_reclaim_session(session);
+  } else {
+    reclaim_session(session, m);
+  }
+}
+
+// Handle CEPH_MSG_CLIENT_SESSION (open / renewcaps / close / ...; switch cases
+// partially elided by the diff).  Sessionless senders now get an explicit
+// REJECT reply instead of being silently dropped.
+void Server::handle_client_session(const cref_t<MClientSession> &m)
{
version_t pv;
-  bool blacklisted = false;
Session *session = mds->get_session(m);
dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
-  assert(m->get_source().is_client()); // should _not_ come from an mds!
+  ceph_assert(m->get_source().is_client()); // should _not_ come from an mds!
if (!session) {
dout(0) << " ignoring sessionless msg " << *m << dendl;
-    m->put();
+    auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
+    reply->metadata["error_string"] = "sessionless";
+    mds->send_message(reply, m->get_connection());
return;
}
session->is_killing() ||
terminating_sessions) {
dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
-      // set client metadata for session opened by prepare_force_open_sessions
-      if (!m->client_meta.empty())
-        session->set_client_metadata(m->client_meta);
-      m->put();
return;
}
-    assert(session->is_closed() ||
-           session->is_closing());
+    ceph_assert(session->is_closed() || session->is_closing());
if (mds->is_stopping()) {
dout(10) << "mds is stopping, dropping open req" << dendl;
-      m->put();
return;
}
-    blacklisted = mds->objecter->with_osdmap(
-        [session](const OSDMap &osd_map) -> bool {
-          return osd_map.is_blacklisted(session->info.inst.addr);
-        });
+    {
+    auto& addr = session->info.inst.addr;
+    session->set_client_metadata(client_metadata_t(m->metadata, m->supported_features, m->metric_spec));
+    auto& client_metadata = session->info.client_metadata;
+
+    // Emits a one-line summary of the session-open outcome at debug level 2.
+    auto log_session_status = [this, m, session](std::string_view status, std::string_view err) {
+      auto now = ceph_clock_now();
+      auto throttle_elapsed = m->get_recv_complete_stamp() - m->get_throttle_stamp();
+      auto elapsed = now - m->get_recv_stamp();
+      CachedStackStringStream css;
+      *css << "New client session:"
+           << " addr=\"" <<  session->info.inst.addr << "\""
+           << ",elapsed=" << elapsed
+           << ",throttled=" << throttle_elapsed
+           << ",status=\"" << status << "\"";
+      if (!err.empty()) {
+        *css << ",error=\"" << err << "\"";
+      }
+      const auto& metadata = session->info.client_metadata;
+      if (auto it = metadata.find("root"); it != metadata.end()) {
+        *css << ",root=\"" << it->second << "\"";
+      }
+      dout(2) << css->strv() << dendl;
+    };
+
+    // Reject helper; the error string is only attached for clients that
+    // understand it (CEPHFS_FEATURE_MIMIC and later).
+    auto send_reject_message = [this, &session, &log_session_status](std::string_view err_str) {
+      auto m = make_message<MClientSession>(CEPH_SESSION_REJECT);
+      if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
+	m->metadata["error_string"] = err_str;
+      mds->send_message_client(m, session);
+      log_session_status("REJECTED", err_str);
+    };
+
+    bool blacklisted = mds->objecter->with_osdmap(
+	[&addr](const OSDMap &osd_map) -> bool {
+	  return osd_map.is_blacklisted(addr);
+	});
+
+    if (blacklisted) {
+      dout(10) << "rejecting blacklisted client " << addr << dendl;
+      send_reject_message("blacklisted");
+      session->clear();
+      break;
+    }
-    if (blacklisted) {
-      dout(10) << "rejecting blacklisted client " << session->info.inst.addr << dendl;
-      mds->send_message_client(new MClientSession(CEPH_SESSION_REJECT), session);
-      m->put();
-      return;
-    }
+    if (client_metadata.features.empty())
+      infer_supported_features(session, client_metadata);
-    session->set_client_metadata(m->client_meta);
-    dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN "
-      << session->info.client_metadata.size() << " metadata entries:" << dendl;
-    for (map<string, string>::iterator i = session->info.client_metadata.begin();
-        i != session->info.client_metadata.end(); ++i) {
-      dout(20) << "  " << i->first << ": " << i->second << dendl;
-    }
-
-    // Special case for the 'root' metadata path; validate that the claimed
-    // root is actually within the caps of the session
-    if (session->info.client_metadata.count("root")) {
-      const auto claimed_root = session->info.client_metadata.at("root");
-      // claimed_root has a leading "/" which we strip before passing
-      // into caps check
-      if (claimed_root.empty() || claimed_root[0] != '/' ||
-          !session->auth_caps.path_capable(claimed_root.substr(1))) {
-        derr << __func__ << " forbidden path claimed as mount root: "
-          << claimed_root << " by " << m->get_source() << dendl;
-        // Tell the client we're rejecting their open
-        mds->send_message_client(new MClientSession(CEPH_SESSION_REJECT), session);
-        mds->clog->warn() << "client session with invalid root '" <<
-          claimed_root << "' denied (" << session->info.inst << ")";
-        session->clear();
-        // Drop out; don't record this session in SessionMap or journal it.
-        break;
+    dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl;
+    dout(20) << "  features: '" << client_metadata.features << "'" << dendl;
+    dout(20) << " metric specification: [" << client_metadata.metric_spec << "]" << dendl;
+    for (const auto& p : client_metadata) {
+      dout(20) << "  " << p.first << ": " << p.second << dendl;
}
-    }
-    if (session->is_closed())
-      mds->sessionmap.add_session(session);
+    feature_bitset_t missing_features = required_client_features;
+    missing_features -= client_metadata.features;
+    if (!missing_features.empty()) {
+      stringstream ss;
+      ss << "missing required features '" << missing_features << "'";
+      send_reject_message(ss.str());
+      mds->clog->warn() << "client session (" << session->info.inst
+                        << ") lacks required features " << missing_features
+                        << "; client supports " << client_metadata.features;
+      session->clear();
+      break;
+    }
-    pv = mds->sessionmap.mark_projected(session);
-    sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
-    mds->sessionmap.touch_session(session);
-    mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, m->client_meta),
-                              new C_MDS_session_finish(this, session, sseq, true, pv));
-    mdlog->flush();
+    // Special case for the 'root' metadata path; validate that the claimed
+    // root is actually within the caps of the session
+    if (auto it = client_metadata.find("root"); it != client_metadata.end()) {
+      auto claimed_root = it->second;
+      stringstream ss;
+      bool denied = false;
+      // claimed_root has a leading "/" which we strip before passing
+      // into caps check
+      if (claimed_root.empty() || claimed_root[0] != '/') {
+	denied = true;
+	ss << "invalue root '" << claimed_root << "'";
+      } else if (!session->auth_caps.path_capable(claimed_root.substr(1))) {
+	denied = true;
+	ss << "non-allowable root '" << claimed_root << "'";
+      }
+
+      if (denied) {
+	// Tell the client we're rejecting their open
+	send_reject_message(ss.str());
+	mds->clog->warn() << "client session with " << ss.str()
+			  << " denied (" << session->info.inst << ")";
+	session->clear();
+	break;
+      }
+    }
+
+    // Reject a duplicate session uuid (used by reclaim) up front.
+    if (auto it = client_metadata.find("uuid"); it != client_metadata.end()) {
+      if (find_session_by_uuid(it->second)) {
+	send_reject_message("duplicated session uuid");
+	mds->clog->warn() << "client session with duplicated session uuid '"
+			  << it->second << "' denied (" << session->info.inst << ")";
+	session->clear();
+	break;
+      }
+    }
+
+    if (session->is_closed())
+      mds->sessionmap.add_session(session);
+
+    pv = mds->sessionmap.mark_projected(session);
+    sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
+    mds->sessionmap.touch_session(session);
+    // Log the ACCEPTED status only after the ESession entry is journaled.
+    auto fin = new LambdaContext([log_session_status = std::move(log_session_status)](int r){
+      ceph_assert(r == 0);
+      log_session_status("ACCEPTED", "");
+    });
+    mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, client_metadata),
+			      new C_MDS_session_finish(this, session, sseq, true, pv, fin));
+    mdlog->flush();
+    }
break;
case CEPH_SESSION_REQUEST_RENEWCAPS:
-    if (session->is_open() ||
-	session->is_stale()) {
+    if (session->is_open() || session->is_stale()) {
mds->sessionmap.touch_session(session);
if (session->is_stale()) {
mds->sessionmap.set_state(session, Session::STATE_OPEN);
mds->locker->resume_stale_caps(session);
mds->sessionmap.touch_session(session);
}
-      m->get_connection()->send_message(new MClientSession(CEPH_SESSION_RENEWCAPS, m->get_seq()));
+      auto reply = make_message<MClientSession>(CEPH_SESSION_RENEWCAPS, m->get_seq());
+      mds->send_message_client(reply, session);
} else {
dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
}
session->is_closing() ||
session->is_killing()) {
dout(10) << "already closed|closing|killing, dropping this req" << dendl;
-      m->put();
return;
}
if (session->is_importing()) {
dout(10) << "ignoring close req on importing session" << dendl;
-      m->put();
return;
}
-    assert(session->is_open() ||
+    ceph_assert(session->is_open() ||
session->is_stale() ||
session->is_opening());
if (m->get_seq() < session->get_push_seq()) {
dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
<< ", dropping" << dendl;
-      m->put();
return;
}
// We are getting a seq that is higher than expected.
<< ", BUGGY!" << dendl;
mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
<< session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
-      m->put();
return;
}
journal_close_session(session, Session::STATE_CLOSING, NULL);
default:
ceph_abort();
}
-  m->put();
+}
+
+
+// Ask one client to flush; registers a sub-gather that completes when the
+// client acks the FLUSHMSG.  No-op for sessions that are not open, have no
+// connection, or lack CEPH_FEATURE_EXPORT_PEER.
+void Server::flush_session(Session *session, MDSGatherBuilder *gather) {
+  if (!session->is_open() ||
+      !session->get_connection() ||
+      !session->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER)) {
+    return;
+  }
+
+  version_t seq = session->wait_for_flush(gather->new_sub());
+  mds->send_message_client(
+    make_message<MClientSession>(CEPH_SESSION_FLUSHMSG, seq), session);
+}
void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
{
for (set<client_t>::iterator p = client_set.begin(); p != client_set.end(); ++p) {
Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
-    assert(session);
-    if (!session->is_open() ||
-	!session->connection.get() ||
-	!session->connection->has_feature(CEPH_FEATURE_EXPORT_PEER))
-      continue;
-    version_t seq = session->wait_for_flush(gather.new_sub());
-    mds->send_message_client(new MClientSession(CEPH_SESSION_FLUSHMSG, seq), session);
+    ceph_assert(session);
+    // Per-session flush logic is factored out into flush_session().
+    flush_session(session, &gather);
}
}
void Server::finish_flush_session(Session *session, version_t seq)
{
+  // Wake every waiter registered up to (and including) flush seq.
-  list<MDSInternalContextBase*> finished;
+  MDSContext::vec finished;
session->finish_flush(seq, finished);
mds->queue_waiters(finished);
}
void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
- interval_set<inodeno_t>& inos, version_t piv)
+ const interval_set<inodeno_t>& inos, version_t piv,
+ const interval_set<inodeno_t>& purge_inos, LogSegment *ls)
{
- dout(10) << "_session_logged " << session->info.inst << " state_seq " << state_seq << " " << (open ? "open":"close")
- << " " << pv << dendl;
-
+ dout(10) << "_session_logged " << session->info.inst
+ << " state_seq " << state_seq
+ << " " << (open ? "open":"close")
+ << " " << pv
+ << " purge_inos : " << purge_inos << dendl;
+
+ if (NULL != ls) {
+ dout(10) << "_session_logged seq : " << ls->seq << dendl;
+ if (purge_inos.size()){
+ ls->purge_inodes.insert(purge_inos);
+ mdcache->purge_inodes(purge_inos, ls);
+ }
+ }
+
if (piv) {
- assert(session->is_closing() || session->is_killing() ||
+ ceph_assert(session->is_closing() || session->is_killing() ||
session->is_opening()); // re-open closing session
session->info.prealloc_inos.subtract(inos);
+ session->delegated_inos.clear();
mds->inotable->apply_release_ids(inos);
- assert(mds->inotable->get_version() == piv);
+ ceph_assert(mds->inotable->get_version() == piv);
}
mds->sessionmap.mark_dirty(session);
<< ", noop" << dendl;
// close must have been canceled (by an import?), or any number of other things..
} else if (open) {
- assert(session->is_opening());
+ ceph_assert(session->is_opening());
mds->sessionmap.set_state(session, Session::STATE_OPEN);
mds->sessionmap.touch_session(session);
- assert(session->connection != NULL);
- session->connection->send_message(new MClientSession(CEPH_SESSION_OPEN));
- if (mdcache->is_readonly())
- session->connection->send_message(new MClientSession(CEPH_SESSION_FORCE_RO));
+ ceph_assert(session->get_connection());
+ auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
+ if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
+ reply->supported_features = supported_features;
+ mds->send_message_client(reply, session);
+ if (mdcache->is_readonly()) {
+ auto m = make_message<MClientSession>(CEPH_SESSION_FORCE_RO);
+ mds->send_message_client(m, session);
+ }
} else if (session->is_closing() ||
session->is_killing()) {
// kill any lingering capabilities, leases, requests
Capability *cap = session->caps.front();
CInode *in = cap->get_inode();
dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
- mds->locker->remove_client_cap(in, session->info.inst.name.num());
+ mds->locker->remove_client_cap(in, cap, true);
}
while (!session->leases.empty()) {
ClientLease *r = session->leases.front();
dout(20) << " killing client lease of " << *dn << dendl;
dn->remove_client_lease(r, mds->locker);
}
- if (client_reconnect_gather.count(session->info.get_client())) {
+ if (client_reconnect_gather.erase(session->info.get_client())) {
dout(20) << " removing client from reconnect set" << dendl;
- client_reconnect_gather.erase(session->info.get_client());
-
if (client_reconnect_gather.empty()) {
dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
reconnect_gather_finish();
}
}
+ if (client_reclaim_gather.erase(session->info.get_client())) {
+ dout(20) << " removing client from reclaim set" << dendl;
+ if (client_reclaim_gather.empty()) {
+ dout(7) << " client " << session->info.inst << " was last reclaimed, finishing" << dendl;
+ mds->maybe_clientreplay_done();
+ }
+ }
if (session->is_closing()) {
// mark con disposable. if there is a fault, we will get a
// ms_handle_remote_reset() and realize they had in fact closed.
// do this *before* sending the message to avoid a possible
// race.
- if (session->connection != NULL) {
+ if (session->get_connection()) {
// Conditional because terminate_sessions will indiscrimately
// put sessions in CLOSING whether they ever had a conn or not.
- session->connection->mark_disposable();
+ session->get_connection()->mark_disposable();
}
// reset session
- mds->send_message_client(new MClientSession(CEPH_SESSION_CLOSE), session);
+ mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_CLOSE), session);
mds->sessionmap.set_state(session, Session::STATE_CLOSED);
session->clear();
mds->sessionmap.remove_session(session);
} else if (session->is_killing()) {
// destroy session, close connection
- if (session->connection != NULL) {
- session->connection->mark_down();
- session->connection->set_priv(NULL);
+ if (session->get_connection()) {
+ session->get_connection()->mark_down();
+ mds->sessionmap.set_state(session, Session::STATE_CLOSED);
+ session->set_connection(nullptr);
}
mds->sessionmap.remove_session(session);
} else {
* - sessions learned from other MDSs during a cross-MDS rename
*/
version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
+ map<client_t,client_metadata_t>& cmm,
map<client_t, pair<Session*,uint64_t> >& smap)
{
version_t pv = mds->sessionmap.get_projected();
<< dendl;
mds->objecter->with_osdmap(
- [this, &cm](const OSDMap &osd_map) {
+ [this, &cm, &cmm](const OSDMap &osd_map) {
for (auto p = cm.begin(); p != cm.end(); ) {
if (osd_map.is_blacklisted(p->second.addr)) {
dout(10) << " ignoring blacklisted client." << p->first
<< " (" << p->second.addr << ")" << dendl;
+ cmm.erase(p->first);
cm.erase(p++);
} else {
++p;
session->is_closing() ||
session->is_killing()) {
sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
+ auto q = cmm.find(p->first);
+ if (q != cmm.end())
+ session->info.client_metadata.merge(q->second);
} else {
- assert(session->is_open() ||
+ ceph_assert(session->is_open() ||
session->is_opening() ||
session->is_stale());
sseq = 0;
dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
mds->sessionmap.set_state(session, Session::STATE_OPEN);
mds->sessionmap.touch_session(session);
- mds->send_message_client(new MClientSession(CEPH_SESSION_OPEN), session);
+
+ auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
+ if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
+ reply->supported_features = supported_features;
+ mds->send_message_client(reply, session);
+
if (mdcache->is_readonly())
- mds->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO), session);
+ mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
}
} else {
dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
- assert(session->is_open() || session->is_stale());
+ ceph_assert(session->is_open() || session->is_stale());
}
if (dec_import) {
void Server::terminate_sessions()
{
- dout(2) << "terminate_sessions" << dendl;
+ dout(5) << "terminating all sessions..." << dendl;
terminating_sessions = true;
void Server::find_idle_sessions()
{
- dout(10) << "find_idle_sessions. laggy until " << mds->get_laggy_until() << dendl;
+ auto now = clock::now();
+ auto last_cleared_laggy = mds->last_cleared_laggy();
+
+ dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy << "s ago" << dendl;
// timeout/stale
// (caps go stale, lease die)
- utime_t now = ceph_clock_now();
- utime_t cutoff = now;
- cutoff -= g_conf->mds_session_timeout;
- while (1) {
- Session *session = mds->sessionmap.get_oldest_session(Session::STATE_OPEN);
- if (!session) break;
- dout(20) << "laggiest active session is " << session->info.inst << dendl;
- if (session->last_cap_renew >= cutoff) {
- dout(20) << "laggiest active session is " << session->info.inst << " and sufficiently new ("
- << session->last_cap_renew << ")" << dendl;
- break;
- }
-
- dout(10) << "new stale session " << session->info.inst << " last " << session->last_cap_renew << dendl;
- mds->sessionmap.set_state(session, Session::STATE_STALE);
- mds->locker->revoke_stale_caps(session);
- mds->locker->remove_stale_leases(session);
- mds->send_message_client(new MClientSession(CEPH_SESSION_STALE, session->get_push_seq()), session);
- finish_flush_session(session, session->get_push_seq());
- }
-
- // autoclose
- cutoff = now;
- cutoff -= g_conf->mds_session_autoclose;
+ double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now());
+ double cutoff = queue_max_age + mds->mdsmap->get_session_timeout();
// don't kick clients if we've been laggy
- if (mds->get_laggy_until() > cutoff) {
- dout(10) << " laggy_until " << mds->get_laggy_until() << " > cutoff " << cutoff
- << ", not kicking any clients to be safe" << dendl;
+ if (last_cleared_laggy < cutoff) {
+ dout(10) << " last cleared laggy " << last_cleared_laggy << "s ago (< cutoff " << cutoff
+ << "), not marking any client stale" << dendl;
return;
}
- if (mds->sessionmap.get_sessions().size() == 1 &&
- mds->mdsmap->get_num_in_mds() == 1) {
- dout(20) << "not evicting a slow client, because there is only one"
- << dendl;
- return;
+ std::vector<Session*> to_evict;
+
+ bool defer_session_stale = g_conf().get_val<bool>("mds_defer_session_stale");
+ const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN);
+ if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) {
+ std::vector<Session*> new_stale;
+
+ for (auto session : *(sessions_p1->second)) {
+ auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
+ if (last_cap_renew_span < cutoff) {
+ dout(20) << "laggiest active session is " << session->info.inst
+ << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
+ break;
+ }
+
+ if (session->last_seen > session->last_cap_renew) {
+ last_cap_renew_span = std::chrono::duration<double>(now - session->last_seen).count();
+ if (last_cap_renew_span < cutoff) {
+ dout(20) << "laggiest active session is " << session->info.inst
+ << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
+ continue;
+ }
+ }
+
+ if (last_cap_renew_span >= mds->mdsmap->get_session_autoclose()) {
+ dout(20) << "evicting session " << session->info.inst << " since autoclose "
+ "has arrived" << dendl;
+ // evict session without marking it stale
+ to_evict.push_back(session);
+ continue;
+ }
+
+ if (defer_session_stale &&
+ !session->is_any_flush_waiter() &&
+ !mds->locker->is_revoking_any_caps_from(session->get_client())) {
+ dout(20) << "deferring marking session " << session->info.inst << " stale "
+ "since it holds no caps" << dendl;
+ continue;
+ }
+
+ auto it = session->info.client_metadata.find("timeout");
+ if (it != session->info.client_metadata.end()) {
+ unsigned timeout = strtoul(it->second.c_str(), nullptr, 0);
+ if (timeout == 0) {
+ dout(10) << "skipping session " << session->info.inst
+ << ", infinite timeout specified" << dendl;
+ continue;
+ }
+ double cutoff = queue_max_age + timeout;
+ if (last_cap_renew_span < cutoff) {
+ dout(10) << "skipping session " << session->info.inst
+ << ", timeout (" << timeout << ") specified"
+ << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
+ continue;
+ }
+
+ // do not go through stale, evict it directly.
+ to_evict.push_back(session);
+ } else {
+ dout(10) << "new stale session " << session->info.inst
+ << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
+ new_stale.push_back(session);
+ }
+ }
+
+ for (auto session : new_stale) {
+ mds->sessionmap.set_state(session, Session::STATE_STALE);
+ if (mds->locker->revoke_stale_caps(session)) {
+ mds->locker->remove_stale_leases(session);
+ finish_flush_session(session, session->get_push_seq());
+ auto m = make_message<MClientSession>(CEPH_SESSION_STALE, session->get_push_seq());
+ mds->send_message_client(m, session);
+ } else {
+ to_evict.push_back(session);
+ }
+ }
}
+ // autoclose
+ cutoff = queue_max_age + mds->mdsmap->get_session_autoclose();
+
// Collect a list of sessions exceeding the autoclose threshold
- std::vector<Session *> to_evict;
- const auto sessions_p = mds->sessionmap.by_state.find(Session::STATE_STALE);
- if (sessions_p == mds->sessionmap.by_state.end() || sessions_p->second->empty()) {
- return;
+ const auto sessions_p2 = mds->sessionmap.by_state.find(Session::STATE_STALE);
+ if (sessions_p2 != mds->sessionmap.by_state.end() && !sessions_p2->second->empty()) {
+ for (auto session : *(sessions_p2->second)) {
+ assert(session->is_stale());
+ auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
+ if (last_cap_renew_span < cutoff) {
+ dout(20) << "oldest stale session is " << session->info.inst
+ << " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl;
+ break;
+ }
+ to_evict.push_back(session);
+ }
}
- const auto &stale_sessions = sessions_p->second;
- assert(stale_sessions != nullptr);
- for (const auto &session: *stale_sessions) {
+ for (auto session: to_evict) {
if (session->is_importing()) {
- dout(10) << "stopping at importing session " << session->info.inst << dendl;
- break;
- }
- assert(session->is_stale());
- if (session->last_cap_renew >= cutoff) {
- dout(20) << "oldest stale session is " << session->info.inst << " and sufficiently new ("
- << session->last_cap_renew << ")" << dendl;
- break;
+ dout(10) << "skipping session " << session->info.inst << ", it's being imported" << dendl;
+ continue;
}
- to_evict.push_back(session);
- }
-
- for (const auto &session: to_evict) {
- utime_t age = now;
- age -= session->last_cap_renew;
+ auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
mds->clog->warn() << "evicting unresponsive client " << *session
- << ", after " << age << " seconds";
- dout(10) << "autoclosing stale session " << session->info.inst << " last "
- << session->last_cap_renew << dendl;
+ << ", after " << last_cap_renew_span << " seconds";
+ dout(10) << "autoclosing stale session " << session->info.inst
+ << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
- if (g_conf->mds_session_blacklist_on_timeout) {
+ if (g_conf()->mds_session_blacklist_on_timeout) {
std::stringstream ss;
- mds->evict_client(session->info.inst.name.num(), false, true,
- ss, nullptr);
+ mds->evict_client(session->get_client().v, false, true, ss, nullptr);
} else {
kill_session(session, NULL);
}
}
}
+// Evict clients that have not acknowledged an MDS cap revoke within the
+// configured timeout (cap_revoke_eviction_timeout, from
+// "mds_cap_revoke_eviction_timeout"); a timeout of 0 disables this check.
+void Server::evict_cap_revoke_non_responders() {
+  if (!cap_revoke_eviction_timeout) {
+    return;
+  }
+
+  // clients whose oldest outstanding revoke is older than the timeout
+  auto&& to_evict = mds->locker->get_late_revoking_clients(cap_revoke_eviction_timeout);
+
+  for (auto const &client: to_evict) {
+    mds->clog->warn() << "client id " << client << " has not responded to"
+                      << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
+                      << " seconds, evicting";
+    dout(1) << __func__ << ": evicting cap revoke non-responder client id "
+            << client << dendl;
+
+    std::stringstream ss;
+    bool evicted = mds->evict_client(client.v, false,
+                                     g_conf()->mds_session_blacklist_on_evict,
+                                     ss, nullptr);
+    if (evicted && logger) {
+      // record the eviction in perf counters for observability
+      logger->inc(l_mdss_cap_revoke_eviction);
+    }
+  }
+}
+
+// Re-read Server-related config options at runtime. `changed` holds the names
+// of the options that were modified; only those are refreshed.
+void Server::handle_conf_change(const std::set<std::string>& changed) {
+  if (changed.count("mds_replay_unsafe_with_closed_session")) {
+    replay_unsafe_with_closed_session = g_conf().get_val<bool>("mds_replay_unsafe_with_closed_session");
+  }
+  if (changed.count("mds_cap_revoke_eviction_timeout")) {
+    cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
+    dout(20) << __func__ << " cap revoke eviction timeout changed to "
+            << cap_revoke_eviction_timeout << dendl;
+  }
+  if (changed.count("mds_recall_max_decay_rate")) {
+    // NOTE(review): constructing a fresh DecayCounter resets any accumulated
+    // recall throttle state — presumably intended on a rate change; verify
+    recall_throttle = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
+  }
+  if (changed.count("mds_max_snaps_per_dir")) {
+    max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
+    dout(20) << __func__ << " max snapshots per directory changed to "
+            << max_snaps_per_dir << dendl;
+  }
+  if (changed.count("mds_client_delegate_inos_pct")) {
+    delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
+  }
+}
+
/*
- * XXX bump in the interface here, not using an MDSInternalContextBase here
+ * XXX bump in the interface here, not using an MDSContext here
* because all the callers right now happen to use a SaferCond
*/
-void Server::kill_session(Session *session, Context *on_safe)
+// Tear down a client session. If need_purge_inos is set, the session's
+// preallocated inodes are purged rather than returned to the InoTable.
+// on_safe (may be null) completes once the close is safely journaled.
+void Server::kill_session(Session *session, Context *on_safe, bool need_purge_inos)
{
-  assert(mds->mds_lock.is_locked_by_me());
+  ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
  if ((session->is_opening() ||
       session->is_open() ||
       session->is_stale()) &&
      !session->is_importing()) {
    dout(10) << "kill_session " << session << dendl;
-    journal_close_session(session, Session::STATE_KILLING, on_safe);
+    journal_close_session(session, Session::STATE_KILLING, on_safe, need_purge_inos);
  } else {
    dout(10) << "kill_session importing or already closing/killing " << session << dendl;
-    assert(session->is_closing() ||
-	   session->is_closed() ||
-	   session->is_killing() ||
-	   session->is_importing());
-    if (on_safe) {
-      on_safe->complete(0);
+    if (session->is_closing() ||
+	session->is_killing()) {
+      // a close is already journaling: piggyback on the in-flight flush
+      if (on_safe)
+	mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, on_safe));
+    } else {
+      // nothing to journal for a closed/importing session
+      ceph_assert(session->is_closed() ||
+		  session->is_importing());
+      if (on_safe)
+	on_safe->complete(0);
    }
  }
}
size_t Server::apply_blacklist(const std::set<entity_addr_t> &blacklist)
{
- std::list<Session*> victims;
- const auto sessions = mds->sessionmap.get_sessions();
- for (const auto p : sessions) {
+ bool prenautilus = mds->objecter->with_osdmap(
+ [&](const OSDMap& o) {
+ return o.require_osd_release < ceph_release_t::nautilus;
+ });
+
+ std::vector<Session*> victims;
+ const auto& sessions = mds->sessionmap.get_sessions();
+ for (const auto& p : sessions) {
if (!p.first.is_client()) {
// Do not apply OSDMap blacklist to MDS daemons, we find out
// about their death via MDSMap.
}
Session *s = p.second;
- if (blacklist.count(s->info.inst.addr)) {
+ auto inst_addr = s->info.inst.addr;
+ // blacklist entries are always TYPE_ANY for nautilus+
+ inst_addr.set_type(entity_addr_t::TYPE_ANY);
+ if (blacklist.count(inst_addr)) {
victims.push_back(s);
+ continue;
+ }
+ if (prenautilus) {
+ // ...except pre-nautilus, they were TYPE_LEGACY
+ inst_addr.set_type(entity_addr_t::TYPE_LEGACY);
+ if (blacklist.count(inst_addr)) {
+ victims.push_back(s);
+ }
}
}
- for (const auto s : victims) {
+ for (const auto& s : victims) {
kill_session(s, nullptr);
}
return victims.size();
}
-void Server::journal_close_session(Session *session, int state, Context *on_safe)
+// Journal an ESession close for `session`, moving it to `state`
+// (STATE_CLOSING or STATE_KILLING). When need_purge_inos is set the
+// session's preallocated inodes are scheduled for purge instead of being
+// released back to the InoTable. on_safe (may be null) fires once the
+// event is safely on disk.
+void Server::journal_close_session(Session *session, int state, Context *on_safe, bool need_purge_inos)
{
+  dout(10) << __func__ << " : "
+	   << "(" << need_purge_inos << ")"
+	   << session->info.inst
+	   << "(" << session->info.prealloc_inos.size() << "|" << session->pending_prealloc_inos.size() << ")" << dendl;
+
  uint64_t sseq = mds->sessionmap.set_state(session, state);
  version_t pv = mds->sessionmap.mark_projected(session);
  version_t piv = 0;
  // release alloc and pending-alloc inos for this session
  // and wipe out session state, in case the session close aborts for some reason
  interval_set<inodeno_t> both;
-  both.insert(session->info.prealloc_inos);
  both.insert(session->pending_prealloc_inos);
+  // when purging, the prealloc inos are consumed by the purge below rather
+  // than released back to the inotable
+  if (!need_purge_inos)
+    both.insert(session->info.prealloc_inos);
  if (both.size()) {
    mds->inotable->project_release_ids(both);
    piv = mds->inotable->get_projected_version();
  } else
    piv = 0;
-
-  mdlog->start_submit_entry(new ESession(session->info.inst, false, pv, both, piv),
-			    new C_MDS_session_finish(this, session, sseq, false, pv, both, piv, on_safe));
+
+  if (need_purge_inos && session->info.prealloc_inos.size()) {
+    dout(10) << "start purge inodes " << session->info.prealloc_inos << dendl;
+    LogSegment* ls = mdlog->get_current_segment();
+    LogEvent* e = new ESession(session->info.inst, false, pv, both, piv, session->info.prealloc_inos);
+    MDSLogContextBase* c = new C_MDS_session_finish(this, session, sseq, false, pv, both, piv,
+						    session->info.prealloc_inos, ls, on_safe);
+    mdlog->start_submit_entry(e, c);
+  } else {
+    interval_set<inodeno_t> empty;
+    LogEvent* e = new ESession(session->info.inst, false, pv, both, piv, empty);
+    MDSLogContextBase* c = new C_MDS_session_finish(this, session, sseq, false, pv, both, piv, on_safe);
+    mdlog->start_submit_entry(e, c);
+  }
  mdlog->flush();
  // clean up requests, too
-  elist<MDRequestImpl*>::iterator p =
-    session->requests.begin(member_offset(MDRequestImpl,
-					  item_session_request));
-  while (!p.end()) {
-    MDRequestRef mdr = mdcache->request_get((*p)->reqid);
+  for (auto p = session->requests.begin(); !p.end(); ) {
+    MDRequestRef mdr(*p);
    ++p;
    mdcache->request_kill(mdr);
  }
  finish_flush_session(session, session->get_push_seq());
}
-void Server::reconnect_clients(MDSInternalContext *reconnect_done_)
+void Server::reconnect_clients(MDSContext *reconnect_done_)
{
reconnect_done = reconnect_done_;
+ auto now = clock::now();
set<Session*> sessions;
mds->sessionmap.get_client_session_set(sessions);
for (auto session : sessions) {
- if (session->is_open())
- client_reconnect_gather.insert(session->get_client());
+ if (session->is_open()) {
+ client_reconnect_gather.insert(session->get_client());
+ session->set_reconnecting(true);
+ session->last_cap_renew = now;
+ }
}
if (client_reconnect_gather.empty()) {
// clients will get the mdsmap and discover we're reconnecting via the monitor.
- reconnect_start = ceph_clock_now();
+ reconnect_start = now;
dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
mds->sessionmap.dump();
}
-/* This function DOES put the passed message before returning*/
-void Server::handle_client_reconnect(MClientReconnect *m)
+void Server::handle_client_reconnect(const cref_t<MClientReconnect> &m)
{
- dout(7) << "handle_client_reconnect " << m->get_source() << dendl;
+ dout(7) << "handle_client_reconnect " << m->get_source()
+ << (m->has_more() ? " (more)" : "") << dendl;
client_t from = m->get_source().num();
Session *session = mds->get_session(m);
- assert(session);
+ if (!session) {
+ dout(0) << " ignoring sessionless msg " << *m << dendl;
+ auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
+ reply->metadata["error_string"] = "sessionless";
+ mds->send_message(reply, m->get_connection());
+ return;
+ }
+
+ if (!session->is_open()) {
+ dout(0) << " ignoring msg from not-open session" << *m << dendl;
+ auto reply = make_message<MClientSession>(CEPH_SESSION_CLOSE);
+ mds->send_message(reply, m->get_connection());
+ return;
+ }
if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
return;
}
- utime_t delay = ceph_clock_now();
- delay -= reconnect_start;
+ auto delay = std::chrono::duration<double>(clock::now() - reconnect_start).count();
dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;
bool deny = false;
mds->clog->info() << "denied reconnect attempt (mds is "
<< ceph_mds_state_name(mds->get_state())
<< ") from " << m->get_source_inst()
- << " after " << delay << " (allowed interval " << g_conf->mds_reconnect_timeout << ")";
- deny = true;
- } else if (!session->is_open()) {
- dout(1) << " session is closed, ignoring reconnect, sending close" << dendl;
- mds->clog->info() << "denied reconnect attempt (mds is "
- << ceph_mds_state_name(mds->get_state())
- << ") from " << m->get_source_inst() << " (session is closed)";
- deny = true;
- } else if (mdcache->is_readonly()) {
- dout(1) << " read-only FS, ignoring reconnect, sending close" << dendl;
- mds->clog->info() << "denied reconnect attempt (mds is read-only)";
+ << " after " << delay << " (allowed interval " << g_conf()->mds_reconnect_timeout << ")";
deny = true;
+ } else {
+ std::string error_str;
+ if (!session->is_open()) {
+ error_str = "session is closed";
+ } else if (mdcache->is_readonly()) {
+ error_str = "mds is readonly";
+ } else {
+ if (session->info.client_metadata.features.empty())
+ infer_supported_features(session, session->info.client_metadata);
+
+ feature_bitset_t missing_features = required_client_features;
+ missing_features -= session->info.client_metadata.features;
+ if (!missing_features.empty()) {
+ stringstream ss;
+ ss << "missing required features '" << missing_features << "'";
+ error_str = ss.str();
+ }
+ }
+
+ if (!error_str.empty()) {
+ deny = true;
+ dout(1) << " " << error_str << ", ignoring reconnect, sending close" << dendl;
+ mds->clog->info() << "denied reconnect attempt from "
+ << m->get_source_inst() << " (" << error_str << ")";
+ }
}
if (deny) {
- m->get_connection()->send_message(new MClientSession(CEPH_SESSION_CLOSE));
- m->put();
+ auto r = make_message<MClientSession>(CEPH_SESSION_CLOSE);
+ mds->send_message_client(r, session);
+ if (session->is_open())
+ kill_session(session, nullptr);
return;
}
- // notify client of success with an OPEN
- m->get_connection()->send_message(new MClientSession(CEPH_SESSION_OPEN));
- session->last_cap_renew = ceph_clock_now();
- mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;
+ if (!m->has_more()) {
+ // notify client of success with an OPEN
+ auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
+ if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
+ reply->supported_features = supported_features;
+ mds->send_message_client(reply, session);
+ mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;
+ }
+
+ session->last_cap_renew = clock::now();
// snaprealms
- for (vector<ceph_mds_snaprealm_reconnect>::iterator p = m->realms.begin();
- p != m->realms.end();
- ++p) {
- CInode *in = mdcache->get_inode(inodeno_t(p->ino));
+ for (const auto &r : m->realms) {
+ CInode *in = mdcache->get_inode(inodeno_t(r.realm.ino));
if (in && in->state_test(CInode::STATE_PURGING))
continue;
if (in) {
- assert(in->snaprealm);
- if (in->snaprealm->have_past_parents_open()) {
- dout(15) << "open snaprealm (w/ past parents) on " << *in << dendl;
- mdcache->finish_snaprealm_reconnect(from, in->snaprealm, snapid_t(p->seq));
+ if (in->snaprealm) {
+ dout(15) << "open snaprealm (w inode) on " << *in << dendl;
} else {
- dout(15) << "open snaprealm (w/o past parents) on " << *in << dendl;
- mdcache->add_reconnected_snaprealm(from, inodeno_t(p->ino), snapid_t(p->seq));
+ // this can happen if we are non-auth or we rollback snaprealm
+ dout(15) << "open snaprealm (null snaprealm) on " << *in << dendl;
}
+ mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
} else {
- dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(p->ino)
- << " seq " << p->seq << dendl;
- mdcache->add_reconnected_snaprealm(from, inodeno_t(p->ino), snapid_t(p->seq));
+ dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r.realm.ino)
+ << " seq " << r.realm.seq << dendl;
+ mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
}
}
// caps
- for (map<inodeno_t, cap_reconnect_t>::iterator p = m->caps.begin();
- p != m->caps.end();
- ++p) {
+ for (const auto &p : m->caps) {
// make sure our last_cap_id is MAX over all issued caps
- if (p->second.capinfo.cap_id > mdcache->last_cap_id)
- mdcache->last_cap_id = p->second.capinfo.cap_id;
+ if (p.second.capinfo.cap_id > mdcache->last_cap_id)
+ mdcache->last_cap_id = p.second.capinfo.cap_id;
- CInode *in = mdcache->get_inode(p->first);
+ CInode *in = mdcache->get_inode(p.first);
if (in && in->state_test(CInode::STATE_PURGING))
continue;
if (in && in->is_auth()) {
// we recovered it, and it's ours. take note.
- dout(15) << "open cap realm " << inodeno_t(p->second.capinfo.snaprealm)
+ dout(15) << "open cap realm " << inodeno_t(p.second.capinfo.snaprealm)
<< " on " << *in << dendl;
- in->reconnect_cap(from, p->second, session);
- mdcache->add_reconnected_cap(from, p->first, p->second);
- recover_filelocks(in, p->second.flockbl, m->get_orig_source().num());
+ in->reconnect_cap(from, p.second, session);
+ mdcache->add_reconnected_cap(from, p.first, p.second);
+ recover_filelocks(in, p.second.flockbl, m->get_orig_source().num());
continue;
}
// not mine.
dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
// add to cap export list.
- p->second.path.clear(); // we don't need path
- mdcache->rejoin_export_caps(p->first, from, p->second,
- in->authority().first);
+ mdcache->rejoin_export_caps(p.first, from, p.second,
+ in->authority().first, true);
} else {
// don't know if the inode is mine
- dout(10) << "missing ino " << p->first << ", will load later" << dendl;
- p->second.path.clear(); // we don't need path
- mdcache->rejoin_recovered_caps(p->first, from, p->second, MDS_RANK_NONE);
+ dout(10) << "missing ino " << p.first << ", will load later" << dendl;
+ mdcache->rejoin_recovered_caps(p.first, from, p.second, MDS_RANK_NONE);
}
}
- mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);
- // remove from gather set
- client_reconnect_gather.erase(from);
- if (client_reconnect_gather.empty())
- reconnect_gather_finish();
+ reconnect_last_seen = clock::now();
- m->put();
+ if (!m->has_more()) {
+ mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);
+
+ // remove from gather set
+ client_reconnect_gather.erase(from);
+ session->set_reconnecting(false);
+ if (client_reconnect_gather.empty())
+ reconnect_gather_finish();
+ }
}
+// Derive CephFS feature bits for clients that did not send an explicit
+// feature set (pre-mimic clients), using the client's advertised version
+// strings and the negotiated messenger connection features.
+void Server::infer_supported_features(Session *session, client_metadata_t& client_metadata)
+{
+  int supported = -1;
+  auto it = client_metadata.find("ceph_version");
+  if (it != client_metadata.end()) {
+    // user space client
+    if (it->second.compare(0, 16, "ceph version 12.") == 0)
+      supported = CEPHFS_FEATURE_LUMINOUS;
+    else if (session->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR))
+      supported = CEPHFS_FEATURE_KRAKEN;
+  } else {
+    it = client_metadata.find("kernel_version");
+    if (it != client_metadata.end()) {
+      // kernel client
+      if (session->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING))
+	supported = CEPHFS_FEATURE_LUMINOUS;
+    }
+  }
+  // fallback: file-layout-v2 support implies at least jewel
+  if (supported == -1 &&
+      session->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2))
+    supported = CEPHFS_FEATURE_JEWEL;
+
+  if (supported >= 0) {
+    // set every feature bit up to and including the inferred release
+    unsigned long value = (1UL << (supported + 1)) - 1;
+    client_metadata.features = feature_bitset_t(value);
+    dout(10) << __func__ << " got '" << client_metadata.features << "'" << dendl;
+  }
+}
+// Recompute the feature bits every client must support, based on the
+// mdsmap's min_compat_client, and evict already-connected clients that
+// lack a required feature (skipping ones that are already blacklisted).
+void Server::update_required_client_features()
+{
+  vector<size_t> bits = CEPHFS_FEATURES_MDS_REQUIRED;
+
+  /* If this blows up on you, you added a release without adding a new release bit to cephfs_features.h */
+  static_assert(CEPHFS_CURRENT_RELEASE == CEPH_RELEASE_MAX-1);
+
+  // map the minimum compatible client release to its feature bit
+  ceph_release_t min_compat = mds->mdsmap->get_min_compat_client();
+  if (min_compat >= ceph_release_t::octopus)
+    bits.push_back(CEPHFS_FEATURE_OCTOPUS);
+  else if (min_compat >= ceph_release_t::nautilus)
+    bits.push_back(CEPHFS_FEATURE_NAUTILUS);
+  else if (min_compat >= ceph_release_t::mimic)
+    bits.push_back(CEPHFS_FEATURE_MIMIC);
+  else if (min_compat >= ceph_release_t::luminous)
+    bits.push_back(CEPHFS_FEATURE_LUMINOUS);
+  else if (min_compat >= ceph_release_t::kraken)
+    bits.push_back(CEPHFS_FEATURE_KRAKEN);
+  else if (min_compat >= ceph_release_t::jewel)
+    bits.push_back(CEPHFS_FEATURE_JEWEL);
+
+  std::sort(bits.begin(), bits.end());
+  required_client_features = feature_bitset_t(bits);
+  dout(7) << "required_client_features: " << required_client_features << dendl;
+
+  if (mds->get_state() >= MDSMap::STATE_RECONNECT) {
+    // enforce the new requirement on sessions that are already connected
+    set<Session*> sessions;
+    mds->sessionmap.get_client_session_set(sessions);
+    for (auto session : sessions) {
+      feature_bitset_t missing_features = required_client_features;
+      missing_features -= session->info.client_metadata.features;
+      if (!missing_features.empty()) {
+	// a blacklisted client is already being dealt with; don't re-evict
+	bool blacklisted = mds->objecter->with_osdmap(
+	    [session](const OSDMap &osd_map) -> bool {
+	      return osd_map.is_blacklisted(session->info.inst.addr);
+	    });
+	if (blacklisted)
+	  continue;
+
+	mds->clog->warn() << "evicting session " << *session << ", missing required features '"
+			  << missing_features << "'";
+	std::stringstream ss;
+	mds->evict_client(session->get_client().v, false,
+			  g_conf()->mds_session_blacklist_on_evict, ss);
+      }
+    }
+  }
+}
void Server::reconnect_gather_finish()
{
  dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects << " clients" << dendl;
-  assert(reconnect_done);
-  reconnect_done->complete(0);
+  ceph_assert(reconnect_done);
+
+  // all surviving clients have reconnected (or been evicted); finish the
+  // reconnect stage once the snaptable cache is usable
+  if (!mds->snapclient->is_synced()) {
+    // make sure snaptable cache is populated. snaprealms will be
+    // extensively used in rejoin stage.
+    dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl;
+    mds->snapclient->wait_for_sync(reconnect_done);
+  } else {
+    reconnect_done->complete(0);
+  }
  reconnect_done = NULL;
}
+// Periodic tick while the MDS is in the reconnect phase.  Returns early while
+// evictions are in flight or clients are still within the reconnect timeout
+// (the window is extended while clients were recently seen).  Once the timeout
+// truly expires, evicts unresponsive clients -- except sessions that declared
+// a "timeout" in their metadata, which are queued for reclaim instead.
void Server::reconnect_tick()
{
 if (reconnect_evicting) {
- dout(4) << "reconnect_tick: waiting for evictions" << dendl;
+ dout(7) << "reconnect_tick: waiting for evictions" << dendl;
 return;
 }
- utime_t reconnect_end = reconnect_start;
- reconnect_end += g_conf->mds_reconnect_timeout;
- if (ceph_clock_now() >= reconnect_end &&
- !client_reconnect_gather.empty()) {
- dout(10) << "reconnect timed out" << dendl;
+ if (client_reconnect_gather.empty())
+ return;
- // If we're doing blacklist evictions, use this to wait for them before
- // proceeding to reconnect_gather_finish
- MDSGatherBuilder gather(g_ceph_context);
+ auto now = clock::now();
+ auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count();
+ if (elapse1 < g_conf()->mds_reconnect_timeout)
+ return;
- for (set<client_t>::iterator p = client_reconnect_gather.begin();
- p != client_reconnect_gather.end();
- ++p) {
- Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
- assert(session);
- dout(1) << "reconnect gave up on " << session->info.inst << dendl;
-
- mds->clog->warn() << "evicting unresponsive client " << *session
- << ", after waiting " << g_conf->mds_reconnect_timeout
- << " seconds during MDS startup";
-
- if (g_conf->mds_session_blacklist_on_timeout) {
- std::stringstream ss;
- mds->evict_client(session->info.inst.name.num(), false, true, ss,
- gather.new_sub());
- } else {
- kill_session(session, NULL);
- }
+ vector<Session*> remaining_sessions;
+ remaining_sessions.reserve(client_reconnect_gather.size());
+ for (auto c : client_reconnect_gather) {
+ Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
+ ceph_assert(session);
+ remaining_sessions.push_back(session);
+ // client re-sends cap flush messages before the reconnect message
+ if (session->last_seen > reconnect_last_seen)
+ reconnect_last_seen = session->last_seen;
+ }
+
+ // If any remaining client was active within half the timeout, keep waiting
+ // rather than evicting: it may still be replaying cap flushes.
+ auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count();
+ if (elapse2 < g_conf()->mds_reconnect_timeout / 2) {
+ dout(7) << "reconnect_tick: last seen " << elapse2
+ << " seconds ago, extending reconnect interval" << dendl;
+ return;
+ }
- failed_reconnects++;
+ dout(7) << "reconnect timed out, " << remaining_sessions.size()
+ << " clients have not reconnected in time" << dendl;
+
+ // If we're doing blacklist evictions, use this to wait for them before
+ // proceeding to reconnect_gather_finish
+ MDSGatherBuilder gather(g_ceph_context);
+
+ for (auto session : remaining_sessions) {
+ // Keep sessions that have specified timeout. These sessions will prevent
+ // mds from going to active. MDS goes to active after they all have been
+ // killed or reclaimed.
+ if (session->info.client_metadata.find("timeout") !=
+ session->info.client_metadata.end()) {
+ dout(1) << "reconnect keeps " << session->info.inst
+ << ", need to be reclaimed" << dendl;
+ client_reclaim_gather.insert(session->get_client());
+ continue;
}
- client_reconnect_gather.clear();
- if (gather.has_subs()) {
- dout(1) << "reconnect will complete once clients are evicted" << dendl;
- gather.set_finisher(new MDSInternalContextWrapper(mds, new FunctionContext(
- [this](int r){reconnect_gather_finish();})));
- gather.activate();
- reconnect_evicting = true;
+ dout(1) << "reconnect gives up on " << session->info.inst << dendl;
+
+ mds->clog->warn() << "evicting unresponsive client " << *session
+ << ", after waiting " << elapse1
+ << " seconds during MDS startup";
+
+ if (g_conf()->mds_session_blacklist_on_timeout) {
+ std::stringstream ss;
+ mds->evict_client(session->get_client().v, false, true, ss,
+ gather.new_sub());
 } else {
- reconnect_gather_finish();
+ kill_session(session, NULL, true);
 }
+
+ failed_reconnects++;
+ }
 client_reconnect_gather.clear();
+
+ if (gather.has_subs()) {
+ dout(1) << "reconnect will complete once clients are evicted" << dendl;
+ gather.set_finisher(new MDSInternalContextWrapper(mds, new LambdaContext(
+ [this](int r){reconnect_gather_finish();})));
+ gather.activate();
+ reconnect_evicting = true;
+ } else {
+ reconnect_gather_finish();
 }
}
if (!locks.length()) return;
int numlocks;
ceph_filelock lock;
- bufferlist::iterator p = locks.begin();
- ::decode(numlocks, p);
+ auto p = locks.cbegin();
+ decode(numlocks, p);
for (int i = 0; i < numlocks; ++i) {
- ::decode(lock, p);
+ decode(lock, p);
lock.client = client;
in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
++in->get_fcntl_lock_state()->client_held_lock_counts[client];
}
- ::decode(numlocks, p);
+ decode(numlocks, p);
for (int i = 0; i < numlocks; ++i) {
- ::decode(lock, p);
+ decode(lock, p);
lock.client = client;
in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock> (lock.start, lock));
++in->get_flock_lock_state()->client_held_lock_counts[client];
}
}
-
/**
* Call this when the MDCache is oversized, to send requests to the clients
* to trim some caps, and consequently unpin some inodes in the MDCache so
* that it can trim too.
*/
+// Ask clients to trim (recall) capabilities.  Candidate sessions are selected
+// per 'flags' -- TRIM: every session; ENFORCE_MAX: sessions holding more than
+// mds_max_caps_per_client; ENFORCE_LIVENESS: sessions whose cache liveness is
+// low relative to their cap count -- and processed largest-cap-count first.
+// Recalls are bounded by per-session and global decay throttles; with STEADY,
+// sessions unlikely to release more caps are skipped.  Returns the pair
+// {throttled, caps_recalled}.
-void Server::recall_client_state(void)
+std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
{
- /* try to recall at least 80% of all caps */
- uint64_t max_caps_per_client = Capability::count() * g_conf->get_val<double>("mds_max_ratio_caps_per_client");
- uint64_t min_caps_per_client = g_conf->get_val<uint64_t>("mds_min_caps_per_client");
- if (max_caps_per_client < min_caps_per_client) {
- dout(0) << "max_caps_per_client " << max_caps_per_client
- << " < min_caps_per_client " << min_caps_per_client << dendl;
- max_caps_per_client = min_caps_per_client + 1;
- }
-
- /* unless this ratio is smaller: */
- /* ratio: determine the amount of caps to recall from each client. Use
- * percentage full over the cache reservation. Cap the ratio at 80% of client
- * caps. */
- double ratio = 1.0-fmin(0.80, mdcache->cache_toofull_ratio());
-
- dout(10) << "recall_client_state " << ratio
- << ", caps per client " << min_caps_per_client << "-" << max_caps_per_client
- << dendl;
+ const auto now = clock::now();
+ const bool steady = !!(flags&RecallFlags::STEADY);
+ const bool enforce_max = !!(flags&RecallFlags::ENFORCE_MAX);
+ const bool enforce_liveness = !!(flags&RecallFlags::ENFORCE_LIVENESS);
+ const bool trim = !!(flags&RecallFlags::TRIM);
+
+ const auto max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
+ const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
+ const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
+ const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
+ const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
+ const auto cache_liveness_magnitude = g_conf().get_val<Option::size_t>("mds_session_cache_liveness_magnitude");
+
+ dout(7) << __func__ << ":"
+ << " min=" << min_caps_per_client
+ << " max=" << max_caps_per_client
+ << " total=" << Capability::count()
+ << " flags=" << flags
+ << dendl;
- set<Session*> sessions;
- mds->sessionmap.get_client_session_set(sessions);
- for (auto &session : sessions) {
+ /* trim caps of sessions with the most caps first */
+ std::multimap<uint64_t, Session*> caps_session;
+ auto f = [&caps_session, enforce_max, enforce_liveness, trim, max_caps_per_client, cache_liveness_magnitude](auto& s) {
+ auto num_caps = s->caps.size();
+ auto cache_liveness = s->get_session_cache_liveness();
+ if (trim || (enforce_max && num_caps > max_caps_per_client) || (enforce_liveness && cache_liveness < (num_caps>>cache_liveness_magnitude))) {
+ caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s));
+ }
+ };
+ mds->sessionmap.get_client_sessions(std::move(f));
+
+ std::pair<bool, uint64_t> result = {false, 0};
+ auto& [throttled, caps_recalled] = result;
+ last_recall_state = now;
+ for (const auto& [num_caps, session] : boost::adaptors::reverse(caps_session)) {
 if (!session->is_open() ||
+ !session->get_connection() ||
 !session->info.inst.name.is_client())
 continue;
- dout(10) << " session " << session->info.inst
- << " caps " << session->caps.size()
+ dout(10) << __func__ << ":"
+ << " session " << session->info.inst
+ << " caps " << num_caps
 << ", leases " << session->leases.size()
 << dendl;
- uint64_t newlim = MAX(MIN((session->caps.size() * ratio), max_caps_per_client), min_caps_per_client);
- if (session->caps.size() > newlim) {
- MClientSession *m = new MClientSession(CEPH_SESSION_RECALL_STATE);
+ uint64_t newlim;
+ if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
+ newlim = min_caps_per_client;
+ } else {
+ newlim = num_caps-recall_max_caps;
+ }
+ if (num_caps > newlim) {
+ /* now limit the number of caps we recall at a time to prevent overloading ourselves */
+ uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
+ newlim = num_caps-recall;
+ const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
+ const uint64_t session_recall_throttle2o = session->get_recall_caps_throttle2o();
+ const uint64_t global_recall_throttle = recall_throttle.get();
+ if (session_recall_throttle+recall > recall_max_decay_threshold) {
+ dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
+ throttled = true;
+ continue;
+ } else if (session_recall_throttle2o+recall > recall_max_caps*2) {
+ dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps << ") hit at " << session_recall_throttle2o << "; skipping!" << dendl;
+ throttled = true;
+ continue;
+ } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
+ dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
+ throttled = true;
+ break;
+ }
+
+ // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
+ if (steady) {
+ const auto session_recall = session->get_recall_caps();
+ const auto session_release = session->get_release_caps();
+ if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
+ /* The session has been unable to keep up with the number of caps
+ * recalled (by half); additionally, to prevent marking sessions
+ * we've just begun to recall from, the session_recall counter
+ * (decayed count of caps recently recalled) is **greater** than the
+ * session threshold for the session's cap recall throttle.
+ */
+ dout(15) << " 2*session_release < session_recall"
+ " (2*" << session_release << " < " << session_recall << ") &&"
+ " 2*session_recall > recall_max_decay_threshold"
+ " (2*" << session_recall << " > " << recall_max_decay_threshold << ")"
+ " Skipping because we are unlikely to get more released." << dendl;
+ continue;
+ } else if (recall < recall_max_caps && 2*recall < session_recall) {
+ /* The number of caps recalled is less than the number we *could*
+ * recall (so there isn't much left to recall?) and the number of
+ * caps is less than the current recall_caps counter (decayed count
+ * of caps recently recalled).
+ */
+ dout(15) << " 2*recall < session_recall "
+ " (2*" << recall << " < " << session_recall << ") &&"
+ " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
+ " Skipping because we are unlikely to get more released." << dendl;
+ continue;
+ }
+ }
+
+ dout(7) << " recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;
+
+ auto m = make_message<MClientSession>(CEPH_SESSION_RECALL_STATE);
 m->head.max_caps = newlim;
 mds->send_message_client(m, session);
- session->notify_recall_sent(newlim);
+ if (gather) {
+ flush_session(session, gather);
+ }
+ caps_recalled += session->notify_recall_sent(newlim);
+ recall_throttle.hit(recall);
 }
 }
+
+ dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;
+
+ return result;
}
void Server::force_clients_readonly()
if (!session->info.inst.name.is_client() ||
!(session->is_open() || session->is_stale()))
continue;
- mds->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO), session);
+ mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
}
}
void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
{
dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
- assert(!mdr->has_completed);
+ ceph_assert(!mdr->has_completed);
// note trace items for eventual reply.
mdr->tracei = in;
if (mds->queue_one_replay()) {
dout(10) << " queued next replay op" << dendl;
} else {
- dout(10) << " journaled last replay op, flushing" << dendl;
- mdlog->flush();
+ dout(10) << " journaled last replay op" << dendl;
}
} else if (mdr->did_early_reply)
mds->locker->drop_rdlocks_for_early_reply(mdr.get());
}
+// Submit a log event to the MDLog, first tagging the request (if any) with a
+// "submit entry: <event>" marker for op tracking.
void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
- const char *event)
+ std::string_view event)
{
 if (mdr) {
 string event_str("submit entry: ");
 event_str += event;
- mdr->mark_event_string(event_str);
+ mdr->mark_event(event_str);
 }
 mdlog->submit_entry(le, fin);
}
+// Send the final response for a request.  Client requests are answered with an
+// MClientReply; a batch-head getattr/lookup instead fans the result out to all
+// queued requests via its registered BatchOp.  Internal ops complete their
+// stored finisher context and are torn down directly.
void Server::respond_to_request(MDRequestRef& mdr, int r)
{
 if (mdr->client_request) {
- reply_client_request(mdr, new MClientReply(mdr->client_request, r));
+ if (mdr->is_batch_op() && mdr->is_batch_head) {
+ int mask = mdr->client_request->head.args.getattr.mask;
+
+ // Detach the BatchOp (keyed by getattr mask) from the inode (GETATTR)
+ // or the dentry (lookup) before responding through it.
+ std::unique_ptr<BatchOp> bop;
+ if (mdr->client_request->get_op() == CEPH_MDS_OP_GETATTR) {
+ dout(20) << __func__ << ": respond other getattr ops. " << *mdr << dendl;
+ auto it = mdr->in[0]->batch_ops.find(mask);
+ bop = std::move(it->second);
+ mdr->in[0]->batch_ops.erase(it);
+ } else {
+ dout(20) << __func__ << ": respond other lookup ops. " << *mdr << dendl;
+ auto it = mdr->dn[0].back()->batch_ops.find(mask);
+ bop = std::move(it->second);
+ mdr->dn[0].back()->batch_ops.erase(it);
+ }
- // add here to avoid counting ops multiple times (e.g., locks, loading)
- switch(mdr->client_request->get_op()) {
- case CEPH_MDS_OP_LOOKUPHASH:
- logger->inc(l_mdss_req_lookuphash);
- break;
- case CEPH_MDS_OP_LOOKUPINO:
- logger->inc(l_mdss_req_lookupino);
- break;
- case CEPH_MDS_OP_LOOKUPPARENT:
- logger->inc(l_mdss_req_lookupparent);
- break;
- case CEPH_MDS_OP_LOOKUPNAME:
- logger->inc(l_mdss_req_lookupname);
- break;
- case CEPH_MDS_OP_LOOKUP:
- logger->inc(l_mdss_req_lookup);
- break;
- case CEPH_MDS_OP_LOOKUPSNAP:
- logger->inc(l_mdss_req_lookupsnap);
- break;
- case CEPH_MDS_OP_GETATTR:
- logger->inc(l_mdss_req_getattr);
- break;
- case CEPH_MDS_OP_SETATTR:
- logger->inc(l_mdss_req_setattr);
- break;
- case CEPH_MDS_OP_SETLAYOUT:
- logger->inc(l_mdss_req_setlayout);
- break;
- case CEPH_MDS_OP_SETDIRLAYOUT:
- logger->inc(l_mdss_req_setdirlayout);
- break;
- case CEPH_MDS_OP_SETXATTR:
- logger->inc(l_mdss_req_setxattr);
- break;
- case CEPH_MDS_OP_RMXATTR:
- logger->inc(l_mdss_req_rmxattr);
- break;
- case CEPH_MDS_OP_READDIR:
- logger->inc(l_mdss_req_readdir);
- break;
- case CEPH_MDS_OP_SETFILELOCK:
- logger->inc(l_mdss_req_setfilelock);
- break;
- case CEPH_MDS_OP_GETFILELOCK:
- logger->inc(l_mdss_req_getfilelock);
- break;
- case CEPH_MDS_OP_CREATE:
- logger->inc(l_mdss_req_create);
- case CEPH_MDS_OP_OPEN:
- logger->inc(l_mdss_req_open);
- break;
- case CEPH_MDS_OP_MKNOD:
- logger->inc(l_mdss_req_mknod);
- break;
- case CEPH_MDS_OP_LINK:
- logger->inc(l_mdss_req_link);
- break;
- case CEPH_MDS_OP_UNLINK:
- logger->inc(l_mdss_req_unlink);
- break;
- case CEPH_MDS_OP_RMDIR:
- logger->inc(l_mdss_req_rmdir);
- break;
- case CEPH_MDS_OP_RENAME:
- logger->inc(l_mdss_req_rename);
- break;
- case CEPH_MDS_OP_MKDIR:
- logger->inc(l_mdss_req_mkdir);
- break;
- case CEPH_MDS_OP_SYMLINK:
- logger->inc(l_mdss_req_symlink);
- break;
- case CEPH_MDS_OP_LSSNAP:
- logger->inc(l_mdss_req_lssnap);
- break;
- case CEPH_MDS_OP_MKSNAP:
- logger->inc(l_mdss_req_mksnap);
- break;
- case CEPH_MDS_OP_RMSNAP:
- logger->inc(l_mdss_req_rmsnap);
- break;
- case CEPH_MDS_OP_RENAMESNAP:
- logger->inc(l_mdss_req_renamesnap);
- break;
+ bop->respond(r);
+ } else {
+ reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
 }
 } else if (mdr->internal_op > -1) {
 dout(10) << "respond_to_request on internal request " << mdr << dendl;
 if (!mdr->internal_op_finish)
- assert(0 == "trying to respond to internal op without finisher");
+ ceph_abort_msg("trying to respond to internal op without finisher");
 mdr->internal_op_finish->complete(r);
 mdcache->request_finish(mdr);
 }
}
+// Record the latency 'lat' of a completed client request into the per-op
+// perf counter corresponding to the request's MDS op code.
+void Server::perf_gather_op_latency(const cref_t<MClientRequest> &req, utime_t lat)
+{
+ int code = l_mdss_first;
+ switch(req->get_op()) {
+ case CEPH_MDS_OP_LOOKUPHASH:
+ code = l_mdss_req_lookuphash_latency;
+ break;
+ case CEPH_MDS_OP_LOOKUPINO:
+ code = l_mdss_req_lookupino_latency;
+ break;
+ case CEPH_MDS_OP_LOOKUPPARENT:
+ code = l_mdss_req_lookupparent_latency;
+ break;
+ case CEPH_MDS_OP_LOOKUPNAME:
+ code = l_mdss_req_lookupname_latency;
+ break;
+ case CEPH_MDS_OP_LOOKUP:
+ code = l_mdss_req_lookup_latency;
+ break;
+ case CEPH_MDS_OP_LOOKUPSNAP:
+ code = l_mdss_req_lookupsnap_latency;
+ break;
+ case CEPH_MDS_OP_GETATTR:
+ code = l_mdss_req_getattr_latency;
+ break;
+ case CEPH_MDS_OP_SETATTR:
+ code = l_mdss_req_setattr_latency;
+ break;
+ case CEPH_MDS_OP_SETLAYOUT:
+ code = l_mdss_req_setlayout_latency;
+ break;
+ case CEPH_MDS_OP_SETDIRLAYOUT:
+ code = l_mdss_req_setdirlayout_latency;
+ break;
+ case CEPH_MDS_OP_SETXATTR:
+ code = l_mdss_req_setxattr_latency;
+ break;
+ case CEPH_MDS_OP_RMXATTR:
+ code = l_mdss_req_rmxattr_latency;
+ break;
+ case CEPH_MDS_OP_READDIR:
+ code = l_mdss_req_readdir_latency;
+ break;
+ case CEPH_MDS_OP_SETFILELOCK:
+ code = l_mdss_req_setfilelock_latency;
+ break;
+ case CEPH_MDS_OP_GETFILELOCK:
+ code = l_mdss_req_getfilelock_latency;
+ break;
+ case CEPH_MDS_OP_CREATE:
+ code = l_mdss_req_create_latency;
+ break;
+ case CEPH_MDS_OP_OPEN:
+ code = l_mdss_req_open_latency;
+ break;
+ case CEPH_MDS_OP_MKNOD:
+ code = l_mdss_req_mknod_latency;
+ break;
+ case CEPH_MDS_OP_LINK:
+ code = l_mdss_req_link_latency;
+ break;
+ case CEPH_MDS_OP_UNLINK:
+ code = l_mdss_req_unlink_latency;
+ break;
+ case CEPH_MDS_OP_RMDIR:
+ code = l_mdss_req_rmdir_latency;
+ break;
+ case CEPH_MDS_OP_RENAME:
+ code = l_mdss_req_rename_latency;
+ break;
+ case CEPH_MDS_OP_MKDIR:
+ code = l_mdss_req_mkdir_latency;
+ break;
+ case CEPH_MDS_OP_SYMLINK:
+ code = l_mdss_req_symlink_latency;
+ break;
+ case CEPH_MDS_OP_LSSNAP:
+ code = l_mdss_req_lssnap_latency;
+ break;
+ case CEPH_MDS_OP_MKSNAP:
+ code = l_mdss_req_mksnap_latency;
+ break;
+ case CEPH_MDS_OP_RMSNAP:
+ code = l_mdss_req_rmsnap_latency;
+ break;
+ case CEPH_MDS_OP_RENAMESNAP:
+ code = l_mdss_req_renamesnap_latency;
+ break;
+ // an unrecognized op here indicates a dispatcher bug, so abort loudly
+ default: ceph_abort();
+ }
+ logger->tinc(code, lat);
+}
+
void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
{
- if (!g_conf->mds_early_reply)
+ if (!g_conf()->mds_early_reply)
return;
if (mdr->no_early_reply) {
return;
}
- MClientRequest *req = mdr->client_request;
+ const cref_t<MClientRequest> &req = mdr->client_request;
entity_inst_t client_inst = req->get_source_inst();
if (client_inst.name.is_mds())
return;
}
- MClientReply *reply = new MClientReply(req, 0);
+ auto reply = make_message<MClientReply>(*req, 0);
reply->set_unsafe();
// mark xlocks "done", indicating that we are exposing uncommitted changes.
if (tracedn)
mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
- set_trace_dist(mdr->session, reply, tracei, tracedn, mdr->snapid,
- req->get_dentry_wanted(), mdr);
+ set_trace_dist(reply, tracei, tracedn, mdr);
}
reply->set_extra_bl(mdr->reply_extra_bl);
- req->get_connection()->send_message(reply);
+ mds->send_message_client(reply, mdr->session);
mdr->did_early_reply = true;
mds->logger->inc(l_mds_reply);
utime_t lat = ceph_clock_now() - req->get_recv_stamp();
mds->logger->tinc(l_mds_reply_latency, lat);
+ if (client_inst.name.is_client()) {
+ mds->sessionmap.hit_session(mdr->session);
+ }
+ perf_gather_op_latency(req, lat);
dout(20) << "lat " << lat << dendl;
mdr->mark_event("early_replied");
* include a trace to tracei
* Clean up mdr
*/
-void Server::reply_client_request(MDRequestRef& mdr, MClientReply *reply)
+void Server::reply_client_request(MDRequestRef& mdr, const ref_t<MClientReply> &reply)
{
- assert(mdr.get());
- MClientRequest *req = mdr->client_request;
+ ceph_assert(mdr.get());
+ const cref_t<MClientRequest> &req = mdr->client_request;
dout(7) << "reply_client_request " << reply->get_result()
<< " (" << cpp_strerror(reply->get_result())
apply_allocated_inos(mdr, session);
// get tracei/tracedn from mdr?
- snapid_t snapid = mdr->snapid;
CInode *tracei = mdr->tracei;
CDentry *tracedn = mdr->tracedn;
bool is_replay = mdr->client_request->is_replay();
bool did_early_reply = mdr->did_early_reply;
entity_inst_t client_inst = req->get_source_inst();
- int dentry_wanted = req->get_dentry_wanted();
if (!did_early_reply && !is_replay) {
mds->logger->inc(l_mds_reply);
utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
mds->logger->tinc(l_mds_reply_latency, lat);
+ if (session && client_inst.name.is_client()) {
+ mds->sessionmap.hit_session(session);
+ }
+ perf_gather_op_latency(req, lat);
dout(20) << "lat " << lat << dendl;
if (tracei)
mdcache->request_drop_non_rdlocks(mdr);
// reply at all?
- if (client_inst.name.is_mds() || !session) {
- reply->put(); // mds doesn't need a reply
- reply = 0;
- } else {
+ if (session && !client_inst.name.is_mds()) {
// send reply.
if (!did_early_reply && // don't issue leases if we sent an earlier reply already
(tracei || tracedn)) {
mdcache->try_reconnect_cap(tracei, session);
} else {
// include metadata in reply
- set_trace_dist(session, reply, tracei, tracedn,
- snapid, dentry_wanted,
- mdr);
+ set_trace_dist(reply, tracei, tracedn, mdr);
}
}
reply->set_extra_bl(mdr->reply_extra_bl);
reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
- req->get_connection()->send_message(reply);
+ mds->send_message_client(reply, session);
}
if (req->is_queued_for_replay() &&
}
}
-
-void Server::encode_empty_dirstat(bufferlist& bl)
-{
- static DirStat empty;
- empty.encode(bl);
-}
-
-void Server::encode_infinite_lease(bufferlist& bl)
-{
- LeaseStat e;
- e.seq = 0;
- e.mask = -1;
- e.duration_ms = -1;
- ::encode(e, bl);
- dout(20) << "encode_infinite_lease " << e << dendl;
-}
-
-void Server::encode_null_lease(bufferlist& bl)
-{
- LeaseStat e;
- e.seq = 0;
- e.mask = 0;
- e.duration_ms = 0;
- ::encode(e, bl);
- dout(20) << "encode_null_lease " << e << dendl;
-}
-
-
/*
* pass inode OR dentry (not both, or we may get confused)
*
* trace is in reverse order (i.e. root inode comes last)
*/
-void Server::set_trace_dist(Session *session, MClientReply *reply,
+void Server::set_trace_dist(const ref_t<MClientReply> &reply,
CInode *in, CDentry *dn,
- snapid_t snapid,
- int dentry_wanted,
MDRequestRef& mdr)
{
// skip doing this for debugging purposes?
- if (g_conf->mds_inject_traceless_reply_probability &&
+ if (g_conf()->mds_inject_traceless_reply_probability &&
mdr->ls && !mdr->o_trunc &&
- (rand() % 10000 < g_conf->mds_inject_traceless_reply_probability * 10000.0)) {
+ (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability * 10000.0)) {
dout(5) << "deliberately skipping trace for " << *reply << dendl;
return;
}
// inode, dentry, dir, ..., inode
bufferlist bl;
mds_rank_t whoami = mds->get_nodeid();
- client_t client = session->get_client();
+ Session *session = mdr->session;
+ snapid_t snapid = mdr->snapid;
utime_t now = ceph_clock_now();
dout(20) << "set_trace_dist snapid " << snapid << dendl;
- //assert((bool)dn == (bool)dentry_wanted); // not true for snapshot lookups
-
// realm
if (snapid == CEPH_NOSNAP) {
SnapRealm *realm;
if (dir->is_complete())
dir->verify_fragstat();
#endif
- dir->encode_dirstat(bl, whoami);
+ DirStat ds;
+ ds.frag = dir->get_frag();
+ ds.auth = dir->get_dir_auth().first;
+ if (dir->is_auth() && !mdcache->forward_all_reqs_to_auth())
+ dir->get_dist_spec(ds.dist, whoami);
+
+ dir->encode_dirstat(bl, session->info, ds);
dout(20) << "set_trace_dist added dir " << *dir << dendl;
- ::encode(dn->get_name(), bl);
- if (snapid == CEPH_NOSNAP)
- mds->locker->issue_client_lease(dn, client, bl, now, session);
- else
- encode_null_lease(bl);
+ encode(dn->get_name(), bl);
+
+ int lease_mask = 0;
+ CDentry::linkage_t *dnl = dn->get_linkage(mdr->get_client(), mdr);
+ if (dnl->is_primary()) {
+ ceph_assert(dnl->get_inode() == in);
+ lease_mask = CEPH_LEASE_PRIMARY_LINK;
+ } else {
+ if (dnl->is_remote())
+ ceph_assert(dnl->get_remote_ino() == in->ino());
+ else
+ ceph_assert(!in);
+ }
+ mds->locker->issue_client_lease(dn, mdr, lease_mask, now, bl);
dout(20) << "set_trace_dist added dn " << snapid << " " << *dn << dendl;
} else
reply->head.is_dentry = 0;
reply->set_trace(bl);
}
-
-
-
-/***
- * process a client request
- * This function DOES put the passed message before returning
- */
-void Server::handle_client_request(MClientRequest *req)
+void Server::handle_client_request(const cref_t<MClientRequest> &req)
{
dout(4) << "handle_client_request " << *req << dendl;
return;
}
+ bool sessionclosed_isok = replay_unsafe_with_closed_session;
// active session?
Session *session = 0;
if (req->get_source().is_client()) {
session = mds->get_session(req);
if (!session) {
dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
- } else if (session->is_closed() ||
+ } else if ((session->is_closed() && (!mds->is_clientreplay() || !sessionclosed_isok)) ||
session->is_closing() ||
session->is_killing()) {
dout(5) << "session closed|closing|killing, dropping" << dendl;
if (!session) {
if (req->is_queued_for_replay())
mds->queue_one_replay();
- req->put();
return;
}
}
// completed request?
bool has_completed = false;
if (req->is_replay() || req->get_retry_attempt()) {
- assert(session);
+ ceph_assert(session);
inodeno_t created;
if (session->have_completed_request(req->get_reqid().tid, &created)) {
has_completed = true;
+ if (!session->is_open())
+ return;
// Don't send traceless reply if the completed request has created
// new inode. Treat the request as lookup request instead.
if (req->is_replay() ||
req->get_op() != CEPH_MDS_OP_OPEN &&
req->get_op() != CEPH_MDS_OP_CREATE)) {
dout(5) << "already completed " << req->get_reqid() << dendl;
- MClientReply *reply = new MClientReply(req, 0);
+ auto reply = make_message<MClientReply>(*req, 0);
if (created != inodeno_t()) {
bufferlist extra;
- ::encode(created, extra);
+ encode(created, extra);
reply->set_extra_bl(extra);
}
- req->get_connection()->send_message(reply);
+ mds->send_message_client(reply, session);
if (req->is_queued_for_replay())
mds->queue_one_replay();
- req->put();
return;
}
if (req->get_op() != CEPH_MDS_OP_OPEN &&
// trim completed_request list
if (req->get_oldest_client_tid() > 0) {
dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
- assert(session);
+ ceph_assert(session);
if (session->trim_completed_requests(req->get_oldest_client_tid())) {
// Sessions 'completed_requests' was dirtied, mark it to be
// potentially flushed at segment expiry.
mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
if (session->get_num_trim_requests_warnings() > 0 &&
- session->get_num_completed_requests() * 2 < g_conf->mds_max_completed_requests)
+ session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests)
session->reset_num_trim_requests_warnings();
} else {
if (session->get_num_completed_requests() >=
- (g_conf->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
+ (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
session->inc_num_trim_requests_warnings();
stringstream ss;
ss << "client." << session->get_client() << " does not advance its oldest_client_tid ("
// (only if NOT replay!)
if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
client_t client = req->get_source().num();
- for (vector<MClientRequest::Release>::iterator p = req->releases.begin();
- p != req->releases.end();
- ++p)
- mds->locker->process_request_cap_release(mdr, client, p->item, p->dname);
+ for (const auto &r : req->releases) {
+ mds->locker->process_request_cap_release(mdr, client, r.item, r.dname);
+ }
req->releases.clear();
}
});
}
+// Drop the BatchOp registered for this batch-head request, keyed by its
+// getattr mask: stored on the target inode for GETATTR, or on the target
+// dentry for LOOKUP.
+void Server::clear_batch_ops(const MDRequestRef& mdr)
+{
+ int mask = mdr->client_request->head.args.getattr.mask;
+ if (mdr->client_request->get_op() == CEPH_MDS_OP_GETATTR && mdr->in[0]) {
+ mdr->in[0]->batch_ops.erase(mask);
+ } else if (mdr->client_request->get_op() == CEPH_MDS_OP_LOOKUP && mdr->dn[0].size()) {
+ mdr->dn[0].back()->batch_ops.erase(mask);
+ }
+}
+
void Server::dispatch_client_request(MDRequestRef& mdr)
{
// we shouldn't be waiting on anyone.
- assert(!mdr->has_more() || mdr->more()->waiting_on_slave.empty());
+ ceph_assert(!mdr->has_more() || mdr->more()->waiting_on_slave.empty());
+
+ if (mdr->killed) {
+ dout(10) << "request " << *mdr << " was killed" << dendl;
+ //if the mdr is a "batch_op" and it has followers, pick a follower as
+ //the new "head of the batch ops" and go on processing the new one.
+ if (mdr->is_batch_op() && mdr->is_batch_head ) {
+ if (!mdr->batch_reqs.empty()) {
+ MDRequestRef new_batch_head;
+ for (auto itr = mdr->batch_reqs.cbegin(); itr != mdr->batch_reqs.cend();) {
+ auto req = *itr;
+ itr = mdr->batch_reqs.erase(itr);
+ if (!req->killed) {
+ new_batch_head = req;
+ break;
+ }
+ }
+
+ if (!new_batch_head) {
+ clear_batch_ops(mdr);
+ return;
+ }
- if (mdr->killed) {
- dout(10) << "request " << *mdr << " was killed" << dendl;
- return;
+ new_batch_head->batch_reqs = std::move(mdr->batch_reqs);
+
+ mdr = new_batch_head;
+ mdr->is_batch_head = true;
+ int mask = mdr->client_request->head.args.getattr.mask;
+ if (mdr->client_request->get_op() == CEPH_MDS_OP_GETATTR) {
+ auto& fin = mdr->in[0]->batch_ops[mask];
+ fin->set_request(new_batch_head);
+ } else if (mdr->client_request->get_op() == CEPH_MDS_OP_LOOKUP) {
+ auto& fin = mdr->dn[0].back()->batch_ops[mask];
+ fin->set_request(new_batch_head);
+ }
+ } else {
+ clear_batch_ops(mdr);
+ return;
+ }
+ } else {
+ return;
+ }
} else if (mdr->aborted) {
mdr->aborted = false;
mdcache->request_kill(mdr);
return;
}
- MClientRequest *req = mdr->client_request;
+ const cref_t<MClientRequest> &req = mdr->client_request;
if (logger) logger->inc(l_mdss_dispatch_client_request);
dout(7) << "dispatch_client_request " << *req << dendl;
- if (req->may_write()) {
- if (mdcache->is_readonly()) {
- dout(10) << " read-only FS" << dendl;
- respond_to_request(mdr, -EROFS);
- return;
- }
- if (mdr->has_more() && mdr->more()->slave_error) {
- dout(10) << " got error from slaves" << dendl;
- respond_to_request(mdr, mdr->more()->slave_error);
- return;
- }
+ if (req->may_write() && mdcache->is_readonly()) {
+ dout(10) << " read-only FS" << dendl;
+ respond_to_request(mdr, -EROFS);
+ return;
+ }
+ if (mdr->has_more() && mdr->more()->slave_error) {
+ dout(10) << " got error from slaves" << dendl;
+ respond_to_request(mdr, mdr->more()->slave_error);
+ return;
}
if (is_full) {
// ---------------------------------------
// SLAVE REQUESTS
-/* This function DOES put the passed message before returning*/
-void Server::handle_slave_request(MMDSSlaveRequest *m)
+void Server::handle_slave_request(const cref_t<MMDSSlaveRequest> &m)
{
dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << dendl;
mds_rank_t from = mds_rank_t(m->get_source().num());
// the purpose of rename notify is enforcing causal message ordering. making sure
// bystanders have received all messages from rename srcdn's auth MDS.
if (m->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY) {
- MMDSSlaveRequest *reply = new MMDSSlaveRequest(m->get_reqid(), m->get_attempt(),
- MMDSSlaveRequest::OP_RENAMENOTIFYACK);
+ auto reply = make_message<MMDSSlaveRequest>(m->get_reqid(), m->get_attempt(), MMDSSlaveRequest::OP_RENAMENOTIFYACK);
mds->send_message(reply, m->get_connection());
- m->put();
return;
}
CDentry *straydn = NULL;
- if (m->stray.length() > 0) {
- straydn = mdcache->add_replica_stray(m->stray, from);
- assert(straydn);
- m->stray.clear();
+ if (m->straybl.length() > 0) {
+ mdcache->decode_replica_stray(straydn, m->straybl, from);
+ ceph_assert(straydn);
+ m->straybl.clear();
+ }
+
+ if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
+ dout(3) << "not clientreplay|active yet, waiting" << dendl;
+ mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+ return;
}
// am i a new slave?
if (mdr->attempt > m->get_attempt()) {
dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
<< ", dropping " << *m << dendl;
- m->put();
return;
}
-
if (mdr->attempt < m->get_attempt()) {
// mine is old, close it out
dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
mdr.reset();
} else if (mdr->slave_to_mds != from) {
dout(10) << "local request " << *mdr << " not slave to mds." << from << dendl;
- m->put();
return;
}
- if (m->get_op() == MMDSSlaveRequest::OP_FINISH && m->is_abort()) {
- mdr->aborted = true;
- if (mdr->slave_request) {
- // only abort on-going xlock, wrlock and auth pin
- assert(!mdr->slave_did_prepare());
+ // may get these while mdr->slave_request is non-null
+ if (m->get_op() == MMDSSlaveRequest::OP_DROPLOCKS) {
+ mds->locker->drop_locks(mdr.get());
+ return;
+ }
+ if (m->get_op() == MMDSSlaveRequest::OP_FINISH) {
+ if (m->is_abort()) {
+ mdr->aborted = true;
+ if (mdr->slave_request) {
+ // only abort on-going xlock, wrlock and auth pin
+ ceph_assert(!mdr->slave_did_prepare());
+ } else {
+ mdcache->request_finish(mdr);
+ }
} else {
+ if (m->inode_export.length() > 0)
+ mdr->more()->inode_import = m->inode_export;
+ // finish off request.
mdcache->request_finish(mdr);
}
- m->put();
return;
}
}
if (m->get_op() == MMDSSlaveRequest::OP_FINISH) {
dout(10) << "missing slave request for " << m->get_reqid()
<< " OP_FINISH, must have lost race with a forward" << dendl;
- m->put();
return;
}
mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), m);
mdr->set_op_stamp(m->op_stamp);
}
- assert(mdr->slave_request == 0); // only one at a time, please!
+ ceph_assert(mdr->slave_request == 0); // only one at a time, please!
if (straydn) {
mdr->pin(straydn);
mdr->straydn = straydn;
}
- if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
- dout(3) << "not clientreplay|active yet, waiting" << dendl;
- mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
- return;
- } else if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
- mdr->locks.empty()) {
+ if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
+ mdr->locks.empty()) {
dout(3) << "not active yet, waiting" << dendl;
mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
return;
}
- mdr->slave_request = m;
+ mdr->reset_slave_request(m);
dispatch_slave_request(mdr);
}
-/* This function DOES put the passed message before returning*/
-void Server::handle_slave_request_reply(MMDSSlaveRequest *m)
+void Server::handle_slave_request_reply(const cref_t<MMDSSlaveRequest> &m)
{
mds_rank_t from = mds_rank_t(m->get_source().num());
if (!mdcache->have_uncommitted_master(r, from)) {
dout(10) << "handle_slave_request_reply ignoring slave reply from mds."
<< from << " reqid " << r << dendl;
- m->put();
return;
}
dout(3) << "not clientreplay|active yet, waiting" << dendl;
if (m->get_op() == MMDSSlaveRequest::OP_COMMITTED) {
metareqid_t r = m->get_reqid();
mdcache->committed_master_slave(r, from);
- m->put();
return;
}
if (m->get_attempt() != mdr->attempt) {
dout(10) << "handle_slave_request_reply " << *mdr << " ignoring reply from other attempt "
<< m->get_attempt() << dendl;
- m->put();
return;
}
SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
m->get_object_info());
mdr->more()->slaves.insert(from);
+ lock->decode_locked_state(m->get_lock_data());
dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
- mdr->xlocks.insert(lock);
- mdr->locks.insert(lock);
+ mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
mdr->finish_locking(lock);
lock->get_xlock(mdr, mdr->get_client());
- assert(mdr->more()->waiting_on_slave.count(from));
+ ceph_assert(mdr->more()->waiting_on_slave.count(from));
mdr->more()->waiting_on_slave.erase(from);
- assert(mdr->more()->waiting_on_slave.empty());
+ ceph_assert(mdr->more()->waiting_on_slave.empty());
mdcache->dispatch_request(mdr);
}
break;
m->get_object_info());
mdr->more()->slaves.insert(from);
dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
- mdr->remote_wrlocks[lock] = from;
- mdr->locks.insert(lock);
+ auto it = mdr->emplace_lock(lock, MutationImpl::LockOp::REMOTE_WRLOCK, from);
+ ceph_assert(it->is_remote_wrlock());
+ ceph_assert(it->wrlock_target == from);
+
mdr->finish_locking(lock);
- assert(mdr->more()->waiting_on_slave.count(from));
+ ceph_assert(mdr->more()->waiting_on_slave.count(from));
mdr->more()->waiting_on_slave.erase(from);
- assert(mdr->more()->waiting_on_slave.empty());
+ ceph_assert(mdr->more()->waiting_on_slave.empty());
mdcache->dispatch_request(mdr);
}
break;
default:
ceph_abort();
}
-
- // done with reply.
- m->put();
}
-/* This function DOES put the mdr->slave_request before returning*/
void Server::dispatch_slave_request(MDRequestRef& mdr)
{
dout(7) << "dispatch_slave_request " << *mdr << " " << *mdr->slave_request << dendl;
<< *lock << " on " << *lock->get_parent() << dendl;
} else {
// use acquire_locks so that we get auth_pinning.
- set<SimpleLock*> rdlocks;
- set<SimpleLock*> wrlocks = mdr->wrlocks;
- set<SimpleLock*> xlocks = mdr->xlocks;
+ MutationImpl::LockOpVec lov;
+ for (const auto& p : mdr->locks) {
+ if (p.is_xlock())
+ lov.add_xlock(p.lock);
+ else if (p.is_wrlock())
+ lov.add_wrlock(p.lock);
+ }
int replycode = 0;
switch (op) {
case MMDSSlaveRequest::OP_XLOCK:
- xlocks.insert(lock);
+ lov.add_xlock(lock);
replycode = MMDSSlaveRequest::OP_XLOCKACK;
break;
case MMDSSlaveRequest::OP_WRLOCK:
- wrlocks.insert(lock);
+ lov.add_wrlock(lock);
replycode = MMDSSlaveRequest::OP_WRLOCKACK;
break;
}
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
+ if (!mds->locker->acquire_locks(mdr, lov))
return;
// ack
- MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, replycode);
+ auto r = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, replycode);
r->set_lock_type(lock->get_type());
lock->get_parent()->set_object_info(r->get_object_info());
+ if (replycode == MMDSSlaveRequest::OP_XLOCKACK)
+ lock->encode_locked_state(r->get_lock_data());
mds->send_message(r, mdr->slave_request->get_connection());
}
// done.
- mdr->slave_request->put();
- mdr->slave_request = 0;
+ mdr->reset_slave_request();
}
break;
{
SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
mdr->slave_request->get_object_info());
- assert(lock);
+ ceph_assert(lock);
+ auto it = mdr->locks.find(lock);
+ ceph_assert(it != mdr->locks.end());
bool need_issue = false;
switch (op) {
case MMDSSlaveRequest::OP_UNXLOCK:
- mds->locker->xlock_finish(lock, mdr.get(), &need_issue);
+ mds->locker->xlock_finish(it, mdr.get(), &need_issue);
break;
case MMDSSlaveRequest::OP_UNWRLOCK:
- mds->locker->wrlock_finish(lock, mdr.get(), &need_issue);
+ mds->locker->wrlock_finish(it, mdr.get(), &need_issue);
break;
}
if (need_issue)
mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));
// done. no ack necessary.
- mdr->slave_request->put();
- mdr->slave_request = 0;
+ mdr->reset_slave_request();
}
break;
- case MMDSSlaveRequest::OP_DROPLOCKS:
- mds->locker->drop_locks(mdr.get());
- mdr->slave_request->put();
- mdr->slave_request = 0;
- break;
-
case MMDSSlaveRequest::OP_AUTHPIN:
handle_slave_auth_pin(mdr);
break;
handle_slave_rename_prep(mdr);
break;
- case MMDSSlaveRequest::OP_FINISH:
- // information about rename imported caps
- if (mdr->slave_request->inode_export.length() > 0)
- mdr->more()->inode_import.claim(mdr->slave_request->inode_export);
- // finish off request.
- mdcache->request_finish(mdr);
- break;
-
default:
ceph_abort();
}
}
-/* This function DOES put the mdr->slave_request before returning*/
void Server::handle_slave_auth_pin(MDRequestRef& mdr)
{
dout(10) << "handle_slave_auth_pin " << *mdr << dendl;
// build list of objects
list<MDSCacheObject*> objects;
CInode *auth_pin_freeze = NULL;
+ bool nonblocking = mdr->slave_request->is_nonblocking();
bool fail = false, wouldblock = false, readonly = false;
+ ref_t<MMDSSlaveRequest> reply;
if (mdcache->is_readonly()) {
dout(10) << " read-only FS" << dendl;
}
if (!fail) {
- for (vector<MDSCacheObjectInfo>::iterator p = mdr->slave_request->get_authpins().begin();
- p != mdr->slave_request->get_authpins().end();
- ++p) {
- MDSCacheObject *object = mdcache->get_object(*p);
+ for (const auto &oi : mdr->slave_request->get_authpins()) {
+ MDSCacheObject *object = mdcache->get_object(oi);
if (!object) {
- dout(10) << " don't have " << *p << dendl;
+ dout(10) << " don't have " << oi << dendl;
fail = true;
break;
}
objects.push_back(object);
- if (*p == mdr->slave_request->get_authpin_freeze())
+ if (oi == mdr->slave_request->get_authpin_freeze())
auth_pin_freeze = static_cast<CInode*>(object);
}
}
// can we auth pin them?
if (!fail) {
- for (list<MDSCacheObject*>::iterator p = objects.begin();
- p != objects.end();
- ++p) {
- if (!(*p)->is_auth()) {
- dout(10) << " not auth for " << **p << dendl;
+ for (const auto& obj : objects) {
+ if (!obj->is_auth()) {
+ dout(10) << " not auth for " << *obj << dendl;
fail = true;
break;
}
- if (mdr->is_auth_pinned(*p))
+ if (mdr->is_auth_pinned(obj))
continue;
- if (!mdr->can_auth_pin(*p)) {
- if (mdr->slave_request->is_nonblock()) {
- dout(10) << " can't auth_pin (freezing?) " << **p << " nonblocking" << dendl;
+ if (!mdr->can_auth_pin(obj)) {
+ if (nonblocking) {
+ dout(10) << " can't auth_pin (freezing?) " << *obj << " nonblocking" << dendl;
fail = true;
wouldblock = true;
break;
}
// wait
- dout(10) << " waiting for authpinnable on " << **p << dendl;
- (*p)->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
+ dout(10) << " waiting for authpinnable on " << *obj << dendl;
+ obj->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
mdr->drop_local_auth_pins();
- mds->locker->notify_freeze_waiter(*p);
- return;
+ mds->locker->notify_freeze_waiter(obj);
+ goto blocked;
}
}
}
- // auth pin!
- if (fail) {
- mdr->drop_local_auth_pins(); // just in case
- } else {
+ if (!fail) {
/* freeze authpin wrong inode */
if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
mdr->more()->rename_inode != auth_pin_freeze)
if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
mds->mdlog->flush();
- return;
+ goto blocked;
}
}
- for (list<MDSCacheObject*>::iterator p = objects.begin();
- p != objects.end();
- ++p) {
- dout(10) << "auth_pinning " << **p << dendl;
- mdr->auth_pin(*p);
- }
}
- // ack!
- MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPINACK);
-
- // return list of my auth_pins (if any)
- for (set<MDSCacheObject*>::iterator p = mdr->auth_pins.begin();
- p != mdr->auth_pins.end();
- ++p) {
- MDSCacheObjectInfo info;
- (*p)->set_object_info(info);
- reply->get_authpins().push_back(info);
- if (*p == (MDSCacheObject*)auth_pin_freeze)
- auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
- }
+ reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPINACK);
- if (wouldblock)
- reply->mark_error_wouldblock();
- if (readonly)
- reply->mark_error_rofs();
+ if (fail) {
+ mdr->drop_local_auth_pins(); // just in case
+ if (readonly)
+ reply->mark_error_rofs();
+ if (wouldblock)
+ reply->mark_error_wouldblock();
+ } else {
+ // auth pin!
+ for (const auto& obj : objects) {
+ dout(10) << "auth_pinning " << *obj << dendl;
+ mdr->auth_pin(obj);
+ }
+ // return list of my auth_pins (if any)
+ for (const auto &p : mdr->object_states) {
+ if (!p.second.auth_pinned)
+ continue;
+ MDSCacheObjectInfo info;
+ p.first->set_object_info(info);
+ reply->get_authpins().push_back(info);
+ if (p.first == (MDSCacheObject*)auth_pin_freeze)
+ auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
+ }
+ }
mds->send_message_mds(reply, mdr->slave_to_mds);
// clean up this request
- mdr->slave_request->put();
- mdr->slave_request = 0;
+ mdr->reset_slave_request();
+ return;
+
+blocked:
+ if (mdr->slave_request->should_notify_blocking()) {
+ reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPINACK);
+ reply->mark_req_blocked();
+ mds->send_message_mds(reply, mdr->slave_to_mds);
+ mdr->slave_request->clear_notify_blocking();
+ }
return;
}
-/* This function DOES NOT put the passed ack before returning*/
-void Server::handle_slave_auth_pin_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
+void Server::handle_slave_auth_pin_ack(MDRequestRef& mdr, const cref_t<MMDSSlaveRequest> &ack)
{
dout(10) << "handle_slave_auth_pin_ack on " << *mdr << " " << *ack << dendl;
mds_rank_t from = mds_rank_t(ack->get_source().num());
+ if (ack->is_req_blocked()) {
+ mdr->disable_lock_cache();
+ // slave auth pin is blocked, drop locks to avoid deadlock
+ mds->locker->drop_locks(mdr.get(), nullptr);
+ return;
+ }
+
// added auth pins?
set<MDSCacheObject*> pinned;
- for (vector<MDSCacheObjectInfo>::iterator p = ack->get_authpins().begin();
- p != ack->get_authpins().end();
- ++p) {
- MDSCacheObject *object = mdcache->get_object(*p);
- assert(object); // we pinned it
+ for (const auto &oi : ack->get_authpins()) {
+ MDSCacheObject *object = mdcache->get_object(oi);
+ ceph_assert(object); // we pinned it
dout(10) << " remote has pinned " << *object << dendl;
- if (!mdr->is_auth_pinned(object))
- mdr->remote_auth_pins[object] = from;
- if (*p == ack->get_authpin_freeze())
+ mdr->set_remote_auth_pinned(object, from);
+ if (oi == ack->get_authpin_freeze())
mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
pinned.insert(object);
}
// removed frozen auth pin ?
if (mdr->more()->is_remote_frozen_authpin &&
ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
- auto p = mdr->remote_auth_pins.find(mdr->more()->rename_inode);
- assert(p != mdr->remote_auth_pins.end());
- if (p->second == from) {
+ auto stat_p = mdr->find_object_state(mdr->more()->rename_inode);
+ ceph_assert(stat_p);
+ if (stat_p->remote_auth_pinned == from) {
mdr->more()->is_remote_frozen_authpin = false;
}
}
// removed auth pins?
- map<MDSCacheObject*, mds_rank_t>::iterator p = mdr->remote_auth_pins.begin();
- while (p != mdr->remote_auth_pins.end()) {
- MDSCacheObject* object = p->first;
- if (p->second == from && pinned.count(object) == 0) {
+ for (auto& p : mdr->object_states) {
+ if (p.second.remote_auth_pinned == MDS_RANK_NONE)
+ continue;
+ MDSCacheObject* object = p.first;
+ if (p.second.remote_auth_pinned == from && pinned.count(object) == 0) {
dout(10) << " remote has unpinned " << *object << dendl;
- mdr->remote_auth_pins.erase(p++);
- } else {
- ++p;
+ mdr->_clear_remote_auth_pinned(p.second);
}
}
+ // note slave
+ mdr->more()->slaves.insert(from);
+
+ // clear from waiting list
+ auto ret = mdr->more()->waiting_on_slave.erase(from);
+ ceph_assert(ret);
+
if (ack->is_error_rofs()) {
mdr->more()->slave_error = -EROFS;
- mdr->aborted = true;
} else if (ack->is_error_wouldblock()) {
mdr->more()->slave_error = -EWOULDBLOCK;
- mdr->aborted = true;
}
-
- // note slave
- mdr->more()->slaves.insert(from);
-
- // clear from waiting list
- assert(mdr->more()->waiting_on_slave.count(from));
- mdr->more()->waiting_on_slave.erase(from);
// go again?
if (mdr->more()->waiting_on_slave.empty())
bool Server::check_fragment_space(MDRequestRef &mdr, CDir *in)
{
const auto size = in->get_frag_size();
- if (size >= g_conf->mds_bal_fragment_size_max) {
- dout(10) << "fragment " << *in << " size exceeds " << g_conf->mds_bal_fragment_size_max << " (ENOSPC)" << dendl;
+ if (size >= g_conf()->mds_bal_fragment_size_max) {
+ dout(10) << "fragment " << *in << " size exceeds " << g_conf()->mds_bal_fragment_size_max << " (ENOSPC)" << dendl;
respond_to_request(mdr, -ENOSPC);
return false;
}
return true;
}
-
-/** validate_dentry_dir
- *
- * verify that the dir exists and would own the dname.
- * do not check if the dentry exists.
- */
-CDir *Server::validate_dentry_dir(MDRequestRef& mdr, CInode *diri, boost::string_view dname)
-{
- // make sure parent is a dir?
- if (!diri->is_dir()) {
- dout(7) << "validate_dentry_dir: not a dir" << dendl;
- respond_to_request(mdr, -ENOTDIR);
- return NULL;
- }
-
- // which dirfrag?
- frag_t fg = diri->pick_dirfrag(dname);
- CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
- if (!dir)
- return 0;
-
- // frozen?
- if (dir->is_frozen()) {
- dout(7) << "dir is frozen " << *dir << dendl;
- dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
- return NULL;
- }
-
- return dir;
-}
-
-
-/** prepare_null_dentry
- * prepare a null (or existing) dentry in given dir.
- * wait for any dn lock.
- */
-CDentry* Server::prepare_null_dentry(MDRequestRef& mdr, CDir *dir, boost::string_view dname, bool okexist)
-{
- dout(10) << "prepare_null_dentry " << dname << " in " << *dir << dendl;
- assert(dir->is_auth());
-
- client_t client = mdr->get_client();
-
- // does it already exist?
- CDentry *dn = dir->lookup(dname);
- if (dn) {
- /*
- if (dn->lock.is_xlocked_by_other(mdr)) {
- dout(10) << "waiting on xlocked dentry " << *dn << dendl;
- dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
- return 0;
- }
- */
- if (!dn->get_linkage(client, mdr)->is_null()) {
- // name already exists
- dout(10) << "dentry " << dname << " exists in " << *dir << dendl;
- if (!okexist) {
- respond_to_request(mdr, -EEXIST);
- return 0;
- }
- } else {
- dn->first = dir->inode->find_snaprealm()->get_newest_seq() + 1;
- }
-
- return dn;
- }
-
- // make sure dir is complete
- if (!dir->is_complete() && (!dir->has_bloom() || dir->is_in_bloom(dname))) {
- dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
- dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
- return 0;
- }
-
- // create
- dn = dir->add_null_dentry(dname, dir->inode->find_snaprealm()->get_newest_seq() + 1);
- dn->mark_new();
- dout(10) << "prepare_null_dentry added " << *dn << dendl;
- return dn;
-}
-
CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
{
CDentry *straydn = mdr->straydn;
if (straydn) {
string straydname;
in->name_stray_dentry(straydname);
- if (straydn->get_name() == straydname)
- return straydn;
-
- assert(!mdr->done_locking);
- mdr->unpin(straydn);
+ ceph_assert(straydn->get_name() == straydname);
+ return straydn;
}
CDir *straydir = mdcache->get_stray_dir(in);
// state. In that corner case, session's prealloc_inos are being freed.
// To simplify the code, we disallow using/refilling session's prealloc_ino
// while session is opening.
- bool allow_prealloc_inos = !mdr->session->is_opening();
+ bool allow_prealloc_inos = mdr->session->is_open();
// assign ino
- if (allow_prealloc_inos &&
- mdr->session->info.prealloc_inos.size()) {
- mdr->used_prealloc_ino =
- in->inode.ino = mdr->session->take_ino(useino); // prealloc -> used
+ if (allow_prealloc_inos && (mdr->used_prealloc_ino = in->inode.ino = mdr->session->take_ino(useino))) {
mds->sessionmap.mark_projected(mdr->session);
-
dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
<< " (" << mdr->session->info.prealloc_inos
<< ", " << mdr->session->info.prealloc_inos.size() << " left)"
<< dendl;
} else {
mdr->alloc_ino =
- in->inode.ino = mds->inotable->project_alloc_id();
+ in->inode.ino = mds->inotable->project_alloc_id(useino);
dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
}
}
if (allow_prealloc_inos &&
- mdr->session->get_num_projected_prealloc_inos() < g_conf->mds_client_prealloc_inos / 2) {
- int need = g_conf->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
+ mdr->session->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos / 2) {
+ int need = g_conf()->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
- assert(mdr->prealloc_inos.size()); // or else fix projected increment semantics
+ ceph_assert(mdr->prealloc_inos.size()); // or else fix projected increment semantics
mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
mds->sessionmap.mark_projected(mdr->session);
dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
in->inode.mode = mode;
+ // FIPS zeroization audit 20191117: this memset is not security related.
memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
if (in->inode.is_dir()) {
- in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
+ in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
} else if (layout) {
in->inode.layout = *layout;
} else {
in->inode.change_attr = 0;
- MClientRequest *req = mdr->client_request;
+ const cref_t<MClientRequest> &req = mdr->client_request;
if (req->get_data().length()) {
- bufferlist::iterator p = req->get_data().begin();
+ auto p = req->get_data().cbegin();
// xattrs on new inode?
CInode::mempool_xattr_map xattrs;
- ::decode(xattrs, p);
+ decode_noshare(xattrs, p);
for (const auto &p : xattrs) {
dout(10) << "prepare_new_inode setting xattr " << p.first << dendl;
auto em = in->xattrs.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), std::forward_as_tuple(p.second));
}
if (!mds->mdsmap->get_inline_data_enabled() ||
- !mdr->session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
+ !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
in->inode.inline_data.version = CEPH_INLINE_NONE;
mdcache->add_inode(in); // add
mds->inotable->apply_alloc_id(mdr->alloc_ino);
}
if (mdr->prealloc_inos.size()) {
- assert(session);
+ ceph_assert(session);
session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
session->info.prealloc_inos.insert(mdr->prealloc_inos);
- mds->sessionmap.mark_dirty(session);
+ mds->sessionmap.mark_dirty(session, !mdr->used_prealloc_ino);
mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
}
if (mdr->used_prealloc_ino) {
- assert(session);
+ ceph_assert(session);
session->info.used_inos.erase(mdr->used_prealloc_ino);
mds->sessionmap.mark_dirty(session);
}
}
};
-CDir *Server::traverse_to_auth_dir(MDRequestRef& mdr, vector<CDentry*> &trace, filepath refpath)
-{
- // figure parent dir vs dname
- if (refpath.depth() == 0) {
- dout(7) << "can't do that to root" << dendl;
- respond_to_request(mdr, -EINVAL);
- return 0;
- }
- string dname = refpath.last_dentry();
- refpath.pop_dentry();
-
- dout(10) << "traverse_to_auth_dir dirpath " << refpath << " dname " << dname << dendl;
-
- // traverse to parent dir
- CInode *diri;
- int r = mdcache->path_traverse(mdr, NULL, NULL, refpath, &trace, &diri, MDS_TRAVERSE_FORWARD);
- if (r > 0) return 0; // delayed
- if (r < 0) {
- if (r == -ESTALE) {
- dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
- mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
- return 0;
+class CF_MDS_MDRContextFactory : public MDSContextFactory {
+public:
+ CF_MDS_MDRContextFactory(MDCache *cache, MDRequestRef &mdr, bool dl) :
+ mdcache(cache), mdr(mdr), drop_locks(dl) {}
+ MDSContext *build() {
+ if (drop_locks) {
+ mdcache->mds->locker->drop_locks(mdr.get(), nullptr);
+ mdr->drop_local_auth_pins();
}
- respond_to_request(mdr, r);
- return 0;
+ return new C_MDS_RetryRequest(mdcache, mdr);
}
-
- // is it an auth dir?
- CDir *dir = validate_dentry_dir(mdr, diri, dname);
- if (!dir)
- return 0; // forwarded or waiting for freeze
-
- dout(10) << "traverse_to_auth_dir " << *dir << dendl;
- return dir;
-}
+private:
+ MDCache *mdcache;
+ MDRequestRef mdr;
+ bool drop_locks;
+};
/* If this returns null, the request has been handled
* as appropriate: forwarded on, or the client's been replied to */
-CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr, int n,
- set<SimpleLock*> &rdlocks,
+CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr,
bool want_auth,
- bool no_want_auth, /* for readdir, who doesn't want auth _even_if_ it's
- a snapped dir */
- file_layout_t **layout,
- bool no_lookup) // true if we cannot return a null dentry lease
+ bool no_want_auth)
{
- const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();
+ const filepath& refpath = mdr->get_filepath();
dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;
- if (mdr->done_locking)
- return mdr->in[n];
+ if (mdr->locking_state & MutationImpl::PATH_LOCKED)
+ return mdr->in[0];
// traverse
- int r = mdcache->path_traverse(mdr, NULL, NULL, refpath, &mdr->dn[n], &mdr->in[n], MDS_TRAVERSE_FORWARD);
+ CF_MDS_MDRContextFactory cf(mdcache, mdr, true);
+ int flags = 0;
+ if (refpath.is_last_snap()) {
+ if (!no_want_auth)
+ want_auth = true;
+ } else {
+ flags |= MDS_TRAVERSE_RDLOCK_PATH | MDS_TRAVERSE_RDLOCK_SNAP;
+ }
+ if (want_auth)
+ flags |= MDS_TRAVERSE_WANT_AUTH;
+ int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0], &mdr->in[0]);
if (r > 0)
- return NULL; // delayed
+ return nullptr; // delayed
if (r < 0) { // error
- if (r == -ENOENT && n == 0 && mdr->dn[n].size()) {
- if (!no_lookup)
- mdr->tracedn = mdr->dn[n][mdr->dn[n].size()-1];
+ if (r == -ENOENT && !mdr->dn[0].empty()) {
+ if (mdr->client_request &&
+ mdr->client_request->get_dentry_wanted())
+ mdr->tracedn = mdr->dn[0].back();
respond_to_request(mdr, r);
} else if (r == -ESTALE) {
dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
- MDSInternalContextBase *c = new C_MDS_TryFindInode(this, mdr);
+ MDSContext *c = new C_MDS_TryFindInode(this, mdr);
mdcache->find_ino_peers(refpath.get_ino(), c);
} else {
dout(10) << "FAIL on error " << r << dendl;
respond_to_request(mdr, r);
}
- return 0;
+ return nullptr;
}
- CInode *ref = mdr->in[n];
+ CInode *ref = mdr->in[0];
dout(10) << "ref is " << *ref << dendl;
- // fw to inode auth?
- if (mdr->snapid != CEPH_NOSNAP && !no_want_auth)
- want_auth = true;
-
if (want_auth) {
- if (ref->is_ambiguous_auth()) {
- dout(10) << "waiting for single auth on " << *ref << dendl;
- ref->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr));
- return 0;
- }
- if (!ref->is_auth()) {
- dout(10) << "fw to auth for " << *ref << dendl;
- mdcache->request_forward(mdr, ref->authority().first);
- return 0;
- }
-
// auth_pin?
// do NOT proceed if freezing, as cap release may defer in that case, and
// we could deadlock when we try to lock @ref.
if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
(ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
- ref->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
- /* If we have any auth pins, this will deadlock.
- * But the only way to get here if we've already got auth pins
- * is because we're on an inode with snapshots that got updated
- * between dispatches of this request. So we're going to drop
- * our locks and our auth pins and reacquire them later.
- *
- * This is safe since we're only in this function when working on
- * a single MDS request; otherwise we'd be in
- * rdlock_path_xlock_dentry.
- */
- mds->locker->drop_locks(mdr.get(), NULL);
- mdr->drop_local_auth_pins();
- if (!mdr->remote_auth_pins.empty())
+ ref->add_waiter(CInode::WAIT_UNFREEZE, cf.build());
+ if (mdr->is_any_remote_auth_pin())
mds->locker->notify_freeze_waiter(ref);
return 0;
}
-
mdr->auth_pin(ref);
}
- for (int i=0; i<(int)mdr->dn[n].size(); i++)
- rdlocks.insert(&mdr->dn[n][i]->lock);
- if (layout)
- mds->locker->include_snap_rdlocks_wlayout(rdlocks, ref, layout);
- else
- mds->locker->include_snap_rdlocks(rdlocks, ref);
-
// set and pin ref
mdr->pin(ref);
return ref;
* create null dentry in place (or use existing if okexist).
* get rdlocks on traversed dentries, xlock on new dentry.
*/
-CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr, int n,
- set<SimpleLock*>& rdlocks, set<SimpleLock*>& wrlocks, set<SimpleLock*>& xlocks,
- bool okexist, bool mustexist, bool alwaysxlock,
- file_layout_t **layout)
+CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr,
+ bool create, bool okexist, bool want_layout)
{
- const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();
-
+ const filepath& refpath = mdr->get_filepath();
dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;
- client_t client = mdr->get_client();
+ // Fast path: a previous dispatch of this request already traversed and
+ // locked the path; reuse the cached dentry trail.
+ if (mdr->locking_state & MutationImpl::PATH_LOCKED)
+ return mdr->dn[0].back();
+
+ // figure parent dir vs dname
+ if (refpath.depth() == 0) {
+ dout(7) << "invalid path (zero length)" << dendl;
+ respond_to_request(mdr, -EINVAL);
+ return nullptr;
+ }
+
+ // Last path component names a snapshot: snapshots are not writable here.
+ if (refpath.is_last_snap()) {
+ respond_to_request(mdr, -EROFS);
+ return nullptr;
+ }
- if (mdr->done_locking)
- return mdr->dn[n].back();
+ // "." or ".." as the final component: for create ops the target trivially
+ // exists (EEXIST); otherwise reject with ENOTEMPTY.
+ if (refpath.is_last_dot_or_dotdot()) {
+ dout(7) << "invalid path (last dot or dot_dot)" << dendl;
+ if (create)
+ respond_to_request(mdr, -EEXIST);
+ else
+ respond_to_request(mdr, -ENOTEMPTY);
+ return nullptr;
+ }
- CDir *dir = traverse_to_auth_dir(mdr, mdr->dn[n], refpath);
- if (!dir) return 0;
+ // traverse to parent dir
+ CF_MDS_MDRContextFactory cf(mdcache, mdr, true);
+ int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_PATH |
+ MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_XLOCK_DENTRY |
+ MDS_TRAVERSE_WANT_AUTH;
+ if (refpath.depth() == 1 && !mdr->lock_cache_disabled)
+ flags |= MDS_TRAVERSE_CHECK_LOCKCACHE;
+ if (create)
+ flags |= MDS_TRAVERSE_RDLOCK_AUTHLOCK;
+ if (want_layout)
+ flags |= MDS_TRAVERSE_WANT_DIRLAYOUT;
+ // r > 0 means the traversal blocked or was forwarded; the retry is
+ // scheduled through the context factory, so just bail out here.
+ int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
+ if (r > 0)
+ return nullptr; // delayed
+ if (r < 0) {
+ if (r == -ESTALE) {
+ dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
+ mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
+ return nullptr;
+ }
+ respond_to_request(mdr, r);
+ return nullptr;
+ }
+ CDentry *dn = mdr->dn[0].back();
+ CDir *dir = dn->get_dir();
CInode *diri = dir->get_inode();
+
if (!mdr->reqid.name.is_mds()) {
if (diri->is_system() && !diri->is_root()) {
respond_to_request(mdr, -EROFS);
- return 0;
+ return nullptr;
}
}
+
if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
respond_to_request(mdr, -ENOENT);
- return 0;
+ return nullptr;
}
- // make a null dentry?
- boost::string_view dname = refpath.last_dentry();
- CDentry *dn;
- if (mustexist) {
- dn = dir->lookup(dname);
-
- // make sure dir is complete
- if (!dn && !dir->is_complete() &&
- (!dir->has_bloom() || dir->is_in_bloom(dname))) {
- dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
- dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
- return 0;
+ // Examine the final dentry: null (target does not exist yet) vs linked.
+ CDentry::linkage_t *dnl = dn->get_projected_linkage();
+ if (dnl->is_null()) {
+ if (!create && okexist) {
+ respond_to_request(mdr, -ENOENT);
+ return nullptr;
+ }
- // readable?
- if (dn && !dn->lock.can_read(client) && dn->lock.get_xlock_by() != mdr) {
- dout(10) << "waiting on xlocked dentry " << *dn << dendl;
- dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
- return 0;
- }
-
- // exists?
- if (!dn || dn->get_linkage(client, mdr)->is_null()) {
- dout(7) << "dentry " << dname << " dne in " << *dir << dendl;
- respond_to_request(mdr, -ENOENT);
- return 0;
- }
+ // New dentry must start past the newest global snapshot sequence.
+ snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+ dn->first = std::max(dn->first, next_snap);
} else {
- dn = prepare_null_dentry(mdr, dir, dname, okexist);
- if (!dn)
- return 0;
+ if (!okexist) {
+ respond_to_request(mdr, -EEXIST);
+ return nullptr;
+ }
+ // Record the existing target inode for the caller.
+ mdr->in[0] = dnl->get_inode();
}
- mdr->dn[n].push_back(dn);
- CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
- mdr->in[n] = dnl->get_inode();
-
- // -- lock --
- // NOTE: rename takes the same set of locks for srcdn
- for (int i=0; i<(int)mdr->dn[n].size(); i++)
- rdlocks.insert(&mdr->dn[n][i]->lock);
- if (alwaysxlock || dnl->is_null())
- xlocks.insert(&dn->lock); // new dn, xlock
- else
- rdlocks.insert(&dn->lock); // existing dn, rdlock
- wrlocks.insert(&dn->get_dir()->inode->filelock); // also, wrlock on dir mtime
- wrlocks.insert(&dn->get_dir()->inode->nestlock); // also, wrlock on dir mtime
- if (layout)
- mds->locker->include_snap_rdlocks_wlayout(rdlocks, dn->get_dir()->inode, layout);
- else
- mds->locker->include_snap_rdlocks(rdlocks, dn->get_dir()->inode);
+ return dn;
+}
+
+/** rdlock_two_paths_xlock_destdn
+ * traverse two paths and lock the two paths in proper order.
+ * The order of taking locks is:
+ * 1. Lock directory inodes or dentries according to which trees they
+ * are under. Lock objects under fs root before objects under mdsdir.
+ * 2. Lock directory inodes or dentries according to their depth, in
+ * ascending order.
+ * 3. Lock directory inodes or dentries according to inode numbers or
+ * dentries' parent inode numbers, in ascending order.
+ * 4. Lock dentries in the same directory in order of their keys.
+ * 5. Lock non-directory inodes according to inode numbers, in ascending
+ * order.
+ */
+std::pair<CDentry*, CDentry*>
+Server::rdlock_two_paths_xlock_destdn(MDRequestRef& mdr, bool xlock_srcdn)
+{
+
+ const filepath& refpath = mdr->get_filepath();
+ const filepath& refpath2 = mdr->get_filepath2();
+
+ dout(10) << "rdlock_two_paths_xlock_destdn " << *mdr << " " << refpath << " " << refpath2 << dendl;
+
+ // Fast path: locks were already taken on an earlier dispatch.
+ if (mdr->locking_state & MutationImpl::PATH_LOCKED)
+ return std::make_pair(mdr->dn[0].back(), mdr->dn[1].back());
+
+ // Both paths must be a single (ino-anchored) dentry component.
+ if (refpath.depth() != 1 || refpath2.depth() != 1) {
+ respond_to_request(mdr, -EINVAL);
+ return std::pair<CDentry*, CDentry*>(nullptr, nullptr);
+ }
+
+ if (refpath.is_last_snap() || refpath2.is_last_snap()) {
+ respond_to_request(mdr, -EROFS);
+ return std::make_pair(nullptr, nullptr);
+ }
+
+ // traverse to parent dir
+ CF_MDS_MDRContextFactory cf(mdcache, mdr, true);
+ int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_WANT_AUTH;
+ int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
+ if (r != 0) {
+ if (r == -ESTALE) {
+ dout(10) << "ESTALE on path, attempting recovery" << dendl;
+ mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
+ } else if (r < 0) {
+ respond_to_request(mdr, r);
+ }
+ return std::make_pair(nullptr, nullptr);
+ }
+
+ // Second path uses DISCOVER rather than WANT_AUTH — presumably the src
+ // dentry may live on another rank; confirm against path_traverse flags.
+ flags = MDS_TRAVERSE_RDLOCK_SNAP2 | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_DISCOVER;
+ r = mdcache->path_traverse(mdr, cf, refpath2, flags, &mdr->dn[1]);
+ if (r != 0) {
+ if (r == -ESTALE) {
+ dout(10) << "ESTALE on path2, attempting recovery" << dendl;
+ mdcache->find_ino_peers(refpath2.get_ino(), new C_MDS_TryFindInode(this, mdr));
+ } else if (r < 0) {
+ respond_to_request(mdr, r);
+ }
+ return std::make_pair(nullptr, nullptr);
+ }
+
+ CDentry *srcdn = mdr->dn[1].back();
+ CDir *srcdir = srcdn->get_dir();
+ CDentry *destdn = mdr->dn[0].back();
+ CDir *destdir = destdn->get_dir();
+
+ if (!mdr->reqid.name.is_mds()) {
+ if ((srcdir->get_inode()->is_system() && !srcdir->get_inode()->is_root()) ||
+ (destdir->get_inode()->is_system() && !destdir->get_inode()->is_root())) {
+ respond_to_request(mdr, -EROFS);
+ return std::make_pair(nullptr, nullptr);
+ }
+ }
+
+ if (!destdir->get_inode()->is_base() &&
+ destdir->get_inode()->get_projected_parent_dir()->inode->is_stray()) {
+ respond_to_request(mdr, -ENOENT);
+ return std::make_pair(nullptr, nullptr);
+ }
+
+ // Build the lock vector following the ordering rules documented above.
+ MutationImpl::LockOpVec lov;
+ if (srcdir->get_inode() == destdir->get_inode()) {
+ lov.add_wrlock(&destdir->inode->filelock);
+ lov.add_wrlock(&destdir->inode->nestlock);
+ if (xlock_srcdn && srcdir != destdir) {
+ mds_rank_t srcdir_auth = srcdir->authority().first;
+ if (srcdir_auth != mds->get_nodeid()) {
+ lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
+ lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
+ }
+ }
+
+ // Same-directory case: order the two dentry locks by key (rule 4).
+ if (srcdn->get_name() > destdn->get_name())
+ lov.add_xlock(&destdn->lock);
+
+ if (xlock_srcdn)
+ lov.add_xlock(&srcdn->lock);
+ else
+ lov.add_rdlock(&srcdn->lock);
+
+ if (srcdn->get_name() < destdn->get_name())
+ lov.add_xlock(&destdn->lock);
+ } else {
+ // Different parents: order by tree/depth/ino (rules 1-3) via
+ // compare_paths(), tie-broken by parent inode number.
+ int cmp = mdr->compare_paths();
+ bool lock_destdir_first =
+ (cmp < 0 || (cmp == 0 && destdir->ino() < srcdir->ino()));
+
+ if (lock_destdir_first) {
+ lov.add_wrlock(&destdir->inode->filelock);
+ lov.add_wrlock(&destdir->inode->nestlock);
+ lov.add_xlock(&destdn->lock);
+ }
+
+ if (xlock_srcdn) {
+ mds_rank_t srcdir_auth = srcdir->authority().first;
+ if (srcdir_auth == mds->get_nodeid()) {
+ lov.add_wrlock(&srcdir->inode->filelock);
+ lov.add_wrlock(&srcdir->inode->nestlock);
+ } else {
+ lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
+ lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
+ }
+ lov.add_xlock(&srcdn->lock);
+ } else {
+ lov.add_rdlock(&srcdn->lock);
+ }
+
+ if (!lock_destdir_first) {
+ lov.add_wrlock(&destdir->inode->filelock);
+ lov.add_wrlock(&destdir->inode->nestlock);
+ lov.add_xlock(&destdn->lock);
+ }
+ }
- return dn;
-}
+ CInode *auth_pin_freeze = nullptr;
+ // XXX any better way to do this?
+ if (xlock_srcdn && !srcdn->is_auth()) {
+ CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
+ auth_pin_freeze = srcdnl->is_primary() ? srcdnl->get_inode() : nullptr;
+ }
+ if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
+ return std::make_pair(nullptr, nullptr);
+ // Source must exist once locked.
+ if (srcdn->get_projected_linkage()->is_null()) {
+ respond_to_request(mdr, -ENOENT);
+ return std::make_pair(nullptr, nullptr);
+ }
+ // New dest dentry must start past the newest global snapshot sequence.
+ if (destdn->get_projected_linkage()->is_null()) {
+ snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+ destdn->first = std::max(destdn->first, next_snap);
+ }
+ // Mark locking done so a re-dispatch takes the fast path above.
+ mdr->locking_state |= MutationImpl::PATH_LOCKED;
+ return std::make_pair(destdn, srcdn);
+}
/**
* try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
{
CDir *dir = diri->get_dirfrag(fg);
- // not open and inode not mine?
- if (!dir && !diri->is_auth()) {
- mds_rank_t inauth = diri->authority().first;
- dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
- mdcache->request_forward(mdr, inauth);
- return 0;
- }
+ if (dir) {
+ // am i auth for the dirfrag?
+ if (!dir->is_auth()) {
+ mds_rank_t auth = dir->authority().first;
+ dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
+ << ", fw to mds." << auth << dendl;
+ mdcache->request_forward(mdr, auth);
+ return nullptr;
+ }
+ } else {
+ // not open and inode not mine?
+ if (!diri->is_auth()) {
+ mds_rank_t inauth = diri->authority().first;
+ dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
+ mdcache->request_forward(mdr, inauth);
+ return nullptr;
+ }
- // not open and inode frozen?
- if (!dir && diri->is_frozen()) {
- dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
- assert(diri->get_parent_dir());
- diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
- return 0;
- }
+ // not open and inode frozen?
+ if (diri->is_frozen()) {
+ dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
+ ceph_assert(diri->get_parent_dir());
+ diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
+ return nullptr;
+ }
- // invent?
- if (!dir)
+ // invent?
dir = diri->get_or_open_dirfrag(mdcache, fg);
-
- // am i auth for the dirfrag?
- if (!dir->is_auth()) {
- mds_rank_t auth = dir->authority().first;
- dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
- << ", fw to mds." << auth << dendl;
- mdcache->request_forward(mdr, auth);
- return 0;
}
return dir;
void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
{
- MClientRequest *req = mdr->client_request;
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
+ const cref_t<MClientRequest> &req = mdr->client_request;
if (req->get_filepath().depth() == 0 && is_lookup) {
// refpath can't be empty for lookup but it can for
if (mask & CEPH_STAT_RSTAT)
want_auth = true; // set want_auth for CEPH_STAT_RSTAT mask
- CInode *ref = rdlock_path_pin_ref(mdr, 0, rdlocks, want_auth, false, NULL,
- !is_lookup);
- if (!ref) return;
+ CInode *ref = rdlock_path_pin_ref(mdr, want_auth, false);
+ if (!ref)
+ return;
+
+ mdr->getattr_caps = mask;
+
+ if (mdr->snapid == CEPH_NOSNAP && !mdr->is_batch_head && mdr->is_batch_op()) {
+ if (!is_lookup) {
+ auto em = ref->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
+ if (em.second) {
+ em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr, mdcache);
+ } else {
+ dout(20) << __func__ << ": GETATTR op, wait for previous same getattr ops to respond. " << *mdr << dendl;
+ em.first->second->add_request(mdr);
+ return;
+ }
+ } else {
+ CDentry* dn = mdr->dn[0].back();
+ auto em = dn->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
+ if (em.second) {
+ em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr, mdcache);
+ mdr->pin(dn);
+ } else {
+ dout(20) << __func__ << ": LOOKUP op, wait for previous same getattr ops to respond. " << *mdr << dendl;
+ em.first->second->add_request(mdr);
+ return;
+ }
+ }
+ mdr->is_batch_head = true;
+ }
/*
* if client currently holds the EXCL cap on a field, do not rdlock
mdr->snapid <= cap->client_follows))
issued = cap->issued();
+ // FIXME
+ MutationImpl::LockOpVec lov;
if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
- rdlocks.insert(&ref->linklock);
+ lov.add_rdlock(&ref->linklock);
if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
- rdlocks.insert(&ref->authlock);
+ lov.add_rdlock(&ref->authlock);
if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
- rdlocks.insert(&ref->xattrlock);
+ lov.add_rdlock(&ref->xattrlock);
if ((mask & CEPH_CAP_FILE_SHARED) && !(issued & CEPH_CAP_FILE_EXCL)) {
// Don't wait on unstable filelock if client is allowed to read file size.
// This can reduce the response time of getattr in the case that multiple
// clients do stat(2) and there are writers.
// The downside of this optimization is that mds may not issue Fs caps along
// with getattr reply. Client may need to send more getattr requests.
- if (mdr->rdlocks.count(&ref->filelock)) {
- rdlocks.insert(&ref->filelock);
+ if (mdr->is_rdlocked(&ref->filelock)) {
+ lov.add_rdlock(&ref->filelock);
} else if (ref->filelock.is_stable() ||
ref->filelock.get_num_wrlocks() > 0 ||
!ref->filelock.can_read(mdr->get_client())) {
- rdlocks.insert(&ref->filelock);
- mdr->done_locking = false;
+ lov.add_rdlock(&ref->filelock);
+ mdr->locking_state &= ~MutationImpl::ALL_LOCKED;
}
}
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
+ if (!mds->locker->acquire_locks(mdr, lov))
return;
if (!check_access(mdr, ref, MAY_READ))
// value for them. (currently this matters for xattrs and inline data)
mdr->getattr_caps = mask;
- mds->balancer->hit_inode(now, ref, META_POP_IRD,
- req->get_source().num());
+ mds->balancer->hit_inode(ref, META_POP_IRD, req->get_source().num());
// reply
dout(10) << "reply to stat on " << *req << dendl;
}
};
-/* This function DOES clean up the mdr before returning*/
/*
* filepath: ino
*/
void Server::handle_client_lookup_ino(MDRequestRef& mdr,
bool want_parent, bool want_dentry)
{
- MClientRequest *req = mdr->client_request;
+ const cref_t<MClientRequest> &req = mdr->client_request;
+
+ if ((uint64_t)req->head.args.lookupino.snapid > 0)
+ return _lookup_snap_ino(mdr);
inodeno_t ino = req->get_filepath().get_ino();
CInode *in = mdcache->get_inode(ino);
return;
}
- if (mdr && in->snaprealm && !in->snaprealm->is_open() &&
+ if (mdr && in->snaprealm && !in->snaprealm->have_past_parents_open() &&
!in->snaprealm->open_parents(new C_MDS_RetryRequest(mdcache, mdr))) {
return;
}
CDentry *dn = in->get_projected_parent_dn();
CInode *diri = dn ? dn->get_dir()->inode : NULL;
- set<SimpleLock*> rdlocks;
+ MutationImpl::LockOpVec lov;
if (dn && (want_parent || want_dentry)) {
mdr->pin(dn);
- rdlocks.insert(&dn->lock);
+ lov.add_rdlock(&dn->lock);
}
- unsigned mask = req->head.args.getattr.mask;
+ unsigned mask = req->head.args.lookupino.mask;
if (mask) {
Capability *cap = in->get_client_cap(mdr->get_client());
int issued = 0;
if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
issued = cap->issued();
+ // FIXME
// permission bits, ACL/security xattrs
if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
- rdlocks.insert(&in->authlock);
+ lov.add_rdlock(&in->authlock);
if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
- rdlocks.insert(&in->xattrlock);
+ lov.add_rdlock(&in->xattrlock);
mdr->getattr_caps = mask;
}
- if (!rdlocks.empty()) {
- set<SimpleLock*> wrlocks, xlocks;
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
+ if (!lov.empty()) {
+ if (!mds->locker->acquire_locks(mdr, lov))
return;
if (diri != NULL) {
}
}
+// Handle a lookup of a snapshotted inode by (ino, snapid). Tries, in order:
+// the snap-inode cache, the head inode in cache, then — using the client's
+// (parent ino, name hash) hint — the containing dirfrag, falling back to
+// open_ino when nothing is cached.
+void Server::_lookup_snap_ino(MDRequestRef& mdr)
+{
+ const cref_t<MClientRequest> &req = mdr->client_request;
+
+ vinodeno_t vino;
+ vino.ino = req->get_filepath().get_ino();
+ vino.snapid = (__u64)req->head.args.lookupino.snapid;
+ inodeno_t parent_ino = (__u64)req->head.args.lookupino.parent;
+ __u32 hash = req->head.args.lookupino.hash;
+
+ dout(7) << "lookup_snap_ino " << vino << " parent " << parent_ino << " hash " << hash << dendl;
+
+ CInode *in = mdcache->lookup_snap_inode(vino);
+ if (!in) {
+ // Not in the snap cache; see whether the head inode can answer.
+ in = mdcache->get_inode(vino.ino);
+ if (in) {
+ if (in->state_test(CInode::STATE_PURGING) ||
+ !in->has_snap_data(vino.snapid)) {
+ // Head inode cannot serve this snapid. Without a parent hint (or
+ // for a directory) there is nothing more to try: ESTALE.
+ if (in->is_dir() || !parent_ino) {
+ respond_to_request(mdr, -ESTALE);
+ return;
+ }
+ in = NULL;
+ }
+ }
+ }
+
+ if (in) {
+ dout(10) << "reply to lookup_snap_ino " << *in << dendl;
+ mdr->snapid = vino.snapid;
+ mdr->tracei = in;
+ respond_to_request(mdr, 0);
+ return;
+ }
+
+ CInode *diri = NULL;
+ if (parent_ino) {
+ // Use the client-supplied parent hint to locate the dirfrag that
+ // should contain the (snapped) dentry.
+ diri = mdcache->get_inode(parent_ino);
+ if (!diri) {
+ mdcache->open_ino(parent_ino, mds->mdsmap->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr));
+ return;
+ }
+
+ if (!diri->is_dir()) {
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
+
+ // rdlock the fragtree before mapping the name hash to a frag.
+ MutationImpl::LockOpVec lov;
+ lov.add_rdlock(&diri->dirfragtreelock);
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ frag_t frag = diri->dirfragtree[hash];
+ CDir *dir = try_open_auth_dirfrag(diri, frag, mdr);
+ if (!dir)
+ return;
+
+ if (!dir->is_complete()) {
+ if (dir->is_frozen()) {
+ // Frozen dir: drop our locks/pins and wait for unfreeze, then retry.
+ mds->locker->drop_locks(mdr.get());
+ mdr->drop_local_auth_pins();
+ dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+ dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
+ return;
+ }
+
+ // Dirfrag is complete but the snap inode still wasn't found.
+ respond_to_request(mdr, -ESTALE);
+ } else {
+ // No parent hint: open the inode directly. NOTE(review): fourth arg is
+ // presumably want_replica=false — confirm against MDCache::open_ino.
+ mdcache->open_ino(vino.ino, mds->mdsmap->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr), false);
+ }
+}
+
void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
{
inodeno_t ino = mdr->client_request->get_filepath().get_ino();
/* This function takes responsibility for the passed mdr*/
void Server::handle_client_open(MDRequestRef& mdr)
{
- MClientRequest *req = mdr->client_request;
+ const cref_t<MClientRequest> &req = mdr->client_request;
dout(7) << "open on " << req->get_filepath() << dendl;
int flags = req->head.args.open.flags;
return;
}
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
- CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, need_auth);
+ CInode *cur = rdlock_path_pin_ref(mdr, need_auth);
if (!cur)
return;
if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
- assert(!need_auth);
- mdr->done_locking = false;
- CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
+ ceph_assert(!need_auth);
+ mdr->locking_state &= ~(MutationImpl::PATH_LOCKED | MutationImpl::ALL_LOCKED);
+ CInode *cur = rdlock_path_pin_ref(mdr, true);
if (!cur)
return;
}
}
if (cur->inode.inline_data.version != CEPH_INLINE_NONE &&
- !mdr->session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
+ !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
dout(7) << "old client cannot open inline data file " << *cur << dendl;
respond_to_request(mdr, -EPERM);
return;
return;
}
+ MutationImpl::LockOpVec lov;
+
unsigned mask = req->head.args.open.mask;
if (mask) {
Capability *cap = cur->get_client_cap(mdr->get_client());
issued = cap->issued();
// permission bits, ACL/security xattrs
if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
- rdlocks.insert(&cur->authlock);
+ lov.add_rdlock(&cur->authlock);
if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
- rdlocks.insert(&cur->xattrlock);
+ lov.add_rdlock(&cur->xattrlock);
mdr->getattr_caps = mask;
}
// O_TRUNC
if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
- assert(cur->is_auth());
+ ceph_assert(cur->is_auth());
- xlocks.insert(&cur->filelock);
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
+ lov.add_xlock(&cur->filelock);
+ if (!mds->locker->acquire_locks(mdr, lov))
return;
if (!check_access(mdr, cur, MAY_WRITE))
// this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
// and that data itself is flushed so that we can read the snapped data off disk.
if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
- rdlocks.insert(&cur->filelock);
+ lov.add_rdlock(&cur->filelock);
}
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
+ if (!mds->locker->acquire_locks(mdr, lov))
return;
mask = MAY_READ;
if (cur->is_file() || cur->is_dir()) {
if (mdr->snapid == CEPH_NOSNAP) {
// register new cap
- Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr->session, 0, req->is_replay());
+ Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr, nullptr);
if (cap)
dout(12) << "open issued caps " << ccap_string(cap->pending())
<< " for " << req->get_source()
// make sure this inode gets into the journal
if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
- !cur->item_open_file.is_on_list()) {
- LogSegment *ls = mds->mdlog->get_current_segment();
+ mdcache->open_file_table.should_log_open(cur)) {
EOpen *le = new EOpen(mds->mdlog);
mdlog->start_entry(le);
le->add_clean_inode(cur);
- ls->open_files.push_back(&cur->item_open_file);
mdlog->submit_entry(le);
}
// hit pop
if (cmode & CEPH_FILE_MODE_WR)
- mds->balancer->hit_inode(now, cur, META_POP_IWR);
+ mds->balancer->hit_inode(cur, META_POP_IWR);
else
- mds->balancer->hit_inode(now, cur, META_POP_IRD,
+ mds->balancer->hit_inode(cur, META_POP_IRD,
mdr->client_request->get_source().num());
CDentry *dn = 0;
if (req->get_dentry_wanted()) {
- assert(mdr->dn[0].size());
+ ceph_assert(mdr->dn[0].size());
dn = mdr->dn[0].back();
}
+// Journal-commit completion for handle_client_openc: once the EUpdate is
+// committed, make the projected dentry/inode live, propagate the new link,
+// bump write popularity on the new inode, and reply to the client.
class C_MDS_openc_finish : public ServerLogContext {
CDentry *dn;
CInode *newi;
- snapid_t follows;
public:
- C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni, snapid_t f) :
- ServerLogContext(s, r), dn(d), newi(ni), follows(f) {}
+ C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
+ ServerLogContext(s, r), dn(d), newi(ni) {}
void finish(int r) override {
- assert(r == 0);
+ ceph_assert(r == 0);
+ // Swap the projected linkage in as the live one.
dn->pop_projected_linkage();
MDRequestRef null_ref;
get_mds()->mdcache->send_dentry_link(dn, null_ref);
- utime_t now = ceph_clock_now();
- get_mds()->balancer->hit_inode(now, newi, META_POP_IWR);
+ get_mds()->balancer->hit_inode(newi, META_POP_IWR);
server->respond_to_request(mdr, 0);
- assert(g_conf->mds_kill_openc_at != 1);
+ // Debug kill point for openc fault-injection testing.
+ ceph_assert(g_conf()->mds_kill_openc_at != 1);
}
};
/* This function takes responsibility for the passed mdr*/
void Server::handle_client_openc(MDRequestRef& mdr)
{
- MClientRequest *req = mdr->client_request;
+ const cref_t<MClientRequest> &req = mdr->client_request;
client_t client = mdr->get_client();
dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;
}
bool excl = req->head.args.open.flags & CEPH_O_EXCL;
+ CDentry *dn = rdlock_path_xlock_dentry(mdr, true, !excl, true);
+ if (!dn)
+ return;
- if (!excl) {
- int r = mdcache->path_traverse(mdr, NULL, NULL, req->get_filepath(),
- &mdr->dn[0], NULL, MDS_TRAVERSE_FORWARD);
- if (r > 0) return;
- if (r == 0) {
- // it existed.
- handle_client_open(mdr);
- return;
- }
- if (r < 0 && r != -ENOENT) {
- if (r == -ESTALE) {
- dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
- MDSInternalContextBase *c = new C_MDS_TryFindInode(this, mdr);
- mdcache->find_ino_peers(req->get_filepath().get_ino(), c);
- } else {
- dout(10) << "FAIL on error " << r << dendl;
- respond_to_request(mdr, r);
- }
+ CDentry::linkage_t *dnl = dn->get_projected_linkage();
+ if (!excl && !dnl->is_null()) {
+ // it existed.
+ mds->locker->xlock_downgrade(&dn->lock, mdr.get());
+
+ MutationImpl::LockOpVec lov;
+ lov.add_rdlock(&dnl->get_inode()->snaplock);
+ if (!mds->locker->acquire_locks(mdr, lov))
return;
- }
- }
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
- file_layout_t *dir_layout = NULL;
- CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks,
- !excl, false, false, &dir_layout);
- if (!dn) return;
- if (mdr->snapid != CEPH_NOSNAP) {
- respond_to_request(mdr, -EROFS);
+ handle_client_open(mdr);
return;
}
+
+ ceph_assert(dnl->is_null());
+
// set layout
file_layout_t layout;
- if (dir_layout)
- layout = *dir_layout;
+ if (mdr->dir_layout != file_layout_t())
+ layout = mdr->dir_layout;
else
layout = mdcache->default_file_layout;
// created null dn.
CDir *dir = dn->get_dir();
CInode *diri = dir->get_inode();
- rdlocks.insert(&diri->authlock);
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
- return;
-
if (!check_access(mdr, diri, access))
return;
-
if (!check_fragment_space(mdr, dir))
return;
- CDentry::linkage_t *dnl = dn->get_projected_linkage();
-
- if (!dnl->is_null()) {
- // it existed.
- assert(req->head.args.open.flags & CEPH_O_EXCL);
- dout(10) << "O_EXCL, target exists, failing with -EEXIST" << dendl;
- mdr->tracei = dnl->get_inode();
- mdr->tracedn = dn;
- respond_to_request(mdr, -EEXIST);
- return;
- }
+ if (mdr->dn[0].size() == 1)
+ mds->locker->create_lock_cache(mdr, diri, &mdr->dir_layout);
// create inode.
- SnapRealm *realm = diri->find_snaprealm(); // use directory's realm; inode isn't attached yet.
- snapid_t follows = realm->get_newest_seq();
-
CInode *in = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
req->head.args.open.mode | S_IFREG, &layout);
- assert(in);
+ ceph_assert(in);
// it's a file.
dn->push_projected_linkage(in);
if (layout.pool_id != mdcache->default_file_layout.pool_id)
in->inode.add_old_pool(mdcache->default_file_layout.pool_id);
in->inode.update_backtrace();
- if (cmode & CEPH_FILE_MODE_WR) {
- in->inode.client_ranges[client].range.first = 0;
- in->inode.client_ranges[client].range.last = in->inode.get_layout_size_increment();
- in->inode.client_ranges[client].follows = follows;
- }
in->inode.rstat.rfiles = 1;
- assert(dn->first == follows+1);
+ SnapRealm *realm = diri->find_snaprealm();
+ snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
+ ceph_assert(follows >= realm->get_newest_seq());
+
+ ceph_assert(dn->first == follows+1);
in->first = dn->first;
+
+ // do the open
+ Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr, realm);
+ in->authlock.set_state(LOCK_EXCL);
+ in->xattrlock.set_state(LOCK_EXCL);
+
+ if (cap && (cmode & CEPH_FILE_MODE_WR)) {
+ in->inode.client_ranges[client].range.first = 0;
+ in->inode.client_ranges[client].range.last = in->inode.layout.stripe_unit;
+ in->inode.client_ranges[client].follows = follows;
+ cap->mark_clientwriteable();
+ }
// prepare finisher
mdr->ls = mdlog->get_current_segment();
mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
le->metablob.add_primary_dentry(dn, in, true, true, true);
- // do the open
- mds->locker->issue_new_caps(in, cmode, mdr->session, realm, req->is_replay());
- in->authlock.set_state(LOCK_EXCL);
- in->xattrlock.set_state(LOCK_EXCL);
-
// make sure this inode gets into the journal
le->metablob.add_opened_ino(in->ino());
- LogSegment *ls = mds->mdlog->get_current_segment();
- ls->open_files.push_back(&in->item_open_file);
- C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, in, follows);
+ C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, in);
- if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
+ if (mdr->session->info.has_feature(CEPHFS_FEATURE_DELEG_INO)) {
+ openc_response_t ocresp;
+
+ dout(10) << "adding created_ino and delegated_inos" << dendl;
+ ocresp.created_ino = in->inode.ino;
+
+ if (delegate_inos_pct && !req->is_queued_for_replay()) {
+ // Try to delegate some prealloc_inos to the client, if it's down to half the max
+ unsigned frac = 100 / delegate_inos_pct;
+ if (mdr->session->delegated_inos.size() < (unsigned)g_conf()->mds_client_prealloc_inos / frac / 2)
+ mdr->session->delegate_inos(g_conf()->mds_client_prealloc_inos / frac, ocresp.delegated_inos);
+ }
+
+ encode(ocresp, mdr->reply_extra_bl);
+ } else if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
dout(10) << "adding ino to reply to indicate inode was created" << dendl;
// add the file created flag onto the reply if create_flags features is supported
- ::encode(in->inode.ino, mdr->reply_extra_bl);
+ encode(in->inode.ino, mdr->reply_extra_bl);
}
journal_and_reply(mdr, in, dn, le, fin);
void Server::handle_client_readdir(MDRequestRef& mdr)
{
- MClientRequest *req = mdr->client_request;
+ const cref_t<MClientRequest> &req = mdr->client_request;
client_t client = req->get_source().num();
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
- CInode *diri = rdlock_path_pin_ref(mdr, 0, rdlocks, false, true);
+ MutationImpl::LockOpVec lov;
+ CInode *diri = rdlock_path_pin_ref(mdr, false, true);
if (!diri) return;
// it's a directory, right?
return;
}
- rdlocks.insert(&diri->filelock);
- rdlocks.insert(&diri->dirfragtreelock);
+ lov.add_rdlock(&diri->filelock);
+ lov.add_rdlock(&diri->dirfragtreelock);
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
+ if (!mds->locker->acquire_locks(mdr, lov))
return;
if (!check_access(mdr, diri, MAY_READ))
// ok!
dout(10) << "handle_client_readdir on " << *dir << dendl;
- assert(dir->is_auth());
+ ceph_assert(dir->is_auth());
if (!dir->is_complete()) {
if (dir->is_frozen()) {
unsigned max_bytes = req->head.args.readdir.max_bytes;
if (!max_bytes)
// make sure at least one item can be encoded
- max_bytes = (512 << 10) + g_conf->mds_max_xattr_pairs_size;
+ max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
// start final blob
bufferlist dirbl;
- dir->encode_dirstat(dirbl, mds->get_nodeid());
+ DirStat ds;
+ ds.frag = dir->get_frag();
+ ds.auth = dir->get_dir_auth().first;
+ if (dir->is_auth() && !mdcache->forward_all_reqs_to_auth())
+ dir->get_dist_spec(ds.dist, mds->get_nodeid());
+
+ dir->encode_dirstat(dirbl, mdr->session->info, ds);
// count bytes available.
// this isn't perfect, but we should capture the main variable/unbounded size items!
return;
}
}
- assert(in);
+ ceph_assert(in);
if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
// dentry
dout(12) << "including dn " << *dn << dendl;
- ::encode(dn->get_name(), dnbl);
- mds->locker->issue_client_lease(dn, client, dnbl, now, mdr->session);
+ encode(dn->get_name(), dnbl);
+ int lease_mask = dnl->is_primary() ? CEPH_LEASE_PRIMARY_LINK : 0;
+ mds->locker->issue_client_lease(dn, mdr, lease_mask, now, dnbl);
// inode
dout(12) << "including inode " << *in << dendl;
dnbl.swap(keep);
break;
}
- assert(r >= 0);
+ ceph_assert(r >= 0);
numfiles++;
// touch dn
}
// finish final blob
- ::encode(numfiles, dirbl);
- ::encode(flags, dirbl);
+ encode(numfiles, dirbl);
+ encode(flags, dirbl);
dirbl.claim_append(dnbl);
// yay, reply
mdr->reply_extra_bl = dirbl;
// bump popularity. NOTE: this doesn't quite capture it.
- mds->balancer->hit_dir(now, dir, META_POP_IRD, -1, numfiles);
+ mds->balancer->hit_dir(dir, META_POP_IRD, -1, numfiles);
// reply
mdr->tracei = diri;
*/
class C_MDS_inode_update_finish : public ServerLogContext {
CInode *in;
- bool truncating_smaller, changed_ranges;
+ bool truncating_smaller, changed_ranges, new_realm;
public:
C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
- bool sm=false, bool cr=false) :
- ServerLogContext(s, r), in(i), truncating_smaller(sm), changed_ranges(cr) { }
+ bool sm=false, bool cr=false, bool nr=false) :
+ ServerLogContext(s, r), in(i),
+ truncating_smaller(sm), changed_ranges(cr), new_realm(nr) { }
void finish(int r) override {
- assert(r == 0);
+ ceph_assert(r == 0);
// apply
in->pop_and_dirty_projected_inode(mdr->ls);
mdr->apply();
+ MDSRank *mds = get_mds();
+
// notify any clients
if (truncating_smaller && in->inode.is_truncating()) {
- get_mds()->locker->issue_truncate(in);
- get_mds()->mdcache->truncate_inode(in, mdr->ls);
+ mds->locker->issue_truncate(in);
+ mds->mdcache->truncate_inode(in, mdr->ls);
+ }
+
+ if (new_realm) {
+ int op = CEPH_SNAP_OP_SPLIT;
+ mds->mdcache->send_snap_update(in, 0, op);
+ mds->mdcache->do_realm_invalidate_and_update_notify(in, op);
}
- utime_t now = ceph_clock_now();
- get_mds()->balancer->hit_inode(now, in, META_POP_IWR);
+ get_mds()->balancer->hit_inode(in, META_POP_IWR);
server->respond_to_request(mdr, 0);
void Server::handle_client_file_setlock(MDRequestRef& mdr)
{
- MClientRequest *req = mdr->client_request;
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
+ const cref_t<MClientRequest> &req = mdr->client_request;
+ MutationImpl::LockOpVec lov;
// get the inode to operate on, and set up any locks needed for that
- CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
+ CInode *cur = rdlock_path_pin_ref(mdr, true);
if (!cur)
return;
- xlocks.insert(&cur->flocklock);
+ lov.add_xlock(&cur->flocklock);
/* acquire_locks will return true if it gets the locks. If it fails,
it will redeliver this request at a later date, so drop the request.
*/
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) {
+ if (!mds->locker->acquire_locks(mdr, lov)) {
dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
return;
}
dout(10) << " state prior to lock change: " << *lock_state << dendl;
if (CEPH_LOCK_UNLOCK == set_lock.type) {
list<ceph_filelock> activated_locks;
- list<MDSInternalContextBase*> waiters;
+ MDSContext::vec waiters;
if (lock_state->is_waiting(set_lock)) {
dout(10) << " unlock removing waiting lock " << set_lock << dendl;
lock_state->remove_waiting(set_lock);
respond_to_request(mdr, -EWOULDBLOCK);
} else {
dout(10) << " added to waiting list" << dendl;
- assert(lock_state->is_waiting(set_lock));
+ ceph_assert(lock_state->is_waiting(set_lock));
mdr->more()->flock_was_waiting = true;
mds->locker->drop_locks(mdr.get());
mdr->drop_local_auth_pins();
void Server::handle_client_file_readlock(MDRequestRef& mdr)
{
- MClientRequest *req = mdr->client_request;
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
+ const cref_t<MClientRequest> &req = mdr->client_request;
+ MutationImpl::LockOpVec lov;
// get the inode to operate on, and set up any locks needed for that
- CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
+ CInode *cur = rdlock_path_pin_ref(mdr, true);
if (!cur)
return;
/* acquire_locks will return true if it gets the locks. If it fails,
it will redeliver this request at a later date, so drop the request.
*/
- rdlocks.insert(&cur->flocklock);
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) {
+ lov.add_rdlock(&cur->flocklock);
+ if (!mds->locker->acquire_locks(mdr, lov)) {
dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
return;
}
lock_state->look_for_lock(checking_lock);
bufferlist lock_bl;
- ::encode(checking_lock, lock_bl);
+ encode(checking_lock, lock_bl);
mdr->reply_extra_bl = lock_bl;
respond_to_request(mdr, 0);
void Server::handle_client_setattr(MDRequestRef& mdr)
{
- MClientRequest *req = mdr->client_request;
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
- CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
+ const cref_t<MClientRequest> &req = mdr->client_request;
+ MutationImpl::LockOpVec lov;
+ CInode *cur = rdlock_path_pin_ref(mdr, true);
if (!cur) return;
if (mdr->snapid != CEPH_NOSNAP) {
// xlock inode
if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
- xlocks.insert(&cur->authlock);
+ lov.add_xlock(&cur->authlock);
if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
- xlocks.insert(&cur->filelock);
+ lov.add_xlock(&cur->filelock);
if (mask & CEPH_SETATTR_CTIME)
- wrlocks.insert(&cur->versionlock);
+ lov.add_wrlock(&cur->versionlock);
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
+ if (!mds->locker->acquire_locks(mdr, lov))
return;
if ((mask & CEPH_SETATTR_UID) && (cur->inode.uid != req->head.args.setattr.uid))
// adjust client's max_size?
CInode::mempool_inode::client_range_map new_ranges;
bool max_increased = false;
- mds->locker->calc_new_client_ranges(cur, pi.inode.size, &new_ranges, &max_increased);
+ mds->locker->calc_new_client_ranges(cur, pi.inode.size, true, &new_ranges, &max_increased);
if (pi.inode.client_ranges != new_ranges) {
dout(10) << " client_ranges " << pi.inode.client_ranges << " -> " << new_ranges << dendl;
pi.inode.client_ranges = new_ranges;
}
pi.inode.version = cur->pre_dirty();
- pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
+ pi.inode.ctime = mdr->get_op_stamp();
+ if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
+ pi.inode.rstat.rctime = mdr->get_op_stamp();
pi.inode.change_attr++;
// log + wait
truncating_smaller, changed_ranges));
// flush immediately if there are readers/writers waiting
- if (xlocks.count(&cur->filelock) &&
+ if (mdr->is_xlocked(&cur->filelock) &&
(cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
mds->mdlog->flush();
}
{
CInode *in = mdr->in[0];
client_t client = mdr->get_client();
- assert(in);
+ ceph_assert(in);
dout(10) << "do_open_truncate " << *in << dendl;
SnapRealm *realm = in->find_snaprealm();
- mds->locker->issue_new_caps(in, cmode, mdr->session, realm, mdr->client_request->is_replay());
+ Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr, realm);
mdr->ls = mdlog->get_current_segment();
EUpdate *le = new EUpdate(mdlog, "open_truncate");
// prepare
auto &pi = in->project_inode();
pi.inode.version = in->pre_dirty();
- pi.inode.mtime = pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
+ pi.inode.mtime = pi.inode.ctime = mdr->get_op_stamp();
+ if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
+ pi.inode.rstat.rctime = mdr->get_op_stamp();
pi.inode.change_attr++;
uint64_t old_size = std::max<uint64_t>(pi.inode.size, mdr->client_request->head.args.open.old_size);
}
bool changed_ranges = false;
- if (cmode & CEPH_FILE_MODE_WR) {
+ if (cap && (cmode & CEPH_FILE_MODE_WR)) {
pi.inode.client_ranges[client].range.first = 0;
pi.inode.client_ranges[client].range.last = pi.inode.get_layout_size_increment();
- pi.inode.client_ranges[client].follows = in->find_snaprealm()->get_newest_seq();
+ pi.inode.client_ranges[client].follows = realm->get_newest_seq();
changed_ranges = true;
+ cap->mark_clientwriteable();
}
le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
// make sure ino gets into the journal
le->metablob.add_opened_ino(in->ino());
- LogSegment *ls = mds->mdlog->get_current_segment();
- ls->open_files.push_back(&in->item_open_file);
mdr->o_trunc = true;
CDentry *dn = 0;
if (mdr->client_request->get_dentry_wanted()) {
- assert(mdr->dn[0].size());
+ ceph_assert(mdr->dn[0].size());
dn = mdr->dn[0].back();
}
/* This function cleans up the passed mdr */
void Server::handle_client_setlayout(MDRequestRef& mdr)
{
- MClientRequest *req = mdr->client_request;
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
- CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
+ const cref_t<MClientRequest> &req = mdr->client_request;
+ CInode *cur = rdlock_path_pin_ref(mdr, true);
if (!cur) return;
if (mdr->snapid != CEPH_NOSNAP) {
return;
}
- xlocks.insert(&cur->filelock);
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
+ MutationImpl::LockOpVec lov;
+ lov.add_xlock(&cur->filelock);
+ if (!mds->locker->acquire_locks(mdr, lov))
return;
if (!check_access(mdr, cur, access))
// add the old pool to the inode
pi.inode.add_old_pool(old_layout.pool_id);
pi.inode.version = cur->pre_dirty();
- pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
+ pi.inode.ctime = mdr->get_op_stamp();
+ if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
+ pi.inode.rstat.rctime = mdr->get_op_stamp();
pi.inode.change_attr++;
// log + wait
journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
}
+bool Server::xlock_policylock(MDRequestRef& mdr, CInode *in, bool want_layout, bool xlock_snaplock)
+{
+ if (mdr->locking_state & MutationImpl::ALL_LOCKED)
+ return true;
+
+ MutationImpl::LockOpVec lov;
+ lov.add_xlock(&in->policylock);
+ if (xlock_snaplock)
+ lov.add_xlock(&in->snaplock);
+ else
+ lov.add_rdlock(&in->snaplock);
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return false;
+
+ if (want_layout && in->get_projected_inode()->has_layout()) {
+ mdr->dir_layout = in->get_projected_inode()->layout;
+ want_layout = false;
+ }
+ if (CDentry *pdn = in->get_projected_parent_dn(); pdn) {
+ if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 0, want_layout))
+ return false;
+ }
+
+ mdr->locking_state |= MutationImpl::ALL_LOCKED;
+ return true;
+}
+
+CInode* Server::try_get_auth_inode(MDRequestRef& mdr, inodeno_t ino)
+{
+ CInode *in = mdcache->get_inode(ino);
+ if (!in || in->state_test(CInode::STATE_PURGING)) {
+ respond_to_request(mdr, -ESTALE);
+ return nullptr;
+ }
+ if (!in->is_auth()) {
+ mdcache->request_forward(mdr, in->authority().first);
+ return nullptr;
+ }
+
+ return in;
+}
+
void Server::handle_client_setdirlayout(MDRequestRef& mdr)
{
- MClientRequest *req = mdr->client_request;
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
- file_layout_t *dir_layout = NULL;
- CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
- if (!cur) return;
+ const cref_t<MClientRequest> &req = mdr->client_request;
- if (mdr->snapid != CEPH_NOSNAP) {
- respond_to_request(mdr, -EROFS);
+ // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
+ CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
+ if (!cur)
return;
- }
if (!cur->is_dir()) {
respond_to_request(mdr, -ENOTDIR);
return;
}
- xlocks.insert(&cur->policylock);
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
+ if (!xlock_policylock(mdr, cur, true))
return;
// validate layout
file_layout_t layout;
if (old_pi->has_layout())
layout = old_pi->layout;
- else if (dir_layout)
- layout = *dir_layout;
+ else if (mdr->dir_layout != file_layout_t())
+ layout = mdr->dir_layout;
else
layout = mdcache->default_file_layout;
if (name == "quota") {
string::iterator begin = value.begin();
string::iterator end = value.end();
+ if (begin == end) {
+ // keep quota unchanged. (for create_quota_realm())
+ return 0;
+ }
keys_and_values<string::iterator> p; // create instance of parser
std::map<string, string> m; // map to receive results
if (!qi::parse(begin, end, p, m)) { // returns true if successful
return 0;
}
+void Server::create_quota_realm(CInode *in)
+{
+ dout(10) << __func__ << " " << *in << dendl;
+
+ auto req = make_message<MClientRequest>(CEPH_MDS_OP_SETXATTR);
+ req->set_filepath(filepath(in->ino()));
+ req->set_string2("ceph.quota");
+ // empty vxattr value
+ req->set_tid(mds->issue_tid());
+
+ mds->send_message_mds(req, in->authority().first);
+}
+
/*
* Verify that the file layout attribute carried by client
* is well-formatted.
string value,
file_layout_t *layout)
{
- MClientRequest *req = mdr->client_request;
+ const cref_t<MClientRequest> &req = mdr->client_request;
epoch_t epoch;
int r;
epoch = osdmap.get_epoch();
});
- assert(epoch >= req_epoch); // otherwise wait_for_map() told a lie
+ ceph_assert(epoch >= req_epoch); // otherwise wait_for_map() told a lie
} else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
return 0;
}
-void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur,
- file_layout_t *dir_layout,
- set<SimpleLock*> rdlocks,
- set<SimpleLock*> wrlocks,
- set<SimpleLock*> xlocks)
+void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur)
{
- MClientRequest *req = mdr->client_request;
+ const cref_t<MClientRequest> &req = mdr->client_request;
string name(req->get_path2());
bufferlist bl = req->get_data();
string value (bl.c_str(), bl.length());
return;
}
+ bool new_realm = false;
if (name.compare(0, 15, "ceph.dir.layout") == 0) {
if (!cur->is_dir()) {
respond_to_request(mdr, -EINVAL);
return;
}
+ if (!xlock_policylock(mdr, cur, true))
+ return;
+
file_layout_t layout;
if (cur->get_projected_inode()->has_layout())
layout = cur->get_projected_inode()->layout;
- else if (dir_layout)
- layout = *dir_layout;
+ else if (mdr->dir_layout != file_layout_t())
+ layout = mdr->dir_layout;
else
layout = mdcache->default_file_layout;
if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
return;
- xlocks.insert(&cur->policylock);
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
- return;
-
auto &pi = cur->project_inode();
pi.inode.layout = layout;
mdr->no_early_reply = true;
if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
return;
- xlocks.insert(&cur->filelock);
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
+ MutationImpl::LockOpVec lov;
+ lov.add_xlock(&cur->filelock);
+ if (!mds->locker->acquire_locks(mdr, lov))
return;
auto &pi = cur->project_inode();
return;
}
- xlocks.insert(&cur->policylock);
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
+ if (quota.is_enable() && !cur->get_projected_srnode())
+ new_realm = true;
+
+ if (!xlock_policylock(mdr, cur, false, new_realm))
return;
- auto &pi = cur->project_inode();
+ auto &pi = cur->project_inode(false, new_realm);
pi.inode.quota = quota;
+ if (new_realm) {
+ SnapRealm *realm = cur->find_snaprealm();
+ auto seq = realm->get_newest_seq();
+ auto &newsnap = *pi.snapnode;
+ newsnap.created = seq;
+ newsnap.seq = seq;
+ }
mdr->no_early_reply = true;
pip = &pi.inode;
client_t exclude_ct = mdr->get_client();
- mdcache->broadcast_quota_to_client(cur, exclude_ct);
+ mdcache->broadcast_quota_to_client(cur, exclude_ct, true);
} else if (name.find("ceph.dir.pin") == 0) {
if (!cur->is_dir() || cur->is_root()) {
respond_to_request(mdr, -EINVAL);
return;
}
- xlocks.insert(&cur->policylock);
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
+ if (!xlock_policylock(mdr, cur))
return;
auto &pi = cur->project_inode();
}
pip->change_attr++;
- pip->ctime = pip->rstat.rctime = mdr->get_op_stamp();
+ pip->ctime = mdr->get_op_stamp();
+ if (mdr->get_op_stamp() > pip->rstat.rctime)
+ pip->rstat.rctime = mdr->get_op_stamp();
pip->version = cur->pre_dirty();
if (cur->is_file())
pip->update_backtrace();
mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
- journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
+ journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
+ false, false, new_realm));
return;
}
-void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur,
- file_layout_t *dir_layout,
- set<SimpleLock*> rdlocks,
- set<SimpleLock*> wrlocks,
- set<SimpleLock*> xlocks)
+void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur)
{
- MClientRequest *req = mdr->client_request;
+ const cref_t<MClientRequest> &req = mdr->client_request;
string name(req->get_path2());
dout(10) << __func__ << " " << name << " on " << *cur << dendl;
return;
}
- xlocks.insert(&cur->policylock);
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
+ MutationImpl::LockOpVec lov;
+ lov.add_xlock(&cur->policylock);
+ if (!mds->locker->acquire_locks(mdr, lov))
return;
auto &pi = cur->project_inode();
// null/none value (empty string, means default layout). Is equivalent
// to a setxattr with empty string: pass through the empty payload of
// the rmxattr request to do this.
- handle_set_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
+ handle_set_vxattr(mdr, cur);
return;
}
C_MDS_inode_xattr_update_finish(Server *s, MDRequestRef& r, CInode *i) :
ServerLogContext(s, r), in(i) { }
void finish(int r) override {
- assert(r == 0);
+ ceph_assert(r == 0);
// apply
in->pop_and_dirty_projected_inode(mdr->ls);
mdr->apply();
- utime_t now = ceph_clock_now();
- get_mds()->balancer->hit_inode(now, in, META_POP_IWR);
+ get_mds()->balancer->hit_inode(in, META_POP_IWR);
server->respond_to_request(mdr, 0);
}
void Server::handle_client_setxattr(MDRequestRef& mdr)
{
- MClientRequest *req = mdr->client_request;
+ const cref_t<MClientRequest> &req = mdr->client_request;
string name(req->get_path2());
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
- CInode *cur;
- file_layout_t *dir_layout = NULL;
- if (name.compare(0, 15, "ceph.dir.layout") == 0)
- cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
- else
- cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
+ // magic ceph.* namespace?
+ if (name.compare(0, 5, "ceph.") == 0) {
+ // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
+ CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
+ if (!cur)
+ return;
+
+ handle_set_vxattr(mdr, cur);
+ return;
+ }
+
+ CInode *cur = rdlock_path_pin_ref(mdr, true);
if (!cur)
return;
int flags = req->head.args.setxattr.flags;
- // magic ceph.* namespace?
- if (name.compare(0, 5, "ceph.") == 0) {
- handle_set_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
- return;
- }
-
- xlocks.insert(&cur->xattrlock);
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
+ MutationImpl::LockOpVec lov;
+ lov.add_xlock(&cur->xattrlock);
+ if (!mds->locker->acquire_locks(mdr, lov))
return;
if (!check_access(mdr, cur, MAY_WRITE))
// check xattrs kv pairs size
size_t cur_xattrs_size = 0;
for (const auto& p : *pxattrs) {
- if ((flags & CEPH_XATTR_REPLACE) && (name.compare(std::string(boost::string_view(p.first))) == 0)) {
+ if ((flags & CEPH_XATTR_REPLACE) && (name.compare(p.first) == 0)) {
continue;
}
cur_xattrs_size += p.first.length() + p.second.length();
}
- if (((cur_xattrs_size + inc) > g_conf->mds_max_xattr_pairs_size)) {
+ if (((cur_xattrs_size + inc) > g_conf()->mds_max_xattr_pairs_size)) {
dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
<< cur_xattrs_size << ", inc " << inc << dendl;
respond_to_request(mdr, -ENOSPC);
return;
}
- if ((flags & CEPH_XATTR_CREATE) && pxattrs->count(mempool::mds_co::string(boost::string_view(name)))) {
+ if ((flags & CEPH_XATTR_CREATE) && pxattrs->count(mempool::mds_co::string(name))) {
dout(10) << "setxattr '" << name << "' XATTR_CREATE and EEXIST on " << *cur << dendl;
respond_to_request(mdr, -EEXIST);
return;
}
- if ((flags & CEPH_XATTR_REPLACE) && !pxattrs->count(mempool::mds_co::string(boost::string_view(name)))) {
+ if ((flags & CEPH_XATTR_REPLACE) && !pxattrs->count(mempool::mds_co::string(name))) {
dout(10) << "setxattr '" << name << "' XATTR_REPLACE and ENODATA on " << *cur << dendl;
respond_to_request(mdr, -ENODATA);
return;
// project update
auto &pi = cur->project_inode(true);
pi.inode.version = cur->pre_dirty();
- pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
+ pi.inode.ctime = mdr->get_op_stamp();
+ if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
+ pi.inode.rstat.rctime = mdr->get_op_stamp();
pi.inode.change_attr++;
pi.inode.xattr_version++;
auto &px = *pi.xattrs;
if ((flags & CEPH_XATTR_REMOVE)) {
- px.erase(mempool::mds_co::string(boost::string_view(name)));
+ px.erase(mempool::mds_co::string(name));
} else {
bufferptr b = buffer::create(len);
if (len)
- req->get_data().copy(0, len, b.c_str());
- auto em = px.emplace(std::piecewise_construct, std::forward_as_tuple(mempool::mds_co::string(boost::string_view(name))), std::forward_as_tuple(b));
+ req->get_data().begin().copy(len, b.c_str());
+ auto em = px.emplace(std::piecewise_construct, std::forward_as_tuple(mempool::mds_co::string(name)), std::forward_as_tuple(b));
if (!em.second)
em.first->second = b;
}
void Server::handle_client_removexattr(MDRequestRef& mdr)
{
- MClientRequest *req = mdr->client_request;
+ const cref_t<MClientRequest> &req = mdr->client_request;
std::string name(req->get_path2());
- std::set<SimpleLock*> rdlocks, wrlocks, xlocks;
- file_layout_t *dir_layout = NULL;
- CInode *cur;
- if (name == "ceph.dir.layout")
- cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
- else
- cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
+
+ if (name.compare(0, 5, "ceph.") == 0) {
+ // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
+ CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
+ if (!cur)
+ return;
+
+ handle_remove_vxattr(mdr, cur);
+ return;
+ }
+
+ CInode* cur = rdlock_path_pin_ref(mdr, true);
if (!cur)
return;
return;
}
- if (name.compare(0, 5, "ceph.") == 0) {
- handle_remove_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
- return;
- }
-
- xlocks.insert(&cur->xattrlock);
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
+ MutationImpl::LockOpVec lov;
+ lov.add_xlock(&cur->xattrlock);
+ if (!mds->locker->acquire_locks(mdr, lov))
return;
auto pxattrs = cur->get_projected_xattrs();
- if (pxattrs->count(mempool::mds_co::string(boost::string_view(name))) == 0) {
+ if (pxattrs->count(mempool::mds_co::string(name)) == 0) {
dout(10) << "removexattr '" << name << "' and ENODATA on " << *cur << dendl;
respond_to_request(mdr, -ENODATA);
return;
auto &pi = cur->project_inode(true);
auto &px = *pi.xattrs;
pi.inode.version = cur->pre_dirty();
- pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
+ pi.inode.ctime = mdr->get_op_stamp();
+ if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
+ pi.inode.rstat.rctime = mdr->get_op_stamp();
pi.inode.change_attr++;
pi.inode.xattr_version++;
- px.erase(mempool::mds_co::string(boost::string_view(name)));
+ px.erase(mempool::mds_co::string(name));
// log + wait
mdr->ls = mdlog->get_current_segment();
C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
ServerLogContext(s, r), dn(d), newi(ni) {}
void finish(int r) override {
- assert(r == 0);
+ ceph_assert(r == 0);
// link the inode
dn->pop_projected_linkage();
// mkdir?
if (newi->inode.is_dir()) {
CDir *dir = newi->get_dirfrag(frag_t());
- assert(dir);
+ ceph_assert(dir);
dir->fnode.version--;
dir->mark_dirty(dir->fnode.version + 1, mdr->ls);
dir->mark_new(mdr->ls);
get_mds()->locker->share_inode_max_size(newi);
// hit pop
- utime_t now = ceph_clock_now();
- get_mds()->balancer->hit_inode(now, newi, META_POP_IWR);
+ get_mds()->balancer->hit_inode(newi, META_POP_IWR);
// reply
server->respond_to_request(mdr, 0);
void Server::handle_client_mknod(MDRequestRef& mdr)
{
- MClientRequest *req = mdr->client_request;
+ const cref_t<MClientRequest> &req = mdr->client_request;
client_t client = mdr->get_client();
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
- file_layout_t *dir_layout = NULL;
- CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false,
- &dir_layout);
- if (!dn) return;
- if (mdr->snapid != CEPH_NOSNAP) {
- respond_to_request(mdr, -EROFS);
- return;
- }
- CInode *diri = dn->get_dir()->get_inode();
- rdlocks.insert(&diri->authlock);
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
+
+ unsigned mode = req->head.args.mknod.mode;
+ if ((mode & S_IFMT) == 0)
+ mode |= S_IFREG;
+
+ mdr->disable_lock_cache();
+ CDentry *dn = rdlock_path_xlock_dentry(mdr, true, false, S_ISREG(mode));
+ if (!dn)
return;
+ CDir *dir = dn->get_dir();
+ CInode *diri = dir->get_inode();
if (!check_access(mdr, diri, MAY_WRITE))
return;
-
if (!check_fragment_space(mdr, dn->get_dir()))
return;
- unsigned mode = req->head.args.mknod.mode;
- if ((mode & S_IFMT) == 0)
- mode |= S_IFREG;
-
// set layout
file_layout_t layout;
- if (dir_layout && S_ISREG(mode))
- layout = *dir_layout;
+ if (mdr->dir_layout != file_layout_t())
+ layout = mdr->dir_layout;
else
layout = mdcache->default_file_layout;
- SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
- snapid_t follows = realm->get_newest_seq();
- CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
- mode, &layout);
- assert(newi);
+ CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode, &layout);
+ ceph_assert(newi);
dn->push_projected_linkage(newi);
newi->inode.add_old_pool(mdcache->default_file_layout.pool_id);
newi->inode.update_backtrace();
+ snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
+ SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
+ ceph_assert(follows >= realm->get_newest_seq());
+
// if the client created a _regular_ file via MKNOD, it's highly likely they'll
// want to write to it (e.g., if they are reexporting NFS)
if (S_ISREG(newi->inode.mode)) {
- dout(15) << " setting a client_range too, since this is a regular file" << dendl;
- newi->inode.client_ranges[client].range.first = 0;
- newi->inode.client_ranges[client].range.last = newi->inode.get_layout_size_increment();
- newi->inode.client_ranges[client].follows = follows;
-
// issue a cap on the file
int cmode = CEPH_FILE_MODE_RDWR;
- Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
+ Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
if (cap) {
cap->set_wanted(0);
newi->filelock.set_state(LOCK_EXCL);
newi->authlock.set_state(LOCK_EXCL);
newi->xattrlock.set_state(LOCK_EXCL);
+
+ dout(15) << " setting a client_range too, since this is a regular file" << dendl;
+ newi->inode.client_ranges[client].range.first = 0;
+ newi->inode.client_ranges[client].range.last = newi->inode.layout.stripe_unit;
+ newi->inode.client_ranges[client].follows = follows;
+ cap->mark_clientwriteable();
}
}
- assert(dn->first == follows + 1);
+ ceph_assert(dn->first == follows + 1);
newi->first = dn->first;
dout(10) << "mknod mode " << newi->inode.mode << " rdev " << newi->inode.rdev << dendl;
le->metablob.add_primary_dentry(dn, newi, true, true, true);
journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
+ mds->balancer->maybe_fragment(dn->get_dir(), false);
}
/* This function takes responsibility for the passed mdr*/
void Server::handle_client_mkdir(MDRequestRef& mdr)
{
- MClientRequest *req = mdr->client_request;
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
- CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
- if (!dn) return;
- if (mdr->snapid != CEPH_NOSNAP) {
- respond_to_request(mdr, -EROFS);
+ const cref_t<MClientRequest> &req = mdr->client_request;
+
+ mdr->disable_lock_cache();
+ CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
+ if (!dn)
return;
- }
+
CDir *dir = dn->get_dir();
CInode *diri = dir->get_inode();
- rdlocks.insert(&diri->authlock);
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
- return;
// mkdir check access
if (!check_access(mdr, diri, MAY_WRITE))
return;
// new inode
- SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
- snapid_t follows = realm->get_newest_seq();
-
unsigned mode = req->head.args.mkdir.mode;
mode &= ~S_IFMT;
mode |= S_IFDIR;
- CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
- assert(newi);
+ CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
+ ceph_assert(newi);
// it's a directory.
dn->push_projected_linkage(newi);
newi->inode.rstat.rsubdirs = 1;
newi->inode.update_backtrace();
+ snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
+ SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
+ ceph_assert(follows >= realm->get_newest_seq());
+
dout(12) << " follows " << follows << dendl;
- assert(dn->first == follows + 1);
+ ceph_assert(dn->first == follows + 1);
newi->first = dn->first;
// ...and that new dir is empty.
// issue a cap on the directory
int cmode = CEPH_FILE_MODE_RDWR;
- Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
+ Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
if (cap) {
cap->set_wanted(0);
// make sure this inode gets into the journal
le->metablob.add_opened_ino(newi->ino());
- LogSegment *ls = mds->mdlog->get_current_segment();
- ls->open_files.push_back(&newi->item_open_file);
journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
+
+ // We hit_dir (via hit_inode) in our finish callback, but by then we might
+ // have overshot the split size (multiple mkdir in flight), so here is
+ // an early chance to split the dir if this mkdir makes it oversized.
+ mds->balancer->maybe_fragment(dir, false);
}
void Server::handle_client_symlink(MDRequestRef& mdr)
{
- MClientRequest *req = mdr->client_request;
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
- CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
- if (!dn) return;
- if (mdr->snapid != CEPH_NOSNAP) {
- respond_to_request(mdr, -EROFS);
+ mdr->disable_lock_cache();
+ CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
+ if (!dn)
return;
- }
+
CDir *dir = dn->get_dir();
CInode *diri = dir->get_inode();
- rdlocks.insert(&diri->authlock);
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
- return;
if (!check_access(mdr, diri, MAY_WRITE))
- return;
-
+ return;
if (!check_fragment_space(mdr, dir))
return;
+ const cref_t<MClientRequest> &req = mdr->client_request;
+
unsigned mode = S_IFLNK | 0777;
- CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
- assert(newi);
+ CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
+ ceph_assert(newi);
// it's a symlink
dn->push_projected_linkage(newi);
- newi->symlink = mempool::mds_co::string(boost::string_view(req->get_path2()));
+ newi->symlink = req->get_path2();
newi->inode.size = newi->symlink.length();
newi->inode.rstat.rbytes = newi->inode.size;
newi->inode.rstat.rfiles = 1;
le->metablob.add_primary_dentry(dn, newi, true, true);
journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
+ mds->balancer->maybe_fragment(dir, false);
}
-// LINK
+// LINK
+
+void Server::handle_client_link(MDRequestRef& mdr)
+{
+ const cref_t<MClientRequest> &req = mdr->client_request;
+
+ dout(7) << "handle_client_link " << req->get_filepath()
+ << " to " << req->get_filepath2()
+ << dendl;
+
+ mdr->disable_lock_cache();
+
+ CDentry *destdn;
+ CInode *targeti;
+
+ if (req->get_filepath2().depth() == 0) {
+ targeti = mdcache->get_inode(req->get_filepath2().get_ino());
+ if (!targeti) {
+ dout(10) << "ESTALE on path2, attempting recovery" << dendl;
+ mdcache->find_ino_peers(req->get_filepath2().get_ino(), new C_MDS_TryFindInode(this, mdr));
+ return;
+ }
+ mdr->pin(targeti);
+
+ if (!(mdr->locking_state & MutationImpl::SNAP2_LOCKED)) {
+ CDentry *pdn = targeti->get_projected_parent_dn();
+ if (!pdn) {
+ dout(7) << "target has no parent dn, failing..." << dendl;
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
+ if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 1))
+ return;
+ mdr->locking_state |= MutationImpl::SNAP2_LOCKED;
+ }
+
+ destdn = rdlock_path_xlock_dentry(mdr, false);
+ if (!destdn)
+ return;
-void Server::handle_client_link(MDRequestRef& mdr)
-{
- MClientRequest *req = mdr->client_request;
+ } else {
+ auto ret = rdlock_two_paths_xlock_destdn(mdr, false);
+ destdn = ret.first;
+ if (!destdn)
+ return;
- dout(7) << "handle_client_link " << req->get_filepath()
- << " to " << req->get_filepath2()
- << dendl;
+ if (!destdn->get_projected_linkage()->is_null()) {
+ respond_to_request(mdr, -EEXIST);
+ return;
+ }
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
+ targeti = ret.second->get_projected_linkage()->get_inode();
+ }
- CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
- if (!dn) return;
- CInode *targeti = rdlock_path_pin_ref(mdr, 1, rdlocks, false);
- if (!targeti) return;
- if (mdr->snapid != CEPH_NOSNAP) {
- respond_to_request(mdr, -EROFS);
+ if (targeti->is_dir()) {
+ dout(7) << "target is a dir, failing..." << dendl;
+ respond_to_request(mdr, -EINVAL);
return;
}
- CDir *dir = dn->get_dir();
- dout(7) << "handle_client_link link " << dn->get_name() << " in " << *dir << dendl;
+ CDir *dir = destdn->get_dir();
+ dout(7) << "handle_client_link link " << destdn->get_name() << " in " << *dir << dendl;
dout(7) << "target is " << *targeti << dendl;
- if (targeti->is_dir()) {
- // if srcdn is replica, need to make sure its linkage is correct
- vector<CDentry*>& trace = mdr->dn[1];
- if (trace.empty() ||
- trace.back()->is_auth() ||
- trace.back()->lock.can_read(mdr->get_client())) {
- dout(7) << "target is a dir, failing..." << dendl;
- respond_to_request(mdr, -EINVAL);
+
+ if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
+ MutationImpl::LockOpVec lov;
+ lov.add_xlock(&targeti->snaplock);
+ lov.add_xlock(&targeti->linklock);
+
+ if (!mds->locker->acquire_locks(mdr, lov))
return;
- }
- }
- xlocks.insert(&targeti->linklock);
+ mdr->locking_state |= MutationImpl::ALL_LOCKED;
+ }
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
- return;
+ if (targeti->get_projected_inode()->nlink == 0) {
+ dout(7) << "target has no link, failing..." << dendl;
+ respond_to_request(mdr, -ENOENT);
+ }
if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
if (!check_access(mdr, targeti, MAY_WRITE))
}
// go!
- assert(g_conf->mds_kill_link_at != 1);
+ ceph_assert(g_conf()->mds_kill_link_at != 1);
// local or remote?
if (targeti->is_auth())
- _link_local(mdr, dn, targeti);
+ _link_local(mdr, destdn, targeti);
else
- _link_remote(mdr, true, dn, targeti);
+ _link_remote(mdr, true, destdn, targeti);
+ mds->balancer->maybe_fragment(dir, false);
}
CInode *targeti;
version_t dnpv;
version_t tipv;
+ bool adjust_realm;
public:
C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
- version_t dnpv_, version_t tipv_) :
+ version_t dnpv_, version_t tipv_, bool ar) :
ServerLogContext(s, r), dn(d), targeti(ti),
- dnpv(dnpv_), tipv(tipv_) { }
+ dnpv(dnpv_), tipv(tipv_), adjust_realm(ar) { }
void finish(int r) override {
- assert(r == 0);
- server->_link_local_finish(mdr, dn, targeti, dnpv, tipv);
+ ceph_assert(r == 0);
+ server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, adjust_realm);
}
};
// project inode update
auto &pi = targeti->project_inode();
pi.inode.nlink++;
- pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
+ pi.inode.ctime = mdr->get_op_stamp();
+ if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
+ pi.inode.rstat.rctime = mdr->get_op_stamp();
pi.inode.change_attr++;
pi.inode.version = tipv;
+ bool adjust_realm = false;
+ if (!targeti->is_projected_snaprealm_global()) {
+ sr_t *newsnap = targeti->project_snaprealm();
+ targeti->mark_snaprealm_global(newsnap);
+ targeti->record_snaprealm_parent_dentry(newsnap, NULL, targeti->get_projected_parent_dn(), true);
+ adjust_realm = true;
+ }
+
// log + wait
EUpdate *le = new EUpdate(mdlog, "link_local");
mdlog->start_entry(le);
// do this after predirty_*, to avoid funky extra dnl arg
dn->push_projected_linkage(targeti->ino(), targeti->d_type());
- journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv));
+ journal_and_reply(mdr, targeti, dn, le,
+ new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv, adjust_realm));
}
void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
- version_t dnpv, version_t tipv)
+ version_t dnpv, version_t tipv, bool adjust_realm)
{
dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
MDRequestRef null_ref;
mdcache->send_dentry_link(dn, null_ref);
+ if (adjust_realm) {
+ int op = CEPH_SNAP_OP_SPLIT;
+ mds->mdcache->send_snap_update(targeti, 0, op);
+ mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
+ }
+
// bump target popularity
- utime_t now = ceph_clock_now();
- mds->balancer->hit_inode(now, targeti, META_POP_IWR);
- mds->balancer->hit_dir(now, dn->get_dir(), META_POP_IWR);
+ mds->balancer->hit_inode(targeti, META_POP_IWR);
+ mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
// reply
respond_to_request(mdr, 0);
ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
dpv(d->get_projected_version()) {}
void finish(int r) override {
- assert(r == 0);
+ ceph_assert(r == 0);
server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
}
};
op = MMDSSlaveRequest::OP_LINKPREP;
else
op = MMDSSlaveRequest::OP_UNLINKPREP;
- MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, op);
+ auto req = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, op);
targeti->set_object_info(req->get_object_info());
req->op_stamp = mdr->get_op_stamp();
+ if (auto& desti_srnode = mdr->more()->desti_srnode)
+ encode(*desti_srnode, req->desti_snapbl);
mds->send_message_mds(req, linkauth);
- assert(mdr->more()->waiting_on_slave.count(linkauth) == 0);
+ ceph_assert(mdr->more()->waiting_on_slave.count(linkauth) == 0);
mdr->more()->waiting_on_slave.insert(linkauth);
return;
}
dout(10) << " targeti auth has prepared nlink++/--" << dendl;
- assert(g_conf->mds_kill_link_at != 2);
+ ceph_assert(g_conf()->mds_kill_link_at != 2);
+
+ if (auto& desti_srnode = mdr->more()->desti_srnode) {
+ delete desti_srnode;
+ desti_srnode = NULL;
+ }
mdr->set_mds_stamp(ceph_clock_now());
dn->push_projected_linkage();
}
- journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
+ journal_and_reply(mdr, (inc ? targeti : nullptr), dn, le,
+ new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
}
void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
<< (inc ? "link ":"unlink ")
<< *dn << " to " << *targeti << dendl;
- assert(g_conf->mds_kill_link_at != 3);
+ ceph_assert(g_conf()->mds_kill_link_at != 3);
if (!mdr->more()->witnessed.empty())
mdcache->logged_master_update(mdr->reqid);
mdcache->send_dentry_unlink(dn, NULL, null_ref);
// bump target popularity
- utime_t now = ceph_clock_now();
- mds->balancer->hit_inode(now, targeti, META_POP_IWR);
- mds->balancer->hit_dir(now, dn->get_dir(), META_POP_IWR);
+ mds->balancer->hit_inode(targeti, META_POP_IWR);
+ mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
// reply
respond_to_request(mdr, 0);
class C_MDS_SlaveLinkPrep : public ServerLogContext {
CInode *targeti;
+ bool adjust_realm;
public:
- C_MDS_SlaveLinkPrep(Server *s, MDRequestRef& r, CInode *t) :
- ServerLogContext(s, r), targeti(t) { }
+ C_MDS_SlaveLinkPrep(Server *s, MDRequestRef& r, CInode *t, bool ar) :
+ ServerLogContext(s, r), targeti(t), adjust_realm(ar) { }
void finish(int r) override {
- assert(r == 0);
- server->_logged_slave_link(mdr, targeti);
+ ceph_assert(r == 0);
+ server->_logged_slave_link(mdr, targeti, adjust_realm);
}
};
}
};
-/* This function DOES put the mdr->slave_request before returning*/
void Server::handle_slave_link_prep(MDRequestRef& mdr)
{
dout(10) << "handle_slave_link_prep " << *mdr
<< " on " << mdr->slave_request->get_object_info()
<< dendl;
- assert(g_conf->mds_kill_link_at != 4);
+ ceph_assert(g_conf()->mds_kill_link_at != 4);
CInode *targeti = mdcache->get_inode(mdr->slave_request->get_object_info().ino);
- assert(targeti);
+ ceph_assert(targeti);
dout(10) << "targeti " << *targeti << dendl;
CDentry *dn = targeti->get_parent_dn();
CDentry::linkage_t *dnl = dn->get_linkage();
- assert(dnl->is_primary());
+ ceph_assert(dnl->is_primary());
mdr->set_op_stamp(mdr->slave_request->op_stamp);
mdr->auth_pin(targeti);
//ceph_abort(); // test hack: make sure master can handle a slave that fails to prepare...
- assert(g_conf->mds_kill_link_at != 5);
+ ceph_assert(g_conf()->mds_kill_link_at != 5);
// journal it
mdr->ls = mdlog->get_current_segment();
// update journaled target inode
bool inc;
+ bool adjust_realm = false;
+ bool realm_projected = false;
if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) {
inc = true;
pi.inode.nlink++;
+ if (!targeti->is_projected_snaprealm_global()) {
+ sr_t *newsnap = targeti->project_snaprealm();
+ targeti->mark_snaprealm_global(newsnap);
+ targeti->record_snaprealm_parent_dentry(newsnap, NULL, targeti->get_projected_parent_dn(), true);
+ adjust_realm = true;
+ realm_projected = true;
+ }
} else {
inc = false;
pi.inode.nlink--;
+ if (targeti->is_projected_snaprealm_global()) {
+ ceph_assert(mdr->slave_request->desti_snapbl.length());
+ auto p = mdr->slave_request->desti_snapbl.cbegin();
+
+ sr_t *newsnap = targeti->project_snaprealm();
+ decode(*newsnap, p);
+
+ if (pi.inode.nlink == 0)
+ ceph_assert(!newsnap->is_parent_global());
+
+ realm_projected = true;
+ } else {
+ ceph_assert(mdr->slave_request->desti_snapbl.length() == 0);
+ }
}
link_rollback rollback;
rollback.old_dir_mtime = pf->fragstat.mtime;
rollback.old_dir_rctime = pf->rstat.rctime;
rollback.was_inc = inc;
- ::encode(rollback, le->rollback);
+ if (realm_projected) {
+ if (targeti->snaprealm) {
+ encode(true, rollback.snapbl);
+ targeti->encode_snap_blob(rollback.snapbl);
+ } else {
+ encode(false, rollback.snapbl);
+ }
+ }
+ encode(rollback, le->rollback);
mdr->more()->rollback_bl = le->rollback;
pi.inode.ctime = mdr->get_op_stamp();
// commit case
mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
+ mdcache->add_uncommitted_slave(mdr->reqid, mdr->ls, mdr->slave_to_mds);
// set up commit waiter
mdr->more()->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti);
mdr->more()->slave_update_journaled = true;
- submit_mdlog_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti),
+ submit_mdlog_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti, adjust_realm),
mdr, __func__);
mdlog->flush();
}
-void Server::_logged_slave_link(MDRequestRef& mdr, CInode *targeti)
+void Server::_logged_slave_link(MDRequestRef& mdr, CInode *targeti, bool adjust_realm)
{
dout(10) << "_logged_slave_link " << *mdr
<< " " << *targeti << dendl;
- assert(g_conf->mds_kill_link_at != 6);
+ ceph_assert(g_conf()->mds_kill_link_at != 6);
// update the target
targeti->pop_and_dirty_projected_inode(mdr->ls);
mdr->apply();
// hit pop
- utime_t now = ceph_clock_now();
- mds->balancer->hit_inode(now, targeti, META_POP_IWR);
+ mds->balancer->hit_inode(targeti, META_POP_IWR);
// done.
- mdr->slave_request->put();
- mdr->slave_request = 0;
+ mdr->reset_slave_request();
+
+ if (adjust_realm) {
+ int op = CEPH_SNAP_OP_SPLIT;
+ mds->mdcache->send_snap_update(targeti, 0, op);
+ mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
+ }
// ack
if (!mdr->aborted) {
- MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
- MMDSSlaveRequest::OP_LINKPREPACK);
+ auto reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_LINKPREPACK);
mds->send_message_mds(reply, mdr->slave_to_mds);
} else {
dout(10) << " abort flag set, finishing" << dendl;
<< " r=" << r
<< " " << *targeti << dendl;
- assert(g_conf->mds_kill_link_at != 7);
+ ceph_assert(g_conf()->mds_kill_link_at != 7);
if (r == 0) {
// drop our pins, etc.
{
dout(10) << "_committed_slave " << *mdr << dendl;
- assert(g_conf->mds_kill_link_at != 8);
+ ceph_assert(g_conf()->mds_kill_link_at != 8);
- MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
- MMDSSlaveRequest::OP_COMMITTED);
+ bool assert_exist = mdr->more()->slave_update_journaled;
+ mdcache->finish_uncommitted_slave(mdr->reqid, assert_exist);
+ auto req = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_COMMITTED);
mds->send_message_mds(req, mdr->slave_to_mds);
mdcache->request_finish(mdr);
}
struct C_MDS_LoggedLinkRollback : public ServerLogContext {
MutationRef mut;
- C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r) : ServerLogContext(s, r), mut(m) {}
+ map<client_t,ref_t<MClientSnap>> splits;
+ C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r,
+ map<client_t,ref_t<MClientSnap>>&& _splits) :
+ ServerLogContext(s, r), mut(m), splits(std::move(_splits)) {
+ }
void finish(int r) override {
- server->_link_rollback_finish(mut, mdr);
+ server->_link_rollback_finish(mut, mdr, splits);
}
};
void Server::do_link_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
{
link_rollback rollback;
- bufferlist::iterator p = rbl.begin();
- ::decode(rollback, p);
+ auto p = rbl.cbegin();
+ decode(rollback, p);
dout(10) << "do_link_rollback on " << rollback.reqid
<< (rollback.was_inc ? " inc":" dec")
<< " ino " << rollback.ino
<< dendl;
- assert(g_conf->mds_kill_link_at != 9);
+ ceph_assert(g_conf()->mds_kill_link_at != 9);
mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
- assert(mdr || mds->is_resolve());
+ ceph_assert(mdr || mds->is_resolve());
MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
mut->ls = mds->mdlog->get_current_segment();
CInode *in = mdcache->get_inode(rollback.ino);
- assert(in);
+ ceph_assert(in);
dout(10) << " target is " << *in << dendl;
- assert(!in->is_projected()); // live slave request hold versionlock xlock.
+ ceph_assert(!in->is_projected()); // live slave request holds versionlock xlock.
auto &pi = in->project_inode();
pi.inode.version = in->pre_dirty();
}
// inode
- pi.inode.ctime = pi.inode.rstat.rctime = rollback.old_ctime;
+ pi.inode.ctime = rollback.old_ctime;
if (rollback.was_inc)
pi.inode.nlink--;
else
pi.inode.nlink++;
+ map<client_t,ref_t<MClientSnap>> splits;
+ if (rollback.snapbl.length() && in->snaprealm) {
+ bool hadrealm;
+ auto p = rollback.snapbl.cbegin();
+ decode(hadrealm, p);
+ if (hadrealm) {
+ if (!mds->is_resolve()) {
+ sr_t *new_srnode = new sr_t();
+ decode(*new_srnode, p);
+ in->project_snaprealm(new_srnode);
+ } else {
+ decode(in->snaprealm->srnode, p);
+ }
+ } else {
+ SnapRealm *realm = parent->get_inode()->find_snaprealm();
+ if (!mds->is_resolve())
+ mdcache->prepare_realm_merge(in->snaprealm, realm, splits);
+ in->project_snaprealm(NULL);
+ }
+ }
+
// journal it
ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_rollback", rollback.reqid, master,
ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::LINK);
le->commit.add_dir(parent, true);
le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);
- submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr),
+ submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr, std::move(splits)),
mdr, __func__);
mdlog->flush();
}
-void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr)
+void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr,
+ map<client_t,ref_t<MClientSnap>>& splits)
{
dout(10) << "_link_rollback_finish" << dendl;
- assert(g_conf->mds_kill_link_at != 10);
+ ceph_assert(g_conf()->mds_kill_link_at != 10);
mut->apply();
+
+ if (!mds->is_resolve())
+ mdcache->send_snaps(splits);
+
if (mdr)
mdcache->request_finish(mdr);
- mdcache->finish_rollback(mut->reqid);
+ mdcache->finish_rollback(mut->reqid, mdr);
mut->cleanup();
}
-/* This function DOES NOT put the passed message before returning*/
-void Server::handle_slave_link_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *m)
+void Server::handle_slave_link_prep_ack(MDRequestRef& mdr, const cref_t<MMDSSlaveRequest> &m)
{
dout(10) << "handle_slave_link_prep_ack " << *mdr
<< " " << *m << dendl;
mds_rank_t from = mds_rank_t(m->get_source().num());
- assert(g_conf->mds_kill_link_at != 11);
+ ceph_assert(g_conf()->mds_kill_link_at != 11);
// note slave
mdr->more()->slaves.insert(from);
// witnessed!
- assert(mdr->more()->witnessed.count(from) == 0);
+ ceph_assert(mdr->more()->witnessed.count(from) == 0);
mdr->more()->witnessed.insert(from);
- assert(!m->is_not_journaled());
+ ceph_assert(!m->is_not_journaled());
mdr->more()->has_journaled_slaves = true;
// remove from waiting list
- assert(mdr->more()->waiting_on_slave.count(from));
+ ceph_assert(mdr->more()->waiting_on_slave.count(from));
mdr->more()->waiting_on_slave.erase(from);
- assert(mdr->more()->waiting_on_slave.empty());
+ ceph_assert(mdr->more()->waiting_on_slave.empty());
dispatch_client_request(mdr); // go again!
}
void Server::handle_client_unlink(MDRequestRef& mdr)
{
- MClientRequest *req = mdr->client_request;
+ const cref_t<MClientRequest> &req = mdr->client_request;
client_t client = mdr->get_client();
// rmdir or unlink?
- bool rmdir = false;
- if (req->get_op() == CEPH_MDS_OP_RMDIR) rmdir = true;
-
- if (req->get_filepath().depth() == 0) {
- respond_to_request(mdr, -EINVAL);
- return;
- }
-
- // traverse to path
- vector<CDentry*> trace;
- CInode *in;
- int r = mdcache->path_traverse(mdr, NULL, NULL, req->get_filepath(), &trace, &in, MDS_TRAVERSE_FORWARD);
- if (r > 0) return;
- if (r < 0) {
- if (r == -ESTALE) {
- dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
- mdcache->find_ino_peers(req->get_filepath().get_ino(), new C_MDS_TryFindInode(this, mdr));
- return;
- }
- respond_to_request(mdr, r);
- return;
- }
- if (mdr->snapid != CEPH_NOSNAP) {
- respond_to_request(mdr, -EROFS);
- return;
- }
+ bool rmdir = (req->get_op() == CEPH_MDS_OP_RMDIR);
- CDentry *dn = trace[trace.size()-1];
- assert(dn);
- if (!dn->is_auth()) {
- mdcache->request_forward(mdr, dn->authority().first);
+ if (rmdir)
+ mdr->disable_lock_cache();
+ CDentry *dn = rdlock_path_xlock_dentry(mdr, false, true);
+ if (!dn)
return;
- }
-
- CInode *diri = dn->get_dir()->get_inode();
CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
- assert(!dnl->is_null());
+ ceph_assert(!dnl->is_null());
+ CInode *in = dnl->get_inode();
if (rmdir) {
dout(7) << "handle_client_rmdir on " << *dn << dendl;
}
}
+ CInode *diri = dn->get_dir()->get_inode();
+ if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
+ if (!check_access(mdr, diri, MAY_WRITE))
+ return;
+ }
+
// -- create stray dentry? --
CDentry *straydn = NULL;
if (dnl->is_primary()) {
}
// lock
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
-
- for (int i=0; i<(int)trace.size()-1; i++)
- rdlocks.insert(&trace[i]->lock);
- xlocks.insert(&dn->lock);
- wrlocks.insert(&diri->filelock);
- wrlocks.insert(&diri->nestlock);
- xlocks.insert(&in->linklock);
- if (straydn) {
- wrlocks.insert(&straydn->get_dir()->inode->filelock);
- wrlocks.insert(&straydn->get_dir()->inode->nestlock);
- xlocks.insert(&straydn->lock);
- }
- if (in->is_dir())
- rdlocks.insert(&in->filelock); // to verify it's empty
- mds->locker->include_snap_rdlocks(rdlocks, dnl->get_inode());
+ if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
+ MutationImpl::LockOpVec lov;
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
- return;
+ lov.add_xlock(&in->linklock);
+ lov.add_xlock(&in->snaplock);
+ if (in->is_dir())
+ lov.add_rdlock(&in->filelock); // to verify it's empty
+
+ if (straydn) {
+ lov.add_wrlock(&straydn->get_dir()->inode->filelock);
+ lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
+ lov.add_xlock(&straydn->lock);
+ }
+
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ mdr->locking_state |= MutationImpl::ALL_LOCKED;
+ }
if (in->is_dir() &&
_dir_is_nonempty(mdr, in)) {
return;
}
- if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
- if (!check_access(mdr, diri, MAY_WRITE))
- return;
+ if (straydn)
+ straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+
+ if (!mdr->more()->desti_srnode) {
+ if (in->is_projected_snaprealm_global()) {
+ sr_t *new_srnode = in->prepare_new_srnode(0);
+ in->record_snaprealm_parent_dentry(new_srnode, NULL, dn, dnl->is_primary());
+ // dropping the last linkage or dropping the last remote linkage,
+ // detach the inode from the global snaprealm
+ auto nlink = in->get_projected_inode()->nlink;
+ if (nlink == 1 ||
+ (nlink == 2 && !dnl->is_primary() &&
+ !in->get_projected_parent_dir()->inode->is_stray()))
+ in->clear_snaprealm_global(new_srnode);
+ mdr->more()->desti_srnode = new_srnode;
+ } else if (dnl->is_primary()) {
+ // prepare snaprealm blob for slave request
+ SnapRealm *realm = in->find_snaprealm();
+ snapid_t follows = realm->get_newest_seq();
+ if (in->snaprealm || follows + 1 > in->get_oldest_snap()) {
+ sr_t *new_srnode = in->prepare_new_srnode(follows);
+ in->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
+ mdr->more()->desti_srnode = new_srnode;
+ }
+ }
}
// yay!
} else if (mdr->more()->waiting_on_slave.count(*p)) {
dout(10) << " already waiting on witness mds." << *p << dendl;
} else {
- if (!_rmdir_prepare_witness(mdr, *p, trace, straydn))
+ if (!_rmdir_prepare_witness(mdr, *p, mdr->dn[0], straydn))
return;
}
}
return; // we're waiting for a witness.
}
+ if (!rmdir && dnl->is_primary() && mdr->dn[0].size() == 1)
+ mds->locker->create_lock_cache(mdr, diri);
+
// ok!
if (dnl->is_remote() && !dnl->get_inode()->is_auth())
_link_remote(mdr, false, dn, dnl->get_inode());
ServerLogContext(s, r), dn(d), straydn(sd),
dnpv(d->get_projected_version()) {}
void finish(int r) override {
- assert(r == 0);
+ ceph_assert(r == 0);
server->_unlink_local_finish(mdr, dn, straydn, dnpv);
}
};
CDentry::linkage_t *dnl = dn->get_projected_linkage();
CInode *in = dnl->get_inode();
- SnapRealm *realm = in->find_snaprealm();
- snapid_t follows = realm->get_newest_seq();
// ok, let's do it.
mdr->ls = mdlog->get_current_segment();
}
if (straydn) {
- assert(dnl->is_primary());
+ ceph_assert(dnl->is_primary());
straydn->push_projected_linkage(in);
- straydn->first = follows + 1;
}
// the unlinked dentry
{
std::string t;
dn->make_path_string(t, true);
- pi.inode.stray_prior_path = mempool::mds_co::string(boost::string_view(t));
+ pi.inode.stray_prior_path = std::move(t);
}
- mdr->add_projected_inode(in); // do this _after_ my dn->pre_dirty().. we apply that one manually.
pi.inode.version = in->pre_dirty();
- pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
+ pi.inode.ctime = mdr->get_op_stamp();
+ if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
+ pi.inode.rstat.rctime = mdr->get_op_stamp();
pi.inode.change_attr++;
pi.inode.nlink--;
if (pi.inode.nlink == 0)
in->state_set(CInode::STATE_ORPHAN);
- if (dnl->is_primary()) {
+ if (mdr->more()->desti_srnode) {
+ auto& desti_srnode = mdr->more()->desti_srnode;
+ in->project_snaprealm(desti_srnode);
+ desti_srnode = NULL;
+ }
+
+ if (straydn) {
+ // will manually pop projected inode
+
// primary link. add stray dentry.
- assert(straydn);
mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
- // project snaprealm, too
- if (in->snaprealm || follows + 1 > in->get_oldest_snap())
- in->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
-
pi.inode.update_backtrace();
le->metablob.add_primary_dentry(straydn, in, true, true);
} else {
+ mdr->add_projected_inode(in);
// remote link. update remote inode.
mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
dn->push_projected_linkage();
+ if (straydn) {
+ ceph_assert(in->first <= straydn->first);
+ in->first = straydn->first;
+ }
+
if (in->is_dir()) {
- assert(straydn);
+ ceph_assert(straydn);
mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
-
- in->maybe_export_pin(true);
}
journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
if (!mdr->more()->witnessed.empty())
mdcache->logged_master_update(mdr->reqid);
+ CInode *strayin = NULL;
+ bool hadrealm = false;
+ if (straydn) {
+ // if there is a newly created snaprealm, we need to split the old snaprealm's
+ // inodes_with_caps. So pop the snaprealm before the linkage changes.
+ strayin = dn->get_linkage()->get_inode();
+ hadrealm = strayin->snaprealm ? true : false;
+ strayin->early_pop_projected_snaprealm();
+ }
+
// unlink main dentry
dn->get_dir()->unlink_inode(dn);
dn->pop_projected_linkage();
// relink as stray? (i.e. was primary link?)
- CInode *strayin = NULL;
- bool snap_is_new = false;
if (straydn) {
dout(20) << " straydn is " << *straydn << dendl;
- CDentry::linkage_t *straydnl = straydn->pop_projected_linkage();
- strayin = straydnl->get_inode();
+ straydn->pop_projected_linkage();
+
+ strayin->pop_and_dirty_projected_inode(mdr->ls);
- snap_is_new = strayin->snaprealm ? true : false;
mdcache->touch_dentry_bottom(straydn);
}
dn->mark_dirty(dnpv, mdr->ls);
mdr->apply();
-
- if (snap_is_new) //only new if strayin exists
- mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, true);
mdcache->send_dentry_unlink(dn, straydn, mdr);
- // update subtree map?
- if (straydn && strayin->is_dir())
- mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
+ if (straydn) {
+ // update subtree map?
+ if (strayin->is_dir())
+ mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
+
+ if (strayin->snaprealm && !hadrealm)
+ mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, false);
+ }
// bump pop
- utime_t now = ceph_clock_now();
- mds->balancer->hit_dir(now, dn->get_dir(), META_POP_IWR);
+ mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
// reply
respond_to_request(mdr, 0);
}
dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
- MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
- MMDSSlaveRequest::OP_RMDIRPREP);
+ auto req = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RMDIRPREP);
req->srcdnpath = filepath(trace.front()->get_dir()->ino());
for (auto dn : trace)
req->srcdnpath.push_dentry(dn->get_name());
- mdcache->replicate_stray(straydn, who, req->stray);
+ mdcache->encode_replica_stray(straydn, who, req->straybl);
+ if (mdr->more()->desti_srnode)
+ encode(*mdr->more()->desti_srnode, req->desti_snapbl);
req->op_stamp = mdr->get_op_stamp();
mds->send_message_mds(req, who);
- assert(mdr->more()->waiting_on_slave.count(who) == 0);
+ ceph_assert(mdr->more()->waiting_on_slave.count(who) == 0);
mdr->more()->waiting_on_slave.insert(who);
return true;
}
filepath srcpath(mdr->slave_request->srcdnpath);
dout(10) << " src " << srcpath << dendl;
CInode *in;
- int r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &trace, &in, MDS_TRAVERSE_DISCOVERXLOCK);
+ CF_MDS_MDRContextFactory cf(mdcache, mdr, false);
+ int r = mdcache->path_traverse(mdr, cf, srcpath,
+ MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
+ &trace, &in);
if (r > 0) return;
if (r == -ESTALE) {
mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
- mdr->slave_to_mds);
+ mdr->slave_to_mds, true);
return;
}
- assert(r == 0);
- CDentry *dn = trace[trace.size()-1];
+ ceph_assert(r == 0);
+ CDentry *dn = trace.back();
dout(10) << " dn " << *dn << dendl;
mdr->pin(dn);
- assert(mdr->straydn);
+ ceph_assert(mdr->straydn);
CDentry *straydn = mdr->straydn;
dout(10) << " straydn " << *straydn << dendl;
rmdir_rollback rollback;
rollback.reqid = mdr->reqid;
rollback.src_dir = dn->get_dir()->dirfrag();
- rollback.src_dname = std::string(dn->get_name());
+ rollback.src_dname = dn->get_name();
rollback.dest_dir = straydn->get_dir()->dirfrag();
- rollback.dest_dname = std::string(straydn->get_name());
- ::encode(rollback, mdr->more()->rollback_bl);
+ rollback.dest_dname = straydn->get_name();
+ if (mdr->slave_request->desti_snapbl.length()) {
+ if (in->snaprealm) {
+ encode(true, rollback.snapbl);
+ in->encode_snap_blob(rollback.snapbl);
+ } else {
+ encode(false, rollback.snapbl);
+ }
+ }
+ encode(rollback, mdr->more()->rollback_bl);
+ // FIXME: rollback snaprealm
dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
// set up commit waiter
mdr->more()->slave_commit = new C_MDS_SlaveRmdirCommit(this, mdr, straydn);
- if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
- dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
- dn->get_dir()->unlink_inode(dn);
- straydn->get_dir()->link_primary_inode(straydn, in);
-
- assert(straydn->first >= in->first);
- in->first = straydn->first;
-
- mdcache->adjust_subtree_after_rename(in, dn->get_dir(), false);
-
- MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
- MMDSSlaveRequest::OP_RMDIRPREPACK);
- reply->mark_not_journaled();
- mds->send_message_mds(reply, mdr->slave_to_mds);
-
- // send caps to auth (if we're not already)
- if (in->is_any_caps() && !in->state_test(CInode::STATE_EXPORTINGCAPS))
- mdcache->migrator->export_caps(in);
+ straydn->push_projected_linkage(in);
+ dn->push_projected_linkage();
- mdcache->touch_dentry_bottom(straydn); // move stray to end of lru
+ ceph_assert(straydn->first >= in->first);
+ in->first = straydn->first;
- mdr->slave_request->put();
- mdr->slave_request = 0;
- mdr->straydn = 0;
+ if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
+ dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
+ _logged_slave_rmdir(mdr, dn, straydn);
return;
}
- straydn->push_projected_linkage(in);
- dn->push_projected_linkage();
-
+ mdr->ls = mdlog->get_current_segment();
ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir", mdr->reqid, mdr->slave_to_mds,
ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RMDIR);
mdlog->start_entry(le);
le->commit.renamed_dirino = in->ino();
mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
+ mdcache->add_uncommitted_slave(mdr->reqid, mdr->ls, mdr->slave_to_mds);
mdr->more()->slave_update_journaled = true;
submit_mdlog_entry(le, new C_MDS_SlaveRmdirPrep(this, mdr, dn, straydn),
void Server::_logged_slave_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
{
dout(10) << "_logged_slave_rmdir " << *mdr << " on " << *dn << dendl;
+ CInode *in = dn->get_linkage()->get_inode();
+
+ bool new_realm;
+ if (mdr->slave_request->desti_snapbl.length()) {
+ new_realm = !in->snaprealm;
+ in->decode_snap_blob(mdr->slave_request->desti_snapbl);
+ ceph_assert(in->snaprealm);
+ ceph_assert(in->snaprealm->have_past_parents_open());
+ } else {
+ new_realm = false;
+ }
// update our cache now, so we are consistent with what is in the journal
// when we journal a subtree map
- CInode *in = dn->get_linkage()->get_inode();
dn->get_dir()->unlink_inode(dn);
straydn->pop_projected_linkage();
dn->pop_projected_linkage();
- mdcache->adjust_subtree_after_rename(in, dn->get_dir(), true);
+
+ mdcache->adjust_subtree_after_rename(in, dn->get_dir(), mdr->more()->slave_update_journaled);
+
+ if (new_realm)
+ mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
// done.
- mdr->slave_request->put();
- mdr->slave_request = 0;
+ mdr->reset_slave_request();
mdr->straydn = 0;
if (!mdr->aborted) {
- MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
- MMDSSlaveRequest::OP_RMDIRPREPACK);
+ auto reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RMDIRPREPACK);
+ if (!mdr->more()->slave_update_journaled)
+ reply->mark_not_journaled();
mds->send_message_mds(reply, mdr->slave_to_mds);
} else {
dout(10) << " abort flag set, finishing" << dendl;
}
}
-void Server::handle_slave_rmdir_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
+void Server::handle_slave_rmdir_prep_ack(MDRequestRef& mdr, const cref_t<MMDSSlaveRequest> &ack)
{
dout(10) << "handle_slave_rmdir_prep_ack " << *mdr
<< " " << *ack << dendl;
mdr->more()->has_journaled_slaves = true;
// remove from waiting list
- assert(mdr->more()->waiting_on_slave.count(from));
+ ceph_assert(mdr->more()->waiting_on_slave.count(from));
mdr->more()->waiting_on_slave.erase(from);
if (mdr->more()->waiting_on_slave.empty())
void Server::_commit_slave_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
{
dout(10) << "_commit_slave_rmdir " << *mdr << " r=" << r << dendl;
-
+
if (r == 0) {
if (mdr->more()->slave_update_journaled) {
CInode *strayin = straydn->get_projected_linkage()->get_inode();
// the file system are taking place here, so there is no Mutation.
rmdir_rollback rollback;
- bufferlist::iterator p = rbl.begin();
- ::decode(rollback, p);
+ auto p = rbl.cbegin();
+ decode(rollback, p);
dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
- assert(mdr || mds->is_resolve());
+ ceph_assert(mdr || mds->is_resolve());
CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
if (!dir)
dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
- assert(dir);
+ ceph_assert(dir);
CDentry *dn = dir->lookup(rollback.src_dname);
- assert(dn);
+ ceph_assert(dn);
dout(10) << " dn " << *dn << dendl;
- dir = mdcache->get_dirfrag(rollback.dest_dir);
- assert(dir);
- CDentry *straydn = dir->lookup(rollback.dest_dname);
- assert(straydn);
- dout(10) << " straydn " << *dn << dendl;
+ CDir *straydir = mdcache->get_dirfrag(rollback.dest_dir);
+ ceph_assert(straydir);
+ CDentry *straydn = straydir->lookup(rollback.dest_dname);
+ ceph_assert(straydn);
+ dout(10) << " straydn " << *straydn << dendl;
CInode *in = straydn->get_linkage()->get_inode();
- if (mdr && !mdr->more()->slave_update_journaled) {
- assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
+ dn->push_projected_linkage(in);
+ straydn->push_projected_linkage();
- straydn->get_dir()->unlink_inode(straydn);
- dn->get_dir()->link_primary_inode(dn, in);
+ if (rollback.snapbl.length() && in->snaprealm) {
+ bool hadrealm;
+ auto p = rollback.snapbl.cbegin();
+ decode(hadrealm, p);
+ if (hadrealm) {
+ decode(in->snaprealm->srnode, p);
+ } else {
+ in->snaprealm->merge_to(dir->get_inode()->find_snaprealm());
+ }
+ }
- mdcache->adjust_subtree_after_rename(in, straydn->get_dir(), false);
+ if (mdr && !mdr->more()->slave_update_journaled) {
+ ceph_assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
- mdcache->request_finish(mdr);
- mdcache->finish_rollback(rollback.reqid);
+ _rmdir_rollback_finish(mdr, rollback.reqid, dn, straydn);
return;
}
- dn->push_projected_linkage(in);
- straydn->push_projected_linkage();
ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_rollback", rollback.reqid, master,
ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RMDIR);
straydn->pop_projected_linkage();
CInode *in = dn->get_linkage()->get_inode();
- mdcache->adjust_subtree_after_rename(in, straydn->get_dir(), true);
+ mdcache->adjust_subtree_after_rename(in, straydn->get_dir(),
+ !mdr || mdr->more()->slave_update_journaled);
+
if (mds->is_resolve()) {
CDir *root = mdcache->get_subtree_root(straydn->get_dir());
mdcache->try_trim_non_auth_subtree(root);
if (mdr)
mdcache->request_finish(mdr);
- mdcache->finish_rollback(reqid);
+ mdcache->finish_rollback(reqid, mdr);
}
bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
{
dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
- assert(in->is_auth());
+ ceph_assert(in->is_auth());
+ if (in->filelock.is_cached())
+ return false; // there can be pending async create/unlink. don't know.
if (in->snaprealm && in->snaprealm->srnode.snaps.size())
return true; // in a snapshot!
- list<CDir*> ls;
- in->get_dirfrags(ls);
- for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
- CDir *dir = *p;
+ auto&& ls = in->get_dirfrags();
+ for (const auto& dir : ls) {
// is the frag obviously non-empty?
if (dir->is_auth()) {
if (dir->get_projected_fnode()->fragstat.size()) {
bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
{
dout(10) << "dir_is_nonempty " << *in << dendl;
- assert(in->is_auth());
- assert(in->filelock.can_read(mdr->get_client()));
+ ceph_assert(in->is_auth());
+ ceph_assert(in->filelock.can_read(mdr->get_client()));
frag_info_t dirstat;
version_t dirstat_version = in->get_projected_inode()->dirstat.version;
- list<CDir*> ls;
- in->get_dirfrags(ls);
- for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
- CDir *dir = *p;
+ auto&& ls = in->get_dirfrags();
+ for (const auto& dir : ls) {
const fnode_t *pf = dir->get_projected_fnode();
if (pf->fragstat.size()) {
dout(10) << "dir_is_nonempty dirstat has "
ServerLogContext(s, r),
srcdn(sdn), destdn(ddn), straydn(stdn) { }
void finish(int r) override {
- assert(r == 0);
+ ceph_assert(r == 0);
server->_rename_finish(mdr, srcdn, destdn, straydn);
}
};
*/
void Server::handle_client_rename(MDRequestRef& mdr)
{
- MClientRequest *req = mdr->client_request;
+ const cref_t<MClientRequest> &req = mdr->client_request;
dout(7) << "handle_client_rename " << *req << dendl;
filepath destpath = req->get_filepath();
filepath srcpath = req->get_filepath2();
- if (destpath.depth() == 0 || srcpath.depth() == 0) {
- respond_to_request(mdr, -EINVAL);
+ if (srcpath.is_last_dot_or_dotdot() || destpath.is_last_dot_or_dotdot()) {
+ respond_to_request(mdr, -EBUSY);
return;
}
- boost::string_view destname = destpath.last_dentry();
-
- vector<CDentry*>& srctrace = mdr->dn[1];
- vector<CDentry*>& desttrace = mdr->dn[0];
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
+ auto [destdn, srcdn] = rdlock_two_paths_xlock_destdn(mdr, true);
+ if (!destdn)
+ return;
- CDentry *destdn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, true, false, true);
- if (!destdn) return;
dout(10) << " destdn " << *destdn << dendl;
- if (mdr->snapid != CEPH_NOSNAP) {
- respond_to_request(mdr, -EROFS);
- return;
- }
- CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
CDir *destdir = destdn->get_dir();
- assert(destdir->is_auth());
+ ceph_assert(destdir->is_auth());
+ CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
- int r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &srctrace, NULL, MDS_TRAVERSE_DISCOVER);
- if (r > 0)
- return; // delayed
- if (r < 0) {
- if (r == -ESTALE) {
- dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
- mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
- } else {
- dout(10) << "FAIL on error " << r << dendl;
- respond_to_request(mdr, r);
- }
+ dout(10) << " srcdn " << *srcdn << dendl;
+ CDir *srcdir = srcdn->get_dir();
+ CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
+ CInode *srci = srcdnl->get_inode();
+ dout(10) << " srci " << *srci << dendl;
+
+ // -- some sanity checks --
+ if (destdn == srcdn) {
+ dout(7) << "rename src=dest, noop" << dendl;
+ respond_to_request(mdr, 0);
return;
+ }
+ // dest a child of src?
+ // e.g. mv /usr /usr/foo
+ if (srci->is_dir() && srci->is_projected_ancestor_of(destdir->get_inode())) {
+ dout(7) << "cannot rename item to be a child of itself" << dendl;
+ respond_to_request(mdr, -EINVAL);
+ return;
}
- assert(!srctrace.empty());
- CDentry *srcdn = srctrace[srctrace.size()-1];
- dout(10) << " srcdn " << *srcdn << dendl;
- if (srcdn->last != CEPH_NOSNAP) {
- respond_to_request(mdr, -EROFS);
+
+ // is this a stray migration, reintegration or merge? (sanity checks!)
+ if (mdr->reqid.name.is_mds() &&
+ !(MDS_INO_IS_STRAY(srcpath.get_ino()) &&
+ MDS_INO_IS_STRAY(destpath.get_ino())) &&
+ !(destdnl->is_remote() &&
+ destdnl->get_remote_ino() == srci->ino())) {
+ respond_to_request(mdr, -EINVAL); // actually, this won't reply, but whatev.
return;
}
- CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
- CInode *srci = srcdnl->get_inode();
- dout(10) << " srci " << *srci << dendl;
CInode *oldin = 0;
if (!destdnl->is_null()) {
return;
}
- // if srcdn is replica, need to make sure its linkage is correct
- if (srcdn->is_auth() ||
- srcdn->lock.can_read(mdr->get_client()) ||
- (srcdn->lock.is_xlocked() && srcdn->lock.get_xlock_by() == mdr)) {
- // mv /some/thing /to/some/existing_other_thing
- if (oldin->is_dir() && !srci->is_dir()) {
- respond_to_request(mdr, -EISDIR);
- return;
- }
- if (!oldin->is_dir() && srci->is_dir()) {
- respond_to_request(mdr, -ENOTDIR);
- return;
- }
- if (srci == oldin && !srcdn->get_dir()->inode->is_stray()) {
- respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
- return;
- }
+ // mv /some/thing /to/some/existing_other_thing
+ if (oldin->is_dir() && !srci->is_dir()) {
+ respond_to_request(mdr, -EISDIR);
+ return;
+ }
+ if (!oldin->is_dir() && srci->is_dir()) {
+ respond_to_request(mdr, -ENOTDIR);
+ return;
+ }
+ if (srci == oldin && !srcdir->inode->is_stray()) {
+ respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
+ return;
}
}
- // -- some sanity checks --
+ vector<CDentry*>& srctrace = mdr->dn[1];
+ vector<CDentry*>& desttrace = mdr->dn[0];
// src+dest traces _must_ share a common ancestor for locking to prevent orphans
if (destpath.get_ino() != srcpath.get_ino() &&
!(req->get_source().is_mds() &&
- MDS_INO_IS_MDSDIR(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
+ MDS_INO_IS_STRAY(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
CInode *srcbase = srctrace[0]->get_dir()->get_inode();
CInode *destbase = desttrace[0]->get_dir()->get_inode();
// ok, extend srctrace toward root until it is an ancestor of desttrace.
while (destbase != srcbase) {
CDentry *pdn = destbase->get_projected_parent_dn();
desttrace.insert(desttrace.begin(), pdn);
- rdlocks.insert(&pdn->lock);
dout(10) << "rename prepending desttrace with " << *pdn << dendl;
destbase = pdn->get_dir()->get_inode();
}
- dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
- }
-
- // src == dest?
- if (srcdn->get_dir() == destdir && srcdn->get_name() == destname) {
- dout(7) << "rename src=dest, noop" << dendl;
- respond_to_request(mdr, 0);
- return;
- }
-
- // dest a child of src?
- // e.g. mv /usr /usr/foo
- CDentry *pdn = destdir->inode->get_projected_parent_dn();
- while (pdn) {
- if (pdn == srcdn) {
- dout(7) << "cannot rename item to be a child of itself" << dendl;
- respond_to_request(mdr, -EINVAL);
- return;
- }
- pdn = pdn->get_dir()->inode->parent;
- }
-
- // is this a stray migration, reintegration or merge? (sanity checks!)
- if (mdr->reqid.name.is_mds() &&
- !(MDS_INO_IS_MDSDIR(srcpath.get_ino()) &&
- MDS_INO_IS_MDSDIR(destpath.get_ino())) &&
- !(destdnl->is_remote() &&
- destdnl->get_remote_ino() == srci->ino())) {
- respond_to_request(mdr, -EINVAL); // actually, this won't reply, but whatev.
- return;
+ dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
}
- bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
- (srcdnl->is_primary() || destdnl->is_primary()));
+
+ bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
if (linkmerge)
dout(10) << " this is a link merge" << dendl;
mdr->straydn = NULL;
}
- // -- prepare witness list --
- /*
- * NOTE: we use _all_ replicas as witnesses.
- * this probably isn't totally necessary (esp for file renames),
- * but if/when we change that, we have to make sure rejoin is
- * sufficiently robust to handle strong rejoins from survivors
- * with totally wrong dentry->inode linkage.
- * (currently, it can ignore rename effects, because the resolve
- * stage will sort them out.)
- */
- set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
- if (srcdn->is_auth())
- srcdn->list_replicas(witnesses);
- else
- witnesses.insert(srcdn->authority().first);
- if (srcdnl->is_remote() && !srci->is_auth())
- witnesses.insert(srci->authority().first);
- destdn->list_replicas(witnesses);
- if (destdnl->is_remote() && !oldin->is_auth())
- witnesses.insert(oldin->authority().first);
- dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
-
// -- locks --
- map<SimpleLock*, mds_rank_t> remote_wrlocks;
-
- // srctrace items. this mirrors locks taken in rdlock_path_xlock_dentry
- for (int i=0; i<(int)srctrace.size(); i++)
- rdlocks.insert(&srctrace[i]->lock);
- xlocks.insert(&srcdn->lock);
- mds_rank_t srcdirauth = srcdn->get_dir()->authority().first;
- if (srcdirauth != mds->get_nodeid()) {
- dout(10) << " will remote_wrlock srcdir scatterlocks on mds." << srcdirauth << dendl;
- remote_wrlocks[&srcdn->get_dir()->inode->filelock] = srcdirauth;
- remote_wrlocks[&srcdn->get_dir()->inode->nestlock] = srcdirauth;
- if (srci->is_dir())
- rdlocks.insert(&srci->dirfragtreelock);
- } else {
- wrlocks.insert(&srcdn->get_dir()->inode->filelock);
- wrlocks.insert(&srcdn->get_dir()->inode->nestlock);
- }
- mds->locker->include_snap_rdlocks(rdlocks, srcdn->get_dir()->inode);
+ if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
+ MutationImpl::LockOpVec lov;
+
+ // we need to update srci's ctime. xlock its least contended lock to do that...
+ lov.add_xlock(&srci->linklock);
+ lov.add_xlock(&srci->snaplock);
+
+ if (oldin) {
+ // xlock oldin (for nlink--)
+ lov.add_xlock(&oldin->linklock);
+ lov.add_xlock(&oldin->snaplock);
+ if (oldin->is_dir()) {
+ ceph_assert(srci->is_dir());
+ lov.add_rdlock(&oldin->filelock); // to verify it's empty
+
+ // adjust locking order?
+ int cmp = mdr->compare_paths();
+ if (cmp < 0 || (cmp == 0 && oldin->ino() < srci->ino()))
+ std::reverse(lov.begin(), lov.end());
+ } else {
+ ceph_assert(!srci->is_dir());
+	// adjust locking order
+ if (srci->ino() > oldin->ino())
+ std::reverse(lov.begin(), lov.end());
+ }
+ }
- // straydn?
- if (straydn) {
- wrlocks.insert(&straydn->get_dir()->inode->filelock);
- wrlocks.insert(&straydn->get_dir()->inode->nestlock);
- xlocks.insert(&straydn->lock);
- }
+ // straydn?
+ if (straydn) {
+ lov.add_wrlock(&straydn->get_dir()->inode->filelock);
+ lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
+ lov.add_xlock(&straydn->lock);
+ }
- // xlock versionlock on dentries if there are witnesses.
- // replicas can't see projected dentry linkages, and will get
- // confused if we try to pipeline things.
- if (!witnesses.empty()) {
- // take xlock on all projected ancestor dentries for srcdn and destdn.
- // this ensures the srcdn and destdn can be traversed to by the witnesses.
- for (int i= 0; i<(int)srctrace.size(); i++) {
- if (srctrace[i]->is_auth() && srctrace[i]->is_projected())
- xlocks.insert(&srctrace[i]->versionlock);
- }
- for (int i=0; i<(int)desttrace.size(); i++) {
- if (desttrace[i]->is_auth() && desttrace[i]->is_projected())
- xlocks.insert(&desttrace[i]->versionlock);
- }
- // xlock srci and oldin's primary dentries, so witnesses can call
- // open_remote_ino() with 'want_locked=true' when the srcdn or destdn
- // is traversed.
- if (srcdnl->is_remote())
- xlocks.insert(&srci->get_projected_parent_dn()->lock);
- if (destdnl->is_remote())
- xlocks.insert(&oldin->get_projected_parent_dn()->lock);
- }
-
- // we need to update srci's ctime. xlock its least contended lock to do that...
- xlocks.insert(&srci->linklock);
-
- // xlock oldin (for nlink--)
- if (oldin) {
- xlocks.insert(&oldin->linklock);
- if (oldin->is_dir())
- rdlocks.insert(&oldin->filelock);
- }
- if (srcdnl->is_primary() && srci->is_dir())
- // FIXME: this should happen whenever we are renamning between
- // realms, regardless of the file type
- // FIXME: If/when this changes, make sure to update the
- // "allowance" in handle_slave_rename_prep
- xlocks.insert(&srci->snaplock); // FIXME: an auth bcast could be sufficient?
- else
- rdlocks.insert(&srci->snaplock);
+ CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : nullptr;
+ if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
+ return;
- CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : NULL;
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks,
- &remote_wrlocks, auth_pin_freeze))
- return;
+ mdr->locking_state |= MutationImpl::ALL_LOCKED;
+ }
+
+ if (linkmerge)
+ ceph_assert(srcdir->inode->is_stray() && srcdnl->is_primary() && destdnl->is_remote());
if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
- if (!check_access(mdr, srcdn->get_dir()->get_inode(), MAY_WRITE))
+ if (!check_access(mdr, srcdir->get_inode(), MAY_WRITE))
return;
if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
return;
}
- /* project_past_snaprealm_parent() will do this job
+ /* project_snaprealm_past_parent() will do this job
*
// moving between snaprealms?
if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
}
*/
- assert(g_conf->mds_kill_rename_at != 1);
+ ceph_assert(g_conf()->mds_kill_rename_at != 1);
// -- open all srcdn inode frags, if any --
// we need these open so that auth can properly delegate from inode to dirfrags
dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
mdr->set_stickydirs(srci);
- list<frag_t> frags;
- srci->dirfragtree.get_leaves(frags);
- for (list<frag_t>::iterator p = frags.begin();
- p != frags.end();
- ++p) {
- CDir *dir = srci->get_dirfrag(*p);
+ frag_vec_t leaves;
+ srci->dirfragtree.get_leaves(leaves);
+ for (const auto& leaf : leaves) {
+ CDir *dir = srci->get_dirfrag(leaf);
if (!dir) {
- dout(10) << " opening " << *p << " under " << *srci << dendl;
- mdcache->open_remote_dirfrag(srci, *p, new C_MDS_RetryRequest(mdcache, mdr));
+ dout(10) << " opening " << leaf << " under " << *srci << dendl;
+ mdcache->open_remote_dirfrag(srci, leaf, new C_MDS_RetryRequest(mdcache, mdr));
return;
}
}
}
+ // -- prepare snaprealm ---
+
+ if (linkmerge) {
+ if (!mdr->more()->srci_srnode &&
+ srci->get_projected_inode()->nlink == 1 &&
+ srci->is_projected_snaprealm_global()) {
+ sr_t *new_srnode = srci->prepare_new_srnode(0);
+ srci->record_snaprealm_parent_dentry(new_srnode, NULL, destdn, false);
+
+ srci->clear_snaprealm_global(new_srnode);
+ mdr->more()->srci_srnode = new_srnode;
+ }
+ } else {
+ if (oldin && !mdr->more()->desti_srnode) {
+ if (oldin->is_projected_snaprealm_global()) {
+ sr_t *new_srnode = oldin->prepare_new_srnode(0);
+ oldin->record_snaprealm_parent_dentry(new_srnode, NULL, destdn, destdnl->is_primary());
+ // dropping the last linkage or dropping the last remote linkage,
+	// detach the inode from global snaprealm
+ auto nlink = oldin->get_projected_inode()->nlink;
+ if (nlink == 1 ||
+ (nlink == 2 && !destdnl->is_primary() &&
+ !oldin->get_projected_parent_dir()->inode->is_stray()))
+ oldin->clear_snaprealm_global(new_srnode);
+ mdr->more()->desti_srnode = new_srnode;
+ } else if (destdnl->is_primary()) {
+ SnapRealm *dest_realm = destdir->inode->find_snaprealm();
+ snapid_t follows = dest_realm->get_newest_seq();
+ if (oldin->snaprealm || follows + 1 > oldin->get_oldest_snap()) {
+ sr_t *new_srnode = oldin->prepare_new_srnode(follows);
+ oldin->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
+ mdr->more()->desti_srnode = new_srnode;
+ }
+ }
+ }
+ if (!mdr->more()->srci_srnode) {
+ SnapRealm *dest_realm = destdir->inode->find_snaprealm();
+ if (srci->is_projected_snaprealm_global()) {
+ sr_t *new_srnode = srci->prepare_new_srnode(0);
+ srci->record_snaprealm_parent_dentry(new_srnode, dest_realm, srcdn, srcdnl->is_primary());
+ mdr->more()->srci_srnode = new_srnode;
+ } else if (srcdnl->is_primary()) {
+ SnapRealm *src_realm = srcdir->inode->find_snaprealm();
+ snapid_t follows = src_realm->get_newest_seq();
+ if (src_realm != dest_realm &&
+ (srci->snaprealm || follows + 1 > srci->get_oldest_snap())) {
+ sr_t *new_srnode = srci->prepare_new_srnode(follows);
+ srci->record_snaprealm_past_parent(new_srnode, dest_realm);
+ mdr->more()->srci_srnode = new_srnode;
+ }
+ }
+ }
+ }
+
// -- prepare witnesses --
+ /*
+ * NOTE: we use _all_ replicas as witnesses.
+ * this probably isn't totally necessary (esp for file renames),
+ * but if/when we change that, we have to make sure rejoin is
+ * sufficiently robust to handle strong rejoins from survivors
+ * with totally wrong dentry->inode linkage.
+ * (currently, it can ignore rename effects, because the resolve
+ * stage will sort them out.)
+ */
+ set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
+ if (srcdn->is_auth())
+ srcdn->list_replicas(witnesses);
+ else
+ witnesses.insert(srcdn->authority().first);
+ if (srcdnl->is_remote() && !srci->is_auth())
+ witnesses.insert(srci->authority().first);
+ destdn->list_replicas(witnesses);
+ if (destdnl->is_remote() && !oldin->is_auth())
+ witnesses.insert(oldin->authority().first);
+ dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
+
+ if (!witnesses.empty()) {
+ // Replicas can't see projected dentry linkages and will get confused.
+ // We have taken snaplocks on ancestor inodes. Later rename/rmdir requests
+ // can't project these inodes' linkages.
+ bool need_flush = false;
+ for (auto& dn : srctrace) {
+ if (dn->is_projected()) {
+ need_flush = true;
+ break;
+ }
+ }
+ if (!need_flush) {
+ CDentry *dn = destdn;
+ do {
+ if (dn->is_projected()) {
+ need_flush = true;
+ break;
+ }
+ CInode *diri = dn->get_dir()->get_inode();
+ dn = diri->get_projected_parent_dn();
+ } while (dn);
+ }
+ if (need_flush) {
+ mdlog->wait_for_safe(
+ new MDSInternalContextWrapper(mds,
+ new C_MDS_RetryRequest(mdcache, mdr)));
+ mdlog->flush();
+ return;
+ }
+ }
+
// do srcdn auth last
mds_rank_t last = MDS_RANK_NONE;
if (!srcdn->is_auth()) {
// are involved in the rename operation.
if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
dout(10) << " preparing ambiguous auth for srci" << dendl;
- assert(mdr->more()->is_remote_frozen_authpin);
- assert(mdr->more()->rename_inode == srci);
+ ceph_assert(mdr->more()->is_remote_frozen_authpin);
+ ceph_assert(mdr->more()->rename_inode == srci);
_rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
return;
}
if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
dout(10) << " preparing last witness (srcdn auth)" << dendl;
- assert(mdr->more()->waiting_on_slave.count(last) == 0);
+ ceph_assert(mdr->more()->waiting_on_slave.count(last) == 0);
_rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
return;
}
// test hack: bail after slave does prepare, so we can verify it's _live_ rollback.
if (!mdr->more()->slaves.empty() && !srci->is_dir())
- assert(g_conf->mds_kill_rename_at != 3);
+ ceph_assert(g_conf()->mds_kill_rename_at != 3);
if (!mdr->more()->slaves.empty() && srci->is_dir())
- assert(g_conf->mds_kill_rename_at != 4);
+ ceph_assert(g_conf()->mds_kill_rename_at != 4);
// -- declare now --
mdr->set_mds_stamp(ceph_clock_now());
C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
journal_and_reply(mdr, srci, destdn, le, fin);
+ mds->balancer->maybe_fragment(destdn->get_dir(), false);
}
// test hack: test slave commit
if (!mdr->more()->slaves.empty() && !in->is_dir())
- assert(g_conf->mds_kill_rename_at != 5);
+ ceph_assert(g_conf()->mds_kill_rename_at != 5);
if (!mdr->more()->slaves.empty() && in->is_dir())
- assert(g_conf->mds_kill_rename_at != 6);
+ ceph_assert(g_conf()->mds_kill_rename_at != 6);
// bump popularity
- utime_t now = ceph_clock_now();
- mds->balancer->hit_dir(now, srcdn->get_dir(), META_POP_IWR);
+ mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
if (destdnl->is_remote() && in->is_auth())
- mds->balancer->hit_inode(now, in, META_POP_IWR);
+ mds->balancer->hit_inode(in, META_POP_IWR);
// did we import srci? if so, explicitly ack that import that, before we unlock and reply.
- assert(g_conf->mds_kill_rename_at != 7);
+ ceph_assert(g_conf()->mds_kill_rename_at != 7);
// reply
respond_to_request(mdr, 0);
}
dout(10) << "_rename_prepare_witness mds." << who << dendl;
- MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
- MMDSSlaveRequest::OP_RENAMEPREP);
+ auto req = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREP);
req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
for (auto dn : srctrace)
for (auto dn : dsttrace)
req->destdnpath.push_dentry(dn->get_name());
if (straydn)
- mdcache->replicate_stray(straydn, who, req->stray);
+ mdcache->encode_replica_stray(straydn, who, req->straybl);
+
+ if (mdr->more()->srci_srnode)
+ encode(*mdr->more()->srci_srnode, req->srci_snapbl);
+ if (mdr->more()->desti_srnode)
+ encode(*mdr->more()->desti_srnode, req->desti_snapbl);
req->srcdn_auth = mdr->more()->srcdn_auth_mds;
req->op_stamp = mdr->get_op_stamp();
mds->send_message_mds(req, who);
- assert(mdr->more()->waiting_on_slave.count(who) == 0);
+ ceph_assert(mdr->more()->waiting_on_slave.count(who) == 0);
mdr->more()->waiting_on_slave.insert(who);
return true;
}
CDentry::linkage_t *srcdnl = srcdn->get_linkage();
/* import node */
- bufferlist::iterator blp = mdr->more()->inode_import.begin();
+ auto blp = mdr->more()->inode_import.cbegin();
// imported caps
map<client_t,entity_inst_t> client_map;
+ map<client_t, client_metadata_t> client_metadata_map;
decode(client_map, blp);
- prepare_force_open_sessions(client_map, mdr->more()->imported_session_map);
+ decode(client_metadata_map, blp);
+ prepare_force_open_sessions(client_map, client_metadata_map,
+ mdr->more()->imported_session_map);
encode(client_map, *client_map_bl, mds->mdsmap->get_up_features());
+ encode(client_metadata_map, *client_map_bl);
list<ScatterLock*> updated_scatterlocks;
mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
bool Server::_need_force_journal(CInode *diri, bool empty)
{
- list<CDir*> ls;
- diri->get_dirfrags(ls);
+ auto&& dirs = diri->get_dirfrags();
bool force_journal = false;
if (empty) {
- for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
- if ((*p)->is_subtree_root() && (*p)->get_dir_auth().first == mds->get_nodeid()) {
- dout(10) << " frag " << (*p)->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
+ for (const auto& dir : dirs) {
+ if (dir->is_subtree_root() && dir->get_dir_auth().first == mds->get_nodeid()) {
+ dout(10) << " frag " << dir->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
force_journal = true;
break;
} else
- dout(20) << " frag " << (*p)->get_frag() << " is not auth subtree dirfrag" << dendl;
+ dout(20) << " frag " << dir->get_frag() << " is not auth subtree dirfrag" << dendl;
}
} else {
// see if any children of our frags are auth subtrees.
- list<CDir*> subtrees;
- mdcache->list_subtrees(subtrees);
- dout(10) << " subtrees " << subtrees << " frags " << ls << dendl;
- for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
- CDir *dir = *p;
- for (list<CDir*>::iterator q = subtrees.begin(); q != subtrees.end(); ++q) {
- if (dir->contains(*q)) {
- if ((*q)->get_dir_auth().first == mds->get_nodeid()) {
- dout(10) << " frag " << (*p)->get_frag() << " contains (maybe) auth subtree, will force journal "
- << **q << dendl;
+ std::vector<CDir*> subtrees;
+ mdcache->get_subtrees(subtrees);
+ dout(10) << " subtrees " << subtrees << " frags " << dirs << dendl;
+ for (const auto& dir : dirs) {
+ for (const auto& subtree : subtrees) {
+ if (dir->contains(subtree)) {
+ if (subtree->get_dir_auth().first == mds->get_nodeid()) {
+ dout(10) << " frag " << dir->get_frag() << " contains (maybe) auth subtree, will force journal "
+ << *subtree << dendl;
force_journal = true;
break;
} else
- dout(20) << " frag " << (*p)->get_frag() << " contains but isn't auth for " << **q << dendl;
+ dout(20) << " frag " << dir->get_frag() << " contains but isn't auth for " << *subtree << dendl;
} else
- dout(20) << " frag " << (*p)->get_frag() << " does not contain " << **q << dendl;
+ dout(20) << " frag " << dir->get_frag() << " does not contain " << *subtree << dendl;
}
if (force_journal)
break;
CInode *oldin = destdnl->get_inode();
// primary+remote link merge?
- bool linkmerge = (srci == destdnl->get_inode() &&
- (srcdnl->is_primary() || destdnl->is_primary()));
+ bool linkmerge = (srci == oldin);
+ if (linkmerge)
+ ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
bool silent = srcdn->get_dir()->inode->is_stray();
bool force_journal_dest = false;
// target inode
if (!linkmerge) {
if (destdnl->is_primary()) {
- assert(straydn); // moving to straydn.
+ ceph_assert(straydn); // moving to straydn.
// link--, and move.
if (destdn->is_auth()) {
auto &pi= oldin->project_inode(); //project_snaprealm
// note which dirfrags have child subtrees in the journal
// event, so that we can open those (as bounds) during replay.
if (srci->is_dir()) {
- list<CDir*> ls;
- srci->get_dirfrags(ls);
- for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
- CDir *dir = *p;
+ auto&& ls = srci->get_dirfrags();
+ for (const auto& dir : ls) {
if (!dir->is_auth())
metablob->renamed_dir_frags.push_back(dir->get_frag());
}
if (!silent) {
if (spi) {
- spi->ctime = spi->rstat.rctime = mdr->get_op_stamp();
+ spi->ctime = mdr->get_op_stamp();
+ if (mdr->get_op_stamp() > spi->rstat.rctime)
+ spi->rstat.rctime = mdr->get_op_stamp();
spi->change_attr++;
if (linkmerge)
spi->nlink--;
}
if (tpi) {
- tpi->ctime = tpi->rstat.rctime = mdr->get_op_stamp();
+ tpi->ctime = mdr->get_op_stamp();
+ if (mdr->get_op_stamp() > tpi->rstat.rctime)
+ tpi->rstat.rctime = mdr->get_op_stamp();
tpi->change_attr++;
{
std::string t;
destdn->make_path_string(t, true);
- tpi->stray_prior_path = mempool::mds_co::string(boost::string_view(t));
+ tpi->stray_prior_path = std::move(t);
}
tpi->nlink--;
if (tpi->nlink == 0)
// guarantee stray dir is processed first during journal replay. unlink the old inode,
// then link the source inode to destdn
if (destdnl->is_primary()) {
- assert(straydn);
+ ceph_assert(straydn);
if (straydn->is_auth()) {
metablob->add_dir_context(straydn->get_dir());
metablob->add_dir(straydn->get_dir(), true);
mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
(destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
if (destdnl->is_primary()) {
- assert(straydn);
+ ceph_assert(straydn);
mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
}
if (destdn->is_auth())
mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
- SnapRealm *src_realm = srci->find_snaprealm();
- SnapRealm *dest_realm = destdn->get_dir()->inode->find_snaprealm();
- snapid_t next_dest_snap = dest_realm->get_newest_seq() + 1;
-
// add it all to the metablob
// target inode
if (!linkmerge) {
if (destdnl->is_primary()) {
- assert(straydn);
+ ceph_assert(straydn);
if (destdn->is_auth()) {
// project snaprealm, too
- if (oldin->snaprealm || dest_realm->get_newest_seq() + 1 > oldin->get_oldest_snap())
- oldin->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
- straydn->first = MAX(oldin->first, next_dest_snap);
+ if (auto& desti_srnode = mdr->more()->desti_srnode) {
+ oldin->project_snaprealm(desti_srnode);
+ if (tpi->nlink == 0)
+ ceph_assert(!desti_srnode->is_parent_global());
+ desti_srnode = NULL;
+ }
+ straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
metablob->add_primary_dentry(straydn, oldin, true, true);
} else if (force_journal_stray) {
dout(10) << " forced journaling straydn " << *straydn << dendl;
}
} else if (destdnl->is_remote()) {
if (oldin->is_auth()) {
+ sr_t *new_srnode = NULL;
+ if (mdr->slave_request) {
+ if (mdr->slave_request->desti_snapbl.length() > 0) {
+ new_srnode = new sr_t();
+ auto p = mdr->slave_request->desti_snapbl.cbegin();
+ decode(*new_srnode, p);
+ }
+ } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
+ new_srnode = desti_srnode;
+ desti_srnode = NULL;
+ }
+ if (new_srnode) {
+ oldin->project_snaprealm(new_srnode);
+ if (tpi->nlink == 0)
+ ceph_assert(!new_srnode->is_parent_global());
+ }
// auth for targeti
metablob->add_dir_context(oldin->get_projected_parent_dir());
mdcache->journal_cow_dentry(mdr.get(), metablob, oldin->get_projected_parent_dn(),
// dest
if (srcdnl->is_remote()) {
- if (!linkmerge) {
- if (destdn->is_auth() && !destdnl->is_null())
- mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
- else
- destdn->first = MAX(destdn->first, next_dest_snap);
+ ceph_assert(!linkmerge);
+ if (destdn->is_auth() && !destdnl->is_null())
+ mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
+ else
+ destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
- if (destdn->is_auth())
- metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
- if (srci->get_projected_parent_dn()->is_auth()) { // it's remote
- metablob->add_dir_context(srci->get_projected_parent_dir());
- mdcache->journal_cow_dentry(mdr.get(), metablob, srci->get_projected_parent_dn(), CEPH_NOSNAP, 0, srcdnl);
- metablob->add_primary_dentry(srci->get_projected_parent_dn(), srci, true);
+ if (destdn->is_auth())
+ metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
+
+ if (srci->is_auth() ) { // it's remote
+ if (mdr->slave_request) {
+ if (mdr->slave_request->srci_snapbl.length() > 0) {
+ sr_t *new_srnode = new sr_t();
+ auto p = mdr->slave_request->srci_snapbl.cbegin();
+ decode(*new_srnode, p);
+ srci->project_snaprealm(new_srnode);
+ }
+ } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
+ srci->project_snaprealm(srci_srnode);
+ srci_srnode = NULL;
}
- } else {
- if (destdn->is_auth() && !destdnl->is_null())
- mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
- else
- destdn->first = MAX(destdn->first, next_dest_snap);
- if (destdn->is_auth())
- metablob->add_primary_dentry(destdn, destdnl->get_inode(), true, true);
+ CDentry *srci_pdn = srci->get_projected_parent_dn();
+ metablob->add_dir_context(srci_pdn->get_dir());
+ mdcache->journal_cow_dentry(mdr.get(), metablob, srci_pdn, CEPH_NOSNAP, 0, srcdnl);
+ metablob->add_primary_dentry(srci_pdn, srci, true);
}
} else if (srcdnl->is_primary()) {
// project snap parent update?
- if (destdn->is_auth() && src_realm != dest_realm &&
- (srci->snaprealm || src_realm->get_newest_seq() + 1 > srci->get_oldest_snap()))
- srci->project_past_snaprealm_parent(dest_realm);
+ if (destdn->is_auth()) {
+ if (auto& srci_srnode = mdr->more()->srci_srnode) {
+ srci->project_snaprealm(srci_srnode);
+ srci_srnode = NULL;
+ }
+ }
if (destdn->is_auth() && !destdnl->is_null())
mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
- else
- destdn->first = MAX(destdn->first, next_dest_snap);
+
+ destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
if (destdn->is_auth())
metablob->add_primary_dentry(destdn, srci, true, true);
metablob->add_primary_dentry(destdn, srci, true);
if (srcdn->is_auth() && srci->is_dir()) {
// journal new subtrees root dirfrags
- list<CDir*> ls;
- srci->get_dirfrags(ls);
- for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
- CDir *dir = *p;
+ auto&& ls = srci->get_dirfrags();
+ for (const auto& dir : ls) {
if (dir->is_auth())
metablob->add_dir(dir, true);
}
dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
// make renamed inode first track the dn
- if (srcdnl->is_primary() && destdn->is_auth())
- srci->first = destdn->first;
+ if (srcdnl->is_primary() && destdn->is_auth()) {
+ ceph_assert(srci->first <= destdn->first);
+ srci->first = destdn->first;
+ }
+ // make stray inode first track the straydn
+ if (straydn && straydn->is_auth()) {
+ ceph_assert(oldin->first <= straydn->first);
+ oldin->first = straydn->first;
+ }
if (oldin && oldin->is_dir()) {
- assert(straydn);
+ ceph_assert(straydn);
mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
}
if (srci->is_dir())
CInode *oldin = destdnl->get_inode();
// primary+remote link merge?
- bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
- (srcdnl->is_primary() || destdnl->is_primary()));
+ bool linkmerge = (srcdnl->get_inode() == oldin);
+ if (linkmerge)
+ ceph_assert(srcdnl->is_primary() || destdnl->is_remote());
+
+ bool new_in_snaprealm = false;
+ bool new_oldin_snaprealm = false;
// target inode
if (!linkmerge) {
if (destdnl->is_primary()) {
- assert(straydn);
+ ceph_assert(straydn);
dout(10) << "straydn is " << *straydn << dendl;
+
+ // if there is newly created snaprealm, need to split old snaprealm's
+ // inodes_with_caps. So pop snaprealm before linkage changes.
+ if (destdn->is_auth()) {
+ bool hadrealm = (oldin->snaprealm ? true : false);
+ oldin->early_pop_projected_snaprealm();
+ new_oldin_snaprealm = (oldin->snaprealm && !hadrealm);
+ } else {
+ ceph_assert(mdr->slave_request);
+ if (mdr->slave_request->desti_snapbl.length()) {
+ new_oldin_snaprealm = !oldin->snaprealm;
+ oldin->decode_snap_blob(mdr->slave_request->desti_snapbl);
+ ceph_assert(oldin->snaprealm);
+ ceph_assert(oldin->snaprealm->have_past_parents_open());
+ }
+ }
+
destdn->get_dir()->unlink_inode(destdn, false);
straydn->pop_projected_linkage();
if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
- assert(!straydn->is_projected()); // no other projected
-
- mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
+ ceph_assert(!straydn->is_projected()); // no other projected
// nlink-- targeti
- if (destdn->is_auth()) {
- bool hadrealm = (oldin->snaprealm ? true : false);
+ if (destdn->is_auth())
oldin->pop_and_dirty_projected_inode(mdr->ls);
- if (oldin->snaprealm && !hadrealm)
- mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT);
- } else {
- // FIXME this snaprealm is not filled out correctly
- //oldin->open_snaprealm(); might be sufficient..
- }
+
+ mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
} else if (destdnl->is_remote()) {
destdn->get_dir()->unlink_inode(destdn, false);
- if (oldin->is_auth())
- oldin->pop_and_dirty_projected_inode(mdr->ls);
+ if (oldin->is_auth()) {
+ oldin->pop_and_dirty_projected_inode(mdr->ls);
+ } else if (mdr->slave_request) {
+ if (mdr->slave_request->desti_snapbl.length() > 0) {
+ ceph_assert(oldin->snaprealm);
+ oldin->decode_snap_blob(mdr->slave_request->desti_snapbl);
+ }
+ } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
+ delete desti_srnode;
+ desti_srnode = NULL;
+ }
}
}
// unlink src before we relink it at dest
CInode *in = srcdnl->get_inode();
- assert(in);
+ ceph_assert(in);
bool srcdn_was_remote = srcdnl->is_remote();
+ if (!srcdn_was_remote) {
+ // if there is newly created snaprealm, need to split old snaprealm's
+ // inodes_with_caps. So pop snaprealm before linkage changes.
+ if (destdn->is_auth()) {
+ bool hadrealm = (in->snaprealm ? true : false);
+ in->early_pop_projected_snaprealm();
+ new_in_snaprealm = (in->snaprealm && !hadrealm);
+ } else {
+ ceph_assert(mdr->slave_request);
+ if (mdr->slave_request->srci_snapbl.length()) {
+ new_in_snaprealm = !in->snaprealm;
+ in->decode_snap_blob(mdr->slave_request->srci_snapbl);
+ ceph_assert(in->snaprealm);
+ ceph_assert(in->snaprealm->have_past_parents_open());
+ }
+ }
+ }
+
srcdn->get_dir()->unlink_inode(srcdn);
// dest
// destdn
destdnl = destdn->pop_projected_linkage();
if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
- assert(!destdn->is_projected()); // no other projected
+ ceph_assert(!destdn->is_projected()); // no other projected
destdn->link_remote(destdnl, in);
if (destdn->is_auth())
destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
// in
- if (in->is_auth())
+ if (in->is_auth()) {
in->pop_and_dirty_projected_inode(mdr->ls);
+ } else if (mdr->slave_request) {
+ if (mdr->slave_request->srci_snapbl.length() > 0) {
+ ceph_assert(in->snaprealm);
+ in->decode_snap_blob(mdr->slave_request->srci_snapbl);
+ }
+ } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
+ delete srci_srnode;
+ srci_srnode = NULL;
+ }
} else {
dout(10) << "merging remote onto primary link" << dendl;
oldin->pop_and_dirty_projected_inode(mdr->ls);
}
destdnl = destdn->pop_projected_linkage();
if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
- assert(!destdn->is_projected()); // no other projected
+ ceph_assert(!destdn->is_projected()); // no other projected
// srcdn inode import?
if (!srcdn->is_auth() && destdn->is_auth()) {
- assert(mdr->more()->inode_import.length() > 0);
+ ceph_assert(mdr->more()->inode_import.length() > 0);
map<client_t,Capability::Import> imported_caps;
}
mdr->more()->inode_import.clear();
- ::encode(imported_caps, mdr->more()->inode_import);
+ encode(imported_caps, mdr->more()->inode_import);
/* hack: add an auth pin for each xlock we hold. These were
* remote xlocks previously but now they're local and
* we're going to try and unpin when we xlock_finish. */
- for (set<SimpleLock *>::iterator i = mdr->xlocks.begin();
- i != mdr->xlocks.end();
- ++i)
- if ((*i)->get_parent() == destdnl->get_inode() &&
- !(*i)->is_locallock())
- mds->locker->xlock_import(*i);
+
+ for (auto i = mdr->locks.lower_bound(&destdnl->get_inode()->versionlock);
+ i != mdr->locks.end();
+ ++i) {
+ SimpleLock *lock = i->lock;
+ if (lock->get_parent() != destdnl->get_inode())
+ break;
+ if (i->is_xlock() && !lock->is_locallock())
+ mds->locker->xlock_import(lock);
+ }
// hack: fix auth bit
in->state_set(CInode::STATE_AUTH);
mdr->clear_ambiguous_auth();
}
- if (destdn->is_auth()) {
+ if (destdn->is_auth())
in->pop_and_dirty_projected_inode(mdr->ls);
-
- } else {
- // FIXME: fix up snaprealm!
- }
}
// src
srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
srcdn->pop_projected_linkage();
if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
- assert(!srcdn->is_projected()); // no other projected
+ ceph_assert(!srcdn->is_projected()); // no other projected
// apply remaining projected inodes (nested)
mdr->apply();
// update subtree map?
- if (destdnl->is_primary() && in->is_dir())
+ if (destdnl->is_primary() && in->is_dir())
mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);
if (straydn && oldin->is_dir())
mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
+ if (new_oldin_snaprealm)
+ mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT, false);
+ if (new_in_snaprealm)
+ mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, true);
+
// removing a new dn?
if (srcdn->is_auth())
srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
}
};
-/* This function DOES put the mdr->slave_request before returning*/
void Server::handle_slave_rename_prep(MDRequestRef& mdr)
{
dout(10) << "handle_slave_rename_prep " << *mdr
if (mdr->slave_request->is_interrupted()) {
dout(10) << " slave request interrupted, sending noop reply" << dendl;
- MMDSSlaveRequest *reply= new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
+ auto reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
reply->mark_interrupted();
mds->send_message_mds(reply, mdr->slave_to_mds);
- mdr->slave_request->put();
- mdr->slave_request = 0;
+ mdr->reset_slave_request();
return;
}
filepath destpath(mdr->slave_request->destdnpath);
dout(10) << " dest " << destpath << dendl;
vector<CDentry*> trace;
- int r = mdcache->path_traverse(mdr, NULL, NULL, destpath, &trace, NULL, MDS_TRAVERSE_DISCOVERXLOCK);
+ CF_MDS_MDRContextFactory cf(mdcache, mdr, false);
+ int r = mdcache->path_traverse(mdr, cf, destpath,
+ MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED | MDS_TRAVERSE_WANT_DENTRY,
+ &trace);
if (r > 0) return;
if (r == -ESTALE) {
mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
- mdr->slave_to_mds);
+ mdr->slave_to_mds, true);
return;
}
- assert(r == 0); // we shouldn't get an error here!
+ ceph_assert(r == 0); // we shouldn't get an error here!
- CDentry *destdn = trace[trace.size()-1];
+ CDentry *destdn = trace.back();
CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
dout(10) << " destdn " << *destdn << dendl;
mdr->pin(destdn);
filepath srcpath(mdr->slave_request->srcdnpath);
dout(10) << " src " << srcpath << dendl;
CInode *srci = nullptr;
- r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &trace, &srci, MDS_TRAVERSE_DISCOVERXLOCK);
+ r = mdcache->path_traverse(mdr, cf, srcpath,
+ MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
+ &trace, &srci);
if (r > 0) return;
- assert(r == 0);
+ ceph_assert(r == 0);
- // srcpath must not point to a null dentry
- assert(srci != nullptr);
-
- CDentry *srcdn = trace[trace.size()-1];
+ CDentry *srcdn = trace.back();
CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
dout(10) << " srcdn " << *srcdn << dendl;
mdr->pin(srcdn);
mdr->pin(srci);
// stray?
- bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
- (srcdnl->is_primary() || destdnl->is_primary()));
+ bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
+ if (linkmerge)
+ ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
CDentry *straydn = mdr->straydn;
if (destdnl->is_primary() && !linkmerge)
- assert(straydn);
+ ceph_assert(straydn);
mdr->set_op_stamp(mdr->slave_request->op_stamp);
mdr->more()->srcdn_auth_mds = srcdn->authority().first;
// - avoid conflicting lock state changes
// - avoid concurrent updates to the inode
// (this could also be accomplished with the versionlock)
- int allowance = 2; // 1 for the mdr auth_pin, 1 for the link lock
- allowance += srcdnl->get_inode()->is_dir(); // for the snap lock
+ int allowance = 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
(mds->is_cluster_degraded() &&
!mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
continue;
- MMDSSlaveRequest *notify = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
- MMDSSlaveRequest::OP_RENAMENOTIFY);
+ auto notify = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMENOTIFY);
mds->send_message_mds(notify, *p);
mdr->more()->waiting_on_slave.insert(*p);
}
}
if (reply_witness) {
- assert(!srcdnrep.empty());
- MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
- MMDSSlaveRequest::OP_RENAMEPREPACK);
+ ceph_assert(!srcdnrep.empty());
+ auto reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
reply->witnesses.swap(srcdnrep);
mds->send_message_mds(reply, mdr->slave_to_mds);
- mdr->slave_request->put();
- mdr->slave_request = 0;
+ mdr->reset_slave_request();
return;
}
dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
- rollback.orig_src.dname = std::string(srcdn->get_name());
+ rollback.orig_src.dname = srcdn->get_name();
if (srcdnl->is_primary())
rollback.orig_src.ino = srcdnl->get_inode()->ino();
else {
- assert(srcdnl->is_remote());
+ ceph_assert(srcdnl->is_remote());
rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
}
rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
- rollback.orig_dest.dname = std::string(destdn->get_name());
+ rollback.orig_dest.dname = destdn->get_name();
if (destdnl->is_primary())
rollback.orig_dest.ino = destdnl->get_inode()->ino();
else if (destdnl->is_remote()) {
rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
- rollback.stray.dname = std::string(straydn->get_name());
+ rollback.stray.dname = straydn->get_name();
}
- ::encode(rollback, mdr->more()->rollback_bl);
+ if (mdr->slave_request->desti_snapbl.length()) {
+ CInode *oldin = destdnl->get_inode();
+ if (oldin->snaprealm) {
+ encode(true, rollback.desti_snapbl);
+ oldin->encode_snap_blob(rollback.desti_snapbl);
+ } else {
+ encode(false, rollback.desti_snapbl);
+ }
+ }
+ if (mdr->slave_request->srci_snapbl.length()) {
+ if (srci->snaprealm) {
+ encode(true, rollback.srci_snapbl);
+ srci->encode_snap_blob(rollback.srci_snapbl);
+ } else {
+ encode(false, rollback.srci_snapbl);
+ }
+ }
+ encode(rollback, mdr->more()->rollback_bl);
+ // FIXME: rollback snaprealm
dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
// journal.
mdr->ls = NULL;
_logged_slave_rename(mdr, srcdn, destdn, straydn);
} else {
+ mdcache->add_uncommitted_slave(mdr->reqid, mdr->ls, mdr->slave_to_mds);
mdr->more()->slave_update_journaled = true;
submit_mdlog_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn),
mdr, __func__);
dout(10) << "_logged_slave_rename " << *mdr << dendl;
// prepare ack
- MMDSSlaveRequest *reply = NULL;
+ ref_t<MMDSSlaveRequest> reply;
if (!mdr->aborted) {
- reply= new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
+ reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
if (!mdr->more()->slave_update_journaled)
reply->mark_not_journaled();
}
CDentry::linkage_t *srcdnl = srcdn->get_linkage();
- CDentry::linkage_t *destdnl = NULL;
//CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
// export srci?
if (srcdn->is_auth() && srcdnl->is_primary()) {
// set export bounds for CInode::encode_export()
- list<CDir*> bounds;
- if (srcdnl->get_inode()->is_dir()) {
- srcdnl->get_inode()->get_dirfrags(bounds);
- for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
- (*p)->state_set(CDir::STATE_EXPORTBOUND);
- }
+ if (reply) {
+ std::vector<CDir*> bounds;
+ if (srcdnl->get_inode()->is_dir()) {
+ srcdnl->get_inode()->get_dirfrags(bounds);
+ for (const auto& bound : bounds) {
+ bound->state_set(CDir::STATE_EXPORTBOUND);
+ }
+ }
- map<client_t,entity_inst_t> exported_client_map;
- bufferlist inodebl;
- mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
- exported_client_map);
+ map<client_t,entity_inst_t> exported_client_map;
+ map<client_t, client_metadata_t> exported_client_metadata_map;
+ bufferlist inodebl;
+ mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
+ exported_client_map,
+ exported_client_metadata_map);
- for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
- (*p)->state_clear(CDir::STATE_EXPORTBOUND);
+ for (const auto& bound : bounds) {
+ bound->state_clear(CDir::STATE_EXPORTBOUND);
+ }
- if (reply) {
- ::encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
+ encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
+ encode(exported_client_metadata_map, reply->inode_export);
reply->inode_export.claim_append(inodebl);
reply->inode_export_v = srcdnl->get_inode()->inode.version;
}
// apply
_rename_apply(mdr, srcdn, destdn, straydn);
-
- destdnl = destdn->get_linkage();
+
+ CDentry::linkage_t *destdnl = destdn->get_linkage();
// bump popularity
- utime_t now = ceph_clock_now();
- mds->balancer->hit_dir(now, srcdn->get_dir(), META_POP_IWR);
+ mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
- mds->balancer->hit_inode(now, destdnl->get_inode(), META_POP_IWR);
+ mds->balancer->hit_inode(destdnl->get_inode(), META_POP_IWR);
// done.
- mdr->slave_request->put();
- mdr->slave_request = 0;
+ mdr->reset_slave_request();
mdr->straydn = 0;
if (reply) {
mds->send_message_mds(reply, mdr->slave_to_mds);
} else {
- assert(mdr->aborted);
+ ceph_assert(mdr->aborted);
dout(10) << " abort flag set, finishing" << dendl;
mdcache->request_finish(mdr);
}
{
dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;
- CDentry::linkage_t *destdnl = destdn->get_linkage();
+ CInode *in = destdn->get_linkage()->get_inode();
+
+ inodeno_t migrated_stray;
+ if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
+ migrated_stray = in->ino();
- list<MDSInternalContextBase*> finished;
+ MDSContext::vec finished;
if (r == 0) {
// unfreeze+singleauth inode
// hmm, do i really need to delay this?
if (mdr->more()->is_inode_exporter) {
-
- CInode *in = destdnl->get_inode();
-
// drop our pins
// we exported, clear out any xlocks that we moved to another MDS
- set<SimpleLock*>::iterator i = mdr->xlocks.begin();
- while (i != mdr->xlocks.end()) {
- SimpleLock *lock = *i++;
+ for (auto i = mdr->locks.lower_bound(&in->versionlock);
+ i != mdr->locks.end(); ) {
+ SimpleLock *lock = i->lock;
+ if (lock->get_parent() != in)
+ break;
// we only care about xlocks on the exported inode
- if (lock->get_parent() == in &&
- !lock->is_locallock())
- mds->locker->xlock_export(lock, mdr.get());
+ if (i->is_xlock() && !lock->is_locallock())
+ mds->locker->xlock_export(i++, mdr.get());
+ else
+ ++i;
}
map<client_t,Capability::Import> peer_imported;
- bufferlist::iterator bp = mdr->more()->inode_import.begin();
- ::decode(peer_imported, bp);
+ auto bp = mdr->more()->inode_import.cbegin();
+ decode(peer_imported, bp);
- dout(10) << " finishing inode export on " << *destdnl->get_inode() << dendl;
- mdcache->migrator->finish_export_inode(destdnl->get_inode(), ceph_clock_now(),
- mdr->slave_to_mds, peer_imported, finished);
+ dout(10) << " finishing inode export on " << *in << dendl;
+ mdcache->migrator->finish_export_inode(in, mdr->slave_to_mds, peer_imported, finished);
mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
// unfreeze
- assert(destdnl->get_inode()->is_frozen_inode());
- destdnl->get_inode()->unfreeze_inode(finished);
+ ceph_assert(in->is_frozen_inode());
+ in->unfreeze_inode(finished);
}
// singleauth
// witness list from the master, and they failed before we tried prep again.
if (mdr->more()->rollback_bl.length()) {
if (mdr->more()->is_inode_exporter) {
- dout(10) << " reversing inode export of " << *destdnl->get_inode() << dendl;
- destdnl->get_inode()->abort_export();
+ dout(10) << " reversing inode export of " << *in << dendl;
+ in->abort_export();
}
if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
mdcache->request_finish(mdr);
}
}
+
+ if (migrated_stray && mds->is_stopping())
+ mdcache->shutdown_export_stray_finish(migrated_stray);
}
void _rollback_repair_dir(MutationRef& mut, CDir *dir, rename_rollback::drec &r, utime_t ctime,
pf->rstat.rbytes += linkunlink * rstat.rbytes;
pf->rstat.rfiles += linkunlink * rstat.rfiles;
pf->rstat.rsubdirs += linkunlink * rstat.rsubdirs;
- pf->rstat.rsnaprealms += linkunlink * rstat.rsnaprealms;
+ pf->rstat.rsnaps += linkunlink * rstat.rsnaps;
}
if (pf->fragstat.mtime == ctime) {
pf->fragstat.mtime = r.dirfrag_old_mtime;
version_t srcdnpv;
CDentry *destdn;
CDentry *straydn;
+ map<client_t,ref_t<MClientSnap>> splits[2];
bool finish_mdr;
C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
- CDentry *sd, version_t pv, CDentry *dd,
- CDentry *st, bool f) :
+ CDentry *sd, version_t pv, CDentry *dd, CDentry *st,
+ map<client_t,ref_t<MClientSnap>> _splits[2], bool f) :
ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
- straydn(st), finish_mdr(f) {}
+ straydn(st), finish_mdr(f) {
+ splits[0].swap(_splits[0]);
+ splits[1].swap(_splits[1]);
+ }
void finish(int r) override {
server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
- destdn, straydn, finish_mdr);
+ destdn, straydn, splits, finish_mdr);
}
};
bool finish_mdr)
{
rename_rollback rollback;
- bufferlist::iterator p = rbl.begin();
- ::decode(rollback, p);
+ auto p = rbl.cbegin();
+ decode(rollback, p);
dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
// need to finish this update before sending resolve to claim the subtree
srcdn = srcdir->lookup(rollback.orig_src.dname);
if (srcdn) {
dout(10) << " srcdn " << *srcdn << dendl;
- assert(srcdn->get_linkage()->is_null());
+ ceph_assert(srcdn->get_linkage()->is_null());
} else
dout(10) << " srcdn not found" << dendl;
} else
if (rollback.orig_src.ino) {
in = mdcache->get_inode(rollback.orig_src.ino);
if (in && in->is_dir())
- assert(srcdn && destdn);
+ ceph_assert(srcdn && destdn);
} else
in = mdcache->get_inode(rollback.orig_src.remote_ino);
straydn = straydir->lookup(rollback.stray.dname);
if (straydn) {
dout(10) << " straydn " << *straydn << dendl;
- assert(straydn->get_linkage()->is_primary());
+ ceph_assert(straydn->get_linkage()->is_primary());
} else
dout(10) << " straydn not found" << dendl;
} else
if (rollback.orig_dest.ino) {
target = mdcache->get_inode(rollback.orig_dest.ino);
if (target)
- assert(destdn && straydn);
+ ceph_assert(destdn && straydn);
} else if (rollback.orig_dest.remote_ino)
target = mdcache->get_inode(rollback.orig_dest.remote_ino);
// can't use is_auth() in the resolve stage
mds_rank_t whoami = mds->get_nodeid();
// slave
- assert(!destdn || destdn->authority().first != whoami);
- assert(!straydn || straydn->authority().first != whoami);
+ ceph_assert(!destdn || destdn->authority().first != whoami);
+ ceph_assert(!straydn || straydn->authority().first != whoami);
bool force_journal_src = false;
bool force_journal_dest = false;
if (srcdn->authority().first == whoami)
srcdnpv = srcdn->pre_dirty();
if (rollback.orig_src.ino) {
- assert(in);
+ ceph_assert(in);
srcdn->push_projected_linkage(in);
} else
srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
rollback.orig_src.remote_d_type);
}
- CInode::mempool_inode *pip = 0;
+ map<client_t,ref_t<MClientSnap>> splits[2];
+
+ CInode::mempool_inode *pip = nullptr;
if (in) {
- if (in->authority().first == whoami) {
+ bool projected;
+ if (in->get_projected_parent_dn()->authority().first == whoami) {
auto &pi = in->project_inode();
- mut->add_projected_inode(in);
- pi.inode.version = in->pre_dirty();
pip = &pi.inode;
- } else
+ mut->add_projected_inode(in);
+ pip->version = in->pre_dirty();
+ projected = true;
+ } else {
pip = in->get_projected_inode();
+ projected = false;
+ }
if (pip->ctime == rollback.ctime)
- pip->ctime = pip->rstat.rctime = rollback.orig_src.old_ctime;
+ pip->ctime = rollback.orig_src.old_ctime;
+
+ if (rollback.srci_snapbl.length() && in->snaprealm) {
+ bool hadrealm;
+ auto p = rollback.srci_snapbl.cbegin();
+ decode(hadrealm, p);
+ if (hadrealm) {
+ if (projected && !mds->is_resolve()) {
+ sr_t *new_srnode = new sr_t();
+ decode(*new_srnode, p);
+ in->project_snaprealm(new_srnode);
+ } else
+ decode(in->snaprealm->srnode, p);
+ } else {
+ SnapRealm *realm;
+ if (rollback.orig_src.ino) {
+ ceph_assert(srcdir);
+ realm = srcdir->get_inode()->find_snaprealm();
+ } else {
+ realm = in->snaprealm->parent;
+ }
+ if (!mds->is_resolve())
+ mdcache->prepare_realm_merge(in->snaprealm, realm, splits[0]);
+ if (projected)
+ in->project_snaprealm(NULL);
+ else
+ in->snaprealm->merge_to(realm);
+ }
+ }
}
if (srcdn && srcdn->authority().first == whoami) {
} else {
// the dentry will be trimmed soon, it's ok to have wrong linkage
if (rollback.orig_dest.ino)
- assert(mds->is_resolve());
+ ceph_assert(mds->is_resolve());
destdn->push_projected_linkage();
}
}
straydn->push_projected_linkage();
if (target) {
- CInode::mempool_inode *ti = NULL;
- if (target->authority().first == whoami) {
+ bool projected;
+ CInode::mempool_inode *ti = nullptr;
+ if (target->get_projected_parent_dn()->authority().first == whoami) {
auto &pi = target->project_inode();
- mut->add_projected_inode(target);
- pi.inode.version = target->pre_dirty();
ti = &pi.inode;
- } else
+ mut->add_projected_inode(target);
+ ti->version = target->pre_dirty();
+ projected = true;
+ } else {
ti = target->get_projected_inode();
+ projected = false;
+ }
if (ti->ctime == rollback.ctime)
- ti->ctime = ti->rstat.rctime = rollback.orig_dest.old_ctime;
+ ti->ctime = rollback.orig_dest.old_ctime;
if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
- assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
+ ceph_assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
else
- assert(rollback.orig_dest.remote_ino &&
+ ceph_assert(rollback.orig_dest.remote_ino &&
rollback.orig_dest.remote_ino == rollback.orig_src.ino);
} else
ti->nlink++;
+
+ if (rollback.desti_snapbl.length() && target->snaprealm) {
+ bool hadrealm;
+ auto p = rollback.desti_snapbl.cbegin();
+ decode(hadrealm, p);
+ if (hadrealm) {
+ if (projected && !mds->is_resolve()) {
+ sr_t *new_srnode = new sr_t();
+ decode(*new_srnode, p);
+ target->project_snaprealm(new_srnode);
+ } else
+ decode(target->snaprealm->srnode, p);
+ } else {
+ SnapRealm *realm;
+ if (rollback.orig_dest.ino) {
+ ceph_assert(destdir);
+ realm = destdir->get_inode()->find_snaprealm();
+ } else {
+ realm = target->snaprealm->parent;
+ }
+ if (!mds->is_resolve())
+ mdcache->prepare_realm_merge(target->snaprealm, realm, splits[1]);
+ if (projected)
+ target->project_snaprealm(NULL);
+ else
+ target->snaprealm->merge_to(realm);
+ }
+ }
}
if (srcdn)
}
if (force_journal_dest) {
- assert(rollback.orig_dest.ino);
+ ceph_assert(rollback.orig_dest.ino);
le->commit.add_dir_context(destdir);
le->commit.add_primary_dentry(destdn, 0, true);
}
// slave: no need to journal straydn
if (target && target != in && target->authority().first == whoami) {
- assert(rollback.orig_dest.remote_ino);
+ ceph_assert(rollback.orig_dest.remote_ino);
le->commit.add_dir_context(target->get_projected_parent_dir());
le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
}
dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
le->commit.renamed_dirino = in->ino();
if (srcdn->authority().first == whoami) {
- list<CDir*> ls;
- in->get_dirfrags(ls);
- for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
- CDir *dir = *p;
+ auto&& ls = in->get_dirfrags();
+ for (const auto& dir : ls) {
if (!dir->is_auth())
le->commit.renamed_dir_frags.push_back(dir->get_frag());
}
}
if (target && target->is_dir()) {
- assert(destdn);
+ ceph_assert(destdn);
mdcache->project_subtree_rename(target, straydir, destdir);
}
if (in && in->is_dir()) {
- assert(srcdn);
+ ceph_assert(srcdn);
mdcache->project_subtree_rename(in, destdir, srcdir);
}
if (mdr && !mdr->more()->slave_update_journaled) {
- assert(le->commit.empty());
+ ceph_assert(le->commit.empty());
mdlog->cancel_entry(le);
mut->ls = NULL;
- _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, finish_mdr);
+ _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, splits, finish_mdr);
} else {
- assert(!le->commit.empty());
+ ceph_assert(!le->commit.empty());
if (mdr)
mdr->more()->slave_update_journaled = false;
- MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr, srcdn, srcdnpv,
- destdn, straydn, finish_mdr);
+ MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr,
+ srcdn, srcdnpv, destdn, straydn,
+ splits, finish_mdr);
submit_mdlog_entry(le, fin, mdr, __func__);
mdlog->flush();
}
}
void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
- version_t srcdnpv, CDentry *destdn,
- CDentry *straydn, bool finish_mdr)
+ version_t srcdnpv, CDentry *destdn, CDentry *straydn,
+ map<client_t,ref_t<MClientSnap>> splits[2], bool finish_mdr)
{
dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
}
if (srcdn) {
srcdn->pop_projected_linkage();
- if (srcdn->authority().first == mds->get_nodeid())
+ if (srcdn->authority().first == mds->get_nodeid()) {
srcdn->mark_dirty(srcdnpv, mut->ls);
+ if (srcdn->get_linkage()->is_primary())
+ srcdn->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH);
+ }
}
mut->apply();
if (srcdn && srcdn->get_linkage()->is_primary()) {
CInode *in = srcdn->get_linkage()->get_inode();
- if (srcdn->authority().first == mds->get_nodeid())
- in->state_set(CInode::STATE_AUTH);
- // update subtree map?
if (in && in->is_dir()) {
- assert(destdn);
+ ceph_assert(destdn);
mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
}
}
CInode *oldin = destdn->get_linkage()->get_inode();
// update subtree map?
if (oldin && oldin->is_dir()) {
- assert(straydn);
+ ceph_assert(straydn);
mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
}
}
root = mdcache->get_subtree_root(destdn->get_dir());
if (root)
mdcache->try_trim_non_auth_subtree(root);
+ } else {
+ mdcache->send_snaps(splits[1]);
+ mdcache->send_snaps(splits[0]);
}
if (mdr) {
- list<MDSInternalContextBase*> finished;
+ MDSContext::vec finished;
if (mdr->more()->is_ambiguous_auth) {
if (srcdn->is_auth())
mdr->more()->rename_inode->unfreeze_inode(finished);
mdr->more()->slave_rolling_back = false;
}
- mdcache->finish_rollback(mut->reqid);
+ mdcache->finish_rollback(mut->reqid, mdr);
mut->cleanup();
}
-/* This function DOES put the passed message before returning*/
-void Server::handle_slave_rename_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
+void Server::handle_slave_rename_prep_ack(MDRequestRef& mdr, const cref_t<MMDSSlaveRequest> &ack)
{
dout(10) << "handle_slave_rename_prep_ack " << *mdr
<< " witnessed by " << ack->get_source()
}
// witnessed? or add extra witnesses?
- assert(mdr->more()->witnessed.count(from) == 0);
+ ceph_assert(mdr->more()->witnessed.count(from) == 0);
if (ack->is_interrupted()) {
dout(10) << " slave request interrupted, noop" << dendl;
} else if (ack->witnesses.empty()) {
mdr->more()->has_journaled_slaves = true;
} else {
dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
- mdr->more()->extra_witnesses.swap(ack->witnesses);
+ mdr->more()->extra_witnesses = ack->witnesses;
mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me!
}
// srci import?
if (ack->inode_export.length()) {
dout(10) << " got srci import" << dendl;
- mdr->more()->inode_import.claim(ack->inode_export);
+ mdr->more()->inode_import.share(ack->inode_export);
mdr->more()->inode_import_v = ack->inode_export_v;
}
// remove from waiting list
- assert(mdr->more()->waiting_on_slave.count(from));
+ ceph_assert(mdr->more()->waiting_on_slave.count(from));
mdr->more()->waiting_on_slave.erase(from);
if (mdr->more()->waiting_on_slave.empty())
dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
}
-void Server::handle_slave_rename_notify_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
+void Server::handle_slave_rename_notify_ack(MDRequestRef& mdr, const cref_t<MMDSSlaveRequest> &ack)
{
dout(10) << "handle_slave_rename_notify_ack " << *mdr << " from mds."
<< ack->get_source() << dendl;
- assert(mdr->is_slave());
+ ceph_assert(mdr->is_slave());
mds_rank_t from = mds_rank_t(ack->get_source().num());
if (mdr->more()->waiting_on_slave.count(from)) {
/* This function takes responsibility for the passed mdr*/
void Server::handle_client_lssnap(MDRequestRef& mdr)
{
- MClientRequest *req = mdr->client_request;
+ const cref_t<MClientRequest> &req = mdr->client_request;
// traverse to path
- CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
- if (!diri || diri->state_test(CInode::STATE_PURGING)) {
- respond_to_request(mdr, -ESTALE);
- return;
- }
- if (!diri->is_auth()) {
- mdcache->request_forward(mdr, diri->authority().first);
+ CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
+ if (!diri)
return;
- }
+
if (!diri->is_dir()) {
respond_to_request(mdr, -ENOTDIR);
return;
dout(10) << "lssnap on " << *diri << dendl;
// lock snap
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
- mds->locker->include_snap_rdlocks(rdlocks, diri);
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
+ if (!mds->locker->try_rdlock_snap_layout(diri, mdr))
return;
if (!check_access(mdr, diri, MAY_READ))
return;
SnapRealm *realm = diri->find_snaprealm();
- map<snapid_t,SnapInfo*> infomap;
+ map<snapid_t,const SnapInfo*> infomap;
realm->get_snap_info(infomap, diri->get_oldest_snap());
unsigned max_entries = req->head.args.readdir.max_entries;
int max_bytes = req->head.args.readdir.max_bytes;
if (!max_bytes)
// make sure at least one item can be encoded
- max_bytes = (512 << 10) + g_conf->mds_max_xattr_pairs_size;
+ max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
__u64 last_snapid = 0;
string offset_str = req->get_path2();
if (!offset_str.empty())
last_snapid = realm->resolve_snapname(offset_str, diri->ino());
+ //Empty DirStat
bufferlist dirbl;
- encode_empty_dirstat(dirbl);
+ static DirStat empty;
+ CDir::encode_dirstat(dirbl, mdr->session->info, empty);
max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;
__u32 num = 0;
bufferlist dnbl;
- map<snapid_t,SnapInfo*>::iterator p = infomap.upper_bound(last_snapid);
+ auto p = infomap.upper_bound(last_snapid);
for (; p != infomap.end() && num < max_entries; ++p) {
dout(10) << p->first << " -> " << *p->second << dendl;
// actual
string snap_name;
if (p->second->ino == diri->ino())
- snap_name = std::string(p->second->name);
+ snap_name = p->second->name;
else
- snap_name = std::string(p->second->get_long_name());
+ snap_name = p->second->get_long_name();
unsigned start_len = dnbl.length();
if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
break;
- ::encode(snap_name, dnbl);
- encode_infinite_lease(dnbl);
+ encode(snap_name, dnbl);
+ //infinite lease
+ LeaseStat e(CEPH_LEASE_VALID, -1, 0);
+ mds->locker->encode_lease(dnbl, mdr->session->info, e);
+ dout(20) << "encode_infinite_lease" << dendl;
int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
if (r < 0) {
++num;
}
- ::encode(num, dirbl);
+ encode(num, dirbl);
__u16 flags = 0;
if (p == infomap.end()) {
flags = CEPH_READDIR_FRAG_END;
if (last_snapid == 0)
flags |= CEPH_READDIR_FRAG_COMPLETE;
}
- ::encode(flags, dirbl);
+ encode(flags, dirbl);
dirbl.claim_append(dnbl);
mdr->reply_extra_bl = dirbl;
/* This function takes responsibility for the passed mdr*/
void Server::handle_client_mksnap(MDRequestRef& mdr)
{
+ const cref_t<MClientRequest> &req = mdr->client_request;
+ // make sure we have as new a map as the client
+ if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
+ mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
if (!mds->mdsmap->allows_snaps()) {
// you can't make snapshots until you set an option right now
respond_to_request(mdr, -EPERM);
return;
}
- MClientRequest *req = mdr->client_request;
- CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
- if (!diri || diri->state_test(CInode::STATE_PURGING)) {
- respond_to_request(mdr, -ESTALE);
- return;
- }
-
- if (!diri->is_auth()) { // fw to auth?
- mdcache->request_forward(mdr, diri->authority().first);
+ CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
+ if (!diri)
return;
- }
// dir only
if (!diri->is_dir()) {
return;
}
- boost::string_view snapname = req->get_filepath().last_dentry();
+ std::string_view snapname = req->get_filepath().last_dentry();
- if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
+ if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
respond_to_request(mdr, -EPERM);
return;
dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
// lock snap
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
+ if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
+ MutationImpl::LockOpVec lov;
+ lov.add_xlock(&diri->snaplock);
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
- mds->locker->include_snap_rdlocks(rdlocks, diri);
- rdlocks.erase(&diri->snaplock);
- xlocks.insert(&diri->snaplock);
+ if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
+ if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
+ return;
+ }
+ mdr->locking_state |= MutationImpl::ALL_LOCKED;
+ }
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
+ if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
return;
- if (!check_access(mdr, diri, MAY_WRITE))
+ // check if we can create any more snapshots
+ // we don't allow any more if we are already at or beyond the limit
+ if (diri->snaprealm &&
+ diri->snaprealm->get_snaps().size() >= max_snaps_per_dir) {
+ respond_to_request(mdr, -EMLINK);
return;
+ }
// make sure name is unique
if (diri->snaprealm &&
version_t stid = mdr->more()->stid;
snapid_t snapid;
- bufferlist::iterator p = mdr->more()->snapidbl.begin();
- ::decode(snapid, p);
+ auto p = mdr->more()->snapidbl.cbegin();
+ decode(snapid, p);
dout(10) << " stid " << stid << " snapid " << snapid << dendl;
+ ceph_assert(mds->snapclient->get_cached_version() >= stid);
+
// journal
SnapInfo info;
info.ino = diri->ino();
info.snapid = snapid;
- info.name = std::string(snapname);
+ info.name = snapname;
info.stamp = mdr->get_op_stamp();
auto &pi = diri->project_inode(false, true);
- pi.inode.ctime = pi.inode.rstat.rctime = info.stamp;
+ pi.inode.ctime = info.stamp;
+ if (info.stamp > pi.inode.rstat.rctime)
+ pi.inode.rstat.rctime = info.stamp;
+ pi.inode.rstat.rsnaps++;
pi.inode.version = diri->pre_dirty();
// project the snaprealm
// create snap
dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
+ // notify other mds
+ mdcache->send_snap_update(diri, mdr->more()->stid, op);
+
mdcache->do_realm_invalidate_and_update_notify(diri, op);
// yay
/* This function takes responsibility for the passed mdr*/
void Server::handle_client_rmsnap(MDRequestRef& mdr)
{
- MClientRequest *req = mdr->client_request;
+ const cref_t<MClientRequest> &req = mdr->client_request;
- CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
- if (!diri || diri->state_test(CInode::STATE_PURGING)) {
- respond_to_request(mdr, -ESTALE);
- return;
- }
- if (!diri->is_auth()) { // fw to auth?
- mdcache->request_forward(mdr, diri->authority().first);
+ CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
+ if (!diri)
return;
- }
+
if (!diri->is_dir()) {
respond_to_request(mdr, -ENOTDIR);
return;
}
- boost::string_view snapname = req->get_filepath().last_dentry();
+ std::string_view snapname = req->get_filepath().last_dentry();
- if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
+ if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
respond_to_request(mdr, -EPERM);
return;
snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
dout(10) << " snapname " << snapname << " is " << snapid << dendl;
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
- mds->locker->include_snap_rdlocks(rdlocks, diri);
- rdlocks.erase(&diri->snaplock);
- xlocks.insert(&diri->snaplock);
-
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
- return;
+ if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
+ MutationImpl::LockOpVec lov;
+ lov.add_xlock(&diri->snaplock);
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+ if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
+ if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
+ return;
+ }
+ mdr->locking_state |= MutationImpl::ALL_LOCKED;
+ }
- if (!check_access(mdr, diri, MAY_WRITE))
+ if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
return;
// prepare
return;
}
version_t stid = mdr->more()->stid;
- bufferlist::iterator p = mdr->more()->snapidbl.begin();
+ auto p = mdr->more()->snapidbl.cbegin();
snapid_t seq;
- ::decode(seq, p);
+ decode(seq, p);
dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
+ ceph_assert(mds->snapclient->get_cached_version() >= stid);
+
// journal
auto &pi = diri->project_inode(false, true);
pi.inode.version = diri->pre_dirty();
- pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
+ pi.inode.ctime = mdr->get_op_stamp();
+ if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
+ pi.inode.rstat.rctime = mdr->get_op_stamp();
+ pi.inode.rstat.rsnaps--;
mdr->ls = mdlog->get_current_segment();
EUpdate *le = new EUpdate(mdlog, "rmsnap");
{
dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
snapid_t stid = mdr->more()->stid;
- bufferlist::iterator p = mdr->more()->snapidbl.begin();
+ auto p = mdr->more()->snapidbl.cbegin();
snapid_t seq;
- ::decode(seq, p);
+ decode(seq, p);
diri->pop_and_dirty_projected_inode(mdr->ls);
mdr->apply();
dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
+ // notify other mds
+ mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_DESTROY);
+
mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);
// yay
/* This function takes responsibility for the passed mdr*/
void Server::handle_client_renamesnap(MDRequestRef& mdr)
{
- MClientRequest *req = mdr->client_request;
+ const cref_t<MClientRequest> &req = mdr->client_request;
if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
respond_to_request(mdr, -EINVAL);
return;
}
- CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
- if (!diri || diri->state_test(CInode::STATE_PURGING)) {
- respond_to_request(mdr, -ESTALE);
- return;
- }
-
- if (!diri->is_auth()) { // fw to auth?
- mdcache->request_forward(mdr, diri->authority().first);
+ CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
+ if (!diri)
return;
- }
if (!diri->is_dir()) { // dir only
respond_to_request(mdr, -ENOTDIR);
return;
}
- if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid ||
- mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
+ if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
+ mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
respond_to_request(mdr, -EPERM);
return;
}
- boost::string_view dstname = req->get_filepath().last_dentry();
- boost::string_view srcname = req->get_filepath2().last_dentry();
+ std::string_view dstname = req->get_filepath().last_dentry();
+ std::string_view srcname = req->get_filepath2().last_dentry();
dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;
if (srcname.length() == 0 || srcname[0] == '_') {
dout(10) << " snapname " << srcname << " is " << snapid << dendl;
// lock snap
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
-
- mds->locker->include_snap_rdlocks(rdlocks, diri);
- rdlocks.erase(&diri->snaplock);
- xlocks.insert(&diri->snaplock);
-
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
- return;
+ if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
+ MutationImpl::LockOpVec lov;
+ lov.add_xlock(&diri->snaplock);
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+ if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
+ if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
+ return;
+ }
+ mdr->locking_state |= MutationImpl::ALL_LOCKED;
+ }
- if (!check_access(mdr, diri, MAY_WRITE))
+ if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
return;
// prepare
if (!mdr->more()->stid) {
mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
- &mdr->more()->stid, &mdr->more()->snapidbl,
+ &mdr->more()->stid,
new C_MDS_RetryRequest(mdcache, mdr));
return;
}
version_t stid = mdr->more()->stid;
- bufferlist::iterator p = mdr->more()->snapidbl.begin();
- snapid_t seq;
- ::decode(seq, p);
- dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
+ dout(10) << " stid is " << stid << dendl;
+
+ ceph_assert(mds->snapclient->get_cached_version() >= stid);
// journal
auto &pi = diri->project_inode(false, true);
- pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
+ pi.inode.ctime = mdr->get_op_stamp();
+ if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
+ pi.inode.rstat.rctime = mdr->get_op_stamp();
pi.inode.version = diri->pre_dirty();
// project the snaprealm
auto &newsnap = *pi.snapnode;
auto it = newsnap.snaps.find(snapid);
- assert(it != newsnap.snaps.end());
- it->second.name = std::string(dstname);
+ ceph_assert(it != newsnap.snaps.end());
+ it->second.name = dstname;
// journal the inode changes
mdr->ls = mdlog->get_current_segment();
dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
- mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE, true);
+ // notify other mds
+ mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_UPDATE);
+
+ mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE);
// yay
mdr->in[0] = diri;