#include <iostream>
#include <sstream>
#include <string>
-#include <boost/utility/string_view.hpp>
+#include <string_view>
#include <map>
#include "MDCache.h"
#include "include/filepath.h"
#include "include/util.h"
+#include "messages/MClientCaps.h"
+
#include "msg/Message.h"
#include "msg/Messenger.h"
#include "events/ECommitted.h"
#include "events/ESessions.h"
-#include "messages/MGenericMessage.h"
-
-#include "messages/MMDSResolve.h"
-#include "messages/MMDSResolveAck.h"
-#include "messages/MMDSCacheRejoin.h"
-
-#include "messages/MDiscover.h"
-#include "messages/MDiscoverReply.h"
-
-//#include "messages/MInodeUpdate.h"
-#include "messages/MDirUpdate.h"
-#include "messages/MCacheExpire.h"
-
-#include "messages/MInodeFileCaps.h"
-
-#include "messages/MLock.h"
-#include "messages/MDentryLink.h"
-#include "messages/MDentryUnlink.h"
-
-#include "messages/MMDSFindIno.h"
-#include "messages/MMDSFindInoReply.h"
-
-#include "messages/MMDSOpenIno.h"
-#include "messages/MMDSOpenInoReply.h"
-
-#include "messages/MClientRequest.h"
-#include "messages/MClientCaps.h"
-#include "messages/MClientSnap.h"
-#include "messages/MClientQuota.h"
-
-#include "messages/MMDSSlaveRequest.h"
-
-#include "messages/MMDSFragmentNotify.h"
-#include "messages/MMDSFragmentNotifyAck.h"
-
-#include "messages/MGatherCaps.h"
-
#include "InoTable.h"
#include "common/Timer.h"
#include "perfglue/heap_profiler.h"
-using namespace std;
#include "common/config.h"
-#include "include/assert.h"
+#include "include/ceph_assert.h"
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
* All non-I/O contexts that require a reference
* to an MDCache instance descend from this.
*/
-class MDCacheContext : public virtual MDSInternalContextBase {
+class MDCacheContext : public virtual MDSContext {
protected:
MDCache *mdcache;
MDSRank *get_mds() override
{
- assert(mdcache != NULL);
+ ceph_assert(mdcache != NULL);
return mdcache->mds;
}
public:
MDCache *mdcache;
MDSRank *get_mds() override
{
- assert(mdcache != NULL);
+ ceph_assert(mdcache != NULL);
return mdcache->mds;
}
public:
MDCache *mdcache;
MDSRank *get_mds() override
{
- assert(mdcache != NULL);
+ ceph_assert(mdcache != NULL);
return mdcache->mds;
}
public:
exceeded_size_limit(false),
recovery_queue(m),
stray_manager(m, purge_queue_),
- trim_counter(ceph_clock_now(), g_conf->get_val<double>("mds_cache_trim_decay_rate"))
+ trim_counter(g_conf().get_val<double>("mds_cache_trim_decay_rate")),
+ open_file_table(m)
{
migrator.reset(new Migrator(mds, this));
root = NULL;
num_shadow_inodes = 0;
num_inodes_with_caps = 0;
- max_dir_commit_size = g_conf->mds_dir_max_commit_size ?
- (g_conf->mds_dir_max_commit_size << 20) :
- (0.9 *(g_conf->osd_max_write_size << 20));
+ max_dir_commit_size = g_conf()->mds_dir_max_commit_size ?
+ (g_conf()->mds_dir_max_commit_size << 20) :
+ (0.9 *(g_conf()->osd_max_write_size << 20));
discover_last_tid = 0;
open_ino_last_tid = 0;
opening_root = open = false;
- cache_inode_limit = g_conf->get_val<int64_t>("mds_cache_size");
- cache_memory_limit = g_conf->get_val<uint64_t>("mds_cache_memory_limit");
- cache_reservation = g_conf->get_val<double>("mds_cache_reservation");
- cache_health_threshold = g_conf->get_val<double>("mds_health_cache_threshold");
+ cache_inode_limit = g_conf().get_val<int64_t>("mds_cache_size");
+ cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
+ cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
+ cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
- lru.lru_set_midpoint(g_conf->get_val<double>("mds_cache_mid"));
+ lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));
bottom_lru.lru_set_midpoint(0);
- decayrate.set_halflife(g_conf->mds_decay_halflife);
+ decayrate.set_halflife(g_conf()->mds_decay_halflife);
did_shutdown_log_cap = false;
+
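+ // global_snaprealm is new in this change; it is opened later during
+ // startup, so start from an explicit null here.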
+ global_snaprealm = NULL;
}
MDCache::~MDCache()
}
}
-void MDCache::handle_conf_change(const struct md_config_t *conf,
+void MDCache::handle_conf_change(const ConfigProxy& conf,
const std::set <std::string> &changed,
const MDSMap &mdsmap)
{
if (changed.count("mds_cache_size"))
- cache_inode_limit = g_conf->get_val<int64_t>("mds_cache_size");
+ cache_inode_limit = g_conf().get_val<int64_t>("mds_cache_size");
if (changed.count("mds_cache_memory_limit"))
- cache_memory_limit = g_conf->get_val<uint64_t>("mds_cache_memory_limit");
+ cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
if (changed.count("mds_cache_reservation"))
- cache_reservation = g_conf->get_val<double>("mds_cache_reservation");
+ cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
if (changed.count("mds_health_cache_threshold"))
- cache_health_threshold = g_conf->get_val<double>("mds_health_cache_threshold");
+ cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
if (changed.count("mds_cache_mid"))
- lru.lru_set_midpoint(g_conf->get_val<double>("mds_cache_mid"));
+ lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));
if (changed.count("mds_cache_trim_decay_rate")) {
- trim_counter = DecayCounter(ceph_clock_now(), g_conf->get_val<double>("mds_cache_trim_decay_rate"));
+ trim_counter = DecayCounter(g_conf().get_val<double>("mds_cache_trim_decay_rate"));
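+ // DecayCounter now timestamps itself internally (note the dropped
+ // ceph_clock_now() argument), so a conf change simply swaps in a fresh
+ // counter at the new rate.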
}
migrator->handle_conf_change(conf, changed, mdsmap);
// add to lru, inode map
if (in->last == CEPH_NOSNAP) {
auto &p = inode_map[in->ino()];
- assert(!p); // should be no dup inos!
+ ceph_assert(!p); // should be no dup inos!
p = in;
} else {
auto &p = snap_inode_map[in->vino()];
- assert(!p); // should be no dup inos!
+ ceph_assert(!p); // should be no dup inos!
p = in;
}
if (o->get_parent_dn()) {
// FIXME: multiple parents?
CDentry *dn = o->get_parent_dn();
- assert(!dn->is_dirty());
+ ceph_assert(!dn->is_dirty());
dn->dir->unlink_inode(dn); // leave dentry ... FIXME?
}
export_pin_queue.erase(o);
// remove from inode map
- if (o->last == CEPH_NOSNAP)
+ if (o->last == CEPH_NOSNAP) {
inode_map.erase(o->ino());
- else
+ } else {
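+ // new: a snapped inode may still be linked on the list of cap-bearing
+ // inodes; unlink item_caps before the inode goes away.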
+ o->item_caps.remove_myself();
snap_inode_map.erase(o->vino());
+ }
if (o->ino() < MDS_INO_SYSTEM_BASE) {
if (o == root) root = 0;
}
if (o->is_base())
base_inodes.erase(o);
- }
+ }
// delete it
- assert(o->get_num_ref() == 0);
+ ceph_assert(o->get_num_ref() == 0);
delete o;
}
{
file_layout_t result = file_layout_t::get_default();
result.pool_id = mdsmap.get_metadata_pool();
- if (g_conf->mds_log_segment_size > 0) {
- result.object_size = g_conf->mds_log_segment_size;
- result.stripe_unit = g_conf->mds_log_segment_size;
+ if (g_conf()->mds_log_segment_size > 0) {
+ result.object_size = g_conf()->mds_log_segment_size;
+ result.stripe_unit = g_conf()->mds_log_segment_size;
}
return result;
}
memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
if (in->inode.is_dir()) {
- in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
+ in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
in->inode.rstat.rsubdirs = 1; /* itself */
in->inode.rstat.rctime = in->inode.ctime;
} else {
else
in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
in->open_snaprealm(); // empty snaprealm
- assert(!in->snaprealm->parent); // created its own
+ ceph_assert(!in->snaprealm->parent); // created its own
in->snaprealm->srnode.seq = 1;
}
}
CInode *MDCache::create_root_inode()
{
CInode *i = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755);
- i->inode.uid = g_conf->mds_root_ino_uid;
- i->inode.gid = g_conf->mds_root_ino_gid;
+ i->inode.uid = g_conf()->mds_root_ino_uid;
+ i->inode.gid = g_conf()->mds_root_ino_gid;
i->inode.layout = default_file_layout;
i->inode.layout.pool_id = mds->mdsmap->get_first_data_pool();
return i;
adjust_subtree_auth(rootdir, mds->get_nodeid());
rootdir->dir_rep = CDir::REP_ALL; //NONE;
- assert(rootdir->fnode.accounted_fragstat == rootdir->fnode.fragstat);
- assert(rootdir->fnode.fragstat == root->inode.dirstat);
- assert(rootdir->fnode.accounted_rstat == rootdir->fnode.rstat);
+ ceph_assert(rootdir->fnode.accounted_fragstat == rootdir->fnode.fragstat);
+ ceph_assert(rootdir->fnode.fragstat == root->inode.dirstat);
+ ceph_assert(rootdir->fnode.accounted_rstat == rootdir->fnode.rstat);
-/* Do no update rootdir rstat information of the fragment, rstat upkeep magic
- * assume version 0 is stale/invalid.
+/* Do not update rootdir rstat information of the fragment; rstat upkeep magic
+ * assumes version 0 is stale/invalid.
*/
MutationRef mut;
CDentry *dn;
version_t dpv;
- MDSInternalContextBase *fin;
- C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSInternalContextBase *f) :
+ MDSContext *fin;
+ C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSContext *f) :
MDCacheLogContext(c), mut(mu), dn(d), dpv(v), fin(f) {}
void finish(int r) override {
mdcache->_create_system_file_finish(mut, dn, dpv, fin);
}
};
-void MDCache::_create_system_file(CDir *dir, const char *name, CInode *in, MDSInternalContextBase *fin)
+void MDCache::_create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin)
{
dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
CDentry *dn = dir->add_null_dentry(name);
mds->mdlog->flush();
}
-void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSInternalContextBase *fin)
+void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSContext *fin)
{
dout(10) << "_create_system_file_finish " << *dn << dendl;
if (in->inode.is_dir()) {
CDir *dir = in->get_dirfrag(frag_t());
- assert(dir);
+ ceph_assert(dir);
dir->mark_dirty(1, mut->ls);
dir->mark_new(mut->ls);
}
}
};
-void MDCache::open_root_inode(MDSInternalContextBase *c)
+void MDCache::open_root_inode(MDSContext *c)
{
if (mds->get_nodeid() == mds->mdsmap->get_root()) {
CInode *in;
}
}
-void MDCache::open_mydir_inode(MDSInternalContextBase *c)
+void MDCache::open_mydir_inode(MDSContext *c)
{
- MDSGatherBuilder gather(g_ceph_context);
-
CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
- in->fetch(gather.new_sub());
-
- gather.set_finisher(c);
- gather.activate();
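+ // a single fetch needs no MDSGatherBuilder; hand the completion straight
+ // to fetch().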
+ in->fetch(c);
}
-void MDCache::open_mydir_frag(MDSInternalContextBase *c)
+void MDCache::open_mydir_frag(MDSContext *c)
{
open_mydir_inode(
new MDSInternalContextWrapper(mds,
return;
}
CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
- assert(mydir);
+ ceph_assert(mydir);
adjust_subtree_auth(mydir, mds->get_nodeid());
mydir->fetch(c);
})
return;
}
if (mds->get_nodeid() == mds->mdsmap->get_root()) {
- assert(root->is_auth());
+ ceph_assert(root->is_auth());
CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
- assert(rootdir);
+ ceph_assert(rootdir);
if (!rootdir->is_subtree_root())
adjust_subtree_auth(rootdir, mds->get_nodeid());
if (!rootdir->is_complete()) {
return;
}
} else {
- assert(!root->is_auth());
+ ceph_assert(!root->is_auth());
CDir *rootdir = root->get_dirfrag(frag_t());
if (!rootdir) {
open_remote_dirfrag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
return;
}
CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
- assert(mydir);
+ ceph_assert(mydir);
adjust_subtree_auth(mydir, mds->get_nodeid());
populate_mydir();
void MDCache::populate_mydir()
{
- assert(myin);
+ ceph_assert(myin);
CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
- assert(mydir);
+ ceph_assert(mydir);
dout(10) << "populate_mydir " << *mydir << dendl;
new C_MDS_RetryOpenRoot(this));
return;
}
- assert(straydn);
- assert(strays[i]);
+ ceph_assert(straydn);
+ ceph_assert(strays[i]);
// we make multiple passes through this method; make sure we only pin each stray once.
if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
strays[i]->get(CInode::PIN_STRAY);
dout(20) << " stray num " << i << " is " << *strays[i] << dendl;
// open all frags
- list<frag_t> ls;
- strays[i]->dirfragtree.get_leaves(ls);
- for (list<frag_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
- frag_t fg = *p;
- CDir *dir = strays[i]->get_dirfrag(fg);
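+ // frag_vec_t replaces list<frag_t> for leaf enumeration here and in the
+ // hunks below; same contents, contiguous storage.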
+ frag_vec_t leaves;
+ strays[i]->dirfragtree.get_leaves(leaves);
+ for (const auto& leaf : leaves) {
+ CDir *dir = strays[i]->get_dirfrag(leaf);
if (!dir) {
- dir = strays[i]->get_or_open_dirfrag(this, fg);
+ dir = strays[i]->get_or_open_dirfrag(this, leaf);
}
// DamageTable applies special handling to strays: it will
// have damaged() us out if one is damaged.
- assert(!dir->state_test(CDir::STATE_BADFRAG));
+ ceph_assert(!dir->state_test(CDir::STATE_BADFRAG));
if (dir->get_version() == 0) {
dir->fetch(new C_MDS_RetryOpenRoot(this));
}
}
- stray_manager.set_num_strays(num_strays);
-
// okay!
dout(10) << "populate_mydir done" << dendl;
- assert(!open);
+ ceph_assert(!open);
open = true;
mds->queue_waiters(waiting_for_open);
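+ // moved below open=true (and activate() is new): stray purging should
+ // only begin once mydir is fully populated.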
+ stray_manager.set_num_strays(num_strays);
+ stray_manager.activate();
+
scan_stray_dir();
}
-void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSInternalContextBase *fin)
+void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSContext *fin)
{
discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1)));
}
in->name_stray_dentry(straydname);
CInode *strayi = get_stray();
- assert(strayi);
+ ceph_assert(strayi);
frag_t fg = strayi->pick_dirfrag(straydname);
CDir *straydir = strayi->get_dirfrag(fg);
- assert(straydir);
+ ceph_assert(straydir);
return straydir;
}
straydn = straydir->add_null_dentry(straydname);
straydn->mark_new();
} else {
- assert(straydn->get_projected_linkage()->is_null());
+ ceph_assert(straydn->get_projected_linkage()->is_null());
}
straydn->state_set(CDentry::STATE_STRAY);
-MDSCacheObject *MDCache::get_object(MDSCacheObjectInfo &info)
+MDSCacheObject *MDCache::get_object(const MDSCacheObjectInfo &info)
{
// inode?
if (info.ino)
// ====================================================================
// subtree management
-void MDCache::list_subtrees(list<CDir*>& ls)
-{
- for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
- p != subtrees.end();
- ++p)
- ls.push_back(p->first);
-}
-
/*
* adjust the dir_auth of a subtree.
* merge with parent and/or child subtrees, if is it appropriate.
} else {
root = get_subtree_root(dir); // subtree root
}
- assert(root);
- assert(subtrees.count(root));
+ ceph_assert(root);
+ ceph_assert(subtrees.count(root));
dout(7) << " current root is " << *root << dendl;
if (root == dir) {
} else {
// i am a new subtree.
dout(10) << " new subtree at " << *dir << dendl;
- assert(subtrees.count(dir) == 0);
+ ceph_assert(subtrees.count(dir) == 0);
subtrees[dir]; // create empty subtree bounds list for me.
dir->get(CDir::PIN_SUBTREE);
// adjust recursive pop counters
if (adjust_pop && dir->is_auth()) {
- utime_t now = ceph_clock_now();
CDir *p = dir->get_parent_dir();
while (p) {
- p->pop_auth_subtree.sub(now, decayrate, dir->pop_auth_subtree);
+ p->pop_auth_subtree.sub(dir->pop_auth_subtree);
if (p->is_subtree_root()) break;
p = p->inode->get_parent_dir();
}
return;
auto it = subtrees.find(dir);
- assert(it != subtrees.end());
+ ceph_assert(it != subtrees.end());
// merge with parent?
CDir *parent = dir;
// adjust popularity?
if (adjust_pop && dir->is_auth()) {
- utime_t now = ceph_clock_now();
CDir *cur = dir;
CDir *p = dir->get_parent_dir();
while (p) {
- p->pop_auth_subtree.add(now, decayrate, dir->pop_auth_subtree);
+ p->pop_auth_subtree.add(dir->pop_auth_subtree);
p->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
if (p->is_subtree_root()) break;
cur = p;
{
// evaluate subtree inode filelock?
// (we should scatter the filelock on subtree bounds)
- assert(diri->is_auth());
+ ceph_assert(diri->is_auth());
mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
}
-void MDCache::adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, mds_authority_t auth)
+void MDCache::adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_authority_t auth)
{
dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
<< " on " << *dir
} else {
root = get_subtree_root(dir); // subtree root
}
- assert(root);
- assert(subtrees.count(root));
+ ceph_assert(root);
+ ceph_assert(subtrees.count(root));
dout(7) << " current root is " << *root << dendl;
mds_authority_t oldauth = dir->authority();
} else {
// i am a new subtree.
dout(10) << " new subtree at " << *dir << dendl;
- assert(subtrees.count(dir) == 0);
+ ceph_assert(subtrees.count(dir) == 0);
subtrees[dir]; // create empty subtree bounds list for me.
dir->get(CDir::PIN_SUBTREE);
// - these may be new, or
// - beneath existing ambiguous bounds (which will be collapsed),
// - but NOT beneath unambiguous bounds.
- for (set<CDir*>::iterator p = bounds.begin();
- p != bounds.end();
- ++p) {
- CDir *bound = *p;
-
+ for (const auto& bound : bounds) {
// new bound?
if (subtrees[dir].count(bound) == 0) {
if (get_subtree_root(bound) == dir) {
dout(10) << " want bound " << *bound << dendl;
CDir *t = get_subtree_root(bound->get_parent_dir());
if (subtrees[t].count(bound) == 0) {
- assert(t != dir);
+ ceph_assert(t != dir);
dout(10) << " new bound " << *bound << dendl;
adjust_subtree_auth(bound, t->authority());
}
* fragmentation as necessary to get an equivalent bounding set. That is, only
* split if one of our frags spans the provided bounding set. Never merge.
*/
-void MDCache::get_force_dirfrag_bound_set(vector<dirfrag_t>& dfs, set<CDir*>& bounds)
+void MDCache::get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir*>& bounds)
{
dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;
// sort by ino
map<inodeno_t, fragset_t> byino;
- for (vector<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
- byino[p->ino].insert(p->frag);
+ for (auto& frag : dfs) {
+ byino[frag.ino].insert(frag.frag);
+ }
dout(10) << " by ino: " << byino << dendl;
for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
tmpdft.force_to_leaf(g_ceph_context, *q);
- for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
- frag_t fg = *q;
- list<frag_t> fgls;
- diri->dirfragtree.get_leaves_under(fg, fgls);
- if (fgls.empty()) {
+ for (const auto& fg : p->second) {
+ frag_vec_t leaves;
+ diri->dirfragtree.get_leaves_under(fg, leaves);
+ if (leaves.empty()) {
bool all = true;
frag_t approx_fg = diri->dirfragtree[fg.value()];
- list<frag_t> ls;
- tmpdft.get_leaves_under(approx_fg, ls);
- for (list<frag_t>::iterator r = ls.begin(); r != ls.end(); ++r) {
- if (p->second.get().count(*r) == 0) {
+ frag_vec_t approx_leaves;
+ tmpdft.get_leaves_under(approx_fg, approx_leaves);
+ for (const auto& leaf : approx_leaves) {
+ if (p->second.get().count(leaf) == 0) {
// not bound, so the resolve message is from auth MDS of the dirfrag
- force_dir_fragment(diri, *r);
+ force_dir_fragment(diri, leaf);
all = false;
}
}
if (all)
- fgls.push_back(approx_fg);
+ leaves.push_back(approx_fg);
else
- diri->dirfragtree.get_leaves_under(fg, fgls);
+ diri->dirfragtree.get_leaves_under(fg, leaves);
}
- dout(10) << " frag " << fg << " contains " << fgls << dendl;
- for (list<frag_t>::iterator r = fgls.begin(); r != fgls.end(); ++r) {
- CDir *dir = diri->get_dirfrag(*r);
+ dout(10) << " frag " << fg << " contains " << leaves << dendl;
+ for (const auto& leaf : leaves) {
+ CDir *dir = diri->get_dirfrag(leaf);
if (dir)
bounds.insert(dir);
}
}
}
-void MDCache::adjust_bounded_subtree_auth(CDir *dir, vector<dirfrag_t>& bound_dfs, mds_authority_t auth)
+void MDCache::adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bound_dfs, const mds_authority_t &auth)
{
dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
<< " on " << *dir << " bound_dfs " << bound_dfs << dendl;
adjust_bounded_subtree_auth(dir, bounds, auth);
}
-void MDCache::map_dirfrag_set(list<dirfrag_t>& dfs, set<CDir*>& result)
+void MDCache::map_dirfrag_set(const list<dirfrag_t>& dfs, set<CDir*>& result)
{
dout(10) << "map_dirfrag_set " << dfs << dendl;
// group by inode
map<inodeno_t, fragset_t> ino_fragset;
- for (list<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
- ino_fragset[p->ino].insert(p->frag);
+ for (const auto &df : dfs) {
+ ino_fragset[df.ino].insert(df.frag);
+ }
// get frags
for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin();
if (!in)
continue;
- list<frag_t> fglist;
- for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
- in->dirfragtree.get_leaves_under(*q, fglist);
+ frag_vec_t fgs;
+ for (const auto& fg : p->second) {
+ in->dirfragtree.get_leaves_under(fg, fgs);
+ }
- dout(15) << "map_dirfrag_set " << p->second << " -> " << fglist
+ dout(15) << "map_dirfrag_set " << p->second << " -> " << fgs
<< " on " << *in << dendl;
- for (list<frag_t>::iterator q = fglist.begin(); q != fglist.end(); ++q) {
- CDir *dir = in->get_dirfrag(*q);
+ for (const auto& fg : fgs) {
+ CDir *dir = in->get_dirfrag(fg);
if (dir)
result.insert(dir);
}
void MDCache::remove_subtree(CDir *dir)
{
dout(10) << "remove_subtree " << *dir << dendl;
- assert(subtrees.count(dir));
- assert(subtrees[dir].empty());
+ ceph_assert(subtrees.count(dir));
+ ceph_assert(subtrees[dir].empty());
subtrees.erase(dir);
dir->put(CDir::PIN_SUBTREE);
if (dir->get_parent_dir()) {
CDir *p = get_subtree_root(dir->get_parent_dir());
- assert(subtrees[p].count(dir));
+ ceph_assert(subtrees[p].count(dir));
subtrees[p].erase(dir);
}
}
void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds)
{
- assert(subtrees.count(dir));
+ ceph_assert(subtrees.count(dir));
bounds = subtrees[dir];
}
CDir *t = *p;
while (t != root) {
t = t->get_parent_dir();
- assert(t);
+ ceph_assert(t);
if (t == dir) {
bounds.insert(*p);
continue;
void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
{
// for debugging only.
- assert(subtrees.count(dir));
+ ceph_assert(subtrees.count(dir));
if (bounds != subtrees[dir]) {
dout(0) << "verify_subtree_bounds failed" << dendl;
set<CDir*> b = bounds;
for (const auto &cd : b)
dout(0) << " extra bound " << *cd << dendl;
}
- assert(bounds == subtrees[dir]);
+ ceph_assert(bounds == subtrees[dir]);
}
void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
{
// for debugging only.
- assert(subtrees.count(dir));
+ ceph_assert(subtrees.count(dir));
// make sure that any bounds i do have are properly noted as such.
int failed = 0;
failed++;
}
}
- assert(failed == 0);
+ ceph_assert(failed == 0);
}
void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir)
{
dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;
- //show_subtrees();
- utime_t now = ceph_clock_now();
-
CDir *newdir = diri->get_parent_dir();
if (pop) {
map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
- assert(p != projected_subtree_renames.end());
- assert(!p->second.empty());
- assert(p->second.front().first == olddir);
- assert(p->second.front().second == newdir);
+ ceph_assert(p != projected_subtree_renames.end());
+ ceph_assert(!p->second.empty());
+ ceph_assert(p->second.front().first == olddir);
+ ceph_assert(p->second.front().second == newdir);
p->second.pop_front();
if (p->second.empty())
projected_subtree_renames.erase(p);
}
+ vector<CDir*> dfls;
+
+ // adjust total auth pin of freezing subtree
+ if (olddir != newdir) {
+ diri->get_nested_dirfrags(dfls);
+ for (auto dir : dfls)
+ olddir->adjust_freeze_after_rename(dir);
+ dfls.clear();
+ }
+
// adjust subtree
- list<CDir*> dfls;
// make sure subtree dirfrags are at the front of the list
diri->get_subtree_dirfrags(dfls);
diri->get_nested_dirfrags(dfls);
- for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
- CDir *dir = *p;
-
+ for (auto dir : dfls) {
dout(10) << "dirfrag " << *dir << dendl;
CDir *oldparent = get_subtree_root(olddir);
dout(10) << " old parent " << *oldparent << dendl;
dout(10) << " new parent " << *newparent << dendl;
if (olddir != newdir)
- mds->balancer->adjust_pop_for_rename(olddir, dir, now, false);
+ mds->balancer->adjust_pop_for_rename(olddir, dir, false);
if (oldparent == newparent) {
dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
} else if (dir->is_subtree_root()) {
// children are fine. change parent.
dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
- assert(subtrees[oldparent].count(dir));
+ ceph_assert(subtrees[oldparent].count(dir));
subtrees[oldparent].erase(dir);
- assert(subtrees.count(newparent));
+ ceph_assert(subtrees.count(newparent));
subtrees[newparent].insert(dir);
// caller is responsible for 'eval diri'
try_subtree_merge_at(dir, NULL, false);
CDir *bound = *p;
CDir *broot = get_subtree_root(bound->get_parent_dir());
if (broot != oldparent) {
- assert(broot == newparent);
+ ceph_assert(broot == newparent);
tomove.push_back(bound);
}
}
}
if (olddir != newdir)
- mds->balancer->adjust_pop_for_rename(newdir, dir, now, true);
+ mds->balancer->adjust_pop_for_rename(newdir, dir, true);
}
show_subtrees();
}
-
-void MDCache::get_fullauth_subtrees(set<CDir*>& s)
-{
- for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
- p != subtrees.end();
- ++p) {
- CDir *root = p->first;
- if (root->is_full_dir_auth())
- s.insert(root);
- }
-}
-void MDCache::get_auth_subtrees(set<CDir*>& s)
-{
- for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
- p != subtrees.end();
- ++p) {
- CDir *root = p->first;
- if (root->is_auth())
- s.insert(root);
- }
-}
-
-
-// count.
-
-int MDCache::num_subtrees()
-{
- return subtrees.size();
-}
-
-int MDCache::num_subtrees_fullauth()
-{
- int n = 0;
- for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
- p != subtrees.end();
- ++p) {
- CDir *root = p->first;
- if (root->is_full_dir_auth())
- n++;
- }
- return n;
-}
-
-int MDCache::num_subtrees_fullnonauth()
-{
- int n = 0;
- for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
- p != subtrees.end();
- ++p) {
- CDir *root = p->first;
- if (root->is_full_dir_nonauth())
- n++;
- }
- return n;
-}
-
-
-
// ===================================
// journal and snap/cow helpers
CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
{
dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
- assert(in->last == CEPH_NOSNAP);
+ ceph_assert(in->last == CEPH_NOSNAP);
auto p = snap_inode_map.upper_bound(vinodeno_t(in->ino(), follows));
if (p != snap_inode_map.end() && p->second->ino() == in->ino()) {
*/
CInode *MDCache::cow_inode(CInode *in, snapid_t last)
{
- assert(last >= in->first);
+ ceph_assert(last >= in->first);
CInode *oldin = new CInode(this, true, in->first, last);
oldin->inode = *in->get_previous_projected_inode();
- oldin->symlink = in->symlink;
oldin->xattrs = *in->get_previous_projected_xattrs();
+ oldin->symlink = in->symlink;
oldin->inode.trim_client_ranges(last);
if (in->first < in->oldest_snap)
if (in->last != CEPH_NOSNAP) {
CInode *head_in = get_inode(in->ino());
- assert(head_in);
+ ceph_assert(head_in);
if (head_in->split_need_snapflush(oldin, in)) {
oldin->client_snap_caps = in->client_snap_caps;
for (const auto &p : in->client_snap_caps) {
SimpleLock *lock = oldin->get_lock(p.first);
- assert(lock);
+ ceph_assert(lock);
for (const auto &q : p.second) {
oldin->auth_pin(lock);
lock->set_state(LOCK_SNAP_SYNC); // gathering
// clone caps?
for (auto &p : in->client_caps) {
client_t client = p.first;
- Capability *cap = p.second;
- int issued = cap->issued();
+ Capability *cap = &p.second;
+ int issued = cap->need_snapflush() ? CEPH_CAP_ANY_WR : cap->issued();
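+ // a cap already awaiting snapflush is treated as holding all write caps,
+ // so the new old-inode below is marked for snapflush as well.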
if ((issued & CEPH_CAP_ANY_WR) &&
cap->client_follows < last) {
// note in oldin
if (issued & cinode_lock_info[i].wr_caps) {
int lockid = cinode_lock_info[i].lock;
SimpleLock *lock = oldin->get_lock(lockid);
- assert(lock);
+ ceph_assert(lock);
oldin->client_snap_caps[lockid].insert(client);
oldin->auth_pin(lock);
lock->set_state(LOCK_SNAP_SYNC); // gathering
return;
}
dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
- assert(dn->is_auth());
+ ceph_assert(dn->is_auth());
// nothing to cow on a null dentry, fix caller
if (!dnl)
dnl = dn->get_projected_linkage();
- assert(!dnl->is_null());
+ ceph_assert(!dnl->is_null());
- if (dnl->is_primary() && dnl->get_inode()->is_multiversion()) {
+ CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
+ bool cow_head = false;
+ if (in && in->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
+ ceph_assert(in->is_frozen_inode());
+ cow_head = true;
+ }
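+ // mid-rename (ambiguous auth, frozen) inodes are not projected (see the
+ // slave rename prep note further down), so cow the head copy instead.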
+ if (in && (in->is_multiversion() || cow_head)) {
// multiversion inode.
- CInode *in = dnl->get_inode();
SnapRealm *realm = NULL;
if (in->get_projected_parent_dn() != dn) {
- assert(follows == CEPH_NOSNAP);
+ ceph_assert(follows == CEPH_NOSNAP);
realm = dn->dir->inode->find_snaprealm();
- snapid_t dir_follows = realm->get_newest_snap();
+ snapid_t dir_follows = get_global_snaprealm()->get_newest_seq();
+ ceph_assert(dir_follows >= realm->get_newest_seq());
if (dir_follows+1 > dn->first) {
snapid_t oldfirst = dn->first;
// FIXME: adjust link count here? hmm.
if (dir_follows+1 > in->first)
- in->cow_old_inode(dir_follows, false);
+ in->cow_old_inode(dir_follows, cow_head);
}
}
+ follows = dir_follows;
if (in->snaprealm) {
realm = in->snaprealm;
- follows = realm->get_newest_seq();
- } else
- follows = dir_follows;
+ ceph_assert(follows >= realm->get_newest_seq());
+ }
} else {
realm = in->find_snaprealm();
- if (follows == CEPH_NOSNAP)
- follows = realm->get_newest_seq();
+ if (follows == CEPH_NOSNAP) {
+ follows = get_global_snaprealm()->get_newest_seq();
+ ceph_assert(follows >= realm->get_newest_seq());
+ }
}
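+ // the global snaprealm's seq is always >= any local realm's newest seq,
+ // so COW can key off the global value; the asserts above check that
+ // invariant.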
// already cloned?
return;
}
- in->cow_old_inode(follows, false);
+ in->cow_old_inode(follows, cow_head);
} else {
SnapRealm *realm = dn->dir->inode->find_snaprealm();
- if (follows == CEPH_NOSNAP)
- follows = realm->get_newest_seq();
+ if (follows == CEPH_NOSNAP) {
+ follows = get_global_snaprealm()->get_newest_seq();
+ ceph_assert(follows >= realm->get_newest_seq());
+ }
// already cloned?
if (follows < dn->first) {
snapid_t oldfirst = dn->first;
dn->first = follows+1;
- CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
-
if (!realm->has_snaps_in_range(oldfirst, follows)) {
dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
if (in)
mut->add_cow_inode(oldin);
if (pcow_inode)
*pcow_inode = oldin;
- CDentry *olddn = dn->dir->add_primary_dentry(dn->get_name(), oldin, oldfirst, oldin->last);
+ CDentry *olddn = dn->dir->add_primary_dentry(dn->get_name(), oldin, oldfirst, follows);
oldin->inode.version = olddn->pre_dirty();
dout(10) << " olddn " << *olddn << dendl;
bool need_snapflush = !oldin->client_snap_caps.empty();
- if (need_snapflush)
+ if (need_snapflush) {
mut->ls->open_files.push_back(&oldin->item_open_file);
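+ // new: also register oldin with the locker, which now tracks inodes
+ // awaiting client snapflushes.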
+ mds->locker->mark_need_snapflush_inode(oldin);
+ }
metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
mut->add_cow_dentry(olddn);
} else {
- assert(dnl->is_remote());
+ ceph_assert(dnl->is_remote());
CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), dnl->get_remote_ino(), dnl->get_remote_d_type(),
oldfirst, follows);
olddn->pre_dirty();
void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
{
if (in->is_base()) {
- metablob->add_root(true, in, in->get_projected_inode());
+ metablob->add_root(true, in);
} else {
if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
follows = in->first - 1;
const set<snapid_t> snaps = prealm->get_snaps();
if (cur->last != CEPH_NOSNAP) {
- assert(cur->dirty_old_rstats.empty());
- set<snapid_t>::const_iterator q = snaps.lower_bound(MAX(first, floor));
+ ceph_assert(cur->dirty_old_rstats.empty());
+ set<snapid_t>::const_iterator q = snaps.lower_bound(std::max(first, floor));
if (q == snaps.end() || *q > cur->last)
return;
}
if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
-// rename src inode is not projected in the slave rename prep case. so we should
-// avoid updateing the inode.
+// rename src inode is not projected in the slave rename prep case, so we
+// should avoid updating the inode.
- assert(linkunlink < 0);
- assert(cur->is_frozen_inode());
+ ceph_assert(linkunlink < 0);
+ ceph_assert(cur->is_frozen_inode());
update = false;
}
- _project_rstat_inode_to_frag(*curi, MAX(first, floor), cur->last, parent,
+ _project_rstat_inode_to_frag(*curi, std::max(first, floor), cur->last, parent,
linkunlink, update);
}
- if (g_conf->mds_snap_rstat) {
+ if (g_conf()->mds_snap_rstat) {
for (const auto &p : cur->dirty_old_rstats) {
auto &old = cur->old_inodes[p];
snapid_t ofirst = std::max(old.first, floor);
snapid_t first;
fnode_t *pf = parent->get_projected_fnode();
if (last == CEPH_NOSNAP) {
- if (g_conf->mds_snap_rstat)
- first = MAX(ofirst, parent->first);
+ if (g_conf()->mds_snap_rstat)
+ first = std::max(ofirst, parent->first);
else
first = parent->first;
prstat = &pf->rstat;
parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
}
parent->first = first;
- } else if (!g_conf->mds_snap_rstat) {
+ } else if (!g_conf()->mds_snap_rstat) {
// drop snapshots' rstats
break;
} else if (last >= parent->first) {
// apply
dout(20) << " project to [" << first << "," << last << "] " << *prstat << dendl;
- assert(last >= first);
+ ceph_assert(last >= first);
prstat->add(delta);
if (update_inode)
inode.accounted_rstat = inode.rstat;
snapid_t first;
if (last == pin->last) {
pi = pin->get_projected_inode();
- first = MAX(ofirst, pin->first);
+ first = std::max(ofirst, pin->first);
if (first > pin->first) {
auto &old = pin->cow_old_inode(first-1, cow_head);
dout(20) << " cloned old_inode rstat is " << old.inode.rstat << dendl;
void MDCache::broadcast_quota_to_client(CInode *in, client_t exclude_ct, bool quota_change)
{
+ if (!(mds->is_active() || mds->is_stopping()))
+ return;
+
if (!in->is_auth() || in->is_frozen())
return;
!quota_change)
return;
- for (map<client_t,Capability*>::iterator it = in->client_caps.begin();
- it != in->client_caps.end();
- ++it) {
- Session *session = mds->get_session(it->first);
- if (!session || !session->connection ||
- !session->connection->has_feature(CEPH_FEATURE_MDS_QUOTA))
- continue;
+ // create snaprealm for quota inode (quota was set before mimic)
+ if (!in->get_projected_srnode())
+ mds->server->create_quota_realm(in);
- Capability *cap = it->second;
+ for (auto &p : in->client_caps) {
+ Capability *cap = &p.second;
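+ // the per-connection CEPH_FEATURE_MDS_QUOTA probe above is replaced by a
+ // per-cap noquota flag.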
+ if (cap->is_noquota())
+ continue;
- if (exclude_ct >= 0 && exclude_ct != it->first)
+ if (exclude_ct >= 0 && exclude_ct != p.first)
goto update;
if (cap->last_rbytes == i->rstat.rbytes &&
cap->last_rsize = i->rstat.rsize();
cap->last_rbytes = i->rstat.rbytes;
- MClientQuota *msg = new MClientQuota();
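+ // the ::create() factories return ref-counted message handles, which is
+ // why the explicit m->put() calls disappear throughout this change.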
+ auto msg = MClientQuota::create();
msg->ino = in->ino();
msg->rstat = i->rstat;
msg->quota = i->quota;
- mds->send_message_client_counted(msg, session->connection);
+ mds->send_message_client_counted(msg, cap->get_session());
}
for (const auto &it : in->get_replicas()) {
- MGatherCaps *msg = new MGatherCaps;
+ auto msg = MGatherCaps::create();
msg->ino = in->ino();
mds->send_message_mds(msg, it.first);
}
bool do_parent_mtime = flags & PREDIRTY_DIR;
bool shallow = flags & PREDIRTY_SHALLOW;
- assert(mds->mdlog->entry_is_open());
+ ceph_assert(mds->mdlog->entry_is_open());
// make sure stamp is set
if (mut->get_mds_stamp() == utime_t())
<< " " << *in << dendl;
if (!parent) {
- assert(primary_dn);
+ ceph_assert(primary_dn);
parent = in->get_projected_parent_dn()->get_dir();
}
bool first = true;
while (parent) {
//assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack
- assert(parent->is_auth());
+ ceph_assert(parent->is_auth());
// opportunistically adjust parent dirfrag
CInode *pin = parent->get_inode();
pf->version = parent->pre_dirty();
if (do_parent_mtime || linkunlink) {
- assert(mut->wrlocks.count(&pin->filelock));
- assert(mut->wrlocks.count(&pin->nestlock));
- assert(cfollows == CEPH_NOSNAP);
+ ceph_assert(mut->is_wrlocked(&pin->filelock));
+ ceph_assert(mut->is_wrlocked(&pin->nestlock));
+ ceph_assert(cfollows == CEPH_NOSNAP);
// update stale fragstat/rstat?
parent->resync_accounted_fragstat();
// because we are about to write into the dirfrag fnode and that needs
// to commit before the lock can cycle.
if (linkunlink) {
- assert(pin->nestlock.get_num_wrlocks() || mut->is_slave());
+ ceph_assert(pin->nestlock.get_num_wrlocks() || mut->is_slave());
}
- if (mut->wrlocks.count(&pin->nestlock) == 0) {
+ if (!mut->is_wrlocked(&pin->nestlock)) {
dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl;
mds->locker->wrlock_force(&pin->nestlock, mut);
}
// delay propagating until later?
if (!stop && !first &&
- g_conf->mds_dirstat_min_interval > 0) {
+ g_conf()->mds_dirstat_min_interval > 0) {
double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
- if (since_last_prop < g_conf->mds_dirstat_min_interval) {
+ if (since_last_prop < g_conf()->mds_dirstat_min_interval) {
dout(10) << "predirty_journal_parents last prop " << since_last_prop
- << " < " << g_conf->mds_dirstat_min_interval
+ << " < " << g_conf()->mds_dirstat_min_interval
<< ", stopping" << dendl;
stop = true;
} else {
// can cast only because i'm passing nowait=true in the sole user
MDRequestRef mdmut = static_cast<MDRequestImpl*>(mut.get());
if (!stop &&
- mut->wrlocks.count(&pin->nestlock) == 0 &&
+ !mut->is_wrlocked(&pin->nestlock) &&
(!pin->versionlock.can_wrlock() || // make sure we can take versionlock, too
//true
!mds->locker->wrlock_start(&pin->nestlock, mdmut, true)
}
break;
}
- if (!mut->wrlocks.count(&pin->versionlock))
+ if (!mut->is_wrlocked(&pin->versionlock))
mds->locker->local_wrlock_grab(&pin->versionlock, mut);
- assert(mut->wrlocks.count(&pin->nestlock) ||
- mut->is_slave());
+ ceph_assert(mut->is_wrlocked(&pin->nestlock) || mut->is_slave());
pin->last_dirstat_prop = mut->get_mds_stamp();
if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
if (pi.inode.dirstat.size() < 0)
- assert(!"negative dirstat size" == g_conf->mds_verify_scatter);
+ ceph_assert(!"negative dirstat size" == g_conf()->mds_verify_scatter);
if (pi.inode.dirstat.size() != pf->fragstat.size()) {
mds->clog->error() << "unmatched fragstat size on single dirfrag "
<< parent->dirfrag() << ", inode has " << pi.inode.dirstat
// trust the dirfrag for now
pi.inode.dirstat = pf->fragstat;
- assert(!"unmatched fragstat size" == g_conf->mds_verify_scatter);
+ ceph_assert(!"unmatched fragstat size" == g_conf()->mds_verify_scatter);
}
}
}
if (pin->is_base())
break;
parentdn = pin->get_projected_parent_dn();
- assert(parentdn);
+ ceph_assert(parentdn);
// rstat
dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;
// first, if the frag is stale, bring it back in sync.
parent->resync_accounted_rstat();
- if (g_conf->mds_snap_rstat) {
+ if (g_conf()->mds_snap_rstat) {
for (auto &p : parent->dirty_old_rstat) {
project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat, p.second.first,
p.first, pin, true);
// trust the dirfrag for now
pi.inode.rstat = pf->rstat;
- assert(!"unmatched rstat rbytes" == g_conf->mds_verify_scatter);
+ ceph_assert(!"unmatched rstat rbytes" == g_conf()->mds_verify_scatter);
}
}
}
// now, stick it in the blob
- assert(parent);
- assert(parent->is_auth());
+ ceph_assert(parent);
+ ceph_assert(parent->is_auth());
blob->add_dir_context(parent);
blob->add_dir(parent, true);
for (list<CInode*>::iterator p = lsi.begin();
void MDCache::_logged_master_commit(metareqid_t reqid)
{
dout(10) << "_logged_master_commit " << reqid << dendl;
- assert(uncommitted_masters.count(reqid));
+ ceph_assert(uncommitted_masters.count(reqid));
uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid);
mds->queue_waiters(uncommitted_masters[reqid].waiters);
uncommitted_masters.erase(reqid);
void MDCache::committed_master_slave(metareqid_t r, mds_rank_t from)
{
dout(10) << "committed_master_slave mds." << from << " on " << r << dendl;
- assert(uncommitted_masters.count(r));
+ ceph_assert(uncommitted_masters.count(r));
uncommitted_masters[r].slaves.erase(from);
if (!uncommitted_masters[r].recovering && uncommitted_masters[r].slaves.empty())
log_master_commit(r);
void MDCache::logged_master_update(metareqid_t reqid)
{
dout(10) << "logged_master_update " << reqid << dendl;
- assert(uncommitted_masters.count(reqid));
+ ceph_assert(uncommitted_masters.count(reqid));
uncommitted_masters[reqid].safe = true;
- if (pending_masters.count(reqid)) {
- pending_masters.erase(reqid);
+ auto p = pending_masters.find(reqid);
+ if (p != pending_masters.end()) {
+ pending_masters.erase(p);
if (pending_masters.empty())
process_delayed_resolve();
}
dout(10) << "_logged_slave_commit from mds." << from << " " << reqid << dendl;
// send a message
- MMDSSlaveRequest *req = new MMDSSlaveRequest(reqid, 0, MMDSSlaveRequest::OP_COMMITTED);
+ auto req = MMDSSlaveRequest::create(reqid, 0, MMDSSlaveRequest::OP_COMMITTED);
mds->send_message_mds(req, from);
}
f->close_section();
}
-void MDCache::resolve_start(MDSInternalContext *resolve_done_)
+void MDCache::resolve_start(MDSContext *resolve_done_)
{
dout(10) << "resolve_start" << dendl;
- assert(!resolve_done);
+ ceph_assert(!resolve_done);
resolve_done.reset(resolve_done_);
if (mds->mdsmap->get_root() != mds->get_nodeid()) {
adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
}
resolve_gather = recovery_set;
+
+ resolve_snapclient_commits = mds->snapclient->get_journaled_tids();
}
void MDCache::send_resolves()
{
send_slave_resolves();
+
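+ // a surviving MDS must not send subtree resolves until its snap cache
+ // matches the snapserver; maybe_finish_slave_resolve() re-checks once
+ // the sync completes.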
+ if (!resolve_done) {
+ // I'm survivor: refresh snap cache
+ mds->snapclient->sync(
+ new MDSInternalContextWrapper(mds,
+ new FunctionContext([this](int r) {
+ maybe_finish_slave_resolve();
+ })
+ )
+ );
+ dout(10) << "send_resolves waiting for snapclient cache to sync" << dendl;
+ return;
+ }
if (!resolve_ack_gather.empty()) {
dout(10) << "send_resolves still waiting for resolve ack from ("
<< resolve_ack_gather << ")" << dendl;
return;
}
- if (!need_resolve_rollback.empty()) {
+ if (!resolve_need_rollback.empty()) {
dout(10) << "send_resolves still waiting for rollback to commit on ("
- << need_resolve_rollback << ")" << dendl;
+ << resolve_need_rollback << ")" << dendl;
return;
}
+
send_subtree_resolves();
}
{
dout(10) << "send_slave_resolves" << dendl;
- map<mds_rank_t, MMDSResolve*> resolves;
+ map<mds_rank_t, MMDSResolve::ref> resolves;
if (mds->is_resolve()) {
for (map<mds_rank_t, map<metareqid_t, MDSlaveUpdate*> >::iterator p = uncommitted_slave_updates.begin();
p != uncommitted_slave_updates.end();
++p) {
- resolves[p->first] = new MMDSResolve;
+ resolves[p->first] = MMDSResolve::create();
for (map<metareqid_t, MDSlaveUpdate*>::iterator q = p->second.begin();
q != p->second.end();
++q) {
if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) {
dout(10) << " including uncommitted " << *mdr << dendl;
if (!resolves.count(master))
- resolves[master] = new MMDSResolve;
+ resolves[master] = MMDSResolve::create();
if (!mdr->committing &&
mdr->has_more() && mdr->more()->is_inode_exporter) {
// re-send cap exports
map<client_t, Capability::Export> cap_map;
in->export_client_caps(cap_map);
bufferlist bl;
- ::encode(in->ino(), bl);
- ::encode(cap_map, bl);
+ encode(in->ino(), bl);
+ encode(cap_map, bl);
resolves[master]->add_slave_request(p->first, bl);
} else {
resolves[master]->add_slave_request(p->first, mdr->committing);
}
}
- for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
- p != resolves.end();
- ++p) {
- dout(10) << "sending slave resolve to mds." << p->first << dendl;
- mds->send_message_mds(p->second, p->first);
- resolve_ack_gather.insert(p->first);
+ for (auto &p : resolves) {
+ dout(10) << "sending slave resolve to mds." << p.first << dendl;
+ mds->send_message_mds(p.second, p.first);
+ resolve_ack_gather.insert(p.first);
}
}
return; // not now
}
- map<mds_rank_t, MMDSResolve*> resolves;
+ map<mds_rank_t, MMDSResolve::ref> resolves;
for (set<mds_rank_t>::iterator p = recovery_set.begin();
p != recovery_set.end();
++p) {
if (*p == mds->get_nodeid())
continue;
if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
- resolves[*p] = new MMDSResolve;
+ resolves[*p] = MMDSResolve::create();
}
map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
} else {
// not ambiguous.
- for (map<mds_rank_t, MMDSResolve*>::iterator q = resolves.begin();
- q != resolves.end();
- ++q)
- resolves[q->first]->add_subtree(dir->dirfrag());
+ for (auto &q : resolves) {
+ resolves[q.first]->add_subtree(dir->dirfrag());
+ }
// bounds too
vector<dirfrag_t> dfls;
for (set<CDir*>::iterator q = subtrees[dir].begin();
}
// send
- for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
- p != resolves.end();
- ++p) {
- MMDSResolve* m = p->second;
+ for (auto &p : resolves) {
+ const MMDSResolve::ref &m = p.second;
+ if (mds->is_resolve()) {
+ m->add_table_commits(TABLE_SNAP, resolve_snapclient_commits);
+ } else {
+ m->add_table_commits(TABLE_SNAP, mds->snapclient->get_journaled_tids());
+ }
m->subtrees = my_subtrees;
m->ambiguous_imports = my_ambig_imports;
- dout(10) << "sending subtee resolve to mds." << p->first << dendl;
- mds->send_message_mds(m, p->first);
+ dout(10) << "sending subtee resolve to mds." << p.first << dendl;
+ mds->send_message_mds(m, p.first);
}
resolves_pending = false;
}
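+// new helper: centralizes the "all resolve acks in, all rollbacks done,
+// snap cache usable" check that previously lived inline in
+// handle_resolve_ack().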
+void MDCache::maybe_finish_slave_resolve() {
+ if (resolve_ack_gather.empty() && resolve_need_rollback.empty()) {
+ // snap cache has synced, or I'm in the resolve state
+ if (mds->snapclient->is_synced() || resolve_done)
+ send_subtree_resolves();
+ process_delayed_resolve();
+ }
+}
+
void MDCache::handle_mds_failure(mds_rank_t who)
{
dout(7) << "handle_mds_failure mds." << who << dendl;
remove_ambiguous_slave_update(p->first, mdr->slave_to_mds);
if (!mdr->more()->waiting_on_slave.empty()) {
- assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
+ ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
// will rollback, no need to wait
mdr->reset_slave_request();
mdr->more()->waiting_on_slave.clear();
if (mdr->is_slave() && mdr->slave_did_prepare()) {
if (mdr->more()->waiting_on_slave.count(who)) {
- assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
+ ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
dout(10) << " slave request " << *mdr << " no longer need rename notity ack from mds."
<< who << dendl;
mdr->more()->waiting_on_slave.erase(who);
add_ambiguous_slave_update(p->first, mdr->slave_to_mds);
}
} else if (mdr->slave_request) {
- MMDSSlaveRequest *slave_req = mdr->slave_request;
+ const MMDSSlaveRequest::const_ref &slave_req = mdr->slave_request;
// FIXME: Slave rename request can arrive after we notice mds failure.
// This can cause mds to crash (does not affect integrity of FS).
if (slave_req->get_op() == MMDSSlaveRequest::OP_RENAMEPREP &&
if (mdr->more()->srcdn_auth_mds == who) {
dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
<< who << " to recover" << dendl;
- assert(mdr->more()->witnessed.count(who) == 0);
+ ceph_assert(mdr->more()->witnessed.count(who) == 0);
if (mdr->more()->is_ambiguous_auth)
mdr->clear_ambiguous_auth();
// rename srcdn's auth mds failed, all witnesses will rollback
<< mdr->more()->srcdn_auth_mds << " to reply" << dendl;
// waiting for the slave (rename srcdn's auth mds), delay sending resolve ack
// until either the request is committing or the slave also fails.
- assert(mdr->more()->waiting_on_slave.size() == 1);
+ ceph_assert(mdr->more()->waiting_on_slave.size() == 1);
pending_masters.insert(p->first);
} else {
dout(10) << " master request " << *mdr << " no longer witnessed by slave mds."
<< who << " to recover" << dendl;
if (srcdn_auth >= 0)
- assert(mdr->more()->witnessed.count(srcdn_auth) == 0);
+ ceph_assert(mdr->more()->witnessed.count(srcdn_auth) == 0);
// discard this peer's prepare (if any)
mdr->more()->witnessed.erase(who);
static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR;
static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY;
- list<MDSInternalContextBase*> waiters;
+ MDSContext::vec waiters;
// wake up any waiters in their subtrees
for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
if (dir->authority().first != who ||
dir->authority().second == mds->get_nodeid())
continue;
- assert(!dir->is_auth());
+ ceph_assert(!dir->is_auth());
// wake any waiters
list<CDir*> q;
*
- * This functions puts the passed message before returning
*/
-void MDCache::handle_resolve(MMDSResolve *m)
+void MDCache::handle_resolve(const MMDSResolve::const_ref &m)
{
dout(7) << "handle_resolve from " << m->get_source() << dendl;
mds_rank_t from = mds_rank_t(m->get_source().num());
return;
}
// wait until we reach the resolve stage!
- m->put();
return;
}
if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
if (uncommitted_masters.count(p->first) && !uncommitted_masters[p->first].safe) {
- assert(!p->second.committing);
+ ceph_assert(!p->second.committing);
pending_masters.insert(p->first);
}
}
}
}
- MMDSResolveAck *ack = new MMDSResolveAck;
- for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
- if (uncommitted_masters.count(p->first)) { //mds->sessionmap.have_completed_request(p->first)) {
+ auto ack = MMDSResolveAck::create();
+ for (const auto &p : m->slave_requests) {
+ if (uncommitted_masters.count(p.first)) { //mds->sessionmap.have_completed_request(p.first)) {
// COMMIT
- if (p->second.committing) {
+ if (p.second.committing) {
// already committing, waiting for the OP_COMMITTED slave reply
- dout(10) << " already committing slave request " << *p << " noop "<< dendl;
+ dout(10) << " already committing slave request " << p << " noop "<< dendl;
} else {
- dout(10) << " ambiguous slave request " << *p << " will COMMIT" << dendl;
- ack->add_commit(p->first);
+ dout(10) << " ambiguous slave request " << p << " will COMMIT" << dendl;
+ ack->add_commit(p.first);
}
- uncommitted_masters[p->first].slaves.insert(from); // wait for slave OP_COMMITTED before we log ECommitted
+ uncommitted_masters[p.first].slaves.insert(from); // wait for slave OP_COMMITTED before we log ECommitted
- if (p->second.inode_caps.length() > 0) {
+ if (p.second.inode_caps.length() > 0) {
// slave wants to export caps (rename)
- assert(mds->is_resolve());
+ ceph_assert(mds->is_resolve());
inodeno_t ino;
map<client_t,Capability::Export> cap_exports;
- bufferlist::iterator q = p->second.inode_caps.begin();
- ::decode(ino, q);
- ::decode(cap_exports, q);
+ auto q = p.second.inode_caps.cbegin();
+ decode(ino, q);
+ decode(cap_exports, q);
- assert(get_inode(ino));
+ ceph_assert(get_inode(ino));
for (map<client_t,Capability::Export>::iterator q = cap_exports.begin();
q != cap_exports.end();
rejoin_slave_exports[ino].second.swap(cap_exports);
// send information of imported caps back to slave
- ::encode(rejoin_imported_caps[from][ino], ack->commit[p->first]);
+ encode(rejoin_imported_caps[from][ino], ack->commit[p.first]);
}
} else {
// ABORT
- dout(10) << " ambiguous slave request " << *p << " will ABORT" << dendl;
- assert(!p->second.committing);
- ack->add_abort(p->first);
+ dout(10) << " ambiguous slave request " << p << " will ABORT" << dendl;
+ ceph_assert(!p.second.committing);
+ ack->add_abort(p.first);
}
}
mds->send_message(ack, m->get_connection());
- m->put();
return;
}
- if (!resolve_ack_gather.empty() || !need_resolve_rollback.empty()) {
+ if (!resolve_ack_gather.empty() || !resolve_need_rollback.empty()) {
dout(10) << "delay processing subtree resolve" << dendl;
delayed_resolve[from] = m;
return;
map<dirfrag_t, vector<dirfrag_t> >::iterator next = p;
++next;
CDir *dir = get_dirfrag(p->first);
- assert(dir);
+ ceph_assert(dir);
dout(10) << "checking ambiguous import " << *dir << dendl;
if (migrator->is_importing(dir->dirfrag()) &&
migrator->get_import_peer(dir->dirfrag()) == from) {
- assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING);
+ ceph_assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING);
// check if sender claims the subtree
bool claimed_by_sender = false;
- for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = m->subtrees.begin();
- q != m->subtrees.end();
- ++q) {
+ for (const auto &q : m->subtrees) {
// an ambiguous import won't race with a refragmentation; it's appropriate to force here.
- CDir *base = get_force_dirfrag(q->first, false);
+ CDir *base = get_force_dirfrag(q.first, false);
if (!base || !base->contains(dir))
continue; // base not dir or an ancestor of dir, clearly doesn't claim dir.
bool inside = true;
set<CDir*> bounds;
- get_force_dirfrag_bound_set(q->second, bounds);
+ get_force_dirfrag_bound_set(q.second, bounds);
for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
CDir *bound = *p;
if (bound->contains(dir)) {
// update my dir_auth values
// need to do this on recoverying nodes _and_ bystanders (to resolve ambiguous
// migrations between other nodes)
- for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->subtrees.begin();
- pi != m->subtrees.end();
- ++pi) {
- dout(10) << "peer claims " << pi->first << " bounds " << pi->second << dendl;
- CDir *dir = get_force_dirfrag(pi->first, !survivor);
+ for (const auto& p : m->subtrees) {
+ dout(10) << "peer claims " << p.first << " bounds " << p.second << dendl;
+ CDir *dir = get_force_dirfrag(p.first, !survivor);
if (!dir)
continue;
- adjust_bounded_subtree_auth(dir, pi->second, from);
+ adjust_bounded_subtree_auth(dir, p.second, from);
try_subtree_merge(dir);
}
show_subtrees();
// note ambiguous imports too
- for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->ambiguous_imports.begin();
- pi != m->ambiguous_imports.end();
- ++pi) {
- dout(10) << "noting ambiguous import on " << pi->first << " bounds " << pi->second << dendl;
- other_ambiguous_imports[from][pi->first].swap( pi->second );
+ for (const auto& p : m->ambiguous_imports) {
+ dout(10) << "noting ambiguous import on " << p.first << " bounds " << p.second << dendl;
+ other_ambiguous_imports[from][p.first] = p.second;
+ }
+
+ // learn other mds' pending snaptable commits. later, when resolve finishes, we will
+ // reload the snaptable cache from the snapserver; this keeps the snaptable cache in
+ // sync across all MDSs
+ for (const auto& p : m->table_clients) {
+ dout(10) << " noting " << get_mdstable_name(p.type)
+ << " pending_commits " << p.pending_commits << dendl;
+ MDSTableClient *client = mds->get_table_client(p.type);
+ for (const auto& q : p.pending_commits)
+ client->notify_commit(q);
}
// did i get them all?
resolve_gather.erase(from);
maybe_resolve_finish();
-
- m->put();
}
void MDCache::process_delayed_resolve()
{
dout(10) << "process_delayed_resolve" << dendl;
- map<mds_rank_t, MMDSResolve*> tmp;
+ map<mds_rank_t, MMDSResolve::const_ref> tmp;
tmp.swap(delayed_resolve);
- for (map<mds_rank_t, MMDSResolve*>::iterator p = tmp.begin(); p != tmp.end(); ++p)
- handle_resolve(p->second);
+ for (auto &p : tmp) {
+ handle_resolve(p.second);
+ }
}
void MDCache::discard_delayed_resolve(mds_rank_t who)
{
- if (delayed_resolve.count(who)) {
- delayed_resolve[who]->put();
- delayed_resolve.erase(who);
- }
+ delayed_resolve.erase(who);
}
void MDCache::maybe_resolve_finish()
{
- assert(resolve_ack_gather.empty());
- assert(need_resolve_rollback.empty());
+ ceph_assert(resolve_ack_gather.empty());
+ ceph_assert(resolve_need_rollback.empty());
if (!resolve_gather.empty()) {
dout(10) << "maybe_resolve_finish still waiting for resolves ("
finish_committed_masters();
if (resolve_done) {
- assert(mds->is_resolve());
+ ceph_assert(mds->is_resolve());
trim_unlinked_inodes();
recalc_auth_bits(false);
resolve_done.release()->complete(0);
} else {
+ // I am survivor.
maybe_send_pending_rejoins();
}
}
-/* This functions puts the passed message before returning */
-void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
+void MDCache::handle_resolve_ack(const MMDSResolveAck::const_ref &ack)
{
dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
mds_rank_t from = mds_rank_t(ack->get_source().num());
if (!resolve_ack_gather.count(from) ||
mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
- ack->put();
return;
}
if (ambiguous_slave_updates.count(from)) {
- assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
- assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
+ ceph_assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
+ ceph_assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
}
- for (map<metareqid_t, bufferlist>::iterator p = ack->commit.begin();
- p != ack->commit.end();
- ++p) {
- dout(10) << " commit on slave " << p->first << dendl;
+ for (const auto &p : ack->commit) {
+ dout(10) << " commit on slave " << p.first << dendl;
if (ambiguous_slave_updates.count(from)) {
- remove_ambiguous_slave_update(p->first, from);
+ remove_ambiguous_slave_update(p.first, from);
continue;
}
if (mds->is_resolve()) {
// replay
- MDSlaveUpdate *su = get_uncommitted_slave_update(p->first, from);
- assert(su);
+ MDSlaveUpdate *su = get_uncommitted_slave_update(p.first, from);
+ ceph_assert(su);
// log commit
- mds->mdlog->start_submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", p->first, from,
+ mds->mdlog->start_submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", p.first, from,
ESlaveUpdate::OP_COMMIT, su->origop),
- new C_MDC_SlaveCommit(this, from, p->first));
+ new C_MDC_SlaveCommit(this, from, p.first));
mds->mdlog->flush();
- finish_uncommitted_slave_update(p->first, from);
+ finish_uncommitted_slave_update(p.first, from);
} else {
- MDRequestRef mdr = request_get(p->first);
+ MDRequestRef mdr = request_get(p.first);
// information about master imported caps
- if (p->second.length() > 0)
- mdr->more()->inode_import.claim(p->second);
+ if (p.second.length() > 0)
+ mdr->more()->inode_import.share(p.second);
- assert(mdr->slave_request == 0); // shouldn't be doing anything!
+ ceph_assert(mdr->slave_request == 0); // shouldn't be doing anything!
request_finish(mdr);
}
}
- for (vector<metareqid_t>::iterator p = ack->abort.begin();
- p != ack->abort.end();
- ++p) {
- dout(10) << " abort on slave " << *p << dendl;
+ for (const auto &metareq : ack->abort) {
+ dout(10) << " abort on slave " << metareq << dendl;
if (mds->is_resolve()) {
- MDSlaveUpdate *su = get_uncommitted_slave_update(*p, from);
- assert(su);
+ MDSlaveUpdate *su = get_uncommitted_slave_update(metareq, from);
+ ceph_assert(su);
// perform rollback (and journal a rollback entry)
// note: this will hold up the resolve a bit, until the rollback entries journal.
ceph_abort();
}
} else {
- MDRequestRef mdr = request_get(*p);
+ MDRequestRef mdr = request_get(metareq);
mdr->aborted = true;
if (mdr->slave_request) {
if (mdr->slave_did_prepare()) // journaling slave prepare ?
- add_rollback(*p, from);
+ add_rollback(metareq, from);
} else {
request_finish(mdr);
}
}
}
- if (!ambiguous_slave_updates.count(from))
+ if (!ambiguous_slave_updates.count(from)) {
resolve_ack_gather.erase(from);
- if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
- send_subtree_resolves();
- process_delayed_resolve();
+ maybe_finish_slave_resolve();
}
-
- ack->put();
}
void MDCache::add_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master, MDSlaveUpdate *su)
{
- assert(uncommitted_slave_updates[master].count(reqid) == 0);
+ ceph_assert(uncommitted_slave_updates[master].count(reqid) == 0);
uncommitted_slave_updates[master][reqid] = su;
for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
uncommitted_slave_rename_olddir[*p]++;
void MDCache::finish_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
{
- assert(uncommitted_slave_updates[master].count(reqid));
+ ceph_assert(uncommitted_slave_updates[master].count(reqid));
MDSlaveUpdate* su = uncommitted_slave_updates[master][reqid];
uncommitted_slave_updates[master].erase(reqid);
for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
CInode *diri = *p;
map<CInode*, int>::iterator it = uncommitted_slave_rename_olddir.find(diri);
- assert(it != uncommitted_slave_rename_olddir.end());
+ ceph_assert(it != uncommitted_slave_rename_olddir.end());
it->second--;
if (it->second == 0) {
uncommitted_slave_rename_olddir.erase(it);
}
}
} else
- assert(it->second > 0);
+ ceph_assert(it->second > 0);
}
- // removed the inodes that were unlinked by slave update
+ // remove the inodes that were unlinked by slave update
for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) {
CInode *in = *p;
map<CInode*, int>::iterator it = uncommitted_slave_unlink.find(in);
- assert(it != uncommitted_slave_unlink.end());
+ ceph_assert(it != uncommitted_slave_unlink.end());
it->second--;
if (it->second == 0) {
uncommitted_slave_unlink.erase(it);
if (!in->get_projected_parent_dn())
mds->mdcache->remove_inode_recursive(in);
} else
- assert(it->second > 0);
+ ceph_assert(it->second > 0);
}
delete su;
}
if (uncommitted_slave_updates.count(master) &&
uncommitted_slave_updates[master].count(reqid)) {
su = uncommitted_slave_updates[master][reqid];
- assert(su);
+ ceph_assert(su);
}
return su;
}
void MDCache::finish_rollback(metareqid_t reqid) {
- assert(need_resolve_rollback.count(reqid));
+ auto p = resolve_need_rollback.find(reqid);
+ ceph_assert(p != resolve_need_rollback.end());
if (mds->is_resolve())
- finish_uncommitted_slave_update(reqid, need_resolve_rollback[reqid]);
- need_resolve_rollback.erase(reqid);
- if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
- send_subtree_resolves();
- process_delayed_resolve();
- }
+ finish_uncommitted_slave_update(reqid, p->second);
+ resolve_need_rollback.erase(p);
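+ // maybe_finish_slave_resolve() consolidates the check that used to be
+ // duplicated here and in handle_resolve_ack: once both resolve_ack_gather
+ // and resolve_need_rollback are empty, send subtree resolves and process
+ // any delayed ones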
+ maybe_finish_slave_resolve();
}
void MDCache::disambiguate_other_imports()
dout(10) << "disambiguate_my_imports" << dendl;
if (!mds->is_resolve()) {
- assert(my_ambiguous_imports.empty());
+ ceph_assert(my_ambiguous_imports.empty());
return;
}
map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin();
CDir *dir = get_dirfrag(q->first);
- assert(dir);
+ ceph_assert(dir);
if (dir->authority() != me_ambig) {
dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl;
CDir *root = get_subtree_root(dir);
if (root != dir)
dout(10) << " subtree root is " << *root << dendl;
- assert(root->dir_auth.first != mds->get_nodeid()); // no us!
+ ceph_assert(root->dir_auth.first != mds->get_nodeid()); // no us!
try_trim_non_auth_subtree(root);
} else {
dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl;
mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
}
}
- assert(my_ambiguous_imports.empty());
+ ceph_assert(my_ambiguous_imports.empty());
mds->mdlog->flush();
// verify all my subtrees are unambiguous!
if (dir->is_ambiguous_dir_auth()) {
dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl;
}
- assert(!dir->is_ambiguous_dir_auth());
+ ceph_assert(!dir->is_ambiguous_dir_auth());
}
show_subtrees();
void MDCache::add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds)
{
- assert(my_ambiguous_imports.count(base) == 0);
+ ceph_assert(my_ambiguous_imports.count(base) == 0);
my_ambiguous_imports[base] = bounds;
}
void MDCache::cancel_ambiguous_import(CDir *dir)
{
dirfrag_t df = dir->dirfrag();
- assert(my_ambiguous_imports.count(df));
+ ceph_assert(my_ambiguous_imports.count(df));
dout(10) << "cancel_ambiguous_import " << df
<< " bounds " << my_ambiguous_imports[df]
<< " " << *dir
void MDCache::finish_ambiguous_import(dirfrag_t df)
{
- assert(my_ambiguous_imports.count(df));
+ ceph_assert(my_ambiguous_imports.count(df));
vector<dirfrag_t> bounds;
bounds.swap(my_ambiguous_imports[df]);
my_ambiguous_imports.erase(df);
<< " bounds " << bounds
<< dendl;
CDir *dir = get_dirfrag(df);
- assert(dir);
+ ceph_assert(dir);
// adjust dir_auth, import maps
adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid());
remove_inode(in);
}
-bool MDCache::expire_recursive(
- CInode *in,
- map<mds_rank_t, MCacheExpire*>& expiremap)
+bool MDCache::expire_recursive(CInode *in, expiremap &expiremap)
{
- assert(!in->is_auth());
+ ceph_assert(!in->is_auth());
dout(10) << __func__ << ":" << *in << dendl;
f->close_section();
}
-void MDCache::rejoin_start(MDSInternalContext *rejoin_done_)
+void MDCache::rejoin_start(MDSContext *rejoin_done_)
{
dout(10) << "rejoin_start" << dendl;
- assert(!rejoin_done);
+ ceph_assert(!rejoin_done);
rejoin_done.reset(rejoin_done_);
rejoin_gather = recovery_set;
/*
* rejoin phase!
*
- * this initiates rejoin. it shoudl be called before we get any
+ * this initiates rejoin. it should be called before we get any
* rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
*
* we start out by sending rejoins to everyone in the recovery set.
*
* if we are rejoin, send for all regions in our cache.
- * if we are active|stopping, send only to nodes that are are rejoining.
+ * if we are active|stopping, send only to nodes that are rejoining.
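+ *
+ * rough message flow:
+ *   rejoining mds  --OP_WEAK-->    everyone in the recovery set
+ *   surviving mds  --OP_STRONG-->  rejoining nodes only
+ *   recipient      --OP_ACK-->     sender (immediately if the recipient is
+ *                                  a survivor, otherwise after its own
+ *                                  rejoin gather finishes)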
*/
void MDCache::rejoin_send_rejoins()
{
return;
}
- assert(!migrator->is_importing());
- assert(!migrator->is_exporting());
+ ceph_assert(!migrator->is_importing());
+ ceph_assert(!migrator->is_exporting());
if (!mds->is_rejoin()) {
disambiguate_other_imports();
}
- map<mds_rank_t, MMDSCacheRejoin*> rejoins;
+ map<mds_rank_t, MMDSCacheRejoin::ref> rejoins;
// if i am rejoining, send a rejoin to everyone.
if (*p == mds->get_nodeid()) continue; // nothing to myself!
if (rejoin_sent.count(*p)) continue; // already sent a rejoin to this node!
if (mds->is_rejoin())
- rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK);
+ rejoins[*p] = MMDSCacheRejoin::create(MMDSCacheRejoin::OP_WEAK);
else if (mds->mdsmap->is_rejoin(*p))
- rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_STRONG);
+ rejoins[*p] = MMDSCacheRejoin::create(MMDSCacheRejoin::OP_STRONG);
}
if (mds->is_rejoin()) {
- map<client_t, set<mds_rank_t> > client_exports;
- for (auto p = cap_exports.begin(); p != cap_exports.end(); ++p) {
- mds_rank_t target = p->second.first;
+ map<client_t, pair<Session*, set<mds_rank_t> > > client_exports;
+ for (auto& p : cap_exports) {
+ mds_rank_t target = p.second.first;
if (rejoins.count(target) == 0)
continue;
- rejoins[target]->cap_exports[p->first] = p->second.second;
- for (auto q = p->second.second.begin(); q != p->second.second.end(); ++q)
- client_exports[q->first].insert(target);
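+ // walk the reconnects per client; entries whose client no longer has a
+ // session are erased in place, hence the manual iterator advance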
+ for (auto q = p.second.second.begin(); q != p.second.second.end(); ) {
+ Session *session = nullptr;
+ auto it = client_exports.find(q->first);
+ if (it != client_exports.end()) {
+ session = it->second.first;
+ if (session)
+ it->second.second.insert(target);
+ } else {
+ session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
+ auto& r = client_exports[q->first];
+ r.first = session;
+ if (session)
+ r.second.insert(target);
+ }
+ if (session) {
+ ++q;
+ } else {
+ // remove reconnects with no session
+ p.second.second.erase(q++);
+ }
+ }
+ rejoins[target]->cap_exports[p.first] = p.second.second;
}
- for (map<client_t, set<mds_rank_t> >::iterator p = client_exports.begin();
- p != client_exports.end();
- ++p) {
- entity_inst_t inst = mds->sessionmap.get_inst(entity_name_t::CLIENT(p->first.v));
- for (set<mds_rank_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
- rejoins[*q]->client_map[p->first] = inst;
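+ // send session metadata along with the inst, so the target mds can
+ // fully recreate each client session during rejoin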
+ for (auto& p : client_exports) {
+ Session *session = p.second.first;
+ for (auto& q : p.second.second) {
+ auto rejoin = rejoins[q];
+ rejoin->client_map[p.first] = session->info.inst;
+ rejoin->client_metadata_map[p.first] = session->info.client_metadata;
+ }
}
}
p != subtrees.end();
++p) {
CDir *dir = p->first;
- assert(dir->is_subtree_root());
+ ceph_assert(dir->is_subtree_root());
if (dir->is_ambiguous_dir_auth()) {
// exporter is recovering, importer is survivor.
- assert(rejoins.count(dir->authority().first));
- assert(!rejoins.count(dir->authority().second));
+ ceph_assert(rejoins.count(dir->authority().first));
+ ceph_assert(!rejoins.count(dir->authority().second));
continue;
}
continue; // skip my own regions!
mds_rank_t auth = dir->get_dir_auth().first;
- assert(auth >= 0);
+ ceph_assert(auth >= 0);
if (rejoins.count(auth) == 0)
continue; // don't care about this node's subtrees
}
// rejoin root inodes, too
- for (map<mds_rank_t, MMDSCacheRejoin*>::iterator p = rejoins.begin();
- p != rejoins.end();
- ++p) {
+ for (auto &p : rejoins) {
if (mds->is_rejoin()) {
// weak
- if (p->first == 0 && root) {
- p->second->add_weak_inode(root->vino());
+ if (p.first == 0 && root) {
+ p.second->add_weak_inode(root->vino());
if (root->is_dirty_scattered()) {
dout(10) << " sending scatterlock state on root " << *root << dendl;
- p->second->add_scatterlock_state(root);
+ p.second->add_scatterlock_state(root);
}
}
- if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
+ if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) {
if (in)
- p->second->add_weak_inode(in->vino());
+ p.second->add_weak_inode(in->vino());
}
} else {
// strong
- if (p->first == 0 && root) {
- p->second->add_strong_inode(root->vino(),
+ if (p.first == 0 && root) {
+ p.second->add_strong_inode(root->vino(),
root->get_replica_nonce(),
root->get_caps_wanted(),
root->filelock.get_state(),
root->state_set(CInode::STATE_REJOINING);
if (root->is_dirty_scattered()) {
dout(10) << " sending scatterlock state on root " << *root << dendl;
- p->second->add_scatterlock_state(root);
+ p.second->add_scatterlock_state(root);
}
}
- if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
- p->second->add_strong_inode(in->vino(),
+ if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) {
+ p.second->add_strong_inode(in->vino(),
in->get_replica_nonce(),
in->get_caps_wanted(),
in->filelock.get_state(),
if (mdr->is_slave())
continue;
// auth pins
- for (map<MDSCacheObject*,mds_rank_t>::iterator q = mdr->remote_auth_pins.begin();
- q != mdr->remote_auth_pins.end();
- ++q) {
- if (!q->first->is_auth()) {
- assert(q->second == q->first->authority().first);
- if (rejoins.count(q->second) == 0) continue;
- MMDSCacheRejoin *rejoin = rejoins[q->second];
+ for (const auto& q : mdr->remote_auth_pins) {
+ if (!q.first->is_auth()) {
+ ceph_assert(q.second == q.first->authority().first);
+ if (rejoins.count(q.second) == 0) continue;
+ const MMDSCacheRejoin::ref &rejoin = rejoins[q.second];
- dout(15) << " " << *mdr << " authpin on " << *q->first << dendl;
+ dout(15) << " " << *mdr << " authpin on " << *q.first << dendl;
MDSCacheObjectInfo i;
- q->first->set_object_info(i);
+ q.first->set_object_info(i);
if (i.ino)
rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt);
else
rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt);
if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin &&
- mdr->more()->rename_inode == q->first)
+ mdr->more()->rename_inode == q.first)
rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
mdr->reqid, mdr->attempt);
}
}
// xlocks
- for (set<SimpleLock*>::iterator q = mdr->xlocks.begin();
- q != mdr->xlocks.end();
- ++q) {
- if (!(*q)->get_parent()->is_auth()) {
- mds_rank_t who = (*q)->get_parent()->authority().first;
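+ // xlocks and remote wrlocks both live in the unified mdr->locks
+ // container now (each entry tagged with its lock op), so one pass
+ // replaces the two separate loops we had before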
+ for (const auto& q : mdr->locks) {
+ auto lock = q.lock;
+ auto obj = lock->get_parent();
+ if (q.is_xlock() && !obj->is_auth()) {
+ mds_rank_t who = obj->authority().first;
if (rejoins.count(who) == 0) continue;
- MMDSCacheRejoin *rejoin = rejoins[who];
+ const MMDSCacheRejoin::ref &rejoin = rejoins[who];
- dout(15) << " " << *mdr << " xlock on " << **q << " " << *(*q)->get_parent() << dendl;
+ dout(15) << " " << *mdr << " xlock on " << *lock << " " << *obj << dendl;
MDSCacheObjectInfo i;
- (*q)->get_parent()->set_object_info(i);
+ obj->set_object_info(i);
if (i.ino)
- rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), (*q)->get_type(),
+ rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
mdr->reqid, mdr->attempt);
else
rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid,
mdr->reqid, mdr->attempt);
- }
- }
- // remote wrlocks
- for (map<SimpleLock*, mds_rank_t>::iterator q = mdr->remote_wrlocks.begin();
- q != mdr->remote_wrlocks.end();
- ++q) {
- mds_rank_t who = q->second;
- if (rejoins.count(who) == 0) continue;
- MMDSCacheRejoin *rejoin = rejoins[who];
+ } else if (q.is_remote_wrlock()) {
+ mds_rank_t who = q.wrlock_target;
+ if (rejoins.count(who) == 0) continue;
+ const MMDSCacheRejoin::ref &rejoin = rejoins[who];
- dout(15) << " " << *mdr << " wrlock on " << q->second
- << " " << q->first->get_parent() << dendl;
- MDSCacheObjectInfo i;
- q->first->get_parent()->set_object_info(i);
- assert(i.ino);
- rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), q->first->get_type(),
- mdr->reqid, mdr->attempt);
+ dout(15) << " " << *mdr << " wrlock on " << *lock << " " << *obj << dendl;
+ MDSCacheObjectInfo i;
+ obj->set_object_info(i);
+ ceph_assert(i.ino);
+ rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
+ mdr->reqid, mdr->attempt);
+ }
}
}
}
// send the messages
- for (map<mds_rank_t,MMDSCacheRejoin*>::iterator p = rejoins.begin();
- p != rejoins.end();
- ++p) {
- assert(rejoin_sent.count(p->first) == 0);
- assert(rejoin_ack_gather.count(p->first) == 0);
- rejoin_sent.insert(p->first);
- rejoin_ack_gather.insert(p->first);
- mds->send_message_mds(p->second, p->first);
+ for (auto &p : rejoins) {
+ ceph_assert(rejoin_sent.count(p.first) == 0);
+ ceph_assert(rejoin_ack_gather.count(p.first) == 0);
+ rejoin_sent.insert(p.first);
+ rejoin_ack_gather.insert(p.first);
+ mds->send_message_mds(p.second, p.first);
}
rejoin_ack_gather.insert(mds->get_nodeid()); // we need to complete rejoin_gather_finish, too
rejoins_pending = false;
* strong dentries (no connectivity!)
* strong inodes
*/
-void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
+void MDCache::rejoin_walk(CDir *dir, const MMDSCacheRejoin::ref &rejoin)
{
dout(10) << "rejoin_walk " << *dir << dendl;
rejoin->add_weak_dirfrag(dir->dirfrag());
for (auto &p : dir->items) {
CDentry *dn = p.second;
- assert(dn->last == CEPH_NOSNAP);
+ ceph_assert(dn->last == CEPH_NOSNAP);
CDentry::linkage_t *dnl = dn->get_linkage();
dout(15) << " add_weak_primary_dentry " << *dn << dendl;
- assert(dnl->is_primary());
+ ceph_assert(dnl->is_primary());
CInode *in = dnl->get_inode();
- assert(dnl->get_inode()->is_dir());
+ ceph_assert(dnl->get_inode()->is_dir());
rejoin->add_weak_primary_dentry(dir->ino(), dn->get_name(), dn->first, dn->last, in->ino());
in->get_nested_dirfrags(nested);
if (in->is_dirty_scattered()) {
rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
dir->state_set(CDir::STATE_REJOINING);
- for (auto it = dir->items.begin(); it != dir->items.end(); ++it) {
+ for (auto it = dir->items.begin(); it != dir->items.end(); ) {
CDentry *dn = it->second;
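+ // advance the iterator first; dn may be removed from dir->items below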
+ ++it;
+ dn->state_set(CDentry::STATE_REJOINING);
CDentry::linkage_t *dnl = dn->get_linkage();
+ CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
+
+ // trim snap dentries, since they may have been pruned by
+ // their auth mds (snap deleted)
+ if (dn->last != CEPH_NOSNAP) {
+ if (in && !in->remote_parents.empty()) {
+ // unlink any stale remote snap dentry.
+ for (auto it2 = in->remote_parents.begin(); it2 != in->remote_parents.end(); ) {
+ CDentry *remote_dn = *it2;
+ ++it2;
+ ceph_assert(remote_dn->last != CEPH_NOSNAP);
+ remote_dn->unlink_remote(remote_dn->get_linkage());
+ }
+ }
+ if (dn->lru_is_expireable()) {
+ if (!dnl->is_null())
+ dir->unlink_inode(dn, false);
+ if (in)
+ remove_inode(in);
+ dir->remove_dentry(dn);
+ continue;
+ } else {
+ // Inventing a null/remote dentry shouldn't cause problems
+ ceph_assert(!dnl->is_primary());
+ }
+ }
+
dout(15) << " add_strong_dentry " << *dn << dendl;
rejoin->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
*
* if i am active|stopping,
* - remove source from replica list for everything not referenced here.
- * This function puts the passed message before returning.
*/
-void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m)
+void MDCache::handle_cache_rejoin(const MMDSCacheRejoin::const_ref &m)
{
dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source()
<< " (" << m->get_payload().length() << " bytes)"
default:
ceph_abort();
}
- m->put();
}
* - may have incorrect (out of date) dentry/inode linkage
* - may have deleted/purged inodes
* and i may have to go to disk to get accurate inode contents. yuck.
- * This functions DOES NOT put the passed message before returning
*/
-void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
+void MDCache::handle_cache_rejoin_weak(const MMDSCacheRejoin::const_ref &weak)
{
mds_rank_t from = mds_rank_t(weak->get_source().num());
// possible response(s)
- MMDSCacheRejoin *ack = 0; // if survivor
+ MMDSCacheRejoin::ref ack; // if survivor
set<vinodeno_t> acked_inodes; // if survivor
set<SimpleLock *> gather_locks; // if survivor
bool survivor = false; // am i a survivor?
if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
survivor = true;
dout(10) << "i am a surivivor, and will ack immediately" << dendl;
- ack = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK);
+ ack = MMDSCacheRejoin::create(MMDSCacheRejoin::OP_ACK);
map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
// check cap exports
for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
CInode *in = get_inode(p->first);
- assert(!in || in->is_auth());
+ ceph_assert(!in || in->is_auth());
for (auto q = p->second.begin(); q != p->second.end(); ++q) {
dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
Capability *cap = rejoin_import_cap(in, q->first, q->second, from);
mds->locker->eval(in, CEPH_CAP_LOCKS, true);
}
- ::encode(imported_caps, ack->imported_caps);
+ encode(imported_caps, ack->imported_caps);
} else {
- assert(mds->is_rejoin());
+ ceph_assert(mds->is_rejoin());
// we may have already received a strong rejoin from the sender.
rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks);
- assert(gather_locks.empty());
+ ceph_assert(gather_locks.empty());
// check cap exports.
rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end());
+ rejoin_client_metadata_map.insert(weak->client_metadata_map.begin(),
+ weak->client_metadata_map.end());
for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
CInode *in = get_inode(p->first);
- assert(!in || in->is_auth());
+ ceph_assert(!in || in->is_auth());
// note
for (auto q = p->second.begin(); q != p->second.end(); ++q) {
dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl;
}
// assimilate any potentially dirty scatterlock state
- for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
- p != weak->inode_scatterlocks.end();
- ++p) {
- CInode *in = get_inode(p->first);
- assert(in);
- in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
- in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
- in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
+ for (const auto &p : weak->inode_scatterlocks) {
+ CInode *in = get_inode(p.first);
+ ceph_assert(in);
+ in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
+ in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
+ in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
if (!survivor)
rejoin_potential_updated_scatterlocks.insert(in);
}
// walk weak map
set<CDir*> dirs_to_share;
- for (set<dirfrag_t>::iterator p = weak->weak_dirfrags.begin();
- p != weak->weak_dirfrags.end();
- ++p) {
- CInode *diri = get_inode(p->ino);
+ for (const auto &p : weak->weak_dirfrags) {
+ CInode *diri = get_inode(p.ino);
if (!diri)
- dout(0) << " missing dir ino " << p->ino << dendl;
- assert(diri);
+ dout(0) << " missing dir ino " << p.ino << dendl;
+ ceph_assert(diri);
- list<frag_t> ls;
- if (diri->dirfragtree.is_leaf(p->frag)) {
- ls.push_back(p->frag);
+ frag_vec_t leaves;
+ if (diri->dirfragtree.is_leaf(p.frag)) {
+ leaves.push_back(p.frag);
} else {
- diri->dirfragtree.get_leaves_under(p->frag, ls);
- if (ls.empty())
- ls.push_back(diri->dirfragtree[p->frag.value()]);
+ diri->dirfragtree.get_leaves_under(p.frag, leaves);
+ if (leaves.empty())
+ leaves.push_back(diri->dirfragtree[p.frag.value()]);
}
- for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
- frag_t fg = *q;
- CDir *dir = diri->get_dirfrag(fg);
+ for (const auto& leaf : leaves) {
+ CDir *dir = diri->get_dirfrag(leaf);
if (!dir) {
- dout(0) << " missing dir for " << p->frag << " (which maps to " << fg << ") on " << *diri << dendl;
+ dout(0) << " missing dir for " << p.frag << " (which maps to " << leaf << ") on " << *diri << dendl;
continue;
}
- assert(dir);
+ ceph_assert(dir);
if (dirs_to_share.count(dir)) {
- dout(10) << " already have " << p->frag << " -> " << fg << " " << *dir << dendl;
+ dout(10) << " already have " << p.frag << " -> " << leaf << " " << *dir << dendl;
} else {
dirs_to_share.insert(dir);
unsigned nonce = dir->add_replica(from);
- dout(10) << " have " << p->frag << " -> " << fg << " " << *dir << dendl;
+ dout(10) << " have " << p.frag << " -> " << leaf << " " << *dir << dendl;
if (ack) {
ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep);
ack->add_dirfrag_base(dir);
}
}
- for (map<inodeno_t,map<string_snap_t,MMDSCacheRejoin::dn_weak> >::iterator p = weak->weak.begin();
- p != weak->weak.end();
- ++p) {
- CInode *diri = get_inode(p->first);
+ for (const auto &p : weak->weak) {
+ CInode *diri = get_inode(p.first);
if (!diri)
- dout(0) << " missing dir ino " << p->first << dendl;
- assert(diri);
+ dout(0) << " missing dir ino " << p.first << dendl;
+ ceph_assert(diri);
// weak dentries
CDir *dir = 0;
- for (map<string_snap_t,MMDSCacheRejoin::dn_weak>::iterator q = p->second.begin();
- q != p->second.end();
- ++q) {
+ for (const auto &q : p.second) {
// locate proper dirfrag.
// optimize for common case (one dirfrag) to avoid dirs_to_share set check
- frag_t fg = diri->pick_dirfrag(q->first.name);
+ frag_t fg = diri->pick_dirfrag(q.first.name);
if (!dir || dir->get_frag() != fg) {
dir = diri->get_dirfrag(fg);
if (!dir)
dout(0) << " missing dir frag " << fg << " on " << *diri << dendl;
- assert(dir);
- assert(dirs_to_share.count(dir));
+ ceph_assert(dir);
+ ceph_assert(dirs_to_share.count(dir));
}
// and dentry
- CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
- assert(dn);
+ CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
+ ceph_assert(dn);
CDentry::linkage_t *dnl = dn->get_linkage();
- assert(dnl->is_primary());
+ ceph_assert(dnl->is_primary());
if (survivor && dn->is_replica(from))
dentry_remove_replica(dn, from, gather_locks);
// inode
CInode *in = dnl->get_inode();
- assert(in);
+ ceph_assert(in);
if (survivor && in->is_replica(from))
inode_remove_replica(in, from, true, gather_locks);
p != weak->weak_inodes.end();
++p) {
CInode *in = get_inode(*p);
- assert(in); // hmm fixme wrt stray?
+ ceph_assert(in); // hmm fixme wrt stray?
if (survivor && in->is_replica(from))
inode_remove_replica(in, from, true, gather_locks);
unsigned inonce = in->add_replica(from);
}
}
- assert(rejoin_gather.count(from));
+ ceph_assert(rejoin_gather.count(from));
rejoin_gather.erase(from);
if (survivor) {
// survivor. do everything now.
- for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
- p != weak->inode_scatterlocks.end();
- ++p) {
- CInode *in = get_inode(p->first);
- assert(in);
+ for (const auto &p : weak->inode_scatterlocks) {
+ CInode *in = get_inode(p.first);
+ ceph_assert(in);
dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl;
acked_inodes.insert(in->vino());
ack->add_inode_base(in, mds->mdsmap->get_up_features());
* all validated replicas are acked with a strong nonce, etc. if that isn't in the
* ack, the replica dne, and we can remove it from our replica maps.
*/
-void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, MMDSCacheRejoin *ack,
+void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, const MMDSCacheRejoin::const_ref &ack,
set<vinodeno_t>& acked_inodes,
set<SimpleLock *>& gather_locks)
{
for (auto &p : dir->items) {
CDentry *dn = p.second;
- if (dn->is_replica(from) &&
- (ack == NULL ||
- ack->strong_dentries.count(dir->dirfrag()) == 0 ||
- ack->strong_dentries[dir->dirfrag()].count(string_snap_t(dn->get_name(), dn->last)) == 0)) {
+ if (dn->is_replica(from)) {
+ if (ack) {
+ const auto it = ack->strong_dentries.find(dir->dirfrag());
+ if (it != ack->strong_dentries.end() && it->second.count(string_snap_t(dn->get_name(), dn->last)) > 0) {
+ continue;
+ }
+ }
dentry_remove_replica(dn, from, gather_locks);
dout(10) << " rem " << *dn << dendl;
}
if (!in)
in = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
if (!in->is_dir()) {
- assert(in->state_test(CInode::STATE_REJOINUNDEF));
+ ceph_assert(in->state_test(CInode::STATE_REJOINUNDEF));
in->inode.mode = S_IFDIR;
- in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
+ in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
}
CDir *dir = in->get_or_open_dirfrag(this, df.frag);
dir->state_set(CDir::STATE_REJOINUNDEF);
return dir;
}
-/* This functions DOES NOT put the passed message before returning */
-void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
+void MDCache::handle_cache_rejoin_strong(const MMDSCacheRejoin::const_ref &strong)
{
mds_rank_t from = mds_rank_t(strong->get_source().num());
mds->wait_for_rejoin(new C_MDS_RetryMessage(mds, strong));
return;
}
- assert(!"got unexpected rejoin message during recovery");
+ ceph_abort_msg("got unexpected rejoin message during recovery");
}
// assimilate any potentially dirty scatterlock state
- for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = strong->inode_scatterlocks.begin();
- p != strong->inode_scatterlocks.end();
- ++p) {
- CInode *in = get_inode(p->first);
- assert(in);
- in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
- in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
- in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
+ for (const auto &p : strong->inode_scatterlocks) {
+ CInode *in = get_inode(p.first);
+ ceph_assert(in);
+ in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
+ in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
+ in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
rejoin_potential_updated_scatterlocks.insert(in);
}
// strong dirfrags/dentries.
// also process auth_pins, xlocks.
- for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = strong->strong_dirfrags.begin();
- p != strong->strong_dirfrags.end();
- ++p) {
- CInode *diri = get_inode(p->first.ino);
+ for (const auto &p : strong->strong_dirfrags) {
+ auto& dirfrag = p.first;
+ CInode *diri = get_inode(dirfrag.ino);
if (!diri)
- diri = rejoin_invent_inode(p->first.ino, CEPH_NOSNAP);
- CDir *dir = diri->get_dirfrag(p->first.frag);
+ diri = rejoin_invent_inode(dirfrag.ino, CEPH_NOSNAP);
+ CDir *dir = diri->get_dirfrag(dirfrag.frag);
bool refragged = false;
if (dir) {
dout(10) << " have " << *dir << dendl;
} else {
if (diri->state_test(CInode::STATE_REJOINUNDEF))
dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
- else if (diri->dirfragtree.is_leaf(p->first.frag))
- dir = rejoin_invent_dirfrag(p->first);
+ else if (diri->dirfragtree.is_leaf(dirfrag.frag))
+ dir = rejoin_invent_dirfrag(dirfrag);
}
if (dir) {
- dir->add_replica(from, p->second.nonce);
- dir->dir_rep = p->second.dir_rep;
+ dir->add_replica(from, p.second.nonce);
+ dir->dir_rep = p.second.dir_rep;
} else {
- dout(10) << " frag " << p->first << " doesn't match dirfragtree " << *diri << dendl;
- list<frag_t> ls;
- diri->dirfragtree.get_leaves_under(p->first.frag, ls);
- if (ls.empty())
- ls.push_back(diri->dirfragtree[p->first.frag.value()]);
- dout(10) << " maps to frag(s) " << ls << dendl;
- for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
- CDir *dir = diri->get_dirfrag(*q);
+ dout(10) << " frag " << dirfrag << " doesn't match dirfragtree " << *diri << dendl;
+ frag_vec_t leaves;
+ diri->dirfragtree.get_leaves_under(dirfrag.frag, leaves);
+ if (leaves.empty())
+ leaves.push_back(diri->dirfragtree[dirfrag.frag.value()]);
+ dout(10) << " maps to frag(s) " << leaves << dendl;
+ for (const auto& leaf : leaves) {
+ CDir *dir = diri->get_dirfrag(leaf);
if (!dir)
- dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), *q));
+ dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), leaf));
else
dout(10) << " have(approx) " << *dir << dendl;
- dir->add_replica(from, p->second.nonce);
- dir->dir_rep = p->second.dir_rep;
+ dir->add_replica(from, p.second.nonce);
+ dir->dir_rep = p.second.dir_rep;
}
refragged = true;
}
- map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = strong->strong_dentries[p->first];
- for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
- q != dmap.end();
- ++q) {
- CDentry *dn;
- if (!refragged)
- dn = dir->lookup(q->first.name, q->first.snapid);
- else {
- frag_t fg = diri->pick_dirfrag(q->first.name);
- dir = diri->get_dirfrag(fg);
- assert(dir);
- dn = dir->lookup(q->first.name, q->first.snapid);
- }
- if (!dn) {
- if (q->second.is_remote()) {
- dn = dir->add_remote_dentry(q->first.name, q->second.remote_ino, q->second.remote_d_type,
- q->second.first, q->first.snapid);
- } else if (q->second.is_null()) {
- dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
- } else {
- CInode *in = get_inode(q->second.ino, q->first.snapid);
- if (!in) in = rejoin_invent_inode(q->second.ino, q->first.snapid);
- dn = dir->add_primary_dentry(q->first.name, in, q->second.first, q->first.snapid);
- }
- dout(10) << " invented " << *dn << dendl;
- }
- CDentry::linkage_t *dnl = dn->get_linkage();
-
- // dn auth_pin?
- if (strong->authpinned_dentries.count(p->first) &&
- strong->authpinned_dentries[p->first].count(q->first)) {
- for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_dentries[p->first][q->first].begin();
- r != strong->authpinned_dentries[p->first][q->first].end();
- ++r) {
- dout(10) << " dn authpin by " << *r << " on " << *dn << dendl;
-
- // get/create slave mdrequest
- MDRequestRef mdr;
- if (have_request(r->reqid))
- mdr = request_get(r->reqid);
- else
- mdr = request_start_slave(r->reqid, r->attempt, strong);
- mdr->auth_pin(dn);
- }
- }
-
- // dn xlock?
- if (strong->xlocked_dentries.count(p->first) &&
- strong->xlocked_dentries[p->first].count(q->first)) {
- MMDSCacheRejoin::slave_reqid r = strong->xlocked_dentries[p->first][q->first];
- dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
- MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
- assert(mdr->is_auth_pinned(dn));
- if (!mdr->xlocks.count(&dn->versionlock)) {
- assert(dn->versionlock.can_xlock_local());
- dn->versionlock.get_xlock(mdr, mdr->get_client());
- mdr->xlocks.insert(&dn->versionlock);
- mdr->locks.insert(&dn->versionlock);
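+ // look up dentry state with find() rather than operator[]: the message
+ // is const now and must not grow empty entries as a side effect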
+ const auto it = strong->strong_dentries.find(dirfrag);
+ if (it != strong->strong_dentries.end()) {
+ const map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = it->second;
+ for (const auto &q : dmap) {
+ const string_snap_t& ss = q.first;
+ const MMDSCacheRejoin::dn_strong& d = q.second;
+ CDentry *dn;
+ if (!refragged)
+ dn = dir->lookup(ss.name, ss.snapid);
+ else {
+ frag_t fg = diri->pick_dirfrag(ss.name);
+ dir = diri->get_dirfrag(fg);
+ ceph_assert(dir);
+ dn = dir->lookup(ss.name, ss.snapid);
+ }
+ if (!dn) {
+ if (d.is_remote()) {
+ dn = dir->add_remote_dentry(ss.name, d.remote_ino, d.remote_d_type, d.first, ss.snapid);
+ } else if (d.is_null()) {
+ dn = dir->add_null_dentry(ss.name, d.first, ss.snapid);
+ } else {
+ CInode *in = get_inode(d.ino, ss.snapid);
+ if (!in) in = rejoin_invent_inode(d.ino, ss.snapid);
+ dn = dir->add_primary_dentry(ss.name, in, d.first, ss.snapid);
+ }
+ dout(10) << " invented " << *dn << dendl;
+ }
+ CDentry::linkage_t *dnl = dn->get_linkage();
+
+ // dn auth_pin?
+ const auto pinned_it = strong->authpinned_dentries.find(dirfrag);
+ if (pinned_it != strong->authpinned_dentries.end()) {
+ const auto slave_reqid_it = pinned_it->second.find(ss);
+ if (slave_reqid_it != pinned_it->second.end()) {
+ for (const auto &r : slave_reqid_it->second) {
+ dout(10) << " dn authpin by " << r << " on " << *dn << dendl;
+
+ // get/create slave mdrequest
+ MDRequestRef mdr;
+ if (have_request(r.reqid))
+ mdr = request_get(r.reqid);
+ else
+ mdr = request_start_slave(r.reqid, r.attempt, strong);
+ mdr->auth_pin(dn);
+ }
+ }
}
- if (dn->lock.is_stable())
- dn->auth_pin(&dn->lock);
- dn->lock.set_state(LOCK_XLOCK);
- dn->lock.get_xlock(mdr, mdr->get_client());
- mdr->xlocks.insert(&dn->lock);
- mdr->locks.insert(&dn->lock);
- }
- dn->add_replica(from, q->second.nonce);
- dout(10) << " have " << *dn << dendl;
+ // dn xlock?
+ const auto xlocked_it = strong->xlocked_dentries.find(dirfrag);
+ if (xlocked_it != strong->xlocked_dentries.end()) {
+ const auto ss_req_it = xlocked_it->second.find(ss);
+ if (ss_req_it != xlocked_it->second.end()) {
+ const MMDSCacheRejoin::slave_reqid& r = ss_req_it->second;
+ dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
+ MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
+ ceph_assert(mdr->is_auth_pinned(dn));
+ if (!mdr->is_xlocked(&dn->versionlock)) {
+ ceph_assert(dn->versionlock.can_xlock_local());
+ dn->versionlock.get_xlock(mdr, mdr->get_client());
+ mdr->locks.emplace(&dn->versionlock, MutationImpl::LockOp::XLOCK);
+ }
+ if (dn->lock.is_stable())
+ dn->auth_pin(&dn->lock);
+ dn->lock.set_state(LOCK_XLOCK);
+ dn->lock.get_xlock(mdr, mdr->get_client());
+ mdr->locks.emplace(&dn->lock, MutationImpl::LockOp::XLOCK);
+ }
+ }
- if (dnl->is_primary()) {
- if (q->second.is_primary()) {
- if (vinodeno_t(q->second.ino, q->first.snapid) != dnl->get_inode()->vino()) {
- // the survivor missed MDentryUnlink+MDentryLink messages ?
- assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
- CInode *in = get_inode(q->second.ino, q->first.snapid);
- assert(in);
- assert(in->get_parent_dn());
+ dn->add_replica(from, d.nonce);
+ dout(10) << " have " << *dn << dendl;
+
+ if (dnl->is_primary()) {
+ if (d.is_primary()) {
+ if (vinodeno_t(d.ino, ss.snapid) != dnl->get_inode()->vino()) {
+ // the survivor missed MDentryUnlink+MDentryLink messages ?
+ ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
+ CInode *in = get_inode(d.ino, ss.snapid);
+ ceph_assert(in);
+ ceph_assert(in->get_parent_dn());
+ rejoin_unlinked_inodes[from].insert(in);
+ dout(7) << " sender has primary dentry but wrong inode" << dendl;
+ }
+ } else {
+ // the survivor missed MDentryLink message ?
+ ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
+ dout(7) << " sender doesn't have primay dentry" << dendl;
+ }
+ } else {
+ if (d.is_primary()) {
+ // the survivor missed MDentryUnlink message ?
+ CInode *in = get_inode(d.ino, ss.snapid);
+ ceph_assert(in);
+ ceph_assert(in->get_parent_dn());
rejoin_unlinked_inodes[from].insert(in);
- dout(7) << " sender has primary dentry but wrong inode" << dendl;
+ dout(7) << " sender has primary dentry but we don't" << dendl;
}
- } else {
- // the survivor missed MDentryLink message ?
- assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
- dout(7) << " sender doesn't have primay dentry" << dendl;
- }
- } else {
- if (q->second.is_primary()) {
- // the survivor missed MDentryUnlink message ?
- CInode *in = get_inode(q->second.ino, q->first.snapid);
- assert(in);
- assert(in->get_parent_dn());
- rejoin_unlinked_inodes[from].insert(in);
- dout(7) << " sender has primary dentry but we don't" << dendl;
- }
+ }
}
}
}
- for (map<vinodeno_t, MMDSCacheRejoin::inode_strong>::iterator p = strong->strong_inodes.begin();
- p != strong->strong_inodes.end();
- ++p) {
- CInode *in = get_inode(p->first);
- assert(in);
- in->add_replica(from, p->second.nonce);
+ for (const auto &p : strong->strong_inodes) {
+ CInode *in = get_inode(p.first);
+ ceph_assert(in);
+ in->add_replica(from, p.second.nonce);
dout(10) << " have " << *in << dendl;
- MMDSCacheRejoin::inode_strong &is = p->second;
+ const MMDSCacheRejoin::inode_strong& is = p.second;
// caps_wanted
if (is.caps_wanted) {
- in->mds_caps_wanted[from] = is.caps_wanted;
+ in->set_mds_caps_wanted(from, is.caps_wanted);
dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
<< " on " << *in << dendl;
}
in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
// auth pin?
- if (strong->authpinned_inodes.count(in->vino())) {
- for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_inodes[in->vino()].begin();
- r != strong->authpinned_inodes[in->vino()].end();
- ++r) {
- dout(10) << " inode authpin by " << *r << " on " << *in << dendl;
+ const auto authpinned_inodes_it = strong->authpinned_inodes.find(in->vino());
+ if (authpinned_inodes_it != strong->authpinned_inodes.end()) {
+ for (const auto& r : authpinned_inodes_it->second) {
+ dout(10) << " inode authpin by " << r << " on " << *in << dendl;
// get/create slave mdrequest
MDRequestRef mdr;
- if (have_request(r->reqid))
- mdr = request_get(r->reqid);
+ if (have_request(r.reqid))
+ mdr = request_get(r.reqid);
else
- mdr = request_start_slave(r->reqid, r->attempt, strong);
+ mdr = request_start_slave(r.reqid, r.attempt, strong);
if (strong->frozen_authpin_inodes.count(in->vino())) {
- assert(!in->get_num_auth_pins());
+ ceph_assert(!in->get_num_auth_pins());
mdr->freeze_auth_pin(in);
} else {
- assert(!in->is_frozen_auth_pin());
+ ceph_assert(!in->is_frozen_auth_pin());
}
mdr->auth_pin(in);
}
}
// xlock(s)?
- if (strong->xlocked_inodes.count(in->vino())) {
- for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->xlocked_inodes[in->vino()].begin();
- q != strong->xlocked_inodes[in->vino()].end();
- ++q) {
- SimpleLock *lock = in->get_lock(q->first);
- dout(10) << " inode xlock by " << q->second << " on " << *lock << " on " << *in << dendl;
- MDRequestRef mdr = request_get(q->second.reqid); // should have this from auth_pin above.
- assert(mdr->is_auth_pinned(in));
- if (!mdr->xlocks.count(&in->versionlock)) {
- assert(in->versionlock.can_xlock_local());
+ const auto xlocked_inodes_it = strong->xlocked_inodes.find(in->vino());
+ if (xlocked_inodes_it != strong->xlocked_inodes.end()) {
+ for (const auto &q : xlocked_inodes_it->second) {
+ SimpleLock *lock = in->get_lock(q.first);
+ dout(10) << " inode xlock by " << q.second << " on " << *lock << " on " << *in << dendl;
+ MDRequestRef mdr = request_get(q.second.reqid); // should have this from auth_pin above.
+ ceph_assert(mdr->is_auth_pinned(in));
+ if (!mdr->is_xlocked(&in->versionlock)) {
+ ceph_assert(in->versionlock.can_xlock_local());
in->versionlock.get_xlock(mdr, mdr->get_client());
- mdr->xlocks.insert(&in->versionlock);
- mdr->locks.insert(&in->versionlock);
+ mdr->locks.emplace(&in->versionlock, MutationImpl::LockOp::XLOCK);
}
if (lock->is_stable())
in->auth_pin(lock);
if (lock == &in->filelock)
in->loner_cap = -1;
lock->get_xlock(mdr, mdr->get_client());
- mdr->xlocks.insert(lock);
- mdr->locks.insert(lock);
+ mdr->locks.emplace(lock, MutationImpl::LockOp::XLOCK);
}
}
}
// wrlock(s)?
- for (map<vinodeno_t, map<int, list<MMDSCacheRejoin::slave_reqid> > >::iterator p = strong->wrlocked_inodes.begin();
- p != strong->wrlocked_inodes.end();
- ++p) {
- CInode *in = get_inode(p->first);
- for (map<int, list<MMDSCacheRejoin::slave_reqid> >::iterator q = p->second.begin();
- q != p->second.end();
- ++q) {
- SimpleLock *lock = in->get_lock(q->first);
- for (list<MMDSCacheRejoin::slave_reqid>::iterator r = q->second.begin();
- r != q->second.end();
- ++r) {
- dout(10) << " inode wrlock by " << *r << " on " << *lock << " on " << *in << dendl;
- MDRequestRef mdr = request_get(r->reqid); // should have this from auth_pin above.
+ for (const auto &p : strong->wrlocked_inodes) {
+ CInode *in = get_inode(p.first);
+ for (const auto &q : p.second) {
+ SimpleLock *lock = in->get_lock(q.first);
+ for (const auto &r : q.second) {
+ dout(10) << " inode wrlock by " << r << " on " << *lock << " on " << *in << dendl;
+ MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
if (in->is_auth())
- assert(mdr->is_auth_pinned(in));
+ ceph_assert(mdr->is_auth_pinned(in));
lock->set_state(LOCK_MIX);
if (lock == &in->filelock)
in->loner_cap = -1;
lock->get_wrlock(true);
- mdr->wrlocks.insert(lock);
- mdr->locks.insert(lock);
+ mdr->locks.emplace(lock, MutationImpl::LockOp::WRLOCK);
}
}
}
// done?
- assert(rejoin_gather.count(from));
+ ceph_assert(rejoin_gather.count(from));
rejoin_gather.erase(from);
if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
rejoin_gather_finish();
}
}
-/* This functions DOES NOT put the passed message before returning */
-void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
+void MDCache::handle_cache_rejoin_ack(const MMDSCacheRejoin::const_ref &ack)
{
dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
mds_rank_t from = mds_rank_t(ack->get_source().num());
- assert(mds->get_state() >= MDSMap::STATE_REJOIN);
+ ceph_assert(mds->get_state() >= MDSMap::STATE_REJOIN);
bool survivor = !mds->is_rejoin();
// for sending cache expire message
set<CInode*> isolated_inodes;
set<CInode*> refragged_inodes;
+ list<pair<CInode*,int> > updated_realms;
// dirs
- for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = ack->strong_dirfrags.begin();
- p != ack->strong_dirfrags.end();
- ++p) {
+ for (const auto &p : ack->strong_dirfrags) {
// we may have had incorrect dir fragmentation; refragment based
- // on what they auth tells us.
+ // on what the auth tells us.
- CDir *dir = get_dirfrag(p->first);
+ CDir *dir = get_dirfrag(p.first);
if (!dir) {
- dir = get_force_dirfrag(p->first, false);
+ dir = get_force_dirfrag(p.first, false);
if (dir)
refragged_inodes.insert(dir->get_inode());
}
if (!dir) {
- CInode *diri = get_inode(p->first.ino);
+ CInode *diri = get_inode(p.first.ino);
if (!diri) {
// barebones inode; the full inode loop below will clean up.
diri = new CInode(this, false);
- diri->inode.ino = p->first.ino;
+ diri->inode.ino = p.first.ino;
diri->inode.mode = S_IFDIR;
- diri->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
+ diri->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
add_inode(diri);
- if (MDS_INO_MDSDIR(from) == p->first.ino) {
+ if (MDS_INO_MDSDIR(from) == p.first.ino) {
diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN);
dout(10) << " add inode " << *diri << dendl;
} else {
diri->inode_auth = CDIR_AUTH_DEFAULT;
isolated_inodes.insert(diri);
- dout(10) << " unconnected dirfrag " << p->first << dendl;
+ dout(10) << " unconnected dirfrag " << p.first << dendl;
}
}
// barebones dirfrag; the full dirfrag loop below will clean up.
- dir = diri->add_dirfrag(new CDir(diri, p->first.frag, this, false));
- if (MDS_INO_MDSDIR(from) == p->first.ino ||
+ dir = diri->add_dirfrag(new CDir(diri, p.first.frag, this, false));
+ if (MDS_INO_MDSDIR(from) == p.first.ino ||
(dir->authority() != CDIR_AUTH_UNDEF &&
dir->authority().first != from))
adjust_subtree_auth(dir, from);
dout(10) << " add dirfrag " << *dir << dendl;
}
- dir->set_replica_nonce(p->second.nonce);
+ dir->set_replica_nonce(p.second.nonce);
dir->state_clear(CDir::STATE_REJOINING);
dout(10) << " got " << *dir << dendl;
// dentries
- map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = ack->strong_dentries[p->first];
- for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
- q != dmap.end();
- ++q) {
- CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
- if(!dn)
- dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
-
- CDentry::linkage_t *dnl = dn->get_linkage();
-
- assert(dn->last == q->first.snapid);
- if (dn->first != q->second.first) {
- dout(10) << " adjust dn.first " << dn->first << " -> " << q->second.first << " on " << *dn << dendl;
- dn->first = q->second.first;
- }
+ auto it = ack->strong_dentries.find(p.first);
+ if (it != ack->strong_dentries.end()) {
+ for (const auto &q : it->second) {
+ CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
+ if(!dn)
+ dn = dir->add_null_dentry(q.first.name, q.second.first, q.first.snapid);
+
+ CDentry::linkage_t *dnl = dn->get_linkage();
+
+ ceph_assert(dn->last == q.first.snapid);
+ if (dn->first != q.second.first) {
+ dout(10) << " adjust dn.first " << dn->first << " -> " << q.second.first << " on " << *dn << dendl;
+ dn->first = q.second.first;
+ }
- // may have bad linkage if we missed dentry link/unlink messages
- if (dnl->is_primary()) {
- CInode *in = dnl->get_inode();
- if (!q->second.is_primary() ||
- vinodeno_t(q->second.ino, q->first.snapid) != in->vino()) {
- dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
- dir->unlink_inode(dn);
- }
- } else if (dnl->is_remote()) {
- if (!q->second.is_remote() ||
- q->second.remote_ino != dnl->get_remote_ino() ||
- q->second.remote_d_type != dnl->get_remote_d_type()) {
- dout(10) << " had bad linkage for " << *dn << dendl;
- dir->unlink_inode(dn);
- }
- } else {
- if (!q->second.is_null())
- dout(10) << " had bad linkage for " << *dn << dendl;
- }
+ // may have bad linkage if we missed dentry link/unlink messages
+ if (dnl->is_primary()) {
+ CInode *in = dnl->get_inode();
+ if (!q.second.is_primary() ||
+ vinodeno_t(q.second.ino, q.first.snapid) != in->vino()) {
+ dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
+ dir->unlink_inode(dn);
+ }
+ } else if (dnl->is_remote()) {
+ if (!q.second.is_remote() ||
+ q.second.remote_ino != dnl->get_remote_ino() ||
+ q.second.remote_d_type != dnl->get_remote_d_type()) {
+ dout(10) << " had bad linkage for " << *dn << dendl;
+ dir->unlink_inode(dn);
+ }
+ } else {
+ if (!q.second.is_null())
+ dout(10) << " had bad linkage for " << *dn << dendl;
+ }
- // hmm, did we have the proper linkage here?
- if (dnl->is_null() && !q->second.is_null()) {
- if (q->second.is_remote()) {
- dn->dir->link_remote_inode(dn, q->second.remote_ino, q->second.remote_d_type);
- } else {
- CInode *in = get_inode(q->second.ino, q->first.snapid);
- if (!in) {
- // barebones inode; assume it's dir, the full inode loop below will clean up.
- in = new CInode(this, false, q->second.first, q->first.snapid);
- in->inode.ino = q->second.ino;
- in->inode.mode = S_IFDIR;
- in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
- add_inode(in);
- dout(10) << " add inode " << *in << dendl;
- } else if (in->get_parent_dn()) {
- dout(10) << " had bad linkage for " << *(in->get_parent_dn())
- << ", unlinking " << *in << dendl;
- in->get_parent_dir()->unlink_inode(in->get_parent_dn());
+ // hmm, did we have the proper linkage here?
+ if (dnl->is_null() && !q.second.is_null()) {
+ if (q.second.is_remote()) {
+ dn->dir->link_remote_inode(dn, q.second.remote_ino, q.second.remote_d_type);
+ } else {
+ CInode *in = get_inode(q.second.ino, q.first.snapid);
+ if (!in) {
+ // barebones inode; assume it's a dir, the full inode loop below will clean up.
+ in = new CInode(this, false, q.second.first, q.first.snapid);
+ in->inode.ino = q.second.ino;
+ in->inode.mode = S_IFDIR;
+ in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
+ add_inode(in);
+ dout(10) << " add inode " << *in << dendl;
+ } else if (in->get_parent_dn()) {
+ dout(10) << " had bad linkage for " << *(in->get_parent_dn())
+ << ", unlinking " << *in << dendl;
+ in->get_parent_dir()->unlink_inode(in->get_parent_dn());
+ }
+ dn->dir->link_primary_inode(dn, in);
+ isolated_inodes.erase(in);
}
- dn->dir->link_primary_inode(dn, in);
- isolated_inodes.erase(in);
- }
- }
+ }
- dn->set_replica_nonce(q->second.nonce);
- dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters, survivor);
- dn->state_clear(CDentry::STATE_REJOINING);
- dout(10) << " got " << *dn << dendl;
+ dn->set_replica_nonce(q.second.nonce);
+ dn->lock.set_state_rejoin(q.second.lock, rejoin_waiters, survivor);
+ dn->state_clear(CDentry::STATE_REJOINING);
+ dout(10) << " got " << *dn << dendl;
+ }
}
}
for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
if ((*q)->is_auth() || ack->strong_dirfrags.count((*q)->dirfrag()))
continue;
- assert((*q)->get_num_any() == 0);
+ ceph_assert((*q)->get_num_any() == 0);
(*p)->close_dirfrag((*q)->get_frag());
}
}
// full dirfrags
- for (map<dirfrag_t, bufferlist>::iterator p = ack->dirfrag_bases.begin();
- p != ack->dirfrag_bases.end();
- ++p) {
- CDir *dir = get_dirfrag(p->first);
- assert(dir);
- bufferlist::iterator q = p->second.begin();
+ for (const auto &p : ack->dirfrag_bases) {
+ CDir *dir = get_dirfrag(p.first);
+ ceph_assert(dir);
+ auto q = p.second.cbegin();
dir->_decode_base(q);
dout(10) << " got dir replica " << *dir << dendl;
}
// full inodes
- bufferlist::iterator p = ack->inode_base.begin();
+ auto p = ack->inode_base.cbegin();
while (!p.end()) {
inodeno_t ino;
snapid_t last;
bufferlist basebl;
- ::decode(ino, p);
- ::decode(last, p);
- ::decode(basebl, p);
+ decode(ino, p);
+ decode(last, p);
+ decode(basebl, p);
CInode *in = get_inode(ino, last);
- assert(in);
- bufferlist::iterator q = basebl.begin();
+ ceph_assert(in);
+ auto q = basebl.cbegin();
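+ // note the realm seq before decoding, so we can detect snaprealm
+ // changes that happened while this mds was down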
+ snapid_t sseq = 0;
+ if (in->snaprealm)
+ sseq = in->snaprealm->srnode.seq;
in->_decode_base(q);
+ if (in->snaprealm && in->snaprealm->srnode.seq != sseq) {
+ int snap_op = sseq > 0 ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT;
+ updated_realms.push_back(pair<CInode*,int>(in, snap_op));
+ }
dout(10) << " got inode base " << *in << dendl;
}
// inodes
- p = ack->inode_locks.begin();
+ p = ack->inode_locks.cbegin();
//dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
while (!p.end()) {
inodeno_t ino;
snapid_t last;
__u32 nonce;
bufferlist lockbl;
- ::decode(ino, p);
- ::decode(last, p);
- ::decode(nonce, p);
- ::decode(lockbl, p);
+ decode(ino, p);
+ decode(last, p);
+ decode(nonce, p);
+ decode(lockbl, p);
CInode *in = get_inode(ino, last);
- assert(in);
+ ceph_assert(in);
in->set_replica_nonce(nonce);
- bufferlist::iterator q = lockbl.begin();
+ auto q = lockbl.cbegin();
in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks, survivor);
in->state_clear(CInode::STATE_REJOINING);
dout(10) << " got inode locks " << *in << dendl;
// FIXME: This can happen if entire subtree, together with the inode subtree root
// belongs to, were trimmed between sending cache rejoin and receiving rejoin ack.
- assert(isolated_inodes.empty());
+ ceph_assert(isolated_inodes.empty());
map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
- bufferlist::iterator bp = ack->imported_caps.begin();
- ::decode(peer_imported, bp);
+ auto bp = ack->imported_caps.cbegin();
+ decode(peer_imported, bp);
for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
p != peer_imported.end();
++p) {
auto& ex = cap_exports.at(p->first);
- assert(ex.first == from);
+ ceph_assert(ex.first == from);
for (map<client_t,Capability::Import>::iterator q = p->second.begin();
q != p->second.end();
++q) {
auto r = ex.second.find(q->first);
- assert(r != ex.second.end());
+ ceph_assert(r != ex.second.end());
dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
}
// mark client caps stale.
- MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0,
+ auto m = MClientCaps::create(CEPH_CAP_OP_EXPORT, p->first, 0,
r->second.capinfo.cap_id, 0,
mds->get_osd_epoch_barrier());
m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
ex.second.erase(r);
}
- assert(ex.second.empty());
+ ceph_assert(ex.second.empty());
+ }
+
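+ // handle realms whose seq changed while decoding inode bases: during
+ // rejoin just queue them for open_snaprealms(); survivors notify
+ // clients immediately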
+ for (auto p : updated_realms) {
+ CInode *in = p.first;
+ bool notify_clients;
+ if (mds->is_rejoin()) {
+ if (!rejoin_pending_snaprealms.count(in)) {
+ in->get(CInode::PIN_OPENINGSNAPPARENTS);
+ rejoin_pending_snaprealms.insert(in);
+ }
+ notify_clients = false;
+ } else {
+ // notify clients if I'm survivor
+ notify_clients = true;
+ }
+ do_realm_invalidate_and_update_notify(in, p.second, notify_clients);
}
// done?
- assert(rejoin_ack_gather.count(from));
+ ceph_assert(rejoin_ack_gather.count(from));
rejoin_ack_gather.erase(from);
if (!survivor) {
-
if (rejoin_gather.empty()) {
// eval unstable scatter locks after all wrlocks are rejoined.
while (!rejoin_eval_locks.empty()) {
if (rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too.
rejoin_ack_gather.empty()) {
// finally, kickstart past snap parent opens
- open_snap_parents();
+ open_snaprealms();
} else {
dout(7) << "still need rejoin from (" << rejoin_gather << ")"
<< ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl;
}
}
- assert(rejoin_undef_inodes.empty());
+ ceph_assert(rejoin_undef_inodes.empty());
}
void MDCache::rejoin_gather_finish()
{
dout(10) << "rejoin_gather_finish" << dendl;
- assert(mds->is_rejoin());
- assert(rejoin_ack_gather.count(mds->get_nodeid()));
+ ceph_assert(mds->is_rejoin());
+ ceph_assert(rejoin_ack_gather.count(mds->get_nodeid()));
if (open_undef_inodes_dirfrags())
return;
// did we already get our acks too?
if (rejoin_ack_gather.empty()) {
- // finally, kickstart past snap parent opens
- open_snap_parents();
+ // finally, open snaprealms
+ open_snaprealms();
}
}
if (ret < 0) {
cap_imports_missing.insert(ino);
} else if (ret == mds->get_nodeid()) {
- assert(get_inode(ino));
+ ceph_assert(get_inode(ino));
} else {
auto p = cap_imports.find(ino);
- assert(p != cap_imports.end());
+ ceph_assert(p != cap_imports.end());
for (auto q = p->second.begin(); q != p->second.end(); ++q) {
- assert(q->second.count(MDS_RANK_NONE));
- assert(q->second.size() == 1);
+ ceph_assert(q->second.count(MDS_RANK_NONE));
+ ceph_assert(q->second.size() == 1);
rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
}
cap_imports.erase(p);
}
- assert(cap_imports_num_opening > 0);
+ ceph_assert(cap_imports_num_opening > 0);
cap_imports_num_opening--;
if (cap_imports_num_opening == 0) {
map<client_t,pair<Session*,uint64_t> > session_map;
C_MDC_RejoinSessionsOpened(MDCache *c) : MDCacheLogContext(c) {}
void finish(int r) override {
- assert(r == 0);
+ ceph_assert(r == 0);
mdcache->rejoin_open_sessions_finish(session_map);
}
};
rejoin_gather_finish();
}
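+
+// New helper: finish a prefetched ino lookup during rejoin. On error, record
+// the ino as missing; if the inode turned out to live on another rank,
+// re-export the reconnected caps there and drop the pending import.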
+void MDCache::rejoin_prefetch_ino_finish(inodeno_t ino, int ret)
+{
+ auto p = cap_imports.find(ino);
+ if (p != cap_imports.end()) {
+ dout(10) << __func__ << " ino " << ino << " ret " << ret << dendl;
+ if (ret < 0) {
+ cap_imports_missing.insert(ino);
+ } else if (ret != mds->get_nodeid()) {
+ for (auto q = p->second.begin(); q != p->second.end(); ++q) {
+ ceph_assert(q->second.count(MDS_RANK_NONE));
+ ceph_assert(q->second.size() == 1);
+ rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
+ }
+ cap_imports.erase(p);
+ }
+ }
+}
+
bool MDCache::process_imported_caps()
{
dout(10) << "process_imported_caps" << dendl;
+ if (!open_file_table.is_prefetched() &&
+ open_file_table.prefetch_inodes()) {
+ open_file_table.wait_for_prefetch(
+ new MDSInternalContextWrapper(mds,
+ new FunctionContext([this](int r) {
+ ceph_assert(rejoin_gather.count(mds->get_nodeid()));
+ process_imported_caps();
+ })
+ )
+ );
+ return true;
+ }
+
for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
CInode *in = get_inode(p->first);
if (in) {
- assert(in->is_auth());
+ ceph_assert(in->is_auth());
cap_imports_missing.erase(p->first);
continue;
}
rejoin_session_map.empty()) {
C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this);
version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map,
+ rejoin_client_metadata_map,
finish->session_map);
- mds->mdlog->start_submit_entry(new ESessions(pv, rejoin_client_map), finish);
+ ESessions *le = new ESessions(pv, std::move(rejoin_client_map),
+ std::move(rejoin_client_metadata_map));
+ mds->mdlog->start_submit_entry(le, finish);
mds->mdlog->flush();
rejoin_client_map.clear();
+ rejoin_client_metadata_map.clear();
return true;
}
p != rejoin_slave_exports.end();
++p) {
CInode *in = get_inode(p->first);
- assert(in);
+ ceph_assert(in);
for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
q != p->second.second.end();
++q) {
Session *session = r->second.first;
Capability *cap = in->get_client_cap(q->first);
- if (!cap)
+ if (!cap) {
cap = in->add_client_cap(q->first, session);
+ // add an empty item to reconnected_caps: open_snaprealms() later expects
+ // every inode holding caps in the realm to have an entry here
+ (void)reconnected_caps[p->first][q->first];
+ }
cap->merge(q->second, true);
Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first];
- assert(cap->get_last_seq() == im.issue_seq);
- assert(cap->get_mseq() == im.mseq);
+ ceph_assert(cap->get_last_seq() == im.issue_seq);
+ ceph_assert(cap->get_mseq() == im.mseq);
cap->set_cap_id(im.cap_id);
// send cap import because we assigned a new cap ID
do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1,
++p;
continue;
}
- assert(in->is_auth());
+ ceph_assert(in->is_auth());
for (auto q = p->second.begin(); q != p->second.end(); ++q) {
Session *session;
{
} else {
trim_non_auth();
- assert(rejoin_gather.count(mds->get_nodeid()));
+ ceph_assert(rejoin_gather.count(mds->get_nodeid()));
rejoin_gather.erase(mds->get_nodeid());
- assert(!rejoin_ack_gather.count(mds->get_nodeid()));
+ ceph_assert(!rejoin_ack_gather.count(mds->get_nodeid()));
maybe_send_pending_rejoins();
}
return false;
}
-void MDCache::check_realm_past_parents(SnapRealm *realm, bool reconnect)
-{
- // are this realm's parents fully open?
- if (realm->have_past_parents_open()) {
- dout(10) << " have past snap parents for realm " << *realm
- << " on " << *realm->inode << dendl;
- if (reconnect) {
- // finish off client snaprealm reconnects?
- auto p = reconnected_snaprealms.find(realm->inode->ino());
- if (p != reconnected_snaprealms.end()) {
- for (auto q = p->second.begin(); q != p->second.end(); ++q)
- finish_snaprealm_reconnect(q->first, realm, q->second);
- reconnected_snaprealms.erase(p);
- }
- }
- } else {
- if (!missing_snap_parents.count(realm->inode)) {
- dout(10) << " MISSING past snap parents for realm " << *realm
- << " on " << *realm->inode << dendl;
- realm->inode->get(CInode::PIN_OPENINGSNAPPARENTS);
- missing_snap_parents[realm->inode].size(); // just to get it into the map!
- } else {
- dout(10) << " (already) MISSING past snap parents for realm " << *realm
- << " on " << *realm->inode << dendl;
- }
- }
-}
-
void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm,
client_t client, snapid_t snap_follows)
{
dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl;
+ if (!realm->has_snaps_in_range(snap_follows + 1, head_in->first - 1))
+ return;
+
const set<snapid_t>& snaps = realm->get_snaps();
snapid_t follows = snap_follows;
CInode *in = pick_inode_snap(head_in, follows);
if (in == head_in)
break;
+
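+ // register each snapid in this inode's range that is newer than 'follows';
+ // if the client owes no snapflush here, skip the lock/wrlock setup below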
+ bool need_snapflush = false;
+ for (auto p = snaps.lower_bound(std::max<snapid_t>(in->first, (follows + 1)));
+ p != snaps.end() && *p <= in->last;
+ ++p) {
+ head_in->add_need_snapflush(in, *p, client);
+ need_snapflush = true;
+ }
+ follows = in->last;
+ if (!need_snapflush)
+ continue;
+
dout(10) << " need snapflush from client." << client << " on " << *in << dendl;
- /* TODO: we can check the reconnected/flushing caps to find
+ /* TODO: we can check the reconnected/flushing caps to find
* which locks need gathering */
for (int i = 0; i < num_cinode_locks; i++) {
int lockid = cinode_lock_info[i].lock;
SimpleLock *lock = in->get_lock(lockid);
- assert(lock);
+ ceph_assert(lock);
in->client_snap_caps[lockid].insert(client);
in->auth_pin(lock);
lock->set_state(LOCK_SNAP_SYNC);
lock->get_wrlock(true);
}
-
- for (auto p = snaps.lower_bound(in->first);
- p != snaps.end() && *p <= in->last;
- ++p) {
- head_in->add_need_snapflush(in, *p, client);
- }
-
- follows = in->last;
+ mds->locker->mark_need_snapflush_inode(in);
}
}
{
dout(10) << "choose_lock_states_and_reconnect_caps" << dendl;
- map<client_t,MClientSnap*> splits;
-
- for (auto i : inode_map) {
- CInode *in = i.second;
+ for (auto p : inode_map) {
+ CInode *in = p.second;
if (in->last != CEPH_NOSNAP)
continue;
in->mark_dirty_rstat();
int dirty_caps = 0;
- auto p = reconnected_caps.find(in->ino());
- if (p != reconnected_caps.end()) {
- for (const auto &it : p->second)
+ auto q = reconnected_caps.find(in->ino());
+ if (q != reconnected_caps.end()) {
+ for (const auto &it : q->second)
dirty_caps |= it.second.dirty_caps;
}
in->choose_lock_states(dirty_caps);
dout(15) << " chose lock states on " << *in << dendl;
- SnapRealm *realm = in->find_snaprealm();
-
- check_realm_past_parents(realm, realm == in->snaprealm);
-
- if (p != reconnected_caps.end()) {
- bool missing_snap_parent = false;
- // also, make sure client's cap is in the correct snaprealm.
- for (auto q = p->second.begin(); q != p->second.end(); ++q) {
- if (q->second.snap_follows > 0 && q->second.snap_follows < in->first - 1) {
- if (realm->have_past_parents_open()) {
- rebuild_need_snapflush(in, realm, q->first, q->second.snap_follows);
- } else {
- missing_snap_parent = true;
- }
- }
-
- if (q->second.realm_ino == realm->inode->ino()) {
- dout(15) << " client." << q->first << " has correct realm " << q->second.realm_ino << dendl;
- } else {
- dout(15) << " client." << q->first << " has wrong realm " << q->second.realm_ino
- << " != " << realm->inode->ino() << dendl;
- if (realm->have_past_parents_open()) {
- // ok, include in a split message _now_.
- prepare_realm_split(realm, q->first, in->ino(), splits);
- } else {
- // send the split later.
- missing_snap_parent = true;
- }
- }
- }
- if (missing_snap_parent)
- missing_snap_parents[realm->inode].insert(in);
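+ // snap parent opening is deferred: pin the inode and queue it so that
+ // open_snaprealms() can open past parents and send splits later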
+ if (in->snaprealm && !rejoin_pending_snaprealms.count(in)) {
+ in->get(CInode::PIN_OPENINGSNAPPARENTS);
+ rejoin_pending_snaprealms.insert(in);
}
- }
-
- send_snaps(splits);
+ }
}
void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
- map<client_t,MClientSnap*>& splits)
+ map<client_t,MClientSnap::ref>& splits)
{
- MClientSnap *snap;
- if (splits.count(client) == 0) {
- splits[client] = snap = new MClientSnap(CEPH_SNAP_OP_SPLIT);
+ MClientSnap::ref snap;
+ auto it = splits.find(client);
+ if (it != splits.end()) {
+ snap = it->second;
+ snap->head.op = CEPH_SNAP_OP_SPLIT;
+ } else {
+ snap = MClientSnap::create(CEPH_SNAP_OP_SPLIT);
+ splits.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap));
snap->head.split = realm->inode->ino();
- realm->build_snap_trace(snap->bl);
+ snap->bl = realm->get_snap_trace();
- for (set<SnapRealm*>::iterator p = realm->open_children.begin();
- p != realm->open_children.end();
- ++p)
- snap->split_realms.push_back((*p)->inode->ino());
-
- } else
- snap = splits[client];
+ for (const auto& child : realm->open_children)
+ snap->split_realms.push_back(child->inode->ino());
+ }
snap->split_inos.push_back(ino);
}
-void MDCache::send_snaps(map<client_t,MClientSnap*>& splits)
+void MDCache::prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm,
+ map<client_t,MClientSnap::ref>& splits)
+{
+ ceph_assert(parent_realm);
+
+ vector<inodeno_t> split_inos;
+ vector<inodeno_t> split_realms;
+
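+ // gather every capped inode and child realm under the realm being merged;
+ // each client with caps there gets one SPLIT message pointing at the parent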
+ for (elist<CInode*>::iterator p = realm->inodes_with_caps.begin(member_offset(CInode, item_caps));
+ !p.end();
+ ++p)
+ split_inos.push_back((*p)->ino());
+ for (set<SnapRealm*>::iterator p = realm->open_children.begin();
+ p != realm->open_children.end();
+ ++p)
+ split_realms.push_back((*p)->inode->ino());
+
+ for (const auto& p : realm->client_caps) {
+ ceph_assert(!p.second->empty());
+ auto em = splits.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), std::forward_as_tuple());
+ if (em.second) {
+ auto update = MClientSnap::create(CEPH_SNAP_OP_SPLIT);
+ update->head.split = parent_realm->inode->ino();
+ update->split_inos = split_inos;
+ update->split_realms = split_realms;
+ update->bl = parent_realm->get_snap_trace();
+ em.first->second = std::move(update);
+ }
+ }
+}
+
+void MDCache::send_snaps(map<client_t,MClientSnap::ref>& splits)
{
dout(10) << "send_snaps" << dendl;
- for (map<client_t,MClientSnap*>::iterator p = splits.begin();
- p != splits.end();
- ++p) {
- Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->first.v));
+ for (auto &p : splits) {
+ Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p.first.v));
if (session) {
- dout(10) << " client." << p->first
- << " split " << p->second->head.split
- << " inos " << p->second->split_inos
+ dout(10) << " client." << p.first
+ << " split " << p.second->head.split
+ << " inos " << p.second->split_inos
<< dendl;
- mds->send_message_client_counted(p->second, session);
+ mds->send_message_client_counted(p.second, session);
} else {
- dout(10) << " no session for client." << p->first << dendl;
- p->second->put();
+ dout(10) << " no session for client." << p.first << dendl;
}
}
splits.clear();
CInode *in = *q;
++q;
if (in->last == CEPH_NOSNAP) {
- if (!in->is_any_caps_wanted()) {
- dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
- in->item_open_file.remove_myself();
- }
- } else if (in->last != CEPH_NOSNAP) {
+ dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
+ in->item_open_file.remove_myself();
+ } else {
if (in->client_snap_caps.empty()) {
dout(10) << " unlisting flushed snap inode " << *in << dendl;
in->item_open_file.remove_myself();
}
}
-
+void MDCache::dump_openfiles(Formatter *f)
+{
+ f->open_array_section("openfiles");
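+ // walk each log segment's open-files list, skipping inodes that no longer
+ // need tracking (capless head inodes, fully flushed snap inodes)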
+ for (auto p = mds->mdlog->segments.begin();
+ p != mds->mdlog->segments.end();
+ ++p) {
+ LogSegment *ls = p->second;
+
+ auto q = ls->open_files.begin(member_offset(CInode, item_open_file));
+ while (!q.end()) {
+ CInode *in = *q;
+ ++q;
+ if ((in->last == CEPH_NOSNAP && !in->is_any_caps_wanted())
+ || (in->last != CEPH_NOSNAP && in->client_snap_caps.empty()))
+ continue;
+ f->open_object_section("file");
+ in->dump(f, CInode::DUMP_PATH | CInode::DUMP_INODE_STORE_BASE | CInode::DUMP_CAPS);
+ f->close_section();
+ }
+ }
+ f->close_section();
+}
Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds)
{
Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
if (session) {
// mark client caps stale.
- MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0, mds->get_osd_epoch_barrier());
+ auto stale = MClientCaps::create(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0, mds->get_osd_epoch_barrier());
stale->set_cap_peer(0, 0, 0, -1, 0);
mds->send_message_client_counted(stale, q->first);
}
mds->heartbeat_reset();
}
- for (map<inodeno_t, list<MDSInternalContextBase*> >::iterator p = cap_reconnect_waiters.begin();
+ for (map<inodeno_t, MDSContext::vec >::iterator p = cap_reconnect_waiters.begin();
p != cap_reconnect_waiters.end();
++p)
mds->queue_waiters(p->second);
dout(15) << " chose lock states on " << *in << dendl;
}
- map<inodeno_t, list<MDSInternalContextBase*> >::iterator it =
+ map<inodeno_t, MDSContext::vec >::iterator it =
cap_reconnect_waiters.find(in->ino());
if (it != cap_reconnect_waiters.end()) {
mds->queue_waiters(it->second);
uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
int peer, int p_flags)
{
- client_t client = session->info.inst.name.num();
SnapRealm *realm = in->find_snaprealm();
if (realm->have_past_parents_open()) {
dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
cap->set_last_issue();
cap->set_last_issue_stamp(ceph_clock_now());
cap->clear_new();
- MClientCaps *reap = new MClientCaps(CEPH_CAP_OP_IMPORT,
- in->ino(),
- realm->inode->ino(),
- cap->get_cap_id(), cap->get_last_seq(),
- cap->pending(), cap->wanted(), 0,
- cap->get_mseq(), mds->get_osd_epoch_barrier());
+ auto reap = MClientCaps::create(CEPH_CAP_OP_IMPORT, in->ino(), realm->inode->ino(),
+ cap->get_cap_id(), cap->get_last_seq(),
+ cap->pending(), cap->wanted(), 0,
+ cap->get_mseq(), mds->get_osd_epoch_barrier());
in->encode_cap_message(reap, cap);
- realm->build_snap_trace(reap->snapbl);
+ reap->snapbl = realm->get_snap_trace();
reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
mds->send_message_client_counted(reap, session);
} else {
- dout(10) << "do_cap_import missing past snap parents, delaying " << session->info.inst.name << " mseq "
- << cap->get_mseq() << " on " << *in << dendl;
- in->auth_pin(this);
- cap->inc_suppress();
- delayed_imported_caps[client].insert(in);
- missing_snap_parents[in].size();
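+ // past snap parents should already be open by the time caps are imported
+ // (rejoin opens them in open_snaprealms), so this branch must not happen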
+ ceph_abort();
}
}
{
dout(10) << "do_delayed_cap_imports" << dendl;
- assert(delayed_imported_caps.empty());
+ ceph_assert(delayed_imported_caps.empty());
}
-struct C_MDC_OpenSnapParents : public MDCacheContext {
- explicit C_MDC_OpenSnapParents(MDCache *c) : MDCacheContext(c) {}
+struct C_MDC_OpenSnapRealms : public MDCacheContext {
+ explicit C_MDC_OpenSnapRealms(MDCache *c) : MDCacheContext(c) {}
void finish(int r) override {
- mdcache->open_snap_parents();
+ mdcache->open_snaprealms();
}
};
-void MDCache::open_snap_parents()
+void MDCache::open_snaprealms()
{
- dout(10) << "open_snap_parents" << dendl;
+ dout(10) << "open_snaprealms" << dendl;
- map<client_t,MClientSnap*> splits;
MDSGatherBuilder gather(g_ceph_context);
- auto p = missing_snap_parents.begin();
- while (p != missing_snap_parents.end()) {
- CInode *in = p->first;
- assert(in->snaprealm);
- if (in->snaprealm->open_parents(gather.new_sub())) {
+ auto it = rejoin_pending_snaprealms.begin();
+ while (it != rejoin_pending_snaprealms.end()) {
+ CInode *in = *it;
+ SnapRealm *realm = in->snaprealm;
+ ceph_assert(realm);
+ if (realm->have_past_parents_open() ||
+ realm->open_parents(gather.new_sub())) {
dout(10) << " past parents now open on " << *in << dendl;
- for (CInode *child : p->second) {
+ map<client_t,MClientSnap::ref> splits;
+ // finish off client snaprealm reconnects?
+ map<inodeno_t,map<client_t,snapid_t> >::iterator q = reconnected_snaprealms.find(in->ino());
+ if (q != reconnected_snaprealms.end()) {
+ for (const auto& r : q->second)
+ finish_snaprealm_reconnect(r.first, realm, r.second, splits);
+ reconnected_snaprealms.erase(q);
+ }
+
+ for (elist<CInode*>::iterator p = realm->inodes_with_caps.begin(member_offset(CInode, item_caps));
+ !p.end(); ++p) {
+ CInode *child = *p;
auto q = reconnected_caps.find(child->ino());
- assert(q != reconnected_caps.end());
+ ceph_assert(q != reconnected_caps.end());
for (auto r = q->second.begin(); r != q->second.end(); ++r) {
- if (r->second.snap_follows > 0 && r->second.snap_follows < in->first - 1) {
- rebuild_need_snapflush(child, in->snaprealm, r->first, r->second.snap_follows);
+ Capability *cap = child->get_client_cap(r->first);
+ if (!cap)
+ continue;
+ if (r->second.snap_follows > 0) {
+ if (r->second.snap_follows < child->first - 1) {
+ rebuild_need_snapflush(child, realm, r->first, r->second.snap_follows);
+ } else if (r->second.snapflush) {
+ // When processing a cap flush message that is re-sent, it's possible
+ // that the sender has already released all WR caps. So we should
+ // force MDCache::cow_inode() to set up CInode::client_need_snapflush.
+ cap->mark_needsnapflush();
+ }
}
// make sure client's cap is in the correct snaprealm.
if (r->second.realm_ino != in->ino()) {
- prepare_realm_split(in->snaprealm, r->first, child->ino(), splits);
+ prepare_realm_split(realm, r->first, child->ino(), splits);
}
}
}
- missing_snap_parents.erase(p++);
-
+ rejoin_pending_snaprealms.erase(it++);
in->put(CInode::PIN_OPENINGSNAPPARENTS);
- // finish off client snaprealm reconnects?
- map<inodeno_t,map<client_t,snapid_t> >::iterator q = reconnected_snaprealms.find(in->ino());
- if (q != reconnected_snaprealms.end()) {
- for (map<client_t,snapid_t>::iterator r = q->second.begin();
- r != q->second.end();
- ++r)
- finish_snaprealm_reconnect(r->first, in->snaprealm, r->second);
- reconnected_snaprealms.erase(q);
- }
+ send_snaps(splits);
} else {
dout(10) << " opening past parents on " << *in << dendl;
- ++p;
+ ++it;
}
}
- send_snaps(splits);
-
if (gather.has_subs()) {
- dout(10) << "open_snap_parents - waiting for "
- << gather.num_subs_remaining() << dendl;
- gather.set_finisher(new C_MDC_OpenSnapParents(this));
- gather.activate();
- } else {
- if (!reconnected_snaprealms.empty()) {
+ if (gather.num_subs_remaining() == 0) {
+ // cleanup gather
+ gather.set_finisher(new C_MDSInternalNoop);
+ gather.activate();
+ } else {
+ // for multimds, must succeed the first time
+ ceph_assert(recovery_set.empty());
+
+ dout(10) << "open_snaprealms - waiting for "
+ << gather.num_subs_remaining() << dendl;
+ gather.set_finisher(new C_MDC_OpenSnapRealms(this));
+ gather.activate();
+ return;
+ }
+ }
+
+ notify_global_snaprealm_update(CEPH_SNAP_OP_UPDATE);
+
+ if (!reconnected_snaprealms.empty()) {
+ dout(5) << "open_snaprealms has unconnected snaprealm:" << dendl;
+ for (auto& p : reconnected_snaprealms) {
stringstream warn_str;
- for (map<inodeno_t,map<client_t,snapid_t> >::iterator p = reconnected_snaprealms.begin();
- p != reconnected_snaprealms.end();
- ++p) {
- warn_str << " unconnected snaprealm " << p->first << "\n";
- for (map<client_t,snapid_t>::iterator q = p->second.begin();
- q != p->second.end();
- ++q)
- warn_str << " client." << q->first << " snapid " << q->second << "\n";
+ warn_str << " " << p.first << " {";
+ bool first = true;
+ for (auto& q : p.second) {
+ if (!first)
+ warn_str << ", ";
+ warn_str << "client." << q.first << "/" << q.second;
}
- mds->clog->warn() << "open_snap_parents has:";
- mds->clog->warn(warn_str);
+ warn_str << "}";
+ dout(5) << warn_str.str() << dendl;
}
- assert(rejoin_waiters.empty());
- assert(missing_snap_parents.empty());
- dout(10) << "open_snap_parents - all open" << dendl;
- do_delayed_cap_imports();
-
- assert(rejoin_done);
- rejoin_done.release()->complete(0);
- reconnected_caps.clear();
}
+ ceph_assert(rejoin_waiters.empty());
+ ceph_assert(rejoin_pending_snaprealms.empty());
+ dout(10) << "open_snaprealms - all open" << dendl;
+ do_delayed_cap_imports();
+
+ ceph_assert(rejoin_done);
+ rejoin_done.release()->complete(0);
+ reconnected_caps.clear();
}
bool MDCache::open_undef_inodes_dirfrags()
p != rejoin_undef_inodes.end();
++p) {
CInode *in = *p;
- assert(!in->is_base());
+ ceph_assert(!in->is_base());
fetch_queue.insert(in->get_parent_dir());
}
if (diri->state_test(CInode::STATE_REJOINUNDEF))
continue;
if (dir->state_test(CDir::STATE_REJOINUNDEF))
- assert(diri->dirfragtree.is_leaf(dir->get_frag()));
+ ceph_assert(diri->dirfragtree.is_leaf(dir->get_frag()));
dir->fetch(gather.new_sub());
}
- assert(gather.has_subs());
+ ceph_assert(gather.has_subs());
gather.activate();
return true;
}
rejoin_undef_inodes.erase(in);
if (in->is_dir()) {
// FIXME: re-hash dentries if necessary
- assert(in->inode.dir_layout.dl_dir_hash == g_conf->mds_default_dir_hash);
+ ceph_assert(in->inode.dir_layout.dl_dir_hash == g_conf()->mds_default_dir_hash);
if (in->has_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) {
CDir *dir = in->get_dirfrag(frag_t());
- assert(dir);
+ ceph_assert(dir);
rejoin_undef_dirfrags.erase(dir);
in->force_dirfrags();
list<CDir*> ls;
}
}
-void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq)
+void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq,
+ map<client_t,MClientSnap::ref>& updates)
{
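+ // rather than sending directly, stage the update in 'updates'; the caller
+ // batches one message per client and sends them via send_snaps()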
if (seq < realm->get_newest_seq()) {
dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < "
- << realm->get_newest_seq()
- << " on " << *realm << dendl;
- // send an update
- Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
- if (session) {
- MClientSnap *snap = new MClientSnap(CEPH_SNAP_OP_UPDATE);
- realm->build_snap_trace(snap->bl);
- mds->send_message_client_counted(snap, session);
- } else {
- dout(10) << " ...or not, no session for this client!" << dendl;
- }
+ << realm->get_newest_seq() << " on " << *realm << dendl;
+ auto snap = MClientSnap::create(CEPH_SNAP_OP_UPDATE);
+ snap->bl = realm->get_snap_trace();
+ for (const auto& child : realm->open_children)
+ snap->split_realms.push_back(child->inode->ino());
+ updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap));
} else {
dout(10) << "finish_snaprealm_reconnect client." << client << " up to date"
<< " on " << *realm << dendl;
rejoin_unlinked_inodes.clear();
// send acks to everyone in the recovery set
- map<mds_rank_t,MMDSCacheRejoin*> acks;
+ map<mds_rank_t,MMDSCacheRejoin::ref> acks;
for (set<mds_rank_t>::iterator p = recovery_set.begin();
p != recovery_set.end();
++p) {
if (rejoin_ack_sent.count(*p))
continue;
- acks[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK);
+ acks[*p] = MMDSCacheRejoin::create(MMDSCacheRejoin::OP_ACK);
}
rejoin_ack_sent = recovery_set;
// send acks
for (auto p = acks.begin(); p != acks.end(); ++p) {
- ::encode(rejoin_imported_caps[p->first], p->second->imported_caps);
+ encode(rejoin_imported_caps[p->first], p->second->imported_caps);
mds->send_message_mds(p->second, p->first);
}
void MDCache::queue_file_recover(CInode *in)
{
dout(10) << "queue_file_recover " << *in << dendl;
- assert(in->is_auth());
+ ceph_assert(in->is_auth());
// cow?
/*
snapid_t snapid = *s.begin();
CInode *cow_inode = 0;
journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
- assert(cow_inode);
+ ceph_assert(cow_inode);
recovery_queue.enqueue(cow_inode);
s.erase(*s.begin());
}
if (in->filelock.is_stable()) {
in->auth_pin(&in->filelock);
} else {
- assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
+ ceph_assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
}
in->filelock.set_state(LOCK_PRE_SCAN);
rejoin_recover_q.push_back(in);
if (!in->client_need_snapflush.empty() &&
(in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
- assert(in->filelock.is_xlocked());
+ ceph_assert(in->filelock.is_xlocked());
in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
mds->locker->issue_caps(in);
return;
MDCacheIOContext(c, false), in(i), ls(l) {
}
void finish(int r) override {
- assert(r == 0 || r == -ENOENT);
+ ceph_assert(r == 0 || r == -ENOENT);
mdcache->truncate_inode_finish(in, ls);
}
void print(ostream& out) const override {
<< pi->truncate_from << " -> " << pi->truncate_size
<< " on " << *in << dendl;
- assert(pi->is_truncating());
- assert(pi->truncate_size < (1ULL << 63));
- assert(pi->truncate_from < (1ULL << 63));
- assert(pi->truncate_size < pi->truncate_from);
+ ceph_assert(pi->is_truncating());
+ ceph_assert(pi->truncate_size < (1ULL << 63));
+ ceph_assert(pi->truncate_from < (1ULL << 63));
+ ceph_assert(pi->truncate_size < pi->truncate_from);
SnapRealm *realm = in->find_snaprealm();
} else {
dout(10) << " NO realm, using null context" << dendl;
snapc = &nullsnap;
- assert(in->last == CEPH_NOSNAP);
+ ceph_assert(in->last == CEPH_NOSNAP);
}
dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl;
filer.truncate(in->inode.ino, &in->inode.layout, *snapc,
dout(10) << "truncate_inode_finish " << *in << dendl;
set<CInode*>::iterator p = ls->truncating_inodes.find(in);
- assert(p != ls->truncating_inodes.end());
+ ceph_assert(p != ls->truncating_inodes.end());
ls->truncating_inodes.erase(p);
// update
in->put(CInode::PIN_TRUNCATING);
in->auth_unpin(this);
- list<MDSInternalContextBase*> waiters;
+ MDSContext::vec waiters;
in->take_waiting(CInode::WAIT_TRUNC, waiters);
mds->queue_waiters(waiters);
}
<< ls->seq << "/" << ls->offset << dendl;
// if we have the logseg the truncate started in, it must be in our list.
set<CInode*>::iterator p = ls->truncating_inodes.find(in);
- assert(p != ls->truncating_inodes.end());
+ ceph_assert(p != ls->truncating_inodes.end());
ls->truncating_inodes.erase(p);
in->put(CInode::PIN_TRUNCATING);
}
if (!in->client_need_snapflush.empty() &&
(in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
- assert(in->filelock.is_stable());
+ ceph_assert(in->filelock.is_stable());
in->filelock.set_state(LOCK_XLOCKDONE);
in->auth_pin(&in->filelock);
in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
// ================================================================================
// cache trimming
-std::pair<bool, uint64_t> MDCache::trim_lru(uint64_t count, map<mds_rank_t, MCacheExpire*> &expiremap)
+std::pair<bool, uint64_t> MDCache::trim_lru(uint64_t count, expiremap& expiremap)
{
bool is_standby_replay = mds->is_standby_replay();
std::vector<CDentry *> unexpirables;
uint64_t trimmed = 0;
- auto trim_threshold = g_conf->get_val<uint64_t>("mds_cache_trim_threshold");
+ auto trim_threshold = g_conf().get_val<Option::size_t>("mds_cache_trim_threshold");
dout(7) << "trim_lru trimming " << count
<< " items from LRU"
<< " pinned=" << lru.lru_get_num_pinned()
<< dendl;
- const uint64_t trim_counter_start = trim_counter.get(ceph_clock_now());
+ const uint64_t trim_counter_start = trim_counter.get();
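+ // trim_counter decays over time; once the recent trim total crosses the
+ // threshold we stop early and report that trimming was throttled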
bool throttled = false;
while (1) {
throttled |= trim_counter_start+trimmed >= trim_threshold;
if (count > 0) count--;
}
}
- trim_counter.hit(ceph_clock_now(), trimmed);
+ trim_counter.hit(trimmed);
for (auto &dn : unexpirables) {
lru.lru_insert_mid(dn);
{
uint64_t used = cache_size();
uint64_t limit = cache_memory_limit;
- map<mds_rank_t, MCacheExpire*> expiremap;
+ expiremap expiremap;
dout(7) << "trim bytes_used=" << bytes2str(used)
<< " limit=" << bytes2str(limit)
if (!mdsdir_in)
continue;
- if (expiremap.count(rank) == 0) {
- expiremap[rank] = new MCacheExpire(mds->get_nodeid());
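+ // emplace() yields <iterator,bool>; the expire message is constructed only
+ // the first time a given rank appears in the expiremap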
+ auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(rank), std::forward_as_tuple());
+ if (em.second) {
+ em.first->second = MCacheExpire::create(mds->get_nodeid());
}
dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << mds << dendl;
// Other rank's base inodes (when I'm stopping)
if (mds->is_stopping()) {
for (set<CInode*>::iterator p = base_inodes.begin();
- p != base_inodes.end(); ++p) {
- if (MDS_INO_MDSDIR_OWNER((*p)->ino()) != mds->get_nodeid()) {
- dout(20) << __func__ << ": maybe trimming base: " << *(*p) << dendl;
- if ((*p)->get_num_ref() == 0) {
- trim_inode(NULL, *p, NULL, expiremap);
+ p != base_inodes.end();) {
+ CInode *base_in = *p;
+ ++p;
+ if (MDS_INO_IS_MDSDIR(base_in->ino()) &&
+ MDS_INO_MDSDIR_OWNER(base_in->ino()) != mds->get_nodeid()) {
+ dout(20) << __func__ << ": maybe trimming base: " << *base_in << dendl;
+ if (base_in->get_num_ref() == 0) {
+ trim_inode(NULL, base_in, NULL, expiremap);
++trimmed;
}
}
return result;
}
-void MDCache::send_expire_messages(map<mds_rank_t, MCacheExpire*>& expiremap)
+void MDCache::send_expire_messages(expiremap& expiremap)
{
// send expires
- for (map<mds_rank_t, MCacheExpire*>::iterator it = expiremap.begin();
- it != expiremap.end();
- ++it) {
+ for (const auto &p : expiremap) {
if (mds->is_cluster_degraded() &&
- (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN ||
- (mds->mdsmap->get_state(it->first) == MDSMap::STATE_REJOIN &&
- rejoin_sent.count(it->first) == 0))) {
- it->second->put();
+ (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
+ (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
+ rejoin_sent.count(p.first) == 0))) {
continue;
}
- dout(7) << "sending cache_expire to " << it->first << dendl;
- mds->send_message_mds(it->second, it->first);
+ dout(7) << "sending cache_expire to " << p.first << dendl;
+ mds->send_message_mds(p.second, p.first);
}
+ expiremap.clear();
}
-bool MDCache::trim_dentry(CDentry *dn, map<mds_rank_t, MCacheExpire*>& expiremap)
+bool MDCache::trim_dentry(CDentry *dn, expiremap& expiremap)
{
dout(12) << "trim_dentry " << *dn << dendl;
CDentry::linkage_t *dnl = dn->get_linkage();
CDir *dir = dn->get_dir();
- assert(dir);
+ ceph_assert(dir);
CDir *con = get_subtree_root(dir);
if (con)
dout(12) << " in container " << *con << dendl;
else {
dout(12) << " no container; under a not-yet-linked dir" << dendl;
- assert(dn->is_auth());
+ ceph_assert(dn->is_auth());
}
// If replica dentry is not readable, it's likely we will receive
} else if (dnl->is_primary()) {
// expire the inode, too.
CInode *in = dnl->get_inode();
- assert(in);
+ ceph_assert(in);
if (trim_inode(dn, in, con, expiremap))
return true; // purging stray instead of trimming
} else {
- assert(dnl->is_null());
+ ceph_assert(dnl->is_null());
}
if (!dn->is_auth()) {
if (a == mds->get_nodeid()) continue; // on export, ignore myself.
dout(12) << " sending expire to mds." << a << " on " << *dn << dendl;
- assert(a != mds->get_nodeid());
- if (expiremap.count(a) == 0)
- expiremap[a] = new MCacheExpire(mds->get_nodeid());
- expiremap[a]->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->last, dn->get_replica_nonce());
+ ceph_assert(a != mds->get_nodeid());
+ auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
+ if (em.second)
+ em.first->second = MCacheExpire::create(mds->get_nodeid());
+ em.first->second->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->last, dn->get_replica_nonce());
}
}
}
-void MDCache::trim_dirfrag(CDir *dir, CDir *con, map<mds_rank_t, MCacheExpire*>& expiremap)
+void MDCache::trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap)
{
dout(15) << "trim_dirfrag " << *dir << dendl;
if (dir->is_subtree_root()) {
- assert(!dir->is_auth() ||
+ ceph_assert(!dir->is_auth() ||
(!dir->is_replicated() && dir->inode->is_base()));
remove_subtree(dir); // remove from subtree map
}
- assert(dir->get_num_ref() == 0);
+ ceph_assert(dir->get_num_ref() == 0);
CInode *in = dir->get_inode();
if (a == mds->get_nodeid()) continue; // on export, ignore myself.
dout(12) << " sending expire to mds." << a << " on " << *dir << dendl;
- assert(a != mds->get_nodeid());
- if (expiremap.count(a) == 0)
- expiremap[a] = new MCacheExpire(mds->get_nodeid());
- expiremap[a]->add_dir(condf, dir->dirfrag(), dir->replica_nonce);
+ ceph_assert(a != mds->get_nodeid());
+ auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
+ if (em.second)
+ em.first->second = MCacheExpire::create(mds->get_nodeid()); /* new */
+ em.first->second->add_dir(condf, dir->dirfrag(), dir->replica_nonce);
}
}
*
* @return true if the inode is still in cache, else false if it was trimmed
*/
-bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map<mds_rank_t, MCacheExpire*>& expiremap)
+bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap& expiremap)
{
dout(15) << "trim_inode " << *in << dendl;
- assert(in->get_num_ref() == 0);
+ ceph_assert(in->get_num_ref() == 0);
if (in->is_dir()) {
// If replica inode's dirfragtreelock is not readable, it's likely
in->get_dirfrags(dfls);
for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
CDir *dir = *p;
- assert(!dir->is_subtree_root());
+ ceph_assert(!dir->is_subtree_root());
trim_dirfrag(dir, con ? con:dir, expiremap); // if no container (e.g. root dirfrag), use *p
}
}
if (a == mds->get_nodeid()) continue; // on export, ignore myself.
dout(12) << " sending expire to mds." << a << " on " << *in << dendl;
- assert(a != mds->get_nodeid());
- if (expiremap.count(a) == 0)
- expiremap[a] = new MCacheExpire(mds->get_nodeid());
- expiremap[a]->add_inode(df, in->vino(), in->get_replica_nonce());
+ ceph_assert(a != mds->get_nodeid());
+ auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
+ if (em.second)
+ em.first->second = MCacheExpire::create(mds->get_nodeid()); /* new */
+ em.first->second->add_inode(df, in->vino(), in->get_replica_nonce());
}
}
} else {
// non-auth. expire.
CDir *dir = dn->get_dir();
- assert(dir);
+ ceph_assert(dir);
// unlink the dentry
dout(10) << " removing " << *dn << dendl;
in->get_dirfrags(ls);
for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
CDir *subdir = *p;
- assert(!subdir->is_subtree_root());
+ ceph_assert(!subdir->is_subtree_root());
in->close_dirfrag(subdir->dirfrag().frag);
}
dir->unlink_inode(dn, false);
remove_inode(in);
}
else {
- assert(dnl->is_null());
+ ceph_assert(dnl->is_null());
}
- assert(!dir->has_bloom());
+ ceph_assert(!dir->has_bloom());
dir->remove_dentry(dn);
// adjust the dir state
dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete!
p != ls.end();
++p) {
dout(10) << " removing " << **p << dendl;
- assert((*p)->get_num_ref() == 1); // SUBTREE
+ ceph_assert((*p)->get_num_ref() == 1); // SUBTREE
remove_subtree((*p));
in->close_dirfrag((*p)->dirfrag().frag);
}
dout(10) << " removing " << *in << dendl;
- assert(!in->get_parent_dn());
- assert(in->get_num_ref() == 0);
+ ceph_assert(!in->get_parent_dn());
+ ceph_assert(in->get_num_ref() == 0);
remove_inode(in);
}
}
dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry" << dn << dendl;
dir->unlink_inode(dn, false);
remove_inode(in);
- assert(!dir->has_bloom());
+ ceph_assert(!dir->has_bloom());
dir->remove_dentry(dn);
} else {
dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn <<dendl;
diri->close_dirfrag(dir->get_frag());
dout(10) << " removing " << *diri << dendl;
- assert(!diri->get_parent_dn());
- assert(diri->get_num_ref() == 0);
+ ceph_assert(!diri->get_parent_dn());
+ ceph_assert(diri->get_num_ref() == 0);
remove_inode(diri);
}
break;
}
}
-/* This function DOES put the passed message before returning */
-void MDCache::handle_cache_expire(MCacheExpire *m)
+void MDCache::handle_cache_expire(const MCacheExpire::const_ref &m)
{
mds_rank_t from = mds_rank_t(m->get_from());
dout(7) << "cache_expire from mds." << from << dendl;
if (mds->get_state() < MDSMap::STATE_REJOIN) {
- m->put();
return;
}
set<SimpleLock *> gather_locks;
// loop over realms
- for (map<dirfrag_t,MCacheExpire::realm>::iterator p = m->realms.begin();
- p != m->realms.end();
- ++p) {
+ for (const auto &p : m->realms) {
// check container?
- if (p->first.ino > 0) {
- CInode *expired_inode = get_inode(p->first.ino);
- assert(expired_inode); // we had better have this.
- CDir *parent_dir = expired_inode->get_approx_dirfrag(p->first.frag);
- assert(parent_dir);
+ if (p.first.ino > 0) {
+ CInode *expired_inode = get_inode(p.first.ino);
+ ceph_assert(expired_inode); // we had better have this.
+ CDir *parent_dir = expired_inode->get_approx_dirfrag(p.first.frag);
+ ceph_assert(parent_dir);
int export_state = -1;
if (parent_dir->is_auth() && parent_dir->is_exporting()) {
export_state = migrator->get_export_state(parent_dir);
- assert(export_state >= 0);
+ ceph_assert(export_state >= 0);
}
if (!parent_dir->is_auth() ||
// not auth.
dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl;
- assert(parent_dir->is_frozen_tree_root());
+ ceph_assert(parent_dir->is_frozen_tree_root());
// make a message container
- if (delayed_expire[parent_dir].count(from) == 0)
- delayed_expire[parent_dir][from] = new MCacheExpire(from);
-
+
+ auto em = delayed_expire[parent_dir].emplace(std::piecewise_construct, std::forward_as_tuple(from), std::forward_as_tuple());
+ if (em.second)
+ em.first->second = MCacheExpire::create(from); /* new */
+
// merge these expires into it
- delayed_expire[parent_dir][from]->add_realm(p->first, p->second);
+ em.first->second->add_realm(p.first, p.second);
continue;
}
- assert(export_state <= Migrator::EXPORT_PREPPING ||
+ ceph_assert(export_state <= Migrator::EXPORT_PREPPING ||
(export_state == Migrator::EXPORT_WARNING &&
!migrator->export_has_warned(parent_dir, from)));
}
// INODES
- for (map<vinodeno_t,uint32_t>::iterator it = p->second.inodes.begin();
- it != p->second.inodes.end();
- ++it) {
- CInode *in = get_inode(it->first);
- unsigned nonce = it->second;
+ for (const auto &q : p.second.inodes) {
+ CInode *in = get_inode(q.first);
+ unsigned nonce = q.second;
if (!in) {
- dout(0) << " inode expire on " << it->first << " from " << from
+ dout(0) << " inode expire on " << q.first << " from " << from
<< ", don't have it" << dendl;
- assert(in);
+ ceph_assert(in);
}
- assert(in->is_auth());
+ ceph_assert(in->is_auth());
dout(20) << __func__ << ": expiring inode " << *in << dendl;
// check nonce
}
// DIRS
- for (map<dirfrag_t,uint32_t>::iterator it = p->second.dirs.begin();
- it != p->second.dirs.end();
- ++it) {
- CDir *dir = get_dirfrag(it->first);
- unsigned nonce = it->second;
+ for (const auto &q : p.second.dirs) {
+ CDir *dir = get_dirfrag(q.first);
+ unsigned nonce = q.second;
if (!dir) {
- CInode *diri = get_inode(it->first.ino);
+ CInode *diri = get_inode(q.first.ino);
if (diri) {
if (mds->is_rejoin() &&
rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet
!diri->is_replica(from)) {
list<CDir*> ls;
diri->get_nested_dirfrags(ls);
- dout(7) << " dir expire on dirfrag " << it->first << " from mds." << from
+ dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
<< " while rejoining, inode isn't replicated" << dendl;
for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
dir = *q;
}
continue;
}
- CDir *other = diri->get_approx_dirfrag(it->first.frag);
+ CDir *other = diri->get_approx_dirfrag(q.first.frag);
if (other) {
- dout(7) << " dir expire on dirfrag " << it->first << " from mds." << from
+ dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
<< " have " << *other << ", mismatched frags, dropping" << dendl;
continue;
}
}
- dout(0) << " dir expire on " << it->first << " from " << from
+ dout(0) << " dir expire on " << q.first << " from " << from
<< ", don't have it" << dendl;
- assert(dir);
+ ceph_assert(dir);
}
dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;
- assert(dir->is_auth());
+ ceph_assert(dir->is_auth());
// check nonce
if (nonce == dir->get_replica_nonce(from)) {
}
// DENTRIES
- for (map<dirfrag_t, map<pair<string,snapid_t>,uint32_t> >::iterator pd = p->second.dentries.begin();
- pd != p->second.dentries.end();
- ++pd) {
- dout(10) << " dn expires in dir " << pd->first << dendl;
- CInode *diri = get_inode(pd->first.ino);
- assert(diri);
- CDir *dir = diri->get_dirfrag(pd->first.frag);
+ for (const auto &pd : p.second.dentries) {
+ dout(10) << " dn expires in dir " << pd.first << dendl;
+ CInode *diri = get_inode(pd.first.ino);
+ ceph_assert(diri);
+ CDir *dir = diri->get_dirfrag(pd.first.frag);
if (!dir) {
- dout(0) << " dn expires on " << pd->first << " from " << from
+ dout(0) << " dn expires on " << pd.first << " from " << from
<< ", must have refragmented" << dendl;
} else {
- assert(dir->is_auth());
+ ceph_assert(dir->is_auth());
}
- for (map<pair<string,snapid_t>,uint32_t>::iterator p = pd->second.begin();
- p != pd->second.end();
- ++p) {
- unsigned nonce = p->second;
+ for (const auto &p : pd.second) {
+ unsigned nonce = p.second;
CDentry *dn;
if (dir) {
- dn = dir->lookup(p->first.first, p->first.second);
+ dn = dir->lookup(p.first.first, p.first.second);
} else {
// which dirfrag for this dentry?
- CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p->first.first));
- assert(dir);
- assert(dir->is_auth());
- dn = dir->lookup(p->first.first, p->first.second);
+ CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p.first.first));
+ ceph_assert(dir);
+ ceph_assert(dir->is_auth());
+ dn = dir->lookup(p.first.first, p.first.second);
}
if (!dn) {
if (dir)
- dout(0) << " missing dentry for " << p->first.first << " snap " << p->first.second << " in " << *dir << dendl;
+ dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << " in " << *dir << dendl;
else
- dout(0) << " missing dentry for " << p->first.first << " snap " << p->first.second << dendl;
+ dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << dendl;
}
- assert(dn);
+ ceph_assert(dn);
if (nonce == dn->get_replica_nonce(from)) {
dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl;
}
}
- // done
- m->put();
-
for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
if (!(*p)->is_stable())
mds->locker->eval_gather(*p);
void MDCache::process_delayed_expire(CDir *dir)
{
dout(7) << "process_delayed_expire on " << *dir << dendl;
- for (map<mds_rank_t,MCacheExpire*>::iterator p = delayed_expire[dir].begin();
- p != delayed_expire[dir].end();
- ++p)
- handle_cache_expire(p->second);
+ for (const auto &p : delayed_expire[dir]) {
+ handle_cache_expire(p.second);
+ }
delayed_expire.erase(dir);
}
void MDCache::discard_delayed_expire(CDir *dir)
{
dout(7) << "discard_delayed_expire on " << *dir << dendl;
- for (map<mds_rank_t,MCacheExpire*>::iterator p = delayed_expire[dir].begin();
- p != delayed_expire[dir].end();
- ++p)
- p->second->put();
delayed_expire.erase(dir);
}
set<SimpleLock *>& gather_locks)
{
in->remove_replica(from);
- in->mds_caps_wanted.erase(from);
+ in->set_mds_caps_wanted(from, 0);
// note: this code calls _eval more often than it needs to!
// fix lock
gather_locks.insert(&dn->lock);
// Replicated strays might now be eligible for purge
- CDentry::linkage_t *dnl = dn->get_linkage();
+ CDentry::linkage_t *dnl = dn->get_projected_linkage();
if (dnl->is_primary()) {
maybe_eval_stray(dnl->get_inode());
}
static MemoryModel::snap baseline = last;
// check client caps
- assert(CInode::count() == inode_map.size() + snap_inode_map.size() + num_shadow_inodes);
+ ceph_assert(CInode::count() == inode_map.size() + snap_inode_map.size() + num_shadow_inodes);
double caps_per_inode = 0.0;
if (CInode::count())
caps_per_inode = (double)Capability::count() / (double)CInode::count();
<< ", rss " << last.get_rss()
<< ", heap " << last.get_heap()
<< ", baseline " << baseline.get_heap()
- << ", buffers " << (buffer::get_total_alloc() >> 10)
<< ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps"
<< ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
<< dendl;
// cache
char old_val[32] = { 0 };
char *o = old_val;
- g_conf->get_val("debug_mds", &o, sizeof(old_val));
- g_conf->set_val("debug_mds", "10");
- g_conf->apply_changes(NULL);
+ g_conf().get_val("debug_mds", &o, sizeof(old_val));
+ g_conf().set_val("debug_mds", "10");
+ g_conf().apply_changes(nullptr);
show_cache();
- g_conf->set_val("debug_mds", old_val);
- g_conf->apply_changes(NULL);
- mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
+ g_conf().set_val("debug_mds", old_val);
+ g_conf().apply_changes(nullptr);
+ mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
// this
dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
{
dout(5) << "shutdown_start" << dendl;
- if (g_conf->mds_shutdown_check)
- mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
+ if (g_conf()->mds_shutdown_check)
+ mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
- // g_conf->debug_mds = 10;
+ // g_conf()->debug_mds = 10;
}
}
if (num_auth_subtree > 0) {
- assert(mds->get_nodeid() > 0);
+ ceph_assert(mds->get_nodeid() > 0);
dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
show_subtrees();
return false;
show_cache();
return false;
}
- assert(!migrator->is_exporting());
- assert(!migrator->is_importing());
+ ceph_assert(!migrator->is_exporting());
+ ceph_assert(!migrator->is_importing());
// replicas may dirty scatter locks
if (myin && myin->is_replicated()) {
return false;
}
- if ((myin && myin->is_auth_pinned()) ||
- (mydir && mydir->is_auth_pinned())) {
+ if ((myin && myin->get_num_auth_pins()) ||
+ (mydir && (mydir->get_auth_pins() || mydir->get_dir_auth_pins()))) {
dout(7) << "still have auth pinned objects" << dendl;
return false;
}
if (!did_shutdown_log_cap) {
// flush journal header
dout(7) << "writing header for (now-empty) journal" << dendl;
- assert(mds->mdlog->empty());
+ ceph_assert(mds->mdlog->empty());
mds->mdlog->write_head(0);
// NOTE: filer active checker below will block us until this completes.
did_shutdown_log_cap = true;
remove_subtree(mydir);
myin->close_dirfrag(mydir->get_frag());
}
- assert(subtrees.empty());
+ ceph_assert(subtrees.empty());
if (myin) {
remove_inode(myin);
- assert(!myin);
+ ceph_assert(!myin);
}
+ if (global_snaprealm) {
+ remove_inode(global_snaprealm->inode);
+ global_snaprealm = nullptr;
+ }
+
// done!
dout(5) << "shutdown done." << dendl;
return true;
}
if (!dir->is_complete()) {
- MDSInternalContextBase *fin = nullptr;
+ MDSContext *fin = nullptr;
if (shutdown_exporting_strays.empty()) {
fin = new MDSInternalContextWrapper(mds,
new FunctionContext([this](int r) {
continue;
if (!mds0_active && !dn->state_test(CDentry::STATE_PURGING)) {
- next.second = string(it->first.name);
+ next.second = it->first.name;
goto done;
}
if (shutdown_exporting_strays.size() >= MAX_EXPORTING) {
++it;
if (it != dir->end()) {
- next.second = string(it->first.name);
+ next.second = it->first.name;
} else {
if (dfls.empty())
next.first.ino.val++;
// ========= messaging ==============
-/* This function DOES put the passed message before returning */
-void MDCache::dispatch(Message *m)
+void MDCache::dispatch(const Message::const_ref &m)
{
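+ // handlers now take typed const message refs; msgref_cast down-casts the
+ // generic ref, which is safe since we dispatch on get_type()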
switch (m->get_type()) {
// RESOLVE
case MSG_MDS_RESOLVE:
- handle_resolve(static_cast<MMDSResolve*>(m));
+ handle_resolve(MMDSResolve::msgref_cast(m));
break;
case MSG_MDS_RESOLVEACK:
- handle_resolve_ack(static_cast<MMDSResolveAck*>(m));
+ handle_resolve_ack(MMDSResolveAck::msgref_cast(m));
break;
// REJOIN
case MSG_MDS_CACHEREJOIN:
- handle_cache_rejoin(static_cast<MMDSCacheRejoin*>(m));
+ handle_cache_rejoin(MMDSCacheRejoin::msgref_cast(m));
break;
case MSG_MDS_DISCOVER:
- handle_discover(static_cast<MDiscover*>(m));
+ handle_discover(MDiscover::msgref_cast(m));
break;
case MSG_MDS_DISCOVERREPLY:
- handle_discover_reply(static_cast<MDiscoverReply*>(m));
+ handle_discover_reply(MDiscoverReply::msgref_cast(m));
break;
case MSG_MDS_DIRUPDATE:
- handle_dir_update(static_cast<MDirUpdate*>(m));
+ handle_dir_update(MDirUpdate::msgref_cast(m));
break;
case MSG_MDS_CACHEEXPIRE:
- handle_cache_expire(static_cast<MCacheExpire*>(m));
+ handle_cache_expire(MCacheExpire::msgref_cast(m));
break;
case MSG_MDS_DENTRYLINK:
- handle_dentry_link(static_cast<MDentryLink*>(m));
+ handle_dentry_link(MDentryLink::msgref_cast(m));
break;
case MSG_MDS_DENTRYUNLINK:
- handle_dentry_unlink(static_cast<MDentryUnlink*>(m));
+ handle_dentry_unlink(MDentryUnlink::msgref_cast(m));
break;
case MSG_MDS_FRAGMENTNOTIFY:
- handle_fragment_notify(static_cast<MMDSFragmentNotify*>(m));
+ handle_fragment_notify(MMDSFragmentNotify::msgref_cast(m));
break;
case MSG_MDS_FRAGMENTNOTIFYACK:
- handle_fragment_notify_ack(static_cast<MMDSFragmentNotifyAck*>(m));
+ handle_fragment_notify_ack(MMDSFragmentNotifyAck::msgref_cast(m));
break;
case MSG_MDS_FINDINO:
- handle_find_ino(static_cast<MMDSFindIno *>(m));
+ handle_find_ino(MMDSFindIno::msgref_cast(m));
break;
case MSG_MDS_FINDINOREPLY:
- handle_find_ino_reply(static_cast<MMDSFindInoReply *>(m));
+ handle_find_ino_reply(MMDSFindInoReply::msgref_cast(m));
break;
case MSG_MDS_OPENINO:
- handle_open_ino(static_cast<MMDSOpenIno *>(m));
+ handle_open_ino(MMDSOpenIno::msgref_cast(m));
break;
case MSG_MDS_OPENINOREPLY:
- handle_open_ino_reply(static_cast<MMDSOpenInoReply *>(m));
+ handle_open_ino_reply(MMDSOpenInoReply::msgref_cast(m));
+ break;
+
+ case MSG_MDS_SNAPUPDATE:
+ handle_snap_update(MMDSSnapUpdate::msgref_cast(m));
break;
default:
derr << "cache unknown message " << m->get_type() << dendl;
- assert(0 == "cache unknown message");
- }
-}
-
-MDSInternalContextBase *MDCache::_get_waiter(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin)
-{
- if (mdr) {
- dout(20) << "_get_waiter retryrequest" << dendl;
- return new C_MDS_RetryRequest(this, mdr);
- } else if (req) {
- dout(20) << "_get_waiter retrymessage" << dendl;
- return new C_MDS_RetryMessage(mds, req);
- } else {
- return fin;
+ ceph_abort_msg("cache unknown message");
}
}
-int MDCache::path_traverse(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin, // who
- const filepath& path, // what
+int MDCache::path_traverse(MDRequestRef& mdr, MDSContextFactory& cf, // who
+ const filepath& path, // what
vector<CDentry*> *pdnvec, // result
- CInode **pin,
+ CInode **pin,
int onfail)
{
bool discover = (onfail == MDS_TRAVERSE_DISCOVER);
bool null_okay = (onfail == MDS_TRAVERSE_DISCOVERXLOCK);
bool forward = (onfail == MDS_TRAVERSE_FORWARD);
- assert(mdr || req || fin);
- assert(!forward || mdr || req); // forward requires a request
+ ceph_assert(!forward || mdr); // forward requires a request
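+ // 'cf' builds the waiter context on demand (e.g. a C_MDS_RetryRequest when
+ // traversing on behalf of an mdr), replacing the removed _get_waiter()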
snapid_t snapid = CEPH_NOSNAP;
if (mdr)
CInode *cur = get_inode(path.get_ino());
if (cur == NULL) {
if (MDS_INO_IS_MDSDIR(path.get_ino()))
- open_foreign_mdsdir(path.get_ino(), _get_waiter(mdr, req, fin));
+ open_foreign_mdsdir(path.get_ino(), cf.build());
else {
//ceph_abort(); // hrm.. broken
return -ESTALE;
return -ESTALE;
// make sure snaprealm are open...
- if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
- !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
+ if (mdr && cur->snaprealm && !cur->snaprealm->have_past_parents_open() &&
+ !cur->snaprealm->open_parents(cf.build())) {
return 1;
}
SnapRealm *realm = cur->find_snaprealm();
snapid = realm->resolve_snapname(path[depth], cur->ino());
dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl;
- if (!snapid)
+ if (!snapid) {
+ CInode *t = cur;
+ while (t) {
+ // if snaplock isn't readable, it's possible that another mds is creating
+ // a snapshot, but the snap update message hasn't been received yet.
+ if (!t->snaplock.can_read(client)) {
+ dout(10) << " non-readable snaplock on " << *t << dendl;
+ t->snaplock.add_waiter(SimpleLock::WAIT_RD, cf.build());
+ return 1;
+ }
+ CDentry *pdn = t->get_projected_parent_dn();
+ t = pdn ? pdn->get_dir()->get_inode() : NULL;
+ }
return -ENOENT;
+ }
mdr->snapid = snapid;
depth++;
continue;
// parent dir frozen_dir?
if (cur->is_frozen()) {
dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
- cur->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
+ cur->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
return 1;
}
curdir = cur->get_or_open_dirfrag(this, fg);
} else {
// discover?
dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
- discover_path(cur, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin),
+ discover_path(cur, snapid, path.postfixpath(depth), cf.build(),
null_okay);
if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
return 1;
}
}
- assert(curdir);
+ ceph_assert(curdir);
#ifdef MDS_VERIFY_FRAGSTAT
if (curdir->is_complete())
!dn->lock.can_read(client) &&
(dnl->is_null() || forward)) {
dout(10) << "traverse: xlocked dentry at " << *dn << dendl;
- dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin));
+ dn->lock.add_waiter(SimpleLock::WAIT_RD, cf.build());
if (mds->logger) mds->logger->inc(l_mds_traverse_lock);
mds->mdlog->flush();
return 1;
return -ENOENT;
} else {
dout(10) << "miss on dentry " << *dn << ", can't read due to lock" << dendl;
- dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin));
+ dn->lock.add_waiter(SimpleLock::WAIT_RD, cf.build());
return 1;
}
}
// do we have inode?
if (!in) {
- assert(dnl->is_remote());
+ ceph_assert(dnl->is_remote());
// do i have it?
in = get_inode(dnl->get_remote_ino());
if (in) {
dn->link_remote(dnl, in);
} else {
dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
- assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal!
+ ceph_assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal!
if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
dout(4) << "traverse: remote dentry points to damaged ino "
<< *dn << dendl;
return -EIO;
}
- open_remote_dentry(dn, true, _get_waiter(mdr, req, fin),
+ open_remote_dentry(dn, true, cf.build(),
(null_okay && depth == path.depth() - 1));
if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino);
return 1;
cur = in;
// make sure snaprealm are open...
- if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
- !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
+ if (mdr && cur->snaprealm && !cur->snaprealm->have_past_parents_open() &&
+ !cur->snaprealm->open_parents(cf.build())) {
return 1;
}
if (curdir->is_complete() ||
(snapid == CEPH_NOSNAP &&
curdir->has_bloom() &&
- !curdir->is_in_bloom(path[depth]))){
+ !curdir->is_in_bloom(path[depth]))) {
// file not found
if (pdnvec) {
// instantiate a null dn?
// directory isn't complete; reload
dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
touch_inode(cur);
- curdir->fetch(_get_waiter(mdr, req, fin), path[depth]);
+ curdir->fetch(cf.build(), path[depth]);
if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
return 1;
}
mds_authority_t dauth = curdir->authority();
if (forward &&
- snapid && mdr && mdr->client_request &&
+ mdr && mdr->client_request &&
(int)depth < mdr->client_request->get_num_fwd()) {
dout(7) << "traverse: snap " << snapid << " and depth " << depth
<< " < fwd " << mdr->client_request->get_num_fwd()
if ((discover || null_okay)) {
dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
- discover_path(curdir, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin),
+ discover_path(curdir, snapid, path.postfixpath(depth), cf.build(),
null_okay);
if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
return 1;
if (curdir->is_ambiguous_auth()) {
// wait
dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
- curdir->add_waiter(CDir::WAIT_SINGLEAUTH, _get_waiter(mdr, req, fin));
+ curdir->add_waiter(CDir::WAIT_SINGLEAUTH, cf.build());
return 1;
}
dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;
-
- if (mdr)
- request_forward(mdr, dauth.first);
- else
- mds->forward_message_mds(req, dauth.first);
-
+
+ request_forward(mdr, dauth.first);
+
if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
- assert(fin == NULL);
return 2;
- }
+ }
}
-
+
ceph_abort(); // i shouldn't get here
}
if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
dout(10) << "path_traverse finish on snapid " << snapid << dendl;
if (mdr)
- assert(mdr->snapid == snapid);
+ ceph_assert(mdr->snapid == snapid);
return 0;
}
return NULL;
for (unsigned i = 0; i < fp.depth(); i++) {
- boost::string_view dname = fp[i];
+ std::string_view dname = fp[i];
frag_t fg = in->pick_dirfrag(dname);
dout(20) << " " << i << " " << dname << " frag " << fg << " from " << *in << dendl;
CDir *curdir = in->get_dirfrag(fg);
* @param approxfg approximate fragment.
* @param fin completion callback
*/
-void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSInternalContextBase *fin)
+void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSContext *fin)
{
dout(10) << "open_remote_dir on " << *diri << dendl;
- assert(diri->is_dir());
- assert(!diri->is_auth());
- assert(diri->get_dirfrag(approxfg) == 0);
+ ceph_assert(diri->is_dir());
+ ceph_assert(!diri->is_auth());
+ ceph_assert(diri->get_dirfrag(approxfg) == 0);
discover_dir_frag(diri, approxfg, fin);
}
else
dnl = dn->get_linkage();
- assert(!dnl->is_null());
+ ceph_assert(!dnl->is_null());
if (dnl->is_primary())
return dnl->inode;
- assert(dnl->is_remote());
+ ceph_assert(dnl->is_remote());
CInode *in = get_inode(dnl->get_remote_ino());
if (in) {
dout(7) << "get_dentry_inode linking in remote in " << *in << dendl;
struct C_MDC_OpenRemoteDentry : public MDCacheContext {
CDentry *dn;
inodeno_t ino;
- MDSInternalContextBase *onfinish;
+ MDSContext *onfinish;
bool want_xlocked;
- C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSInternalContextBase *f, bool wx) :
+ C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSContext *f, bool wx) :
MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) {
dn->get(MDSCacheObject::PIN_PTRWAITER);
}
}
};
-void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSInternalContextBase *fin, bool want_xlocked)
+void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin, bool want_xlocked)
{
dout(10) << "open_remote_dentry " << *dn << dendl;
CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace
}
-void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSInternalContextBase *fin,
+void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin,
bool want_xlocked, int r)
{
if (r < 0) {
if (dir) {
dir->get_inode()->make_path_string(path);
path += "/";
- path += std::string(dn->get_name());
+ path += dn->get_name();
}
bool fatal = mds->damage_table.notify_remote_damaged(ino, path);
return;
CInode *parent = in->get_parent_inode();
- assert(parent);
+ ceph_assert(parent);
make_trace(trace, parent);
CDentry *dn = in->get_parent_dn();
struct C_MDC_OpenInoTraverseDir : public MDCacheContext {
inodeno_t ino;
- MMDSOpenIno *msg;
+ MMDSOpenIno::const_ref msg;
bool parent;
public:
- C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, MMDSOpenIno *m, bool p) :
+ C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, const MMDSOpenIno::const_ref &m, bool p) :
MDCacheContext(c), ino(i), msg(m), parent(p) {}
void finish(int r) override {
if (r < 0 && !parent)
mdcache->handle_open_ino(msg, r);
return;
}
- assert(mdcache->opening_inodes.count(ino));
- mdcache->_open_ino_traverse_dir(ino, mdcache->opening_inodes[ino], r);
+ auto& info = mdcache->opening_inodes.at(ino);
+ mdcache->_open_ino_traverse_dir(ino, info, r);
}
};
{
dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;
- assert(opening_inodes.count(ino));
- open_ino_info_t& info = opening_inodes[ino];
+ open_ino_info_t& info = opening_inodes.at(ino);
CInode *in = get_inode(ino);
if (in) {
inode_backtrace_t backtrace;
if (err == 0) {
try {
- ::decode(backtrace, bl);
+ decode(backtrace, bl);
} catch (const buffer::error &decode_exc) {
derr << "corrupt backtrace on ino x0" << std::hex << ino
<< std::dec << ": " << decode_exc << dendl;
{
dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;
- assert(opening_inodes.count(ino));
- open_ino_info_t& info = opening_inodes[ino];
+ open_ino_info_t& info = opening_inodes.at(ino);
CInode *in = get_inode(ino);
if (in) {
do_open_ino(ino, info, ret);
}
-void MDCache::_open_ino_fetch_dir(inodeno_t ino, MMDSOpenIno *m, CDir *dir, bool parent)
+void MDCache::_open_ino_fetch_dir(inodeno_t ino, const MMDSOpenIno::const_ref &m, CDir *dir, bool parent)
{
if (dir->state_test(CDir::STATE_REJOINUNDEF))
- assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
+ ceph_assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
dir->fetch(new C_MDC_OpenInoTraverseDir(this, ino, m, parent));
+ if (mds->logger)
+ mds->logger->inc(l_mds_openino_dir_fetch);
}
-int MDCache::open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m,
- vector<inode_backpointer_t>& ancestors,
+int MDCache::open_ino_traverse_dir(inodeno_t ino, const MMDSOpenIno::const_ref &m,
+ const vector<inode_backpointer_t>& ancestors,
bool discover, bool want_xlocked, mds_rank_t *hint)
{
dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl;
int err = 0;
for (unsigned i = 0; i < ancestors.size(); i++) {
- CInode *diri = get_inode(ancestors[i].dirino);
+ const auto& ancestor = ancestors.at(i);
+ CInode *diri = get_inode(ancestor.dirino);
if (!diri) {
- if (discover && MDS_INO_IS_MDSDIR(ancestors[i].dirino)) {
- open_foreign_mdsdir(ancestors[i].dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
+ if (discover && MDS_INO_IS_MDSDIR(ancestor.dirino)) {
+ open_foreign_mdsdir(ancestor.dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
return 1;
}
continue;
break;
}
- string &name = ancestors[i].dname;
+ const string& name = ancestor.dname;
frag_t fg = diri->pick_dirfrag(name);
CDir *dir = diri->get_dirfrag(fg);
if (!dir) {
}
}
if (dir) {
- inodeno_t next_ino = i > 0 ? ancestors[i - 1].dirino : ino;
+ inodeno_t next_ino = i > 0 ? ancestors.at(i-1).dirino : ino;
CDentry *dn = dir->lookup(name);
CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
if (dir->is_auth()) {
{
dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;
- list<MDSInternalContextBase*> waiters;
+ MDSContext::vec waiters;
waiters.swap(info.waiters);
opening_inodes.erase(ino);
finish_contexts(g_ceph_context, waiters, ret);
fetch_backtrace(ino, info.pool, fin->bl,
new C_OnFinisher(fin, mds->finisher));
} else {
- assert(!info.ancestors.empty());
+ ceph_assert(!info.ancestors.empty());
info.checking = mds->get_nodeid();
open_ino(info.ancestors[0].dirino, mds->mdsmap->get_metadata_pool(),
new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
dout(10) << "do_open_ino_peer " << ino << " active " << active
<< " all " << all << " checked " << info.checked << dendl;
+ mds_rank_t whoami = mds->get_nodeid();
mds_rank_t peer = MDS_RANK_NONE;
- if (info.auth_hint >= 0) {
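+ // never treat ourselves as a peer; a hint pointing at this rank is useless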
+ if (info.auth_hint >= 0 && info.auth_hint != whoami) {
if (active.count(info.auth_hint)) {
peer = info.auth_hint;
info.auth_hint = MDS_RANK_NONE;
}
} else {
for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
- if (*p != mds->get_nodeid() && info.checked.count(*p) == 0) {
+ if (*p != whoami && info.checked.count(*p) == 0) {
peer = *p;
break;
}
}
if (peer < 0) {
- all.erase(mds->get_nodeid());
+ all.erase(whoami);
if (all != info.checked) {
dout(10) << " waiting for more peers to be active" << dendl;
} else {
// got backtrace from peer or backtrace just fetched
if (info.discover || !info.fetch_backtrace)
pa = &info.ancestors;
- mds->send_message_mds(new MMDSOpenIno(info.tid, ino, pa), peer);
+ mds->send_message_mds(MMDSOpenIno::create(info.tid, ino, pa), peer);
+ if (mds->logger)
+ mds->logger->inc(l_mds_openino_peer_discover);
}
}
-void MDCache::handle_open_ino(MMDSOpenIno *m, int err)
+void MDCache::handle_open_ino(const MMDSOpenIno::const_ref &m, int err)
{
if (mds->get_state() < MDSMap::STATE_REJOIN &&
mds->get_want_state() != CEPH_MDS_STATE_REJOIN) {
- m->put();
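+ // no put() needed: m is a const_ref whose lifetime is reference-counted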
return;
}
dout(10) << "handle_open_ino " << *m << " err " << err << dendl;
+ auto from = mds_rank_t(m->get_source().num());
inodeno_t ino = m->ino;
- MMDSOpenInoReply *reply;
+ MMDSOpenInoReply::ref reply;
CInode *in = get_inode(ino);
if (in) {
dout(10) << " have " << *in << dendl;
- reply = new MMDSOpenInoReply(m->get_tid(), ino, mds_rank_t(0));
+ reply = MMDSOpenInoReply::create(m->get_tid(), ino, mds_rank_t(0));
if (in->is_auth()) {
touch_inode(in);
while (1) {
reply->hint = in->authority().first;
}
} else if (err < 0) {
- reply = new MMDSOpenInoReply(m->get_tid(), ino, MDS_RANK_NONE, err);
+ reply = MMDSOpenInoReply::create(m->get_tid(), ino, MDS_RANK_NONE, err);
} else {
mds_rank_t hint = MDS_RANK_NONE;
int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint);
if (ret > 0)
return;
- reply = new MMDSOpenInoReply(m->get_tid(), ino, hint, ret);
+ reply = MMDSOpenInoReply::create(m->get_tid(), ino, hint, ret);
}
- m->get_connection()->send_message(reply);
- m->put();
+ mds->send_message_mds(reply, from);
}
-void MDCache::handle_open_ino_reply(MMDSOpenInoReply *m)
+void MDCache::handle_open_ino_reply(const MMDSOpenInoReply::const_ref &m)
{
dout(10) << "handle_open_ino_reply " << *m << dendl;
dout(10) << " found ino " << ino << " on mds." << from << dendl;
if (!info.want_replica) {
open_ino_finish(ino, info, from);
- m->put();
return;
}
do_open_ino_peer(ino, info);
}
}
- m->put();
}
void MDCache::kick_open_ino_peers(mds_rank_t who)
}
}
-void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSInternalContextBase* fin,
+void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSContext* fin,
bool want_replica, bool want_xlocked)
{
dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
<< want_replica << dendl;
- if (opening_inodes.count(ino)) {
- open_ino_info_t& info = opening_inodes[ino];
+ auto it = opening_inodes.find(ino);
+ if (it != opening_inodes.end()) {
+ open_ino_info_t& info = it->second;
if (want_replica) {
info.want_replica = true;
if (want_xlocked && !info.want_xlocked) {
info.tid = ++open_ino_last_tid;
info.pool = pool >= 0 ? pool : default_file_layout.pool_id;
info.waiters.push_back(fin);
- do_open_ino(ino, info, 0);
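+ // during rejoin the open file table may already record this inode's ancestry; use it and skip the backtrace fetch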
+ if (mds->is_rejoin() &&
+ open_file_table.get_ancestors(ino, info.ancestors, info.auth_hint)) {
+ info.fetch_backtrace = false;
+ info.checking = mds->get_nodeid();
+ _open_ino_traverse_dir(ino, info, 0);
+ } else {
+ do_open_ino(ino, info, 0);
+ }
}
}
- traverse path
*/
-void MDCache::find_ino_peers(inodeno_t ino, MDSInternalContextBase *c, mds_rank_t hint)
+void MDCache::find_ino_peers(inodeno_t ino, MDSContext *c, mds_rank_t hint)
{
dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl;
CInode *in = get_inode(ino);
c->complete(-ESTALE);
return;
}
- assert(!in);
+ ceph_assert(!in);
ceph_tid_t tid = ++find_ino_peer_last_tid;
find_ino_peer_info_t& fip = find_ino_peer[tid];
}
} else {
fip.checking = m;
- mds->send_message_mds(new MMDSFindIno(fip.tid, fip.ino), m);
+ mds->send_message_mds(MMDSFindIno::create(fip.tid, fip.ino), m);
}
}
-void MDCache::handle_find_ino(MMDSFindIno *m)
+void MDCache::handle_find_ino(const MMDSFindIno::const_ref &m)
{
if (mds->get_state() < MDSMap::STATE_REJOIN) {
- m->put();
return;
}
dout(10) << "handle_find_ino " << *m << dendl;
- MMDSFindInoReply *r = new MMDSFindInoReply(m->tid);
+ auto r = MMDSFindInoReply::create(m->tid);
CInode *in = get_inode(m->ino);
if (in) {
in->make_path(r->path);
dout(10) << " have " << r->path << " " << *in << dendl;
}
- m->get_connection()->send_message(r);
- m->put();
+ mds->send_message_mds(r, mds_rank_t(m->get_source().num()));
}
-void MDCache::handle_find_ino_reply(MMDSFindInoReply *m)
+void MDCache::handle_find_ino_reply(const MMDSFindInoReply::const_ref &m)
{
map<ceph_tid_t, find_ino_peer_info_t>::iterator p = find_ino_peer.find(m->tid);
if (p != find_ino_peer.end()) {
dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl;
mds->queue_waiter(fip.fin);
find_ino_peer.erase(p);
- m->put();
return;
}
if (!m->path.empty()) {
// we got a path!
vector<CDentry*> trace;
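+ // the factory builds contexts that re-dispatch this message if the traverse has to wait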
+ CF_MDS_RetryMessageFactory cf(mds, m);
MDRequestRef null_ref;
- int r = path_traverse(null_ref, m, NULL, m->path, &trace, NULL, MDS_TRAVERSE_DISCOVER);
+ int r = path_traverse(null_ref, cf, m->path, &trace, NULL, MDS_TRAVERSE_DISCOVER);
if (r > 0)
return;
dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path
} else {
dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl;
}
- m->put();
}
void MDCache::kick_find_ino_peers(mds_rank_t who)
return count;
}
-/* This function takes over the reference to the passed Message */
-MDRequestRef MDCache::request_start(MClientRequest *req)
+MDRequestRef MDCache::request_start(const MClientRequest::const_ref& req)
{
// did we win a forward race against a slave?
if (active_requests.count(req->get_reqid())) {
MDRequestRef& mdr = active_requests[req->get_reqid()];
- assert(mdr);
+ ceph_assert(mdr);
if (mdr->is_slave()) {
dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl;
mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req));
} else {
dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
- req->put();
}
return MDRequestRef();
}
params.dispatched = req->get_dispatch_stamp();
MDRequestRef mdr =
- mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
+ mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(¶ms);
active_requests[params.reqid] = mdr;
mdr->set_op_stamp(req->get_stamp());
dout(7) << "request_start " << *mdr << dendl;
return mdr;
}
-MDRequestRef MDCache::request_start_slave(metareqid_t ri, __u32 attempt, Message *m)
+MDRequestRef MDCache::request_start_slave(metareqid_t ri, __u32 attempt, const Message::const_ref &m)
{
int by = m->get_source().num();
MDRequestImpl::Params params;
params.all_read = m->get_recv_complete_stamp();
params.dispatched = m->get_dispatch_stamp();
MDRequestRef mdr =
- mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
- assert(active_requests.count(mdr->reqid) == 0);
+ mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(¶ms);
+ ceph_assert(active_requests.count(mdr->reqid) == 0);
active_requests[mdr->reqid] = mdr;
dout(7) << "request_start_slave " << *mdr << " by mds." << by << dendl;
return mdr;
params.dispatched = now;
params.internal_op = op;
MDRequestRef mdr =
- mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
+ mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(¶ms);
- assert(active_requests.count(mdr->reqid) == 0);
+ ceph_assert(active_requests.count(mdr->reqid) == 0);
active_requests[mdr->reqid] = mdr;
dout(7) << "request_start_internal " << *mdr << " op " << op << dendl;
return mdr;
MDRequestRef MDCache::request_get(metareqid_t rid)
{
ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid);
- assert(p != active_requests.end());
+ ceph_assert(p != active_requests.end());
dout(7) << "request_get " << rid << " " << *p->second << dendl;
return p->second;
}
case CEPH_MDS_OP_REPAIR_INODESTATS:
repair_inode_stats_work(mdr);
break;
+ case CEPH_MDS_OP_UPGRADE_SNAPREALM:
+ upgrade_inode_snaprealm_work(mdr);
+ break;
default:
ceph_abort();
}
for (set<mds_rank_t>::iterator p = mdr->more()->slaves.begin();
p != mdr->more()->slaves.end();
++p) {
- MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
+ auto r = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt,
MMDSSlaveRequest::OP_FINISH);
if (mdr->killed && !mdr->committing) {
* implicitly. Note that we don't call the finishers -- there shouldn't
* be any on a remote lock and the request finish wakes up all
* the waiters anyway! */
- set<SimpleLock*>::iterator p = mdr->xlocks.begin();
- while (p != mdr->xlocks.end()) {
- if ((*p)->get_parent()->is_auth())
- ++p;
- else {
- dout(10) << "request_drop_foreign_locks forgetting lock " << **p
- << " on " << *(*p)->get_parent() << dendl;
- (*p)->put_xlock();
- mdr->locks.erase(*p);
- mdr->xlocks.erase(p++);
- }
- }
- map<SimpleLock*, mds_rank_t>::iterator q = mdr->remote_wrlocks.begin();
- while (q != mdr->remote_wrlocks.end()) {
- dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *q->first
- << " on mds." << q->second
- << " on " << *(q->first)->get_parent() << dendl;
- mdr->locks.erase(q->first);
- mdr->remote_wrlocks.erase(q++);
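+ // xlocks, wrlocks and remote wrlocks now share a single container; one pass drops the foreign ones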
+ for (auto it = mdr->locks.begin(); it != mdr->locks.end(); ) {
+ SimpleLock *lock = it->lock;
+ if (it->is_xlock() && !lock->get_parent()->is_auth()) {
+ dout(10) << "request_drop_foreign_locks forgetting lock " << *lock
+ << " on " << *lock->get_parent() << dendl;
+ lock->put_xlock();
+ mdr->locks.erase(it++);
+ } else if (it->is_remote_wrlock()) {
+ dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *lock
+ << " on mds." << it->wrlock_target << " on " << *lock->get_parent() << dendl;
+ if (it->is_wrlock()) {
+ it->clear_remote_wrlock();
+ ++it;
+ } else {
+ mdr->locks.erase(it++);
+ }
+ } else {
+ ++it;
+ }
}
mdr->more()->slaves.clear(); /* we no longer have requests out to them, and
mdr->drop_local_auth_pins();
// drop stickydirs
- for (set<CInode*>::iterator p = mdr->stickydirs.begin();
- p != mdr->stickydirs.end();
- ++p)
- (*p)->put_stickydirs();
+ mdr->put_stickydirs();
mds->locker->kick_cap_releases(mdr);
if (mdr->has_more() &&
(!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_slave.empty())) {
if (!mdr->done_locking) {
- assert(mdr->more()->witnessed.empty());
+ ceph_assert(mdr->more()->witnessed.empty());
mdr->aborted = true;
dout(10) << "request_kill " << *mdr << " -- waiting for slave reply, delaying" << dendl;
} else {
dout(10) << "request_kill " << *mdr << " -- already started slave prep, no-op" << dendl;
}
- assert(mdr->used_prealloc_ino == 0);
- assert(mdr->prealloc_inos.empty());
+ ceph_assert(mdr->used_prealloc_ino == 0);
+ ceph_assert(mdr->prealloc_inos.empty());
mdr->session = NULL;
mdr->item_session_request.remove_myself();
// -------------------------------------------------------------------------------
// SNAPREALMS
-struct C_MDC_snaprealm_create_finish : public MDCacheLogContext {
- MDRequestRef mdr;
- MutationRef mut;
- CInode *in;
- C_MDC_snaprealm_create_finish(MDCache *c, MDRequestRef& m,
- MutationRef& mu, CInode *i) :
- MDCacheLogContext(c), mdr(m), mut(mu), in(i) {}
- void finish(int r) override {
- mdcache->_snaprealm_create_finish(mdr, mut, in);
- }
-};
-
-void MDCache::snaprealm_create(MDRequestRef& mdr, CInode *in)
+void MDCache::create_global_snaprealm()
{
- dout(10) << "snaprealm_create " << *in << dendl;
- assert(!in->snaprealm);
-
- // allocate an id..
- if (!mdr->more()->stid) {
- mds->snapclient->prepare_create_realm(in->ino(), &mdr->more()->stid, &mdr->more()->snapidbl,
- new C_MDS_RetryRequest(this, mdr));
- return;
- }
-
- MutationRef mut(new MutationImpl());
- mut->ls = mds->mdlog->get_current_segment();
- EUpdate *le = new EUpdate(mds->mdlog, "snaprealm_create");
- mds->mdlog->start_entry(le);
-
- le->metablob.add_table_transaction(TABLE_SNAP, mdr->more()->stid);
-
- auto &pi = in->project_inode(false, true);
- pi.inode.version = in->pre_dirty();
- pi.inode.rstat.rsnaprealms++;
-
- bufferlist::iterator p = mdr->more()->snapidbl.begin();
- snapid_t seq;
- ::decode(seq, p);
-
- auto &newsnap = *pi.snapnode;
- newsnap.created = seq;
- newsnap.seq = seq;
- newsnap.last_created = seq;
-
- predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
- journal_cow_inode(mut, &le->metablob, in);
- le->metablob.add_primary_dentry(in->get_projected_parent_dn(), in, true);
-
- mds->server->submit_mdlog_entry(le,
- new C_MDC_snaprealm_create_finish(this, mdr,
- mut, in),
- mdr, __func__);
- mds->mdlog->flush();
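+ // the global snaprealm hangs off an unlinked, in-memory system inode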
+ CInode *in = new CInode(this); // dummy inode
+ create_unlinked_system_inode(in, MDS_INO_GLOBAL_SNAPREALM, S_IFDIR|0755);
+ add_inode(in);
+ global_snaprealm = in->snaprealm;
}
-
-void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool nosend)
+void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients)
{
dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl;
vector<inodeno_t> split_inos;
vector<inodeno_t> split_realms;
- if (snapop == CEPH_SNAP_OP_SPLIT) {
- // notify clients of update|split
- for (elist<CInode*>::iterator p = in->snaprealm->inodes_with_caps.begin(member_offset(CInode, item_caps));
- !p.end(); ++p)
- split_inos.push_back((*p)->ino());
-
- for (set<SnapRealm*>::iterator p = in->snaprealm->open_children.begin();
- p != in->snaprealm->open_children.end();
- ++p)
- split_realms.push_back((*p)->inode->ino());
- }
+ if (notify_clients) {
+ ceph_assert(in->snaprealm->have_past_parents_open());
+ if (snapop == CEPH_SNAP_OP_SPLIT) {
+ // notify clients of update|split
+ for (elist<CInode*>::iterator p = in->snaprealm->inodes_with_caps.begin(member_offset(CInode, item_caps));
+ !p.end(); ++p)
+ split_inos.push_back((*p)->ino());
- bufferlist snapbl;
- in->snaprealm->build_snap_trace(snapbl);
+ for (set<SnapRealm*>::iterator p = in->snaprealm->open_children.begin();
+ p != in->snaprealm->open_children.end();
+ ++p)
+ split_realms.push_back((*p)->inode->ino());
+ }
+ }
set<SnapRealm*> past_children;
- map<client_t, MClientSnap*> updates;
+ map<client_t, MClientSnap::ref> updates;
list<SnapRealm*> q;
q.push_back(in->snaprealm);
while (!q.empty()) {
dout(10) << " realm " << *realm << " on " << *realm->inode << dendl;
realm->invalidate_cached_snaps();
- for (map<client_t, xlist<Capability*>* >::iterator p = realm->client_caps.begin();
- p != realm->client_caps.end();
- ++p) {
- assert(!p->second->empty());
- if (!nosend && updates.count(p->first) == 0) {
- MClientSnap *update = new MClientSnap(snapop);
- update->head.split = in->ino();
- update->split_inos = split_inos;
- update->split_realms = split_realms;
- update->bl = snapbl;
- updates[p->first] = update;
+ if (notify_clients) {
+ for (const auto& p : realm->client_caps) {
+ const auto& client = p.first;
+ const auto& caps = p.second;
+ ceph_assert(!caps->empty());
+
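+ // build at most one snap update per client, even if it holds caps in several affected realms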
+ auto em = updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple());
+ if (em.second) {
+ auto update = MClientSnap::create(CEPH_SNAP_OP_SPLIT);
+ update->head.split = in->ino();
+ update->split_inos = split_inos;
+ update->split_realms = split_realms;
+ update->bl = in->snaprealm->get_snap_trace();
+ em.first->second = std::move(update);
+ }
}
}
q.push_back(*p);
}
- if (!nosend)
+ if (notify_clients)
send_snaps(updates);
// notify past children and their descendants if we update/delete old snapshots
}
}
-void MDCache::_snaprealm_create_finish(MDRequestRef& mdr, MutationRef& mut, CInode *in)
+void MDCache::send_snap_update(CInode *in, version_t stid, int snap_op)
{
- dout(10) << "_snaprealm_create_finish " << *in << dendl;
+ dout(10) << __func__ << " " << *in << " stid " << stid << dendl;
+ ceph_assert(in->is_auth());
- // apply
- in->pop_and_dirty_projected_inode(mut->ls);
- mut->apply();
- mds->locker->drop_locks(mut.get());
- mut->cleanup();
+ set<mds_rank_t> mds_set;
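+ // stid > 0 follows a snaptable commit: notify every other mds in resolve or later; otherwise just this inode's replicas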
+ if (stid > 0) {
+ mds->mdsmap->get_mds_set_lower_bound(mds_set, MDSMap::STATE_RESOLVE);
+ mds_set.erase(mds->get_nodeid());
+ } else {
+ in->list_replicas(mds_set);
+ }
- // tell table we've committed
- mds->snapclient->commit(mdr->more()->stid, mut->ls);
+ if (!mds_set.empty()) {
+ bufferlist snap_blob;
+ in->encode_snap(snap_blob);
- // create
- bufferlist::iterator p = mdr->more()->snapidbl.begin();
- snapid_t seq;
- ::decode(seq, p);
+ for (auto p : mds_set) {
+ auto m = MMDSSnapUpdate::create(in->ino(), stid, snap_op);
+ m->snap_blob = snap_blob;
+ mds->send_message_mds(m, p);
+ }
+ }
- in->open_snaprealm();
- in->snaprealm->srnode.seq = seq;
- in->snaprealm->srnode.created = seq;
- bool ok = in->snaprealm->_open_parents(NULL);
- assert(ok);
+ if (stid > 0)
+ notify_global_snaprealm_update(snap_op);
+}
- do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT);
+void MDCache::handle_snap_update(const MMDSSnapUpdate::const_ref &m)
+{
+ mds_rank_t from = mds_rank_t(m->get_source().num());
+ dout(10) << __func__ << " " << *m << " from mds." << from << dendl;
- /*
- static int count = 5;
- if (--count == 0)
- ceph_abort(); // hack test test **********
- */
+ if (mds->get_state() < MDSMap::STATE_RESOLVE &&
+ mds->get_want_state() != CEPH_MDS_STATE_RESOLVE) {
+ return;
+ }
- // done.
- mdr->more()->stid = 0; // caller will likely need to reuse this
- dispatch_request(mdr);
+ // null rejoin_done means open_snaprealms() has already been called
+ bool notify_clients = mds->get_state() > MDSMap::STATE_REJOIN ||
+ (mds->is_rejoin() && !rejoin_done);
+
+ if (m->get_tid() > 0) {
+ mds->snapclient->notify_commit(m->get_tid());
+ if (notify_clients)
+ notify_global_snaprealm_update(m->get_snap_op());
+ }
+
+ CInode *in = get_inode(m->get_ino());
+ if (in) {
+ ceph_assert(!in->is_auth());
+ if (mds->get_state() > MDSMap::STATE_REJOIN ||
+ (mds->is_rejoin() && !in->is_rejoining())) {
+ auto p = m->snap_blob.cbegin();
+ in->decode_snap(p);
+
+ if (!notify_clients) {
+ if (!rejoin_pending_snaprealms.count(in)) {
+ in->get(CInode::PIN_OPENINGSNAPPARENTS);
+ rejoin_pending_snaprealms.insert(in);
+ }
+ }
+ do_realm_invalidate_and_update_notify(in, m->get_snap_op(), notify_clients);
+ }
+ }
}
+void MDCache::notify_global_snaprealm_update(int snap_op)
+{
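+ // clients only need to distinguish DESTROY; report every other op as a plain UPDATE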
+ if (snap_op != CEPH_SNAP_OP_DESTROY)
+ snap_op = CEPH_SNAP_OP_UPDATE;
+ set<Session*> sessions;
+ mds->sessionmap.get_client_session_set(sessions);
+ for (auto &session : sessions) {
+ if (!session->is_open() && !session->is_stale())
+ continue;
+ auto update = MClientSnap::create(snap_op);
+ update->head.split = global_snaprealm->inode->ino();
+ update->bl = global_snaprealm->get_snap_trace();
+ mds->send_message_client_counted(update, session);
+ }
+}
// -------------------------------------------------------------------------------
// STRAYS
{
object_t oid = CInode::get_object_name(ino, frag_t(), "");
mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
+ if (mds->logger)
+ mds->logger->inc(l_mds_openino_backtrace_fetch);
}
void MDCache::_send_discover(discover_info_t& d)
{
- MDiscover *dis = new MDiscover(d.ino, d.frag, d.snap, d.want_path,
- d.want_base_dir, d.want_xlocked);
+ auto dis = MDiscover::create(d.ino, d.frag, d.snap, d.want_path, d.want_base_dir, d.want_xlocked);
dis->set_tid(d.tid);
mds->send_message_mds(dis, d.mds);
}
void MDCache::discover_base_ino(inodeno_t want_ino,
- MDSInternalContextBase *onfinish,
+ MDSContext *onfinish,
mds_rank_t from)
{
dout(7) << "discover_base_ino " << want_ino << " from mds." << from << dendl;
void MDCache::discover_dir_frag(CInode *base,
frag_t approx_fg,
- MDSInternalContextBase *onfinish,
+ MDSContext *onfinish,
mds_rank_t from)
{
if (from < 0)
void MDCache::discover_path(CInode *base,
snapid_t snap,
filepath want_path,
- MDSInternalContextBase *onfinish,
+ MDSContext *onfinish,
bool want_xlocked,
mds_rank_t from)
{
base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish);
return;
} else if (from == mds->get_nodeid()) {
- list<MDSInternalContextBase*> finished;
+ MDSContext::vec finished;
base->take_waiting(CInode::WAIT_DIR, finished);
mds->queue_waiters(finished);
return;
void MDCache::discover_path(CDir *base,
snapid_t snap,
filepath want_path,
- MDSInternalContextBase *onfinish,
+ MDSContext *onfinish,
bool want_xlocked)
{
mds_rank_t from = base->authority().first;
base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish);
return;
} else if (from == mds->get_nodeid()) {
- list<MDSInternalContextBase*> finished;
+ MDSContext::vec finished;
base->take_sub_waiting(finished);
mds->queue_waiters(finished);
return;
}
-/* This function DOES put the passed message before returning */
-void MDCache::handle_discover(MDiscover *dis)
+void MDCache::handle_discover(const MDiscover::const_ref &dis)
{
mds_rank_t whoami = mds->get_nodeid();
mds_rank_t from = mds_rank_t(dis->get_source().num());
- assert(from != whoami);
+ ceph_assert(from != whoami);
if (mds->get_state() <= MDSMap::STATE_REJOIN) {
if (mds->get_state() < MDSMap::STATE_REJOIN &&
mds->get_want_state() < CEPH_MDS_STATE_REJOIN) {
- dis->put();
return;
}
CInode *cur = 0;
- MDiscoverReply *reply = new MDiscoverReply(dis);
+ auto reply = MDiscoverReply::create(*dis);
snapid_t snapid = dis->get_snapid();
<< dendl;
cur = get_inode(dis->get_base_ino());
- assert(cur);
+ ceph_assert(cur);
// add root
reply->starts_with = MDiscoverReply::INODE;
}
}
- assert(reply);
+ ceph_assert(reply);
// add content
// do some fidgeting to include a dir if they asked for the base dir, or just root.
fg = cur->pick_dirfrag(dis->get_dentry(i));
} else {
// requester explicitly specified the frag
- assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino()));
+ ceph_assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino()));
fg = dis->get_base_dir_frag();
if (!cur->dirfragtree.is_leaf(fg))
fg = cur->dirfragtree[fg.value()];
}
dout(7) << *cur << " is frozen, empty reply, waiting" << dendl;
cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
- reply->put();
return;
}
curdir = cur->get_or_open_dirfrag(this, fg);
}
dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl;
curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
- reply->put();
return;
}
// original request was sent.
reply->set_base_dir_frag(curdir->get_frag());
} else {
- assert(!curdir->is_ambiguous_auth()); // would be frozen.
+ ceph_assert(!curdir->is_ambiguous_auth()); // would be frozen.
if (!reply->trace.length())
reply->starts_with = MDiscoverReply::DIR;
replicate_dir(curdir, from, reply->trace);
CDentry *dn = 0;
if (curdir->get_version() == 0) {
// fetch newly opened dir
- assert(!curdir->has_bloom());
+ ceph_assert(!curdir->has_bloom());
} else if (dis->get_want().depth() > 0) {
// lookup dentry
dn = curdir->lookup(dis->get_dentry(i), snapid);
// incomplete dir?
if (!dn) {
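+ // the bloom filter only covers head dentries, so it can prove absence only for CEPH_NOSNAP lookups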
if (!curdir->is_complete() &&
- (!curdir->has_bloom() || curdir->is_in_bloom(dis->get_dentry(i)))) {
+ !(snapid == CEPH_NOSNAP &&
+ curdir->has_bloom() &&
+ !curdir->is_in_bloom(dis->get_dentry(i)))) {
// readdir
dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl;
if (reply->is_empty()) {
// fetch and wait
curdir->fetch(new C_MDS_RetryMessage(mds, dis),
dis->wants_base_dir() && curdir->get_version() == 0);
- reply->put();
return;
} else {
// initiate fetch, but send what we have so far
}
}
+ if (snapid != CEPH_NOSNAP && !reply->is_empty()) {
+ dout(7) << "dentry " << dis->get_dentry(i) << " snap " << snapid
+ << " dne, non-empty reply, stopping" << dendl;
+ break;
+ }
+
// send null dentry
dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in "
<< *curdir << dendl;
- dn = curdir->add_null_dentry(dis->get_dentry(i));
+ if (snapid == CEPH_NOSNAP)
+ dn = curdir->add_null_dentry(dis->get_dentry(i));
+ else
+ dn = curdir->add_null_dentry(dis->get_dentry(i), snapid, snapid);
}
- assert(dn);
+ ceph_assert(dn);
// don't add replica to purging dentry/inode
if (dn->state_test(CDentry::STATE_PURGING)) {
} else if (reply->is_empty()) {
dout(7) << "handle_discover blocking on xlocked " << *dn << dendl;
dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis));
- reply->put();
return;
} else {
dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl;
} else if (reply->is_empty()) {
dout(7) << *dnl->get_inode() << " is frozen, empty reply, waiting" << dendl;
dnl->get_inode()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
- reply->put();
return;
} else {
dout(7) << *dnl->get_inode() << " is frozen, non-empty reply, stopping" << dendl;
// add inode
CInode *next = dnl->get_inode();
- assert(next->is_auth());
+ ceph_assert(next->is_auth());
replicate_inode(next, from, reply->trace, mds->mdsmap->get_up_features());
dout(7) << "handle_discover added inode " << *next << dendl;
}
// how did we do?
- assert(!reply->is_empty());
+ ceph_assert(!reply->is_empty());
dout(7) << "handle_discover sending result back to asker mds." << from << dendl;
mds->send_message(reply, dis->get_connection());
-
- dis->put();
}
-/* This function DOES put the passed message before returning */
-void MDCache::handle_discover_reply(MDiscoverReply *m)
+void MDCache::handle_discover_reply(const MDiscoverReply::const_ref &m)
{
/*
if (mds->get_state() < MDSMap::STATE_ACTIVE) {
dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
- m->put();
return;
}
*/
if (m->is_flag_error_dn())
dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;
- list<MDSInternalContextBase*> finished, error;
+ MDSContext::vec finished, error;
mds_rank_t from = mds_rank_t(m->get_source().num());
// starting point
CInode *cur = get_inode(m->get_base_ino());
- bufferlist::iterator p = m->trace.begin();
+ auto p = m->trace.cbegin();
int next = m->starts_with;
if (!p.end() && next == MDiscoverReply::INODE) {
cur = add_replica_inode(p, NULL, finished);
dout(7) << "discover_reply got base inode " << *cur << dendl;
- assert(cur->is_base());
+ ceph_assert(cur->is_base());
next = MDiscoverReply::DIR;
waiting_for_base_ino[from].erase(cur->ino());
}
}
- assert(cur);
+ ceph_assert(cur);
// loop over discover results.
// indexes follow each ([[dir] dentry] inode)
if (next == MDiscoverReply::DIR) {
curdir = add_replica_dir(p, cur, mds_rank_t(m->get_source().num()), finished);
if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) {
- assert(m->get_wanted_base_dir());
+ ceph_assert(m->get_wanted_base_dir());
cur->take_dir_waiting(m->get_base_dir_frag(), finished);
}
} else {
// waiters
finish_contexts(g_ceph_context, error, -ENOENT); // finish errors directly
mds->queue_waiters(finished);
-
- // done
- m->put();
}
void MDCache::replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl)
{
dirfrag_t df = dir->dirfrag();
- ::encode(df, bl);
+ encode(df, bl);
dir->encode_replica(to, bl);
}
void MDCache::replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl)
{
- ::encode(dn->get_name(), bl);
- ::encode(dn->last, bl);
+ encode(dn->get_name(), bl);
+ encode(dn->last, bl);
dn->encode_replica(to, bl, mds->get_state() < MDSMap::STATE_ACTIVE);
}
void MDCache::replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl,
uint64_t features)
{
- ::encode(in->inode.ino, bl); // bleh, minor assymetry here
- ::encode(in->last, bl);
+ encode(in->inode.ino, bl); // bleh, minor asymmetry here
+ encode(in->last, bl);
in->encode_replica(to, bl, features, mds->get_state() < MDSMap::STATE_ACTIVE);
}
-CDir *MDCache::add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from,
- list<MDSInternalContextBase*>& finished)
+CDir *MDCache::add_replica_dir(bufferlist::const_iterator& p, CInode *diri, mds_rank_t from,
+ MDSContext::vec& finished)
{
dirfrag_t df;
- ::decode(df, p);
+ decode(df, p);
- assert(diri->ino() == df.ino);
+ ceph_assert(diri->ino() == df.ino);
// add it (_replica_)
CDir *dir = diri->get_dirfrag(df.frag);
return dir;
}
-CDentry *MDCache::add_replica_dentry(bufferlist::iterator& p, CDir *dir, list<MDSInternalContextBase*>& finished)
+CDentry *MDCache::add_replica_dentry(bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished)
{
string name;
snapid_t last;
- ::decode(name, p);
- ::decode(last, p);
+ decode(name, p);
+ decode(last, p);
CDentry *dn = dir->lookup(name, last);
return dn;
}
-CInode *MDCache::add_replica_inode(bufferlist::iterator& p, CDentry *dn, list<MDSInternalContextBase*>& finished)
+CInode *MDCache::add_replica_inode(bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished)
{
inodeno_t ino;
snapid_t last;
- ::decode(ino, p);
- ::decode(last, p);
+ decode(ino, p);
+ decode(last, p);
CInode *in = get_inode(ino, last);
if (!in) {
in = new CInode(this, false, 1, last);
in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET;
dout(10) << "add_replica_inode added " << *in << dendl;
if (dn) {
- assert(dn->get_linkage()->is_null());
+ ceph_assert(dn->get_linkage()->is_null());
dn->dir->link_primary_inode(dn, in);
}
} else {
replicate_dentry(straydn, who, bl);
}
-CDentry *MDCache::add_replica_stray(bufferlist &bl, mds_rank_t from)
+CDentry *MDCache::add_replica_stray(const bufferlist &bl, mds_rank_t from)
{
- list<MDSInternalContextBase*> finished;
- bufferlist::iterator p = bl.begin();
+ MDSContext::vec finished;
+ auto p = bl.cbegin();
CInode *mdsin = add_replica_inode(p, NULL, finished);
CDir *mdsdir = add_replica_dir(p, mdsin, from, finished);
for (const auto &r : dir->dir_rep_by) {
s.insert(r);
}
- mds->send_message_mds(new MDirUpdate(mds->get_nodeid(),
- dir->dirfrag(),
- dir->dir_rep,
- s,
- path,
- bcast),
- *it);
+ mds->send_message_mds(MDirUpdate::create(mds->get_nodeid(), dir->dirfrag(), dir->dir_rep, s, path, bcast), *it);
}
return 0;
}
-/* This function DOES put the passed message before returning */
-void MDCache::handle_dir_update(MDirUpdate *m)
+void MDCache::handle_dir_update(const MDirUpdate::const_ref &m)
{
dirfrag_t df = m->get_dirfrag();
CDir *dir = get_dirfrag(df);
CInode *in;
filepath path = m->get_path();
dout(5) << "trying discover on dir_update for " << path << dendl;
+ CF_MDS_RetryMessageFactory cf(mds, m);
MDRequestRef null_ref;
- int r = path_traverse(null_ref, m, NULL, path, &trace, &in, MDS_TRAVERSE_DISCOVER);
+ int r = path_traverse(null_ref, cf, path, &trace, &in, MDS_TRAVERSE_DISCOVER);
if (r > 0)
return;
if (r == 0 &&
}
}
- m->put();
return;
}
dir->dir_rep_by.insert(e);
}
}
-
- // done
- m->put();
}
rejoin_gather.count(p.first)))
continue;
CDentry::linkage_t *dnl = dn->get_linkage();
- MDentryLink *m = new MDentryLink(subtree->dirfrag(), dn->get_dir()->dirfrag(),
- dn->get_name(), dnl->is_primary());
+ auto m = MDentryLink::create(subtree->dirfrag(), dn->get_dir()->dirfrag(), dn->get_name(), dnl->is_primary());
if (dnl->is_primary()) {
dout(10) << " primary " << *dnl->get_inode() << dendl;
replicate_inode(dnl->get_inode(), p.first, m->bl,
inodeno_t ino = dnl->get_remote_ino();
__u8 d_type = dnl->get_remote_d_type();
dout(10) << " remote " << ino << " " << d_type << dendl;
- ::encode(ino, m->bl);
- ::encode(d_type, m->bl);
+ encode(ino, m->bl);
+ encode(d_type, m->bl);
} else
ceph_abort(); // aie, bad caller!
mds->send_message_mds(m, p.first);
}
}
-/* This function DOES put the passed message before returning */
-void MDCache::handle_dentry_link(MDentryLink *m)
+void MDCache::handle_dentry_link(const MDentryLink::const_ref &m)
{
-
CDentry *dn = NULL;
CDir *dir = get_dirfrag(m->get_dirfrag());
if (!dir) {
dout(7) << "handle_dentry_link on " << *dn << dendl;
CDentry::linkage_t *dnl = dn->get_linkage();
- assert(!dn->is_auth());
- assert(dnl->is_null());
+ ceph_assert(!dn->is_auth());
+ ceph_assert(dnl->is_null());
}
}
- bufferlist::iterator p = m->bl.begin();
- list<MDSInternalContextBase*> finished;
+ auto p = m->bl.cbegin();
+ MDSContext::vec finished;
if (dn) {
if (m->get_is_primary()) {
// primary link.
// remote link, easy enough.
inodeno_t ino;
__u8 d_type;
- ::decode(ino, p);
- ::decode(d_type, p);
+ decode(ino, p);
+ decode(d_type, p);
dir->link_remote_inode(dn, ino, d_type);
}
} else {
if (!finished.empty())
mds->queue_waiters(finished);
- m->put();
return;
}
// share unlink news with replicas
set<mds_rank_t> replicas;
dn->list_replicas(replicas);
- if (straydn)
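+ // bundle the stray inode's snaprealm so replicas can reconstruct it when they relink the inode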
+ bufferlist snapbl;
+ if (straydn) {
straydn->list_replicas(replicas);
+ CInode *strayin = straydn->get_linkage()->get_inode();
+ strayin->encode_snap_blob(snapbl);
+ }
for (set<mds_rank_t>::iterator it = replicas.begin();
it != replicas.end();
++it) {
rejoin_gather.count(*it)))
continue;
- MDentryUnlink *unlink = new MDentryUnlink(dn->get_dir()->dirfrag(), dn->get_name());
- if (straydn)
+ auto unlink = MDentryUnlink::create(dn->get_dir()->dirfrag(), dn->get_name());
+ if (straydn) {
replicate_stray(straydn, *it, unlink->straybl);
+ unlink->snapbl = snapbl;
+ }
mds->send_message_mds(unlink, *it);
}
}
-/* This function DOES put the passed message before returning */
-void MDCache::handle_dentry_unlink(MDentryUnlink *m)
+void MDCache::handle_dentry_unlink(const MDentryUnlink::const_ref &m)
{
// straydn
CDentry *straydn = NULL;
if (dnl->is_primary()) {
CInode *in = dnl->get_inode();
dn->dir->unlink_inode(dn);
- assert(straydn);
+ ceph_assert(straydn);
straydn->dir->link_primary_inode(straydn, in);
// in->first is lazily updated on replica; drag it forward so
// that we always keep it in sync with the dnq
- assert(straydn->first >= in->first);
+ ceph_assert(straydn->first >= in->first);
in->first = straydn->first;
// update subtree map?
if (in->is_dir())
adjust_subtree_after_rename(in, dir, false);
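+ // mirror the snaprealm the auth mds bundled; a realm created here is propagated as a split without client notification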
+ if (m->snapbl.length()) {
+ bool hadrealm = (in->snaprealm ? true : false);
+ in->decode_snap_blob(m->snapbl);
+ ceph_assert(in->snaprealm);
+ ceph_assert(in->snaprealm->have_past_parents_open());
+ if (!hadrealm)
+ do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
+ }
+
// send caps to auth (if we're not already)
if (in->is_any_caps() &&
!in->state_test(CInode::STATE_EXPORTINGCAPS))
straydn = NULL;
} else {
- assert(!straydn);
- assert(dnl->is_remote());
+ ceph_assert(!straydn);
+ ceph_assert(dnl->is_remote());
dn->dir->unlink_inode(dn);
}
- assert(dnl->is_null());
+ ceph_assert(dnl->is_null());
}
}
// race with trim_dentry()
if (straydn) {
- assert(straydn->get_num_ref() == 0);
- assert(straydn->get_linkage()->is_null());
- map<mds_rank_t, MCacheExpire*> expiremap;
- trim_dentry(straydn, expiremap);
- send_expire_messages(expiremap);
+ ceph_assert(straydn->get_num_ref() == 0);
+ ceph_assert(straydn->get_linkage()->is_null());
+ expiremap ex;
+ trim_dentry(straydn, ex);
+ send_expire_messages(ex);
}
-
- m->put();
- return;
}
*/
void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
list<CDir*>& resultfrags,
- list<MDSInternalContextBase*>& waiters,
+ MDSContext::vec& waiters,
bool replay)
{
dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl;
list<CDir*> src, result;
- list<MDSInternalContextBase*> waiters;
+ MDSContext::vec waiters;
// split a parent?
frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg);
list<CDir*>& srcfrags,
frag_t basefrag, int bits,
list<CDir*>& resultfrags,
- list<MDSInternalContextBase*>& waiters,
+ MDSContext::vec& waiters,
bool replay)
{
dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits
if (bits > 0) {
// SPLIT
- assert(srcfrags.size() == 1);
+ ceph_assert(srcfrags.size() == 1);
CDir *dir = srcfrags.front();
dir->split(bits, resultfrags, waiters, replay);
// was i a bound?
if (parent_subtree) {
- assert(subtrees[parent_subtree].count(dir));
+ ceph_assert(subtrees[parent_subtree].count(dir));
subtrees[parent_subtree].erase(dir);
for (list<CDir*>::iterator p = resultfrags.begin();
p != resultfrags.end();
++p) {
- assert((*p)->is_subtree_root());
+ ceph_assert((*p)->is_subtree_root());
subtrees[parent_subtree].insert(*p);
}
}
// are my constituent bits subtrees? if so, i will be too.
// (it's all or none, actually.)
- bool any_subtree = false;
+ bool any_subtree = false, any_non_subtree = false;
for (CDir *dir : srcfrags) {
- if (dir->is_subtree_root()) {
+ if (dir->is_subtree_root())
any_subtree = true;
- break;
- }
+ else
+ any_non_subtree = true;
}
+ ceph_assert(!any_subtree || !any_non_subtree);
+
set<CDir*> new_bounds;
if (any_subtree) {
for (CDir *dir : srcfrags) {
}
for (CDir *dir : srcfrags) {
- assert(dir->is_subtree_root());
+ ceph_assert(dir->is_subtree_root());
dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
set<CDir*>::iterator r = q->second.begin();
f->merge(srcfrags, waiters, replay);
if (any_subtree) {
- assert(f->is_subtree_root());
+ ceph_assert(f->is_subtree_root());
subtrees[f].swap(new_bounds);
if (parent_subtree)
subtrees[parent_subtree].insert(f);
void MDCache::split_dir(CDir *dir, int bits)
{
dout(7) << __func__ << " " << *dir << " bits " << bits << dendl;
- assert(dir->is_auth());
+ ceph_assert(dir->is_auth());
CInode *diri = dir->inode;
list<CDir*> dirs;
MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
mdr->more()->fragment_base = dir->dirfrag();
- assert(fragments.count(dir->dirfrag()) == 0);
+ ceph_assert(fragments.count(dir->dirfrag()) == 0);
fragment_info_t& info = fragments[dir->dirfrag()];
info.mdr = mdr;
info.dirs.push_back(dir);
MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
mdr->more()->fragment_base = basedirfrag;
- assert(fragments.count(basedirfrag) == 0);
+ ceph_assert(fragments.count(basedirfrag) == 0);
fragment_info_t& info = fragments[basedirfrag];
info.mdr = mdr;
info.dirs = dirs;
void MDCache::fragment_freeze_dirs(list<CDir*>& dirs)
{
- for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
- CDir *dir = *p;
+ bool any_subtree = false, any_non_subtree = false;
+ for (CDir* dir : dirs) {
dir->auth_pin(dir); // until we mark and complete them
dir->state_set(CDir::STATE_FRAGMENTING);
dir->freeze_dir();
- assert(dir->is_freezing_dir());
+ ceph_assert(dir->is_freezing_dir());
+
+ if (dir->is_subtree_root())
+ any_subtree = true;
+ else
+ any_non_subtree = true;
+ }
+
+ if (any_subtree && any_non_subtree) {
+ // either all dirfrags are subtree roots or all are not.
+ for (CDir *dir : dirs) {
+ if (dir->is_subtree_root()) {
+ ceph_assert(dir->state_test(CDir::STATE_AUXSUBTREE));
+ } else {
+ dir->state_set(CDir::STATE_AUXSUBTREE);
+ adjust_subtree_auth(dir, mds->get_nodeid());
+ }
+ }
}
}
ready = false;
} else if (dir->is_new()) {
dout(15) << " committing new " << *dir << dendl;
- assert(dir->is_dirty());
+ ceph_assert(dir->is_dirty());
dir->commit(0, gather.new_sub(), true);
ready = false;
}
for (auto &p : dir->items) {
CDentry *dn = p.second;
dn->get(CDentry::PIN_FRAGMENTING);
- assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
+ ceph_assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
dn->state_set(CDentry::STATE_FRAGMENTING);
}
dir->state_set(CDir::STATE_DNPINNEDFRAG);
++p) {
CDir *dir = *p;
if (!dir->is_frozen_dir()) {
- assert(dir->is_freezing_dir());
+ ceph_assert(dir->is_freezing_dir());
dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub());
}
}
CDir *dir = *p;
dout(10) << " frag " << *dir << dendl;
- assert(dir->state_test(CDir::STATE_FRAGMENTING));
+ ceph_assert(dir->state_test(CDir::STATE_FRAGMENTING));
dir->state_clear(CDir::STATE_FRAGMENTING);
if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
for (auto &p : dir->items) {
CDentry *dn = p.second;
- assert(dn->state_test(CDentry::STATE_FRAGMENTING));
+ ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
dn->state_clear(CDentry::STATE_FRAGMENTING);
dn->put(CDentry::PIN_FRAGMENTING);
}
bool MDCache::fragment_are_all_frozen(CDir *dir)
{
- assert(dir->is_frozen_dir());
+ ceph_assert(dir->is_frozen_dir());
map<dirfrag_t,fragment_info_t>::iterator p;
for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
p != fragments.end() && p->first.ino == dir->ino();
// see comment in Migrator::find_stale_export_freeze()
utime_t now = ceph_clock_now();
utime_t cutoff = now;
- cutoff -= g_conf->mds_freeze_tree_timeout;
+ cutoff -= g_conf()->mds_freeze_tree_timeout;
for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
p != fragments.end(); ) {
const MDRequestRef& r) :
MDCacheIOContext(m), basedirfrag(f), bits(b), mdr(r) {}
void finish(int r) override {
- assert(r == 0 || r == -ENOENT);
+ ceph_assert(r == 0 || r == -ENOENT);
mdcache->_fragment_old_purged(basedirfrag, bits, mdr);
}
void print(ostream& out) const override {
return;
}
- assert(r == 0);
+ ceph_assert(r == 0);
fragment_info_t& info = it->second;
dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits
<< " on " << info.dirs.front()->get_inode() << dendl;
dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits
<< " on " << *diri << dendl;
if (!mdr->aborted) {
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
- wrlocks.insert(&diri->dirfragtreelock);
+ MutationImpl::LockOpVec lov;
+ lov.add_wrlock(&diri->dirfragtreelock);
// prevent a racing gather on any other scatterlocks too
- wrlocks.insert(&diri->nestlock);
- wrlocks.insert(&diri->filelock);
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks, NULL, NULL, true))
+ lov.add_wrlock(&diri->nestlock);
+ lov.add_wrlock(&diri->filelock);
+ if (!mds->locker->acquire_locks(mdr, lov, NULL, true))
if (!mdr->aborted)
return;
}
}
// refragment
- list<MDSInternalContextBase*> waiters;
+ MDSContext::vec waiters;
adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits,
info.resultfrags, waiters, false);
- if (g_conf->mds_debug_frag)
+ if (g_conf()->mds_debug_frag)
diri->verify_dirfrags();
mds->queue_waiters(waiters);
- for (list<frag_t>::iterator p = le->orig_frags.begin(); p != le->orig_frags.end(); ++p)
- assert(!diri->dirfragtree.is_leaf(*p));
+ for (const auto& fg : le->orig_frags)
+ ceph_assert(!diri->dirfragtree.is_leaf(fg));
le->metablob.add_dir_context(*info.resultfrags.begin());
for (list<CDir*>::iterator p = info.resultfrags.begin();
rejoin_gather.count(p.first)))
continue;
- auto notify = new MMDSFragmentNotify(basedirfrag, info.bits, mdr->reqid.tid);
+ auto notify = MMDSFragmentNotify::create(basedirfrag, info.bits, mdr->reqid.tid);
if (diri_auth != CDIR_AUTH_UNKNOWN && // subtree root
diri_auth != p.first) { // not auth mds of diri
/*
for (auto &p : dir->items) {
CDentry *dn = p.second;
- assert(dn->state_test(CDentry::STATE_FRAGMENTING));
+ ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
dn->state_clear(CDentry::STATE_FRAGMENTING);
dn->put(CDentry::PIN_FRAGMENTING);
}
SnapContext nullsnapc;
object_locator_t oloc(mds->mdsmap->get_metadata_pool());
- for (list<frag_t>::iterator p = uf.old_frags.begin();
- p != uf.old_frags.end();
- ++p) {
- object_t oid = CInode::get_object_name(basedirfrag.ino, *p, "");
+ for (const auto& fg : uf.old_frags) {
+ object_t oid = CInode::get_object_name(basedirfrag.ino, fg, "");
ObjectOperation op;
- if (*p == frag_t()) {
+ if (fg == frag_t()) {
// backtrace object
dout(10) << " truncate orphan dirfrag " << oid << dendl;
op.truncate(0);
0, gather.new_sub());
}
- assert(gather.has_subs());
+ ceph_assert(gather.has_subs());
gather.activate();
}
}
-void MDCache::handle_fragment_notify_ack(MMDSFragmentNotifyAck *ack)
+void MDCache::handle_fragment_notify_ack(const MMDSFragmentNotifyAck::const_ref &ack)
{
dout(10) << "handle_fragment_notify_ack " << *ack << " from " << ack->get_source() << dendl;
mds_rank_t from = mds_rank_t(ack->get_source().num());
if (mds->get_state() < MDSMap::STATE_ACTIVE) {
- ack->put();
return;
}
if (it == fragments.end() ||
it->second.get_tid() != ack->get_tid()) {
dout(10) << "handle_fragment_notify_ack obsolete message, dropping" << dendl;
- ack->put();
return;
}
fragment_drop_locks(it->second);
fragment_maybe_finish(it);
}
- ack->put();
}
-/* This function DOES put the passed message before returning */
-void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify)
+void MDCache::handle_fragment_notify(const MMDSFragmentNotify::const_ref ¬ify)
{
dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl;
mds_rank_t from = mds_rank_t(notify->get_source().num());
if (mds->get_state() < MDSMap::STATE_REJOIN) {
- notify->put();
return;
}
(bits > 0 && !diri->dirfragtree.is_leaf(base))) {
dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
<< ", must have found out during resolve/rejoin? ignoring. " << *diri << dendl;
- notify->put();
return;
}
*/
// refragment
- list<MDSInternalContextBase*> waiters;
+ MDSContext::vec waiters;
list<CDir*> resultfrags;
adjust_dir_fragments(diri, base, bits, resultfrags, waiters, false);
- if (g_conf->mds_debug_frag)
+ if (g_conf()->mds_debug_frag)
diri->verify_dirfrags();
for (list<CDir*>::iterator p = resultfrags.begin(); p != resultfrags.end(); ++p)
diri->take_dir_waiting((*p)->get_frag(), waiters);
// add new replica dirs values
- bufferlist::iterator p = notify->basebl.begin();
+ auto p = notify->basebl.cbegin();
while (!p.end())
add_replica_dir(p, diri, from, waiters);
}
if (notify->is_ack_wanted()) {
- auto ack = new MMDSFragmentNotifyAck(notify->get_base_dirfrag(),
- notify->get_bits(), notify->get_tid());
+ auto ack = MMDSFragmentNotifyAck::create(notify->get_base_dirfrag(),
+ notify->get_bits(), notify->get_tid());
mds->send_message_mds(ack, from);
}
- notify->put();
}
-void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frags,
+void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frags,
LogSegment *ls, bufferlist *rollback)
{
dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl;
- assert(!uncommitted_fragments.count(basedirfrag));
+ ceph_assert(!uncommitted_fragments.count(basedirfrag));
ufragment& uf = uncommitted_fragments[basedirfrag];
uf.old_frags = old_frags;
uf.bits = bits;
}
}
-void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags)
+void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags)
{
dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
<< " old_frags (" << old_frags << ")" << dendl;
if (it != uncommitted_fragments.end()) {
ufragment& uf = it->second;
if (!uf.old_frags.empty()) {
- uf.old_frags.swap(old_frags);
+ uf.old_frags = std::move(old_frags);
uf.committed = true;
} else {
uf.ls->uncommitted_fragments.erase(basedirfrag);
++p) {
ufragment &uf = p->second;
CInode *diri = get_inode(p->first.ino);
- assert(diri);
+ ceph_assert(diri);
if (uf.committed) {
_fragment_committed(p->first, MDRequestRef());
mds->mdlog->start_entry(le);
bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF);
- list<frag_t> old_frags;
+ frag_vec_t old_frags;
diri->dirfragtree.get_leaves_under(p->first.frag, old_frags);
list<CDir*> resultfrags;
if (uf.old_frags.empty()) {
// created by old format EFragment
- list<MDSInternalContextBase*> waiters;
+ MDSContext::vec waiters;
adjust_dir_fragments(diri, p->first.frag, -uf.bits, resultfrags, waiters, true);
} else {
- bufferlist::iterator bp = uf.rollback.begin();
- for (list<frag_t>::iterator q = uf.old_frags.begin(); q != uf.old_frags.end(); ++q) {
- CDir *dir = force_dir_fragment(diri, *q);
+ auto bp = uf.rollback.cbegin();
+ for (const auto& fg : uf.old_frags) {
+ CDir *dir = force_dir_fragment(diri, fg);
resultfrags.push_back(dir);
dirfrag_rollback rollback;
- ::decode(rollback, bp);
+ decode(rollback, bp);
dir->set_version(rollback.fnode.version);
dir->fnode = rollback.fnode;
ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
}
- if (g_conf->mds_debug_frag)
+ if (g_conf()->mds_debug_frag)
diri->verify_dirfrags();
- for (list<frag_t>::iterator q = old_frags.begin(); q != old_frags.end(); ++q)
- assert(!diri->dirfragtree.is_leaf(*q));
+ for (const auto& leaf : old_frags) {
+ ceph_assert(!diri->dirfragtree.is_leaf(leaf));
+ }
mds->mdlog->submit_entry(le);
void MDCache::show_subtrees(int dbl)
{
- if (g_conf->mds_thrash_exports)
+ if (g_conf()->mds_thrash_exports)
dbl += 15;
//dout(10) << "show_subtrees" << dendl;
- if (!g_conf->subsys.should_gather(ceph_subsys_mds, dbl))
+ if (!g_conf()->subsys.should_gather(ceph_subsys_mds, dbl))
return; // i won't print anything.
if (subtrees.empty()) {
- dout(dbl) << "show_subtrees - no subtrees" << dendl;
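+ // dbl is a runtime value, so the dout macro needs the need_dynamic wrapper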
+ dout(ceph::dout::need_dynamic(dbl)) << "show_subtrees - no subtrees"
+ << dendl;
return;
}
// sanity check
//dout(25) << "saw depth " << d << " " << *dir << dendl;
if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl;
- assert(seen.count(dir) == 0);
+ ceph_assert(seen.count(dir) == 0);
seen.insert(dir);
// nested items?
snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second));
// print
- dout(dbl) << indent << "|_" << pad << s << " " << auth << *dir << dendl;
+ dout(ceph::dout::need_dynamic(dbl)) << indent << "|_" << pad << s
+ << " " << auth << *dir << dendl;
if (dir->ino() == MDS_INO_ROOT)
- assert(dir->inode == root);
+ ceph_assert(dir->inode == root);
if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
- assert(dir->inode == myin);
+ ceph_assert(dir->inode == myin);
if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid()))
- assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode);
+ ceph_assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode);
// nested items?
if (!subtrees[dir].empty()) {
dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl;
lost++;
}
- assert(lost == 0);
+ ceph_assert(lost == 0);
}
void MDCache::show_cache()
f->close_section();
}
-int MDCache::dump_cache(boost::string_view file_name)
+void MDCache::dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f)
{
- return dump_cache(file_name, NULL);
+ ceph_assert(in);
+ if ((max_depth >= 0) && (cur_depth > max_depth)) {
+ return;
+ }
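+ // depth-first: dump the whole subtree below this inode before the inode itself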
+ list<CDir*> ls;
+ in->get_dirfrags(ls);
+ for (const auto &subdir : ls) {
+ for (const auto &p : subdir->items) {
+ CDentry *dn = p.second;
+ CInode *child = dn->get_linkage()->get_inode();
+ if (child) {
+ dump_tree(child, cur_depth + 1, max_depth, f);
+ }
+ }
+ }
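+ // children done; emit this inode last (post-order)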
+ f->open_object_section("inode");
+ in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
+ f->close_section();
}
-int MDCache::dump_cache(Formatter *f)
+int MDCache::dump_cache(std::string_view file_name)
{
- return dump_cache(boost::string_view(""), f);
+ return dump_cache(file_name, NULL);
}
-int MDCache::dump_cache(boost::string_view dump_root, int depth, Formatter *f)
+int MDCache::dump_cache(Formatter *f)
{
- return dump_cache(boost::string_view(""), f, dump_root, depth);
+ return dump_cache(std::string_view(""), f);
}
/**
* Dump the metadata cache, either to a Formatter, if
* provided, else to a plain text file.
*/
-int MDCache::dump_cache(boost::string_view fn, Formatter *f,
- boost::string_view dump_root, int depth)
+int MDCache::dump_cache(std::string_view fn, Formatter *f)
{
int r = 0;
// shooting, but the option itself is for devs and hence dangerous to
// tune). TODO: remove this when fixed.
uint64_t threshold = f ?
- g_conf->get_val<uint64_t>("mds_dump_cache_threshold_formatter") :
- g_conf->get_val<uint64_t>("mds_dump_cache_threshold_file");
+ g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_formatter") :
+ g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_file");
if (threshold && cache_size() > threshold) {
if (f) {
}
}
- auto dump_func = [this, fd, f, depth, &dump_root](CInode *in) {
+ auto dump_func = [fd, f](CInode *in) {
int r;
- if (!dump_root.empty()) {
- string ipath;
- if (in->is_root())
- ipath = "/";
- else
- in->make_path_string(ipath);
-
- if (dump_root.length() > ipath.length() ||
- !equal(dump_root.begin(), dump_root.end(), ipath.begin()))
- return 0;
-
- if (depth >= 0 &&
- count(ipath.begin() + dump_root.length(), ipath.end(), '/') > depth)
- return 0;
- }
-
if (f) {
f->open_object_section("inode");
- in->dump(f);
- } else {
- ostringstream ss;
- ss << *in << std::endl;
- std::string s = ss.str();
- r = safe_write(fd, s.c_str(), s.length());
- if (r < 0)
- return r;
- }
-
+ in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
+ f->close_section();
+ return 1;
+ }
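+ // no formatter: fall through to the plain-text dump below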
+ ostringstream ss;
+ ss << *in << std::endl;
+ std::string s = ss.str();
+ r = safe_write(fd, s.c_str(), s.length());
+ if (r < 0)
+ return r;
list<CDir*> dfs;
in->get_dirfrags(dfs);
- if (f) {
- f->open_array_section("dirfrags");
- }
- for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
- CDir *dir = *p;
- if (f) {
- f->open_object_section("dir");
- dir->dump(f);
- } else {
- ostringstream tt;
- tt << " " << *dir << std::endl;
- string t = tt.str();
- r = safe_write(fd, t.c_str(), t.length());
- if (r < 0)
- return r;
- }
-
- if (f) {
- f->open_array_section("dentries");
- }
+ for (auto &dir : dfs) {
+ ostringstream tt;
+ tt << " " << *dir << std::endl;
+ std::string t = tt.str();
+ r = safe_write(fd, t.c_str(), t.length());
+ if (r < 0)
+ return r;
for (auto &p : dir->items) {
CDentry *dn = p.second;
- if (f) {
- f->open_object_section("dentry");
- dn->dump(f);
- f->close_section();
- } else {
- ostringstream uu;
- uu << " " << *dn << std::endl;
- string u = uu.str();
- r = safe_write(fd, u.c_str(), u.length());
- if (r < 0)
- return r;
- }
- }
- if (f) {
- f->close_section(); //dentries
+ ostringstream uu;
+ uu << " " << *dn << std::endl;
+ std::string u = uu.str();
+ r = safe_write(fd, u.c_str(), u.length());
+ if (r < 0)
+ return r;
}
dir->check_rstats();
- if (f) {
- f->close_section(); //dir
- }
- }
- if (f) {
- f->close_section(); // dirfrags
- }
-
- if (f) {
- f->close_section(); // inode
}
return 1;
};
class C_MDS_EnqueueScrub : public Context
{
+ std::string tag;
Formatter *formatter;
Context *on_finish;
public:
ScrubHeaderRef header;
- C_MDS_EnqueueScrub(Formatter *f, Context *fin) :
- formatter(f), on_finish(fin), header(nullptr) {}
+ C_MDS_EnqueueScrub(std::string_view tag, Formatter *f, Context *fin) :
+ tag(tag), formatter(f), on_finish(fin), header(nullptr) {}
Context *take_finisher() {
Context *fin = on_finish;
}
void finish(int r) override {
- if (r < 0) { // we failed the lookup or something; dump ourselves
+ if (r == 0) {
+ // since recursive scrub is asynchronous, dump minimal output
+ // so as not to upset CLI tools.
+ if (header && header->get_recursive()) {
+ formatter->open_object_section("results");
+ formatter->dump_int("return_code", 0);
+ formatter->dump_string("scrub_tag", tag);
+ formatter->dump_string("mode", "asynchronous");
+ formatter->close_section(); // results
+ }
+ } else { // we failed the lookup or something; dump ourselves
formatter->open_object_section("results");
formatter->dump_int("return_code", r);
formatter->close_section(); // results
+ r = 0; // the error has already been reported via the formatter
}
if (on_finish)
on_finish->complete(r);
};
void MDCache::enqueue_scrub(
- boost::string_view path,
- boost::string_view tag,
+ std::string_view path,
+ std::string_view tag,
bool force, bool recursive, bool repair,
Formatter *f, Context *fin)
{
- dout(10) << __func__ << path << dendl;
+ dout(10) << __func__ << " " << path << dendl;
MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
- filepath fp(path);
- mdr->set_filepath(fp);
+ if (path == "~mdsdir") {
+ filepath fp(MDS_INO_MDSDIR(mds->get_nodeid()));
+ mdr->set_filepath(fp);
+ } else {
+ filepath fp(path);
+ mdr->set_filepath(fp);
+ }
+
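+ // no tag supplied by the caller: generate a random uuid so the scrub can
+ // still be tracked, and remember that the tag was generated internally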
+ bool is_internal = false;
+ std::string tag_str(tag);
+ if (tag_str.empty()) {
+ uuid_d uuid_gen;
+ uuid_gen.generate_random();
+ tag_str = uuid_gen.to_string();
+ is_internal = true;
+ }
- C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(f, fin);
+ C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(tag_str, f, fin);
cs->header = std::make_shared<ScrubHeader>(
- tag, force, recursive, repair, f);
+ tag_str, is_internal, force, recursive, repair, f);
mdr->internal_op_finish = cs;
enqueue_scrub_work(mdr);
-
- // since recursive scrub is asynchronous, dump minimal output
- // to not upset cli tools.
- if (recursive) {
- f->open_object_section("results");
- f->close_section(); // results
- }
}
void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
{
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
- CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);
+ MutationImpl::LockOpVec lov;
+ CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, lov, true);
if (NULL == in)
return;
// TODO: Remove this restriction
- assert(in->is_auth());
+ ceph_assert(in->is_auth());
- bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);
+ bool locked = mds->locker->acquire_locks(mdr, lov);
if (!locked)
return;
C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish);
- ScrubHeaderRef &header = cs->header;
+ ScrubHeaderRef header = cs->header;
// Cannot scrub same dentry twice at same time
- if (in->scrub_infop && in->scrub_infop->scrub_in_progress) {
+ if (in->scrub_is_in_progress()) {
mds->server->respond_to_request(mdr, -EBUSY);
return;
} else {
header->set_origin(in);
- Context *fin = nullptr;
- if (!header->get_recursive()) {
+ Context *fin;
+ if (header->get_recursive()) {
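+ // pin the origin inode so it stays in cache until the recursive scrub completes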
+ header->get_origin()->get(CInode::PIN_SCRUBQUEUE);
+ fin = new MDSInternalContextWrapper(mds,
+ new FunctionContext([this, header](int r) {
+ recursive_scrub_finish(header);
+ header->get_origin()->put(CInode::PIN_SCRUBQUEUE);
+ })
+ );
+ } else {
fin = cs->take_finisher();
}
auto& expiring_segments = mds->mdlog->get_expiring_segments();
for (auto logseg : expiring_segments)
logseg->wait_for_expiry(gather.new_sub());
- assert(gather.has_subs());
+ ceph_assert(gather.has_subs());
gather.set_finisher(new MDSInternalContextWrapper(mds, fin));
gather.activate();
}
return;
}
-struct C_MDC_RepairDirfragStats : public MDCacheLogContext {
+void MDCache::recursive_scrub_finish(const ScrubHeaderRef& header)
+{
+ if (header->get_origin()->is_base() &&
+ header->get_force() && header->get_repair()) {
+ // notify snapserver that base directory is recursively scrubbed.
+ // After both root and mdsdir are recursively scrubbed, snapserver
+ // knows that all old format snaprealms are converted to the new
+ // format.
+ if (mds->mdsmap->get_num_in_mds() == 1 &&
+ mds->mdsmap->get_num_failed_mds() == 0 &&
+ mds->mdsmap->get_tableserver() == mds->get_nodeid()) {
+ mds->mark_base_recursively_scrubbed(header->get_origin()->ino());
+ }
+ }
+}
+
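+// journal-commit callback: applies the mutation and answers the internal request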
+struct C_MDC_RespondInternalRequest : public MDCacheLogContext {
MDRequestRef mdr;
- C_MDC_RepairDirfragStats(MDCache *c, MDRequestRef& m) :
+ C_MDC_RespondInternalRequest(MDCache *c, MDRequestRef& m) :
MDCacheLogContext(c), mdr(m) {}
void finish(int r) override {
mdr->apply();
mdr->auth_pin(dir);
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
+ MutationImpl::LockOpVec lov;
CInode *diri = dir->inode;
- rdlocks.insert(&diri->dirfragtreelock);
- wrlocks.insert(&diri->nestlock);
- wrlocks.insert(&diri->filelock);
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
+ lov.add_rdlock(&diri->dirfragtreelock);
+ lov.add_wrlock(&diri->nestlock);
+ lov.add_wrlock(&diri->filelock);
+ if (!mds->locker->acquire_locks(mdr, lov))
return;
if (!dir->is_complete()) {
le->metablob.add_dir_context(dir);
le->metablob.add_dir(dir, true);
- mds->mdlog->submit_entry(le, new C_MDC_RepairDirfragStats(this, mdr));
+ mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr));
}
void MDCache::repair_inode_stats(CInode *diri)
return;
}
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
- std::list<frag_t> frags;
+ MutationImpl::LockOpVec lov;
if (mdr->ls) // already marked filelock/nestlock dirty ?
goto do_rdlocks;
- rdlocks.insert(&diri->dirfragtreelock);
- wrlocks.insert(&diri->nestlock);
- wrlocks.insert(&diri->filelock);
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
+ lov.add_rdlock(&diri->dirfragtreelock);
+ lov.add_wrlock(&diri->nestlock);
+ lov.add_wrlock(&diri->filelock);
+ if (!mds->locker->acquire_locks(mdr, lov))
return;
// Fetch all dirfrags and mark filelock/nestlock dirty. This will trigger
// the scatter-gather process, which will fix any fragstat/rstat errors.
- diri->dirfragtree.get_leaves(frags);
- for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
- CDir *dir = diri->get_dirfrag(*p);
- if (!dir) {
- assert(mdr->is_auth_pinned(diri));
- dir = diri->get_or_open_dirfrag(this, *p);
- }
- if (dir->get_version() == 0) {
- assert(dir->is_auth());
- dir->fetch(new C_MDS_RetryRequest(this, mdr));
- return;
+ {
+ frag_vec_t leaves;
+ diri->dirfragtree.get_leaves(leaves);
+ for (const auto& leaf : leaves) {
+ CDir *dir = diri->get_dirfrag(leaf);
+ if (!dir) {
+ ceph_assert(mdr->is_auth_pinned(diri));
+ dir = diri->get_or_open_dirfrag(this, leaf);
+ }
+ if (dir->get_version() == 0) {
+ ceph_assert(dir->is_auth());
+ dir->fetch(new C_MDS_RetryRequest(this, mdr));
+ return;
+ }
}
}
do_rdlocks:
// force the scatter-gather process
- rdlocks.insert(&diri->dirfragtreelock);
- rdlocks.insert(&diri->nestlock);
- rdlocks.insert(&diri->filelock);
- wrlocks.clear();
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
+ lov.clear();
+ lov.add_rdlock(&diri->dirfragtreelock);
+ lov.add_rdlock(&diri->nestlock);
+ lov.add_rdlock(&diri->filelock);
+ if (!mds->locker->acquire_locks(mdr, lov))
return;
diri->state_clear(CInode::STATE_REPAIRSTATS);
frag_info_t dir_info;
nest_info_t nest_info;
- nest_info.rsubdirs++; // it gets one to account for self
+ nest_info.rsubdirs = 1; // it gets one to account for self
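+ // also count this directory's snapshots toward rsnaps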
+ if (const sr_t *srnode = diri->get_projected_srnode(); srnode)
+ nest_info.rsnaps = srnode->snaps.size();
- diri->dirfragtree.get_leaves(frags);
- for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
- CDir *dir = diri->get_dirfrag(*p);
- assert(dir);
- assert(dir->get_version() > 0);
- dir_info.add(dir->fnode.accounted_fragstat);
- nest_info.add(dir->fnode.accounted_rstat);
+ {
+ frag_vec_t leaves;
+ diri->dirfragtree.get_leaves(leaves);
+ for (const auto& leaf : leaves) {
+ CDir *dir = diri->get_dirfrag(leaf);
+ ceph_assert(dir);
+ ceph_assert(dir->get_version() > 0);
+ dir_info.add(dir->fnode.accounted_fragstat);
+ nest_info.add(dir->fnode.accounted_rstat);
+ }
}
if (!dir_info.same_sums(diri->inode.dirstat) ||
mds->server->respond_to_request(mdr, 0);
}
-void MDCache::flush_dentry(boost::string_view path, Context *fin)
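+// Start an internal request that converts this inode's snaprealm from the
+// old format to the new one.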
+void MDCache::upgrade_inode_snaprealm(CInode *in)
+{
+ MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_UPGRADE_SNAPREALM);
+ mdr->pin(in);
+ mdr->internal_op_private = in;
+ mdr->internal_op_finish = new C_MDSInternalNoop;
+ upgrade_inode_snaprealm_work(mdr);
+}
+
+void MDCache::upgrade_inode_snaprealm_work(MDRequestRef& mdr)
+{
+ CInode *in = static_cast<CInode*>(mdr->internal_op_private);
+ dout(10) << __func__ << " " << *in << dendl;
+
+ if (!in->is_auth()) {
+ mds->server->respond_to_request(mdr, -ESTALE);
+ return;
+ }
+
+ MutationImpl::LockOpVec lov;
+ mds->locker->include_snap_rdlocks(in, lov);
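+ // the snaplock must be held exclusively (xlock), not shared, since the
+ // realm itself is being rewritten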
+ lov.erase_rdlock(&in->snaplock);
+ lov.add_xlock(&in->snaplock);
+
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ // project_snaprealm() upgrades snaprealm format
+ auto &pi = in->project_inode(false, true);
+ mdr->add_projected_inode(in);
+ pi.inode.version = in->pre_dirty();
+
+ mdr->ls = mds->mdlog->get_current_segment();
+ EUpdate *le = new EUpdate(mds->mdlog, "upgrade_snaprealm");
+ mds->mdlog->start_entry(le);
+
+ if (in->is_base()) {
+ le->metablob.add_root(true, in);
+ } else {
+ CDentry *pdn = in->get_projected_parent_dn();
+ le->metablob.add_dir_context(pdn->get_dir());
+ le->metablob.add_primary_dentry(pdn, in, true);
+ }
+
+ mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr));
+}
+
+void MDCache::flush_dentry(std::string_view path, Context *fin)
{
if (is_readonly()) {
dout(10) << __func__ << ": read-only FS" << dendl;
flush_dentry_work(mdr);
}
-class C_FinishIOMDR : public MDSInternalContextBase {
+class C_FinishIOMDR : public MDSContext {
protected:
MDSRank *mds;
MDRequestRef mdr;
void MDCache::flush_dentry_work(MDRequestRef& mdr)
{
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
- CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);
+ MutationImpl::LockOpVec lov;
+ CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, lov, true);
if (NULL == in)
return;
// TODO: Is this necessary? Fix it if so
- assert(in->is_auth());
- bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);
+ ceph_assert(in->is_auth());
+ bool locked = mds->locker->acquire_locks(mdr, lov);
if (!locked)
return;
in->flush(new C_FinishIOMDR(mds, mdr));
stray_manager.set_logger(logger.get());
}
-void MDCache::activate_stray_manager()
-{
- if (open) {
- stray_manager.activate();
- } else {
- wait_for_open(
- new MDSInternalContextWrapper(mds,
- new FunctionContext([this](int r){
- stray_manager.activate();
- })
- )
- );
- }
-}
-
/**
* Call this when putting references to an inode/dentry or
* when attempting to trim it.
return;
}
- if (dn->get_projected_linkage()->is_primary() &&
- dn->get_dir()->get_inode()->is_stray()) {
- stray_manager.eval_stray(dn, delay);
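+ // dentry lives under a stray directory: evaluate it now, or queue it for
+ // delayed evaluation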
+ if (dn->get_dir()->get_inode()->is_stray()) {
+ if (delay)
+ stray_manager.queue_delayed(dn);
+ else
+ stray_manager.eval_stray(dn);
}
}
void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
dout(10) << __func__ << " " << *diri << dendl;
- assert(diri->get_projected_parent_dir()->inode->is_stray());
+ ceph_assert(diri->get_projected_parent_dir()->inode->is_stray());
list<CDir*> ls;
diri->get_dirfrags(ls);
for (auto &p : ls) {
}
}
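+// Dump a single cached inode, looked up by inode number, to the formatter.
+// Returns false if the inode is not in cache.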
+bool MDCache::dump_inode(Formatter *f, uint64_t number) {
+ CInode *in = get_inode(number);
+ if (!in) {
+ return false;
+ }
+ f->open_object_section("inode");
+ in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_PATH);
+ f->close_section();
+ return true;
+}