X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=ceph%2Fsrc%2Fclient%2FClient.cc;h=2b7db5a894d1cc325e2fd2a3d2d7a114d24484e1;hb=aee94f6923ba628a85d855d0c5316d0da78bfa2a;hp=7c3f117a803dd281cf9ca25a04c4187604b15cae;hpb=27f45121cc74e31203777ad565f78d8aad9b92a2;p=ceph.git diff --git a/ceph/src/client/Client.cc b/ceph/src/client/Client.cc index 7c3f117a8..2b7db5a89 100644 --- a/ceph/src/client/Client.cc +++ b/ceph/src/client/Client.cc @@ -72,6 +72,7 @@ #include "mds/flock.h" #include "mds/cephfs_features.h" +#include "mds/snap.h" #include "osd/OSDMap.h" #include "osdc/Filer.h" @@ -1209,6 +1210,11 @@ Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dl Inode *diri = dir->parent_inode; clear_dir_complete_and_ordered(diri, false); dn = link(dir, dname, in, dn); + + if (old_dentry) { + dn->is_renaming = false; + signal_cond_list(waiting_for_rename); + } } update_dentry_lease(dn, dlease, from, session); @@ -1292,7 +1298,8 @@ void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete) /* * insert results from readdir or lssnap into the metadata cache. */ -void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) { +void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, + Inode *diri, Inode *diri_other) { auto& reply = request->reply; ConnectionRef con = request->reply->get_connection(); @@ -1307,7 +1314,8 @@ void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, dir_result_t *dirp = request->dirp; ceph_assert(dirp); - // the extra buffer list is only set for readdir and lssnap replies + // the extra buffer list is only set for readdir, lssnap and + // readdir_snapdiff replies auto p = reply->get_extra_bl().cbegin(); if (!p.end()) { // snapdir? 
@@ -1315,10 +1323,27 @@ void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, ceph_assert(diri); diri = open_snapdir(diri); } + bool snapdiff_req = request->head.op == CEPH_MDS_OP_READDIR_SNAPDIFF; + frag_t fg; + unsigned offset_hash; + if (snapdiff_req) { + fg = (unsigned)request->head.args.snapdiff.frag; + offset_hash = (unsigned)request->head.args.snapdiff.offset_hash; + } else { + fg = (unsigned)request->head.args.readdir.frag; + offset_hash = (unsigned)request->head.args.readdir.offset_hash; + } // only open dir if we're actually adding stuff to it! Dir *dir = diri->open_dir(); ceph_assert(dir); + //open opponent dir for snapdiff if any + Dir *dir_other = nullptr; + if (snapdiff_req) { + ceph_assert(diri_other); + dir_other = diri_other->open_dir(); + ceph_assert(dir_other); + } // dirstat DirStat dst(p, features); @@ -1330,7 +1355,6 @@ void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END); bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER); - frag_t fg = (unsigned)request->head.args.readdir.frag; unsigned readdir_offset = dirp->next_offset; string readdir_start = dirp->last_name; ceph_assert(!readdir_start.empty() || readdir_offset == 2); @@ -1341,7 +1365,7 @@ void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start)); } else if (flags & CEPH_READDIR_OFFSET_HASH) { /* mds understands offset_hash */ - last_hash = (unsigned)request->head.args.readdir.offset_hash; + last_hash = offset_hash; } } @@ -1386,13 +1410,22 @@ void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *in = add_update_inode(&ist, request->sent_stamp, session, request->perms); + auto *effective_dir = dir; + auto *effective_diri = diri; + + if (snapdiff_req && in->snapid != diri->snapid) { + ceph_assert(diri_other); + ceph_assert(dir_other); + effective_diri = 
diri_other; + effective_dir = dir_other; + } Dentry *dn; - if (diri->dir->dentries.count(dname)) { - Dentry *olddn = diri->dir->dentries[dname]; + if (effective_dir->dentries.count(dname)) { + Dentry *olddn = effective_dir->dentries[dname]; if (olddn->inode != in) { // replace incorrect dentry unlink(olddn, true, true); // keep dir, dentry - dn = link(dir, dname, in, olddn); + dn = link(effective_dir, dname, in, olddn); ceph_assert(dn == olddn); } else { // keep existing dn @@ -1401,13 +1434,13 @@ void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, } } else { // new dn - dn = link(dir, dname, in, NULL); + dn = link(effective_dir, dname, in, NULL); } dn->alternate_name = std::move(dlease.alternate_name); update_dentry_lease(dn, &dlease, request->sent_stamp, session); if (hash_order) { - unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname)); + unsigned hash = ceph_frag_value(effective_diri->hash_dentry_name(dname)); if (hash != last_hash) readdir_offset = 2; last_hash = hash; @@ -1416,20 +1449,21 @@ void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false); } // add to readdir cache - if (dirp->release_count == diri->dir_release_count && - dirp->ordered_count == diri->dir_ordered_count && - dirp->start_shared_gen == diri->shared_gen) { - if (dirp->cache_index == dir->readdir_cache.size()) { + if (!snapdiff_req && + dirp->release_count == effective_diri->dir_release_count && + dirp->ordered_count == effective_diri->dir_ordered_count && + dirp->start_shared_gen == effective_diri->shared_gen) { + if (dirp->cache_index == effective_dir->readdir_cache.size()) { if (i == 0) { ceph_assert(!dirp->inode->is_complete_and_ordered()); dir->readdir_cache.reserve(dirp->cache_index + numdn); } - dir->readdir_cache.push_back(dn); - } else if (dirp->cache_index < dir->readdir_cache.size()) { + effective_dir->readdir_cache.push_back(dn); + } else if 
(dirp->cache_index < effective_dir->readdir_cache.size()) { if (dirp->inode->is_complete_and_ordered()) - ceph_assert(dir->readdir_cache[dirp->cache_index] == dn); + ceph_assert(effective_dir->readdir_cache[dirp->cache_index] == dn); else - dir->readdir_cache[dirp->cache_index] = dn; + effective_dir->readdir_cache[dirp->cache_index] = dn; } else { ceph_abort_msg("unexpected readdir buffer idx"); } @@ -1449,6 +1483,8 @@ void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, if (dir->is_empty()) close_dir(dir); + if (dir_other && dir_other->is_empty()) + close_dir(dir_other); } } @@ -1608,10 +1644,20 @@ Inode* Client::insert_trace(MetaRequest *request, MetaSession *session) if (in) { if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP) { - insert_readdir_results(request, session, in); + insert_readdir_results(request, + session, + in, + nullptr); } else if (op == CEPH_MDS_OP_LOOKUPNAME) { // hack: return parent inode instead in = diri; + } else if (op == CEPH_MDS_OP_READDIR_SNAPDIFF) { + // provide both request's inode (aka snapA) and traced one (snapB) + // to properly match snapdiff results + insert_readdir_results(request, + session, + request->inode(), + in); } if (request->dentry() == NULL && in != request->inode()) { @@ -1685,7 +1731,7 @@ mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri) * I think the MDS should be able to redirect as needed*/ in = in->get_first_parent()->dir->parent_inode; else { - ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl; + ldout(cct, 10) << __func__ << "got unlinked inode, can't look at parent" << dendl; break; } } @@ -2333,6 +2379,12 @@ void Client::_closed_mds_session(MetaSession *s, int err, bool rejected) mds_sessions.erase(s->mds_num); } +static void reinit_mds_features(MetaSession *session, + const MConstRef<MClientSession>& m) { + session->mds_features = std::move(m->supported_features); + session->mds_metric_flags = std::move(m->metric_spec.metric_flags); +} + void
Client::handle_client_session(const MConstRef<MClientSession>& m) { mds_rank_t from = mds_rank_t(m->get_source().num()); @@ -2351,6 +2403,13 @@ void Client::handle_client_session(const MConstRef<MClientSession>& m) if (session->state == MetaSession::STATE_OPEN) { ldout(cct, 10) << "mds." << from << " already opened, ignore it" << dendl; + // The MDS could send a client_session(open) message even when + // the session state is STATE_OPEN. Normally, its fine to + // ignore this message, but, if the MDS sent this message just + // after it got upgraded, the MDS feature bits could differ + // than the one before the upgrade - so, refresh the feature + // bits the client holds. + reinit_mds_features(session.get(), m); return; } /* @@ -2360,8 +2419,7 @@ void Client::handle_client_session(const MConstRef<MClientSession>& m) if (!session->seq && m->get_seq()) session->seq = m->get_seq(); - session->mds_features = std::move(m->supported_features); - session->mds_metric_flags = std::move(m->metric_spec.metric_flags); + reinit_mds_features(session.get(), m); renew_caps(session.get()); session->state = MetaSession::STATE_OPEN; @@ -2546,7 +2604,7 @@ ref_t<MClientRequest> Client::build_client_request(MetaRequest *request, mds_ran } } - auto req = make_message<MClientRequest>(request->get_op(), old_version); + auto req = make_message<MClientRequest>(request->get_op(), session->mds_features); req->set_tid(request->tid); req->set_stamp(request->op_stamp); memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head)); @@ -3421,12 +3479,17 @@ Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn) lru.lru_insert_mid(dn); // mid or top?
- ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in - << " dn " << dn << " (new dn)" << dendl; + if(in) { + ldout(cct, 15) << "link dir " << *dir->parent_inode << " '" << name << "' to inode " << *in + << " dn " << *dn << " (new dn)" << dendl; + } else { + ldout(cct, 15) << "link dir " << *dir->parent_inode << " '" << name << "' " + << " dn " << *dn << " (new dn)" << dendl; + } } else { ceph_assert(!dn->inode); - ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in - << " dn " << dn << " (old dn)" << dendl; + ldout(cct, 15) << "link dir " << *dir->parent_inode << " '" << name << "' to inode " << in + << " dn " << *dn << " (old dn)" << dendl; } if (in) { // link to inode @@ -3529,7 +3592,7 @@ void Client::put_cap_ref(Inode *in, int cap) int put_nref = 0; int drop = last & ~in->caps_issued(); if (in->snapid == CEPH_NOSNAP) { - if ((last & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER)) && + if ((last & CEPH_CAP_FILE_WR) && !in->cap_snaps.empty() && in->cap_snaps.rbegin()->second.writing) { ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl; @@ -3543,6 +3606,10 @@ void Client::put_cap_ref(Inode *in, int cap) signal_cond_list(in->waitfor_commit); ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl; ++put_nref; + + if (!in->cap_snaps.empty()) { + flush_snaps(in); + } } } if (last & CEPH_CAP_FILE_CACHE) { @@ -4003,15 +4070,13 @@ void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc) in->cap_snaps.rbegin()->second.writing) { ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl; return; - } else if (in->caps_dirty() || - (used & CEPH_CAP_FILE_WR) || - (dirty & CEPH_CAP_ANY_WR)) { + } else if (dirty || (used & CEPH_CAP_FILE_WR)) { const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in)); ceph_assert(capsnapem.second); /* element inserted */ 
CapSnap &capsnap = capsnapem.first->second; capsnap.context = old_snapc; capsnap.issued = in->caps_issued(); - capsnap.dirty = in->caps_dirty(); + capsnap.dirty = dirty; capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER); @@ -4058,9 +4123,11 @@ void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used) } if (used & CEPH_CAP_FILE_BUFFER) { - capsnap.writing = 1; ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used - << " WRBUFFER, delaying" << dendl; + << " WRBUFFER, trigger to flush dirty buffer" << dendl; + + /* trigger to flush the buffer */ + _flush(in, new C_Client_FlushComplete(this, in)); } else { capsnap.dirty_data = 0; flush_snaps(in); @@ -4412,11 +4479,19 @@ void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id if (flags & CEPH_CAP_FLAG_AUTH) { if (in->auth_cap != &cap && (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) { - if (in->auth_cap && in->flushing_cap_item.is_on_list()) { - ldout(cct, 10) << __func__ << " changing auth cap: " - << "add myself to new auth MDS' flushing caps list" << dendl; - adjust_session_flushing_caps(in, in->auth_cap->session, mds_session); + if (in->auth_cap) { + if (in->flushing_cap_item.is_on_list()) { + ldout(cct, 10) << __func__ << " changing auth cap: " + << "add myself to new auth MDS' flushing caps list" << dendl; + adjust_session_flushing_caps(in, in->auth_cap->session, mds_session); + } + if (in->dirty_cap_item.is_on_list()) { + ldout(cct, 10) << __func__ << " changing auth cap: " + << "add myself to new auth MDS' dirty caps list" << dendl; + mds_session->get_dirty_list().push_back(&in->dirty_cap_item); + } } + in->auth_cap = &cap; } } @@ -5263,24 +5338,48 @@ void Client::handle_caps(const MConstRef<MClientCaps>& m) got_mds_push(session.get()); + bool do_cap_release = false; Inode *in; vinodeno_t vino(m->get_ino(), CEPH_NOSNAP); if (auto it = inode_map.find(vino); it != inode_map.end()) { in = it->second; + + /* MDS maybe waiting for cap release with
increased seq */ + switch (m->get_op()) { + case CEPH_CAP_OP_REVOKE: + case CEPH_CAP_OP_GRANT: + if (!in->caps.count(mds)) { + do_cap_release = true; + ldout(cct, 5) << __func__ << " vino " << vino << " don't have cap " + << m->get_cap_id() << " op " << m->get_op() + << ", immediately releasing" << dendl; + } + } } else { - if (m->get_op() == CEPH_CAP_OP_IMPORT) { - ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl; - session->enqueue_cap_release( - m->get_ino(), - m->get_cap_id(), - m->get_seq(), - m->get_mseq(), - cap_epoch_barrier); - } else { - ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl; + /* MDS maybe waiting for cap release with increased seq */ + switch (m->get_op()) { + case CEPH_CAP_OP_IMPORT: + case CEPH_CAP_OP_REVOKE: + case CEPH_CAP_OP_GRANT: + do_cap_release = true; + ldout(cct, 5) << __func__ << " don't have vino " << vino << " op " + << m->get_op() << ", immediately releasing" << dendl; + break; + default: + ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl; + return; } + } + + // In case the mds is waiting on e.g. a revocation + if (do_cap_release) { + session->enqueue_cap_release( + m->get_ino(), + m->get_cap_id(), + m->get_seq(), + m->get_mseq(), + cap_epoch_barrier); - // in case the mds is waiting on e.g. 
a revocation flush_cap_releases(); return; } @@ -5757,6 +5856,13 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const M } } + // just in case the caps was released just before we get the revoke msg + if (!check && m->get_op() == CEPH_CAP_OP_REVOKE) { + cap->wanted = 0; // don't let check_caps skip sending a response to MDS + check = true; + flags = CHECK_CAPS_NODELAY; + } + if (check) check_caps(in, flags); @@ -6888,6 +6994,13 @@ void Client::collect_and_send_global_metrics() { ldout(cct, 20) << __func__ << dendl; ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); + /* Do not send the metrics until the MDS rank is ready */ + if (!mdsmap->is_active((mds_rank_t)0)) { + ldout(cct, 5) << __func__ << " MDS rank 0 is not ready yet -- not sending metric" + << dendl; + return; + } + if (!have_open_session((mds_rank_t)0)) { ldout(cct, 5) << __func__ << ": no session with rank=0 -- not sending metric" << dendl; @@ -7059,7 +7172,8 @@ bool Client::_dentry_valid(const Dentry *dn) } int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target, - const UserPerm& perms, std::string* alternate_name) + const UserPerm& perms, std::string* alternate_name, + bool is_rename) { int r = 0; Dentry *dn = NULL; @@ -7138,6 +7252,19 @@ relookup: } else { ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl; } + + // In rare case during the rename if another thread tries to + // lookup the dst dentry, it may get an inconsistent result + // that both src dentry and dst dentry will link to the same + // inode at the same time. + // Will wait the rename to finish and try it again. + if (!is_rename && dn->is_renaming) { + ldout(cct, 1) << __func__ << " dir " << *dir + << " rename is on the way, will wait for dn '" + << dname << "'" << dendl; + wait_on_list(waiting_for_rename); + goto relookup; + } } else { // can we conclude ENOENT locally? 
if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) && @@ -9055,7 +9182,8 @@ void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp) dirp->buffer.clear(); } -int Client::_readdir_get_frag(dir_result_t *dirp) +int Client::_readdir_get_frag(int op, dir_result_t* dirp, + fill_readdir_args_cb_t fill_req_cb) { ceph_assert(dirp); ceph_assert(dirp->inode); @@ -9070,33 +9198,18 @@ int Client::_readdir_get_frag(dir_result_t *dirp) ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg << " offset " << hex << dirp->offset << dec << dendl; - int op = CEPH_MDS_OP_READDIR; - if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR) - op = CEPH_MDS_OP_LSSNAP; - InodeRef& diri = dirp->inode; MetaRequest *req = new MetaRequest(op); - filepath path; - diri->make_nosnap_relative_path(path); - req->set_filepath(path); - req->set_inode(diri.get()); - req->head.args.readdir.frag = fg; - req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS; - if (dirp->last_name.length()) { - req->path2.set_path(dirp->last_name); - } else if (dirp->hash_order()) { - req->head.args.readdir.offset_hash = dirp->offset_high(); - } - req->dirp = dirp; - + fill_req_cb(dirp, req, diri, fg); + bufferlist dirbl; int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl); if (res == -CEPHFS_EAGAIN) { ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl; _readdir_rechoose_frag(dirp); - return _readdir_get_frag(dirp); + return _readdir_get_frag(op, dirp, fill_req_cb); } if (res == 0) { @@ -9121,7 +9234,8 @@ int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p, { ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino - << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec + << " last_name " << dirp->last_name + << " offset " << hex << dirp->offset << dec << dendl; Dir *dir = dirp->inode->dir; @@ -9210,8 +9324,57 @@ int 
Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p, return 0; } -int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p, - unsigned want, unsigned flags, bool getref) +int Client::readdir_r_cb(dir_result_t* d, + add_dirent_cb_t cb, + void* p, + unsigned want, + unsigned flags, + bool getref) +{ + auto fill_readdir_cb = [](dir_result_t* dirp, + MetaRequest* req, + InodeRef& diri, + frag_t fg) { + filepath path; + diri->make_nosnap_relative_path(path); + req->set_filepath(path); + req->set_inode(diri.get()); + req->head.args.readdir.frag = fg; + req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS; + if (dirp->last_name.length()) { + req->path2.set_path(dirp->last_name); + } else if (dirp->hash_order()) { + req->head.args.readdir.offset_hash = dirp->offset_high(); + } + req->dirp = dirp; + }; + int op = CEPH_MDS_OP_READDIR; + if (d->inode && d->inode->snapid == CEPH_SNAPDIR) + op = CEPH_MDS_OP_LSSNAP; + return _readdir_r_cb(op, + d, + cb, + fill_readdir_cb, + p, + want, + flags, + getref, + false); +} + +// +// NB: this is used for both readdir and readdir_snapdiff results processing +// hence it should be request type agnostic +// +int Client::_readdir_r_cb(int op, + dir_result_t *d, + add_dirent_cb_t cb, + fill_readdir_args_cb_t fill_cb, + void *p, + unsigned want, + unsigned flags, + bool getref, + bool bypass_cache) { int caps = statx_to_mask(flags, want); @@ -9301,12 +9464,14 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p, } // can we read from our cache? 
- ldout(cct, 10) << "offset " << hex << dirp->offset << dec + ldout(cct, 10) << __func__ + << " offset " << hex << dirp->offset << dec << " snapid " << dirp->inode->snapid << " (complete && ordered) " << dirp->inode->is_complete_and_ordered() << " issued " << ccap_string(dirp->inode->caps_issued()) << dendl; - if (dirp->inode->snapid != CEPH_SNAPDIR && + if (!bypass_cache && + dirp->inode->snapid != CEPH_SNAPDIR && dirp->inode->is_complete_and_ordered() && dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) { int err = _readdir_cache_cb(dirp, cb, p, caps, getref); @@ -9320,7 +9485,7 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p, bool check_caps = true; if (!dirp->is_cached()) { - int r = _readdir_get_frag(dirp); + int r = _readdir_get_frag(op, dirp, fill_cb); if (r) return r; // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is @@ -9329,7 +9494,8 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p, } frag_t fg = dirp->buffer_frag; - ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size() + ldout(cct, 10) << __func__ + << " frag " << fg << " buffer size " << dirp->buffer.size() << " offset " << hex << dirp->offset << dendl; for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(), @@ -9364,7 +9530,9 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p, r = cb(p, &de, &stx, next_off, inode); // _next_ offset cl.lock(); - ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec + ldout(cct, 15) << __func__ + << " de " << de.d_name << " off " << hex << next_off - 1 << dec + << " snap " << entry.inode->snapid << " = " << r << dendl; if (r < 0) return r; @@ -9386,7 +9554,8 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p, continue; } - if (diri->shared_gen == dirp->start_shared_gen && + if (!bypass_cache && + diri->shared_gen == dirp->start_shared_gen && diri->dir_release_count == 
dirp->release_count) { if (diri->dir_ordered_count == dirp->ordered_count) { ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl; @@ -9492,6 +9661,81 @@ int Client::readdirplus_r(dir_result_t *d, struct dirent *de, return 0; } +int Client::readdir_snapdiff(dir_result_t* d1, snapid_t snap2, + struct dirent* out_de, + snapid_t* out_snap) +{ + if (!d1 || !d1->inode || d1->inode->snapid == snap2) { + lderr(cct) << __func__ << " invalid parameters: " + << " d1:" << d1 + << " d1->inode:" << (d1 ? d1->inode : nullptr) + << " snap2 id :" << snap2 + << dendl; + errno = EINVAL; + return -errno; + } + + auto& de = d1->de; + ceph_statx stx; + single_readdir sr; + sr.de = &de; + sr.stx = &stx; + sr.inode = NULL; + sr.full = false; + + auto fill_snapdiff_cb = [&](dir_result_t* dirp, + MetaRequest* req, + InodeRef& diri, + frag_t fg) { + filepath path; + diri->make_nosnap_relative_path(path); + req->set_filepath(path); + req->set_inode(diri.get()); + req->head.args.snapdiff.snap_other = snap2; + req->head.args.snapdiff.frag = fg; + req->head.args.snapdiff.flags = CEPH_READDIR_REPLY_BITFLAGS; + if (dirp->last_name.length()) { + req->path2.set_path(dirp->last_name); + } else if (dirp->hash_order()) { + req->head.args.snapdiff.offset_hash = dirp->offset_high(); + } + req->dirp = dirp; + }; + + // our callback fills the dirent and sets sr.full=true on first + // call, and returns -1 the second time around. + int ret = _readdir_r_cb(CEPH_MDS_OP_READDIR_SNAPDIFF, + d1, + _readdir_single_dirent_cb, + fill_snapdiff_cb, + (void*)&sr, + 0, + AT_STATX_DONT_SYNC, + false, + true); + if (ret < -1) { + lderr(cct) << __func__ << " error: " + << cpp_strerror(ret) + << dendl; + errno = -ret; // this sucks. 
+ return ret; + } + + ldout(cct, 15) << __func__ << " " << ret + << " " << sr.de->d_name + << " " << stx.stx_dev + << dendl; + if (sr.full) { + if (out_de) { + *out_de = de; + } + if (out_snap) { + *out_snap = stx.stx_dev; + } + return 1; + } + return 0; +} /* getdents */ struct getdents_result { @@ -13569,6 +13813,8 @@ int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev, MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD); + req->set_inode_owner_uid_gid(perms.uid(), perms.gid()); + filepath path; dir->make_nosnap_relative_path(path); path.push_dentry(name); @@ -13713,6 +13959,8 @@ int Client::_create(Inode *dir, const char *name, int flags, mode_t mode, MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE); + req->set_inode_owner_uid_gid(perms.uid(), perms.gid()); + filepath path; dir->make_nosnap_relative_path(path); path.push_dentry(name); @@ -13790,6 +14038,9 @@ int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& pe MetaRequest *req = new MetaRequest(is_snap_op ? CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR); + if (!is_snap_op) + req->set_inode_owner_uid_gid(perm.uid(), perm.gid()); + filepath path; dir->make_nosnap_relative_path(path); path.push_dentry(name); @@ -13928,6 +14179,8 @@ int Client::_symlink(Inode *dir, const char *name, const char *target, MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK); + req->set_inode_owner_uid_gid(perms.uid(), perms.gid()); + filepath path; dir->make_nosnap_relative_path(path); path.push_dentry(name); @@ -14180,11 +14433,13 @@ int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const ch else return -CEPHFS_EROFS; } + + // don't allow cross-quota renames if (cct->_conf.get_val<bool>("client_quota") && fromdir != todir) { Inode *fromdir_root = - fromdir->quota.is_enabled(QUOTA_MAX_FILES) ? fromdir : get_quota_root(fromdir, perm, QUOTA_MAX_FILES); + fromdir->quota.is_enabled() ?
fromdir : get_quota_root(fromdir, perm); Inode *todir_root = - todir->quota.is_enabled(QUOTA_MAX_FILES) ? todir : get_quota_root(todir, perm, QUOTA_MAX_FILES); + todir->quota.is_enabled() ? todir : get_quota_root(todir, perm); if (fromdir_root != todir_root) { return -CEPHFS_EXDEV; } @@ -14212,12 +14467,13 @@ int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const ch req->old_dentry_drop = CEPH_CAP_FILE_SHARED; req->old_dentry_unless = CEPH_CAP_FILE_EXCL; + de->is_renaming = true; req->set_dentry(de); req->dentry_drop = CEPH_CAP_FILE_SHARED; req->dentry_unless = CEPH_CAP_FILE_EXCL; InodeRef oldin, otherin; - res = _lookup(fromdir, fromname, 0, &oldin, perm); + res = _lookup(fromdir, fromname, 0, &oldin, perm, nullptr, true); if (res < 0) goto fail; @@ -14226,7 +14482,7 @@ int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const ch req->set_old_inode(oldinode); req->old_inode_drop = CEPH_CAP_LINK_SHARED; - res = _lookup(todir, toname, 0, &otherin, perm); + res = _lookup(todir, toname, 0, &otherin, perm, nullptr, true); switch (res) { case 0: { @@ -14255,6 +14511,12 @@ int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const ch res = make_request(req, perm, &target); ldout(cct, 10) << "rename result is " << res << dendl; + // if rename fails it will miss waking up the waiters + if (op == CEPH_MDS_OP_RENAME && de->is_renaming) { + de->is_renaming = false; + signal_cond_list(waiting_for_rename); + } + // renamed item from our cache trim_cache();