1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include <boost/lexical_cast.hpp>
16 #include "include/ceph_assert.h" // lexical_cast includes system assert.h
18 #include <boost/config/warning_disable.hpp>
19 #include <boost/fusion/include/std_pair.hpp>
20 #include <boost/range/adaptor/reversed.hpp>
28 #include "MDBalancer.h"
30 #include "SnapClient.h"
32 #include "cephfs_features.h"
34 #include "msg/Messenger.h"
36 #include "osdc/Objecter.h"
38 #include "events/EUpdate.h"
39 #include "events/ESlaveUpdate.h"
40 #include "events/ESession.h"
41 #include "events/EOpen.h"
42 #include "events/ECommitted.h"
44 #include "include/stringify.h"
45 #include "include/filepath.h"
46 #include "common/errno.h"
47 #include "common/Timer.h"
48 #include "common/perf_counters.h"
49 #include "include/compat.h"
50 #include "osd/OSDMap.h"
57 #include <string_view>
59 #include "common/config.h"
61 #define dout_context g_ceph_context
62 #define dout_subsys ceph_subsys_mds
64 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
// NOTE(review): this extraction is missing interior source lines (the
// embedded original line numbers jump); member declarations and closing
// braces of this class are not visible. Comments describe visible code only.
// Generic completion callback bound to a Server. get_mds() presumably
// returns the owning Server's MDSRank — body not visible here; TODO confirm.
66 class ServerContext
: public MDSContext
{
69 MDSRank
*get_mds() override
// Construct from a Server pointer; a null Server is a programming error
// (asserted immediately).
75 explicit ServerContext(Server
*s
) : server(s
) {
76 ceph_assert(server
!= NULL
);
// NOTE(review): fragment — interior lines and closing braces are missing
// from this extraction; comments describe visible code only.
// Journal-completion callback bound to a Server, optionally carrying the
// MDRequestRef whose journal entry was committed.
80 class ServerLogContext
: public MDSLogContextBase
{
83 MDSRank
*get_mds() override
// Called just before finish(): stamps the request's event trail so the
// op tracker records when the journal commit happened. (A null-check on
// mdr likely exists on a missing line — TODO confirm.)
89 void pre_finish(int r
) override
{
91 mdr
->mark_event("journal_committed: ");
// Constructor without a request: only the Server is bound.
94 explicit ServerLogContext(Server
*s
) : server(s
) {
95 ceph_assert(server
!= NULL
);
// Constructor binding both the Server and the in-flight request.
97 explicit ServerLogContext(Server
*s
, MDRequestRef
& r
) : server(s
), mdr(r
) {
98 ceph_assert(server
!= NULL
);
102 void Server::create_logger()
104 PerfCountersBuilder
plb(g_ceph_context
, "mds_server", l_mdss_first
, l_mdss_last
);
106 plb
.add_u64_counter(l_mdss_handle_client_request
, "handle_client_request",
107 "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING
);
108 plb
.add_u64_counter(l_mdss_handle_slave_request
, "handle_slave_request",
109 "Slave requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING
);
110 plb
.add_u64_counter(l_mdss_handle_client_session
,
111 "handle_client_session", "Client session messages", "hcs",
112 PerfCountersBuilder::PRIO_INTERESTING
);
113 plb
.add_u64_counter(l_mdss_cap_revoke_eviction
, "cap_revoke_eviction",
114 "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING
);
116 // fop latencies are useful
117 plb
.set_prio_default(PerfCountersBuilder::PRIO_USEFUL
);
118 plb
.add_time_avg(l_mdss_req_lookuphash_latency
, "req_lookuphash_latency",
119 "Request type lookup hash of inode latency");
120 plb
.add_time_avg(l_mdss_req_lookupino_latency
, "req_lookupino_latency",
121 "Request type lookup inode latency");
122 plb
.add_time_avg(l_mdss_req_lookupparent_latency
, "req_lookupparent_latency",
123 "Request type lookup parent latency");
124 plb
.add_time_avg(l_mdss_req_lookupname_latency
, "req_lookupname_latency",
125 "Request type lookup name latency");
126 plb
.add_time_avg(l_mdss_req_lookup_latency
, "req_lookup_latency",
127 "Request type lookup latency");
128 plb
.add_time_avg(l_mdss_req_lookupsnap_latency
, "req_lookupsnap_latency",
129 "Request type lookup snapshot latency");
130 plb
.add_time_avg(l_mdss_req_getattr_latency
, "req_getattr_latency",
131 "Request type get attribute latency");
132 plb
.add_time_avg(l_mdss_req_setattr_latency
, "req_setattr_latency",
133 "Request type set attribute latency");
134 plb
.add_time_avg(l_mdss_req_setlayout_latency
, "req_setlayout_latency",
135 "Request type set file layout latency");
136 plb
.add_time_avg(l_mdss_req_setdirlayout_latency
, "req_setdirlayout_latency",
137 "Request type set directory layout latency");
138 plb
.add_time_avg(l_mdss_req_setxattr_latency
, "req_setxattr_latency",
139 "Request type set extended attribute latency");
140 plb
.add_time_avg(l_mdss_req_rmxattr_latency
, "req_rmxattr_latency",
141 "Request type remove extended attribute latency");
142 plb
.add_time_avg(l_mdss_req_readdir_latency
, "req_readdir_latency",
143 "Request type read directory latency");
144 plb
.add_time_avg(l_mdss_req_setfilelock_latency
, "req_setfilelock_latency",
145 "Request type set file lock latency");
146 plb
.add_time_avg(l_mdss_req_getfilelock_latency
, "req_getfilelock_latency",
147 "Request type get file lock latency");
148 plb
.add_time_avg(l_mdss_req_create_latency
, "req_create_latency",
149 "Request type create latency");
150 plb
.add_time_avg(l_mdss_req_open_latency
, "req_open_latency",
151 "Request type open latency");
152 plb
.add_time_avg(l_mdss_req_mknod_latency
, "req_mknod_latency",
153 "Request type make node latency");
154 plb
.add_time_avg(l_mdss_req_link_latency
, "req_link_latency",
155 "Request type link latency");
156 plb
.add_time_avg(l_mdss_req_unlink_latency
, "req_unlink_latency",
157 "Request type unlink latency");
158 plb
.add_time_avg(l_mdss_req_rmdir_latency
, "req_rmdir_latency",
159 "Request type remove directory latency");
160 plb
.add_time_avg(l_mdss_req_rename_latency
, "req_rename_latency",
161 "Request type rename latency");
162 plb
.add_time_avg(l_mdss_req_mkdir_latency
, "req_mkdir_latency",
163 "Request type make directory latency");
164 plb
.add_time_avg(l_mdss_req_symlink_latency
, "req_symlink_latency",
165 "Request type symbolic link latency");
166 plb
.add_time_avg(l_mdss_req_lssnap_latency
, "req_lssnap_latency",
167 "Request type list snapshot latency");
168 plb
.add_time_avg(l_mdss_req_mksnap_latency
, "req_mksnap_latency",
169 "Request type make snapshot latency");
170 plb
.add_time_avg(l_mdss_req_rmsnap_latency
, "req_rmsnap_latency",
171 "Request type remove snapshot latency");
172 plb
.add_time_avg(l_mdss_req_renamesnap_latency
, "req_renamesnap_latency",
173 "Request type rename snapshot latency");
175 plb
.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY
);
176 plb
.add_u64_counter(l_mdss_dispatch_client_request
, "dispatch_client_request",
177 "Client requests dispatched");
178 plb
.add_u64_counter(l_mdss_dispatch_slave_request
, "dispatch_server_request",
179 "Server requests dispatched");
181 logger
= plb
.create_perf_counters();
182 g_ceph_context
->get_perfcounters_collection()->add(logger
);
// Server constructor: caches pointers to the rank's MDCache/MDLog and
// snapshots config-driven knobs. (The initializer that binds the `mds`
// member to parameter `m` is on a missing line of this extraction —
// TODO confirm against upstream.)
185 Server::Server(MDSRank
*m
) :
187 mdcache(mds
->mdcache
), mdlog(mds
->mdlog
),
190 reconnect_done(NULL
),
191 failed_reconnects(0),
192 reconnect_evicting(false),
193 terminating_sessions(false),
// Decay-rate throttle for cap recall, seeded from config.
194 recall_throttle(g_conf().get_val
<double>("mds_recall_max_decay_rate"))
// How long a client may sit on an un-acked cap revoke before eviction.
196 cap_revoke_eviction_timeout
= g_conf().get_val
<double>("mds_cap_revoke_eviction_timeout");
// Feature bits this MDS advertises to clients at session open.
197 supported_features
= feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED
);
200 void Server::dispatch(const Message::const_ref
&m
)
202 switch (m
->get_type()) {
203 case CEPH_MSG_CLIENT_RECONNECT
:
204 handle_client_reconnect(MClientReconnect::msgref_cast(m
));
209 // handle_slave_request()/handle_client_session() will wait if necessary
210 if (m
->get_type() == CEPH_MSG_CLIENT_REQUEST
&& !mds
->is_active()) {
211 const auto &req
= MClientRequest::msgref_cast(m
);
212 if (mds
->is_reconnect() || mds
->get_want_state() == CEPH_MDS_STATE_RECONNECT
) {
213 Session
*session
= mds
->get_session(req
);
214 if (!session
|| session
->is_closed()) {
215 dout(5) << "session is closed, dropping " << req
->get_reqid() << dendl
;
218 bool queue_replay
= false;
219 if (req
->is_replay()) {
220 dout(3) << "queuing replayed op" << dendl
;
223 !session
->have_completed_request(req
->get_reqid().tid
, nullptr)) {
224 mdcache
->add_replay_ino_alloc(inodeno_t(req
->head
.ino
));
226 } else if (req
->get_retry_attempt()) {
227 // process completed request in clientreplay stage. The completed request
228 // might have created new file/directorie. This guarantees MDS sends a reply
229 // to client before other request modifies the new file/directorie.
230 if (session
->have_completed_request(req
->get_reqid().tid
, NULL
)) {
231 dout(3) << "queuing completed op" << dendl
;
234 // this request was created before the cap reconnect message, drop any embedded
236 req
->releases
.clear();
239 req
->mark_queued_for_replay();
240 mds
->enqueue_replay(new C_MDS_RetryMessage(mds
, m
));
245 bool wait_for_active
= true;
246 if (mds
->is_stopping()) {
247 wait_for_active
= false;
248 } else if (mds
->is_clientreplay()) {
249 if (req
->is_queued_for_replay()) {
250 wait_for_active
= false;
253 if (wait_for_active
) {
254 dout(3) << "not active yet, waiting" << dendl
;
255 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
260 switch (m
->get_type()) {
261 case CEPH_MSG_CLIENT_SESSION
:
262 handle_client_session(MClientSession::msgref_cast(m
));
264 case CEPH_MSG_CLIENT_REQUEST
:
265 handle_client_request(MClientRequest::msgref_cast(m
));
267 case CEPH_MSG_CLIENT_RECLAIM
:
268 handle_client_reclaim(MClientReclaim::msgref_cast(m
));
270 case MSG_MDS_SLAVE_REQUEST
:
271 handle_slave_request(MMDSSlaveRequest::msgref_cast(m
));
274 derr
<< "server unknown message " << m
->get_type() << dendl
;
275 ceph_abort_msg("server unknown message");
281 // ----------------------------------------------------------
282 // SESSION management
// NOTE(review): fragment — several member declarations and closing braces
// are on missing lines of this extraction.
// Journal callback for session open/close: once the ESession entry is
// safely logged, finish() hands control to Server::_session_logged with
// the captured session state (state seq, open flag, sessionmap version,
// and any preallocated inos being released plus the inotable version).
284 class C_MDS_session_finish
: public ServerLogContext
{
289 interval_set
<inodeno_t
> inos
;
// Variant without inos: inotablev is pinned to 0 (no inotable change).
293 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
, Context
*fin_
= NULL
) :
294 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
), inotablev(0), fin(fin_
) { }
// Variant releasing preallocated inos at inotable version iv.
295 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
, interval_set
<inodeno_t
>& i
, version_t iv
, Context
*fin_
= NULL
) :
296 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
), inos(i
), inotablev(iv
), fin(fin_
) { }
297 void finish(int r
) override
{
299 server
->_session_logged(session
, state_seq
, open
, cmapv
, inos
, inotablev
);
// NOTE(review): fragment — the lines that assign `session` on a match and
// the loop/function closing braces are missing from this extraction.
// Linear scan of the session map for a session whose client-supplied
// "uuid" metadata equals `uuid`; returns nullptr when none matches.
306 Session
* Server::find_session_by_uuid(std::string_view uuid
)
308 Session
* session
= nullptr;
309 for (auto& it
: mds
->sessionmap
.get_sessions()) {
310 auto& metadata
= it
.second
->info
.client_metadata
;
// Skip sessions with no uuid, or a different uuid.
312 auto p
= metadata
.find("uuid");
313 if (p
== metadata
.end() || p
->second
!= uuid
)
// Consistency checks on duplicate-uuid matches: the reclaim linkage
// between the two sessions must agree in both directions.
318 } else if (!session
->reclaiming_from
) {
319 assert(it
.second
->reclaiming_from
== session
);
322 assert(session
->reclaiming_from
== it
.second
);
328 void Server::reclaim_session(Session
*session
, const MClientReclaim::const_ref
&m
)
330 if (!session
->is_open() && !session
->is_stale()) {
331 dout(10) << "session not open, dropping this req" << dendl
;
335 auto reply
= MClientReclaimReply::create(0);
336 if (m
->get_uuid().empty()) {
337 dout(10) << __func__
<< " invalid message (no uuid)" << dendl
;
338 reply
->set_result(-EINVAL
);
339 mds
->send_message_client(reply
, session
);
343 unsigned flags
= m
->get_flags();
344 if (flags
!= CEPH_RECLAIM_RESET
) { // currently only support reset
345 dout(10) << __func__
<< " unsupported flags" << dendl
;
346 reply
->set_result(-EOPNOTSUPP
);
347 mds
->send_message_client(reply
, session
);
351 Session
* target
= find_session_by_uuid(m
->get_uuid());
353 if (session
->info
.auth_name
!= target
->info
.auth_name
) {
354 dout(10) << __func__
<< " session auth_name " << session
->info
.auth_name
355 << " != target auth_name " << target
->info
.auth_name
<< dendl
;
356 reply
->set_result(-EPERM
);
357 mds
->send_message_client(reply
, session
);
360 assert(!target
->reclaiming_from
);
361 assert(!session
->reclaiming_from
);
362 session
->reclaiming_from
= target
;
363 reply
->set_addrs(entity_addrvec_t(target
->info
.inst
.addr
));
366 if (flags
& CEPH_RECLAIM_RESET
) {
367 finish_reclaim_session(session
, reply
);
374 void Server::finish_reclaim_session(Session
*session
, const MClientReclaimReply::ref
&reply
)
376 Session
*target
= session
->reclaiming_from
;
378 session
->reclaiming_from
= nullptr;
382 int64_t session_id
= session
->get_client().v
;
383 send_reply
= new FunctionContext([this, session_id
, reply
](int r
) {
384 assert(mds
->mds_lock
.is_locked_by_me());
385 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(session_id
));
389 auto epoch
= mds
->objecter
->with_osdmap([](const OSDMap
&map
){ return map
.get_epoch(); });
390 reply
->set_epoch(epoch
);
391 mds
->send_message_client(reply
, session
);
394 send_reply
= nullptr;
397 bool blacklisted
= mds
->objecter
->with_osdmap([target
](const OSDMap
&map
) {
398 return map
.is_blacklisted(target
->info
.inst
.addr
);
401 if (blacklisted
|| !g_conf()->mds_session_blacklist_on_evict
) {
402 kill_session(target
, send_reply
);
404 std::stringstream ss
;
405 mds
->evict_client(target
->get_client().v
, false, true, ss
, send_reply
);
408 mds
->send_message_client(reply
, session
);
// NOTE(review): fragment — the early-return paths (sessionless drop,
// replay wait) and closing braces are on missing lines of this extraction.
// Entry point for CEPH_MSG_CLIENT_RECLAIM: route to finish_reclaim_session
// when the client signals FLAG_FINISH, otherwise start a reclaim.
412 void Server::handle_client_reclaim(const MClientReclaim::const_ref
&m
)
414 Session
*session
= mds
->get_session(m
);
415 dout(3) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
416 assert(m
->get_source().is_client()); // should _not_ come from an mds!
419 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
// Too early in boot: park the message until clientreplay is reached.
423 if (mds
->get_state() < MDSMap::STATE_CLIENTREPLAY
) {
424 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
428 if (m
->get_flags() & MClientReclaim::FLAG_FINISH
) {
429 finish_reclaim_session(session
);
431 reclaim_session(session
, m
);
435 void Server::handle_client_session(const MClientSession::const_ref
&m
)
438 Session
*session
= mds
->get_session(m
);
440 dout(3) << "handle_client_session " << *m
<< " from " << m
->get_source() << dendl
;
441 ceph_assert(m
->get_source().is_client()); // should _not_ come from an mds!
444 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
448 if (m
->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS
) {
449 // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
450 } else if (m
->get_op() == CEPH_SESSION_REQUEST_CLOSE
) {
451 // close requests need to be handled when mds is active
452 if (mds
->get_state() < MDSMap::STATE_ACTIVE
) {
453 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
457 if (mds
->get_state() < MDSMap::STATE_CLIENTREPLAY
) {
458 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
464 logger
->inc(l_mdss_handle_client_session
);
467 switch (m
->get_op()) {
468 case CEPH_SESSION_REQUEST_OPEN
:
469 if (session
->is_opening() ||
470 session
->is_open() ||
471 session
->is_stale() ||
472 session
->is_killing() ||
473 terminating_sessions
) {
474 dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl
;
477 ceph_assert(session
->is_closed() || session
->is_closing());
479 if (mds
->is_stopping()) {
480 dout(10) << "mds is stopping, dropping open req" << dendl
;
485 auto& addr
= session
->info
.inst
.addr
;
486 session
->set_client_metadata(client_metadata_t(m
->metadata
, m
->supported_features
));
487 auto& client_metadata
= session
->info
.client_metadata
;
489 auto log_session_status
= [this, m
, session
](std::string_view status
, std::string_view err
) {
490 auto now
= ceph_clock_now();
491 auto throttle_elapsed
= m
->get_recv_complete_stamp() - m
->get_throttle_stamp();
492 auto elapsed
= now
- m
->get_recv_stamp();
493 CachedStackStringStream css
;
494 *css
<< "New client session:"
495 << " addr=\"" << session
->info
.inst
.addr
<< "\""
496 << ",elapsed=" << elapsed
497 << ",throttled=" << throttle_elapsed
498 << ",status=\"" << status
<< "\"";
500 *css
<< ",error=\"" << err
<< "\"";
502 const auto& metadata
= session
->info
.client_metadata
;
503 if (auto it
= metadata
.find("root"); it
!= metadata
.end()) {
504 *css
<< ",root=\"" << it
->second
<< "\"";
506 dout(2) << css
->strv() << dendl
;
509 auto send_reject_message
= [this, &session
, &log_session_status
](std::string_view err_str
) {
510 auto m
= MClientSession::create(CEPH_SESSION_REJECT
);
511 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
))
512 m
->metadata
["error_string"] = err_str
;
513 mds
->send_message_client(m
, session
);
514 log_session_status("REJECTED", err_str
);
517 bool blacklisted
= mds
->objecter
->with_osdmap(
518 [&addr
](const OSDMap
&osd_map
) -> bool {
519 return osd_map
.is_blacklisted(addr
);
523 dout(10) << "rejecting blacklisted client " << addr
<< dendl
;
524 send_reject_message("blacklisted");
529 if (client_metadata
.features
.empty())
530 infer_supported_features(session
, client_metadata
);
532 dout(20) << __func__
<< " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl
;
533 dout(20) << " features: '" << client_metadata
.features
<< dendl
;
534 for (const auto& p
: client_metadata
) {
535 dout(20) << " " << p
.first
<< ": " << p
.second
<< dendl
;
538 feature_bitset_t missing_features
= required_client_features
;
539 missing_features
-= client_metadata
.features
;
540 if (!missing_features
.empty()) {
542 ss
<< "missing required features '" << missing_features
<< "'";
543 send_reject_message(ss
.str());
544 mds
->clog
->warn() << "client session lacks required features '"
545 << missing_features
<< "' denied (" << session
->info
.inst
<< ")";
550 // Special case for the 'root' metadata path; validate that the claimed
551 // root is actually within the caps of the session
552 if (auto it
= client_metadata
.find("root"); it
!= client_metadata
.end()) {
553 auto claimed_root
= it
->second
;
556 // claimed_root has a leading "/" which we strip before passing
558 if (claimed_root
.empty() || claimed_root
[0] != '/') {
560 ss
<< "invalue root '" << claimed_root
<< "'";
561 } else if (!session
->auth_caps
.path_capable(claimed_root
.substr(1))) {
563 ss
<< "non-allowable root '" << claimed_root
<< "'";
567 // Tell the client we're rejecting their open
568 send_reject_message(ss
.str());
569 mds
->clog
->warn() << "client session with " << ss
.str()
570 << " denied (" << session
->info
.inst
<< ")";
576 if (auto it
= client_metadata
.find("uuid"); it
!= client_metadata
.end()) {
577 if (find_session_by_uuid(it
->second
)) {
578 send_reject_message("duplicated session uuid");
579 mds
->clog
->warn() << "client session with duplicated session uuid '"
580 << it
->second
<< "' denied (" << session
->info
.inst
<< ")";
586 if (session
->is_closed())
587 mds
->sessionmap
.add_session(session
);
589 pv
= mds
->sessionmap
.mark_projected(session
);
590 sseq
= mds
->sessionmap
.set_state(session
, Session::STATE_OPENING
);
591 mds
->sessionmap
.touch_session(session
);
592 auto fin
= new FunctionContext([log_session_status
= std::move(log_session_status
)](int r
){
594 log_session_status("ACCEPTED", "");
596 mdlog
->start_submit_entry(new ESession(m
->get_source_inst(), true, pv
, client_metadata
),
597 new C_MDS_session_finish(this, session
, sseq
, true, pv
, fin
));
602 case CEPH_SESSION_REQUEST_RENEWCAPS
:
603 if (session
->is_open() || session
->is_stale()) {
604 mds
->sessionmap
.touch_session(session
);
605 if (session
->is_stale()) {
606 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
607 mds
->locker
->resume_stale_caps(session
);
608 mds
->sessionmap
.touch_session(session
);
610 auto reply
= MClientSession::create(CEPH_SESSION_RENEWCAPS
, m
->get_seq());
611 mds
->send_message_client(reply
, session
);
613 dout(10) << "ignoring renewcaps on non open|stale session (" << session
->get_state_name() << ")" << dendl
;
617 case CEPH_SESSION_REQUEST_CLOSE
:
619 if (session
->is_closed() ||
620 session
->is_closing() ||
621 session
->is_killing()) {
622 dout(10) << "already closed|closing|killing, dropping this req" << dendl
;
625 if (session
->is_importing()) {
626 dout(10) << "ignoring close req on importing session" << dendl
;
629 ceph_assert(session
->is_open() ||
630 session
->is_stale() ||
631 session
->is_opening());
632 if (m
->get_seq() < session
->get_push_seq()) {
633 dout(10) << "old push seq " << m
->get_seq() << " < " << session
->get_push_seq()
634 << ", dropping" << dendl
;
637 // We are getting a seq that is higher than expected.
638 // Handle the same as any other seqn error.
640 if (m
->get_seq() != session
->get_push_seq()) {
641 dout(0) << "old push seq " << m
->get_seq() << " != " << session
->get_push_seq()
642 << ", BUGGY!" << dendl
;
643 mds
->clog
->warn() << "incorrect push seq " << m
->get_seq() << " != "
644 << session
->get_push_seq() << ", dropping" << " from client : " << session
->get_human_name();
647 journal_close_session(session
, Session::STATE_CLOSING
, NULL
);
651 case CEPH_SESSION_FLUSHMSG_ACK
:
652 finish_flush_session(session
, m
->get_seq());
655 case CEPH_SESSION_REQUEST_FLUSH_MDLOG
:
656 if (mds
->is_active())
// NOTE(review): fragment — the early-return body of the guard and the
// function's closing brace are on missing lines of this extraction.
// Ask one client to ack all messages sent so far: register a gather sub
// that fires on the flush ack, then send CEPH_SESSION_FLUSHMSG with the
// sequence number to wait for. Sessions that are not open, have no
// connection, or lack EXPORT_PEER support are skipped.
666 void Server::flush_session(Session
*session
, MDSGatherBuilder
*gather
) {
667 if (!session
->is_open() ||
668 !session
->get_connection() ||
669 !session
->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER
)) {
673 version_t seq
= session
->wait_for_flush(gather
->new_sub());
674 mds
->send_message_client(
675 MClientSession::create(CEPH_SESSION_FLUSHMSG
, seq
), session
);
// Flush every client in `client_set`: each client id must have a live
// session (asserted), and each flush registers a sub-context on `gather`.
// (Closing braces are on missing lines of this extraction.)
678 void Server::flush_client_sessions(set
<client_t
>& client_set
, MDSGatherBuilder
& gather
)
680 for (set
<client_t
>::iterator p
= client_set
.begin(); p
!= client_set
.end(); ++p
) {
681 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(p
->v
));
682 ceph_assert(session
);
683 flush_session(session
, &gather
);
// Flush ack (CEPH_SESSION_FLUSHMSG_ACK) arrived up to `seq`: collect the
// contexts that were waiting on that flush and queue them on the rank.
687 void Server::finish_flush_session(Session
*session
, version_t seq
)
689 MDSContext::vec finished
;
690 session
->finish_flush(seq
, finished
);
691 mds
->queue_waiters(finished
);
694 void Server::_session_logged(Session
*session
, uint64_t state_seq
, bool open
, version_t pv
,
695 interval_set
<inodeno_t
>& inos
, version_t piv
)
697 dout(10) << "_session_logged " << session
->info
.inst
<< " state_seq " << state_seq
<< " " << (open
? "open":"close")
698 << " " << pv
<< dendl
;
701 ceph_assert(session
->is_closing() || session
->is_killing() ||
702 session
->is_opening()); // re-open closing session
703 session
->info
.prealloc_inos
.subtract(inos
);
704 mds
->inotable
->apply_release_ids(inos
);
705 ceph_assert(mds
->inotable
->get_version() == piv
);
708 mds
->sessionmap
.mark_dirty(session
);
711 if (session
->get_state_seq() != state_seq
) {
712 dout(10) << " journaled state_seq " << state_seq
<< " != current " << session
->get_state_seq()
713 << ", noop" << dendl
;
714 // close must have been canceled (by an import?), or any number of other things..
716 ceph_assert(session
->is_opening());
717 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
718 mds
->sessionmap
.touch_session(session
);
719 ceph_assert(session
->get_connection());
720 auto reply
= MClientSession::create(CEPH_SESSION_OPEN
);
721 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
))
722 reply
->supported_features
= supported_features
;
723 mds
->send_message_client(reply
, session
);
724 if (mdcache
->is_readonly()) {
725 auto m
= MClientSession::create(CEPH_SESSION_FORCE_RO
);
726 mds
->send_message_client(m
, session
);
728 } else if (session
->is_closing() ||
729 session
->is_killing()) {
730 // kill any lingering capabilities, leases, requests
731 while (!session
->caps
.empty()) {
732 Capability
*cap
= session
->caps
.front();
733 CInode
*in
= cap
->get_inode();
734 dout(20) << " killing capability " << ccap_string(cap
->issued()) << " on " << *in
<< dendl
;
735 mds
->locker
->remove_client_cap(in
, cap
, true);
737 while (!session
->leases
.empty()) {
738 ClientLease
*r
= session
->leases
.front();
739 CDentry
*dn
= static_cast<CDentry
*>(r
->parent
);
740 dout(20) << " killing client lease of " << *dn
<< dendl
;
741 dn
->remove_client_lease(r
, mds
->locker
);
743 if (client_reconnect_gather
.erase(session
->info
.get_client())) {
744 dout(20) << " removing client from reconnect set" << dendl
;
745 if (client_reconnect_gather
.empty()) {
746 dout(7) << " client " << session
->info
.inst
<< " was last reconnect, finishing" << dendl
;
747 reconnect_gather_finish();
750 if (client_reclaim_gather
.erase(session
->info
.get_client())) {
751 dout(20) << " removing client from reclaim set" << dendl
;
752 if (client_reclaim_gather
.empty()) {
753 dout(7) << " client " << session
->info
.inst
<< " was last reclaimed, finishing" << dendl
;
754 mds
->maybe_clientreplay_done();
758 if (session
->is_closing()) {
759 // mark con disposable. if there is a fault, we will get a
760 // reset and clean it up. if the client hasn't received the
761 // CLOSE message yet, they will reconnect and get an
762 // ms_handle_remote_reset() and realize they had in fact closed.
763 // do this *before* sending the message to avoid a possible
765 if (session
->get_connection()) {
766 // Conditional because terminate_sessions will indiscrimately
767 // put sessions in CLOSING whether they ever had a conn or not.
768 session
->get_connection()->mark_disposable();
772 mds
->send_message_client(MClientSession::create(CEPH_SESSION_CLOSE
), session
);
773 mds
->sessionmap
.set_state(session
, Session::STATE_CLOSED
);
775 mds
->sessionmap
.remove_session(session
);
776 } else if (session
->is_killing()) {
777 // destroy session, close connection
778 if (session
->get_connection()) {
779 session
->get_connection()->mark_down();
780 session
->get_connection()->set_priv(NULL
);
782 mds
->sessionmap
.remove_session(session
);
792 * Inject sessions from some source other than actual connections.
795 * - sessions inferred from journal replay
796 * - sessions learned from other MDSs during rejoin
797 * - sessions learned from other MDSs during dir/caps migration
798 * - sessions learned from other MDSs during a cross-MDS rename
800 version_t
Server::prepare_force_open_sessions(map
<client_t
,entity_inst_t
>& cm
,
801 map
<client_t
,client_metadata_t
>& cmm
,
802 map
<client_t
, pair
<Session
*,uint64_t> >& smap
)
804 version_t pv
= mds
->sessionmap
.get_projected();
806 dout(10) << "prepare_force_open_sessions " << pv
807 << " on " << cm
.size() << " clients"
810 mds
->objecter
->with_osdmap(
811 [this, &cm
, &cmm
](const OSDMap
&osd_map
) {
812 for (auto p
= cm
.begin(); p
!= cm
.end(); ) {
813 if (osd_map
.is_blacklisted(p
->second
.addr
)) {
814 dout(10) << " ignoring blacklisted client." << p
->first
815 << " (" << p
->second
.addr
<< ")" << dendl
;
824 for (map
<client_t
,entity_inst_t
>::iterator p
= cm
.begin(); p
!= cm
.end(); ++p
) {
825 Session
*session
= mds
->sessionmap
.get_or_add_session(p
->second
);
826 pv
= mds
->sessionmap
.mark_projected(session
);
828 if (session
->is_closed() ||
829 session
->is_closing() ||
830 session
->is_killing()) {
831 sseq
= mds
->sessionmap
.set_state(session
, Session::STATE_OPENING
);
832 auto q
= cmm
.find(p
->first
);
834 session
->info
.client_metadata
.merge(q
->second
);
836 ceph_assert(session
->is_open() ||
837 session
->is_opening() ||
838 session
->is_stale());
841 smap
[p
->first
] = make_pair(session
, sseq
);
842 session
->inc_importing();
847 void Server::finish_force_open_sessions(const map
<client_t
,pair
<Session
*,uint64_t> >& smap
,
851 * FIXME: need to carefully consider the race conditions between a
852 * client trying to close a session and an MDS doing an import
853 * trying to force open a session...
855 dout(10) << "finish_force_open_sessions on " << smap
.size() << " clients,"
856 << " initial v " << mds
->sessionmap
.get_version() << dendl
;
858 for (auto &it
: smap
) {
859 Session
*session
= it
.second
.first
;
860 uint64_t sseq
= it
.second
.second
;
862 if (session
->get_state_seq() != sseq
) {
863 dout(10) << "force_open_sessions skipping changed " << session
->info
.inst
<< dendl
;
865 dout(10) << "force_open_sessions opened " << session
->info
.inst
<< dendl
;
866 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
867 mds
->sessionmap
.touch_session(session
);
869 auto reply
= MClientSession::create(CEPH_SESSION_OPEN
);
870 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
))
871 reply
->supported_features
= supported_features
;
872 mds
->send_message_client(reply
, session
);
874 if (mdcache
->is_readonly())
875 mds
->send_message_client(MClientSession::create(CEPH_SESSION_FORCE_RO
), session
);
878 dout(10) << "force_open_sessions skipping already-open " << session
->info
.inst
<< dendl
;
879 ceph_assert(session
->is_open() || session
->is_stale());
883 session
->dec_importing();
886 mds
->sessionmap
.mark_dirty(session
);
889 dout(10) << __func__
<< ": final v " << mds
->sessionmap
.get_version() << dendl
;
// Fired once the journal is safe after terminate_sessions() queued the
// session closes: clears the flag so new session work may proceed.
// (Closing braces are on missing lines of this extraction.)
892 class C_MDS_TerminatedSessions
: public ServerContext
{
893 void finish(int r
) override
{
894 server
->terminating_sessions
= false;
897 explicit C_MDS_TerminatedSessions(Server
*s
) : ServerContext(s
) {}
// NOTE(review): fragment — the loop's end condition/increment and the
// `continue` for already-closing sessions are on missing lines.
// Shut down every client session: set terminating_sessions (checked by
// session-open handling), journal a close for each session not already
// closing/killing/closed, then clear the flag once the log is safe.
900 void Server::terminate_sessions()
902 dout(5) << "terminating all sessions..." << dendl
;
904 terminating_sessions
= true;
906 // kill them off. clients will retry etc.
907 set
<Session
*> sessions
;
908 mds
->sessionmap
.get_client_session_set(sessions
);
909 for (set
<Session
*>::const_iterator p
= sessions
.begin();
912 Session
*session
= *p
;
// Sessions already on their way out are left alone.
913 if (session
->is_closing() ||
914 session
->is_killing() ||
915 session
->is_closed())
917 journal_close_session(session
, Session::STATE_CLOSING
, NULL
);
// C_MDS_TerminatedSessions resets terminating_sessions when safe.
920 mdlog
->wait_for_safe(new C_MDS_TerminatedSessions(this));
924 void Server::find_idle_sessions()
926 auto now
= clock::now();
927 auto last_cleared_laggy
= mds
->last_cleared_laggy();
929 dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy
<< "s ago" << dendl
;
932 // (caps go stale, lease die)
933 double queue_max_age
= mds
->get_dispatch_queue_max_age(ceph_clock_now());
934 double cutoff
= queue_max_age
+ mds
->mdsmap
->get_session_timeout();
936 // don't kick clients if we've been laggy
937 if (last_cleared_laggy
< cutoff
) {
938 dout(10) << " last cleared laggy " << last_cleared_laggy
<< "s ago (< cutoff " << cutoff
939 << "), not marking any client stale" << dendl
;
943 std::vector
<Session
*> to_evict
;
945 bool defer_session_stale
= g_conf().get_val
<bool>("mds_defer_session_stale");
946 const auto sessions_p1
= mds
->sessionmap
.by_state
.find(Session::STATE_OPEN
);
947 if (sessions_p1
!= mds
->sessionmap
.by_state
.end() && !sessions_p1
->second
->empty()) {
948 std::vector
<Session
*> new_stale
;
950 for (auto session
: *(sessions_p1
->second
)) {
951 auto last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_cap_renew
).count();
952 if (last_cap_renew_span
< cutoff
) {
953 dout(20) << "laggiest active session is " << session
->info
.inst
954 << " and renewed caps recently (" << last_cap_renew_span
<< "s ago)" << dendl
;
958 if (session
->last_seen
> session
->last_cap_renew
) {
959 last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_seen
).count();
960 if (last_cap_renew_span
< cutoff
) {
961 dout(20) << "laggiest active session is " << session
->info
.inst
962 << " and renewed caps recently (" << last_cap_renew_span
<< "s ago)" << dendl
;
967 if (last_cap_renew_span
>= mds
->mdsmap
->get_session_autoclose()) {
968 dout(20) << "evicting session " << session
->info
.inst
<< " since autoclose "
969 "has arrived" << dendl
;
970 // evict session without marking it stale
971 to_evict
.push_back(session
);
975 if (defer_session_stale
&&
976 !session
->is_any_flush_waiter() &&
977 !mds
->locker
->is_revoking_any_caps_from(session
->get_client())) {
978 dout(20) << "deferring marking session " << session
->info
.inst
<< " stale "
979 "since it holds no caps" << dendl
;
983 auto it
= session
->info
.client_metadata
.find("timeout");
984 if (it
!= session
->info
.client_metadata
.end()) {
985 unsigned timeout
= strtoul(it
->second
.c_str(), nullptr, 0);
987 dout(10) << "skipping session " << session
->info
.inst
988 << ", infinite timeout specified" << dendl
;
991 double cutoff
= queue_max_age
+ timeout
;
992 if (last_cap_renew_span
< cutoff
) {
993 dout(10) << "skipping session " << session
->info
.inst
994 << ", timeout (" << timeout
<< ") specified"
995 << " and renewed caps recently (" << last_cap_renew_span
<< "s ago)" << dendl
;
999 // do not go through stale, evict it directly.
1000 to_evict
.push_back(session
);
1002 dout(10) << "new stale session " << session
->info
.inst
1003 << " last renewed caps " << last_cap_renew_span
<< "s ago" << dendl
;
1004 new_stale
.push_back(session
);
1008 for (auto session
: new_stale
) {
1009 mds
->sessionmap
.set_state(session
, Session::STATE_STALE
);
1010 if (mds
->locker
->revoke_stale_caps(session
)) {
1011 mds
->locker
->remove_stale_leases(session
);
1012 finish_flush_session(session
, session
->get_push_seq());
1013 auto m
= MClientSession::create(CEPH_SESSION_STALE
, session
->get_push_seq());
1014 mds
->send_message_client(m
, session
);
1016 to_evict
.push_back(session
);
1022 cutoff
= queue_max_age
+ mds
->mdsmap
->get_session_autoclose();
1024 // Collect a list of sessions exceeding the autoclose threshold
1025 const auto sessions_p2
= mds
->sessionmap
.by_state
.find(Session::STATE_STALE
);
1026 if (sessions_p2
!= mds
->sessionmap
.by_state
.end() && !sessions_p2
->second
->empty()) {
1027 for (auto session
: *(sessions_p2
->second
)) {
1028 assert(session
->is_stale());
1029 auto last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_cap_renew
).count();
1030 if (last_cap_renew_span
< cutoff
) {
1031 dout(20) << "oldest stale session is " << session
->info
.inst
1032 << " and recently renewed caps " << last_cap_renew_span
<< "s ago" << dendl
;
1035 to_evict
.push_back(session
);
1039 for (auto session
: to_evict
) {
1040 if (session
->is_importing()) {
1041 dout(10) << "skipping session " << session
->info
.inst
<< ", it's being imported" << dendl
;
1045 auto last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_cap_renew
).count();
1046 mds
->clog
->warn() << "evicting unresponsive client " << *session
1047 << ", after " << last_cap_renew_span
<< " seconds";
1048 dout(10) << "autoclosing stale session " << session
->info
.inst
1049 << " last renewed caps " << last_cap_renew_span
<< "s ago" << dendl
;
1051 if (g_conf()->mds_session_blacklist_on_timeout
) {
1052 std::stringstream ss
;
1053 mds
->evict_client(session
->get_client().v
, false, true, ss
, nullptr);
1055 kill_session(session
, NULL
);
1060 void Server::evict_cap_revoke_non_responders() {
1061 if (!cap_revoke_eviction_timeout
) {
1065 std::list
<client_t
> to_evict
;
1066 mds
->locker
->get_late_revoking_clients(&to_evict
, cap_revoke_eviction_timeout
);
1068 for (auto const &client
: to_evict
) {
1069 mds
->clog
->warn() << "client id " << client
<< " has not responded to"
1070 << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
1071 << " seconds, evicting";
1072 dout(1) << __func__
<< ": evicting cap revoke non-responder client id "
1075 std::stringstream ss
;
1076 bool evicted
= mds
->evict_client(client
.v
, false,
1077 g_conf()->mds_session_blacklist_on_evict
,
1079 if (evicted
&& logger
) {
1080 logger
->inc(l_mdss_cap_revoke_eviction
);
1085 void Server::handle_conf_change(const ConfigProxy
& conf
,
1086 const std::set
<std::string
> &changed
) {
1087 if (changed
.count("mds_cap_revoke_eviction_timeout")) {
1088 cap_revoke_eviction_timeout
= g_conf().get_val
<double>("mds_cap_revoke_eviction_timeout");
1089 dout(20) << __func__
<< " cap revoke eviction timeout changed to "
1090 << cap_revoke_eviction_timeout
<< dendl
;
1092 if (changed
.count("mds_recall_max_decay_rate")) {
1093 recall_throttle
= DecayCounter(g_conf().get_val
<double>("mds_recall_max_decay_rate"));
1098 * XXX bump in the interface here, not using an MDSContext here
1099 * because all the callers right now happen to use a SaferCond
1101 void Server::kill_session(Session
*session
, Context
*on_safe
)
1103 ceph_assert(mds
->mds_lock
.is_locked_by_me());
1105 if ((session
->is_opening() ||
1106 session
->is_open() ||
1107 session
->is_stale()) &&
1108 !session
->is_importing()) {
1109 dout(10) << "kill_session " << session
<< dendl
;
1110 journal_close_session(session
, Session::STATE_KILLING
, on_safe
);
1112 dout(10) << "kill_session importing or already closing/killing " << session
<< dendl
;
1113 if (session
->is_closing() ||
1114 session
->is_killing()) {
1116 mdlog
->wait_for_safe(new MDSInternalContextWrapper(mds
, on_safe
));
1118 ceph_assert(session
->is_closed() ||
1119 session
->is_importing());
1121 on_safe
->complete(0);
1126 size_t Server::apply_blacklist(const std::set
<entity_addr_t
> &blacklist
)
1128 bool prenautilus
= mds
->objecter
->with_osdmap(
1129 [&](const OSDMap
& o
) {
1130 return o
.require_osd_release
< CEPH_RELEASE_NAUTILUS
;
1133 std::vector
<Session
*> victims
;
1134 const auto& sessions
= mds
->sessionmap
.get_sessions();
1135 for (const auto& p
: sessions
) {
1136 if (!p
.first
.is_client()) {
1137 // Do not apply OSDMap blacklist to MDS daemons, we find out
1138 // about their death via MDSMap.
1142 Session
*s
= p
.second
;
1143 auto inst_addr
= s
->info
.inst
.addr
;
1144 // blacklist entries are always TYPE_ANY for nautilus+
1145 inst_addr
.set_type(entity_addr_t::TYPE_ANY
);
1146 if (blacklist
.count(inst_addr
)) {
1147 victims
.push_back(s
);
1151 // ...except pre-nautilus, they were TYPE_LEGACY
1152 inst_addr
.set_type(entity_addr_t::TYPE_LEGACY
);
1153 if (blacklist
.count(inst_addr
)) {
1154 victims
.push_back(s
);
1159 for (const auto s
: victims
) {
1160 kill_session(s
, nullptr);
1163 dout(10) << "apply_blacklist: killed " << victims
.size() << dendl
;
1165 return victims
.size();
1168 void Server::journal_close_session(Session
*session
, int state
, Context
*on_safe
)
1170 uint64_t sseq
= mds
->sessionmap
.set_state(session
, state
);
1171 version_t pv
= mds
->sessionmap
.mark_projected(session
);
1174 // release alloc and pending-alloc inos for this session
1175 // and wipe out session state, in case the session close aborts for some reason
1176 interval_set
<inodeno_t
> both
;
1177 both
.insert(session
->info
.prealloc_inos
);
1178 both
.insert(session
->pending_prealloc_inos
);
1180 mds
->inotable
->project_release_ids(both
);
1181 piv
= mds
->inotable
->get_projected_version();
1185 mdlog
->start_submit_entry(new ESession(session
->info
.inst
, false, pv
, both
, piv
),
1186 new C_MDS_session_finish(this, session
, sseq
, false, pv
, both
, piv
, on_safe
));
1189 // clean up requests, too
1190 elist
<MDRequestImpl
*>::iterator p
=
1191 session
->requests
.begin(member_offset(MDRequestImpl
,
1192 item_session_request
));
1194 MDRequestRef mdr
= mdcache
->request_get((*p
)->reqid
);
1196 mdcache
->request_kill(mdr
);
1199 finish_flush_session(session
, session
->get_push_seq());
1202 void Server::reconnect_clients(MDSContext
*reconnect_done_
)
1204 reconnect_done
= reconnect_done_
;
1206 auto now
= clock::now();
1207 set
<Session
*> sessions
;
1208 mds
->sessionmap
.get_client_session_set(sessions
);
1209 for (auto session
: sessions
) {
1210 if (session
->is_open()) {
1211 client_reconnect_gather
.insert(session
->get_client());
1212 session
->last_cap_renew
= now
;
1216 if (client_reconnect_gather
.empty()) {
1217 dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl
;
1218 reconnect_gather_finish();
1222 // clients will get the mdsmap and discover we're reconnecting via the monitor.
1224 reconnect_start
= now
;
1225 dout(1) << "reconnect_clients -- " << client_reconnect_gather
.size() << " sessions" << dendl
;
1226 mds
->sessionmap
.dump();
1229 void Server::handle_client_reconnect(const MClientReconnect::const_ref
&m
)
1231 dout(7) << "handle_client_reconnect " << m
->get_source()
1232 << (m
->has_more() ? " (more)" : "") << dendl
;
1233 client_t from
= m
->get_source().num();
1234 Session
*session
= mds
->get_session(m
);
1238 if (!mds
->is_reconnect() && mds
->get_want_state() == CEPH_MDS_STATE_RECONNECT
) {
1239 dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl
;
1240 mds
->wait_for_reconnect(new C_MDS_RetryMessage(mds
, m
));
1244 auto delay
= std::chrono::duration
<double>(clock::now() - reconnect_start
).count();
1245 dout(10) << " reconnect_start " << reconnect_start
<< " delay " << delay
<< dendl
;
1248 if (!mds
->is_reconnect() || mds
->get_want_state() != CEPH_MDS_STATE_RECONNECT
|| reconnect_evicting
) {
1249 // XXX maybe in the future we can do better than this?
1250 dout(1) << " no longer in reconnect state, ignoring reconnect, sending close" << dendl
;
1251 mds
->clog
->info() << "denied reconnect attempt (mds is "
1252 << ceph_mds_state_name(mds
->get_state())
1253 << ") from " << m
->get_source_inst()
1254 << " after " << delay
<< " (allowed interval " << g_conf()->mds_reconnect_timeout
<< ")";
1257 std::string error_str
;
1258 if (!session
->is_open()) {
1259 error_str
= "session is closed";
1260 } else if (mdcache
->is_readonly()) {
1261 error_str
= "mds is readonly";
1263 if (session
->info
.client_metadata
.features
.empty())
1264 infer_supported_features(session
, session
->info
.client_metadata
);
1266 feature_bitset_t missing_features
= required_client_features
;
1267 missing_features
-= session
->info
.client_metadata
.features
;
1268 if (!missing_features
.empty()) {
1270 ss
<< "missing required features '" << missing_features
<< "'";
1271 error_str
= ss
.str();
1275 if (!error_str
.empty()) {
1277 dout(1) << " " << error_str
<< ", ignoring reconnect, sending close" << dendl
;
1278 mds
->clog
->info() << "denied reconnect attempt from "
1279 << m
->get_source_inst() << " (" << error_str
<< ")";
1284 auto r
= MClientSession::create(CEPH_SESSION_CLOSE
);
1285 mds
->send_message_client(r
, session
);
1286 if (session
->is_open())
1287 kill_session(session
, nullptr);
1291 if (!m
->has_more()) {
1292 // notify client of success with an OPEN
1293 auto reply
= MClientSession::create(CEPH_SESSION_OPEN
);
1294 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
))
1295 reply
->supported_features
= supported_features
;
1296 mds
->send_message_client(reply
, session
);
1297 mds
->clog
->debug() << "reconnect by " << session
->info
.inst
<< " after " << delay
;
1300 session
->last_cap_renew
= clock::now();
1303 for (const auto &r
: m
->realms
) {
1304 CInode
*in
= mdcache
->get_inode(inodeno_t(r
.realm
.ino
));
1305 if (in
&& in
->state_test(CInode::STATE_PURGING
))
1308 if (in
->snaprealm
) {
1309 dout(15) << "open snaprealm (w inode) on " << *in
<< dendl
;
1311 // this can happen if we are non-auth or we rollback snaprealm
1312 dout(15) << "open snaprealm (null snaprealm) on " << *in
<< dendl
;
1314 mdcache
->add_reconnected_snaprealm(from
, inodeno_t(r
.realm
.ino
), snapid_t(r
.realm
.seq
));
1316 dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r
.realm
.ino
)
1317 << " seq " << r
.realm
.seq
<< dendl
;
1318 mdcache
->add_reconnected_snaprealm(from
, inodeno_t(r
.realm
.ino
), snapid_t(r
.realm
.seq
));
1323 for (const auto &p
: m
->caps
) {
1324 // make sure our last_cap_id is MAX over all issued caps
1325 if (p
.second
.capinfo
.cap_id
> mdcache
->last_cap_id
)
1326 mdcache
->last_cap_id
= p
.second
.capinfo
.cap_id
;
1328 CInode
*in
= mdcache
->get_inode(p
.first
);
1329 if (in
&& in
->state_test(CInode::STATE_PURGING
))
1331 if (in
&& in
->is_auth()) {
1332 // we recovered it, and it's ours. take note.
1333 dout(15) << "open cap realm " << inodeno_t(p
.second
.capinfo
.snaprealm
)
1334 << " on " << *in
<< dendl
;
1335 in
->reconnect_cap(from
, p
.second
, session
);
1336 mdcache
->add_reconnected_cap(from
, p
.first
, p
.second
);
1337 recover_filelocks(in
, p
.second
.flockbl
, m
->get_orig_source().num());
1341 if (in
&& !in
->is_auth()) {
1343 dout(10) << "non-auth " << *in
<< ", will pass off to authority" << dendl
;
1344 // add to cap export list.
1345 mdcache
->rejoin_export_caps(p
.first
, from
, p
.second
,
1346 in
->authority().first
, true);
1348 // don't know if the inode is mine
1349 dout(10) << "missing ino " << p
.first
<< ", will load later" << dendl
;
1350 mdcache
->rejoin_recovered_caps(p
.first
, from
, p
.second
, MDS_RANK_NONE
);
1354 reconnect_last_seen
= clock::now();
1356 if (!m
->has_more()) {
1357 mdcache
->rejoin_recovered_client(session
->get_client(), session
->info
.inst
);
1359 // remove from gather set
1360 client_reconnect_gather
.erase(from
);
1361 if (client_reconnect_gather
.empty())
1362 reconnect_gather_finish();
1366 void Server::infer_supported_features(Session
*session
, client_metadata_t
& client_metadata
)
1369 auto it
= client_metadata
.find("ceph_version");
1370 if (it
!= client_metadata
.end()) {
1371 // user space client
1372 if (it
->second
.compare(0, 16, "ceph version 12.") == 0)
1373 supported
= CEPHFS_FEATURE_LUMINOUS
;
1374 else if (session
->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR
))
1375 supported
= CEPHFS_FEATURE_KRAKEN
;
1377 it
= client_metadata
.find("kernel_version");
1378 if (it
!= client_metadata
.end()) {
1380 if (session
->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING
))
1381 supported
= CEPHFS_FEATURE_LUMINOUS
;
1384 if (supported
== -1 &&
1385 session
->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2
))
1386 supported
= CEPHFS_FEATURE_JEWEL
;
1388 if (supported
>= 0) {
1389 unsigned long value
= (1UL << (supported
+ 1)) - 1;
1390 client_metadata
.features
= feature_bitset_t(value
);
1391 dout(10) << __func__
<< " got '" << client_metadata
.features
<< "'" << dendl
;
1395 void Server::update_required_client_features()
1397 vector
<size_t> bits
= CEPHFS_FEATURES_MDS_REQUIRED
;
1399 int min_compat
= mds
->mdsmap
->get_min_compat_client();
1400 if (min_compat
>= CEPH_RELEASE_NAUTILUS
) {
1401 bits
.push_back(CEPHFS_FEATURE_NAUTILUS
);
1402 } else if (min_compat
>= CEPH_RELEASE_MIMIC
)
1403 bits
.push_back(CEPHFS_FEATURE_MIMIC
);
1404 else if (min_compat
>= CEPH_RELEASE_LUMINOUS
)
1405 bits
.push_back(CEPHFS_FEATURE_LUMINOUS
);
1406 else if (min_compat
>= CEPH_RELEASE_KRAKEN
)
1407 bits
.push_back(CEPHFS_FEATURE_KRAKEN
);
1408 else if (min_compat
>= CEPH_RELEASE_JEWEL
)
1409 bits
.push_back(CEPHFS_FEATURE_JEWEL
);
1411 std::sort(bits
.begin(), bits
.end());
1412 required_client_features
= feature_bitset_t(bits
);
1413 dout(7) << "required_client_features: " << required_client_features
<< dendl
;
1415 if (mds
->get_state() >= MDSMap::STATE_RECONNECT
) {
1416 set
<Session
*> sessions
;
1417 mds
->sessionmap
.get_client_session_set(sessions
);
1418 for (auto session
: sessions
) {
1419 feature_bitset_t missing_features
= required_client_features
;
1420 missing_features
-= session
->info
.client_metadata
.features
;
1421 if (!missing_features
.empty()) {
1422 bool blacklisted
= mds
->objecter
->with_osdmap(
1423 [session
](const OSDMap
&osd_map
) -> bool {
1424 return osd_map
.is_blacklisted(session
->info
.inst
.addr
);
1429 mds
->clog
->warn() << "evicting session " << *session
<< ", missing required features '"
1430 << missing_features
<< "'";
1431 std::stringstream ss
;
1432 mds
->evict_client(session
->get_client().v
, false,
1433 g_conf()->mds_session_blacklist_on_evict
, ss
);
1439 void Server::reconnect_gather_finish()
1441 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects
<< " clients" << dendl
;
1442 ceph_assert(reconnect_done
);
1444 if (!mds
->snapclient
->is_synced()) {
1445 // make sure snaptable cache is populated. snaprealms will be
1446 // extensively used in rejoin stage.
1447 dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl
;
1448 mds
->snapclient
->wait_for_sync(reconnect_done
);
1450 reconnect_done
->complete(0);
1452 reconnect_done
= NULL
;
1455 void Server::reconnect_tick()
1457 if (reconnect_evicting
) {
1458 dout(7) << "reconnect_tick: waiting for evictions" << dendl
;
1462 if (client_reconnect_gather
.empty())
1465 auto now
= clock::now();
1466 auto elapse1
= std::chrono::duration
<double>(now
- reconnect_start
).count();
1467 if (elapse1
< g_conf()->mds_reconnect_timeout
)
1470 vector
<Session
*> remaining_sessions
;
1471 remaining_sessions
.reserve(client_reconnect_gather
.size());
1472 for (auto c
: client_reconnect_gather
) {
1473 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(c
.v
));
1474 ceph_assert(session
);
1475 remaining_sessions
.push_back(session
);
1476 // client re-sends cap flush messages before the reconnect message
1477 if (session
->last_seen
> reconnect_last_seen
)
1478 reconnect_last_seen
= session
->last_seen
;
1481 auto elapse2
= std::chrono::duration
<double>(now
- reconnect_last_seen
).count();
1482 if (elapse2
< g_conf()->mds_reconnect_timeout
/ 2) {
1483 dout(7) << "reconnect_tick: last seen " << elapse2
1484 << " seconds ago, extending reconnect interval" << dendl
;
1488 dout(7) << "reconnect timed out, " << remaining_sessions
.size()
1489 << " clients have not reconnected in time" << dendl
;
1491 // If we're doing blacklist evictions, use this to wait for them before
1492 // proceeding to reconnect_gather_finish
1493 MDSGatherBuilder
gather(g_ceph_context
);
1495 for (auto session
: remaining_sessions
) {
1496 // Keep sessions that have specified timeout. These sessions will prevent
1497 // mds from going to active. MDS goes to active after they all have been
1498 // killed or reclaimed.
1499 if (session
->info
.client_metadata
.find("timeout") !=
1500 session
->info
.client_metadata
.end()) {
1501 dout(1) << "reconnect keeps " << session
->info
.inst
1502 << ", need to be reclaimed" << dendl
;
1503 client_reclaim_gather
.insert(session
->get_client());
1507 dout(1) << "reconnect gives up on " << session
->info
.inst
<< dendl
;
1509 mds
->clog
->warn() << "evicting unresponsive client " << *session
1510 << ", after waiting " << elapse1
1511 << " seconds during MDS startup";
1513 if (g_conf()->mds_session_blacklist_on_timeout
) {
1514 std::stringstream ss
;
1515 mds
->evict_client(session
->get_client().v
, false, true, ss
,
1518 kill_session(session
, NULL
);
1521 failed_reconnects
++;
1523 client_reconnect_gather
.clear();
1525 if (gather
.has_subs()) {
1526 dout(1) << "reconnect will complete once clients are evicted" << dendl
;
1527 gather
.set_finisher(new MDSInternalContextWrapper(mds
, new FunctionContext(
1528 [this](int r
){reconnect_gather_finish();})));
1530 reconnect_evicting
= true;
1532 reconnect_gather_finish();
1536 void Server::recover_filelocks(CInode
*in
, bufferlist locks
, int64_t client
)
1538 if (!locks
.length()) return;
1541 auto p
= locks
.cbegin();
1542 decode(numlocks
, p
);
1543 for (int i
= 0; i
< numlocks
; ++i
) {
1545 lock
.client
= client
;
1546 in
->get_fcntl_lock_state()->held_locks
.insert(pair
<uint64_t, ceph_filelock
>(lock
.start
, lock
));
1547 ++in
->get_fcntl_lock_state()->client_held_lock_counts
[client
];
1549 decode(numlocks
, p
);
1550 for (int i
= 0; i
< numlocks
; ++i
) {
1552 lock
.client
= client
;
1553 in
->get_flock_lock_state()->held_locks
.insert(pair
<uint64_t, ceph_filelock
> (lock
.start
, lock
));
1554 ++in
->get_flock_lock_state()->client_held_lock_counts
[client
];
1559 * Call this when the MDCache is oversized, to send requests to the clients
1560 * to trim some caps, and consequently unpin some inodes in the MDCache so
1561 * that it can trim too.
1563 std::pair
<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder
* gather
, RecallFlags flags
)
1565 const auto now
= clock::now();
1566 const bool steady
= flags
&RecallFlags::STEADY
;
1567 const bool enforce_max
= flags
&RecallFlags::ENFORCE_MAX
;
1569 const auto max_caps_per_client
= g_conf().get_val
<uint64_t>("mds_max_caps_per_client");
1570 const auto min_caps_per_client
= g_conf().get_val
<uint64_t>("mds_min_caps_per_client");
1571 const auto recall_global_max_decay_threshold
= g_conf().get_val
<Option::size_t>("mds_recall_global_max_decay_threshold");
1572 const auto recall_max_caps
= g_conf().get_val
<Option::size_t>("mds_recall_max_caps");
1573 const auto recall_max_decay_threshold
= g_conf().get_val
<Option::size_t>("mds_recall_max_decay_threshold");
1575 dout(7) << __func__
<< ":"
1576 << " min=" << min_caps_per_client
1577 << " max=" << max_caps_per_client
1578 << " total=" << Capability::count()
1579 << " flags=0x" << std::hex
<< flags
1582 /* trim caps of sessions with the most caps first */
1583 std::multimap
<uint64_t, Session
*> caps_session
;
1584 auto f
= [&caps_session
, enforce_max
, max_caps_per_client
](auto& s
) {
1585 auto num_caps
= s
->caps
.size();
1586 if (!enforce_max
|| num_caps
> max_caps_per_client
) {
1587 caps_session
.emplace(std::piecewise_construct
, std::forward_as_tuple(num_caps
), std::forward_as_tuple(s
));
1590 mds
->sessionmap
.get_client_sessions(std::move(f
));
1592 std::pair
<bool, uint64_t> result
= {false, 0};
1593 auto& [throttled
, caps_recalled
] = result
;
1594 last_recall_state
= now
;
1595 for (const auto& [num_caps
, session
] : boost::adaptors::reverse(caps_session
)) {
1596 if (!session
->is_open() ||
1597 !session
->get_connection() ||
1598 !session
->info
.inst
.name
.is_client())
1601 dout(10) << __func__
<< ":"
1602 << " session " << session
->info
.inst
1603 << " caps " << num_caps
1604 << ", leases " << session
->leases
.size()
1608 if (num_caps
< recall_max_caps
|| (num_caps
-recall_max_caps
) < min_caps_per_client
) {
1609 newlim
= min_caps_per_client
;
1611 newlim
= num_caps
-recall_max_caps
;
1613 if (num_caps
> newlim
) {
1614 /* now limit the number of caps we recall at a time to prevent overloading ourselves */
1615 uint64_t recall
= std::min
<uint64_t>(recall_max_caps
, num_caps
-newlim
);
1616 newlim
= num_caps
-recall
;
1617 const uint64_t session_recall_throttle
= session
->get_recall_caps_throttle();
1618 const uint64_t session_recall_throttle2o
= session
->get_recall_caps_throttle2o();
1619 const uint64_t global_recall_throttle
= recall_throttle
.get();
1620 if (session_recall_throttle
+recall
> recall_max_decay_threshold
) {
1621 dout(15) << " session recall threshold (" << recall_max_decay_threshold
<< ") hit at " << session_recall_throttle
<< "; skipping!" << dendl
;
1624 } else if (session_recall_throttle2o
+recall
> recall_max_caps
*2) {
1625 dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps
<< ") hit at " << session_recall_throttle2o
<< "; skipping!" << dendl
;
1628 } else if (global_recall_throttle
+recall
> recall_global_max_decay_threshold
) {
1629 dout(15) << " global recall threshold (" << recall_global_max_decay_threshold
<< ") hit at " << global_recall_throttle
<< "; skipping!" << dendl
;
1634 // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
1636 const auto session_recall
= session
->get_recall_caps();
1637 const auto session_release
= session
->get_release_caps();
1638 if (2*session_release
< session_recall
&& 2*session_recall
> recall_max_decay_threshold
) {
1639 /* The session has been unable to keep up with the number of caps
1640 * recalled (by half); additionally, to prevent marking sessions
1641 * we've just begun to recall from, the session_recall counter
1642 * (decayed count of caps recently recalled) is **greater** than the
1643 * session threshold for the session's cap recall throttle.
1645 dout(15) << " 2*session_release < session_recall"
1646 " (2*" << session_release
<< " < " << session_recall
<< ") &&"
1647 " 2*session_recall < recall_max_decay_threshold"
1648 " (2*" << session_recall
<< " > " << recall_max_decay_threshold
<< ")"
1649 " Skipping because we are unlikely to get more released." << dendl
;
1651 } else if (recall
< recall_max_caps
&& 2*recall
< session_recall
) {
1652 /* The number of caps recalled is less than the number we *could*
1653 * recall (so there isn't much left to recall?) and the number of
1654 * caps is less than the current recall_caps counter (decayed count
1655 * of caps recently recalled).
1657 dout(15) << " 2*recall < session_recall "
1658 " (2*" << recall
<< " < " << session_recall
<< ") &&"
1659 " recall < recall_max_caps (" << recall
<< " < " << recall_max_caps
<< ");"
1660 " Skipping because we are unlikely to get more released." << dendl
;
1665 dout(7) << " recalling " << recall
<< " caps; session_recall_throttle = " << session_recall_throttle
<< "; global_recall_throttle = " << global_recall_throttle
<< dendl
;
1667 auto m
= MClientSession::create(CEPH_SESSION_RECALL_STATE
);
1668 m
->head
.max_caps
= newlim
;
1669 mds
->send_message_client(m
, session
);
1671 flush_session(session
, gather
);
1673 caps_recalled
+= session
->notify_recall_sent(newlim
);
1674 recall_throttle
.hit(recall
);
1678 dout(7) << "recalled" << (throttled
? " (throttled)" : "") << " " << caps_recalled
<< " client caps." << dendl
;
1683 void Server::force_clients_readonly()
1685 dout(10) << "force_clients_readonly" << dendl
;
1686 set
<Session
*> sessions
;
1687 mds
->sessionmap
.get_client_session_set(sessions
);
1688 for (set
<Session
*>::const_iterator p
= sessions
.begin();
1689 p
!= sessions
.end();
1691 Session
*session
= *p
;
1692 if (!session
->info
.inst
.name
.is_client() ||
1693 !(session
->is_open() || session
->is_stale()))
1695 mds
->send_message_client(MClientSession::create(CEPH_SESSION_FORCE_RO
), session
);
1700 * some generic stuff for finishing off requests
1702 void Server::journal_and_reply(MDRequestRef
& mdr
, CInode
*in
, CDentry
*dn
, LogEvent
*le
, MDSLogContextBase
*fin
)
1704 dout(10) << "journal_and_reply tracei " << in
<< " tracedn " << dn
<< dendl
;
1705 ceph_assert(!mdr
->has_completed
);
1707 // note trace items for eventual reply.
1716 early_reply(mdr
, in
, dn
);
1718 mdr
->committing
= true;
1719 submit_mdlog_entry(le
, fin
, mdr
, __func__
);
1721 if (mdr
->client_request
&& mdr
->client_request
->is_queued_for_replay()) {
1722 if (mds
->queue_one_replay()) {
1723 dout(10) << " queued next replay op" << dendl
;
1725 dout(10) << " journaled last replay op" << dendl
;
1727 } else if (mdr
->did_early_reply
)
1728 mds
->locker
->drop_rdlocks_for_early_reply(mdr
.get());
1733 void Server::submit_mdlog_entry(LogEvent
*le
, MDSLogContextBase
*fin
, MDRequestRef
& mdr
,
1734 std::string_view event
)
1737 string
event_str("submit entry: ");
1739 mdr
->mark_event(event_str
);
1741 mdlog
->submit_entry(le
, fin
);
1745 * send response built from mdr contents and error code; clean up mdr
1747 void Server::respond_to_request(MDRequestRef
& mdr
, int r
)
1749 if (mdr
->client_request
) {
1750 reply_client_request(mdr
, MClientReply::create(*mdr
->client_request
, r
));
1751 } else if (mdr
->internal_op
> -1) {
1752 dout(10) << "respond_to_request on internal request " << mdr
<< dendl
;
1753 if (!mdr
->internal_op_finish
)
1754 ceph_abort_msg("trying to respond to internal op without finisher");
1755 mdr
->internal_op_finish
->complete(r
);
1756 mdcache
->request_finish(mdr
);
1760 // statistics mds req op number and latency
1761 void Server::perf_gather_op_latency(const MClientRequest::const_ref
&req
, utime_t lat
)
1763 int code
= l_mdss_first
;
1764 switch(req
->get_op()) {
1765 case CEPH_MDS_OP_LOOKUPHASH
:
1766 code
= l_mdss_req_lookuphash_latency
;
1768 case CEPH_MDS_OP_LOOKUPINO
:
1769 code
= l_mdss_req_lookupino_latency
;
1771 case CEPH_MDS_OP_LOOKUPPARENT
:
1772 code
= l_mdss_req_lookupparent_latency
;
1774 case CEPH_MDS_OP_LOOKUPNAME
:
1775 code
= l_mdss_req_lookupname_latency
;
1777 case CEPH_MDS_OP_LOOKUP
:
1778 code
= l_mdss_req_lookup_latency
;
1780 case CEPH_MDS_OP_LOOKUPSNAP
:
1781 code
= l_mdss_req_lookupsnap_latency
;
1783 case CEPH_MDS_OP_GETATTR
:
1784 code
= l_mdss_req_getattr_latency
;
1786 case CEPH_MDS_OP_SETATTR
:
1787 code
= l_mdss_req_setattr_latency
;
1789 case CEPH_MDS_OP_SETLAYOUT
:
1790 code
= l_mdss_req_setlayout_latency
;
1792 case CEPH_MDS_OP_SETDIRLAYOUT
:
1793 code
= l_mdss_req_setdirlayout_latency
;
1795 case CEPH_MDS_OP_SETXATTR
:
1796 code
= l_mdss_req_setxattr_latency
;
1798 case CEPH_MDS_OP_RMXATTR
:
1799 code
= l_mdss_req_rmxattr_latency
;
1801 case CEPH_MDS_OP_READDIR
:
1802 code
= l_mdss_req_readdir_latency
;
1804 case CEPH_MDS_OP_SETFILELOCK
:
1805 code
= l_mdss_req_setfilelock_latency
;
1807 case CEPH_MDS_OP_GETFILELOCK
:
1808 code
= l_mdss_req_getfilelock_latency
;
1810 case CEPH_MDS_OP_CREATE
:
1811 code
= l_mdss_req_create_latency
;
1813 case CEPH_MDS_OP_OPEN
:
1814 code
= l_mdss_req_open_latency
;
1816 case CEPH_MDS_OP_MKNOD
:
1817 code
= l_mdss_req_mknod_latency
;
1819 case CEPH_MDS_OP_LINK
:
1820 code
= l_mdss_req_link_latency
;
1822 case CEPH_MDS_OP_UNLINK
:
1823 code
= l_mdss_req_unlink_latency
;
1825 case CEPH_MDS_OP_RMDIR
:
1826 code
= l_mdss_req_rmdir_latency
;
1828 case CEPH_MDS_OP_RENAME
:
1829 code
= l_mdss_req_rename_latency
;
1831 case CEPH_MDS_OP_MKDIR
:
1832 code
= l_mdss_req_mkdir_latency
;
1834 case CEPH_MDS_OP_SYMLINK
:
1835 code
= l_mdss_req_symlink_latency
;
1837 case CEPH_MDS_OP_LSSNAP
:
1838 code
= l_mdss_req_lssnap_latency
;
1840 case CEPH_MDS_OP_MKSNAP
:
1841 code
= l_mdss_req_mksnap_latency
;
1843 case CEPH_MDS_OP_RMSNAP
:
1844 code
= l_mdss_req_rmsnap_latency
;
1846 case CEPH_MDS_OP_RENAMESNAP
:
1847 code
= l_mdss_req_renamesnap_latency
;
1849 default: ceph_abort();
1851 logger
->tinc(code
, lat
);
1854 void Server::early_reply(MDRequestRef
& mdr
, CInode
*tracei
, CDentry
*tracedn
)
1856 if (!g_conf()->mds_early_reply
)
1859 if (mdr
->no_early_reply
) {
1860 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl
;
1864 if (mdr
->has_more() && mdr
->more()->has_journaled_slaves
) {
1865 dout(10) << "early_reply - there are journaled slaves, not allowed." << dendl
;
1869 if (mdr
->alloc_ino
) {
1870 dout(10) << "early_reply - allocated ino, not allowed" << dendl
;
1874 const MClientRequest::const_ref
&req
= mdr
->client_request
;
1875 entity_inst_t client_inst
= req
->get_source_inst();
1876 if (client_inst
.name
.is_mds())
1879 if (req
->is_replay()) {
1880 dout(10) << " no early reply on replay op" << dendl
;
1885 auto reply
= MClientReply::create(*req
, 0);
1886 reply
->set_unsafe();
1888 // mark xlocks "done", indicating that we are exposing uncommitted changes.
1890 //_rename_finish() does not send dentry link/unlink message to replicas.
1891 // so do not set xlocks on dentries "done", the xlocks prevent dentries
1892 // that have projected linkages from getting new replica.
1893 mds
->locker
->set_xlocks_done(mdr
.get(), req
->get_op() == CEPH_MDS_OP_RENAME
);
1895 dout(10) << "early_reply " << reply
->get_result()
1896 << " (" << cpp_strerror(reply
->get_result())
1897 << ") " << *req
<< dendl
;
1899 if (tracei
|| tracedn
) {
1901 mdr
->cap_releases
.erase(tracei
->vino());
1903 mdr
->cap_releases
.erase(tracedn
->get_dir()->get_inode()->vino());
1905 set_trace_dist(mdr
->session
, reply
, tracei
, tracedn
, mdr
->snapid
,
1906 req
->get_dentry_wanted(), mdr
);
1909 reply
->set_extra_bl(mdr
->reply_extra_bl
);
1910 mds
->send_message_client(reply
, mdr
->session
);
1912 mdr
->did_early_reply
= true;
1914 mds
->logger
->inc(l_mds_reply
);
1915 utime_t lat
= ceph_clock_now() - req
->get_recv_stamp();
1916 mds
->logger
->tinc(l_mds_reply_latency
, lat
);
1917 if (client_inst
.name
.is_client()) {
1918 mds
->sessionmap
.hit_session(mdr
->session
);
1920 perf_gather_op_latency(req
, lat
);
1921 dout(20) << "lat " << lat
<< dendl
;
1923 mdr
->mark_event("early_replied");
1928 * include a trace to tracei
// Send the final reply for a client request: record completion in the
// session (so replayed requests can be detected), hand out preallocated
// inos, attach the trace/lease metadata, send the message, and finish
// the MDRequest in the cache.
// NOTE(review): this extraction is line-mangled and some structural lines
// (braces, returns) were dropped by the extraction tool; the statements
// below are byte-identical to what survived.
1931 void Server::reply_client_request(MDRequestRef
& mdr
, const MClientReply::ref
&reply
)
1933 ceph_assert(mdr
.get());
1934 const MClientRequest::const_ref
&req
= mdr
->client_request
;
1936 dout(7) << "reply_client_request " << reply
->get_result()
1937 << " (" << cpp_strerror(reply
->get_result())
1938 << ") " << *req
<< dendl
;
1940 mdr
->mark_event("replying");
1942 Session
*session
= mdr
->session
;
1944 // note successful request in session map?
1946 // setfilelock requests are special, they only modify states in MDS memory.
1947 // The states get lost when MDS fails. If Client re-send a completed
1948 // setfilelock request, it means that client did not receive corresponding
1949 // setfilelock reply. So MDS should re-execute the setfilelock request.
1950 if (req
->may_write() && req
->get_op() != CEPH_MDS_OP_SETFILELOCK
&&
1951 reply
->get_result() == 0 && session
) {
// created ino: either freshly allocated or taken from the session prealloc pool
1952 inodeno_t created
= mdr
->alloc_ino
? mdr
->alloc_ino
: mdr
->used_prealloc_ino
;
1953 session
->add_completed_request(mdr
->reqid
.tid
, created
);
// mark the session dirty so its completed_requests get flushed at segment expiry
1955 mdr
->ls
->touched_sessions
.insert(session
->info
.inst
.name
);
1959 // give any preallocated inos to the session
1960 apply_allocated_inos(mdr
, session
);
1962 // get tracei/tracedn from mdr?
1963 snapid_t snapid
= mdr
->snapid
;
1964 CInode
*tracei
= mdr
->tracei
;
1965 CDentry
*tracedn
= mdr
->tracedn
;
1967 bool is_replay
= mdr
->client_request
->is_replay();
1968 bool did_early_reply
= mdr
->did_early_reply
;
1969 entity_inst_t client_inst
= req
->get_source_inst();
1970 int dentry_wanted
= req
->get_dentry_wanted();
// only count the reply (and its latency) once; early_reply already did this
1972 if (!did_early_reply
&& !is_replay
) {
1974 mds
->logger
->inc(l_mds_reply
);
1975 utime_t lat
= ceph_clock_now() - mdr
->client_request
->get_recv_stamp();
1976 mds
->logger
->tinc(l_mds_reply_latency
, lat
);
1977 if (session
&& client_inst
.name
.is_client()) {
1978 mds
->sessionmap
.hit_session(session
);
1980 perf_gather_op_latency(req
, lat
);
1981 dout(20) << "lat " << lat
<< dendl
;
// don't re-release caps on the inodes we are about to include in the trace
1984 mdr
->cap_releases
.erase(tracei
->vino());
1986 mdr
->cap_releases
.erase(tracedn
->get_dir()->get_inode()->vino());
1989 // drop non-rdlocks before replying, so that we can issue leases
1990 mdcache
->request_drop_non_rdlocks(mdr
);
1993 if (session
&& !client_inst
.name
.is_mds()) {
1995 if (!did_early_reply
&& // don't issue leases if we sent an earlier reply already
1996 (tracei
|| tracedn
)) {
1999 mdcache
->try_reconnect_cap(tracei
, session
);
2001 // include metadata in reply
2002 set_trace_dist(session
, reply
, tracei
, tracedn
,
2003 snapid
, dentry_wanted
,
2008 // We can set the extra bl unconditionally: if it's already been sent in the
2009 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
2010 reply
->set_extra_bl(mdr
->reply_extra_bl
);
2012 reply
->set_mdsmap_epoch(mds
->mdsmap
->get_epoch());
2013 mds
->send_message_client(reply
, session
);
// during up:clientreplay, pull the next queued replay op once this one is done/failed
2016 if (req
->is_queued_for_replay() &&
2017 (mdr
->has_completed
|| reply
->get_result() < 0)) {
2018 if (reply
->get_result() < 0) {
2019 int r
= reply
->get_result();
2020 derr
<< "reply_client_request: failed to replay " << *req
2021 << " error " << r
<< " (" << cpp_strerror(r
) << ")" << dendl
;
2022 mds
->clog
->warn() << "failed to replay " << req
->get_reqid() << " error " << r
;
2024 mds
->queue_one_replay();
// tear down the request (drops remaining locks/pins)
2028 mdcache
->request_finish(mdr
);
2030 // take a closer look at tracei, if it happens to be a remote link
2033 tracedn
->get_projected_linkage()->is_remote()) {
2034 mdcache
->eval_remote(tracedn
);
2039 * pass inode OR dentry (not both, or we may get confused)
2041 * trace is in reverse order (i.e. root inode comes last)
// Encode the reply "trace": the dentry/dir/inode metadata the client needs
// to populate its cache (snap trace, dirstat, dentry name + lease, and the
// target inode's inodestat). May be skipped probabilistically for testing
// via mds_inject_traceless_reply_probability.
// NOTE(review): extraction-mangled; some lines (locals such as bl/ds/e,
// returns, else arms) were dropped by the extraction tool.
2043 void Server::set_trace_dist(Session
*session
, const MClientReply::ref
&reply
,
2044 CInode
*in
, CDentry
*dn
,
2049 // skip doing this for debugging purposes?
2050 if (g_conf()->mds_inject_traceless_reply_probability
&&
2051 mdr
->ls
&& !mdr
->o_trunc
&&
2052 (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability
* 10000.0)) {
2053 dout(5) << "deliberately skipping trace for " << *reply
<< dendl
;
2057 // inode, dentry, dir, ..., inode
2059 mds_rank_t whoami
= mds
->get_nodeid();
2060 client_t client
= session
->get_client();
2061 utime_t now
= ceph_clock_now();
2063 dout(20) << "set_trace_dist snapid " << snapid
<< dendl
;
2065 //assert((bool)dn == (bool)dentry_wanted); // not true for snapshot lookups
// attach the snap trace from the relevant snaprealm (inode's, else the
// dentry's parent-directory inode's)
2068 if (snapid
== CEPH_NOSNAP
) {
2071 realm
= in
->find_snaprealm();
2073 realm
= dn
->get_dir()->get_inode()->find_snaprealm();
2074 reply
->snapbl
= realm
->get_snap_trace();
2075 dout(10) << "set_trace_dist snaprealm " << *realm
<< " len=" << reply
->snapbl
.length() << dendl
;
// dentry branch: encode parent inode, dirstat, dentry name and lease
2080 reply
->head
.is_dentry
= 1;
2081 CDir
*dir
= dn
->get_dir();
2082 CInode
*diri
= dir
->get_inode();
2084 diri
->encode_inodestat(bl
, session
, NULL
, snapid
);
2085 dout(20) << "set_trace_dist added diri " << *diri
<< dendl
;
2087 #ifdef MDS_VERIFY_FRAGSTAT
2088 if (dir
->is_complete())
2089 dir
->verify_fragstat();
2092 ds
.frag
= dir
->get_frag();
2093 ds
.auth
= dir
->get_dir_auth().first
;
2095 dir
->get_dist_spec(ds
.dist
, whoami
);
2097 dir
->encode_dirstat(bl
, session
->info
, ds
);
2098 dout(20) << "set_trace_dist added dir " << *dir
<< dendl
;
2100 encode(dn
->get_name(), bl
);
// leases only make sense for the live (CEPH_NOSNAP) namespace
2101 if (snapid
== CEPH_NOSNAP
)
2102 mds
->locker
->issue_client_lease(dn
, client
, bl
, now
, session
);
2106 mds
->locker
->encode_lease(bl
, session
->info
, e
);
2108 dout(20) << "set_trace_dist added dn " << snapid
<< " " << *dn
<< dendl
;
2110 reply
->head
.is_dentry
= 0;
// target inode (if any)
2114 in
->encode_inodestat(bl
, session
, NULL
, snapid
, 0, mdr
->getattr_caps
);
2115 dout(20) << "set_trace_dist added in " << *in
<< dendl
;
2116 reply
->head
.is_target
= 1;
2118 reply
->head
.is_target
= 0;
2120 reply
->set_trace(bl
);
// Entry point for an incoming client request message: validates the session,
// detects already-completed (replayed/retried) requests, trims the session's
// completed_requests list, registers an MDRequest, processes embedded cap
// releases, and dispatches it.
// NOTE(review): extraction-mangled; some structural lines (braces, returns,
// else arms, a few locals such as ss/extra/created) were dropped.
2123 void Server::handle_client_request(const MClientRequest::const_ref
&req
)
2125 dout(4) << "handle_client_request " << *req
<< dendl
;
2128 mds
->logger
->inc(l_mds_request
);
2130 logger
->inc(l_mdss_handle_client_request
);
// can't do anything until the root/cache is open
2132 if (!mdcache
->is_open()) {
2133 dout(5) << "waiting for root" << dendl
;
2134 mdcache
->wait_for_open(new C_MDS_RetryMessage(mds
, req
));
// active session?
2139 Session
*session
= 0;
2140 if (req
->get_source().is_client()) {
2141 session
= mds
->get_session(req
);
2143 dout(5) << "no session for " << req
->get_source() << ", dropping" << dendl
;
2144 } else if (session
->is_closed() ||
2145 session
->is_closing() ||
2146 session
->is_killing()) {
2147 dout(5) << "session closed|closing|killing, dropping" << dendl
;
// keep the replay queue moving even when dropping the request
2151 if (req
->is_queued_for_replay())
2152 mds
->queue_one_replay();
// old mdsmap at the client?
2158 if (req
->get_mdsmap_epoch() < mds
->mdsmap
->get_epoch()) {
2159 // send it? hrm, this isn't ideal; they may get a lot of copies if
2160 // they have a high request rate.
2163 // completed request?
2164 bool has_completed
= false;
2165 if (req
->is_replay() || req
->get_retry_attempt()) {
2166 ceph_assert(session
);
2168 if (session
->have_completed_request(req
->get_reqid().tid
, &created
)) {
2169 has_completed
= true;
2170 // Don't send traceless reply if the completed request has created
2171 // new inode. Treat the request as lookup request instead.
2172 if (req
->is_replay() ||
2173 ((created
== inodeno_t() || !mds
->is_clientreplay()) &&
2174 req
->get_op() != CEPH_MDS_OP_OPEN
&&
2175 req
->get_op() != CEPH_MDS_OP_CREATE
)) {
2176 dout(5) << "already completed " << req
->get_reqid() << dendl
;
// traceless success reply; include the created ino if there was one
2177 auto reply
= MClientReply::create(*req
, 0);
2178 if (created
!= inodeno_t()) {
2180 encode(created
, extra
);
2181 reply
->set_extra_bl(extra
);
2183 mds
->send_message_client(reply
, session
);
2185 if (req
->is_queued_for_replay())
2186 mds
->queue_one_replay();
// the completed request created an inode: re-execute as a lookup/getattr
// so the client gets the full trace instead of a traceless reply
2190 if (req
->get_op() != CEPH_MDS_OP_OPEN
&&
2191 req
->get_op() != CEPH_MDS_OP_CREATE
) {
2192 dout(10) << " completed request which created new inode " << created
2193 << ", convert it to lookup request" << dendl
;
2194 req
->head
.op
= req
->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP
: CEPH_MDS_OP_GETATTR
;
2195 req
->head
.args
.getattr
.mask
= CEPH_STAT_CAP_INODE_ALL
;
2200 // trim completed_request list
2201 if (req
->get_oldest_client_tid() > 0) {
2202 dout(15) << " oldest_client_tid=" << req
->get_oldest_client_tid() << dendl
;
2203 ceph_assert(session
);
2204 if (session
->trim_completed_requests(req
->get_oldest_client_tid())) {
2205 // Sessions 'completed_requests' was dirtied, mark it to be
2206 // potentially flushed at segment expiry.
2207 mdlog
->get_current_segment()->touched_sessions
.insert(session
->info
.inst
.name
);
2209 if (session
->get_num_trim_requests_warnings() > 0 &&
2210 session
->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests
)
2211 session
->reset_num_trim_requests_warnings();
// warn (with exponential backoff via the warnings counter) about clients
// that never advance oldest_client_tid and so pin completed requests
2213 if (session
->get_num_completed_requests() >=
2214 (g_conf()->mds_max_completed_requests
<< session
->get_num_trim_requests_warnings())) {
2215 session
->inc_num_trim_requests_warnings();
2217 ss
<< "client." << session
->get_client() << " does not advance its oldest_client_tid ("
2218 << req
->get_oldest_client_tid() << "), "
2219 << session
->get_num_completed_requests()
2220 << " completed requests recorded in session\n";
2221 mds
->clog
->warn() << ss
.str();
2222 dout(20) << __func__
<< " " << ss
.str() << dendl
;
2227 // register + dispatch
2228 MDRequestRef mdr
= mdcache
->request_start(req
);
2233 mdr
->session
= session
;
2234 session
->requests
.push_back(&mdr
->item_session_request
);
2238 mdr
->has_completed
= true;
2240 // process embedded cap releases?
2241 // (only if NOT replay!)
2242 if (!req
->releases
.empty() && req
->get_source().is_client() && !req
->is_replay()) {
2243 client_t client
= req
->get_source().num();
2244 for (const auto &r
: req
->releases
) {
2245 mds
->locker
->process_request_cap_release(mdr
, client
, r
.item
, r
.dname
);
2247 req
->releases
.clear();
2250 dispatch_client_request(mdr
);
// React to a new OSDMap: cache whether the metadata pool is flagged FULL
// in `is_full` so request dispatch can reject space-consuming ops.
2254 void Server::handle_osd_map()
2256 /* Note that we check the OSDMAP_FULL flag directly rather than
2257 * using osdmap_full_flag(), because we want to know "is the flag set"
2258 * rather than "does the flag apply to us?" */
2259 mds
->objecter
->with_osdmap([this](const OSDMap
& o
) {
2260 auto pi
= o
.get_pg_pool(mds
->mdsmap
->get_metadata_pool());
// pi may be null if the pool is not in this osdmap epoch; treat as not full
2261 is_full
= pi
&& pi
->has_flag(pg_pool_t::FLAG_FULL
);
2262 dout(7) << __func__
<< ": full = " << is_full
<< " epoch = "
2263 << o
.get_epoch() << dendl
;
// Dispatch a registered client MDRequest to the per-op handler. Rejects
// writes on a read-only FS, propagates slave errors, and returns ENOSPC for
// space-consuming ops while the metadata pool is full.
// NOTE(review): extraction-mangled; braces, returns/breaks and the
// `if (is_full)` guard line were dropped by the extraction tool.
2267 void Server::dispatch_client_request(MDRequestRef
& mdr
)
2269 // we shouldn't be waiting on anyone.
2270 ceph_assert(!mdr
->has_more() || mdr
->more()->waiting_on_slave
.empty());
2273 dout(10) << "request " << *mdr
<< " was killed" << dendl
;
2275 } else if (mdr
->aborted
) {
2276 mdr
->aborted
= false;
2277 mdcache
->request_kill(mdr
);
2281 const MClientRequest::const_ref
&req
= mdr
->client_request
;
2283 if (logger
) logger
->inc(l_mdss_dispatch_client_request
);
2285 dout(7) << "dispatch_client_request " << *req
<< dendl
;
2287 if (req
->may_write()) {
2288 if (mdcache
->is_readonly()) {
2289 dout(10) << " read-only FS" << dendl
;
2290 respond_to_request(mdr
, -EROFS
);
2293 if (mdr
->has_more() && mdr
->more()->slave_error
) {
2294 dout(10) << " got error from slaves" << dendl
;
2295 respond_to_request(mdr
, mdr
->more()->slave_error
);
// ops that consume metadata-pool space are refused while the pool is full
// NOTE(review): CEPH_MDS_OP_SETLAYOUT is tested twice in this condition
// (see 2301 and 2303) — the duplicate is harmless but one of them was
// probably meant to be a different op; confirm against upstream.
2301 if (req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
2302 req
->get_op() == CEPH_MDS_OP_SETDIRLAYOUT
||
2303 req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
2304 req
->get_op() == CEPH_MDS_OP_RMXATTR
||
2305 req
->get_op() == CEPH_MDS_OP_SETXATTR
||
2306 req
->get_op() == CEPH_MDS_OP_CREATE
||
2307 req
->get_op() == CEPH_MDS_OP_SYMLINK
||
2308 req
->get_op() == CEPH_MDS_OP_MKSNAP
||
2309 ((req
->get_op() == CEPH_MDS_OP_LINK
||
2310 req
->get_op() == CEPH_MDS_OP_RENAME
) &&
2311 (!mdr
->has_more() || mdr
->more()->witnessed
.empty())) // haven't started slave request
2314 dout(20) << __func__
<< ": full, responding ENOSPC to op " << ceph_mds_op_name(req
->get_op()) << dendl
;
2315 respond_to_request(mdr
, -ENOSPC
);
2318 dout(20) << __func__
<< ": full, permitting op " << ceph_mds_op_name(req
->get_op()) << dendl
;
// per-op dispatch
2322 switch (req
->get_op()) {
2323 case CEPH_MDS_OP_LOOKUPHASH
:
2324 case CEPH_MDS_OP_LOOKUPINO
:
2325 handle_client_lookup_ino(mdr
, false, false);
2327 case CEPH_MDS_OP_LOOKUPPARENT
:
2328 handle_client_lookup_ino(mdr
, true, false);
2330 case CEPH_MDS_OP_LOOKUPNAME
:
2331 handle_client_lookup_ino(mdr
, false, true);
2335 case CEPH_MDS_OP_LOOKUP
:
2336 handle_client_getattr(mdr
, true);
2339 case CEPH_MDS_OP_LOOKUPSNAP
:
2340 // lookupsnap does not reference a CDentry; treat it as a getattr
2341 case CEPH_MDS_OP_GETATTR
:
2342 handle_client_getattr(mdr
, false);
2345 case CEPH_MDS_OP_SETATTR
:
2346 handle_client_setattr(mdr
);
2348 case CEPH_MDS_OP_SETLAYOUT
:
2349 handle_client_setlayout(mdr
);
2351 case CEPH_MDS_OP_SETDIRLAYOUT
:
2352 handle_client_setdirlayout(mdr
);
2354 case CEPH_MDS_OP_SETXATTR
:
2355 handle_client_setxattr(mdr
);
2357 case CEPH_MDS_OP_RMXATTR
:
2358 handle_client_removexattr(mdr
);
2361 case CEPH_MDS_OP_READDIR
:
2362 handle_client_readdir(mdr
);
2365 case CEPH_MDS_OP_SETFILELOCK
:
2366 handle_client_file_setlock(mdr
);
2369 case CEPH_MDS_OP_GETFILELOCK
:
2370 handle_client_file_readlock(mdr
);
2374 case CEPH_MDS_OP_CREATE
:
2375 if (mdr
->has_completed
)
2376 handle_client_open(mdr
); // already created.. just open
2378 handle_client_openc(mdr
);
2381 case CEPH_MDS_OP_OPEN
:
2382 handle_client_open(mdr
);
2387 case CEPH_MDS_OP_MKNOD
:
2388 handle_client_mknod(mdr
);
2390 case CEPH_MDS_OP_LINK
:
2391 handle_client_link(mdr
);
2393 case CEPH_MDS_OP_UNLINK
:
2394 case CEPH_MDS_OP_RMDIR
:
2395 handle_client_unlink(mdr
);
2397 case CEPH_MDS_OP_RENAME
:
2398 handle_client_rename(mdr
);
2400 case CEPH_MDS_OP_MKDIR
:
2401 handle_client_mkdir(mdr
);
2403 case CEPH_MDS_OP_SYMLINK
:
2404 handle_client_symlink(mdr
);
2409 case CEPH_MDS_OP_LSSNAP
:
2410 handle_client_lssnap(mdr
);
2412 case CEPH_MDS_OP_MKSNAP
:
2413 handle_client_mksnap(mdr
);
2415 case CEPH_MDS_OP_RMSNAP
:
2416 handle_client_rmsnap(mdr
);
2418 case CEPH_MDS_OP_RENAMESNAP
:
2419 handle_client_renamesnap(mdr
);
2423 dout(1) << " unknown client op " << req
->get_op() << dendl
;
2424 respond_to_request(mdr
, -EOPNOTSUPP
);
2429 // ---------------------------------------
// Handle a slave-request message from another MDS rank (multi-MDS
// distributed operations): replies are forwarded to
// handle_slave_request_reply(); new/continuing slave requests are matched
// against (or registered in) the cache by reqid/attempt, then dispatched.
// NOTE(review): extraction-mangled; braces, returns and some declarations
// (e.g. the MDRequestRef mdr local) were dropped by the extraction tool.
2432 void Server::handle_slave_request(const MMDSSlaveRequest::const_ref
&m
)
2434 dout(4) << "handle_slave_request " << m
->get_reqid() << " from " << m
->get_source() << dendl
;
2435 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2437 if (logger
) logger
->inc(l_mdss_handle_slave_request
);
// reply messages take a separate path
2441 return handle_slave_request_reply(m
);
2443 // the purpose of rename notify is enforcing causal message ordering. making sure
2444 // bystanders have received all messages from rename srcdn's auth MDS.
2445 if (m
->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY
) {
2446 auto reply
= MMDSSlaveRequest::create(m
->get_reqid(), m
->get_attempt(), MMDSSlaveRequest::OP_RENAMENOTIFYACK
);
2447 mds
->send_message(reply
, m
->get_connection());
// replicate any stray dentry the master bundled with the request
2451 CDentry
*straydn
= NULL
;
2452 if (m
->straybl
.length() > 0) {
2453 straydn
= mdcache
->add_replica_stray(m
->straybl
, from
);
2454 ceph_assert(straydn
);
2458 // am i a new slave?
2460 if (mdcache
->have_request(m
->get_reqid())) {
2462 mdr
= mdcache
->request_get(m
->get_reqid());
2464 // is my request newer?
2465 if (mdr
->attempt
> m
->get_attempt()) {
2466 dout(10) << "local request " << *mdr
<< " attempt " << mdr
->attempt
<< " > " << m
->get_attempt()
2467 << ", dropping " << *m
<< dendl
;
// stale local attempt: close it out and start over with the new attempt
2472 if (mdr
->attempt
< m
->get_attempt()) {
2473 // mine is old, close it out
2474 dout(10) << "local request " << *mdr
<< " attempt " << mdr
->attempt
<< " < " << m
->get_attempt()
2475 << ", closing out" << dendl
;
2476 mdcache
->request_finish(mdr
);
2478 } else if (mdr
->slave_to_mds
!= from
) {
2479 dout(10) << "local request " << *mdr
<< " not slave to mds." << from
<< dendl
;
// abort: only safe to tear down if no prepare has been journaled yet
2483 if (m
->get_op() == MMDSSlaveRequest::OP_FINISH
&& m
->is_abort()) {
2484 mdr
->aborted
= true;
2485 if (mdr
->slave_request
) {
2486 // only abort on-going xlock, wrlock and auth pin
2487 ceph_assert(!mdr
->slave_did_prepare());
2489 mdcache
->request_finish(mdr
);
// no local request: either a lost-race FINISH or a brand-new slave request
2496 if (m
->get_op() == MMDSSlaveRequest::OP_FINISH
) {
2497 dout(10) << "missing slave request for " << m
->get_reqid()
2498 << " OP_FINISH, must have lost race with a forward" << dendl
;
2501 mdr
= mdcache
->request_start_slave(m
->get_reqid(), m
->get_attempt(), m
);
2502 mdr
->set_op_stamp(m
->op_stamp
);
2504 ceph_assert(mdr
->slave_request
== 0); // only one at a time, please!
2508 mdr
->straydn
= straydn
;
// defer until this rank is in a state where it can serve slave requests
2511 if (!mds
->is_clientreplay() && !mds
->is_active() && !mds
->is_stopping()) {
2512 dout(3) << "not clientreplay|active yet, waiting" << dendl
;
2513 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
2515 } else if (mds
->is_clientreplay() && !mds
->mdsmap
->is_clientreplay(from
) &&
2516 mdr
->locks
.empty()) {
2517 dout(3) << "not active yet, waiting" << dendl
;
2518 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
2522 mdr
->reset_slave_request(m
);
2524 dispatch_slave_request(mdr
);
// Handle an ack/reply from a slave MDS for a master request this rank owns:
// records remote xlocks/wrlocks, routes prep-acks to the per-op handlers,
// and re-dispatches the master request when its slave waits are satisfied.
// NOTE(review): extraction-mangled; braces, returns/breaks were dropped by
// the extraction tool.
2527 void Server::handle_slave_request_reply(const MMDSSlaveRequest::const_ref
&m
)
2529 mds_rank_t from
= mds_rank_t(m
->get_source().num());
// if this rank isn't serving yet, only replies for known uncommitted
// masters are worth waiting for; everything else is ignored
2531 if (!mds
->is_clientreplay() && !mds
->is_active() && !mds
->is_stopping()) {
2532 metareqid_t r
= m
->get_reqid();
2533 if (!mdcache
->have_uncommitted_master(r
, from
)) {
2534 dout(10) << "handle_slave_request_reply ignoring slave reply from mds."
2535 << from
<< " reqid " << r
<< dendl
;
2538 dout(3) << "not clientreplay|active yet, waiting" << dendl
;
2539 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
2543 if (m
->get_op() == MMDSSlaveRequest::OP_COMMITTED
) {
2544 metareqid_t r
= m
->get_reqid();
2545 mdcache
->committed_master_slave(r
, from
);
2549 MDRequestRef mdr
= mdcache
->request_get(m
->get_reqid());
// stale ack from a previous attempt of the same request
2550 if (m
->get_attempt() != mdr
->attempt
) {
2551 dout(10) << "handle_slave_request_reply " << *mdr
<< " ignoring reply from other attempt "
2552 << m
->get_attempt() << dendl
;
2556 switch (m
->get_op()) {
2557 case MMDSSlaveRequest::OP_XLOCKACK
:
2559 // identify lock, master request
2560 SimpleLock
*lock
= mds
->locker
->get_lock(m
->get_lock_type(),
2561 m
->get_object_info());
2562 mdr
->more()->slaves
.insert(from
);
2563 lock
->decode_locked_state(m
->get_lock_data());
2564 dout(10) << "got remote xlock on " << *lock
<< " on " << *lock
->get_parent() << dendl
;
// record the remote xlock on the master request and resume it
2565 mdr
->locks
.emplace_hint(mdr
->locks
.end(), lock
, MutationImpl::LockOp::XLOCK
);
2566 mdr
->finish_locking(lock
);
2567 lock
->get_xlock(mdr
, mdr
->get_client());
2569 ceph_assert(mdr
->more()->waiting_on_slave
.count(from
));
2570 mdr
->more()->waiting_on_slave
.erase(from
);
2571 ceph_assert(mdr
->more()->waiting_on_slave
.empty());
2572 mdcache
->dispatch_request(mdr
);
2576 case MMDSSlaveRequest::OP_WRLOCKACK
:
2578 // identify lock, master request
2579 SimpleLock
*lock
= mds
->locker
->get_lock(m
->get_lock_type(),
2580 m
->get_object_info());
2581 mdr
->more()->slaves
.insert(from
);
2582 dout(10) << "got remote wrlock on " << *lock
<< " on " << *lock
->get_parent() << dendl
;
2583 auto it
= mdr
->locks
.emplace_hint(mdr
->locks
.end(),
2584 lock
, MutationImpl::LockOp::REMOTE_WRLOCK
, from
);
2585 ceph_assert(it
->is_remote_wrlock());
2586 ceph_assert(it
->wrlock_target
== from
);
2588 mdr
->finish_locking(lock
);
2590 ceph_assert(mdr
->more()->waiting_on_slave
.count(from
));
2591 mdr
->more()->waiting_on_slave
.erase(from
);
2592 ceph_assert(mdr
->more()->waiting_on_slave
.empty());
2593 mdcache
->dispatch_request(mdr
);
// per-operation prepare/notify acks
2597 case MMDSSlaveRequest::OP_AUTHPINACK
:
2598 handle_slave_auth_pin_ack(mdr
, m
);
2601 case MMDSSlaveRequest::OP_LINKPREPACK
:
2602 handle_slave_link_prep_ack(mdr
, m
);
2605 case MMDSSlaveRequest::OP_RMDIRPREPACK
:
2606 handle_slave_rmdir_prep_ack(mdr
, m
);
2609 case MMDSSlaveRequest::OP_RENAMEPREPACK
:
2610 handle_slave_rename_prep_ack(mdr
, m
);
2613 case MMDSSlaveRequest::OP_RENAMENOTIFYACK
:
2614 handle_slave_rename_notify_ack(mdr
, m
);
// Execute a slave request on this rank on behalf of a master MDS:
// take/release remote locks, auth-pin objects, run per-op prepare handlers,
// or finish the request, replying to the master as required.
// NOTE(review): extraction-mangled; braces, breaks/returns and the outer
// switch/if lines were dropped by the extraction tool.
2622 void Server::dispatch_slave_request(MDRequestRef
& mdr
)
2624 dout(7) << "dispatch_slave_request " << *mdr
<< " " << *mdr
->slave_request
<< dendl
;
2627 dout(7) << " abort flag set, finishing" << dendl
;
2628 mdcache
->request_finish(mdr
);
2632 if (logger
) logger
->inc(l_mdss_dispatch_slave_request
);
2634 int op
= mdr
->slave_request
->get_op();
2636 case MMDSSlaveRequest::OP_XLOCK
:
2637 case MMDSSlaveRequest::OP_WRLOCK
:
// locate the lock object named by (lock type, object info)
2640 SimpleLock
*lock
= mds
->locker
->get_lock(mdr
->slave_request
->get_lock_type(),
2641 mdr
->slave_request
->get_object_info());
2644 dout(10) << "don't have object, dropping" << dendl
;
2645 ceph_abort(); // can this happen, if we auth pinned properly.
2647 if (op
== MMDSSlaveRequest::OP_XLOCK
&& !lock
->get_parent()->is_auth()) {
2648 dout(10) << "not auth for remote xlock attempt, dropping on "
2649 << *lock
<< " on " << *lock
->get_parent() << dendl
;
2651 // use acquire_locks so that we get auth_pinning.
2652 MutationImpl::LockOpVec lov
;
// re-add locks we already hold so acquire_locks sees the full set
2653 for (const auto& p
: mdr
->locks
) {
2655 lov
.add_xlock(p
.lock
);
2656 else if (p
.is_wrlock())
2657 lov
.add_wrlock(p
.lock
);
2662 case MMDSSlaveRequest::OP_XLOCK
:
2663 lov
.add_xlock(lock
);
2664 replycode
= MMDSSlaveRequest::OP_XLOCKACK
;
2666 case MMDSSlaveRequest::OP_WRLOCK
:
2667 lov
.add_wrlock(lock
);
2668 replycode
= MMDSSlaveRequest::OP_WRLOCKACK
;
2672 if (!mds
->locker
->acquire_locks(mdr
, lov
))
// ack the master; xlocks additionally carry the locked state
2676 auto r
= MMDSSlaveRequest::create(mdr
->reqid
, mdr
->attempt
, replycode
);
2677 r
->set_lock_type(lock
->get_type());
2678 lock
->get_parent()->set_object_info(r
->get_object_info());
2679 if (replycode
== MMDSSlaveRequest::OP_XLOCKACK
)
2680 lock
->encode_locked_state(r
->get_lock_data());
2681 mds
->send_message(r
, mdr
->slave_request
->get_connection());
2685 mdr
->reset_slave_request();
2689 case MMDSSlaveRequest::OP_UNXLOCK
:
2690 case MMDSSlaveRequest::OP_UNWRLOCK
:
2692 SimpleLock
*lock
= mds
->locker
->get_lock(mdr
->slave_request
->get_lock_type(),
2693 mdr
->slave_request
->get_object_info());
2695 auto it
= mdr
->locks
.find(lock
);
2696 ceph_assert(it
!= mdr
->locks
.end());
2697 bool need_issue
= false;
2699 case MMDSSlaveRequest::OP_UNXLOCK
:
2700 mds
->locker
->xlock_finish(it
, mdr
.get(), &need_issue
);
2702 case MMDSSlaveRequest::OP_UNWRLOCK
:
2703 mds
->locker
->wrlock_finish(it
, mdr
.get(), &need_issue
);
// re-issue caps if releasing the lock unblocked them
2707 mds
->locker
->issue_caps(static_cast<CInode
*>(lock
->get_parent()));
2709 // done. no ack necessary.
2710 mdr
->reset_slave_request();
2714 case MMDSSlaveRequest::OP_DROPLOCKS
:
2715 mds
->locker
->drop_locks(mdr
.get());
2716 mdr
->reset_slave_request();
2719 case MMDSSlaveRequest::OP_AUTHPIN
:
2720 handle_slave_auth_pin(mdr
);
2723 case MMDSSlaveRequest::OP_LINKPREP
:
2724 case MMDSSlaveRequest::OP_UNLINKPREP
:
2725 handle_slave_link_prep(mdr
);
2728 case MMDSSlaveRequest::OP_RMDIRPREP
:
2729 handle_slave_rmdir_prep(mdr
);
2732 case MMDSSlaveRequest::OP_RENAMEPREP
:
2733 handle_slave_rename_prep(mdr
);
2736 case MMDSSlaveRequest::OP_FINISH
:
2737 // information about rename imported caps
2738 if (mdr
->slave_request
->inode_export
.length() > 0)
2739 mdr
->more()->inode_import
= mdr
->slave_request
->inode_export
;
2740 // finish off request.
2741 mdcache
->request_finish(mdr
);
// Slave-side OP_AUTHPIN handler: auth-pin the objects the master asked for
// (optionally freezing one inode for rename), then ack back the list of
// pins actually taken, or an error (EWOULDBLOCK/EROFS).
// NOTE(review): extraction-mangled; braces, returns and loop tails were
// dropped by the extraction tool.
2749 void Server::handle_slave_auth_pin(MDRequestRef
& mdr
)
2751 dout(10) << "handle_slave_auth_pin " << *mdr
<< dendl
;
2753 // build list of objects
2754 list
<MDSCacheObject
*> objects
;
2755 CInode
*auth_pin_freeze
= NULL
;
2756 bool fail
= false, wouldblock
= false, readonly
= false;
2758 if (mdcache
->is_readonly()) {
2759 dout(10) << " read-only FS" << dendl
;
// resolve each object-info handle the master sent into a cache object
2765 for (const auto &oi
: mdr
->slave_request
->get_authpins()) {
2766 MDSCacheObject
*object
= mdcache
->get_object(oi
);
2768 dout(10) << " don't have " << oi
<< dendl
;
2773 objects
.push_back(object
);
2774 if (oi
== mdr
->slave_request
->get_authpin_freeze())
2775 auth_pin_freeze
= static_cast<CInode
*>(object
);
2779 // can we auth pin them?
2781 for (list
<MDSCacheObject
*>::iterator p
= objects
.begin();
2784 if (!(*p
)->is_auth()) {
2785 dout(10) << " not auth for " << **p
<< dendl
;
2789 if (mdr
->is_auth_pinned(*p
))
2791 if (!mdr
->can_auth_pin(*p
)) {
// nonblocking requests fail fast instead of waiting for unfreeze
2792 if (mdr
->slave_request
->is_nonblock()) {
2793 dout(10) << " can't auth_pin (freezing?) " << **p
<< " nonblocking" << dendl
;
2799 dout(10) << " waiting for authpinnable on " << **p
<< dendl
;
2800 (*p
)->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
2801 mdr
->drop_local_auth_pins();
2803 mds
->locker
->notify_freeze_waiter(*p
);
2811 mdr
->drop_local_auth_pins(); // just in case
2813 /* freeze authpin wrong inode */
2814 if (mdr
->has_more() && mdr
->more()->is_freeze_authpin
&&
2815 mdr
->more()->rename_inode
!= auth_pin_freeze
)
2816 mdr
->unfreeze_auth_pin(true);
2818 /* handle_slave_rename_prep() call freeze_inode() to wait for all other operations
2819 * on the source inode to complete. This happens after all locks for the rename
2820 * operation are acquired. But to acquire locks, we need auth pin locks' parent
2821 * objects first. So there is an ABBA deadlock if someone auth pins the source inode
2822 * after locks are acquired and before Server::handle_slave_rename_prep() is called.
2823 * The solution is freeze the inode and prevent other MDRequests from getting new
2826 if (auth_pin_freeze
) {
2827 dout(10) << " freezing auth pin on " << *auth_pin_freeze
<< dendl
;
2828 if (!mdr
->freeze_auth_pin(auth_pin_freeze
)) {
2829 auth_pin_freeze
->add_waiter(CInode::WAIT_FROZEN
, new C_MDS_RetryRequest(mdcache
, mdr
));
2830 mds
->mdlog
->flush();
// take the pins
2834 for (list
<MDSCacheObject
*>::iterator p
= objects
.begin();
2837 dout(10) << "auth_pinning " << **p
<< dendl
;
// ack back what we actually pinned
2843 auto reply
= MMDSSlaveRequest::create(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_AUTHPINACK
);
2845 // return list of my auth_pins (if any)
2846 for (const auto &p
: mdr
->auth_pins
) {
2847 MDSCacheObjectInfo info
;
2848 p
->set_object_info(info
);
2849 reply
->get_authpins().push_back(info
);
2850 if (p
== (MDSCacheObject
*)auth_pin_freeze
)
2851 auth_pin_freeze
->set_object_info(reply
->get_authpin_freeze());
2855 reply
->mark_error_wouldblock();
2857 reply
->mark_error_rofs();
2859 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
2861 // clean up this request
2862 mdr
->reset_slave_request();
// Master-side handler for a slave's OP_AUTHPINACK: reconcile the master's
// record of remote auth pins with what the slave reports (adding new pins,
// dropping released ones), record any slave error, and re-dispatch the
// request once no more slave acks are outstanding.
// NOTE(review): extraction-mangled; some braces/else arms were dropped by
// the extraction tool.
2866 void Server::handle_slave_auth_pin_ack(MDRequestRef
& mdr
, const MMDSSlaveRequest::const_ref
&ack
)
2868 dout(10) << "handle_slave_auth_pin_ack on " << *mdr
<< " " << *ack
<< dendl
;
2869 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
// record the pins the slave confirmed
2872 set
<MDSCacheObject
*> pinned
;
2873 for (const auto &oi
: ack
->get_authpins()) {
2874 MDSCacheObject
*object
= mdcache
->get_object(oi
);
2875 ceph_assert(object
); // we pinned it
2876 dout(10) << " remote has pinned " << *object
<< dendl
;
2877 if (!mdr
->is_auth_pinned(object
))
2878 mdr
->remote_auth_pins
[object
] = from
;
2879 if (oi
== ack
->get_authpin_freeze())
2880 mdr
->set_remote_frozen_auth_pin(static_cast<CInode
*>(object
));
2881 pinned
.insert(object
);
2884 // removed frozen auth pin ?
2885 if (mdr
->more()->is_remote_frozen_authpin
&&
2886 ack
->get_authpin_freeze() == MDSCacheObjectInfo()) {
2887 auto p
= mdr
->remote_auth_pins
.find(mdr
->more()->rename_inode
);
2888 ceph_assert(p
!= mdr
->remote_auth_pins
.end());
2889 if (p
->second
== from
) {
2890 mdr
->more()->is_remote_frozen_authpin
= false;
2894 // removed auth pins?
2895 auto p
= mdr
->remote_auth_pins
.begin();
2896 while (p
!= mdr
->remote_auth_pins
.end()) {
2897 MDSCacheObject
* object
= p
->first
;
// anything this slave held but did not re-confirm has been unpinned
2898 if (p
->second
== from
&& pinned
.count(object
) == 0) {
2899 dout(10) << " remote has unpinned " << *object
<< dendl
;
2900 mdr
->remote_auth_pins
.erase(p
++);
// propagate slave-side errors; abort the master request
2906 if (ack
->is_error_rofs()) {
2907 mdr
->more()->slave_error
= -EROFS
;
2908 mdr
->aborted
= true;
2909 } else if (ack
->is_error_wouldblock()) {
2910 mdr
->more()->slave_error
= -EWOULDBLOCK
;
2911 mdr
->aborted
= true;
2915 mdr
->more()->slaves
.insert(from
);
2917 // clear from waiting list
2918 ceph_assert(mdr
->more()->waiting_on_slave
.count(from
));
2919 mdr
->more()->waiting_on_slave
.erase(from
);
2922 if (mdr
->more()->waiting_on_slave
.empty())
2923 mdcache
->dispatch_request(mdr
);
2925 dout(10) << "still waiting on slaves " << mdr
->more()->waiting_on_slave
<< dendl
;
2929 // ---------------------------------------
2934 * check whether we are permitted to complete a request
2936 * Check whether we have permission to perform the operation specified
2937 * by mask on the given inode, based on the capability in the mdr's
// Check whether the requesting client's session capability permits the
// operation (mask) on inode `in`, using the request's caller uid/gid(s)
// and the setattr uid/gid from the request head. On failure, responds to
// the request with the error and (presumably) returns false.
// NOTE(review): extraction-mangled; the call's leading arguments (in, mask)
// and the surrounding if/return lines were dropped by the extraction tool.
2940 bool Server::check_access(MDRequestRef
& mdr
, CInode
*in
, unsigned mask
)
2943 int r
= mdr
->session
->check_access(
2945 mdr
->client_request
->get_caller_uid(),
2946 mdr
->client_request
->get_caller_gid(),
2947 &mdr
->client_request
->get_caller_gid_list(),
2948 mdr
->client_request
->head
.args
.setattr
.uid
,
2949 mdr
->client_request
->head
.args
.setattr
.gid
);
2951 respond_to_request(mdr
, r
);
2959 * check whether fragment has reached maximum size
// Check whether dirfrag `in` still has room for another entry; if it has
// reached mds_bal_fragment_size_max, respond with ENOSPC (and presumably
// return false so the caller bails out).
2962 bool Server::check_fragment_space(MDRequestRef
&mdr
, CDir
*in
)
2964 const auto size
= in
->get_frag_size();
2965 if (size
>= g_conf()->mds_bal_fragment_size_max
) {
2966 dout(10) << "fragment " << *in
<< " size exceeds " << g_conf()->mds_bal_fragment_size_max
<< " (ENOSPC)" << dendl
;
2967 respond_to_request(mdr
, -ENOSPC
);
2975 /** validate_dentry_dir
2977 * verify that the dir exists and would own the dname.
2978 * do not check if the dentry exists.
// Verify that `diri` is a directory and return the (auth, unfrozen) dirfrag
// that would own `dname`. Responds ENOTDIR / waits on frozen dirs and
// (presumably) returns NULL in those cases so the caller retries or bails.
2980 CDir
*Server::validate_dentry_dir(MDRequestRef
& mdr
, CInode
*diri
, std::string_view dname
)
2982 // make sure parent is a dir?
2983 if (!diri
->is_dir()) {
2984 dout(7) << "validate_dentry_dir: not a dir" << dendl
;
2985 respond_to_request(mdr
, -ENOTDIR
);
// pick the fragment that hashes dname and open it on the auth
2990 frag_t fg
= diri
->pick_dirfrag(dname
);
2991 CDir
*dir
= try_open_auth_dirfrag(diri
, fg
, mdr
);
// frozen dir: park the request until it thaws
2996 if (dir
->is_frozen()) {
2997 dout(7) << "dir is frozen " << *dir
<< dendl
;
2998 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
3006 /** prepare_null_dentry
3007 * prepare a null (or existing) dentry in given dir.
3008 * wait for any dn lock.
// Return a null (or, with okexist, an existing) dentry for `dname` in `dir`,
// creating it if absent. Waits on xlocked dentries and incomplete dirs, and
// responds EEXIST when the name exists and okexist is false.
// NOTE(review): extraction-mangled; if/else/return lines around the lookup
// result were dropped by the extraction tool.
3010 CDentry
* Server::prepare_null_dentry(MDRequestRef
& mdr
, CDir
*dir
, std::string_view dname
, bool okexist
)
3012 dout(10) << "prepare_null_dentry " << dname
<< " in " << *dir
<< dendl
;
3013 ceph_assert(dir
->is_auth());
3015 client_t client
= mdr
->get_client();
3017 // does it already exist?
3018 CDentry
*dn
= dir
->lookup(dname
);
3021 if (dn->lock.is_xlocked_by_other(mdr)) {
3022 dout(10) << "waiting on xlocked dentry " << *dn << dendl;
3023 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
3027 if (!dn
->get_linkage(client
, mdr
)->is_null()) {
3028 // name already exists
3029 dout(10) << "dentry " << dname
<< " exists in " << *dir
<< dendl
;
3031 respond_to_request(mdr
, -EEXIST
);
// bump the dentry's first to the next snapshot seq so new state is
// versioned after any existing snapshot
3035 snapid_t next_snap
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
3036 dn
->first
= std::max(dn
->first
, next_snap
);
3041 // make sure dir is complete
3042 if (!dir
->is_complete() && (!dir
->has_bloom() || dir
->is_in_bloom(dname
))) {
3043 dout(7) << " incomplete dir contents for " << *dir
<< ", fetching" << dendl
;
3044 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
));
// create the null dentry
3049 dn
= dir
->add_null_dentry(dname
, mdcache
->get_global_snaprealm()->get_newest_seq() + 1);
3051 dout(10) << "prepare_null_dentry added " << *dn
<< dendl
;
// Return (creating if needed) the stray dentry that will hold `in` once it
// is unlinked, caching it on the MDRequest. Re-resolves if a previously
// cached stray dentry no longer matches the expected stray name.
// NOTE(review): extraction-mangled; the straydname declaration, if/return
// lines and the straydir null-check were dropped by the extraction tool.
3055 CDentry
* Server::prepare_stray_dentry(MDRequestRef
& mdr
, CInode
*in
)
3057 CDentry
*straydn
= mdr
->straydn
;
// compute the expected stray dentry name for this inode
3060 in
->name_stray_dentry(straydname
);
3061 if (straydn
->get_name() == straydname
)
// cached dentry is stale: unpin it and start over (must not have locked yet)
3064 ceph_assert(!mdr
->done_locking
);
3065 mdr
->unpin(straydn
);
3068 CDir
*straydir
= mdcache
->get_stray_dir(in
);
// replayed requests skip the fragment-space check
3070 if (!mdr
->client_request
->is_replay() &&
3071 !check_fragment_space(mdr
, straydir
))
3074 straydn
= mdcache
->get_or_create_stray_dentry(in
);
3075 mdr
->straydn
= straydn
;
3080 /** prepare_new_inode
3082 * create a new inode. set c/m/atime. hit dir pop.
3084 CInode
* Server::prepare_new_inode(MDRequestRef
& mdr
, CDir
*dir
, inodeno_t useino
, unsigned mode
,
3085 file_layout_t
*layout
)
3087 CInode
*in
= new CInode(mdcache
);
3089 // Server::prepare_force_open_sessions() can re-open session in closing
3090 // state. In that corner case, session's prealloc_inos are being freed.
3091 // To simplify the code, we disallow using/refilling session's prealloc_ino
3092 // while session is opening.
3093 bool allow_prealloc_inos
= !mdr
->session
->is_opening();
3096 if (allow_prealloc_inos
&&
3097 mdr
->session
->info
.prealloc_inos
.size()) {
3098 mdr
->used_prealloc_ino
=
3099 in
->inode
.ino
= mdr
->session
->take_ino(useino
); // prealloc -> used
3100 mds
->sessionmap
.mark_projected(mdr
->session
);
3102 dout(10) << "prepare_new_inode used_prealloc " << mdr
->used_prealloc_ino
3103 << " (" << mdr
->session
->info
.prealloc_inos
3104 << ", " << mdr
->session
->info
.prealloc_inos
.size() << " left)"
3108 in
->inode
.ino
= mds
->inotable
->project_alloc_id();
3109 dout(10) << "prepare_new_inode alloc " << mdr
->alloc_ino
<< dendl
;
3112 if (useino
&& useino
!= in
->inode
.ino
) {
3113 dout(0) << "WARNING: client specified " << useino
<< " and i allocated " << in
->inode
.ino
<< dendl
;
3114 mds
->clog
->error() << mdr
->client_request
->get_source()
3115 << " specified ino " << useino
3116 << " but mds." << mds
->get_nodeid() << " allocated " << in
->inode
.ino
;
3117 //ceph_abort(); // just for now.
3120 if (allow_prealloc_inos
&&
3121 mdr
->session
->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos
/ 2) {
3122 int need
= g_conf()->mds_client_prealloc_inos
- mdr
->session
->get_num_projected_prealloc_inos();
3123 mds
->inotable
->project_alloc_ids(mdr
->prealloc_inos
, need
);
3124 ceph_assert(mdr
->prealloc_inos
.size()); // or else fix projected increment semantics
3125 mdr
->session
->pending_prealloc_inos
.insert(mdr
->prealloc_inos
);
3126 mds
->sessionmap
.mark_projected(mdr
->session
);
3127 dout(10) << "prepare_new_inode prealloc " << mdr
->prealloc_inos
<< dendl
;
3130 in
->inode
.version
= 1;
3131 in
->inode
.xattr_version
= 1;
3132 in
->inode
.nlink
= 1; // FIXME
3134 in
->inode
.mode
= mode
;
3136 memset(&in
->inode
.dir_layout
, 0, sizeof(in
->inode
.dir_layout
));
3137 if (in
->inode
.is_dir()) {
3138 in
->inode
.dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
3139 } else if (layout
) {
3140 in
->inode
.layout
= *layout
;
3142 in
->inode
.layout
= mdcache
->default_file_layout
;
3145 in
->inode
.truncate_size
= -1ull; // not truncated, yet!
3146 in
->inode
.truncate_seq
= 1; /* starting with 1, 0 is kept for no-truncation logic */
3148 CInode
*diri
= dir
->get_inode();
3150 dout(10) << oct
<< " dir mode 0" << diri
->inode
.mode
<< " new mode 0" << mode
<< dec
<< dendl
;
3152 if (diri
->inode
.mode
& S_ISGID
) {
3153 dout(10) << " dir is sticky" << dendl
;
3154 in
->inode
.gid
= diri
->inode
.gid
;
3155 if (S_ISDIR(mode
)) {
3156 dout(10) << " new dir also sticky" << dendl
;
3157 in
->inode
.mode
|= S_ISGID
;
3160 in
->inode
.gid
= mdr
->client_request
->get_caller_gid();
3162 in
->inode
.uid
= mdr
->client_request
->get_caller_uid();
3164 in
->inode
.btime
= in
->inode
.ctime
= in
->inode
.mtime
= in
->inode
.atime
=
3165 mdr
->get_op_stamp();
3167 in
->inode
.change_attr
= 0;
3169 const MClientRequest::const_ref
&req
= mdr
->client_request
;
3170 if (req
->get_data().length()) {
3171 auto p
= req
->get_data().cbegin();
3173 // xattrs on new inode?
3174 CInode::mempool_xattr_map xattrs
;
3176 for (const auto &p
: xattrs
) {
3177 dout(10) << "prepare_new_inode setting xattr " << p
.first
<< dendl
;
3178 auto em
= in
->xattrs
.emplace(std::piecewise_construct
, std::forward_as_tuple(p
.first
), std::forward_as_tuple(p
.second
));
3180 em
.first
->second
= p
.second
;
3184 if (!mds
->mdsmap
->get_inline_data_enabled() ||
3185 !mdr
->session
->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
))
3186 in
->inode
.inline_data
.version
= CEPH_INLINE_NONE
;
3188 mdcache
->add_inode(in
); // add
3189 dout(10) << "prepare_new_inode " << *in
<< dendl
;
3193 void Server::journal_allocated_inos(MDRequestRef
& mdr
, EMetaBlob
*blob
)
3195 dout(20) << "journal_allocated_inos sessionmapv " << mds
->sessionmap
.get_projected()
3196 << " inotablev " << mds
->inotable
->get_projected_version()
3198 blob
->set_ino_alloc(mdr
->alloc_ino
,
3199 mdr
->used_prealloc_ino
,
3201 mdr
->client_request
->get_source(),
3202 mds
->sessionmap
.get_projected(),
3203 mds
->inotable
->get_projected_version());
3206 void Server::apply_allocated_inos(MDRequestRef
& mdr
, Session
*session
)
3208 dout(10) << "apply_allocated_inos " << mdr
->alloc_ino
3209 << " / " << mdr
->prealloc_inos
3210 << " / " << mdr
->used_prealloc_ino
<< dendl
;
3212 if (mdr
->alloc_ino
) {
3213 mds
->inotable
->apply_alloc_id(mdr
->alloc_ino
);
3215 if (mdr
->prealloc_inos
.size()) {
3216 ceph_assert(session
);
3217 session
->pending_prealloc_inos
.subtract(mdr
->prealloc_inos
);
3218 session
->info
.prealloc_inos
.insert(mdr
->prealloc_inos
);
3219 mds
->sessionmap
.mark_dirty(session
, !mdr
->used_prealloc_ino
);
3220 mds
->inotable
->apply_alloc_ids(mdr
->prealloc_inos
);
3222 if (mdr
->used_prealloc_ino
) {
3223 ceph_assert(session
);
3224 session
->info
.used_inos
.erase(mdr
->used_prealloc_ino
);
3225 mds
->sessionmap
.mark_dirty(session
);
3229 class C_MDS_TryFindInode
: public ServerContext
{
3232 C_MDS_TryFindInode(Server
*s
, MDRequestRef
& r
) : ServerContext(s
), mdr(r
) {}
3233 void finish(int r
) override
{
3234 if (r
== -ESTALE
) // :( find_ino_peers failed
3235 server
->respond_to_request(mdr
, r
);
3237 server
->dispatch_client_request(mdr
);
3241 class CF_MDS_MDRContextFactory
: public MDSContextFactory
{
3243 CF_MDS_MDRContextFactory(MDCache
*cache
, MDRequestRef
&mdr
) : cache(cache
), mdr(mdr
) {}
3244 MDSContext
*build() {
3245 return new C_MDS_RetryRequest(cache
, mdr
);
3252 CDir
*Server::traverse_to_auth_dir(MDRequestRef
& mdr
, vector
<CDentry
*> &trace
, filepath refpath
)
3254 // figure parent dir vs dname
3255 if (refpath
.depth() == 0) {
3256 dout(7) << "can't do that to root" << dendl
;
3257 respond_to_request(mdr
, -EINVAL
);
3260 string dname
= refpath
.last_dentry();
3261 refpath
.pop_dentry();
3263 dout(10) << "traverse_to_auth_dir dirpath " << refpath
<< " dname " << dname
<< dendl
;
3265 // traverse to parent dir
3267 CF_MDS_MDRContextFactory
cf(mdcache
, mdr
);
3268 int r
= mdcache
->path_traverse(mdr
, cf
, refpath
, &trace
, &diri
, MDS_TRAVERSE_FORWARD
);
3269 if (r
> 0) return 0; // delayed
3272 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
3273 mdcache
->find_ino_peers(refpath
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
3276 respond_to_request(mdr
, r
);
3280 // is it an auth dir?
3281 CDir
*dir
= validate_dentry_dir(mdr
, diri
, dname
);
3283 return 0; // forwarded or waiting for freeze
3285 dout(10) << "traverse_to_auth_dir " << *dir
<< dendl
;
3289 /* If this returns null, the request has been handled
3290 * as appropriate: forwarded on, or the client's been replied to */
3291 CInode
* Server::rdlock_path_pin_ref(MDRequestRef
& mdr
, int n
,
3292 MutationImpl::LockOpVec
& lov
,
3294 bool no_want_auth
, /* for readdir, who doesn't want auth _even_if_ it's
3296 file_layout_t
**layout
,
3297 bool no_lookup
) // true if we cannot return a null dentry lease
3299 const filepath
& refpath
= n
? mdr
->get_filepath2() : mdr
->get_filepath();
3300 dout(10) << "rdlock_path_pin_ref " << *mdr
<< " " << refpath
<< dendl
;
3302 if (mdr
->done_locking
)
3306 CF_MDS_MDRContextFactory
cf(mdcache
, mdr
);
3307 int r
= mdcache
->path_traverse(mdr
, cf
, refpath
, &mdr
->dn
[n
], &mdr
->in
[n
], MDS_TRAVERSE_FORWARD
);
3309 return NULL
; // delayed
3310 if (r
< 0) { // error
3311 if (r
== -ENOENT
&& n
== 0 && !mdr
->dn
[n
].empty()) {
3313 mdr
->tracedn
= mdr
->dn
[n
].back();
3315 respond_to_request(mdr
, r
);
3316 } else if (r
== -ESTALE
) {
3317 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
3318 MDSContext
*c
= new C_MDS_TryFindInode(this, mdr
);
3319 mdcache
->find_ino_peers(refpath
.get_ino(), c
);
3321 dout(10) << "FAIL on error " << r
<< dendl
;
3322 respond_to_request(mdr
, r
);
3326 CInode
*ref
= mdr
->in
[n
];
3327 dout(10) << "ref is " << *ref
<< dendl
;
3329 // fw to inode auth?
3330 if (mdr
->snapid
!= CEPH_NOSNAP
&& !no_want_auth
)
3334 if (ref
->is_ambiguous_auth()) {
3335 dout(10) << "waiting for single auth on " << *ref
<< dendl
;
3336 ref
->add_waiter(CInode::WAIT_SINGLEAUTH
, new C_MDS_RetryRequest(mdcache
, mdr
));
3339 if (!ref
->is_auth()) {
3340 dout(10) << "fw to auth for " << *ref
<< dendl
;
3341 mdcache
->request_forward(mdr
, ref
->authority().first
);
3346 // do NOT proceed if freezing, as cap release may defer in that case, and
3347 // we could deadlock when we try to lock @ref.
3348 // if we're already auth_pinned, continue; the release has already been processed.
3349 if (ref
->is_frozen() || ref
->is_frozen_auth_pin() ||
3350 (ref
->is_freezing() && !mdr
->is_auth_pinned(ref
))) {
3351 dout(7) << "waiting for !frozen/authpinnable on " << *ref
<< dendl
;
3352 ref
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
3353 /* If we have any auth pins, this will deadlock.
3354 * But the only way to get here if we've already got auth pins
3355 * is because we're on an inode with snapshots that got updated
3356 * between dispatches of this request. So we're going to drop
3357 * our locks and our auth pins and reacquire them later.
3359 * This is safe since we're only in this function when working on
3360 * a single MDS request; otherwise we'd be in
3361 * rdlock_path_xlock_dentry.
3363 mds
->locker
->drop_locks(mdr
.get(), NULL
);
3364 mdr
->drop_local_auth_pins();
3365 if (!mdr
->remote_auth_pins
.empty())
3366 mds
->locker
->notify_freeze_waiter(ref
);
3373 for (int i
=0; i
<(int)mdr
->dn
[n
].size(); i
++)
3374 lov
.add_rdlock(&mdr
->dn
[n
][i
]->lock
);
3376 mds
->locker
->include_snap_rdlocks_wlayout(ref
, lov
, layout
);
3378 mds
->locker
->include_snap_rdlocks(ref
, lov
);
3386 /** rdlock_path_xlock_dentry
3387 * traverse path to the directory that could/would contain dentry.
3388 * make sure i am auth for that dentry, forward as necessary.
3389 * create null dentry in place (or use existing if okexist).
3390 * get rdlocks on traversed dentries, xlock on new dentry.
3392 CDentry
* Server::rdlock_path_xlock_dentry(MDRequestRef
& mdr
, int n
,
3393 MutationImpl::LockOpVec
& lov
,
3394 bool okexist
, bool mustexist
, bool alwaysxlock
,
3395 file_layout_t
**layout
)
3397 const filepath
& refpath
= n
? mdr
->get_filepath2() : mdr
->get_filepath();
3399 dout(10) << "rdlock_path_xlock_dentry " << *mdr
<< " " << refpath
<< dendl
;
3401 client_t client
= mdr
->get_client();
3403 if (mdr
->done_locking
)
3404 return mdr
->dn
[n
].back();
3406 CDir
*dir
= traverse_to_auth_dir(mdr
, mdr
->dn
[n
], refpath
);
3409 CInode
*diri
= dir
->get_inode();
3410 if (!mdr
->reqid
.name
.is_mds()) {
3411 if (diri
->is_system() && !diri
->is_root()) {
3412 respond_to_request(mdr
, -EROFS
);
3416 if (!diri
->is_base() && diri
->get_projected_parent_dir()->inode
->is_stray()) {
3417 respond_to_request(mdr
, -ENOENT
);
3421 // make a null dentry?
3422 std::string_view dname
= refpath
.last_dentry();
3425 dn
= dir
->lookup(dname
);
3427 // make sure dir is complete
3428 if (!dn
&& !dir
->is_complete() &&
3429 (!dir
->has_bloom() || dir
->is_in_bloom(dname
))) {
3430 dout(7) << " incomplete dir contents for " << *dir
<< ", fetching" << dendl
;
3431 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
));
3436 if (dn
&& !dn
->lock
.can_read(client
) && dn
->lock
.get_xlock_by() != mdr
) {
3437 dout(10) << "waiting on xlocked dentry " << *dn
<< dendl
;
3438 dn
->lock
.add_waiter(SimpleLock::WAIT_RD
, new C_MDS_RetryRequest(mdcache
, mdr
));
3443 if (!dn
|| dn
->get_linkage(client
, mdr
)->is_null()) {
3444 dout(7) << "dentry " << dname
<< " dne in " << *dir
<< dendl
;
3445 respond_to_request(mdr
, -ENOENT
);
3449 dn
= prepare_null_dentry(mdr
, dir
, dname
, okexist
);
3454 mdr
->dn
[n
].push_back(dn
);
3455 CDentry::linkage_t
*dnl
= dn
->get_linkage(client
, mdr
);
3456 mdr
->in
[n
] = dnl
->get_inode();
3459 // NOTE: rename takes the same set of locks for srcdn
3460 for (int i
=0; i
<(int)mdr
->dn
[n
].size(); i
++)
3461 lov
.add_rdlock(&mdr
->dn
[n
][i
]->lock
);
3462 if (alwaysxlock
|| dnl
->is_null())
3463 lov
.add_xlock(&dn
->lock
); // new dn, xlock
3465 lov
.add_rdlock(&dn
->lock
); // existing dn, rdlock
3466 lov
.add_wrlock(&dn
->get_dir()->inode
->filelock
); // also, wrlock on dir mtime
3467 lov
.add_wrlock(&dn
->get_dir()->inode
->nestlock
); // also, wrlock on dir mtime
3469 mds
->locker
->include_snap_rdlocks_wlayout(dn
->get_dir()->inode
, lov
, layout
);
3471 mds
->locker
->include_snap_rdlocks(dn
->get_dir()->inode
, lov
);
3481 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
3483 * @param diri base inode
3484 * @param fg the exact frag we want
3485 * @param mdr request
3486 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
3488 CDir
* Server::try_open_auth_dirfrag(CInode
*diri
, frag_t fg
, MDRequestRef
& mdr
)
3490 CDir
*dir
= diri
->get_dirfrag(fg
);
3492 // not open and inode not mine?
3493 if (!dir
&& !diri
->is_auth()) {
3494 mds_rank_t inauth
= diri
->authority().first
;
3495 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth
<< dendl
;
3496 mdcache
->request_forward(mdr
, inauth
);
3500 // not open and inode frozen?
3501 if (!dir
&& diri
->is_frozen()) {
3502 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri
<< dendl
;
3503 ceph_assert(diri
->get_parent_dir());
3504 diri
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
3510 dir
= diri
->get_or_open_dirfrag(mdcache
, fg
);
3512 // am i auth for the dirfrag?
3513 if (!dir
->is_auth()) {
3514 mds_rank_t auth
= dir
->authority().first
;
3515 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3516 << ", fw to mds." << auth
<< dendl
;
3517 mdcache
->request_forward(mdr
, auth
);
3525 // ===============================================================================
3528 void Server::handle_client_getattr(MDRequestRef
& mdr
, bool is_lookup
)
3530 const MClientRequest::const_ref
&req
= mdr
->client_request
;
3532 if (req
->get_filepath().depth() == 0 && is_lookup
) {
3533 // refpath can't be empty for lookup but it can for
3534 // getattr (we do getattr with empty refpath for mount of '/')
3535 respond_to_request(mdr
, -EINVAL
);
3539 bool want_auth
= false;
3540 int mask
= req
->head
.args
.getattr
.mask
;
3541 if (mask
& CEPH_STAT_RSTAT
)
3542 want_auth
= true; // set want_auth for CEPH_STAT_RSTAT mask
3544 MutationImpl::LockOpVec lov
;
3545 CInode
*ref
= rdlock_path_pin_ref(mdr
, 0, lov
, want_auth
, false, NULL
,
3550 * if client currently holds the EXCL cap on a field, do not rdlock
3551 * it; client's stat() will result in valid info if _either_ EXCL
3552 * cap is held or MDS rdlocks and reads the value here.
3554 * handling this case here is easier than weakening rdlock
3555 * semantics... that would cause problems elsewhere.
3557 client_t client
= mdr
->get_client();
3559 Capability
*cap
= ref
->get_client_cap(client
);
3560 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
||
3561 mdr
->snapid
<= cap
->client_follows
))
3562 issued
= cap
->issued();
3564 if ((mask
& CEPH_CAP_LINK_SHARED
) && !(issued
& CEPH_CAP_LINK_EXCL
))
3565 lov
.add_rdlock(&ref
->linklock
);
3566 if ((mask
& CEPH_CAP_AUTH_SHARED
) && !(issued
& CEPH_CAP_AUTH_EXCL
))
3567 lov
.add_rdlock(&ref
->authlock
);
3568 if ((mask
& CEPH_CAP_XATTR_SHARED
) && !(issued
& CEPH_CAP_XATTR_EXCL
))
3569 lov
.add_rdlock(&ref
->xattrlock
);
3570 if ((mask
& CEPH_CAP_FILE_SHARED
) && !(issued
& CEPH_CAP_FILE_EXCL
)) {
3571 // Don't wait on unstable filelock if client is allowed to read file size.
3572 // This can reduce the response time of getattr in the case that multiple
3573 // clients do stat(2) and there are writers.
3574 // The downside of this optimization is that mds may not issue Fs caps along
3575 // with getattr reply. Client may need to send more getattr requests.
3576 if (mdr
->is_rdlocked(&ref
->filelock
)) {
3577 lov
.add_rdlock(&ref
->filelock
);
3578 } else if (ref
->filelock
.is_stable() ||
3579 ref
->filelock
.get_num_wrlocks() > 0 ||
3580 !ref
->filelock
.can_read(mdr
->get_client())) {
3581 lov
.add_rdlock(&ref
->filelock
);
3582 mdr
->done_locking
= false;
3586 if (!mds
->locker
->acquire_locks(mdr
, lov
))
3589 if (!check_access(mdr
, ref
, MAY_READ
))
3592 utime_t now
= ceph_clock_now();
3593 mdr
->set_mds_stamp(now
);
3595 // note which caps are requested, so we return at least a snapshot
3596 // value for them. (currently this matters for xattrs and inline data)
3597 mdr
->getattr_caps
= mask
;
3599 mds
->balancer
->hit_inode(ref
, META_POP_IRD
, req
->get_source().num());
3602 dout(10) << "reply to stat on " << *req
<< dendl
;
3605 mdr
->tracedn
= mdr
->dn
[0].back();
3606 respond_to_request(mdr
, 0);
3609 struct C_MDS_LookupIno2
: public ServerContext
{
3611 C_MDS_LookupIno2(Server
*s
, MDRequestRef
& r
) : ServerContext(s
), mdr(r
) {}
3612 void finish(int r
) override
{
3613 server
->_lookup_ino_2(mdr
, r
);
3620 void Server::handle_client_lookup_ino(MDRequestRef
& mdr
,
3621 bool want_parent
, bool want_dentry
)
3623 const MClientRequest::const_ref
&req
= mdr
->client_request
;
3625 if ((uint64_t)req
->head
.args
.lookupino
.snapid
> 0)
3626 return _lookup_snap_ino(mdr
);
3628 inodeno_t ino
= req
->get_filepath().get_ino();
3629 CInode
*in
= mdcache
->get_inode(ino
);
3630 if (in
&& in
->state_test(CInode::STATE_PURGING
)) {
3631 respond_to_request(mdr
, -ESTALE
);
3635 mdcache
->open_ino(ino
, (int64_t)-1, new C_MDS_LookupIno2(this, mdr
), false);
3639 if (mdr
&& in
->snaprealm
&& !in
->snaprealm
->have_past_parents_open() &&
3640 !in
->snaprealm
->open_parents(new C_MDS_RetryRequest(mdcache
, mdr
))) {
3644 // check for nothing (not read or write); this still applies the
3646 if (!check_access(mdr
, in
, 0))
3649 CDentry
*dn
= in
->get_projected_parent_dn();
3650 CInode
*diri
= dn
? dn
->get_dir()->inode
: NULL
;
3652 MutationImpl::LockOpVec lov
;
3653 if (dn
&& (want_parent
|| want_dentry
)) {
3655 lov
.add_rdlock(&dn
->lock
);
3658 unsigned mask
= req
->head
.args
.lookupino
.mask
;
3660 Capability
*cap
= in
->get_client_cap(mdr
->get_client());
3662 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
|| mdr
->snapid
<= cap
->client_follows
))
3663 issued
= cap
->issued();
3664 // permission bits, ACL/security xattrs
3665 if ((mask
& CEPH_CAP_AUTH_SHARED
) && (issued
& CEPH_CAP_AUTH_EXCL
) == 0)
3666 lov
.add_rdlock(&in
->authlock
);
3667 if ((mask
& CEPH_CAP_XATTR_SHARED
) && (issued
& CEPH_CAP_XATTR_EXCL
) == 0)
3668 lov
.add_rdlock(&in
->xattrlock
);
3670 mdr
->getattr_caps
= mask
;
3674 if (!mds
->locker
->acquire_locks(mdr
, lov
))
3678 // need read access to directory inode
3679 if (!check_access(mdr
, diri
, MAY_READ
))
3685 if (in
->is_base()) {
3686 respond_to_request(mdr
, -EINVAL
);
3689 if (!diri
|| diri
->is_stray()) {
3690 respond_to_request(mdr
, -ESTALE
);
3693 dout(10) << "reply to lookup_parent " << *in
<< dendl
;
3695 respond_to_request(mdr
, 0);
3698 inodeno_t dirino
= req
->get_filepath2().get_ino();
3699 if (!diri
|| (dirino
!= inodeno_t() && diri
->ino() != dirino
)) {
3700 respond_to_request(mdr
, -ENOENT
);
3703 dout(10) << "reply to lookup_name " << *in
<< dendl
;
3705 dout(10) << "reply to lookup_ino " << *in
<< dendl
;
3710 respond_to_request(mdr
, 0);
3714 void Server::_lookup_snap_ino(MDRequestRef
& mdr
)
3716 const MClientRequest::const_ref
&req
= mdr
->client_request
;
3719 vino
.ino
= req
->get_filepath().get_ino();
3720 vino
.snapid
= (__u64
)req
->head
.args
.lookupino
.snapid
;
3721 inodeno_t parent_ino
= (__u64
)req
->head
.args
.lookupino
.parent
;
3722 __u32 hash
= req
->head
.args
.lookupino
.hash
;
3724 dout(7) << "lookup_snap_ino " << vino
<< " parent " << parent_ino
<< " hash " << hash
<< dendl
;
3726 CInode
*in
= mdcache
->lookup_snap_inode(vino
);
3728 in
= mdcache
->get_inode(vino
.ino
);
3730 if (in
->state_test(CInode::STATE_PURGING
) ||
3731 !in
->has_snap_data(vino
.snapid
)) {
3732 if (in
->is_dir() || !parent_ino
) {
3733 respond_to_request(mdr
, -ESTALE
);
3742 dout(10) << "reply to lookup_snap_ino " << *in
<< dendl
;
3743 mdr
->snapid
= vino
.snapid
;
3745 respond_to_request(mdr
, 0);
3749 CInode
*diri
= NULL
;
3751 diri
= mdcache
->get_inode(parent_ino
);
3753 mdcache
->open_ino(parent_ino
, mds
->mdsmap
->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr
));
3757 if (!diri
->is_dir()) {
3758 respond_to_request(mdr
, -EINVAL
);
3762 MutationImpl::LockOpVec lov
;
3763 lov
.add_rdlock(&diri
->dirfragtreelock
);
3764 if (!mds
->locker
->acquire_locks(mdr
, lov
))
3767 frag_t frag
= diri
->dirfragtree
[hash
];
3768 CDir
*dir
= try_open_auth_dirfrag(diri
, frag
, mdr
);
3772 if (!dir
->is_complete()) {
3773 if (dir
->is_frozen()) {
3774 mds
->locker
->drop_locks(mdr
.get());
3775 mdr
->drop_local_auth_pins();
3776 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
3779 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
), true);
3783 respond_to_request(mdr
, -ESTALE
);
3785 mdcache
->open_ino(vino
.ino
, mds
->mdsmap
->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr
), false);
3789 void Server::_lookup_ino_2(MDRequestRef
& mdr
, int r
)
3791 inodeno_t ino
= mdr
->client_request
->get_filepath().get_ino();
3792 dout(10) << "_lookup_ino_2 " << mdr
.get() << " ino " << ino
<< " r=" << r
<< dendl
;
3794 // `r` is a rank if >=0, else an error code
3796 mds_rank_t
dest_rank(r
);
3797 if (dest_rank
== mds
->get_nodeid())
3798 dispatch_client_request(mdr
);
3800 mdcache
->request_forward(mdr
, dest_rank
);
3805 if (r
== -ENOENT
|| r
== -ENODATA
)
3807 respond_to_request(mdr
, r
);
3811 /* This function takes responsibility for the passed mdr*/
3812 void Server::handle_client_open(MDRequestRef
& mdr
)
3814 const MClientRequest::const_ref
&req
= mdr
->client_request
;
3815 dout(7) << "open on " << req
->get_filepath() << dendl
;
3817 int flags
= req
->head
.args
.open
.flags
;
3818 int cmode
= ceph_flags_to_mode(flags
);
3820 respond_to_request(mdr
, -EINVAL
);
3824 bool need_auth
= !file_mode_is_readonly(cmode
) ||
3825 (flags
& (CEPH_O_TRUNC
| CEPH_O_DIRECTORY
));
3827 if ((cmode
& CEPH_FILE_MODE_WR
) && mdcache
->is_readonly()) {
3828 dout(7) << "read-only FS" << dendl
;
3829 respond_to_request(mdr
, -EROFS
);
3833 MutationImpl::LockOpVec lov
;
3834 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, lov
, need_auth
);
3838 if (cur
->is_frozen() || cur
->state_test(CInode::STATE_EXPORTINGCAPS
)) {
3839 ceph_assert(!need_auth
);
3840 mdr
->done_locking
= false;
3841 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, lov
, true);
3846 if (!cur
->inode
.is_file()) {
3847 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
3848 cmode
= CEPH_FILE_MODE_PIN
;
3849 // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
3850 if (cur
->inode
.is_symlink() && !(flags
& CEPH_O_NOFOLLOW
))
3851 flags
&= ~CEPH_O_TRUNC
;
3854 dout(10) << "open flags = " << flags
3855 << ", filemode = " << cmode
3856 << ", need_auth = " << need_auth
3860 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
3861 dout(7) << "not a file or dir " << *cur << dendl;
3862 respond_to_request(mdr, -ENXIO); // FIXME what error do we want?
3865 if ((flags
& CEPH_O_DIRECTORY
) && !cur
->inode
.is_dir() && !cur
->inode
.is_symlink()) {
3866 dout(7) << "specified O_DIRECTORY on non-directory " << *cur
<< dendl
;
3867 respond_to_request(mdr
, -EINVAL
);
3871 if ((flags
& CEPH_O_TRUNC
) && !cur
->inode
.is_file()) {
3872 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur
<< dendl
;
3873 // we should return -EISDIR for directory, return -EINVAL for other non-regular
3874 respond_to_request(mdr
, cur
->inode
.is_dir() ? -EISDIR
: -EINVAL
);
3878 if (cur
->inode
.inline_data
.version
!= CEPH_INLINE_NONE
&&
3879 !mdr
->session
->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
)) {
3880 dout(7) << "old client cannot open inline data file " << *cur
<< dendl
;
3881 respond_to_request(mdr
, -EPERM
);
3885 // snapped data is read only
3886 if (mdr
->snapid
!= CEPH_NOSNAP
&&
3887 ((cmode
& CEPH_FILE_MODE_WR
) || req
->may_write())) {
3888 dout(7) << "snap " << mdr
->snapid
<< " is read-only " << *cur
<< dendl
;
3889 respond_to_request(mdr
, -EROFS
);
3893 unsigned mask
= req
->head
.args
.open
.mask
;
3895 Capability
*cap
= cur
->get_client_cap(mdr
->get_client());
3897 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
|| mdr
->snapid
<= cap
->client_follows
))
3898 issued
= cap
->issued();
3899 // permission bits, ACL/security xattrs
3900 if ((mask
& CEPH_CAP_AUTH_SHARED
) && (issued
& CEPH_CAP_AUTH_EXCL
) == 0)
3901 lov
.add_rdlock(&cur
->authlock
);
3902 if ((mask
& CEPH_CAP_XATTR_SHARED
) && (issued
& CEPH_CAP_XATTR_EXCL
) == 0)
3903 lov
.add_rdlock(&cur
->xattrlock
);
3905 mdr
->getattr_caps
= mask
;
3909 if ((flags
& CEPH_O_TRUNC
) && !mdr
->has_completed
) {
3910 ceph_assert(cur
->is_auth());
3912 lov
.add_xlock(&cur
->filelock
);
3913 if (!mds
->locker
->acquire_locks(mdr
, lov
))
3916 if (!check_access(mdr
, cur
, MAY_WRITE
))
3919 // wait for pending truncate?
3920 const auto pi
= cur
->get_projected_inode();
3921 if (pi
->is_truncating()) {
3922 dout(10) << " waiting for pending truncate from " << pi
->truncate_from
3923 << " to " << pi
->truncate_size
<< " to complete on " << *cur
<< dendl
;
3924 mds
->locker
->drop_locks(mdr
.get());
3925 mdr
->drop_local_auth_pins();
3926 cur
->add_waiter(CInode::WAIT_TRUNC
, new C_MDS_RetryRequest(mdcache
, mdr
));
3930 do_open_truncate(mdr
, cmode
);
3934 // sync filelock if snapped.
3935 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
3936 // and that data itself is flushed so that we can read the snapped data off disk.
3937 if (mdr
->snapid
!= CEPH_NOSNAP
&& !cur
->is_dir()) {
3938 lov
.add_rdlock(&cur
->filelock
);
3941 if (!mds
->locker
->acquire_locks(mdr
, lov
))
3945 if (cmode
& CEPH_FILE_MODE_WR
)
3947 if (!check_access(mdr
, cur
, mask
))
3950 utime_t now
= ceph_clock_now();
3951 mdr
->set_mds_stamp(now
);
3953 if (cur
->is_file() || cur
->is_dir()) {
3954 if (mdr
->snapid
== CEPH_NOSNAP
) {
3956 Capability
*cap
= mds
->locker
->issue_new_caps(cur
, cmode
, mdr
->session
, 0, req
->is_replay());
3958 dout(12) << "open issued caps " << ccap_string(cap
->pending())
3959 << " for " << req
->get_source()
3960 << " on " << *cur
<< dendl
;
3962 int caps
= ceph_caps_for_mode(cmode
);
3963 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps
)
3964 << " for " << req
->get_source()
3965 << " snapid " << mdr
->snapid
3966 << " on " << *cur
<< dendl
;
3967 mdr
->snap_caps
= caps
;
3971 // increase max_size?
3972 if (cmode
& CEPH_FILE_MODE_WR
)
3973 mds
->locker
->check_inode_max_size(cur
);
3975 // make sure this inode gets into the journal
3976 if (cur
->is_auth() && cur
->last
== CEPH_NOSNAP
&&
3977 mdcache
->open_file_table
.should_log_open(cur
)) {
3978 EOpen
*le
= new EOpen(mds
->mdlog
);
3979 mdlog
->start_entry(le
);
3980 le
->add_clean_inode(cur
);
3981 mdlog
->submit_entry(le
);
3985 if (cmode
& CEPH_FILE_MODE_WR
)
3986 mds
->balancer
->hit_inode(cur
, META_POP_IWR
);
3988 mds
->balancer
->hit_inode(cur
, META_POP_IRD
,
3989 mdr
->client_request
->get_source().num());
3992 if (req
->get_dentry_wanted()) {
3993 ceph_assert(mdr
->dn
[0].size());
3994 dn
= mdr
->dn
[0].back();
3999 respond_to_request(mdr
, 0);
4002 class C_MDS_openc_finish
: public ServerLogContext
{
4006 C_MDS_openc_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ni
) :
4007 ServerLogContext(s
, r
), dn(d
), newi(ni
) {}
4008 void finish(int r
) override
{
4009 ceph_assert(r
== 0);
4011 dn
->pop_projected_linkage();
4013 // dirty inode, dn, dir
4014 newi
->inode
.version
--; // a bit hacky, see C_MDS_mknod_finish
4015 newi
->mark_dirty(newi
->inode
.version
+1, mdr
->ls
);
4016 newi
->mark_dirty_parent(mdr
->ls
, true);
4020 get_mds()->locker
->share_inode_max_size(newi
);
4022 MDRequestRef null_ref
;
4023 get_mds()->mdcache
->send_dentry_link(dn
, null_ref
);
4025 get_mds()->balancer
->hit_inode(newi
, META_POP_IWR
);
4027 server
->respond_to_request(mdr
, 0);
4029 ceph_assert(g_conf()->mds_kill_openc_at
!= 1);
4033 /* This function takes responsibility for the passed mdr*/
4034 void Server::handle_client_openc(MDRequestRef
& mdr
)
4036 const MClientRequest::const_ref
&req
= mdr
->client_request
;
4037 client_t client
= mdr
->get_client();
4039 dout(7) << "open w/ O_CREAT on " << req
->get_filepath() << dendl
;
4041 int cmode
= ceph_flags_to_mode(req
->head
.args
.open
.flags
);
4043 respond_to_request(mdr
, -EINVAL
);
4047 bool excl
= req
->head
.args
.open
.flags
& CEPH_O_EXCL
;
4050 CF_MDS_MDRContextFactory
cf(mdcache
, mdr
);
4051 int r
= mdcache
->path_traverse(mdr
, cf
, req
->get_filepath(),
4052 &mdr
->dn
[0], NULL
, MDS_TRAVERSE_FORWARD
);
4056 handle_client_open(mdr
);
4059 if (r
< 0 && r
!= -ENOENT
) {
4061 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
4062 MDSContext
*c
= new C_MDS_TryFindInode(this, mdr
);
4063 mdcache
->find_ino_peers(req
->get_filepath().get_ino(), c
);
4065 dout(10) << "FAIL on error " << r
<< dendl
;
4066 respond_to_request(mdr
, r
);
4072 MutationImpl::LockOpVec lov
;
4073 file_layout_t
*dir_layout
= nullptr;
4074 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, 0, lov
,
4075 !excl
, false, false, &dir_layout
);
4077 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4078 respond_to_request(mdr
, -EROFS
);
4082 file_layout_t layout
;
4084 layout
= *dir_layout
;
4086 layout
= mdcache
->default_file_layout
;
4088 // What kind of client caps are required to complete this operation
4089 uint64_t access
= MAY_WRITE
;
4091 const auto default_layout
= layout
;
4093 // fill in any special params from client
4094 if (req
->head
.args
.open
.stripe_unit
)
4095 layout
.stripe_unit
= req
->head
.args
.open
.stripe_unit
;
4096 if (req
->head
.args
.open
.stripe_count
)
4097 layout
.stripe_count
= req
->head
.args
.open
.stripe_count
;
4098 if (req
->head
.args
.open
.object_size
)
4099 layout
.object_size
= req
->head
.args
.open
.object_size
;
4100 if (req
->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID
) &&
4101 (__s32
)req
->head
.args
.open
.pool
>= 0) {
4102 layout
.pool_id
= req
->head
.args
.open
.pool
;
4104 // make sure we have as new a map as the client
4105 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
4106 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
4111 // If client doesn't have capability to modify layout pools, then
4112 // only permit this request if the requested pool matches what the
4113 // file would have inherited anyway from its parent.
4114 if (default_layout
!= layout
) {
4115 access
|= MAY_SET_VXATTR
;
4118 if (!layout
.is_valid()) {
4119 dout(10) << " invalid initial file layout" << dendl
;
4120 respond_to_request(mdr
, -EINVAL
);
4123 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
4124 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
4125 respond_to_request(mdr
, -EINVAL
);
4130 CDir
*dir
= dn
->get_dir();
4131 CInode
*diri
= dir
->get_inode();
4132 lov
.add_rdlock(&diri
->authlock
);
4133 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4136 if (!check_access(mdr
, diri
, access
))
4139 if (!check_fragment_space(mdr
, dir
))
4142 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
4144 if (!dnl
->is_null()) {
4146 ceph_assert(req
->head
.args
.open
.flags
& CEPH_O_EXCL
);
4147 dout(10) << "O_EXCL, target exists, failing with -EEXIST" << dendl
;
4148 mdr
->tracei
= dnl
->get_inode();
4150 respond_to_request(mdr
, -EEXIST
);
4155 CInode
*in
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
),
4156 req
->head
.args
.open
.mode
| S_IFREG
, &layout
);
4160 dn
->push_projected_linkage(in
);
4162 in
->inode
.version
= dn
->pre_dirty();
4163 if (layout
.pool_id
!= mdcache
->default_file_layout
.pool_id
)
4164 in
->inode
.add_old_pool(mdcache
->default_file_layout
.pool_id
);
4165 in
->inode
.update_backtrace();
4166 in
->inode
.rstat
.rfiles
= 1;
4168 SnapRealm
*realm
= diri
->find_snaprealm();
4169 snapid_t follows
= mdcache
->get_global_snaprealm()->get_newest_seq();
4170 ceph_assert(follows
>= realm
->get_newest_seq());
4172 ceph_assert(dn
->first
== follows
+1);
4173 in
->first
= dn
->first
;
4176 Capability
*cap
= mds
->locker
->issue_new_caps(in
, cmode
, mdr
->session
, realm
, req
->is_replay());
4177 in
->authlock
.set_state(LOCK_EXCL
);
4178 in
->xattrlock
.set_state(LOCK_EXCL
);
4180 if (cap
&& (cmode
& CEPH_FILE_MODE_WR
)) {
4181 in
->inode
.client_ranges
[client
].range
.first
= 0;
4182 in
->inode
.client_ranges
[client
].range
.last
= in
->inode
.get_layout_size_increment();
4183 in
->inode
.client_ranges
[client
].follows
= follows
;
4184 cap
->mark_clientwriteable();
4188 mdr
->ls
= mdlog
->get_current_segment();
4189 EUpdate
*le
= new EUpdate(mdlog
, "openc");
4190 mdlog
->start_entry(le
);
4191 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4192 journal_allocated_inos(mdr
, &le
->metablob
);
4193 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
4194 le
->metablob
.add_primary_dentry(dn
, in
, true, true, true);
4196 // make sure this inode gets into the journal
4197 le
->metablob
.add_opened_ino(in
->ino());
4199 C_MDS_openc_finish
*fin
= new C_MDS_openc_finish(this, mdr
, dn
, in
);
4201 if (mdr
->client_request
->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE
)) {
4202 dout(10) << "adding ino to reply to indicate inode was created" << dendl
;
4203 // add the file created flag onto the reply if create_flags features is supported
4204 encode(in
->inode
.ino
, mdr
->reply_extra_bl
);
4207 journal_and_reply(mdr
, in
, dn
, le
, fin
);
4209 // We hit_dir (via hit_inode) in our finish callback, but by then we might
4210 // have overshot the split size (multiple opencs in flight), so here is
4211 // an early chance to split the dir if this openc makes it oversized.
4212 mds
->balancer
->maybe_fragment(dir
, false);
4217 void Server::handle_client_readdir(MDRequestRef
& mdr
)
4219 const MClientRequest::const_ref
&req
= mdr
->client_request
;
4220 client_t client
= req
->get_source().num();
4221 MutationImpl::LockOpVec lov
;
4222 CInode
*diri
= rdlock_path_pin_ref(mdr
, 0, lov
, false, true);
4225 // it's a directory, right?
4226 if (!diri
->is_dir()) {
4228 dout(10) << "reply to " << *req
<< " readdir -ENOTDIR" << dendl
;
4229 respond_to_request(mdr
, -ENOTDIR
);
4233 lov
.add_rdlock(&diri
->filelock
);
4234 lov
.add_rdlock(&diri
->dirfragtreelock
);
4236 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4239 if (!check_access(mdr
, diri
, MAY_READ
))
4243 frag_t fg
= (__u32
)req
->head
.args
.readdir
.frag
;
4244 unsigned req_flags
= (__u32
)req
->head
.args
.readdir
.flags
;
4245 string offset_str
= req
->get_path2();
4247 __u32 offset_hash
= 0;
4248 if (!offset_str
.empty())
4249 offset_hash
= ceph_frag_value(diri
->hash_dentry_name(offset_str
));
4251 offset_hash
= (__u32
)req
->head
.args
.readdir
.offset_hash
;
4253 dout(10) << " frag " << fg
<< " offset '" << offset_str
<< "'"
4254 << " offset_hash " << offset_hash
<< " flags " << req_flags
<< dendl
;
4256 // does the frag exist?
4257 if (diri
->dirfragtree
[fg
.value()] != fg
) {
4259 if (req_flags
& CEPH_READDIR_REPLY_BITFLAGS
) {
4260 if (fg
.contains((unsigned)offset_hash
)) {
4261 newfg
= diri
->dirfragtree
[offset_hash
];
4263 // client actually wants next frag
4264 newfg
= diri
->dirfragtree
[fg
.value()];
4268 newfg
= diri
->dirfragtree
[fg
.value()];
4270 dout(10) << " adjust frag " << fg
<< " -> " << newfg
<< " " << diri
->dirfragtree
<< dendl
;
4274 CDir
*dir
= try_open_auth_dirfrag(diri
, fg
, mdr
);
4278 dout(10) << "handle_client_readdir on " << *dir
<< dendl
;
4279 ceph_assert(dir
->is_auth());
4281 if (!dir
->is_complete()) {
4282 if (dir
->is_frozen()) {
4283 dout(7) << "dir is frozen " << *dir
<< dendl
;
4284 mds
->locker
->drop_locks(mdr
.get());
4285 mdr
->drop_local_auth_pins();
4286 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
4290 dout(10) << " incomplete dir contents for readdir on " << *dir
<< ", fetching" << dendl
;
4291 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
), true);
4295 #ifdef MDS_VERIFY_FRAGSTAT
4296 dir
->verify_fragstat();
4299 utime_t now
= ceph_clock_now();
4300 mdr
->set_mds_stamp(now
);
4302 snapid_t snapid
= mdr
->snapid
;
4303 dout(10) << "snapid " << snapid
<< dendl
;
4305 SnapRealm
*realm
= diri
->find_snaprealm();
4307 unsigned max
= req
->head
.args
.readdir
.max_entries
;
4309 max
= dir
->get_num_any(); // whatever, something big.
4310 unsigned max_bytes
= req
->head
.args
.readdir
.max_bytes
;
4312 // make sure at least one item can be encoded
4313 max_bytes
= (512 << 10) + g_conf()->mds_max_xattr_pairs_size
;
4318 ds
.frag
= dir
->get_frag();
4319 ds
.auth
= dir
->get_dir_auth().first
;
4321 dir
->get_dist_spec(ds
.dist
, mds
->get_nodeid());
4323 dir
->encode_dirstat(dirbl
, mdr
->session
->info
, ds
);
4325 // count bytes available.
4326 // this isn't perfect, but we should capture the main variable/unbounded size items!
4327 int front_bytes
= dirbl
.length() + sizeof(__u32
) + sizeof(__u8
)*2;
4328 int bytes_left
= max_bytes
- front_bytes
;
4329 bytes_left
-= realm
->get_snap_trace().length();
4331 // build dir contents
4334 bool start
= !offset_hash
&& offset_str
.empty();
4335 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
4336 dentry_key_t
skip_key(snapid
, offset_str
.c_str(), offset_hash
);
4337 auto it
= start
? dir
->begin() : dir
->lower_bound(skip_key
);
4338 bool end
= (it
== dir
->end());
4339 for (; !end
&& numfiles
< max
; end
= (it
== dir
->end())) {
4340 CDentry
*dn
= it
->second
;
4343 if (dn
->state_test(CDentry::STATE_PURGING
))
4346 bool dnp
= dn
->use_projected(client
, mdr
);
4347 CDentry::linkage_t
*dnl
= dnp
? dn
->get_projected_linkage() : dn
->get_linkage();
4352 if (dn
->last
< snapid
|| dn
->first
> snapid
) {
4353 dout(20) << "skipping non-overlapping snap " << *dn
<< dendl
;
4358 dentry_key_t
offset_key(dn
->last
, offset_str
.c_str(), offset_hash
);
4359 if (!(offset_key
< dn
->key()))
4363 CInode
*in
= dnl
->get_inode();
4365 if (in
&& in
->ino() == CEPH_INO_CEPH
)
4369 // better for the MDS to do the work, if we think the client will stat any of these files.
4370 if (dnl
->is_remote() && !in
) {
4371 in
= mdcache
->get_inode(dnl
->get_remote_ino());
4373 dn
->link_remote(dnl
, in
);
4374 } else if (dn
->state_test(CDentry::STATE_BADREMOTEINO
)) {
4375 dout(10) << "skipping bad remote ino on " << *dn
<< dendl
;
4378 // touch everything i _do_ have
4379 for (auto &p
: *dir
) {
4380 if (!p
.second
->get_linkage()->is_null())
4381 mdcache
->lru
.lru_touch(p
.second
);
4384 // already issued caps and leases, reply immediately.
4385 if (dnbl
.length() > 0) {
4386 mdcache
->open_remote_dentry(dn
, dnp
, new C_MDSInternalNoop
);
4387 dout(10) << " open remote dentry after caps were issued, stopping at "
4388 << dnbl
.length() << " < " << bytes_left
<< dendl
;
4392 mds
->locker
->drop_locks(mdr
.get());
4393 mdr
->drop_local_auth_pins();
4394 mdcache
->open_remote_dentry(dn
, dnp
, new C_MDS_RetryRequest(mdcache
, mdr
));
4400 if ((int)(dnbl
.length() + dn
->get_name().length() + sizeof(__u32
) + sizeof(LeaseStat
)) > bytes_left
) {
4401 dout(10) << " ran out of room, stopping at " << dnbl
.length() << " < " << bytes_left
<< dendl
;
4405 unsigned start_len
= dnbl
.length();
4408 dout(12) << "including dn " << *dn
<< dendl
;
4409 encode(dn
->get_name(), dnbl
);
4410 mds
->locker
->issue_client_lease(dn
, client
, dnbl
, now
, mdr
->session
);
4413 dout(12) << "including inode " << *in
<< dendl
;
4414 int r
= in
->encode_inodestat(dnbl
, mdr
->session
, realm
, snapid
, bytes_left
- (int)dnbl
.length());
4416 // chop off dn->name, lease
4417 dout(10) << " ran out of room, stopping at " << start_len
<< " < " << bytes_left
<< dendl
;
4419 keep
.substr_of(dnbl
, 0, start_len
);
4423 ceph_assert(r
>= 0);
4427 mdcache
->lru
.lru_touch(dn
);
4432 flags
= CEPH_READDIR_FRAG_END
;
4434 flags
|= CEPH_READDIR_FRAG_COMPLETE
; // FIXME: what purpose does this serve
4436 // client only understand END and COMPLETE flags ?
4437 if (req_flags
& CEPH_READDIR_REPLY_BITFLAGS
) {
4438 flags
|= CEPH_READDIR_HASH_ORDER
| CEPH_READDIR_OFFSET_HASH
;
4441 // finish final blob
4442 encode(numfiles
, dirbl
);
4443 encode(flags
, dirbl
);
4444 dirbl
.claim_append(dnbl
);
4447 dout(10) << "reply to " << *req
<< " readdir num=" << numfiles
4448 << " bytes=" << dirbl
.length()
4449 << " start=" << (int)start
4450 << " end=" << (int)end
4452 mdr
->reply_extra_bl
= dirbl
;
4454 // bump popularity. NOTE: this doesn't quite capture it.
4455 mds
->balancer
->hit_dir(dir
, META_POP_IRD
, -1, numfiles
);
4459 respond_to_request(mdr
, 0);
4464 // ===============================================================================
4469 * finisher for basic inode updates
4471 class C_MDS_inode_update_finish
: public ServerLogContext
{
4473 bool truncating_smaller
, changed_ranges
, new_realm
;
4475 C_MDS_inode_update_finish(Server
*s
, MDRequestRef
& r
, CInode
*i
,
4476 bool sm
=false, bool cr
=false, bool nr
=false) :
4477 ServerLogContext(s
, r
), in(i
),
4478 truncating_smaller(sm
), changed_ranges(cr
), new_realm(nr
) { }
4479 void finish(int r
) override
{
4480 ceph_assert(r
== 0);
4483 in
->pop_and_dirty_projected_inode(mdr
->ls
);
4486 MDSRank
*mds
= get_mds();
4488 // notify any clients
4489 if (truncating_smaller
&& in
->inode
.is_truncating()) {
4490 mds
->locker
->issue_truncate(in
);
4491 mds
->mdcache
->truncate_inode(in
, mdr
->ls
);
4495 int op
= CEPH_SNAP_OP_SPLIT
;
4496 mds
->mdcache
->send_snap_update(in
, 0, op
);
4497 mds
->mdcache
->do_realm_invalidate_and_update_notify(in
, op
);
4500 get_mds()->balancer
->hit_inode(in
, META_POP_IWR
);
4502 server
->respond_to_request(mdr
, 0);
4505 get_mds()->locker
->share_inode_max_size(in
);
4509 void Server::handle_client_file_setlock(MDRequestRef
& mdr
)
4511 const MClientRequest::const_ref
&req
= mdr
->client_request
;
4512 MutationImpl::LockOpVec lov
;
4514 // get the inode to operate on, and set up any locks needed for that
4515 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, lov
, true);
4519 lov
.add_xlock(&cur
->flocklock
);
4520 /* acquire_locks will return true if it gets the locks. If it fails,
4521 it will redeliver this request at a later date, so drop the request.
4523 if (!mds
->locker
->acquire_locks(mdr
, lov
)) {
4524 dout(10) << "handle_client_file_setlock could not get locks!" << dendl
;
4528 // copy the lock change into a ceph_filelock so we can store/apply it
4529 ceph_filelock set_lock
;
4530 set_lock
.start
= req
->head
.args
.filelock_change
.start
;
4531 set_lock
.length
= req
->head
.args
.filelock_change
.length
;
4532 set_lock
.client
= req
->get_orig_source().num();
4533 set_lock
.owner
= req
->head
.args
.filelock_change
.owner
;
4534 set_lock
.pid
= req
->head
.args
.filelock_change
.pid
;
4535 set_lock
.type
= req
->head
.args
.filelock_change
.type
;
4536 bool will_wait
= req
->head
.args
.filelock_change
.wait
;
4538 dout(10) << "handle_client_file_setlock: " << set_lock
<< dendl
;
4540 ceph_lock_state_t
*lock_state
= NULL
;
4541 bool interrupt
= false;
4543 // get the appropriate lock state
4544 switch (req
->head
.args
.filelock_change
.rule
) {
4545 case CEPH_LOCK_FLOCK_INTR
:
4548 case CEPH_LOCK_FLOCK
:
4549 lock_state
= cur
->get_flock_lock_state();
4552 case CEPH_LOCK_FCNTL_INTR
:
4555 case CEPH_LOCK_FCNTL
:
4556 lock_state
= cur
->get_fcntl_lock_state();
4560 dout(10) << "got unknown lock type " << set_lock
.type
4561 << ", dropping request!" << dendl
;
4562 respond_to_request(mdr
, -EOPNOTSUPP
);
4566 dout(10) << " state prior to lock change: " << *lock_state
<< dendl
;
4567 if (CEPH_LOCK_UNLOCK
== set_lock
.type
) {
4568 list
<ceph_filelock
> activated_locks
;
4569 MDSContext::vec waiters
;
4570 if (lock_state
->is_waiting(set_lock
)) {
4571 dout(10) << " unlock removing waiting lock " << set_lock
<< dendl
;
4572 lock_state
->remove_waiting(set_lock
);
4573 cur
->take_waiting(CInode::WAIT_FLOCK
, waiters
);
4574 } else if (!interrupt
) {
4575 dout(10) << " unlock attempt on " << set_lock
<< dendl
;
4576 lock_state
->remove_lock(set_lock
, activated_locks
);
4577 cur
->take_waiting(CInode::WAIT_FLOCK
, waiters
);
4579 mds
->queue_waiters(waiters
);
4581 respond_to_request(mdr
, 0);
4583 dout(10) << " lock attempt on " << set_lock
<< dendl
;
4584 bool deadlock
= false;
4585 if (mdr
->more()->flock_was_waiting
&&
4586 !lock_state
->is_waiting(set_lock
)) {
4587 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock
<< dendl
;
4588 respond_to_request(mdr
, -EINTR
);
4589 } else if (!lock_state
->add_lock(set_lock
, will_wait
, mdr
->more()->flock_was_waiting
, &deadlock
)) {
4590 dout(10) << " it failed on this attempt" << dendl
;
4591 // couldn't set lock right now
4593 respond_to_request(mdr
, -EDEADLK
);
4594 } else if (!will_wait
) {
4595 respond_to_request(mdr
, -EWOULDBLOCK
);
4597 dout(10) << " added to waiting list" << dendl
;
4598 ceph_assert(lock_state
->is_waiting(set_lock
));
4599 mdr
->more()->flock_was_waiting
= true;
4600 mds
->locker
->drop_locks(mdr
.get());
4601 mdr
->drop_local_auth_pins();
4602 mdr
->mark_event("failed to add lock, waiting");
4604 cur
->add_waiter(CInode::WAIT_FLOCK
, new C_MDS_RetryRequest(mdcache
, mdr
));
4607 respond_to_request(mdr
, 0);
4609 dout(10) << " state after lock change: " << *lock_state
<< dendl
;
4612 void Server::handle_client_file_readlock(MDRequestRef
& mdr
)
4614 const MClientRequest::const_ref
&req
= mdr
->client_request
;
4615 MutationImpl::LockOpVec lov
;
4617 // get the inode to operate on, and set up any locks needed for that
4618 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, lov
, true);
4622 /* acquire_locks will return true if it gets the locks. If it fails,
4623 it will redeliver this request at a later date, so drop the request.
4625 lov
.add_rdlock(&cur
->flocklock
);
4626 if (!mds
->locker
->acquire_locks(mdr
, lov
)) {
4627 dout(10) << "handle_client_file_readlock could not get locks!" << dendl
;
4631 // copy the lock change into a ceph_filelock so we can store/apply it
4632 ceph_filelock checking_lock
;
4633 checking_lock
.start
= req
->head
.args
.filelock_change
.start
;
4634 checking_lock
.length
= req
->head
.args
.filelock_change
.length
;
4635 checking_lock
.client
= req
->get_orig_source().num();
4636 checking_lock
.owner
= req
->head
.args
.filelock_change
.owner
;
4637 checking_lock
.pid
= req
->head
.args
.filelock_change
.pid
;
4638 checking_lock
.type
= req
->head
.args
.filelock_change
.type
;
4640 // get the appropriate lock state
4641 ceph_lock_state_t
*lock_state
= NULL
;
4642 switch (req
->head
.args
.filelock_change
.rule
) {
4643 case CEPH_LOCK_FLOCK
:
4644 lock_state
= cur
->get_flock_lock_state();
4647 case CEPH_LOCK_FCNTL
:
4648 lock_state
= cur
->get_fcntl_lock_state();
4652 dout(10) << "got unknown lock type " << checking_lock
.type
<< dendl
;
4653 respond_to_request(mdr
, -EINVAL
);
4656 lock_state
->look_for_lock(checking_lock
);
4659 encode(checking_lock
, lock_bl
);
4661 mdr
->reply_extra_bl
= lock_bl
;
4662 respond_to_request(mdr
, 0);
4665 void Server::handle_client_setattr(MDRequestRef
& mdr
)
4667 const MClientRequest::const_ref
&req
= mdr
->client_request
;
4668 MutationImpl::LockOpVec lov
;
4669 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, lov
, true);
4672 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4673 respond_to_request(mdr
, -EROFS
);
4676 if (cur
->ino() < MDS_INO_SYSTEM_BASE
&& !cur
->is_base()) {
4677 respond_to_request(mdr
, -EPERM
);
4681 __u32 mask
= req
->head
.args
.setattr
.mask
;
4682 __u32 access_mask
= MAY_WRITE
;
4685 if (mask
& (CEPH_SETATTR_MODE
|CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_BTIME
|CEPH_SETATTR_KILL_SGUID
))
4686 lov
.add_xlock(&cur
->authlock
);
4687 if (mask
& (CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
|CEPH_SETATTR_SIZE
))
4688 lov
.add_xlock(&cur
->filelock
);
4689 if (mask
& CEPH_SETATTR_CTIME
)
4690 lov
.add_wrlock(&cur
->versionlock
);
4692 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4695 if ((mask
& CEPH_SETATTR_UID
) && (cur
->inode
.uid
!= req
->head
.args
.setattr
.uid
))
4696 access_mask
|= MAY_CHOWN
;
4698 if ((mask
& CEPH_SETATTR_GID
) && (cur
->inode
.gid
!= req
->head
.args
.setattr
.gid
))
4699 access_mask
|= MAY_CHGRP
;
4701 if (!check_access(mdr
, cur
, access_mask
))
4704 // trunc from bigger -> smaller?
4705 auto pip
= cur
->get_projected_inode();
4707 uint64_t old_size
= std::max
<uint64_t>(pip
->size
, req
->head
.args
.setattr
.old_size
);
4709 // ENOSPC on growing file while full, but allow shrinks
4710 if (is_full
&& req
->head
.args
.setattr
.size
> old_size
) {
4711 dout(20) << __func__
<< ": full, responding ENOSPC to setattr with larger size" << dendl
;
4712 respond_to_request(mdr
, -ENOSPC
);
4716 bool truncating_smaller
= false;
4717 if (mask
& CEPH_SETATTR_SIZE
) {
4718 truncating_smaller
= req
->head
.args
.setattr
.size
< old_size
;
4719 if (truncating_smaller
&& pip
->is_truncating()) {
4720 dout(10) << " waiting for pending truncate from " << pip
->truncate_from
4721 << " to " << pip
->truncate_size
<< " to complete on " << *cur
<< dendl
;
4722 mds
->locker
->drop_locks(mdr
.get());
4723 mdr
->drop_local_auth_pins();
4724 cur
->add_waiter(CInode::WAIT_TRUNC
, new C_MDS_RetryRequest(mdcache
, mdr
));
4729 bool changed_ranges
= false;
4732 mdr
->ls
= mdlog
->get_current_segment();
4733 EUpdate
*le
= new EUpdate(mdlog
, "setattr");
4734 mdlog
->start_entry(le
);
4736 auto &pi
= cur
->project_inode();
4738 if (mask
& CEPH_SETATTR_UID
)
4739 pi
.inode
.uid
= req
->head
.args
.setattr
.uid
;
4740 if (mask
& CEPH_SETATTR_GID
)
4741 pi
.inode
.gid
= req
->head
.args
.setattr
.gid
;
4743 if (mask
& CEPH_SETATTR_MODE
)
4744 pi
.inode
.mode
= (pi
.inode
.mode
& ~07777) | (req
->head
.args
.setattr
.mode
& 07777);
4745 else if ((mask
& (CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_KILL_SGUID
)) &&
4746 S_ISREG(pi
.inode
.mode
) &&
4747 (pi
.inode
.mode
& (S_IXUSR
|S_IXGRP
|S_IXOTH
))) {
4748 pi
.inode
.mode
&= ~(S_ISUID
|S_ISGID
);
4751 if (mask
& CEPH_SETATTR_MTIME
)
4752 pi
.inode
.mtime
= req
->head
.args
.setattr
.mtime
;
4753 if (mask
& CEPH_SETATTR_ATIME
)
4754 pi
.inode
.atime
= req
->head
.args
.setattr
.atime
;
4755 if (mask
& CEPH_SETATTR_BTIME
)
4756 pi
.inode
.btime
= req
->head
.args
.setattr
.btime
;
4757 if (mask
& (CEPH_SETATTR_ATIME
| CEPH_SETATTR_MTIME
| CEPH_SETATTR_BTIME
))
4758 pi
.inode
.time_warp_seq
++; // maybe not a timewarp, but still a serialization point.
4759 if (mask
& CEPH_SETATTR_SIZE
) {
4760 if (truncating_smaller
) {
4761 pi
.inode
.truncate(old_size
, req
->head
.args
.setattr
.size
);
4762 le
->metablob
.add_truncate_start(cur
->ino());
4764 pi
.inode
.size
= req
->head
.args
.setattr
.size
;
4765 pi
.inode
.rstat
.rbytes
= pi
.inode
.size
;
4767 pi
.inode
.mtime
= mdr
->get_op_stamp();
4769 // adjust client's max_size?
4770 CInode::mempool_inode::client_range_map new_ranges
;
4771 bool max_increased
= false;
4772 mds
->locker
->calc_new_client_ranges(cur
, pi
.inode
.size
, true, &new_ranges
, &max_increased
);
4773 if (pi
.inode
.client_ranges
!= new_ranges
) {
4774 dout(10) << " client_ranges " << pi
.inode
.client_ranges
<< " -> " << new_ranges
<< dendl
;
4775 pi
.inode
.client_ranges
= new_ranges
;
4776 changed_ranges
= true;
4780 pi
.inode
.version
= cur
->pre_dirty();
4781 pi
.inode
.ctime
= mdr
->get_op_stamp();
4782 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
4783 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
4784 pi
.inode
.change_attr
++;
4787 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4788 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4789 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4791 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
,
4792 truncating_smaller
, changed_ranges
));
4794 // flush immediately if there are readers/writers waiting
4795 if (mdr
->is_xlocked(&cur
->filelock
) &&
4796 (cur
->get_caps_wanted() & (CEPH_CAP_FILE_RD
|CEPH_CAP_FILE_WR
)))
4797 mds
->mdlog
->flush();
4800 /* Takes responsibility for mdr */
4801 void Server::do_open_truncate(MDRequestRef
& mdr
, int cmode
)
4803 CInode
*in
= mdr
->in
[0];
4804 client_t client
= mdr
->get_client();
4807 dout(10) << "do_open_truncate " << *in
<< dendl
;
4809 SnapRealm
*realm
= in
->find_snaprealm();
4810 Capability
*cap
= mds
->locker
->issue_new_caps(in
, cmode
, mdr
->session
, realm
, mdr
->client_request
->is_replay());
4812 mdr
->ls
= mdlog
->get_current_segment();
4813 EUpdate
*le
= new EUpdate(mdlog
, "open_truncate");
4814 mdlog
->start_entry(le
);
4817 auto &pi
= in
->project_inode();
4818 pi
.inode
.version
= in
->pre_dirty();
4819 pi
.inode
.mtime
= pi
.inode
.ctime
= mdr
->get_op_stamp();
4820 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
4821 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
4822 pi
.inode
.change_attr
++;
4824 uint64_t old_size
= std::max
<uint64_t>(pi
.inode
.size
, mdr
->client_request
->head
.args
.open
.old_size
);
4826 pi
.inode
.truncate(old_size
, 0);
4827 le
->metablob
.add_truncate_start(in
->ino());
4830 bool changed_ranges
= false;
4831 if (cap
&& (cmode
& CEPH_FILE_MODE_WR
)) {
4832 pi
.inode
.client_ranges
[client
].range
.first
= 0;
4833 pi
.inode
.client_ranges
[client
].range
.last
= pi
.inode
.get_layout_size_increment();
4834 pi
.inode
.client_ranges
[client
].follows
= realm
->get_newest_seq();
4835 changed_ranges
= true;
4836 cap
->mark_clientwriteable();
4839 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
4841 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
4842 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, in
);
4844 // make sure ino gets into the journal
4845 le
->metablob
.add_opened_ino(in
->ino());
4847 mdr
->o_trunc
= true;
4850 if (mdr
->client_request
->get_dentry_wanted()) {
4851 ceph_assert(mdr
->dn
[0].size());
4852 dn
= mdr
->dn
[0].back();
4855 journal_and_reply(mdr
, in
, dn
, le
, new C_MDS_inode_update_finish(this, mdr
, in
, old_size
> 0,
4857 // Although the `open` part can give an early reply, the truncation won't
4858 // happen until our EUpdate is persistent, to give the client a prompt
4859 // response we must also flush that event.
4864 /* This function cleans up the passed mdr */
4865 void Server::handle_client_setlayout(MDRequestRef
& mdr
)
4867 const MClientRequest::const_ref
&req
= mdr
->client_request
;
4868 MutationImpl::LockOpVec lov
;
4869 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, lov
, true);
4872 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4873 respond_to_request(mdr
, -EROFS
);
4876 if (!cur
->is_file()) {
4877 respond_to_request(mdr
, -EINVAL
);
4880 if (cur
->get_projected_inode()->size
||
4881 cur
->get_projected_inode()->truncate_seq
> 1) {
4882 respond_to_request(mdr
, -ENOTEMPTY
);
4887 file_layout_t layout
= cur
->get_projected_inode()->layout
;
4888 // save existing layout for later
4889 const auto old_layout
= layout
;
4891 int access
= MAY_WRITE
;
4893 if (req
->head
.args
.setlayout
.layout
.fl_object_size
> 0)
4894 layout
.object_size
= req
->head
.args
.setlayout
.layout
.fl_object_size
;
4895 if (req
->head
.args
.setlayout
.layout
.fl_stripe_unit
> 0)
4896 layout
.stripe_unit
= req
->head
.args
.setlayout
.layout
.fl_stripe_unit
;
4897 if (req
->head
.args
.setlayout
.layout
.fl_stripe_count
> 0)
4898 layout
.stripe_count
=req
->head
.args
.setlayout
.layout
.fl_stripe_count
;
4899 if (req
->head
.args
.setlayout
.layout
.fl_pg_pool
> 0) {
4900 layout
.pool_id
= req
->head
.args
.setlayout
.layout
.fl_pg_pool
;
4902 // make sure we have as new a map as the client
4903 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
4904 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
4909 // Don't permit layout modifications without 'p' caps
4910 if (layout
!= old_layout
) {
4911 access
|= MAY_SET_VXATTR
;
4914 if (!layout
.is_valid()) {
4915 dout(10) << "bad layout" << dendl
;
4916 respond_to_request(mdr
, -EINVAL
);
4919 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
4920 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
4921 respond_to_request(mdr
, -EINVAL
);
4925 lov
.add_xlock(&cur
->filelock
);
4926 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4929 if (!check_access(mdr
, cur
, access
))
4933 auto &pi
= cur
->project_inode();
4934 pi
.inode
.layout
= layout
;
4935 // add the old pool to the inode
4936 pi
.inode
.add_old_pool(old_layout
.pool_id
);
4937 pi
.inode
.version
= cur
->pre_dirty();
4938 pi
.inode
.ctime
= mdr
->get_op_stamp();
4939 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
4940 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
4941 pi
.inode
.change_attr
++;
4944 mdr
->ls
= mdlog
->get_current_segment();
4945 EUpdate
*le
= new EUpdate(mdlog
, "setlayout");
4946 mdlog
->start_entry(le
);
4947 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4948 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4949 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4951 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
4954 void Server::handle_client_setdirlayout(MDRequestRef
& mdr
)
4956 const MClientRequest::const_ref
&req
= mdr
->client_request
;
4957 MutationImpl::LockOpVec lov
;
4958 file_layout_t
*dir_layout
= nullptr;
4959 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, lov
, true, false, &dir_layout
);
4962 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4963 respond_to_request(mdr
, -EROFS
);
4967 if (!cur
->is_dir()) {
4968 respond_to_request(mdr
, -ENOTDIR
);
4972 lov
.add_xlock(&cur
->policylock
);
4973 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4977 const auto old_pi
= cur
->get_projected_inode();
4978 file_layout_t layout
;
4979 if (old_pi
->has_layout())
4980 layout
= old_pi
->layout
;
4981 else if (dir_layout
)
4982 layout
= *dir_layout
;
4984 layout
= mdcache
->default_file_layout
;
4986 // Level of access required to complete
4987 int access
= MAY_WRITE
;
4989 const auto old_layout
= layout
;
4991 if (req
->head
.args
.setlayout
.layout
.fl_object_size
> 0)
4992 layout
.object_size
= req
->head
.args
.setlayout
.layout
.fl_object_size
;
4993 if (req
->head
.args
.setlayout
.layout
.fl_stripe_unit
> 0)
4994 layout
.stripe_unit
= req
->head
.args
.setlayout
.layout
.fl_stripe_unit
;
4995 if (req
->head
.args
.setlayout
.layout
.fl_stripe_count
> 0)
4996 layout
.stripe_count
=req
->head
.args
.setlayout
.layout
.fl_stripe_count
;
4997 if (req
->head
.args
.setlayout
.layout
.fl_pg_pool
> 0) {
4998 layout
.pool_id
= req
->head
.args
.setlayout
.layout
.fl_pg_pool
;
4999 // make sure we have as new a map as the client
5000 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
5001 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
5006 if (layout
!= old_layout
) {
5007 access
|= MAY_SET_VXATTR
;
5010 if (!layout
.is_valid()) {
5011 dout(10) << "bad layout" << dendl
;
5012 respond_to_request(mdr
, -EINVAL
);
5015 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
5016 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
5017 respond_to_request(mdr
, -EINVAL
);
5021 if (!check_access(mdr
, cur
, access
))
5024 auto &pi
= cur
->project_inode();
5025 pi
.inode
.layout
= layout
;
5026 pi
.inode
.version
= cur
->pre_dirty();
5029 mdr
->ls
= mdlog
->get_current_segment();
5030 EUpdate
*le
= new EUpdate(mdlog
, "setlayout");
5031 mdlog
->start_entry(le
);
5032 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5033 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5034 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5036 mdr
->no_early_reply
= true;
5037 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
5042 int Server::parse_layout_vxattr(string name
, string value
, const OSDMap
& osdmap
,
5043 file_layout_t
*layout
, bool validate
)
5045 dout(20) << "parse_layout_vxattr name " << name
<< " value '" << value
<< "'" << dendl
;
5047 if (name
== "layout") {
5048 string::iterator begin
= value
.begin();
5049 string::iterator end
= value
.end();
5050 keys_and_values
<string::iterator
> p
; // create instance of parser
5051 std::map
<string
, string
> m
; // map to receive results
5052 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
5055 string
left(begin
, end
);
5056 dout(10) << " parsed " << m
<< " left '" << left
<< "'" << dendl
;
5059 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
5060 // Skip validation on each attr, we do it once at the end (avoid
5061 // rejecting intermediate states if the overall result is ok)
5062 int r
= parse_layout_vxattr(string("layout.") + q
->first
, q
->second
,
5063 osdmap
, layout
, false);
5067 } else if (name
== "layout.object_size") {
5068 layout
->object_size
= boost::lexical_cast
<unsigned>(value
);
5069 } else if (name
== "layout.stripe_unit") {
5070 layout
->stripe_unit
= boost::lexical_cast
<unsigned>(value
);
5071 } else if (name
== "layout.stripe_count") {
5072 layout
->stripe_count
= boost::lexical_cast
<unsigned>(value
);
5073 } else if (name
== "layout.pool") {
5075 layout
->pool_id
= boost::lexical_cast
<unsigned>(value
);
5076 } catch (boost::bad_lexical_cast
const&) {
5077 int64_t pool
= osdmap
.lookup_pg_pool_name(value
);
5079 dout(10) << " unknown pool " << value
<< dendl
;
5082 layout
->pool_id
= pool
;
5084 } else if (name
== "layout.pool_namespace") {
5085 layout
->pool_ns
= value
;
5087 dout(10) << " unknown layout vxattr " << name
<< dendl
;
5090 } catch (boost::bad_lexical_cast
const&) {
5091 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
5095 if (validate
&& !layout
->is_valid()) {
5096 dout(10) << "bad layout" << dendl
;
5099 if (!mds
->mdsmap
->is_data_pool(layout
->pool_id
)) {
5100 dout(10) << " invalid data pool " << layout
->pool_id
<< dendl
;
5106 int Server::parse_quota_vxattr(string name
, string value
, quota_info_t
*quota
)
5108 dout(20) << "parse_quota_vxattr name " << name
<< " value '" << value
<< "'" << dendl
;
5110 if (name
== "quota") {
5111 string::iterator begin
= value
.begin();
5112 string::iterator end
= value
.end();
5114 // keep quota unchanged. (for create_quota_realm())
5117 keys_and_values
<string::iterator
> p
; // create instance of parser
5118 std::map
<string
, string
> m
; // map to receive results
5119 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
5122 string
left(begin
, end
);
5123 dout(10) << " parsed " << m
<< " left '" << left
<< "'" << dendl
;
5126 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
5127 int r
= parse_quota_vxattr(string("quota.") + q
->first
, q
->second
, quota
);
5131 } else if (name
== "quota.max_bytes") {
5132 int64_t q
= boost::lexical_cast
<int64_t>(value
);
5135 quota
->max_bytes
= q
;
5136 } else if (name
== "quota.max_files") {
5137 int64_t q
= boost::lexical_cast
<int64_t>(value
);
5140 quota
->max_files
= q
;
5142 dout(10) << " unknown quota vxattr " << name
<< dendl
;
5145 } catch (boost::bad_lexical_cast
const&) {
5146 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
5150 if (!quota
->is_valid()) {
5151 dout(10) << "bad quota" << dendl
;
5157 void Server::create_quota_realm(CInode
*in
)
5159 dout(10) << __func__
<< " " << *in
<< dendl
;
5161 auto req
= MClientRequest::create(CEPH_MDS_OP_SETXATTR
);
5162 req
->set_filepath(filepath(in
->ino()));
5163 req
->set_string2("ceph.quota");
5164 // empty vxattr value
5165 req
->set_tid(mds
->issue_tid());
5167 mds
->send_message_mds(req
, in
->authority().first
);
5171 * Verify that the file layout attribute carried by client
5172 * is well-formatted.
5173 * Return 0 on success, otherwise this function takes
5174 * responsibility for the passed mdr.
5176 int Server::check_layout_vxattr(MDRequestRef
& mdr
,
5179 file_layout_t
*layout
)
5181 const MClientRequest::const_ref
&req
= mdr
->client_request
;
5185 mds
->objecter
->with_osdmap([&](const OSDMap
& osdmap
) {
5186 r
= parse_layout_vxattr(name
, value
, osdmap
, layout
);
5187 epoch
= osdmap
.get_epoch();
5192 // we don't have the specified pool, make sure our map
5193 // is newer than or as new as the client.
5194 epoch_t req_epoch
= req
->get_osdmap_epoch();
5196 if (req_epoch
> epoch
) {
5198 // well, our map is older. consult mds.
5199 Context
*fin
= new C_IO_Wrapper(mds
, new C_MDS_RetryRequest(mdcache
, mdr
));
5201 if (!mds
->objecter
->wait_for_map(req_epoch
, fin
))
5202 return r
; // wait, fin will retry this request later
5206 // now we have at least as new a map as the client, try again.
5207 mds
->objecter
->with_osdmap([&](const OSDMap
& osdmap
) {
5208 r
= parse_layout_vxattr(name
, value
, osdmap
, layout
);
5209 epoch
= osdmap
.get_epoch();
5212 ceph_assert(epoch
>= req_epoch
); // otherwise wait_for_map() told a lie
5214 } else if (req_epoch
== 0 && !mdr
->waited_for_osdmap
) {
5216 // For compatibility with client w/ old code, we still need get the
5217 // latest map. One day if COMPACT_VERSION of MClientRequest >=3,
5218 // we can remove those code.
5219 mdr
->waited_for_osdmap
= true;
5220 mds
->objecter
->wait_for_latest_osdmap(new C_IO_Wrapper(
5221 mds
, new C_MDS_RetryRequest(mdcache
, mdr
)));
5231 respond_to_request(mdr
, r
);
5239 void Server::handle_set_vxattr(MDRequestRef
& mdr
, CInode
*cur
,
5240 file_layout_t
*dir_layout
,
5241 MutationImpl::LockOpVec
& lov
)
5243 const MClientRequest::const_ref
&req
= mdr
->client_request
;
5244 string
name(req
->get_path2());
5245 bufferlist bl
= req
->get_data();
5246 string
value (bl
.c_str(), bl
.length());
5247 dout(10) << "handle_set_vxattr " << name
5248 << " val " << value
.length()
5249 << " bytes on " << *cur
5252 CInode::mempool_inode
*pip
= nullptr;
5255 if (!check_access(mdr
, cur
, MAY_SET_VXATTR
)) {
5259 bool new_realm
= false;
5260 if (name
.compare(0, 15, "ceph.dir.layout") == 0) {
5261 if (!cur
->is_dir()) {
5262 respond_to_request(mdr
, -EINVAL
);
5266 file_layout_t layout
;
5267 if (cur
->get_projected_inode()->has_layout())
5268 layout
= cur
->get_projected_inode()->layout
;
5269 else if (dir_layout
)
5270 layout
= *dir_layout
;
5272 layout
= mdcache
->default_file_layout
;
5274 rest
= name
.substr(name
.find("layout"));
5275 if (check_layout_vxattr(mdr
, rest
, value
, &layout
) < 0)
5278 lov
.add_xlock(&cur
->policylock
);
5279 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5282 auto &pi
= cur
->project_inode();
5283 pi
.inode
.layout
= layout
;
5284 mdr
->no_early_reply
= true;
5286 } else if (name
.compare(0, 16, "ceph.file.layout") == 0) {
5287 if (!cur
->is_file()) {
5288 respond_to_request(mdr
, -EINVAL
);
5291 if (cur
->get_projected_inode()->size
||
5292 cur
->get_projected_inode()->truncate_seq
> 1) {
5293 respond_to_request(mdr
, -ENOTEMPTY
);
5296 file_layout_t layout
= cur
->get_projected_inode()->layout
;
5297 rest
= name
.substr(name
.find("layout"));
5298 if (check_layout_vxattr(mdr
, rest
, value
, &layout
) < 0)
5301 lov
.add_xlock(&cur
->filelock
);
5302 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5305 auto &pi
= cur
->project_inode();
5306 int64_t old_pool
= pi
.inode
.layout
.pool_id
;
5307 pi
.inode
.add_old_pool(old_pool
);
5308 pi
.inode
.layout
= layout
;
5310 } else if (name
.compare(0, 10, "ceph.quota") == 0) {
5311 if (!cur
->is_dir() || cur
->is_root()) {
5312 respond_to_request(mdr
, -EINVAL
);
5316 quota_info_t quota
= cur
->get_projected_inode()->quota
;
5318 rest
= name
.substr(name
.find("quota"));
5319 int r
= parse_quota_vxattr(rest
, value
, "a
);
5321 respond_to_request(mdr
, r
);
5325 lov
.add_xlock(&cur
->policylock
);
5326 if (quota
.is_enable() && !cur
->get_projected_srnode()) {
5327 lov
.add_xlock(&cur
->snaplock
);
5331 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5334 auto &pi
= cur
->project_inode(false, new_realm
);
5335 pi
.inode
.quota
= quota
;
5338 SnapRealm
*realm
= cur
->find_snaprealm();
5339 auto seq
= realm
->get_newest_seq();
5340 auto &newsnap
= *pi
.snapnode
;
5341 newsnap
.created
= seq
;
5344 mdr
->no_early_reply
= true;
5347 client_t exclude_ct
= mdr
->get_client();
5348 mdcache
->broadcast_quota_to_client(cur
, exclude_ct
, true);
5349 } else if (name
.find("ceph.dir.pin") == 0) {
5350 if (!cur
->is_dir() || cur
->is_root()) {
5351 respond_to_request(mdr
, -EINVAL
);
5357 rank
= boost::lexical_cast
<mds_rank_t
>(value
);
5358 if (rank
< 0) rank
= MDS_RANK_NONE
;
5359 } catch (boost::bad_lexical_cast
const&) {
5360 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
5361 respond_to_request(mdr
, -EINVAL
);
5365 lov
.add_xlock(&cur
->policylock
);
5366 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5369 auto &pi
= cur
->project_inode();
5370 cur
->set_export_pin(rank
);
5373 dout(10) << " unknown vxattr " << name
<< dendl
;
5374 respond_to_request(mdr
, -EINVAL
);
5379 pip
->ctime
= mdr
->get_op_stamp();
5380 if (mdr
->get_op_stamp() > pip
->rstat
.rctime
)
5381 pip
->rstat
.rctime
= mdr
->get_op_stamp();
5382 pip
->version
= cur
->pre_dirty();
5384 pip
->update_backtrace();
5387 mdr
->ls
= mdlog
->get_current_segment();
5388 EUpdate
*le
= new EUpdate(mdlog
, "set vxattr layout");
5389 mdlog
->start_entry(le
);
5390 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5391 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5392 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5394 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
,
5395 false, false, new_realm
));
5399 void Server::handle_remove_vxattr(MDRequestRef
& mdr
, CInode
*cur
,
5400 file_layout_t
*dir_layout
,
5401 MutationImpl::LockOpVec
& lov
)
5403 const MClientRequest::const_ref
&req
= mdr
->client_request
;
5404 string
name(req
->get_path2());
5406 dout(10) << __func__
<< " " << name
<< " on " << *cur
<< dendl
;
5408 if (name
== "ceph.dir.layout") {
5409 if (!cur
->is_dir()) {
5410 respond_to_request(mdr
, -ENODATA
);
5413 if (cur
->is_root()) {
5414 dout(10) << "can't remove layout policy on the root directory" << dendl
;
5415 respond_to_request(mdr
, -EINVAL
);
5419 if (!cur
->get_projected_inode()->has_layout()) {
5420 respond_to_request(mdr
, -ENODATA
);
5424 lov
.add_xlock(&cur
->policylock
);
5425 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5428 auto &pi
= cur
->project_inode();
5429 pi
.inode
.clear_layout();
5430 pi
.inode
.version
= cur
->pre_dirty();
5433 mdr
->ls
= mdlog
->get_current_segment();
5434 EUpdate
*le
= new EUpdate(mdlog
, "remove dir layout vxattr");
5435 mdlog
->start_entry(le
);
5436 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5437 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5438 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5440 mdr
->no_early_reply
= true;
5441 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
5443 } else if (name
== "ceph.dir.layout.pool_namespace"
5444 || name
== "ceph.file.layout.pool_namespace") {
5445 // Namespace is the only layout field that has a meaningful
5446 // null/none value (empty string, means default layout). Is equivalent
5447 // to a setxattr with empty string: pass through the empty payload of
5448 // the rmxattr request to do this.
5449 handle_set_vxattr(mdr
, cur
, dir_layout
, lov
);
5453 respond_to_request(mdr
, -ENODATA
);
5456 class C_MDS_inode_xattr_update_finish
: public ServerLogContext
{
5460 C_MDS_inode_xattr_update_finish(Server
*s
, MDRequestRef
& r
, CInode
*i
) :
5461 ServerLogContext(s
, r
), in(i
) { }
5462 void finish(int r
) override
{
5463 ceph_assert(r
== 0);
5466 in
->pop_and_dirty_projected_inode(mdr
->ls
);
5470 get_mds()->balancer
->hit_inode(in
, META_POP_IWR
);
5472 server
->respond_to_request(mdr
, 0);
5476 void Server::handle_client_setxattr(MDRequestRef
& mdr
)
5478 const MClientRequest::const_ref
&req
= mdr
->client_request
;
5479 string
name(req
->get_path2());
5480 MutationImpl::LockOpVec lov
;
5483 file_layout_t
*dir_layout
= NULL
;
5484 if (name
.compare(0, 15, "ceph.dir.layout") == 0)
5485 cur
= rdlock_path_pin_ref(mdr
, 0, lov
, true, false, &dir_layout
);
5487 cur
= rdlock_path_pin_ref(mdr
, 0, lov
, true);
5491 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5492 respond_to_request(mdr
, -EROFS
);
5496 int flags
= req
->head
.args
.setxattr
.flags
;
5498 // magic ceph.* namespace?
5499 if (name
.compare(0, 5, "ceph.") == 0) {
5500 handle_set_vxattr(mdr
, cur
, dir_layout
, lov
);
5504 lov
.add_xlock(&cur
->xattrlock
);
5505 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5508 if (!check_access(mdr
, cur
, MAY_WRITE
))
5511 auto pxattrs
= cur
->get_projected_xattrs();
5512 size_t len
= req
->get_data().length();
5513 size_t inc
= len
+ name
.length();
5515 // check xattrs kv pairs size
5516 size_t cur_xattrs_size
= 0;
5517 for (const auto& p
: *pxattrs
) {
5518 if ((flags
& CEPH_XATTR_REPLACE
) && (name
.compare(p
.first
) == 0)) {
5521 cur_xattrs_size
+= p
.first
.length() + p
.second
.length();
5524 if (((cur_xattrs_size
+ inc
) > g_conf()->mds_max_xattr_pairs_size
)) {
5525 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
5526 << cur_xattrs_size
<< ", inc " << inc
<< dendl
;
5527 respond_to_request(mdr
, -ENOSPC
);
5531 if ((flags
& CEPH_XATTR_CREATE
) && pxattrs
->count(mempool::mds_co::string(name
))) {
5532 dout(10) << "setxattr '" << name
<< "' XATTR_CREATE and EEXIST on " << *cur
<< dendl
;
5533 respond_to_request(mdr
, -EEXIST
);
5536 if ((flags
& CEPH_XATTR_REPLACE
) && !pxattrs
->count(mempool::mds_co::string(name
))) {
5537 dout(10) << "setxattr '" << name
<< "' XATTR_REPLACE and ENODATA on " << *cur
<< dendl
;
5538 respond_to_request(mdr
, -ENODATA
);
5542 dout(10) << "setxattr '" << name
<< "' len " << len
<< " on " << *cur
<< dendl
;
5545 auto &pi
= cur
->project_inode(true);
5546 pi
.inode
.version
= cur
->pre_dirty();
5547 pi
.inode
.ctime
= mdr
->get_op_stamp();
5548 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
5549 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
5550 pi
.inode
.change_attr
++;
5551 pi
.inode
.xattr_version
++;
5552 auto &px
= *pi
.xattrs
;
5553 if ((flags
& CEPH_XATTR_REMOVE
)) {
5554 px
.erase(mempool::mds_co::string(name
));
5556 bufferptr b
= buffer::create(len
);
5558 req
->get_data().copy(0, len
, b
.c_str());
5559 auto em
= px
.emplace(std::piecewise_construct
, std::forward_as_tuple(mempool::mds_co::string(name
)), std::forward_as_tuple(b
));
5561 em
.first
->second
= b
;
5565 mdr
->ls
= mdlog
->get_current_segment();
5566 EUpdate
*le
= new EUpdate(mdlog
, "setxattr");
5567 mdlog
->start_entry(le
);
5568 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5569 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5570 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5572 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
5575 void Server::handle_client_removexattr(MDRequestRef
& mdr
)
5577 const MClientRequest::const_ref
&req
= mdr
->client_request
;
5578 std::string
name(req
->get_path2());
5580 MutationImpl::LockOpVec lov
;
5581 file_layout_t
*dir_layout
= nullptr;
5583 if (name
== "ceph.dir.layout")
5584 cur
= rdlock_path_pin_ref(mdr
, 0, lov
, true, false, &dir_layout
);
5586 cur
= rdlock_path_pin_ref(mdr
, 0, lov
, true);
5590 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5591 respond_to_request(mdr
, -EROFS
);
5595 if (name
.compare(0, 5, "ceph.") == 0) {
5596 handle_remove_vxattr(mdr
, cur
, dir_layout
, lov
);
5600 lov
.add_xlock(&cur
->xattrlock
);
5601 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5604 auto pxattrs
= cur
->get_projected_xattrs();
5605 if (pxattrs
->count(mempool::mds_co::string(name
)) == 0) {
5606 dout(10) << "removexattr '" << name
<< "' and ENODATA on " << *cur
<< dendl
;
5607 respond_to_request(mdr
, -ENODATA
);
5611 dout(10) << "removexattr '" << name
<< "' on " << *cur
<< dendl
;
5614 auto &pi
= cur
->project_inode(true);
5615 auto &px
= *pi
.xattrs
;
5616 pi
.inode
.version
= cur
->pre_dirty();
5617 pi
.inode
.ctime
= mdr
->get_op_stamp();
5618 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
5619 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
5620 pi
.inode
.change_attr
++;
5621 pi
.inode
.xattr_version
++;
5622 px
.erase(mempool::mds_co::string(name
));
5625 mdr
->ls
= mdlog
->get_current_segment();
5626 EUpdate
*le
= new EUpdate(mdlog
, "removexattr");
5627 mdlog
->start_entry(le
);
5628 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5629 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5630 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5632 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
5636 // =================================================================
5637 // DIRECTORY and NAMESPACE OPS
5640 // ------------------------------------------------
5644 class C_MDS_mknod_finish
: public ServerLogContext
{
5648 C_MDS_mknod_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ni
) :
5649 ServerLogContext(s
, r
), dn(d
), newi(ni
) {}
5650 void finish(int r
) override
{
5651 ceph_assert(r
== 0);
5654 dn
->pop_projected_linkage();
5656 // be a bit hacky with the inode version, here.. we decrement it
5657 // just to keep mark_dirty() happen. (we didn't bother projecting
5658 // a new version of hte inode since it's just been created)
5659 newi
->inode
.version
--;
5660 newi
->mark_dirty(newi
->inode
.version
+ 1, mdr
->ls
);
5661 newi
->mark_dirty_parent(mdr
->ls
, true);
5664 if (newi
->inode
.is_dir()) {
5665 CDir
*dir
= newi
->get_dirfrag(frag_t());
5667 dir
->fnode
.version
--;
5668 dir
->mark_dirty(dir
->fnode
.version
+ 1, mdr
->ls
);
5669 dir
->mark_new(mdr
->ls
);
5674 MDRequestRef null_ref
;
5675 get_mds()->mdcache
->send_dentry_link(dn
, null_ref
);
5677 if (newi
->inode
.is_file())
5678 get_mds()->locker
->share_inode_max_size(newi
);
5681 get_mds()->balancer
->hit_inode(newi
, META_POP_IWR
);
5684 server
->respond_to_request(mdr
, 0);
5689 void Server::handle_client_mknod(MDRequestRef
& mdr
)
5691 const MClientRequest::const_ref
&req
= mdr
->client_request
;
5692 client_t client
= mdr
->get_client();
5693 MutationImpl::LockOpVec lov
;
5694 file_layout_t
*dir_layout
= nullptr;
5695 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, 0, lov
, false, false, false,
5698 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5699 respond_to_request(mdr
, -EROFS
);
5702 CInode
*diri
= dn
->get_dir()->get_inode();
5703 lov
.add_rdlock(&diri
->authlock
);
5704 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5707 if (!check_access(mdr
, diri
, MAY_WRITE
))
5710 if (!check_fragment_space(mdr
, dn
->get_dir()))
5713 unsigned mode
= req
->head
.args
.mknod
.mode
;
5714 if ((mode
& S_IFMT
) == 0)
5718 file_layout_t layout
;
5719 if (dir_layout
&& S_ISREG(mode
))
5720 layout
= *dir_layout
;
5722 layout
= mdcache
->default_file_layout
;
5724 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
), mode
, &layout
);
5727 dn
->push_projected_linkage(newi
);
5729 newi
->inode
.rdev
= req
->head
.args
.mknod
.rdev
;
5730 newi
->inode
.version
= dn
->pre_dirty();
5731 newi
->inode
.rstat
.rfiles
= 1;
5732 if (layout
.pool_id
!= mdcache
->default_file_layout
.pool_id
)
5733 newi
->inode
.add_old_pool(mdcache
->default_file_layout
.pool_id
);
5734 newi
->inode
.update_backtrace();
5736 snapid_t follows
= mdcache
->get_global_snaprealm()->get_newest_seq();
5737 SnapRealm
*realm
= dn
->get_dir()->inode
->find_snaprealm();
5738 ceph_assert(follows
>= realm
->get_newest_seq());
5740 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
5741 // want to write to it (e.g., if they are reexporting NFS)
5742 if (S_ISREG(newi
->inode
.mode
)) {
5743 // issue a cap on the file
5744 int cmode
= CEPH_FILE_MODE_RDWR
;
5745 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
->session
, realm
, req
->is_replay());
5749 // put locks in excl mode
5750 newi
->filelock
.set_state(LOCK_EXCL
);
5751 newi
->authlock
.set_state(LOCK_EXCL
);
5752 newi
->xattrlock
.set_state(LOCK_EXCL
);
5754 dout(15) << " setting a client_range too, since this is a regular file" << dendl
;
5755 newi
->inode
.client_ranges
[client
].range
.first
= 0;
5756 newi
->inode
.client_ranges
[client
].range
.last
= newi
->inode
.get_layout_size_increment();
5757 newi
->inode
.client_ranges
[client
].follows
= follows
;
5758 cap
->mark_clientwriteable();
5762 ceph_assert(dn
->first
== follows
+ 1);
5763 newi
->first
= dn
->first
;
5765 dout(10) << "mknod mode " << newi
->inode
.mode
<< " rdev " << newi
->inode
.rdev
<< dendl
;
5768 mdr
->ls
= mdlog
->get_current_segment();
5769 EUpdate
*le
= new EUpdate(mdlog
, "mknod");
5770 mdlog
->start_entry(le
);
5771 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5772 journal_allocated_inos(mdr
, &le
->metablob
);
5774 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(),
5775 PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
5776 le
->metablob
.add_primary_dentry(dn
, newi
, true, true, true);
5778 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
5784 /* This function takes responsibility for the passed mdr*/
5785 void Server::handle_client_mkdir(MDRequestRef
& mdr
)
5787 const MClientRequest::const_ref
&req
= mdr
->client_request
;
5788 if (req
->get_filepath().is_last_dot_or_dotdot()) {
5789 respond_to_request(mdr
, -EEXIST
);
5793 MutationImpl::LockOpVec lov
;
5794 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, 0, lov
, false, false, false);
5796 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5797 respond_to_request(mdr
, -EROFS
);
5800 CDir
*dir
= dn
->get_dir();
5801 CInode
*diri
= dir
->get_inode();
5802 lov
.add_rdlock(&diri
->authlock
);
5803 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5806 // mkdir check access
5807 if (!check_access(mdr
, diri
, MAY_WRITE
))
5810 if (!check_fragment_space(mdr
, dir
))
5814 unsigned mode
= req
->head
.args
.mkdir
.mode
;
5817 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
), mode
);
5820 // it's a directory.
5821 dn
->push_projected_linkage(newi
);
5823 newi
->inode
.version
= dn
->pre_dirty();
5824 newi
->inode
.rstat
.rsubdirs
= 1;
5825 newi
->inode
.update_backtrace();
5827 snapid_t follows
= mdcache
->get_global_snaprealm()->get_newest_seq();
5828 SnapRealm
*realm
= dn
->get_dir()->inode
->find_snaprealm();
5829 ceph_assert(follows
>= realm
->get_newest_seq());
5831 dout(12) << " follows " << follows
<< dendl
;
5832 ceph_assert(dn
->first
== follows
+ 1);
5833 newi
->first
= dn
->first
;
5835 // ...and that new dir is empty.
5836 CDir
*newdir
= newi
->get_or_open_dirfrag(mdcache
, frag_t());
5837 newdir
->state_set(CDir::STATE_CREATING
);
5838 newdir
->mark_complete();
5839 newdir
->fnode
.version
= newdir
->pre_dirty();
5842 mdr
->ls
= mdlog
->get_current_segment();
5843 EUpdate
*le
= new EUpdate(mdlog
, "mkdir");
5844 mdlog
->start_entry(le
);
5845 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5846 journal_allocated_inos(mdr
, &le
->metablob
);
5847 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
5848 le
->metablob
.add_primary_dentry(dn
, newi
, true, true);
5849 le
->metablob
.add_new_dir(newdir
); // dirty AND complete AND new
5851 // issue a cap on the directory
5852 int cmode
= CEPH_FILE_MODE_RDWR
;
5853 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
->session
, realm
, req
->is_replay());
5857 // put locks in excl mode
5858 newi
->filelock
.set_state(LOCK_EXCL
);
5859 newi
->authlock
.set_state(LOCK_EXCL
);
5860 newi
->xattrlock
.set_state(LOCK_EXCL
);
5863 // make sure this inode gets into the journal
5864 le
->metablob
.add_opened_ino(newi
->ino());
5866 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
5868 // We hit_dir (via hit_inode) in our finish callback, but by then we might
5869 // have overshot the split size (multiple mkdir in flight), so here is
5870 // an early chance to split the dir if this mkdir makes it oversized.
5871 mds
->balancer
->maybe_fragment(dir
, false);
5877 void Server::handle_client_symlink(MDRequestRef
& mdr
)
5879 const MClientRequest::const_ref
&req
= mdr
->client_request
;
5880 MutationImpl::LockOpVec lov
;
5881 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, 0, lov
, false, false, false);
5883 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5884 respond_to_request(mdr
, -EROFS
);
5887 CDir
*dir
= dn
->get_dir();
5888 CInode
*diri
= dir
->get_inode();
5889 lov
.add_rdlock(&diri
->authlock
);
5890 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5893 if (!check_access(mdr
, diri
, MAY_WRITE
))
5896 if (!check_fragment_space(mdr
, dir
))
5899 unsigned mode
= S_IFLNK
| 0777;
5900 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
), mode
);
5904 dn
->push_projected_linkage(newi
);
5906 newi
->symlink
= req
->get_path2();
5907 newi
->inode
.size
= newi
->symlink
.length();
5908 newi
->inode
.rstat
.rbytes
= newi
->inode
.size
;
5909 newi
->inode
.rstat
.rfiles
= 1;
5910 newi
->inode
.version
= dn
->pre_dirty();
5911 newi
->inode
.update_backtrace();
5913 newi
->first
= dn
->first
;
5916 mdr
->ls
= mdlog
->get_current_segment();
5917 EUpdate
*le
= new EUpdate(mdlog
, "symlink");
5918 mdlog
->start_entry(le
);
5919 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5920 journal_allocated_inos(mdr
, &le
->metablob
);
5921 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
5922 le
->metablob
.add_primary_dentry(dn
, newi
, true, true);
5924 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
5933 void Server::handle_client_link(MDRequestRef
& mdr
)
5935 const MClientRequest::const_ref
&req
= mdr
->client_request
;
5937 dout(7) << "handle_client_link " << req
->get_filepath()
5938 << " to " << req
->get_filepath2()
5941 MutationImpl::LockOpVec lov
;
5943 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, 0, lov
, false, false, false);
5945 CInode
*targeti
= rdlock_path_pin_ref(mdr
, 1, lov
, false);
5946 if (!targeti
) return;
5947 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5948 respond_to_request(mdr
, -EROFS
);
5952 CDir
*dir
= dn
->get_dir();
5953 dout(7) << "handle_client_link link " << dn
->get_name() << " in " << *dir
<< dendl
;
5954 dout(7) << "target is " << *targeti
<< dendl
;
5955 if (targeti
->is_dir()) {
5956 // if srcdn is replica, need to make sure its linkage is correct
5957 vector
<CDentry
*>& trace
= mdr
->dn
[1];
5958 if (trace
.empty() ||
5959 trace
.back()->is_auth() ||
5960 trace
.back()->lock
.can_read(mdr
->get_client())) {
5961 dout(7) << "target is a dir, failing..." << dendl
;
5962 respond_to_request(mdr
, -EINVAL
);
5967 lov
.erase_rdlock(&targeti
->snaplock
);
5968 lov
.add_xlock(&targeti
->snaplock
);
5969 lov
.add_xlock(&targeti
->linklock
);
5971 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5974 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
5975 if (!check_access(mdr
, targeti
, MAY_WRITE
))
5978 if (!check_access(mdr
, dir
->get_inode(), MAY_WRITE
))
5981 if (!check_fragment_space(mdr
, dir
))
5986 ceph_assert(g_conf()->mds_kill_link_at
!= 1);
5989 if (targeti
->is_auth())
5990 _link_local(mdr
, dn
, targeti
);
5992 _link_remote(mdr
, true, dn
, targeti
);
5996 class C_MDS_link_local_finish
: public ServerLogContext
{
6003 C_MDS_link_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ti
,
6004 version_t dnpv_
, version_t tipv_
, bool ar
) :
6005 ServerLogContext(s
, r
), dn(d
), targeti(ti
),
6006 dnpv(dnpv_
), tipv(tipv_
), adjust_realm(ar
) { }
6007 void finish(int r
) override
{
6008 ceph_assert(r
== 0);
6009 server
->_link_local_finish(mdr
, dn
, targeti
, dnpv
, tipv
, adjust_realm
);
6014 void Server::_link_local(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
)
6016 dout(10) << "_link_local " << *dn
<< " to " << *targeti
<< dendl
;
6018 mdr
->ls
= mdlog
->get_current_segment();
6020 // predirty NEW dentry
6021 version_t dnpv
= dn
->pre_dirty();
6022 version_t tipv
= targeti
->pre_dirty();
6024 // project inode update
6025 auto &pi
= targeti
->project_inode();
6027 pi
.inode
.ctime
= mdr
->get_op_stamp();
6028 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
6029 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
6030 pi
.inode
.change_attr
++;
6031 pi
.inode
.version
= tipv
;
6033 bool adjust_realm
= false;
6034 if (!targeti
->is_projected_snaprealm_global()) {
6035 sr_t
*newsnap
= targeti
->project_snaprealm();
6036 targeti
->mark_snaprealm_global(newsnap
);
6037 targeti
->record_snaprealm_parent_dentry(newsnap
, NULL
, targeti
->get_projected_parent_dn(), true);
6038 adjust_realm
= true;
6042 EUpdate
*le
= new EUpdate(mdlog
, "link_local");
6043 mdlog
->start_entry(le
);
6044 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
6045 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1); // new dn
6046 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, 0, PREDIRTY_PRIMARY
); // targeti
6047 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
6048 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, targeti
);
6050 // do this after predirty_*, to avoid funky extra dnl arg
6051 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
6053 journal_and_reply(mdr
, targeti
, dn
, le
,
6054 new C_MDS_link_local_finish(this, mdr
, dn
, targeti
, dnpv
, tipv
, adjust_realm
));
6057 void Server::_link_local_finish(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
,
6058 version_t dnpv
, version_t tipv
, bool adjust_realm
)
6060 dout(10) << "_link_local_finish " << *dn
<< " to " << *targeti
<< dendl
;
6062 // link and unlock the NEW dentry
6063 CDentry::linkage_t
*dnl
= dn
->pop_projected_linkage();
6064 if (!dnl
->get_inode())
6065 dn
->link_remote(dnl
, targeti
);
6066 dn
->mark_dirty(dnpv
, mdr
->ls
);
6069 targeti
->pop_and_dirty_projected_inode(mdr
->ls
);
6073 MDRequestRef null_ref
;
6074 mdcache
->send_dentry_link(dn
, null_ref
);
6077 int op
= CEPH_SNAP_OP_SPLIT
;
6078 mds
->mdcache
->send_snap_update(targeti
, 0, op
);
6079 mds
->mdcache
->do_realm_invalidate_and_update_notify(targeti
, op
);
6082 // bump target popularity
6083 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
6084 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
6087 respond_to_request(mdr
, 0);
6091 // link / unlink remote
6093 class C_MDS_link_remote_finish
: public ServerLogContext
{
6099 C_MDS_link_remote_finish(Server
*s
, MDRequestRef
& r
, bool i
, CDentry
*d
, CInode
*ti
) :
6100 ServerLogContext(s
, r
), inc(i
), dn(d
), targeti(ti
),
6101 dpv(d
->get_projected_version()) {}
6102 void finish(int r
) override
{
6103 ceph_assert(r
== 0);
6104 server
->_link_remote_finish(mdr
, inc
, dn
, targeti
, dpv
);
6108 void Server::_link_remote(MDRequestRef
& mdr
, bool inc
, CDentry
*dn
, CInode
*targeti
)
6110 dout(10) << "_link_remote "
6111 << (inc
? "link ":"unlink ")
6112 << *dn
<< " to " << *targeti
<< dendl
;
6114 // 1. send LinkPrepare to dest (journal nlink++ prepare)
6115 mds_rank_t linkauth
= targeti
->authority().first
;
6116 if (mdr
->more()->witnessed
.count(linkauth
) == 0) {
6117 if (mds
->is_cluster_degraded() &&
6118 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(linkauth
)) {
6119 dout(10) << " targeti auth mds." << linkauth
<< " is not active" << dendl
;
6120 if (mdr
->more()->waiting_on_slave
.empty())
6121 mds
->wait_for_active_peer(linkauth
, new C_MDS_RetryRequest(mdcache
, mdr
));
6125 dout(10) << " targeti auth must prepare nlink++/--" << dendl
;
6128 op
= MMDSSlaveRequest::OP_LINKPREP
;
6130 op
= MMDSSlaveRequest::OP_UNLINKPREP
;
6131 auto req
= MMDSSlaveRequest::create(mdr
->reqid
, mdr
->attempt
, op
);
6132 targeti
->set_object_info(req
->get_object_info());
6133 req
->op_stamp
= mdr
->get_op_stamp();
6134 if (auto& desti_srnode
= mdr
->more()->desti_srnode
)
6135 encode(*desti_srnode
, req
->desti_snapbl
);
6136 mds
->send_message_mds(req
, linkauth
);
6138 ceph_assert(mdr
->more()->waiting_on_slave
.count(linkauth
) == 0);
6139 mdr
->more()->waiting_on_slave
.insert(linkauth
);
6142 dout(10) << " targeti auth has prepared nlink++/--" << dendl
;
6144 ceph_assert(g_conf()->mds_kill_link_at
!= 2);
6146 if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
6147 delete desti_srnode
;
6148 desti_srnode
= NULL
;
6151 mdr
->set_mds_stamp(ceph_clock_now());
6154 mdr
->ls
= mdlog
->get_current_segment();
6155 EUpdate
*le
= new EUpdate(mdlog
, inc
? "link_remote":"unlink_remote");
6156 mdlog
->start_entry(le
);
6157 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
6158 if (!mdr
->more()->witnessed
.empty()) {
6159 dout(20) << " noting uncommitted_slaves " << mdr
->more()->witnessed
<< dendl
;
6160 le
->reqid
= mdr
->reqid
;
6161 le
->had_slaves
= true;
6162 mdcache
->add_uncommitted_master(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
6167 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1);
6168 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
6169 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
6172 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, -1);
6173 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
6174 le
->metablob
.add_null_dentry(dn
, true);
6175 dn
->push_projected_linkage();
6178 journal_and_reply(mdr
, targeti
, dn
, le
, new C_MDS_link_remote_finish(this, mdr
, inc
, dn
, targeti
));
6181 void Server::_link_remote_finish(MDRequestRef
& mdr
, bool inc
,
6182 CDentry
*dn
, CInode
*targeti
,
6185 dout(10) << "_link_remote_finish "
6186 << (inc
? "link ":"unlink ")
6187 << *dn
<< " to " << *targeti
<< dendl
;
6189 ceph_assert(g_conf()->mds_kill_link_at
!= 3);
6191 if (!mdr
->more()->witnessed
.empty())
6192 mdcache
->logged_master_update(mdr
->reqid
);
6195 // link the new dentry
6196 CDentry::linkage_t
*dnl
= dn
->pop_projected_linkage();
6197 if (!dnl
->get_inode())
6198 dn
->link_remote(dnl
, targeti
);
6199 dn
->mark_dirty(dpv
, mdr
->ls
);
6201 // unlink main dentry
6202 dn
->get_dir()->unlink_inode(dn
);
6203 dn
->pop_projected_linkage();
6204 dn
->mark_dirty(dn
->get_projected_version(), mdr
->ls
); // dirty old dentry
6209 MDRequestRef null_ref
;
6211 mdcache
->send_dentry_link(dn
, null_ref
);
6213 mdcache
->send_dentry_unlink(dn
, NULL
, null_ref
);
6215 // bump target popularity
6216 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
6217 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
6220 respond_to_request(mdr
, 0);
6223 // removing a new dn?
6224 dn
->get_dir()->try_remove_unlinked_dn(dn
);
6228 // remote linking/unlinking
6230 class C_MDS_SlaveLinkPrep
: public ServerLogContext
{
6234 C_MDS_SlaveLinkPrep(Server
*s
, MDRequestRef
& r
, CInode
*t
, bool ar
) :
6235 ServerLogContext(s
, r
), targeti(t
), adjust_realm(ar
) { }
6236 void finish(int r
) override
{
6237 ceph_assert(r
== 0);
6238 server
->_logged_slave_link(mdr
, targeti
, adjust_realm
);
6242 class C_MDS_SlaveLinkCommit
: public ServerContext
{
6246 C_MDS_SlaveLinkCommit(Server
*s
, MDRequestRef
& r
, CInode
*t
) :
6247 ServerContext(s
), mdr(r
), targeti(t
) { }
6248 void finish(int r
) override
{
6249 server
->_commit_slave_link(mdr
, r
, targeti
);
6253 void Server::handle_slave_link_prep(MDRequestRef
& mdr
)
6255 dout(10) << "handle_slave_link_prep " << *mdr
6256 << " on " << mdr
->slave_request
->get_object_info()
6259 ceph_assert(g_conf()->mds_kill_link_at
!= 4);
6261 CInode
*targeti
= mdcache
->get_inode(mdr
->slave_request
->get_object_info().ino
);
6262 ceph_assert(targeti
);
6263 dout(10) << "targeti " << *targeti
<< dendl
;
6264 CDentry
*dn
= targeti
->get_parent_dn();
6265 CDentry::linkage_t
*dnl
= dn
->get_linkage();
6266 ceph_assert(dnl
->is_primary());
6268 mdr
->set_op_stamp(mdr
->slave_request
->op_stamp
);
6270 mdr
->auth_pin(targeti
);
6272 //ceph_abort(); // test hack: make sure master can handle a slave that fails to prepare...
6273 ceph_assert(g_conf()->mds_kill_link_at
!= 5);
6276 mdr
->ls
= mdlog
->get_current_segment();
6277 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_link_prep", mdr
->reqid
, mdr
->slave_to_mds
,
6278 ESlaveUpdate::OP_PREPARE
, ESlaveUpdate::LINK
);
6279 mdlog
->start_entry(le
);
6281 auto &pi
= dnl
->get_inode()->project_inode();
6283 // update journaled target inode
6285 bool adjust_realm
= false;
6286 bool realm_projected
= false;
6287 if (mdr
->slave_request
->get_op() == MMDSSlaveRequest::OP_LINKPREP
) {
6290 if (!targeti
->is_projected_snaprealm_global()) {
6291 sr_t
*newsnap
= targeti
->project_snaprealm();
6292 targeti
->mark_snaprealm_global(newsnap
);
6293 targeti
->record_snaprealm_parent_dentry(newsnap
, NULL
, targeti
->get_projected_parent_dn(), true);
6294 adjust_realm
= true;
6295 realm_projected
= true;
6300 if (targeti
->is_projected_snaprealm_global()) {
6301 ceph_assert(mdr
->slave_request
->desti_snapbl
.length());
6302 auto p
= mdr
->slave_request
->desti_snapbl
.cbegin();
6304 sr_t
*newsnap
= targeti
->project_snaprealm();
6305 decode(*newsnap
, p
);
6307 if (pi
.inode
.nlink
== 0)
6308 ceph_assert(!newsnap
->is_parent_global());
6310 realm_projected
= true;
6312 ceph_assert(mdr
->slave_request
->desti_snapbl
.length() == 0);
6316 link_rollback rollback
;
6317 rollback
.reqid
= mdr
->reqid
;
6318 rollback
.ino
= targeti
->ino();
6319 rollback
.old_ctime
= targeti
->inode
.ctime
; // we hold versionlock xlock; no concorrent projections
6320 const fnode_t
*pf
= targeti
->get_parent_dn()->get_dir()->get_projected_fnode();
6321 rollback
.old_dir_mtime
= pf
->fragstat
.mtime
;
6322 rollback
.old_dir_rctime
= pf
->rstat
.rctime
;
6323 rollback
.was_inc
= inc
;
6324 if (realm_projected
) {
6325 if (targeti
->snaprealm
) {
6326 encode(true, rollback
.snapbl
);
6327 targeti
->encode_snap_blob(rollback
.snapbl
);
6329 encode(false, rollback
.snapbl
);
6332 encode(rollback
, le
->rollback
);
6333 mdr
->more()->rollback_bl
= le
->rollback
;
6335 pi
.inode
.ctime
= mdr
->get_op_stamp();
6336 pi
.inode
.version
= targeti
->pre_dirty();
6338 dout(10) << " projected inode " << pi
.inode
.ino
<< " v " << pi
.inode
.version
<< dendl
;
6341 mdcache
->predirty_journal_parents(mdr
, &le
->commit
, dnl
->get_inode(), 0, PREDIRTY_SHALLOW
|PREDIRTY_PRIMARY
);
6342 mdcache
->journal_dirty_inode(mdr
.get(), &le
->commit
, targeti
);
6344 // set up commit waiter
6345 mdr
->more()->slave_commit
= new C_MDS_SlaveLinkCommit(this, mdr
, targeti
);
6347 mdr
->more()->slave_update_journaled
= true;
6348 submit_mdlog_entry(le
, new C_MDS_SlaveLinkPrep(this, mdr
, targeti
, adjust_realm
),
6353 void Server::_logged_slave_link(MDRequestRef
& mdr
, CInode
*targeti
, bool adjust_realm
)
6355 dout(10) << "_logged_slave_link " << *mdr
6356 << " " << *targeti
<< dendl
;
6358 ceph_assert(g_conf()->mds_kill_link_at
!= 6);
6360 // update the target
6361 targeti
->pop_and_dirty_projected_inode(mdr
->ls
);
6365 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
6368 mdr
->reset_slave_request();
6371 int op
= CEPH_SNAP_OP_SPLIT
;
6372 mds
->mdcache
->send_snap_update(targeti
, 0, op
);
6373 mds
->mdcache
->do_realm_invalidate_and_update_notify(targeti
, op
);
6377 if (!mdr
->aborted
) {
6378 auto reply
= MMDSSlaveRequest::create(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_LINKPREPACK
);
6379 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
6381 dout(10) << " abort flag set, finishing" << dendl
;
6382 mdcache
->request_finish(mdr
);
6387 struct C_MDS_CommittedSlave
: public ServerLogContext
{
6388 C_MDS_CommittedSlave(Server
*s
, MDRequestRef
& m
) : ServerLogContext(s
, m
) {}
6389 void finish(int r
) override
{
6390 server
->_committed_slave(mdr
);
6394 void Server::_commit_slave_link(MDRequestRef
& mdr
, int r
, CInode
*targeti
)
6396 dout(10) << "_commit_slave_link " << *mdr
6398 << " " << *targeti
<< dendl
;
6400 ceph_assert(g_conf()->mds_kill_link_at
!= 7);
6403 // drop our pins, etc.
6406 // write a commit to the journal
6407 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_link_commit", mdr
->reqid
, mdr
->slave_to_mds
,
6408 ESlaveUpdate::OP_COMMIT
, ESlaveUpdate::LINK
);
6409 mdlog
->start_entry(le
);
6410 submit_mdlog_entry(le
, new C_MDS_CommittedSlave(this, mdr
), mdr
, __func__
);
6413 do_link_rollback(mdr
->more()->rollback_bl
, mdr
->slave_to_mds
, mdr
);
6417 void Server::_committed_slave(MDRequestRef
& mdr
)
6419 dout(10) << "_committed_slave " << *mdr
<< dendl
;
6421 ceph_assert(g_conf()->mds_kill_link_at
!= 8);
6423 auto req
= MMDSSlaveRequest::create(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_COMMITTED
);
6424 mds
->send_message_mds(req
, mdr
->slave_to_mds
);
6425 mdcache
->request_finish(mdr
);
6428 struct C_MDS_LoggedLinkRollback
: public ServerLogContext
{
6430 map
<client_t
,MClientSnap::ref
> splits
;
6431 C_MDS_LoggedLinkRollback(Server
*s
, MutationRef
& m
, MDRequestRef
& r
,
6432 map
<client_t
,MClientSnap::ref
>&& _splits
) :
6433 ServerLogContext(s
, r
), mut(m
), splits(std::move(_splits
)) {
6435 void finish(int r
) override
{
6436 server
->_link_rollback_finish(mut
, mdr
, splits
);
6440 void Server::do_link_rollback(bufferlist
&rbl
, mds_rank_t master
, MDRequestRef
& mdr
)
6442 link_rollback rollback
;
6443 auto p
= rbl
.cbegin();
6444 decode(rollback
, p
);
6446 dout(10) << "do_link_rollback on " << rollback
.reqid
6447 << (rollback
.was_inc
? " inc":" dec")
6448 << " ino " << rollback
.ino
6451 ceph_assert(g_conf()->mds_kill_link_at
!= 9);
6453 mdcache
->add_rollback(rollback
.reqid
, master
); // need to finish this update before resolve finishes
6454 ceph_assert(mdr
|| mds
->is_resolve());
6456 MutationRef
mut(new MutationImpl(nullptr, utime_t(), rollback
.reqid
));
6457 mut
->ls
= mds
->mdlog
->get_current_segment();
6459 CInode
*in
= mdcache
->get_inode(rollback
.ino
);
6461 dout(10) << " target is " << *in
<< dendl
;
6462 ceph_assert(!in
->is_projected()); // live slave request hold versionlock xlock.
6464 auto &pi
= in
->project_inode();
6465 pi
.inode
.version
= in
->pre_dirty();
6466 mut
->add_projected_inode(in
);
6468 // parent dir rctime
6469 CDir
*parent
= in
->get_projected_parent_dn()->get_dir();
6470 fnode_t
*pf
= parent
->project_fnode();
6471 mut
->add_projected_fnode(parent
);
6472 pf
->version
= parent
->pre_dirty();
6473 if (pf
->fragstat
.mtime
== pi
.inode
.ctime
) {
6474 pf
->fragstat
.mtime
= rollback
.old_dir_mtime
;
6475 if (pf
->rstat
.rctime
== pi
.inode
.ctime
)
6476 pf
->rstat
.rctime
= rollback
.old_dir_rctime
;
6477 mut
->add_updated_lock(&parent
->get_inode()->filelock
);
6478 mut
->add_updated_lock(&parent
->get_inode()->nestlock
);
6482 pi
.inode
.ctime
= rollback
.old_ctime
;
6483 if (rollback
.was_inc
)
6488 map
<client_t
,MClientSnap::ref
> splits
;
6489 if (rollback
.snapbl
.length() && in
->snaprealm
) {
6491 auto p
= rollback
.snapbl
.cbegin();
6492 decode(hadrealm
, p
);
6494 if (!mds
->is_resolve()) {
6495 sr_t
*new_srnode
= new sr_t();
6496 decode(*new_srnode
, p
);
6497 in
->project_snaprealm(new_srnode
);
6499 decode(in
->snaprealm
->srnode
, p
);
6502 SnapRealm
*realm
= parent
->get_inode()->find_snaprealm();
6503 if (!mds
->is_resolve())
6504 mdcache
->prepare_realm_merge(in
->snaprealm
, realm
, splits
);
6505 in
->project_snaprealm(NULL
);
6510 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_link_rollback", rollback
.reqid
, master
,
6511 ESlaveUpdate::OP_ROLLBACK
, ESlaveUpdate::LINK
);
6512 mdlog
->start_entry(le
);
6513 le
->commit
.add_dir_context(parent
);
6514 le
->commit
.add_dir(parent
, true);
6515 le
->commit
.add_primary_dentry(in
->get_projected_parent_dn(), 0, true);
6517 submit_mdlog_entry(le
, new C_MDS_LoggedLinkRollback(this, mut
, mdr
, std::move(splits
)),
6522 void Server::_link_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
,
6523 map
<client_t
,MClientSnap::ref
>& splits
)
6525 dout(10) << "_link_rollback_finish" << dendl
;
6527 ceph_assert(g_conf()->mds_kill_link_at
!= 10);
6531 if (!mds
->is_resolve())
6532 mdcache
->send_snaps(splits
);
6535 mdcache
->request_finish(mdr
);
6537 mdcache
->finish_rollback(mut
->reqid
);
6543 void Server::handle_slave_link_prep_ack(MDRequestRef
& mdr
, const MMDSSlaveRequest::const_ref
&m
)
6545 dout(10) << "handle_slave_link_prep_ack " << *mdr
6546 << " " << *m
<< dendl
;
6547 mds_rank_t from
= mds_rank_t(m
->get_source().num());
6549 ceph_assert(g_conf()->mds_kill_link_at
!= 11);
6552 mdr
->more()->slaves
.insert(from
);
6555 ceph_assert(mdr
->more()->witnessed
.count(from
) == 0);
6556 mdr
->more()->witnessed
.insert(from
);
6557 ceph_assert(!m
->is_not_journaled());
6558 mdr
->more()->has_journaled_slaves
= true;
6560 // remove from waiting list
6561 ceph_assert(mdr
->more()->waiting_on_slave
.count(from
));
6562 mdr
->more()->waiting_on_slave
.erase(from
);
6564 ceph_assert(mdr
->more()->waiting_on_slave
.empty());
6566 dispatch_client_request(mdr
); // go again!
6575 void Server::handle_client_unlink(MDRequestRef
& mdr
)
6577 const MClientRequest::const_ref
&req
= mdr
->client_request
;
6578 client_t client
= mdr
->get_client();
6582 if (req
->get_op() == CEPH_MDS_OP_RMDIR
) rmdir
= true;
6584 const filepath
& refpath
= req
->get_filepath();
6585 if (refpath
.depth() == 0) {
6586 respond_to_request(mdr
, -EINVAL
);
6589 if (refpath
.is_last_dot_or_dotdot()) {
6590 respond_to_request(mdr
, -ENOTEMPTY
);
6595 vector
<CDentry
*> trace
;
6597 CF_MDS_MDRContextFactory
cf(mdcache
, mdr
);
6598 int r
= mdcache
->path_traverse(mdr
, cf
, refpath
, &trace
, &in
, MDS_TRAVERSE_FORWARD
);
6602 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
6603 mdcache
->find_ino_peers(refpath
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
6606 respond_to_request(mdr
, r
);
6609 if (mdr
->snapid
!= CEPH_NOSNAP
) {
6610 respond_to_request(mdr
, -EROFS
);
6614 CDentry
*dn
= trace
.back();
6616 if (!dn
->is_auth()) {
6617 mdcache
->request_forward(mdr
, dn
->authority().first
);
6621 CInode
*diri
= dn
->get_dir()->get_inode();
6623 CDentry::linkage_t
*dnl
= dn
->get_linkage(client
, mdr
);
6624 ceph_assert(!dnl
->is_null());
6627 dout(7) << "handle_client_rmdir on " << *dn
<< dendl
;
6629 dout(7) << "handle_client_unlink on " << *dn
<< dendl
;
6631 dout(7) << "dn links to " << *in
<< dendl
;
6636 // do empty directory checks
6637 if (_dir_is_nonempty_unlocked(mdr
, in
)) {
6638 respond_to_request(mdr
, -ENOTEMPTY
);
6642 dout(7) << "handle_client_unlink on dir " << *in
<< ", returning error" << dendl
;
6643 respond_to_request(mdr
, -EISDIR
);
6649 dout(7) << "handle_client_rmdir on non-dir " << *in
<< ", returning error" << dendl
;
6650 respond_to_request(mdr
, -ENOTDIR
);
6655 // -- create stray dentry? --
6656 CDentry
*straydn
= NULL
;
6657 if (dnl
->is_primary()) {
6658 straydn
= prepare_stray_dentry(mdr
, dnl
->get_inode());
6661 dout(10) << " straydn is " << *straydn
<< dendl
;
6662 } else if (mdr
->straydn
) {
6663 mdr
->unpin(mdr
->straydn
);
6664 mdr
->straydn
= NULL
;
6668 MutationImpl::LockOpVec lov
;
6670 for (int i
=0; i
<(int)trace
.size()-1; i
++)
6671 lov
.add_rdlock(&trace
[i
]->lock
);
6672 lov
.add_xlock(&dn
->lock
);
6673 lov
.add_wrlock(&diri
->filelock
);
6674 lov
.add_wrlock(&diri
->nestlock
);
6675 lov
.add_xlock(&in
->linklock
);
6677 lov
.add_wrlock(&straydn
->get_dir()->inode
->filelock
);
6678 lov
.add_wrlock(&straydn
->get_dir()->inode
->nestlock
);
6679 lov
.add_xlock(&straydn
->lock
);
6682 mds
->locker
->include_snap_rdlocks(diri
, lov
);
6683 lov
.add_xlock(&in
->snaplock
);
6685 lov
.add_rdlock(&in
->filelock
); // to verify it's empty
6687 if (!mds
->locker
->acquire_locks(mdr
, lov
))
6691 _dir_is_nonempty(mdr
, in
)) {
6692 respond_to_request(mdr
, -ENOTEMPTY
);
6696 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
6697 if (!check_access(mdr
, diri
, MAY_WRITE
))
6702 straydn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
6704 if (!mdr
->more()->desti_srnode
) {
6705 if (in
->is_projected_snaprealm_global()) {
6706 sr_t
*new_srnode
= in
->prepare_new_srnode(0);
6707 in
->record_snaprealm_parent_dentry(new_srnode
, NULL
, dn
, dnl
->is_primary());
6708 // dropping the last linkage or dropping the last remote linkage,
6709 // detch the inode from global snaprealm
6710 auto nlink
= in
->get_projected_inode()->nlink
;
6712 (nlink
== 2 && !dnl
->is_primary() &&
6713 !in
->get_projected_parent_dir()->inode
->is_stray()))
6714 in
->clear_snaprealm_global(new_srnode
);
6715 mdr
->more()->desti_srnode
= new_srnode
;
6716 } else if (dnl
->is_primary()) {
6717 // prepare snaprealm blob for slave request
6718 SnapRealm
*realm
= in
->find_snaprealm();
6719 snapid_t follows
= realm
->get_newest_seq();
6720 if (in
->snaprealm
|| follows
+ 1 > in
->get_oldest_snap()) {
6721 sr_t
*new_srnode
= in
->prepare_new_srnode(follows
);
6722 in
->record_snaprealm_past_parent(new_srnode
, straydn
->get_dir()->inode
->find_snaprealm());
6723 mdr
->more()->desti_srnode
= new_srnode
;
6729 if (in
->is_dir() && in
->has_subtree_root_dirfrag()) {
6730 // subtree root auths need to be witnesses
6731 set
<mds_rank_t
> witnesses
;
6732 in
->list_replicas(witnesses
);
6733 dout(10) << " witnesses " << witnesses
<< ", have " << mdr
->more()->witnessed
<< dendl
;
6735 for (set
<mds_rank_t
>::iterator p
= witnesses
.begin();
6736 p
!= witnesses
.end();
6738 if (mdr
->more()->witnessed
.count(*p
)) {
6739 dout(10) << " already witnessed by mds." << *p
<< dendl
;
6740 } else if (mdr
->more()->waiting_on_slave
.count(*p
)) {
6741 dout(10) << " already waiting on witness mds." << *p
<< dendl
;
6743 if (!_rmdir_prepare_witness(mdr
, *p
, trace
, straydn
))
6747 if (!mdr
->more()->waiting_on_slave
.empty())
6748 return; // we're waiting for a witness.
6752 if (dnl
->is_remote() && !dnl
->get_inode()->is_auth())
6753 _link_remote(mdr
, false, dn
, dnl
->get_inode());
6755 _unlink_local(mdr
, dn
, straydn
);
6758 class C_MDS_unlink_local_finish
: public ServerLogContext
{
6761 version_t dnpv
; // deleted dentry
6763 C_MDS_unlink_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CDentry
*sd
) :
6764 ServerLogContext(s
, r
), dn(d
), straydn(sd
),
6765 dnpv(d
->get_projected_version()) {}
6766 void finish(int r
) override
{
6767 ceph_assert(r
== 0);
6768 server
->_unlink_local_finish(mdr
, dn
, straydn
, dnpv
);
6772 void Server::_unlink_local(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
)
6774 dout(10) << "_unlink_local " << *dn
<< dendl
;
6776 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
6777 CInode
*in
= dnl
->get_inode();
6781 mdr
->ls
= mdlog
->get_current_segment();
6783 // prepare log entry
6784 EUpdate
*le
= new EUpdate(mdlog
, "unlink_local");
6785 mdlog
->start_entry(le
);
6786 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
6787 if (!mdr
->more()->witnessed
.empty()) {
6788 dout(20) << " noting uncommitted_slaves " << mdr
->more()->witnessed
<< dendl
;
6789 le
->reqid
= mdr
->reqid
;
6790 le
->had_slaves
= true;
6791 mdcache
->add_uncommitted_master(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
6795 ceph_assert(dnl
->is_primary());
6796 straydn
->push_projected_linkage(in
);
6799 // the unlinked dentry
6802 auto &pi
= in
->project_inode();
6805 dn
->make_path_string(t
, true);
6806 pi
.inode
.stray_prior_path
= std::move(t
);
6808 pi
.inode
.version
= in
->pre_dirty();
6809 pi
.inode
.ctime
= mdr
->get_op_stamp();
6810 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
6811 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
6812 pi
.inode
.change_attr
++;
6814 if (pi
.inode
.nlink
== 0)
6815 in
->state_set(CInode::STATE_ORPHAN
);
6817 if (mdr
->more()->desti_srnode
) {
6818 auto& desti_srnode
= mdr
->more()->desti_srnode
;
6819 in
->project_snaprealm(desti_srnode
);
6820 desti_srnode
= NULL
;
6824 // will manually pop projected inode
6826 // primary link. add stray dentry.
6827 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, -1);
6828 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, straydn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
6830 pi
.inode
.update_backtrace();
6831 le
->metablob
.add_primary_dentry(straydn
, in
, true, true);
6833 mdr
->add_projected_inode(in
);
6834 // remote link. update remote inode.
6835 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_DIR
, -1);
6836 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
6837 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, in
);
6840 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
6841 le
->metablob
.add_null_dentry(dn
, true);
6844 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
6845 le
->metablob
.renamed_dirino
= in
->ino();
6848 dn
->push_projected_linkage();
6851 ceph_assert(in
->first
<= straydn
->first
);
6852 in
->first
= straydn
->first
;
6856 ceph_assert(straydn
);
6857 mdcache
->project_subtree_rename(in
, dn
->get_dir(), straydn
->get_dir());
6860 journal_and_reply(mdr
, 0, dn
, le
, new C_MDS_unlink_local_finish(this, mdr
, dn
, straydn
));
6863 void Server::_unlink_local_finish(MDRequestRef
& mdr
,
6864 CDentry
*dn
, CDentry
*straydn
,
6867 dout(10) << "_unlink_local_finish " << *dn
<< dendl
;
6869 if (!mdr
->more()->witnessed
.empty())
6870 mdcache
->logged_master_update(mdr
->reqid
);
6872 CInode
*strayin
= NULL
;
6873 bool hadrealm
= false;
6875 // if there is newly created snaprealm, need to split old snaprealm's
6876 // inodes_with_caps. So pop snaprealm before linkage changes.
6877 strayin
= dn
->get_linkage()->get_inode();
6878 hadrealm
= strayin
->snaprealm
? true : false;
6879 strayin
->early_pop_projected_snaprealm();
6882 // unlink main dentry
6883 dn
->get_dir()->unlink_inode(dn
);
6884 dn
->pop_projected_linkage();
6886 // relink as stray? (i.e. was primary link?)
6888 dout(20) << " straydn is " << *straydn
<< dendl
;
6889 straydn
->pop_projected_linkage();
6891 strayin
->pop_and_dirty_projected_inode(mdr
->ls
);
6893 mdcache
->touch_dentry_bottom(straydn
);
6896 dn
->mark_dirty(dnpv
, mdr
->ls
);
6899 mdcache
->send_dentry_unlink(dn
, straydn
, mdr
);
6902 // update subtree map?
6903 if (strayin
->is_dir())
6904 mdcache
->adjust_subtree_after_rename(strayin
, dn
->get_dir(), true);
6906 if (strayin
->snaprealm
&& !hadrealm
)
6907 mdcache
->do_realm_invalidate_and_update_notify(strayin
, CEPH_SNAP_OP_SPLIT
, false);
6911 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
6914 respond_to_request(mdr
, 0);
6916 // removing a new dn?
6917 dn
->get_dir()->try_remove_unlinked_dn(dn
);
6920 // respond_to_request() drops locks. So stray reintegration can race with us.
6921 if (straydn
&& !straydn
->get_projected_linkage()->is_null()) {
6922 // Tip off the MDCache that this dentry is a stray that
6923 // might be elegible for purge.
6924 mdcache
->notify_stray(straydn
);
6928 bool Server::_rmdir_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, vector
<CDentry
*>& trace
, CDentry
*straydn
)
6930 if (mds
->is_cluster_degraded() &&
6931 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(who
)) {
6932 dout(10) << "_rmdir_prepare_witness mds." << who
<< " is not active" << dendl
;
6933 if (mdr
->more()->waiting_on_slave
.empty())
6934 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(mdcache
, mdr
));
6938 dout(10) << "_rmdir_prepare_witness mds." << who
<< dendl
;
6939 auto req
= MMDSSlaveRequest::create(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_RMDIRPREP
);
6940 req
->srcdnpath
= filepath(trace
.front()->get_dir()->ino());
6941 for (auto dn
: trace
)
6942 req
->srcdnpath
.push_dentry(dn
->get_name());
6943 mdcache
->replicate_stray(straydn
, who
, req
->straybl
);
6944 if (mdr
->more()->desti_srnode
)
6945 encode(*mdr
->more()->desti_srnode
, req
->desti_snapbl
);
6947 req
->op_stamp
= mdr
->get_op_stamp();
6948 mds
->send_message_mds(req
, who
);
6950 ceph_assert(mdr
->more()->waiting_on_slave
.count(who
) == 0);
6951 mdr
->more()->waiting_on_slave
.insert(who
);
6955 struct C_MDS_SlaveRmdirPrep
: public ServerLogContext
{
6956 CDentry
*dn
, *straydn
;
6957 C_MDS_SlaveRmdirPrep(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CDentry
*st
)
6958 : ServerLogContext(s
, r
), dn(d
), straydn(st
) {}
6959 void finish(int r
) override
{
6960 server
->_logged_slave_rmdir(mdr
, dn
, straydn
);
6964 struct C_MDS_SlaveRmdirCommit
: public ServerContext
{
6967 C_MDS_SlaveRmdirCommit(Server
*s
, MDRequestRef
& r
, CDentry
*sd
)
6968 : ServerContext(s
), mdr(r
), straydn(sd
) { }
6969 void finish(int r
) override
{
6970 server
->_commit_slave_rmdir(mdr
, r
, straydn
);
6974 void Server::handle_slave_rmdir_prep(MDRequestRef
& mdr
)
6976 dout(10) << "handle_slave_rmdir_prep " << *mdr
6977 << " " << mdr
->slave_request
->srcdnpath
6978 << " to " << mdr
->slave_request
->destdnpath
6981 vector
<CDentry
*> trace
;
6982 filepath
srcpath(mdr
->slave_request
->srcdnpath
);
6983 dout(10) << " src " << srcpath
<< dendl
;
6985 CF_MDS_MDRContextFactory
cf(mdcache
, mdr
);
6986 int r
= mdcache
->path_traverse(mdr
, cf
, srcpath
, &trace
, &in
, MDS_TRAVERSE_DISCOVERXLOCK
);
6989 mdcache
->find_ino_peers(srcpath
.get_ino(), new C_MDS_RetryRequest(mdcache
, mdr
),
6993 ceph_assert(r
== 0);
6994 CDentry
*dn
= trace
.back();
6995 dout(10) << " dn " << *dn
<< dendl
;
6998 ceph_assert(mdr
->straydn
);
6999 CDentry
*straydn
= mdr
->straydn
;
7000 dout(10) << " straydn " << *straydn
<< dendl
;
7002 mdr
->set_op_stamp(mdr
->slave_request
->op_stamp
);
7004 rmdir_rollback rollback
;
7005 rollback
.reqid
= mdr
->reqid
;
7006 rollback
.src_dir
= dn
->get_dir()->dirfrag();
7007 rollback
.src_dname
= dn
->get_name();
7008 rollback
.dest_dir
= straydn
->get_dir()->dirfrag();
7009 rollback
.dest_dname
= straydn
->get_name();
7010 if (mdr
->slave_request
->desti_snapbl
.length()) {
7011 if (in
->snaprealm
) {
7012 encode(true, rollback
.snapbl
);
7013 in
->encode_snap_blob(rollback
.snapbl
);
7015 encode(false, rollback
.snapbl
);
7018 encode(rollback
, mdr
->more()->rollback_bl
);
7019 // FIXME: rollback snaprealm
7020 dout(20) << " rollback is " << mdr
->more()->rollback_bl
.length() << " bytes" << dendl
;
7022 // set up commit waiter
7023 mdr
->more()->slave_commit
= new C_MDS_SlaveRmdirCommit(this, mdr
, straydn
);
7025 straydn
->push_projected_linkage(in
);
7026 dn
->push_projected_linkage();
7028 ceph_assert(straydn
->first
>= in
->first
);
7029 in
->first
= straydn
->first
;
7031 if (!in
->has_subtree_root_dirfrag(mds
->get_nodeid())) {
7032 dout(10) << " no auth subtree in " << *in
<< ", skipping journal" << dendl
;
7033 _logged_slave_rmdir(mdr
, dn
, straydn
);
7037 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rmdir", mdr
->reqid
, mdr
->slave_to_mds
,
7038 ESlaveUpdate::OP_PREPARE
, ESlaveUpdate::RMDIR
);
7039 mdlog
->start_entry(le
);
7040 le
->rollback
= mdr
->more()->rollback_bl
;
7042 le
->commit
.add_dir_context(straydn
->get_dir());
7043 le
->commit
.add_primary_dentry(straydn
, in
, true);
7044 // slave: no need to journal original dentry
7046 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
7047 le
->commit
.renamed_dirino
= in
->ino();
7049 mdcache
->project_subtree_rename(in
, dn
->get_dir(), straydn
->get_dir());
7051 mdr
->more()->slave_update_journaled
= true;
7052 submit_mdlog_entry(le
, new C_MDS_SlaveRmdirPrep(this, mdr
, dn
, straydn
),
7057 void Server::_logged_slave_rmdir(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
)
7059 dout(10) << "_logged_slave_rmdir " << *mdr
<< " on " << *dn
<< dendl
;
7060 CInode
*in
= dn
->get_linkage()->get_inode();
7063 if (mdr
->slave_request
->desti_snapbl
.length()) {
7064 new_realm
= !in
->snaprealm
;
7065 in
->decode_snap_blob(mdr
->slave_request
->desti_snapbl
);
7066 ceph_assert(in
->snaprealm
);
7067 ceph_assert(in
->snaprealm
->have_past_parents_open());
7072 // update our cache now, so we are consistent with what is in the journal
7073 // when we journal a subtree map
7074 dn
->get_dir()->unlink_inode(dn
);
7075 straydn
->pop_projected_linkage();
7076 dn
->pop_projected_linkage();
7078 mdcache
->adjust_subtree_after_rename(in
, dn
->get_dir(), mdr
->more()->slave_update_journaled
);
7081 mdcache
->do_realm_invalidate_and_update_notify(in
, CEPH_SNAP_OP_SPLIT
, false);
7084 mdr
->reset_slave_request();
7087 if (!mdr
->aborted
) {
7088 auto reply
= MMDSSlaveRequest::create(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_RMDIRPREPACK
);
7089 if (!mdr
->more()->slave_update_journaled
)
7090 reply
->mark_not_journaled();
7091 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
7093 dout(10) << " abort flag set, finishing" << dendl
;
7094 mdcache
->request_finish(mdr
);
7098 void Server::handle_slave_rmdir_prep_ack(MDRequestRef
& mdr
, const MMDSSlaveRequest::const_ref
&ack
)
7100 dout(10) << "handle_slave_rmdir_prep_ack " << *mdr
7101 << " " << *ack
<< dendl
;
7103 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
7105 mdr
->more()->slaves
.insert(from
);
7106 mdr
->more()->witnessed
.insert(from
);
7107 if (!ack
->is_not_journaled())
7108 mdr
->more()->has_journaled_slaves
= true;
7110 // remove from waiting list
7111 ceph_assert(mdr
->more()->waiting_on_slave
.count(from
));
7112 mdr
->more()->waiting_on_slave
.erase(from
);
7114 if (mdr
->more()->waiting_on_slave
.empty())
7115 dispatch_client_request(mdr
); // go again!
7117 dout(10) << "still waiting on slaves " << mdr
->more()->waiting_on_slave
<< dendl
;
7120 void Server::_commit_slave_rmdir(MDRequestRef
& mdr
, int r
, CDentry
*straydn
)
7122 dout(10) << "_commit_slave_rmdir " << *mdr
<< " r=" << r
<< dendl
;
7125 if (mdr
->more()->slave_update_journaled
) {
7126 CInode
*strayin
= straydn
->get_projected_linkage()->get_inode();
7127 if (strayin
&& !strayin
->snaprealm
)
7128 mdcache
->clear_dirty_bits_for_stray(strayin
);
7133 if (mdr
->more()->slave_update_journaled
) {
7134 // write a commit to the journal
7135 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rmdir_commit", mdr
->reqid
,
7136 mdr
->slave_to_mds
, ESlaveUpdate::OP_COMMIT
,
7137 ESlaveUpdate::RMDIR
);
7138 mdlog
->start_entry(le
);
7139 submit_mdlog_entry(le
, new C_MDS_CommittedSlave(this, mdr
), mdr
, __func__
);
7142 _committed_slave(mdr
);
7146 do_rmdir_rollback(mdr
->more()->rollback_bl
, mdr
->slave_to_mds
, mdr
);
7150 struct C_MDS_LoggedRmdirRollback
: public ServerLogContext
{
7154 C_MDS_LoggedRmdirRollback(Server
*s
, MDRequestRef
& m
, metareqid_t mr
, CDentry
*d
, CDentry
*st
)
7155 : ServerLogContext(s
, m
), reqid(mr
), dn(d
), straydn(st
) {}
7156 void finish(int r
) override
{
7157 server
->_rmdir_rollback_finish(mdr
, reqid
, dn
, straydn
);
7161 void Server::do_rmdir_rollback(bufferlist
&rbl
, mds_rank_t master
, MDRequestRef
& mdr
)
7163 // unlink the other rollback methods, the rmdir rollback is only
7164 // needed to record the subtree changes in the journal for inode
7165 // replicas who are auth for empty dirfrags. no actual changes to
7166 // the file system are taking place here, so there is no Mutation.
7168 rmdir_rollback rollback
;
7169 auto p
= rbl
.cbegin();
7170 decode(rollback
, p
);
7172 dout(10) << "do_rmdir_rollback on " << rollback
.reqid
<< dendl
;
7173 mdcache
->add_rollback(rollback
.reqid
, master
); // need to finish this update before resolve finishes
7174 ceph_assert(mdr
|| mds
->is_resolve());
7176 CDir
*dir
= mdcache
->get_dirfrag(rollback
.src_dir
);
7178 dir
= mdcache
->get_dirfrag(rollback
.src_dir
.ino
, rollback
.src_dname
);
7180 CDentry
*dn
= dir
->lookup(rollback
.src_dname
);
7182 dout(10) << " dn " << *dn
<< dendl
;
7183 CDir
*straydir
= mdcache
->get_dirfrag(rollback
.dest_dir
);
7184 ceph_assert(straydir
);
7185 CDentry
*straydn
= straydir
->lookup(rollback
.dest_dname
);
7186 ceph_assert(straydn
);
7187 dout(10) << " straydn " << *straydn
<< dendl
;
7188 CInode
*in
= straydn
->get_linkage()->get_inode();
7190 dn
->push_projected_linkage(in
);
7191 straydn
->push_projected_linkage();
7193 if (rollback
.snapbl
.length() && in
->snaprealm
) {
7195 auto p
= rollback
.snapbl
.cbegin();
7196 decode(hadrealm
, p
);
7198 decode(in
->snaprealm
->srnode
, p
);
7200 in
->snaprealm
->merge_to(dir
->get_inode()->find_snaprealm());
7204 if (mdr
&& !mdr
->more()->slave_update_journaled
) {
7205 ceph_assert(!in
->has_subtree_root_dirfrag(mds
->get_nodeid()));
7207 _rmdir_rollback_finish(mdr
, rollback
.reqid
, dn
, straydn
);
7212 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rmdir_rollback", rollback
.reqid
, master
,
7213 ESlaveUpdate::OP_ROLLBACK
, ESlaveUpdate::RMDIR
);
7214 mdlog
->start_entry(le
);
7216 le
->commit
.add_dir_context(dn
->get_dir());
7217 le
->commit
.add_primary_dentry(dn
, in
, true);
7218 // slave: no need to journal straydn
7220 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
7221 le
->commit
.renamed_dirino
= in
->ino();
7223 mdcache
->project_subtree_rename(in
, straydn
->get_dir(), dn
->get_dir());
7225 submit_mdlog_entry(le
,
7226 new C_MDS_LoggedRmdirRollback(this, mdr
,rollback
.reqid
,
7232 void Server::_rmdir_rollback_finish(MDRequestRef
& mdr
, metareqid_t reqid
, CDentry
*dn
, CDentry
*straydn
)
7234 dout(10) << "_rmdir_rollback_finish " << reqid
<< dendl
;
7236 straydn
->get_dir()->unlink_inode(straydn
);
7237 dn
->pop_projected_linkage();
7238 straydn
->pop_projected_linkage();
7240 CInode
*in
= dn
->get_linkage()->get_inode();
7241 mdcache
->adjust_subtree_after_rename(in
, straydn
->get_dir(),
7242 !mdr
|| mdr
->more()->slave_update_journaled
);
7244 if (mds
->is_resolve()) {
7245 CDir
*root
= mdcache
->get_subtree_root(straydn
->get_dir());
7246 mdcache
->try_trim_non_auth_subtree(root
);
7250 mdcache
->request_finish(mdr
);
7252 mdcache
->finish_rollback(reqid
);
/** _dir_is_nonempty[_unlocked]
 *
 * check if a directory is non-empty (i.e. we can rmdir it).
 *
 * the unlocked variant is a fastpath check.  we can't really be
 * sure until we rdlock the filelock.
 */
7263 bool Server::_dir_is_nonempty_unlocked(MDRequestRef
& mdr
, CInode
*in
)
7265 dout(10) << "dir_is_nonempty_unlocked " << *in
<< dendl
;
7266 ceph_assert(in
->is_auth());
7268 if (in
->snaprealm
&& in
->snaprealm
->srnode
.snaps
.size())
7269 return true; // in a snapshot!
7272 in
->get_dirfrags(ls
);
7273 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
7275 // is the frag obviously non-empty?
7276 if (dir
->is_auth()) {
7277 if (dir
->get_projected_fnode()->fragstat
.size()) {
7278 dout(10) << "dir_is_nonempty_unlocked dirstat has "
7279 << dir
->get_projected_fnode()->fragstat
.size() << " items " << *dir
<< dendl
;
7288 bool Server::_dir_is_nonempty(MDRequestRef
& mdr
, CInode
*in
)
7290 dout(10) << "dir_is_nonempty " << *in
<< dendl
;
7291 ceph_assert(in
->is_auth());
7292 ceph_assert(in
->filelock
.can_read(mdr
->get_client()));
7294 frag_info_t dirstat
;
7295 version_t dirstat_version
= in
->get_projected_inode()->dirstat
.version
;
7298 in
->get_dirfrags(ls
);
7299 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
7301 const fnode_t
*pf
= dir
->get_projected_fnode();
7302 if (pf
->fragstat
.size()) {
7303 dout(10) << "dir_is_nonempty dirstat has "
7304 << pf
->fragstat
.size() << " items " << *dir
<< dendl
;
7308 if (pf
->accounted_fragstat
.version
== dirstat_version
)
7309 dirstat
.add(pf
->accounted_fragstat
);
7311 dirstat
.add(pf
->fragstat
);
7314 return dirstat
.size() != in
->get_projected_inode()->dirstat
.size();
7318 // ======================================================
7321 class C_MDS_rename_finish
: public ServerLogContext
{
7326 C_MDS_rename_finish(Server
*s
, MDRequestRef
& r
,
7327 CDentry
*sdn
, CDentry
*ddn
, CDentry
*stdn
) :
7328 ServerLogContext(s
, r
),
7329 srcdn(sdn
), destdn(ddn
), straydn(stdn
) { }
7330 void finish(int r
) override
{
7331 ceph_assert(r
== 0);
7332 server
->_rename_finish(mdr
, srcdn
, destdn
, straydn
);
7337 /** handle_client_rename
7339 * rename master is the destdn auth. this is because cached inodes
7340 * must remain connected. thus, any replica of srci, must also
7341 * replicate destdn, and possibly straydn, so that srci (and
7342 * destdn->inode) remain connected during the rename.
7344 * to do this, we freeze srci, then master (destdn auth) verifies that
7345 * all other nodes have also replicated destdn and straydn. note that
7346 * destdn replicas need not also replicate srci. this only works when
7349 * This function takes responsibility for the passed mdr.
7351 void Server::handle_client_rename(MDRequestRef
& mdr
)
7353 const MClientRequest::const_ref
&req
= mdr
->client_request
;
7354 dout(7) << "handle_client_rename " << *req
<< dendl
;
7356 filepath destpath
= req
->get_filepath();
7357 filepath srcpath
= req
->get_filepath2();
7358 if (destpath
.depth() == 0 || srcpath
.depth() == 0) {
7359 respond_to_request(mdr
, -EINVAL
);
7362 if (srcpath
.is_last_dot_or_dotdot() || destpath
.is_last_dot_or_dotdot()) {
7363 respond_to_request(mdr
, -EBUSY
);
7367 std::string_view destname
= destpath
.last_dentry();
7369 vector
<CDentry
*>& srctrace
= mdr
->dn
[1];
7370 vector
<CDentry
*>& desttrace
= mdr
->dn
[0];
7372 MutationImpl::LockOpVec lov
;
7374 CDentry
*destdn
= rdlock_path_xlock_dentry(mdr
, 0, lov
, true, false, true);
7375 if (!destdn
) return;
7376 dout(10) << " destdn " << *destdn
<< dendl
;
7377 if (mdr
->snapid
!= CEPH_NOSNAP
) {
7378 respond_to_request(mdr
, -EROFS
);
7381 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
7382 CDir
*destdir
= destdn
->get_dir();
7383 ceph_assert(destdir
->is_auth());
7385 CF_MDS_MDRContextFactory
cf(mdcache
, mdr
);
7386 int r
= mdcache
->path_traverse(mdr
, cf
, srcpath
, &srctrace
, NULL
, MDS_TRAVERSE_DISCOVER
);
7391 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
7392 mdcache
->find_ino_peers(srcpath
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
7394 dout(10) << "FAIL on error " << r
<< dendl
;
7395 respond_to_request(mdr
, r
);
7400 ceph_assert(!srctrace
.empty());
7401 CDentry
*srcdn
= srctrace
.back();
7402 dout(10) << " srcdn " << *srcdn
<< dendl
;
7403 if (srcdn
->last
!= CEPH_NOSNAP
) {
7404 respond_to_request(mdr
, -EROFS
);
7407 CDir
*srcdir
= srcdn
->get_dir();
7408 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
7409 CInode
*srci
= srcdnl
->get_inode();
7410 dout(10) << " srci " << *srci
<< dendl
;
7413 if (!destdnl
->is_null()) {
7414 //dout(10) << "dest dn exists " << *destdn << dendl;
7415 oldin
= mdcache
->get_dentry_inode(destdn
, mdr
, true);
7417 dout(10) << " oldin " << *oldin
<< dendl
;
7419 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
7420 if (oldin
->is_dir() && _dir_is_nonempty_unlocked(mdr
, oldin
)) {
7421 respond_to_request(mdr
, -ENOTEMPTY
);
7425 // if srcdn is replica, need to make sure its linkage is correct
7426 if (srcdn
->is_auth() ||
7427 srcdn
->lock
.can_read(mdr
->get_client()) ||
7428 (srcdn
->lock
.is_xlocked() && srcdn
->lock
.get_xlock_by() == mdr
)) {
7429 // mv /some/thing /to/some/existing_other_thing
7430 if (oldin
->is_dir() && !srci
->is_dir()) {
7431 respond_to_request(mdr
, -EISDIR
);
7434 if (!oldin
->is_dir() && srci
->is_dir()) {
7435 respond_to_request(mdr
, -ENOTDIR
);
7438 if (srci
== oldin
&& !srcdir
->inode
->is_stray()) {
7439 respond_to_request(mdr
, 0); // no-op. POSIX makes no sense.
7445 // -- some sanity checks --
7447 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
7448 if (destpath
.get_ino() != srcpath
.get_ino() &&
7449 !(req
->get_source().is_mds() &&
7450 MDS_INO_IS_MDSDIR(srcpath
.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
7451 CInode
*srcbase
= srctrace
[0]->get_dir()->get_inode();
7452 CInode
*destbase
= desttrace
[0]->get_dir()->get_inode();
7453 // ok, extend srctrace toward root until it is an ancestor of desttrace.
7454 while (srcbase
!= destbase
&&
7455 !srcbase
->is_projected_ancestor_of(destbase
)) {
7456 CDentry
*pdn
= srcbase
->get_projected_parent_dn();
7457 srctrace
.insert(srctrace
.begin(), pdn
);
7458 dout(10) << "rename prepending srctrace with " << *pdn
<< dendl
;
7459 srcbase
= pdn
->get_dir()->get_inode();
7462 // then, extend destpath until it shares the same parent inode as srcpath.
7463 while (destbase
!= srcbase
) {
7464 CDentry
*pdn
= destbase
->get_projected_parent_dn();
7465 desttrace
.insert(desttrace
.begin(), pdn
);
7466 lov
.add_rdlock(&pdn
->lock
);
7467 dout(10) << "rename prepending desttrace with " << *pdn
<< dendl
;
7468 destbase
= pdn
->get_dir()->get_inode();
7470 dout(10) << "rename src and dest traces now share common ancestor " << *destbase
<< dendl
;
7474 if (srcdir
== destdir
&& srcdn
->get_name() == destname
) {
7475 dout(7) << "rename src=dest, noop" << dendl
;
7476 respond_to_request(mdr
, 0);
7480 // dest a child of src?
7481 // e.g. mv /usr /usr/foo
7482 CDentry
*pdn
= destdir
->inode
->get_projected_parent_dn();
7485 dout(7) << "cannot rename item to be a child of itself" << dendl
;
7486 respond_to_request(mdr
, -EINVAL
);
7489 pdn
= pdn
->get_dir()->inode
->parent
;
7492 // is this a stray migration, reintegration or merge? (sanity checks!)
7493 if (mdr
->reqid
.name
.is_mds() &&
7494 !(MDS_INO_IS_MDSDIR(srcpath
.get_ino()) &&
7495 MDS_INO_IS_MDSDIR(destpath
.get_ino())) &&
7496 !(destdnl
->is_remote() &&
7497 destdnl
->get_remote_ino() == srci
->ino())) {
7498 respond_to_request(mdr
, -EINVAL
); // actually, this won't reply, but whatev.
7502 bool linkmerge
= srcdnl
->get_inode() == destdnl
->get_inode();
7504 dout(10) << " this is a link merge" << dendl
;
7506 // -- create stray dentry? --
7507 CDentry
*straydn
= NULL
;
7508 if (destdnl
->is_primary() && !linkmerge
) {
7509 straydn
= prepare_stray_dentry(mdr
, destdnl
->get_inode());
7512 dout(10) << " straydn is " << *straydn
<< dendl
;
7513 } else if (mdr
->straydn
) {
7514 mdr
->unpin(mdr
->straydn
);
7515 mdr
->straydn
= NULL
;
7518 // -- prepare witness list --
7520 * NOTE: we use _all_ replicas as witnesses.
7521 * this probably isn't totally necessary (esp for file renames),
7522 * but if/when we change that, we have to make sure rejoin is
7523 * sufficiently robust to handle strong rejoins from survivors
7524 * with totally wrong dentry->inode linkage.
7525 * (currently, it can ignore rename effects, because the resolve
7526 * stage will sort them out.)
7528 set
<mds_rank_t
> witnesses
= mdr
->more()->extra_witnesses
;
7529 if (srcdn
->is_auth())
7530 srcdn
->list_replicas(witnesses
);
7532 witnesses
.insert(srcdn
->authority().first
);
7533 if (srcdnl
->is_remote() && !srci
->is_auth())
7534 witnesses
.insert(srci
->authority().first
);
7535 destdn
->list_replicas(witnesses
);
7536 if (destdnl
->is_remote() && !oldin
->is_auth())
7537 witnesses
.insert(oldin
->authority().first
);
7538 dout(10) << " witnesses " << witnesses
<< ", have " << mdr
->more()->witnessed
<< dendl
;
7543 // srctrace items. this mirrors locks taken in rdlock_path_xlock_dentry
7544 for (int i
=0; i
<(int)srctrace
.size(); i
++)
7545 lov
.add_rdlock(&srctrace
[i
]->lock
);
7546 lov
.add_xlock(&srcdn
->lock
);
7547 mds_rank_t srcdirauth
= srcdir
->authority().first
;
7548 if (srcdirauth
!= mds
->get_nodeid()) {
7549 dout(10) << " will remote_wrlock srcdir scatterlocks on mds." << srcdirauth
<< dendl
;
7550 lov
.add_remote_wrlock(&srcdir
->inode
->filelock
, srcdirauth
);
7551 lov
.add_remote_wrlock(&srcdir
->inode
->nestlock
, srcdirauth
);
7553 lov
.add_rdlock(&srci
->dirfragtreelock
);
7555 lov
.add_wrlock(&srcdir
->inode
->filelock
);
7556 lov
.add_wrlock(&srcdir
->inode
->nestlock
);
7558 mds
->locker
->include_snap_rdlocks(srcdir
->inode
, lov
);
7562 lov
.add_wrlock(&straydn
->get_dir()->inode
->filelock
);
7563 lov
.add_wrlock(&straydn
->get_dir()->inode
->nestlock
);
7564 lov
.add_xlock(&straydn
->lock
);
7567 // xlock versionlock on dentries if there are witnesses.
7568 // replicas can't see projected dentry linkages, and will get
7569 // confused if we try to pipeline things.
7570 if (!witnesses
.empty()) {
7571 // take xlock on all projected ancestor dentries for srcdn and destdn.
7572 // this ensures the srcdn and destdn can be traversed to by the witnesses.
7573 for (int i
= 0; i
<(int)srctrace
.size(); i
++) {
7574 if (srctrace
[i
]->is_auth() && srctrace
[i
]->is_projected())
7575 lov
.add_xlock(&srctrace
[i
]->versionlock
);
7577 for (int i
=0; i
<(int)desttrace
.size(); i
++) {
7578 if (desttrace
[i
]->is_auth() && desttrace
[i
]->is_projected())
7579 lov
.add_xlock(&desttrace
[i
]->versionlock
);
7581 // xlock srci and oldin's primary dentries, so witnesses can call
7582 // open_remote_ino() with 'want_locked=true' when the srcdn or destdn
7584 if (srcdnl
->is_remote())
7585 lov
.add_xlock(&srci
->get_projected_parent_dn()->lock
);
7586 if (destdnl
->is_remote())
7587 lov
.add_xlock(&oldin
->get_projected_parent_dn()->lock
);
7590 // we need to update srci's ctime. xlock its least contended lock to do that...
7591 lov
.add_xlock(&srci
->linklock
);
7592 lov
.add_xlock(&srci
->snaplock
);
7595 // xlock oldin (for nlink--)
7596 lov
.add_xlock(&oldin
->linklock
);
7597 lov
.add_xlock(&oldin
->snaplock
);
7598 if (oldin
->is_dir())
7599 lov
.add_rdlock(&oldin
->filelock
); // to verify it's empty
7602 CInode
*auth_pin_freeze
= !srcdn
->is_auth() && srcdnl
->is_primary() ? srci
: NULL
;
7603 if (!mds
->locker
->acquire_locks(mdr
, lov
, auth_pin_freeze
))
7607 ceph_assert(srcdir
->inode
->is_stray() && srcdnl
->is_primary() && destdnl
->is_remote());
7609 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
7610 if (!check_access(mdr
, srcdir
->get_inode(), MAY_WRITE
))
7613 if (!check_access(mdr
, destdn
->get_dir()->get_inode(), MAY_WRITE
))
7616 if (!check_fragment_space(mdr
, destdn
->get_dir()))
7619 if (!check_access(mdr
, srci
, MAY_WRITE
))
7623 // with read lock, really verify oldin is empty
7626 _dir_is_nonempty(mdr
, oldin
)) {
7627 respond_to_request(mdr
, -ENOTEMPTY
);
7631 /* project_snaprealm_past_parent() will do this job
7633 // moving between snaprealms?
7634 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
7635 SnapRealm *srcrealm = srci->find_snaprealm();
7636 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
7637 if (srcrealm != destrealm &&
7638 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
7639 destrealm->get_newest_seq() + 1 > srcdn->first)) {
7640 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
7641 mdcache->snaprealm_create(mdr, srci);
7647 ceph_assert(g_conf()->mds_kill_rename_at
!= 1);
7649 // -- open all srcdn inode frags, if any --
7650 // we need these open so that auth can properly delegate from inode to dirfrags
7651 // after the inode is _ours_.
7652 if (srcdnl
->is_primary() &&
7653 !srcdn
->is_auth() &&
7655 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl
;
7656 mdr
->set_stickydirs(srci
);
7659 srci
->dirfragtree
.get_leaves(leaves
);
7660 for (const auto& leaf
: leaves
) {
7661 CDir
*dir
= srci
->get_dirfrag(leaf
);
7663 dout(10) << " opening " << leaf
<< " under " << *srci
<< dendl
;
7664 mdcache
->open_remote_dirfrag(srci
, leaf
, new C_MDS_RetryRequest(mdcache
, mdr
));
7670 // -- prepare snaprealm ---
7673 if (!mdr
->more()->srci_srnode
&&
7674 srci
->get_projected_inode()->nlink
== 1 &&
7675 srci
->is_projected_snaprealm_global()) {
7676 sr_t
*new_srnode
= srci
->prepare_new_srnode(0);
7677 srci
->record_snaprealm_parent_dentry(new_srnode
, NULL
, destdn
, false);
7679 srci
->clear_snaprealm_global(new_srnode
);
7680 mdr
->more()->srci_srnode
= new_srnode
;
7683 if (oldin
&& !mdr
->more()->desti_srnode
) {
7684 if (oldin
->is_projected_snaprealm_global()) {
7685 sr_t
*new_srnode
= oldin
->prepare_new_srnode(0);
7686 oldin
->record_snaprealm_parent_dentry(new_srnode
, NULL
, destdn
, destdnl
->is_primary());
7687 // dropping the last linkage or dropping the last remote linkage,
7688 // detch the inode from global snaprealm
7689 auto nlink
= oldin
->get_projected_inode()->nlink
;
7691 (nlink
== 2 && !destdnl
->is_primary() &&
7692 !oldin
->get_projected_parent_dir()->inode
->is_stray()))
7693 oldin
->clear_snaprealm_global(new_srnode
);
7694 mdr
->more()->desti_srnode
= new_srnode
;
7695 } else if (destdnl
->is_primary()) {
7696 SnapRealm
*dest_realm
= destdir
->inode
->find_snaprealm();
7697 snapid_t follows
= dest_realm
->get_newest_seq();
7698 if (oldin
->snaprealm
|| follows
+ 1 > oldin
->get_oldest_snap()) {
7699 sr_t
*new_srnode
= oldin
->prepare_new_srnode(follows
);
7700 oldin
->record_snaprealm_past_parent(new_srnode
, straydn
->get_dir()->inode
->find_snaprealm());
7701 mdr
->more()->desti_srnode
= new_srnode
;
7705 if (!mdr
->more()->srci_srnode
) {
7706 SnapRealm
*dest_realm
= destdir
->inode
->find_snaprealm();
7707 if (srci
->is_projected_snaprealm_global()) {
7708 sr_t
*new_srnode
= srci
->prepare_new_srnode(0);
7709 srci
->record_snaprealm_parent_dentry(new_srnode
, dest_realm
, srcdn
, srcdnl
->is_primary());
7710 mdr
->more()->srci_srnode
= new_srnode
;
7711 } else if (srcdnl
->is_primary()) {
7712 SnapRealm
*src_realm
= srcdir
->inode
->find_snaprealm();
7713 snapid_t follows
= src_realm
->get_newest_seq();
7714 if (src_realm
!= dest_realm
&&
7715 (srci
->snaprealm
|| follows
+ 1 > srci
->get_oldest_snap())) {
7716 sr_t
*new_srnode
= srci
->prepare_new_srnode(follows
);
7717 srci
->record_snaprealm_past_parent(new_srnode
, dest_realm
);
7718 mdr
->more()->srci_srnode
= new_srnode
;
7724 // -- prepare witnesses --
7726 // do srcdn auth last
7727 mds_rank_t last
= MDS_RANK_NONE
;
7728 if (!srcdn
->is_auth()) {
7729 last
= srcdn
->authority().first
;
7730 mdr
->more()->srcdn_auth_mds
= last
;
7731 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
7732 // are involved in the rename operation.
7733 if (srcdnl
->is_primary() && !mdr
->more()->is_ambiguous_auth
) {
7734 dout(10) << " preparing ambiguous auth for srci" << dendl
;
7735 ceph_assert(mdr
->more()->is_remote_frozen_authpin
);
7736 ceph_assert(mdr
->more()->rename_inode
== srci
);
7737 _rename_prepare_witness(mdr
, last
, witnesses
, srctrace
, desttrace
, straydn
);
7742 for (set
<mds_rank_t
>::iterator p
= witnesses
.begin();
7743 p
!= witnesses
.end();
7745 if (*p
== last
) continue; // do it last!
7746 if (mdr
->more()->witnessed
.count(*p
)) {
7747 dout(10) << " already witnessed by mds." << *p
<< dendl
;
7748 } else if (mdr
->more()->waiting_on_slave
.count(*p
)) {
7749 dout(10) << " already waiting on witness mds." << *p
<< dendl
;
7751 if (!_rename_prepare_witness(mdr
, *p
, witnesses
, srctrace
, desttrace
, straydn
))
7755 if (!mdr
->more()->waiting_on_slave
.empty())
7756 return; // we're waiting for a witness.
7758 if (last
!= MDS_RANK_NONE
&& mdr
->more()->witnessed
.count(last
) == 0) {
7759 dout(10) << " preparing last witness (srcdn auth)" << dendl
;
7760 ceph_assert(mdr
->more()->waiting_on_slave
.count(last
) == 0);
7761 _rename_prepare_witness(mdr
, last
, witnesses
, srctrace
, desttrace
, straydn
);
7765 // test hack: bail after slave does prepare, so we can verify it's _live_ rollback.
7766 if (!mdr
->more()->slaves
.empty() && !srci
->is_dir())
7767 ceph_assert(g_conf()->mds_kill_rename_at
!= 3);
7768 if (!mdr
->more()->slaves
.empty() && srci
->is_dir())
7769 ceph_assert(g_conf()->mds_kill_rename_at
!= 4);
7771 // -- declare now --
7772 mdr
->set_mds_stamp(ceph_clock_now());
7774 // -- prepare journal entry --
7775 mdr
->ls
= mdlog
->get_current_segment();
7776 EUpdate
*le
= new EUpdate(mdlog
, "rename");
7777 mdlog
->start_entry(le
);
7778 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
7779 if (!mdr
->more()->witnessed
.empty()) {
7780 dout(20) << " noting uncommitted_slaves " << mdr
->more()->witnessed
<< dendl
;
7782 le
->reqid
= mdr
->reqid
;
7783 le
->had_slaves
= true;
7785 mdcache
->add_uncommitted_master(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
7786 // no need to send frozen auth pin to recovring auth MDS of srci
7787 mdr
->more()->is_remote_frozen_authpin
= false;
7790 _rename_prepare(mdr
, &le
->metablob
, &le
->client_map
, srcdn
, destdn
, straydn
);
7791 if (le
->client_map
.length())
7792 le
->cmapv
= mds
->sessionmap
.get_projected();
7794 // -- commit locally --
7795 C_MDS_rename_finish
*fin
= new C_MDS_rename_finish(this, mdr
, srcdn
, destdn
, straydn
);
7797 journal_and_reply(mdr
, srci
, destdn
, le
, fin
);
7798 mds
->balancer
->maybe_fragment(destdn
->get_dir(), false);
7802 void Server::_rename_finish(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
7804 dout(10) << "_rename_finish " << *mdr
<< dendl
;
7806 if (!mdr
->more()->witnessed
.empty())
7807 mdcache
->logged_master_update(mdr
->reqid
);
7810 _rename_apply(mdr
, srcdn
, destdn
, straydn
);
7812 mdcache
->send_dentry_link(destdn
, mdr
);
7814 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
7815 CInode
*in
= destdnl
->get_inode();
7816 bool need_eval
= mdr
->more()->cap_imports
.count(in
);
7818 // test hack: test slave commit
7819 if (!mdr
->more()->slaves
.empty() && !in
->is_dir())
7820 ceph_assert(g_conf()->mds_kill_rename_at
!= 5);
7821 if (!mdr
->more()->slaves
.empty() && in
->is_dir())
7822 ceph_assert(g_conf()->mds_kill_rename_at
!= 6);
7825 mds
->balancer
->hit_dir(srcdn
->get_dir(), META_POP_IWR
);
7826 if (destdnl
->is_remote() && in
->is_auth())
7827 mds
->balancer
->hit_inode(in
, META_POP_IWR
);
7829 // did we import srci? if so, explicitly ack that import that, before we unlock and reply.
7831 ceph_assert(g_conf()->mds_kill_rename_at
!= 7);
7834 respond_to_request(mdr
, 0);
7837 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
, true);
7840 // respond_to_request() drops locks. So stray reintegration can race with us.
7841 if (straydn
&& !straydn
->get_projected_linkage()->is_null()) {
7842 mdcache
->notify_stray(straydn
);
7850 bool Server::_rename_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, set
<mds_rank_t
> &witnesse
,
7851 vector
<CDentry
*>& srctrace
, vector
<CDentry
*>& dsttrace
, CDentry
*straydn
)
7853 if (mds
->is_cluster_degraded() &&
7854 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(who
)) {
7855 dout(10) << "_rename_prepare_witness mds." << who
<< " is not active" << dendl
;
7856 if (mdr
->more()->waiting_on_slave
.empty())
7857 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(mdcache
, mdr
));
7861 dout(10) << "_rename_prepare_witness mds." << who
<< dendl
;
7862 auto req
= MMDSSlaveRequest::create(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_RENAMEPREP
);
7864 req
->srcdnpath
= filepath(srctrace
.front()->get_dir()->ino());
7865 for (auto dn
: srctrace
)
7866 req
->srcdnpath
.push_dentry(dn
->get_name());
7867 req
->destdnpath
= filepath(dsttrace
.front()->get_dir()->ino());
7868 for (auto dn
: dsttrace
)
7869 req
->destdnpath
.push_dentry(dn
->get_name());
7871 mdcache
->replicate_stray(straydn
, who
, req
->straybl
);
7873 if (mdr
->more()->srci_srnode
)
7874 encode(*mdr
->more()->srci_srnode
, req
->srci_snapbl
);
7875 if (mdr
->more()->desti_srnode
)
7876 encode(*mdr
->more()->desti_srnode
, req
->desti_snapbl
);
7878 req
->srcdn_auth
= mdr
->more()->srcdn_auth_mds
;
7880 // srcdn auth will verify our current witness list is sufficient
7881 req
->witnesses
= witnesse
;
7883 req
->op_stamp
= mdr
->get_op_stamp();
7884 mds
->send_message_mds(req
, who
);
7886 ceph_assert(mdr
->more()->waiting_on_slave
.count(who
) == 0);
7887 mdr
->more()->waiting_on_slave
.insert(who
);
7891 version_t
Server::_rename_prepare_import(MDRequestRef
& mdr
, CDentry
*srcdn
, bufferlist
*client_map_bl
)
7893 version_t oldpv
= mdr
->more()->inode_import_v
;
7895 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
7898 auto blp
= mdr
->more()->inode_import
.cbegin();
7901 map
<client_t
,entity_inst_t
> client_map
;
7902 map
<client_t
, client_metadata_t
> client_metadata_map
;
7903 decode(client_map
, blp
);
7904 decode(client_metadata_map
, blp
);
7905 prepare_force_open_sessions(client_map
, client_metadata_map
,
7906 mdr
->more()->imported_session_map
);
7907 encode(client_map
, *client_map_bl
, mds
->mdsmap
->get_up_features());
7908 encode(client_metadata_map
, *client_map_bl
);
7910 list
<ScatterLock
*> updated_scatterlocks
;
7911 mdcache
->migrator
->decode_import_inode(srcdn
, blp
, srcdn
->authority().first
, mdr
->ls
,
7912 mdr
->more()->cap_imports
, updated_scatterlocks
);
7914 // hack: force back to !auth and clean, temporarily
7915 srcdnl
->get_inode()->state_clear(CInode::STATE_AUTH
);
7916 srcdnl
->get_inode()->mark_clean();
7921 bool Server::_need_force_journal(CInode
*diri
, bool empty
)
7923 std::vector
<CDir
*> dirs
;
7924 diri
->get_dirfrags(dirs
);
7926 bool force_journal
= false;
7928 for (const auto& dir
: dirs
) {
7929 if (dir
->is_subtree_root() && dir
->get_dir_auth().first
== mds
->get_nodeid()) {
7930 dout(10) << " frag " << dir
->get_frag() << " is auth subtree dirfrag, will force journal" << dendl
;
7931 force_journal
= true;
7934 dout(20) << " frag " << dir
->get_frag() << " is not auth subtree dirfrag" << dendl
;
7937 // see if any children of our frags are auth subtrees.
7938 std::vector
<CDir
*> subtrees
;
7939 mdcache
->get_subtrees(subtrees
);
7940 dout(10) << " subtrees " << subtrees
<< " frags " << dirs
<< dendl
;
7941 for (const auto& dir
: dirs
) {
7942 for (const auto& subtree
: subtrees
) {
7943 if (dir
->contains(subtree
)) {
7944 if (subtree
->get_dir_auth().first
== mds
->get_nodeid()) {
7945 dout(10) << " frag " << dir
->get_frag() << " contains (maybe) auth subtree, will force journal "
7946 << *subtree
<< dendl
;
7947 force_journal
= true;
7950 dout(20) << " frag " << dir
->get_frag() << " contains but isn't auth for " << *subtree
<< dendl
;
7952 dout(20) << " frag " << dir
->get_frag() << " does not contain " << *subtree
<< dendl
;
7958 return force_journal
;
7961 void Server::_rename_prepare(MDRequestRef
& mdr
,
7962 EMetaBlob
*metablob
, bufferlist
*client_map_bl
,
7963 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
7965 dout(10) << "_rename_prepare " << *mdr
<< " " << *srcdn
<< " " << *destdn
<< dendl
;
7967 dout(10) << " straydn " << *straydn
<< dendl
;
7969 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
7970 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
7971 CInode
*srci
= srcdnl
->get_inode();
7972 CInode
*oldin
= destdnl
->get_inode();
7974 // primary+remote link merge?
7975 bool linkmerge
= (srci
== oldin
);
7977 ceph_assert(srcdnl
->is_primary() && destdnl
->is_remote());
7978 bool silent
= srcdn
->get_dir()->inode
->is_stray();
7980 bool force_journal_dest
= false;
7981 if (srci
->is_dir() && !destdn
->is_auth()) {
7982 if (srci
->is_auth()) {
7983 // if we are auth for srci and exporting it, force journal because journal replay needs
7984 // the source inode to create auth subtrees.
7985 dout(10) << " we are exporting srci, will force journal destdn" << dendl
;
7986 force_journal_dest
= true;
7988 force_journal_dest
= _need_force_journal(srci
, false);
7991 bool force_journal_stray
= false;
7992 if (oldin
&& oldin
->is_dir() && straydn
&& !straydn
->is_auth())
7993 force_journal_stray
= _need_force_journal(oldin
, true);
7996 dout(10) << " merging remote and primary links to the same inode" << dendl
;
7998 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl
;
7999 if (force_journal_dest
)
8000 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl
;
8001 if (force_journal_stray
)
8002 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl
;
8004 if (srci
->is_dir() && (destdn
->is_auth() || force_journal_dest
)) {
8005 dout(10) << " noting renamed dir ino " << srci
->ino() << " in metablob" << dendl
;
8006 metablob
->renamed_dirino
= srci
->ino();
8007 } else if (oldin
&& oldin
->is_dir() && force_journal_stray
) {
8008 dout(10) << " noting rename target dir " << oldin
->ino() << " in metablob" << dendl
;
8009 metablob
->renamed_dirino
= oldin
->ino();
8013 CInode::mempool_inode
*spi
= 0; // renamed inode
8014 CInode::mempool_inode
*tpi
= 0; // target/overwritten inode
8018 if (destdnl
->is_primary()) {
8019 ceph_assert(straydn
); // moving to straydn.
8020 // link--, and move.
8021 if (destdn
->is_auth()) {
8022 auto &pi
= oldin
->project_inode(); //project_snaprealm
8023 pi
.inode
.version
= straydn
->pre_dirty(pi
.inode
.version
);
8024 pi
.inode
.update_backtrace();
8027 straydn
->push_projected_linkage(oldin
);
8028 } else if (destdnl
->is_remote()) {
8030 if (oldin
->is_auth()) {
8031 auto &pi
= oldin
->project_inode();
8032 pi
.inode
.version
= oldin
->pre_dirty();
8039 if (srcdnl
->is_remote()) {
8042 if (destdn
->is_auth())
8043 mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty();
8044 destdn
->push_projected_linkage(srcdnl
->get_remote_ino(), srcdnl
->get_remote_d_type());
8046 if (srci
->is_auth()) {
8047 auto &pi
= srci
->project_inode();
8048 pi
.inode
.version
= srci
->pre_dirty();
8052 dout(10) << " will merge remote onto primary link" << dendl
;
8053 if (destdn
->is_auth()) {
8054 auto &pi
= oldin
->project_inode();
8055 pi
.inode
.version
= mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty(oldin
->inode
.version
);
8060 if (destdn
->is_auth()) {
8062 if (srcdn
->is_auth())
8063 oldpv
= srci
->get_projected_version();
8065 oldpv
= _rename_prepare_import(mdr
, srcdn
, client_map_bl
);
8067 // note which dirfrags have child subtrees in the journal
8068 // event, so that we can open those (as bounds) during replay.
8069 if (srci
->is_dir()) {
8071 srci
->get_dirfrags(ls
);
8072 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
8074 if (!dir
->is_auth())
8075 metablob
->renamed_dir_frags
.push_back(dir
->get_frag());
8077 dout(10) << " noting renamed dir open frags " << metablob
->renamed_dir_frags
<< dendl
;
8080 auto &pi
= srci
->project_inode(); // project snaprealm if srcdnl->is_primary
8081 // & srcdnl->snaprealm
8082 pi
.inode
.version
= mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty(oldpv
);
8083 pi
.inode
.update_backtrace();
8086 destdn
->push_projected_linkage(srci
);
8090 if (srcdn
->is_auth())
8091 mdr
->more()->pvmap
[srcdn
] = srcdn
->pre_dirty();
8092 srcdn
->push_projected_linkage(); // push null linkage
8096 spi
->ctime
= mdr
->get_op_stamp();
8097 if (mdr
->get_op_stamp() > spi
->rstat
.rctime
)
8098 spi
->rstat
.rctime
= mdr
->get_op_stamp();
8104 tpi
->ctime
= mdr
->get_op_stamp();
8105 if (mdr
->get_op_stamp() > tpi
->rstat
.rctime
)
8106 tpi
->rstat
.rctime
= mdr
->get_op_stamp();
8110 destdn
->make_path_string(t
, true);
8111 tpi
->stray_prior_path
= std::move(t
);
8114 if (tpi
->nlink
== 0)
8115 oldin
->state_set(CInode::STATE_ORPHAN
);
8119 // prepare nesting, mtime updates
8120 int predirty_dir
= silent
? 0:PREDIRTY_DIR
;
8122 // guarantee stray dir is processed first during journal replay. unlink the old inode,
8123 // then link the source inode to destdn
8124 if (destdnl
->is_primary()) {
8125 ceph_assert(straydn
);
8126 if (straydn
->is_auth()) {
8127 metablob
->add_dir_context(straydn
->get_dir());
8128 metablob
->add_dir(straydn
->get_dir(), true);
8133 if (destdn
->is_auth() && !destdnl
->is_null()) {
8134 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, destdn
->get_dir(),
8135 (destdnl
->is_primary() ? PREDIRTY_PRIMARY
:0)|predirty_dir
, -1);
8136 if (destdnl
->is_primary()) {
8137 ceph_assert(straydn
);
8138 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, straydn
->get_dir(),
8139 PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
8144 int predirty_primary
= (srcdnl
->is_primary() && srcdn
->get_dir() != destdn
->get_dir()) ? PREDIRTY_PRIMARY
:0;
8145 int flags
= predirty_dir
| predirty_primary
;
8146 if (srcdn
->is_auth())
8147 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, srcdn
->get_dir(), PREDIRTY_SHALLOW
|flags
, -1);
8148 if (destdn
->is_auth())
8149 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, destdn
->get_dir(), flags
, 1);
8151 // add it all to the metablob
8154 if (destdnl
->is_primary()) {
8155 ceph_assert(straydn
);
8156 if (destdn
->is_auth()) {
8157 // project snaprealm, too
8158 if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
8159 oldin
->project_snaprealm(desti_srnode
);
8160 if (tpi
->nlink
== 0)
8161 ceph_assert(!desti_srnode
->is_parent_global());
8162 desti_srnode
= NULL
;
8164 straydn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
8165 metablob
->add_primary_dentry(straydn
, oldin
, true, true);
8166 } else if (force_journal_stray
) {
8167 dout(10) << " forced journaling straydn " << *straydn
<< dendl
;
8168 metablob
->add_dir_context(straydn
->get_dir());
8169 metablob
->add_primary_dentry(straydn
, oldin
, true);
8171 } else if (destdnl
->is_remote()) {
8172 if (oldin
->is_auth()) {
8173 sr_t
*new_srnode
= NULL
;
8174 if (mdr
->slave_request
) {
8175 if (mdr
->slave_request
->desti_snapbl
.length() > 0) {
8176 new_srnode
= new sr_t();
8177 auto p
= mdr
->slave_request
->desti_snapbl
.cbegin();
8178 decode(*new_srnode
, p
);
8180 } else if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
8181 new_srnode
= desti_srnode
;
8182 desti_srnode
= NULL
;
8185 oldin
->project_snaprealm(new_srnode
);
8186 if (tpi
->nlink
== 0)
8187 ceph_assert(!new_srnode
->is_parent_global());
8190 metablob
->add_dir_context(oldin
->get_projected_parent_dir());
8191 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, oldin
->get_projected_parent_dn(),
8192 CEPH_NOSNAP
, 0, destdnl
);
8193 metablob
->add_primary_dentry(oldin
->get_projected_parent_dn(), oldin
, true);
8199 if (srcdnl
->is_remote()) {
8200 ceph_assert(!linkmerge
);
8201 if (destdn
->is_auth() && !destdnl
->is_null())
8202 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
8204 destdn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
8206 if (destdn
->is_auth())
8207 metablob
->add_remote_dentry(destdn
, true, srcdnl
->get_remote_ino(), srcdnl
->get_remote_d_type());
8209 if (srci
->is_auth() ) { // it's remote
8210 if (mdr
->slave_request
) {
8211 if (mdr
->slave_request
->srci_snapbl
.length() > 0) {
8212 sr_t
*new_srnode
= new sr_t();
8213 auto p
= mdr
->slave_request
->srci_snapbl
.cbegin();
8214 decode(*new_srnode
, p
);
8215 srci
->project_snaprealm(new_srnode
);
8217 } else if (auto& srci_srnode
= mdr
->more()->srci_srnode
) {
8218 srci
->project_snaprealm(srci_srnode
);
8222 CDentry
*srci_pdn
= srci
->get_projected_parent_dn();
8223 metablob
->add_dir_context(srci_pdn
->get_dir());
8224 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, srci_pdn
, CEPH_NOSNAP
, 0, srcdnl
);
8225 metablob
->add_primary_dentry(srci_pdn
, srci
, true);
8227 } else if (srcdnl
->is_primary()) {
8228 // project snap parent update?
8229 if (destdn
->is_auth()) {
8230 if (auto& srci_srnode
= mdr
->more()->srci_srnode
) {
8231 srci
->project_snaprealm(srci_srnode
);
8236 if (destdn
->is_auth() && !destdnl
->is_null())
8237 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
8239 destdn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
8241 if (destdn
->is_auth())
8242 metablob
->add_primary_dentry(destdn
, srci
, true, true);
8243 else if (force_journal_dest
) {
8244 dout(10) << " forced journaling destdn " << *destdn
<< dendl
;
8245 metablob
->add_dir_context(destdn
->get_dir());
8246 metablob
->add_primary_dentry(destdn
, srci
, true);
8247 if (srcdn
->is_auth() && srci
->is_dir()) {
8248 // journal new subtrees root dirfrags
8250 srci
->get_dirfrags(ls
);
8251 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
8254 metablob
->add_dir(dir
, true);
8261 if (srcdn
->is_auth()) {
8262 dout(10) << " journaling srcdn " << *srcdn
<< dendl
;
8263 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, srcdn
, CEPH_NOSNAP
, 0, srcdnl
);
8264 // also journal the inode in case we need do slave rename rollback. It is Ok to add
8265 // both primary and NULL dentries. Because during journal replay, null dentry is
8266 // processed after primary dentry.
8267 if (srcdnl
->is_primary() && !srci
->is_dir() && !destdn
->is_auth())
8268 metablob
->add_primary_dentry(srcdn
, srci
, true);
8269 metablob
->add_null_dentry(srcdn
, true);
8271 dout(10) << " NOT journaling srcdn " << *srcdn
<< dendl
;
8273 // make renamed inode first track the dn
8274 if (srcdnl
->is_primary() && destdn
->is_auth()) {
8275 ceph_assert(srci
->first
<= destdn
->first
);
8276 srci
->first
= destdn
->first
;
8278 // make stray inode first track the straydn
8279 if (straydn
&& straydn
->is_auth()) {
8280 ceph_assert(oldin
->first
<= straydn
->first
);
8281 oldin
->first
= straydn
->first
;
8284 if (oldin
&& oldin
->is_dir()) {
8285 ceph_assert(straydn
);
8286 mdcache
->project_subtree_rename(oldin
, destdn
->get_dir(), straydn
->get_dir());
8289 mdcache
->project_subtree_rename(srci
, srcdn
->get_dir(), destdn
->get_dir());
/**
 * Apply a prepared (journaled) rename to the in-memory cache.
 *
 * Pops the projected linkages/inodes that _rename_prepare() set up, in a
 * fixed order: first the overwritten target (moved onto the stray dentry
 * or unlinked), then the source dentry, then the destination linkage.
 * Runs on both master and slave MDSs; on a slave, snaprealm state comes
 * from mdr->slave_request instead of locally projected snaprealms.
 *
 * @param mdr     rename request (master or slave)
 * @param srcdn   source dentry
 * @param destdn  destination dentry
 * @param straydn stray dentry for an overwritten primary target, else NULL
 */
void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
  dout(10) << " pvs " << mdr->more()->pvmap << dendl;

  CDentry::linkage_t *srcdnl = srcdn->get_linkage();
  CDentry::linkage_t *destdnl = destdn->get_linkage();

  CInode *oldin = destdnl->get_inode();

  // primary+remote link merge?
  bool linkmerge = (srcdnl->get_inode() == oldin);
  if (linkmerge)
    ceph_assert(srcdnl->is_primary() || destdnl->is_remote());

  bool new_in_snaprealm = false;
  bool new_oldin_snaprealm = false;

  // target inode
  if (!linkmerge) {
    if (destdnl->is_primary()) {
      ceph_assert(straydn);
      dout(10) << "straydn is " << *straydn << dendl;

      // if there is newly created snaprealm, need to split old snaprealm's
      // inodes_with_caps. So pop snaprealm before linkage changes.
      if (destdn->is_auth()) {
        bool hadrealm = (oldin->snaprealm ? true : false);
        oldin->early_pop_projected_snaprealm();
        new_oldin_snaprealm = (oldin->snaprealm && !hadrealm);
      } else {
        // slave: the master shipped the target's snaprealm blob
        ceph_assert(mdr->slave_request);
        if (mdr->slave_request->desti_snapbl.length()) {
          new_oldin_snaprealm = !oldin->snaprealm;
          oldin->decode_snap_blob(mdr->slave_request->desti_snapbl);
          ceph_assert(oldin->snaprealm);
          ceph_assert(oldin->snaprealm->have_past_parents_open());
        }
      }

      destdn->get_dir()->unlink_inode(destdn, false);

      // relink the overwritten target under the stray dir
      straydn->pop_projected_linkage();
      if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
        ceph_assert(!straydn->is_projected()); // no other projected

      // nlink-- targeti
      if (destdn->is_auth())
        oldin->pop_and_dirty_projected_inode(mdr->ls);

      mdcache->touch_dentry_bottom(straydn);  // drop dn as quickly as possible.
    } else if (destdnl->is_remote()) {
      destdn->get_dir()->unlink_inode(destdn, false);
      if (oldin->is_auth()) {
        oldin->pop_and_dirty_projected_inode(mdr->ls);
      } else if (mdr->slave_request) {
        if (mdr->slave_request->desti_snapbl.length() > 0) {
          ceph_assert(oldin->snaprealm);
          oldin->decode_snap_blob(mdr->slave_request->desti_snapbl);
        }
      } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
        // projected snaprealm node never got journaled; discard it
        delete desti_srnode;
        desti_srnode = NULL;
      }
    }
  }

  // unlink src before we relink it at dest
  CInode *in = srcdnl->get_inode();
  ceph_assert(in);

  bool srcdn_was_remote = srcdnl->is_remote();
  if (!srcdn_was_remote) {
    // if there is newly created snaprealm, need to split old snaprealm's
    // inodes_with_caps. So pop snaprealm before linkage changes.
    if (destdn->is_auth()) {
      bool hadrealm = (in->snaprealm ? true : false);
      in->early_pop_projected_snaprealm();
      new_in_snaprealm = (in->snaprealm && !hadrealm);
    } else {
      ceph_assert(mdr->slave_request);
      if (mdr->slave_request->srci_snapbl.length()) {
        new_in_snaprealm = !in->snaprealm;
        in->decode_snap_blob(mdr->slave_request->srci_snapbl);
        ceph_assert(in->snaprealm);
        ceph_assert(in->snaprealm->have_past_parents_open());
      }
    }
  }

  srcdn->get_dir()->unlink_inode(srcdn);

  // dest
  if (srcdn_was_remote) {
    if (!linkmerge) {
      // destdn
      destdnl = destdn->pop_projected_linkage();
      if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
        ceph_assert(!destdn->is_projected()); // no other projected

      destdn->link_remote(destdnl, in);
      if (destdn->is_auth())
        destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
      // in
      if (in->is_auth()) {
        in->pop_and_dirty_projected_inode(mdr->ls);
      } else if (mdr->slave_request) {
        if (mdr->slave_request->srci_snapbl.length() > 0) {
          ceph_assert(in->snaprealm);
          in->decode_snap_blob(mdr->slave_request->srci_snapbl);
        }
      } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
        delete srci_srnode;
        srci_srnode = NULL;
      }
    } else {
      dout(10) << "merging remote onto primary link" << dendl;
      oldin->pop_and_dirty_projected_inode(mdr->ls);
    }
  } else { // primary
    if (linkmerge) {
      dout(10) << "merging primary onto remote link" << dendl;
      destdn->get_dir()->unlink_inode(destdn, false);
    }
    destdnl = destdn->pop_projected_linkage();
    if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
      ceph_assert(!destdn->is_projected()); // no other projected

    // srcdn inode import?
    if (!srcdn->is_auth() && destdn->is_auth()) {
      ceph_assert(mdr->more()->inode_import.length() > 0);

      map<client_t,Capability::Import> imported_caps;

      // finish cap imports
      finish_force_open_sessions(mdr->more()->imported_session_map);
      if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
        mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
                                                    mdr->more()->srcdn_auth_mds, true,
                                                    mdr->more()->imported_session_map,
                                                    mdr->more()->cap_imports[destdnl->get_inode()],
                                                    imported_caps);
      }

      mdr->more()->inode_import.clear();
      encode(imported_caps, mdr->more()->inode_import);

      /* hack: add an auth pin for each xlock we hold. These were
       * remote xlocks previously but now they're local and
       * we're going to try and unpin when we xlock_finish. */

      for (auto i = mdr->locks.lower_bound(&destdnl->get_inode()->versionlock);
           i != mdr->locks.end();
           ++i) {
        SimpleLock *lock = i->lock;
        if (lock->get_parent() != destdnl->get_inode())
          break;  // locks are ordered by parent; past the imported inode now
        if (i->is_xlock() && !lock->is_locallock())
          mds->locker->xlock_import(lock);
      }

      // hack: fix auth bit
      in->state_set(CInode::STATE_AUTH);

      mdr->clear_ambiguous_auth();
    }

    if (destdn->is_auth())
      in->pop_and_dirty_projected_inode(mdr->ls);
  }

  // src
  if (srcdn->is_auth())
    srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
  srcdn->pop_projected_linkage();
  if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
    ceph_assert(!srcdn->is_projected()); // no other projected

  // apply remaining projected inodes (nested)
  mdr->apply();

  // update subtree map?
  if (destdnl->is_primary() && in->is_dir())
    mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);

  if (straydn && oldin->is_dir())
    mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);

  // notify clients of snaprealm splits created above
  if (new_oldin_snaprealm)
    mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT, false);
  if (new_in_snaprealm)
    mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, true);

  // removing a new dn?
  if (srcdn->is_auth())
    srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
}
8497 class C_MDS_SlaveRenamePrep
: public ServerLogContext
{
8498 CDentry
*srcdn
, *destdn
, *straydn
;
8500 C_MDS_SlaveRenamePrep(Server
*s
, MDRequestRef
& m
, CDentry
*sr
, CDentry
*de
, CDentry
*st
) :
8501 ServerLogContext(s
, m
), srcdn(sr
), destdn(de
), straydn(st
) {}
8502 void finish(int r
) override
{
8503 server
->_logged_slave_rename(mdr
, srcdn
, destdn
, straydn
);
8507 class C_MDS_SlaveRenameCommit
: public ServerContext
{
8509 CDentry
*srcdn
, *destdn
, *straydn
;
8511 C_MDS_SlaveRenameCommit(Server
*s
, MDRequestRef
& m
, CDentry
*sr
, CDentry
*de
, CDentry
*st
) :
8512 ServerContext(s
), mdr(m
), srcdn(sr
), destdn(de
), straydn(st
) {}
8513 void finish(int r
) override
{
8514 server
->_commit_slave_rename(mdr
, r
, srcdn
, destdn
, straydn
);
8518 class C_MDS_SlaveRenameSessionsFlushed
: public ServerContext
{
8521 C_MDS_SlaveRenameSessionsFlushed(Server
*s
, MDRequestRef
& r
) :
8522 ServerContext(s
), mdr(r
) {}
8523 void finish(int r
) override
{
8524 server
->_slave_rename_sessions_flushed(mdr
);
/**
 * Handle an OP_RENAMEPREP slave request from the rename master.
 *
 * Traverses to the source and destination dentries, and then:
 *  - if we are the srcdn auth: freezes the source inode, marks it
 *    ambiguous-auth, notifies bystander replicas (OP_RENAMENOTIFY) and
 *    flushes client sessions; replies early with our replica list if the
 *    master's witness set is insufficient;
 *  - records the pre-rename state in a rename_rollback blob;
 *  - journals an ESlaveUpdate PREPARE and, once durable (or immediately
 *    if the metablob is empty), applies the rename and acks with
 *    OP_RENAMEPREPACK via _logged_slave_rename().
 */
void Server::handle_slave_rename_prep(MDRequestRef& mdr)
{
  dout(10) << "handle_slave_rename_prep " << *mdr
           << " " << mdr->slave_request->srcdnpath
           << " to " << mdr->slave_request->destdnpath
           << dendl;

  if (mdr->slave_request->is_interrupted()) {
    dout(10) << " slave request interrupted, sending noop reply" << dendl;
    auto reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
    reply->mark_interrupted();
    mds->send_message_mds(reply, mdr->slave_to_mds);
    mdr->reset_slave_request();
    return;
  }

  // discover destdn
  filepath destpath(mdr->slave_request->destdnpath);
  dout(10) << " dest " << destpath << dendl;
  vector<CDentry*> trace;
  CF_MDS_MDRContextFactory cf(mdcache, mdr);
  int r = mdcache->path_traverse(mdr, cf, destpath, &trace, NULL, MDS_TRAVERSE_DISCOVERXLOCK);
  if (r > 0)
    return;  // traversal is continuing asynchronously
  if (r == -ESTALE) {
    mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
                            mdr->slave_to_mds);
    return;
  }
  ceph_assert(r == 0);  // we shouldn't get an error here!

  CDentry *destdn = trace.back();
  CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
  dout(10) << " destdn " << *destdn << dendl;
  mdr->pin(destdn);

  // discover srcdn
  filepath srcpath(mdr->slave_request->srcdnpath);
  dout(10) << " src " << srcpath << dendl;
  CInode *srci = nullptr;
  r = mdcache->path_traverse(mdr, cf, srcpath, &trace, &srci, MDS_TRAVERSE_DISCOVERXLOCK);
  if (r > 0)
    return;
  ceph_assert(r == 0);

  // srcpath must not point to a null dentry
  ceph_assert(srci != nullptr);

  CDentry *srcdn = trace.back();
  CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
  dout(10) << " srcdn " << *srcdn << dendl;
  mdr->pin(srcdn);
  mdr->pin(srci);

  // stray?
  bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
  if (linkmerge)
    ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
  CDentry *straydn = mdr->straydn;
  if (destdnl->is_primary() && !linkmerge)
    ceph_assert(straydn);

  mdr->set_op_stamp(mdr->slave_request->op_stamp);
  mdr->more()->srcdn_auth_mds = srcdn->authority().first;

  // set up commit waiter (early, to clean up any freezing etc we do)
  if (!mdr->more()->slave_commit)
    mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn);

  // am i the srcdn auth?
  if (srcdn->is_auth()) {
    set<mds_rank_t> srcdnrep;
    srcdn->list_replicas(srcdnrep);

    bool reply_witness = false;
    if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
      // freeze the source inode. we need this to
      // - avoid conflicting lock state changes
      // - avoid concurrent updates to the inode
      //   (this could also be accomplished with the versionlock)
      int allowance = 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
      dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
      bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);

      // unfreeze auth pin after freezing the inode to avoid queueing waiters
      if (srcdnl->get_inode()->is_frozen_auth_pin())
        mdr->unfreeze_auth_pin();

      if (!frozen_inode) {
        srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
        return;
      }

      /*
       * set ambiguous auth for srci
       * NOTE: we don't worry about ambiguous cache expire as we do
       * with subtree migrations because all slaves will pin
       * srcdn->get_inode() for duration of this rename.
       */
      mdr->set_ambiguous_auth(srcdnl->get_inode());

      // just mark the source inode as ambiguous auth if more than two MDS are involved.
      // the master will send another OP_RENAMEPREP slave request later.
      if (mdr->slave_request->witnesses.size() > 1) {
        dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
        reply_witness = true;
      }

      // make sure bystanders have received all lock related messages
      for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
        if (*p == mdr->slave_to_mds ||
            (mds->is_cluster_degraded() &&
             !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
          continue;
        auto notify = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMENOTIFY);
        mds->send_message_mds(notify, *p);
        mdr->more()->waiting_on_slave.insert(*p);
      }

      // make sure clients have received all cap related messages
      set<client_t> export_client_set;
      mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);

      MDSGatherBuilder gather(g_ceph_context);
      flush_client_sessions(export_client_set, gather);
      if (gather.has_subs()) {
        // MDS_RANK_NONE stands in for "client session flush" in the wait set
        mdr->more()->waiting_on_slave.insert(MDS_RANK_NONE);
        gather.set_finisher(new C_MDS_SlaveRenameSessionsFlushed(this, mdr));
        gather.activate();
      }
    }

    // is witness list sufficient?
    for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
      if (*p == mdr->slave_to_mds ||
          mdr->slave_request->witnesses.count(*p)) continue;
      dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
      reply_witness = true;
      break;
    }

    if (reply_witness) {
      ceph_assert(!srcdnrep.empty());
      auto reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
      reply->witnesses.swap(srcdnrep);
      mds->send_message_mds(reply, mdr->slave_to_mds);
      mdr->reset_slave_request();
      return;
    }
    dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
    if (!mdr->more()->waiting_on_slave.empty()) {
      dout(10) << " still waiting for rename notify acks from "
               << mdr->more()->waiting_on_slave << dendl;
      return;
    }
  } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
    // set ambiguous auth for srci on witnesses
    mdr->set_ambiguous_auth(srcdnl->get_inode());
  }

  // encode everything we'd need to roll this back... basically, just the original state.
  rename_rollback rollback;

  rollback.reqid = mdr->reqid;

  rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
  rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
  rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
  rollback.orig_src.dname = srcdn->get_name();
  if (srcdnl->is_primary())
    rollback.orig_src.ino = srcdnl->get_inode()->ino();
  else {
    ceph_assert(srcdnl->is_remote());
    rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
    rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
  }

  rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
  rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
  rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
  rollback.orig_dest.dname = destdn->get_name();
  if (destdnl->is_primary())
    rollback.orig_dest.ino = destdnl->get_inode()->ino();
  else if (destdnl->is_remote()) {
    rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
    rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
  }

  if (straydn) {
    rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
    rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
    rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
    rollback.stray.dname = straydn->get_name();
  }

  // snapshot the pre-rename snaprealm blobs so a rollback can restore them
  if (mdr->slave_request->desti_snapbl.length()) {
    CInode *oldin = destdnl->get_inode();
    if (oldin->snaprealm) {
      encode(true, rollback.desti_snapbl);
      oldin->encode_snap_blob(rollback.desti_snapbl);
    } else {
      encode(false, rollback.desti_snapbl);
    }
  }

  if (mdr->slave_request->srci_snapbl.length()) {
    if (srci->snaprealm) {
      encode(true, rollback.srci_snapbl);
      srci->encode_snap_blob(rollback.srci_snapbl);
    } else {
      encode(false, rollback.srci_snapbl);
    }
  }

  encode(rollback, mdr->more()->rollback_bl);
  // FIXME: rollback snaprealm
  dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;

  // journal.
  mdr->ls = mdlog->get_current_segment();
  ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds,
                                      ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RENAME);
  mdlog->start_entry(le);
  le->rollback = mdr->more()->rollback_bl;

  bufferlist blah;  // inode import data... obviously not used if we're the slave
  _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, straydn);

  if (le->commit.empty()) {
    dout(10) << " empty metablob, skipping journal" << dendl;
    mdlog->cancel_entry(le);
    mdr->ls = NULL;
    _logged_slave_rename(mdr, srcdn, destdn, straydn);
  } else {
    mdr->more()->slave_update_journaled = true;
    submit_mdlog_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn),
                       mdr, __func__);
    mdlog->flush();
  }
}
/**
 * Slave rename: the PREPARE event is durable (or was skipped) — export
 * the source inode to the destdn auth if we are giving it up, apply the
 * rename to the cache, then ack the master with OP_RENAMEPREPACK.
 *
 * If mdr->aborted is set no ack is sent and the request is finished
 * locally instead.
 */
void Server::_logged_slave_rename(MDRequestRef& mdr,
                                  CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_logged_slave_rename " << *mdr << dendl;

  // prepare ack
  MMDSSlaveRequest::ref reply;
  if (!mdr->aborted) {
    reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
    if (!mdr->more()->slave_update_journaled)
      reply->mark_not_journaled();
  }

  CDentry::linkage_t *srcdnl = srcdn->get_linkage();
  //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;

  // export srci?
  if (srcdn->is_auth() && srcdnl->is_primary()) {
    // set export bounds for CInode::encode_export()
    list<CDir*> bounds;
    if (srcdnl->get_inode()->is_dir()) {
      srcdnl->get_inode()->get_dirfrags(bounds);
      for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
        (*p)->state_set(CDir::STATE_EXPORTBOUND);
    }

    map<client_t,entity_inst_t> exported_client_map;
    map<client_t, client_metadata_t> exported_client_metadata_map;
    bufferlist inodebl;
    mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
                                           exported_client_map,
                                           exported_client_metadata_map);

    for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
      (*p)->state_clear(CDir::STATE_EXPORTBOUND);

    if (reply) {
      // ship the inode (plus client session info) to the master
      encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
      encode(exported_client_metadata_map, reply->inode_export);
      reply->inode_export.claim_append(inodebl);
      reply->inode_export_v = srcdnl->get_inode()->inode.version;
    }

    // remove mdr auth pin
    mdr->auth_unpin(srcdnl->get_inode());
    mdr->more()->is_inode_exporter = true;

    if (srcdnl->get_inode()->is_dirty())
      srcdnl->get_inode()->mark_clean();

    dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
  }

  // apply
  _rename_apply(mdr, srcdn, destdn, straydn);

  CDentry::linkage_t *destdnl = destdn->get_linkage();

  // bump popularity
  mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
  if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
    mds->balancer->hit_inode(destdnl->get_inode(), META_POP_IWR);

  // done.
  mdr->reset_slave_request();
  mdr->straydn = 0;

  if (reply) {
    mds->send_message_mds(reply, mdr->slave_to_mds);
  } else {
    ceph_assert(mdr->aborted);
    dout(10) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);
  }
}
/**
 * Finish a slave rename after the master's decision.
 *
 * r == 0: commit — finish the inode export (handing moved xlocks to the
 * new auth), unfreeze/clear ambiguous auth, and journal an ESlaveUpdate
 * COMMIT (or finish immediately if nothing was journaled).
 *
 * r < 0: abort — undo the export and roll the rename back from the
 * previously encoded rollback blob, if we got far enough to record one.
 */
void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
                                  CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;

  CInode *in = destdn->get_linkage()->get_inode();

  inodeno_t migrated_stray;
  if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
    migrated_stray = in->ino();

  MDSContext::vec finished;
  if (r == 0) {
    // unfreeze+singleauth inode
    //  hmm, do i really need to delay this?
    if (mdr->more()->is_inode_exporter) {
      // we exported, clear out any xlocks that we moved to another MDS

      for (auto i = mdr->locks.lower_bound(&in->versionlock);
           i != mdr->locks.end(); ) {
        SimpleLock *lock = i->lock;
        if (lock->get_parent() != in)
          break;
        // we only care about xlocks on the exported inode
        if (i->is_xlock() && !lock->is_locallock())
          mds->locker->xlock_export(i++, mdr.get());
        else
          ++i;
      }

      map<client_t,Capability::Import> peer_imported;
      auto bp = mdr->more()->inode_import.cbegin();
      decode(peer_imported, bp);

      dout(10) << " finishing inode export on " << *in << dendl;
      mdcache->migrator->finish_export_inode(in, mdr->slave_to_mds, peer_imported, finished);
      mds->queue_waiters(finished);   // this includes SINGLEAUTH waiters.

      // unfreeze
      ceph_assert(in->is_frozen_inode());
      in->unfreeze_inode(finished);
    }

    // singleauth
    if (mdr->more()->is_ambiguous_auth) {
      mdr->more()->rename_inode->clear_ambiguous_auth(finished);
      mdr->more()->is_ambiguous_auth = false;
    }

    if (straydn && mdr->more()->slave_update_journaled) {
      CInode *strayin = straydn->get_projected_linkage()->get_inode();
      if (strayin && !strayin->snaprealm)
        mdcache->clear_dirty_bits_for_stray(strayin);
    }

    mds->queue_waiters(finished);
    mdr->cleanup();

    if (mdr->more()->slave_update_journaled) {
      // write a commit to the journal
      ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid,
                                          mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
                                          ESlaveUpdate::RENAME);
      mdlog->start_entry(le);
      submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
      mdlog->flush();
    } else {
      _committed_slave(mdr);
    }
  } else {
    // abort
    // rollback_bl may be empty if we froze the inode but had to provide an expanded
    // witness list from the master, and they failed before we tried prep again.
    if (mdr->more()->rollback_bl.length()) {
      if (mdr->more()->is_inode_exporter) {
        dout(10) << " reversing inode export of " << *in << dendl;
        in->abort_export();
      }
      if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
        mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
        // rollback but preserve the slave request
        do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, false);
        mdr->more()->rollback_bl.clear();
      } else
        do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, true);
    } else {
      dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl;
      // singleauth
      if (mdr->more()->is_ambiguous_auth) {
        if (srcdn->is_auth())
          mdr->more()->rename_inode->unfreeze_inode(finished);

        mdr->more()->rename_inode->clear_ambiguous_auth(finished);
        mdr->more()->is_ambiguous_auth = false;
      }
      mds->queue_waiters(finished);
      mdcache->request_finish(mdr);
    }
  }

  if (migrated_stray && mds->is_stopping())
    mdcache->shutdown_export_stray_finish(migrated_stray);
}
/**
 * Helper for do_rename_rollback(): restore a directory fragment's
 * fragstat/rstat after a rename is undone.
 *
 * @param mut        rollback mutation to attach the projected fnode to
 * @param dir        the dirfrag being repaired
 * @param r          the recorded original state (old mtime/rctime)
 * @param ctime      the rename's ctime; the old timestamps are restored
 *                   only if the rename was the last thing to touch them
 * @param isdir      whether the (un)linked dentry referred to a directory
 * @param linkunlink +1 to re-add a link to this dir, -1 to remove one
 * @param rstat      the linked inode's accounted rstat, scaled by linkunlink
 */
void _rollback_repair_dir(MutationRef& mut, CDir *dir, rename_rollback::drec &r, utime_t ctime,
                          bool isdir, int linkunlink, nest_info_t &rstat)
{
  fnode_t *pf;
  pf = dir->project_fnode();
  mut->add_projected_fnode(dir);
  pf->version = dir->pre_dirty();

  if (isdir) {
    pf->fragstat.nsubdirs += linkunlink;
  } else {
    pf->fragstat.nfiles += linkunlink;
  }
  if (linkunlink) {
    pf->rstat.rbytes += linkunlink * rstat.rbytes;
    pf->rstat.rfiles += linkunlink * rstat.rfiles;
    pf->rstat.rsubdirs += linkunlink * rstat.rsubdirs;
    pf->rstat.rsnaps += linkunlink * rstat.rsnaps;
  }
  // only roll timestamps back if the rename was their most recent update
  if (pf->fragstat.mtime == ctime) {
    pf->fragstat.mtime = r.dirfrag_old_mtime;
    if (pf->rstat.rctime == ctime)
      pf->rstat.rctime = r.dirfrag_old_rctime;
  }
  mut->add_updated_lock(&dir->get_inode()->filelock);
  mut->add_updated_lock(&dir->get_inode()->nestlock);
}
/**
 * Log-completion context for a slave rename ROLLBACK: once the rollback
 * ESlaveUpdate is durable, finish undoing the rename via
 * Server::_rename_rollback_finish(). The two split maps carry pending
 * CEPH_SNAP_OP_SPLIT client notifications for srci ([0]) and the
 * overwritten target ([1]).
 */
struct C_MDS_LoggedRenameRollback : public ServerLogContext {
  MutationRef mut;
  CDentry *srcdn;
  version_t srcdnpv;   // srcdn's pre_dirty() projected version
  CDentry *destdn;
  CDentry *straydn;
  map<client_t,MClientSnap::ref> splits[2];
  bool finish_mdr;     // whether to finish the MDRequest after rollback
  C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
                             CDentry *sd, version_t pv, CDentry *dd, CDentry *st,
                             map<client_t,MClientSnap::ref> _splits[2], bool f) :
    ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
    straydn(st), finish_mdr(f) {
    splits[0].swap(_splits[0]);
    splits[1].swap(_splits[1]);
  }
  void finish(int r) override {
    server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
                                    destdn, straydn, splits, finish_mdr);
  }
};
8997 void Server::do_rename_rollback(bufferlist
&rbl
, mds_rank_t master
, MDRequestRef
& mdr
,
9000 rename_rollback rollback
;
9001 auto p
= rbl
.cbegin();
9002 decode(rollback
, p
);
9004 dout(10) << "do_rename_rollback on " << rollback
.reqid
<< dendl
;
9005 // need to finish this update before sending resolve to claim the subtree
9006 mdcache
->add_rollback(rollback
.reqid
, master
);
9008 MutationRef
mut(new MutationImpl(nullptr, utime_t(), rollback
.reqid
));
9009 mut
->ls
= mds
->mdlog
->get_current_segment();
9011 CDentry
*srcdn
= NULL
;
9012 CDir
*srcdir
= mdcache
->get_dirfrag(rollback
.orig_src
.dirfrag
);
9014 srcdir
= mdcache
->get_dirfrag(rollback
.orig_src
.dirfrag
.ino
, rollback
.orig_src
.dname
);
9016 dout(10) << " srcdir " << *srcdir
<< dendl
;
9017 srcdn
= srcdir
->lookup(rollback
.orig_src
.dname
);
9019 dout(10) << " srcdn " << *srcdn
<< dendl
;
9020 ceph_assert(srcdn
->get_linkage()->is_null());
9022 dout(10) << " srcdn not found" << dendl
;
9024 dout(10) << " srcdir not found" << dendl
;
9026 CDentry
*destdn
= NULL
;
9027 CDir
*destdir
= mdcache
->get_dirfrag(rollback
.orig_dest
.dirfrag
);
9029 destdir
= mdcache
->get_dirfrag(rollback
.orig_dest
.dirfrag
.ino
, rollback
.orig_dest
.dname
);
9031 dout(10) << " destdir " << *destdir
<< dendl
;
9032 destdn
= destdir
->lookup(rollback
.orig_dest
.dname
);
9034 dout(10) << " destdn " << *destdn
<< dendl
;
9036 dout(10) << " destdn not found" << dendl
;
9038 dout(10) << " destdir not found" << dendl
;
9041 if (rollback
.orig_src
.ino
) {
9042 in
= mdcache
->get_inode(rollback
.orig_src
.ino
);
9043 if (in
&& in
->is_dir())
9044 ceph_assert(srcdn
&& destdn
);
9046 in
= mdcache
->get_inode(rollback
.orig_src
.remote_ino
);
9048 CDir
*straydir
= NULL
;
9049 CDentry
*straydn
= NULL
;
9050 if (rollback
.stray
.dirfrag
.ino
) {
9051 straydir
= mdcache
->get_dirfrag(rollback
.stray
.dirfrag
);
9053 dout(10) << "straydir " << *straydir
<< dendl
;
9054 straydn
= straydir
->lookup(rollback
.stray
.dname
);
9056 dout(10) << " straydn " << *straydn
<< dendl
;
9057 ceph_assert(straydn
->get_linkage()->is_primary());
9059 dout(10) << " straydn not found" << dendl
;
9061 dout(10) << "straydir not found" << dendl
;
9064 CInode
*target
= NULL
;
9065 if (rollback
.orig_dest
.ino
) {
9066 target
= mdcache
->get_inode(rollback
.orig_dest
.ino
);
9068 ceph_assert(destdn
&& straydn
);
9069 } else if (rollback
.orig_dest
.remote_ino
)
9070 target
= mdcache
->get_inode(rollback
.orig_dest
.remote_ino
);
9072 // can't use is_auth() in the resolve stage
9073 mds_rank_t whoami
= mds
->get_nodeid();
9075 ceph_assert(!destdn
|| destdn
->authority().first
!= whoami
);
9076 ceph_assert(!straydn
|| straydn
->authority().first
!= whoami
);
9078 bool force_journal_src
= false;
9079 bool force_journal_dest
= false;
9080 if (in
&& in
->is_dir() && srcdn
->authority().first
!= whoami
)
9081 force_journal_src
= _need_force_journal(in
, false);
9082 if (in
&& target
&& target
->is_dir())
9083 force_journal_dest
= _need_force_journal(in
, true);
9085 version_t srcdnpv
= 0;
9088 if (srcdn
->authority().first
== whoami
)
9089 srcdnpv
= srcdn
->pre_dirty();
9090 if (rollback
.orig_src
.ino
) {
9092 srcdn
->push_projected_linkage(in
);
9094 srcdn
->push_projected_linkage(rollback
.orig_src
.remote_ino
,
9095 rollback
.orig_src
.remote_d_type
);
9098 map
<client_t
,MClientSnap::ref
> splits
[2];
9100 CInode::mempool_inode
*pip
= nullptr;
9103 if (in
->get_projected_parent_dn()->authority().first
== whoami
) {
9104 auto &pi
= in
->project_inode();
9106 mut
->add_projected_inode(in
);
9107 pip
->version
= in
->pre_dirty();
9110 pip
= in
->get_projected_inode();
9113 if (pip
->ctime
== rollback
.ctime
)
9114 pip
->ctime
= rollback
.orig_src
.old_ctime
;
9116 if (rollback
.srci_snapbl
.length() && in
->snaprealm
) {
9118 auto p
= rollback
.srci_snapbl
.cbegin();
9119 decode(hadrealm
, p
);
9121 if (projected
&& !mds
->is_resolve()) {
9122 sr_t
*new_srnode
= new sr_t();
9123 decode(*new_srnode
, p
);
9124 in
->project_snaprealm(new_srnode
);
9126 decode(in
->snaprealm
->srnode
, p
);
9129 if (rollback
.orig_src
.ino
) {
9130 ceph_assert(srcdir
);
9131 realm
= srcdir
->get_inode()->find_snaprealm();
9133 realm
= in
->snaprealm
->parent
;
9135 if (!mds
->is_resolve())
9136 mdcache
->prepare_realm_merge(in
->snaprealm
, realm
, splits
[0]);
9138 in
->project_snaprealm(NULL
);
9140 in
->snaprealm
->merge_to(realm
);
9145 if (srcdn
&& srcdn
->authority().first
== whoami
) {
9147 _rollback_repair_dir(mut
, srcdir
, rollback
.orig_src
, rollback
.ctime
,
9148 in
? in
->is_dir() : false, 1, pip
? pip
->accounted_rstat
: blah
);
9153 if (rollback
.orig_dest
.ino
&& target
) {
9154 destdn
->push_projected_linkage(target
);
9155 } else if (rollback
.orig_dest
.remote_ino
) {
9156 destdn
->push_projected_linkage(rollback
.orig_dest
.remote_ino
,
9157 rollback
.orig_dest
.remote_d_type
);
9159 // the dentry will be trimmed soon, it's ok to have wrong linkage
9160 if (rollback
.orig_dest
.ino
)
9161 ceph_assert(mds
->is_resolve());
9162 destdn
->push_projected_linkage();
9167 straydn
->push_projected_linkage();
9171 CInode::mempool_inode
*ti
= nullptr;
9172 if (target
->get_projected_parent_dn()->authority().first
== whoami
) {
9173 auto &pi
= target
->project_inode();
9175 mut
->add_projected_inode(target
);
9176 ti
->version
= target
->pre_dirty();
9179 ti
= target
->get_projected_inode();
9182 if (ti
->ctime
== rollback
.ctime
)
9183 ti
->ctime
= rollback
.orig_dest
.old_ctime
;
9184 if (MDS_INO_IS_STRAY(rollback
.orig_src
.dirfrag
.ino
)) {
9185 if (MDS_INO_IS_STRAY(rollback
.orig_dest
.dirfrag
.ino
))
9186 ceph_assert(!rollback
.orig_dest
.ino
&& !rollback
.orig_dest
.remote_ino
);
9188 ceph_assert(rollback
.orig_dest
.remote_ino
&&
9189 rollback
.orig_dest
.remote_ino
== rollback
.orig_src
.ino
);
9193 if (rollback
.desti_snapbl
.length() && target
->snaprealm
) {
9195 auto p
= rollback
.desti_snapbl
.cbegin();
9196 decode(hadrealm
, p
);
9198 if (projected
&& !mds
->is_resolve()) {
9199 sr_t
*new_srnode
= new sr_t();
9200 decode(*new_srnode
, p
);
9201 target
->project_snaprealm(new_srnode
);
9203 decode(target
->snaprealm
->srnode
, p
);
9206 if (rollback
.orig_dest
.ino
) {
9207 ceph_assert(destdir
);
9208 realm
= destdir
->get_inode()->find_snaprealm();
9210 realm
= target
->snaprealm
->parent
;
9212 if (!mds
->is_resolve())
9213 mdcache
->prepare_realm_merge(target
->snaprealm
, realm
, splits
[1]);
9215 target
->project_snaprealm(NULL
);
9217 target
->snaprealm
->merge_to(realm
);
9223 dout(0) << " srcdn back to " << *srcdn
<< dendl
;
9225 dout(0) << " srci back to " << *in
<< dendl
;
9227 dout(0) << " destdn back to " << *destdn
<< dendl
;
9229 dout(0) << " desti back to " << *target
<< dendl
;
9232 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rename_rollback", rollback
.reqid
, master
,
9233 ESlaveUpdate::OP_ROLLBACK
, ESlaveUpdate::RENAME
);
9234 mdlog
->start_entry(le
);
9236 if (srcdn
&& (srcdn
->authority().first
== whoami
|| force_journal_src
)) {
9237 le
->commit
.add_dir_context(srcdir
);
9238 if (rollback
.orig_src
.ino
)
9239 le
->commit
.add_primary_dentry(srcdn
, 0, true);
9241 le
->commit
.add_remote_dentry(srcdn
, true);
9244 if (!rollback
.orig_src
.ino
&& // remote linkage
9245 in
&& in
->authority().first
== whoami
) {
9246 le
->commit
.add_dir_context(in
->get_projected_parent_dir());
9247 le
->commit
.add_primary_dentry(in
->get_projected_parent_dn(), in
, true);
9250 if (force_journal_dest
) {
9251 ceph_assert(rollback
.orig_dest
.ino
);
9252 le
->commit
.add_dir_context(destdir
);
9253 le
->commit
.add_primary_dentry(destdn
, 0, true);
9256 // slave: no need to journal straydn
9258 if (target
&& target
!= in
&& target
->authority().first
== whoami
) {
9259 ceph_assert(rollback
.orig_dest
.remote_ino
);
9260 le
->commit
.add_dir_context(target
->get_projected_parent_dir());
9261 le
->commit
.add_primary_dentry(target
->get_projected_parent_dn(), target
, true);
9264 if (in
&& in
->is_dir() && (srcdn
->authority().first
== whoami
|| force_journal_src
)) {
9265 dout(10) << " noting renamed dir ino " << in
->ino() << " in metablob" << dendl
;
9266 le
->commit
.renamed_dirino
= in
->ino();
9267 if (srcdn
->authority().first
== whoami
) {
9269 in
->get_dirfrags(ls
);
9270 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
9272 if (!dir
->is_auth())
9273 le
->commit
.renamed_dir_frags
.push_back(dir
->get_frag());
9275 dout(10) << " noting renamed dir open frags " << le
->commit
.renamed_dir_frags
<< dendl
;
9277 } else if (force_journal_dest
) {
9278 dout(10) << " noting rename target ino " << target
->ino() << " in metablob" << dendl
;
9279 le
->commit
.renamed_dirino
= target
->ino();
9282 if (target
&& target
->is_dir()) {
9283 ceph_assert(destdn
);
9284 mdcache
->project_subtree_rename(target
, straydir
, destdir
);
9287 if (in
&& in
->is_dir()) {
9289 mdcache
->project_subtree_rename(in
, destdir
, srcdir
);
9292 if (mdr
&& !mdr
->more()->slave_update_journaled
) {
9293 ceph_assert(le
->commit
.empty());
9294 mdlog
->cancel_entry(le
);
9296 _rename_rollback_finish(mut
, mdr
, srcdn
, srcdnpv
, destdn
, straydn
, splits
, finish_mdr
);
9298 ceph_assert(!le
->commit
.empty());
9300 mdr
->more()->slave_update_journaled
= false;
9301 MDSLogContextBase
*fin
= new C_MDS_LoggedRenameRollback(this, mut
, mdr
,
9302 srcdn
, srcdnpv
, destdn
, straydn
,
9303 splits
, finish_mdr
);
9304 submit_mdlog_entry(le
, fin
, mdr
, __func__
);
// Completion callback for a slave-rename rollback: undoes the projected
// dentry linkages (stray, destination, source), re-adjusts renamed dir
// subtrees, replies/cleans up the slave request, and tells the cache the
// rollback is finished.
// NOTE(review): this text is a lossy extraction -- the embedded original
// line numbers skip values (e.g. 9328 -> 9334), so the guard conditions
// ("if (straydn) {", "if (destdn) {", etc.) and closing braces around
// several statement groups are not visible here. Comments below describe
// only the statements that ARE visible; confirm control flow against the
// unmangled file.
9309 void Server::_rename_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
, CDentry
*srcdn
,
9310 version_t srcdnpv
, CDentry
*destdn
, CDentry
*straydn
,
9311 map
<client_t
,MClientSnap::ref
> splits
[2], bool finish_mdr
)
9313 dout(10) << "_rename_rollback_finish " << mut
->reqid
<< dendl
;
// Drop the stray dentry's projected linkage (presumably guarded by
// "if (straydn)" in the original -- TODO confirm).
9316 straydn
->get_dir()->unlink_inode(straydn
);
9317 straydn
->pop_projected_linkage();
// Likewise for the destination dentry.
9320 destdn
->get_dir()->unlink_inode(destdn
);
9321 destdn
->pop_projected_linkage();
// Restore the source dentry; if we are auth for it, mark it dirty at the
// pre-rename projected version and restore AUTH state on a primary inode.
9324 srcdn
->pop_projected_linkage();
9325 if (srcdn
->authority().first
== mds
->get_nodeid()) {
9326 srcdn
->mark_dirty(srcdnpv
, mut
->ls
);
9327 if (srcdn
->get_linkage()->is_primary())
9328 srcdn
->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH
);
// If the source dentry is primary and links a directory, move its subtree
// back relative to the destination dir.
9334 if (srcdn
&& srcdn
->get_linkage()->is_primary()) {
9335 CInode
*in
= srcdn
->get_linkage()->get_inode();
9336 if (in
&& in
->is_dir()) {
9337 ceph_assert(destdn
);
9338 mdcache
->adjust_subtree_after_rename(in
, destdn
->get_dir(), true);
// Symmetrically, move the (restored) rename target's subtree back relative
// to the stray dir.
9343 CInode
*oldin
= destdn
->get_linkage()->get_inode();
9344 // update subtree map?
9345 if (oldin
&& oldin
->is_dir()) {
9346 ceph_assert(straydn
);
9347 mdcache
->adjust_subtree_after_rename(oldin
, straydn
->get_dir(), true);
// During resolve: trim any non-auth subtree rooted where the rollback
// happened. Otherwise: send the queued snap split/notify messages.
9351 if (mds
->is_resolve()) {
9354 root
= mdcache
->get_subtree_root(straydn
->get_dir());
9356 root
= mdcache
->get_subtree_root(destdn
->get_dir());
9358 mdcache
->try_trim_non_auth_subtree(root
);
9360 mdcache
->send_snaps(splits
[1]);
9361 mdcache
->send_snaps(splits
[0]);
// If a slave request is attached: clear ambiguous auth (unfreezing the
// renamed inode if we are auth), wake waiters, then either finish the
// request or just clear the rolling-back flag.
9365 MDSContext::vec finished
;
9366 if (mdr
->more()->is_ambiguous_auth
) {
9367 if (srcdn
->is_auth())
9368 mdr
->more()->rename_inode
->unfreeze_inode(finished
);
9370 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
9371 mdr
->more()->is_ambiguous_auth
= false;
9373 mds
->queue_waiters(finished
);
9374 if (finish_mdr
|| mdr
->aborted
)
9375 mdcache
->request_finish(mdr
);
9377 mdr
->more()->slave_rolling_back
= false;
// Tell the cache this rollback (keyed by the mutation's reqid) is done.
9380 mdcache
->finish_rollback(mut
->reqid
);
// Master-side handler for a slave's rename-prep ack: records the slave as a
// witness (or collects extra witnesses it suggests), imports the source
// inode state it exported, and re-dispatches the client request once all
// awaited slaves have answered.
// NOTE(review): lossy extraction -- some original lines (blank lines,
// braces, possibly an "else" around the extra-witness branch) are elided;
// verify control flow against the unmangled file.
9385 void Server::handle_slave_rename_prep_ack(MDRequestRef
& mdr
, const MMDSSlaveRequest::const_ref
&ack
)
9387 dout(10) << "handle_slave_rename_prep_ack " << *mdr
9388 << " witnessed by " << ack
->get_source()
9389 << " " << *ack
<< dendl
;
9390 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
// Note this rank as a participating slave.
9393 mdr
->more()->slaves
.insert(from
);
// If the srcdn-auth slave froze/authpinned the renamed inode remotely,
// mark auth as ambiguous until the rename commits or aborts.
9394 if (mdr
->more()->srcdn_auth_mds
== from
&&
9395 mdr
->more()->is_remote_frozen_authpin
&&
9396 !mdr
->more()->is_ambiguous_auth
) {
9397 mdr
->set_ambiguous_auth(mdr
->more()->rename_inode
);
9400 // witnessed? or add extra witnesses?
9401 ceph_assert(mdr
->more()->witnessed
.count(from
) == 0);
9402 if (ack
->is_interrupted()) {
9403 dout(10) << " slave request interrupted, noop" << dendl
;
9404 } else if (ack
->witnesses
.empty()) {
// Empty witness list == the slave itself witnessed the prep.
9405 mdr
->more()->witnessed
.insert(from
);
9406 if (!ack
->is_not_journaled())
9407 mdr
->more()->has_journaled_slaves
= true;
// Non-empty list: the slave is asking us to recruit additional witnesses
// (replicas of srcdn), excluding ourselves.
9409 dout(10) << " extra witnesses (srcdn replicas) are " << ack
->witnesses
<< dendl
;
9410 mdr
->more()->extra_witnesses
= ack
->witnesses
;
9411 mdr
->more()->extra_witnesses
.erase(mds
->get_nodeid()); // not me!
// Stash any exported source-inode state for later import.
9415 if (ack
->inode_export
.length()) {
9416 dout(10) << " got srci import" << dendl
;
9417 mdr
->more()->inode_import
.share(ack
->inode_export
);
9418 mdr
->more()->inode_import_v
= ack
->inode_export_v
;
9421 // remove from waiting list
9422 ceph_assert(mdr
->more()->waiting_on_slave
.count(from
));
9423 mdr
->more()->waiting_on_slave
.erase(from
);
// All slaves answered -> retry the client request from the top.
9425 if (mdr
->more()->waiting_on_slave
.empty())
9426 dispatch_client_request(mdr
); // go again!
9428 dout(10) << "still waiting on slaves " << mdr
->more()->waiting_on_slave
<< dendl
;
// Slave-side handler: a peer MDS acknowledged our rename notify. Remove it
// from the wait set and, when the set drains, resume the pending slave
// request (if any).
9431 void Server::handle_slave_rename_notify_ack(MDRequestRef
& mdr
, const MMDSSlaveRequest::const_ref
&ack
)
9433 dout(10) << "handle_slave_rename_notify_ack " << *mdr
<< " from mds."
9434 << ack
->get_source() << dendl
;
9435 ceph_assert(mdr
->is_slave());
9436 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
// Tolerate duplicate acks: only erase if still recorded as waiting.
9438 if (mdr
->more()->waiting_on_slave
.count(from
)) {
9439 mdr
->more()->waiting_on_slave
.erase(from
);
9441 if (mdr
->more()->waiting_on_slave
.empty()) {
9442 if (mdr
->slave_request
)
9443 dispatch_slave_request(mdr
);
9445 dout(10) << " still waiting for rename notify acks from "
9446 << mdr
->more()->waiting_on_slave
<< dendl
;
// Callback once client sessions are flushed for a slave rename. The flush
// is tracked in waiting_on_slave under the sentinel MDS_RANK_NONE; clear it
// and resume the slave request when nothing else is pending.
9450 void Server::_slave_rename_sessions_flushed(MDRequestRef
& mdr
)
9452 dout(10) << "_slave_rename_sessions_flushed " << *mdr
<< dendl
;
9454 if (mdr
->more()->waiting_on_slave
.count(MDS_RANK_NONE
)) {
9455 mdr
->more()->waiting_on_slave
.erase(MDS_RANK_NONE
);
9457 if (mdr
->more()->waiting_on_slave
.empty()) {
9458 if (mdr
->slave_request
)
9459 dispatch_slave_request(mdr
);
9461 dout(10) << " still waiting for rename notify acks from "
9462 << mdr
->more()->waiting_on_slave
<< dendl
;
9467 /* This function takes responsibility for the passed mdr*/
// Handle a client LSSNAP: list the snapshots of a directory, readdir-style
// (paginated by snap name via path2, bounded by max_entries/max_bytes), and
// reply with an encoded dirstat + dentry/lease/inodestat entries.
// NOTE(review): lossy extraction -- several statements (early returns after
// respond_to_request, bufferlist declarations for dirbl/dnbl/keep, the
// "num" counter, truncation handling after encode_inodestat) are elided
// here; the visible lines are annotated below.
9468 void Server::handle_client_lssnap(MDRequestRef
& mdr
)
9470 const MClientRequest::const_ref
&req
= mdr
->client_request
;
// Resolve the target dir inode; stale/purging -> ESTALE, non-auth -> forward,
// non-dir -> ENOTDIR. (Early returns after each respond are elided here.)
9473 CInode
*diri
= mdcache
->get_inode(req
->get_filepath().get_ino());
9474 if (!diri
|| diri
->state_test(CInode::STATE_PURGING
)) {
9475 respond_to_request(mdr
, -ESTALE
);
9478 if (!diri
->is_auth()) {
9479 mdcache
->request_forward(mdr
, diri
->authority().first
);
9482 if (!diri
->is_dir()) {
9483 respond_to_request(mdr
, -ENOTDIR
);
9486 dout(10) << "lssnap on " << *diri
<< dendl
;
// Take snap rdlocks and check read access before enumerating.
9489 MutationImpl::LockOpVec lov
;
9490 mds
->locker
->include_snap_rdlocks(diri
, lov
);
9491 if (!mds
->locker
->acquire_locks(mdr
, lov
))
9494 if (!check_access(mdr
, diri
, MAY_READ
))
// Collect snapshot info for this dir's realm.
9497 SnapRealm
*realm
= diri
->find_snaprealm();
9498 map
<snapid_t
,const SnapInfo
*> infomap
;
9499 realm
->get_snap_info(infomap
, diri
->get_oldest_snap());
// Client-requested paging limits; 0 entries presumably means "all"
// (the guard before the infomap.size() assignment is elided -- confirm).
9501 unsigned max_entries
= req
->head
.args
.readdir
.max_entries
;
9503 max_entries
= infomap
.size();
9504 int max_bytes
= req
->head
.args
.readdir
.max_bytes
;
9506 // make sure at least one item can be encoded
9507 max_bytes
= (512 << 10) + g_conf()->mds_max_xattr_pairs_size
;
// path2 carries the continuation cursor: the last snap name already sent.
9509 __u64 last_snapid
= 0;
9510 string offset_str
= req
->get_path2();
9511 if (!offset_str
.empty())
9512 last_snapid
= realm
->resolve_snapname(offset_str
, diri
->ino());
// Reply header: an (empty) dirstat, then account for trailing fields.
9516 static DirStat empty
;
9517 CDir::encode_dirstat(dirbl
, mdr
->session
->info
, empty
);
9519 max_bytes
-= dirbl
.length() - sizeof(__u32
) + sizeof(__u8
) * 2;
// Walk snapshots after the cursor, encoding name + lease + inodestat per
// entry until entry/byte budgets run out.
9523 auto p
= infomap
.upper_bound(last_snapid
);
9524 for (; p
!= infomap
.end() && num
< max_entries
; ++p
) {
9525 dout(10) << p
->first
<< " -> " << *p
->second
<< dendl
;
// Snaps created on this dir use their short name; inherited ones the
// long "_name_ino" form.
9529 if (p
->second
->ino
== diri
->ino())
9530 snap_name
= p
->second
->name
;
9532 snap_name
= p
->second
->get_long_name();
9534 unsigned start_len
= dnbl
.length();
9535 if (int(start_len
+ snap_name
.length() + sizeof(__u32
) + sizeof(LeaseStat
)) > max_bytes
)
9538 encode(snap_name
, dnbl
);
// Snapshots never change: encode an infinite lease.
9540 LeaseStat
e(-1, -1, 0);
9541 mds
->locker
->encode_lease(dnbl
, mdr
->session
->info
, e
);
9542 dout(20) << "encode_infinite_lease" << dendl
;
9544 int r
= diri
->encode_inodestat(dnbl
, mdr
->session
, realm
, p
->first
, max_bytes
- (int)dnbl
.length());
// On overflow, roll dnbl back to the pre-entry length (truncation logic
// partially elided here).
9547 keep
.substr_of(dnbl
, 0, start_len
);
// Trailer: END flag when we exhausted the map, COMPLETE when this was an
// unpaginated (cursor-less) listing.
9556 if (p
== infomap
.end()) {
9557 flags
= CEPH_READDIR_FRAG_END
;
9558 if (last_snapid
== 0)
9559 flags
|= CEPH_READDIR_FRAG_COMPLETE
;
9561 encode(flags
, dirbl
);
9562 dirbl
.claim_append(dnbl
);
9564 mdr
->reply_extra_bl
= dirbl
;
9566 respond_to_request(mdr
, 0);
// Journal-completion context for mksnap: once the EUpdate is committed,
// forward to Server::_mksnap_finish with the target dir inode and the new
// snapshot's SnapInfo.
// NOTE(review): the member declarations (presumably "CInode *diri;
// SnapInfo info;") are elided by this extraction -- confirm in the
// unmangled file.
9572 struct C_MDS_mksnap_finish
: public ServerLogContext
{
9575 C_MDS_mksnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, SnapInfo
&i
) :
9576 ServerLogContext(s
, r
), diri(di
), info(i
) {}
9577 void finish(int r
) override
{
9578 server
->_mksnap_finish(mdr
, diri
, info
);
9582 /* This function takes responsibility for the passed mdr*/
// Handle a client MKSNAP: validate the target dir and snap name, allocate a
// snapid via the snap table (two-phase: prepare + retry, commit happens in
// _mksnap_finish), project the inode rstat/ctime and the new snaprealm
// node, and journal everything as an EUpdate.
// NOTE(review): lossy extraction -- early returns after respond/forward
// calls, the snapid/info local declarations, and the decode of snapid from
// snapidbl are elided; visible lines annotated below.
9583 void Server::handle_client_mksnap(MDRequestRef
& mdr
)
9585 const MClientRequest::const_ref
&req
= mdr
->client_request
;
9586 // make sure we have as new a map as the client
9587 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
9588 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
9591 if (!mds
->mdsmap
->allows_snaps()) {
9592 // you can't make snapshots until you set an option right now
9593 respond_to_request(mdr
, -EPERM
);
// Resolve the dir inode; stale -> ESTALE, non-auth -> forward,
// non-dir -> ENOTDIR, system dirs (other than root) -> EPERM.
9597 CInode
*diri
= mdcache
->get_inode(req
->get_filepath().get_ino());
9598 if (!diri
|| diri
->state_test(CInode::STATE_PURGING
)) {
9599 respond_to_request(mdr
, -ESTALE
);
9603 if (!diri
->is_auth()) { // fw to auth?
9604 mdcache
->request_forward(mdr
, diri
->authority().first
);
9609 if (!diri
->is_dir()) {
9610 respond_to_request(mdr
, -ENOTDIR
);
9613 if (diri
->is_system() && !diri
->is_root()) {
9614 // no snaps in system dirs (root is ok)
9615 respond_to_request(mdr
, -EPERM
);
// Snap name comes from the request path's last dentry; enforce the
// configured uid window for snapshot creation.
9619 std::string_view snapname
= req
->get_filepath().last_dentry();
9621 if (mdr
->client_request
->get_caller_uid() < g_conf()->mds_snap_min_uid
|| mdr
->client_request
->get_caller_uid() > g_conf()->mds_snap_max_uid
) {
9622 dout(20) << "mksnap " << snapname
<< " on " << *diri
<< " denied to uid " << mdr
->client_request
->get_caller_uid() << dendl
;
9623 respond_to_request(mdr
, -EPERM
);
9627 dout(10) << "mksnap " << snapname
<< " on " << *diri
<< dendl
;
// Lock: snap rdlocks on ancestors, but xlock (not rdlock) this dir's
// snaplock since we are mutating its realm.
9630 MutationImpl::LockOpVec lov
;
9632 mds
->locker
->include_snap_rdlocks(diri
, lov
);
9633 lov
.erase_rdlock(&diri
->snaplock
);
9634 lov
.add_xlock(&diri
->snaplock
);
9636 if (!mds
->locker
->acquire_locks(mdr
, lov
))
9639 if (!check_access(mdr
, diri
, MAY_WRITE
|MAY_SNAPSHOT
))
9642 // make sure name is unique
9643 if (diri
->snaprealm
&&
9644 diri
->snaprealm
->exists(snapname
)) {
9645 respond_to_request(mdr
, -EEXIST
);
// Empty names and the reserved '_' prefix are invalid.
9648 if (snapname
.length() == 0 ||
9649 snapname
[0] == '_') {
9650 respond_to_request(mdr
, -EINVAL
);
9654 // allocate a snapid
9655 if (!mdr
->more()->stid
) {
// First pass: ask the snap table to prepare; the retry context re-enters
// this handler once stid/snapidbl are filled in.
9657 mds
->snapclient
->prepare_create(diri
->ino(), snapname
,
9658 mdr
->get_mds_stamp(),
9659 &mdr
->more()->stid
, &mdr
->more()->snapidbl
,
9660 new C_MDS_RetryRequest(mdcache
, mdr
));
// Second pass: decode the allocated snapid (decode call elided here).
9664 version_t stid
= mdr
->more()->stid
;
9666 auto p
= mdr
->more()->snapidbl
.cbegin();
9668 dout(10) << " stid " << stid
<< " snapid " << snapid
<< dendl
;
9670 ceph_assert(mds
->snapclient
->get_cached_version() >= stid
);
// Fill in the SnapInfo for the new snapshot.
9674 info
.ino
= diri
->ino();
9675 info
.snapid
= snapid
;
9676 info
.name
= snapname
;
9677 info
.stamp
= mdr
->get_op_stamp();
// Project inode changes: ctime/rctime bump and rsnaps count.
9679 auto &pi
= diri
->project_inode(false, true);
9680 pi
.inode
.ctime
= info
.stamp
;
9681 if (info
.stamp
> pi
.inode
.rstat
.rctime
)
9682 pi
.inode
.rstat
.rctime
= info
.stamp
;
9683 pi
.inode
.rstat
.rsnaps
++;
9684 pi
.inode
.version
= diri
->pre_dirty();
9686 // project the snaprealm
9687 auto &newsnap
= *pi
.snapnode
;
9688 newsnap
.created
= snapid
;
9689 auto em
= newsnap
.snaps
.emplace(std::piecewise_construct
, std::forward_as_tuple(snapid
), std::forward_as_tuple(info
));
// If the snapid already existed in the map, overwrite its info.
9691 em
.first
->second
= info
;
9692 newsnap
.seq
= snapid
;
9693 newsnap
.last_created
= snapid
;
9695 // journal the inode changes
9696 mdr
->ls
= mdlog
->get_current_segment();
9697 EUpdate
*le
= new EUpdate(mdlog
, "mksnap");
9698 mdlog
->start_entry(le
);
9700 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
9701 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
9702 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
9703 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
9705 // journal the snaprealm changes
9706 submit_mdlog_entry(le
, new C_MDS_mksnap_finish(this, mdr
, diri
, info
),
// Journal-commit stage of mksnap: apply the projected inode/snaprealm,
// commit the snap-table transaction, notify peers and clients of the new
// realm state, and reply to the client.
9711 void Server::_mksnap_finish(MDRequestRef
& mdr
, CInode
*diri
, SnapInfo
&info
)
9713 dout(10) << "_mksnap_finish " << *mdr
<< " " << info
<< dendl
;
// CREATE when the dir already had a realm; SPLIT when pop_and_dirty will
// open a fresh realm (children must be split into it).
9715 int op
= (diri
->snaprealm
? CEPH_SNAP_OP_CREATE
: CEPH_SNAP_OP_SPLIT
);
9717 diri
->pop_and_dirty_projected_inode(mdr
->ls
);
// Commit the prepared snap-table entry (stid) now that it is journaled.
9720 mds
->snapclient
->commit(mdr
->more()->stid
, mdr
->ls
);
9723 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
// Notify other MDS ranks, then clients caching the realm.
9726 mdcache
->send_snap_update(diri
, mdr
->more()->stid
, op
);
9728 mdcache
->do_realm_invalidate_and_update_notify(diri
, op
);
// Reply; snapid in the mdr is reflected in the trace sent back.
9732 mdr
->snapid
= info
.snapid
;
9734 respond_to_request(mdr
, 0);
// Journal-completion context for rmsnap: once the EUpdate is committed,
// forward to Server::_rmsnap_finish with the dir inode and removed snapid.
// NOTE(review): member declarations (presumably "CInode *diri; snapid_t
// snapid;") are elided by this extraction.
9740 struct C_MDS_rmsnap_finish
: public ServerLogContext
{
9743 C_MDS_rmsnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, snapid_t sn
) :
9744 ServerLogContext(s
, r
), diri(di
), snapid(sn
) {}
9745 void finish(int r
) override
{
9746 server
->_rmsnap_finish(mdr
, diri
, snapid
);
9750 /* This function takes responsibility for the passed mdr*/
// Handle a client RMSNAP: validate the dir and snap name, two-phase
// prepare/commit against the snap table (commit in _rmsnap_finish), project
// removal of the snap from the realm plus inode rstat updates, and journal
// as an EUpdate.
// NOTE(review): lossy extraction -- early returns after respond/forward
// and the decode of "seq" from snapidbl are elided; visible lines
// annotated below.
9751 void Server::handle_client_rmsnap(MDRequestRef
& mdr
)
9753 const MClientRequest::const_ref
&req
= mdr
->client_request
;
// Resolve the dir inode; stale -> ESTALE, non-auth -> forward,
// non-dir -> ENOTDIR.
9755 CInode
*diri
= mdcache
->get_inode(req
->get_filepath().get_ino());
9756 if (!diri
|| diri
->state_test(CInode::STATE_PURGING
)) {
9757 respond_to_request(mdr
, -ESTALE
);
9760 if (!diri
->is_auth()) { // fw to auth?
9761 mdcache
->request_forward(mdr
, diri
->authority().first
);
9764 if (!diri
->is_dir()) {
9765 respond_to_request(mdr
, -ENOTDIR
);
// Enforce the configured uid window for snapshot removal.
9769 std::string_view snapname
= req
->get_filepath().last_dentry();
9771 if (mdr
->client_request
->get_caller_uid() < g_conf()->mds_snap_min_uid
|| mdr
->client_request
->get_caller_uid() > g_conf()->mds_snap_max_uid
) {
9772 dout(20) << "rmsnap " << snapname
<< " on " << *diri
<< " denied to uid " << mdr
->client_request
->get_caller_uid() << dendl
;
9773 respond_to_request(mdr
, -EPERM
);
9777 dout(10) << "rmsnap " << snapname
<< " on " << *diri
<< dendl
;
// '_'-prefixed (parent/long) names cannot be removed here; the snap must
// exist in this dir's realm.
9780 if (snapname
.length() == 0 || snapname
[0] == '_') {
9781 respond_to_request(mdr
, -EINVAL
); // can't prune a parent snap, currently.
9784 if (!diri
->snaprealm
|| !diri
->snaprealm
->exists(snapname
)) {
9785 respond_to_request(mdr
, -ENOENT
);
9788 snapid_t snapid
= diri
->snaprealm
->resolve_snapname(snapname
, diri
->ino());
9789 dout(10) << " snapname " << snapname
<< " is " << snapid
<< dendl
;
// Lock: snap rdlocks on ancestors, xlock this dir's snaplock (realm is
// being mutated) -- same pattern as mksnap.
9791 MutationImpl::LockOpVec lov
;
9792 mds
->locker
->include_snap_rdlocks(diri
, lov
);
9793 lov
.erase_rdlock(&diri
->snaplock
);
9794 lov
.add_xlock(&diri
->snaplock
);
9796 if (!mds
->locker
->acquire_locks(mdr
, lov
))
9799 if (!check_access(mdr
, diri
, MAY_WRITE
|MAY_SNAPSHOT
))
// First pass: prepare the snap-table destroy, then retry this handler.
9803 if (!mdr
->more()->stid
) {
9804 mds
->snapclient
->prepare_destroy(diri
->ino(), snapid
,
9805 &mdr
->more()->stid
, &mdr
->more()->snapidbl
,
9806 new C_MDS_RetryRequest(mdcache
, mdr
));
// Second pass: read back stid and the destroy seq (decode elided here).
9809 version_t stid
= mdr
->more()->stid
;
9810 auto p
= mdr
->more()->snapidbl
.cbegin();
9813 dout(10) << " stid is " << stid
<< ", seq is " << seq
<< dendl
;
9815 ceph_assert(mds
->snapclient
->get_cached_version() >= stid
);
// Project inode: version/ctime/rctime bump, decrement rsnaps.
9818 auto &pi
= diri
->project_inode(false, true);
9819 pi
.inode
.version
= diri
->pre_dirty();
9820 pi
.inode
.ctime
= mdr
->get_op_stamp();
9821 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
9822 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
9823 pi
.inode
.rstat
.rsnaps
--;
9825 mdr
->ls
= mdlog
->get_current_segment();
9826 EUpdate
*le
= new EUpdate(mdlog
, "rmsnap");
9827 mdlog
->start_entry(le
);
9829 // project the snaprealm
9830 auto &newnode
= *pi
.snapnode
;
9831 newnode
.snaps
.erase(snapid
);
9833 newnode
.last_destroyed
= seq
;
9835 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
9836 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
9837 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
9838 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
9840 submit_mdlog_entry(le
, new C_MDS_rmsnap_finish(this, mdr
, diri
, snapid
),
// Journal-commit stage of rmsnap: apply the projected inode/realm change,
// commit the snap-table destroy, notify peers and clients, reply, and kick
// off purging of now-stale snapshot data.
9845 void Server::_rmsnap_finish(MDRequestRef
& mdr
, CInode
*diri
, snapid_t snapid
)
9847 dout(10) << "_rmsnap_finish " << *mdr
<< " " << snapid
<< dendl
;
9848 snapid_t stid
= mdr
->more()->stid
;
// Iterator into snapidbl; the decode it feeds is elided by this extraction.
9849 auto p
= mdr
->more()->snapidbl
.cbegin();
9853 diri
->pop_and_dirty_projected_inode(mdr
->ls
);
// Commit the prepared snap-table destroy transaction.
9856 mds
->snapclient
->commit(stid
, mdr
->ls
);
9858 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
// Notify other MDS ranks, then clients caching the realm.
9861 mdcache
->send_snap_update(diri
, mdr
->more()->stid
, CEPH_SNAP_OP_DESTROY
);
9863 mdcache
->do_realm_invalidate_and_update_notify(diri
, CEPH_SNAP_OP_DESTROY
);
9867 respond_to_request(mdr
, 0);
9869 // purge snapshot data
9870 if (diri
->snaprealm
->have_past_parents_open())
9871 diri
->purge_stale_snap_data(diri
->snaprealm
->get_snaps());
// Journal-completion context for renamesnap: once the EUpdate is committed,
// forward to Server::_renamesnap_finish with the dir inode and the renamed
// snapid.
// NOTE(review): member declarations (presumably "CInode *diri; snapid_t
// snapid;") are elided by this extraction.
9874 struct C_MDS_renamesnap_finish
: public ServerLogContext
{
9877 C_MDS_renamesnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, snapid_t sn
) :
9878 ServerLogContext(s
, r
), diri(di
), snapid(sn
) {}
9879 void finish(int r
) override
{
9880 server
->_renamesnap_finish(mdr
, diri
, snapid
);
9884 /* This function takes responsibility for the passed mdr*/
// Handle a client RENAMESNAP: rename snapshot `srcname` (from path2) to
// `dstname` (from path) on the same directory. Validates both names,
// prepares/commits a snap-table update (commit in _renamesnap_finish),
// projects the name change in the realm, and journals as an EUpdate.
// NOTE(review): lossy extraction -- early returns after respond/forward
// calls are elided; visible lines annotated below.
9885 void Server::handle_client_renamesnap(MDRequestRef
& mdr
)
9887 const MClientRequest::const_ref
&req
= mdr
->client_request
;
// Both filepaths must name the same directory inode.
9888 if (req
->get_filepath().get_ino() != req
->get_filepath2().get_ino()) {
9889 respond_to_request(mdr
, -EINVAL
);
// Resolve the dir inode; stale -> ESTALE, non-auth -> forward,
// non-dir -> ENOTDIR.
9893 CInode
*diri
= mdcache
->get_inode(req
->get_filepath().get_ino());
9894 if (!diri
|| diri
->state_test(CInode::STATE_PURGING
)) {
9895 respond_to_request(mdr
, -ESTALE
);
9899 if (!diri
->is_auth()) { // fw to auth?
9900 mdcache
->request_forward(mdr
, diri
->authority().first
);
9904 if (!diri
->is_dir()) { // dir only
9905 respond_to_request(mdr
, -ENOTDIR
);
// Enforce the configured uid window for snapshot operations.
9909 if (mdr
->client_request
->get_caller_uid() < g_conf()->mds_snap_min_uid
||
9910 mdr
->client_request
->get_caller_uid() > g_conf()->mds_snap_max_uid
) {
9911 respond_to_request(mdr
, -EPERM
);
// path = new name, path2 = existing name.
9915 std::string_view dstname
= req
->get_filepath().last_dentry();
9916 std::string_view srcname
= req
->get_filepath2().last_dentry();
9917 dout(10) << "renamesnap " << srcname
<< "->" << dstname
<< " on " << *diri
<< dendl
;
// Source must be a valid, existing local snap; dest must be a valid,
// non-existing name ('_' prefix reserved).
9919 if (srcname
.length() == 0 || srcname
[0] == '_') {
9920 respond_to_request(mdr
, -EINVAL
); // can't rename a parent snap.
9923 if (!diri
->snaprealm
|| !diri
->snaprealm
->exists(srcname
)) {
9924 respond_to_request(mdr
, -ENOENT
);
9927 if (dstname
.length() == 0 || dstname
[0] == '_') {
9928 respond_to_request(mdr
, -EINVAL
);
9931 if (diri
->snaprealm
->exists(dstname
)) {
9932 respond_to_request(mdr
, -EEXIST
);
9936 snapid_t snapid
= diri
->snaprealm
->resolve_snapname(srcname
, diri
->ino());
9937 dout(10) << " snapname " << srcname
<< " is " << snapid
<< dendl
;
// Lock: snap rdlocks on ancestors, xlock this dir's snaplock -- same
// pattern as mksnap/rmsnap.
9940 MutationImpl::LockOpVec lov
;
9942 mds
->locker
->include_snap_rdlocks(diri
, lov
);
9943 lov
.erase_rdlock(&diri
->snaplock
);
9944 lov
.add_xlock(&diri
->snaplock
);
9946 if (!mds
->locker
->acquire_locks(mdr
, lov
))
9949 if (!check_access(mdr
, diri
, MAY_WRITE
|MAY_SNAPSHOT
))
// First pass: prepare the snap-table update, then retry this handler.
9953 if (!mdr
->more()->stid
) {
9954 mds
->snapclient
->prepare_update(diri
->ino(), snapid
, dstname
, utime_t(),
9956 new C_MDS_RetryRequest(mdcache
, mdr
));
9960 version_t stid
= mdr
->more()->stid
;
9961 dout(10) << " stid is " << stid
<< dendl
;
9963 ceph_assert(mds
->snapclient
->get_cached_version() >= stid
);
// Project inode: ctime/rctime bump and version.
9966 auto &pi
= diri
->project_inode(false, true);
9967 pi
.inode
.ctime
= mdr
->get_op_stamp();
9968 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
9969 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
9970 pi
.inode
.version
= diri
->pre_dirty();
9972 // project the snaprealm
9973 auto &newsnap
= *pi
.snapnode
;
9974 auto it
= newsnap
.snaps
.find(snapid
);
9975 ceph_assert(it
!= newsnap
.snaps
.end());
9976 it
->second
.name
= dstname
;
9978 // journal the inode changes
9979 mdr
->ls
= mdlog
->get_current_segment();
9980 EUpdate
*le
= new EUpdate(mdlog
, "renamesnap");
9981 mdlog
->start_entry(le
);
9983 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
9984 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
9985 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
9986 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
9988 // journal the snaprealm changes
9989 submit_mdlog_entry(le
, new C_MDS_renamesnap_finish(this, mdr
, diri
, snapid
),
// Journal-commit stage of renamesnap: apply the projected inode/realm
// change, commit the snap-table update, notify peers and clients, and
// reply with the dir inode as the trace target.
9994 void Server::_renamesnap_finish(MDRequestRef
& mdr
, CInode
*diri
, snapid_t snapid
)
9996 dout(10) << "_renamesnap_finish " << *mdr
<< " " << snapid
<< dendl
;
9998 diri
->pop_and_dirty_projected_inode(mdr
->ls
);
// Commit the prepared snap-table update transaction.
10001 mds
->snapclient
->commit(mdr
->more()->stid
, mdr
->ls
);
10003 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
10005 // notify other mds
10006 mdcache
->send_snap_update(diri
, mdr
->more()->stid
, CEPH_SNAP_OP_UPDATE
);
10008 mdcache
->do_realm_invalidate_and_update_notify(diri
, CEPH_SNAP_OP_UPDATE
);
// Reply, tracing the dir inode and the renamed snapid.
10012 mdr
->tracei
= diri
;
10013 mdr
->snapid
= snapid
;
10014 respond_to_request(mdr
, 0);
10018 * Return true if server is in state RECONNECT and this
10019 * client has not yet reconnected.
// Simple membership test against the reconnect gather set; the set is only
// populated during reconnect, so an empty set yields false in all other
// states.
10021 bool Server::waiting_for_reconnect(client_t c
) const
10023 return client_reconnect_gather
.count(c
) > 0;
// Dump the set of clients we are still waiting on during reconnect as a
// "reconnect_status" object on the given Formatter (for admin-socket /
// status introspection).
10026 void Server::dump_reconnect_status(Formatter
*f
) const
10028 f
->open_object_section("reconnect_status");
10029 f
->dump_stream("client_reconnect_gather") << client_reconnect_gather
;
10030 f
->close_section();