1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include <boost/lexical_cast.hpp>
16 #include "include/ceph_assert.h" // lexical_cast includes system assert.h
18 #include <boost/config/warning_disable.hpp>
19 #include <boost/fusion/include/std_pair.hpp>
20 #include <boost/range/adaptor/reversed.hpp>
28 #include "MDBalancer.h"
30 #include "SnapClient.h"
32 #include "cephfs_features.h"
34 #include "msg/Messenger.h"
36 #include "osdc/Objecter.h"
38 #include "events/EUpdate.h"
39 #include "events/ESlaveUpdate.h"
40 #include "events/ESession.h"
41 #include "events/EOpen.h"
42 #include "events/ECommitted.h"
43 #include "events/EPurged.h"
45 #include "include/stringify.h"
46 #include "include/filepath.h"
47 #include "common/errno.h"
48 #include "common/Timer.h"
49 #include "common/perf_counters.h"
50 #include "include/compat.h"
51 #include "osd/OSDMap.h"
58 #include <string_view>
60 #include "common/config.h"
62 #define dout_context g_ceph_context
63 #define dout_subsys ceph_subsys_mds
65 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
67 class ServerContext
: public MDSContext
{
70 MDSRank
*get_mds() override
76 explicit ServerContext(Server
*s
) : server(s
) {
77 ceph_assert(server
!= NULL
);
// Batch_Getattr_Lookup: batches concurrent getattr/lookup client requests
// behind a single "front" MDRequest (mdr); the batched requests are stored
// in mdr->batch_reqs and replied to / forwarded together.
// NOTE(review): extraction gaps — member declarations (server/mdcache) and
// several brace/guard lines from the original are missing from this view.
81 class Batch_Getattr_Lookup
: public BatchOp
{
// The front request that the batch rides on.
84 ceph::ref_t
<MDRequestImpl
> mdr
;
// Take ownership of the front request r for batching under server s.
88 Batch_Getattr_Lookup(Server
* s
, ceph::ref_t
<MDRequestImpl
> r
, MDCache
* mdc
) : server(s
), mdr(std::move(r
)), mdcache(mdc
) {}
// Append another client request to the in-flight batch.
89 void add_request(const ceph::ref_t
<MDRequestImpl
>& m
) override
{
90 mdr
->batch_reqs
.push_back(m
);
// Replace the batch's front request.
// NOTE(review): body of set_request is missing in this view.
92 void set_request(const ceph::ref_t
<MDRequestImpl
>& m
) override
{
// Forward the front request plus all batched requests to MDS rank t.
95 void _forward(mds_rank_t t
) override
{
96 mdcache
->mds
->forward_message_mds(mdr
->release_client_request(), t
);
97 mdr
->set_mds_stamp(ceph_clock_now());
98 for (auto& m
: mdr
->batch_reqs
) {
100 mdcache
->request_forward(m
, t
);
102 mdr
->batch_reqs
.clear();
// Reply to every batched request (propagating the front's trace inode /
// dentry) and finally reply to the front request itself with result r.
104 void _respond(int r
) override
{
105 mdr
->set_mds_stamp(ceph_clock_now());
106 for (auto& m
: mdr
->batch_reqs
) {
108 m
->tracei
= mdr
->tracei
;
109 m
->tracedn
= mdr
->tracedn
;
110 server
->respond_to_request(m
, r
);
113 mdr
->batch_reqs
.clear();
114 server
->reply_client_request(mdr
, make_message
<MClientReply
>(*mdr
->client_request
, r
));
// Debug representation for log output.
116 void print(std::ostream
& o
) {
117 o
<< "[batch front=" << *mdr
<< "]";
// ServerLogContext: journal-completion context bound to a Server and an
// optional MDRequest; pre_finish() marks the request's journal-commit event
// before the derived finish() runs.
// NOTE(review): extraction gaps — member declarations and the get_mds()
// body are missing from this view.
121 class ServerLogContext
: public MDSLogContextBase
{
124 MDSRank
*get_mds() override
// Record the journal commit on the tracked request (if any) before finish.
130 void pre_finish(int r
) override
{
132 mdr
->mark_event("journal_committed: ");
// Construct without a tracked request; Server must be non-null.
135 explicit ServerLogContext(Server
*s
) : server(s
) {
136 ceph_assert(server
!= NULL
);
// Construct with a tracked MDRequest.
138 explicit ServerLogContext(Server
*s
, MDRequestRef
& r
) : server(s
), mdr(r
) {
139 ceph_assert(server
!= NULL
);
143 void Server::create_logger()
145 PerfCountersBuilder
plb(g_ceph_context
, "mds_server", l_mdss_first
, l_mdss_last
);
147 plb
.add_u64_counter(l_mdss_handle_client_request
, "handle_client_request",
148 "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING
);
149 plb
.add_u64_counter(l_mdss_handle_slave_request
, "handle_slave_request",
150 "Slave requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING
);
151 plb
.add_u64_counter(l_mdss_handle_client_session
,
152 "handle_client_session", "Client session messages", "hcs",
153 PerfCountersBuilder::PRIO_INTERESTING
);
154 plb
.add_u64_counter(l_mdss_cap_revoke_eviction
, "cap_revoke_eviction",
155 "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING
);
157 // fop latencies are useful
158 plb
.set_prio_default(PerfCountersBuilder::PRIO_USEFUL
);
159 plb
.add_time_avg(l_mdss_req_lookuphash_latency
, "req_lookuphash_latency",
160 "Request type lookup hash of inode latency");
161 plb
.add_time_avg(l_mdss_req_lookupino_latency
, "req_lookupino_latency",
162 "Request type lookup inode latency");
163 plb
.add_time_avg(l_mdss_req_lookupparent_latency
, "req_lookupparent_latency",
164 "Request type lookup parent latency");
165 plb
.add_time_avg(l_mdss_req_lookupname_latency
, "req_lookupname_latency",
166 "Request type lookup name latency");
167 plb
.add_time_avg(l_mdss_req_lookup_latency
, "req_lookup_latency",
168 "Request type lookup latency");
169 plb
.add_time_avg(l_mdss_req_lookupsnap_latency
, "req_lookupsnap_latency",
170 "Request type lookup snapshot latency");
171 plb
.add_time_avg(l_mdss_req_getattr_latency
, "req_getattr_latency",
172 "Request type get attribute latency");
173 plb
.add_time_avg(l_mdss_req_setattr_latency
, "req_setattr_latency",
174 "Request type set attribute latency");
175 plb
.add_time_avg(l_mdss_req_setlayout_latency
, "req_setlayout_latency",
176 "Request type set file layout latency");
177 plb
.add_time_avg(l_mdss_req_setdirlayout_latency
, "req_setdirlayout_latency",
178 "Request type set directory layout latency");
179 plb
.add_time_avg(l_mdss_req_setxattr_latency
, "req_setxattr_latency",
180 "Request type set extended attribute latency");
181 plb
.add_time_avg(l_mdss_req_rmxattr_latency
, "req_rmxattr_latency",
182 "Request type remove extended attribute latency");
183 plb
.add_time_avg(l_mdss_req_readdir_latency
, "req_readdir_latency",
184 "Request type read directory latency");
185 plb
.add_time_avg(l_mdss_req_setfilelock_latency
, "req_setfilelock_latency",
186 "Request type set file lock latency");
187 plb
.add_time_avg(l_mdss_req_getfilelock_latency
, "req_getfilelock_latency",
188 "Request type get file lock latency");
189 plb
.add_time_avg(l_mdss_req_create_latency
, "req_create_latency",
190 "Request type create latency");
191 plb
.add_time_avg(l_mdss_req_open_latency
, "req_open_latency",
192 "Request type open latency");
193 plb
.add_time_avg(l_mdss_req_mknod_latency
, "req_mknod_latency",
194 "Request type make node latency");
195 plb
.add_time_avg(l_mdss_req_link_latency
, "req_link_latency",
196 "Request type link latency");
197 plb
.add_time_avg(l_mdss_req_unlink_latency
, "req_unlink_latency",
198 "Request type unlink latency");
199 plb
.add_time_avg(l_mdss_req_rmdir_latency
, "req_rmdir_latency",
200 "Request type remove directory latency");
201 plb
.add_time_avg(l_mdss_req_rename_latency
, "req_rename_latency",
202 "Request type rename latency");
203 plb
.add_time_avg(l_mdss_req_mkdir_latency
, "req_mkdir_latency",
204 "Request type make directory latency");
205 plb
.add_time_avg(l_mdss_req_symlink_latency
, "req_symlink_latency",
206 "Request type symbolic link latency");
207 plb
.add_time_avg(l_mdss_req_lssnap_latency
, "req_lssnap_latency",
208 "Request type list snapshot latency");
209 plb
.add_time_avg(l_mdss_req_mksnap_latency
, "req_mksnap_latency",
210 "Request type make snapshot latency");
211 plb
.add_time_avg(l_mdss_req_rmsnap_latency
, "req_rmsnap_latency",
212 "Request type remove snapshot latency");
213 plb
.add_time_avg(l_mdss_req_renamesnap_latency
, "req_renamesnap_latency",
214 "Request type rename snapshot latency");
216 plb
.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY
);
217 plb
.add_u64_counter(l_mdss_dispatch_client_request
, "dispatch_client_request",
218 "Client requests dispatched");
219 plb
.add_u64_counter(l_mdss_dispatch_slave_request
, "dispatch_server_request",
220 "Server requests dispatched");
222 logger
= plb
.create_perf_counters();
223 g_ceph_context
->get_perfcounters_collection()->add(logger
);
226 Server::Server(MDSRank
*m
) :
228 mdcache(mds
->mdcache
), mdlog(mds
->mdlog
),
229 recall_throttle(g_conf().get_val
<double>("mds_recall_max_decay_rate"))
231 replay_unsafe_with_closed_session
= g_conf().get_val
<bool>("mds_replay_unsafe_with_closed_session");
232 cap_revoke_eviction_timeout
= g_conf().get_val
<double>("mds_cap_revoke_eviction_timeout");
233 max_snaps_per_dir
= g_conf().get_val
<uint64_t>("mds_max_snaps_per_dir");
234 delegate_inos_pct
= g_conf().get_val
<uint64_t>("mds_client_delegate_inos_pct");
235 supported_features
= feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED
);
// Top-level message dispatcher for the Server: routes reconnects
// immediately, defers/queues client requests while the MDS is not yet
// active (reconnect/clientreplay handling), then fans the remaining
// message types out to their handlers.
// NOTE(review): extraction gaps — several return/break/brace lines from
// the original are missing from this view.
238 void Server::dispatch(const cref_t
<Message
> &m
)
240 switch (m
->get_type()) {
241 case CEPH_MSG_CLIENT_RECONNECT
:
242 handle_client_reconnect(ref_cast
<MClientReconnect
>(m
));
247 *In reconnect phase, client sent unsafe requests to mds before reconnect msg. Setting sessionclosed_isok will handle scenario like this:
249 1. In reconnect phase, client sent unsafe requests to mds.
250 2. It reached reconnect timeout. All sessions without sending reconnect msg in time, some of which may had sent unsafe requests, are marked as closed.
251 (Another situation is #31668, which will deny all client reconnect msg to speed up reboot).
252 3.So these unsafe request from session without sending reconnect msg in time or being denied could be handled in clientreplay phase.
255 bool sessionclosed_isok
= replay_unsafe_with_closed_session
;
257 // handle_slave_request()/handle_client_session() will wait if necessary
258 if (m
->get_type() == CEPH_MSG_CLIENT_REQUEST
&& !mds
->is_active()) {
259 const auto &req
= ref_cast
<MClientRequest
>(m
);
260 if (mds
->is_reconnect() || mds
->get_want_state() == CEPH_MDS_STATE_RECONNECT
) {
261 Session
*session
= mds
->get_session(req
);
// Drop requests from closed sessions unless replay-with-closed is allowed.
262 if (!session
|| (!session
->is_open() && !sessionclosed_isok
)) {
263 dout(5) << "session is closed, dropping " << req
->get_reqid() << dendl
;
266 bool queue_replay
= false;
267 if (req
->is_replay() || req
->is_async()) {
268 dout(3) << "queuing replayed op" << dendl
;
271 !session
->have_completed_request(req
->get_reqid().tid
, nullptr)) {
272 mdcache
->add_replay_ino_alloc(inodeno_t(req
->head
.ino
));
274 } else if (req
->get_retry_attempt()) {
275 // process completed request in clientreplay stage. The completed request
276 // might have created new file/directorie. This guarantees MDS sends a reply
277 // to client before other request modifies the new file/directorie.
278 if (session
->have_completed_request(req
->get_reqid().tid
, NULL
)) {
279 dout(3) << "queuing completed op" << dendl
;
282 // this request was created before the cap reconnect message, drop any embedded
284 req
->releases
.clear();
287 req
->mark_queued_for_replay();
288 mds
->enqueue_replay(new C_MDS_RetryMessage(mds
, m
));
// Decide whether to park the request until the MDS becomes active.
293 bool wait_for_active
= true;
294 if (mds
->is_stopping()) {
295 wait_for_active
= false;
296 } else if (mds
->is_clientreplay()) {
297 if (req
->is_queued_for_replay()) {
298 wait_for_active
= false;
301 if (wait_for_active
) {
302 dout(3) << "not active yet, waiting" << dendl
;
303 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
// Fan-out to per-type handlers.
308 switch (m
->get_type()) {
309 case CEPH_MSG_CLIENT_SESSION
:
310 handle_client_session(ref_cast
<MClientSession
>(m
));
312 case CEPH_MSG_CLIENT_REQUEST
:
313 handle_client_request(ref_cast
<MClientRequest
>(m
));
315 case CEPH_MSG_CLIENT_RECLAIM
:
316 handle_client_reclaim(ref_cast
<MClientReclaim
>(m
));
318 case MSG_MDS_SLAVE_REQUEST
:
319 handle_slave_request(ref_cast
<MMDSSlaveRequest
>(m
));
322 derr
<< "server unknown message " << m
->get_type() << dendl
;
323 ceph_abort_msg("server unknown message");
329 // ----------------------------------------------------------
330 // SESSION management
// C_MDS_session_finish: journal-completion context for session open/close;
// when the ESession entry commits, finish() calls back into
// Server::_session_logged with the recorded state (sequence, open flag,
// sessionmap/inotable versions, released and to-purge ino ranges).
// NOTE(review): extraction gaps — member declarations (session, state_seq,
// open, cmapv, inotablev, fin) are missing from this view.
332 class C_MDS_session_finish
: public ServerLogContext
{
// Prealloc inos being released back to the inotable.
337 interval_set
<inodeno_t
> inos
;
// Inos whose on-disk data must be purged after the close commits.
339 interval_set
<inodeno_t
> purge_inos
;
340 LogSegment
*ls
= nullptr;
// Variant: no ino release (inotable untouched).
343 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
, Context
*fin_
= NULL
) :
344 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
), inotablev(0), fin(fin_
) { }
// Variant: release the given ino range at inotable version iv.
345 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
, interval_set
<inodeno_t
> i
, version_t iv
, Context
*fin_
= NULL
) :
346 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
), inos(std::move(i
)), inotablev(iv
), fin(fin_
) { }
// Variant: additionally purge inodes recorded against log segment _ls.
347 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
, interval_set
<inodeno_t
> i
, version_t iv
,
348 interval_set
<inodeno_t
> _purge_inos
, LogSegment
*_ls
, Context
*fin_
= NULL
) :
349 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
), inos(std::move(i
)), inotablev(iv
), purge_inos(std::move(_purge_inos
)), ls(_ls
), fin(fin_
){}
350 void finish(int r
) override
{
352 server
->_session_logged(session
, state_seq
, open
, cmapv
, inos
, inotablev
, purge_inos
, ls
);
// Find the session whose client metadata "uuid" entry matches the given
// uuid.  When multiple sessions carry the same uuid (reclaim in progress),
// the asserts below check the reclaiming_from linkage between them.
// NOTE(review): extraction gaps — the continue/assignment branches of the
// loop body and the final return are missing from this view.
359 Session
* Server::find_session_by_uuid(std::string_view uuid
)
361 Session
* session
= nullptr;
362 for (auto& it
: mds
->sessionmap
.get_sessions()) {
363 auto& metadata
= it
.second
->info
.client_metadata
;
365 auto p
= metadata
.find("uuid");
366 if (p
== metadata
.end() || p
->second
!= uuid
)
371 } else if (!session
->reclaiming_from
) {
372 assert(it
.second
->reclaiming_from
== session
);
375 assert(session
->reclaiming_from
== it
.second
);
// Handle a client reclaim request: validate the message (uuid present,
// only CEPH_RECLAIM_RESET supported, auth_name of reclaimer matches the
// target session), link session->reclaiming_from to the target, report
// the target's address back, and for RESET immediately finish the reclaim.
// NOTE(review): extraction gaps — early returns and the null-target branch
// are missing from this view.
381 void Server::reclaim_session(Session
*session
, const cref_t
<MClientReclaim
> &m
)
383 if (!session
->is_open() && !session
->is_stale()) {
384 dout(10) << "session not open, dropping this req" << dendl
;
388 auto reply
= make_message
<MClientReclaimReply
>(0);
// Reject messages without a uuid.
389 if (m
->get_uuid().empty()) {
390 dout(10) << __func__
<< " invalid message (no uuid)" << dendl
;
391 reply
->set_result(-EINVAL
);
392 mds
->send_message_client(reply
, session
);
396 unsigned flags
= m
->get_flags();
397 if (flags
!= CEPH_RECLAIM_RESET
) { // currently only support reset
398 dout(10) << __func__
<< " unsupported flags" << dendl
;
399 reply
->set_result(-EOPNOTSUPP
);
400 mds
->send_message_client(reply
, session
);
404 Session
* target
= find_session_by_uuid(m
->get_uuid());
// Reclaiming client must authenticate as the same entity as the target.
406 if (session
->info
.auth_name
!= target
->info
.auth_name
) {
407 dout(10) << __func__
<< " session auth_name " << session
->info
.auth_name
408 << " != target auth_name " << target
->info
.auth_name
<< dendl
;
409 reply
->set_result(-EPERM
);
410 mds
->send_message_client(reply
, session
);
413 assert(!target
->reclaiming_from
);
414 assert(!session
->reclaiming_from
);
415 session
->reclaiming_from
= target
;
416 reply
->set_addrs(entity_addrvec_t(target
->info
.inst
.addr
));
419 if (flags
& CEPH_RECLAIM_RESET
) {
420 finish_reclaim_session(session
, reply
);
// Complete a reclaim: detach session->reclaiming_from, and if there was a
// target session, evict/kill it (honoring mds_session_blacklist_on_evict),
// deferring the reply to the reclaiming client until the target is gone;
// otherwise reply immediately.
// NOTE(review): extraction gaps — the target null-check branching and the
// declaration of send_reply are missing from this view.
427 void Server::finish_reclaim_session(Session
*session
, const ref_t
<MClientReclaimReply
> &reply
)
429 Session
*target
= session
->reclaiming_from
;
431 session
->reclaiming_from
= nullptr;
// Deferred-reply path: look the session up again by id, since the
// original pointer may be invalid by the time the eviction finishes.
435 int64_t session_id
= session
->get_client().v
;
436 send_reply
= new LambdaContext([this, session_id
, reply
](int r
) {
437 assert(ceph_mutex_is_locked_by_me(mds
->mds_lock
));
438 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(session_id
));
442 auto epoch
= mds
->objecter
->with_osdmap([](const OSDMap
&map
){ return map
.get_epoch(); });
443 reply
->set_epoch(epoch
);
444 mds
->send_message_client(reply
, session
);
447 send_reply
= nullptr;
// If the target is already blacklisted (or blacklist-on-evict is off),
// a plain kill suffices; otherwise evict it (which blacklists).
450 bool blacklisted
= mds
->objecter
->with_osdmap([target
](const OSDMap
&map
) {
451 return map
.is_blacklisted(target
->info
.inst
.addr
);
454 if (blacklisted
|| !g_conf()->mds_session_blacklist_on_evict
) {
455 kill_session(target
, send_reply
);
457 std::stringstream ss
;
458 mds
->evict_client(target
->get_client().v
, false, true, ss
, send_reply
);
461 mds
->send_message_client(reply
, session
);
// Entry point for MClientReclaim messages: drop sessionless messages,
// defer until at least clientreplay, then either finish an in-progress
// reclaim (FLAG_FINISH) or start one.
// NOTE(review): extraction gaps — the sessionless-return and the
// early-return after wait_for_replay are missing from this view.
465 void Server::handle_client_reclaim(const cref_t
<MClientReclaim
> &m
)
467 Session
*session
= mds
->get_session(m
);
468 dout(3) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
469 assert(m
->get_source().is_client()); // should _not_ come from an mds!
472 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
// Too early in startup: retry once replay has progressed.
476 if (mds
->get_state() < MDSMap::STATE_CLIENTREPLAY
) {
477 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
481 if (m
->get_flags() & MClientReclaim::FLAG_FINISH
) {
482 finish_reclaim_session(session
);
484 reclaim_session(session
, m
);
// Handle MClientSession ops: OPEN (validate blacklist/features/root
// caps/uuid, journal an ESession via C_MDS_session_finish), RENEWCAPS,
// CLOSE (push-seq checked, then journal the close), FLUSHMSG_ACK and
// FLUSH_MDLOG.  Sessionless messages are rejected up front.
// NOTE(review): extraction gaps — many break/return/brace lines from the
// original are missing from this view.
488 void Server::handle_client_session(const cref_t
<MClientSession
> &m
)
491 Session
*session
= mds
->get_session(m
);
493 dout(3) << "handle_client_session " << *m
<< " from " << m
->get_source() << dendl
;
494 ceph_assert(m
->get_source().is_client()); // should _not_ come from an mds!
497 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
498 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
499 reply
->metadata
["error_string"] = "sessionless";
500 mds
->send_message(reply
, m
->get_connection());
// Gate ops on MDS state: renewcaps always, close needs active,
// everything else needs at least clientreplay.
504 if (m
->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS
) {
505 // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
506 } else if (m
->get_op() == CEPH_SESSION_REQUEST_CLOSE
) {
507 // close requests need to be handled when mds is active
508 if (mds
->get_state() < MDSMap::STATE_ACTIVE
) {
509 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
513 if (mds
->get_state() < MDSMap::STATE_CLIENTREPLAY
) {
514 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
520 logger
->inc(l_mdss_handle_client_session
);
523 switch (m
->get_op()) {
524 case CEPH_SESSION_REQUEST_OPEN
:
// Duplicate/competing opens are dropped.
525 if (session
->is_opening() ||
526 session
->is_open() ||
527 session
->is_stale() ||
528 session
->is_killing() ||
529 terminating_sessions
) {
530 dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl
;
533 ceph_assert(session
->is_closed() || session
->is_closing());
535 if (mds
->is_stopping()) {
536 dout(10) << "mds is stopping, dropping open req" << dendl
;
541 auto& addr
= session
->info
.inst
.addr
;
542 session
->set_client_metadata(client_metadata_t(m
->metadata
, m
->supported_features
, m
->metric_spec
));
543 auto& client_metadata
= session
->info
.client_metadata
;
// Helper: log a one-line summary (addr, elapsed, throttle, status) of
// this session-open attempt at debug level 2.
545 auto log_session_status
= [this, m
, session
](std::string_view status
, std::string_view err
) {
546 auto now
= ceph_clock_now();
547 auto throttle_elapsed
= m
->get_recv_complete_stamp() - m
->get_throttle_stamp();
548 auto elapsed
= now
- m
->get_recv_stamp();
549 CachedStackStringStream css
;
550 *css
<< "New client session:"
551 << " addr=\"" << session
->info
.inst
.addr
<< "\""
552 << ",elapsed=" << elapsed
553 << ",throttled=" << throttle_elapsed
554 << ",status=\"" << status
<< "\"";
556 *css
<< ",error=\"" << err
<< "\"";
558 const auto& metadata
= session
->info
.client_metadata
;
559 if (auto it
= metadata
.find("root"); it
!= metadata
.end()) {
560 *css
<< ",root=\"" << it
->second
<< "\"";
562 dout(2) << css
->strv() << dendl
;
// Helper: send a REJECT back to the client (with error string only for
// clients that understand it) and log the rejection.
565 auto send_reject_message
= [this, &session
, &log_session_status
](std::string_view err_str
) {
566 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
567 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
))
568 m
->metadata
["error_string"] = err_str
;
569 mds
->send_message_client(m
, session
);
570 log_session_status("REJECTED", err_str
);
573 bool blacklisted
= mds
->objecter
->with_osdmap(
574 [&addr
](const OSDMap
&osd_map
) -> bool {
575 return osd_map
.is_blacklisted(addr
);
579 dout(10) << "rejecting blacklisted client " << addr
<< dendl
;
580 send_reject_message("blacklisted");
585 if (client_metadata
.features
.empty())
586 infer_supported_features(session
, client_metadata
);
588 dout(20) << __func__
<< " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl
;
589 dout(20) << " features: '" << client_metadata
.features
<< "'" << dendl
;
590 dout(20) << " metric specification: [" << client_metadata
.metric_spec
<< "]" << dendl
;
591 for (const auto& p
: client_metadata
) {
592 dout(20) << " " << p
.first
<< ": " << p
.second
<< dendl
;
// Enforce the MDS's required client feature set.
595 feature_bitset_t missing_features
= required_client_features
;
596 missing_features
-= client_metadata
.features
;
597 if (!missing_features
.empty()) {
599 ss
<< "missing required features '" << missing_features
<< "'";
600 send_reject_message(ss
.str());
601 mds
->clog
->warn() << "client session (" << session
->info
.inst
602 << ") lacks required features " << missing_features
603 << "; client supports " << client_metadata
.features
;
608 // Special case for the 'root' metadata path; validate that the claimed
609 // root is actually within the caps of the session
610 if (auto it
= client_metadata
.find("root"); it
!= client_metadata
.end()) {
611 auto claimed_root
= it
->second
;
614 // claimed_root has a leading "/" which we strip before passing
616 if (claimed_root
.empty() || claimed_root
[0] != '/') {
618 ss
<< "invalue root '" << claimed_root
<< "'";
619 } else if (!session
->auth_caps
.path_capable(claimed_root
.substr(1))) {
621 ss
<< "non-allowable root '" << claimed_root
<< "'";
625 // Tell the client we're rejecting their open
626 send_reject_message(ss
.str());
627 mds
->clog
->warn() << "client session with " << ss
.str()
628 << " denied (" << session
->info
.inst
<< ")";
// Reject duplicate session uuids (would confuse reclaim).
634 if (auto it
= client_metadata
.find("uuid"); it
!= client_metadata
.end()) {
635 if (find_session_by_uuid(it
->second
)) {
636 send_reject_message("duplicated session uuid");
637 mds
->clog
->warn() << "client session with duplicated session uuid '"
638 << it
->second
<< "' denied (" << session
->info
.inst
<< ")";
644 if (session
->is_closed())
645 mds
->sessionmap
.add_session(session
);
// Project the sessionmap change and journal the open via ESession.
647 pv
= mds
->sessionmap
.mark_projected(session
);
648 sseq
= mds
->sessionmap
.set_state(session
, Session::STATE_OPENING
);
649 mds
->sessionmap
.touch_session(session
);
650 auto fin
= new LambdaContext([log_session_status
= std::move(log_session_status
)](int r
){
652 log_session_status("ACCEPTED", "");
654 mdlog
->start_submit_entry(new ESession(m
->get_source_inst(), true, pv
, client_metadata
),
655 new C_MDS_session_finish(this, session
, sseq
, true, pv
, fin
));
660 case CEPH_SESSION_REQUEST_RENEWCAPS
:
661 if (session
->is_open() || session
->is_stale()) {
662 mds
->sessionmap
.touch_session(session
);
// A stale session renewing caps comes back to OPEN with caps resumed.
663 if (session
->is_stale()) {
664 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
665 mds
->locker
->resume_stale_caps(session
);
666 mds
->sessionmap
.touch_session(session
);
668 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_RENEWCAPS
, m
->get_seq());
669 mds
->send_message_client(reply
, session
);
671 dout(10) << "ignoring renewcaps on non open|stale session (" << session
->get_state_name() << ")" << dendl
;
675 case CEPH_SESSION_REQUEST_CLOSE
:
677 if (session
->is_closed() ||
678 session
->is_closing() ||
679 session
->is_killing()) {
680 dout(10) << "already closed|closing|killing, dropping this req" << dendl
;
683 if (session
->is_importing()) {
684 dout(10) << "ignoring close req on importing session" << dendl
;
687 ceph_assert(session
->is_open() ||
688 session
->is_stale() ||
689 session
->is_opening());
// The close must carry the current push seq; stale ones are dropped.
690 if (m
->get_seq() < session
->get_push_seq()) {
691 dout(10) << "old push seq " << m
->get_seq() << " < " << session
->get_push_seq()
692 << ", dropping" << dendl
;
695 // We are getting a seq that is higher than expected.
696 // Handle the same as any other seqn error.
698 if (m
->get_seq() != session
->get_push_seq()) {
699 dout(0) << "old push seq " << m
->get_seq() << " != " << session
->get_push_seq()
700 << ", BUGGY!" << dendl
;
701 mds
->clog
->warn() << "incorrect push seq " << m
->get_seq() << " != "
702 << session
->get_push_seq() << ", dropping" << " from client : " << session
->get_human_name();
705 journal_close_session(session
, Session::STATE_CLOSING
, NULL
);
709 case CEPH_SESSION_FLUSHMSG_ACK
:
710 finish_flush_session(session
, m
->get_seq());
713 case CEPH_SESSION_REQUEST_FLUSH_MDLOG
:
714 if (mds
->is_active())
724 void Server::flush_session(Session
*session
, MDSGatherBuilder
*gather
) {
725 if (!session
->is_open() ||
726 !session
->get_connection() ||
727 !session
->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER
)) {
731 version_t seq
= session
->wait_for_flush(gather
->new_sub());
732 mds
->send_message_client(
733 make_message
<MClientSession
>(CEPH_SESSION_FLUSHMSG
, seq
), session
);
736 void Server::flush_client_sessions(set
<client_t
>& client_set
, MDSGatherBuilder
& gather
)
738 for (set
<client_t
>::iterator p
= client_set
.begin(); p
!= client_set
.end(); ++p
) {
739 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(p
->v
));
740 ceph_assert(session
);
741 flush_session(session
, &gather
);
745 void Server::finish_flush_session(Session
*session
, version_t seq
)
747 MDSContext::vec finished
;
748 session
->finish_flush(seq
, finished
);
749 mds
->queue_waiters(finished
);
// Journal-commit callback for session open/close (invoked from
// C_MDS_session_finish::finish).  Applies the logged state: schedules
// inode purges, releases preallocated inos back to the inotable, marks
// the sessionmap dirty, then finishes the open (send OPEN reply) or the
// close/kill (tear down caps, leases, reconnect/reclaim gather state,
// and the connection).
// NOTE(review): extraction gaps — several brace/else/return lines from
// the original are missing from this view.
752 void Server::_session_logged(Session
*session
, uint64_t state_seq
, bool open
, version_t pv
,
753 const interval_set
<inodeno_t
>& inos
, version_t piv
,
754 const interval_set
<inodeno_t
>& purge_inos
, LogSegment
*ls
)
756 dout(10) << "_session_logged " << session
->info
.inst
757 << " state_seq " << state_seq
758 << " " << (open
? "open":"close")
760 << " purge_inos : " << purge_inos
<< dendl
;
763 dout(10) << "_session_logged seq : " << ls
->seq
<< dendl
;
// Record and kick off purges of inodes whose unlink was unsafe-replied.
764 if (purge_inos
.size()){
765 ls
->purge_inodes
.insert(purge_inos
);
766 mdcache
->purge_inodes(purge_inos
, ls
);
// Release unused preallocated inos back to the inotable on close.
771 ceph_assert(session
->is_closing() || session
->is_killing() ||
772 session
->is_opening()); // re-open closing session
773 session
->info
.prealloc_inos
.subtract(inos
);
774 session
->delegated_inos
.clear();
775 mds
->inotable
->apply_release_ids(inos
);
776 ceph_assert(mds
->inotable
->get_version() == piv
);
779 mds
->sessionmap
.mark_dirty(session
);
// If the session state moved on while we were journaling, do nothing.
782 if (session
->get_state_seq() != state_seq
) {
783 dout(10) << " journaled state_seq " << state_seq
<< " != current " << session
->get_state_seq()
784 << ", noop" << dendl
;
785 // close must have been canceled (by an import?), or any number of other things..
// Open path: move OPENING -> OPEN and tell the client.
787 ceph_assert(session
->is_opening());
788 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
789 mds
->sessionmap
.touch_session(session
);
790 ceph_assert(session
->get_connection());
791 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_OPEN
);
792 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
))
793 reply
->supported_features
= supported_features
;
794 mds
->send_message_client(reply
, session
);
795 if (mdcache
->is_readonly()) {
796 auto m
= make_message
<MClientSession
>(CEPH_SESSION_FORCE_RO
);
797 mds
->send_message_client(m
, session
);
799 } else if (session
->is_closing() ||
800 session
->is_killing()) {
801 // kill any lingering capabilities, leases, requests
802 while (!session
->caps
.empty()) {
803 Capability
*cap
= session
->caps
.front();
804 CInode
*in
= cap
->get_inode();
805 dout(20) << " killing capability " << ccap_string(cap
->issued()) << " on " << *in
<< dendl
;
806 mds
->locker
->remove_client_cap(in
, cap
, true);
808 while (!session
->leases
.empty()) {
809 ClientLease
*r
= session
->leases
.front();
810 CDentry
*dn
= static_cast<CDentry
*>(r
->parent
);
811 dout(20) << " killing client lease of " << *dn
<< dendl
;
812 dn
->remove_client_lease(r
, mds
->locker
);
// Drop the client from any pending reconnect/reclaim gathers, and
// finish those phases if this was the last straggler.
814 if (client_reconnect_gather
.erase(session
->info
.get_client())) {
815 dout(20) << " removing client from reconnect set" << dendl
;
816 if (client_reconnect_gather
.empty()) {
817 dout(7) << " client " << session
->info
.inst
<< " was last reconnect, finishing" << dendl
;
818 reconnect_gather_finish();
821 if (client_reclaim_gather
.erase(session
->info
.get_client())) {
822 dout(20) << " removing client from reclaim set" << dendl
;
823 if (client_reclaim_gather
.empty()) {
824 dout(7) << " client " << session
->info
.inst
<< " was last reclaimed, finishing" << dendl
;
825 mds
->maybe_clientreplay_done();
829 if (session
->is_closing()) {
830 // mark con disposable. if there is a fault, we will get a
831 // reset and clean it up. if the client hasn't received the
832 // CLOSE message yet, they will reconnect and get an
833 // ms_handle_remote_reset() and realize they had in fact closed.
834 // do this *before* sending the message to avoid a possible
836 if (session
->get_connection()) {
837 // Conditional because terminate_sessions will indiscrimately
838 // put sessions in CLOSING whether they ever had a conn or not.
839 session
->get_connection()->mark_disposable();
843 mds
->send_message_client(make_message
<MClientSession
>(CEPH_SESSION_CLOSE
), session
);
844 mds
->sessionmap
.set_state(session
, Session::STATE_CLOSED
);
846 mds
->sessionmap
.remove_session(session
);
847 } else if (session
->is_killing()) {
848 // destroy session, close connection
849 if (session
->get_connection()) {
850 session
->get_connection()->mark_down();
851 mds
->sessionmap
.set_state(session
, Session::STATE_CLOSED
);
852 session
->set_connection(nullptr);
854 mds
->sessionmap
.remove_session(session
);
864 * Inject sessions from some source other than actual connections.
867 * - sessions inferred from journal replay
868 * - sessions learned from other MDSs during rejoin
869 * - sessions learned from other MDSs during dir/caps migration
870 * - sessions learned from other MDSs during a cross-MDS rename
// Force-open sessions for the given clients (skipping blacklisted ones),
// projecting sessionmap changes and recording (session, state_seq) pairs
// in smap; each session's importing count is bumped and must be balanced
// by finish_force_open_sessions().  Returns the projected sessionmap
// version.
// NOTE(review): extraction gaps — erase-from-cm, lambda close, and
// several brace lines from the original are missing from this view.
872 version_t
Server::prepare_force_open_sessions(map
<client_t
,entity_inst_t
>& cm
,
873 map
<client_t
,client_metadata_t
>& cmm
,
874 map
<client_t
, pair
<Session
*,uint64_t> >& smap
)
876 version_t pv
= mds
->sessionmap
.get_projected();
878 dout(10) << "prepare_force_open_sessions " << pv
879 << " on " << cm
.size() << " clients"
// Filter out clients whose address is blacklisted in the current OSDMap.
882 mds
->objecter
->with_osdmap(
883 [this, &cm
, &cmm
](const OSDMap
&osd_map
) {
884 for (auto p
= cm
.begin(); p
!= cm
.end(); ) {
885 if (osd_map
.is_blacklisted(p
->second
.addr
)) {
886 dout(10) << " ignoring blacklisted client." << p
->first
887 << " (" << p
->second
.addr
<< ")" << dendl
;
// Create-or-fetch each remaining session and project it to OPENING.
896 for (map
<client_t
,entity_inst_t
>::iterator p
= cm
.begin(); p
!= cm
.end(); ++p
) {
897 Session
*session
= mds
->sessionmap
.get_or_add_session(p
->second
);
898 pv
= mds
->sessionmap
.mark_projected(session
);
900 if (session
->is_closed() ||
901 session
->is_closing() ||
902 session
->is_killing()) {
903 sseq
= mds
->sessionmap
.set_state(session
, Session::STATE_OPENING
);
904 auto q
= cmm
.find(p
->first
);
906 session
->info
.client_metadata
.merge(q
->second
);
908 ceph_assert(session
->is_open() ||
909 session
->is_opening() ||
910 session
->is_stale());
913 smap
[p
->first
] = make_pair(session
, sseq
);
914 session
->inc_importing();
// Complete the force-open started by prepare_force_open_sessions(): for
// each (session, state_seq) pair whose seq is unchanged, move the session
// to OPEN and notify the client; drop the importing refcount and mark the
// sessionmap dirty.
// NOTE(review): extraction gaps — the second parameter of the signature
// and several brace/else lines from the original are missing from this
// view.
919 void Server::finish_force_open_sessions(const map
<client_t
,pair
<Session
*,uint64_t> >& smap
,
923 * FIXME: need to carefully consider the race conditions between a
924 * client trying to close a session and an MDS doing an import
925 * trying to force open a session...
927 dout(10) << "finish_force_open_sessions on " << smap
.size() << " clients,"
928 << " initial v " << mds
->sessionmap
.get_version() << dendl
;
930 for (auto &it
: smap
) {
931 Session
*session
= it
.second
.first
;
932 uint64_t sseq
= it
.second
.second
;
// Skip sessions whose state changed since prepare (e.g. client close).
934 if (session
->get_state_seq() != sseq
) {
935 dout(10) << "force_open_sessions skipping changed " << session
->info
.inst
<< dendl
;
937 dout(10) << "force_open_sessions opened " << session
->info
.inst
<< dendl
;
938 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
939 mds
->sessionmap
.touch_session(session
);
941 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_OPEN
);
942 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
))
943 reply
->supported_features
= supported_features
;
944 mds
->send_message_client(reply
, session
);
946 if (mdcache
->is_readonly())
947 mds
->send_message_client(make_message
<MClientSession
>(CEPH_SESSION_FORCE_RO
), session
);
950 dout(10) << "force_open_sessions skipping already-open " << session
->info
.inst
<< dendl
;
951 ceph_assert(session
->is_open() || session
->is_stale());
955 session
->dec_importing();
958 mds
->sessionmap
.mark_dirty(session
);
961 dout(10) << __func__
<< ": final v " << mds
->sessionmap
.get_version() << dendl
;
964 class C_MDS_TerminatedSessions
: public ServerContext
{
965 void finish(int r
) override
{
966 server
->terminating_sessions
= false;
969 explicit C_MDS_TerminatedSessions(Server
*s
) : ServerContext(s
) {}
972 void Server::terminate_sessions()
974 dout(5) << "terminating all sessions..." << dendl
;
976 terminating_sessions
= true;
978 // kill them off. clients will retry etc.
979 set
<Session
*> sessions
;
980 mds
->sessionmap
.get_client_session_set(sessions
);
981 for (set
<Session
*>::const_iterator p
= sessions
.begin();
984 Session
*session
= *p
;
985 if (session
->is_closing() ||
986 session
->is_killing() ||
987 session
->is_closed())
989 journal_close_session(session
, Session::STATE_CLOSING
, NULL
);
992 mdlog
->wait_for_safe(new C_MDS_TerminatedSessions(this));
996 void Server::find_idle_sessions()
998 auto now
= clock::now();
999 auto last_cleared_laggy
= mds
->last_cleared_laggy();
1001 dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy
<< "s ago" << dendl
;
1004 // (caps go stale, lease die)
1005 double queue_max_age
= mds
->get_dispatch_queue_max_age(ceph_clock_now());
1006 double cutoff
= queue_max_age
+ mds
->mdsmap
->get_session_timeout();
1008 // don't kick clients if we've been laggy
1009 if (last_cleared_laggy
< cutoff
) {
1010 dout(10) << " last cleared laggy " << last_cleared_laggy
<< "s ago (< cutoff " << cutoff
1011 << "), not marking any client stale" << dendl
;
1015 std::vector
<Session
*> to_evict
;
1017 bool defer_session_stale
= g_conf().get_val
<bool>("mds_defer_session_stale");
1018 const auto sessions_p1
= mds
->sessionmap
.by_state
.find(Session::STATE_OPEN
);
1019 if (sessions_p1
!= mds
->sessionmap
.by_state
.end() && !sessions_p1
->second
->empty()) {
1020 std::vector
<Session
*> new_stale
;
1022 for (auto session
: *(sessions_p1
->second
)) {
1023 auto last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_cap_renew
).count();
1024 if (last_cap_renew_span
< cutoff
) {
1025 dout(20) << "laggiest active session is " << session
->info
.inst
1026 << " and renewed caps recently (" << last_cap_renew_span
<< "s ago)" << dendl
;
1030 if (session
->last_seen
> session
->last_cap_renew
) {
1031 last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_seen
).count();
1032 if (last_cap_renew_span
< cutoff
) {
1033 dout(20) << "laggiest active session is " << session
->info
.inst
1034 << " and renewed caps recently (" << last_cap_renew_span
<< "s ago)" << dendl
;
1039 if (last_cap_renew_span
>= mds
->mdsmap
->get_session_autoclose()) {
1040 dout(20) << "evicting session " << session
->info
.inst
<< " since autoclose "
1041 "has arrived" << dendl
;
1042 // evict session without marking it stale
1043 to_evict
.push_back(session
);
1047 if (defer_session_stale
&&
1048 !session
->is_any_flush_waiter() &&
1049 !mds
->locker
->is_revoking_any_caps_from(session
->get_client())) {
1050 dout(20) << "deferring marking session " << session
->info
.inst
<< " stale "
1051 "since it holds no caps" << dendl
;
1055 auto it
= session
->info
.client_metadata
.find("timeout");
1056 if (it
!= session
->info
.client_metadata
.end()) {
1057 unsigned timeout
= strtoul(it
->second
.c_str(), nullptr, 0);
1059 dout(10) << "skipping session " << session
->info
.inst
1060 << ", infinite timeout specified" << dendl
;
1063 double cutoff
= queue_max_age
+ timeout
;
1064 if (last_cap_renew_span
< cutoff
) {
1065 dout(10) << "skipping session " << session
->info
.inst
1066 << ", timeout (" << timeout
<< ") specified"
1067 << " and renewed caps recently (" << last_cap_renew_span
<< "s ago)" << dendl
;
1071 // do not go through stale, evict it directly.
1072 to_evict
.push_back(session
);
1074 dout(10) << "new stale session " << session
->info
.inst
1075 << " last renewed caps " << last_cap_renew_span
<< "s ago" << dendl
;
1076 new_stale
.push_back(session
);
1080 for (auto session
: new_stale
) {
1081 mds
->sessionmap
.set_state(session
, Session::STATE_STALE
);
1082 if (mds
->locker
->revoke_stale_caps(session
)) {
1083 mds
->locker
->remove_stale_leases(session
);
1084 finish_flush_session(session
, session
->get_push_seq());
1085 auto m
= make_message
<MClientSession
>(CEPH_SESSION_STALE
, session
->get_push_seq());
1086 mds
->send_message_client(m
, session
);
1088 to_evict
.push_back(session
);
1094 cutoff
= queue_max_age
+ mds
->mdsmap
->get_session_autoclose();
1096 // Collect a list of sessions exceeding the autoclose threshold
1097 const auto sessions_p2
= mds
->sessionmap
.by_state
.find(Session::STATE_STALE
);
1098 if (sessions_p2
!= mds
->sessionmap
.by_state
.end() && !sessions_p2
->second
->empty()) {
1099 for (auto session
: *(sessions_p2
->second
)) {
1100 assert(session
->is_stale());
1101 auto last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_cap_renew
).count();
1102 if (last_cap_renew_span
< cutoff
) {
1103 dout(20) << "oldest stale session is " << session
->info
.inst
1104 << " and recently renewed caps " << last_cap_renew_span
<< "s ago" << dendl
;
1107 to_evict
.push_back(session
);
1111 for (auto session
: to_evict
) {
1112 if (session
->is_importing()) {
1113 dout(10) << "skipping session " << session
->info
.inst
<< ", it's being imported" << dendl
;
1117 auto last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_cap_renew
).count();
1118 mds
->clog
->warn() << "evicting unresponsive client " << *session
1119 << ", after " << last_cap_renew_span
<< " seconds";
1120 dout(10) << "autoclosing stale session " << session
->info
.inst
1121 << " last renewed caps " << last_cap_renew_span
<< "s ago" << dendl
;
1123 if (g_conf()->mds_session_blacklist_on_timeout
) {
1124 std::stringstream ss
;
1125 mds
->evict_client(session
->get_client().v
, false, true, ss
, nullptr);
1127 kill_session(session
, NULL
);
1132 void Server::evict_cap_revoke_non_responders() {
1133 if (!cap_revoke_eviction_timeout
) {
1137 auto&& to_evict
= mds
->locker
->get_late_revoking_clients(cap_revoke_eviction_timeout
);
1139 for (auto const &client
: to_evict
) {
1140 mds
->clog
->warn() << "client id " << client
<< " has not responded to"
1141 << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
1142 << " seconds, evicting";
1143 dout(1) << __func__
<< ": evicting cap revoke non-responder client id "
1146 std::stringstream ss
;
1147 bool evicted
= mds
->evict_client(client
.v
, false,
1148 g_conf()->mds_session_blacklist_on_evict
,
1150 if (evicted
&& logger
) {
1151 logger
->inc(l_mdss_cap_revoke_eviction
);
1156 void Server::handle_conf_change(const std::set
<std::string
>& changed
) {
1157 if (changed
.count("mds_replay_unsafe_with_closed_session")) {
1158 replay_unsafe_with_closed_session
= g_conf().get_val
<bool>("mds_replay_unsafe_with_closed_session");
1160 if (changed
.count("mds_cap_revoke_eviction_timeout")) {
1161 cap_revoke_eviction_timeout
= g_conf().get_val
<double>("mds_cap_revoke_eviction_timeout");
1162 dout(20) << __func__
<< " cap revoke eviction timeout changed to "
1163 << cap_revoke_eviction_timeout
<< dendl
;
1165 if (changed
.count("mds_recall_max_decay_rate")) {
1166 recall_throttle
= DecayCounter(g_conf().get_val
<double>("mds_recall_max_decay_rate"));
1168 if (changed
.count("mds_max_snaps_per_dir")) {
1169 max_snaps_per_dir
= g_conf().get_val
<uint64_t>("mds_max_snaps_per_dir");
1170 dout(20) << __func__
<< " max snapshots per directory changed to "
1171 << max_snaps_per_dir
<< dendl
;
1173 if (changed
.count("mds_client_delegate_inos_pct")) {
1174 delegate_inos_pct
= g_conf().get_val
<uint64_t>("mds_client_delegate_inos_pct");
1179 * XXX bump in the interface here, not using an MDSContext here
1180 * because all the callers right now happen to use a SaferCond
1182 void Server::kill_session(Session
*session
, Context
*on_safe
, bool need_purge_inos
)
1184 ceph_assert(ceph_mutex_is_locked_by_me(mds
->mds_lock
));
1186 if ((session
->is_opening() ||
1187 session
->is_open() ||
1188 session
->is_stale()) &&
1189 !session
->is_importing()) {
1190 dout(10) << "kill_session " << session
<< dendl
;
1191 journal_close_session(session
, Session::STATE_KILLING
, on_safe
, need_purge_inos
);
1193 dout(10) << "kill_session importing or already closing/killing " << session
<< dendl
;
1194 if (session
->is_closing() ||
1195 session
->is_killing()) {
1197 mdlog
->wait_for_safe(new MDSInternalContextWrapper(mds
, on_safe
));
1199 ceph_assert(session
->is_closed() ||
1200 session
->is_importing());
1202 on_safe
->complete(0);
1207 size_t Server::apply_blacklist(const std::set
<entity_addr_t
> &blacklist
)
1209 bool prenautilus
= mds
->objecter
->with_osdmap(
1210 [&](const OSDMap
& o
) {
1211 return o
.require_osd_release
< ceph_release_t::nautilus
;
1214 std::vector
<Session
*> victims
;
1215 const auto& sessions
= mds
->sessionmap
.get_sessions();
1216 for (const auto& p
: sessions
) {
1217 if (!p
.first
.is_client()) {
1218 // Do not apply OSDMap blacklist to MDS daemons, we find out
1219 // about their death via MDSMap.
1223 Session
*s
= p
.second
;
1224 auto inst_addr
= s
->info
.inst
.addr
;
1225 // blacklist entries are always TYPE_ANY for nautilus+
1226 inst_addr
.set_type(entity_addr_t::TYPE_ANY
);
1227 if (blacklist
.count(inst_addr
)) {
1228 victims
.push_back(s
);
1232 // ...except pre-nautilus, they were TYPE_LEGACY
1233 inst_addr
.set_type(entity_addr_t::TYPE_LEGACY
);
1234 if (blacklist
.count(inst_addr
)) {
1235 victims
.push_back(s
);
1240 for (const auto& s
: victims
) {
1241 kill_session(s
, nullptr);
1244 dout(10) << "apply_blacklist: killed " << victims
.size() << dendl
;
1246 return victims
.size();
1249 void Server::journal_close_session(Session
*session
, int state
, Context
*on_safe
, bool need_purge_inos
)
1251 dout(10) << __func__
<< " : "
1252 << "("<< need_purge_inos
<< ")"
1253 << session
->info
.inst
1254 << "(" << session
->info
.prealloc_inos
.size() << "|" << session
->pending_prealloc_inos
.size() << ")" << dendl
;
1256 uint64_t sseq
= mds
->sessionmap
.set_state(session
, state
);
1257 version_t pv
= mds
->sessionmap
.mark_projected(session
);
1260 // release alloc and pending-alloc inos for this session
1261 // and wipe out session state, in case the session close aborts for some reason
1262 interval_set
<inodeno_t
> both
;
1263 both
.insert(session
->pending_prealloc_inos
);
1264 if (!need_purge_inos
)
1265 both
.insert(session
->info
.prealloc_inos
);
1267 mds
->inotable
->project_release_ids(both
);
1268 piv
= mds
->inotable
->get_projected_version();
1272 if(need_purge_inos
&& session
->info
.prealloc_inos
.size()) {
1273 dout(10) << "start purge indoes " << session
->info
.prealloc_inos
<< dendl
;
1274 LogSegment
* ls
= mdlog
->get_current_segment();
1275 LogEvent
* e
= new ESession(session
->info
.inst
, false, pv
, both
, piv
, session
->info
.prealloc_inos
);
1276 MDSLogContextBase
* c
= new C_MDS_session_finish(this, session
, sseq
, false, pv
, both
, piv
,
1277 session
->info
.prealloc_inos
, ls
, on_safe
);
1278 mdlog
->start_submit_entry(e
, c
);
1280 interval_set
<inodeno_t
> empty
;
1281 LogEvent
* e
= new ESession(session
->info
.inst
, false, pv
, both
, piv
, empty
);
1282 MDSLogContextBase
* c
= new C_MDS_session_finish(this, session
, sseq
, false, pv
, both
, piv
, on_safe
);
1283 mdlog
->start_submit_entry(e
, c
);
1287 // clean up requests, too
1288 for (auto p
= session
->requests
.begin(); !p
.end(); ) {
1289 MDRequestRef
mdr(*p
);
1291 mdcache
->request_kill(mdr
);
1294 finish_flush_session(session
, session
->get_push_seq());
1297 void Server::reconnect_clients(MDSContext
*reconnect_done_
)
1299 reconnect_done
= reconnect_done_
;
1301 auto now
= clock::now();
1302 set
<Session
*> sessions
;
1303 mds
->sessionmap
.get_client_session_set(sessions
);
1304 for (auto session
: sessions
) {
1305 if (session
->is_open()) {
1306 client_reconnect_gather
.insert(session
->get_client());
1307 session
->set_reconnecting(true);
1308 session
->last_cap_renew
= now
;
1312 if (client_reconnect_gather
.empty()) {
1313 dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl
;
1314 reconnect_gather_finish();
1318 // clients will get the mdsmap and discover we're reconnecting via the monitor.
1320 reconnect_start
= now
;
1321 dout(1) << "reconnect_clients -- " << client_reconnect_gather
.size() << " sessions" << dendl
;
1322 mds
->sessionmap
.dump();
1325 void Server::handle_client_reconnect(const cref_t
<MClientReconnect
> &m
)
1327 dout(7) << "handle_client_reconnect " << m
->get_source()
1328 << (m
->has_more() ? " (more)" : "") << dendl
;
1329 client_t from
= m
->get_source().num();
1330 Session
*session
= mds
->get_session(m
);
1332 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
1333 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
1334 reply
->metadata
["error_string"] = "sessionless";
1335 mds
->send_message(reply
, m
->get_connection());
1339 if (!session
->is_open()) {
1340 dout(0) << " ignoring msg from not-open session" << *m
<< dendl
;
1341 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_CLOSE
);
1342 mds
->send_message(reply
, m
->get_connection());
1346 if (!mds
->is_reconnect() && mds
->get_want_state() == CEPH_MDS_STATE_RECONNECT
) {
1347 dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl
;
1348 mds
->wait_for_reconnect(new C_MDS_RetryMessage(mds
, m
));
1352 auto delay
= std::chrono::duration
<double>(clock::now() - reconnect_start
).count();
1353 dout(10) << " reconnect_start " << reconnect_start
<< " delay " << delay
<< dendl
;
1356 if (!mds
->is_reconnect() || mds
->get_want_state() != CEPH_MDS_STATE_RECONNECT
|| reconnect_evicting
) {
1357 // XXX maybe in the future we can do better than this?
1358 dout(1) << " no longer in reconnect state, ignoring reconnect, sending close" << dendl
;
1359 mds
->clog
->info() << "denied reconnect attempt (mds is "
1360 << ceph_mds_state_name(mds
->get_state())
1361 << ") from " << m
->get_source_inst()
1362 << " after " << delay
<< " (allowed interval " << g_conf()->mds_reconnect_timeout
<< ")";
1365 std::string error_str
;
1366 if (!session
->is_open()) {
1367 error_str
= "session is closed";
1368 } else if (mdcache
->is_readonly()) {
1369 error_str
= "mds is readonly";
1371 if (session
->info
.client_metadata
.features
.empty())
1372 infer_supported_features(session
, session
->info
.client_metadata
);
1374 feature_bitset_t missing_features
= required_client_features
;
1375 missing_features
-= session
->info
.client_metadata
.features
;
1376 if (!missing_features
.empty()) {
1378 ss
<< "missing required features '" << missing_features
<< "'";
1379 error_str
= ss
.str();
1383 if (!error_str
.empty()) {
1385 dout(1) << " " << error_str
<< ", ignoring reconnect, sending close" << dendl
;
1386 mds
->clog
->info() << "denied reconnect attempt from "
1387 << m
->get_source_inst() << " (" << error_str
<< ")";
1392 auto r
= make_message
<MClientSession
>(CEPH_SESSION_CLOSE
);
1393 mds
->send_message_client(r
, session
);
1394 if (session
->is_open())
1395 kill_session(session
, nullptr);
1399 if (!m
->has_more()) {
1400 // notify client of success with an OPEN
1401 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_OPEN
);
1402 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
))
1403 reply
->supported_features
= supported_features
;
1404 mds
->send_message_client(reply
, session
);
1405 mds
->clog
->debug() << "reconnect by " << session
->info
.inst
<< " after " << delay
;
1408 session
->last_cap_renew
= clock::now();
1411 for (const auto &r
: m
->realms
) {
1412 CInode
*in
= mdcache
->get_inode(inodeno_t(r
.realm
.ino
));
1413 if (in
&& in
->state_test(CInode::STATE_PURGING
))
1416 if (in
->snaprealm
) {
1417 dout(15) << "open snaprealm (w inode) on " << *in
<< dendl
;
1419 // this can happen if we are non-auth or we rollback snaprealm
1420 dout(15) << "open snaprealm (null snaprealm) on " << *in
<< dendl
;
1422 mdcache
->add_reconnected_snaprealm(from
, inodeno_t(r
.realm
.ino
), snapid_t(r
.realm
.seq
));
1424 dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r
.realm
.ino
)
1425 << " seq " << r
.realm
.seq
<< dendl
;
1426 mdcache
->add_reconnected_snaprealm(from
, inodeno_t(r
.realm
.ino
), snapid_t(r
.realm
.seq
));
1431 for (const auto &p
: m
->caps
) {
1432 // make sure our last_cap_id is MAX over all issued caps
1433 if (p
.second
.capinfo
.cap_id
> mdcache
->last_cap_id
)
1434 mdcache
->last_cap_id
= p
.second
.capinfo
.cap_id
;
1436 CInode
*in
= mdcache
->get_inode(p
.first
);
1437 if (in
&& in
->state_test(CInode::STATE_PURGING
))
1439 if (in
&& in
->is_auth()) {
1440 // we recovered it, and it's ours. take note.
1441 dout(15) << "open cap realm " << inodeno_t(p
.second
.capinfo
.snaprealm
)
1442 << " on " << *in
<< dendl
;
1443 in
->reconnect_cap(from
, p
.second
, session
);
1444 mdcache
->add_reconnected_cap(from
, p
.first
, p
.second
);
1445 recover_filelocks(in
, p
.second
.flockbl
, m
->get_orig_source().num());
1449 if (in
&& !in
->is_auth()) {
1451 dout(10) << "non-auth " << *in
<< ", will pass off to authority" << dendl
;
1452 // add to cap export list.
1453 mdcache
->rejoin_export_caps(p
.first
, from
, p
.second
,
1454 in
->authority().first
, true);
1456 // don't know if the inode is mine
1457 dout(10) << "missing ino " << p
.first
<< ", will load later" << dendl
;
1458 mdcache
->rejoin_recovered_caps(p
.first
, from
, p
.second
, MDS_RANK_NONE
);
1462 reconnect_last_seen
= clock::now();
1464 if (!m
->has_more()) {
1465 mdcache
->rejoin_recovered_client(session
->get_client(), session
->info
.inst
);
1467 // remove from gather set
1468 client_reconnect_gather
.erase(from
);
1469 session
->set_reconnecting(false);
1470 if (client_reconnect_gather
.empty())
1471 reconnect_gather_finish();
1475 void Server::infer_supported_features(Session
*session
, client_metadata_t
& client_metadata
)
1478 auto it
= client_metadata
.find("ceph_version");
1479 if (it
!= client_metadata
.end()) {
1480 // user space client
1481 if (it
->second
.compare(0, 16, "ceph version 12.") == 0)
1482 supported
= CEPHFS_FEATURE_LUMINOUS
;
1483 else if (session
->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR
))
1484 supported
= CEPHFS_FEATURE_KRAKEN
;
1486 it
= client_metadata
.find("kernel_version");
1487 if (it
!= client_metadata
.end()) {
1489 if (session
->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING
))
1490 supported
= CEPHFS_FEATURE_LUMINOUS
;
1493 if (supported
== -1 &&
1494 session
->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2
))
1495 supported
= CEPHFS_FEATURE_JEWEL
;
1497 if (supported
>= 0) {
1498 unsigned long value
= (1UL << (supported
+ 1)) - 1;
1499 client_metadata
.features
= feature_bitset_t(value
);
1500 dout(10) << __func__
<< " got '" << client_metadata
.features
<< "'" << dendl
;
1504 void Server::update_required_client_features()
1506 vector
<size_t> bits
= CEPHFS_FEATURES_MDS_REQUIRED
;
1508 /* If this blows up on you, you added a release without adding a new release bit to cephfs_features.h */
1509 static_assert(CEPHFS_CURRENT_RELEASE
== CEPH_RELEASE_MAX
-1);
1511 ceph_release_t min_compat
= mds
->mdsmap
->get_min_compat_client();
1512 if (min_compat
>= ceph_release_t::octopus
)
1513 bits
.push_back(CEPHFS_FEATURE_OCTOPUS
);
1514 else if (min_compat
>= ceph_release_t::nautilus
)
1515 bits
.push_back(CEPHFS_FEATURE_NAUTILUS
);
1516 else if (min_compat
>= ceph_release_t::mimic
)
1517 bits
.push_back(CEPHFS_FEATURE_MIMIC
);
1518 else if (min_compat
>= ceph_release_t::luminous
)
1519 bits
.push_back(CEPHFS_FEATURE_LUMINOUS
);
1520 else if (min_compat
>= ceph_release_t::kraken
)
1521 bits
.push_back(CEPHFS_FEATURE_KRAKEN
);
1522 else if (min_compat
>= ceph_release_t::jewel
)
1523 bits
.push_back(CEPHFS_FEATURE_JEWEL
);
1525 std::sort(bits
.begin(), bits
.end());
1526 required_client_features
= feature_bitset_t(bits
);
1527 dout(7) << "required_client_features: " << required_client_features
<< dendl
;
1529 if (mds
->get_state() >= MDSMap::STATE_RECONNECT
) {
1530 set
<Session
*> sessions
;
1531 mds
->sessionmap
.get_client_session_set(sessions
);
1532 for (auto session
: sessions
) {
1533 feature_bitset_t missing_features
= required_client_features
;
1534 missing_features
-= session
->info
.client_metadata
.features
;
1535 if (!missing_features
.empty()) {
1536 bool blacklisted
= mds
->objecter
->with_osdmap(
1537 [session
](const OSDMap
&osd_map
) -> bool {
1538 return osd_map
.is_blacklisted(session
->info
.inst
.addr
);
1543 mds
->clog
->warn() << "evicting session " << *session
<< ", missing required features '"
1544 << missing_features
<< "'";
1545 std::stringstream ss
;
1546 mds
->evict_client(session
->get_client().v
, false,
1547 g_conf()->mds_session_blacklist_on_evict
, ss
);
1553 void Server::reconnect_gather_finish()
1555 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects
<< " clients" << dendl
;
1556 ceph_assert(reconnect_done
);
1558 if (!mds
->snapclient
->is_synced()) {
1559 // make sure snaptable cache is populated. snaprealms will be
1560 // extensively used in rejoin stage.
1561 dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl
;
1562 mds
->snapclient
->wait_for_sync(reconnect_done
);
1564 reconnect_done
->complete(0);
1566 reconnect_done
= NULL
;
1569 void Server::reconnect_tick()
1571 if (reconnect_evicting
) {
1572 dout(7) << "reconnect_tick: waiting for evictions" << dendl
;
1576 if (client_reconnect_gather
.empty())
1579 auto now
= clock::now();
1580 auto elapse1
= std::chrono::duration
<double>(now
- reconnect_start
).count();
1581 if (elapse1
< g_conf()->mds_reconnect_timeout
)
1584 vector
<Session
*> remaining_sessions
;
1585 remaining_sessions
.reserve(client_reconnect_gather
.size());
1586 for (auto c
: client_reconnect_gather
) {
1587 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(c
.v
));
1588 ceph_assert(session
);
1589 remaining_sessions
.push_back(session
);
1590 // client re-sends cap flush messages before the reconnect message
1591 if (session
->last_seen
> reconnect_last_seen
)
1592 reconnect_last_seen
= session
->last_seen
;
1595 auto elapse2
= std::chrono::duration
<double>(now
- reconnect_last_seen
).count();
1596 if (elapse2
< g_conf()->mds_reconnect_timeout
/ 2) {
1597 dout(7) << "reconnect_tick: last seen " << elapse2
1598 << " seconds ago, extending reconnect interval" << dendl
;
1602 dout(7) << "reconnect timed out, " << remaining_sessions
.size()
1603 << " clients have not reconnected in time" << dendl
;
1605 // If we're doing blacklist evictions, use this to wait for them before
1606 // proceeding to reconnect_gather_finish
1607 MDSGatherBuilder
gather(g_ceph_context
);
1609 for (auto session
: remaining_sessions
) {
1610 // Keep sessions that have specified timeout. These sessions will prevent
1611 // mds from going to active. MDS goes to active after they all have been
1612 // killed or reclaimed.
1613 if (session
->info
.client_metadata
.find("timeout") !=
1614 session
->info
.client_metadata
.end()) {
1615 dout(1) << "reconnect keeps " << session
->info
.inst
1616 << ", need to be reclaimed" << dendl
;
1617 client_reclaim_gather
.insert(session
->get_client());
1621 dout(1) << "reconnect gives up on " << session
->info
.inst
<< dendl
;
1623 mds
->clog
->warn() << "evicting unresponsive client " << *session
1624 << ", after waiting " << elapse1
1625 << " seconds during MDS startup";
1627 if (g_conf()->mds_session_blacklist_on_timeout
) {
1628 std::stringstream ss
;
1629 mds
->evict_client(session
->get_client().v
, false, true, ss
,
1632 kill_session(session
, NULL
, true);
1635 failed_reconnects
++;
1637 client_reconnect_gather
.clear();
1639 if (gather
.has_subs()) {
1640 dout(1) << "reconnect will complete once clients are evicted" << dendl
;
1641 gather
.set_finisher(new MDSInternalContextWrapper(mds
, new LambdaContext(
1642 [this](int r
){reconnect_gather_finish();})));
1644 reconnect_evicting
= true;
1646 reconnect_gather_finish();
1650 void Server::recover_filelocks(CInode
*in
, bufferlist locks
, int64_t client
)
1652 if (!locks
.length()) return;
1655 auto p
= locks
.cbegin();
1656 decode(numlocks
, p
);
1657 for (int i
= 0; i
< numlocks
; ++i
) {
1659 lock
.client
= client
;
1660 in
->get_fcntl_lock_state()->held_locks
.insert(pair
<uint64_t, ceph_filelock
>(lock
.start
, lock
));
1661 ++in
->get_fcntl_lock_state()->client_held_lock_counts
[client
];
1663 decode(numlocks
, p
);
1664 for (int i
= 0; i
< numlocks
; ++i
) {
1666 lock
.client
= client
;
1667 in
->get_flock_lock_state()->held_locks
.insert(pair
<uint64_t, ceph_filelock
> (lock
.start
, lock
));
1668 ++in
->get_flock_lock_state()->client_held_lock_counts
[client
];
1673 * Call this when the MDCache is oversized, to send requests to the clients
1674 * to trim some caps, and consequently unpin some inodes in the MDCache so
1675 * that it can trim too.
1677 std::pair
<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder
* gather
, RecallFlags flags
)
1679 const auto now
= clock::now();
1680 const bool steady
= !!(flags
&RecallFlags::STEADY
);
1681 const bool enforce_max
= !!(flags
&RecallFlags::ENFORCE_MAX
);
1682 const bool enforce_liveness
= !!(flags
&RecallFlags::ENFORCE_LIVENESS
);
1683 const bool trim
= !!(flags
&RecallFlags::TRIM
);
1685 const auto max_caps_per_client
= g_conf().get_val
<uint64_t>("mds_max_caps_per_client");
1686 const auto min_caps_per_client
= g_conf().get_val
<uint64_t>("mds_min_caps_per_client");
1687 const auto recall_global_max_decay_threshold
= g_conf().get_val
<Option::size_t>("mds_recall_global_max_decay_threshold");
1688 const auto recall_max_caps
= g_conf().get_val
<Option::size_t>("mds_recall_max_caps");
1689 const auto recall_max_decay_threshold
= g_conf().get_val
<Option::size_t>("mds_recall_max_decay_threshold");
1690 const auto cache_liveness_magnitude
= g_conf().get_val
<Option::size_t>("mds_session_cache_liveness_magnitude");
1692 dout(7) << __func__
<< ":"
1693 << " min=" << min_caps_per_client
1694 << " max=" << max_caps_per_client
1695 << " total=" << Capability::count()
1696 << " flags=" << flags
1699 /* trim caps of sessions with the most caps first */
1700 std::multimap
<uint64_t, Session
*> caps_session
;
1701 auto f
= [&caps_session
, enforce_max
, enforce_liveness
, trim
, max_caps_per_client
, cache_liveness_magnitude
](auto& s
) {
1702 auto num_caps
= s
->caps
.size();
1703 auto cache_liveness
= s
->get_session_cache_liveness();
1704 if (trim
|| (enforce_max
&& num_caps
> max_caps_per_client
) || (enforce_liveness
&& cache_liveness
< (num_caps
>>cache_liveness_magnitude
))) {
1705 caps_session
.emplace(std::piecewise_construct
, std::forward_as_tuple(num_caps
), std::forward_as_tuple(s
));
1708 mds
->sessionmap
.get_client_sessions(std::move(f
));
1710 std::pair
<bool, uint64_t> result
= {false, 0};
1711 auto& [throttled
, caps_recalled
] = result
;
1712 last_recall_state
= now
;
1713 for (const auto& [num_caps
, session
] : boost::adaptors::reverse(caps_session
)) {
1714 if (!session
->is_open() ||
1715 !session
->get_connection() ||
1716 !session
->info
.inst
.name
.is_client())
1719 dout(10) << __func__
<< ":"
1720 << " session " << session
->info
.inst
1721 << " caps " << num_caps
1722 << ", leases " << session
->leases
.size()
1726 if (num_caps
< recall_max_caps
|| (num_caps
-recall_max_caps
) < min_caps_per_client
) {
1727 newlim
= min_caps_per_client
;
1729 newlim
= num_caps
-recall_max_caps
;
1731 if (num_caps
> newlim
) {
1732 /* now limit the number of caps we recall at a time to prevent overloading ourselves */
1733 uint64_t recall
= std::min
<uint64_t>(recall_max_caps
, num_caps
-newlim
);
1734 newlim
= num_caps
-recall
;
1735 const uint64_t session_recall_throttle
= session
->get_recall_caps_throttle();
1736 const uint64_t session_recall_throttle2o
= session
->get_recall_caps_throttle2o();
1737 const uint64_t global_recall_throttle
= recall_throttle
.get();
1738 if (session_recall_throttle
+recall
> recall_max_decay_threshold
) {
1739 dout(15) << " session recall threshold (" << recall_max_decay_threshold
<< ") hit at " << session_recall_throttle
<< "; skipping!" << dendl
;
1742 } else if (session_recall_throttle2o
+recall
> recall_max_caps
*2) {
1743 dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps
<< ") hit at " << session_recall_throttle2o
<< "; skipping!" << dendl
;
1746 } else if (global_recall_throttle
+recall
> recall_global_max_decay_threshold
) {
1747 dout(15) << " global recall threshold (" << recall_global_max_decay_threshold
<< ") hit at " << global_recall_throttle
<< "; skipping!" << dendl
;
1752 // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
1754 const auto session_recall
= session
->get_recall_caps();
1755 const auto session_release
= session
->get_release_caps();
1756 if (2*session_release
< session_recall
&& 2*session_recall
> recall_max_decay_threshold
) {
1757 /* The session has been unable to keep up with the number of caps
1758 * recalled (by half); additionally, to prevent marking sessions
1759 * we've just begun to recall from, the session_recall counter
1760 * (decayed count of caps recently recalled) is **greater** than the
1761 * session threshold for the session's cap recall throttle.
1763 dout(15) << " 2*session_release < session_recall"
1764 " (2*" << session_release
<< " < " << session_recall
<< ") &&"
1765 " 2*session_recall < recall_max_decay_threshold"
1766 " (2*" << session_recall
<< " > " << recall_max_decay_threshold
<< ")"
1767 " Skipping because we are unlikely to get more released." << dendl
;
1769 } else if (recall
< recall_max_caps
&& 2*recall
< session_recall
) {
1770 /* The number of caps recalled is less than the number we *could*
1771 * recall (so there isn't much left to recall?) and the number of
1772 * caps is less than the current recall_caps counter (decayed count
1773 * of caps recently recalled).
1775 dout(15) << " 2*recall < session_recall "
1776 " (2*" << recall
<< " < " << session_recall
<< ") &&"
1777 " recall < recall_max_caps (" << recall
<< " < " << recall_max_caps
<< ");"
1778 " Skipping because we are unlikely to get more released." << dendl
;
1783 dout(7) << " recalling " << recall
<< " caps; session_recall_throttle = " << session_recall_throttle
<< "; global_recall_throttle = " << global_recall_throttle
<< dendl
;
1785 auto m
= make_message
<MClientSession
>(CEPH_SESSION_RECALL_STATE
);
1786 m
->head
.max_caps
= newlim
;
1787 mds
->send_message_client(m
, session
);
1789 flush_session(session
, gather
);
1791 caps_recalled
+= session
->notify_recall_sent(newlim
);
1792 recall_throttle
.hit(recall
);
1796 dout(7) << "recalled" << (throttled
? " (throttled)" : "") << " " << caps_recalled
<< " client caps." << dendl
;
1801 void Server::force_clients_readonly()
1803 dout(10) << "force_clients_readonly" << dendl
;
1804 set
<Session
*> sessions
;
1805 mds
->sessionmap
.get_client_session_set(sessions
);
1806 for (set
<Session
*>::const_iterator p
= sessions
.begin();
1807 p
!= sessions
.end();
1809 Session
*session
= *p
;
1810 if (!session
->info
.inst
.name
.is_client() ||
1811 !(session
->is_open() || session
->is_stale()))
1813 mds
->send_message_client(make_message
<MClientSession
>(CEPH_SESSION_FORCE_RO
), session
);
1818 * some generic stuff for finishing off requests
1820 void Server::journal_and_reply(MDRequestRef
& mdr
, CInode
*in
, CDentry
*dn
, LogEvent
*le
, MDSLogContextBase
*fin
)
1822 dout(10) << "journal_and_reply tracei " << in
<< " tracedn " << dn
<< dendl
;
1823 ceph_assert(!mdr
->has_completed
);
1825 // note trace items for eventual reply.
1834 early_reply(mdr
, in
, dn
);
1836 mdr
->committing
= true;
1837 submit_mdlog_entry(le
, fin
, mdr
, __func__
);
1839 if (mdr
->client_request
&& mdr
->client_request
->is_queued_for_replay()) {
1840 if (mds
->queue_one_replay()) {
1841 dout(10) << " queued next replay op" << dendl
;
1843 dout(10) << " journaled last replay op" << dendl
;
1845 } else if (mdr
->did_early_reply
)
1846 mds
->locker
->drop_rdlocks_for_early_reply(mdr
.get());
1851 void Server::submit_mdlog_entry(LogEvent
*le
, MDSLogContextBase
*fin
, MDRequestRef
& mdr
,
1852 std::string_view event
)
1855 string
event_str("submit entry: ");
1857 mdr
->mark_event(event_str
);
1859 mdlog
->submit_entry(le
, fin
);
1863 * send response built from mdr contents and error code; clean up mdr
// respond_to_request: finish a request with result code r.  For a batch-head
// getattr/lookup, detach the BatchOp (keyed by the getattr mask) from the
// target inode/dentry so the whole batch is answered; otherwise reply to the
// single client request, or complete the finisher of an internal op.
// NOTE(review): garbled listing — braces/else lines are elided; embedded
// numbers are the original source line numbers.
1865 void Server::respond_to_request(MDRequestRef
& mdr
, int r
)
1867 if (mdr
->client_request
) {
1868 if (mdr
->is_batch_op() && mdr
->is_batch_head
) {
1869 int mask
= mdr
->client_request
->head
.args
.getattr
.mask
;
1871 std::unique_ptr
<BatchOp
> bop
;
1872 if (mdr
->client_request
->get_op() == CEPH_MDS_OP_GETATTR
) {
1873 dout(20) << __func__
<< ": respond other getattr ops. " << *mdr
<< dendl
;
1874 auto it
= mdr
->in
[0]->batch_ops
.find(mask
);
1875 bop
= std::move(it
->second
);
1876 mdr
->in
[0]->batch_ops
.erase(it
);
1878 dout(20) << __func__
<< ": respond other lookup ops. " << *mdr
<< dendl
;
1879 auto it
= mdr
->dn
[0].back()->batch_ops
.find(mask
);
1880 bop
= std::move(it
->second
);
1881 mdr
->dn
[0].back()->batch_ops
.erase(it
);
// Non-batch path: send the reply built from the original client request.
1886 reply_client_request(mdr
, make_message
<MClientReply
>(*mdr
->client_request
, r
));
1888 } else if (mdr
->internal_op
> -1) {
1889 dout(10) << "respond_to_request on internal request " << mdr
<< dendl
;
1890 if (!mdr
->internal_op_finish
)
1891 ceph_abort_msg("trying to respond to internal op without finisher");
1892 mdr
->internal_op_finish
->complete(r
);
1893 mdcache
->request_finish(mdr
);
1897 // statistics mds req op number and latency
1898 void Server::perf_gather_op_latency(const cref_t
<MClientRequest
> &req
, utime_t lat
)
1900 int code
= l_mdss_first
;
1901 switch(req
->get_op()) {
1902 case CEPH_MDS_OP_LOOKUPHASH
:
1903 code
= l_mdss_req_lookuphash_latency
;
1905 case CEPH_MDS_OP_LOOKUPINO
:
1906 code
= l_mdss_req_lookupino_latency
;
1908 case CEPH_MDS_OP_LOOKUPPARENT
:
1909 code
= l_mdss_req_lookupparent_latency
;
1911 case CEPH_MDS_OP_LOOKUPNAME
:
1912 code
= l_mdss_req_lookupname_latency
;
1914 case CEPH_MDS_OP_LOOKUP
:
1915 code
= l_mdss_req_lookup_latency
;
1917 case CEPH_MDS_OP_LOOKUPSNAP
:
1918 code
= l_mdss_req_lookupsnap_latency
;
1920 case CEPH_MDS_OP_GETATTR
:
1921 code
= l_mdss_req_getattr_latency
;
1923 case CEPH_MDS_OP_SETATTR
:
1924 code
= l_mdss_req_setattr_latency
;
1926 case CEPH_MDS_OP_SETLAYOUT
:
1927 code
= l_mdss_req_setlayout_latency
;
1929 case CEPH_MDS_OP_SETDIRLAYOUT
:
1930 code
= l_mdss_req_setdirlayout_latency
;
1932 case CEPH_MDS_OP_SETXATTR
:
1933 code
= l_mdss_req_setxattr_latency
;
1935 case CEPH_MDS_OP_RMXATTR
:
1936 code
= l_mdss_req_rmxattr_latency
;
1938 case CEPH_MDS_OP_READDIR
:
1939 code
= l_mdss_req_readdir_latency
;
1941 case CEPH_MDS_OP_SETFILELOCK
:
1942 code
= l_mdss_req_setfilelock_latency
;
1944 case CEPH_MDS_OP_GETFILELOCK
:
1945 code
= l_mdss_req_getfilelock_latency
;
1947 case CEPH_MDS_OP_CREATE
:
1948 code
= l_mdss_req_create_latency
;
1950 case CEPH_MDS_OP_OPEN
:
1951 code
= l_mdss_req_open_latency
;
1953 case CEPH_MDS_OP_MKNOD
:
1954 code
= l_mdss_req_mknod_latency
;
1956 case CEPH_MDS_OP_LINK
:
1957 code
= l_mdss_req_link_latency
;
1959 case CEPH_MDS_OP_UNLINK
:
1960 code
= l_mdss_req_unlink_latency
;
1962 case CEPH_MDS_OP_RMDIR
:
1963 code
= l_mdss_req_rmdir_latency
;
1965 case CEPH_MDS_OP_RENAME
:
1966 code
= l_mdss_req_rename_latency
;
1968 case CEPH_MDS_OP_MKDIR
:
1969 code
= l_mdss_req_mkdir_latency
;
1971 case CEPH_MDS_OP_SYMLINK
:
1972 code
= l_mdss_req_symlink_latency
;
1974 case CEPH_MDS_OP_LSSNAP
:
1975 code
= l_mdss_req_lssnap_latency
;
1977 case CEPH_MDS_OP_MKSNAP
:
1978 code
= l_mdss_req_mksnap_latency
;
1980 case CEPH_MDS_OP_RMSNAP
:
1981 code
= l_mdss_req_rmsnap_latency
;
1983 case CEPH_MDS_OP_RENAMESNAP
:
1984 code
= l_mdss_req_renamesnap_latency
;
1986 default: ceph_abort();
1988 logger
->tinc(code
, lat
);
// early_reply: send an unsafe reply to the client before the journal commit,
// when permitted (no no_early_reply flag, no journaled slaves, no newly
// allocated ino, not a replayed op, requester is not an MDS).  Marks xlocks
// done to expose uncommitted state, includes the trace, and updates reply
// latency counters.
// NOTE(review): garbled listing — several `return;` lines and braces were
// elided; embedded numbers are the original source line numbers.
1991 void Server::early_reply(MDRequestRef
& mdr
, CInode
*tracei
, CDentry
*tracedn
)
1993 if (!g_conf()->mds_early_reply
)
1996 if (mdr
->no_early_reply
) {
1997 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl
;
2001 if (mdr
->has_more() && mdr
->more()->has_journaled_slaves
) {
2002 dout(10) << "early_reply - there are journaled slaves, not allowed." << dendl
;
2006 if (mdr
->alloc_ino
) {
2007 dout(10) << "early_reply - allocated ino, not allowed" << dendl
;
2011 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
2012 entity_inst_t client_inst
= req
->get_source_inst();
2013 if (client_inst
.name
.is_mds())
2016 if (req
->is_replay()) {
2017 dout(10) << " no early reply on replay op" << dendl
;
// Build the unsafe reply (result 0); client learns commit later via safe reply.
2022 auto reply
= make_message
<MClientReply
>(*req
, 0);
2023 reply
->set_unsafe();
2025 // mark xlocks "done", indicating that we are exposing uncommitted changes.
2027 //_rename_finish() does not send dentry link/unlink message to replicas.
2028 // so do not set xlocks on dentries "done", the xlocks prevent dentries
2029 // that have projected linkages from getting new replica.
2030 mds
->locker
->set_xlocks_done(mdr
.get(), req
->get_op() == CEPH_MDS_OP_RENAME
);
2032 dout(10) << "early_reply " << reply
->get_result()
2033 << " (" << cpp_strerror(reply
->get_result())
2034 << ") " << *req
<< dendl
;
2036 if (tracei
|| tracedn
) {
2038 mdr
->cap_releases
.erase(tracei
->vino());
2040 mdr
->cap_releases
.erase(tracedn
->get_dir()->get_inode()->vino());
2042 set_trace_dist(reply
, tracei
, tracedn
, mdr
);
2045 reply
->set_extra_bl(mdr
->reply_extra_bl
);
2046 mds
->send_message_client(reply
, mdr
->session
);
2048 mdr
->did_early_reply
= true;
// Account reply latency now; the safe reply path skips these counters.
2050 mds
->logger
->inc(l_mds_reply
);
2051 utime_t lat
= ceph_clock_now() - req
->get_recv_stamp();
2052 mds
->logger
->tinc(l_mds_reply_latency
, lat
);
2053 if (client_inst
.name
.is_client()) {
2054 mds
->sessionmap
.hit_session(mdr
->session
);
2056 perf_gather_op_latency(req
, lat
);
2057 dout(20) << "lat " << lat
<< dendl
;
2059 mdr
->mark_event("early_replied");
2064 * include a trace to tracei
// reply_client_request: send the (safe) reply built from mdr and clean up the
// request.  Records completed write requests in the session (so client
// retries can be detected), hands preallocated inos to the session, issues
// leases/trace if no early reply went out, and finishes the request in the
// cache.  Replayed ops keep the replay queue moving.
// NOTE(review): garbled listing — braces/guard lines are elided; embedded
// numbers are the original source line numbers.
2067 void Server::reply_client_request(MDRequestRef
& mdr
, const ref_t
<MClientReply
> &reply
)
2069 ceph_assert(mdr
.get());
2070 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
2072 dout(7) << "reply_client_request " << reply
->get_result()
2073 << " (" << cpp_strerror(reply
->get_result())
2074 << ") " << *req
<< dendl
;
2076 mdr
->mark_event("replying");
2078 Session
*session
= mdr
->session
;
2080 // note successful request in session map?
2082 // setfilelock requests are special, they only modify states in MDS memory.
2083 // The states get lost when MDS fails. If Client re-send a completed
2084 // setfilelock request, it means that client did not receive corresponding
2085 // setfilelock reply.  So MDS should re-execute the setfilelock request.
2086 if (req
->may_write() && req
->get_op() != CEPH_MDS_OP_SETFILELOCK
&&
2087 reply
->get_result() == 0 && session
) {
2088 inodeno_t created
= mdr
->alloc_ino
? mdr
->alloc_ino
: mdr
->used_prealloc_ino
;
2089 session
->add_completed_request(mdr
->reqid
.tid
, created
);
2091 mdr
->ls
->touched_sessions
.insert(session
->info
.inst
.name
);
2095 // give any preallocated inos to the session
2096 apply_allocated_inos(mdr
, session
);
2098 // get tracei/tracedn from mdr?
2099 CInode
*tracei
= mdr
->tracei
;
2100 CDentry
*tracedn
= mdr
->tracedn
;
2102 bool is_replay
= mdr
->client_request
->is_replay();
2103 bool did_early_reply
= mdr
->did_early_reply
;
2104 entity_inst_t client_inst
= req
->get_source_inst();
// Latency counters only if the early reply did not already record them.
2106 if (!did_early_reply
&& !is_replay
) {
2108 mds
->logger
->inc(l_mds_reply
);
2109 utime_t lat
= ceph_clock_now() - mdr
->client_request
->get_recv_stamp();
2110 mds
->logger
->tinc(l_mds_reply_latency
, lat
);
2111 if (session
&& client_inst
.name
.is_client()) {
2112 mds
->sessionmap
.hit_session(session
);
2114 perf_gather_op_latency(req
, lat
);
2115 dout(20) << "lat " << lat
<< dendl
;
2118 mdr
->cap_releases
.erase(tracei
->vino());
2120 mdr
->cap_releases
.erase(tracedn
->get_dir()->get_inode()->vino());
2123 // drop non-rdlocks before replying, so that we can issue leases
2124 mdcache
->request_drop_non_rdlocks(mdr
);
2127 if (session
&& !client_inst
.name
.is_mds()) {
2129 if (!did_early_reply
&& // don't issue leases if we sent an earlier reply already
2130 (tracei
|| tracedn
)) {
2133 mdcache
->try_reconnect_cap(tracei
, session
);
2135 // include metadata in reply
2136 set_trace_dist(reply
, tracei
, tracedn
, mdr
);
2140 // We can set the extra bl unconditionally: if it's already been sent in the
2141 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
2142 reply
->set_extra_bl(mdr
->reply_extra_bl
);
2144 reply
->set_mdsmap_epoch(mds
->mdsmap
->get_epoch());
2145 mds
->send_message_client(reply
, session
);
// Replay bookkeeping: a failed replay is logged loudly before moving on.
2148 if (req
->is_queued_for_replay() &&
2149 (mdr
->has_completed
|| reply
->get_result() < 0)) {
2150 if (reply
->get_result() < 0) {
2151 int r
= reply
->get_result();
2152 derr
<< "reply_client_request: failed to replay " << *req
2153 << " error " << r
<< " (" << cpp_strerror(r
) << ")" << dendl
;
2154 mds
->clog
->warn() << "failed to replay " << req
->get_reqid() << " error " << r
;
2156 mds
->queue_one_replay();
2160 mdcache
->request_finish(mdr
);
2162 // take a closer look at tracei, if it happens to be a remote link
2165 tracedn
->get_projected_linkage()->is_remote()) {
2166 mdcache
->eval_remote(tracedn
);
2171 * pass inode OR dentry (not both, or we may get confused)
2173 * trace is in reverse order (i.e. root inode comes last)
// set_trace_dist: encode the metadata "trace" (dentry + dir + inodes, plus
// snap realm info) into the client reply.  May deliberately skip the trace
// when mds_inject_traceless_reply_probability is set (testing aid).
// NOTE(review): garbled listing — the signature's trailing parameter line(s)
// and several braces were elided; embedded numbers are original line numbers.
2175 void Server::set_trace_dist(const ref_t
<MClientReply
> &reply
,
2176 CInode
*in
, CDentry
*dn
,
2179 // skip doing this for debugging purposes?
2180 if (g_conf()->mds_inject_traceless_reply_probability
&&
2181 mdr
->ls
&& !mdr
->o_trunc
&&
2182 (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability
* 10000.0)) {
2183 dout(5) << "deliberately skipping trace for " << *reply
<< dendl
;
2187 // inode, dentry, dir, ..., inode
2189 mds_rank_t whoami
= mds
->get_nodeid();
2190 Session
*session
= mdr
->session
;
2191 snapid_t snapid
= mdr
->snapid
;
2192 utime_t now
= ceph_clock_now();
2194 dout(20) << "set_trace_dist snapid " << snapid
<< dendl
;
// Snap realm trace: taken from the inode's (or dentry's parent's) realm.
2197 if (snapid
== CEPH_NOSNAP
) {
2200 realm
= in
->find_snaprealm();
2202 realm
= dn
->get_dir()->get_inode()->find_snaprealm();
2203 reply
->snapbl
= realm
->get_snap_trace();
2204 dout(10) << "set_trace_dist snaprealm " << *realm
<< " len=" << reply
->snapbl
.length() << dendl
;
// Dentry branch: encode parent inode stat, dirstat, dentry name and lease.
2209 reply
->head
.is_dentry
= 1;
2210 CDir
*dir
= dn
->get_dir();
2211 CInode
*diri
= dir
->get_inode();
2213 diri
->encode_inodestat(bl
, session
, NULL
, snapid
);
2214 dout(20) << "set_trace_dist added diri " << *diri
<< dendl
;
2216 #ifdef MDS_VERIFY_FRAGSTAT
2217 if (dir
->is_complete())
2218 dir
->verify_fragstat();
2221 ds
.frag
= dir
->get_frag();
2222 ds
.auth
= dir
->get_dir_auth().first
;
2223 if (dir
->is_auth() && !mdcache
->forward_all_reqs_to_auth())
2224 dir
->get_dist_spec(ds
.dist
, whoami
);
2226 dir
->encode_dirstat(bl
, session
->info
, ds
);
2227 dout(20) << "set_trace_dist added dir " << *dir
<< dendl
;
2229 encode(dn
->get_name(), bl
);
2232 CDentry::linkage_t
*dnl
= dn
->get_linkage(mdr
->get_client(), mdr
);
2233 if (dnl
->is_primary()) {
2234 ceph_assert(dnl
->get_inode() == in
);
2235 lease_mask
= CEPH_LEASE_PRIMARY_LINK
;
2237 if (dnl
->is_remote())
2238 ceph_assert(dnl
->get_remote_ino() == in
->ino());
2242 mds
->locker
->issue_client_lease(dn
, mdr
, lease_mask
, now
, bl
);
2243 dout(20) << "set_trace_dist added dn " << snapid
<< " " << *dn
<< dendl
;
2245 reply
->head
.is_dentry
= 0;
// Target inode stat (honoring any getattr caps wanted by the request).
2249 in
->encode_inodestat(bl
, session
, NULL
, snapid
, 0, mdr
->getattr_caps
);
2250 dout(20) << "set_trace_dist added in " << *in
<< dendl
;
2251 reply
->head
.is_target
= 1;
2253 reply
->head
.is_target
= 0;
2255 reply
->set_trace(bl
);
// handle_client_request: entry point for an incoming MClientRequest.  Waits
// for the cache root, validates the session, detects retried/replayed
// requests that already completed (replying traceless or converting them to
// lookups), trims the session's completed-request list (warning clients that
// never advance oldest_client_tid), then registers an MDRequest, processes
// embedded cap releases, and dispatches it.
// NOTE(review): garbled listing — returns/braces elided; embedded numbers
// are the original source line numbers.
2258 void Server::handle_client_request(const cref_t
<MClientRequest
> &req
)
2260 dout(4) << "handle_client_request " << *req
<< dendl
;
2263 mds
->logger
->inc(l_mds_request
);
2265 logger
->inc(l_mdss_handle_client_request
);
2267 if (!mdcache
->is_open()) {
2268 dout(5) << "waiting for root" << dendl
;
2269 mdcache
->wait_for_open(new C_MDS_RetryMessage(mds
, req
));
2273 bool sessionclosed_isok
= replay_unsafe_with_closed_session
;
// Session validation: drop requests from dead/closing sessions.
2275 Session
*session
= 0;
2276 if (req
->get_source().is_client()) {
2277 session
= mds
->get_session(req
);
2279 dout(5) << "no session for " << req
->get_source() << ", dropping" << dendl
;
2280 } else if ((session
->is_closed() && (!mds
->is_clientreplay() || !sessionclosed_isok
)) ||
2281 session
->is_closing() ||
2282 session
->is_killing()) {
2283 dout(5) << "session closed|closing|killing, dropping" << dendl
;
2287 if (req
->is_queued_for_replay())
2288 mds
->queue_one_replay();
2294 if (req
->get_mdsmap_epoch() < mds
->mdsmap
->get_epoch()) {
2295 // send it? hrm, this isn't ideal; they may get a lot of copies if
2296 // they have a high request rate.
2299 // completed request?
2300 bool has_completed
= false;
2301 if (req
->is_replay() || req
->get_retry_attempt()) {
2302 ceph_assert(session
);
2304 if (session
->have_completed_request(req
->get_reqid().tid
, &created
)) {
2305 has_completed
= true;
2306 if (!session
->is_open())
2308 // Don't send traceless reply if the completed request has created
2309 // new inode. Treat the request as lookup request instead.
2310 if (req
->is_replay() ||
2311 ((created
== inodeno_t() || !mds
->is_clientreplay()) &&
2312 req
->get_op() != CEPH_MDS_OP_OPEN
&&
2313 req
->get_op() != CEPH_MDS_OP_CREATE
)) {
2314 dout(5) << "already completed " << req
->get_reqid() << dendl
;
2315 auto reply
= make_message
<MClientReply
>(*req
, 0);
2316 if (created
!= inodeno_t()) {
2318 encode(created
, extra
);
2319 reply
->set_extra_bl(extra
);
2321 mds
->send_message_client(reply
, session
);
2323 if (req
->is_queued_for_replay())
2324 mds
->queue_one_replay();
2328 if (req
->get_op() != CEPH_MDS_OP_OPEN
&&
2329 req
->get_op() != CEPH_MDS_OP_CREATE
) {
2330 dout(10) << " completed request which created new inode " << created
2331 << ", convert it to lookup request" << dendl
;
2332 req
->head
.op
= req
->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP
: CEPH_MDS_OP_GETATTR
;
2333 req
->head
.args
.getattr
.mask
= CEPH_STAT_CAP_INODE_ALL
;
2338 // trim completed_request list
2339 if (req
->get_oldest_client_tid() > 0) {
2340 dout(15) << " oldest_client_tid=" << req
->get_oldest_client_tid() << dendl
;
2341 ceph_assert(session
);
2342 if (session
->trim_completed_requests(req
->get_oldest_client_tid())) {
2343 // Sessions 'completed_requests' was dirtied, mark it to be
2344 // potentially flushed at segment expiry.
2345 mdlog
->get_current_segment()->touched_sessions
.insert(session
->info
.inst
.name
);
2347 if (session
->get_num_trim_requests_warnings() > 0 &&
2348 session
->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests
)
2349 session
->reset_num_trim_requests_warnings();
2351 if (session
->get_num_completed_requests() >=
2352 (g_conf()->mds_max_completed_requests
<< session
->get_num_trim_requests_warnings())) {
2353 session
->inc_num_trim_requests_warnings();
2355 ss
<< "client." << session
->get_client() << " does not advance its oldest_client_tid ("
2356 << req
->get_oldest_client_tid() << "), "
2357 << session
->get_num_completed_requests()
2358 << " completed requests recorded in session\n";
2359 mds
->clog
->warn() << ss
.str();
2360 dout(20) << __func__
<< " " << ss
.str() << dendl
;
2365 // register + dispatch
2366 MDRequestRef mdr
= mdcache
->request_start(req
);
2371 mdr
->session
= session
;
2372 session
->requests
.push_back(&mdr
->item_session_request
);
2376 mdr
->has_completed
= true;
2378 // process embedded cap releases?
2379 //  (only if NOT replay!)
2380 if (!req
->releases
.empty() && req
->get_source().is_client() && !req
->is_replay()) {
2381 client_t client
= req
->get_source().num();
2382 for (const auto &r
: req
->releases
) {
2383 mds
->locker
->process_request_cap_release(mdr
, client
, r
.item
, r
.dname
);
2385 req
->releases
.clear();
2388 dispatch_client_request(mdr
);
2392 void Server::handle_osd_map()
2394 /* Note that we check the OSDMAP_FULL flag directly rather than
2395 * using osdmap_full_flag(), because we want to know "is the flag set"
2396 * rather than "does the flag apply to us?" */
2397 mds
->objecter
->with_osdmap([this](const OSDMap
& o
) {
2398 auto pi
= o
.get_pg_pool(mds
->mdsmap
->get_metadata_pool());
2399 is_full
= pi
&& pi
->has_flag(pg_pool_t::FLAG_FULL
);
2400 dout(7) << __func__
<< ": full = " << is_full
<< " epoch = "
2401 << o
.get_epoch() << dendl
;
2405 void Server::clear_batch_ops(const MDRequestRef
& mdr
)
2407 int mask
= mdr
->client_request
->head
.args
.getattr
.mask
;
2408 if (mdr
->client_request
->get_op() == CEPH_MDS_OP_GETATTR
&& mdr
->in
[0]) {
2409 mdr
->in
[0]->batch_ops
.erase(mask
);
2410 } else if (mdr
->client_request
->get_op() == CEPH_MDS_OP_LOOKUP
&& mdr
->dn
[0].size()) {
2411 mdr
->dn
[0].back()->batch_ops
.erase(mask
);
// dispatch_client_request: route a registered MDRequest to its op handler.
// Handles killed batch-head requests by promoting a surviving follower to
// the new batch head, kills aborted requests, rejects writes on a read-only
// FS or after slave errors, refuses space-consuming ops when the metadata
// pool is full, then switches on the opcode to the per-op handler.
// NOTE(review): garbled listing — `break;`/`return;` lines are elided;
// embedded numbers are the original source line numbers.
2415 void Server::dispatch_client_request(MDRequestRef
& mdr
)
2417 // we shouldn't be waiting on anyone.
2418 ceph_assert(!mdr
->has_more() || mdr
->more()->waiting_on_slave
.empty());
2421 dout(10) << "request " << *mdr
<< " was killed" << dendl
;
2422 //if the mdr is a "batch_op" and it has followers, pick a follower as
2423 //the new "head of the batch ops" and go on processing the new one.
2424 if (mdr
->is_batch_op() && mdr
->is_batch_head
) {
2425 if (!mdr
->batch_reqs
.empty()) {
2426 MDRequestRef new_batch_head
;
2427 for (auto itr
= mdr
->batch_reqs
.cbegin(); itr
!= mdr
->batch_reqs
.cend();) {
2429 itr
= mdr
->batch_reqs
.erase(itr
);
2431 new_batch_head
= req
;
2436 if (!new_batch_head
) {
2437 clear_batch_ops(mdr
);
2441 new_batch_head
->batch_reqs
= std::move(mdr
->batch_reqs
);
2443 mdr
= new_batch_head
;
2444 mdr
->is_batch_head
= true;
2445 int mask
= mdr
->client_request
->head
.args
.getattr
.mask
;
2446 if (mdr
->client_request
->get_op() == CEPH_MDS_OP_GETATTR
) {
2447 auto& fin
= mdr
->in
[0]->batch_ops
[mask
];
2448 fin
->set_request(new_batch_head
);
2449 } else if (mdr
->client_request
->get_op() == CEPH_MDS_OP_LOOKUP
) {
2450 auto& fin
= mdr
->dn
[0].back()->batch_ops
[mask
];
2451 fin
->set_request(new_batch_head
);
2454 clear_batch_ops(mdr
);
2460 } else if (mdr
->aborted
) {
2461 mdr
->aborted
= false;
2462 mdcache
->request_kill(mdr
);
2466 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
2468 if (logger
) logger
->inc(l_mdss_dispatch_client_request
);
2470 dout(7) << "dispatch_client_request " << *req
<< dendl
;
2472 if (req
->may_write() && mdcache
->is_readonly()) {
2473 dout(10) << " read-only FS" << dendl
;
2474 respond_to_request(mdr
, -EROFS
);
2477 if (mdr
->has_more() && mdr
->more()->slave_error
) {
2478 dout(10) << " got error from slaves" << dendl
;
2479 respond_to_request(mdr
, mdr
->more()->slave_error
);
// Full-pool admission control: reject ops that would consume metadata space.
2484 if (req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
2485 req
->get_op() == CEPH_MDS_OP_SETDIRLAYOUT
||
2486 req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
2487 req
->get_op() == CEPH_MDS_OP_RMXATTR
||
2488 req
->get_op() == CEPH_MDS_OP_SETXATTR
||
2489 req
->get_op() == CEPH_MDS_OP_CREATE
||
2490 req
->get_op() == CEPH_MDS_OP_SYMLINK
||
2491 req
->get_op() == CEPH_MDS_OP_MKSNAP
||
2492 ((req
->get_op() == CEPH_MDS_OP_LINK
||
2493 req
->get_op() == CEPH_MDS_OP_RENAME
) &&
2494 (!mdr
->has_more() || mdr
->more()->witnessed
.empty())) // haven't started slave request
2497 dout(20) << __func__
<< ": full, responding ENOSPC to op " << ceph_mds_op_name(req
->get_op()) << dendl
;
2498 respond_to_request(mdr
, -ENOSPC
);
2501 dout(20) << __func__
<< ": full, permitting op " << ceph_mds_op_name(req
->get_op()) << dendl
;
// Per-op dispatch.
2505 switch (req
->get_op()) {
2506 case CEPH_MDS_OP_LOOKUPHASH
:
2507 case CEPH_MDS_OP_LOOKUPINO
:
2508 handle_client_lookup_ino(mdr
, false, false);
2510 case CEPH_MDS_OP_LOOKUPPARENT
:
2511 handle_client_lookup_ino(mdr
, true, false);
2513 case CEPH_MDS_OP_LOOKUPNAME
:
2514 handle_client_lookup_ino(mdr
, false, true);
2518 case CEPH_MDS_OP_LOOKUP
:
2519 handle_client_getattr(mdr
, true);
2522 case CEPH_MDS_OP_LOOKUPSNAP
:
2523 // lookupsnap does not reference a CDentry; treat it as a getattr
2524 case CEPH_MDS_OP_GETATTR
:
2525 handle_client_getattr(mdr
, false);
2528 case CEPH_MDS_OP_SETATTR
:
2529 handle_client_setattr(mdr
);
2531 case CEPH_MDS_OP_SETLAYOUT
:
2532 handle_client_setlayout(mdr
);
2534 case CEPH_MDS_OP_SETDIRLAYOUT
:
2535 handle_client_setdirlayout(mdr
);
2537 case CEPH_MDS_OP_SETXATTR
:
2538 handle_client_setxattr(mdr
);
2540 case CEPH_MDS_OP_RMXATTR
:
2541 handle_client_removexattr(mdr
);
2544 case CEPH_MDS_OP_READDIR
:
2545 handle_client_readdir(mdr
);
2548 case CEPH_MDS_OP_SETFILELOCK
:
2549 handle_client_file_setlock(mdr
);
2552 case CEPH_MDS_OP_GETFILELOCK
:
2553 handle_client_file_readlock(mdr
);
2557 case CEPH_MDS_OP_CREATE
:
2558 if (mdr
->has_completed
)
2559 handle_client_open(mdr
); // already created.. just open
2561 handle_client_openc(mdr
);
2564 case CEPH_MDS_OP_OPEN
:
2565 handle_client_open(mdr
);
2570 case CEPH_MDS_OP_MKNOD
:
2571 handle_client_mknod(mdr
);
2573 case CEPH_MDS_OP_LINK
:
2574 handle_client_link(mdr
);
2576 case CEPH_MDS_OP_UNLINK
:
2577 case CEPH_MDS_OP_RMDIR
:
2578 handle_client_unlink(mdr
);
2580 case CEPH_MDS_OP_RENAME
:
2581 handle_client_rename(mdr
);
2583 case CEPH_MDS_OP_MKDIR
:
2584 handle_client_mkdir(mdr
);
2586 case CEPH_MDS_OP_SYMLINK
:
2587 handle_client_symlink(mdr
);
2592 case CEPH_MDS_OP_LSSNAP
:
2593 handle_client_lssnap(mdr
);
2595 case CEPH_MDS_OP_MKSNAP
:
2596 handle_client_mksnap(mdr
);
2598 case CEPH_MDS_OP_RMSNAP
:
2599 handle_client_rmsnap(mdr
);
2601 case CEPH_MDS_OP_RENAMESNAP
:
2602 handle_client_renamesnap(mdr
);
2606 dout(1) << " unknown client op " << req
->get_op() << dendl
;
2607 respond_to_request(mdr
, -EOPNOTSUPP
);
2612 // ---------------------------------------
// handle_slave_request: process a slave-side request from the master MDS of
// a multi-MDS operation.  Replies are routed to handle_slave_request_reply;
// RENAMENOTIFY is acked immediately (causal ordering); OP_FINISH tears the
// slave request down (possibly aborting in-flight lock/authpin work); new
// requests are registered and dispatched via dispatch_slave_request.
// NOTE(review): garbled listing — returns/braces elided; embedded numbers
// are the original source line numbers.
2615 void Server::handle_slave_request(const cref_t
<MMDSSlaveRequest
> &m
)
2617 dout(4) << "handle_slave_request " << m
->get_reqid() << " from " << m
->get_source() << dendl
;
2618 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2620 if (logger
) logger
->inc(l_mdss_handle_slave_request
);
2624 return handle_slave_request_reply(m
);
2626 // the purpose of rename notify is enforcing causal message ordering. making sure
2627 // bystanders have received all messages from rename srcdn's auth MDS.
2628 if (m
->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY
) {
2629 auto reply
= make_message
<MMDSSlaveRequest
>(m
->get_reqid(), m
->get_attempt(), MMDSSlaveRequest::OP_RENAMENOTIFYACK
);
2630 mds
->send_message(reply
, m
->get_connection());
2634 CDentry
*straydn
= NULL
;
2635 if (m
->straybl
.length() > 0) {
2636 mdcache
->decode_replica_stray(straydn
, m
->straybl
, from
);
2637 ceph_assert(straydn
);
2641 if (!mds
->is_clientreplay() && !mds
->is_active() && !mds
->is_stopping()) {
2642 dout(3) << "not clientreplay|active yet, waiting" << dendl
;
2643 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
2647 // am i a new slave?
2649 if (mdcache
->have_request(m
->get_reqid())) {
2651 mdr
= mdcache
->request_get(m
->get_reqid());
2653 // is my request newer?
2654 if (mdr
->attempt
> m
->get_attempt()) {
2655 dout(10) << "local request " << *mdr
<< " attempt " << mdr
->attempt
<< " > " << m
->get_attempt()
2656 << ", dropping " << *m
<< dendl
;
2660 if (mdr
->attempt
< m
->get_attempt()) {
2661 // mine is old, close it out
2662 dout(10) << "local request " << *mdr
<< " attempt " << mdr
->attempt
<< " < " << m
->get_attempt()
2663 << ", closing out" << dendl
;
2664 mdcache
->request_finish(mdr
);
2666 } else if (mdr
->slave_to_mds
!= from
) {
2667 dout(10) << "local request " << *mdr
<< " not slave to mds." << from
<< dendl
;
2671 // may get these while mdr->slave_request is non-null
2672 if (m
->get_op() == MMDSSlaveRequest::OP_DROPLOCKS
) {
2673 mds
->locker
->drop_locks(mdr
.get());
2676 if (m
->get_op() == MMDSSlaveRequest::OP_FINISH
) {
2677 if (m
->is_abort()) {
2678 mdr
->aborted
= true;
2679 if (mdr
->slave_request
) {
2680 // only abort on-going xlock, wrlock and auth pin
2681 ceph_assert(!mdr
->slave_did_prepare());
2683 mdcache
->request_finish(mdr
);
2686 if (m
->inode_export
.length() > 0)
2687 mdr
->more()->inode_import
= m
->inode_export
;
2688 // finish off request.
2689 mdcache
->request_finish(mdr
);
2696 if (m
->get_op() == MMDSSlaveRequest::OP_FINISH
) {
2697 dout(10) << "missing slave request for " << m
->get_reqid()
2698 << " OP_FINISH, must have lost race with a forward" << dendl
;
2701 mdr
= mdcache
->request_start_slave(m
->get_reqid(), m
->get_attempt(), m
);
2702 mdr
->set_op_stamp(m
->op_stamp
);
2704 ceph_assert(mdr
->slave_request
== 0); // only one at a time, please!
2708 mdr
->straydn
= straydn
;
2711 if (mds
->is_clientreplay() && !mds
->mdsmap
->is_clientreplay(from
) &&
2712 mdr
->locks
.empty()) {
2713 dout(3) << "not active yet, waiting" << dendl
;
2714 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
2718 mdr
->reset_slave_request(m
);
2720 dispatch_slave_request(mdr
);
// handle_slave_request_reply: master-side handling of a slave MDS's reply.
// Ignores stale replies for unknown/old attempts, records OP_COMMITTED,
// installs remotely acquired xlocks/wrlocks into the MDRequest and resumes
// dispatch, and forwards the various *ACK ops to their specific handlers.
// NOTE(review): garbled listing — returns/braces elided; embedded numbers
// are the original source line numbers.
2723 void Server::handle_slave_request_reply(const cref_t
<MMDSSlaveRequest
> &m
)
2725 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2727 if (!mds
->is_clientreplay() && !mds
->is_active() && !mds
->is_stopping()) {
2728 metareqid_t r
= m
->get_reqid();
2729 if (!mdcache
->have_uncommitted_master(r
, from
)) {
2730 dout(10) << "handle_slave_request_reply ignoring slave reply from mds."
2731 << from
<< " reqid " << r
<< dendl
;
2734 dout(3) << "not clientreplay|active yet, waiting" << dendl
;
2735 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
2739 if (m
->get_op() == MMDSSlaveRequest::OP_COMMITTED
) {
2740 metareqid_t r
= m
->get_reqid();
2741 mdcache
->committed_master_slave(r
, from
);
2745 MDRequestRef mdr
= mdcache
->request_get(m
->get_reqid());
2746 if (m
->get_attempt() != mdr
->attempt
) {
2747 dout(10) << "handle_slave_request_reply " << *mdr
<< " ignoring reply from other attempt "
2748 << m
->get_attempt() << dendl
;
2752 switch (m
->get_op()) {
2753 case MMDSSlaveRequest::OP_XLOCKACK
:
2755 // identify lock, master request
2756 SimpleLock
*lock
= mds
->locker
->get_lock(m
->get_lock_type(),
2757 m
->get_object_info());
2758 mdr
->more()->slaves
.insert(from
);
2759 lock
->decode_locked_state(m
->get_lock_data());
2760 dout(10) << "got remote xlock on " << *lock
<< " on " << *lock
->get_parent() << dendl
;
2761 mdr
->emplace_lock(lock
, MutationImpl::LockOp::XLOCK
);
2762 mdr
->finish_locking(lock
);
2763 lock
->get_xlock(mdr
, mdr
->get_client());
2765 ceph_assert(mdr
->more()->waiting_on_slave
.count(from
));
2766 mdr
->more()->waiting_on_slave
.erase(from
);
2767 ceph_assert(mdr
->more()->waiting_on_slave
.empty());
2768 mdcache
->dispatch_request(mdr
);
2772 case MMDSSlaveRequest::OP_WRLOCKACK
:
2774 // identify lock, master request
2775 SimpleLock
*lock
= mds
->locker
->get_lock(m
->get_lock_type(),
2776 m
->get_object_info());
2777 mdr
->more()->slaves
.insert(from
);
2778 dout(10) << "got remote wrlock on " << *lock
<< " on " << *lock
->get_parent() << dendl
;
2779 auto it
= mdr
->emplace_lock(lock
, MutationImpl::LockOp::REMOTE_WRLOCK
, from
);
2780 ceph_assert(it
->is_remote_wrlock());
2781 ceph_assert(it
->wrlock_target
== from
);
2783 mdr
->finish_locking(lock
);
2785 ceph_assert(mdr
->more()->waiting_on_slave
.count(from
));
2786 mdr
->more()->waiting_on_slave
.erase(from
);
2787 ceph_assert(mdr
->more()->waiting_on_slave
.empty());
2788 mdcache
->dispatch_request(mdr
);
2792 case MMDSSlaveRequest::OP_AUTHPINACK
:
2793 handle_slave_auth_pin_ack(mdr
, m
);
2796 case MMDSSlaveRequest::OP_LINKPREPACK
:
2797 handle_slave_link_prep_ack(mdr
, m
);
2800 case MMDSSlaveRequest::OP_RMDIRPREPACK
:
2801 handle_slave_rmdir_prep_ack(mdr
, m
);
2804 case MMDSSlaveRequest::OP_RENAMEPREPACK
:
2805 handle_slave_rename_prep_ack(mdr
, m
);
2808 case MMDSSlaveRequest::OP_RENAMENOTIFYACK
:
2809 handle_slave_rename_notify_ack(mdr
, m
// dispatch_slave_request: execute the slave-side op attached to mdr.  Lock
// ops (XLOCK/WRLOCK) acquire the requested lock via acquire_locks (rebuilding
// the LockOpVec from already-held locks) and ack back to the master; UNXLOCK/
// UNWRLOCK release without acking; AUTHPIN and the *PREP ops go to their
// dedicated handlers.
// NOTE(review): garbled listing — braces/`break;` lines elided; embedded
// numbers are the original source line numbers.
2817 void Server::dispatch_slave_request(MDRequestRef
& mdr
)
2819 dout(7) << "dispatch_slave_request " << *mdr
<< " " << *mdr
->slave_request
<< dendl
;
2822 dout(7) << " abort flag set, finishing" << dendl
;
2823 mdcache
->request_finish(mdr
);
2827 if (logger
) logger
->inc(l_mdss_dispatch_slave_request
);
2829 int op
= mdr
->slave_request
->get_op();
2831 case MMDSSlaveRequest::OP_XLOCK
:
2832 case MMDSSlaveRequest::OP_WRLOCK
:
2835 SimpleLock
*lock
= mds
->locker
->get_lock(mdr
->slave_request
->get_lock_type(),
2836 mdr
->slave_request
->get_object_info());
2839 dout(10) << "don't have object, dropping" << dendl
;
2840 ceph_abort(); // can this happen, if we auth pinned properly.
2842 if (op
== MMDSSlaveRequest::OP_XLOCK
&& !lock
->get_parent()->is_auth()) {
2843 dout(10) << "not auth for remote xlock attempt, dropping on "
2844 << *lock
<< " on " << *lock
->get_parent() << dendl
;
2846 // use acquire_locks so that we get auth_pinning.
2847 MutationImpl::LockOpVec lov
;
2848 for (const auto& p
: mdr
->locks
) {
2850 lov
.add_xlock(p
.lock
);
2851 else if (p
.is_wrlock())
2852 lov
.add_wrlock(p
.lock
);
2857 case MMDSSlaveRequest::OP_XLOCK
:
2858 lov
.add_xlock(lock
);
2859 replycode
= MMDSSlaveRequest::OP_XLOCKACK
;
2861 case MMDSSlaveRequest::OP_WRLOCK
:
2862 lov
.add_wrlock(lock
);
2863 replycode
= MMDSSlaveRequest::OP_WRLOCKACK
;
2867 if (!mds
->locker
->acquire_locks(mdr
, lov
))
// Ack to the master; xlock acks carry the encoded locked state.
2871 auto r
= make_message
<MMDSSlaveRequest
>(mdr
->reqid
, mdr
->attempt
, replycode
);
2872 r
->set_lock_type(lock
->get_type());
2873 lock
->get_parent()->set_object_info(r
->get_object_info());
2874 if (replycode
== MMDSSlaveRequest::OP_XLOCKACK
)
2875 lock
->encode_locked_state(r
->get_lock_data());
2876 mds
->send_message(r
, mdr
->slave_request
->get_connection());
2880 mdr
->reset_slave_request();
2884 case MMDSSlaveRequest::OP_UNXLOCK
:
2885 case MMDSSlaveRequest::OP_UNWRLOCK
:
2887 SimpleLock
*lock
= mds
->locker
->get_lock(mdr
->slave_request
->get_lock_type(),
2888 mdr
->slave_request
->get_object_info());
2890 auto it
= mdr
->locks
.find(lock
);
2891 ceph_assert(it
!= mdr
->locks
.end());
2892 bool need_issue
= false;
2894 case MMDSSlaveRequest::OP_UNXLOCK
:
2895 mds
->locker
->xlock_finish(it
, mdr
.get(), &need_issue
);
2897 case MMDSSlaveRequest::OP_UNWRLOCK
:
2898 mds
->locker
->wrlock_finish(it
, mdr
.get(), &need_issue
);
2902 mds
->locker
->issue_caps(static_cast<CInode
*>(lock
->get_parent()));
2904 // done.  no ack necessary.
2905 mdr
->reset_slave_request();
2909 case MMDSSlaveRequest::OP_AUTHPIN
:
2910 handle_slave_auth_pin(mdr
);
2913 case MMDSSlaveRequest::OP_LINKPREP
:
2914 case MMDSSlaveRequest::OP_UNLINKPREP
:
2915 handle_slave_link_prep(mdr
);
2918 case MMDSSlaveRequest::OP_RMDIRPREP
:
2919 handle_slave_rmdir_prep(mdr
);
2922 case MMDSSlaveRequest::OP_RENAMEPREP
:
2923 handle_slave_rename_prep(mdr
);
2931 void Server::handle_slave_auth_pin(MDRequestRef
& mdr
)
2933 dout(10) << "handle_slave_auth_pin " << *mdr
<< dendl
;
2935 // build list of objects
2936 list
<MDSCacheObject
*> objects
;
2937 CInode
*auth_pin_freeze
= NULL
;
2938 bool nonblocking
= mdr
->slave_request
->is_nonblocking();
2939 bool fail
= false, wouldblock
= false, readonly
= false;
2940 ref_t
<MMDSSlaveRequest
> reply
;
2942 if (mdcache
->is_readonly()) {
2943 dout(10) << " read-only FS" << dendl
;
2949 for (const auto &oi
: mdr
->slave_request
->get_authpins()) {
2950 MDSCacheObject
*object
= mdcache
->get_object(oi
);
2952 dout(10) << " don't have " << oi
<< dendl
;
2957 objects
.push_back(object
);
2958 if (oi
== mdr
->slave_request
->get_authpin_freeze())
2959 auth_pin_freeze
= static_cast<CInode
*>(object
);
2963 // can we auth pin them?
2965 for (const auto& obj
: objects
) {
2966 if (!obj
->is_auth()) {
2967 dout(10) << " not auth for " << *obj
<< dendl
;
2971 if (mdr
->is_auth_pinned(obj
))
2973 if (!mdr
->can_auth_pin(obj
)) {
2975 dout(10) << " can't auth_pin (freezing?) " << *obj
<< " nonblocking" << dendl
;
2981 dout(10) << " waiting for authpinnable on " << *obj
<< dendl
;
2982 obj
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
2983 mdr
->drop_local_auth_pins();
2985 mds
->locker
->notify_freeze_waiter(obj
);
2992 /* freeze authpin wrong inode */
2993 if (mdr
->has_more() && mdr
->more()->is_freeze_authpin
&&
2994 mdr
->more()->rename_inode
!= auth_pin_freeze
)
2995 mdr
->unfreeze_auth_pin(true);
2997 /* handle_slave_rename_prep() call freeze_inode() to wait for all other operations
2998 * on the source inode to complete. This happens after all locks for the rename
2999 * operation are acquired. But to acquire locks, we need auth pin locks' parent
3000 * objects first. So there is an ABBA deadlock if someone auth pins the source inode
3001 * after locks are acquired and before Server::handle_slave_rename_prep() is called.
3002 * The solution is freeze the inode and prevent other MDRequests from getting new
3005 if (auth_pin_freeze
) {
3006 dout(10) << " freezing auth pin on " << *auth_pin_freeze
<< dendl
;
3007 if (!mdr
->freeze_auth_pin(auth_pin_freeze
)) {
3008 auth_pin_freeze
->add_waiter(CInode::WAIT_FROZEN
, new C_MDS_RetryRequest(mdcache
, mdr
));
3009 mds
->mdlog
->flush();
3015 reply
= make_message
<MMDSSlaveRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_AUTHPINACK
);
3018 mdr
->drop_local_auth_pins(); // just in case
3020 reply
->mark_error_rofs();
3022 reply
->mark_error_wouldblock();
3025 for (const auto& obj
: objects
) {
3026 dout(10) << "auth_pinning " << *obj
<< dendl
;
3029 // return list of my auth_pins (if any)
3030 for (const auto &p
: mdr
->object_states
) {
3031 if (!p
.second
.auth_pinned
)
3033 MDSCacheObjectInfo info
;
3034 p
.first
->set_object_info(info
);
3035 reply
->get_authpins().push_back(info
);
3036 if (p
.first
== (MDSCacheObject
*)auth_pin_freeze
)
3037 auth_pin_freeze
->set_object_info(reply
->get_authpin_freeze());
3041 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
3043 // clean up this request
3044 mdr
->reset_slave_request();
3048 if (mdr
->slave_request
->should_notify_blocking()) {
3049 reply
= make_message
<MMDSSlaveRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_AUTHPINACK
);
3050 reply
->mark_req_blocked();
3051 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
3052 mdr
->slave_request
->clear_notify_blocking();
3057 void Server::handle_slave_auth_pin_ack(MDRequestRef
& mdr
, const cref_t
<MMDSSlaveRequest
> &ack
)
3059 dout(10) << "handle_slave_auth_pin_ack on " << *mdr
<< " " << *ack
<< dendl
;
3060 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
3062 if (ack
->is_req_blocked()) {
3063 mdr
->disable_lock_cache();
3064 // slave auth pin is blocked, drop locks to avoid deadlock
3065 mds
->locker
->drop_locks(mdr
.get(), nullptr);
3070 set
<MDSCacheObject
*> pinned
;
3071 for (const auto &oi
: ack
->get_authpins()) {
3072 MDSCacheObject
*object
= mdcache
->get_object(oi
);
3073 ceph_assert(object
); // we pinned it
3074 dout(10) << " remote has pinned " << *object
<< dendl
;
3075 mdr
->set_remote_auth_pinned(object
, from
);
3076 if (oi
== ack
->get_authpin_freeze())
3077 mdr
->set_remote_frozen_auth_pin(static_cast<CInode
*>(object
));
3078 pinned
.insert(object
);
3081 // removed frozen auth pin ?
3082 if (mdr
->more()->is_remote_frozen_authpin
&&
3083 ack
->get_authpin_freeze() == MDSCacheObjectInfo()) {
3084 auto stat_p
= mdr
->find_object_state(mdr
->more()->rename_inode
);
3085 ceph_assert(stat_p
);
3086 if (stat_p
->remote_auth_pinned
== from
) {
3087 mdr
->more()->is_remote_frozen_authpin
= false;
3091 // removed auth pins?
3092 for (auto& p
: mdr
->object_states
) {
3093 if (p
.second
.remote_auth_pinned
== MDS_RANK_NONE
)
3095 MDSCacheObject
* object
= p
.first
;
3096 if (p
.second
.remote_auth_pinned
== from
&& pinned
.count(object
) == 0) {
3097 dout(10) << " remote has unpinned " << *object
<< dendl
;
3098 mdr
->_clear_remote_auth_pinned(p
.second
);
3103 mdr
->more()->slaves
.insert(from
);
3105 // clear from waiting list
3106 auto ret
= mdr
->more()->waiting_on_slave
.erase(from
);
3109 if (ack
->is_error_rofs()) {
3110 mdr
->more()->slave_error
= -EROFS
;
3111 } else if (ack
->is_error_wouldblock()) {
3112 mdr
->more()->slave_error
= -EWOULDBLOCK
;
3116 if (mdr
->more()->waiting_on_slave
.empty())
3117 mdcache
->dispatch_request(mdr
);
3119 dout(10) << "still waiting on slaves " << mdr
->more()->waiting_on_slave
<< dendl
;
3123 // ---------------------------------------
3128 * check whether we are permitted to complete a request
3130 * Check whether we have permission to perform the operation specified
3131 * by mask on the given inode, based on the capability in the mdr's
3134 bool Server::check_access(MDRequestRef
& mdr
, CInode
*in
, unsigned mask
)
3137 int r
= mdr
->session
->check_access(
3139 mdr
->client_request
->get_caller_uid(),
3140 mdr
->client_request
->get_caller_gid(),
3141 &mdr
->client_request
->get_caller_gid_list(),
3142 mdr
->client_request
->head
.args
.setattr
.uid
,
3143 mdr
->client_request
->head
.args
.setattr
.gid
);
3145 respond_to_request(mdr
, r
);
3153 * check whether fragment has reached maximum size
3156 bool Server::check_fragment_space(MDRequestRef
&mdr
, CDir
*in
)
3158 const auto size
= in
->get_frag_size();
3159 if (size
>= g_conf()->mds_bal_fragment_size_max
) {
3160 dout(10) << "fragment " << *in
<< " size exceeds " << g_conf()->mds_bal_fragment_size_max
<< " (ENOSPC)" << dendl
;
3161 respond_to_request(mdr
, -ENOSPC
);
3168 CDentry
* Server::prepare_stray_dentry(MDRequestRef
& mdr
, CInode
*in
)
3170 CDentry
*straydn
= mdr
->straydn
;
3173 in
->name_stray_dentry(straydname
);
3174 ceph_assert(straydn
->get_name() == straydname
);
3178 CDir
*straydir
= mdcache
->get_stray_dir(in
);
3180 if (!mdr
->client_request
->is_replay() &&
3181 !check_fragment_space(mdr
, straydir
))
3184 straydn
= mdcache
->get_or_create_stray_dentry(in
);
3185 mdr
->straydn
= straydn
;
3190 /** prepare_new_inode
3192 * create a new inode. set c/m/atime. hit dir pop.
3194 CInode
* Server::prepare_new_inode(MDRequestRef
& mdr
, CDir
*dir
, inodeno_t useino
, unsigned mode
,
3195 file_layout_t
*layout
)
3197 CInode
*in
= new CInode(mdcache
);
3199 // Server::prepare_force_open_sessions() can re-open session in closing
3200 // state. In that corner case, session's prealloc_inos are being freed.
3201 // To simplify the code, we disallow using/refilling session's prealloc_ino
3202 // while session is opening.
3203 bool allow_prealloc_inos
= mdr
->session
->is_open();
3206 if (allow_prealloc_inos
&& (mdr
->used_prealloc_ino
= in
->inode
.ino
= mdr
->session
->take_ino(useino
))) {
3207 mds
->sessionmap
.mark_projected(mdr
->session
);
3208 dout(10) << "prepare_new_inode used_prealloc " << mdr
->used_prealloc_ino
3209 << " (" << mdr
->session
->info
.prealloc_inos
3210 << ", " << mdr
->session
->info
.prealloc_inos
.size() << " left)"
3214 in
->inode
.ino
= mds
->inotable
->project_alloc_id(useino
);
3215 dout(10) << "prepare_new_inode alloc " << mdr
->alloc_ino
<< dendl
;
3218 if (useino
&& useino
!= in
->inode
.ino
) {
3219 dout(0) << "WARNING: client specified " << useino
<< " and i allocated " << in
->inode
.ino
<< dendl
;
3220 mds
->clog
->error() << mdr
->client_request
->get_source()
3221 << " specified ino " << useino
3222 << " but mds." << mds
->get_nodeid() << " allocated " << in
->inode
.ino
;
3223 //ceph_abort(); // just for now.
3226 if (allow_prealloc_inos
&&
3227 mdr
->session
->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos
/ 2) {
3228 int need
= g_conf()->mds_client_prealloc_inos
- mdr
->session
->get_num_projected_prealloc_inos();
3229 mds
->inotable
->project_alloc_ids(mdr
->prealloc_inos
, need
);
3230 ceph_assert(mdr
->prealloc_inos
.size()); // or else fix projected increment semantics
3231 mdr
->session
->pending_prealloc_inos
.insert(mdr
->prealloc_inos
);
3232 mds
->sessionmap
.mark_projected(mdr
->session
);
3233 dout(10) << "prepare_new_inode prealloc " << mdr
->prealloc_inos
<< dendl
;
3236 in
->inode
.version
= 1;
3237 in
->inode
.xattr_version
= 1;
3238 in
->inode
.nlink
= 1; // FIXME
3240 in
->inode
.mode
= mode
;
3242 // FIPS zeroization audit 20191117: this memset is not security related.
3243 memset(&in
->inode
.dir_layout
, 0, sizeof(in
->inode
.dir_layout
));
3244 if (in
->inode
.is_dir()) {
3245 in
->inode
.dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
3246 } else if (layout
) {
3247 in
->inode
.layout
= *layout
;
3249 in
->inode
.layout
= mdcache
->default_file_layout
;
3252 in
->inode
.truncate_size
= -1ull; // not truncated, yet!
3253 in
->inode
.truncate_seq
= 1; /* starting with 1, 0 is kept for no-truncation logic */
3255 CInode
*diri
= dir
->get_inode();
3257 dout(10) << oct
<< " dir mode 0" << diri
->inode
.mode
<< " new mode 0" << mode
<< dec
<< dendl
;
3259 if (diri
->inode
.mode
& S_ISGID
) {
3260 dout(10) << " dir is sticky" << dendl
;
3261 in
->inode
.gid
= diri
->inode
.gid
;
3262 if (S_ISDIR(mode
)) {
3263 dout(10) << " new dir also sticky" << dendl
;
3264 in
->inode
.mode
|= S_ISGID
;
3267 in
->inode
.gid
= mdr
->client_request
->get_caller_gid();
3269 in
->inode
.uid
= mdr
->client_request
->get_caller_uid();
3271 in
->inode
.btime
= in
->inode
.ctime
= in
->inode
.mtime
= in
->inode
.atime
=
3272 mdr
->get_op_stamp();
3274 in
->inode
.change_attr
= 0;
3276 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
3277 if (req
->get_data().length()) {
3278 auto p
= req
->get_data().cbegin();
3280 // xattrs on new inode?
3281 CInode::mempool_xattr_map xattrs
;
3282 decode_noshare(xattrs
, p
);
3283 for (const auto &p
: xattrs
) {
3284 dout(10) << "prepare_new_inode setting xattr " << p
.first
<< dendl
;
3285 auto em
= in
->xattrs
.emplace(std::piecewise_construct
, std::forward_as_tuple(p
.first
), std::forward_as_tuple(p
.second
));
3287 em
.first
->second
= p
.second
;
3291 if (!mds
->mdsmap
->get_inline_data_enabled() ||
3292 !mdr
->session
->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
))
3293 in
->inode
.inline_data
.version
= CEPH_INLINE_NONE
;
3295 mdcache
->add_inode(in
); // add
3296 dout(10) << "prepare_new_inode " << *in
<< dendl
;
3300 void Server::journal_allocated_inos(MDRequestRef
& mdr
, EMetaBlob
*blob
)
3302 dout(20) << "journal_allocated_inos sessionmapv " << mds
->sessionmap
.get_projected()
3303 << " inotablev " << mds
->inotable
->get_projected_version()
3305 blob
->set_ino_alloc(mdr
->alloc_ino
,
3306 mdr
->used_prealloc_ino
,
3308 mdr
->client_request
->get_source(),
3309 mds
->sessionmap
.get_projected(),
3310 mds
->inotable
->get_projected_version());
3313 void Server::apply_allocated_inos(MDRequestRef
& mdr
, Session
*session
)
3315 dout(10) << "apply_allocated_inos " << mdr
->alloc_ino
3316 << " / " << mdr
->prealloc_inos
3317 << " / " << mdr
->used_prealloc_ino
<< dendl
;
3319 if (mdr
->alloc_ino
) {
3320 mds
->inotable
->apply_alloc_id(mdr
->alloc_ino
);
3322 if (mdr
->prealloc_inos
.size()) {
3323 ceph_assert(session
);
3324 session
->pending_prealloc_inos
.subtract(mdr
->prealloc_inos
);
3325 session
->info
.prealloc_inos
.insert(mdr
->prealloc_inos
);
3326 mds
->sessionmap
.mark_dirty(session
, !mdr
->used_prealloc_ino
);
3327 mds
->inotable
->apply_alloc_ids(mdr
->prealloc_inos
);
3329 if (mdr
->used_prealloc_ino
) {
3330 ceph_assert(session
);
3331 session
->info
.used_inos
.erase(mdr
->used_prealloc_ino
);
3332 mds
->sessionmap
.mark_dirty(session
);
3336 class C_MDS_TryFindInode
: public ServerContext
{
3339 C_MDS_TryFindInode(Server
*s
, MDRequestRef
& r
) : ServerContext(s
), mdr(r
) {}
3340 void finish(int r
) override
{
3341 if (r
== -ESTALE
) // :( find_ino_peers failed
3342 server
->respond_to_request(mdr
, r
);
3344 server
->dispatch_client_request(mdr
);
3348 class CF_MDS_MDRContextFactory
: public MDSContextFactory
{
3350 CF_MDS_MDRContextFactory(MDCache
*cache
, MDRequestRef
&mdr
, bool dl
) :
3351 mdcache(cache
), mdr(mdr
), drop_locks(dl
) {}
3352 MDSContext
*build() {
3354 mdcache
->mds
->locker
->drop_locks(mdr
.get(), nullptr);
3355 mdr
->drop_local_auth_pins();
3357 return new C_MDS_RetryRequest(mdcache
, mdr
);
3365 /* If this returns null, the request has been handled
3366 * as appropriate: forwarded on, or the client's been replied to */
3367 CInode
* Server::rdlock_path_pin_ref(MDRequestRef
& mdr
,
3371 const filepath
& refpath
= mdr
->get_filepath();
3372 dout(10) << "rdlock_path_pin_ref " << *mdr
<< " " << refpath
<< dendl
;
3374 if (mdr
->locking_state
& MutationImpl::PATH_LOCKED
)
3378 CF_MDS_MDRContextFactory
cf(mdcache
, mdr
, true);
3380 if (refpath
.is_last_snap()) {
3384 flags
|= MDS_TRAVERSE_RDLOCK_PATH
| MDS_TRAVERSE_RDLOCK_SNAP
;
3387 flags
|= MDS_TRAVERSE_WANT_AUTH
;
3388 int r
= mdcache
->path_traverse(mdr
, cf
, refpath
, flags
, &mdr
->dn
[0], &mdr
->in
[0]);
3390 return nullptr; // delayed
3391 if (r
< 0) { // error
3392 if (r
== -ENOENT
&& !mdr
->dn
[0].empty()) {
3393 if (mdr
->client_request
&&
3394 mdr
->client_request
->get_dentry_wanted())
3395 mdr
->tracedn
= mdr
->dn
[0].back();
3396 respond_to_request(mdr
, r
);
3397 } else if (r
== -ESTALE
) {
3398 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
3399 MDSContext
*c
= new C_MDS_TryFindInode(this, mdr
);
3400 mdcache
->find_ino_peers(refpath
.get_ino(), c
);
3402 dout(10) << "FAIL on error " << r
<< dendl
;
3403 respond_to_request(mdr
, r
);
3407 CInode
*ref
= mdr
->in
[0];
3408 dout(10) << "ref is " << *ref
<< dendl
;
3412 // do NOT proceed if freezing, as cap release may defer in that case, and
3413 // we could deadlock when we try to lock @ref.
3414 // if we're already auth_pinned, continue; the release has already been processed.
3415 if (ref
->is_frozen() || ref
->is_frozen_auth_pin() ||
3416 (ref
->is_freezing() && !mdr
->is_auth_pinned(ref
))) {
3417 dout(7) << "waiting for !frozen/authpinnable on " << *ref
<< dendl
;
3418 ref
->add_waiter(CInode::WAIT_UNFREEZE
, cf
.build());
3419 if (mdr
->is_any_remote_auth_pin())
3420 mds
->locker
->notify_freeze_waiter(ref
);
3432 /** rdlock_path_xlock_dentry
3433 * traverse path to the directory that could/would contain dentry.
3434 * make sure i am auth for that dentry, forward as necessary.
3435 * create null dentry in place (or use existing if okexist).
3436 * get rdlocks on traversed dentries, xlock on new dentry.
3438 CDentry
* Server::rdlock_path_xlock_dentry(MDRequestRef
& mdr
,
3439 bool create
, bool okexist
, bool want_layout
)
3441 const filepath
& refpath
= mdr
->get_filepath();
3442 dout(10) << "rdlock_path_xlock_dentry " << *mdr
<< " " << refpath
<< dendl
;
3444 if (mdr
->locking_state
& MutationImpl::PATH_LOCKED
)
3445 return mdr
->dn
[0].back();
3447 // figure parent dir vs dname
3448 if (refpath
.depth() == 0) {
3449 dout(7) << "invalid path (zero length)" << dendl
;
3450 respond_to_request(mdr
, -EINVAL
);
3454 if (refpath
.is_last_snap()) {
3455 respond_to_request(mdr
, -EROFS
);
3459 if (refpath
.is_last_dot_or_dotdot()) {
3460 dout(7) << "invalid path (last dot or dot_dot)" << dendl
;
3462 respond_to_request(mdr
, -EEXIST
);
3464 respond_to_request(mdr
, -ENOTEMPTY
);
3468 // traverse to parent dir
3469 CF_MDS_MDRContextFactory
cf(mdcache
, mdr
, true);
3470 int flags
= MDS_TRAVERSE_RDLOCK_SNAP
| MDS_TRAVERSE_RDLOCK_PATH
|
3471 MDS_TRAVERSE_WANT_DENTRY
| MDS_TRAVERSE_XLOCK_DENTRY
|
3472 MDS_TRAVERSE_WANT_AUTH
;
3473 if (refpath
.depth() == 1 && !mdr
->lock_cache_disabled
)
3474 flags
|= MDS_TRAVERSE_CHECK_LOCKCACHE
;
3476 flags
|= MDS_TRAVERSE_RDLOCK_AUTHLOCK
;
3478 flags
|= MDS_TRAVERSE_WANT_DIRLAYOUT
;
3479 int r
= mdcache
->path_traverse(mdr
, cf
, refpath
, flags
, &mdr
->dn
[0]);
3481 return nullptr; // delayed
3484 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
3485 mdcache
->find_ino_peers(refpath
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
3488 respond_to_request(mdr
, r
);
3492 CDentry
*dn
= mdr
->dn
[0].back();
3493 CDir
*dir
= dn
->get_dir();
3494 CInode
*diri
= dir
->get_inode();
3496 if (!mdr
->reqid
.name
.is_mds()) {
3497 if (diri
->is_system() && !diri
->is_root()) {
3498 respond_to_request(mdr
, -EROFS
);
3503 if (!diri
->is_base() && diri
->get_projected_parent_dir()->inode
->is_stray()) {
3504 respond_to_request(mdr
, -ENOENT
);
3508 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
3509 if (dnl
->is_null()) {
3510 if (!create
&& okexist
) {
3511 respond_to_request(mdr
, -ENOENT
);
3515 snapid_t next_snap
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
3516 dn
->first
= std::max(dn
->first
, next_snap
);
3519 respond_to_request(mdr
, -EEXIST
);
3522 mdr
->in
[0] = dnl
->get_inode();
3528 /** rdlock_two_paths_xlock_destdn
3529 * traverse two paths and lock the two paths in proper order.
3530 * The order of taking locks is:
3531 * 1. Lock directory inodes or dentries according to which trees they
3532 * are under. Lock objects under fs root before objects under mdsdir.
3533 * 2. Lock directory inodes or dentries according to their depth, in
3535 * 3. Lock directory inodes or dentries according to inode numbers or
3536 * dentries' parent inode numbers, in ascending order.
3537 * 4. Lock dentries in the same directory in order of their keys.
3538 * 5. Lock non-directory inodes according to inode numbers, in ascending
3541 std::pair
<CDentry
*, CDentry
*>
3542 Server::rdlock_two_paths_xlock_destdn(MDRequestRef
& mdr
, bool xlock_srcdn
)
3545 const filepath
& refpath
= mdr
->get_filepath();
3546 const filepath
& refpath2
= mdr
->get_filepath2();
3548 dout(10) << "rdlock_two_paths_xlock_destdn " << *mdr
<< " " << refpath
<< " " << refpath2
<< dendl
;
3550 if (mdr
->locking_state
& MutationImpl::PATH_LOCKED
)
3551 return std::make_pair(mdr
->dn
[0].back(), mdr
->dn
[1].back());
3553 if (refpath
.depth() != 1 || refpath2
.depth() != 1) {
3554 respond_to_request(mdr
, -EINVAL
);
3555 return std::pair
<CDentry
*, CDentry
*>(nullptr, nullptr);
3558 if (refpath
.is_last_snap() || refpath2
.is_last_snap()) {
3559 respond_to_request(mdr
, -EROFS
);
3560 return std::make_pair(nullptr, nullptr);
3563 // traverse to parent dir
3564 CF_MDS_MDRContextFactory
cf(mdcache
, mdr
, true);
3565 int flags
= MDS_TRAVERSE_RDLOCK_SNAP
| MDS_TRAVERSE_WANT_DENTRY
| MDS_TRAVERSE_WANT_AUTH
;
3566 int r
= mdcache
->path_traverse(mdr
, cf
, refpath
, flags
, &mdr
->dn
[0]);
3569 dout(10) << "ESTALE on path, attempting recovery" << dendl
;
3570 mdcache
->find_ino_peers(refpath
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
3572 respond_to_request(mdr
, r
);
3574 return std::make_pair(nullptr, nullptr);
3577 flags
= MDS_TRAVERSE_RDLOCK_SNAP2
| MDS_TRAVERSE_WANT_DENTRY
| MDS_TRAVERSE_DISCOVER
;
3578 r
= mdcache
->path_traverse(mdr
, cf
, refpath2
, flags
, &mdr
->dn
[1]);
3581 dout(10) << "ESTALE on path2, attempting recovery" << dendl
;
3582 mdcache
->find_ino_peers(refpath2
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
3584 respond_to_request(mdr
, r
);
3586 return std::make_pair(nullptr, nullptr);
3589 CDentry
*srcdn
= mdr
->dn
[1].back();
3590 CDir
*srcdir
= srcdn
->get_dir();
3591 CDentry
*destdn
= mdr
->dn
[0].back();
3592 CDir
*destdir
= destdn
->get_dir();
3594 if (!mdr
->reqid
.name
.is_mds()) {
3595 if ((srcdir
->get_inode()->is_system() && !srcdir
->get_inode()->is_root()) ||
3596 (destdir
->get_inode()->is_system() && !destdir
->get_inode()->is_root())) {
3597 respond_to_request(mdr
, -EROFS
);
3598 return std::make_pair(nullptr, nullptr);
3602 if (!destdir
->get_inode()->is_base() &&
3603 destdir
->get_inode()->get_projected_parent_dir()->inode
->is_stray()) {
3604 respond_to_request(mdr
, -ENOENT
);
3605 return std::make_pair(nullptr, nullptr);
3608 MutationImpl::LockOpVec lov
;
3609 if (srcdir
->get_inode() == destdir
->get_inode()) {
3610 lov
.add_wrlock(&destdir
->inode
->filelock
);
3611 lov
.add_wrlock(&destdir
->inode
->nestlock
);
3612 if (xlock_srcdn
&& srcdir
!= destdir
) {
3613 mds_rank_t srcdir_auth
= srcdir
->authority().first
;
3614 if (srcdir_auth
!= mds
->get_nodeid()) {
3615 lov
.add_remote_wrlock(&srcdir
->inode
->filelock
, srcdir_auth
);
3616 lov
.add_remote_wrlock(&srcdir
->inode
->nestlock
, srcdir_auth
);
3620 if (srcdn
->get_name() > destdn
->get_name())
3621 lov
.add_xlock(&destdn
->lock
);
3624 lov
.add_xlock(&srcdn
->lock
);
3626 lov
.add_rdlock(&srcdn
->lock
);
3628 if (srcdn
->get_name() < destdn
->get_name())
3629 lov
.add_xlock(&destdn
->lock
);
3631 int cmp
= mdr
->compare_paths();
3632 bool lock_destdir_first
=
3633 (cmp
< 0 || (cmp
== 0 && destdir
->ino() < srcdir
->ino()));
3635 if (lock_destdir_first
) {
3636 lov
.add_wrlock(&destdir
->inode
->filelock
);
3637 lov
.add_wrlock(&destdir
->inode
->nestlock
);
3638 lov
.add_xlock(&destdn
->lock
);
3642 mds_rank_t srcdir_auth
= srcdir
->authority().first
;
3643 if (srcdir_auth
== mds
->get_nodeid()) {
3644 lov
.add_wrlock(&srcdir
->inode
->filelock
);
3645 lov
.add_wrlock(&srcdir
->inode
->nestlock
);
3647 lov
.add_remote_wrlock(&srcdir
->inode
->filelock
, srcdir_auth
);
3648 lov
.add_remote_wrlock(&srcdir
->inode
->nestlock
, srcdir_auth
);
3650 lov
.add_xlock(&srcdn
->lock
);
3652 lov
.add_rdlock(&srcdn
->lock
);
3655 if (!lock_destdir_first
) {
3656 lov
.add_wrlock(&destdir
->inode
->filelock
);
3657 lov
.add_wrlock(&destdir
->inode
->nestlock
);
3658 lov
.add_xlock(&destdn
->lock
);
3662 CInode
*auth_pin_freeze
= nullptr;
3663 // XXX any better way to do this?
3664 if (xlock_srcdn
&& !srcdn
->is_auth()) {
3665 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
3666 auth_pin_freeze
= srcdnl
->is_primary() ? srcdnl
->get_inode() : nullptr;
3668 if (!mds
->locker
->acquire_locks(mdr
, lov
, auth_pin_freeze
))
3669 return std::make_pair(nullptr, nullptr);
3671 if (srcdn
->get_projected_linkage()->is_null()) {
3672 respond_to_request(mdr
, -ENOENT
);
3673 return std::make_pair(nullptr, nullptr);
3676 if (destdn
->get_projected_linkage()->is_null()) {
3677 snapid_t next_snap
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
3678 destdn
->first
= std::max(destdn
->first
, next_snap
);
3681 mdr
->locking_state
|= MutationImpl::PATH_LOCKED
;
3683 return std::make_pair(destdn
, srcdn
);
3687 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
3689 * @param diri base inode
3690 * @param fg the exact frag we want
3691 * @param mdr request
3692 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
3694 CDir
* Server::try_open_auth_dirfrag(CInode
*diri
, frag_t fg
, MDRequestRef
& mdr
)
3696 CDir
*dir
= diri
->get_dirfrag(fg
);
3699 // am i auth for the dirfrag?
3700 if (!dir
->is_auth()) {
3701 mds_rank_t auth
= dir
->authority().first
;
3702 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3703 << ", fw to mds." << auth
<< dendl
;
3704 mdcache
->request_forward(mdr
, auth
);
3708 // not open and inode not mine?
3709 if (!diri
->is_auth()) {
3710 mds_rank_t inauth
= diri
->authority().first
;
3711 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth
<< dendl
;
3712 mdcache
->request_forward(mdr
, inauth
);
3716 // not open and inode frozen?
3717 if (diri
->is_frozen()) {
3718 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri
<< dendl
;
3719 ceph_assert(diri
->get_parent_dir());
3720 diri
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
3725 dir
= diri
->get_or_open_dirfrag(mdcache
, fg
);
3732 // ===============================================================================
3735 void Server::handle_client_getattr(MDRequestRef
& mdr
, bool is_lookup
)
3737 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
3739 if (req
->get_filepath().depth() == 0 && is_lookup
) {
3740 // refpath can't be empty for lookup but it can for
3741 // getattr (we do getattr with empty refpath for mount of '/')
3742 respond_to_request(mdr
, -EINVAL
);
3746 bool want_auth
= false;
3747 int mask
= req
->head
.args
.getattr
.mask
;
3748 if (mask
& CEPH_STAT_RSTAT
)
3749 want_auth
= true; // set want_auth for CEPH_STAT_RSTAT mask
3751 CInode
*ref
= rdlock_path_pin_ref(mdr
, want_auth
, false);
3755 mdr
->getattr_caps
= mask
;
3757 if (mdr
->snapid
== CEPH_NOSNAP
&& !mdr
->is_batch_head
&& mdr
->is_batch_op()) {
3759 auto em
= ref
->batch_ops
.emplace(std::piecewise_construct
, std::forward_as_tuple(mask
), std::forward_as_tuple());
3761 em
.first
->second
= std::make_unique
<Batch_Getattr_Lookup
>(this, mdr
, mdcache
);
3763 dout(20) << __func__
<< ": GETATTR op, wait for previous same getattr ops to respond. " << *mdr
<< dendl
;
3764 em
.first
->second
->add_request(mdr
);
3768 CDentry
* dn
= mdr
->dn
[0].back();
3769 auto em
= dn
->batch_ops
.emplace(std::piecewise_construct
, std::forward_as_tuple(mask
), std::forward_as_tuple());
3771 em
.first
->second
= std::make_unique
<Batch_Getattr_Lookup
>(this, mdr
, mdcache
);
3774 dout(20) << __func__
<< ": LOOKUP op, wait for previous same getattr ops to respond. " << *mdr
<< dendl
;
3775 em
.first
->second
->add_request(mdr
);
3779 mdr
->is_batch_head
= true;
3783 * if client currently holds the EXCL cap on a field, do not rdlock
3784 * it; client's stat() will result in valid info if _either_ EXCL
3785 * cap is held or MDS rdlocks and reads the value here.
3787 * handling this case here is easier than weakening rdlock
3788 * semantics... that would cause problems elsewhere.
3790 client_t client
= mdr
->get_client();
3792 Capability
*cap
= ref
->get_client_cap(client
);
3793 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
||
3794 mdr
->snapid
<= cap
->client_follows
))
3795 issued
= cap
->issued();
3798 MutationImpl::LockOpVec lov
;
3799 if ((mask
& CEPH_CAP_LINK_SHARED
) && !(issued
& CEPH_CAP_LINK_EXCL
))
3800 lov
.add_rdlock(&ref
->linklock
);
3801 if ((mask
& CEPH_CAP_AUTH_SHARED
) && !(issued
& CEPH_CAP_AUTH_EXCL
))
3802 lov
.add_rdlock(&ref
->authlock
);
3803 if ((mask
& CEPH_CAP_XATTR_SHARED
) && !(issued
& CEPH_CAP_XATTR_EXCL
))
3804 lov
.add_rdlock(&ref
->xattrlock
);
3805 if ((mask
& CEPH_CAP_FILE_SHARED
) && !(issued
& CEPH_CAP_FILE_EXCL
)) {
3806 // Don't wait on unstable filelock if client is allowed to read file size.
3807 // This can reduce the response time of getattr in the case that multiple
3808 // clients do stat(2) and there are writers.
3809 // The downside of this optimization is that mds may not issue Fs caps along
3810 // with getattr reply. Client may need to send more getattr requests.
3811 if (mdr
->is_rdlocked(&ref
->filelock
)) {
3812 lov
.add_rdlock(&ref
->filelock
);
3813 } else if (ref
->filelock
.is_stable() ||
3814 ref
->filelock
.get_num_wrlocks() > 0 ||
3815 !ref
->filelock
.can_read(mdr
->get_client())) {
3816 lov
.add_rdlock(&ref
->filelock
);
3817 mdr
->locking_state
&= ~MutationImpl::ALL_LOCKED
;
3821 if (!mds
->locker
->acquire_locks(mdr
, lov
))
3824 if (!check_access(mdr
, ref
, MAY_READ
))
3827 utime_t now
= ceph_clock_now();
3828 mdr
->set_mds_stamp(now
);
3830 // note which caps are requested, so we return at least a snapshot
3831 // value for them. (currently this matters for xattrs and inline data)
3832 mdr
->getattr_caps
= mask
;
3834 mds
->balancer
->hit_inode(ref
, META_POP_IRD
, req
->get_source().num());
3837 dout(10) << "reply to stat on " << *req
<< dendl
;
3840 mdr
->tracedn
= mdr
->dn
[0].back();
3841 respond_to_request(mdr
, 0);
3844 struct C_MDS_LookupIno2
: public ServerContext
{
3846 C_MDS_LookupIno2(Server
*s
, MDRequestRef
& r
) : ServerContext(s
), mdr(r
) {}
3847 void finish(int r
) override
{
3848 server
->_lookup_ino_2(mdr
, r
);
3855 void Server::handle_client_lookup_ino(MDRequestRef
& mdr
,
3856 bool want_parent
, bool want_dentry
)
3858 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
3860 if ((uint64_t)req
->head
.args
.lookupino
.snapid
> 0)
3861 return _lookup_snap_ino(mdr
);
3863 inodeno_t ino
= req
->get_filepath().get_ino();
3864 CInode
*in
= mdcache
->get_inode(ino
);
3865 if (in
&& in
->state_test(CInode::STATE_PURGING
)) {
3866 respond_to_request(mdr
, -ESTALE
);
3870 mdcache
->open_ino(ino
, (int64_t)-1, new C_MDS_LookupIno2(this, mdr
), false);
3874 if (mdr
&& in
->snaprealm
&& !in
->snaprealm
->have_past_parents_open() &&
3875 !in
->snaprealm
->open_parents(new C_MDS_RetryRequest(mdcache
, mdr
))) {
3879 // check for nothing (not read or write); this still applies the
3881 if (!check_access(mdr
, in
, 0))
3884 CDentry
*dn
= in
->get_projected_parent_dn();
3885 CInode
*diri
= dn
? dn
->get_dir()->inode
: NULL
;
3887 MutationImpl::LockOpVec lov
;
3888 if (dn
&& (want_parent
|| want_dentry
)) {
3890 lov
.add_rdlock(&dn
->lock
);
3893 unsigned mask
= req
->head
.args
.lookupino
.mask
;
3895 Capability
*cap
= in
->get_client_cap(mdr
->get_client());
3897 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
|| mdr
->snapid
<= cap
->client_follows
))
3898 issued
= cap
->issued();
3900 // permission bits, ACL/security xattrs
3901 if ((mask
& CEPH_CAP_AUTH_SHARED
) && (issued
& CEPH_CAP_AUTH_EXCL
) == 0)
3902 lov
.add_rdlock(&in
->authlock
);
3903 if ((mask
& CEPH_CAP_XATTR_SHARED
) && (issued
& CEPH_CAP_XATTR_EXCL
) == 0)
3904 lov
.add_rdlock(&in
->xattrlock
);
3906 mdr
->getattr_caps
= mask
;
3910 if (!mds
->locker
->acquire_locks(mdr
, lov
))
3914 // need read access to directory inode
3915 if (!check_access(mdr
, diri
, MAY_READ
))
3921 if (in
->is_base()) {
3922 respond_to_request(mdr
, -EINVAL
);
3925 if (!diri
|| diri
->is_stray()) {
3926 respond_to_request(mdr
, -ESTALE
);
3929 dout(10) << "reply to lookup_parent " << *in
<< dendl
;
3931 respond_to_request(mdr
, 0);
3934 inodeno_t dirino
= req
->get_filepath2().get_ino();
3935 if (!diri
|| (dirino
!= inodeno_t() && diri
->ino() != dirino
)) {
3936 respond_to_request(mdr
, -ENOENT
);
3939 dout(10) << "reply to lookup_name " << *in
<< dendl
;
3941 dout(10) << "reply to lookup_ino " << *in
<< dendl
;
3946 respond_to_request(mdr
, 0);
3950 void Server::_lookup_snap_ino(MDRequestRef
& mdr
)
3952 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
3955 vino
.ino
= req
->get_filepath().get_ino();
3956 vino
.snapid
= (__u64
)req
->head
.args
.lookupino
.snapid
;
3957 inodeno_t parent_ino
= (__u64
)req
->head
.args
.lookupino
.parent
;
3958 __u32 hash
= req
->head
.args
.lookupino
.hash
;
3960 dout(7) << "lookup_snap_ino " << vino
<< " parent " << parent_ino
<< " hash " << hash
<< dendl
;
3962 CInode
*in
= mdcache
->lookup_snap_inode(vino
);
3964 in
= mdcache
->get_inode(vino
.ino
);
3966 if (in
->state_test(CInode::STATE_PURGING
) ||
3967 !in
->has_snap_data(vino
.snapid
)) {
3968 if (in
->is_dir() || !parent_ino
) {
3969 respond_to_request(mdr
, -ESTALE
);
3978 dout(10) << "reply to lookup_snap_ino " << *in
<< dendl
;
3979 mdr
->snapid
= vino
.snapid
;
3981 respond_to_request(mdr
, 0);
3985 CInode
*diri
= NULL
;
3987 diri
= mdcache
->get_inode(parent_ino
);
3989 mdcache
->open_ino(parent_ino
, mds
->mdsmap
->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr
));
3993 if (!diri
->is_dir()) {
3994 respond_to_request(mdr
, -EINVAL
);
3998 MutationImpl::LockOpVec lov
;
3999 lov
.add_rdlock(&diri
->dirfragtreelock
);
4000 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4003 frag_t frag
= diri
->dirfragtree
[hash
];
4004 CDir
*dir
= try_open_auth_dirfrag(diri
, frag
, mdr
);
4008 if (!dir
->is_complete()) {
4009 if (dir
->is_frozen()) {
4010 mds
->locker
->drop_locks(mdr
.get());
4011 mdr
->drop_local_auth_pins();
4012 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
4015 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
), true);
4019 respond_to_request(mdr
, -ESTALE
);
4021 mdcache
->open_ino(vino
.ino
, mds
->mdsmap
->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr
), false);
4025 void Server::_lookup_ino_2(MDRequestRef
& mdr
, int r
)
4027 inodeno_t ino
= mdr
->client_request
->get_filepath().get_ino();
4028 dout(10) << "_lookup_ino_2 " << mdr
.get() << " ino " << ino
<< " r=" << r
<< dendl
;
4030 // `r` is a rank if >=0, else an error code
4032 mds_rank_t
dest_rank(r
);
4033 if (dest_rank
== mds
->get_nodeid())
4034 dispatch_client_request(mdr
);
4036 mdcache
->request_forward(mdr
, dest_rank
);
4041 if (r
== -ENOENT
|| r
== -ENODATA
)
4043 respond_to_request(mdr
, r
);
// handle_client_open: services a client OPEN of an existing inode --
// validates flags/mode, takes the needed locks, issues capabilities,
// optionally journals an EOpen, and replies.
// NOTE(review): lossy extraction; many interior lines (error-check
// conditions such as the `cmode < 0` guard, `else` branches, `return`s,
// closing braces) are missing, so the visible fragments do not form
// literal control flow.
4047 /* This function takes responsibility for the passed mdr*/
4048 void Server::handle_client_open(MDRequestRef
& mdr
)
4050 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4051 dout(7) << "open on " << req
->get_filepath() << dendl
;
// Translate client open flags into a ceph file mode; the EINVAL reply
// below presumably fires when the flags are invalid (guard line missing).
4053 int flags
= req
->head
.args
.open
.flags
;
4054 int cmode
= ceph_flags_to_mode(flags
);
4056 respond_to_request(mdr
, -EINVAL
);
// Auth is needed for anything that can mutate (non-readonly mode,
// O_TRUNC, O_DIRECTORY).
4060 bool need_auth
= !file_mode_is_readonly(cmode
) ||
4061 (flags
& (CEPH_O_TRUNC
| CEPH_O_DIRECTORY
));
// Reject write opens on a read-only filesystem.
4063 if ((cmode
& CEPH_FILE_MODE_WR
) && mdcache
->is_readonly()) {
4064 dout(7) << "read-only FS" << dendl
;
4065 respond_to_request(mdr
, -EROFS
);
4069 CInode
*cur
= rdlock_path_pin_ref(mdr
, need_auth
);
// If the inode is frozen or exporting caps, retry the path walk with
// auth required.
4073 if (cur
->is_frozen() || cur
->state_test(CInode::STATE_EXPORTINGCAPS
)) {
4074 ceph_assert(!need_auth
);
4075 mdr
->locking_state
&= ~(MutationImpl::PATH_LOCKED
| MutationImpl::ALL_LOCKED
);
4076 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
4081 if (!cur
->inode
.is_file()) {
4082 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
4083 cmode
= CEPH_FILE_MODE_PIN
;
4084 // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
4085 if (cur
->inode
.is_symlink() && !(flags
& CEPH_O_NOFOLLOW
))
4086 flags
&= ~CEPH_O_TRUNC
;
4089 dout(10) << "open flags = " << flags
4090 << ", filemode = " << cmode
4091 << ", need_auth = " << need_auth
4095 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
4096 dout(7) << "not a file or dir " << *cur << dendl;
4097 respond_to_request(mdr, -ENXIO); // FIXME what error do we want?
// Flag-vs-inode-type sanity checks: O_DIRECTORY on a non-dir, O_TRUNC on
// a non-file each fail with an appropriate errno.
4100 if ((flags
& CEPH_O_DIRECTORY
) && !cur
->inode
.is_dir() && !cur
->inode
.is_symlink()) {
4101 dout(7) << "specified O_DIRECTORY on non-directory " << *cur
<< dendl
;
4102 respond_to_request(mdr
, -EINVAL
);
4106 if ((flags
& CEPH_O_TRUNC
) && !cur
->inode
.is_file()) {
4107 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur
<< dendl
;
4108 // we should return -EISDIR for directory, return -EINVAL for other non-regular
4109 respond_to_request(mdr
, cur
->inode
.is_dir() ? -EISDIR
: -EINVAL
);
// Clients without the inline-data feature cannot open files that still
// carry inline data.
4113 if (cur
->inode
.inline_data
.version
!= CEPH_INLINE_NONE
&&
4114 !mdr
->session
->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
)) {
4115 dout(7) << "old client cannot open inline data file " << *cur
<< dendl
;
4116 respond_to_request(mdr
, -EPERM
);
4120 // snapped data is read only
4121 if (mdr
->snapid
!= CEPH_NOSNAP
&&
4122 ((cmode
& CEPH_FILE_MODE_WR
) || req
->may_write())) {
4123 dout(7) << "snap " << mdr
->snapid
<< " is read-only " << *cur
<< dendl
;
4124 respond_to_request(mdr
, -EROFS
);
// Build the lock set: rdlock auth/xattr locks only for the getattr mask
// bits the client's existing caps do not already cover.
4128 MutationImpl::LockOpVec lov
;
4130 unsigned mask
= req
->head
.args
.open
.mask
;
4132 Capability
*cap
= cur
->get_client_cap(mdr
->get_client());
4134 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
|| mdr
->snapid
<= cap
->client_follows
))
4135 issued
= cap
->issued();
4136 // permission bits, ACL/security xattrs
4137 if ((mask
& CEPH_CAP_AUTH_SHARED
) && (issued
& CEPH_CAP_AUTH_EXCL
) == 0)
4138 lov
.add_rdlock(&cur
->authlock
);
4139 if ((mask
& CEPH_CAP_XATTR_SHARED
) && (issued
& CEPH_CAP_XATTR_EXCL
) == 0)
4140 lov
.add_rdlock(&cur
->xattrlock
);
4142 mdr
->getattr_caps
= mask
;
// O_TRUNC path (only when not a completed replay): xlock the filelock,
// check write access, wait for any in-flight truncate, then delegate to
// do_open_truncate() which journals and replies.
4146 if ((flags
& CEPH_O_TRUNC
) && !mdr
->has_completed
) {
4147 ceph_assert(cur
->is_auth());
4149 lov
.add_xlock(&cur
->filelock
);
4150 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4153 if (!check_access(mdr
, cur
, MAY_WRITE
))
4156 // wait for pending truncate?
4157 const auto pi
= cur
->get_projected_inode();
4158 if (pi
->is_truncating()) {
4159 dout(10) << " waiting for pending truncate from " << pi
->truncate_from
4160 << " to " << pi
->truncate_size
<< " to complete on " << *cur
<< dendl
;
4161 mds
->locker
->drop_locks(mdr
.get());
4162 mdr
->drop_local_auth_pins();
4163 cur
->add_waiter(CInode::WAIT_TRUNC
, new C_MDS_RetryRequest(mdcache
, mdr
));
4167 do_open_truncate(mdr
, cmode
);
4171 // sync filelock if snapped.
4172 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
4173 // and that data itself is flushed so that we can read the snapped data off disk.
4174 if (mdr
->snapid
!= CEPH_NOSNAP
&& !cur
->is_dir()) {
4175 lov
.add_rdlock(&cur
->filelock
);
4178 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4182 if (cmode
& CEPH_FILE_MODE_WR
)
4184 if (!check_access(mdr
, cur
, mask
))
4187 utime_t now
= ceph_clock_now();
4188 mdr
->set_mds_stamp(now
);
// Cap issuance: live (CEPH_NOSNAP) opens get real caps from the locker;
// snapshot opens get fixed read-only caps recorded in mdr->snap_caps.
4190 if (cur
->is_file() || cur
->is_dir()) {
4191 if (mdr
->snapid
== CEPH_NOSNAP
) {
4193 Capability
*cap
= mds
->locker
->issue_new_caps(cur
, cmode
, mdr
, nullptr);
4195 dout(12) << "open issued caps " << ccap_string(cap
->pending())
4196 << " for " << req
->get_source()
4197 << " on " << *cur
<< dendl
;
4199 int caps
= ceph_caps_for_mode(cmode
);
4200 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps
)
4201 << " for " << req
->get_source()
4202 << " snapid " << mdr
->snapid
4203 << " on " << *cur
<< dendl
;
4204 mdr
->snap_caps
= caps
;
4208 // increase max_size?
4209 if (cmode
& CEPH_FILE_MODE_WR
)
4210 mds
->locker
->check_inode_max_size(cur
);
4212 // make sure this inode gets into the journal
4213 if (cur
->is_auth() && cur
->last
== CEPH_NOSNAP
&&
4214 mdcache
->open_file_table
.should_log_open(cur
)) {
4215 EOpen
*le
= new EOpen(mds
->mdlog
);
4216 mdlog
->start_entry(le
);
4217 le
->add_clean_inode(cur
);
4218 mdlog
->submit_entry(le
);
// Balancer accounting: count a write or read hit depending on the mode.
4222 if (cmode
& CEPH_FILE_MODE_WR
)
4223 mds
->balancer
->hit_inode(cur
, META_POP_IWR
);
4225 mds
->balancer
->hit_inode(cur
, META_POP_IRD
,
4226 mdr
->client_request
->get_source().num());
4229 if (req
->get_dentry_wanted()) {
4230 ceph_assert(mdr
->dn
[0].size());
4231 dn
= mdr
->dn
[0].back();
4236 respond_to_request(mdr
, 0);
// C_MDS_openc_finish: log-completion context for handle_client_openc().
// Runs once the EUpdate journaling the new inode is safe: it makes the
// projected dentry/inode state live, notifies peers, and replies to the
// client.
// NOTE(review): lossy extraction; member declarations and some braces
// are missing from this copy.
4239 class C_MDS_openc_finish
: public ServerLogContext
{
// Captures the dentry being linked and the newly created inode.
4243 C_MDS_openc_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ni
) :
4244 ServerLogContext(s
, r
), dn(d
), newi(ni
) {}
4245 void finish(int r
) override
{
4246 ceph_assert(r
== 0);
// Commit the projected linkage now that the journal entry is durable.
4248 dn
->pop_projected_linkage();
4250 // dirty inode, dn, dir
4251 newi
->inode
.version
--; // a bit hacky, see C_MDS_mknod_finish
4252 newi
->mark_dirty(newi
->inode
.version
+1, mdr
->ls
);
4253 newi
->mark_dirty_parent(mdr
->ls
, true);
// Tell the client its writable range, announce the new link, and bump
// the balancer's write popularity for the inode.
4257 get_mds()->locker
->share_inode_max_size(newi
);
4259 MDRequestRef null_ref
;
4260 get_mds()->mdcache
->send_dentry_link(dn
, null_ref
);
4262 get_mds()->balancer
->hit_inode(newi
, META_POP_IWR
);
4264 server
->respond_to_request(mdr
, 0);
// Debug kill-point used by tests (mds_kill_openc_at).
4266 ceph_assert(g_conf()->mds_kill_openc_at
!= 1);
// handle_client_openc: services OPEN with O_CREAT -- either falls back to
// a plain open when the dentry already exists (and !O_EXCL), or creates a
// new regular file: picks a layout, allocates an inode, issues caps, and
// journals an EUpdate ("openc") before replying via C_MDS_openc_finish.
// NOTE(review): lossy extraction; guard conditions (e.g. the `cmode < 0`
// check), `else` branches, `return`s and braces are missing, so the
// fragments below are not literal control flow.
4270 /* This function takes responsibility for the passed mdr*/
4271 void Server::handle_client_openc(MDRequestRef
& mdr
)
4273 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4274 client_t client
= mdr
->get_client();
4276 dout(7) << "open w/ O_CREAT on " << req
->get_filepath() << dendl
;
4278 int cmode
= ceph_flags_to_mode(req
->head
.args
.open
.flags
);
4280 respond_to_request(mdr
, -EINVAL
);
// Resolve the target dentry; xlock it (shared walk unless O_EXCL).
4284 bool excl
= req
->head
.args
.open
.flags
& CEPH_O_EXCL
;
4285 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, true, !excl
, true);
4289 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
// Existing target and no O_EXCL: downgrade the dentry xlock, rdlock the
// inode's snaplock, and delegate to the plain open path.
4290 if (!excl
&& !dnl
->is_null()) {
4292 mds
->locker
->xlock_downgrade(&dn
->lock
, mdr
.get());
4294 MutationImpl::LockOpVec lov
;
4295 lov
.add_rdlock(&dnl
->get_inode()->snaplock
);
4296 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4299 handle_client_open(mdr
);
4303 ceph_assert(dnl
->is_null());
// Choose the file layout: the directory's layout policy if set, else the
// filesystem default; then apply any per-request overrides.
4306 file_layout_t layout
;
4307 if (mdr
->dir_layout
!= file_layout_t())
4308 layout
= mdr
->dir_layout
;
4310 layout
= mdcache
->default_file_layout
;
4312 // What kind of client caps are required to complete this operation
4313 uint64_t access
= MAY_WRITE
;
4315 const auto default_layout
= layout
;
4317 // fill in any special params from client
4318 if (req
->head
.args
.open
.stripe_unit
)
4319 layout
.stripe_unit
= req
->head
.args
.open
.stripe_unit
;
4320 if (req
->head
.args
.open
.stripe_count
)
4321 layout
.stripe_count
= req
->head
.args
.open
.stripe_count
;
4322 if (req
->head
.args
.open
.object_size
)
4323 layout
.object_size
= req
->head
.args
.open
.object_size
;
4324 if (req
->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID
) &&
4325 (__s32
)req
->head
.args
.open
.pool
>= 0) {
4326 layout
.pool_id
= req
->head
.args
.open
.pool
;
4328 // make sure we have as new a map as the client
4329 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
4330 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
4335 // If client doesn't have capability to modify layout pools, then
4336 // only permit this request if the requested pool matches what the
4337 // file would have inherited anyway from its parent.
4338 if (default_layout
!= layout
) {
4339 access
|= MAY_SET_VXATTR
;
// Validate the resulting layout and its data pool before creating.
4342 if (!layout
.is_valid()) {
4343 dout(10) << " invalid initial file layout" << dendl
;
4344 respond_to_request(mdr
, -EINVAL
);
4347 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
4348 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
4349 respond_to_request(mdr
, -EINVAL
);
// Access + fragment-space checks on the parent directory.
4354 CDir
*dir
= dn
->get_dir();
4355 CInode
*diri
= dir
->get_inode();
4356 if (!check_access(mdr
, diri
, access
))
4358 if (!check_fragment_space(mdr
, dir
))
4361 if (mdr
->dn
[0].size() == 1)
4362 mds
->locker
->create_lock_cache(mdr
, diri
, &mdr
->dir_layout
);
// Allocate and initialize the new regular-file inode, link it into the
// dentry's projected state.
4365 CInode
*in
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
),
4366 req
->head
.args
.open
.mode
| S_IFREG
, &layout
);
4370 dn
->push_projected_linkage(in
);
4372 in
->inode
.version
= dn
->pre_dirty();
4373 if (layout
.pool_id
!= mdcache
->default_file_layout
.pool_id
)
4374 in
->inode
.add_old_pool(mdcache
->default_file_layout
.pool_id
);
4375 in
->inode
.update_backtrace();
4376 in
->inode
.rstat
.rfiles
= 1;
// Snapshot bookkeeping: the new inode starts after the newest snap.
4378 SnapRealm
*realm
= diri
->find_snaprealm();
4379 snapid_t follows
= mdcache
->get_global_snaprealm()->get_newest_seq();
4380 ceph_assert(follows
>= realm
->get_newest_seq());
4382 ceph_assert(dn
->first
== follows
+1);
4383 in
->first
= dn
->first
;
// Issue caps on the fresh inode; auth/xattr locks start exclusive since
// only this client can know about it yet.
4386 Capability
*cap
= mds
->locker
->issue_new_caps(in
, cmode
, mdr
, realm
);
4387 in
->authlock
.set_state(LOCK_EXCL
);
4388 in
->xattrlock
.set_state(LOCK_EXCL
);
4390 if (cap
&& (cmode
& CEPH_FILE_MODE_WR
)) {
4391 in
->inode
.client_ranges
[client
].range
.first
= 0;
4392 in
->inode
.client_ranges
[client
].range
.last
= in
->inode
.layout
.stripe_unit
;
4393 in
->inode
.client_ranges
[client
].follows
= follows
;
4394 cap
->mark_clientwriteable();
// Journal the creation as an EUpdate("openc").
4398 mdr
->ls
= mdlog
->get_current_segment();
4399 EUpdate
*le
= new EUpdate(mdlog
, "openc");
4400 mdlog
->start_entry(le
);
4401 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4402 journal_allocated_inos(mdr
, &le
->metablob
);
4403 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
4404 le
->metablob
.add_primary_dentry(dn
, in
, true, true, true);
4406 // make sure this inode gets into the journal
4407 le
->metablob
.add_opened_ino(in
->ino());
4409 C_MDS_openc_finish
*fin
= new C_MDS_openc_finish(this, mdr
, dn
, in
);
// Reply extras: newer clients (DELEG_INO feature) get the created ino
// plus possibly a batch of delegated preallocated inos; older clients
// with REPLY_CREATE_INODE just get the ino.
4411 if (mdr
->session
->info
.has_feature(CEPHFS_FEATURE_DELEG_INO
)) {
4412 openc_response_t ocresp
;
4414 dout(10) << "adding created_ino and delegated_inos" << dendl
;
4415 ocresp
.created_ino
= in
->inode
.ino
;
4417 if (delegate_inos_pct
&& !req
->is_queued_for_replay()) {
4418 // Try to delegate some prealloc_inos to the client, if it's down to half the max
4419 unsigned frac
= 100 / delegate_inos_pct
;
4420 if (mdr
->session
->delegated_inos
.size() < (unsigned)g_conf()->mds_client_prealloc_inos
/ frac
/ 2)
4421 mdr
->session
->delegate_inos(g_conf()->mds_client_prealloc_inos
/ frac
, ocresp
.delegated_inos
);
4424 encode(ocresp
, mdr
->reply_extra_bl
);
4425 } else if (mdr
->client_request
->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE
)) {
4426 dout(10) << "adding ino to reply to indicate inode was created" << dendl
;
4427 // add the file created flag onto the reply if create_flags features is supported
4428 encode(in
->inode
.ino
, mdr
->reply_extra_bl
);
4431 journal_and_reply(mdr
, in
, dn
, le
, fin
);
4433 // We hit_dir (via hit_inode) in our finish callback, but by then we might
4434 // have overshot the split size (multiple opencs in flight), so here is
4435 // an early chance to split the dir if this openc makes it oversized.
4436 mds
->balancer
->maybe_fragment(dir
, false);
// handle_client_readdir: services a READDIR -- resolves/locks the
// directory inode, picks and (if needed) fetches the dirfrag, then
// encodes up to max entries / max_bytes of dentries+inodes (with leases)
// into the reply, setting END/COMPLETE/hash-order flags.
// NOTE(review): lossy extraction; loop braces, `continue`s, declarations
// of several locals (numfiles, dirbl, dnbl, ds, keep, flags) and various
// guard lines are missing from this copy.
4441 void Server::handle_client_readdir(MDRequestRef
& mdr
)
4443 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4444 client_t client
= req
->get_source().num();
4445 MutationImpl::LockOpVec lov
;
4446 CInode
*diri
= rdlock_path_pin_ref(mdr
, false, true);
4449 // it's a directory, right?
4450 if (!diri
->is_dir()) {
4452 dout(10) << "reply to " << *req
<< " readdir -ENOTDIR" << dendl
;
4453 respond_to_request(mdr
, -ENOTDIR
);
// rdlock filelock (contents) and dirfragtreelock (frag layout).
4457 lov
.add_rdlock(&diri
->filelock
);
4458 lov
.add_rdlock(&diri
->dirfragtreelock
);
4460 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4463 if (!check_access(mdr
, diri
, MAY_READ
))
// Decode the request's frag/offset cursor.
4467 frag_t fg
= (__u32
)req
->head
.args
.readdir
.frag
;
4468 unsigned req_flags
= (__u32
)req
->head
.args
.readdir
.flags
;
4469 string offset_str
= req
->get_path2();
4471 __u32 offset_hash
= 0;
4472 if (!offset_str
.empty())
4473 offset_hash
= ceph_frag_value(diri
->hash_dentry_name(offset_str
));
4475 offset_hash
= (__u32
)req
->head
.args
.readdir
.offset_hash
;
4477 dout(10) << " frag " << fg
<< " offset '" << offset_str
<< "'"
4478 << " offset_hash " << offset_hash
<< " flags " << req_flags
<< dendl
;
4480 // does the frag exist?
// If the requested frag no longer matches the tree (split/merge since
// the client's last call), remap to the closest current frag.
4481 if (diri
->dirfragtree
[fg
.value()] != fg
) {
4483 if (req_flags
& CEPH_READDIR_REPLY_BITFLAGS
) {
4484 if (fg
.contains((unsigned)offset_hash
)) {
4485 newfg
= diri
->dirfragtree
[offset_hash
];
4487 // client actually wants next frag
4488 newfg
= diri
->dirfragtree
[fg
.value()];
4492 newfg
= diri
->dirfragtree
[fg
.value()];
4494 dout(10) << " adjust frag " << fg
<< " -> " << newfg
<< " " << diri
->dirfragtree
<< dendl
;
4498 CDir
*dir
= try_open_auth_dirfrag(diri
, fg
, mdr
);
4502 dout(10) << "handle_client_readdir on " << *dir
<< dendl
;
4503 ceph_assert(dir
->is_auth());
// The dirfrag must be fully in cache; if frozen, wait for unfreeze,
// otherwise fetch and retry the request.
4505 if (!dir
->is_complete()) {
4506 if (dir
->is_frozen()) {
4507 dout(7) << "dir is frozen " << *dir
<< dendl
;
4508 mds
->locker
->drop_locks(mdr
.get());
4509 mdr
->drop_local_auth_pins();
4510 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
4514 dout(10) << " incomplete dir contents for readdir on " << *dir
<< ", fetching" << dendl
;
4515 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
), true);
4519 #ifdef MDS_VERIFY_FRAGSTAT
4520 dir
->verify_fragstat();
4523 utime_t now
= ceph_clock_now();
4524 mdr
->set_mds_stamp(now
);
4526 snapid_t snapid
= mdr
->snapid
;
4527 dout(10) << "snapid " << snapid
<< dendl
;
4529 SnapRealm
*realm
= diri
->find_snaprealm();
// Reply-size budget: cap entry count and byte count; ensure at least
// one max-size entry fits.
4531 unsigned max
= req
->head
.args
.readdir
.max_entries
;
4533 max
= dir
->get_num_any(); // whatever, something big.
4534 unsigned max_bytes
= req
->head
.args
.readdir
.max_bytes
;
4536 // make sure at least one item can be encoded
4537 max_bytes
= (512 << 10) + g_conf()->mds_max_xattr_pairs_size
;
// Encode the dirstat header first.
4542 ds
.frag
= dir
->get_frag();
4543 ds
.auth
= dir
->get_dir_auth().first
;
4544 if (dir
->is_auth() && !mdcache
->forward_all_reqs_to_auth())
4545 dir
->get_dist_spec(ds
.dist
, mds
->get_nodeid());
4547 dir
->encode_dirstat(dirbl
, mdr
->session
->info
, ds
);
4549 // count bytes available.
4550 // this isn't perfect, but we should capture the main variable/unbounded size items!
4551 int front_bytes
= dirbl
.length() + sizeof(__u32
) + sizeof(__u8
)*2;
4552 int bytes_left
= max_bytes
- front_bytes
;
4553 bytes_left
-= realm
->get_snap_trace().length();
4555 // build dir contents
// Iterate dentries starting at the client's cursor (skip_key), encoding
// each visible one until max entries or byte budget is exhausted.
4558 bool start
= !offset_hash
&& offset_str
.empty();
4559 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
4560 dentry_key_t
skip_key(snapid
, offset_str
.c_str(), offset_hash
);
4561 auto it
= start
? dir
->begin() : dir
->lower_bound(skip_key
);
4562 bool end
= (it
== dir
->end());
4563 for (; !end
&& numfiles
< max
; end
= (it
== dir
->end())) {
4564 CDentry
*dn
= it
->second
;
4567 if (dn
->state_test(CDentry::STATE_PURGING
))
4570 bool dnp
= dn
->use_projected(client
, mdr
);
4571 CDentry::linkage_t
*dnl
= dnp
? dn
->get_projected_linkage() : dn
->get_linkage();
// Skip dentries whose snap range does not cover the requested snapid.
4576 if (dn
->last
< snapid
|| dn
->first
> snapid
) {
4577 dout(20) << "skipping non-overlapping snap " << *dn
<< dendl
;
4582 dentry_key_t
offset_key(dn
->last
, offset_str
.c_str(), offset_hash
);
4583 if (!(offset_key
< dn
->key()))
4587 CInode
*in
= dnl
->get_inode();
4589 if (in
&& in
->ino() == CEPH_INO_CEPH
)
4593 // better for the MDS to do the work, if we think the client will stat any of these files.
// Remote dentry whose inode isn't in cache: either link it in, or drop
// locks and open it asynchronously then retry.
4594 if (dnl
->is_remote() && !in
) {
4595 in
= mdcache
->get_inode(dnl
->get_remote_ino());
4597 dn
->link_remote(dnl
, in
);
4598 } else if (dn
->state_test(CDentry::STATE_BADREMOTEINO
)) {
4599 dout(10) << "skipping bad remote ino on " << *dn
<< dendl
;
4602 // touch everything i _do_ have
4603 for (auto &p
: *dir
) {
4604 if (!p
.second
->get_linkage()->is_null())
4605 mdcache
->lru
.lru_touch(p
.second
);
4608 // already issued caps and leases, reply immediately.
4609 if (dnbl
.length() > 0) {
4610 mdcache
->open_remote_dentry(dn
, dnp
, new C_MDSInternalNoop
);
4611 dout(10) << " open remote dentry after caps were issued, stopping at "
4612 << dnbl
.length() << " < " << bytes_left
<< dendl
;
4616 mds
->locker
->drop_locks(mdr
.get());
4617 mdr
->drop_local_auth_pins();
4618 mdcache
->open_remote_dentry(dn
, dnp
, new C_MDS_RetryRequest(mdcache
, mdr
));
// Stop early if this entry (name + lease header) would blow the budget.
4624 if ((int)(dnbl
.length() + dn
->get_name().length() + sizeof(__u32
) + sizeof(LeaseStat
)) > bytes_left
) {
4625 dout(10) << " ran out of room, stopping at " << dnbl
.length() << " < " << bytes_left
<< dendl
;
4629 unsigned start_len
= dnbl
.length();
4632 dout(12) << "including dn " << *dn
<< dendl
;
4633 encode(dn
->get_name(), dnbl
);
4634 int lease_mask
= dnl
->is_primary() ? CEPH_LEASE_PRIMARY_LINK
: 0;
4635 mds
->locker
->issue_client_lease(dn
, mdr
, lease_mask
, now
, dnbl
);
4638 dout(12) << "including inode " << *in
<< dendl
;
4639 int r
= in
->encode_inodestat(dnbl
, mdr
->session
, realm
, snapid
, bytes_left
- (int)dnbl
.length());
4641 // chop off dn->name, lease
// If the inode stat did not fit, roll the buffer back to start_len.
4642 dout(10) << " ran out of room, stopping at " << start_len
<< " < " << bytes_left
<< dendl
;
4644 keep
.substr_of(dnbl
, 0, start_len
);
4648 ceph_assert(r
>= 0);
4652 mdcache
->lru
.lru_touch(dn
);
// Reply flags: END when the frag was fully traversed, plus COMPLETE and
// (for bitflag-aware clients) hash-order/offset-hash markers.
4657 flags
= CEPH_READDIR_FRAG_END
;
4659 flags
|= CEPH_READDIR_FRAG_COMPLETE
; // FIXME: what purpose does this serve
4661 // client only understand END and COMPLETE flags ?
4662 if (req_flags
& CEPH_READDIR_REPLY_BITFLAGS
) {
4663 flags
|= CEPH_READDIR_HASH_ORDER
| CEPH_READDIR_OFFSET_HASH
;
4666 // finish final blob
4667 encode(numfiles
, dirbl
);
4668 encode(flags
, dirbl
);
4669 dirbl
.claim_append(dnbl
);
4672 dout(10) << "reply to " << *req
<< " readdir num=" << numfiles
4673 << " bytes=" << dirbl
.length()
4674 << " start=" << (int)start
4675 << " end=" << (int)end
4677 mdr
->reply_extra_bl
= dirbl
;
4679 // bump popularity. NOTE: this doesn't quite capture it.
4680 mds
->balancer
->hit_dir(dir
, META_POP_IRD
, -1, numfiles
);
4684 respond_to_request(mdr
, 0);
4689 // ===============================================================================
4694 * finisher for basic inode updates
// C_MDS_inode_update_finish: log-completion context shared by setattr /
// open-truncate style inode updates. After the EUpdate is durable it
// commits the projected inode, kicks off any truncation, handles snap
// realm notification, and replies.
// NOTE(review): lossy extraction; the CInode* member declaration and some
// condition/brace lines are missing from this copy.
4696 class C_MDS_inode_update_finish
: public ServerLogContext
{
4698 bool truncating_smaller
, changed_ranges
, new_realm
;
4700 C_MDS_inode_update_finish(Server
*s
, MDRequestRef
& r
, CInode
*i
,
4701 bool sm
=false, bool cr
=false, bool nr
=false) :
4702 ServerLogContext(s
, r
), in(i
),
4703 truncating_smaller(sm
), changed_ranges(cr
), new_realm(nr
) { }
4704 void finish(int r
) override
{
4705 ceph_assert(r
== 0);
// Make the projected inode live and dirty in this log segment.
4708 in
->pop_and_dirty_projected_inode(mdr
->ls
);
4711 MDSRank
*mds
= get_mds();
4713 // notify any clients
4714 if (truncating_smaller
&& in
->inode
.is_truncating()) {
4715 mds
->locker
->issue_truncate(in
);
4716 mds
->mdcache
->truncate_inode(in
, mdr
->ls
);
// Presumably the new_realm branch: broadcast a snap-realm split and
// invalidate/update client realms -- guard line missing; TODO confirm.
4720 int op
= CEPH_SNAP_OP_SPLIT
;
4721 mds
->mdcache
->send_snap_update(in
, 0, op
);
4722 mds
->mdcache
->do_realm_invalidate_and_update_notify(in
, op
);
4725 get_mds()->balancer
->hit_inode(in
, META_POP_IWR
);
4727 server
->respond_to_request(mdr
, 0);
// Presumably only when client ranges changed: push new max_size to
// writers -- guard line missing; TODO confirm.
4730 get_mds()->locker
->share_inode_max_size(in
);
// handle_client_file_setlock: services flock/fcntl SETLK(W) requests --
// xlocks the inode's flocklock, builds a ceph_filelock from the request,
// and applies an unlock or a (possibly waiting) lock attempt against the
// inode's flock or fcntl lock state.
// NOTE(review): lossy extraction; `break`s, `else` branches, `return`s
// and braces are missing, so the switch/if structure below is not
// literal.
4734 void Server::handle_client_file_setlock(MDRequestRef
& mdr
)
4736 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4737 MutationImpl::LockOpVec lov
;
4739 // get the inode to operate on, and set up any locks needed for that
4740 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
4744 lov
.add_xlock(&cur
->flocklock
);
4745 /* acquire_locks will return true if it gets the locks. If it fails,
4746 it will redeliver this request at a later date, so drop the request.
4748 if (!mds
->locker
->acquire_locks(mdr
, lov
)) {
4749 dout(10) << "handle_client_file_setlock could not get locks!" << dendl
;
4753 // copy the lock change into a ceph_filelock so we can store/apply it
4754 ceph_filelock set_lock
;
4755 set_lock
.start
= req
->head
.args
.filelock_change
.start
;
4756 set_lock
.length
= req
->head
.args
.filelock_change
.length
;
4757 set_lock
.client
= req
->get_orig_source().num();
4758 set_lock
.owner
= req
->head
.args
.filelock_change
.owner
;
4759 set_lock
.pid
= req
->head
.args
.filelock_change
.pid
;
4760 set_lock
.type
= req
->head
.args
.filelock_change
.type
;
4761 bool will_wait
= req
->head
.args
.filelock_change
.wait
;
4763 dout(10) << "handle_client_file_setlock: " << set_lock
<< dendl
;
4765 ceph_lock_state_t
*lock_state
= NULL
;
4766 bool interrupt
= false;
4768 // get the appropriate lock state
// FLOCK_INTR / FCNTL_INTR presumably set `interrupt` before falling
// through to the plain cases -- those lines are missing; TODO confirm.
4769 switch (req
->head
.args
.filelock_change
.rule
) {
4770 case CEPH_LOCK_FLOCK_INTR
:
4773 case CEPH_LOCK_FLOCK
:
4774 lock_state
= cur
->get_flock_lock_state();
4777 case CEPH_LOCK_FCNTL_INTR
:
4780 case CEPH_LOCK_FCNTL
:
4781 lock_state
= cur
->get_fcntl_lock_state();
4785 dout(10) << "got unknown lock type " << set_lock
.type
4786 << ", dropping request!" << dendl
;
4787 respond_to_request(mdr
, -EOPNOTSUPP
);
4791 dout(10) << " state prior to lock change: " << *lock_state
<< dendl
;
// Unlock path: remove a waiting or held lock, wake any flock waiters,
// and reply success.
4792 if (CEPH_LOCK_UNLOCK
== set_lock
.type
) {
4793 list
<ceph_filelock
> activated_locks
;
4794 MDSContext::vec waiters
;
4795 if (lock_state
->is_waiting(set_lock
)) {
4796 dout(10) << " unlock removing waiting lock " << set_lock
<< dendl
;
4797 lock_state
->remove_waiting(set_lock
);
4798 cur
->take_waiting(CInode::WAIT_FLOCK
, waiters
);
4799 } else if (!interrupt
) {
4800 dout(10) << " unlock attempt on " << set_lock
<< dendl
;
4801 lock_state
->remove_lock(set_lock
, activated_locks
);
4802 cur
->take_waiting(CInode::WAIT_FLOCK
, waiters
);
4804 mds
->queue_waiters(waiters
);
4806 respond_to_request(mdr
, 0);
// Lock path: detect cancellation of a previous wait (EINTR), deadlock
// (EDEADLK), immediate failure without wait (EWOULDBLOCK), or park the
// request on the inode's flock waitlist until the lock can be granted.
4808 dout(10) << " lock attempt on " << set_lock
<< dendl
;
4809 bool deadlock
= false;
4810 if (mdr
->more()->flock_was_waiting
&&
4811 !lock_state
->is_waiting(set_lock
)) {
4812 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock
<< dendl
;
4813 respond_to_request(mdr
, -EINTR
);
4814 } else if (!lock_state
->add_lock(set_lock
, will_wait
, mdr
->more()->flock_was_waiting
, &deadlock
)) {
4815 dout(10) << " it failed on this attempt" << dendl
;
4816 // couldn't set lock right now
4818 respond_to_request(mdr
, -EDEADLK
);
4819 } else if (!will_wait
) {
4820 respond_to_request(mdr
, -EWOULDBLOCK
);
4822 dout(10) << " added to waiting list" << dendl
;
4823 ceph_assert(lock_state
->is_waiting(set_lock
));
4824 mdr
->more()->flock_was_waiting
= true;
4825 mds
->locker
->drop_locks(mdr
.get());
4826 mdr
->drop_local_auth_pins();
4827 mdr
->mark_event("failed to add lock, waiting");
4829 cur
->add_waiter(CInode::WAIT_FLOCK
, new C_MDS_RetryRequest(mdcache
, mdr
));
4832 respond_to_request(mdr
, 0);
4834 dout(10) << " state after lock change: " << *lock_state
<< dendl
;
// handle_client_file_readlock: services GETLK-style queries -- rdlocks
// the inode's flocklock, builds a ceph_filelock from the request, asks
// the appropriate lock state which existing lock (if any) conflicts, and
// returns the (possibly updated) filelock in the reply.
// NOTE(review): lossy extraction; `break`s, `return`s, braces and the
// reply-bufferlist declaration are missing from this copy.
4837 void Server::handle_client_file_readlock(MDRequestRef
& mdr
)
4839 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4840 MutationImpl::LockOpVec lov
;
4842 // get the inode to operate on, and set up any locks needed for that
4843 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
4847 /* acquire_locks will return true if it gets the locks. If it fails,
4848 it will redeliver this request at a later date, so drop the request.
4850 lov
.add_rdlock(&cur
->flocklock
);
4851 if (!mds
->locker
->acquire_locks(mdr
, lov
)) {
4852 dout(10) << "handle_client_file_readlock could not get locks!" << dendl
;
4856 // copy the lock change into a ceph_filelock so we can store/apply it
4857 ceph_filelock checking_lock
;
4858 checking_lock
.start
= req
->head
.args
.filelock_change
.start
;
4859 checking_lock
.length
= req
->head
.args
.filelock_change
.length
;
4860 checking_lock
.client
= req
->get_orig_source().num();
4861 checking_lock
.owner
= req
->head
.args
.filelock_change
.owner
;
4862 checking_lock
.pid
= req
->head
.args
.filelock_change
.pid
;
4863 checking_lock
.type
= req
->head
.args
.filelock_change
.type
;
4865 // get the appropriate lock state
// Unlike setlock, only the plain FLOCK/FCNTL rules are valid here;
// anything else is rejected with EINVAL.
4866 ceph_lock_state_t
*lock_state
= NULL
;
4867 switch (req
->head
.args
.filelock_change
.rule
) {
4868 case CEPH_LOCK_FLOCK
:
4869 lock_state
= cur
->get_flock_lock_state();
4872 case CEPH_LOCK_FCNTL
:
4873 lock_state
= cur
->get_fcntl_lock_state();
4877 dout(10) << "got unknown lock type " << checking_lock
.type
<< dendl
;
4878 respond_to_request(mdr
, -EINVAL
);
// look_for_lock fills checking_lock with any conflicting holder; the
// result is encoded into the reply's extra bufferlist.
4881 lock_state
->look_for_lock(checking_lock
);
4884 encode(checking_lock
, lock_bl
);
4886 mdr
->reply_extra_bl
= lock_bl
;
4887 respond_to_request(mdr
, 0);
// handle_client_setattr: services SETATTR -- validates the target,
// takes locks for the attribute subset being changed, performs
// permission checks (chown/chgrp), projects the inode with the new
// attributes (including truncation), journals an EUpdate("setattr"),
// and replies via C_MDS_inode_update_finish.
// NOTE(review): lossy extraction; guard lines, `else` branches,
// `return`s and braces are missing, so control flow below cannot be
// read literally.
4890 void Server::handle_client_setattr(MDRequestRef
& mdr
)
4892 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4893 MutationImpl::LockOpVec lov
;
4894 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
// Snapshots are immutable; system inodes (below MDS_INO_SYSTEM_BASE,
// excluding base inodes) may not be modified.
4897 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4898 respond_to_request(mdr
, -EROFS
);
4901 if (cur
->ino() < MDS_INO_SYSTEM_BASE
&& !cur
->is_base()) {
4902 respond_to_request(mdr
, -EPERM
);
// Lock selection depends on which attribute groups the mask touches:
// ownership/mode/btime -> authlock, times/size -> filelock,
// ctime -> versionlock.
4906 __u32 mask
= req
->head
.args
.setattr
.mask
;
4907 __u32 access_mask
= MAY_WRITE
;
4910 if (mask
& (CEPH_SETATTR_MODE
|CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_BTIME
|CEPH_SETATTR_KILL_SGUID
))
4911 lov
.add_xlock(&cur
->authlock
);
4912 if (mask
& (CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
|CEPH_SETATTR_SIZE
))
4913 lov
.add_xlock(&cur
->filelock
);
4914 if (mask
& CEPH_SETATTR_CTIME
)
4915 lov
.add_wrlock(&cur
->versionlock
);
4917 if (!mds
->locker
->acquire_locks(mdr
, lov
))
// Changing uid/gid requires chown/chgrp permission respectively.
4920 if ((mask
& CEPH_SETATTR_UID
) && (cur
->inode
.uid
!= req
->head
.args
.setattr
.uid
))
4921 access_mask
|= MAY_CHOWN
;
4923 if ((mask
& CEPH_SETATTR_GID
) && (cur
->inode
.gid
!= req
->head
.args
.setattr
.gid
))
4924 access_mask
|= MAY_CHGRP
;
4926 if (!check_access(mdr
, cur
, access_mask
))
4929 // trunc from bigger -> smaller?
4930 auto pip
= cur
->get_projected_inode();
4932 uint64_t old_size
= std::max
<uint64_t>(pip
->size
, req
->head
.args
.setattr
.old_size
);
4934 // ENOSPC on growing file while full, but allow shrinks
4935 if (is_full
&& req
->head
.args
.setattr
.size
> old_size
) {
4936 dout(20) << __func__
<< ": full, responding ENOSPC to setattr with larger size" << dendl
;
4937 respond_to_request(mdr
, -ENOSPC
);
// Shrinking truncate must wait for any truncate already in flight.
4941 bool truncating_smaller
= false;
4942 if (mask
& CEPH_SETATTR_SIZE
) {
4943 truncating_smaller
= req
->head
.args
.setattr
.size
< old_size
;
4944 if (truncating_smaller
&& pip
->is_truncating()) {
4945 dout(10) << " waiting for pending truncate from " << pip
->truncate_from
4946 << " to " << pip
->truncate_size
<< " to complete on " << *cur
<< dendl
;
4947 mds
->locker
->drop_locks(mdr
.get());
4948 mdr
->drop_local_auth_pins();
4949 cur
->add_waiter(CInode::WAIT_TRUNC
, new C_MDS_RetryRequest(mdcache
, mdr
));
4954 bool changed_ranges
= false;
// Journal the change as an EUpdate("setattr") against the projected
// inode.
4957 mdr
->ls
= mdlog
->get_current_segment();
4958 EUpdate
*le
= new EUpdate(mdlog
, "setattr");
4959 mdlog
->start_entry(le
);
4961 auto &pi
= cur
->project_inode();
4963 if (mask
& CEPH_SETATTR_UID
)
4964 pi
.inode
.uid
= req
->head
.args
.setattr
.uid
;
4965 if (mask
& CEPH_SETATTR_GID
)
4966 pi
.inode
.gid
= req
->head
.args
.setattr
.gid
;
4968 if (mask
& CEPH_SETATTR_MODE
)
4969 pi
.inode
.mode
= (pi
.inode
.mode
& ~07777) | (req
->head
.args
.setattr
.mode
& 07777);
// chown/chgrp (or explicit KILL_SGUID) on an executable regular file
// clears setuid/setgid, matching POSIX semantics.
4970 else if ((mask
& (CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_KILL_SGUID
)) &&
4971 S_ISREG(pi
.inode
.mode
) &&
4972 (pi
.inode
.mode
& (S_IXUSR
|S_IXGRP
|S_IXOTH
))) {
4973 pi
.inode
.mode
&= ~(S_ISUID
|S_ISGID
);
4976 if (mask
& CEPH_SETATTR_MTIME
)
4977 pi
.inode
.mtime
= req
->head
.args
.setattr
.mtime
;
4978 if (mask
& CEPH_SETATTR_ATIME
)
4979 pi
.inode
.atime
= req
->head
.args
.setattr
.atime
;
4980 if (mask
& CEPH_SETATTR_BTIME
)
4981 pi
.inode
.btime
= req
->head
.args
.setattr
.btime
;
4982 if (mask
& (CEPH_SETATTR_ATIME
| CEPH_SETATTR_MTIME
| CEPH_SETATTR_BTIME
))
4983 pi
.inode
.time_warp_seq
++; // maybe not a timewarp, but still a serialization point.
// Size change: shrink uses the two-phase truncate machinery (journal a
// truncate_start); growth just bumps size/rbytes and mtime, then client
// max_size ranges are recomputed.
4984 if (mask
& CEPH_SETATTR_SIZE
) {
4985 if (truncating_smaller
) {
4986 pi
.inode
.truncate(old_size
, req
->head
.args
.setattr
.size
);
4987 le
->metablob
.add_truncate_start(cur
->ino());
4989 pi
.inode
.size
= req
->head
.args
.setattr
.size
;
4990 pi
.inode
.rstat
.rbytes
= pi
.inode
.size
;
4992 pi
.inode
.mtime
= mdr
->get_op_stamp();
4994 // adjust client's max_size?
4995 CInode::mempool_inode::client_range_map new_ranges
;
4996 bool max_increased
= false;
4997 mds
->locker
->calc_new_client_ranges(cur
, pi
.inode
.size
, true, &new_ranges
, &max_increased
);
4998 if (pi
.inode
.client_ranges
!= new_ranges
) {
4999 dout(10) << " client_ranges " << pi
.inode
.client_ranges
<< " -> " << new_ranges
<< dendl
;
5000 pi
.inode
.client_ranges
= new_ranges
;
5001 changed_ranges
= true;
// Standard projected-inode bookkeeping: version, ctime/rctime,
// change_attr.
5005 pi
.inode
.version
= cur
->pre_dirty();
5006 pi
.inode
.ctime
= mdr
->get_op_stamp();
5007 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
5008 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
5009 pi
.inode
.change_attr
++;
5012 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5013 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5014 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5016 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
,
5017 truncating_smaller
, changed_ranges
));
5019 // flush immediately if there are readers/writers waiting
5020 if (mdr
->is_xlocked(&cur
->filelock
) &&
5021 (cur
->get_caps_wanted() & (CEPH_CAP_FILE_RD
|CEPH_CAP_FILE_WR
)))
5022 mds
->mdlog
->flush();
// do_open_truncate: O_TRUNC continuation of handle_client_open() -- the
// locks are already held by the caller. Issues caps, projects the inode
// truncated to zero, journals an EUpdate("open_truncate"), and replies
// via C_MDS_inode_update_finish.
// NOTE(review): lossy extraction; braces, the CDentry* declaration and
// part of the journal_and_reply() argument list are missing from this
// copy.
5025 /* Takes responsibility for mdr */
5026 void Server::do_open_truncate(MDRequestRef
& mdr
, int cmode
)
5028 CInode
*in
= mdr
->in
[0];
5029 client_t client
= mdr
->get_client();
5032 dout(10) << "do_open_truncate " << *in
<< dendl
;
// Issue caps for the requested open mode before projecting the truncate.
5034 SnapRealm
*realm
= in
->find_snaprealm();
5035 Capability
*cap
= mds
->locker
->issue_new_caps(in
, cmode
, mdr
, realm
);
5037 mdr
->ls
= mdlog
->get_current_segment();
5038 EUpdate
*le
= new EUpdate(mdlog
, "open_truncate");
5039 mdlog
->start_entry(le
);
// Project the inode: bump version/change_attr, stamp times, then start
// a two-phase truncate down to size 0.
5042 auto &pi
= in
->project_inode();
5043 pi
.inode
.version
= in
->pre_dirty();
5044 pi
.inode
.mtime
= pi
.inode
.ctime
= mdr
->get_op_stamp();
5045 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
5046 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
5047 pi
.inode
.change_attr
++;
5049 uint64_t old_size
= std::max
<uint64_t>(pi
.inode
.size
, mdr
->client_request
->head
.args
.open
.old_size
);
5051 pi
.inode
.truncate(old_size
, 0);
5052 le
->metablob
.add_truncate_start(in
->ino());
// Writable open: seed the client's writable byte range at one layout
// increment and mark the cap client-writeable.
5055 bool changed_ranges
= false;
5056 if (cap
&& (cmode
& CEPH_FILE_MODE_WR
)) {
5057 pi
.inode
.client_ranges
[client
].range
.first
= 0;
5058 pi
.inode
.client_ranges
[client
].range
.last
= pi
.inode
.get_layout_size_increment();
5059 pi
.inode
.client_ranges
[client
].follows
= realm
->get_newest_seq();
5060 changed_ranges
= true;
5061 cap
->mark_clientwriteable();
5064 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
5066 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
5067 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, in
);
5069 // make sure ino gets into the journal
5070 le
->metablob
.add_opened_ino(in
->ino());
5072 mdr
->o_trunc
= true;
// Include the dentry in the reply if the client asked for it.
5075 if (mdr
->client_request
->get_dentry_wanted()) {
5076 ceph_assert(mdr
->dn
[0].size());
5077 dn
= mdr
->dn
[0].back();
5080 journal_and_reply(mdr
, in
, dn
, le
, new C_MDS_inode_update_finish(this, mdr
, in
, old_size
> 0,
5082 // Although the `open` part can give an early reply, the truncation won't
5083 // happen until our EUpdate is persistent, to give the client a prompt
5084 // response we must also flush that event.
5089 /* This function cleans up the passed mdr */
5090 void Server::handle_client_setlayout(MDRequestRef
& mdr
)
5092 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5093 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
5096 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5097 respond_to_request(mdr
, -EROFS
);
5100 if (!cur
->is_file()) {
5101 respond_to_request(mdr
, -EINVAL
);
5104 if (cur
->get_projected_inode()->size
||
5105 cur
->get_projected_inode()->truncate_seq
> 1) {
5106 respond_to_request(mdr
, -ENOTEMPTY
);
5111 file_layout_t layout
= cur
->get_projected_inode()->layout
;
5112 // save existing layout for later
5113 const auto old_layout
= layout
;
5115 int access
= MAY_WRITE
;
5117 if (req
->head
.args
.setlayout
.layout
.fl_object_size
> 0)
5118 layout
.object_size
= req
->head
.args
.setlayout
.layout
.fl_object_size
;
5119 if (req
->head
.args
.setlayout
.layout
.fl_stripe_unit
> 0)
5120 layout
.stripe_unit
= req
->head
.args
.setlayout
.layout
.fl_stripe_unit
;
5121 if (req
->head
.args
.setlayout
.layout
.fl_stripe_count
> 0)
5122 layout
.stripe_count
=req
->head
.args
.setlayout
.layout
.fl_stripe_count
;
5123 if (req
->head
.args
.setlayout
.layout
.fl_pg_pool
> 0) {
5124 layout
.pool_id
= req
->head
.args
.setlayout
.layout
.fl_pg_pool
;
5126 // make sure we have as new a map as the client
5127 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
5128 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
5133 // Don't permit layout modifications without 'p' caps
5134 if (layout
!= old_layout
) {
5135 access
|= MAY_SET_VXATTR
;
5138 if (!layout
.is_valid()) {
5139 dout(10) << "bad layout" << dendl
;
5140 respond_to_request(mdr
, -EINVAL
);
5143 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
5144 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
5145 respond_to_request(mdr
, -EINVAL
);
5149 MutationImpl::LockOpVec lov
;
5150 lov
.add_xlock(&cur
->filelock
);
5151 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5154 if (!check_access(mdr
, cur
, access
))
5158 auto &pi
= cur
->project_inode();
5159 pi
.inode
.layout
= layout
;
5160 // add the old pool to the inode
5161 pi
.inode
.add_old_pool(old_layout
.pool_id
);
5162 pi
.inode
.version
= cur
->pre_dirty();
5163 pi
.inode
.ctime
= mdr
->get_op_stamp();
5164 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
5165 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
5166 pi
.inode
.change_attr
++;
5169 mdr
->ls
= mdlog
->get_current_segment();
5170 EUpdate
*le
= new EUpdate(mdlog
, "setlayout");
5171 mdlog
->start_entry(le
);
5172 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5173 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5174 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5176 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
// Take the locks needed to modify an inode's policy (layout/pin/quota):
// xlock policylock, xlock or rdlock snaplock, and rdlock the parent's
// snap/layout as needed.  Returns true when all locks are held.
bool Server::xlock_policylock(MDRequestRef& mdr, CInode *in, bool want_layout, bool xlock_snaplock)
{
  if (mdr->locking_state & MutationImpl::ALL_LOCKED)
    return true;  // already fully locked on a previous pass

  MutationImpl::LockOpVec lov;
  lov.add_xlock(&in->policylock);
  if (xlock_snaplock)
    lov.add_xlock(&in->snaplock);
  else
    lov.add_rdlock(&in->snaplock);
  if (!mds->locker->acquire_locks(mdr, lov))
    return false;

  if (want_layout && in->get_projected_inode()->has_layout()) {
    // the inode itself carries a layout; remember it and stop searching
    mdr->dir_layout = in->get_projected_inode()->layout;
    want_layout = false;
  }
  if (CDentry *pdn = in->get_projected_parent_dn(); pdn) {
    if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 0, want_layout))
      return false;
  }

  mdr->locking_state |= MutationImpl::ALL_LOCKED;
  return true;
}
// Look up an inode by ino and ensure we are its auth MDS.
// Responds with -ESTALE (missing/purging) or forwards the request to
// the auth rank (and returns nullptr) when the inode is unusable here.
CInode* Server::try_get_auth_inode(MDRequestRef& mdr, inodeno_t ino)
{
  CInode *in = mdcache->get_inode(ino);
  if (!in || in->state_test(CInode::STATE_PURGING)) {
    respond_to_request(mdr, -ESTALE);
    return nullptr;
  }
  if (!in->is_auth()) {
    // not ours; hand the request to the authoritative rank
    mdcache->request_forward(mdr, in->authority().first);
    return nullptr;
  }

  return in;
}
// Set the default file layout policy on a directory.
void Server::handle_client_setdirlayout(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;

  // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
  CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!cur)
    return;

  if (!cur->is_dir()) {
    respond_to_request(mdr, -ENOTDIR);
    return;
  }

  if (!xlock_policylock(mdr, cur, true))
    return;

  // start from the directory's own layout, else the inherited one,
  // else the filesystem default
  const auto old_pi = cur->get_projected_inode();
  file_layout_t layout;
  if (old_pi->has_layout())
    layout = old_pi->layout;
  else if (mdr->dir_layout != file_layout_t())
    layout = mdr->dir_layout;
  else
    layout = mdcache->default_file_layout;

  // Level of access required to complete
  int access = MAY_WRITE;

  const auto old_layout = layout;

  // only fields the client actually set (>0) override the current layout
  if (req->head.args.setlayout.layout.fl_object_size > 0)
    layout.object_size = req->head.args.setlayout.layout.fl_object_size;
  if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
    layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
  if (req->head.args.setlayout.layout.fl_stripe_count > 0)
    layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
  if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
    layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
    // make sure we have as new a map as the client
    if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
      mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
      return;
    }
  }

  // Don't permit layout modifications without 'p' caps
  if (layout != old_layout) {
    access |= MAY_SET_VXATTR;
  }

  if (!layout.is_valid()) {
    dout(10) << "bad layout" << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }
  if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
    dout(10) << " invalid data pool " << layout.pool_id << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }

  if (!check_access(mdr, cur, access))
    return;

  auto &pi = cur->project_inode();
  pi.inode.layout = layout;
  pi.inode.version = cur->pre_dirty();

  // journal the change and reply when durable
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "setlayout");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);

  mdr->no_early_reply = true;
  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
}
// Parse one "layout"/"layout.*" vxattr key/value into *layout.
// "layout" takes a key=value list (parsed with Boost.Spirit) and recurses
// per field with validate=false so intermediate states aren't rejected.
// Returns 0 on success or a negative errno.
int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
				file_layout_t *layout, bool validate)
{
  dout(20) << "parse_layout_vxattr name " << name << " value '" << value << "'" << dendl;
  try {
    if (name == "layout") {
      string::iterator begin = value.begin();
      string::iterator end = value.end();
      keys_and_values<string::iterator> p;    // create instance of parser
      std::map<string, string> m;             // map to receive results
      if (!qi::parse(begin, end, p, m)) {     // returns true if successful
	return -EINVAL;
      }
      string left(begin, end);
      dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
      if (!left.empty())
	return -EINVAL;  // trailing junk the parser could not consume
      for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
	// Skip validation on each attr, we do it once at the end (avoid
	// rejecting intermediate states if the overall result is ok)
	int r = parse_layout_vxattr(string("layout.") + q->first, q->second,
				    osdmap, layout, false);
	if (r < 0)
	  return r;
      }
    } else if (name == "layout.object_size") {
      layout->object_size = boost::lexical_cast<unsigned>(value);
    } else if (name == "layout.stripe_unit") {
      layout->stripe_unit = boost::lexical_cast<unsigned>(value);
    } else if (name == "layout.stripe_count") {
      layout->stripe_count = boost::lexical_cast<unsigned>(value);
    } else if (name == "layout.pool") {
      try {
	// value may be a numeric pool id ...
	layout->pool_id = boost::lexical_cast<unsigned>(value);
      } catch (boost::bad_lexical_cast const&) {
	// ... or a pool name; resolve it against the osdmap
	int64_t pool = osdmap.lookup_pg_pool_name(value);
	if (pool < 0) {
	  dout(10) << " unknown pool " << value << dendl;
	  return -ENOENT;
	}
	layout->pool_id = pool;
      }
    } else if (name == "layout.pool_namespace") {
      layout->pool_ns = value;
    } else {
      dout(10) << " unknown layout vxattr " << name << dendl;
      return -EINVAL;
    }
  } catch (boost::bad_lexical_cast const&) {
    dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
    return -EINVAL;
  }

  if (validate && !layout->is_valid()) {
    dout(10) << "bad layout" << dendl;
    return -EINVAL;
  }
  if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
    dout(10) << " invalid data pool " << layout->pool_id << dendl;
    return -EINVAL;
  }
  return 0;
}
// Parse one "quota"/"quota.*" vxattr key/value into *quota.
// An empty "quota" value is a no-op (used by create_quota_realm()).
// Returns 0 on success or a negative errno.
int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
{
  dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
  try {
    if (name == "quota") {
      string::iterator begin = value.begin();
      string::iterator end = value.end();
      if (begin == end) {
	// keep quota unchanged. (for create_quota_realm())
	return 0;
      }
      keys_and_values<string::iterator> p;    // create instance of parser
      std::map<string, string> m;             // map to receive results
      if (!qi::parse(begin, end, p, m)) {     // returns true if successful
	return -EINVAL;
      }
      string left(begin, end);
      dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
      if (!left.empty())
	return -EINVAL;  // trailing junk the parser could not consume
      for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
	int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota);
	if (r < 0)
	  return r;
      }
    } else if (name == "quota.max_bytes") {
      int64_t q = boost::lexical_cast<int64_t>(value);
      if (q < 0)
	return -EINVAL;
      quota->max_bytes = q;
    } else if (name == "quota.max_files") {
      int64_t q = boost::lexical_cast<int64_t>(value);
      if (q < 0)
	return -EINVAL;
      quota->max_files = q;
    } else {
      dout(10) << " unknown quota vxattr " << name << dendl;
      return -EINVAL;
    }
  } catch (boost::bad_lexical_cast const&) {
    dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
    return -EINVAL;
  }

  if (!quota->is_valid()) {
    dout(10) << "bad quota" << dendl;
    return -EINVAL;
  }
  return 0;
}
// Ask the auth MDS of 'in' to create a quota snaprealm by sending it an
// internal SETXATTR "ceph.quota" request with an empty value (which
// parse_quota_vxattr treats as "keep quota unchanged").
void Server::create_quota_realm(CInode *in)
{
  dout(10) << __func__ << " " << *in << dendl;

  auto req = make_message<MClientRequest>(CEPH_MDS_OP_SETXATTR);
  req->set_filepath(filepath(in->ino()));
  req->set_string2("ceph.quota");
  // empty vxattr value
  req->set_tid(mds->issue_tid());

  mds->send_message_mds(req, in->authority().first);
}
/*
 * Verify that the file layout attribute carried by client
 * is well-formatted.
 * Return 0 on success, otherwise this function takes
 * responsibility for the passed mdr.
 */
int Server::check_layout_vxattr(MDRequestRef& mdr,
				string name,
				string value,
				file_layout_t *layout)
{
  const cref_t<MClientRequest> &req = mdr->client_request;
  epoch_t epoch;
  int r;

  mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
      r = parse_layout_vxattr(name, value, osdmap, layout);
      epoch = osdmap.get_epoch();
    });

  if (r == -ENOENT) {

    // we don't have the specified pool, make sure our map
    // is newer than or as new as the client.
    epoch_t req_epoch = req->get_osdmap_epoch();

    if (req_epoch > epoch) {

      // well, our map is older. consult mds.
      Context *fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));

      if (!mds->objecter->wait_for_map(req_epoch, fin))
	return r; // wait, fin will retry this request later

      delete fin;

      // now we have at least as new a map as the client, try again.
      mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
	  r = parse_layout_vxattr(name, value, osdmap, layout);
	  epoch = osdmap.get_epoch();
	});

      ceph_assert(epoch >= req_epoch); // otherwise wait_for_map() told a lie

    } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {

      // For compatibility with client w/ old code, we still need get the
      // latest map. One day if COMPACT_VERSION of MClientRequest >=3,
      // we can remove those code.
      mdr->waited_for_osdmap = true;
      mds->objecter->wait_for_latest_osdmap(new C_IO_Wrapper(
			      mds, new C_MDS_RetryRequest(mdcache, mdr)));
      return r;
    }
  }

  if (r < 0) {

    if (r == -ENOENT)
      r = -EINVAL;  // hide internal "pool not found" as a generic bad layout

    respond_to_request(mdr, r);
    return r;
  }

  // all is well
  return 0;
}
// Dispatch a setxattr on a "ceph.*" virtual xattr (dir/file layout, quota,
// dir pin).  Each branch acquires its own locks, projects the inode update,
// and leaves 'pip' pointing at the projected inode for the common
// timestamp/version epilogue below.
void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur)
{
  const cref_t<MClientRequest> &req = mdr->client_request;
  string name(req->get_path2());
  bufferlist bl = req->get_data();
  string value (bl.c_str(), bl.length());
  dout(10) << "handle_set_vxattr " << name
	   << " val " << value.length()
	   << " bytes on " << *cur
	   << dendl;

  CInode::mempool_inode *pip = nullptr;
  string rest;

  if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
    return;
  }

  bool new_realm = false;
  if (name.compare(0, 15, "ceph.dir.layout") == 0) {
    if (!cur->is_dir()) {
      respond_to_request(mdr, -EINVAL);
      return;
    }

    if (!xlock_policylock(mdr, cur, true))
      return;

    // start from the dir's own layout, else inherited, else default
    file_layout_t layout;
    if (cur->get_projected_inode()->has_layout())
      layout = cur->get_projected_inode()->layout;
    else if (mdr->dir_layout != file_layout_t())
      layout = mdr->dir_layout;
    else
      layout = mdcache->default_file_layout;

    rest = name.substr(name.find("layout"));
    if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
      return;  // check_layout_vxattr already replied or queued a retry

    auto &pi = cur->project_inode();
    pi.inode.layout = layout;
    mdr->no_early_reply = true;
    pip = &pi.inode;
  } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
    if (!cur->is_file()) {
      respond_to_request(mdr, -EINVAL);
      return;
    }
    if (cur->get_projected_inode()->size ||
	cur->get_projected_inode()->truncate_seq > 1) {
      // layout is immutable once the file has (or had) data
      respond_to_request(mdr, -ENOTEMPTY);
      return;
    }
    file_layout_t layout = cur->get_projected_inode()->layout;
    rest = name.substr(name.find("layout"));
    if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
      return;

    MutationImpl::LockOpVec lov;
    lov.add_xlock(&cur->filelock);
    if (!mds->locker->acquire_locks(mdr, lov))
      return;

    auto &pi = cur->project_inode();
    int64_t old_pool = pi.inode.layout.pool_id;
    pi.inode.add_old_pool(old_pool);  // remember prior pool for backtrace cleanup
    pi.inode.layout = layout;
    pip = &pi.inode;
  } else if (name.compare(0, 10, "ceph.quota") == 0) {
    if (!cur->is_dir() || cur->is_root()) {
      respond_to_request(mdr, -EINVAL);
      return;
    }

    quota_info_t quota = cur->get_projected_inode()->quota;

    rest = name.substr(name.find("quota"));
    int r = parse_quota_vxattr(rest, value, &quota);
    if (r < 0) {
      respond_to_request(mdr, r);
      return;
    }

    // enabling quota on an inode without a snaprealm requires creating one
    if (quota.is_enable() && !cur->get_projected_srnode())
      new_realm = true;

    if (!xlock_policylock(mdr, cur, false, new_realm))
      return;

    auto &pi = cur->project_inode(false, new_realm);
    pi.inode.quota = quota;

    if (new_realm) {
      SnapRealm *realm = cur->find_snaprealm();
      auto seq = realm->get_newest_seq();
      auto &newsnap = *pi.snapnode;
      newsnap.created = seq;
      newsnap.seq = seq;
    }
    mdr->no_early_reply = true;
    pip = &pi.inode;

    client_t exclude_ct = mdr->get_client();
    mdcache->broadcast_quota_to_client(cur, exclude_ct, true);
  } else if (name.find("ceph.dir.pin") == 0) {
    if (!cur->is_dir() || cur->is_root()) {
      respond_to_request(mdr, -EINVAL);
      return;
    }

    mds_rank_t rank;
    try {
      rank = boost::lexical_cast<mds_rank_t>(value);
      if (rank < 0) rank = MDS_RANK_NONE;  // any negative value unpins
    } catch (boost::bad_lexical_cast const&) {
      dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
      respond_to_request(mdr, -EINVAL);
      return;
    }

    if (!xlock_policylock(mdr, cur))
      return;

    auto &pi = cur->project_inode();
    cur->set_export_pin(rank);
    pip = &pi.inode;
  } else {
    dout(10) << " unknown vxattr " << name << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }

  // common epilogue: stamp the projected inode chosen by the branch above
  pip->change_attr++;
  pip->ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pip->rstat.rctime)
    pip->rstat.rctime = mdr->get_op_stamp();
  pip->version = cur->pre_dirty();
  if (cur->is_file())
    pip->update_backtrace();

  // journal the change and reply when durable
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);

  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
								   false, false, new_realm));
  return;
}
// Handle removexattr on a "ceph.*" virtual xattr.  Only the directory
// layout and the layout pool_namespace can be "removed"; anything else
// is -ENODATA.
void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur)
{
  const cref_t<MClientRequest> &req = mdr->client_request;
  string name(req->get_path2());

  dout(10) << __func__ << " " << name << " on " << *cur << dendl;

  if (name == "ceph.dir.layout") {
    if (!cur->is_dir()) {
      respond_to_request(mdr, -ENODATA);
      return;
    }
    if (cur->is_root()) {
      dout(10) << "can't remove layout policy on the root directory" << dendl;
      respond_to_request(mdr, -EINVAL);
      return;
    }

    if (!cur->get_projected_inode()->has_layout()) {
      respond_to_request(mdr, -ENODATA);
      return;
    }

    MutationImpl::LockOpVec lov;
    lov.add_xlock(&cur->policylock);
    if (!mds->locker->acquire_locks(mdr, lov))
      return;

    auto &pi = cur->project_inode();
    pi.inode.clear_layout();
    pi.inode.version = cur->pre_dirty();

    // journal the change and reply when durable
    mdr->ls = mdlog->get_current_segment();
    EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
    mdlog->start_entry(le);
    le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
    mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
    mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);

    mdr->no_early_reply = true;
    journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
    return;
  } else if (name == "ceph.dir.layout.pool_namespace"
	  || name == "ceph.file.layout.pool_namespace") {
    // Namespace is the only layout field that has a meaningful
    // null/none value (empty string, means default layout). Is equivalent
    // to a setxattr with empty string: pass through the empty payload of
    // the rmxattr request to do this.
    handle_set_vxattr(mdr, cur);
    return;
  }

  respond_to_request(mdr, -ENODATA);
}
// Log-completion context for xattr updates: applies the projected inode
// once the EUpdate is durable, then replies to the client.
class C_MDS_inode_xattr_update_finish : public ServerLogContext {
  CInode *in;
public:
  C_MDS_inode_xattr_update_finish(Server *s, MDRequestRef& r, CInode *i) :
    ServerLogContext(s, r), in(i) { }
  void finish(int r) override {
    ceph_assert(r == 0);

    // apply the projected inode now that the journal entry is safe
    in->pop_and_dirty_projected_inode(mdr->ls);

    mdr->apply();

    get_mds()->balancer->hit_inode(in, META_POP_IWR);

    server->respond_to_request(mdr, 0);
  }
};
// Handle a client setxattr: "ceph.*" names route to the vxattr handler;
// everything else is stored in the inode's xattr map (subject to the
// CEPH_XATTR_CREATE/REPLACE/REMOVE flags and a total-size cap).
void Server::handle_client_setxattr(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;
  string name(req->get_path2());

  // magic ceph.* namespace?
  if (name.compare(0, 5, "ceph.") == 0) {
    // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
    CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
    if (!cur)
      return;

    handle_set_vxattr(mdr, cur);
    return;
  }

  CInode *cur = rdlock_path_pin_ref(mdr, true);
  if (!cur)
    return;

  if (mdr->snapid != CEPH_NOSNAP) {
    respond_to_request(mdr, -EROFS);
    return;
  }

  int flags = req->head.args.setxattr.flags;

  MutationImpl::LockOpVec lov;
  lov.add_xlock(&cur->xattrlock);
  if (!mds->locker->acquire_locks(mdr, lov))
    return;

  if (!check_access(mdr, cur, MAY_WRITE))
    return;

  auto pxattrs = cur->get_projected_xattrs();
  size_t len = req->get_data().length();
  size_t inc = len + name.length();

  // check xattrs kv pairs size
  size_t cur_xattrs_size = 0;
  for (const auto& p : *pxattrs) {
    if ((flags & CEPH_XATTR_REPLACE) && (name.compare(p.first) == 0)) {
      continue;  // replaced pair's old size doesn't count against the cap
    }
    cur_xattrs_size += p.first.length() + p.second.length();
  }

  if (((cur_xattrs_size + inc) > g_conf()->mds_max_xattr_pairs_size)) {
    dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
	     << cur_xattrs_size << ", inc " << inc << dendl;
    respond_to_request(mdr, -ENOSPC);
    return;
  }

  if ((flags & CEPH_XATTR_CREATE) && pxattrs->count(mempool::mds_co::string(name))) {
    dout(10) << "setxattr '" << name << "' XATTR_CREATE and EEXIST on " << *cur << dendl;
    respond_to_request(mdr, -EEXIST);
    return;
  }
  if ((flags & CEPH_XATTR_REPLACE) && !pxattrs->count(mempool::mds_co::string(name))) {
    dout(10) << "setxattr '" << name << "' XATTR_REPLACE and ENODATA on " << *cur << dendl;
    respond_to_request(mdr, -ENODATA);
    return;
  }

  dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;

  // project the update
  auto &pi = cur->project_inode(true);
  pi.inode.version = cur->pre_dirty();
  pi.inode.ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
    pi.inode.rstat.rctime = mdr->get_op_stamp();
  pi.inode.change_attr++;
  pi.inode.xattr_version++;
  auto &px = *pi.xattrs;
  if ((flags & CEPH_XATTR_REMOVE)) {
    px.erase(mempool::mds_co::string(name));
  } else {
    bufferptr b = buffer::create(len);
    if (len)
      req->get_data().begin().copy(len, b.c_str());
    auto em = px.emplace(std::piecewise_construct, std::forward_as_tuple(mempool::mds_co::string(name)), std::forward_as_tuple(b));
    if (!em.second)
      em.first->second = b;  // key existed; overwrite the value
  }

  // journal the change and reply when durable
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "setxattr");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);

  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
}
// Handle a client removexattr: "ceph.*" names route to the vxattr remover;
// everything else is erased from the inode's xattr map.
void Server::handle_client_removexattr(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;
  std::string name(req->get_path2());

  if (name.compare(0, 5, "ceph.") == 0) {
    // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
    CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
    if (!cur)
      return;

    handle_remove_vxattr(mdr, cur);
    return;
  }

  CInode* cur = rdlock_path_pin_ref(mdr, true);
  if (!cur)
    return;

  if (mdr->snapid != CEPH_NOSNAP) {
    respond_to_request(mdr, -EROFS);
    return;
  }

  MutationImpl::LockOpVec lov;
  lov.add_xlock(&cur->xattrlock);
  if (!mds->locker->acquire_locks(mdr, lov))
    return;

  auto pxattrs = cur->get_projected_xattrs();
  if (pxattrs->count(mempool::mds_co::string(name)) == 0) {
    dout(10) << "removexattr '" << name << "' and ENODATA on " << *cur << dendl;
    respond_to_request(mdr, -ENODATA);
    return;
  }

  dout(10) << "removexattr '" << name << "' on " << *cur << dendl;

  // project the update
  auto &pi = cur->project_inode(true);
  auto &px = *pi.xattrs;
  pi.inode.version = cur->pre_dirty();
  pi.inode.ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
    pi.inode.rstat.rctime = mdr->get_op_stamp();
  pi.inode.change_attr++;
  pi.inode.xattr_version++;
  px.erase(mempool::mds_co::string(name));

  // journal the change and reply when durable
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "removexattr");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);

  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
}
5891 // =================================================================
5892 // DIRECTORY and NAMESPACE OPS
5895 // ------------------------------------------------
// Log-completion context shared by mknod/mkdir/symlink: once the journal
// entry is durable, links the new inode into the dentry, marks the new
// metadata dirty, and replies to the client.
class C_MDS_mknod_finish : public ServerLogContext {
  CDentry *dn;
  CInode *newi;
public:
  C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
    ServerLogContext(s, r), dn(d), newi(ni) {}
  void finish(int r) override {
    ceph_assert(r == 0);

    // link the inode
    dn->pop_projected_linkage();

    // be a bit hacky with the inode version, here.. we decrement it
    // just to keep mark_dirty() happen. (we didn't bother projecting
    // a new version of hte inode since it's just been created)
    newi->inode.version--;
    newi->mark_dirty(newi->inode.version + 1, mdr->ls);
    newi->mark_dirty_parent(mdr->ls, true);

    // mkdir?
    if (newi->inode.is_dir()) {
      CDir *dir = newi->get_dirfrag(frag_t());
      ceph_assert(dir);
      // same version trick as above, but for the new dirfrag's fnode
      dir->fnode.version--;
      dir->mark_dirty(dir->fnode.version + 1, mdr->ls);
      dir->mark_new(mdr->ls);
    }

    mdr->apply();

    MDRequestRef null_ref;
    get_mds()->mdcache->send_dentry_link(dn, null_ref);

    if (newi->inode.is_file())
      get_mds()->locker->share_inode_max_size(newi);

    // hit pop
    get_mds()->balancer->hit_inode(newi, META_POP_IWR);

    // reply
    server->respond_to_request(mdr, 0);
  }
};
// Create a new non-directory inode (mknod).  Regular files get immediate
// RDWR caps and a client writeable range, since MKNOD of a regular file is
// usually followed by writes (e.g. NFS reexport).
void Server::handle_client_mknod(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;
  client_t client = mdr->get_client();

  unsigned mode = req->head.args.mknod.mode;
  if ((mode & S_IFMT) == 0)
    mode |= S_IFREG;  // no file type given: default to regular file

  mdr->disable_lock_cache();
  CDentry *dn = rdlock_path_xlock_dentry(mdr, true, false, S_ISREG(mode));
  if (!dn)
    return;

  CDir *dir = dn->get_dir();
  CInode *diri = dir->get_inode();
  if (!check_access(mdr, diri, MAY_WRITE))
    return;
  if (!check_fragment_space(mdr, dn->get_dir()))
    return;

  // set layout
  file_layout_t layout;
  if (mdr->dir_layout != file_layout_t())
    layout = mdr->dir_layout;
  else
    layout = mdcache->default_file_layout;

  CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode, &layout);
  ceph_assert(newi);

  dn->push_projected_linkage(newi);

  newi->inode.rdev = req->head.args.mknod.rdev;
  newi->inode.version = dn->pre_dirty();
  newi->inode.rstat.rfiles = 1;
  if (layout.pool_id != mdcache->default_file_layout.pool_id)
    newi->inode.add_old_pool(mdcache->default_file_layout.pool_id);
  newi->inode.update_backtrace();

  snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
  SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
  ceph_assert(follows >= realm->get_newest_seq());

  // if the client created a _regular_ file via MKNOD, it's highly likely they'll
  // want to write to it (e.g., if they are reexporting NFS)
  if (S_ISREG(newi->inode.mode)) {
    // issue a cap on the file
    int cmode = CEPH_FILE_MODE_RDWR;
    Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
    if (cap) {
      cap->set_wanted(0);

      // put locks in excl mode
      newi->filelock.set_state(LOCK_EXCL);
      newi->authlock.set_state(LOCK_EXCL);
      newi->xattrlock.set_state(LOCK_EXCL);

      dout(15) << " setting a client_range too, since this is a regular file" << dendl;
      newi->inode.client_ranges[client].range.first = 0;
      newi->inode.client_ranges[client].range.last = newi->inode.layout.stripe_unit;
      newi->inode.client_ranges[client].follows = follows;
      cap->mark_clientwriteable();
    }
  }

  ceph_assert(dn->first == follows + 1);
  newi->first = dn->first;

  dout(10) << "mknod mode " << newi->inode.mode << " rdev " << newi->inode.rdev << dendl;

  // journal the creation and reply when durable
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "mknod");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  journal_allocated_inos(mdr, &le->metablob);

  mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
				    PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
  le->metablob.add_primary_dentry(dn, newi, true, true, true);

  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
  mds->balancer->maybe_fragment(dn->get_dir(), false);
}
/* This function takes responsibility for the passed mdr*/
void Server::handle_client_mkdir(MDRequestRef& mdr)
{
  // Create a new directory inode plus its (empty, complete) root dirfrag,
  // and issue caps on it to the creating client.
  const cref_t<MClientRequest> &req = mdr->client_request;

  mdr->disable_lock_cache();
  CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
  if (!dn)
    return;

  CDir *dir = dn->get_dir();
  CInode *diri = dir->get_inode();

  // mkdir check access
  if (!check_access(mdr, diri, MAY_WRITE))
    return;

  if (!check_fragment_space(mdr, dir))
    return;

  // new inode; force the type bits to "directory"
  unsigned mode = req->head.args.mkdir.mode;
  mode &= ~S_IFMT;
  mode |= S_IFDIR;
  CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
  ceph_assert(newi);

  // it's a directory.
  dn->push_projected_linkage(newi);

  newi->inode.version = dn->pre_dirty();
  newi->inode.rstat.rsubdirs = 1;
  newi->inode.update_backtrace();

  snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
  SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
  ceph_assert(follows >= realm->get_newest_seq());

  dout(12) << " follows " << follows << dendl;
  ceph_assert(dn->first == follows + 1);
  newi->first = dn->first;

  // ...and that new dir is empty.
  CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
  newdir->state_set(CDir::STATE_CREATING);
  newdir->mark_complete();
  newdir->fnode.version = newdir->pre_dirty();

  // journal the creation and reply when durable
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "mkdir");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  journal_allocated_inos(mdr, &le->metablob);
  mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
  le->metablob.add_primary_dentry(dn, newi, true, true);
  le->metablob.add_new_dir(newdir); // dirty AND complete AND new

  // issue a cap on the directory
  int cmode = CEPH_FILE_MODE_RDWR;
  Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
  if (cap) {
    cap->set_wanted(0);

    // put locks in excl mode
    newi->filelock.set_state(LOCK_EXCL);
    newi->authlock.set_state(LOCK_EXCL);
    newi->xattrlock.set_state(LOCK_EXCL);
  }

  // make sure this inode gets into the journal
  le->metablob.add_opened_ino(newi->ino());

  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));

  // We hit_dir (via hit_inode) in our finish callback, but by then we might
  // have overshot the split size (multiple mkdir in flight), so here is
  // an early chance to split the dir if this mkdir makes it oversized.
  mds->balancer->maybe_fragment(dir, false);
}
// Create a symlink inode whose target is the request's second path
// (path2); size/rbytes reflect the target string length.
void Server::handle_client_symlink(MDRequestRef& mdr)
{
  mdr->disable_lock_cache();
  CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
  if (!dn)
    return;

  CDir *dir = dn->get_dir();
  CInode *diri = dir->get_inode();

  if (!check_access(mdr, diri, MAY_WRITE))
    return;
  if (!check_fragment_space(mdr, dir))
    return;

  const cref_t<MClientRequest> &req = mdr->client_request;

  unsigned mode = S_IFLNK | 0777;  // symlinks always have mode 0777
  CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
  ceph_assert(newi);

  // it's a symlink
  dn->push_projected_linkage(newi);

  newi->symlink = req->get_path2();
  newi->inode.size = newi->symlink.length();
  newi->inode.rstat.rbytes = newi->inode.size;
  newi->inode.rstat.rfiles = 1;
  newi->inode.version = dn->pre_dirty();
  newi->inode.update_backtrace();

  newi->first = dn->first;

  // journal the creation and reply when durable
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "symlink");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  journal_allocated_inos(mdr, &le->metablob);
  mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
  le->metablob.add_primary_dentry(dn, newi, true, true);

  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
  mds->balancer->maybe_fragment(dir, false);
}
6169 void Server::handle_client_link(MDRequestRef
& mdr
)
6171 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6173 dout(7) << "handle_client_link " << req
->get_filepath()
6174 << " to " << req
->get_filepath2()
6177 mdr
->disable_lock_cache();
6182 if (req
->get_filepath2().depth() == 0) {
6183 targeti
= mdcache
->get_inode(req
->get_filepath2().get_ino());
6185 dout(10) << "ESTALE on path2, attempting recovery" << dendl
;
6186 mdcache
->find_ino_peers(req
->get_filepath2().get_ino(), new C_MDS_TryFindInode(this, mdr
));
6191 if (!(mdr
->locking_state
& MutationImpl::SNAP2_LOCKED
)) {
6192 CDentry
*pdn
= targeti
->get_projected_parent_dn();
6194 dout(7) << "target has no parent dn, failing..." << dendl
;
6195 respond_to_request(mdr
, -EINVAL
);
6198 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
, 1))
6200 mdr
->locking_state
|= MutationImpl::SNAP2_LOCKED
;
6203 destdn
= rdlock_path_xlock_dentry(mdr
, false);
6208 auto ret
= rdlock_two_paths_xlock_destdn(mdr
, false);
6213 if (!destdn
->get_projected_linkage()->is_null()) {
6214 respond_to_request(mdr
, -EEXIST
);
6218 targeti
= ret
.second
->get_projected_linkage()->get_inode();
6221 if (targeti
->is_dir()) {
6222 dout(7) << "target is a dir, failing..." << dendl
;
6223 respond_to_request(mdr
, -EINVAL
);
6227 CDir
*dir
= destdn
->get_dir();
6228 dout(7) << "handle_client_link link " << destdn
->get_name() << " in " << *dir
<< dendl
;
6229 dout(7) << "target is " << *targeti
<< dendl
;
6231 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
6232 MutationImpl::LockOpVec lov
;
6233 lov
.add_xlock(&targeti
->snaplock
);
6234 lov
.add_xlock(&targeti
->linklock
);
6236 if (!mds
->locker
->acquire_locks(mdr
, lov
))
6239 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
6242 if (targeti
->get_projected_inode()->nlink
== 0) {
6243 dout(7) << "target has no link, failing..." << dendl
;
6244 respond_to_request(mdr
, -ENOENT
);
6247 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
6248 if (!check_access(mdr
, targeti
, MAY_WRITE
))
6251 if (!check_access(mdr
, dir
->get_inode(), MAY_WRITE
))
6254 if (!check_fragment_space(mdr
, dir
))
6259 ceph_assert(g_conf()->mds_kill_link_at
!= 1);
6262 if (targeti
->is_auth())
6263 _link_local(mdr
, destdn
, targeti
);
6265 _link_remote(mdr
, true, destdn
, targeti
);
6266 mds
->balancer
->maybe_fragment(dir
, false);
6270 class C_MDS_link_local_finish
: public ServerLogContext
{
6277 C_MDS_link_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ti
,
6278 version_t dnpv_
, version_t tipv_
, bool ar
) :
6279 ServerLogContext(s
, r
), dn(d
), targeti(ti
),
6280 dnpv(dnpv_
), tipv(tipv_
), adjust_realm(ar
) { }
6281 void finish(int r
) override
{
6282 ceph_assert(r
== 0);
6283 server
->_link_local_finish(mdr
, dn
, targeti
, dnpv
, tipv
, adjust_realm
);
6288 void Server::_link_local(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
)
6290 dout(10) << "_link_local " << *dn
<< " to " << *targeti
<< dendl
;
6292 mdr
->ls
= mdlog
->get_current_segment();
6294 // predirty NEW dentry
6295 version_t dnpv
= dn
->pre_dirty();
6296 version_t tipv
= targeti
->pre_dirty();
6298 // project inode update
6299 auto &pi
= targeti
->project_inode();
6301 pi
.inode
.ctime
= mdr
->get_op_stamp();
6302 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
6303 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
6304 pi
.inode
.change_attr
++;
6305 pi
.inode
.version
= tipv
;
6307 bool adjust_realm
= false;
6308 if (!targeti
->is_projected_snaprealm_global()) {
6309 sr_t
*newsnap
= targeti
->project_snaprealm();
6310 targeti
->mark_snaprealm_global(newsnap
);
6311 targeti
->record_snaprealm_parent_dentry(newsnap
, NULL
, targeti
->get_projected_parent_dn(), true);
6312 adjust_realm
= true;
6316 EUpdate
*le
= new EUpdate(mdlog
, "link_local");
6317 mdlog
->start_entry(le
);
6318 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
6319 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1); // new dn
6320 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, 0, PREDIRTY_PRIMARY
); // targeti
6321 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
6322 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, targeti
);
6324 // do this after predirty_*, to avoid funky extra dnl arg
6325 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
6327 journal_and_reply(mdr
, targeti
, dn
, le
,
6328 new C_MDS_link_local_finish(this, mdr
, dn
, targeti
, dnpv
, tipv
, adjust_realm
));
6331 void Server::_link_local_finish(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
,
6332 version_t dnpv
, version_t tipv
, bool adjust_realm
)
6334 dout(10) << "_link_local_finish " << *dn
<< " to " << *targeti
<< dendl
;
6336 // link and unlock the NEW dentry
6337 CDentry::linkage_t
*dnl
= dn
->pop_projected_linkage();
6338 if (!dnl
->get_inode())
6339 dn
->link_remote(dnl
, targeti
);
6340 dn
->mark_dirty(dnpv
, mdr
->ls
);
6343 targeti
->pop_and_dirty_projected_inode(mdr
->ls
);
6347 MDRequestRef null_ref
;
6348 mdcache
->send_dentry_link(dn
, null_ref
);
6351 int op
= CEPH_SNAP_OP_SPLIT
;
6352 mds
->mdcache
->send_snap_update(targeti
, 0, op
);
6353 mds
->mdcache
->do_realm_invalidate_and_update_notify(targeti
, op
);
6356 // bump target popularity
6357 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
6358 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
6361 respond_to_request(mdr
, 0);
6365 // link / unlink remote
6367 class C_MDS_link_remote_finish
: public ServerLogContext
{
6373 C_MDS_link_remote_finish(Server
*s
, MDRequestRef
& r
, bool i
, CDentry
*d
, CInode
*ti
) :
6374 ServerLogContext(s
, r
), inc(i
), dn(d
), targeti(ti
),
6375 dpv(d
->get_projected_version()) {}
6376 void finish(int r
) override
{
6377 ceph_assert(r
== 0);
6378 server
->_link_remote_finish(mdr
, inc
, dn
, targeti
, dpv
);
6382 void Server::_link_remote(MDRequestRef
& mdr
, bool inc
, CDentry
*dn
, CInode
*targeti
)
6384 dout(10) << "_link_remote "
6385 << (inc
? "link ":"unlink ")
6386 << *dn
<< " to " << *targeti
<< dendl
;
6388 // 1. send LinkPrepare to dest (journal nlink++ prepare)
6389 mds_rank_t linkauth
= targeti
->authority().first
;
6390 if (mdr
->more()->witnessed
.count(linkauth
) == 0) {
6391 if (mds
->is_cluster_degraded() &&
6392 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(linkauth
)) {
6393 dout(10) << " targeti auth mds." << linkauth
<< " is not active" << dendl
;
6394 if (mdr
->more()->waiting_on_slave
.empty())
6395 mds
->wait_for_active_peer(linkauth
, new C_MDS_RetryRequest(mdcache
, mdr
));
6399 dout(10) << " targeti auth must prepare nlink++/--" << dendl
;
6402 op
= MMDSSlaveRequest::OP_LINKPREP
;
6404 op
= MMDSSlaveRequest::OP_UNLINKPREP
;
6405 auto req
= make_message
<MMDSSlaveRequest
>(mdr
->reqid
, mdr
->attempt
, op
);
6406 targeti
->set_object_info(req
->get_object_info());
6407 req
->op_stamp
= mdr
->get_op_stamp();
6408 if (auto& desti_srnode
= mdr
->more()->desti_srnode
)
6409 encode(*desti_srnode
, req
->desti_snapbl
);
6410 mds
->send_message_mds(req
, linkauth
);
6412 ceph_assert(mdr
->more()->waiting_on_slave
.count(linkauth
) == 0);
6413 mdr
->more()->waiting_on_slave
.insert(linkauth
);
6416 dout(10) << " targeti auth has prepared nlink++/--" << dendl
;
6418 ceph_assert(g_conf()->mds_kill_link_at
!= 2);
6420 if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
6421 delete desti_srnode
;
6422 desti_srnode
= NULL
;
6425 mdr
->set_mds_stamp(ceph_clock_now());
6428 mdr
->ls
= mdlog
->get_current_segment();
6429 EUpdate
*le
= new EUpdate(mdlog
, inc
? "link_remote":"unlink_remote");
6430 mdlog
->start_entry(le
);
6431 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
6432 if (!mdr
->more()->witnessed
.empty()) {
6433 dout(20) << " noting uncommitted_slaves " << mdr
->more()->witnessed
<< dendl
;
6434 le
->reqid
= mdr
->reqid
;
6435 le
->had_slaves
= true;
6436 mdcache
->add_uncommitted_master(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
6441 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1);
6442 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
6443 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
6446 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, -1);
6447 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
6448 le
->metablob
.add_null_dentry(dn
, true);
6449 dn
->push_projected_linkage();
6452 journal_and_reply(mdr
, (inc
? targeti
: nullptr), dn
, le
,
6453 new C_MDS_link_remote_finish(this, mdr
, inc
, dn
, targeti
));
6456 void Server::_link_remote_finish(MDRequestRef
& mdr
, bool inc
,
6457 CDentry
*dn
, CInode
*targeti
,
6460 dout(10) << "_link_remote_finish "
6461 << (inc
? "link ":"unlink ")
6462 << *dn
<< " to " << *targeti
<< dendl
;
6464 ceph_assert(g_conf()->mds_kill_link_at
!= 3);
6466 if (!mdr
->more()->witnessed
.empty())
6467 mdcache
->logged_master_update(mdr
->reqid
);
6470 // link the new dentry
6471 CDentry::linkage_t
*dnl
= dn
->pop_projected_linkage();
6472 if (!dnl
->get_inode())
6473 dn
->link_remote(dnl
, targeti
);
6474 dn
->mark_dirty(dpv
, mdr
->ls
);
6476 // unlink main dentry
6477 dn
->get_dir()->unlink_inode(dn
);
6478 dn
->pop_projected_linkage();
6479 dn
->mark_dirty(dn
->get_projected_version(), mdr
->ls
); // dirty old dentry
6484 MDRequestRef null_ref
;
6486 mdcache
->send_dentry_link(dn
, null_ref
);
6488 mdcache
->send_dentry_unlink(dn
, NULL
, null_ref
);
6490 // bump target popularity
6491 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
6492 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
6495 respond_to_request(mdr
, 0);
6498 // removing a new dn?
6499 dn
->get_dir()->try_remove_unlinked_dn(dn
);
6503 // remote linking/unlinking
6505 class C_MDS_SlaveLinkPrep
: public ServerLogContext
{
6509 C_MDS_SlaveLinkPrep(Server
*s
, MDRequestRef
& r
, CInode
*t
, bool ar
) :
6510 ServerLogContext(s
, r
), targeti(t
), adjust_realm(ar
) { }
6511 void finish(int r
) override
{
6512 ceph_assert(r
== 0);
6513 server
->_logged_slave_link(mdr
, targeti
, adjust_realm
);
6517 class C_MDS_SlaveLinkCommit
: public ServerContext
{
6521 C_MDS_SlaveLinkCommit(Server
*s
, MDRequestRef
& r
, CInode
*t
) :
6522 ServerContext(s
), mdr(r
), targeti(t
) { }
6523 void finish(int r
) override
{
6524 server
->_commit_slave_link(mdr
, r
, targeti
);
6528 void Server::handle_slave_link_prep(MDRequestRef
& mdr
)
6530 dout(10) << "handle_slave_link_prep " << *mdr
6531 << " on " << mdr
->slave_request
->get_object_info()
6534 ceph_assert(g_conf()->mds_kill_link_at
!= 4);
6536 CInode
*targeti
= mdcache
->get_inode(mdr
->slave_request
->get_object_info().ino
);
6537 ceph_assert(targeti
);
6538 dout(10) << "targeti " << *targeti
<< dendl
;
6539 CDentry
*dn
= targeti
->get_parent_dn();
6540 CDentry::linkage_t
*dnl
= dn
->get_linkage();
6541 ceph_assert(dnl
->is_primary());
6543 mdr
->set_op_stamp(mdr
->slave_request
->op_stamp
);
6545 mdr
->auth_pin(targeti
);
6547 //ceph_abort(); // test hack: make sure master can handle a slave that fails to prepare...
6548 ceph_assert(g_conf()->mds_kill_link_at
!= 5);
6551 mdr
->ls
= mdlog
->get_current_segment();
6552 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_link_prep", mdr
->reqid
, mdr
->slave_to_mds
,
6553 ESlaveUpdate::OP_PREPARE
, ESlaveUpdate::LINK
);
6554 mdlog
->start_entry(le
);
6556 auto &pi
= dnl
->get_inode()->project_inode();
6558 // update journaled target inode
6560 bool adjust_realm
= false;
6561 bool realm_projected
= false;
6562 if (mdr
->slave_request
->get_op() == MMDSSlaveRequest::OP_LINKPREP
) {
6565 if (!targeti
->is_projected_snaprealm_global()) {
6566 sr_t
*newsnap
= targeti
->project_snaprealm();
6567 targeti
->mark_snaprealm_global(newsnap
);
6568 targeti
->record_snaprealm_parent_dentry(newsnap
, NULL
, targeti
->get_projected_parent_dn(), true);
6569 adjust_realm
= true;
6570 realm_projected
= true;
6575 if (targeti
->is_projected_snaprealm_global()) {
6576 ceph_assert(mdr
->slave_request
->desti_snapbl
.length());
6577 auto p
= mdr
->slave_request
->desti_snapbl
.cbegin();
6579 sr_t
*newsnap
= targeti
->project_snaprealm();
6580 decode(*newsnap
, p
);
6582 if (pi
.inode
.nlink
== 0)
6583 ceph_assert(!newsnap
->is_parent_global());
6585 realm_projected
= true;
6587 ceph_assert(mdr
->slave_request
->desti_snapbl
.length() == 0);
6591 link_rollback rollback
;
6592 rollback
.reqid
= mdr
->reqid
;
6593 rollback
.ino
= targeti
->ino();
6594 rollback
.old_ctime
= targeti
->inode
.ctime
; // we hold versionlock xlock; no concorrent projections
6595 const fnode_t
*pf
= targeti
->get_parent_dn()->get_dir()->get_projected_fnode();
6596 rollback
.old_dir_mtime
= pf
->fragstat
.mtime
;
6597 rollback
.old_dir_rctime
= pf
->rstat
.rctime
;
6598 rollback
.was_inc
= inc
;
6599 if (realm_projected
) {
6600 if (targeti
->snaprealm
) {
6601 encode(true, rollback
.snapbl
);
6602 targeti
->encode_snap_blob(rollback
.snapbl
);
6604 encode(false, rollback
.snapbl
);
6607 encode(rollback
, le
->rollback
);
6608 mdr
->more()->rollback_bl
= le
->rollback
;
6610 pi
.inode
.ctime
= mdr
->get_op_stamp();
6611 pi
.inode
.version
= targeti
->pre_dirty();
6613 dout(10) << " projected inode " << pi
.inode
.ino
<< " v " << pi
.inode
.version
<< dendl
;
6616 mdcache
->predirty_journal_parents(mdr
, &le
->commit
, dnl
->get_inode(), 0, PREDIRTY_SHALLOW
|PREDIRTY_PRIMARY
);
6617 mdcache
->journal_dirty_inode(mdr
.get(), &le
->commit
, targeti
);
6618 mdcache
->add_uncommitted_slave(mdr
->reqid
, mdr
->ls
, mdr
->slave_to_mds
);
6620 // set up commit waiter
6621 mdr
->more()->slave_commit
= new C_MDS_SlaveLinkCommit(this, mdr
, targeti
);
6623 mdr
->more()->slave_update_journaled
= true;
6624 submit_mdlog_entry(le
, new C_MDS_SlaveLinkPrep(this, mdr
, targeti
, adjust_realm
),
6629 void Server::_logged_slave_link(MDRequestRef
& mdr
, CInode
*targeti
, bool adjust_realm
)
6631 dout(10) << "_logged_slave_link " << *mdr
6632 << " " << *targeti
<< dendl
;
6634 ceph_assert(g_conf()->mds_kill_link_at
!= 6);
6636 // update the target
6637 targeti
->pop_and_dirty_projected_inode(mdr
->ls
);
6641 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
6644 mdr
->reset_slave_request();
6647 int op
= CEPH_SNAP_OP_SPLIT
;
6648 mds
->mdcache
->send_snap_update(targeti
, 0, op
);
6649 mds
->mdcache
->do_realm_invalidate_and_update_notify(targeti
, op
);
6653 if (!mdr
->aborted
) {
6654 auto reply
= make_message
<MMDSSlaveRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_LINKPREPACK
);
6655 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
6657 dout(10) << " abort flag set, finishing" << dendl
;
6658 mdcache
->request_finish(mdr
);
6663 struct C_MDS_CommittedSlave
: public ServerLogContext
{
6664 C_MDS_CommittedSlave(Server
*s
, MDRequestRef
& m
) : ServerLogContext(s
, m
) {}
6665 void finish(int r
) override
{
6666 server
->_committed_slave(mdr
);
6670 void Server::_commit_slave_link(MDRequestRef
& mdr
, int r
, CInode
*targeti
)
6672 dout(10) << "_commit_slave_link " << *mdr
6674 << " " << *targeti
<< dendl
;
6676 ceph_assert(g_conf()->mds_kill_link_at
!= 7);
6679 // drop our pins, etc.
6682 // write a commit to the journal
6683 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_link_commit", mdr
->reqid
, mdr
->slave_to_mds
,
6684 ESlaveUpdate::OP_COMMIT
, ESlaveUpdate::LINK
);
6685 mdlog
->start_entry(le
);
6686 submit_mdlog_entry(le
, new C_MDS_CommittedSlave(this, mdr
), mdr
, __func__
);
6689 do_link_rollback(mdr
->more()->rollback_bl
, mdr
->slave_to_mds
, mdr
);
6693 void Server::_committed_slave(MDRequestRef
& mdr
)
6695 dout(10) << "_committed_slave " << *mdr
<< dendl
;
6697 ceph_assert(g_conf()->mds_kill_link_at
!= 8);
6699 bool assert_exist
= mdr
->more()->slave_update_journaled
;
6700 mdcache
->finish_uncommitted_slave(mdr
->reqid
, assert_exist
);
6701 auto req
= make_message
<MMDSSlaveRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_COMMITTED
);
6702 mds
->send_message_mds(req
, mdr
->slave_to_mds
);
6703 mdcache
->request_finish(mdr
);
6706 struct C_MDS_LoggedLinkRollback
: public ServerLogContext
{
6708 map
<client_t
,ref_t
<MClientSnap
>> splits
;
6709 C_MDS_LoggedLinkRollback(Server
*s
, MutationRef
& m
, MDRequestRef
& r
,
6710 map
<client_t
,ref_t
<MClientSnap
>>&& _splits
) :
6711 ServerLogContext(s
, r
), mut(m
), splits(std::move(_splits
)) {
6713 void finish(int r
) override
{
6714 server
->_link_rollback_finish(mut
, mdr
, splits
);
6718 void Server::do_link_rollback(bufferlist
&rbl
, mds_rank_t master
, MDRequestRef
& mdr
)
6720 link_rollback rollback
;
6721 auto p
= rbl
.cbegin();
6722 decode(rollback
, p
);
6724 dout(10) << "do_link_rollback on " << rollback
.reqid
6725 << (rollback
.was_inc
? " inc":" dec")
6726 << " ino " << rollback
.ino
6729 ceph_assert(g_conf()->mds_kill_link_at
!= 9);
6731 mdcache
->add_rollback(rollback
.reqid
, master
); // need to finish this update before resolve finishes
6732 ceph_assert(mdr
|| mds
->is_resolve());
6734 MutationRef
mut(new MutationImpl(nullptr, utime_t(), rollback
.reqid
));
6735 mut
->ls
= mds
->mdlog
->get_current_segment();
6737 CInode
*in
= mdcache
->get_inode(rollback
.ino
);
6739 dout(10) << " target is " << *in
<< dendl
;
6740 ceph_assert(!in
->is_projected()); // live slave request hold versionlock xlock.
6742 auto &pi
= in
->project_inode();
6743 pi
.inode
.version
= in
->pre_dirty();
6744 mut
->add_projected_inode(in
);
6746 // parent dir rctime
6747 CDir
*parent
= in
->get_projected_parent_dn()->get_dir();
6748 fnode_t
*pf
= parent
->project_fnode();
6749 mut
->add_projected_fnode(parent
);
6750 pf
->version
= parent
->pre_dirty();
6751 if (pf
->fragstat
.mtime
== pi
.inode
.ctime
) {
6752 pf
->fragstat
.mtime
= rollback
.old_dir_mtime
;
6753 if (pf
->rstat
.rctime
== pi
.inode
.ctime
)
6754 pf
->rstat
.rctime
= rollback
.old_dir_rctime
;
6755 mut
->add_updated_lock(&parent
->get_inode()->filelock
);
6756 mut
->add_updated_lock(&parent
->get_inode()->nestlock
);
6760 pi
.inode
.ctime
= rollback
.old_ctime
;
6761 if (rollback
.was_inc
)
6766 map
<client_t
,ref_t
<MClientSnap
>> splits
;
6767 if (rollback
.snapbl
.length() && in
->snaprealm
) {
6769 auto p
= rollback
.snapbl
.cbegin();
6770 decode(hadrealm
, p
);
6772 if (!mds
->is_resolve()) {
6773 sr_t
*new_srnode
= new sr_t();
6774 decode(*new_srnode
, p
);
6775 in
->project_snaprealm(new_srnode
);
6777 decode(in
->snaprealm
->srnode
, p
);
6780 SnapRealm
*realm
= parent
->get_inode()->find_snaprealm();
6781 if (!mds
->is_resolve())
6782 mdcache
->prepare_realm_merge(in
->snaprealm
, realm
, splits
);
6783 in
->project_snaprealm(NULL
);
6788 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_link_rollback", rollback
.reqid
, master
,
6789 ESlaveUpdate::OP_ROLLBACK
, ESlaveUpdate::LINK
);
6790 mdlog
->start_entry(le
);
6791 le
->commit
.add_dir_context(parent
);
6792 le
->commit
.add_dir(parent
, true);
6793 le
->commit
.add_primary_dentry(in
->get_projected_parent_dn(), 0, true);
6795 submit_mdlog_entry(le
, new C_MDS_LoggedLinkRollback(this, mut
, mdr
, std::move(splits
)),
6800 void Server::_link_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
,
6801 map
<client_t
,ref_t
<MClientSnap
>>& splits
)
6803 dout(10) << "_link_rollback_finish" << dendl
;
6805 ceph_assert(g_conf()->mds_kill_link_at
!= 10);
6809 if (!mds
->is_resolve())
6810 mdcache
->send_snaps(splits
);
6813 mdcache
->request_finish(mdr
);
6815 mdcache
->finish_rollback(mut
->reqid
, mdr
);
6821 void Server::handle_slave_link_prep_ack(MDRequestRef
& mdr
, const cref_t
<MMDSSlaveRequest
> &m
)
6823 dout(10) << "handle_slave_link_prep_ack " << *mdr
6824 << " " << *m
<< dendl
;
6825 mds_rank_t from
= mds_rank_t(m
->get_source().num());
6827 ceph_assert(g_conf()->mds_kill_link_at
!= 11);
6830 mdr
->more()->slaves
.insert(from
);
6833 ceph_assert(mdr
->more()->witnessed
.count(from
) == 0);
6834 mdr
->more()->witnessed
.insert(from
);
6835 ceph_assert(!m
->is_not_journaled());
6836 mdr
->more()->has_journaled_slaves
= true;
6838 // remove from waiting list
6839 ceph_assert(mdr
->more()->waiting_on_slave
.count(from
));
6840 mdr
->more()->waiting_on_slave
.erase(from
);
6842 ceph_assert(mdr
->more()->waiting_on_slave
.empty());
6844 dispatch_client_request(mdr
); // go again!
6853 void Server::handle_client_unlink(MDRequestRef
& mdr
)
6855 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6856 client_t client
= mdr
->get_client();
6859 bool rmdir
= (req
->get_op() == CEPH_MDS_OP_RMDIR
);
6862 mdr
->disable_lock_cache();
6863 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, false, true);
6867 CDentry::linkage_t
*dnl
= dn
->get_linkage(client
, mdr
);
6868 ceph_assert(!dnl
->is_null());
6869 CInode
*in
= dnl
->get_inode();
6872 dout(7) << "handle_client_rmdir on " << *dn
<< dendl
;
6874 dout(7) << "handle_client_unlink on " << *dn
<< dendl
;
6876 dout(7) << "dn links to " << *in
<< dendl
;
6881 // do empty directory checks
6882 if (_dir_is_nonempty_unlocked(mdr
, in
)) {
6883 respond_to_request(mdr
, -ENOTEMPTY
);
6887 dout(7) << "handle_client_unlink on dir " << *in
<< ", returning error" << dendl
;
6888 respond_to_request(mdr
, -EISDIR
);
6894 dout(7) << "handle_client_rmdir on non-dir " << *in
<< ", returning error" << dendl
;
6895 respond_to_request(mdr
, -ENOTDIR
);
6900 CInode
*diri
= dn
->get_dir()->get_inode();
6901 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
6902 if (!check_access(mdr
, diri
, MAY_WRITE
))
6906 // -- create stray dentry? --
6907 CDentry
*straydn
= NULL
;
6908 if (dnl
->is_primary()) {
6909 straydn
= prepare_stray_dentry(mdr
, dnl
->get_inode());
6912 dout(10) << " straydn is " << *straydn
<< dendl
;
6913 } else if (mdr
->straydn
) {
6914 mdr
->unpin(mdr
->straydn
);
6915 mdr
->straydn
= NULL
;
6919 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
6920 MutationImpl::LockOpVec lov
;
6922 lov
.add_xlock(&in
->linklock
);
6923 lov
.add_xlock(&in
->snaplock
);
6925 lov
.add_rdlock(&in
->filelock
); // to verify it's empty
6928 lov
.add_wrlock(&straydn
->get_dir()->inode
->filelock
);
6929 lov
.add_wrlock(&straydn
->get_dir()->inode
->nestlock
);
6930 lov
.add_xlock(&straydn
->lock
);
6933 if (!mds
->locker
->acquire_locks(mdr
, lov
))
6936 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
6940 _dir_is_nonempty(mdr
, in
)) {
6941 respond_to_request(mdr
, -ENOTEMPTY
);
6946 straydn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
6948 if (!mdr
->more()->desti_srnode
) {
6949 if (in
->is_projected_snaprealm_global()) {
6950 sr_t
*new_srnode
= in
->prepare_new_srnode(0);
6951 in
->record_snaprealm_parent_dentry(new_srnode
, NULL
, dn
, dnl
->is_primary());
6952 // dropping the last linkage or dropping the last remote linkage,
6953 // detch the inode from global snaprealm
6954 auto nlink
= in
->get_projected_inode()->nlink
;
6956 (nlink
== 2 && !dnl
->is_primary() &&
6957 !in
->get_projected_parent_dir()->inode
->is_stray()))
6958 in
->clear_snaprealm_global(new_srnode
);
6959 mdr
->more()->desti_srnode
= new_srnode
;
6960 } else if (dnl
->is_primary()) {
6961 // prepare snaprealm blob for slave request
6962 SnapRealm
*realm
= in
->find_snaprealm();
6963 snapid_t follows
= realm
->get_newest_seq();
6964 if (in
->snaprealm
|| follows
+ 1 > in
->get_oldest_snap()) {
6965 sr_t
*new_srnode
= in
->prepare_new_srnode(follows
);
6966 in
->record_snaprealm_past_parent(new_srnode
, straydn
->get_dir()->inode
->find_snaprealm());
6967 mdr
->more()->desti_srnode
= new_srnode
;
6973 if (in
->is_dir() && in
->has_subtree_root_dirfrag()) {
6974 // subtree root auths need to be witnesses
6975 set
<mds_rank_t
> witnesses
;
6976 in
->list_replicas(witnesses
);
6977 dout(10) << " witnesses " << witnesses
<< ", have " << mdr
->more()->witnessed
<< dendl
;
6979 for (set
<mds_rank_t
>::iterator p
= witnesses
.begin();
6980 p
!= witnesses
.end();
6982 if (mdr
->more()->witnessed
.count(*p
)) {
6983 dout(10) << " already witnessed by mds." << *p
<< dendl
;
6984 } else if (mdr
->more()->waiting_on_slave
.count(*p
)) {
6985 dout(10) << " already waiting on witness mds." << *p
<< dendl
;
6987 if (!_rmdir_prepare_witness(mdr
, *p
, mdr
->dn
[0], straydn
))
6991 if (!mdr
->more()->waiting_on_slave
.empty())
6992 return; // we're waiting for a witness.
6995 if (!rmdir
&& dnl
->is_primary() && mdr
->dn
[0].size() == 1)
6996 mds
->locker
->create_lock_cache(mdr
, diri
);
6999 if (dnl
->is_remote() && !dnl
->get_inode()->is_auth())
7000 _link_remote(mdr
, false, dn
, dnl
->get_inode());
7002 _unlink_local(mdr
, dn
, straydn
);
7005 class C_MDS_unlink_local_finish
: public ServerLogContext
{
7008 version_t dnpv
; // deleted dentry
7010 C_MDS_unlink_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CDentry
*sd
) :
7011 ServerLogContext(s
, r
), dn(d
), straydn(sd
),
7012 dnpv(d
->get_projected_version()) {}
7013 void finish(int r
) override
{
7014 ceph_assert(r
== 0);
7015 server
->_unlink_local_finish(mdr
, dn
, straydn
, dnpv
);
7019 void Server::_unlink_local(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
)
7021 dout(10) << "_unlink_local " << *dn
<< dendl
;
7023 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
7024 CInode
*in
= dnl
->get_inode();
7028 mdr
->ls
= mdlog
->get_current_segment();
7030 // prepare log entry
7031 EUpdate
*le
= new EUpdate(mdlog
, "unlink_local");
7032 mdlog
->start_entry(le
);
7033 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
7034 if (!mdr
->more()->witnessed
.empty()) {
7035 dout(20) << " noting uncommitted_slaves " << mdr
->more()->witnessed
<< dendl
;
7036 le
->reqid
= mdr
->reqid
;
7037 le
->had_slaves
= true;
7038 mdcache
->add_uncommitted_master(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
7042 ceph_assert(dnl
->is_primary());
7043 straydn
->push_projected_linkage(in
);
7046 // the unlinked dentry
7049 auto &pi
= in
->project_inode();
7052 dn
->make_path_string(t
, true);
7053 pi
.inode
.stray_prior_path
= std::move(t
);
7055 pi
.inode
.version
= in
->pre_dirty();
7056 pi
.inode
.ctime
= mdr
->get_op_stamp();
7057 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
7058 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
7059 pi
.inode
.change_attr
++;
7061 if (pi
.inode
.nlink
== 0)
7062 in
->state_set(CInode::STATE_ORPHAN
);
7064 if (mdr
->more()->desti_srnode
) {
7065 auto& desti_srnode
= mdr
->more()->desti_srnode
;
7066 in
->project_snaprealm(desti_srnode
);
7067 desti_srnode
= NULL
;
7071 // will manually pop projected inode
7073 // primary link. add stray dentry.
7074 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, -1);
7075 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, straydn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
7077 pi
.inode
.update_backtrace();
7078 le
->metablob
.add_primary_dentry(straydn
, in
, true, true);
7080 mdr
->add_projected_inode(in
);
7081 // remote link. update remote inode.
7082 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_DIR
, -1);
7083 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
7084 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, in
);
7087 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
7088 le
->metablob
.add_null_dentry(dn
, true);
7091 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
7092 le
->metablob
.renamed_dirino
= in
->ino();
7095 dn
->push_projected_linkage();
7098 ceph_assert(in
->first
<= straydn
->first
);
7099 in
->first
= straydn
->first
;
7103 ceph_assert(straydn
);
7104 mdcache
->project_subtree_rename(in
, dn
->get_dir(), straydn
->get_dir());
7107 journal_and_reply(mdr
, 0, dn
, le
, new C_MDS_unlink_local_finish(this, mdr
, dn
, straydn
));
7110 void Server::_unlink_local_finish(MDRequestRef
& mdr
,
7111 CDentry
*dn
, CDentry
*straydn
,
7114 dout(10) << "_unlink_local_finish " << *dn
<< dendl
;
7116 if (!mdr
->more()->witnessed
.empty())
7117 mdcache
->logged_master_update(mdr
->reqid
);
7119 CInode
*strayin
= NULL
;
7120 bool hadrealm
= false;
7122 // if there is newly created snaprealm, need to split old snaprealm's
7123 // inodes_with_caps. So pop snaprealm before linkage changes.
7124 strayin
= dn
->get_linkage()->get_inode();
7125 hadrealm
= strayin
->snaprealm
? true : false;
7126 strayin
->early_pop_projected_snaprealm();
7129 // unlink main dentry
7130 dn
->get_dir()->unlink_inode(dn
);
7131 dn
->pop_projected_linkage();
7133 // relink as stray? (i.e. was primary link?)
7135 dout(20) << " straydn is " << *straydn
<< dendl
;
7136 straydn
->pop_projected_linkage();
7138 strayin
->pop_and_dirty_projected_inode(mdr
->ls
);
7140 mdcache
->touch_dentry_bottom(straydn
);
7143 dn
->mark_dirty(dnpv
, mdr
->ls
);
7146 mdcache
->send_dentry_unlink(dn
, straydn
, mdr
);
7149 // update subtree map?
7150 if (strayin
->is_dir())
7151 mdcache
->adjust_subtree_after_rename(strayin
, dn
->get_dir(), true);
7153 if (strayin
->snaprealm
&& !hadrealm
)
7154 mdcache
->do_realm_invalidate_and_update_notify(strayin
, CEPH_SNAP_OP_SPLIT
, false);
7158 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
7161 respond_to_request(mdr
, 0);
7163 // removing a new dn?
7164 dn
->get_dir()->try_remove_unlinked_dn(dn
);
7167 // respond_to_request() drops locks. So stray reintegration can race with us.
7168 if (straydn
&& !straydn
->get_projected_linkage()->is_null()) {
7169 // Tip off the MDCache that this dentry is a stray that
7170 // might be elegible for purge.
7171 mdcache
->notify_stray(straydn
);
7175 bool Server::_rmdir_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, vector
<CDentry
*>& trace
, CDentry
*straydn
)
7177 if (mds
->is_cluster_degraded() &&
7178 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(who
)) {
7179 dout(10) << "_rmdir_prepare_witness mds." << who
<< " is not active" << dendl
;
7180 if (mdr
->more()->waiting_on_slave
.empty())
7181 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(mdcache
, mdr
));
7185 dout(10) << "_rmdir_prepare_witness mds." << who
<< dendl
;
7186 auto req
= make_message
<MMDSSlaveRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_RMDIRPREP
);
7187 req
->srcdnpath
= filepath(trace
.front()->get_dir()->ino());
7188 for (auto dn
: trace
)
7189 req
->srcdnpath
.push_dentry(dn
->get_name());
7190 mdcache
->encode_replica_stray(straydn
, who
, req
->straybl
);
7191 if (mdr
->more()->desti_srnode
)
7192 encode(*mdr
->more()->desti_srnode
, req
->desti_snapbl
);
7194 req
->op_stamp
= mdr
->get_op_stamp();
7195 mds
->send_message_mds(req
, who
);
7197 ceph_assert(mdr
->more()->waiting_on_slave
.count(who
) == 0);
7198 mdr
->more()->waiting_on_slave
.insert(who
);
// Journal-completion callback for the slave side of rmdir: once the
// ESlaveUpdate(OP_PREPARE) entry is durable, apply the unlink/relink-to-stray
// in the cache and ack the master.
struct C_MDS_SlaveRmdirPrep : public ServerLogContext {
  CDentry *dn, *straydn;  // victim dentry and the stray dentry it moves to
  C_MDS_SlaveRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
    : ServerLogContext(s, r), dn(d), straydn(st) {}
  void finish(int r) override {
    server->_logged_slave_rmdir(mdr, dn, straydn);
  }
};
// Commit waiter installed in mdr->more()->slave_commit: invoked when the
// master tells us to commit (r == 0) or abort/rollback (r != 0) the
// prepared slave rmdir.
struct C_MDS_SlaveRmdirCommit : public ServerContext {
  MDRequestRef mdr;       // the slave request being committed/aborted
  CDentry *straydn;       // stray dentry holding the removed dir's inode
  C_MDS_SlaveRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
    : ServerContext(s), mdr(r), straydn(sd) { }
  void finish(int r) override {
    server->_commit_slave_rmdir(mdr, r, straydn);
  }
};
/**
 * Slave-side handler for OP_RMDIRPREP.
 *
 * Resolves the victim dentry from the path sent by the master, builds a
 * rollback blob (so the unlink can be undone if the master aborts), projects
 * the relink of the inode under the stray dentry, and — if we hold any auth
 * subtree under the inode — journals an ESlaveUpdate(OP_PREPARE) before
 * applying and acking.
 */
void Server::handle_slave_rmdir_prep(MDRequestRef& mdr)
{
  dout(10) << "handle_slave_rmdir_prep " << *mdr
	   << " " << mdr->slave_request->srcdnpath
	   << " to " << mdr->slave_request->destdnpath
	   << dendl;

  vector<CDentry*> trace;
  filepath srcpath(mdr->slave_request->srcdnpath);
  dout(10) << " src " << srcpath << dendl;
  CInode *in;
  CF_MDS_MDRContextFactory cf(mdcache, mdr, false);
  int r = mdcache->path_traverse(mdr, cf, srcpath,
				 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
				 &trace, &in);
  if (r > 0)
    return;  // traversal is in progress; we will be retried
  if (r == -ESTALE) {
    // don't know this ino here; ask peers (preferring the master) and retry
    mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
			    mdr->slave_to_mds, true);
    return;
  }
  ceph_assert(r == 0);
  CDentry *dn = trace.back();
  dout(10) << " dn " << *dn << dendl;
  mdr->pin(dn);

  // stray dentry was decoded from the prep message earlier
  ceph_assert(mdr->straydn);
  CDentry *straydn = mdr->straydn;
  dout(10) << " straydn " << *straydn << dendl;

  mdr->set_op_stamp(mdr->slave_request->op_stamp);

  // record enough state to undo the unlink if the master aborts
  rmdir_rollback rollback;
  rollback.reqid = mdr->reqid;
  rollback.src_dir = dn->get_dir()->dirfrag();
  rollback.src_dname = dn->get_name();
  rollback.dest_dir = straydn->get_dir()->dirfrag();
  rollback.dest_dname = straydn->get_name();
  if (mdr->slave_request->desti_snapbl.length()) {
    if (in->snaprealm) {
      // remember the pre-existing snaprealm so rollback can restore it
      encode(true, rollback.snapbl);
      in->encode_snap_blob(rollback.snapbl);
    } else {
      encode(false, rollback.snapbl);
    }
  }
  encode(rollback, mdr->more()->rollback_bl);
  // FIXME: rollback snaprealm
  dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;

  // set up commit waiter
  mdr->more()->slave_commit = new C_MDS_SlaveRmdirCommit(this, mdr, straydn);

  // project: inode moves under the stray dentry, source dentry goes null
  straydn->push_projected_linkage(in);
  dn->push_projected_linkage();

  ceph_assert(straydn->first >= in->first);
  in->first = straydn->first;

  if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
    // nothing of ours to journal under this dir; apply + ack immediately
    dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
    _logged_slave_rmdir(mdr, dn, straydn);
    return;
  }

  mdr->ls = mdlog->get_current_segment();
  ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir", mdr->reqid, mdr->slave_to_mds,
				      ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RMDIR);
  mdlog->start_entry(le);
  le->rollback = mdr->more()->rollback_bl;

  le->commit.add_dir_context(straydn->get_dir());
  le->commit.add_primary_dentry(straydn, in, true);
  // slave: no need to journal original dentry

  dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
  le->commit.renamed_dirino = in->ino();

  mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
  mdcache->add_uncommitted_slave(mdr->reqid, mdr->ls, mdr->slave_to_mds);

  mdr->more()->slave_update_journaled = true;
  submit_mdlog_entry(le, new C_MDS_SlaveRmdirPrep(this, mdr, dn, straydn),
                     mdr, __func__);
  mdlog->flush();
}
/**
 * Apply the prepared slave rmdir to the cache (called after the OP_PREPARE
 * entry is journaled, or directly when no journaling was needed) and send
 * OP_RMDIRPREPACK back to the master.
 */
void Server::_logged_slave_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
{
  dout(10) << "_logged_slave_rmdir " << *mdr << " on " << *dn << dendl;
  CInode *in = dn->get_linkage()->get_inode();

  bool new_realm = false;
  if (mdr->slave_request->desti_snapbl.length()) {
    // adopt the snaprealm the master projected for the removed dir
    new_realm = !in->snaprealm;
    in->decode_snap_blob(mdr->slave_request->desti_snapbl);
    ceph_assert(in->snaprealm);
    ceph_assert(in->snaprealm->have_past_parents_open());
  }

  // update our cache now, so we are consistent with what is in the journal
  // when we journal a subtree map
  dn->get_dir()->unlink_inode(dn);
  straydn->pop_projected_linkage();
  dn->pop_projected_linkage();

  mdcache->adjust_subtree_after_rename(in, dn->get_dir(), mdr->more()->slave_update_journaled);

  if (new_realm)
    mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);

  // done with the incoming slave request
  mdr->reset_slave_request();
  mdr->straydn = 0;

  if (!mdr->aborted) {
    auto reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RMDIRPREPACK);
    if (!mdr->more()->slave_update_journaled)
      reply->mark_not_journaled();  // master then knows no commit record exists here
    mds->send_message_mds(reply, mdr->slave_to_mds);
  } else {
    dout(10) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);
  }
}
/**
 * Master-side handler for OP_RMDIRPREPACK: record the witness, note whether
 * it journaled, and re-dispatch the client rmdir once all witnesses acked.
 */
void Server::handle_slave_rmdir_prep_ack(MDRequestRef& mdr, const cref_t<MMDSSlaveRequest> &ack)
{
  dout(10) << "handle_slave_rmdir_prep_ack " << *mdr
	   << " " << *ack << dendl;

  mds_rank_t from = mds_rank_t(ack->get_source().num());

  mdr->more()->slaves.insert(from);
  mdr->more()->witnessed.insert(from);
  if (!ack->is_not_journaled())
    mdr->more()->has_journaled_slaves = true;  // a commit record exists on 'from'

  // remove from waiting list
  ceph_assert(mdr->more()->waiting_on_slave.count(from));
  mdr->more()->waiting_on_slave.erase(from);

  if (mdr->more()->waiting_on_slave.empty())
    dispatch_client_request(mdr);  // go again!
  else
    dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
}
/**
 * Finish the slave side of an rmdir once the master's decision arrives.
 *
 * @param r 0 to commit (journal OP_COMMIT if we journaled the prepare),
 *          nonzero to abort (replay our rollback blob).
 */
void Server::_commit_slave_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
{
  dout(10) << "_commit_slave_rmdir " << *mdr << " r=" << r << dendl;

  if (r == 0) {
    if (mdr->more()->slave_update_journaled) {
      // the stray inode is going away; no point flushing its dirty state
      CInode *strayin = straydn->get_projected_linkage()->get_inode();
      if (strayin && !strayin->snaprealm)
	mdcache->clear_dirty_bits_for_stray(strayin);
    }

    mdr->cleanup();

    if (mdr->more()->slave_update_journaled) {
      // write a commit to the journal
      ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_commit", mdr->reqid,
					  mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
					  ESlaveUpdate::RMDIR);
      mdlog->start_entry(le);
      submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
      mdlog->flush();
    } else {
      // nothing was journaled here; we can report committed right away
      _committed_slave(mdr);
    }
  } else {
    // abort: undo the prepared unlink
    do_rmdir_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
  }
}
// Journal-completion callback for an rmdir rollback: once the
// ESlaveUpdate(OP_ROLLBACK) entry is durable, apply the restored linkage.
struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
  metareqid_t reqid;      // the original request being rolled back
  CDentry *dn;            // source dentry being restored
  CDentry *straydn;       // stray dentry being emptied
  C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
    : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
  void finish(int r) override {
    server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
  }
};
/**
 * Roll back a prepared (but uncommitted) slave rmdir: relink the inode from
 * the stray dentry back under its original source dentry, restoring any saved
 * snaprealm state, and journal an OP_ROLLBACK record if the prepare was
 * journaled.
 *
 * @param rbl     the rollback blob produced by handle_slave_rmdir_prep
 * @param master  rank of the rename/rmdir master
 * @param mdr     the slave request, or null during resolve-driven rollback
 */
void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
{
  // unlink the other rollback methods, the rmdir rollback is only
  // needed to record the subtree changes in the journal for inode
  // replicas who are auth for empty dirfrags.  no actual changes to
  // the file system are taking place here, so there is no Mutation.

  rmdir_rollback rollback;
  auto p = rbl.cbegin();
  decode(rollback, p);

  dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
  mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
  ceph_assert(mdr || mds->is_resolve());

  CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
  if (!dir)
    dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
  ceph_assert(dir);
  CDentry *dn = dir->lookup(rollback.src_dname);
  ceph_assert(dn);
  dout(10) << " dn " << *dn << dendl;
  CDir *straydir = mdcache->get_dirfrag(rollback.dest_dir);
  ceph_assert(straydir);
  CDentry *straydn = straydir->lookup(rollback.dest_dname);
  ceph_assert(straydn);
  dout(10) << " straydn " << *straydn << dendl;
  CInode *in = straydn->get_linkage()->get_inode();

  // project the reverse move: inode back to dn, stray goes null
  dn->push_projected_linkage(in);
  straydn->push_projected_linkage();

  if (rollback.snapbl.length() && in->snaprealm) {
    bool hadrealm;
    auto p = rollback.snapbl.cbegin();
    decode(hadrealm, p);
    if (hadrealm) {
      // restore the snaprealm that existed before the prepare
      decode(in->snaprealm->srnode, p);
    } else {
      // realm was created by the prepare; fold it into the parent realm
      in->snaprealm->merge_to(dir->get_inode()->find_snaprealm());
    }
  }

  if (mdr && !mdr->more()->slave_update_journaled) {
    // prepare wasn't journaled, so no rollback journaling is needed either
    ceph_assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));

    _rmdir_rollback_finish(mdr, rollback.reqid, dn, straydn);
    return;
  }

  ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_rollback", rollback.reqid, master,
				      ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RMDIR);
  mdlog->start_entry(le);

  le->commit.add_dir_context(dn->get_dir());
  le->commit.add_primary_dentry(dn, in, true);
  // slave: no need to journal straydn

  dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
  le->commit.renamed_dirino = in->ino();

  mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());

  submit_mdlog_entry(le,
                     new C_MDS_LoggedRmdirRollback(this, mdr, rollback.reqid,
                                                   dn, straydn),
                     mdr, __func__);
  mdlog->flush();
}
/**
 * Apply a completed rmdir rollback to the cache: pop the projected linkages,
 * re-adjust the subtree map, and tell the cache the rollback is finished so
 * resolve can make progress.
 */
void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
{
  dout(10) << "_rmdir_rollback_finish " << reqid << dendl;

  straydn->get_dir()->unlink_inode(straydn);
  dn->pop_projected_linkage();
  straydn->pop_projected_linkage();

  CInode *in = dn->get_linkage()->get_inode();
  mdcache->adjust_subtree_after_rename(in, straydn->get_dir(),
				       !mdr || mdr->more()->slave_update_journaled);

  if (mds->is_resolve()) {
    // during resolve we may now be able to drop the non-auth subtree
    CDir *root = mdcache->get_subtree_root(straydn->get_dir());
    mdcache->try_trim_non_auth_subtree(root);
  }

  if (mdr)
    mdcache->request_finish(mdr);

  mdcache->finish_rollback(reqid, mdr);
}
/** _dir_is_nonempty[_unlocked]
 *
 * check if a directory is non-empty (i.e. we can rmdir it).
 *
 * the unlocked variant is a fastpath check.  we can't really be
 * sure until we rdlock the filelock.
 */
7514 bool Server::_dir_is_nonempty_unlocked(MDRequestRef
& mdr
, CInode
*in
)
7516 dout(10) << "dir_is_nonempty_unlocked " << *in
<< dendl
;
7517 ceph_assert(in
->is_auth());
7519 if (in
->filelock
.is_cached())
7520 return false; // there can be pending async create/unlink. don't know.
7521 if (in
->snaprealm
&& in
->snaprealm
->srnode
.snaps
.size())
7522 return true; // in a snapshot!
7524 auto&& ls
= in
->get_dirfrags();
7525 for (const auto& dir
: ls
) {
7526 // is the frag obviously non-empty?
7527 if (dir
->is_auth()) {
7528 if (dir
->get_projected_fnode()->fragstat
.size()) {
7529 dout(10) << "dir_is_nonempty_unlocked dirstat has "
7530 << dir
->get_projected_fnode()->fragstat
.size() << " items " << *dir
<< dendl
;
7539 bool Server::_dir_is_nonempty(MDRequestRef
& mdr
, CInode
*in
)
7541 dout(10) << "dir_is_nonempty " << *in
<< dendl
;
7542 ceph_assert(in
->is_auth());
7543 ceph_assert(in
->filelock
.can_read(mdr
->get_client()));
7545 frag_info_t dirstat
;
7546 version_t dirstat_version
= in
->get_projected_inode()->dirstat
.version
;
7548 auto&& ls
= in
->get_dirfrags();
7549 for (const auto& dir
: ls
) {
7550 const fnode_t
*pf
= dir
->get_projected_fnode();
7551 if (pf
->fragstat
.size()) {
7552 dout(10) << "dir_is_nonempty dirstat has "
7553 << pf
->fragstat
.size() << " items " << *dir
<< dendl
;
7557 if (pf
->accounted_fragstat
.version
== dirstat_version
)
7558 dirstat
.add(pf
->accounted_fragstat
);
7560 dirstat
.add(pf
->fragstat
);
7563 return dirstat
.size() != in
->get_projected_inode()->dirstat
.size();
7567 // ======================================================
// Journal-completion callback for a (master) rename: once the EUpdate is
// durable, apply the rename and reply to the client.
class C_MDS_rename_finish : public ServerLogContext {
  CDentry *srcdn;
  CDentry *destdn;
  CDentry *straydn;   // may be null when no dest inode is displaced
public:
  C_MDS_rename_finish(Server *s, MDRequestRef& r,
		      CDentry *sdn, CDentry *ddn, CDentry *stdn) :
    ServerLogContext(s, r),
    srcdn(sdn), destdn(ddn), straydn(stdn) { }
  void finish(int r) override {
    ceph_assert(r == 0);
    server->_rename_finish(mdr, srcdn, destdn, straydn);
  }
};
/** handle_client_rename
 *
 * rename master is the destdn auth.  this is because cached inodes
 * must remain connected.  thus, any replica of srci, must also
 * replicate destdn, and possibly straydn, so that srci (and
 * destdn->inode) remain connected during the rename.
 *
 * to do this, we freeze srci, then master (destdn auth) verifies that
 * all other nodes have also replicated destdn and straydn.  note that
 * destdn replicas need not also replicate srci.  this only works when
 * destdn is master.
 *
 * This function takes responsibility for the passed mdr.
 */
void Server::handle_client_rename(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;
  dout(7) << "handle_client_rename " << *req << dendl;

  filepath destpath = req->get_filepath();
  filepath srcpath = req->get_filepath2();
  // renaming "." or ".." is never allowed
  if (srcpath.is_last_dot_or_dotdot() || destpath.is_last_dot_or_dotdot()) {
    respond_to_request(mdr, -EBUSY);
    return;
  }

  // traverse both paths, rdlocking ancestry and xlocking destdn
  auto [destdn, srcdn] = rdlock_two_paths_xlock_destdn(mdr, true);
  if (!destdn)
    return;

  dout(10) << " destdn " << *destdn << dendl;
  CDir *destdir = destdn->get_dir();
  ceph_assert(destdir->is_auth());
  CDentry::linkage_t *destdnl = destdn->get_projected_linkage();

  dout(10) << " srcdn " << *srcdn << dendl;
  CDir *srcdir = srcdn->get_dir();
  CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
  CInode *srci = srcdnl->get_inode();
  dout(10) << " srci " << *srci << dendl;

  // -- some sanity checks --
  if (destdn == srcdn) {
    dout(7) << "rename src=dest, noop" << dendl;
    respond_to_request(mdr, 0);
    return;
  }

  // dest a child of src?
  //  e.g. mv /usr /usr/foo
  if (srci->is_dir() && srci->is_projected_ancestor_of(destdir->get_inode())) {
    dout(7) << "cannot rename item to be a child of itself" << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }

  // is this a stray migration, reintegration or merge? (sanity checks!)
  if (mdr->reqid.name.is_mds() &&
      !(MDS_INO_IS_STRAY(srcpath.get_ino()) &&
	MDS_INO_IS_STRAY(destpath.get_ino())) &&
      !(destdnl->is_remote() &&
	destdnl->get_remote_ino() == srci->ino())) {
    respond_to_request(mdr, -EINVAL);  // actually, this won't reply, but whatev.
    return;
  }

  CInode *oldin = 0;
  if (!destdnl->is_null()) {
    //dout(10) << "dest dn exists " << *destdn << dendl;
    oldin = mdcache->get_dentry_inode(destdn, mdr, true);
    if (!oldin)
      return;
    dout(10) << " oldin " << *oldin << dendl;

    // non-empty dir? do trivial fast unlocked check, do another check later with read locks
    if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
      respond_to_request(mdr, -ENOTEMPTY);
      return;
    }

    // mv /some/thing /to/some/existing_other_thing
    if (oldin->is_dir() && !srci->is_dir()) {
      respond_to_request(mdr, -EISDIR);
      return;
    }
    if (!oldin->is_dir() && srci->is_dir()) {
      respond_to_request(mdr, -ENOTDIR);
      return;
    }
    if (srci == oldin && !srcdir->inode->is_stray()) {
      respond_to_request(mdr, 0);  // no-op.  POSIX makes no sense.
      return;
    }
  }

  vector<CDentry*>& srctrace = mdr->dn[1];
  vector<CDentry*>& desttrace = mdr->dn[0];

  // src+dest traces _must_ share a common ancestor for locking to prevent orphans
  if (destpath.get_ino() != srcpath.get_ino() &&
      !(req->get_source().is_mds() &&
	MDS_INO_IS_STRAY(srcpath.get_ino()))) {  // <-- mds 'rename' out of stray dir is ok!
    CInode *srcbase = srctrace[0]->get_dir()->get_inode();
    CInode *destbase = desttrace[0]->get_dir()->get_inode();
    // ok, extend srctrace toward root until it is an ancestor of desttrace.
    while (srcbase != destbase &&
	   !srcbase->is_projected_ancestor_of(destbase)) {
      CDentry *pdn = srcbase->get_projected_parent_dn();
      srctrace.insert(srctrace.begin(), pdn);
      dout(10) << "rename prepending srctrace with " << *pdn << dendl;
      srcbase = pdn->get_dir()->get_inode();
    }

    // then, extend destpath until it shares the same parent inode as srcpath.
    while (destbase != srcbase) {
      CDentry *pdn = destbase->get_projected_parent_dn();
      desttrace.insert(desttrace.begin(), pdn);
      dout(10) << "rename prepending desttrace with " << *pdn << dendl;
      destbase = pdn->get_dir()->get_inode();
    }
    dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
  }

  // link merge: dest is a remote link to the same inode src points at
  bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
  if (linkmerge)
    dout(10) << " this is a link merge" << dendl;

  // -- create stray dentry? --
  CDentry *straydn = NULL;
  if (destdnl->is_primary() && !linkmerge) {
    straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
    if (!straydn)
      return;
    dout(10) << " straydn is " << *straydn << dendl;
  } else if (mdr->straydn) {
    // a stray dentry left over from an earlier pass is no longer needed
    mdr->unpin(mdr->straydn);
    mdr->straydn = NULL;
  }

  // -- locks --
  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;

    // we need to update srci's ctime.  xlock its least contended lock to do that...
    lov.add_xlock(&srci->linklock);
    lov.add_xlock(&srci->snaplock);

    if (oldin) {
      // xlock oldin (for nlink--)
      lov.add_xlock(&oldin->linklock);
      lov.add_xlock(&oldin->snaplock);
      if (oldin->is_dir()) {
	ceph_assert(srci->is_dir());
	lov.add_rdlock(&oldin->filelock);   // to verify it's empty

	// adjust locking order?
	int cmp = mdr->compare_paths();
	if (cmp < 0 || (cmp == 0 && oldin->ino() < srci->ino()))
	  std::reverse(lov.begin(), lov.end());
      } else {
	ceph_assert(!srci->is_dir());
	// adjust locking order;
	if (srci->ino() > oldin->ino())
	  std::reverse(lov.begin(), lov.end());
      }
    }

    // straydn?
    if (straydn) {
      lov.add_wrlock(&straydn->get_dir()->inode->filelock);
      lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
      lov.add_xlock(&straydn->lock);
    }

    // freeze srci if it will migrate with the rename (remote primary src)
    CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : nullptr;
    if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
      return;

    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  }

  if (linkmerge)
    ceph_assert(srcdir->inode->is_stray() && srcdnl->is_primary() && destdnl->is_remote());

  // access checks only on the first pass (before any witness has prepared)
  if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
    if (!check_access(mdr, srcdir->get_inode(), MAY_WRITE))
      return;

    if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
      return;

    if (!check_fragment_space(mdr, destdn->get_dir()))
      return;

    if (!check_access(mdr, srci, MAY_WRITE))
      return;
  }

  // with read lock, really verify oldin is empty
  if (oldin &&
      oldin->is_dir() &&
      _dir_is_nonempty(mdr, oldin)) {
    respond_to_request(mdr, -ENOTEMPTY);
    return;
  }

  /* project_snaprealm_past_parent() will do this job
   *
  // moving between snaprealms?
  if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
    SnapRealm *srcrealm = srci->find_snaprealm();
    SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
    if (srcrealm != destrealm &&
	(srcrealm->get_newest_seq() + 1 > srcdn->first ||
	 destrealm->get_newest_seq() + 1 > srcdn->first)) {
      dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
      mdcache->snaprealm_create(mdr, srci);
      return;
    }
  }
  */

  ceph_assert(g_conf()->mds_kill_rename_at != 1);

  // -- open all srcdn inode frags, if any --
  // we need these open so that auth can properly delegate from inode to dirfrags
  // after the inode is _ours_.
  if (srcdnl->is_primary() &&
      !srcdn->is_auth() &&
      srci->is_dir()) {
    dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
    mdr->set_stickydirs(srci);

    frag_vec_t leaves;
    srci->dirfragtree.get_leaves(leaves);
    for (const auto& leaf : leaves) {
      CDir *dir = srci->get_dirfrag(leaf);
      if (!dir) {
	dout(10) << " opening " << leaf << " under " << *srci << dendl;
	mdcache->open_remote_dirfrag(srci, leaf, new C_MDS_RetryRequest(mdcache, mdr));
	return;
      }
    }
  }

  // -- prepare snaprealm ---

  if (linkmerge) {
    // merging the last remote link into the primary: srci may need to be
    // detached from the global snaprealm
    if (!mdr->more()->srci_srnode &&
	srci->get_projected_inode()->nlink == 1 &&
	srci->is_projected_snaprealm_global()) {
      sr_t *new_srnode = srci->prepare_new_srnode(0);
      srci->record_snaprealm_parent_dentry(new_srnode, NULL, destdn, false);

      srci->clear_snaprealm_global(new_srnode);
      mdr->more()->srci_srnode = new_srnode;
    }
  } else {
    if (oldin && !mdr->more()->desti_srnode) {
      if (oldin->is_projected_snaprealm_global()) {
	sr_t *new_srnode = oldin->prepare_new_srnode(0);
	oldin->record_snaprealm_parent_dentry(new_srnode, NULL, destdn, destdnl->is_primary());
	// dropping the last linkage or dropping the last remote linkage,
	// detch the inode from global snaprealm
	auto nlink = oldin->get_projected_inode()->nlink;
	if (nlink == 1 ||
	    (nlink == 2 && !destdnl->is_primary() &&
	     !oldin->get_projected_parent_dir()->inode->is_stray()))
	  oldin->clear_snaprealm_global(new_srnode);
	mdr->more()->desti_srnode = new_srnode;
      } else if (destdnl->is_primary()) {
	// oldin moves to the stray dir; record its past parent if snapshotted
	SnapRealm *dest_realm = destdir->inode->find_snaprealm();
	snapid_t follows = dest_realm->get_newest_seq();
	if (oldin->snaprealm || follows + 1 > oldin->get_oldest_snap()) {
	  sr_t *new_srnode = oldin->prepare_new_srnode(follows);
	  oldin->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
	  mdr->more()->desti_srnode = new_srnode;
	}
      }
    }
    if (!mdr->more()->srci_srnode) {
      SnapRealm *dest_realm = destdir->inode->find_snaprealm();
      if (srci->is_projected_snaprealm_global()) {
	sr_t *new_srnode = srci->prepare_new_srnode(0);
	srci->record_snaprealm_parent_dentry(new_srnode, dest_realm, srcdn, srcdnl->is_primary());
	mdr->more()->srci_srnode = new_srnode;
      } else if (srcdnl->is_primary()) {
	// moving between realms: remember the old realm as a past parent
	SnapRealm *src_realm = srcdir->inode->find_snaprealm();
	snapid_t follows = src_realm->get_newest_seq();
	if (src_realm != dest_realm &&
	    (srci->snaprealm || follows + 1 > srci->get_oldest_snap())) {
	  sr_t *new_srnode = srci->prepare_new_srnode(follows);
	  srci->record_snaprealm_past_parent(new_srnode, dest_realm);
	  mdr->more()->srci_srnode = new_srnode;
	}
      }
    }
  }

  // -- prepare witnesses --

  /*
   * NOTE: we use _all_ replicas as witnesses.
   * this probably isn't totally necessary (esp for file renames),
   * but if/when we change that, we have to make sure rejoin is
   * sufficiently robust to handle strong rejoins from survivors
   * with totally wrong dentry->inode linkage.
   * (currently, it can ignore rename effects, because the resolve
   * stage will sort them out.)
   */
  set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
  if (srcdn->is_auth())
    srcdn->list_replicas(witnesses);
  else
    witnesses.insert(srcdn->authority().first);
  if (srcdnl->is_remote() && !srci->is_auth())
    witnesses.insert(srci->authority().first);
  destdn->list_replicas(witnesses);
  if (destdnl->is_remote() && !oldin->is_auth())
    witnesses.insert(oldin->authority().first);
  dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;

  if (!witnesses.empty()) {
    // Replicas can't see projected dentry linkages and will get confused.
    // We have taken snaplocks on ancestor inodes. Later rename/rmdir requests
    // can't project these inodes' linkages.
    bool need_flush = false;
    for (auto& dn : srctrace) {
      if (dn->is_projected()) {
	need_flush = true;
	break;
      }
    }
    if (!need_flush) {
      CDentry *dn = destdn;
      do {
	if (dn->is_projected()) {
	  need_flush = true;
	  break;
	}
	CInode *diri = dn->get_dir()->get_inode();
	dn = diri->get_projected_parent_dn();
      } while (dn);
    }
    if (need_flush) {
      // flush the journal so projected linkages become visible, then retry
      mdlog->wait_for_safe(
	  new MDSInternalContextWrapper(mds,
	    new C_MDS_RetryRequest(mdcache, mdr)));
      mdlog->flush();
      return;
    }
  }

  // do srcdn auth last
  mds_rank_t last = MDS_RANK_NONE;
  if (!srcdn->is_auth()) {
    last = srcdn->authority().first;
    mdr->more()->srcdn_auth_mds = last;
    // ask auth of srci to mark srci as ambiguous auth if more than two MDS
    // are involved in the rename operation.
    if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
      dout(10) << " preparing ambiguous auth for srci" << dendl;
      ceph_assert(mdr->more()->is_remote_frozen_authpin);
      ceph_assert(mdr->more()->rename_inode == srci);
      _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
      return;
    }
  }

  for (set<mds_rank_t>::iterator p = witnesses.begin();
       p != witnesses.end();
       ++p) {
    if (*p == last) continue;  // do it last!
    if (mdr->more()->witnessed.count(*p)) {
      dout(10) << " already witnessed by mds." << *p << dendl;
    } else if (mdr->more()->waiting_on_slave.count(*p)) {
      dout(10) << " already waiting on witness mds." << *p << dendl;
    } else {
      if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
	return;
    }
  }
  if (!mdr->more()->waiting_on_slave.empty())
    return;  // we're waiting for a witness.

  if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
    dout(10) << " preparing last witness (srcdn auth)" << dendl;
    ceph_assert(mdr->more()->waiting_on_slave.count(last) == 0);
    _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
    return;
  }

  // test hack: bail after slave does prepare, so we can verify it's _live_ rollback.
  if (!mdr->more()->slaves.empty() && !srci->is_dir())
    ceph_assert(g_conf()->mds_kill_rename_at != 3);
  if (!mdr->more()->slaves.empty() && srci->is_dir())
    ceph_assert(g_conf()->mds_kill_rename_at != 4);

  // -- declare now --
  mdr->set_mds_stamp(ceph_clock_now());

  // -- prepare journal entry --
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "rename");
  mdlog->start_entry(le);
  le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
  if (!mdr->more()->witnessed.empty()) {
    dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;

    le->reqid = mdr->reqid;
    le->had_slaves = true;

    mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
    // no need to send frozen auth pin to recovring auth MDS of srci
    mdr->more()->is_remote_frozen_authpin = false;
  }

  _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn);
  if (le->client_map.length())
    le->cmapv = mds->sessionmap.get_projected();

  // -- commit locally --
  C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);

  journal_and_reply(mdr, srci, destdn, le, fin);
  mds->balancer->maybe_fragment(destdn->get_dir(), false);
}
/**
 * Apply a journaled rename, notify replicas and slaves, and reply to the
 * client.  Runs after the EUpdate is safe (via C_MDS_rename_finish).
 */
void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_rename_finish " << *mdr << dendl;

  if (!mdr->more()->witnessed.empty())
    mdcache->logged_master_update(mdr->reqid);

  // apply
  _rename_apply(mdr, srcdn, destdn, straydn);

  mdcache->send_dentry_link(destdn, mdr);

  CDentry::linkage_t *destdnl = destdn->get_linkage();
  CInode *in = destdnl->get_inode();
  // caps were imported with the inode? re-evaluate locks below, after replying
  bool need_eval = mdr->more()->cap_imports.count(in);

  // test hack: test slave commit
  if (!mdr->more()->slaves.empty() && !in->is_dir())
    ceph_assert(g_conf()->mds_kill_rename_at != 5);
  if (!mdr->more()->slaves.empty() && in->is_dir())
    ceph_assert(g_conf()->mds_kill_rename_at != 6);

  // bump popularity
  mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
  if (destdnl->is_remote() && in->is_auth())
    mds->balancer->hit_inode(in, META_POP_IWR);

  // did we import srci?  if so, explicitly ack that import that, before we unlock and reply.

  ceph_assert(g_conf()->mds_kill_rename_at != 7);

  // reply
  respond_to_request(mdr, 0);

  if (need_eval)
    mds->locker->eval(in, CEPH_CAP_LOCKS, true);

  // clean up?
  // respond_to_request() drops locks. So stray reintegration can race with us.
  if (straydn && !straydn->get_projected_linkage()->is_null()) {
    mdcache->notify_stray(straydn);
  }
}
/**
 * Ask mds.who to witness this rename: send an OP_RENAMEPREP slave request
 * with both dentry traces, a replica of the stray dentry (if any), the
 * projected snaprealm blobs, the srcdn auth rank, and our full witness set.
 *
 * @return false if the witness is not active (request parked for retry),
 *         true once the prep has been sent and 'who' is in waiting_on_slave.
 */
bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesse,
				     vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
{
  // a recovering peer can't witness yet; park until it becomes active
  if (mds->is_cluster_degraded() &&
      !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
    dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
    if (mdr->more()->waiting_on_slave.empty())
      mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
    return false;
  }

  dout(10) << "_rename_prepare_witness mds." << who << dendl;
  auto req = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREP);

  // both paths are encoded relative to the traced dirfrag inos
  req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
  for (auto dn : srctrace)
    req->srcdnpath.push_dentry(dn->get_name());
  req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
  for (auto dn : dsttrace)
    req->destdnpath.push_dentry(dn->get_name());
  if (straydn)
    mdcache->encode_replica_stray(straydn, who, req->straybl);

  // projected snaprealms for source and displaced-dest inodes, if prepared
  if (mdr->more()->srci_srnode)
    encode(*mdr->more()->srci_srnode, req->srci_snapbl);
  if (mdr->more()->desti_srnode)
    encode(*mdr->more()->desti_srnode, req->desti_snapbl);

  req->srcdn_auth = mdr->more()->srcdn_auth_mds;

  // srcdn auth will verify our current witness list is sufficient
  req->witnesses = witnesse;

  req->op_stamp = mdr->get_op_stamp();
  mds->send_message_mds(req, who);

  // track the outstanding witness; the prep-ack handler erases this entry
  ceph_assert(mdr->more()->waiting_on_slave.count(who) == 0);
  mdr->more()->waiting_on_slave.insert(who);
  return true;
}
/**
 * Decode the inode (and client cap/session state) being imported along with a
 * cross-MDS rename of a primary dentry, as staged by the srcdn auth.
 *
 * @param client_map_bl  out: re-encoded client map for the journal entry
 * @return the imported inode's pre-import version (inode_import_v)
 */
version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
{
  version_t oldpv = mdr->more()->inode_import_v;

  CDentry::linkage_t *srcdnl = srcdn->get_linkage();

  /* import node */
  auto blp = mdr->more()->inode_import.cbegin();

  // imported caps
  map<client_t,entity_inst_t> client_map;
  map<client_t, client_metadata_t> client_metadata_map;
  decode(client_map, blp);
  decode(client_metadata_map, blp);
  // make sure sessions exist for all cap-holding clients before importing
  prepare_force_open_sessions(client_map, client_metadata_map,
			      mdr->more()->imported_session_map);
  encode(client_map, *client_map_bl, mds->mdsmap->get_up_features());
  encode(client_metadata_map, *client_map_bl);

  list<ScatterLock*> updated_scatterlocks;
  mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
					 mdr->more()->cap_imports, updated_scatterlocks);

  // hack: force back to !auth and clean, temporarily
  srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
  srcdnl->get_inode()->mark_clean();

  return oldpv;
}
/**
 * Decide whether a non-auth rename/rmdir of directory 'diri' must still be
 * journaled here because this rank holds auth subtrees affected by the move.
 *
 * @param empty  true when diri is being removed (rmdir: check diri's own
 *               frags for auth subtree roots); false when diri is being
 *               renamed (check for auth subtrees nested under its frags).
 * NOTE(review): the empty/non-empty branch split is reconstructed — the
 * branching lines were lost in extraction; verify against upstream.
 */
bool Server::_need_force_journal(CInode *diri, bool empty)
{
  auto&& dirs = diri->get_dirfrags();

  bool force_journal = false;
  if (empty) {
    for (const auto& dir : dirs) {
      if (dir->is_subtree_root() && dir->get_dir_auth().first == mds->get_nodeid()) {
	dout(10) << " frag " << dir->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
	force_journal = true;
	break;
      } else
	dout(20) << " frag " << dir->get_frag() << " is not auth subtree dirfrag" << dendl;
    }
  } else {
    // see if any children of our frags are auth subtrees.
    std::vector<CDir*> subtrees;
    mdcache->get_subtrees(subtrees);
    dout(10) << " subtrees " << subtrees << " frags " << dirs << dendl;
    for (const auto& dir : dirs) {
      for (const auto& subtree : subtrees) {
	if (dir->contains(subtree)) {
	  if (subtree->get_dir_auth().first == mds->get_nodeid()) {
	    dout(10) << " frag " << dir->get_frag() << " contains (maybe) auth subtree, will force journal "
		     << *subtree << dendl;
	    force_journal = true;
	    break;
	  } else
	    dout(20) << " frag " << dir->get_frag() << " contains but isn't auth for " << *subtree << dendl;
	} else
	  dout(20) << " frag " << dir->get_frag() << " does not contain " << *subtree << dendl;
      }
      if (force_journal)
	break;
    }
  }
  return force_journal;
}
8174 void Server::_rename_prepare(MDRequestRef
& mdr
,
8175 EMetaBlob
*metablob
, bufferlist
*client_map_bl
,
8176 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
8178 dout(10) << "_rename_prepare " << *mdr
<< " " << *srcdn
<< " " << *destdn
<< dendl
;
8180 dout(10) << " straydn " << *straydn
<< dendl
;
8182 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
8183 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
8184 CInode
*srci
= srcdnl
->get_inode();
8185 CInode
*oldin
= destdnl
->get_inode();
8187 // primary+remote link merge?
8188 bool linkmerge
= (srci
== oldin
);
8190 ceph_assert(srcdnl
->is_primary() && destdnl
->is_remote());
8191 bool silent
= srcdn
->get_dir()->inode
->is_stray();
8193 bool force_journal_dest
= false;
8194 if (srci
->is_dir() && !destdn
->is_auth()) {
8195 if (srci
->is_auth()) {
8196 // if we are auth for srci and exporting it, force journal because journal replay needs
8197 // the source inode to create auth subtrees.
8198 dout(10) << " we are exporting srci, will force journal destdn" << dendl
;
8199 force_journal_dest
= true;
8201 force_journal_dest
= _need_force_journal(srci
, false);
8204 bool force_journal_stray
= false;
8205 if (oldin
&& oldin
->is_dir() && straydn
&& !straydn
->is_auth())
8206 force_journal_stray
= _need_force_journal(oldin
, true);
8209 dout(10) << " merging remote and primary links to the same inode" << dendl
;
8211 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl
;
8212 if (force_journal_dest
)
8213 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl
;
8214 if (force_journal_stray
)
8215 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl
;
8217 if (srci
->is_dir() && (destdn
->is_auth() || force_journal_dest
)) {
8218 dout(10) << " noting renamed dir ino " << srci
->ino() << " in metablob" << dendl
;
8219 metablob
->renamed_dirino
= srci
->ino();
8220 } else if (oldin
&& oldin
->is_dir() && force_journal_stray
) {
8221 dout(10) << " noting rename target dir " << oldin
->ino() << " in metablob" << dendl
;
8222 metablob
->renamed_dirino
= oldin
->ino();
8226 CInode::mempool_inode
*spi
= 0; // renamed inode
8227 CInode::mempool_inode
*tpi
= 0; // target/overwritten inode
8231 if (destdnl
->is_primary()) {
8232 ceph_assert(straydn
); // moving to straydn.
8233 // link--, and move.
8234 if (destdn
->is_auth()) {
8235 auto &pi
= oldin
->project_inode(); //project_snaprealm
8236 pi
.inode
.version
= straydn
->pre_dirty(pi
.inode
.version
);
8237 pi
.inode
.update_backtrace();
8240 straydn
->push_projected_linkage(oldin
);
8241 } else if (destdnl
->is_remote()) {
8243 if (oldin
->is_auth()) {
8244 auto &pi
= oldin
->project_inode();
8245 pi
.inode
.version
= oldin
->pre_dirty();
8252 if (srcdnl
->is_remote()) {
8255 if (destdn
->is_auth())
8256 mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty();
8257 destdn
->push_projected_linkage(srcdnl
->get_remote_ino(), srcdnl
->get_remote_d_type());
8259 if (srci
->is_auth()) {
8260 auto &pi
= srci
->project_inode();
8261 pi
.inode
.version
= srci
->pre_dirty();
8265 dout(10) << " will merge remote onto primary link" << dendl
;
8266 if (destdn
->is_auth()) {
8267 auto &pi
= oldin
->project_inode();
8268 pi
.inode
.version
= mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty(oldin
->inode
.version
);
8273 if (destdn
->is_auth()) {
8275 if (srcdn
->is_auth())
8276 oldpv
= srci
->get_projected_version();
8278 oldpv
= _rename_prepare_import(mdr
, srcdn
, client_map_bl
);
8280 // note which dirfrags have child subtrees in the journal
8281 // event, so that we can open those (as bounds) during replay.
8282 if (srci
->is_dir()) {
8283 auto&& ls
= srci
->get_dirfrags();
8284 for (const auto& dir
: ls
) {
8285 if (!dir
->is_auth())
8286 metablob
->renamed_dir_frags
.push_back(dir
->get_frag());
8288 dout(10) << " noting renamed dir open frags " << metablob
->renamed_dir_frags
<< dendl
;
8291 auto &pi
= srci
->project_inode(); // project snaprealm if srcdnl->is_primary
8292 // & srcdnl->snaprealm
8293 pi
.inode
.version
= mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty(oldpv
);
8294 pi
.inode
.update_backtrace();
8297 destdn
->push_projected_linkage(srci
);
8301 if (srcdn
->is_auth())
8302 mdr
->more()->pvmap
[srcdn
] = srcdn
->pre_dirty();
8303 srcdn
->push_projected_linkage(); // push null linkage
8307 spi
->ctime
= mdr
->get_op_stamp();
8308 if (mdr
->get_op_stamp() > spi
->rstat
.rctime
)
8309 spi
->rstat
.rctime
= mdr
->get_op_stamp();
8315 tpi
->ctime
= mdr
->get_op_stamp();
8316 if (mdr
->get_op_stamp() > tpi
->rstat
.rctime
)
8317 tpi
->rstat
.rctime
= mdr
->get_op_stamp();
8321 destdn
->make_path_string(t
, true);
8322 tpi
->stray_prior_path
= std::move(t
);
8325 if (tpi
->nlink
== 0)
8326 oldin
->state_set(CInode::STATE_ORPHAN
);
8330 // prepare nesting, mtime updates
8331 int predirty_dir
= silent
? 0:PREDIRTY_DIR
;
8333 // guarantee stray dir is processed first during journal replay. unlink the old inode,
8334 // then link the source inode to destdn
8335 if (destdnl
->is_primary()) {
8336 ceph_assert(straydn
);
8337 if (straydn
->is_auth()) {
8338 metablob
->add_dir_context(straydn
->get_dir());
8339 metablob
->add_dir(straydn
->get_dir(), true);
8344 if (destdn
->is_auth() && !destdnl
->is_null()) {
8345 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, destdn
->get_dir(),
8346 (destdnl
->is_primary() ? PREDIRTY_PRIMARY
:0)|predirty_dir
, -1);
8347 if (destdnl
->is_primary()) {
8348 ceph_assert(straydn
);
8349 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, straydn
->get_dir(),
8350 PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
8355 int predirty_primary
= (srcdnl
->is_primary() && srcdn
->get_dir() != destdn
->get_dir()) ? PREDIRTY_PRIMARY
:0;
8356 int flags
= predirty_dir
| predirty_primary
;
8357 if (srcdn
->is_auth())
8358 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, srcdn
->get_dir(), PREDIRTY_SHALLOW
|flags
, -1);
8359 if (destdn
->is_auth())
8360 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, destdn
->get_dir(), flags
, 1);
8362 // add it all to the metablob
8365 if (destdnl
->is_primary()) {
8366 ceph_assert(straydn
);
8367 if (destdn
->is_auth()) {
8368 // project snaprealm, too
8369 if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
8370 oldin
->project_snaprealm(desti_srnode
);
8371 if (tpi
->nlink
== 0)
8372 ceph_assert(!desti_srnode
->is_parent_global());
8373 desti_srnode
= NULL
;
8375 straydn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
8376 metablob
->add_primary_dentry(straydn
, oldin
, true, true);
8377 } else if (force_journal_stray
) {
8378 dout(10) << " forced journaling straydn " << *straydn
<< dendl
;
8379 metablob
->add_dir_context(straydn
->get_dir());
8380 metablob
->add_primary_dentry(straydn
, oldin
, true);
8382 } else if (destdnl
->is_remote()) {
8383 if (oldin
->is_auth()) {
8384 sr_t
*new_srnode
= NULL
;
8385 if (mdr
->slave_request
) {
8386 if (mdr
->slave_request
->desti_snapbl
.length() > 0) {
8387 new_srnode
= new sr_t();
8388 auto p
= mdr
->slave_request
->desti_snapbl
.cbegin();
8389 decode(*new_srnode
, p
);
8391 } else if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
8392 new_srnode
= desti_srnode
;
8393 desti_srnode
= NULL
;
8396 oldin
->project_snaprealm(new_srnode
);
8397 if (tpi
->nlink
== 0)
8398 ceph_assert(!new_srnode
->is_parent_global());
8401 metablob
->add_dir_context(oldin
->get_projected_parent_dir());
8402 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, oldin
->get_projected_parent_dn(),
8403 CEPH_NOSNAP
, 0, destdnl
);
8404 metablob
->add_primary_dentry(oldin
->get_projected_parent_dn(), oldin
, true);
8410 if (srcdnl
->is_remote()) {
8411 ceph_assert(!linkmerge
);
8412 if (destdn
->is_auth() && !destdnl
->is_null())
8413 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
8415 destdn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
8417 if (destdn
->is_auth())
8418 metablob
->add_remote_dentry(destdn
, true, srcdnl
->get_remote_ino(), srcdnl
->get_remote_d_type());
8420 if (srci
->is_auth() ) { // it's remote
8421 if (mdr
->slave_request
) {
8422 if (mdr
->slave_request
->srci_snapbl
.length() > 0) {
8423 sr_t
*new_srnode
= new sr_t();
8424 auto p
= mdr
->slave_request
->srci_snapbl
.cbegin();
8425 decode(*new_srnode
, p
);
8426 srci
->project_snaprealm(new_srnode
);
8428 } else if (auto& srci_srnode
= mdr
->more()->srci_srnode
) {
8429 srci
->project_snaprealm(srci_srnode
);
8433 CDentry
*srci_pdn
= srci
->get_projected_parent_dn();
8434 metablob
->add_dir_context(srci_pdn
->get_dir());
8435 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, srci_pdn
, CEPH_NOSNAP
, 0, srcdnl
);
8436 metablob
->add_primary_dentry(srci_pdn
, srci
, true);
8438 } else if (srcdnl
->is_primary()) {
8439 // project snap parent update?
8440 if (destdn
->is_auth()) {
8441 if (auto& srci_srnode
= mdr
->more()->srci_srnode
) {
8442 srci
->project_snaprealm(srci_srnode
);
8447 if (destdn
->is_auth() && !destdnl
->is_null())
8448 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
8450 destdn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
8452 if (destdn
->is_auth())
8453 metablob
->add_primary_dentry(destdn
, srci
, true, true);
8454 else if (force_journal_dest
) {
8455 dout(10) << " forced journaling destdn " << *destdn
<< dendl
;
8456 metablob
->add_dir_context(destdn
->get_dir());
8457 metablob
->add_primary_dentry(destdn
, srci
, true);
8458 if (srcdn
->is_auth() && srci
->is_dir()) {
8459 // journal new subtrees root dirfrags
8460 auto&& ls
= srci
->get_dirfrags();
8461 for (const auto& dir
: ls
) {
8463 metablob
->add_dir(dir
, true);
8470 if (srcdn
->is_auth()) {
8471 dout(10) << " journaling srcdn " << *srcdn
<< dendl
;
8472 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, srcdn
, CEPH_NOSNAP
, 0, srcdnl
);
8473 // also journal the inode in case we need do slave rename rollback. It is Ok to add
8474 // both primary and NULL dentries. Because during journal replay, null dentry is
8475 // processed after primary dentry.
8476 if (srcdnl
->is_primary() && !srci
->is_dir() && !destdn
->is_auth())
8477 metablob
->add_primary_dentry(srcdn
, srci
, true);
8478 metablob
->add_null_dentry(srcdn
, true);
8480 dout(10) << " NOT journaling srcdn " << *srcdn
<< dendl
;
8482 // make renamed inode first track the dn
8483 if (srcdnl
->is_primary() && destdn
->is_auth()) {
8484 ceph_assert(srci
->first
<= destdn
->first
);
8485 srci
->first
= destdn
->first
;
8487 // make stray inode first track the straydn
8488 if (straydn
&& straydn
->is_auth()) {
8489 ceph_assert(oldin
->first
<= straydn
->first
);
8490 oldin
->first
= straydn
->first
;
8493 if (oldin
&& oldin
->is_dir()) {
8494 ceph_assert(straydn
);
8495 mdcache
->project_subtree_rename(oldin
, destdn
->get_dir(), straydn
->get_dir());
8498 mdcache
->project_subtree_rename(srci
, srcdn
->get_dir(), destdn
->get_dir());
8503 void Server::_rename_apply(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
8505 dout(10) << "_rename_apply " << *mdr
<< " " << *srcdn
<< " " << *destdn
<< dendl
;
8506 dout(10) << " pvs " << mdr
->more()->pvmap
<< dendl
;
8508 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
8509 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
8511 CInode
*oldin
= destdnl
->get_inode();
8513 // primary+remote link merge?
8514 bool linkmerge
= (srcdnl
->get_inode() == oldin
);
8516 ceph_assert(srcdnl
->is_primary() || destdnl
->is_remote());
8518 bool new_in_snaprealm
= false;
8519 bool new_oldin_snaprealm
= false;
8523 if (destdnl
->is_primary()) {
8524 ceph_assert(straydn
);
8525 dout(10) << "straydn is " << *straydn
<< dendl
;
8527 // if there is newly created snaprealm, need to split old snaprealm's
8528 // inodes_with_caps. So pop snaprealm before linkage changes.
8529 if (destdn
->is_auth()) {
8530 bool hadrealm
= (oldin
->snaprealm
? true : false);
8531 oldin
->early_pop_projected_snaprealm();
8532 new_oldin_snaprealm
= (oldin
->snaprealm
&& !hadrealm
);
8534 ceph_assert(mdr
->slave_request
);
8535 if (mdr
->slave_request
->desti_snapbl
.length()) {
8536 new_oldin_snaprealm
= !oldin
->snaprealm
;
8537 oldin
->decode_snap_blob(mdr
->slave_request
->desti_snapbl
);
8538 ceph_assert(oldin
->snaprealm
);
8539 ceph_assert(oldin
->snaprealm
->have_past_parents_open());
8543 destdn
->get_dir()->unlink_inode(destdn
, false);
8545 straydn
->pop_projected_linkage();
8546 if (mdr
->is_slave() && !mdr
->more()->slave_update_journaled
)
8547 ceph_assert(!straydn
->is_projected()); // no other projected
8550 if (destdn
->is_auth())
8551 oldin
->pop_and_dirty_projected_inode(mdr
->ls
);
8553 mdcache
->touch_dentry_bottom(straydn
); // drop dn as quickly as possible.
8554 } else if (destdnl
->is_remote()) {
8555 destdn
->get_dir()->unlink_inode(destdn
, false);
8556 if (oldin
->is_auth()) {
8557 oldin
->pop_and_dirty_projected_inode(mdr
->ls
);
8558 } else if (mdr
->slave_request
) {
8559 if (mdr
->slave_request
->desti_snapbl
.length() > 0) {
8560 ceph_assert(oldin
->snaprealm
);
8561 oldin
->decode_snap_blob(mdr
->slave_request
->desti_snapbl
);
8563 } else if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
8564 delete desti_srnode
;
8565 desti_srnode
= NULL
;
8570 // unlink src before we relink it at dest
8571 CInode
*in
= srcdnl
->get_inode();
8574 bool srcdn_was_remote
= srcdnl
->is_remote();
8575 if (!srcdn_was_remote
) {
8576 // if there is newly created snaprealm, need to split old snaprealm's
8577 // inodes_with_caps. So pop snaprealm before linkage changes.
8578 if (destdn
->is_auth()) {
8579 bool hadrealm
= (in
->snaprealm
? true : false);
8580 in
->early_pop_projected_snaprealm();
8581 new_in_snaprealm
= (in
->snaprealm
&& !hadrealm
);
8583 ceph_assert(mdr
->slave_request
);
8584 if (mdr
->slave_request
->srci_snapbl
.length()) {
8585 new_in_snaprealm
= !in
->snaprealm
;
8586 in
->decode_snap_blob(mdr
->slave_request
->srci_snapbl
);
8587 ceph_assert(in
->snaprealm
);
8588 ceph_assert(in
->snaprealm
->have_past_parents_open());
8593 srcdn
->get_dir()->unlink_inode(srcdn
);
8596 if (srcdn_was_remote
) {
8599 destdnl
= destdn
->pop_projected_linkage();
8600 if (mdr
->is_slave() && !mdr
->more()->slave_update_journaled
)
8601 ceph_assert(!destdn
->is_projected()); // no other projected
8603 destdn
->link_remote(destdnl
, in
);
8604 if (destdn
->is_auth())
8605 destdn
->mark_dirty(mdr
->more()->pvmap
[destdn
], mdr
->ls
);
8607 if (in
->is_auth()) {
8608 in
->pop_and_dirty_projected_inode(mdr
->ls
);
8609 } else if (mdr
->slave_request
) {
8610 if (mdr
->slave_request
->srci_snapbl
.length() > 0) {
8611 ceph_assert(in
->snaprealm
);
8612 in
->decode_snap_blob(mdr
->slave_request
->srci_snapbl
);
8614 } else if (auto& srci_srnode
= mdr
->more()->srci_srnode
) {
8619 dout(10) << "merging remote onto primary link" << dendl
;
8620 oldin
->pop_and_dirty_projected_inode(mdr
->ls
);
8624 dout(10) << "merging primary onto remote link" << dendl
;
8625 destdn
->get_dir()->unlink_inode(destdn
, false);
8627 destdnl
= destdn
->pop_projected_linkage();
8628 if (mdr
->is_slave() && !mdr
->more()->slave_update_journaled
)
8629 ceph_assert(!destdn
->is_projected()); // no other projected
8631 // srcdn inode import?
8632 if (!srcdn
->is_auth() && destdn
->is_auth()) {
8633 ceph_assert(mdr
->more()->inode_import
.length() > 0);
8635 map
<client_t
,Capability::Import
> imported_caps
;
8637 // finish cap imports
8638 finish_force_open_sessions(mdr
->more()->imported_session_map
);
8639 if (mdr
->more()->cap_imports
.count(destdnl
->get_inode())) {
8640 mdcache
->migrator
->finish_import_inode_caps(destdnl
->get_inode(),
8641 mdr
->more()->srcdn_auth_mds
, true,
8642 mdr
->more()->imported_session_map
,
8643 mdr
->more()->cap_imports
[destdnl
->get_inode()],
8647 mdr
->more()->inode_import
.clear();
8648 encode(imported_caps
, mdr
->more()->inode_import
);
8650 /* hack: add an auth pin for each xlock we hold. These were
8651 * remote xlocks previously but now they're local and
8652 * we're going to try and unpin when we xlock_finish. */
8654 for (auto i
= mdr
->locks
.lower_bound(&destdnl
->get_inode()->versionlock
);
8655 i
!= mdr
->locks
.end();
8657 SimpleLock
*lock
= i
->lock
;
8658 if (lock
->get_parent() != destdnl
->get_inode())
8660 if (i
->is_xlock() && !lock
->is_locallock())
8661 mds
->locker
->xlock_import(lock
);
8664 // hack: fix auth bit
8665 in
->state_set(CInode::STATE_AUTH
);
8667 mdr
->clear_ambiguous_auth();
8670 if (destdn
->is_auth())
8671 in
->pop_and_dirty_projected_inode(mdr
->ls
);
8675 if (srcdn
->is_auth())
8676 srcdn
->mark_dirty(mdr
->more()->pvmap
[srcdn
], mdr
->ls
);
8677 srcdn
->pop_projected_linkage();
8678 if (mdr
->is_slave() && !mdr
->more()->slave_update_journaled
)
8679 ceph_assert(!srcdn
->is_projected()); // no other projected
8681 // apply remaining projected inodes (nested)
8684 // update subtree map?
8685 if (destdnl
->is_primary() && in
->is_dir())
8686 mdcache
->adjust_subtree_after_rename(in
, srcdn
->get_dir(), true);
8688 if (straydn
&& oldin
->is_dir())
8689 mdcache
->adjust_subtree_after_rename(oldin
, destdn
->get_dir(), true);
8691 if (new_oldin_snaprealm
)
8692 mdcache
->do_realm_invalidate_and_update_notify(oldin
, CEPH_SNAP_OP_SPLIT
, false);
8693 if (new_in_snaprealm
)
8694 mdcache
->do_realm_invalidate_and_update_notify(in
, CEPH_SNAP_OP_SPLIT
, true);
8696 // removing a new dn?
8697 if (srcdn
->is_auth())
8698 srcdn
->get_dir()->try_remove_unlinked_dn(srcdn
);
8706 class C_MDS_SlaveRenamePrep
: public ServerLogContext
{
8707 CDentry
*srcdn
, *destdn
, *straydn
;
8709 C_MDS_SlaveRenamePrep(Server
*s
, MDRequestRef
& m
, CDentry
*sr
, CDentry
*de
, CDentry
*st
) :
8710 ServerLogContext(s
, m
), srcdn(sr
), destdn(de
), straydn(st
) {}
8711 void finish(int r
) override
{
8712 server
->_logged_slave_rename(mdr
, srcdn
, destdn
, straydn
);
8716 class C_MDS_SlaveRenameCommit
: public ServerContext
{
8718 CDentry
*srcdn
, *destdn
, *straydn
;
8720 C_MDS_SlaveRenameCommit(Server
*s
, MDRequestRef
& m
, CDentry
*sr
, CDentry
*de
, CDentry
*st
) :
8721 ServerContext(s
), mdr(m
), srcdn(sr
), destdn(de
), straydn(st
) {}
8722 void finish(int r
) override
{
8723 server
->_commit_slave_rename(mdr
, r
, srcdn
, destdn
, straydn
);
8727 class C_MDS_SlaveRenameSessionsFlushed
: public ServerContext
{
8730 C_MDS_SlaveRenameSessionsFlushed(Server
*s
, MDRequestRef
& r
) :
8731 ServerContext(s
), mdr(r
) {}
8732 void finish(int r
) override
{
8733 server
->_slave_rename_sessions_flushed(mdr
);
8737 void Server::handle_slave_rename_prep(MDRequestRef
& mdr
)
8739 dout(10) << "handle_slave_rename_prep " << *mdr
8740 << " " << mdr
->slave_request
->srcdnpath
8741 << " to " << mdr
->slave_request
->destdnpath
8744 if (mdr
->slave_request
->is_interrupted()) {
8745 dout(10) << " slave request interrupted, sending noop reply" << dendl
;
8746 auto reply
= make_message
<MMDSSlaveRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_RENAMEPREPACK
);
8747 reply
->mark_interrupted();
8748 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
8749 mdr
->reset_slave_request();
8754 filepath
destpath(mdr
->slave_request
->destdnpath
);
8755 dout(10) << " dest " << destpath
<< dendl
;
8756 vector
<CDentry
*> trace
;
8757 CF_MDS_MDRContextFactory
cf(mdcache
, mdr
, false);
8758 int r
= mdcache
->path_traverse(mdr
, cf
, destpath
,
8759 MDS_TRAVERSE_DISCOVER
| MDS_TRAVERSE_PATH_LOCKED
| MDS_TRAVERSE_WANT_DENTRY
,
8763 mdcache
->find_ino_peers(destpath
.get_ino(), new C_MDS_RetryRequest(mdcache
, mdr
),
8764 mdr
->slave_to_mds
, true);
8767 ceph_assert(r
== 0); // we shouldn't get an error here!
8769 CDentry
*destdn
= trace
.back();
8770 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
8771 dout(10) << " destdn " << *destdn
<< dendl
;
8775 filepath
srcpath(mdr
->slave_request
->srcdnpath
);
8776 dout(10) << " src " << srcpath
<< dendl
;
8777 CInode
*srci
= nullptr;
8778 r
= mdcache
->path_traverse(mdr
, cf
, srcpath
,
8779 MDS_TRAVERSE_DISCOVER
| MDS_TRAVERSE_PATH_LOCKED
,
8782 ceph_assert(r
== 0);
8784 CDentry
*srcdn
= trace
.back();
8785 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
8786 dout(10) << " srcdn " << *srcdn
<< dendl
;
8791 bool linkmerge
= srcdnl
->get_inode() == destdnl
->get_inode();
8793 ceph_assert(srcdnl
->is_primary() && destdnl
->is_remote());
8794 CDentry
*straydn
= mdr
->straydn
;
8795 if (destdnl
->is_primary() && !linkmerge
)
8796 ceph_assert(straydn
);
8798 mdr
->set_op_stamp(mdr
->slave_request
->op_stamp
);
8799 mdr
->more()->srcdn_auth_mds
= srcdn
->authority().first
;
8801 // set up commit waiter (early, to clean up any freezing etc we do)
8802 if (!mdr
->more()->slave_commit
)
8803 mdr
->more()->slave_commit
= new C_MDS_SlaveRenameCommit(this, mdr
, srcdn
, destdn
, straydn
);
8806 if (srcdn
->is_auth()) {
8807 set
<mds_rank_t
> srcdnrep
;
8808 srcdn
->list_replicas(srcdnrep
);
8810 bool reply_witness
= false;
8811 if (srcdnl
->is_primary() && !srcdnl
->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH
)) {
8814 // - avoid conflicting lock state changes
8815 // - avoid concurrent updates to the inode
8816 // (this could also be accomplished with the versionlock)
8817 int allowance
= 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
8818 dout(10) << " freezing srci " << *srcdnl
->get_inode() << " with allowance " << allowance
<< dendl
;
8819 bool frozen_inode
= srcdnl
->get_inode()->freeze_inode(allowance
);
8821 // unfreeze auth pin after freezing the inode to avoid queueing waiters
8822 if (srcdnl
->get_inode()->is_frozen_auth_pin())
8823 mdr
->unfreeze_auth_pin();
8825 if (!frozen_inode
) {
8826 srcdnl
->get_inode()->add_waiter(CInode::WAIT_FROZEN
, new C_MDS_RetryRequest(mdcache
, mdr
));
8831 * set ambiguous auth for srci
8832 * NOTE: we don't worry about ambiguous cache expire as we do
8833 * with subtree migrations because all slaves will pin
8834 * srcdn->get_inode() for duration of this rename.
8836 mdr
->set_ambiguous_auth(srcdnl
->get_inode());
8838 // just mark the source inode as ambiguous auth if more than two MDS are involved.
8839 // the master will send another OP_RENAMEPREP slave request later.
8840 if (mdr
->slave_request
->witnesses
.size() > 1) {
8841 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl
;
8842 reply_witness
= true;
8845 // make sure bystanders have received all lock related messages
8846 for (set
<mds_rank_t
>::iterator p
= srcdnrep
.begin(); p
!= srcdnrep
.end(); ++p
) {
8847 if (*p
== mdr
->slave_to_mds
||
8848 (mds
->is_cluster_degraded() &&
8849 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(*p
)))
8851 auto notify
= make_message
<MMDSSlaveRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_RENAMENOTIFY
);
8852 mds
->send_message_mds(notify
, *p
);
8853 mdr
->more()->waiting_on_slave
.insert(*p
);
8856 // make sure clients have received all cap related messages
8857 set
<client_t
> export_client_set
;
8858 mdcache
->migrator
->get_export_client_set(srcdnl
->get_inode(), export_client_set
);
8860 MDSGatherBuilder
gather(g_ceph_context
);
8861 flush_client_sessions(export_client_set
, gather
);
8862 if (gather
.has_subs()) {
8863 mdr
->more()->waiting_on_slave
.insert(MDS_RANK_NONE
);
8864 gather
.set_finisher(new C_MDS_SlaveRenameSessionsFlushed(this, mdr
));
8869 // is witness list sufficient?
8870 for (set
<mds_rank_t
>::iterator p
= srcdnrep
.begin(); p
!= srcdnrep
.end(); ++p
) {
8871 if (*p
== mdr
->slave_to_mds
||
8872 mdr
->slave_request
->witnesses
.count(*p
)) continue;
8873 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl
;
8874 reply_witness
= true;
8878 if (reply_witness
) {
8879 ceph_assert(!srcdnrep
.empty());
8880 auto reply
= make_message
<MMDSSlaveRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_RENAMEPREPACK
);
8881 reply
->witnesses
.swap(srcdnrep
);
8882 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
8883 mdr
->reset_slave_request();
8886 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl
;
8887 if (!mdr
->more()->waiting_on_slave
.empty()) {
8888 dout(10) << " still waiting for rename notify acks from "
8889 << mdr
->more()->waiting_on_slave
<< dendl
;
8892 } else if (srcdnl
->is_primary() && srcdn
->authority() != destdn
->authority()) {
8893 // set ambiguous auth for srci on witnesses
8894 mdr
->set_ambiguous_auth(srcdnl
->get_inode());
8897 // encode everything we'd need to roll this back... basically, just the original state.
8898 rename_rollback rollback
;
8900 rollback
.reqid
= mdr
->reqid
;
8902 rollback
.orig_src
.dirfrag
= srcdn
->get_dir()->dirfrag();
8903 rollback
.orig_src
.dirfrag_old_mtime
= srcdn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
8904 rollback
.orig_src
.dirfrag_old_rctime
= srcdn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
8905 rollback
.orig_src
.dname
= srcdn
->get_name();
8906 if (srcdnl
->is_primary())
8907 rollback
.orig_src
.ino
= srcdnl
->get_inode()->ino();
8909 ceph_assert(srcdnl
->is_remote());
8910 rollback
.orig_src
.remote_ino
= srcdnl
->get_remote_ino();
8911 rollback
.orig_src
.remote_d_type
= srcdnl
->get_remote_d_type();
8914 rollback
.orig_dest
.dirfrag
= destdn
->get_dir()->dirfrag();
8915 rollback
.orig_dest
.dirfrag_old_mtime
= destdn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
8916 rollback
.orig_dest
.dirfrag_old_rctime
= destdn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
8917 rollback
.orig_dest
.dname
= destdn
->get_name();
8918 if (destdnl
->is_primary())
8919 rollback
.orig_dest
.ino
= destdnl
->get_inode()->ino();
8920 else if (destdnl
->is_remote()) {
8921 rollback
.orig_dest
.remote_ino
= destdnl
->get_remote_ino();
8922 rollback
.orig_dest
.remote_d_type
= destdnl
->get_remote_d_type();
8926 rollback
.stray
.dirfrag
= straydn
->get_dir()->dirfrag();
8927 rollback
.stray
.dirfrag_old_mtime
= straydn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
8928 rollback
.stray
.dirfrag_old_rctime
= straydn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
8929 rollback
.stray
.dname
= straydn
->get_name();
8931 if (mdr
->slave_request
->desti_snapbl
.length()) {
8932 CInode
*oldin
= destdnl
->get_inode();
8933 if (oldin
->snaprealm
) {
8934 encode(true, rollback
.desti_snapbl
);
8935 oldin
->encode_snap_blob(rollback
.desti_snapbl
);
8937 encode(false, rollback
.desti_snapbl
);
8940 if (mdr
->slave_request
->srci_snapbl
.length()) {
8941 if (srci
->snaprealm
) {
8942 encode(true, rollback
.srci_snapbl
);
8943 srci
->encode_snap_blob(rollback
.srci_snapbl
);
8945 encode(false, rollback
.srci_snapbl
);
8948 encode(rollback
, mdr
->more()->rollback_bl
);
8949 // FIXME: rollback snaprealm
8950 dout(20) << " rollback is " << mdr
->more()->rollback_bl
.length() << " bytes" << dendl
;
8953 mdr
->ls
= mdlog
->get_current_segment();
8954 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rename_prep", mdr
->reqid
, mdr
->slave_to_mds
,
8955 ESlaveUpdate::OP_PREPARE
, ESlaveUpdate::RENAME
);
8956 mdlog
->start_entry(le
);
8957 le
->rollback
= mdr
->more()->rollback_bl
;
8959 bufferlist blah
; // inode import data... obviously not used if we're the slave
8960 _rename_prepare(mdr
, &le
->commit
, &blah
, srcdn
, destdn
, straydn
);
8962 if (le
->commit
.empty()) {
8963 dout(10) << " empty metablob, skipping journal" << dendl
;
8964 mdlog
->cancel_entry(le
);
8966 _logged_slave_rename(mdr
, srcdn
, destdn
, straydn
);
8968 mdcache
->add_uncommitted_slave(mdr
->reqid
, mdr
->ls
, mdr
->slave_to_mds
);
8969 mdr
->more()->slave_update_journaled
= true;
8970 submit_mdlog_entry(le
, new C_MDS_SlaveRenamePrep(this, mdr
, srcdn
, destdn
, straydn
),
8976 void Server::_logged_slave_rename(MDRequestRef
& mdr
,
8977 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
8979 dout(10) << "_logged_slave_rename " << *mdr
<< dendl
;
8982 ref_t
<MMDSSlaveRequest
> reply
;
8983 if (!mdr
->aborted
) {
8984 reply
= make_message
<MMDSSlaveRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_RENAMEPREPACK
);
8985 if (!mdr
->more()->slave_update_journaled
)
8986 reply
->mark_not_journaled();
8989 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
8990 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
8993 if (srcdn
->is_auth() && srcdnl
->is_primary()) {
8994 // set export bounds for CInode::encode_export()
8996 std::vector
<CDir
*> bounds
;
8997 if (srcdnl
->get_inode()->is_dir()) {
8998 srcdnl
->get_inode()->get_dirfrags(bounds
);
8999 for (const auto& bound
: bounds
) {
9000 bound
->state_set(CDir::STATE_EXPORTBOUND
);
9004 map
<client_t
,entity_inst_t
> exported_client_map
;
9005 map
<client_t
, client_metadata_t
> exported_client_metadata_map
;
9007 mdcache
->migrator
->encode_export_inode(srcdnl
->get_inode(), inodebl
,
9008 exported_client_map
,
9009 exported_client_metadata_map
);
9011 for (const auto& bound
: bounds
) {
9012 bound
->state_clear(CDir::STATE_EXPORTBOUND
);
9015 encode(exported_client_map
, reply
->inode_export
, mds
->mdsmap
->get_up_features());
9016 encode(exported_client_metadata_map
, reply
->inode_export
);
9017 reply
->inode_export
.claim_append(inodebl
);
9018 reply
->inode_export_v
= srcdnl
->get_inode()->inode
.version
;
9021 // remove mdr auth pin
9022 mdr
->auth_unpin(srcdnl
->get_inode());
9023 mdr
->more()->is_inode_exporter
= true;
9025 if (srcdnl
->get_inode()->is_dirty())
9026 srcdnl
->get_inode()->mark_clean();
9028 dout(10) << " exported srci " << *srcdnl
->get_inode() << dendl
;
9032 _rename_apply(mdr
, srcdn
, destdn
, straydn
);
9034 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
9037 mds
->balancer
->hit_dir(srcdn
->get_dir(), META_POP_IWR
);
9038 if (destdnl
->get_inode() && destdnl
->get_inode()->is_auth())
9039 mds
->balancer
->hit_inode(destdnl
->get_inode(), META_POP_IWR
);
9042 mdr
->reset_slave_request();
9046 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
9048 ceph_assert(mdr
->aborted
);
9049 dout(10) << " abort flag set, finishing" << dendl
;
9050 mdcache
->request_finish(mdr
);
/*
 * Second phase of a slave rename on this (non-master) MDS: either commit the
 * prepared update (r == 0) or abort and roll it back (r != 0).
 *
 * @param mdr     the slave request being committed/aborted
 * @param r       0 to commit, nonzero to abort
 * @param srcdn   source dentry of the rename
 * @param destdn  destination dentry of the rename
 * @param straydn stray dentry holding a replaced target inode, or NULL
 *
 * NOTE(review): this body was reconstructed from a mangled extraction;
 * dropped structural lines (braces/else/break) were restored to match the
 * visible control flow — verify against repository history.
 */
void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
				  CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;

  CInode *in = destdn->get_linkage()->get_inode();

  // remember if we migrated an inode out of a stray dir, so we can tell the
  // cache when we are stopping (shutdown export accounting)
  inodeno_t migrated_stray;
  if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
    migrated_stray = in->ino();

  MDSContext::vec finished;
  if (r == 0) {
    // unfreeze+singleauth inode
    //  hmm, do i really need to delay this?
    if (mdr->more()->is_inode_exporter) {
      // we exported, clear out any xlocks that we moved to another MDS

      for (auto i = mdr->locks.lower_bound(&in->versionlock);
	   i != mdr->locks.end(); ) {
	SimpleLock *lock = i->lock;
	if (lock->get_parent() != in)
	  break;
	// we only care about xlocks on the exported inode
	if (i->is_xlock() && !lock->is_locallock())
	  mds->locker->xlock_export(i++, mdr.get());
	else
	  ++i;
      }

      // apply the cap import state the importing MDS sent back to us
      map<client_t,Capability::Import> peer_imported;
      auto bp = mdr->more()->inode_import.cbegin();
      decode(peer_imported, bp);

      dout(10) << " finishing inode export on " << *in << dendl;
      mdcache->migrator->finish_export_inode(in, mdr->slave_to_mds, peer_imported, finished);
      mds->queue_waiters(finished);   // this includes SINGLEAUTH waiters.

      // unfreeze
      ceph_assert(in->is_frozen_inode());
      in->unfreeze_inode(finished);
    }

    // singleauth
    if (mdr->more()->is_ambiguous_auth) {
      mdr->more()->rename_inode->clear_ambiguous_auth(finished);
      mdr->more()->is_ambiguous_auth = false;
    }

    // a replaced inode parked in the stray dir with no snaps attached can
    // drop its dirty bits now that the rename is final
    if (straydn && mdr->more()->slave_update_journaled) {
      CInode *strayin = straydn->get_projected_linkage()->get_inode();
      if (strayin && !strayin->snaprealm)
	mdcache->clear_dirty_bits_for_stray(strayin);
    }

    mds->queue_waiters(finished);

    // commit.
    if (mdr->more()->slave_update_journaled) {
      // write a commit to the journal
      ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid,
					  mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
					  ESlaveUpdate::RENAME);
      mdlog->start_entry(le);
      submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
      mdlog->flush();
    } else {
      // nothing hit the journal during prepare, so there is nothing to commit
      _committed_slave(mdr);
    }
  } else {

    // abort
    //  rollback_bl may be empty if we froze the inode but had to provide an expanded
    // witness list from the master, and they failed before we tried prep again.
    if (mdr->more()->rollback_bl.length()) {
      if (mdr->more()->is_inode_exporter) {
	dout(10) << " reversing inode export of " << *in << dendl;
	in->abort_export();
      }
      if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
	mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
	// rollback but preserve the slave request
	do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, false);
	mdr->more()->rollback_bl.clear();
      } else
	do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, true);
    } else {
      dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl;
      // singleauth
      if (mdr->more()->is_ambiguous_auth) {
	if (srcdn->is_auth())
	  mdr->more()->rename_inode->unfreeze_inode(finished);

	mdr->more()->rename_inode->clear_ambiguous_auth(finished);
	mdr->more()->is_ambiguous_auth = false;
      }
      mds->queue_waiters(finished);
      mdcache->request_finish(mdr);
    }
  }

  if (migrated_stray && mds->is_stopping())
    mdcache->shutdown_export_stray_finish(migrated_stray);
}
9160 void _rollback_repair_dir(MutationRef
& mut
, CDir
*dir
, rename_rollback::drec
&r
, utime_t ctime
,
9161 bool isdir
, int linkunlink
, nest_info_t
&rstat
)
9164 pf
= dir
->project_fnode();
9165 mut
->add_projected_fnode(dir
);
9166 pf
->version
= dir
->pre_dirty();
9169 pf
->fragstat
.nsubdirs
+= linkunlink
;
9171 pf
->fragstat
.nfiles
+= linkunlink
;
9174 pf
->rstat
.rbytes
+= linkunlink
* rstat
.rbytes
;
9175 pf
->rstat
.rfiles
+= linkunlink
* rstat
.rfiles
;
9176 pf
->rstat
.rsubdirs
+= linkunlink
* rstat
.rsubdirs
;
9177 pf
->rstat
.rsnaps
+= linkunlink
* rstat
.rsnaps
;
9179 if (pf
->fragstat
.mtime
== ctime
) {
9180 pf
->fragstat
.mtime
= r
.dirfrag_old_mtime
;
9181 if (pf
->rstat
.rctime
== ctime
)
9182 pf
->rstat
.rctime
= r
.dirfrag_old_rctime
;
9184 mut
->add_updated_lock(&dir
->get_inode()->filelock
);
9185 mut
->add_updated_lock(&dir
->get_inode()->nestlock
);
/*
 * Log-completion context for a journaled rename rollback: once the
 * ESlaveUpdate::OP_ROLLBACK entry is safe, finish applying the rollback.
 * Takes ownership of the two snap-split maps by swapping them in.
 *
 * NOTE(review): the member declarations were restored from a mangled
 * extraction (they were dropped); names follow the ctor initializer list.
 */
struct C_MDS_LoggedRenameRollback : public ServerLogContext {
  MutationRef mut;          // projected state to apply/cleanup
  CDentry *srcdn;           // source dentry being restored
  version_t srcdnpv;        // pre-dirtied version for srcdn, 0 if not ours
  CDentry *destdn;          // destination dentry being reverted
  CDentry *straydn;         // stray dentry to unlink, may be NULL
  map<client_t,ref_t<MClientSnap>> splits[2];  // deferred snap notifications
  bool finish_mdr;          // whether to finish the slave request afterwards
  C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
			     CDentry *sd, version_t pv, CDentry *dd, CDentry *st,
			     map<client_t,ref_t<MClientSnap>> _splits[2], bool f) :
    ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
    straydn(st), finish_mdr(f) {
    splits[0].swap(_splits[0]);
    splits[1].swap(_splits[1]);
  }
  void finish(int r) override {
    server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
				    destdn, straydn, splits, finish_mdr);
  }
};
/*
 * Undo a prepared (but uncommitted) slave rename, driven by the rollback
 * record the slave stashed during prepare. Restores the src/dest/stray
 * dentry linkages, reverts inode ctimes/snaprealms, repairs directory
 * stats, then journals an ESlaveUpdate::OP_ROLLBACK (or skips the journal
 * if the prepare never journaled anything).
 *
 * @param rbl        encoded rename_rollback record
 * @param master     rank of the master MDS that drove the rename
 * @param mdr        slave request being rolled back (may be null during resolve)
 * @param finish_mdr whether _rename_rollback_finish should finish mdr
 *
 * NOTE(review): reconstructed from a mangled extraction; dropped structural
 * lines (braces/else, a few declarations, the destdir repair call) were
 * restored to match the visible control flow — verify against repo history.
 */
void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr,
				bool finish_mdr)
{
  rename_rollback rollback;
  auto p = rbl.cbegin();
  decode(rollback, p);

  dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
  // need to finish this update before sending resolve to claim the subtree
  mdcache->add_rollback(rollback.reqid, master);

  MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
  mut->ls = mds->mdlog->get_current_segment();

  // look up the source dentry; fall back to a path-based dirfrag lookup if
  // the exact frag is gone (it may have been refragmented)
  CDentry *srcdn = NULL;
  CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
  if (!srcdir)
    srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
  if (srcdir) {
    dout(10) << "  srcdir " << *srcdir << dendl;
    srcdn = srcdir->lookup(rollback.orig_src.dname);
    if (srcdn) {
      dout(10) << "   srcdn " << *srcdn << dendl;
      ceph_assert(srcdn->get_linkage()->is_null());
    } else
      dout(10) << "   srcdn not found" << dendl;
  } else
    dout(10) << "  srcdir not found" << dendl;

  // same for the destination dentry
  CDentry *destdn = NULL;
  CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
  if (!destdir)
    destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
  if (destdir) {
    dout(10) << " destdir " << *destdir << dendl;
    destdn = destdir->lookup(rollback.orig_dest.dname);
    if (destdn)
      dout(10) << "  destdn " << *destdn << dendl;
    else
      dout(10) << "  destdn not found" << dendl;
  } else
    dout(10) << " destdir not found" << dendl;

  // the renamed inode: primary (orig_src.ino) or remote (orig_src.remote_ino)
  CInode *in = NULL;
  if (rollback.orig_src.ino) {
    in = mdcache->get_inode(rollback.orig_src.ino);
    if (in && in->is_dir())
      ceph_assert(srcdn && destdn);
  } else
    in = mdcache->get_inode(rollback.orig_src.remote_ino);

  // stray dentry holding the inode the rename displaced, if any
  CDir *straydir = NULL;
  CDentry *straydn = NULL;
  if (rollback.stray.dirfrag.ino) {
    straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
    if (straydir) {
      dout(10) << "straydir " << *straydir << dendl;
      straydn = straydir->lookup(rollback.stray.dname);
      if (straydn) {
	dout(10) << " straydn " << *straydn << dendl;
	ceph_assert(straydn->get_linkage()->is_primary());
      } else
	dout(10) << " straydn not found" << dendl;
    } else
      dout(10) << "straydir not found" << dendl;
  }

  // the inode the rename replaced at the destination, if any
  CInode *target = NULL;
  if (rollback.orig_dest.ino) {
    target = mdcache->get_inode(rollback.orig_dest.ino);
    if (target)
      ceph_assert(destdn && straydn);
  } else if (rollback.orig_dest.remote_ino)
    target = mdcache->get_inode(rollback.orig_dest.remote_ino);

  // can't use is_auth() in the resolve stage
  mds_rank_t whoami = mds->get_nodeid();
  // slave
  ceph_assert(!destdn || destdn->authority().first != whoami);
  ceph_assert(!straydn || straydn->authority().first != whoami);

  // a renamed directory may force us to journal even when not dentry auth
  bool force_journal_src = false;
  bool force_journal_dest = false;
  if (in && in->is_dir() && srcdn->authority().first != whoami)
    force_journal_src = _need_force_journal(in, false);
  if (in && target && target->is_dir())
    force_journal_dest = _need_force_journal(in, true);

  version_t srcdnpv = 0;
  // repair src
  if (srcdn) {
    if (srcdn->authority().first == whoami)
      srcdnpv = srcdn->pre_dirty();
    if (rollback.orig_src.ino) {
      ceph_assert(in);
      srcdn->push_projected_linkage(in);
    } else
      srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
				    rollback.orig_src.remote_d_type);
  }

  map<client_t,ref_t<MClientSnap>> splits[2];

  // revert the renamed inode's ctime and snaprealm state
  CInode::mempool_inode *pip = nullptr;
  if (in) {
    bool projected;
    if (in->get_projected_parent_dn()->authority().first == whoami) {
      auto &pi = in->project_inode();
      pip = &pi.inode;
      mut->add_projected_inode(in);
      pip->version = in->pre_dirty();
      projected = true;
    } else {
      pip = in->get_projected_inode();
      projected = false;
    }
    if (pip->ctime == rollback.ctime)
      pip->ctime = rollback.orig_src.old_ctime;

    if (rollback.srci_snapbl.length() && in->snaprealm) {
      bool hadrealm;
      auto p = rollback.srci_snapbl.cbegin();
      decode(hadrealm, p);
      if (hadrealm) {
	// the inode had its own realm before the rename: restore the old srnode
	if (projected && !mds->is_resolve()) {
	  sr_t *new_srnode = new sr_t();
	  decode(*new_srnode, p);
	  in->project_snaprealm(new_srnode);
	} else
	  decode(in->snaprealm->srnode, p);
      } else {
	// realm was created by the rename: merge it back into the parent realm
	SnapRealm *realm;
	if (rollback.orig_src.ino) {
	  ceph_assert(srcdir);
	  realm = srcdir->get_inode()->find_snaprealm();
	} else {
	  realm = in->snaprealm->parent;
	}
	if (!mds->is_resolve())
	  mdcache->prepare_realm_merge(in->snaprealm, realm, splits[0]);
	if (projected)
	  in->project_snaprealm(NULL);
	else
	  in->snaprealm->merge_to(realm);
      }
    }
  }

  // put the link back in the source directory's accounting
  if (srcdn && srcdn->authority().first == whoami) {
    nest_info_t blah;
    _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
			 in ? in->is_dir() : false, 1, pip ? pip->accounted_rstat : blah);
  }

  // repair dest
  if (destdn) {
    if (rollback.orig_dest.ino && target) {
      destdn->push_projected_linkage(target);
    } else if (rollback.orig_dest.remote_ino) {
      destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
				     rollback.orig_dest.remote_d_type);
    } else {
      // the dentry will be trimmed soon, it's ok to have wrong linkage
      if (rollback.orig_dest.ino)
	ceph_assert(mds->is_resolve());
      destdn->push_projected_linkage();
    }
  }

  if (straydn)
    straydn->push_projected_linkage();

  // revert the replaced target inode's ctime/nlink and snaprealm state
  if (target) {
    bool projected;
    CInode::mempool_inode *ti = nullptr;
    if (target->get_projected_parent_dn()->authority().first == whoami) {
      auto &pi = target->project_inode();
      ti = &pi.inode;
      mut->add_projected_inode(target);
      ti->version = target->pre_dirty();
      projected = true;
    } else {
      ti = target->get_projected_inode();
      projected = false;
    }
    if (ti->ctime == rollback.ctime)
      ti->ctime = rollback.orig_dest.old_ctime;
    if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
      if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
	ceph_assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
      else
	ceph_assert(rollback.orig_dest.remote_ino &&
		    rollback.orig_dest.remote_ino == rollback.orig_src.ino);
    } else
      ti->nlink++;  // the rename had decremented the replaced inode's nlink

    if (rollback.desti_snapbl.length() && target->snaprealm) {
      bool hadrealm;
      auto p = rollback.desti_snapbl.cbegin();
      decode(hadrealm, p);
      if (hadrealm) {
	if (projected && !mds->is_resolve()) {
	  sr_t *new_srnode = new sr_t();
	  decode(*new_srnode, p);
	  target->project_snaprealm(new_srnode);
	} else
	  decode(target->snaprealm->srnode, p);
      } else {
	SnapRealm *realm;
	if (rollback.orig_dest.ino) {
	  ceph_assert(destdir);
	  realm = destdir->get_inode()->find_snaprealm();
	} else {
	  realm = target->snaprealm->parent;
	}
	if (!mds->is_resolve())
	  mdcache->prepare_realm_merge(target->snaprealm, realm, splits[1]);
	if (projected)
	  target->project_snaprealm(NULL);
	else
	  target->snaprealm->merge_to(realm);
      }
    }

    // take the link back out of the destination directory's accounting
    // NOTE(review): this repair call was restored from a dropped span — confirm
    if (target->authority().first == whoami)
      _rollback_repair_dir(mut, destdir, rollback.orig_dest, rollback.ctime,
			   target->is_dir(), -1, ti->accounted_rstat);
  }

  if (srcdn)
    dout(0) << " srcdn back to " << *srcdn << dendl;
  if (in)
    dout(0) << "  srci back to " << *in << dendl;
  if (destdn)
    dout(0) << " destdn back to " << *destdn << dendl;
  if (target)
    dout(0) << "  desti back to " << *target << dendl;

  // journal it
  ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_rollback", rollback.reqid, master,
				      ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RENAME);
  mdlog->start_entry(le);

  if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
    le->commit.add_dir_context(srcdir);
    if (rollback.orig_src.ino)
      le->commit.add_primary_dentry(srcdn, 0, true);
    else
      le->commit.add_remote_dentry(srcdn, true);
  }

  if (!rollback.orig_src.ino && // remote linkage
      in && in->authority().first == whoami) {
    le->commit.add_dir_context(in->get_projected_parent_dir());
    le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
  }

  if (force_journal_dest) {
    ceph_assert(rollback.orig_dest.ino);
    le->commit.add_dir_context(destdir);
    le->commit.add_primary_dentry(destdn, 0, true);
  }

  // slave: no need to journal straydn

  if (target && target != in && target->authority().first == whoami) {
    ceph_assert(rollback.orig_dest.remote_ino);
    le->commit.add_dir_context(target->get_projected_parent_dir());
    le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
  }

  if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
    dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
    le->commit.renamed_dirino = in->ino();
    if (srcdn->authority().first == whoami) {
      auto&& ls = in->get_dirfrags();
      for (const auto& dir : ls) {
	if (!dir->is_auth())
	  le->commit.renamed_dir_frags.push_back(dir->get_frag());
      }
      dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
    }
  } else if (force_journal_dest) {
    dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
    le->commit.renamed_dirino = target->ino();
  }

  // move subtree bounds back where they belong before the journal applies
  if (target && target->is_dir()) {
    ceph_assert(destdn);
    mdcache->project_subtree_rename(target, straydir, destdir);
  }

  if (in && in->is_dir()) {
    ceph_assert(srcdn);
    mdcache->project_subtree_rename(in, destdir, srcdir);
  }

  if (mdr && !mdr->more()->slave_update_journaled) {
    // prepare never journaled, so there is nothing to roll back in the journal
    ceph_assert(le->commit.empty());
    mdlog->cancel_entry(le);
    mut->ls = NULL;
    _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, splits, finish_mdr);
  } else {
    ceph_assert(!le->commit.empty());
    if (mdr)
      mdr->more()->slave_update_journaled = false;
    MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr,
							    srcdn, srcdnpv, destdn, straydn,
							    splits, finish_mdr);
    submit_mdlog_entry(le, fin, mdr, __func__);
    mdlog->flush();
  }
}
/*
 * Apply a prepared rename rollback: pop the projected dentry linkages,
 * apply the mutation, fix up the subtree map, deliver deferred snap splits,
 * and finish (or release) the slave request.
 *
 * @param mut        mutation carrying the projected rollback state
 * @param mdr        slave request (may be null when called during resolve)
 * @param srcdn/destdn/straydn  dentries touched by the rename
 * @param srcdnpv    pre-dirtied version for srcdn (0 if we are not its auth)
 * @param splits     deferred MClientSnap notifications (src=0, dest=1)
 * @param finish_mdr whether to finish mdr here
 *
 * NOTE(review): braces/if scaffolding and the trailing mut->cleanup() were
 * restored from a mangled extraction — verify against repository history.
 */
void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
				     version_t srcdnpv, CDentry *destdn, CDentry *straydn,
				     map<client_t,ref_t<MClientSnap>> splits[2], bool finish_mdr)
{
  dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;

  if (straydn) {
    straydn->get_dir()->unlink_inode(straydn);
    straydn->pop_projected_linkage();
  }
  if (destdn) {
    destdn->get_dir()->unlink_inode(destdn);
    destdn->pop_projected_linkage();
  }
  if (srcdn) {
    srcdn->pop_projected_linkage();
    if (srcdn->authority().first == mds->get_nodeid()) {
      srcdn->mark_dirty(srcdnpv, mut->ls);
      // the inode came back under our authority with the rollback
      if (srcdn->get_linkage()->is_primary())
	srcdn->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH);
    }
  }

  mut->apply();

  // a rolled-back directory moves its subtree back under the dest dir's root
  if (srcdn && srcdn->get_linkage()->is_primary()) {
    CInode *in = srcdn->get_linkage()->get_inode();
    if (in && in->is_dir()) {
      ceph_assert(destdn);
      mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
    }
  }

  if (destdn) {
    CInode *oldin = destdn->get_linkage()->get_inode();
    // update subtree map?
    if (oldin && oldin->is_dir()) {
      ceph_assert(straydn);
      mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
    }
  }

  if (mds->is_resolve()) {
    // during resolve, trim whatever non-auth subtree the rollback exposed
    CDir *root = NULL;
    if (straydn)
      root = mdcache->get_subtree_root(straydn->get_dir());
    else if (destdn)
      root = mdcache->get_subtree_root(destdn->get_dir());
    if (root)
      mdcache->try_trim_non_auth_subtree(root);
  } else {
    mdcache->send_snaps(splits[1]);
    mdcache->send_snaps(splits[0]);
  }

  if (mdr) {
    MDSContext::vec finished;
    if (mdr->more()->is_ambiguous_auth) {
      if (srcdn->is_auth())
	mdr->more()->rename_inode->unfreeze_inode(finished);

      mdr->more()->rename_inode->clear_ambiguous_auth(finished);
      mdr->more()->is_ambiguous_auth = false;
    }
    mds->queue_waiters(finished);
    if (finish_mdr || mdr->aborted)
      mdcache->request_finish(mdr);
    else
      mdr->more()->slave_rolling_back = false;
  }

  mdcache->finish_rollback(mut->reqid, mdr);

  mut->cleanup();
}
/*
 * Master-side handler for a slave's RENAMEPREPACK. Records the slave as a
 * witness (or collects the extra witnesses it demands), absorbs any exported
 * srci cap state, and re-dispatches the client request once every slave we
 * were waiting on has answered.
 *
 * NOTE(review): braces/else scaffolding restored from a mangled extraction.
 */
void Server::handle_slave_rename_prep_ack(MDRequestRef& mdr, const cref_t<MMDSSlaveRequest> &ack)
{
  dout(10) << "handle_slave_rename_prep_ack " << *mdr
	   << " witnessed by " << ack->get_source()
	   << " " << *ack << dendl;
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  // note slave
  mdr->more()->slaves.insert(from);
  // srcdn auth froze/authpinned the inode remotely; its auth is now ambiguous
  if (mdr->more()->srcdn_auth_mds == from &&
      mdr->more()->is_remote_frozen_authpin &&
      !mdr->more()->is_ambiguous_auth) {
    mdr->set_ambiguous_auth(mdr->more()->rename_inode);
  }

  // witnessed? or add extra witnesses?
  ceph_assert(mdr->more()->witnessed.count(from) == 0);
  if (ack->is_interrupted()) {
    dout(10) << " slave request interrupted, noop" << dendl;
  } else if (ack->witnesses.empty()) {
    mdr->more()->witnessed.insert(from);
    if (!ack->is_not_journaled())
      mdr->more()->has_journaled_slaves = true;
  } else {
    // slave refused until srcdn replicas are also witnesses; retry with them
    dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
    mdr->more()->extra_witnesses = ack->witnesses;
    mdr->more()->extra_witnesses.erase(mds->get_nodeid());  // not me!
  }

  // srci import?
  if (ack->inode_export.length()) {
    dout(10) << " got srci import" << dendl;
    mdr->more()->inode_import.share(ack->inode_export);
    mdr->more()->inode_import_v = ack->inode_export_v;
  }

  // remove from waiting list
  ceph_assert(mdr->more()->waiting_on_slave.count(from));
  mdr->more()->waiting_on_slave.erase(from);

  if (mdr->more()->waiting_on_slave.empty())
    dispatch_client_request(mdr);  // go again!
  else
    dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
}
9642 void Server::handle_slave_rename_notify_ack(MDRequestRef
& mdr
, const cref_t
<MMDSSlaveRequest
> &ack
)
9644 dout(10) << "handle_slave_rename_notify_ack " << *mdr
<< " from mds."
9645 << ack
->get_source() << dendl
;
9646 ceph_assert(mdr
->is_slave());
9647 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
9649 if (mdr
->more()->waiting_on_slave
.count(from
)) {
9650 mdr
->more()->waiting_on_slave
.erase(from
);
9652 if (mdr
->more()->waiting_on_slave
.empty()) {
9653 if (mdr
->slave_request
)
9654 dispatch_slave_request(mdr
);
9656 dout(10) << " still waiting for rename notify acks from "
9657 << mdr
->more()->waiting_on_slave
<< dendl
;
9661 void Server::_slave_rename_sessions_flushed(MDRequestRef
& mdr
)
9663 dout(10) << "_slave_rename_sessions_flushed " << *mdr
<< dendl
;
9665 if (mdr
->more()->waiting_on_slave
.count(MDS_RANK_NONE
)) {
9666 mdr
->more()->waiting_on_slave
.erase(MDS_RANK_NONE
);
9668 if (mdr
->more()->waiting_on_slave
.empty()) {
9669 if (mdr
->slave_request
)
9670 dispatch_slave_request(mdr
);
9672 dout(10) << " still waiting for rename notify acks from "
9673 << mdr
->more()->waiting_on_slave
<< dendl
;
/* This function takes responsibility for the passed mdr
 *
 * List the snapshots of a directory (CEPH_MDS_OP_LSSNAP), paginated like
 * readdir: path2 carries the snap name to resume after, and the reply is a
 * readdir-style buffer (DirStat, count, per-entry name+lease+inodestat,
 * flags). Respects the client's max_entries/max_bytes limits.
 *
 * NOTE(review): early-return/brace scaffolding and local declarations
 * (dirbl/dnbl/num/flags, the r<0 truncation branch) were restored from a
 * mangled extraction — verify against repository history.
 */
void Server::handle_client_lssnap(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;

  // traverse to path
  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!diri)
    return;

  if (!diri->is_dir()) {
    respond_to_request(mdr, -ENOTDIR);
    return;
  }
  dout(10) << "lssnap on " << *diri << dendl;

  // lock snap
  if (!mds->locker->try_rdlock_snap_layout(diri, mdr))
    return;

  if (!check_access(mdr, diri, MAY_READ))
    return;

  SnapRealm *realm = diri->find_snaprealm();
  map<snapid_t,const SnapInfo*> infomap;
  realm->get_snap_info(infomap, diri->get_oldest_snap());

  unsigned max_entries = req->head.args.readdir.max_entries;
  if (!max_entries)
    max_entries = infomap.size();
  int max_bytes = req->head.args.readdir.max_bytes;
  if (!max_bytes)
    // make sure at least one item can be encoded
    max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;

  // resume point: path2 holds the last snap name the client already has
  __u64 last_snapid = 0;
  string offset_str = req->get_path2();
  if (!offset_str.empty())
    last_snapid = realm->resolve_snapname(offset_str, diri->ino());

  // empty DirStat header, matching the readdir reply layout
  bufferlist dirbl;
  static DirStat empty;
  CDir::encode_dirstat(dirbl, mdr->session->info, empty);

  max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;

  __u32 num = 0;
  bufferlist dnbl;
  auto p = infomap.upper_bound(last_snapid);
  for (; p != infomap.end() && num < max_entries; ++p) {
    dout(10) << p->first << " -> " << *p->second << dendl;

    // this dir's own snaps use the short name; inherited snaps the long one
    string snap_name;
    if (p->second->ino == diri->ino())
      snap_name = p->second->name;
    else
      snap_name = p->second->get_long_name();

    unsigned start_len = dnbl.length();
    if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
      break;

    encode(snap_name, dnbl);
    // infinite lease
    LeaseStat e(CEPH_LEASE_VALID, -1, 0);
    mds->locker->encode_lease(dnbl, mdr->session->info, e);
    dout(20) << "encode_infinite_lease" << dendl;

    int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
    if (r < 0) {
      // ran out of space: drop the partially-encoded entry and stop
      bufferlist keep;
      keep.substr_of(dnbl, 0, start_len);
      dnbl.swap(keep);
      break;
    }
    ++num;
  }

  encode(num, dirbl);
  __u16 flags = 0;
  if (p == infomap.end()) {
    flags = CEPH_READDIR_FRAG_END;
    if (last_snapid == 0)
      flags |= CEPH_READDIR_FRAG_COMPLETE;
  }
  encode(flags, dirbl);
  dirbl.claim_append(dnbl);

  mdr->reply_extra_bl = dirbl;
  mdr->tracei = diri;
  respond_to_request(mdr, 0);
}
/*
 * Log-completion context for mksnap: once the EUpdate is journaled, apply
 * the projected inode/snaprealm changes and reply to the client.
 *
 * NOTE(review): member declarations restored from a mangled extraction;
 * names follow the ctor initializer list.
 */
struct C_MDS_mksnap_finish : public ServerLogContext {
  CInode *diri;   // directory being snapshotted
  SnapInfo info;  // the new snapshot's metadata (copied; outlives the request)
  C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
    ServerLogContext(s, r), diri(di), info(i) {}
  void finish(int r) override {
    server->_mksnap_finish(mdr, diri, info);
  }
};
/* This function takes responsibility for the passed mdr
 *
 * Create a snapshot on a directory (CEPH_MDS_OP_MKSNAP). Two-phase: first
 * reserve a snapid via the snap table client (the request is re-dispatched
 * once the stid arrives), then project the inode + snaprealm changes and
 * journal them; C_MDS_mksnap_finish applies and replies.
 *
 * NOTE(review): early-return/brace scaffolding and a few locals (snapid,
 * SnapInfo decl, the em.second check) were restored from a mangled
 * extraction — verify against repository history.
 */
void Server::handle_client_mksnap(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;
  // make sure we have as new a map as the client
  if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
    mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }
  if (!mds->mdsmap->allows_snaps()) {
    // you can't make snapshots until you set an option right now
    respond_to_request(mdr, -EPERM);
    return;
  }

  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!diri)
    return;

  // dir only
  if (!diri->is_dir()) {
    respond_to_request(mdr, -ENOTDIR);
    return;
  }
  if (diri->is_system() && !diri->is_root()) {
    // no snaps in system dirs (root is ok)
    respond_to_request(mdr, -EPERM);
    return;
  }

  std::string_view snapname = req->get_filepath().last_dentry();

  if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
    dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
    respond_to_request(mdr, -EPERM);
    return;
  }

  dout(10) << "mksnap " << snapname << " on " << *diri << dendl;

  // lock snap
  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;
    lov.add_xlock(&diri->snaplock);
    if (!mds->locker->acquire_locks(mdr, lov))
      return;

    if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
      if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
	return;
    }
    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  }

  if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
    return;

  // check if we can create any more snapshots
  // we don't allow any more if we are already at or beyond the limit
  if (diri->snaprealm &&
      diri->snaprealm->get_snaps().size() >= max_snaps_per_dir) {
    respond_to_request(mdr, -EMLINK);
    return;
  }

  // make sure name is unique
  if (diri->snaprealm &&
      diri->snaprealm->exists(snapname)) {
    respond_to_request(mdr, -EEXIST);
    return;
  }
  if (snapname.length() == 0 ||
      snapname[0] == '_') {
    respond_to_request(mdr, -EINVAL);
    return;
  }

  // allocate a snapid
  if (!mdr->more()->stid) {
    // prepare an stid; request is retried once the snap table answers
    mds->snapclient->prepare_create(diri->ino(), snapname,
				    mdr->get_mds_stamp(),
				    &mdr->more()->stid, &mdr->more()->snapidbl,
				    new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }

  version_t stid = mdr->more()->stid;
  snapid_t snapid;
  auto p = mdr->more()->snapidbl.cbegin();
  decode(snapid, p);
  dout(10) << " stid " << stid << " snapid " << snapid << dendl;

  ceph_assert(mds->snapclient->get_cached_version() >= stid);

  // journal
  SnapInfo info;
  info.ino = diri->ino();
  info.snapid = snapid;
  info.name = snapname;
  info.stamp = mdr->get_op_stamp();

  auto &pi = diri->project_inode(false, true);
  pi.inode.ctime = info.stamp;
  if (info.stamp > pi.inode.rstat.rctime)
    pi.inode.rstat.rctime = info.stamp;
  pi.inode.rstat.rsnaps++;
  pi.inode.version = diri->pre_dirty();

  // project the snaprealm
  auto &newsnap = *pi.snapnode;
  newsnap.created = snapid;
  auto em = newsnap.snaps.emplace(std::piecewise_construct, std::forward_as_tuple(snapid), std::forward_as_tuple(info));
  if (!em.second)
    em.first->second = info;
  newsnap.seq = snapid;
  newsnap.last_created = snapid;

  // journal the inode changes
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "mksnap");
  mdlog->start_entry(le);

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  // journal the snaprealm changes
  submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
		     mdr, __func__);
  mdlog->flush();
}
/*
 * Journaled-mksnap completion: apply the projected inode/snaprealm, commit
 * the snap table transaction, notify peer MDSs and clients, and reply.
 * OP_SPLIT is sent when the directory gained its first own snaprealm,
 * OP_CREATE when it already had one.
 *
 * NOTE(review): mdr->apply() and the mdr->in[0]/tracei lines were restored
 * from a mangled extraction — verify against repository history.
 */
void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
{
  dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;

  int op = (diri->snaprealm? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);

  diri->pop_and_dirty_projected_inode(mdr->ls);
  mdr->apply();

  mds->snapclient->commit(mdr->more()->stid, mdr->ls);

  // create snap
  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // notify other mds
  mdcache->send_snap_update(diri, mdr->more()->stid, op);

  mdcache->do_realm_invalidate_and_update_notify(diri, op);

  // yay
  mdr->in[0] = diri;
  mdr->snapid = info.snapid;
  mdr->tracei = diri;
  respond_to_request(mdr, 0);
}
/*
 * Log-completion context for rmsnap: once the EUpdate is journaled, apply
 * the projected changes and reply to the client.
 *
 * NOTE(review): member declarations restored from a mangled extraction.
 */
struct C_MDS_rmsnap_finish : public ServerLogContext {
  CInode *diri;     // directory whose snapshot is being removed
  snapid_t snapid;  // id of the removed snapshot
  C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
    ServerLogContext(s, r), diri(di), snapid(sn) {}
  void finish(int r) override {
    server->_rmsnap_finish(mdr, diri, snapid);
  }
};
/* This function takes responsibility for the passed mdr
 *
 * Remove a directory snapshot (CEPH_MDS_OP_RMSNAP). Two-phase like mksnap:
 * first prepare the destroy in the snap table (request is re-dispatched once
 * the stid/seq arrive), then project the inode + snaprealm changes and
 * journal them; C_MDS_rmsnap_finish applies and replies.
 *
 * NOTE(review): early-return/brace scaffolding and a few locals (seq decl,
 * newnode.seq assignment) were restored from a mangled extraction — verify
 * against repository history.
 */
void Server::handle_client_rmsnap(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;

  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!diri)
    return;

  if (!diri->is_dir()) {
    respond_to_request(mdr, -ENOTDIR);
    return;
  }

  std::string_view snapname = req->get_filepath().last_dentry();

  if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
    dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
    respond_to_request(mdr, -EPERM);
    return;
  }

  dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;

  // does snap exist?
  if (snapname.length() == 0 || snapname[0] == '_') {
    respond_to_request(mdr, -EINVAL);   // can't prune a parent snap, currently.
    return;
  }
  if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
    respond_to_request(mdr, -ENOENT);
    return;
  }
  snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
  dout(10) << " snapname " << snapname << " is " << snapid << dendl;

  // lock snap
  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;
    lov.add_xlock(&diri->snaplock);
    if (!mds->locker->acquire_locks(mdr, lov))
      return;
    if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
      if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
	return;
    }
    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  }

  if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
    return;

  // prepare
  if (!mdr->more()->stid) {
    mds->snapclient->prepare_destroy(diri->ino(), snapid,
				     &mdr->more()->stid, &mdr->more()->snapidbl,
				     new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }
  version_t stid = mdr->more()->stid;
  auto p = mdr->more()->snapidbl.cbegin();
  snapid_t seq;
  decode(seq, p);
  dout(10) << " stid is " << stid << ", seq is " << seq << dendl;

  ceph_assert(mds->snapclient->get_cached_version() >= stid);

  // journal
  auto &pi = diri->project_inode(false, true);
  pi.inode.version = diri->pre_dirty();
  pi.inode.ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
    pi.inode.rstat.rctime = mdr->get_op_stamp();
  pi.inode.rstat.rsnaps--;

  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "rmsnap");
  mdlog->start_entry(le);

  // project the snaprealm
  auto &newnode = *pi.snapnode;
  newnode.snaps.erase(snapid);
  newnode.seq = seq;
  newnode.last_destroyed = seq;

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
		     mdr, __func__);
  mdlog->flush();
}
/*
 * Journaled-rmsnap completion: apply the projected changes, commit the snap
 * table transaction, broadcast CEPH_SNAP_OP_DESTROY to peers/clients, reply,
 * then drop data for snaps that no longer exist.
 *
 * NOTE(review): seq decl/decode, mdr->apply() and mdr->tracei were restored
 * from a mangled extraction — verify against repository history.
 */
void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
{
  dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
  snapid_t stid = mdr->more()->stid;
  auto p = mdr->more()->snapidbl.cbegin();
  snapid_t seq;
  decode(seq, p);

  diri->pop_and_dirty_projected_inode(mdr->ls);
  mdr->apply();

  mds->snapclient->commit(stid, mdr->ls);

  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // notify other mds
  mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_DESTROY);

  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);

  // yay
  mdr->tracei = diri;
  respond_to_request(mdr, 0);

  // purge snapshot data
  if (diri->snaprealm->have_past_parents_open())
    diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
}
/*
 * Log-completion context for renamesnap: once the EUpdate is journaled,
 * apply the projected changes and reply to the client.
 *
 * NOTE(review): member declarations restored from a mangled extraction.
 */
struct C_MDS_renamesnap_finish : public ServerLogContext {
  CInode *diri;     // directory whose snapshot is being renamed
  snapid_t snapid;  // id of the renamed snapshot
  C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
    ServerLogContext(s, r), diri(di), snapid(sn) {}
  void finish(int r) override {
    server->_renamesnap_finish(mdr, diri, snapid);
  }
};
/* This function takes responsibility for the passed mdr
 *
 * Rename a directory snapshot (CEPH_MDS_OP_RENAMESNAP): path carries the new
 * name, path2 the old name; both must refer to the same inode. Two-phase
 * like mksnap/rmsnap: prepare the snap table update first, then project and
 * journal the inode/snaprealm changes; C_MDS_renamesnap_finish replies.
 *
 * NOTE(review): early-return/brace scaffolding restored from a mangled
 * extraction — verify against repository history.
 */
void Server::handle_client_renamesnap(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;
  if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
    respond_to_request(mdr, -EINVAL);
    return;
  }

  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!diri)
    return;

  if (!diri->is_dir()) { // dir only
    respond_to_request(mdr, -ENOTDIR);
    return;
  }

  if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
      mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
    respond_to_request(mdr, -EPERM);
    return;
  }

  std::string_view dstname = req->get_filepath().last_dentry();
  std::string_view srcname = req->get_filepath2().last_dentry();
  dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;

  if (srcname.length() == 0 || srcname[0] == '_') {
    respond_to_request(mdr, -EINVAL);   // can't rename a parent snap.
    return;
  }
  if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
    respond_to_request(mdr, -ENOENT);
    return;
  }
  if (dstname.length() == 0 || dstname[0] == '_') {
    respond_to_request(mdr, -EINVAL);
    return;
  }
  if (diri->snaprealm->exists(dstname)) {
    respond_to_request(mdr, -EEXIST);
    return;
  }

  snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
  dout(10) << " snapname " << srcname << " is " << snapid << dendl;

  // lock snap
  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;
    lov.add_xlock(&diri->snaplock);
    if (!mds->locker->acquire_locks(mdr, lov))
      return;
    if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
      if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
	return;
    }
    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  }

  if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
    return;

  // prepare
  if (!mdr->more()->stid) {
    mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
				    &mdr->more()->stid,
				    new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }

  version_t stid = mdr->more()->stid;
  dout(10) << " stid is " << stid << dendl;

  ceph_assert(mds->snapclient->get_cached_version() >= stid);

  // journal
  auto &pi = diri->project_inode(false, true);
  pi.inode.ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
    pi.inode.rstat.rctime = mdr->get_op_stamp();
  pi.inode.version = diri->pre_dirty();

  // project the snaprealm
  auto &newsnap = *pi.snapnode;
  auto it = newsnap.snaps.find(snapid);
  ceph_assert(it != newsnap.snaps.end());
  it->second.name = dstname;

  // journal the inode changes
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "renamesnap");
  mdlog->start_entry(le);

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  // journal the snaprealm changes
  submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
		     mdr, __func__);
  mdlog->flush();
}
/*
 * Journaled-renamesnap completion: apply the projected changes, commit the
 * snap table transaction, broadcast CEPH_SNAP_OP_UPDATE to peers/clients,
 * and reply with the directory inode and snapid in the trace.
 *
 * NOTE(review): mdr->apply() was restored from a mangled extraction —
 * verify against repository history.
 */
void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
{
  dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;

  diri->pop_and_dirty_projected_inode(mdr->ls);
  mdr->apply();

  mds->snapclient->commit(mdr->more()->stid, mdr->ls);

  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // notify other mds
  mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_UPDATE);

  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE);

  // yay
  mdr->tracei = diri;
  mdr->snapid = snapid;
  respond_to_request(mdr, 0);
}
10222 * Return true if server is in state RECONNECT and this
10223 * client has not yet reconnected.
10225 bool Server::waiting_for_reconnect(client_t c
) const
10227 return client_reconnect_gather
.count(c
) > 0;
/*
 * Emit the set of clients still expected to reconnect as a
 * "reconnect_status" object on the given Formatter (admin-socket output).
 */
void Server::dump_reconnect_status(Formatter *f) const
{
  f->open_object_section("reconnect_status");
  f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
  f->close_section();
}