1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include <boost/lexical_cast.hpp>
16 #include "include/ceph_assert.h" // lexical_cast includes system assert.h
18 #include <boost/config/warning_disable.hpp>
19 #include <boost/fusion/include/std_pair.hpp>
20 #include <boost/range/adaptor/reversed.hpp>
28 #include "MDBalancer.h"
30 #include "SnapClient.h"
32 #include "cephfs_features.h"
34 #include "msg/Messenger.h"
36 #include "osdc/Objecter.h"
38 #include "events/EUpdate.h"
39 #include "events/ESlaveUpdate.h"
40 #include "events/ESession.h"
41 #include "events/EOpen.h"
42 #include "events/ECommitted.h"
43 #include "events/EPurged.h"
45 #include "include/stringify.h"
46 #include "include/filepath.h"
47 #include "common/errno.h"
48 #include "common/Timer.h"
49 #include "common/perf_counters.h"
50 #include "include/compat.h"
51 #include "osd/OSDMap.h"
58 #include <string_view>
60 #include "common/config.h"
62 #define dout_context g_ceph_context
63 #define dout_subsys ceph_subsys_mds
65 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
// Base MDSContext bound to a Server instance; get_mds() lets the generic
// context machinery reach the owning MDSRank through the server.
// NOTE(review): this chunk is an elided extraction — member declarations,
// the get_mds() body and closing braces are not visible here.
67 class ServerContext
: public MDSContext
{
70 MDSRank
*get_mds() override
// Construction requires a valid Server; a null server would make every
// later callback dereference garbage, hence the hard assert.
76 explicit ServerContext(Server
*s
) : server(s
) {
77 ceph_assert(server
!= NULL
);
// Batches multiple client getattr/lookup requests on the same target so a
// single "front" MDRequest (mdr) drives the work and the queued duplicates
// (mdr->batch_reqs) are answered or forwarded together.
// NOTE(review): elided extraction — some member declarations and closing
// braces are not visible in this chunk.
81 class Batch_Getattr_Lookup
: public BatchOp
{
84 ceph::ref_t
<MDRequestImpl
> mdr
;
// Takes ownership of the front request r (moved into mdr).
88 Batch_Getattr_Lookup(Server
* s
, ceph::ref_t
<MDRequestImpl
> r
, MDCache
* mdc
) : server(s
), mdr(std::move(r
)), mdcache(mdc
) {}
// Queue a duplicate request behind the front request.
89 void add_request(const ceph::ref_t
<MDRequestImpl
>& m
) override
{
90 mdr
->batch_reqs
.push_back(m
);
92 void set_request(const ceph::ref_t
<MDRequestImpl
>& m
) override
{
// Forward the whole batch to another rank t: the front request's client
// message is released and re-sent, then each queued duplicate is forwarded
// through the cache, and the batch queue is emptied.
95 void _forward(mds_rank_t t
) override
{
96 mdcache
->mds
->forward_message_mds(mdr
->release_client_request(), t
);
97 mdr
->set_mds_stamp(ceph_clock_now());
98 for (auto& m
: mdr
->batch_reqs
) {
100 mdcache
->request_forward(m
, t
);
102 mdr
->batch_reqs
.clear();
// Reply to the whole batch with result r: duplicates inherit the front
// request's trace targets (tracei/tracedn) so they serialize the same
// reply payload, then the front request itself is answered.
104 void _respond(int r
) override
{
105 mdr
->set_mds_stamp(ceph_clock_now());
106 for (auto& m
: mdr
->batch_reqs
) {
108 m
->tracei
= mdr
->tracei
;
109 m
->tracedn
= mdr
->tracedn
;
110 server
->respond_to_request(m
, r
);
113 mdr
->batch_reqs
.clear();
114 server
->reply_client_request(mdr
, make_message
<MClientReply
>(*mdr
->client_request
, r
));
// Debug representation: shows only the front request of the batch.
116 void print(std::ostream
& o
) {
117 o
<< "[batch front=" << *mdr
<< "]";
// Journal-completion context bound to a Server and (optionally) the
// MDRequest whose update was logged; pre_finish stamps the request's
// event trail when the journal entry commits.
// NOTE(review): elided extraction — member declarations and some bodies
// are not visible in this chunk.
121 class ServerLogContext
: public MDSLogContextBase
{
124 MDSRank
*get_mds() override
// Runs before finish() once the log entry is safe; records the commit
// on the request's event timeline for op tracking.
130 void pre_finish(int r
) override
{
132 mdr
->mark_event("journal_committed: ")
;
// Both constructors insist on a non-null Server (see ServerContext).
135 explicit ServerLogContext(Server
*s
) : server(s
) {
136 ceph_assert(server
!= NULL
);
138 explicit ServerLogContext(Server
*s
, MDRequestRef
& r
) : server(s
), mdr(r
) {
139 ceph_assert(server
!= NULL
);
// Register the "mds_server" perf-counter set with the global CephContext:
// a few high-priority request/session counters, per-request-type average
// latencies, and debug-only dispatch counters. Called once at startup.
143 void Server::create_logger()
145 PerfCountersBuilder
plb(g_ceph_context
, "mds_server", l_mdss_first
, l_mdss_last
);
// Headline counters shown in `ceph daemonperf` style output (PRIO_INTERESTING).
147 plb
.add_u64_counter(l_mdss_handle_client_request
, "handle_client_request",
148 "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING
);
149 plb
.add_u64_counter(l_mdss_handle_slave_request
, "handle_slave_request",
150 "Slave requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING
);
151 plb
.add_u64_counter(l_mdss_handle_client_session
,
152 "handle_client_session", "Client session messages", "hcs",
153 PerfCountersBuilder::PRIO_INTERESTING
);
154 plb
.add_u64_counter(l_mdss_cap_revoke_eviction
, "cap_revoke_eviction",
155 "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING
);
157 // fop latencies are useful
158 plb
.set_prio_default(PerfCountersBuilder::PRIO_USEFUL
);
// One running-average latency per client request type.
159 plb
.add_time_avg(l_mdss_req_lookuphash_latency
, "req_lookuphash_latency",
160 "Request type lookup hash of inode latency");
161 plb
.add_time_avg(l_mdss_req_lookupino_latency
, "req_lookupino_latency",
162 "Request type lookup inode latency");
163 plb
.add_time_avg(l_mdss_req_lookupparent_latency
, "req_lookupparent_latency",
164 "Request type lookup parent latency");
165 plb
.add_time_avg(l_mdss_req_lookupname_latency
, "req_lookupname_latency",
166 "Request type lookup name latency");
167 plb
.add_time_avg(l_mdss_req_lookup_latency
, "req_lookup_latency",
168 "Request type lookup latency");
169 plb
.add_time_avg(l_mdss_req_lookupsnap_latency
, "req_lookupsnap_latency",
170 "Request type lookup snapshot latency");
171 plb
.add_time_avg(l_mdss_req_getattr_latency
, "req_getattr_latency",
172 "Request type get attribute latency");
173 plb
.add_time_avg(l_mdss_req_setattr_latency
, "req_setattr_latency",
174 "Request type set attribute latency");
175 plb
.add_time_avg(l_mdss_req_setlayout_latency
, "req_setlayout_latency",
176 "Request type set file layout latency");
177 plb
.add_time_avg(l_mdss_req_setdirlayout_latency
, "req_setdirlayout_latency",
178 "Request type set directory layout latency");
179 plb
.add_time_avg(l_mdss_req_setxattr_latency
, "req_setxattr_latency",
180 "Request type set extended attribute latency");
181 plb
.add_time_avg(l_mdss_req_rmxattr_latency
, "req_rmxattr_latency",
182 "Request type remove extended attribute latency");
183 plb
.add_time_avg(l_mdss_req_readdir_latency
, "req_readdir_latency",
184 "Request type read directory latency");
185 plb
.add_time_avg(l_mdss_req_setfilelock_latency
, "req_setfilelock_latency",
186 "Request type set file lock latency");
187 plb
.add_time_avg(l_mdss_req_getfilelock_latency
, "req_getfilelock_latency",
188 "Request type get file lock latency");
189 plb
.add_time_avg(l_mdss_req_create_latency
, "req_create_latency",
190 "Request type create latency");
191 plb
.add_time_avg(l_mdss_req_open_latency
, "req_open_latency",
192 "Request type open latency");
193 plb
.add_time_avg(l_mdss_req_mknod_latency
, "req_mknod_latency",
194 "Request type make node latency");
195 plb
.add_time_avg(l_mdss_req_link_latency
, "req_link_latency",
196 "Request type link latency");
197 plb
.add_time_avg(l_mdss_req_unlink_latency
, "req_unlink_latency",
198 "Request type unlink latency");
199 plb
.add_time_avg(l_mdss_req_rmdir_latency
, "req_rmdir_latency",
200 "Request type remove directory latency");
201 plb
.add_time_avg(l_mdss_req_rename_latency
, "req_rename_latency",
202 "Request type rename latency");
203 plb
.add_time_avg(l_mdss_req_mkdir_latency
, "req_mkdir_latency",
204 "Request type make directory latency");
205 plb
.add_time_avg(l_mdss_req_symlink_latency
, "req_symlink_latency",
206 "Request type symbolic link latency");
207 plb
.add_time_avg(l_mdss_req_lssnap_latency
, "req_lssnap_latency",
208 "Request type list snapshot latency");
209 plb
.add_time_avg(l_mdss_req_mksnap_latency
, "req_mksnap_latency",
210 "Request type make snapshot latency");
211 plb
.add_time_avg(l_mdss_req_rmsnap_latency
, "req_rmsnap_latency",
212 "Request type remove snapshot latency");
213 plb
.add_time_avg(l_mdss_req_renamesnap_latency
, "req_renamesnap_latency",
214 "Request type rename snapshot latency");
// Remaining counters are only collected for debugging.
216 plb
.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY
);
217 plb
.add_u64_counter(l_mdss_dispatch_client_request
, "dispatch_client_request",
218 "Client requests dispatched");
219 plb
.add_u64_counter(l_mdss_dispatch_slave_request
, "dispatch_server_request",
220 "Server requests dispatched");
// Materialize the counter set and publish it on the global collection.
222 logger
= plb
.create_perf_counters();
223 g_ceph_context
->get_perfcounters_collection()->add(logger
);
// Server constructor: caches handles to the rank's MDCache/MDLog and pulls
// the tunables this subsystem reads from the config store. The rate/timeout
// values can be refreshed later via config observation (handled elsewhere).
// NOTE(review): elided extraction — the member-init list is partially
// missing here (e.g. the `mds(m)` initializer is not visible).
226 Server::Server(MDSRank
*m
) :
228 mdcache(mds
->mdcache
), mdlog(mds
->mdlog
),
229 recall_throttle(g_conf().get_val
<double>("mds_recall_max_decay_rate"))
// Whether unsafe replayed requests from closed sessions are accepted
// during clientreplay (see the long comment in Server::dispatch).
231 replay_unsafe_with_closed_session
= g_conf().get_val
<bool>("mds_replay_unsafe_with_closed_session");
232 cap_revoke_eviction_timeout
= g_conf().get_val
<double>("mds_cap_revoke_eviction_timeout");
233 max_snaps_per_dir
= g_conf().get_val
<uint64_t>("mds_max_snaps_per_dir");
234 delegate_inos_pct
= g_conf().get_val
<uint64_t>("mds_client_delegate_inos_pct");
// Feature bits this MDS advertises to clients at session open.
235 supported_features
= feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED
);
// Top-level message dispatcher for the Server subsystem. Reconnect messages
// are handled immediately; client requests that arrive before the rank is
// active are either queued for replay, dropped, or deferred until active;
// everything else is routed to the per-type handler in the final switch.
238 void Server::dispatch(const cref_t
<Message
> &m
)
240 switch (m
->get_type()) {
241 case CEPH_MSG_CLIENT_RECONNECT
:
242 handle_client_reconnect(ref_cast
<MClientReconnect
>(m
));
247 *In reconnect phase, client sent unsafe requests to mds before reconnect msg. Setting sessionclosed_isok will handle a scenario like this:
249 1. In reconnect phase, client sent unsafe requests to mds.
250 2. It reached reconnect timeout. All sessions without sending reconnect msg in time, some of which may had sent unsafe requests, are marked as closed.
251 (Another situation is #31668, which will deny all client reconnect msg to speed up reboot).
252 3. So these unsafe requests from sessions without sending reconnect msg in time or being denied could be handled in clientreplay phase.
255 bool sessionclosed_isok
= replay_unsafe_with_closed_session
;
257 // handle_slave_request()/handle_client_session() will wait if necessary
258 if (m
->get_type() == CEPH_MSG_CLIENT_REQUEST
&& !mds
->is_active()) {
259 const auto &req
= ref_cast
<MClientRequest
>(m
);
260 if (mds
->is_reconnect() || mds
->get_want_state() == CEPH_MDS_STATE_RECONNECT
) {
// Requests from closed sessions are dropped unless the config above
// explicitly allows replaying them.
261 Session
*session
= mds
->get_session(req
);
262 if (!session
|| (!session
->is_open() && !sessionclosed_isok
)) {
263 dout(5) << "session is closed, dropping " << req
->get_reqid() << dendl
;
266 bool queue_replay
= false;
267 if (req
->is_replay() || req
->is_async()) {
268 dout(3) << "queuing replayed op" << dendl
;
// Pre-reserve the ino the replayed create used, so replay allocates
// the same inode number (only if not already completed).
271 !session
->have_completed_request(req
->get_reqid().tid
, nullptr)) {
272 mdcache
->add_replay_ino_alloc(inodeno_t(req
->head
.ino
));
274 } else if (req
->get_retry_attempt()) {
275 // process completed request in clientreplay stage. The completed request
276 // might have created a new file/directory. This guarantees MDS sends a reply
277 // to client before other request modifies the new file/directory.
278 if (session
->have_completed_request(req
->get_reqid().tid
, NULL
)) {
279 dout(3) << "queuing completed op" << dendl
;
282 // this request was created before the cap reconnect message, drop any embedded
284 req
->releases
.clear();
287 req
->mark_queued_for_replay();
288 mds
->enqueue_replay(new C_MDS_RetryMessage(mds
, m
));
// Decide whether to stall the request until the rank becomes active.
// Stopping ranks and replay-queued requests in clientreplay proceed now.
293 bool wait_for_active
= true;
294 if (mds
->is_stopping()) {
295 wait_for_active
= false;
296 } else if (mds
->is_clientreplay()) {
297 if (req
->is_queued_for_replay()) {
298 wait_for_active
= false;
301 if (wait_for_active
) {
302 dout(3) << "not active yet, waiting" << dendl
;
303 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
// Normal routing by message type.
308 switch (m
->get_type()) {
309 case CEPH_MSG_CLIENT_SESSION
:
310 handle_client_session(ref_cast
<MClientSession
>(m
));
312 case CEPH_MSG_CLIENT_REQUEST
:
313 handle_client_request(ref_cast
<MClientRequest
>(m
));
315 case CEPH_MSG_CLIENT_RECLAIM
:
316 handle_client_reclaim(ref_cast
<MClientReclaim
>(m
));
318 case MSG_MDS_SLAVE_REQUEST
:
319 handle_slave_request(ref_cast
<MMDSSlaveRequest
>(m
));
// Unknown types indicate a routing bug: log and abort.
322 derr
<< "server unknown message " << m
->get_type() << dendl
;
323 ceph_abort_msg("server unknown message");
329 // ----------------------------------------------------------
330 // SESSION management
// Journal-completion context for a session state change (open/close).
// Carries everything _session_logged() needs once the ESession entry is
// safe: the session, its state seq, the open/close flag, the sessionmap
// projected version (cmapv), preallocated inos to release plus the
// inotable version (inos/inotablev), and optional inos to purge with
// their log segment.
// NOTE(review): elided extraction — some member declarations (session,
// state_seq, open, cmapv, inotablev, fin) and braces are not visible.
332 class C_MDS_session_finish
: public ServerLogContext
{
337 interval_set
<inodeno_t
> inos
;
339 interval_set
<inodeno_t
> purge_inos
;
340 LogSegment
*ls
= nullptr;
// Simple open/close with no ino bookkeeping.
343 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
, Context
*fin_
= NULL
) :
344 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
), inotablev(0), fin(fin_
) { }
// Close that also releases preallocated inos at inotable version iv.
345 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
, interval_set
<inodeno_t
> i
, version_t iv
, Context
*fin_
= NULL
) :
346 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
), inos(std::move(i
)), inotablev(iv
), fin(fin_
) { }
// Close that additionally schedules inode purges on log segment _ls.
347 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
, interval_set
<inodeno_t
> i
, version_t iv
,
348 interval_set
<inodeno_t
> _purge_inos
, LogSegment
*_ls
, Context
*fin_
= NULL
) :
349 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
), inos(std::move(i
)), inotablev(iv
), purge_inos(std::move(_purge_inos
)), ls(_ls
), fin(fin_
){}
// Hand everything to the server once the journal entry is committed.
350 void finish(int r
) override
{
352 server
->_session_logged(session
, state_seq
, open
, cmapv
, inos
, inotablev
, purge_inos
, ls
);
// Linear scan of the session map for a session whose client-supplied
// "uuid" metadata equals `uuid`. Used for session reclaim and to reject
// duplicate uuids at open.
// NOTE(review): elided extraction — the branch that assigns `session` on
// first match and the return are not visible in this chunk.
359 Session
* Server::find_session_by_uuid(std::string_view uuid
)
361 Session
* session
= nullptr;
362 for (auto& it
: mds
->sessionmap
.get_sessions()) {
363 auto& metadata
= it
.second
->info
.client_metadata
;
// Skip sessions without a matching uuid entry.
365 auto p
= metadata
.find("uuid");
366 if (p
== metadata
.end() || p
->second
!= uuid
)
// Multiple sessions may share a uuid only as a reclaim pair; assert the
// two sessions reference each other consistently.
371 } else if (!session
->reclaiming_from
) {
372 assert(it
.second
->reclaiming_from
== session
);
375 assert(session
->reclaiming_from
== it
.second
);
// Handle a client MClientReclaim: validate the request (open session,
// non-empty uuid, supported flags, matching auth identity), link this
// session to the session it reclaims from, and for CEPH_RECLAIM_RESET
// finish the reclaim immediately. Errors are reported back to the client
// via MClientReclaimReply::set_result.
381 void Server::reclaim_session(Session
*session
, const cref_t
<MClientReclaim
> &m
)
383 if (!session
->is_open() && !session
->is_stale()) {
384 dout(10) << "session not open, dropping this req" << dendl
;
388 auto reply
= make_message
<MClientReclaimReply
>(0);
// A uuid is mandatory: it is the only key to find the old session.
389 if (m
->get_uuid().empty()) {
390 dout(10) << __func__
<< " invalid message (no uuid)" << dendl
;
391 reply
->set_result(-EINVAL
);
392 mds
->send_message_client(reply
, session
);
396 unsigned flags
= m
->get_flags();
397 if (flags
!= CEPH_RECLAIM_RESET
) { // currently only support reset
398 dout(10) << __func__
<< " unsupported flags" << dendl
;
399 reply
->set_result(-EOPNOTSUPP
);
400 mds
->send_message_client(reply
, session
);
404 Session
* target
= find_session_by_uuid(m
->get_uuid());
// Refuse cross-identity reclaim: reclaimer must authenticate as the
// same entity as the session being reclaimed.
406 if (session
->info
.auth_name
!= target
->info
.auth_name
) {
407 dout(10) << __func__
<< " session auth_name " << session
->info
.auth_name
408 << " != target auth_name " << target
->info
.auth_name
<< dendl
;
409 reply
->set_result(-EPERM
);
410 mds
->send_message_client(reply
, session
);
// Neither side may already be involved in a reclaim.
413 assert(!target
->reclaiming_from
);
414 assert(!session
->reclaiming_from
);
415 session
->reclaiming_from
= target
;
416 reply
->set_addrs(entity_addrvec_t(target
->info
.inst
.addr
));
419 if (flags
& CEPH_RECLAIM_RESET
) {
420 finish_reclaim_session(session
, reply
);
// Complete a session reclaim: detach the target session and kill/evict it,
// sending the reply only after the old session is gone. The reply callback
// re-looks-up the session by client id because the pointer may not survive
// until the eviction finishes.
427 void Server::finish_reclaim_session(Session
*session
, const ref_t
<MClientReclaimReply
> &reply
)
429 Session
*target
= session
->reclaiming_from
;
431 session
->reclaiming_from
= nullptr;
435 int64_t session_id
= session
->get_client().v
;
// Deferred reply: runs under mds_lock after the target is removed; fills
// in the current OSD epoch so the client can wait for blacklisting.
436 send_reply
= new LambdaContext([this, session_id
, reply
](int r
) {
437 assert(ceph_mutex_is_locked_by_me(mds
->mds_lock
));
438 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(session_id
));
442 auto epoch
= mds
->objecter
->with_osdmap([](const OSDMap
&map
){ return map
.get_epoch(); });
443 reply
->set_epoch(epoch
);
444 mds
->send_message_client(reply
, session
);
447 send_reply
= nullptr;
// If the target is already blacklisted (or blacklist-on-evict is off)
// a plain kill suffices; otherwise evict, which also blacklists.
450 bool blacklisted
= mds
->objecter
->with_osdmap([target
](const OSDMap
&map
) {
451 return map
.is_blacklisted(target
->info
.inst
.addr
);
454 if (blacklisted
|| !g_conf()->mds_session_blacklist_on_evict
) {
455 kill_session(target
, send_reply
);
457 std::stringstream ss
;
458 mds
->evict_client(target
->get_client().v
, false, true, ss
, send_reply
);
461 mds
->send_message_client(reply
, session
);
// Entry point for MClientReclaim: sanity-check the sender, defer until
// the rank reaches clientreplay, then either finish a reclaim (FLAG_FINISH)
// or start one.
465 void Server::handle_client_reclaim(const cref_t
<MClientReclaim
> &m
)
467 Session
*session
= mds
->get_session(m
);
468 dout(3) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
469 assert(m
->get_source().is_client()); // should _not_ come from an mds!
472 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
// Reclaim needs session/replay state; retry once we reach clientreplay.
476 if (mds
->get_state() < MDSMap::STATE_CLIENTREPLAY
) {
477 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
481 if (m
->get_flags() & MClientReclaim::FLAG_FINISH
) {
482 finish_reclaim_session(session
);
484 reclaim_session(session
, m
);
// Handle an MClientSession state-machine message from a client:
//  - REQUEST_OPEN: validate (not blacklisted, required features present,
//    claimed root within caps, unique uuid), then journal an ESession and
//    finish via C_MDS_session_finish;
//  - REQUEST_RENEWCAPS: refresh lease/caps timing, reviving stale sessions;
//  - REQUEST_CLOSE: push-seq-checked, journals the close;
//  - FLUSHMSG_ACK / REQUEST_FLUSH_MDLOG: bookkeeping.
// Sessionless messages are rejected; messages arriving before the rank is
// ready are requeued via C_MDS_RetryMessage.
// Fix in this revision: the rejection error string for a malformed root
// read "invalue root" — corrected to "invalid root".
488 void Server::handle_client_session(const cref_t
<MClientSession
> &m
)
491 Session
*session
= mds
->get_session(m
);
493 dout(3) << "handle_client_session " << *m
<< " from " << m
->get_source() << dendl
;
494 ceph_assert(m
->get_source().is_client()); // should _not_ come from an mds!
497 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
498 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
499 reply
->metadata
["error_string"] = "sessionless";
500 mds
->send_message(reply
, m
->get_connection());
// Gate by op: renewcaps is always handled, close waits for active,
// everything else waits for clientreplay.
504 if (m
->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS
) {
505 // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
506 } else if (m
->get_op() == CEPH_SESSION_REQUEST_CLOSE
) {
507 // close requests need to be handled when mds is active
508 if (mds
->get_state() < MDSMap::STATE_ACTIVE
) {
509 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
513 if (mds
->get_state() < MDSMap::STATE_CLIENTREPLAY
) {
514 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
520 logger
->inc(l_mdss_handle_client_session
);
523 switch (m
->get_op()) {
524 case CEPH_SESSION_REQUEST_OPEN
:
// Drop duplicate opens: any in-flight or live state means this is stale.
525 if (session
->is_opening() ||
526 session
->is_open() ||
527 session
->is_stale() ||
528 session
->is_killing() ||
529 terminating_sessions
) {
530 dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl
;
533 ceph_assert(session
->is_closed() || session
->is_closing());
535 if (mds
->is_stopping()) {
536 dout(10) << "mds is stopping, dropping open req" << dendl
;
541 auto& addr
= session
->info
.inst
.addr
;
542 session
->set_client_metadata(client_metadata_t(m
->metadata
, m
->supported_features
, m
->metric_spec
));
543 auto& client_metadata
= session
->info
.client_metadata
;
// Structured log line for every session-open outcome (accepted/rejected).
545 auto log_session_status
= [this, m
, session
](std::string_view status
, std::string_view err
) {
546 auto now
= ceph_clock_now();
547 auto throttle_elapsed
= m
->get_recv_complete_stamp() - m
->get_throttle_stamp();
548 auto elapsed
= now
- m
->get_recv_stamp();
549 CachedStackStringStream css
;
550 *css
<< "New client session:"
551 << " addr=\"" << session
->info
.inst
.addr
<< "\""
552 << ",elapsed=" << elapsed
553 << ",throttled=" << throttle_elapsed
554 << ",status=\"" << status
<< "\"";
556 *css
<< ",error=\"" << err
<< "\"";
558 const auto& metadata
= session
->info
.client_metadata
;
559 if (auto it
= metadata
.find("root"); it
!= metadata
.end()) {
560 *css
<< ",root=\"" << it
->second
<< "\"";
562 dout(2) << css
->strv() << dendl
;
// Send a REJECT to the client; the error string is only attached for
// clients new enough to understand it (MIMIC feature).
565 auto send_reject_message
= [this, &session
, &log_session_status
](std::string_view err_str
) {
566 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
567 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
))
568 m
->metadata
["error_string"] = err_str
;
569 mds
->send_message_client(m
, session
);
570 log_session_status("REJECTED", err_str
);
573 bool blacklisted
= mds
->objecter
->with_osdmap(
574 [&addr
](const OSDMap
&osd_map
) -> bool {
575 return osd_map
.is_blacklisted(addr
);
579 dout(10) << "rejecting blacklisted client " << addr
<< dendl
;
580 send_reject_message("blacklisted");
// Older clients send no feature bits; infer them from other metadata.
585 if (client_metadata
.features
.empty())
586 infer_supported_features(session
, client_metadata
);
588 dout(20) << __func__
<< " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl
;
589 dout(20) << " features: '" << client_metadata
.features
<< "'" << dendl
;
590 dout(20) << " metric specification: [" << client_metadata
.metric_spec
<< "]" << dendl
;
591 for (const auto& p
: client_metadata
) {
592 dout(20) << " " << p
.first
<< ": " << p
.second
<< dendl
;
// Reject clients lacking any administrator-required feature.
595 feature_bitset_t missing_features
= required_client_features
;
596 missing_features
-= client_metadata
.features
;
597 if (!missing_features
.empty()) {
599 ss
<< "missing required features '" << missing_features
<< "'";
600 send_reject_message(ss
.str());
601 mds
->clog
->warn() << "client session (" << session
->info
.inst
602 << ") lacks required features " << missing_features
603 << "; client supports " << client_metadata
.features
;
608 // Special case for the 'root' metadata path; validate that the claimed
609 // root is actually within the caps of the session
610 if (auto it
= client_metadata
.find("root"); it
!= client_metadata
.end()) {
611 auto claimed_root
= it
->second
;
614 // claimed_root has a leading "/" which we strip before passing
616 if (claimed_root
.empty() || claimed_root
[0] != '/') {
618 ss
<< "invalid root '" << claimed_root
<< "'";
619 } else if (!session
->auth_caps
.path_capable(claimed_root
.substr(1))) {
621 ss
<< "non-allowable root '" << claimed_root
<< "'";
625 // Tell the client we're rejecting their open
626 send_reject_message(ss
.str());
627 mds
->clog
->warn() << "client session with " << ss
.str()
628 << " denied (" << session
->info
.inst
<< ")";
// Reject duplicate uuids — a uuid must identify at most one session.
634 if (auto it
= client_metadata
.find("uuid"); it
!= client_metadata
.end()) {
635 if (find_session_by_uuid(it
->second
)) {
636 send_reject_message("duplicated session uuid");
637 mds
->clog
->warn() << "client session with duplicated session uuid '"
638 << it
->second
<< "' denied (" << session
->info
.inst
<< ")";
644 if (session
->is_closed())
645 mds
->sessionmap
.add_session(session
);
// Project the sessionmap change, move to OPENING, and journal an
// ESession; C_MDS_session_finish completes the open when it commits.
647 pv
= mds
->sessionmap
.mark_projected(session
);
648 sseq
= mds
->sessionmap
.set_state(session
, Session::STATE_OPENING
);
649 mds
->sessionmap
.touch_session(session
);
650 auto fin
= new LambdaContext([log_session_status
= std::move(log_session_status
)](int r
){
652 log_session_status("ACCEPTED", "");
654 mdlog
->start_submit_entry(new ESession(m
->get_source_inst(), true, pv
, client_metadata
),
655 new C_MDS_session_finish(this, session
, sseq
, true, pv
, fin
));
660 case CEPH_SESSION_REQUEST_RENEWCAPS
:
// Touch the session; a stale session is revived and its caps resumed.
661 if (session
->is_open() || session
->is_stale()) {
662 mds
->sessionmap
.touch_session(session
);
663 if (session
->is_stale()) {
664 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
665 mds
->locker
->resume_stale_caps(session
);
666 mds
->sessionmap
.touch_session(session
);
668 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_RENEWCAPS
, m
->get_seq());
669 mds
->send_message_client(reply
, session
);
671 dout(10) << "ignoring renewcaps on non open|stale session (" << session
->get_state_name() << ")" << dendl
;
675 case CEPH_SESSION_REQUEST_CLOSE
:
677 if (session
->is_closed() ||
678 session
->is_closing() ||
679 session
->is_killing()) {
680 dout(10) << "already closed|closing|killing, dropping this req" << dendl
;
683 if (session
->is_importing()) {
684 dout(10) << "ignoring close req on importing session" << dendl
;
687 ceph_assert(session
->is_open() ||
688 session
->is_stale() ||
689 session
->is_opening());
// Push-seq check: a close that raced with messages we pushed is stale.
690 if (m
->get_seq() < session
->get_push_seq()) {
691 dout(10) << "old push seq " << m
->get_seq() << " < " << session
->get_push_seq()
692 << ", dropping" << dendl
;
695 // We are getting a seq that is higher than expected.
696 // Handle the same as any other seqn error.
698 if (m
->get_seq() != session
->get_push_seq()) {
699 dout(0) << "old push seq " << m
->get_seq() << " != " << session
->get_push_seq()
700 << ", BUGGY!" << dendl
;
701 mds
->clog
->warn() << "incorrect push seq " << m
->get_seq() << " != "
702 << session
->get_push_seq() << ", dropping" << " from client : " << session
->get_human_name();
705 journal_close_session(session
, Session::STATE_CLOSING
, NULL
);
709 case CEPH_SESSION_FLUSHMSG_ACK
:
710 finish_flush_session(session
, m
->get_seq());
713 case CEPH_SESSION_REQUEST_FLUSH_MDLOG
:
714 if (mds
->is_active())
// Ask one client to ack all messages pushed so far: registers a flush
// waiter on the session (tied to the gather) and sends CEPH_SESSION_FLUSHMSG.
// Sessions that are not open, have no connection, or whose peer lacks
// EXPORT_PEER support are skipped.
724 void Server::flush_session(Session
*session
, MDSGatherBuilder
*gather
) {
725 if (!session
->is_open() ||
726 !session
->get_connection() ||
727 !session
->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER
)) {
731 version_t seq
= session
->wait_for_flush(gather
->new_sub());
732 mds
->send_message_client(
733 make_message
<MClientSession
>(CEPH_SESSION_FLUSHMSG
, seq
), session
);
// Flush every client in client_set under one gather; each client id must
// correspond to a live session (asserted).
736 void Server::flush_client_sessions(set
<client_t
>& client_set
, MDSGatherBuilder
& gather
)
738 for (set
<client_t
>::iterator p
= client_set
.begin(); p
!= client_set
.end(); ++p
) {
739 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(p
->v
));
740 ceph_assert(session
);
741 flush_session(session
, &gather
);
// A client acked FLUSHMSG up to seq: collect the waiters registered by
// flush_session() and queue them for completion.
745 void Server::finish_flush_session(Session
*session
, version_t seq
)
747 MDSContext::vec finished
;
748 session
->finish_flush(seq
, finished
);
749 mds
->queue_waiters(finished
);
// Completion of a journaled session open/close (runs from
// C_MDS_session_finish::finish once the ESession entry is safe).
// Applies the logged change: schedules ino purges, releases preallocated
// inos on close, marks the sessionmap dirty, then finalizes the state
// transition — OPENING -> OPEN (send CEPH_SESSION_OPEN), or
// CLOSING/KILLING -> CLOSED (strip caps/leases, notify/reset the
// connection, remove the session).
752 void Server::_session_logged(Session
*session
, uint64_t state_seq
, bool open
, version_t pv
,
753 const interval_set
<inodeno_t
>& inos
, version_t piv
,
754 const interval_set
<inodeno_t
>& purge_inos
, LogSegment
*ls
)
756 dout(10) << "_session_logged " << session
->info
.inst
757 << " state_seq " << state_seq
758 << " " << (open
? "open":"close")
760 << " purge_inos : " << purge_inos
<< dendl
;
763 dout(10) << "_session_logged seq : " << ls
->seq
<< dendl
;
// Inodes to purge are pinned on the log segment and handed to the cache.
764 if (purge_inos
.size()){
765 ls
->purge_inodes
.insert(purge_inos
);
766 mdcache
->purge_inodes(purge_inos
, ls
);
// On close, return the session's preallocated inos to the inotable; the
// journaled inotable version must match what we now apply.
771 ceph_assert(session
->is_closing() || session
->is_killing() ||
772 session
->is_opening()); // re-open closing session
773 session
->info
.prealloc_inos
.subtract(inos
);
774 session
->delegated_inos
.clear();
775 mds
->inotable
->apply_release_ids(inos
);
776 ceph_assert(mds
->inotable
->get_version() == piv
);
779 mds
->sessionmap
.mark_dirty(session
);
// If the session changed state since we journaled, this completion is
// stale — do nothing.
782 if (session
->get_state_seq() != state_seq
) {
783 dout(10) << " journaled state_seq " << state_seq
<< " != current " << session
->get_state_seq()
784 << ", noop" << dendl
;
785 // close must have been canceled (by an import?), or any number of other things..
787 ceph_assert(session
->is_opening());
788 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
789 mds
->sessionmap
.touch_session(session
);
790 ceph_assert(session
->get_connection());
791 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_OPEN
);
792 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
))
793 reply
->supported_features
= supported_features
;
794 mds
->send_message_client(reply
, session
);
795 if (mdcache
->is_readonly()) {
796 auto m
= make_message
<MClientSession
>(CEPH_SESSION_FORCE_RO
);
797 mds
->send_message_client(m
, session
);
799 } else if (session
->is_closing() ||
800 session
->is_killing()) {
801 // kill any lingering capabilities, leases, requests
802 while (!session
->caps
.empty()) {
803 Capability
*cap
= session
->caps
.front();
804 CInode
*in
= cap
->get_inode();
805 dout(20) << " killing capability " << ccap_string(cap
->issued()) << " on " << *in
<< dendl
;
806 mds
->locker
->remove_client_cap(in
, cap
, true);
808 while (!session
->leases
.empty()) {
809 ClientLease
*r
= session
->leases
.front();
810 CDentry
*dn
= static_cast<CDentry
*>(r
->parent
);
811 dout(20) << " killing client lease of " << *dn
<< dendl
;
812 dn
->remove_client_lease(r
, mds
->locker
);
// Departing clients may be the last ones holding up the reconnect or
// reclaim gathers; finish those phases if so.
814 if (client_reconnect_gather
.erase(session
->info
.get_client())) {
815 dout(20) << " removing client from reconnect set" << dendl
;
816 if (client_reconnect_gather
.empty()) {
817 dout(7) << " client " << session
->info
.inst
<< " was last reconnect, finishing" << dendl
;
818 reconnect_gather_finish();
821 if (client_reclaim_gather
.erase(session
->info
.get_client())) {
822 dout(20) << " removing client from reclaim set" << dendl
;
823 if (client_reclaim_gather
.empty()) {
824 dout(7) << " client " << session
->info
.inst
<< " was last reclaimed, finishing" << dendl
;
825 mds
->maybe_clientreplay_done();
829 if (session
->is_closing()) {
830 // mark con disposable. if there is a fault, we will get a
831 // reset and clean it up. if the client hasn't received the
832 // CLOSE message yet, they will reconnect and get an
833 // ms_handle_remote_reset() and realize they had in fact closed.
834 // do this *before* sending the message to avoid a possible
836 if (session
->get_connection()) {
837 // Conditional because terminate_sessions will indiscriminately
838 // put sessions in CLOSING whether they ever had a conn or not.
839 session
->get_connection()->mark_disposable();
843 mds
->send_message_client(make_message
<MClientSession
>(CEPH_SESSION_CLOSE
), session
);
844 mds
->sessionmap
.set_state(session
, Session::STATE_CLOSED
);
846 mds
->sessionmap
.remove_session(session
);
847 } else if (session
->is_killing()) {
848 // destroy session, close connection
849 if (session
->get_connection()) {
850 session
->get_connection()->mark_down();
851 mds
->sessionmap
.set_state(session
, Session::STATE_CLOSED
);
852 session
->set_connection(nullptr);
854 mds
->sessionmap
.remove_session(session
);
864 * Inject sessions from some source other than actual connections.
867 * - sessions inferred from journal replay
868 * - sessions learned from other MDSs during rejoin
869 * - sessions learned from other MDSs during dir/caps migration
870 * - sessions learned from other MDSs during a cross-MDS rename
// Force sessions into (at least) OPENING for the given client map `cm`,
// merging any provided client metadata `cmm`; blacklisted clients are
// dropped from cm first. Each handled session is recorded in `smap` with
// its state seq and marked importing. Returns the projected sessionmap
// version after marking all sessions projected.
872 version_t
Server::prepare_force_open_sessions(map
<client_t
,entity_inst_t
>& cm
,
873 map
<client_t
,client_metadata_t
>& cmm
,
874 map
<client_t
, pair
<Session
*,uint64_t> >& smap
)
876 version_t pv
= mds
->sessionmap
.get_projected();
878 dout(10) << "prepare_force_open_sessions " << pv
879 << " on " << cm
.size() << " clients"
// Filter out clients the OSDMap says are blacklisted.
882 mds
->objecter
->with_osdmap(
883 [this, &cm
, &cmm
](const OSDMap
&osd_map
) {
884 for (auto p
= cm
.begin(); p
!= cm
.end(); ) {
885 if (osd_map
.is_blacklisted(p
->second
.addr
)) {
886 dout(10) << " ignoring blacklisted client." << p
->first
887 << " (" << p
->second
.addr
<< ")" << dendl
;
896 for (map
<client_t
,entity_inst_t
>::iterator p
= cm
.begin(); p
!= cm
.end(); ++p
) {
897 Session
*session
= mds
->sessionmap
.get_or_add_session(p
->second
);
898 pv
= mds
->sessionmap
.mark_projected(session
);
// Dead or dying sessions are pushed to OPENING and pick up the supplied
// metadata; live ones are left in their current open-ish state.
900 if (session
->is_closed() ||
901 session
->is_closing() ||
902 session
->is_killing()) {
903 sseq
= mds
->sessionmap
.set_state(session
, Session::STATE_OPENING
);
904 auto q
= cmm
.find(p
->first
);
906 session
->info
.client_metadata
.merge(q
->second
);
908 ceph_assert(session
->is_open() ||
909 session
->is_opening() ||
910 session
->is_stale());
913 smap
[p
->first
] = make_pair(session
, sseq
);
914 session
->inc_importing();
// Counterpart of prepare_force_open_sessions(): for each (session, sseq)
// pair, if the session is still at the recorded state seq move it to OPEN
// and notify the client (plus FORCE_RO if the cache is read-only);
// otherwise it must already be open/stale. Always drops the importing
// refcount taken in prepare.
919 void Server::finish_force_open_sessions(const map
<client_t
,pair
<Session
*,uint64_t> >& smap
,
923 * FIXME: need to carefully consider the race conditions between a
924 * client trying to close a session and an MDS doing an import
925 * trying to force open a session...
927 dout(10) << "finish_force_open_sessions on " << smap
.size() << " clients,"
928 << " initial v " << mds
->sessionmap
.get_version() << dendl
;
930 for (auto &it
: smap
) {
931 Session
*session
= it
.second
.first
;
932 uint64_t sseq
= it
.second
.second
;
// A changed state seq means something else transitioned the session
// since prepare; skip it.
934 if (session
->get_state_seq() != sseq
) {
935 dout(10) << "force_open_sessions skipping changed " << session
->info
.inst
<< dendl
;
937 dout(10) << "force_open_sessions opened " << session
->info
.inst
<< dendl
;
938 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
939 mds
->sessionmap
.touch_session(session
);
941 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_OPEN
);
942 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
))
943 reply
->supported_features
= supported_features
;
944 mds
->send_message_client(reply
, session
);
946 if (mdcache
->is_readonly())
947 mds
->send_message_client(make_message
<MClientSession
>(CEPH_SESSION_FORCE_RO
), session
);
950 dout(10) << "force_open_sessions skipping already-open " << session
->info
.inst
<< dendl
;
951 ceph_assert(session
->is_open() || session
->is_stale());
955 session
->dec_importing();
958 mds
->sessionmap
.mark_dirty(session
);
961 dout(10) << __func__
<< ": final v " << mds
->sessionmap
.get_version() << dendl
;
// Fires once the journal is safe after terminate_sessions() queued all
// session closes; clears the flag that blocks new session opens.
964 class C_MDS_TerminatedSessions
: public ServerContext
{
965 void finish(int r
) override
{
966 server
->terminating_sessions
= false;
969 explicit C_MDS_TerminatedSessions(Server
*s
) : ServerContext(s
) {}
/**
 * Begin closing every client session. Sessions already on their way out
 * (closing/killing/closed) are left alone; the rest get a journaled close.
 *
 * NOTE(review): recovered from a lossy extraction — the for-loop
 * condition/increment lines and a probable 'continue;' are missing.
 */
void Server::terminate_sessions()
  dout(5) << "terminating all sessions..." << dendl;

  // Cleared by C_MDS_TerminatedSessions once the closes are journaled safe.
  terminating_sessions = true;

  // kill them off.  clients will retry etc.
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (set<Session*>::const_iterator p = sessions.begin();
    Session *session = *p;
    // Skip sessions that are already being torn down.
    if (session->is_closing() ||
        session->is_killing() ||
        session->is_closed())
    journal_close_session(session, Session::STATE_CLOSING, NULL);

  // Fire the context once everything above is safely in the journal.
  mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
/**
 * Periodic scan for unresponsive clients: laggy OPEN sessions are marked
 * STALE (with stale caps revoked), and sessions past the autoclose/timeout
 * thresholds are collected into to_evict and evicted.
 *
 * NOTE(review): recovered from a lossy extraction — brace-only lines,
 * 'continue;'/'return;' statements and some else-branches are missing;
 * verify against upstream Server.cc before compiling.
 */
void Server::find_idle_sessions()
  auto now = clock::now();
  auto last_cleared_laggy = mds->last_cleared_laggy();

  dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy
           << "s ago" << dendl;

  // (caps go stale, lease die)
  double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now());
  double cutoff = queue_max_age + mds->mdsmap->get_session_timeout();

  // don't kick clients if we've been laggy
  if (last_cleared_laggy < cutoff) {
    dout(10) << " last cleared laggy " << last_cleared_laggy
             << "s ago (< cutoff " << cutoff
             << "), not marking any client stale" << dendl;

  std::vector<Session*> to_evict;

  bool defer_session_stale = g_conf().get_val<bool>("mds_defer_session_stale");
  const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN);
  if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) {
    std::vector<Session*> new_stale;

    for (auto session : *(sessions_p1->second)) {
      auto last_cap_renew_span =
        std::chrono::duration<double>(now - session->last_cap_renew).count();
      if (last_cap_renew_span < cutoff) {
        dout(20) << "laggiest active session is " << session->info.inst
                 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;

      // Any recent message counts as liveness, not just cap renewals.
      if (session->last_seen > session->last_cap_renew) {
        last_cap_renew_span =
          std::chrono::duration<double>(now - session->last_seen).count();
        if (last_cap_renew_span < cutoff) {
          dout(20) << "laggiest active session is " << session->info.inst
                   << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;

      if (last_cap_renew_span >= mds->mdsmap->get_session_autoclose()) {
        dout(20) << "evicting session " << session->info.inst << " since autoclose "
                    "has arrived" << dendl;
        // evict session without marking it stale
        to_evict.push_back(session);

      // Optionally defer marking the session stale while nothing is being
      // flushed or revoked from it.
      if (defer_session_stale &&
          !session->is_any_flush_waiter() &&
          !mds->locker->is_revoking_any_caps_from(session->get_client())) {
        dout(20) << "deferring marking session " << session->info.inst << " stale "
                    "since it holds no caps" << dendl;

      // A per-session "timeout" metadata entry overrides the global timeout.
      auto it = session->info.client_metadata.find("timeout");
      if (it != session->info.client_metadata.end()) {
        unsigned timeout = strtoul(it->second.c_str(), nullptr, 0);
        // NOTE(review): upstream treats timeout == 0 as infinite; the
        // guarding branch is missing from the extraction.
        dout(10) << "skipping session " << session->info.inst
                 << ", infinite timeout specified" << dendl;

        double cutoff = queue_max_age + timeout;  // intentionally shadows outer cutoff
        if (last_cap_renew_span < cutoff) {
          dout(10) << "skipping session " << session->info.inst
                   << ", timeout (" << timeout << ") specified"
                   << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;

        // do not go through stale, evict it directly.
        to_evict.push_back(session);

      dout(10) << "new stale session " << session->info.inst
               << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
      new_stale.push_back(session);

    // Transition collected sessions to STALE and revoke their stale caps.
    for (auto session : new_stale) {
      mds->sessionmap.set_state(session, Session::STATE_STALE);
      if (mds->locker->revoke_stale_caps(session)) {
        mds->locker->remove_stale_leases(session);
        finish_flush_session(session, session->get_push_seq());
        auto m = make_message<MClientSession>(CEPH_SESSION_STALE, session->get_push_seq());
        mds->send_message_client(m, session);
        // NOTE(review): the else-branch boundary is missing — this push_back
        // presumably belongs to the revoke-failed path; confirm upstream.
        to_evict.push_back(session);

  cutoff = queue_max_age + mds->mdsmap->get_session_autoclose();

  // Collect a list of sessions exceeding the autoclose threshold
  const auto sessions_p2 = mds->sessionmap.by_state.find(Session::STATE_STALE);
  if (sessions_p2 != mds->sessionmap.by_state.end() && !sessions_p2->second->empty()) {
    for (auto session : *(sessions_p2->second)) {
      assert(session->is_stale());
      auto last_cap_renew_span =
        std::chrono::duration<double>(now - session->last_cap_renew).count();
      if (last_cap_renew_span < cutoff) {
        dout(20) << "oldest stale session is " << session->info.inst
                 << " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl;
      to_evict.push_back(session);

  for (auto session : to_evict) {
    // Sessions that are mid-import must not be evicted here.
    if (session->is_importing()) {
      dout(10) << "skipping session " << session->info.inst
               << ", it's being imported" << dendl;

    auto last_cap_renew_span =
      std::chrono::duration<double>(now - session->last_cap_renew).count();
    mds->clog->warn() << "evicting unresponsive client " << *session
                      << ", after " << last_cap_renew_span << " seconds";
    dout(10) << "autoclosing stale session " << session->info.inst
             << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;

    if (g_conf()->mds_session_blacklist_on_timeout) {
      std::stringstream ss;
      mds->evict_client(session->get_client().v, false, true, ss, nullptr);
      // NOTE(review): this call belongs to the non-blacklist (else) path in
      // the original; the else line is missing from the extraction.
      kill_session(session, NULL);
/**
 * Evict clients that have not acked a cap revoke within
 * cap_revoke_eviction_timeout seconds. A timeout of 0 disables the feature.
 *
 * NOTE(review): recovered from a lossy extraction — the early 'return;' and
 * some argument lines of evict_client() are missing; verify upstream.
 */
void Server::evict_cap_revoke_non_responders() {
  // Feature disabled when the timeout is zero.
  if (!cap_revoke_eviction_timeout) {

  auto&& to_evict = mds->locker->get_late_revoking_clients(cap_revoke_eviction_timeout);

  for (auto const &client : to_evict) {
    mds->clog->warn() << "client id " << client << " has not responded to"
                      << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
                      << " seconds, evicting";
    dout(1) << __func__ << ": evicting cap revoke non-responder client id "

    std::stringstream ss;
    bool evicted = mds->evict_client(client.v, false,
                                     g_conf()->mds_session_blacklist_on_evict,
    // Count successful evictions in the perf counters.
    if (evicted && logger) {
      logger->inc(l_mdss_cap_revoke_eviction);
/**
 * React to runtime configuration changes: refresh the cached values of the
 * mds_* options this Server consults on hot paths.
 *
 * NOTE(review): recovered from a lossy extraction — closing braces are
 * missing between the if-blocks.
 */
void Server::handle_conf_change(const std::set<std::string>& changed) {
  if (changed.count("mds_replay_unsafe_with_closed_session")) {
    replay_unsafe_with_closed_session =
      g_conf().get_val<bool>("mds_replay_unsafe_with_closed_session");
  if (changed.count("mds_cap_revoke_eviction_timeout")) {
    cap_revoke_eviction_timeout =
      g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
    dout(20) << __func__ << " cap revoke eviction timeout changed to "
             << cap_revoke_eviction_timeout << dendl;
  if (changed.count("mds_recall_max_decay_rate")) {
    // Rebuild the decay counter with the new rate.
    recall_throttle = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
  if (changed.count("mds_max_snaps_per_dir")) {
    max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
    dout(20) << __func__ << " max snapshots per directory changed to "
             << max_snaps_per_dir << dendl;
  if (changed.count("mds_client_delegate_inos_pct")) {
    delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
/*
 * XXX bump in the interface here, not using an MDSContext here
 * because all the callers right now happen to use a SaferCond
 */
/**
 * Kill a client session. Live sessions (opening/open/stale, not importing)
 * get a journaled close in STATE_KILLING; sessions already closing/killing
 * just wait for the journal; closed/importing sessions complete on_safe
 * immediately.
 *
 * NOTE(review): recovered from a lossy extraction — else lines and braces
 * are missing between the branches.
 */
void Server::kill_session(Session *session, Context *on_safe, bool need_purge_inos)
  // Must hold the big MDS lock.
  ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));

  if ((session->is_opening() ||
       session->is_open() ||
       session->is_stale()) &&
      !session->is_importing()) {
    dout(10) << "kill_session " << session << dendl;
    journal_close_session(session, Session::STATE_KILLING, on_safe, need_purge_inos);

    dout(10) << "kill_session importing or already closing/killing " << session << dendl;
    if (session->is_closing() ||
        session->is_killing()) {
      // Already being torn down; just wait for the journal flush.
      mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, on_safe));

      ceph_assert(session->is_closed() ||
                  session->is_importing());
      on_safe->complete(0);
/**
 * Kill every client session whose address appears in the OSDMap blacklist.
 * Returns the number of sessions killed.
 *
 * NOTE(review): recovered from a lossy extraction — closing braces,
 * 'continue;' lines and the pre-nautilus guard around the TYPE_LEGACY check
 * are missing; verify upstream.
 */
size_t Server::apply_blacklist(const std::set<entity_addr_t> &blacklist)
  // Pre-nautilus OSDMaps record blacklist entries with TYPE_LEGACY addrs.
  bool prenautilus = mds->objecter->with_osdmap(
      [&](const OSDMap& o) {
        return o.require_osd_release < ceph_release_t::nautilus;

  std::vector<Session*> victims;
  const auto& sessions = mds->sessionmap.get_sessions();
  for (const auto& p : sessions) {
    if (!p.first.is_client()) {
      // Do not apply OSDMap blacklist to MDS daemons, we find out
      // about their death via MDSMap.

    Session *s = p.second;
    auto inst_addr = s->info.inst.addr;
    // blacklist entries are always TYPE_ANY for nautilus+
    inst_addr.set_type(entity_addr_t::TYPE_ANY);
    if (blacklist.count(inst_addr)) {
      victims.push_back(s);
      // ...except pre-nautilus, they were TYPE_LEGACY
      inst_addr.set_type(entity_addr_t::TYPE_LEGACY);
      if (blacklist.count(inst_addr)) {
        victims.push_back(s);

  for (const auto& s : victims) {
    kill_session(s, nullptr);

  dout(10) << "apply_blacklist: killed " << victims.size() << dendl;

  return victims.size();
/**
 * Journal a session close: move the session to the target state (CLOSING or
 * KILLING), release its preallocated/pending inos via the inotable, submit
 * an ESession event with a C_MDS_session_finish completion, and kill any
 * requests still attached to the session.
 *
 * NOTE(review): recovered from a lossy extraction — the declaration of
 * 'piv', the else keyword between the two submit paths, the loop increment
 * '++p;' and several braces are missing; verify against upstream.
 */
void Server::journal_close_session(Session *session, int state, Context *on_safe, bool need_purge_inos)
  dout(10) << __func__ << " : "
           << "("<< need_purge_inos << ")"
           << session->info.inst
           << "(" << session->info.prealloc_inos.size() << "|"
           << session->pending_prealloc_inos.size() << ")" << dendl;

  uint64_t sseq = mds->sessionmap.set_state(session, state);
  version_t pv = mds->sessionmap.mark_projected(session);

  // release alloc and pending-alloc inos for this session
  // and wipe out session state, in case the session close aborts for some reason
  interval_set<inodeno_t> both;
  both.insert(session->pending_prealloc_inos);
  // If we are not purging, the already-preallocated inos get released too.
  if (!need_purge_inos)
    both.insert(session->info.prealloc_inos);

  mds->inotable->project_release_ids(both);
  piv = mds->inotable->get_projected_version();

  if(need_purge_inos && session->info.prealloc_inos.size()) {
    // TODO(review): log-message typo "indoes" -> "inodes" (runtime string;
    // intentionally left unchanged in this documentation-only pass).
    dout(10) << "start purge indoes " << session->info.prealloc_inos << dendl;
    LogSegment* ls = mdlog->get_current_segment();
    LogEvent* e = new ESession(session->info.inst, false, pv, both, piv,
                               session->info.prealloc_inos);
    MDSLogContextBase* c = new C_MDS_session_finish(this, session, sseq, false, pv, both, piv,
                                                    session->info.prealloc_inos, ls, on_safe);
    mdlog->start_submit_entry(e, c);

    // (else path) no purge: journal the close with an empty purge set.
    interval_set<inodeno_t> empty;
    LogEvent* e = new ESession(session->info.inst, false, pv, both, piv, empty);
    MDSLogContextBase* c = new C_MDS_session_finish(this, session, sseq, false, pv, both, piv, on_safe);
    mdlog->start_submit_entry(e, c);

  // clean up requests, too
  for (auto p = session->requests.begin(); !p.end(); ) {
    MDRequestRef mdr(*p);
    mdcache->request_kill(mdr);

  finish_flush_session(session, session->get_push_seq());
/**
 * Start the client-reconnect phase after an MDS restart/failover: collect
 * all open client sessions into client_reconnect_gather and wait for their
 * MClientReconnect messages. Completes immediately when there are none.
 *
 * NOTE(review): recovered from a lossy extraction — closing braces and an
 * early 'return;' after reconnect_gather_finish() are missing.
 */
void Server::reconnect_clients(MDSContext *reconnect_done_)
  reconnect_done = reconnect_done_;

  auto now = clock::now();
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (auto session : sessions) {
    if (session->is_open()) {
      client_reconnect_gather.insert(session->get_client());
      session->set_reconnecting(true);
      session->last_cap_renew = now;

  if (client_reconnect_gather.empty()) {
    dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
    reconnect_gather_finish();

  // clients will get the mdsmap and discover we're reconnecting via the monitor.

  reconnect_start = now;
  dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
  mds->sessionmap.dump();
/**
 * Handle an MClientReconnect from a client during the reconnect phase:
 * validate the session and MDS state (rejecting sessionless/late/featureless
 * clients with a CLOSE), then replay the client's snaprealms and caps into
 * the cache, and finish the gather once all clients have reconnected.
 *
 * NOTE(review): recovered from a lossy extraction — early 'return;'s,
 * 'continue;'s, else branches, a 'bool deny' style flag and several braces
 * are missing; verify against upstream Server.cc before compiling.
 */
void Server::handle_client_reconnect(const cref_t<MClientReconnect> &m)
  dout(7) << "handle_client_reconnect " << m->get_source()
          << (m->has_more() ? " (more)" : "") << dendl;
  client_t from = m->get_source().num();
  Session *session = mds->get_session(m);

  // Reject messages that arrive without an established session.
  dout(0) << " ignoring sessionless msg " << *m << dendl;
  auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
  reply->metadata["error_string"] = "sessionless";
  mds->send_message(reply, m->get_connection());

  if (!session->is_open()) {
    dout(0) << " ignoring msg from not-open session" << *m << dendl;
    auto reply = make_message<MClientSession>(CEPH_SESSION_CLOSE);
    mds->send_message(reply, m->get_connection());

  // The mdsmap may announce reconnect before we have transitioned; retry.
  if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
    dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
    mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));

  auto delay = std::chrono::duration<double>(clock::now() - reconnect_start).count();
  dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;

  // Deny reconnects that arrive after the window has closed.
  if (!mds->is_reconnect() || mds->get_want_state() != CEPH_MDS_STATE_RECONNECT || reconnect_evicting) {
    // XXX maybe in the future we can do better than this?
    dout(1) << " no longer in reconnect state, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is "
                      << ceph_mds_state_name(mds->get_state())
                      << ") from " << m->get_source_inst()
                      << " after " << delay << " (allowed interval " << g_conf()->mds_reconnect_timeout << ")";

  std::string error_str;
  if (!session->is_open()) {
    error_str = "session is closed";
  } else if (mdcache->is_readonly()) {
    error_str = "mds is readonly";

    // Fall back to feature inference for clients with no feature metadata.
    if (session->info.client_metadata.features.empty())
      infer_supported_features(session, session->info.client_metadata);

    feature_bitset_t missing_features = required_client_features;
    missing_features -= session->info.client_metadata.features;
    if (!missing_features.empty()) {
      ss << "missing required features '" << missing_features << "'";
      error_str = ss.str();

  if (!error_str.empty()) {
    dout(1) << " " << error_str << ", ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt from "
                      << m->get_source_inst() << " (" << error_str << ")";

  // Denied: close (and possibly kill) the session.
  auto r = make_message<MClientSession>(CEPH_SESSION_CLOSE);
  mds->send_message_client(r, session);
  if (session->is_open())
    kill_session(session, nullptr);

  if (!m->has_more()) {
    // notify client of success with an OPEN
    auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
    if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
      reply->supported_features = supported_features;
    mds->send_message_client(reply, session);
    mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;

  session->last_cap_renew = clock::now();

  // Replay the client's snaprealms into the cache.
  for (const auto &r : m->realms) {
    CInode *in = mdcache->get_inode(inodeno_t(r.realm.ino));
    // Realms on purging inodes are skipped.
    if (in && in->state_test(CInode::STATE_PURGING))
    if (in->snaprealm) {
      dout(15) << "open snaprealm (w inode) on " << *in << dendl;

      // this can happen if we are non-auth or we rollback snaprealm
      dout(15) << "open snaprealm (null snaprealm) on " << *in << dendl;
      mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));

      dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r.realm.ino)
               << " seq " << r.realm.seq << dendl;
      mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));

  // Replay the client's caps.
  for (const auto &p : m->caps) {
    // make sure our last_cap_id is MAX over all issued caps
    if (p.second.capinfo.cap_id > mdcache->last_cap_id)
      mdcache->last_cap_id = p.second.capinfo.cap_id;

    CInode *in = mdcache->get_inode(p.first);
    if (in && in->state_test(CInode::STATE_PURGING))
    if (in && in->is_auth()) {
      // we recovered it, and it's ours.  take note.
      dout(15) << "open cap realm " << inodeno_t(p.second.capinfo.snaprealm)
               << " on " << *in << dendl;
      in->reconnect_cap(from, p.second, session);
      mdcache->add_reconnected_cap(from, p.first, p.second);
      recover_filelocks(in, p.second.flockbl, m->get_orig_source().num());

    if (in && !in->is_auth()) {
      dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
      // add to cap export list.
      mdcache->rejoin_export_caps(p.first, from, p.second,
                                  in->authority().first, true);

      // don't know if the inode is mine
      dout(10) << "missing ino " << p.first << ", will load later" << dendl;
      mdcache->rejoin_recovered_caps(p.first, from, p.second, MDS_RANK_NONE);

  reconnect_last_seen = clock::now();

  if (!m->has_more()) {
    mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);

    // remove from gather set
    client_reconnect_gather.erase(from);
    session->set_reconnecting(false);
    if (client_reconnect_gather.empty())
      reconnect_gather_finish();
/**
 * For old clients that sent no feature bits, infer a feature level from the
 * client metadata (ceph_version / kernel_version) and connection features,
 * and synthesize client_metadata.features as a contiguous bitmask up to the
 * inferred level.
 *
 * NOTE(review): recovered from a lossy extraction — the declaration of the
 * local 'supported' (presumably 'int supported = -1;') and several braces
 * are missing; verify upstream.
 */
void Server::infer_supported_features(Session *session, client_metadata_t& client_metadata)
  auto it = client_metadata.find("ceph_version");
  if (it != client_metadata.end()) {
    // user space client
    if (it->second.compare(0, 16, "ceph version 12.") == 0)
      supported = CEPHFS_FEATURE_LUMINOUS;
    else if (session->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR))
      supported = CEPHFS_FEATURE_KRAKEN;

    // (else) kernel client path
    it = client_metadata.find("kernel_version");
    if (it != client_metadata.end()) {
      if (session->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING))
        supported = CEPHFS_FEATURE_LUMINOUS;

  // Last resort: file-layout-v2 connection feature implies at least jewel.
  if (supported == -1 &&
      session->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2))
    supported = CEPHFS_FEATURE_JEWEL;

  if (supported >= 0) {
    // All feature bits up to and including 'supported' are assumed present.
    unsigned long value = (1UL << (supported + 1)) - 1;
    client_metadata.features = feature_bitset_t(value);
    dout(10) << __func__ << " got '" << client_metadata.features << "'" << dendl;
/**
 * Recompute required_client_features from the MDSMap's minimum compatible
 * client release, and (when past RECONNECT) evict already-connected clients
 * that lack the required features — unless they are already blacklisted.
 *
 * NOTE(review): recovered from a lossy extraction — braces and the use of
 * the 'blacklisted' flag guarding the eviction are missing; verify upstream.
 */
void Server::update_required_client_features()
  vector<size_t> bits = CEPHFS_FEATURES_MDS_REQUIRED;

  /* If this blows up on you, you added a release without adding a new release bit to cephfs_features.h */
  static_assert(CEPHFS_CURRENT_RELEASE == CEPH_RELEASE_MAX-1);

  // Map min-compat release onto the newest feature bit that release implies.
  ceph_release_t min_compat = mds->mdsmap->get_min_compat_client();
  if (min_compat >= ceph_release_t::octopus)
    bits.push_back(CEPHFS_FEATURE_OCTOPUS);
  else if (min_compat >= ceph_release_t::nautilus)
    bits.push_back(CEPHFS_FEATURE_NAUTILUS);
  else if (min_compat >= ceph_release_t::mimic)
    bits.push_back(CEPHFS_FEATURE_MIMIC);
  else if (min_compat >= ceph_release_t::luminous)
    bits.push_back(CEPHFS_FEATURE_LUMINOUS);
  else if (min_compat >= ceph_release_t::kraken)
    bits.push_back(CEPHFS_FEATURE_KRAKEN);
  else if (min_compat >= ceph_release_t::jewel)
    bits.push_back(CEPHFS_FEATURE_JEWEL);

  std::sort(bits.begin(), bits.end());
  required_client_features = feature_bitset_t(bits);
  dout(7) << "required_client_features: " << required_client_features << dendl;

  if (mds->get_state() >= MDSMap::STATE_RECONNECT) {
    set<Session*> sessions;
    mds->sessionmap.get_client_session_set(sessions);
    for (auto session : sessions) {
      feature_bitset_t missing_features = required_client_features;
      missing_features -= session->info.client_metadata.features;
      if (!missing_features.empty()) {
        // Skip clients the OSDMap has already blacklisted.
        bool blacklisted = mds->objecter->with_osdmap(
            [session](const OSDMap &osd_map) -> bool {
              return osd_map.is_blacklisted(session->info.inst.addr);

        mds->clog->warn() << "evicting session " << *session << ", missing required features '"
                          << missing_features << "'";
        std::stringstream ss;
        mds->evict_client(session->get_client().v, false,
                          g_conf()->mds_session_blacklist_on_evict, ss);
/**
 * All clients have reconnected (or been given up on): complete the
 * reconnect_done context, but only after the snaptable cache is synced,
 * since snaprealms are used heavily during rejoin.
 *
 * NOTE(review): recovered from a lossy extraction — the else line between
 * the two completion paths is missing.
 */
void Server::reconnect_gather_finish()
  dout(7) << "reconnect_gather_finish.  failed on " << failed_reconnects << " clients" << dendl;
  ceph_assert(reconnect_done);

  if (!mds->snapclient->is_synced()) {
    // make sure snaptable cache is populated. snaprealms will be
    // extensively used in rejoin stage.
    dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl;
    mds->snapclient->wait_for_sync(reconnect_done);

    reconnect_done->complete(0);

  reconnect_done = NULL;
/**
 * Periodic tick during the reconnect phase: once the reconnect timeout has
 * elapsed (with a half-timeout extension if clients were recently seen),
 * give up on the remaining sessions — keeping "timeout"-metadata sessions
 * for reclaim, evicting/killing the rest — then finish the gather, possibly
 * after blacklist evictions complete.
 *
 * NOTE(review): recovered from a lossy extraction — early 'return;'s,
 * 'continue;', the gather sub created for evict_client, and several braces
 * are missing; verify against upstream Server.cc before compiling.
 */
void Server::reconnect_tick()
  if (reconnect_evicting) {
    dout(7) << "reconnect_tick: waiting for evictions" << dendl;

  if (client_reconnect_gather.empty())

  auto now = clock::now();
  auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count();
  // Still within the allowed reconnect window.
  if (elapse1 < g_conf()->mds_reconnect_timeout)

  vector<Session*> remaining_sessions;
  remaining_sessions.reserve(client_reconnect_gather.size());
  for (auto c : client_reconnect_gather) {
    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
    ceph_assert(session);
    remaining_sessions.push_back(session);
    // client re-sends cap flush messages before the reconnect message
    if (session->last_seen > reconnect_last_seen)
      reconnect_last_seen = session->last_seen;

  // Extend the window if any client showed signs of life recently.
  auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count();
  if (elapse2 < g_conf()->mds_reconnect_timeout / 2) {
    dout(7) << "reconnect_tick: last seen " << elapse2
            << " seconds ago, extending reconnect interval" << dendl;

  dout(7) << "reconnect timed out, " << remaining_sessions.size()
          << " clients have not reconnected in time" << dendl;

  // If we're doing blacklist evictions, use this to wait for them before
  // proceeding to reconnect_gather_finish
  MDSGatherBuilder gather(g_ceph_context);

  for (auto session : remaining_sessions) {
    // Keep sessions that have specified timeout. These sessions will prevent
    // mds from going to active. MDS goes to active after they all have been
    // killed or reclaimed.
    if (session->info.client_metadata.find("timeout") !=
        session->info.client_metadata.end()) {
      dout(1) << "reconnect keeps " << session->info.inst
              << ", need to be reclaimed" << dendl;
      client_reclaim_gather.insert(session->get_client());

    dout(1) << "reconnect gives up on " << session->info.inst << dendl;

    mds->clog->warn() << "evicting unresponsive client " << *session
                      << ", after waiting " << elapse1
                      << " seconds during MDS startup";

    if (g_conf()->mds_session_blacklist_on_timeout) {
      std::stringstream ss;
      mds->evict_client(session->get_client().v, false, true, ss,
      // (else path) no blacklist: kill the session, purging its inos.
      kill_session(session, NULL, true);

    failed_reconnects++;

  client_reconnect_gather.clear();

  if (gather.has_subs()) {
    dout(1) << "reconnect will complete once clients are evicted" << dendl;
    gather.set_finisher(new MDSInternalContextWrapper(mds, new LambdaContext(
        [this](int r){reconnect_gather_finish();})));
    reconnect_evicting = true;

    reconnect_gather_finish();
/**
 * Rebuild a reconnecting client's POSIX (fcntl) and flock lock state on an
 * inode from the bufferlist it sent: two length-prefixed runs of
 * ceph_filelock records, fcntl locks first, then flock locks.
 *
 * NOTE(review): recovered from a lossy extraction — the declarations of
 * 'numlocks' and 'lock' and the per-iteration decode of each lock record
 * are missing; verify upstream.
 */
void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
  // Nothing to recover.
  if (!locks.length()) return;

  auto p = locks.cbegin();
  decode(numlocks, p);
  // fcntl (POSIX) locks
  for (int i = 0; i < numlocks; ++i) {
    lock.client = client;  // attribute the lock to the reconnecting client
    in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
    ++in->get_fcntl_lock_state()->client_held_lock_counts[client];

  decode(numlocks, p);
  // flock (BSD) locks
  for (int i = 0; i < numlocks; ++i) {
    lock.client = client;
    in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock> (lock.start, lock));
    ++in->get_flock_lock_state()->client_held_lock_counts[client];
/*
 * Call this when the MDCache is oversized, to send requests to the clients
 * to trim some caps, and consequently unpin some inodes in the MDCache so
 * that it can trim too.
 */
/**
 * Returns {throttled, caps_recalled}: whether any recall was skipped due to
 * a throttle, and how many caps were asked to be recalled in total.
 * Sessions are processed most-caps-first; three decay-counter throttles
 * (per-session, per-session 2nd order, global) bound the recall rate.
 *
 * NOTE(review): recovered from a lossy extraction — 'continue;' statements,
 * the declaration of 'newlim', the else keyword in the newlim computation,
 * throttled-flag assignments and several braces are missing; the 'steady'
 * flag's use site is also not visible. Verify against upstream.
 */
std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
  const auto now = clock::now();
  const bool steady = !!(flags&RecallFlags::STEADY);
  const bool enforce_max = !!(flags&RecallFlags::ENFORCE_MAX);
  const bool enforce_liveness = !!(flags&RecallFlags::ENFORCE_LIVENESS);
  const bool trim = !!(flags&RecallFlags::TRIM);

  const auto max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
  const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
  const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
  const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
  const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
  const auto cache_liveness_magnitude = g_conf().get_val<Option::size_t>("mds_session_cache_liveness_magnitude");

  dout(7) << __func__ << ":"
          << " min=" << min_caps_per_client
          << " max=" << max_caps_per_client
          << " total=" << Capability::count()
          << " flags=" << flags

  /* trim caps of sessions with the most caps first */
  std::multimap<uint64_t, Session*> caps_session;
  auto f = [&caps_session, enforce_max, enforce_liveness, trim, max_caps_per_client, cache_liveness_magnitude](auto& s) {
    auto num_caps = s->caps.size();
    auto cache_liveness = s->get_session_cache_liveness();
    // A session is a recall candidate when trimming unconditionally, over
    // the max cap count, or "dead" (liveness far below its cap count).
    if (trim || (enforce_max && num_caps > max_caps_per_client) || (enforce_liveness && cache_liveness < (num_caps>>cache_liveness_magnitude))) {
      caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s));
  mds->sessionmap.get_client_sessions(std::move(f));

  std::pair<bool, uint64_t> result = {false, 0};
  auto& [throttled, caps_recalled] = result;
  last_recall_state = now;
  // Iterate sessions from most caps to fewest.
  for (const auto& [num_caps, session] : boost::adaptors::reverse(caps_session)) {
    if (!session->is_open() ||
        !session->get_connection() ||
        !session->info.inst.name.is_client())

    dout(10) << __func__ << ":"
             << " session " << session->info.inst
             << " caps " << num_caps
             << ", leases " << session->leases.size()

    // Compute the new cap limit, never dropping below min_caps_per_client.
    if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
      newlim = min_caps_per_client;
      newlim = num_caps-recall_max_caps;
    if (num_caps > newlim) {
      /* now limit the number of caps we recall at a time to prevent overloading ourselves */
      uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
      newlim = num_caps-recall;
      const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
      const uint64_t session_recall_throttle2o = session->get_recall_caps_throttle2o();
      const uint64_t global_recall_throttle = recall_throttle.get();
      if (session_recall_throttle+recall > recall_max_decay_threshold) {
        dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
      } else if (session_recall_throttle2o+recall > recall_max_caps*2) {
        dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps << ") hit at " << session_recall_throttle2o << "; skipping!" << dendl;
      } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
        dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;

      // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall

      const auto session_recall = session->get_recall_caps();
      const auto session_release = session->get_release_caps();
      if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
        /* The session has been unable to keep up with the number of caps
         * recalled (by half); additionally, to prevent marking sessions
         * we've just begun to recall from, the session_recall counter
         * (decayed count of caps recently recalled) is **greater** than the
         * session threshold for the session's cap recall throttle.
         */
        dout(15) << " 2*session_release < session_recall"
                    " (2*" << session_release << " < " << session_recall << ") &&"
                    " 2*session_recall < recall_max_decay_threshold"
                    " (2*" << session_recall << " > " << recall_max_decay_threshold << ")"
                    " Skipping because we are unlikely to get more released." << dendl;
      } else if (recall < recall_max_caps && 2*recall < session_recall) {
        /* The number of caps recalled is less than the number we *could*
         * recall (so there isn't much left to recall?) and the number of
         * caps is less than the current recall_caps counter (decayed count
         * of caps recently recalled).
         */
        dout(15) << " 2*recall < session_recall "
                    " (2*" << recall << " < " << session_recall << ") &&"
                    " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
                    " Skipping because we are unlikely to get more released." << dendl;

      dout(7) << " recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;

      // Ask the client to shrink its cap count to newlim.
      auto m = make_message<MClientSession>(CEPH_SESSION_RECALL_STATE);
      m->head.max_caps = newlim;
      mds->send_message_client(m, session);
      flush_session(session, gather);
      caps_recalled += session->notify_recall_sent(newlim);
      recall_throttle.hit(recall);

  dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;
/**
 * Notify every open/stale client session that the filesystem is now
 * read-only by sending CEPH_SESSION_FORCE_RO.
 *
 * NOTE(review): recovered from a lossy extraction — the loop increment
 * '++p)' and a 'continue;' after the filter are missing.
 */
void Server::force_clients_readonly()
  dout(10) << "force_clients_readonly" << dendl;
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (set<Session*>::const_iterator p = sessions.begin();
       p != sessions.end();
    Session *session = *p;
    // Only live client sessions are notified.
    if (!session->info.inst.name.is_client() ||
        !(session->is_open() || session->is_stale()))
    mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
/*
 * some generic stuff for finishing off requests
 */
/**
 * Standard request completion path: record the trace inode/dentry for the
 * eventual reply, attempt an early reply, then journal the event. For
 * replayed ops this also drives the replay queue; otherwise rdlocks taken
 * for an early reply are dropped.
 *
 * NOTE(review): recovered from a lossy extraction — the trace-item setup
 * lines (original 1826-1833) and some braces are missing; verify upstream.
 */
void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
  dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
  ceph_assert(!mdr->has_completed);

  // note trace items for eventual reply.

  early_reply(mdr, in, dn);

  mdr->committing = true;
  submit_mdlog_entry(le, fin, mdr, __func__);

  if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
    if (mds->queue_one_replay()) {
      dout(10) << " queued next replay op" << dendl;
      dout(10) << " journaled last replay op" << dendl;
  } else if (mdr->did_early_reply)
    mds->locker->drop_rdlocks_for_early_reply(mdr.get());
/**
 * Submit a log event with its completion context, tagging the request's
 * event trail with the caller-supplied event name for op tracking.
 *
 * NOTE(review): recovered from a lossy extraction — the guard around mdr
 * and the append of 'event' to event_str are missing; verify upstream.
 */
void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
                                std::string_view event)
    string event_str("submit entry: ");
    mdr->mark_event(event_str);

  mdlog->submit_entry(le, fin);
1863 * send response built from mdr contents and error code; clean up mdr
1865 void Server::respond_to_request(MDRequestRef
& mdr
, int r
)
1867 if (mdr
->client_request
) {
1868 if (mdr
->is_batch_op() && mdr
->is_batch_head
) {
1869 int mask
= mdr
->client_request
->head
.args
.getattr
.mask
;
1871 std::unique_ptr
<BatchOp
> bop
;
1872 if (mdr
->client_request
->get_op() == CEPH_MDS_OP_GETATTR
) {
1873 dout(20) << __func__
<< ": respond other getattr ops. " << *mdr
<< dendl
;
1874 auto it
= mdr
->in
[0]->batch_ops
.find(mask
);
1875 bop
= std::move(it
->second
);
1876 mdr
->in
[0]->batch_ops
.erase(it
);
1878 dout(20) << __func__
<< ": respond other lookup ops. " << *mdr
<< dendl
;
1879 auto it
= mdr
->dn
[0].back()->batch_ops
.find(mask
);
1880 bop
= std::move(it
->second
);
1881 mdr
->dn
[0].back()->batch_ops
.erase(it
);
1886 reply_client_request(mdr
, make_message
<MClientReply
>(*mdr
->client_request
, r
));
1888 } else if (mdr
->internal_op
> -1) {
1889 dout(10) << "respond_to_request on internal request " << mdr
<< dendl
;
1890 if (!mdr
->internal_op_finish
)
1891 ceph_abort_msg("trying to respond to internal op without finisher");
1892 mdr
->internal_op_finish
->complete(r
);
1893 mdcache
->request_finish(mdr
);
1897 // statistics mds req op number and latency
1898 void Server::perf_gather_op_latency(const cref_t
<MClientRequest
> &req
, utime_t lat
)
1900 int code
= l_mdss_first
;
1901 switch(req
->get_op()) {
1902 case CEPH_MDS_OP_LOOKUPHASH
:
1903 code
= l_mdss_req_lookuphash_latency
;
1905 case CEPH_MDS_OP_LOOKUPINO
:
1906 code
= l_mdss_req_lookupino_latency
;
1908 case CEPH_MDS_OP_LOOKUPPARENT
:
1909 code
= l_mdss_req_lookupparent_latency
;
1911 case CEPH_MDS_OP_LOOKUPNAME
:
1912 code
= l_mdss_req_lookupname_latency
;
1914 case CEPH_MDS_OP_LOOKUP
:
1915 code
= l_mdss_req_lookup_latency
;
1917 case CEPH_MDS_OP_LOOKUPSNAP
:
1918 code
= l_mdss_req_lookupsnap_latency
;
1920 case CEPH_MDS_OP_GETATTR
:
1921 code
= l_mdss_req_getattr_latency
;
1923 case CEPH_MDS_OP_SETATTR
:
1924 code
= l_mdss_req_setattr_latency
;
1926 case CEPH_MDS_OP_SETLAYOUT
:
1927 code
= l_mdss_req_setlayout_latency
;
1929 case CEPH_MDS_OP_SETDIRLAYOUT
:
1930 code
= l_mdss_req_setdirlayout_latency
;
1932 case CEPH_MDS_OP_SETXATTR
:
1933 code
= l_mdss_req_setxattr_latency
;
1935 case CEPH_MDS_OP_RMXATTR
:
1936 code
= l_mdss_req_rmxattr_latency
;
1938 case CEPH_MDS_OP_READDIR
:
1939 code
= l_mdss_req_readdir_latency
;
1941 case CEPH_MDS_OP_SETFILELOCK
:
1942 code
= l_mdss_req_setfilelock_latency
;
1944 case CEPH_MDS_OP_GETFILELOCK
:
1945 code
= l_mdss_req_getfilelock_latency
;
1947 case CEPH_MDS_OP_CREATE
:
1948 code
= l_mdss_req_create_latency
;
1950 case CEPH_MDS_OP_OPEN
:
1951 code
= l_mdss_req_open_latency
;
1953 case CEPH_MDS_OP_MKNOD
:
1954 code
= l_mdss_req_mknod_latency
;
1956 case CEPH_MDS_OP_LINK
:
1957 code
= l_mdss_req_link_latency
;
1959 case CEPH_MDS_OP_UNLINK
:
1960 code
= l_mdss_req_unlink_latency
;
1962 case CEPH_MDS_OP_RMDIR
:
1963 code
= l_mdss_req_rmdir_latency
;
1965 case CEPH_MDS_OP_RENAME
:
1966 code
= l_mdss_req_rename_latency
;
1968 case CEPH_MDS_OP_MKDIR
:
1969 code
= l_mdss_req_mkdir_latency
;
1971 case CEPH_MDS_OP_SYMLINK
:
1972 code
= l_mdss_req_symlink_latency
;
1974 case CEPH_MDS_OP_LSSNAP
:
1975 code
= l_mdss_req_lssnap_latency
;
1977 case CEPH_MDS_OP_MKSNAP
:
1978 code
= l_mdss_req_mksnap_latency
;
1980 case CEPH_MDS_OP_RMSNAP
:
1981 code
= l_mdss_req_rmsnap_latency
;
1983 case CEPH_MDS_OP_RENAMESNAP
:
1984 code
= l_mdss_req_renamesnap_latency
;
1986 default: ceph_abort();
1988 logger
->tinc(code
, lat
);
1991 void Server::early_reply(MDRequestRef
& mdr
, CInode
*tracei
, CDentry
*tracedn
)
1993 if (!g_conf()->mds_early_reply
)
1996 if (mdr
->no_early_reply
) {
1997 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl
;
2001 if (mdr
->has_more() && mdr
->more()->has_journaled_slaves
) {
2002 dout(10) << "early_reply - there are journaled slaves, not allowed." << dendl
;
2006 if (mdr
->alloc_ino
) {
2007 dout(10) << "early_reply - allocated ino, not allowed" << dendl
;
2011 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
2012 entity_inst_t client_inst
= req
->get_source_inst();
2013 if (client_inst
.name
.is_mds())
2016 if (req
->is_replay()) {
2017 dout(10) << " no early reply on replay op" << dendl
;
2022 auto reply
= make_message
<MClientReply
>(*req
, 0);
2023 reply
->set_unsafe();
2025 // mark xlocks "done", indicating that we are exposing uncommitted changes.
2027 //_rename_finish() does not send dentry link/unlink message to replicas.
2028 // so do not set xlocks on dentries "done", the xlocks prevent dentries
2029 // that have projected linkages from getting new replica.
2030 mds
->locker
->set_xlocks_done(mdr
.get(), req
->get_op() == CEPH_MDS_OP_RENAME
);
2032 dout(10) << "early_reply " << reply
->get_result()
2033 << " (" << cpp_strerror(reply
->get_result())
2034 << ") " << *req
<< dendl
;
2036 if (tracei
|| tracedn
) {
2038 mdr
->cap_releases
.erase(tracei
->vino());
2040 mdr
->cap_releases
.erase(tracedn
->get_dir()->get_inode()->vino());
2042 set_trace_dist(reply
, tracei
, tracedn
, mdr
);
2045 reply
->set_extra_bl(mdr
->reply_extra_bl
);
2046 mds
->send_message_client(reply
, mdr
->session
);
2048 mdr
->did_early_reply
= true;
2050 mds
->logger
->inc(l_mds_reply
);
2051 utime_t lat
= ceph_clock_now() - req
->get_recv_stamp();
2052 mds
->logger
->tinc(l_mds_reply_latency
, lat
);
2053 if (client_inst
.name
.is_client()) {
2054 mds
->sessionmap
.hit_session(mdr
->session
);
2056 perf_gather_op_latency(req
, lat
);
2057 dout(20) << "lat " << lat
<< dendl
;
2059 mdr
->mark_event("early_replied");
2064 * include a trace to tracei
2067 void Server::reply_client_request(MDRequestRef
& mdr
, const ref_t
<MClientReply
> &reply
)
2069 ceph_assert(mdr
.get());
2070 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
2072 dout(7) << "reply_client_request " << reply
->get_result()
2073 << " (" << cpp_strerror(reply
->get_result())
2074 << ") " << *req
<< dendl
;
2076 mdr
->mark_event("replying");
2078 Session
*session
= mdr
->session
;
2080 // note successful request in session map?
2082 // setfilelock requests are special, they only modify states in MDS memory.
2083 // The states get lost when MDS fails. If Client re-send a completed
2084 // setfilelock request, it means that client did not receive corresponding
2085 // setfilelock reply. So MDS should re-execute the setfilelock request.
2086 if (req
->may_write() && req
->get_op() != CEPH_MDS_OP_SETFILELOCK
&&
2087 reply
->get_result() == 0 && session
) {
2088 inodeno_t created
= mdr
->alloc_ino
? mdr
->alloc_ino
: mdr
->used_prealloc_ino
;
2089 session
->add_completed_request(mdr
->reqid
.tid
, created
);
2091 mdr
->ls
->touched_sessions
.insert(session
->info
.inst
.name
);
2095 // give any preallocated inos to the session
2096 apply_allocated_inos(mdr
, session
);
2098 // get tracei/tracedn from mdr?
2099 CInode
*tracei
= mdr
->tracei
;
2100 CDentry
*tracedn
= mdr
->tracedn
;
2102 bool is_replay
= mdr
->client_request
->is_replay();
2103 bool did_early_reply
= mdr
->did_early_reply
;
2104 entity_inst_t client_inst
= req
->get_source_inst();
2106 if (!did_early_reply
&& !is_replay
) {
2108 mds
->logger
->inc(l_mds_reply
);
2109 utime_t lat
= ceph_clock_now() - mdr
->client_request
->get_recv_stamp();
2110 mds
->logger
->tinc(l_mds_reply_latency
, lat
);
2111 if (session
&& client_inst
.name
.is_client()) {
2112 mds
->sessionmap
.hit_session(session
);
2114 perf_gather_op_latency(req
, lat
);
2115 dout(20) << "lat " << lat
<< dendl
;
2118 mdr
->cap_releases
.erase(tracei
->vino());
2120 mdr
->cap_releases
.erase(tracedn
->get_dir()->get_inode()->vino());
2123 // drop non-rdlocks before replying, so that we can issue leases
2124 mdcache
->request_drop_non_rdlocks(mdr
);
2127 if (session
&& !client_inst
.name
.is_mds()) {
2129 if (!did_early_reply
&& // don't issue leases if we sent an earlier reply already
2130 (tracei
|| tracedn
)) {
2133 mdcache
->try_reconnect_cap(tracei
, session
);
2135 // include metadata in reply
2136 set_trace_dist(reply
, tracei
, tracedn
, mdr
);
2140 // We can set the extra bl unconditionally: if it's already been sent in the
2141 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
2142 reply
->set_extra_bl(mdr
->reply_extra_bl
);
2144 reply
->set_mdsmap_epoch(mds
->mdsmap
->get_epoch());
2145 mds
->send_message_client(reply
, session
);
2148 if (req
->is_queued_for_replay() &&
2149 (mdr
->has_completed
|| reply
->get_result() < 0)) {
2150 if (reply
->get_result() < 0) {
2151 int r
= reply
->get_result();
2152 derr
<< "reply_client_request: failed to replay " << *req
2153 << " error " << r
<< " (" << cpp_strerror(r
) << ")" << dendl
;
2154 mds
->clog
->warn() << "failed to replay " << req
->get_reqid() << " error " << r
;
2156 mds
->queue_one_replay();
2160 mdcache
->request_finish(mdr
);
2162 // take a closer look at tracei, if it happens to be a remote link
2165 tracedn
->get_projected_linkage()->is_remote()) {
2166 mdcache
->eval_remote(tracedn
);
2171 * pass inode OR dentry (not both, or we may get confused)
2173 * trace is in reverse order (i.e. root inode comes last)
2175 void Server::set_trace_dist(const ref_t
<MClientReply
> &reply
,
2176 CInode
*in
, CDentry
*dn
,
2179 // skip doing this for debugging purposes?
2180 if (g_conf()->mds_inject_traceless_reply_probability
&&
2181 mdr
->ls
&& !mdr
->o_trunc
&&
2182 (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability
* 10000.0)) {
2183 dout(5) << "deliberately skipping trace for " << *reply
<< dendl
;
2187 // inode, dentry, dir, ..., inode
2189 mds_rank_t whoami
= mds
->get_nodeid();
2190 Session
*session
= mdr
->session
;
2191 snapid_t snapid
= mdr
->snapid
;
2192 utime_t now
= ceph_clock_now();
2194 dout(20) << "set_trace_dist snapid " << snapid
<< dendl
;
2197 if (snapid
== CEPH_NOSNAP
) {
2200 realm
= in
->find_snaprealm();
2202 realm
= dn
->get_dir()->get_inode()->find_snaprealm();
2203 reply
->snapbl
= realm
->get_snap_trace();
2204 dout(10) << "set_trace_dist snaprealm " << *realm
<< " len=" << reply
->snapbl
.length() << dendl
;
2209 reply
->head
.is_dentry
= 1;
2210 CDir
*dir
= dn
->get_dir();
2211 CInode
*diri
= dir
->get_inode();
2213 diri
->encode_inodestat(bl
, session
, NULL
, snapid
);
2214 dout(20) << "set_trace_dist added diri " << *diri
<< dendl
;
2216 #ifdef MDS_VERIFY_FRAGSTAT
2217 if (dir
->is_complete())
2218 dir
->verify_fragstat();
2221 ds
.frag
= dir
->get_frag();
2222 ds
.auth
= dir
->get_dir_auth().first
;
2223 if (dir
->is_auth() && !mdcache
->forward_all_reqs_to_auth())
2224 dir
->get_dist_spec(ds
.dist
, whoami
);
2226 dir
->encode_dirstat(bl
, session
->info
, ds
);
2227 dout(20) << "set_trace_dist added dir " << *dir
<< dendl
;
2229 encode(dn
->get_name(), bl
);
2232 CDentry::linkage_t
*dnl
= dn
->get_linkage(mdr
->get_client(), mdr
);
2233 if (dnl
->is_primary()) {
2234 ceph_assert(dnl
->get_inode() == in
);
2235 lease_mask
= CEPH_LEASE_PRIMARY_LINK
;
2237 if (dnl
->is_remote())
2238 ceph_assert(dnl
->get_remote_ino() == in
->ino());
2242 mds
->locker
->issue_client_lease(dn
, mdr
, lease_mask
, now
, bl
);
2243 dout(20) << "set_trace_dist added dn " << snapid
<< " " << *dn
<< dendl
;
2245 reply
->head
.is_dentry
= 0;
2249 in
->encode_inodestat(bl
, session
, NULL
, snapid
, 0, mdr
->getattr_caps
);
2250 dout(20) << "set_trace_dist added in " << *in
<< dendl
;
2251 reply
->head
.is_target
= 1;
2253 reply
->head
.is_target
= 0;
2255 reply
->set_trace(bl
);
2258 void Server::handle_client_request(const cref_t
<MClientRequest
> &req
)
2260 dout(4) << "handle_client_request " << *req
<< dendl
;
2263 mds
->logger
->inc(l_mds_request
);
2265 logger
->inc(l_mdss_handle_client_request
);
2267 if (!mdcache
->is_open()) {
2268 dout(5) << "waiting for root" << dendl
;
2269 mdcache
->wait_for_open(new C_MDS_RetryMessage(mds
, req
));
2273 bool sessionclosed_isok
= replay_unsafe_with_closed_session
;
2275 Session
*session
= 0;
2276 if (req
->get_source().is_client()) {
2277 session
= mds
->get_session(req
);
2279 dout(5) << "no session for " << req
->get_source() << ", dropping" << dendl
;
2280 } else if ((session
->is_closed() && (!mds
->is_clientreplay() || !sessionclosed_isok
)) ||
2281 session
->is_closing() ||
2282 session
->is_killing()) {
2283 dout(5) << "session closed|closing|killing, dropping" << dendl
;
2287 if (req
->is_queued_for_replay())
2288 mds
->queue_one_replay();
2294 if (req
->get_mdsmap_epoch() < mds
->mdsmap
->get_epoch()) {
2295 // send it? hrm, this isn't ideal; they may get a lot of copies if
2296 // they have a high request rate.
2299 // completed request?
2300 bool has_completed
= false;
2301 if (req
->is_replay() || req
->get_retry_attempt()) {
2302 ceph_assert(session
);
2304 if (session
->have_completed_request(req
->get_reqid().tid
, &created
)) {
2305 has_completed
= true;
2306 if (!session
->is_open())
2308 // Don't send traceless reply if the completed request has created
2309 // new inode. Treat the request as lookup request instead.
2310 if (req
->is_replay() ||
2311 ((created
== inodeno_t() || !mds
->is_clientreplay()) &&
2312 req
->get_op() != CEPH_MDS_OP_OPEN
&&
2313 req
->get_op() != CEPH_MDS_OP_CREATE
)) {
2314 dout(5) << "already completed " << req
->get_reqid() << dendl
;
2315 auto reply
= make_message
<MClientReply
>(*req
, 0);
2316 if (created
!= inodeno_t()) {
2318 encode(created
, extra
);
2319 reply
->set_extra_bl(extra
);
2321 mds
->send_message_client(reply
, session
);
2323 if (req
->is_queued_for_replay())
2324 mds
->queue_one_replay();
2328 if (req
->get_op() != CEPH_MDS_OP_OPEN
&&
2329 req
->get_op() != CEPH_MDS_OP_CREATE
) {
2330 dout(10) << " completed request which created new inode " << created
2331 << ", convert it to lookup request" << dendl
;
2332 req
->head
.op
= req
->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP
: CEPH_MDS_OP_GETATTR
;
2333 req
->head
.args
.getattr
.mask
= CEPH_STAT_CAP_INODE_ALL
;
2338 // trim completed_request list
2339 if (req
->get_oldest_client_tid() > 0) {
2340 dout(15) << " oldest_client_tid=" << req
->get_oldest_client_tid() << dendl
;
2341 ceph_assert(session
);
2342 if (session
->trim_completed_requests(req
->get_oldest_client_tid())) {
2343 // Sessions 'completed_requests' was dirtied, mark it to be
2344 // potentially flushed at segment expiry.
2345 mdlog
->get_current_segment()->touched_sessions
.insert(session
->info
.inst
.name
);
2347 if (session
->get_num_trim_requests_warnings() > 0 &&
2348 session
->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests
)
2349 session
->reset_num_trim_requests_warnings();
2351 if (session
->get_num_completed_requests() >=
2352 (g_conf()->mds_max_completed_requests
<< session
->get_num_trim_requests_warnings())) {
2353 session
->inc_num_trim_requests_warnings();
2355 ss
<< "client." << session
->get_client() << " does not advance its oldest_client_tid ("
2356 << req
->get_oldest_client_tid() << "), "
2357 << session
->get_num_completed_requests()
2358 << " completed requests recorded in session\n";
2359 mds
->clog
->warn() << ss
.str();
2360 dout(20) << __func__
<< " " << ss
.str() << dendl
;
2365 // register + dispatch
2366 MDRequestRef mdr
= mdcache
->request_start(req
);
2371 mdr
->session
= session
;
2372 session
->requests
.push_back(&mdr
->item_session_request
);
2376 mdr
->has_completed
= true;
2378 // process embedded cap releases?
2379 // (only if NOT replay!)
2380 if (!req
->releases
.empty() && req
->get_source().is_client() && !req
->is_replay()) {
2381 client_t client
= req
->get_source().num();
2382 for (const auto &r
: req
->releases
) {
2383 mds
->locker
->process_request_cap_release(mdr
, client
, r
.item
, r
.dname
);
2385 req
->releases
.clear();
2388 dispatch_client_request(mdr
);
2392 void Server::handle_osd_map()
2394 /* Note that we check the OSDMAP_FULL flag directly rather than
2395 * using osdmap_full_flag(), because we want to know "is the flag set"
2396 * rather than "does the flag apply to us?" */
2397 mds
->objecter
->with_osdmap([this](const OSDMap
& o
) {
2398 auto pi
= o
.get_pg_pool(mds
->mdsmap
->get_metadata_pool());
2399 is_full
= pi
&& pi
->has_flag(pg_pool_t::FLAG_FULL
);
2400 dout(7) << __func__
<< ": full = " << is_full
<< " epoch = "
2401 << o
.get_epoch() << dendl
;
2405 void Server::clear_batch_ops(const MDRequestRef
& mdr
)
2407 int mask
= mdr
->client_request
->head
.args
.getattr
.mask
;
2408 if (mdr
->client_request
->get_op() == CEPH_MDS_OP_GETATTR
&& mdr
->in
[0]) {
2409 mdr
->in
[0]->batch_ops
.erase(mask
);
2410 } else if (mdr
->client_request
->get_op() == CEPH_MDS_OP_LOOKUP
&& mdr
->dn
[0].size()) {
2411 mdr
->dn
[0].back()->batch_ops
.erase(mask
);
2415 void Server::dispatch_client_request(MDRequestRef
& mdr
)
2417 // we shouldn't be waiting on anyone.
2418 ceph_assert(!mdr
->has_more() || mdr
->more()->waiting_on_slave
.empty());
2421 dout(10) << "request " << *mdr
<< " was killed" << dendl
;
2422 //if the mdr is a "batch_op" and it has followers, pick a follower as
2423 //the new "head of the batch ops" and go on processing the new one.
2424 if (mdr
->is_batch_op() && mdr
->is_batch_head
) {
2425 if (!mdr
->batch_reqs
.empty()) {
2426 MDRequestRef new_batch_head
;
2427 for (auto itr
= mdr
->batch_reqs
.cbegin(); itr
!= mdr
->batch_reqs
.cend();) {
2429 itr
= mdr
->batch_reqs
.erase(itr
);
2431 new_batch_head
= req
;
2436 if (!new_batch_head
) {
2437 clear_batch_ops(mdr
);
2441 new_batch_head
->batch_reqs
= std::move(mdr
->batch_reqs
);
2443 mdr
= new_batch_head
;
2444 mdr
->is_batch_head
= true;
2445 int mask
= mdr
->client_request
->head
.args
.getattr
.mask
;
2446 if (mdr
->client_request
->get_op() == CEPH_MDS_OP_GETATTR
) {
2447 auto& fin
= mdr
->in
[0]->batch_ops
[mask
];
2448 fin
->set_request(new_batch_head
);
2449 } else if (mdr
->client_request
->get_op() == CEPH_MDS_OP_LOOKUP
) {
2450 auto& fin
= mdr
->dn
[0].back()->batch_ops
[mask
];
2451 fin
->set_request(new_batch_head
);
2454 clear_batch_ops(mdr
);
2460 } else if (mdr
->aborted
) {
2461 mdr
->aborted
= false;
2462 mdcache
->request_kill(mdr
);
2466 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
2468 if (logger
) logger
->inc(l_mdss_dispatch_client_request
);
2470 dout(7) << "dispatch_client_request " << *req
<< dendl
;
2472 if (req
->may_write() && mdcache
->is_readonly()) {
2473 dout(10) << " read-only FS" << dendl
;
2474 respond_to_request(mdr
, -EROFS
);
2477 if (mdr
->has_more() && mdr
->more()->slave_error
) {
2478 dout(10) << " got error from slaves" << dendl
;
2479 respond_to_request(mdr
, mdr
->more()->slave_error
);
2484 if (req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
2485 req
->get_op() == CEPH_MDS_OP_SETDIRLAYOUT
||
2486 req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
2487 req
->get_op() == CEPH_MDS_OP_RMXATTR
||
2488 req
->get_op() == CEPH_MDS_OP_SETXATTR
||
2489 req
->get_op() == CEPH_MDS_OP_CREATE
||
2490 req
->get_op() == CEPH_MDS_OP_SYMLINK
||
2491 req
->get_op() == CEPH_MDS_OP_MKSNAP
||
2492 ((req
->get_op() == CEPH_MDS_OP_LINK
||
2493 req
->get_op() == CEPH_MDS_OP_RENAME
) &&
2494 (!mdr
->has_more() || mdr
->more()->witnessed
.empty())) // haven't started slave request
2497 dout(20) << __func__
<< ": full, responding ENOSPC to op " << ceph_mds_op_name(req
->get_op()) << dendl
;
2498 respond_to_request(mdr
, -ENOSPC
);
2501 dout(20) << __func__
<< ": full, permitting op " << ceph_mds_op_name(req
->get_op()) << dendl
;
2505 switch (req
->get_op()) {
2506 case CEPH_MDS_OP_LOOKUPHASH
:
2507 case CEPH_MDS_OP_LOOKUPINO
:
2508 handle_client_lookup_ino(mdr
, false, false);
2510 case CEPH_MDS_OP_LOOKUPPARENT
:
2511 handle_client_lookup_ino(mdr
, true, false);
2513 case CEPH_MDS_OP_LOOKUPNAME
:
2514 handle_client_lookup_ino(mdr
, false, true);
2518 case CEPH_MDS_OP_LOOKUP
:
2519 handle_client_getattr(mdr
, true);
2522 case CEPH_MDS_OP_LOOKUPSNAP
:
2523 // lookupsnap does not reference a CDentry; treat it as a getattr
2524 case CEPH_MDS_OP_GETATTR
:
2525 handle_client_getattr(mdr
, false);
2528 case CEPH_MDS_OP_SETATTR
:
2529 handle_client_setattr(mdr
);
2531 case CEPH_MDS_OP_SETLAYOUT
:
2532 handle_client_setlayout(mdr
);
2534 case CEPH_MDS_OP_SETDIRLAYOUT
:
2535 handle_client_setdirlayout(mdr
);
2537 case CEPH_MDS_OP_SETXATTR
:
2538 handle_client_setxattr(mdr
);
2540 case CEPH_MDS_OP_RMXATTR
:
2541 handle_client_removexattr(mdr
);
2544 case CEPH_MDS_OP_READDIR
:
2545 handle_client_readdir(mdr
);
2548 case CEPH_MDS_OP_SETFILELOCK
:
2549 handle_client_file_setlock(mdr
);
2552 case CEPH_MDS_OP_GETFILELOCK
:
2553 handle_client_file_readlock(mdr
);
2557 case CEPH_MDS_OP_CREATE
:
2558 if (mdr
->has_completed
)
2559 handle_client_open(mdr
); // already created.. just open
2561 handle_client_openc(mdr
);
2564 case CEPH_MDS_OP_OPEN
:
2565 handle_client_open(mdr
);
2570 case CEPH_MDS_OP_MKNOD
:
2571 handle_client_mknod(mdr
);
2573 case CEPH_MDS_OP_LINK
:
2574 handle_client_link(mdr
);
2576 case CEPH_MDS_OP_UNLINK
:
2577 case CEPH_MDS_OP_RMDIR
:
2578 handle_client_unlink(mdr
);
2580 case CEPH_MDS_OP_RENAME
:
2581 handle_client_rename(mdr
);
2583 case CEPH_MDS_OP_MKDIR
:
2584 handle_client_mkdir(mdr
);
2586 case CEPH_MDS_OP_SYMLINK
:
2587 handle_client_symlink(mdr
);
2592 case CEPH_MDS_OP_LSSNAP
:
2593 handle_client_lssnap(mdr
);
2595 case CEPH_MDS_OP_MKSNAP
:
2596 handle_client_mksnap(mdr
);
2598 case CEPH_MDS_OP_RMSNAP
:
2599 handle_client_rmsnap(mdr
);
2601 case CEPH_MDS_OP_RENAMESNAP
:
2602 handle_client_renamesnap(mdr
);
2606 dout(1) << " unknown client op " << req
->get_op() << dendl
;
2607 respond_to_request(mdr
, -EOPNOTSUPP
);
2612 // ---------------------------------------
2615 void Server::handle_slave_request(const cref_t
<MMDSSlaveRequest
> &m
)
2617 dout(4) << "handle_slave_request " << m
->get_reqid() << " from " << m
->get_source() << dendl
;
2618 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2620 if (logger
) logger
->inc(l_mdss_handle_slave_request
);
2624 return handle_slave_request_reply(m
);
2626 // the purpose of rename notify is enforcing causal message ordering. making sure
2627 // bystanders have received all messages from rename srcdn's auth MDS.
2628 if (m
->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY
) {
2629 auto reply
= make_message
<MMDSSlaveRequest
>(m
->get_reqid(), m
->get_attempt(), MMDSSlaveRequest::OP_RENAMENOTIFYACK
);
2630 mds
->send_message(reply
, m
->get_connection());
2634 CDentry
*straydn
= NULL
;
2635 if (m
->straybl
.length() > 0) {
2636 mdcache
->decode_replica_stray(straydn
, m
->straybl
, from
);
2637 ceph_assert(straydn
);
2641 if (!mds
->is_clientreplay() && !mds
->is_active() && !mds
->is_stopping()) {
2642 dout(3) << "not clientreplay|active yet, waiting" << dendl
;
2643 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
2647 // am i a new slave?
2649 if (mdcache
->have_request(m
->get_reqid())) {
2651 mdr
= mdcache
->request_get(m
->get_reqid());
2653 // is my request newer?
2654 if (mdr
->attempt
> m
->get_attempt()) {
2655 dout(10) << "local request " << *mdr
<< " attempt " << mdr
->attempt
<< " > " << m
->get_attempt()
2656 << ", dropping " << *m
<< dendl
;
2660 if (mdr
->attempt
< m
->get_attempt()) {
2661 // mine is old, close it out
2662 dout(10) << "local request " << *mdr
<< " attempt " << mdr
->attempt
<< " < " << m
->get_attempt()
2663 << ", closing out" << dendl
;
2664 mdcache
->request_finish(mdr
);
2666 } else if (mdr
->slave_to_mds
!= from
) {
2667 dout(10) << "local request " << *mdr
<< " not slave to mds." << from
<< dendl
;
2671 // may get these while mdr->slave_request is non-null
2672 if (m
->get_op() == MMDSSlaveRequest::OP_DROPLOCKS
) {
2673 mds
->locker
->drop_locks(mdr
.get());
2676 if (m
->get_op() == MMDSSlaveRequest::OP_FINISH
) {
2677 if (m
->is_abort()) {
2678 mdr
->aborted
= true;
2679 if (mdr
->slave_request
) {
2680 // only abort on-going xlock, wrlock and auth pin
2681 ceph_assert(!mdr
->slave_did_prepare());
2683 mdcache
->request_finish(mdr
);
2686 if (m
->inode_export
.length() > 0)
2687 mdr
->more()->inode_import
= m
->inode_export
;
2688 // finish off request.
2689 mdcache
->request_finish(mdr
);
2696 if (m
->get_op() == MMDSSlaveRequest::OP_FINISH
) {
2697 dout(10) << "missing slave request for " << m
->get_reqid()
2698 << " OP_FINISH, must have lost race with a forward" << dendl
;
2701 mdr
= mdcache
->request_start_slave(m
->get_reqid(), m
->get_attempt(), m
);
2702 mdr
->set_op_stamp(m
->op_stamp
);
2704 ceph_assert(mdr
->slave_request
== 0); // only one at a time, please!
2708 mdr
->straydn
= straydn
;
2711 if (mds
->is_clientreplay() && !mds
->mdsmap
->is_clientreplay(from
) &&
2712 mdr
->locks
.empty()) {
2713 dout(3) << "not active yet, waiting" << dendl
;
2714 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
2718 mdr
->reset_slave_request(m
);
2720 dispatch_slave_request(mdr
);
2723 void Server::handle_slave_request_reply(const cref_t
<MMDSSlaveRequest
> &m
)
2725 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2727 if (!mds
->is_clientreplay() && !mds
->is_active() && !mds
->is_stopping()) {
2728 metareqid_t r
= m
->get_reqid();
2729 if (!mdcache
->have_uncommitted_master(r
, from
)) {
2730 dout(10) << "handle_slave_request_reply ignoring slave reply from mds."
2731 << from
<< " reqid " << r
<< dendl
;
2734 dout(3) << "not clientreplay|active yet, waiting" << dendl
;
2735 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
2739 if (m
->get_op() == MMDSSlaveRequest::OP_COMMITTED
) {
2740 metareqid_t r
= m
->get_reqid();
2741 mdcache
->committed_master_slave(r
, from
);
2745 MDRequestRef mdr
= mdcache
->request_get(m
->get_reqid());
2746 if (m
->get_attempt() != mdr
->attempt
) {
2747 dout(10) << "handle_slave_request_reply " << *mdr
<< " ignoring reply from other attempt "
2748 << m
->get_attempt() << dendl
;
2752 switch (m
->get_op()) {
2753 case MMDSSlaveRequest::OP_XLOCKACK
:
2755 // identify lock, master request
2756 SimpleLock
*lock
= mds
->locker
->get_lock(m
->get_lock_type(),
2757 m
->get_object_info());
2758 mdr
->more()->slaves
.insert(from
);
2759 lock
->decode_locked_state(m
->get_lock_data());
2760 dout(10) << "got remote xlock on " << *lock
<< " on " << *lock
->get_parent() << dendl
;
2761 mdr
->emplace_lock(lock
, MutationImpl::LockOp::XLOCK
);
2762 mdr
->finish_locking(lock
);
2763 lock
->get_xlock(mdr
, mdr
->get_client());
2765 ceph_assert(mdr
->more()->waiting_on_slave
.count(from
));
2766 mdr
->more()->waiting_on_slave
.erase(from
);
2767 ceph_assert(mdr
->more()->waiting_on_slave
.empty());
2768 mdcache
->dispatch_request(mdr
);
2772 case MMDSSlaveRequest::OP_WRLOCKACK
:
2774 // identify lock, master request
2775 SimpleLock
*lock
= mds
->locker
->get_lock(m
->get_lock_type(),
2776 m
->get_object_info());
2777 mdr
->more()->slaves
.insert(from
);
2778 dout(10) << "got remote wrlock on " << *lock
<< " on " << *lock
->get_parent() << dendl
;
2779 auto it
= mdr
->emplace_lock(lock
, MutationImpl::LockOp::REMOTE_WRLOCK
, from
);
2780 ceph_assert(it
->is_remote_wrlock());
2781 ceph_assert(it
->wrlock_target
== from
);
2783 mdr
->finish_locking(lock
);
2785 ceph_assert(mdr
->more()->waiting_on_slave
.count(from
));
2786 mdr
->more()->waiting_on_slave
.erase(from
);
2787 ceph_assert(mdr
->more()->waiting_on_slave
.empty());
2788 mdcache
->dispatch_request(mdr
);
2792 case MMDSSlaveRequest::OP_AUTHPINACK
:
2793 handle_slave_auth_pin_ack(mdr
, m
);
2796 case MMDSSlaveRequest::OP_LINKPREPACK
:
2797 handle_slave_link_prep_ack(mdr
, m
);
2800 case MMDSSlaveRequest::OP_RMDIRPREPACK
:
2801 handle_slave_rmdir_prep_ack(mdr
, m
);
2804 case MMDSSlaveRequest::OP_RENAMEPREPACK
:
2805 handle_slave_rename_prep_ack(mdr
, m
);
2808 case MMDSSlaveRequest::OP_RENAMENOTIFYACK
:
2809 handle_slave_rename_notify_ack(mdr
, m
);
2817 void Server::dispatch_slave_request(MDRequestRef
& mdr
)
2819 dout(7) << "dispatch_slave_request " << *mdr
<< " " << *mdr
->slave_request
<< dendl
;
2822 dout(7) << " abort flag set, finishing" << dendl
;
2823 mdcache
->request_finish(mdr
);
2827 if (logger
) logger
->inc(l_mdss_dispatch_slave_request
);
2829 int op
= mdr
->slave_request
->get_op();
2831 case MMDSSlaveRequest::OP_XLOCK
:
2832 case MMDSSlaveRequest::OP_WRLOCK
:
2835 SimpleLock
*lock
= mds
->locker
->get_lock(mdr
->slave_request
->get_lock_type(),
2836 mdr
->slave_request
->get_object_info());
2839 dout(10) << "don't have object, dropping" << dendl
;
2840 ceph_abort(); // can this happen, if we auth pinned properly.
2842 if (op
== MMDSSlaveRequest::OP_XLOCK
&& !lock
->get_parent()->is_auth()) {
2843 dout(10) << "not auth for remote xlock attempt, dropping on "
2844 << *lock
<< " on " << *lock
->get_parent() << dendl
;
2846 // use acquire_locks so that we get auth_pinning.
2847 MutationImpl::LockOpVec lov
;
2848 for (const auto& p
: mdr
->locks
) {
2850 lov
.add_xlock(p
.lock
);
2851 else if (p
.is_wrlock())
2852 lov
.add_wrlock(p
.lock
);
2857 case MMDSSlaveRequest::OP_XLOCK
:
2858 lov
.add_xlock(lock
);
2859 replycode
= MMDSSlaveRequest::OP_XLOCKACK
;
2861 case MMDSSlaveRequest::OP_WRLOCK
:
2862 lov
.add_wrlock(lock
);
2863 replycode
= MMDSSlaveRequest::OP_WRLOCKACK
;
2867 if (!mds
->locker
->acquire_locks(mdr
, lov
))
2871 auto r
= make_message
<MMDSSlaveRequest
>(mdr
->reqid
, mdr
->attempt
, replycode
);
2872 r
->set_lock_type(lock
->get_type());
2873 lock
->get_parent()->set_object_info(r
->get_object_info());
2874 if (replycode
== MMDSSlaveRequest::OP_XLOCKACK
)
2875 lock
->encode_locked_state(r
->get_lock_data());
2876 mds
->send_message(r
, mdr
->slave_request
->get_connection());
2880 mdr
->reset_slave_request();
2884 case MMDSSlaveRequest::OP_UNXLOCK
:
2885 case MMDSSlaveRequest::OP_UNWRLOCK
:
2887 SimpleLock
*lock
= mds
->locker
->get_lock(mdr
->slave_request
->get_lock_type(),
2888 mdr
->slave_request
->get_object_info());
2890 auto it
= mdr
->locks
.find(lock
);
2891 ceph_assert(it
!= mdr
->locks
.end());
2892 bool need_issue
= false;
2894 case MMDSSlaveRequest::OP_UNXLOCK
:
2895 mds
->locker
->xlock_finish(it
, mdr
.get(), &need_issue
);
2897 case MMDSSlaveRequest::OP_UNWRLOCK
:
2898 mds
->locker
->wrlock_finish(it
, mdr
.get(), &need_issue
);
2902 mds
->locker
->issue_caps(static_cast<CInode
*>(lock
->get_parent()));
2904 // done. no ack necessary.
2905 mdr
->reset_slave_request();
2909 case MMDSSlaveRequest::OP_AUTHPIN
:
2910 handle_slave_auth_pin(mdr
);
2913 case MMDSSlaveRequest::OP_LINKPREP
:
2914 case MMDSSlaveRequest::OP_UNLINKPREP
:
2915 handle_slave_link_prep(mdr
);
2918 case MMDSSlaveRequest::OP_RMDIRPREP
:
2919 handle_slave_rmdir_prep(mdr
);
2922 case MMDSSlaveRequest::OP_RENAMEPREP
:
2923 handle_slave_rename_prep(mdr
);
2931 void Server::handle_slave_auth_pin(MDRequestRef
& mdr
)
2933 dout(10) << "handle_slave_auth_pin " << *mdr
<< dendl
;
2935 // build list of objects
2936 list
<MDSCacheObject
*> objects
;
2937 CInode
*auth_pin_freeze
= NULL
;
2938 bool nonblocking
= mdr
->slave_request
->is_nonblocking();
2939 bool fail
= false, wouldblock
= false, readonly
= false;
2940 ref_t
<MMDSSlaveRequest
> reply
;
2942 if (mdcache
->is_readonly()) {
2943 dout(10) << " read-only FS" << dendl
;
2949 for (const auto &oi
: mdr
->slave_request
->get_authpins()) {
2950 MDSCacheObject
*object
= mdcache
->get_object(oi
);
2952 dout(10) << " don't have " << oi
<< dendl
;
2957 objects
.push_back(object
);
2958 if (oi
== mdr
->slave_request
->get_authpin_freeze())
2959 auth_pin_freeze
= static_cast<CInode
*>(object
);
2963 // can we auth pin them?
2965 for (const auto& obj
: objects
) {
2966 if (!obj
->is_auth()) {
2967 dout(10) << " not auth for " << *obj
<< dendl
;
2971 if (mdr
->is_auth_pinned(obj
))
2973 if (!mdr
->can_auth_pin(obj
)) {
2975 dout(10) << " can't auth_pin (freezing?) " << *obj
<< " nonblocking" << dendl
;
2981 dout(10) << " waiting for authpinnable on " << *obj
<< dendl
;
2982 obj
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
2983 mdr
->drop_local_auth_pins();
2985 mds
->locker
->notify_freeze_waiter(obj
);
2992 /* freeze authpin wrong inode */
2993 if (mdr
->has_more() && mdr
->more()->is_freeze_authpin
&&
2994 mdr
->more()->rename_inode
!= auth_pin_freeze
)
2995 mdr
->unfreeze_auth_pin(true);
2997 /* handle_slave_rename_prep() call freeze_inode() to wait for all other operations
2998 * on the source inode to complete. This happens after all locks for the rename
2999 * operation are acquired. But to acquire locks, we need auth pin locks' parent
3000 * objects first. So there is an ABBA deadlock if someone auth pins the source inode
3001 * after locks are acquired and before Server::handle_slave_rename_prep() is called.
3002 * The solution is freeze the inode and prevent other MDRequests from getting new
3005 if (auth_pin_freeze
) {
3006 dout(10) << " freezing auth pin on " << *auth_pin_freeze
<< dendl
;
3007 if (!mdr
->freeze_auth_pin(auth_pin_freeze
)) {
3008 auth_pin_freeze
->add_waiter(CInode::WAIT_FROZEN
, new C_MDS_RetryRequest(mdcache
, mdr
));
3009 mds
->mdlog
->flush();
3015 reply
= make_message
<MMDSSlaveRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_AUTHPINACK
);
3018 mdr
->drop_local_auth_pins(); // just in case
3020 reply
->mark_error_rofs();
3022 reply
->mark_error_wouldblock();
3025 for (const auto& obj
: objects
) {
3026 dout(10) << "auth_pinning " << *obj
<< dendl
;
3029 // return list of my auth_pins (if any)
3030 for (const auto &p
: mdr
->object_states
) {
3031 if (!p
.second
.auth_pinned
)
3033 MDSCacheObjectInfo info
;
3034 p
.first
->set_object_info(info
);
3035 reply
->get_authpins().push_back(info
);
3036 if (p
.first
== (MDSCacheObject
*)auth_pin_freeze
)
3037 auth_pin_freeze
->set_object_info(reply
->get_authpin_freeze());
3041 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
3043 // clean up this request
3044 mdr
->reset_slave_request();
3048 if (mdr
->slave_request
->should_notify_blocking()) {
3049 reply
= make_message
<MMDSSlaveRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_AUTHPINACK
);
3050 reply
->mark_req_blocked();
3051 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
3052 mdr
->slave_request
->clear_notify_blocking();
3057 void Server::handle_slave_auth_pin_ack(MDRequestRef
& mdr
, const cref_t
<MMDSSlaveRequest
> &ack
)
3059 dout(10) << "handle_slave_auth_pin_ack on " << *mdr
<< " " << *ack
<< dendl
;
3060 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
3062 if (ack
->is_req_blocked()) {
3063 mdr
->disable_lock_cache();
3064 // slave auth pin is blocked, drop locks to avoid deadlock
3065 mds
->locker
->drop_locks(mdr
.get(), nullptr);
3070 set
<MDSCacheObject
*> pinned
;
3071 for (const auto &oi
: ack
->get_authpins()) {
3072 MDSCacheObject
*object
= mdcache
->get_object(oi
);
3073 ceph_assert(object
); // we pinned it
3074 dout(10) << " remote has pinned " << *object
<< dendl
;
3075 mdr
->set_remote_auth_pinned(object
, from
);
3076 if (oi
== ack
->get_authpin_freeze())
3077 mdr
->set_remote_frozen_auth_pin(static_cast<CInode
*>(object
));
3078 pinned
.insert(object
);
3081 // removed frozen auth pin ?
3082 if (mdr
->more()->is_remote_frozen_authpin
&&
3083 ack
->get_authpin_freeze() == MDSCacheObjectInfo()) {
3084 auto stat_p
= mdr
->find_object_state(mdr
->more()->rename_inode
);
3085 ceph_assert(stat_p
);
3086 if (stat_p
->remote_auth_pinned
== from
) {
3087 mdr
->more()->is_remote_frozen_authpin
= false;
3091 // removed auth pins?
3092 for (auto& p
: mdr
->object_states
) {
3093 if (p
.second
.remote_auth_pinned
== MDS_RANK_NONE
)
3095 MDSCacheObject
* object
= p
.first
;
3096 if (p
.second
.remote_auth_pinned
== from
&& pinned
.count(object
) == 0) {
3097 dout(10) << " remote has unpinned " << *object
<< dendl
;
3098 mdr
->_clear_remote_auth_pinned(p
.second
);
3103 mdr
->more()->slaves
.insert(from
);
3105 // clear from waiting list
3106 auto ret
= mdr
->more()->waiting_on_slave
.erase(from
);
3109 if (ack
->is_error_rofs()) {
3110 mdr
->more()->slave_error
= -EROFS
;
3111 } else if (ack
->is_error_wouldblock()) {
3112 mdr
->more()->slave_error
= -EWOULDBLOCK
;
3116 if (mdr
->more()->waiting_on_slave
.empty())
3117 mdcache
->dispatch_request(mdr
);
3119 dout(10) << "still waiting on slaves " << mdr
->more()->waiting_on_slave
<< dendl
;
3123 // ---------------------------------------
3128 * check whether we are permitted to complete a request
3130 * Check whether we have permission to perform the operation specified
3131 * by mask on the given inode, based on the capability in the mdr's
3134 bool Server::check_access(MDRequestRef
& mdr
, CInode
*in
, unsigned mask
)
3137 int r
= mdr
->session
->check_access(
3139 mdr
->client_request
->get_caller_uid(),
3140 mdr
->client_request
->get_caller_gid(),
3141 &mdr
->client_request
->get_caller_gid_list(),
3142 mdr
->client_request
->head
.args
.setattr
.uid
,
3143 mdr
->client_request
->head
.args
.setattr
.gid
);
3145 respond_to_request(mdr
, r
);
3153 * check whether fragment has reached maximum size
3156 bool Server::check_fragment_space(MDRequestRef
&mdr
, CDir
*in
)
3158 const auto size
= in
->get_frag_size();
3159 if (size
>= g_conf()->mds_bal_fragment_size_max
) {
3160 dout(10) << "fragment " << *in
<< " size exceeds " << g_conf()->mds_bal_fragment_size_max
<< " (ENOSPC)" << dendl
;
3161 respond_to_request(mdr
, -ENOSPC
);
3168 CDentry
* Server::prepare_stray_dentry(MDRequestRef
& mdr
, CInode
*in
)
3170 CDentry
*straydn
= mdr
->straydn
;
3173 in
->name_stray_dentry(straydname
);
3174 ceph_assert(straydn
->get_name() == straydname
);
3178 CDir
*straydir
= mdcache
->get_stray_dir(in
);
3180 if (!mdr
->client_request
->is_replay() &&
3181 !check_fragment_space(mdr
, straydir
))
3184 straydn
= mdcache
->get_or_create_stray_dentry(in
);
3185 mdr
->straydn
= straydn
;
3190 /** prepare_new_inode
3192 * create a new inode. set c/m/atime. hit dir pop.
3194 CInode
* Server::prepare_new_inode(MDRequestRef
& mdr
, CDir
*dir
, inodeno_t useino
, unsigned mode
,
3195 file_layout_t
*layout
)
3197 CInode
*in
= new CInode(mdcache
);
3199 // Server::prepare_force_open_sessions() can re-open session in closing
3200 // state. In that corner case, session's prealloc_inos are being freed.
3201 // To simplify the code, we disallow using/refilling session's prealloc_ino
3202 // while session is opening.
3203 bool allow_prealloc_inos
= mdr
->session
->is_open();
3206 if (allow_prealloc_inos
&& (mdr
->used_prealloc_ino
= in
->inode
.ino
= mdr
->session
->take_ino(useino
))) {
3207 mds
->sessionmap
.mark_projected(mdr
->session
);
3208 dout(10) << "prepare_new_inode used_prealloc " << mdr
->used_prealloc_ino
3209 << " (" << mdr
->session
->info
.prealloc_inos
3210 << ", " << mdr
->session
->info
.prealloc_inos
.size() << " left)"
3214 in
->inode
.ino
= mds
->inotable
->project_alloc_id(useino
);
3215 dout(10) << "prepare_new_inode alloc " << mdr
->alloc_ino
<< dendl
;
3218 if (useino
&& useino
!= in
->inode
.ino
) {
3219 dout(0) << "WARNING: client specified " << useino
<< " and i allocated " << in
->inode
.ino
<< dendl
;
3220 mds
->clog
->error() << mdr
->client_request
->get_source()
3221 << " specified ino " << useino
3222 << " but mds." << mds
->get_nodeid() << " allocated " << in
->inode
.ino
;
3223 //ceph_abort(); // just for now.
3226 if (allow_prealloc_inos
&&
3227 mdr
->session
->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos
/ 2) {
3228 int need
= g_conf()->mds_client_prealloc_inos
- mdr
->session
->get_num_projected_prealloc_inos();
3229 mds
->inotable
->project_alloc_ids(mdr
->prealloc_inos
, need
);
3230 ceph_assert(mdr
->prealloc_inos
.size()); // or else fix projected increment semantics
3231 mdr
->session
->pending_prealloc_inos
.insert(mdr
->prealloc_inos
);
3232 mds
->sessionmap
.mark_projected(mdr
->session
);
3233 dout(10) << "prepare_new_inode prealloc " << mdr
->prealloc_inos
<< dendl
;
3236 in
->inode
.version
= 1;
3237 in
->inode
.xattr_version
= 1;
3238 in
->inode
.nlink
= 1; // FIXME
3240 in
->inode
.mode
= mode
;
3242 // FIPS zeroization audit 20191117: this memset is not security related.
3243 memset(&in
->inode
.dir_layout
, 0, sizeof(in
->inode
.dir_layout
));
3244 if (in
->inode
.is_dir()) {
3245 in
->inode
.dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
3246 } else if (layout
) {
3247 in
->inode
.layout
= *layout
;
3249 in
->inode
.layout
= mdcache
->default_file_layout
;
3252 in
->inode
.truncate_size
= -1ull; // not truncated, yet!
3253 in
->inode
.truncate_seq
= 1; /* starting with 1, 0 is kept for no-truncation logic */
3255 CInode
*diri
= dir
->get_inode();
3257 dout(10) << oct
<< " dir mode 0" << diri
->inode
.mode
<< " new mode 0" << mode
<< dec
<< dendl
;
3259 if (diri
->inode
.mode
& S_ISGID
) {
3260 dout(10) << " dir is sticky" << dendl
;
3261 in
->inode
.gid
= diri
->inode
.gid
;
3262 if (S_ISDIR(mode
)) {
3263 dout(10) << " new dir also sticky" << dendl
;
3264 in
->inode
.mode
|= S_ISGID
;
3267 in
->inode
.gid
= mdr
->client_request
->get_caller_gid();
3269 in
->inode
.uid
= mdr
->client_request
->get_caller_uid();
3271 in
->inode
.btime
= in
->inode
.ctime
= in
->inode
.mtime
= in
->inode
.atime
=
3272 mdr
->get_op_stamp();
3274 in
->inode
.change_attr
= 0;
3276 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
3277 if (req
->get_data().length()) {
3278 auto p
= req
->get_data().cbegin();
3280 // xattrs on new inode?
3281 CInode::mempool_xattr_map xattrs
;
3282 decode_noshare(xattrs
, p
);
3283 for (const auto &p
: xattrs
) {
3284 dout(10) << "prepare_new_inode setting xattr " << p
.first
<< dendl
;
3285 auto em
= in
->xattrs
.emplace(std::piecewise_construct
, std::forward_as_tuple(p
.first
), std::forward_as_tuple(p
.second
));
3287 em
.first
->second
= p
.second
;
3291 if (!mds
->mdsmap
->get_inline_data_enabled() ||
3292 !mdr
->session
->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
))
3293 in
->inode
.inline_data
.version
= CEPH_INLINE_NONE
;
3295 mdcache
->add_inode(in
); // add
3296 dout(10) << "prepare_new_inode " << *in
<< dendl
;
3300 void Server::journal_allocated_inos(MDRequestRef
& mdr
, EMetaBlob
*blob
)
3302 dout(20) << "journal_allocated_inos sessionmapv " << mds
->sessionmap
.get_projected()
3303 << " inotablev " << mds
->inotable
->get_projected_version()
3305 blob
->set_ino_alloc(mdr
->alloc_ino
,
3306 mdr
->used_prealloc_ino
,
3308 mdr
->client_request
->get_source(),
3309 mds
->sessionmap
.get_projected(),
3310 mds
->inotable
->get_projected_version());
3313 void Server::apply_allocated_inos(MDRequestRef
& mdr
, Session
*session
)
3315 dout(10) << "apply_allocated_inos " << mdr
->alloc_ino
3316 << " / " << mdr
->prealloc_inos
3317 << " / " << mdr
->used_prealloc_ino
<< dendl
;
3319 if (mdr
->alloc_ino
) {
3320 mds
->inotable
->apply_alloc_id(mdr
->alloc_ino
);
3322 if (mdr
->prealloc_inos
.size()) {
3323 ceph_assert(session
);
3324 session
->pending_prealloc_inos
.subtract(mdr
->prealloc_inos
);
3325 session
->info
.prealloc_inos
.insert(mdr
->prealloc_inos
);
3326 mds
->sessionmap
.mark_dirty(session
, !mdr
->used_prealloc_ino
);
3327 mds
->inotable
->apply_alloc_ids(mdr
->prealloc_inos
);
3329 if (mdr
->used_prealloc_ino
) {
3330 ceph_assert(session
);
3331 session
->info
.used_inos
.erase(mdr
->used_prealloc_ino
);
3332 mds
->sessionmap
.mark_dirty(session
);
3336 class C_MDS_TryFindInode
: public ServerContext
{
3339 C_MDS_TryFindInode(Server
*s
, MDRequestRef
& r
) : ServerContext(s
), mdr(r
) {}
3340 void finish(int r
) override
{
3341 if (r
== -ESTALE
) // :( find_ino_peers failed
3342 server
->respond_to_request(mdr
, r
);
3344 server
->dispatch_client_request(mdr
);
3348 class CF_MDS_MDRContextFactory
: public MDSContextFactory
{
3350 CF_MDS_MDRContextFactory(MDCache
*cache
, MDRequestRef
&mdr
, bool dl
) :
3351 mdcache(cache
), mdr(mdr
), drop_locks(dl
) {}
3352 MDSContext
*build() {
3354 mdcache
->mds
->locker
->drop_locks(mdr
.get(), nullptr);
3355 mdr
->drop_local_auth_pins();
3357 return new C_MDS_RetryRequest(mdcache
, mdr
);
3365 /* If this returns null, the request has been handled
3366 * as appropriate: forwarded on, or the client's been replied to */
3367 CInode
* Server::rdlock_path_pin_ref(MDRequestRef
& mdr
,
3371 const filepath
& refpath
= mdr
->get_filepath();
3372 dout(10) << "rdlock_path_pin_ref " << *mdr
<< " " << refpath
<< dendl
;
3374 if (mdr
->locking_state
& MutationImpl::PATH_LOCKED
)
3378 CF_MDS_MDRContextFactory
cf(mdcache
, mdr
, true);
3380 if (refpath
.is_last_snap()) {
3384 flags
|= MDS_TRAVERSE_RDLOCK_PATH
| MDS_TRAVERSE_RDLOCK_SNAP
;
3387 flags
|= MDS_TRAVERSE_WANT_AUTH
;
3388 int r
= mdcache
->path_traverse(mdr
, cf
, refpath
, flags
, &mdr
->dn
[0], &mdr
->in
[0]);
3390 return nullptr; // delayed
3391 if (r
< 0) { // error
3392 if (r
== -ENOENT
&& !mdr
->dn
[0].empty()) {
3393 if (mdr
->client_request
&&
3394 mdr
->client_request
->get_dentry_wanted())
3395 mdr
->tracedn
= mdr
->dn
[0].back();
3396 respond_to_request(mdr
, r
);
3397 } else if (r
== -ESTALE
) {
3398 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
3399 MDSContext
*c
= new C_MDS_TryFindInode(this, mdr
);
3400 mdcache
->find_ino_peers(refpath
.get_ino(), c
);
3402 dout(10) << "FAIL on error " << r
<< dendl
;
3403 respond_to_request(mdr
, r
);
3407 CInode
*ref
= mdr
->in
[0];
3408 dout(10) << "ref is " << *ref
<< dendl
;
3412 // do NOT proceed if freezing, as cap release may defer in that case, and
3413 // we could deadlock when we try to lock @ref.
3414 // if we're already auth_pinned, continue; the release has already been processed.
3415 if (ref
->is_frozen() || ref
->is_frozen_auth_pin() ||
3416 (ref
->is_freezing() && !mdr
->is_auth_pinned(ref
))) {
3417 dout(7) << "waiting for !frozen/authpinnable on " << *ref
<< dendl
;
3418 ref
->add_waiter(CInode::WAIT_UNFREEZE
, cf
.build());
3419 if (mdr
->is_any_remote_auth_pin())
3420 mds
->locker
->notify_freeze_waiter(ref
);
3432 /** rdlock_path_xlock_dentry
3433 * traverse path to the directory that could/would contain dentry.
3434 * make sure i am auth for that dentry, forward as necessary.
3435 * create null dentry in place (or use existing if okexist).
3436 * get rdlocks on traversed dentries, xlock on new dentry.
3438 CDentry
* Server::rdlock_path_xlock_dentry(MDRequestRef
& mdr
,
3439 bool create
, bool okexist
, bool want_layout
)
3441 const filepath
& refpath
= mdr
->get_filepath();
3442 dout(10) << "rdlock_path_xlock_dentry " << *mdr
<< " " << refpath
<< dendl
;
3444 if (mdr
->locking_state
& MutationImpl::PATH_LOCKED
)
3445 return mdr
->dn
[0].back();
3447 // figure parent dir vs dname
3448 if (refpath
.depth() == 0) {
3449 dout(7) << "invalid path (zero length)" << dendl
;
3450 respond_to_request(mdr
, -EINVAL
);
3454 if (refpath
.is_last_snap()) {
3455 respond_to_request(mdr
, -EROFS
);
3459 if (refpath
.is_last_dot_or_dotdot()) {
3460 dout(7) << "invalid path (last dot or dot_dot)" << dendl
;
3462 respond_to_request(mdr
, -EEXIST
);
3464 respond_to_request(mdr
, -ENOTEMPTY
);
3468 // traverse to parent dir
3469 CF_MDS_MDRContextFactory
cf(mdcache
, mdr
, true);
3470 int flags
= MDS_TRAVERSE_RDLOCK_SNAP
| MDS_TRAVERSE_RDLOCK_PATH
|
3471 MDS_TRAVERSE_WANT_DENTRY
| MDS_TRAVERSE_XLOCK_DENTRY
|
3472 MDS_TRAVERSE_WANT_AUTH
;
3473 if (refpath
.depth() == 1 && !mdr
->lock_cache_disabled
)
3474 flags
|= MDS_TRAVERSE_CHECK_LOCKCACHE
;
3476 flags
|= MDS_TRAVERSE_RDLOCK_AUTHLOCK
;
3478 flags
|= MDS_TRAVERSE_WANT_DIRLAYOUT
;
3479 int r
= mdcache
->path_traverse(mdr
, cf
, refpath
, flags
, &mdr
->dn
[0]);
3481 return nullptr; // delayed
3484 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
3485 mdcache
->find_ino_peers(refpath
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
3488 respond_to_request(mdr
, r
);
3492 CDentry
*dn
= mdr
->dn
[0].back();
3493 CDir
*dir
= dn
->get_dir();
3494 CInode
*diri
= dir
->get_inode();
3496 if (!mdr
->reqid
.name
.is_mds()) {
3497 if (diri
->is_system() && !diri
->is_root()) {
3498 respond_to_request(mdr
, -EROFS
);
3503 if (!diri
->is_base() && diri
->get_projected_parent_dir()->inode
->is_stray()) {
3504 respond_to_request(mdr
, -ENOENT
);
3508 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
3509 if (dnl
->is_null()) {
3510 if (!create
&& okexist
) {
3511 respond_to_request(mdr
, -ENOENT
);
3515 snapid_t next_snap
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
3516 dn
->first
= std::max(dn
->first
, next_snap
);
3519 respond_to_request(mdr
, -EEXIST
);
3522 mdr
->in
[0] = dnl
->get_inode();
3528 /** rdlock_two_paths_xlock_destdn
3529 * traverse two paths and lock the two paths in proper order.
3530 * The order of taking locks is:
3531 * 1. Lock directory inodes or dentries according to which trees they
3532 * are under. Lock objects under fs root before objects under mdsdir.
3533 * 2. Lock directory inodes or dentries according to their depth, in
3535 * 3. Lock directory inodes or dentries according to inode numbers or
3536 * dentries' parent inode numbers, in ascending order.
3537 * 4. Lock dentries in the same directory in order of their keys.
3538 * 5. Lock non-directory inodes according to inode numbers, in ascending
3541 std::pair
<CDentry
*, CDentry
*>
3542 Server::rdlock_two_paths_xlock_destdn(MDRequestRef
& mdr
, bool xlock_srcdn
)
3545 const filepath
& refpath
= mdr
->get_filepath();
3546 const filepath
& refpath2
= mdr
->get_filepath2();
3548 dout(10) << "rdlock_two_paths_xlock_destdn " << *mdr
<< " " << refpath
<< " " << refpath2
<< dendl
;
3550 if (mdr
->locking_state
& MutationImpl::PATH_LOCKED
)
3551 return std::make_pair(mdr
->dn
[0].back(), mdr
->dn
[1].back());
3553 if (refpath
.depth() != 1 || refpath2
.depth() != 1) {
3554 respond_to_request(mdr
, -EINVAL
);
3555 return std::pair
<CDentry
*, CDentry
*>(nullptr, nullptr);
3558 if (refpath
.is_last_snap() || refpath2
.is_last_snap()) {
3559 respond_to_request(mdr
, -EROFS
);
3560 return std::make_pair(nullptr, nullptr);
3563 // traverse to parent dir
3564 CF_MDS_MDRContextFactory
cf(mdcache
, mdr
, true);
3565 int flags
= MDS_TRAVERSE_RDLOCK_SNAP
| MDS_TRAVERSE_WANT_DENTRY
| MDS_TRAVERSE_WANT_AUTH
;
3566 int r
= mdcache
->path_traverse(mdr
, cf
, refpath
, flags
, &mdr
->dn
[0]);
3569 dout(10) << "ESTALE on path, attempting recovery" << dendl
;
3570 mdcache
->find_ino_peers(refpath
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
3572 respond_to_request(mdr
, r
);
3574 return std::make_pair(nullptr, nullptr);
3577 flags
= MDS_TRAVERSE_RDLOCK_SNAP2
| MDS_TRAVERSE_WANT_DENTRY
| MDS_TRAVERSE_DISCOVER
;
3578 r
= mdcache
->path_traverse(mdr
, cf
, refpath2
, flags
, &mdr
->dn
[1]);
3581 dout(10) << "ESTALE on path2, attempting recovery" << dendl
;
3582 mdcache
->find_ino_peers(refpath2
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
3584 respond_to_request(mdr
, r
);
3586 return std::make_pair(nullptr, nullptr);
3589 CDentry
*srcdn
= mdr
->dn
[1].back();
3590 CDir
*srcdir
= srcdn
->get_dir();
3591 CDentry
*destdn
= mdr
->dn
[0].back();
3592 CDir
*destdir
= destdn
->get_dir();
3594 if (!mdr
->reqid
.name
.is_mds()) {
3595 if ((srcdir
->get_inode()->is_system() && !srcdir
->get_inode()->is_root()) ||
3596 (destdir
->get_inode()->is_system() && !destdir
->get_inode()->is_root())) {
3597 respond_to_request(mdr
, -EROFS
);
3598 return std::make_pair(nullptr, nullptr);
3602 if (!destdir
->get_inode()->is_base() &&
3603 destdir
->get_inode()->get_projected_parent_dir()->inode
->is_stray()) {
3604 respond_to_request(mdr
, -ENOENT
);
3605 return std::make_pair(nullptr, nullptr);
3608 MutationImpl::LockOpVec lov
;
3609 if (srcdir
->get_inode() == destdir
->get_inode()) {
3610 lov
.add_wrlock(&destdir
->inode
->filelock
);
3611 lov
.add_wrlock(&destdir
->inode
->nestlock
);
3612 if (xlock_srcdn
&& srcdir
!= destdir
) {
3613 mds_rank_t srcdir_auth
= srcdir
->authority().first
;
3614 if (srcdir_auth
!= mds
->get_nodeid()) {
3615 lov
.add_remote_wrlock(&srcdir
->inode
->filelock
, srcdir_auth
);
3616 lov
.add_remote_wrlock(&srcdir
->inode
->nestlock
, srcdir_auth
);
3620 if (srcdn
->get_name() > destdn
->get_name())
3621 lov
.add_xlock(&destdn
->lock
);
3624 lov
.add_xlock(&srcdn
->lock
);
3626 lov
.add_rdlock(&srcdn
->lock
);
3628 if (srcdn
->get_name() < destdn
->get_name())
3629 lov
.add_xlock(&destdn
->lock
);
3631 int cmp
= mdr
->compare_paths();
3632 bool lock_destdir_first
=
3633 (cmp
< 0 || (cmp
== 0 && destdir
->ino() < srcdir
->ino()));
3635 if (lock_destdir_first
) {
3636 lov
.add_wrlock(&destdir
->inode
->filelock
);
3637 lov
.add_wrlock(&destdir
->inode
->nestlock
);
3638 lov
.add_xlock(&destdn
->lock
);
3642 mds_rank_t srcdir_auth
= srcdir
->authority().first
;
3643 if (srcdir_auth
== mds
->get_nodeid()) {
3644 lov
.add_wrlock(&srcdir
->inode
->filelock
);
3645 lov
.add_wrlock(&srcdir
->inode
->nestlock
);
3647 lov
.add_remote_wrlock(&srcdir
->inode
->filelock
, srcdir_auth
);
3648 lov
.add_remote_wrlock(&srcdir
->inode
->nestlock
, srcdir_auth
);
3650 lov
.add_xlock(&srcdn
->lock
);
3652 lov
.add_rdlock(&srcdn
->lock
);
3655 if (!lock_destdir_first
) {
3656 lov
.add_wrlock(&destdir
->inode
->filelock
);
3657 lov
.add_wrlock(&destdir
->inode
->nestlock
);
3658 lov
.add_xlock(&destdn
->lock
);
3662 CInode
*auth_pin_freeze
= nullptr;
3663 // XXX any better way to do this?
3664 if (xlock_srcdn
&& !srcdn
->is_auth()) {
3665 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
3666 auth_pin_freeze
= srcdnl
->is_primary() ? srcdnl
->get_inode() : nullptr;
3668 if (!mds
->locker
->acquire_locks(mdr
, lov
, auth_pin_freeze
))
3669 return std::make_pair(nullptr, nullptr);
3671 if (srcdn
->get_projected_linkage()->is_null()) {
3672 respond_to_request(mdr
, -ENOENT
);
3673 return std::make_pair(nullptr, nullptr);
3676 if (destdn
->get_projected_linkage()->is_null()) {
3677 snapid_t next_snap
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
3678 destdn
->first
= std::max(destdn
->first
, next_snap
);
3681 mdr
->locking_state
|= MutationImpl::PATH_LOCKED
;
3683 return std::make_pair(destdn
, srcdn
);
3687 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
3689 * @param diri base inode
3690 * @param fg the exact frag we want
3691 * @param mdr request
3692 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
3694 CDir
* Server::try_open_auth_dirfrag(CInode
*diri
, frag_t fg
, MDRequestRef
& mdr
)
3696 CDir
*dir
= diri
->get_dirfrag(fg
);
3699 // am i auth for the dirfrag?
3700 if (!dir
->is_auth()) {
3701 mds_rank_t auth
= dir
->authority().first
;
3702 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3703 << ", fw to mds." << auth
<< dendl
;
3704 mdcache
->request_forward(mdr
, auth
);
3708 // not open and inode not mine?
3709 if (!diri
->is_auth()) {
3710 mds_rank_t inauth
= diri
->authority().first
;
3711 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth
<< dendl
;
3712 mdcache
->request_forward(mdr
, inauth
);
3716 // not open and inode frozen?
3717 if (diri
->is_frozen()) {
3718 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri
<< dendl
;
3719 ceph_assert(diri
->get_parent_dir());
3720 diri
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
3725 dir
= diri
->get_or_open_dirfrag(mdcache
, fg
);
3732 // ===============================================================================
3735 void Server::handle_client_getattr(MDRequestRef
& mdr
, bool is_lookup
)
3737 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
3739 if (req
->get_filepath().depth() == 0 && is_lookup
) {
3740 // refpath can't be empty for lookup but it can for
3741 // getattr (we do getattr with empty refpath for mount of '/')
3742 respond_to_request(mdr
, -EINVAL
);
3746 bool want_auth
= false;
3747 int mask
= req
->head
.args
.getattr
.mask
;
3748 if (mask
& CEPH_STAT_RSTAT
)
3749 want_auth
= true; // set want_auth for CEPH_STAT_RSTAT mask
3751 CInode
*ref
= rdlock_path_pin_ref(mdr
, want_auth
, false);
3755 mdr
->getattr_caps
= mask
;
3757 if (mdr
->snapid
== CEPH_NOSNAP
&& !mdr
->is_batch_head
&& mdr
->is_batch_op()) {
3759 auto em
= ref
->batch_ops
.emplace(std::piecewise_construct
, std::forward_as_tuple(mask
), std::forward_as_tuple());
3761 em
.first
->second
= std::make_unique
<Batch_Getattr_Lookup
>(this, mdr
, mdcache
);
3763 dout(20) << __func__
<< ": GETATTR op, wait for previous same getattr ops to respond. " << *mdr
<< dendl
;
3764 em
.first
->second
->add_request(mdr
);
3768 CDentry
* dn
= mdr
->dn
[0].back();
3769 auto em
= dn
->batch_ops
.emplace(std::piecewise_construct
, std::forward_as_tuple(mask
), std::forward_as_tuple());
3771 em
.first
->second
= std::make_unique
<Batch_Getattr_Lookup
>(this, mdr
, mdcache
);
3774 dout(20) << __func__
<< ": LOOKUP op, wait for previous same getattr ops to respond. " << *mdr
<< dendl
;
3775 em
.first
->second
->add_request(mdr
);
3779 mdr
->is_batch_head
= true;
3783 * if client currently holds the EXCL cap on a field, do not rdlock
3784 * it; client's stat() will result in valid info if _either_ EXCL
3785 * cap is held or MDS rdlocks and reads the value here.
3787 * handling this case here is easier than weakening rdlock
3788 * semantics... that would cause problems elsewhere.
3790 client_t client
= mdr
->get_client();
3792 Capability
*cap
= ref
->get_client_cap(client
);
3793 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
||
3794 mdr
->snapid
<= cap
->client_follows
))
3795 issued
= cap
->issued();
3798 MutationImpl::LockOpVec lov
;
3799 if ((mask
& CEPH_CAP_LINK_SHARED
) && !(issued
& CEPH_CAP_LINK_EXCL
))
3800 lov
.add_rdlock(&ref
->linklock
);
3801 if ((mask
& CEPH_CAP_AUTH_SHARED
) && !(issued
& CEPH_CAP_AUTH_EXCL
))
3802 lov
.add_rdlock(&ref
->authlock
);
3803 if ((mask
& CEPH_CAP_XATTR_SHARED
) && !(issued
& CEPH_CAP_XATTR_EXCL
))
3804 lov
.add_rdlock(&ref
->xattrlock
);
3805 if ((mask
& CEPH_CAP_FILE_SHARED
) && !(issued
& CEPH_CAP_FILE_EXCL
)) {
3806 // Don't wait on unstable filelock if client is allowed to read file size.
3807 // This can reduce the response time of getattr in the case that multiple
3808 // clients do stat(2) and there are writers.
3809 // The downside of this optimization is that mds may not issue Fs caps along
3810 // with getattr reply. Client may need to send more getattr requests.
3811 if (mdr
->is_rdlocked(&ref
->filelock
)) {
3812 lov
.add_rdlock(&ref
->filelock
);
3813 } else if (ref
->filelock
.is_stable() ||
3814 ref
->filelock
.get_num_wrlocks() > 0 ||
3815 !ref
->filelock
.can_read(mdr
->get_client())) {
3816 lov
.add_rdlock(&ref
->filelock
);
3817 mdr
->locking_state
&= ~MutationImpl::ALL_LOCKED
;
3821 if (!mds
->locker
->acquire_locks(mdr
, lov
))
3824 if (!check_access(mdr
, ref
, MAY_READ
))
3827 utime_t now
= ceph_clock_now();
3828 mdr
->set_mds_stamp(now
);
3830 // note which caps are requested, so we return at least a snapshot
3831 // value for them. (currently this matters for xattrs and inline data)
3832 mdr
->getattr_caps
= mask
;
3834 mds
->balancer
->hit_inode(ref
, META_POP_IRD
, req
->get_source().num());
3837 dout(10) << "reply to stat on " << *req
<< dendl
;
3840 mdr
->tracedn
= mdr
->dn
[0].back();
3841 respond_to_request(mdr
, 0);
3844 struct C_MDS_LookupIno2
: public ServerContext
{
3846 C_MDS_LookupIno2(Server
*s
, MDRequestRef
& r
) : ServerContext(s
), mdr(r
) {}
3847 void finish(int r
) override
{
3848 server
->_lookup_ino_2(mdr
, r
);
3855 void Server::handle_client_lookup_ino(MDRequestRef
& mdr
,
3856 bool want_parent
, bool want_dentry
)
3858 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
3860 if ((uint64_t)req
->head
.args
.lookupino
.snapid
> 0)
3861 return _lookup_snap_ino(mdr
);
3863 inodeno_t ino
= req
->get_filepath().get_ino();
3864 CInode
*in
= mdcache
->get_inode(ino
);
3865 if (in
&& in
->state_test(CInode::STATE_PURGING
)) {
3866 respond_to_request(mdr
, -ESTALE
);
3870 mdcache
->open_ino(ino
, (int64_t)-1, new C_MDS_LookupIno2(this, mdr
), false);
3874 if (mdr
&& in
->snaprealm
&& !in
->snaprealm
->have_past_parents_open() &&
3875 !in
->snaprealm
->open_parents(new C_MDS_RetryRequest(mdcache
, mdr
))) {
3879 // check for nothing (not read or write); this still applies the
3881 if (!check_access(mdr
, in
, 0))
3884 CDentry
*dn
= in
->get_projected_parent_dn();
3885 CInode
*diri
= dn
? dn
->get_dir()->inode
: NULL
;
3887 MutationImpl::LockOpVec lov
;
3888 if (dn
&& (want_parent
|| want_dentry
)) {
3890 lov
.add_rdlock(&dn
->lock
);
3893 unsigned mask
= req
->head
.args
.lookupino
.mask
;
3895 Capability
*cap
= in
->get_client_cap(mdr
->get_client());
3897 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
|| mdr
->snapid
<= cap
->client_follows
))
3898 issued
= cap
->issued();
3900 // permission bits, ACL/security xattrs
3901 if ((mask
& CEPH_CAP_AUTH_SHARED
) && (issued
& CEPH_CAP_AUTH_EXCL
) == 0)
3902 lov
.add_rdlock(&in
->authlock
);
3903 if ((mask
& CEPH_CAP_XATTR_SHARED
) && (issued
& CEPH_CAP_XATTR_EXCL
) == 0)
3904 lov
.add_rdlock(&in
->xattrlock
);
3906 mdr
->getattr_caps
= mask
;
3910 if (!mds
->locker
->acquire_locks(mdr
, lov
))
3914 // need read access to directory inode
3915 if (!check_access(mdr
, diri
, MAY_READ
))
3921 if (in
->is_base()) {
3922 respond_to_request(mdr
, -EINVAL
);
3925 if (!diri
|| diri
->is_stray()) {
3926 respond_to_request(mdr
, -ESTALE
);
3929 dout(10) << "reply to lookup_parent " << *in
<< dendl
;
3931 respond_to_request(mdr
, 0);
3934 inodeno_t dirino
= req
->get_filepath2().get_ino();
3935 if (!diri
|| (dirino
!= inodeno_t() && diri
->ino() != dirino
)) {
3936 respond_to_request(mdr
, -ENOENT
);
3939 dout(10) << "reply to lookup_name " << *in
<< dendl
;
3941 dout(10) << "reply to lookup_ino " << *in
<< dendl
;
3946 respond_to_request(mdr
, 0);
3950 void Server::_lookup_snap_ino(MDRequestRef
& mdr
)
3952 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
3955 vino
.ino
= req
->get_filepath().get_ino();
3956 vino
.snapid
= (__u64
)req
->head
.args
.lookupino
.snapid
;
3957 inodeno_t parent_ino
= (__u64
)req
->head
.args
.lookupino
.parent
;
3958 __u32 hash
= req
->head
.args
.lookupino
.hash
;
3960 dout(7) << "lookup_snap_ino " << vino
<< " parent " << parent_ino
<< " hash " << hash
<< dendl
;
3962 CInode
*in
= mdcache
->lookup_snap_inode(vino
);
3964 in
= mdcache
->get_inode(vino
.ino
);
3966 if (in
->state_test(CInode::STATE_PURGING
) ||
3967 !in
->has_snap_data(vino
.snapid
)) {
3968 if (in
->is_dir() || !parent_ino
) {
3969 respond_to_request(mdr
, -ESTALE
);
3978 dout(10) << "reply to lookup_snap_ino " << *in
<< dendl
;
3979 mdr
->snapid
= vino
.snapid
;
3981 respond_to_request(mdr
, 0);
3985 CInode
*diri
= NULL
;
3987 diri
= mdcache
->get_inode(parent_ino
);
3989 mdcache
->open_ino(parent_ino
, mds
->mdsmap
->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr
));
3993 if (!diri
->is_dir()) {
3994 respond_to_request(mdr
, -EINVAL
);
3998 MutationImpl::LockOpVec lov
;
3999 lov
.add_rdlock(&diri
->dirfragtreelock
);
4000 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4003 frag_t frag
= diri
->dirfragtree
[hash
];
4004 CDir
*dir
= try_open_auth_dirfrag(diri
, frag
, mdr
);
4008 if (!dir
->is_complete()) {
4009 if (dir
->is_frozen()) {
4010 mds
->locker
->drop_locks(mdr
.get());
4011 mdr
->drop_local_auth_pins();
4012 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
4015 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
), true);
4019 respond_to_request(mdr
, -ESTALE
);
4021 mdcache
->open_ino(vino
.ino
, mds
->mdsmap
->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr
), false);
4025 void Server::_lookup_ino_2(MDRequestRef
& mdr
, int r
)
4027 inodeno_t ino
= mdr
->client_request
->get_filepath().get_ino();
4028 dout(10) << "_lookup_ino_2 " << mdr
.get() << " ino " << ino
<< " r=" << r
<< dendl
;
4030 // `r` is a rank if >=0, else an error code
4032 mds_rank_t
dest_rank(r
);
4033 if (dest_rank
== mds
->get_nodeid())
4034 dispatch_client_request(mdr
);
4036 mdcache
->request_forward(mdr
, dest_rank
);
4041 if (r
== -ENOENT
|| r
== -ENODATA
)
4043 respond_to_request(mdr
, r
);
/* This function takes responsibility for the passed mdr*/
// Handle CEPH_MDS_OP_OPEN: validate the open flags against the inode type,
// take the rdlocks implied by the client's getattr mask, then issue
// capabilities (or read-only snap caps for snapped inodes) and reply.
// O_TRUNC on a regular file is diverted to do_open_truncate().
// NOTE(review): lossy extraction — several guards, `return`s, `else`s and
// closing braces are missing below (e.g. the `cmode < 0` check before the
// first -EINVAL reply); confirm flow against the upstream file.
void Server::handle_client_open(MDRequestRef& mdr)
  const cref_t<MClientRequest> &req = mdr->client_request;
  dout(7) << "open on " << req->get_filepath() << dendl;

  int flags = req->head.args.open.flags;
  int cmode = ceph_flags_to_mode(flags);
  // NOTE(review): this reply is presumably guarded by `cmode < 0` (line dropped)
  respond_to_request(mdr, -EINVAL);

  // writes (and O_TRUNC / O_DIRECTORY probes) must be handled on the auth MDS
  bool need_auth = !file_mode_is_readonly(cmode) ||
    (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));

  if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
    dout(7) << "read-only FS" << dendl;
    respond_to_request(mdr, -EROFS);

  CInode *cur = rdlock_path_pin_ref(mdr, need_auth);

  if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
    ceph_assert(!need_auth);
    // redo the path walk, this time insisting on the auth MDS
    mdr->locking_state &= ~(MutationImpl::PATH_LOCKED | MutationImpl::ALL_LOCKED);
    CInode *cur = rdlock_path_pin_ref(mdr, true);

  if (!cur->inode.is_file()) {
    // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
    cmode = CEPH_FILE_MODE_PIN;
    // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
    if (cur->inode.is_symlink() && !(flags & CEPH_O_NOFOLLOW))
      flags &= ~CEPH_O_TRUNC;

  dout(10) << "open flags = " << flags
	   << ", filemode = " << cmode
	   << ", need_auth = " << need_auth

  /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
    dout(7) << "not a file or dir " << *cur << dendl;
    respond_to_request(mdr, -ENXIO); // FIXME what error do we want?
  }*/

  if ((flags & CEPH_O_DIRECTORY) && !cur->inode.is_dir() && !cur->inode.is_symlink()) {
    dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
    respond_to_request(mdr, -EINVAL);

  if ((flags & CEPH_O_TRUNC) && !cur->inode.is_file()) {
    dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
    // we should return -EISDIR for directory, return -EINVAL for other non-regular
    respond_to_request(mdr, cur->inode.is_dir() ? -EISDIR : -EINVAL);

  // clients lacking the inline-data feature cannot read inlined file data
  if (cur->inode.inline_data.version != CEPH_INLINE_NONE &&
      !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
    dout(7) << "old client cannot open inline data file " << *cur << dendl;
    respond_to_request(mdr, -EPERM);

  // snapped data is read only
  if (mdr->snapid != CEPH_NOSNAP &&
      ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
    dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
    respond_to_request(mdr, -EROFS);

  MutationImpl::LockOpVec lov;

  unsigned mask = req->head.args.open.mask;
  // caps the client already holds let us skip the matching rdlocks below
  Capability *cap = cur->get_client_cap(mdr->get_client());
  if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
    issued = cap->issued();
  // permission bits, ACL/security xattrs
  if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
    lov.add_rdlock(&cur->authlock);
  if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
    lov.add_rdlock(&cur->xattrlock);

  mdr->getattr_caps = mask;

  // O_TRUNC on a not-yet-completed request: truncate under a filelock xlock
  if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
    ceph_assert(cur->is_auth());

    lov.add_xlock(&cur->filelock);
    if (!mds->locker->acquire_locks(mdr, lov))

    if (!check_access(mdr, cur, MAY_WRITE))

    // wait for pending truncate?
    const auto pi = cur->get_projected_inode();
    if (pi->is_truncating()) {
      dout(10) << " waiting for pending truncate from " << pi->truncate_from
	       << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
      mds->locker->drop_locks(mdr.get());
      mdr->drop_local_auth_pins();
      cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));

    do_open_truncate(mdr, cmode);

  // sync filelock if snapped.
  // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
  // and that data itself is flushed so that we can read the snapped data off disk.
  if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
    lov.add_rdlock(&cur->filelock);

  if (!mds->locker->acquire_locks(mdr, lov))

  // NOTE(review): only the write-mode access check is visible; the read
  // counterpart (if any) was dropped by the extraction.
  if (cmode & CEPH_FILE_MODE_WR)
  if (!check_access(mdr, cur, mask))

  utime_t now = ceph_clock_now();
  mdr->set_mds_stamp(now);

  if (cur->is_file() || cur->is_dir()) {
    if (mdr->snapid == CEPH_NOSNAP) {
      // live inode: issue real capability objects
      Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr, nullptr);
      dout(12) << "open issued caps " << ccap_string(cap->pending())
	       << " for " << req->get_source()
	       << " on " << *cur << dendl;
      // snapped inode: no cap objects, just advertise immutable snap caps
      int caps = ceph_caps_for_mode(cmode);
      dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
	       << " for " << req->get_source()
	       << " snapid " << mdr->snapid
	       << " on " << *cur << dendl;
      mdr->snap_caps = caps;

  // increase max_size?
  if (cmode & CEPH_FILE_MODE_WR)
    mds->locker->check_inode_max_size(cur);

  // make sure this inode gets into the journal
  if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
      mdcache->open_file_table.should_log_open(cur)) {
    EOpen *le = new EOpen(mds->mdlog);
    mdlog->start_entry(le);
    le->add_clean_inode(cur);
    mdlog->submit_entry(le);

  // bump balancer popularity counters for this inode
  if (cmode & CEPH_FILE_MODE_WR)
    mds->balancer->hit_inode(cur, META_POP_IWR);
  mds->balancer->hit_inode(cur, META_POP_IRD,
			   mdr->client_request->get_source().num());

  if (req->get_dentry_wanted()) {
    ceph_assert(mdr->dn[0].size());
    dn = mdr->dn[0].back();

  respond_to_request(mdr, 0);
// Journal-commit finisher for handle_client_openc(): once the EUpdate for the
// newly created file is durable, make the projected dentry/inode state live,
// notify replicas/clients, and send the reply.
// NOTE(review): lossy extraction — the member declarations (the dentry and
// new-inode pointers initialized in the ctor) and closing braces are missing.
class C_MDS_openc_finish : public ServerLogContext {
  C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
    ServerLogContext(s, r), dn(d), newi(ni) {}
  // Called when the journal entry has committed; r must be success.
  void finish(int r) override {
    ceph_assert(r == 0);

    // make the projected (speculative) linkage the real one
    dn->pop_projected_linkage();

    // dirty inode, dn, dir
    newi->inode.version--;   // a bit hacky, see C_MDS_mknod_finish
    newi->mark_dirty(newi->inode.version+1, mdr->ls);
    newi->mark_dirty_parent(mdr->ls, true);

    // let the client know its writable max_size
    get_mds()->locker->share_inode_max_size(newi);

    // replicate the new dentry link to other MDSs
    MDRequestRef null_ref;
    get_mds()->mdcache->send_dentry_link(dn, null_ref);

    get_mds()->balancer->hit_inode(newi, META_POP_IWR);

    server->respond_to_request(mdr, 0);

    // debug hook: mds_kill_openc_at == 1 means "die before this point"
    ceph_assert(g_conf()->mds_kill_openc_at != 1);
/* This function takes responsibility for the passed mdr*/
// Handle open with O_CREAT: if the dentry already exists (and !O_EXCL) fall
// back to a plain open; otherwise build the file layout from the parent /
// client-supplied parameters, allocate and link a new inode, issue caps, and
// journal an "openc" EUpdate finished by C_MDS_openc_finish.
// NOTE(review): lossy extraction — guards (e.g. `cmode < 0`), `else`s,
// `return`s and closing braces are missing at several points; confirm flow
// against the upstream file.
void Server::handle_client_openc(MDRequestRef& mdr)
  const cref_t<MClientRequest> &req = mdr->client_request;
  client_t client = mdr->get_client();

  dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;

  int cmode = ceph_flags_to_mode(req->head.args.open.flags);
  // NOTE(review): presumably guarded by `cmode < 0` (line dropped)
  respond_to_request(mdr, -EINVAL);

  bool excl = req->head.args.open.flags & CEPH_O_EXCL;
  CDentry *dn = rdlock_path_xlock_dentry(mdr, true, !excl, true);

  CDentry::linkage_t *dnl = dn->get_projected_linkage();
  if (!excl && !dnl->is_null()) {
    // file already exists and O_EXCL not requested: behave like plain open.
    // keep the dentry readable while we take the snaplock
    mds->locker->xlock_downgrade(&dn->lock, mdr.get());

    MutationImpl::LockOpVec lov;
    lov.add_rdlock(&dnl->get_inode()->snaplock);
    if (!mds->locker->acquire_locks(mdr, lov))

    handle_client_open(mdr);

  ceph_assert(dnl->is_null());

  // inherit the layout from the directory (if set) or the FS default
  file_layout_t layout;
  if (mdr->dir_layout != file_layout_t())
    layout = mdr->dir_layout;
  layout = mdcache->default_file_layout;

  // What kind of client caps are required to complete this operation
  uint64_t access = MAY_WRITE;

  const auto default_layout = layout;

  // fill in any special params from client
  if (req->head.args.open.stripe_unit)
    layout.stripe_unit = req->head.args.open.stripe_unit;
  if (req->head.args.open.stripe_count)
    layout.stripe_count = req->head.args.open.stripe_count;
  if (req->head.args.open.object_size)
    layout.object_size = req->head.args.open.object_size;
  if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
      (__s32)req->head.args.open.pool >= 0) {
    layout.pool_id = req->head.args.open.pool;

    // make sure we have as new a map as the client
    if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
      mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));

  // If client doesn't have capability to modify layout pools, then
  // only permit this request if the requested pool matches what the
  // file would have inherited anyway from its parent.
  if (default_layout != layout) {
    access |= MAY_SET_VXATTR;

  if (!layout.is_valid()) {
    dout(10) << " invalid initial file layout" << dendl;
    respond_to_request(mdr, -EINVAL);

  if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
    dout(10) << " invalid data pool " << layout.pool_id << dendl;
    respond_to_request(mdr, -EINVAL);

  CDir *dir = dn->get_dir();
  CInode *diri = dir->get_inode();
  if (!check_access(mdr, diri, access))
  if (!check_fragment_space(mdr, dir))

  // single-component path: cache the lock set for subsequent creates here
  if (mdr->dn[0].size() == 1)
    mds->locker->create_lock_cache(mdr, diri, &mdr->dir_layout);

  CInode *in = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
				 req->head.args.open.mode | S_IFREG, &layout);

  // project the new primary link; made real by the journal finisher
  dn->push_projected_linkage(in);

  in->inode.version = dn->pre_dirty();
  if (layout.pool_id != mdcache->default_file_layout.pool_id)
    in->inode.add_old_pool(mdcache->default_file_layout.pool_id);
  in->inode.update_backtrace();
  in->inode.rstat.rfiles = 1;

  SnapRealm *realm = diri->find_snaprealm();
  snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
  ceph_assert(follows >= realm->get_newest_seq());

  ceph_assert(dn->first == follows+1);
  in->first = dn->first;

  // do the open — issue caps up front, before the journal commits
  Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr, realm);
  in->authlock.set_state(LOCK_EXCL);
  in->xattrlock.set_state(LOCK_EXCL);

  if (cap && (cmode & CEPH_FILE_MODE_WR)) {
    // seed the client's writable range at one stripe unit
    in->inode.client_ranges[client].range.first = 0;
    in->inode.client_ranges[client].range.last = in->inode.layout.stripe_unit;
    in->inode.client_ranges[client].follows = follows;
    cap->mark_clientwriteable();

  // journal the creation
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "openc");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  journal_allocated_inos(mdr, &le->metablob);
  mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
  le->metablob.add_primary_dentry(dn, in, true, true, true);

  // make sure this inode gets into the journal
  le->metablob.add_opened_ino(in->ino());

  C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, in);

  if (mdr->session->info.has_feature(CEPHFS_FEATURE_DELEG_INO)) {
    openc_response_t ocresp;

    dout(10) << "adding created_ino and delegated_inos" << dendl;
    ocresp.created_ino = in->inode.ino;

    if (delegate_inos_pct && !req->is_queued_for_replay()) {
      // Try to delegate some prealloc_inos to the client, if it's down to half the max
      unsigned frac = 100 / delegate_inos_pct;
      if (mdr->session->delegated_inos.size() < (unsigned)g_conf()->mds_client_prealloc_inos / frac / 2)
	mdr->session->delegate_inos(g_conf()->mds_client_prealloc_inos / frac, ocresp.delegated_inos);

    encode(ocresp, mdr->reply_extra_bl);
  } else if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
    dout(10) << "adding ino to reply to indicate inode was created" << dendl;
    // add the file created flag onto the reply if create_flags features is supported
    encode(in->inode.ino, mdr->reply_extra_bl);

  journal_and_reply(mdr, in, dn, le, fin);

  // We hit_dir (via hit_inode) in our finish callback, but by then we might
  // have overshot the split size (multiple opencs in flight), so here is
  // an early chance to split the dir if this openc makes it oversized.
  mds->balancer->maybe_fragment(dir, false);
// Handle CEPH_MDS_OP_READDIR: resolve and lock the directory, open (and if
// needed fetch) the requested dirfrag, then encode dentries + inodestats into
// the reply until max_entries or max_bytes is reached.  The client resumes
// with (offset_str, offset_hash) from the last entry returned.
// NOTE(review): lossy extraction — local declarations (`frag_t newfg`, the
// `dirbl`/`dnbl` bufferlists, `numfiles`, `keep`, the reply `flags` word) and
// various `else`/`return`/brace lines are missing; confirm against upstream.
void Server::handle_client_readdir(MDRequestRef& mdr)
  const cref_t<MClientRequest> &req = mdr->client_request;
  client_t client = req->get_source().num();
  MutationImpl::LockOpVec lov;
  CInode *diri = rdlock_path_pin_ref(mdr, false, true);

  // it's a directory, right?
  if (!diri->is_dir()) {
    dout(10) << "reply to " << *req << " readdir -ENOTDIR" << dendl;
    respond_to_request(mdr, -ENOTDIR);

  // contents + fragmentation must be stable while we encode
  lov.add_rdlock(&diri->filelock);
  lov.add_rdlock(&diri->dirfragtreelock);

  if (!mds->locker->acquire_locks(mdr, lov))

  if (!check_access(mdr, diri, MAY_READ))

  // which frag is it?
  frag_t fg = (__u32)req->head.args.readdir.frag;
  unsigned req_flags = (__u32)req->head.args.readdir.flags;
  string offset_str = req->get_path2();

  __u32 offset_hash = 0;
  if (!offset_str.empty())
    offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
  offset_hash = (__u32)req->head.args.readdir.offset_hash;

  dout(10) << " frag " << fg << " offset '" << offset_str << "'"
	   << " offset_hash " << offset_hash << " flags " << req_flags << dendl;

  // does the frag exist?
  if (diri->dirfragtree[fg.value()] != fg) {
    // requested frag was split/merged since the client saw it — pick a
    // replacement frag and (in a dropped line) presumably retry with it
    if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
      if (fg.contains((unsigned)offset_hash)) {
	newfg = diri->dirfragtree[offset_hash];
	// client actually wants next frag
	newfg = diri->dirfragtree[fg.value()];
      newfg = diri->dirfragtree[fg.value()];
    dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;

  CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);

  dout(10) << "handle_client_readdir on " << *dir << dendl;
  ceph_assert(dir->is_auth());

  if (!dir->is_complete()) {
    if (dir->is_frozen()) {
      // cannot fetch while frozen; release and retry when it thaws
      dout(7) << "dir is frozen " << *dir << dendl;
      mds->locker->drop_locks(mdr.get());
      mdr->drop_local_auth_pins();
      dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
    dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
    dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);

#ifdef MDS_VERIFY_FRAGSTAT
  dir->verify_fragstat();

  utime_t now = ceph_clock_now();
  mdr->set_mds_stamp(now);

  snapid_t snapid = mdr->snapid;
  dout(10) << "snapid " << snapid << dendl;

  SnapRealm *realm = diri->find_snaprealm();

  unsigned max = req->head.args.readdir.max_entries;
  max = dir->get_num_any();  // whatever, something big.
  unsigned max_bytes = req->head.args.readdir.max_bytes;
  // make sure at least one item can be encoded
  max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;

  // start final blob: dirstat header first
  ds.frag = dir->get_frag();
  ds.auth = dir->get_dir_auth().first;
  if (dir->is_auth() && !mdcache->forward_all_reqs_to_auth())
    dir->get_dist_spec(ds.dist, mds->get_nodeid());

  dir->encode_dirstat(dirbl, mdr->session->info, ds);

  // count bytes available.
  //  this isn't perfect, but we should capture the main variable/unbounded size items!
  int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
  int bytes_left = max_bytes - front_bytes;
  bytes_left -= realm->get_snap_trace().length();

  // build dir contents
  bool start = !offset_hash && offset_str.empty();
  // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
  dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
  auto it = start ? dir->begin() : dir->lower_bound(skip_key);
  bool end = (it == dir->end());
  for (; !end && numfiles < max; end = (it == dir->end())) {
    CDentry *dn = it->second;

    if (dn->state_test(CDentry::STATE_PURGING))

    bool dnp = dn->use_projected(client, mdr);
    CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();

    if (dn->last < snapid || dn->first > snapid) {
      dout(20) << "skipping non-overlapping snap " << *dn << dendl;

    // resume point: skip entries at or before the client's last offset
    dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
    if (!(offset_key < dn->key()))

    CInode *in = dnl->get_inode();

    if (in && in->ino() == CEPH_INO_CEPH)

    // remote link?
    // better for the MDS to do the work, if we think the client will stat any of these files.
    if (dnl->is_remote() && !in) {
      in = mdcache->get_inode(dnl->get_remote_ino());
      dn->link_remote(dnl, in);
      } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
	dout(10) << "skipping bad remote ino on " << *dn << dendl;

	// touch everything i _do_ have
	for (auto &p : *dir) {
	  if (!p.second->get_linkage()->is_null())
	    mdcache->lru.lru_touch(p.second);

	// already issued caps and leases, reply immediately.
	if (dnbl.length() > 0) {
	  mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
	  dout(10) << " open remote dentry after caps were issued, stopping at "
		   << dnbl.length() << " < " << bytes_left << dendl;
	mds->locker->drop_locks(mdr.get());
	mdr->drop_local_auth_pins();
	mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));

    // space enough for this name + lease?
    if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
      dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;

    unsigned start_len = dnbl.length();

    // dentry
    dout(12) << "including    dn " << *dn << dendl;
    encode(dn->get_name(), dnbl);
    int lease_mask = dnl->is_primary() ? CEPH_LEASE_PRIMARY_LINK : 0;
    mds->locker->issue_client_lease(dn, mdr, lease_mask, now, dnbl);

    // inode
    dout(12) << "including inode " << *in << dendl;
    int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
    // inode didn't fit: chop off dn->name, lease
    dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
    keep.substr_of(dnbl, 0, start_len);

    ceph_assert(r >= 0);

    // touch dn
    mdcache->lru.lru_touch(dn);

  flags = CEPH_READDIR_FRAG_END;
  flags |= CEPH_READDIR_FRAG_COMPLETE;  // FIXME: what purpose does this serve
  // client only understand END and COMPLETE flags ?
  if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
    flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;

  // finish final blob
  encode(numfiles, dirbl);
  encode(flags, dirbl);
  dirbl.claim_append(dnbl);

  // yay, reply
  dout(10) << "reply to " << *req << " readdir num=" << numfiles
	   << " bytes=" << dirbl.length()
	   << " start=" << (int)start
	   << " end=" << (int)end

  mdr->reply_extra_bl = dirbl;

  // bump popularity.  NOTE: this doesn't quite capture it.
  mds->balancer->hit_dir(dir, META_POP_IRD, -1, numfiles);

  respond_to_request(mdr, 0);
4689 // ===============================================================================
4694 * finisher for basic inode updates
// Journal-commit finisher shared by setattr / open-truncate / layout updates:
// pops the projected inode, kicks off any truncation, propagates snap-realm
// changes when a new realm was created, and replies to the client.
// NOTE(review): lossy extraction — the `CInode *in` member declaration and
// several brace/guard lines (e.g. around the new_realm snap-update block and
// the trailing share_inode_max_size condition) are missing here.
class C_MDS_inode_update_finish : public ServerLogContext {
  bool truncating_smaller, changed_ranges, new_realm;
  C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
			    bool sm=false, bool cr=false, bool nr=false) :
    ServerLogContext(s, r), in(i),
    truncating_smaller(sm), changed_ranges(cr), new_realm(nr) { }
  // Called once the EUpdate is durable; r must be success.
  void finish(int r) override {
    ceph_assert(r == 0);

    // apply the projected inode values journaled for this request
    in->pop_and_dirty_projected_inode(mdr->ls);

    MDSRank *mds = get_mds();

    // notify any clients
    if (truncating_smaller && in->inode.is_truncating()) {
      // tell cap holders about the truncate, then purge the trimmed extent
      mds->locker->issue_truncate(in);
      mds->mdcache->truncate_inode(in, mdr->ls);

    // (new_realm path) broadcast the snap-realm split to peers and clients
    int op = CEPH_SNAP_OP_SPLIT;
    mds->mdcache->send_snap_update(in, 0, op);
    mds->mdcache->do_realm_invalidate_and_update_notify(in, op);

    get_mds()->balancer->hit_inode(in, META_POP_IWR);

    server->respond_to_request(mdr, 0);

    // (changed_ranges path) publish the updated max_size to cap holders
    get_mds()->locker->share_inode_max_size(in);
// Handle CEPH_MDS_OP_SETFILELOCK: apply an advisory (flock/fcntl) lock change
// to the inode's lock state under a flocklock xlock.  Unlocks reply
// immediately; blocked lock attempts either fail (-EWOULDBLOCK / -EDEADLK /
// -EINTR) or park the request on WAIT_FLOCK until the lock frees up.
// NOTE(review): lossy extraction — `break`/`return` lines inside the switch,
// the `*INTR` interrupt handling, and several `else`/brace lines are missing;
// confirm flow against the upstream file.
void Server::handle_client_file_setlock(MDRequestRef& mdr)
  const cref_t<MClientRequest> &req = mdr->client_request;
  MutationImpl::LockOpVec lov;

  // get the inode to operate on, and set up any locks needed for that
  CInode *cur = rdlock_path_pin_ref(mdr, true);

  lov.add_xlock(&cur->flocklock);
  /* acquire_locks will return true if it gets the locks. If it fails,
     it will redeliver this request at a later date, so drop the request. */
  if (!mds->locker->acquire_locks(mdr, lov)) {
    dout(10) << "handle_client_file_setlock could not get locks!" << dendl;

  // copy the lock change into a ceph_filelock so we can store/apply it
  ceph_filelock set_lock;
  set_lock.start = req->head.args.filelock_change.start;
  set_lock.length = req->head.args.filelock_change.length;
  set_lock.client = req->get_orig_source().num();
  set_lock.owner = req->head.args.filelock_change.owner;
  set_lock.pid = req->head.args.filelock_change.pid;
  set_lock.type = req->head.args.filelock_change.type;
  bool will_wait = req->head.args.filelock_change.wait;

  dout(10) << "handle_client_file_setlock: " << set_lock << dendl;

  ceph_lock_state_t *lock_state = NULL;
  bool interrupt = false;

  // get the appropriate lock state
  switch (req->head.args.filelock_change.rule) {
  case CEPH_LOCK_FLOCK_INTR:
    // NOTE(review): interrupt=true assignment dropped by the extraction
  case CEPH_LOCK_FLOCK:
    lock_state = cur->get_flock_lock_state();

  case CEPH_LOCK_FCNTL_INTR:
  case CEPH_LOCK_FCNTL:
    lock_state = cur->get_fcntl_lock_state();

    // default: unknown rule
    dout(10) << "got unknown lock type " << set_lock.type
	     << ", dropping request!" << dendl;
    respond_to_request(mdr, -EOPNOTSUPP);

  dout(10) << " state prior to lock change: " << *lock_state << dendl;
  if (CEPH_LOCK_UNLOCK == set_lock.type) {
    list<ceph_filelock> activated_locks;
    MDSContext::vec waiters;
    if (lock_state->is_waiting(set_lock)) {
      dout(10) << " unlock removing waiting lock " << set_lock << dendl;
      lock_state->remove_waiting(set_lock);
      cur->take_waiting(CInode::WAIT_FLOCK, waiters);
    } else if (!interrupt) {
      dout(10) << " unlock attempt on " << set_lock << dendl;
      lock_state->remove_lock(set_lock, activated_locks);
      cur->take_waiting(CInode::WAIT_FLOCK, waiters);
    // wake anyone who was blocked on the lock we just released
    mds->queue_waiters(waiters);
    respond_to_request(mdr, 0);
    // not an unlock: try to take the lock
    dout(10) << " lock attempt on " << set_lock << dendl;
    bool deadlock = false;
    if (mdr->more()->flock_was_waiting &&
	!lock_state->is_waiting(set_lock)) {
      dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
      respond_to_request(mdr, -EINTR);
    } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
      dout(10) << " it failed on this attempt" << dendl;
      // couldn't set lock right now
      // NOTE(review): the `if (deadlock)` wrapper for -EDEADLK was dropped
      respond_to_request(mdr, -EDEADLK);
      } else if (!will_wait) {
	respond_to_request(mdr, -EWOULDBLOCK);
	// caller asked to wait: park the request until the lock may be free
	dout(10) << " added to waiting list" << dendl;
	ceph_assert(lock_state->is_waiting(set_lock));
	mdr->more()->flock_was_waiting = true;
	mds->locker->drop_locks(mdr.get());
	mdr->drop_local_auth_pins();
	mdr->mark_event("failed to add lock, waiting");
	cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
    respond_to_request(mdr, 0);
  dout(10) << " state after lock change: " << *lock_state << dendl;
// Handle CEPH_MDS_OP_GETFILELOCK: under a flocklock rdlock, probe the inode's
// advisory lock state for a conflict with the described lock (fcntl F_GETLK
// semantics) and return the (possibly rewritten) lock in the reply blob.
// NOTE(review): lossy extraction — `break`/`return` lines in the switch and
// the declaration of the reply bufferlist `lock_bl` are missing here.
void Server::handle_client_file_readlock(MDRequestRef& mdr)
  const cref_t<MClientRequest> &req = mdr->client_request;
  MutationImpl::LockOpVec lov;

  // get the inode to operate on, and set up any locks needed for that
  CInode *cur = rdlock_path_pin_ref(mdr, true);

  /* acquire_locks will return true if it gets the locks. If it fails,
     it will redeliver this request at a later date, so drop the request. */
  lov.add_rdlock(&cur->flocklock);
  if (!mds->locker->acquire_locks(mdr, lov)) {
    dout(10) << "handle_client_file_readlock could not get locks!" << dendl;

  // copy the lock change into a ceph_filelock so we can store/apply it
  ceph_filelock checking_lock;
  checking_lock.start = req->head.args.filelock_change.start;
  checking_lock.length = req->head.args.filelock_change.length;
  checking_lock.client = req->get_orig_source().num();
  checking_lock.owner = req->head.args.filelock_change.owner;
  checking_lock.pid = req->head.args.filelock_change.pid;
  checking_lock.type = req->head.args.filelock_change.type;

  // get the appropriate lock state
  ceph_lock_state_t *lock_state = NULL;
  switch (req->head.args.filelock_change.rule) {
  case CEPH_LOCK_FLOCK:
    lock_state = cur->get_flock_lock_state();

  case CEPH_LOCK_FCNTL:
    lock_state = cur->get_fcntl_lock_state();

    // default: unknown rule
    dout(10) << "got unknown lock type " << checking_lock.type << dendl;
    respond_to_request(mdr, -EINVAL);

  // overwrites checking_lock with any conflicting lock found (F_GETLK style)
  lock_state->look_for_lock(checking_lock);

  encode(checking_lock, lock_bl);

  mdr->reply_extra_bl = lock_bl;
  respond_to_request(mdr, 0);
// Handle CEPH_MDS_OP_SETATTR: take the locks implied by the setattr mask,
// enforce chown/chgrp permissions, project the requested attribute changes
// (including size truncation), and journal them via an EUpdate finished by
// C_MDS_inode_update_finish.
// NOTE(review): lossy extraction — guards, `return`s, `else`s and closing
// braces are missing at several points (e.g. the declaration/assignment of
// `is_full`, and the wrapper around the max_size recalculation); confirm
// control flow against the upstream file.
void Server::handle_client_setattr(MDRequestRef& mdr)
  const cref_t<MClientRequest> &req = mdr->client_request;
  MutationImpl::LockOpVec lov;
  CInode *cur = rdlock_path_pin_ref(mdr, true);

  if (mdr->snapid != CEPH_NOSNAP) {
    // snapshots are immutable
    respond_to_request(mdr, -EROFS);
  if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
    // refuse setattr on reserved system inodes
    respond_to_request(mdr, -EPERM);

  __u32 mask = req->head.args.setattr.mask;
  __u32 access_mask = MAY_WRITE;

  // xlock/wrlock whatever the requested attributes cover
  if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
    lov.add_xlock(&cur->authlock);
  if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
    lov.add_xlock(&cur->filelock);
  if (mask & CEPH_SETATTR_CTIME)
    lov.add_wrlock(&cur->versionlock);

  if (!mds->locker->acquire_locks(mdr, lov))

  // an actual ownership change needs the corresponding extra permission
  if ((mask & CEPH_SETATTR_UID) && (cur->inode.uid != req->head.args.setattr.uid))
    access_mask |= MAY_CHOWN;

  if ((mask & CEPH_SETATTR_GID) && (cur->inode.gid != req->head.args.setattr.gid))
    access_mask |= MAY_CHGRP;

  if (!check_access(mdr, cur, access_mask))

  // trunc from bigger -> smaller?
  auto pip = cur->get_projected_inode();

  uint64_t old_size = std::max<uint64_t>(pip->size, req->head.args.setattr.old_size);

  // ENOSPC on growing file while full, but allow shrinks
  // NOTE(review): `is_full` is computed in a line dropped by the extraction
  if (is_full && req->head.args.setattr.size > old_size) {
    dout(20) << __func__ << ": full, responding ENOSPC to setattr with larger size" << dendl;
    respond_to_request(mdr, -ENOSPC);

  bool truncating_smaller = false;
  if (mask & CEPH_SETATTR_SIZE) {
    truncating_smaller = req->head.args.setattr.size < old_size;
    if (truncating_smaller && pip->is_truncating()) {
      // only one truncation can be in flight; wait for the previous one
      dout(10) << " waiting for pending truncate from " << pip->truncate_from
	       << " to " << pip->truncate_size << " to complete on " << *cur << dendl;
      mds->locker->drop_locks(mdr.get());
      mdr->drop_local_auth_pins();
      cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));

  bool changed_ranges = false;

  // project update
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "setattr");
  mdlog->start_entry(le);

  auto &pi = cur->project_inode();

  if (mask & CEPH_SETATTR_UID)
    pi.inode.uid = req->head.args.setattr.uid;
  if (mask & CEPH_SETATTR_GID)
    pi.inode.gid = req->head.args.setattr.gid;

  if (mask & CEPH_SETATTR_MODE)
    pi.inode.mode = (pi.inode.mode & ~07777) | (req->head.args.setattr.mode & 07777);
  else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
	    S_ISREG(pi.inode.mode) &&
	    (pi.inode.mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
    // chown/chgrp on an executable clears setuid/setgid, like POSIX chown(2)
    pi.inode.mode &= ~(S_ISUID|S_ISGID);

  if (mask & CEPH_SETATTR_MTIME)
    pi.inode.mtime = req->head.args.setattr.mtime;
  if (mask & CEPH_SETATTR_ATIME)
    pi.inode.atime = req->head.args.setattr.atime;
  if (mask & CEPH_SETATTR_BTIME)
    pi.inode.btime = req->head.args.setattr.btime;
  if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
    pi.inode.time_warp_seq++;   // maybe not a timewarp, but still a serialization point.
  if (mask & CEPH_SETATTR_SIZE) {
    if (truncating_smaller) {
      pi.inode.truncate(old_size, req->head.args.setattr.size);
      le->metablob.add_truncate_start(cur->ino());
      // growing (or equal): just set the size, no data purge needed
      pi.inode.size = req->head.args.setattr.size;
      pi.inode.rstat.rbytes = pi.inode.size;
    pi.inode.mtime = mdr->get_op_stamp();

    // adjust client's max_size?
    CInode::mempool_inode::client_range_map new_ranges;
    bool max_increased = false;
    mds->locker->calc_new_client_ranges(cur, pi.inode.size, true, &new_ranges, &max_increased);
    if (pi.inode.client_ranges != new_ranges) {
      dout(10) << " client_ranges " << pi.inode.client_ranges << " -> " << new_ranges << dendl;
      pi.inode.client_ranges = new_ranges;
      changed_ranges = true;

  pi.inode.version = cur->pre_dirty();
  pi.inode.ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
    pi.inode.rstat.rctime = mdr->get_op_stamp();
  pi.inode.change_attr++;

  // log + wait
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);

  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
								   truncating_smaller, changed_ranges));

  // flush immediately if there are readers/writers waiting
  if (mdr->is_xlocked(&cur->filelock) &&
      (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
    mds->mdlog->flush();
/* Takes responsibility for mdr */
// Helper for handle_client_open() when O_TRUNC is set: issue caps for the
// open, project a truncate-to-zero of the inode, and journal both in a single
// "open_truncate" EUpdate finished by C_MDS_inode_update_finish.
// NOTE(review): lossy extraction — the `CDentry *dn = 0;` declaration and the
// trailing arguments/brace of the journal_and_reply() call (and the
// mdlog->flush() implied by the closing comment) are missing here.
void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
  CInode *in = mdr->in[0];
  client_t client = mdr->get_client();

  dout(10) << "do_open_truncate " << *in << dendl;

  SnapRealm *realm = in->find_snaprealm();
  Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr, realm);

  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "open_truncate");
  mdlog->start_entry(le);

  // prepare
  auto &pi = in->project_inode();
  pi.inode.version = in->pre_dirty();
  pi.inode.mtime = pi.inode.ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
    pi.inode.rstat.rctime = mdr->get_op_stamp();
  pi.inode.change_attr++;

  // client's old_size guards against a racing extend it hasn't reported yet
  uint64_t old_size = std::max<uint64_t>(pi.inode.size, mdr->client_request->head.args.open.old_size);
  pi.inode.truncate(old_size, 0);
  le->metablob.add_truncate_start(in->ino());

  bool changed_ranges = false;
  if (cap && (cmode & CEPH_FILE_MODE_WR)) {
    // seed the writable client range at one layout increment
    pi.inode.client_ranges[client].range.first = 0;
    pi.inode.client_ranges[client].range.last = pi.inode.get_layout_size_increment();
    pi.inode.client_ranges[client].follows = realm->get_newest_seq();
    changed_ranges = true;
    cap->mark_clientwriteable();

  le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());

  mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);

  // make sure ino gets into the journal
  le->metablob.add_opened_ino(in->ino());

  mdr->o_trunc = true;

  if (mdr->client_request->get_dentry_wanted()) {
    ceph_assert(mdr->dn[0].size());
    dn = mdr->dn[0].back();

  journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
  // NOTE(review): remaining arguments of this call were dropped by the extraction
  // Although the `open` part can give an early reply, the truncation won't
  // happen until our EUpdate is persistent, to give the client a prompt
  // response we must also flush that event.
5089 /* This function cleans up the passed mdr */
5090 void Server::handle_client_setlayout(MDRequestRef
& mdr
)
5092 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5093 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
5096 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5097 respond_to_request(mdr
, -EROFS
);
5100 if (!cur
->is_file()) {
5101 respond_to_request(mdr
, -EINVAL
);
5104 if (cur
->get_projected_inode()->size
||
5105 cur
->get_projected_inode()->truncate_seq
> 1) {
5106 respond_to_request(mdr
, -ENOTEMPTY
);
5111 file_layout_t layout
= cur
->get_projected_inode()->layout
;
5112 // save existing layout for later
5113 const auto old_layout
= layout
;
5115 int access
= MAY_WRITE
;
5117 if (req
->head
.args
.setlayout
.layout
.fl_object_size
> 0)
5118 layout
.object_size
= req
->head
.args
.setlayout
.layout
.fl_object_size
;
5119 if (req
->head
.args
.setlayout
.layout
.fl_stripe_unit
> 0)
5120 layout
.stripe_unit
= req
->head
.args
.setlayout
.layout
.fl_stripe_unit
;
5121 if (req
->head
.args
.setlayout
.layout
.fl_stripe_count
> 0)
5122 layout
.stripe_count
=req
->head
.args
.setlayout
.layout
.fl_stripe_count
;
5123 if (req
->head
.args
.setlayout
.layout
.fl_pg_pool
> 0) {
5124 layout
.pool_id
= req
->head
.args
.setlayout
.layout
.fl_pg_pool
;
5126 // make sure we have as new a map as the client
5127 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
5128 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
5133 // Don't permit layout modifications without 'p' caps
5134 if (layout
!= old_layout
) {
5135 access
|= MAY_SET_VXATTR
;
5138 if (!layout
.is_valid()) {
5139 dout(10) << "bad layout" << dendl
;
5140 respond_to_request(mdr
, -EINVAL
);
5143 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
5144 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
5145 respond_to_request(mdr
, -EINVAL
);
5149 MutationImpl::LockOpVec lov
;
5150 lov
.add_xlock(&cur
->filelock
);
5151 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5154 if (!check_access(mdr
, cur
, access
))
5158 auto &pi
= cur
->project_inode();
5159 pi
.inode
.layout
= layout
;
5160 // add the old pool to the inode
5161 pi
.inode
.add_old_pool(old_layout
.pool_id
);
5162 pi
.inode
.version
= cur
->pre_dirty();
5163 pi
.inode
.ctime
= mdr
->get_op_stamp();
5164 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
5165 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
5166 pi
.inode
.change_attr
++;
5169 mdr
->ls
= mdlog
->get_current_segment();
5170 EUpdate
*le
= new EUpdate(mdlog
, "setlayout");
5171 mdlog
->start_entry(le
);
5172 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5173 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5174 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5176 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
5179 bool Server::xlock_policylock(MDRequestRef
& mdr
, CInode
*in
, bool want_layout
, bool xlock_snaplock
)
5181 if (mdr
->locking_state
& MutationImpl::ALL_LOCKED
)
5184 MutationImpl::LockOpVec lov
;
5185 lov
.add_xlock(&in
->policylock
);
5187 lov
.add_xlock(&in
->snaplock
);
5189 lov
.add_rdlock(&in
->snaplock
);
5190 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5193 if (want_layout
&& in
->get_projected_inode()->has_layout()) {
5194 mdr
->dir_layout
= in
->get_projected_inode()->layout
;
5195 want_layout
= false;
5197 if (CDentry
*pdn
= in
->get_projected_parent_dn(); pdn
) {
5198 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
, 0, want_layout
))
5202 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
5206 CInode
* Server::try_get_auth_inode(MDRequestRef
& mdr
, inodeno_t ino
)
5208 CInode
*in
= mdcache
->get_inode(ino
);
5209 if (!in
|| in
->state_test(CInode::STATE_PURGING
)) {
5210 respond_to_request(mdr
, -ESTALE
);
5213 if (!in
->is_auth()) {
5214 mdcache
->request_forward(mdr
, in
->authority().first
);
5221 void Server::handle_client_setdirlayout(MDRequestRef
& mdr
)
5223 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5225 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5226 CInode
*cur
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
5230 if (!cur
->is_dir()) {
5231 respond_to_request(mdr
, -ENOTDIR
);
5235 if (!xlock_policylock(mdr
, cur
, true))
5239 const auto old_pi
= cur
->get_projected_inode();
5240 file_layout_t layout
;
5241 if (old_pi
->has_layout())
5242 layout
= old_pi
->layout
;
5243 else if (mdr
->dir_layout
!= file_layout_t())
5244 layout
= mdr
->dir_layout
;
5246 layout
= mdcache
->default_file_layout
;
5248 // Level of access required to complete
5249 int access
= MAY_WRITE
;
5251 const auto old_layout
= layout
;
5253 if (req
->head
.args
.setlayout
.layout
.fl_object_size
> 0)
5254 layout
.object_size
= req
->head
.args
.setlayout
.layout
.fl_object_size
;
5255 if (req
->head
.args
.setlayout
.layout
.fl_stripe_unit
> 0)
5256 layout
.stripe_unit
= req
->head
.args
.setlayout
.layout
.fl_stripe_unit
;
5257 if (req
->head
.args
.setlayout
.layout
.fl_stripe_count
> 0)
5258 layout
.stripe_count
=req
->head
.args
.setlayout
.layout
.fl_stripe_count
;
5259 if (req
->head
.args
.setlayout
.layout
.fl_pg_pool
> 0) {
5260 layout
.pool_id
= req
->head
.args
.setlayout
.layout
.fl_pg_pool
;
5261 // make sure we have as new a map as the client
5262 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
5263 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
5268 if (layout
!= old_layout
) {
5269 access
|= MAY_SET_VXATTR
;
5272 if (!layout
.is_valid()) {
5273 dout(10) << "bad layout" << dendl
;
5274 respond_to_request(mdr
, -EINVAL
);
5277 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
5278 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
5279 respond_to_request(mdr
, -EINVAL
);
5283 if (!check_access(mdr
, cur
, access
))
5286 auto &pi
= cur
->project_inode();
5287 pi
.inode
.layout
= layout
;
5288 pi
.inode
.version
= cur
->pre_dirty();
5291 mdr
->ls
= mdlog
->get_current_segment();
5292 EUpdate
*le
= new EUpdate(mdlog
, "setlayout");
5293 mdlog
->start_entry(le
);
5294 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5295 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5296 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5298 mdr
->no_early_reply
= true;
5299 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
5304 int Server::parse_layout_vxattr(string name
, string value
, const OSDMap
& osdmap
,
5305 file_layout_t
*layout
, bool validate
)
5307 dout(20) << "parse_layout_vxattr name " << name
<< " value '" << value
<< "'" << dendl
;
5309 if (name
== "layout") {
5310 string::iterator begin
= value
.begin();
5311 string::iterator end
= value
.end();
5312 keys_and_values
<string::iterator
> p
; // create instance of parser
5313 std::map
<string
, string
> m
; // map to receive results
5314 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
5317 string
left(begin
, end
);
5318 dout(10) << " parsed " << m
<< " left '" << left
<< "'" << dendl
;
5321 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
5322 // Skip validation on each attr, we do it once at the end (avoid
5323 // rejecting intermediate states if the overall result is ok)
5324 int r
= parse_layout_vxattr(string("layout.") + q
->first
, q
->second
,
5325 osdmap
, layout
, false);
5329 } else if (name
== "layout.object_size") {
5330 layout
->object_size
= boost::lexical_cast
<unsigned>(value
);
5331 } else if (name
== "layout.stripe_unit") {
5332 layout
->stripe_unit
= boost::lexical_cast
<unsigned>(value
);
5333 } else if (name
== "layout.stripe_count") {
5334 layout
->stripe_count
= boost::lexical_cast
<unsigned>(value
);
5335 } else if (name
== "layout.pool") {
5337 layout
->pool_id
= boost::lexical_cast
<unsigned>(value
);
5338 } catch (boost::bad_lexical_cast
const&) {
5339 int64_t pool
= osdmap
.lookup_pg_pool_name(value
);
5341 dout(10) << " unknown pool " << value
<< dendl
;
5344 layout
->pool_id
= pool
;
5346 } else if (name
== "layout.pool_namespace") {
5347 layout
->pool_ns
= value
;
5349 dout(10) << " unknown layout vxattr " << name
<< dendl
;
5352 } catch (boost::bad_lexical_cast
const&) {
5353 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
5357 if (validate
&& !layout
->is_valid()) {
5358 dout(10) << "bad layout" << dendl
;
5361 if (!mds
->mdsmap
->is_data_pool(layout
->pool_id
)) {
5362 dout(10) << " invalid data pool " << layout
->pool_id
<< dendl
;
5368 int Server::parse_quota_vxattr(string name
, string value
, quota_info_t
*quota
)
5370 dout(20) << "parse_quota_vxattr name " << name
<< " value '" << value
<< "'" << dendl
;
5372 if (name
== "quota") {
5373 string::iterator begin
= value
.begin();
5374 string::iterator end
= value
.end();
5376 // keep quota unchanged. (for create_quota_realm())
5379 keys_and_values
<string::iterator
> p
; // create instance of parser
5380 std::map
<string
, string
> m
; // map to receive results
5381 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
5384 string
left(begin
, end
);
5385 dout(10) << " parsed " << m
<< " left '" << left
<< "'" << dendl
;
5388 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
5389 int r
= parse_quota_vxattr(string("quota.") + q
->first
, q
->second
, quota
);
5393 } else if (name
== "quota.max_bytes") {
5394 int64_t q
= boost::lexical_cast
<int64_t>(value
);
5397 quota
->max_bytes
= q
;
5398 } else if (name
== "quota.max_files") {
5399 int64_t q
= boost::lexical_cast
<int64_t>(value
);
5402 quota
->max_files
= q
;
5404 dout(10) << " unknown quota vxattr " << name
<< dendl
;
5407 } catch (boost::bad_lexical_cast
const&) {
5408 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
5412 if (!quota
->is_valid()) {
5413 dout(10) << "bad quota" << dendl
;
5419 void Server::create_quota_realm(CInode
*in
)
5421 dout(10) << __func__
<< " " << *in
<< dendl
;
5423 auto req
= make_message
<MClientRequest
>(CEPH_MDS_OP_SETXATTR
);
5424 req
->set_filepath(filepath(in
->ino()));
5425 req
->set_string2("ceph.quota");
5426 // empty vxattr value
5427 req
->set_tid(mds
->issue_tid());
5429 mds
->send_message_mds(req
, in
->authority().first
);
5433 * Verify that the file layout attribute carried by client
5434 * is well-formatted.
5435 * Return 0 on success, otherwise this function takes
5436 * responsibility for the passed mdr.
5438 int Server::check_layout_vxattr(MDRequestRef
& mdr
,
5441 file_layout_t
*layout
)
5443 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5447 mds
->objecter
->with_osdmap([&](const OSDMap
& osdmap
) {
5448 r
= parse_layout_vxattr(name
, value
, osdmap
, layout
);
5449 epoch
= osdmap
.get_epoch();
5454 // we don't have the specified pool, make sure our map
5455 // is newer than or as new as the client.
5456 epoch_t req_epoch
= req
->get_osdmap_epoch();
5458 if (req_epoch
> epoch
) {
5460 // well, our map is older. consult mds.
5461 Context
*fin
= new C_IO_Wrapper(mds
, new C_MDS_RetryRequest(mdcache
, mdr
));
5463 if (!mds
->objecter
->wait_for_map(req_epoch
, fin
))
5464 return r
; // wait, fin will retry this request later
5468 // now we have at least as new a map as the client, try again.
5469 mds
->objecter
->with_osdmap([&](const OSDMap
& osdmap
) {
5470 r
= parse_layout_vxattr(name
, value
, osdmap
, layout
);
5471 epoch
= osdmap
.get_epoch();
5474 ceph_assert(epoch
>= req_epoch
); // otherwise wait_for_map() told a lie
5476 } else if (req_epoch
== 0 && !mdr
->waited_for_osdmap
) {
5478 // For compatibility with client w/ old code, we still need get the
5479 // latest map. One day if COMPACT_VERSION of MClientRequest >=3,
5480 // we can remove those code.
5481 mdr
->waited_for_osdmap
= true;
5482 mds
->objecter
->wait_for_latest_osdmap(new C_IO_Wrapper(
5483 mds
, new C_MDS_RetryRequest(mdcache
, mdr
)));
5493 respond_to_request(mdr
, r
);
5501 void Server::handle_set_vxattr(MDRequestRef
& mdr
, CInode
*cur
)
5503 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5504 string
name(req
->get_path2());
5505 bufferlist bl
= req
->get_data();
5506 string
value (bl
.c_str(), bl
.length());
5507 dout(10) << "handle_set_vxattr " << name
5508 << " val " << value
.length()
5509 << " bytes on " << *cur
5512 CInode::mempool_inode
*pip
= nullptr;
5515 if (!check_access(mdr
, cur
, MAY_SET_VXATTR
)) {
5519 bool new_realm
= false;
5520 if (name
.compare(0, 15, "ceph.dir.layout") == 0) {
5521 if (!cur
->is_dir()) {
5522 respond_to_request(mdr
, -EINVAL
);
5526 if (!xlock_policylock(mdr
, cur
, true))
5529 file_layout_t layout
;
5530 if (cur
->get_projected_inode()->has_layout())
5531 layout
= cur
->get_projected_inode()->layout
;
5532 else if (mdr
->dir_layout
!= file_layout_t())
5533 layout
= mdr
->dir_layout
;
5535 layout
= mdcache
->default_file_layout
;
5537 rest
= name
.substr(name
.find("layout"));
5538 if (check_layout_vxattr(mdr
, rest
, value
, &layout
) < 0)
5541 auto &pi
= cur
->project_inode();
5542 pi
.inode
.layout
= layout
;
5543 mdr
->no_early_reply
= true;
5545 } else if (name
.compare(0, 16, "ceph.file.layout") == 0) {
5546 if (!cur
->is_file()) {
5547 respond_to_request(mdr
, -EINVAL
);
5550 if (cur
->get_projected_inode()->size
||
5551 cur
->get_projected_inode()->truncate_seq
> 1) {
5552 respond_to_request(mdr
, -ENOTEMPTY
);
5555 file_layout_t layout
= cur
->get_projected_inode()->layout
;
5556 rest
= name
.substr(name
.find("layout"));
5557 if (check_layout_vxattr(mdr
, rest
, value
, &layout
) < 0)
5560 MutationImpl::LockOpVec lov
;
5561 lov
.add_xlock(&cur
->filelock
);
5562 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5565 auto &pi
= cur
->project_inode();
5566 int64_t old_pool
= pi
.inode
.layout
.pool_id
;
5567 pi
.inode
.add_old_pool(old_pool
);
5568 pi
.inode
.layout
= layout
;
5570 } else if (name
.compare(0, 10, "ceph.quota") == 0) {
5571 if (!cur
->is_dir() || cur
->is_root()) {
5572 respond_to_request(mdr
, -EINVAL
);
5576 quota_info_t quota
= cur
->get_projected_inode()->quota
;
5578 rest
= name
.substr(name
.find("quota"));
5579 int r
= parse_quota_vxattr(rest
, value
, "a
);
5581 respond_to_request(mdr
, r
);
5585 if (quota
.is_enable() && !cur
->get_projected_srnode())
5588 if (!xlock_policylock(mdr
, cur
, false, new_realm
))
5591 auto &pi
= cur
->project_inode(false, new_realm
);
5592 pi
.inode
.quota
= quota
;
5595 SnapRealm
*realm
= cur
->find_snaprealm();
5596 auto seq
= realm
->get_newest_seq();
5597 auto &newsnap
= *pi
.snapnode
;
5598 newsnap
.created
= seq
;
5601 mdr
->no_early_reply
= true;
5604 client_t exclude_ct
= mdr
->get_client();
5605 mdcache
->broadcast_quota_to_client(cur
, exclude_ct
, true);
5606 } else if (name
== "ceph.dir.pin"sv
) {
5607 if (!cur
->is_dir() || cur
->is_root()) {
5608 respond_to_request(mdr
, -EINVAL
);
5614 rank
= boost::lexical_cast
<mds_rank_t
>(value
);
5615 if (rank
< 0) rank
= MDS_RANK_NONE
;
5616 } catch (boost::bad_lexical_cast
const&) {
5617 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
5618 respond_to_request(mdr
, -EINVAL
);
5622 if (!xlock_policylock(mdr
, cur
))
5625 auto &pi
= cur
->project_inode();
5626 cur
->set_export_pin(rank
);
5628 } else if (name
== "ceph.dir.pin.random"sv
) {
5629 if (!cur
->is_dir() || cur
->is_root()) {
5630 respond_to_request(mdr
, -EINVAL
);
5636 val
= boost::lexical_cast
<double>(value
);
5637 } catch (boost::bad_lexical_cast
const&) {
5638 dout(10) << "bad vxattr value, unable to parse float for " << name
<< dendl
;
5639 respond_to_request(mdr
, -EINVAL
);
5643 if (val
< 0.0 || 1.0 < val
) {
5644 respond_to_request(mdr
, -EDOM
);
5646 } else if (mdcache
->export_ephemeral_random_max
< val
) {
5647 respond_to_request(mdr
, -EINVAL
);
5651 if (!xlock_policylock(mdr
, cur
))
5654 auto &pi
= cur
->project_inode();
5655 cur
->setxattr_ephemeral_rand(val
);
5657 } else if (name
== "ceph.dir.pin.distributed"sv
) {
5658 if (!cur
->is_dir() || cur
->is_root()) {
5659 respond_to_request(mdr
, -EINVAL
);
5665 val
= boost::lexical_cast
<bool>(value
);
5666 } catch (boost::bad_lexical_cast
const&) {
5667 dout(10) << "bad vxattr value, unable to parse bool for " << name
<< dendl
;
5668 respond_to_request(mdr
, -EINVAL
);
5672 if (!xlock_policylock(mdr
, cur
))
5675 auto &pi
= cur
->project_inode();
5676 cur
->setxattr_ephemeral_dist(val
);
5679 dout(10) << " unknown vxattr " << name
<< dendl
;
5680 respond_to_request(mdr
, -EINVAL
);
5685 pip
->ctime
= mdr
->get_op_stamp();
5686 if (mdr
->get_op_stamp() > pip
->rstat
.rctime
)
5687 pip
->rstat
.rctime
= mdr
->get_op_stamp();
5688 pip
->version
= cur
->pre_dirty();
5690 pip
->update_backtrace();
5693 mdr
->ls
= mdlog
->get_current_segment();
5694 EUpdate
*le
= new EUpdate(mdlog
, "set vxattr layout");
5695 mdlog
->start_entry(le
);
5696 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5697 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5698 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5700 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
,
5701 false, false, new_realm
));
5705 void Server::handle_remove_vxattr(MDRequestRef
& mdr
, CInode
*cur
)
5707 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5708 string
name(req
->get_path2());
5710 dout(10) << __func__
<< " " << name
<< " on " << *cur
<< dendl
;
5712 if (name
== "ceph.dir.layout") {
5713 if (!cur
->is_dir()) {
5714 respond_to_request(mdr
, -ENODATA
);
5717 if (cur
->is_root()) {
5718 dout(10) << "can't remove layout policy on the root directory" << dendl
;
5719 respond_to_request(mdr
, -EINVAL
);
5723 if (!cur
->get_projected_inode()->has_layout()) {
5724 respond_to_request(mdr
, -ENODATA
);
5728 MutationImpl::LockOpVec lov
;
5729 lov
.add_xlock(&cur
->policylock
);
5730 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5733 auto &pi
= cur
->project_inode();
5734 pi
.inode
.clear_layout();
5735 pi
.inode
.version
= cur
->pre_dirty();
5738 mdr
->ls
= mdlog
->get_current_segment();
5739 EUpdate
*le
= new EUpdate(mdlog
, "remove dir layout vxattr");
5740 mdlog
->start_entry(le
);
5741 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5742 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5743 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5745 mdr
->no_early_reply
= true;
5746 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
5748 } else if (name
== "ceph.dir.layout.pool_namespace"
5749 || name
== "ceph.file.layout.pool_namespace") {
5750 // Namespace is the only layout field that has a meaningful
5751 // null/none value (empty string, means default layout). Is equivalent
5752 // to a setxattr with empty string: pass through the empty payload of
5753 // the rmxattr request to do this.
5754 handle_set_vxattr(mdr
, cur
);
5758 respond_to_request(mdr
, -ENODATA
);
5761 class C_MDS_inode_xattr_update_finish
: public ServerLogContext
{
5765 C_MDS_inode_xattr_update_finish(Server
*s
, MDRequestRef
& r
, CInode
*i
) :
5766 ServerLogContext(s
, r
), in(i
) { }
5767 void finish(int r
) override
{
5768 ceph_assert(r
== 0);
5771 in
->pop_and_dirty_projected_inode(mdr
->ls
);
5775 get_mds()->balancer
->hit_inode(in
, META_POP_IWR
);
5777 server
->respond_to_request(mdr
, 0);
5781 void Server::handle_client_setxattr(MDRequestRef
& mdr
)
5783 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5784 string
name(req
->get_path2());
5786 // magic ceph.* namespace?
5787 if (name
.compare(0, 5, "ceph.") == 0) {
5788 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5789 CInode
*cur
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
5793 handle_set_vxattr(mdr
, cur
);
5797 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
5801 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5802 respond_to_request(mdr
, -EROFS
);
5806 int flags
= req
->head
.args
.setxattr
.flags
;
5808 MutationImpl::LockOpVec lov
;
5809 lov
.add_xlock(&cur
->xattrlock
);
5810 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5813 if (!check_access(mdr
, cur
, MAY_WRITE
))
5816 auto pxattrs
= cur
->get_projected_xattrs();
5817 size_t len
= req
->get_data().length();
5818 size_t inc
= len
+ name
.length();
5820 // check xattrs kv pairs size
5821 size_t cur_xattrs_size
= 0;
5822 for (const auto& p
: *pxattrs
) {
5823 if ((flags
& CEPH_XATTR_REPLACE
) && (name
.compare(p
.first
) == 0)) {
5826 cur_xattrs_size
+= p
.first
.length() + p
.second
.length();
5829 if (((cur_xattrs_size
+ inc
) > g_conf()->mds_max_xattr_pairs_size
)) {
5830 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
5831 << cur_xattrs_size
<< ", inc " << inc
<< dendl
;
5832 respond_to_request(mdr
, -ENOSPC
);
5836 if ((flags
& CEPH_XATTR_CREATE
) && pxattrs
->count(mempool::mds_co::string(name
))) {
5837 dout(10) << "setxattr '" << name
<< "' XATTR_CREATE and EEXIST on " << *cur
<< dendl
;
5838 respond_to_request(mdr
, -EEXIST
);
5841 if ((flags
& CEPH_XATTR_REPLACE
) && !pxattrs
->count(mempool::mds_co::string(name
))) {
5842 dout(10) << "setxattr '" << name
<< "' XATTR_REPLACE and ENODATA on " << *cur
<< dendl
;
5843 respond_to_request(mdr
, -ENODATA
);
5847 dout(10) << "setxattr '" << name
<< "' len " << len
<< " on " << *cur
<< dendl
;
5850 auto &pi
= cur
->project_inode(true);
5851 pi
.inode
.version
= cur
->pre_dirty();
5852 pi
.inode
.ctime
= mdr
->get_op_stamp();
5853 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
5854 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
5855 pi
.inode
.change_attr
++;
5856 pi
.inode
.xattr_version
++;
5857 auto &px
= *pi
.xattrs
;
5858 if ((flags
& CEPH_XATTR_REMOVE
)) {
5859 px
.erase(mempool::mds_co::string(name
));
5861 bufferptr b
= buffer::create(len
);
5863 req
->get_data().begin().copy(len
, b
.c_str());
5864 auto em
= px
.emplace(std::piecewise_construct
, std::forward_as_tuple(mempool::mds_co::string(name
)), std::forward_as_tuple(b
));
5866 em
.first
->second
= b
;
5870 mdr
->ls
= mdlog
->get_current_segment();
5871 EUpdate
*le
= new EUpdate(mdlog
, "setxattr");
5872 mdlog
->start_entry(le
);
5873 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5874 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5875 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5877 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
5880 void Server::handle_client_removexattr(MDRequestRef
& mdr
)
5882 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5883 std::string
name(req
->get_path2());
5885 if (name
.compare(0, 5, "ceph.") == 0) {
5886 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5887 CInode
*cur
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
5891 handle_remove_vxattr(mdr
, cur
);
5895 CInode
* cur
= rdlock_path_pin_ref(mdr
, true);
5899 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5900 respond_to_request(mdr
, -EROFS
);
5904 MutationImpl::LockOpVec lov
;
5905 lov
.add_xlock(&cur
->xattrlock
);
5906 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5909 auto pxattrs
= cur
->get_projected_xattrs();
5910 if (pxattrs
->count(mempool::mds_co::string(name
)) == 0) {
5911 dout(10) << "removexattr '" << name
<< "' and ENODATA on " << *cur
<< dendl
;
5912 respond_to_request(mdr
, -ENODATA
);
5916 dout(10) << "removexattr '" << name
<< "' on " << *cur
<< dendl
;
5919 auto &pi
= cur
->project_inode(true);
5920 auto &px
= *pi
.xattrs
;
5921 pi
.inode
.version
= cur
->pre_dirty();
5922 pi
.inode
.ctime
= mdr
->get_op_stamp();
5923 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
5924 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
5925 pi
.inode
.change_attr
++;
5926 pi
.inode
.xattr_version
++;
5927 px
.erase(mempool::mds_co::string(name
));
5930 mdr
->ls
= mdlog
->get_current_segment();
5931 EUpdate
*le
= new EUpdate(mdlog
, "removexattr");
5932 mdlog
->start_entry(le
);
5933 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5934 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5935 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5937 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
5941 // =================================================================
5942 // DIRECTORY and NAMESPACE OPS
5945 // ------------------------------------------------
5949 class C_MDS_mknod_finish
: public ServerLogContext
{
5953 C_MDS_mknod_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ni
) :
5954 ServerLogContext(s
, r
), dn(d
), newi(ni
) {}
5955 void finish(int r
) override
{
5956 ceph_assert(r
== 0);
5959 dn
->pop_projected_linkage();
5961 // be a bit hacky with the inode version, here.. we decrement it
5962 // just to keep mark_dirty() happen. (we didn't bother projecting
5963 // a new version of hte inode since it's just been created)
5964 newi
->inode
.version
--;
5965 newi
->mark_dirty(newi
->inode
.version
+ 1, mdr
->ls
);
5966 newi
->mark_dirty_parent(mdr
->ls
, true);
5969 if (newi
->inode
.is_dir()) {
5970 CDir
*dir
= newi
->get_dirfrag(frag_t());
5972 dir
->fnode
.version
--;
5973 dir
->mark_dirty(dir
->fnode
.version
+ 1, mdr
->ls
);
5974 dir
->mark_new(mdr
->ls
);
5979 MDRequestRef null_ref
;
5980 get_mds()->mdcache
->send_dentry_link(dn
, null_ref
);
5982 if (newi
->inode
.is_file()) {
5983 get_mds()->locker
->share_inode_max_size(newi
);
5984 } else if (newi
->inode
.is_dir()) {
5985 // We do this now so that the linkages on the new directory are stable.
5986 newi
->maybe_ephemeral_dist();
5987 newi
->maybe_ephemeral_rand(true);
5991 get_mds()->balancer
->hit_inode(newi
, META_POP_IWR
);
5994 server
->respond_to_request(mdr
, 0);
5999 void Server::handle_client_mknod(MDRequestRef
& mdr
)
6001 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6002 client_t client
= mdr
->get_client();
6004 unsigned mode
= req
->head
.args
.mknod
.mode
;
6005 if ((mode
& S_IFMT
) == 0)
6008 mdr
->disable_lock_cache();
6009 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, true, false, S_ISREG(mode
));
6013 CDir
*dir
= dn
->get_dir();
6014 CInode
*diri
= dir
->get_inode();
6015 if (!check_access(mdr
, diri
, MAY_WRITE
))
6017 if (!check_fragment_space(mdr
, dn
->get_dir()))
6021 file_layout_t layout
;
6022 if (mdr
->dir_layout
!= file_layout_t())
6023 layout
= mdr
->dir_layout
;
6025 layout
= mdcache
->default_file_layout
;
6027 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
), mode
, &layout
);
6030 dn
->push_projected_linkage(newi
);
6032 newi
->inode
.rdev
= req
->head
.args
.mknod
.rdev
;
6033 newi
->inode
.version
= dn
->pre_dirty();
6034 newi
->inode
.rstat
.rfiles
= 1;
6035 if (layout
.pool_id
!= mdcache
->default_file_layout
.pool_id
)
6036 newi
->inode
.add_old_pool(mdcache
->default_file_layout
.pool_id
);
6037 newi
->inode
.update_backtrace();
6039 snapid_t follows
= mdcache
->get_global_snaprealm()->get_newest_seq();
6040 SnapRealm
*realm
= dn
->get_dir()->inode
->find_snaprealm();
6041 ceph_assert(follows
>= realm
->get_newest_seq());
6043 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
6044 // want to write to it (e.g., if they are reexporting NFS)
6045 if (S_ISREG(newi
->inode
.mode
)) {
6046 // issue a cap on the file
6047 int cmode
= CEPH_FILE_MODE_RDWR
;
6048 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
, realm
);
6052 // put locks in excl mode
6053 newi
->filelock
.set_state(LOCK_EXCL
);
6054 newi
->authlock
.set_state(LOCK_EXCL
);
6055 newi
->xattrlock
.set_state(LOCK_EXCL
);
6057 dout(15) << " setting a client_range too, since this is a regular file" << dendl
;
6058 newi
->inode
.client_ranges
[client
].range
.first
= 0;
6059 newi
->inode
.client_ranges
[client
].range
.last
= newi
->inode
.layout
.stripe_unit
;
6060 newi
->inode
.client_ranges
[client
].follows
= follows
;
6061 cap
->mark_clientwriteable();
6065 ceph_assert(dn
->first
== follows
+ 1);
6066 newi
->first
= dn
->first
;
6068 dout(10) << "mknod mode " << newi
->inode
.mode
<< " rdev " << newi
->inode
.rdev
<< dendl
;
6071 mdr
->ls
= mdlog
->get_current_segment();
6072 EUpdate
*le
= new EUpdate(mdlog
, "mknod");
6073 mdlog
->start_entry(le
);
6074 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6075 journal_allocated_inos(mdr
, &le
->metablob
);
6077 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(),
6078 PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
6079 le
->metablob
.add_primary_dentry(dn
, newi
, true, true, true);
6081 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
6082 mds
->balancer
->maybe_fragment(dn
->get_dir(), false);
6088 /* This function takes responsibility for the passed mdr*/
6089 void Server::handle_client_mkdir(MDRequestRef
& mdr
)
6091 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6093 mdr
->disable_lock_cache();
6094 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, true);
6098 CDir
*dir
= dn
->get_dir();
6099 CInode
*diri
= dir
->get_inode();
6101 // mkdir check access
6102 if (!check_access(mdr
, diri
, MAY_WRITE
))
6105 if (!check_fragment_space(mdr
, dir
))
6109 unsigned mode
= req
->head
.args
.mkdir
.mode
;
6112 CInode
*newi
= prepare_new_inode(mdr
, dir
, inodeno_t(req
->head
.ino
), mode
);
6115 // it's a directory.
6116 dn
->push_projected_linkage(newi
);
6118 newi
->inode
.version
= dn
->pre_dirty();
6119 newi
->inode
.rstat
.rsubdirs
= 1;
6120 newi
->inode
.update_backtrace();
6122 snapid_t follows
= mdcache
->get_global_snaprealm()->get_newest_seq();
6123 SnapRealm
*realm
= dn
->get_dir()->inode
->find_snaprealm();
6124 ceph_assert(follows
>= realm
->get_newest_seq());
6126 dout(12) << " follows " << follows
<< dendl
;
6127 ceph_assert(dn
->first
== follows
+ 1);
6128 newi
->first
= dn
->first
;
6130 // ...and that new dir is empty.
6131 CDir
*newdir
= newi
->get_or_open_dirfrag(mdcache
, frag_t());
6132 newdir
->state_set(CDir::STATE_CREATING
);
6133 newdir
->mark_complete();
6134 newdir
->fnode
.version
= newdir
->pre_dirty();
6137 mdr
->ls
= mdlog
->get_current_segment();
6138 EUpdate
*le
= new EUpdate(mdlog
, "mkdir");
6139 mdlog
->start_entry(le
);
6140 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6141 journal_allocated_inos(mdr
, &le
->metablob
);
6142 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
6143 le
->metablob
.add_primary_dentry(dn
, newi
, true, true);
6144 le
->metablob
.add_new_dir(newdir
); // dirty AND complete AND new
6146 // issue a cap on the directory
6147 int cmode
= CEPH_FILE_MODE_RDWR
;
6148 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
, realm
);
6152 // put locks in excl mode
6153 newi
->filelock
.set_state(LOCK_EXCL
);
6154 newi
->authlock
.set_state(LOCK_EXCL
);
6155 newi
->xattrlock
.set_state(LOCK_EXCL
);
6158 // make sure this inode gets into the journal
6159 le
->metablob
.add_opened_ino(newi
->ino());
6161 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
6163 // We hit_dir (via hit_inode) in our finish callback, but by then we might
6164 // have overshot the split size (multiple mkdir in flight), so here is
6165 // an early chance to split the dir if this mkdir makes it oversized.
6166 mds
->balancer
->maybe_fragment(dir
, false);
6172 void Server::handle_client_symlink(MDRequestRef
& mdr
)
6174 mdr
->disable_lock_cache();
6175 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, true);
6179 CDir
*dir
= dn
->get_dir();
6180 CInode
*diri
= dir
->get_inode();
6182 if (!check_access(mdr
, diri
, MAY_WRITE
))
6184 if (!check_fragment_space(mdr
, dir
))
6187 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6189 unsigned mode
= S_IFLNK
| 0777;
6190 CInode
*newi
= prepare_new_inode(mdr
, dir
, inodeno_t(req
->head
.ino
), mode
);
6194 dn
->push_projected_linkage(newi
);
6196 newi
->symlink
= req
->get_path2();
6197 newi
->inode
.size
= newi
->symlink
.length();
6198 newi
->inode
.rstat
.rbytes
= newi
->inode
.size
;
6199 newi
->inode
.rstat
.rfiles
= 1;
6200 newi
->inode
.version
= dn
->pre_dirty();
6201 newi
->inode
.update_backtrace();
6203 newi
->first
= dn
->first
;
6206 mdr
->ls
= mdlog
->get_current_segment();
6207 EUpdate
*le
= new EUpdate(mdlog
, "symlink");
6208 mdlog
->start_entry(le
);
6209 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6210 journal_allocated_inos(mdr
, &le
->metablob
);
6211 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
6212 le
->metablob
.add_primary_dentry(dn
, newi
, true, true);
6214 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
6215 mds
->balancer
->maybe_fragment(dir
, false);
6224 void Server::handle_client_link(MDRequestRef
& mdr
)
6226 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6228 dout(7) << "handle_client_link " << req
->get_filepath()
6229 << " to " << req
->get_filepath2()
6232 mdr
->disable_lock_cache();
6237 if (req
->get_filepath2().depth() == 0) {
6238 targeti
= mdcache
->get_inode(req
->get_filepath2().get_ino());
6240 dout(10) << "ESTALE on path2, attempting recovery" << dendl
;
6241 mdcache
->find_ino_peers(req
->get_filepath2().get_ino(), new C_MDS_TryFindInode(this, mdr
));
6246 if (!(mdr
->locking_state
& MutationImpl::SNAP2_LOCKED
)) {
6247 CDentry
*pdn
= targeti
->get_projected_parent_dn();
6249 dout(7) << "target has no parent dn, failing..." << dendl
;
6250 respond_to_request(mdr
, -EINVAL
);
6253 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
, 1))
6255 mdr
->locking_state
|= MutationImpl::SNAP2_LOCKED
;
6258 destdn
= rdlock_path_xlock_dentry(mdr
, false);
6263 auto ret
= rdlock_two_paths_xlock_destdn(mdr
, false);
6268 if (!destdn
->get_projected_linkage()->is_null()) {
6269 respond_to_request(mdr
, -EEXIST
);
6273 targeti
= ret
.second
->get_projected_linkage()->get_inode();
6276 if (targeti
->is_dir()) {
6277 dout(7) << "target is a dir, failing..." << dendl
;
6278 respond_to_request(mdr
, -EINVAL
);
6282 CDir
*dir
= destdn
->get_dir();
6283 dout(7) << "handle_client_link link " << destdn
->get_name() << " in " << *dir
<< dendl
;
6284 dout(7) << "target is " << *targeti
<< dendl
;
6286 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
6287 MutationImpl::LockOpVec lov
;
6288 lov
.add_xlock(&targeti
->snaplock
);
6289 lov
.add_xlock(&targeti
->linklock
);
6291 if (!mds
->locker
->acquire_locks(mdr
, lov
))
6294 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
6297 if (targeti
->get_projected_inode()->nlink
== 0) {
6298 dout(7) << "target has no link, failing..." << dendl
;
6299 respond_to_request(mdr
, -ENOENT
);
6302 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
6303 if (!check_access(mdr
, targeti
, MAY_WRITE
))
6306 if (!check_access(mdr
, dir
->get_inode(), MAY_WRITE
))
6309 if (!check_fragment_space(mdr
, dir
))
6314 ceph_assert(g_conf()->mds_kill_link_at
!= 1);
6317 if (targeti
->is_auth())
6318 _link_local(mdr
, destdn
, targeti
);
6320 _link_remote(mdr
, true, destdn
, targeti
);
6321 mds
->balancer
->maybe_fragment(dir
, false);
6325 class C_MDS_link_local_finish
: public ServerLogContext
{
6332 C_MDS_link_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ti
,
6333 version_t dnpv_
, version_t tipv_
, bool ar
) :
6334 ServerLogContext(s
, r
), dn(d
), targeti(ti
),
6335 dnpv(dnpv_
), tipv(tipv_
), adjust_realm(ar
) { }
6336 void finish(int r
) override
{
6337 ceph_assert(r
== 0);
6338 server
->_link_local_finish(mdr
, dn
, targeti
, dnpv
, tipv
, adjust_realm
);
6343 void Server::_link_local(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
)
6345 dout(10) << "_link_local " << *dn
<< " to " << *targeti
<< dendl
;
6347 mdr
->ls
= mdlog
->get_current_segment();
6349 // predirty NEW dentry
6350 version_t dnpv
= dn
->pre_dirty();
6351 version_t tipv
= targeti
->pre_dirty();
6353 // project inode update
6354 auto &pi
= targeti
->project_inode();
6356 pi
.inode
.ctime
= mdr
->get_op_stamp();
6357 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
6358 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
6359 pi
.inode
.change_attr
++;
6360 pi
.inode
.version
= tipv
;
6362 bool adjust_realm
= false;
6363 if (!targeti
->is_projected_snaprealm_global()) {
6364 sr_t
*newsnap
= targeti
->project_snaprealm();
6365 targeti
->mark_snaprealm_global(newsnap
);
6366 targeti
->record_snaprealm_parent_dentry(newsnap
, NULL
, targeti
->get_projected_parent_dn(), true);
6367 adjust_realm
= true;
6371 EUpdate
*le
= new EUpdate(mdlog
, "link_local");
6372 mdlog
->start_entry(le
);
6373 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
6374 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1); // new dn
6375 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, 0, PREDIRTY_PRIMARY
); // targeti
6376 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
6377 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, targeti
);
6379 // do this after predirty_*, to avoid funky extra dnl arg
6380 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
6382 journal_and_reply(mdr
, targeti
, dn
, le
,
6383 new C_MDS_link_local_finish(this, mdr
, dn
, targeti
, dnpv
, tipv
, adjust_realm
));
6386 void Server::_link_local_finish(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
,
6387 version_t dnpv
, version_t tipv
, bool adjust_realm
)
6389 dout(10) << "_link_local_finish " << *dn
<< " to " << *targeti
<< dendl
;
6391 // link and unlock the NEW dentry
6392 CDentry::linkage_t
*dnl
= dn
->pop_projected_linkage();
6393 if (!dnl
->get_inode())
6394 dn
->link_remote(dnl
, targeti
);
6395 dn
->mark_dirty(dnpv
, mdr
->ls
);
6398 targeti
->pop_and_dirty_projected_inode(mdr
->ls
);
6402 MDRequestRef null_ref
;
6403 mdcache
->send_dentry_link(dn
, null_ref
);
6406 int op
= CEPH_SNAP_OP_SPLIT
;
6407 mds
->mdcache
->send_snap_update(targeti
, 0, op
);
6408 mds
->mdcache
->do_realm_invalidate_and_update_notify(targeti
, op
);
6411 // bump target popularity
6412 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
6413 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
6416 respond_to_request(mdr
, 0);
6420 // link / unlink remote
6422 class C_MDS_link_remote_finish
: public ServerLogContext
{
6428 C_MDS_link_remote_finish(Server
*s
, MDRequestRef
& r
, bool i
, CDentry
*d
, CInode
*ti
) :
6429 ServerLogContext(s
, r
), inc(i
), dn(d
), targeti(ti
),
6430 dpv(d
->get_projected_version()) {}
6431 void finish(int r
) override
{
6432 ceph_assert(r
== 0);
6433 server
->_link_remote_finish(mdr
, inc
, dn
, targeti
, dpv
);
6437 void Server::_link_remote(MDRequestRef
& mdr
, bool inc
, CDentry
*dn
, CInode
*targeti
)
6439 dout(10) << "_link_remote "
6440 << (inc
? "link ":"unlink ")
6441 << *dn
<< " to " << *targeti
<< dendl
;
6443 // 1. send LinkPrepare to dest (journal nlink++ prepare)
6444 mds_rank_t linkauth
= targeti
->authority().first
;
6445 if (mdr
->more()->witnessed
.count(linkauth
) == 0) {
6446 if (mds
->is_cluster_degraded() &&
6447 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(linkauth
)) {
6448 dout(10) << " targeti auth mds." << linkauth
<< " is not active" << dendl
;
6449 if (mdr
->more()->waiting_on_slave
.empty())
6450 mds
->wait_for_active_peer(linkauth
, new C_MDS_RetryRequest(mdcache
, mdr
));
6454 dout(10) << " targeti auth must prepare nlink++/--" << dendl
;
6457 op
= MMDSSlaveRequest::OP_LINKPREP
;
6459 op
= MMDSSlaveRequest::OP_UNLINKPREP
;
6460 auto req
= make_message
<MMDSSlaveRequest
>(mdr
->reqid
, mdr
->attempt
, op
);
6461 targeti
->set_object_info(req
->get_object_info());
6462 req
->op_stamp
= mdr
->get_op_stamp();
6463 if (auto& desti_srnode
= mdr
->more()->desti_srnode
)
6464 encode(*desti_srnode
, req
->desti_snapbl
);
6465 mds
->send_message_mds(req
, linkauth
);
6467 ceph_assert(mdr
->more()->waiting_on_slave
.count(linkauth
) == 0);
6468 mdr
->more()->waiting_on_slave
.insert(linkauth
);
6471 dout(10) << " targeti auth has prepared nlink++/--" << dendl
;
6473 ceph_assert(g_conf()->mds_kill_link_at
!= 2);
6475 if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
6476 delete desti_srnode
;
6477 desti_srnode
= NULL
;
6480 mdr
->set_mds_stamp(ceph_clock_now());
6483 mdr
->ls
= mdlog
->get_current_segment();
6484 EUpdate
*le
= new EUpdate(mdlog
, inc
? "link_remote":"unlink_remote");
6485 mdlog
->start_entry(le
);
6486 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
6487 if (!mdr
->more()->witnessed
.empty()) {
6488 dout(20) << " noting uncommitted_slaves " << mdr
->more()->witnessed
<< dendl
;
6489 le
->reqid
= mdr
->reqid
;
6490 le
->had_slaves
= true;
6491 mdcache
->add_uncommitted_master(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
6496 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1);
6497 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
6498 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
6501 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, -1);
6502 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
6503 le
->metablob
.add_null_dentry(dn
, true);
6504 dn
->push_projected_linkage();
6507 journal_and_reply(mdr
, (inc
? targeti
: nullptr), dn
, le
,
6508 new C_MDS_link_remote_finish(this, mdr
, inc
, dn
, targeti
));
6511 void Server::_link_remote_finish(MDRequestRef
& mdr
, bool inc
,
6512 CDentry
*dn
, CInode
*targeti
,
6515 dout(10) << "_link_remote_finish "
6516 << (inc
? "link ":"unlink ")
6517 << *dn
<< " to " << *targeti
<< dendl
;
6519 ceph_assert(g_conf()->mds_kill_link_at
!= 3);
6521 if (!mdr
->more()->witnessed
.empty())
6522 mdcache
->logged_master_update(mdr
->reqid
);
6525 // link the new dentry
6526 CDentry::linkage_t
*dnl
= dn
->pop_projected_linkage();
6527 if (!dnl
->get_inode())
6528 dn
->link_remote(dnl
, targeti
);
6529 dn
->mark_dirty(dpv
, mdr
->ls
);
6531 // unlink main dentry
6532 dn
->get_dir()->unlink_inode(dn
);
6533 dn
->pop_projected_linkage();
6534 dn
->mark_dirty(dn
->get_projected_version(), mdr
->ls
); // dirty old dentry
6539 MDRequestRef null_ref
;
6541 mdcache
->send_dentry_link(dn
, null_ref
);
6543 mdcache
->send_dentry_unlink(dn
, NULL
, null_ref
);
6545 // bump target popularity
6546 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
6547 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
6550 respond_to_request(mdr
, 0);
6553 // removing a new dn?
6554 dn
->get_dir()->try_remove_unlinked_dn(dn
);
6558 // remote linking/unlinking
6560 class C_MDS_SlaveLinkPrep
: public ServerLogContext
{
6564 C_MDS_SlaveLinkPrep(Server
*s
, MDRequestRef
& r
, CInode
*t
, bool ar
) :
6565 ServerLogContext(s
, r
), targeti(t
), adjust_realm(ar
) { }
6566 void finish(int r
) override
{
6567 ceph_assert(r
== 0);
6568 server
->_logged_slave_link(mdr
, targeti
, adjust_realm
);
6572 class C_MDS_SlaveLinkCommit
: public ServerContext
{
6576 C_MDS_SlaveLinkCommit(Server
*s
, MDRequestRef
& r
, CInode
*t
) :
6577 ServerContext(s
), mdr(r
), targeti(t
) { }
6578 void finish(int r
) override
{
6579 server
->_commit_slave_link(mdr
, r
, targeti
);
6583 void Server::handle_slave_link_prep(MDRequestRef
& mdr
)
6585 dout(10) << "handle_slave_link_prep " << *mdr
6586 << " on " << mdr
->slave_request
->get_object_info()
6589 ceph_assert(g_conf()->mds_kill_link_at
!= 4);
6591 CInode
*targeti
= mdcache
->get_inode(mdr
->slave_request
->get_object_info().ino
);
6592 ceph_assert(targeti
);
6593 dout(10) << "targeti " << *targeti
<< dendl
;
6594 CDentry
*dn
= targeti
->get_parent_dn();
6595 CDentry::linkage_t
*dnl
= dn
->get_linkage();
6596 ceph_assert(dnl
->is_primary());
6598 mdr
->set_op_stamp(mdr
->slave_request
->op_stamp
);
6600 mdr
->auth_pin(targeti
);
6602 //ceph_abort(); // test hack: make sure master can handle a slave that fails to prepare...
6603 ceph_assert(g_conf()->mds_kill_link_at
!= 5);
6606 mdr
->ls
= mdlog
->get_current_segment();
6607 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_link_prep", mdr
->reqid
, mdr
->slave_to_mds
,
6608 ESlaveUpdate::OP_PREPARE
, ESlaveUpdate::LINK
);
6609 mdlog
->start_entry(le
);
6611 auto &pi
= dnl
->get_inode()->project_inode();
6613 // update journaled target inode
6615 bool adjust_realm
= false;
6616 bool realm_projected
= false;
6617 if (mdr
->slave_request
->get_op() == MMDSSlaveRequest::OP_LINKPREP
) {
6620 if (!targeti
->is_projected_snaprealm_global()) {
6621 sr_t
*newsnap
= targeti
->project_snaprealm();
6622 targeti
->mark_snaprealm_global(newsnap
);
6623 targeti
->record_snaprealm_parent_dentry(newsnap
, NULL
, targeti
->get_projected_parent_dn(), true);
6624 adjust_realm
= true;
6625 realm_projected
= true;
6630 if (targeti
->is_projected_snaprealm_global()) {
6631 ceph_assert(mdr
->slave_request
->desti_snapbl
.length());
6632 auto p
= mdr
->slave_request
->desti_snapbl
.cbegin();
6634 sr_t
*newsnap
= targeti
->project_snaprealm();
6635 decode(*newsnap
, p
);
6637 if (pi
.inode
.nlink
== 0)
6638 ceph_assert(!newsnap
->is_parent_global());
6640 realm_projected
= true;
6642 ceph_assert(mdr
->slave_request
->desti_snapbl
.length() == 0);
6646 link_rollback rollback
;
6647 rollback
.reqid
= mdr
->reqid
;
6648 rollback
.ino
= targeti
->ino();
6649 rollback
.old_ctime
= targeti
->inode
.ctime
; // we hold versionlock xlock; no concorrent projections
6650 const fnode_t
*pf
= targeti
->get_parent_dn()->get_dir()->get_projected_fnode();
6651 rollback
.old_dir_mtime
= pf
->fragstat
.mtime
;
6652 rollback
.old_dir_rctime
= pf
->rstat
.rctime
;
6653 rollback
.was_inc
= inc
;
6654 if (realm_projected
) {
6655 if (targeti
->snaprealm
) {
6656 encode(true, rollback
.snapbl
);
6657 targeti
->encode_snap_blob(rollback
.snapbl
);
6659 encode(false, rollback
.snapbl
);
6662 encode(rollback
, le
->rollback
);
6663 mdr
->more()->rollback_bl
= le
->rollback
;
6665 pi
.inode
.ctime
= mdr
->get_op_stamp();
6666 pi
.inode
.version
= targeti
->pre_dirty();
6668 dout(10) << " projected inode " << pi
.inode
.ino
<< " v " << pi
.inode
.version
<< dendl
;
6671 mdcache
->predirty_journal_parents(mdr
, &le
->commit
, dnl
->get_inode(), 0, PREDIRTY_SHALLOW
|PREDIRTY_PRIMARY
);
6672 mdcache
->journal_dirty_inode(mdr
.get(), &le
->commit
, targeti
);
6673 mdcache
->add_uncommitted_slave(mdr
->reqid
, mdr
->ls
, mdr
->slave_to_mds
);
6675 // set up commit waiter
6676 mdr
->more()->slave_commit
= new C_MDS_SlaveLinkCommit(this, mdr
, targeti
);
6678 mdr
->more()->slave_update_journaled
= true;
6679 submit_mdlog_entry(le
, new C_MDS_SlaveLinkPrep(this, mdr
, targeti
, adjust_realm
),
6684 void Server::_logged_slave_link(MDRequestRef
& mdr
, CInode
*targeti
, bool adjust_realm
)
6686 dout(10) << "_logged_slave_link " << *mdr
6687 << " " << *targeti
<< dendl
;
6689 ceph_assert(g_conf()->mds_kill_link_at
!= 6);
6691 // update the target
6692 targeti
->pop_and_dirty_projected_inode(mdr
->ls
);
6696 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
6699 mdr
->reset_slave_request();
6702 int op
= CEPH_SNAP_OP_SPLIT
;
6703 mds
->mdcache
->send_snap_update(targeti
, 0, op
);
6704 mds
->mdcache
->do_realm_invalidate_and_update_notify(targeti
, op
);
6708 if (!mdr
->aborted
) {
6709 auto reply
= make_message
<MMDSSlaveRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_LINKPREPACK
);
6710 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
6712 dout(10) << " abort flag set, finishing" << dendl
;
6713 mdcache
->request_finish(mdr
);
6718 struct C_MDS_CommittedSlave
: public ServerLogContext
{
6719 C_MDS_CommittedSlave(Server
*s
, MDRequestRef
& m
) : ServerLogContext(s
, m
) {}
6720 void finish(int r
) override
{
6721 server
->_committed_slave(mdr
);
6725 void Server::_commit_slave_link(MDRequestRef
& mdr
, int r
, CInode
*targeti
)
6727 dout(10) << "_commit_slave_link " << *mdr
6729 << " " << *targeti
<< dendl
;
6731 ceph_assert(g_conf()->mds_kill_link_at
!= 7);
6734 // drop our pins, etc.
6737 // write a commit to the journal
6738 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_link_commit", mdr
->reqid
, mdr
->slave_to_mds
,
6739 ESlaveUpdate::OP_COMMIT
, ESlaveUpdate::LINK
);
6740 mdlog
->start_entry(le
);
6741 submit_mdlog_entry(le
, new C_MDS_CommittedSlave(this, mdr
), mdr
, __func__
);
6744 do_link_rollback(mdr
->more()->rollback_bl
, mdr
->slave_to_mds
, mdr
);
6748 void Server::_committed_slave(MDRequestRef
& mdr
)
6750 dout(10) << "_committed_slave " << *mdr
<< dendl
;
6752 ceph_assert(g_conf()->mds_kill_link_at
!= 8);
6754 bool assert_exist
= mdr
->more()->slave_update_journaled
;
6755 mdcache
->finish_uncommitted_slave(mdr
->reqid
, assert_exist
);
6756 auto req
= make_message
<MMDSSlaveRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_COMMITTED
);
6757 mds
->send_message_mds(req
, mdr
->slave_to_mds
);
6758 mdcache
->request_finish(mdr
);
6761 struct C_MDS_LoggedLinkRollback
: public ServerLogContext
{
6763 map
<client_t
,ref_t
<MClientSnap
>> splits
;
6764 C_MDS_LoggedLinkRollback(Server
*s
, MutationRef
& m
, MDRequestRef
& r
,
6765 map
<client_t
,ref_t
<MClientSnap
>>&& _splits
) :
6766 ServerLogContext(s
, r
), mut(m
), splits(std::move(_splits
)) {
6768 void finish(int r
) override
{
6769 server
->_link_rollback_finish(mut
, mdr
, splits
);
6773 void Server::do_link_rollback(bufferlist
&rbl
, mds_rank_t master
, MDRequestRef
& mdr
)
6775 link_rollback rollback
;
6776 auto p
= rbl
.cbegin();
6777 decode(rollback
, p
);
6779 dout(10) << "do_link_rollback on " << rollback
.reqid
6780 << (rollback
.was_inc
? " inc":" dec")
6781 << " ino " << rollback
.ino
6784 ceph_assert(g_conf()->mds_kill_link_at
!= 9);
6786 mdcache
->add_rollback(rollback
.reqid
, master
); // need to finish this update before resolve finishes
6787 ceph_assert(mdr
|| mds
->is_resolve());
6789 MutationRef
mut(new MutationImpl(nullptr, utime_t(), rollback
.reqid
));
6790 mut
->ls
= mds
->mdlog
->get_current_segment();
6792 CInode
*in
= mdcache
->get_inode(rollback
.ino
);
6794 dout(10) << " target is " << *in
<< dendl
;
6795 ceph_assert(!in
->is_projected()); // live slave request hold versionlock xlock.
6797 auto &pi
= in
->project_inode();
6798 pi
.inode
.version
= in
->pre_dirty();
6799 mut
->add_projected_inode(in
);
6801 // parent dir rctime
6802 CDir
*parent
= in
->get_projected_parent_dn()->get_dir();
6803 fnode_t
*pf
= parent
->project_fnode();
6804 mut
->add_projected_fnode(parent
);
6805 pf
->version
= parent
->pre_dirty();
6806 if (pf
->fragstat
.mtime
== pi
.inode
.ctime
) {
6807 pf
->fragstat
.mtime
= rollback
.old_dir_mtime
;
6808 if (pf
->rstat
.rctime
== pi
.inode
.ctime
)
6809 pf
->rstat
.rctime
= rollback
.old_dir_rctime
;
6810 mut
->add_updated_lock(&parent
->get_inode()->filelock
);
6811 mut
->add_updated_lock(&parent
->get_inode()->nestlock
);
6815 pi
.inode
.ctime
= rollback
.old_ctime
;
6816 if (rollback
.was_inc
)
6821 map
<client_t
,ref_t
<MClientSnap
>> splits
;
6822 if (rollback
.snapbl
.length() && in
->snaprealm
) {
6824 auto p
= rollback
.snapbl
.cbegin();
6825 decode(hadrealm
, p
);
6827 if (!mds
->is_resolve()) {
6828 sr_t
*new_srnode
= new sr_t();
6829 decode(*new_srnode
, p
);
6830 in
->project_snaprealm(new_srnode
);
6832 decode(in
->snaprealm
->srnode
, p
);
6835 SnapRealm
*realm
= parent
->get_inode()->find_snaprealm();
6836 if (!mds
->is_resolve())
6837 mdcache
->prepare_realm_merge(in
->snaprealm
, realm
, splits
);
6838 in
->project_snaprealm(NULL
);
6843 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_link_rollback", rollback
.reqid
, master
,
6844 ESlaveUpdate::OP_ROLLBACK
, ESlaveUpdate::LINK
);
6845 mdlog
->start_entry(le
);
6846 le
->commit
.add_dir_context(parent
);
6847 le
->commit
.add_dir(parent
, true);
6848 le
->commit
.add_primary_dentry(in
->get_projected_parent_dn(), 0, true);
6850 submit_mdlog_entry(le
, new C_MDS_LoggedLinkRollback(this, mut
, mdr
, std::move(splits
)),
6855 void Server::_link_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
,
6856 map
<client_t
,ref_t
<MClientSnap
>>& splits
)
6858 dout(10) << "_link_rollback_finish" << dendl
;
6860 ceph_assert(g_conf()->mds_kill_link_at
!= 10);
6864 if (!mds
->is_resolve())
6865 mdcache
->send_snaps(splits
);
6868 mdcache
->request_finish(mdr
);
6870 mdcache
->finish_rollback(mut
->reqid
, mdr
);
6876 void Server::handle_slave_link_prep_ack(MDRequestRef
& mdr
, const cref_t
<MMDSSlaveRequest
> &m
)
6878 dout(10) << "handle_slave_link_prep_ack " << *mdr
6879 << " " << *m
<< dendl
;
6880 mds_rank_t from
= mds_rank_t(m
->get_source().num());
6882 ceph_assert(g_conf()->mds_kill_link_at
!= 11);
6885 mdr
->more()->slaves
.insert(from
);
6888 ceph_assert(mdr
->more()->witnessed
.count(from
) == 0);
6889 mdr
->more()->witnessed
.insert(from
);
6890 ceph_assert(!m
->is_not_journaled());
6891 mdr
->more()->has_journaled_slaves
= true;
6893 // remove from waiting list
6894 ceph_assert(mdr
->more()->waiting_on_slave
.count(from
));
6895 mdr
->more()->waiting_on_slave
.erase(from
);
6897 ceph_assert(mdr
->more()->waiting_on_slave
.empty());
6899 dispatch_client_request(mdr
); // go again!
6908 void Server::handle_client_unlink(MDRequestRef
& mdr
)
6910 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6911 client_t client
= mdr
->get_client();
6914 bool rmdir
= (req
->get_op() == CEPH_MDS_OP_RMDIR
);
6917 mdr
->disable_lock_cache();
6918 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, false, true);
6922 CDentry::linkage_t
*dnl
= dn
->get_linkage(client
, mdr
);
6923 ceph_assert(!dnl
->is_null());
6924 CInode
*in
= dnl
->get_inode();
6927 dout(7) << "handle_client_rmdir on " << *dn
<< dendl
;
6929 dout(7) << "handle_client_unlink on " << *dn
<< dendl
;
6931 dout(7) << "dn links to " << *in
<< dendl
;
6936 // do empty directory checks
6937 if (_dir_is_nonempty_unlocked(mdr
, in
)) {
6938 respond_to_request(mdr
, -ENOTEMPTY
);
6942 dout(7) << "handle_client_unlink on dir " << *in
<< ", returning error" << dendl
;
6943 respond_to_request(mdr
, -EISDIR
);
6949 dout(7) << "handle_client_rmdir on non-dir " << *in
<< ", returning error" << dendl
;
6950 respond_to_request(mdr
, -ENOTDIR
);
6955 CInode
*diri
= dn
->get_dir()->get_inode();
6956 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
6957 if (!check_access(mdr
, diri
, MAY_WRITE
))
6961 // -- create stray dentry? --
6962 CDentry
*straydn
= NULL
;
6963 if (dnl
->is_primary()) {
6964 straydn
= prepare_stray_dentry(mdr
, dnl
->get_inode());
6967 dout(10) << " straydn is " << *straydn
<< dendl
;
6968 } else if (mdr
->straydn
) {
6969 mdr
->unpin(mdr
->straydn
);
6970 mdr
->straydn
= NULL
;
6974 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
6975 MutationImpl::LockOpVec lov
;
6977 lov
.add_xlock(&in
->linklock
);
6978 lov
.add_xlock(&in
->snaplock
);
6980 lov
.add_rdlock(&in
->filelock
); // to verify it's empty
6983 lov
.add_wrlock(&straydn
->get_dir()->inode
->filelock
);
6984 lov
.add_wrlock(&straydn
->get_dir()->inode
->nestlock
);
6985 lov
.add_xlock(&straydn
->lock
);
6988 if (!mds
->locker
->acquire_locks(mdr
, lov
))
6991 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
6995 _dir_is_nonempty(mdr
, in
)) {
6996 respond_to_request(mdr
, -ENOTEMPTY
);
7001 straydn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
7003 if (!mdr
->more()->desti_srnode
) {
7004 if (in
->is_projected_snaprealm_global()) {
7005 sr_t
*new_srnode
= in
->prepare_new_srnode(0);
7006 in
->record_snaprealm_parent_dentry(new_srnode
, NULL
, dn
, dnl
->is_primary());
7007 // dropping the last linkage or dropping the last remote linkage,
7008 // detch the inode from global snaprealm
7009 auto nlink
= in
->get_projected_inode()->nlink
;
7011 (nlink
== 2 && !dnl
->is_primary() &&
7012 !in
->get_projected_parent_dir()->inode
->is_stray()))
7013 in
->clear_snaprealm_global(new_srnode
);
7014 mdr
->more()->desti_srnode
= new_srnode
;
7015 } else if (dnl
->is_primary()) {
7016 // prepare snaprealm blob for slave request
7017 SnapRealm
*realm
= in
->find_snaprealm();
7018 snapid_t follows
= realm
->get_newest_seq();
7019 if (in
->snaprealm
|| follows
+ 1 > in
->get_oldest_snap()) {
7020 sr_t
*new_srnode
= in
->prepare_new_srnode(follows
);
7021 in
->record_snaprealm_past_parent(new_srnode
, straydn
->get_dir()->inode
->find_snaprealm());
7022 mdr
->more()->desti_srnode
= new_srnode
;
7028 if (in
->is_dir() && in
->has_subtree_root_dirfrag()) {
7029 // subtree root auths need to be witnesses
7030 set
<mds_rank_t
> witnesses
;
7031 in
->list_replicas(witnesses
);
7032 dout(10) << " witnesses " << witnesses
<< ", have " << mdr
->more()->witnessed
<< dendl
;
7034 for (set
<mds_rank_t
>::iterator p
= witnesses
.begin();
7035 p
!= witnesses
.end();
7037 if (mdr
->more()->witnessed
.count(*p
)) {
7038 dout(10) << " already witnessed by mds." << *p
<< dendl
;
7039 } else if (mdr
->more()->waiting_on_slave
.count(*p
)) {
7040 dout(10) << " already waiting on witness mds." << *p
<< dendl
;
7042 if (!_rmdir_prepare_witness(mdr
, *p
, mdr
->dn
[0], straydn
))
7046 if (!mdr
->more()->waiting_on_slave
.empty())
7047 return; // we're waiting for a witness.
7050 if (!rmdir
&& dnl
->is_primary() && mdr
->dn
[0].size() == 1)
7051 mds
->locker
->create_lock_cache(mdr
, diri
);
7054 if (dnl
->is_remote() && !dnl
->get_inode()->is_auth())
7055 _link_remote(mdr
, false, dn
, dnl
->get_inode());
7057 _unlink_local(mdr
, dn
, straydn
);
7060 class C_MDS_unlink_local_finish
: public ServerLogContext
{
7063 version_t dnpv
; // deleted dentry
7065 C_MDS_unlink_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CDentry
*sd
) :
7066 ServerLogContext(s
, r
), dn(d
), straydn(sd
),
7067 dnpv(d
->get_projected_version()) {}
7068 void finish(int r
) override
{
7069 ceph_assert(r
== 0);
7070 server
->_unlink_local_finish(mdr
, dn
, straydn
, dnpv
);
7074 void Server::_unlink_local(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
)
7076 dout(10) << "_unlink_local " << *dn
<< dendl
;
7078 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
7079 CInode
*in
= dnl
->get_inode();
7083 mdr
->ls
= mdlog
->get_current_segment();
7085 // prepare log entry
7086 EUpdate
*le
= new EUpdate(mdlog
, "unlink_local");
7087 mdlog
->start_entry(le
);
7088 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
7089 if (!mdr
->more()->witnessed
.empty()) {
7090 dout(20) << " noting uncommitted_slaves " << mdr
->more()->witnessed
<< dendl
;
7091 le
->reqid
= mdr
->reqid
;
7092 le
->had_slaves
= true;
7093 mdcache
->add_uncommitted_master(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
7097 ceph_assert(dnl
->is_primary());
7098 straydn
->push_projected_linkage(in
);
7101 // the unlinked dentry
7104 auto &pi
= in
->project_inode();
7107 dn
->make_path_string(t
, true);
7108 pi
.inode
.stray_prior_path
= std::move(t
);
7110 pi
.inode
.version
= in
->pre_dirty();
7111 pi
.inode
.ctime
= mdr
->get_op_stamp();
7112 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
7113 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
7114 pi
.inode
.change_attr
++;
7116 if (pi
.inode
.nlink
== 0)
7117 in
->state_set(CInode::STATE_ORPHAN
);
7119 if (mdr
->more()->desti_srnode
) {
7120 auto& desti_srnode
= mdr
->more()->desti_srnode
;
7121 in
->project_snaprealm(desti_srnode
);
7122 desti_srnode
= NULL
;
7126 // will manually pop projected inode
7128 // primary link. add stray dentry.
7129 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, -1);
7130 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, straydn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
7132 pi
.inode
.update_backtrace();
7133 le
->metablob
.add_primary_dentry(straydn
, in
, true, true);
7135 mdr
->add_projected_inode(in
);
7136 // remote link. update remote inode.
7137 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_DIR
, -1);
7138 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
7139 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, in
);
7142 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
7143 le
->metablob
.add_null_dentry(dn
, true);
7146 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
7147 le
->metablob
.renamed_dirino
= in
->ino();
7150 dn
->push_projected_linkage();
7153 ceph_assert(in
->first
<= straydn
->first
);
7154 in
->first
= straydn
->first
;
7158 ceph_assert(straydn
);
7159 mdcache
->project_subtree_rename(in
, dn
->get_dir(), straydn
->get_dir());
7162 journal_and_reply(mdr
, 0, dn
, le
, new C_MDS_unlink_local_finish(this, mdr
, dn
, straydn
));
7165 void Server::_unlink_local_finish(MDRequestRef
& mdr
,
7166 CDentry
*dn
, CDentry
*straydn
,
7169 dout(10) << "_unlink_local_finish " << *dn
<< dendl
;
7171 if (!mdr
->more()->witnessed
.empty())
7172 mdcache
->logged_master_update(mdr
->reqid
);
7174 CInode
*strayin
= NULL
;
7175 bool hadrealm
= false;
7177 // if there is newly created snaprealm, need to split old snaprealm's
7178 // inodes_with_caps. So pop snaprealm before linkage changes.
7179 strayin
= dn
->get_linkage()->get_inode();
7180 hadrealm
= strayin
->snaprealm
? true : false;
7181 strayin
->early_pop_projected_snaprealm();
7184 // unlink main dentry
7185 dn
->get_dir()->unlink_inode(dn
);
7186 dn
->pop_projected_linkage();
7188 // relink as stray? (i.e. was primary link?)
7190 dout(20) << " straydn is " << *straydn
<< dendl
;
7191 straydn
->pop_projected_linkage();
7193 strayin
->pop_and_dirty_projected_inode(mdr
->ls
);
7195 mdcache
->touch_dentry_bottom(straydn
);
7198 dn
->mark_dirty(dnpv
, mdr
->ls
);
7201 mdcache
->send_dentry_unlink(dn
, straydn
, mdr
);
7204 // update subtree map?
7205 if (strayin
->is_dir())
7206 mdcache
->adjust_subtree_after_rename(strayin
, dn
->get_dir(), true);
7208 if (strayin
->snaprealm
&& !hadrealm
)
7209 mdcache
->do_realm_invalidate_and_update_notify(strayin
, CEPH_SNAP_OP_SPLIT
, false);
7213 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
7216 respond_to_request(mdr
, 0);
7218 // removing a new dn?
7219 dn
->get_dir()->try_remove_unlinked_dn(dn
);
7222 // respond_to_request() drops locks. So stray reintegration can race with us.
7223 if (straydn
&& !straydn
->get_projected_linkage()->is_null()) {
7224 // Tip off the MDCache that this dentry is a stray that
7225 // might be elegible for purge.
7226 mdcache
->notify_stray(straydn
);
7230 bool Server::_rmdir_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, vector
<CDentry
*>& trace
, CDentry
*straydn
)
7232 if (mds
->is_cluster_degraded() &&
7233 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(who
)) {
7234 dout(10) << "_rmdir_prepare_witness mds." << who
<< " is not active" << dendl
;
7235 if (mdr
->more()->waiting_on_slave
.empty())
7236 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(mdcache
, mdr
));
7240 dout(10) << "_rmdir_prepare_witness mds." << who
<< dendl
;
7241 auto req
= make_message
<MMDSSlaveRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_RMDIRPREP
);
7242 req
->srcdnpath
= filepath(trace
.front()->get_dir()->ino());
7243 for (auto dn
: trace
)
7244 req
->srcdnpath
.push_dentry(dn
->get_name());
7245 mdcache
->encode_replica_stray(straydn
, who
, req
->straybl
);
7246 if (mdr
->more()->desti_srnode
)
7247 encode(*mdr
->more()->desti_srnode
, req
->desti_snapbl
);
7249 req
->op_stamp
= mdr
->get_op_stamp();
7250 mds
->send_message_mds(req
, who
);
7252 ceph_assert(mdr
->more()->waiting_on_slave
.count(who
) == 0);
7253 mdr
->more()->waiting_on_slave
.insert(who
);
7257 struct C_MDS_SlaveRmdirPrep
: public ServerLogContext
{
7258 CDentry
*dn
, *straydn
;
7259 C_MDS_SlaveRmdirPrep(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CDentry
*st
)
7260 : ServerLogContext(s
, r
), dn(d
), straydn(st
) {}
7261 void finish(int r
) override
{
7262 server
->_logged_slave_rmdir(mdr
, dn
, straydn
);
7266 struct C_MDS_SlaveRmdirCommit
: public ServerContext
{
7269 C_MDS_SlaveRmdirCommit(Server
*s
, MDRequestRef
& r
, CDentry
*sd
)
7270 : ServerContext(s
), mdr(r
), straydn(sd
) { }
7271 void finish(int r
) override
{
7272 server
->_commit_slave_rmdir(mdr
, r
, straydn
);
7276 void Server::handle_slave_rmdir_prep(MDRequestRef
& mdr
)
7278 dout(10) << "handle_slave_rmdir_prep " << *mdr
7279 << " " << mdr
->slave_request
->srcdnpath
7280 << " to " << mdr
->slave_request
->destdnpath
7283 vector
<CDentry
*> trace
;
7284 filepath
srcpath(mdr
->slave_request
->srcdnpath
);
7285 dout(10) << " src " << srcpath
<< dendl
;
7287 CF_MDS_MDRContextFactory
cf(mdcache
, mdr
, false);
7288 int r
= mdcache
->path_traverse(mdr
, cf
, srcpath
,
7289 MDS_TRAVERSE_DISCOVER
| MDS_TRAVERSE_PATH_LOCKED
,
7293 mdcache
->find_ino_peers(srcpath
.get_ino(), new C_MDS_RetryRequest(mdcache
, mdr
),
7294 mdr
->slave_to_mds
, true);
7297 ceph_assert(r
== 0);
7298 CDentry
*dn
= trace
.back();
7299 dout(10) << " dn " << *dn
<< dendl
;
7302 ceph_assert(mdr
->straydn
);
7303 CDentry
*straydn
= mdr
->straydn
;
7304 dout(10) << " straydn " << *straydn
<< dendl
;
7306 mdr
->set_op_stamp(mdr
->slave_request
->op_stamp
);
7308 rmdir_rollback rollback
;
7309 rollback
.reqid
= mdr
->reqid
;
7310 rollback
.src_dir
= dn
->get_dir()->dirfrag();
7311 rollback
.src_dname
= dn
->get_name();
7312 rollback
.dest_dir
= straydn
->get_dir()->dirfrag();
7313 rollback
.dest_dname
= straydn
->get_name();
7314 if (mdr
->slave_request
->desti_snapbl
.length()) {
7315 if (in
->snaprealm
) {
7316 encode(true, rollback
.snapbl
);
7317 in
->encode_snap_blob(rollback
.snapbl
);
7319 encode(false, rollback
.snapbl
);
7322 encode(rollback
, mdr
->more()->rollback_bl
);
7323 // FIXME: rollback snaprealm
7324 dout(20) << " rollback is " << mdr
->more()->rollback_bl
.length() << " bytes" << dendl
;
7326 // set up commit waiter
7327 mdr
->more()->slave_commit
= new C_MDS_SlaveRmdirCommit(this, mdr
, straydn
);
7329 straydn
->push_projected_linkage(in
);
7330 dn
->push_projected_linkage();
7332 ceph_assert(straydn
->first
>= in
->first
);
7333 in
->first
= straydn
->first
;
7335 if (!in
->has_subtree_root_dirfrag(mds
->get_nodeid())) {
7336 dout(10) << " no auth subtree in " << *in
<< ", skipping journal" << dendl
;
7337 _logged_slave_rmdir(mdr
, dn
, straydn
);
7341 mdr
->ls
= mdlog
->get_current_segment();
7342 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rmdir", mdr
->reqid
, mdr
->slave_to_mds
,
7343 ESlaveUpdate::OP_PREPARE
, ESlaveUpdate::RMDIR
);
7344 mdlog
->start_entry(le
);
7345 le
->rollback
= mdr
->more()->rollback_bl
;
7347 le
->commit
.add_dir_context(straydn
->get_dir());
7348 le
->commit
.add_primary_dentry(straydn
, in
, true);
7349 // slave: no need to journal original dentry
7351 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
7352 le
->commit
.renamed_dirino
= in
->ino();
7354 mdcache
->project_subtree_rename(in
, dn
->get_dir(), straydn
->get_dir());
7355 mdcache
->add_uncommitted_slave(mdr
->reqid
, mdr
->ls
, mdr
->slave_to_mds
);
7357 mdr
->more()->slave_update_journaled
= true;
7358 submit_mdlog_entry(le
, new C_MDS_SlaveRmdirPrep(this, mdr
, dn
, straydn
),
7363 void Server::_logged_slave_rmdir(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
)
7365 dout(10) << "_logged_slave_rmdir " << *mdr
<< " on " << *dn
<< dendl
;
7366 CInode
*in
= dn
->get_linkage()->get_inode();
7369 if (mdr
->slave_request
->desti_snapbl
.length()) {
7370 new_realm
= !in
->snaprealm
;
7371 in
->decode_snap_blob(mdr
->slave_request
->desti_snapbl
);
7372 ceph_assert(in
->snaprealm
);
7373 ceph_assert(in
->snaprealm
->have_past_parents_open());
7378 // update our cache now, so we are consistent with what is in the journal
7379 // when we journal a subtree map
7380 dn
->get_dir()->unlink_inode(dn
);
7381 straydn
->pop_projected_linkage();
7382 dn
->pop_projected_linkage();
7384 mdcache
->adjust_subtree_after_rename(in
, dn
->get_dir(), mdr
->more()->slave_update_journaled
);
7387 mdcache
->do_realm_invalidate_and_update_notify(in
, CEPH_SNAP_OP_SPLIT
, false);
7390 mdr
->reset_slave_request();
7393 if (!mdr
->aborted
) {
7394 auto reply
= make_message
<MMDSSlaveRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_RMDIRPREPACK
);
7395 if (!mdr
->more()->slave_update_journaled
)
7396 reply
->mark_not_journaled();
7397 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
7399 dout(10) << " abort flag set, finishing" << dendl
;
7400 mdcache
->request_finish(mdr
);
7404 void Server::handle_slave_rmdir_prep_ack(MDRequestRef
& mdr
, const cref_t
<MMDSSlaveRequest
> &ack
)
7406 dout(10) << "handle_slave_rmdir_prep_ack " << *mdr
7407 << " " << *ack
<< dendl
;
7409 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
7411 mdr
->more()->slaves
.insert(from
);
7412 mdr
->more()->witnessed
.insert(from
);
7413 if (!ack
->is_not_journaled())
7414 mdr
->more()->has_journaled_slaves
= true;
7416 // remove from waiting list
7417 ceph_assert(mdr
->more()->waiting_on_slave
.count(from
));
7418 mdr
->more()->waiting_on_slave
.erase(from
);
7420 if (mdr
->more()->waiting_on_slave
.empty())
7421 dispatch_client_request(mdr
); // go again!
7423 dout(10) << "still waiting on slaves " << mdr
->more()->waiting_on_slave
<< dendl
;
7426 void Server::_commit_slave_rmdir(MDRequestRef
& mdr
, int r
, CDentry
*straydn
)
7428 dout(10) << "_commit_slave_rmdir " << *mdr
<< " r=" << r
<< dendl
;
7431 if (mdr
->more()->slave_update_journaled
) {
7432 CInode
*strayin
= straydn
->get_projected_linkage()->get_inode();
7433 if (strayin
&& !strayin
->snaprealm
)
7434 mdcache
->clear_dirty_bits_for_stray(strayin
);
7439 if (mdr
->more()->slave_update_journaled
) {
7440 // write a commit to the journal
7441 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rmdir_commit", mdr
->reqid
,
7442 mdr
->slave_to_mds
, ESlaveUpdate::OP_COMMIT
,
7443 ESlaveUpdate::RMDIR
);
7444 mdlog
->start_entry(le
);
7445 submit_mdlog_entry(le
, new C_MDS_CommittedSlave(this, mdr
), mdr
, __func__
);
7448 _committed_slave(mdr
);
7452 do_rmdir_rollback(mdr
->more()->rollback_bl
, mdr
->slave_to_mds
, mdr
);
7456 struct C_MDS_LoggedRmdirRollback
: public ServerLogContext
{
7460 C_MDS_LoggedRmdirRollback(Server
*s
, MDRequestRef
& m
, metareqid_t mr
, CDentry
*d
, CDentry
*st
)
7461 : ServerLogContext(s
, m
), reqid(mr
), dn(d
), straydn(st
) {}
7462 void finish(int r
) override
{
7463 server
->_rmdir_rollback_finish(mdr
, reqid
, dn
, straydn
);
7467 void Server::do_rmdir_rollback(bufferlist
&rbl
, mds_rank_t master
, MDRequestRef
& mdr
)
7469 // unlink the other rollback methods, the rmdir rollback is only
7470 // needed to record the subtree changes in the journal for inode
7471 // replicas who are auth for empty dirfrags. no actual changes to
7472 // the file system are taking place here, so there is no Mutation.
7474 rmdir_rollback rollback
;
7475 auto p
= rbl
.cbegin();
7476 decode(rollback
, p
);
7478 dout(10) << "do_rmdir_rollback on " << rollback
.reqid
<< dendl
;
7479 mdcache
->add_rollback(rollback
.reqid
, master
); // need to finish this update before resolve finishes
7480 ceph_assert(mdr
|| mds
->is_resolve());
7482 CDir
*dir
= mdcache
->get_dirfrag(rollback
.src_dir
);
7484 dir
= mdcache
->get_dirfrag(rollback
.src_dir
.ino
, rollback
.src_dname
);
7486 CDentry
*dn
= dir
->lookup(rollback
.src_dname
);
7488 dout(10) << " dn " << *dn
<< dendl
;
7489 CDir
*straydir
= mdcache
->get_dirfrag(rollback
.dest_dir
);
7490 ceph_assert(straydir
);
7491 CDentry
*straydn
= straydir
->lookup(rollback
.dest_dname
);
7492 ceph_assert(straydn
);
7493 dout(10) << " straydn " << *straydn
<< dendl
;
7494 CInode
*in
= straydn
->get_linkage()->get_inode();
7496 dn
->push_projected_linkage(in
);
7497 straydn
->push_projected_linkage();
7499 if (rollback
.snapbl
.length() && in
->snaprealm
) {
7501 auto p
= rollback
.snapbl
.cbegin();
7502 decode(hadrealm
, p
);
7504 decode(in
->snaprealm
->srnode
, p
);
7506 in
->snaprealm
->merge_to(dir
->get_inode()->find_snaprealm());
7510 if (mdr
&& !mdr
->more()->slave_update_journaled
) {
7511 ceph_assert(!in
->has_subtree_root_dirfrag(mds
->get_nodeid()));
7513 _rmdir_rollback_finish(mdr
, rollback
.reqid
, dn
, straydn
);
7518 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rmdir_rollback", rollback
.reqid
, master
,
7519 ESlaveUpdate::OP_ROLLBACK
, ESlaveUpdate::RMDIR
);
7520 mdlog
->start_entry(le
);
7522 le
->commit
.add_dir_context(dn
->get_dir());
7523 le
->commit
.add_primary_dentry(dn
, in
, true);
7524 // slave: no need to journal straydn
7526 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
7527 le
->commit
.renamed_dirino
= in
->ino();
7529 mdcache
->project_subtree_rename(in
, straydn
->get_dir(), dn
->get_dir());
7531 submit_mdlog_entry(le
,
7532 new C_MDS_LoggedRmdirRollback(this, mdr
,rollback
.reqid
,
7538 void Server::_rmdir_rollback_finish(MDRequestRef
& mdr
, metareqid_t reqid
, CDentry
*dn
, CDentry
*straydn
)
7540 dout(10) << "_rmdir_rollback_finish " << reqid
<< dendl
;
7542 straydn
->get_dir()->unlink_inode(straydn
);
7543 dn
->pop_projected_linkage();
7544 straydn
->pop_projected_linkage();
7546 CInode
*in
= dn
->get_linkage()->get_inode();
7547 mdcache
->adjust_subtree_after_rename(in
, straydn
->get_dir(),
7548 !mdr
|| mdr
->more()->slave_update_journaled
);
7550 if (mds
->is_resolve()) {
7551 CDir
*root
= mdcache
->get_subtree_root(straydn
->get_dir());
7552 mdcache
->try_trim_non_auth_subtree(root
);
7556 mdcache
->request_finish(mdr
);
7558 mdcache
->finish_rollback(reqid
, mdr
);
7562 /** _dir_is_nonempty[_unlocked]
7564 * check if a directory is non-empty (i.e. we can rmdir it).
7566 * the unlocked variant is a fastpath check. we can't really be
7567 * sure until we rdlock the filelock.
7569 bool Server::_dir_is_nonempty_unlocked(MDRequestRef
& mdr
, CInode
*in
)
7571 dout(10) << "dir_is_nonempty_unlocked " << *in
<< dendl
;
7572 ceph_assert(in
->is_auth());
7574 if (in
->filelock
.is_cached())
7575 return false; // there can be pending async create/unlink. don't know.
7576 if (in
->snaprealm
&& in
->snaprealm
->srnode
.snaps
.size())
7577 return true; // in a snapshot!
7579 auto&& ls
= in
->get_dirfrags();
7580 for (const auto& dir
: ls
) {
7581 // is the frag obviously non-empty?
7582 if (dir
->is_auth()) {
7583 if (dir
->get_projected_fnode()->fragstat
.size()) {
7584 dout(10) << "dir_is_nonempty_unlocked dirstat has "
7585 << dir
->get_projected_fnode()->fragstat
.size() << " items " << *dir
<< dendl
;
7594 bool Server::_dir_is_nonempty(MDRequestRef
& mdr
, CInode
*in
)
7596 dout(10) << "dir_is_nonempty " << *in
<< dendl
;
7597 ceph_assert(in
->is_auth());
7598 ceph_assert(in
->filelock
.can_read(mdr
->get_client()));
7600 frag_info_t dirstat
;
7601 version_t dirstat_version
= in
->get_projected_inode()->dirstat
.version
;
7603 auto&& ls
= in
->get_dirfrags();
7604 for (const auto& dir
: ls
) {
7605 const fnode_t
*pf
= dir
->get_projected_fnode();
7606 if (pf
->fragstat
.size()) {
7607 dout(10) << "dir_is_nonempty dirstat has "
7608 << pf
->fragstat
.size() << " items " << *dir
<< dendl
;
7612 if (pf
->accounted_fragstat
.version
== dirstat_version
)
7613 dirstat
.add(pf
->accounted_fragstat
);
7615 dirstat
.add(pf
->fragstat
);
7618 return dirstat
.size() != in
->get_projected_inode()->dirstat
.size();
7622 // ======================================================
7625 class C_MDS_rename_finish
: public ServerLogContext
{
7630 C_MDS_rename_finish(Server
*s
, MDRequestRef
& r
,
7631 CDentry
*sdn
, CDentry
*ddn
, CDentry
*stdn
) :
7632 ServerLogContext(s
, r
),
7633 srcdn(sdn
), destdn(ddn
), straydn(stdn
) { }
7634 void finish(int r
) override
{
7635 ceph_assert(r
== 0);
7636 server
->_rename_finish(mdr
, srcdn
, destdn
, straydn
);
7641 /** handle_client_rename
7643 * rename master is the destdn auth. this is because cached inodes
7644 * must remain connected. thus, any replica of srci, must also
7645 * replicate destdn, and possibly straydn, so that srci (and
7646 * destdn->inode) remain connected during the rename.
7648 * to do this, we freeze srci, then master (destdn auth) verifies that
7649 * all other nodes have also replicated destdn and straydn. note that
7650 * destdn replicas need not also replicate srci. this only works when
7653 * This function takes responsibility for the passed mdr.
7655 void Server::handle_client_rename(MDRequestRef
& mdr
)
7657 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
7658 dout(7) << "handle_client_rename " << *req
<< dendl
;
7660 filepath destpath
= req
->get_filepath();
7661 filepath srcpath
= req
->get_filepath2();
7662 if (srcpath
.is_last_dot_or_dotdot() || destpath
.is_last_dot_or_dotdot()) {
7663 respond_to_request(mdr
, -EBUSY
);
7667 auto [destdn
, srcdn
] = rdlock_two_paths_xlock_destdn(mdr
, true);
7671 dout(10) << " destdn " << *destdn
<< dendl
;
7672 CDir
*destdir
= destdn
->get_dir();
7673 ceph_assert(destdir
->is_auth());
7674 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
7676 dout(10) << " srcdn " << *srcdn
<< dendl
;
7677 CDir
*srcdir
= srcdn
->get_dir();
7678 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
7679 CInode
*srci
= srcdnl
->get_inode();
7680 dout(10) << " srci " << *srci
<< dendl
;
7682 // -- some sanity checks --
7683 if (destdn
== srcdn
) {
7684 dout(7) << "rename src=dest, noop" << dendl
;
7685 respond_to_request(mdr
, 0);
7689 // dest a child of src?
7690 // e.g. mv /usr /usr/foo
7691 if (srci
->is_dir() && srci
->is_projected_ancestor_of(destdir
->get_inode())) {
7692 dout(7) << "cannot rename item to be a child of itself" << dendl
;
7693 respond_to_request(mdr
, -EINVAL
);
7697 // is this a stray migration, reintegration or merge? (sanity checks!)
7698 if (mdr
->reqid
.name
.is_mds() &&
7699 !(MDS_INO_IS_STRAY(srcpath
.get_ino()) &&
7700 MDS_INO_IS_STRAY(destpath
.get_ino())) &&
7701 !(destdnl
->is_remote() &&
7702 destdnl
->get_remote_ino() == srci
->ino())) {
7703 respond_to_request(mdr
, -EINVAL
); // actually, this won't reply, but whatev.
7708 if (!destdnl
->is_null()) {
7709 //dout(10) << "dest dn exists " << *destdn << dendl;
7710 oldin
= mdcache
->get_dentry_inode(destdn
, mdr
, true);
7712 dout(10) << " oldin " << *oldin
<< dendl
;
7714 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
7715 if (oldin
->is_dir() && _dir_is_nonempty_unlocked(mdr
, oldin
)) {
7716 respond_to_request(mdr
, -ENOTEMPTY
);
7720 // mv /some/thing /to/some/existing_other_thing
7721 if (oldin
->is_dir() && !srci
->is_dir()) {
7722 respond_to_request(mdr
, -EISDIR
);
7725 if (!oldin
->is_dir() && srci
->is_dir()) {
7726 respond_to_request(mdr
, -ENOTDIR
);
7729 if (srci
== oldin
&& !srcdir
->inode
->is_stray()) {
7730 respond_to_request(mdr
, 0); // no-op. POSIX makes no sense.
7735 vector
<CDentry
*>& srctrace
= mdr
->dn
[1];
7736 vector
<CDentry
*>& desttrace
= mdr
->dn
[0];
7738 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
7739 if (destpath
.get_ino() != srcpath
.get_ino() &&
7740 !(req
->get_source().is_mds() &&
7741 MDS_INO_IS_STRAY(srcpath
.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
7742 CInode
*srcbase
= srctrace
[0]->get_dir()->get_inode();
7743 CInode
*destbase
= desttrace
[0]->get_dir()->get_inode();
7744 // ok, extend srctrace toward root until it is an ancestor of desttrace.
7745 while (srcbase
!= destbase
&&
7746 !srcbase
->is_projected_ancestor_of(destbase
)) {
7747 CDentry
*pdn
= srcbase
->get_projected_parent_dn();
7748 srctrace
.insert(srctrace
.begin(), pdn
);
7749 dout(10) << "rename prepending srctrace with " << *pdn
<< dendl
;
7750 srcbase
= pdn
->get_dir()->get_inode();
7753 // then, extend destpath until it shares the same parent inode as srcpath.
7754 while (destbase
!= srcbase
) {
7755 CDentry
*pdn
= destbase
->get_projected_parent_dn();
7756 desttrace
.insert(desttrace
.begin(), pdn
);
7757 dout(10) << "rename prepending desttrace with " << *pdn
<< dendl
;
7758 destbase
= pdn
->get_dir()->get_inode();
7760 dout(10) << "rename src and dest traces now share common ancestor " << *destbase
<< dendl
;
7764 bool linkmerge
= srcdnl
->get_inode() == destdnl
->get_inode();
7766 dout(10) << " this is a link merge" << dendl
;
7768 // -- create stray dentry? --
7769 CDentry
*straydn
= NULL
;
7770 if (destdnl
->is_primary() && !linkmerge
) {
7771 straydn
= prepare_stray_dentry(mdr
, destdnl
->get_inode());
7774 dout(10) << " straydn is " << *straydn
<< dendl
;
7775 } else if (mdr
->straydn
) {
7776 mdr
->unpin(mdr
->straydn
);
7777 mdr
->straydn
= NULL
;
7782 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
7783 MutationImpl::LockOpVec lov
;
7785 // we need to update srci's ctime. xlock its least contended lock to do that...
7786 lov
.add_xlock(&srci
->linklock
);
7787 lov
.add_xlock(&srci
->snaplock
);
7790 // xlock oldin (for nlink--)
7791 lov
.add_xlock(&oldin
->linklock
);
7792 lov
.add_xlock(&oldin
->snaplock
);
7793 if (oldin
->is_dir()) {
7794 ceph_assert(srci
->is_dir());
7795 lov
.add_rdlock(&oldin
->filelock
); // to verify it's empty
7797 // adjust locking order?
7798 int cmp
= mdr
->compare_paths();
7799 if (cmp
< 0 || (cmp
== 0 && oldin
->ino() < srci
->ino()))
7800 std::reverse(lov
.begin(), lov
.end());
7802 ceph_assert(!srci
->is_dir());
7803 // adjust locking order;
7804 if (srci
->ino() > oldin
->ino())
7805 std::reverse(lov
.begin(), lov
.end());
7811 lov
.add_wrlock(&straydn
->get_dir()->inode
->filelock
);
7812 lov
.add_wrlock(&straydn
->get_dir()->inode
->nestlock
);
7813 lov
.add_xlock(&straydn
->lock
);
7816 CInode
*auth_pin_freeze
= !srcdn
->is_auth() && srcdnl
->is_primary() ? srci
: nullptr;
7817 if (!mds
->locker
->acquire_locks(mdr
, lov
, auth_pin_freeze
))
7820 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
7824 ceph_assert(srcdir
->inode
->is_stray() && srcdnl
->is_primary() && destdnl
->is_remote());
7826 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
7827 if (!check_access(mdr
, srcdir
->get_inode(), MAY_WRITE
))
7830 if (!check_access(mdr
, destdn
->get_dir()->get_inode(), MAY_WRITE
))
7833 if (!check_fragment_space(mdr
, destdn
->get_dir()))
7836 if (!check_access(mdr
, srci
, MAY_WRITE
))
7840 // with read lock, really verify oldin is empty
7843 _dir_is_nonempty(mdr
, oldin
)) {
7844 respond_to_request(mdr
, -ENOTEMPTY
);
7848 /* project_snaprealm_past_parent() will do this job
7850 // moving between snaprealms?
7851 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
7852 SnapRealm *srcrealm = srci->find_snaprealm();
7853 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
7854 if (srcrealm != destrealm &&
7855 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
7856 destrealm->get_newest_seq() + 1 > srcdn->first)) {
7857 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
7858 mdcache->snaprealm_create(mdr, srci);
7864 ceph_assert(g_conf()->mds_kill_rename_at
!= 1);
7866 // -- open all srcdn inode frags, if any --
7867 // we need these open so that auth can properly delegate from inode to dirfrags
7868 // after the inode is _ours_.
7869 if (srcdnl
->is_primary() &&
7870 !srcdn
->is_auth() &&
7872 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl
;
7873 mdr
->set_stickydirs(srci
);
7876 srci
->dirfragtree
.get_leaves(leaves
);
7877 for (const auto& leaf
: leaves
) {
7878 CDir
*dir
= srci
->get_dirfrag(leaf
);
7880 dout(10) << " opening " << leaf
<< " under " << *srci
<< dendl
;
7881 mdcache
->open_remote_dirfrag(srci
, leaf
, new C_MDS_RetryRequest(mdcache
, mdr
));
7887 // -- prepare snaprealm ---
7890 if (!mdr
->more()->srci_srnode
&&
7891 srci
->get_projected_inode()->nlink
== 1 &&
7892 srci
->is_projected_snaprealm_global()) {
7893 sr_t
*new_srnode
= srci
->prepare_new_srnode(0);
7894 srci
->record_snaprealm_parent_dentry(new_srnode
, NULL
, destdn
, false);
7896 srci
->clear_snaprealm_global(new_srnode
);
7897 mdr
->more()->srci_srnode
= new_srnode
;
7900 if (oldin
&& !mdr
->more()->desti_srnode
) {
7901 if (oldin
->is_projected_snaprealm_global()) {
7902 sr_t
*new_srnode
= oldin
->prepare_new_srnode(0);
7903 oldin
->record_snaprealm_parent_dentry(new_srnode
, NULL
, destdn
, destdnl
->is_primary());
7904 // dropping the last linkage or dropping the last remote linkage,
7905 // detch the inode from global snaprealm
7906 auto nlink
= oldin
->get_projected_inode()->nlink
;
7908 (nlink
== 2 && !destdnl
->is_primary() &&
7909 !oldin
->get_projected_parent_dir()->inode
->is_stray()))
7910 oldin
->clear_snaprealm_global(new_srnode
);
7911 mdr
->more()->desti_srnode
= new_srnode
;
7912 } else if (destdnl
->is_primary()) {
7913 SnapRealm
*dest_realm
= destdir
->inode
->find_snaprealm();
7914 snapid_t follows
= dest_realm
->get_newest_seq();
7915 if (oldin
->snaprealm
|| follows
+ 1 > oldin
->get_oldest_snap()) {
7916 sr_t
*new_srnode
= oldin
->prepare_new_srnode(follows
);
7917 oldin
->record_snaprealm_past_parent(new_srnode
, straydn
->get_dir()->inode
->find_snaprealm());
7918 mdr
->more()->desti_srnode
= new_srnode
;
7922 if (!mdr
->more()->srci_srnode
) {
7923 SnapRealm
*dest_realm
= destdir
->inode
->find_snaprealm();
7924 if (srci
->is_projected_snaprealm_global()) {
7925 sr_t
*new_srnode
= srci
->prepare_new_srnode(0);
7926 srci
->record_snaprealm_parent_dentry(new_srnode
, dest_realm
, srcdn
, srcdnl
->is_primary());
7927 mdr
->more()->srci_srnode
= new_srnode
;
7928 } else if (srcdnl
->is_primary()) {
7929 SnapRealm
*src_realm
= srcdir
->inode
->find_snaprealm();
7930 snapid_t follows
= src_realm
->get_newest_seq();
7931 if (src_realm
!= dest_realm
&&
7932 (srci
->snaprealm
|| follows
+ 1 > srci
->get_oldest_snap())) {
7933 sr_t
*new_srnode
= srci
->prepare_new_srnode(follows
);
7934 srci
->record_snaprealm_past_parent(new_srnode
, dest_realm
);
7935 mdr
->more()->srci_srnode
= new_srnode
;
7941 // -- prepare witnesses --
7944 * NOTE: we use _all_ replicas as witnesses.
7945 * this probably isn't totally necessary (esp for file renames),
7946 * but if/when we change that, we have to make sure rejoin is
7947 * sufficiently robust to handle strong rejoins from survivors
7948 * with totally wrong dentry->inode linkage.
7949 * (currently, it can ignore rename effects, because the resolve
7950 * stage will sort them out.)
7952 set
<mds_rank_t
> witnesses
= mdr
->more()->extra_witnesses
;
7953 if (srcdn
->is_auth())
7954 srcdn
->list_replicas(witnesses
);
7956 witnesses
.insert(srcdn
->authority().first
);
7957 if (srcdnl
->is_remote() && !srci
->is_auth())
7958 witnesses
.insert(srci
->authority().first
);
7959 destdn
->list_replicas(witnesses
);
7960 if (destdnl
->is_remote() && !oldin
->is_auth())
7961 witnesses
.insert(oldin
->authority().first
);
7962 dout(10) << " witnesses " << witnesses
<< ", have " << mdr
->more()->witnessed
<< dendl
;
7964 if (!witnesses
.empty()) {
7965 // Replicas can't see projected dentry linkages and will get confused.
7966 // We have taken snaplocks on ancestor inodes. Later rename/rmdir requests
7967 // can't project these inodes' linkages.
7968 bool need_flush
= false;
7969 for (auto& dn
: srctrace
) {
7970 if (dn
->is_projected()) {
7976 CDentry
*dn
= destdn
;
7978 if (dn
->is_projected()) {
7982 CInode
*diri
= dn
->get_dir()->get_inode();
7983 dn
= diri
->get_projected_parent_dn();
7987 mdlog
->wait_for_safe(
7988 new MDSInternalContextWrapper(mds
,
7989 new C_MDS_RetryRequest(mdcache
, mdr
)));
7995 // do srcdn auth last
7996 mds_rank_t last
= MDS_RANK_NONE
;
7997 if (!srcdn
->is_auth()) {
7998 last
= srcdn
->authority().first
;
7999 mdr
->more()->srcdn_auth_mds
= last
;
8000 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
8001 // are involved in the rename operation.
8002 if (srcdnl
->is_primary() && !mdr
->more()->is_ambiguous_auth
) {
8003 dout(10) << " preparing ambiguous auth for srci" << dendl
;
8004 ceph_assert(mdr
->more()->is_remote_frozen_authpin
);
8005 ceph_assert(mdr
->more()->rename_inode
== srci
);
8006 _rename_prepare_witness(mdr
, last
, witnesses
, srctrace
, desttrace
, straydn
);
8011 for (set
<mds_rank_t
>::iterator p
= witnesses
.begin();
8012 p
!= witnesses
.end();
8014 if (*p
== last
) continue; // do it last!
8015 if (mdr
->more()->witnessed
.count(*p
)) {
8016 dout(10) << " already witnessed by mds." << *p
<< dendl
;
8017 } else if (mdr
->more()->waiting_on_slave
.count(*p
)) {
8018 dout(10) << " already waiting on witness mds." << *p
<< dendl
;
8020 if (!_rename_prepare_witness(mdr
, *p
, witnesses
, srctrace
, desttrace
, straydn
))
8024 if (!mdr
->more()->waiting_on_slave
.empty())
8025 return; // we're waiting for a witness.
8027 if (last
!= MDS_RANK_NONE
&& mdr
->more()->witnessed
.count(last
) == 0) {
8028 dout(10) << " preparing last witness (srcdn auth)" << dendl
;
8029 ceph_assert(mdr
->more()->waiting_on_slave
.count(last
) == 0);
8030 _rename_prepare_witness(mdr
, last
, witnesses
, srctrace
, desttrace
, straydn
);
8034 // test hack: bail after slave does prepare, so we can verify it's _live_ rollback.
8035 if (!mdr
->more()->slaves
.empty() && !srci
->is_dir())
8036 ceph_assert(g_conf()->mds_kill_rename_at
!= 3);
8037 if (!mdr
->more()->slaves
.empty() && srci
->is_dir())
8038 ceph_assert(g_conf()->mds_kill_rename_at
!= 4);
8040 // -- declare now --
8041 mdr
->set_mds_stamp(ceph_clock_now());
8043 // -- prepare journal entry --
8044 mdr
->ls
= mdlog
->get_current_segment();
8045 EUpdate
*le
= new EUpdate(mdlog
, "rename");
8046 mdlog
->start_entry(le
);
8047 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
8048 if (!mdr
->more()->witnessed
.empty()) {
8049 dout(20) << " noting uncommitted_slaves " << mdr
->more()->witnessed
<< dendl
;
8051 le
->reqid
= mdr
->reqid
;
8052 le
->had_slaves
= true;
8054 mdcache
->add_uncommitted_master(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
8055 // no need to send frozen auth pin to recovring auth MDS of srci
8056 mdr
->more()->is_remote_frozen_authpin
= false;
8059 _rename_prepare(mdr
, &le
->metablob
, &le
->client_map
, srcdn
, destdn
, straydn
);
8060 if (le
->client_map
.length())
8061 le
->cmapv
= mds
->sessionmap
.get_projected();
8063 // -- commit locally --
8064 C_MDS_rename_finish
*fin
= new C_MDS_rename_finish(this, mdr
, srcdn
, destdn
, straydn
);
8066 journal_and_reply(mdr
, srci
, destdn
, le
, fin
);
8067 mds
->balancer
->maybe_fragment(destdn
->get_dir(), false);
8071 void Server::_rename_finish(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
8073 dout(10) << "_rename_finish " << *mdr
<< dendl
;
8075 if (!mdr
->more()->witnessed
.empty())
8076 mdcache
->logged_master_update(mdr
->reqid
);
8079 _rename_apply(mdr
, srcdn
, destdn
, straydn
);
8081 mdcache
->send_dentry_link(destdn
, mdr
);
8083 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
8084 CInode
*in
= destdnl
->get_inode();
8085 bool need_eval
= mdr
->more()->cap_imports
.count(in
);
8087 // test hack: test slave commit
8088 if (!mdr
->more()->slaves
.empty() && !in
->is_dir())
8089 ceph_assert(g_conf()->mds_kill_rename_at
!= 5);
8090 if (!mdr
->more()->slaves
.empty() && in
->is_dir())
8091 ceph_assert(g_conf()->mds_kill_rename_at
!= 6);
8094 mds
->balancer
->hit_dir(srcdn
->get_dir(), META_POP_IWR
);
8095 if (destdnl
->is_remote() && in
->is_auth())
8096 mds
->balancer
->hit_inode(in
, META_POP_IWR
);
8098 // did we import srci? if so, explicitly ack that import that, before we unlock and reply.
8100 ceph_assert(g_conf()->mds_kill_rename_at
!= 7);
8103 respond_to_request(mdr
, 0);
8106 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
, true);
8109 // respond_to_request() drops locks. So stray reintegration can race with us.
8110 if (straydn
&& !straydn
->get_projected_linkage()->is_null()) {
8111 mdcache
->notify_stray(straydn
);
8119 bool Server::_rename_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, set
<mds_rank_t
> &witnesse
,
8120 vector
<CDentry
*>& srctrace
, vector
<CDentry
*>& dsttrace
, CDentry
*straydn
)
8122 if (mds
->is_cluster_degraded() &&
8123 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(who
)) {
8124 dout(10) << "_rename_prepare_witness mds." << who
<< " is not active" << dendl
;
8125 if (mdr
->more()->waiting_on_slave
.empty())
8126 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(mdcache
, mdr
));
8130 dout(10) << "_rename_prepare_witness mds." << who
<< dendl
;
8131 auto req
= make_message
<MMDSSlaveRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_RENAMEPREP
);
8133 req
->srcdnpath
= filepath(srctrace
.front()->get_dir()->ino());
8134 for (auto dn
: srctrace
)
8135 req
->srcdnpath
.push_dentry(dn
->get_name());
8136 req
->destdnpath
= filepath(dsttrace
.front()->get_dir()->ino());
8137 for (auto dn
: dsttrace
)
8138 req
->destdnpath
.push_dentry(dn
->get_name());
8140 mdcache
->encode_replica_stray(straydn
, who
, req
->straybl
);
8142 if (mdr
->more()->srci_srnode
)
8143 encode(*mdr
->more()->srci_srnode
, req
->srci_snapbl
);
8144 if (mdr
->more()->desti_srnode
)
8145 encode(*mdr
->more()->desti_srnode
, req
->desti_snapbl
);
8147 req
->srcdn_auth
= mdr
->more()->srcdn_auth_mds
;
8149 // srcdn auth will verify our current witness list is sufficient
8150 req
->witnesses
= witnesse
;
8152 req
->op_stamp
= mdr
->get_op_stamp();
8153 mds
->send_message_mds(req
, who
);
8155 ceph_assert(mdr
->more()->waiting_on_slave
.count(who
) == 0);
8156 mdr
->more()->waiting_on_slave
.insert(who
);
8160 version_t
Server::_rename_prepare_import(MDRequestRef
& mdr
, CDentry
*srcdn
, bufferlist
*client_map_bl
)
8162 version_t oldpv
= mdr
->more()->inode_import_v
;
8164 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
8167 auto blp
= mdr
->more()->inode_import
.cbegin();
8170 map
<client_t
,entity_inst_t
> client_map
;
8171 map
<client_t
, client_metadata_t
> client_metadata_map
;
8172 decode(client_map
, blp
);
8173 decode(client_metadata_map
, blp
);
8174 prepare_force_open_sessions(client_map
, client_metadata_map
,
8175 mdr
->more()->imported_session_map
);
8176 encode(client_map
, *client_map_bl
, mds
->mdsmap
->get_up_features());
8177 encode(client_metadata_map
, *client_map_bl
);
8179 list
<ScatterLock
*> updated_scatterlocks
;
8180 mdcache
->migrator
->decode_import_inode(srcdn
, blp
, srcdn
->authority().first
, mdr
->ls
,
8181 mdr
->more()->cap_imports
, updated_scatterlocks
);
8183 // hack: force back to !auth and clean, temporarily
8184 srcdnl
->get_inode()->state_clear(CInode::STATE_AUTH
);
8185 srcdnl
->get_inode()->mark_clean();
8190 bool Server::_need_force_journal(CInode
*diri
, bool empty
)
8192 auto&& dirs
= diri
->get_dirfrags();
8194 bool force_journal
= false;
8196 for (const auto& dir
: dirs
) {
8197 if (dir
->is_subtree_root() && dir
->get_dir_auth().first
== mds
->get_nodeid()) {
8198 dout(10) << " frag " << dir
->get_frag() << " is auth subtree dirfrag, will force journal" << dendl
;
8199 force_journal
= true;
8202 dout(20) << " frag " << dir
->get_frag() << " is not auth subtree dirfrag" << dendl
;
8205 // see if any children of our frags are auth subtrees.
8206 std::vector
<CDir
*> subtrees
;
8207 mdcache
->get_subtrees(subtrees
);
8208 dout(10) << " subtrees " << subtrees
<< " frags " << dirs
<< dendl
;
8209 for (const auto& dir
: dirs
) {
8210 for (const auto& subtree
: subtrees
) {
8211 if (dir
->contains(subtree
)) {
8212 if (subtree
->get_dir_auth().first
== mds
->get_nodeid()) {
8213 dout(10) << " frag " << dir
->get_frag() << " contains (maybe) auth subtree, will force journal "
8214 << *subtree
<< dendl
;
8215 force_journal
= true;
8218 dout(20) << " frag " << dir
->get_frag() << " contains but isn't auth for " << *subtree
<< dendl
;
8220 dout(20) << " frag " << dir
->get_frag() << " does not contain " << *subtree
<< dendl
;
8226 return force_journal
;
8229 void Server::_rename_prepare(MDRequestRef
& mdr
,
8230 EMetaBlob
*metablob
, bufferlist
*client_map_bl
,
8231 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
8233 dout(10) << "_rename_prepare " << *mdr
<< " " << *srcdn
<< " " << *destdn
<< dendl
;
8235 dout(10) << " straydn " << *straydn
<< dendl
;
8237 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
8238 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
8239 CInode
*srci
= srcdnl
->get_inode();
8240 CInode
*oldin
= destdnl
->get_inode();
8242 // primary+remote link merge?
8243 bool linkmerge
= (srci
== oldin
);
8245 ceph_assert(srcdnl
->is_primary() && destdnl
->is_remote());
8246 bool silent
= srcdn
->get_dir()->inode
->is_stray();
8248 bool force_journal_dest
= false;
8249 if (srci
->is_dir() && !destdn
->is_auth()) {
8250 if (srci
->is_auth()) {
8251 // if we are auth for srci and exporting it, force journal because journal replay needs
8252 // the source inode to create auth subtrees.
8253 dout(10) << " we are exporting srci, will force journal destdn" << dendl
;
8254 force_journal_dest
= true;
8256 force_journal_dest
= _need_force_journal(srci
, false);
8259 bool force_journal_stray
= false;
8260 if (oldin
&& oldin
->is_dir() && straydn
&& !straydn
->is_auth())
8261 force_journal_stray
= _need_force_journal(oldin
, true);
8264 dout(10) << " merging remote and primary links to the same inode" << dendl
;
8266 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl
;
8267 if (force_journal_dest
)
8268 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl
;
8269 if (force_journal_stray
)
8270 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl
;
8272 if (srci
->is_dir() && (destdn
->is_auth() || force_journal_dest
)) {
8273 dout(10) << " noting renamed dir ino " << srci
->ino() << " in metablob" << dendl
;
8274 metablob
->renamed_dirino
= srci
->ino();
8275 } else if (oldin
&& oldin
->is_dir() && force_journal_stray
) {
8276 dout(10) << " noting rename target dir " << oldin
->ino() << " in metablob" << dendl
;
8277 metablob
->renamed_dirino
= oldin
->ino();
8281 CInode::mempool_inode
*spi
= 0; // renamed inode
8282 CInode::mempool_inode
*tpi
= 0; // target/overwritten inode
8286 if (destdnl
->is_primary()) {
8287 ceph_assert(straydn
); // moving to straydn.
8288 // link--, and move.
8289 if (destdn
->is_auth()) {
8290 auto &pi
= oldin
->project_inode(); //project_snaprealm
8291 pi
.inode
.version
= straydn
->pre_dirty(pi
.inode
.version
);
8292 pi
.inode
.update_backtrace();
8295 straydn
->push_projected_linkage(oldin
);
8296 } else if (destdnl
->is_remote()) {
8298 if (oldin
->is_auth()) {
8299 auto &pi
= oldin
->project_inode();
8300 pi
.inode
.version
= oldin
->pre_dirty();
8307 if (srcdnl
->is_remote()) {
8310 if (destdn
->is_auth())
8311 mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty();
8312 destdn
->push_projected_linkage(srcdnl
->get_remote_ino(), srcdnl
->get_remote_d_type());
8314 if (srci
->is_auth()) {
8315 auto &pi
= srci
->project_inode();
8316 pi
.inode
.version
= srci
->pre_dirty();
8320 dout(10) << " will merge remote onto primary link" << dendl
;
8321 if (destdn
->is_auth()) {
8322 auto &pi
= oldin
->project_inode();
8323 pi
.inode
.version
= mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty(oldin
->inode
.version
);
8328 if (destdn
->is_auth()) {
8330 if (srcdn
->is_auth())
8331 oldpv
= srci
->get_projected_version();
8333 oldpv
= _rename_prepare_import(mdr
, srcdn
, client_map_bl
);
8335 // note which dirfrags have child subtrees in the journal
8336 // event, so that we can open those (as bounds) during replay.
8337 if (srci
->is_dir()) {
8338 auto&& ls
= srci
->get_dirfrags();
8339 for (const auto& dir
: ls
) {
8340 if (!dir
->is_auth())
8341 metablob
->renamed_dir_frags
.push_back(dir
->get_frag());
8343 dout(10) << " noting renamed dir open frags " << metablob
->renamed_dir_frags
<< dendl
;
8346 auto &pi
= srci
->project_inode(); // project snaprealm if srcdnl->is_primary
8347 // & srcdnl->snaprealm
8348 pi
.inode
.version
= mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty(oldpv
);
8349 pi
.inode
.update_backtrace();
8352 destdn
->push_projected_linkage(srci
);
8356 if (srcdn
->is_auth())
8357 mdr
->more()->pvmap
[srcdn
] = srcdn
->pre_dirty();
8358 srcdn
->push_projected_linkage(); // push null linkage
8362 spi
->ctime
= mdr
->get_op_stamp();
8363 if (mdr
->get_op_stamp() > spi
->rstat
.rctime
)
8364 spi
->rstat
.rctime
= mdr
->get_op_stamp();
8370 tpi
->ctime
= mdr
->get_op_stamp();
8371 if (mdr
->get_op_stamp() > tpi
->rstat
.rctime
)
8372 tpi
->rstat
.rctime
= mdr
->get_op_stamp();
8376 destdn
->make_path_string(t
, true);
8377 tpi
->stray_prior_path
= std::move(t
);
8380 if (tpi
->nlink
== 0)
8381 oldin
->state_set(CInode::STATE_ORPHAN
);
8385 // prepare nesting, mtime updates
8386 int predirty_dir
= silent
? 0:PREDIRTY_DIR
;
8388 // guarantee stray dir is processed first during journal replay. unlink the old inode,
8389 // then link the source inode to destdn
8390 if (destdnl
->is_primary()) {
8391 ceph_assert(straydn
);
8392 if (straydn
->is_auth()) {
8393 metablob
->add_dir_context(straydn
->get_dir());
8394 metablob
->add_dir(straydn
->get_dir(), true);
8399 if (destdn
->is_auth() && !destdnl
->is_null()) {
8400 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, destdn
->get_dir(),
8401 (destdnl
->is_primary() ? PREDIRTY_PRIMARY
:0)|predirty_dir
, -1);
8402 if (destdnl
->is_primary()) {
8403 ceph_assert(straydn
);
8404 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, straydn
->get_dir(),
8405 PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
8410 int predirty_primary
= (srcdnl
->is_primary() && srcdn
->get_dir() != destdn
->get_dir()) ? PREDIRTY_PRIMARY
:0;
8411 int flags
= predirty_dir
| predirty_primary
;
8412 if (srcdn
->is_auth())
8413 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, srcdn
->get_dir(), PREDIRTY_SHALLOW
|flags
, -1);
8414 if (destdn
->is_auth())
8415 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, destdn
->get_dir(), flags
, 1);
8417 // add it all to the metablob
8420 if (destdnl
->is_primary()) {
8421 ceph_assert(straydn
);
8422 if (destdn
->is_auth()) {
8423 // project snaprealm, too
8424 if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
8425 oldin
->project_snaprealm(desti_srnode
);
8426 if (tpi
->nlink
== 0)
8427 ceph_assert(!desti_srnode
->is_parent_global());
8428 desti_srnode
= NULL
;
8430 straydn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
8431 metablob
->add_primary_dentry(straydn
, oldin
, true, true);
8432 } else if (force_journal_stray
) {
8433 dout(10) << " forced journaling straydn " << *straydn
<< dendl
;
8434 metablob
->add_dir_context(straydn
->get_dir());
8435 metablob
->add_primary_dentry(straydn
, oldin
, true);
8437 } else if (destdnl
->is_remote()) {
8438 if (oldin
->is_auth()) {
8439 sr_t
*new_srnode
= NULL
;
8440 if (mdr
->slave_request
) {
8441 if (mdr
->slave_request
->desti_snapbl
.length() > 0) {
8442 new_srnode
= new sr_t();
8443 auto p
= mdr
->slave_request
->desti_snapbl
.cbegin();
8444 decode(*new_srnode
, p
);
8446 } else if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
8447 new_srnode
= desti_srnode
;
8448 desti_srnode
= NULL
;
8451 oldin
->project_snaprealm(new_srnode
);
8452 if (tpi
->nlink
== 0)
8453 ceph_assert(!new_srnode
->is_parent_global());
8456 metablob
->add_dir_context(oldin
->get_projected_parent_dir());
8457 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, oldin
->get_projected_parent_dn(),
8458 CEPH_NOSNAP
, 0, destdnl
);
8459 metablob
->add_primary_dentry(oldin
->get_projected_parent_dn(), oldin
, true);
8465 if (srcdnl
->is_remote()) {
8466 ceph_assert(!linkmerge
);
8467 if (destdn
->is_auth() && !destdnl
->is_null())
8468 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
8470 destdn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
8472 if (destdn
->is_auth())
8473 metablob
->add_remote_dentry(destdn
, true, srcdnl
->get_remote_ino(), srcdnl
->get_remote_d_type());
8475 if (srci
->is_auth() ) { // it's remote
8476 if (mdr
->slave_request
) {
8477 if (mdr
->slave_request
->srci_snapbl
.length() > 0) {
8478 sr_t
*new_srnode
= new sr_t();
8479 auto p
= mdr
->slave_request
->srci_snapbl
.cbegin();
8480 decode(*new_srnode
, p
);
8481 srci
->project_snaprealm(new_srnode
);
8483 } else if (auto& srci_srnode
= mdr
->more()->srci_srnode
) {
8484 srci
->project_snaprealm(srci_srnode
);
8488 CDentry
*srci_pdn
= srci
->get_projected_parent_dn();
8489 metablob
->add_dir_context(srci_pdn
->get_dir());
8490 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, srci_pdn
, CEPH_NOSNAP
, 0, srcdnl
);
8491 metablob
->add_primary_dentry(srci_pdn
, srci
, true);
8493 } else if (srcdnl
->is_primary()) {
8494 // project snap parent update?
8495 if (destdn
->is_auth()) {
8496 if (auto& srci_srnode
= mdr
->more()->srci_srnode
) {
8497 srci
->project_snaprealm(srci_srnode
);
8502 if (destdn
->is_auth() && !destdnl
->is_null())
8503 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
8505 destdn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
8507 if (destdn
->is_auth())
8508 metablob
->add_primary_dentry(destdn
, srci
, true, true);
8509 else if (force_journal_dest
) {
8510 dout(10) << " forced journaling destdn " << *destdn
<< dendl
;
8511 metablob
->add_dir_context(destdn
->get_dir());
8512 metablob
->add_primary_dentry(destdn
, srci
, true);
8513 if (srcdn
->is_auth() && srci
->is_dir()) {
8514 // journal new subtrees root dirfrags
8515 auto&& ls
= srci
->get_dirfrags();
8516 for (const auto& dir
: ls
) {
8518 metablob
->add_dir(dir
, true);
8525 if (srcdn
->is_auth()) {
8526 dout(10) << " journaling srcdn " << *srcdn
<< dendl
;
8527 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, srcdn
, CEPH_NOSNAP
, 0, srcdnl
);
8528 // also journal the inode in case we need do slave rename rollback. It is Ok to add
8529 // both primary and NULL dentries. Because during journal replay, null dentry is
8530 // processed after primary dentry.
8531 if (srcdnl
->is_primary() && !srci
->is_dir() && !destdn
->is_auth())
8532 metablob
->add_primary_dentry(srcdn
, srci
, true);
8533 metablob
->add_null_dentry(srcdn
, true);
8535 dout(10) << " NOT journaling srcdn " << *srcdn
<< dendl
;
8537 // make renamed inode first track the dn
8538 if (srcdnl
->is_primary() && destdn
->is_auth()) {
8539 ceph_assert(srci
->first
<= destdn
->first
);
8540 srci
->first
= destdn
->first
;
8542 // make stray inode first track the straydn
8543 if (straydn
&& straydn
->is_auth()) {
8544 ceph_assert(oldin
->first
<= straydn
->first
);
8545 oldin
->first
= straydn
->first
;
8548 if (oldin
&& oldin
->is_dir()) {
8549 ceph_assert(straydn
);
8550 mdcache
->project_subtree_rename(oldin
, destdn
->get_dir(), straydn
->get_dir());
8553 mdcache
->project_subtree_rename(srci
, srcdn
->get_dir(), destdn
->get_dir());
8558 void Server::_rename_apply(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
8560 dout(10) << "_rename_apply " << *mdr
<< " " << *srcdn
<< " " << *destdn
<< dendl
;
8561 dout(10) << " pvs " << mdr
->more()->pvmap
<< dendl
;
8563 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
8564 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
8566 CInode
*oldin
= destdnl
->get_inode();
8568 // primary+remote link merge?
8569 bool linkmerge
= (srcdnl
->get_inode() == oldin
);
8571 ceph_assert(srcdnl
->is_primary() || destdnl
->is_remote());
8573 bool new_in_snaprealm
= false;
8574 bool new_oldin_snaprealm
= false;
8578 if (destdnl
->is_primary()) {
8579 ceph_assert(straydn
);
8580 dout(10) << "straydn is " << *straydn
<< dendl
;
8582 // if there is newly created snaprealm, need to split old snaprealm's
8583 // inodes_with_caps. So pop snaprealm before linkage changes.
8584 if (destdn
->is_auth()) {
8585 bool hadrealm
= (oldin
->snaprealm
? true : false);
8586 oldin
->early_pop_projected_snaprealm();
8587 new_oldin_snaprealm
= (oldin
->snaprealm
&& !hadrealm
);
8589 ceph_assert(mdr
->slave_request
);
8590 if (mdr
->slave_request
->desti_snapbl
.length()) {
8591 new_oldin_snaprealm
= !oldin
->snaprealm
;
8592 oldin
->decode_snap_blob(mdr
->slave_request
->desti_snapbl
);
8593 ceph_assert(oldin
->snaprealm
);
8594 ceph_assert(oldin
->snaprealm
->have_past_parents_open());
8598 destdn
->get_dir()->unlink_inode(destdn
, false);
8600 straydn
->pop_projected_linkage();
8601 if (mdr
->is_slave() && !mdr
->more()->slave_update_journaled
)
8602 ceph_assert(!straydn
->is_projected()); // no other projected
8605 if (destdn
->is_auth())
8606 oldin
->pop_and_dirty_projected_inode(mdr
->ls
);
8608 mdcache
->touch_dentry_bottom(straydn
); // drop dn as quickly as possible.
8609 } else if (destdnl
->is_remote()) {
8610 destdn
->get_dir()->unlink_inode(destdn
, false);
8611 if (oldin
->is_auth()) {
8612 oldin
->pop_and_dirty_projected_inode(mdr
->ls
);
8613 } else if (mdr
->slave_request
) {
8614 if (mdr
->slave_request
->desti_snapbl
.length() > 0) {
8615 ceph_assert(oldin
->snaprealm
);
8616 oldin
->decode_snap_blob(mdr
->slave_request
->desti_snapbl
);
8618 } else if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
8619 delete desti_srnode
;
8620 desti_srnode
= NULL
;
8625 // unlink src before we relink it at dest
8626 CInode
*in
= srcdnl
->get_inode();
8629 bool srcdn_was_remote
= srcdnl
->is_remote();
8630 if (!srcdn_was_remote
) {
8631 // if there is newly created snaprealm, need to split old snaprealm's
8632 // inodes_with_caps. So pop snaprealm before linkage changes.
8633 if (destdn
->is_auth()) {
8634 bool hadrealm
= (in
->snaprealm
? true : false);
8635 in
->early_pop_projected_snaprealm();
8636 new_in_snaprealm
= (in
->snaprealm
&& !hadrealm
);
8638 ceph_assert(mdr
->slave_request
);
8639 if (mdr
->slave_request
->srci_snapbl
.length()) {
8640 new_in_snaprealm
= !in
->snaprealm
;
8641 in
->decode_snap_blob(mdr
->slave_request
->srci_snapbl
);
8642 ceph_assert(in
->snaprealm
);
8643 ceph_assert(in
->snaprealm
->have_past_parents_open());
8648 srcdn
->get_dir()->unlink_inode(srcdn
);
8651 if (srcdn_was_remote
) {
8654 destdnl
= destdn
->pop_projected_linkage();
8655 if (mdr
->is_slave() && !mdr
->more()->slave_update_journaled
)
8656 ceph_assert(!destdn
->is_projected()); // no other projected
8658 destdn
->link_remote(destdnl
, in
);
8659 if (destdn
->is_auth())
8660 destdn
->mark_dirty(mdr
->more()->pvmap
[destdn
], mdr
->ls
);
8662 if (in
->is_auth()) {
8663 in
->pop_and_dirty_projected_inode(mdr
->ls
);
8664 } else if (mdr
->slave_request
) {
8665 if (mdr
->slave_request
->srci_snapbl
.length() > 0) {
8666 ceph_assert(in
->snaprealm
);
8667 in
->decode_snap_blob(mdr
->slave_request
->srci_snapbl
);
8669 } else if (auto& srci_srnode
= mdr
->more()->srci_srnode
) {
8674 dout(10) << "merging remote onto primary link" << dendl
;
8675 oldin
->pop_and_dirty_projected_inode(mdr
->ls
);
8679 dout(10) << "merging primary onto remote link" << dendl
;
8680 destdn
->get_dir()->unlink_inode(destdn
, false);
8682 destdnl
= destdn
->pop_projected_linkage();
8683 if (mdr
->is_slave() && !mdr
->more()->slave_update_journaled
)
8684 ceph_assert(!destdn
->is_projected()); // no other projected
8686 // srcdn inode import?
8687 if (!srcdn
->is_auth() && destdn
->is_auth()) {
8688 ceph_assert(mdr
->more()->inode_import
.length() > 0);
8690 map
<client_t
,Capability::Import
> imported_caps
;
8692 // finish cap imports
8693 finish_force_open_sessions(mdr
->more()->imported_session_map
);
8694 if (mdr
->more()->cap_imports
.count(destdnl
->get_inode())) {
8695 mdcache
->migrator
->finish_import_inode_caps(destdnl
->get_inode(),
8696 mdr
->more()->srcdn_auth_mds
, true,
8697 mdr
->more()->imported_session_map
,
8698 mdr
->more()->cap_imports
[destdnl
->get_inode()],
8702 mdr
->more()->inode_import
.clear();
8703 encode(imported_caps
, mdr
->more()->inode_import
);
8705 /* hack: add an auth pin for each xlock we hold. These were
8706 * remote xlocks previously but now they're local and
8707 * we're going to try and unpin when we xlock_finish. */
8709 for (auto i
= mdr
->locks
.lower_bound(&destdnl
->get_inode()->versionlock
);
8710 i
!= mdr
->locks
.end();
8712 SimpleLock
*lock
= i
->lock
;
8713 if (lock
->get_parent() != destdnl
->get_inode())
8715 if (i
->is_xlock() && !lock
->is_locallock())
8716 mds
->locker
->xlock_import(lock
);
8719 // hack: fix auth bit
8720 in
->state_set(CInode::STATE_AUTH
);
8722 mdr
->clear_ambiguous_auth();
8725 if (destdn
->is_auth())
8726 in
->pop_and_dirty_projected_inode(mdr
->ls
);
8730 if (srcdn
->is_auth())
8731 srcdn
->mark_dirty(mdr
->more()->pvmap
[srcdn
], mdr
->ls
);
8732 srcdn
->pop_projected_linkage();
8733 if (mdr
->is_slave() && !mdr
->more()->slave_update_journaled
)
8734 ceph_assert(!srcdn
->is_projected()); // no other projected
8736 // apply remaining projected inodes (nested)
8739 // update subtree map?
8740 if (destdnl
->is_primary() && in
->is_dir())
8741 mdcache
->adjust_subtree_after_rename(in
, srcdn
->get_dir(), true);
8743 if (straydn
&& oldin
->is_dir())
8744 mdcache
->adjust_subtree_after_rename(oldin
, destdn
->get_dir(), true);
8746 if (new_oldin_snaprealm
)
8747 mdcache
->do_realm_invalidate_and_update_notify(oldin
, CEPH_SNAP_OP_SPLIT
, false);
8748 if (new_in_snaprealm
)
8749 mdcache
->do_realm_invalidate_and_update_notify(in
, CEPH_SNAP_OP_SPLIT
, true);
8751 // removing a new dn?
8752 if (srcdn
->is_auth())
8753 srcdn
->get_dir()->try_remove_unlinked_dn(srcdn
);
8761 class C_MDS_SlaveRenamePrep
: public ServerLogContext
{
8762 CDentry
*srcdn
, *destdn
, *straydn
;
8764 C_MDS_SlaveRenamePrep(Server
*s
, MDRequestRef
& m
, CDentry
*sr
, CDentry
*de
, CDentry
*st
) :
8765 ServerLogContext(s
, m
), srcdn(sr
), destdn(de
), straydn(st
) {}
8766 void finish(int r
) override
{
8767 server
->_logged_slave_rename(mdr
, srcdn
, destdn
, straydn
);
8771 class C_MDS_SlaveRenameCommit
: public ServerContext
{
8773 CDentry
*srcdn
, *destdn
, *straydn
;
8775 C_MDS_SlaveRenameCommit(Server
*s
, MDRequestRef
& m
, CDentry
*sr
, CDentry
*de
, CDentry
*st
) :
8776 ServerContext(s
), mdr(m
), srcdn(sr
), destdn(de
), straydn(st
) {}
8777 void finish(int r
) override
{
8778 server
->_commit_slave_rename(mdr
, r
, srcdn
, destdn
, straydn
);
8782 class C_MDS_SlaveRenameSessionsFlushed
: public ServerContext
{
8785 C_MDS_SlaveRenameSessionsFlushed(Server
*s
, MDRequestRef
& r
) :
8786 ServerContext(s
), mdr(r
) {}
8787 void finish(int r
) override
{
8788 server
->_slave_rename_sessions_flushed(mdr
);
8792 void Server::handle_slave_rename_prep(MDRequestRef
& mdr
)
8794 dout(10) << "handle_slave_rename_prep " << *mdr
8795 << " " << mdr
->slave_request
->srcdnpath
8796 << " to " << mdr
->slave_request
->destdnpath
8799 if (mdr
->slave_request
->is_interrupted()) {
8800 dout(10) << " slave request interrupted, sending noop reply" << dendl
;
8801 auto reply
= make_message
<MMDSSlaveRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_RENAMEPREPACK
);
8802 reply
->mark_interrupted();
8803 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
8804 mdr
->reset_slave_request();
8809 filepath
destpath(mdr
->slave_request
->destdnpath
);
8810 dout(10) << " dest " << destpath
<< dendl
;
8811 vector
<CDentry
*> trace
;
8812 CF_MDS_MDRContextFactory
cf(mdcache
, mdr
, false);
8813 int r
= mdcache
->path_traverse(mdr
, cf
, destpath
,
8814 MDS_TRAVERSE_DISCOVER
| MDS_TRAVERSE_PATH_LOCKED
| MDS_TRAVERSE_WANT_DENTRY
,
8818 mdcache
->find_ino_peers(destpath
.get_ino(), new C_MDS_RetryRequest(mdcache
, mdr
),
8819 mdr
->slave_to_mds
, true);
8822 ceph_assert(r
== 0); // we shouldn't get an error here!
8824 CDentry
*destdn
= trace
.back();
8825 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
8826 dout(10) << " destdn " << *destdn
<< dendl
;
8830 filepath
srcpath(mdr
->slave_request
->srcdnpath
);
8831 dout(10) << " src " << srcpath
<< dendl
;
8832 CInode
*srci
= nullptr;
8833 r
= mdcache
->path_traverse(mdr
, cf
, srcpath
,
8834 MDS_TRAVERSE_DISCOVER
| MDS_TRAVERSE_PATH_LOCKED
,
8837 ceph_assert(r
== 0);
8839 CDentry
*srcdn
= trace
.back();
8840 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
8841 dout(10) << " srcdn " << *srcdn
<< dendl
;
8846 bool linkmerge
= srcdnl
->get_inode() == destdnl
->get_inode();
8848 ceph_assert(srcdnl
->is_primary() && destdnl
->is_remote());
8849 CDentry
*straydn
= mdr
->straydn
;
8850 if (destdnl
->is_primary() && !linkmerge
)
8851 ceph_assert(straydn
);
8853 mdr
->set_op_stamp(mdr
->slave_request
->op_stamp
);
8854 mdr
->more()->srcdn_auth_mds
= srcdn
->authority().first
;
8856 // set up commit waiter (early, to clean up any freezing etc we do)
8857 if (!mdr
->more()->slave_commit
)
8858 mdr
->more()->slave_commit
= new C_MDS_SlaveRenameCommit(this, mdr
, srcdn
, destdn
, straydn
);
8861 if (srcdn
->is_auth()) {
8862 set
<mds_rank_t
> srcdnrep
;
8863 srcdn
->list_replicas(srcdnrep
);
8865 bool reply_witness
= false;
8866 if (srcdnl
->is_primary() && !srcdnl
->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH
)) {
8869 // - avoid conflicting lock state changes
8870 // - avoid concurrent updates to the inode
8871 // (this could also be accomplished with the versionlock)
8872 int allowance
= 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
8873 dout(10) << " freezing srci " << *srcdnl
->get_inode() << " with allowance " << allowance
<< dendl
;
8874 bool frozen_inode
= srcdnl
->get_inode()->freeze_inode(allowance
);
8876 // unfreeze auth pin after freezing the inode to avoid queueing waiters
8877 if (srcdnl
->get_inode()->is_frozen_auth_pin())
8878 mdr
->unfreeze_auth_pin();
8880 if (!frozen_inode
) {
8881 srcdnl
->get_inode()->add_waiter(CInode::WAIT_FROZEN
, new C_MDS_RetryRequest(mdcache
, mdr
));
8886 * set ambiguous auth for srci
8887 * NOTE: we don't worry about ambiguous cache expire as we do
8888 * with subtree migrations because all slaves will pin
8889 * srcdn->get_inode() for duration of this rename.
8891 mdr
->set_ambiguous_auth(srcdnl
->get_inode());
8893 // just mark the source inode as ambiguous auth if more than two MDS are involved.
8894 // the master will send another OP_RENAMEPREP slave request later.
8895 if (mdr
->slave_request
->witnesses
.size() > 1) {
8896 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl
;
8897 reply_witness
= true;
8900 // make sure bystanders have received all lock related messages
8901 for (set
<mds_rank_t
>::iterator p
= srcdnrep
.begin(); p
!= srcdnrep
.end(); ++p
) {
8902 if (*p
== mdr
->slave_to_mds
||
8903 (mds
->is_cluster_degraded() &&
8904 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(*p
)))
8906 auto notify
= make_message
<MMDSSlaveRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_RENAMENOTIFY
);
8907 mds
->send_message_mds(notify
, *p
);
8908 mdr
->more()->waiting_on_slave
.insert(*p
);
8911 // make sure clients have received all cap related messages
8912 set
<client_t
> export_client_set
;
8913 mdcache
->migrator
->get_export_client_set(srcdnl
->get_inode(), export_client_set
);
8915 MDSGatherBuilder
gather(g_ceph_context
);
8916 flush_client_sessions(export_client_set
, gather
);
8917 if (gather
.has_subs()) {
8918 mdr
->more()->waiting_on_slave
.insert(MDS_RANK_NONE
);
8919 gather
.set_finisher(new C_MDS_SlaveRenameSessionsFlushed(this, mdr
));
8924 // is witness list sufficient?
8925 for (set
<mds_rank_t
>::iterator p
= srcdnrep
.begin(); p
!= srcdnrep
.end(); ++p
) {
8926 if (*p
== mdr
->slave_to_mds
||
8927 mdr
->slave_request
->witnesses
.count(*p
)) continue;
8928 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl
;
8929 reply_witness
= true;
8933 if (reply_witness
) {
8934 ceph_assert(!srcdnrep
.empty());
8935 auto reply
= make_message
<MMDSSlaveRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_RENAMEPREPACK
);
8936 reply
->witnesses
.swap(srcdnrep
);
8937 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
8938 mdr
->reset_slave_request();
8941 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl
;
8942 if (!mdr
->more()->waiting_on_slave
.empty()) {
8943 dout(10) << " still waiting for rename notify acks from "
8944 << mdr
->more()->waiting_on_slave
<< dendl
;
8947 } else if (srcdnl
->is_primary() && srcdn
->authority() != destdn
->authority()) {
8948 // set ambiguous auth for srci on witnesses
8949 mdr
->set_ambiguous_auth(srcdnl
->get_inode());
8952 // encode everything we'd need to roll this back... basically, just the original state.
8953 rename_rollback rollback
;
8955 rollback
.reqid
= mdr
->reqid
;
8957 rollback
.orig_src
.dirfrag
= srcdn
->get_dir()->dirfrag();
8958 rollback
.orig_src
.dirfrag_old_mtime
= srcdn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
8959 rollback
.orig_src
.dirfrag_old_rctime
= srcdn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
8960 rollback
.orig_src
.dname
= srcdn
->get_name();
8961 if (srcdnl
->is_primary())
8962 rollback
.orig_src
.ino
= srcdnl
->get_inode()->ino();
8964 ceph_assert(srcdnl
->is_remote());
8965 rollback
.orig_src
.remote_ino
= srcdnl
->get_remote_ino();
8966 rollback
.orig_src
.remote_d_type
= srcdnl
->get_remote_d_type();
8969 rollback
.orig_dest
.dirfrag
= destdn
->get_dir()->dirfrag();
8970 rollback
.orig_dest
.dirfrag_old_mtime
= destdn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
8971 rollback
.orig_dest
.dirfrag_old_rctime
= destdn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
8972 rollback
.orig_dest
.dname
= destdn
->get_name();
8973 if (destdnl
->is_primary())
8974 rollback
.orig_dest
.ino
= destdnl
->get_inode()->ino();
8975 else if (destdnl
->is_remote()) {
8976 rollback
.orig_dest
.remote_ino
= destdnl
->get_remote_ino();
8977 rollback
.orig_dest
.remote_d_type
= destdnl
->get_remote_d_type();
8981 rollback
.stray
.dirfrag
= straydn
->get_dir()->dirfrag();
8982 rollback
.stray
.dirfrag_old_mtime
= straydn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
8983 rollback
.stray
.dirfrag_old_rctime
= straydn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
8984 rollback
.stray
.dname
= straydn
->get_name();
8986 if (mdr
->slave_request
->desti_snapbl
.length()) {
8987 CInode
*oldin
= destdnl
->get_inode();
8988 if (oldin
->snaprealm
) {
8989 encode(true, rollback
.desti_snapbl
);
8990 oldin
->encode_snap_blob(rollback
.desti_snapbl
);
8992 encode(false, rollback
.desti_snapbl
);
8995 if (mdr
->slave_request
->srci_snapbl
.length()) {
8996 if (srci
->snaprealm
) {
8997 encode(true, rollback
.srci_snapbl
);
8998 srci
->encode_snap_blob(rollback
.srci_snapbl
);
9000 encode(false, rollback
.srci_snapbl
);
9003 encode(rollback
, mdr
->more()->rollback_bl
);
9004 // FIXME: rollback snaprealm
9005 dout(20) << " rollback is " << mdr
->more()->rollback_bl
.length() << " bytes" << dendl
;
9008 mdr
->ls
= mdlog
->get_current_segment();
9009 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rename_prep", mdr
->reqid
, mdr
->slave_to_mds
,
9010 ESlaveUpdate::OP_PREPARE
, ESlaveUpdate::RENAME
);
9011 mdlog
->start_entry(le
);
9012 le
->rollback
= mdr
->more()->rollback_bl
;
9014 bufferlist blah
; // inode import data... obviously not used if we're the slave
9015 _rename_prepare(mdr
, &le
->commit
, &blah
, srcdn
, destdn
, straydn
);
9017 if (le
->commit
.empty()) {
9018 dout(10) << " empty metablob, skipping journal" << dendl
;
9019 mdlog
->cancel_entry(le
);
9021 _logged_slave_rename(mdr
, srcdn
, destdn
, straydn
);
9023 mdcache
->add_uncommitted_slave(mdr
->reqid
, mdr
->ls
, mdr
->slave_to_mds
);
9024 mdr
->more()->slave_update_journaled
= true;
9025 submit_mdlog_entry(le
, new C_MDS_SlaveRenamePrep(this, mdr
, srcdn
, destdn
, straydn
),
9031 void Server::_logged_slave_rename(MDRequestRef
& mdr
,
9032 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
9034 dout(10) << "_logged_slave_rename " << *mdr
<< dendl
;
9037 ref_t
<MMDSSlaveRequest
> reply
;
9038 if (!mdr
->aborted
) {
9039 reply
= make_message
<MMDSSlaveRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_RENAMEPREPACK
);
9040 if (!mdr
->more()->slave_update_journaled
)
9041 reply
->mark_not_journaled();
9044 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
9045 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
9048 if (srcdn
->is_auth() && srcdnl
->is_primary()) {
9049 // set export bounds for CInode::encode_export()
9051 std::vector
<CDir
*> bounds
;
9052 if (srcdnl
->get_inode()->is_dir()) {
9053 srcdnl
->get_inode()->get_dirfrags(bounds
);
9054 for (const auto& bound
: bounds
) {
9055 bound
->state_set(CDir::STATE_EXPORTBOUND
);
9059 map
<client_t
,entity_inst_t
> exported_client_map
;
9060 map
<client_t
, client_metadata_t
> exported_client_metadata_map
;
9062 mdcache
->migrator
->encode_export_inode(srcdnl
->get_inode(), inodebl
,
9063 exported_client_map
,
9064 exported_client_metadata_map
);
9066 for (const auto& bound
: bounds
) {
9067 bound
->state_clear(CDir::STATE_EXPORTBOUND
);
9070 encode(exported_client_map
, reply
->inode_export
, mds
->mdsmap
->get_up_features());
9071 encode(exported_client_metadata_map
, reply
->inode_export
);
9072 reply
->inode_export
.claim_append(inodebl
);
9073 reply
->inode_export_v
= srcdnl
->get_inode()->inode
.version
;
9076 // remove mdr auth pin
9077 mdr
->auth_unpin(srcdnl
->get_inode());
9078 mdr
->more()->is_inode_exporter
= true;
9080 if (srcdnl
->get_inode()->is_dirty())
9081 srcdnl
->get_inode()->mark_clean();
9083 dout(10) << " exported srci " << *srcdnl
->get_inode() << dendl
;
9087 _rename_apply(mdr
, srcdn
, destdn
, straydn
);
9089 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
9092 mds
->balancer
->hit_dir(srcdn
->get_dir(), META_POP_IWR
);
9093 if (destdnl
->get_inode() && destdnl
->get_inode()->is_auth())
9094 mds
->balancer
->hit_inode(destdnl
->get_inode(), META_POP_IWR
);
9097 mdr
->reset_slave_request();
9101 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
9103 ceph_assert(mdr
->aborted
);
9104 dout(10) << " abort flag set, finishing" << dendl
;
9105 mdcache
->request_finish(mdr
);
9109 void Server::_commit_slave_rename(MDRequestRef
& mdr
, int r
,
9110 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
9112 dout(10) << "_commit_slave_rename " << *mdr
<< " r=" << r
<< dendl
;
9114 CInode
*in
= destdn
->get_linkage()->get_inode();
9116 inodeno_t migrated_stray
;
9117 if (srcdn
->is_auth() && srcdn
->get_dir()->inode
->is_stray())
9118 migrated_stray
= in
->ino();
9120 MDSContext::vec finished
;
9122 // unfreeze+singleauth inode
9123 // hmm, do i really need to delay this?
9124 if (mdr
->more()->is_inode_exporter
) {
9126 // we exported, clear out any xlocks that we moved to another MDS
9128 for (auto i
= mdr
->locks
.lower_bound(&in
->versionlock
);
9129 i
!= mdr
->locks
.end(); ) {
9130 SimpleLock
*lock
= i
->lock
;
9131 if (lock
->get_parent() != in
)
9133 // we only care about xlocks on the exported inode
9134 if (i
->is_xlock() && !lock
->is_locallock())
9135 mds
->locker
->xlock_export(i
++, mdr
.get());
9140 map
<client_t
,Capability::Import
> peer_imported
;
9141 auto bp
= mdr
->more()->inode_import
.cbegin();
9142 decode(peer_imported
, bp
);
9144 dout(10) << " finishing inode export on " << *in
<< dendl
;
9145 mdcache
->migrator
->finish_export_inode(in
, mdr
->slave_to_mds
, peer_imported
, finished
);
9146 mds
->queue_waiters(finished
); // this includes SINGLEAUTH waiters.
9149 ceph_assert(in
->is_frozen_inode());
9150 in
->unfreeze_inode(finished
);
9154 if (mdr
->more()->is_ambiguous_auth
) {
9155 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
9156 mdr
->more()->is_ambiguous_auth
= false;
9159 if (straydn
&& mdr
->more()->slave_update_journaled
) {
9160 CInode
*strayin
= straydn
->get_projected_linkage()->get_inode();
9161 if (strayin
&& !strayin
->snaprealm
)
9162 mdcache
->clear_dirty_bits_for_stray(strayin
);
9165 mds
->queue_waiters(finished
);
9168 if (mdr
->more()->slave_update_journaled
) {
9169 // write a commit to the journal
9170 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rename_commit", mdr
->reqid
,
9171 mdr
->slave_to_mds
, ESlaveUpdate::OP_COMMIT
,
9172 ESlaveUpdate::RENAME
);
9173 mdlog
->start_entry(le
);
9174 submit_mdlog_entry(le
, new C_MDS_CommittedSlave(this, mdr
), mdr
, __func__
);
9177 _committed_slave(mdr
);
9182 // rollback_bl may be empty if we froze the inode but had to provide an expanded
9183 // witness list from the master, and they failed before we tried prep again.
9184 if (mdr
->more()->rollback_bl
.length()) {
9185 if (mdr
->more()->is_inode_exporter
) {
9186 dout(10) << " reversing inode export of " << *in
<< dendl
;
9189 if (mdcache
->is_ambiguous_slave_update(mdr
->reqid
, mdr
->slave_to_mds
)) {
9190 mdcache
->remove_ambiguous_slave_update(mdr
->reqid
, mdr
->slave_to_mds
);
9191 // rollback but preserve the slave request
9192 do_rename_rollback(mdr
->more()->rollback_bl
, mdr
->slave_to_mds
, mdr
, false);
9193 mdr
->more()->rollback_bl
.clear();
9195 do_rename_rollback(mdr
->more()->rollback_bl
, mdr
->slave_to_mds
, mdr
, true);
9197 dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl
;
9199 if (mdr
->more()->is_ambiguous_auth
) {
9200 if (srcdn
->is_auth())
9201 mdr
->more()->rename_inode
->unfreeze_inode(finished
);
9203 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
9204 mdr
->more()->is_ambiguous_auth
= false;
9206 mds
->queue_waiters(finished
);
9207 mdcache
->request_finish(mdr
);
9211 if (migrated_stray
&& mds
->is_stopping())
9212 mdcache
->shutdown_export_stray_finish(migrated_stray
);
9215 void _rollback_repair_dir(MutationRef
& mut
, CDir
*dir
, rename_rollback::drec
&r
, utime_t ctime
,
9216 bool isdir
, int linkunlink
, nest_info_t
&rstat
)
9219 pf
= dir
->project_fnode();
9220 mut
->add_projected_fnode(dir
);
9221 pf
->version
= dir
->pre_dirty();
9224 pf
->fragstat
.nsubdirs
+= linkunlink
;
9226 pf
->fragstat
.nfiles
+= linkunlink
;
9229 pf
->rstat
.rbytes
+= linkunlink
* rstat
.rbytes
;
9230 pf
->rstat
.rfiles
+= linkunlink
* rstat
.rfiles
;
9231 pf
->rstat
.rsubdirs
+= linkunlink
* rstat
.rsubdirs
;
9232 pf
->rstat
.rsnaps
+= linkunlink
* rstat
.rsnaps
;
9234 if (pf
->fragstat
.mtime
== ctime
) {
9235 pf
->fragstat
.mtime
= r
.dirfrag_old_mtime
;
9236 if (pf
->rstat
.rctime
== ctime
)
9237 pf
->rstat
.rctime
= r
.dirfrag_old_rctime
;
9239 mut
->add_updated_lock(&dir
->get_inode()->filelock
);
9240 mut
->add_updated_lock(&dir
->get_inode()->nestlock
);
9243 struct C_MDS_LoggedRenameRollback
: public ServerLogContext
{
9249 map
<client_t
,ref_t
<MClientSnap
>> splits
[2];
9251 C_MDS_LoggedRenameRollback(Server
*s
, MutationRef
& m
, MDRequestRef
& r
,
9252 CDentry
*sd
, version_t pv
, CDentry
*dd
, CDentry
*st
,
9253 map
<client_t
,ref_t
<MClientSnap
>> _splits
[2], bool f
) :
9254 ServerLogContext(s
, r
), mut(m
), srcdn(sd
), srcdnpv(pv
), destdn(dd
),
9255 straydn(st
), finish_mdr(f
) {
9256 splits
[0].swap(_splits
[0]);
9257 splits
[1].swap(_splits
[1]);
9259 void finish(int r
) override
{
9260 server
->_rename_rollback_finish(mut
, mdr
, srcdn
, srcdnpv
,
9261 destdn
, straydn
, splits
, finish_mdr
);
9265 void Server::do_rename_rollback(bufferlist
&rbl
, mds_rank_t master
, MDRequestRef
& mdr
,
9268 rename_rollback rollback
;
9269 auto p
= rbl
.cbegin();
9270 decode(rollback
, p
);
9272 dout(10) << "do_rename_rollback on " << rollback
.reqid
<< dendl
;
9273 // need to finish this update before sending resolve to claim the subtree
9274 mdcache
->add_rollback(rollback
.reqid
, master
);
9276 MutationRef
mut(new MutationImpl(nullptr, utime_t(), rollback
.reqid
));
9277 mut
->ls
= mds
->mdlog
->get_current_segment();
9279 CDentry
*srcdn
= NULL
;
9280 CDir
*srcdir
= mdcache
->get_dirfrag(rollback
.orig_src
.dirfrag
);
9282 srcdir
= mdcache
->get_dirfrag(rollback
.orig_src
.dirfrag
.ino
, rollback
.orig_src
.dname
);
9284 dout(10) << " srcdir " << *srcdir
<< dendl
;
9285 srcdn
= srcdir
->lookup(rollback
.orig_src
.dname
);
9287 dout(10) << " srcdn " << *srcdn
<< dendl
;
9288 ceph_assert(srcdn
->get_linkage()->is_null());
9290 dout(10) << " srcdn not found" << dendl
;
9292 dout(10) << " srcdir not found" << dendl
;
9294 CDentry
*destdn
= NULL
;
9295 CDir
*destdir
= mdcache
->get_dirfrag(rollback
.orig_dest
.dirfrag
);
9297 destdir
= mdcache
->get_dirfrag(rollback
.orig_dest
.dirfrag
.ino
, rollback
.orig_dest
.dname
);
9299 dout(10) << " destdir " << *destdir
<< dendl
;
9300 destdn
= destdir
->lookup(rollback
.orig_dest
.dname
);
9302 dout(10) << " destdn " << *destdn
<< dendl
;
9304 dout(10) << " destdn not found" << dendl
;
9306 dout(10) << " destdir not found" << dendl
;
9309 if (rollback
.orig_src
.ino
) {
9310 in
= mdcache
->get_inode(rollback
.orig_src
.ino
);
9311 if (in
&& in
->is_dir())
9312 ceph_assert(srcdn
&& destdn
);
9314 in
= mdcache
->get_inode(rollback
.orig_src
.remote_ino
);
9316 CDir
*straydir
= NULL
;
9317 CDentry
*straydn
= NULL
;
9318 if (rollback
.stray
.dirfrag
.ino
) {
9319 straydir
= mdcache
->get_dirfrag(rollback
.stray
.dirfrag
);
9321 dout(10) << "straydir " << *straydir
<< dendl
;
9322 straydn
= straydir
->lookup(rollback
.stray
.dname
);
9324 dout(10) << " straydn " << *straydn
<< dendl
;
9325 ceph_assert(straydn
->get_linkage()->is_primary());
9327 dout(10) << " straydn not found" << dendl
;
9329 dout(10) << "straydir not found" << dendl
;
9332 CInode
*target
= NULL
;
9333 if (rollback
.orig_dest
.ino
) {
9334 target
= mdcache
->get_inode(rollback
.orig_dest
.ino
);
9336 ceph_assert(destdn
&& straydn
);
9337 } else if (rollback
.orig_dest
.remote_ino
)
9338 target
= mdcache
->get_inode(rollback
.orig_dest
.remote_ino
);
9340 // can't use is_auth() in the resolve stage
9341 mds_rank_t whoami
= mds
->get_nodeid();
9343 ceph_assert(!destdn
|| destdn
->authority().first
!= whoami
);
9344 ceph_assert(!straydn
|| straydn
->authority().first
!= whoami
);
9346 bool force_journal_src
= false;
9347 bool force_journal_dest
= false;
9348 if (in
&& in
->is_dir() && srcdn
->authority().first
!= whoami
)
9349 force_journal_src
= _need_force_journal(in
, false);
9350 if (in
&& target
&& target
->is_dir())
9351 force_journal_dest
= _need_force_journal(in
, true);
9353 version_t srcdnpv
= 0;
9356 if (srcdn
->authority().first
== whoami
)
9357 srcdnpv
= srcdn
->pre_dirty();
9358 if (rollback
.orig_src
.ino
) {
9360 srcdn
->push_projected_linkage(in
);
9362 srcdn
->push_projected_linkage(rollback
.orig_src
.remote_ino
,
9363 rollback
.orig_src
.remote_d_type
);
9366 map
<client_t
,ref_t
<MClientSnap
>> splits
[2];
9368 CInode::mempool_inode
*pip
= nullptr;
9371 if (in
->get_projected_parent_dn()->authority().first
== whoami
) {
9372 auto &pi
= in
->project_inode();
9374 mut
->add_projected_inode(in
);
9375 pip
->version
= in
->pre_dirty();
9378 pip
= in
->get_projected_inode();
9381 if (pip
->ctime
== rollback
.ctime
)
9382 pip
->ctime
= rollback
.orig_src
.old_ctime
;
9384 if (rollback
.srci_snapbl
.length() && in
->snaprealm
) {
9386 auto p
= rollback
.srci_snapbl
.cbegin();
9387 decode(hadrealm
, p
);
9389 if (projected
&& !mds
->is_resolve()) {
9390 sr_t
*new_srnode
= new sr_t();
9391 decode(*new_srnode
, p
);
9392 in
->project_snaprealm(new_srnode
);
9394 decode(in
->snaprealm
->srnode
, p
);
9397 if (rollback
.orig_src
.ino
) {
9398 ceph_assert(srcdir
);
9399 realm
= srcdir
->get_inode()->find_snaprealm();
9401 realm
= in
->snaprealm
->parent
;
9403 if (!mds
->is_resolve())
9404 mdcache
->prepare_realm_merge(in
->snaprealm
, realm
, splits
[0]);
9406 in
->project_snaprealm(NULL
);
9408 in
->snaprealm
->merge_to(realm
);
9413 if (srcdn
&& srcdn
->authority().first
== whoami
) {
9415 _rollback_repair_dir(mut
, srcdir
, rollback
.orig_src
, rollback
.ctime
,
9416 in
? in
->is_dir() : false, 1, pip
? pip
->accounted_rstat
: blah
);
9421 if (rollback
.orig_dest
.ino
&& target
) {
9422 destdn
->push_projected_linkage(target
);
9423 } else if (rollback
.orig_dest
.remote_ino
) {
9424 destdn
->push_projected_linkage(rollback
.orig_dest
.remote_ino
,
9425 rollback
.orig_dest
.remote_d_type
);
9427 // the dentry will be trimmed soon, it's ok to have wrong linkage
9428 if (rollback
.orig_dest
.ino
)
9429 ceph_assert(mds
->is_resolve());
9430 destdn
->push_projected_linkage();
9435 straydn
->push_projected_linkage();
9439 CInode::mempool_inode
*ti
= nullptr;
9440 if (target
->get_projected_parent_dn()->authority().first
== whoami
) {
9441 auto &pi
= target
->project_inode();
9443 mut
->add_projected_inode(target
);
9444 ti
->version
= target
->pre_dirty();
9447 ti
= target
->get_projected_inode();
9450 if (ti
->ctime
== rollback
.ctime
)
9451 ti
->ctime
= rollback
.orig_dest
.old_ctime
;
9452 if (MDS_INO_IS_STRAY(rollback
.orig_src
.dirfrag
.ino
)) {
9453 if (MDS_INO_IS_STRAY(rollback
.orig_dest
.dirfrag
.ino
))
9454 ceph_assert(!rollback
.orig_dest
.ino
&& !rollback
.orig_dest
.remote_ino
);
9456 ceph_assert(rollback
.orig_dest
.remote_ino
&&
9457 rollback
.orig_dest
.remote_ino
== rollback
.orig_src
.ino
);
9461 if (rollback
.desti_snapbl
.length() && target
->snaprealm
) {
9463 auto p
= rollback
.desti_snapbl
.cbegin();
9464 decode(hadrealm
, p
);
9466 if (projected
&& !mds
->is_resolve()) {
9467 sr_t
*new_srnode
= new sr_t();
9468 decode(*new_srnode
, p
);
9469 target
->project_snaprealm(new_srnode
);
9471 decode(target
->snaprealm
->srnode
, p
);
9474 if (rollback
.orig_dest
.ino
) {
9475 ceph_assert(destdir
);
9476 realm
= destdir
->get_inode()->find_snaprealm();
9478 realm
= target
->snaprealm
->parent
;
9480 if (!mds
->is_resolve())
9481 mdcache
->prepare_realm_merge(target
->snaprealm
, realm
, splits
[1]);
9483 target
->project_snaprealm(NULL
);
9485 target
->snaprealm
->merge_to(realm
);
9491 dout(0) << " srcdn back to " << *srcdn
<< dendl
;
9493 dout(0) << " srci back to " << *in
<< dendl
;
9495 dout(0) << " destdn back to " << *destdn
<< dendl
;
9497 dout(0) << " desti back to " << *target
<< dendl
;
9500 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rename_rollback", rollback
.reqid
, master
,
9501 ESlaveUpdate::OP_ROLLBACK
, ESlaveUpdate::RENAME
);
9502 mdlog
->start_entry(le
);
9504 if (srcdn
&& (srcdn
->authority().first
== whoami
|| force_journal_src
)) {
9505 le
->commit
.add_dir_context(srcdir
);
9506 if (rollback
.orig_src
.ino
)
9507 le
->commit
.add_primary_dentry(srcdn
, 0, true);
9509 le
->commit
.add_remote_dentry(srcdn
, true);
9512 if (!rollback
.orig_src
.ino
&& // remote linkage
9513 in
&& in
->authority().first
== whoami
) {
9514 le
->commit
.add_dir_context(in
->get_projected_parent_dir());
9515 le
->commit
.add_primary_dentry(in
->get_projected_parent_dn(), in
, true);
9518 if (force_journal_dest
) {
9519 ceph_assert(rollback
.orig_dest
.ino
);
9520 le
->commit
.add_dir_context(destdir
);
9521 le
->commit
.add_primary_dentry(destdn
, 0, true);
9524 // slave: no need to journal straydn
9526 if (target
&& target
!= in
&& target
->authority().first
== whoami
) {
9527 ceph_assert(rollback
.orig_dest
.remote_ino
);
9528 le
->commit
.add_dir_context(target
->get_projected_parent_dir());
9529 le
->commit
.add_primary_dentry(target
->get_projected_parent_dn(), target
, true);
9532 if (in
&& in
->is_dir() && (srcdn
->authority().first
== whoami
|| force_journal_src
)) {
9533 dout(10) << " noting renamed dir ino " << in
->ino() << " in metablob" << dendl
;
9534 le
->commit
.renamed_dirino
= in
->ino();
9535 if (srcdn
->authority().first
== whoami
) {
9536 auto&& ls
= in
->get_dirfrags();
9537 for (const auto& dir
: ls
) {
9538 if (!dir
->is_auth())
9539 le
->commit
.renamed_dir_frags
.push_back(dir
->get_frag());
9541 dout(10) << " noting renamed dir open frags " << le
->commit
.renamed_dir_frags
<< dendl
;
9543 } else if (force_journal_dest
) {
9544 dout(10) << " noting rename target ino " << target
->ino() << " in metablob" << dendl
;
9545 le
->commit
.renamed_dirino
= target
->ino();
9548 if (target
&& target
->is_dir()) {
9549 ceph_assert(destdn
);
9550 mdcache
->project_subtree_rename(target
, straydir
, destdir
);
9553 if (in
&& in
->is_dir()) {
9555 mdcache
->project_subtree_rename(in
, destdir
, srcdir
);
9558 if (mdr
&& !mdr
->more()->slave_update_journaled
) {
9559 ceph_assert(le
->commit
.empty());
9560 mdlog
->cancel_entry(le
);
9562 _rename_rollback_finish(mut
, mdr
, srcdn
, srcdnpv
, destdn
, straydn
, splits
, finish_mdr
);
9564 ceph_assert(!le
->commit
.empty());
9566 mdr
->more()->slave_update_journaled
= false;
9567 MDSLogContextBase
*fin
= new C_MDS_LoggedRenameRollback(this, mut
, mdr
,
9568 srcdn
, srcdnpv
, destdn
, straydn
,
9569 splits
, finish_mdr
);
9570 submit_mdlog_entry(le
, fin
, mdr
, __func__
);
9575 void Server::_rename_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
, CDentry
*srcdn
,
9576 version_t srcdnpv
, CDentry
*destdn
, CDentry
*straydn
,
9577 map
<client_t
,ref_t
<MClientSnap
>> splits
[2], bool finish_mdr
)
9579 dout(10) << "_rename_rollback_finish " << mut
->reqid
<< dendl
;
9582 straydn
->get_dir()->unlink_inode(straydn
);
9583 straydn
->pop_projected_linkage();
9586 destdn
->get_dir()->unlink_inode(destdn
);
9587 destdn
->pop_projected_linkage();
9590 srcdn
->pop_projected_linkage();
9591 if (srcdn
->authority().first
== mds
->get_nodeid()) {
9592 srcdn
->mark_dirty(srcdnpv
, mut
->ls
);
9593 if (srcdn
->get_linkage()->is_primary())
9594 srcdn
->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH
);
9600 if (srcdn
&& srcdn
->get_linkage()->is_primary()) {
9601 CInode
*in
= srcdn
->get_linkage()->get_inode();
9602 if (in
&& in
->is_dir()) {
9603 ceph_assert(destdn
);
9604 mdcache
->adjust_subtree_after_rename(in
, destdn
->get_dir(), true);
9609 CInode
*oldin
= destdn
->get_linkage()->get_inode();
9610 // update subtree map?
9611 if (oldin
&& oldin
->is_dir()) {
9612 ceph_assert(straydn
);
9613 mdcache
->adjust_subtree_after_rename(oldin
, straydn
->get_dir(), true);
9617 if (mds
->is_resolve()) {
9620 root
= mdcache
->get_subtree_root(straydn
->get_dir());
9622 root
= mdcache
->get_subtree_root(destdn
->get_dir());
9624 mdcache
->try_trim_non_auth_subtree(root
);
9626 mdcache
->send_snaps(splits
[1]);
9627 mdcache
->send_snaps(splits
[0]);
9631 MDSContext::vec finished
;
9632 if (mdr
->more()->is_ambiguous_auth
) {
9633 if (srcdn
->is_auth())
9634 mdr
->more()->rename_inode
->unfreeze_inode(finished
);
9636 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
9637 mdr
->more()->is_ambiguous_auth
= false;
9639 mds
->queue_waiters(finished
);
9640 if (finish_mdr
|| mdr
->aborted
)
9641 mdcache
->request_finish(mdr
);
9643 mdr
->more()->slave_rolling_back
= false;
9646 mdcache
->finish_rollback(mut
->reqid
, mdr
);
9651 void Server::handle_slave_rename_prep_ack(MDRequestRef
& mdr
, const cref_t
<MMDSSlaveRequest
> &ack
)
9653 dout(10) << "handle_slave_rename_prep_ack " << *mdr
9654 << " witnessed by " << ack
->get_source()
9655 << " " << *ack
<< dendl
;
9656 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
9659 mdr
->more()->slaves
.insert(from
);
9660 if (mdr
->more()->srcdn_auth_mds
== from
&&
9661 mdr
->more()->is_remote_frozen_authpin
&&
9662 !mdr
->more()->is_ambiguous_auth
) {
9663 mdr
->set_ambiguous_auth(mdr
->more()->rename_inode
);
9666 // witnessed? or add extra witnesses?
9667 ceph_assert(mdr
->more()->witnessed
.count(from
) == 0);
9668 if (ack
->is_interrupted()) {
9669 dout(10) << " slave request interrupted, noop" << dendl
;
9670 } else if (ack
->witnesses
.empty()) {
9671 mdr
->more()->witnessed
.insert(from
);
9672 if (!ack
->is_not_journaled())
9673 mdr
->more()->has_journaled_slaves
= true;
9675 dout(10) << " extra witnesses (srcdn replicas) are " << ack
->witnesses
<< dendl
;
9676 mdr
->more()->extra_witnesses
= ack
->witnesses
;
9677 mdr
->more()->extra_witnesses
.erase(mds
->get_nodeid()); // not me!
9681 if (ack
->inode_export
.length()) {
9682 dout(10) << " got srci import" << dendl
;
9683 mdr
->more()->inode_import
.share(ack
->inode_export
);
9684 mdr
->more()->inode_import_v
= ack
->inode_export_v
;
9687 // remove from waiting list
9688 ceph_assert(mdr
->more()->waiting_on_slave
.count(from
));
9689 mdr
->more()->waiting_on_slave
.erase(from
);
9691 if (mdr
->more()->waiting_on_slave
.empty())
9692 dispatch_client_request(mdr
); // go again!
9694 dout(10) << "still waiting on slaves " << mdr
->more()->waiting_on_slave
<< dendl
;
9697 void Server::handle_slave_rename_notify_ack(MDRequestRef
& mdr
, const cref_t
<MMDSSlaveRequest
> &ack
)
9699 dout(10) << "handle_slave_rename_notify_ack " << *mdr
<< " from mds."
9700 << ack
->get_source() << dendl
;
9701 ceph_assert(mdr
->is_slave());
9702 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
9704 if (mdr
->more()->waiting_on_slave
.count(from
)) {
9705 mdr
->more()->waiting_on_slave
.erase(from
);
9707 if (mdr
->more()->waiting_on_slave
.empty()) {
9708 if (mdr
->slave_request
)
9709 dispatch_slave_request(mdr
);
9711 dout(10) << " still waiting for rename notify acks from "
9712 << mdr
->more()->waiting_on_slave
<< dendl
;
9716 void Server::_slave_rename_sessions_flushed(MDRequestRef
& mdr
)
9718 dout(10) << "_slave_rename_sessions_flushed " << *mdr
<< dendl
;
9720 if (mdr
->more()->waiting_on_slave
.count(MDS_RANK_NONE
)) {
9721 mdr
->more()->waiting_on_slave
.erase(MDS_RANK_NONE
);
9723 if (mdr
->more()->waiting_on_slave
.empty()) {
9724 if (mdr
->slave_request
)
9725 dispatch_slave_request(mdr
);
9727 dout(10) << " still waiting for rename notify acks from "
9728 << mdr
->more()->waiting_on_slave
<< dendl
;
9733 /* This function takes responsibility for the passed mdr*/
9734 void Server::handle_client_lssnap(MDRequestRef
& mdr
)
9736 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
9739 CInode
*diri
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
9743 if (!diri
->is_dir()) {
9744 respond_to_request(mdr
, -ENOTDIR
);
9747 dout(10) << "lssnap on " << *diri
<< dendl
;
9750 if (!mds
->locker
->try_rdlock_snap_layout(diri
, mdr
))
9753 if (!check_access(mdr
, diri
, MAY_READ
))
9756 SnapRealm
*realm
= diri
->find_snaprealm();
9757 map
<snapid_t
,const SnapInfo
*> infomap
;
9758 realm
->get_snap_info(infomap
, diri
->get_oldest_snap());
9760 unsigned max_entries
= req
->head
.args
.readdir
.max_entries
;
9762 max_entries
= infomap
.size();
9763 int max_bytes
= req
->head
.args
.readdir
.max_bytes
;
9765 // make sure at least one item can be encoded
9766 max_bytes
= (512 << 10) + g_conf()->mds_max_xattr_pairs_size
;
9768 __u64 last_snapid
= 0;
9769 string offset_str
= req
->get_path2();
9770 if (!offset_str
.empty())
9771 last_snapid
= realm
->resolve_snapname(offset_str
, diri
->ino());
9775 static DirStat empty
;
9776 CDir::encode_dirstat(dirbl
, mdr
->session
->info
, empty
);
9778 max_bytes
-= dirbl
.length() - sizeof(__u32
) + sizeof(__u8
) * 2;
9782 auto p
= infomap
.upper_bound(last_snapid
);
9783 for (; p
!= infomap
.end() && num
< max_entries
; ++p
) {
9784 dout(10) << p
->first
<< " -> " << *p
->second
<< dendl
;
9788 if (p
->second
->ino
== diri
->ino())
9789 snap_name
= p
->second
->name
;
9791 snap_name
= p
->second
->get_long_name();
9793 unsigned start_len
= dnbl
.length();
9794 if (int(start_len
+ snap_name
.length() + sizeof(__u32
) + sizeof(LeaseStat
)) > max_bytes
)
9797 encode(snap_name
, dnbl
);
9799 LeaseStat
e(CEPH_LEASE_VALID
, -1, 0);
9800 mds
->locker
->encode_lease(dnbl
, mdr
->session
->info
, e
);
9801 dout(20) << "encode_infinite_lease" << dendl
;
9803 int r
= diri
->encode_inodestat(dnbl
, mdr
->session
, realm
, p
->first
, max_bytes
- (int)dnbl
.length());
9806 keep
.substr_of(dnbl
, 0, start_len
);
9815 if (p
== infomap
.end()) {
9816 flags
= CEPH_READDIR_FRAG_END
;
9817 if (last_snapid
== 0)
9818 flags
|= CEPH_READDIR_FRAG_COMPLETE
;
9820 encode(flags
, dirbl
);
9821 dirbl
.claim_append(dnbl
);
9823 mdr
->reply_extra_bl
= dirbl
;
9825 respond_to_request(mdr
, 0);
9831 struct C_MDS_mksnap_finish
: public ServerLogContext
{
9834 C_MDS_mksnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, SnapInfo
&i
) :
9835 ServerLogContext(s
, r
), diri(di
), info(i
) {}
9836 void finish(int r
) override
{
9837 server
->_mksnap_finish(mdr
, diri
, info
);
9841 /* This function takes responsibility for the passed mdr*/
9842 void Server::handle_client_mksnap(MDRequestRef
& mdr
)
9844 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
9845 // make sure we have as new a map as the client
9846 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
9847 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
9850 if (!mds
->mdsmap
->allows_snaps()) {
9851 // you can't make snapshots until you set an option right now
9852 respond_to_request(mdr
, -EPERM
);
9856 CInode
*diri
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
9861 if (!diri
->is_dir()) {
9862 respond_to_request(mdr
, -ENOTDIR
);
9865 if (diri
->is_system() && !diri
->is_root()) {
9866 // no snaps in system dirs (root is ok)
9867 respond_to_request(mdr
, -EPERM
);
9871 std::string_view snapname
= req
->get_filepath().last_dentry();
9873 if (mdr
->client_request
->get_caller_uid() < g_conf()->mds_snap_min_uid
|| mdr
->client_request
->get_caller_uid() > g_conf()->mds_snap_max_uid
) {
9874 dout(20) << "mksnap " << snapname
<< " on " << *diri
<< " denied to uid " << mdr
->client_request
->get_caller_uid() << dendl
;
9875 respond_to_request(mdr
, -EPERM
);
9879 dout(10) << "mksnap " << snapname
<< " on " << *diri
<< dendl
;
9882 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
9883 MutationImpl::LockOpVec lov
;
9884 lov
.add_xlock(&diri
->snaplock
);
9885 if (!mds
->locker
->acquire_locks(mdr
, lov
))
9888 if (CDentry
*pdn
= diri
->get_projected_parent_dn(); pdn
) {
9889 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
))
9892 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
9895 if (!check_access(mdr
, diri
, MAY_WRITE
|MAY_SNAPSHOT
))
9898 // check if we can create any more snapshots
9899 // we don't allow any more if we are already at or beyond the limit
9900 if (diri
->snaprealm
&&
9901 diri
->snaprealm
->get_snaps().size() >= max_snaps_per_dir
) {
9902 respond_to_request(mdr
, -EMLINK
);
9906 // make sure name is unique
9907 if (diri
->snaprealm
&&
9908 diri
->snaprealm
->exists(snapname
)) {
9909 respond_to_request(mdr
, -EEXIST
);
9912 if (snapname
.length() == 0 ||
9913 snapname
[0] == '_') {
9914 respond_to_request(mdr
, -EINVAL
);
9918 // allocate a snapid
9919 if (!mdr
->more()->stid
) {
9921 mds
->snapclient
->prepare_create(diri
->ino(), snapname
,
9922 mdr
->get_mds_stamp(),
9923 &mdr
->more()->stid
, &mdr
->more()->snapidbl
,
9924 new C_MDS_RetryRequest(mdcache
, mdr
));
9928 version_t stid
= mdr
->more()->stid
;
9930 auto p
= mdr
->more()->snapidbl
.cbegin();
9932 dout(10) << " stid " << stid
<< " snapid " << snapid
<< dendl
;
9934 ceph_assert(mds
->snapclient
->get_cached_version() >= stid
);
9938 info
.ino
= diri
->ino();
9939 info
.snapid
= snapid
;
9940 info
.name
= snapname
;
9941 info
.stamp
= mdr
->get_op_stamp();
9943 auto &pi
= diri
->project_inode(false, true);
9944 pi
.inode
.ctime
= info
.stamp
;
9945 if (info
.stamp
> pi
.inode
.rstat
.rctime
)
9946 pi
.inode
.rstat
.rctime
= info
.stamp
;
9947 pi
.inode
.rstat
.rsnaps
++;
9948 pi
.inode
.version
= diri
->pre_dirty();
9950 // project the snaprealm
9951 auto &newsnap
= *pi
.snapnode
;
9952 newsnap
.created
= snapid
;
9953 auto em
= newsnap
.snaps
.emplace(std::piecewise_construct
, std::forward_as_tuple(snapid
), std::forward_as_tuple(info
));
9955 em
.first
->second
= info
;
9956 newsnap
.seq
= snapid
;
9957 newsnap
.last_created
= snapid
;
9959 // journal the inode changes
9960 mdr
->ls
= mdlog
->get_current_segment();
9961 EUpdate
*le
= new EUpdate(mdlog
, "mksnap");
9962 mdlog
->start_entry(le
);
9964 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
9965 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
9966 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
9967 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
9969 // journal the snaprealm changes
9970 submit_mdlog_entry(le
, new C_MDS_mksnap_finish(this, mdr
, diri
, info
),
9975 void Server::_mksnap_finish(MDRequestRef
& mdr
, CInode
*diri
, SnapInfo
&info
)
9977 dout(10) << "_mksnap_finish " << *mdr
<< " " << info
<< dendl
;
9979 int op
= (diri
->snaprealm
? CEPH_SNAP_OP_CREATE
: CEPH_SNAP_OP_SPLIT
);
9981 diri
->pop_and_dirty_projected_inode(mdr
->ls
);
9984 mds
->snapclient
->commit(mdr
->more()->stid
, mdr
->ls
);
9987 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
9990 mdcache
->send_snap_update(diri
, mdr
->more()->stid
, op
);
9992 mdcache
->do_realm_invalidate_and_update_notify(diri
, op
);
9996 mdr
->snapid
= info
.snapid
;
9998 respond_to_request(mdr
, 0);
10004 struct C_MDS_rmsnap_finish
: public ServerLogContext
{
10007 C_MDS_rmsnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, snapid_t sn
) :
10008 ServerLogContext(s
, r
), diri(di
), snapid(sn
) {}
10009 void finish(int r
) override
{
10010 server
->_rmsnap_finish(mdr
, diri
, snapid
);
10014 /* This function takes responsibility for the passed mdr*/
10015 void Server::handle_client_rmsnap(MDRequestRef
& mdr
)
10017 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
10019 CInode
*diri
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
10023 if (!diri
->is_dir()) {
10024 respond_to_request(mdr
, -ENOTDIR
);
10028 std::string_view snapname
= req
->get_filepath().last_dentry();
10030 if (mdr
->client_request
->get_caller_uid() < g_conf()->mds_snap_min_uid
|| mdr
->client_request
->get_caller_uid() > g_conf()->mds_snap_max_uid
) {
10031 dout(20) << "rmsnap " << snapname
<< " on " << *diri
<< " denied to uid " << mdr
->client_request
->get_caller_uid() << dendl
;
10032 respond_to_request(mdr
, -EPERM
);
10036 dout(10) << "rmsnap " << snapname
<< " on " << *diri
<< dendl
;
10038 // does snap exist?
10039 if (snapname
.length() == 0 || snapname
[0] == '_') {
10040 respond_to_request(mdr
, -EINVAL
); // can't prune a parent snap, currently.
10043 if (!diri
->snaprealm
|| !diri
->snaprealm
->exists(snapname
)) {
10044 respond_to_request(mdr
, -ENOENT
);
10047 snapid_t snapid
= diri
->snaprealm
->resolve_snapname(snapname
, diri
->ino());
10048 dout(10) << " snapname " << snapname
<< " is " << snapid
<< dendl
;
10050 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
10051 MutationImpl::LockOpVec lov
;
10052 lov
.add_xlock(&diri
->snaplock
);
10053 if (!mds
->locker
->acquire_locks(mdr
, lov
))
10055 if (CDentry
*pdn
= diri
->get_projected_parent_dn(); pdn
) {
10056 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
))
10059 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
10062 if (!check_access(mdr
, diri
, MAY_WRITE
|MAY_SNAPSHOT
))
10066 if (!mdr
->more()->stid
) {
10067 mds
->snapclient
->prepare_destroy(diri
->ino(), snapid
,
10068 &mdr
->more()->stid
, &mdr
->more()->snapidbl
,
10069 new C_MDS_RetryRequest(mdcache
, mdr
));
10072 version_t stid
= mdr
->more()->stid
;
10073 auto p
= mdr
->more()->snapidbl
.cbegin();
10076 dout(10) << " stid is " << stid
<< ", seq is " << seq
<< dendl
;
10078 ceph_assert(mds
->snapclient
->get_cached_version() >= stid
);
10081 auto &pi
= diri
->project_inode(false, true);
10082 pi
.inode
.version
= diri
->pre_dirty();
10083 pi
.inode
.ctime
= mdr
->get_op_stamp();
10084 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
10085 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
10086 pi
.inode
.rstat
.rsnaps
--;
10088 mdr
->ls
= mdlog
->get_current_segment();
10089 EUpdate
*le
= new EUpdate(mdlog
, "rmsnap");
10090 mdlog
->start_entry(le
);
10092 // project the snaprealm
10093 auto &newnode
= *pi
.snapnode
;
10094 newnode
.snaps
.erase(snapid
);
10096 newnode
.last_destroyed
= seq
;
10098 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
10099 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
10100 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
10101 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
10103 submit_mdlog_entry(le
, new C_MDS_rmsnap_finish(this, mdr
, diri
, snapid
),
10108 void Server::_rmsnap_finish(MDRequestRef
& mdr
, CInode
*diri
, snapid_t snapid
)
10110 dout(10) << "_rmsnap_finish " << *mdr
<< " " << snapid
<< dendl
;
10111 snapid_t stid
= mdr
->more()->stid
;
10112 auto p
= mdr
->more()->snapidbl
.cbegin();
10116 diri
->pop_and_dirty_projected_inode(mdr
->ls
);
10119 mds
->snapclient
->commit(stid
, mdr
->ls
);
10121 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
10123 // notify other mds
10124 mdcache
->send_snap_update(diri
, mdr
->more()->stid
, CEPH_SNAP_OP_DESTROY
);
10126 mdcache
->do_realm_invalidate_and_update_notify(diri
, CEPH_SNAP_OP_DESTROY
);
10130 respond_to_request(mdr
, 0);
10132 // purge snapshot data
10133 if (diri
->snaprealm
->have_past_parents_open())
10134 diri
->purge_stale_snap_data(diri
->snaprealm
->get_snaps());
10137 struct C_MDS_renamesnap_finish
: public ServerLogContext
{
10140 C_MDS_renamesnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, snapid_t sn
) :
10141 ServerLogContext(s
, r
), diri(di
), snapid(sn
) {}
10142 void finish(int r
) override
{
10143 server
->_renamesnap_finish(mdr
, diri
, snapid
);
/* This function takes responsibility for the passed mdr*/
/**
 * Handle a client request to rename a snapshot of a directory.
 *
 * Flow (the statement order matters — each stage may return early and the
 * request is retried from the top once the awaited event completes):
 *   1. validate the request paths / permissions / snapshot names,
 *   2. take the snap xlock (and parent snap-layout rdlock),
 *   3. prepare the snap-table update (round-trips to the snap client),
 *   4. project the inode + snaprealm change and journal an EUpdate;
 *      C_MDS_renamesnap_finish completes the request after commit.
 *
 * Replies with an error (EINVAL/ENOTDIR/EPERM/ENOENT/EEXIST) directly for
 * invalid requests; otherwise the reply is sent from _renamesnap_finish.
 */
void Server::handle_client_renamesnap(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;
  // src (filepath2) and dst (filepath) must name snaps of the same inode
  if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
    respond_to_request(mdr, -EINVAL);
    return;
  }

  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!diri)
    return;  // not auth / not found; try_get_auth_inode handled the mdr

  if (!diri->is_dir()) { // dir only
    respond_to_request(mdr, -ENOTDIR);
    return;
  }

  // snapshot ops are restricted to a configurable uid range
  if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
      mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
    respond_to_request(mdr, -EPERM);
    return;
  }

  std::string_view dstname = req->get_filepath().last_dentry();
  std::string_view srcname = req->get_filepath2().last_dentry();
  dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;

  // names beginning with '_' refer to snaps inherited from parent realms
  if (srcname.length() == 0 || srcname[0] == '_') {
    respond_to_request(mdr, -EINVAL);   // can't rename a parent snap.
    return;
  }
  if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
    respond_to_request(mdr, -ENOENT);
    return;
  }
  if (dstname.length() == 0 || dstname[0] == '_') {
    respond_to_request(mdr, -EINVAL);
    return;
  }
  if (diri->snaprealm->exists(dstname)) {
    respond_to_request(mdr, -EEXIST);
    return;
  }

  snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
  dout(10) << " snapname " << srcname << " is " << snapid << dendl;

  // lock snap: xlock the realm's snaplock, and rdlock the parent's
  // snap layout so the realm hierarchy can't change underneath us
  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;
    lov.add_xlock(&diri->snaplock);
    if (!mds->locker->acquire_locks(mdr, lov))
      return;  // blocked; mdr will be retried
    if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
      if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
	return;
    }
    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  }

  if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
    return;

  // prepare the snap-table transaction; stid != 0 means it already completed
  // on a previous pass and we can proceed to journaling
  if (!mdr->more()->stid) {
    mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
				    &mdr->more()->stid,
				    new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }

  version_t stid = mdr->more()->stid;
  dout(10) << " stid is " << stid << dendl;

  // prepare_update's retry only fires once our cached table is new enough
  ceph_assert(mds->snapclient->get_cached_version() >= stid);

  // journal: project ctime/rctime/version on the directory inode
  auto &pi = diri->project_inode(false, true);
  pi.inode.ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
    pi.inode.rstat.rctime = mdr->get_op_stamp();
  pi.inode.version = diri->pre_dirty();

  // project the snaprealm: rename the snap in the projected snapnode
  auto &newsnap = *pi.snapnode;
  auto it = newsnap.snaps.find(snapid);
  ceph_assert(it != newsnap.snaps.end());
  it->second.name = dstname;

  // journal the inode changes
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "renamesnap");
  mdlog->start_entry(le);

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  // journal the snaprealm changes
  submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
		     mdr, __func__);
  mdlog->flush();
}
/**
 * Journal-commit completion for a snapshot rename: make the projected
 * inode/snaprealm changes live, commit the snap-table transaction,
 * propagate the update to peers and clients, and reply.
 *
 * Invoked from C_MDS_renamesnap_finish::finish once the EUpdate written
 * by handle_client_renamesnap is safely on disk.
 *
 * @param mdr    the rename request being completed
 * @param diri   directory inode whose snapshot was renamed
 * @param snapid id of the renamed snapshot
 */
void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
{
  dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;

  // apply the projected inode (and snaprealm) change
  diri->pop_and_dirty_projected_inode(mdr->ls);

  // commit the snap-table transaction prepared earlier (stid)
  mds->snapclient->commit(mdr->more()->stid, mdr->ls);

  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // notify other mds
  mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_UPDATE);

  // invalidate client snap caches for this realm
  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE);

  // reply: trace the directory inode, tagged with the renamed snapid
  mdr->tracei = diri;
  mdr->snapid = snapid;
  respond_to_request(mdr, 0);
}
10277 * Return true if server is in state RECONNECT and this
10278 * client has not yet reconnected.
10280 bool Server::waiting_for_reconnect(client_t c
) const
10282 return client_reconnect_gather
.count(c
) > 0;
/**
 * Dump the current reconnect status (the set of clients we are still
 * waiting on during MDS reconnect) into the given Formatter as an
 * object section named "reconnect_status".
 *
 * @param f formatter to write into (caller owns flushing/output)
 */
void Server::dump_reconnect_status(Formatter *f) const
{
  f->open_object_section("reconnect_status");
  f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
  f->close_section();
}