1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 */
15 #include <boost/lexical_cast.hpp>
16 #include "include/ceph_assert.h" // lexical_cast includes system assert.h
18 #include <boost/config/warning_disable.hpp>
19 #include <boost/fusion/include/std_pair.hpp>
20 #include <boost/range/adaptor/reversed.hpp>
28 #include "MDBalancer.h"
30 #include "SnapClient.h"
32 #include "MetricsHandler.h"
33 #include "cephfs_features.h"
35 #include "msg/Messenger.h"
37 #include "osdc/Objecter.h"
39 #include "events/EUpdate.h"
40 #include "events/EPeerUpdate.h"
41 #include "events/ESession.h"
42 #include "events/EOpen.h"
43 #include "events/ECommitted.h"
44 #include "events/EPurged.h"
46 #include "include/stringify.h"
47 #include "include/filepath.h"
48 #include "common/errno.h"
49 #include "common/Timer.h"
50 #include "common/perf_counters.h"
51 #include "include/compat.h"
52 #include "osd/OSDMap.h"
58 #include <string_view>
61 #include "common/config.h"
63 #define dout_context g_ceph_context
64 #define dout_subsys ceph_subsys_mds
66 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
70 class ServerContext
: public MDSContext
{
73 MDSRank
*get_mds() override
79 explicit ServerContext(Server
*s
) : server(s
) {
80 ceph_assert(server
!= NULL
);
84 class Batch_Getattr_Lookup
: public BatchOp
{
87 ceph::ref_t
<MDRequestImpl
> mdr
;
88 std::vector
<ceph::ref_t
<MDRequestImpl
>> batch_reqs
;
91 Batch_Getattr_Lookup(Server
* s
, const ceph::ref_t
<MDRequestImpl
>& r
)
93 if (mdr
->client_request
->get_op() == CEPH_MDS_OP_LOOKUP
)
94 mdr
->batch_op_map
= &mdr
->dn
[0].back()->batch_ops
;
96 mdr
->batch_op_map
= &mdr
->in
[0]->batch_ops
;
98 void add_request(const ceph::ref_t
<MDRequestImpl
>& r
) override
{
99 batch_reqs
.push_back(r
);
101 ceph::ref_t
<MDRequestImpl
> find_new_head() override
{
102 while (!batch_reqs
.empty()) {
103 auto r
= std::move(batch_reqs
.back());
104 batch_reqs
.pop_back();
108 r
->batch_op_map
= mdr
->batch_op_map
;
109 mdr
->batch_op_map
= nullptr;
115 void _forward(mds_rank_t t
) override
{
116 MDCache
* mdcache
= server
->mdcache
;
117 mdcache
->mds
->forward_message_mds(mdr
->release_client_request(), t
);
118 mdr
->set_mds_stamp(ceph_clock_now());
119 for (auto& m
: batch_reqs
) {
121 mdcache
->request_forward(m
, t
);
125 void _respond(int r
) override
{
126 mdr
->set_mds_stamp(ceph_clock_now());
127 for (auto& m
: batch_reqs
) {
129 m
->tracei
= mdr
->tracei
;
130 m
->tracedn
= mdr
->tracedn
;
131 server
->respond_to_request(m
, r
);
135 server
->reply_client_request(mdr
, make_message
<MClientReply
>(*mdr
->client_request
, r
));
137 void print(std::ostream
& o
) {
138 o
<< "[batch front=" << *mdr
<< "]";
142 class ServerLogContext
: public MDSLogContextBase
{
145 MDSRank
*get_mds() override
151 void pre_finish(int r
) override
{
153 mdr
->mark_event("journal_committed: ");
156 explicit ServerLogContext(Server
*s
) : server(s
) {
157 ceph_assert(server
!= NULL
);
159 explicit ServerLogContext(Server
*s
, MDRequestRef
& r
) : server(s
), mdr(r
) {
160 ceph_assert(server
!= NULL
);
164 void Server::create_logger()
166 PerfCountersBuilder
plb(g_ceph_context
, "mds_server", l_mdss_first
, l_mdss_last
);
168 plb
.add_u64_counter(l_mdss_handle_client_request
, "handle_client_request",
169 "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING
);
170 plb
.add_u64_counter(l_mdss_handle_peer_request
, "handle_peer_request",
171 "Peer requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING
);
172 plb
.add_u64_counter(l_mdss_handle_client_session
,
173 "handle_client_session", "Client session messages", "hcs",
174 PerfCountersBuilder::PRIO_INTERESTING
);
175 plb
.add_u64_counter(l_mdss_cap_revoke_eviction
, "cap_revoke_eviction",
176 "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING
);
177 plb
.add_u64_counter(l_mdss_cap_acquisition_throttle
,
178 "cap_acquisition_throttle", "Cap acquisition throttle counter", "cat",
179 PerfCountersBuilder::PRIO_INTERESTING
);
181 // fop latencies are useful
182 plb
.set_prio_default(PerfCountersBuilder::PRIO_USEFUL
);
183 plb
.add_time_avg(l_mdss_req_lookuphash_latency
, "req_lookuphash_latency",
184 "Request type lookup hash of inode latency");
185 plb
.add_time_avg(l_mdss_req_lookupino_latency
, "req_lookupino_latency",
186 "Request type lookup inode latency");
187 plb
.add_time_avg(l_mdss_req_lookupparent_latency
, "req_lookupparent_latency",
188 "Request type lookup parent latency");
189 plb
.add_time_avg(l_mdss_req_lookupname_latency
, "req_lookupname_latency",
190 "Request type lookup name latency");
191 plb
.add_time_avg(l_mdss_req_lookup_latency
, "req_lookup_latency",
192 "Request type lookup latency");
193 plb
.add_time_avg(l_mdss_req_lookupsnap_latency
, "req_lookupsnap_latency",
194 "Request type lookup snapshot latency");
195 plb
.add_time_avg(l_mdss_req_getattr_latency
, "req_getattr_latency",
196 "Request type get attribute latency");
197 plb
.add_time_avg(l_mdss_req_setattr_latency
, "req_setattr_latency",
198 "Request type set attribute latency");
199 plb
.add_time_avg(l_mdss_req_setlayout_latency
, "req_setlayout_latency",
200 "Request type set file layout latency");
201 plb
.add_time_avg(l_mdss_req_setdirlayout_latency
, "req_setdirlayout_latency",
202 "Request type set directory layout latency");
203 plb
.add_time_avg(l_mdss_req_setxattr_latency
, "req_setxattr_latency",
204 "Request type set extended attribute latency");
205 plb
.add_time_avg(l_mdss_req_rmxattr_latency
, "req_rmxattr_latency",
206 "Request type remove extended attribute latency");
207 plb
.add_time_avg(l_mdss_req_readdir_latency
, "req_readdir_latency",
208 "Request type read directory latency");
209 plb
.add_time_avg(l_mdss_req_setfilelock_latency
, "req_setfilelock_latency",
210 "Request type set file lock latency");
211 plb
.add_time_avg(l_mdss_req_getfilelock_latency
, "req_getfilelock_latency",
212 "Request type get file lock latency");
213 plb
.add_time_avg(l_mdss_req_create_latency
, "req_create_latency",
214 "Request type create latency");
215 plb
.add_time_avg(l_mdss_req_open_latency
, "req_open_latency",
216 "Request type open latency");
217 plb
.add_time_avg(l_mdss_req_mknod_latency
, "req_mknod_latency",
218 "Request type make node latency");
219 plb
.add_time_avg(l_mdss_req_link_latency
, "req_link_latency",
220 "Request type link latency");
221 plb
.add_time_avg(l_mdss_req_unlink_latency
, "req_unlink_latency",
222 "Request type unlink latency");
223 plb
.add_time_avg(l_mdss_req_rmdir_latency
, "req_rmdir_latency",
224 "Request type remove directory latency");
225 plb
.add_time_avg(l_mdss_req_rename_latency
, "req_rename_latency",
226 "Request type rename latency");
227 plb
.add_time_avg(l_mdss_req_mkdir_latency
, "req_mkdir_latency",
228 "Request type make directory latency");
229 plb
.add_time_avg(l_mdss_req_symlink_latency
, "req_symlink_latency",
230 "Request type symbolic link latency");
231 plb
.add_time_avg(l_mdss_req_lssnap_latency
, "req_lssnap_latency",
232 "Request type list snapshot latency");
233 plb
.add_time_avg(l_mdss_req_mksnap_latency
, "req_mksnap_latency",
234 "Request type make snapshot latency");
235 plb
.add_time_avg(l_mdss_req_rmsnap_latency
, "req_rmsnap_latency",
236 "Request type remove snapshot latency");
237 plb
.add_time_avg(l_mdss_req_renamesnap_latency
, "req_renamesnap_latency",
238 "Request type rename snapshot latency");
240 plb
.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY
);
241 plb
.add_u64_counter(l_mdss_dispatch_client_request
, "dispatch_client_request",
242 "Client requests dispatched");
243 plb
.add_u64_counter(l_mdss_dispatch_peer_request
, "dispatch_server_request",
244 "Server requests dispatched");
246 logger
= plb
.create_perf_counters();
247 g_ceph_context
->get_perfcounters_collection()->add(logger
);
250 Server::Server(MDSRank
*m
, MetricsHandler
*metrics_handler
) :
252 mdcache(mds
->mdcache
), mdlog(mds
->mdlog
),
253 recall_throttle(g_conf().get_val
<double>("mds_recall_max_decay_rate")),
254 metrics_handler(metrics_handler
)
256 forward_all_requests_to_auth
= g_conf().get_val
<bool>("mds_forward_all_requests_to_auth");
257 replay_unsafe_with_closed_session
= g_conf().get_val
<bool>("mds_replay_unsafe_with_closed_session");
258 cap_revoke_eviction_timeout
= g_conf().get_val
<double>("mds_cap_revoke_eviction_timeout");
259 max_snaps_per_dir
= g_conf().get_val
<uint64_t>("mds_max_snaps_per_dir");
260 delegate_inos_pct
= g_conf().get_val
<uint64_t>("mds_client_delegate_inos_pct");
261 max_caps_per_client
= g_conf().get_val
<uint64_t>("mds_max_caps_per_client");
262 cap_acquisition_throttle
= g_conf().get_val
<uint64_t>("mds_session_cap_acquisition_throttle");
263 max_caps_throttle_ratio
= g_conf().get_val
<double>("mds_session_max_caps_throttle_ratio");
264 caps_throttle_retry_request_timeout
= g_conf().get_val
<double>("mds_cap_acquisition_throttle_retry_request_timeout");
265 dir_max_entries
= g_conf().get_val
<uint64_t>("mds_dir_max_entries");
266 bal_fragment_size_max
= g_conf().get_val
<int64_t>("mds_bal_fragment_size_max");
267 supported_features
= feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED
);
270 void Server::dispatch(const cref_t
<Message
> &m
)
272 switch (m
->get_type()) {
273 case CEPH_MSG_CLIENT_RECONNECT
:
274 handle_client_reconnect(ref_cast
<MClientReconnect
>(m
));
279 * In reconnect phase, client sent unsafe requests to mds before reconnect msg. Setting sessionclosed_isok will handle scenarios like this:
281 1. In reconnect phase, client sent unsafe requests to mds.
282 2. It reached reconnect timeout. All sessions without sending reconnect msg in time, some of which may have sent unsafe requests, are marked as closed.
283 (Another situation is #31668, which will deny all client reconnect msg to speed up reboot).
284 3. So these unsafe requests from sessions without sending reconnect msg in time, or being denied, could be handled in clientreplay phase.
287 bool sessionclosed_isok
= replay_unsafe_with_closed_session
;
289 // handle_peer_request()/handle_client_session() will wait if necessary
290 if (m
->get_type() == CEPH_MSG_CLIENT_REQUEST
&& !mds
->is_active()) {
291 const auto &req
= ref_cast
<MClientRequest
>(m
);
292 if (mds
->is_reconnect() || mds
->get_want_state() == CEPH_MDS_STATE_RECONNECT
) {
293 Session
*session
= mds
->get_session(req
);
294 if (!session
|| (!session
->is_open() && !sessionclosed_isok
)) {
295 dout(5) << "session is closed, dropping " << req
->get_reqid() << dendl
;
298 bool queue_replay
= false;
299 if (req
->is_replay() || req
->is_async()) {
300 dout(3) << "queuing replayed op" << dendl
;
303 !session
->have_completed_request(req
->get_reqid().tid
, nullptr)) {
304 inodeno_t
ino(req
->head
.ino
);
305 mdcache
->add_replay_ino_alloc(ino
);
306 if (replay_unsafe_with_closed_session
&&
307 session
->free_prealloc_inos
.contains(ino
)) {
308 // don't purge inodes that will be created by later replay
309 session
->free_prealloc_inos
.erase(ino
);
310 session
->delegated_inos
.insert(ino
);
313 } else if (req
->get_retry_attempt()) {
314 // process completed request in clientreplay stage. The completed request
315 // might have created a new file/directory. This guarantees MDS sends a reply
316 // to client before another request modifies the new file/directory.
317 if (session
->have_completed_request(req
->get_reqid().tid
, NULL
)) {
318 dout(3) << "queuing completed op" << dendl
;
321 // this request was created before the cap reconnect message, drop any embedded
323 req
->releases
.clear();
326 req
->mark_queued_for_replay();
327 mds
->enqueue_replay(new C_MDS_RetryMessage(mds
, m
));
332 bool wait_for_active
= true;
333 if (mds
->is_stopping()) {
334 wait_for_active
= false;
335 } else if (mds
->is_clientreplay()) {
336 if (req
->is_queued_for_replay()) {
337 wait_for_active
= false;
340 if (wait_for_active
) {
341 dout(3) << "not active yet, waiting" << dendl
;
342 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
347 switch (m
->get_type()) {
348 case CEPH_MSG_CLIENT_SESSION
:
349 handle_client_session(ref_cast
<MClientSession
>(m
));
351 case CEPH_MSG_CLIENT_REQUEST
:
352 handle_client_request(ref_cast
<MClientRequest
>(m
));
354 case CEPH_MSG_CLIENT_RECLAIM
:
355 handle_client_reclaim(ref_cast
<MClientReclaim
>(m
));
357 case MSG_MDS_PEER_REQUEST
:
358 handle_peer_request(ref_cast
<MMDSPeerRequest
>(m
));
361 derr
<< "server unknown message " << m
->get_type() << dendl
;
362 ceph_abort_msg("server unknown message");
368 // ----------------------------------------------------------
369 // SESSION management
371 class C_MDS_session_finish
: public ServerLogContext
{
376 interval_set
<inodeno_t
> inos_to_free
;
378 interval_set
<inodeno_t
> inos_to_purge
;
379 LogSegment
*ls
= nullptr;
382 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
, Context
*fin_
= nullptr) :
383 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
), inotablev(0), fin(fin_
) { }
384 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
,
385 const interval_set
<inodeno_t
>& to_free
, version_t iv
,
386 const interval_set
<inodeno_t
>& to_purge
, LogSegment
*_ls
, Context
*fin_
= nullptr) :
387 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
),
388 inos_to_free(to_free
), inotablev(iv
), inos_to_purge(to_purge
), ls(_ls
), fin(fin_
) {}
389 void finish(int r
) override
{
391 server
->_session_logged(session
, state_seq
, open
, cmapv
, inos_to_free
, inotablev
, inos_to_purge
, ls
);
398 Session
* Server::find_session_by_uuid(std::string_view uuid
)
400 Session
* session
= nullptr;
401 for (auto& it
: mds
->sessionmap
.get_sessions()) {
402 auto& metadata
= it
.second
->info
.client_metadata
;
404 auto p
= metadata
.find("uuid");
405 if (p
== metadata
.end() || p
->second
!= uuid
)
410 } else if (!session
->reclaiming_from
) {
411 ceph_assert(it
.second
->reclaiming_from
== session
);
414 ceph_assert(session
->reclaiming_from
== it
.second
);
420 void Server::reclaim_session(Session
*session
, const cref_t
<MClientReclaim
> &m
)
422 if (!session
->is_open() && !session
->is_stale()) {
423 dout(10) << "session not open, dropping this req" << dendl
;
427 auto reply
= make_message
<MClientReclaimReply
>(0);
428 if (m
->get_uuid().empty()) {
429 dout(10) << __func__
<< " invalid message (no uuid)" << dendl
;
430 reply
->set_result(-CEPHFS_EINVAL
);
431 mds
->send_message_client(reply
, session
);
435 unsigned flags
= m
->get_flags();
436 if (flags
!= CEPH_RECLAIM_RESET
) { // currently only support reset
437 dout(10) << __func__
<< " unsupported flags" << dendl
;
438 reply
->set_result(-CEPHFS_EOPNOTSUPP
);
439 mds
->send_message_client(reply
, session
);
443 Session
* target
= find_session_by_uuid(m
->get_uuid());
445 if (session
->info
.auth_name
!= target
->info
.auth_name
) {
446 dout(10) << __func__
<< " session auth_name " << session
->info
.auth_name
447 << " != target auth_name " << target
->info
.auth_name
<< dendl
;
448 reply
->set_result(-CEPHFS_EPERM
);
449 mds
->send_message_client(reply
, session
);
452 ceph_assert(!target
->reclaiming_from
);
453 ceph_assert(!session
->reclaiming_from
);
454 session
->reclaiming_from
= target
;
455 reply
->set_addrs(entity_addrvec_t(target
->info
.inst
.addr
));
458 if (flags
& CEPH_RECLAIM_RESET
) {
459 finish_reclaim_session(session
, reply
);
466 void Server::finish_reclaim_session(Session
*session
, const ref_t
<MClientReclaimReply
> &reply
)
468 Session
*target
= session
->reclaiming_from
;
470 session
->reclaiming_from
= nullptr;
474 int64_t session_id
= session
->get_client().v
;
475 send_reply
= new LambdaContext([this, session_id
, reply
](int r
) {
476 ceph_assert(ceph_mutex_is_locked_by_me(mds
->mds_lock
));
477 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(session_id
));
481 auto epoch
= mds
->objecter
->with_osdmap([](const OSDMap
&map
){ return map
.get_epoch(); });
482 reply
->set_epoch(epoch
);
483 mds
->send_message_client(reply
, session
);
486 send_reply
= nullptr;
489 bool blocklisted
= mds
->objecter
->with_osdmap([target
](const OSDMap
&map
) {
490 return map
.is_blocklisted(target
->info
.inst
.addr
);
493 if (blocklisted
|| !g_conf()->mds_session_blocklist_on_evict
) {
494 kill_session(target
, send_reply
);
496 CachedStackStringStream css
;
497 mds
->evict_client(target
->get_client().v
, false, true, *css
, send_reply
);
500 mds
->send_message_client(reply
, session
);
504 void Server::handle_client_reclaim(const cref_t
<MClientReclaim
> &m
)
506 Session
*session
= mds
->get_session(m
);
507 dout(3) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
508 ceph_assert(m
->get_source().is_client()); // should _not_ come from an mds!
511 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
515 std::string_view fs_name
= mds
->mdsmap
->get_fs_name();
516 if (!fs_name
.empty() && !session
->fs_name_capable(fs_name
, MAY_READ
)) {
517 dout(0) << " dropping message not allowed for this fs_name: " << *m
<< dendl
;
521 if (mds
->get_state() < MDSMap::STATE_CLIENTREPLAY
) {
522 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
526 if (m
->get_flags() & MClientReclaim::FLAG_FINISH
) {
527 finish_reclaim_session(session
);
529 reclaim_session(session
, m
);
533 void Server::handle_client_session(const cref_t
<MClientSession
> &m
)
536 Session
*session
= mds
->get_session(m
);
538 dout(3) << "handle_client_session " << *m
<< " from " << m
->get_source() << dendl
;
539 ceph_assert(m
->get_source().is_client()); // should _not_ come from an mds!
542 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
543 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
544 reply
->metadata
["error_string"] = "sessionless";
545 mds
->send_message(reply
, m
->get_connection());
549 std::string_view fs_name
= mds
->mdsmap
->get_fs_name();
550 if (!fs_name
.empty() && !session
->fs_name_capable(fs_name
, MAY_READ
)) {
551 dout(0) << " dropping message not allowed for this fs_name: " << *m
<< dendl
;
552 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
553 reply
->metadata
["error_string"] = "client doesn't have caps for FS \"" +
554 std::string(fs_name
) + "\"";
555 mds
->send_message(std::move(reply
), m
->get_connection());
559 if (m
->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS
) {
560 // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
561 } else if (m
->get_op() == CEPH_SESSION_REQUEST_CLOSE
) {
562 // close requests need to be handled when mds is active
563 if (mds
->get_state() < MDSMap::STATE_ACTIVE
) {
564 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
568 if (mds
->get_state() < MDSMap::STATE_CLIENTREPLAY
) {
569 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
575 logger
->inc(l_mdss_handle_client_session
);
578 switch (m
->get_op()) {
579 case CEPH_SESSION_REQUEST_OPEN
:
580 if (session
->is_opening() ||
581 session
->is_open() ||
582 session
->is_stale() ||
583 session
->is_killing() ||
584 terminating_sessions
) {
585 dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl
;
588 ceph_assert(session
->is_closed() || session
->is_closing());
590 if (mds
->is_stopping()) {
591 dout(10) << "mds is stopping, dropping open req" << dendl
;
596 auto& addr
= session
->info
.inst
.addr
;
597 session
->set_client_metadata(client_metadata_t(m
->metadata
, m
->supported_features
, m
->metric_spec
));
598 auto& client_metadata
= session
->info
.client_metadata
;
600 auto log_session_status
= [this, m
, session
](std::string_view status
, std::string_view err
) {
601 auto now
= ceph_clock_now();
602 auto throttle_elapsed
= m
->get_recv_complete_stamp() - m
->get_throttle_stamp();
603 auto elapsed
= now
- m
->get_recv_stamp();
604 CachedStackStringStream css
;
605 *css
<< "New client session:"
606 << " addr=\"" << session
->info
.inst
.addr
<< "\""
607 << ",elapsed=" << elapsed
608 << ",throttled=" << throttle_elapsed
609 << ",status=\"" << status
<< "\"";
611 *css
<< ",error=\"" << err
<< "\"";
613 const auto& metadata
= session
->info
.client_metadata
;
614 if (auto it
= metadata
.find("root"); it
!= metadata
.end()) {
615 *css
<< ",root=\"" << it
->second
<< "\"";
617 dout(2) << css
->strv() << dendl
;
620 auto send_reject_message
= [this, &session
, &log_session_status
](std::string_view err_str
, unsigned flags
=0) {
621 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
, 0, flags
);
622 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
))
623 m
->metadata
["error_string"] = err_str
;
624 mds
->send_message_client(m
, session
);
625 log_session_status("REJECTED", err_str
);
628 bool blocklisted
= mds
->objecter
->with_osdmap(
629 [&addr
](const OSDMap
&osd_map
) -> bool {
630 return osd_map
.is_blocklisted(addr
);
634 dout(10) << "rejecting blocklisted client " << addr
<< dendl
;
635 // This goes on the wire and the "blacklisted" substring is
636 // depended upon by the kernel client for detecting whether it
637 // has been blocklisted. If mounted with recover_session=clean
638 // (since 5.4), it tries to automatically recover itself from
641 flags
|= MClientSession::SESSION_BLOCKLISTED
;
642 send_reject_message("blocklisted (blacklisted)", flags
);
647 if (client_metadata
.features
.empty())
648 infer_supported_features(session
, client_metadata
);
650 dout(20) << __func__
<< " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl
;
651 dout(20) << " features: '" << client_metadata
.features
<< "'" << dendl
;
652 dout(20) << " metric specification: [" << client_metadata
.metric_spec
<< "]" << dendl
;
653 for (const auto& p
: client_metadata
) {
654 dout(20) << " " << p
.first
<< ": " << p
.second
<< dendl
;
657 feature_bitset_t missing_features
= required_client_features
;
658 missing_features
-= client_metadata
.features
;
659 if (!missing_features
.empty()) {
660 CachedStackStringStream css
;
661 *css
<< "missing required features '" << missing_features
<< "'";
662 send_reject_message(css
->strv());
663 mds
->clog
->warn() << "client session (" << session
->info
.inst
664 << ") lacks required features " << missing_features
665 << "; client supports " << client_metadata
.features
;
670 // Special case for the 'root' metadata path; validate that the claimed
671 // root is actually within the caps of the session
672 if (auto it
= client_metadata
.find("root"); it
!= client_metadata
.end()) {
673 auto claimed_root
= it
->second
;
674 CachedStackStringStream css
;
676 // claimed_root has a leading "/" which we strip before passing
678 if (claimed_root
.empty() || claimed_root
[0] != '/') {
680 *css
<< "invalue root '" << claimed_root
<< "'";
681 } else if (!session
->auth_caps
.path_capable(claimed_root
.substr(1))) {
683 *css
<< "non-allowable root '" << claimed_root
<< "'";
687 // Tell the client we're rejecting their open
688 send_reject_message(css
->strv());
689 mds
->clog
->warn() << "client session with " << css
->strv()
690 << " denied (" << session
->info
.inst
<< ")";
696 if (auto it
= client_metadata
.find("uuid"); it
!= client_metadata
.end()) {
697 if (find_session_by_uuid(it
->second
)) {
698 send_reject_message("duplicated session uuid");
699 mds
->clog
->warn() << "client session with duplicated session uuid '"
700 << it
->second
<< "' denied (" << session
->info
.inst
<< ")";
706 if (session
->is_closed()) {
707 mds
->sessionmap
.add_session(session
);
710 pv
= mds
->sessionmap
.mark_projected(session
);
711 sseq
= mds
->sessionmap
.set_state(session
, Session::STATE_OPENING
);
712 mds
->sessionmap
.touch_session(session
);
713 auto fin
= new LambdaContext([log_session_status
= std::move(log_session_status
)](int r
){
715 log_session_status("ACCEPTED", "");
717 mdlog
->start_submit_entry(new ESession(m
->get_source_inst(), true, pv
, client_metadata
),
718 new C_MDS_session_finish(this, session
, sseq
, true, pv
, fin
));
723 case CEPH_SESSION_REQUEST_RENEWCAPS
:
724 if (session
->is_open() || session
->is_stale()) {
725 mds
->sessionmap
.touch_session(session
);
726 if (session
->is_stale()) {
727 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
728 mds
->locker
->resume_stale_caps(session
);
729 mds
->sessionmap
.touch_session(session
);
731 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_RENEWCAPS
, m
->get_seq());
732 mds
->send_message_client(reply
, session
);
734 dout(10) << "ignoring renewcaps on non open|stale session (" << session
->get_state_name() << ")" << dendl
;
738 case CEPH_SESSION_REQUEST_CLOSE
:
740 if (session
->is_closed() ||
741 session
->is_closing() ||
742 session
->is_killing()) {
743 dout(10) << "already closed|closing|killing, dropping this req" << dendl
;
746 if (session
->is_importing()) {
747 dout(10) << "ignoring close req on importing session" << dendl
;
750 ceph_assert(session
->is_open() ||
751 session
->is_stale() ||
752 session
->is_opening());
753 if (m
->get_seq() < session
->get_push_seq()) {
754 dout(10) << "old push seq " << m
->get_seq() << " < " << session
->get_push_seq()
755 << ", dropping" << dendl
;
758 // We are getting a seq that is higher than expected.
759 // Handle the same as any other seqn error.
761 if (m
->get_seq() != session
->get_push_seq()) {
762 dout(0) << "old push seq " << m
->get_seq() << " != " << session
->get_push_seq()
763 << ", BUGGY!" << dendl
;
764 mds
->clog
->warn() << "incorrect push seq " << m
->get_seq() << " != "
765 << session
->get_push_seq() << ", dropping" << " from client : " << session
->get_human_name();
768 journal_close_session(session
, Session::STATE_CLOSING
, NULL
);
772 case CEPH_SESSION_FLUSHMSG_ACK
:
773 finish_flush_session(session
, m
->get_seq());
776 case CEPH_SESSION_REQUEST_FLUSH_MDLOG
:
777 if (mds
->is_active())
786 void Server::flush_session(Session
*session
, MDSGatherBuilder
& gather
) {
787 if (!session
->is_open() ||
788 !session
->get_connection() ||
789 !session
->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER
)) {
793 version_t seq
= session
->wait_for_flush(gather
.new_sub());
794 mds
->send_message_client(
795 make_message
<MClientSession
>(CEPH_SESSION_FLUSHMSG
, seq
), session
);
798 void Server::flush_client_sessions(set
<client_t
>& client_set
, MDSGatherBuilder
& gather
)
800 for (const auto& client
: client_set
) {
801 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(client
.v
));
802 ceph_assert(session
);
803 flush_session(session
, gather
);
807 void Server::finish_flush_session(Session
*session
, version_t seq
)
809 MDSContext::vec finished
;
810 session
->finish_flush(seq
, finished
);
811 mds
->queue_waiters(finished
);
814 void Server::_session_logged(Session
*session
, uint64_t state_seq
, bool open
, version_t pv
,
815 const interval_set
<inodeno_t
>& inos_to_free
, version_t piv
,
816 const interval_set
<inodeno_t
>& inos_to_purge
, LogSegment
*ls
)
818 dout(10) << "_session_logged " << session
->info
.inst
819 << " state_seq " << state_seq
820 << " " << (open
? "open":"close") << " " << pv
821 << " inos_to_free " << inos_to_free
<< " inotablev " << piv
822 << " inos_to_purge " << inos_to_purge
<< dendl
;
825 if (inos_to_purge
.size()){
827 session
->info
.prealloc_inos
.subtract(inos_to_purge
);
828 ls
->purging_inodes
.insert(inos_to_purge
);
829 if (mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping())
830 mdcache
->purge_inodes(inos_to_purge
, ls
);
833 if (inos_to_free
.size()) {
835 ceph_assert(session
->is_closing() || session
->is_killing() ||
836 session
->is_opening()); // re-open closing session
837 session
->info
.prealloc_inos
.subtract(inos_to_free
);
838 mds
->inotable
->apply_release_ids(inos_to_free
);
839 ceph_assert(mds
->inotable
->get_version() == piv
);
841 session
->free_prealloc_inos
= session
->info
.prealloc_inos
;
842 session
->delegated_inos
.clear();
845 mds
->sessionmap
.mark_dirty(session
);
848 if (session
->get_state_seq() != state_seq
) {
849 dout(10) << " journaled state_seq " << state_seq
<< " != current " << session
->get_state_seq()
850 << ", noop" << dendl
;
851 // close must have been canceled (by an import?), or any number of other things..
853 ceph_assert(session
->is_opening());
854 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
855 mds
->sessionmap
.touch_session(session
);
856 metrics_handler
->add_session(session
);
857 ceph_assert(session
->get_connection());
858 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_OPEN
);
859 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
))
860 reply
->supported_features
= supported_features
;
861 mds
->send_message_client(reply
, session
);
862 if (mdcache
->is_readonly()) {
863 auto m
= make_message
<MClientSession
>(CEPH_SESSION_FORCE_RO
);
864 mds
->send_message_client(m
, session
);
866 } else if (session
->is_closing() ||
867 session
->is_killing()) {
868 // kill any lingering capabilities, leases, requests
869 bool killing
= session
->is_killing();
870 while (!session
->caps
.empty()) {
871 Capability
*cap
= session
->caps
.front();
872 CInode
*in
= cap
->get_inode();
873 dout(20) << " killing capability " << ccap_string(cap
->issued()) << " on " << *in
<< dendl
;
874 mds
->locker
->remove_client_cap(in
, cap
, killing
);
876 while (!session
->leases
.empty()) {
877 ClientLease
*r
= session
->leases
.front();
878 CDentry
*dn
= static_cast<CDentry
*>(r
->parent
);
879 dout(20) << " killing client lease of " << *dn
<< dendl
;
880 dn
->remove_client_lease(r
, mds
->locker
);
882 if (client_reconnect_gather
.erase(session
->info
.get_client())) {
883 dout(20) << " removing client from reconnect set" << dendl
;
884 if (client_reconnect_gather
.empty()) {
885 dout(7) << " client " << session
->info
.inst
<< " was last reconnect, finishing" << dendl
;
886 reconnect_gather_finish();
889 if (client_reclaim_gather
.erase(session
->info
.get_client())) {
890 dout(20) << " removing client from reclaim set" << dendl
;
891 if (client_reclaim_gather
.empty()) {
892 dout(7) << " client " << session
->info
.inst
<< " was last reclaimed, finishing" << dendl
;
893 mds
->maybe_clientreplay_done();
897 if (session
->is_closing()) {
898 // mark con disposable. if there is a fault, we will get a
899 // reset and clean it up. if the client hasn't received the
900 // CLOSE message yet, they will reconnect and get an
901 // ms_handle_remote_reset() and realize they had in fact closed.
902 // do this *before* sending the message to avoid a possible
904 if (session
->get_connection()) {
905 // Conditional because terminate_sessions will indiscriminately
906 // put sessions in CLOSING whether they ever had a conn or not.
907 session
->get_connection()->mark_disposable();
911 mds
->send_message_client(make_message
<MClientSession
>(CEPH_SESSION_CLOSE
), session
);
912 mds
->sessionmap
.set_state(session
, Session::STATE_CLOSED
);
914 metrics_handler
->remove_session(session
);
915 mds
->sessionmap
.remove_session(session
);
916 } else if (session
->is_killing()) {
917 // destroy session, close connection
918 if (session
->get_connection()) {
919 session
->get_connection()->mark_down();
920 mds
->sessionmap
.set_state(session
, Session::STATE_CLOSED
);
921 session
->set_connection(nullptr);
923 metrics_handler
->remove_session(session
);
924 mds
->sessionmap
.remove_session(session
);
934 * Inject sessions from some source other than actual connections.
937 * - sessions inferred from journal replay
938 * - sessions learned from other MDSs during rejoin
939 * - sessions learned from other MDSs during dir/caps migration
940 * - sessions learned from other MDSs during a cross-MDS rename
942 version_t
Server::prepare_force_open_sessions(map
<client_t
,entity_inst_t
>& cm
,
943 map
<client_t
,client_metadata_t
>& cmm
,
944 map
<client_t
, pair
<Session
*,uint64_t> >& smap
)
946 version_t pv
= mds
->sessionmap
.get_projected();
948 dout(10) << "prepare_force_open_sessions " << pv
949 << " on " << cm
.size() << " clients"
952 mds
->objecter
->with_osdmap(
953 [this, &cm
, &cmm
](const OSDMap
&osd_map
) {
954 for (auto p
= cm
.begin(); p
!= cm
.end(); ) {
955 if (osd_map
.is_blocklisted(p
->second
.addr
)) {
956 dout(10) << " ignoring blocklisted client." << p
->first
957 << " (" << p
->second
.addr
<< ")" << dendl
;
966 for (map
<client_t
,entity_inst_t
>::iterator p
= cm
.begin(); p
!= cm
.end(); ++p
) {
967 Session
*session
= mds
->sessionmap
.get_or_add_session(p
->second
);
968 pv
= mds
->sessionmap
.mark_projected(session
);
970 if (session
->is_closed() ||
971 session
->is_closing() ||
972 session
->is_killing()) {
973 sseq
= mds
->sessionmap
.set_state(session
, Session::STATE_OPENING
);
974 auto q
= cmm
.find(p
->first
);
976 session
->info
.client_metadata
.merge(q
->second
);
978 ceph_assert(session
->is_open() ||
979 session
->is_opening() ||
980 session
->is_stale());
983 smap
[p
->first
] = make_pair(session
, sseq
);
984 session
->inc_importing();
989 void Server::finish_force_open_sessions(const map
<client_t
,pair
<Session
*,uint64_t> >& smap
,
993 * FIXME: need to carefully consider the race conditions between a
994 * client trying to close a session and an MDS doing an import
995 * trying to force open a session...
997 dout(10) << "finish_force_open_sessions on " << smap
.size() << " clients,"
998 << " initial v " << mds
->sessionmap
.get_version() << dendl
;
1000 for (auto &it
: smap
) {
1001 Session
*session
= it
.second
.first
;
1002 uint64_t sseq
= it
.second
.second
;
1004 if (session
->get_state_seq() != sseq
) {
1005 dout(10) << "force_open_sessions skipping changed " << session
->info
.inst
<< dendl
;
1007 dout(10) << "force_open_sessions opened " << session
->info
.inst
<< dendl
;
1008 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
1009 mds
->sessionmap
.touch_session(session
);
1010 metrics_handler
->add_session(session
);
1012 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_OPEN
);
1013 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
))
1014 reply
->supported_features
= supported_features
;
1015 mds
->send_message_client(reply
, session
);
1017 if (mdcache
->is_readonly())
1018 mds
->send_message_client(make_message
<MClientSession
>(CEPH_SESSION_FORCE_RO
), session
);
1021 dout(10) << "force_open_sessions skipping already-open " << session
->info
.inst
<< dendl
;
1022 ceph_assert(session
->is_open() || session
->is_stale());
1026 session
->dec_importing();
1029 mds
->sessionmap
.mark_dirty(session
);
1032 dout(10) << __func__
<< ": final v " << mds
->sessionmap
.get_version() << dendl
;
1035 class C_MDS_TerminatedSessions
: public ServerContext
{
1036 void finish(int r
) override
{
1037 server
->terminating_sessions
= false;
1040 explicit C_MDS_TerminatedSessions(Server
*s
) : ServerContext(s
) {}
1043 void Server::terminate_sessions()
1045 dout(5) << "terminating all sessions..." << dendl
;
1047 terminating_sessions
= true;
1049 // kill them off. clients will retry etc.
1050 set
<Session
*> sessions
;
1051 mds
->sessionmap
.get_client_session_set(sessions
);
1052 for (set
<Session
*>::const_iterator p
= sessions
.begin();
1053 p
!= sessions
.end();
1055 Session
*session
= *p
;
1056 if (session
->is_closing() ||
1057 session
->is_killing() ||
1058 session
->is_closed())
1060 journal_close_session(session
, Session::STATE_CLOSING
, NULL
);
1063 mdlog
->wait_for_safe(new C_MDS_TerminatedSessions(this));
1067 void Server::find_idle_sessions()
1069 auto now
= clock::now();
1070 auto last_cleared_laggy
= mds
->last_cleared_laggy();
1072 dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy
<< "s ago" << dendl
;
1075 // (caps go stale, lease die)
1076 double queue_max_age
= mds
->get_dispatch_queue_max_age(ceph_clock_now());
1077 double cutoff
= queue_max_age
+ mds
->mdsmap
->get_session_timeout();
1079 // don't kick clients if we've been laggy
1080 if (last_cleared_laggy
< cutoff
) {
1081 dout(10) << " last cleared laggy " << last_cleared_laggy
<< "s ago (< cutoff " << cutoff
1082 << "), not marking any client stale" << dendl
;
1086 std::vector
<Session
*> to_evict
;
1088 bool defer_session_stale
= g_conf().get_val
<bool>("mds_defer_session_stale");
1089 const auto sessions_p1
= mds
->sessionmap
.by_state
.find(Session::STATE_OPEN
);
1090 if (sessions_p1
!= mds
->sessionmap
.by_state
.end() && !sessions_p1
->second
->empty()) {
1091 std::vector
<Session
*> new_stale
;
1093 for (auto session
: *(sessions_p1
->second
)) {
1094 auto last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_cap_renew
).count();
1095 if (last_cap_renew_span
< cutoff
) {
1096 dout(20) << "laggiest active session is " << session
->info
.inst
1097 << " and renewed caps recently (" << last_cap_renew_span
<< "s ago)" << dendl
;
1101 if (session
->last_seen
> session
->last_cap_renew
) {
1102 last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_seen
).count();
1103 if (last_cap_renew_span
< cutoff
) {
1104 dout(20) << "laggiest active session is " << session
->info
.inst
1105 << " and renewed caps recently (" << last_cap_renew_span
<< "s ago)" << dendl
;
1110 if (last_cap_renew_span
>= mds
->mdsmap
->get_session_autoclose()) {
1111 dout(20) << "evicting session " << session
->info
.inst
<< " since autoclose "
1112 "has arrived" << dendl
;
1113 // evict session without marking it stale
1114 to_evict
.push_back(session
);
1118 if (defer_session_stale
&&
1119 !session
->is_any_flush_waiter() &&
1120 !mds
->locker
->is_revoking_any_caps_from(session
->get_client())) {
1121 dout(20) << "deferring marking session " << session
->info
.inst
<< " stale "
1122 "since it holds no caps" << dendl
;
1126 auto it
= session
->info
.client_metadata
.find("timeout");
1127 if (it
!= session
->info
.client_metadata
.end()) {
1128 unsigned timeout
= strtoul(it
->second
.c_str(), nullptr, 0);
1130 dout(10) << "skipping session " << session
->info
.inst
1131 << ", infinite timeout specified" << dendl
;
1134 double cutoff
= queue_max_age
+ timeout
;
1135 if (last_cap_renew_span
< cutoff
) {
1136 dout(10) << "skipping session " << session
->info
.inst
1137 << ", timeout (" << timeout
<< ") specified"
1138 << " and renewed caps recently (" << last_cap_renew_span
<< "s ago)" << dendl
;
1142 // do not go through stale, evict it directly.
1143 to_evict
.push_back(session
);
1145 dout(10) << "new stale session " << session
->info
.inst
1146 << " last renewed caps " << last_cap_renew_span
<< "s ago" << dendl
;
1147 new_stale
.push_back(session
);
1151 for (auto session
: new_stale
) {
1152 mds
->sessionmap
.set_state(session
, Session::STATE_STALE
);
1153 if (mds
->locker
->revoke_stale_caps(session
)) {
1154 mds
->locker
->remove_stale_leases(session
);
1155 finish_flush_session(session
, session
->get_push_seq());
1156 auto m
= make_message
<MClientSession
>(CEPH_SESSION_STALE
, session
->get_push_seq());
1157 mds
->send_message_client(m
, session
);
1159 to_evict
.push_back(session
);
1165 cutoff
= queue_max_age
+ mds
->mdsmap
->get_session_autoclose();
1167 // Collect a list of sessions exceeding the autoclose threshold
1168 const auto sessions_p2
= mds
->sessionmap
.by_state
.find(Session::STATE_STALE
);
1169 if (sessions_p2
!= mds
->sessionmap
.by_state
.end() && !sessions_p2
->second
->empty()) {
1170 for (auto session
: *(sessions_p2
->second
)) {
1171 ceph_assert(session
->is_stale());
1172 auto last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_cap_renew
).count();
1173 if (last_cap_renew_span
< cutoff
) {
1174 dout(20) << "oldest stale session is " << session
->info
.inst
1175 << " and recently renewed caps " << last_cap_renew_span
<< "s ago" << dendl
;
1178 to_evict
.push_back(session
);
1182 for (auto session
: to_evict
) {
1183 if (session
->is_importing()) {
1184 dout(10) << "skipping session " << session
->info
.inst
<< ", it's being imported" << dendl
;
1188 auto last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_cap_renew
).count();
1189 mds
->clog
->warn() << "evicting unresponsive client " << *session
1190 << ", after " << last_cap_renew_span
<< " seconds";
1191 dout(10) << "autoclosing stale session " << session
->info
.inst
1192 << " last renewed caps " << last_cap_renew_span
<< "s ago" << dendl
;
1194 if (g_conf()->mds_session_blocklist_on_timeout
) {
1195 CachedStackStringStream css
;
1196 mds
->evict_client(session
->get_client().v
, false, true, *css
, nullptr);
1198 kill_session(session
, NULL
);
1203 void Server::evict_cap_revoke_non_responders() {
1204 if (!cap_revoke_eviction_timeout
) {
1208 auto&& to_evict
= mds
->locker
->get_late_revoking_clients(cap_revoke_eviction_timeout
);
1210 for (auto const &client
: to_evict
) {
1211 mds
->clog
->warn() << "client id " << client
<< " has not responded to"
1212 << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
1213 << " seconds, evicting";
1214 dout(1) << __func__
<< ": evicting cap revoke non-responder client id "
1217 CachedStackStringStream css
;
1218 bool evicted
= mds
->evict_client(client
.v
, false,
1219 g_conf()->mds_session_blocklist_on_evict
,
1221 if (evicted
&& logger
) {
1222 logger
->inc(l_mdss_cap_revoke_eviction
);
1227 void Server::handle_conf_change(const std::set
<std::string
>& changed
) {
1228 if (changed
.count("mds_forward_all_requests_to_auth")){
1229 forward_all_requests_to_auth
= g_conf().get_val
<bool>("mds_forward_all_requests_to_auth");
1231 if (changed
.count("mds_cap_revoke_eviction_timeout")) {
1232 cap_revoke_eviction_timeout
= g_conf().get_val
<double>("mds_cap_revoke_eviction_timeout");
1233 dout(20) << __func__
<< " cap revoke eviction timeout changed to "
1234 << cap_revoke_eviction_timeout
<< dendl
;
1236 if (changed
.count("mds_recall_max_decay_rate")) {
1237 recall_throttle
= DecayCounter(g_conf().get_val
<double>("mds_recall_max_decay_rate"));
1239 if (changed
.count("mds_max_snaps_per_dir")) {
1240 max_snaps_per_dir
= g_conf().get_val
<uint64_t>("mds_max_snaps_per_dir");
1241 dout(20) << __func__
<< " max snapshots per directory changed to "
1242 << max_snaps_per_dir
<< dendl
;
1244 if (changed
.count("mds_client_delegate_inos_pct")) {
1245 delegate_inos_pct
= g_conf().get_val
<uint64_t>("mds_client_delegate_inos_pct");
1247 if (changed
.count("mds_max_caps_per_client")) {
1248 max_caps_per_client
= g_conf().get_val
<uint64_t>("mds_max_caps_per_client");
1250 if (changed
.count("mds_session_cap_acquisition_throttle")) {
1251 cap_acquisition_throttle
= g_conf().get_val
<uint64_t>("mds_session_cap_acquisition_throttle");
1253 if (changed
.count("mds_session_max_caps_throttle_ratio")) {
1254 max_caps_throttle_ratio
= g_conf().get_val
<double>("mds_session_max_caps_throttle_ratio");
1256 if (changed
.count("mds_cap_acquisition_throttle_retry_request_timeout")) {
1257 caps_throttle_retry_request_timeout
= g_conf().get_val
<double>("mds_cap_acquisition_throttle_retry_request_timeout");
1259 if (changed
.count("mds_alternate_name_max")) {
1260 alternate_name_max
= g_conf().get_val
<Option::size_t>("mds_alternate_name_max");
1262 if (changed
.count("mds_dir_max_entries")) {
1263 dir_max_entries
= g_conf().get_val
<uint64_t>("mds_dir_max_entries");
1264 dout(20) << __func__
<< " max entries per directory changed to "
1265 << dir_max_entries
<< dendl
;
1267 if (changed
.count("mds_bal_fragment_size_max")) {
1268 bal_fragment_size_max
= g_conf().get_val
<int64_t>("mds_bal_fragment_size_max");
1269 dout(20) << __func__
<< " max fragment size changed to "
1270 << bal_fragment_size_max
<< dendl
;
1275 * XXX bump in the interface here, not using an MDSContext here
1276 * because all the callers right now happen to use a SaferCond
1278 void Server::kill_session(Session
*session
, Context
*on_safe
)
1280 ceph_assert(ceph_mutex_is_locked_by_me(mds
->mds_lock
));
1282 if ((session
->is_opening() ||
1283 session
->is_open() ||
1284 session
->is_stale()) &&
1285 !session
->is_importing()) {
1286 dout(10) << "kill_session " << session
<< dendl
;
1287 journal_close_session(session
, Session::STATE_KILLING
, on_safe
);
1289 dout(10) << "kill_session importing or already closing/killing " << session
<< dendl
;
1290 if (session
->is_closing() ||
1291 session
->is_killing()) {
1293 mdlog
->wait_for_safe(new MDSInternalContextWrapper(mds
, on_safe
));
1295 ceph_assert(session
->is_closed() ||
1296 session
->is_importing());
1298 on_safe
->complete(0);
1303 size_t Server::apply_blocklist(const std::set
<entity_addr_t
> &blocklist
)
1305 bool prenautilus
= mds
->objecter
->with_osdmap(
1306 [&](const OSDMap
& o
) {
1307 return o
.require_osd_release
< ceph_release_t::nautilus
;
1310 std::vector
<Session
*> victims
;
1311 const auto& sessions
= mds
->sessionmap
.get_sessions();
1312 for (const auto& p
: sessions
) {
1313 if (!p
.first
.is_client()) {
1314 // Do not apply OSDMap blocklist to MDS daemons, we find out
1315 // about their death via MDSMap.
1319 Session
*s
= p
.second
;
1320 auto inst_addr
= s
->info
.inst
.addr
;
1321 // blocklist entries are always TYPE_ANY for nautilus+
1322 inst_addr
.set_type(entity_addr_t::TYPE_ANY
);
1323 if (blocklist
.count(inst_addr
)) {
1324 victims
.push_back(s
);
1328 // ...except pre-nautilus, they were TYPE_LEGACY
1329 inst_addr
.set_type(entity_addr_t::TYPE_LEGACY
);
1330 if (blocklist
.count(inst_addr
)) {
1331 victims
.push_back(s
);
1336 for (const auto& s
: victims
) {
1337 kill_session(s
, nullptr);
1340 dout(10) << "apply_blocklist: killed " << victims
.size() << dendl
;
1342 return victims
.size();
1345 void Server::journal_close_session(Session
*session
, int state
, Context
*on_safe
)
1347 dout(10) << __func__
<< " : "
1348 << session
->info
.inst
1349 << " pending_prealloc_inos " << session
->pending_prealloc_inos
1350 << " free_prealloc_inos " << session
->free_prealloc_inos
1351 << " delegated_inos " << session
->delegated_inos
<< dendl
;
1353 uint64_t sseq
= mds
->sessionmap
.set_state(session
, state
);
1354 version_t pv
= mds
->sessionmap
.mark_projected(session
);
1357 // release alloc and pending-alloc inos for this session
1358 // and wipe out session state, in case the session close aborts for some reason
1359 interval_set
<inodeno_t
> inos_to_free
;
1360 inos_to_free
.insert(session
->pending_prealloc_inos
);
1361 inos_to_free
.insert(session
->free_prealloc_inos
);
1362 if (inos_to_free
.size()) {
1363 mds
->inotable
->project_release_ids(inos_to_free
);
1364 piv
= mds
->inotable
->get_projected_version();
1368 auto le
= new ESession(session
->info
.inst
, false, pv
, inos_to_free
, piv
, session
->delegated_inos
);
1369 auto fin
= new C_MDS_session_finish(this, session
, sseq
, false, pv
, inos_to_free
, piv
,
1370 session
->delegated_inos
, mdlog
->get_current_segment(), on_safe
);
1371 mdlog
->start_submit_entry(le
, fin
);
1374 // clean up requests, too
1375 while(!session
->requests
.empty()) {
1376 auto mdr
= MDRequestRef(*session
->requests
.begin());
1377 mdcache
->request_kill(mdr
);
1380 finish_flush_session(session
, session
->get_push_seq());
1383 void Server::reconnect_clients(MDSContext
*reconnect_done_
)
1385 reconnect_done
= reconnect_done_
;
1387 auto now
= clock::now();
1388 set
<Session
*> sessions
;
1389 mds
->sessionmap
.get_client_session_set(sessions
);
1390 for (auto session
: sessions
) {
1391 if (session
->is_open()) {
1392 client_reconnect_gather
.insert(session
->get_client());
1393 session
->set_reconnecting(true);
1394 session
->last_cap_renew
= now
;
1398 if (client_reconnect_gather
.empty()) {
1399 dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl
;
1400 reconnect_gather_finish();
1404 // clients will get the mdsmap and discover we're reconnecting via the monitor.
1406 reconnect_start
= now
;
1407 dout(1) << "reconnect_clients -- " << client_reconnect_gather
.size() << " sessions" << dendl
;
1408 mds
->sessionmap
.dump();
1411 void Server::handle_client_reconnect(const cref_t
<MClientReconnect
> &m
)
1413 dout(7) << "handle_client_reconnect " << m
->get_source()
1414 << (m
->has_more() ? " (more)" : "") << dendl
;
1415 client_t from
= m
->get_source().num();
1416 Session
*session
= mds
->get_session(m
);
1418 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
1419 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
1420 reply
->metadata
["error_string"] = "sessionless";
1421 mds
->send_message(reply
, m
->get_connection());
1425 if (!session
->is_open()) {
1426 dout(0) << " ignoring msg from not-open session" << *m
<< dendl
;
1427 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_CLOSE
);
1428 mds
->send_message(reply
, m
->get_connection());
1432 bool reconnect_all_deny
= g_conf().get_val
<bool>("mds_deny_all_reconnect");
1434 if (!mds
->is_reconnect() && mds
->get_want_state() == CEPH_MDS_STATE_RECONNECT
) {
1435 dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl
;
1436 mds
->wait_for_reconnect(new C_MDS_RetryMessage(mds
, m
));
1440 auto delay
= std::chrono::duration
<double>(clock::now() - reconnect_start
).count();
1441 dout(10) << " reconnect_start " << reconnect_start
<< " delay " << delay
<< dendl
;
1444 if (reconnect_all_deny
|| !mds
->is_reconnect() || mds
->get_want_state() != CEPH_MDS_STATE_RECONNECT
|| reconnect_evicting
) {
1445 // XXX maybe in the future we can do better than this?
1446 if (reconnect_all_deny
) {
1447 dout(1) << "mds_deny_all_reconnect was set to speed up reboot phase, ignoring reconnect, sending close" << dendl
;
1449 dout(1) << "no longer in reconnect state, ignoring reconnect, sending close" << dendl
;
1451 mds
->clog
->info() << "denied reconnect attempt (mds is "
1452 << ceph_mds_state_name(mds
->get_state())
1453 << ") from " << m
->get_source_inst()
1454 << " after " << delay
<< " (allowed interval " << g_conf()->mds_reconnect_timeout
<< ")";
1457 std::string error_str
;
1458 if (!session
->is_open()) {
1459 error_str
= "session is closed";
1460 } else if (mdcache
->is_readonly()) {
1461 error_str
= "mds is readonly";
1463 if (session
->info
.client_metadata
.features
.empty())
1464 infer_supported_features(session
, session
->info
.client_metadata
);
1466 feature_bitset_t missing_features
= required_client_features
;
1467 missing_features
-= session
->info
.client_metadata
.features
;
1468 if (!missing_features
.empty()) {
1469 CachedStackStringStream css
;
1470 *css
<< "missing required features '" << missing_features
<< "'";
1471 error_str
= css
->strv();
1475 if (!error_str
.empty()) {
1477 dout(1) << " " << error_str
<< ", ignoring reconnect, sending close" << dendl
;
1478 mds
->clog
->info() << "denied reconnect attempt from "
1479 << m
->get_source_inst() << " (" << error_str
<< ")";
1484 auto r
= make_message
<MClientSession
>(CEPH_SESSION_CLOSE
);
1485 mds
->send_message_client(r
, session
);
1486 if (session
->is_open()) {
1487 client_reconnect_denied
.insert(session
->get_client());
1492 if (!m
->has_more()) {
1493 metrics_handler
->add_session(session
);
1494 // notify client of success with an OPEN
1495 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_OPEN
);
1496 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
))
1497 reply
->supported_features
= supported_features
;
1498 mds
->send_message_client(reply
, session
);
1499 mds
->clog
->debug() << "reconnect by " << session
->info
.inst
<< " after " << delay
;
1502 session
->last_cap_renew
= clock::now();
1505 for (const auto &r
: m
->realms
) {
1506 CInode
*in
= mdcache
->get_inode(inodeno_t(r
.realm
.ino
));
1507 if (in
&& in
->state_test(CInode::STATE_PURGING
))
1510 if (in
->snaprealm
) {
1511 dout(15) << "open snaprealm (w inode) on " << *in
<< dendl
;
1513 // this can happen if we are non-auth or we rollback snaprealm
1514 dout(15) << "open snaprealm (null snaprealm) on " << *in
<< dendl
;
1516 mdcache
->add_reconnected_snaprealm(from
, inodeno_t(r
.realm
.ino
), snapid_t(r
.realm
.seq
));
1518 dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r
.realm
.ino
)
1519 << " seq " << r
.realm
.seq
<< dendl
;
1520 mdcache
->add_reconnected_snaprealm(from
, inodeno_t(r
.realm
.ino
), snapid_t(r
.realm
.seq
));
1525 for (const auto &p
: m
->caps
) {
1526 // make sure our last_cap_id is MAX over all issued caps
1527 if (p
.second
.capinfo
.cap_id
> mdcache
->last_cap_id
)
1528 mdcache
->last_cap_id
= p
.second
.capinfo
.cap_id
;
1530 CInode
*in
= mdcache
->get_inode(p
.first
);
1531 if (in
&& in
->state_test(CInode::STATE_PURGING
))
1533 if (in
&& in
->is_auth()) {
1534 // we recovered it, and it's ours. take note.
1535 dout(15) << "open cap realm " << inodeno_t(p
.second
.capinfo
.snaprealm
)
1536 << " on " << *in
<< dendl
;
1537 in
->reconnect_cap(from
, p
.second
, session
);
1538 mdcache
->add_reconnected_cap(from
, p
.first
, p
.second
);
1539 recover_filelocks(in
, p
.second
.flockbl
, m
->get_orig_source().num());
1543 if (in
&& !in
->is_auth()) {
1545 dout(10) << "non-auth " << *in
<< ", will pass off to authority" << dendl
;
1546 // add to cap export list.
1547 mdcache
->rejoin_export_caps(p
.first
, from
, p
.second
,
1548 in
->authority().first
, true);
1550 // don't know if the inode is mine
1551 dout(10) << "missing ino " << p
.first
<< ", will load later" << dendl
;
1552 mdcache
->rejoin_recovered_caps(p
.first
, from
, p
.second
, MDS_RANK_NONE
);
1556 reconnect_last_seen
= clock::now();
1558 if (!m
->has_more()) {
1559 mdcache
->rejoin_recovered_client(session
->get_client(), session
->info
.inst
);
1561 // remove from gather set
1562 client_reconnect_gather
.erase(from
);
1563 session
->set_reconnecting(false);
1564 if (client_reconnect_gather
.empty())
1565 reconnect_gather_finish();
1569 void Server::infer_supported_features(Session
*session
, client_metadata_t
& client_metadata
)
1572 auto it
= client_metadata
.find("ceph_version");
1573 if (it
!= client_metadata
.end()) {
1574 // user space client
1575 if (it
->second
.compare(0, 16, "ceph version 12.") == 0)
1576 supported
= CEPHFS_FEATURE_LUMINOUS
;
1577 else if (session
->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR
))
1578 supported
= CEPHFS_FEATURE_KRAKEN
;
1580 it
= client_metadata
.find("kernel_version");
1581 if (it
!= client_metadata
.end()) {
1583 if (session
->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING
))
1584 supported
= CEPHFS_FEATURE_LUMINOUS
;
1587 if (supported
== -1 &&
1588 session
->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2
))
1589 supported
= CEPHFS_FEATURE_JEWEL
;
1591 if (supported
>= 0) {
1592 unsigned long value
= (1UL << (supported
+ 1)) - 1;
1593 client_metadata
.features
= feature_bitset_t(value
);
1594 dout(10) << __func__
<< " got '" << client_metadata
.features
<< "'" << dendl
;
1598 void Server::update_required_client_features()
1600 required_client_features
= mds
->mdsmap
->get_required_client_features();
1601 dout(7) << "required_client_features: " << required_client_features
<< dendl
;
1603 if (mds
->get_state() >= MDSMap::STATE_RECONNECT
) {
1604 set
<Session
*> sessions
;
1605 mds
->sessionmap
.get_client_session_set(sessions
);
1606 for (auto session
: sessions
) {
1607 feature_bitset_t missing_features
= required_client_features
;
1608 missing_features
-= session
->info
.client_metadata
.features
;
1609 if (!missing_features
.empty()) {
1610 bool blocklisted
= mds
->objecter
->with_osdmap(
1611 [session
](const OSDMap
&osd_map
) -> bool {
1612 return osd_map
.is_blocklisted(session
->info
.inst
.addr
);
1617 mds
->clog
->warn() << "evicting session " << *session
<< ", missing required features '"
1618 << missing_features
<< "'";
1619 CachedStackStringStream css
;
1620 mds
->evict_client(session
->get_client().v
, false,
1621 g_conf()->mds_session_blocklist_on_evict
, *css
);
1627 void Server::reconnect_gather_finish()
1629 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects
<< " clients" << dendl
;
1630 ceph_assert(reconnect_done
);
1632 if (!mds
->snapclient
->is_synced()) {
1633 // make sure snaptable cache is populated. snaprealms will be
1634 // extensively used in rejoin stage.
1635 dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl
;
1636 mds
->snapclient
->wait_for_sync(reconnect_done
);
1638 reconnect_done
->complete(0);
1640 reconnect_done
= NULL
;
1643 void Server::reconnect_tick()
1645 bool reject_all_reconnect
= false;
1646 if (reconnect_evicting
) {
1647 dout(7) << "reconnect_tick: waiting for evictions" << dendl
;
1652 * Set mds_deny_all_reconnect to reject all the reconnect req ,
1653 * then load less meta information in rejoin phase. This will shorten reboot time.
1654 * Moreover, loading less meta increases the chance standby with less memory can failover.
1656 * Why not shorten reconnect period?
1657 * Clients may send unsafe or retry requests, which haven't been
1658 * completed before old mds stop, to new mds. These requests may
1659 * need to be processed during new mds's clientreplay phase,
1660 * see: #https://github.com/ceph/ceph/pull/29059.
1662 bool reconnect_all_deny
= g_conf().get_val
<bool>("mds_deny_all_reconnect");
1663 if (client_reconnect_gather
.empty())
1666 if (reconnect_all_deny
&& (client_reconnect_gather
== client_reconnect_denied
))
1667 reject_all_reconnect
= true;
1669 auto now
= clock::now();
1670 auto elapse1
= std::chrono::duration
<double>(now
- reconnect_start
).count();
1671 if (elapse1
< g_conf()->mds_reconnect_timeout
&& !reject_all_reconnect
)
1674 vector
<Session
*> remaining_sessions
;
1675 remaining_sessions
.reserve(client_reconnect_gather
.size());
1676 for (auto c
: client_reconnect_gather
) {
1677 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(c
.v
));
1678 ceph_assert(session
);
1679 remaining_sessions
.push_back(session
);
1680 // client re-sends cap flush messages before the reconnect message
1681 if (session
->last_seen
> reconnect_last_seen
)
1682 reconnect_last_seen
= session
->last_seen
;
1685 auto elapse2
= std::chrono::duration
<double>(now
- reconnect_last_seen
).count();
1686 if (elapse2
< g_conf()->mds_reconnect_timeout
/ 2 && !reject_all_reconnect
) {
1687 dout(7) << "reconnect_tick: last seen " << elapse2
1688 << " seconds ago, extending reconnect interval" << dendl
;
1692 dout(7) << "reconnect timed out, " << remaining_sessions
.size()
1693 << " clients have not reconnected in time" << dendl
;
1695 // If we're doing blocklist evictions, use this to wait for them before
1696 // proceeding to reconnect_gather_finish
1697 MDSGatherBuilder
gather(g_ceph_context
);
1699 for (auto session
: remaining_sessions
) {
1700 // Keep sessions that have specified timeout. These sessions will prevent
1701 // mds from going to active. MDS goes to active after they all have been
1702 // killed or reclaimed.
1703 if (session
->info
.client_metadata
.find("timeout") !=
1704 session
->info
.client_metadata
.end()) {
1705 dout(1) << "reconnect keeps " << session
->info
.inst
1706 << ", need to be reclaimed" << dendl
;
1707 client_reclaim_gather
.insert(session
->get_client());
1711 dout(1) << "reconnect gives up on " << session
->info
.inst
<< dendl
;
1713 mds
->clog
->warn() << "evicting unresponsive client " << *session
1714 << ", after waiting " << elapse1
1715 << " seconds during MDS startup";
1717 // make _session_logged() purge orphan objects of lost async/unsafe requests
1718 session
->delegated_inos
.swap(session
->free_prealloc_inos
);
1720 if (g_conf()->mds_session_blocklist_on_timeout
) {
1721 CachedStackStringStream css
;
1722 mds
->evict_client(session
->get_client().v
, false, true, *css
,
1725 kill_session(session
, NULL
);
1728 failed_reconnects
++;
1730 client_reconnect_gather
.clear();
1731 client_reconnect_denied
.clear();
1733 if (gather
.has_subs()) {
1734 dout(1) << "reconnect will complete once clients are evicted" << dendl
;
1735 gather
.set_finisher(new MDSInternalContextWrapper(mds
, new LambdaContext(
1736 [this](int r
){reconnect_gather_finish();})));
1738 reconnect_evicting
= true;
1740 reconnect_gather_finish();
1744 void Server::recover_filelocks(CInode
*in
, bufferlist locks
, int64_t client
)
1746 if (!locks
.length()) return;
1749 auto p
= locks
.cbegin();
1750 decode(numlocks
, p
);
1751 for (int i
= 0; i
< numlocks
; ++i
) {
1753 lock
.client
= client
;
1754 in
->get_fcntl_lock_state()->held_locks
.insert(pair
<uint64_t, ceph_filelock
>(lock
.start
, lock
));
1755 ++in
->get_fcntl_lock_state()->client_held_lock_counts
[client
];
1757 decode(numlocks
, p
);
1758 for (int i
= 0; i
< numlocks
; ++i
) {
1760 lock
.client
= client
;
1761 in
->get_flock_lock_state()->held_locks
.insert(pair
<uint64_t, ceph_filelock
> (lock
.start
, lock
));
1762 ++in
->get_flock_lock_state()->client_held_lock_counts
[client
];
1767 * Call this when the MDCache is oversized, to send requests to the clients
1768 * to trim some caps, and consequently unpin some inodes in the MDCache so
1769 * that it can trim too.
1771 std::pair
<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder
* gather
, RecallFlags flags
)
1773 const auto now
= clock::now();
1774 const bool steady
= !!(flags
&RecallFlags::STEADY
);
1775 const bool enforce_max
= !!(flags
&RecallFlags::ENFORCE_MAX
);
1776 const bool enforce_liveness
= !!(flags
&RecallFlags::ENFORCE_LIVENESS
);
1777 const bool trim
= !!(flags
&RecallFlags::TRIM
);
1779 const auto max_caps_per_client
= g_conf().get_val
<uint64_t>("mds_max_caps_per_client");
1780 const auto min_caps_per_client
= g_conf().get_val
<uint64_t>("mds_min_caps_per_client");
1781 const auto recall_global_max_decay_threshold
= g_conf().get_val
<Option::size_t>("mds_recall_global_max_decay_threshold");
1782 const auto recall_max_caps
= g_conf().get_val
<Option::size_t>("mds_recall_max_caps");
1783 const auto recall_max_decay_threshold
= g_conf().get_val
<Option::size_t>("mds_recall_max_decay_threshold");
1784 const auto cache_liveness_magnitude
= g_conf().get_val
<Option::size_t>("mds_session_cache_liveness_magnitude");
1786 dout(7) << __func__
<< ":"
1787 << " min=" << min_caps_per_client
1788 << " max=" << max_caps_per_client
1789 << " total=" << Capability::count()
1790 << " flags=" << flags
1793 /* trim caps of sessions with the most caps first */
1794 std::multimap
<uint64_t, Session
*> caps_session
;
1795 auto f
= [&caps_session
, enforce_max
, enforce_liveness
, trim
, max_caps_per_client
, cache_liveness_magnitude
](auto& s
) {
1796 auto num_caps
= s
->caps
.size();
1797 auto cache_liveness
= s
->get_session_cache_liveness();
1798 if (trim
|| (enforce_max
&& num_caps
> max_caps_per_client
) || (enforce_liveness
&& cache_liveness
< (num_caps
>>cache_liveness_magnitude
))) {
1799 caps_session
.emplace(std::piecewise_construct
, std::forward_as_tuple(num_caps
), std::forward_as_tuple(s
));
1802 mds
->sessionmap
.get_client_sessions(std::move(f
));
1804 std::pair
<bool, uint64_t> result
= {false, 0};
1805 auto& [throttled
, caps_recalled
] = result
;
1806 last_recall_state
= now
;
1807 for (const auto& [num_caps
, session
] : boost::adaptors::reverse(caps_session
)) {
1808 if (!session
->is_open() ||
1809 !session
->get_connection() ||
1810 !session
->info
.inst
.name
.is_client())
1813 dout(10) << __func__
<< ":"
1814 << " session " << session
->info
.inst
1815 << " caps " << num_caps
1816 << ", leases " << session
->leases
.size()
1820 if (num_caps
< recall_max_caps
|| (num_caps
-recall_max_caps
) < min_caps_per_client
) {
1821 newlim
= min_caps_per_client
;
1823 newlim
= num_caps
-recall_max_caps
;
1825 if (num_caps
> newlim
) {
1826 /* now limit the number of caps we recall at a time to prevent overloading ourselves */
1827 uint64_t recall
= std::min
<uint64_t>(recall_max_caps
, num_caps
-newlim
);
1828 newlim
= num_caps
-recall
;
1829 const uint64_t session_recall_throttle
= session
->get_recall_caps_throttle();
1830 const uint64_t session_recall_throttle2o
= session
->get_recall_caps_throttle2o();
1831 const uint64_t global_recall_throttle
= recall_throttle
.get();
1832 if (session_recall_throttle
+recall
> recall_max_decay_threshold
) {
1833 dout(15) << " session recall threshold (" << recall_max_decay_threshold
<< ") hit at " << session_recall_throttle
<< "; skipping!" << dendl
;
1836 } else if (session_recall_throttle2o
+recall
> recall_max_caps
*2) {
1837 dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps
<< ") hit at " << session_recall_throttle2o
<< "; skipping!" << dendl
;
1840 } else if (global_recall_throttle
+recall
> recall_global_max_decay_threshold
) {
1841 dout(15) << " global recall threshold (" << recall_global_max_decay_threshold
<< ") hit at " << global_recall_throttle
<< "; skipping!" << dendl
;
1846 // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
1848 const auto session_recall
= session
->get_recall_caps();
1849 const auto session_release
= session
->get_release_caps();
1850 if (2*session_release
< session_recall
&& 2*session_recall
> recall_max_decay_threshold
) {
1851 /* The session has been unable to keep up with the number of caps
1852 * recalled (by half); additionally, to prevent marking sessions
1853 * we've just begun to recall from, the session_recall counter
1854 * (decayed count of caps recently recalled) is **greater** than the
1855 * session threshold for the session's cap recall throttle.
1857 dout(15) << " 2*session_release < session_recall"
1858 " (2*" << session_release
<< " < " << session_recall
<< ") &&"
1859 " 2*session_recall < recall_max_decay_threshold"
1860 " (2*" << session_recall
<< " > " << recall_max_decay_threshold
<< ")"
1861 " Skipping because we are unlikely to get more released." << dendl
;
1863 } else if (recall
< recall_max_caps
&& 2*recall
< session_recall
) {
1864 /* The number of caps recalled is less than the number we *could*
1865 * recall (so there isn't much left to recall?) and the number of
1866 * caps is less than the current recall_caps counter (decayed count
1867 * of caps recently recalled).
1869 dout(15) << " 2*recall < session_recall "
1870 " (2*" << recall
<< " < " << session_recall
<< ") &&"
1871 " recall < recall_max_caps (" << recall
<< " < " << recall_max_caps
<< ");"
1872 " Skipping because we are unlikely to get more released." << dendl
;
1877 dout(7) << " recalling " << recall
<< " caps; session_recall_throttle = " << session_recall_throttle
<< "; global_recall_throttle = " << global_recall_throttle
<< dendl
;
1879 auto m
= make_message
<MClientSession
>(CEPH_SESSION_RECALL_STATE
);
1880 m
->head
.max_caps
= newlim
;
1881 mds
->send_message_client(m
, session
);
1883 flush_session(session
, *gather
);
1885 caps_recalled
+= session
->notify_recall_sent(newlim
);
1886 recall_throttle
.hit(recall
);
1890 dout(7) << "recalled" << (throttled
? " (throttled)" : "") << " " << caps_recalled
<< " client caps." << dendl
;
1895 void Server::force_clients_readonly()
1897 dout(10) << "force_clients_readonly" << dendl
;
1898 set
<Session
*> sessions
;
1899 mds
->sessionmap
.get_client_session_set(sessions
);
1900 for (set
<Session
*>::const_iterator p
= sessions
.begin();
1901 p
!= sessions
.end();
1903 Session
*session
= *p
;
1904 if (!session
->info
.inst
.name
.is_client() ||
1905 !(session
->is_open() || session
->is_stale()))
1907 mds
->send_message_client(make_message
<MClientSession
>(CEPH_SESSION_FORCE_RO
), session
);
1912 * some generic stuff for finishing off requests
1914 void Server::journal_and_reply(MDRequestRef
& mdr
, CInode
*in
, CDentry
*dn
, LogEvent
*le
, MDSLogContextBase
*fin
)
1916 dout(10) << "journal_and_reply tracei " << in
<< " tracedn " << dn
<< dendl
;
1917 ceph_assert(!mdr
->has_completed
);
1919 // note trace items for eventual reply.
1928 early_reply(mdr
, in
, dn
);
1930 mdr
->committing
= true;
1931 submit_mdlog_entry(le
, fin
, mdr
, __func__
);
1933 if (mdr
->client_request
&& mdr
->client_request
->is_queued_for_replay()) {
1934 if (mds
->queue_one_replay()) {
1935 dout(10) << " queued next replay op" << dendl
;
1937 dout(10) << " journaled last replay op" << dendl
;
1939 } else if (mdr
->did_early_reply
)
1940 mds
->locker
->drop_rdlocks_for_early_reply(mdr
.get());
1945 void Server::submit_mdlog_entry(LogEvent
*le
, MDSLogContextBase
*fin
, MDRequestRef
& mdr
,
1946 std::string_view event
)
1949 string
event_str("submit entry: ");
1951 mdr
->mark_event(event_str
);
1953 mdlog
->submit_entry(le
, fin
);
1957 * send response built from mdr contents and error code; clean up mdr
1959 void Server::respond_to_request(MDRequestRef
& mdr
, int r
)
1961 if (mdr
->client_request
) {
1962 if (mdr
->is_batch_head()) {
1963 dout(20) << __func__
<< " batch head " << *mdr
<< dendl
;
1964 mdr
->release_batch_op()->respond(r
);
1966 reply_client_request(mdr
, make_message
<MClientReply
>(*mdr
->client_request
, r
));
1968 } else if (mdr
->internal_op
> -1) {
1969 dout(10) << "respond_to_request on internal request " << mdr
<< dendl
;
1970 if (!mdr
->internal_op_finish
)
1971 ceph_abort_msg("trying to respond to internal op without finisher");
1972 mdr
->internal_op_finish
->complete(r
);
1973 mdcache
->request_finish(mdr
);
1977 // statistics mds req op number and latency
1978 void Server::perf_gather_op_latency(const cref_t
<MClientRequest
> &req
, utime_t lat
)
1980 int code
= l_mdss_first
;
1981 switch(req
->get_op()) {
1982 case CEPH_MDS_OP_LOOKUPHASH
:
1983 code
= l_mdss_req_lookuphash_latency
;
1985 case CEPH_MDS_OP_LOOKUPINO
:
1986 code
= l_mdss_req_lookupino_latency
;
1988 case CEPH_MDS_OP_LOOKUPPARENT
:
1989 code
= l_mdss_req_lookupparent_latency
;
1991 case CEPH_MDS_OP_LOOKUPNAME
:
1992 code
= l_mdss_req_lookupname_latency
;
1994 case CEPH_MDS_OP_LOOKUP
:
1995 code
= l_mdss_req_lookup_latency
;
1997 case CEPH_MDS_OP_LOOKUPSNAP
:
1998 code
= l_mdss_req_lookupsnap_latency
;
2000 case CEPH_MDS_OP_GETATTR
:
2001 code
= l_mdss_req_getattr_latency
;
2003 case CEPH_MDS_OP_SETATTR
:
2004 code
= l_mdss_req_setattr_latency
;
2006 case CEPH_MDS_OP_SETLAYOUT
:
2007 code
= l_mdss_req_setlayout_latency
;
2009 case CEPH_MDS_OP_SETDIRLAYOUT
:
2010 code
= l_mdss_req_setdirlayout_latency
;
2012 case CEPH_MDS_OP_SETXATTR
:
2013 code
= l_mdss_req_setxattr_latency
;
2015 case CEPH_MDS_OP_RMXATTR
:
2016 code
= l_mdss_req_rmxattr_latency
;
2018 case CEPH_MDS_OP_READDIR
:
2019 code
= l_mdss_req_readdir_latency
;
2021 case CEPH_MDS_OP_SETFILELOCK
:
2022 code
= l_mdss_req_setfilelock_latency
;
2024 case CEPH_MDS_OP_GETFILELOCK
:
2025 code
= l_mdss_req_getfilelock_latency
;
2027 case CEPH_MDS_OP_CREATE
:
2028 code
= l_mdss_req_create_latency
;
2030 case CEPH_MDS_OP_OPEN
:
2031 code
= l_mdss_req_open_latency
;
2033 case CEPH_MDS_OP_MKNOD
:
2034 code
= l_mdss_req_mknod_latency
;
2036 case CEPH_MDS_OP_LINK
:
2037 code
= l_mdss_req_link_latency
;
2039 case CEPH_MDS_OP_UNLINK
:
2040 code
= l_mdss_req_unlink_latency
;
2042 case CEPH_MDS_OP_RMDIR
:
2043 code
= l_mdss_req_rmdir_latency
;
2045 case CEPH_MDS_OP_RENAME
:
2046 code
= l_mdss_req_rename_latency
;
2048 case CEPH_MDS_OP_MKDIR
:
2049 code
= l_mdss_req_mkdir_latency
;
2051 case CEPH_MDS_OP_SYMLINK
:
2052 code
= l_mdss_req_symlink_latency
;
2054 case CEPH_MDS_OP_LSSNAP
:
2055 code
= l_mdss_req_lssnap_latency
;
2057 case CEPH_MDS_OP_MKSNAP
:
2058 code
= l_mdss_req_mksnap_latency
;
2060 case CEPH_MDS_OP_RMSNAP
:
2061 code
= l_mdss_req_rmsnap_latency
;
2063 case CEPH_MDS_OP_RENAMESNAP
:
2064 code
= l_mdss_req_renamesnap_latency
;
2067 dout(1) << ": unknown client op" << dendl
;
2070 logger
->tinc(code
, lat
);
2073 void Server::early_reply(MDRequestRef
& mdr
, CInode
*tracei
, CDentry
*tracedn
)
2075 if (!g_conf()->mds_early_reply
)
2078 if (mdr
->no_early_reply
) {
2079 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl
;
2083 if (mdr
->has_more() && mdr
->more()->has_journaled_peers
) {
2084 dout(10) << "early_reply - there are journaled peers, not allowed." << dendl
;
2088 if (mdr
->alloc_ino
) {
2089 dout(10) << "early_reply - allocated ino, not allowed" << dendl
;
2093 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
2094 entity_inst_t client_inst
= req
->get_source_inst();
2095 if (client_inst
.name
.is_mds())
2098 if (req
->is_replay()) {
2099 dout(10) << " no early reply on replay op" << dendl
;
2104 auto reply
= make_message
<MClientReply
>(*req
, 0);
2105 reply
->set_unsafe();
2107 // mark xlocks "done", indicating that we are exposing uncommitted changes.
2109 //_rename_finish() does not send dentry link/unlink message to replicas.
2110 // so do not set xlocks on dentries "done", the xlocks prevent dentries
2111 // that have projected linkages from getting new replica.
2112 mds
->locker
->set_xlocks_done(mdr
.get(), req
->get_op() == CEPH_MDS_OP_RENAME
);
2114 dout(10) << "early_reply " << reply
->get_result()
2115 << " (" << cpp_strerror(reply
->get_result())
2116 << ") " << *req
<< dendl
;
2118 if (tracei
|| tracedn
) {
2120 mdr
->cap_releases
.erase(tracei
->vino());
2122 mdr
->cap_releases
.erase(tracedn
->get_dir()->get_inode()->vino());
2124 set_trace_dist(reply
, tracei
, tracedn
, mdr
);
2127 reply
->set_extra_bl(mdr
->reply_extra_bl
);
2128 mds
->send_message_client(reply
, mdr
->session
);
2130 mdr
->did_early_reply
= true;
2132 mds
->logger
->inc(l_mds_reply
);
2133 utime_t lat
= ceph_clock_now() - req
->get_recv_stamp();
2134 mds
->logger
->tinc(l_mds_reply_latency
, lat
);
2135 if (client_inst
.name
.is_client()) {
2136 mds
->sessionmap
.hit_session(mdr
->session
);
2138 perf_gather_op_latency(req
, lat
);
2139 dout(20) << "lat " << lat
<< dendl
;
2141 mdr
->mark_event("early_replied");
2146 * include a trace to tracei
2149 void Server::reply_client_request(MDRequestRef
& mdr
, const ref_t
<MClientReply
> &reply
)
2151 ceph_assert(mdr
.get());
2152 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
2154 dout(7) << "reply_client_request " << reply
->get_result()
2155 << " (" << cpp_strerror(reply
->get_result())
2156 << ") " << *req
<< dendl
;
2158 mdr
->mark_event("replying");
2160 Session
*session
= mdr
->session
;
2162 // note successful request in session map?
2164 // setfilelock requests are special, they only modify states in MDS memory.
2165 // The states get lost when MDS fails. If Client re-send a completed
2166 // setfilelock request, it means that client did not receive corresponding
2167 // setfilelock reply. So MDS should re-execute the setfilelock request.
2168 if (req
->may_write() && req
->get_op() != CEPH_MDS_OP_SETFILELOCK
&&
2169 reply
->get_result() == 0 && session
) {
2170 inodeno_t created
= mdr
->alloc_ino
? mdr
->alloc_ino
: mdr
->used_prealloc_ino
;
2171 session
->add_completed_request(mdr
->reqid
.tid
, created
);
2173 mdr
->ls
->touched_sessions
.insert(session
->info
.inst
.name
);
2177 // give any preallocated inos to the session
2178 apply_allocated_inos(mdr
, session
);
2180 // get tracei/tracedn from mdr?
2181 CInode
*tracei
= mdr
->tracei
;
2182 CDentry
*tracedn
= mdr
->tracedn
;
2184 bool is_replay
= mdr
->client_request
->is_replay();
2185 bool did_early_reply
= mdr
->did_early_reply
;
2186 entity_inst_t client_inst
= req
->get_source_inst();
2188 if (!did_early_reply
&& !is_replay
) {
2190 mds
->logger
->inc(l_mds_reply
);
2191 utime_t lat
= ceph_clock_now() - mdr
->client_request
->get_recv_stamp();
2192 mds
->logger
->tinc(l_mds_reply_latency
, lat
);
2193 if (session
&& client_inst
.name
.is_client()) {
2194 mds
->sessionmap
.hit_session(session
);
2196 perf_gather_op_latency(req
, lat
);
2197 dout(20) << "lat " << lat
<< dendl
;
2200 mdr
->cap_releases
.erase(tracei
->vino());
2202 mdr
->cap_releases
.erase(tracedn
->get_dir()->get_inode()->vino());
2205 // drop non-rdlocks before replying, so that we can issue leases
2206 mdcache
->request_drop_non_rdlocks(mdr
);
2209 if (session
&& !client_inst
.name
.is_mds()) {
2211 if (!did_early_reply
&& // don't issue leases if we sent an earlier reply already
2212 (tracei
|| tracedn
)) {
2215 mdcache
->try_reconnect_cap(tracei
, session
);
2217 // include metadata in reply
2218 set_trace_dist(reply
, tracei
, tracedn
, mdr
);
2222 // We can set the extra bl unconditionally: if it's already been sent in the
2223 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
2224 reply
->set_extra_bl(mdr
->reply_extra_bl
);
2226 reply
->set_mdsmap_epoch(mds
->mdsmap
->get_epoch());
2227 mds
->send_message_client(reply
, session
);
2230 if (req
->is_queued_for_replay() &&
2231 (mdr
->has_completed
|| reply
->get_result() < 0)) {
2232 if (reply
->get_result() < 0) {
2233 int r
= reply
->get_result();
2234 derr
<< "reply_client_request: failed to replay " << *req
2235 << " error " << r
<< " (" << cpp_strerror(r
) << ")" << dendl
;
2236 mds
->clog
->warn() << "failed to replay " << req
->get_reqid() << " error " << r
;
2238 mds
->queue_one_replay();
2242 mdcache
->request_finish(mdr
);
2244 // take a closer look at tracei, if it happens to be a remote link
2247 tracedn
->get_projected_linkage()->is_remote()) {
2248 mdcache
->eval_remote(tracedn
);
2253 * pass inode OR dentry (not both, or we may get confused)
2255 * trace is in reverse order (i.e. root inode comes last)
2257 void Server::set_trace_dist(const ref_t
<MClientReply
> &reply
,
2258 CInode
*in
, CDentry
*dn
,
2261 // skip doing this for debugging purposes?
2262 if (g_conf()->mds_inject_traceless_reply_probability
&&
2263 mdr
->ls
&& !mdr
->o_trunc
&&
2264 (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability
* 10000.0)) {
2265 dout(5) << "deliberately skipping trace for " << *reply
<< dendl
;
2269 // inode, dentry, dir, ..., inode
2271 mds_rank_t whoami
= mds
->get_nodeid();
2272 Session
*session
= mdr
->session
;
2273 snapid_t snapid
= mdr
->snapid
;
2274 utime_t now
= ceph_clock_now();
2276 dout(20) << "set_trace_dist snapid " << snapid
<< dendl
;
2279 if (snapid
== CEPH_NOSNAP
) {
2282 realm
= in
->find_snaprealm();
2284 realm
= dn
->get_dir()->get_inode()->find_snaprealm();
2285 reply
->snapbl
= realm
->get_snap_trace();
2286 dout(10) << "set_trace_dist snaprealm " << *realm
<< " len=" << reply
->snapbl
.length() << dendl
;
2291 reply
->head
.is_dentry
= 1;
2292 CDir
*dir
= dn
->get_dir();
2293 CInode
*diri
= dir
->get_inode();
2295 diri
->encode_inodestat(bl
, session
, NULL
, snapid
);
2296 dout(20) << "set_trace_dist added diri " << *diri
<< dendl
;
2298 #ifdef MDS_VERIFY_FRAGSTAT
2299 if (dir
->is_complete())
2300 dir
->verify_fragstat();
2303 ds
.frag
= dir
->get_frag();
2304 ds
.auth
= dir
->get_dir_auth().first
;
2305 if (dir
->is_auth() && !forward_all_requests_to_auth
)
2306 dir
->get_dist_spec(ds
.dist
, whoami
);
2308 dir
->encode_dirstat(bl
, session
->info
, ds
);
2309 dout(20) << "set_trace_dist added dir " << *dir
<< dendl
;
2311 encode(dn
->get_name(), bl
);
2314 CDentry::linkage_t
*dnl
= dn
->get_linkage(mdr
->get_client(), mdr
);
2315 if (dnl
->is_primary()) {
2316 ceph_assert(dnl
->get_inode() == in
);
2317 lease_mask
= CEPH_LEASE_PRIMARY_LINK
;
2319 if (dnl
->is_remote())
2320 ceph_assert(dnl
->get_remote_ino() == in
->ino());
2324 mds
->locker
->issue_client_lease(dn
, mdr
, lease_mask
, now
, bl
);
2325 dout(20) << "set_trace_dist added dn " << snapid
<< " " << *dn
<< dendl
;
2327 reply
->head
.is_dentry
= 0;
2331 in
->encode_inodestat(bl
, session
, NULL
, snapid
, 0, mdr
->getattr_caps
);
2332 dout(20) << "set_trace_dist added in " << *in
<< dendl
;
2333 reply
->head
.is_target
= 1;
2335 reply
->head
.is_target
= 0;
2337 reply
->set_trace(bl
);
2340 void Server::handle_client_request(const cref_t
<MClientRequest
> &req
)
2342 dout(4) << "handle_client_request " << *req
<< dendl
;
2345 mds
->logger
->inc(l_mds_request
);
2347 logger
->inc(l_mdss_handle_client_request
);
2349 if (!mdcache
->is_open()) {
2350 dout(5) << "waiting for root" << dendl
;
2351 mdcache
->wait_for_open(new C_MDS_RetryMessage(mds
, req
));
2355 bool sessionclosed_isok
= replay_unsafe_with_closed_session
;
2357 Session
*session
= 0;
2358 if (req
->get_source().is_client()) {
2359 session
= mds
->get_session(req
);
2361 dout(5) << "no session for " << req
->get_source() << ", dropping" << dendl
;
2362 } else if ((session
->is_closed() && (!mds
->is_clientreplay() || !sessionclosed_isok
)) ||
2363 session
->is_closing() ||
2364 session
->is_killing()) {
2365 dout(5) << "session closed|closing|killing, dropping" << dendl
;
2369 if (req
->is_queued_for_replay())
2370 mds
->queue_one_replay();
2376 if (req
->get_mdsmap_epoch() < mds
->mdsmap
->get_epoch()) {
2377 // send it? hrm, this isn't ideal; they may get a lot of copies if
2378 // they have a high request rate.
2381 // completed request?
2382 bool has_completed
= false;
2383 if (req
->is_replay() || req
->get_retry_attempt()) {
2384 ceph_assert(session
);
2386 if (session
->have_completed_request(req
->get_reqid().tid
, &created
)) {
2387 has_completed
= true;
2388 if (!session
->is_open())
2390 // Don't send traceless reply if the completed request has created
2391 // new inode. Treat the request as lookup request instead.
2392 if (req
->is_replay() ||
2393 ((created
== inodeno_t() || !mds
->is_clientreplay()) &&
2394 req
->get_op() != CEPH_MDS_OP_OPEN
&&
2395 req
->get_op() != CEPH_MDS_OP_CREATE
)) {
2396 dout(5) << "already completed " << req
->get_reqid() << dendl
;
2397 auto reply
= make_message
<MClientReply
>(*req
, 0);
2398 if (created
!= inodeno_t()) {
2400 encode(created
, extra
);
2401 reply
->set_extra_bl(extra
);
2403 mds
->send_message_client(reply
, session
);
2405 if (req
->is_queued_for_replay())
2406 mds
->queue_one_replay();
2410 if (req
->get_op() != CEPH_MDS_OP_OPEN
&&
2411 req
->get_op() != CEPH_MDS_OP_CREATE
) {
2412 dout(10) << " completed request which created new inode " << created
2413 << ", convert it to lookup request" << dendl
;
2414 req
->head
.op
= req
->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP
: CEPH_MDS_OP_GETATTR
;
2415 req
->head
.args
.getattr
.mask
= CEPH_STAT_CAP_INODE_ALL
;
2420 // trim completed_request list
2421 if (req
->get_oldest_client_tid() > 0) {
2422 dout(15) << " oldest_client_tid=" << req
->get_oldest_client_tid() << dendl
;
2423 ceph_assert(session
);
2424 if (session
->trim_completed_requests(req
->get_oldest_client_tid())) {
2425 // Sessions 'completed_requests' was dirtied, mark it to be
2426 // potentially flushed at segment expiry.
2427 mdlog
->get_current_segment()->touched_sessions
.insert(session
->info
.inst
.name
);
2429 if (session
->get_num_trim_requests_warnings() > 0 &&
2430 session
->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests
)
2431 session
->reset_num_trim_requests_warnings();
2433 if (session
->get_num_completed_requests() >=
2434 (g_conf()->mds_max_completed_requests
<< session
->get_num_trim_requests_warnings())) {
2435 session
->inc_num_trim_requests_warnings();
2436 CachedStackStringStream css
;
2437 *css
<< "client." << session
->get_client() << " does not advance its oldest_client_tid ("
2438 << req
->get_oldest_client_tid() << "), "
2439 << session
->get_num_completed_requests()
2440 << " completed requests recorded in session\n";
2441 mds
->clog
->warn() << css
->strv();
2442 dout(20) << __func__
<< " " << css
->strv() << dendl
;
2447 // register + dispatch
2448 MDRequestRef mdr
= mdcache
->request_start(req
);
2453 mdr
->session
= session
;
2454 session
->requests
.push_back(&mdr
->item_session_request
);
2458 mdr
->has_completed
= true;
2460 // process embedded cap releases?
2461 // (only if NOT replay!)
2462 if (!req
->releases
.empty() && req
->get_source().is_client() && !req
->is_replay()) {
2463 client_t client
= req
->get_source().num();
2464 for (const auto &r
: req
->releases
) {
2465 mds
->locker
->process_request_cap_release(mdr
, client
, r
.item
, r
.dname
);
2467 req
->releases
.clear();
2470 dispatch_client_request(mdr
);
2474 void Server::handle_osd_map()
2476 /* Note that we check the OSDMAP_FULL flag directly rather than
2477 * using osdmap_full_flag(), because we want to know "is the flag set"
2478 * rather than "does the flag apply to us?" */
2479 mds
->objecter
->with_osdmap([this](const OSDMap
& o
) {
2480 auto pi
= o
.get_pg_pool(mds
->get_metadata_pool());
2481 is_full
= pi
&& pi
->has_flag(pg_pool_t::FLAG_FULL
);
2482 dout(7) << __func__
<< ": full = " << is_full
<< " epoch = "
2483 << o
.get_epoch() << dendl
;
2487 void Server::dispatch_client_request(MDRequestRef
& mdr
)
2489 // we shouldn't be waiting on anyone.
2490 ceph_assert(!mdr
->has_more() || mdr
->more()->waiting_on_peer
.empty());
2493 dout(10) << "request " << *mdr
<< " was killed" << dendl
;
2494 //if the mdr is a "batch_op" and it has followers, pick a follower as
2495 //the new "head of the batch ops" and go on processing the new one.
2496 if (mdr
->is_batch_head()) {
2497 int mask
= mdr
->client_request
->head
.args
.getattr
.mask
;
2498 auto it
= mdr
->batch_op_map
->find(mask
);
2499 auto new_batch_head
= it
->second
->find_new_head();
2500 if (!new_batch_head
) {
2501 mdr
->batch_op_map
->erase(it
);
2504 mdr
= std::move(new_batch_head
);
2508 } else if (mdr
->aborted
) {
2509 mdr
->aborted
= false;
2510 mdcache
->request_kill(mdr
);
2514 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
2516 if (logger
) logger
->inc(l_mdss_dispatch_client_request
);
2518 dout(7) << "dispatch_client_request " << *req
<< dendl
;
2520 if (req
->may_write() && mdcache
->is_readonly()) {
2521 dout(10) << " read-only FS" << dendl
;
2522 respond_to_request(mdr
, -CEPHFS_EROFS
);
2525 if (mdr
->has_more() && mdr
->more()->peer_error
) {
2526 dout(10) << " got error from peers" << dendl
;
2527 respond_to_request(mdr
, mdr
->more()->peer_error
);
2532 CInode
*cur
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
2534 // the request is already responded to
2537 if (req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
2538 req
->get_op() == CEPH_MDS_OP_SETDIRLAYOUT
||
2539 req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
2540 req
->get_op() == CEPH_MDS_OP_RMXATTR
||
2541 req
->get_op() == CEPH_MDS_OP_SETXATTR
||
2542 req
->get_op() == CEPH_MDS_OP_CREATE
||
2543 req
->get_op() == CEPH_MDS_OP_SYMLINK
||
2544 req
->get_op() == CEPH_MDS_OP_MKSNAP
||
2545 ((req
->get_op() == CEPH_MDS_OP_LINK
||
2546 req
->get_op() == CEPH_MDS_OP_RENAME
) &&
2547 (!mdr
->has_more() || mdr
->more()->witnessed
.empty())) // haven't started peer request
2550 if (check_access(mdr
, cur
, MAY_FULL
)) {
2551 dout(20) << __func__
<< ": full, has FULL caps, permitting op " << ceph_mds_op_name(req
->get_op()) << dendl
;
2553 dout(20) << __func__
<< ": full, responding CEPHFS_ENOSPC to op " << ceph_mds_op_name(req
->get_op()) << dendl
;
2554 respond_to_request(mdr
, -CEPHFS_ENOSPC
);
2558 dout(20) << __func__
<< ": full, permitting op " << ceph_mds_op_name(req
->get_op()) << dendl
;
2562 switch (req
->get_op()) {
2563 case CEPH_MDS_OP_LOOKUPHASH
:
2564 case CEPH_MDS_OP_LOOKUPINO
:
2565 handle_client_lookup_ino(mdr
, false, false);
2567 case CEPH_MDS_OP_LOOKUPPARENT
:
2568 handle_client_lookup_ino(mdr
, true, false);
2570 case CEPH_MDS_OP_LOOKUPNAME
:
2571 handle_client_lookup_ino(mdr
, false, true);
2575 case CEPH_MDS_OP_LOOKUP
:
2576 handle_client_getattr(mdr
, true);
2579 case CEPH_MDS_OP_LOOKUPSNAP
:
2580 // lookupsnap does not reference a CDentry; treat it as a getattr
2581 case CEPH_MDS_OP_GETATTR
:
2582 handle_client_getattr(mdr
, false);
2585 case CEPH_MDS_OP_SETATTR
:
2586 handle_client_setattr(mdr
);
2588 case CEPH_MDS_OP_SETLAYOUT
:
2589 handle_client_setlayout(mdr
);
2591 case CEPH_MDS_OP_SETDIRLAYOUT
:
2592 handle_client_setdirlayout(mdr
);
2594 case CEPH_MDS_OP_SETXATTR
:
2595 handle_client_setxattr(mdr
);
2597 case CEPH_MDS_OP_RMXATTR
:
2598 handle_client_removexattr(mdr
);
2601 case CEPH_MDS_OP_READDIR
:
2602 handle_client_readdir(mdr
);
2605 case CEPH_MDS_OP_SETFILELOCK
:
2606 handle_client_file_setlock(mdr
);
2609 case CEPH_MDS_OP_GETFILELOCK
:
2610 handle_client_file_readlock(mdr
);
2614 case CEPH_MDS_OP_CREATE
:
2615 if (mdr
->has_completed
)
2616 handle_client_open(mdr
); // already created.. just open
2618 handle_client_openc(mdr
);
2621 case CEPH_MDS_OP_OPEN
:
2622 handle_client_open(mdr
);
2627 case CEPH_MDS_OP_MKNOD
:
2628 handle_client_mknod(mdr
);
2630 case CEPH_MDS_OP_LINK
:
2631 handle_client_link(mdr
);
2633 case CEPH_MDS_OP_UNLINK
:
2634 case CEPH_MDS_OP_RMDIR
:
2635 handle_client_unlink(mdr
);
2637 case CEPH_MDS_OP_RENAME
:
2638 handle_client_rename(mdr
);
2640 case CEPH_MDS_OP_MKDIR
:
2641 handle_client_mkdir(mdr
);
2643 case CEPH_MDS_OP_SYMLINK
:
2644 handle_client_symlink(mdr
);
2649 case CEPH_MDS_OP_LSSNAP
:
2650 handle_client_lssnap(mdr
);
2652 case CEPH_MDS_OP_MKSNAP
:
2653 handle_client_mksnap(mdr
);
2655 case CEPH_MDS_OP_RMSNAP
:
2656 handle_client_rmsnap(mdr
);
2658 case CEPH_MDS_OP_RENAMESNAP
:
2659 handle_client_renamesnap(mdr
);
2663 dout(1) << " unknown client op " << req
->get_op() << dendl
;
2664 respond_to_request(mdr
, -CEPHFS_EOPNOTSUPP
);
2669 // ---------------------------------------
2672 void Server::handle_peer_request(const cref_t
<MMDSPeerRequest
> &m
)
2674 dout(4) << "handle_peer_request " << m
->get_reqid() << " from " << m
->get_source() << dendl
;
2675 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2677 if (logger
) logger
->inc(l_mdss_handle_peer_request
);
2681 return handle_peer_request_reply(m
);
2683 // the purpose of rename notify is enforcing causal message ordering. making sure
2684 // bystanders have received all messages from rename srcdn's auth MDS.
2685 if (m
->get_op() == MMDSPeerRequest::OP_RENAMENOTIFY
) {
2686 auto reply
= make_message
<MMDSPeerRequest
>(m
->get_reqid(), m
->get_attempt(), MMDSPeerRequest::OP_RENAMENOTIFYACK
);
2687 mds
->send_message(reply
, m
->get_connection());
2691 CDentry
*straydn
= NULL
;
2692 if (m
->straybl
.length() > 0) {
2693 mdcache
->decode_replica_stray(straydn
, m
->straybl
, from
);
2694 ceph_assert(straydn
);
2698 if (!mds
->is_clientreplay() && !mds
->is_active() && !mds
->is_stopping()) {
2699 dout(3) << "not clientreplay|active yet, waiting" << dendl
;
2700 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
2706 if (mdcache
->have_request(m
->get_reqid())) {
2708 mdr
= mdcache
->request_get(m
->get_reqid());
2710 // is my request newer?
2711 if (mdr
->attempt
> m
->get_attempt()) {
2712 dout(10) << "local request " << *mdr
<< " attempt " << mdr
->attempt
<< " > " << m
->get_attempt()
2713 << ", dropping " << *m
<< dendl
;
2717 if (mdr
->attempt
< m
->get_attempt()) {
2718 // mine is old, close it out
2719 dout(10) << "local request " << *mdr
<< " attempt " << mdr
->attempt
<< " < " << m
->get_attempt()
2720 << ", closing out" << dendl
;
2721 mdcache
->request_finish(mdr
);
2723 } else if (mdr
->peer_to_mds
!= from
) {
2724 dout(10) << "local request " << *mdr
<< " not peer to mds." << from
<< dendl
;
2728 // may get these while mdr->peer_request is non-null
2729 if (m
->get_op() == MMDSPeerRequest::OP_DROPLOCKS
) {
2730 mds
->locker
->drop_locks(mdr
.get());
2733 if (m
->get_op() == MMDSPeerRequest::OP_FINISH
) {
2734 if (m
->is_abort()) {
2735 mdr
->aborted
= true;
2736 if (mdr
->peer_request
) {
2737 // only abort on-going xlock, wrlock and auth pin
2738 ceph_assert(!mdr
->peer_did_prepare());
2740 mdcache
->request_finish(mdr
);
2743 if (m
->inode_export
.length() > 0)
2744 mdr
->more()->inode_import
= m
->inode_export
;
2745 // finish off request.
2746 mdcache
->request_finish(mdr
);
2753 if (m
->get_op() == MMDSPeerRequest::OP_FINISH
) {
2754 dout(10) << "missing peer request for " << m
->get_reqid()
2755 << " OP_FINISH, must have lost race with a forward" << dendl
;
2758 mdr
= mdcache
->request_start_peer(m
->get_reqid(), m
->get_attempt(), m
);
2759 mdr
->set_op_stamp(m
->op_stamp
);
2761 ceph_assert(mdr
->peer_request
== 0); // only one at a time, please!
2765 mdr
->straydn
= straydn
;
2768 if (mds
->is_clientreplay() && !mds
->mdsmap
->is_clientreplay(from
) &&
2769 mdr
->locks
.empty()) {
2770 dout(3) << "not active yet, waiting" << dendl
;
2771 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
2775 mdr
->reset_peer_request(m
);
2777 dispatch_peer_request(mdr
);
2780 void Server::handle_peer_request_reply(const cref_t
<MMDSPeerRequest
> &m
)
2782 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2784 if (!mds
->is_clientreplay() && !mds
->is_active() && !mds
->is_stopping()) {
2785 metareqid_t r
= m
->get_reqid();
2786 if (!mdcache
->have_uncommitted_leader(r
, from
)) {
2787 dout(10) << "handle_peer_request_reply ignoring peer reply from mds."
2788 << from
<< " reqid " << r
<< dendl
;
2791 dout(3) << "not clientreplay|active yet, waiting" << dendl
;
2792 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
2796 if (m
->get_op() == MMDSPeerRequest::OP_COMMITTED
) {
2797 metareqid_t r
= m
->get_reqid();
2798 mdcache
->committed_leader_peer(r
, from
);
2802 MDRequestRef mdr
= mdcache
->request_get(m
->get_reqid());
2803 if (m
->get_attempt() != mdr
->attempt
) {
2804 dout(10) << "handle_peer_request_reply " << *mdr
<< " ignoring reply from other attempt "
2805 << m
->get_attempt() << dendl
;
2809 switch (m
->get_op()) {
2810 case MMDSPeerRequest::OP_XLOCKACK
:
2812 // identify lock, leader request
2813 SimpleLock
*lock
= mds
->locker
->get_lock(m
->get_lock_type(),
2814 m
->get_object_info());
2815 mdr
->more()->peers
.insert(from
);
2816 lock
->decode_locked_state(m
->get_lock_data());
2817 dout(10) << "got remote xlock on " << *lock
<< " on " << *lock
->get_parent() << dendl
;
2818 mdr
->emplace_lock(lock
, MutationImpl::LockOp::XLOCK
);
2819 mdr
->finish_locking(lock
);
2820 lock
->get_xlock(mdr
, mdr
->get_client());
2822 ceph_assert(mdr
->more()->waiting_on_peer
.count(from
));
2823 mdr
->more()->waiting_on_peer
.erase(from
);
2824 ceph_assert(mdr
->more()->waiting_on_peer
.empty());
2825 mdcache
->dispatch_request(mdr
);
2829 case MMDSPeerRequest::OP_WRLOCKACK
:
2831 // identify lock, leader request
2832 SimpleLock
*lock
= mds
->locker
->get_lock(m
->get_lock_type(),
2833 m
->get_object_info());
2834 mdr
->more()->peers
.insert(from
);
2835 dout(10) << "got remote wrlock on " << *lock
<< " on " << *lock
->get_parent() << dendl
;
2836 auto it
= mdr
->emplace_lock(lock
, MutationImpl::LockOp::REMOTE_WRLOCK
, from
);
2837 ceph_assert(it
->is_remote_wrlock());
2838 ceph_assert(it
->wrlock_target
== from
);
2840 mdr
->finish_locking(lock
);
2842 ceph_assert(mdr
->more()->waiting_on_peer
.count(from
));
2843 mdr
->more()->waiting_on_peer
.erase(from
);
2844 ceph_assert(mdr
->more()->waiting_on_peer
.empty());
2845 mdcache
->dispatch_request(mdr
);
2849 case MMDSPeerRequest::OP_AUTHPINACK
:
2850 handle_peer_auth_pin_ack(mdr
, m
);
2853 case MMDSPeerRequest::OP_LINKPREPACK
:
2854 handle_peer_link_prep_ack(mdr
, m
);
2857 case MMDSPeerRequest::OP_RMDIRPREPACK
:
2858 handle_peer_rmdir_prep_ack(mdr
, m
);
2861 case MMDSPeerRequest::OP_RENAMEPREPACK
:
2862 handle_peer_rename_prep_ack(mdr
, m
);
2865 case MMDSPeerRequest::OP_RENAMENOTIFYACK
:
2866 handle_peer_rename_notify_ack(mdr
, m
);
// Dispatch a peer-MDS sub-request carried in mdr->peer_request.
// Remote xlock/wrlock requests are serviced inline (acquire the lock,
// reply with an *ACK carrying the lock state); unlock ops release the
// matching held lock; AUTHPIN and the *PREP ops are routed to their
// dedicated handlers.
// NOTE(review): lossy extraction -- the enclosing switch, braces and
// break/return lines are missing from this view; comments describe the
// visible statements only.
2874 void Server::dispatch_peer_request(MDRequestRef
& mdr
)
2876 dout(7) << "dispatch_peer_request " << *mdr
<< " " << *mdr
->peer_request
<< dendl
;
// Abort flag set: finish the request instead of dispatching it.
2879 dout(7) << " abort flag set, finishing" << dendl
;
2880 mdcache
->request_finish(mdr
);
2884 if (logger
) logger
->inc(l_mdss_dispatch_peer_request
);
// Dispatch on the peer-request opcode (switch statement elided by the
// extraction).
2886 int op
= mdr
->peer_request
->get_op();
2888 case MMDSPeerRequest::OP_XLOCK
:
2889 case MMDSPeerRequest::OP_WRLOCK
:
// Look up the lock object named by (lock_type, object_info).
2892 SimpleLock
*lock
= mds
->locker
->get_lock(mdr
->peer_request
->get_lock_type(),
2893 mdr
->peer_request
->get_object_info());
2896 dout(10) << "don't have object, dropping" << dendl
;
2897 ceph_abort(); // can this happen, if we auth pinned properly.
// A remote xlock is only valid if we are auth for the lock's parent.
2899 if (op
== MMDSPeerRequest::OP_XLOCK
&& !lock
->get_parent()->is_auth()) {
2900 dout(10) << "not auth for remote xlock attempt, dropping on "
2901 << *lock
<< " on " << *lock
->get_parent() << dendl
;
2903 // use acquire_locks so that we get auth_pinning.
2904 MutationImpl::LockOpVec lov
;
// Rebuild the lock-op vector from the locks already held on this mdr,
// preserving xlock vs wrlock kind for each.
2905 for (const auto& p
: mdr
->locks
) {
2907 lov
.add_xlock(p
.lock
);
2908 else if (p
.is_wrlock())
2909 lov
.add_wrlock(p
.lock
);
// Add the newly requested lock and pick the matching ack opcode.
2914 case MMDSPeerRequest::OP_XLOCK
:
2915 lov
.add_xlock(lock
);
2916 replycode
= MMDSPeerRequest::OP_XLOCKACK
;
2918 case MMDSPeerRequest::OP_WRLOCK
:
2919 lov
.add_wrlock(lock
);
2920 replycode
= MMDSPeerRequest::OP_WRLOCKACK
;
2924 if (!mds
->locker
->acquire_locks(mdr
, lov
))
// Locks acquired: send the ack back on the requesting connection; an
// XLOCKACK also carries the encoded locked state.
2928 auto r
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, replycode
);
2929 r
->set_lock_type(lock
->get_type());
2930 lock
->get_parent()->set_object_info(r
->get_object_info());
2931 if (replycode
== MMDSPeerRequest::OP_XLOCKACK
)
2932 lock
->encode_locked_state(r
->get_lock_data());
2933 mds
->send_message(r
, mdr
->peer_request
->get_connection());
2937 mdr
->reset_peer_request();
2941 case MMDSPeerRequest::OP_UNXLOCK
:
2942 case MMDSPeerRequest::OP_UNWRLOCK
:
2944 SimpleLock
*lock
= mds
->locker
->get_lock(mdr
->peer_request
->get_lock_type(),
2945 mdr
->peer_request
->get_object_info());
// The lock being released must already be held on this mdr.
2947 auto it
= mdr
->locks
.find(lock
);
2948 ceph_assert(it
!= mdr
->locks
.end());
2949 bool need_issue
= false;
2951 case MMDSPeerRequest::OP_UNXLOCK
:
2952 mds
->locker
->xlock_finish(it
, mdr
.get(), &need_issue
);
2954 case MMDSPeerRequest::OP_UNWRLOCK
:
2955 mds
->locker
->wrlock_finish(it
, mdr
.get(), &need_issue
);
// Re-issue caps on the parent inode if the lock release made that
// possible (need_issue set by the *_finish call).
2959 mds
->locker
->issue_caps(static_cast<CInode
*>(lock
->get_parent()));
2961 // done. no ack necessary.
2962 mdr
->reset_peer_request();
// Remaining ops are handed to their dedicated handlers.
2966 case MMDSPeerRequest::OP_AUTHPIN
:
2967 handle_peer_auth_pin(mdr
);
2970 case MMDSPeerRequest::OP_LINKPREP
:
2971 case MMDSPeerRequest::OP_UNLINKPREP
:
2972 handle_peer_link_prep(mdr
);
2975 case MMDSPeerRequest::OP_RMDIRPREP
:
2976 handle_peer_rmdir_prep(mdr
);
2979 case MMDSPeerRequest::OP_RENAMEPREP
:
2980 handle_peer_rename_prep(mdr
);
// Service an OP_AUTHPIN from the leader MDS: resolve the requested
// objects, auth-pin those we are auth for (optionally freezing one
// inode for rename), and reply with an AUTHPINACK listing the pins we
// hold -- or with an rofs/wouldblock error mark.
// NOTE(review): lossy extraction -- braces, break/return and some
// error-path lines are missing from this view.
2988 void Server::handle_peer_auth_pin(MDRequestRef
& mdr
)
2990 dout(10) << "handle_peer_auth_pin " << *mdr
<< dendl
;
2992 // build list of objects
2993 list
<MDSCacheObject
*> objects
;
2994 CInode
*auth_pin_freeze
= NULL
;
2995 bool nonblocking
= mdr
->peer_request
->is_nonblocking();
2996 bool fail
= false, wouldblock
= false, readonly
= false;
2997 ref_t
<MMDSPeerRequest
> reply
;
// A read-only cache means we cannot take new pins.
2999 if (mdcache
->is_readonly()) {
3000 dout(10) << " read-only FS" << dendl
;
// Resolve each requested object-info to a cache object; remember which
// one (if any) the leader asked us to freeze.
3006 for (const auto &oi
: mdr
->peer_request
->get_authpins()) {
3007 MDSCacheObject
*object
= mdcache
->get_object(oi
);
3009 dout(10) << " don't have " << oi
<< dendl
;
3014 objects
.push_back(object
);
3015 if (oi
== mdr
->peer_request
->get_authpin_freeze())
3016 auth_pin_freeze
= static_cast<CInode
*>(object
);
3020 // can we auth pin them?
3022 for (const auto& obj
: objects
) {
3023 if (!obj
->is_auth()) {
3024 dout(10) << " not auth for " << *obj
<< dendl
;
3028 if (mdr
->is_auth_pinned(obj
))
3030 if (!mdr
->can_auth_pin(obj
)) {
// Non-blocking request cannot wait for a freezing object.
3032 dout(10) << " can't auth_pin (freezing?) " << *obj
<< " nonblocking" << dendl
;
// Blocking request: park on WAIT_UNFREEZE, drop our local pins, and
// retry the whole request when the object thaws.
3038 dout(10) << " waiting for authpinnable on " << *obj
<< dendl
;
3039 obj
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
3040 mdr
->drop_local_auth_pins();
3042 mds
->locker
->notify_freeze_waiter(obj
);
3049 /* freeze authpin wrong inode */
3050 if (mdr
->has_more() && mdr
->more()->is_freeze_authpin
&&
3051 mdr
->more()->rename_inode
!= auth_pin_freeze
)
3052 mdr
->unfreeze_auth_pin(true);
3054 /* handle_peer_rename_prep() call freeze_inode() to wait for all other operations
3055 * on the source inode to complete. This happens after all locks for the rename
3056 * operation are acquired. But to acquire locks, we need auth pin locks' parent
3057 * objects first. So there is an ABBA deadlock if someone auth pins the source inode
3058 * after locks are acquired and before Server::handle_peer_rename_prep() is called.
3059 * The solution is freeze the inode and prevent other MDRequests from getting new
3062 if (auth_pin_freeze
) {
3063 dout(10) << " freezing auth pin on " << *auth_pin_freeze
<< dendl
;
// Freeze not yet complete: wait for WAIT_FROZEN and flush the journal
// so in-flight operations on the inode can drain.
3064 if (!mdr
->freeze_auth_pin(auth_pin_freeze
)) {
3065 auth_pin_freeze
->add_waiter(CInode::WAIT_FROZEN
, new C_MDS_RetryRequest(mdcache
, mdr
));
3066 mds
->mdlog
->flush();
// Build the AUTHPINACK reply; on failure drop pins and mark the
// specific error (read-only fs vs would-block).
3072 reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_AUTHPINACK
);
3075 mdr
->drop_local_auth_pins(); // just in case
3077 reply
->mark_error_rofs();
3079 reply
->mark_error_wouldblock();
3082 for (const auto& obj
: objects
) {
3083 dout(10) << "auth_pinning " << *obj
<< dendl
;
3086 // return list of my auth_pins (if any)
3087 for (const auto &p
: mdr
->object_states
) {
3088 if (!p
.second
.auth_pinned
)
3090 MDSCacheObjectInfo info
;
3091 p
.first
->set_object_info(info
);
3092 reply
->get_authpins().push_back(info
);
3093 if (p
.first
== (MDSCacheObject
*)auth_pin_freeze
)
3094 auth_pin_freeze
->set_object_info(reply
->get_authpin_freeze());
3098 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
3100 // clean up this request
3101 mdr
->reset_peer_request();
// Blocked path: if the leader asked to be told about blocking, send an
// AUTHPINACK marked req_blocked (once -- the flag is then cleared).
3105 if (mdr
->peer_request
->should_notify_blocking()) {
3106 reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_AUTHPINACK
);
3107 reply
->mark_req_blocked();
3108 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
3109 mdr
->peer_request
->clear_notify_blocking();
// Leader-side handler for an AUTHPINACK from peer `from`: record which
// remote pins were granted/revoked, note errors, and re-dispatch the
// request once no peers are outstanding.
// NOTE(review): lossy extraction -- braces/returns are missing from
// this view.
3114 void Server::handle_peer_auth_pin_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &ack
)
3116 dout(10) << "handle_peer_auth_pin_ack on " << *mdr
<< " " << *ack
<< dendl
;
3117 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
3119 if (ack
->is_req_blocked()) {
3120 mdr
->disable_lock_cache();
3121 // peer auth pin is blocked, drop locks to avoid deadlock
3122 mds
->locker
->drop_locks(mdr
.get(), nullptr);
// Record each pin the peer reports as held; the special
// authpin_freeze entry marks a remotely frozen inode.
3127 set
<MDSCacheObject
*> pinned
;
3128 for (const auto &oi
: ack
->get_authpins()) {
3129 MDSCacheObject
*object
= mdcache
->get_object(oi
);
3130 ceph_assert(object
); // we pinned it
3131 dout(10) << " remote has pinned " << *object
<< dendl
;
3132 mdr
->set_remote_auth_pinned(object
, from
);
3133 if (oi
== ack
->get_authpin_freeze())
3134 mdr
->set_remote_frozen_auth_pin(static_cast<CInode
*>(object
));
3135 pinned
.insert(object
);
3138 // removed frozen auth pin ?
// An empty (default) authpin_freeze in the ack means the peer dropped
// the frozen pin on the rename inode.
3139 if (mdr
->more()->is_remote_frozen_authpin
&&
3140 ack
->get_authpin_freeze() == MDSCacheObjectInfo()) {
3141 auto stat_p
= mdr
->find_object_state(mdr
->more()->rename_inode
);
3142 ceph_assert(stat_p
);
3143 if (stat_p
->remote_auth_pinned
== from
) {
3144 mdr
->more()->is_remote_frozen_authpin
= false;
3148 // removed auth pins?
// Anything we previously recorded as pinned by `from` but absent from
// this ack has been unpinned remotely -- clear our record.
3149 for (auto& p
: mdr
->object_states
) {
3150 if (p
.second
.remote_auth_pinned
== MDS_RANK_NONE
)
3152 MDSCacheObject
* object
= p
.first
;
3153 if (p
.second
.remote_auth_pinned
== from
&& pinned
.count(object
) == 0) {
3154 dout(10) << " remote has unpinned " << *object
<< dendl
;
3155 mdr
->_clear_remote_auth_pinned(p
.second
);
3160 mdr
->more()->peers
.insert(from
);
3162 // clear from waiting list
3163 auto ret
= mdr
->more()->waiting_on_peer
.erase(from
);
// Propagate a peer-side error into the request.
3166 if (ack
->is_error_rofs()) {
3167 mdr
->more()->peer_error
= -CEPHFS_EROFS
;
3168 } else if (ack
->is_error_wouldblock()) {
3169 mdr
->more()->peer_error
= -CEPHFS_EWOULDBLOCK
;
// Re-dispatch only once every peer has acked.
3173 if (mdr
->more()->waiting_on_peer
.empty())
3174 mdcache
->dispatch_request(mdr
);
3176 dout(10) << "still waiting on peers " << mdr
->more()->waiting_on_peer
<< dendl
;
3180 // ---------------------------------------
3185 * check whether we are permitted to complete a request
3187 * Check whether we have permission to perform the operation specified
3188 * by mask on the given inode, based on the capability in the mdr's
// Check whether the caller of mdr may perform the operation described
// by `mask` on inode `in`, using the session's access checker with the
// request's uid/gid (and the setattr target uid/gid from the request
// head).  On denial (r < 0) the request is answered with the error.
// NOTE(review): lossy extraction -- some argument/return lines are
// missing from this view (e.g. the `in, mask` arguments to
// session->check_access()).
3191 bool Server::check_access(MDRequestRef
& mdr
, CInode
*in
, unsigned mask
)
3194 int r
= mdr
->session
->check_access(
3196 mdr
->client_request
->get_caller_uid(),
3197 mdr
->client_request
->get_caller_gid(),
3198 &mdr
->client_request
->get_caller_gid_list(),
3199 mdr
->client_request
->head
.args
.setattr
.uid
,
3200 mdr
->client_request
->head
.args
.setattr
.gid
);
// Denied: reply with the error code from the access check.
3202 respond_to_request(mdr
, r
);
3210 * check whether fragment has reached maximum size
// Check whether dirfrag `dir` still has room below the configured
// bal_fragment_size_max; if it is over the limit the request is
// answered with -CEPHFS_ENOSPC.  (Lossy extraction: the surrounding
// if/return lines are missing from this view.)
3213 bool Server::check_fragment_space(MDRequestRef
&mdr
, CDir
*dir
)
3215 const auto size
= dir
->get_frag_size();
3216 const auto max
= bal_fragment_size_max
;
// Over the limit: reject the request with ENOSPC.
3218 dout(10) << "fragment " << *dir
<< " size exceeds " << max
<< " (CEPHFS_ENOSPC)" << dendl
;
3219 respond_to_request(mdr
, -CEPHFS_ENOSPC
);
3222 dout(20) << "fragment " << *dir
<< " size " << size
<< " < " << max
<< dendl
;
3229 * check whether entries in a dir reached maximum size
// Check whether directory `in` has reached the configured
// dir_max_entries cap (files + subdirs from the projected dirstat);
// if so the request is answered with -ENOSPC.  A zero dir_max_entries
// disables the check.
3232 bool Server::check_dir_max_entries(MDRequestRef
&mdr
, CDir
*in
)
// Entry count = projected nfiles + nsubdirs.
3234 const uint64_t size
= in
->inode
->get_projected_inode()->dirstat
.nfiles
+
3235 in
->inode
->get_projected_inode()->dirstat
.nsubdirs
;
3236 if (dir_max_entries
&& size
>= dir_max_entries
) {
3237 dout(10) << "entries per dir " << *in
<< " size exceeds " << dir_max_entries
<< " (ENOSPC)" << dendl
;
3238 respond_to_request(mdr
, -ENOSPC
);
// Prepare (or reuse) the stray dentry for inode `in`: compute the
// stray name, find the stray dir, verify it has space and is not
// frozen, then create a new null dentry (marked STATE_STRAY) and cache
// it on mdr->straydn.  Returns null (via elided paths) when it must
// wait or fail.  NOTE(review): lossy extraction -- braces and return
// lines are missing from this view.
3245 CDentry
* Server::prepare_stray_dentry(MDRequestRef
& mdr
, CInode
*in
)
3248 in
->name_stray_dentry(straydname
);
// If a stray dentry was already prepared for this request, it must
// match the computed name.
3250 CDentry
*straydn
= mdr
->straydn
;
3252 ceph_assert(straydn
->get_name() == straydname
);
3255 CDir
*straydir
= mdcache
->get_stray_dir(in
);
// Replay requests skip the fragment-space check.
3257 if (!mdr
->client_request
->is_replay() &&
3258 !check_fragment_space(mdr
, straydir
))
3261 straydn
= straydir
->lookup(straydname
);
// Frozen stray dir: drop locks/pins and retry after WAIT_UNFREEZE.
3263 if (straydir
->is_frozen_dir()) {
3264 dout(10) << __func__
<< ": " << *straydir
<< " is frozen, waiting" << dendl
;
3265 mds
->locker
->drop_locks(mdr
.get());
3266 mdr
->drop_local_auth_pins();
3267 straydir
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
3270 straydn
= straydir
->add_null_dentry(straydname
);
3271 straydn
->mark_new();
3273 ceph_assert(straydn
->get_projected_linkage()->is_null());
3276 straydn
->state_set(CDentry::STATE_STRAY
);
// Cache on the request so a retry reuses the same dentry.
3277 mdr
->straydn
= straydn
;
3283 /** prepare_new_inode
3285 * create a new inode. set c/m/atime. hit dir pop.
// Create and initialize a brand-new CInode for a create/mkdir/etc.
// request: allocate an ino (session prealloc pool or inotable), refill
// the session's prealloc pool when low, set mode/layout/owner/times,
// apply client-supplied xattrs, and add the inode to the cache.
// `useino` is the ino the client asked for (0 = any); `layout`
// overrides the default file layout for non-directories.
// NOTE(review): lossy extraction -- braces, else/return lines and some
// stream continuations are missing from this view.
3287 CInode
* Server::prepare_new_inode(MDRequestRef
& mdr
, CDir
*dir
, inodeno_t useino
, unsigned mode
,
3288 const file_layout_t
*layout
)
3290 CInode
*in
= new CInode(mdcache
);
3291 auto _inode
= in
->_get_inode();
3293 // Server::prepare_force_open_sessions() can re-open session in closing
3294 // state. In that corner case, session's prealloc_inos are being freed.
3295 // To simplify the code, we disallow using/refilling session's prealloc_ino
3296 // while session is opening.
3297 bool allow_prealloc_inos
= mdr
->session
->is_open();
// Try the session's preallocated-ino pool first; take_ino() honors
// useino when it is in the pool.
3300 if (allow_prealloc_inos
&& (mdr
->used_prealloc_ino
= _inode
->ino
= mdr
->session
->take_ino(useino
))) {
3301 mds
->sessionmap
.mark_projected(mdr
->session
);
3302 dout(10) << "prepare_new_inode used_prealloc " << mdr
->used_prealloc_ino
3303 << " (" << mdr
->session
->info
.prealloc_inos
.size() << " left)"
// Fallback: project a fresh allocation from the inotable.
3307 _inode
->ino
= mds
->inotable
->project_alloc_id(useino
);
3308 dout(10) << "prepare_new_inode alloc " << mdr
->alloc_ino
<< dendl
;
// Client asked for a specific ino but got a different one -- warn
// loudly (not fatal).
3311 if (useino
&& useino
!= _inode
->ino
) {
3312 dout(0) << "WARNING: client specified " << useino
<< " and i allocated " << _inode
->ino
<< dendl
;
3313 mds
->clog
->error() << mdr
->client_request
->get_source()
3314 << " specified ino " << useino
3315 << " but mds." << mds
->get_nodeid() << " allocated " << _inode
->ino
;
3316 //ceph_abort(); // just for now.
// Refill the session's prealloc pool when it drops below half of
// mds_client_prealloc_inos.
3319 if (allow_prealloc_inos
&&
3320 mdr
->session
->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos
/ 2) {
3321 int need
= g_conf()->mds_client_prealloc_inos
- mdr
->session
->get_num_projected_prealloc_inos();
3322 mds
->inotable
->project_alloc_ids(mdr
->prealloc_inos
, need
);
3323 ceph_assert(mdr
->prealloc_inos
.size()); // or else fix projected increment semantics
3324 mdr
->session
->pending_prealloc_inos
.insert(mdr
->prealloc_inos
);
3325 mds
->sessionmap
.mark_projected(mdr
->session
);
3326 dout(10) << "prepare_new_inode prealloc " << mdr
->prealloc_inos
<< dendl
;
// Fresh-inode bookkeeping: first version, one link, caller's mode.
3329 _inode
->version
= 1;
3330 _inode
->xattr_version
= 1;
3331 _inode
->nlink
= 1; // FIXME
3333 _inode
->mode
= mode
;
3335 // FIPS zeroization audit 20191117: this memset is not security related.
3336 memset(&_inode
->dir_layout
, 0, sizeof(_inode
->dir_layout
));
// Directories get the default dir-hash; files get the caller's layout
// or the cache-wide default.
3337 if (_inode
->is_dir()) {
3338 _inode
->dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
3339 } else if (layout
) {
3340 _inode
->layout
= *layout
;
3342 _inode
->layout
= mdcache
->default_file_layout
;
3345 _inode
->truncate_size
= -1ull; // not truncated, yet!
3346 _inode
->truncate_seq
= 1; /* starting with 1, 0 is kept for no-truncation logic */
3348 CInode
*diri
= dir
->get_inode();
3350 dout(10) << oct
<< " dir mode 0" << diri
->get_inode()->mode
<< " new mode 0" << mode
<< dec
<< dendl
;
// Setgid directory: inherit the group; a new subdir also inherits the
// setgid bit.  Otherwise the caller's gid is used.
3352 if (diri
->get_inode()->mode
& S_ISGID
) {
3353 dout(10) << " dir is sticky" << dendl
;
3354 _inode
->gid
= diri
->get_inode()->gid
;
3355 if (S_ISDIR(mode
)) {
3356 dout(10) << " new dir also sticky" << dendl
;
3357 _inode
->mode
|= S_ISGID
;
3360 _inode
->gid
= mdr
->client_request
->get_caller_gid();
3362 _inode
->uid
= mdr
->client_request
->get_caller_uid();
// All four timestamps start at the request's op stamp.
3364 _inode
->btime
= _inode
->ctime
= _inode
->mtime
= _inode
->atime
=
3365 mdr
->get_op_stamp();
3367 _inode
->change_attr
= 0;
// Client may ship an initial xattr map in the request payload.
3369 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
3370 if (req
->get_data().length()) {
3371 auto p
= req
->get_data().cbegin();
3373 // xattrs on new inode?
3374 auto _xattrs
= CInode::allocate_xattr_map();
3375 decode_noshare(*_xattrs
, p
);
3376 dout(10) << "prepare_new_inode setting xattrs " << *_xattrs
<< dendl
;
// Presence of "encryption.ctx" marks the inode as fscrypt-protected.
3377 if (_xattrs
->count("encryption.ctx")) {
3378 _inode
->fscrypt
= true;
3380 in
->reset_xattrs(std::move(_xattrs
));
// Disable inline data unless both the mdsmap and the client connection
// support it.
3383 if (!mds
->mdsmap
->get_inline_data_enabled() ||
3384 !mdr
->session
->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
))
3385 _inode
->inline_data
.version
= CEPH_INLINE_NONE
;
3387 mdcache
->add_inode(in
); // add
3388 dout(10) << "prepare_new_inode " << *in
<< dendl
;
// Record the inode-number allocations made for this request
// (alloc_ino, used_prealloc_ino, prealloc set -- some argument lines
// elided by this extraction) into the journal EMetaBlob, together with
// the projected sessionmap and inotable versions they apply to.
3392 void Server::journal_allocated_inos(MDRequestRef
& mdr
, EMetaBlob
*blob
)
3394 dout(20) << "journal_allocated_inos sessionmapv " << mds
->sessionmap
.get_projected()
3395 << " inotablev " << mds
->inotable
->get_projected_version()
3397 blob
->set_ino_alloc(mdr
->alloc_ino
,
3398 mdr
->used_prealloc_ino
,
3400 mdr
->client_request
->get_source(),
3401 mds
->sessionmap
.get_projected(),
3402 mds
->inotable
->get_projected_version());
// Commit the projected ino allocations for this request: apply the
// direct inotable allocation, move freshly preallocated inos from
// pending into the session's pools, and consume a used prealloc ino
// from the session -- dirtying the sessionmap as needed.
3405 void Server::apply_allocated_inos(MDRequestRef
& mdr
, Session
*session
)
3407 dout(10) << "apply_allocated_inos " << mdr
->alloc_ino
3408 << " / " << mdr
->prealloc_inos
3409 << " / " << mdr
->used_prealloc_ino
<< dendl
;
// Directly allocated ino: commit it in the inotable.
3411 if (mdr
->alloc_ino
) {
3412 mds
->inotable
->apply_alloc_id(mdr
->alloc_ino
);
// Newly preallocated inos: pending -> free + session info, then commit
// in the inotable.
3414 if (mdr
->prealloc_inos
.size()) {
3415 ceph_assert(session
);
3416 session
->pending_prealloc_inos
.subtract(mdr
->prealloc_inos
);
3417 session
->free_prealloc_inos
.insert(mdr
->prealloc_inos
);
3418 session
->info
.prealloc_inos
.insert(mdr
->prealloc_inos
);
// Avoid a double mark_dirty when a used_prealloc_ino will dirty the
// session again below.
3419 mds
->sessionmap
.mark_dirty(session
, !mdr
->used_prealloc_ino
);
3420 mds
->inotable
->apply_alloc_ids(mdr
->prealloc_inos
);
// Consumed prealloc ino: remove it from the session's pool.
3422 if (mdr
->used_prealloc_ino
) {
3423 ceph_assert(session
);
3424 session
->info
.prealloc_inos
.erase(mdr
->used_prealloc_ino
);
3425 mds
->sessionmap
.mark_dirty(session
);
// Completion used after mdcache->find_ino_peers(): on -CEPHFS_ESTALE
// (ino not found anywhere) the request is failed with that error;
// otherwise the client request is re-dispatched.  (Lossy extraction:
// the member declaration and else line are missing from this view.)
3429 class C_MDS_TryFindInode
: public ServerContext
{
3432 C_MDS_TryFindInode(Server
*s
, MDRequestRef
& r
) : ServerContext(s
), mdr(r
) {}
3433 void finish(int r
) override
{
3434 if (r
== -CEPHFS_ESTALE
) // :( find_ino_peers failed
3435 server
->respond_to_request(mdr
, r
);
3437 server
->dispatch_client_request(mdr
);
3441 /* If this returns null, the request has been handled
3442 * as appropriate: forwarded on, or the client's been replied to */
// Traverse mdr's filepath, rdlocking the path/snap as requested, and
// pin the referenced inode.  Returns the inode, or null when the
// request was delayed, forwarded, or answered with an error.
// NOTE(review): lossy extraction -- the remaining parameters
// (want_auth etc.), braces and return lines are missing from this
// view.
3443 CInode
* Server::rdlock_path_pin_ref(MDRequestRef
& mdr
,
3447 const filepath
& refpath
= mdr
->get_filepath();
3448 dout(10) << "rdlock_path_pin_ref " << *mdr
<< " " << refpath
<< dendl
;
// Path already locked on a previous pass: nothing to traverse.
3450 if (mdr
->locking_state
& MutationImpl::PATH_LOCKED
)
3454 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, true);
3456 if (refpath
.is_last_snap()) {
// Build the traverse flags: rdlock path+snap, and WANT_AUTH unless
// suppressed.
3460 if (!no_want_auth
&& forward_all_requests_to_auth
)
3462 flags
|= MDS_TRAVERSE_RDLOCK_PATH
| MDS_TRAVERSE_RDLOCK_SNAP
;
3465 flags
|= MDS_TRAVERSE_WANT_AUTH
;
3466 int r
= mdcache
->path_traverse(mdr
, cf
, refpath
, flags
, &mdr
->dn
[0], &mdr
->in
[0]);
3468 return nullptr; // delayed
3469 if (r
< 0) { // error
// ENOENT with a partial dentry trace: reply with the trace dentry if
// the client asked for one.
3470 if (r
== -CEPHFS_ENOENT
&& !mdr
->dn
[0].empty()) {
3471 if (mdr
->client_request
&&
3472 mdr
->client_request
->get_dentry_wanted())
3473 mdr
->tracedn
= mdr
->dn
[0].back();
3474 respond_to_request(mdr
, r
);
// ESTALE: ask peers whether anyone knows the ino, then retry.
3475 } else if (r
== -CEPHFS_ESTALE
) {
3476 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl
;
3477 MDSContext
*c
= new C_MDS_TryFindInode(this, mdr
);
3478 mdcache
->find_ino_peers(refpath
.get_ino(), c
);
3480 dout(10) << "FAIL on error " << r
<< dendl
;
3481 respond_to_request(mdr
, r
);
3485 CInode
*ref
= mdr
->in
[0];
3486 dout(10) << "ref is " << *ref
<< dendl
;
3490 // do NOT proceed if freezing, as cap release may defer in that case, and
3491 // we could deadlock when we try to lock @ref.
3492 // if we're already auth_pinned, continue; the release has already been processed.
3493 if (ref
->is_frozen() || ref
->is_frozen_auth_pin() ||
3494 (ref
->is_freezing() && !mdr
->is_auth_pinned(ref
))) {
3495 dout(7) << "waiting for !frozen/authpinnable on " << *ref
<< dendl
;
3496 ref
->add_waiter(CInode::WAIT_UNFREEZE
, cf
.build());
// Tell the locker we are waiting so the freeze can make progress.
3497 if (mdr
->is_any_remote_auth_pin())
3498 mds
->locker
->notify_freeze_waiter(ref
);
3510 /** rdlock_path_xlock_dentry
3511 * traverse path to the directory that could/would contain dentry.
3512 * make sure i am auth for that dentry, forward as necessary.
3513 * create null dentry in place (or use existing if okexist).
3514 * get rdlocks on traversed dentries, xlock on new dentry.
// Traverse to the parent directory of mdr's filepath and return the
// (possibly new null) target dentry with the path rdlocked and the
// dentry xlocked.  Validates the path (non-empty, not a snap, not
// '.'/'..'), rejects system/stray parents, and enforces the
// create/okexist semantics on the final linkage.
// NOTE(review): lossy extraction -- braces, returns and several
// condition lines are missing from this view.
3516 CDentry
* Server::rdlock_path_xlock_dentry(MDRequestRef
& mdr
,
3517 bool create
, bool okexist
, bool want_layout
)
3519 const filepath
& refpath
= mdr
->get_filepath();
3520 dout(10) << "rdlock_path_xlock_dentry " << *mdr
<< " " << refpath
<< dendl
;
// Already locked on a previous pass: reuse the traced dentry.
3522 if (mdr
->locking_state
& MutationImpl::PATH_LOCKED
)
3523 return mdr
->dn
[0].back();
3525 // figure parent dir vs dname
3526 if (refpath
.depth() == 0) {
3527 dout(7) << "invalid path (zero length)" << dendl
;
3528 respond_to_request(mdr
, -CEPHFS_EINVAL
);
// Snapshots are read-only targets.
3532 if (refpath
.is_last_snap()) {
3533 respond_to_request(mdr
, -CEPHFS_EROFS
);
// '.'/'..' as the final component is rejected (EEXIST vs ENOTEMPTY on
// elided branches).
3537 if (refpath
.is_last_dot_or_dotdot()) {
3538 dout(7) << "invalid path (last dot or dot_dot)" << dendl
;
3540 respond_to_request(mdr
, -CEPHFS_EEXIST
);
3542 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
3546 // traverse to parent dir
3547 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, true);
3548 int flags
= MDS_TRAVERSE_RDLOCK_SNAP
| MDS_TRAVERSE_RDLOCK_PATH
|
3549 MDS_TRAVERSE_WANT_DENTRY
| MDS_TRAVERSE_XLOCK_DENTRY
|
3550 MDS_TRAVERSE_WANT_AUTH
;
// Depth-1 paths may use the lock cache unless disabled for this mdr.
3551 if (refpath
.depth() == 1 && !mdr
->lock_cache_disabled
)
3552 flags
|= MDS_TRAVERSE_CHECK_LOCKCACHE
;
3554 flags
|= MDS_TRAVERSE_RDLOCK_AUTHLOCK
;
3556 flags
|= MDS_TRAVERSE_WANT_DIRLAYOUT
;
3557 int r
= mdcache
->path_traverse(mdr
, cf
, refpath
, flags
, &mdr
->dn
[0]);
3559 return nullptr; // delayed
// ESTALE: probe peers for the ino and retry.
3561 if (r
== -CEPHFS_ESTALE
) {
3562 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl
;
3563 mdcache
->find_ino_peers(refpath
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
3566 respond_to_request(mdr
, r
);
3570 CDentry
*dn
= mdr
->dn
[0].back();
3571 CDir
*dir
= dn
->get_dir();
3572 CInode
*diri
= dir
->get_inode();
// Client (non-MDS) requests may not modify system dirs or stray
// subtrees.
3574 if (!mdr
->reqid
.name
.is_mds()) {
3575 if (diri
->is_system() && !diri
->is_root()) {
3576 respond_to_request(mdr
, -CEPHFS_EROFS
);
3581 if (!diri
->is_base() && diri
->get_projected_parent_dir()->inode
->is_stray()) {
3582 respond_to_request(mdr
, -CEPHFS_ENOENT
);
3586 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
// Null linkage: target does not exist.  Lookup-style callers
// (!create && okexist) get ENOENT; creators bump dn->first past the
// newest global snap.
3587 if (dnl
->is_null()) {
3588 if (!create
&& okexist
) {
3589 respond_to_request(mdr
, -CEPHFS_ENOENT
);
3593 snapid_t next_snap
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
3594 dn
->first
= std::max(dn
->first
, next_snap
);
// Existing target without okexist: EEXIST (condition elided).
3597 respond_to_request(mdr
, -CEPHFS_EEXIST
);
3600 mdr
->in
[0] = dnl
->get_inode();
3606 /** rdlock_two_paths_xlock_destdn
3607 * traverse two paths and lock the two paths in proper order.
3608 * The order of taking locks is:
3609 * 1. Lock directory inodes or dentries according to which trees they
3610 * are under. Lock objects under fs root before objects under mdsdir.
3611 * 2. Lock directory inodes or dentries according to their depth, in
3613 * 3. Lock directory inodes or dentries according to inode numbers or
3614 * dentries' parent inode numbers, in ascending order.
3615 * 4. Lock dentries in the same directory in order of their keys.
3616 * 5. Lock non-directory inodes according to inode numbers, in ascending
// Traverse two filepaths (link/rename style), then acquire all locks
// in the global ordering documented above this function: dirs before
// dentries, ordered by tree/depth/ino/key, with remote wrlocks for a
// non-local srcdir and an optional auth-pin freeze on a non-auth
// primary source inode.  Returns (destdn, srcdn), or (null, null) on
// delay/error.  NOTE(review): lossy extraction -- braces, returns and
// some condition lines are missing from this view.
3619 std::pair
<CDentry
*, CDentry
*>
3620 Server::rdlock_two_paths_xlock_destdn(MDRequestRef
& mdr
, bool xlock_srcdn
)
3623 const filepath
& refpath
= mdr
->get_filepath();
3624 const filepath
& refpath2
= mdr
->get_filepath2();
3626 dout(10) << "rdlock_two_paths_xlock_destdn " << *mdr
<< " " << refpath
<< " " << refpath2
<< dendl
;
// Already locked: reuse both traced dentries.
3628 if (mdr
->locking_state
& MutationImpl::PATH_LOCKED
)
3629 return std::make_pair(mdr
->dn
[0].back(), mdr
->dn
[1].back());
// Both paths must be single-component (dir ino + dname).
3631 if (refpath
.depth() != 1 || refpath2
.depth() != 1) {
3632 respond_to_request(mdr
, -CEPHFS_EINVAL
);
3633 return std::pair
<CDentry
*, CDentry
*>(nullptr, nullptr);
3636 if (refpath
.is_last_snap() || refpath2
.is_last_snap()) {
3637 respond_to_request(mdr
, -CEPHFS_EROFS
);
3638 return std::make_pair(nullptr, nullptr);
3641 // traverse to parent dir
3642 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, true);
// First path: want the dentry, require auth.
3643 int flags
= MDS_TRAVERSE_RDLOCK_SNAP
| MDS_TRAVERSE_WANT_DENTRY
| MDS_TRAVERSE_WANT_AUTH
;
3644 int r
= mdcache
->path_traverse(mdr
, cf
, refpath
, flags
, &mdr
->dn
[0]);
3646 if (r
== -CEPHFS_ESTALE
) {
3647 dout(10) << "CEPHFS_ESTALE on path, attempting recovery" << dendl
;
3648 mdcache
->find_ino_peers(refpath
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
3650 respond_to_request(mdr
, r
);
3652 return std::make_pair(nullptr, nullptr);
// Second path: may be discovered from a remote MDS.
3655 flags
= MDS_TRAVERSE_RDLOCK_SNAP2
| MDS_TRAVERSE_WANT_DENTRY
| MDS_TRAVERSE_DISCOVER
;
3656 r
= mdcache
->path_traverse(mdr
, cf
, refpath2
, flags
, &mdr
->dn
[1]);
3658 if (r
== -CEPHFS_ESTALE
) {
3659 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl
;
3660 mdcache
->find_ino_peers(refpath2
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
3662 respond_to_request(mdr
, r
);
3664 return std::make_pair(nullptr, nullptr);
// dn[0] traces the destination, dn[1] the source.
3667 CDentry
*srcdn
= mdr
->dn
[1].back();
3668 CDir
*srcdir
= srcdn
->get_dir();
3669 CDentry
*destdn
= mdr
->dn
[0].back();
3670 CDir
*destdir
= destdn
->get_dir();
// Client (non-MDS) requests may not touch system dirs or stray
// destinations.
3672 if (!mdr
->reqid
.name
.is_mds()) {
3673 if ((srcdir
->get_inode()->is_system() && !srcdir
->get_inode()->is_root()) ||
3674 (destdir
->get_inode()->is_system() && !destdir
->get_inode()->is_root())) {
3675 respond_to_request(mdr
, -CEPHFS_EROFS
);
3676 return std::make_pair(nullptr, nullptr);
3680 if (!destdir
->get_inode()->is_base() &&
3681 destdir
->get_inode()->get_projected_parent_dir()->inode
->is_stray()) {
3682 respond_to_request(mdr
, -CEPHFS_ENOENT
);
3683 return std::make_pair(nullptr, nullptr);
3686 MutationImpl::LockOpVec lov
;
// Same parent inode: one set of dir wrlocks, remote wrlocks if the
// (fragment) srcdir is not local, dentries ordered by key.
3687 if (srcdir
->get_inode() == destdir
->get_inode()) {
3688 lov
.add_wrlock(&destdir
->inode
->filelock
);
3689 lov
.add_wrlock(&destdir
->inode
->nestlock
);
3690 if (xlock_srcdn
&& srcdir
!= destdir
) {
3691 mds_rank_t srcdir_auth
= srcdir
->authority().first
;
3692 if (srcdir_auth
!= mds
->get_nodeid()) {
3693 lov
.add_remote_wrlock(&srcdir
->inode
->filelock
, srcdir_auth
);
3694 lov
.add_remote_wrlock(&srcdir
->inode
->nestlock
, srcdir_auth
);
// Dentry lock order within the same directory follows name order.
3698 if (srcdn
->get_name() > destdn
->get_name())
3699 lov
.add_xlock(&destdn
->lock
);
3702 lov
.add_xlock(&srcdn
->lock
);
3704 lov
.add_rdlock(&srcdn
->lock
);
3706 if (srcdn
->get_name() < destdn
->get_name())
3707 lov
.add_xlock(&destdn
->lock
);
// Different parents: order the two directory lock groups by path
// comparison, then by ino as a tie-breaker.
3709 int cmp
= mdr
->compare_paths();
3710 bool lock_destdir_first
=
3711 (cmp
< 0 || (cmp
== 0 && destdir
->ino() < srcdir
->ino()));
3713 if (lock_destdir_first
) {
3714 lov
.add_wrlock(&destdir
->inode
->filelock
);
3715 lov
.add_wrlock(&destdir
->inode
->nestlock
);
3716 lov
.add_xlock(&destdn
->lock
);
// srcdir locks: local wrlocks if we are auth, otherwise remote
// wrlocks against the srcdir's auth MDS.
3720 mds_rank_t srcdir_auth
= srcdir
->authority().first
;
3721 if (srcdir_auth
== mds
->get_nodeid()) {
3722 lov
.add_wrlock(&srcdir
->inode
->filelock
);
3723 lov
.add_wrlock(&srcdir
->inode
->nestlock
);
3725 lov
.add_remote_wrlock(&srcdir
->inode
->filelock
, srcdir_auth
);
3726 lov
.add_remote_wrlock(&srcdir
->inode
->nestlock
, srcdir_auth
);
3728 lov
.add_xlock(&srcdn
->lock
);
3730 lov
.add_rdlock(&srcdn
->lock
);
3733 if (!lock_destdir_first
) {
3734 lov
.add_wrlock(&destdir
->inode
->filelock
);
3735 lov
.add_wrlock(&destdir
->inode
->nestlock
);
3736 lov
.add_xlock(&destdn
->lock
);
3740 CInode
*auth_pin_freeze
= nullptr;
3741 // XXX any better way to do this?
// Non-auth source dentry to be xlocked: ask the auth to freeze the
// primary inode's auth-pin (see handle_peer_auth_pin).
3742 if (xlock_srcdn
&& !srcdn
->is_auth()) {
3743 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
3744 auth_pin_freeze
= srcdnl
->is_primary() ? srcdnl
->get_inode() : nullptr;
3746 if (!mds
->locker
->acquire_locks(mdr
, lov
, auth_pin_freeze
))
3747 return std::make_pair(nullptr, nullptr);
// Source must exist once locked.
3749 if (srcdn
->get_projected_linkage()->is_null()) {
3750 respond_to_request(mdr
, -CEPHFS_ENOENT
);
3751 return std::make_pair(nullptr, nullptr);
// New (null) destination: bump first past the newest global snap.
3754 if (destdn
->get_projected_linkage()->is_null()) {
3755 snapid_t next_snap
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
3756 destdn
->first
= std::max(destdn
->first
, next_snap
);
3759 mdr
->locking_state
|= MutationImpl::PATH_LOCKED
;
3761 return std::make_pair(destdn
, srcdn
);
3765 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
3767 * @param diri base inode
3768 * @param fg the exact frag we want
3769 * @param mdr request
3770 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
// Open dirfrag `fg` of `diri` locally, or forward the request to the
// MDS that is auth for it.  Returns the dirfrag, or null when the
// request was forwarded or must wait (frozen inode).
3772 CDir
* Server::try_open_auth_dirfrag(CInode
*diri
, frag_t fg
, MDRequestRef
& mdr
)
3774 CDir
*dir
= diri
->get_dirfrag(fg
);
3777 // am i auth for the dirfrag?
// Dirfrag exists but is foreign: forward to its auth.
3778 if (!dir
->is_auth()) {
3779 mds_rank_t auth
= dir
->authority().first
;
3780 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3781 << ", fw to mds." << auth
<< dendl
;
3782 mdcache
->request_forward(mdr
, auth
);
3786 // not open and inode not mine?
// Dirfrag not open and the inode is foreign: forward to the inode's
// auth, which can open it.
3787 if (!diri
->is_auth()) {
3788 mds_rank_t inauth
= diri
->authority().first
;
3789 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth
<< dendl
;
3790 mdcache
->request_forward(mdr
, inauth
);
3794 // not open and inode frozen?
3795 if (diri
->is_frozen()) {
3796 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri
<< dendl
;
3797 ceph_assert(diri
->get_parent_dir());
3798 diri
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
// Local, unfrozen: open (or create) the dirfrag.
3803 dir
= diri
->get_or_open_dirfrag(mdcache
, fg
);
3810 // ===============================================================================
// Handle a client getattr (or lookup, when is_lookup): optionally
// batch identical in-flight getattr/lookup ops behind one head
// request, traverse and pin the target, take only the rdlocks not
// covered by the client's own EXCL caps, then reply with the inode
// (and dentry trace for lookups).
// NOTE(review): lossy extraction -- braces, returns and several
// condition lines are missing from this view.
3813 void Server::handle_client_getattr(MDRequestRef
& mdr
, bool is_lookup
)
3815 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
3817 if (req
->get_filepath().depth() == 0 && is_lookup
) {
3818 // refpath can't be empty for lookup but it can for
3819 // getattr (we do getattr with empty refpath for mount of '/')
3820 respond_to_request(mdr
, -CEPHFS_EINVAL
);
3824 bool want_auth
= false;
3825 int mask
= req
->head
.args
.getattr
.mask
;
3826 if (mask
& CEPH_STAT_RSTAT
)
3827 want_auth
= true; // set want_auth for CEPH_STAT_RSTAT mask
// Batching: if a same-mask getattr/lookup is already in flight on
// this dentry/inode, attach this mdr to that batch head instead of
// running it independently.
3829 if (!mdr
->is_batch_head() && mdr
->can_batch()) {
3830 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, false);
3831 int r
= mdcache
->path_traverse(mdr
, cf
, mdr
->get_filepath(),
3832 (want_auth
? MDS_TRAVERSE_WANT_AUTH
: 0),
3833 &mdr
->dn
[0], &mdr
->in
[0]);
3838 // fall-thru. let rdlock_path_pin_ref() check again.
// Lookup ops batch on the dentry's batch_ops map, keyed by mask.
3839 } else if (is_lookup
) {
3840 CDentry
* dn
= mdr
->dn
[0].back();
3842 auto em
= dn
->batch_ops
.emplace(std::piecewise_construct
, std::forward_as_tuple(mask
), std::forward_as_tuple());
3844 em
.first
->second
= std::make_unique
<Batch_Getattr_Lookup
>(this, mdr
);
3846 dout(20) << __func__
<< ": LOOKUP op, wait for previous same getattr ops to respond. " << *mdr
<< dendl
;
3847 em
.first
->second
->add_request(mdr
);
// Getattr ops batch on the inode's batch_ops map.
3851 CInode
*in
= mdr
->in
[0];
3853 auto em
= in
->batch_ops
.emplace(std::piecewise_construct
, std::forward_as_tuple(mask
), std::forward_as_tuple());
3855 em
.first
->second
= std::make_unique
<Batch_Getattr_Lookup
>(this, mdr
);
3857 dout(20) << __func__
<< ": GETATTR op, wait for previous same getattr ops to respond. " << *mdr
<< dendl
;
3858 em
.first
->second
->add_request(mdr
);
3864 CInode
*ref
= rdlock_path_pin_ref(mdr
, want_auth
, false);
3868 mdr
->getattr_caps
= mask
;
3871 * if client currently holds the EXCL cap on a field, do not rdlock
3872 * it; client's stat() will result in valid info if _either_ EXCL
3873 * cap is held or MDS rdlocks and reads the value here.
3875 * handling this case here is easier than weakening rdlock
3876 * semantics... that would cause problems elsewhere.
3878 client_t client
= mdr
->get_client();
3880 Capability
*cap
= ref
->get_client_cap(client
);
3881 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
||
3882 mdr
->snapid
<= cap
->client_follows
))
3883 issued
= cap
->issued();
// Per-field rdlocks, skipped wherever the client's issued caps
// already guarantee a valid value.
3886 MutationImpl::LockOpVec lov
;
3887 if ((mask
& CEPH_CAP_LINK_SHARED
) && !(issued
& CEPH_CAP_LINK_EXCL
))
3888 lov
.add_rdlock(&ref
->linklock
);
3889 if ((mask
& CEPH_CAP_AUTH_SHARED
) && !(issued
& CEPH_CAP_AUTH_EXCL
))
3890 lov
.add_rdlock(&ref
->authlock
);
3891 if ((mask
& CEPH_CAP_XATTR_SHARED
) && !(issued
& CEPH_CAP_XATTR_EXCL
))
3892 lov
.add_rdlock(&ref
->xattrlock
);
3893 if ((mask
& CEPH_CAP_FILE_SHARED
) && !(issued
& CEPH_CAP_FILE_EXCL
)) {
3894 // Don't wait on unstable filelock if client is allowed to read file size.
3895 // This can reduce the response time of getattr in the case that multiple
3896 // clients do stat(2) and there are writers.
3897 // The downside of this optimization is that mds may not issue Fs caps along
3898 // with getattr reply. Client may need to send more getattr requests.
3899 if (mdr
->is_rdlocked(&ref
->filelock
)) {
3900 lov
.add_rdlock(&ref
->filelock
);
3901 } else if (ref
->filelock
.is_stable() ||
3902 ref
->filelock
.get_num_wrlocks() > 0 ||
3903 !ref
->filelock
.can_read(mdr
->get_client())) {
3904 lov
.add_rdlock(&ref
->filelock
);
// Force re-acquisition on retry since we may skip the filelock.
3905 mdr
->locking_state
&= ~MutationImpl::ALL_LOCKED
;
3909 if (!mds
->locker
->acquire_locks(mdr
, lov
))
3912 if (!check_access(mdr
, ref
, MAY_READ
))
3915 utime_t now
= ceph_clock_now();
3916 mdr
->set_mds_stamp(now
);
3918 // note which caps are requested, so we return at least a snapshot
3919 // value for them. (currently this matters for xattrs and inline data)
3920 mdr
->getattr_caps
= mask
;
// Feed the balancer's read popularity for this inode.
3922 mds
->balancer
->hit_inode(ref
, META_POP_IRD
, req
->get_source().num());
3925 dout(10) << "reply to stat on " << *req
<< dendl
;
// Lookups also return the dentry trace.
3928 mdr
->tracedn
= mdr
->dn
[0].back();
3929 respond_to_request(mdr
, 0);
// Completion for the open_ino() issued by handle_client_lookup_ino():
// forwards the open_ino result into Server::_lookup_ino_2.  (Lossy
// extraction: the member declaration is missing from this view.)
3932 struct C_MDS_LookupIno2
: public ServerContext
{
3934 C_MDS_LookupIno2(Server
*s
, MDRequestRef
& r
) : ServerContext(s
), mdr(r
) {}
3935 void finish(int r
) override
{
3936 server
->_lookup_ino_2(mdr
, r
);
// LOOKUPINO (and the lookup-parent / lookup-name variants): resolve
// an inode the client knows only by inode number.  want_parent makes
// the reply describe the parent inode; want_dentry makes it describe
// the linking dentry.
//
// NOTE(review): this excerpt elides a number of original source lines
// (blank lines, early `return`s, closing braces); the larger gaps are
// flagged inline and the visible statements are otherwise verbatim.
void Server::handle_client_lookup_ino(MDRequestRef& mdr,
                                      bool want_parent, bool want_dentry)
  const cref_t<MClientRequest> &req = mdr->client_request;

  // snapshot lookups take a dedicated path
  if ((uint64_t)req->head.args.lookupino.snapid > 0)
    return _lookup_snap_ino(mdr);

  inodeno_t ino = req->get_filepath().get_ino();
  auto _ino = ino.val;

  /* It's been observed [1] that a client may lookup a private ~mdsdir inode.
   * I do not have an explanation for how that happened organically but this
   * check will ensure that the client can no longer do that.
   *
   * [1] https://tracker.ceph.com/issues/49922
   */
  if (MDS_IS_PRIVATE_INO(_ino)) {
    respond_to_request(mdr, -CEPHFS_ESTALE);
  // [excerpt: original lines 3962-3964 elided]

  CInode *in = mdcache->get_inode(ino);
  // refuse to resolve an inode that is being purged
  if (in && in->state_test(CInode::STATE_PURGING)) {
    respond_to_request(mdr, -CEPHFS_ESTALE);
  // [excerpt: original lines 3968-3970 elided; presumably the return/
  //  brace plus an `if (!in)` guard for the open_ino fallback below]
  // not in cache: discover it, then retry via C_MDS_LookupIno2
  mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
  // [excerpt: original lines 3972-3974 elided]

  // check for nothing (not read or write); this still applies the
  // [excerpt: continuation of this comment (line 3976) elided]
  if (!check_access(mdr, in, 0))
  // [excerpt: original lines 3978-3979 elided; presumably `return;`]

  CDentry *dn = in->get_projected_parent_dn();
  CInode *diri = dn ? dn->get_dir()->inode : NULL;

  MutationImpl::LockOpVec lov;
  if (dn && (want_parent || want_dentry)) {
    lov.add_rdlock(&dn->lock);
  // [excerpt: original lines 3987-3988 elided]

  unsigned mask = req->head.args.lookupino.mask;
  // [excerpt: original line 3990 elided; presumably declares and
  //  zero-initializes `issued`]
  Capability *cap = in->get_client_cap(mdr->get_client());
  if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
    issued = cap->issued();

  // permission bits, ACL/security xattrs
  // skip rdlocks the client's issued caps already cover
  if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
    lov.add_rdlock(&in->authlock);
  if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
    lov.add_rdlock(&in->xattrlock);

  // remember which caps were requested so the reply carries at least
  // a snapshot value for them
  mdr->getattr_caps = mask;

  // [excerpt: original lines 4003-4005 elided]
  if (!mds->locker->acquire_locks(mdr, lov))
  // [excerpt: original lines 4007-4009 elided; presumably `return;`]

  // need read access to directory inode
  if (!check_access(mdr, diri, MAY_READ))
  // [excerpt: original lines 4012-4016 elided; presumably the return
  //  and the start of the want_parent branch]

  if (in->is_base()) {
    respond_to_request(mdr, -CEPHFS_EINVAL);
  // [excerpt: original lines 4019-4020 elided]
  if (!diri || diri->is_stray()) {
    respond_to_request(mdr, -CEPHFS_ESTALE);
  // [excerpt: original lines 4023-4024 elided]
  dout(10) << "reply to lookup_parent " << *in << dendl;
  // [excerpt: original line 4026 elided]
  respond_to_request(mdr, 0);

  // verify the dentry actually lives under the directory the client
  // named in filepath2 (presumably the want_dentry branch)
  inodeno_t dirino = req->get_filepath2().get_ino();
  if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
    respond_to_request(mdr, -CEPHFS_ENOENT);
  // [excerpt: original lines 4033-4034 elided]
  dout(10) << "reply to lookup_name " << *in << dendl;
  // [excerpt: original line 4036 elided]
  dout(10) << "reply to lookup_ino " << *in << dendl;
  // [excerpt: original lines 4038-4041 elided; presumably set the
  //  reply trace targets before responding]
  respond_to_request(mdr, 0);
// LOOKUPSNAPINO: resolve an inode number inside a snapshot.  The
// client supplies the snapid plus (optionally) the parent directory
// ino and the hash of the dentry name, which let us locate the right
// dirfrag when the snapped inode is not in cache.
//
// NOTE(review): this excerpt elides several original lines (blank
// lines, early returns, closing braces); larger gaps flagged inline.
void Server::_lookup_snap_ino(MDRequestRef& mdr)
  const cref_t<MClientRequest> &req = mdr->client_request;

  // [excerpt: original lines 4049-4050 elided; presumably declare
  //  `vinodeno_t vino;`]
  vino.ino = req->get_filepath().get_ino();
  vino.snapid = (__u64)req->head.args.lookupino.snapid;
  inodeno_t parent_ino = (__u64)req->head.args.lookupino.parent;
  __u32 hash = req->head.args.lookupino.hash;

  dout(7) << "lookup_snap_ino " << vino << " parent " << parent_ino << " hash " << hash << dendl;

  CInode *in = mdcache->lookup_snap_inode(vino);
  // [excerpt: original line 4059 elided; presumably `if (!in) {`
  //  before falling back to the head inode]
  in = mdcache->get_inode(vino.ino);
  // [excerpt: original line 4061 elided]
  if (in->state_test(CInode::STATE_PURGING) ||
      !in->has_snap_data(vino.snapid)) {
    if (in->is_dir() || !parent_ino) {
      respond_to_request(mdr, -CEPHFS_ESTALE);
  // [excerpt: original lines 4066-4073 elided]

  dout(10) << "reply to lookup_snap_ino " << *in << dendl;
  mdr->snapid = vino.snapid;
  // [excerpt: original line 4076 elided]
  respond_to_request(mdr, 0);
  // [excerpt: original lines 4078-4080 elided]

  CInode *diri = NULL;
  // [excerpt: original line 4082 elided; presumably `if (parent_ino) {`]
  diri = mdcache->get_inode(parent_ino);
  // [excerpt: original line 4084 elided; presumably `if (!diri) {`]
  // parent dir not in cache: discover it, then retry
  mdcache->open_ino(parent_ino, mds->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr));
  // [excerpt: original lines 4086-4088 elided]

  if (!diri->is_dir()) {
    respond_to_request(mdr, -CEPHFS_EINVAL);
  // [excerpt: original lines 4091-4093 elided]

  MutationImpl::LockOpVec lov;
  lov.add_rdlock(&diri->dirfragtreelock);
  if (!mds->locker->acquire_locks(mdr, lov))
  // [excerpt: original lines 4097-4098 elided; presumably `return;`]

  // pick the dirfrag that would hold a name with this hash
  frag_t frag = diri->dirfragtree[hash];
  CDir *dir = try_open_auth_dirfrag(diri, frag, mdr);
  // [excerpt: original lines 4101-4103 elided]

  if (!dir->is_complete()) {
    if (dir->is_frozen()) {
      // frozen dirfrag: back off and retry once it thaws
      mds->locker->drop_locks(mdr.get());
      mdr->drop_local_auth_pins();
      dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
  // [excerpt: original lines 4109-4110 elided]
    // fetch the dirfrag contents and retry
    dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
  // [excerpt: original lines 4112-4114 elided]

  respond_to_request(mdr, -CEPHFS_ESTALE);
  // [excerpt: original line 4116 elided]
  // head inode not in cache either: discover it, then retry
  mdcache->open_ino(vino.ino, mds->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr), false);
// Completion of a lookup-by-ino after MDCache::open_ino() finished.
// `r` is a rank if >= 0 (the MDS authoritative for the inode),
// otherwise a negative error code from the open_ino walk.
void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
  inodeno_t ino = mdr->client_request->get_filepath().get_ino();
  dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;

  // `r` is a rank if >=0, else an error code
  // [excerpt: original line 4127 elided; presumably `if (r >= 0) {`]
  mds_rank_t dest_rank(r);
  if (dest_rank == mds->get_nodeid())
    dispatch_client_request(mdr);  // we are the auth rank: handle it locally
  // [excerpt: original line 4131 elided; presumably `else`]
  mdcache->request_forward(mdr, dest_rank);  // hand off to the auth rank
  // [excerpt: original lines 4133-4136 elided]

  // open_ino failed: normalize "not found"-style errors before replying
  if (r == -CEPHFS_ENOENT || r == -CEPHFS_ENODATA)
  // [excerpt: original line 4138 elided; presumably remaps r to
  //  -CEPHFS_ESTALE]
  respond_to_request(mdr, r);
/* This function takes responsibility for the passed mdr*/
// OPEN: open an existing inode (no create).  Validates the open
// flags/mode against the inode type and filesystem state, takes the
// needed locks, handles O_TRUNC, issues caps (real caps for live
// inodes, immutable caps for snapshots), and logs an EOpen event when
// the open-file table requires it.
//
// NOTE(review): this excerpt elides several original lines (early
// returns, closing braces); visible statements are otherwise verbatim.
void Server::handle_client_open(MDRequestRef& mdr)
  const cref_t<MClientRequest> &req = mdr->client_request;
  dout(7) << "open on " << req->get_filepath() << dendl;

  int flags = req->head.args.open.flags;
  int cmode = ceph_flags_to_mode(flags);
  // [excerpt: original line 4151 elided; presumably `if (cmode < 0) {`]
  respond_to_request(mdr, -CEPHFS_EINVAL);

  // writes, truncates and O_DIRECTORY must run on the auth MDS
  bool need_auth = !file_mode_is_readonly(cmode) ||
    (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));

  if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
    dout(7) << "read-only FS" << dendl;
    respond_to_request(mdr, -CEPHFS_EROFS);

  CInode *cur = rdlock_path_pin_ref(mdr, need_auth);
  // [excerpt: original lines 4166-4168 elided; presumably `if (!cur) return;`]

  if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
    ceph_assert(!need_auth);
    mdr->locking_state &= ~(MutationImpl::PATH_LOCKED | MutationImpl::ALL_LOCKED);
    // re-resolve with need_auth=true so we wait out the freeze/export
    CInode *cur = rdlock_path_pin_ref(mdr, true);

  if (!cur->is_file()) {
    // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
    cmode = CEPH_FILE_MODE_PIN;
    // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
    if (cur->is_symlink() && !(flags & CEPH_O_NOFOLLOW))
      flags &= ~CEPH_O_TRUNC;

  dout(10) << "open flags = " << flags
           << ", filemode = " << cmode
           << ", need_auth = " << need_auth

  /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
    dout(7) << "not a file or dir " << *cur << dendl;
    respond_to_request(mdr, -CEPHFS_ENXIO); // FIXME what error do we want?
    } */

  if ((flags & CEPH_O_DIRECTORY) && !cur->is_dir() && !cur->is_symlink()) {
    dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
    respond_to_request(mdr, -CEPHFS_EINVAL);

  if ((flags & CEPH_O_TRUNC) && !cur->is_file()) {
    dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
    // we should return -CEPHFS_EISDIR for directory, return -CEPHFS_EINVAL for other non-regular
    respond_to_request(mdr, cur->is_dir() ? -CEPHFS_EISDIR : -CEPHFS_EINVAL);

  // old clients can't handle inline data via caps, so refuse the open
  if (cur->get_inode()->inline_data.version != CEPH_INLINE_NONE &&
      !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
    dout(7) << "old client cannot open inline data file " << *cur << dendl;
    respond_to_request(mdr, -CEPHFS_EPERM);

  // snapped data is read only
  if (mdr->snapid != CEPH_NOSNAP &&
      ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
    dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
    respond_to_request(mdr, -CEPHFS_EROFS);

  MutationImpl::LockOpVec lov;

  unsigned mask = req->head.args.open.mask;
  // [excerpt: original line 4227 elided; presumably declares and
  //  zero-initializes `issued`]
  Capability *cap = cur->get_client_cap(mdr->get_client());
  if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
    issued = cap->issued();
  // permission bits, ACL/security xattrs
  // skip rdlocks the client's issued caps already cover
  if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
    lov.add_rdlock(&cur->authlock);
  if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
    lov.add_rdlock(&cur->xattrlock);

  mdr->getattr_caps = mask;

  // O_TRUNC: needs a filelock xlock and write access.  A replayed/
  // completed request (has_completed) must not truncate again.
  if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
    ceph_assert(cur->is_auth());

    lov.add_xlock(&cur->filelock);
    if (!mds->locker->acquire_locks(mdr, lov))
    // [excerpt: `return;` elided]
    if (!check_access(mdr, cur, MAY_WRITE))
    // [excerpt: `return;` elided]

    // wait for pending truncate?
    const auto& pi = cur->get_projected_inode();
    if (pi->is_truncating()) {
      dout(10) << " waiting for pending truncate from " << pi->truncate_from
               << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
      mds->locker->drop_locks(mdr.get());
      mdr->drop_local_auth_pins();
      cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));

    do_open_truncate(mdr, cmode);
    // [excerpt: truncate path ends here; code below is the
    //  non-truncate path]

  // sync filelock if snapped.
  // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
  // and that data itself is flushed so that we can read the snapped data off disk.
  if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
    lov.add_rdlock(&cur->filelock);

  if (!mds->locker->acquire_locks(mdr, lov))
  // [excerpt: `return;` elided]

  if (cmode & CEPH_FILE_MODE_WR)
  // [excerpt: original line 4279 elided; presumably widens `mask`
  //  with MAY_WRITE]
  if (!check_access(mdr, cur, mask))
  // [excerpt: `return;` elided]

  utime_t now = ceph_clock_now();
  mdr->set_mds_stamp(now);

  if (cur->is_file() || cur->is_dir()) {
    if (mdr->snapid == CEPH_NOSNAP) {
      // live inode: issue real, revocable caps
      Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr, nullptr);
      dout(12) << "open issued caps " << ccap_string(cap->pending())
               << " for " << req->get_source()
               << " on " << *cur << dendl;
      // [excerpt: else-branch brace elided; snapshot opens get
      //  immutable caps instead of a Capability object]
      int caps = ceph_caps_for_mode(cmode);
      dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
               << " for " << req->get_source()
               << " snapid " << mdr->snapid
               << " on " << *cur << dendl;
      mdr->snap_caps = caps;

  // increase max_size?
  if (cmode & CEPH_FILE_MODE_WR)
    mds->locker->check_inode_max_size(cur);

  // make sure this inode gets into the journal
  if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
      mdcache->open_file_table.should_log_open(cur)) {
    EOpen *le = new EOpen(mds->mdlog);
    mdlog->start_entry(le);
    le->add_clean_inode(cur);
    mdlog->submit_entry(le);

  // bump popularity for the balancer
  if (cmode & CEPH_FILE_MODE_WR)
    mds->balancer->hit_inode(cur, META_POP_IWR);
  mds->balancer->hit_inode(cur, META_POP_IRD,
                           mdr->client_request->get_source().num());

  if (req->get_dentry_wanted()) {
    ceph_assert(mdr->dn[0].size());
    dn = mdr->dn[0].back();
  // [excerpt: original lines 4328-4331 elided; presumably record the
  //  dentry as the reply trace target]

  respond_to_request(mdr, 0);
// Journal-commit finisher for handle_client_openc(): once the EUpdate
// for the newly created inode is safely logged, make the projected
// dentry/inode live, mark them dirty, notify peers, and reply.
class C_MDS_openc_finish : public ServerLogContext {
  // NOTE(review): member declarations (original lines 4336-4338,
  // presumably `CDentry *dn;` and `CInode *newi;`) are elided from
  // this excerpt.
  C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
    ServerLogContext(s, r), dn(d), newi(ni) {}
  void finish(int r) override {
    ceph_assert(r == 0);

    // make the linkage projected in handle_client_openc() real
    dn->pop_projected_linkage();

    // dirty inode, dn, dir
    newi->mark_dirty(mdr->ls);
    newi->mark_dirty_parent(mdr->ls, true);

    get_mds()->locker->share_inode_max_size(newi);

    MDRequestRef null_ref;
    get_mds()->mdcache->send_dentry_link(dn, null_ref);

    get_mds()->balancer->hit_inode(newi, META_POP_IWR);

    server->respond_to_request(mdr, 0);

    // test hook: optionally kill the MDS right after openc commit
    ceph_assert(g_conf()->mds_kill_openc_at != 1);
/* This function takes responsibility for the passed mdr*/
// OPEN with O_CREAT: create the file named by the request (or fall
// back to a plain open when it already exists and O_EXCL is unset),
// apply any client-specified file layout, allocate and project the
// new inode, journal an EUpdate, and optionally delegate preallocated
// inos to the client.
//
// NOTE(review): this excerpt elides several original lines (early
// returns, closing braces); visible statements are otherwise verbatim.
void Server::handle_client_openc(MDRequestRef& mdr)
  const cref_t<MClientRequest> &req = mdr->client_request;
  client_t client = mdr->get_client();

  dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;

  int cmode = ceph_flags_to_mode(req->head.args.open.flags);
  // [excerpt: original line 4374 elided; presumably `if (cmode < 0) {`]
  respond_to_request(mdr, -CEPHFS_EINVAL);

  bool excl = req->head.args.open.flags & CEPH_O_EXCL;
  CDentry *dn = rdlock_path_xlock_dentry(mdr, true, !excl, true);
  // [excerpt: original lines 4381-4383 elided; presumably `if (!dn) return;`]

  CDentry::linkage_t *dnl = dn->get_projected_linkage();
  if (!excl && !dnl->is_null()) {
    // name already exists and O_EXCL is not set: degrade the dentry
    // xlock and dispatch as a regular open
    mds->locker->xlock_downgrade(&dn->lock, mdr.get());

    MutationImpl::LockOpVec lov;
    lov.add_rdlock(&dnl->get_inode()->snaplock);
    if (!mds->locker->acquire_locks(mdr, lov))
    // [excerpt: `return;` elided]
    handle_client_open(mdr);
    // [excerpt: `return;` elided]

  ceph_assert(dnl->is_null());

  if (req->get_alternate_name().size() > alternate_name_max) {
    dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
    respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
  dn->set_alternate_name(req->get_alternate_name());

  // pick the layout: inherited directory layout, else FS default
  file_layout_t layout;
  if (mdr->dir_layout != file_layout_t())
    layout = mdr->dir_layout;
  // [excerpt: original line 4411 elided; presumably `else`]
  layout = mdcache->default_file_layout;

  // What kind of client caps are required to complete this operation
  uint64_t access = MAY_WRITE;

  const auto default_layout = layout;

  // fill in any special params from client
  if (req->head.args.open.stripe_unit)
    layout.stripe_unit = req->head.args.open.stripe_unit;
  if (req->head.args.open.stripe_count)
    layout.stripe_count = req->head.args.open.stripe_count;
  if (req->head.args.open.object_size)
    layout.object_size = req->head.args.open.object_size;
  if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
      (__s32)req->head.args.open.pool >= 0) {
    layout.pool_id = req->head.args.open.pool;

    // make sure we have as new a map as the client
    if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
      mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));

  // If client doesn't have capability to modify layout pools, then
  // only permit this request if the requested pool matches what the
  // file would have inherited anyway from its parent.
  if (default_layout != layout) {
    access |= MAY_SET_VXATTR;

  if (!layout.is_valid()) {
    dout(10) << " invalid initial file layout" << dendl;
    respond_to_request(mdr, -CEPHFS_EINVAL);

  if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
    dout(10) << " invalid data pool " << layout.pool_id << dendl;
    respond_to_request(mdr, -CEPHFS_EINVAL);

  // access/quota/fragment checks on the containing directory
  CDir *dir = dn->get_dir();
  CInode *diri = dir->get_inode();
  if (!check_access(mdr, diri, access))
  // [excerpt: `return;` elided]
  if (!check_fragment_space(mdr, dir))
  // [excerpt: `return;` elided]
  if (!check_dir_max_entries(mdr, dir))
  // [excerpt: `return;` elided]

  if (mdr->dn[0].size() == 1)
    mds->locker->create_lock_cache(mdr, diri, &mdr->dir_layout);

  // allocate the new regular inode with the chosen layout
  CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
                                   req->head.args.open.mode | S_IFREG, &layout);
  // [excerpt: original lines 4471-4473 elided]

  dn->push_projected_linkage(newi);

  auto _inode = newi->_get_inode();
  _inode->version = dn->pre_dirty();
  if (layout.pool_id != mdcache->default_file_layout.pool_id)
    _inode->add_old_pool(mdcache->default_file_layout.pool_id);
  _inode->update_backtrace();
  _inode->rstat.rfiles = 1;
  _inode->accounted_rstat = _inode->rstat;

  SnapRealm *realm = diri->find_snaprealm();
  snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
  ceph_assert(follows >= realm->get_newest_seq());

  ceph_assert(dn->first == follows+1);
  newi->first = dn->first;

  // issue caps on the new inode straight away
  Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
  newi->authlock.set_state(LOCK_EXCL);
  newi->xattrlock.set_state(LOCK_EXCL);

  if (cap && (cmode & CEPH_FILE_MODE_WR)) {
    // grant the creator an initial writable range of one stripe unit
    _inode->client_ranges[client].range.first = 0;
    _inode->client_ranges[client].range.last = _inode->layout.stripe_unit;
    _inode->client_ranges[client].follows = follows;
    newi->mark_clientwriteable();
    cap->mark_clientwriteable();

  // journal the creation
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "openc");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  journal_allocated_inos(mdr, &le->metablob);
  mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
  le->metablob.add_primary_dentry(dn, newi, true, true, true);

  // make sure this inode gets into the journal
  le->metablob.add_opened_ino(newi->ino());

  C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, newi);

  if (mdr->session->info.has_feature(CEPHFS_FEATURE_DELEG_INO)) {
    openc_response_t ocresp;

    dout(10) << "adding created_ino and delegated_inos" << dendl;
    ocresp.created_ino = _inode->ino;

    if (delegate_inos_pct && !req->is_queued_for_replay()) {
      // Try to delegate some prealloc_inos to the client, if it's down to half the max
      unsigned frac = 100 / delegate_inos_pct;
      if (mdr->session->delegated_inos.size() < (unsigned)g_conf()->mds_client_prealloc_inos / frac / 2)
        mdr->session->delegate_inos(g_conf()->mds_client_prealloc_inos / frac, ocresp.delegated_inos);

    encode(ocresp, mdr->reply_extra_bl);
  } else if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
    dout(10) << "adding ino to reply to indicate inode was created" << dendl;
    // add the file created flag onto the reply if create_flags features is supported
    encode(newi->ino(), mdr->reply_extra_bl);

  journal_and_reply(mdr, newi, dn, le, fin);

  // We hit_dir (via hit_inode) in our finish callback, but by then we might
  // have overshot the split size (multiple opencs in flight), so here is
  // an early chance to split the dir if this openc makes it oversized.
  mds->balancer->maybe_fragment(dir, false);
// READDIR: return one batch of directory entries from a single
// dirfrag, bounded by max_entries/max_bytes, issuing dentry leases
// and inode caps for each entry returned.
//
// NOTE(review): this excerpt elides many original lines.  In
// particular the declarations of several locals used below (`newfg`,
// `numfiles`, `dirbl`, `dnbl`, `ds`, `keep`, `flags`, the loop
// increment and various `continue`/`break`/`return`s) fall in the
// elided lines; larger gaps are flagged inline.
void Server::handle_client_readdir(MDRequestRef& mdr)
  const cref_t<MClientRequest> &req = mdr->client_request;
  Session *session = mds->get_session(req);
  client_t client = req->get_source().num();
  MutationImpl::LockOpVec lov;
  CInode *diri = rdlock_path_pin_ref(mdr, false, true);

  // it's a directory, right?
  if (!diri->is_dir()) {
    dout(10) << "reply to " << *req << " readdir -CEPHFS_ENOTDIR" << dendl;
    respond_to_request(mdr, -CEPHFS_ENOTDIR);

  // cap-acquisition throttle: when this session already holds a large
  // share of caps and is acquiring them quickly, delay the readdir
  auto num_caps = session->get_num_caps();
  auto session_cap_acquisition = session->get_cap_acquisition();

  if (num_caps > static_cast<uint64_t>(max_caps_per_client * max_caps_throttle_ratio) && session_cap_acquisition >= cap_acquisition_throttle) {
    dout(20) << "readdir throttled. max_caps_per_client: " << max_caps_per_client << " num_caps: " << num_caps
             << " session_cap_acquistion: " << session_cap_acquisition << " cap_acquisition_throttle: " << cap_acquisition_throttle << dendl;
    logger->inc(l_mdss_cap_acquisition_throttle);

    mds->timer.add_event_after(caps_throttle_retry_request_timeout, new C_MDS_RetryRequest(mdcache, mdr));

  lov.add_rdlock(&diri->filelock);
  lov.add_rdlock(&diri->dirfragtreelock);

  if (!mds->locker->acquire_locks(mdr, lov))
  // [excerpt: `return;` elided]

  if (!check_access(mdr, diri, MAY_READ))
  // [excerpt: `return;` elided]

  // which frag / resume offset did the client ask for?
  frag_t fg = (__u32)req->head.args.readdir.frag;
  unsigned req_flags = (__u32)req->head.args.readdir.flags;
  string offset_str = req->get_path2();

  __u32 offset_hash = 0;
  if (!offset_str.empty())
    offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
  // [excerpt: original line 4595 elided; presumably `else`]
  offset_hash = (__u32)req->head.args.readdir.offset_hash;

  dout(10) << " frag " << fg << " offset '" << offset_str << "'"
           << " offset_hash " << offset_hash << " flags " << req_flags << dendl;

  // does the frag exist?
  if (diri->dirfragtree[fg.value()] != fg) {
    // frag was split/merged since the client got it: remap
    // [excerpt: original line 4603 elided; presumably `frag_t newfg;`]
    if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
      if (fg.contains((unsigned)offset_hash)) {
        newfg = diri->dirfragtree[offset_hash];
      // [excerpt: else-branch brace elided]
        // client actually wants next frag
        newfg = diri->dirfragtree[fg.value()];
    // [excerpt: non-bitflags branch braces elided]
    newfg = diri->dirfragtree[fg.value()];

    dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
    // [excerpt: original lines 4616-4618 elided; presumably `fg = newfg;`]

  CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
  // [excerpt: original lines 4620-4622 elided; presumably `if (!dir) return;`]

  dout(10) << "handle_client_readdir on " << *dir << dendl;
  ceph_assert(dir->is_auth());

  if (!dir->is_complete()) {
    if (dir->is_frozen()) {
      dout(7) << "dir is frozen " << *dir << dendl;
      mds->locker->drop_locks(mdr.get());
      mdr->drop_local_auth_pins();
      dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
    // contents must be fetched from the metadata pool before listing
    dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
    dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);

#ifdef MDS_VERIFY_FRAGSTAT
  dir->verify_fragstat();
  // [excerpt: `#endif` presumably elided]

  utime_t now = ceph_clock_now();
  mdr->set_mds_stamp(now);

  snapid_t snapid = mdr->snapid;
  dout(10) << "snapid " << snapid << dendl;

  SnapRealm *realm = diri->find_snaprealm();

  unsigned max = req->head.args.readdir.max_entries;
  // [excerpt: original line 4653 elided; presumably `if (!max)`]
  max = dir->get_num_any(); // whatever, something big.
  unsigned max_bytes = req->head.args.readdir.max_bytes;
  // [excerpt: original line 4656 elided; presumably `if (!max_bytes)`]
  // make sure at least one item can be encoded
  max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;

  // [excerpt: original lines 4659-4662 elided; presumably declare the
  //  reply bufferlists and the DirStat `ds`]
  ds.frag = dir->get_frag();
  ds.auth = dir->get_dir_auth().first;
  if (dir->is_auth() && !forward_all_requests_to_auth)
    dir->get_dist_spec(ds.dist, mds->get_nodeid());

  dir->encode_dirstat(dirbl, mdr->session->info, ds);

  // count bytes available.
  // this isn't perfect, but we should capture the main variable/unbounded size items!
  int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
  int bytes_left = max_bytes - front_bytes;
  bytes_left -= realm->get_snap_trace().length();

  // build dir contents
  bool start = !offset_hash && offset_str.empty();
  // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
  dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
  auto it = start ? dir->begin() : dir->lower_bound(skip_key);
  bool end = (it == dir->end());
  for (; !end && numfiles < max; end = (it == dir->end())) {
    CDentry *dn = it->second;
    // [excerpt: iterator increment and per-entry `continue`s are
    //  elided throughout this loop body]

    if (dn->state_test(CDentry::STATE_PURGING))

    bool dnp = dn->use_projected(client, mdr);
    CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();

    if (dn->last < snapid || dn->first > snapid) {
      dout(20) << "skipping non-overlapping snap " << *dn << dendl;

    dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
    if (!(offset_key < dn->key()))

    CInode *in = dnl->get_inode();

    if (in && in->ino() == CEPH_INO_CEPH)

    // better for the MDS to do the work, if we think the client will stat any of these files.
    if (dnl->is_remote() && !in) {
      in = mdcache->get_inode(dnl->get_remote_ino());
      // [excerpt: original line 4717 elided; presumably `if (in) {`]
      dn->link_remote(dnl, in);
      } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
        dout(10) << "skipping bad remote ino on " << *dn << dendl;

        // touch everything i _do_ have
        for (auto &p : *dir) {
          if (!p.second->get_linkage()->is_null())
            mdcache->lru.lru_touch(p.second);

        // already issued caps and leases, reply immediately.
        if (dnbl.length() > 0) {
          mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
          dout(10) << " open remote dentry after caps were issued, stopping at "
                   << dnbl.length() << " < " << bytes_left << dendl;
        // [excerpt: `break;` / else-branch lines elided]

        mds->locker->drop_locks(mdr.get());
        mdr->drop_local_auth_pins();
        mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
        // [excerpt: `return;` elided]

    // stop once even the dentry name + lease header cannot fit
    if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
      dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;

    unsigned start_len = dnbl.length();

    // dentry: name then lease
    dout(12) << "including dn " << *dn << dendl;
    encode(dn->get_name(), dnbl);
    int lease_mask = dnl->is_primary() ? CEPH_LEASE_PRIMARY_LINK : 0;
    mds->locker->issue_client_lease(dn, mdr, lease_mask, now, dnbl);

    // inode stat; encode_inodestat enforces the remaining byte budget
    dout(12) << "including inode " << *in << dendl;
    int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
    // [excerpt: original line 4761 elided; presumably `if (r < 0) {`]
    // chop off dn->name, lease
    dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
    keep.substr_of(dnbl, 0, start_len);
    // [excerpt: original lines 4766-4768 elided; presumably swap the
    //  truncated buffer back and break]

    ceph_assert(r >= 0);
    // [excerpt: original lines 4770-4772 elided; presumably
    //  `numfiles++;`]

    mdcache->lru.lru_touch(dn);

  session->touch_readdir_cap(numfiles);

  // [excerpt: original lines 4777-4779 elided; presumably declare
  //  `flags` and test `end` before setting FRAG_END]
  flags = CEPH_READDIR_FRAG_END;
  // [excerpt: original line 4781 elided; presumably `if (start)`]
  flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve

  // client only understand END and COMPLETE flags ?
  if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
    flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;

  // finish final blob
  encode(numfiles, dirbl);
  encode(flags, dirbl);
  dirbl.claim_append(dnbl);

  dout(10) << "reply to " << *req << " readdir num=" << numfiles
           << " bytes=" << dirbl.length()
           << " start=" << (int)start
           << " end=" << (int)end

  mdr->reply_extra_bl = dirbl;

  // bump popularity. NOTE: this doesn't quite capture it.
  mds->balancer->hit_dir(dir, META_POP_READDIR, -1, numfiles);

  respond_to_request(mdr, 0);
4812 // ===============================================================================
4817 * finisher for basic inode updates
// Journal-commit finisher shared by the basic inode-update handlers
// (setattr and friends): applies the update, kicks off any pending
// truncation, propagates snaprealm changes, and replies.
class C_MDS_inode_update_finish : public ServerLogContext {
  // NOTE(review): the CInode* member declaration (original line 4820,
  // presumably `CInode *in;`) is elided from this excerpt.
  bool truncating_smaller, changed_ranges, adjust_realm;
  C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
                            bool sm=false, bool cr=false, bool ar=false) :
    ServerLogContext(s, r), in(i),
    truncating_smaller(sm), changed_ranges(cr), adjust_realm(ar) { }
  void finish(int r) override {
    ceph_assert(r == 0);

    // decide the snap op before the update is applied
    int snap_op = (in->snaprealm ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT);

    // [excerpt: original lines 4831-4834 elided; presumably apply the
    //  mutation before notifying]
    MDSRank *mds = get_mds();

    // notify any clients
    if (truncating_smaller && in->get_inode()->is_truncating()) {
      mds->locker->issue_truncate(in);
      mds->mdcache->truncate_inode(in, mdr->ls);
    // [excerpt: original lines 4841-4843 elided; presumably the
    //  closing brace plus `if (adjust_realm) {` guarding the snap
    //  notifications below]
    mds->mdcache->send_snap_update(in, 0, snap_op);
    mds->mdcache->do_realm_invalidate_and_update_notify(in, snap_op);

    get_mds()->balancer->hit_inode(in, META_POP_IWR);

    server->respond_to_request(mdr, 0);

    // [excerpt: original lines 4851-4852 elided; presumably
    //  `if (changed_ranges)` guarding the max-size share below]
    get_mds()->locker->share_inode_max_size(in);
// SETFILELOCK: apply an advisory lock change (flock or fcntl/POSIX)
// to the inode's lock state.  Unlocks wake any waiters; lock attempts
// either succeed, fail immediately (EWOULDBLOCK/EDEADLK), or park the
// request on the inode's WAIT_FLOCK list until the lock frees up.
//
// NOTE(review): this excerpt elides several original lines (early
// returns, `break`s, closing braces); visible statements are
// otherwise verbatim.
void Server::handle_client_file_setlock(MDRequestRef& mdr)
  const cref_t<MClientRequest> &req = mdr->client_request;
  MutationImpl::LockOpVec lov;

  // get the inode to operate on, and set up any locks needed for that
  CInode *cur = rdlock_path_pin_ref(mdr, true);
  // [excerpt: original lines 4864-4866 elided; presumably `if (!cur) return;`]

  lov.add_xlock(&cur->flocklock);
  /* acquire_locks will return true if it gets the locks. If it fails,
     it will redeliver this request at a later date, so drop the request. */
  if (!mds->locker->acquire_locks(mdr, lov)) {
    dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
  // [excerpt: original lines 4873-4875 elided]

  // copy the lock change into a ceph_filelock so we can store/apply it
  ceph_filelock set_lock;
  set_lock.start = req->head.args.filelock_change.start;
  set_lock.length = req->head.args.filelock_change.length;
  set_lock.client = req->get_orig_source().num();
  set_lock.owner = req->head.args.filelock_change.owner;
  set_lock.pid = req->head.args.filelock_change.pid;
  set_lock.type = req->head.args.filelock_change.type;
  bool will_wait = req->head.args.filelock_change.wait;

  dout(10) << "handle_client_file_setlock: " << set_lock << dendl;

  ceph_lock_state_t *lock_state = NULL;
  bool interrupt = false;

  // get the appropriate lock state
  switch (req->head.args.filelock_change.rule) {
  case CEPH_LOCK_FLOCK_INTR:
    // [excerpt: original lines 4894-4895 elided; presumably set
    //  `interrupt = true;` and fall through]
  case CEPH_LOCK_FLOCK:
    lock_state = cur->get_flock_lock_state();
    // [excerpt: `break;` elided]

  case CEPH_LOCK_FCNTL_INTR:
    // [excerpt: original lines 4901-4902 elided; presumably set
    //  `interrupt = true;` and fall through]
  case CEPH_LOCK_FCNTL:
    lock_state = cur->get_fcntl_lock_state();
    // [excerpt: `break;` elided]

  // [excerpt: original lines 4905-4907 elided; presumably `default:`]
    dout(10) << "got unknown lock type " << set_lock.type
             << ", dropping request!" << dendl;
    respond_to_request(mdr, -CEPHFS_EOPNOTSUPP);

  dout(10) << " state prior to lock change: " << *lock_state << dendl;
  if (CEPH_LOCK_UNLOCK == set_lock.type) {
    list<ceph_filelock> activated_locks;
    MDSContext::vec waiters;
    if (lock_state->is_waiting(set_lock)) {
      dout(10) << " unlock removing waiting lock " << set_lock << dendl;
      lock_state->remove_waiting(set_lock);
      cur->take_waiting(CInode::WAIT_FLOCK, waiters);
    } else if (!interrupt) {
      dout(10) << " unlock attempt on " << set_lock << dendl;
      lock_state->remove_lock(set_lock, activated_locks);
      cur->take_waiting(CInode::WAIT_FLOCK, waiters);

    // wake anyone who may now be able to take the lock
    mds->queue_waiters(waiters);

    respond_to_request(mdr, 0);
  // [excerpt: else-branch brace elided; lock-set path follows]
    dout(10) << " lock attempt on " << set_lock << dendl;
    bool deadlock = false;
    if (mdr->more()->flock_was_waiting &&
        !lock_state->is_waiting(set_lock)) {
      dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
      respond_to_request(mdr, -CEPHFS_EINTR);
    } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
      dout(10) << " it failed on this attempt" << dendl;
      // couldn't set lock right now
      // [excerpt: original line 4940 elided; presumably `if (deadlock) {`]
      respond_to_request(mdr, -CEPHFS_EDEADLK);
    } else if (!will_wait) {
      respond_to_request(mdr, -CEPHFS_EWOULDBLOCK);
    // [excerpt: else-branch brace elided; blocking wait path follows]
      dout(10) << " added to waiting list" << dendl;
      ceph_assert(lock_state->is_waiting(set_lock));
      mdr->more()->flock_was_waiting = true;
      // drop everything and park until the lock state changes
      mds->locker->drop_locks(mdr.get());
      mdr->drop_local_auth_pins();
      mdr->mark_event("failed to add lock, waiting");
      cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
    // [excerpt: original lines 4953-4954 elided]
    respond_to_request(mdr, 0);

  dout(10) << " state after lock change: " << *lock_state << dendl;
// GETFILELOCK: test whether a described advisory lock could be taken.
// look_for_lock() fills `checking_lock` with the first conflicting
// lock (or marks it unlocked), and the result is returned to the
// client encoded in the reply's extra bufferlist.
//
// NOTE(review): this excerpt elides several original lines (early
// returns, `break`s, closing braces, the `bufferlist lock_bl`
// declaration); visible statements are otherwise verbatim.
void Server::handle_client_file_readlock(MDRequestRef& mdr)
  const cref_t<MClientRequest> &req = mdr->client_request;
  MutationImpl::LockOpVec lov;

  // get the inode to operate on, and set up any locks needed for that
  CInode *cur = rdlock_path_pin_ref(mdr, true);
  // [excerpt: original lines 4967-4969 elided; presumably `if (!cur) return;`]

  /* acquire_locks will return true if it gets the locks. If it fails,
     it will redeliver this request at a later date, so drop the request. */
  lov.add_rdlock(&cur->flocklock);
  if (!mds->locker->acquire_locks(mdr, lov)) {
    dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
  // [excerpt: original lines 4976-4978 elided]

  // copy the lock change into a ceph_filelock so we can store/apply it
  ceph_filelock checking_lock;
  checking_lock.start = req->head.args.filelock_change.start;
  checking_lock.length = req->head.args.filelock_change.length;
  checking_lock.client = req->get_orig_source().num();
  checking_lock.owner = req->head.args.filelock_change.owner;
  checking_lock.pid = req->head.args.filelock_change.pid;
  checking_lock.type = req->head.args.filelock_change.type;

  // get the appropriate lock state
  ceph_lock_state_t *lock_state = NULL;
  switch (req->head.args.filelock_change.rule) {
  case CEPH_LOCK_FLOCK:
    lock_state = cur->get_flock_lock_state();
    // [excerpt: `break;` elided]

  case CEPH_LOCK_FCNTL:
    lock_state = cur->get_fcntl_lock_state();
    // [excerpt: `break;` elided]

  // [excerpt: original lines 4997-4999 elided; presumably `default:`]
    dout(10) << "got unknown lock type " << checking_lock.type << dendl;
    respond_to_request(mdr, -CEPHFS_EINVAL);

  lock_state->look_for_lock(checking_lock);

  // [excerpt: original lines 5005-5006 elided; presumably declare
  //  `bufferlist lock_bl;`]
  encode(checking_lock, lock_bl);

  mdr->reply_extra_bl = lock_bl;
  respond_to_request(mdr, 0);
5013 void Server::handle_client_setattr(MDRequestRef
& mdr
)
5015 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5016 MutationImpl::LockOpVec lov
;
5017 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
5020 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5021 respond_to_request(mdr
, -CEPHFS_EROFS
);
5024 if (cur
->ino() < MDS_INO_SYSTEM_BASE
&& !cur
->is_base()) {
5025 respond_to_request(mdr
, -CEPHFS_EPERM
);
5029 __u32 mask
= req
->head
.args
.setattr
.mask
;
5030 __u32 access_mask
= MAY_WRITE
;
5033 if (mask
& (CEPH_SETATTR_MODE
|CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_BTIME
|CEPH_SETATTR_KILL_SGUID
))
5034 lov
.add_xlock(&cur
->authlock
);
5035 if (mask
& (CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
|CEPH_SETATTR_SIZE
))
5036 lov
.add_xlock(&cur
->filelock
);
5037 if (mask
& CEPH_SETATTR_CTIME
)
5038 lov
.add_wrlock(&cur
->versionlock
);
5040 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5043 if ((mask
& CEPH_SETATTR_UID
) && (cur
->get_inode()->uid
!= req
->head
.args
.setattr
.uid
))
5044 access_mask
|= MAY_CHOWN
;
5046 if ((mask
& CEPH_SETATTR_GID
) && (cur
->get_inode()->gid
!= req
->head
.args
.setattr
.gid
))
5047 access_mask
|= MAY_CHGRP
;
5049 if (!check_access(mdr
, cur
, access_mask
))
5052 // trunc from bigger -> smaller?
5053 const auto& pip
= cur
->get_projected_inode();
5055 uint64_t old_size
= std::max
<uint64_t>(pip
->size
, req
->head
.args
.setattr
.old_size
);
5057 // CEPHFS_ENOSPC on growing file while full, but allow shrinks
5058 if (is_full
&& req
->head
.args
.setattr
.size
> old_size
) {
5059 dout(20) << __func__
<< ": full, responding CEPHFS_ENOSPC to setattr with larger size" << dendl
;
5060 respond_to_request(mdr
, -CEPHFS_ENOSPC
);
5064 bool truncating_smaller
= false;
5065 if (mask
& CEPH_SETATTR_SIZE
) {
5066 truncating_smaller
= req
->head
.args
.setattr
.size
< old_size
;
5067 if (truncating_smaller
&& pip
->is_truncating()) {
5068 dout(10) << " waiting for pending truncate from " << pip
->truncate_from
5069 << " to " << pip
->truncate_size
<< " to complete on " << *cur
<< dendl
;
5070 mds
->locker
->drop_locks(mdr
.get());
5071 mdr
->drop_local_auth_pins();
5072 cur
->add_waiter(CInode::WAIT_TRUNC
, new C_MDS_RetryRequest(mdcache
, mdr
));
5077 bool changed_ranges
= false;
5080 mdr
->ls
= mdlog
->get_current_segment();
5081 EUpdate
*le
= new EUpdate(mdlog
, "setattr");
5082 mdlog
->start_entry(le
);
5084 auto pi
= cur
->project_inode(mdr
);
5086 if (mask
& CEPH_SETATTR_UID
)
5087 pi
.inode
->uid
= req
->head
.args
.setattr
.uid
;
5088 if (mask
& CEPH_SETATTR_GID
)
5089 pi
.inode
->gid
= req
->head
.args
.setattr
.gid
;
5091 if (mask
& CEPH_SETATTR_MODE
)
5092 pi
.inode
->mode
= (pi
.inode
->mode
& ~07777) | (req
->head
.args
.setattr
.mode
& 07777);
5093 else if ((mask
& (CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_KILL_SGUID
)) &&
5094 S_ISREG(pi
.inode
->mode
) &&
5095 (pi
.inode
->mode
& (S_IXUSR
|S_IXGRP
|S_IXOTH
))) {
5096 pi
.inode
->mode
&= ~(S_ISUID
|S_ISGID
);
5099 if (mask
& CEPH_SETATTR_MTIME
)
5100 pi
.inode
->mtime
= req
->head
.args
.setattr
.mtime
;
5101 if (mask
& CEPH_SETATTR_ATIME
)
5102 pi
.inode
->atime
= req
->head
.args
.setattr
.atime
;
5103 if (mask
& CEPH_SETATTR_BTIME
)
5104 pi
.inode
->btime
= req
->head
.args
.setattr
.btime
;
5105 if (mask
& (CEPH_SETATTR_ATIME
| CEPH_SETATTR_MTIME
| CEPH_SETATTR_BTIME
))
5106 pi
.inode
->time_warp_seq
++; // maybe not a timewarp, but still a serialization point.
5107 if (mask
& CEPH_SETATTR_SIZE
) {
5108 if (truncating_smaller
) {
5109 pi
.inode
->truncate(old_size
, req
->head
.args
.setattr
.size
);
5110 le
->metablob
.add_truncate_start(cur
->ino());
5112 pi
.inode
->size
= req
->head
.args
.setattr
.size
;
5113 pi
.inode
->rstat
.rbytes
= pi
.inode
->size
;
5115 pi
.inode
->mtime
= mdr
->get_op_stamp();
5117 // adjust client's max_size?
5118 if (mds
->locker
->calc_new_client_ranges(cur
, pi
.inode
->size
)) {
5119 dout(10) << " client_ranges " << cur
->get_previous_projected_inode()->client_ranges
5120 << " -> " << pi
.inode
->client_ranges
<< dendl
;
5121 changed_ranges
= true;
5125 pi
.inode
->version
= cur
->pre_dirty();
5126 pi
.inode
->ctime
= mdr
->get_op_stamp();
5127 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
5128 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
5129 pi
.inode
->change_attr
++;
5132 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5133 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5134 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5136 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
,
5137 truncating_smaller
, changed_ranges
));
5139 // flush immediately if there are readers/writers waiting
5140 if (mdr
->is_xlocked(&cur
->filelock
) &&
5141 (cur
->get_caps_wanted() & (CEPH_CAP_FILE_RD
|CEPH_CAP_FILE_WR
)))
5142 mds
->mdlog
->flush();
5145 /* Takes responsibility for mdr */
5146 void Server::do_open_truncate(MDRequestRef
& mdr
, int cmode
)
5148 CInode
*in
= mdr
->in
[0];
5149 client_t client
= mdr
->get_client();
5152 dout(10) << "do_open_truncate " << *in
<< dendl
;
5154 SnapRealm
*realm
= in
->find_snaprealm();
5155 Capability
*cap
= mds
->locker
->issue_new_caps(in
, cmode
, mdr
, realm
);
5157 mdr
->ls
= mdlog
->get_current_segment();
5158 EUpdate
*le
= new EUpdate(mdlog
, "open_truncate");
5159 mdlog
->start_entry(le
);
5162 auto pi
= in
->project_inode(mdr
);
5163 pi
.inode
->version
= in
->pre_dirty();
5164 pi
.inode
->mtime
= pi
.inode
->ctime
= mdr
->get_op_stamp();
5165 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
5166 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
5167 pi
.inode
->change_attr
++;
5169 uint64_t old_size
= std::max
<uint64_t>(pi
.inode
->size
, mdr
->client_request
->head
.args
.open
.old_size
);
5171 pi
.inode
->truncate(old_size
, 0);
5172 le
->metablob
.add_truncate_start(in
->ino());
5175 bool changed_ranges
= false;
5176 if (cap
&& (cmode
& CEPH_FILE_MODE_WR
)) {
5177 pi
.inode
->client_ranges
[client
].range
.first
= 0;
5178 pi
.inode
->client_ranges
[client
].range
.last
= pi
.inode
->get_layout_size_increment();
5179 pi
.inode
->client_ranges
[client
].follows
= realm
->get_newest_seq();
5180 changed_ranges
= true;
5181 in
->mark_clientwriteable();
5182 cap
->mark_clientwriteable();
5185 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
5187 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
5188 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, in
);
5190 // make sure ino gets into the journal
5191 le
->metablob
.add_opened_ino(in
->ino());
5193 mdr
->o_trunc
= true;
5196 if (mdr
->client_request
->get_dentry_wanted()) {
5197 ceph_assert(mdr
->dn
[0].size());
5198 dn
= mdr
->dn
[0].back();
5201 journal_and_reply(mdr
, in
, dn
, le
, new C_MDS_inode_update_finish(this, mdr
, in
, old_size
> 0,
5203 // Although the `open` part can give an early reply, the truncation won't
5204 // happen until our EUpdate is persistent, to give the client a prompt
5205 // response we must also flush that event.
5210 /* This function cleans up the passed mdr */
5211 void Server::handle_client_setlayout(MDRequestRef
& mdr
)
5213 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5214 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
5217 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5218 respond_to_request(mdr
, -CEPHFS_EROFS
);
5221 if (!cur
->is_file()) {
5222 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5225 if (cur
->get_projected_inode()->size
||
5226 cur
->get_projected_inode()->truncate_seq
> 1) {
5227 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
5232 file_layout_t layout
= cur
->get_projected_inode()->layout
;
5233 // save existing layout for later
5234 const auto old_layout
= layout
;
5236 int access
= MAY_WRITE
;
5238 if (req
->head
.args
.setlayout
.layout
.fl_object_size
> 0)
5239 layout
.object_size
= req
->head
.args
.setlayout
.layout
.fl_object_size
;
5240 if (req
->head
.args
.setlayout
.layout
.fl_stripe_unit
> 0)
5241 layout
.stripe_unit
= req
->head
.args
.setlayout
.layout
.fl_stripe_unit
;
5242 if (req
->head
.args
.setlayout
.layout
.fl_stripe_count
> 0)
5243 layout
.stripe_count
=req
->head
.args
.setlayout
.layout
.fl_stripe_count
;
5244 if (req
->head
.args
.setlayout
.layout
.fl_pg_pool
> 0) {
5245 layout
.pool_id
= req
->head
.args
.setlayout
.layout
.fl_pg_pool
;
5247 // make sure we have as new a map as the client
5248 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
5249 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
5254 // Don't permit layout modifications without 'p' caps
5255 if (layout
!= old_layout
) {
5256 access
|= MAY_SET_VXATTR
;
5259 if (!layout
.is_valid()) {
5260 dout(10) << "bad layout" << dendl
;
5261 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5264 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
5265 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
5266 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5270 MutationImpl::LockOpVec lov
;
5271 lov
.add_xlock(&cur
->filelock
);
5272 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5275 if (!check_access(mdr
, cur
, access
))
5279 auto pi
= cur
->project_inode(mdr
);
5280 pi
.inode
->layout
= layout
;
5281 // add the old pool to the inode
5282 pi
.inode
->add_old_pool(old_layout
.pool_id
);
5283 pi
.inode
->version
= cur
->pre_dirty();
5284 pi
.inode
->ctime
= mdr
->get_op_stamp();
5285 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
5286 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
5287 pi
.inode
->change_attr
++;
5290 mdr
->ls
= mdlog
->get_current_segment();
5291 EUpdate
*le
= new EUpdate(mdlog
, "setlayout");
5292 mdlog
->start_entry(le
);
5293 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5294 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5295 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5297 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
5300 bool Server::xlock_policylock(MDRequestRef
& mdr
, CInode
*in
, bool want_layout
, bool xlock_snaplock
)
5302 if (mdr
->locking_state
& MutationImpl::ALL_LOCKED
)
5305 MutationImpl::LockOpVec lov
;
5306 lov
.add_xlock(&in
->policylock
);
5308 lov
.add_xlock(&in
->snaplock
);
5310 lov
.add_rdlock(&in
->snaplock
);
5311 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5314 if (want_layout
&& in
->get_projected_inode()->has_layout()) {
5315 mdr
->dir_layout
= in
->get_projected_inode()->layout
;
5316 want_layout
= false;
5318 if (CDentry
*pdn
= in
->get_projected_parent_dn(); pdn
) {
5319 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
, 0, want_layout
))
5323 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
5327 CInode
* Server::try_get_auth_inode(MDRequestRef
& mdr
, inodeno_t ino
)
5329 CInode
*in
= mdcache
->get_inode(ino
);
5330 if (!in
|| in
->state_test(CInode::STATE_PURGING
)) {
5331 respond_to_request(mdr
, -CEPHFS_ESTALE
);
5334 if (!in
->is_auth()) {
5335 mdcache
->request_forward(mdr
, in
->authority().first
);
5342 void Server::handle_client_setdirlayout(MDRequestRef
& mdr
)
5344 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5346 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5347 CInode
*cur
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
5351 if (!cur
->is_dir()) {
5352 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
5356 if (!xlock_policylock(mdr
, cur
, true))
5360 const auto& old_pi
= cur
->get_projected_inode();
5361 file_layout_t layout
;
5362 if (old_pi
->has_layout())
5363 layout
= old_pi
->layout
;
5364 else if (mdr
->dir_layout
!= file_layout_t())
5365 layout
= mdr
->dir_layout
;
5367 layout
= mdcache
->default_file_layout
;
5369 // Level of access required to complete
5370 int access
= MAY_WRITE
;
5372 const auto old_layout
= layout
;
5374 if (req
->head
.args
.setlayout
.layout
.fl_object_size
> 0)
5375 layout
.object_size
= req
->head
.args
.setlayout
.layout
.fl_object_size
;
5376 if (req
->head
.args
.setlayout
.layout
.fl_stripe_unit
> 0)
5377 layout
.stripe_unit
= req
->head
.args
.setlayout
.layout
.fl_stripe_unit
;
5378 if (req
->head
.args
.setlayout
.layout
.fl_stripe_count
> 0)
5379 layout
.stripe_count
=req
->head
.args
.setlayout
.layout
.fl_stripe_count
;
5380 if (req
->head
.args
.setlayout
.layout
.fl_pg_pool
> 0) {
5381 layout
.pool_id
= req
->head
.args
.setlayout
.layout
.fl_pg_pool
;
5382 // make sure we have as new a map as the client
5383 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
5384 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
5389 if (layout
!= old_layout
) {
5390 access
|= MAY_SET_VXATTR
;
5393 if (!layout
.is_valid()) {
5394 dout(10) << "bad layout" << dendl
;
5395 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5398 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
5399 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
5400 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5404 if (!check_access(mdr
, cur
, access
))
5407 auto pi
= cur
->project_inode(mdr
);
5408 pi
.inode
->layout
= layout
;
5409 pi
.inode
->version
= cur
->pre_dirty();
5412 mdr
->ls
= mdlog
->get_current_segment();
5413 EUpdate
*le
= new EUpdate(mdlog
, "setlayout");
5414 mdlog
->start_entry(le
);
5415 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5416 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5417 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5419 mdr
->no_early_reply
= true;
5420 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
5425 int Server::parse_layout_vxattr(string name
, string value
, const OSDMap
& osdmap
,
5426 file_layout_t
*layout
, bool validate
)
5428 dout(20) << "parse_layout_vxattr name " << name
<< " value '" << value
<< "'" << dendl
;
5430 if (name
== "layout") {
5431 string::iterator begin
= value
.begin();
5432 string::iterator end
= value
.end();
5433 keys_and_values
<string::iterator
> p
; // create instance of parser
5434 std::map
<string
, string
> m
; // map to receive results
5435 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
5436 return -CEPHFS_EINVAL
;
5438 string
left(begin
, end
);
5439 dout(10) << " parsed " << m
<< " left '" << left
<< "'" << dendl
;
5441 return -CEPHFS_EINVAL
;
5442 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
5443 // Skip validation on each attr, we do it once at the end (avoid
5444 // rejecting intermediate states if the overall result is ok)
5445 int r
= parse_layout_vxattr(string("layout.") + q
->first
, q
->second
,
5446 osdmap
, layout
, false);
5450 } else if (name
== "layout.object_size") {
5451 layout
->object_size
= boost::lexical_cast
<unsigned>(value
);
5452 } else if (name
== "layout.stripe_unit") {
5453 layout
->stripe_unit
= boost::lexical_cast
<unsigned>(value
);
5454 } else if (name
== "layout.stripe_count") {
5455 layout
->stripe_count
= boost::lexical_cast
<unsigned>(value
);
5456 } else if (name
== "layout.pool") {
5458 layout
->pool_id
= boost::lexical_cast
<unsigned>(value
);
5459 } catch (boost::bad_lexical_cast
const&) {
5460 int64_t pool
= osdmap
.lookup_pg_pool_name(value
);
5462 dout(10) << " unknown pool " << value
<< dendl
;
5463 return -CEPHFS_ENOENT
;
5465 layout
->pool_id
= pool
;
5467 } else if (name
== "layout.pool_namespace") {
5468 layout
->pool_ns
= value
;
5470 dout(10) << " unknown layout vxattr " << name
<< dendl
;
5471 return -CEPHFS_EINVAL
;
5473 } catch (boost::bad_lexical_cast
const&) {
5474 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
5475 return -CEPHFS_EINVAL
;
5478 if (validate
&& !layout
->is_valid()) {
5479 dout(10) << "bad layout" << dendl
;
5480 return -CEPHFS_EINVAL
;
5482 if (!mds
->mdsmap
->is_data_pool(layout
->pool_id
)) {
5483 dout(10) << " invalid data pool " << layout
->pool_id
<< dendl
;
5484 return -CEPHFS_EINVAL
;
5489 int Server::parse_quota_vxattr(string name
, string value
, quota_info_t
*quota
)
5491 dout(20) << "parse_quota_vxattr name " << name
<< " value '" << value
<< "'" << dendl
;
5493 if (name
== "quota") {
5494 string::iterator begin
= value
.begin();
5495 string::iterator end
= value
.end();
5497 // keep quota unchanged. (for create_quota_realm())
5500 keys_and_values
<string::iterator
> p
; // create instance of parser
5501 std::map
<string
, string
> m
; // map to receive results
5502 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
5503 return -CEPHFS_EINVAL
;
5505 string
left(begin
, end
);
5506 dout(10) << " parsed " << m
<< " left '" << left
<< "'" << dendl
;
5508 return -CEPHFS_EINVAL
;
5509 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
5510 int r
= parse_quota_vxattr(string("quota.") + q
->first
, q
->second
, quota
);
5514 } else if (name
== "quota.max_bytes") {
5515 int64_t q
= boost::lexical_cast
<int64_t>(value
);
5517 return -CEPHFS_EINVAL
;
5518 quota
->max_bytes
= q
;
5519 } else if (name
== "quota.max_files") {
5520 int64_t q
= boost::lexical_cast
<int64_t>(value
);
5522 return -CEPHFS_EINVAL
;
5523 quota
->max_files
= q
;
5525 dout(10) << " unknown quota vxattr " << name
<< dendl
;
5526 return -CEPHFS_EINVAL
;
5528 } catch (boost::bad_lexical_cast
const&) {
5529 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
5530 return -CEPHFS_EINVAL
;
5533 if (!quota
->is_valid()) {
5534 dout(10) << "bad quota" << dendl
;
5535 return -CEPHFS_EINVAL
;
5540 void Server::create_quota_realm(CInode
*in
)
5542 dout(10) << __func__
<< " " << *in
<< dendl
;
5544 auto req
= make_message
<MClientRequest
>(CEPH_MDS_OP_SETXATTR
);
5545 req
->set_filepath(filepath(in
->ino()));
5546 req
->set_string2("ceph.quota");
5547 // empty vxattr value
5548 req
->set_tid(mds
->issue_tid());
5550 mds
->send_message_mds(req
, in
->authority().first
);
5554 * Verify that the file layout attribute carried by client
5555 * is well-formatted.
5556 * Return 0 on success, otherwise this function takes
5557 * responsibility for the passed mdr.
5559 int Server::check_layout_vxattr(MDRequestRef
& mdr
,
5562 file_layout_t
*layout
)
5564 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5568 mds
->objecter
->with_osdmap([&](const OSDMap
& osdmap
) {
5569 r
= parse_layout_vxattr(name
, value
, osdmap
, layout
);
5570 epoch
= osdmap
.get_epoch();
5573 if (r
== -CEPHFS_ENOENT
) {
5575 // we don't have the specified pool, make sure our map
5576 // is newer than or as new as the client.
5577 epoch_t req_epoch
= req
->get_osdmap_epoch();
5579 if (req_epoch
> epoch
) {
5581 // well, our map is older. consult mds.
5582 auto fin
= new C_IO_Wrapper(mds
, new C_MDS_RetryRequest(mdcache
, mdr
));
5584 mds
->objecter
->wait_for_map(req_epoch
, lambdafy(fin
));
5586 } else if (req_epoch
== 0 && !mdr
->waited_for_osdmap
) {
5588 // For compatibility with client w/ old code, we still need get the
5589 // latest map. One day if COMPACT_VERSION of MClientRequest >=3,
5590 // we can remove those code.
5591 mdr
->waited_for_osdmap
= true;
5592 mds
->objecter
->wait_for_latest_osdmap(std::ref(*new C_IO_Wrapper(
5593 mds
, new C_MDS_RetryRequest(mdcache
, mdr
))));
5600 if (r
== -CEPHFS_ENOENT
)
5603 respond_to_request(mdr
, r
);
5611 void Server::handle_set_vxattr(MDRequestRef
& mdr
, CInode
*cur
)
5613 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5614 string
name(req
->get_path2());
5615 bufferlist bl
= req
->get_data();
5616 string
value (bl
.c_str(), bl
.length());
5617 dout(10) << "handle_set_vxattr " << name
5618 << " val " << value
.length()
5619 << " bytes on " << *cur
5622 CInode::mempool_inode
*pip
= nullptr;
5625 if (!check_access(mdr
, cur
, MAY_SET_VXATTR
)) {
5629 bool adjust_realm
= false;
5630 if (name
.compare(0, 15, "ceph.dir.layout") == 0) {
5631 if (!cur
->is_dir()) {
5632 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5636 if (!xlock_policylock(mdr
, cur
, true))
5639 file_layout_t layout
;
5640 if (cur
->get_projected_inode()->has_layout())
5641 layout
= cur
->get_projected_inode()->layout
;
5642 else if (mdr
->dir_layout
!= file_layout_t())
5643 layout
= mdr
->dir_layout
;
5645 layout
= mdcache
->default_file_layout
;
5647 rest
= name
.substr(name
.find("layout"));
5648 if (check_layout_vxattr(mdr
, rest
, value
, &layout
) < 0)
5651 auto pi
= cur
->project_inode(mdr
);
5652 pi
.inode
->layout
= layout
;
5653 mdr
->no_early_reply
= true;
5654 pip
= pi
.inode
.get();
5655 } else if (name
.compare(0, 16, "ceph.file.layout") == 0) {
5656 if (!cur
->is_file()) {
5657 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5660 if (cur
->get_projected_inode()->size
||
5661 cur
->get_projected_inode()->truncate_seq
> 1) {
5662 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
5665 file_layout_t layout
= cur
->get_projected_inode()->layout
;
5666 rest
= name
.substr(name
.find("layout"));
5667 if (check_layout_vxattr(mdr
, rest
, value
, &layout
) < 0)
5670 MutationImpl::LockOpVec lov
;
5671 lov
.add_xlock(&cur
->filelock
);
5672 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5675 auto pi
= cur
->project_inode(mdr
);
5676 int64_t old_pool
= pi
.inode
->layout
.pool_id
;
5677 pi
.inode
->add_old_pool(old_pool
);
5678 pi
.inode
->layout
= layout
;
5679 pip
= pi
.inode
.get();
5680 } else if (name
.compare(0, 10, "ceph.quota") == 0) {
5681 if (!cur
->is_dir()) {
5682 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5686 quota_info_t quota
= cur
->get_projected_inode()->quota
;
5688 rest
= name
.substr(name
.find("quota"));
5689 int r
= parse_quota_vxattr(rest
, value
, "a
);
5691 respond_to_request(mdr
, r
);
5695 if (quota
.is_enable() && !cur
->get_projected_srnode())
5696 adjust_realm
= true;
5698 if (!xlock_policylock(mdr
, cur
, false, adjust_realm
))
5701 if (cur
->get_projected_inode()->quota
== quota
) {
5702 respond_to_request(mdr
, 0);
5706 auto pi
= cur
->project_inode(mdr
, false, adjust_realm
);
5707 pi
.inode
->quota
= quota
;
5710 pi
.snapnode
->created
= pi
.snapnode
->seq
= cur
->find_snaprealm()->get_newest_seq();
5712 mdr
->no_early_reply
= true;
5713 pip
= pi
.inode
.get();
5715 client_t exclude_ct
= mdr
->get_client();
5716 mdcache
->broadcast_quota_to_client(cur
, exclude_ct
, true);
5717 } else if (name
== "ceph.dir.subvolume"sv
) {
5718 if (!cur
->is_dir()) {
5719 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5725 val
= boost::lexical_cast
<bool>(value
);
5726 } catch (boost::bad_lexical_cast
const&) {
5727 dout(10) << "bad vxattr value, unable to parse bool for " << name
<< dendl
;
5728 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5732 /* Verify it's not already a subvolume with lighter weight
5735 if (!mdr
->more()->rdonly_checks
) {
5736 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
5737 MutationImpl::LockOpVec lov
;
5738 lov
.add_rdlock(&cur
->snaplock
);
5739 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5741 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
5743 const auto srnode
= cur
->get_projected_srnode();
5744 if (val
== (srnode
&& srnode
->is_subvolume())) {
5745 dout(20) << "already marked subvolume" << dendl
;
5746 respond_to_request(mdr
, 0);
5749 mdr
->more()->rdonly_checks
= true;
5752 if ((mdr
->locking_state
& MutationImpl::ALL_LOCKED
) && !mdr
->is_xlocked(&cur
->snaplock
)) {
5753 /* drop the rdlock and acquire xlocks */
5754 dout(20) << "dropping rdlocks" << dendl
;
5755 mds
->locker
->drop_locks(mdr
.get());
5756 if (!xlock_policylock(mdr
, cur
, false, true))
5760 /* repeat rdonly checks in case changed between rdlock -> xlock */
5761 SnapRealm
*realm
= cur
->find_snaprealm();
5763 inodeno_t subvol_ino
= realm
->get_subvolume_ino();
5764 // can't create subvolume inside another subvolume
5765 if (subvol_ino
&& subvol_ino
!= cur
->ino()) {
5766 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5771 const auto srnode
= cur
->get_projected_srnode();
5772 if (val
== (srnode
&& srnode
->is_subvolume())) {
5773 respond_to_request(mdr
, 0);
5777 auto pi
= cur
->project_inode(mdr
, false, true);
5779 pi
.snapnode
->created
= pi
.snapnode
->seq
= realm
->get_newest_seq();
5781 pi
.snapnode
->mark_subvolume();
5783 pi
.snapnode
->clear_subvolume();
5785 mdr
->no_early_reply
= true;
5786 pip
= pi
.inode
.get();
5787 adjust_realm
= true;
5788 } else if (name
== "ceph.dir.pin"sv
) {
5789 if (!cur
->is_dir() || cur
->is_root()) {
5790 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5796 rank
= boost::lexical_cast
<mds_rank_t
>(value
);
5797 if (rank
< 0) rank
= MDS_RANK_NONE
;
5798 else if (rank
>= MAX_MDS
) {
5799 respond_to_request(mdr
, -CEPHFS_EDOM
);
5802 } catch (boost::bad_lexical_cast
const&) {
5803 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
5804 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5808 if (!xlock_policylock(mdr
, cur
))
5811 auto pi
= cur
->project_inode(mdr
);
5812 cur
->set_export_pin(rank
);
5813 pip
= pi
.inode
.get();
5814 } else if (name
== "ceph.dir.pin.random"sv
) {
5815 if (!cur
->is_dir() || cur
->is_root()) {
5816 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5822 val
= boost::lexical_cast
<double>(value
);
5823 } catch (boost::bad_lexical_cast
const&) {
5824 dout(10) << "bad vxattr value, unable to parse float for " << name
<< dendl
;
5825 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5829 if (val
< 0.0 || 1.0 < val
) {
5830 respond_to_request(mdr
, -CEPHFS_EDOM
);
5832 } else if (mdcache
->export_ephemeral_random_max
< val
) {
5833 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5837 if (!xlock_policylock(mdr
, cur
))
5840 auto pi
= cur
->project_inode(mdr
);
5841 cur
->setxattr_ephemeral_rand(val
);
5842 pip
= pi
.inode
.get();
5843 } else if (name
== "ceph.dir.pin.distributed"sv
) {
5844 if (!cur
->is_dir() || cur
->is_root()) {
5845 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5851 val
= boost::lexical_cast
<bool>(value
);
5852 } catch (boost::bad_lexical_cast
const&) {
5853 dout(10) << "bad vxattr value, unable to parse bool for " << name
<< dendl
;
5854 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5858 if (!xlock_policylock(mdr
, cur
))
5861 auto pi
= cur
->project_inode(mdr
);
5862 cur
->setxattr_ephemeral_dist(val
);
5863 pip
= pi
.inode
.get();
5865 dout(10) << " unknown vxattr " << name
<< dendl
;
5866 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5871 pip
->ctime
= mdr
->get_op_stamp();
5872 if (mdr
->get_op_stamp() > pip
->rstat
.rctime
)
5873 pip
->rstat
.rctime
= mdr
->get_op_stamp();
5874 pip
->version
= cur
->pre_dirty();
5876 pip
->update_backtrace();
5879 mdr
->ls
= mdlog
->get_current_segment();
5880 EUpdate
*le
= new EUpdate(mdlog
, "set vxattr layout");
5881 mdlog
->start_entry(le
);
5882 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5883 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5884 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5886 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
,
5887 false, false, adjust_realm
));
5891 void Server::handle_remove_vxattr(MDRequestRef
& mdr
, CInode
*cur
)
5893 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5894 string
name(req
->get_path2());
5896 dout(10) << __func__
<< " " << name
<< " on " << *cur
<< dendl
;
5898 if (name
== "ceph.dir.layout") {
5899 if (!cur
->is_dir()) {
5900 respond_to_request(mdr
, -CEPHFS_ENODATA
);
5903 if (cur
->is_root()) {
5904 dout(10) << "can't remove layout policy on the root directory" << dendl
;
5905 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5909 if (!cur
->get_projected_inode()->has_layout()) {
5910 respond_to_request(mdr
, -CEPHFS_ENODATA
);
5914 MutationImpl::LockOpVec lov
;
5915 lov
.add_xlock(&cur
->policylock
);
5916 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5919 auto pi
= cur
->project_inode(mdr
);
5920 pi
.inode
->clear_layout();
5921 pi
.inode
->version
= cur
->pre_dirty();
5924 mdr
->ls
= mdlog
->get_current_segment();
5925 EUpdate
*le
= new EUpdate(mdlog
, "remove dir layout vxattr");
5926 mdlog
->start_entry(le
);
5927 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5928 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5929 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5931 mdr
->no_early_reply
= true;
5932 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
5934 } else if (name
== "ceph.dir.layout.pool_namespace"
5935 || name
== "ceph.file.layout.pool_namespace") {
5936 // Namespace is the only layout field that has a meaningful
5937 // null/none value (empty string, means default layout). Is equivalent
5938 // to a setxattr with empty string: pass through the empty payload of
5939 // the rmxattr request to do this.
5940 handle_set_vxattr(mdr
, cur
);
5944 respond_to_request(mdr
, -CEPHFS_ENODATA
);
5947 const Server::XattrHandler
Server::xattr_handlers
[] = {
5949 xattr_name
: Server::DEFAULT_HANDLER
,
5950 description
: "default xattr handler",
5951 validate
: &Server::default_xattr_validate
,
5952 setxattr
: &Server::default_setxattr_handler
,
5953 removexattr
: &Server::default_removexattr_handler
,
5956 xattr_name
: "ceph.mirror.info",
5957 description
: "mirror info xattr handler",
5958 validate
: &Server::mirror_info_xattr_validate
,
5959 setxattr
: &Server::mirror_info_setxattr_handler
,
5960 removexattr
: &Server::mirror_info_removexattr_handler
5964 const Server::XattrHandler
* Server::get_xattr_or_default_handler(std::string_view xattr_name
) {
5965 const XattrHandler
*default_xattr_handler
= nullptr;
5967 for (auto &handler
: xattr_handlers
) {
5968 if (handler
.xattr_name
== Server::DEFAULT_HANDLER
) {
5969 ceph_assert(default_xattr_handler
== nullptr);
5970 default_xattr_handler
= &handler
;
5972 if (handler
.xattr_name
== xattr_name
) {
5973 dout(20) << "handler=" << handler
.description
<< dendl
;
5978 ceph_assert(default_xattr_handler
!= nullptr);
5979 dout(20) << "handler=" << default_xattr_handler
->description
<< dendl
;
5980 return default_xattr_handler
;
5983 int Server::xattr_validate(CInode
*cur
, const InodeStoreBase::xattr_map_const_ptr xattrs
,
5984 const std::string
&xattr_name
, int op
, int flags
) {
5985 if (op
== CEPH_MDS_OP_SETXATTR
) {
5987 if ((flags
& CEPH_XATTR_CREATE
) && xattrs
->count(mempool::mds_co::string(xattr_name
))) {
5988 dout(10) << "setxattr '" << xattr_name
<< "' XATTR_CREATE and CEPHFS_EEXIST on " << *cur
<< dendl
;
5989 return -CEPHFS_EEXIST
;
5992 if ((flags
& CEPH_XATTR_REPLACE
) && !(xattrs
&& xattrs
->count(mempool::mds_co::string(xattr_name
)))) {
5993 dout(10) << "setxattr '" << xattr_name
<< "' XATTR_REPLACE and CEPHFS_ENODATA on " << *cur
<< dendl
;
5994 return -CEPHFS_ENODATA
;
6000 if (op
== CEPH_MDS_OP_RMXATTR
) {
6001 if (!xattrs
|| xattrs
->count(mempool::mds_co::string(xattr_name
)) == 0) {
6002 dout(10) << "removexattr '" << xattr_name
<< "' and CEPHFS_ENODATA on " << *cur
<< dendl
;
6003 return -CEPHFS_ENODATA
;
6009 derr
<< ": unhandled validation for: " << xattr_name
<< dendl
;
6010 return -CEPHFS_EINVAL
;
6013 void Server::xattr_set(InodeStoreBase::xattr_map_ptr xattrs
, const std::string
&xattr_name
,
6014 const bufferlist
&xattr_value
) {
6015 size_t len
= xattr_value
.length();
6016 bufferptr b
= buffer::create(len
);
6018 xattr_value
.begin().copy(len
, b
.c_str());
6020 auto em
= xattrs
->emplace(std::piecewise_construct
,
6021 std::forward_as_tuple(mempool::mds_co::string(xattr_name
)),
6022 std::forward_as_tuple(b
));
6024 em
.first
->second
= b
;
6028 void Server::xattr_rm(InodeStoreBase::xattr_map_ptr xattrs
, const std::string
&xattr_name
) {
6029 xattrs
->erase(mempool::mds_co::string(xattr_name
));
6032 int Server::default_xattr_validate(CInode
*cur
, const InodeStoreBase::xattr_map_const_ptr xattrs
,
6033 XattrOp
*xattr_op
) {
6034 return xattr_validate(cur
, xattrs
, xattr_op
->xattr_name
, xattr_op
->op
, xattr_op
->flags
);
6037 void Server::default_setxattr_handler(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
6038 const XattrOp
&xattr_op
) {
6039 xattr_set(xattrs
, xattr_op
.xattr_name
, xattr_op
.xattr_value
);
6042 void Server::default_removexattr_handler(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
6043 const XattrOp
&xattr_op
) {
6044 xattr_rm(xattrs
, xattr_op
.xattr_name
);
6047 // mirror info xattr handlers
6048 const std::string
Server::MirrorXattrInfo::MIRROR_INFO_REGEX
= "^cluster_id=([a-f0-9]{8}-" \
6049 "[a-f0-9]{4}-[a-f0-9]{4}-" \
6050 "[a-f0-9]{4}-[a-f0-9]{12})" \
6052 const std::string
Server::MirrorXattrInfo::CLUSTER_ID
= "ceph.mirror.info.cluster_id";
6053 const std::string
Server::MirrorXattrInfo::FS_ID
= "ceph.mirror.info.fs_id";
6054 int Server::parse_mirror_info_xattr(const std::string
&name
, const std::string
&value
,
6055 std::string
&cluster_id
, std::string
&fs_id
) {
6056 dout(20) << "parsing name=" << name
<< ", value=" << value
<< dendl
;
6058 static const std::regex
regex(Server::MirrorXattrInfo::MIRROR_INFO_REGEX
);
6061 std::regex_search(value
, match
, regex
);
6062 if (match
.size() != 3) {
6063 derr
<< "mirror info parse error" << dendl
;
6064 return -CEPHFS_EINVAL
;
6067 cluster_id
= match
[1];
6069 dout(20) << " parsed cluster_id=" << cluster_id
<< ", fs_id=" << fs_id
<< dendl
;
6073 int Server::mirror_info_xattr_validate(CInode
*cur
, const InodeStoreBase::xattr_map_const_ptr xattrs
,
6074 XattrOp
*xattr_op
) {
6075 if (!cur
->is_root()) {
6076 return -CEPHFS_EINVAL
;
6079 int v1
= xattr_validate(cur
, xattrs
, Server::MirrorXattrInfo::CLUSTER_ID
, xattr_op
->op
, xattr_op
->flags
);
6080 int v2
= xattr_validate(cur
, xattrs
, Server::MirrorXattrInfo::FS_ID
, xattr_op
->op
, xattr_op
->flags
);
6082 derr
<< "inconsistent mirror info state (" << v1
<< "," << v2
<< ")" << dendl
;
6083 return -CEPHFS_EINVAL
;
6090 if (xattr_op
->op
== CEPH_MDS_OP_RMXATTR
) {
6094 std::string cluster_id
;
6096 int r
= parse_mirror_info_xattr(xattr_op
->xattr_name
, xattr_op
->xattr_value
.to_str(),
6102 xattr_op
->xinfo
= std::make_unique
<MirrorXattrInfo
>(cluster_id
, fs_id
);
6106 void Server::mirror_info_setxattr_handler(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
6107 const XattrOp
&xattr_op
) {
6108 auto mirror_info
= dynamic_cast<MirrorXattrInfo
&>(*(xattr_op
.xinfo
));
6111 bl
.append(mirror_info
.cluster_id
.c_str(), mirror_info
.cluster_id
.length());
6112 xattr_set(xattrs
, Server::MirrorXattrInfo::CLUSTER_ID
, bl
);
6115 bl
.append(mirror_info
.fs_id
.c_str(), mirror_info
.fs_id
.length());
6116 xattr_set(xattrs
, Server::MirrorXattrInfo::FS_ID
, bl
);
6119 void Server::mirror_info_removexattr_handler(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
6120 const XattrOp
&xattr_op
) {
6121 xattr_rm(xattrs
, Server::MirrorXattrInfo::CLUSTER_ID
);
6122 xattr_rm(xattrs
, Server::MirrorXattrInfo::FS_ID
);
6125 void Server::handle_client_setxattr(MDRequestRef
& mdr
)
6127 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6128 string
name(req
->get_path2());
6130 // is a ceph virtual xattr?
6131 if (is_ceph_vxattr(name
)) {
6132 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6133 CInode
*cur
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
6137 handle_set_vxattr(mdr
, cur
);
6141 if (!is_allowed_ceph_xattr(name
)) {
6142 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6146 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
6150 if (mdr
->snapid
!= CEPH_NOSNAP
) {
6151 respond_to_request(mdr
, -CEPHFS_EROFS
);
6155 int flags
= req
->head
.args
.setxattr
.flags
;
6157 MutationImpl::LockOpVec lov
;
6158 lov
.add_xlock(&cur
->xattrlock
);
6159 if (!mds
->locker
->acquire_locks(mdr
, lov
))
6162 if (!check_access(mdr
, cur
, MAY_WRITE
))
6165 size_t len
= req
->get_data().length();
6166 size_t inc
= len
+ name
.length();
6168 auto handler
= Server::get_xattr_or_default_handler(name
);
6169 const auto& pxattrs
= cur
->get_projected_xattrs();
6171 // check xattrs kv pairs size
6172 size_t cur_xattrs_size
= 0;
6173 for (const auto& p
: *pxattrs
) {
6174 if ((flags
& CEPH_XATTR_REPLACE
) && name
.compare(p
.first
) == 0) {
6177 cur_xattrs_size
+= p
.first
.length() + p
.second
.length();
6180 if (((cur_xattrs_size
+ inc
) > g_conf()->mds_max_xattr_pairs_size
)) {
6181 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
6182 << cur_xattrs_size
<< ", inc " << inc
<< dendl
;
6183 respond_to_request(mdr
, -CEPHFS_ENOSPC
);
6188 XattrOp
xattr_op(CEPH_MDS_OP_SETXATTR
, name
, req
->get_data(), flags
);
6189 int r
= std::invoke(handler
->validate
, this, cur
, pxattrs
, &xattr_op
);
6191 respond_to_request(mdr
, r
);
6195 dout(10) << "setxattr '" << name
<< "' len " << len
<< " on " << *cur
<< dendl
;
6198 auto pi
= cur
->project_inode(mdr
, true);
6199 pi
.inode
->version
= cur
->pre_dirty();
6200 pi
.inode
->ctime
= mdr
->get_op_stamp();
6201 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
6202 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
6203 if (name
== "encryption.ctx"sv
)
6204 pi
.inode
->fscrypt
= true;
6205 pi
.inode
->change_attr
++;
6206 pi
.inode
->xattr_version
++;
6208 if ((flags
& CEPH_XATTR_REMOVE
)) {
6209 std::invoke(handler
->removexattr
, this, cur
, pi
.xattrs
, xattr_op
);
6211 std::invoke(handler
->setxattr
, this, cur
, pi
.xattrs
, xattr_op
);
6215 mdr
->ls
= mdlog
->get_current_segment();
6216 EUpdate
*le
= new EUpdate(mdlog
, "setxattr");
6217 mdlog
->start_entry(le
);
6218 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6219 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
6220 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
6222 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
6225 void Server::handle_client_removexattr(MDRequestRef
& mdr
)
6227 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6228 std::string
name(req
->get_path2());
6230 // is a ceph virtual xattr?
6231 if (is_ceph_vxattr(name
)) {
6232 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6233 CInode
*cur
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
6237 handle_remove_vxattr(mdr
, cur
);
6241 if (!is_allowed_ceph_xattr(name
)) {
6242 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6246 CInode
* cur
= rdlock_path_pin_ref(mdr
, true);
6250 if (mdr
->snapid
!= CEPH_NOSNAP
) {
6251 respond_to_request(mdr
, -CEPHFS_EROFS
);
6255 MutationImpl::LockOpVec lov
;
6256 lov
.add_xlock(&cur
->xattrlock
);
6257 if (!mds
->locker
->acquire_locks(mdr
, lov
))
6261 auto handler
= Server::get_xattr_or_default_handler(name
);
6263 XattrOp
xattr_op(CEPH_MDS_OP_RMXATTR
, name
, bl
, 0);
6265 const auto& pxattrs
= cur
->get_projected_xattrs();
6266 int r
= std::invoke(handler
->validate
, this, cur
, pxattrs
, &xattr_op
);
6268 respond_to_request(mdr
, r
);
6272 dout(10) << "removexattr '" << name
<< "' on " << *cur
<< dendl
;
6275 auto pi
= cur
->project_inode(mdr
, true);
6276 pi
.inode
->version
= cur
->pre_dirty();
6277 pi
.inode
->ctime
= mdr
->get_op_stamp();
6278 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
6279 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
6280 pi
.inode
->change_attr
++;
6281 pi
.inode
->xattr_version
++;
6282 std::invoke(handler
->removexattr
, this, cur
, pi
.xattrs
, xattr_op
);
6285 mdr
->ls
= mdlog
->get_current_segment();
6286 EUpdate
*le
= new EUpdate(mdlog
, "removexattr");
6287 mdlog
->start_entry(le
);
6288 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6289 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
6290 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
6292 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
6296 // =================================================================
6297 // DIRECTORY and NAMESPACE OPS
6300 // ------------------------------------------------
6304 class C_MDS_mknod_finish
: public ServerLogContext
{
6308 C_MDS_mknod_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ni
) :
6309 ServerLogContext(s
, r
), dn(d
), newi(ni
) {}
6310 void finish(int r
) override
{
6311 ceph_assert(r
== 0);
6314 dn
->pop_projected_linkage();
6316 // be a bit hacky with the inode version, here.. we decrement it
6317 // just to keep mark_dirty() happen. (we didn't bother projecting
6318 // a new version of hte inode since it's just been created)
6319 newi
->mark_dirty(mdr
->ls
);
6320 newi
->mark_dirty_parent(mdr
->ls
, true);
6323 if (newi
->is_dir()) {
6324 CDir
*dir
= newi
->get_dirfrag(frag_t());
6326 dir
->mark_dirty(mdr
->ls
);
6327 dir
->mark_new(mdr
->ls
);
6332 MDRequestRef null_ref
;
6333 get_mds()->mdcache
->send_dentry_link(dn
, null_ref
);
6335 if (newi
->is_file()) {
6336 get_mds()->locker
->share_inode_max_size(newi
);
6337 } else if (newi
->is_dir()) {
6338 // We do this now so that the linkages on the new directory are stable.
6339 newi
->maybe_ephemeral_rand();
6343 get_mds()->balancer
->hit_inode(newi
, META_POP_IWR
);
6346 server
->respond_to_request(mdr
, 0);
6351 void Server::handle_client_mknod(MDRequestRef
& mdr
)
6353 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6354 client_t client
= mdr
->get_client();
6356 unsigned mode
= req
->head
.args
.mknod
.mode
;
6357 if ((mode
& S_IFMT
) == 0)
6360 mdr
->disable_lock_cache();
6361 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, true, false, S_ISREG(mode
));
6365 CDir
*dir
= dn
->get_dir();
6366 CInode
*diri
= dir
->get_inode();
6367 if (!check_access(mdr
, diri
, MAY_WRITE
))
6369 if (!check_fragment_space(mdr
, dir
))
6371 if (!check_dir_max_entries(mdr
, dir
))
6374 ceph_assert(dn
->get_projected_linkage()->is_null());
6375 if (req
->get_alternate_name().size() > alternate_name_max
) {
6376 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
6377 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
6380 dn
->set_alternate_name(req
->get_alternate_name());
6383 file_layout_t layout
;
6384 if (mdr
->dir_layout
!= file_layout_t())
6385 layout
= mdr
->dir_layout
;
6387 layout
= mdcache
->default_file_layout
;
6389 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
), mode
, &layout
);
6392 dn
->push_projected_linkage(newi
);
6394 auto _inode
= newi
->_get_inode();
6395 _inode
->version
= dn
->pre_dirty();
6396 _inode
->rdev
= req
->head
.args
.mknod
.rdev
;
6397 _inode
->rstat
.rfiles
= 1;
6398 _inode
->accounted_rstat
= _inode
->rstat
;
6399 if (layout
.pool_id
!= mdcache
->default_file_layout
.pool_id
)
6400 _inode
->add_old_pool(mdcache
->default_file_layout
.pool_id
);
6401 _inode
->update_backtrace();
6403 snapid_t follows
= mdcache
->get_global_snaprealm()->get_newest_seq();
6404 SnapRealm
*realm
= dn
->get_dir()->inode
->find_snaprealm();
6405 ceph_assert(follows
>= realm
->get_newest_seq());
6407 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
6408 // want to write to it (e.g., if they are reexporting NFS)
6409 if (S_ISREG(_inode
->mode
)) {
6410 // issue a cap on the file
6411 int cmode
= CEPH_FILE_MODE_RDWR
;
6412 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
, realm
);
6416 // put locks in excl mode
6417 newi
->filelock
.set_state(LOCK_EXCL
);
6418 newi
->authlock
.set_state(LOCK_EXCL
);
6419 newi
->xattrlock
.set_state(LOCK_EXCL
);
6421 dout(15) << " setting a client_range too, since this is a regular file" << dendl
;
6422 _inode
->client_ranges
[client
].range
.first
= 0;
6423 _inode
->client_ranges
[client
].range
.last
= _inode
->layout
.stripe_unit
;
6424 _inode
->client_ranges
[client
].follows
= follows
;
6425 newi
->mark_clientwriteable();
6426 cap
->mark_clientwriteable();
6430 ceph_assert(dn
->first
== follows
+ 1);
6431 newi
->first
= dn
->first
;
6433 dout(10) << "mknod mode " << _inode
->mode
<< " rdev " << _inode
->rdev
<< dendl
;
6436 mdr
->ls
= mdlog
->get_current_segment();
6437 EUpdate
*le
= new EUpdate(mdlog
, "mknod");
6438 mdlog
->start_entry(le
);
6439 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6440 journal_allocated_inos(mdr
, &le
->metablob
);
6442 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(),
6443 PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
6444 le
->metablob
.add_primary_dentry(dn
, newi
, true, true, true);
6446 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
6447 mds
->balancer
->maybe_fragment(dn
->get_dir(), false);
6453 /* This function takes responsibility for the passed mdr*/
6454 void Server::handle_client_mkdir(MDRequestRef
& mdr
)
6456 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6458 mdr
->disable_lock_cache();
6459 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, true);
6463 CDir
*dir
= dn
->get_dir();
6464 CInode
*diri
= dir
->get_inode();
6466 // mkdir check access
6467 if (!check_access(mdr
, diri
, MAY_WRITE
))
6470 if (!check_fragment_space(mdr
, dir
))
6472 if (!check_dir_max_entries(mdr
, dir
))
6475 ceph_assert(dn
->get_projected_linkage()->is_null());
6476 if (req
->get_alternate_name().size() > alternate_name_max
) {
6477 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
6478 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
6481 dn
->set_alternate_name(req
->get_alternate_name());
6484 unsigned mode
= req
->head
.args
.mkdir
.mode
;
6487 CInode
*newi
= prepare_new_inode(mdr
, dir
, inodeno_t(req
->head
.ino
), mode
);
6490 // it's a directory.
6491 dn
->push_projected_linkage(newi
);
6493 auto _inode
= newi
->_get_inode();
6494 _inode
->version
= dn
->pre_dirty();
6495 _inode
->rstat
.rsubdirs
= 1;
6496 _inode
->accounted_rstat
= _inode
->rstat
;
6497 _inode
->update_backtrace();
6499 snapid_t follows
= mdcache
->get_global_snaprealm()->get_newest_seq();
6500 SnapRealm
*realm
= dn
->get_dir()->inode
->find_snaprealm();
6501 ceph_assert(follows
>= realm
->get_newest_seq());
6503 dout(12) << " follows " << follows
<< dendl
;
6504 ceph_assert(dn
->first
== follows
+ 1);
6505 newi
->first
= dn
->first
;
6507 // ...and that new dir is empty.
6508 CDir
*newdir
= newi
->get_or_open_dirfrag(mdcache
, frag_t());
6509 newdir
->state_set(CDir::STATE_CREATING
);
6510 newdir
->mark_complete();
6511 newdir
->_get_fnode()->version
= newdir
->pre_dirty();
6514 mdr
->ls
= mdlog
->get_current_segment();
6515 EUpdate
*le
= new EUpdate(mdlog
, "mkdir");
6516 mdlog
->start_entry(le
);
6517 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6518 journal_allocated_inos(mdr
, &le
->metablob
);
6519 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
6520 le
->metablob
.add_primary_dentry(dn
, newi
, true, true);
6521 le
->metablob
.add_new_dir(newdir
); // dirty AND complete AND new
6523 // issue a cap on the directory
6524 int cmode
= CEPH_FILE_MODE_RDWR
;
6525 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
, realm
);
6529 // put locks in excl mode
6530 newi
->filelock
.set_state(LOCK_EXCL
);
6531 newi
->authlock
.set_state(LOCK_EXCL
);
6532 newi
->xattrlock
.set_state(LOCK_EXCL
);
6535 // make sure this inode gets into the journal
6536 le
->metablob
.add_opened_ino(newi
->ino());
6538 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
6540 // We hit_dir (via hit_inode) in our finish callback, but by then we might
6541 // have overshot the split size (multiple mkdir in flight), so here is
6542 // an early chance to split the dir if this mkdir makes it oversized.
6543 mds
->balancer
->maybe_fragment(dir
, false);
6549 void Server::handle_client_symlink(MDRequestRef
& mdr
)
6551 const auto& req
= mdr
->client_request
;
6553 mdr
->disable_lock_cache();
6554 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, true);
6558 CDir
*dir
= dn
->get_dir();
6559 CInode
*diri
= dir
->get_inode();
6561 if (!check_access(mdr
, diri
, MAY_WRITE
))
6563 if (!check_fragment_space(mdr
, dir
))
6565 if (!check_dir_max_entries(mdr
, dir
))
6568 ceph_assert(dn
->get_projected_linkage()->is_null());
6569 if (req
->get_alternate_name().size() > alternate_name_max
) {
6570 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
6571 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
6573 dn
->set_alternate_name(req
->get_alternate_name());
6575 unsigned mode
= S_IFLNK
| 0777;
6576 CInode
*newi
= prepare_new_inode(mdr
, dir
, inodeno_t(req
->head
.ino
), mode
);
6580 dn
->push_projected_linkage(newi
);
6582 newi
->symlink
= req
->get_path2();
6583 auto _inode
= newi
->_get_inode();
6584 _inode
->version
= dn
->pre_dirty();
6585 _inode
->size
= newi
->symlink
.length();
6586 _inode
->rstat
.rbytes
= _inode
->size
;
6587 _inode
->rstat
.rfiles
= 1;
6588 _inode
->accounted_rstat
= _inode
->rstat
;
6589 _inode
->update_backtrace();
6591 newi
->first
= dn
->first
;
6594 mdr
->ls
= mdlog
->get_current_segment();
6595 EUpdate
*le
= new EUpdate(mdlog
, "symlink");
6596 mdlog
->start_entry(le
);
6597 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6598 journal_allocated_inos(mdr
, &le
->metablob
);
6599 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
6600 le
->metablob
.add_primary_dentry(dn
, newi
, true, true);
6602 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
6603 mds
->balancer
->maybe_fragment(dir
, false);
6612 void Server::handle_client_link(MDRequestRef
& mdr
)
6614 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6616 dout(7) << "handle_client_link " << req
->get_filepath()
6617 << " to " << req
->get_filepath2()
6620 mdr
->disable_lock_cache();
6625 if (req
->get_filepath2().depth() == 0) {
6626 targeti
= mdcache
->get_inode(req
->get_filepath2().get_ino());
6628 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl
;
6629 mdcache
->find_ino_peers(req
->get_filepath2().get_ino(), new C_MDS_TryFindInode(this, mdr
));
6634 if (!(mdr
->locking_state
& MutationImpl::SNAP2_LOCKED
)) {
6635 CDentry
*pdn
= targeti
->get_projected_parent_dn();
6637 dout(7) << "target has no parent dn, failing..." << dendl
;
6638 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6641 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
, 1))
6643 mdr
->locking_state
|= MutationImpl::SNAP2_LOCKED
;
6646 destdn
= rdlock_path_xlock_dentry(mdr
, false);
6650 auto ret
= rdlock_two_paths_xlock_destdn(mdr
, false);
6655 if (!destdn
->get_projected_linkage()->is_null()) {
6656 respond_to_request(mdr
, -CEPHFS_EEXIST
);
6660 targeti
= ret
.second
->get_projected_linkage()->get_inode();
6663 ceph_assert(destdn
->get_projected_linkage()->is_null());
6664 if (req
->get_alternate_name().size() > alternate_name_max
) {
6665 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
6666 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
6669 destdn
->set_alternate_name(req
->get_alternate_name());
6671 if (targeti
->is_dir()) {
6672 dout(7) << "target is a dir, failing..." << dendl
;
6673 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6677 CDir
*dir
= destdn
->get_dir();
6678 dout(7) << "handle_client_link link " << destdn
->get_name() << " in " << *dir
<< dendl
;
6679 dout(7) << "target is " << *targeti
<< dendl
;
6681 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
6682 MutationImpl::LockOpVec lov
;
6683 lov
.add_xlock(&targeti
->snaplock
);
6684 lov
.add_xlock(&targeti
->linklock
);
6686 if (!mds
->locker
->acquire_locks(mdr
, lov
))
6689 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
6692 if (targeti
->get_projected_inode()->nlink
== 0) {
6693 dout(7) << "target has no link, failing..." << dendl
;
6694 respond_to_request(mdr
, -CEPHFS_ENOENT
);
6698 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
6699 if (!check_access(mdr
, targeti
, MAY_WRITE
))
6702 if (!check_access(mdr
, dir
->get_inode(), MAY_WRITE
))
6705 if (!check_fragment_space(mdr
, dir
))
6708 if (!check_dir_max_entries(mdr
, dir
))
6712 CInode
* target_pin
= targeti
->get_projected_parent_dir()->inode
;
6713 SnapRealm
*target_realm
= target_pin
->find_snaprealm();
6714 if (target_pin
!= dir
->inode
&&
6715 target_realm
->get_subvolume_ino() !=
6716 dir
->inode
->find_snaprealm()->get_subvolume_ino()) {
6717 dout(7) << "target is in different subvolume, failing..." << dendl
;
6718 respond_to_request(mdr
, -CEPHFS_EXDEV
);
6723 ceph_assert(g_conf()->mds_kill_link_at
!= 1);
6726 if (targeti
->is_auth())
6727 _link_local(mdr
, destdn
, targeti
, target_realm
);
6729 _link_remote(mdr
, true, destdn
, targeti
);
6730 mds
->balancer
->maybe_fragment(dir
, false);
6734 class C_MDS_link_local_finish
: public ServerLogContext
{
6741 C_MDS_link_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ti
,
6742 version_t dnpv_
, version_t tipv_
, bool ar
) :
6743 ServerLogContext(s
, r
), dn(d
), targeti(ti
),
6744 dnpv(dnpv_
), tipv(tipv_
), adjust_realm(ar
) { }
6745 void finish(int r
) override
{
6746 ceph_assert(r
== 0);
6747 server
->_link_local_finish(mdr
, dn
, targeti
, dnpv
, tipv
, adjust_realm
);
6752 void Server::_link_local(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
, SnapRealm
*target_realm
)
6754 dout(10) << "_link_local " << *dn
<< " to " << *targeti
<< dendl
;
6756 mdr
->ls
= mdlog
->get_current_segment();
6758 // predirty NEW dentry
6759 version_t dnpv
= dn
->pre_dirty();
6760 version_t tipv
= targeti
->pre_dirty();
6762 // project inode update
6763 auto pi
= targeti
->project_inode(mdr
);
6765 pi
.inode
->ctime
= mdr
->get_op_stamp();
6766 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
6767 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
6768 pi
.inode
->change_attr
++;
6769 pi
.inode
->version
= tipv
;
6771 bool adjust_realm
= false;
6772 if (!target_realm
->get_subvolume_ino() && !targeti
->is_projected_snaprealm_global()) {
6773 sr_t
*newsnap
= targeti
->project_snaprealm();
6774 targeti
->mark_snaprealm_global(newsnap
);
6775 targeti
->record_snaprealm_parent_dentry(newsnap
, target_realm
, targeti
->get_projected_parent_dn(), true);
6776 adjust_realm
= true;
6780 EUpdate
*le
= new EUpdate(mdlog
, "link_local");
6781 mdlog
->start_entry(le
);
6782 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
6783 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1); // new dn
6784 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, 0, PREDIRTY_PRIMARY
); // targeti
6785 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
6786 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, targeti
);
6788 // do this after predirty_*, to avoid funky extra dnl arg
6789 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
6791 journal_and_reply(mdr
, targeti
, dn
, le
,
6792 new C_MDS_link_local_finish(this, mdr
, dn
, targeti
, dnpv
, tipv
, adjust_realm
));
6795 void Server::_link_local_finish(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
,
6796 version_t dnpv
, version_t tipv
, bool adjust_realm
)
6798 dout(10) << "_link_local_finish " << *dn
<< " to " << *targeti
<< dendl
;
6800 // link and unlock the NEW dentry
6801 CDentry::linkage_t
*dnl
= dn
->pop_projected_linkage();
6802 if (!dnl
->get_inode())
6803 dn
->link_remote(dnl
, targeti
);
6804 dn
->mark_dirty(dnpv
, mdr
->ls
);
6809 MDRequestRef null_ref
;
6810 mdcache
->send_dentry_link(dn
, null_ref
);
6813 int op
= CEPH_SNAP_OP_SPLIT
;
6814 mds
->mdcache
->send_snap_update(targeti
, 0, op
);
6815 mds
->mdcache
->do_realm_invalidate_and_update_notify(targeti
, op
);
6818 // bump target popularity
6819 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
6820 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
6823 respond_to_request(mdr
, 0);
6827 // link / unlink remote
6829 class C_MDS_link_remote_finish
: public ServerLogContext
{
6835 C_MDS_link_remote_finish(Server
*s
, MDRequestRef
& r
, bool i
, CDentry
*d
, CInode
*ti
) :
6836 ServerLogContext(s
, r
), inc(i
), dn(d
), targeti(ti
),
6837 dpv(d
->get_projected_version()) {}
6838 void finish(int r
) override
{
6839 ceph_assert(r
== 0);
6840 server
->_link_remote_finish(mdr
, inc
, dn
, targeti
, dpv
);
6844 void Server::_link_remote(MDRequestRef
& mdr
, bool inc
, CDentry
*dn
, CInode
*targeti
)
6846 dout(10) << "_link_remote "
6847 << (inc
? "link ":"unlink ")
6848 << *dn
<< " to " << *targeti
<< dendl
;
6850 // 1. send LinkPrepare to dest (journal nlink++ prepare)
6851 mds_rank_t linkauth
= targeti
->authority().first
;
6852 if (mdr
->more()->witnessed
.count(linkauth
) == 0) {
6853 if (mds
->is_cluster_degraded() &&
6854 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(linkauth
)) {
6855 dout(10) << " targeti auth mds." << linkauth
<< " is not active" << dendl
;
6856 if (mdr
->more()->waiting_on_peer
.empty())
6857 mds
->wait_for_active_peer(linkauth
, new C_MDS_RetryRequest(mdcache
, mdr
));
6861 dout(10) << " targeti auth must prepare nlink++/--" << dendl
;
6864 op
= MMDSPeerRequest::OP_LINKPREP
;
6866 op
= MMDSPeerRequest::OP_UNLINKPREP
;
6867 auto req
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, op
);
6868 targeti
->set_object_info(req
->get_object_info());
6869 req
->op_stamp
= mdr
->get_op_stamp();
6870 if (auto& desti_srnode
= mdr
->more()->desti_srnode
)
6871 encode(*desti_srnode
, req
->desti_snapbl
);
6872 mds
->send_message_mds(req
, linkauth
);
6874 ceph_assert(mdr
->more()->waiting_on_peer
.count(linkauth
) == 0);
6875 mdr
->more()->waiting_on_peer
.insert(linkauth
);
6878 dout(10) << " targeti auth has prepared nlink++/--" << dendl
;
6880 ceph_assert(g_conf()->mds_kill_link_at
!= 2);
6882 if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
6883 delete desti_srnode
;
6884 desti_srnode
= NULL
;
6887 mdr
->set_mds_stamp(ceph_clock_now());
6890 mdr
->ls
= mdlog
->get_current_segment();
6891 EUpdate
*le
= new EUpdate(mdlog
, inc
? "link_remote":"unlink_remote");
6892 mdlog
->start_entry(le
);
6893 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
6894 if (!mdr
->more()->witnessed
.empty()) {
6895 dout(20) << " noting uncommitted_peers " << mdr
->more()->witnessed
<< dendl
;
6896 le
->reqid
= mdr
->reqid
;
6897 le
->had_peers
= true;
6898 mdcache
->add_uncommitted_leader(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
6903 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1);
6904 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
6905 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
6908 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, -1);
6909 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
6910 le
->metablob
.add_null_dentry(dn
, true);
6911 dn
->push_projected_linkage();
6914 journal_and_reply(mdr
, (inc
? targeti
: nullptr), dn
, le
,
6915 new C_MDS_link_remote_finish(this, mdr
, inc
, dn
, targeti
));
6918 void Server::_link_remote_finish(MDRequestRef
& mdr
, bool inc
,
6919 CDentry
*dn
, CInode
*targeti
,
6922 dout(10) << "_link_remote_finish "
6923 << (inc
? "link ":"unlink ")
6924 << *dn
<< " to " << *targeti
<< dendl
;
6926 ceph_assert(g_conf()->mds_kill_link_at
!= 3);
6928 if (!mdr
->more()->witnessed
.empty())
6929 mdcache
->logged_leader_update(mdr
->reqid
);
6932 // link the new dentry
6933 CDentry::linkage_t
*dnl
= dn
->pop_projected_linkage();
6934 if (!dnl
->get_inode())
6935 dn
->link_remote(dnl
, targeti
);
6936 dn
->mark_dirty(dpv
, mdr
->ls
);
6938 // unlink main dentry
6939 dn
->get_dir()->unlink_inode(dn
);
6940 dn
->pop_projected_linkage();
6941 dn
->mark_dirty(dn
->get_projected_version(), mdr
->ls
); // dirty old dentry
6946 MDRequestRef null_ref
;
6948 mdcache
->send_dentry_link(dn
, null_ref
);
6950 mdcache
->send_dentry_unlink(dn
, NULL
, null_ref
);
6952 // bump target popularity
6953 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
6954 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
6957 respond_to_request(mdr
, 0);
6960 // removing a new dn?
6961 dn
->get_dir()->try_remove_unlinked_dn(dn
);
6965 // remote linking/unlinking
6967 class C_MDS_PeerLinkPrep
: public ServerLogContext
{
6971 C_MDS_PeerLinkPrep(Server
*s
, MDRequestRef
& r
, CInode
*t
, bool ar
) :
6972 ServerLogContext(s
, r
), targeti(t
), adjust_realm(ar
) { }
6973 void finish(int r
) override
{
6974 ceph_assert(r
== 0);
6975 server
->_logged_peer_link(mdr
, targeti
, adjust_realm
);
6979 class C_MDS_PeerLinkCommit
: public ServerContext
{
6983 C_MDS_PeerLinkCommit(Server
*s
, MDRequestRef
& r
, CInode
*t
) :
6984 ServerContext(s
), mdr(r
), targeti(t
) { }
6985 void finish(int r
) override
{
6986 server
->_commit_peer_link(mdr
, r
, targeti
);
6990 void Server::handle_peer_link_prep(MDRequestRef
& mdr
)
6992 dout(10) << "handle_peer_link_prep " << *mdr
6993 << " on " << mdr
->peer_request
->get_object_info()
6996 ceph_assert(g_conf()->mds_kill_link_at
!= 4);
6998 CInode
*targeti
= mdcache
->get_inode(mdr
->peer_request
->get_object_info().ino
);
6999 ceph_assert(targeti
);
7000 dout(10) << "targeti " << *targeti
<< dendl
;
7001 CDentry
*dn
= targeti
->get_parent_dn();
7002 CDentry::linkage_t
*dnl
= dn
->get_linkage();
7003 ceph_assert(dnl
->is_primary());
7005 mdr
->set_op_stamp(mdr
->peer_request
->op_stamp
);
7007 mdr
->auth_pin(targeti
);
7009 //ceph_abort(); // test hack: make sure leader can handle a peer that fails to prepare...
7010 ceph_assert(g_conf()->mds_kill_link_at
!= 5);
7013 mdr
->ls
= mdlog
->get_current_segment();
7014 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_link_prep", mdr
->reqid
, mdr
->peer_to_mds
,
7015 EPeerUpdate::OP_PREPARE
, EPeerUpdate::LINK
);
7016 mdlog
->start_entry(le
);
7018 auto pi
= dnl
->get_inode()->project_inode(mdr
);
7020 // update journaled target inode
7022 bool adjust_realm
= false;
7023 bool realm_projected
= false;
7024 if (mdr
->peer_request
->get_op() == MMDSPeerRequest::OP_LINKPREP
) {
7028 CDentry
*target_pdn
= targeti
->get_projected_parent_dn();
7029 SnapRealm
*target_realm
= target_pdn
->get_dir()->inode
->find_snaprealm();
7030 if (!target_realm
->get_subvolume_ino() && !targeti
->is_projected_snaprealm_global()) {
7031 sr_t
*newsnap
= targeti
->project_snaprealm();
7032 targeti
->mark_snaprealm_global(newsnap
);
7033 targeti
->record_snaprealm_parent_dentry(newsnap
, target_realm
, target_pdn
, true);
7034 adjust_realm
= true;
7035 realm_projected
= true;
7040 if (targeti
->is_projected_snaprealm_global()) {
7041 ceph_assert(mdr
->peer_request
->desti_snapbl
.length());
7042 auto p
= mdr
->peer_request
->desti_snapbl
.cbegin();
7044 sr_t
*newsnap
= targeti
->project_snaprealm();
7045 decode(*newsnap
, p
);
7047 if (pi
.inode
->nlink
== 0)
7048 ceph_assert(!newsnap
->is_parent_global());
7050 realm_projected
= true;
7052 ceph_assert(mdr
->peer_request
->desti_snapbl
.length() == 0);
7056 link_rollback rollback
;
7057 rollback
.reqid
= mdr
->reqid
;
7058 rollback
.ino
= targeti
->ino();
7059 rollback
.old_ctime
= targeti
->get_inode()->ctime
; // we hold versionlock xlock; no concorrent projections
7060 const auto& pf
= targeti
->get_parent_dn()->get_dir()->get_projected_fnode();
7061 rollback
.old_dir_mtime
= pf
->fragstat
.mtime
;
7062 rollback
.old_dir_rctime
= pf
->rstat
.rctime
;
7063 rollback
.was_inc
= inc
;
7064 if (realm_projected
) {
7065 if (targeti
->snaprealm
) {
7066 encode(true, rollback
.snapbl
);
7067 targeti
->encode_snap_blob(rollback
.snapbl
);
7069 encode(false, rollback
.snapbl
);
7072 encode(rollback
, le
->rollback
);
7073 mdr
->more()->rollback_bl
= le
->rollback
;
7075 pi
.inode
->ctime
= mdr
->get_op_stamp();
7076 pi
.inode
->version
= targeti
->pre_dirty();
7078 dout(10) << " projected inode " << pi
.inode
->ino
<< " v " << pi
.inode
->version
<< dendl
;
7081 mdcache
->predirty_journal_parents(mdr
, &le
->commit
, dnl
->get_inode(), 0, PREDIRTY_SHALLOW
|PREDIRTY_PRIMARY
);
7082 mdcache
->journal_dirty_inode(mdr
.get(), &le
->commit
, targeti
);
7083 mdcache
->add_uncommitted_peer(mdr
->reqid
, mdr
->ls
, mdr
->peer_to_mds
);
7085 // set up commit waiter
7086 mdr
->more()->peer_commit
= new C_MDS_PeerLinkCommit(this, mdr
, targeti
);
7088 mdr
->more()->peer_update_journaled
= true;
7089 submit_mdlog_entry(le
, new C_MDS_PeerLinkPrep(this, mdr
, targeti
, adjust_realm
),
7094 void Server::_logged_peer_link(MDRequestRef
& mdr
, CInode
*targeti
, bool adjust_realm
)
7096 dout(10) << "_logged_peer_link " << *mdr
7097 << " " << *targeti
<< dendl
;
7099 ceph_assert(g_conf()->mds_kill_link_at
!= 6);
7101 // update the target
7105 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
7108 mdr
->reset_peer_request();
7111 int op
= CEPH_SNAP_OP_SPLIT
;
7112 mds
->mdcache
->send_snap_update(targeti
, 0, op
);
7113 mds
->mdcache
->do_realm_invalidate_and_update_notify(targeti
, op
);
7117 if (!mdr
->aborted
) {
7118 auto reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_LINKPREPACK
);
7119 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
7121 dout(10) << " abort flag set, finishing" << dendl
;
7122 mdcache
->request_finish(mdr
);
7127 struct C_MDS_CommittedPeer
: public ServerLogContext
{
7128 C_MDS_CommittedPeer(Server
*s
, MDRequestRef
& m
) : ServerLogContext(s
, m
) {}
7129 void finish(int r
) override
{
7130 server
->_committed_peer(mdr
);
7134 void Server::_commit_peer_link(MDRequestRef
& mdr
, int r
, CInode
*targeti
)
7136 dout(10) << "_commit_peer_link " << *mdr
7138 << " " << *targeti
<< dendl
;
7140 ceph_assert(g_conf()->mds_kill_link_at
!= 7);
7143 // drop our pins, etc.
7146 // write a commit to the journal
7147 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_link_commit", mdr
->reqid
, mdr
->peer_to_mds
,
7148 EPeerUpdate::OP_COMMIT
, EPeerUpdate::LINK
);
7149 mdlog
->start_entry(le
);
7150 submit_mdlog_entry(le
, new C_MDS_CommittedPeer(this, mdr
), mdr
, __func__
);
7153 do_link_rollback(mdr
->more()->rollback_bl
, mdr
->peer_to_mds
, mdr
);
7157 void Server::_committed_peer(MDRequestRef
& mdr
)
7159 dout(10) << "_committed_peer " << *mdr
<< dendl
;
7161 ceph_assert(g_conf()->mds_kill_link_at
!= 8);
7163 bool assert_exist
= mdr
->more()->peer_update_journaled
;
7164 mdcache
->finish_uncommitted_peer(mdr
->reqid
, assert_exist
);
7165 auto req
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_COMMITTED
);
7166 mds
->send_message_mds(req
, mdr
->peer_to_mds
);
7167 mdcache
->request_finish(mdr
);
7170 struct C_MDS_LoggedLinkRollback
: public ServerLogContext
{
7172 map
<client_t
,ref_t
<MClientSnap
>> splits
;
7173 C_MDS_LoggedLinkRollback(Server
*s
, MutationRef
& m
, MDRequestRef
& r
,
7174 map
<client_t
,ref_t
<MClientSnap
>>&& _splits
) :
7175 ServerLogContext(s
, r
), mut(m
), splits(std::move(_splits
)) {
7177 void finish(int r
) override
{
7178 server
->_link_rollback_finish(mut
, mdr
, splits
);
7182 void Server::do_link_rollback(bufferlist
&rbl
, mds_rank_t leader
, MDRequestRef
& mdr
)
7184 link_rollback rollback
;
7185 auto p
= rbl
.cbegin();
7186 decode(rollback
, p
);
7188 dout(10) << "do_link_rollback on " << rollback
.reqid
7189 << (rollback
.was_inc
? " inc":" dec")
7190 << " ino " << rollback
.ino
7193 ceph_assert(g_conf()->mds_kill_link_at
!= 9);
7195 mdcache
->add_rollback(rollback
.reqid
, leader
); // need to finish this update before resolve finishes
7196 ceph_assert(mdr
|| mds
->is_resolve());
7198 MutationRef
mut(new MutationImpl(nullptr, utime_t(), rollback
.reqid
));
7199 mut
->ls
= mds
->mdlog
->get_current_segment();
7201 CInode
*in
= mdcache
->get_inode(rollback
.ino
);
7203 dout(10) << " target is " << *in
<< dendl
;
7204 ceph_assert(!in
->is_projected()); // live peer request hold versionlock xlock.
7206 auto pi
= in
->project_inode(mut
);
7207 pi
.inode
->version
= in
->pre_dirty();
7209 // parent dir rctime
7210 CDir
*parent
= in
->get_projected_parent_dn()->get_dir();
7211 auto pf
= parent
->project_fnode(mut
);
7212 pf
->version
= parent
->pre_dirty();
7213 if (pf
->fragstat
.mtime
== pi
.inode
->ctime
) {
7214 pf
->fragstat
.mtime
= rollback
.old_dir_mtime
;
7215 if (pf
->rstat
.rctime
== pi
.inode
->ctime
)
7216 pf
->rstat
.rctime
= rollback
.old_dir_rctime
;
7217 mut
->add_updated_lock(&parent
->get_inode()->filelock
);
7218 mut
->add_updated_lock(&parent
->get_inode()->nestlock
);
7222 pi
.inode
->ctime
= rollback
.old_ctime
;
7223 if (rollback
.was_inc
)
7228 map
<client_t
,ref_t
<MClientSnap
>> splits
;
7229 if (rollback
.snapbl
.length() && in
->snaprealm
) {
7231 auto p
= rollback
.snapbl
.cbegin();
7232 decode(hadrealm
, p
);
7234 if (!mds
->is_resolve()) {
7235 sr_t
*new_srnode
= new sr_t();
7236 decode(*new_srnode
, p
);
7237 in
->project_snaprealm(new_srnode
);
7239 decode(in
->snaprealm
->srnode
, p
);
7242 SnapRealm
*realm
= parent
->get_inode()->find_snaprealm();
7243 if (!mds
->is_resolve())
7244 mdcache
->prepare_realm_merge(in
->snaprealm
, realm
, splits
);
7245 in
->project_snaprealm(NULL
);
7250 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_link_rollback", rollback
.reqid
, leader
,
7251 EPeerUpdate::OP_ROLLBACK
, EPeerUpdate::LINK
);
7252 mdlog
->start_entry(le
);
7253 le
->commit
.add_dir_context(parent
);
7254 le
->commit
.add_dir(parent
, true);
7255 le
->commit
.add_primary_dentry(in
->get_projected_parent_dn(), 0, true);
7257 submit_mdlog_entry(le
, new C_MDS_LoggedLinkRollback(this, mut
, mdr
, std::move(splits
)),
7262 void Server::_link_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
,
7263 map
<client_t
,ref_t
<MClientSnap
>>& splits
)
7265 dout(10) << "_link_rollback_finish" << dendl
;
7267 ceph_assert(g_conf()->mds_kill_link_at
!= 10);
7271 if (!mds
->is_resolve())
7272 mdcache
->send_snaps(splits
);
7275 mdcache
->request_finish(mdr
);
7277 mdcache
->finish_rollback(mut
->reqid
, mdr
);
7283 void Server::handle_peer_link_prep_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &m
)
7285 dout(10) << "handle_peer_link_prep_ack " << *mdr
7286 << " " << *m
<< dendl
;
7287 mds_rank_t from
= mds_rank_t(m
->get_source().num());
7289 ceph_assert(g_conf()->mds_kill_link_at
!= 11);
7292 mdr
->more()->peers
.insert(from
);
7295 ceph_assert(mdr
->more()->witnessed
.count(from
) == 0);
7296 mdr
->more()->witnessed
.insert(from
);
7297 ceph_assert(!m
->is_not_journaled());
7298 mdr
->more()->has_journaled_peers
= true;
7300 // remove from waiting list
7301 ceph_assert(mdr
->more()->waiting_on_peer
.count(from
));
7302 mdr
->more()->waiting_on_peer
.erase(from
);
7304 ceph_assert(mdr
->more()->waiting_on_peer
.empty());
7306 dispatch_client_request(mdr
); // go again!
7315 void Server::handle_client_unlink(MDRequestRef
& mdr
)
7317 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
7318 client_t client
= mdr
->get_client();
7321 bool rmdir
= (req
->get_op() == CEPH_MDS_OP_RMDIR
);
7324 mdr
->disable_lock_cache();
7325 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, false, true);
7329 CDentry::linkage_t
*dnl
= dn
->get_linkage(client
, mdr
);
7330 ceph_assert(!dnl
->is_null());
7331 CInode
*in
= dnl
->get_inode();
7334 dout(7) << "handle_client_rmdir on " << *dn
<< dendl
;
7336 dout(7) << "handle_client_unlink on " << *dn
<< dendl
;
7338 dout(7) << "dn links to " << *in
<< dendl
;
7343 // do empty directory checks
7344 if (_dir_is_nonempty_unlocked(mdr
, in
)) {
7345 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
7349 dout(7) << "handle_client_unlink on dir " << *in
<< ", returning error" << dendl
;
7350 respond_to_request(mdr
, -CEPHFS_EISDIR
);
7356 dout(7) << "handle_client_rmdir on non-dir " << *in
<< ", returning error" << dendl
;
7357 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
7362 CInode
*diri
= dn
->get_dir()->get_inode();
7363 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
7364 if (!check_access(mdr
, diri
, MAY_WRITE
))
7368 // -- create stray dentry? --
7369 CDentry
*straydn
= NULL
;
7370 if (dnl
->is_primary()) {
7371 straydn
= prepare_stray_dentry(mdr
, dnl
->get_inode());
7374 dout(10) << " straydn is " << *straydn
<< dendl
;
7375 } else if (mdr
->straydn
) {
7376 mdr
->unpin(mdr
->straydn
);
7377 mdr
->straydn
= NULL
;
7381 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
7382 MutationImpl::LockOpVec lov
;
7384 lov
.add_xlock(&in
->linklock
);
7385 lov
.add_xlock(&in
->snaplock
);
7387 lov
.add_rdlock(&in
->filelock
); // to verify it's empty
7390 lov
.add_wrlock(&straydn
->get_dir()->inode
->filelock
);
7391 lov
.add_wrlock(&straydn
->get_dir()->inode
->nestlock
);
7392 lov
.add_xlock(&straydn
->lock
);
7395 if (!mds
->locker
->acquire_locks(mdr
, lov
))
7398 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
7402 _dir_is_nonempty(mdr
, in
)) {
7403 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
7408 straydn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
7410 if (!mdr
->more()->desti_srnode
) {
7411 if (in
->is_projected_snaprealm_global()) {
7412 sr_t
*new_srnode
= in
->prepare_new_srnode(0);
7413 in
->record_snaprealm_parent_dentry(new_srnode
, nullptr, dn
, dnl
->is_primary());
7414 // dropping the last linkage or dropping the last remote linkage,
7415 // detch the inode from global snaprealm
7416 auto nlink
= in
->get_projected_inode()->nlink
;
7418 (nlink
== 2 && !dnl
->is_primary() &&
7419 !in
->get_projected_parent_dir()->inode
->is_stray()))
7420 in
->clear_snaprealm_global(new_srnode
);
7421 mdr
->more()->desti_srnode
= new_srnode
;
7422 } else if (dnl
->is_primary()) {
7423 // prepare snaprealm blob for peer request
7424 SnapRealm
*realm
= in
->find_snaprealm();
7425 snapid_t follows
= realm
->get_newest_seq();
7426 if (in
->snaprealm
|| follows
+ 1 > in
->get_oldest_snap()) {
7427 sr_t
*new_srnode
= in
->prepare_new_srnode(follows
);
7428 in
->record_snaprealm_past_parent(new_srnode
, straydn
->get_dir()->inode
->find_snaprealm());
7429 mdr
->more()->desti_srnode
= new_srnode
;
7435 if (in
->is_dir() && in
->has_subtree_root_dirfrag()) {
7436 // subtree root auths need to be witnesses
7437 set
<mds_rank_t
> witnesses
;
7438 in
->list_replicas(witnesses
);
7439 dout(10) << " witnesses " << witnesses
<< ", have " << mdr
->more()->witnessed
<< dendl
;
7441 for (set
<mds_rank_t
>::iterator p
= witnesses
.begin();
7442 p
!= witnesses
.end();
7444 if (mdr
->more()->witnessed
.count(*p
)) {
7445 dout(10) << " already witnessed by mds." << *p
<< dendl
;
7446 } else if (mdr
->more()->waiting_on_peer
.count(*p
)) {
7447 dout(10) << " already waiting on witness mds." << *p
<< dendl
;
7449 if (!_rmdir_prepare_witness(mdr
, *p
, mdr
->dn
[0], straydn
))
7453 if (!mdr
->more()->waiting_on_peer
.empty())
7454 return; // we're waiting for a witness.
7457 if (!rmdir
&& dnl
->is_primary() && mdr
->dn
[0].size() == 1)
7458 mds
->locker
->create_lock_cache(mdr
, diri
);
7461 if (dnl
->is_remote() && !dnl
->get_inode()->is_auth())
7462 _link_remote(mdr
, false, dn
, dnl
->get_inode());
7464 _unlink_local(mdr
, dn
, straydn
);
7467 class C_MDS_unlink_local_finish
: public ServerLogContext
{
7470 version_t dnpv
; // deleted dentry
7472 C_MDS_unlink_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CDentry
*sd
) :
7473 ServerLogContext(s
, r
), dn(d
), straydn(sd
),
7474 dnpv(d
->get_projected_version()) {}
7475 void finish(int r
) override
{
7476 ceph_assert(r
== 0);
7477 server
->_unlink_local_finish(mdr
, dn
, straydn
, dnpv
);
7481 void Server::_unlink_local(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
)
7483 dout(10) << "_unlink_local " << *dn
<< dendl
;
7485 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
7486 CInode
*in
= dnl
->get_inode();
7490 mdr
->ls
= mdlog
->get_current_segment();
7492 // prepare log entry
7493 EUpdate
*le
= new EUpdate(mdlog
, "unlink_local");
7494 mdlog
->start_entry(le
);
7495 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
7496 if (!mdr
->more()->witnessed
.empty()) {
7497 dout(20) << " noting uncommitted_peers " << mdr
->more()->witnessed
<< dendl
;
7498 le
->reqid
= mdr
->reqid
;
7499 le
->had_peers
= true;
7500 mdcache
->add_uncommitted_leader(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
7504 ceph_assert(dnl
->is_primary());
7505 straydn
->push_projected_linkage(in
);
7508 // the unlinked dentry
7511 auto pi
= in
->project_inode(mdr
);
7514 dn
->make_path_string(t
, true);
7515 pi
.inode
->stray_prior_path
= std::move(t
);
7517 pi
.inode
->version
= in
->pre_dirty();
7518 pi
.inode
->ctime
= mdr
->get_op_stamp();
7519 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
7520 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
7521 pi
.inode
->change_attr
++;
7523 if (pi
.inode
->nlink
== 0)
7524 in
->state_set(CInode::STATE_ORPHAN
);
7526 if (mdr
->more()->desti_srnode
) {
7527 auto& desti_srnode
= mdr
->more()->desti_srnode
;
7528 in
->project_snaprealm(desti_srnode
);
7529 desti_srnode
= NULL
;
7533 // will manually pop projected inode
7535 // primary link. add stray dentry.
7536 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, -1);
7537 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, straydn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
7539 pi
.inode
->update_backtrace();
7540 le
->metablob
.add_primary_dentry(straydn
, in
, true, true);
7542 // remote link. update remote inode.
7543 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_DIR
, -1);
7544 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
7545 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, in
);
7548 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
7549 le
->metablob
.add_null_dentry(dn
, true);
7552 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
7553 le
->metablob
.renamed_dirino
= in
->ino();
7556 dn
->push_projected_linkage();
7559 ceph_assert(in
->first
<= straydn
->first
);
7560 in
->first
= straydn
->first
;
7564 ceph_assert(straydn
);
7565 mdcache
->project_subtree_rename(in
, dn
->get_dir(), straydn
->get_dir());
7568 journal_and_reply(mdr
, 0, dn
, le
, new C_MDS_unlink_local_finish(this, mdr
, dn
, straydn
));
7571 void Server::_unlink_local_finish(MDRequestRef
& mdr
,
7572 CDentry
*dn
, CDentry
*straydn
,
7575 dout(10) << "_unlink_local_finish " << *dn
<< dendl
;
7577 if (!mdr
->more()->witnessed
.empty())
7578 mdcache
->logged_leader_update(mdr
->reqid
);
7580 CInode
*strayin
= NULL
;
7581 bool hadrealm
= false;
7583 // if there is newly created snaprealm, need to split old snaprealm's
7584 // inodes_with_caps. So pop snaprealm before linkage changes.
7585 strayin
= dn
->get_linkage()->get_inode();
7586 hadrealm
= strayin
->snaprealm
? true : false;
7587 strayin
->early_pop_projected_snaprealm();
7590 // unlink main dentry
7591 dn
->get_dir()->unlink_inode(dn
);
7592 dn
->pop_projected_linkage();
7593 dn
->mark_dirty(dnpv
, mdr
->ls
);
7595 // relink as stray? (i.e. was primary link?)
7597 dout(20) << " straydn is " << *straydn
<< dendl
;
7598 straydn
->pop_projected_linkage();
7599 mdcache
->touch_dentry_bottom(straydn
);
7604 mdcache
->send_dentry_unlink(dn
, straydn
, mdr
);
7607 // update subtree map?
7608 if (strayin
->is_dir())
7609 mdcache
->adjust_subtree_after_rename(strayin
, dn
->get_dir(), true);
7611 if (strayin
->snaprealm
&& !hadrealm
)
7612 mdcache
->do_realm_invalidate_and_update_notify(strayin
, CEPH_SNAP_OP_SPLIT
, false);
7616 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
7619 respond_to_request(mdr
, 0);
7621 // removing a new dn?
7622 dn
->get_dir()->try_remove_unlinked_dn(dn
);
7625 // respond_to_request() drops locks. So stray reintegration can race with us.
7626 if (straydn
&& !straydn
->get_projected_linkage()->is_null()) {
7627 // Tip off the MDCache that this dentry is a stray that
7628 // might be elegible for purge.
7629 mdcache
->notify_stray(straydn
);
7633 bool Server::_rmdir_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, vector
<CDentry
*>& trace
, CDentry
*straydn
)
7635 if (mds
->is_cluster_degraded() &&
7636 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(who
)) {
7637 dout(10) << "_rmdir_prepare_witness mds." << who
<< " is not active" << dendl
;
7638 if (mdr
->more()->waiting_on_peer
.empty())
7639 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(mdcache
, mdr
));
7643 dout(10) << "_rmdir_prepare_witness mds." << who
<< dendl
;
7644 auto req
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RMDIRPREP
);
7645 req
->srcdnpath
= filepath(trace
.front()->get_dir()->ino());
7646 for (auto dn
: trace
)
7647 req
->srcdnpath
.push_dentry(dn
->get_name());
7648 mdcache
->encode_replica_stray(straydn
, who
, req
->straybl
);
7649 if (mdr
->more()->desti_srnode
)
7650 encode(*mdr
->more()->desti_srnode
, req
->desti_snapbl
);
7652 req
->op_stamp
= mdr
->get_op_stamp();
7653 mds
->send_message_mds(req
, who
);
7655 ceph_assert(mdr
->more()->waiting_on_peer
.count(who
) == 0);
7656 mdr
->more()->waiting_on_peer
.insert(who
);
7660 struct C_MDS_PeerRmdirPrep
: public ServerLogContext
{
7661 CDentry
*dn
, *straydn
;
7662 C_MDS_PeerRmdirPrep(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CDentry
*st
)
7663 : ServerLogContext(s
, r
), dn(d
), straydn(st
) {}
7664 void finish(int r
) override
{
7665 server
->_logged_peer_rmdir(mdr
, dn
, straydn
);
7669 struct C_MDS_PeerRmdirCommit
: public ServerContext
{
7672 C_MDS_PeerRmdirCommit(Server
*s
, MDRequestRef
& r
, CDentry
*sd
)
7673 : ServerContext(s
), mdr(r
), straydn(sd
) { }
7674 void finish(int r
) override
{
7675 server
->_commit_peer_rmdir(mdr
, r
, straydn
);
7679 void Server::handle_peer_rmdir_prep(MDRequestRef
& mdr
)
7681 dout(10) << "handle_peer_rmdir_prep " << *mdr
7682 << " " << mdr
->peer_request
->srcdnpath
7683 << " to " << mdr
->peer_request
->destdnpath
7686 vector
<CDentry
*> trace
;
7687 filepath
srcpath(mdr
->peer_request
->srcdnpath
);
7688 dout(10) << " src " << srcpath
<< dendl
;
7690 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, false);
7691 int r
= mdcache
->path_traverse(mdr
, cf
, srcpath
,
7692 MDS_TRAVERSE_DISCOVER
| MDS_TRAVERSE_PATH_LOCKED
,
7695 if (r
== -CEPHFS_ESTALE
) {
7696 mdcache
->find_ino_peers(srcpath
.get_ino(), new C_MDS_RetryRequest(mdcache
, mdr
),
7697 mdr
->peer_to_mds
, true);
7700 ceph_assert(r
== 0);
7701 CDentry
*dn
= trace
.back();
7702 dout(10) << " dn " << *dn
<< dendl
;
7705 ceph_assert(mdr
->straydn
);
7706 CDentry
*straydn
= mdr
->straydn
;
7707 dout(10) << " straydn " << *straydn
<< dendl
;
7709 mdr
->set_op_stamp(mdr
->peer_request
->op_stamp
);
7711 rmdir_rollback rollback
;
7712 rollback
.reqid
= mdr
->reqid
;
7713 rollback
.src_dir
= dn
->get_dir()->dirfrag();
7714 rollback
.src_dname
= dn
->get_name();
7715 rollback
.dest_dir
= straydn
->get_dir()->dirfrag();
7716 rollback
.dest_dname
= straydn
->get_name();
7717 if (mdr
->peer_request
->desti_snapbl
.length()) {
7718 if (in
->snaprealm
) {
7719 encode(true, rollback
.snapbl
);
7720 in
->encode_snap_blob(rollback
.snapbl
);
7722 encode(false, rollback
.snapbl
);
7725 encode(rollback
, mdr
->more()->rollback_bl
);
7726 // FIXME: rollback snaprealm
7727 dout(20) << " rollback is " << mdr
->more()->rollback_bl
.length() << " bytes" << dendl
;
7729 // set up commit waiter
7730 mdr
->more()->peer_commit
= new C_MDS_PeerRmdirCommit(this, mdr
, straydn
);
7732 straydn
->push_projected_linkage(in
);
7733 dn
->push_projected_linkage();
7735 ceph_assert(straydn
->first
>= in
->first
);
7736 in
->first
= straydn
->first
;
7738 if (!in
->has_subtree_root_dirfrag(mds
->get_nodeid())) {
7739 dout(10) << " no auth subtree in " << *in
<< ", skipping journal" << dendl
;
7740 _logged_peer_rmdir(mdr
, dn
, straydn
);
7744 mdr
->ls
= mdlog
->get_current_segment();
7745 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rmdir", mdr
->reqid
, mdr
->peer_to_mds
,
7746 EPeerUpdate::OP_PREPARE
, EPeerUpdate::RMDIR
);
7747 mdlog
->start_entry(le
);
7748 le
->rollback
= mdr
->more()->rollback_bl
;
7750 le
->commit
.add_dir_context(straydn
->get_dir());
7751 le
->commit
.add_primary_dentry(straydn
, in
, true);
7752 // peer: no need to journal original dentry
7754 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
7755 le
->commit
.renamed_dirino
= in
->ino();
7757 mdcache
->project_subtree_rename(in
, dn
->get_dir(), straydn
->get_dir());
7758 mdcache
->add_uncommitted_peer(mdr
->reqid
, mdr
->ls
, mdr
->peer_to_mds
);
7760 mdr
->more()->peer_update_journaled
= true;
7761 submit_mdlog_entry(le
, new C_MDS_PeerRmdirPrep(this, mdr
, dn
, straydn
),
7766 void Server::_logged_peer_rmdir(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
)
7768 dout(10) << "_logged_peer_rmdir " << *mdr
<< " on " << *dn
<< dendl
;
7769 CInode
*in
= dn
->get_linkage()->get_inode();
7772 if (mdr
->peer_request
->desti_snapbl
.length()) {
7773 new_realm
= !in
->snaprealm
;
7774 in
->decode_snap_blob(mdr
->peer_request
->desti_snapbl
);
7775 ceph_assert(in
->snaprealm
);
7780 // update our cache now, so we are consistent with what is in the journal
7781 // when we journal a subtree map
7782 dn
->get_dir()->unlink_inode(dn
);
7783 straydn
->pop_projected_linkage();
7784 dn
->pop_projected_linkage();
7786 mdcache
->adjust_subtree_after_rename(in
, dn
->get_dir(), mdr
->more()->peer_update_journaled
);
7789 mdcache
->do_realm_invalidate_and_update_notify(in
, CEPH_SNAP_OP_SPLIT
, false);
7792 mdr
->reset_peer_request();
7795 if (!mdr
->aborted
) {
7796 auto reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RMDIRPREPACK
);
7797 if (!mdr
->more()->peer_update_journaled
)
7798 reply
->mark_not_journaled();
7799 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
7801 dout(10) << " abort flag set, finishing" << dendl
;
7802 mdcache
->request_finish(mdr
);
7806 void Server::handle_peer_rmdir_prep_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &ack
)
7808 dout(10) << "handle_peer_rmdir_prep_ack " << *mdr
7809 << " " << *ack
<< dendl
;
7811 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
7813 mdr
->more()->peers
.insert(from
);
7814 mdr
->more()->witnessed
.insert(from
);
7815 if (!ack
->is_not_journaled())
7816 mdr
->more()->has_journaled_peers
= true;
7818 // remove from waiting list
7819 ceph_assert(mdr
->more()->waiting_on_peer
.count(from
));
7820 mdr
->more()->waiting_on_peer
.erase(from
);
7822 if (mdr
->more()->waiting_on_peer
.empty())
7823 dispatch_client_request(mdr
); // go again!
7825 dout(10) << "still waiting on peers " << mdr
->more()->waiting_on_peer
<< dendl
;
7828 void Server::_commit_peer_rmdir(MDRequestRef
& mdr
, int r
, CDentry
*straydn
)
7830 dout(10) << "_commit_peer_rmdir " << *mdr
<< " r=" << r
<< dendl
;
7833 if (mdr
->more()->peer_update_journaled
) {
7834 CInode
*strayin
= straydn
->get_projected_linkage()->get_inode();
7835 if (strayin
&& !strayin
->snaprealm
)
7836 mdcache
->clear_dirty_bits_for_stray(strayin
);
7841 if (mdr
->more()->peer_update_journaled
) {
7842 // write a commit to the journal
7843 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rmdir_commit", mdr
->reqid
,
7844 mdr
->peer_to_mds
, EPeerUpdate::OP_COMMIT
,
7845 EPeerUpdate::RMDIR
);
7846 mdlog
->start_entry(le
);
7847 submit_mdlog_entry(le
, new C_MDS_CommittedPeer(this, mdr
), mdr
, __func__
);
7850 _committed_peer(mdr
);
7854 do_rmdir_rollback(mdr
->more()->rollback_bl
, mdr
->peer_to_mds
, mdr
);
7858 struct C_MDS_LoggedRmdirRollback
: public ServerLogContext
{
7862 C_MDS_LoggedRmdirRollback(Server
*s
, MDRequestRef
& m
, metareqid_t mr
, CDentry
*d
, CDentry
*st
)
7863 : ServerLogContext(s
, m
), reqid(mr
), dn(d
), straydn(st
) {}
7864 void finish(int r
) override
{
7865 server
->_rmdir_rollback_finish(mdr
, reqid
, dn
, straydn
);
7869 void Server::do_rmdir_rollback(bufferlist
&rbl
, mds_rank_t leader
, MDRequestRef
& mdr
)
7871 // unlink the other rollback methods, the rmdir rollback is only
7872 // needed to record the subtree changes in the journal for inode
7873 // replicas who are auth for empty dirfrags. no actual changes to
7874 // the file system are taking place here, so there is no Mutation.
7876 rmdir_rollback rollback
;
7877 auto p
= rbl
.cbegin();
7878 decode(rollback
, p
);
7880 dout(10) << "do_rmdir_rollback on " << rollback
.reqid
<< dendl
;
7881 mdcache
->add_rollback(rollback
.reqid
, leader
); // need to finish this update before resolve finishes
7882 ceph_assert(mdr
|| mds
->is_resolve());
7884 CDir
*dir
= mdcache
->get_dirfrag(rollback
.src_dir
);
7886 dir
= mdcache
->get_dirfrag(rollback
.src_dir
.ino
, rollback
.src_dname
);
7888 CDentry
*dn
= dir
->lookup(rollback
.src_dname
);
7890 dout(10) << " dn " << *dn
<< dendl
;
7891 CDir
*straydir
= mdcache
->get_dirfrag(rollback
.dest_dir
);
7892 ceph_assert(straydir
);
7893 CDentry
*straydn
= straydir
->lookup(rollback
.dest_dname
);
7894 ceph_assert(straydn
);
7895 dout(10) << " straydn " << *straydn
<< dendl
;
7896 CInode
*in
= straydn
->get_linkage()->get_inode();
7898 dn
->push_projected_linkage(in
);
7899 straydn
->push_projected_linkage();
7901 if (rollback
.snapbl
.length() && in
->snaprealm
) {
7903 auto p
= rollback
.snapbl
.cbegin();
7904 decode(hadrealm
, p
);
7906 decode(in
->snaprealm
->srnode
, p
);
7908 in
->snaprealm
->merge_to(dir
->get_inode()->find_snaprealm());
7912 if (mdr
&& !mdr
->more()->peer_update_journaled
) {
7913 ceph_assert(!in
->has_subtree_root_dirfrag(mds
->get_nodeid()));
7915 _rmdir_rollback_finish(mdr
, rollback
.reqid
, dn
, straydn
);
7920 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rmdir_rollback", rollback
.reqid
, leader
,
7921 EPeerUpdate::OP_ROLLBACK
, EPeerUpdate::RMDIR
);
7922 mdlog
->start_entry(le
);
7924 le
->commit
.add_dir_context(dn
->get_dir());
7925 le
->commit
.add_primary_dentry(dn
, in
, true);
7926 // peer: no need to journal straydn
7928 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
7929 le
->commit
.renamed_dirino
= in
->ino();
7931 mdcache
->project_subtree_rename(in
, straydn
->get_dir(), dn
->get_dir());
7933 submit_mdlog_entry(le
,
7934 new C_MDS_LoggedRmdirRollback(this, mdr
,rollback
.reqid
,
7940 void Server::_rmdir_rollback_finish(MDRequestRef
& mdr
, metareqid_t reqid
, CDentry
*dn
, CDentry
*straydn
)
7942 dout(10) << "_rmdir_rollback_finish " << reqid
<< dendl
;
7944 straydn
->get_dir()->unlink_inode(straydn
);
7945 dn
->pop_projected_linkage();
7946 straydn
->pop_projected_linkage();
7948 CInode
*in
= dn
->get_linkage()->get_inode();
7949 mdcache
->adjust_subtree_after_rename(in
, straydn
->get_dir(),
7950 !mdr
|| mdr
->more()->peer_update_journaled
);
7952 if (mds
->is_resolve()) {
7953 CDir
*root
= mdcache
->get_subtree_root(straydn
->get_dir());
7954 mdcache
->try_trim_non_auth_subtree(root
);
7958 mdcache
->request_finish(mdr
);
7960 mdcache
->finish_rollback(reqid
, mdr
);
7964 /** _dir_is_nonempty[_unlocked]
7966 * check if a directory is non-empty (i.e. we can rmdir it).
7968 * the unlocked varient this is a fastpath check. we can't really be
7969 * sure until we rdlock the filelock.
7971 bool Server::_dir_is_nonempty_unlocked(MDRequestRef
& mdr
, CInode
*in
)
7973 dout(10) << "dir_is_nonempty_unlocked " << *in
<< dendl
;
7974 ceph_assert(in
->is_auth());
7976 if (in
->filelock
.is_cached())
7977 return false; // there can be pending async create/unlink. don't know.
7978 if (in
->snaprealm
&& in
->snaprealm
->srnode
.snaps
.size())
7979 return true; // in a snapshot!
7981 auto&& ls
= in
->get_dirfrags();
7982 for (const auto& dir
: ls
) {
7983 // is the frag obviously non-empty?
7984 if (dir
->is_auth()) {
7985 if (dir
->get_projected_fnode()->fragstat
.size()) {
7986 dout(10) << "dir_is_nonempty_unlocked dirstat has "
7987 << dir
->get_projected_fnode()->fragstat
.size() << " items " << *dir
<< dendl
;
7996 bool Server::_dir_is_nonempty(MDRequestRef
& mdr
, CInode
*in
)
7998 dout(10) << "dir_is_nonempty " << *in
<< dendl
;
7999 ceph_assert(in
->is_auth());
8000 ceph_assert(in
->filelock
.can_read(mdr
->get_client()));
8002 frag_info_t dirstat
;
8003 version_t dirstat_version
= in
->get_projected_inode()->dirstat
.version
;
8005 auto&& ls
= in
->get_dirfrags();
8006 for (const auto& dir
: ls
) {
8007 const auto& pf
= dir
->get_projected_fnode();
8008 if (pf
->fragstat
.size()) {
8009 dout(10) << "dir_is_nonempty dirstat has "
8010 << pf
->fragstat
.size() << " items " << *dir
<< dendl
;
8014 if (pf
->accounted_fragstat
.version
== dirstat_version
)
8015 dirstat
.add(pf
->accounted_fragstat
);
8017 dirstat
.add(pf
->fragstat
);
8020 return dirstat
.size() != in
->get_projected_inode()->dirstat
.size();
8024 // ======================================================
8027 class C_MDS_rename_finish
: public ServerLogContext
{
8032 C_MDS_rename_finish(Server
*s
, MDRequestRef
& r
,
8033 CDentry
*sdn
, CDentry
*ddn
, CDentry
*stdn
) :
8034 ServerLogContext(s
, r
),
8035 srcdn(sdn
), destdn(ddn
), straydn(stdn
) { }
8036 void finish(int r
) override
{
8037 ceph_assert(r
== 0);
8038 server
->_rename_finish(mdr
, srcdn
, destdn
, straydn
);
8043 /** handle_client_rename
8045 * rename leader is the destdn auth. this is because cached inodes
8046 * must remain connected. thus, any replica of srci, must also
8047 * replicate destdn, and possibly straydn, so that srci (and
8048 * destdn->inode) remain connected during the rename.
8050 * to do this, we freeze srci, then leader (destdn auth) verifies that
8051 * all other nodes have also replciated destdn and straydn. note that
8052 * destdn replicas need not also replicate srci. this only works when
8055 * This function takes responsibility for the passed mdr.
8057 void Server::handle_client_rename(MDRequestRef
& mdr
)
8059 const auto& req
= mdr
->client_request
;
8060 dout(7) << "handle_client_rename " << *req
<< dendl
;
8062 filepath destpath
= req
->get_filepath();
8063 filepath srcpath
= req
->get_filepath2();
8064 if (srcpath
.is_last_dot_or_dotdot() || destpath
.is_last_dot_or_dotdot()) {
8065 respond_to_request(mdr
, -CEPHFS_EBUSY
);
8069 if (req
->get_alternate_name().size() > alternate_name_max
) {
8070 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
8071 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
8075 auto [destdn
, srcdn
] = rdlock_two_paths_xlock_destdn(mdr
, true);
8079 dout(10) << " destdn " << *destdn
<< dendl
;
8080 CDir
*destdir
= destdn
->get_dir();
8081 ceph_assert(destdir
->is_auth());
8082 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
8084 dout(10) << " srcdn " << *srcdn
<< dendl
;
8085 CDir
*srcdir
= srcdn
->get_dir();
8086 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
8087 CInode
*srci
= srcdnl
->get_inode();
8088 dout(10) << " srci " << *srci
<< dendl
;
8090 // -- some sanity checks --
8091 if (destdn
== srcdn
) {
8092 dout(7) << "rename src=dest, noop" << dendl
;
8093 respond_to_request(mdr
, 0);
8097 // dest a child of src?
8098 // e.g. mv /usr /usr/foo
8099 if (srci
->is_dir() && srci
->is_projected_ancestor_of(destdir
->get_inode())) {
8100 dout(7) << "cannot rename item to be a child of itself" << dendl
;
8101 respond_to_request(mdr
, -CEPHFS_EINVAL
);
8105 // is this a stray migration, reintegration or merge? (sanity checks!)
8106 if (mdr
->reqid
.name
.is_mds() &&
8107 !(MDS_INO_IS_STRAY(srcpath
.get_ino()) &&
8108 MDS_INO_IS_STRAY(destpath
.get_ino())) &&
8109 !(destdnl
->is_remote() &&
8110 destdnl
->get_remote_ino() == srci
->ino())) {
8111 respond_to_request(mdr
, -CEPHFS_EINVAL
); // actually, this won't reply, but whatev.
8116 if (!destdnl
->is_null()) {
8117 //dout(10) << "dest dn exists " << *destdn << dendl;
8118 oldin
= mdcache
->get_dentry_inode(destdn
, mdr
, true);
8120 dout(10) << " oldin " << *oldin
<< dendl
;
8122 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
8123 if (oldin
->is_dir() && _dir_is_nonempty_unlocked(mdr
, oldin
)) {
8124 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
8128 // mv /some/thing /to/some/existing_other_thing
8129 if (oldin
->is_dir() && !srci
->is_dir()) {
8130 respond_to_request(mdr
, -CEPHFS_EISDIR
);
8133 if (!oldin
->is_dir() && srci
->is_dir()) {
8134 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
8137 if (srci
== oldin
&& !srcdir
->inode
->is_stray()) {
8138 respond_to_request(mdr
, 0); // no-op. POSIX makes no sense.
8141 if (destdn
->get_alternate_name() != req
->get_alternate_name()) {
8142 /* the dentry exists but the alternate_names do not match, fail... */
8143 respond_to_request(mdr
, -CEPHFS_EINVAL
);
8148 vector
<CDentry
*>& srctrace
= mdr
->dn
[1];
8149 vector
<CDentry
*>& desttrace
= mdr
->dn
[0];
8151 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
8152 if (destpath
.get_ino() != srcpath
.get_ino() &&
8153 !(req
->get_source().is_mds() &&
8154 MDS_INO_IS_STRAY(srcpath
.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
8155 CInode
*srcbase
= srctrace
[0]->get_dir()->get_inode();
8156 CInode
*destbase
= desttrace
[0]->get_dir()->get_inode();
8157 // ok, extend srctrace toward root until it is an ancestor of desttrace.
8158 while (srcbase
!= destbase
&&
8159 !srcbase
->is_projected_ancestor_of(destbase
)) {
8160 CDentry
*pdn
= srcbase
->get_projected_parent_dn();
8161 srctrace
.insert(srctrace
.begin(), pdn
);
8162 dout(10) << "rename prepending srctrace with " << *pdn
<< dendl
;
8163 srcbase
= pdn
->get_dir()->get_inode();
8166 // then, extend destpath until it shares the same parent inode as srcpath.
8167 while (destbase
!= srcbase
) {
8168 CDentry
*pdn
= destbase
->get_projected_parent_dn();
8169 desttrace
.insert(desttrace
.begin(), pdn
);
8170 dout(10) << "rename prepending desttrace with " << *pdn
<< dendl
;
8171 destbase
= pdn
->get_dir()->get_inode();
8173 dout(10) << "rename src and dest traces now share common ancestor " << *destbase
<< dendl
;
8177 bool linkmerge
= srcdnl
->get_inode() == destdnl
->get_inode();
8179 dout(10) << " this is a link merge" << dendl
;
8181 // -- create stray dentry? --
8182 CDentry
*straydn
= NULL
;
8183 if (destdnl
->is_primary() && !linkmerge
) {
8184 straydn
= prepare_stray_dentry(mdr
, destdnl
->get_inode());
8187 dout(10) << " straydn is " << *straydn
<< dendl
;
8188 } else if (mdr
->straydn
) {
8189 mdr
->unpin(mdr
->straydn
);
8190 mdr
->straydn
= NULL
;
8195 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
8196 MutationImpl::LockOpVec lov
;
8198 // we need to update srci's ctime. xlock its least contended lock to do that...
8199 lov
.add_xlock(&srci
->linklock
);
8200 lov
.add_xlock(&srci
->snaplock
);
8203 // xlock oldin (for nlink--)
8204 lov
.add_xlock(&oldin
->linklock
);
8205 lov
.add_xlock(&oldin
->snaplock
);
8206 if (oldin
->is_dir()) {
8207 ceph_assert(srci
->is_dir());
8208 lov
.add_rdlock(&oldin
->filelock
); // to verify it's empty
8210 // adjust locking order?
8211 int cmp
= mdr
->compare_paths();
8212 if (cmp
< 0 || (cmp
== 0 && oldin
->ino() < srci
->ino()))
8213 std::reverse(lov
.begin(), lov
.end());
8215 ceph_assert(!srci
->is_dir());
8216 // adjust locking order;
8217 if (srci
->ino() > oldin
->ino())
8218 std::reverse(lov
.begin(), lov
.end());
8224 lov
.add_wrlock(&straydn
->get_dir()->inode
->filelock
);
8225 lov
.add_wrlock(&straydn
->get_dir()->inode
->nestlock
);
8226 lov
.add_xlock(&straydn
->lock
);
8229 CInode
*auth_pin_freeze
= !srcdn
->is_auth() && srcdnl
->is_primary() ? srci
: nullptr;
8230 if (!mds
->locker
->acquire_locks(mdr
, lov
, auth_pin_freeze
))
8233 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
8237 ceph_assert(srcdir
->inode
->is_stray() && srcdnl
->is_primary() && destdnl
->is_remote());
8239 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
8240 if (!check_access(mdr
, srcdir
->get_inode(), MAY_WRITE
))
8243 if (!check_access(mdr
, destdn
->get_dir()->get_inode(), MAY_WRITE
))
8246 if (!linkmerge
&& !check_fragment_space(mdr
, destdn
->get_dir()))
8249 if (!linkmerge
&& !check_dir_max_entries(mdr
, destdn
->get_dir()))
8252 if (!check_access(mdr
, srci
, MAY_WRITE
))
8256 // with read lock, really verify oldin is empty
8259 _dir_is_nonempty(mdr
, oldin
)) {
8260 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
8264 /* project_snaprealm_past_parent() will do this job
8266 // moving between snaprealms?
8267 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
8268 SnapRealm *srcrealm = srci->find_snaprealm();
8269 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
8270 if (srcrealm != destrealm &&
8271 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
8272 destrealm->get_newest_seq() + 1 > srcdn->first)) {
8273 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
8274 mdcache->snaprealm_create(mdr, srci);
8280 SnapRealm
*dest_realm
= nullptr;
8281 SnapRealm
*src_realm
= nullptr;
8283 dest_realm
= destdir
->inode
->find_snaprealm();
8284 if (srcdir
->inode
== destdir
->inode
)
8285 src_realm
= dest_realm
;
8287 src_realm
= srcdir
->inode
->find_snaprealm();
8288 if (src_realm
!= dest_realm
&&
8289 src_realm
->get_subvolume_ino() != dest_realm
->get_subvolume_ino()) {
8290 respond_to_request(mdr
, -CEPHFS_EXDEV
);
8295 ceph_assert(g_conf()->mds_kill_rename_at
!= 1);
8297 // -- open all srcdn inode frags, if any --
8298 // we need these open so that auth can properly delegate from inode to dirfrags
8299 // after the inode is _ours_.
8300 if (srcdnl
->is_primary() &&
8301 !srcdn
->is_auth() &&
8303 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl
;
8304 mdr
->set_stickydirs(srci
);
8307 srci
->dirfragtree
.get_leaves(leaves
);
8308 for (const auto& leaf
: leaves
) {
8309 CDir
*dir
= srci
->get_dirfrag(leaf
);
8311 dout(10) << " opening " << leaf
<< " under " << *srci
<< dendl
;
8312 mdcache
->open_remote_dirfrag(srci
, leaf
, new C_MDS_RetryRequest(mdcache
, mdr
));
8318 // -- prepare snaprealm ---
8321 if (!mdr
->more()->srci_srnode
&&
8322 srci
->get_projected_inode()->nlink
== 1 &&
8323 srci
->is_projected_snaprealm_global()) {
8324 sr_t
*new_srnode
= srci
->prepare_new_srnode(0);
8325 srci
->record_snaprealm_parent_dentry(new_srnode
, nullptr, destdn
, false);
8327 srci
->clear_snaprealm_global(new_srnode
);
8328 mdr
->more()->srci_srnode
= new_srnode
;
8331 if (oldin
&& !mdr
->more()->desti_srnode
) {
8332 if (oldin
->is_projected_snaprealm_global()) {
8333 sr_t
*new_srnode
= oldin
->prepare_new_srnode(0);
8334 oldin
->record_snaprealm_parent_dentry(new_srnode
, dest_realm
, destdn
, destdnl
->is_primary());
8335 // dropping the last linkage or dropping the last remote linkage,
8336 // detch the inode from global snaprealm
8337 auto nlink
= oldin
->get_projected_inode()->nlink
;
8339 (nlink
== 2 && !destdnl
->is_primary() &&
8340 !oldin
->get_projected_parent_dir()->inode
->is_stray()))
8341 oldin
->clear_snaprealm_global(new_srnode
);
8342 mdr
->more()->desti_srnode
= new_srnode
;
8343 } else if (destdnl
->is_primary()) {
8344 snapid_t follows
= dest_realm
->get_newest_seq();
8345 if (oldin
->snaprealm
|| follows
+ 1 > oldin
->get_oldest_snap()) {
8346 sr_t
*new_srnode
= oldin
->prepare_new_srnode(follows
);
8347 oldin
->record_snaprealm_past_parent(new_srnode
, straydn
->get_dir()->inode
->find_snaprealm());
8348 mdr
->more()->desti_srnode
= new_srnode
;
8352 if (!mdr
->more()->srci_srnode
) {
8353 if (srci
->is_projected_snaprealm_global()) {
8354 sr_t
*new_srnode
= srci
->prepare_new_srnode(0);
8355 srci
->record_snaprealm_parent_dentry(new_srnode
, src_realm
, srcdn
, srcdnl
->is_primary());
8356 mdr
->more()->srci_srnode
= new_srnode
;
8357 } else if (srcdnl
->is_primary()) {
8358 snapid_t follows
= src_realm
->get_newest_seq();
8359 if (src_realm
!= dest_realm
&&
8360 (srci
->snaprealm
|| follows
+ 1 > srci
->get_oldest_snap())) {
8361 sr_t
*new_srnode
= srci
->prepare_new_srnode(follows
);
8362 srci
->record_snaprealm_past_parent(new_srnode
, dest_realm
);
8363 mdr
->more()->srci_srnode
= new_srnode
;
8369 // -- prepare witnesses --
8372 * NOTE: we use _all_ replicas as witnesses.
8373 * this probably isn't totally necessary (esp for file renames),
8374 * but if/when we change that, we have to make sure rejoin is
8375 * sufficiently robust to handle strong rejoins from survivors
8376 * with totally wrong dentry->inode linkage.
8377 * (currently, it can ignore rename effects, because the resolve
8378 * stage will sort them out.)
8380 set
<mds_rank_t
> witnesses
= mdr
->more()->extra_witnesses
;
8381 if (srcdn
->is_auth())
8382 srcdn
->list_replicas(witnesses
);
8384 witnesses
.insert(srcdn
->authority().first
);
8385 if (srcdnl
->is_remote() && !srci
->is_auth())
8386 witnesses
.insert(srci
->authority().first
);
8387 destdn
->list_replicas(witnesses
);
8388 if (destdnl
->is_remote() && !oldin
->is_auth())
8389 witnesses
.insert(oldin
->authority().first
);
8390 dout(10) << " witnesses " << witnesses
<< ", have " << mdr
->more()->witnessed
<< dendl
;
8392 if (!witnesses
.empty()) {
8393 // Replicas can't see projected dentry linkages and will get confused.
8394 // We have taken snaplocks on ancestor inodes. Later rename/rmdir requests
8395 // can't project these inodes' linkages.
8396 bool need_flush
= false;
8397 for (auto& dn
: srctrace
) {
8398 if (dn
->is_projected()) {
8404 CDentry
*dn
= destdn
;
8406 if (dn
->is_projected()) {
8410 CInode
*diri
= dn
->get_dir()->get_inode();
8411 dn
= diri
->get_projected_parent_dn();
8415 mdlog
->wait_for_safe(
8416 new MDSInternalContextWrapper(mds
,
8417 new C_MDS_RetryRequest(mdcache
, mdr
)));
8423 // do srcdn auth last
8424 mds_rank_t last
= MDS_RANK_NONE
;
8425 if (!srcdn
->is_auth()) {
8426 last
= srcdn
->authority().first
;
8427 mdr
->more()->srcdn_auth_mds
= last
;
8428 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
8429 // are involved in the rename operation.
8430 if (srcdnl
->is_primary() && !mdr
->more()->is_ambiguous_auth
) {
8431 dout(10) << " preparing ambiguous auth for srci" << dendl
;
8432 ceph_assert(mdr
->more()->is_remote_frozen_authpin
);
8433 ceph_assert(mdr
->more()->rename_inode
== srci
);
8434 _rename_prepare_witness(mdr
, last
, witnesses
, srctrace
, desttrace
, straydn
);
8439 for (set
<mds_rank_t
>::iterator p
= witnesses
.begin();
8440 p
!= witnesses
.end();
8442 if (*p
== last
) continue; // do it last!
8443 if (mdr
->more()->witnessed
.count(*p
)) {
8444 dout(10) << " already witnessed by mds." << *p
<< dendl
;
8445 } else if (mdr
->more()->waiting_on_peer
.count(*p
)) {
8446 dout(10) << " already waiting on witness mds." << *p
<< dendl
;
8448 if (!_rename_prepare_witness(mdr
, *p
, witnesses
, srctrace
, desttrace
, straydn
))
8452 if (!mdr
->more()->waiting_on_peer
.empty())
8453 return; // we're waiting for a witness.
8455 if (last
!= MDS_RANK_NONE
&& mdr
->more()->witnessed
.count(last
) == 0) {
8456 dout(10) << " preparing last witness (srcdn auth)" << dendl
;
8457 ceph_assert(mdr
->more()->waiting_on_peer
.count(last
) == 0);
8458 _rename_prepare_witness(mdr
, last
, witnesses
, srctrace
, desttrace
, straydn
);
8462 // test hack: bail after peer does prepare, so we can verify it's _live_ rollback.
8463 if (!mdr
->more()->peers
.empty() && !srci
->is_dir())
8464 ceph_assert(g_conf()->mds_kill_rename_at
!= 3);
8465 if (!mdr
->more()->peers
.empty() && srci
->is_dir())
8466 ceph_assert(g_conf()->mds_kill_rename_at
!= 4);
8468 // -- declare now --
8469 mdr
->set_mds_stamp(ceph_clock_now());
8471 // -- prepare journal entry --
8472 mdr
->ls
= mdlog
->get_current_segment();
8473 EUpdate
*le
= new EUpdate(mdlog
, "rename");
8474 mdlog
->start_entry(le
);
8475 le
->metablob
.add_client_req(mdr
->reqid
, req
->get_oldest_client_tid());
8476 if (!mdr
->more()->witnessed
.empty()) {
8477 dout(20) << " noting uncommitted_peers " << mdr
->more()->witnessed
<< dendl
;
8479 le
->reqid
= mdr
->reqid
;
8480 le
->had_peers
= true;
8482 mdcache
->add_uncommitted_leader(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
8483 // no need to send frozen auth pin to recovring auth MDS of srci
8484 mdr
->more()->is_remote_frozen_authpin
= false;
8487 _rename_prepare(mdr
, &le
->metablob
, &le
->client_map
, srcdn
, destdn
, req
->get_alternate_name(), straydn
);
8488 if (le
->client_map
.length())
8489 le
->cmapv
= mds
->sessionmap
.get_projected();
8491 // -- commit locally --
8492 C_MDS_rename_finish
*fin
= new C_MDS_rename_finish(this, mdr
, srcdn
, destdn
, straydn
);
8494 journal_and_reply(mdr
, srci
, destdn
, le
, fin
);
8495 mds
->balancer
->maybe_fragment(destdn
->get_dir(), false);
// NOTE(review): this region is a garbled extraction -- statements are split
// across lines and some brace/else/return lines were dropped. Original code
// lines are kept byte-identical below; only comments were added.
//
// _rename_finish: completion callback invoked once the rename's journal
// entry is safe. Applies the projected rename to the in-memory cache,
// notifies replicas of the new dentry linkage, bumps balancer heat
// counters, and replies to the client.
8499 void Server::_rename_finish(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
8501 dout(10) << "_rename_finish " << *mdr
<< dendl
;
// If peer MDSs witnessed the prepare phase, record that the leader's
// update has now been logged so the uncommitted-leader state can advance.
8503 if (!mdr
->more()->witnessed
.empty())
8504 mdcache
->logged_leader_update(mdr
->reqid
);
// Apply all projected linkage/inode changes for this rename to the cache.
8507 _rename_apply(mdr
, srcdn
, destdn
, straydn
);
// Tell replicas about the (new) dest dentry linkage.
8509 mdcache
->send_dentry_link(destdn
, mdr
);
// Inspect the final (post-apply) linkage of the destination dentry.
8511 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
8512 CInode
*in
= destdnl
->get_inode();
// need_eval: true when caps were imported for this inode during the rename
// (lock state must be re-evaluated after we reply).
8513 bool need_eval
= mdr
->more()->cap_imports
.count(in
);
8515 // test hack: test peer commit
8516 if (!mdr
->more()->peers
.empty() && !in
->is_dir())
8517 ceph_assert(g_conf()->mds_kill_rename_at
!= 5);
8518 if (!mdr
->more()->peers
.empty() && in
->is_dir())
8519 ceph_assert(g_conf()->mds_kill_rename_at
!= 6);
// Update balancer popularity (write heat) for the source directory, and
// for the inode itself when it is linked remotely and we are its auth.
8522 mds
->balancer
->hit_dir(srcdn
->get_dir(), META_POP_IWR
);
8523 if (destdnl
->is_remote() && in
->is_auth())
8524 mds
->balancer
->hit_inode(in
, META_POP_IWR
);
8526 // did we import srci? if so, explicitly ack that import that, before we unlock and reply.
8528 ceph_assert(g_conf()->mds_kill_rename_at
!= 7);
// Reply success to the client (this also drops the request's locks).
8531 respond_to_request(mdr
, 0);
8534 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
, true);
8537 // respond_to_request() drops locks. So stray reintegration can race with us.
8538 if (straydn
&& !straydn
->get_projected_linkage()->is_null()) {
8539 mdcache
->notify_stray(straydn
);
// NOTE(review): garbled extraction -- original code lines are byte-identical;
// only comments added. Return statements were dropped by the extractor.
//
// _rename_prepare_witness: ask MDS `who` to act as a witness for this rename
// by sending it an OP_RENAMEPREP peer request carrying the src/dest paths,
// the replicated stray dentry, projected snaprealm blobs, and the current
// witness set. Registers `who` in waiting_on_peer so the request stalls
// until the peer acks. If the peer is not active, waits for it and retries.
8547 bool Server::_rename_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, set
<mds_rank_t
> &witnesse
,
8548 vector
<CDentry
*>& srctrace
, vector
<CDentry
*>& dsttrace
, CDentry
*straydn
)
8550 const auto& client_req
= mdr
->client_request
;
8551 ceph_assert(client_req
);
// Peer not usable yet: queue a retry once it becomes active.
8553 if (mds
->is_cluster_degraded() &&
8554 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(who
)) {
8555 dout(10) << "_rename_prepare_witness mds." << who
<< " is not active" << dendl
;
8556 if (mdr
->more()->waiting_on_peer
.empty())
8557 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(mdcache
, mdr
));
8561 dout(10) << "_rename_prepare_witness mds." << who
<< dendl
;
8562 auto req
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RENAMEPREP
);
// Rebuild the src/dest filepaths from the dentry traces, rooted at the
// first trace element's directory ino.
8564 req
->srcdnpath
= filepath(srctrace
.front()->get_dir()->ino());
8565 for (auto dn
: srctrace
)
8566 req
->srcdnpath
.push_dentry(dn
->get_name());
8567 req
->destdnpath
= filepath(dsttrace
.front()->get_dir()->ino());
8568 for (auto dn
: dsttrace
)
8569 req
->destdnpath
.push_dentry(dn
->get_name());
8570 req
->alternate_name
= client_req
->alternate_name
;
// Ship a replica of the stray dentry (used if the target inode is
// displaced to the stray dir).
8572 mdcache
->encode_replica_stray(straydn
, who
, req
->straybl
);
// Include any projected snaprealm nodes for src/dest inodes.
8574 if (mdr
->more()->srci_srnode
)
8575 encode(*mdr
->more()->srci_srnode
, req
->srci_snapbl
);
8576 if (mdr
->more()->desti_srnode
)
8577 encode(*mdr
->more()->desti_srnode
, req
->desti_snapbl
);
8579 req
->srcdn_auth
= mdr
->more()->srcdn_auth_mds
;
8581 // srcdn auth will verify our current witness list is sufficient
8582 req
->witnesses
= witnesse
;
8584 req
->op_stamp
= mdr
->get_op_stamp();
8585 mds
->send_message_mds(req
, who
);
// Track the outstanding ack; must not already be waiting on this peer.
8587 ceph_assert(mdr
->more()->waiting_on_peer
.count(who
) == 0);
8588 mdr
->more()->waiting_on_peer
.insert(who
);
// NOTE(review): garbled extraction -- original code lines are byte-identical;
// only comments added. The trailing `return oldpv;` appears to have been
// dropped by the extractor (oldpv is initialized but no return is visible).
//
// _rename_prepare_import: decode the inode-import blob received from the
// source dentry's auth MDS: first the client session maps (forcing those
// sessions open and re-encoding them into *client_map_bl for the journal),
// then the inode state itself via the migrator. Returns the imported
// inode's old projected version (inode_import_v).
8592 version_t
Server::_rename_prepare_import(MDRequestRef
& mdr
, CDentry
*srcdn
, bufferlist
*client_map_bl
)
8594 version_t oldpv
= mdr
->more()->inode_import_v
;
8596 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
// Cursor over the received import blob.
8599 auto blp
= mdr
->more()->inode_import
.cbegin();
// Decode client sessions that hold caps on the imported inode and force
// them open locally; re-encode for inclusion in the journal entry.
8602 map
<client_t
,entity_inst_t
> client_map
;
8603 map
<client_t
, client_metadata_t
> client_metadata_map
;
8604 decode(client_map
, blp
);
8605 decode(client_metadata_map
, blp
);
8606 prepare_force_open_sessions(client_map
, client_metadata_map
,
8607 mdr
->more()->imported_session_map
);
8608 encode(client_map
, *client_map_bl
, mds
->mdsmap
->get_up_features());
8609 encode(client_metadata_map
, *client_map_bl
);
// Decode the inode itself (caps, scatterlocks) through the migrator.
8611 list
<ScatterLock
*> updated_scatterlocks
;
8612 mdcache
->migrator
->decode_import_inode(srcdn
, blp
, srcdn
->authority().first
, mdr
->ls
,
8613 mdr
->more()->cap_imports
, updated_scatterlocks
);
8615 // hack: force back to !auth and clean, temporarily
8616 srcdnl
->get_inode()->state_clear(CInode::STATE_AUTH
);
8617 srcdnl
->get_inode()->mark_clean();
// NOTE(review): garbled extraction -- original code lines are byte-identical;
// only comments added. `break`/brace lines appear dropped by the extractor.
//
// _need_force_journal: decide whether a rename involving directory `diri`
// must be journaled on this MDS even though the affected dentry is not
// auth here. True when any of diri's dirfrags is itself an auth subtree
// root on this rank, or contains an auth subtree root (journal replay
// needs the rename to re-create those subtree bounds).
// NOTE(review): parameter `empty` is not used in the visible code -- confirm
// against the full source.
8622 bool Server::_need_force_journal(CInode
*diri
, bool empty
)
8624 auto&& dirs
= diri
->get_dirfrags();
8626 bool force_journal
= false;
// Pass 1: is any of our own frags an auth subtree root?
8628 for (const auto& dir
: dirs
) {
8629 if (dir
->is_subtree_root() && dir
->get_dir_auth().first
== mds
->get_nodeid()) {
8630 dout(10) << " frag " << dir
->get_frag() << " is auth subtree dirfrag, will force journal" << dendl
;
8631 force_journal
= true;
8634 dout(20) << " frag " << dir
->get_frag() << " is not auth subtree dirfrag" << dendl
;
8637 // see if any children of our frags are auth subtrees.
8638 std::vector
<CDir
*> subtrees
;
8639 mdcache
->get_subtrees(subtrees
);
8640 dout(10) << " subtrees " << subtrees
<< " frags " << dirs
<< dendl
;
// Pass 2: does any frag contain an auth subtree root beneath it?
8641 for (const auto& dir
: dirs
) {
8642 for (const auto& subtree
: subtrees
) {
8643 if (dir
->contains(subtree
)) {
8644 if (subtree
->get_dir_auth().first
== mds
->get_nodeid()) {
8645 dout(10) << " frag " << dir
->get_frag() << " contains (maybe) auth subtree, will force journal "
8646 << *subtree
<< dendl
;
8647 force_journal
= true;
8650 dout(20) << " frag " << dir
->get_frag() << " contains but isn't auth for " << *subtree
<< dendl
;
8652 dout(20) << " frag " << dir
->get_frag() << " does not contain " << *subtree
<< dendl
;
8658 return force_journal
;
// NOTE(review): garbled extraction -- original code lines are byte-identical;
// only comments added. Many brace/else/condition lines were dropped by the
// extractor, so nesting below must be confirmed against the full source.
//
// _rename_prepare: build the EMetaBlob journal entry for a rename.
// Broad phases (as visible in this chunk):
//   1. resolve projected linkages for src/dest and detect link-merge;
//   2. decide forced journaling for non-auth destdn/straydn that carry
//      auth subtrees (journal replay needs them);
//   3. project inode/dentry updates: displaced target -> straydn (or
//      nlink--), source -> destdn (remote link or primary move, possibly
//      importing the inode from srcdn's auth), null out srcdn;
//   4. stamp ctime/rctime, predirty parent nesting/mtime, add everything
//      to the metablob, and project subtree renames for directories.
8661 void Server::_rename_prepare(MDRequestRef
& mdr
,
8662 EMetaBlob
*metablob
, bufferlist
*client_map_bl
,
8663 CDentry
*srcdn
, CDentry
*destdn
, std::string_view alternate_name
,
8666 dout(10) << "_rename_prepare " << *mdr
<< " " << *srcdn
<< " " << *destdn
<< dendl
;
8668 dout(10) << " straydn " << *straydn
<< dendl
;
// Projected linkages: srci is the inode being renamed, oldin the inode
// (if any) currently at the destination.
8670 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
8671 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
8672 CInode
*srci
= srcdnl
->get_inode();
8673 CInode
*oldin
= destdnl
->get_inode();
8675 // primary+remote link merge?
8676 bool linkmerge
= (srci
== oldin
);
8678 ceph_assert(srcdnl
->is_primary() && destdnl
->is_remote());
// silent: stray reintegration must not perturb nlink or dir mtimes.
8679 bool silent
= srcdn
->get_dir()->inode
->is_stray();
8681 bool force_journal_dest
= false;
8682 if (srci
->is_dir() && !destdn
->is_auth()) {
8683 if (srci
->is_auth()) {
8684 // if we are auth for srci and exporting it, force journal because journal replay needs
8685 // the source inode to create auth subtrees.
8686 dout(10) << " we are exporting srci, will force journal destdn" << dendl
;
8687 force_journal_dest
= true;
8689 force_journal_dest
= _need_force_journal(srci
, false);
8692 bool force_journal_stray
= false;
8693 if (oldin
&& oldin
->is_dir() && straydn
&& !straydn
->is_auth())
8694 force_journal_stray
= _need_force_journal(oldin
, true);
8697 dout(10) << " merging remote and primary links to the same inode" << dendl
;
8699 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl
;
8700 if (force_journal_dest
)
8701 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl
;
8702 if (force_journal_stray
)
8703 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl
;
// Record the renamed directory ino so replay can adjust subtrees.
8705 if (srci
->is_dir() && (destdn
->is_auth() || force_journal_dest
)) {
8706 dout(10) << " noting renamed dir ino " << srci
->ino() << " in metablob" << dendl
;
8707 metablob
->renamed_dirino
= srci
->ino();
8708 } else if (oldin
&& oldin
->is_dir() && force_journal_stray
) {
8709 dout(10) << " noting rename target dir " << oldin
->ino() << " in metablob" << dendl
;
8710 metablob
->renamed_dirino
= oldin
->ino();
// Projected inode pointers populated below; spi = renamed (source) inode,
// tpi = overwritten (target) inode.
8714 CInode::mempool_inode
*spi
= 0; // renamed inode
8715 CInode::mempool_inode
*tpi
= 0; // target/overwritten inode
// --- project the displaced target: move to straydn, or nlink-- if remote.
8719 if (destdnl
->is_primary()) {
8720 ceph_assert(straydn
); // moving to straydn.
8721 // link--, and move.
8722 if (destdn
->is_auth()) {
8723 auto pi
= oldin
->project_inode(mdr
); //project_snaprealm
8724 pi
.inode
->version
= straydn
->pre_dirty(pi
.inode
->version
);
8725 pi
.inode
->update_backtrace();
8726 tpi
= pi
.inode
.get();
8728 straydn
->push_projected_linkage(oldin
);
8729 } else if (destdnl
->is_remote()) {
8731 if (oldin
->is_auth()) {
8732 auto pi
= oldin
->project_inode(mdr
);
8733 pi
.inode
->version
= oldin
->pre_dirty();
8734 tpi
= pi
.inode
.get();
8740 if (destdnl
->is_null()) {
8741 /* handle_client_rename checks that alternate_name matches for existing destdn */
8742 destdn
->set_alternate_name(alternate_name
);
// --- project the source onto the destination dentry.
8744 if (srcdnl
->is_remote()) {
8747 if (destdn
->is_auth())
8748 mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty();
8749 destdn
->push_projected_linkage(srcdnl
->get_remote_ino(), srcdnl
->get_remote_d_type());
8751 if (srci
->is_auth()) {
8752 auto pi
= srci
->project_inode(mdr
);
8753 pi
.inode
->version
= srci
->pre_dirty();
8754 spi
= pi
.inode
.get();
8757 dout(10) << " will merge remote onto primary link" << dendl
;
8758 if (destdn
->is_auth()) {
8759 auto pi
= oldin
->project_inode(mdr
);
8760 pi
.inode
->version
= mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty(oldin
->get_version());
8761 spi
= pi
.inode
.get();
// Primary move: the inode itself relocates to destdn; may require
// importing it from srcdn's auth MDS.
8765 if (destdn
->is_auth()) {
8767 if (srcdn
->is_auth())
8768 oldpv
= srci
->get_projected_version();
8770 oldpv
= _rename_prepare_import(mdr
, srcdn
, client_map_bl
);
8772 // note which dirfrags have child subtrees in the journal
8773 // event, so that we can open those (as bounds) during replay.
8774 if (srci
->is_dir()) {
8775 auto&& ls
= srci
->get_dirfrags();
8776 for (const auto& dir
: ls
) {
8777 if (!dir
->is_auth())
8778 metablob
->renamed_dir_frags
.push_back(dir
->get_frag());
8780 dout(10) << " noting renamed dir open frags " << metablob
->renamed_dir_frags
<< dendl
;
8783 auto pi
= srci
->project_inode(mdr
); // project snaprealm if srcdnl->is_primary
8784 // & srcdnl->snaprealm
8785 pi
.inode
->version
= mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty(oldpv
);
8786 pi
.inode
->update_backtrace();
8787 spi
= pi
.inode
.get();
8789 destdn
->push_projected_linkage(srci
);
// --- null out the source dentry.
8793 if (srcdn
->is_auth())
8794 mdr
->more()->pvmap
[srcdn
] = srcdn
->pre_dirty();
8795 srcdn
->push_projected_linkage(); // push null linkage
// --- stamp ctime/rctime on the projected inodes.
8799 spi
->ctime
= mdr
->get_op_stamp();
8800 if (mdr
->get_op_stamp() > spi
->rstat
.rctime
)
8801 spi
->rstat
.rctime
= mdr
->get_op_stamp();
8807 tpi
->ctime
= mdr
->get_op_stamp();
8808 if (mdr
->get_op_stamp() > tpi
->rstat
.rctime
)
8809 tpi
->rstat
.rctime
= mdr
->get_op_stamp();
// Remember the overwritten inode's old path for stray handling.
8813 destdn
->make_path_string(t
, true);
8814 tpi
->stray_prior_path
= std::move(t
);
8817 if (tpi
->nlink
== 0)
8818 oldin
->state_set(CInode::STATE_ORPHAN
);
8822 // prepare nesting, mtime updates
8823 int predirty_dir
= silent
? 0:PREDIRTY_DIR
;
8825 // guarantee stray dir is processed first during journal replay. unlink the old inode,
8826 // then link the source inode to destdn
8827 if (destdnl
->is_primary()) {
8828 ceph_assert(straydn
);
8829 if (straydn
->is_auth()) {
8830 metablob
->add_dir_context(straydn
->get_dir());
8831 metablob
->add_dir(straydn
->get_dir(), true);
8835 if (!linkmerge
&& destdnl
->is_remote() && oldin
->is_auth()) {
8836 CDir
*oldin_dir
= oldin
->get_projected_parent_dir();
8837 if (oldin_dir
!= srcdn
->get_dir() && oldin_dir
!= destdn
->get_dir())
8838 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, oldin_dir
, PREDIRTY_PRIMARY
);
8842 if (destdn
->is_auth() && !destdnl
->is_null()) {
8843 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, destdn
->get_dir(),
8844 (destdnl
->is_primary() ? PREDIRTY_PRIMARY
:0)|predirty_dir
, -1);
8845 if (destdnl
->is_primary()) {
8846 ceph_assert(straydn
);
8847 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, straydn
->get_dir(),
8848 PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
8852 if (srcdnl
->is_remote() && srci
->is_auth()) {
8853 CDir
*srci_dir
= srci
->get_projected_parent_dir();
8854 if (srci_dir
!= srcdn
->get_dir() && srci_dir
!= destdn
->get_dir())
8855 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, srci_dir
, PREDIRTY_PRIMARY
);
8859 int predirty_primary
= (srcdnl
->is_primary() && srcdn
->get_dir() != destdn
->get_dir()) ? PREDIRTY_PRIMARY
:0;
8860 int flags
= predirty_dir
| predirty_primary
;
8861 if (srcdn
->is_auth())
8862 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, srcdn
->get_dir(), PREDIRTY_SHALLOW
|flags
, -1);
8863 if (destdn
->is_auth())
8864 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, destdn
->get_dir(), flags
, 1);
8866 // add it all to the metablob
// --- journal the displaced target (stray dentry or remote nlink--).
8869 if (destdnl
->is_primary()) {
8870 ceph_assert(straydn
);
8871 if (destdn
->is_auth()) {
8872 // project snaprealm, too
8873 if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
8874 oldin
->project_snaprealm(desti_srnode
);
8875 if (tpi
->nlink
== 0)
8876 ceph_assert(!desti_srnode
->is_parent_global());
8877 desti_srnode
= NULL
;
8879 straydn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
8880 metablob
->add_primary_dentry(straydn
, oldin
, true, true);
8881 } else if (force_journal_stray
) {
8882 dout(10) << " forced journaling straydn " << *straydn
<< dendl
;
8883 metablob
->add_dir_context(straydn
->get_dir());
8884 metablob
->add_primary_dentry(straydn
, oldin
, true);
8886 } else if (destdnl
->is_remote()) {
8887 if (oldin
->is_auth()) {
8888 sr_t
*new_srnode
= NULL
;
8889 if (mdr
->peer_request
) {
8890 if (mdr
->peer_request
->desti_snapbl
.length() > 0) {
8891 new_srnode
= new sr_t();
8892 auto p
= mdr
->peer_request
->desti_snapbl
.cbegin();
8893 decode(*new_srnode
, p
);
8895 } else if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
8896 new_srnode
= desti_srnode
;
8897 desti_srnode
= NULL
;
8900 oldin
->project_snaprealm(new_srnode
);
8901 if (tpi
->nlink
== 0)
8902 ceph_assert(!new_srnode
->is_parent_global());
8905 CDentry
*oldin_pdn
= oldin
->get_projected_parent_dn();
8906 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, oldin_pdn
);
8907 metablob
->add_primary_dentry(oldin_pdn
, oldin
, true);
// --- journal the destination dentry (remote link vs primary move).
8913 if (srcdnl
->is_remote()) {
8914 ceph_assert(!linkmerge
);
8915 if (destdn
->is_auth() && !destdnl
->is_null())
8916 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
8918 destdn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
8920 if (destdn
->is_auth())
8921 metablob
->add_remote_dentry(destdn
, true, srcdnl
->get_remote_ino(), srcdnl
->get_remote_d_type());
8923 if (srci
->is_auth() ) { // it's remote
8924 if (mdr
->peer_request
) {
8925 if (mdr
->peer_request
->srci_snapbl
.length() > 0) {
8926 sr_t
*new_srnode
= new sr_t();
8927 auto p
= mdr
->peer_request
->srci_snapbl
.cbegin();
8928 decode(*new_srnode
, p
);
8929 srci
->project_snaprealm(new_srnode
);
8931 } else if (auto& srci_srnode
= mdr
->more()->srci_srnode
) {
8932 srci
->project_snaprealm(srci_srnode
);
8936 CDentry
*srci_pdn
= srci
->get_projected_parent_dn();
8937 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, srci_pdn
);
8938 metablob
->add_primary_dentry(srci_pdn
, srci
, true);
8940 } else if (srcdnl
->is_primary()) {
8941 // project snap parent update?
8942 if (destdn
->is_auth()) {
8943 if (auto& srci_srnode
= mdr
->more()->srci_srnode
) {
8944 srci
->project_snaprealm(srci_srnode
);
8949 if (destdn
->is_auth() && !destdnl
->is_null())
8950 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
8952 destdn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
8954 if (destdn
->is_auth())
8955 metablob
->add_primary_dentry(destdn
, srci
, true, true);
8956 else if (force_journal_dest
) {
8957 dout(10) << " forced journaling destdn " << *destdn
<< dendl
;
8958 metablob
->add_dir_context(destdn
->get_dir());
8959 metablob
->add_primary_dentry(destdn
, srci
, true);
8960 if (srcdn
->is_auth() && srci
->is_dir()) {
8961 // journal new subtrees root dirfrags
8962 auto&& ls
= srci
->get_dirfrags();
8963 for (const auto& dir
: ls
) {
8965 metablob
->add_dir(dir
, true);
// --- journal the (now null) source dentry.
8972 if (srcdn
->is_auth()) {
8973 dout(10) << " journaling srcdn " << *srcdn
<< dendl
;
8974 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, srcdn
, CEPH_NOSNAP
, 0, srcdnl
);
8975 // also journal the inode in case we need do peer rename rollback. It is Ok to add
8976 // both primary and NULL dentries. Because during journal replay, null dentry is
8977 // processed after primary dentry.
8978 if (srcdnl
->is_primary() && !srci
->is_dir() && !destdn
->is_auth())
8979 metablob
->add_primary_dentry(srcdn
, srci
, true);
8980 metablob
->add_null_dentry(srcdn
, true);
8982 dout(10) << " NOT journaling srcdn " << *srcdn
<< dendl
;
8984 // make renamed inode first track the dn
8985 if (srcdnl
->is_primary() && destdn
->is_auth()) {
8986 ceph_assert(srci
->first
<= destdn
->first
);
8987 srci
->first
= destdn
->first
;
8989 // make stray inode first track the straydn
8990 if (straydn
&& straydn
->is_auth()) {
8991 ceph_assert(oldin
->first
<= straydn
->first
);
8992 oldin
->first
= straydn
->first
;
// --- project subtree moves for any directories changing parents.
8995 if (oldin
&& oldin
->is_dir()) {
8996 ceph_assert(straydn
);
8997 mdcache
->project_subtree_rename(oldin
, destdn
->get_dir(), straydn
->get_dir());
9000 mdcache
->project_subtree_rename(srci
, srcdn
->get_dir(), destdn
->get_dir());
9005 void Server::_rename_apply(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
9007 dout(10) << "_rename_apply " << *mdr
<< " " << *srcdn
<< " " << *destdn
<< dendl
;
9008 dout(10) << " pvs " << mdr
->more()->pvmap
<< dendl
;
9010 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
9011 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
9013 CInode
*oldin
= destdnl
->get_inode();
9015 // primary+remote link merge?
9016 bool linkmerge
= (srcdnl
->get_inode() == oldin
);
9018 ceph_assert(srcdnl
->is_primary() || destdnl
->is_remote());
9020 bool new_in_snaprealm
= false;
9021 bool new_oldin_snaprealm
= false;
9025 if (destdnl
->is_primary()) {
9026 ceph_assert(straydn
);
9027 dout(10) << "straydn is " << *straydn
<< dendl
;
9029 // if there is newly created snaprealm, need to split old snaprealm's
9030 // inodes_with_caps. So pop snaprealm before linkage changes.
9031 if (destdn
->is_auth()) {
9032 bool hadrealm
= (oldin
->snaprealm
? true : false);
9033 oldin
->early_pop_projected_snaprealm();
9034 new_oldin_snaprealm
= (oldin
->snaprealm
&& !hadrealm
);
9036 ceph_assert(mdr
->peer_request
);
9037 if (mdr
->peer_request
->desti_snapbl
.length()) {
9038 new_oldin_snaprealm
= !oldin
->snaprealm
;
9039 oldin
->decode_snap_blob(mdr
->peer_request
->desti_snapbl
);
9040 ceph_assert(oldin
->snaprealm
);
9044 destdn
->get_dir()->unlink_inode(destdn
, false);
9046 straydn
->pop_projected_linkage();
9047 if (mdr
->is_peer() && !mdr
->more()->peer_update_journaled
)
9048 ceph_assert(!straydn
->is_projected()); // no other projected
9051 if (destdn
->is_auth())
9052 oldin
->pop_and_dirty_projected_inode(mdr
->ls
, mdr
);
9054 mdcache
->touch_dentry_bottom(straydn
); // drop dn as quickly as possible.
9055 } else if (destdnl
->is_remote()) {
9056 destdn
->get_dir()->unlink_inode(destdn
, false);
9057 if (oldin
->is_auth()) {
9058 oldin
->pop_and_dirty_projected_inode(mdr
->ls
, mdr
);
9059 } else if (mdr
->peer_request
) {
9060 if (mdr
->peer_request
->desti_snapbl
.length() > 0) {
9061 ceph_assert(oldin
->snaprealm
);
9062 oldin
->decode_snap_blob(mdr
->peer_request
->desti_snapbl
);
9064 } else if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
9065 delete desti_srnode
;
9066 desti_srnode
= NULL
;
9071 // unlink src before we relink it at dest
9072 CInode
*in
= srcdnl
->get_inode();
9075 bool srcdn_was_remote
= srcdnl
->is_remote();
9076 if (!srcdn_was_remote
) {
9077 // if there is newly created snaprealm, need to split old snaprealm's
9078 // inodes_with_caps. So pop snaprealm before linkage changes.
9079 if (destdn
->is_auth()) {
9080 bool hadrealm
= (in
->snaprealm
? true : false);
9081 in
->early_pop_projected_snaprealm();
9082 new_in_snaprealm
= (in
->snaprealm
&& !hadrealm
);
9084 ceph_assert(mdr
->peer_request
);
9085 if (mdr
->peer_request
->srci_snapbl
.length()) {
9086 new_in_snaprealm
= !in
->snaprealm
;
9087 in
->decode_snap_blob(mdr
->peer_request
->srci_snapbl
);
9088 ceph_assert(in
->snaprealm
);
9093 srcdn
->get_dir()->unlink_inode(srcdn
);
9096 if (srcdn_was_remote
) {
9099 destdnl
= destdn
->pop_projected_linkage();
9100 if (mdr
->is_peer() && !mdr
->more()->peer_update_journaled
)
9101 ceph_assert(!destdn
->is_projected()); // no other projected
9103 destdn
->link_remote(destdnl
, in
);
9104 if (destdn
->is_auth())
9105 destdn
->mark_dirty(mdr
->more()->pvmap
[destdn
], mdr
->ls
);
9107 if (in
->is_auth()) {
9108 in
->pop_and_dirty_projected_inode(mdr
->ls
, mdr
);
9109 } else if (mdr
->peer_request
) {
9110 if (mdr
->peer_request
->srci_snapbl
.length() > 0) {
9111 ceph_assert(in
->snaprealm
);
9112 in
->decode_snap_blob(mdr
->peer_request
->srci_snapbl
);
9114 } else if (auto& srci_srnode
= mdr
->more()->srci_srnode
) {
9119 dout(10) << "merging remote onto primary link" << dendl
;
9120 oldin
->pop_and_dirty_projected_inode(mdr
->ls
, mdr
);
9124 dout(10) << "merging primary onto remote link" << dendl
;
9125 destdn
->get_dir()->unlink_inode(destdn
, false);
9127 destdnl
= destdn
->pop_projected_linkage();
9128 if (mdr
->is_peer() && !mdr
->more()->peer_update_journaled
)
9129 ceph_assert(!destdn
->is_projected()); // no other projected
9131 // srcdn inode import?
9132 if (!srcdn
->is_auth() && destdn
->is_auth()) {
9133 ceph_assert(mdr
->more()->inode_import
.length() > 0);
9135 map
<client_t
,Capability::Import
> imported_caps
;
9137 // finish cap imports
9138 finish_force_open_sessions(mdr
->more()->imported_session_map
);
9139 if (mdr
->more()->cap_imports
.count(destdnl
->get_inode())) {
9140 mdcache
->migrator
->finish_import_inode_caps(destdnl
->get_inode(),
9141 mdr
->more()->srcdn_auth_mds
, true,
9142 mdr
->more()->imported_session_map
,
9143 mdr
->more()->cap_imports
[destdnl
->get_inode()],
9147 mdr
->more()->inode_import
.clear();
9148 encode(imported_caps
, mdr
->more()->inode_import
);
9150 /* hack: add an auth pin for each xlock we hold. These were
9151 * remote xlocks previously but now they're local and
9152 * we're going to try and unpin when we xlock_finish. */
9154 for (auto i
= mdr
->locks
.lower_bound(&destdnl
->get_inode()->versionlock
);
9155 i
!= mdr
->locks
.end();
9157 SimpleLock
*lock
= i
->lock
;
9158 if (lock
->get_parent() != destdnl
->get_inode())
9160 if (i
->is_xlock() && !lock
->is_locallock())
9161 mds
->locker
->xlock_import(lock
);
9164 // hack: fix auth bit
9165 in
->state_set(CInode::STATE_AUTH
);
9167 mdr
->clear_ambiguous_auth();
9170 if (destdn
->is_auth())
9171 in
->pop_and_dirty_projected_inode(mdr
->ls
, mdr
);
9175 if (srcdn
->is_auth())
9176 srcdn
->mark_dirty(mdr
->more()->pvmap
[srcdn
], mdr
->ls
);
9177 srcdn
->pop_projected_linkage();
9178 if (mdr
->is_peer() && !mdr
->more()->peer_update_journaled
)
9179 ceph_assert(!srcdn
->is_projected()); // no other projected
9181 // apply remaining projected inodes (nested)
9184 // update subtree map?
9185 if (destdnl
->is_primary() && in
->is_dir())
9186 mdcache
->adjust_subtree_after_rename(in
, srcdn
->get_dir(), true);
9188 if (straydn
&& oldin
->is_dir())
9189 mdcache
->adjust_subtree_after_rename(oldin
, destdn
->get_dir(), true);
9191 if (new_oldin_snaprealm
)
9192 mdcache
->do_realm_invalidate_and_update_notify(oldin
, CEPH_SNAP_OP_SPLIT
, false);
9193 if (new_in_snaprealm
)
9194 mdcache
->do_realm_invalidate_and_update_notify(in
, CEPH_SNAP_OP_SPLIT
, true);
9196 // removing a new dn?
9197 if (srcdn
->is_auth())
9198 srcdn
->get_dir()->try_remove_unlinked_dn(srcdn
);
// NOTE(review): journal-completion context for the peer side of a
// distributed rename: when the EPeerUpdate PREPARE entry is durable,
// finish() hands off to Server::_logged_peer_rename().  `mdr` comes
// from the ServerLogContext base.  (Extraction appears to have dropped
// the access-specifier and closing-brace lines here — verify against
// upstream Server.cc.)
9206 class C_MDS_PeerRenamePrep
: public ServerLogContext
{
9207 CDentry
*srcdn
, *destdn
, *straydn
;
9209 C_MDS_PeerRenamePrep(Server
*s
, MDRequestRef
& m
, CDentry
*sr
, CDentry
*de
, CDentry
*st
) :
9210 ServerLogContext(s
, m
), srcdn(sr
), destdn(de
), straydn(st
) {}
// NOTE(review): invoked by the MDLog once the prepare entry commits.
9211 void finish(int r
) override
{
9212 server
->_logged_peer_rename(mdr
, srcdn
, destdn
, straydn
);
// NOTE(review): stored as mdr->more()->peer_commit in
// handle_peer_rename_prep(); fired when the leader resolves the rename,
// forwarding the result code to Server::_commit_peer_rename() (r >= 0
// commits, r < 0 rolls back — see that function).  The init-list
// mentions mdr(m), so an MDRequestRef member presumably exists on a
// line the extraction dropped — verify against upstream.
9216 class C_MDS_PeerRenameCommit
: public ServerContext
{
9218 CDentry
*srcdn
, *destdn
, *straydn
;
9220 C_MDS_PeerRenameCommit(Server
*s
, MDRequestRef
& m
, CDentry
*sr
, CDentry
*de
, CDentry
*st
) :
9221 ServerContext(s
), mdr(m
), srcdn(sr
), destdn(de
), straydn(st
) {}
9222 void finish(int r
) override
{
9223 server
->_commit_peer_rename(mdr
, r
, srcdn
, destdn
, straydn
);
// NOTE(review): gather finisher used while freezing srci in
// handle_peer_rename_prep(): once all affected client sessions are
// flushed it resumes the prep path via
// Server::_peer_rename_sessions_flushed().  As above, the member
// declaration lines appear to have been dropped by extraction.
9227 class C_MDS_PeerRenameSessionsFlushed
: public ServerContext
{
9230 C_MDS_PeerRenameSessionsFlushed(Server
*s
, MDRequestRef
& r
) :
9231 ServerContext(s
), mdr(r
) {}
9232 void finish(int r
) override
{
9233 server
->_peer_rename_sessions_flushed(mdr
);
// NOTE(review): peer-side (witness) handler for OP_RENAMEPREP from the
// rename leader.  Traverses dest and src paths, optionally freezes and
// marks the source inode ambiguous-auth, verifies the witness list,
// encodes a rename_rollback blob, then journals an EPeerUpdate PREPARE
// and replies OP_RENAMEPREPACK from _logged_peer_rename().
// Several statement lines (returns, closing braces) were dropped by the
// extraction — original line numbering shows gaps; verify against
// upstream Server.cc before relying on control flow here.
9237 void Server::handle_peer_rename_prep(MDRequestRef
& mdr
)
9239 dout(10) << "handle_peer_rename_prep " << *mdr
9240 << " " << mdr
->peer_request
->srcdnpath
9241 << " to " << mdr
->peer_request
->destdnpath
// NOTE(review): leader aborted before we got here — ack with a no-op
// reply so it can clean up.
9244 if (mdr
->peer_request
->is_interrupted()) {
9245 dout(10) << " peer request interrupted, sending noop reply" << dendl
;
9246 auto reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RENAMEPREPACK
);
9247 reply
->mark_interrupted();
9248 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
9249 mdr
->reset_peer_request();
// NOTE(review): discover the destination dentry (WANT_DENTRY: the
// dentry itself, not the inode).
9254 filepath
destpath(mdr
->peer_request
->destdnpath
);
9255 dout(10) << " dest " << destpath
<< dendl
;
9256 vector
<CDentry
*> trace
;
9257 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, false);
9258 int r
= mdcache
->path_traverse(mdr
, cf
, destpath
,
9259 MDS_TRAVERSE_DISCOVER
| MDS_TRAVERSE_PATH_LOCKED
| MDS_TRAVERSE_WANT_DENTRY
,
// NOTE(review): extraction gap (orig. 9260-9261) — presumably the
// out-argument and the r > 0 retry return; verify upstream.
9262 if (r
== -CEPHFS_ESTALE
) {
9263 mdcache
->find_ino_peers(destpath
.get_ino(), new C_MDS_RetryRequest(mdcache
, mdr
),
9264 mdr
->peer_to_mds
, true);
9267 ceph_assert(r
== 0); // we shouldn't get an error here!
9269 CDentry
*destdn
= trace
.back();
9270 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
9271 dout(10) << " destdn " << *destdn
<< dendl
;
// NOTE(review): now discover the source dentry/inode.
9275 filepath
srcpath(mdr
->peer_request
->srcdnpath
);
9276 dout(10) << " src " << srcpath
<< dendl
;
9277 CInode
*srci
= nullptr;
9278 r
= mdcache
->path_traverse(mdr
, cf
, srcpath
,
9279 MDS_TRAVERSE_DISCOVER
| MDS_TRAVERSE_PATH_LOCKED
,
// NOTE(review): extraction gap (orig. 9280-9281) — presumably the
// out-arguments filling `trace`/`srci` and the r > 0 retry return.
9282 ceph_assert(r
== 0);
9284 CDentry
*srcdn
= trace
.back();
9285 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
9286 dout(10) << " srcdn " << *srcdn
<< dendl
;
// NOTE(review): linkmerge = src and dest already point at the same
// inode (rename of a hard link onto its own primary/remote pair).
9291 bool linkmerge
= srcdnl
->get_inode() == destdnl
->get_inode();
// NOTE(review): extraction gap at orig. 9292 — upstream has a guard
// (likely `if (linkmerge)`) before this assert; as shown the assert
// would be unconditional, which is NOT the original behavior. Verify.
9293 ceph_assert(srcdnl
->is_primary() && destdnl
->is_remote());
9294 CDentry
*straydn
= mdr
->straydn
;
9295 if (destdnl
->is_primary() && !linkmerge
)
9296 ceph_assert(straydn
);
9298 mdr
->set_op_stamp(mdr
->peer_request
->op_stamp
);
9299 mdr
->more()->srcdn_auth_mds
= srcdn
->authority().first
;
9301 // set up commit waiter (early, to clean up any freezing etc we do)
9302 if (!mdr
->more()->peer_commit
)
9303 mdr
->more()->peer_commit
= new C_MDS_PeerRenameCommit(this, mdr
, srcdn
, destdn
, straydn
);
// NOTE(review): we are auth for srcdn — may need to freeze srci,
// notify bystander replicas, and flush client sessions before acking.
9306 if (srcdn
->is_auth()) {
9307 set
<mds_rank_t
> srcdnrep
;
9308 srcdn
->list_replicas(srcdnrep
);
9310 bool reply_witness
= false;
9311 if (srcdnl
->is_primary() && !srcdnl
->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH
)) {
9314 // - avoid conflicting lock state changes
9315 // - avoid concurrent updates to the inode
9316 // (this could also be accomplished with the versionlock)
9317 int allowance
= 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
9318 dout(10) << " freezing srci " << *srcdnl
->get_inode() << " with allowance " << allowance
<< dendl
;
9319 bool frozen_inode
= srcdnl
->get_inode()->freeze_inode(allowance
);
9321 // unfreeze auth pin after freezing the inode to avoid queueing waiters
9322 if (srcdnl
->get_inode()->is_frozen_auth_pin())
9323 mdr
->unfreeze_auth_pin();
// NOTE(review): freeze did not complete — wait and retry the request.
9325 if (!frozen_inode
) {
9326 srcdnl
->get_inode()->add_waiter(CInode::WAIT_FROZEN
, new C_MDS_RetryRequest(mdcache
, mdr
));
9331 * set ambiguous auth for srci
9332 * NOTE: we don't worry about ambiguous cache expire as we do
9333 * with subtree migrations because all peers will pin
9334 * srcdn->get_inode() for duration of this rename.
9336 mdr
->set_ambiguous_auth(srcdnl
->get_inode());
9338 // just mark the source inode as ambiguous auth if more than two MDS are involved.
9339 // the leader will send another OP_RENAMEPREP peer request later.
9340 if (mdr
->peer_request
->witnesses
.size() > 1) {
9341 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl
;
9342 reply_witness
= true;
9345 // make sure bystanders have received all lock related messages
9346 for (set
<mds_rank_t
>::iterator p
= srcdnrep
.begin(); p
!= srcdnrep
.end(); ++p
) {
9347 if (*p
== mdr
->peer_to_mds
||
9348 (mds
->is_cluster_degraded() &&
9349 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(*p
)))
9351 auto notify
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RENAMENOTIFY
);
9352 mds
->send_message_mds(notify
, *p
);
9353 mdr
->more()->waiting_on_peer
.insert(*p
);
9356 // make sure clients have received all cap related messages
9357 set
<client_t
> export_client_set
;
9358 mdcache
->migrator
->get_export_client_set(srcdnl
->get_inode(), export_client_set
);
9360 MDSGatherBuilder
gather(g_ceph_context
);
9361 flush_client_sessions(export_client_set
, gather
);
// NOTE(review): MDS_RANK_NONE is used as a sentinel "waiting on
// session flush" entry; cleared by _peer_rename_sessions_flushed().
9362 if (gather
.has_subs()) {
9363 mdr
->more()->waiting_on_peer
.insert(MDS_RANK_NONE
);
9364 gather
.set_finisher(new C_MDS_PeerRenameSessionsFlushed(this, mdr
));
9369 // is witness list sufficient?
9370 for (set
<mds_rank_t
>::iterator p
= srcdnrep
.begin(); p
!= srcdnrep
.end(); ++p
) {
9371 if (*p
== mdr
->peer_to_mds
||
9372 mdr
->peer_request
->witnesses
.count(*p
)) continue;
9373 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl
;
9374 reply_witness
= true;
// NOTE(review): tell the leader which replicas must also witness; it
// will retry with an expanded witness set.
9378 if (reply_witness
) {
9379 ceph_assert(!srcdnrep
.empty());
9380 auto reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RENAMEPREPACK
);
9381 reply
->witnesses
.swap(srcdnrep
);
9382 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
9383 mdr
->reset_peer_request();
9386 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl
;
9387 if (!mdr
->more()->waiting_on_peer
.empty()) {
9388 dout(10) << " still waiting for rename notify acks from "
9389 << mdr
->more()->waiting_on_peer
<< dendl
;
9392 } else if (srcdnl
->is_primary() && srcdn
->authority() != destdn
->authority()) {
9393 // set ambiguous auth for srci on witnesses
9394 mdr
->set_ambiguous_auth(srcdnl
->get_inode());
9397 // encode everything we'd need to roll this back... basically, just the original state.
9398 rename_rollback rollback
;
9400 rollback
.reqid
= mdr
->reqid
;
9402 rollback
.orig_src
.dirfrag
= srcdn
->get_dir()->dirfrag();
9403 rollback
.orig_src
.dirfrag_old_mtime
= srcdn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
9404 rollback
.orig_src
.dirfrag_old_rctime
= srcdn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
9405 rollback
.orig_src
.dname
= srcdn
->get_name();
9406 if (srcdnl
->is_primary())
9407 rollback
.orig_src
.ino
= srcdnl
->get_inode()->ino();
9409 ceph_assert(srcdnl
->is_remote());
9410 rollback
.orig_src
.remote_ino
= srcdnl
->get_remote_ino();
9411 rollback
.orig_src
.remote_d_type
= srcdnl
->get_remote_d_type();
9414 rollback
.orig_dest
.dirfrag
= destdn
->get_dir()->dirfrag();
9415 rollback
.orig_dest
.dirfrag_old_mtime
= destdn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
9416 rollback
.orig_dest
.dirfrag_old_rctime
= destdn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
9417 rollback
.orig_dest
.dname
= destdn
->get_name();
9418 if (destdnl
->is_primary())
9419 rollback
.orig_dest
.ino
= destdnl
->get_inode()->ino();
9420 else if (destdnl
->is_remote()) {
9421 rollback
.orig_dest
.remote_ino
= destdnl
->get_remote_ino();
9422 rollback
.orig_dest
.remote_d_type
= destdnl
->get_remote_d_type();
// NOTE(review): stray record — only meaningful when the dest primary
// inode gets moved to a stray dentry (guard line not visible here).
9426 rollback
.stray
.dirfrag
= straydn
->get_dir()->dirfrag();
9427 rollback
.stray
.dirfrag_old_mtime
= straydn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
9428 rollback
.stray
.dirfrag_old_rctime
= straydn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
9429 rollback
.stray
.dname
= straydn
->get_name();
// NOTE(review): snapshot-realm blobs: record whether src/dest inodes
// currently have a realm so rollback can restore/remove it.
9431 if (mdr
->peer_request
->desti_snapbl
.length()) {
9432 CInode
*oldin
= destdnl
->get_inode();
9433 if (oldin
->snaprealm
) {
9434 encode(true, rollback
.desti_snapbl
);
9435 oldin
->encode_snap_blob(rollback
.desti_snapbl
);
9437 encode(false, rollback
.desti_snapbl
);
9440 if (mdr
->peer_request
->srci_snapbl
.length()) {
9441 if (srci
->snaprealm
) {
9442 encode(true, rollback
.srci_snapbl
);
9443 srci
->encode_snap_blob(rollback
.srci_snapbl
);
9445 encode(false, rollback
.srci_snapbl
);
9448 encode(rollback
, mdr
->more()->rollback_bl
);
9449 // FIXME: rollback snaprealm
9450 dout(20) << " rollback is " << mdr
->more()->rollback_bl
.length() << " bytes" << dendl
;
// NOTE(review): journal the PREPARE, carrying the rollback blob so the
// update can be undone if the leader dies before commit.
9453 mdr
->ls
= mdlog
->get_current_segment();
9454 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rename_prep", mdr
->reqid
, mdr
->peer_to_mds
,
9455 EPeerUpdate::OP_PREPARE
, EPeerUpdate::RENAME
);
9456 mdlog
->start_entry(le
);
9457 le
->rollback
= mdr
->more()->rollback_bl
;
9459 bufferlist blah
; // inode import data... obviously not used if we're the peer
9460 _rename_prepare(mdr
, &le
->commit
, &blah
, srcdn
, destdn
, mdr
->peer_request
->alternate_name
, straydn
);
// NOTE(review): nothing to journal locally — skip straight to the
// post-journal path without recording an uncommitted peer update.
9462 if (le
->commit
.empty()) {
9463 dout(10) << " empty metablob, skipping journal" << dendl
;
9464 mdlog
->cancel_entry(le
);
9466 _logged_peer_rename(mdr
, srcdn
, destdn
, straydn
);
9468 mdcache
->add_uncommitted_peer(mdr
->reqid
, mdr
->ls
, mdr
->peer_to_mds
);
9469 mdr
->more()->peer_update_journaled
= true;
9470 submit_mdlog_entry(le
, new C_MDS_PeerRenamePrep(this, mdr
, srcdn
, destdn
, straydn
),
// NOTE(review): runs after the peer's PREPARE entry is durable (or was
// skipped).  If we are exporting the source inode's authority, encode
// its full state (plus client cap/session maps) into the PREPACK reply,
// apply the rename to the local cache, and ack the leader.  If the mdr
// was aborted meanwhile, just finish the request.  Some closing-brace
// lines are missing from this extraction.
9476 void Server::_logged_peer_rename(MDRequestRef
& mdr
,
9477 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
9479 dout(10) << "_logged_peer_rename " << *mdr
<< dendl
;
9482 ref_t
<MMDSPeerRequest
> reply
;
9483 if (!mdr
->aborted
) {
9484 reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RENAMEPREPACK
);
9485 if (!mdr
->more()->peer_update_journaled
)
9486 reply
->mark_not_journaled();
9489 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
9490 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
// NOTE(review): exporting srci's auth to the leader: temporarily mark
// its dirfrags as export bounds so encode_export_inode() stops there.
9493 if (srcdn
->is_auth() && srcdnl
->is_primary()) {
9494 // set export bounds for CInode::encode_export()
9496 std::vector
<CDir
*> bounds
;
9497 if (srcdnl
->get_inode()->is_dir()) {
9498 srcdnl
->get_inode()->get_dirfrags(bounds
);
9499 for (const auto& bound
: bounds
) {
9500 bound
->state_set(CDir::STATE_EXPORTBOUND
);
9504 map
<client_t
,entity_inst_t
> exported_client_map
;
9505 map
<client_t
, client_metadata_t
> exported_client_metadata_map
;
// NOTE(review): `inodebl` declaration not visible — presumably dropped
// by extraction (orig. 9506); verify upstream.
9507 mdcache
->migrator
->encode_export_inode(srcdnl
->get_inode(), inodebl
,
9508 exported_client_map
,
9509 exported_client_metadata_map
);
9511 for (const auto& bound
: bounds
) {
9512 bound
->state_clear(CDir::STATE_EXPORTBOUND
);
// NOTE(review): reply payload layout: client map, client metadata map,
// then the raw inode export blob; version pins what was exported.
9515 encode(exported_client_map
, reply
->inode_export
, mds
->mdsmap
->get_up_features());
9516 encode(exported_client_metadata_map
, reply
->inode_export
);
9517 reply
->inode_export
.claim_append(inodebl
);
9518 reply
->inode_export_v
= srcdnl
->get_inode()->get_version();
9521 // remove mdr auth pin
9522 mdr
->auth_unpin(srcdnl
->get_inode());
9523 mdr
->more()->is_inode_exporter
= true;
9525 if (srcdnl
->get_inode()->is_dirty())
9526 srcdnl
->get_inode()->mark_clean();
9528 dout(10) << " exported srci " << *srcdnl
->get_inode() << dendl
;
// NOTE(review): actually mutate the cache (unlink/link dentries etc.).
9532 _rename_apply(mdr
, srcdn
, destdn
, straydn
);
9534 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
// NOTE(review): feed the balancer's popularity stats.
9537 mds
->balancer
->hit_dir(srcdn
->get_dir(), META_POP_IWR
);
9538 if (destdnl
->get_inode() && destdnl
->get_inode()->is_auth())
9539 mds
->balancer
->hit_inode(destdnl
->get_inode(), META_POP_IWR
);
9542 mdr
->reset_peer_request();
9546 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
// NOTE(review): aborted branch — reply was never built; just finish.
9548 ceph_assert(mdr
->aborted
);
9549 dout(10) << " abort flag set, finishing" << dendl
;
9550 mdcache
->request_finish(mdr
);
// NOTE(review): leader's verdict arrives here.  r >= 0 (commit branch,
// guard line not visible): finish the inode export, unfreeze, clear
// ambiguous auth, and journal an EPeerUpdate COMMIT (or call
// _committed_peer directly if nothing was journaled).  r < 0 (abort):
// roll back via do_rename_rollback() using the saved rollback blob.
// Several brace/else lines are missing from this extraction.
9554 void Server::_commit_peer_rename(MDRequestRef
& mdr
, int r
,
9555 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
9557 dout(10) << "_commit_peer_rename " << *mdr
<< " r=" << r
<< dendl
;
9559 CInode
*in
= destdn
->get_linkage()->get_inode();
9561 inodeno_t migrated_stray
;
9562 if (srcdn
->is_auth() && srcdn
->get_dir()->inode
->is_stray())
9563 migrated_stray
= in
->ino();
9565 MDSContext::vec finished
;
9567 // unfreeze+singleauth inode
9568 // hmm, do i really need to delay this?
9569 if (mdr
->more()->is_inode_exporter
) {
9571 // we exported, clear out any xlocks that we moved to another MDS
9573 for (auto i
= mdr
->locks
.lower_bound(&in
->versionlock
);
9574 i
!= mdr
->locks
.end(); ) {
9575 SimpleLock
*lock
= i
->lock
;
9576 if (lock
->get_parent() != in
)
9578 // we only care about xlocks on the exported inode
9579 if (i
->is_xlock() && !lock
->is_locallock())
9580 mds
->locker
->xlock_export(i
++, mdr
.get());
// NOTE(review): caps the importer (leader) took over, decoded from the
// inode_import blob _rename_apply stashed on the mdr.
9585 map
<client_t
,Capability::Import
> peer_imported
;
9586 auto bp
= mdr
->more()->inode_import
.cbegin();
9587 decode(peer_imported
, bp
);
9589 dout(10) << " finishing inode export on " << *in
<< dendl
;
9590 mdcache
->migrator
->finish_export_inode(in
, mdr
->peer_to_mds
, peer_imported
, finished
);
9591 mds
->queue_waiters(finished
); // this includes SINGLEAUTH waiters.
9594 ceph_assert(in
->is_frozen_inode());
9595 in
->unfreeze_inode(finished
);
9599 if (mdr
->more()->is_ambiguous_auth
) {
9600 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
9601 mdr
->more()->is_ambiguous_auth
= false;
// NOTE(review): the stray inode's dirty bits can be dropped once the
// update is journaled, unless snapshots still reference it.
9604 if (straydn
&& mdr
->more()->peer_update_journaled
) {
9605 CInode
*strayin
= straydn
->get_projected_linkage()->get_inode();
9606 if (strayin
&& !strayin
->snaprealm
)
9607 mdcache
->clear_dirty_bits_for_stray(strayin
);
9610 mds
->queue_waiters(finished
);
9613 if (mdr
->more()->peer_update_journaled
) {
9614 // write a commit to the journal
9615 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rename_commit", mdr
->reqid
,
9616 mdr
->peer_to_mds
, EPeerUpdate::OP_COMMIT
,
9617 EPeerUpdate::RENAME
);
9618 mdlog
->start_entry(le
);
9619 submit_mdlog_entry(le
, new C_MDS_CommittedPeer(this, mdr
), mdr
, __func__
);
9622 _committed_peer(mdr
);
// NOTE(review): abort path (r < 0; the `else` itself is in a dropped
// line) — undo the prepared rename.
9627 // rollback_bl may be empty if we froze the inode but had to provide an expanded
9628 // witness list from the leader, and they failed before we tried prep again.
9629 if (mdr
->more()->rollback_bl
.length()) {
9630 if (mdr
->more()->is_inode_exporter
) {
9631 dout(10) << " reversing inode export of " << *in
<< dendl
;
// NOTE(review): ambiguous updates keep the peer request alive so the
// new leader can re-resolve; otherwise the rollback finishes the mdr.
9634 if (mdcache
->is_ambiguous_peer_update(mdr
->reqid
, mdr
->peer_to_mds
)) {
9635 mdcache
->remove_ambiguous_peer_update(mdr
->reqid
, mdr
->peer_to_mds
);
9636 // rollback but preserve the peer request
9637 do_rename_rollback(mdr
->more()->rollback_bl
, mdr
->peer_to_mds
, mdr
, false);
9638 mdr
->more()->rollback_bl
.clear();
9640 do_rename_rollback(mdr
->more()->rollback_bl
, mdr
->peer_to_mds
, mdr
, true);
9642 dout(10) << " rollback_bl empty, not rollback back rename (leader failed after getting extra witnesses?)" << dendl
;
9644 if (mdr
->more()->is_ambiguous_auth
) {
9645 if (srcdn
->is_auth())
9646 mdr
->more()->rename_inode
->unfreeze_inode(finished
);
9648 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
9649 mdr
->more()->is_ambiguous_auth
= false;
9651 mds
->queue_waiters(finished
);
9652 mdcache
->request_finish(mdr
);
// NOTE(review): if we are stopping, the stray we migrated away must be
// accounted for in shutdown bookkeeping.
9656 if (migrated_stray
&& mds
->is_stopping())
9657 mdcache
->shutdown_export_stray_finish(migrated_stray
);
// NOTE(review): helper for do_rename_rollback(): re-add one link's
// accounting to `dir`'s projected fnode (fragstat counts + nested
// rstat) and restore the pre-rename mtime/rctime if the rename was the
// last thing to touch them.  The guard lines selecting nsubdirs vs
// nfiles (presumably `if (isdir) ... else ...`, orig. 9666-9672) were
// dropped by the extraction — verify against upstream before editing.
9660 static void _rollback_repair_dir(MutationRef
& mut
, CDir
*dir
,
9661 rename_rollback::drec
&r
, utime_t ctime
,
9662 bool isdir
, const nest_info_t
&rstat
)
9664 auto pf
= dir
->project_fnode(mut
);
9665 pf
->version
= dir
->pre_dirty();
9668 pf
->fragstat
.nsubdirs
+= 1;
9670 pf
->fragstat
.nfiles
+= 1;
// NOTE(review): restore the nested rstat contribution of the link.
9673 pf
->rstat
.rbytes
+= rstat
.rbytes
;
9674 pf
->rstat
.rfiles
+= rstat
.rfiles
;
9675 pf
->rstat
.rsubdirs
+= rstat
.rsubdirs
;
9676 pf
->rstat
.rsnaps
+= rstat
.rsnaps
;
// NOTE(review): only rewind timestamps if the rename's ctime is still
// the latest — otherwise a newer update owns them.
9678 if (pf
->fragstat
.mtime
== ctime
) {
9679 pf
->fragstat
.mtime
= r
.dirfrag_old_mtime
;
9680 if (pf
->rstat
.rctime
== ctime
)
9681 pf
->rstat
.rctime
= r
.dirfrag_old_rctime
;
9683 mut
->add_updated_lock(&dir
->get_inode()->filelock
);
9684 mut
->add_updated_lock(&dir
->get_inode()->nestlock
);
// NOTE(review): journal-completion context for the ROLLBACK entry
// written by do_rename_rollback(); once durable it applies the cache
// changes via Server::_rename_rollback_finish().  The member
// declarations for mut/srcdn/srcdnpv/destdn/straydn/finish_mdr (orig.
// 9688-9692) are not visible — dropped by extraction; the init list
// confirms they exist.  splits[] carries the MClientSnap messages for
// the two possible snaprealm merges (src inode, dest/target inode).
9687 struct C_MDS_LoggedRenameRollback
: public ServerLogContext
{
9693 map
<client_t
,ref_t
<MClientSnap
>> splits
[2];
9695 C_MDS_LoggedRenameRollback(Server
*s
, MutationRef
& m
, MDRequestRef
& r
,
9696 CDentry
*sd
, version_t pv
, CDentry
*dd
, CDentry
*st
,
9697 map
<client_t
,ref_t
<MClientSnap
>> _splits
[2], bool f
) :
9698 ServerLogContext(s
, r
), mut(m
), srcdn(sd
), srcdnpv(pv
), destdn(dd
),
9699 straydn(st
), finish_mdr(f
) {
// NOTE(review): swap() steals the split maps without copying.
9700 splits
[0].swap(_splits
[0]);
9701 splits
[1].swap(_splits
[1]);
9703 void finish(int r
) override
{
9704 server
->_rename_rollback_finish(mut
, mdr
, srcdn
, srcdnpv
,
9705 destdn
, straydn
, splits
, finish_mdr
);
// NOTE(review): undo a prepared-but-uncommitted peer rename.  Decodes
// the rename_rollback blob, re-looks-up the src/dest/stray dentries and
// inodes (any of which may have been trimmed), re-projects the original
// linkages, inode ctimes and snaprealms, repairs directory accounting,
// then journals an EPeerUpdate ROLLBACK (or applies immediately if
// nothing was journaled).  Used both on leader failure and during MDS
// resolve.  Many guard/brace lines are missing from this extraction
// (original numbering has gaps throughout) — treat control flow shown
// here as incomplete and verify against upstream Server.cc.
9709 void Server::do_rename_rollback(bufferlist
&rbl
, mds_rank_t leader
, MDRequestRef
& mdr
,
9712 rename_rollback rollback
;
9713 auto p
= rbl
.cbegin();
9714 decode(rollback
, p
);
9716 dout(10) << "do_rename_rollback on " << rollback
.reqid
<< dendl
;
9717 // need to finish this update before sending resolve to claim the subtree
9718 mdcache
->add_rollback(rollback
.reqid
, leader
);
9720 MutationRef
mut(new MutationImpl(nullptr, utime_t(), rollback
.reqid
));
9721 mut
->ls
= mds
->mdlog
->get_current_segment();
// NOTE(review): re-locate the source dentry; the dirfrag may have been
// refragmented, hence the second lookup by (ino, dname).
9723 CDentry
*srcdn
= NULL
;
9724 CDir
*srcdir
= mdcache
->get_dirfrag(rollback
.orig_src
.dirfrag
);
9726 srcdir
= mdcache
->get_dirfrag(rollback
.orig_src
.dirfrag
.ino
, rollback
.orig_src
.dname
);
9728 dout(10) << " srcdir " << *srcdir
<< dendl
;
9729 srcdn
= srcdir
->lookup(rollback
.orig_src
.dname
);
9731 dout(10) << " srcdn " << *srcdn
<< dendl
;
9732 ceph_assert(srcdn
->get_linkage()->is_null());
9734 dout(10) << " srcdn not found" << dendl
;
9736 dout(10) << " srcdir not found" << dendl
;
// NOTE(review): same dance for the destination dentry.
9738 CDentry
*destdn
= NULL
;
9739 CDir
*destdir
= mdcache
->get_dirfrag(rollback
.orig_dest
.dirfrag
);
9741 destdir
= mdcache
->get_dirfrag(rollback
.orig_dest
.dirfrag
.ino
, rollback
.orig_dest
.dname
);
9743 dout(10) << " destdir " << *destdir
<< dendl
;
9744 destdn
= destdir
->lookup(rollback
.orig_dest
.dname
);
9746 dout(10) << " destdn " << *destdn
<< dendl
;
9748 dout(10) << " destdn not found" << dendl
;
9750 dout(10) << " destdir not found" << dendl
;
// NOTE(review): `in` = the renamed (source) inode; declaration line
// not visible (extraction gap around orig. 9752).
9753 if (rollback
.orig_src
.ino
) {
9754 in
= mdcache
->get_inode(rollback
.orig_src
.ino
);
9755 if (in
&& in
->is_dir())
9756 ceph_assert(srcdn
&& destdn
);
9758 in
= mdcache
->get_inode(rollback
.orig_src
.remote_ino
);
9760 CDir
*straydir
= NULL
;
9761 CDentry
*straydn
= NULL
;
9762 if (rollback
.stray
.dirfrag
.ino
) {
9763 straydir
= mdcache
->get_dirfrag(rollback
.stray
.dirfrag
);
9765 dout(10) << "straydir " << *straydir
<< dendl
;
9766 straydn
= straydir
->lookup(rollback
.stray
.dname
);
9768 dout(10) << " straydn " << *straydn
<< dendl
;
9769 ceph_assert(straydn
->get_linkage()->is_primary());
9771 dout(10) << " straydn not found" << dendl
;
9773 dout(10) << "straydir not found" << dendl
;
// NOTE(review): `target` = the inode the dest dentry originally linked.
9776 CInode
*target
= NULL
;
9777 if (rollback
.orig_dest
.ino
) {
9778 target
= mdcache
->get_inode(rollback
.orig_dest
.ino
);
9780 ceph_assert(destdn
&& straydn
);
9781 } else if (rollback
.orig_dest
.remote_ino
)
9782 target
= mdcache
->get_inode(rollback
.orig_dest
.remote_ino
);
9784 // can't use is_auth() in the resolve stage
9785 mds_rank_t whoami
= mds
->get_nodeid();
// NOTE(review): as the rename peer we are never auth for dest/stray.
9787 ceph_assert(!destdn
|| destdn
->authority().first
!= whoami
);
9788 ceph_assert(!straydn
|| straydn
->authority().first
!= whoami
);
9790 bool force_journal_src
= false;
9791 bool force_journal_dest
= false;
9792 if (in
&& in
->is_dir() && srcdn
->authority().first
!= whoami
)
9793 force_journal_src
= _need_force_journal(in
, false);
9794 if (in
&& target
&& target
->is_dir())
9795 force_journal_dest
= _need_force_journal(in
, true);
9797 version_t srcdnpv
= 0;
// NOTE(review): restore the original source linkage (primary or
// remote) as a projected linkage; applied in _rename_rollback_finish.
9800 if (srcdn
->authority().first
== whoami
)
9801 srcdnpv
= srcdn
->pre_dirty();
9802 if (rollback
.orig_src
.ino
) {
9804 srcdn
->push_projected_linkage(in
);
9806 srcdn
->push_projected_linkage(rollback
.orig_src
.remote_ino
,
9807 rollback
.orig_src
.remote_d_type
);
9810 map
<client_t
,ref_t
<MClientSnap
>> splits
[2];
9812 const CInode::mempool_inode
*pip
= nullptr;
// NOTE(review): rewind the source inode's ctime; `projected` flag
// declaration and the enclosing `if (in)` are in dropped lines.
9815 CDir
*pdir
= in
->get_projected_parent_dir();
9816 if (pdir
->authority().first
== whoami
) {
9817 auto pi
= in
->project_inode(mut
);
9818 pi
.inode
->version
= in
->pre_dirty();
9819 if (pdir
!= srcdir
) {
9820 auto pf
= pdir
->project_fnode(mut
);
9821 pf
->version
= pdir
->pre_dirty();
9823 if (pi
.inode
->ctime
== rollback
.ctime
)
9824 pi
.inode
->ctime
= rollback
.orig_src
.old_ctime
;
9827 if (in
->get_inode()->ctime
== rollback
.ctime
) {
9828 auto _inode
= CInode::allocate_inode(*in
->get_inode());
9829 _inode
->ctime
= rollback
.orig_src
.old_ctime
;
9830 in
->reset_inode(_inode
);
9834 pip
= in
->get_projected_inode().get();
// NOTE(review): restore the source inode's snaprealm to its
// pre-rename state (re-create, overwrite, or merge away).
9836 if (rollback
.srci_snapbl
.length() && in
->snaprealm
) {
9838 auto p
= rollback
.srci_snapbl
.cbegin();
9839 decode(hadrealm
, p
);
9841 if (projected
&& !mds
->is_resolve()) {
9842 sr_t
*new_srnode
= new sr_t();
9843 decode(*new_srnode
, p
);
9844 in
->project_snaprealm(new_srnode
);
9846 decode(in
->snaprealm
->srnode
, p
);
9849 if (rollback
.orig_src
.ino
) {
9850 ceph_assert(srcdir
);
9851 realm
= srcdir
->get_inode()->find_snaprealm();
9853 realm
= in
->snaprealm
->parent
;
9855 if (!mds
->is_resolve())
9856 mdcache
->prepare_realm_merge(in
->snaprealm
, realm
, splits
[0]);
9858 in
->project_snaprealm(NULL
);
9860 in
->snaprealm
->merge_to(realm
);
// NOTE(review): restore the destination dentry's original linkage.
9867 if (rollback
.orig_dest
.ino
&& target
) {
9868 destdn
->push_projected_linkage(target
);
9869 } else if (rollback
.orig_dest
.remote_ino
) {
9870 destdn
->push_projected_linkage(rollback
.orig_dest
.remote_ino
,
9871 rollback
.orig_dest
.remote_d_type
);
9873 // the dentry will be trimmed soon, it's ok to have wrong linkage
9874 if (rollback
.orig_dest
.ino
)
9875 ceph_assert(mds
->is_resolve());
9876 destdn
->push_projected_linkage();
// NOTE(review): empty the stray dentry again (its guard is in a
// dropped line).
9881 straydn
->push_projected_linkage();
// NOTE(review): rewind the target (old dest) inode's ctime and
// validate stray-reintegration invariants.
9885 CInode::inode_ptr ti
;
9886 CDir
*pdir
= target
->get_projected_parent_dir();
9887 if (pdir
->authority().first
== whoami
) {
9888 auto pi
= target
->project_inode(mut
);
9889 pi
.inode
->version
= target
->pre_dirty();
9890 if (pdir
!= srcdir
) {
9891 auto pf
= pdir
->project_fnode(mut
);
9892 pf
->version
= pdir
->pre_dirty();
9897 ti
= CInode::allocate_inode(*target
->get_inode());
9901 if (ti
->ctime
== rollback
.ctime
)
9902 ti
->ctime
= rollback
.orig_dest
.old_ctime
;
9903 if (MDS_INO_IS_STRAY(rollback
.orig_src
.dirfrag
.ino
)) {
9904 if (MDS_INO_IS_STRAY(rollback
.orig_dest
.dirfrag
.ino
))
9905 ceph_assert(!rollback
.orig_dest
.ino
&& !rollback
.orig_dest
.remote_ino
);
9907 ceph_assert(rollback
.orig_dest
.remote_ino
&&
9908 rollback
.orig_dest
.remote_ino
== rollback
.orig_src
.ino
);
9913 target
->reset_inode(ti
);
// NOTE(review): restore the target inode's snaprealm symmetrically.
9915 if (rollback
.desti_snapbl
.length() && target
->snaprealm
) {
9917 auto p
= rollback
.desti_snapbl
.cbegin();
9918 decode(hadrealm
, p
);
9920 if (projected
&& !mds
->is_resolve()) {
9921 sr_t
*new_srnode
= new sr_t();
9922 decode(*new_srnode
, p
);
9923 target
->project_snaprealm(new_srnode
);
9925 decode(target
->snaprealm
->srnode
, p
);
9928 if (rollback
.orig_dest
.ino
) {
9929 ceph_assert(destdir
);
9930 realm
= destdir
->get_inode()->find_snaprealm();
9932 realm
= target
->snaprealm
->parent
;
9934 if (!mds
->is_resolve())
9935 mdcache
->prepare_realm_merge(target
->snaprealm
, realm
, splits
[1]);
9937 target
->project_snaprealm(NULL
);
9939 target
->snaprealm
->merge_to(realm
);
// NOTE(review): re-add the link accounting to the source directory;
// `blah` is presumably an empty nest_info_t declared in a dropped line.
9944 if (srcdn
&& srcdn
->authority().first
== whoami
) {
9946 _rollback_repair_dir(mut
, srcdir
, rollback
.orig_src
, rollback
.ctime
,
9947 in
&& in
->is_dir(), pip
? pip
->accounted_rstat
: blah
);
9951 dout(0) << " srcdn back to " << *srcdn
<< dendl
;
9953 dout(0) << " srci back to " << *in
<< dendl
;
9955 dout(0) << " destdn back to " << *destdn
<< dendl
;
9957 dout(0) << " desti back to " << *target
<< dendl
;
// NOTE(review): journal the rollback so it survives a local crash.
9960 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rename_rollback", rollback
.reqid
, leader
,
9961 EPeerUpdate::OP_ROLLBACK
, EPeerUpdate::RENAME
);
9962 mdlog
->start_entry(le
);
9964 if (srcdn
&& (srcdn
->authority().first
== whoami
|| force_journal_src
)) {
9965 le
->commit
.add_dir_context(srcdir
);
9966 if (rollback
.orig_src
.ino
)
9967 le
->commit
.add_primary_dentry(srcdn
, 0, true);
9969 le
->commit
.add_remote_dentry(srcdn
, true);
9972 if (!rollback
.orig_src
.ino
&& // remote linkage
9973 in
&& in
->authority().first
== whoami
) {
9974 le
->commit
.add_dir_context(in
->get_projected_parent_dir());
9975 le
->commit
.add_primary_dentry(in
->get_projected_parent_dn(), in
, true);
9978 if (force_journal_dest
) {
9979 ceph_assert(rollback
.orig_dest
.ino
);
9980 le
->commit
.add_dir_context(destdir
);
9981 le
->commit
.add_primary_dentry(destdn
, 0, true);
9984 // peer: no need to journal straydn
9986 if (target
&& target
!= in
&& target
->authority().first
== whoami
) {
9987 ceph_assert(rollback
.orig_dest
.remote_ino
);
9988 le
->commit
.add_dir_context(target
->get_projected_parent_dir());
9989 le
->commit
.add_primary_dentry(target
->get_projected_parent_dn(), target
, true);
// NOTE(review): record renamed-dir metadata in the metablob so journal
// replay can fix up subtree bounds.
9992 if (in
&& in
->is_dir() && (srcdn
->authority().first
== whoami
|| force_journal_src
)) {
9993 dout(10) << " noting renamed dir ino " << in
->ino() << " in metablob" << dendl
;
9994 le
->commit
.renamed_dirino
= in
->ino();
9995 if (srcdn
->authority().first
== whoami
) {
9996 auto&& ls
= in
->get_dirfrags();
9997 for (const auto& dir
: ls
) {
9998 if (!dir
->is_auth())
9999 le
->commit
.renamed_dir_frags
.push_back(dir
->get_frag());
10001 dout(10) << " noting renamed dir open frags " << le
->commit
.renamed_dir_frags
<< dendl
;
10003 } else if (force_journal_dest
) {
10004 dout(10) << " noting rename target ino " << target
->ino() << " in metablob" << dendl
;
10005 le
->commit
.renamed_dirino
= target
->ino();
10008 if (target
&& target
->is_dir()) {
10009 ceph_assert(destdn
);
10010 mdcache
->project_subtree_rename(target
, straydir
, destdir
);
10013 if (in
&& in
->is_dir()) {
10014 ceph_assert(srcdn
);
10015 mdcache
->project_subtree_rename(in
, destdir
, srcdir
);
// NOTE(review): nothing was journaled at prepare time -> the metablob
// must be empty and we can apply the rollback synchronously.
10018 if (mdr
&& !mdr
->more()->peer_update_journaled
) {
10019 ceph_assert(le
->commit
.empty());
10020 mdlog
->cancel_entry(le
);
10022 _rename_rollback_finish(mut
, mdr
, srcdn
, srcdnpv
, destdn
, straydn
, splits
, finish_mdr
);
10024 ceph_assert(!le
->commit
.empty());
10026 mdr
->more()->peer_update_journaled
= false;
10027 MDSLogContextBase
*fin
= new C_MDS_LoggedRenameRollback(this, mut
, mdr
,
10028 srcdn
, srcdnpv
, destdn
, straydn
,
10029 splits
, finish_mdr
);
10030 submit_mdlog_entry(le
, fin
, mdr
, __func__
);
// NOTE(review): applies the rollback projected in do_rename_rollback():
// pops the projected linkages on stray/dest/src dentries, re-dirties
// the source dentry if we are auth, fixes subtree maps, sends any
// pending snap split/merge messages, and finishes or releases the mdr.
// Guard lines for the straydn/destdn/srcdn null checks are missing from
// this extraction — verify against upstream.
10035 void Server::_rename_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
, CDentry
*srcdn
,
10036 version_t srcdnpv
, CDentry
*destdn
, CDentry
*straydn
,
10037 map
<client_t
,ref_t
<MClientSnap
>> splits
[2], bool finish_mdr
)
10039 dout(10) << "_rename_rollback_finish " << mut
->reqid
<< dendl
;
// NOTE(review): undo the prepared stray/dest linkages, then apply the
// restored source linkage.
10042 straydn
->get_dir()->unlink_inode(straydn
);
10043 straydn
->pop_projected_linkage();
10046 destdn
->get_dir()->unlink_inode(destdn
);
10047 destdn
->pop_projected_linkage();
10050 srcdn
->pop_projected_linkage();
10051 if (srcdn
->authority().first
== mds
->get_nodeid()) {
10052 srcdn
->mark_dirty(srcdnpv
, mut
->ls
);
10053 if (srcdn
->get_linkage()->is_primary())
10054 srcdn
->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH
);
// NOTE(review): move the renamed dir's subtree back under srcdn's
// parent, and the old target's subtree back under destdn's parent.
10060 if (srcdn
&& srcdn
->get_linkage()->is_primary()) {
10061 CInode
*in
= srcdn
->get_linkage()->get_inode();
10062 if (in
&& in
->is_dir()) {
10063 ceph_assert(destdn
);
10064 mdcache
->adjust_subtree_after_rename(in
, destdn
->get_dir(), true);
10069 CInode
*oldin
= destdn
->get_linkage()->get_inode();
10070 // update subtree map?
10071 if (oldin
&& oldin
->is_dir()) {
10072 ceph_assert(straydn
);
10073 mdcache
->adjust_subtree_after_rename(oldin
, straydn
->get_dir(), true);
// NOTE(review): during resolve, trim any non-auth subtree left behind
// (`root` declaration is in a dropped line).
10077 if (mds
->is_resolve()) {
10080 root
= mdcache
->get_subtree_root(straydn
->get_dir());
10082 root
= mdcache
->get_subtree_root(destdn
->get_dir());
10084 mdcache
->try_trim_non_auth_subtree(root
);
// NOTE(review): deliver the snaprealm split/merge notifications
// prepared earlier; [1] = target inode, [0] = source inode.
10086 mdcache
->send_snaps(splits
[1]);
10087 mdcache
->send_snaps(splits
[0]);
10091 MDSContext::vec finished
;
10092 if (mdr
->more()->is_ambiguous_auth
) {
10093 if (srcdn
->is_auth())
10094 mdr
->more()->rename_inode
->unfreeze_inode(finished
);
10096 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
10097 mdr
->more()->is_ambiguous_auth
= false;
10099 mds
->queue_waiters(finished
);
10100 if (finish_mdr
|| mdr
->aborted
)
10101 mdcache
->request_finish(mdr
);
10103 mdr
->more()->peer_rolling_back
= false;
// NOTE(review): unregister the rollback so resolve can proceed.
10106 mdcache
->finish_rollback(mut
->reqid
, mdr
);
// Handle a rename-prep ack from a peer (witness) MDS: record the peer,
// note whether it witnessed/journaled or reported extra witnesses, absorb
// any exported srci state, and re-dispatch the request once all peers
// have answered.
// NOTE(review): this extract is garbled (tokens split across lines, the
// original file's line numbers fused into the text, some lines missing);
// code tokens are left byte-identical.
10111 void Server::handle_peer_rename_prep_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &ack
)
10113 dout(10) << "handle_peer_rename_prep_ack " << *mdr
10114 << " witnessed by " << ack
->get_source()
10115 << " " << *ack
<< dendl
;
10116 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
10119 mdr
->more()->peers
.insert(from
);
// If the srcdn auth froze the authpin remotely, mark auth ambiguous now.
10120 if (mdr
->more()->srcdn_auth_mds
== from
&&
10121 mdr
->more()->is_remote_frozen_authpin
&&
10122 !mdr
->more()->is_ambiguous_auth
) {
10123 mdr
->set_ambiguous_auth(mdr
->more()->rename_inode
);
10126 // witnessed? or add extra witnesses?
10127 ceph_assert(mdr
->more()->witnessed
.count(from
) == 0);
10128 if (ack
->is_interrupted()) {
10129 dout(10) << " peer request interrupted, noop" << dendl
;
10130 } else if (ack
->witnesses
.empty()) {
10131 mdr
->more()->witnessed
.insert(from
);
10132 if (!ack
->is_not_journaled())
10133 mdr
->more()->has_journaled_peers
= true;
10135 dout(10) << " extra witnesses (srcdn replicas) are " << ack
->witnesses
<< dendl
;
10136 mdr
->more()->extra_witnesses
= ack
->witnesses
;
10137 mdr
->more()->extra_witnesses
.erase(mds
->get_nodeid()); // not me!
// Absorb exported source-inode state, if the peer included it.
10141 if (ack
->inode_export
.length()) {
10142 dout(10) << " got srci import" << dendl
;
10143 mdr
->more()->inode_import
.share(ack
->inode_export
);
10144 mdr
->more()->inode_import_v
= ack
->inode_export_v
;
10147 // remove from waiting list
10148 ceph_assert(mdr
->more()->waiting_on_peer
.count(from
));
10149 mdr
->more()->waiting_on_peer
.erase(from
);
10151 if (mdr
->more()->waiting_on_peer
.empty())
10152 dispatch_client_request(mdr
); // go again!
10154 dout(10) << "still waiting on peers " << mdr
->more()->waiting_on_peer
<< dendl
;
// Handle a rename-notify ack from a peer MDS: drop it from the waiting
// set and re-dispatch the peer request once all notify acks are in.
// NOTE(review): this extract is garbled (tokens split across lines, the
// original file's line numbers fused into the text, some lines missing);
// code tokens are left byte-identical.
10157 void Server::handle_peer_rename_notify_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &ack
)
10159 dout(10) << "handle_peer_rename_notify_ack " << *mdr
<< " from mds."
10160 << ack
->get_source() << dendl
;
10161 ceph_assert(mdr
->is_peer());
10162 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
10164 if (mdr
->more()->waiting_on_peer
.count(from
)) {
10165 mdr
->more()->waiting_on_peer
.erase(from
);
// All acks received: continue processing the pending peer request.
10167 if (mdr
->more()->waiting_on_peer
.empty()) {
10168 if (mdr
->peer_request
)
10169 dispatch_peer_request(mdr
);
10171 dout(10) << " still waiting for rename notify acks from "
10172 << mdr
->more()->waiting_on_peer
<< dendl
;
// Callback run once client sessions are flushed during a peer rename:
// clear the MDS_RANK_NONE placeholder from the waiting set and, if
// nothing else is pending, resume the peer request.
// NOTE(review): this extract is garbled (tokens split across lines, the
// original file's line numbers fused into the text, some lines missing);
// code tokens are left byte-identical.
10176 void Server::_peer_rename_sessions_flushed(MDRequestRef
& mdr
)
10178 dout(10) << "_peer_rename_sessions_flushed " << *mdr
<< dendl
;
10180 if (mdr
->more()->waiting_on_peer
.count(MDS_RANK_NONE
)) {
10181 mdr
->more()->waiting_on_peer
.erase(MDS_RANK_NONE
);
10183 if (mdr
->more()->waiting_on_peer
.empty()) {
10184 if (mdr
->peer_request
)
10185 dispatch_peer_request(mdr
);
10187 dout(10) << " still waiting for rename notify acks from "
10188 << mdr
->more()->waiting_on_peer
<< dendl
;
// List the snapshots of a directory inode as a readdir-style reply:
// resolve the inode, take read locks, gather SnapInfo entries from the
// snaprealm, and encode names + leases + inodestats into the reply
// bufferlist, honoring the client's max_entries/max_bytes limits and an
// optional continuation offset in path2.
// NOTE(review): this extract is garbled (tokens split across lines, the
// original file's line numbers fused into the text, some lines missing);
// code tokens are left byte-identical.
10193 /* This function takes responsibility for the passed mdr*/
10194 void Server::handle_client_lssnap(MDRequestRef
& mdr
)
10196 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
10198 // traverse to path
10199 CInode
*diri
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
10203 if (!diri
->is_dir()) {
10204 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
10207 dout(10) << "lssnap on " << *diri
<< dendl
;
// Read locks: snap layout plus a permission check for MAY_READ.
10210 if (!mds
->locker
->try_rdlock_snap_layout(diri
, mdr
))
10213 if (!check_access(mdr
, diri
, MAY_READ
))
// Collect snapid -> SnapInfo for this realm, oldest snap onward.
10216 SnapRealm
*realm
= diri
->find_snaprealm();
10217 map
<snapid_t
,const SnapInfo
*> infomap
;
10218 realm
->get_snap_info(infomap
, diri
->get_oldest_snap());
10220 unsigned max_entries
= req
->head
.args
.readdir
.max_entries
;
10222 max_entries
= infomap
.size();
10223 int max_bytes
= req
->head
.args
.readdir
.max_bytes
;
10225 // make sure at least one item can be encoded
10226 max_bytes
= (512 << 10) + g_conf()->mds_max_xattr_pairs_size
;
// Continuation: path2 carries the last snap name already returned.
10228 __u64 last_snapid
= 0;
10229 string offset_str
= req
->get_path2();
10230 if (!offset_str
.empty())
10231 last_snapid
= realm
->resolve_snapname(offset_str
, diri
->ino());
10235 static DirStat empty
;
10236 CDir::encode_dirstat(dirbl
, mdr
->session
->info
, empty
);
10238 max_bytes
-= dirbl
.length() - sizeof(__u32
) + sizeof(__u8
) * 2;
// Encode each snapshot entry until max_entries or max_bytes is reached.
10242 auto p
= infomap
.upper_bound(last_snapid
);
10243 for (; p
!= infomap
.end() && num
< max_entries
; ++p
) {
10244 dout(10) << p
->first
<< " -> " << *p
->second
<< dendl
;
// Snaps owned by this dir use the short name, inherited ones the long name.
10248 if (p
->second
->ino
== diri
->ino())
10249 snap_name
= p
->second
->name
;
10251 snap_name
= p
->second
->get_long_name();
10253 unsigned start_len
= dnbl
.length();
10254 if (int(start_len
+ snap_name
.length() + sizeof(__u32
) + sizeof(LeaseStat
)) > max_bytes
)
10257 encode(snap_name
, dnbl
);
// Snapshots never change, so issue an infinite (valid, no-ttl) lease.
10259 LeaseStat
e(CEPH_LEASE_VALID
, -1, 0);
10260 mds
->locker
->encode_lease(dnbl
, mdr
->session
->info
, e
);
10261 dout(20) << "encode_infinite_lease" << dendl
;
10263 int r
= diri
->encode_inodestat(dnbl
, mdr
->session
, realm
, p
->first
, max_bytes
- (int)dnbl
.length());
10266 keep
.substr_of(dnbl
, 0, start_len
);
// Trailer: entry count and end/complete flags, then the entries.
10273 encode(num
, dirbl
);
10275 if (p
== infomap
.end()) {
10276 flags
= CEPH_READDIR_FRAG_END
;
10277 if (last_snapid
== 0)
10278 flags
|= CEPH_READDIR_FRAG_COMPLETE
;
10280 encode(flags
, dirbl
);
10281 dirbl
.claim_append(dnbl
);
10283 mdr
->reply_extra_bl
= dirbl
;
10284 mdr
->tracei
= diri
;
10285 respond_to_request(mdr
, 0);
// Journal-completion context for mksnap: once the EUpdate commits, call
// Server::_mksnap_finish with the captured request, directory inode, and
// snapshot info.
// NOTE(review): this extract is garbled and some member declarations
// (presumably `CInode *diri; SnapInfo info;`) are missing lines — TODO
// confirm against the full source; code tokens are left byte-identical.
10291 struct C_MDS_mksnap_finish
: public ServerLogContext
{
10294 C_MDS_mksnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, SnapInfo
&i
) :
10295 ServerLogContext(s
, r
), diri(di
), info(i
) {}
10296 void finish(int r
) override
{
10297 server
->_mksnap_finish(mdr
, diri
, info
);
// Create a snapshot on a directory: validate (snaps enabled, not a system
// dir, uid in the allowed range, name valid and unique, under the
// per-dir snap limit, not inside a subvolume), take the snap xlock,
// allocate a snapid via the snap client, project the inode and snaprealm
// changes, and journal them with an EUpdate that completes in
// C_MDS_mksnap_finish.
// NOTE(review): this extract is garbled (tokens split across lines, the
// original file's line numbers fused into the text, some lines missing);
// code tokens are left byte-identical.
10301 /* This function takes responsibility for the passed mdr*/
10302 void Server::handle_client_mksnap(MDRequestRef
& mdr
)
10304 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
10305 // make sure we have as new a map as the client
10306 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
10307 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
10310 if (!mds
->mdsmap
->allows_snaps()) {
10311 // you can't make snapshots until you set an option right now
10312 dout(5) << "new snapshots are disabled for this fs" << dendl
;
10313 respond_to_request(mdr
, -CEPHFS_EPERM
);
10317 CInode
*diri
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
10322 if (!diri
->is_dir()) {
10323 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
10326 if (diri
->is_system() && !diri
->is_root()) {
10327 // no snaps in system dirs (root is ok)
10328 dout(5) << "is an internal system dir" << dendl
;
10329 respond_to_request(mdr
, -CEPHFS_EPERM
);
10333 std::string_view snapname
= req
->get_filepath().last_dentry();
// Only uids within [mds_snap_min_uid, mds_snap_max_uid] may manage snaps.
10335 if (mdr
->client_request
->get_caller_uid() < g_conf()->mds_snap_min_uid
|| mdr
->client_request
->get_caller_uid() > g_conf()->mds_snap_max_uid
) {
10336 dout(20) << "mksnap " << snapname
<< " on " << *diri
<< " denied to uid " << mdr
->client_request
->get_caller_uid() << dendl
;
10337 respond_to_request(mdr
, -CEPHFS_EPERM
);
10341 dout(10) << "mksnap " << snapname
<< " on " << *diri
<< dendl
;
// Acquire the snap xlock (and parent snap-layout rdlock) exactly once.
10344 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
10345 MutationImpl::LockOpVec lov
;
10346 lov
.add_xlock(&diri
->snaplock
);
10347 if (!mds
->locker
->acquire_locks(mdr
, lov
))
10350 if (CDentry
*pdn
= diri
->get_projected_parent_dn(); pdn
) {
10351 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
))
10354 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
10357 if (!check_access(mdr
, diri
, MAY_WRITE
|MAY_SNAPSHOT
))
// Snapshots inside (but not at the root of) a subvolume are rejected.
10360 if (inodeno_t subvol_ino
= diri
->find_snaprealm()->get_subvolume_ino();
10361 (subvol_ino
&& subvol_ino
!= diri
->ino())) {
10362 dout(5) << "is a descendent of a subvolume dir" << dendl
;
10363 respond_to_request(mdr
, -CEPHFS_EPERM
);
10367 // check if we can create any more snapshots
10368 // we don't allow any more if we are already at or beyond the limit
10369 if (diri
->snaprealm
&&
10370 diri
->snaprealm
->get_snaps().size() >= max_snaps_per_dir
) {
10371 respond_to_request(mdr
, -CEPHFS_EMLINK
);
10375 // make sure name is unique
10376 if (diri
->snaprealm
&&
10377 diri
->snaprealm
->exists(snapname
)) {
10378 respond_to_request(mdr
, -CEPHFS_EEXIST
);
10381 if (snapname
.length() == 0 ||
10382 snapname
[0] == '_') {
10383 respond_to_request(mdr
, -CEPHFS_EINVAL
);
10387 // allocate a snapid
10388 if (!mdr
->more()->stid
) {
10390 mds
->snapclient
->prepare_create(diri
->ino(), snapname
,
10391 mdr
->get_mds_stamp(),
10392 &mdr
->more()->stid
, &mdr
->more()->snapidbl
,
10393 new C_MDS_RetryRequest(mdcache
, mdr
));
10397 version_t stid
= mdr
->more()->stid
;
10399 auto p
= mdr
->more()->snapidbl
.cbegin();
10401 dout(10) << " stid " << stid
<< " snapid " << snapid
<< dendl
;
10403 ceph_assert(mds
->snapclient
->get_cached_version() >= stid
);
// Optional client-supplied snapshot metadata; older clients sent an
// xattr bufferlist instead, which is ignored.
10405 SnapPayload payload
;
10406 if (req
->get_data().length()) {
10408 auto iter
= req
->get_data().cbegin();
10409 decode(payload
, iter
);
10410 } catch (const ceph::buffer::error
&e
) {
10411 // backward compat -- client sends xattr bufferlist. however,
10412 // that is not used anywhere -- so (log and) ignore.
10413 dout(20) << ": no metadata in payload (old client?)" << dendl
;
10419 info
.ino
= diri
->ino();
10420 info
.snapid
= snapid
;
10421 info
.name
= snapname
;
10422 info
.stamp
= mdr
->get_op_stamp();
10423 info
.metadata
= payload
.metadata
;
// Project the inode: bump ctime/rctime, count the snap in rstat.rsnaps.
10425 auto pi
= diri
->project_inode(mdr
, false, true);
10426 pi
.inode
->ctime
= info
.stamp
;
10427 if (info
.stamp
> pi
.inode
->rstat
.rctime
)
10428 pi
.inode
->rstat
.rctime
= info
.stamp
;
10429 pi
.inode
->rstat
.rsnaps
++;
10430 pi
.inode
->version
= diri
->pre_dirty();
10432 // project the snaprealm
10433 auto &newsnap
= *pi
.snapnode
;
10434 newsnap
.created
= snapid
;
10435 auto em
= newsnap
.snaps
.emplace(std::piecewise_construct
, std::forward_as_tuple(snapid
), std::forward_as_tuple(info
));
10437 em
.first
->second
= info
;
10438 newsnap
.seq
= snapid
;
10439 newsnap
.last_created
= snapid
;
10441 // journal the inode changes
10442 mdr
->ls
= mdlog
->get_current_segment();
10443 EUpdate
*le
= new EUpdate(mdlog
, "mksnap");
10444 mdlog
->start_entry(le
);
10446 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
10447 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
10448 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
10449 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
10451 // journal the snaprealm changes
10452 submit_mdlog_entry(le
, new C_MDS_mksnap_finish(this, mdr
, diri
, info
),
// Journal-commit completion for mksnap: commit the snap-table
// transaction, broadcast the realm update (CREATE, or SPLIT when the dir
// had no realm yet) to other MDSs and clients, and reply to the client
// with the new snapid in the trace.
// NOTE(review): this extract is garbled (tokens split across lines, the
// original file's line numbers fused into the text, some lines missing);
// code tokens are left byte-identical.
10457 void Server::_mksnap_finish(MDRequestRef
& mdr
, CInode
*diri
, SnapInfo
&info
)
10459 dout(10) << "_mksnap_finish " << *mdr
<< " " << info
<< dendl
;
// Decide between CREATE (realm already exists) and SPLIT (new realm).
10461 int op
= (diri
->snaprealm
? CEPH_SNAP_OP_CREATE
: CEPH_SNAP_OP_SPLIT
);
10465 mds
->snapclient
->commit(mdr
->more()->stid
, mdr
->ls
);
10468 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
10470 // notify other mds
10471 mdcache
->send_snap_update(diri
, mdr
->more()->stid
, op
);
10473 mdcache
->do_realm_invalidate_and_update_notify(diri
, op
);
10477 mdr
->snapid
= info
.snapid
;
10478 mdr
->tracei
= diri
;
10479 respond_to_request(mdr
, 0);
// Journal-completion context for rmsnap: once the EUpdate commits, call
// Server::_rmsnap_finish with the captured request, directory inode, and
// snapid.
// NOTE(review): this extract is garbled and some member declarations
// (presumably `CInode *diri; snapid_t snapid;`) are missing lines — TODO
// confirm against the full source; code tokens are left byte-identical.
10485 struct C_MDS_rmsnap_finish
: public ServerLogContext
{
10488 C_MDS_rmsnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, snapid_t sn
) :
10489 ServerLogContext(s
, r
), diri(di
), snapid(sn
) {}
10490 void finish(int r
) override
{
10491 server
->_rmsnap_finish(mdr
, diri
, snapid
);
// Remove a snapshot from a directory: validate (uid range, name exists
// and is not a parent snap), take the snap xlock, reserve a destroy
// transaction in the snap table, project the inode (rsnaps--) and
// snaprealm (erase snap, bump last_destroyed), and journal an EUpdate
// that completes in C_MDS_rmsnap_finish.
// NOTE(review): this extract is garbled (tokens split across lines, the
// original file's line numbers fused into the text, some lines missing);
// code tokens are left byte-identical.
10495 /* This function takes responsibility for the passed mdr*/
10496 void Server::handle_client_rmsnap(MDRequestRef
& mdr
)
10498 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
10500 CInode
*diri
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
10504 if (!diri
->is_dir()) {
10505 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
10509 std::string_view snapname
= req
->get_filepath().last_dentry();
// Only uids within [mds_snap_min_uid, mds_snap_max_uid] may manage snaps.
10511 if (mdr
->client_request
->get_caller_uid() < g_conf()->mds_snap_min_uid
|| mdr
->client_request
->get_caller_uid() > g_conf()->mds_snap_max_uid
) {
10512 dout(20) << "rmsnap " << snapname
<< " on " << *diri
<< " denied to uid " << mdr
->client_request
->get_caller_uid() << dendl
;
10513 respond_to_request(mdr
, -CEPHFS_EPERM
);
10517 dout(10) << "rmsnap " << snapname
<< " on " << *diri
<< dendl
;
10519 // does snap exist?
10520 if (snapname
.length() == 0 || snapname
[0] == '_') {
10521 respond_to_request(mdr
, -CEPHFS_EINVAL
); // can't prune a parent snap, currently.
10524 if (!diri
->snaprealm
|| !diri
->snaprealm
->exists(snapname
)) {
10525 respond_to_request(mdr
, -CEPHFS_ENOENT
);
10528 snapid_t snapid
= diri
->snaprealm
->resolve_snapname(snapname
, diri
->ino());
10529 dout(10) << " snapname " << snapname
<< " is " << snapid
<< dendl
;
// Acquire the snap xlock (and parent snap-layout rdlock) exactly once.
10531 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
10532 MutationImpl::LockOpVec lov
;
10533 lov
.add_xlock(&diri
->snaplock
);
10534 if (!mds
->locker
->acquire_locks(mdr
, lov
))
10536 if (CDentry
*pdn
= diri
->get_projected_parent_dn(); pdn
) {
10537 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
))
10540 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
10543 if (!check_access(mdr
, diri
, MAY_WRITE
|MAY_SNAPSHOT
))
// Reserve a snap-table destroy transaction (retries until stid is set).
10547 if (!mdr
->more()->stid
) {
10548 mds
->snapclient
->prepare_destroy(diri
->ino(), snapid
,
10549 &mdr
->more()->stid
, &mdr
->more()->snapidbl
,
10550 new C_MDS_RetryRequest(mdcache
, mdr
));
10553 version_t stid
= mdr
->more()->stid
;
10554 auto p
= mdr
->more()->snapidbl
.cbegin();
10557 dout(10) << " stid is " << stid
<< ", seq is " << seq
<< dendl
;
10559 ceph_assert(mds
->snapclient
->get_cached_version() >= stid
);
// Project the inode: bump version/ctime/rctime, decrement rstat.rsnaps.
10562 auto pi
= diri
->project_inode(mdr
, false, true);
10563 pi
.inode
->version
= diri
->pre_dirty();
10564 pi
.inode
->ctime
= mdr
->get_op_stamp();
10565 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
10566 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
10567 pi
.inode
->rstat
.rsnaps
--;
10569 mdr
->ls
= mdlog
->get_current_segment();
10570 EUpdate
*le
= new EUpdate(mdlog
, "rmsnap");
10571 mdlog
->start_entry(le
);
10573 // project the snaprealm
10574 auto &newnode
= *pi
.snapnode
;
10575 newnode
.snaps
.erase(snapid
);
10577 newnode
.last_destroyed
= seq
;
10579 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
10580 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
10581 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
10582 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
10584 submit_mdlog_entry(le
, new C_MDS_rmsnap_finish(this, mdr
, diri
, snapid
),
// Journal-commit completion for rmsnap: commit the snap-table destroy,
// broadcast CEPH_SNAP_OP_DESTROY to other MDSs and clients, reply to the
// client, then purge stale per-snapshot data for the removed snap.
// NOTE(review): this extract is garbled (tokens split across lines, the
// original file's line numbers fused into the text, some lines missing);
// code tokens are left byte-identical.
10589 void Server::_rmsnap_finish(MDRequestRef
& mdr
, CInode
*diri
, snapid_t snapid
)
10591 dout(10) << "_rmsnap_finish " << *mdr
<< " " << snapid
<< dendl
;
10592 snapid_t stid
= mdr
->more()->stid
;
10593 auto p
= mdr
->more()->snapidbl
.cbegin();
10599 mds
->snapclient
->commit(stid
, mdr
->ls
);
10601 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
10603 // notify other mds
10604 mdcache
->send_snap_update(diri
, mdr
->more()->stid
, CEPH_SNAP_OP_DESTROY
);
10606 mdcache
->do_realm_invalidate_and_update_notify(diri
, CEPH_SNAP_OP_DESTROY
);
10610 respond_to_request(mdr
, 0);
10612 // purge snapshot data
10613 diri
->purge_stale_snap_data(diri
->snaprealm
->get_snaps());
// Journal-completion context for renamesnap: once the EUpdate commits,
// call Server::_renamesnap_finish with the captured request, directory
// inode, and snapid.
// NOTE(review): this extract is garbled and some member declarations
// (presumably `CInode *diri; snapid_t snapid;`) are missing lines — TODO
// confirm against the full source; code tokens are left byte-identical.
10616 struct C_MDS_renamesnap_finish
: public ServerLogContext
{
10619 C_MDS_renamesnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, snapid_t sn
) :
10620 ServerLogContext(s
, r
), diri(di
), snapid(sn
) {}
10621 void finish(int r
) override
{
10622 server
->_renamesnap_finish(mdr
, diri
, snapid
);
// Rename a snapshot on a directory: path carries the new name, path2 the
// old one (both must reference the same inode). Validate names and uid
// range, take the snap xlock, reserve a snap-table update transaction,
// project the inode and rename the entry in the projected snaprealm,
// then journal an EUpdate completed by C_MDS_renamesnap_finish.
// NOTE(review): this extract is garbled (tokens split across lines, the
// original file's line numbers fused into the text, some lines missing);
// code tokens are left byte-identical.
10626 /* This function takes responsibility for the passed mdr*/
10627 void Server::handle_client_renamesnap(MDRequestRef
& mdr
)
10629 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
10630 if (req
->get_filepath().get_ino() != req
->get_filepath2().get_ino()) {
10631 respond_to_request(mdr
, -CEPHFS_EINVAL
);
10635 CInode
*diri
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
10639 if (!diri
->is_dir()) { // dir only
10640 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
// Only uids within [mds_snap_min_uid, mds_snap_max_uid] may manage snaps.
10644 if (mdr
->client_request
->get_caller_uid() < g_conf()->mds_snap_min_uid
||
10645 mdr
->client_request
->get_caller_uid() > g_conf()->mds_snap_max_uid
) {
10646 respond_to_request(mdr
, -CEPHFS_EPERM
);
10650 std::string_view dstname
= req
->get_filepath().last_dentry();
10651 std::string_view srcname
= req
->get_filepath2().last_dentry();
10652 dout(10) << "renamesnap " << srcname
<< "->" << dstname
<< " on " << *diri
<< dendl
;
// Validate: src must exist and not be a parent snap; dst must be a valid
// name that does not already exist.
10654 if (srcname
.length() == 0 || srcname
[0] == '_') {
10655 respond_to_request(mdr
, -CEPHFS_EINVAL
); // can't rename a parent snap.
10658 if (!diri
->snaprealm
|| !diri
->snaprealm
->exists(srcname
)) {
10659 respond_to_request(mdr
, -CEPHFS_ENOENT
);
10662 if (dstname
.length() == 0 || dstname
[0] == '_') {
10663 respond_to_request(mdr
, -CEPHFS_EINVAL
);
10666 if (diri
->snaprealm
->exists(dstname
)) {
10667 respond_to_request(mdr
, -CEPHFS_EEXIST
);
10671 snapid_t snapid
= diri
->snaprealm
->resolve_snapname(srcname
, diri
->ino());
10672 dout(10) << " snapname " << srcname
<< " is " << snapid
<< dendl
;
// Acquire the snap xlock (and parent snap-layout rdlock) exactly once.
10675 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
10676 MutationImpl::LockOpVec lov
;
10677 lov
.add_xlock(&diri
->snaplock
);
10678 if (!mds
->locker
->acquire_locks(mdr
, lov
))
10680 if (CDentry
*pdn
= diri
->get_projected_parent_dn(); pdn
) {
10681 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
))
10684 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
10687 if (!check_access(mdr
, diri
, MAY_WRITE
|MAY_SNAPSHOT
))
// Reserve a snap-table update transaction (retries until stid is set).
10691 if (!mdr
->more()->stid
) {
10692 mds
->snapclient
->prepare_update(diri
->ino(), snapid
, dstname
, utime_t(),
10693 &mdr
->more()->stid
,
10694 new C_MDS_RetryRequest(mdcache
, mdr
));
10698 version_t stid
= mdr
->more()->stid
;
10699 dout(10) << " stid is " << stid
<< dendl
;
10701 ceph_assert(mds
->snapclient
->get_cached_version() >= stid
);
// Project the inode: bump ctime/rctime and version.
10704 auto pi
= diri
->project_inode(mdr
, false, true);
10705 pi
.inode
->ctime
= mdr
->get_op_stamp();
10706 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
10707 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
10708 pi
.inode
->version
= diri
->pre_dirty();
10710 // project the snaprealm
10711 auto &newsnap
= *pi
.snapnode
;
10712 auto it
= newsnap
.snaps
.find(snapid
);
10713 ceph_assert(it
!= newsnap
.snaps
.end());
10714 it
->second
.name
= dstname
;
10716 // journal the inode changes
10717 mdr
->ls
= mdlog
->get_current_segment();
10718 EUpdate
*le
= new EUpdate(mdlog
, "renamesnap");
10719 mdlog
->start_entry(le
);
10721 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
10722 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
10723 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
10724 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
10726 // journal the snaprealm changes
10727 submit_mdlog_entry(le
, new C_MDS_renamesnap_finish(this, mdr
, diri
, snapid
),
// Journal-commit completion for renamesnap: commit the snap-table
// transaction, broadcast CEPH_SNAP_OP_UPDATE to other MDSs and clients,
// and reply to the client with the renamed snapid in the trace.
// NOTE(review): this extract is garbled (tokens split across lines, the
// original file's line numbers fused into the text, some lines missing);
// code tokens are left byte-identical.
10732 void Server::_renamesnap_finish(MDRequestRef
& mdr
, CInode
*diri
, snapid_t snapid
)
10734 dout(10) << "_renamesnap_finish " << *mdr
<< " " << snapid
<< dendl
;
10738 mds
->snapclient
->commit(mdr
->more()->stid
, mdr
->ls
);
10740 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
10742 // notify other mds
10743 mdcache
->send_snap_update(diri
, mdr
->more()->stid
, CEPH_SNAP_OP_UPDATE
);
10745 mdcache
->do_realm_invalidate_and_update_notify(diri
, CEPH_SNAP_OP_UPDATE
);
10749 mdr
->tracei
= diri
;
10750 mdr
->snapid
= snapid
;
10751 respond_to_request(mdr
, 0);
10755 * Return true if server is in state RECONNECT and this
10756 * client has not yet reconnected.
10758 bool Server::waiting_for_reconnect(client_t c
) const
10760 return client_reconnect_gather
.count(c
) > 0;
10763 void Server::dump_reconnect_status(Formatter
*f
) const
10765 f
->open_object_section("reconnect_status");
10766 f
->dump_stream("client_reconnect_gather") << client_reconnect_gather
;
10767 f
->close_section();