1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
*/
15 #include <boost/lexical_cast.hpp>
16 #include "include/ceph_assert.h" // lexical_cast includes system assert.h
18 #include <boost/config/warning_disable.hpp>
19 #include <boost/fusion/include/std_pair.hpp>
20 #include <boost/range/adaptor/reversed.hpp>
28 #include "MDBalancer.h"
30 #include "SnapClient.h"
32 #include "MetricsHandler.h"
33 #include "cephfs_features.h"
35 #include "msg/Messenger.h"
37 #include "osdc/Objecter.h"
39 #include "events/EUpdate.h"
40 #include "events/EPeerUpdate.h"
41 #include "events/ESession.h"
42 #include "events/EOpen.h"
43 #include "events/ECommitted.h"
44 #include "events/EPurged.h"
46 #include "include/stringify.h"
47 #include "include/filepath.h"
48 #include "common/errno.h"
49 #include "common/Timer.h"
50 #include "common/perf_counters.h"
51 #include "include/compat.h"
52 #include "osd/OSDMap.h"
59 #include <string_view>
62 #include "common/config.h"
64 #include "msg/Message.h"
66 #define dout_context g_ceph_context
67 #define dout_subsys ceph_subsys_mds
69 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
// ServerContext: MDSContext subclass that binds a completion callback to a
// Server instance, so finishers can reach the owning Server/MDSRank.
// NOTE(review): interior lines of this class (member declarations, method
// bodies, closing brace) are missing from this view — do not infer them.
73 class ServerContext
: public MDSContext
{
76 MDSRank
// MDSContext hook; presumably returns server->mds — body not visible here, confirm upstream.
*get_mds() override
// Constructor: store the owning Server and assert it is non-null.
82 explicit ServerContext(Server
*s
) : server(s
) {
83 ceph_assert(server
!= NULL
);
// Batch_Getattr_Lookup: a BatchOp that coalesces concurrent getattr/lookup
// requests for the same target. One request ("mdr") drives the work; the
// rest are parked in batch_reqs and answered from the head's result.
// NOTE(review): several interior lines (constructor body start, braces,
// batch_op_map registration details) are missing from this view.
87 class Batch_Getattr_Lookup
: public BatchOp
{
// The head request currently driving the batch.
90 ceph::ref_t
<MDRequestImpl
> mdr
;
// Follower requests waiting on the head's outcome.
91 std::vector
<ceph::ref_t
<MDRequestImpl
>> batch_reqs
;
94 Batch_Getattr_Lookup(Server
* s
, const ceph::ref_t
<MDRequestImpl
>& r
)
// Register this batch on the dentry (LOOKUP) or the inode (getattr), so
// later requests for the same target can find and join it.
96 if (mdr
->client_request
->get_op() == CEPH_MDS_OP_LOOKUP
)
97 mdr
->batch_op_map
= &mdr
->dn
[0].back()->batch_ops
;
99 mdr
->batch_op_map
= &mdr
->in
[0]->batch_ops
;
// Park another equivalent request behind the current head.
101 void add_request(const ceph::ref_t
<MDRequestImpl
>& r
) override
{
102 batch_reqs
.push_back(r
);
// Promote a parked request to be the new head (e.g. after the old head
// was killed); transfers batch_op_map ownership to the new head.
104 ceph::ref_t
<MDRequestImpl
> find_new_head() override
{
105 while (!batch_reqs
.empty()) {
106 auto r
= std::move(batch_reqs
.back());
107 batch_reqs
.pop_back();
111 r
->batch_op_map
= mdr
->batch_op_map
;
112 mdr
->batch_op_map
= nullptr;
// Forward the head's client request to MDS rank t and re-dispatch the
// parked followers so they get forwarded too.
118 void _forward(mds_rank_t t
) override
{
119 MDCache
* mdcache
= server
->mdcache
;
120 mdcache
->mds
->forward_message_mds(mdr
->release_client_request(), t
);
121 mdr
->set_mds_stamp(ceph_clock_now());
122 for (auto& m
: batch_reqs
) {
124 mdcache
->request_forward(m
, t
);
// Reply to every batched request with result r, copying the head's trace
// (tracei/tracedn) into each follower, then reply to the head itself.
128 void _respond(int r
) override
{
129 mdr
->set_mds_stamp(ceph_clock_now());
130 for (auto& m
: batch_reqs
) {
132 m
->tracei
= mdr
->tracei
;
133 m
->tracedn
= mdr
->tracedn
;
134 server
->respond_to_request(m
, r
);
138 server
->reply_client_request(mdr
, make_message
<MClientReply
>(*mdr
->client_request
, r
));
140 void print(std::ostream
& o
) {
141 o
<< "[batch front=" << *mdr
<< "]";
// ServerLogContext: MDSLogContextBase subclass used as a journal-commit
// completion bound to a Server (and optionally an MDRequestRef).
// NOTE(review): interior lines (members, get_mds body, closing brace) are
// missing from this view — do not infer them.
145 class ServerLogContext
: public MDSLogContextBase
{
148 MDSRank
*get_mds() override
// Called just before finish(): stamp the request's event trail to record
// that its journal entry has been committed.
154 void pre_finish(int r
) override
{
156 mdr
->mark_event("journal_committed: ");
159 explicit ServerLogContext(Server
*s
) : server(s
) {
160 ceph_assert(server
!= NULL
);
162 explicit ServerLogContext(Server
*s
, MDRequestRef
& r
) : server(s
), mdr(r
) {
163 ceph_assert(server
!= NULL
);
// create_logger: build and register the "mds_server" perf-counter set.
// Counters l_mdss_first..l_mdss_last; headline ("INTERESTING") counters
// first, then per-op latency averages at USEFUL priority, then DEBUGONLY
// dispatch counters. The built collection is stored in this->logger.
167 void Server::create_logger()
169 PerfCountersBuilder
plb(g_ceph_context
, "mds_server", l_mdss_first
, l_mdss_last
);
171 plb
.add_u64_counter(l_mdss_handle_client_request
, "handle_client_request",
172 "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING
);
173 plb
.add_u64_counter(l_mdss_handle_peer_request
, "handle_peer_request",
174 "Peer requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING
);
175 plb
.add_u64_counter(l_mdss_handle_client_session
,
176 "handle_client_session", "Client session messages", "hcs",
177 PerfCountersBuilder::PRIO_INTERESTING
);
178 plb
.add_u64_counter(l_mdss_cap_revoke_eviction
, "cap_revoke_eviction",
179 "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING
);
180 plb
.add_u64_counter(l_mdss_cap_acquisition_throttle
,
181 "cap_acquisition_throttle", "Cap acquisition throttle counter", "cat",
182 PerfCountersBuilder::PRIO_INTERESTING
);
184 // fop latencies are useful
185 plb
.set_prio_default(PerfCountersBuilder::PRIO_USEFUL
);
186 plb
.add_time_avg(l_mdss_req_lookuphash_latency
, "req_lookuphash_latency",
187 "Request type lookup hash of inode latency");
188 plb
.add_time_avg(l_mdss_req_lookupino_latency
, "req_lookupino_latency",
189 "Request type lookup inode latency");
190 plb
.add_time_avg(l_mdss_req_lookupparent_latency
, "req_lookupparent_latency",
191 "Request type lookup parent latency");
192 plb
.add_time_avg(l_mdss_req_lookupname_latency
, "req_lookupname_latency",
193 "Request type lookup name latency");
194 plb
.add_time_avg(l_mdss_req_lookup_latency
, "req_lookup_latency",
195 "Request type lookup latency");
196 plb
.add_time_avg(l_mdss_req_lookupsnap_latency
, "req_lookupsnap_latency",
197 "Request type lookup snapshot latency");
198 plb
.add_time_avg(l_mdss_req_getattr_latency
, "req_getattr_latency",
199 "Request type get attribute latency");
200 plb
.add_time_avg(l_mdss_req_setattr_latency
, "req_setattr_latency",
201 "Request type set attribute latency");
202 plb
.add_time_avg(l_mdss_req_setlayout_latency
, "req_setlayout_latency",
203 "Request type set file layout latency");
204 plb
.add_time_avg(l_mdss_req_setdirlayout_latency
, "req_setdirlayout_latency",
205 "Request type set directory layout latency");
206 plb
.add_time_avg(l_mdss_req_getvxattr_latency
, "req_getvxattr_latency",
207 "Request type get virtual extended attribute latency");
208 plb
.add_time_avg(l_mdss_req_setxattr_latency
, "req_setxattr_latency",
209 "Request type set extended attribute latency");
210 plb
.add_time_avg(l_mdss_req_rmxattr_latency
, "req_rmxattr_latency",
211 "Request type remove extended attribute latency");
212 plb
.add_time_avg(l_mdss_req_readdir_latency
, "req_readdir_latency",
213 "Request type read directory latency");
214 plb
.add_time_avg(l_mdss_req_setfilelock_latency
, "req_setfilelock_latency",
215 "Request type set file lock latency");
216 plb
.add_time_avg(l_mdss_req_getfilelock_latency
, "req_getfilelock_latency",
217 "Request type get file lock latency");
218 plb
.add_time_avg(l_mdss_req_create_latency
, "req_create_latency",
219 "Request type create latency");
220 plb
.add_time_avg(l_mdss_req_open_latency
, "req_open_latency",
221 "Request type open latency");
222 plb
.add_time_avg(l_mdss_req_mknod_latency
, "req_mknod_latency",
223 "Request type make node latency");
224 plb
.add_time_avg(l_mdss_req_link_latency
, "req_link_latency",
225 "Request type link latency");
226 plb
.add_time_avg(l_mdss_req_unlink_latency
, "req_unlink_latency",
227 "Request type unlink latency");
228 plb
.add_time_avg(l_mdss_req_rmdir_latency
, "req_rmdir_latency",
229 "Request type remove directory latency");
230 plb
.add_time_avg(l_mdss_req_rename_latency
, "req_rename_latency",
231 "Request type rename latency");
232 plb
.add_time_avg(l_mdss_req_mkdir_latency
, "req_mkdir_latency",
233 "Request type make directory latency");
234 plb
.add_time_avg(l_mdss_req_symlink_latency
, "req_symlink_latency",
235 "Request type symbolic link latency");
236 plb
.add_time_avg(l_mdss_req_lssnap_latency
, "req_lssnap_latency",
237 "Request type list snapshot latency");
238 plb
.add_time_avg(l_mdss_req_mksnap_latency
, "req_mksnap_latency",
239 "Request type make snapshot latency");
240 plb
.add_time_avg(l_mdss_req_rmsnap_latency
, "req_rmsnap_latency",
241 "Request type remove snapshot latency");
242 plb
.add_time_avg(l_mdss_req_renamesnap_latency
, "req_renamesnap_latency",
243 "Request type rename snapshot latency");
// Dispatch counts are only of interest when debugging.
245 plb
.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY
);
246 plb
.add_u64_counter(l_mdss_dispatch_client_request
, "dispatch_client_request",
247 "Client requests dispatched");
248 plb
.add_u64_counter(l_mdss_dispatch_peer_request
, "dispatch_server_request",
249 "Server requests dispatched");
// Materialize the counters and register them with the global collection.
251 logger
= plb
.create_perf_counters();
252 g_ceph_context
->get_perfcounters_collection()->add(logger
);
// Server constructor: bind to the owning MDSRank and MetricsHandler, then
// snapshot all mds_* tunables from the config into member fields.
// NOTE(review): the initializer that sets the 'mds' member from 'm'
// (original line 256) is missing from this view; mdcache/mdlog are
// initialized from 'mds', so it must precede them — confirm upstream.
255 Server::Server(MDSRank
*m
, MetricsHandler
*metrics_handler
) :
257 mdcache(mds
->mdcache
), mdlog(mds
->mdlog
),
258 inject_rename_corrupt_dentry_first(g_conf().get_val
<double>("mds_inject_rename_corrupt_dentry_first")),
259 recall_throttle(g_conf().get_val
<double>("mds_recall_max_decay_rate")),
260 metrics_handler(metrics_handler
)
// Cache config values; these mirror the mds_* options of the same names.
262 forward_all_requests_to_auth
= g_conf().get_val
<bool>("mds_forward_all_requests_to_auth");
263 replay_unsafe_with_closed_session
= g_conf().get_val
<bool>("mds_replay_unsafe_with_closed_session");
264 cap_revoke_eviction_timeout
= g_conf().get_val
<double>("mds_cap_revoke_eviction_timeout");
265 max_snaps_per_dir
= g_conf().get_val
<uint64_t>("mds_max_snaps_per_dir");
266 delegate_inos_pct
= g_conf().get_val
<uint64_t>("mds_client_delegate_inos_pct");
267 max_caps_per_client
= g_conf().get_val
<uint64_t>("mds_max_caps_per_client");
268 cap_acquisition_throttle
= g_conf().get_val
<uint64_t>("mds_session_cap_acquisition_throttle");
269 max_caps_throttle_ratio
= g_conf().get_val
<double>("mds_session_max_caps_throttle_ratio");
270 caps_throttle_retry_request_timeout
= g_conf().get_val
<double>("mds_cap_acquisition_throttle_retry_request_timeout");
271 dir_max_entries
= g_conf().get_val
<uint64_t>("mds_dir_max_entries");
272 bal_fragment_size_max
= g_conf().get_val
<int64_t>("mds_bal_fragment_size_max");
// Feature/metric sets this MDS advertises to clients.
273 supported_features
= feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED
);
274 supported_metric_spec
= feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL
);
// dispatch: entry point for all client/peer messages handled by Server.
// Handles reconnect-phase special cases (queueing replayed ops, dropping
// requests from closed sessions), defers requests until the MDS is active
// when required, then routes by message type to the per-type handlers.
// NOTE(review): several interior lines (braces, 'return' statements,
// 'break's between cases) are missing from this view.
277 void Server::dispatch(const cref_t
<Message
> &m
)
279 switch (m
->get_type()) {
280 case CEPH_MSG_CLIENT_RECONNECT
:
281 handle_client_reconnect(ref_cast
<MClientReconnect
>(m
));
286 *In reconnect phase, client sent unsafe requests to mds before reconnect msg. Setting sessionclosed_isok will handle scenario like this:
288 1. In reconnect phase, client sent unsafe requests to mds.
289 2. It reached reconnect timeout. All sessions without sending reconnect msg in time, some of which may have sent unsafe requests, are marked as closed.
290 (Another situation is #31668, which will deny all client reconnect msg to speed up reboot).
291 3. So these unsafe requests from sessions without sending reconnect msg in time or being denied could be handled in clientreplay phase.
294 bool sessionclosed_isok
= replay_unsafe_with_closed_session
;
296 // handle_peer_request()/handle_client_session() will wait if necessary
297 if (m
->get_type() == CEPH_MSG_CLIENT_REQUEST
&& !mds
->is_active()) {
298 const auto &req
= ref_cast
<MClientRequest
>(m
);
299 if (mds
->is_reconnect() || mds
->get_want_state() == CEPH_MDS_STATE_RECONNECT
) {
300 Session
*session
= mds
->get_session(req
);
// Drop requests from closed sessions unless the config explicitly allows
// replaying unsafe requests with a closed session (see comment above).
301 if (!session
|| (!session
->is_open() && !sessionclosed_isok
)) {
302 dout(5) << "session is closed, dropping " << req
->get_reqid() << dendl
;
305 bool queue_replay
= false;
306 if (req
->is_replay() || req
->is_async()) {
307 dout(3) << "queuing replayed op" << dendl
;
// Track inos that replay will allocate so the inotable stays consistent.
310 !session
->have_completed_request(req
->get_reqid().tid
, nullptr)) {
311 inodeno_t
ino(req
->head
.ino
);
312 mdcache
->add_replay_ino_alloc(ino
);
313 if (replay_unsafe_with_closed_session
&&
314 session
->free_prealloc_inos
.contains(ino
)) {
315 // don't purge inodes that will be created by later replay
316 session
->free_prealloc_inos
.erase(ino
);
317 session
->delegated_inos
.insert(ino
);
320 } else if (req
->get_retry_attempt()) {
321 // process completed request in clientreplay stage. The completed request
322 // might have created new file/directorie. This guarantees MDS sends a reply
323 // to client before other request modifies the new file/directorie.
324 if (session
->have_completed_request(req
->get_reqid().tid
, NULL
)) {
325 dout(3) << "queuing completed op" << dendl
;
328 // this request was created before the cap reconnect message, drop any embedded
330 req
->releases
.clear();
333 req
->mark_queued_for_replay();
334 mds
->enqueue_replay(new C_MDS_RetryMessage(mds
, m
));
// Decide whether this request must wait for the MDS to become active.
// Replay-queued requests may proceed during clientreplay; nothing waits
// while the MDS is stopping.
339 bool wait_for_active
= true;
340 if (mds
->is_stopping()) {
341 wait_for_active
= false;
342 } else if (mds
->is_clientreplay()) {
343 if (req
->is_queued_for_replay()) {
344 wait_for_active
= false;
347 if (wait_for_active
) {
348 dout(3) << "not active yet, waiting" << dendl
;
349 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
// Route by message type; unknown types are a fatal protocol error.
354 switch (m
->get_type()) {
355 case CEPH_MSG_CLIENT_SESSION
:
356 handle_client_session(ref_cast
<MClientSession
>(m
));
358 case CEPH_MSG_CLIENT_REQUEST
:
359 handle_client_request(ref_cast
<MClientRequest
>(m
));
361 case CEPH_MSG_CLIENT_RECLAIM
:
362 handle_client_reclaim(ref_cast
<MClientReclaim
>(m
));
364 case MSG_MDS_PEER_REQUEST
:
365 handle_peer_request(ref_cast
<MMDSPeerRequest
>(m
));
368 derr
<< "Server unknown message " << m
->get_type() << " from peer type " << m
->get_connection()->get_peer_type() << dendl
;
369 ceph_abort_msg("server unknown message " + to_string(m
->get_type()) + " from peer type " + to_string(m
->get_connection()->get_peer_type()));
375 // ----------------------------------------------------------
376 // SESSION management
// C_MDS_session_finish: journal-commit completion for session open/close.
// Carries the projected SessionMap version (cmapv), optional inos to free
// and purge plus the inotable version (inotablev), and hands everything to
// Server::_session_logged() once the ESession entry is durable.
// NOTE(review): member declarations for session/state_seq/open/cmapv/
// inotablev/fin and the closing brace are missing from this view.
378 class C_MDS_session_finish
: public ServerLogContext
{
383 interval_set
<inodeno_t
> inos_to_free
;
385 interval_set
<inodeno_t
> inos_to_purge
;
386 LogSegment
*ls
= nullptr;
// Simple form: no inode bookkeeping (inotablev fixed at 0).
389 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
, Context
*fin_
= nullptr) :
390 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
), inotablev(0), fin(fin_
) { }
// Full form: also release/purge preallocated inos against inotable v iv,
// within log segment _ls.
391 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
,
392 const interval_set
<inodeno_t
>& to_free
, version_t iv
,
393 const interval_set
<inodeno_t
>& to_purge
, LogSegment
*_ls
, Context
*fin_
= nullptr) :
394 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
),
395 inos_to_free(to_free
), inotablev(iv
), inos_to_purge(to_purge
), ls(_ls
), fin(fin_
) {}
396 void finish(int r
) override
{
398 server
->_session_logged(session
, state_seq
, open
, cmapv
, inos_to_free
, inotablev
, inos_to_purge
, ls
);
// find_session_by_uuid: scan the session map for a session whose client
// metadata "uuid" entry equals the given uuid; cross-checks reclaiming_from
// links when multiple matches exist. Returns nullptr when none match.
// NOTE(review): the lines assigning 'session' on a match and the final
// return are missing from this view.
405 Session
* Server::find_session_by_uuid(std::string_view uuid
)
407 Session
* session
= nullptr;
408 for (auto& it
: mds
->sessionmap
.get_sessions()) {
409 auto& metadata
= it
.second
->info
.client_metadata
;
// Skip sessions with no uuid or a different uuid.
411 auto p
= metadata
.find("uuid");
412 if (p
== metadata
.end() || p
->second
!= uuid
)
// Sanity-check the pairing between duplicate matches and the session
// each is reclaiming from.
417 } else if (!session
->reclaiming_from
) {
418 ceph_assert(it
.second
->reclaiming_from
== session
);
421 ceph_assert(session
->reclaiming_from
== it
.second
);
// reclaim_session: handle an MClientReclaim from 'session' wanting to take
// over the state of an older session identified by uuid. Validates the
// request (open/stale session, non-empty uuid, supported flags, matching
// auth_name), links session->reclaiming_from to the target, and for
// CEPH_RECLAIM_RESET proceeds straight to finish_reclaim_session().
// NOTE(review): early 'return's after the error replies and the handling
// when no target session is found are missing from this view.
427 void Server::reclaim_session(Session
*session
, const cref_t
<MClientReclaim
> &m
)
429 if (!session
->is_open() && !session
->is_stale()) {
430 dout(10) << "session not open, dropping this req" << dendl
;
434 auto reply
= make_message
<MClientReclaimReply
>(0);
435 if (m
->get_uuid().empty()) {
436 dout(10) << __func__
<< " invalid message (no uuid)" << dendl
;
437 reply
->set_result(-CEPHFS_EINVAL
);
438 mds
->send_message_client(reply
, session
);
442 unsigned flags
= m
->get_flags();
443 if (flags
!= CEPH_RECLAIM_RESET
) { // currently only support reset
444 dout(10) << __func__
<< " unsupported flags" << dendl
;
445 reply
->set_result(-CEPHFS_EINVAL
);
446 mds
->send_message_client(reply
, session
);
450 Session
* target
= find_session_by_uuid(m
->get_uuid());
// A client may only reclaim a session created under the same auth identity.
452 if (session
->info
.auth_name
!= target
->info
.auth_name
) {
453 dout(10) << __func__
<< " session auth_name " << session
->info
.auth_name
454 << " != target auth_name " << target
->info
.auth_name
<< dendl
;
455 reply
->set_result(-CEPHFS_EPERM
);
456 mds
->send_message_client(reply
, session
);
459 ceph_assert(!target
->reclaiming_from
);
460 ceph_assert(!session
->reclaiming_from
);
461 session
->reclaiming_from
= target
;
// Tell the client which addrs the reclaimed session was using.
462 reply
->set_addrs(entity_addrvec_t(target
->info
.inst
.addr
));
465 if (flags
& CEPH_RECLAIM_RESET
) {
466 finish_reclaim_session(session
, reply
);
467 } else ceph_assert(0); /* no other flags are handled at this time */
// finish_reclaim_session: complete a reclaim by tearing down the old
// ("target") session. Builds a send_reply continuation that, once the
// target is gone, re-looks-up the reclaiming session by id (it may have
// been freed meanwhile) and sends the reply stamped with the current
// osdmap epoch. The target is removed via kill_session() when it is
// already blocklisted (or blocklist-on-evict is off), otherwise via
// evict_client() which blocklists it first.
// NOTE(review): the declaration of 'send_reply' (original line ~477) and
// several braces/returns are missing from this view.
470 void Server::finish_reclaim_session(Session
*session
, const ref_t
<MClientReclaimReply
> &reply
)
472 Session
*target
= session
->reclaiming_from
;
474 session
->reclaiming_from
= nullptr;
// Capture the client id, not the Session*, since the session object can
// be destroyed before the continuation runs.
478 int64_t session_id
= session
->get_client().v
;
479 send_reply
= new LambdaContext([this, session_id
, reply
](int r
) {
480 ceph_assert(ceph_mutex_is_locked_by_me(mds
->mds_lock
));
481 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(session_id
));
485 auto epoch
= mds
->objecter
->with_osdmap([](const OSDMap
&map
){ return map
.get_epoch(); });
486 reply
->set_epoch(epoch
);
487 mds
->send_message_client(reply
, session
);
490 send_reply
= nullptr;
493 bool blocklisted
= mds
->objecter
->with_osdmap([target
](const OSDMap
&map
) {
494 return map
.is_blocklisted(target
->info
.inst
.addr
);
497 if (blocklisted
|| !g_conf()->mds_session_blocklist_on_evict
) {
498 kill_session(target
, send_reply
);
500 CachedStackStringStream css
;
501 mds
->evict_client(target
->get_client().v
, false, true, *css
, send_reply
);
// No-target path: reply immediately to the reclaiming session.
504 mds
->send_message_client(reply
, session
);
// handle_client_reclaim: entry point for MClientReclaim. Validates the
// sender (sessionless msgs ignored, fs_name capability required), defers
// until clientreplay, then either finishes a reclaim (FLAG_FINISH, which
// must not be combined with other flags) or starts one.
// NOTE(review): early 'return's after drops and some braces are missing
// from this view.
508 void Server::handle_client_reclaim(const cref_t
<MClientReclaim
> &m
)
510 Session
*session
= mds
->get_session(m
);
511 uint32_t flags
= m
->get_flags();
512 dout(3) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
513 ceph_assert(m
->is_a_client()); // should _not_ come from an mds!
516 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
520 std::string_view fs_name
= mds
->mdsmap
->get_fs_name();
521 if (!fs_name
.empty() && !session
->fs_name_capable(fs_name
, MAY_READ
)) {
522 dout(0) << " dropping message not allowed for this fs_name: " << *m
<< dendl
;
526 if (mds
->get_state() < MDSMap::STATE_CLIENTREPLAY
) {
527 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
531 if (flags
& MClientReclaim::FLAG_FINISH
) {
// FLAG_FINISH must be the only flag set; XOR is non-zero iff other bits
// accompany it.
532 if (flags
^ MClientReclaim::FLAG_FINISH
) {
533 dout(0) << __func__
<< " client specified FLAG_FINISH with other flags."
534 " Other flags:" << flags
<< dendl
;
535 auto reply
= make_message
<MClientReclaimReply
>(0);
536 reply
->set_result(-CEPHFS_EINVAL
);
537 mds
->send_message_client(reply
, session
);
540 finish_reclaim_session(session
);
542 reclaim_session(session
, m
);
// handle_client_session: entry point for MClientSession messages. After
// validating the sender and fs_name capability and deferring until the
// MDS is far enough along, switches on the op:
//  - REQUEST_OPEN: validate (refuse flag, blocklist, required features,
//    claimed root, uuid uniqueness), then journal an ESession open via
//    C_MDS_session_finish.
//  - REQUEST_RENEWCAPS: refresh the session, reviving stale caps.
//  - REQUEST_CLOSE: verify push seq and journal the close.
//  - FLUSHMSG_ACK / REQUEST_FLUSH_MDLOG: flush bookkeeping.
//  - default: reject, log, and evict the client.
// NOTE(review): many interior lines (braces, 'return'/'break' statements,
// declarations of pv/sseq) are missing from this view.
546 void Server::handle_client_session(const cref_t
<MClientSession
> &m
)
549 Session
*session
= mds
->get_session(m
);
551 dout(3) << "handle_client_session " << *m
<< " from " << m
->get_source() << dendl
;
552 ceph_assert(m
->is_a_client()); // should _not_ come from an mds!
555 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
556 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
557 reply
->metadata
["error_string"] = "sessionless";
558 mds
->send_message(reply
, m
->get_connection());
// Reject clients whose caps do not cover this filesystem's name.
562 std::string_view fs_name
= mds
->mdsmap
->get_fs_name();
563 if (!fs_name
.empty() && !session
->fs_name_capable(fs_name
, MAY_READ
)) {
564 dout(0) << " dropping message not allowed for this fs_name: " << *m
<< dendl
;
565 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
566 reply
->metadata
["error_string"] = "client doesn't have caps for FS \"" +
567 std::string(fs_name
) + "\"";
568 mds
->send_message(std::move(reply
), m
->get_connection());
// Gate by op: renewcaps is always handled; close waits for active;
// everything else waits for clientreplay.
572 if (m
->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS
) {
573 // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
574 } else if (m
->get_op() == CEPH_SESSION_REQUEST_CLOSE
) {
575 // close requests need to be handled when mds is active
576 if (mds
->get_state() < MDSMap::STATE_ACTIVE
) {
577 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
581 if (mds
->get_state() < MDSMap::STATE_CLIENTREPLAY
) {
582 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
588 logger
->inc(l_mdss_handle_client_session
);
591 switch (m
->get_op()) {
592 case CEPH_SESSION_REQUEST_OPEN
:
// Operator-controlled refusal of all new sessions.
593 if(mds
->mdsmap
->test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION
)) {
594 dout(0) << "new sessions are not permitted, enable again via"
595 "`ceph fs set <fs_name> refuse_client_session false`" << dendl
;
596 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
597 reply
->metadata
["error_string"] = "new sessions are not permitted,"
598 " enable again via `ceph fs set"
599 " <fs_name> refuse_client_session false`";
600 mds
->send_message(reply
, m
->get_connection());
// Duplicate/ongoing open: re-send OPEN to feature-capable clients that
// already have an open session; otherwise drop.
603 if (session
->is_opening() ||
604 session
->is_open() ||
605 session
->is_stale() ||
606 session
->is_killing() ||
607 terminating_sessions
) {
608 if (m
->supported_features
.test(CEPHFS_FEATURE_NOTIFY_SESSION_STATE
)) {
609 if (session
->is_open() && !mds
->is_stopping()) {
610 dout(10) << "currently already opened" << dendl
;
612 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_OPEN
,
613 session
->get_push_seq());
614 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
))
615 reply
->supported_features
= supported_features
;
616 mds
->send_message_client(reply
, session
);
617 if (mdcache
->is_readonly()) {
618 auto m
= make_message
<MClientSession
>(CEPH_SESSION_FORCE_RO
);
619 mds
->send_message_client(m
, session
);
623 dout(10) << "currently " << session
->get_state_name()
624 << ", dropping this req" << dendl
;
627 ceph_assert(session
->is_closed() || session
->is_closing());
629 if (mds
->is_stopping()) {
630 dout(10) << "mds is stopping, dropping open req" << dendl
;
635 auto& addr
= session
->info
.inst
.addr
;
636 session
->set_client_metadata(client_metadata_t(m
->metadata
, m
->supported_features
, m
->metric_spec
));
637 auto& client_metadata
= session
->info
.client_metadata
;
// Helper: log a one-line summary of the session attempt (timings,
// status, optional error and claimed root) at debug level 2.
639 auto log_session_status
= [this, m
, session
](std::string_view status
, std::string_view err
) {
640 auto now
= ceph_clock_now();
641 auto throttle_elapsed
= m
->get_recv_complete_stamp() - m
->get_throttle_stamp();
642 auto elapsed
= now
- m
->get_recv_stamp();
643 CachedStackStringStream css
;
644 *css
<< "New client session:"
645 << " addr=\"" << session
->info
.inst
.addr
<< "\""
646 << ",elapsed=" << elapsed
647 << ",throttled=" << throttle_elapsed
648 << ",status=\"" << status
<< "\"";
650 *css
<< ",error=\"" << err
<< "\"";
652 const auto& metadata
= session
->info
.client_metadata
;
653 if (auto it
= metadata
.find("root"); it
!= metadata
.end()) {
654 *css
<< ",root=\"" << it
->second
<< "\"";
656 dout(2) << css
->strv() << dendl
;
// Helper: send a REJECT (with error_string for mimic+ clients) and log it.
659 auto send_reject_message
= [this, &session
, &log_session_status
](std::string_view err_str
, unsigned flags
=0) {
660 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
, 0, flags
);
661 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
))
662 m
->metadata
["error_string"] = err_str
;
663 mds
->send_message_client(m
, session
);
664 log_session_status("REJECTED", err_str
);
667 bool blocklisted
= mds
->objecter
->with_osdmap(
668 [&addr
](const OSDMap
&osd_map
) -> bool {
669 return osd_map
.is_blocklisted(addr
);
673 dout(10) << "rejecting blocklisted client " << addr
<< dendl
;
674 // This goes on the wire and the "blacklisted" substring is
675 // depended upon by the kernel client for detecting whether it
676 // has been blocklisted. If mounted with recover_session=clean
677 // (since 5.4), it tries to automatically recover itself from
680 flags
|= MClientSession::SESSION_BLOCKLISTED
;
681 send_reject_message("blocklisted (blacklisted)", flags
);
686 if (client_metadata
.features
.empty())
687 infer_supported_features(session
, client_metadata
);
689 dout(20) << __func__
<< " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl
;
690 dout(20) << " features: '" << client_metadata
.features
<< "'" << dendl
;
691 dout(20) << " metric specification: [" << client_metadata
.metric_spec
<< "]" << dendl
;
692 for (const auto& p
: client_metadata
) {
693 dout(20) << " " << p
.first
<< ": " << p
.second
<< dendl
;
// Reject clients missing any feature this MDS requires.
696 feature_bitset_t missing_features
= required_client_features
;
697 missing_features
-= client_metadata
.features
;
698 if (!missing_features
.empty()) {
699 CachedStackStringStream css
;
700 *css
<< "missing required features '" << missing_features
<< "'";
701 send_reject_message(css
->strv());
702 mds
->clog
->warn() << "client session (" << session
->info
.inst
703 << ") lacks required features " << missing_features
704 << "; client supports " << client_metadata
.features
;
709 // Special case for the 'root' metadata path; validate that the claimed
710 // root is actually within the caps of the session
711 if (auto it
= client_metadata
.find("root"); it
!= client_metadata
.end()) {
712 auto claimed_root
= it
->second
;
713 CachedStackStringStream css
;
715 // claimed_root has a leading "/" which we strip before passing
// NOTE(review): "invalue" below is a long-standing typo for "invalid" in
// a wire-visible string; left unchanged here since it is runtime output.
717 if (claimed_root
.empty() || claimed_root
[0] != '/') {
719 *css
<< "invalue root '" << claimed_root
<< "'";
720 } else if (!session
->auth_caps
.path_capable(claimed_root
.substr(1))) {
722 *css
<< "non-allowable root '" << claimed_root
<< "'";
726 // Tell the client we're rejecting their open
727 send_reject_message(css
->strv());
728 mds
->clog
->warn() << "client session with " << css
->strv()
729 << " denied (" << session
->info
.inst
<< ")";
// A client-supplied uuid must be unique across live sessions.
735 if (auto it
= client_metadata
.find("uuid"); it
!= client_metadata
.end()) {
736 if (find_session_by_uuid(it
->second
)) {
737 send_reject_message("duplicated session uuid");
738 mds
->clog
->warn() << "client session with duplicated session uuid '"
739 << it
->second
<< "' denied (" << session
->info
.inst
<< ")";
745 if (session
->is_closed()) {
746 mds
->sessionmap
.add_session(session
);
// Project the SessionMap change, mark OPENING, and journal the open;
// C_MDS_session_finish completes the transition once durable.
749 pv
= mds
->sessionmap
.mark_projected(session
);
750 sseq
= mds
->sessionmap
.set_state(session
, Session::STATE_OPENING
);
751 mds
->sessionmap
.touch_session(session
);
752 auto fin
= new LambdaContext([log_session_status
= std::move(log_session_status
)](int r
){
754 log_session_status("ACCEPTED", "");
756 mdlog
->start_submit_entry(new ESession(m
->get_source_inst(), true, pv
, client_metadata
),
757 new C_MDS_session_finish(this, session
, sseq
, true, pv
, fin
));
762 case CEPH_SESSION_REQUEST_RENEWCAPS
:
763 if (session
->is_open() || session
->is_stale()) {
764 mds
->sessionmap
.touch_session(session
);
// A stale session coming back: reopen it and revive its stale caps.
765 if (session
->is_stale()) {
766 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
767 mds
->locker
->resume_stale_caps(session
);
768 mds
->sessionmap
.touch_session(session
);
770 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_RENEWCAPS
, m
->get_seq());
771 mds
->send_message_client(reply
, session
);
773 dout(10) << "ignoring renewcaps on non open|stale session (" << session
->get_state_name() << ")" << dendl
;
777 case CEPH_SESSION_REQUEST_CLOSE
:
779 if (session
->is_closed() ||
780 session
->is_closing() ||
781 session
->is_killing()) {
782 dout(10) << "already closed|closing|killing, dropping this req" << dendl
;
785 if (session
->is_importing()) {
786 dout(10) << "ignoring close req on importing session" << dendl
;
789 ceph_assert(session
->is_open() ||
790 session
->is_stale() ||
791 session
->is_opening());
// Push-seq handshake: a close with an old seq is stale and dropped; a
// mismatched (higher) seq is a client bug worth flagging, but the close
// still proceeds.
792 if (m
->get_seq() < session
->get_push_seq()) {
793 dout(10) << "old push seq " << m
->get_seq() << " < " << session
->get_push_seq()
794 << ", dropping" << dendl
;
797 // We are getting a seq that is higher than expected.
798 // Handle the same as any other seqn error.
800 if (m
->get_seq() != session
->get_push_seq()) {
801 dout(0) << "old push seq " << m
->get_seq() << " != " << session
->get_push_seq()
802 << ", BUGGY!" << dendl
;
803 mds
->clog
->warn() << "incorrect push seq " << m
->get_seq() << " != "
804 << session
->get_push_seq() << ", dropping" << " from client : " << session
->get_human_name();
807 journal_close_session(session
, Session::STATE_CLOSING
, NULL
);
811 case CEPH_SESSION_FLUSHMSG_ACK
:
812 finish_flush_session(session
, m
->get_seq());
815 case CEPH_SESSION_REQUEST_FLUSH_MDLOG
:
816 if (mds
->is_active())
// Unknown session op: reject, then evict and blocklist the client.
821 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
822 mds
->send_message_client(m
, session
);
823 derr
<< "Server received unknown message " << m
->get_type() << ", closing session and blocklisting the client " << session
->get_client() << dendl
;
824 CachedStackStringStream css
;
825 mds
->evict_client(session
->get_client().v
, false, true, *css
, nullptr);
// flush_session: ask one client to ack all messages sent so far. Skips
// sessions that are not open, have no connection, or whose peer lacks
// CEPH_FEATURE_EXPORT_PEER; otherwise registers a gather sub-completion
// that fires when the client acks (FLUSHMSG_ACK) and sends FLUSHMSG.
829 void Server::flush_session(Session
*session
, MDSGatherBuilder
& gather
) {
830 if (!session
->is_open() ||
831 !session
->get_connection() ||
832 !session
->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER
)) {
836 version_t seq
= session
->wait_for_flush(gather
.new_sub());
837 mds
->send_message_client(
838 make_message
<MClientSession
>(CEPH_SESSION_FLUSHMSG
, seq
), session
);
// flush_client_sessions: flush_session() for every client in client_set,
// accumulating the acks into one MDSGatherBuilder. Every client id must
// resolve to a live session.
841 void Server::flush_client_sessions(set
<client_t
>& client_set
, MDSGatherBuilder
& gather
)
843 for (const auto& client
: client_set
) {
844 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(client
.v
));
845 ceph_assert(session
);
846 flush_session(session
, gather
);
// finish_flush_session: called on FLUSHMSG_ACK; completes all waiters
// registered up to 'seq' on this session and queues them on the MDS.
850 void Server::finish_flush_session(Session
*session
, version_t seq
)
852 MDSContext::vec finished
;
853 session
->finish_flush(seq
, finished
);
854 mds
->queue_waiters(finished
);
857 void Server::_session_logged(Session
*session
, uint64_t state_seq
, bool open
, version_t pv
,
858 const interval_set
<inodeno_t
>& inos_to_free
, version_t piv
,
859 const interval_set
<inodeno_t
>& inos_to_purge
, LogSegment
*ls
)
861 dout(10) << "_session_logged " << session
->info
.inst
862 << " state_seq " << state_seq
863 << " " << (open
? "open":"close") << " " << pv
864 << " inos_to_free " << inos_to_free
<< " inotablev " << piv
865 << " inos_to_purge " << inos_to_purge
<< dendl
;
868 if (inos_to_purge
.size()){
870 session
->info
.prealloc_inos
.subtract(inos_to_purge
);
871 ls
->purging_inodes
.insert(inos_to_purge
);
872 if (mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping())
873 mdcache
->purge_inodes(inos_to_purge
, ls
);
876 if (inos_to_free
.size()) {
878 ceph_assert(session
->is_closing() || session
->is_killing() ||
879 session
->is_opening()); // re-open closing session
880 session
->info
.prealloc_inos
.subtract(inos_to_free
);
881 mds
->inotable
->apply_release_ids(inos_to_free
);
882 ceph_assert(mds
->inotable
->get_version() == piv
);
884 session
->free_prealloc_inos
= session
->info
.prealloc_inos
;
885 session
->delegated_inos
.clear();
888 mds
->sessionmap
.mark_dirty(session
);
891 if (session
->get_state_seq() != state_seq
) {
892 dout(10) << " journaled state_seq " << state_seq
<< " != current " << session
->get_state_seq()
893 << ", noop" << dendl
;
894 // close must have been canceled (by an import?), or any number of other things..
896 ceph_assert(session
->is_opening());
897 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
898 mds
->sessionmap
.touch_session(session
);
899 metrics_handler
->add_session(session
);
900 ceph_assert(session
->get_connection());
901 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_OPEN
);
902 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
)) {
903 reply
->supported_features
= supported_features
;
904 reply
->metric_spec
= supported_metric_spec
;
906 mds
->send_message_client(reply
, session
);
907 if (mdcache
->is_readonly()) {
908 auto m
= make_message
<MClientSession
>(CEPH_SESSION_FORCE_RO
);
909 mds
->send_message_client(m
, session
);
911 } else if (session
->is_closing() ||
912 session
->is_killing()) {
913 // kill any lingering capabilities, leases, requests
914 bool killing
= session
->is_killing();
915 while (!session
->caps
.empty()) {
916 Capability
*cap
= session
->caps
.front();
917 CInode
*in
= cap
->get_inode();
918 dout(20) << " killing capability " << ccap_string(cap
->issued()) << " on " << *in
<< dendl
;
919 mds
->locker
->remove_client_cap(in
, cap
, killing
);
921 while (!session
->leases
.empty()) {
922 ClientLease
*r
= session
->leases
.front();
923 CDentry
*dn
= static_cast<CDentry
*>(r
->parent
);
924 dout(20) << " killing client lease of " << *dn
<< dendl
;
925 dn
->remove_client_lease(r
, mds
->locker
);
927 if (client_reconnect_gather
.erase(session
->info
.get_client())) {
928 dout(20) << " removing client from reconnect set" << dendl
;
929 if (client_reconnect_gather
.empty()) {
930 dout(7) << " client " << session
->info
.inst
<< " was last reconnect, finishing" << dendl
;
931 reconnect_gather_finish();
934 if (client_reclaim_gather
.erase(session
->info
.get_client())) {
935 dout(20) << " removing client from reclaim set" << dendl
;
936 if (client_reclaim_gather
.empty()) {
937 dout(7) << " client " << session
->info
.inst
<< " was last reclaimed, finishing" << dendl
;
938 mds
->maybe_clientreplay_done();
942 if (session
->is_closing()) {
943 // mark con disposable. if there is a fault, we will get a
944 // reset and clean it up. if the client hasn't received the
945 // CLOSE message yet, they will reconnect and get an
946 // ms_handle_remote_reset() and realize they had in fact closed.
947 // do this *before* sending the message to avoid a possible
949 if (session
->get_connection()) {
950 // Conditional because terminate_sessions will indiscrimately
951 // put sessions in CLOSING whether they ever had a conn or not.
952 session
->get_connection()->mark_disposable();
956 mds
->send_message_client(make_message
<MClientSession
>(CEPH_SESSION_CLOSE
), session
);
957 mds
->sessionmap
.set_state(session
, Session::STATE_CLOSED
);
959 metrics_handler
->remove_session(session
);
960 mds
->sessionmap
.remove_session(session
);
961 } else if (session
->is_killing()) {
962 // destroy session, close connection
963 if (session
->get_connection()) {
964 session
->get_connection()->mark_down();
965 mds
->sessionmap
.set_state(session
, Session::STATE_CLOSED
);
966 session
->set_connection(nullptr);
968 metrics_handler
->remove_session(session
);
969 mds
->sessionmap
.remove_session(session
);
/*
 * Inject sessions from some source other than actual connections.
 *
 * For example:
 *  - sessions inferred from journal replay
 *  - sessions learned from other MDSs during rejoin
 *  - sessions learned from other MDSs during dir/caps migration
 *  - sessions learned from other MDSs during a cross-MDS rename
 */
987 version_t
Server::prepare_force_open_sessions(map
<client_t
,entity_inst_t
>& cm
,
988 map
<client_t
,client_metadata_t
>& cmm
,
989 map
<client_t
, pair
<Session
*,uint64_t> >& smap
)
991 version_t pv
= mds
->sessionmap
.get_projected();
993 dout(10) << "prepare_force_open_sessions " << pv
994 << " on " << cm
.size() << " clients"
997 mds
->objecter
->with_osdmap(
998 [this, &cm
, &cmm
](const OSDMap
&osd_map
) {
999 for (auto p
= cm
.begin(); p
!= cm
.end(); ) {
1000 if (osd_map
.is_blocklisted(p
->second
.addr
)) {
1001 dout(10) << " ignoring blocklisted client." << p
->first
1002 << " (" << p
->second
.addr
<< ")" << dendl
;
1003 cmm
.erase(p
->first
);
1011 for (map
<client_t
,entity_inst_t
>::iterator p
= cm
.begin(); p
!= cm
.end(); ++p
) {
1012 Session
*session
= mds
->sessionmap
.get_or_add_session(p
->second
);
1013 pv
= mds
->sessionmap
.mark_projected(session
);
1015 if (session
->is_closed() ||
1016 session
->is_closing() ||
1017 session
->is_killing()) {
1018 sseq
= mds
->sessionmap
.set_state(session
, Session::STATE_OPENING
);
1019 auto q
= cmm
.find(p
->first
);
1021 session
->info
.client_metadata
.merge(q
->second
);
1023 ceph_assert(session
->is_open() ||
1024 session
->is_opening() ||
1025 session
->is_stale());
1028 smap
[p
->first
] = make_pair(session
, sseq
);
1029 session
->inc_importing();
1034 void Server::finish_force_open_sessions(const map
<client_t
,pair
<Session
*,uint64_t> >& smap
,
1038 * FIXME: need to carefully consider the race conditions between a
1039 * client trying to close a session and an MDS doing an import
1040 * trying to force open a session...
1042 dout(10) << "finish_force_open_sessions on " << smap
.size() << " clients,"
1043 << " initial v " << mds
->sessionmap
.get_version() << dendl
;
1045 for (auto &it
: smap
) {
1046 Session
*session
= it
.second
.first
;
1047 uint64_t sseq
= it
.second
.second
;
1049 if (session
->get_state_seq() != sseq
) {
1050 dout(10) << "force_open_sessions skipping changed " << session
->info
.inst
<< dendl
;
1052 dout(10) << "force_open_sessions opened " << session
->info
.inst
<< dendl
;
1053 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
1054 mds
->sessionmap
.touch_session(session
);
1055 metrics_handler
->add_session(session
);
1057 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_OPEN
);
1058 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
)) {
1059 reply
->supported_features
= supported_features
;
1060 reply
->metric_spec
= supported_metric_spec
;
1062 mds
->send_message_client(reply
, session
);
1064 if (mdcache
->is_readonly())
1065 mds
->send_message_client(make_message
<MClientSession
>(CEPH_SESSION_FORCE_RO
), session
);
1068 dout(10) << "force_open_sessions skipping already-open " << session
->info
.inst
<< dendl
;
1069 ceph_assert(session
->is_open() || session
->is_stale());
1073 session
->dec_importing();
1076 mds
->sessionmap
.mark_dirty(session
);
1079 dout(10) << __func__
<< ": final v " << mds
->sessionmap
.get_version() << dendl
;
1082 class C_MDS_TerminatedSessions
: public ServerContext
{
1083 void finish(int r
) override
{
1084 server
->terminating_sessions
= false;
1087 explicit C_MDS_TerminatedSessions(Server
*s
) : ServerContext(s
) {}
1090 void Server::terminate_sessions()
1092 dout(5) << "terminating all sessions..." << dendl
;
1094 terminating_sessions
= true;
1096 // kill them off. clients will retry etc.
1097 set
<Session
*> sessions
;
1098 mds
->sessionmap
.get_client_session_set(sessions
);
1099 for (set
<Session
*>::const_iterator p
= sessions
.begin();
1100 p
!= sessions
.end();
1102 Session
*session
= *p
;
1103 if (session
->is_closing() ||
1104 session
->is_killing() ||
1105 session
->is_closed())
1107 journal_close_session(session
, Session::STATE_CLOSING
, NULL
);
1110 mdlog
->wait_for_safe(new C_MDS_TerminatedSessions(this));
1114 void Server::find_idle_sessions()
1116 auto now
= clock::now();
1117 auto last_cleared_laggy
= mds
->last_cleared_laggy();
1119 dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy
<< "s ago" << dendl
;
1122 // (caps go stale, lease die)
1123 double queue_max_age
= mds
->get_dispatch_queue_max_age(ceph_clock_now());
1124 double cutoff
= queue_max_age
+ mds
->mdsmap
->get_session_timeout();
1126 // don't kick clients if we've been laggy
1127 if (last_cleared_laggy
< cutoff
) {
1128 dout(10) << " last cleared laggy " << last_cleared_laggy
<< "s ago (< cutoff " << cutoff
1129 << "), not marking any client stale" << dendl
;
1133 std::vector
<Session
*> to_evict
;
1135 bool defer_session_stale
= g_conf().get_val
<bool>("mds_defer_session_stale");
1136 const auto sessions_p1
= mds
->sessionmap
.by_state
.find(Session::STATE_OPEN
);
1137 if (sessions_p1
!= mds
->sessionmap
.by_state
.end() && !sessions_p1
->second
->empty()) {
1138 std::vector
<Session
*> new_stale
;
1140 for (auto session
: *(sessions_p1
->second
)) {
1141 auto last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_cap_renew
).count();
1142 if (last_cap_renew_span
< cutoff
) {
1143 dout(20) << "laggiest active session is " << session
->info
.inst
1144 << " and renewed caps recently (" << last_cap_renew_span
<< "s ago)" << dendl
;
1148 if (session
->last_seen
> session
->last_cap_renew
) {
1149 last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_seen
).count();
1150 if (last_cap_renew_span
< cutoff
) {
1151 dout(20) << "laggiest active session is " << session
->info
.inst
1152 << " and renewed caps recently (" << last_cap_renew_span
<< "s ago)" << dendl
;
1157 if (last_cap_renew_span
>= mds
->mdsmap
->get_session_autoclose()) {
1158 dout(20) << "evicting session " << session
->info
.inst
<< " since autoclose "
1159 "has arrived" << dendl
;
1160 // evict session without marking it stale
1161 to_evict
.push_back(session
);
1165 if (defer_session_stale
&&
1166 !session
->is_any_flush_waiter() &&
1167 !mds
->locker
->is_revoking_any_caps_from(session
->get_client())) {
1168 dout(20) << "deferring marking session " << session
->info
.inst
<< " stale "
1169 "since it holds no caps" << dendl
;
1173 auto it
= session
->info
.client_metadata
.find("timeout");
1174 if (it
!= session
->info
.client_metadata
.end()) {
1175 unsigned timeout
= strtoul(it
->second
.c_str(), nullptr, 0);
1177 dout(10) << "skipping session " << session
->info
.inst
1178 << ", infinite timeout specified" << dendl
;
1181 double cutoff
= queue_max_age
+ timeout
;
1182 if (last_cap_renew_span
< cutoff
) {
1183 dout(10) << "skipping session " << session
->info
.inst
1184 << ", timeout (" << timeout
<< ") specified"
1185 << " and renewed caps recently (" << last_cap_renew_span
<< "s ago)" << dendl
;
1189 // do not go through stale, evict it directly.
1190 to_evict
.push_back(session
);
1192 dout(10) << "new stale session " << session
->info
.inst
1193 << " last renewed caps " << last_cap_renew_span
<< "s ago" << dendl
;
1194 new_stale
.push_back(session
);
1198 for (auto session
: new_stale
) {
1199 mds
->sessionmap
.set_state(session
, Session::STATE_STALE
);
1200 if (mds
->locker
->revoke_stale_caps(session
)) {
1201 mds
->locker
->remove_stale_leases(session
);
1202 finish_flush_session(session
, session
->get_push_seq());
1203 auto m
= make_message
<MClientSession
>(CEPH_SESSION_STALE
);
1204 mds
->send_message_client(m
, session
);
1206 to_evict
.push_back(session
);
1212 cutoff
= queue_max_age
+ mds
->mdsmap
->get_session_autoclose();
1214 // Collect a list of sessions exceeding the autoclose threshold
1215 const auto sessions_p2
= mds
->sessionmap
.by_state
.find(Session::STATE_STALE
);
1216 if (sessions_p2
!= mds
->sessionmap
.by_state
.end() && !sessions_p2
->second
->empty()) {
1217 for (auto session
: *(sessions_p2
->second
)) {
1218 ceph_assert(session
->is_stale());
1219 auto last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_cap_renew
).count();
1220 if (last_cap_renew_span
< cutoff
) {
1221 dout(20) << "oldest stale session is " << session
->info
.inst
1222 << " and recently renewed caps " << last_cap_renew_span
<< "s ago" << dendl
;
1225 to_evict
.push_back(session
);
1229 for (auto session
: to_evict
) {
1230 if (session
->is_importing()) {
1231 dout(10) << "skipping session " << session
->info
.inst
<< ", it's being imported" << dendl
;
1235 auto last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_cap_renew
).count();
1236 mds
->clog
->warn() << "evicting unresponsive client " << *session
1237 << ", after " << last_cap_renew_span
<< " seconds";
1238 dout(10) << "autoclosing stale session " << session
->info
.inst
1239 << " last renewed caps " << last_cap_renew_span
<< "s ago" << dendl
;
1241 if (g_conf()->mds_session_blocklist_on_timeout
) {
1242 CachedStackStringStream css
;
1243 mds
->evict_client(session
->get_client().v
, false, true, *css
, nullptr);
1245 kill_session(session
, NULL
);
1250 void Server::evict_cap_revoke_non_responders() {
1251 if (!cap_revoke_eviction_timeout
) {
1255 auto&& to_evict
= mds
->locker
->get_late_revoking_clients(cap_revoke_eviction_timeout
);
1257 for (auto const &client
: to_evict
) {
1258 mds
->clog
->warn() << "client id " << client
<< " has not responded to"
1259 << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
1260 << " seconds, evicting";
1261 dout(1) << __func__
<< ": evicting cap revoke non-responder client id "
1264 CachedStackStringStream css
;
1265 bool evicted
= mds
->evict_client(client
.v
, false,
1266 g_conf()->mds_session_blocklist_on_evict
,
1268 if (evicted
&& logger
) {
1269 logger
->inc(l_mdss_cap_revoke_eviction
);
1274 void Server::handle_conf_change(const std::set
<std::string
>& changed
) {
1275 if (changed
.count("mds_forward_all_requests_to_auth")){
1276 forward_all_requests_to_auth
= g_conf().get_val
<bool>("mds_forward_all_requests_to_auth");
1278 if (changed
.count("mds_cap_revoke_eviction_timeout")) {
1279 cap_revoke_eviction_timeout
= g_conf().get_val
<double>("mds_cap_revoke_eviction_timeout");
1280 dout(20) << __func__
<< " cap revoke eviction timeout changed to "
1281 << cap_revoke_eviction_timeout
<< dendl
;
1283 if (changed
.count("mds_recall_max_decay_rate")) {
1284 recall_throttle
= DecayCounter(g_conf().get_val
<double>("mds_recall_max_decay_rate"));
1286 if (changed
.count("mds_max_snaps_per_dir")) {
1287 max_snaps_per_dir
= g_conf().get_val
<uint64_t>("mds_max_snaps_per_dir");
1288 dout(20) << __func__
<< " max snapshots per directory changed to "
1289 << max_snaps_per_dir
<< dendl
;
1291 if (changed
.count("mds_client_delegate_inos_pct")) {
1292 delegate_inos_pct
= g_conf().get_val
<uint64_t>("mds_client_delegate_inos_pct");
1294 if (changed
.count("mds_max_caps_per_client")) {
1295 max_caps_per_client
= g_conf().get_val
<uint64_t>("mds_max_caps_per_client");
1297 if (changed
.count("mds_session_cap_acquisition_throttle")) {
1298 cap_acquisition_throttle
= g_conf().get_val
<uint64_t>("mds_session_cap_acquisition_throttle");
1300 if (changed
.count("mds_session_max_caps_throttle_ratio")) {
1301 max_caps_throttle_ratio
= g_conf().get_val
<double>("mds_session_max_caps_throttle_ratio");
1303 if (changed
.count("mds_cap_acquisition_throttle_retry_request_timeout")) {
1304 caps_throttle_retry_request_timeout
= g_conf().get_val
<double>("mds_cap_acquisition_throttle_retry_request_timeout");
1306 if (changed
.count("mds_alternate_name_max")) {
1307 alternate_name_max
= g_conf().get_val
<Option::size_t>("mds_alternate_name_max");
1309 if (changed
.count("mds_fscrypt_last_block_max_size")) {
1310 fscrypt_last_block_max_size
= g_conf().get_val
<Option::size_t>("mds_fscrypt_last_block_max_size");
1312 if (changed
.count("mds_dir_max_entries")) {
1313 dir_max_entries
= g_conf().get_val
<uint64_t>("mds_dir_max_entries");
1314 dout(20) << __func__
<< " max entries per directory changed to "
1315 << dir_max_entries
<< dendl
;
1317 if (changed
.count("mds_bal_fragment_size_max")) {
1318 bal_fragment_size_max
= g_conf().get_val
<int64_t>("mds_bal_fragment_size_max");
1319 dout(20) << __func__
<< " max fragment size changed to "
1320 << bal_fragment_size_max
<< dendl
;
1322 if (changed
.count("mds_inject_rename_corrupt_dentry_first")) {
1323 inject_rename_corrupt_dentry_first
= g_conf().get_val
<double>("mds_inject_rename_corrupt_dentry_first");
1328 * XXX bump in the interface here, not using an MDSContext here
1329 * because all the callers right now happen to use a SaferCond
1331 void Server::kill_session(Session
*session
, Context
*on_safe
)
1333 ceph_assert(ceph_mutex_is_locked_by_me(mds
->mds_lock
));
1335 if ((session
->is_opening() ||
1336 session
->is_open() ||
1337 session
->is_stale()) &&
1338 !session
->is_importing()) {
1339 dout(10) << "kill_session " << session
<< dendl
;
1340 journal_close_session(session
, Session::STATE_KILLING
, on_safe
);
1342 dout(10) << "kill_session importing or already closing/killing " << session
<< dendl
;
1343 if (session
->is_closing() ||
1344 session
->is_killing()) {
1346 mdlog
->wait_for_safe(new MDSInternalContextWrapper(mds
, on_safe
));
1348 ceph_assert(session
->is_closed() ||
1349 session
->is_importing());
1351 on_safe
->complete(0);
1356 size_t Server::apply_blocklist()
1358 std::vector
<Session
*> victims
;
1359 const auto& sessions
= mds
->sessionmap
.get_sessions();
1360 mds
->objecter
->with_osdmap(
1361 [&](const OSDMap
& o
) {
1362 for (const auto& p
: sessions
) {
1363 if (!p
.first
.is_client()) {
1364 // Do not apply OSDMap blocklist to MDS daemons, we find out
1365 // about their death via MDSMap.
1368 if (o
.is_blocklisted(p
.second
->info
.inst
.addr
)) {
1369 victims
.push_back(p
.second
);
1374 for (const auto& s
: victims
) {
1375 kill_session(s
, nullptr);
1378 dout(10) << "apply_blocklist: killed " << victims
.size() << dendl
;
1380 return victims
.size();
1383 void Server::journal_close_session(Session
*session
, int state
, Context
*on_safe
)
1385 dout(10) << __func__
<< " : "
1386 << session
->info
.inst
1387 << " pending_prealloc_inos " << session
->pending_prealloc_inos
1388 << " free_prealloc_inos " << session
->free_prealloc_inos
1389 << " delegated_inos " << session
->delegated_inos
<< dendl
;
1391 uint64_t sseq
= mds
->sessionmap
.set_state(session
, state
);
1392 version_t pv
= mds
->sessionmap
.mark_projected(session
);
1395 // release alloc and pending-alloc inos for this session
1396 // and wipe out session state, in case the session close aborts for some reason
1397 interval_set
<inodeno_t
> inos_to_free
;
1398 inos_to_free
.insert(session
->pending_prealloc_inos
);
1399 inos_to_free
.insert(session
->free_prealloc_inos
);
1400 if (inos_to_free
.size()) {
1401 mds
->inotable
->project_release_ids(inos_to_free
);
1402 piv
= mds
->inotable
->get_projected_version();
1406 auto le
= new ESession(session
->info
.inst
, false, pv
, inos_to_free
, piv
, session
->delegated_inos
);
1407 auto fin
= new C_MDS_session_finish(this, session
, sseq
, false, pv
, inos_to_free
, piv
,
1408 session
->delegated_inos
, mdlog
->get_current_segment(), on_safe
);
1409 mdlog
->start_submit_entry(le
, fin
);
1412 // clean up requests, too
1413 while(!session
->requests
.empty()) {
1414 auto mdr
= MDRequestRef(*session
->requests
.begin());
1415 mdcache
->request_kill(mdr
);
1418 finish_flush_session(session
, session
->get_push_seq());
1421 void Server::reconnect_clients(MDSContext
*reconnect_done_
)
1423 reconnect_done
= reconnect_done_
;
1425 auto now
= clock::now();
1426 set
<Session
*> sessions
;
1427 mds
->sessionmap
.get_client_session_set(sessions
);
1428 for (auto session
: sessions
) {
1429 if (session
->is_open()) {
1430 client_reconnect_gather
.insert(session
->get_client());
1431 session
->set_reconnecting(true);
1432 session
->last_cap_renew
= now
;
1436 if (client_reconnect_gather
.empty()) {
1437 dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl
;
1438 reconnect_gather_finish();
1442 // clients will get the mdsmap and discover we're reconnecting via the monitor.
1444 reconnect_start
= now
;
1445 dout(1) << "reconnect_clients -- " << client_reconnect_gather
.size() << " sessions" << dendl
;
1446 mds
->sessionmap
.dump();
1449 void Server::handle_client_reconnect(const cref_t
<MClientReconnect
> &m
)
1451 dout(7) << "handle_client_reconnect " << m
->get_source()
1452 << (m
->has_more() ? " (more)" : "") << dendl
;
1453 client_t from
= m
->get_source().num();
1454 Session
*session
= mds
->get_session(m
);
1456 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
1457 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
1458 reply
->metadata
["error_string"] = "sessionless";
1459 mds
->send_message(reply
, m
->get_connection());
1463 if(mds
->mdsmap
->test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION
)) {
1464 mds
->clog
->warn() << "client could not reconnect as"
1465 " file system flag refuse_client_session is set";
1466 dout(0) << "client cannot reconnect when file system flag"
1467 " refuse_client_session is set" << dendl
;
1468 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_CLOSE
);
1469 reply
->metadata
["error_string"] = "client cannot reconnect when file system flag"
1470 " refuse_client_session is set";
1471 mds
->send_message(reply
, m
->get_connection());
1475 if (!session
->is_open()) {
1476 dout(0) << " ignoring msg from not-open session" << *m
<< dendl
;
1477 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_CLOSE
);
1478 mds
->send_message(reply
, m
->get_connection());
1482 bool reconnect_all_deny
= g_conf().get_val
<bool>("mds_deny_all_reconnect");
1484 if (!mds
->is_reconnect() && mds
->get_want_state() == CEPH_MDS_STATE_RECONNECT
) {
1485 dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl
;
1486 mds
->wait_for_reconnect(new C_MDS_RetryMessage(mds
, m
));
1490 auto delay
= std::chrono::duration
<double>(clock::now() - reconnect_start
).count();
1491 dout(10) << " reconnect_start " << reconnect_start
<< " delay " << delay
<< dendl
;
1494 if (reconnect_all_deny
|| !mds
->is_reconnect() || mds
->get_want_state() != CEPH_MDS_STATE_RECONNECT
|| reconnect_evicting
) {
1495 // XXX maybe in the future we can do better than this?
1496 if (reconnect_all_deny
) {
1497 dout(1) << "mds_deny_all_reconnect was set to speed up reboot phase, ignoring reconnect, sending close" << dendl
;
1499 dout(1) << "no longer in reconnect state, ignoring reconnect, sending close" << dendl
;
1501 mds
->clog
->info() << "denied reconnect attempt (mds is "
1502 << ceph_mds_state_name(mds
->get_state())
1503 << ") from " << m
->get_source_inst()
1504 << " after " << delay
<< " (allowed interval " << g_conf()->mds_reconnect_timeout
<< ")";
1507 std::string error_str
;
1508 if (!session
->is_open()) {
1509 error_str
= "session is closed";
1510 } else if (mdcache
->is_readonly()) {
1511 error_str
= "mds is readonly";
1513 if (session
->info
.client_metadata
.features
.empty())
1514 infer_supported_features(session
, session
->info
.client_metadata
);
1516 feature_bitset_t missing_features
= required_client_features
;
1517 missing_features
-= session
->info
.client_metadata
.features
;
1518 if (!missing_features
.empty()) {
1519 CachedStackStringStream css
;
1520 *css
<< "missing required features '" << missing_features
<< "'";
1521 error_str
= css
->strv();
1525 if (!error_str
.empty()) {
1527 dout(1) << " " << error_str
<< ", ignoring reconnect, sending close" << dendl
;
1528 mds
->clog
->info() << "denied reconnect attempt from "
1529 << m
->get_source_inst() << " (" << error_str
<< ")";
1534 auto r
= make_message
<MClientSession
>(CEPH_SESSION_CLOSE
);
1535 mds
->send_message_client(r
, session
);
1536 if (session
->is_open()) {
1537 client_reconnect_denied
.insert(session
->get_client());
1542 if (!m
->has_more()) {
1543 metrics_handler
->add_session(session
);
1544 // notify client of success with an OPEN
1545 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_OPEN
);
1546 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
)) {
1547 reply
->supported_features
= supported_features
;
1548 reply
->metric_spec
= supported_metric_spec
;
1550 mds
->send_message_client(reply
, session
);
1551 mds
->clog
->debug() << "reconnect by " << session
->info
.inst
<< " after " << delay
;
1554 session
->last_cap_renew
= clock::now();
1557 for (const auto &r
: m
->realms
) {
1558 CInode
*in
= mdcache
->get_inode(inodeno_t(r
.realm
.ino
));
1559 if (in
&& in
->state_test(CInode::STATE_PURGING
))
1562 if (in
->snaprealm
) {
1563 dout(15) << "open snaprealm (w inode) on " << *in
<< dendl
;
1565 // this can happen if we are non-auth or we rollback snaprealm
1566 dout(15) << "open snaprealm (null snaprealm) on " << *in
<< dendl
;
1568 mdcache
->add_reconnected_snaprealm(from
, inodeno_t(r
.realm
.ino
), snapid_t(r
.realm
.seq
));
1570 dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r
.realm
.ino
)
1571 << " seq " << r
.realm
.seq
<< dendl
;
1572 mdcache
->add_reconnected_snaprealm(from
, inodeno_t(r
.realm
.ino
), snapid_t(r
.realm
.seq
));
1577 for (const auto &p
: m
->caps
) {
1578 // make sure our last_cap_id is MAX over all issued caps
1579 if (p
.second
.capinfo
.cap_id
> mdcache
->last_cap_id
)
1580 mdcache
->last_cap_id
= p
.second
.capinfo
.cap_id
;
1582 CInode
*in
= mdcache
->get_inode(p
.first
);
1583 if (in
&& in
->state_test(CInode::STATE_PURGING
))
1585 if (in
&& in
->is_auth()) {
1586 // we recovered it, and it's ours. take note.
1587 dout(15) << "open cap realm " << inodeno_t(p
.second
.capinfo
.snaprealm
)
1588 << " on " << *in
<< dendl
;
1589 in
->reconnect_cap(from
, p
.second
, session
);
1590 mdcache
->add_reconnected_cap(from
, p
.first
, p
.second
);
1591 recover_filelocks(in
, p
.second
.flockbl
, m
->get_orig_source().num());
1595 if (in
&& !in
->is_auth()) {
1597 dout(10) << "non-auth " << *in
<< ", will pass off to authority" << dendl
;
1598 // add to cap export list.
1599 mdcache
->rejoin_export_caps(p
.first
, from
, p
.second
,
1600 in
->authority().first
, true);
1602 // don't know if the inode is mine
1603 dout(10) << "missing ino " << p
.first
<< ", will load later" << dendl
;
1604 mdcache
->rejoin_recovered_caps(p
.first
, from
, p
.second
, MDS_RANK_NONE
);
1608 reconnect_last_seen
= clock::now();
1610 if (!m
->has_more()) {
1611 mdcache
->rejoin_recovered_client(session
->get_client(), session
->info
.inst
);
1613 // remove from gather set
1614 client_reconnect_gather
.erase(from
);
1615 session
->set_reconnecting(false);
1616 if (client_reconnect_gather
.empty())
1617 reconnect_gather_finish();
1621 void Server::infer_supported_features(Session
*session
, client_metadata_t
& client_metadata
)
1624 auto it
= client_metadata
.find("ceph_version");
1625 if (it
!= client_metadata
.end()) {
1626 // user space client
1627 if (it
->second
.compare(0, 16, "ceph version 12.") == 0)
1628 supported
= CEPHFS_FEATURE_LUMINOUS
;
1629 else if (session
->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR
))
1630 supported
= CEPHFS_FEATURE_KRAKEN
;
1632 it
= client_metadata
.find("kernel_version");
1633 if (it
!= client_metadata
.end()) {
1635 if (session
->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING
))
1636 supported
= CEPHFS_FEATURE_LUMINOUS
;
1639 if (supported
== -1 &&
1640 session
->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2
))
1641 supported
= CEPHFS_FEATURE_JEWEL
;
1643 if (supported
>= 0) {
1644 unsigned long value
= (1UL << (supported
+ 1)) - 1;
1645 client_metadata
.features
= feature_bitset_t(value
);
1646 dout(10) << __func__
<< " got '" << client_metadata
.features
<< "'" << dendl
;
1650 void Server::update_required_client_features()
1652 required_client_features
= mds
->mdsmap
->get_required_client_features();
1653 dout(7) << "required_client_features: " << required_client_features
<< dendl
;
1655 if (mds
->get_state() >= MDSMap::STATE_RECONNECT
) {
1656 set
<Session
*> sessions
;
1657 mds
->sessionmap
.get_client_session_set(sessions
);
1658 for (auto session
: sessions
) {
1659 feature_bitset_t missing_features
= required_client_features
;
1660 missing_features
-= session
->info
.client_metadata
.features
;
1661 if (!missing_features
.empty()) {
1662 bool blocklisted
= mds
->objecter
->with_osdmap(
1663 [session
](const OSDMap
&osd_map
) -> bool {
1664 return osd_map
.is_blocklisted(session
->info
.inst
.addr
);
1669 mds
->clog
->warn() << "evicting session " << *session
<< ", missing required features '"
1670 << missing_features
<< "'";
1671 CachedStackStringStream css
;
1672 mds
->evict_client(session
->get_client().v
, false,
1673 g_conf()->mds_session_blocklist_on_evict
, *css
);
1679 void Server::reconnect_gather_finish()
1681 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects
<< " clients" << dendl
;
1682 ceph_assert(reconnect_done
);
1684 if (!mds
->snapclient
->is_synced()) {
1685 // make sure snaptable cache is populated. snaprealms will be
1686 // extensively used in rejoin stage.
1687 dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl
;
1688 mds
->snapclient
->wait_for_sync(reconnect_done
);
1690 reconnect_done
->complete(0);
1692 reconnect_done
= NULL
;
1695 void Server::reconnect_tick()
1697 bool reject_all_reconnect
= false;
1698 if (reconnect_evicting
) {
1699 dout(7) << "reconnect_tick: waiting for evictions" << dendl
;
1704 * Set mds_deny_all_reconnect to reject all the reconnect req ,
1705 * then load less meta information in rejoin phase. This will shorten reboot time.
1706 * Moreover, loading less meta increases the chance standby with less memory can failover.
1708 * Why not shorten reconnect period?
1709 * Clients may send unsafe or retry requests, which haven't been
1710 * completed before old mds stop, to new mds. These requests may
1711 * need to be processed during new mds's clientreplay phase,
1712 * see: #https://github.com/ceph/ceph/pull/29059.
1714 bool reconnect_all_deny
= g_conf().get_val
<bool>("mds_deny_all_reconnect");
1715 if (client_reconnect_gather
.empty())
1718 if (reconnect_all_deny
&& (client_reconnect_gather
== client_reconnect_denied
))
1719 reject_all_reconnect
= true;
1721 auto now
= clock::now();
1722 auto elapse1
= std::chrono::duration
<double>(now
- reconnect_start
).count();
1723 if (elapse1
< g_conf()->mds_reconnect_timeout
&& !reject_all_reconnect
)
1726 vector
<Session
*> remaining_sessions
;
1727 remaining_sessions
.reserve(client_reconnect_gather
.size());
1728 for (auto c
: client_reconnect_gather
) {
1729 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(c
.v
));
1730 ceph_assert(session
);
1731 remaining_sessions
.push_back(session
);
1732 // client re-sends cap flush messages before the reconnect message
1733 if (session
->last_seen
> reconnect_last_seen
)
1734 reconnect_last_seen
= session
->last_seen
;
1737 auto elapse2
= std::chrono::duration
<double>(now
- reconnect_last_seen
).count();
1738 if (elapse2
< g_conf()->mds_reconnect_timeout
/ 2 && !reject_all_reconnect
) {
1739 dout(7) << "reconnect_tick: last seen " << elapse2
1740 << " seconds ago, extending reconnect interval" << dendl
;
1744 dout(7) << "reconnect timed out, " << remaining_sessions
.size()
1745 << " clients have not reconnected in time" << dendl
;
1747 // If we're doing blocklist evictions, use this to wait for them before
1748 // proceeding to reconnect_gather_finish
1749 MDSGatherBuilder
gather(g_ceph_context
);
1751 for (auto session
: remaining_sessions
) {
1752 // Keep sessions that have specified timeout. These sessions will prevent
1753 // mds from going to active. MDS goes to active after they all have been
1754 // killed or reclaimed.
1755 if (session
->info
.client_metadata
.find("timeout") !=
1756 session
->info
.client_metadata
.end()) {
1757 dout(1) << "reconnect keeps " << session
->info
.inst
1758 << ", need to be reclaimed" << dendl
;
1759 client_reclaim_gather
.insert(session
->get_client());
1763 dout(1) << "reconnect gives up on " << session
->info
.inst
<< dendl
;
1765 mds
->clog
->warn() << "evicting unresponsive client " << *session
1766 << ", after waiting " << elapse1
1767 << " seconds during MDS startup";
1769 // make _session_logged() purge orphan objects of lost async/unsafe requests
1770 session
->delegated_inos
.swap(session
->free_prealloc_inos
);
1772 if (g_conf()->mds_session_blocklist_on_timeout
) {
1773 CachedStackStringStream css
;
1774 mds
->evict_client(session
->get_client().v
, false, true, *css
,
1777 kill_session(session
, NULL
);
1780 failed_reconnects
++;
1782 client_reconnect_gather
.clear();
1783 client_reconnect_denied
.clear();
1785 if (gather
.has_subs()) {
1786 dout(1) << "reconnect will complete once clients are evicted" << dendl
;
1787 gather
.set_finisher(new MDSInternalContextWrapper(mds
, new LambdaContext(
1788 [this](int r
){reconnect_gather_finish();})));
1790 reconnect_evicting
= true;
1792 reconnect_gather_finish();
1796 void Server::recover_filelocks(CInode
*in
, bufferlist locks
, int64_t client
)
1798 if (!locks
.length()) return;
1801 auto p
= locks
.cbegin();
1802 decode(numlocks
, p
);
1803 for (int i
= 0; i
< numlocks
; ++i
) {
1805 lock
.client
= client
;
1806 in
->get_fcntl_lock_state()->held_locks
.insert(pair
<uint64_t, ceph_filelock
>(lock
.start
, lock
));
1807 ++in
->get_fcntl_lock_state()->client_held_lock_counts
[client
];
1809 decode(numlocks
, p
);
1810 for (int i
= 0; i
< numlocks
; ++i
) {
1812 lock
.client
= client
;
1813 in
->get_flock_lock_state()->held_locks
.insert(pair
<uint64_t, ceph_filelock
> (lock
.start
, lock
));
1814 ++in
->get_flock_lock_state()->client_held_lock_counts
[client
];
1819 * Call this when the MDCache is oversized, to send requests to the clients
1820 * to trim some caps, and consequently unpin some inodes in the MDCache so
1821 * that it can trim too.
1823 std::pair
<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder
* gather
, RecallFlags flags
)
1825 const auto now
= clock::now();
1826 const bool steady
= !!(flags
&RecallFlags::STEADY
);
1827 const bool enforce_max
= !!(flags
&RecallFlags::ENFORCE_MAX
);
1828 const bool enforce_liveness
= !!(flags
&RecallFlags::ENFORCE_LIVENESS
);
1829 const bool trim
= !!(flags
&RecallFlags::TRIM
);
1831 const auto max_caps_per_client
= g_conf().get_val
<uint64_t>("mds_max_caps_per_client");
1832 const auto min_caps_per_client
= g_conf().get_val
<uint64_t>("mds_min_caps_per_client");
1833 const auto recall_global_max_decay_threshold
= g_conf().get_val
<Option::size_t>("mds_recall_global_max_decay_threshold");
1834 const auto recall_max_caps
= g_conf().get_val
<Option::size_t>("mds_recall_max_caps");
1835 const auto recall_max_decay_threshold
= g_conf().get_val
<Option::size_t>("mds_recall_max_decay_threshold");
1836 const auto cache_liveness_magnitude
= g_conf().get_val
<Option::size_t>("mds_session_cache_liveness_magnitude");
1838 dout(7) << __func__
<< ":"
1839 << " min=" << min_caps_per_client
1840 << " max=" << max_caps_per_client
1841 << " total=" << Capability::count()
1842 << " flags=" << flags
1845 /* trim caps of sessions with the most caps first */
1846 std::multimap
<uint64_t, Session
*> caps_session
;
1847 auto f
= [&caps_session
, enforce_max
, enforce_liveness
, trim
, max_caps_per_client
, cache_liveness_magnitude
](auto& s
) {
1848 auto num_caps
= s
->caps
.size();
1849 auto cache_liveness
= s
->get_session_cache_liveness();
1850 if (trim
|| (enforce_max
&& num_caps
> max_caps_per_client
) || (enforce_liveness
&& cache_liveness
< (num_caps
>>cache_liveness_magnitude
))) {
1851 caps_session
.emplace(std::piecewise_construct
, std::forward_as_tuple(num_caps
), std::forward_as_tuple(s
));
1854 mds
->sessionmap
.get_client_sessions(std::move(f
));
1856 std::pair
<bool, uint64_t> result
= {false, 0};
1857 auto& [throttled
, caps_recalled
] = result
;
1858 last_recall_state
= now
;
1859 for (const auto& [num_caps
, session
] : boost::adaptors::reverse(caps_session
)) {
1860 if (!session
->is_open() ||
1861 !session
->get_connection() ||
1862 !session
->info
.inst
.name
.is_client())
1865 dout(10) << __func__
<< ":"
1866 << " session " << session
->info
.inst
1867 << " caps " << num_caps
1868 << ", leases " << session
->leases
.size()
1872 if (num_caps
< recall_max_caps
|| (num_caps
-recall_max_caps
) < min_caps_per_client
) {
1873 newlim
= min_caps_per_client
;
1875 newlim
= num_caps
-recall_max_caps
;
1877 if (num_caps
> newlim
) {
1878 /* now limit the number of caps we recall at a time to prevent overloading ourselves */
1879 uint64_t recall
= std::min
<uint64_t>(recall_max_caps
, num_caps
-newlim
);
1880 newlim
= num_caps
-recall
;
1881 const uint64_t session_recall_throttle
= session
->get_recall_caps_throttle();
1882 const uint64_t session_recall_throttle2o
= session
->get_recall_caps_throttle2o();
1883 const uint64_t global_recall_throttle
= recall_throttle
.get();
1884 if (session_recall_throttle
+recall
> recall_max_decay_threshold
) {
1885 dout(15) << " session recall threshold (" << recall_max_decay_threshold
<< ") hit at " << session_recall_throttle
<< "; skipping!" << dendl
;
1888 } else if (session_recall_throttle2o
+recall
> recall_max_caps
*2) {
1889 dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps
<< ") hit at " << session_recall_throttle2o
<< "; skipping!" << dendl
;
1892 } else if (global_recall_throttle
+recall
> recall_global_max_decay_threshold
) {
1893 dout(15) << " global recall threshold (" << recall_global_max_decay_threshold
<< ") hit at " << global_recall_throttle
<< "; skipping!" << dendl
;
1898 // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
1900 const auto session_recall
= session
->get_recall_caps();
1901 const auto session_release
= session
->get_release_caps();
1902 if (2*session_release
< session_recall
&& 2*session_recall
> recall_max_decay_threshold
) {
1903 /* The session has been unable to keep up with the number of caps
1904 * recalled (by half); additionally, to prevent marking sessions
1905 * we've just begun to recall from, the session_recall counter
1906 * (decayed count of caps recently recalled) is **greater** than the
1907 * session threshold for the session's cap recall throttle.
1909 dout(15) << " 2*session_release < session_recall"
1910 " (2*" << session_release
<< " < " << session_recall
<< ") &&"
1911 " 2*session_recall < recall_max_decay_threshold"
1912 " (2*" << session_recall
<< " > " << recall_max_decay_threshold
<< ")"
1913 " Skipping because we are unlikely to get more released." << dendl
;
1915 } else if (recall
< recall_max_caps
&& 2*recall
< session_recall
) {
1916 /* The number of caps recalled is less than the number we *could*
1917 * recall (so there isn't much left to recall?) and the number of
1918 * caps is less than the current recall_caps counter (decayed count
1919 * of caps recently recalled).
1921 dout(15) << " 2*recall < session_recall "
1922 " (2*" << recall
<< " < " << session_recall
<< ") &&"
1923 " recall < recall_max_caps (" << recall
<< " < " << recall_max_caps
<< ");"
1924 " Skipping because we are unlikely to get more released." << dendl
;
1929 dout(7) << " recalling " << recall
<< " caps; session_recall_throttle = " << session_recall_throttle
<< "; global_recall_throttle = " << global_recall_throttle
<< dendl
;
1931 auto m
= make_message
<MClientSession
>(CEPH_SESSION_RECALL_STATE
);
1932 m
->head
.max_caps
= newlim
;
1933 mds
->send_message_client(m
, session
);
1935 flush_session(session
, *gather
);
1937 caps_recalled
+= session
->notify_recall_sent(newlim
);
1938 recall_throttle
.hit(recall
);
1942 dout(7) << "recalled" << (throttled
? " (throttled)" : "") << " " << caps_recalled
<< " client caps." << dendl
;
1947 void Server::force_clients_readonly()
1949 dout(10) << "force_clients_readonly" << dendl
;
1950 set
<Session
*> sessions
;
1951 mds
->sessionmap
.get_client_session_set(sessions
);
1952 for (set
<Session
*>::const_iterator p
= sessions
.begin();
1953 p
!= sessions
.end();
1955 Session
*session
= *p
;
1956 if (!session
->info
.inst
.name
.is_client() ||
1957 !(session
->is_open() || session
->is_stale()))
1959 mds
->send_message_client(make_message
<MClientSession
>(CEPH_SESSION_FORCE_RO
), session
);
1964 * some generic stuff for finishing off requests
1966 void Server::journal_and_reply(MDRequestRef
& mdr
, CInode
*in
, CDentry
*dn
, LogEvent
*le
, MDSLogContextBase
*fin
)
1968 dout(10) << "journal_and_reply tracei " << in
<< " tracedn " << dn
<< dendl
;
1969 ceph_assert(!mdr
->has_completed
);
1971 // note trace items for eventual reply.
1980 early_reply(mdr
, in
, dn
);
1982 mdr
->committing
= true;
1983 submit_mdlog_entry(le
, fin
, mdr
, __func__
);
1985 if (mdr
->client_request
&& mdr
->client_request
->is_queued_for_replay()) {
1986 if (mds
->queue_one_replay()) {
1987 dout(10) << " queued next replay op" << dendl
;
1989 dout(10) << " journaled last replay op" << dendl
;
1991 } else if (mdr
->did_early_reply
) {
1992 mds
->locker
->drop_rdlocks_for_early_reply(mdr
.get());
1993 if (dn
&& dn
->is_waiter_for(CDentry::WAIT_UNLINK_FINISH
))
2000 void Server::submit_mdlog_entry(LogEvent
*le
, MDSLogContextBase
*fin
, MDRequestRef
& mdr
,
2001 std::string_view event
)
2004 string
event_str("submit entry: ");
2006 mdr
->mark_event(event_str
);
2008 mdlog
->submit_entry(le
, fin
);
2012 * send response built from mdr contents and error code; clean up mdr
2014 void Server::respond_to_request(MDRequestRef
& mdr
, int r
)
2016 if (mdr
->client_request
) {
2017 if (mdr
->is_batch_head()) {
2018 dout(20) << __func__
<< " batch head " << *mdr
<< dendl
;
2019 mdr
->release_batch_op()->respond(r
);
2021 reply_client_request(mdr
, make_message
<MClientReply
>(*mdr
->client_request
, r
));
2023 } else if (mdr
->internal_op
> -1) {
2024 dout(10) << "respond_to_request on internal request " << mdr
<< dendl
;
2025 if (!mdr
->internal_op_finish
)
2026 ceph_abort_msg("trying to respond to internal op without finisher");
2027 mdr
->internal_op_finish
->complete(r
);
2028 mdcache
->request_finish(mdr
);
2032 // statistics mds req op number and latency
2033 void Server::perf_gather_op_latency(const cref_t
<MClientRequest
> &req
, utime_t lat
)
2035 int code
= l_mdss_first
;
2036 switch(req
->get_op()) {
2037 case CEPH_MDS_OP_LOOKUPHASH
:
2038 code
= l_mdss_req_lookuphash_latency
;
2040 case CEPH_MDS_OP_LOOKUPINO
:
2041 code
= l_mdss_req_lookupino_latency
;
2043 case CEPH_MDS_OP_LOOKUPPARENT
:
2044 code
= l_mdss_req_lookupparent_latency
;
2046 case CEPH_MDS_OP_LOOKUPNAME
:
2047 code
= l_mdss_req_lookupname_latency
;
2049 case CEPH_MDS_OP_LOOKUP
:
2050 code
= l_mdss_req_lookup_latency
;
2052 case CEPH_MDS_OP_LOOKUPSNAP
:
2053 code
= l_mdss_req_lookupsnap_latency
;
2055 case CEPH_MDS_OP_GETATTR
:
2056 code
= l_mdss_req_getattr_latency
;
2058 case CEPH_MDS_OP_SETATTR
:
2059 code
= l_mdss_req_setattr_latency
;
2061 case CEPH_MDS_OP_SETLAYOUT
:
2062 code
= l_mdss_req_setlayout_latency
;
2064 case CEPH_MDS_OP_SETDIRLAYOUT
:
2065 code
= l_mdss_req_setdirlayout_latency
;
2067 case CEPH_MDS_OP_GETVXATTR
:
2068 code
= l_mdss_req_getvxattr_latency
;
2070 case CEPH_MDS_OP_SETXATTR
:
2071 code
= l_mdss_req_setxattr_latency
;
2073 case CEPH_MDS_OP_RMXATTR
:
2074 code
= l_mdss_req_rmxattr_latency
;
2076 case CEPH_MDS_OP_READDIR
:
2077 code
= l_mdss_req_readdir_latency
;
2079 case CEPH_MDS_OP_SETFILELOCK
:
2080 code
= l_mdss_req_setfilelock_latency
;
2082 case CEPH_MDS_OP_GETFILELOCK
:
2083 code
= l_mdss_req_getfilelock_latency
;
2085 case CEPH_MDS_OP_CREATE
:
2086 code
= l_mdss_req_create_latency
;
2088 case CEPH_MDS_OP_OPEN
:
2089 code
= l_mdss_req_open_latency
;
2091 case CEPH_MDS_OP_MKNOD
:
2092 code
= l_mdss_req_mknod_latency
;
2094 case CEPH_MDS_OP_LINK
:
2095 code
= l_mdss_req_link_latency
;
2097 case CEPH_MDS_OP_UNLINK
:
2098 code
= l_mdss_req_unlink_latency
;
2100 case CEPH_MDS_OP_RMDIR
:
2101 code
= l_mdss_req_rmdir_latency
;
2103 case CEPH_MDS_OP_RENAME
:
2104 code
= l_mdss_req_rename_latency
;
2106 case CEPH_MDS_OP_MKDIR
:
2107 code
= l_mdss_req_mkdir_latency
;
2109 case CEPH_MDS_OP_SYMLINK
:
2110 code
= l_mdss_req_symlink_latency
;
2112 case CEPH_MDS_OP_LSSNAP
:
2113 code
= l_mdss_req_lssnap_latency
;
2115 case CEPH_MDS_OP_MKSNAP
:
2116 code
= l_mdss_req_mksnap_latency
;
2118 case CEPH_MDS_OP_RMSNAP
:
2119 code
= l_mdss_req_rmsnap_latency
;
2121 case CEPH_MDS_OP_RENAMESNAP
:
2122 code
= l_mdss_req_renamesnap_latency
;
2125 dout(1) << ": unknown client op" << dendl
;
2128 logger
->tinc(code
, lat
);
2131 void Server::early_reply(MDRequestRef
& mdr
, CInode
*tracei
, CDentry
*tracedn
)
2133 if (!g_conf()->mds_early_reply
)
2136 if (mdr
->no_early_reply
) {
2137 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl
;
2141 if (mdr
->has_more() && mdr
->more()->has_journaled_peers
) {
2142 dout(10) << "early_reply - there are journaled peers, not allowed." << dendl
;
2146 if (mdr
->alloc_ino
) {
2147 dout(10) << "early_reply - allocated ino, not allowed" << dendl
;
2151 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
2152 entity_inst_t client_inst
= req
->get_source_inst();
2153 if (client_inst
.name
.is_mds())
2156 if (req
->is_replay()) {
2157 dout(10) << " no early reply on replay op" << dendl
;
2162 auto reply
= make_message
<MClientReply
>(*req
, 0);
2163 reply
->set_unsafe();
2165 // mark xlocks "done", indicating that we are exposing uncommitted changes.
2167 //_rename_finish() does not send dentry link/unlink message to replicas.
2168 // so do not set xlocks on dentries "done", the xlocks prevent dentries
2169 // that have projected linkages from getting new replica.
2170 mds
->locker
->set_xlocks_done(mdr
.get(), req
->get_op() == CEPH_MDS_OP_RENAME
);
2172 dout(10) << "early_reply " << reply
->get_result()
2173 << " (" << cpp_strerror(reply
->get_result())
2174 << ") " << *req
<< dendl
;
2176 if (tracei
|| tracedn
) {
2178 mdr
->cap_releases
.erase(tracei
->vino());
2180 mdr
->cap_releases
.erase(tracedn
->get_dir()->get_inode()->vino());
2182 set_trace_dist(reply
, tracei
, tracedn
, mdr
);
2185 reply
->set_extra_bl(mdr
->reply_extra_bl
);
2186 mds
->send_message_client(reply
, mdr
->session
);
2188 mdr
->did_early_reply
= true;
2190 mds
->logger
->inc(l_mds_reply
);
2191 utime_t lat
= ceph_clock_now() - req
->get_recv_stamp();
2192 mds
->logger
->tinc(l_mds_reply_latency
, lat
);
2193 if (lat
>= g_conf()->mds_op_complaint_time
) {
2194 mds
->logger
->inc(l_mds_slow_reply
);
2196 if (client_inst
.name
.is_client()) {
2197 mds
->sessionmap
.hit_session(mdr
->session
);
2199 perf_gather_op_latency(req
, lat
);
2200 dout(20) << "lat " << lat
<< dendl
;
2202 mdr
->mark_event("early_replied");
2207 * include a trace to tracei
2210 void Server::reply_client_request(MDRequestRef
& mdr
, const ref_t
<MClientReply
> &reply
)
2212 ceph_assert(mdr
.get());
2213 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
2215 dout(7) << "reply_client_request " << reply
->get_result()
2216 << " (" << cpp_strerror(reply
->get_result())
2217 << ") " << *req
<< dendl
;
2219 mdr
->mark_event("replying");
2221 Session
*session
= mdr
->session
;
2223 // note successful request in session map?
2225 // setfilelock requests are special, they only modify states in MDS memory.
2226 // The states get lost when MDS fails. If Client re-send a completed
2227 // setfilelock request, it means that client did not receive corresponding
2228 // setfilelock reply. So MDS should re-execute the setfilelock request.
2229 if (req
->may_write() && req
->get_op() != CEPH_MDS_OP_SETFILELOCK
&&
2230 reply
->get_result() == 0 && session
) {
2231 inodeno_t created
= mdr
->alloc_ino
? mdr
->alloc_ino
: mdr
->used_prealloc_ino
;
2232 session
->add_completed_request(mdr
->reqid
.tid
, created
);
2234 mdr
->ls
->touched_sessions
.insert(session
->info
.inst
.name
);
2238 // give any preallocated inos to the session
2239 apply_allocated_inos(mdr
, session
);
2241 // get tracei/tracedn from mdr?
2242 CInode
*tracei
= mdr
->tracei
;
2243 CDentry
*tracedn
= mdr
->tracedn
;
2245 bool is_replay
= mdr
->client_request
->is_replay();
2246 bool did_early_reply
= mdr
->did_early_reply
;
2247 entity_inst_t client_inst
= req
->get_source_inst();
2249 if (!did_early_reply
&& !is_replay
) {
2251 mds
->logger
->inc(l_mds_reply
);
2252 utime_t lat
= ceph_clock_now() - mdr
->client_request
->get_recv_stamp();
2253 mds
->logger
->tinc(l_mds_reply_latency
, lat
);
2254 if (lat
>= g_conf()->mds_op_complaint_time
) {
2255 mds
->logger
->inc(l_mds_slow_reply
);
2257 if (session
&& client_inst
.name
.is_client()) {
2258 mds
->sessionmap
.hit_session(session
);
2260 perf_gather_op_latency(req
, lat
);
2261 dout(20) << "lat " << lat
<< dendl
;
2264 mdr
->cap_releases
.erase(tracei
->vino());
2266 mdr
->cap_releases
.erase(tracedn
->get_dir()->get_inode()->vino());
2269 // drop non-rdlocks before replying, so that we can issue leases
2270 mdcache
->request_drop_non_rdlocks(mdr
);
2273 if (session
&& !client_inst
.name
.is_mds()) {
2275 if (!did_early_reply
&& // don't issue leases if we sent an earlier reply already
2276 (tracei
|| tracedn
)) {
2279 mdcache
->try_reconnect_cap(tracei
, session
);
2281 // include metadata in reply
2282 set_trace_dist(reply
, tracei
, tracedn
, mdr
);
2286 // We can set the extra bl unconditionally: if it's already been sent in the
2287 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
2288 reply
->set_extra_bl(mdr
->reply_extra_bl
);
2290 reply
->set_mdsmap_epoch(mds
->mdsmap
->get_epoch());
2291 mds
->send_message_client(reply
, session
);
2294 if (req
->is_queued_for_replay() &&
2295 (mdr
->has_completed
|| reply
->get_result() < 0)) {
2296 if (reply
->get_result() < 0) {
2297 int r
= reply
->get_result();
2298 derr
<< "reply_client_request: failed to replay " << *req
2299 << " error " << r
<< " (" << cpp_strerror(r
) << ")" << dendl
;
2300 mds
->clog
->warn() << "failed to replay " << req
->get_reqid() << " error " << r
;
2302 mds
->queue_one_replay();
2306 mdcache
->request_finish(mdr
);
2308 // take a closer look at tracei, if it happens to be a remote link
2311 tracedn
->get_projected_linkage()->is_remote()) {
2312 mdcache
->eval_remote(tracedn
);
2317 * pass inode OR dentry (not both, or we may get confused)
2319 * trace is in reverse order (i.e. root inode comes last)
2321 void Server::set_trace_dist(const ref_t
<MClientReply
> &reply
,
2322 CInode
*in
, CDentry
*dn
,
2325 // skip doing this for debugging purposes?
2326 if (g_conf()->mds_inject_traceless_reply_probability
&&
2327 mdr
->ls
&& !mdr
->o_trunc
&&
2328 (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability
* 10000.0)) {
2329 dout(5) << "deliberately skipping trace for " << *reply
<< dendl
;
2333 // inode, dentry, dir, ..., inode
2335 mds_rank_t whoami
= mds
->get_nodeid();
2336 Session
*session
= mdr
->session
;
2337 snapid_t snapid
= mdr
->snapid
;
2338 utime_t now
= ceph_clock_now();
2340 dout(20) << "set_trace_dist snapid " << snapid
<< dendl
;
2343 if (snapid
== CEPH_NOSNAP
) {
2346 realm
= in
->find_snaprealm();
2348 realm
= dn
->get_dir()->get_inode()->find_snaprealm();
2349 reply
->snapbl
= get_snap_trace(session
, realm
);
2350 dout(10) << "set_trace_dist snaprealm " << *realm
<< " len=" << reply
->snapbl
.length() << dendl
;
2355 reply
->head
.is_dentry
= 1;
2356 CDir
*dir
= dn
->get_dir();
2357 CInode
*diri
= dir
->get_inode();
2359 diri
->encode_inodestat(bl
, session
, NULL
, snapid
);
2360 dout(20) << "set_trace_dist added diri " << *diri
<< dendl
;
2362 #ifdef MDS_VERIFY_FRAGSTAT
2363 if (dir
->is_complete())
2364 dir
->verify_fragstat();
2367 ds
.frag
= dir
->get_frag();
2368 ds
.auth
= dir
->get_dir_auth().first
;
2369 if (dir
->is_auth() && !forward_all_requests_to_auth
)
2370 dir
->get_dist_spec(ds
.dist
, whoami
);
2372 dir
->encode_dirstat(bl
, session
->info
, ds
);
2373 dout(20) << "set_trace_dist added dir " << *dir
<< dendl
;
2375 encode(dn
->get_name(), bl
);
2376 mds
->locker
->issue_client_lease(dn
, in
, mdr
, now
, bl
);
2378 reply
->head
.is_dentry
= 0;
2382 in
->encode_inodestat(bl
, session
, NULL
, snapid
, 0, mdr
->getattr_caps
);
2383 dout(20) << "set_trace_dist added in " << *in
<< dendl
;
2384 reply
->head
.is_target
= 1;
2386 reply
->head
.is_target
= 0;
2388 reply
->set_trace(bl
);
2391 void Server::handle_client_request(const cref_t
<MClientRequest
> &req
)
2393 dout(4) << "handle_client_request " << *req
<< dendl
;
2396 mds
->logger
->inc(l_mds_request
);
2398 logger
->inc(l_mdss_handle_client_request
);
2400 if (!mdcache
->is_open()) {
2401 dout(5) << "waiting for root" << dendl
;
2402 mdcache
->wait_for_open(new C_MDS_RetryMessage(mds
, req
));
2406 bool sessionclosed_isok
= replay_unsafe_with_closed_session
;
2408 Session
*session
= 0;
2409 if (req
->is_a_client()) {
2410 session
= mds
->get_session(req
);
2412 dout(5) << "no session for " << req
->get_source() << ", dropping" << dendl
;
2413 } else if ((session
->is_closed() && (!mds
->is_clientreplay() || !sessionclosed_isok
)) ||
2414 session
->is_closing() ||
2415 session
->is_killing()) {
2416 dout(5) << "session closed|closing|killing, dropping" << dendl
;
2420 if (req
->is_queued_for_replay())
2421 mds
->queue_one_replay();
2427 if (req
->get_mdsmap_epoch() < mds
->mdsmap
->get_epoch()) {
2428 // send it? hrm, this isn't ideal; they may get a lot of copies if
2429 // they have a high request rate.
2432 // completed request?
2433 bool has_completed
= false;
2434 if (req
->is_replay() || req
->get_retry_attempt()) {
2435 ceph_assert(session
);
2437 if (session
->have_completed_request(req
->get_reqid().tid
, &created
)) {
2438 has_completed
= true;
2439 if (!session
->is_open())
2441 // Don't send traceless reply if the completed request has created
2442 // new inode. Treat the request as lookup request instead.
2443 if (req
->is_replay() ||
2444 ((created
== inodeno_t() || !mds
->is_clientreplay()) &&
2445 req
->get_op() != CEPH_MDS_OP_OPEN
&&
2446 req
->get_op() != CEPH_MDS_OP_CREATE
)) {
2447 dout(5) << "already completed " << req
->get_reqid() << dendl
;
2448 auto reply
= make_message
<MClientReply
>(*req
, 0);
2449 if (created
!= inodeno_t()) {
2451 encode(created
, extra
);
2452 reply
->set_extra_bl(extra
);
2454 mds
->send_message_client(reply
, session
);
2456 if (req
->is_queued_for_replay())
2457 mds
->queue_one_replay();
2461 if (req
->get_op() != CEPH_MDS_OP_OPEN
&&
2462 req
->get_op() != CEPH_MDS_OP_CREATE
) {
2463 dout(10) << " completed request which created new inode " << created
2464 << ", convert it to lookup request" << dendl
;
2465 req
->head
.op
= req
->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP
: CEPH_MDS_OP_GETATTR
;
2466 req
->head
.args
.getattr
.mask
= CEPH_STAT_CAP_INODE_ALL
;
2471 // trim completed_request list
2472 if (req
->get_oldest_client_tid() > 0) {
2473 dout(15) << " oldest_client_tid=" << req
->get_oldest_client_tid() << dendl
;
2474 ceph_assert(session
);
2475 if (session
->trim_completed_requests(req
->get_oldest_client_tid())) {
2476 // Sessions 'completed_requests' was dirtied, mark it to be
2477 // potentially flushed at segment expiry.
2478 mdlog
->get_current_segment()->touched_sessions
.insert(session
->info
.inst
.name
);
2480 if (session
->get_num_trim_requests_warnings() > 0 &&
2481 session
->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests
)
2482 session
->reset_num_trim_requests_warnings();
2484 if (session
->get_num_completed_requests() >=
2485 (g_conf()->mds_max_completed_requests
<< session
->get_num_trim_requests_warnings())) {
2486 session
->inc_num_trim_requests_warnings();
2487 CachedStackStringStream css
;
2488 *css
<< "client." << session
->get_client() << " does not advance its oldest_client_tid ("
2489 << req
->get_oldest_client_tid() << "), "
2490 << session
->get_num_completed_requests()
2491 << " completed requests recorded in session\n";
2492 mds
->clog
->warn() << css
->strv();
2493 dout(20) << __func__
<< " " << css
->strv() << dendl
;
2498 // register + dispatch
2499 MDRequestRef mdr
= mdcache
->request_start(req
);
2504 mdr
->session
= session
;
2505 session
->requests
.push_back(&mdr
->item_session_request
);
2509 mdr
->has_completed
= true;
2511 // process embedded cap releases?
2512 // (only if NOT replay!)
2513 if (!req
->releases
.empty() && req
->is_a_client() && !req
->is_replay()) {
2514 client_t client
= req
->get_source().num();
2515 for (const auto &r
: req
->releases
) {
2516 mds
->locker
->process_request_cap_release(mdr
, client
, r
.item
, r
.dname
);
2518 req
->releases
.clear();
2521 dispatch_client_request(mdr
);
2525 void Server::handle_osd_map()
2527 /* Note that we check the OSDMAP_FULL flag directly rather than
2528 * using osdmap_full_flag(), because we want to know "is the flag set"
2529 * rather than "does the flag apply to us?" */
2530 mds
->objecter
->with_osdmap([this](const OSDMap
& o
) {
2531 auto pi
= o
.get_pg_pool(mds
->get_metadata_pool());
2532 is_full
= pi
&& pi
->has_flag(pg_pool_t::FLAG_FULL
);
2533 dout(7) << __func__
<< ": full = " << is_full
<< " epoch = "
2534 << o
.get_epoch() << dendl
;
2538 void Server::dispatch_client_request(MDRequestRef
& mdr
)
2540 // we shouldn't be waiting on anyone.
2541 ceph_assert(!mdr
->has_more() || mdr
->more()->waiting_on_peer
.empty());
2544 dout(10) << "request " << *mdr
<< " was killed" << dendl
;
2545 //if the mdr is a "batch_op" and it has followers, pick a follower as
2546 //the new "head of the batch ops" and go on processing the new one.
2547 if (mdr
->is_batch_head()) {
2548 int mask
= mdr
->client_request
->head
.args
.getattr
.mask
;
2549 auto it
= mdr
->batch_op_map
->find(mask
);
2550 auto new_batch_head
= it
->second
->find_new_head();
2551 if (!new_batch_head
) {
2552 mdr
->batch_op_map
->erase(it
);
2555 mdr
= std::move(new_batch_head
);
2559 } else if (mdr
->aborted
) {
2560 mdr
->aborted
= false;
2561 mdcache
->request_kill(mdr
);
2565 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
2567 if (logger
) logger
->inc(l_mdss_dispatch_client_request
);
2569 dout(7) << "dispatch_client_request " << *req
<< dendl
;
2571 if (req
->may_write() && mdcache
->is_readonly()) {
2572 dout(10) << " read-only FS" << dendl
;
2573 respond_to_request(mdr
, -CEPHFS_EROFS
);
2576 if (mdr
->has_more() && mdr
->more()->peer_error
) {
2577 dout(10) << " got error from peers" << dendl
;
2578 respond_to_request(mdr
, mdr
->more()->peer_error
);
2583 CInode
*cur
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
2585 // the request is already responded to
2588 if (req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
2589 req
->get_op() == CEPH_MDS_OP_SETDIRLAYOUT
||
2590 req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
2591 req
->get_op() == CEPH_MDS_OP_RMXATTR
||
2592 req
->get_op() == CEPH_MDS_OP_SETXATTR
||
2593 req
->get_op() == CEPH_MDS_OP_CREATE
||
2594 req
->get_op() == CEPH_MDS_OP_SYMLINK
||
2595 req
->get_op() == CEPH_MDS_OP_MKSNAP
||
2596 ((req
->get_op() == CEPH_MDS_OP_LINK
||
2597 req
->get_op() == CEPH_MDS_OP_RENAME
) &&
2598 (!mdr
->has_more() || mdr
->more()->witnessed
.empty())) // haven't started peer request
2601 if (check_access(mdr
, cur
, MAY_FULL
)) {
2602 dout(20) << __func__
<< ": full, has FULL caps, permitting op " << ceph_mds_op_name(req
->get_op()) << dendl
;
2604 dout(20) << __func__
<< ": full, responding CEPHFS_ENOSPC to op " << ceph_mds_op_name(req
->get_op()) << dendl
;
2605 respond_to_request(mdr
, -CEPHFS_ENOSPC
);
2609 dout(20) << __func__
<< ": full, permitting op " << ceph_mds_op_name(req
->get_op()) << dendl
;
2613 switch (req
->get_op()) {
2614 case CEPH_MDS_OP_LOOKUPHASH
:
2615 case CEPH_MDS_OP_LOOKUPINO
:
2616 handle_client_lookup_ino(mdr
, false, false);
2618 case CEPH_MDS_OP_LOOKUPPARENT
:
2619 handle_client_lookup_ino(mdr
, true, false);
2621 case CEPH_MDS_OP_LOOKUPNAME
:
2622 handle_client_lookup_ino(mdr
, false, true);
2626 case CEPH_MDS_OP_LOOKUP
:
2627 handle_client_getattr(mdr
, true);
2630 case CEPH_MDS_OP_LOOKUPSNAP
:
2631 // lookupsnap does not reference a CDentry; treat it as a getattr
2632 case CEPH_MDS_OP_GETATTR
:
2633 handle_client_getattr(mdr
, false);
2635 case CEPH_MDS_OP_GETVXATTR
:
2636 handle_client_getvxattr(mdr
);
2639 case CEPH_MDS_OP_SETATTR
:
2640 handle_client_setattr(mdr
);
2642 case CEPH_MDS_OP_SETLAYOUT
:
2643 handle_client_setlayout(mdr
);
2645 case CEPH_MDS_OP_SETDIRLAYOUT
:
2646 handle_client_setdirlayout(mdr
);
2648 case CEPH_MDS_OP_SETXATTR
:
2649 handle_client_setxattr(mdr
);
2651 case CEPH_MDS_OP_RMXATTR
:
2652 handle_client_removexattr(mdr
);
2655 case CEPH_MDS_OP_READDIR
:
2656 handle_client_readdir(mdr
);
2659 case CEPH_MDS_OP_SETFILELOCK
:
2660 handle_client_file_setlock(mdr
);
2663 case CEPH_MDS_OP_GETFILELOCK
:
2664 handle_client_file_readlock(mdr
);
2668 case CEPH_MDS_OP_CREATE
:
2669 if (mdr
->has_completed
)
2670 handle_client_open(mdr
); // already created.. just open
2672 handle_client_openc(mdr
);
2675 case CEPH_MDS_OP_OPEN
:
2676 handle_client_open(mdr
);
2681 case CEPH_MDS_OP_MKNOD
:
2682 handle_client_mknod(mdr
);
2684 case CEPH_MDS_OP_LINK
:
2685 handle_client_link(mdr
);
2687 case CEPH_MDS_OP_UNLINK
:
2688 case CEPH_MDS_OP_RMDIR
:
2689 handle_client_unlink(mdr
);
2691 case CEPH_MDS_OP_RENAME
:
2692 handle_client_rename(mdr
);
2694 case CEPH_MDS_OP_MKDIR
:
2695 handle_client_mkdir(mdr
);
2697 case CEPH_MDS_OP_SYMLINK
:
2698 handle_client_symlink(mdr
);
2703 case CEPH_MDS_OP_LSSNAP
:
2704 handle_client_lssnap(mdr
);
2706 case CEPH_MDS_OP_MKSNAP
:
2707 handle_client_mksnap(mdr
);
2709 case CEPH_MDS_OP_RMSNAP
:
2710 handle_client_rmsnap(mdr
);
2712 case CEPH_MDS_OP_RENAMESNAP
:
2713 handle_client_renamesnap(mdr
);
2717 dout(1) << " unknown client op " << req
->get_op() << dendl
;
2718 respond_to_request(mdr
, -CEPHFS_EOPNOTSUPP
);
2723 // ---------------------------------------
2726 void Server::handle_peer_request(const cref_t
<MMDSPeerRequest
> &m
)
2728 dout(4) << "handle_peer_request " << m
->get_reqid() << " from " << m
->get_source() << dendl
;
2729 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2731 if (logger
) logger
->inc(l_mdss_handle_peer_request
);
2735 return handle_peer_request_reply(m
);
2737 // the purpose of rename notify is enforcing causal message ordering. making sure
2738 // bystanders have received all messages from rename srcdn's auth MDS.
2739 if (m
->get_op() == MMDSPeerRequest::OP_RENAMENOTIFY
) {
2740 auto reply
= make_message
<MMDSPeerRequest
>(m
->get_reqid(), m
->get_attempt(), MMDSPeerRequest::OP_RENAMENOTIFYACK
);
2741 mds
->send_message(reply
, m
->get_connection());
2745 CDentry
*straydn
= NULL
;
2746 if (m
->straybl
.length() > 0) {
2747 mdcache
->decode_replica_stray(straydn
, nullptr, m
->straybl
, from
);
2748 ceph_assert(straydn
);
2752 if (!mds
->is_clientreplay() && !mds
->is_active() && !mds
->is_stopping()) {
2753 dout(3) << "not clientreplay|active yet, waiting" << dendl
;
2754 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
2760 if (mdcache
->have_request(m
->get_reqid())) {
2762 mdr
= mdcache
->request_get(m
->get_reqid());
2764 // is my request newer?
2765 if (mdr
->attempt
> m
->get_attempt()) {
2766 dout(10) << "local request " << *mdr
<< " attempt " << mdr
->attempt
<< " > " << m
->get_attempt()
2767 << ", dropping " << *m
<< dendl
;
2771 if (mdr
->attempt
< m
->get_attempt()) {
2772 // mine is old, close it out
2773 dout(10) << "local request " << *mdr
<< " attempt " << mdr
->attempt
<< " < " << m
->get_attempt()
2774 << ", closing out" << dendl
;
2775 mdcache
->request_finish(mdr
);
2777 } else if (mdr
->peer_to_mds
!= from
) {
2778 dout(10) << "local request " << *mdr
<< " not peer to mds." << from
<< dendl
;
2782 // may get these while mdr->peer_request is non-null
2783 if (m
->get_op() == MMDSPeerRequest::OP_DROPLOCKS
) {
2784 mds
->locker
->drop_locks(mdr
.get());
2787 if (m
->get_op() == MMDSPeerRequest::OP_FINISH
) {
2788 if (m
->is_abort()) {
2789 mdr
->aborted
= true;
2790 if (mdr
->peer_request
) {
2791 // only abort on-going xlock, wrlock and auth pin
2792 ceph_assert(!mdr
->peer_did_prepare());
2794 mdcache
->request_finish(mdr
);
2797 if (m
->inode_export
.length() > 0)
2798 mdr
->more()->inode_import
= m
->inode_export
;
2799 // finish off request.
2800 mdcache
->request_finish(mdr
);
2807 if (m
->get_op() == MMDSPeerRequest::OP_FINISH
) {
2808 dout(10) << "missing peer request for " << m
->get_reqid()
2809 << " OP_FINISH, must have lost race with a forward" << dendl
;
2812 mdr
= mdcache
->request_start_peer(m
->get_reqid(), m
->get_attempt(), m
);
2813 mdr
->set_op_stamp(m
->op_stamp
);
2815 ceph_assert(mdr
->peer_request
== 0); // only one at a time, please!
2819 mdr
->straydn
= straydn
;
2822 if (mds
->is_clientreplay() && !mds
->mdsmap
->is_clientreplay(from
) &&
2823 mdr
->locks
.empty()) {
2824 dout(3) << "not active yet, waiting" << dendl
;
2825 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
2829 mdr
->reset_peer_request(m
);
2831 dispatch_peer_request(mdr
);
// Handle an ack/reply sent back by a peer MDS for a peer request this MDS
// (acting as leader) issued earlier.  Dispatches on the reply op: lock acks
// record the remote lock and resume the waiting leader request; the
// *PREPACK / RENAMENOTIFYACK ops are routed to the matching sub-operation
// handlers.
// NOTE(review): this excerpt omits some original lines (closing braces,
// early returns, breaks) — formatting reconstructed; confirm against the
// full source file.
void Server::handle_peer_request_reply(const cref_t<MMDSPeerRequest> &m)
  mds_rank_t from = mds_rank_t(m->get_source().num());

  // Not clientreplay/active/stopping yet: drop replies for requests we no
  // longer track as uncommitted leader ops; otherwise retry after replay.
  if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
    metareqid_t r = m->get_reqid();
    if (!mdcache->have_uncommitted_leader(r, from)) {
      dout(10) << "handle_peer_request_reply ignoring peer reply from mds."
	       << from << " reqid " << r << dendl;
    dout(3) << "not clientreplay|active yet, waiting" << dendl;
    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));

  // OP_FINISH-style commit notification: record that the peer committed.
  if (m->get_op() == MMDSPeerRequest::OP_COMMITTED) {
    metareqid_t r = m->get_reqid();
    mdcache->committed_leader_peer(r, from);

  MDRequestRef mdr = mdcache->request_get(m->get_reqid());
  // Replies from a previous attempt of the same reqid are stale; ignore.
  if (m->get_attempt() != mdr->attempt) {
    dout(10) << "handle_peer_request_reply " << *mdr << " ignoring reply from other attempt "
	     << m->get_attempt() << dendl;

  switch (m->get_op()) {
  case MMDSPeerRequest::OP_XLOCKACK:
      // identify lock, leader request
      SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
					       m->get_object_info());
      mdr->more()->peers.insert(from);
      lock->decode_locked_state(m->get_lock_data());
      dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
      // Record the xlock locally and resume the leader request.
      mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
      mdr->finish_locking(lock);
      lock->get_xlock(mdr, mdr->get_client());

      // Remote lock requests are issued one peer at a time.
      ceph_assert(mdr->more()->waiting_on_peer.count(from));
      mdr->more()->waiting_on_peer.erase(from);
      ceph_assert(mdr->more()->waiting_on_peer.empty());
      mdcache->dispatch_request(mdr);

  case MMDSPeerRequest::OP_WRLOCKACK:
      // identify lock, leader request
      SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
					       m->get_object_info());
      mdr->more()->peers.insert(from);
      dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
      auto it = mdr->emplace_lock(lock, MutationImpl::LockOp::REMOTE_WRLOCK, from);
      ceph_assert(it->is_remote_wrlock());
      ceph_assert(it->wrlock_target == from);

      mdr->finish_locking(lock);

      ceph_assert(mdr->more()->waiting_on_peer.count(from));
      mdr->more()->waiting_on_peer.erase(from);
      ceph_assert(mdr->more()->waiting_on_peer.empty());
      mdcache->dispatch_request(mdr);

  case MMDSPeerRequest::OP_AUTHPINACK:
    handle_peer_auth_pin_ack(mdr, m);

  case MMDSPeerRequest::OP_LINKPREPACK:
    handle_peer_link_prep_ack(mdr, m);

  case MMDSPeerRequest::OP_RMDIRPREPACK:
    handle_peer_rmdir_prep_ack(mdr, m);

  case MMDSPeerRequest::OP_RENAMEPREPACK:
    handle_peer_rename_prep_ack(mdr, m);

  case MMDSPeerRequest::OP_RENAMENOTIFYACK:
    handle_peer_rename_notify_ack(mdr, m);

    ceph_abort_msg("unknown op " + to_string(m->get_op()) + " requested");
// Execute a peer (slave) request on behalf of a leader MDS.  Dispatches on
// the requested op: take/release locks for the leader, auth-pin objects, or
// run the prep phase of link/unlink/rmdir/rename sub-operations.
// NOTE(review): this excerpt omits some original lines (switch headers,
// braces, returns, breaks) — formatting reconstructed; confirm against the
// full source file.
void Server::dispatch_peer_request(MDRequestRef& mdr)
  dout(7) << "dispatch_peer_request " << *mdr << " " << *mdr->peer_request << dendl;

    // Leader asked us to abort: just tear the request down.
    dout(7) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);

  if (logger) logger->inc(l_mdss_dispatch_peer_request);

  int op = mdr->peer_request->get_op();
  case MMDSPeerRequest::OP_XLOCK:
  case MMDSPeerRequest::OP_WRLOCK:
      SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
					       mdr->peer_request->get_object_info());
	dout(10) << "don't have object, dropping" << dendl;
	ceph_abort_msg("don't have object"); // can this happen, if we auth pinned properly.
      if (op == MMDSPeerRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
	dout(10) << "not auth for remote xlock attempt, dropping on "
		 << *lock << " on " << *lock->get_parent() << dendl;
	// use acquire_locks so that we get auth_pinning.
	MutationImpl::LockOpVec lov;
	// Re-add the locks this request already holds, then the new one.
	for (const auto& p : mdr->locks) {
	    lov.add_xlock(p.lock);
	  else if (p.is_wrlock())
	    lov.add_wrlock(p.lock);

	case MMDSPeerRequest::OP_XLOCK:
	  lov.add_xlock(lock);
	  replycode = MMDSPeerRequest::OP_XLOCKACK;
	case MMDSPeerRequest::OP_WRLOCK:
	  lov.add_wrlock(lock);
	  replycode = MMDSPeerRequest::OP_WRLOCKACK;

	if (!mds->locker->acquire_locks(mdr, lov))

	// Ack back to the leader; for xlocks also ship the locked state.
	auto r = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, replycode);
	r->set_lock_type(lock->get_type());
	lock->get_parent()->set_object_info(r->get_object_info());
	if (replycode == MMDSPeerRequest::OP_XLOCKACK)
	  lock->encode_locked_state(r->get_lock_data());
	mds->send_message(r, mdr->peer_request->get_connection());

      mdr->reset_peer_request();

  case MMDSPeerRequest::OP_UNXLOCK:
  case MMDSPeerRequest::OP_UNWRLOCK:
      SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
					       mdr->peer_request->get_object_info());
      auto it = mdr->locks.find(lock);
      ceph_assert(it != mdr->locks.end());
      bool need_issue = false;
      case MMDSPeerRequest::OP_UNXLOCK:
	mds->locker->xlock_finish(it, mdr.get(), &need_issue);
      case MMDSPeerRequest::OP_UNWRLOCK:
	mds->locker->wrlock_finish(it, mdr.get(), &need_issue);
	mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));
      // done. no ack necessary.
      mdr->reset_peer_request();

  case MMDSPeerRequest::OP_AUTHPIN:
    handle_peer_auth_pin(mdr);

  case MMDSPeerRequest::OP_LINKPREP:
  case MMDSPeerRequest::OP_UNLINKPREP:
    handle_peer_link_prep(mdr);

  case MMDSPeerRequest::OP_RMDIRPREP:
    handle_peer_rmdir_prep(mdr);

  case MMDSPeerRequest::OP_RENAMEPREP:
    handle_peer_rename_prep(mdr);

    ceph_abort_msg("unknown op "+ to_string(op)+ " received");
// Auth-pin, on behalf of a leader MDS, the set of objects listed in the
// peer request; optionally freeze one inode (rename source).  Replies with
// OP_AUTHPINACK carrying the pins actually taken, or with rofs/wouldblock
// error marks.
// NOTE(review): this excerpt omits some original lines (braces, early
// returns, some conditionals) — formatting reconstructed; confirm against
// the full source file.
void Server::handle_peer_auth_pin(MDRequestRef& mdr)
  dout(10) << "handle_peer_auth_pin " << *mdr << dendl;

  // build list of objects
  list<MDSCacheObject*> objects;
  CInode *auth_pin_freeze = NULL;
  bool nonblocking = mdr->peer_request->is_nonblocking();
  bool fail = false, wouldblock = false, readonly = false;
  ref_t<MMDSPeerRequest> reply;

  if (mdcache->is_readonly()) {
    dout(10) << " read-only FS" << dendl;

    // Resolve each requested object-info to a cached object.
    for (const auto &oi : mdr->peer_request->get_authpins()) {
      MDSCacheObject *object = mdcache->get_object(oi);
	dout(10) << " don't have " << oi << dendl;

      objects.push_back(object);
      if (oi == mdr->peer_request->get_authpin_freeze())
	auth_pin_freeze = static_cast<CInode*>(object);

  // can we auth pin them?
    for (const auto& obj : objects) {
      if (!obj->is_auth()) {
	dout(10) << " not auth for " << *obj << dendl;
      if (mdr->is_auth_pinned(obj))
      if (!mdr->can_auth_pin(obj)) {
	  // nonblocking caller: report wouldblock instead of waiting.
	  dout(10) << " can't auth_pin (freezing?) " << *obj << " nonblocking" << dendl;
	  dout(10) << " waiting for authpinnable on " << *obj << dendl;
	  obj->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
	  mdr->drop_local_auth_pins();
	  mds->locker->notify_freeze_waiter(obj);

    /* freeze authpin wrong inode */
    if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
	mdr->more()->rename_inode != auth_pin_freeze)
      mdr->unfreeze_auth_pin(true);

    /* handle_peer_rename_prep() call freeze_inode() to wait for all other operations
     * on the source inode to complete. This happens after all locks for the rename
     * operation are acquired. But to acquire locks, we need auth pin locks' parent
     * objects first. So there is an ABBA deadlock if someone auth pins the source inode
     * after locks are acquired and before Server::handle_peer_rename_prep() is called.
     * The solution is freeze the inode and prevent other MDRequests from getting new
     */
    if (auth_pin_freeze) {
      dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
      if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
	auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
	mds->mdlog->flush();

  reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);

    // Failure path: release anything we pinned and mark the error.
    mdr->drop_local_auth_pins();  // just in case
      reply->mark_error_rofs();
      reply->mark_error_wouldblock();

    // Success path: take the pins.
    for (const auto& obj : objects) {
      dout(10) << "auth_pinning " << *obj << dendl;

    // return list of my auth_pins (if any)
    for (const auto &p : mdr->object_states) {
      if (!p.second.auth_pinned)
      MDSCacheObjectInfo info;
      p.first->set_object_info(info);
      reply->get_authpins().push_back(info);
      if (p.first == (MDSCacheObject*)auth_pin_freeze)
	auth_pin_freeze->set_object_info(reply->get_authpin_freeze());

  mds->send_message_mds(reply, mdr->peer_to_mds);

  // clean up this request
  mdr->reset_peer_request();

  // Blocked while the leader asked to be told about blocking: send a
  // req_blocked ack so the leader can drop its locks.
  if (mdr->peer_request->should_notify_blocking()) {
    reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);
    reply->mark_req_blocked();
    mds->send_message_mds(reply, mdr->peer_to_mds);
    mdr->peer_request->clear_notify_blocking();
// Leader-side handling of an OP_AUTHPINACK from a peer: record which remote
// objects are now pinned (and possibly frozen), detect pins the peer
// dropped, note any error marks, and re-dispatch the request once no peers
// are outstanding.
// NOTE(review): this excerpt omits some original lines (braces, continues,
// returns) — formatting reconstructed; confirm against the full source file.
void Server::handle_peer_auth_pin_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
  dout(10) << "handle_peer_auth_pin_ack on " << *mdr << " " << *ack << dendl;
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  if (ack->is_req_blocked()) {
    mdr->disable_lock_cache();
    // peer auth pin is blocked, drop locks to avoid deadlock
    mds->locker->drop_locks(mdr.get(), nullptr);

  // Record every pin the peer reports.
  set<MDSCacheObject*> pinned;
  for (const auto &oi : ack->get_authpins()) {
    MDSCacheObject *object = mdcache->get_object(oi);
    ceph_assert(object);  // we pinned it
    dout(10) << " remote has pinned " << *object << dendl;
    mdr->set_remote_auth_pinned(object, from);
    if (oi == ack->get_authpin_freeze())
      mdr->set_remote_frozen_auth_pin(static_cast<CInode*>(object));
    pinned.insert(object);

  // removed frozen auth pin ?
  if (mdr->more()->is_remote_frozen_authpin &&
      ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
    auto stat_p = mdr->find_object_state(mdr->more()->rename_inode);
    ceph_assert(stat_p);
    if (stat_p->remote_auth_pinned == from) {
      mdr->more()->is_remote_frozen_authpin = false;

  // removed auth pins?
  for (auto& p : mdr->object_states) {
    if (p.second.remote_auth_pinned == MDS_RANK_NONE)
    MDSCacheObject* object = p.first;
    if (p.second.remote_auth_pinned == from && pinned.count(object) == 0) {
      dout(10) << " remote has unpinned " << *object << dendl;
      mdr->_clear_remote_auth_pinned(p.second);

  mdr->more()->peers.insert(from);

  // clear from waiting list
  auto ret = mdr->more()->waiting_on_peer.erase(from);

  // Propagate peer-side errors to the leader request.
  if (ack->is_error_rofs()) {
    mdr->more()->peer_error = -CEPHFS_EROFS;
  } else if (ack->is_error_wouldblock()) {
    mdr->more()->peer_error = -CEPHFS_EWOULDBLOCK;

  // Resume only after all peers have answered.
  if (mdr->more()->waiting_on_peer.empty())
    mdcache->dispatch_request(mdr);
    dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
// ---------------------------------------

/**
 * check whether we are permitted to complete a request
 *
 * Check whether we have permission to perform the operation specified
 * by mask on the given inode, based on the capability in the mdr's
 * session.  Responds to the request itself on failure.
 */
// NOTE(review): this excerpt omits some original lines (the first
// check_access() arguments and the surrounding braces/returns) —
// formatting reconstructed; confirm against the full source file.
bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
  // Delegate to the session's capability check, using the caller's
  // uid/gid(s) and the setattr target uid/gid from the request head.
  int r = mdr->session->check_access(
    mdr->client_request->get_caller_uid(),
    mdr->client_request->get_caller_gid(),
    &mdr->client_request->get_caller_gid_list(),
    mdr->client_request->head.args.setattr.uid,
    mdr->client_request->head.args.setattr.gid);
    respond_to_request(mdr, r);
/**
 * check whether fragment has reached maximum size
 *
 * Responds to the request with CEPHFS_ENOSPC when the dirfrag is at or
 * over bal_fragment_size_max.
 */
// NOTE(review): excerpt omits the enclosing if/return lines — formatting
// reconstructed; confirm against the full source file.
bool Server::check_fragment_space(MDRequestRef &mdr, CDir *dir)
  const auto size = dir->get_frag_size();
  const auto max = bal_fragment_size_max;
    dout(10) << "fragment " << *dir << " size exceeds " << max << " (CEPHFS_ENOSPC)" << dendl;
    respond_to_request(mdr, -CEPHFS_ENOSPC);
  dout(20) << "fragment " << *dir << " size " << size << " < " << max << dendl;
/**
 * check whether entries in a dir reached maximum size
 *
 * Counts projected nfiles + nsubdirs; responds with ENOSPC when the
 * configured dir_max_entries limit is hit.
 */
// NOTE(review): excerpt omits trailing return/braces — formatting
// reconstructed; confirm against the full source file.
bool Server::check_dir_max_entries(MDRequestRef &mdr, CDir *in)
  const uint64_t size = in->inode->get_projected_inode()->dirstat.nfiles +
			in->inode->get_projected_inode()->dirstat.nsubdirs;
  if (dir_max_entries && size >= dir_max_entries) {
    dout(10) << "entries per dir " << *in << " size exceeds " << dir_max_entries << " (ENOSPC)" << dendl;
    respond_to_request(mdr, -ENOSPC);
// Look up (or create as a null dentry) the stray dentry under which `in`
// will be relinked when it is unlinked/removed; caches the result on
// mdr->straydn.  Returns null when the request must wait (frozen stray dir
// or full fragment; mdr has been re-queued/responded in that case).
// NOTE(review): excerpt omits some braces/early returns — formatting
// reconstructed; confirm against the full source file.
CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
  in->name_stray_dentry(straydname);

  // Reuse a previously prepared stray dentry if the request already has one.
  CDentry *straydn = mdr->straydn;
    ceph_assert(straydn->get_name() == straydname);

  CDir *straydir = mdcache->get_stray_dir(in);

  // Replayed requests skip the fragment-size check.
  if (!mdr->client_request->is_replay() &&
      !check_fragment_space(mdr, straydir))

  straydn = straydir->lookup(straydname);
    if (straydir->is_frozen_dir()) {
      // Can't add a dentry to a frozen dir; drop locks/pins and retry later.
      dout(10) << __func__ << ": " << *straydir << " is frozen, waiting" << dendl;
      mds->locker->drop_locks(mdr.get());
      mdr->drop_local_auth_pins();
      straydir->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
    straydn = straydir->add_null_dentry(straydname);
    straydn->mark_new();
    ceph_assert(straydn->get_projected_linkage()->is_null());

  straydn->state_set(CDentry::STATE_STRAY);
  mdr->straydn = straydn;
/** prepare_new_inode
 *
 * create a new inode. set c/m/atime. hit dir pop.
 *
 * Allocates an ino (from the session's preallocated set when possible,
 * otherwise from the inotable), initializes mode/ownership/timestamps/
 * layout, applies client-supplied xattrs and fscrypt fields, and adds the
 * inode to the cache.
 */
// NOTE(review): excerpt omits some braces/else lines — formatting
// reconstructed; confirm against the full source file.
CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
				  const file_layout_t *layout)
  CInode *in = new CInode(mdcache);
  auto _inode = in->_get_inode();

  // Server::prepare_force_open_sessions() can re-open session in closing
  // state. In that corner case, session's prealloc_inos are being freed.
  // To simplify the code, we disallow using/refilling session's prealloc_ino
  // while session is opening.
  bool allow_prealloc_inos = mdr->session->is_open();

  // Prefer a preallocated ino from the session; fall back to the inotable.
  if (allow_prealloc_inos && (mdr->used_prealloc_ino = _inode->ino = mdr->session->take_ino(useino))) {
    mds->sessionmap.mark_projected(mdr->session);
    dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
	     << " (" << mdr->session->info.prealloc_inos.size() << " left)"
    _inode->ino = mds->inotable->project_alloc_id(useino);
    dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;

  if (useino && useino != _inode->ino) {
    dout(0) << "WARNING: client specified " << useino << " and i allocated " << _inode->ino << dendl;
    mds->clog->error() << mdr->client_request->get_source()
		       << " specified ino " << useino
		       << " but mds." << mds->get_nodeid() << " allocated " << _inode->ino;
    //ceph_abort(); // just for now.

  // Refill the session's prealloc pool when it runs below half capacity.
  if (allow_prealloc_inos &&
      mdr->session->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos / 2) {
    int need = g_conf()->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
    mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
    ceph_assert(mdr->prealloc_inos.size());  // or else fix projected increment semantics
    mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
    mds->sessionmap.mark_projected(mdr->session);
    dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;

  _inode->version = 1;
  _inode->xattr_version = 1;
  _inode->nlink = 1;   // FIXME

  _inode->mode = mode;

  // FIPS zeroization audit 20191117: this memset is not security related.
  memset(&_inode->dir_layout, 0, sizeof(_inode->dir_layout));
  if (_inode->is_dir()) {
    _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
  } else if (layout) {
    _inode->layout = *layout;
    _inode->layout = mdcache->default_file_layout;

  _inode->truncate_size = -1ull;  // not truncated, yet!
  _inode->truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */

  CInode *diri = dir->get_inode();
  auto pip = diri->get_projected_inode();

  dout(10) << oct << " dir mode 0" << pip->mode << " new mode 0" << mode << dec << dendl;

  // setgid directory: new child inherits the gid, and a new subdir
  // inherits the setgid bit too.
  if (pip->mode & S_ISGID) {
    dout(10) << " dir is sticky" << dendl;
    _inode->gid = pip->gid;
    if (S_ISDIR(mode)) {
      dout(10) << " new dir also sticky" << dendl;
      _inode->mode |= S_ISGID;
    _inode->gid = mdr->client_request->get_caller_gid();

  _inode->uid = mdr->client_request->get_caller_uid();

  _inode->btime = _inode->ctime = _inode->mtime = _inode->atime =
    mdr->get_op_stamp();

  _inode->change_attr = 0;

  const cref_t<MClientRequest> &req = mdr->client_request;

  dout(10) << "copying fscrypt_auth len " << req->fscrypt_auth.size() << dendl;
  _inode->fscrypt_auth = req->fscrypt_auth;
  _inode->fscrypt_file = req->fscrypt_file;

  if (req->get_data().length()) {
    auto p = req->get_data().cbegin();

    // xattrs on new inode?
    auto _xattrs = CInode::allocate_xattr_map();
    decode_noshare(*_xattrs, p);
    dout(10) << "prepare_new_inode setting xattrs " << *_xattrs << dendl;
    in->reset_xattrs(std::move(_xattrs));

  // Disable inline data unless both the mdsmap and the client support it.
  if (!mds->mdsmap->get_inline_data_enabled() ||
      !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
    _inode->inline_data.version = CEPH_INLINE_NONE;

  mdcache->add_inode(in);  // add
  dout(10) << "prepare_new_inode " << *in << dendl;
// Record this request's ino allocations (table alloc and/or used prealloc)
// plus the projected sessionmap/inotable versions into the journal blob.
// NOTE(review): excerpt omits one set_ino_alloc argument line (prealloc
// inos) — formatting reconstructed; confirm against the full source file.
void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
  dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
	   << " inotablev " << mds->inotable->get_projected_version()
  blob->set_ino_alloc(mdr->alloc_ino,
		      mdr->used_prealloc_ino,
		      mdr->client_request->get_source(),
		      mds->sessionmap.get_projected(),
		      mds->inotable->get_projected_version());
// Commit the ino allocations projected during request preparation: apply
// table allocations, move preallocated inos from pending to the session's
// free/info sets, and consume any used preallocated ino.  Marks the
// session dirty accordingly.
// NOTE(review): excerpt omits a few braces — formatting reconstructed;
// confirm against the full source file.
void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
  dout(10) << "apply_allocated_inos " << mdr->alloc_ino
	   << " / " << mdr->prealloc_inos
	   << " / " << mdr->used_prealloc_ino << dendl;

  if (mdr->alloc_ino) {
    mds->inotable->apply_alloc_id(mdr->alloc_ino);
  if (mdr->prealloc_inos.size()) {
    ceph_assert(session);
    session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
    session->free_prealloc_inos.insert(mdr->prealloc_inos);
    session->info.prealloc_inos.insert(mdr->prealloc_inos);
    // Only force a sessionmap write here if no used_prealloc_ino will do it.
    mds->sessionmap.mark_dirty(session, !mdr->used_prealloc_ino);
    mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
  if (mdr->used_prealloc_ino) {
    ceph_assert(session);
    session->info.prealloc_inos.erase(mdr->used_prealloc_ino);
    mds->sessionmap.mark_dirty(session);
// Completion context: once an open_ino attempt finishes, re-enter
// Server::_try_open_ino with the result for the saved request/ino.
// NOTE(review): excerpt omits the member declarations (mdr, ino) —
// formatting reconstructed; confirm against the full source file.
struct C_MDS_TryOpenInode : public ServerContext {
  C_MDS_TryOpenInode(Server *s, MDRequestRef& r, inodeno_t i) :
    ServerContext(s), mdr(r), ino(i) {}
  void finish(int r) override {
    server->_try_open_ino(mdr, r, ino);
// Continuation after an open_ino attempt: a non-negative `r` names the
// authoritative rank (dispatch locally or forward there); a negative `r`
// is an error code returned to the client.
// NOTE(review): excerpt omits surrounding braces/returns and the success
// condition — formatting reconstructed; confirm against the full source
// file.
void Server::_try_open_ino(MDRequestRef& mdr, int r, inodeno_t ino)
  dout(10) << "_try_open_ino " << mdr.get() << " ino " << ino << " r=" << r << dendl;

  // `r` is a rank if >=0, else an error code
    mds_rank_t dest_rank(r);
    if (dest_rank == mds->get_nodeid())
      dispatch_client_request(mdr);
      mdcache->request_forward(mdr, dest_rank);

  if (r == -CEPHFS_ENOENT || r == -CEPHFS_ENODATA)
  respond_to_request(mdr, r);
// Completion context for find_ino_peers(): on CEPHFS_ESTALE it falls back
// to open_ino() (unless the inode is being purged), otherwise it re-runs
// the client request.
// NOTE(review): excerpt omits member declarations and some braces/returns —
// formatting reconstructed; confirm against the full source file.
class C_MDS_TryFindInode : public ServerContext {
  C_MDS_TryFindInode(Server *s, MDRequestRef& r, MDCache *m, inodeno_t i) :
    ServerContext(s), mdr(r), mdcache(m), ino(i) {}
  void finish(int r) override {
    if (r == -CEPHFS_ESTALE) { // :( find_ino_peers failed
      /*
       * There has one case that when the MDS crashes and the
       * openfiletable journal couldn't be flushed and then
       * the replacing MDS is possibly won't load some already
       * opened CInodes into the MDCache. And if the clients
       * will retry some requests after reconnected, the MDS
       * will return -ESTALE after failing to find the ino in
       * the cache.
       *
       * As a workaround users can run `ls -R ${mountpoint}`
       * to list all the sub-files or sub-direcotries from the
       * mountpoint.
       *
       * We need try to open the ino and try it again.
       */
      CInode *in = mdcache->get_inode(ino);
      if (in && in->state_test(CInode::STATE_PURGING))
	server->respond_to_request(mdr, r);
      mdcache->open_ino(ino, (int64_t)-1, new C_MDS_TryOpenInode(server, mdr, ino));
      server->dispatch_client_request(mdr);
/* If this returns null, the request has been handled
 * as appropriate: forwarded on, or the client's been replied to.
 *
 * Traverses mdr's filepath, taking the traversal rdlocks (and WANT_AUTH
 * unless suppressed), pins the resulting inode, and guards against
 * frozen/freezing targets.
 */
// NOTE(review): excerpt omits part of the signature (extra parameters),
// some braces and returns — formatting reconstructed; confirm against the
// full source file.
CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr,
  const filepath& refpath = mdr->get_filepath();
  dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;

  // Path already locked by a previous pass; nothing to traverse.
  if (mdr->locking_state & MutationImpl::PATH_LOCKED)

  CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
  if (refpath.is_last_snap()) {
  if (!no_want_auth && forward_all_requests_to_auth)
    flags |= MDS_TRAVERSE_RDLOCK_PATH | MDS_TRAVERSE_RDLOCK_SNAP;
    flags |= MDS_TRAVERSE_WANT_AUTH;
  int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0], &mdr->in[0]);
    return nullptr; // delayed
  if (r < 0) {  // error
    if (r == -CEPHFS_ENOENT && !mdr->dn[0].empty()) {
      // Client asked for the dentry: include the (null) trace dentry.
      if (mdr->client_request &&
	  mdr->client_request->get_dentry_wanted())
	mdr->tracedn = mdr->dn[0].back();
      respond_to_request(mdr, r);
    } else if (r == -CEPHFS_ESTALE) {
      dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
      inodeno_t ino = refpath.get_ino();
      mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
      dout(10) << "FAIL on error " << r << dendl;
      respond_to_request(mdr, r);

  CInode *ref = mdr->in[0];
  dout(10) << "ref is " << *ref << dendl;

  // do NOT proceed if freezing, as cap release may defer in that case, and
  // we could deadlock when we try to lock @ref.
  // if we're already auth_pinned, continue; the release has already been processed.
  if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
      (ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
    dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
    ref->add_waiter(CInode::WAIT_UNFREEZE, cf.build());
    if (mdr->is_any_remote_auth_pin())
      mds->locker->notify_freeze_waiter(ref);
/** rdlock_path_xlock_dentry
 * traverse path to the directory that could/would contain dentry.
 * make sure i am auth for that dentry (or target inode if it exists and authexist),
 * forward as necessary. create null dentry in place (or use existing if okexist).
 * get rdlocks on traversed dentries, xlock on new dentry.
 *
 * set authexist true if caller requires the target inode to be auth when it exists.
 * the tail dentry is not always auth any more if authexist because it is impossible
 * to ensure tail dentry and target inode are both auth in one mds. the tail dentry
 * will not be xlocked too if authexist and the target inode exists.
 */
// NOTE(review): excerpt omits part of the signature (trailing parameters)
// and some braces/returns — formatting reconstructed; confirm against the
// full source file.
CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr,
					  bool create, bool okexist, bool authexist,
  const filepath& refpath = mdr->get_filepath();
  dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;

  if (mdr->locking_state & MutationImpl::PATH_LOCKED)
    return mdr->dn[0].back();

  // figure parent dir vs dname
  if (refpath.depth() == 0) {
    dout(7) << "invalid path (zero length)" << dendl;
    respond_to_request(mdr, -CEPHFS_EINVAL);

  // Snapshots are read-only targets.
  if (refpath.is_last_snap()) {
    respond_to_request(mdr, -CEPHFS_EROFS);

  if (refpath.is_last_dot_or_dotdot()) {
    dout(7) << "invalid path (last dot or dot_dot)" << dendl;
      respond_to_request(mdr, -CEPHFS_EEXIST);
      respond_to_request(mdr, -CEPHFS_ENOTEMPTY);

  // traverse to parent dir
  CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
  int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_PATH |
	      MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_XLOCK_DENTRY |
	      MDS_TRAVERSE_WANT_AUTH;
  if (refpath.depth() == 1 && !mdr->lock_cache_disabled)
    flags |= MDS_TRAVERSE_CHECK_LOCKCACHE;
    flags |= MDS_TRAVERSE_RDLOCK_AUTHLOCK;
    flags |= MDS_TRAVERSE_WANT_INODE;
    flags |= MDS_TRAVERSE_WANT_DIRLAYOUT;
  int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
    return nullptr; // delayed
  if (r == -CEPHFS_ESTALE) {
    dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
    inodeno_t ino = refpath.get_ino();
    mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
    respond_to_request(mdr, r);

  CDentry *dn = mdr->dn[0].back();
  CDir *dir = dn->get_dir();
  CInode *diri = dir->get_inode();

  // Client-originated requests may not modify system dirs or stray parents.
  if (!mdr->reqid.name.is_mds()) {
    if (diri->is_system() && !diri->is_root()) {
      respond_to_request(mdr, -CEPHFS_EROFS);

  if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
    respond_to_request(mdr, -CEPHFS_ENOENT);

  CDentry::linkage_t *dnl = dn->get_projected_linkage();
  if (dnl->is_null()) {
    if (!create && okexist) {
      respond_to_request(mdr, -CEPHFS_ENOENT);

    // New dentry begins no earlier than the next global snap seq.
    snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
    dn->first = std::max(dn->first, next_snap);
      respond_to_request(mdr, -CEPHFS_EEXIST);
    mdr->in[0] = dnl->get_inode();
/** rdlock_two_paths_xlock_destdn
 * traverse two paths and lock the two paths in proper order.
 * The order of taking locks is:
 * 1. Lock directory inodes or dentries according to which trees they
 *    are under. Lock objects under fs root before objects under mdsdir.
 * 2. Lock directory inodes or dentries according to their depth, in
 *    ascending order.
 * 3. Lock directory inodes or dentries according to inode numbers or
 *    dentries' parent inode numbers, in ascending order.
 * 4. Lock dentries in the same directory in order of their keys.
 * 5. Lock non-directory inodes according to inode numbers, in ascending
 *    order.
 */
// NOTE(review): excerpt omits some braces/returns and a few conditional
// lines — formatting reconstructed; confirm against the full source file.
std::pair<CDentry*, CDentry*>
Server::rdlock_two_paths_xlock_destdn(MDRequestRef& mdr, bool xlock_srcdn)
  const filepath& refpath = mdr->get_filepath();
  const filepath& refpath2 = mdr->get_filepath2();

  dout(10) << "rdlock_two_paths_xlock_destdn " << *mdr << " " << refpath << " " << refpath2 << dendl;

  if (mdr->locking_state & MutationImpl::PATH_LOCKED)
    return std::make_pair(mdr->dn[0].back(), mdr->dn[1].back());

  if (refpath.depth() != 1 || refpath2.depth() != 1) {
    respond_to_request(mdr, -CEPHFS_EINVAL);
    return std::pair<CDentry*, CDentry*>(nullptr, nullptr);

  if (refpath.is_last_snap() || refpath2.is_last_snap()) {
    respond_to_request(mdr, -CEPHFS_EROFS);
    return std::make_pair(nullptr, nullptr);

  // traverse to parent dir
  CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
  int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_WANT_AUTH;
  int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
  if (r == -CEPHFS_ESTALE) {
    dout(10) << "CEPHFS_ESTALE on path, attempting recovery" << dendl;
    inodeno_t ino = refpath.get_ino();
    mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
    respond_to_request(mdr, r);
    return std::make_pair(nullptr, nullptr);

  // Second path may be remote; DISCOVER rather than WANT_AUTH.
  flags = MDS_TRAVERSE_RDLOCK_SNAP2 | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_DISCOVER;
  r = mdcache->path_traverse(mdr, cf, refpath2, flags, &mdr->dn[1]);
  if (r == -CEPHFS_ESTALE) {
    dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl;
    inodeno_t ino = refpath2.get_ino();
    mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
    respond_to_request(mdr, r);
    return std::make_pair(nullptr, nullptr);

  CDentry *srcdn = mdr->dn[1].back();
  CDir *srcdir = srcdn->get_dir();
  CDentry *destdn = mdr->dn[0].back();
  CDir *destdir = destdn->get_dir();

  // Client-originated requests may not touch system dirs or stray parents.
  if (!mdr->reqid.name.is_mds()) {
    if ((srcdir->get_inode()->is_system() && !srcdir->get_inode()->is_root()) ||
	(destdir->get_inode()->is_system() && !destdir->get_inode()->is_root())) {
      respond_to_request(mdr, -CEPHFS_EROFS);
      return std::make_pair(nullptr, nullptr);

  if (!destdir->get_inode()->is_base() &&
      destdir->get_inode()->get_projected_parent_dir()->inode->is_stray()) {
    respond_to_request(mdr, -CEPHFS_ENOENT);
    return std::make_pair(nullptr, nullptr);

  // Build the lock vector following the ordering rules documented above.
  MutationImpl::LockOpVec lov;
  if (srcdir->get_inode() == destdir->get_inode()) {
    // Same parent inode: one set of dir locks; order dentries by key.
    lov.add_wrlock(&destdir->inode->filelock);
    lov.add_wrlock(&destdir->inode->nestlock);
    if (xlock_srcdn && srcdir != destdir) {
      mds_rank_t srcdir_auth = srcdir->authority().first;
      if (srcdir_auth != mds->get_nodeid()) {
	lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
	lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);

    if (srcdn->get_name() > destdn->get_name())
      lov.add_xlock(&destdn->lock);
      lov.add_xlock(&srcdn->lock);
      lov.add_rdlock(&srcdn->lock);
    if (srcdn->get_name() < destdn->get_name())
      lov.add_xlock(&destdn->lock);
    // Different parents: order by tree/depth/ino via compare_paths().
    int cmp = mdr->compare_paths();
    bool lock_destdir_first =
      (cmp < 0 || (cmp == 0 && destdir->ino() < srcdir->ino()));

    if (lock_destdir_first) {
      lov.add_wrlock(&destdir->inode->filelock);
      lov.add_wrlock(&destdir->inode->nestlock);
      lov.add_xlock(&destdn->lock);

    mds_rank_t srcdir_auth = srcdir->authority().first;
    if (srcdir_auth == mds->get_nodeid()) {
      lov.add_wrlock(&srcdir->inode->filelock);
      lov.add_wrlock(&srcdir->inode->nestlock);
      lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
      lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
      lov.add_xlock(&srcdn->lock);
      lov.add_rdlock(&srcdn->lock);

    if (!lock_destdir_first) {
      lov.add_wrlock(&destdir->inode->filelock);
      lov.add_wrlock(&destdir->inode->nestlock);
      lov.add_xlock(&destdn->lock);

  CInode *auth_pin_freeze = nullptr;
  // XXX any better way to do this?
  if (xlock_srcdn && !srcdn->is_auth()) {
    CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
    auth_pin_freeze = srcdnl->is_primary() ? srcdnl->get_inode() : nullptr;
  if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
    return std::make_pair(nullptr, nullptr);

  if (srcdn->get_projected_linkage()->is_null()) {
    respond_to_request(mdr, -CEPHFS_ENOENT);
    return std::make_pair(nullptr, nullptr);

  if (destdn->get_projected_linkage()->is_null()) {
    // New dest dentry begins no earlier than the next global snap seq.
    snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
    destdn->first = std::max(destdn->first, next_snap);

  mdr->locking_state |= MutationImpl::PATH_LOCKED;

  return std::make_pair(destdn, srcdn);
/**
 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
 *
 * @param diri base inode
 * @param fg the exact frag we want
 * @param mdr request
 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
 */
// NOTE(review): excerpt omits some braces/returns — formatting
// reconstructed; confirm against the full source file.
CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
  CDir *dir = diri->get_dirfrag(fg);

  // am i auth for the dirfrag?
  if (!dir->is_auth()) {
    mds_rank_t auth = dir->authority().first;
    dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
	    << ", fw to mds." << auth << dendl;
    mdcache->request_forward(mdr, auth);

  // not open and inode not mine?
  if (!diri->is_auth()) {
    mds_rank_t inauth = diri->authority().first;
    dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
    mdcache->request_forward(mdr, inauth);

  // not open and inode frozen?
  if (diri->is_frozen()) {
    dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
    ceph_assert(diri->get_parent_dir());
    diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));

  // We are auth and unfrozen: open (or create) the dirfrag.
  dir = diri->get_or_open_dirfrag(mdcache, fg);
3932 // ===============================================================================
3935 void Server::handle_client_getattr(MDRequestRef
& mdr
, bool is_lookup
)
3937 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
3939 if (req
->get_filepath().depth() == 0 && is_lookup
) {
3940 // refpath can't be empty for lookup but it can for
3941 // getattr (we do getattr with empty refpath for mount of '/')
3942 respond_to_request(mdr
, -CEPHFS_EINVAL
);
3946 bool want_auth
= false;
3947 int mask
= req
->head
.args
.getattr
.mask
;
3948 if (mask
& CEPH_STAT_RSTAT
)
3949 want_auth
= true; // set want_auth for CEPH_STAT_RSTAT mask
3951 if (!mdr
->is_batch_head() && mdr
->can_batch()) {
3952 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, false);
3953 int r
= mdcache
->path_traverse(mdr
, cf
, mdr
->get_filepath(),
3954 (want_auth
? MDS_TRAVERSE_WANT_AUTH
: 0),
3955 &mdr
->dn
[0], &mdr
->in
[0]);
3960 // fall-thru. let rdlock_path_pin_ref() check again.
3961 } else if (is_lookup
) {
3962 CDentry
* dn
= mdr
->dn
[0].back();
3964 auto em
= dn
->batch_ops
.emplace(std::piecewise_construct
, std::forward_as_tuple(mask
), std::forward_as_tuple());
3966 em
.first
->second
= std::make_unique
<Batch_Getattr_Lookup
>(this, mdr
);
3968 dout(20) << __func__
<< ": LOOKUP op, wait for previous same getattr ops to respond. " << *mdr
<< dendl
;
3969 em
.first
->second
->add_request(mdr
);
3973 CInode
*in
= mdr
->in
[0];
3975 auto em
= in
->batch_ops
.emplace(std::piecewise_construct
, std::forward_as_tuple(mask
), std::forward_as_tuple());
3977 em
.first
->second
= std::make_unique
<Batch_Getattr_Lookup
>(this, mdr
);
3979 dout(20) << __func__
<< ": GETATTR op, wait for previous same getattr ops to respond. " << *mdr
<< dendl
;
3980 em
.first
->second
->add_request(mdr
);
3986 CInode
*ref
= rdlock_path_pin_ref(mdr
, want_auth
, false);
3991 * if client currently holds the EXCL cap on a field, do not rdlock
3992 * it; client's stat() will result in valid info if _either_ EXCL
3993 * cap is held or MDS rdlocks and reads the value here.
3995 * handling this case here is easier than weakening rdlock
3996 * semantics... that would cause problems elsewhere.
3998 client_t client
= mdr
->get_client();
4000 Capability
*cap
= ref
->get_client_cap(client
);
4001 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
||
4002 mdr
->snapid
<= cap
->client_follows
))
4003 issued
= cap
->issued();
4006 MutationImpl::LockOpVec lov
;
4007 if ((mask
& CEPH_CAP_LINK_SHARED
) && !(issued
& CEPH_CAP_LINK_EXCL
))
4008 lov
.add_rdlock(&ref
->linklock
);
4009 if ((mask
& CEPH_CAP_AUTH_SHARED
) && !(issued
& CEPH_CAP_AUTH_EXCL
))
4010 lov
.add_rdlock(&ref
->authlock
);
4011 if ((mask
& CEPH_CAP_XATTR_SHARED
) && !(issued
& CEPH_CAP_XATTR_EXCL
))
4012 lov
.add_rdlock(&ref
->xattrlock
);
4013 if ((mask
& CEPH_CAP_FILE_SHARED
) && !(issued
& CEPH_CAP_FILE_EXCL
)) {
4014 // Don't wait on unstable filelock if client is allowed to read file size.
4015 // This can reduce the response time of getattr in the case that multiple
4016 // clients do stat(2) and there are writers.
4017 // The downside of this optimization is that mds may not issue Fs caps along
4018 // with getattr reply. Client may need to send more getattr requests.
4019 if (mdr
->is_rdlocked(&ref
->filelock
)) {
4020 lov
.add_rdlock(&ref
->filelock
);
4021 } else if (ref
->filelock
.is_stable() ||
4022 ref
->filelock
.get_num_wrlocks() > 0 ||
4023 !ref
->filelock
.can_read(mdr
->get_client())) {
4024 lov
.add_rdlock(&ref
->filelock
);
4025 mdr
->locking_state
&= ~MutationImpl::ALL_LOCKED
;
4029 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4032 if (!check_access(mdr
, ref
, MAY_READ
))
4035 utime_t now
= ceph_clock_now();
4036 mdr
->set_mds_stamp(now
);
4038 // note which caps are requested, so we return at least a snapshot
4039 // value for them. (currently this matters for xattrs and inline data)
4040 mdr
->getattr_caps
= mask
;
4042 mds
->balancer
->hit_inode(ref
, META_POP_IRD
);
4045 dout(10) << "reply to stat on " << *req
<< dendl
;
4048 mdr
->tracedn
= mdr
->dn
[0].back();
4049 respond_to_request(mdr
, 0);
4052 struct C_MDS_LookupIno2
: public ServerContext
{
4054 C_MDS_LookupIno2(Server
*s
, MDRequestRef
& r
) : ServerContext(s
), mdr(r
) {}
4055 void finish(int r
) override
{
4056 server
->_lookup_ino_2(mdr
, r
);
4063 void Server::handle_client_lookup_ino(MDRequestRef
& mdr
,
4064 bool want_parent
, bool want_dentry
)
4066 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4068 if ((uint64_t)req
->head
.args
.lookupino
.snapid
> 0)
4069 return _lookup_snap_ino(mdr
);
4071 inodeno_t ino
= req
->get_filepath().get_ino();
4072 auto _ino
= ino
.val
;
4074 /* It's been observed [1] that a client may lookup a private ~mdsdir inode.
4075 * I do not have an explanation for how that happened organically but this
4076 * check will ensure that the client can no longer do that.
4078 * [1] https://tracker.ceph.com/issues/49922
4080 if (MDS_IS_PRIVATE_INO(_ino
)) {
4081 respond_to_request(mdr
, -CEPHFS_ESTALE
);
4085 CInode
*in
= mdcache
->get_inode(ino
);
4086 if (in
&& in
->state_test(CInode::STATE_PURGING
)) {
4087 respond_to_request(mdr
, -CEPHFS_ESTALE
);
4091 mdcache
->open_ino(ino
, (int64_t)-1, new C_MDS_LookupIno2(this, mdr
), false);
4095 // check for nothing (not read or write); this still applies the
4097 if (!check_access(mdr
, in
, 0))
4100 CDentry
*dn
= in
->get_projected_parent_dn();
4101 CInode
*diri
= dn
? dn
->get_dir()->inode
: NULL
;
4103 MutationImpl::LockOpVec lov
;
4104 if (dn
&& (want_parent
|| want_dentry
)) {
4106 lov
.add_rdlock(&dn
->lock
);
4109 unsigned mask
= req
->head
.args
.lookupino
.mask
;
4111 Capability
*cap
= in
->get_client_cap(mdr
->get_client());
4113 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
|| mdr
->snapid
<= cap
->client_follows
))
4114 issued
= cap
->issued();
4116 // permission bits, ACL/security xattrs
4117 if ((mask
& CEPH_CAP_AUTH_SHARED
) && (issued
& CEPH_CAP_AUTH_EXCL
) == 0)
4118 lov
.add_rdlock(&in
->authlock
);
4119 if ((mask
& CEPH_CAP_XATTR_SHARED
) && (issued
& CEPH_CAP_XATTR_EXCL
) == 0)
4120 lov
.add_rdlock(&in
->xattrlock
);
4122 mdr
->getattr_caps
= mask
;
4126 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4130 // need read access to directory inode
4131 if (!check_access(mdr
, diri
, MAY_READ
))
4137 if (in
->is_base()) {
4138 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4141 if (!diri
|| diri
->is_stray()) {
4142 respond_to_request(mdr
, -CEPHFS_ESTALE
);
4145 dout(10) << "reply to lookup_parent " << *in
<< dendl
;
4147 respond_to_request(mdr
, 0);
4150 inodeno_t dirino
= req
->get_filepath2().get_ino();
4151 if (!diri
|| (dirino
!= inodeno_t() && diri
->ino() != dirino
)) {
4152 respond_to_request(mdr
, -CEPHFS_ENOENT
);
4155 dout(10) << "reply to lookup_name " << *in
<< dendl
;
4157 dout(10) << "reply to lookup_ino " << *in
<< dendl
;
4162 respond_to_request(mdr
, 0);
4166 void Server::_lookup_snap_ino(MDRequestRef
& mdr
)
4168 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4171 vino
.ino
= req
->get_filepath().get_ino();
4172 vino
.snapid
= (__u64
)req
->head
.args
.lookupino
.snapid
;
4173 inodeno_t parent_ino
= (__u64
)req
->head
.args
.lookupino
.parent
;
4174 __u32 hash
= req
->head
.args
.lookupino
.hash
;
4176 dout(7) << "lookup_snap_ino " << vino
<< " parent " << parent_ino
<< " hash " << hash
<< dendl
;
4178 CInode
*in
= mdcache
->lookup_snap_inode(vino
);
4180 in
= mdcache
->get_inode(vino
.ino
);
4182 if (in
->state_test(CInode::STATE_PURGING
) ||
4183 !in
->has_snap_data(vino
.snapid
)) {
4184 if (in
->is_dir() || !parent_ino
) {
4185 respond_to_request(mdr
, -CEPHFS_ESTALE
);
4194 dout(10) << "reply to lookup_snap_ino " << *in
<< dendl
;
4195 mdr
->snapid
= vino
.snapid
;
4197 respond_to_request(mdr
, 0);
4201 CInode
*diri
= NULL
;
4203 diri
= mdcache
->get_inode(parent_ino
);
4205 mdcache
->open_ino(parent_ino
, mds
->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr
));
4209 if (!diri
->is_dir()) {
4210 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4214 MutationImpl::LockOpVec lov
;
4215 lov
.add_rdlock(&diri
->dirfragtreelock
);
4216 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4219 frag_t frag
= diri
->dirfragtree
[hash
];
4220 CDir
*dir
= try_open_auth_dirfrag(diri
, frag
, mdr
);
4224 if (!dir
->is_complete()) {
4225 if (dir
->is_frozen()) {
4226 mds
->locker
->drop_locks(mdr
.get());
4227 mdr
->drop_local_auth_pins();
4228 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
4231 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
), true);
4235 respond_to_request(mdr
, -CEPHFS_ESTALE
);
4237 mdcache
->open_ino(vino
.ino
, mds
->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr
), false);
4241 void Server::_lookup_ino_2(MDRequestRef
& mdr
, int r
)
4243 inodeno_t ino
= mdr
->client_request
->get_filepath().get_ino();
4244 dout(10) << "_lookup_ino_2 " << mdr
.get() << " ino " << ino
<< " r=" << r
<< dendl
;
4246 // `r` is a rank if >=0, else an error code
4248 mds_rank_t
dest_rank(r
);
4249 if (dest_rank
== mds
->get_nodeid())
4250 dispatch_client_request(mdr
);
4252 mdcache
->request_forward(mdr
, dest_rank
);
4257 if (r
== -CEPHFS_ENOENT
|| r
== -CEPHFS_ENODATA
)
4259 respond_to_request(mdr
, r
);
4263 /* This function takes responsibility for the passed mdr*/
4264 void Server::handle_client_open(MDRequestRef
& mdr
)
4266 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4267 dout(7) << "open on " << req
->get_filepath() << dendl
;
4269 int flags
= req
->head
.args
.open
.flags
;
4270 int cmode
= ceph_flags_to_mode(flags
);
4272 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4276 bool need_auth
= !file_mode_is_readonly(cmode
) ||
4277 (flags
& (CEPH_O_TRUNC
| CEPH_O_DIRECTORY
));
4279 if ((cmode
& CEPH_FILE_MODE_WR
) && mdcache
->is_readonly()) {
4280 dout(7) << "read-only FS" << dendl
;
4281 respond_to_request(mdr
, -CEPHFS_EROFS
);
4285 CInode
*cur
= rdlock_path_pin_ref(mdr
, need_auth
);
4289 if (cur
->is_frozen() || cur
->state_test(CInode::STATE_EXPORTINGCAPS
)) {
4290 ceph_assert(!need_auth
);
4291 mdr
->locking_state
&= ~(MutationImpl::PATH_LOCKED
| MutationImpl::ALL_LOCKED
);
4292 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
4297 if (!cur
->is_file()) {
4298 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
4299 cmode
= CEPH_FILE_MODE_PIN
;
4300 // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
4301 if (cur
->is_symlink() && !(flags
& CEPH_O_NOFOLLOW
))
4302 flags
&= ~CEPH_O_TRUNC
;
4305 dout(10) << "open flags = " << flags
4306 << ", filemode = " << cmode
4307 << ", need_auth = " << need_auth
4311 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
4312 dout(7) << "not a file or dir " << *cur << dendl;
4313 respond_to_request(mdr, -CEPHFS_ENXIO); // FIXME what error do we want?
4316 if ((flags
& CEPH_O_DIRECTORY
) && !cur
->is_dir() && !cur
->is_symlink()) {
4317 dout(7) << "specified O_DIRECTORY on non-directory " << *cur
<< dendl
;
4318 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4322 if ((flags
& CEPH_O_TRUNC
) && !cur
->is_file()) {
4323 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur
<< dendl
;
4324 // we should return -CEPHFS_EISDIR for directory, return -CEPHFS_EINVAL for other non-regular
4325 respond_to_request(mdr
, cur
->is_dir() ? -CEPHFS_EISDIR
: -CEPHFS_EINVAL
);
4329 if (cur
->get_inode()->inline_data
.version
!= CEPH_INLINE_NONE
&&
4330 !mdr
->session
->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
)) {
4331 dout(7) << "old client cannot open inline data file " << *cur
<< dendl
;
4332 respond_to_request(mdr
, -CEPHFS_EPERM
);
4336 // snapped data is read only
4337 if (mdr
->snapid
!= CEPH_NOSNAP
&&
4338 ((cmode
& CEPH_FILE_MODE_WR
) || req
->may_write())) {
4339 dout(7) << "snap " << mdr
->snapid
<< " is read-only " << *cur
<< dendl
;
4340 respond_to_request(mdr
, -CEPHFS_EROFS
);
4344 MutationImpl::LockOpVec lov
;
4346 unsigned mask
= req
->head
.args
.open
.mask
;
4348 Capability
*cap
= cur
->get_client_cap(mdr
->get_client());
4350 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
|| mdr
->snapid
<= cap
->client_follows
))
4351 issued
= cap
->issued();
4352 // permission bits, ACL/security xattrs
4353 if ((mask
& CEPH_CAP_AUTH_SHARED
) && (issued
& CEPH_CAP_AUTH_EXCL
) == 0)
4354 lov
.add_rdlock(&cur
->authlock
);
4355 if ((mask
& CEPH_CAP_XATTR_SHARED
) && (issued
& CEPH_CAP_XATTR_EXCL
) == 0)
4356 lov
.add_rdlock(&cur
->xattrlock
);
4358 mdr
->getattr_caps
= mask
;
4362 if ((flags
& CEPH_O_TRUNC
) && !mdr
->has_completed
) {
4363 ceph_assert(cur
->is_auth());
4365 lov
.add_xlock(&cur
->filelock
);
4366 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4369 if (!check_access(mdr
, cur
, MAY_WRITE
))
4372 // wait for pending truncate?
4373 const auto& pi
= cur
->get_projected_inode();
4374 if (pi
->is_truncating()) {
4375 dout(10) << " waiting for pending truncate from " << pi
->truncate_from
4376 << " to " << pi
->truncate_size
<< " to complete on " << *cur
<< dendl
;
4377 mds
->locker
->drop_locks(mdr
.get());
4378 mdr
->drop_local_auth_pins();
4379 cur
->add_waiter(CInode::WAIT_TRUNC
, new C_MDS_RetryRequest(mdcache
, mdr
));
4383 do_open_truncate(mdr
, cmode
);
4387 // sync filelock if snapped.
4388 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
4389 // and that data itself is flushed so that we can read the snapped data off disk.
4390 if (mdr
->snapid
!= CEPH_NOSNAP
&& !cur
->is_dir()) {
4391 lov
.add_rdlock(&cur
->filelock
);
4394 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4398 if (cmode
& CEPH_FILE_MODE_WR
)
4400 if (!check_access(mdr
, cur
, mask
))
4403 utime_t now
= ceph_clock_now();
4404 mdr
->set_mds_stamp(now
);
4406 if (cur
->is_file() || cur
->is_dir()) {
4407 if (mdr
->snapid
== CEPH_NOSNAP
) {
4409 Capability
*cap
= mds
->locker
->issue_new_caps(cur
, cmode
, mdr
, nullptr);
4411 dout(12) << "open issued caps " << ccap_string(cap
->pending())
4412 << " for " << req
->get_source()
4413 << " on " << *cur
<< dendl
;
4415 int caps
= ceph_caps_for_mode(cmode
);
4416 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps
)
4417 << " for " << req
->get_source()
4418 << " snapid " << mdr
->snapid
4419 << " on " << *cur
<< dendl
;
4420 mdr
->snap_caps
= caps
;
4424 // increase max_size?
4425 if (cmode
& CEPH_FILE_MODE_WR
)
4426 mds
->locker
->check_inode_max_size(cur
);
4428 // make sure this inode gets into the journal
4429 if (cur
->is_auth() && cur
->last
== CEPH_NOSNAP
&&
4430 mdcache
->open_file_table
.should_log_open(cur
)) {
4431 EOpen
*le
= new EOpen(mds
->mdlog
);
4432 mdlog
->start_entry(le
);
4433 le
->add_clean_inode(cur
);
4434 mdlog
->submit_entry(le
);
4438 if (cmode
& CEPH_FILE_MODE_WR
)
4439 mds
->balancer
->hit_inode(cur
, META_POP_IWR
);
4441 mds
->balancer
->hit_inode(cur
, META_POP_IRD
);
4444 if (req
->get_dentry_wanted()) {
4445 ceph_assert(mdr
->dn
[0].size());
4446 dn
= mdr
->dn
[0].back();
4451 respond_to_request(mdr
, 0);
4454 class C_MDS_openc_finish
: public ServerLogContext
{
4458 C_MDS_openc_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ni
) :
4459 ServerLogContext(s
, r
), dn(d
), newi(ni
) {}
4460 void finish(int r
) override
{
4461 ceph_assert(r
== 0);
4463 dn
->pop_projected_linkage();
4465 // dirty inode, dn, dir
4466 newi
->mark_dirty(mdr
->ls
);
4467 newi
->mark_dirty_parent(mdr
->ls
, true);
4471 get_mds()->locker
->share_inode_max_size(newi
);
4473 MDRequestRef null_ref
;
4474 get_mds()->mdcache
->send_dentry_link(dn
, null_ref
);
4476 get_mds()->balancer
->hit_inode(newi
, META_POP_IWR
);
4478 server
->respond_to_request(mdr
, 0);
4480 ceph_assert(g_conf()->mds_kill_openc_at
!= 1);
4484 /* This function takes responsibility for the passed mdr*/
4485 void Server::handle_client_openc(MDRequestRef
& mdr
)
4487 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4488 client_t client
= mdr
->get_client();
4490 dout(7) << "open w/ O_CREAT on " << req
->get_filepath() << dendl
;
4492 int cmode
= ceph_flags_to_mode(req
->head
.args
.open
.flags
);
4494 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4498 bool excl
= req
->head
.args
.open
.flags
& CEPH_O_EXCL
;
4499 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, true, !excl
, true, true);
4503 if (is_unlink_pending(dn
)) {
4504 wait_for_pending_unlink(dn
, mdr
);
4508 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
4509 if (!excl
&& !dnl
->is_null()) {
4511 ceph_assert(mdr
.get()->is_rdlocked(&dn
->lock
));
4513 MutationImpl::LockOpVec lov
;
4514 lov
.add_rdlock(&dnl
->get_inode()->snaplock
);
4515 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4518 handle_client_open(mdr
);
4522 ceph_assert(dnl
->is_null());
4524 if (req
->get_alternate_name().size() > alternate_name_max
) {
4525 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
4526 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
4529 dn
->set_alternate_name(req
->get_alternate_name());
4532 file_layout_t layout
;
4533 if (mdr
->dir_layout
!= file_layout_t())
4534 layout
= mdr
->dir_layout
;
4536 layout
= mdcache
->default_file_layout
;
4538 // What kind of client caps are required to complete this operation
4539 uint64_t access
= MAY_WRITE
;
4541 const auto default_layout
= layout
;
4543 // fill in any special params from client
4544 if (req
->head
.args
.open
.stripe_unit
)
4545 layout
.stripe_unit
= req
->head
.args
.open
.stripe_unit
;
4546 if (req
->head
.args
.open
.stripe_count
)
4547 layout
.stripe_count
= req
->head
.args
.open
.stripe_count
;
4548 if (req
->head
.args
.open
.object_size
)
4549 layout
.object_size
= req
->head
.args
.open
.object_size
;
4550 if (req
->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID
) &&
4551 (__s32
)req
->head
.args
.open
.pool
>= 0) {
4552 layout
.pool_id
= req
->head
.args
.open
.pool
;
4554 // make sure we have as new a map as the client
4555 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
4556 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
4561 // If client doesn't have capability to modify layout pools, then
4562 // only permit this request if the requested pool matches what the
4563 // file would have inherited anyway from its parent.
4564 if (default_layout
!= layout
) {
4565 access
|= MAY_SET_VXATTR
;
4568 if (!layout
.is_valid()) {
4569 dout(10) << " invalid initial file layout" << dendl
;
4570 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4573 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
4574 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
4575 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4580 CDir
*dir
= dn
->get_dir();
4581 CInode
*diri
= dir
->get_inode();
4582 if (!check_access(mdr
, diri
, access
))
4584 if (!check_fragment_space(mdr
, dir
))
4586 if (!check_dir_max_entries(mdr
, dir
))
4589 if (mdr
->dn
[0].size() == 1)
4590 mds
->locker
->create_lock_cache(mdr
, diri
, &mdr
->dir_layout
);
4593 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
),
4594 req
->head
.args
.open
.mode
| S_IFREG
, &layout
);
4598 dn
->push_projected_linkage(newi
);
4600 auto _inode
= newi
->_get_inode();
4601 _inode
->version
= dn
->pre_dirty();
4602 if (layout
.pool_id
!= mdcache
->default_file_layout
.pool_id
)
4603 _inode
->add_old_pool(mdcache
->default_file_layout
.pool_id
);
4604 _inode
->update_backtrace();
4605 _inode
->rstat
.rfiles
= 1;
4606 _inode
->accounted_rstat
= _inode
->rstat
;
4608 SnapRealm
*realm
= diri
->find_snaprealm();
4609 snapid_t follows
= mdcache
->get_global_snaprealm()->get_newest_seq();
4610 ceph_assert(follows
>= realm
->get_newest_seq());
4612 ceph_assert(dn
->first
== follows
+1);
4613 newi
->first
= dn
->first
;
4616 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
, realm
);
4617 newi
->authlock
.set_state(LOCK_EXCL
);
4618 newi
->xattrlock
.set_state(LOCK_EXCL
);
4620 if (cap
&& (cmode
& CEPH_FILE_MODE_WR
)) {
4621 _inode
->client_ranges
[client
].range
.first
= 0;
4622 _inode
->client_ranges
[client
].range
.last
= _inode
->layout
.stripe_unit
;
4623 _inode
->client_ranges
[client
].follows
= follows
;
4624 newi
->mark_clientwriteable();
4625 cap
->mark_clientwriteable();
4629 mdr
->ls
= mdlog
->get_current_segment();
4630 EUpdate
*le
= new EUpdate(mdlog
, "openc");
4631 mdlog
->start_entry(le
);
4632 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4633 journal_allocated_inos(mdr
, &le
->metablob
);
4634 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
4635 le
->metablob
.add_primary_dentry(dn
, newi
, true, true, true);
4637 // make sure this inode gets into the journal
4638 le
->metablob
.add_opened_ino(newi
->ino());
4640 C_MDS_openc_finish
*fin
= new C_MDS_openc_finish(this, mdr
, dn
, newi
);
4642 if (mdr
->session
->info
.has_feature(CEPHFS_FEATURE_DELEG_INO
)) {
4643 openc_response_t ocresp
;
4645 dout(10) << "adding created_ino and delegated_inos" << dendl
;
4646 ocresp
.created_ino
= _inode
->ino
;
4648 if (delegate_inos_pct
&& !req
->is_queued_for_replay()) {
4649 // Try to delegate some prealloc_inos to the client, if it's down to half the max
4650 unsigned frac
= 100 / delegate_inos_pct
;
4651 if (mdr
->session
->delegated_inos
.size() < (unsigned)g_conf()->mds_client_prealloc_inos
/ frac
/ 2)
4652 mdr
->session
->delegate_inos(g_conf()->mds_client_prealloc_inos
/ frac
, ocresp
.delegated_inos
);
4655 encode(ocresp
, mdr
->reply_extra_bl
);
4656 } else if (mdr
->client_request
->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE
)) {
4657 dout(10) << "adding ino to reply to indicate inode was created" << dendl
;
4658 // add the file created flag onto the reply if create_flags features is supported
4659 encode(newi
->ino(), mdr
->reply_extra_bl
);
4662 journal_and_reply(mdr
, newi
, dn
, le
, fin
);
4664 // We hit_dir (via hit_inode) in our finish callback, but by then we might
4665 // have overshot the split size (multiple opencs in flight), so here is
4666 // an early chance to split the dir if this openc makes it oversized.
4667 mds
->balancer
->maybe_fragment(dir
, false);
4672 void Server::handle_client_readdir(MDRequestRef
& mdr
)
4674 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4675 Session
*session
= mds
->get_session(req
);
4676 client_t client
= req
->get_source().num();
4677 MutationImpl::LockOpVec lov
;
4678 CInode
*diri
= rdlock_path_pin_ref(mdr
, false, true);
4681 // it's a directory, right?
4682 if (!diri
->is_dir()) {
4684 dout(10) << "reply to " << *req
<< " readdir -CEPHFS_ENOTDIR" << dendl
;
4685 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
4689 auto num_caps
= session
->get_num_caps();
4690 auto session_cap_acquisition
= session
->get_cap_acquisition();
4692 if (num_caps
> static_cast<uint64_t>(max_caps_per_client
* max_caps_throttle_ratio
) && session_cap_acquisition
>= cap_acquisition_throttle
) {
4693 dout(20) << "readdir throttled. max_caps_per_client: " << max_caps_per_client
<< " num_caps: " << num_caps
4694 << " session_cap_acquistion: " << session_cap_acquisition
<< " cap_acquisition_throttle: " << cap_acquisition_throttle
<< dendl
;
4696 logger
->inc(l_mdss_cap_acquisition_throttle
);
4698 mds
->timer
.add_event_after(caps_throttle_retry_request_timeout
, new C_MDS_RetryRequest(mdcache
, mdr
));
4702 lov
.add_rdlock(&diri
->filelock
);
4703 lov
.add_rdlock(&diri
->dirfragtreelock
);
4705 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4708 if (!check_access(mdr
, diri
, MAY_READ
))
4712 frag_t fg
= (__u32
)req
->head
.args
.readdir
.frag
;
4713 unsigned req_flags
= (__u32
)req
->head
.args
.readdir
.flags
;
4714 string offset_str
= req
->get_path2();
4716 __u32 offset_hash
= 0;
4717 if (!offset_str
.empty())
4718 offset_hash
= ceph_frag_value(diri
->hash_dentry_name(offset_str
));
4720 offset_hash
= (__u32
)req
->head
.args
.readdir
.offset_hash
;
4722 dout(10) << " frag " << fg
<< " offset '" << offset_str
<< "'"
4723 << " offset_hash " << offset_hash
<< " flags " << req_flags
<< dendl
;
4725 // does the frag exist?
4726 if (diri
->dirfragtree
[fg
.value()] != fg
) {
4728 if (req_flags
& CEPH_READDIR_REPLY_BITFLAGS
) {
4729 if (fg
.contains((unsigned)offset_hash
)) {
4730 newfg
= diri
->dirfragtree
[offset_hash
];
4732 // client actually wants next frag
4733 newfg
= diri
->dirfragtree
[fg
.value()];
4737 newfg
= diri
->dirfragtree
[fg
.value()];
4739 dout(10) << " adjust frag " << fg
<< " -> " << newfg
<< " " << diri
->dirfragtree
<< dendl
;
4743 CDir
*dir
= try_open_auth_dirfrag(diri
, fg
, mdr
);
4747 dout(10) << "handle_client_readdir on " << *dir
<< dendl
;
4748 ceph_assert(dir
->is_auth());
4750 if (!dir
->is_complete()) {
4751 if (dir
->is_frozen()) {
4752 dout(7) << "dir is frozen " << *dir
<< dendl
;
4753 mds
->locker
->drop_locks(mdr
.get());
4754 mdr
->drop_local_auth_pins();
4755 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
4759 dout(10) << " incomplete dir contents for readdir on " << *dir
<< ", fetching" << dendl
;
4760 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
), true);
4764 #ifdef MDS_VERIFY_FRAGSTAT
4765 dir
->verify_fragstat();
4768 utime_t now
= ceph_clock_now();
4769 mdr
->set_mds_stamp(now
);
4771 snapid_t snapid
= mdr
->snapid
;
4772 dout(10) << "snapid " << snapid
<< dendl
;
4774 SnapRealm
*realm
= diri
->find_snaprealm();
4776 unsigned max
= req
->head
.args
.readdir
.max_entries
;
4778 max
= dir
->get_num_any(); // whatever, something big.
4779 unsigned max_bytes
= req
->head
.args
.readdir
.max_bytes
;
4781 // make sure at least one item can be encoded
4782 max_bytes
= (512 << 10) + g_conf()->mds_max_xattr_pairs_size
;
4787 ds
.frag
= dir
->get_frag();
4788 ds
.auth
= dir
->get_dir_auth().first
;
4789 if (dir
->is_auth() && !forward_all_requests_to_auth
)
4790 dir
->get_dist_spec(ds
.dist
, mds
->get_nodeid());
4792 dir
->encode_dirstat(dirbl
, mdr
->session
->info
, ds
);
4794 // count bytes available.
4795 // this isn't perfect, but we should capture the main variable/unbounded size items!
4796 int front_bytes
= dirbl
.length() + sizeof(__u32
) + sizeof(__u8
)*2;
4797 int bytes_left
= max_bytes
- front_bytes
;
4798 bytes_left
-= get_snap_trace(session
, realm
).length();
4800 // build dir contents
4803 bool start
= !offset_hash
&& offset_str
.empty();
4804 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
4805 dentry_key_t
skip_key(snapid
, offset_str
.c_str(), offset_hash
);
4806 auto it
= start
? dir
->begin() : dir
->lower_bound(skip_key
);
4807 bool end
= (it
== dir
->end());
4808 for (; !end
&& numfiles
< max
; end
= (it
== dir
->end())) {
4809 CDentry
*dn
= it
->second
;
4812 if (dn
->state_test(CDentry::STATE_PURGING
))
4815 bool dnp
= dn
->use_projected(client
, mdr
);
4816 CDentry::linkage_t
*dnl
= dnp
? dn
->get_projected_linkage() : dn
->get_linkage();
4818 if (dnl
->is_null()) {
4819 if (dn
->get_num_ref() == 0 && !dn
->is_projected())
4820 dir
->remove_dentry(dn
);
4824 if (dn
->last
< snapid
|| dn
->first
> snapid
) {
4825 dout(20) << "skipping non-overlapping snap " << *dn
<< dendl
;
4830 dentry_key_t
offset_key(dn
->last
, offset_str
.c_str(), offset_hash
);
4831 if (!(offset_key
< dn
->key()))
4835 CInode
*in
= dnl
->get_inode();
4837 if (in
&& in
->ino() == CEPH_INO_CEPH
)
4841 // better for the MDS to do the work, if we think the client will stat any of these files.
4842 if (dnl
->is_remote() && !in
) {
4843 in
= mdcache
->get_inode(dnl
->get_remote_ino());
4845 dn
->link_remote(dnl
, in
);
4846 } else if (dn
->state_test(CDentry::STATE_BADREMOTEINO
)) {
4847 dout(10) << "skipping bad remote ino on " << *dn
<< dendl
;
4850 // touch everything i _do_ have
4851 for (auto &p
: *dir
) {
4852 if (!p
.second
->get_linkage()->is_null())
4853 mdcache
->lru
.lru_touch(p
.second
);
4856 // already issued caps and leases, reply immediately.
4857 if (dnbl
.length() > 0) {
4858 mdcache
->open_remote_dentry(dn
, dnp
, new C_MDSInternalNoop
);
4859 dout(10) << " open remote dentry after caps were issued, stopping at "
4860 << dnbl
.length() << " < " << bytes_left
<< dendl
;
4864 mds
->locker
->drop_locks(mdr
.get());
4865 mdr
->drop_local_auth_pins();
4866 mdcache
->open_remote_dentry(dn
, dnp
, new C_MDS_RetryRequest(mdcache
, mdr
));
4872 if ((int)(dnbl
.length() + dn
->get_name().length() + sizeof(__u32
) + sizeof(LeaseStat
)) > bytes_left
) {
4873 dout(10) << " ran out of room, stopping at " << dnbl
.length() << " < " << bytes_left
<< dendl
;
4877 unsigned start_len
= dnbl
.length();
4880 dout(12) << "including dn " << *dn
<< dendl
;
4881 encode(dn
->get_name(), dnbl
);
4882 mds
->locker
->issue_client_lease(dn
, in
, mdr
, now
, dnbl
);
4885 dout(12) << "including inode " << *in
<< dendl
;
4886 int r
= in
->encode_inodestat(dnbl
, mdr
->session
, realm
, snapid
, bytes_left
- (int)dnbl
.length());
4888 // chop off dn->name, lease
4889 dout(10) << " ran out of room, stopping at " << start_len
<< " < " << bytes_left
<< dendl
;
4891 keep
.substr_of(dnbl
, 0, start_len
);
4895 ceph_assert(r
>= 0);
4899 mdcache
->lru
.lru_touch(dn
);
4902 session
->touch_readdir_cap(numfiles
);
4906 flags
= CEPH_READDIR_FRAG_END
;
4908 flags
|= CEPH_READDIR_FRAG_COMPLETE
; // FIXME: what purpose does this serve
4910 // client only understand END and COMPLETE flags ?
4911 if (req_flags
& CEPH_READDIR_REPLY_BITFLAGS
) {
4912 flags
|= CEPH_READDIR_HASH_ORDER
| CEPH_READDIR_OFFSET_HASH
;
4915 // finish final blob
4916 encode(numfiles
, dirbl
);
4917 encode(flags
, dirbl
);
4918 dirbl
.claim_append(dnbl
);
4921 dout(10) << "reply to " << *req
<< " readdir num=" << numfiles
4922 << " bytes=" << dirbl
.length()
4923 << " start=" << (int)start
4924 << " end=" << (int)end
4926 mdr
->reply_extra_bl
= dirbl
;
4928 // bump popularity. NOTE: this doesn't quite capture it.
4929 mds
->balancer
->hit_dir(dir
, META_POP_READDIR
, numfiles
);
4933 respond_to_request(mdr
, 0);
4938 // ===============================================================================
4943 * finisher for basic inode updates
4945 class C_MDS_inode_update_finish
: public ServerLogContext
{
4947 bool truncating_smaller
, changed_ranges
, adjust_realm
;
4949 C_MDS_inode_update_finish(Server
*s
, MDRequestRef
& r
, CInode
*i
,
4950 bool sm
=false, bool cr
=false, bool ar
=false) :
4951 ServerLogContext(s
, r
), in(i
),
4952 truncating_smaller(sm
), changed_ranges(cr
), adjust_realm(ar
) { }
4953 void finish(int r
) override
{
4954 ceph_assert(r
== 0);
4956 int snap_op
= (in
->snaprealm
? CEPH_SNAP_OP_UPDATE
: CEPH_SNAP_OP_SPLIT
);
4961 MDSRank
*mds
= get_mds();
4963 // notify any clients
4964 if (truncating_smaller
&& in
->get_inode()->is_truncating()) {
4965 mds
->locker
->issue_truncate(in
);
4966 mds
->mdcache
->truncate_inode(in
, mdr
->ls
);
4970 mds
->mdcache
->send_snap_update(in
, 0, snap_op
);
4971 mds
->mdcache
->do_realm_invalidate_and_update_notify(in
, snap_op
);
4974 get_mds()->balancer
->hit_inode(in
, META_POP_IWR
);
4976 server
->respond_to_request(mdr
, 0);
4979 get_mds()->locker
->share_inode_max_size(in
);
4983 void Server::handle_client_file_setlock(MDRequestRef
& mdr
)
4985 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4986 MutationImpl::LockOpVec lov
;
4988 // get the inode to operate on, and set up any locks needed for that
4989 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
4993 lov
.add_xlock(&cur
->flocklock
);
4994 /* acquire_locks will return true if it gets the locks. If it fails,
4995 it will redeliver this request at a later date, so drop the request.
4997 if (!mds
->locker
->acquire_locks(mdr
, lov
)) {
4998 dout(10) << "handle_client_file_setlock could not get locks!" << dendl
;
5002 // copy the lock change into a ceph_filelock so we can store/apply it
5003 ceph_filelock set_lock
;
5004 set_lock
.start
= req
->head
.args
.filelock_change
.start
;
5005 set_lock
.length
= req
->head
.args
.filelock_change
.length
;
5006 set_lock
.client
= req
->get_orig_source().num();
5007 set_lock
.owner
= req
->head
.args
.filelock_change
.owner
;
5008 set_lock
.pid
= req
->head
.args
.filelock_change
.pid
;
5009 set_lock
.type
= req
->head
.args
.filelock_change
.type
;
5010 bool will_wait
= req
->head
.args
.filelock_change
.wait
;
5012 dout(10) << "handle_client_file_setlock: " << set_lock
<< dendl
;
5014 ceph_lock_state_t
*lock_state
= NULL
;
5015 bool interrupt
= false;
5017 // get the appropriate lock state
5018 switch (req
->head
.args
.filelock_change
.rule
) {
5019 case CEPH_LOCK_FLOCK_INTR
:
5022 case CEPH_LOCK_FLOCK
:
5023 lock_state
= cur
->get_flock_lock_state();
5026 case CEPH_LOCK_FCNTL_INTR
:
5029 case CEPH_LOCK_FCNTL
:
5030 lock_state
= cur
->get_fcntl_lock_state();
5034 dout(10) << "got unknown lock type " << set_lock
.type
5035 << ", dropping request!" << dendl
;
5036 respond_to_request(mdr
, -CEPHFS_EOPNOTSUPP
);
5040 dout(10) << " state prior to lock change: " << *lock_state
<< dendl
;
5041 if (CEPH_LOCK_UNLOCK
== set_lock
.type
) {
5042 list
<ceph_filelock
> activated_locks
;
5043 MDSContext::vec waiters
;
5044 if (lock_state
->is_waiting(set_lock
)) {
5045 dout(10) << " unlock removing waiting lock " << set_lock
<< dendl
;
5046 lock_state
->remove_waiting(set_lock
);
5047 cur
->take_waiting(CInode::WAIT_FLOCK
, waiters
);
5048 } else if (!interrupt
) {
5049 dout(10) << " unlock attempt on " << set_lock
<< dendl
;
5050 lock_state
->remove_lock(set_lock
, activated_locks
);
5051 cur
->take_waiting(CInode::WAIT_FLOCK
, waiters
);
5053 mds
->queue_waiters(waiters
);
5055 respond_to_request(mdr
, 0);
5057 dout(10) << " lock attempt on " << set_lock
<< dendl
;
5058 bool deadlock
= false;
5059 if (mdr
->more()->flock_was_waiting
&&
5060 !lock_state
->is_waiting(set_lock
)) {
5061 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock
<< dendl
;
5062 respond_to_request(mdr
, -CEPHFS_EINTR
);
5063 } else if (!lock_state
->add_lock(set_lock
, will_wait
, mdr
->more()->flock_was_waiting
, &deadlock
)) {
5064 dout(10) << " it failed on this attempt" << dendl
;
5065 // couldn't set lock right now
5067 respond_to_request(mdr
, -CEPHFS_EDEADLK
);
5068 } else if (!will_wait
) {
5069 respond_to_request(mdr
, -CEPHFS_EWOULDBLOCK
);
5071 dout(10) << " added to waiting list" << dendl
;
5072 ceph_assert(lock_state
->is_waiting(set_lock
));
5073 mdr
->more()->flock_was_waiting
= true;
5074 mds
->locker
->drop_locks(mdr
.get());
5075 mdr
->drop_local_auth_pins();
5076 mdr
->mark_event("failed to add lock, waiting");
5078 cur
->add_waiter(CInode::WAIT_FLOCK
, new C_MDS_RetryRequest(mdcache
, mdr
));
5081 respond_to_request(mdr
, 0);
5083 dout(10) << " state after lock change: " << *lock_state
<< dendl
;
5086 void Server::handle_client_file_readlock(MDRequestRef
& mdr
)
5088 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5089 MutationImpl::LockOpVec lov
;
5091 // get the inode to operate on, and set up any locks needed for that
5092 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
5096 /* acquire_locks will return true if it gets the locks. If it fails,
5097 it will redeliver this request at a later date, so drop the request.
5099 lov
.add_rdlock(&cur
->flocklock
);
5100 if (!mds
->locker
->acquire_locks(mdr
, lov
)) {
5101 dout(10) << "handle_client_file_readlock could not get locks!" << dendl
;
5105 // copy the lock change into a ceph_filelock so we can store/apply it
5106 ceph_filelock checking_lock
;
5107 checking_lock
.start
= req
->head
.args
.filelock_change
.start
;
5108 checking_lock
.length
= req
->head
.args
.filelock_change
.length
;
5109 checking_lock
.client
= req
->get_orig_source().num();
5110 checking_lock
.owner
= req
->head
.args
.filelock_change
.owner
;
5111 checking_lock
.pid
= req
->head
.args
.filelock_change
.pid
;
5112 checking_lock
.type
= req
->head
.args
.filelock_change
.type
;
5114 // get the appropriate lock state
5115 ceph_lock_state_t
*lock_state
= NULL
;
5116 switch (req
->head
.args
.filelock_change
.rule
) {
5117 case CEPH_LOCK_FLOCK
:
5118 lock_state
= cur
->get_flock_lock_state();
5121 case CEPH_LOCK_FCNTL
:
5122 lock_state
= cur
->get_fcntl_lock_state();
5126 dout(10) << "got unknown lock type " << checking_lock
.type
<< dendl
;
5127 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5130 lock_state
->look_for_lock(checking_lock
);
5133 encode(checking_lock
, lock_bl
);
5135 mdr
->reply_extra_bl
= lock_bl
;
5136 respond_to_request(mdr
, 0);
5139 void Server::handle_client_setattr(MDRequestRef
& mdr
)
5141 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5142 MutationImpl::LockOpVec lov
;
5143 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
5146 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5147 respond_to_request(mdr
, -CEPHFS_EROFS
);
5150 if (cur
->ino() < MDS_INO_SYSTEM_BASE
&& !cur
->is_base()) {
5151 respond_to_request(mdr
, -CEPHFS_EPERM
);
5155 __u32 mask
= req
->head
.args
.setattr
.mask
;
5156 __u32 access_mask
= MAY_WRITE
;
5158 if (req
->get_header().version
< 6) {
5159 // No changes to fscrypted inodes by downrevved clients
5160 if (!cur
->get_inode()->fscrypt_auth
.empty()) {
5161 respond_to_request(mdr
, -CEPHFS_EPERM
);
5165 // Only allow fscrypt field changes by capable clients
5166 if (mask
& (CEPH_SETATTR_FSCRYPT_FILE
|CEPH_SETATTR_FSCRYPT_AUTH
)) {
5167 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5173 if (mask
& (CEPH_SETATTR_MODE
|CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_BTIME
|CEPH_SETATTR_KILL_SGUID
|CEPH_SETATTR_FSCRYPT_AUTH
|CEPH_SETATTR_KILL_SUID
|CEPH_SETATTR_KILL_SGID
))
5174 lov
.add_xlock(&cur
->authlock
);
5175 if (mask
& (CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
|CEPH_SETATTR_SIZE
|CEPH_SETATTR_FSCRYPT_FILE
))
5176 lov
.add_xlock(&cur
->filelock
);
5177 if (mask
& CEPH_SETATTR_CTIME
)
5178 lov
.add_wrlock(&cur
->versionlock
);
5180 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5183 if ((mask
& CEPH_SETATTR_UID
) && (cur
->get_inode()->uid
!= req
->head
.args
.setattr
.uid
))
5184 access_mask
|= MAY_CHOWN
;
5186 if ((mask
& CEPH_SETATTR_GID
) && (cur
->get_inode()->gid
!= req
->head
.args
.setattr
.gid
))
5187 access_mask
|= MAY_CHGRP
;
5189 if (!check_access(mdr
, cur
, access_mask
))
5192 // trunc from bigger -> smaller?
5193 const auto& pip
= cur
->get_projected_inode();
5195 uint64_t old_size
= std::max
<uint64_t>(pip
->size
, req
->head
.args
.setattr
.old_size
);
5197 // CEPHFS_ENOSPC on growing file while full, but allow shrinks
5198 if (is_full
&& req
->head
.args
.setattr
.size
> old_size
) {
5199 dout(20) << __func__
<< ": full, responding CEPHFS_ENOSPC to setattr with larger size" << dendl
;
5200 respond_to_request(mdr
, -CEPHFS_ENOSPC
);
5204 bool truncating_smaller
= false;
5205 if (mask
& CEPH_SETATTR_SIZE
) {
5206 if (req
->get_data().length() >
5207 sizeof(struct ceph_fscrypt_last_block_header
) + fscrypt_last_block_max_size
) {
5208 dout(10) << __func__
<< ": the last block size is too large" << dendl
;
5209 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5213 truncating_smaller
= req
->head
.args
.setattr
.size
< old_size
||
5214 (req
->head
.args
.setattr
.size
== old_size
&& req
->get_data().length());
5215 if (truncating_smaller
&& pip
->is_truncating()) {
5216 dout(10) << " waiting for pending truncate from " << pip
->truncate_from
5217 << " to " << pip
->truncate_size
<< " to complete on " << *cur
<< dendl
;
5218 mds
->locker
->drop_locks(mdr
.get());
5219 mdr
->drop_local_auth_pins();
5220 cur
->add_waiter(CInode::WAIT_TRUNC
, new C_MDS_RetryRequest(mdcache
, mdr
));
5224 if (truncating_smaller
&& req
->get_data().length()) {
5225 struct ceph_fscrypt_last_block_header header
;
5226 memset(&header
, 0, sizeof(header
));
5227 auto bl
= req
->get_data().cbegin();
5228 DECODE_START(1, bl
);
5229 decode(header
.change_attr
, bl
);
5232 dout(20) << __func__
<< " mdr->retry:" << mdr
->retry
5233 << " header.change_attr: " << header
.change_attr
5234 << " header.file_offset: " << header
.file_offset
5235 << " header.block_size: " << header
.block_size
5238 if (header
.change_attr
!= pip
->change_attr
) {
5239 dout(5) << __func__
<< ": header.change_attr:" << header
.change_attr
5240 << " != current change_attr:" << pip
->change_attr
5241 << ", let client retry it!" << dendl
;
5242 // flush the journal to make sure the clients will get the lasted
5243 // change_attr as possible for the next retry
5244 mds
->mdlog
->flush();
5245 respond_to_request(mdr
, -CEPHFS_EAGAIN
);
5251 bool changed_ranges
= false;
5254 mdr
->ls
= mdlog
->get_current_segment();
5255 EUpdate
*le
= new EUpdate(mdlog
, "setattr");
5256 mdlog
->start_entry(le
);
5258 auto pi
= cur
->project_inode(mdr
);
5260 if (mask
& CEPH_SETATTR_UID
)
5261 pi
.inode
->uid
= req
->head
.args
.setattr
.uid
;
5262 if (mask
& CEPH_SETATTR_GID
)
5263 pi
.inode
->gid
= req
->head
.args
.setattr
.gid
;
5265 if (mask
& CEPH_SETATTR_MODE
)
5266 pi
.inode
->mode
= (pi
.inode
->mode
& ~07777) | (req
->head
.args
.setattr
.mode
& 07777);
5267 else if ((mask
& (CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_KILL_SGUID
|
5268 CEPH_SETATTR_KILL_SUID
|CEPH_SETATTR_KILL_SGID
)) &&
5269 S_ISREG(pi
.inode
->mode
)) {
5270 if (mask
& (CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_KILL_SGUID
) &&
5271 (pi
.inode
->mode
& (S_IXUSR
|S_IXGRP
|S_IXOTH
))) {
5272 pi
.inode
->mode
&= ~(S_ISUID
|S_ISGID
);
5274 if (mask
& CEPH_SETATTR_KILL_SUID
) {
5275 pi
.inode
->mode
&= ~S_ISUID
;
5277 if (mask
& CEPH_SETATTR_KILL_SGID
) {
5278 pi
.inode
->mode
&= ~S_ISGID
;
5283 if (mask
& CEPH_SETATTR_MTIME
)
5284 pi
.inode
->mtime
= req
->head
.args
.setattr
.mtime
;
5285 if (mask
& CEPH_SETATTR_ATIME
)
5286 pi
.inode
->atime
= req
->head
.args
.setattr
.atime
;
5287 if (mask
& CEPH_SETATTR_BTIME
)
5288 pi
.inode
->btime
= req
->head
.args
.setattr
.btime
;
5289 if (mask
& (CEPH_SETATTR_ATIME
| CEPH_SETATTR_MTIME
| CEPH_SETATTR_BTIME
))
5290 pi
.inode
->time_warp_seq
++; // maybe not a timewarp, but still a serialization point.
5291 if (mask
& CEPH_SETATTR_SIZE
) {
5292 if (truncating_smaller
) {
5293 pi
.inode
->truncate(old_size
, req
->head
.args
.setattr
.size
, req
->get_data());
5294 le
->metablob
.add_truncate_start(cur
->ino());
5296 pi
.inode
->size
= req
->head
.args
.setattr
.size
;
5297 pi
.inode
->rstat
.rbytes
= pi
.inode
->size
;
5299 pi
.inode
->mtime
= mdr
->get_op_stamp();
5301 // adjust client's max_size?
5302 if (mds
->locker
->calc_new_client_ranges(cur
, pi
.inode
->size
)) {
5303 dout(10) << " client_ranges " << cur
->get_previous_projected_inode()->client_ranges
5304 << " -> " << pi
.inode
->client_ranges
<< dendl
;
5305 changed_ranges
= true;
5309 if (mask
& CEPH_SETATTR_FSCRYPT_AUTH
)
5310 pi
.inode
->fscrypt_auth
= req
->fscrypt_auth
;
5311 if (mask
& CEPH_SETATTR_FSCRYPT_FILE
)
5312 pi
.inode
->fscrypt_file
= req
->fscrypt_file
;
5314 pi
.inode
->version
= cur
->pre_dirty();
5315 pi
.inode
->ctime
= mdr
->get_op_stamp();
5316 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
5317 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
5318 pi
.inode
->change_attr
++;
5321 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5322 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5323 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5325 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
,
5326 truncating_smaller
, changed_ranges
));
5328 // flush immediately if there are readers/writers waiting
5329 if (mdr
->is_xlocked(&cur
->filelock
) &&
5330 (cur
->get_caps_wanted() & (CEPH_CAP_FILE_RD
|CEPH_CAP_FILE_WR
)))
5331 mds
->mdlog
->flush();
5334 /* Takes responsibility for mdr */
5335 void Server::do_open_truncate(MDRequestRef
& mdr
, int cmode
)
5337 CInode
*in
= mdr
->in
[0];
5338 client_t client
= mdr
->get_client();
5341 dout(10) << "do_open_truncate " << *in
<< dendl
;
5343 SnapRealm
*realm
= in
->find_snaprealm();
5344 Capability
*cap
= mds
->locker
->issue_new_caps(in
, cmode
, mdr
, realm
);
5346 mdr
->ls
= mdlog
->get_current_segment();
5347 EUpdate
*le
= new EUpdate(mdlog
, "open_truncate");
5348 mdlog
->start_entry(le
);
5351 auto pi
= in
->project_inode(mdr
);
5352 pi
.inode
->version
= in
->pre_dirty();
5353 pi
.inode
->mtime
= pi
.inode
->ctime
= mdr
->get_op_stamp();
5354 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
5355 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
5356 pi
.inode
->change_attr
++;
5358 uint64_t old_size
= std::max
<uint64_t>(pi
.inode
->size
, mdr
->client_request
->head
.args
.open
.old_size
);
5360 pi
.inode
->truncate(old_size
, 0);
5361 le
->metablob
.add_truncate_start(in
->ino());
5364 bool changed_ranges
= false;
5365 if (cap
&& (cmode
& CEPH_FILE_MODE_WR
)) {
5366 pi
.inode
->client_ranges
[client
].range
.first
= 0;
5367 pi
.inode
->client_ranges
[client
].range
.last
= pi
.inode
->get_layout_size_increment();
5368 pi
.inode
->client_ranges
[client
].follows
= realm
->get_newest_seq();
5369 changed_ranges
= true;
5370 in
->mark_clientwriteable();
5371 cap
->mark_clientwriteable();
5374 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
5376 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
5377 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, in
);
5379 // make sure ino gets into the journal
5380 le
->metablob
.add_opened_ino(in
->ino());
5382 mdr
->o_trunc
= true;
5385 if (mdr
->client_request
->get_dentry_wanted()) {
5386 ceph_assert(mdr
->dn
[0].size());
5387 dn
= mdr
->dn
[0].back();
5390 journal_and_reply(mdr
, in
, dn
, le
, new C_MDS_inode_update_finish(this, mdr
, in
, old_size
> 0,
5392 // Although the `open` part can give an early reply, the truncation won't
5393 // happen until our EUpdate is persistent, to give the client a prompt
5394 // response we must also flush that event.
5399 /* This function cleans up the passed mdr */
5400 void Server::handle_client_setlayout(MDRequestRef
& mdr
)
5402 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5403 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
5406 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5407 respond_to_request(mdr
, -CEPHFS_EROFS
);
5410 if (!cur
->is_file()) {
5411 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5414 if (cur
->get_projected_inode()->size
||
5415 cur
->get_projected_inode()->truncate_seq
> 1) {
5416 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
5421 file_layout_t layout
= cur
->get_projected_inode()->layout
;
5422 // save existing layout for later
5423 const auto old_layout
= layout
;
5425 int access
= MAY_WRITE
;
5427 if (req
->head
.args
.setlayout
.layout
.fl_object_size
> 0)
5428 layout
.object_size
= req
->head
.args
.setlayout
.layout
.fl_object_size
;
5429 if (req
->head
.args
.setlayout
.layout
.fl_stripe_unit
> 0)
5430 layout
.stripe_unit
= req
->head
.args
.setlayout
.layout
.fl_stripe_unit
;
5431 if (req
->head
.args
.setlayout
.layout
.fl_stripe_count
> 0)
5432 layout
.stripe_count
=req
->head
.args
.setlayout
.layout
.fl_stripe_count
;
5433 if (req
->head
.args
.setlayout
.layout
.fl_pg_pool
> 0) {
5434 layout
.pool_id
= req
->head
.args
.setlayout
.layout
.fl_pg_pool
;
5436 // make sure we have as new a map as the client
5437 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
5438 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
5443 // Don't permit layout modifications without 'p' caps
5444 if (layout
!= old_layout
) {
5445 access
|= MAY_SET_VXATTR
;
5448 if (!layout
.is_valid()) {
5449 dout(10) << "bad layout" << dendl
;
5450 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5453 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
5454 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
5455 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5459 MutationImpl::LockOpVec lov
;
5460 lov
.add_xlock(&cur
->filelock
);
5461 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5464 if (!check_access(mdr
, cur
, access
))
5468 auto pi
= cur
->project_inode(mdr
);
5469 pi
.inode
->layout
= layout
;
5470 // add the old pool to the inode
5471 pi
.inode
->add_old_pool(old_layout
.pool_id
);
5472 pi
.inode
->version
= cur
->pre_dirty();
5473 pi
.inode
->ctime
= mdr
->get_op_stamp();
5474 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
5475 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
5476 pi
.inode
->change_attr
++;
5479 mdr
->ls
= mdlog
->get_current_segment();
5480 EUpdate
*le
= new EUpdate(mdlog
, "setlayout");
5481 mdlog
->start_entry(le
);
5482 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5483 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5484 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5486 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
5489 bool Server::xlock_policylock(MDRequestRef
& mdr
, CInode
*in
, bool want_layout
, bool xlock_snaplock
)
5491 if (mdr
->locking_state
& MutationImpl::ALL_LOCKED
)
5494 MutationImpl::LockOpVec lov
;
5495 lov
.add_xlock(&in
->policylock
);
5497 lov
.add_xlock(&in
->snaplock
);
5499 lov
.add_rdlock(&in
->snaplock
);
5500 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5503 if (want_layout
&& in
->get_projected_inode()->has_layout()) {
5504 mdr
->dir_layout
= in
->get_projected_inode()->layout
;
5505 want_layout
= false;
5507 if (CDentry
*pdn
= in
->get_projected_parent_dn(); pdn
) {
5508 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
, 0, want_layout
))
5512 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
5516 CInode
* Server::try_get_auth_inode(MDRequestRef
& mdr
, inodeno_t ino
)
5518 CInode
*in
= mdcache
->get_inode(ino
);
5519 if (!in
|| in
->state_test(CInode::STATE_PURGING
)) {
5520 respond_to_request(mdr
, -CEPHFS_ESTALE
);
5523 if (!in
->is_auth()) {
5524 mdcache
->request_forward(mdr
, in
->authority().first
);
5531 void Server::handle_client_setdirlayout(MDRequestRef
& mdr
)
5533 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5535 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5536 CInode
*cur
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
5540 if (!cur
->is_dir()) {
5541 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
5545 if (!xlock_policylock(mdr
, cur
, true))
5549 const auto& old_pi
= cur
->get_projected_inode();
5550 file_layout_t layout
;
5551 if (old_pi
->has_layout())
5552 layout
= old_pi
->layout
;
5553 else if (mdr
->dir_layout
!= file_layout_t())
5554 layout
= mdr
->dir_layout
;
5556 layout
= mdcache
->default_file_layout
;
5558 // Level of access required to complete
5559 int access
= MAY_WRITE
;
5561 const auto old_layout
= layout
;
5563 if (req
->head
.args
.setlayout
.layout
.fl_object_size
> 0)
5564 layout
.object_size
= req
->head
.args
.setlayout
.layout
.fl_object_size
;
5565 if (req
->head
.args
.setlayout
.layout
.fl_stripe_unit
> 0)
5566 layout
.stripe_unit
= req
->head
.args
.setlayout
.layout
.fl_stripe_unit
;
5567 if (req
->head
.args
.setlayout
.layout
.fl_stripe_count
> 0)
5568 layout
.stripe_count
=req
->head
.args
.setlayout
.layout
.fl_stripe_count
;
5569 if (req
->head
.args
.setlayout
.layout
.fl_pg_pool
> 0) {
5570 layout
.pool_id
= req
->head
.args
.setlayout
.layout
.fl_pg_pool
;
5571 // make sure we have as new a map as the client
5572 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
5573 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
5578 if (layout
!= old_layout
) {
5579 access
|= MAY_SET_VXATTR
;
5582 if (!layout
.is_valid()) {
5583 dout(10) << "bad layout" << dendl
;
5584 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5587 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
5588 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
5589 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5593 if (!check_access(mdr
, cur
, access
))
5596 auto pi
= cur
->project_inode(mdr
);
5597 pi
.inode
->layout
= layout
;
5598 pi
.inode
->version
= cur
->pre_dirty();
5601 mdr
->ls
= mdlog
->get_current_segment();
5602 EUpdate
*le
= new EUpdate(mdlog
, "setlayout");
5603 mdlog
->start_entry(le
);
5604 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5605 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5606 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5608 mdr
->no_early_reply
= true;
5609 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
5613 int Server::parse_layout_vxattr_json(
5614 string name
, string value
, const OSDMap
& osdmap
, file_layout_t
*layout
)
5616 auto parse_pool
= [&](std::string pool_name
, int64_t pool_id
) -> int64_t {
5617 if (pool_name
!= "") {
5618 int64_t _pool_id
= osdmap
.lookup_pg_pool_name(pool_name
);
5620 dout(10) << __func__
<< ": unknown pool name:" << pool_name
<< dendl
;
5621 return -CEPHFS_EINVAL
;
5624 } else if (pool_id
>= 0) {
5625 const auto pools
= osdmap
.get_pools();
5626 if (pools
.find(pool_id
) == pools
.end()) {
5627 dout(10) << __func__
<< ": unknown pool id:" << pool_id
<< dendl
;
5628 return -CEPHFS_EINVAL
;
5632 return -CEPHFS_EINVAL
;
5637 if (name
== "layout.json") {
5638 JSONParser json_parser
;
5639 if (json_parser
.parse(value
.c_str(), value
.length()) and json_parser
.is_object()) {
5642 field
= "object_size";
5643 JSONDecoder::decode_json("object_size", layout
->object_size
, &json_parser
, true);
5645 field
= "stripe_unit";
5646 JSONDecoder::decode_json("stripe_unit", layout
->stripe_unit
, &json_parser
, true);
5648 field
= "stripe_count";
5649 JSONDecoder::decode_json("stripe_count", layout
->stripe_count
, &json_parser
, true);
5651 field
= "pool_namespace";
5652 JSONDecoder::decode_json("pool_namespace", layout
->pool_ns
, &json_parser
, false);
5655 int64_t pool_id
= 0;
5656 JSONDecoder::decode_json("pool_id", pool_id
, &json_parser
, false);
5658 field
= "pool_name";
5659 std::string pool_name
;
5660 JSONDecoder::decode_json("pool_name", pool_name
, &json_parser
, false);
5662 pool_id
= parse_pool(pool_name
, pool_id
);
5664 return (int)pool_id
;
5666 layout
->pool_id
= pool_id
;
5667 } catch (JSONDecoder::err
&) {
5668 dout(10) << __func__
<< ": json is missing a mandatory field named "
5670 return -CEPHFS_EINVAL
;
5673 dout(10) << __func__
<< ": bad json" << dendl
;
5674 return -CEPHFS_EINVAL
;
5677 dout(10) << __func__
<< ": unknown layout vxattr " << name
<< dendl
;
5678 return -CEPHFS_ENODATA
; // no such attribute
5680 } catch (boost::bad_lexical_cast
const&) {
5681 dout(10) << __func__
<< ": bad vxattr value:" << value
5682 << ", unable to parse for xattr:" << name
<< dendl
;
5683 return -CEPHFS_EINVAL
;
5688 // parse old style layout string
5689 int Server::parse_layout_vxattr_string(
5690 string name
, string value
, const OSDMap
& osdmap
, file_layout_t
*layout
)
5693 if (name
== "layout") {
5694 string::iterator begin
= value
.begin();
5695 string::iterator end
= value
.end();
5696 keys_and_values
<string::iterator
> p
; // create instance of parser
5697 std::map
<string
, string
> m
; // map to receive results
5698 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
5699 return -CEPHFS_EINVAL
;
5701 string
left(begin
, end
);
5702 dout(10) << __func__
<< ": parsed " << m
<< " left '" << left
<< "'" << dendl
;
5704 return -CEPHFS_EINVAL
;
5705 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
5706 // Skip validation on each attr, we do it once at the end (avoid
5707 // rejecting intermediate states if the overall result is ok)
5708 int r
= parse_layout_vxattr_string(string("layout.") + q
->first
, q
->second
,
5713 } else if (name
== "layout.object_size") {
5714 layout
->object_size
= boost::lexical_cast
<unsigned>(value
);
5715 } else if (name
== "layout.stripe_unit") {
5716 layout
->stripe_unit
= boost::lexical_cast
<unsigned>(value
);
5717 } else if (name
== "layout.stripe_count") {
5718 layout
->stripe_count
= boost::lexical_cast
<unsigned>(value
);
5719 } else if (name
== "layout.pool") {
5721 layout
->pool_id
= boost::lexical_cast
<unsigned>(value
);
5722 } catch (boost::bad_lexical_cast
const&) {
5723 int64_t pool
= osdmap
.lookup_pg_pool_name(value
);
5725 dout(10) << __func__
<< ": unknown pool " << value
<< dendl
;
5726 return -CEPHFS_ENOENT
;
5728 layout
->pool_id
= pool
;
5730 } else if (name
== "layout.pool_id") {
5731 layout
->pool_id
= boost::lexical_cast
<int64_t>(value
);
5732 } else if (name
== "layout.pool_name") {
5733 layout
->pool_id
= osdmap
.lookup_pg_pool_name(value
);
5734 if (layout
->pool_id
< 0) {
5735 dout(10) << __func__
<< ": unknown pool " << value
<< dendl
;
5736 return -CEPHFS_EINVAL
;
5738 } else if (name
== "layout.pool_namespace") {
5739 layout
->pool_ns
= value
;
5741 dout(10) << __func__
<< ": unknown layout vxattr " << name
<< dendl
;
5742 return -CEPHFS_ENODATA
; // no such attribute
5744 } catch (boost::bad_lexical_cast
const&) {
5745 dout(10) << __func__
<< ": bad vxattr value, unable to parse int for "
5747 return -CEPHFS_EINVAL
;
5752 int Server::parse_layout_vxattr(string name
, string value
, const OSDMap
& osdmap
,
5753 file_layout_t
*layout
, bool validate
)
5755 dout(20) << __func__
<< ": name:" << name
<< " value:'" << value
<< "'" << dendl
;
5758 if (name
== "layout.json") {
5759 r
= parse_layout_vxattr_json(name
, value
, osdmap
, layout
);
5761 r
= parse_layout_vxattr_string(name
, value
, osdmap
, layout
);
5767 if (validate
&& !layout
->is_valid()) {
5768 dout(10) << __func__
<< ": bad layout" << dendl
;
5769 return -CEPHFS_EINVAL
;
5771 if (!mds
->mdsmap
->is_data_pool(layout
->pool_id
)) {
5772 dout(10) << __func__
<< ": invalid data pool " << layout
->pool_id
<< dendl
;
5773 return -CEPHFS_EINVAL
;
5778 int Server::parse_quota_vxattr(string name
, string value
, quota_info_t
*quota
)
5780 dout(20) << "parse_quota_vxattr name " << name
<< " value '" << value
<< "'" << dendl
;
5782 if (name
== "quota") {
5783 string::iterator begin
= value
.begin();
5784 string::iterator end
= value
.end();
5786 // keep quota unchanged. (for create_quota_realm())
5789 keys_and_values
<string::iterator
> p
; // create instance of parser
5790 std::map
<string
, string
> m
; // map to receive results
5791 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
5792 return -CEPHFS_EINVAL
;
5794 string
left(begin
, end
);
5795 dout(10) << " parsed " << m
<< " left '" << left
<< "'" << dendl
;
5797 return -CEPHFS_EINVAL
;
5798 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
5799 int r
= parse_quota_vxattr(string("quota.") + q
->first
, q
->second
, quota
);
5803 } else if (name
== "quota.max_bytes") {
5804 int64_t q
= boost::lexical_cast
<int64_t>(value
);
5806 return -CEPHFS_EINVAL
;
5807 quota
->max_bytes
= q
;
5808 } else if (name
== "quota.max_files") {
5809 int64_t q
= boost::lexical_cast
<int64_t>(value
);
5811 return -CEPHFS_EINVAL
;
5812 quota
->max_files
= q
;
5814 dout(10) << " unknown quota vxattr " << name
<< dendl
;
5815 return -CEPHFS_EINVAL
;
5817 } catch (boost::bad_lexical_cast
const&) {
5818 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
5819 return -CEPHFS_EINVAL
;
5822 if (!quota
->is_valid()) {
5823 dout(10) << "bad quota" << dendl
;
5824 return -CEPHFS_EINVAL
;
5829 void Server::create_quota_realm(CInode
*in
)
5831 dout(10) << __func__
<< " " << *in
<< dendl
;
5833 auto req
= make_message
<MClientRequest
>(CEPH_MDS_OP_SETXATTR
);
5834 req
->set_filepath(filepath(in
->ino()));
5835 req
->set_string2("ceph.quota");
5836 // empty vxattr value
5837 req
->set_tid(mds
->issue_tid());
5839 mds
->send_message_mds(req
, in
->authority().first
);
5843 * Verify that the file layout attribute carried by client
5844 * is well-formatted.
5845 * Return 0 on success, otherwise this function takes
5846 * responsibility for the passed mdr.
5848 int Server::check_layout_vxattr(MDRequestRef
& mdr
,
5851 file_layout_t
*layout
)
5853 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5857 mds
->objecter
->with_osdmap([&](const OSDMap
& osdmap
) {
5858 r
= parse_layout_vxattr(name
, value
, osdmap
, layout
);
5859 epoch
= osdmap
.get_epoch();
5862 if (r
== -CEPHFS_ENOENT
) {
5864 // we don't have the specified pool, make sure our map
5865 // is newer than or as new as the client.
5866 epoch_t req_epoch
= req
->get_osdmap_epoch();
5868 if (req_epoch
> epoch
) {
5870 // well, our map is older. consult mds.
5871 auto fin
= new C_IO_Wrapper(mds
, new C_MDS_RetryRequest(mdcache
, mdr
));
5873 mds
->objecter
->wait_for_map(req_epoch
, lambdafy(fin
));
5875 } else if (req_epoch
== 0 && !mdr
->waited_for_osdmap
) {
5877 // For compatibility with client w/ old code, we still need get the
5878 // latest map. One day if COMPACT_VERSION of MClientRequest >=3,
5879 // we can remove those code.
5880 mdr
->waited_for_osdmap
= true;
5881 mds
->objecter
->wait_for_latest_osdmap(std::ref(*new C_IO_Wrapper(
5882 mds
, new C_MDS_RetryRequest(mdcache
, mdr
))));
5889 if (r
== -CEPHFS_ENOENT
)
5892 respond_to_request(mdr
, r
);
5900 void Server::handle_set_vxattr(MDRequestRef
& mdr
, CInode
*cur
)
5902 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5903 MutationImpl::LockOpVec lov
;
5904 string
name(req
->get_path2());
5905 bufferlist bl
= req
->get_data();
5906 string
value (bl
.c_str(), bl
.length());
5907 dout(10) << "handle_set_vxattr " << name
5908 << " val " << value
.length()
5909 << " bytes on " << *cur
5912 CInode::mempool_inode
*pip
= nullptr;
5915 if (!check_access(mdr
, cur
, MAY_SET_VXATTR
)) {
5919 bool adjust_realm
= false;
5920 if (name
.compare(0, 15, "ceph.dir.layout") == 0) {
5921 if (!cur
->is_dir()) {
5922 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5926 if (!xlock_policylock(mdr
, cur
, true))
5929 /* We need 'As' caps for the fscrypt context */
5930 lov
.add_xlock(&cur
->authlock
);
5931 if (!mds
->locker
->acquire_locks(mdr
, lov
)) {
5935 /* encrypted directories can't have their layout changed */
5936 if (!cur
->get_inode()->fscrypt_auth
.empty()) {
5937 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5941 file_layout_t layout
;
5942 if (cur
->get_projected_inode()->has_layout())
5943 layout
= cur
->get_projected_inode()->layout
;
5944 else if (mdr
->dir_layout
!= file_layout_t())
5945 layout
= mdr
->dir_layout
;
5947 layout
= mdcache
->default_file_layout
;
5949 rest
= name
.substr(name
.find("layout"));
5950 if (check_layout_vxattr(mdr
, rest
, value
, &layout
) < 0)
5953 auto pi
= cur
->project_inode(mdr
);
5954 pi
.inode
->layout
= layout
;
5955 mdr
->no_early_reply
= true;
5956 pip
= pi
.inode
.get();
5957 } else if (name
.compare(0, 16, "ceph.file.layout") == 0) {
5958 if (!cur
->is_file()) {
5959 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5962 if (cur
->get_projected_inode()->size
||
5963 cur
->get_projected_inode()->truncate_seq
> 1) {
5964 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
5967 file_layout_t layout
= cur
->get_projected_inode()->layout
;
5968 rest
= name
.substr(name
.find("layout"));
5969 if (check_layout_vxattr(mdr
, rest
, value
, &layout
) < 0)
5972 lov
.add_xlock(&cur
->filelock
);
5973 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5976 /* encrypted files can't have their layout changed */
5977 if (!cur
->get_inode()->fscrypt_auth
.empty()) {
5978 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5982 auto pi
= cur
->project_inode(mdr
);
5983 int64_t old_pool
= pi
.inode
->layout
.pool_id
;
5984 pi
.inode
->add_old_pool(old_pool
);
5985 pi
.inode
->layout
= layout
;
5986 pip
= pi
.inode
.get();
5987 } else if (name
.compare(0, 10, "ceph.quota") == 0) {
5988 if (!cur
->is_dir()) {
5989 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5993 quota_info_t quota
= cur
->get_projected_inode()->quota
;
5995 rest
= name
.substr(name
.find("quota"));
5996 int r
= parse_quota_vxattr(rest
, value
, "a
);
5998 respond_to_request(mdr
, r
);
6002 if (quota
.is_enabled() && !cur
->get_projected_srnode())
6003 adjust_realm
= true;
6005 if (!xlock_policylock(mdr
, cur
, false, adjust_realm
))
6008 if (cur
->get_projected_inode()->quota
== quota
) {
6009 respond_to_request(mdr
, 0);
6013 auto pi
= cur
->project_inode(mdr
, false, adjust_realm
);
6014 pi
.inode
->quota
= quota
;
6017 pi
.snapnode
->created
= pi
.snapnode
->seq
= cur
->find_snaprealm()->get_newest_seq();
6019 mdr
->no_early_reply
= true;
6020 pip
= pi
.inode
.get();
6022 client_t exclude_ct
= mdr
->get_client();
6023 mdcache
->broadcast_quota_to_client(cur
, exclude_ct
, true);
6024 } else if (name
== "ceph.dir.subvolume"sv
) {
6025 if (!cur
->is_dir()) {
6026 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6032 val
= boost::lexical_cast
<bool>(value
);
6033 } catch (boost::bad_lexical_cast
const&) {
6034 dout(10) << "bad vxattr value, unable to parse bool for " << name
<< dendl
;
6035 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6039 /* Verify it's not already a subvolume with lighter weight
6042 if (!mdr
->more()->rdonly_checks
) {
6043 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
6044 lov
.add_rdlock(&cur
->snaplock
);
6045 if (!mds
->locker
->acquire_locks(mdr
, lov
))
6047 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
6049 const auto srnode
= cur
->get_projected_srnode();
6050 if (val
== (srnode
&& srnode
->is_subvolume())) {
6051 dout(20) << "already marked subvolume" << dendl
;
6052 respond_to_request(mdr
, 0);
6055 mdr
->more()->rdonly_checks
= true;
6058 if ((mdr
->locking_state
& MutationImpl::ALL_LOCKED
) && !mdr
->is_xlocked(&cur
->snaplock
)) {
6059 /* drop the rdlock and acquire xlocks */
6060 dout(20) << "dropping rdlocks" << dendl
;
6061 mds
->locker
->drop_locks(mdr
.get());
6062 if (!xlock_policylock(mdr
, cur
, false, true))
6066 /* repeat rdonly checks in case changed between rdlock -> xlock */
6067 SnapRealm
*realm
= cur
->find_snaprealm();
6069 inodeno_t subvol_ino
= realm
->get_subvolume_ino();
6070 // can't create subvolume inside another subvolume
6071 if (subvol_ino
&& subvol_ino
!= cur
->ino()) {
6072 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6077 const auto srnode
= cur
->get_projected_srnode();
6078 if (val
== (srnode
&& srnode
->is_subvolume())) {
6079 respond_to_request(mdr
, 0);
6083 auto pi
= cur
->project_inode(mdr
, false, true);
6085 pi
.snapnode
->created
= pi
.snapnode
->seq
= realm
->get_newest_seq();
6087 pi
.snapnode
->mark_subvolume();
6089 pi
.snapnode
->clear_subvolume();
6091 mdr
->no_early_reply
= true;
6092 pip
= pi
.inode
.get();
6093 adjust_realm
= true;
6094 } else if (name
== "ceph.dir.pin"sv
) {
6095 if (!cur
->is_dir() || cur
->is_root()) {
6096 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6102 rank
= boost::lexical_cast
<mds_rank_t
>(value
);
6103 if (rank
< 0) rank
= MDS_RANK_NONE
;
6104 else if (rank
>= MAX_MDS
) {
6105 respond_to_request(mdr
, -CEPHFS_EDOM
);
6108 } catch (boost::bad_lexical_cast
const&) {
6109 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
6110 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6114 if (!xlock_policylock(mdr
, cur
))
6117 auto pi
= cur
->project_inode(mdr
);
6118 cur
->set_export_pin(rank
);
6119 pip
= pi
.inode
.get();
6120 } else if (name
== "ceph.dir.pin.random"sv
) {
6121 if (!cur
->is_dir() || cur
->is_root()) {
6122 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6128 val
= boost::lexical_cast
<double>(value
);
6129 } catch (boost::bad_lexical_cast
const&) {
6130 dout(10) << "bad vxattr value, unable to parse float for " << name
<< dendl
;
6131 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6135 if (val
< 0.0 || 1.0 < val
) {
6136 respond_to_request(mdr
, -CEPHFS_EDOM
);
6138 } else if (mdcache
->export_ephemeral_random_max
< val
) {
6139 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6143 if (!xlock_policylock(mdr
, cur
))
6146 auto pi
= cur
->project_inode(mdr
);
6147 cur
->setxattr_ephemeral_rand(val
);
6148 pip
= pi
.inode
.get();
6149 } else if (name
== "ceph.dir.pin.distributed"sv
) {
6150 if (!cur
->is_dir() || cur
->is_root()) {
6151 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6157 val
= boost::lexical_cast
<bool>(value
);
6158 } catch (boost::bad_lexical_cast
const&) {
6159 dout(10) << "bad vxattr value, unable to parse bool for " << name
<< dendl
;
6160 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6164 if (!xlock_policylock(mdr
, cur
))
6167 auto pi
= cur
->project_inode(mdr
);
6168 cur
->setxattr_ephemeral_dist(val
);
6169 pip
= pi
.inode
.get();
6171 dout(10) << " unknown vxattr " << name
<< dendl
;
6172 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6177 pip
->ctime
= mdr
->get_op_stamp();
6178 if (mdr
->get_op_stamp() > pip
->rstat
.rctime
)
6179 pip
->rstat
.rctime
= mdr
->get_op_stamp();
6180 pip
->version
= cur
->pre_dirty();
6182 pip
->update_backtrace();
6185 mdr
->ls
= mdlog
->get_current_segment();
6186 EUpdate
*le
= new EUpdate(mdlog
, "set vxattr layout");
6187 mdlog
->start_entry(le
);
6188 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6189 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
6190 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
6192 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
,
6193 false, false, adjust_realm
));
6197 void Server::handle_remove_vxattr(MDRequestRef
& mdr
, CInode
*cur
)
6199 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6200 string
name(req
->get_path2());
6202 dout(10) << __func__
<< " " << name
<< " on " << *cur
<< dendl
;
6204 if (name
== "ceph.dir.layout") {
6205 if (!cur
->is_dir()) {
6206 respond_to_request(mdr
, -CEPHFS_ENODATA
);
6209 if (cur
->is_root()) {
6210 dout(10) << "can't remove layout policy on the root directory" << dendl
;
6211 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6215 if (!cur
->get_projected_inode()->has_layout()) {
6216 respond_to_request(mdr
, -CEPHFS_ENODATA
);
6220 MutationImpl::LockOpVec lov
;
6221 lov
.add_xlock(&cur
->policylock
);
6222 if (!mds
->locker
->acquire_locks(mdr
, lov
))
6225 auto pi
= cur
->project_inode(mdr
);
6226 pi
.inode
->clear_layout();
6227 pi
.inode
->version
= cur
->pre_dirty();
6230 mdr
->ls
= mdlog
->get_current_segment();
6231 EUpdate
*le
= new EUpdate(mdlog
, "remove dir layout vxattr");
6232 mdlog
->start_entry(le
);
6233 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6234 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
6235 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
6237 mdr
->no_early_reply
= true;
6238 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
6240 } else if (name
== "ceph.dir.layout.pool_namespace"
6241 || name
== "ceph.file.layout.pool_namespace") {
6242 // Namespace is the only layout field that has a meaningful
6243 // null/none value (empty string, means default layout). Is equivalent
6244 // to a setxattr with empty string: pass through the empty payload of
6245 // the rmxattr request to do this.
6246 handle_set_vxattr(mdr
, cur
);
6250 respond_to_request(mdr
, -CEPHFS_ENODATA
);
6253 const Server::XattrHandler
Server::xattr_handlers
[] = {
6255 xattr_name
: Server::DEFAULT_HANDLER
,
6256 description
: "default xattr handler",
6257 validate
: &Server::default_xattr_validate
,
6258 setxattr
: &Server::default_setxattr_handler
,
6259 removexattr
: &Server::default_removexattr_handler
,
6262 xattr_name
: "ceph.mirror.info",
6263 description
: "mirror info xattr handler",
6264 validate
: &Server::mirror_info_xattr_validate
,
6265 setxattr
: &Server::mirror_info_setxattr_handler
,
6266 removexattr
: &Server::mirror_info_removexattr_handler
6270 const Server::XattrHandler
* Server::get_xattr_or_default_handler(std::string_view xattr_name
) {
6271 const XattrHandler
*default_xattr_handler
= nullptr;
6273 for (auto &handler
: xattr_handlers
) {
6274 if (handler
.xattr_name
== Server::DEFAULT_HANDLER
) {
6275 ceph_assert(default_xattr_handler
== nullptr);
6276 default_xattr_handler
= &handler
;
6278 if (handler
.xattr_name
== xattr_name
) {
6279 dout(20) << "handler=" << handler
.description
<< dendl
;
6284 ceph_assert(default_xattr_handler
!= nullptr);
6285 dout(20) << "handler=" << default_xattr_handler
->description
<< dendl
;
6286 return default_xattr_handler
;
6289 int Server::xattr_validate(CInode
*cur
, const InodeStoreBase::xattr_map_const_ptr xattrs
,
6290 const std::string
&xattr_name
, int op
, int flags
) {
6291 if (op
== CEPH_MDS_OP_SETXATTR
) {
6293 if ((flags
& CEPH_XATTR_CREATE
) && xattrs
->count(mempool::mds_co::string(xattr_name
))) {
6294 dout(10) << "setxattr '" << xattr_name
<< "' XATTR_CREATE and CEPHFS_EEXIST on " << *cur
<< dendl
;
6295 return -CEPHFS_EEXIST
;
6298 if ((flags
& CEPH_XATTR_REPLACE
) && !(xattrs
&& xattrs
->count(mempool::mds_co::string(xattr_name
)))) {
6299 dout(10) << "setxattr '" << xattr_name
<< "' XATTR_REPLACE and CEPHFS_ENODATA on " << *cur
<< dendl
;
6300 return -CEPHFS_ENODATA
;
6306 if (op
== CEPH_MDS_OP_RMXATTR
) {
6307 if (!xattrs
|| xattrs
->count(mempool::mds_co::string(xattr_name
)) == 0) {
6308 dout(10) << "removexattr '" << xattr_name
<< "' and CEPHFS_ENODATA on " << *cur
<< dendl
;
6309 return -CEPHFS_ENODATA
;
6315 derr
<< ": unhandled validation for: " << xattr_name
<< dendl
;
6316 return -CEPHFS_EINVAL
;
6319 void Server::xattr_set(InodeStoreBase::xattr_map_ptr xattrs
, const std::string
&xattr_name
,
6320 const bufferlist
&xattr_value
) {
6321 size_t len
= xattr_value
.length();
6322 bufferptr b
= buffer::create(len
);
6324 xattr_value
.begin().copy(len
, b
.c_str());
6326 auto em
= xattrs
->emplace(std::piecewise_construct
,
6327 std::forward_as_tuple(mempool::mds_co::string(xattr_name
)),
6328 std::forward_as_tuple(b
));
6330 em
.first
->second
= b
;
6334 void Server::xattr_rm(InodeStoreBase::xattr_map_ptr xattrs
, const std::string
&xattr_name
) {
6335 xattrs
->erase(mempool::mds_co::string(xattr_name
));
6338 int Server::default_xattr_validate(CInode
*cur
, const InodeStoreBase::xattr_map_const_ptr xattrs
,
6339 XattrOp
*xattr_op
) {
6340 return xattr_validate(cur
, xattrs
, xattr_op
->xattr_name
, xattr_op
->op
, xattr_op
->flags
);
6343 void Server::default_setxattr_handler(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
6344 const XattrOp
&xattr_op
) {
6345 xattr_set(xattrs
, xattr_op
.xattr_name
, xattr_op
.xattr_value
);
6348 void Server::default_removexattr_handler(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
6349 const XattrOp
&xattr_op
) {
6350 xattr_rm(xattrs
, xattr_op
.xattr_name
);
6353 // mirror info xattr handlers
6354 const std::string
Server::MirrorXattrInfo::MIRROR_INFO_REGEX
= "^cluster_id=([a-f0-9]{8}-" \
6355 "[a-f0-9]{4}-[a-f0-9]{4}-" \
6356 "[a-f0-9]{4}-[a-f0-9]{12})" \
6358 const std::string
Server::MirrorXattrInfo::CLUSTER_ID
= "ceph.mirror.info.cluster_id";
6359 const std::string
Server::MirrorXattrInfo::FS_ID
= "ceph.mirror.info.fs_id";
6360 int Server::parse_mirror_info_xattr(const std::string
&name
, const std::string
&value
,
6361 std::string
&cluster_id
, std::string
&fs_id
) {
6362 dout(20) << "parsing name=" << name
<< ", value=" << value
<< dendl
;
6364 static const std::regex
regex(Server::MirrorXattrInfo::MIRROR_INFO_REGEX
);
6367 std::regex_search(value
, match
, regex
);
6368 if (match
.size() != 3) {
6369 derr
<< "mirror info parse error" << dendl
;
6370 return -CEPHFS_EINVAL
;
6373 cluster_id
= match
[1];
6375 dout(20) << " parsed cluster_id=" << cluster_id
<< ", fs_id=" << fs_id
<< dendl
;
6379 int Server::mirror_info_xattr_validate(CInode
*cur
, const InodeStoreBase::xattr_map_const_ptr xattrs
,
6380 XattrOp
*xattr_op
) {
6381 if (!cur
->is_root()) {
6382 return -CEPHFS_EINVAL
;
6385 int v1
= xattr_validate(cur
, xattrs
, Server::MirrorXattrInfo::CLUSTER_ID
, xattr_op
->op
, xattr_op
->flags
);
6386 int v2
= xattr_validate(cur
, xattrs
, Server::MirrorXattrInfo::FS_ID
, xattr_op
->op
, xattr_op
->flags
);
6388 derr
<< "inconsistent mirror info state (" << v1
<< "," << v2
<< ")" << dendl
;
6389 return -CEPHFS_EINVAL
;
6396 if (xattr_op
->op
== CEPH_MDS_OP_RMXATTR
) {
6400 std::string cluster_id
;
6402 int r
= parse_mirror_info_xattr(xattr_op
->xattr_name
, xattr_op
->xattr_value
.to_str(),
6408 xattr_op
->xinfo
= std::make_unique
<MirrorXattrInfo
>(cluster_id
, fs_id
);
6412 void Server::mirror_info_setxattr_handler(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
6413 const XattrOp
&xattr_op
) {
6414 auto mirror_info
= dynamic_cast<MirrorXattrInfo
&>(*(xattr_op
.xinfo
));
6417 bl
.append(mirror_info
.cluster_id
.c_str(), mirror_info
.cluster_id
.length());
6418 xattr_set(xattrs
, Server::MirrorXattrInfo::CLUSTER_ID
, bl
);
6421 bl
.append(mirror_info
.fs_id
.c_str(), mirror_info
.fs_id
.length());
6422 xattr_set(xattrs
, Server::MirrorXattrInfo::FS_ID
, bl
);
6425 void Server::mirror_info_removexattr_handler(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
6426 const XattrOp
&xattr_op
) {
6427 xattr_rm(xattrs
, Server::MirrorXattrInfo::CLUSTER_ID
);
6428 xattr_rm(xattrs
, Server::MirrorXattrInfo::FS_ID
);
6431 void Server::handle_client_setxattr(MDRequestRef
& mdr
)
6433 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6434 string
name(req
->get_path2());
6436 // is a ceph virtual xattr?
6437 if (is_ceph_vxattr(name
)) {
6438 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6439 CInode
*cur
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
6443 handle_set_vxattr(mdr
, cur
);
6447 if (!is_allowed_ceph_xattr(name
)) {
6448 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6452 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
6456 if (mdr
->snapid
!= CEPH_NOSNAP
) {
6457 respond_to_request(mdr
, -CEPHFS_EROFS
);
6461 int flags
= req
->head
.args
.setxattr
.flags
;
6463 MutationImpl::LockOpVec lov
;
6464 lov
.add_xlock(&cur
->xattrlock
);
6465 if (!mds
->locker
->acquire_locks(mdr
, lov
))
6468 if (!check_access(mdr
, cur
, MAY_WRITE
))
6471 size_t len
= req
->get_data().length();
6472 size_t inc
= len
+ name
.length();
6474 auto handler
= Server::get_xattr_or_default_handler(name
);
6475 const auto& pxattrs
= cur
->get_projected_xattrs();
6477 // check xattrs kv pairs size
6478 size_t cur_xattrs_size
= 0;
6479 for (const auto& p
: *pxattrs
) {
6480 if ((flags
& CEPH_XATTR_REPLACE
) && name
.compare(p
.first
) == 0) {
6483 cur_xattrs_size
+= p
.first
.length() + p
.second
.length();
6486 if (((cur_xattrs_size
+ inc
) > g_conf()->mds_max_xattr_pairs_size
)) {
6487 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
6488 << cur_xattrs_size
<< ", inc " << inc
<< dendl
;
6489 respond_to_request(mdr
, -CEPHFS_ENOSPC
);
6494 XattrOp
xattr_op(CEPH_MDS_OP_SETXATTR
, name
, req
->get_data(), flags
);
6495 int r
= std::invoke(handler
->validate
, this, cur
, pxattrs
, &xattr_op
);
6497 respond_to_request(mdr
, r
);
6501 dout(10) << "setxattr '" << name
<< "' len " << len
<< " on " << *cur
<< dendl
;
6504 auto pi
= cur
->project_inode(mdr
, true);
6505 pi
.inode
->version
= cur
->pre_dirty();
6506 pi
.inode
->ctime
= mdr
->get_op_stamp();
6507 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
6508 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
6509 pi
.inode
->change_attr
++;
6510 pi
.inode
->xattr_version
++;
6512 if ((flags
& CEPH_XATTR_REMOVE
)) {
6513 std::invoke(handler
->removexattr
, this, cur
, pi
.xattrs
, xattr_op
);
6515 std::invoke(handler
->setxattr
, this, cur
, pi
.xattrs
, xattr_op
);
6519 mdr
->ls
= mdlog
->get_current_segment();
6520 EUpdate
*le
= new EUpdate(mdlog
, "setxattr");
6521 mdlog
->start_entry(le
);
6522 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6523 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
6524 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
6526 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
6529 void Server::handle_client_removexattr(MDRequestRef
& mdr
)
6531 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6532 std::string
name(req
->get_path2());
6534 // is a ceph virtual xattr?
6535 if (is_ceph_vxattr(name
)) {
6536 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6537 CInode
*cur
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
6541 handle_remove_vxattr(mdr
, cur
);
6545 if (!is_allowed_ceph_xattr(name
)) {
6546 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6550 CInode
* cur
= rdlock_path_pin_ref(mdr
, true);
6554 if (mdr
->snapid
!= CEPH_NOSNAP
) {
6555 respond_to_request(mdr
, -CEPHFS_EROFS
);
6559 MutationImpl::LockOpVec lov
;
6560 lov
.add_xlock(&cur
->xattrlock
);
6561 if (!mds
->locker
->acquire_locks(mdr
, lov
))
6565 auto handler
= Server::get_xattr_or_default_handler(name
);
6567 XattrOp
xattr_op(CEPH_MDS_OP_RMXATTR
, name
, bl
, 0);
6569 const auto& pxattrs
= cur
->get_projected_xattrs();
6570 int r
= std::invoke(handler
->validate
, this, cur
, pxattrs
, &xattr_op
);
6572 respond_to_request(mdr
, r
);
6576 dout(10) << "removexattr '" << name
<< "' on " << *cur
<< dendl
;
6579 auto pi
= cur
->project_inode(mdr
, true);
6580 pi
.inode
->version
= cur
->pre_dirty();
6581 pi
.inode
->ctime
= mdr
->get_op_stamp();
6582 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
6583 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
6584 pi
.inode
->change_attr
++;
6585 pi
.inode
->xattr_version
++;
6586 std::invoke(handler
->removexattr
, this, cur
, pi
.xattrs
, xattr_op
);
6589 mdr
->ls
= mdlog
->get_current_segment();
6590 EUpdate
*le
= new EUpdate(mdlog
, "removexattr");
6591 mdlog
->start_entry(le
);
6592 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6593 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
6594 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
6596 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
6599 void Server::handle_client_getvxattr(MDRequestRef
& mdr
)
6601 const auto& req
= mdr
->client_request
;
6602 string xattr_name
{req
->get_path2()};
6604 // is a ceph virtual xattr?
6605 if (!is_ceph_vxattr(xattr_name
)) {
6606 respond_to_request(mdr
, -CEPHFS_ENODATA
);
6610 CInode
*cur
= rdlock_path_pin_ref(mdr
, true, false);
6615 if (is_ceph_dir_vxattr(xattr_name
)) {
6616 if (!cur
->is_dir()) {
6617 respond_to_request(mdr
, -CEPHFS_ENODATA
);
6620 } else if (is_ceph_file_vxattr(xattr_name
)) {
6621 if (cur
->is_dir()) {
6622 respond_to_request(mdr
, -CEPHFS_ENODATA
);
6627 CachedStackStringStream css
;
6629 ceph::bufferlist bl
;
6630 // handle these vxattrs
6631 if ((xattr_name
.substr(0, 15) == "ceph.dir.layout"sv
) ||
6632 (xattr_name
.substr(0, 16) == "ceph.file.layout"sv
)) {
6633 std::string layout_field
;
6635 struct layout_xattr_info_t
{
6636 enum class InheritanceStatus
: uint32_t {
6642 const file_layout_t layout
;
6643 const InheritanceStatus status
;
6645 layout_xattr_info_t(const file_layout_t
& l
, InheritanceStatus inh
)
6646 : layout(l
), status(inh
) { }
6648 static std::string
status_to_string(InheritanceStatus status
) {
6650 case InheritanceStatus::DEFAULT
: return "default"s
;
6651 case InheritanceStatus::SET
: return "set"s
;
6652 case InheritanceStatus::INHERITED
: return "inherited"s
;
6653 default: return "unknown"s
;
6658 auto is_default_layout
= [&](const file_layout_t
& layout
) -> bool {
6659 return (layout
== mdcache
->default_file_layout
);
6661 auto get_inherited_layout
= [&](CInode
*cur
) -> layout_xattr_info_t
{
6665 if (cur
->get_projected_inode()->has_layout()) {
6666 auto& curr_layout
= cur
->get_projected_inode()->layout
;
6667 if (is_default_layout(curr_layout
)) {
6668 return {curr_layout
, layout_xattr_info_t::InheritanceStatus::DEFAULT
};
6670 if (cur
== orig_in
) {
6671 // we've found a new layout at this inode
6672 return {curr_layout
, layout_xattr_info_t::InheritanceStatus::SET
};
6674 return {curr_layout
, layout_xattr_info_t::InheritanceStatus::INHERITED
};
6678 if (cur
->is_root()) {
6682 cur
= cur
->get_projected_parent_dir()->get_inode();
6684 mds
->clog
->error() << "no layout found at root dir!";
6685 ceph_abort("no layout found at root dir! something is really messed up with layouts!");
6688 if (xattr_name
== "ceph.dir.layout.json"sv
||
6689 xattr_name
== "ceph.file.layout.json"sv
) {
6690 // fetch layout only for valid xattr_name
6691 const auto lxi
= get_inherited_layout(cur
);
6693 *css
<< "{\"stripe_unit\": " << lxi
.layout
.stripe_unit
6694 << ", \"stripe_count\": " << lxi
.layout
.stripe_count
6695 << ", \"object_size\": " << lxi
.layout
.object_size
6696 << ", \"pool_name\": ";
6697 mds
->objecter
->with_osdmap([lxi
, &css
](const OSDMap
& o
) {
6699 if (o
.have_pg_pool(lxi
.layout
.pool_id
)) {
6700 *css
<< o
.get_pool_name(lxi
.layout
.pool_id
);
6704 *css
<< ", \"pool_id\": " << (uint64_t)lxi
.layout
.pool_id
;
6705 *css
<< ", \"pool_namespace\": \"" << lxi
.layout
.pool_ns
<< "\"";
6706 *css
<< ", \"inheritance\": \"@"
6707 << layout_xattr_info_t::status_to_string(lxi
.status
) << "\"}";
6708 } else if ((xattr_name
== "ceph.dir.layout.pool_name"sv
) ||
6709 (xattr_name
== "ceph.file.layout.pool_name"sv
)) {
6710 // fetch layout only for valid xattr_name
6711 const auto lxi
= get_inherited_layout(cur
);
6712 mds
->objecter
->with_osdmap([lxi
, &css
](const OSDMap
& o
) {
6713 if (o
.have_pg_pool(lxi
.layout
.pool_id
)) {
6714 *css
<< o
.get_pool_name(lxi
.layout
.pool_id
);
6717 } else if ((xattr_name
== "ceph.dir.layout.pool_id"sv
) ||
6718 (xattr_name
== "ceph.file.layout.pool_id"sv
)) {
6719 // fetch layout only for valid xattr_name
6720 const auto lxi
= get_inherited_layout(cur
);
6721 *css
<< (uint64_t)lxi
.layout
.pool_id
;
6723 r
= -CEPHFS_ENODATA
; // no such attribute
6725 } else if (xattr_name
.substr(0, 12) == "ceph.dir.pin"sv
) {
6726 if (xattr_name
== "ceph.dir.pin"sv
) {
6727 *css
<< cur
->get_projected_inode()->export_pin
;
6728 } else if (xattr_name
== "ceph.dir.pin.random"sv
) {
6729 *css
<< cur
->get_projected_inode()->export_ephemeral_random_pin
;
6730 } else if (xattr_name
== "ceph.dir.pin.distributed"sv
) {
6731 *css
<< cur
->get_projected_inode()->export_ephemeral_distributed_pin
;
6733 // otherwise respond as invalid request
6734 // since we only handle ceph vxattrs here
6735 r
= -CEPHFS_ENODATA
; // no such attribute
6738 // otherwise respond as invalid request
6739 // since we only handle ceph vxattrs here
6740 r
= -CEPHFS_ENODATA
; // no such attribute
6744 ENCODE_START(1, 1, bl
);
6745 encode(css
->strv(), bl
);
6747 mdr
->reply_extra_bl
= bl
;
6750 respond_to_request(mdr
, r
);
6753 // =================================================================
6754 // DIRECTORY and NAMESPACE OPS
6757 // ------------------------------------------------
6759 struct C_WaitUnlinkToFinish
: public MDSContext
{
6765 MDSRank
*get_mds() override
6767 ceph_assert(mdcache
!= NULL
);
6768 return mdcache
->mds
;
6772 C_WaitUnlinkToFinish(MDCache
*m
, CDentry
*d
, MDSContext
*f
) :
6773 mdcache(m
), dn(d
), fin(f
) {}
6774 void finish(int r
) override
{
6776 dn
->put(CDentry::PIN_PURGING
);
6780 bool Server::is_unlink_pending(CDentry
*dn
)
6782 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
6783 if (!dnl
->is_null() && dn
->state_test(CDentry::STATE_UNLINKING
)) {
6789 void Server::wait_for_pending_unlink(CDentry
*dn
, MDRequestRef
& mdr
)
6791 dout(20) << __func__
<< " dn " << *dn
<< dendl
;
6792 mds
->locker
->drop_locks(mdr
.get());
6793 auto fin
= new C_MDS_RetryRequest(mdcache
, mdr
);
6794 dn
->get(CDentry::PIN_PURGING
);
6795 dn
->add_waiter(CDentry::WAIT_UNLINK_FINISH
, new C_WaitUnlinkToFinish(mdcache
, dn
, fin
));
6800 class C_MDS_mknod_finish
: public ServerLogContext
{
6804 C_MDS_mknod_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ni
) :
6805 ServerLogContext(s
, r
), dn(d
), newi(ni
) {}
6806 void finish(int r
) override
{
6807 ceph_assert(r
== 0);
6810 dn
->pop_projected_linkage();
6812 // be a bit hacky with the inode version, here.. we decrement it
6813 // just to keep mark_dirty() happen. (we didn't bother projecting
6814 // a new version of hte inode since it's just been created)
6815 newi
->mark_dirty(mdr
->ls
);
6816 newi
->mark_dirty_parent(mdr
->ls
, true);
6819 if (newi
->is_dir()) {
6820 CDir
*dir
= newi
->get_dirfrag(frag_t());
6822 dir
->mark_dirty(mdr
->ls
);
6823 dir
->mark_new(mdr
->ls
);
6828 MDRequestRef null_ref
;
6829 get_mds()->mdcache
->send_dentry_link(dn
, null_ref
);
6831 if (newi
->is_file()) {
6832 get_mds()->locker
->share_inode_max_size(newi
);
6833 } else if (newi
->is_dir()) {
6834 // We do this now so that the linkages on the new directory are stable.
6835 newi
->maybe_ephemeral_rand();
6839 get_mds()->balancer
->hit_inode(newi
, META_POP_IWR
);
6842 server
->respond_to_request(mdr
, 0);
6847 void Server::handle_client_mknod(MDRequestRef
& mdr
)
6849 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6850 client_t client
= mdr
->get_client();
6852 unsigned mode
= req
->head
.args
.mknod
.mode
;
6853 if ((mode
& S_IFMT
) == 0)
6856 mdr
->disable_lock_cache();
6857 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, true, false, false, S_ISREG(mode
));
6861 if (is_unlink_pending(dn
)) {
6862 wait_for_pending_unlink(dn
, mdr
);
6866 CDir
*dir
= dn
->get_dir();
6867 CInode
*diri
= dir
->get_inode();
6868 if (!check_access(mdr
, diri
, MAY_WRITE
))
6870 if (!check_fragment_space(mdr
, dir
))
6872 if (!check_dir_max_entries(mdr
, dir
))
6875 ceph_assert(dn
->get_projected_linkage()->is_null());
6876 if (req
->get_alternate_name().size() > alternate_name_max
) {
6877 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
6878 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
6881 dn
->set_alternate_name(req
->get_alternate_name());
6884 file_layout_t layout
;
6885 if (mdr
->dir_layout
!= file_layout_t())
6886 layout
= mdr
->dir_layout
;
6888 layout
= mdcache
->default_file_layout
;
6890 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
), mode
, &layout
);
6893 dn
->push_projected_linkage(newi
);
6895 auto _inode
= newi
->_get_inode();
6896 _inode
->version
= dn
->pre_dirty();
6897 _inode
->rdev
= req
->head
.args
.mknod
.rdev
;
6898 _inode
->rstat
.rfiles
= 1;
6899 _inode
->accounted_rstat
= _inode
->rstat
;
6900 if (layout
.pool_id
!= mdcache
->default_file_layout
.pool_id
)
6901 _inode
->add_old_pool(mdcache
->default_file_layout
.pool_id
);
6902 _inode
->update_backtrace();
6904 snapid_t follows
= mdcache
->get_global_snaprealm()->get_newest_seq();
6905 SnapRealm
*realm
= dn
->get_dir()->inode
->find_snaprealm();
6906 ceph_assert(follows
>= realm
->get_newest_seq());
6908 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
6909 // want to write to it (e.g., if they are reexporting NFS)
6910 if (S_ISREG(_inode
->mode
)) {
6911 // issue a cap on the file
6912 int cmode
= CEPH_FILE_MODE_RDWR
;
6913 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
, realm
);
6917 // put locks in excl mode
6918 newi
->filelock
.set_state(LOCK_EXCL
);
6919 newi
->authlock
.set_state(LOCK_EXCL
);
6920 newi
->xattrlock
.set_state(LOCK_EXCL
);
6922 dout(15) << " setting a client_range too, since this is a regular file" << dendl
;
6923 _inode
->client_ranges
[client
].range
.first
= 0;
6924 _inode
->client_ranges
[client
].range
.last
= _inode
->layout
.stripe_unit
;
6925 _inode
->client_ranges
[client
].follows
= follows
;
6926 newi
->mark_clientwriteable();
6927 cap
->mark_clientwriteable();
6931 ceph_assert(dn
->first
== follows
+ 1);
6932 newi
->first
= dn
->first
;
6934 dout(10) << "mknod mode " << _inode
->mode
<< " rdev " << _inode
->rdev
<< dendl
;
6937 mdr
->ls
= mdlog
->get_current_segment();
6938 EUpdate
*le
= new EUpdate(mdlog
, "mknod");
6939 mdlog
->start_entry(le
);
6940 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6941 journal_allocated_inos(mdr
, &le
->metablob
);
6943 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(),
6944 PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
6945 le
->metablob
.add_primary_dentry(dn
, newi
, true, true, true);
6947 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
6948 mds
->balancer
->maybe_fragment(dn
->get_dir(), false);
6954 /* This function takes responsibility for the passed mdr*/
6955 void Server::handle_client_mkdir(MDRequestRef
& mdr
)
6957 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6959 mdr
->disable_lock_cache();
6960 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, true);
6964 if (is_unlink_pending(dn
)) {
6965 wait_for_pending_unlink(dn
, mdr
);
6969 CDir
*dir
= dn
->get_dir();
6970 CInode
*diri
= dir
->get_inode();
6972 // mkdir check access
6973 if (!check_access(mdr
, diri
, MAY_WRITE
))
6976 if (!check_fragment_space(mdr
, dir
))
6978 if (!check_dir_max_entries(mdr
, dir
))
6981 ceph_assert(dn
->get_projected_linkage()->is_null());
6982 if (req
->get_alternate_name().size() > alternate_name_max
) {
6983 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
6984 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
6987 dn
->set_alternate_name(req
->get_alternate_name());
6990 unsigned mode
= req
->head
.args
.mkdir
.mode
;
6993 CInode
*newi
= prepare_new_inode(mdr
, dir
, inodeno_t(req
->head
.ino
), mode
);
6996 // it's a directory.
6997 dn
->push_projected_linkage(newi
);
6999 auto _inode
= newi
->_get_inode();
7000 _inode
->version
= dn
->pre_dirty();
7001 _inode
->rstat
.rsubdirs
= 1;
7002 _inode
->accounted_rstat
= _inode
->rstat
;
7003 _inode
->update_backtrace();
7005 snapid_t follows
= mdcache
->get_global_snaprealm()->get_newest_seq();
7006 SnapRealm
*realm
= dn
->get_dir()->inode
->find_snaprealm();
7007 ceph_assert(follows
>= realm
->get_newest_seq());
7009 dout(12) << " follows " << follows
<< dendl
;
7010 ceph_assert(dn
->first
== follows
+ 1);
7011 newi
->first
= dn
->first
;
7013 // ...and that new dir is empty.
7014 CDir
*newdir
= newi
->get_or_open_dirfrag(mdcache
, frag_t());
7015 newdir
->state_set(CDir::STATE_CREATING
);
7016 newdir
->mark_complete();
7017 newdir
->_get_fnode()->version
= newdir
->pre_dirty();
7020 mdr
->ls
= mdlog
->get_current_segment();
7021 EUpdate
*le
= new EUpdate(mdlog
, "mkdir");
7022 mdlog
->start_entry(le
);
7023 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
7024 journal_allocated_inos(mdr
, &le
->metablob
);
7025 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
7026 le
->metablob
.add_primary_dentry(dn
, newi
, true, true);
7027 le
->metablob
.add_new_dir(newdir
); // dirty AND complete AND new
7029 // issue a cap on the directory
7030 int cmode
= CEPH_FILE_MODE_RDWR
;
7031 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
, realm
);
7035 // put locks in excl mode
7036 newi
->filelock
.set_state(LOCK_EXCL
);
7037 newi
->authlock
.set_state(LOCK_EXCL
);
7038 newi
->xattrlock
.set_state(LOCK_EXCL
);
7041 // make sure this inode gets into the journal
7042 le
->metablob
.add_opened_ino(newi
->ino());
7044 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
7046 // We hit_dir (via hit_inode) in our finish callback, but by then we might
7047 // have overshot the split size (multiple mkdir in flight), so here is
7048 // an early chance to split the dir if this mkdir makes it oversized.
7049 mds
->balancer
->maybe_fragment(dir
, false);
// Handle a client CEPH_MDS_OP_SYMLINK request: acquire the target dentry
// under xlock, allocate a new symlink inode, record the link target and
// its size in the inode, then journal the creation (EUpdate "symlink")
// and reply via C_MDS_mknod_finish.
// NOTE(review): several original lines (braces / early returns) are
// missing from this extract; code text left byte-identical.
7055 void Server::handle_client_symlink(MDRequestRef
& mdr
)
7057 const auto& req
= mdr
->client_request
;
7059 mdr
->disable_lock_cache();
7060 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, true);
// Defer if another request is still unlinking this dentry.
7064 if (is_unlink_pending(dn
)) {
7065 wait_for_pending_unlink(dn
, mdr
);
7069 CDir
*dir
= dn
->get_dir();
7070 CInode
*diri
= dir
->get_inode();
// Permission / quota-style checks on the parent directory.
7072 if (!check_access(mdr
, diri
, MAY_WRITE
))
7074 if (!check_fragment_space(mdr
, dir
))
7076 if (!check_dir_max_entries(mdr
, dir
))
7079 ceph_assert(dn
->get_projected_linkage()->is_null());
// Reject over-long alternate (encrypted) names.
7080 if (req
->get_alternate_name().size() > alternate_name_max
) {
7081 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
7082 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
7084 dn
->set_alternate_name(req
->get_alternate_name());
7086 unsigned mode
= S_IFLNK
| 0777;
7087 CInode
*newi
= prepare_new_inode(mdr
, dir
, inodeno_t(req
->head
.ino
), mode
);
7091 dn
->push_projected_linkage(newi
);
// The symlink target comes in as the request's second path; the inode's
// size/rbytes are the length of that target string.
7093 newi
->symlink
= req
->get_path2();
7094 auto _inode
= newi
->_get_inode();
7095 _inode
->version
= dn
->pre_dirty();
7096 _inode
->size
= newi
->symlink
.length();
7097 _inode
->rstat
.rbytes
= _inode
->size
;
7098 _inode
->rstat
.rfiles
= 1;
7099 _inode
->accounted_rstat
= _inode
->rstat
;
7100 _inode
->update_backtrace();
7102 newi
->first
= dn
->first
;
// Journal the creation and reply.
7105 mdr
->ls
= mdlog
->get_current_segment();
7106 EUpdate
*le
= new EUpdate(mdlog
, "symlink");
7107 mdlog
->start_entry(le
);
7108 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
7109 journal_allocated_inos(mdr
, &le
->metablob
);
7110 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
7111 le
->metablob
.add_primary_dentry(dn
, newi
, true, true);
7113 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
// Early chance to split the dir if this entry makes it oversized.
7114 mds
->balancer
->maybe_fragment(dir
, false);
// Handle a client hard-link request (filepath = new name, filepath2 =
// existing target). Resolves the target inode, takes snap/link locks,
// runs access and capacity checks, rejects cross-subvolume links and
// links to directories, then dispatches to _link_local (target is auth
// here) or _link_remote (target is auth elsewhere).
// NOTE(review): extract is missing some lines (braces, returns);
// code text left byte-identical.
7123 void Server::handle_client_link(MDRequestRef
& mdr
)
7125 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
7127 dout(7) << "handle_client_link " << req
->get_filepath()
7128 << " to " << req
->get_filepath2()
7131 mdr
->disable_lock_cache();
// filepath2 with depth 0 means target given by inode number only;
// attempt cache lookup and, on miss, recover via find_ino_peers.
7136 if (req
->get_filepath2().depth() == 0) {
7137 targeti
= mdcache
->get_inode(req
->get_filepath2().get_ino());
7139 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl
;
7140 inodeno_t ino
= req
->get_filepath2().get_ino();
7141 mdcache
->find_ino_peers(ino
, new C_MDS_TryFindInode(this, mdr
, mdcache
, ino
));
// Take snap layout rdlock on the target's parent (once per request).
7146 if (!(mdr
->locking_state
& MutationImpl::SNAP2_LOCKED
)) {
7147 CDentry
*pdn
= targeti
->get_projected_parent_dn();
7149 dout(7) << "target has no parent dn, failing..." << dendl
;
7150 respond_to_request(mdr
, -CEPHFS_EINVAL
);
7153 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
, 1))
7155 mdr
->locking_state
|= MutationImpl::SNAP2_LOCKED
;
7158 destdn
= rdlock_path_xlock_dentry(mdr
, false);
7162 auto ret
= rdlock_two_paths_xlock_destdn(mdr
, false);
7167 if (!destdn
->get_projected_linkage()->is_null()) {
7168 respond_to_request(mdr
, -CEPHFS_EEXIST
);
7172 targeti
= ret
.second
->get_projected_linkage()->get_inode();
7175 if (is_unlink_pending(destdn
)) {
7176 wait_for_pending_unlink(destdn
, mdr
);
7180 ceph_assert(destdn
->get_projected_linkage()->is_null());
7181 if (req
->get_alternate_name().size() > alternate_name_max
) {
7182 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
7183 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
7186 destdn
->set_alternate_name(req
->get_alternate_name());
// Hard links to directories are not allowed.
7188 if (targeti
->is_dir()) {
7189 dout(7) << "target is a dir, failing..." << dendl
;
7190 respond_to_request(mdr
, -CEPHFS_EINVAL
);
7194 CDir
*dir
= destdn
->get_dir();
7195 dout(7) << "handle_client_link link " << destdn
->get_name() << " in " << *dir
<< dendl
;
7196 dout(7) << "target is " << *targeti
<< dendl
;
// xlock the target's snaplock and linklock (once per request).
7198 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
7199 MutationImpl::LockOpVec lov
;
7200 lov
.add_xlock(&targeti
->snaplock
);
7201 lov
.add_xlock(&targeti
->linklock
);
7203 if (!mds
->locker
->acquire_locks(mdr
, lov
))
7206 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
7209 if (targeti
->get_projected_inode()->nlink
== 0) {
7210 dout(7) << "target has no link, failing..." << dendl
;
7211 respond_to_request(mdr
, -CEPHFS_ENOENT
);
// Only run access/space checks before any peer has witnessed us.
7215 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
7216 if (!check_access(mdr
, targeti
, MAY_WRITE
))
7219 if (!check_access(mdr
, dir
->get_inode(), MAY_WRITE
))
7222 if (!check_fragment_space(mdr
, dir
))
7225 if (!check_dir_max_entries(mdr
, dir
))
// Disallow links across subvolume boundaries; if the target is in a
// stray dir (being reintegrated), wait and retry instead of failing.
7229 CInode
* target_pin
= targeti
->get_projected_parent_dir()->inode
;
7230 SnapRealm
*target_realm
= target_pin
->find_snaprealm();
7231 if (target_pin
!= dir
->inode
&&
7232 target_realm
->get_subvolume_ino() !=
7233 dir
->inode
->find_snaprealm()->get_subvolume_ino()) {
7234 if (target_pin
->is_stray()) {
7235 mds
->locker
->drop_locks(mdr
.get());
7236 targeti
->add_waiter(CInode::WAIT_UNLINK
,
7237 new C_MDS_RetryRequest(mdcache
, mdr
));
7241 dout(7) << "target is in different subvolume, failing..." << dendl
;
7242 respond_to_request(mdr
, -CEPHFS_EXDEV
);
7247 ceph_assert(g_conf()->mds_kill_link_at
!= 1);
// Local vs remote: depends on which MDS is auth for the target inode.
7250 if (targeti
->is_auth())
7251 _link_local(mdr
, destdn
, targeti
, target_realm
);
7253 _link_remote(mdr
, true, destdn
, targeti
);
7254 mds
->balancer
->maybe_fragment(dir
, false);
// Log-completion context for _link_local: once the EUpdate commits,
// forwards to Server::_link_local_finish with the saved dentry/target
// versions and the adjust_realm flag.
// NOTE(review): member declarations are missing from this extract.
7258 class C_MDS_link_local_finish
: public ServerLogContext
{
7265 C_MDS_link_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ti
,
7266 version_t dnpv_
, version_t tipv_
, bool ar
) :
7267 ServerLogContext(s
, r
), dn(d
), targeti(ti
),
7268 dnpv(dnpv_
), tipv(tipv_
), adjust_realm(ar
) { }
7269 void finish(int r
) override
{
7270 ceph_assert(r
== 0);
7271 server
->_link_local_finish(mdr
, dn
, targeti
, dnpv
, tipv
, adjust_realm
);
// Perform a hard link when this MDS is auth for the target inode:
// project ctime/nlink-related inode updates, optionally promote the
// target's snaprealm to "global" (when the target realm is not a
// subvolume), then journal an EUpdate("link_local") adding the new
// remote dentry, finishing in C_MDS_link_local_finish.
7276 void Server::_link_local(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
, SnapRealm
*target_realm
)
7278 dout(10) << "_link_local " << *dn
<< " to " << *targeti
<< dendl
;
7280 mdr
->ls
= mdlog
->get_current_segment();
7282 // predirty NEW dentry
7283 version_t dnpv
= dn
->pre_dirty();
7284 version_t tipv
= targeti
->pre_dirty();
7286 // project inode update
7287 auto pi
= targeti
->project_inode(mdr
);
7289 pi
.inode
->ctime
= mdr
->get_op_stamp();
7290 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
7291 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
7292 pi
.inode
->change_attr
++;
7293 pi
.inode
->version
= tipv
;
// If the target realm is not a subvolume and the target is not yet a
// global snaprealm, promote it and remember to notify after commit.
7295 bool adjust_realm
= false;
7296 if (!target_realm
->get_subvolume_ino() && !targeti
->is_projected_snaprealm_global()) {
7297 sr_t
*newsnap
= targeti
->project_snaprealm();
7298 targeti
->mark_snaprealm_global(newsnap
);
7299 targeti
->record_snaprealm_parent_dentry(newsnap
, target_realm
, targeti
->get_projected_parent_dn(), true);
7300 adjust_realm
= true;
// Journal the new remote dentry plus the dirtied target inode.
7304 EUpdate
*le
= new EUpdate(mdlog
, "link_local");
7305 mdlog
->start_entry(le
);
7306 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
7307 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1); // new dn
7308 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, 0, PREDIRTY_PRIMARY
); // targeti
7309 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
7310 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, targeti
);
7312 // do this after predirty_*, to avoid funky extra dnl arg
7313 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
7315 journal_and_reply(mdr
, targeti
, dn
, le
,
7316 new C_MDS_link_local_finish(this, mdr
, dn
, targeti
, dnpv
, tipv
, adjust_realm
));
// Journal-commit completion for _link_local: pop the projected linkage
// (linking the remote dentry to the target), mark dirty, broadcast the
// dentry link to replicas, send snaprealm split notifications when the
// realm was adjusted, bump popularity, and reply to the client.
7319 void Server::_link_local_finish(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
,
7320 version_t dnpv
, version_t tipv
, bool adjust_realm
)
7322 dout(10) << "_link_local_finish " << *dn
<< " to " << *targeti
<< dendl
;
7324 // link and unlock the NEW dentry
7325 CDentry::linkage_t
*dnl
= dn
->pop_projected_linkage();
7326 if (!dnl
->get_inode())
7327 dn
->link_remote(dnl
, targeti
);
7328 dn
->mark_dirty(dnpv
, mdr
->ls
);
7333 MDRequestRef null_ref
;
7334 mdcache
->send_dentry_link(dn
, null_ref
);
// Notify clients/replicas of the snaprealm split when it was adjusted.
7337 int op
= CEPH_SNAP_OP_SPLIT
;
7338 mds
->mdcache
->send_snap_update(targeti
, 0, op
);
7339 mds
->mdcache
->do_realm_invalidate_and_update_notify(targeti
, op
);
7342 // bump target popularity
7343 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
7344 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
7347 respond_to_request(mdr
, 0);
7351 // link / unlink remote
// Log-completion context for _link_remote; `inc` distinguishes link
// (nlink++) from unlink (nlink--). Captures the dentry's projected
// version at construction time and forwards to _link_remote_finish.
// NOTE(review): member declarations are missing from this extract.
7353 class C_MDS_link_remote_finish
: public ServerLogContext
{
7359 C_MDS_link_remote_finish(Server
*s
, MDRequestRef
& r
, bool i
, CDentry
*d
, CInode
*ti
) :
7360 ServerLogContext(s
, r
), inc(i
), dn(d
), targeti(ti
),
7361 dpv(d
->get_projected_version()) {}
7362 void finish(int r
) override
{
7363 ceph_assert(r
== 0);
7364 server
->_link_remote_finish(mdr
, inc
, dn
, targeti
, dpv
);
// Link/unlink when the target inode is auth on another MDS: first ask
// the target's auth MDS to prepare the nlink change (LINKPREP /
// UNLINKPREP peer request) and wait for its witness ack; once
// witnessed, journal the local dentry change (new remote dentry for
// link, null dentry for unlink) as a leader update.
7368 void Server::_link_remote(MDRequestRef
& mdr
, bool inc
, CDentry
*dn
, CInode
*targeti
)
7370 dout(10) << "_link_remote "
7371 << (inc
? "link ":"unlink ")
7372 << *dn
<< " to " << *targeti
<< dendl
;
7374 // 1. send LinkPrepare to dest (journal nlink++ prepare)
7375 mds_rank_t linkauth
= targeti
->authority().first
;
7376 if (mdr
->more()->witnessed
.count(linkauth
) == 0) {
// If the auth MDS is not usable yet, wait for it to become active.
7377 if (mds
->is_cluster_degraded() &&
7378 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(linkauth
)) {
7379 dout(10) << " targeti auth mds." << linkauth
<< " is not active" << dendl
;
7380 if (mdr
->more()->waiting_on_peer
.empty())
7381 mds
->wait_for_active_peer(linkauth
, new C_MDS_RetryRequest(mdcache
, mdr
));
7385 dout(10) << " targeti auth must prepare nlink++/--" << dendl
;
7388 op
= MMDSPeerRequest::OP_LINKPREP
;
7390 op
= MMDSPeerRequest::OP_UNLINKPREP
;
7391 auto req
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, op
);
7392 targeti
->set_object_info(req
->get_object_info());
7393 req
->op_stamp
= mdr
->get_op_stamp();
7394 if (auto& desti_srnode
= mdr
->more()->desti_srnode
)
7395 encode(*desti_srnode
, req
->desti_snapbl
);
7396 mds
->send_message_mds(req
, linkauth
);
7398 ceph_assert(mdr
->more()->waiting_on_peer
.count(linkauth
) == 0);
7399 mdr
->more()->waiting_on_peer
.insert(linkauth
);
// 2. Peer has prepared: journal our side of the update.
7402 dout(10) << " targeti auth has prepared nlink++/--" << dendl
;
7404 ceph_assert(g_conf()->mds_kill_link_at
!= 2);
7406 if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
7407 delete desti_srnode
;
7408 desti_srnode
= NULL
;
7411 mdr
->set_mds_stamp(ceph_clock_now());
7414 mdr
->ls
= mdlog
->get_current_segment();
7415 EUpdate
*le
= new EUpdate(mdlog
, inc
? "link_remote":"unlink_remote");
7416 mdlog
->start_entry(le
);
7417 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
7418 if (!mdr
->more()->witnessed
.empty()) {
7419 dout(20) << " noting uncommitted_peers " << mdr
->more()->witnessed
<< dendl
;
7420 le
->reqid
= mdr
->reqid
;
7421 le
->had_peers
= true;
7422 mdcache
->add_uncommitted_leader(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
// link: add the remote dentry; unlink: null out the existing dentry.
7427 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1);
7428 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
7429 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
7432 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, -1);
7433 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
7434 le
->metablob
.add_null_dentry(dn
, true);
7435 dn
->push_projected_linkage();
7438 journal_and_reply(mdr
, (inc
? targeti
: nullptr), dn
, le
,
7439 new C_MDS_link_remote_finish(this, mdr
, inc
, dn
, targeti
));
// Journal-commit completion for _link_remote: apply the projected
// linkage (link the remote dentry, or unlink the old one), notify
// replicas, wake unlink waiters, bump popularity, and reply.
7442 void Server::_link_remote_finish(MDRequestRef
& mdr
, bool inc
,
7443 CDentry
*dn
, CInode
*targeti
,
7446 dout(10) << "_link_remote_finish "
7447 << (inc
? "link ":"unlink ")
7448 << *dn
<< " to " << *targeti
<< dendl
;
7450 ceph_assert(g_conf()->mds_kill_link_at
!= 3);
7452 if (!mdr
->more()->witnessed
.empty())
7453 mdcache
->logged_leader_update(mdr
->reqid
);
7456 // link the new dentry
7457 CDentry::linkage_t
*dnl
= dn
->pop_projected_linkage();
7458 if (!dnl
->get_inode())
7459 dn
->link_remote(dnl
, targeti
);
7460 dn
->mark_dirty(dpv
, mdr
->ls
);
7462 // unlink main dentry
7463 dn
->get_dir()->unlink_inode(dn
);
7464 dn
->pop_projected_linkage();
7465 dn
->mark_dirty(dn
->get_projected_version(), mdr
->ls
); // dirty old dentry
7470 MDRequestRef null_ref
;
7472 mdcache
->send_dentry_link(dn
, null_ref
);
// Unlink path: clear the UNLINKING state and tell replicas.
7474 dn
->state_clear(CDentry::STATE_UNLINKING
);
7475 mdcache
->send_dentry_unlink(dn
, NULL
, null_ref
);
7477 MDSContext::vec finished
;
7478 dn
->take_waiting(CDentry::WAIT_UNLINK_FINISH
, finished
);
7479 mdcache
->mds
->queue_waiters(finished
);
7482 // bump target popularity
7483 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
7484 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
7487 respond_to_request(mdr
, 0);
7490 // removing a new dn?
7491 dn
->get_dir()->try_remove_unlinked_dn(dn
);
7495 // remote linking/unlinking
// Log context used by handle_peer_link_prep: once the peer's PREPARE
// entry is journaled, forward to _logged_peer_link.
// NOTE(review): member declarations are missing from this extract.
7497 class C_MDS_PeerLinkPrep
: public ServerLogContext
{
7501 C_MDS_PeerLinkPrep(Server
*s
, MDRequestRef
& r
, CInode
*t
, bool ar
) :
7502 ServerLogContext(s
, r
), targeti(t
), adjust_realm(ar
) { }
7503 void finish(int r
) override
{
7504 ceph_assert(r
== 0);
7505 server
->_logged_peer_link(mdr
, targeti
, adjust_realm
);
// Commit waiter installed as mdr->more()->peer_commit: invoked when the
// leader tells this peer to commit (or abort, via r) the link prepare;
// forwards to _commit_peer_link.
// NOTE(review): member declarations are missing from this extract.
7509 class C_MDS_PeerLinkCommit
: public ServerContext
{
7513 C_MDS_PeerLinkCommit(Server
*s
, MDRequestRef
& r
, CInode
*t
) :
7514 ServerContext(s
), mdr(r
), targeti(t
) { }
7515 void finish(int r
) override
{
7516 server
->_commit_peer_link(mdr
, r
, targeti
);
// Peer-side handler for LINKPREP/UNLINKPREP from the leader: project
// the target inode's nlink/ctime change and any snaprealm update,
// record rollback state (old ctime and parent dir times), journal an
// EPeerUpdate PREPARE, and ack back to the leader from the log context.
// NOTE(review): extract is missing some lines; code left byte-identical.
7520 void Server::handle_peer_link_prep(MDRequestRef
& mdr
)
7522 dout(10) << "handle_peer_link_prep " << *mdr
7523 << " on " << mdr
->peer_request
->get_object_info()
7526 ceph_assert(g_conf()->mds_kill_link_at
!= 4);
7528 CInode
*targeti
= mdcache
->get_inode(mdr
->peer_request
->get_object_info().ino
);
7529 ceph_assert(targeti
);
7530 dout(10) << "targeti " << *targeti
<< dendl
;
7531 CDentry
*dn
= targeti
->get_parent_dn();
7532 CDentry::linkage_t
*dnl
= dn
->get_linkage();
7533 ceph_assert(dnl
->is_primary());
7535 mdr
->set_op_stamp(mdr
->peer_request
->op_stamp
);
7537 mdr
->auth_pin(targeti
);
7539 //ceph_abort(); // test hack: make sure leader can handle a peer that fails to prepare...
7540 ceph_assert(g_conf()->mds_kill_link_at
!= 5);
7543 mdr
->ls
= mdlog
->get_current_segment();
7544 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_link_prep", mdr
->reqid
, mdr
->peer_to_mds
,
7545 EPeerUpdate::OP_PREPARE
, EPeerUpdate::LINK
);
7546 mdlog
->start_entry(le
);
7548 auto pi
= dnl
->get_inode()->project_inode(mdr
);
7550 // update journaled target inode
// LINKPREP: may need to promote the target snaprealm to global here,
// mirroring _link_local's logic on the leader.
7552 bool adjust_realm
= false;
7553 bool realm_projected
= false;
7554 if (mdr
->peer_request
->get_op() == MMDSPeerRequest::OP_LINKPREP
) {
7558 CDentry
*target_pdn
= targeti
->get_projected_parent_dn();
7559 SnapRealm
*target_realm
= target_pdn
->get_dir()->inode
->find_snaprealm();
7560 if (!target_realm
->get_subvolume_ino() && !targeti
->is_projected_snaprealm_global()) {
7561 sr_t
*newsnap
= targeti
->project_snaprealm();
7562 targeti
->mark_snaprealm_global(newsnap
);
7563 targeti
->record_snaprealm_parent_dentry(newsnap
, target_realm
, target_pdn
, true);
7564 adjust_realm
= true;
7565 realm_projected
= true;
// UNLINKPREP path: apply the snaprealm blob sent by the leader.
7570 if (targeti
->is_projected_snaprealm_global()) {
7571 ceph_assert(mdr
->peer_request
->desti_snapbl
.length());
7572 auto p
= mdr
->peer_request
->desti_snapbl
.cbegin();
7574 sr_t
*newsnap
= targeti
->project_snaprealm();
7575 decode(*newsnap
, p
);
7577 if (pi
.inode
->nlink
== 0)
7578 ceph_assert(!newsnap
->is_parent_global());
7580 realm_projected
= true;
7582 ceph_assert(mdr
->peer_request
->desti_snapbl
.length() == 0);
// Record everything needed to undo this prepare if the leader aborts.
7586 link_rollback rollback
;
7587 rollback
.reqid
= mdr
->reqid
;
7588 rollback
.ino
= targeti
->ino();
7589 rollback
.old_ctime
= targeti
->get_inode()->ctime
; // we hold versionlock xlock; no concorrent projections
7590 const auto& pf
= targeti
->get_parent_dn()->get_dir()->get_projected_fnode();
7591 rollback
.old_dir_mtime
= pf
->fragstat
.mtime
;
7592 rollback
.old_dir_rctime
= pf
->rstat
.rctime
;
7593 rollback
.was_inc
= inc
;
7594 if (realm_projected
) {
7595 if (targeti
->snaprealm
) {
7596 encode(true, rollback
.snapbl
);
7597 targeti
->encode_snap_blob(rollback
.snapbl
);
7599 encode(false, rollback
.snapbl
);
7602 encode(rollback
, le
->rollback
);
7603 mdr
->more()->rollback_bl
= le
->rollback
;
7605 pi
.inode
->ctime
= mdr
->get_op_stamp();
7606 pi
.inode
->version
= targeti
->pre_dirty();
7608 dout(10) << " projected inode " << pi
.inode
->ino
<< " v " << pi
.inode
->version
<< dendl
;
// Journal it, register the uncommitted peer update, and set up the
// commit waiter the leader's COMMIT/ABORT will trigger.
7611 mdcache
->predirty_journal_parents(mdr
, &le
->commit
, dnl
->get_inode(), 0, PREDIRTY_SHALLOW
|PREDIRTY_PRIMARY
);
7612 mdcache
->journal_dirty_inode(mdr
.get(), &le
->commit
, targeti
);
7613 mdcache
->add_uncommitted_peer(mdr
->reqid
, mdr
->ls
, mdr
->peer_to_mds
);
7615 // set up commit waiter
7616 mdr
->more()->peer_commit
= new C_MDS_PeerLinkCommit(this, mdr
, targeti
);
7618 mdr
->more()->peer_update_journaled
= true;
7619 submit_mdlog_entry(le
, new C_MDS_PeerLinkPrep(this, mdr
, targeti
, adjust_realm
),
// Runs after the peer's PREPARE entry is journaled: apply the update,
// send snaprealm split notifications when the realm was adjusted, and
// ack the leader with OP_LINKPREPACK (unless the request was aborted).
7624 void Server::_logged_peer_link(MDRequestRef
& mdr
, CInode
*targeti
, bool adjust_realm
)
7626 dout(10) << "_logged_peer_link " << *mdr
7627 << " " << *targeti
<< dendl
;
7629 ceph_assert(g_conf()->mds_kill_link_at
!= 6);
7631 // update the target
7635 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
7638 mdr
->reset_peer_request();
// Realm was promoted to global: notify caps holders / replicas.
7641 int op
= CEPH_SNAP_OP_SPLIT
;
7642 mds
->mdcache
->send_snap_update(targeti
, 0, op
);
7643 mds
->mdcache
->do_realm_invalidate_and_update_notify(targeti
, op
);
7647 if (!mdr
->aborted
) {
7648 auto reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_LINKPREPACK
);
7649 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
7651 dout(10) << " abort flag set, finishing" << dendl
;
7652 mdcache
->request_finish(mdr
);
// Log context fired once the peer's COMMIT entry is journaled;
// forwards to Server::_committed_peer.
7657 struct C_MDS_CommittedPeer
: public ServerLogContext
{
7658 C_MDS_CommittedPeer(Server
*s
, MDRequestRef
& m
) : ServerLogContext(s
, m
) {}
7659 void finish(int r
) override
{
7660 server
->_committed_peer(mdr
);
// Commit/abort decision point on the peer: on commit, journal an
// EPeerUpdate COMMIT entry (finishing in _committed_peer); otherwise
// roll the prepared update back via do_link_rollback.
7664 void Server::_commit_peer_link(MDRequestRef
& mdr
, int r
, CInode
*targeti
)
7666 dout(10) << "_commit_peer_link " << *mdr
7668 << " " << *targeti
<< dendl
;
7670 ceph_assert(g_conf()->mds_kill_link_at
!= 7);
7673 // drop our pins, etc.
7676 // write a commit to the journal
7677 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_link_commit", mdr
->reqid
, mdr
->peer_to_mds
,
7678 EPeerUpdate::OP_COMMIT
, EPeerUpdate::LINK
);
7679 mdlog
->start_entry(le
);
7680 submit_mdlog_entry(le
, new C_MDS_CommittedPeer(this, mdr
), mdr
, __func__
);
// Abort path: undo the prepared update from the saved rollback blob.
7683 do_link_rollback(mdr
->more()->rollback_bl
, mdr
->peer_to_mds
, mdr
);
// After the COMMIT entry is journaled: drop the uncommitted-peer
// record, tell the leader we are committed (OP_COMMITTED), and finish
// the request.
7687 void Server::_committed_peer(MDRequestRef
& mdr
)
7689 dout(10) << "_committed_peer " << *mdr
<< dendl
;
7691 ceph_assert(g_conf()->mds_kill_link_at
!= 8);
7693 bool assert_exist
= mdr
->more()->peer_update_journaled
;
7694 mdcache
->finish_uncommitted_peer(mdr
->reqid
, assert_exist
);
7695 auto req
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_COMMITTED
);
7696 mds
->send_message_mds(req
, mdr
->peer_to_mds
);
7697 mdcache
->request_finish(mdr
);
// Log context for the peer's ROLLBACK entry; carries the mutation and
// any pending client snap-split messages, then forwards to
// _link_rollback_finish.
// NOTE(review): some member declarations are missing from this extract.
7700 struct C_MDS_LoggedLinkRollback
: public ServerLogContext
{
7702 map
<client_t
,ref_t
<MClientSnap
>> splits
;
7703 C_MDS_LoggedLinkRollback(Server
*s
, MutationRef
& m
, MDRequestRef
& r
,
7704 map
<client_t
,ref_t
<MClientSnap
>>&& _splits
) :
7705 ServerLogContext(s
, r
), mut(m
), splits(std::move(_splits
)) {
7707 void finish(int r
) override
{
7708 server
->_link_rollback_finish(mut
, mdr
, splits
);
// Undo a prepared peer link/unlink from the encoded link_rollback blob:
// restore the target inode's ctime/nlink, the parent dir's
// mtime/rctime, and the pre-prepare snaprealm state, then journal an
// EPeerUpdate ROLLBACK entry. Also used during resolve (mdr may be
// null then — see the assert below).
7712 void Server::do_link_rollback(bufferlist
&rbl
, mds_rank_t leader
, MDRequestRef
& mdr
)
7714 link_rollback rollback
;
7715 auto p
= rbl
.cbegin();
7716 decode(rollback
, p
);
7718 dout(10) << "do_link_rollback on " << rollback
.reqid
7719 << (rollback
.was_inc
? " inc":" dec")
7720 << " ino " << rollback
.ino
7723 ceph_assert(g_conf()->mds_kill_link_at
!= 9);
7725 mdcache
->add_rollback(rollback
.reqid
, leader
); // need to finish this update before resolve finishes
7726 ceph_assert(mdr
|| mds
->is_resolve());
7728 MutationRef
mut(new MutationImpl(nullptr, utime_t(), rollback
.reqid
));
7729 mut
->ls
= mds
->mdlog
->get_current_segment();
7731 CInode
*in
= mdcache
->get_inode(rollback
.ino
);
7733 dout(10) << " target is " << *in
<< dendl
;
7734 ceph_assert(!in
->is_projected()); // live peer request hold versionlock xlock.
7736 auto pi
= in
->project_inode(mut
);
7737 pi
.inode
->version
= in
->pre_dirty();
7739 // parent dir rctime
// Only restore dir times if nothing else touched them since the
// prepare (they still equal the prepared ctime).
7740 CDir
*parent
= in
->get_projected_parent_dn()->get_dir();
7741 auto pf
= parent
->project_fnode(mut
);
7742 pf
->version
= parent
->pre_dirty();
7743 if (pf
->fragstat
.mtime
== pi
.inode
->ctime
) {
7744 pf
->fragstat
.mtime
= rollback
.old_dir_mtime
;
7745 if (pf
->rstat
.rctime
== pi
.inode
->ctime
)
7746 pf
->rstat
.rctime
= rollback
.old_dir_rctime
;
7747 mut
->add_updated_lock(&parent
->get_inode()->filelock
);
7748 mut
->add_updated_lock(&parent
->get_inode()->nestlock
);
7752 pi
.inode
->ctime
= rollback
.old_ctime
;
7753 if (rollback
.was_inc
)
// Restore the snaprealm that existed before the prepare (or drop the
// one the prepare created), collecting client snap-split messages.
7758 map
<client_t
,ref_t
<MClientSnap
>> splits
;
7759 if (rollback
.snapbl
.length() && in
->snaprealm
) {
7761 auto p
= rollback
.snapbl
.cbegin();
7762 decode(hadrealm
, p
);
7764 if (!mds
->is_resolve()) {
7765 sr_t
*new_srnode
= new sr_t();
7766 decode(*new_srnode
, p
);
7767 in
->project_snaprealm(new_srnode
);
7769 decode(in
->snaprealm
->srnode
, p
);
7772 SnapRealm
*realm
= parent
->get_inode()->find_snaprealm();
7773 if (!mds
->is_resolve())
7774 mdcache
->prepare_realm_merge(in
->snaprealm
, realm
, splits
);
7775 in
->project_snaprealm(NULL
);
// Journal the rollback.
7780 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_link_rollback", rollback
.reqid
, leader
,
7781 EPeerUpdate::OP_ROLLBACK
, EPeerUpdate::LINK
);
7782 mdlog
->start_entry(le
);
7783 le
->commit
.add_dir_context(parent
);
7784 le
->commit
.add_dir(parent
, true);
7785 le
->commit
.add_primary_dentry(in
->get_projected_parent_dn(), 0, true);
7787 submit_mdlog_entry(le
, new C_MDS_LoggedLinkRollback(this, mut
, mdr
, std::move(splits
)),
// After the ROLLBACK entry is journaled: deliver any pending client
// snap messages (not during resolve), finish the request, and clear
// the rollback record.
7792 void Server::_link_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
,
7793 map
<client_t
,ref_t
<MClientSnap
>>& splits
)
7795 dout(10) << "_link_rollback_finish" << dendl
;
7797 ceph_assert(g_conf()->mds_kill_link_at
!= 10);
7801 if (!mds
->is_resolve())
7802 mdcache
->send_snaps(splits
);
7805 mdcache
->request_finish(mdr
);
7807 mdcache
->finish_rollback(mut
->reqid
, mdr
);
// Leader-side handler for OP_LINKPREPACK: record the acking MDS as a
// peer and witness, clear it from waiting_on_peer, and re-dispatch the
// client request (which now proceeds past the witness wait).
7813 void Server::handle_peer_link_prep_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &m
)
7815 dout(10) << "handle_peer_link_prep_ack " << *mdr
7816 << " " << *m
<< dendl
;
7817 mds_rank_t from
= mds_rank_t(m
->get_source().num());
7819 ceph_assert(g_conf()->mds_kill_link_at
!= 11);
7822 mdr
->more()->peers
.insert(from
);
// The peer journaled its prepare, so it is a full witness.
7825 ceph_assert(mdr
->more()->witnessed
.count(from
) == 0);
7826 mdr
->more()->witnessed
.insert(from
);
7827 ceph_assert(!m
->is_not_journaled());
7828 mdr
->more()->has_journaled_peers
= true;
7830 // remove from waiting list
7831 ceph_assert(mdr
->more()->waiting_on_peer
.count(from
));
7832 mdr
->more()->waiting_on_peer
.erase(from
);
7834 ceph_assert(mdr
->more()->waiting_on_peer
.empty());
7836 dispatch_client_request(mdr
); // go again!
// Handle client unlink/rmdir (distinguished by the request op). Marks
// the dentry UNLINKING and notifies replicas, validates dir/non-dir
// and emptiness, prepares a stray dentry for primary links, takes the
// remaining locks, projects snaprealm changes, gathers rmdir witnesses
// for subtree roots, then dispatches to _link_remote (remote non-auth
// inode) or _unlink_local.
// NOTE(review): extract is missing some lines (braces, returns);
// code text left byte-identical.
7845 void Server::handle_client_unlink(MDRequestRef
& mdr
)
7847 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
7848 client_t client
= mdr
->get_client();
7851 bool rmdir
= (req
->get_op() == CEPH_MDS_OP_RMDIR
);
7854 mdr
->disable_lock_cache();
7856 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, false, true);
7860 // notify replica MDSes the dentry is under unlink
7861 if (!dn
->state_test(CDentry::STATE_UNLINKING
)) {
7862 dn
->state_set(CDentry::STATE_UNLINKING
);
7863 mdcache
->send_dentry_unlink(dn
, nullptr, mdr
, true);
7864 if (dn
->replica_unlinking_ref
) {
7869 CDentry::linkage_t
*dnl
= dn
->get_linkage(client
, mdr
);
7870 ceph_assert(!dnl
->is_null());
7871 CInode
*in
= dnl
->get_inode();
7874 dout(7) << "handle_client_rmdir on " << *dn
<< dendl
;
7876 dout(7) << "handle_client_unlink on " << *dn
<< dendl
;
7878 dout(7) << "dn links to " << *in
<< dendl
;
// rmdir: must be empty; unlink: must not be a dir; rmdir on non-dir
// fails with ENOTDIR. Each error path clears the UNLINKING state.
7883 // do empty directory checks
7884 if (_dir_is_nonempty_unlocked(mdr
, in
)) {
7885 dn
->state_clear(CDentry::STATE_UNLINKING
);
7886 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
7890 dout(7) << "handle_client_unlink on dir " << *in
<< ", returning error" << dendl
;
7891 dn
->state_clear(CDentry::STATE_UNLINKING
);
7892 respond_to_request(mdr
, -CEPHFS_EISDIR
);
7898 dout(7) << "handle_client_rmdir on non-dir " << *in
<< ", returning error" << dendl
;
7899 dn
->state_clear(CDentry::STATE_UNLINKING
);
7900 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
7905 CInode
*diri
= dn
->get_dir()->get_inode();
7906 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
7907 if (!check_access(mdr
, diri
, MAY_WRITE
)) {
7908 dn
->state_clear(CDentry::STATE_UNLINKING
);
7913 // -- create stray dentry? --
7914 CDentry
*straydn
= NULL
;
7915 if (dnl
->is_primary()) {
7916 straydn
= prepare_stray_dentry(mdr
, dnl
->get_inode());
7919 dout(10) << " straydn is " << *straydn
<< dendl
;
7920 } else if (mdr
->straydn
) {
7921 mdr
->unpin(mdr
->straydn
);
7922 mdr
->straydn
= NULL
;
// Lock the inode's link/snap locks (plus file rdlock for emptiness
// check, and the stray dir locks when relinking as stray).
7926 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
7927 MutationImpl::LockOpVec lov
;
7929 lov
.add_xlock(&in
->linklock
);
7930 lov
.add_xlock(&in
->snaplock
);
7932 lov
.add_rdlock(&in
->filelock
); // to verify it's empty
7935 lov
.add_wrlock(&straydn
->get_dir()->inode
->filelock
);
7936 lov
.add_wrlock(&straydn
->get_dir()->inode
->nestlock
);
7937 lov
.add_xlock(&straydn
->lock
);
7940 if (!mds
->locker
->acquire_locks(mdr
, lov
))
7943 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
7947 _dir_is_nonempty(mdr
, in
)) {
7948 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
7949 dn
->state_clear(CDentry::STATE_UNLINKING
);
7954 straydn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
// Project the destination snaprealm node once per request.
7956 if (!mdr
->more()->desti_srnode
) {
7957 if (in
->is_projected_snaprealm_global()) {
7958 sr_t
*new_srnode
= in
->prepare_new_srnode(0);
7959 in
->record_snaprealm_parent_dentry(new_srnode
, nullptr, dn
, dnl
->is_primary());
7960 // dropping the last linkage or dropping the last remote linkage,
7961 // detch the inode from global snaprealm
7962 auto nlink
= in
->get_projected_inode()->nlink
;
7964 (nlink
== 2 && !dnl
->is_primary() &&
7965 !in
->get_projected_parent_dir()->inode
->is_stray()))
7966 in
->clear_snaprealm_global(new_srnode
);
7967 mdr
->more()->desti_srnode
= new_srnode
;
7968 } else if (dnl
->is_primary()) {
7969 // prepare snaprealm blob for peer request
7970 SnapRealm
*realm
= in
->find_snaprealm();
7971 snapid_t follows
= realm
->get_newest_seq();
7972 if (in
->snaprealm
|| follows
+ 1 > in
->get_oldest_snap()) {
7973 sr_t
*new_srnode
= in
->prepare_new_srnode(follows
);
7974 in
->record_snaprealm_past_parent(new_srnode
, straydn
->get_dir()->inode
->find_snaprealm());
7975 mdr
->more()->desti_srnode
= new_srnode
;
// rmdir of a subtree root: every MDS replicating it must witness.
7981 if (in
->is_dir() && in
->has_subtree_root_dirfrag()) {
7982 // subtree root auths need to be witnesses
7983 set
<mds_rank_t
> witnesses
;
7984 in
->list_replicas(witnesses
);
7985 dout(10) << " witnesses " << witnesses
<< ", have " << mdr
->more()->witnessed
<< dendl
;
7987 for (set
<mds_rank_t
>::iterator p
= witnesses
.begin();
7988 p
!= witnesses
.end();
7990 if (mdr
->more()->witnessed
.count(*p
)) {
7991 dout(10) << " already witnessed by mds." << *p
<< dendl
;
7992 } else if (mdr
->more()->waiting_on_peer
.count(*p
)) {
7993 dout(10) << " already waiting on witness mds." << *p
<< dendl
;
7995 if (!_rmdir_prepare_witness(mdr
, *p
, mdr
->dn
[0], straydn
))
7999 if (!mdr
->more()->waiting_on_peer
.empty())
8000 return; // we're waiting for a witness.
8003 if (!rmdir
&& dnl
->is_primary() && mdr
->dn
[0].size() == 1)
8004 mds
->locker
->create_lock_cache(mdr
, diri
);
// Remote non-auth inode: route nlink-- through _link_remote;
// otherwise do the local unlink.
8007 if (dnl
->is_remote() && !dnl
->get_inode()->is_auth())
8008 _link_remote(mdr
, false, dn
, dnl
->get_inode());
8010 _unlink_local(mdr
, dn
, straydn
);
// Log-completion context for _unlink_local: captures the deleted
// dentry's projected version and forwards to _unlink_local_finish.
// NOTE(review): some member declarations are missing from this extract.
8013 class C_MDS_unlink_local_finish
: public ServerLogContext
{
8016 version_t dnpv
; // deleted dentry
8018 C_MDS_unlink_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CDentry
*sd
) :
8019 ServerLogContext(s
, r
), dn(d
), straydn(sd
),
8020 dnpv(d
->get_projected_version()) {}
8021 void finish(int r
) override
{
8022 ceph_assert(r
== 0);
8023 server
->_unlink_local_finish(mdr
, dn
, straydn
, dnpv
);
// Local unlink: journal an EUpdate("unlink_local") that nulls the
// dentry and — for primary links — relinks the inode under the stray
// dentry; projects the inode update (ctime, nlink bookkeeping,
// snaprealm), then replies via C_MDS_unlink_local_finish.
8027 void Server::_unlink_local(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
)
8029 dout(10) << "_unlink_local " << *dn
<< dendl
;
8031 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
8032 CInode
*in
= dnl
->get_inode();
8036 mdr
->ls
= mdlog
->get_current_segment();
8038 // prepare log entry
8039 EUpdate
*le
= new EUpdate(mdlog
, "unlink_local");
8040 mdlog
->start_entry(le
);
8041 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
8042 if (!mdr
->more()->witnessed
.empty()) {
8043 dout(20) << " noting uncommitted_peers " << mdr
->more()->witnessed
<< dendl
;
8044 le
->reqid
= mdr
->reqid
;
8045 le
->had_peers
= true;
8046 mdcache
->add_uncommitted_leader(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
// Primary link: the inode moves to the stray dentry.
8050 ceph_assert(dnl
->is_primary());
8051 straydn
->push_projected_linkage(in
);
8054 // the unlinked dentry
// Project the inode update; remember the pre-unlink path for later
// stray handling.
8057 auto pi
= in
->project_inode(mdr
);
8060 dn
->make_path_string(t
, true);
8061 pi
.inode
->stray_prior_path
= std::move(t
);
8063 pi
.inode
->version
= in
->pre_dirty();
8064 pi
.inode
->ctime
= mdr
->get_op_stamp();
8065 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
8066 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
8067 pi
.inode
->change_attr
++;
8069 if (pi
.inode
->nlink
== 0)
8070 in
->state_set(CInode::STATE_ORPHAN
);
8072 if (mdr
->more()->desti_srnode
) {
8073 auto& desti_srnode
= mdr
->more()->desti_srnode
;
8074 in
->project_snaprealm(desti_srnode
);
8075 desti_srnode
= NULL
;
8079 // will manually pop projected inode
8081 // primary link. add stray dentry.
8082 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, -1);
8083 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, straydn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
8085 pi
.inode
->update_backtrace();
8086 le
->metablob
.add_primary_dentry(straydn
, in
, true, true);
8088 // remote link. update remote inode.
8089 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_DIR
, -1);
8090 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
8091 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, in
);
8094 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
8095 le
->metablob
.add_null_dentry(dn
, true);
8098 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
8099 le
->metablob
.renamed_dirino
= in
->ino();
8102 dn
->push_projected_linkage();
8105 ceph_assert(in
->first
<= straydn
->first
);
8106 in
->first
= straydn
->first
;
// Directories moving to stray need the subtree map adjusted.
8110 ceph_assert(straydn
);
8111 mdcache
->project_subtree_rename(in
, dn
->get_dir(), straydn
->get_dir());
8114 journal_and_reply(mdr
, 0, dn
, le
, new C_MDS_unlink_local_finish(this, mdr
, dn
, straydn
));
// Journal-commit completion for _unlink_local: pop the projected
// snaprealm and linkages (null the dentry, relink the inode as stray),
// notify replicas, wake unlink waiters, adjust the subtree map for
// directories, reply, and tip off the stray manager.
8117 void Server::_unlink_local_finish(MDRequestRef
& mdr
,
8118 CDentry
*dn
, CDentry
*straydn
,
8121 dout(10) << "_unlink_local_finish " << *dn
<< dendl
;
8123 if (!mdr
->more()->witnessed
.empty())
8124 mdcache
->logged_leader_update(mdr
->reqid
);
8126 CInode
*strayin
= NULL
;
8127 bool hadrealm
= false;
8129 // if there is newly created snaprealm, need to split old snaprealm's
8130 // inodes_with_caps. So pop snaprealm before linkage changes.
8131 strayin
= dn
->get_linkage()->get_inode();
8132 hadrealm
= strayin
->snaprealm
? true : false;
8133 strayin
->early_pop_projected_snaprealm();
8136 // unlink main dentry
8137 dn
->get_dir()->unlink_inode(dn
);
8138 dn
->pop_projected_linkage();
8139 dn
->mark_dirty(dnpv
, mdr
->ls
);
8141 // relink as stray? (i.e. was primary link?)
8143 dout(20) << " straydn is " << *straydn
<< dendl
;
8144 straydn
->pop_projected_linkage();
8145 mdcache
->touch_dentry_bottom(straydn
);
8150 dn
->state_clear(CDentry::STATE_UNLINKING
);
8151 mdcache
->send_dentry_unlink(dn
, straydn
, mdr
);
// Wake anyone waiting for this dentry's unlink to complete.
8153 MDSContext::vec finished
;
8154 dn
->take_waiting(CDentry::WAIT_UNLINK_FINISH
, finished
);
8155 mdcache
->mds
->queue_waiters(finished
);
8158 // update subtree map?
8159 if (strayin
->is_dir())
8160 mdcache
->adjust_subtree_after_rename(strayin
, dn
->get_dir(), true);
8162 if (strayin
->snaprealm
&& !hadrealm
)
8163 mdcache
->do_realm_invalidate_and_update_notify(strayin
, CEPH_SNAP_OP_SPLIT
, false);
8167 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
8170 respond_to_request(mdr
, 0);
8172 // removing a new dn?
8173 dn
->get_dir()->try_remove_unlinked_dn(dn
);
8176 // respond_to_request() drops locks. So stray reintegration can race with us.
8177 if (straydn
&& !straydn
->get_projected_linkage()->is_null()) {
8178 // Tip off the MDCache that this dentry is a stray that
8179 // might be elegible for purge.
8180 mdcache
->notify_stray(straydn
);
8184 bool Server::_rmdir_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, vector
<CDentry
*>& trace
, CDentry
*straydn
)
8186 if (mds
->is_cluster_degraded() &&
8187 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(who
)) {
8188 dout(10) << "_rmdir_prepare_witness mds." << who
<< " is not active" << dendl
;
8189 if (mdr
->more()->waiting_on_peer
.empty())
8190 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(mdcache
, mdr
));
8194 dout(10) << "_rmdir_prepare_witness mds." << who
<< dendl
;
8195 auto req
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RMDIRPREP
);
8196 req
->srcdnpath
= filepath(trace
.front()->get_dir()->ino());
8197 for (auto dn
: trace
)
8198 req
->srcdnpath
.push_dentry(dn
->get_name());
8199 mdcache
->encode_replica_stray(straydn
, who
, req
->straybl
);
8200 if (mdr
->more()->desti_srnode
)
8201 encode(*mdr
->more()->desti_srnode
, req
->desti_snapbl
);
8203 req
->op_stamp
= mdr
->get_op_stamp();
8204 mds
->send_message_mds(req
, who
);
8206 ceph_assert(mdr
->more()->waiting_on_peer
.count(who
) == 0);
8207 mdr
->more()->waiting_on_peer
.insert(who
);
8211 struct C_MDS_PeerRmdirPrep
: public ServerLogContext
{
8212 CDentry
*dn
, *straydn
;
8213 C_MDS_PeerRmdirPrep(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CDentry
*st
)
8214 : ServerLogContext(s
, r
), dn(d
), straydn(st
) {}
8215 void finish(int r
) override
{
8216 server
->_logged_peer_rmdir(mdr
, dn
, straydn
);
8220 struct C_MDS_PeerRmdirCommit
: public ServerContext
{
8223 C_MDS_PeerRmdirCommit(Server
*s
, MDRequestRef
& r
, CDentry
*sd
)
8224 : ServerContext(s
), mdr(r
), straydn(sd
) { }
8225 void finish(int r
) override
{
8226 server
->_commit_peer_rmdir(mdr
, r
, straydn
);
8230 void Server::handle_peer_rmdir_prep(MDRequestRef
& mdr
)
8232 dout(10) << "handle_peer_rmdir_prep " << *mdr
8233 << " " << mdr
->peer_request
->srcdnpath
8234 << " to " << mdr
->peer_request
->destdnpath
8237 vector
<CDentry
*> trace
;
8238 filepath
srcpath(mdr
->peer_request
->srcdnpath
);
8239 dout(10) << " src " << srcpath
<< dendl
;
8241 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, false);
8242 int r
= mdcache
->path_traverse(mdr
, cf
, srcpath
,
8243 MDS_TRAVERSE_DISCOVER
| MDS_TRAVERSE_PATH_LOCKED
,
8246 if (r
== -CEPHFS_ESTALE
) {
8247 mdcache
->find_ino_peers(srcpath
.get_ino(), new C_MDS_RetryRequest(mdcache
, mdr
),
8248 mdr
->peer_to_mds
, true);
8251 ceph_assert(r
== 0);
8252 CDentry
*dn
= trace
.back();
8253 dout(10) << " dn " << *dn
<< dendl
;
8256 ceph_assert(mdr
->straydn
);
8257 CDentry
*straydn
= mdr
->straydn
;
8258 dout(10) << " straydn " << *straydn
<< dendl
;
8260 mdr
->set_op_stamp(mdr
->peer_request
->op_stamp
);
8262 rmdir_rollback rollback
;
8263 rollback
.reqid
= mdr
->reqid
;
8264 rollback
.src_dir
= dn
->get_dir()->dirfrag();
8265 rollback
.src_dname
= dn
->get_name();
8266 rollback
.dest_dir
= straydn
->get_dir()->dirfrag();
8267 rollback
.dest_dname
= straydn
->get_name();
8268 if (mdr
->peer_request
->desti_snapbl
.length()) {
8269 if (in
->snaprealm
) {
8270 encode(true, rollback
.snapbl
);
8271 in
->encode_snap_blob(rollback
.snapbl
);
8273 encode(false, rollback
.snapbl
);
8276 encode(rollback
, mdr
->more()->rollback_bl
);
8277 // FIXME: rollback snaprealm
8278 dout(20) << " rollback is " << mdr
->more()->rollback_bl
.length() << " bytes" << dendl
;
8280 // set up commit waiter
8281 mdr
->more()->peer_commit
= new C_MDS_PeerRmdirCommit(this, mdr
, straydn
);
8283 straydn
->push_projected_linkage(in
);
8284 dn
->push_projected_linkage();
8286 ceph_assert(straydn
->first
>= in
->first
);
8287 in
->first
= straydn
->first
;
8289 if (!in
->has_subtree_root_dirfrag(mds
->get_nodeid())) {
8290 dout(10) << " no auth subtree in " << *in
<< ", skipping journal" << dendl
;
8291 _logged_peer_rmdir(mdr
, dn
, straydn
);
8295 mdr
->ls
= mdlog
->get_current_segment();
8296 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rmdir", mdr
->reqid
, mdr
->peer_to_mds
,
8297 EPeerUpdate::OP_PREPARE
, EPeerUpdate::RMDIR
);
8298 mdlog
->start_entry(le
);
8299 le
->rollback
= mdr
->more()->rollback_bl
;
8301 le
->commit
.add_dir_context(straydn
->get_dir());
8302 le
->commit
.add_primary_dentry(straydn
, in
, true);
8303 // peer: no need to journal original dentry
8305 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
8306 le
->commit
.renamed_dirino
= in
->ino();
8308 mdcache
->project_subtree_rename(in
, dn
->get_dir(), straydn
->get_dir());
8309 mdcache
->add_uncommitted_peer(mdr
->reqid
, mdr
->ls
, mdr
->peer_to_mds
);
8311 mdr
->more()->peer_update_journaled
= true;
8312 submit_mdlog_entry(le
, new C_MDS_PeerRmdirPrep(this, mdr
, dn
, straydn
),
8317 void Server::_logged_peer_rmdir(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
)
8319 dout(10) << "_logged_peer_rmdir " << *mdr
<< " on " << *dn
<< dendl
;
8320 CInode
*in
= dn
->get_linkage()->get_inode();
8323 if (mdr
->peer_request
->desti_snapbl
.length()) {
8324 new_realm
= !in
->snaprealm
;
8325 in
->decode_snap_blob(mdr
->peer_request
->desti_snapbl
);
8326 ceph_assert(in
->snaprealm
);
8331 // update our cache now, so we are consistent with what is in the journal
8332 // when we journal a subtree map
8333 dn
->get_dir()->unlink_inode(dn
);
8334 straydn
->pop_projected_linkage();
8335 dn
->pop_projected_linkage();
8337 mdcache
->adjust_subtree_after_rename(in
, dn
->get_dir(), mdr
->more()->peer_update_journaled
);
8340 mdcache
->do_realm_invalidate_and_update_notify(in
, CEPH_SNAP_OP_SPLIT
, false);
8343 mdr
->reset_peer_request();
8346 if (!mdr
->aborted
) {
8347 auto reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RMDIRPREPACK
);
8348 if (!mdr
->more()->peer_update_journaled
)
8349 reply
->mark_not_journaled();
8350 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
8352 dout(10) << " abort flag set, finishing" << dendl
;
8353 mdcache
->request_finish(mdr
);
8357 void Server::handle_peer_rmdir_prep_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &ack
)
8359 dout(10) << "handle_peer_rmdir_prep_ack " << *mdr
8360 << " " << *ack
<< dendl
;
8362 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
8364 mdr
->more()->peers
.insert(from
);
8365 mdr
->more()->witnessed
.insert(from
);
8366 if (!ack
->is_not_journaled())
8367 mdr
->more()->has_journaled_peers
= true;
8369 // remove from waiting list
8370 ceph_assert(mdr
->more()->waiting_on_peer
.count(from
));
8371 mdr
->more()->waiting_on_peer
.erase(from
);
8373 if (mdr
->more()->waiting_on_peer
.empty())
8374 dispatch_client_request(mdr
); // go again!
8376 dout(10) << "still waiting on peers " << mdr
->more()->waiting_on_peer
<< dendl
;
8379 void Server::_commit_peer_rmdir(MDRequestRef
& mdr
, int r
, CDentry
*straydn
)
8381 dout(10) << "_commit_peer_rmdir " << *mdr
<< " r=" << r
<< dendl
;
8384 if (mdr
->more()->peer_update_journaled
) {
8385 CInode
*strayin
= straydn
->get_projected_linkage()->get_inode();
8386 if (strayin
&& !strayin
->snaprealm
)
8387 mdcache
->clear_dirty_bits_for_stray(strayin
);
8392 if (mdr
->more()->peer_update_journaled
) {
8393 // write a commit to the journal
8394 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rmdir_commit", mdr
->reqid
,
8395 mdr
->peer_to_mds
, EPeerUpdate::OP_COMMIT
,
8396 EPeerUpdate::RMDIR
);
8397 mdlog
->start_entry(le
);
8398 submit_mdlog_entry(le
, new C_MDS_CommittedPeer(this, mdr
), mdr
, __func__
);
8401 _committed_peer(mdr
);
8405 do_rmdir_rollback(mdr
->more()->rollback_bl
, mdr
->peer_to_mds
, mdr
);
8409 struct C_MDS_LoggedRmdirRollback
: public ServerLogContext
{
8413 C_MDS_LoggedRmdirRollback(Server
*s
, MDRequestRef
& m
, metareqid_t mr
, CDentry
*d
, CDentry
*st
)
8414 : ServerLogContext(s
, m
), reqid(mr
), dn(d
), straydn(st
) {}
8415 void finish(int r
) override
{
8416 server
->_rmdir_rollback_finish(mdr
, reqid
, dn
, straydn
);
8420 void Server::do_rmdir_rollback(bufferlist
&rbl
, mds_rank_t leader
, MDRequestRef
& mdr
)
8422 // unlink the other rollback methods, the rmdir rollback is only
8423 // needed to record the subtree changes in the journal for inode
8424 // replicas who are auth for empty dirfrags. no actual changes to
8425 // the file system are taking place here, so there is no Mutation.
8427 rmdir_rollback rollback
;
8428 auto p
= rbl
.cbegin();
8429 decode(rollback
, p
);
8431 dout(10) << "do_rmdir_rollback on " << rollback
.reqid
<< dendl
;
8432 mdcache
->add_rollback(rollback
.reqid
, leader
); // need to finish this update before resolve finishes
8433 ceph_assert(mdr
|| mds
->is_resolve());
8435 CDir
*dir
= mdcache
->get_dirfrag(rollback
.src_dir
);
8437 dir
= mdcache
->get_dirfrag(rollback
.src_dir
.ino
, rollback
.src_dname
);
8439 CDentry
*dn
= dir
->lookup(rollback
.src_dname
);
8441 dout(10) << " dn " << *dn
<< dendl
;
8442 CDir
*straydir
= mdcache
->get_dirfrag(rollback
.dest_dir
);
8443 ceph_assert(straydir
);
8444 CDentry
*straydn
= straydir
->lookup(rollback
.dest_dname
);
8445 ceph_assert(straydn
);
8446 dout(10) << " straydn " << *straydn
<< dendl
;
8447 CInode
*in
= straydn
->get_linkage()->get_inode();
8449 dn
->push_projected_linkage(in
);
8450 straydn
->push_projected_linkage();
8452 if (rollback
.snapbl
.length() && in
->snaprealm
) {
8454 auto p
= rollback
.snapbl
.cbegin();
8455 decode(hadrealm
, p
);
8457 decode(in
->snaprealm
->srnode
, p
);
8459 in
->snaprealm
->merge_to(dir
->get_inode()->find_snaprealm());
8463 if (mdr
&& !mdr
->more()->peer_update_journaled
) {
8464 ceph_assert(!in
->has_subtree_root_dirfrag(mds
->get_nodeid()));
8466 _rmdir_rollback_finish(mdr
, rollback
.reqid
, dn
, straydn
);
8471 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rmdir_rollback", rollback
.reqid
, leader
,
8472 EPeerUpdate::OP_ROLLBACK
, EPeerUpdate::RMDIR
);
8473 mdlog
->start_entry(le
);
8475 le
->commit
.add_dir_context(dn
->get_dir());
8476 le
->commit
.add_primary_dentry(dn
, in
, true);
8477 // peer: no need to journal straydn
8479 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
8480 le
->commit
.renamed_dirino
= in
->ino();
8482 mdcache
->project_subtree_rename(in
, straydn
->get_dir(), dn
->get_dir());
8484 submit_mdlog_entry(le
,
8485 new C_MDS_LoggedRmdirRollback(this, mdr
,rollback
.reqid
,
8491 void Server::_rmdir_rollback_finish(MDRequestRef
& mdr
, metareqid_t reqid
, CDentry
*dn
, CDentry
*straydn
)
8493 dout(10) << "_rmdir_rollback_finish " << reqid
<< dendl
;
8495 straydn
->get_dir()->unlink_inode(straydn
);
8496 dn
->pop_projected_linkage();
8497 straydn
->pop_projected_linkage();
8499 CInode
*in
= dn
->get_linkage()->get_inode();
8500 mdcache
->adjust_subtree_after_rename(in
, straydn
->get_dir(),
8501 !mdr
|| mdr
->more()->peer_update_journaled
);
8503 if (mds
->is_resolve()) {
8504 CDir
*root
= mdcache
->get_subtree_root(straydn
->get_dir());
8505 mdcache
->try_trim_non_auth_subtree(root
);
8509 mdcache
->request_finish(mdr
);
8511 mdcache
->finish_rollback(reqid
, mdr
);
8515 /** _dir_is_nonempty[_unlocked]
8517 * check if a directory is non-empty (i.e. we can rmdir it).
8519 * the unlocked varient this is a fastpath check. we can't really be
8520 * sure until we rdlock the filelock.
8522 bool Server::_dir_is_nonempty_unlocked(MDRequestRef
& mdr
, CInode
*in
)
8524 dout(10) << "dir_is_nonempty_unlocked " << *in
<< dendl
;
8525 ceph_assert(in
->is_auth());
8527 if (in
->filelock
.is_cached())
8528 return false; // there can be pending async create/unlink. don't know.
8529 if (in
->snaprealm
&& in
->snaprealm
->srnode
.snaps
.size())
8530 return true; // in a snapshot!
8532 auto&& ls
= in
->get_dirfrags();
8533 for (const auto& dir
: ls
) {
8534 // is the frag obviously non-empty?
8535 if (dir
->is_auth()) {
8536 if (dir
->get_projected_fnode()->fragstat
.size()) {
8537 dout(10) << "dir_is_nonempty_unlocked dirstat has "
8538 << dir
->get_projected_fnode()->fragstat
.size() << " items " << *dir
<< dendl
;
8547 bool Server::_dir_is_nonempty(MDRequestRef
& mdr
, CInode
*in
)
8549 dout(10) << "dir_is_nonempty " << *in
<< dendl
;
8550 ceph_assert(in
->is_auth());
8551 ceph_assert(in
->filelock
.can_read(mdr
->get_client()));
8553 frag_info_t dirstat
;
8554 version_t dirstat_version
= in
->get_projected_inode()->dirstat
.version
;
8556 auto&& ls
= in
->get_dirfrags();
8557 for (const auto& dir
: ls
) {
8558 const auto& pf
= dir
->get_projected_fnode();
8559 if (pf
->fragstat
.size()) {
8560 dout(10) << "dir_is_nonempty dirstat has "
8561 << pf
->fragstat
.size() << " items " << *dir
<< dendl
;
8565 if (pf
->accounted_fragstat
.version
== dirstat_version
)
8566 dirstat
.add(pf
->accounted_fragstat
);
8568 dirstat
.add(pf
->fragstat
);
8571 return dirstat
.size() != in
->get_projected_inode()->dirstat
.size();
8575 // ======================================================
8578 class C_MDS_rename_finish
: public ServerLogContext
{
8583 C_MDS_rename_finish(Server
*s
, MDRequestRef
& r
,
8584 CDentry
*sdn
, CDentry
*ddn
, CDentry
*stdn
) :
8585 ServerLogContext(s
, r
),
8586 srcdn(sdn
), destdn(ddn
), straydn(stdn
) { }
8587 void finish(int r
) override
{
8588 ceph_assert(r
== 0);
8589 server
->_rename_finish(mdr
, srcdn
, destdn
, straydn
);
8594 /** handle_client_rename
8596 * rename leader is the destdn auth. this is because cached inodes
8597 * must remain connected. thus, any replica of srci, must also
8598 * replicate destdn, and possibly straydn, so that srci (and
8599 * destdn->inode) remain connected during the rename.
8601 * to do this, we freeze srci, then leader (destdn auth) verifies that
8602 * all other nodes have also replciated destdn and straydn. note that
8603 * destdn replicas need not also replicate srci. this only works when
8606 * This function takes responsibility for the passed mdr.
8608 void Server::handle_client_rename(MDRequestRef
& mdr
)
8610 const auto& req
= mdr
->client_request
;
8611 dout(7) << "handle_client_rename " << *req
<< dendl
;
8613 filepath destpath
= req
->get_filepath();
8614 filepath srcpath
= req
->get_filepath2();
8615 if (srcpath
.is_last_dot_or_dotdot() || destpath
.is_last_dot_or_dotdot()) {
8616 respond_to_request(mdr
, -CEPHFS_EBUSY
);
8620 if (req
->get_alternate_name().size() > alternate_name_max
) {
8621 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
8622 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
8626 auto [destdn
, srcdn
] = rdlock_two_paths_xlock_destdn(mdr
, true);
8630 if (is_unlink_pending(destdn
)) {
8631 wait_for_pending_unlink(destdn
, mdr
);
8635 if (is_unlink_pending(srcdn
)) {
8636 wait_for_pending_unlink(srcdn
, mdr
);
8640 dout(10) << " destdn " << *destdn
<< dendl
;
8641 CDir
*destdir
= destdn
->get_dir();
8642 ceph_assert(destdir
->is_auth());
8643 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
8645 dout(10) << " srcdn " << *srcdn
<< dendl
;
8646 CDir
*srcdir
= srcdn
->get_dir();
8647 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
8648 CInode
*srci
= srcdnl
->get_inode();
8649 dout(10) << " srci " << *srci
<< dendl
;
8651 // -- some sanity checks --
8652 if (destdn
== srcdn
) {
8653 dout(7) << "rename src=dest, noop" << dendl
;
8654 respond_to_request(mdr
, 0);
8658 // dest a child of src?
8659 // e.g. mv /usr /usr/foo
8660 if (srci
->is_dir() && srci
->is_projected_ancestor_of(destdir
->get_inode())) {
8661 dout(7) << "cannot rename item to be a child of itself" << dendl
;
8662 respond_to_request(mdr
, -CEPHFS_EINVAL
);
8666 // is this a stray migration, reintegration or merge? (sanity checks!)
8667 if (mdr
->reqid
.name
.is_mds() &&
8668 !(MDS_INO_IS_STRAY(srcpath
.get_ino()) &&
8669 MDS_INO_IS_STRAY(destpath
.get_ino())) &&
8670 !(destdnl
->is_remote() &&
8671 destdnl
->get_remote_ino() == srci
->ino())) {
8672 respond_to_request(mdr
, -CEPHFS_EINVAL
); // actually, this won't reply, but whatev.
8677 if (!destdnl
->is_null()) {
8678 //dout(10) << "dest dn exists " << *destdn << dendl;
8679 oldin
= mdcache
->get_dentry_inode(destdn
, mdr
, true);
8681 dout(10) << " oldin " << *oldin
<< dendl
;
8683 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
8684 if (oldin
->is_dir() && _dir_is_nonempty_unlocked(mdr
, oldin
)) {
8685 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
8689 // mv /some/thing /to/some/existing_other_thing
8690 if (oldin
->is_dir() && !srci
->is_dir()) {
8691 respond_to_request(mdr
, -CEPHFS_EISDIR
);
8694 if (!oldin
->is_dir() && srci
->is_dir()) {
8695 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
8698 if (srci
== oldin
&& !srcdir
->inode
->is_stray()) {
8699 respond_to_request(mdr
, 0); // no-op. POSIX makes no sense.
8702 if (destdn
->get_alternate_name() != req
->get_alternate_name()) {
8703 /* the dentry exists but the alternate_names do not match, fail... */
8704 respond_to_request(mdr
, -CEPHFS_EINVAL
);
8709 vector
<CDentry
*>& srctrace
= mdr
->dn
[1];
8710 vector
<CDentry
*>& desttrace
= mdr
->dn
[0];
8712 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
8713 if (destpath
.get_ino() != srcpath
.get_ino() &&
8714 !(req
->get_source().is_mds() &&
8715 MDS_INO_IS_STRAY(srcpath
.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
8716 CInode
*srcbase
= srctrace
[0]->get_dir()->get_inode();
8717 CInode
*destbase
= desttrace
[0]->get_dir()->get_inode();
8718 // ok, extend srctrace toward root until it is an ancestor of desttrace.
8719 while (srcbase
!= destbase
&&
8720 !srcbase
->is_projected_ancestor_of(destbase
)) {
8721 CDentry
*pdn
= srcbase
->get_projected_parent_dn();
8722 srctrace
.insert(srctrace
.begin(), pdn
);
8723 dout(10) << "rename prepending srctrace with " << *pdn
<< dendl
;
8724 srcbase
= pdn
->get_dir()->get_inode();
8727 // then, extend destpath until it shares the same parent inode as srcpath.
8728 while (destbase
!= srcbase
) {
8729 CDentry
*pdn
= destbase
->get_projected_parent_dn();
8730 desttrace
.insert(desttrace
.begin(), pdn
);
8731 dout(10) << "rename prepending desttrace with " << *pdn
<< dendl
;
8732 destbase
= pdn
->get_dir()->get_inode();
8734 dout(10) << "rename src and dest traces now share common ancestor " << *destbase
<< dendl
;
8738 bool linkmerge
= srcdnl
->get_inode() == destdnl
->get_inode();
8740 dout(10) << " this is a link merge" << dendl
;
8742 // -- create stray dentry? --
8743 CDentry
*straydn
= NULL
;
8744 if (destdnl
->is_primary() && !linkmerge
) {
8745 straydn
= prepare_stray_dentry(mdr
, destdnl
->get_inode());
8748 dout(10) << " straydn is " << *straydn
<< dendl
;
8749 } else if (mdr
->straydn
) {
8750 mdr
->unpin(mdr
->straydn
);
8751 mdr
->straydn
= NULL
;
8756 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
8757 MutationImpl::LockOpVec lov
;
8759 // we need to update srci's ctime. xlock its least contended lock to do that...
8760 lov
.add_xlock(&srci
->linklock
);
8761 lov
.add_xlock(&srci
->snaplock
);
8764 // xlock oldin (for nlink--)
8765 lov
.add_xlock(&oldin
->linklock
);
8766 lov
.add_xlock(&oldin
->snaplock
);
8767 if (oldin
->is_dir()) {
8768 ceph_assert(srci
->is_dir());
8769 lov
.add_rdlock(&oldin
->filelock
); // to verify it's empty
8771 // adjust locking order?
8772 int cmp
= mdr
->compare_paths();
8773 if (cmp
< 0 || (cmp
== 0 && oldin
->ino() < srci
->ino()))
8774 std::reverse(lov
.begin(), lov
.end());
8776 ceph_assert(!srci
->is_dir());
8777 // adjust locking order;
8778 if (srci
->ino() > oldin
->ino())
8779 std::reverse(lov
.begin(), lov
.end());
8785 lov
.add_wrlock(&straydn
->get_dir()->inode
->filelock
);
8786 lov
.add_wrlock(&straydn
->get_dir()->inode
->nestlock
);
8787 lov
.add_xlock(&straydn
->lock
);
8790 CInode
*auth_pin_freeze
= !srcdn
->is_auth() && srcdnl
->is_primary() ? srci
: nullptr;
8791 if (!mds
->locker
->acquire_locks(mdr
, lov
, auth_pin_freeze
))
8794 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
8798 ceph_assert(srcdir
->inode
->is_stray() && srcdnl
->is_primary() && destdnl
->is_remote());
8800 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
8801 if (!check_access(mdr
, srcdir
->get_inode(), MAY_WRITE
))
8804 if (!check_access(mdr
, destdn
->get_dir()->get_inode(), MAY_WRITE
))
8807 if (!linkmerge
&& !check_fragment_space(mdr
, destdn
->get_dir()))
8810 if (!linkmerge
&& !check_dir_max_entries(mdr
, destdn
->get_dir()))
8813 if (!check_access(mdr
, srci
, MAY_WRITE
))
8817 // with read lock, really verify oldin is empty
8820 _dir_is_nonempty(mdr
, oldin
)) {
8821 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
8825 /* project_snaprealm_past_parent() will do this job
8827 // moving between snaprealms?
8828 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
8829 SnapRealm *srcrealm = srci->find_snaprealm();
8830 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
8831 if (srcrealm != destrealm &&
8832 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
8833 destrealm->get_newest_seq() + 1 > srcdn->first)) {
8834 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
8835 mdcache->snaprealm_create(mdr, srci);
8841 SnapRealm
*dest_realm
= nullptr;
8842 SnapRealm
*src_realm
= nullptr;
8844 dest_realm
= destdir
->inode
->find_snaprealm();
8845 if (srcdir
->inode
== destdir
->inode
)
8846 src_realm
= dest_realm
;
8848 src_realm
= srcdir
->inode
->find_snaprealm();
8849 if (src_realm
!= dest_realm
&&
8850 src_realm
->get_subvolume_ino() != dest_realm
->get_subvolume_ino()) {
8851 respond_to_request(mdr
, -CEPHFS_EXDEV
);
8856 ceph_assert(g_conf()->mds_kill_rename_at
!= 1);
8858 // -- open all srcdn inode frags, if any --
8859 // we need these open so that auth can properly delegate from inode to dirfrags
8860 // after the inode is _ours_.
8861 if (srcdnl
->is_primary() &&
8862 !srcdn
->is_auth() &&
8864 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl
;
8865 mdr
->set_stickydirs(srci
);
8868 srci
->dirfragtree
.get_leaves(leaves
);
8869 for (const auto& leaf
: leaves
) {
8870 CDir
*dir
= srci
->get_dirfrag(leaf
);
8872 dout(10) << " opening " << leaf
<< " under " << *srci
<< dendl
;
8873 mdcache
->open_remote_dirfrag(srci
, leaf
, new C_MDS_RetryRequest(mdcache
, mdr
));
8879 // -- prepare snaprealm ---
8882 if (!mdr
->more()->srci_srnode
&&
8883 srci
->get_projected_inode()->nlink
== 1 &&
8884 srci
->is_projected_snaprealm_global()) {
8885 sr_t
*new_srnode
= srci
->prepare_new_srnode(0);
8886 srci
->record_snaprealm_parent_dentry(new_srnode
, nullptr, destdn
, false);
8888 srci
->clear_snaprealm_global(new_srnode
);
8889 mdr
->more()->srci_srnode
= new_srnode
;
8892 if (oldin
&& !mdr
->more()->desti_srnode
) {
8893 if (oldin
->is_projected_snaprealm_global()) {
8894 sr_t
*new_srnode
= oldin
->prepare_new_srnode(0);
8895 oldin
->record_snaprealm_parent_dentry(new_srnode
, dest_realm
, destdn
, destdnl
->is_primary());
8896 // dropping the last linkage or dropping the last remote linkage,
8897 // detch the inode from global snaprealm
8898 auto nlink
= oldin
->get_projected_inode()->nlink
;
8900 (nlink
== 2 && !destdnl
->is_primary() &&
8901 !oldin
->get_projected_parent_dir()->inode
->is_stray()))
8902 oldin
->clear_snaprealm_global(new_srnode
);
8903 mdr
->more()->desti_srnode
= new_srnode
;
8904 } else if (destdnl
->is_primary()) {
8905 snapid_t follows
= dest_realm
->get_newest_seq();
8906 if (oldin
->snaprealm
|| follows
+ 1 > oldin
->get_oldest_snap()) {
8907 sr_t
*new_srnode
= oldin
->prepare_new_srnode(follows
);
8908 oldin
->record_snaprealm_past_parent(new_srnode
, straydn
->get_dir()->inode
->find_snaprealm());
8909 mdr
->more()->desti_srnode
= new_srnode
;
8913 if (!mdr
->more()->srci_srnode
) {
8914 if (srci
->is_projected_snaprealm_global()) {
8915 sr_t
*new_srnode
= srci
->prepare_new_srnode(0);
8916 srci
->record_snaprealm_parent_dentry(new_srnode
, src_realm
, srcdn
, srcdnl
->is_primary());
8917 mdr
->more()->srci_srnode
= new_srnode
;
8918 } else if (srcdnl
->is_primary()) {
8919 snapid_t follows
= src_realm
->get_newest_seq();
8920 if (src_realm
!= dest_realm
&&
8921 (srci
->snaprealm
|| follows
+ 1 > srci
->get_oldest_snap())) {
8922 sr_t
*new_srnode
= srci
->prepare_new_srnode(follows
);
8923 srci
->record_snaprealm_past_parent(new_srnode
, dest_realm
);
8924 mdr
->more()->srci_srnode
= new_srnode
;
8930 // -- prepare witnesses --
8933 * NOTE: we use _all_ replicas as witnesses.
8934 * this probably isn't totally necessary (esp for file renames),
8935 * but if/when we change that, we have to make sure rejoin is
8936 * sufficiently robust to handle strong rejoins from survivors
8937 * with totally wrong dentry->inode linkage.
8938 * (currently, it can ignore rename effects, because the resolve
8939 * stage will sort them out.)
8941 set
<mds_rank_t
> witnesses
= mdr
->more()->extra_witnesses
;
8942 if (srcdn
->is_auth())
8943 srcdn
->list_replicas(witnesses
);
8945 witnesses
.insert(srcdn
->authority().first
);
8946 if (srcdnl
->is_remote() && !srci
->is_auth())
8947 witnesses
.insert(srci
->authority().first
);
8948 destdn
->list_replicas(witnesses
);
8949 if (destdnl
->is_remote() && !oldin
->is_auth())
8950 witnesses
.insert(oldin
->authority().first
);
8951 dout(10) << " witnesses " << witnesses
<< ", have " << mdr
->more()->witnessed
<< dendl
;
8953 if (!witnesses
.empty()) {
8954 // Replicas can't see projected dentry linkages and will get confused.
8955 // We have taken snaplocks on ancestor inodes. Later rename/rmdir requests
8956 // can't project these inodes' linkages.
8957 bool need_flush
= false;
8958 for (auto& dn
: srctrace
) {
8959 if (dn
->is_projected()) {
8965 CDentry
*dn
= destdn
;
8967 if (dn
->is_projected()) {
8971 CInode
*diri
= dn
->get_dir()->get_inode();
8972 dn
= diri
->get_projected_parent_dn();
8976 mdlog
->wait_for_safe(
8977 new MDSInternalContextWrapper(mds
,
8978 new C_MDS_RetryRequest(mdcache
, mdr
)));
8984 // do srcdn auth last
8985 mds_rank_t last
= MDS_RANK_NONE
;
8986 if (!srcdn
->is_auth()) {
8987 last
= srcdn
->authority().first
;
8988 mdr
->more()->srcdn_auth_mds
= last
;
8989 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
8990 // are involved in the rename operation.
8991 if (srcdnl
->is_primary() && !mdr
->more()->is_ambiguous_auth
) {
8992 dout(10) << " preparing ambiguous auth for srci" << dendl
;
8993 ceph_assert(mdr
->more()->is_remote_frozen_authpin
);
8994 ceph_assert(mdr
->more()->rename_inode
== srci
);
8995 _rename_prepare_witness(mdr
, last
, witnesses
, srctrace
, desttrace
, straydn
);
9000 for (set
<mds_rank_t
>::iterator p
= witnesses
.begin();
9001 p
!= witnesses
.end();
9003 if (*p
== last
) continue; // do it last!
9004 if (mdr
->more()->witnessed
.count(*p
)) {
9005 dout(10) << " already witnessed by mds." << *p
<< dendl
;
9006 } else if (mdr
->more()->waiting_on_peer
.count(*p
)) {
9007 dout(10) << " already waiting on witness mds." << *p
<< dendl
;
9009 if (!_rename_prepare_witness(mdr
, *p
, witnesses
, srctrace
, desttrace
, straydn
))
9013 if (!mdr
->more()->waiting_on_peer
.empty())
9014 return; // we're waiting for a witness.
9016 if (last
!= MDS_RANK_NONE
&& mdr
->more()->witnessed
.count(last
) == 0) {
9017 dout(10) << " preparing last witness (srcdn auth)" << dendl
;
9018 ceph_assert(mdr
->more()->waiting_on_peer
.count(last
) == 0);
9019 _rename_prepare_witness(mdr
, last
, witnesses
, srctrace
, desttrace
, straydn
);
9023 // test hack: bail after peer does prepare, so we can verify it's _live_ rollback.
9024 if (!mdr
->more()->peers
.empty() && !srci
->is_dir())
9025 ceph_assert(g_conf()->mds_kill_rename_at
!= 3);
9026 if (!mdr
->more()->peers
.empty() && srci
->is_dir())
9027 ceph_assert(g_conf()->mds_kill_rename_at
!= 4);
9029 // -- declare now --
9030 mdr
->set_mds_stamp(ceph_clock_now());
9032 // -- prepare journal entry --
9033 mdr
->ls
= mdlog
->get_current_segment();
9034 EUpdate
*le
= new EUpdate(mdlog
, "rename");
9035 mdlog
->start_entry(le
);
9036 le
->metablob
.add_client_req(mdr
->reqid
, req
->get_oldest_client_tid());
9037 if (!mdr
->more()->witnessed
.empty()) {
9038 dout(20) << " noting uncommitted_peers " << mdr
->more()->witnessed
<< dendl
;
9040 le
->reqid
= mdr
->reqid
;
9041 le
->had_peers
= true;
9043 mdcache
->add_uncommitted_leader(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
9044 // no need to send frozen auth pin to recovring auth MDS of srci
9045 mdr
->more()->is_remote_frozen_authpin
= false;
9048 _rename_prepare(mdr
, &le
->metablob
, &le
->client_map
, srcdn
, destdn
, req
->get_alternate_name(), straydn
);
9049 if (le
->client_map
.length())
9050 le
->cmapv
= mds
->sessionmap
.get_projected();
9052 // -- commit locally --
9053 C_MDS_rename_finish
*fin
= new C_MDS_rename_finish(this, mdr
, srcdn
, destdn
, straydn
);
9055 journal_and_reply(mdr
, srci
, destdn
, le
, fin
);
9057 // trigger to flush mdlog in case reintegrating or migrating the stray dn,
9058 // because the link requests maybe waiting.
9059 if (srcdn
->get_dir()->inode
->is_stray()) {
9062 mds
->balancer
->maybe_fragment(destdn
->get_dir(), false);
9066 void Server::_rename_finish(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
9068 dout(10) << "_rename_finish " << *mdr
<< dendl
;
9070 if (!mdr
->more()->witnessed
.empty())
9071 mdcache
->logged_leader_update(mdr
->reqid
);
9074 _rename_apply(mdr
, srcdn
, destdn
, straydn
);
9076 mdcache
->send_dentry_link(destdn
, mdr
);
9078 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
9079 CInode
*in
= destdnl
->get_inode();
9080 bool need_eval
= mdr
->more()->cap_imports
.count(in
);
9082 // test hack: test peer commit
9083 if (!mdr
->more()->peers
.empty() && !in
->is_dir())
9084 ceph_assert(g_conf()->mds_kill_rename_at
!= 5);
9085 if (!mdr
->more()->peers
.empty() && in
->is_dir())
9086 ceph_assert(g_conf()->mds_kill_rename_at
!= 6);
9089 mds
->balancer
->hit_dir(srcdn
->get_dir(), META_POP_IWR
);
9090 if (destdnl
->is_remote() && in
->is_auth())
9091 mds
->balancer
->hit_inode(in
, META_POP_IWR
);
9093 // did we import srci? if so, explicitly ack that import that, before we unlock and reply.
9095 ceph_assert(g_conf()->mds_kill_rename_at
!= 7);
9098 respond_to_request(mdr
, 0);
9101 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
, true);
9104 // respond_to_request() drops locks. So stray reintegration can race with us.
9105 if (straydn
&& !straydn
->get_projected_linkage()->is_null()) {
9106 mdcache
->notify_stray(straydn
);
// Ask MDS `who` to act as a witness for this rename: build an
// OP_RENAMEPREP peer request carrying the source/destination dentry paths,
// the client's alternate_name, a replica of the stray dentry, projected
// snaprealm blobs, the srcdn auth rank, and the current witness set (which
// the srcdn auth will verify), then send it and record `who` in
// waiting_on_peer. If `who` is not active in a degraded cluster, retry
// the request once it becomes active instead.
// NOTE(review): `witnesse` (sic) is the parameter name as written in
// SOURCE; kept byte-identical. Code below is unmodified extraction text.
9114 bool Server::_rename_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, set
<mds_rank_t
> &witnesse
,
9115 vector
<CDentry
*>& srctrace
, vector
<CDentry
*>& dsttrace
, CDentry
*straydn
)
9117 const auto& client_req
= mdr
->client_request
;
9118 ceph_assert(client_req
);
// Cannot prepare a witness that is not (yet) active; wait and retry.
9120 if (mds
->is_cluster_degraded() &&
9121 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(who
)) {
9122 dout(10) << "_rename_prepare_witness mds." << who
<< " is not active" << dendl
;
9123 if (mdr
->more()->waiting_on_peer
.empty())
9124 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(mdcache
, mdr
));
9128 dout(10) << "_rename_prepare_witness mds." << who
<< dendl
;
9129 auto req
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RENAMEPREP
);
// Encode src/dest paths as (base dirfrag ino + dentry name chain).
9131 req
->srcdnpath
= filepath(srctrace
.front()->get_dir()->ino());
9132 for (auto dn
: srctrace
)
9133 req
->srcdnpath
.push_dentry(dn
->get_name());
9134 req
->destdnpath
= filepath(dsttrace
.front()->get_dir()->ino());
9135 for (auto dn
: dsttrace
)
9136 req
->destdnpath
.push_dentry(dn
->get_name());
9137 req
->alternate_name
= client_req
->alternate_name
;
// Ship a replica of the stray dentry so the witness can link the
// displaced target inode into it.
9139 mdcache
->encode_replica_stray(straydn
, who
, req
->straybl
);
// Projected snaprealms for source/dest inodes, if any were prepared.
9141 if (mdr
->more()->srci_srnode
)
9142 encode(*mdr
->more()->srci_srnode
, req
->srci_snapbl
);
9143 if (mdr
->more()->desti_srnode
)
9144 encode(*mdr
->more()->desti_srnode
, req
->desti_snapbl
);
9146 req
->srcdn_auth
= mdr
->more()->srcdn_auth_mds
;
9148 // srcdn auth will verify our current witness list is sufficient
9149 req
->witnesses
= witnesse
;
9151 req
->op_stamp
= mdr
->get_op_stamp();
9152 mds
->send_message_mds(req
, who
);
// Track the outstanding ack; must not already be pending for this rank.
9154 ceph_assert(mdr
->more()->waiting_on_peer
.count(who
) == 0);
9155 mdr
->more()->waiting_on_peer
.insert(who
);
// Decode the inode-import payload for a rename that pulls the source inode
// from another MDS: decodes the exporting MDS's client maps, force-opens
// those sessions locally, re-encodes the maps into client_map_bl (for the
// journal event), imports the inode and its caps via the migrator, and
// temporarily forces the inode to !auth + clean (see "hack" comment below).
// NOTE(review): presumably returns `oldpv` (the pre-import projected
// version captured below) — the `return` line is not visible in this
// extraction; confirm against upstream Server.cc.
9159 version_t
Server::_rename_prepare_import(MDRequestRef
& mdr
, CDentry
*srcdn
, bufferlist
*client_map_bl
)
9161 version_t oldpv
= mdr
->more()->inode_import_v
;
9163 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
// Cursor over the import blob sent by the srcdn auth MDS.
9166 auto blp
= mdr
->more()->inode_import
.cbegin();
9169 map
<client_t
,entity_inst_t
> client_map
;
9170 map
<client_t
, client_metadata_t
> client_metadata_map
;
9171 decode(client_map
, blp
);
9172 decode(client_metadata_map
, blp
);
// Open sessions for the exporting MDS's clients so their caps can land here.
9173 prepare_force_open_sessions(client_map
, client_metadata_map
,
9174 mdr
->more()->imported_session_map
);
// Re-encode the maps for our own journal entry.
9175 encode(client_map
, *client_map_bl
, mds
->mdsmap
->get_up_features());
9176 encode(client_metadata_map
, *client_map_bl
);
9178 list
<ScatterLock
*> updated_scatterlocks
;
9179 mdcache
->migrator
->decode_import_inode(srcdn
, blp
, srcdn
->authority().first
, mdr
->ls
,
9180 mdr
->more()->cap_imports
, updated_scatterlocks
);
9182 // hack: force back to !auth and clean, temporarily
9183 srcdnl
->get_inode()->state_clear(CInode::STATE_AUTH
);
9184 srcdnl
->get_inode()->mark_clean();
// Decide whether a rename involving directory `diri` must be journaled
// locally even though we are not auth for the dentry: returns true if any
// of diri's dirfrags is itself an auth subtree root here, or contains an
// auth subtree of this rank — journal replay needs those dirfrags recorded
// to rebuild subtree bounds. (`empty` is unused in the visible text —
// presumably a hint from callers; confirm against upstream.)
9189 bool Server::_need_force_journal(CInode
*diri
, bool empty
)
9191 auto&& dirs
= diri
->get_dirfrags();
9193 bool force_journal
= false;
// First pass: is any frag of diri directly an auth subtree root?
9195 for (const auto& dir
: dirs
) {
9196 if (dir
->is_subtree_root() && dir
->get_dir_auth().first
== mds
->get_nodeid()) {
9197 dout(10) << " frag " << dir
->get_frag() << " is auth subtree dirfrag, will force journal" << dendl
;
9198 force_journal
= true;
9201 dout(20) << " frag " << dir
->get_frag() << " is not auth subtree dirfrag" << dendl
;
9204 // see if any children of our frags are auth subtrees.
9205 std::vector
<CDir
*> subtrees
;
9206 mdcache
->get_subtrees(subtrees
);
9207 dout(10) << " subtrees " << subtrees
<< " frags " << dirs
<< dendl
;
// Second pass: does any frag contain an auth subtree somewhere beneath it?
9208 for (const auto& dir
: dirs
) {
9209 for (const auto& subtree
: subtrees
) {
9210 if (dir
->contains(subtree
)) {
9211 if (subtree
->get_dir_auth().first
== mds
->get_nodeid()) {
9212 dout(10) << " frag " << dir
->get_frag() << " contains (maybe) auth subtree, will force journal "
9213 << *subtree
<< dendl
;
9214 force_journal
= true;
9217 dout(20) << " frag " << dir
->get_frag() << " contains but isn't auth for " << *subtree
<< dendl
;
9219 dout(20) << " frag " << dir
->get_frag() << " does not contain " << *subtree
<< dendl
;
9225 return force_journal
;
// Project all metadata changes for a rename into `metablob` (the journal
// event body) without applying them: projects inode versions/backtraces,
// pushes projected dentry linkages (stray gets the displaced target, dest
// gets srci, src goes null), predirties parent fragstats/rstats, journals
// the affected dentries, and registers projected subtree renames. Handles
// primary vs remote source/dest linkages, link-merge (src and dest resolve
// to the same inode), stray-reintegration "silent" mode, snaprealm
// projection, forced journaling of non-auth dentries with nested auth
// subtrees, and cross-MDS inode import (via _rename_prepare_import).
// NOTE(review): this extraction has dropped some structural lines (e.g.
// the `if (linkmerge)` guards around several branches and closing braces);
// the code below is kept byte-identical — verify against upstream before
// editing logic.
9228 void Server::_rename_prepare(MDRequestRef
& mdr
,
9229 EMetaBlob
*metablob
, bufferlist
*client_map_bl
,
9230 CDentry
*srcdn
, CDentry
*destdn
, std::string_view alternate_name
,
9233 dout(10) << "_rename_prepare " << *mdr
<< " " << *srcdn
<< " " << *destdn
<< dendl
;
9235 dout(10) << " straydn " << *straydn
<< dendl
;
9237 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
9238 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
9239 CInode
*srci
= srcdnl
->get_inode();
9240 CInode
*oldin
= destdnl
->get_inode();
9242 // primary+remote link merge?
9243 bool linkmerge
= (srci
== oldin
);
9245 ceph_assert(srcdnl
->is_primary() && destdnl
->is_remote());
// "silent": renaming out of a stray dir (reintegration) must not touch
// nlink or dir mtimes.
9246 bool silent
= srcdn
->get_dir()->inode
->is_stray();
// Work out whether non-auth dentries still need local journaling because
// auth subtrees will be nested beneath them after the rename.
9248 bool force_journal_dest
= false;
9249 if (srci
->is_dir() && !destdn
->is_auth()) {
9250 if (srci
->is_auth()) {
9251 // if we are auth for srci and exporting it, force journal because journal replay needs
9252 // the source inode to create auth subtrees.
9253 dout(10) << " we are exporting srci, will force journal destdn" << dendl
;
9254 force_journal_dest
= true;
9256 force_journal_dest
= _need_force_journal(srci
, false);
9259 bool force_journal_stray
= false;
9260 if (oldin
&& oldin
->is_dir() && straydn
&& !straydn
->is_auth())
9261 force_journal_stray
= _need_force_journal(oldin
, true);
9264 dout(10) << " merging remote and primary links to the same inode" << dendl
;
9266 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl
;
9267 if (force_journal_dest
)
9268 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl
;
9269 if (force_journal_stray
)
9270 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl
;
// Record the renamed directory ino so replay can adjust subtree maps.
9272 if (srci
->is_dir() && (destdn
->is_auth() || force_journal_dest
)) {
9273 dout(10) << " noting renamed dir ino " << srci
->ino() << " in metablob" << dendl
;
9274 metablob
->renamed_dirino
= srci
->ino();
9275 } else if (oldin
&& oldin
->is_dir() && force_journal_stray
) {
9276 dout(10) << " noting rename target dir " << oldin
->ino() << " in metablob" << dendl
;
9277 metablob
->renamed_dirino
= oldin
->ino();
9281 CInode::mempool_inode
*spi
= 0; // renamed inode
9282 CInode::mempool_inode
*tpi
= 0; // target/overwritten inode
// --- project the target (overwritten) inode and the stray linkage ---
9286 if (destdnl
->is_primary()) {
9287 ceph_assert(straydn
); // moving to straydn.
9288 // link--, and move.
9289 if (destdn
->is_auth()) {
9290 auto pi
= oldin
->project_inode(mdr
); //project_snaprealm
9291 pi
.inode
->version
= straydn
->pre_dirty(pi
.inode
->version
);
9292 pi
.inode
->update_backtrace();
9293 tpi
= pi
.inode
.get();
9295 straydn
->push_projected_linkage(oldin
);
9296 } else if (destdnl
->is_remote()) {
9298 if (oldin
->is_auth()) {
9299 auto pi
= oldin
->project_inode(mdr
);
9300 pi
.inode
->version
= oldin
->pre_dirty();
9301 tpi
= pi
.inode
.get();
9307 if (destdnl
->is_null()) {
9308 /* handle_client_rename checks that alternate_name matches for existing destdn */
9309 destdn
->set_alternate_name(alternate_name
);
// --- project the source inode and the new dest linkage ---
9311 if (srcdnl
->is_remote()) {
9314 if (destdn
->is_auth())
9315 mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty();
9316 destdn
->push_projected_linkage(srcdnl
->get_remote_ino(), srcdnl
->get_remote_d_type());
9318 if (srci
->is_auth()) {
9319 auto pi
= srci
->project_inode(mdr
);
9320 pi
.inode
->version
= srci
->pre_dirty();
9321 spi
= pi
.inode
.get();
9324 dout(10) << " will merge remote onto primary link" << dendl
;
9325 if (destdn
->is_auth()) {
9326 auto pi
= oldin
->project_inode(mdr
);
9327 pi
.inode
->version
= mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty(oldin
->get_version());
9328 spi
= pi
.inode
.get();
9332 if (destdn
->is_auth()) {
// For a primary srcdn: take the projected version locally if we are
// srcdn auth, otherwise import the inode from the srcdn auth MDS.
9334 if (srcdn
->is_auth())
9335 oldpv
= srci
->get_projected_version();
9337 oldpv
= _rename_prepare_import(mdr
, srcdn
, client_map_bl
);
9339 // note which dirfrags have child subtrees in the journal
9340 // event, so that we can open those (as bounds) during replay.
9341 if (srci
->is_dir()) {
9342 auto&& ls
= srci
->get_dirfrags();
9343 for (const auto& dir
: ls
) {
9344 if (!dir
->is_auth())
9345 metablob
->renamed_dir_frags
.push_back(dir
->get_frag());
9347 dout(10) << " noting renamed dir open frags " << metablob
->renamed_dir_frags
<< dendl
;
9350 auto pi
= srci
->project_inode(mdr
); // project snaprealm if srcdnl->is_primary
9351 // & srcdnl->snaprealm
9352 pi
.inode
->version
= mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty(oldpv
);
9353 pi
.inode
->update_backtrace();
9354 spi
= pi
.inode
.get();
9356 destdn
->push_projected_linkage(srci
);
9360 if (srcdn
->is_auth())
9361 mdr
->more()->pvmap
[srcdn
] = srcdn
->pre_dirty();
9362 srcdn
->push_projected_linkage(); // push null linkage
// --- stamp ctime/rctime on the projected inodes ---
9366 spi
->ctime
= mdr
->get_op_stamp();
9367 if (mdr
->get_op_stamp() > spi
->rstat
.rctime
)
9368 spi
->rstat
.rctime
= mdr
->get_op_stamp();
9374 tpi
->ctime
= mdr
->get_op_stamp();
9375 if (mdr
->get_op_stamp() > tpi
->rstat
.rctime
)
9376 tpi
->rstat
.rctime
= mdr
->get_op_stamp();
// Remember the overwritten inode's old path for stray bookkeeping.
9380 destdn
->make_path_string(t
, true);
9381 tpi
->stray_prior_path
= std::move(t
);
9384 if (tpi
->nlink
== 0)
9385 oldin
->state_set(CInode::STATE_ORPHAN
);
9389 // prepare nesting, mtime updates
9390 int predirty_dir
= silent
? 0:PREDIRTY_DIR
;
9392 // guarantee stray dir is processed first during journal replay. unlink the old inode,
9393 // then link the source inode to destdn
9394 if (destdnl
->is_primary()) {
9395 ceph_assert(straydn
);
9396 if (straydn
->is_auth()) {
9397 metablob
->add_dir_context(straydn
->get_dir());
9398 metablob
->add_dir(straydn
->get_dir(), true);
9402 if (!linkmerge
&& destdnl
->is_remote() && oldin
->is_auth()) {
9403 CDir
*oldin_dir
= oldin
->get_projected_parent_dir();
9404 if (oldin_dir
!= srcdn
->get_dir() && oldin_dir
!= destdn
->get_dir())
9405 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, oldin_dir
, PREDIRTY_PRIMARY
);
9409 if (destdn
->is_auth() && !destdnl
->is_null()) {
9410 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, destdn
->get_dir(),
9411 (destdnl
->is_primary() ? PREDIRTY_PRIMARY
:0)|predirty_dir
, -1);
9412 if (destdnl
->is_primary()) {
9413 ceph_assert(straydn
);
9414 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, straydn
->get_dir(),
9415 PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
9419 if (srcdnl
->is_remote() && srci
->is_auth()) {
9420 CDir
*srci_dir
= srci
->get_projected_parent_dir();
9421 if (srci_dir
!= srcdn
->get_dir() && srci_dir
!= destdn
->get_dir())
9422 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, srci_dir
, PREDIRTY_PRIMARY
);
9426 int predirty_primary
= (srcdnl
->is_primary() && srcdn
->get_dir() != destdn
->get_dir()) ? PREDIRTY_PRIMARY
:0;
9427 int flags
= predirty_dir
| predirty_primary
;
9428 if (srcdn
->is_auth())
9429 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, srcdn
->get_dir(), PREDIRTY_SHALLOW
|flags
, -1);
9430 if (destdn
->is_auth())
9431 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, destdn
->get_dir(), flags
, 1);
9433 // add it all to the metablob
9436 if (destdnl
->is_primary()) {
9437 ceph_assert(straydn
);
9438 if (destdn
->is_auth()) {
9439 // project snaprealm, too
9440 if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
9441 oldin
->project_snaprealm(desti_srnode
);
9442 if (tpi
->nlink
== 0)
9443 ceph_assert(!desti_srnode
->is_parent_global());
9444 desti_srnode
= NULL
;
9446 straydn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
9447 metablob
->add_primary_dentry(straydn
, oldin
, true, true);
9448 } else if (force_journal_stray
) {
9449 dout(10) << " forced journaling straydn " << *straydn
<< dendl
;
9450 metablob
->add_dir_context(straydn
->get_dir());
9451 metablob
->add_primary_dentry(straydn
, oldin
, true);
9453 } else if (destdnl
->is_remote()) {
9454 if (oldin
->is_auth()) {
9455 sr_t
*new_srnode
= NULL
;
9456 if (mdr
->peer_request
) {
9457 if (mdr
->peer_request
->desti_snapbl
.length() > 0) {
9458 new_srnode
= new sr_t();
9459 auto p
= mdr
->peer_request
->desti_snapbl
.cbegin();
9460 decode(*new_srnode
, p
);
9462 } else if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
9463 new_srnode
= desti_srnode
;
9464 desti_srnode
= NULL
;
9467 oldin
->project_snaprealm(new_srnode
);
9468 if (tpi
->nlink
== 0)
9469 ceph_assert(!new_srnode
->is_parent_global());
9472 CDentry
*oldin_pdn
= oldin
->get_projected_parent_dn();
9473 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, oldin_pdn
);
9474 metablob
->add_primary_dentry(oldin_pdn
, oldin
, true);
// --- journal the new destination linkage ---
9480 if (srcdnl
->is_remote()) {
9481 ceph_assert(!linkmerge
);
9482 if (destdn
->is_auth() && !destdnl
->is_null())
9483 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
9485 destdn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
9487 if (destdn
->is_auth())
9488 metablob
->add_remote_dentry(destdn
, true, srcdnl
->get_remote_ino(), srcdnl
->get_remote_d_type());
9490 if (srci
->is_auth() ) { // it's remote
9491 if (mdr
->peer_request
) {
9492 if (mdr
->peer_request
->srci_snapbl
.length() > 0) {
9493 sr_t
*new_srnode
= new sr_t();
9494 auto p
= mdr
->peer_request
->srci_snapbl
.cbegin();
9495 decode(*new_srnode
, p
);
9496 srci
->project_snaprealm(new_srnode
);
9498 } else if (auto& srci_srnode
= mdr
->more()->srci_srnode
) {
9499 srci
->project_snaprealm(srci_srnode
);
9503 CDentry
*srci_pdn
= srci
->get_projected_parent_dn();
9504 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, srci_pdn
);
9505 metablob
->add_primary_dentry(srci_pdn
, srci
, true);
9507 } else if (srcdnl
->is_primary()) {
9508 // project snap parent update?
9509 if (destdn
->is_auth()) {
9510 if (auto& srci_srnode
= mdr
->more()->srci_srnode
) {
9511 srci
->project_snaprealm(srci_srnode
);
9516 if (destdn
->is_auth() && !destdnl
->is_null())
9517 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
9519 destdn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
// Fault-injection: optionally corrupt destdn->first for testing.
9521 auto do_corruption
= inject_rename_corrupt_dentry_first
;
9522 if (unlikely(do_corruption
> 0.0)) {
9523 auto r
= ceph::util::generate_random_number(0.0, 1.0);
9524 if (r
< do_corruption
) {
9525 dout(0) << "corrupting dn: " << *destdn
<< dendl
;
9526 destdn
->first
= -10;
9531 if (destdn
->is_auth())
9532 metablob
->add_primary_dentry(destdn
, srci
, true, true);
9533 else if (force_journal_dest
) {
9534 dout(10) << " forced journaling destdn " << *destdn
<< dendl
;
9535 metablob
->add_dir_context(destdn
->get_dir());
9536 metablob
->add_primary_dentry(destdn
, srci
, true);
9537 if (srcdn
->is_auth() && srci
->is_dir()) {
9538 // journal new subtrees root dirfrags
9539 auto&& ls
= srci
->get_dirfrags();
9540 for (const auto& dir
: ls
) {
9542 metablob
->add_dir(dir
, true);
// --- journal the (now null) source dentry ---
9549 if (srcdn
->is_auth()) {
9550 dout(10) << " journaling srcdn " << *srcdn
<< dendl
;
9551 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, srcdn
, CEPH_NOSNAP
, 0, srcdnl
);
9552 // also journal the inode in case we need do peer rename rollback. It is Ok to add
9553 // both primary and NULL dentries. Because during journal replay, null dentry is
9554 // processed after primary dentry.
9555 if (srcdnl
->is_primary() && !srci
->is_dir() && !destdn
->is_auth())
9556 metablob
->add_primary_dentry(srcdn
, srci
, true);
9557 metablob
->add_null_dentry(srcdn
, true);
9559 dout(10) << " NOT journaling srcdn " << *srcdn
<< dendl
;
9561 // make renamed inode first track the dn
9562 if (srcdnl
->is_primary() && destdn
->is_auth()) {
9563 ceph_assert(srci
->first
<= destdn
->first
);
9564 srci
->first
= destdn
->first
;
9566 // make stray inode first track the straydn
9567 if (straydn
&& straydn
->is_auth()) {
9568 ceph_assert(oldin
->first
<= straydn
->first
);
9569 oldin
->first
= straydn
->first
;
// Register projected subtree moves so the subtree map stays consistent.
9572 if (oldin
&& oldin
->is_dir()) {
9573 ceph_assert(straydn
);
9574 mdcache
->project_subtree_rename(oldin
, destdn
->get_dir(), straydn
->get_dir());
9577 mdcache
->project_subtree_rename(srci
, srcdn
->get_dir(), destdn
->get_dir());
// Apply a previously projected/journaled rename to the in-memory cache:
// pop projected snaprealms and linkages, move the displaced target inode
// to the stray dentry (or drop its remote link), unlink the source dentry,
// relink the source inode at destdn (remote or primary), finish cap/session
// imports for a cross-MDS rename, and finally fix up subtree maps and send
// snaprealm split notifications. Mirrors the projection order used by
// _rename_prepare.
// NOTE(review): line-mangled extraction; some guards/braces are missing
// from the visible text. Code kept byte-identical — verify against
// upstream Server.cc before editing logic.
9582 void Server::_rename_apply(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
9584 dout(10) << "_rename_apply " << *mdr
<< " " << *srcdn
<< " " << *destdn
<< dendl
;
9585 dout(10) << " pvs " << mdr
->more()->pvmap
<< dendl
;
9587 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
9588 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
9590 CInode
*oldin
= destdnl
->get_inode();
9592 // primary+remote link merge?
9593 bool linkmerge
= (srcdnl
->get_inode() == oldin
);
9595 ceph_assert(srcdnl
->is_primary() || destdnl
->is_remote());
9597 bool new_in_snaprealm
= false;
9598 bool new_oldin_snaprealm
= false;
// --- retire the overwritten target inode (if any) ---
9602 if (destdnl
->is_primary()) {
9603 ceph_assert(straydn
);
9604 dout(10) << "straydn is " << *straydn
<< dendl
;
9606 // if there is newly created snaprealm, need to split old snaprealm's
9607 // inodes_with_caps. So pop snaprealm before linkage changes.
9608 if (destdn
->is_auth()) {
9609 bool hadrealm
= (oldin
->snaprealm
? true : false);
9610 oldin
->early_pop_projected_snaprealm();
9611 new_oldin_snaprealm
= (oldin
->snaprealm
&& !hadrealm
);
9613 ceph_assert(mdr
->peer_request
);
9614 if (mdr
->peer_request
->desti_snapbl
.length()) {
9615 new_oldin_snaprealm
= !oldin
->snaprealm
;
9616 oldin
->decode_snap_blob(mdr
->peer_request
->desti_snapbl
);
9617 ceph_assert(oldin
->snaprealm
);
9621 destdn
->get_dir()->unlink_inode(destdn
, false);
9623 straydn
->pop_projected_linkage();
9624 if (mdr
->is_peer() && !mdr
->more()->peer_update_journaled
)
9625 ceph_assert(!straydn
->is_projected()); // no other projected
9628 if (destdn
->is_auth())
9629 oldin
->pop_and_dirty_projected_inode(mdr
->ls
, mdr
);
9631 mdcache
->touch_dentry_bottom(straydn
); // drop dn as quickly as possible.
9632 } else if (destdnl
->is_remote()) {
9633 destdn
->get_dir()->unlink_inode(destdn
, false);
9634 if (oldin
->is_auth()) {
9635 oldin
->pop_and_dirty_projected_inode(mdr
->ls
, mdr
);
9636 } else if (mdr
->peer_request
) {
9637 if (mdr
->peer_request
->desti_snapbl
.length() > 0) {
9638 ceph_assert(oldin
->snaprealm
);
9639 oldin
->decode_snap_blob(mdr
->peer_request
->desti_snapbl
);
9641 } else if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
9642 delete desti_srnode
;
9643 desti_srnode
= NULL
;
9648 // unlink src before we relink it at dest
9649 CInode
*in
= srcdnl
->get_inode();
9652 bool srcdn_was_remote
= srcdnl
->is_remote();
9653 if (!srcdn_was_remote
) {
9654 // if there is newly created snaprealm, need to split old snaprealm's
9655 // inodes_with_caps. So pop snaprealm before linkage changes.
9656 if (destdn
->is_auth()) {
9657 bool hadrealm
= (in
->snaprealm
? true : false);
9658 in
->early_pop_projected_snaprealm();
9659 new_in_snaprealm
= (in
->snaprealm
&& !hadrealm
);
9661 ceph_assert(mdr
->peer_request
);
9662 if (mdr
->peer_request
->srci_snapbl
.length()) {
9663 new_in_snaprealm
= !in
->snaprealm
;
9664 in
->decode_snap_blob(mdr
->peer_request
->srci_snapbl
);
9665 ceph_assert(in
->snaprealm
);
9670 srcdn
->get_dir()->unlink_inode(srcdn
);
9672 // After the stray dn being unlinked from the corresponding inode in case of
9673 // reintegrate_stray/migrate_stray, just wake up the waitiers.
9674 MDSContext::vec finished
;
9675 in
->take_waiting(CInode::WAIT_UNLINK
, finished
);
9676 if (!finished
.empty()) {
9677 mds
->queue_waiters(finished
);
// --- relink the source inode at the destination dentry ---
9681 if (srcdn_was_remote
) {
9684 destdnl
= destdn
->pop_projected_linkage();
9685 if (mdr
->is_peer() && !mdr
->more()->peer_update_journaled
)
9686 ceph_assert(!destdn
->is_projected()); // no other projected
9688 destdn
->link_remote(destdnl
, in
);
9689 if (destdn
->is_auth())
9690 destdn
->mark_dirty(mdr
->more()->pvmap
[destdn
], mdr
->ls
);
9692 if (in
->is_auth()) {
9693 in
->pop_and_dirty_projected_inode(mdr
->ls
, mdr
);
9694 } else if (mdr
->peer_request
) {
9695 if (mdr
->peer_request
->srci_snapbl
.length() > 0) {
9696 ceph_assert(in
->snaprealm
);
9697 in
->decode_snap_blob(mdr
->peer_request
->srci_snapbl
);
9699 } else if (auto& srci_srnode
= mdr
->more()->srci_srnode
) {
9704 dout(10) << "merging remote onto primary link" << dendl
;
9705 oldin
->pop_and_dirty_projected_inode(mdr
->ls
, mdr
);
9709 dout(10) << "merging primary onto remote link" << dendl
;
9710 destdn
->get_dir()->unlink_inode(destdn
, false);
9712 destdnl
= destdn
->pop_projected_linkage();
9713 if (mdr
->is_peer() && !mdr
->more()->peer_update_journaled
)
9714 ceph_assert(!destdn
->is_projected()); // no other projected
9716 // srcdn inode import?
9717 if (!srcdn
->is_auth() && destdn
->is_auth()) {
9718 ceph_assert(mdr
->more()->inode_import
.length() > 0);
9720 map
<client_t
,Capability::Import
> imported_caps
;
9722 // finish cap imports
9723 finish_force_open_sessions(mdr
->more()->imported_session_map
);
9724 if (mdr
->more()->cap_imports
.count(destdnl
->get_inode())) {
9725 mdcache
->migrator
->finish_import_inode_caps(destdnl
->get_inode(),
9726 mdr
->more()->srcdn_auth_mds
, true,
9727 mdr
->more()->imported_session_map
,
9728 mdr
->more()->cap_imports
[destdnl
->get_inode()],
9732 mdr
->more()->inode_import
.clear();
9733 encode(imported_caps
, mdr
->more()->inode_import
);
9735 /* hack: add an auth pin for each xlock we hold. These were
9736 * remote xlocks previously but now they're local and
9737 * we're going to try and unpin when we xlock_finish. */
9739 for (auto i
= mdr
->locks
.lower_bound(&destdnl
->get_inode()->versionlock
);
9740 i
!= mdr
->locks
.end();
9742 SimpleLock
*lock
= i
->lock
;
9743 if (lock
->get_parent() != destdnl
->get_inode())
9745 if (i
->is_xlock() && !lock
->is_locallock())
9746 mds
->locker
->xlock_import(lock
);
9749 // hack: fix auth bit
9750 in
->state_set(CInode::STATE_AUTH
);
9752 mdr
->clear_ambiguous_auth();
9755 if (destdn
->is_auth())
9756 in
->pop_and_dirty_projected_inode(mdr
->ls
, mdr
);
// --- retire the source dentry (now null) ---
9760 if (srcdn
->is_auth())
9761 srcdn
->mark_dirty(mdr
->more()->pvmap
[srcdn
], mdr
->ls
);
9762 srcdn
->pop_projected_linkage();
9763 if (mdr
->is_peer() && !mdr
->more()->peer_update_journaled
)
9764 ceph_assert(!srcdn
->is_projected()); // no other projected
9766 // apply remaining projected inodes (nested)
9769 // update subtree map?
9770 if (destdnl
->is_primary() && in
->is_dir())
9771 mdcache
->adjust_subtree_after_rename(in
, srcdn
->get_dir(), true);
9773 if (straydn
&& oldin
->is_dir())
9774 mdcache
->adjust_subtree_after_rename(oldin
, destdn
->get_dir(), true);
9776 if (new_oldin_snaprealm
)
9777 mdcache
->do_realm_invalidate_and_update_notify(oldin
, CEPH_SNAP_OP_SPLIT
, false);
9778 if (new_in_snaprealm
)
9779 mdcache
->do_realm_invalidate_and_update_notify(in
, CEPH_SNAP_OP_SPLIT
, true);
9781 // removing a new dn?
9782 if (srcdn
->is_auth())
9783 srcdn
->get_dir()->try_remove_unlinked_dn(srcdn
);
// Journal-completion context for the peer side of a rename: once the
// EPeerUpdate prepare entry is logged, invoke _logged_peer_rename with the
// three dentries involved so the prep-ack can be sent to the leader.
9791 class C_MDS_PeerRenamePrep
: public ServerLogContext
{
9792 CDentry
*srcdn
, *destdn
, *straydn
;
9794 C_MDS_PeerRenamePrep(Server
*s
, MDRequestRef
& m
, CDentry
*sr
, CDentry
*de
, CDentry
*st
) :
9795 ServerLogContext(s
, m
), srcdn(sr
), destdn(de
), straydn(st
) {}
9796 void finish(int r
) override
{
9797 server
->_logged_peer_rename(mdr
, srcdn
, destdn
, straydn
);
// Commit-waiter context for the peer side of a rename: installed early in
// handle_peer_rename_prep (as mdr->more()->peer_commit) and invoked when
// the leader's decision arrives, forwarding to _commit_peer_rename with
// the result code and the dentries involved.
9801 class C_MDS_PeerRenameCommit
: public ServerContext
{
9803 CDentry
*srcdn
, *destdn
, *straydn
;
9805 C_MDS_PeerRenameCommit(Server
*s
, MDRequestRef
& m
, CDentry
*sr
, CDentry
*de
, CDentry
*st
) :
9806 ServerContext(s
), mdr(m
), srcdn(sr
), destdn(de
), straydn(st
) {}
9807 void finish(int r
) override
{
9808 server
->_commit_peer_rename(mdr
, r
, srcdn
, destdn
, straydn
);
// Gather-finisher context used by handle_peer_rename_prep: fires once all
// affected client sessions have been flushed, resuming the rename via
// _peer_rename_sessions_flushed.
9812 class C_MDS_PeerRenameSessionsFlushed
: public ServerContext
{
9815 C_MDS_PeerRenameSessionsFlushed(Server
*s
, MDRequestRef
& r
) :
9816 ServerContext(s
), mdr(r
) {}
9817 void finish(int r
) override
{
9818 server
->_peer_rename_sessions_flushed(mdr
);
// Peer-side handler for OP_RENAMEPREP from the rename leader: resolves the
// src/dest paths locally, installs the commit waiter, and — if we are the
// srcdn auth — freezes the source inode, marks it ambiguous-auth, notifies
// bystander replicas (OP_RENAMENOTIFY), flushes affected client sessions,
// and verifies the leader's witness list covers all srcdn replicas
// (replying with the replica list if not). It then records a
// rename_rollback blob capturing the original src/dest/stray state and
// journals an EPeerUpdate PREPARE entry (or short-circuits to
// _logged_peer_rename when the metablob is empty).
// NOTE(review): line-mangled extraction; several guards/braces are not
// visible. Code kept byte-identical — verify against upstream Server.cc.
9822 void Server::handle_peer_rename_prep(MDRequestRef
& mdr
)
9824 dout(10) << "handle_peer_rename_prep " << *mdr
9825 << " " << mdr
->peer_request
->srcdnpath
9826 << " to " << mdr
->peer_request
->destdnpath
// Leader aborted before we prepared anything: ack with a noop reply.
9829 if (mdr
->peer_request
->is_interrupted()) {
9830 dout(10) << " peer request interrupted, sending noop reply" << dendl
;
9831 auto reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RENAMEPREPACK
);
9832 reply
->mark_interrupted();
9833 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
9834 mdr
->reset_peer_request();
// --- resolve destination dentry ---
9839 filepath
destpath(mdr
->peer_request
->destdnpath
);
9840 dout(10) << " dest " << destpath
<< dendl
;
9841 vector
<CDentry
*> trace
;
9842 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, false);
9843 int r
= mdcache
->path_traverse(mdr
, cf
, destpath
,
9844 MDS_TRAVERSE_DISCOVER
| MDS_TRAVERSE_PATH_LOCKED
| MDS_TRAVERSE_WANT_DENTRY
,
9847 if (r
== -CEPHFS_ESTALE
) {
9848 mdcache
->find_ino_peers(destpath
.get_ino(), new C_MDS_RetryRequest(mdcache
, mdr
),
9849 mdr
->peer_to_mds
, true);
9852 ceph_assert(r
== 0); // we shouldn't get an error here!
9854 CDentry
*destdn
= trace
.back();
9855 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
9856 dout(10) << " destdn " << *destdn
<< dendl
;
// --- resolve source dentry ---
9860 filepath
srcpath(mdr
->peer_request
->srcdnpath
);
9861 dout(10) << " src " << srcpath
<< dendl
;
9862 CInode
*srci
= nullptr;
9863 r
= mdcache
->path_traverse(mdr
, cf
, srcpath
,
9864 MDS_TRAVERSE_DISCOVER
| MDS_TRAVERSE_PATH_LOCKED
,
9867 ceph_assert(r
== 0);
9869 CDentry
*srcdn
= trace
.back();
9870 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
9871 dout(10) << " srcdn " << *srcdn
<< dendl
;
9876 bool linkmerge
= srcdnl
->get_inode() == destdnl
->get_inode();
9878 ceph_assert(srcdnl
->is_primary() && destdnl
->is_remote());
9879 CDentry
*straydn
= mdr
->straydn
;
9880 if (destdnl
->is_primary() && !linkmerge
)
9881 ceph_assert(straydn
);
9883 mdr
->set_op_stamp(mdr
->peer_request
->op_stamp
);
9884 mdr
->more()->srcdn_auth_mds
= srcdn
->authority().first
;
9886 // set up commit waiter (early, to clean up any freezing etc we do)
9887 if (!mdr
->more()->peer_commit
)
9888 mdr
->more()->peer_commit
= new C_MDS_PeerRenameCommit(this, mdr
, srcdn
, destdn
, straydn
);
// --- srcdn-auth-only work: freeze srci, notify bystanders, check witnesses ---
9891 if (srcdn
->is_auth()) {
9892 set
<mds_rank_t
> srcdnrep
;
9893 srcdn
->list_replicas(srcdnrep
);
9895 bool reply_witness
= false;
9896 if (srcdnl
->is_primary() && !srcdnl
->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH
)) {
9899 // - avoid conflicting lock state changes
9900 // - avoid concurrent updates to the inode
9901 // (this could also be accomplished with the versionlock)
9902 int allowance
= 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
9903 dout(10) << " freezing srci " << *srcdnl
->get_inode() << " with allowance " << allowance
<< dendl
;
9904 bool frozen_inode
= srcdnl
->get_inode()->freeze_inode(allowance
);
9906 // unfreeze auth pin after freezing the inode to avoid queueing waiters
9907 if (srcdnl
->get_inode()->is_frozen_auth_pin())
9908 mdr
->unfreeze_auth_pin();
9910 if (!frozen_inode
) {
9911 srcdnl
->get_inode()->add_waiter(CInode::WAIT_FROZEN
, new C_MDS_RetryRequest(mdcache
, mdr
));
9916 * set ambiguous auth for srci
9917 * NOTE: we don't worry about ambiguous cache expire as we do
9918 * with subtree migrations because all peers will pin
9919 * srcdn->get_inode() for duration of this rename.
9921 mdr
->set_ambiguous_auth(srcdnl
->get_inode());
9923 // just mark the source inode as ambiguous auth if more than two MDS are involved.
9924 // the leader will send another OP_RENAMEPREP peer request later.
9925 if (mdr
->peer_request
->witnesses
.size() > 1) {
9926 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl
;
9927 reply_witness
= true;
9930 // make sure bystanders have received all lock related messages
9931 for (set
<mds_rank_t
>::iterator p
= srcdnrep
.begin(); p
!= srcdnrep
.end(); ++p
) {
9932 if (*p
== mdr
->peer_to_mds
||
9933 (mds
->is_cluster_degraded() &&
9934 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(*p
)))
9936 auto notify
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RENAMENOTIFY
);
9937 mds
->send_message_mds(notify
, *p
);
9938 mdr
->more()->waiting_on_peer
.insert(*p
);
9941 // make sure clients have received all cap related messages
9942 set
<client_t
> export_client_set
;
9943 mdcache
->migrator
->get_export_client_set(srcdnl
->get_inode(), export_client_set
);
9945 MDSGatherBuilder
gather(g_ceph_context
);
9946 flush_client_sessions(export_client_set
, gather
);
9947 if (gather
.has_subs()) {
// MDS_RANK_NONE acts as a sentinel "waiting on session flush" entry.
9948 mdr
->more()->waiting_on_peer
.insert(MDS_RANK_NONE
);
9949 gather
.set_finisher(new C_MDS_PeerRenameSessionsFlushed(this, mdr
));
9954 // is witness list sufficient?
9955 for (set
<mds_rank_t
>::iterator p
= srcdnrep
.begin(); p
!= srcdnrep
.end(); ++p
) {
9956 if (*p
== mdr
->peer_to_mds
||
9957 mdr
->peer_request
->witnesses
.count(*p
)) continue;
9958 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl
;
9959 reply_witness
= true;
9963 if (reply_witness
) {
9964 ceph_assert(!srcdnrep
.empty());
9965 auto reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RENAMEPREPACK
);
9966 reply
->witnesses
.swap(srcdnrep
);
9967 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
9968 mdr
->reset_peer_request();
9971 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl
;
9972 if (!mdr
->more()->waiting_on_peer
.empty()) {
9973 dout(10) << " still waiting for rename notify acks from "
9974 << mdr
->more()->waiting_on_peer
<< dendl
;
9977 } else if (srcdnl
->is_primary() && srcdn
->authority() != destdn
->authority()) {
9978 // set ambiguous auth for srci on witnesses
9979 mdr
->set_ambiguous_auth(srcdnl
->get_inode());
9982 // encode everything we'd need to roll this back... basically, just the original state.
9983 rename_rollback rollback
;
9985 rollback
.reqid
= mdr
->reqid
;
9987 rollback
.orig_src
.dirfrag
= srcdn
->get_dir()->dirfrag();
9988 rollback
.orig_src
.dirfrag_old_mtime
= srcdn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
9989 rollback
.orig_src
.dirfrag_old_rctime
= srcdn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
9990 rollback
.orig_src
.dname
= srcdn
->get_name();
9991 if (srcdnl
->is_primary())
9992 rollback
.orig_src
.ino
= srcdnl
->get_inode()->ino();
9994 ceph_assert(srcdnl
->is_remote());
9995 rollback
.orig_src
.remote_ino
= srcdnl
->get_remote_ino();
9996 rollback
.orig_src
.remote_d_type
= srcdnl
->get_remote_d_type();
9999 rollback
.orig_dest
.dirfrag
= destdn
->get_dir()->dirfrag();
10000 rollback
.orig_dest
.dirfrag_old_mtime
= destdn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
10001 rollback
.orig_dest
.dirfrag_old_rctime
= destdn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
10002 rollback
.orig_dest
.dname
= destdn
->get_name();
10003 if (destdnl
->is_primary())
10004 rollback
.orig_dest
.ino
= destdnl
->get_inode()->ino();
10005 else if (destdnl
->is_remote()) {
10006 rollback
.orig_dest
.remote_ino
= destdnl
->get_remote_ino();
10007 rollback
.orig_dest
.remote_d_type
= destdnl
->get_remote_d_type();
10011 rollback
.stray
.dirfrag
= straydn
->get_dir()->dirfrag();
10012 rollback
.stray
.dirfrag_old_mtime
= straydn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
10013 rollback
.stray
.dirfrag_old_rctime
= straydn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
10014 rollback
.stray
.dname
= straydn
->get_name();
// Snapshot the pre-rename snaprealm blobs so rollback can restore them.
10016 if (mdr
->peer_request
->desti_snapbl
.length()) {
10017 CInode
*oldin
= destdnl
->get_inode();
10018 if (oldin
->snaprealm
) {
10019 encode(true, rollback
.desti_snapbl
);
10020 oldin
->encode_snap_blob(rollback
.desti_snapbl
);
10022 encode(false, rollback
.desti_snapbl
);
10025 if (mdr
->peer_request
->srci_snapbl
.length()) {
10026 if (srci
->snaprealm
) {
10027 encode(true, rollback
.srci_snapbl
);
10028 srci
->encode_snap_blob(rollback
.srci_snapbl
);
10030 encode(false, rollback
.srci_snapbl
);
10033 encode(rollback
, mdr
->more()->rollback_bl
);
10034 // FIXME: rollback snaprealm
10035 dout(20) << " rollback is " << mdr
->more()->rollback_bl
.length() << " bytes" << dendl
;
// --- journal the PREPARE entry and ack the leader once it is logged ---
10038 mdr
->ls
= mdlog
->get_current_segment();
10039 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rename_prep", mdr
->reqid
, mdr
->peer_to_mds
,
10040 EPeerUpdate::OP_PREPARE
, EPeerUpdate::RENAME
);
10041 mdlog
->start_entry(le
);
10042 le
->rollback
= mdr
->more()->rollback_bl
;
10044 bufferlist blah
; // inode import data... obviously not used if we're the peer
10045 _rename_prepare(mdr
, &le
->commit
, &blah
, srcdn
, destdn
, mdr
->peer_request
->alternate_name
, straydn
);
10047 if (le
->commit
.empty()) {
10048 dout(10) << " empty metablob, skipping journal" << dendl
;
10049 mdlog
->cancel_entry(le
);
10051 _logged_peer_rename(mdr
, srcdn
, destdn
, straydn
);
10053 mdcache
->add_uncommitted_peer(mdr
->reqid
, mdr
->ls
, mdr
->peer_to_mds
);
10054 mdr
->more()->peer_update_journaled
= true;
10055 submit_mdlog_entry(le
, new C_MDS_PeerRenamePrep(this, mdr
, srcdn
, destdn
, straydn
),
10061 void Server::_logged_peer_rename(MDRequestRef
& mdr
,
10062 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
10064 dout(10) << "_logged_peer_rename " << *mdr
<< dendl
;
10067 ref_t
<MMDSPeerRequest
> reply
;
10068 if (!mdr
->aborted
) {
10069 reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RENAMEPREPACK
);
10070 if (!mdr
->more()->peer_update_journaled
)
10071 reply
->mark_not_journaled();
10074 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
10075 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
10078 if (srcdn
->is_auth() && srcdnl
->is_primary()) {
10079 // set export bounds for CInode::encode_export()
10081 std::vector
<CDir
*> bounds
;
10082 if (srcdnl
->get_inode()->is_dir()) {
10083 srcdnl
->get_inode()->get_dirfrags(bounds
);
10084 for (const auto& bound
: bounds
) {
10085 bound
->state_set(CDir::STATE_EXPORTBOUND
);
10089 map
<client_t
,entity_inst_t
> exported_client_map
;
10090 map
<client_t
, client_metadata_t
> exported_client_metadata_map
;
10091 bufferlist inodebl
;
10092 mdcache
->migrator
->encode_export_inode(srcdnl
->get_inode(), inodebl
,
10093 exported_client_map
,
10094 exported_client_metadata_map
);
10096 for (const auto& bound
: bounds
) {
10097 bound
->state_clear(CDir::STATE_EXPORTBOUND
);
10100 encode(exported_client_map
, reply
->inode_export
, mds
->mdsmap
->get_up_features());
10101 encode(exported_client_metadata_map
, reply
->inode_export
);
10102 reply
->inode_export
.claim_append(inodebl
);
10103 reply
->inode_export_v
= srcdnl
->get_inode()->get_version();
10106 // remove mdr auth pin
10107 mdr
->auth_unpin(srcdnl
->get_inode());
10108 mdr
->more()->is_inode_exporter
= true;
10110 if (srcdnl
->get_inode()->is_dirty())
10111 srcdnl
->get_inode()->mark_clean();
10113 dout(10) << " exported srci " << *srcdnl
->get_inode() << dendl
;
10117 _rename_apply(mdr
, srcdn
, destdn
, straydn
);
10119 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
10122 mds
->balancer
->hit_dir(srcdn
->get_dir(), META_POP_IWR
);
10123 if (destdnl
->get_inode() && destdnl
->get_inode()->is_auth())
10124 mds
->balancer
->hit_inode(destdnl
->get_inode(), META_POP_IWR
);
10127 mdr
->reset_peer_request();
10131 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
10133 ceph_assert(mdr
->aborted
);
10134 dout(10) << " abort flag set, finishing" << dendl
;
10135 mdcache
->request_finish(mdr
);
10139 void Server::_commit_peer_rename(MDRequestRef
& mdr
, int r
,
10140 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
10142 dout(10) << "_commit_peer_rename " << *mdr
<< " r=" << r
<< dendl
;
10144 CInode
*in
= destdn
->get_linkage()->get_inode();
10146 inodeno_t migrated_stray
;
10147 if (srcdn
->is_auth() && srcdn
->get_dir()->inode
->is_stray())
10148 migrated_stray
= in
->ino();
10150 MDSContext::vec finished
;
10152 // unfreeze+singleauth inode
10153 // hmm, do i really need to delay this?
10154 if (mdr
->more()->is_inode_exporter
) {
10156 // we exported, clear out any xlocks that we moved to another MDS
10158 for (auto i
= mdr
->locks
.lower_bound(&in
->versionlock
);
10159 i
!= mdr
->locks
.end(); ) {
10160 SimpleLock
*lock
= i
->lock
;
10161 if (lock
->get_parent() != in
)
10163 // we only care about xlocks on the exported inode
10164 if (i
->is_xlock() && !lock
->is_locallock())
10165 mds
->locker
->xlock_export(i
++, mdr
.get());
10170 map
<client_t
,Capability::Import
> peer_imported
;
10171 auto bp
= mdr
->more()->inode_import
.cbegin();
10172 decode(peer_imported
, bp
);
10174 dout(10) << " finishing inode export on " << *in
<< dendl
;
10175 mdcache
->migrator
->finish_export_inode(in
, mdr
->peer_to_mds
, peer_imported
, finished
);
10176 mds
->queue_waiters(finished
); // this includes SINGLEAUTH waiters.
10179 ceph_assert(in
->is_frozen_inode());
10180 in
->unfreeze_inode(finished
);
10184 if (mdr
->more()->is_ambiguous_auth
) {
10185 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
10186 mdr
->more()->is_ambiguous_auth
= false;
10189 if (straydn
&& mdr
->more()->peer_update_journaled
) {
10190 CInode
*strayin
= straydn
->get_projected_linkage()->get_inode();
10191 if (strayin
&& !strayin
->snaprealm
)
10192 mdcache
->clear_dirty_bits_for_stray(strayin
);
10195 mds
->queue_waiters(finished
);
10198 if (mdr
->more()->peer_update_journaled
) {
10199 // write a commit to the journal
10200 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rename_commit", mdr
->reqid
,
10201 mdr
->peer_to_mds
, EPeerUpdate::OP_COMMIT
,
10202 EPeerUpdate::RENAME
);
10203 mdlog
->start_entry(le
);
10204 submit_mdlog_entry(le
, new C_MDS_CommittedPeer(this, mdr
), mdr
, __func__
);
10207 _committed_peer(mdr
);
10212 // rollback_bl may be empty if we froze the inode but had to provide an expanded
10213 // witness list from the leader, and they failed before we tried prep again.
10214 if (mdr
->more()->rollback_bl
.length()) {
10215 if (mdr
->more()->is_inode_exporter
) {
10216 dout(10) << " reversing inode export of " << *in
<< dendl
;
10217 in
->abort_export();
10219 if (mdcache
->is_ambiguous_peer_update(mdr
->reqid
, mdr
->peer_to_mds
)) {
10220 mdcache
->remove_ambiguous_peer_update(mdr
->reqid
, mdr
->peer_to_mds
);
10221 // rollback but preserve the peer request
10222 do_rename_rollback(mdr
->more()->rollback_bl
, mdr
->peer_to_mds
, mdr
, false);
10223 mdr
->more()->rollback_bl
.clear();
10225 do_rename_rollback(mdr
->more()->rollback_bl
, mdr
->peer_to_mds
, mdr
, true);
10227 dout(10) << " rollback_bl empty, not rollback back rename (leader failed after getting extra witnesses?)" << dendl
;
10229 if (mdr
->more()->is_ambiguous_auth
) {
10230 if (srcdn
->is_auth())
10231 mdr
->more()->rename_inode
->unfreeze_inode(finished
);
10233 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
10234 mdr
->more()->is_ambiguous_auth
= false;
10236 mds
->queue_waiters(finished
);
10237 mdcache
->request_finish(mdr
);
10241 if (migrated_stray
&& mds
->is_stopping())
10242 mdcache
->shutdown_export_stray_finish(migrated_stray
);
10245 static void _rollback_repair_dir(MutationRef
& mut
, CDir
*dir
,
10246 rename_rollback::drec
&r
, utime_t ctime
,
10247 bool isdir
, const nest_info_t
&rstat
)
10249 auto pf
= dir
->project_fnode(mut
);
10250 pf
->version
= dir
->pre_dirty();
10253 pf
->fragstat
.nsubdirs
+= 1;
10255 pf
->fragstat
.nfiles
+= 1;
10258 pf
->rstat
.rbytes
+= rstat
.rbytes
;
10259 pf
->rstat
.rfiles
+= rstat
.rfiles
;
10260 pf
->rstat
.rsubdirs
+= rstat
.rsubdirs
;
10261 pf
->rstat
.rsnaps
+= rstat
.rsnaps
;
10263 if (pf
->fragstat
.mtime
== ctime
) {
10264 pf
->fragstat
.mtime
= r
.dirfrag_old_mtime
;
10265 if (pf
->rstat
.rctime
== ctime
)
10266 pf
->rstat
.rctime
= r
.dirfrag_old_rctime
;
10268 mut
->add_updated_lock(&dir
->get_inode()->filelock
);
10269 mut
->add_updated_lock(&dir
->get_inode()->nestlock
);
10272 struct C_MDS_LoggedRenameRollback
: public ServerLogContext
{
10278 map
<client_t
,ref_t
<MClientSnap
>> splits
[2];
10280 C_MDS_LoggedRenameRollback(Server
*s
, MutationRef
& m
, MDRequestRef
& r
,
10281 CDentry
*sd
, version_t pv
, CDentry
*dd
, CDentry
*st
,
10282 map
<client_t
,ref_t
<MClientSnap
>> _splits
[2], bool f
) :
10283 ServerLogContext(s
, r
), mut(m
), srcdn(sd
), srcdnpv(pv
), destdn(dd
),
10284 straydn(st
), finish_mdr(f
) {
10285 splits
[0].swap(_splits
[0]);
10286 splits
[1].swap(_splits
[1]);
10288 void finish(int r
) override
{
10289 server
->_rename_rollback_finish(mut
, mdr
, srcdn
, srcdnpv
,
10290 destdn
, straydn
, splits
, finish_mdr
);
10294 void Server::do_rename_rollback(bufferlist
&rbl
, mds_rank_t leader
, MDRequestRef
& mdr
,
10297 rename_rollback rollback
;
10298 auto p
= rbl
.cbegin();
10299 decode(rollback
, p
);
10301 dout(10) << "do_rename_rollback on " << rollback
.reqid
<< dendl
;
10302 // need to finish this update before sending resolve to claim the subtree
10303 mdcache
->add_rollback(rollback
.reqid
, leader
);
10305 MutationRef
mut(new MutationImpl(nullptr, utime_t(), rollback
.reqid
));
10306 mut
->ls
= mds
->mdlog
->get_current_segment();
10308 CDentry
*srcdn
= NULL
;
10309 CDir
*srcdir
= mdcache
->get_dirfrag(rollback
.orig_src
.dirfrag
);
10311 srcdir
= mdcache
->get_dirfrag(rollback
.orig_src
.dirfrag
.ino
, rollback
.orig_src
.dname
);
10313 dout(10) << " srcdir " << *srcdir
<< dendl
;
10314 srcdn
= srcdir
->lookup(rollback
.orig_src
.dname
);
10316 dout(10) << " srcdn " << *srcdn
<< dendl
;
10317 ceph_assert(srcdn
->get_linkage()->is_null());
10319 dout(10) << " srcdn not found" << dendl
;
10321 dout(10) << " srcdir not found" << dendl
;
10323 CDentry
*destdn
= NULL
;
10324 CDir
*destdir
= mdcache
->get_dirfrag(rollback
.orig_dest
.dirfrag
);
10326 destdir
= mdcache
->get_dirfrag(rollback
.orig_dest
.dirfrag
.ino
, rollback
.orig_dest
.dname
);
10328 dout(10) << " destdir " << *destdir
<< dendl
;
10329 destdn
= destdir
->lookup(rollback
.orig_dest
.dname
);
10331 dout(10) << " destdn " << *destdn
<< dendl
;
10333 dout(10) << " destdn not found" << dendl
;
10335 dout(10) << " destdir not found" << dendl
;
10338 if (rollback
.orig_src
.ino
) {
10339 in
= mdcache
->get_inode(rollback
.orig_src
.ino
);
10340 if (in
&& in
->is_dir())
10341 ceph_assert(srcdn
&& destdn
);
10343 in
= mdcache
->get_inode(rollback
.orig_src
.remote_ino
);
10345 CDir
*straydir
= NULL
;
10346 CDentry
*straydn
= NULL
;
10347 if (rollback
.stray
.dirfrag
.ino
) {
10348 straydir
= mdcache
->get_dirfrag(rollback
.stray
.dirfrag
);
10350 dout(10) << "straydir " << *straydir
<< dendl
;
10351 straydn
= straydir
->lookup(rollback
.stray
.dname
);
10353 dout(10) << " straydn " << *straydn
<< dendl
;
10354 ceph_assert(straydn
->get_linkage()->is_primary());
10356 dout(10) << " straydn not found" << dendl
;
10358 dout(10) << "straydir not found" << dendl
;
10361 CInode
*target
= NULL
;
10362 if (rollback
.orig_dest
.ino
) {
10363 target
= mdcache
->get_inode(rollback
.orig_dest
.ino
);
10365 ceph_assert(destdn
&& straydn
);
10366 } else if (rollback
.orig_dest
.remote_ino
)
10367 target
= mdcache
->get_inode(rollback
.orig_dest
.remote_ino
);
10369 // can't use is_auth() in the resolve stage
10370 mds_rank_t whoami
= mds
->get_nodeid();
10372 ceph_assert(!destdn
|| destdn
->authority().first
!= whoami
);
10373 ceph_assert(!straydn
|| straydn
->authority().first
!= whoami
);
10375 bool force_journal_src
= false;
10376 bool force_journal_dest
= false;
10377 if (in
&& in
->is_dir() && srcdn
->authority().first
!= whoami
)
10378 force_journal_src
= _need_force_journal(in
, false);
10379 if (in
&& target
&& target
->is_dir())
10380 force_journal_dest
= _need_force_journal(in
, true);
10382 version_t srcdnpv
= 0;
10385 if (srcdn
->authority().first
== whoami
)
10386 srcdnpv
= srcdn
->pre_dirty();
10387 if (rollback
.orig_src
.ino
) {
10389 srcdn
->push_projected_linkage(in
);
10391 srcdn
->push_projected_linkage(rollback
.orig_src
.remote_ino
,
10392 rollback
.orig_src
.remote_d_type
);
10395 map
<client_t
,ref_t
<MClientSnap
>> splits
[2];
10397 const CInode::mempool_inode
*pip
= nullptr;
10400 CDir
*pdir
= in
->get_projected_parent_dir();
10401 if (pdir
->authority().first
== whoami
) {
10402 auto pi
= in
->project_inode(mut
);
10403 pi
.inode
->version
= in
->pre_dirty();
10404 if (pdir
!= srcdir
) {
10405 auto pf
= pdir
->project_fnode(mut
);
10406 pf
->version
= pdir
->pre_dirty();
10408 if (pi
.inode
->ctime
== rollback
.ctime
)
10409 pi
.inode
->ctime
= rollback
.orig_src
.old_ctime
;
10412 if (in
->get_inode()->ctime
== rollback
.ctime
) {
10413 auto _inode
= CInode::allocate_inode(*in
->get_inode());
10414 _inode
->ctime
= rollback
.orig_src
.old_ctime
;
10415 in
->reset_inode(_inode
);
10419 pip
= in
->get_projected_inode().get();
10421 if (rollback
.srci_snapbl
.length() && in
->snaprealm
) {
10423 auto p
= rollback
.srci_snapbl
.cbegin();
10424 decode(hadrealm
, p
);
10426 if (projected
&& !mds
->is_resolve()) {
10427 sr_t
*new_srnode
= new sr_t();
10428 decode(*new_srnode
, p
);
10429 in
->project_snaprealm(new_srnode
);
10431 decode(in
->snaprealm
->srnode
, p
);
10434 if (rollback
.orig_src
.ino
) {
10435 ceph_assert(srcdir
);
10436 realm
= srcdir
->get_inode()->find_snaprealm();
10438 realm
= in
->snaprealm
->parent
;
10440 if (!mds
->is_resolve())
10441 mdcache
->prepare_realm_merge(in
->snaprealm
, realm
, splits
[0]);
10443 in
->project_snaprealm(NULL
);
10445 in
->snaprealm
->merge_to(realm
);
10452 if (rollback
.orig_dest
.ino
&& target
) {
10453 destdn
->push_projected_linkage(target
);
10454 } else if (rollback
.orig_dest
.remote_ino
) {
10455 destdn
->push_projected_linkage(rollback
.orig_dest
.remote_ino
,
10456 rollback
.orig_dest
.remote_d_type
);
10458 // the dentry will be trimmed soon, it's ok to have wrong linkage
10459 if (rollback
.orig_dest
.ino
)
10460 ceph_assert(mds
->is_resolve());
10461 destdn
->push_projected_linkage();
10466 straydn
->push_projected_linkage();
10470 CInode::inode_ptr ti
;
10471 CDir
*pdir
= target
->get_projected_parent_dir();
10472 if (pdir
->authority().first
== whoami
) {
10473 auto pi
= target
->project_inode(mut
);
10474 pi
.inode
->version
= target
->pre_dirty();
10475 if (pdir
!= srcdir
) {
10476 auto pf
= pdir
->project_fnode(mut
);
10477 pf
->version
= pdir
->pre_dirty();
10482 ti
= CInode::allocate_inode(*target
->get_inode());
10486 if (ti
->ctime
== rollback
.ctime
)
10487 ti
->ctime
= rollback
.orig_dest
.old_ctime
;
10488 if (MDS_INO_IS_STRAY(rollback
.orig_src
.dirfrag
.ino
)) {
10489 if (MDS_INO_IS_STRAY(rollback
.orig_dest
.dirfrag
.ino
))
10490 ceph_assert(!rollback
.orig_dest
.ino
&& !rollback
.orig_dest
.remote_ino
);
10492 ceph_assert(rollback
.orig_dest
.remote_ino
&&
10493 rollback
.orig_dest
.remote_ino
== rollback
.orig_src
.ino
);
10498 target
->reset_inode(ti
);
10500 if (rollback
.desti_snapbl
.length() && target
->snaprealm
) {
10502 auto p
= rollback
.desti_snapbl
.cbegin();
10503 decode(hadrealm
, p
);
10505 if (projected
&& !mds
->is_resolve()) {
10506 sr_t
*new_srnode
= new sr_t();
10507 decode(*new_srnode
, p
);
10508 target
->project_snaprealm(new_srnode
);
10510 decode(target
->snaprealm
->srnode
, p
);
10513 if (rollback
.orig_dest
.ino
) {
10514 ceph_assert(destdir
);
10515 realm
= destdir
->get_inode()->find_snaprealm();
10517 realm
= target
->snaprealm
->parent
;
10519 if (!mds
->is_resolve())
10520 mdcache
->prepare_realm_merge(target
->snaprealm
, realm
, splits
[1]);
10522 target
->project_snaprealm(NULL
);
10524 target
->snaprealm
->merge_to(realm
);
10529 if (srcdn
&& srcdn
->authority().first
== whoami
) {
10531 _rollback_repair_dir(mut
, srcdir
, rollback
.orig_src
, rollback
.ctime
,
10532 in
&& in
->is_dir(), pip
? pip
->accounted_rstat
: blah
);
10536 dout(0) << " srcdn back to " << *srcdn
<< dendl
;
10538 dout(0) << " srci back to " << *in
<< dendl
;
10540 dout(0) << " destdn back to " << *destdn
<< dendl
;
10542 dout(0) << " desti back to " << *target
<< dendl
;
10545 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rename_rollback", rollback
.reqid
, leader
,
10546 EPeerUpdate::OP_ROLLBACK
, EPeerUpdate::RENAME
);
10547 mdlog
->start_entry(le
);
10549 if (srcdn
&& (srcdn
->authority().first
== whoami
|| force_journal_src
)) {
10550 le
->commit
.add_dir_context(srcdir
);
10551 if (rollback
.orig_src
.ino
)
10552 le
->commit
.add_primary_dentry(srcdn
, 0, true);
10554 le
->commit
.add_remote_dentry(srcdn
, true);
10557 if (!rollback
.orig_src
.ino
&& // remote linkage
10558 in
&& in
->authority().first
== whoami
) {
10559 le
->commit
.add_dir_context(in
->get_projected_parent_dir());
10560 le
->commit
.add_primary_dentry(in
->get_projected_parent_dn(), in
, true);
10563 if (force_journal_dest
) {
10564 ceph_assert(rollback
.orig_dest
.ino
);
10565 le
->commit
.add_dir_context(destdir
);
10566 le
->commit
.add_primary_dentry(destdn
, 0, true);
10569 // peer: no need to journal straydn
10571 if (target
&& target
!= in
&& target
->authority().first
== whoami
) {
10572 ceph_assert(rollback
.orig_dest
.remote_ino
);
10573 le
->commit
.add_dir_context(target
->get_projected_parent_dir());
10574 le
->commit
.add_primary_dentry(target
->get_projected_parent_dn(), target
, true);
10577 if (in
&& in
->is_dir() && (srcdn
->authority().first
== whoami
|| force_journal_src
)) {
10578 dout(10) << " noting renamed dir ino " << in
->ino() << " in metablob" << dendl
;
10579 le
->commit
.renamed_dirino
= in
->ino();
10580 if (srcdn
->authority().first
== whoami
) {
10581 auto&& ls
= in
->get_dirfrags();
10582 for (const auto& dir
: ls
) {
10583 if (!dir
->is_auth())
10584 le
->commit
.renamed_dir_frags
.push_back(dir
->get_frag());
10586 dout(10) << " noting renamed dir open frags " << le
->commit
.renamed_dir_frags
<< dendl
;
10588 } else if (force_journal_dest
) {
10589 dout(10) << " noting rename target ino " << target
->ino() << " in metablob" << dendl
;
10590 le
->commit
.renamed_dirino
= target
->ino();
10593 if (target
&& target
->is_dir()) {
10594 ceph_assert(destdn
);
10595 mdcache
->project_subtree_rename(target
, straydir
, destdir
);
10598 if (in
&& in
->is_dir()) {
10599 ceph_assert(srcdn
);
10600 mdcache
->project_subtree_rename(in
, destdir
, srcdir
);
10603 if (mdr
&& !mdr
->more()->peer_update_journaled
) {
10604 ceph_assert(le
->commit
.empty());
10605 mdlog
->cancel_entry(le
);
10607 _rename_rollback_finish(mut
, mdr
, srcdn
, srcdnpv
, destdn
, straydn
, splits
, finish_mdr
);
10609 ceph_assert(!le
->commit
.empty());
10611 mdr
->more()->peer_update_journaled
= false;
10612 MDSLogContextBase
*fin
= new C_MDS_LoggedRenameRollback(this, mut
, mdr
,
10613 srcdn
, srcdnpv
, destdn
, straydn
,
10614 splits
, finish_mdr
);
10615 submit_mdlog_entry(le
, fin
, mdr
, __func__
);
10620 void Server::_rename_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
, CDentry
*srcdn
,
10621 version_t srcdnpv
, CDentry
*destdn
, CDentry
*straydn
,
10622 map
<client_t
,ref_t
<MClientSnap
>> splits
[2], bool finish_mdr
)
10624 dout(10) << "_rename_rollback_finish " << mut
->reqid
<< dendl
;
10627 straydn
->get_dir()->unlink_inode(straydn
);
10628 straydn
->pop_projected_linkage();
10631 destdn
->get_dir()->unlink_inode(destdn
);
10632 destdn
->pop_projected_linkage();
10635 srcdn
->pop_projected_linkage();
10636 if (srcdn
->authority().first
== mds
->get_nodeid()) {
10637 srcdn
->mark_dirty(srcdnpv
, mut
->ls
);
10638 if (srcdn
->get_linkage()->is_primary())
10639 srcdn
->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH
);
10645 if (srcdn
&& srcdn
->get_linkage()->is_primary()) {
10646 CInode
*in
= srcdn
->get_linkage()->get_inode();
10647 if (in
&& in
->is_dir()) {
10648 ceph_assert(destdn
);
10649 mdcache
->adjust_subtree_after_rename(in
, destdn
->get_dir(), true);
10654 CInode
*oldin
= destdn
->get_linkage()->get_inode();
10655 // update subtree map?
10656 if (oldin
&& oldin
->is_dir()) {
10657 ceph_assert(straydn
);
10658 mdcache
->adjust_subtree_after_rename(oldin
, straydn
->get_dir(), true);
10662 if (mds
->is_resolve()) {
10665 root
= mdcache
->get_subtree_root(straydn
->get_dir());
10667 root
= mdcache
->get_subtree_root(destdn
->get_dir());
10669 mdcache
->try_trim_non_auth_subtree(root
);
10671 mdcache
->send_snaps(splits
[1]);
10672 mdcache
->send_snaps(splits
[0]);
10676 MDSContext::vec finished
;
10677 if (mdr
->more()->is_ambiguous_auth
) {
10678 if (srcdn
->is_auth())
10679 mdr
->more()->rename_inode
->unfreeze_inode(finished
);
10681 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
10682 mdr
->more()->is_ambiguous_auth
= false;
10684 mds
->queue_waiters(finished
);
10685 if (finish_mdr
|| mdr
->aborted
)
10686 mdcache
->request_finish(mdr
);
10688 mdr
->more()->peer_rolling_back
= false;
10691 mdcache
->finish_rollback(mut
->reqid
, mdr
);
10696 void Server::handle_peer_rename_prep_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &ack
)
10698 dout(10) << "handle_peer_rename_prep_ack " << *mdr
10699 << " witnessed by " << ack
->get_source()
10700 << " " << *ack
<< dendl
;
10701 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
10704 mdr
->more()->peers
.insert(from
);
10705 if (mdr
->more()->srcdn_auth_mds
== from
&&
10706 mdr
->more()->is_remote_frozen_authpin
&&
10707 !mdr
->more()->is_ambiguous_auth
) {
10708 mdr
->set_ambiguous_auth(mdr
->more()->rename_inode
);
10711 // witnessed? or add extra witnesses?
10712 ceph_assert(mdr
->more()->witnessed
.count(from
) == 0);
10713 if (ack
->is_interrupted()) {
10714 dout(10) << " peer request interrupted, noop" << dendl
;
10715 } else if (ack
->witnesses
.empty()) {
10716 mdr
->more()->witnessed
.insert(from
);
10717 if (!ack
->is_not_journaled())
10718 mdr
->more()->has_journaled_peers
= true;
10720 dout(10) << " extra witnesses (srcdn replicas) are " << ack
->witnesses
<< dendl
;
10721 mdr
->more()->extra_witnesses
= ack
->witnesses
;
10722 mdr
->more()->extra_witnesses
.erase(mds
->get_nodeid()); // not me!
10726 if (ack
->inode_export
.length()) {
10727 dout(10) << " got srci import" << dendl
;
10728 mdr
->more()->inode_import
.share(ack
->inode_export
);
10729 mdr
->more()->inode_import_v
= ack
->inode_export_v
;
10732 // remove from waiting list
10733 ceph_assert(mdr
->more()->waiting_on_peer
.count(from
));
10734 mdr
->more()->waiting_on_peer
.erase(from
);
10736 if (mdr
->more()->waiting_on_peer
.empty())
10737 dispatch_client_request(mdr
); // go again!
10739 dout(10) << "still waiting on peers " << mdr
->more()->waiting_on_peer
<< dendl
;
10742 void Server::handle_peer_rename_notify_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &ack
)
10744 dout(10) << "handle_peer_rename_notify_ack " << *mdr
<< " from mds."
10745 << ack
->get_source() << dendl
;
10746 ceph_assert(mdr
->is_peer());
10747 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
10749 if (mdr
->more()->waiting_on_peer
.count(from
)) {
10750 mdr
->more()->waiting_on_peer
.erase(from
);
10752 if (mdr
->more()->waiting_on_peer
.empty()) {
10753 if (mdr
->peer_request
)
10754 dispatch_peer_request(mdr
);
10756 dout(10) << " still waiting for rename notify acks from "
10757 << mdr
->more()->waiting_on_peer
<< dendl
;
10761 void Server::_peer_rename_sessions_flushed(MDRequestRef
& mdr
)
10763 dout(10) << "_peer_rename_sessions_flushed " << *mdr
<< dendl
;
10765 if (mdr
->more()->waiting_on_peer
.count(MDS_RANK_NONE
)) {
10766 mdr
->more()->waiting_on_peer
.erase(MDS_RANK_NONE
);
10768 if (mdr
->more()->waiting_on_peer
.empty()) {
10769 if (mdr
->peer_request
)
10770 dispatch_peer_request(mdr
);
10772 dout(10) << " still waiting for rename notify acks from "
10773 << mdr
->more()->waiting_on_peer
<< dendl
;
10778 /* This function takes responsibility for the passed mdr*/
10779 void Server::handle_client_lssnap(MDRequestRef
& mdr
)
10781 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
10783 // traverse to path
10784 CInode
*diri
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
10788 if (!diri
->is_dir()) {
10789 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
10792 dout(10) << "lssnap on " << *diri
<< dendl
;
10795 if (!mds
->locker
->try_rdlock_snap_layout(diri
, mdr
))
10798 if (!check_access(mdr
, diri
, MAY_READ
))
10801 SnapRealm
*realm
= diri
->find_snaprealm();
10802 map
<snapid_t
,const SnapInfo
*> infomap
;
10803 realm
->get_snap_info(infomap
, diri
->get_oldest_snap());
10805 unsigned max_entries
= req
->head
.args
.readdir
.max_entries
;
10807 max_entries
= infomap
.size();
10808 int max_bytes
= req
->head
.args
.readdir
.max_bytes
;
10810 // make sure at least one item can be encoded
10811 max_bytes
= (512 << 10) + g_conf()->mds_max_xattr_pairs_size
;
10813 __u64 last_snapid
= 0;
10814 string offset_str
= req
->get_path2();
10815 if (!offset_str
.empty())
10816 last_snapid
= realm
->resolve_snapname(offset_str
, diri
->ino());
10820 static DirStat empty
;
10821 CDir::encode_dirstat(dirbl
, mdr
->session
->info
, empty
);
10823 max_bytes
-= dirbl
.length() - sizeof(__u32
) + sizeof(__u8
) * 2;
10827 auto p
= infomap
.upper_bound(last_snapid
);
10828 for (; p
!= infomap
.end() && num
< max_entries
; ++p
) {
10829 dout(10) << p
->first
<< " -> " << *p
->second
<< dendl
;
10833 if (p
->second
->ino
== diri
->ino())
10834 snap_name
= p
->second
->name
;
10836 snap_name
= p
->second
->get_long_name();
10838 unsigned start_len
= dnbl
.length();
10839 if (int(start_len
+ snap_name
.length() + sizeof(__u32
) + sizeof(LeaseStat
)) > max_bytes
)
10842 encode(snap_name
, dnbl
);
10844 LeaseStat
e(CEPH_LEASE_VALID
, -1, 0);
10845 mds
->locker
->encode_lease(dnbl
, mdr
->session
->info
, e
);
10846 dout(20) << "encode_infinite_lease" << dendl
;
10848 int r
= diri
->encode_inodestat(dnbl
, mdr
->session
, realm
, p
->first
, max_bytes
- (int)dnbl
.length());
10851 keep
.substr_of(dnbl
, 0, start_len
);
10858 encode(num
, dirbl
);
10860 if (p
== infomap
.end()) {
10861 flags
= CEPH_READDIR_FRAG_END
;
10862 if (last_snapid
== 0)
10863 flags
|= CEPH_READDIR_FRAG_COMPLETE
;
10865 encode(flags
, dirbl
);
10866 dirbl
.claim_append(dnbl
);
10868 mdr
->reply_extra_bl
= dirbl
;
10869 mdr
->tracei
= diri
;
10870 respond_to_request(mdr
, 0);
10876 struct C_MDS_mksnap_finish
: public ServerLogContext
{
10879 C_MDS_mksnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, SnapInfo
&i
) :
10880 ServerLogContext(s
, r
), diri(di
), info(i
) {}
10881 void finish(int r
) override
{
10882 server
->_mksnap_finish(mdr
, diri
, info
);
10886 /* This function takes responsibility for the passed mdr*/
10887 void Server::handle_client_mksnap(MDRequestRef
& mdr
)
10889 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
10890 // make sure we have as new a map as the client
10891 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
10892 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
10895 if (!mds
->mdsmap
->allows_snaps()) {
10896 // you can't make snapshots until you set an option right now
10897 dout(5) << "new snapshots are disabled for this fs" << dendl
;
10898 respond_to_request(mdr
, -CEPHFS_EPERM
);
10902 CInode
*diri
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
10907 if (!diri
->is_dir()) {
10908 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
10911 if (diri
->is_system() && !diri
->is_root()) {
10912 // no snaps in system dirs (root is ok)
10913 dout(5) << "is an internal system dir" << dendl
;
10914 respond_to_request(mdr
, -CEPHFS_EPERM
);
10918 std::string_view snapname
= req
->get_filepath().last_dentry();
10920 if (mdr
->client_request
->get_caller_uid() < g_conf()->mds_snap_min_uid
|| mdr
->client_request
->get_caller_uid() > g_conf()->mds_snap_max_uid
) {
10921 dout(20) << "mksnap " << snapname
<< " on " << *diri
<< " denied to uid " << mdr
->client_request
->get_caller_uid() << dendl
;
10922 respond_to_request(mdr
, -CEPHFS_EPERM
);
10926 dout(10) << "mksnap " << snapname
<< " on " << *diri
<< dendl
;
10929 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
10930 MutationImpl::LockOpVec lov
;
10931 lov
.add_xlock(&diri
->snaplock
);
10932 if (!mds
->locker
->acquire_locks(mdr
, lov
))
10935 if (CDentry
*pdn
= diri
->get_projected_parent_dn(); pdn
) {
10936 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
))
10939 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
10942 if (!check_access(mdr
, diri
, MAY_WRITE
|MAY_SNAPSHOT
))
10945 if (inodeno_t subvol_ino
= diri
->find_snaprealm()->get_subvolume_ino();
10946 (subvol_ino
&& subvol_ino
!= diri
->ino())) {
10947 dout(5) << "is a descendent of a subvolume dir" << dendl
;
10948 respond_to_request(mdr
, -CEPHFS_EPERM
);
10952 // check if we can create any more snapshots
10953 // we don't allow any more if we are already at or beyond the limit
10954 if (diri
->snaprealm
&&
10955 diri
->snaprealm
->get_snaps().size() >= max_snaps_per_dir
) {
10956 respond_to_request(mdr
, -CEPHFS_EMLINK
);
10960 // make sure name is unique
10961 if (diri
->snaprealm
&&
10962 diri
->snaprealm
->exists(snapname
)) {
10963 respond_to_request(mdr
, -CEPHFS_EEXIST
);
10966 if (snapname
.length() == 0 ||
10967 snapname
.length() > snapshot_name_max
||
10968 snapname
[0] == '_') {
10969 respond_to_request(mdr
, -CEPHFS_EINVAL
);
10973 // allocate a snapid
10974 if (!mdr
->more()->stid
) {
10976 mds
->snapclient
->prepare_create(diri
->ino(), snapname
,
10977 mdr
->get_mds_stamp(),
10978 &mdr
->more()->stid
, &mdr
->more()->snapidbl
,
10979 new C_MDS_RetryRequest(mdcache
, mdr
));
10983 version_t stid
= mdr
->more()->stid
;
10985 auto p
= mdr
->more()->snapidbl
.cbegin();
10987 dout(10) << " stid " << stid
<< " snapid " << snapid
<< dendl
;
10989 ceph_assert(mds
->snapclient
->get_cached_version() >= stid
);
10991 SnapPayload payload
;
10992 if (req
->get_data().length()) {
10994 auto iter
= req
->get_data().cbegin();
10995 decode(payload
, iter
);
10996 } catch (const ceph::buffer::error
&e
) {
10997 // backward compat -- client sends xattr bufferlist. however,
10998 // that is not used anywhere -- so (log and) ignore.
10999 dout(20) << ": no metadata in payload (old client?)" << dendl
;
11005 info
.ino
= diri
->ino();
11006 info
.snapid
= snapid
;
11007 info
.name
= snapname
;
11008 info
.stamp
= mdr
->get_op_stamp();
11009 info
.metadata
= payload
.metadata
;
11011 auto pi
= diri
->project_inode(mdr
, false, true);
11012 pi
.inode
->ctime
= info
.stamp
;
11013 if (info
.stamp
> pi
.inode
->rstat
.rctime
)
11014 pi
.inode
->rstat
.rctime
= info
.stamp
;
11015 pi
.inode
->rstat
.rsnaps
++;
11016 pi
.inode
->version
= diri
->pre_dirty();
11018 // project the snaprealm
11019 auto &newsnap
= *pi
.snapnode
;
11020 newsnap
.created
= snapid
;
11021 auto em
= newsnap
.snaps
.emplace(std::piecewise_construct
, std::forward_as_tuple(snapid
), std::forward_as_tuple(info
));
11023 em
.first
->second
= info
;
11024 newsnap
.seq
= snapid
;
11025 newsnap
.last_created
= snapid
;
11026 newsnap
.last_modified
= info
.stamp
;
11027 newsnap
.change_attr
++;
11029 // journal the inode changes
11030 mdr
->ls
= mdlog
->get_current_segment();
11031 EUpdate
*le
= new EUpdate(mdlog
, "mksnap");
11032 mdlog
->start_entry(le
);
11034 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
11035 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
11036 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
11037 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
11039 // journal the snaprealm changes
11040 submit_mdlog_entry(le
, new C_MDS_mksnap_finish(this, mdr
, diri
, info
),
11045 void Server::_mksnap_finish(MDRequestRef
& mdr
, CInode
*diri
, SnapInfo
&info
)
11047 dout(10) << "_mksnap_finish " << *mdr
<< " " << info
<< dendl
;
11049 int op
= (diri
->snaprealm
? CEPH_SNAP_OP_CREATE
: CEPH_SNAP_OP_SPLIT
);
11053 mds
->snapclient
->commit(mdr
->more()->stid
, mdr
->ls
);
11056 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
11058 // notify other mds
11059 mdcache
->send_snap_update(diri
, mdr
->more()->stid
, op
);
11061 mdcache
->do_realm_invalidate_and_update_notify(diri
, op
);
11065 mdr
->snapid
= info
.snapid
;
11066 mdr
->tracei
= diri
;
11067 respond_to_request(mdr
, 0);
11073 struct C_MDS_rmsnap_finish
: public ServerLogContext
{
11076 C_MDS_rmsnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, snapid_t sn
) :
11077 ServerLogContext(s
, r
), diri(di
), snapid(sn
) {}
11078 void finish(int r
) override
{
11079 server
->_rmsnap_finish(mdr
, diri
, snapid
);
11083 /* This function takes responsibility for the passed mdr*/
11084 void Server::handle_client_rmsnap(MDRequestRef
& mdr
)
11086 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
11088 CInode
*diri
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
11092 if (!diri
->is_dir()) {
11093 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
11097 std::string_view snapname
= req
->get_filepath().last_dentry();
11099 if (mdr
->client_request
->get_caller_uid() < g_conf()->mds_snap_min_uid
|| mdr
->client_request
->get_caller_uid() > g_conf()->mds_snap_max_uid
) {
11100 dout(20) << "rmsnap " << snapname
<< " on " << *diri
<< " denied to uid " << mdr
->client_request
->get_caller_uid() << dendl
;
11101 respond_to_request(mdr
, -CEPHFS_EPERM
);
11105 dout(10) << "rmsnap " << snapname
<< " on " << *diri
<< dendl
;
11107 // does snap exist?
11108 if (snapname
.length() == 0 || snapname
[0] == '_') {
11109 respond_to_request(mdr
, -CEPHFS_EINVAL
); // can't prune a parent snap, currently.
11112 if (!diri
->snaprealm
|| !diri
->snaprealm
->exists(snapname
)) {
11113 respond_to_request(mdr
, -CEPHFS_ENOENT
);
11116 snapid_t snapid
= diri
->snaprealm
->resolve_snapname(snapname
, diri
->ino());
11117 dout(10) << " snapname " << snapname
<< " is " << snapid
<< dendl
;
11118 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
11119 MutationImpl::LockOpVec lov
;
11120 lov
.add_xlock(&diri
->snaplock
);
11121 if (!mds
->locker
->acquire_locks(mdr
, lov
))
11123 if (CDentry
*pdn
= diri
->get_projected_parent_dn(); pdn
) {
11124 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
))
11127 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
11130 if (!check_access(mdr
, diri
, MAY_WRITE
|MAY_SNAPSHOT
))
11134 if (!mdr
->more()->stid
) {
11135 mds
->snapclient
->prepare_destroy(diri
->ino(), snapid
,
11136 &mdr
->more()->stid
, &mdr
->more()->snapidbl
,
11137 new C_MDS_RetryRequest(mdcache
, mdr
));
11140 version_t stid
= mdr
->more()->stid
;
11141 auto p
= mdr
->more()->snapidbl
.cbegin();
11144 dout(10) << " stid is " << stid
<< ", seq is " << seq
<< dendl
;
11146 ceph_assert(mds
->snapclient
->get_cached_version() >= stid
);
11149 auto pi
= diri
->project_inode(mdr
, false, true);
11150 pi
.inode
->version
= diri
->pre_dirty();
11151 pi
.inode
->ctime
= mdr
->get_op_stamp();
11152 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
11153 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
11154 pi
.inode
->rstat
.rsnaps
--;
11156 mdr
->ls
= mdlog
->get_current_segment();
11157 EUpdate
*le
= new EUpdate(mdlog
, "rmsnap");
11158 mdlog
->start_entry(le
);
11160 // project the snaprealm
11161 auto &newnode
= *pi
.snapnode
;
11162 newnode
.snaps
.erase(snapid
);
11164 newnode
.last_destroyed
= seq
;
11165 newnode
.last_modified
= mdr
->get_op_stamp();
11166 newnode
.change_attr
++;
11168 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
11169 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
11170 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
11171 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
11173 submit_mdlog_entry(le
, new C_MDS_rmsnap_finish(this, mdr
, diri
, snapid
),
11178 void Server::_rmsnap_finish(MDRequestRef
& mdr
, CInode
*diri
, snapid_t snapid
)
11180 dout(10) << "_rmsnap_finish " << *mdr
<< " " << snapid
<< dendl
;
11181 snapid_t stid
= mdr
->more()->stid
;
11185 mds
->snapclient
->commit(stid
, mdr
->ls
);
11187 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
11189 // notify other mds
11190 mdcache
->send_snap_update(diri
, mdr
->more()->stid
, CEPH_SNAP_OP_DESTROY
);
11192 mdcache
->do_realm_invalidate_and_update_notify(diri
, CEPH_SNAP_OP_DESTROY
);
11196 mdr
->tracei
= diri
;
11197 mdr
->snapid
= snapid
;
11198 respond_to_request(mdr
, 0);
11200 // purge snapshot data
11201 diri
->purge_stale_snap_data(diri
->snaprealm
->get_snaps());
11204 struct C_MDS_renamesnap_finish
: public ServerLogContext
{
11207 C_MDS_renamesnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, snapid_t sn
) :
11208 ServerLogContext(s
, r
), diri(di
), snapid(sn
) {}
11209 void finish(int r
) override
{
11210 server
->_renamesnap_finish(mdr
, diri
, snapid
);
11214 /* This function takes responsibility for the passed mdr*/
11215 void Server::handle_client_renamesnap(MDRequestRef
& mdr
)
11217 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
11218 if (req
->get_filepath().get_ino() != req
->get_filepath2().get_ino()) {
11219 respond_to_request(mdr
, -CEPHFS_EINVAL
);
11223 CInode
*diri
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
11227 if (!diri
->is_dir()) { // dir only
11228 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
11232 if (mdr
->client_request
->get_caller_uid() < g_conf()->mds_snap_min_uid
||
11233 mdr
->client_request
->get_caller_uid() > g_conf()->mds_snap_max_uid
) {
11234 respond_to_request(mdr
, -CEPHFS_EPERM
);
11238 std::string_view dstname
= req
->get_filepath().last_dentry();
11239 std::string_view srcname
= req
->get_filepath2().last_dentry();
11240 dout(10) << "renamesnap " << srcname
<< "->" << dstname
<< " on " << *diri
<< dendl
;
11242 if (srcname
.length() == 0 || srcname
[0] == '_') {
11243 respond_to_request(mdr
, -CEPHFS_EINVAL
); // can't rename a parent snap.
11246 if (!diri
->snaprealm
|| !diri
->snaprealm
->exists(srcname
)) {
11247 respond_to_request(mdr
, -CEPHFS_ENOENT
);
11250 if (dstname
.length() == 0 || dstname
[0] == '_') {
11251 respond_to_request(mdr
, -CEPHFS_EINVAL
);
11254 if (diri
->snaprealm
->exists(dstname
)) {
11255 respond_to_request(mdr
, -CEPHFS_EEXIST
);
11259 snapid_t snapid
= diri
->snaprealm
->resolve_snapname(srcname
, diri
->ino());
11260 dout(10) << " snapname " << srcname
<< " is " << snapid
<< dendl
;
11263 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
11264 MutationImpl::LockOpVec lov
;
11265 lov
.add_xlock(&diri
->snaplock
);
11266 if (!mds
->locker
->acquire_locks(mdr
, lov
))
11268 if (CDentry
*pdn
= diri
->get_projected_parent_dn(); pdn
) {
11269 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
))
11272 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
11275 if (!check_access(mdr
, diri
, MAY_WRITE
|MAY_SNAPSHOT
))
11279 if (!mdr
->more()->stid
) {
11280 mds
->snapclient
->prepare_update(diri
->ino(), snapid
, dstname
, utime_t(),
11281 &mdr
->more()->stid
,
11282 new C_MDS_RetryRequest(mdcache
, mdr
));
11286 version_t stid
= mdr
->more()->stid
;
11287 dout(10) << " stid is " << stid
<< dendl
;
11289 ceph_assert(mds
->snapclient
->get_cached_version() >= stid
);
11292 auto pi
= diri
->project_inode(mdr
, false, true);
11293 pi
.inode
->ctime
= mdr
->get_op_stamp();
11294 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
11295 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
11296 pi
.inode
->version
= diri
->pre_dirty();
11298 // project the snaprealm
11299 auto &newsnap
= *pi
.snapnode
;
11300 auto it
= newsnap
.snaps
.find(snapid
);
11301 ceph_assert(it
!= newsnap
.snaps
.end());
11302 it
->second
.name
= dstname
;
11303 newsnap
.last_modified
= mdr
->get_op_stamp();
11304 newsnap
.change_attr
++;
11306 // journal the inode changes
11307 mdr
->ls
= mdlog
->get_current_segment();
11308 EUpdate
*le
= new EUpdate(mdlog
, "renamesnap");
11309 mdlog
->start_entry(le
);
11311 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
11312 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
11313 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
11314 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
11316 // journal the snaprealm changes
11317 submit_mdlog_entry(le
, new C_MDS_renamesnap_finish(this, mdr
, diri
, snapid
),
11322 void Server::_renamesnap_finish(MDRequestRef
& mdr
, CInode
*diri
, snapid_t snapid
)
11324 dout(10) << "_renamesnap_finish " << *mdr
<< " " << snapid
<< dendl
;
11328 mds
->snapclient
->commit(mdr
->more()->stid
, mdr
->ls
);
11330 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
11332 // notify other mds
11333 mdcache
->send_snap_update(diri
, mdr
->more()->stid
, CEPH_SNAP_OP_UPDATE
);
11335 mdcache
->do_realm_invalidate_and_update_notify(diri
, CEPH_SNAP_OP_UPDATE
);
11339 mdr
->tracei
= diri
;
11340 mdr
->snapid
= snapid
;
11341 respond_to_request(mdr
, 0);
11345 * Return true if server is in state RECONNECT and this
11346 * client has not yet reconnected.
11348 bool Server::waiting_for_reconnect(client_t c
) const
11350 return client_reconnect_gather
.count(c
) > 0;
11353 void Server::dump_reconnect_status(Formatter
*f
) const
11355 f
->open_object_section("reconnect_status");
11356 f
->dump_stream("client_reconnect_gather") << client_reconnect_gather
;
11357 f
->close_section();
11360 const bufferlist
& Server::get_snap_trace(Session
*session
, SnapRealm
*realm
) const {
11361 ceph_assert(session
);
11362 ceph_assert(realm
);
11363 if (session
->info
.has_feature(CEPHFS_FEATURE_NEW_SNAPREALM_INFO
)) {
11364 return realm
->get_snap_trace_new();
11366 return realm
->get_snap_trace();
11370 const bufferlist
& Server::get_snap_trace(client_t client
, SnapRealm
*realm
) const {
11371 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(client
.v
));
11372 return get_snap_trace(session
, realm
);