1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
15 #include <boost/lexical_cast.hpp>
16 #include "include/ceph_assert.h" // lexical_cast includes system assert.h
18 #include <boost/config/warning_disable.hpp>
19 #include <boost/fusion/include/std_pair.hpp>
20 #include <boost/range/adaptor/reversed.hpp>
28 #include "MDBalancer.h"
30 #include "SnapClient.h"
32 #include "MetricsHandler.h"
33 #include "cephfs_features.h"
35 #include "msg/Messenger.h"
37 #include "osdc/Objecter.h"
39 #include "events/EUpdate.h"
40 #include "events/EPeerUpdate.h"
41 #include "events/ESession.h"
42 #include "events/EOpen.h"
43 #include "events/ECommitted.h"
44 #include "events/EPurged.h"
46 #include "include/stringify.h"
47 #include "include/filepath.h"
48 #include "common/errno.h"
49 #include "common/Timer.h"
50 #include "common/perf_counters.h"
51 #include "include/compat.h"
52 #include "osd/OSDMap.h"
58 #include <string_view>
61 #include "common/config.h"
63 #define dout_context g_ceph_context
64 #define dout_subsys ceph_subsys_mds
66 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
70 class ServerContext
: public MDSContext
{
73 MDSRank
*get_mds() override
79 explicit ServerContext(Server
*s
) : server(s
) {
80 ceph_assert(server
!= NULL
);
84 class Batch_Getattr_Lookup
: public BatchOp
{
87 ceph::ref_t
<MDRequestImpl
> mdr
;
88 std::vector
<ceph::ref_t
<MDRequestImpl
>> batch_reqs
;
91 Batch_Getattr_Lookup(Server
* s
, const ceph::ref_t
<MDRequestImpl
>& r
)
93 if (mdr
->client_request
->get_op() == CEPH_MDS_OP_LOOKUP
)
94 mdr
->batch_op_map
= &mdr
->dn
[0].back()->batch_ops
;
96 mdr
->batch_op_map
= &mdr
->in
[0]->batch_ops
;
98 void add_request(const ceph::ref_t
<MDRequestImpl
>& r
) override
{
99 batch_reqs
.push_back(r
);
101 ceph::ref_t
<MDRequestImpl
> find_new_head() override
{
102 while (!batch_reqs
.empty()) {
103 auto r
= std::move(batch_reqs
.back());
104 batch_reqs
.pop_back();
108 r
->batch_op_map
= mdr
->batch_op_map
;
109 mdr
->batch_op_map
= nullptr;
115 void _forward(mds_rank_t t
) override
{
116 MDCache
* mdcache
= server
->mdcache
;
117 mdcache
->mds
->forward_message_mds(mdr
->release_client_request(), t
);
118 mdr
->set_mds_stamp(ceph_clock_now());
119 for (auto& m
: batch_reqs
) {
121 mdcache
->request_forward(m
, t
);
125 void _respond(int r
) override
{
126 mdr
->set_mds_stamp(ceph_clock_now());
127 for (auto& m
: batch_reqs
) {
129 m
->tracei
= mdr
->tracei
;
130 m
->tracedn
= mdr
->tracedn
;
131 server
->respond_to_request(m
, r
);
135 server
->reply_client_request(mdr
, make_message
<MClientReply
>(*mdr
->client_request
, r
));
137 void print(std::ostream
& o
) {
138 o
<< "[batch front=" << *mdr
<< "]";
142 class ServerLogContext
: public MDSLogContextBase
{
145 MDSRank
*get_mds() override
151 void pre_finish(int r
) override
{
153 mdr
->mark_event("journal_committed: ");
156 explicit ServerLogContext(Server
*s
) : server(s
) {
157 ceph_assert(server
!= NULL
);
159 explicit ServerLogContext(Server
*s
, MDRequestRef
& r
) : server(s
), mdr(r
) {
160 ceph_assert(server
!= NULL
);
164 void Server::create_logger()
166 PerfCountersBuilder
plb(g_ceph_context
, "mds_server", l_mdss_first
, l_mdss_last
);
168 plb
.add_u64_counter(l_mdss_handle_client_request
, "handle_client_request",
169 "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING
);
170 plb
.add_u64_counter(l_mdss_handle_peer_request
, "handle_peer_request",
171 "Peer requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING
);
172 plb
.add_u64_counter(l_mdss_handle_client_session
,
173 "handle_client_session", "Client session messages", "hcs",
174 PerfCountersBuilder::PRIO_INTERESTING
);
175 plb
.add_u64_counter(l_mdss_cap_revoke_eviction
, "cap_revoke_eviction",
176 "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING
);
177 plb
.add_u64_counter(l_mdss_cap_acquisition_throttle
,
178 "cap_acquisition_throttle", "Cap acquisition throttle counter", "cat",
179 PerfCountersBuilder::PRIO_INTERESTING
);
181 // fop latencies are useful
182 plb
.set_prio_default(PerfCountersBuilder::PRIO_USEFUL
);
183 plb
.add_time_avg(l_mdss_req_lookuphash_latency
, "req_lookuphash_latency",
184 "Request type lookup hash of inode latency");
185 plb
.add_time_avg(l_mdss_req_lookupino_latency
, "req_lookupino_latency",
186 "Request type lookup inode latency");
187 plb
.add_time_avg(l_mdss_req_lookupparent_latency
, "req_lookupparent_latency",
188 "Request type lookup parent latency");
189 plb
.add_time_avg(l_mdss_req_lookupname_latency
, "req_lookupname_latency",
190 "Request type lookup name latency");
191 plb
.add_time_avg(l_mdss_req_lookup_latency
, "req_lookup_latency",
192 "Request type lookup latency");
193 plb
.add_time_avg(l_mdss_req_lookupsnap_latency
, "req_lookupsnap_latency",
194 "Request type lookup snapshot latency");
195 plb
.add_time_avg(l_mdss_req_getattr_latency
, "req_getattr_latency",
196 "Request type get attribute latency");
197 plb
.add_time_avg(l_mdss_req_setattr_latency
, "req_setattr_latency",
198 "Request type set attribute latency");
199 plb
.add_time_avg(l_mdss_req_setlayout_latency
, "req_setlayout_latency",
200 "Request type set file layout latency");
201 plb
.add_time_avg(l_mdss_req_setdirlayout_latency
, "req_setdirlayout_latency",
202 "Request type set directory layout latency");
203 plb
.add_time_avg(l_mdss_req_getvxattr_latency
, "req_getvxattr_latency",
204 "Request type get virtual extended attribute latency");
205 plb
.add_time_avg(l_mdss_req_setxattr_latency
, "req_setxattr_latency",
206 "Request type set extended attribute latency");
207 plb
.add_time_avg(l_mdss_req_rmxattr_latency
, "req_rmxattr_latency",
208 "Request type remove extended attribute latency");
209 plb
.add_time_avg(l_mdss_req_readdir_latency
, "req_readdir_latency",
210 "Request type read directory latency");
211 plb
.add_time_avg(l_mdss_req_setfilelock_latency
, "req_setfilelock_latency",
212 "Request type set file lock latency");
213 plb
.add_time_avg(l_mdss_req_getfilelock_latency
, "req_getfilelock_latency",
214 "Request type get file lock latency");
215 plb
.add_time_avg(l_mdss_req_create_latency
, "req_create_latency",
216 "Request type create latency");
217 plb
.add_time_avg(l_mdss_req_open_latency
, "req_open_latency",
218 "Request type open latency");
219 plb
.add_time_avg(l_mdss_req_mknod_latency
, "req_mknod_latency",
220 "Request type make node latency");
221 plb
.add_time_avg(l_mdss_req_link_latency
, "req_link_latency",
222 "Request type link latency");
223 plb
.add_time_avg(l_mdss_req_unlink_latency
, "req_unlink_latency",
224 "Request type unlink latency");
225 plb
.add_time_avg(l_mdss_req_rmdir_latency
, "req_rmdir_latency",
226 "Request type remove directory latency");
227 plb
.add_time_avg(l_mdss_req_rename_latency
, "req_rename_latency",
228 "Request type rename latency");
229 plb
.add_time_avg(l_mdss_req_mkdir_latency
, "req_mkdir_latency",
230 "Request type make directory latency");
231 plb
.add_time_avg(l_mdss_req_symlink_latency
, "req_symlink_latency",
232 "Request type symbolic link latency");
233 plb
.add_time_avg(l_mdss_req_lssnap_latency
, "req_lssnap_latency",
234 "Request type list snapshot latency");
235 plb
.add_time_avg(l_mdss_req_mksnap_latency
, "req_mksnap_latency",
236 "Request type make snapshot latency");
237 plb
.add_time_avg(l_mdss_req_rmsnap_latency
, "req_rmsnap_latency",
238 "Request type remove snapshot latency");
239 plb
.add_time_avg(l_mdss_req_renamesnap_latency
, "req_renamesnap_latency",
240 "Request type rename snapshot latency");
242 plb
.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY
);
243 plb
.add_u64_counter(l_mdss_dispatch_client_request
, "dispatch_client_request",
244 "Client requests dispatched");
245 plb
.add_u64_counter(l_mdss_dispatch_peer_request
, "dispatch_server_request",
246 "Server requests dispatched");
248 logger
= plb
.create_perf_counters();
249 g_ceph_context
->get_perfcounters_collection()->add(logger
);
252 Server::Server(MDSRank
*m
, MetricsHandler
*metrics_handler
) :
254 mdcache(mds
->mdcache
), mdlog(mds
->mdlog
),
255 recall_throttle(g_conf().get_val
<double>("mds_recall_max_decay_rate")),
256 metrics_handler(metrics_handler
)
258 forward_all_requests_to_auth
= g_conf().get_val
<bool>("mds_forward_all_requests_to_auth");
259 replay_unsafe_with_closed_session
= g_conf().get_val
<bool>("mds_replay_unsafe_with_closed_session");
260 cap_revoke_eviction_timeout
= g_conf().get_val
<double>("mds_cap_revoke_eviction_timeout");
261 max_snaps_per_dir
= g_conf().get_val
<uint64_t>("mds_max_snaps_per_dir");
262 delegate_inos_pct
= g_conf().get_val
<uint64_t>("mds_client_delegate_inos_pct");
263 max_caps_per_client
= g_conf().get_val
<uint64_t>("mds_max_caps_per_client");
264 cap_acquisition_throttle
= g_conf().get_val
<uint64_t>("mds_session_cap_acquisition_throttle");
265 max_caps_throttle_ratio
= g_conf().get_val
<double>("mds_session_max_caps_throttle_ratio");
266 caps_throttle_retry_request_timeout
= g_conf().get_val
<double>("mds_cap_acquisition_throttle_retry_request_timeout");
267 dir_max_entries
= g_conf().get_val
<uint64_t>("mds_dir_max_entries");
268 bal_fragment_size_max
= g_conf().get_val
<int64_t>("mds_bal_fragment_size_max");
269 supported_features
= feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED
);
270 supported_metric_spec
= feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL
);
273 void Server::dispatch(const cref_t
<Message
> &m
)
275 switch (m
->get_type()) {
276 case CEPH_MSG_CLIENT_RECONNECT
:
277 handle_client_reconnect(ref_cast
<MClientReconnect
>(m
));
282 *In reconnect phase, client sent unsafe requests to mds before reconnect msg. Seting sessionclosed_isok will handle scenario like this:
284 1. In reconnect phase, client sent unsafe requests to mds.
285 2. It reached reconnect timeout. All sessions without sending reconnect msg in time, some of which may had sent unsafe requests, are marked as closed.
286 (Another situation is #31668, which will deny all client reconnect msg to speed up reboot).
287 3.So these unsafe request from session without sending reconnect msg in time or being denied could be handled in clientreplay phase.
290 bool sessionclosed_isok
= replay_unsafe_with_closed_session
;
292 // handle_peer_request()/handle_client_session() will wait if necessary
293 if (m
->get_type() == CEPH_MSG_CLIENT_REQUEST
&& !mds
->is_active()) {
294 const auto &req
= ref_cast
<MClientRequest
>(m
);
295 if (mds
->is_reconnect() || mds
->get_want_state() == CEPH_MDS_STATE_RECONNECT
) {
296 Session
*session
= mds
->get_session(req
);
297 if (!session
|| (!session
->is_open() && !sessionclosed_isok
)) {
298 dout(5) << "session is closed, dropping " << req
->get_reqid() << dendl
;
301 bool queue_replay
= false;
302 if (req
->is_replay() || req
->is_async()) {
303 dout(3) << "queuing replayed op" << dendl
;
306 !session
->have_completed_request(req
->get_reqid().tid
, nullptr)) {
307 inodeno_t
ino(req
->head
.ino
);
308 mdcache
->add_replay_ino_alloc(ino
);
309 if (replay_unsafe_with_closed_session
&&
310 session
->free_prealloc_inos
.contains(ino
)) {
311 // don't purge inodes that will be created by later replay
312 session
->free_prealloc_inos
.erase(ino
);
313 session
->delegated_inos
.insert(ino
);
316 } else if (req
->get_retry_attempt()) {
317 // process completed request in clientreplay stage. The completed request
318 // might have created new file/directorie. This guarantees MDS sends a reply
319 // to client before other request modifies the new file/directorie.
320 if (session
->have_completed_request(req
->get_reqid().tid
, NULL
)) {
321 dout(3) << "queuing completed op" << dendl
;
324 // this request was created before the cap reconnect message, drop any embedded
326 req
->releases
.clear();
329 req
->mark_queued_for_replay();
330 mds
->enqueue_replay(new C_MDS_RetryMessage(mds
, m
));
335 bool wait_for_active
= true;
336 if (mds
->is_stopping()) {
337 wait_for_active
= false;
338 } else if (mds
->is_clientreplay()) {
339 if (req
->is_queued_for_replay()) {
340 wait_for_active
= false;
343 if (wait_for_active
) {
344 dout(3) << "not active yet, waiting" << dendl
;
345 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
350 switch (m
->get_type()) {
351 case CEPH_MSG_CLIENT_SESSION
:
352 handle_client_session(ref_cast
<MClientSession
>(m
));
354 case CEPH_MSG_CLIENT_REQUEST
:
355 handle_client_request(ref_cast
<MClientRequest
>(m
));
357 case CEPH_MSG_CLIENT_RECLAIM
:
358 handle_client_reclaim(ref_cast
<MClientReclaim
>(m
));
360 case MSG_MDS_PEER_REQUEST
:
361 handle_peer_request(ref_cast
<MMDSPeerRequest
>(m
));
364 derr
<< "server unknown message " << m
->get_type() << dendl
;
365 ceph_abort_msg("server unknown message");
// ----------------------------------------------------------
// SESSION management
374 class C_MDS_session_finish
: public ServerLogContext
{
379 interval_set
<inodeno_t
> inos_to_free
;
381 interval_set
<inodeno_t
> inos_to_purge
;
382 LogSegment
*ls
= nullptr;
385 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
, Context
*fin_
= nullptr) :
386 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
), inotablev(0), fin(fin_
) { }
387 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
,
388 const interval_set
<inodeno_t
>& to_free
, version_t iv
,
389 const interval_set
<inodeno_t
>& to_purge
, LogSegment
*_ls
, Context
*fin_
= nullptr) :
390 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
),
391 inos_to_free(to_free
), inotablev(iv
), inos_to_purge(to_purge
), ls(_ls
), fin(fin_
) {}
392 void finish(int r
) override
{
394 server
->_session_logged(session
, state_seq
, open
, cmapv
, inos_to_free
, inotablev
, inos_to_purge
, ls
);
401 Session
* Server::find_session_by_uuid(std::string_view uuid
)
403 Session
* session
= nullptr;
404 for (auto& it
: mds
->sessionmap
.get_sessions()) {
405 auto& metadata
= it
.second
->info
.client_metadata
;
407 auto p
= metadata
.find("uuid");
408 if (p
== metadata
.end() || p
->second
!= uuid
)
413 } else if (!session
->reclaiming_from
) {
414 ceph_assert(it
.second
->reclaiming_from
== session
);
417 ceph_assert(session
->reclaiming_from
== it
.second
);
423 void Server::reclaim_session(Session
*session
, const cref_t
<MClientReclaim
> &m
)
425 if (!session
->is_open() && !session
->is_stale()) {
426 dout(10) << "session not open, dropping this req" << dendl
;
430 auto reply
= make_message
<MClientReclaimReply
>(0);
431 if (m
->get_uuid().empty()) {
432 dout(10) << __func__
<< " invalid message (no uuid)" << dendl
;
433 reply
->set_result(-CEPHFS_EINVAL
);
434 mds
->send_message_client(reply
, session
);
438 unsigned flags
= m
->get_flags();
439 if (flags
!= CEPH_RECLAIM_RESET
) { // currently only support reset
440 dout(10) << __func__
<< " unsupported flags" << dendl
;
441 reply
->set_result(-CEPHFS_EOPNOTSUPP
);
442 mds
->send_message_client(reply
, session
);
446 Session
* target
= find_session_by_uuid(m
->get_uuid());
448 if (session
->info
.auth_name
!= target
->info
.auth_name
) {
449 dout(10) << __func__
<< " session auth_name " << session
->info
.auth_name
450 << " != target auth_name " << target
->info
.auth_name
<< dendl
;
451 reply
->set_result(-CEPHFS_EPERM
);
452 mds
->send_message_client(reply
, session
);
455 ceph_assert(!target
->reclaiming_from
);
456 ceph_assert(!session
->reclaiming_from
);
457 session
->reclaiming_from
= target
;
458 reply
->set_addrs(entity_addrvec_t(target
->info
.inst
.addr
));
461 if (flags
& CEPH_RECLAIM_RESET
) {
462 finish_reclaim_session(session
, reply
);
469 void Server::finish_reclaim_session(Session
*session
, const ref_t
<MClientReclaimReply
> &reply
)
471 Session
*target
= session
->reclaiming_from
;
473 session
->reclaiming_from
= nullptr;
477 int64_t session_id
= session
->get_client().v
;
478 send_reply
= new LambdaContext([this, session_id
, reply
](int r
) {
479 ceph_assert(ceph_mutex_is_locked_by_me(mds
->mds_lock
));
480 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(session_id
));
484 auto epoch
= mds
->objecter
->with_osdmap([](const OSDMap
&map
){ return map
.get_epoch(); });
485 reply
->set_epoch(epoch
);
486 mds
->send_message_client(reply
, session
);
489 send_reply
= nullptr;
492 bool blocklisted
= mds
->objecter
->with_osdmap([target
](const OSDMap
&map
) {
493 return map
.is_blocklisted(target
->info
.inst
.addr
);
496 if (blocklisted
|| !g_conf()->mds_session_blocklist_on_evict
) {
497 kill_session(target
, send_reply
);
499 CachedStackStringStream css
;
500 mds
->evict_client(target
->get_client().v
, false, true, *css
, send_reply
);
503 mds
->send_message_client(reply
, session
);
507 void Server::handle_client_reclaim(const cref_t
<MClientReclaim
> &m
)
509 Session
*session
= mds
->get_session(m
);
510 dout(3) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
511 ceph_assert(m
->get_source().is_client()); // should _not_ come from an mds!
514 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
518 std::string_view fs_name
= mds
->mdsmap
->get_fs_name();
519 if (!fs_name
.empty() && !session
->fs_name_capable(fs_name
, MAY_READ
)) {
520 dout(0) << " dropping message not allowed for this fs_name: " << *m
<< dendl
;
524 if (mds
->get_state() < MDSMap::STATE_CLIENTREPLAY
) {
525 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
529 if (m
->get_flags() & MClientReclaim::FLAG_FINISH
) {
530 finish_reclaim_session(session
);
532 reclaim_session(session
, m
);
536 void Server::handle_client_session(const cref_t
<MClientSession
> &m
)
539 Session
*session
= mds
->get_session(m
);
541 dout(3) << "handle_client_session " << *m
<< " from " << m
->get_source() << dendl
;
542 ceph_assert(m
->get_source().is_client()); // should _not_ come from an mds!
545 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
546 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
547 reply
->metadata
["error_string"] = "sessionless";
548 mds
->send_message(reply
, m
->get_connection());
552 std::string_view fs_name
= mds
->mdsmap
->get_fs_name();
553 if (!fs_name
.empty() && !session
->fs_name_capable(fs_name
, MAY_READ
)) {
554 dout(0) << " dropping message not allowed for this fs_name: " << *m
<< dendl
;
555 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
556 reply
->metadata
["error_string"] = "client doesn't have caps for FS \"" +
557 std::string(fs_name
) + "\"";
558 mds
->send_message(std::move(reply
), m
->get_connection());
562 if (m
->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS
) {
563 // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
564 } else if (m
->get_op() == CEPH_SESSION_REQUEST_CLOSE
) {
565 // close requests need to be handled when mds is active
566 if (mds
->get_state() < MDSMap::STATE_ACTIVE
) {
567 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
571 if (mds
->get_state() < MDSMap::STATE_CLIENTREPLAY
) {
572 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
578 logger
->inc(l_mdss_handle_client_session
);
581 switch (m
->get_op()) {
582 case CEPH_SESSION_REQUEST_OPEN
:
583 if (session
->is_opening() ||
584 session
->is_open() ||
585 session
->is_stale() ||
586 session
->is_killing() ||
587 terminating_sessions
) {
588 dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl
;
591 ceph_assert(session
->is_closed() || session
->is_closing());
593 if (mds
->is_stopping()) {
594 dout(10) << "mds is stopping, dropping open req" << dendl
;
599 auto& addr
= session
->info
.inst
.addr
;
600 session
->set_client_metadata(client_metadata_t(m
->metadata
, m
->supported_features
, m
->metric_spec
));
601 auto& client_metadata
= session
->info
.client_metadata
;
603 auto log_session_status
= [this, m
, session
](std::string_view status
, std::string_view err
) {
604 auto now
= ceph_clock_now();
605 auto throttle_elapsed
= m
->get_recv_complete_stamp() - m
->get_throttle_stamp();
606 auto elapsed
= now
- m
->get_recv_stamp();
607 CachedStackStringStream css
;
608 *css
<< "New client session:"
609 << " addr=\"" << session
->info
.inst
.addr
<< "\""
610 << ",elapsed=" << elapsed
611 << ",throttled=" << throttle_elapsed
612 << ",status=\"" << status
<< "\"";
614 *css
<< ",error=\"" << err
<< "\"";
616 const auto& metadata
= session
->info
.client_metadata
;
617 if (auto it
= metadata
.find("root"); it
!= metadata
.end()) {
618 *css
<< ",root=\"" << it
->second
<< "\"";
620 dout(2) << css
->strv() << dendl
;
623 auto send_reject_message
= [this, &session
, &log_session_status
](std::string_view err_str
, unsigned flags
=0) {
624 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
, 0, flags
);
625 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
))
626 m
->metadata
["error_string"] = err_str
;
627 mds
->send_message_client(m
, session
);
628 log_session_status("REJECTED", err_str
);
631 bool blocklisted
= mds
->objecter
->with_osdmap(
632 [&addr
](const OSDMap
&osd_map
) -> bool {
633 return osd_map
.is_blocklisted(addr
);
637 dout(10) << "rejecting blocklisted client " << addr
<< dendl
;
638 // This goes on the wire and the "blacklisted" substring is
639 // depended upon by the kernel client for detecting whether it
640 // has been blocklisted. If mounted with recover_session=clean
641 // (since 5.4), it tries to automatically recover itself from
644 flags
|= MClientSession::SESSION_BLOCKLISTED
;
645 send_reject_message("blocklisted (blacklisted)", flags
);
650 if (client_metadata
.features
.empty())
651 infer_supported_features(session
, client_metadata
);
653 dout(20) << __func__
<< " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl
;
654 dout(20) << " features: '" << client_metadata
.features
<< "'" << dendl
;
655 dout(20) << " metric specification: [" << client_metadata
.metric_spec
<< "]" << dendl
;
656 for (const auto& p
: client_metadata
) {
657 dout(20) << " " << p
.first
<< ": " << p
.second
<< dendl
;
660 feature_bitset_t missing_features
= required_client_features
;
661 missing_features
-= client_metadata
.features
;
662 if (!missing_features
.empty()) {
663 CachedStackStringStream css
;
664 *css
<< "missing required features '" << missing_features
<< "'";
665 send_reject_message(css
->strv());
666 mds
->clog
->warn() << "client session (" << session
->info
.inst
667 << ") lacks required features " << missing_features
668 << "; client supports " << client_metadata
.features
;
673 // Special case for the 'root' metadata path; validate that the claimed
674 // root is actually within the caps of the session
675 if (auto it
= client_metadata
.find("root"); it
!= client_metadata
.end()) {
676 auto claimed_root
= it
->second
;
677 CachedStackStringStream css
;
679 // claimed_root has a leading "/" which we strip before passing
681 if (claimed_root
.empty() || claimed_root
[0] != '/') {
683 *css
<< "invalue root '" << claimed_root
<< "'";
684 } else if (!session
->auth_caps
.path_capable(claimed_root
.substr(1))) {
686 *css
<< "non-allowable root '" << claimed_root
<< "'";
690 // Tell the client we're rejecting their open
691 send_reject_message(css
->strv());
692 mds
->clog
->warn() << "client session with " << css
->strv()
693 << " denied (" << session
->info
.inst
<< ")";
699 if (auto it
= client_metadata
.find("uuid"); it
!= client_metadata
.end()) {
700 if (find_session_by_uuid(it
->second
)) {
701 send_reject_message("duplicated session uuid");
702 mds
->clog
->warn() << "client session with duplicated session uuid '"
703 << it
->second
<< "' denied (" << session
->info
.inst
<< ")";
709 if (session
->is_closed()) {
710 mds
->sessionmap
.add_session(session
);
713 pv
= mds
->sessionmap
.mark_projected(session
);
714 sseq
= mds
->sessionmap
.set_state(session
, Session::STATE_OPENING
);
715 mds
->sessionmap
.touch_session(session
);
716 auto fin
= new LambdaContext([log_session_status
= std::move(log_session_status
)](int r
){
718 log_session_status("ACCEPTED", "");
720 mdlog
->start_submit_entry(new ESession(m
->get_source_inst(), true, pv
, client_metadata
),
721 new C_MDS_session_finish(this, session
, sseq
, true, pv
, fin
));
726 case CEPH_SESSION_REQUEST_RENEWCAPS
:
727 if (session
->is_open() || session
->is_stale()) {
728 mds
->sessionmap
.touch_session(session
);
729 if (session
->is_stale()) {
730 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
731 mds
->locker
->resume_stale_caps(session
);
732 mds
->sessionmap
.touch_session(session
);
734 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_RENEWCAPS
, m
->get_seq());
735 mds
->send_message_client(reply
, session
);
737 dout(10) << "ignoring renewcaps on non open|stale session (" << session
->get_state_name() << ")" << dendl
;
741 case CEPH_SESSION_REQUEST_CLOSE
:
743 if (session
->is_closed() ||
744 session
->is_closing() ||
745 session
->is_killing()) {
746 dout(10) << "already closed|closing|killing, dropping this req" << dendl
;
749 if (session
->is_importing()) {
750 dout(10) << "ignoring close req on importing session" << dendl
;
753 ceph_assert(session
->is_open() ||
754 session
->is_stale() ||
755 session
->is_opening());
756 if (m
->get_seq() < session
->get_push_seq()) {
757 dout(10) << "old push seq " << m
->get_seq() << " < " << session
->get_push_seq()
758 << ", dropping" << dendl
;
761 // We are getting a seq that is higher than expected.
762 // Handle the same as any other seqn error.
764 if (m
->get_seq() != session
->get_push_seq()) {
765 dout(0) << "old push seq " << m
->get_seq() << " != " << session
->get_push_seq()
766 << ", BUGGY!" << dendl
;
767 mds
->clog
->warn() << "incorrect push seq " << m
->get_seq() << " != "
768 << session
->get_push_seq() << ", dropping" << " from client : " << session
->get_human_name();
771 journal_close_session(session
, Session::STATE_CLOSING
, NULL
);
775 case CEPH_SESSION_FLUSHMSG_ACK
:
776 finish_flush_session(session
, m
->get_seq());
779 case CEPH_SESSION_REQUEST_FLUSH_MDLOG
:
780 if (mds
->is_active())
789 void Server::flush_session(Session
*session
, MDSGatherBuilder
& gather
) {
790 if (!session
->is_open() ||
791 !session
->get_connection() ||
792 !session
->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER
)) {
796 version_t seq
= session
->wait_for_flush(gather
.new_sub());
797 mds
->send_message_client(
798 make_message
<MClientSession
>(CEPH_SESSION_FLUSHMSG
, seq
), session
);
801 void Server::flush_client_sessions(set
<client_t
>& client_set
, MDSGatherBuilder
& gather
)
803 for (const auto& client
: client_set
) {
804 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(client
.v
));
805 ceph_assert(session
);
806 flush_session(session
, gather
);
810 void Server::finish_flush_session(Session
*session
, version_t seq
)
812 MDSContext::vec finished
;
813 session
->finish_flush(seq
, finished
);
814 mds
->queue_waiters(finished
);
817 void Server::_session_logged(Session
*session
, uint64_t state_seq
, bool open
, version_t pv
,
818 const interval_set
<inodeno_t
>& inos_to_free
, version_t piv
,
819 const interval_set
<inodeno_t
>& inos_to_purge
, LogSegment
*ls
)
821 dout(10) << "_session_logged " << session
->info
.inst
822 << " state_seq " << state_seq
823 << " " << (open
? "open":"close") << " " << pv
824 << " inos_to_free " << inos_to_free
<< " inotablev " << piv
825 << " inos_to_purge " << inos_to_purge
<< dendl
;
828 if (inos_to_purge
.size()){
830 session
->info
.prealloc_inos
.subtract(inos_to_purge
);
831 ls
->purging_inodes
.insert(inos_to_purge
);
832 if (mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping())
833 mdcache
->purge_inodes(inos_to_purge
, ls
);
836 if (inos_to_free
.size()) {
838 ceph_assert(session
->is_closing() || session
->is_killing() ||
839 session
->is_opening()); // re-open closing session
840 session
->info
.prealloc_inos
.subtract(inos_to_free
);
841 mds
->inotable
->apply_release_ids(inos_to_free
);
842 ceph_assert(mds
->inotable
->get_version() == piv
);
844 session
->free_prealloc_inos
= session
->info
.prealloc_inos
;
845 session
->delegated_inos
.clear();
848 mds
->sessionmap
.mark_dirty(session
);
851 if (session
->get_state_seq() != state_seq
) {
852 dout(10) << " journaled state_seq " << state_seq
<< " != current " << session
->get_state_seq()
853 << ", noop" << dendl
;
854 // close must have been canceled (by an import?), or any number of other things..
856 ceph_assert(session
->is_opening());
857 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
858 mds
->sessionmap
.touch_session(session
);
859 metrics_handler
->add_session(session
);
860 ceph_assert(session
->get_connection());
861 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_OPEN
);
862 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
)) {
863 reply
->supported_features
= supported_features
;
864 reply
->metric_spec
= supported_metric_spec
;
866 mds
->send_message_client(reply
, session
);
867 if (mdcache
->is_readonly()) {
868 auto m
= make_message
<MClientSession
>(CEPH_SESSION_FORCE_RO
);
869 mds
->send_message_client(m
, session
);
871 } else if (session
->is_closing() ||
872 session
->is_killing()) {
873 // kill any lingering capabilities, leases, requests
874 bool killing
= session
->is_killing();
875 while (!session
->caps
.empty()) {
876 Capability
*cap
= session
->caps
.front();
877 CInode
*in
= cap
->get_inode();
878 dout(20) << " killing capability " << ccap_string(cap
->issued()) << " on " << *in
<< dendl
;
879 mds
->locker
->remove_client_cap(in
, cap
, killing
);
881 while (!session
->leases
.empty()) {
882 ClientLease
*r
= session
->leases
.front();
883 CDentry
*dn
= static_cast<CDentry
*>(r
->parent
);
884 dout(20) << " killing client lease of " << *dn
<< dendl
;
885 dn
->remove_client_lease(r
, mds
->locker
);
887 if (client_reconnect_gather
.erase(session
->info
.get_client())) {
888 dout(20) << " removing client from reconnect set" << dendl
;
889 if (client_reconnect_gather
.empty()) {
890 dout(7) << " client " << session
->info
.inst
<< " was last reconnect, finishing" << dendl
;
891 reconnect_gather_finish();
894 if (client_reclaim_gather
.erase(session
->info
.get_client())) {
895 dout(20) << " removing client from reclaim set" << dendl
;
896 if (client_reclaim_gather
.empty()) {
897 dout(7) << " client " << session
->info
.inst
<< " was last reclaimed, finishing" << dendl
;
898 mds
->maybe_clientreplay_done();
902 if (session
->is_closing()) {
903 // mark con disposable. if there is a fault, we will get a
904 // reset and clean it up. if the client hasn't received the
905 // CLOSE message yet, they will reconnect and get an
906 // ms_handle_remote_reset() and realize they had in fact closed.
907 // do this *before* sending the message to avoid a possible
909 if (session
->get_connection()) {
910 // Conditional because terminate_sessions will indiscrimately
911 // put sessions in CLOSING whether they ever had a conn or not.
912 session
->get_connection()->mark_disposable();
916 mds
->send_message_client(make_message
<MClientSession
>(CEPH_SESSION_CLOSE
), session
);
917 mds
->sessionmap
.set_state(session
, Session::STATE_CLOSED
);
919 metrics_handler
->remove_session(session
);
920 mds
->sessionmap
.remove_session(session
);
921 } else if (session
->is_killing()) {
922 // destroy session, close connection
923 if (session
->get_connection()) {
924 session
->get_connection()->mark_down();
925 mds
->sessionmap
.set_state(session
, Session::STATE_CLOSED
);
926 session
->set_connection(nullptr);
928 metrics_handler
->remove_session(session
);
929 mds
->sessionmap
.remove_session(session
);
/*
 * Inject sessions from some source other than actual connections.
 *  - sessions inferred from journal replay
 *  - sessions learned from other MDSs during rejoin
 *  - sessions learned from other MDSs during dir/caps migration
 *  - sessions learned from other MDSs during a cross-MDS rename
 */
/*
 * Build/refresh projected Session objects for a set of clients (e.g. during a
 * cap import), returning the projected SessionMap version.
 *
 * NOTE(review): this chunk is garbled — closing braces, the erase/advance
 * logic of the blocklist scan, the `sseq` declaration and else-branches were
 * dropped by the mangling; code tokens below are preserved exactly as found.
 * Restore from the canonical Server.cc before compiling.
 */
version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
                                              map<client_t,client_metadata_t>& cmm,
                                              map<client_t, pair<Session*,uint64_t> >& smap)
  // project (but do not yet persist) a new sessionmap version
  version_t pv = mds->sessionmap.get_projected();

  dout(10) << "prepare_force_open_sessions " << pv
           << " on " << cm.size() << " clients"
  // drop blocklisted clients from the incoming maps before opening sessions
  mds->objecter->with_osdmap(
      [this, &cm, &cmm](const OSDMap &osd_map) {
        for (auto p = cm.begin(); p != cm.end(); ) {
          if (osd_map.is_blocklisted(p->second.addr)) {
            dout(10) << " ignoring blocklisted client." << p->first
                     << " (" << p->second.addr << ")" << dendl;
            // NOTE(review): erase-from-cm/cmm + iterator-advance lines missing here

  // get or open a session for each surviving client, mark it projected
  for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
    Session *session = mds->sessionmap.get_or_add_session(p->second);
    pv = mds->sessionmap.mark_projected(session);
    if (session->is_closed() ||
        session->is_closing() ||
        session->is_killing()) {
      // (re)open the session and remember its new state sequence
      sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
      auto q = cmm.find(p->first);
      // merge any caller-supplied client metadata into the session
      session->info.client_metadata.merge(q->second);
      // NOTE(review): `sseq` declaration, metadata-found guard and the
      // else-branch for already-open sessions are missing from this chunk
      ceph_assert(session->is_open() ||
                  session->is_opening() ||
                  session->is_stale());
    // record (session, state seq) for finish_force_open_sessions()
    smap[p->first] = make_pair(session, sseq);
    // pin the session so idle/blocklist logic won't kill it mid-import
    session->inc_importing();
/*
 * Transition the sessions projected by prepare_force_open_sessions() to their
 * final state (OPEN) and notify the affected clients.
 *
 * FIXME: need to carefully consider the race conditions between a
 * client trying to close a session and an MDS doing an import
 * trying to force open a session...
 *
 * NOTE(review): garbled chunk — the trailing parameter(s) of the signature,
 * branch openers and closing braces were dropped by the mangling; code tokens
 * below are preserved exactly as found.
 */
void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
  dout(10) << "finish_force_open_sessions on " << smap.size() << " clients,"
           << " initial v " << mds->sessionmap.get_version() << dendl;

  for (auto &it : smap) {
    Session *session = it.second.first;
    uint64_t sseq = it.second.second;
    if (session->get_state_seq() != sseq) {
      // state sequence changed since prepare — someone raced with us; skip
      dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
      // NOTE(review): else-branch opener (session still in OPENING) missing here
      dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
      mds->sessionmap.set_state(session, Session::STATE_OPEN);
      mds->sessionmap.touch_session(session);
      metrics_handler->add_session(session);

      // tell the client its session is now open
      auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
      if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {
        reply->supported_features = supported_features;
        reply->metric_spec = supported_metric_spec;
      mds->send_message_client(reply, session);

      if (mdcache->is_readonly())
        mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
      // NOTE(review): already-open branch opener missing here
      dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
      ceph_assert(session->is_open() || session->is_stale());
    // drop the pin taken by prepare_force_open_sessions()
    session->dec_importing();
  mds->sessionmap.mark_dirty(session);

  dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
1042 class C_MDS_TerminatedSessions
: public ServerContext
{
1043 void finish(int r
) override
{
1044 server
->terminating_sessions
= false;
1047 explicit C_MDS_TerminatedSessions(Server
*s
) : ServerContext(s
) {}
1050 void Server::terminate_sessions()
1052 dout(5) << "terminating all sessions..." << dendl
;
1054 terminating_sessions
= true;
1056 // kill them off. clients will retry etc.
1057 set
<Session
*> sessions
;
1058 mds
->sessionmap
.get_client_session_set(sessions
);
1059 for (set
<Session
*>::const_iterator p
= sessions
.begin();
1060 p
!= sessions
.end();
1062 Session
*session
= *p
;
1063 if (session
->is_closing() ||
1064 session
->is_killing() ||
1065 session
->is_closed())
1067 journal_close_session(session
, Session::STATE_CLOSING
, NULL
);
1070 mdlog
->wait_for_safe(new C_MDS_TerminatedSessions(this));
/*
 * Scan client sessions from the MDS tick: mark laggy OPEN sessions STALE and
 * collect unresponsive ones (past the autoclose threshold) for eviction.
 *
 * NOTE(review): garbled chunk — return/continue/break statements, else
 * branches and closing braces were dropped by the mangling; code tokens are
 * preserved exactly as found.  Restore from canonical Server.cc before
 * compiling.
 */
void Server::find_idle_sessions()
  auto now = clock::now();
  auto last_cleared_laggy = mds->last_cleared_laggy();

  dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy << "s ago" << dendl;

  // (caps go stale, lease die)
  double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now());
  double cutoff = queue_max_age + mds->mdsmap->get_session_timeout();

  // don't kick clients if we've been laggy
  if (last_cleared_laggy < cutoff) {
    dout(10) << " last cleared laggy " << last_cleared_laggy << "s ago (< cutoff " << cutoff
             << "), not marking any client stale" << dendl;
    // NOTE(review): early return missing here

  std::vector<Session*> to_evict;

  bool defer_session_stale = g_conf().get_val<bool>("mds_defer_session_stale");
  const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN);
  if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) {
    std::vector<Session*> new_stale;

    for (auto session : *(sessions_p1->second)) {
      auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
      if (last_cap_renew_span < cutoff) {
        dout(20) << "laggiest active session is " << session->info.inst
                 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
        // NOTE(review): loop-exit line missing here

      // prefer the last time we actually heard from the client, if newer
      if (session->last_seen > session->last_cap_renew) {
        last_cap_renew_span = std::chrono::duration<double>(now - session->last_seen).count();
        if (last_cap_renew_span < cutoff) {
          dout(20) << "laggiest active session is " << session->info.inst
                   << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
          // NOTE(review): continue missing here

      if (last_cap_renew_span >= mds->mdsmap->get_session_autoclose()) {
        dout(20) << "evicting session " << session->info.inst << " since autoclose "
                    "has arrived" << dendl;
        // evict session without marking it stale
        to_evict.push_back(session);
        // NOTE(review): continue missing here

      if (defer_session_stale &&
          !session->is_any_flush_waiter() &&
          !mds->locker->is_revoking_any_caps_from(session->get_client())) {
        dout(20) << "deferring marking session " << session->info.inst << " stale "
                    "since it holds no caps" << dendl;
        // NOTE(review): continue missing here

      // sessions that declared their own "timeout" are evicted directly
      // once that timeout (plus queue age) has elapsed
      auto it = session->info.client_metadata.find("timeout");
      if (it != session->info.client_metadata.end()) {
        unsigned timeout = strtoul(it->second.c_str(), nullptr, 0);
        // NOTE(review): zero-timeout guard missing here
        dout(10) << "skipping session " << session->info.inst
                 << ", infinite timeout specified" << dendl;
        double cutoff = queue_max_age + timeout;
        if (last_cap_renew_span < cutoff) {
          dout(10) << "skipping session " << session->info.inst
                   << ", timeout (" << timeout << ") specified"
                   << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
        // do not go through stale, evict it directly.
        to_evict.push_back(session);
        // NOTE(review): else-branch opener missing here
        dout(10) << "new stale session " << session->info.inst
                 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
        new_stale.push_back(session);

    // mark the collected sessions STALE; revoke caps and leases
    for (auto session : new_stale) {
      mds->sessionmap.set_state(session, Session::STATE_STALE);
      if (mds->locker->revoke_stale_caps(session)) {
        mds->locker->remove_stale_leases(session);
        finish_flush_session(session, session->get_push_seq());
        auto m = make_message<MClientSession>(CEPH_SESSION_STALE, session->get_push_seq());
        mds->send_message_client(m, session);
        // NOTE(review): else-branch (queue for eviction) opener missing here
        to_evict.push_back(session);

  // autoclose
  cutoff = queue_max_age + mds->mdsmap->get_session_autoclose();

  // Collect a list of sessions exceeding the autoclose threshold
  const auto sessions_p2 = mds->sessionmap.by_state.find(Session::STATE_STALE);
  if (sessions_p2 != mds->sessionmap.by_state.end() && !sessions_p2->second->empty()) {
    for (auto session : *(sessions_p2->second)) {
      ceph_assert(session->is_stale());
      auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
      if (last_cap_renew_span < cutoff) {
        dout(20) << "oldest stale session is " << session->info.inst
                 << " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl;
        // NOTE(review): loop-exit line missing here
      to_evict.push_back(session);

  for (auto session : to_evict) {
    if (session->is_importing()) {
      dout(10) << "skipping session " << session->info.inst << ", it's being imported" << dendl;
      // NOTE(review): continue missing here

    auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
    mds->clog->warn() << "evicting unresponsive client " << *session
                      << ", after " << last_cap_renew_span << " seconds";
    dout(10) << "autoclosing stale session " << session->info.inst
             << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;

    if (g_conf()->mds_session_blocklist_on_timeout) {
      CachedStackStringStream css;
      mds->evict_client(session->get_client().v, false, true, *css, nullptr);
      // NOTE(review): else-branch opener missing here
      kill_session(session, NULL);
1210 void Server::evict_cap_revoke_non_responders() {
1211 if (!cap_revoke_eviction_timeout
) {
1215 auto&& to_evict
= mds
->locker
->get_late_revoking_clients(cap_revoke_eviction_timeout
);
1217 for (auto const &client
: to_evict
) {
1218 mds
->clog
->warn() << "client id " << client
<< " has not responded to"
1219 << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
1220 << " seconds, evicting";
1221 dout(1) << __func__
<< ": evicting cap revoke non-responder client id "
1224 CachedStackStringStream css
;
1225 bool evicted
= mds
->evict_client(client
.v
, false,
1226 g_conf()->mds_session_blocklist_on_evict
,
1228 if (evicted
&& logger
) {
1229 logger
->inc(l_mdss_cap_revoke_eviction
);
1234 void Server::handle_conf_change(const std::set
<std::string
>& changed
) {
1235 if (changed
.count("mds_forward_all_requests_to_auth")){
1236 forward_all_requests_to_auth
= g_conf().get_val
<bool>("mds_forward_all_requests_to_auth");
1238 if (changed
.count("mds_cap_revoke_eviction_timeout")) {
1239 cap_revoke_eviction_timeout
= g_conf().get_val
<double>("mds_cap_revoke_eviction_timeout");
1240 dout(20) << __func__
<< " cap revoke eviction timeout changed to "
1241 << cap_revoke_eviction_timeout
<< dendl
;
1243 if (changed
.count("mds_recall_max_decay_rate")) {
1244 recall_throttle
= DecayCounter(g_conf().get_val
<double>("mds_recall_max_decay_rate"));
1246 if (changed
.count("mds_max_snaps_per_dir")) {
1247 max_snaps_per_dir
= g_conf().get_val
<uint64_t>("mds_max_snaps_per_dir");
1248 dout(20) << __func__
<< " max snapshots per directory changed to "
1249 << max_snaps_per_dir
<< dendl
;
1251 if (changed
.count("mds_client_delegate_inos_pct")) {
1252 delegate_inos_pct
= g_conf().get_val
<uint64_t>("mds_client_delegate_inos_pct");
1254 if (changed
.count("mds_max_caps_per_client")) {
1255 max_caps_per_client
= g_conf().get_val
<uint64_t>("mds_max_caps_per_client");
1257 if (changed
.count("mds_session_cap_acquisition_throttle")) {
1258 cap_acquisition_throttle
= g_conf().get_val
<uint64_t>("mds_session_cap_acquisition_throttle");
1260 if (changed
.count("mds_session_max_caps_throttle_ratio")) {
1261 max_caps_throttle_ratio
= g_conf().get_val
<double>("mds_session_max_caps_throttle_ratio");
1263 if (changed
.count("mds_cap_acquisition_throttle_retry_request_timeout")) {
1264 caps_throttle_retry_request_timeout
= g_conf().get_val
<double>("mds_cap_acquisition_throttle_retry_request_timeout");
1266 if (changed
.count("mds_alternate_name_max")) {
1267 alternate_name_max
= g_conf().get_val
<Option::size_t>("mds_alternate_name_max");
1269 if (changed
.count("mds_dir_max_entries")) {
1270 dir_max_entries
= g_conf().get_val
<uint64_t>("mds_dir_max_entries");
1271 dout(20) << __func__
<< " max entries per directory changed to "
1272 << dir_max_entries
<< dendl
;
1274 if (changed
.count("mds_bal_fragment_size_max")) {
1275 bal_fragment_size_max
= g_conf().get_val
<int64_t>("mds_bal_fragment_size_max");
1276 dout(20) << __func__
<< " max fragment size changed to "
1277 << bal_fragment_size_max
<< dendl
;
1282 * XXX bump in the interface here, not using an MDSContext here
1283 * because all the callers right now happen to use a SaferCond
1285 void Server::kill_session(Session
*session
, Context
*on_safe
)
1287 ceph_assert(ceph_mutex_is_locked_by_me(mds
->mds_lock
));
1289 if ((session
->is_opening() ||
1290 session
->is_open() ||
1291 session
->is_stale()) &&
1292 !session
->is_importing()) {
1293 dout(10) << "kill_session " << session
<< dendl
;
1294 journal_close_session(session
, Session::STATE_KILLING
, on_safe
);
1296 dout(10) << "kill_session importing or already closing/killing " << session
<< dendl
;
1297 if (session
->is_closing() ||
1298 session
->is_killing()) {
1300 mdlog
->wait_for_safe(new MDSInternalContextWrapper(mds
, on_safe
));
1302 ceph_assert(session
->is_closed() ||
1303 session
->is_importing());
1305 on_safe
->complete(0);
1310 size_t Server::apply_blocklist()
1312 std::vector
<Session
*> victims
;
1313 const auto& sessions
= mds
->sessionmap
.get_sessions();
1314 mds
->objecter
->with_osdmap(
1315 [&](const OSDMap
& o
) {
1316 for (const auto& p
: sessions
) {
1317 if (!p
.first
.is_client()) {
1318 // Do not apply OSDMap blocklist to MDS daemons, we find out
1319 // about their death via MDSMap.
1322 if (o
.is_blocklisted(p
.second
->info
.inst
.addr
)) {
1323 victims
.push_back(p
.second
);
1328 for (const auto& s
: victims
) {
1329 kill_session(s
, nullptr);
1332 dout(10) << "apply_blocklist: killed " << victims
.size() << dendl
;
1334 return victims
.size();
1337 void Server::journal_close_session(Session
*session
, int state
, Context
*on_safe
)
1339 dout(10) << __func__
<< " : "
1340 << session
->info
.inst
1341 << " pending_prealloc_inos " << session
->pending_prealloc_inos
1342 << " free_prealloc_inos " << session
->free_prealloc_inos
1343 << " delegated_inos " << session
->delegated_inos
<< dendl
;
1345 uint64_t sseq
= mds
->sessionmap
.set_state(session
, state
);
1346 version_t pv
= mds
->sessionmap
.mark_projected(session
);
1349 // release alloc and pending-alloc inos for this session
1350 // and wipe out session state, in case the session close aborts for some reason
1351 interval_set
<inodeno_t
> inos_to_free
;
1352 inos_to_free
.insert(session
->pending_prealloc_inos
);
1353 inos_to_free
.insert(session
->free_prealloc_inos
);
1354 if (inos_to_free
.size()) {
1355 mds
->inotable
->project_release_ids(inos_to_free
);
1356 piv
= mds
->inotable
->get_projected_version();
1360 auto le
= new ESession(session
->info
.inst
, false, pv
, inos_to_free
, piv
, session
->delegated_inos
);
1361 auto fin
= new C_MDS_session_finish(this, session
, sseq
, false, pv
, inos_to_free
, piv
,
1362 session
->delegated_inos
, mdlog
->get_current_segment(), on_safe
);
1363 mdlog
->start_submit_entry(le
, fin
);
1366 // clean up requests, too
1367 while(!session
->requests
.empty()) {
1368 auto mdr
= MDRequestRef(*session
->requests
.begin());
1369 mdcache
->request_kill(mdr
);
1372 finish_flush_session(session
, session
->get_push_seq());
1375 void Server::reconnect_clients(MDSContext
*reconnect_done_
)
1377 reconnect_done
= reconnect_done_
;
1379 auto now
= clock::now();
1380 set
<Session
*> sessions
;
1381 mds
->sessionmap
.get_client_session_set(sessions
);
1382 for (auto session
: sessions
) {
1383 if (session
->is_open()) {
1384 client_reconnect_gather
.insert(session
->get_client());
1385 session
->set_reconnecting(true);
1386 session
->last_cap_renew
= now
;
1390 if (client_reconnect_gather
.empty()) {
1391 dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl
;
1392 reconnect_gather_finish();
1396 // clients will get the mdsmap and discover we're reconnecting via the monitor.
1398 reconnect_start
= now
;
1399 dout(1) << "reconnect_clients -- " << client_reconnect_gather
.size() << " sessions" << dendl
;
1400 mds
->sessionmap
.dump();
/*
 * Handle an MClientReconnect during the reconnect phase: validate the
 * session, possibly deny the reconnect (wrong state, missing features,
 * readonly, mds_deny_all_reconnect), then re-register the client's
 * snaprealms, caps and file locks.
 *
 * NOTE(review): garbled chunk — guard/else openers, return and continue
 * statements and closing braces were dropped by the mangling; code tokens are
 * preserved exactly as found.  Restore from canonical Server.cc before
 * compiling.
 */
void Server::handle_client_reconnect(const cref_t<MClientReconnect> &m)
  dout(7) << "handle_client_reconnect " << m->get_source()
          << (m->has_more() ? " (more)" : "") << dendl;
  client_t from = m->get_source().num();
  Session *session = mds->get_session(m);
  // NOTE(review): null-session guard opener missing here
    dout(0) << " ignoring sessionless msg " << *m << dendl;
    auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
    reply->metadata["error_string"] = "sessionless";
    mds->send_message(reply, m->get_connection());

  if (!session->is_open()) {
    dout(0) << " ignoring msg from not-open session" << *m << dendl;
    auto reply = make_message<MClientSession>(CEPH_SESSION_CLOSE);
    mds->send_message(reply, m->get_connection());
    // NOTE(review): return missing here

  bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
  if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
    // mdsmap delivery race: retry once we are actually in reconnect state
    dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
    mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
    // NOTE(review): return missing here

  auto delay = std::chrono::duration<double>(clock::now() - reconnect_start).count();
  dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;

  if (reconnect_all_deny || !mds->is_reconnect() || mds->get_want_state() != CEPH_MDS_STATE_RECONNECT || reconnect_evicting) {
    // XXX maybe in the future we can do better than this?
    if (reconnect_all_deny) {
      dout(1) << "mds_deny_all_reconnect was set to speed up reboot phase, ignoring reconnect, sending close" << dendl;
      // NOTE(review): else-branch opener missing here
      dout(1) << "no longer in reconnect state, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is "
                      << ceph_mds_state_name(mds->get_state())
                      << ") from " << m->get_source_inst()
                      << " after " << delay << " (allowed interval " << g_conf()->mds_reconnect_timeout << ")";
    // NOTE(review): deny-flag assignment missing here

  std::string error_str;
  if (!session->is_open()) {
    error_str = "session is closed";
  } else if (mdcache->is_readonly()) {
    error_str = "mds is readonly";
    // NOTE(review): else-branch opener (feature check) missing here
    if (session->info.client_metadata.features.empty())
      infer_supported_features(session, session->info.client_metadata);

    feature_bitset_t missing_features = required_client_features;
    missing_features -= session->info.client_metadata.features;
    if (!missing_features.empty()) {
      CachedStackStringStream css;
      *css << "missing required features '" << missing_features << "'";
      error_str = css->strv();

  if (!error_str.empty()) {
    dout(1) << " " << error_str << ", ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt from "
                      << m->get_source_inst() << " (" << error_str << ")";
    // NOTE(review): deny-flag assignment missing here

  // NOTE(review): deny-handling branch opener missing here
    auto r = make_message<MClientSession>(CEPH_SESSION_CLOSE);
    mds->send_message_client(r, session);
    if (session->is_open()) {
      client_reconnect_denied.insert(session->get_client());
    // NOTE(review): return missing here

  if (!m->has_more()) {
    metrics_handler->add_session(session);
    // notify client of success with an OPEN
    auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
    if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {
      reply->supported_features = supported_features;
      reply->metric_spec = supported_metric_spec;
    mds->send_message_client(reply, session);
    mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;

  session->last_cap_renew = clock::now();

  // re-register the client's snaprealms
  for (const auto &r : m->realms) {
    CInode *in = mdcache->get_inode(inodeno_t(r.realm.ino));
    if (in && in->state_test(CInode::STATE_PURGING))
      // NOTE(review): continue missing here
    // NOTE(review): inode-present branch opener missing here
      if (in->snaprealm) {
        dout(15) << "open snaprealm (w inode) on " << *in << dendl;
        // NOTE(review): else-branch opener missing here
        // this can happen if we are non-auth or we rollback snaprealm
        dout(15) << "open snaprealm (null snaprealm) on " << *in << dendl;
        mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
      // NOTE(review): inode-absent branch opener missing here
      dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r.realm.ino)
               << " seq " << r.realm.seq << dendl;
      mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));

  // re-register the client's caps
  for (const auto &p : m->caps) {
    // make sure our last_cap_id is MAX over all issued caps
    if (p.second.capinfo.cap_id > mdcache->last_cap_id)
      mdcache->last_cap_id = p.second.capinfo.cap_id;

    CInode *in = mdcache->get_inode(p.first);
    if (in && in->state_test(CInode::STATE_PURGING))
      // NOTE(review): continue missing here
    if (in && in->is_auth()) {
      // we recovered it, and it's ours.  take note.
      dout(15) << "open cap realm " << inodeno_t(p.second.capinfo.snaprealm)
               << " on " << *in << dendl;
      in->reconnect_cap(from, p.second, session);
      mdcache->add_reconnected_cap(from, p.first, p.second);
      recover_filelocks(in, p.second.flockbl, m->get_orig_source().num());
      // NOTE(review): continue + closing brace missing here

    if (in && !in->is_auth()) {
      // non-auth inode: hand the cap off to the authoritative MDS
      dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
      // add to cap export list.
      mdcache->rejoin_export_caps(p.first, from, p.second,
                                  in->authority().first, true);
      // NOTE(review): else-branch opener missing here
      // don't know if the inode is mine
      dout(10) << "missing ino " << p.first << ", will load later" << dendl;
      mdcache->rejoin_recovered_caps(p.first, from, p.second, MDS_RANK_NONE);

  reconnect_last_seen = clock::now();

  if (!m->has_more()) {
    mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);

    // remove from gather set
    client_reconnect_gather.erase(from);
    session->set_reconnecting(false);
    if (client_reconnect_gather.empty())
      reconnect_gather_finish();
1563 void Server::infer_supported_features(Session
*session
, client_metadata_t
& client_metadata
)
1566 auto it
= client_metadata
.find("ceph_version");
1567 if (it
!= client_metadata
.end()) {
1568 // user space client
1569 if (it
->second
.compare(0, 16, "ceph version 12.") == 0)
1570 supported
= CEPHFS_FEATURE_LUMINOUS
;
1571 else if (session
->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR
))
1572 supported
= CEPHFS_FEATURE_KRAKEN
;
1574 it
= client_metadata
.find("kernel_version");
1575 if (it
!= client_metadata
.end()) {
1577 if (session
->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING
))
1578 supported
= CEPHFS_FEATURE_LUMINOUS
;
1581 if (supported
== -1 &&
1582 session
->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2
))
1583 supported
= CEPHFS_FEATURE_JEWEL
;
1585 if (supported
>= 0) {
1586 unsigned long value
= (1UL << (supported
+ 1)) - 1;
1587 client_metadata
.features
= feature_bitset_t(value
);
1588 dout(10) << __func__
<< " got '" << client_metadata
.features
<< "'" << dendl
;
1592 void Server::update_required_client_features()
1594 required_client_features
= mds
->mdsmap
->get_required_client_features();
1595 dout(7) << "required_client_features: " << required_client_features
<< dendl
;
1597 if (mds
->get_state() >= MDSMap::STATE_RECONNECT
) {
1598 set
<Session
*> sessions
;
1599 mds
->sessionmap
.get_client_session_set(sessions
);
1600 for (auto session
: sessions
) {
1601 feature_bitset_t missing_features
= required_client_features
;
1602 missing_features
-= session
->info
.client_metadata
.features
;
1603 if (!missing_features
.empty()) {
1604 bool blocklisted
= mds
->objecter
->with_osdmap(
1605 [session
](const OSDMap
&osd_map
) -> bool {
1606 return osd_map
.is_blocklisted(session
->info
.inst
.addr
);
1611 mds
->clog
->warn() << "evicting session " << *session
<< ", missing required features '"
1612 << missing_features
<< "'";
1613 CachedStackStringStream css
;
1614 mds
->evict_client(session
->get_client().v
, false,
1615 g_conf()->mds_session_blocklist_on_evict
, *css
);
1621 void Server::reconnect_gather_finish()
1623 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects
<< " clients" << dendl
;
1624 ceph_assert(reconnect_done
);
1626 if (!mds
->snapclient
->is_synced()) {
1627 // make sure snaptable cache is populated. snaprealms will be
1628 // extensively used in rejoin stage.
1629 dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl
;
1630 mds
->snapclient
->wait_for_sync(reconnect_done
);
1632 reconnect_done
->complete(0);
1634 reconnect_done
= NULL
;
/*
 * Periodic driver of the reconnect phase: extend the window while clients are
 * still heard from, otherwise evict the stragglers (keeping sessions with an
 * explicit "timeout", which move to the reclaim gather) and finish.
 *
 * NOTE(review): garbled chunk — return/continue statements, the
 * gather.new_sub() wiring and closing braces were dropped by the mangling;
 * code tokens are preserved exactly as found.  Restore from canonical
 * Server.cc before compiling.
 */
void Server::reconnect_tick()
  bool reject_all_reconnect = false;
  if (reconnect_evicting) {
    dout(7) << "reconnect_tick: waiting for evictions" << dendl;
    // NOTE(review): return missing here

  /*
   * Set mds_deny_all_reconnect to reject all the reconnect req ,
   * then load less meta information in rejoin phase. This will shorten reboot time.
   * Moreover, loading less meta increases the chance standby with less memory can failover.
   *
   * Why not shorten reconnect period?
   * Clients may send unsafe or retry requests, which haven't been
   * completed before old mds stop, to new mds. These requests may
   * need to be processed during new mds's clientreplay phase,
   * see: #https://github.com/ceph/ceph/pull/29059.
   */
  bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
  if (client_reconnect_gather.empty())
    // NOTE(review): return missing here

  if (reconnect_all_deny && (client_reconnect_gather == client_reconnect_denied))
    reject_all_reconnect = true;

  auto now = clock::now();
  auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count();
  if (elapse1 < g_conf()->mds_reconnect_timeout && !reject_all_reconnect)
    // NOTE(review): return missing here

  vector<Session*> remaining_sessions;
  remaining_sessions.reserve(client_reconnect_gather.size());
  for (auto c : client_reconnect_gather) {
    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
    ceph_assert(session);
    remaining_sessions.push_back(session);
    // client re-sends cap flush messages before the reconnect message
    if (session->last_seen > reconnect_last_seen)
      reconnect_last_seen = session->last_seen;

  auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count();
  if (elapse2 < g_conf()->mds_reconnect_timeout / 2 && !reject_all_reconnect) {
    dout(7) << "reconnect_tick: last seen " << elapse2
            << " seconds ago, extending reconnect interval" << dendl;
    // NOTE(review): return missing here

  dout(7) << "reconnect timed out, " << remaining_sessions.size()
          << " clients have not reconnected in time" << dendl;

  // If we're doing blocklist evictions, use this to wait for them before
  // proceeding to reconnect_gather_finish
  MDSGatherBuilder gather(g_ceph_context);

  for (auto session : remaining_sessions) {
    // Keep sessions that have specified timeout. These sessions will prevent
    // mds from going to active. MDS goes to active after they all have been
    // killed or reclaimed.
    if (session->info.client_metadata.find("timeout") !=
        session->info.client_metadata.end()) {
      dout(1) << "reconnect keeps " << session->info.inst
              << ", need to be reclaimed" << dendl;
      client_reclaim_gather.insert(session->get_client());
      // NOTE(review): continue missing here

    dout(1) << "reconnect gives up on " << session->info.inst << dendl;

    mds->clog->warn() << "evicting unresponsive client " << *session
                      << ", after waiting " << elapse1
                      << " seconds during MDS startup";

    // make _session_logged() purge orphan objects of lost async/unsafe requests
    session->delegated_inos.swap(session->free_prealloc_inos);

    if (g_conf()->mds_session_blocklist_on_timeout) {
      CachedStackStringStream css;
      mds->evict_client(session->get_client().v, false, true, *css,
      // NOTE(review): gather.new_sub() argument and else-branch lines missing here
      kill_session(session, NULL);

    failed_reconnects++;

  client_reconnect_gather.clear();
  client_reconnect_denied.clear();

  if (gather.has_subs()) {
    dout(1) << "reconnect will complete once clients are evicted" << dendl;
    gather.set_finisher(new MDSInternalContextWrapper(mds, new LambdaContext(
            [this](int r){reconnect_gather_finish();})));
    reconnect_evicting = true;
    // NOTE(review): else-branch opener missing here
    reconnect_gather_finish();
1738 void Server::recover_filelocks(CInode
*in
, bufferlist locks
, int64_t client
)
1740 if (!locks
.length()) return;
1743 auto p
= locks
.cbegin();
1744 decode(numlocks
, p
);
1745 for (int i
= 0; i
< numlocks
; ++i
) {
1747 lock
.client
= client
;
1748 in
->get_fcntl_lock_state()->held_locks
.insert(pair
<uint64_t, ceph_filelock
>(lock
.start
, lock
));
1749 ++in
->get_fcntl_lock_state()->client_held_lock_counts
[client
];
1751 decode(numlocks
, p
);
1752 for (int i
= 0; i
< numlocks
; ++i
) {
1754 lock
.client
= client
;
1755 in
->get_flock_lock_state()->held_locks
.insert(pair
<uint64_t, ceph_filelock
> (lock
.start
, lock
));
1756 ++in
->get_flock_lock_state()->client_held_lock_counts
[client
];
/*
 * Call this when the MDCache is oversized, to send requests to the clients
 * to trim some caps, and consequently unpin some inodes in the MDCache so
 * that it can trim too.
 */
1765 std::pair
<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder
* gather
, RecallFlags flags
)
1767 const auto now
= clock::now();
1768 const bool steady
= !!(flags
&RecallFlags::STEADY
);
1769 const bool enforce_max
= !!(flags
&RecallFlags::ENFORCE_MAX
);
1770 const bool enforce_liveness
= !!(flags
&RecallFlags::ENFORCE_LIVENESS
);
1771 const bool trim
= !!(flags
&RecallFlags::TRIM
);
1773 const auto max_caps_per_client
= g_conf().get_val
<uint64_t>("mds_max_caps_per_client");
1774 const auto min_caps_per_client
= g_conf().get_val
<uint64_t>("mds_min_caps_per_client");
1775 const auto recall_global_max_decay_threshold
= g_conf().get_val
<Option::size_t>("mds_recall_global_max_decay_threshold");
1776 const auto recall_max_caps
= g_conf().get_val
<Option::size_t>("mds_recall_max_caps");
1777 const auto recall_max_decay_threshold
= g_conf().get_val
<Option::size_t>("mds_recall_max_decay_threshold");
1778 const auto cache_liveness_magnitude
= g_conf().get_val
<Option::size_t>("mds_session_cache_liveness_magnitude");
1780 dout(7) << __func__
<< ":"
1781 << " min=" << min_caps_per_client
1782 << " max=" << max_caps_per_client
1783 << " total=" << Capability::count()
1784 << " flags=" << flags
1787 /* trim caps of sessions with the most caps first */
1788 std::multimap
<uint64_t, Session
*> caps_session
;
1789 auto f
= [&caps_session
, enforce_max
, enforce_liveness
, trim
, max_caps_per_client
, cache_liveness_magnitude
](auto& s
) {
1790 auto num_caps
= s
->caps
.size();
1791 auto cache_liveness
= s
->get_session_cache_liveness();
1792 if (trim
|| (enforce_max
&& num_caps
> max_caps_per_client
) || (enforce_liveness
&& cache_liveness
< (num_caps
>>cache_liveness_magnitude
))) {
1793 caps_session
.emplace(std::piecewise_construct
, std::forward_as_tuple(num_caps
), std::forward_as_tuple(s
));
1796 mds
->sessionmap
.get_client_sessions(std::move(f
));
1798 std::pair
<bool, uint64_t> result
= {false, 0};
1799 auto& [throttled
, caps_recalled
] = result
;
1800 last_recall_state
= now
;
1801 for (const auto& [num_caps
, session
] : boost::adaptors::reverse(caps_session
)) {
1802 if (!session
->is_open() ||
1803 !session
->get_connection() ||
1804 !session
->info
.inst
.name
.is_client())
1807 dout(10) << __func__
<< ":"
1808 << " session " << session
->info
.inst
1809 << " caps " << num_caps
1810 << ", leases " << session
->leases
.size()
1814 if (num_caps
< recall_max_caps
|| (num_caps
-recall_max_caps
) < min_caps_per_client
) {
1815 newlim
= min_caps_per_client
;
1817 newlim
= num_caps
-recall_max_caps
;
1819 if (num_caps
> newlim
) {
1820 /* now limit the number of caps we recall at a time to prevent overloading ourselves */
1821 uint64_t recall
= std::min
<uint64_t>(recall_max_caps
, num_caps
-newlim
);
1822 newlim
= num_caps
-recall
;
1823 const uint64_t session_recall_throttle
= session
->get_recall_caps_throttle();
1824 const uint64_t session_recall_throttle2o
= session
->get_recall_caps_throttle2o();
1825 const uint64_t global_recall_throttle
= recall_throttle
.get();
1826 if (session_recall_throttle
+recall
> recall_max_decay_threshold
) {
1827 dout(15) << " session recall threshold (" << recall_max_decay_threshold
<< ") hit at " << session_recall_throttle
<< "; skipping!" << dendl
;
1830 } else if (session_recall_throttle2o
+recall
> recall_max_caps
*2) {
1831 dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps
<< ") hit at " << session_recall_throttle2o
<< "; skipping!" << dendl
;
1834 } else if (global_recall_throttle
+recall
> recall_global_max_decay_threshold
) {
1835 dout(15) << " global recall threshold (" << recall_global_max_decay_threshold
<< ") hit at " << global_recall_throttle
<< "; skipping!" << dendl
;
1840 // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
1842 const auto session_recall
= session
->get_recall_caps();
1843 const auto session_release
= session
->get_release_caps();
1844 if (2*session_release
< session_recall
&& 2*session_recall
> recall_max_decay_threshold
) {
1845 /* The session has been unable to keep up with the number of caps
1846 * recalled (by half); additionally, to prevent marking sessions
1847 * we've just begun to recall from, the session_recall counter
1848 * (decayed count of caps recently recalled) is **greater** than the
1849 * session threshold for the session's cap recall throttle.
1851 dout(15) << " 2*session_release < session_recall"
1852 " (2*" << session_release
<< " < " << session_recall
<< ") &&"
1853 " 2*session_recall < recall_max_decay_threshold"
1854 " (2*" << session_recall
<< " > " << recall_max_decay_threshold
<< ")"
1855 " Skipping because we are unlikely to get more released." << dendl
;
1857 } else if (recall
< recall_max_caps
&& 2*recall
< session_recall
) {
1858 /* The number of caps recalled is less than the number we *could*
1859 * recall (so there isn't much left to recall?) and the number of
1860 * caps is less than the current recall_caps counter (decayed count
1861 * of caps recently recalled).
1863 dout(15) << " 2*recall < session_recall "
1864 " (2*" << recall
<< " < " << session_recall
<< ") &&"
1865 " recall < recall_max_caps (" << recall
<< " < " << recall_max_caps
<< ");"
1866 " Skipping because we are unlikely to get more released." << dendl
;
1871 dout(7) << " recalling " << recall
<< " caps; session_recall_throttle = " << session_recall_throttle
<< "; global_recall_throttle = " << global_recall_throttle
<< dendl
;
1873 auto m
= make_message
<MClientSession
>(CEPH_SESSION_RECALL_STATE
);
1874 m
->head
.max_caps
= newlim
;
1875 mds
->send_message_client(m
, session
);
1877 flush_session(session
, *gather
);
1879 caps_recalled
+= session
->notify_recall_sent(newlim
);
1880 recall_throttle
.hit(recall
);
1884 dout(7) << "recalled" << (throttled
? " (throttled)" : "") << " " << caps_recalled
<< " client caps." << dendl
;
1889 void Server::force_clients_readonly()
1891 dout(10) << "force_clients_readonly" << dendl
;
1892 set
<Session
*> sessions
;
1893 mds
->sessionmap
.get_client_session_set(sessions
);
1894 for (set
<Session
*>::const_iterator p
= sessions
.begin();
1895 p
!= sessions
.end();
1897 Session
*session
= *p
;
1898 if (!session
->info
.inst
.name
.is_client() ||
1899 !(session
->is_open() || session
->is_stale()))
1901 mds
->send_message_client(make_message
<MClientSession
>(CEPH_SESSION_FORCE_RO
), session
);
1906 * some generic stuff for finishing off requests
/**
 * Journal the given log event for a request and (possibly) reply early.
 *
 * Records the trace targets (inode/dentry) on the mdr for the eventual
 * reply, attempts an unsafe early reply, then submits the log entry.
 * For replayed ops it advances the replay queue; otherwise, if an early
 * reply was sent, rdlocks can be dropped immediately.
 *
 * NOTE(review): formatting reconstructed from a mangled extraction;
 * confirm statement-for-statement against the repository copy.
 */
void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn,
			       LogEvent *le, MDSLogContextBase *fin)
{
  dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
  // a completed request must never be journaled again
  ceph_assert(!mdr->has_completed);

  // note trace items for eventual reply.
  mdr->tracei = in;
  if (in)
    mdr->pin(in);

  mdr->tracedn = dn;
  if (dn)
    mdr->pin(dn);

  // try to send an unsafe reply before the journal commit lands
  early_reply(mdr, in, dn);

  mdr->committing = true;
  submit_mdlog_entry(le, fin, mdr, __func__);

  if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
    // replayed op: keep the replay pipeline moving
    if (mds->queue_one_replay()) {
      dout(10) << " queued next replay op" << dendl;
    } else {
      dout(10) << " journaled last replay op" << dendl;
    }
  } else if (mdr->did_early_reply)
    // client already has its (unsafe) answer; rdlocks no longer needed
    mds->locker->drop_rdlocks_for_early_reply(mdr.get());
  else
    mdlog->flush();
}
/**
 * Submit a log event to the MDS journal, tagging the request's event
 * trail with a human-readable "submit entry: <event>" marker for
 * ops-in-flight introspection.
 *
 * NOTE(review): formatting reconstructed from a mangled extraction;
 * confirm against the repository copy.
 */
void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
                                std::string_view event)
{
  if (mdr) {
    string event_str("submit entry: ");
    event_str += event;
    mdr->mark_event(event_str);
  }
  mdlog->submit_entry(le, fin);
}
1951 * send response built from mdr contents and error code; clean up mdr
/**
 * Send the response for a request and clean it up.
 *
 * Three cases:
 *  - batch head: the batched op object fans the result out to followers;
 *  - ordinary client request: build and send an MClientReply;
 *  - internal op: complete the attached finisher with the result code.
 *
 * NOTE(review): formatting reconstructed from a mangled extraction;
 * confirm against the repository copy.
 */
void Server::respond_to_request(MDRequestRef& mdr, int r)
{
  if (mdr->client_request) {
    if (mdr->is_batch_head()) {
      dout(20) << __func__ << " batch head " << *mdr << dendl;
      // responding releases ownership of the batch op and replies to all followers
      mdr->release_batch_op()->respond(r);
    } else {
      reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
    }
  } else if (mdr->internal_op > -1) {
    dout(10) << "respond_to_request on internal request " << mdr << dendl;
    if (!mdr->internal_op_finish)
      ceph_abort_msg("trying to respond to internal op without finisher");
    mdr->internal_op_finish->complete(r);
    mdcache->request_finish(mdr);
  }
}
1971 // statistics mds req op number and latency
/**
 * Record per-op-type request latency into the Server perf counters.
 *
 * Maps the client op code to its dedicated latency counter and feeds
 * the measured latency into it; unknown ops are logged and skipped.
 *
 * NOTE(review): the extractor dropped the `break;` lines — restored
 * here; confirm against the repository copy.
 */
void Server::perf_gather_op_latency(const cref_t<MClientRequest> &req, utime_t lat)
{
  int code = l_mdss_first;
  switch(req->get_op()) {
  case CEPH_MDS_OP_LOOKUPHASH:
    code = l_mdss_req_lookuphash_latency;
    break;
  case CEPH_MDS_OP_LOOKUPINO:
    code = l_mdss_req_lookupino_latency;
    break;
  case CEPH_MDS_OP_LOOKUPPARENT:
    code = l_mdss_req_lookupparent_latency;
    break;
  case CEPH_MDS_OP_LOOKUPNAME:
    code = l_mdss_req_lookupname_latency;
    break;
  case CEPH_MDS_OP_LOOKUP:
    code = l_mdss_req_lookup_latency;
    break;
  case CEPH_MDS_OP_LOOKUPSNAP:
    code = l_mdss_req_lookupsnap_latency;
    break;
  case CEPH_MDS_OP_GETATTR:
    code = l_mdss_req_getattr_latency;
    break;
  case CEPH_MDS_OP_SETATTR:
    code = l_mdss_req_setattr_latency;
    break;
  case CEPH_MDS_OP_SETLAYOUT:
    code = l_mdss_req_setlayout_latency;
    break;
  case CEPH_MDS_OP_SETDIRLAYOUT:
    code = l_mdss_req_setdirlayout_latency;
    break;
  case CEPH_MDS_OP_GETVXATTR:
    code = l_mdss_req_getvxattr_latency;
    break;
  case CEPH_MDS_OP_SETXATTR:
    code = l_mdss_req_setxattr_latency;
    break;
  case CEPH_MDS_OP_RMXATTR:
    code = l_mdss_req_rmxattr_latency;
    break;
  case CEPH_MDS_OP_READDIR:
    code = l_mdss_req_readdir_latency;
    break;
  case CEPH_MDS_OP_SETFILELOCK:
    code = l_mdss_req_setfilelock_latency;
    break;
  case CEPH_MDS_OP_GETFILELOCK:
    code = l_mdss_req_getfilelock_latency;
    break;
  case CEPH_MDS_OP_CREATE:
    code = l_mdss_req_create_latency;
    break;
  case CEPH_MDS_OP_OPEN:
    code = l_mdss_req_open_latency;
    break;
  case CEPH_MDS_OP_MKNOD:
    code = l_mdss_req_mknod_latency;
    break;
  case CEPH_MDS_OP_LINK:
    code = l_mdss_req_link_latency;
    break;
  case CEPH_MDS_OP_UNLINK:
    code = l_mdss_req_unlink_latency;
    break;
  case CEPH_MDS_OP_RMDIR:
    code = l_mdss_req_rmdir_latency;
    break;
  case CEPH_MDS_OP_RENAME:
    code = l_mdss_req_rename_latency;
    break;
  case CEPH_MDS_OP_MKDIR:
    code = l_mdss_req_mkdir_latency;
    break;
  case CEPH_MDS_OP_SYMLINK:
    code = l_mdss_req_symlink_latency;
    break;
  case CEPH_MDS_OP_LSSNAP:
    code = l_mdss_req_lssnap_latency;
    break;
  case CEPH_MDS_OP_MKSNAP:
    code = l_mdss_req_mksnap_latency;
    break;
  case CEPH_MDS_OP_RMSNAP:
    code = l_mdss_req_rmsnap_latency;
    break;
  case CEPH_MDS_OP_RENAMESNAP:
    code = l_mdss_req_renamesnap_latency;
    break;
  default:
    // unknown op: log once at low verbosity, record nothing
    dout(1) << ": unknown client op" << dendl;
    return;
  }
  logger->tinc(code, lat);
}
/**
 * Send an *unsafe* reply to the client before the journal commit lands.
 *
 * Bails out when early replies are disabled, the request forbids them
 * (no_early_reply), peers have journaled state, an inode was allocated
 * (the client must not learn an ino that could be lost), the source is
 * another MDS, or the op is a replay. Otherwise sends an MClientReply
 * marked unsafe, including a metadata trace when trace targets exist.
 *
 * NOTE(review): formatting reconstructed from a mangled extraction;
 * confirm against the repository copy.
 */
void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
{
  if (!g_conf()->mds_early_reply)
    return;

  if (mdr->no_early_reply) {
    dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
    return;
  }

  if (mdr->has_more() && mdr->more()->has_journaled_peers) {
    dout(10) << "early_reply - there are journaled peers, not allowed." << dendl;
    return;
  }

  if (mdr->alloc_ino) {
    dout(10) << "early_reply - allocated ino, not allowed" << dendl;
    return;
  }

  const cref_t<MClientRequest> &req = mdr->client_request;
  entity_inst_t client_inst = req->get_source_inst();
  if (client_inst.name.is_mds())
    return;

  if (req->is_replay()) {
    dout(10) << " no early reply on replay op" << dendl;
    return;
  }

  auto reply = make_message<MClientReply>(*req, 0);
  reply->set_unsafe();

  // mark xlocks "done", indicating that we are exposing uncommitted changes.
  //
  //_rename_finish() does not send dentry link/unlink message to replicas.
  // so do not set xlocks on dentries "done", the xlocks prevent dentries
  // that have projected linkages from getting new replica.
  mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);

  dout(10) << "early_reply " << reply->get_result()
	   << " (" << cpp_strerror(reply->get_result())
	   << ") " << *req << dendl;

  if (tracei || tracedn) {
    // traced objects are in the reply, so embedded cap releases for them
    // must not be processed
    if (tracei)
      mdr->cap_releases.erase(tracei->vino());
    if (tracedn)
      mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());

    set_trace_dist(reply, tracei, tracedn, mdr);
  }

  reply->set_extra_bl(mdr->reply_extra_bl);
  mds->send_message_client(reply, mdr->session);

  mdr->did_early_reply = true;

  // perf accounting: count the reply and its latency now, since the
  // final (safe) reply path skips this when did_early_reply is set
  mds->logger->inc(l_mds_reply);
  utime_t lat = ceph_clock_now() - req->get_recv_stamp();
  mds->logger->tinc(l_mds_reply_latency, lat);
  if (lat >= g_conf()->mds_op_complaint_time) {
    mds->logger->inc(l_mds_slow_reply);
  }
  if (client_inst.name.is_client()) {
    mds->sessionmap.hit_session(mdr->session);
  }
  perf_gather_op_latency(req, lat);
  dout(20) << "lat " << lat << dendl;

  mdr->mark_event("early_replied");
}
2146 * include a trace to tracei
/**
 * Send the final response built from the mdr, then clean up the request.
 *
 * Records successful write ops in the session's completed-request list
 * (setfilelock excluded: its state is MDS-memory-only, so retries must
 * re-execute), applies preallocated inos, accounts latency unless an
 * early reply already did, attaches trace metadata, and finishes the
 * request in the cache.
 *
 * NOTE(review): formatting reconstructed from a mangled extraction;
 * confirm against the repository copy.
 */
void Server::reply_client_request(MDRequestRef& mdr, const ref_t<MClientReply> &reply)
{
  ceph_assert(mdr.get());
  const cref_t<MClientRequest> &req = mdr->client_request;

  dout(7) << "reply_client_request " << reply->get_result()
	  << " (" << cpp_strerror(reply->get_result())
	  << ") " << *req << dendl;

  mdr->mark_event("replying");

  Session *session = mdr->session;

  // note successful request in session map?
  //
  // setfilelock requests are special, they only modify states in MDS memory.
  // The states get lost when MDS fails. If Client re-send a completed
  // setfilelock request, it means that client did not receive corresponding
  // setfilelock reply.  So MDS should re-execute the setfilelock request.
  if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
      reply->get_result() == 0 && session) {
    inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
    session->add_completed_request(mdr->reqid.tid, created);
    if (mdr->ls) {
      // mark session dirty so completed_requests is flushed at segment expiry
      mdr->ls->touched_sessions.insert(session->info.inst.name);
    }
  }

  // give any preallocated inos to the session
  apply_allocated_inos(mdr, session);

  // get tracei/tracedn from mdr?
  CInode *tracei = mdr->tracei;
  CDentry *tracedn = mdr->tracedn;

  bool is_replay = mdr->client_request->is_replay();
  bool did_early_reply = mdr->did_early_reply;
  entity_inst_t client_inst = req->get_source_inst();

  if (!did_early_reply && !is_replay) {
    // latency was not yet accounted by early_reply
    mds->logger->inc(l_mds_reply);
    utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
    mds->logger->tinc(l_mds_reply_latency, lat);
    if (lat >= g_conf()->mds_op_complaint_time) {
      mds->logger->inc(l_mds_slow_reply);
    }
    if (session && client_inst.name.is_client()) {
      mds->sessionmap.hit_session(session);
    }
    perf_gather_op_latency(req, lat);
    dout(20) << "lat " << lat << dendl;

    // traced objects are part of the reply; drop their embedded releases
    if (tracei)
      mdr->cap_releases.erase(tracei->vino());
    if (tracedn)
      mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
  }

  // drop non-rdlocks before replying, so that we can issue leases
  mdcache->request_drop_non_rdlocks(mdr);

  // reply at all?
  if (session && !client_inst.name.is_mds()) {
    // send reply.
    if (!did_early_reply &&   // don't issue leases if we sent an earlier reply already
	(tracei || tracedn)) {
      if (is_replay) {
	if (tracei)
	  mdcache->try_reconnect_cap(tracei, session);
      } else {
	// include metadata in reply
	set_trace_dist(reply, tracei, tracedn, mdr);
      }
    }

    // We can set the extra bl unconditionally: if it's already been sent in the
    // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
    reply->set_extra_bl(mdr->reply_extra_bl);

    reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
    mds->send_message_client(reply, session);
  }

  if (req->is_queued_for_replay() &&
      (mdr->has_completed || reply->get_result() < 0)) {
    if (reply->get_result() < 0) {
      int r = reply->get_result();
      derr << "reply_client_request: failed to replay " << *req
	   << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
      mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
    }
    mds->queue_one_replay();
  }

  // clean up request
  mdcache->request_finish(mdr);

  // take a closer look at tracei, if it happens to be a remote link
  if (tracei &&
      tracedn &&
      tracedn->get_projected_linkage()->is_remote()) {
    mdcache->eval_remote(tracedn);
  }
}
2256 * pass inode OR dentry (not both, or we may get confused)
2258 * trace is in reverse order (i.e. root inode comes last)
/**
 * Attach the metadata trace (inode, dentry, dir, ..., up to root) to a
 * client reply.
 *
 * Pass an inode OR a dentry (not both, or we may get confused); the
 * trace is encoded in reverse order (root inode comes last). May be
 * deliberately skipped for fault-injection testing.
 *
 * NOTE(review): the mdr parameter line of the signature was lost in
 * extraction and is reconstructed — confirm against the repository copy.
 */
void Server::set_trace_dist(const ref_t<MClientReply> &reply,
			    CInode *in, CDentry *dn,
			    MDRequestRef& mdr)
{
  // skip doing this for debugging purposes?
  if (g_conf()->mds_inject_traceless_reply_probability &&
      mdr->ls && !mdr->o_trunc &&
      (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability * 10000.0)) {
    dout(5) << "deliberately skipping trace for " << *reply << dendl;
    return;
  }

  // inode, dentry, dir, ..., inode
  bufferlist bl;
  mds_rank_t whoami = mds->get_nodeid();
  Session *session = mdr->session;
  snapid_t snapid = mdr->snapid;
  utime_t now = ceph_clock_now();

  dout(20) << "set_trace_dist snapid " << snapid << dendl;

  // realm: only send a snap trace for live (non-snapshot) lookups
  if (snapid == CEPH_NOSNAP) {
    SnapRealm *realm;
    if (in)
      realm = in->find_snaprealm();
    else
      realm = dn->get_dir()->get_inode()->find_snaprealm();
    reply->snapbl = realm->get_snap_trace();
    dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
  }

  // dentry, dir, inode
  if (dn) {
    reply->head.is_dentry = 1;
    CDir *dir = dn->get_dir();
    CInode *diri = dir->get_inode();

    diri->encode_inodestat(bl, session, NULL, snapid);
    dout(20) << "set_trace_dist added diri " << *diri << dendl;

#ifdef MDS_VERIFY_FRAGSTAT
    if (dir->is_complete())
      dir->verify_fragstat();
#endif
    DirStat ds;
    ds.frag = dir->get_frag();
    ds.auth = dir->get_dir_auth().first;
    // only advertise the frag distribution when we are auth and not
    // forwarding everything to the auth MDS anyway
    if (dir->is_auth() && !forward_all_requests_to_auth)
      dir->get_dist_spec(ds.dist, whoami);

    dir->encode_dirstat(bl, session->info, ds);
    dout(20) << "set_trace_dist added dir " << *dir << dendl;

    encode(dn->get_name(), bl);
    mds->locker->issue_client_lease(dn, in, mdr, now, bl);
  } else
    reply->head.is_dentry = 0;

  // inode
  if (in) {
    in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
    dout(20) << "set_trace_dist added in " << *in << dendl;
    reply->head.is_target = 1;
  } else
    reply->head.is_target = 0;

  reply->set_trace(bl);
}
/**
 * Entry point for a client MDS request.
 *
 * Validates the session, short-circuits already-completed retried or
 * replayed requests (converting inode-creating completions into lookup
 * requests so the client still gets a trace), trims the session's
 * completed-request list based on oldest_client_tid (warning when a
 * client fails to advance it), then registers the MDRequest and
 * dispatches it. Embedded cap releases are processed for non-replay
 * client requests.
 *
 * NOTE(review): many structural tokens (returns/braces) were dropped by
 * the extractor and are reconstructed — confirm against the repository.
 */
void Server::handle_client_request(const cref_t<MClientRequest> &req)
{
  dout(4) << "handle_client_request " << *req << dendl;

  if (mds->logger)
    mds->logger->inc(l_mds_request);
  if (logger)
    logger->inc(l_mdss_handle_client_request);

  if (!mdcache->is_open()) {
    dout(5) << "waiting for root" << dendl;
    mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
    return;
  }

  bool sessionclosed_isok = replay_unsafe_with_closed_session;
  // active session?
  Session *session = 0;
  if (req->get_source().is_client()) {
    session = mds->get_session(req);
    if (!session) {
      dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
    } else if ((session->is_closed() && (!mds->is_clientreplay() || !sessionclosed_isok)) ||
	       session->is_closing() ||
	       session->is_killing()) {
      dout(5) << "session closed|closing|killing, dropping" << dendl;
      session = NULL;
    }
    if (!session) {
      // keep the replay queue moving even though we drop the request
      if (req->is_queued_for_replay())
	mds->queue_one_replay();
      return;
    }
  }

  // old mdsmap?
  if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
    // send it?  hrm, this isn't ideal; they may get a lot of copies if
    // they have a high request rate.
  }

  // completed request?
  bool has_completed = false;
  if (req->is_replay() || req->get_retry_attempt()) {
    ceph_assert(session);
    inodeno_t created;
    if (session->have_completed_request(req->get_reqid().tid, &created)) {
      has_completed = true;
      if (!session->is_open())
	return;
      // Don't send traceless reply if the completed request has created
      // new inode. Treat the request as lookup request instead.
      if (req->is_replay() ||
	  ((created == inodeno_t() || !mds->is_clientreplay()) &&
	   req->get_op() != CEPH_MDS_OP_OPEN &&
	   req->get_op() != CEPH_MDS_OP_CREATE)) {
	dout(5) << "already completed " << req->get_reqid() << dendl;
	auto reply = make_message<MClientReply>(*req, 0);
	if (created != inodeno_t()) {
	  bufferlist extra;
	  encode(created, extra);
	  reply->set_extra_bl(extra);
	}
	mds->send_message_client(reply, session);

	if (req->is_queued_for_replay())
	  mds->queue_one_replay();

	return;
      }
      if (req->get_op() != CEPH_MDS_OP_OPEN &&
	  req->get_op() != CEPH_MDS_OP_CREATE) {
	dout(10) << " completed request which created new inode " << created
		 << ", convert it to lookup request" << dendl;
	req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
	req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
      }
    }
  }

  // trim completed_request list
  if (req->get_oldest_client_tid() > 0) {
    dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
    ceph_assert(session);
    if (session->trim_completed_requests(req->get_oldest_client_tid())) {
      // Sessions 'completed_requests' was dirtied, mark it to be
      // potentially flushed at segment expiry.
      mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);

      if (session->get_num_trim_requests_warnings() > 0 &&
	  session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests)
	session->reset_num_trim_requests_warnings();
    } else {
      // client is not advancing oldest_client_tid: warn with exponential
      // backoff so the log is not flooded
      if (session->get_num_completed_requests() >=
	  (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
	session->inc_num_trim_requests_warnings();
	CachedStackStringStream css;
	*css << "client." << session->get_client() << " does not advance its oldest_client_tid ("
	  << req->get_oldest_client_tid() << "), "
	  << session->get_num_completed_requests()
	  << " completed requests recorded in session\n";
	mds->clog->warn() << css->strv();
	dout(20) << __func__ << " " << css->strv() << dendl;
      }
    }
  }

  // register + dispatch
  MDRequestRef mdr = mdcache->request_start(req);
  if (!mdr.get())
    return;

  if (session) {
    mdr->session = session;
    session->requests.push_back(&mdr->item_session_request);
  }

  if (has_completed)
    mdr->has_completed = true;

  // process embedded cap releases?
  //  (only if NOT replay!)
  if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
    client_t client = req->get_source().num();
    for (const auto &r : req->releases) {
      mds->locker->process_request_cap_release(mdr, client, r.item, r.dname);
    }
    req->releases.clear();
  }

  dispatch_client_request(mdr);
}
2464 void Server::handle_osd_map()
2466 /* Note that we check the OSDMAP_FULL flag directly rather than
2467 * using osdmap_full_flag(), because we want to know "is the flag set"
2468 * rather than "does the flag apply to us?" */
2469 mds
->objecter
->with_osdmap([this](const OSDMap
& o
) {
2470 auto pi
= o
.get_pg_pool(mds
->get_metadata_pool());
2471 is_full
= pi
&& pi
->has_flag(pg_pool_t::FLAG_FULL
);
2472 dout(7) << __func__
<< ": full = " << is_full
<< " epoch = "
2473 << o
.get_epoch() << dendl
;
2477 void Server::dispatch_client_request(MDRequestRef
& mdr
)
2479 // we shouldn't be waiting on anyone.
2480 ceph_assert(!mdr
->has_more() || mdr
->more()->waiting_on_peer
.empty());
2483 dout(10) << "request " << *mdr
<< " was killed" << dendl
;
2484 //if the mdr is a "batch_op" and it has followers, pick a follower as
2485 //the new "head of the batch ops" and go on processing the new one.
2486 if (mdr
->is_batch_head()) {
2487 int mask
= mdr
->client_request
->head
.args
.getattr
.mask
;
2488 auto it
= mdr
->batch_op_map
->find(mask
);
2489 auto new_batch_head
= it
->second
->find_new_head();
2490 if (!new_batch_head
) {
2491 mdr
->batch_op_map
->erase(it
);
2494 mdr
= std::move(new_batch_head
);
2498 } else if (mdr
->aborted
) {
2499 mdr
->aborted
= false;
2500 mdcache
->request_kill(mdr
);
2504 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
2506 if (logger
) logger
->inc(l_mdss_dispatch_client_request
);
2508 dout(7) << "dispatch_client_request " << *req
<< dendl
;
2510 if (req
->may_write() && mdcache
->is_readonly()) {
2511 dout(10) << " read-only FS" << dendl
;
2512 respond_to_request(mdr
, -CEPHFS_EROFS
);
2515 if (mdr
->has_more() && mdr
->more()->peer_error
) {
2516 dout(10) << " got error from peers" << dendl
;
2517 respond_to_request(mdr
, mdr
->more()->peer_error
);
2522 CInode
*cur
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
2524 // the request is already responded to
2527 if (req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
2528 req
->get_op() == CEPH_MDS_OP_SETDIRLAYOUT
||
2529 req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
2530 req
->get_op() == CEPH_MDS_OP_RMXATTR
||
2531 req
->get_op() == CEPH_MDS_OP_SETXATTR
||
2532 req
->get_op() == CEPH_MDS_OP_CREATE
||
2533 req
->get_op() == CEPH_MDS_OP_SYMLINK
||
2534 req
->get_op() == CEPH_MDS_OP_MKSNAP
||
2535 ((req
->get_op() == CEPH_MDS_OP_LINK
||
2536 req
->get_op() == CEPH_MDS_OP_RENAME
) &&
2537 (!mdr
->has_more() || mdr
->more()->witnessed
.empty())) // haven't started peer request
2540 if (check_access(mdr
, cur
, MAY_FULL
)) {
2541 dout(20) << __func__
<< ": full, has FULL caps, permitting op " << ceph_mds_op_name(req
->get_op()) << dendl
;
2543 dout(20) << __func__
<< ": full, responding CEPHFS_ENOSPC to op " << ceph_mds_op_name(req
->get_op()) << dendl
;
2544 respond_to_request(mdr
, -CEPHFS_ENOSPC
);
2548 dout(20) << __func__
<< ": full, permitting op " << ceph_mds_op_name(req
->get_op()) << dendl
;
2552 switch (req
->get_op()) {
2553 case CEPH_MDS_OP_LOOKUPHASH
:
2554 case CEPH_MDS_OP_LOOKUPINO
:
2555 handle_client_lookup_ino(mdr
, false, false);
2557 case CEPH_MDS_OP_LOOKUPPARENT
:
2558 handle_client_lookup_ino(mdr
, true, false);
2560 case CEPH_MDS_OP_LOOKUPNAME
:
2561 handle_client_lookup_ino(mdr
, false, true);
2565 case CEPH_MDS_OP_LOOKUP
:
2566 handle_client_getattr(mdr
, true);
2569 case CEPH_MDS_OP_LOOKUPSNAP
:
2570 // lookupsnap does not reference a CDentry; treat it as a getattr
2571 case CEPH_MDS_OP_GETATTR
:
2572 handle_client_getattr(mdr
, false);
2574 case CEPH_MDS_OP_GETVXATTR
:
2575 handle_client_getvxattr(mdr
);
2578 case CEPH_MDS_OP_SETATTR
:
2579 handle_client_setattr(mdr
);
2581 case CEPH_MDS_OP_SETLAYOUT
:
2582 handle_client_setlayout(mdr
);
2584 case CEPH_MDS_OP_SETDIRLAYOUT
:
2585 handle_client_setdirlayout(mdr
);
2587 case CEPH_MDS_OP_SETXATTR
:
2588 handle_client_setxattr(mdr
);
2590 case CEPH_MDS_OP_RMXATTR
:
2591 handle_client_removexattr(mdr
);
2594 case CEPH_MDS_OP_READDIR
:
2595 handle_client_readdir(mdr
);
2598 case CEPH_MDS_OP_SETFILELOCK
:
2599 handle_client_file_setlock(mdr
);
2602 case CEPH_MDS_OP_GETFILELOCK
:
2603 handle_client_file_readlock(mdr
);
2607 case CEPH_MDS_OP_CREATE
:
2608 if (mdr
->has_completed
)
2609 handle_client_open(mdr
); // already created.. just open
2611 handle_client_openc(mdr
);
2614 case CEPH_MDS_OP_OPEN
:
2615 handle_client_open(mdr
);
2620 case CEPH_MDS_OP_MKNOD
:
2621 handle_client_mknod(mdr
);
2623 case CEPH_MDS_OP_LINK
:
2624 handle_client_link(mdr
);
2626 case CEPH_MDS_OP_UNLINK
:
2627 case CEPH_MDS_OP_RMDIR
:
2628 handle_client_unlink(mdr
);
2630 case CEPH_MDS_OP_RENAME
:
2631 handle_client_rename(mdr
);
2633 case CEPH_MDS_OP_MKDIR
:
2634 handle_client_mkdir(mdr
);
2636 case CEPH_MDS_OP_SYMLINK
:
2637 handle_client_symlink(mdr
);
2642 case CEPH_MDS_OP_LSSNAP
:
2643 handle_client_lssnap(mdr
);
2645 case CEPH_MDS_OP_MKSNAP
:
2646 handle_client_mksnap(mdr
);
2648 case CEPH_MDS_OP_RMSNAP
:
2649 handle_client_rmsnap(mdr
);
2651 case CEPH_MDS_OP_RENAMESNAP
:
2652 handle_client_renamesnap(mdr
);
2656 dout(1) << " unknown client op " << req
->get_op() << dendl
;
2657 respond_to_request(mdr
, -CEPHFS_EOPNOTSUPP
);
2662 // ---------------------------------------
/**
 * Handle an MMDSPeerRequest from the leader MDS of a multi-MDS op.
 *
 * Replies are routed to handle_peer_request_reply(). RENAMENOTIFY is
 * acked immediately (it only enforces causal message ordering). For
 * other ops, resolves attempt-number races against any existing local
 * request, processes DROPLOCKS/FINISH on existing requests, starts a
 * new peer request if needed, and dispatches it.
 *
 * NOTE(review): structural tokens dropped by the extractor were
 * reconstructed — confirm against the repository copy.
 */
void Server::handle_peer_request(const cref_t<MMDSPeerRequest> &m)
{
  dout(4) << "handle_peer_request " << m->get_reqid() << " from " << m->get_source() << dendl;
  mds_rank_t from = mds_rank_t(m->get_source().num());

  if (logger) logger->inc(l_mdss_handle_peer_request);

  // reply?
  if (m->is_reply())
    return handle_peer_request_reply(m);

  // the purpose of rename notify is enforcing causal message ordering. making sure
  // bystanders have received all messages from rename srcdn's auth MDS.
  if (m->get_op() == MMDSPeerRequest::OP_RENAMENOTIFY) {
    auto reply = make_message<MMDSPeerRequest>(m->get_reqid(), m->get_attempt(), MMDSPeerRequest::OP_RENAMENOTIFYACK);
    mds->send_message(reply, m->get_connection());
    return;
  }

  CDentry *straydn = NULL;
  if (m->straybl.length() > 0) {
    mdcache->decode_replica_stray(straydn, nullptr, m->straybl, from);
    ceph_assert(straydn);
    m->straybl.clear();
  }

  if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
    dout(3) << "not clientreplay|active yet, waiting" << dendl;
    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
    return;
  }

  // am i a new peer?
  MDRequestRef mdr;
  if (mdcache->have_request(m->get_reqid())) {
    // existing?
    mdr = mdcache->request_get(m->get_reqid());

    // is my request newer?
    if (mdr->attempt > m->get_attempt()) {
      dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
	       << ", dropping " << *m << dendl;
      return;
    }

    if (mdr->attempt < m->get_attempt()) {
      // mine is old, close it out
      dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
	       << ", closing out" << dendl;
      mdcache->request_finish(mdr);
      mdr.reset();
    } else if (mdr->peer_to_mds != from) {
      dout(10) << "local request " << *mdr << " not peer to mds." << from << dendl;
      return;
    }

    // may get these while mdr->peer_request is non-null
    if (m->get_op() == MMDSPeerRequest::OP_DROPLOCKS) {
      mds->locker->drop_locks(mdr.get());
      return;
    }
    if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
      if (m->is_abort()) {
	mdr->aborted = true;
	if (mdr->peer_request) {
	  // only abort on-going xlock, wrlock and auth pin
	  ceph_assert(!mdr->peer_did_prepare());
	} else {
	  mdcache->request_finish(mdr);
	}
      } else {
	if (m->inode_export.length() > 0)
	  mdr->more()->inode_import = m->inode_export;
	// finish off request.
	mdcache->request_finish(mdr);
      }
      return;
    }
  }
  if (!mdr) {
    // new?
    if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
      dout(10) << "missing peer request for " << m->get_reqid()
	       << " OP_FINISH, must have lost race with a forward" << dendl;
      return;
    }
    mdr = mdcache->request_start_peer(m->get_reqid(), m->get_attempt(), m);
    mdr->set_op_stamp(m->op_stamp);
  }
  ceph_assert(mdr->peer_request == 0);     // only one at a time, please!

  if (straydn) {
    mdr->pin(straydn);
    mdr->straydn = straydn;
  }

  if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
      mdr->locks.empty()) {
    dout(3) << "not active yet, waiting" << dendl;
    mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
    return;
  }

  mdr->reset_peer_request(m);

  dispatch_peer_request(mdr);
}
/**
 * Handle a reply from a peer MDS to one of our leader-side requests.
 *
 * Ignores replies for unknown uncommitted leader ops while not yet
 * active; records remote xlock/wrlock grants on the mdr and re-drives
 * the request once all awaited peers have answered; fans the remaining
 * ack types out to their dedicated handlers.
 *
 * NOTE(review): structural tokens dropped by the extractor were
 * reconstructed — confirm against the repository copy.
 */
void Server::handle_peer_request_reply(const cref_t<MMDSPeerRequest> &m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());

  if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
    metareqid_t r = m->get_reqid();
    if (!mdcache->have_uncommitted_leader(r, from)) {
      dout(10) << "handle_peer_request_reply ignoring peer reply from mds."
	       << from << " reqid " << r << dendl;
      return;
    }
    dout(3) << "not clientreplay|active yet, waiting" << dendl;
    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
    return;
  }

  if (m->get_op() == MMDSPeerRequest::OP_COMMITTED) {
    metareqid_t r = m->get_reqid();
    mdcache->committed_leader_peer(r, from);
    return;
  }

  MDRequestRef mdr = mdcache->request_get(m->get_reqid());
  if (m->get_attempt() != mdr->attempt) {
    // stale reply from a previous attempt of this request
    dout(10) << "handle_peer_request_reply " << *mdr << " ignoring reply from other attempt "
	     << m->get_attempt() << dendl;
    return;
  }

  switch (m->get_op()) {
  case MMDSPeerRequest::OP_XLOCKACK:
    {
      // identify lock, leader request
      SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
					       m->get_object_info());
      mdr->more()->peers.insert(from);
      lock->decode_locked_state(m->get_lock_data());
      dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
      mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
      mdr->finish_locking(lock);
      lock->get_xlock(mdr, mdr->get_client());

      ceph_assert(mdr->more()->waiting_on_peer.count(from));
      mdr->more()->waiting_on_peer.erase(from);
      ceph_assert(mdr->more()->waiting_on_peer.empty());
      mdcache->dispatch_request(mdr);
    }
    break;

  case MMDSPeerRequest::OP_WRLOCKACK:
    {
      // identify lock, leader request
      SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
					       m->get_object_info());
      mdr->more()->peers.insert(from);
      dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
      auto it = mdr->emplace_lock(lock, MutationImpl::LockOp::REMOTE_WRLOCK, from);
      ceph_assert(it->is_remote_wrlock());
      ceph_assert(it->wrlock_target == from);

      mdr->finish_locking(lock);

      ceph_assert(mdr->more()->waiting_on_peer.count(from));
      mdr->more()->waiting_on_peer.erase(from);
      ceph_assert(mdr->more()->waiting_on_peer.empty());
      mdcache->dispatch_request(mdr);
    }
    break;

  case MMDSPeerRequest::OP_AUTHPINACK:
    handle_peer_auth_pin_ack(mdr, m);
    break;

  case MMDSPeerRequest::OP_LINKPREPACK:
    handle_peer_link_prep_ack(mdr, m);
    break;

  case MMDSPeerRequest::OP_RMDIRPREPACK:
    handle_peer_rmdir_prep_ack(mdr, m);
    break;

  case MMDSPeerRequest::OP_RENAMEPREPACK:
    handle_peer_rename_prep_ack(mdr, m);
    break;

  case MMDSPeerRequest::OP_RENAMENOTIFYACK:
    handle_peer_rename_notify_ack(mdr, m);
    break;

  default:
    ceph_abort();
  }
}
/**
 * Execute the operation carried by a peer request on this (peer) MDS.
 *
 * XLOCK/WRLOCK: acquire the named lock locally (via acquire_locks so we
 * also get auth-pinning), then ack the leader, including the locked
 * state for xlocks. UNXLOCK/UNWRLOCK: release the lock; no ack needed.
 * The remaining ops are fanned out to their prep handlers.
 *
 * NOTE(review): structural tokens dropped by the extractor were
 * reconstructed — confirm against the repository copy.
 */
void Server::dispatch_peer_request(MDRequestRef& mdr)
{
  dout(7) << "dispatch_peer_request " << *mdr << " " << *mdr->peer_request << dendl;

  if (mdr->aborted) {
    dout(7) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);
    return;
  }

  if (logger) logger->inc(l_mdss_dispatch_peer_request);

  int op = mdr->peer_request->get_op();
  switch (op) {
  case MMDSPeerRequest::OP_XLOCK:
  case MMDSPeerRequest::OP_WRLOCK:
    {
      // identify object
      SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
					       mdr->peer_request->get_object_info());

      if (!lock) {
	dout(10) << "don't have object, dropping" << dendl;
	ceph_abort(); // can this happen, if we auth pinned properly.
      }
      if (op == MMDSPeerRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
	dout(10) << "not auth for remote xlock attempt, dropping on "
		 << *lock << " on " << *lock->get_parent() << dendl;
      } else {
	// use acquire_locks so that we get auth_pinning.
	MutationImpl::LockOpVec lov;
	for (const auto& p : mdr->locks) {
	  if (p.is_xlock())
	    lov.add_xlock(p.lock);
	  else if (p.is_wrlock())
	    lov.add_wrlock(p.lock);
	}

	int replycode = 0;
	switch (op) {
	case MMDSPeerRequest::OP_XLOCK:
	  lov.add_xlock(lock);
	  replycode = MMDSPeerRequest::OP_XLOCKACK;
	  break;
	case MMDSPeerRequest::OP_WRLOCK:
	  lov.add_wrlock(lock);
	  replycode = MMDSPeerRequest::OP_WRLOCKACK;
	  break;
	}

	if (!mds->locker->acquire_locks(mdr, lov))
	  return;

	// ack
	auto r = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, replycode);
	r->set_lock_type(lock->get_type());
	lock->get_parent()->set_object_info(r->get_object_info());
	if (replycode == MMDSPeerRequest::OP_XLOCKACK)
	  lock->encode_locked_state(r->get_lock_data());
	mds->send_message(r, mdr->peer_request->get_connection());
      }

      // done.
      mdr->reset_peer_request();
    }
    break;

  case MMDSPeerRequest::OP_UNXLOCK:
  case MMDSPeerRequest::OP_UNWRLOCK:
    {
      SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
					       mdr->peer_request->get_object_info());
      ceph_assert(lock);
      auto it = mdr->locks.find(lock);
      ceph_assert(it != mdr->locks.end());
      bool need_issue = false;
      switch (op) {
      case MMDSPeerRequest::OP_UNXLOCK:
	mds->locker->xlock_finish(it, mdr.get(), &need_issue);
	break;
      case MMDSPeerRequest::OP_UNWRLOCK:
	mds->locker->wrlock_finish(it, mdr.get(), &need_issue);
	break;
      }
      if (need_issue)
	mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));

      // done.  no ack necessary.
      mdr->reset_peer_request();
    }
    break;

  case MMDSPeerRequest::OP_AUTHPIN:
    handle_peer_auth_pin(mdr);
    break;

  case MMDSPeerRequest::OP_LINKPREP:
  case MMDSPeerRequest::OP_UNLINKPREP:
    handle_peer_link_prep(mdr);
    break;

  case MMDSPeerRequest::OP_RMDIRPREP:
    handle_peer_rmdir_prep(mdr);
    break;

  case MMDSPeerRequest::OP_RENAMEPREP:
    handle_peer_rename_prep(mdr);
    break;

  default:
    ceph_abort();
  }
}
2981 void Server::handle_peer_auth_pin(MDRequestRef
& mdr
)
2983 dout(10) << "handle_peer_auth_pin " << *mdr
<< dendl
;
2985 // build list of objects
2986 list
<MDSCacheObject
*> objects
;
2987 CInode
*auth_pin_freeze
= NULL
;
2988 bool nonblocking
= mdr
->peer_request
->is_nonblocking();
2989 bool fail
= false, wouldblock
= false, readonly
= false;
2990 ref_t
<MMDSPeerRequest
> reply
;
2992 if (mdcache
->is_readonly()) {
2993 dout(10) << " read-only FS" << dendl
;
2999 for (const auto &oi
: mdr
->peer_request
->get_authpins()) {
3000 MDSCacheObject
*object
= mdcache
->get_object(oi
);
3002 dout(10) << " don't have " << oi
<< dendl
;
3007 objects
.push_back(object
);
3008 if (oi
== mdr
->peer_request
->get_authpin_freeze())
3009 auth_pin_freeze
= static_cast<CInode
*>(object
);
3013 // can we auth pin them?
3015 for (const auto& obj
: objects
) {
3016 if (!obj
->is_auth()) {
3017 dout(10) << " not auth for " << *obj
<< dendl
;
3021 if (mdr
->is_auth_pinned(obj
))
3023 if (!mdr
->can_auth_pin(obj
)) {
3025 dout(10) << " can't auth_pin (freezing?) " << *obj
<< " nonblocking" << dendl
;
3031 dout(10) << " waiting for authpinnable on " << *obj
<< dendl
;
3032 obj
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
3033 mdr
->drop_local_auth_pins();
3035 mds
->locker
->notify_freeze_waiter(obj
);
3042 /* freeze authpin wrong inode */
3043 if (mdr
->has_more() && mdr
->more()->is_freeze_authpin
&&
3044 mdr
->more()->rename_inode
!= auth_pin_freeze
)
3045 mdr
->unfreeze_auth_pin(true);
3047 /* handle_peer_rename_prep() call freeze_inode() to wait for all other operations
3048 * on the source inode to complete. This happens after all locks for the rename
3049 * operation are acquired. But to acquire locks, we need auth pin locks' parent
3050 * objects first. So there is an ABBA deadlock if someone auth pins the source inode
3051 * after locks are acquired and before Server::handle_peer_rename_prep() is called.
3052 * The solution is freeze the inode and prevent other MDRequests from getting new
3055 if (auth_pin_freeze
) {
3056 dout(10) << " freezing auth pin on " << *auth_pin_freeze
<< dendl
;
3057 if (!mdr
->freeze_auth_pin(auth_pin_freeze
)) {
3058 auth_pin_freeze
->add_waiter(CInode::WAIT_FROZEN
, new C_MDS_RetryRequest(mdcache
, mdr
));
3059 mds
->mdlog
->flush();
3065 reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_AUTHPINACK
);
3068 mdr
->drop_local_auth_pins(); // just in case
3070 reply
->mark_error_rofs();
3072 reply
->mark_error_wouldblock();
3075 for (const auto& obj
: objects
) {
3076 dout(10) << "auth_pinning " << *obj
<< dendl
;
3079 // return list of my auth_pins (if any)
3080 for (const auto &p
: mdr
->object_states
) {
3081 if (!p
.second
.auth_pinned
)
3083 MDSCacheObjectInfo info
;
3084 p
.first
->set_object_info(info
);
3085 reply
->get_authpins().push_back(info
);
3086 if (p
.first
== (MDSCacheObject
*)auth_pin_freeze
)
3087 auth_pin_freeze
->set_object_info(reply
->get_authpin_freeze());
3091 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
3093 // clean up this request
3094 mdr
->reset_peer_request();
3098 if (mdr
->peer_request
->should_notify_blocking()) {
3099 reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_AUTHPINACK
);
3100 reply
->mark_req_blocked();
3101 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
3102 mdr
->peer_request
->clear_notify_blocking();
3107 void Server::handle_peer_auth_pin_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &ack
)
3109 dout(10) << "handle_peer_auth_pin_ack on " << *mdr
<< " " << *ack
<< dendl
;
3110 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
3112 if (ack
->is_req_blocked()) {
3113 mdr
->disable_lock_cache();
3114 // peer auth pin is blocked, drop locks to avoid deadlock
3115 mds
->locker
->drop_locks(mdr
.get(), nullptr);
3120 set
<MDSCacheObject
*> pinned
;
3121 for (const auto &oi
: ack
->get_authpins()) {
3122 MDSCacheObject
*object
= mdcache
->get_object(oi
);
3123 ceph_assert(object
); // we pinned it
3124 dout(10) << " remote has pinned " << *object
<< dendl
;
3125 mdr
->set_remote_auth_pinned(object
, from
);
3126 if (oi
== ack
->get_authpin_freeze())
3127 mdr
->set_remote_frozen_auth_pin(static_cast<CInode
*>(object
));
3128 pinned
.insert(object
);
3131 // removed frozen auth pin ?
3132 if (mdr
->more()->is_remote_frozen_authpin
&&
3133 ack
->get_authpin_freeze() == MDSCacheObjectInfo()) {
3134 auto stat_p
= mdr
->find_object_state(mdr
->more()->rename_inode
);
3135 ceph_assert(stat_p
);
3136 if (stat_p
->remote_auth_pinned
== from
) {
3137 mdr
->more()->is_remote_frozen_authpin
= false;
3141 // removed auth pins?
3142 for (auto& p
: mdr
->object_states
) {
3143 if (p
.second
.remote_auth_pinned
== MDS_RANK_NONE
)
3145 MDSCacheObject
* object
= p
.first
;
3146 if (p
.second
.remote_auth_pinned
== from
&& pinned
.count(object
) == 0) {
3147 dout(10) << " remote has unpinned " << *object
<< dendl
;
3148 mdr
->_clear_remote_auth_pinned(p
.second
);
3153 mdr
->more()->peers
.insert(from
);
3155 // clear from waiting list
3156 auto ret
= mdr
->more()->waiting_on_peer
.erase(from
);
3159 if (ack
->is_error_rofs()) {
3160 mdr
->more()->peer_error
= -CEPHFS_EROFS
;
3161 } else if (ack
->is_error_wouldblock()) {
3162 mdr
->more()->peer_error
= -CEPHFS_EWOULDBLOCK
;
3166 if (mdr
->more()->waiting_on_peer
.empty())
3167 mdcache
->dispatch_request(mdr
);
3169 dout(10) << "still waiting on peers " << mdr
->more()->waiting_on_peer
<< dendl
;
3173 // ---------------------------------------
3178 * check whether we are permitted to complete a request
3180 * Check whether we have permission to perform the operation specified
3181 * by mask on the given inode, based on the capability in the mdr's
3184 bool Server::check_access(MDRequestRef
& mdr
, CInode
*in
, unsigned mask
)
3187 int r
= mdr
->session
->check_access(
3189 mdr
->client_request
->get_caller_uid(),
3190 mdr
->client_request
->get_caller_gid(),
3191 &mdr
->client_request
->get_caller_gid_list(),
3192 mdr
->client_request
->head
.args
.setattr
.uid
,
3193 mdr
->client_request
->head
.args
.setattr
.gid
);
3195 respond_to_request(mdr
, r
);
3203 * check whether fragment has reached maximum size
3206 bool Server::check_fragment_space(MDRequestRef
&mdr
, CDir
*dir
)
3208 const auto size
= dir
->get_frag_size();
3209 const auto max
= bal_fragment_size_max
;
3211 dout(10) << "fragment " << *dir
<< " size exceeds " << max
<< " (CEPHFS_ENOSPC)" << dendl
;
3212 respond_to_request(mdr
, -CEPHFS_ENOSPC
);
3215 dout(20) << "fragment " << *dir
<< " size " << size
<< " < " << max
<< dendl
;
3222 * check whether entries in a dir reached maximum size
3225 bool Server::check_dir_max_entries(MDRequestRef
&mdr
, CDir
*in
)
3227 const uint64_t size
= in
->inode
->get_projected_inode()->dirstat
.nfiles
+
3228 in
->inode
->get_projected_inode()->dirstat
.nsubdirs
;
3229 if (dir_max_entries
&& size
>= dir_max_entries
) {
3230 dout(10) << "entries per dir " << *in
<< " size exceeds " << dir_max_entries
<< " (ENOSPC)" << dendl
;
3231 respond_to_request(mdr
, -ENOSPC
);
3238 CDentry
* Server::prepare_stray_dentry(MDRequestRef
& mdr
, CInode
*in
)
3241 in
->name_stray_dentry(straydname
);
3243 CDentry
*straydn
= mdr
->straydn
;
3245 ceph_assert(straydn
->get_name() == straydname
);
3248 CDir
*straydir
= mdcache
->get_stray_dir(in
);
3250 if (!mdr
->client_request
->is_replay() &&
3251 !check_fragment_space(mdr
, straydir
))
3254 straydn
= straydir
->lookup(straydname
);
3256 if (straydir
->is_frozen_dir()) {
3257 dout(10) << __func__
<< ": " << *straydir
<< " is frozen, waiting" << dendl
;
3258 mds
->locker
->drop_locks(mdr
.get());
3259 mdr
->drop_local_auth_pins();
3260 straydir
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
3263 straydn
= straydir
->add_null_dentry(straydname
);
3264 straydn
->mark_new();
3266 ceph_assert(straydn
->get_projected_linkage()->is_null());
3269 straydn
->state_set(CDentry::STATE_STRAY
);
3270 mdr
->straydn
= straydn
;
3276 /** prepare_new_inode
3278 * create a new inode. set c/m/atime. hit dir pop.
3280 CInode
* Server::prepare_new_inode(MDRequestRef
& mdr
, CDir
*dir
, inodeno_t useino
, unsigned mode
,
3281 const file_layout_t
*layout
)
3283 CInode
*in
= new CInode(mdcache
);
3284 auto _inode
= in
->_get_inode();
3286 // Server::prepare_force_open_sessions() can re-open session in closing
3287 // state. In that corner case, session's prealloc_inos are being freed.
3288 // To simplify the code, we disallow using/refilling session's prealloc_ino
3289 // while session is opening.
3290 bool allow_prealloc_inos
= mdr
->session
->is_open();
3293 if (allow_prealloc_inos
&& (mdr
->used_prealloc_ino
= _inode
->ino
= mdr
->session
->take_ino(useino
))) {
3294 mds
->sessionmap
.mark_projected(mdr
->session
);
3295 dout(10) << "prepare_new_inode used_prealloc " << mdr
->used_prealloc_ino
3296 << " (" << mdr
->session
->info
.prealloc_inos
.size() << " left)"
3300 _inode
->ino
= mds
->inotable
->project_alloc_id(useino
);
3301 dout(10) << "prepare_new_inode alloc " << mdr
->alloc_ino
<< dendl
;
3304 if (useino
&& useino
!= _inode
->ino
) {
3305 dout(0) << "WARNING: client specified " << useino
<< " and i allocated " << _inode
->ino
<< dendl
;
3306 mds
->clog
->error() << mdr
->client_request
->get_source()
3307 << " specified ino " << useino
3308 << " but mds." << mds
->get_nodeid() << " allocated " << _inode
->ino
;
3309 //ceph_abort(); // just for now.
3312 if (allow_prealloc_inos
&&
3313 mdr
->session
->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos
/ 2) {
3314 int need
= g_conf()->mds_client_prealloc_inos
- mdr
->session
->get_num_projected_prealloc_inos();
3315 mds
->inotable
->project_alloc_ids(mdr
->prealloc_inos
, need
);
3316 ceph_assert(mdr
->prealloc_inos
.size()); // or else fix projected increment semantics
3317 mdr
->session
->pending_prealloc_inos
.insert(mdr
->prealloc_inos
);
3318 mds
->sessionmap
.mark_projected(mdr
->session
);
3319 dout(10) << "prepare_new_inode prealloc " << mdr
->prealloc_inos
<< dendl
;
3322 _inode
->version
= 1;
3323 _inode
->xattr_version
= 1;
3324 _inode
->nlink
= 1; // FIXME
3326 _inode
->mode
= mode
;
3328 // FIPS zeroization audit 20191117: this memset is not security related.
3329 memset(&_inode
->dir_layout
, 0, sizeof(_inode
->dir_layout
));
3330 if (_inode
->is_dir()) {
3331 _inode
->dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
3332 } else if (layout
) {
3333 _inode
->layout
= *layout
;
3335 _inode
->layout
= mdcache
->default_file_layout
;
3338 _inode
->truncate_size
= -1ull; // not truncated, yet!
3339 _inode
->truncate_seq
= 1; /* starting with 1, 0 is kept for no-truncation logic */
3341 CInode
*diri
= dir
->get_inode();
3342 auto pip
= diri
->get_projected_inode();
3344 dout(10) << oct
<< " dir mode 0" << pip
->mode
<< " new mode 0" << mode
<< dec
<< dendl
;
3346 if (pip
->mode
& S_ISGID
) {
3347 dout(10) << " dir is sticky" << dendl
;
3348 _inode
->gid
= pip
->gid
;
3349 if (S_ISDIR(mode
)) {
3350 dout(10) << " new dir also sticky" << dendl
;
3351 _inode
->mode
|= S_ISGID
;
3354 _inode
->gid
= mdr
->client_request
->get_caller_gid();
3357 _inode
->uid
= mdr
->client_request
->get_caller_uid();
3359 _inode
->btime
= _inode
->ctime
= _inode
->mtime
= _inode
->atime
=
3360 mdr
->get_op_stamp();
3362 _inode
->change_attr
= 0;
3364 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
3365 if (req
->get_data().length()) {
3366 auto p
= req
->get_data().cbegin();
3368 // xattrs on new inode?
3369 auto _xattrs
= CInode::allocate_xattr_map();
3370 decode_noshare(*_xattrs
, p
);
3371 dout(10) << "prepare_new_inode setting xattrs " << *_xattrs
<< dendl
;
3372 if (_xattrs
->count("encryption.ctx")) {
3373 _inode
->fscrypt
= true;
3375 in
->reset_xattrs(std::move(_xattrs
));
3378 if (!mds
->mdsmap
->get_inline_data_enabled() ||
3379 !mdr
->session
->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
))
3380 _inode
->inline_data
.version
= CEPH_INLINE_NONE
;
3382 mdcache
->add_inode(in
); // add
3383 dout(10) << "prepare_new_inode " << *in
<< dendl
;
3387 void Server::journal_allocated_inos(MDRequestRef
& mdr
, EMetaBlob
*blob
)
3389 dout(20) << "journal_allocated_inos sessionmapv " << mds
->sessionmap
.get_projected()
3390 << " inotablev " << mds
->inotable
->get_projected_version()
3392 blob
->set_ino_alloc(mdr
->alloc_ino
,
3393 mdr
->used_prealloc_ino
,
3395 mdr
->client_request
->get_source(),
3396 mds
->sessionmap
.get_projected(),
3397 mds
->inotable
->get_projected_version());
3400 void Server::apply_allocated_inos(MDRequestRef
& mdr
, Session
*session
)
3402 dout(10) << "apply_allocated_inos " << mdr
->alloc_ino
3403 << " / " << mdr
->prealloc_inos
3404 << " / " << mdr
->used_prealloc_ino
<< dendl
;
3406 if (mdr
->alloc_ino
) {
3407 mds
->inotable
->apply_alloc_id(mdr
->alloc_ino
);
3409 if (mdr
->prealloc_inos
.size()) {
3410 ceph_assert(session
);
3411 session
->pending_prealloc_inos
.subtract(mdr
->prealloc_inos
);
3412 session
->free_prealloc_inos
.insert(mdr
->prealloc_inos
);
3413 session
->info
.prealloc_inos
.insert(mdr
->prealloc_inos
);
3414 mds
->sessionmap
.mark_dirty(session
, !mdr
->used_prealloc_ino
);
3415 mds
->inotable
->apply_alloc_ids(mdr
->prealloc_inos
);
3417 if (mdr
->used_prealloc_ino
) {
3418 ceph_assert(session
);
3419 session
->info
.prealloc_inos
.erase(mdr
->used_prealloc_ino
);
3420 mds
->sessionmap
.mark_dirty(session
);
3424 struct C_MDS_TryOpenInode
: public ServerContext
{
3427 C_MDS_TryOpenInode(Server
*s
, MDRequestRef
& r
, inodeno_t i
) :
3428 ServerContext(s
), mdr(r
), ino(i
) {}
3429 void finish(int r
) override
{
3430 server
->_try_open_ino(mdr
, r
, ino
);
3434 void Server::_try_open_ino(MDRequestRef
& mdr
, int r
, inodeno_t ino
)
3436 dout(10) << "_try_open_ino " << mdr
.get() << " ino " << ino
<< " r=" << r
<< dendl
;
3438 // `r` is a rank if >=0, else an error code
3440 mds_rank_t
dest_rank(r
);
3441 if (dest_rank
== mds
->get_nodeid())
3442 dispatch_client_request(mdr
);
3444 mdcache
->request_forward(mdr
, dest_rank
);
3449 if (r
== -CEPHFS_ENOENT
|| r
== -CEPHFS_ENODATA
)
3451 respond_to_request(mdr
, r
);
3454 class C_MDS_TryFindInode
: public ServerContext
{
3459 C_MDS_TryFindInode(Server
*s
, MDRequestRef
& r
, MDCache
*m
, inodeno_t i
) :
3460 ServerContext(s
), mdr(r
), mdcache(m
), ino(i
) {}
3461 void finish(int r
) override
{
3462 if (r
== -CEPHFS_ESTALE
) { // :( find_ino_peers failed
3464 * There has one case that when the MDS crashes and the
3465 * openfiletable journal couldn't be flushed and then
3466 * the replacing MDS is possibly won't load some already
3467 * opened CInodes into the MDCache. And if the clients
3468 * will retry some requests after reconnected, the MDS
3469 * will return -ESTALE after failing to find the ino in
3472 * As a workaround users can run `ls -R ${mountpoint}`
3473 * to list all the sub-files or sub-direcotries from the
3476 * We need try to open the ino and try it again.
3478 CInode
*in
= mdcache
->get_inode(ino
);
3479 if (in
&& in
->state_test(CInode::STATE_PURGING
))
3480 server
->respond_to_request(mdr
, r
);
3482 mdcache
->open_ino(ino
, (int64_t)-1, new C_MDS_TryOpenInode(server
, mdr
, ino
));
3484 server
->dispatch_client_request(mdr
);
3489 /* If this returns null, the request has been handled
3490 * as appropriate: forwarded on, or the client's been replied to */
3491 CInode
* Server::rdlock_path_pin_ref(MDRequestRef
& mdr
,
3495 const filepath
& refpath
= mdr
->get_filepath();
3496 dout(10) << "rdlock_path_pin_ref " << *mdr
<< " " << refpath
<< dendl
;
3498 if (mdr
->locking_state
& MutationImpl::PATH_LOCKED
)
3502 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, true);
3504 if (refpath
.is_last_snap()) {
3508 if (!no_want_auth
&& forward_all_requests_to_auth
)
3510 flags
|= MDS_TRAVERSE_RDLOCK_PATH
| MDS_TRAVERSE_RDLOCK_SNAP
;
3513 flags
|= MDS_TRAVERSE_WANT_AUTH
;
3514 int r
= mdcache
->path_traverse(mdr
, cf
, refpath
, flags
, &mdr
->dn
[0], &mdr
->in
[0]);
3516 return nullptr; // delayed
3517 if (r
< 0) { // error
3518 if (r
== -CEPHFS_ENOENT
&& !mdr
->dn
[0].empty()) {
3519 if (mdr
->client_request
&&
3520 mdr
->client_request
->get_dentry_wanted())
3521 mdr
->tracedn
= mdr
->dn
[0].back();
3522 respond_to_request(mdr
, r
);
3523 } else if (r
== -CEPHFS_ESTALE
) {
3524 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl
;
3525 inodeno_t ino
= refpath
.get_ino();
3526 mdcache
->find_ino_peers(ino
, new C_MDS_TryFindInode(this, mdr
, mdcache
, ino
));
3528 dout(10) << "FAIL on error " << r
<< dendl
;
3529 respond_to_request(mdr
, r
);
3533 CInode
*ref
= mdr
->in
[0];
3534 dout(10) << "ref is " << *ref
<< dendl
;
3538 // do NOT proceed if freezing, as cap release may defer in that case, and
3539 // we could deadlock when we try to lock @ref.
3540 // if we're already auth_pinned, continue; the release has already been processed.
3541 if (ref
->is_frozen() || ref
->is_frozen_auth_pin() ||
3542 (ref
->is_freezing() && !mdr
->is_auth_pinned(ref
))) {
3543 dout(7) << "waiting for !frozen/authpinnable on " << *ref
<< dendl
;
3544 ref
->add_waiter(CInode::WAIT_UNFREEZE
, cf
.build());
3545 if (mdr
->is_any_remote_auth_pin())
3546 mds
->locker
->notify_freeze_waiter(ref
);
3558 /** rdlock_path_xlock_dentry
3559 * traverse path to the directory that could/would contain dentry.
3560 * make sure i am auth for that dentry, forward as necessary.
3561 * create null dentry in place (or use existing if okexist).
3562 * get rdlocks on traversed dentries, xlock on new dentry.
3564 CDentry
* Server::rdlock_path_xlock_dentry(MDRequestRef
& mdr
,
3565 bool create
, bool okexist
, bool want_layout
)
3567 const filepath
& refpath
= mdr
->get_filepath();
3568 dout(10) << "rdlock_path_xlock_dentry " << *mdr
<< " " << refpath
<< dendl
;
3570 if (mdr
->locking_state
& MutationImpl::PATH_LOCKED
)
3571 return mdr
->dn
[0].back();
3573 // figure parent dir vs dname
3574 if (refpath
.depth() == 0) {
3575 dout(7) << "invalid path (zero length)" << dendl
;
3576 respond_to_request(mdr
, -CEPHFS_EINVAL
);
3580 if (refpath
.is_last_snap()) {
3581 respond_to_request(mdr
, -CEPHFS_EROFS
);
3585 if (refpath
.is_last_dot_or_dotdot()) {
3586 dout(7) << "invalid path (last dot or dot_dot)" << dendl
;
3588 respond_to_request(mdr
, -CEPHFS_EEXIST
);
3590 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
3594 // traverse to parent dir
3595 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, true);
3596 int flags
= MDS_TRAVERSE_RDLOCK_SNAP
| MDS_TRAVERSE_RDLOCK_PATH
|
3597 MDS_TRAVERSE_WANT_DENTRY
| MDS_TRAVERSE_XLOCK_DENTRY
|
3598 MDS_TRAVERSE_WANT_AUTH
;
3599 if (refpath
.depth() == 1 && !mdr
->lock_cache_disabled
)
3600 flags
|= MDS_TRAVERSE_CHECK_LOCKCACHE
;
3602 flags
|= MDS_TRAVERSE_RDLOCK_AUTHLOCK
;
3604 flags
|= MDS_TRAVERSE_WANT_DIRLAYOUT
;
3605 int r
= mdcache
->path_traverse(mdr
, cf
, refpath
, flags
, &mdr
->dn
[0]);
3607 return nullptr; // delayed
3609 if (r
== -CEPHFS_ESTALE
) {
3610 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl
;
3611 inodeno_t ino
= refpath
.get_ino();
3612 mdcache
->find_ino_peers(ino
, new C_MDS_TryFindInode(this, mdr
, mdcache
, ino
));
3615 respond_to_request(mdr
, r
);
3619 CDentry
*dn
= mdr
->dn
[0].back();
3620 CDir
*dir
= dn
->get_dir();
3621 CInode
*diri
= dir
->get_inode();
3623 if (!mdr
->reqid
.name
.is_mds()) {
3624 if (diri
->is_system() && !diri
->is_root()) {
3625 respond_to_request(mdr
, -CEPHFS_EROFS
);
3630 if (!diri
->is_base() && diri
->get_projected_parent_dir()->inode
->is_stray()) {
3631 respond_to_request(mdr
, -CEPHFS_ENOENT
);
3635 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
3636 if (dnl
->is_null()) {
3637 if (!create
&& okexist
) {
3638 respond_to_request(mdr
, -CEPHFS_ENOENT
);
3642 snapid_t next_snap
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
3643 dn
->first
= std::max(dn
->first
, next_snap
);
3646 respond_to_request(mdr
, -CEPHFS_EEXIST
);
3649 mdr
->in
[0] = dnl
->get_inode();
3655 /** rdlock_two_paths_xlock_destdn
3656 * traverse two paths and lock the two paths in proper order.
3657 * The order of taking locks is:
3658 * 1. Lock directory inodes or dentries according to which trees they
3659 * are under. Lock objects under fs root before objects under mdsdir.
3660 * 2. Lock directory inodes or dentries according to their depth, in
3662 * 3. Lock directory inodes or dentries according to inode numbers or
3663 * dentries' parent inode numbers, in ascending order.
3664 * 4. Lock dentries in the same directory in order of their keys.
3665 * 5. Lock non-directory inodes according to inode numbers, in ascending
3668 std::pair
<CDentry
*, CDentry
*>
3669 Server::rdlock_two_paths_xlock_destdn(MDRequestRef
& mdr
, bool xlock_srcdn
)
3672 const filepath
& refpath
= mdr
->get_filepath();
3673 const filepath
& refpath2
= mdr
->get_filepath2();
3675 dout(10) << "rdlock_two_paths_xlock_destdn " << *mdr
<< " " << refpath
<< " " << refpath2
<< dendl
;
3677 if (mdr
->locking_state
& MutationImpl::PATH_LOCKED
)
3678 return std::make_pair(mdr
->dn
[0].back(), mdr
->dn
[1].back());
3680 if (refpath
.depth() != 1 || refpath2
.depth() != 1) {
3681 respond_to_request(mdr
, -CEPHFS_EINVAL
);
3682 return std::pair
<CDentry
*, CDentry
*>(nullptr, nullptr);
3685 if (refpath
.is_last_snap() || refpath2
.is_last_snap()) {
3686 respond_to_request(mdr
, -CEPHFS_EROFS
);
3687 return std::make_pair(nullptr, nullptr);
3690 // traverse to parent dir
3691 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, true);
3692 int flags
= MDS_TRAVERSE_RDLOCK_SNAP
| MDS_TRAVERSE_WANT_DENTRY
| MDS_TRAVERSE_WANT_AUTH
;
3693 int r
= mdcache
->path_traverse(mdr
, cf
, refpath
, flags
, &mdr
->dn
[0]);
3695 if (r
== -CEPHFS_ESTALE
) {
3696 dout(10) << "CEPHFS_ESTALE on path, attempting recovery" << dendl
;
3697 inodeno_t ino
= refpath
.get_ino();
3698 mdcache
->find_ino_peers(ino
, new C_MDS_TryFindInode(this, mdr
, mdcache
, ino
));
3700 respond_to_request(mdr
, r
);
3702 return std::make_pair(nullptr, nullptr);
3705 flags
= MDS_TRAVERSE_RDLOCK_SNAP2
| MDS_TRAVERSE_WANT_DENTRY
| MDS_TRAVERSE_DISCOVER
;
3706 r
= mdcache
->path_traverse(mdr
, cf
, refpath2
, flags
, &mdr
->dn
[1]);
3708 if (r
== -CEPHFS_ESTALE
) {
3709 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl
;
3710 inodeno_t ino
= refpath2
.get_ino();
3711 mdcache
->find_ino_peers(ino
, new C_MDS_TryFindInode(this, mdr
, mdcache
, ino
));
3713 respond_to_request(mdr
, r
);
3715 return std::make_pair(nullptr, nullptr);
3718 CDentry
*srcdn
= mdr
->dn
[1].back();
3719 CDir
*srcdir
= srcdn
->get_dir();
3720 CDentry
*destdn
= mdr
->dn
[0].back();
3721 CDir
*destdir
= destdn
->get_dir();
3723 if (!mdr
->reqid
.name
.is_mds()) {
3724 if ((srcdir
->get_inode()->is_system() && !srcdir
->get_inode()->is_root()) ||
3725 (destdir
->get_inode()->is_system() && !destdir
->get_inode()->is_root())) {
3726 respond_to_request(mdr
, -CEPHFS_EROFS
);
3727 return std::make_pair(nullptr, nullptr);
3731 if (!destdir
->get_inode()->is_base() &&
3732 destdir
->get_inode()->get_projected_parent_dir()->inode
->is_stray()) {
3733 respond_to_request(mdr
, -CEPHFS_ENOENT
);
3734 return std::make_pair(nullptr, nullptr);
3737 MutationImpl::LockOpVec lov
;
3738 if (srcdir
->get_inode() == destdir
->get_inode()) {
3739 lov
.add_wrlock(&destdir
->inode
->filelock
);
3740 lov
.add_wrlock(&destdir
->inode
->nestlock
);
3741 if (xlock_srcdn
&& srcdir
!= destdir
) {
3742 mds_rank_t srcdir_auth
= srcdir
->authority().first
;
3743 if (srcdir_auth
!= mds
->get_nodeid()) {
3744 lov
.add_remote_wrlock(&srcdir
->inode
->filelock
, srcdir_auth
);
3745 lov
.add_remote_wrlock(&srcdir
->inode
->nestlock
, srcdir_auth
);
3749 if (srcdn
->get_name() > destdn
->get_name())
3750 lov
.add_xlock(&destdn
->lock
);
3753 lov
.add_xlock(&srcdn
->lock
);
3755 lov
.add_rdlock(&srcdn
->lock
);
3757 if (srcdn
->get_name() < destdn
->get_name())
3758 lov
.add_xlock(&destdn
->lock
);
3760 int cmp
= mdr
->compare_paths();
3761 bool lock_destdir_first
=
3762 (cmp
< 0 || (cmp
== 0 && destdir
->ino() < srcdir
->ino()));
3764 if (lock_destdir_first
) {
3765 lov
.add_wrlock(&destdir
->inode
->filelock
);
3766 lov
.add_wrlock(&destdir
->inode
->nestlock
);
3767 lov
.add_xlock(&destdn
->lock
);
3771 mds_rank_t srcdir_auth
= srcdir
->authority().first
;
3772 if (srcdir_auth
== mds
->get_nodeid()) {
3773 lov
.add_wrlock(&srcdir
->inode
->filelock
);
3774 lov
.add_wrlock(&srcdir
->inode
->nestlock
);
3776 lov
.add_remote_wrlock(&srcdir
->inode
->filelock
, srcdir_auth
);
3777 lov
.add_remote_wrlock(&srcdir
->inode
->nestlock
, srcdir_auth
);
3779 lov
.add_xlock(&srcdn
->lock
);
3781 lov
.add_rdlock(&srcdn
->lock
);
3784 if (!lock_destdir_first
) {
3785 lov
.add_wrlock(&destdir
->inode
->filelock
);
3786 lov
.add_wrlock(&destdir
->inode
->nestlock
);
3787 lov
.add_xlock(&destdn
->lock
);
3791 CInode
*auth_pin_freeze
= nullptr;
3792 // XXX any better way to do this?
3793 if (xlock_srcdn
&& !srcdn
->is_auth()) {
3794 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
3795 auth_pin_freeze
= srcdnl
->is_primary() ? srcdnl
->get_inode() : nullptr;
3797 if (!mds
->locker
->acquire_locks(mdr
, lov
, auth_pin_freeze
))
3798 return std::make_pair(nullptr, nullptr);
3800 if (srcdn
->get_projected_linkage()->is_null()) {
3801 respond_to_request(mdr
, -CEPHFS_ENOENT
);
3802 return std::make_pair(nullptr, nullptr);
3805 if (destdn
->get_projected_linkage()->is_null()) {
3806 snapid_t next_snap
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
3807 destdn
->first
= std::max(destdn
->first
, next_snap
);
3810 mdr
->locking_state
|= MutationImpl::PATH_LOCKED
;
3812 return std::make_pair(destdn
, srcdn
);
3816 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
3818 * @param diri base inode
3819 * @param fg the exact frag we want
3820 * @param mdr request
3821 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
3823 CDir
* Server::try_open_auth_dirfrag(CInode
*diri
, frag_t fg
, MDRequestRef
& mdr
)
3825 CDir
*dir
= diri
->get_dirfrag(fg
);
3828 // am i auth for the dirfrag?
3829 if (!dir
->is_auth()) {
3830 mds_rank_t auth
= dir
->authority().first
;
3831 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3832 << ", fw to mds." << auth
<< dendl
;
3833 mdcache
->request_forward(mdr
, auth
);
3837 // not open and inode not mine?
3838 if (!diri
->is_auth()) {
3839 mds_rank_t inauth
= diri
->authority().first
;
3840 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth
<< dendl
;
3841 mdcache
->request_forward(mdr
, inauth
);
3845 // not open and inode frozen?
3846 if (diri
->is_frozen()) {
3847 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri
<< dendl
;
3848 ceph_assert(diri
->get_parent_dir());
3849 diri
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
3854 dir
= diri
->get_or_open_dirfrag(mdcache
, fg
);
3861 // ===============================================================================
3864 void Server::handle_client_getattr(MDRequestRef
& mdr
, bool is_lookup
)
3866 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
3868 if (req
->get_filepath().depth() == 0 && is_lookup
) {
3869 // refpath can't be empty for lookup but it can for
3870 // getattr (we do getattr with empty refpath for mount of '/')
3871 respond_to_request(mdr
, -CEPHFS_EINVAL
);
3875 bool want_auth
= false;
3876 int mask
= req
->head
.args
.getattr
.mask
;
3877 if (mask
& CEPH_STAT_RSTAT
)
3878 want_auth
= true; // set want_auth for CEPH_STAT_RSTAT mask
3880 if (!mdr
->is_batch_head() && mdr
->can_batch()) {
3881 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, false);
3882 int r
= mdcache
->path_traverse(mdr
, cf
, mdr
->get_filepath(),
3883 (want_auth
? MDS_TRAVERSE_WANT_AUTH
: 0),
3884 &mdr
->dn
[0], &mdr
->in
[0]);
3889 // fall-thru. let rdlock_path_pin_ref() check again.
3890 } else if (is_lookup
) {
3891 CDentry
* dn
= mdr
->dn
[0].back();
3893 auto em
= dn
->batch_ops
.emplace(std::piecewise_construct
, std::forward_as_tuple(mask
), std::forward_as_tuple());
3895 em
.first
->second
= std::make_unique
<Batch_Getattr_Lookup
>(this, mdr
);
3897 dout(20) << __func__
<< ": LOOKUP op, wait for previous same getattr ops to respond. " << *mdr
<< dendl
;
3898 em
.first
->second
->add_request(mdr
);
3902 CInode
*in
= mdr
->in
[0];
3904 auto em
= in
->batch_ops
.emplace(std::piecewise_construct
, std::forward_as_tuple(mask
), std::forward_as_tuple());
3906 em
.first
->second
= std::make_unique
<Batch_Getattr_Lookup
>(this, mdr
);
3908 dout(20) << __func__
<< ": GETATTR op, wait for previous same getattr ops to respond. " << *mdr
<< dendl
;
3909 em
.first
->second
->add_request(mdr
);
3915 CInode
*ref
= rdlock_path_pin_ref(mdr
, want_auth
, false);
3919 mdr
->getattr_caps
= mask
;
3922 * if client currently holds the EXCL cap on a field, do not rdlock
3923 * it; client's stat() will result in valid info if _either_ EXCL
3924 * cap is held or MDS rdlocks and reads the value here.
3926 * handling this case here is easier than weakening rdlock
3927 * semantics... that would cause problems elsewhere.
3929 client_t client
= mdr
->get_client();
3931 Capability
*cap
= ref
->get_client_cap(client
);
3932 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
||
3933 mdr
->snapid
<= cap
->client_follows
))
3934 issued
= cap
->issued();
3937 MutationImpl::LockOpVec lov
;
3938 if ((mask
& CEPH_CAP_LINK_SHARED
) && !(issued
& CEPH_CAP_LINK_EXCL
))
3939 lov
.add_rdlock(&ref
->linklock
);
3940 if ((mask
& CEPH_CAP_AUTH_SHARED
) && !(issued
& CEPH_CAP_AUTH_EXCL
))
3941 lov
.add_rdlock(&ref
->authlock
);
3942 if ((mask
& CEPH_CAP_XATTR_SHARED
) && !(issued
& CEPH_CAP_XATTR_EXCL
))
3943 lov
.add_rdlock(&ref
->xattrlock
);
3944 if ((mask
& CEPH_CAP_FILE_SHARED
) && !(issued
& CEPH_CAP_FILE_EXCL
)) {
3945 // Don't wait on unstable filelock if client is allowed to read file size.
3946 // This can reduce the response time of getattr in the case that multiple
3947 // clients do stat(2) and there are writers.
3948 // The downside of this optimization is that mds may not issue Fs caps along
3949 // with getattr reply. Client may need to send more getattr requests.
3950 if (mdr
->is_rdlocked(&ref
->filelock
)) {
3951 lov
.add_rdlock(&ref
->filelock
);
3952 } else if (ref
->filelock
.is_stable() ||
3953 ref
->filelock
.get_num_wrlocks() > 0 ||
3954 !ref
->filelock
.can_read(mdr
->get_client())) {
3955 lov
.add_rdlock(&ref
->filelock
);
3956 mdr
->locking_state
&= ~MutationImpl::ALL_LOCKED
;
3960 if (!mds
->locker
->acquire_locks(mdr
, lov
))
3963 if (!check_access(mdr
, ref
, MAY_READ
))
3966 utime_t now
= ceph_clock_now();
3967 mdr
->set_mds_stamp(now
);
3969 // note which caps are requested, so we return at least a snapshot
3970 // value for them. (currently this matters for xattrs and inline data)
3971 mdr
->getattr_caps
= mask
;
3973 mds
->balancer
->hit_inode(ref
, META_POP_IRD
, req
->get_source().num());
3976 dout(10) << "reply to stat on " << *req
<< dendl
;
3979 mdr
->tracedn
= mdr
->dn
[0].back();
3980 respond_to_request(mdr
, 0);
3983 struct C_MDS_LookupIno2
: public ServerContext
{
3985 C_MDS_LookupIno2(Server
*s
, MDRequestRef
& r
) : ServerContext(s
), mdr(r
) {}
3986 void finish(int r
) override
{
3987 server
->_lookup_ino_2(mdr
, r
);
3994 void Server::handle_client_lookup_ino(MDRequestRef
& mdr
,
3995 bool want_parent
, bool want_dentry
)
3997 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
3999 if ((uint64_t)req
->head
.args
.lookupino
.snapid
> 0)
4000 return _lookup_snap_ino(mdr
);
4002 inodeno_t ino
= req
->get_filepath().get_ino();
4003 auto _ino
= ino
.val
;
4005 /* It's been observed [1] that a client may lookup a private ~mdsdir inode.
4006 * I do not have an explanation for how that happened organically but this
4007 * check will ensure that the client can no longer do that.
4009 * [1] https://tracker.ceph.com/issues/49922
4011 if (MDS_IS_PRIVATE_INO(_ino
)) {
4012 respond_to_request(mdr
, -CEPHFS_ESTALE
);
4016 CInode
*in
= mdcache
->get_inode(ino
);
4017 if (in
&& in
->state_test(CInode::STATE_PURGING
)) {
4018 respond_to_request(mdr
, -CEPHFS_ESTALE
);
4022 mdcache
->open_ino(ino
, (int64_t)-1, new C_MDS_LookupIno2(this, mdr
), false);
4026 // check for nothing (not read or write); this still applies the
4028 if (!check_access(mdr
, in
, 0))
4031 CDentry
*dn
= in
->get_projected_parent_dn();
4032 CInode
*diri
= dn
? dn
->get_dir()->inode
: NULL
;
4034 MutationImpl::LockOpVec lov
;
4035 if (dn
&& (want_parent
|| want_dentry
)) {
4037 lov
.add_rdlock(&dn
->lock
);
4040 unsigned mask
= req
->head
.args
.lookupino
.mask
;
4042 Capability
*cap
= in
->get_client_cap(mdr
->get_client());
4044 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
|| mdr
->snapid
<= cap
->client_follows
))
4045 issued
= cap
->issued();
4047 // permission bits, ACL/security xattrs
4048 if ((mask
& CEPH_CAP_AUTH_SHARED
) && (issued
& CEPH_CAP_AUTH_EXCL
) == 0)
4049 lov
.add_rdlock(&in
->authlock
);
4050 if ((mask
& CEPH_CAP_XATTR_SHARED
) && (issued
& CEPH_CAP_XATTR_EXCL
) == 0)
4051 lov
.add_rdlock(&in
->xattrlock
);
4053 mdr
->getattr_caps
= mask
;
4057 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4061 // need read access to directory inode
4062 if (!check_access(mdr
, diri
, MAY_READ
))
4068 if (in
->is_base()) {
4069 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4072 if (!diri
|| diri
->is_stray()) {
4073 respond_to_request(mdr
, -CEPHFS_ESTALE
);
4076 dout(10) << "reply to lookup_parent " << *in
<< dendl
;
4078 respond_to_request(mdr
, 0);
4081 inodeno_t dirino
= req
->get_filepath2().get_ino();
4082 if (!diri
|| (dirino
!= inodeno_t() && diri
->ino() != dirino
)) {
4083 respond_to_request(mdr
, -CEPHFS_ENOENT
);
4086 dout(10) << "reply to lookup_name " << *in
<< dendl
;
4088 dout(10) << "reply to lookup_ino " << *in
<< dendl
;
4093 respond_to_request(mdr
, 0);
4097 void Server::_lookup_snap_ino(MDRequestRef
& mdr
)
4099 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4102 vino
.ino
= req
->get_filepath().get_ino();
4103 vino
.snapid
= (__u64
)req
->head
.args
.lookupino
.snapid
;
4104 inodeno_t parent_ino
= (__u64
)req
->head
.args
.lookupino
.parent
;
4105 __u32 hash
= req
->head
.args
.lookupino
.hash
;
4107 dout(7) << "lookup_snap_ino " << vino
<< " parent " << parent_ino
<< " hash " << hash
<< dendl
;
4109 CInode
*in
= mdcache
->lookup_snap_inode(vino
);
4111 in
= mdcache
->get_inode(vino
.ino
);
4113 if (in
->state_test(CInode::STATE_PURGING
) ||
4114 !in
->has_snap_data(vino
.snapid
)) {
4115 if (in
->is_dir() || !parent_ino
) {
4116 respond_to_request(mdr
, -CEPHFS_ESTALE
);
4125 dout(10) << "reply to lookup_snap_ino " << *in
<< dendl
;
4126 mdr
->snapid
= vino
.snapid
;
4128 respond_to_request(mdr
, 0);
4132 CInode
*diri
= NULL
;
4134 diri
= mdcache
->get_inode(parent_ino
);
4136 mdcache
->open_ino(parent_ino
, mds
->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr
));
4140 if (!diri
->is_dir()) {
4141 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4145 MutationImpl::LockOpVec lov
;
4146 lov
.add_rdlock(&diri
->dirfragtreelock
);
4147 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4150 frag_t frag
= diri
->dirfragtree
[hash
];
4151 CDir
*dir
= try_open_auth_dirfrag(diri
, frag
, mdr
);
4155 if (!dir
->is_complete()) {
4156 if (dir
->is_frozen()) {
4157 mds
->locker
->drop_locks(mdr
.get());
4158 mdr
->drop_local_auth_pins();
4159 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
4162 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
), true);
4166 respond_to_request(mdr
, -CEPHFS_ESTALE
);
4168 mdcache
->open_ino(vino
.ino
, mds
->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr
), false);
4172 void Server::_lookup_ino_2(MDRequestRef
& mdr
, int r
)
4174 inodeno_t ino
= mdr
->client_request
->get_filepath().get_ino();
4175 dout(10) << "_lookup_ino_2 " << mdr
.get() << " ino " << ino
<< " r=" << r
<< dendl
;
4177 // `r` is a rank if >=0, else an error code
4179 mds_rank_t
dest_rank(r
);
4180 if (dest_rank
== mds
->get_nodeid())
4181 dispatch_client_request(mdr
);
4183 mdcache
->request_forward(mdr
, dest_rank
);
4188 if (r
== -CEPHFS_ENOENT
|| r
== -CEPHFS_ENODATA
)
4190 respond_to_request(mdr
, r
);
4194 /* This function takes responsibility for the passed mdr*/
4195 void Server::handle_client_open(MDRequestRef
& mdr
)
4197 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4198 dout(7) << "open on " << req
->get_filepath() << dendl
;
4200 int flags
= req
->head
.args
.open
.flags
;
4201 int cmode
= ceph_flags_to_mode(flags
);
4203 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4207 bool need_auth
= !file_mode_is_readonly(cmode
) ||
4208 (flags
& (CEPH_O_TRUNC
| CEPH_O_DIRECTORY
));
4210 if ((cmode
& CEPH_FILE_MODE_WR
) && mdcache
->is_readonly()) {
4211 dout(7) << "read-only FS" << dendl
;
4212 respond_to_request(mdr
, -CEPHFS_EROFS
);
4216 CInode
*cur
= rdlock_path_pin_ref(mdr
, need_auth
);
4220 if (cur
->is_frozen() || cur
->state_test(CInode::STATE_EXPORTINGCAPS
)) {
4221 ceph_assert(!need_auth
);
4222 mdr
->locking_state
&= ~(MutationImpl::PATH_LOCKED
| MutationImpl::ALL_LOCKED
);
4223 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
4228 if (!cur
->is_file()) {
4229 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
4230 cmode
= CEPH_FILE_MODE_PIN
;
4231 // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
4232 if (cur
->is_symlink() && !(flags
& CEPH_O_NOFOLLOW
))
4233 flags
&= ~CEPH_O_TRUNC
;
4236 dout(10) << "open flags = " << flags
4237 << ", filemode = " << cmode
4238 << ", need_auth = " << need_auth
4242 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
4243 dout(7) << "not a file or dir " << *cur << dendl;
4244 respond_to_request(mdr, -CEPHFS_ENXIO); // FIXME what error do we want?
4247 if ((flags
& CEPH_O_DIRECTORY
) && !cur
->is_dir() && !cur
->is_symlink()) {
4248 dout(7) << "specified O_DIRECTORY on non-directory " << *cur
<< dendl
;
4249 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4253 if ((flags
& CEPH_O_TRUNC
) && !cur
->is_file()) {
4254 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur
<< dendl
;
4255 // we should return -CEPHFS_EISDIR for directory, return -CEPHFS_EINVAL for other non-regular
4256 respond_to_request(mdr
, cur
->is_dir() ? -CEPHFS_EISDIR
: -CEPHFS_EINVAL
);
4260 if (cur
->get_inode()->inline_data
.version
!= CEPH_INLINE_NONE
&&
4261 !mdr
->session
->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
)) {
4262 dout(7) << "old client cannot open inline data file " << *cur
<< dendl
;
4263 respond_to_request(mdr
, -CEPHFS_EPERM
);
4267 // snapped data is read only
4268 if (mdr
->snapid
!= CEPH_NOSNAP
&&
4269 ((cmode
& CEPH_FILE_MODE_WR
) || req
->may_write())) {
4270 dout(7) << "snap " << mdr
->snapid
<< " is read-only " << *cur
<< dendl
;
4271 respond_to_request(mdr
, -CEPHFS_EROFS
);
4275 MutationImpl::LockOpVec lov
;
4277 unsigned mask
= req
->head
.args
.open
.mask
;
4279 Capability
*cap
= cur
->get_client_cap(mdr
->get_client());
4281 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
|| mdr
->snapid
<= cap
->client_follows
))
4282 issued
= cap
->issued();
4283 // permission bits, ACL/security xattrs
4284 if ((mask
& CEPH_CAP_AUTH_SHARED
) && (issued
& CEPH_CAP_AUTH_EXCL
) == 0)
4285 lov
.add_rdlock(&cur
->authlock
);
4286 if ((mask
& CEPH_CAP_XATTR_SHARED
) && (issued
& CEPH_CAP_XATTR_EXCL
) == 0)
4287 lov
.add_rdlock(&cur
->xattrlock
);
4289 mdr
->getattr_caps
= mask
;
4293 if ((flags
& CEPH_O_TRUNC
) && !mdr
->has_completed
) {
4294 ceph_assert(cur
->is_auth());
4296 lov
.add_xlock(&cur
->filelock
);
4297 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4300 if (!check_access(mdr
, cur
, MAY_WRITE
))
4303 // wait for pending truncate?
4304 const auto& pi
= cur
->get_projected_inode();
4305 if (pi
->is_truncating()) {
4306 dout(10) << " waiting for pending truncate from " << pi
->truncate_from
4307 << " to " << pi
->truncate_size
<< " to complete on " << *cur
<< dendl
;
4308 mds
->locker
->drop_locks(mdr
.get());
4309 mdr
->drop_local_auth_pins();
4310 cur
->add_waiter(CInode::WAIT_TRUNC
, new C_MDS_RetryRequest(mdcache
, mdr
));
4314 do_open_truncate(mdr
, cmode
);
4318 // sync filelock if snapped.
4319 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
4320 // and that data itself is flushed so that we can read the snapped data off disk.
4321 if (mdr
->snapid
!= CEPH_NOSNAP
&& !cur
->is_dir()) {
4322 lov
.add_rdlock(&cur
->filelock
);
4325 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4329 if (cmode
& CEPH_FILE_MODE_WR
)
4331 if (!check_access(mdr
, cur
, mask
))
4334 utime_t now
= ceph_clock_now();
4335 mdr
->set_mds_stamp(now
);
4337 if (cur
->is_file() || cur
->is_dir()) {
4338 if (mdr
->snapid
== CEPH_NOSNAP
) {
4340 Capability
*cap
= mds
->locker
->issue_new_caps(cur
, cmode
, mdr
, nullptr);
4342 dout(12) << "open issued caps " << ccap_string(cap
->pending())
4343 << " for " << req
->get_source()
4344 << " on " << *cur
<< dendl
;
4346 int caps
= ceph_caps_for_mode(cmode
);
4347 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps
)
4348 << " for " << req
->get_source()
4349 << " snapid " << mdr
->snapid
4350 << " on " << *cur
<< dendl
;
4351 mdr
->snap_caps
= caps
;
4355 // increase max_size?
4356 if (cmode
& CEPH_FILE_MODE_WR
)
4357 mds
->locker
->check_inode_max_size(cur
);
4359 // make sure this inode gets into the journal
4360 if (cur
->is_auth() && cur
->last
== CEPH_NOSNAP
&&
4361 mdcache
->open_file_table
.should_log_open(cur
)) {
4362 EOpen
*le
= new EOpen(mds
->mdlog
);
4363 mdlog
->start_entry(le
);
4364 le
->add_clean_inode(cur
);
4365 mdlog
->submit_entry(le
);
4369 if (cmode
& CEPH_FILE_MODE_WR
)
4370 mds
->balancer
->hit_inode(cur
, META_POP_IWR
);
4372 mds
->balancer
->hit_inode(cur
, META_POP_IRD
,
4373 mdr
->client_request
->get_source().num());
4376 if (req
->get_dentry_wanted()) {
4377 ceph_assert(mdr
->dn
[0].size());
4378 dn
= mdr
->dn
[0].back();
4383 respond_to_request(mdr
, 0);
4386 class C_MDS_openc_finish
: public ServerLogContext
{
4390 C_MDS_openc_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ni
) :
4391 ServerLogContext(s
, r
), dn(d
), newi(ni
) {}
4392 void finish(int r
) override
{
4393 ceph_assert(r
== 0);
4395 dn
->pop_projected_linkage();
4397 // dirty inode, dn, dir
4398 newi
->mark_dirty(mdr
->ls
);
4399 newi
->mark_dirty_parent(mdr
->ls
, true);
4403 get_mds()->locker
->share_inode_max_size(newi
);
4405 MDRequestRef null_ref
;
4406 get_mds()->mdcache
->send_dentry_link(dn
, null_ref
);
4408 get_mds()->balancer
->hit_inode(newi
, META_POP_IWR
);
4410 server
->respond_to_request(mdr
, 0);
4412 ceph_assert(g_conf()->mds_kill_openc_at
!= 1);
4416 /* This function takes responsibility for the passed mdr*/
4417 void Server::handle_client_openc(MDRequestRef
& mdr
)
4419 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4420 client_t client
= mdr
->get_client();
4422 dout(7) << "open w/ O_CREAT on " << req
->get_filepath() << dendl
;
4424 int cmode
= ceph_flags_to_mode(req
->head
.args
.open
.flags
);
4426 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4430 bool excl
= req
->head
.args
.open
.flags
& CEPH_O_EXCL
;
4431 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, true, !excl
, true);
4435 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
4436 if (!excl
&& !dnl
->is_null()) {
4438 mds
->locker
->xlock_downgrade(&dn
->lock
, mdr
.get());
4440 MutationImpl::LockOpVec lov
;
4441 lov
.add_rdlock(&dnl
->get_inode()->snaplock
);
4442 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4445 handle_client_open(mdr
);
4449 ceph_assert(dnl
->is_null());
4451 if (req
->get_alternate_name().size() > alternate_name_max
) {
4452 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
4453 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
4456 dn
->set_alternate_name(req
->get_alternate_name());
4459 file_layout_t layout
;
4460 if (mdr
->dir_layout
!= file_layout_t())
4461 layout
= mdr
->dir_layout
;
4463 layout
= mdcache
->default_file_layout
;
4465 // What kind of client caps are required to complete this operation
4466 uint64_t access
= MAY_WRITE
;
4468 const auto default_layout
= layout
;
4470 // fill in any special params from client
4471 if (req
->head
.args
.open
.stripe_unit
)
4472 layout
.stripe_unit
= req
->head
.args
.open
.stripe_unit
;
4473 if (req
->head
.args
.open
.stripe_count
)
4474 layout
.stripe_count
= req
->head
.args
.open
.stripe_count
;
4475 if (req
->head
.args
.open
.object_size
)
4476 layout
.object_size
= req
->head
.args
.open
.object_size
;
4477 if (req
->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID
) &&
4478 (__s32
)req
->head
.args
.open
.pool
>= 0) {
4479 layout
.pool_id
= req
->head
.args
.open
.pool
;
4481 // make sure we have as new a map as the client
4482 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
4483 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
4488 // If client doesn't have capability to modify layout pools, then
4489 // only permit this request if the requested pool matches what the
4490 // file would have inherited anyway from its parent.
4491 if (default_layout
!= layout
) {
4492 access
|= MAY_SET_VXATTR
;
4495 if (!layout
.is_valid()) {
4496 dout(10) << " invalid initial file layout" << dendl
;
4497 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4500 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
4501 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
4502 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4507 CDir
*dir
= dn
->get_dir();
4508 CInode
*diri
= dir
->get_inode();
4509 if (!check_access(mdr
, diri
, access
))
4511 if (!check_fragment_space(mdr
, dir
))
4513 if (!check_dir_max_entries(mdr
, dir
))
4516 if (mdr
->dn
[0].size() == 1)
4517 mds
->locker
->create_lock_cache(mdr
, diri
, &mdr
->dir_layout
);
4520 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
),
4521 req
->head
.args
.open
.mode
| S_IFREG
, &layout
);
4525 dn
->push_projected_linkage(newi
);
4527 auto _inode
= newi
->_get_inode();
4528 _inode
->version
= dn
->pre_dirty();
4529 if (layout
.pool_id
!= mdcache
->default_file_layout
.pool_id
)
4530 _inode
->add_old_pool(mdcache
->default_file_layout
.pool_id
);
4531 _inode
->update_backtrace();
4532 _inode
->rstat
.rfiles
= 1;
4533 _inode
->accounted_rstat
= _inode
->rstat
;
4535 SnapRealm
*realm
= diri
->find_snaprealm();
4536 snapid_t follows
= mdcache
->get_global_snaprealm()->get_newest_seq();
4537 ceph_assert(follows
>= realm
->get_newest_seq());
4539 ceph_assert(dn
->first
== follows
+1);
4540 newi
->first
= dn
->first
;
4543 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
, realm
);
4544 newi
->authlock
.set_state(LOCK_EXCL
);
4545 newi
->xattrlock
.set_state(LOCK_EXCL
);
4547 if (cap
&& (cmode
& CEPH_FILE_MODE_WR
)) {
4548 _inode
->client_ranges
[client
].range
.first
= 0;
4549 _inode
->client_ranges
[client
].range
.last
= _inode
->layout
.stripe_unit
;
4550 _inode
->client_ranges
[client
].follows
= follows
;
4551 newi
->mark_clientwriteable();
4552 cap
->mark_clientwriteable();
4556 mdr
->ls
= mdlog
->get_current_segment();
4557 EUpdate
*le
= new EUpdate(mdlog
, "openc");
4558 mdlog
->start_entry(le
);
4559 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4560 journal_allocated_inos(mdr
, &le
->metablob
);
4561 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
4562 le
->metablob
.add_primary_dentry(dn
, newi
, true, true, true);
4564 // make sure this inode gets into the journal
4565 le
->metablob
.add_opened_ino(newi
->ino());
4567 C_MDS_openc_finish
*fin
= new C_MDS_openc_finish(this, mdr
, dn
, newi
);
4569 if (mdr
->session
->info
.has_feature(CEPHFS_FEATURE_DELEG_INO
)) {
4570 openc_response_t ocresp
;
4572 dout(10) << "adding created_ino and delegated_inos" << dendl
;
4573 ocresp
.created_ino
= _inode
->ino
;
4575 if (delegate_inos_pct
&& !req
->is_queued_for_replay()) {
4576 // Try to delegate some prealloc_inos to the client, if it's down to half the max
4577 unsigned frac
= 100 / delegate_inos_pct
;
4578 if (mdr
->session
->delegated_inos
.size() < (unsigned)g_conf()->mds_client_prealloc_inos
/ frac
/ 2)
4579 mdr
->session
->delegate_inos(g_conf()->mds_client_prealloc_inos
/ frac
, ocresp
.delegated_inos
);
4582 encode(ocresp
, mdr
->reply_extra_bl
);
4583 } else if (mdr
->client_request
->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE
)) {
4584 dout(10) << "adding ino to reply to indicate inode was created" << dendl
;
4585 // add the file created flag onto the reply if create_flags features is supported
4586 encode(newi
->ino(), mdr
->reply_extra_bl
);
4589 journal_and_reply(mdr
, newi
, dn
, le
, fin
);
4591 // We hit_dir (via hit_inode) in our finish callback, but by then we might
4592 // have overshot the split size (multiple opencs in flight), so here is
4593 // an early chance to split the dir if this openc makes it oversized.
4594 mds
->balancer
->maybe_fragment(dir
, false);
4599 void Server::handle_client_readdir(MDRequestRef
& mdr
)
4601 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4602 Session
*session
= mds
->get_session(req
);
4603 client_t client
= req
->get_source().num();
4604 MutationImpl::LockOpVec lov
;
4605 CInode
*diri
= rdlock_path_pin_ref(mdr
, false, true);
4608 // it's a directory, right?
4609 if (!diri
->is_dir()) {
4611 dout(10) << "reply to " << *req
<< " readdir -CEPHFS_ENOTDIR" << dendl
;
4612 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
4616 auto num_caps
= session
->get_num_caps();
4617 auto session_cap_acquisition
= session
->get_cap_acquisition();
4619 if (num_caps
> static_cast<uint64_t>(max_caps_per_client
* max_caps_throttle_ratio
) && session_cap_acquisition
>= cap_acquisition_throttle
) {
4620 dout(20) << "readdir throttled. max_caps_per_client: " << max_caps_per_client
<< " num_caps: " << num_caps
4621 << " session_cap_acquistion: " << session_cap_acquisition
<< " cap_acquisition_throttle: " << cap_acquisition_throttle
<< dendl
;
4623 logger
->inc(l_mdss_cap_acquisition_throttle
);
4625 mds
->timer
.add_event_after(caps_throttle_retry_request_timeout
, new C_MDS_RetryRequest(mdcache
, mdr
));
4629 lov
.add_rdlock(&diri
->filelock
);
4630 lov
.add_rdlock(&diri
->dirfragtreelock
);
4632 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4635 if (!check_access(mdr
, diri
, MAY_READ
))
4639 frag_t fg
= (__u32
)req
->head
.args
.readdir
.frag
;
4640 unsigned req_flags
= (__u32
)req
->head
.args
.readdir
.flags
;
4641 string offset_str
= req
->get_path2();
4643 __u32 offset_hash
= 0;
4644 if (!offset_str
.empty())
4645 offset_hash
= ceph_frag_value(diri
->hash_dentry_name(offset_str
));
4647 offset_hash
= (__u32
)req
->head
.args
.readdir
.offset_hash
;
4649 dout(10) << " frag " << fg
<< " offset '" << offset_str
<< "'"
4650 << " offset_hash " << offset_hash
<< " flags " << req_flags
<< dendl
;
4652 // does the frag exist?
4653 if (diri
->dirfragtree
[fg
.value()] != fg
) {
4655 if (req_flags
& CEPH_READDIR_REPLY_BITFLAGS
) {
4656 if (fg
.contains((unsigned)offset_hash
)) {
4657 newfg
= diri
->dirfragtree
[offset_hash
];
4659 // client actually wants next frag
4660 newfg
= diri
->dirfragtree
[fg
.value()];
4664 newfg
= diri
->dirfragtree
[fg
.value()];
4666 dout(10) << " adjust frag " << fg
<< " -> " << newfg
<< " " << diri
->dirfragtree
<< dendl
;
4670 CDir
*dir
= try_open_auth_dirfrag(diri
, fg
, mdr
);
4674 dout(10) << "handle_client_readdir on " << *dir
<< dendl
;
4675 ceph_assert(dir
->is_auth());
4677 if (!dir
->is_complete()) {
4678 if (dir
->is_frozen()) {
4679 dout(7) << "dir is frozen " << *dir
<< dendl
;
4680 mds
->locker
->drop_locks(mdr
.get());
4681 mdr
->drop_local_auth_pins();
4682 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
4686 dout(10) << " incomplete dir contents for readdir on " << *dir
<< ", fetching" << dendl
;
4687 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
), true);
4691 #ifdef MDS_VERIFY_FRAGSTAT
4692 dir
->verify_fragstat();
4695 utime_t now
= ceph_clock_now();
4696 mdr
->set_mds_stamp(now
);
4698 snapid_t snapid
= mdr
->snapid
;
4699 dout(10) << "snapid " << snapid
<< dendl
;
4701 SnapRealm
*realm
= diri
->find_snaprealm();
4703 unsigned max
= req
->head
.args
.readdir
.max_entries
;
4705 max
= dir
->get_num_any(); // whatever, something big.
4706 unsigned max_bytes
= req
->head
.args
.readdir
.max_bytes
;
4708 // make sure at least one item can be encoded
4709 max_bytes
= (512 << 10) + g_conf()->mds_max_xattr_pairs_size
;
4714 ds
.frag
= dir
->get_frag();
4715 ds
.auth
= dir
->get_dir_auth().first
;
4716 if (dir
->is_auth() && !forward_all_requests_to_auth
)
4717 dir
->get_dist_spec(ds
.dist
, mds
->get_nodeid());
4719 dir
->encode_dirstat(dirbl
, mdr
->session
->info
, ds
);
4721 // count bytes available.
4722 // this isn't perfect, but we should capture the main variable/unbounded size items!
4723 int front_bytes
= dirbl
.length() + sizeof(__u32
) + sizeof(__u8
)*2;
4724 int bytes_left
= max_bytes
- front_bytes
;
4725 bytes_left
-= realm
->get_snap_trace().length();
4727 // build dir contents
4730 bool start
= !offset_hash
&& offset_str
.empty();
4731 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
4732 dentry_key_t
skip_key(snapid
, offset_str
.c_str(), offset_hash
);
4733 auto it
= start
? dir
->begin() : dir
->lower_bound(skip_key
);
4734 bool end
= (it
== dir
->end());
4735 for (; !end
&& numfiles
< max
; end
= (it
== dir
->end())) {
4736 CDentry
*dn
= it
->second
;
4739 if (dn
->state_test(CDentry::STATE_PURGING
))
4742 bool dnp
= dn
->use_projected(client
, mdr
);
4743 CDentry::linkage_t
*dnl
= dnp
? dn
->get_projected_linkage() : dn
->get_linkage();
4748 if (dn
->last
< snapid
|| dn
->first
> snapid
) {
4749 dout(20) << "skipping non-overlapping snap " << *dn
<< dendl
;
4754 dentry_key_t
offset_key(dn
->last
, offset_str
.c_str(), offset_hash
);
4755 if (!(offset_key
< dn
->key()))
4759 CInode
*in
= dnl
->get_inode();
4761 if (in
&& in
->ino() == CEPH_INO_CEPH
)
4765 // better for the MDS to do the work, if we think the client will stat any of these files.
4766 if (dnl
->is_remote() && !in
) {
4767 in
= mdcache
->get_inode(dnl
->get_remote_ino());
4769 dn
->link_remote(dnl
, in
);
4770 } else if (dn
->state_test(CDentry::STATE_BADREMOTEINO
)) {
4771 dout(10) << "skipping bad remote ino on " << *dn
<< dendl
;
4774 // touch everything i _do_ have
4775 for (auto &p
: *dir
) {
4776 if (!p
.second
->get_linkage()->is_null())
4777 mdcache
->lru
.lru_touch(p
.second
);
4780 // already issued caps and leases, reply immediately.
4781 if (dnbl
.length() > 0) {
4782 mdcache
->open_remote_dentry(dn
, dnp
, new C_MDSInternalNoop
);
4783 dout(10) << " open remote dentry after caps were issued, stopping at "
4784 << dnbl
.length() << " < " << bytes_left
<< dendl
;
4788 mds
->locker
->drop_locks(mdr
.get());
4789 mdr
->drop_local_auth_pins();
4790 mdcache
->open_remote_dentry(dn
, dnp
, new C_MDS_RetryRequest(mdcache
, mdr
));
4796 if ((int)(dnbl
.length() + dn
->get_name().length() + sizeof(__u32
) + sizeof(LeaseStat
)) > bytes_left
) {
4797 dout(10) << " ran out of room, stopping at " << dnbl
.length() << " < " << bytes_left
<< dendl
;
4801 unsigned start_len
= dnbl
.length();
4804 dout(12) << "including dn " << *dn
<< dendl
;
4805 encode(dn
->get_name(), dnbl
);
4806 mds
->locker
->issue_client_lease(dn
, in
, mdr
, now
, dnbl
);
4809 dout(12) << "including inode " << *in
<< dendl
;
4810 int r
= in
->encode_inodestat(dnbl
, mdr
->session
, realm
, snapid
, bytes_left
- (int)dnbl
.length());
4812 // chop off dn->name, lease
4813 dout(10) << " ran out of room, stopping at " << start_len
<< " < " << bytes_left
<< dendl
;
4815 keep
.substr_of(dnbl
, 0, start_len
);
4819 ceph_assert(r
>= 0);
4823 mdcache
->lru
.lru_touch(dn
);
4826 session
->touch_readdir_cap(numfiles
);
4830 flags
= CEPH_READDIR_FRAG_END
;
4832 flags
|= CEPH_READDIR_FRAG_COMPLETE
; // FIXME: what purpose does this serve
4834 // client only understand END and COMPLETE flags ?
4835 if (req_flags
& CEPH_READDIR_REPLY_BITFLAGS
) {
4836 flags
|= CEPH_READDIR_HASH_ORDER
| CEPH_READDIR_OFFSET_HASH
;
4839 // finish final blob
4840 encode(numfiles
, dirbl
);
4841 encode(flags
, dirbl
);
4842 dirbl
.claim_append(dnbl
);
4845 dout(10) << "reply to " << *req
<< " readdir num=" << numfiles
4846 << " bytes=" << dirbl
.length()
4847 << " start=" << (int)start
4848 << " end=" << (int)end
4850 mdr
->reply_extra_bl
= dirbl
;
4852 // bump popularity. NOTE: this doesn't quite capture it.
4853 mds
->balancer
->hit_dir(dir
, META_POP_READDIR
, -1, numfiles
);
4857 respond_to_request(mdr
, 0);
4862 // ===============================================================================
4867 * finisher for basic inode updates
4869 class C_MDS_inode_update_finish
: public ServerLogContext
{
4871 bool truncating_smaller
, changed_ranges
, adjust_realm
;
4873 C_MDS_inode_update_finish(Server
*s
, MDRequestRef
& r
, CInode
*i
,
4874 bool sm
=false, bool cr
=false, bool ar
=false) :
4875 ServerLogContext(s
, r
), in(i
),
4876 truncating_smaller(sm
), changed_ranges(cr
), adjust_realm(ar
) { }
4877 void finish(int r
) override
{
4878 ceph_assert(r
== 0);
4880 int snap_op
= (in
->snaprealm
? CEPH_SNAP_OP_UPDATE
: CEPH_SNAP_OP_SPLIT
);
4885 MDSRank
*mds
= get_mds();
4887 // notify any clients
4888 if (truncating_smaller
&& in
->get_inode()->is_truncating()) {
4889 mds
->locker
->issue_truncate(in
);
4890 mds
->mdcache
->truncate_inode(in
, mdr
->ls
);
4894 mds
->mdcache
->send_snap_update(in
, 0, snap_op
);
4895 mds
->mdcache
->do_realm_invalidate_and_update_notify(in
, snap_op
);
4898 get_mds()->balancer
->hit_inode(in
, META_POP_IWR
);
4900 server
->respond_to_request(mdr
, 0);
4903 get_mds()->locker
->share_inode_max_size(in
);
4907 void Server::handle_client_file_setlock(MDRequestRef
& mdr
)
4909 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4910 MutationImpl::LockOpVec lov
;
4912 // get the inode to operate on, and set up any locks needed for that
4913 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
4917 lov
.add_xlock(&cur
->flocklock
);
4918 /* acquire_locks will return true if it gets the locks. If it fails,
4919 it will redeliver this request at a later date, so drop the request.
4921 if (!mds
->locker
->acquire_locks(mdr
, lov
)) {
4922 dout(10) << "handle_client_file_setlock could not get locks!" << dendl
;
4926 // copy the lock change into a ceph_filelock so we can store/apply it
4927 ceph_filelock set_lock
;
4928 set_lock
.start
= req
->head
.args
.filelock_change
.start
;
4929 set_lock
.length
= req
->head
.args
.filelock_change
.length
;
4930 set_lock
.client
= req
->get_orig_source().num();
4931 set_lock
.owner
= req
->head
.args
.filelock_change
.owner
;
4932 set_lock
.pid
= req
->head
.args
.filelock_change
.pid
;
4933 set_lock
.type
= req
->head
.args
.filelock_change
.type
;
4934 bool will_wait
= req
->head
.args
.filelock_change
.wait
;
4936 dout(10) << "handle_client_file_setlock: " << set_lock
<< dendl
;
4938 ceph_lock_state_t
*lock_state
= NULL
;
4939 bool interrupt
= false;
4941 // get the appropriate lock state
4942 switch (req
->head
.args
.filelock_change
.rule
) {
4943 case CEPH_LOCK_FLOCK_INTR
:
4946 case CEPH_LOCK_FLOCK
:
4947 lock_state
= cur
->get_flock_lock_state();
4950 case CEPH_LOCK_FCNTL_INTR
:
4953 case CEPH_LOCK_FCNTL
:
4954 lock_state
= cur
->get_fcntl_lock_state();
4958 dout(10) << "got unknown lock type " << set_lock
.type
4959 << ", dropping request!" << dendl
;
4960 respond_to_request(mdr
, -CEPHFS_EOPNOTSUPP
);
4964 dout(10) << " state prior to lock change: " << *lock_state
<< dendl
;
4965 if (CEPH_LOCK_UNLOCK
== set_lock
.type
) {
4966 list
<ceph_filelock
> activated_locks
;
4967 MDSContext::vec waiters
;
4968 if (lock_state
->is_waiting(set_lock
)) {
4969 dout(10) << " unlock removing waiting lock " << set_lock
<< dendl
;
4970 lock_state
->remove_waiting(set_lock
);
4971 cur
->take_waiting(CInode::WAIT_FLOCK
, waiters
);
4972 } else if (!interrupt
) {
4973 dout(10) << " unlock attempt on " << set_lock
<< dendl
;
4974 lock_state
->remove_lock(set_lock
, activated_locks
);
4975 cur
->take_waiting(CInode::WAIT_FLOCK
, waiters
);
4977 mds
->queue_waiters(waiters
);
4979 respond_to_request(mdr
, 0);
4981 dout(10) << " lock attempt on " << set_lock
<< dendl
;
4982 bool deadlock
= false;
4983 if (mdr
->more()->flock_was_waiting
&&
4984 !lock_state
->is_waiting(set_lock
)) {
4985 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock
<< dendl
;
4986 respond_to_request(mdr
, -CEPHFS_EINTR
);
4987 } else if (!lock_state
->add_lock(set_lock
, will_wait
, mdr
->more()->flock_was_waiting
, &deadlock
)) {
4988 dout(10) << " it failed on this attempt" << dendl
;
4989 // couldn't set lock right now
4991 respond_to_request(mdr
, -CEPHFS_EDEADLK
);
4992 } else if (!will_wait
) {
4993 respond_to_request(mdr
, -CEPHFS_EWOULDBLOCK
);
4995 dout(10) << " added to waiting list" << dendl
;
4996 ceph_assert(lock_state
->is_waiting(set_lock
));
4997 mdr
->more()->flock_was_waiting
= true;
4998 mds
->locker
->drop_locks(mdr
.get());
4999 mdr
->drop_local_auth_pins();
5000 mdr
->mark_event("failed to add lock, waiting");
5002 cur
->add_waiter(CInode::WAIT_FLOCK
, new C_MDS_RetryRequest(mdcache
, mdr
));
5005 respond_to_request(mdr
, 0);
5007 dout(10) << " state after lock change: " << *lock_state
<< dendl
;
5010 void Server::handle_client_file_readlock(MDRequestRef
& mdr
)
5012 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5013 MutationImpl::LockOpVec lov
;
5015 // get the inode to operate on, and set up any locks needed for that
5016 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
5020 /* acquire_locks will return true if it gets the locks. If it fails,
5021 it will redeliver this request at a later date, so drop the request.
5023 lov
.add_rdlock(&cur
->flocklock
);
5024 if (!mds
->locker
->acquire_locks(mdr
, lov
)) {
5025 dout(10) << "handle_client_file_readlock could not get locks!" << dendl
;
5029 // copy the lock change into a ceph_filelock so we can store/apply it
5030 ceph_filelock checking_lock
;
5031 checking_lock
.start
= req
->head
.args
.filelock_change
.start
;
5032 checking_lock
.length
= req
->head
.args
.filelock_change
.length
;
5033 checking_lock
.client
= req
->get_orig_source().num();
5034 checking_lock
.owner
= req
->head
.args
.filelock_change
.owner
;
5035 checking_lock
.pid
= req
->head
.args
.filelock_change
.pid
;
5036 checking_lock
.type
= req
->head
.args
.filelock_change
.type
;
5038 // get the appropriate lock state
5039 ceph_lock_state_t
*lock_state
= NULL
;
5040 switch (req
->head
.args
.filelock_change
.rule
) {
5041 case CEPH_LOCK_FLOCK
:
5042 lock_state
= cur
->get_flock_lock_state();
5045 case CEPH_LOCK_FCNTL
:
5046 lock_state
= cur
->get_fcntl_lock_state();
5050 dout(10) << "got unknown lock type " << checking_lock
.type
<< dendl
;
5051 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5054 lock_state
->look_for_lock(checking_lock
);
5057 encode(checking_lock
, lock_bl
);
5059 mdr
->reply_extra_bl
= lock_bl
;
5060 respond_to_request(mdr
, 0);
5063 void Server::handle_client_setattr(MDRequestRef
& mdr
)
5065 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5066 MutationImpl::LockOpVec lov
;
5067 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
5070 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5071 respond_to_request(mdr
, -CEPHFS_EROFS
);
5074 if (cur
->ino() < MDS_INO_SYSTEM_BASE
&& !cur
->is_base()) {
5075 respond_to_request(mdr
, -CEPHFS_EPERM
);
5079 __u32 mask
= req
->head
.args
.setattr
.mask
;
5080 __u32 access_mask
= MAY_WRITE
;
5083 if (mask
& (CEPH_SETATTR_MODE
|CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_BTIME
|CEPH_SETATTR_KILL_SGUID
))
5084 lov
.add_xlock(&cur
->authlock
);
5085 if (mask
& (CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
|CEPH_SETATTR_SIZE
))
5086 lov
.add_xlock(&cur
->filelock
);
5087 if (mask
& CEPH_SETATTR_CTIME
)
5088 lov
.add_wrlock(&cur
->versionlock
);
5090 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5093 if ((mask
& CEPH_SETATTR_UID
) && (cur
->get_inode()->uid
!= req
->head
.args
.setattr
.uid
))
5094 access_mask
|= MAY_CHOWN
;
5096 if ((mask
& CEPH_SETATTR_GID
) && (cur
->get_inode()->gid
!= req
->head
.args
.setattr
.gid
))
5097 access_mask
|= MAY_CHGRP
;
5099 if (!check_access(mdr
, cur
, access_mask
))
5102 // trunc from bigger -> smaller?
5103 const auto& pip
= cur
->get_projected_inode();
5105 uint64_t old_size
= std::max
<uint64_t>(pip
->size
, req
->head
.args
.setattr
.old_size
);
5107 // CEPHFS_ENOSPC on growing file while full, but allow shrinks
5108 if (is_full
&& req
->head
.args
.setattr
.size
> old_size
) {
5109 dout(20) << __func__
<< ": full, responding CEPHFS_ENOSPC to setattr with larger size" << dendl
;
5110 respond_to_request(mdr
, -CEPHFS_ENOSPC
);
5114 bool truncating_smaller
= false;
5115 if (mask
& CEPH_SETATTR_SIZE
) {
5116 truncating_smaller
= req
->head
.args
.setattr
.size
< old_size
;
5117 if (truncating_smaller
&& pip
->is_truncating()) {
5118 dout(10) << " waiting for pending truncate from " << pip
->truncate_from
5119 << " to " << pip
->truncate_size
<< " to complete on " << *cur
<< dendl
;
5120 mds
->locker
->drop_locks(mdr
.get());
5121 mdr
->drop_local_auth_pins();
5122 cur
->add_waiter(CInode::WAIT_TRUNC
, new C_MDS_RetryRequest(mdcache
, mdr
));
5127 bool changed_ranges
= false;
5130 mdr
->ls
= mdlog
->get_current_segment();
5131 EUpdate
*le
= new EUpdate(mdlog
, "setattr");
5132 mdlog
->start_entry(le
);
5134 auto pi
= cur
->project_inode(mdr
);
5136 if (mask
& CEPH_SETATTR_UID
)
5137 pi
.inode
->uid
= req
->head
.args
.setattr
.uid
;
5138 if (mask
& CEPH_SETATTR_GID
)
5139 pi
.inode
->gid
= req
->head
.args
.setattr
.gid
;
5141 if (mask
& CEPH_SETATTR_MODE
)
5142 pi
.inode
->mode
= (pi
.inode
->mode
& ~07777) | (req
->head
.args
.setattr
.mode
& 07777);
5143 else if ((mask
& (CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_KILL_SGUID
)) &&
5144 S_ISREG(pi
.inode
->mode
) &&
5145 (pi
.inode
->mode
& (S_IXUSR
|S_IXGRP
|S_IXOTH
))) {
5146 pi
.inode
->mode
&= ~(S_ISUID
|S_ISGID
);
5149 if (mask
& CEPH_SETATTR_MTIME
)
5150 pi
.inode
->mtime
= req
->head
.args
.setattr
.mtime
;
5151 if (mask
& CEPH_SETATTR_ATIME
)
5152 pi
.inode
->atime
= req
->head
.args
.setattr
.atime
;
5153 if (mask
& CEPH_SETATTR_BTIME
)
5154 pi
.inode
->btime
= req
->head
.args
.setattr
.btime
;
5155 if (mask
& (CEPH_SETATTR_ATIME
| CEPH_SETATTR_MTIME
| CEPH_SETATTR_BTIME
))
5156 pi
.inode
->time_warp_seq
++; // maybe not a timewarp, but still a serialization point.
5157 if (mask
& CEPH_SETATTR_SIZE
) {
5158 if (truncating_smaller
) {
5159 pi
.inode
->truncate(old_size
, req
->head
.args
.setattr
.size
);
5160 le
->metablob
.add_truncate_start(cur
->ino());
5162 pi
.inode
->size
= req
->head
.args
.setattr
.size
;
5163 pi
.inode
->rstat
.rbytes
= pi
.inode
->size
;
5165 pi
.inode
->mtime
= mdr
->get_op_stamp();
5167 // adjust client's max_size?
5168 if (mds
->locker
->calc_new_client_ranges(cur
, pi
.inode
->size
)) {
5169 dout(10) << " client_ranges " << cur
->get_previous_projected_inode()->client_ranges
5170 << " -> " << pi
.inode
->client_ranges
<< dendl
;
5171 changed_ranges
= true;
5175 pi
.inode
->version
= cur
->pre_dirty();
5176 pi
.inode
->ctime
= mdr
->get_op_stamp();
5177 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
5178 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
5179 pi
.inode
->change_attr
++;
5182 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5183 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5184 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5186 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
,
5187 truncating_smaller
, changed_ranges
));
5189 // flush immediately if there are readers/writers waiting
5190 if (mdr
->is_xlocked(&cur
->filelock
) &&
5191 (cur
->get_caps_wanted() & (CEPH_CAP_FILE_RD
|CEPH_CAP_FILE_WR
)))
5192 mds
->mdlog
->flush();
5195 /* Takes responsibility for mdr */
5196 void Server::do_open_truncate(MDRequestRef
& mdr
, int cmode
)
5198 CInode
*in
= mdr
->in
[0];
5199 client_t client
= mdr
->get_client();
5202 dout(10) << "do_open_truncate " << *in
<< dendl
;
5204 SnapRealm
*realm
= in
->find_snaprealm();
5205 Capability
*cap
= mds
->locker
->issue_new_caps(in
, cmode
, mdr
, realm
);
5207 mdr
->ls
= mdlog
->get_current_segment();
5208 EUpdate
*le
= new EUpdate(mdlog
, "open_truncate");
5209 mdlog
->start_entry(le
);
5212 auto pi
= in
->project_inode(mdr
);
5213 pi
.inode
->version
= in
->pre_dirty();
5214 pi
.inode
->mtime
= pi
.inode
->ctime
= mdr
->get_op_stamp();
5215 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
5216 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
5217 pi
.inode
->change_attr
++;
5219 uint64_t old_size
= std::max
<uint64_t>(pi
.inode
->size
, mdr
->client_request
->head
.args
.open
.old_size
);
5221 pi
.inode
->truncate(old_size
, 0);
5222 le
->metablob
.add_truncate_start(in
->ino());
5225 bool changed_ranges
= false;
5226 if (cap
&& (cmode
& CEPH_FILE_MODE_WR
)) {
5227 pi
.inode
->client_ranges
[client
].range
.first
= 0;
5228 pi
.inode
->client_ranges
[client
].range
.last
= pi
.inode
->get_layout_size_increment();
5229 pi
.inode
->client_ranges
[client
].follows
= realm
->get_newest_seq();
5230 changed_ranges
= true;
5231 in
->mark_clientwriteable();
5232 cap
->mark_clientwriteable();
5235 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
5237 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
5238 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, in
);
5240 // make sure ino gets into the journal
5241 le
->metablob
.add_opened_ino(in
->ino());
5243 mdr
->o_trunc
= true;
5246 if (mdr
->client_request
->get_dentry_wanted()) {
5247 ceph_assert(mdr
->dn
[0].size());
5248 dn
= mdr
->dn
[0].back();
5251 journal_and_reply(mdr
, in
, dn
, le
, new C_MDS_inode_update_finish(this, mdr
, in
, old_size
> 0,
5253 // Although the `open` part can give an early reply, the truncation won't
5254 // happen until our EUpdate is persistent, to give the client a prompt
5255 // response we must also flush that event.
5260 /* This function cleans up the passed mdr */
5261 void Server::handle_client_setlayout(MDRequestRef
& mdr
)
5263 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5264 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
5267 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5268 respond_to_request(mdr
, -CEPHFS_EROFS
);
5271 if (!cur
->is_file()) {
5272 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5275 if (cur
->get_projected_inode()->size
||
5276 cur
->get_projected_inode()->truncate_seq
> 1) {
5277 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
5282 file_layout_t layout
= cur
->get_projected_inode()->layout
;
5283 // save existing layout for later
5284 const auto old_layout
= layout
;
5286 int access
= MAY_WRITE
;
5288 if (req
->head
.args
.setlayout
.layout
.fl_object_size
> 0)
5289 layout
.object_size
= req
->head
.args
.setlayout
.layout
.fl_object_size
;
5290 if (req
->head
.args
.setlayout
.layout
.fl_stripe_unit
> 0)
5291 layout
.stripe_unit
= req
->head
.args
.setlayout
.layout
.fl_stripe_unit
;
5292 if (req
->head
.args
.setlayout
.layout
.fl_stripe_count
> 0)
5293 layout
.stripe_count
=req
->head
.args
.setlayout
.layout
.fl_stripe_count
;
5294 if (req
->head
.args
.setlayout
.layout
.fl_pg_pool
> 0) {
5295 layout
.pool_id
= req
->head
.args
.setlayout
.layout
.fl_pg_pool
;
5297 // make sure we have as new a map as the client
5298 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
5299 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
5304 // Don't permit layout modifications without 'p' caps
5305 if (layout
!= old_layout
) {
5306 access
|= MAY_SET_VXATTR
;
5309 if (!layout
.is_valid()) {
5310 dout(10) << "bad layout" << dendl
;
5311 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5314 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
5315 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
5316 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5320 MutationImpl::LockOpVec lov
;
5321 lov
.add_xlock(&cur
->filelock
);
5322 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5325 if (!check_access(mdr
, cur
, access
))
5329 auto pi
= cur
->project_inode(mdr
);
5330 pi
.inode
->layout
= layout
;
5331 // add the old pool to the inode
5332 pi
.inode
->add_old_pool(old_layout
.pool_id
);
5333 pi
.inode
->version
= cur
->pre_dirty();
5334 pi
.inode
->ctime
= mdr
->get_op_stamp();
5335 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
5336 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
5337 pi
.inode
->change_attr
++;
5340 mdr
->ls
= mdlog
->get_current_segment();
5341 EUpdate
*le
= new EUpdate(mdlog
, "setlayout");
5342 mdlog
->start_entry(le
);
5343 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5344 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5345 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5347 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
5350 bool Server::xlock_policylock(MDRequestRef
& mdr
, CInode
*in
, bool want_layout
, bool xlock_snaplock
)
5352 if (mdr
->locking_state
& MutationImpl::ALL_LOCKED
)
5355 MutationImpl::LockOpVec lov
;
5356 lov
.add_xlock(&in
->policylock
);
5358 lov
.add_xlock(&in
->snaplock
);
5360 lov
.add_rdlock(&in
->snaplock
);
5361 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5364 if (want_layout
&& in
->get_projected_inode()->has_layout()) {
5365 mdr
->dir_layout
= in
->get_projected_inode()->layout
;
5366 want_layout
= false;
5368 if (CDentry
*pdn
= in
->get_projected_parent_dn(); pdn
) {
5369 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
, 0, want_layout
))
5373 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
5377 CInode
* Server::try_get_auth_inode(MDRequestRef
& mdr
, inodeno_t ino
)
5379 CInode
*in
= mdcache
->get_inode(ino
);
5380 if (!in
|| in
->state_test(CInode::STATE_PURGING
)) {
5381 respond_to_request(mdr
, -CEPHFS_ESTALE
);
5384 if (!in
->is_auth()) {
5385 mdcache
->request_forward(mdr
, in
->authority().first
);
5392 void Server::handle_client_setdirlayout(MDRequestRef
& mdr
)
5394 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5396 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5397 CInode
*cur
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
5401 if (!cur
->is_dir()) {
5402 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
5406 if (!xlock_policylock(mdr
, cur
, true))
5410 const auto& old_pi
= cur
->get_projected_inode();
5411 file_layout_t layout
;
5412 if (old_pi
->has_layout())
5413 layout
= old_pi
->layout
;
5414 else if (mdr
->dir_layout
!= file_layout_t())
5415 layout
= mdr
->dir_layout
;
5417 layout
= mdcache
->default_file_layout
;
5419 // Level of access required to complete
5420 int access
= MAY_WRITE
;
5422 const auto old_layout
= layout
;
5424 if (req
->head
.args
.setlayout
.layout
.fl_object_size
> 0)
5425 layout
.object_size
= req
->head
.args
.setlayout
.layout
.fl_object_size
;
5426 if (req
->head
.args
.setlayout
.layout
.fl_stripe_unit
> 0)
5427 layout
.stripe_unit
= req
->head
.args
.setlayout
.layout
.fl_stripe_unit
;
5428 if (req
->head
.args
.setlayout
.layout
.fl_stripe_count
> 0)
5429 layout
.stripe_count
=req
->head
.args
.setlayout
.layout
.fl_stripe_count
;
5430 if (req
->head
.args
.setlayout
.layout
.fl_pg_pool
> 0) {
5431 layout
.pool_id
= req
->head
.args
.setlayout
.layout
.fl_pg_pool
;
5432 // make sure we have as new a map as the client
5433 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
5434 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
5439 if (layout
!= old_layout
) {
5440 access
|= MAY_SET_VXATTR
;
5443 if (!layout
.is_valid()) {
5444 dout(10) << "bad layout" << dendl
;
5445 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5448 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
5449 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
5450 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5454 if (!check_access(mdr
, cur
, access
))
5457 auto pi
= cur
->project_inode(mdr
);
5458 pi
.inode
->layout
= layout
;
5459 pi
.inode
->version
= cur
->pre_dirty();
5462 mdr
->ls
= mdlog
->get_current_segment();
5463 EUpdate
*le
= new EUpdate(mdlog
, "setlayout");
5464 mdlog
->start_entry(le
);
5465 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5466 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5467 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5469 mdr
->no_early_reply
= true;
5470 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
5474 int Server::parse_layout_vxattr_json(
5475 string name
, string value
, const OSDMap
& osdmap
, file_layout_t
*layout
)
5477 auto parse_pool
= [&](std::string pool_name
, int64_t pool_id
) -> int64_t {
5478 if (pool_name
!= "") {
5479 int64_t _pool_id
= osdmap
.lookup_pg_pool_name(pool_name
);
5481 dout(10) << __func__
<< ": unknown pool name:" << pool_name
<< dendl
;
5482 return -CEPHFS_EINVAL
;
5485 } else if (pool_id
>= 0) {
5486 const auto pools
= osdmap
.get_pools();
5487 if (pools
.find(pool_id
) == pools
.end()) {
5488 dout(10) << __func__
<< ": unknown pool id:" << pool_id
<< dendl
;
5489 return -CEPHFS_EINVAL
;
5493 return -CEPHFS_EINVAL
;
5498 if (name
== "layout.json") {
5499 JSONParser json_parser
;
5500 if (json_parser
.parse(value
.c_str(), value
.length()) and json_parser
.is_object()) {
5503 field
= "object_size";
5504 JSONDecoder::decode_json("object_size", layout
->object_size
, &json_parser
, true);
5506 field
= "stripe_unit";
5507 JSONDecoder::decode_json("stripe_unit", layout
->stripe_unit
, &json_parser
, true);
5509 field
= "stripe_count";
5510 JSONDecoder::decode_json("stripe_count", layout
->stripe_count
, &json_parser
, true);
5512 field
= "pool_namespace";
5513 JSONDecoder::decode_json("pool_namespace", layout
->pool_ns
, &json_parser
, false);
5516 int64_t pool_id
= 0;
5517 JSONDecoder::decode_json("pool_id", pool_id
, &json_parser
, false);
5519 field
= "pool_name";
5520 std::string pool_name
;
5521 JSONDecoder::decode_json("pool_name", pool_name
, &json_parser
, false);
5523 pool_id
= parse_pool(pool_name
, pool_id
);
5525 return (int)pool_id
;
5527 layout
->pool_id
= pool_id
;
5528 } catch (JSONDecoder::err
&) {
5529 dout(10) << __func__
<< ": json is missing a mandatory field named "
5531 return -CEPHFS_EINVAL
;
5534 dout(10) << __func__
<< ": bad json" << dendl
;
5535 return -CEPHFS_EINVAL
;
5538 dout(10) << __func__
<< ": unknown layout vxattr " << name
<< dendl
;
5539 return -CEPHFS_ENODATA
; // no such attribute
5541 } catch (boost::bad_lexical_cast
const&) {
5542 dout(10) << __func__
<< ": bad vxattr value:" << value
5543 << ", unable to parse for xattr:" << name
<< dendl
;
5544 return -CEPHFS_EINVAL
;
5549 // parse old style layout string
5550 int Server::parse_layout_vxattr_string(
5551 string name
, string value
, const OSDMap
& osdmap
, file_layout_t
*layout
)
5554 if (name
== "layout") {
5555 string::iterator begin
= value
.begin();
5556 string::iterator end
= value
.end();
5557 keys_and_values
<string::iterator
> p
; // create instance of parser
5558 std::map
<string
, string
> m
; // map to receive results
5559 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
5560 return -CEPHFS_EINVAL
;
5562 string
left(begin
, end
);
5563 dout(10) << __func__
<< ": parsed " << m
<< " left '" << left
<< "'" << dendl
;
5565 return -CEPHFS_EINVAL
;
5566 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
5567 // Skip validation on each attr, we do it once at the end (avoid
5568 // rejecting intermediate states if the overall result is ok)
5569 int r
= parse_layout_vxattr_string(string("layout.") + q
->first
, q
->second
,
5574 } else if (name
== "layout.object_size") {
5575 layout
->object_size
= boost::lexical_cast
<unsigned>(value
);
5576 } else if (name
== "layout.stripe_unit") {
5577 layout
->stripe_unit
= boost::lexical_cast
<unsigned>(value
);
5578 } else if (name
== "layout.stripe_count") {
5579 layout
->stripe_count
= boost::lexical_cast
<unsigned>(value
);
5580 } else if (name
== "layout.pool") {
5582 layout
->pool_id
= boost::lexical_cast
<unsigned>(value
);
5583 } catch (boost::bad_lexical_cast
const&) {
5584 int64_t pool
= osdmap
.lookup_pg_pool_name(value
);
5586 dout(10) << __func__
<< ": unknown pool " << value
<< dendl
;
5587 return -CEPHFS_ENOENT
;
5589 layout
->pool_id
= pool
;
5591 } else if (name
== "layout.pool_id") {
5592 layout
->pool_id
= boost::lexical_cast
<int64_t>(value
);
5593 } else if (name
== "layout.pool_name") {
5594 layout
->pool_id
= osdmap
.lookup_pg_pool_name(value
);
5595 if (layout
->pool_id
< 0) {
5596 dout(10) << __func__
<< ": unknown pool " << value
<< dendl
;
5597 return -CEPHFS_EINVAL
;
5599 } else if (name
== "layout.pool_namespace") {
5600 layout
->pool_ns
= value
;
5602 dout(10) << __func__
<< ": unknown layout vxattr " << name
<< dendl
;
5603 return -CEPHFS_ENODATA
; // no such attribute
5605 } catch (boost::bad_lexical_cast
const&) {
5606 dout(10) << __func__
<< ": bad vxattr value, unable to parse int for "
5608 return -CEPHFS_EINVAL
;
5613 int Server::parse_layout_vxattr(string name
, string value
, const OSDMap
& osdmap
,
5614 file_layout_t
*layout
, bool validate
)
5616 dout(20) << __func__
<< ": name:" << name
<< " value:'" << value
<< "'" << dendl
;
5619 if (name
== "layout.json") {
5620 r
= parse_layout_vxattr_json(name
, value
, osdmap
, layout
);
5622 r
= parse_layout_vxattr_string(name
, value
, osdmap
, layout
);
5628 if (validate
&& !layout
->is_valid()) {
5629 dout(10) << __func__
<< ": bad layout" << dendl
;
5630 return -CEPHFS_EINVAL
;
5632 if (!mds
->mdsmap
->is_data_pool(layout
->pool_id
)) {
5633 dout(10) << __func__
<< ": invalid data pool " << layout
->pool_id
<< dendl
;
5634 return -CEPHFS_EINVAL
;
5639 int Server::parse_quota_vxattr(string name
, string value
, quota_info_t
*quota
)
5641 dout(20) << "parse_quota_vxattr name " << name
<< " value '" << value
<< "'" << dendl
;
5643 if (name
== "quota") {
5644 string::iterator begin
= value
.begin();
5645 string::iterator end
= value
.end();
5647 // keep quota unchanged. (for create_quota_realm())
5650 keys_and_values
<string::iterator
> p
; // create instance of parser
5651 std::map
<string
, string
> m
; // map to receive results
5652 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
5653 return -CEPHFS_EINVAL
;
5655 string
left(begin
, end
);
5656 dout(10) << " parsed " << m
<< " left '" << left
<< "'" << dendl
;
5658 return -CEPHFS_EINVAL
;
5659 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
5660 int r
= parse_quota_vxattr(string("quota.") + q
->first
, q
->second
, quota
);
5664 } else if (name
== "quota.max_bytes") {
5665 int64_t q
= boost::lexical_cast
<int64_t>(value
);
5667 return -CEPHFS_EINVAL
;
5668 quota
->max_bytes
= q
;
5669 } else if (name
== "quota.max_files") {
5670 int64_t q
= boost::lexical_cast
<int64_t>(value
);
5672 return -CEPHFS_EINVAL
;
5673 quota
->max_files
= q
;
5675 dout(10) << " unknown quota vxattr " << name
<< dendl
;
5676 return -CEPHFS_EINVAL
;
5678 } catch (boost::bad_lexical_cast
const&) {
5679 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
5680 return -CEPHFS_EINVAL
;
5683 if (!quota
->is_valid()) {
5684 dout(10) << "bad quota" << dendl
;
5685 return -CEPHFS_EINVAL
;
5690 void Server::create_quota_realm(CInode
*in
)
5692 dout(10) << __func__
<< " " << *in
<< dendl
;
5694 auto req
= make_message
<MClientRequest
>(CEPH_MDS_OP_SETXATTR
);
5695 req
->set_filepath(filepath(in
->ino()));
5696 req
->set_string2("ceph.quota");
5697 // empty vxattr value
5698 req
->set_tid(mds
->issue_tid());
5700 mds
->send_message_mds(req
, in
->authority().first
);
5704 * Verify that the file layout attribute carried by client
5705 * is well-formatted.
5706 * Return 0 on success, otherwise this function takes
5707 * responsibility for the passed mdr.
5709 int Server::check_layout_vxattr(MDRequestRef
& mdr
,
5712 file_layout_t
*layout
)
5714 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5718 mds
->objecter
->with_osdmap([&](const OSDMap
& osdmap
) {
5719 r
= parse_layout_vxattr(name
, value
, osdmap
, layout
);
5720 epoch
= osdmap
.get_epoch();
5723 if (r
== -CEPHFS_ENOENT
) {
5725 // we don't have the specified pool, make sure our map
5726 // is newer than or as new as the client.
5727 epoch_t req_epoch
= req
->get_osdmap_epoch();
5729 if (req_epoch
> epoch
) {
5731 // well, our map is older. consult mds.
5732 auto fin
= new C_IO_Wrapper(mds
, new C_MDS_RetryRequest(mdcache
, mdr
));
5734 mds
->objecter
->wait_for_map(req_epoch
, lambdafy(fin
));
5736 } else if (req_epoch
== 0 && !mdr
->waited_for_osdmap
) {
5738 // For compatibility with client w/ old code, we still need get the
5739 // latest map. One day if COMPACT_VERSION of MClientRequest >=3,
5740 // we can remove those code.
5741 mdr
->waited_for_osdmap
= true;
5742 mds
->objecter
->wait_for_latest_osdmap(std::ref(*new C_IO_Wrapper(
5743 mds
, new C_MDS_RetryRequest(mdcache
, mdr
))));
5750 if (r
== -CEPHFS_ENOENT
)
5753 respond_to_request(mdr
, r
);
5761 void Server::handle_set_vxattr(MDRequestRef
& mdr
, CInode
*cur
)
5763 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5764 string
name(req
->get_path2());
5765 bufferlist bl
= req
->get_data();
5766 string
value (bl
.c_str(), bl
.length());
5767 dout(10) << "handle_set_vxattr " << name
5768 << " val " << value
.length()
5769 << " bytes on " << *cur
5772 CInode::mempool_inode
*pip
= nullptr;
5775 if (!check_access(mdr
, cur
, MAY_SET_VXATTR
)) {
5779 bool adjust_realm
= false;
5780 if (name
.compare(0, 15, "ceph.dir.layout") == 0) {
5781 if (!cur
->is_dir()) {
5782 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5786 if (!xlock_policylock(mdr
, cur
, true))
5789 file_layout_t layout
;
5790 if (cur
->get_projected_inode()->has_layout())
5791 layout
= cur
->get_projected_inode()->layout
;
5792 else if (mdr
->dir_layout
!= file_layout_t())
5793 layout
= mdr
->dir_layout
;
5795 layout
= mdcache
->default_file_layout
;
5797 rest
= name
.substr(name
.find("layout"));
5798 if (check_layout_vxattr(mdr
, rest
, value
, &layout
) < 0)
5801 auto pi
= cur
->project_inode(mdr
);
5802 pi
.inode
->layout
= layout
;
5803 mdr
->no_early_reply
= true;
5804 pip
= pi
.inode
.get();
5805 } else if (name
.compare(0, 16, "ceph.file.layout") == 0) {
5806 if (!cur
->is_file()) {
5807 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5810 if (cur
->get_projected_inode()->size
||
5811 cur
->get_projected_inode()->truncate_seq
> 1) {
5812 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
5815 file_layout_t layout
= cur
->get_projected_inode()->layout
;
5816 rest
= name
.substr(name
.find("layout"));
5817 if (check_layout_vxattr(mdr
, rest
, value
, &layout
) < 0)
5820 MutationImpl::LockOpVec lov
;
5821 lov
.add_xlock(&cur
->filelock
);
5822 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5825 auto pi
= cur
->project_inode(mdr
);
5826 int64_t old_pool
= pi
.inode
->layout
.pool_id
;
5827 pi
.inode
->add_old_pool(old_pool
);
5828 pi
.inode
->layout
= layout
;
5829 pip
= pi
.inode
.get();
5830 } else if (name
.compare(0, 10, "ceph.quota") == 0) {
5831 if (!cur
->is_dir()) {
5832 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5836 quota_info_t quota
= cur
->get_projected_inode()->quota
;
5838 rest
= name
.substr(name
.find("quota"));
5839 int r
= parse_quota_vxattr(rest
, value
, "a
);
5841 respond_to_request(mdr
, r
);
5845 if (quota
.is_enable() && !cur
->get_projected_srnode())
5846 adjust_realm
= true;
5848 if (!xlock_policylock(mdr
, cur
, false, adjust_realm
))
5851 if (cur
->get_projected_inode()->quota
== quota
) {
5852 respond_to_request(mdr
, 0);
5856 auto pi
= cur
->project_inode(mdr
, false, adjust_realm
);
5857 pi
.inode
->quota
= quota
;
5860 pi
.snapnode
->created
= pi
.snapnode
->seq
= cur
->find_snaprealm()->get_newest_seq();
5862 mdr
->no_early_reply
= true;
5863 pip
= pi
.inode
.get();
5865 client_t exclude_ct
= mdr
->get_client();
5866 mdcache
->broadcast_quota_to_client(cur
, exclude_ct
, true);
5867 } else if (name
== "ceph.dir.subvolume"sv
) {
5868 if (!cur
->is_dir()) {
5869 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5875 val
= boost::lexical_cast
<bool>(value
);
5876 } catch (boost::bad_lexical_cast
const&) {
5877 dout(10) << "bad vxattr value, unable to parse bool for " << name
<< dendl
;
5878 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5882 /* Verify it's not already a subvolume with lighter weight
5885 if (!mdr
->more()->rdonly_checks
) {
5886 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
5887 MutationImpl::LockOpVec lov
;
5888 lov
.add_rdlock(&cur
->snaplock
);
5889 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5891 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
5893 const auto srnode
= cur
->get_projected_srnode();
5894 if (val
== (srnode
&& srnode
->is_subvolume())) {
5895 dout(20) << "already marked subvolume" << dendl
;
5896 respond_to_request(mdr
, 0);
5899 mdr
->more()->rdonly_checks
= true;
5902 if ((mdr
->locking_state
& MutationImpl::ALL_LOCKED
) && !mdr
->is_xlocked(&cur
->snaplock
)) {
5903 /* drop the rdlock and acquire xlocks */
5904 dout(20) << "dropping rdlocks" << dendl
;
5905 mds
->locker
->drop_locks(mdr
.get());
5906 if (!xlock_policylock(mdr
, cur
, false, true))
5910 /* repeat rdonly checks in case changed between rdlock -> xlock */
5911 SnapRealm
*realm
= cur
->find_snaprealm();
5913 inodeno_t subvol_ino
= realm
->get_subvolume_ino();
5914 // can't create subvolume inside another subvolume
5915 if (subvol_ino
&& subvol_ino
!= cur
->ino()) {
5916 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5921 const auto srnode
= cur
->get_projected_srnode();
5922 if (val
== (srnode
&& srnode
->is_subvolume())) {
5923 respond_to_request(mdr
, 0);
5927 auto pi
= cur
->project_inode(mdr
, false, true);
5929 pi
.snapnode
->created
= pi
.snapnode
->seq
= realm
->get_newest_seq();
5931 pi
.snapnode
->mark_subvolume();
5933 pi
.snapnode
->clear_subvolume();
5935 mdr
->no_early_reply
= true;
5936 pip
= pi
.inode
.get();
5937 adjust_realm
= true;
5938 } else if (name
== "ceph.dir.pin"sv
) {
5939 if (!cur
->is_dir() || cur
->is_root()) {
5940 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5946 rank
= boost::lexical_cast
<mds_rank_t
>(value
);
5947 if (rank
< 0) rank
= MDS_RANK_NONE
;
5948 else if (rank
>= MAX_MDS
) {
5949 respond_to_request(mdr
, -CEPHFS_EDOM
);
5952 } catch (boost::bad_lexical_cast
const&) {
5953 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
5954 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5958 if (!xlock_policylock(mdr
, cur
))
5961 auto pi
= cur
->project_inode(mdr
);
5962 cur
->set_export_pin(rank
);
5963 pip
= pi
.inode
.get();
5964 } else if (name
== "ceph.dir.pin.random"sv
) {
5965 if (!cur
->is_dir() || cur
->is_root()) {
5966 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5972 val
= boost::lexical_cast
<double>(value
);
5973 } catch (boost::bad_lexical_cast
const&) {
5974 dout(10) << "bad vxattr value, unable to parse float for " << name
<< dendl
;
5975 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5979 if (val
< 0.0 || 1.0 < val
) {
5980 respond_to_request(mdr
, -CEPHFS_EDOM
);
5982 } else if (mdcache
->export_ephemeral_random_max
< val
) {
5983 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5987 if (!xlock_policylock(mdr
, cur
))
5990 auto pi
= cur
->project_inode(mdr
);
5991 cur
->setxattr_ephemeral_rand(val
);
5992 pip
= pi
.inode
.get();
5993 } else if (name
== "ceph.dir.pin.distributed"sv
) {
5994 if (!cur
->is_dir() || cur
->is_root()) {
5995 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6001 val
= boost::lexical_cast
<bool>(value
);
6002 } catch (boost::bad_lexical_cast
const&) {
6003 dout(10) << "bad vxattr value, unable to parse bool for " << name
<< dendl
;
6004 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6008 if (!xlock_policylock(mdr
, cur
))
6011 auto pi
= cur
->project_inode(mdr
);
6012 cur
->setxattr_ephemeral_dist(val
);
6013 pip
= pi
.inode
.get();
6015 dout(10) << " unknown vxattr " << name
<< dendl
;
6016 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6021 pip
->ctime
= mdr
->get_op_stamp();
6022 if (mdr
->get_op_stamp() > pip
->rstat
.rctime
)
6023 pip
->rstat
.rctime
= mdr
->get_op_stamp();
6024 pip
->version
= cur
->pre_dirty();
6026 pip
->update_backtrace();
6029 mdr
->ls
= mdlog
->get_current_segment();
6030 EUpdate
*le
= new EUpdate(mdlog
, "set vxattr layout");
6031 mdlog
->start_entry(le
);
6032 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6033 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
6034 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
6036 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
,
6037 false, false, adjust_realm
));
6041 void Server::handle_remove_vxattr(MDRequestRef
& mdr
, CInode
*cur
)
6043 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6044 string
name(req
->get_path2());
6046 dout(10) << __func__
<< " " << name
<< " on " << *cur
<< dendl
;
6048 if (name
== "ceph.dir.layout") {
6049 if (!cur
->is_dir()) {
6050 respond_to_request(mdr
, -CEPHFS_ENODATA
);
6053 if (cur
->is_root()) {
6054 dout(10) << "can't remove layout policy on the root directory" << dendl
;
6055 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6059 if (!cur
->get_projected_inode()->has_layout()) {
6060 respond_to_request(mdr
, -CEPHFS_ENODATA
);
6064 MutationImpl::LockOpVec lov
;
6065 lov
.add_xlock(&cur
->policylock
);
6066 if (!mds
->locker
->acquire_locks(mdr
, lov
))
6069 auto pi
= cur
->project_inode(mdr
);
6070 pi
.inode
->clear_layout();
6071 pi
.inode
->version
= cur
->pre_dirty();
6074 mdr
->ls
= mdlog
->get_current_segment();
6075 EUpdate
*le
= new EUpdate(mdlog
, "remove dir layout vxattr");
6076 mdlog
->start_entry(le
);
6077 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6078 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
6079 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
6081 mdr
->no_early_reply
= true;
6082 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
6084 } else if (name
== "ceph.dir.layout.pool_namespace"
6085 || name
== "ceph.file.layout.pool_namespace") {
6086 // Namespace is the only layout field that has a meaningful
6087 // null/none value (empty string, means default layout). Is equivalent
6088 // to a setxattr with empty string: pass through the empty payload of
6089 // the rmxattr request to do this.
6090 handle_set_vxattr(mdr
, cur
);
6094 respond_to_request(mdr
, -CEPHFS_ENODATA
);
6097 const Server::XattrHandler
Server::xattr_handlers
[] = {
6099 xattr_name
: Server::DEFAULT_HANDLER
,
6100 description
: "default xattr handler",
6101 validate
: &Server::default_xattr_validate
,
6102 setxattr
: &Server::default_setxattr_handler
,
6103 removexattr
: &Server::default_removexattr_handler
,
6106 xattr_name
: "ceph.mirror.info",
6107 description
: "mirror info xattr handler",
6108 validate
: &Server::mirror_info_xattr_validate
,
6109 setxattr
: &Server::mirror_info_setxattr_handler
,
6110 removexattr
: &Server::mirror_info_removexattr_handler
6114 const Server::XattrHandler
* Server::get_xattr_or_default_handler(std::string_view xattr_name
) {
6115 const XattrHandler
*default_xattr_handler
= nullptr;
6117 for (auto &handler
: xattr_handlers
) {
6118 if (handler
.xattr_name
== Server::DEFAULT_HANDLER
) {
6119 ceph_assert(default_xattr_handler
== nullptr);
6120 default_xattr_handler
= &handler
;
6122 if (handler
.xattr_name
== xattr_name
) {
6123 dout(20) << "handler=" << handler
.description
<< dendl
;
6128 ceph_assert(default_xattr_handler
!= nullptr);
6129 dout(20) << "handler=" << default_xattr_handler
->description
<< dendl
;
6130 return default_xattr_handler
;
6133 int Server::xattr_validate(CInode
*cur
, const InodeStoreBase::xattr_map_const_ptr xattrs
,
6134 const std::string
&xattr_name
, int op
, int flags
) {
6135 if (op
== CEPH_MDS_OP_SETXATTR
) {
6137 if ((flags
& CEPH_XATTR_CREATE
) && xattrs
->count(mempool::mds_co::string(xattr_name
))) {
6138 dout(10) << "setxattr '" << xattr_name
<< "' XATTR_CREATE and CEPHFS_EEXIST on " << *cur
<< dendl
;
6139 return -CEPHFS_EEXIST
;
6142 if ((flags
& CEPH_XATTR_REPLACE
) && !(xattrs
&& xattrs
->count(mempool::mds_co::string(xattr_name
)))) {
6143 dout(10) << "setxattr '" << xattr_name
<< "' XATTR_REPLACE and CEPHFS_ENODATA on " << *cur
<< dendl
;
6144 return -CEPHFS_ENODATA
;
6150 if (op
== CEPH_MDS_OP_RMXATTR
) {
6151 if (!xattrs
|| xattrs
->count(mempool::mds_co::string(xattr_name
)) == 0) {
6152 dout(10) << "removexattr '" << xattr_name
<< "' and CEPHFS_ENODATA on " << *cur
<< dendl
;
6153 return -CEPHFS_ENODATA
;
6159 derr
<< ": unhandled validation for: " << xattr_name
<< dendl
;
6160 return -CEPHFS_EINVAL
;
6163 void Server::xattr_set(InodeStoreBase::xattr_map_ptr xattrs
, const std::string
&xattr_name
,
6164 const bufferlist
&xattr_value
) {
6165 size_t len
= xattr_value
.length();
6166 bufferptr b
= buffer::create(len
);
6168 xattr_value
.begin().copy(len
, b
.c_str());
6170 auto em
= xattrs
->emplace(std::piecewise_construct
,
6171 std::forward_as_tuple(mempool::mds_co::string(xattr_name
)),
6172 std::forward_as_tuple(b
));
6174 em
.first
->second
= b
;
6178 void Server::xattr_rm(InodeStoreBase::xattr_map_ptr xattrs
, const std::string
&xattr_name
) {
6179 xattrs
->erase(mempool::mds_co::string(xattr_name
));
6182 int Server::default_xattr_validate(CInode
*cur
, const InodeStoreBase::xattr_map_const_ptr xattrs
,
6183 XattrOp
*xattr_op
) {
6184 return xattr_validate(cur
, xattrs
, xattr_op
->xattr_name
, xattr_op
->op
, xattr_op
->flags
);
6187 void Server::default_setxattr_handler(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
6188 const XattrOp
&xattr_op
) {
6189 xattr_set(xattrs
, xattr_op
.xattr_name
, xattr_op
.xattr_value
);
6192 void Server::default_removexattr_handler(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
6193 const XattrOp
&xattr_op
) {
6194 xattr_rm(xattrs
, xattr_op
.xattr_name
);
6197 // mirror info xattr handlers
6198 const std::string
Server::MirrorXattrInfo::MIRROR_INFO_REGEX
= "^cluster_id=([a-f0-9]{8}-" \
6199 "[a-f0-9]{4}-[a-f0-9]{4}-" \
6200 "[a-f0-9]{4}-[a-f0-9]{12})" \
6202 const std::string
Server::MirrorXattrInfo::CLUSTER_ID
= "ceph.mirror.info.cluster_id";
6203 const std::string
Server::MirrorXattrInfo::FS_ID
= "ceph.mirror.info.fs_id";
6204 int Server::parse_mirror_info_xattr(const std::string
&name
, const std::string
&value
,
6205 std::string
&cluster_id
, std::string
&fs_id
) {
6206 dout(20) << "parsing name=" << name
<< ", value=" << value
<< dendl
;
6208 static const std::regex
regex(Server::MirrorXattrInfo::MIRROR_INFO_REGEX
);
6211 std::regex_search(value
, match
, regex
);
6212 if (match
.size() != 3) {
6213 derr
<< "mirror info parse error" << dendl
;
6214 return -CEPHFS_EINVAL
;
6217 cluster_id
= match
[1];
6219 dout(20) << " parsed cluster_id=" << cluster_id
<< ", fs_id=" << fs_id
<< dendl
;
6223 int Server::mirror_info_xattr_validate(CInode
*cur
, const InodeStoreBase::xattr_map_const_ptr xattrs
,
6224 XattrOp
*xattr_op
) {
6225 if (!cur
->is_root()) {
6226 return -CEPHFS_EINVAL
;
6229 int v1
= xattr_validate(cur
, xattrs
, Server::MirrorXattrInfo::CLUSTER_ID
, xattr_op
->op
, xattr_op
->flags
);
6230 int v2
= xattr_validate(cur
, xattrs
, Server::MirrorXattrInfo::FS_ID
, xattr_op
->op
, xattr_op
->flags
);
6232 derr
<< "inconsistent mirror info state (" << v1
<< "," << v2
<< ")" << dendl
;
6233 return -CEPHFS_EINVAL
;
6240 if (xattr_op
->op
== CEPH_MDS_OP_RMXATTR
) {
6244 std::string cluster_id
;
6246 int r
= parse_mirror_info_xattr(xattr_op
->xattr_name
, xattr_op
->xattr_value
.to_str(),
6252 xattr_op
->xinfo
= std::make_unique
<MirrorXattrInfo
>(cluster_id
, fs_id
);
6256 void Server::mirror_info_setxattr_handler(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
6257 const XattrOp
&xattr_op
) {
6258 auto mirror_info
= dynamic_cast<MirrorXattrInfo
&>(*(xattr_op
.xinfo
));
6261 bl
.append(mirror_info
.cluster_id
.c_str(), mirror_info
.cluster_id
.length());
6262 xattr_set(xattrs
, Server::MirrorXattrInfo::CLUSTER_ID
, bl
);
6265 bl
.append(mirror_info
.fs_id
.c_str(), mirror_info
.fs_id
.length());
6266 xattr_set(xattrs
, Server::MirrorXattrInfo::FS_ID
, bl
);
6269 void Server::mirror_info_removexattr_handler(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
6270 const XattrOp
&xattr_op
) {
6271 xattr_rm(xattrs
, Server::MirrorXattrInfo::CLUSTER_ID
);
6272 xattr_rm(xattrs
, Server::MirrorXattrInfo::FS_ID
);
6275 void Server::handle_client_setxattr(MDRequestRef
& mdr
)
6277 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6278 string
name(req
->get_path2());
6280 // is a ceph virtual xattr?
6281 if (is_ceph_vxattr(name
)) {
6282 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6283 CInode
*cur
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
6287 handle_set_vxattr(mdr
, cur
);
6291 if (!is_allowed_ceph_xattr(name
)) {
6292 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6296 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
6300 if (mdr
->snapid
!= CEPH_NOSNAP
) {
6301 respond_to_request(mdr
, -CEPHFS_EROFS
);
6305 int flags
= req
->head
.args
.setxattr
.flags
;
6307 MutationImpl::LockOpVec lov
;
6308 lov
.add_xlock(&cur
->xattrlock
);
6309 if (!mds
->locker
->acquire_locks(mdr
, lov
))
6312 if (!check_access(mdr
, cur
, MAY_WRITE
))
6315 size_t len
= req
->get_data().length();
6316 size_t inc
= len
+ name
.length();
6318 auto handler
= Server::get_xattr_or_default_handler(name
);
6319 const auto& pxattrs
= cur
->get_projected_xattrs();
6321 // check xattrs kv pairs size
6322 size_t cur_xattrs_size
= 0;
6323 for (const auto& p
: *pxattrs
) {
6324 if ((flags
& CEPH_XATTR_REPLACE
) && name
.compare(p
.first
) == 0) {
6327 cur_xattrs_size
+= p
.first
.length() + p
.second
.length();
6330 if (((cur_xattrs_size
+ inc
) > g_conf()->mds_max_xattr_pairs_size
)) {
6331 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
6332 << cur_xattrs_size
<< ", inc " << inc
<< dendl
;
6333 respond_to_request(mdr
, -CEPHFS_ENOSPC
);
6338 XattrOp
xattr_op(CEPH_MDS_OP_SETXATTR
, name
, req
->get_data(), flags
);
6339 int r
= std::invoke(handler
->validate
, this, cur
, pxattrs
, &xattr_op
);
6341 respond_to_request(mdr
, r
);
6345 dout(10) << "setxattr '" << name
<< "' len " << len
<< " on " << *cur
<< dendl
;
6348 auto pi
= cur
->project_inode(mdr
, true);
6349 pi
.inode
->version
= cur
->pre_dirty();
6350 pi
.inode
->ctime
= mdr
->get_op_stamp();
6351 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
6352 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
6353 if (name
== "encryption.ctx"sv
)
6354 pi
.inode
->fscrypt
= true;
6355 pi
.inode
->change_attr
++;
6356 pi
.inode
->xattr_version
++;
6358 if ((flags
& CEPH_XATTR_REMOVE
)) {
6359 std::invoke(handler
->removexattr
, this, cur
, pi
.xattrs
, xattr_op
);
6361 std::invoke(handler
->setxattr
, this, cur
, pi
.xattrs
, xattr_op
);
6365 mdr
->ls
= mdlog
->get_current_segment();
6366 EUpdate
*le
= new EUpdate(mdlog
, "setxattr");
6367 mdlog
->start_entry(le
);
6368 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6369 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
6370 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
6372 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
6375 void Server::handle_client_removexattr(MDRequestRef
& mdr
)
6377 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6378 std::string
name(req
->get_path2());
6380 // is a ceph virtual xattr?
6381 if (is_ceph_vxattr(name
)) {
6382 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6383 CInode
*cur
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
6387 handle_remove_vxattr(mdr
, cur
);
6391 if (!is_allowed_ceph_xattr(name
)) {
6392 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6396 CInode
* cur
= rdlock_path_pin_ref(mdr
, true);
6400 if (mdr
->snapid
!= CEPH_NOSNAP
) {
6401 respond_to_request(mdr
, -CEPHFS_EROFS
);
6405 MutationImpl::LockOpVec lov
;
6406 lov
.add_xlock(&cur
->xattrlock
);
6407 if (!mds
->locker
->acquire_locks(mdr
, lov
))
6411 auto handler
= Server::get_xattr_or_default_handler(name
);
6413 XattrOp
xattr_op(CEPH_MDS_OP_RMXATTR
, name
, bl
, 0);
6415 const auto& pxattrs
= cur
->get_projected_xattrs();
6416 int r
= std::invoke(handler
->validate
, this, cur
, pxattrs
, &xattr_op
);
6418 respond_to_request(mdr
, r
);
6422 dout(10) << "removexattr '" << name
<< "' on " << *cur
<< dendl
;
6425 auto pi
= cur
->project_inode(mdr
, true);
6426 pi
.inode
->version
= cur
->pre_dirty();
6427 pi
.inode
->ctime
= mdr
->get_op_stamp();
6428 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
6429 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
6430 pi
.inode
->change_attr
++;
6431 pi
.inode
->xattr_version
++;
6432 std::invoke(handler
->removexattr
, this, cur
, pi
.xattrs
, xattr_op
);
6435 mdr
->ls
= mdlog
->get_current_segment();
6436 EUpdate
*le
= new EUpdate(mdlog
, "removexattr");
6437 mdlog
->start_entry(le
);
6438 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6439 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
6440 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
6442 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
6445 void Server::handle_client_getvxattr(MDRequestRef
& mdr
)
6447 const auto& req
= mdr
->client_request
;
6448 string xattr_name
{req
->get_path2()};
6450 // is a ceph virtual xattr?
6451 if (!is_ceph_vxattr(xattr_name
)) {
6452 respond_to_request(mdr
, -CEPHFS_ENODATA
);
6456 CInode
*cur
= rdlock_path_pin_ref(mdr
, true, false);
6461 if (is_ceph_dir_vxattr(xattr_name
)) {
6462 if (!cur
->is_dir()) {
6463 respond_to_request(mdr
, -CEPHFS_ENODATA
);
6466 } else if (is_ceph_file_vxattr(xattr_name
)) {
6467 if (cur
->is_dir()) {
6468 respond_to_request(mdr
, -CEPHFS_ENODATA
);
6473 CachedStackStringStream css
;
6475 ceph::bufferlist bl
;
6476 // handle these vxattrs
6477 if ((xattr_name
.substr(0, 15) == "ceph.dir.layout"sv
) ||
6478 (xattr_name
.substr(0, 16) == "ceph.file.layout"sv
)) {
6479 std::string layout_field
;
6481 struct layout_xattr_info_t
{
6482 enum class InheritanceStatus
: uint32_t {
6488 const file_layout_t layout
;
6489 const InheritanceStatus status
;
6491 layout_xattr_info_t(const file_layout_t
& l
, InheritanceStatus inh
)
6492 : layout(l
), status(inh
) { }
6494 static std::string
status_to_string(InheritanceStatus status
) {
6496 case InheritanceStatus::DEFAULT
: return "default"s
;
6497 case InheritanceStatus::SET
: return "set"s
;
6498 case InheritanceStatus::INHERITED
: return "inherited"s
;
6499 default: return "unknown"s
;
6504 auto is_default_layout
= [&](const file_layout_t
& layout
) -> bool {
6505 return (layout
== mdcache
->default_file_layout
);
6507 auto get_inherited_layout
= [&](CInode
*cur
) -> layout_xattr_info_t
{
6511 if (cur
->get_projected_inode()->has_layout()) {
6512 auto& curr_layout
= cur
->get_projected_inode()->layout
;
6513 if (is_default_layout(curr_layout
)) {
6514 return {curr_layout
, layout_xattr_info_t::InheritanceStatus::DEFAULT
};
6516 if (cur
== orig_in
) {
6517 // we've found a new layout at this inode
6518 return {curr_layout
, layout_xattr_info_t::InheritanceStatus::SET
};
6520 return {curr_layout
, layout_xattr_info_t::InheritanceStatus::INHERITED
};
6524 if (cur
->is_root()) {
6528 cur
= cur
->get_projected_parent_dir()->get_inode();
6530 mds
->clog
->error() << "no layout found at root dir!";
6531 ceph_abort("no layout found at root dir! something is really messed up with layouts!");
6534 if (xattr_name
== "ceph.dir.layout.json"sv
||
6535 xattr_name
== "ceph.file.layout.json"sv
) {
6536 // fetch layout only for valid xattr_name
6537 const auto lxi
= get_inherited_layout(cur
);
6539 *css
<< "{\"stripe_unit\": " << lxi
.layout
.stripe_unit
6540 << ", \"stripe_count\": " << lxi
.layout
.stripe_count
6541 << ", \"object_size\": " << lxi
.layout
.object_size
6542 << ", \"pool_name\": ";
6543 mds
->objecter
->with_osdmap([lxi
, &css
](const OSDMap
& o
) {
6545 if (o
.have_pg_pool(lxi
.layout
.pool_id
)) {
6546 *css
<< o
.get_pool_name(lxi
.layout
.pool_id
);
6550 *css
<< ", \"pool_id\": " << (uint64_t)lxi
.layout
.pool_id
;
6551 *css
<< ", \"pool_namespace\": \"" << lxi
.layout
.pool_ns
<< "\"";
6552 *css
<< ", \"inheritance\": \"@"
6553 << layout_xattr_info_t::status_to_string(lxi
.status
) << "\"}";
6554 } else if ((xattr_name
== "ceph.dir.layout.pool_name"sv
) ||
6555 (xattr_name
== "ceph.file.layout.pool_name"sv
)) {
6556 // fetch layout only for valid xattr_name
6557 const auto lxi
= get_inherited_layout(cur
);
6558 mds
->objecter
->with_osdmap([lxi
, &css
](const OSDMap
& o
) {
6559 if (o
.have_pg_pool(lxi
.layout
.pool_id
)) {
6560 *css
<< o
.get_pool_name(lxi
.layout
.pool_id
);
6563 } else if ((xattr_name
== "ceph.dir.layout.pool_id"sv
) ||
6564 (xattr_name
== "ceph.file.layout.pool_id"sv
)) {
6565 // fetch layout only for valid xattr_name
6566 const auto lxi
= get_inherited_layout(cur
);
6567 *css
<< (uint64_t)lxi
.layout
.pool_id
;
6569 r
= -CEPHFS_ENODATA
; // no such attribute
6571 } else if (xattr_name
.substr(0, 12) == "ceph.dir.pin"sv
) {
6572 if (xattr_name
== "ceph.dir.pin"sv
) {
6573 *css
<< cur
->get_projected_inode()->export_pin
;
6574 } else if (xattr_name
== "ceph.dir.pin.random"sv
) {
6575 *css
<< cur
->get_projected_inode()->export_ephemeral_random_pin
;
6576 } else if (xattr_name
== "ceph.dir.pin.distributed"sv
) {
6577 *css
<< cur
->get_projected_inode()->export_ephemeral_distributed_pin
;
6579 // otherwise respond as invalid request
6580 // since we only handle ceph vxattrs here
6581 r
= -CEPHFS_ENODATA
; // no such attribute
6584 // otherwise respond as invalid request
6585 // since we only handle ceph vxattrs here
6586 r
= -CEPHFS_ENODATA
; // no such attribute
6590 ENCODE_START(1, 1, bl
);
6591 encode(css
->strv(), bl
);
6593 mdr
->reply_extra_bl
= bl
;
6596 respond_to_request(mdr
, r
);
6599 // =================================================================
6600 // DIRECTORY and NAMESPACE OPS
6603 // ------------------------------------------------
6607 class C_MDS_mknod_finish
: public ServerLogContext
{
6611 C_MDS_mknod_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ni
) :
6612 ServerLogContext(s
, r
), dn(d
), newi(ni
) {}
6613 void finish(int r
) override
{
6614 ceph_assert(r
== 0);
6617 dn
->pop_projected_linkage();
6619 // be a bit hacky with the inode version, here.. we decrement it
6620 // just to keep mark_dirty() happen. (we didn't bother projecting
6621 // a new version of hte inode since it's just been created)
6622 newi
->mark_dirty(mdr
->ls
);
6623 newi
->mark_dirty_parent(mdr
->ls
, true);
6626 if (newi
->is_dir()) {
6627 CDir
*dir
= newi
->get_dirfrag(frag_t());
6629 dir
->mark_dirty(mdr
->ls
);
6630 dir
->mark_new(mdr
->ls
);
6635 MDRequestRef null_ref
;
6636 get_mds()->mdcache
->send_dentry_link(dn
, null_ref
);
6638 if (newi
->is_file()) {
6639 get_mds()->locker
->share_inode_max_size(newi
);
6640 } else if (newi
->is_dir()) {
6641 // We do this now so that the linkages on the new directory are stable.
6642 newi
->maybe_ephemeral_rand();
6646 get_mds()->balancer
->hit_inode(newi
, META_POP_IWR
);
6649 server
->respond_to_request(mdr
, 0);
6654 void Server::handle_client_mknod(MDRequestRef
& mdr
)
6656 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6657 client_t client
= mdr
->get_client();
6659 unsigned mode
= req
->head
.args
.mknod
.mode
;
6660 if ((mode
& S_IFMT
) == 0)
6663 mdr
->disable_lock_cache();
6664 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, true, false, S_ISREG(mode
));
6668 CDir
*dir
= dn
->get_dir();
6669 CInode
*diri
= dir
->get_inode();
6670 if (!check_access(mdr
, diri
, MAY_WRITE
))
6672 if (!check_fragment_space(mdr
, dir
))
6674 if (!check_dir_max_entries(mdr
, dir
))
6677 ceph_assert(dn
->get_projected_linkage()->is_null());
6678 if (req
->get_alternate_name().size() > alternate_name_max
) {
6679 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
6680 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
6683 dn
->set_alternate_name(req
->get_alternate_name());
6686 file_layout_t layout
;
6687 if (mdr
->dir_layout
!= file_layout_t())
6688 layout
= mdr
->dir_layout
;
6690 layout
= mdcache
->default_file_layout
;
6692 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
), mode
, &layout
);
6695 dn
->push_projected_linkage(newi
);
6697 auto _inode
= newi
->_get_inode();
6698 _inode
->version
= dn
->pre_dirty();
6699 _inode
->rdev
= req
->head
.args
.mknod
.rdev
;
6700 _inode
->rstat
.rfiles
= 1;
6701 _inode
->accounted_rstat
= _inode
->rstat
;
6702 if (layout
.pool_id
!= mdcache
->default_file_layout
.pool_id
)
6703 _inode
->add_old_pool(mdcache
->default_file_layout
.pool_id
);
6704 _inode
->update_backtrace();
6706 snapid_t follows
= mdcache
->get_global_snaprealm()->get_newest_seq();
6707 SnapRealm
*realm
= dn
->get_dir()->inode
->find_snaprealm();
6708 ceph_assert(follows
>= realm
->get_newest_seq());
6710 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
6711 // want to write to it (e.g., if they are reexporting NFS)
6712 if (S_ISREG(_inode
->mode
)) {
6713 // issue a cap on the file
6714 int cmode
= CEPH_FILE_MODE_RDWR
;
6715 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
, realm
);
6719 // put locks in excl mode
6720 newi
->filelock
.set_state(LOCK_EXCL
);
6721 newi
->authlock
.set_state(LOCK_EXCL
);
6722 newi
->xattrlock
.set_state(LOCK_EXCL
);
6724 dout(15) << " setting a client_range too, since this is a regular file" << dendl
;
6725 _inode
->client_ranges
[client
].range
.first
= 0;
6726 _inode
->client_ranges
[client
].range
.last
= _inode
->layout
.stripe_unit
;
6727 _inode
->client_ranges
[client
].follows
= follows
;
6728 newi
->mark_clientwriteable();
6729 cap
->mark_clientwriteable();
6733 ceph_assert(dn
->first
== follows
+ 1);
6734 newi
->first
= dn
->first
;
6736 dout(10) << "mknod mode " << _inode
->mode
<< " rdev " << _inode
->rdev
<< dendl
;
6739 mdr
->ls
= mdlog
->get_current_segment();
6740 EUpdate
*le
= new EUpdate(mdlog
, "mknod");
6741 mdlog
->start_entry(le
);
6742 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6743 journal_allocated_inos(mdr
, &le
->metablob
);
6745 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(),
6746 PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
6747 le
->metablob
.add_primary_dentry(dn
, newi
, true, true, true);
6749 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
6750 mds
->balancer
->maybe_fragment(dn
->get_dir(), false);
6756 /* This function takes responsibility for the passed mdr*/
6757 void Server::handle_client_mkdir(MDRequestRef
& mdr
)
6759 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6761 mdr
->disable_lock_cache();
6762 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, true);
6766 CDir
*dir
= dn
->get_dir();
6767 CInode
*diri
= dir
->get_inode();
6769 // mkdir check access
6770 if (!check_access(mdr
, diri
, MAY_WRITE
))
6773 if (!check_fragment_space(mdr
, dir
))
6775 if (!check_dir_max_entries(mdr
, dir
))
6778 ceph_assert(dn
->get_projected_linkage()->is_null());
6779 if (req
->get_alternate_name().size() > alternate_name_max
) {
6780 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
6781 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
6784 dn
->set_alternate_name(req
->get_alternate_name());
6787 unsigned mode
= req
->head
.args
.mkdir
.mode
;
6790 CInode
*newi
= prepare_new_inode(mdr
, dir
, inodeno_t(req
->head
.ino
), mode
);
6793 // it's a directory.
6794 dn
->push_projected_linkage(newi
);
6796 auto _inode
= newi
->_get_inode();
6797 _inode
->version
= dn
->pre_dirty();
6798 _inode
->rstat
.rsubdirs
= 1;
6799 _inode
->accounted_rstat
= _inode
->rstat
;
6800 _inode
->update_backtrace();
6802 snapid_t follows
= mdcache
->get_global_snaprealm()->get_newest_seq();
6803 SnapRealm
*realm
= dn
->get_dir()->inode
->find_snaprealm();
6804 ceph_assert(follows
>= realm
->get_newest_seq());
6806 dout(12) << " follows " << follows
<< dendl
;
6807 ceph_assert(dn
->first
== follows
+ 1);
6808 newi
->first
= dn
->first
;
6810 // ...and that new dir is empty.
6811 CDir
*newdir
= newi
->get_or_open_dirfrag(mdcache
, frag_t());
6812 newdir
->state_set(CDir::STATE_CREATING
);
6813 newdir
->mark_complete();
6814 newdir
->_get_fnode()->version
= newdir
->pre_dirty();
6817 mdr
->ls
= mdlog
->get_current_segment();
6818 EUpdate
*le
= new EUpdate(mdlog
, "mkdir");
6819 mdlog
->start_entry(le
);
6820 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6821 journal_allocated_inos(mdr
, &le
->metablob
);
6822 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
6823 le
->metablob
.add_primary_dentry(dn
, newi
, true, true);
6824 le
->metablob
.add_new_dir(newdir
); // dirty AND complete AND new
6826 // issue a cap on the directory
6827 int cmode
= CEPH_FILE_MODE_RDWR
;
6828 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
, realm
);
6832 // put locks in excl mode
6833 newi
->filelock
.set_state(LOCK_EXCL
);
6834 newi
->authlock
.set_state(LOCK_EXCL
);
6835 newi
->xattrlock
.set_state(LOCK_EXCL
);
6838 // make sure this inode gets into the journal
6839 le
->metablob
.add_opened_ino(newi
->ino());
6841 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
6843 // We hit_dir (via hit_inode) in our finish callback, but by then we might
6844 // have overshot the split size (multiple mkdir in flight), so here is
6845 // an early chance to split the dir if this mkdir makes it oversized.
6846 mds
->balancer
->maybe_fragment(dir
, false);
6852 void Server::handle_client_symlink(MDRequestRef
& mdr
)
6854 const auto& req
= mdr
->client_request
;
6856 mdr
->disable_lock_cache();
6857 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, true);
6861 CDir
*dir
= dn
->get_dir();
6862 CInode
*diri
= dir
->get_inode();
6864 if (!check_access(mdr
, diri
, MAY_WRITE
))
6866 if (!check_fragment_space(mdr
, dir
))
6868 if (!check_dir_max_entries(mdr
, dir
))
6871 ceph_assert(dn
->get_projected_linkage()->is_null());
6872 if (req
->get_alternate_name().size() > alternate_name_max
) {
6873 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
6874 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
6876 dn
->set_alternate_name(req
->get_alternate_name());
6878 unsigned mode
= S_IFLNK
| 0777;
6879 CInode
*newi
= prepare_new_inode(mdr
, dir
, inodeno_t(req
->head
.ino
), mode
);
6883 dn
->push_projected_linkage(newi
);
6885 newi
->symlink
= req
->get_path2();
6886 auto _inode
= newi
->_get_inode();
6887 _inode
->version
= dn
->pre_dirty();
6888 _inode
->size
= newi
->symlink
.length();
6889 _inode
->rstat
.rbytes
= _inode
->size
;
6890 _inode
->rstat
.rfiles
= 1;
6891 _inode
->accounted_rstat
= _inode
->rstat
;
6892 _inode
->update_backtrace();
6894 newi
->first
= dn
->first
;
6897 mdr
->ls
= mdlog
->get_current_segment();
6898 EUpdate
*le
= new EUpdate(mdlog
, "symlink");
6899 mdlog
->start_entry(le
);
6900 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6901 journal_allocated_inos(mdr
, &le
->metablob
);
6902 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
6903 le
->metablob
.add_primary_dentry(dn
, newi
, true, true);
6905 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
6906 mds
->balancer
->maybe_fragment(dir
, false);
6915 void Server::handle_client_link(MDRequestRef
& mdr
)
6917 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6919 dout(7) << "handle_client_link " << req
->get_filepath()
6920 << " to " << req
->get_filepath2()
6923 mdr
->disable_lock_cache();
6928 if (req
->get_filepath2().depth() == 0) {
6929 targeti
= mdcache
->get_inode(req
->get_filepath2().get_ino());
6931 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl
;
6932 inodeno_t ino
= req
->get_filepath2().get_ino();
6933 mdcache
->find_ino_peers(ino
, new C_MDS_TryFindInode(this, mdr
, mdcache
, ino
));
6938 if (!(mdr
->locking_state
& MutationImpl::SNAP2_LOCKED
)) {
6939 CDentry
*pdn
= targeti
->get_projected_parent_dn();
6941 dout(7) << "target has no parent dn, failing..." << dendl
;
6942 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6945 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
, 1))
6947 mdr
->locking_state
|= MutationImpl::SNAP2_LOCKED
;
6950 destdn
= rdlock_path_xlock_dentry(mdr
, false);
6954 auto ret
= rdlock_two_paths_xlock_destdn(mdr
, false);
6959 if (!destdn
->get_projected_linkage()->is_null()) {
6960 respond_to_request(mdr
, -CEPHFS_EEXIST
);
6964 targeti
= ret
.second
->get_projected_linkage()->get_inode();
6967 ceph_assert(destdn
->get_projected_linkage()->is_null());
6968 if (req
->get_alternate_name().size() > alternate_name_max
) {
6969 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
6970 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
6973 destdn
->set_alternate_name(req
->get_alternate_name());
6975 if (targeti
->is_dir()) {
6976 dout(7) << "target is a dir, failing..." << dendl
;
6977 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6981 CDir
*dir
= destdn
->get_dir();
6982 dout(7) << "handle_client_link link " << destdn
->get_name() << " in " << *dir
<< dendl
;
6983 dout(7) << "target is " << *targeti
<< dendl
;
6985 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
6986 MutationImpl::LockOpVec lov
;
6987 lov
.add_xlock(&targeti
->snaplock
);
6988 lov
.add_xlock(&targeti
->linklock
);
6990 if (!mds
->locker
->acquire_locks(mdr
, lov
))
6993 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
6996 if (targeti
->get_projected_inode()->nlink
== 0) {
6997 dout(7) << "target has no link, failing..." << dendl
;
6998 respond_to_request(mdr
, -CEPHFS_ENOENT
);
7002 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
7003 if (!check_access(mdr
, targeti
, MAY_WRITE
))
7006 if (!check_access(mdr
, dir
->get_inode(), MAY_WRITE
))
7009 if (!check_fragment_space(mdr
, dir
))
7012 if (!check_dir_max_entries(mdr
, dir
))
7016 CInode
* target_pin
= targeti
->get_projected_parent_dir()->inode
;
7017 SnapRealm
*target_realm
= target_pin
->find_snaprealm();
7018 if (target_pin
!= dir
->inode
&&
7019 target_realm
->get_subvolume_ino() !=
7020 dir
->inode
->find_snaprealm()->get_subvolume_ino()) {
7021 dout(7) << "target is in different subvolume, failing..." << dendl
;
7022 respond_to_request(mdr
, -CEPHFS_EXDEV
);
7027 ceph_assert(g_conf()->mds_kill_link_at
!= 1);
7030 if (targeti
->is_auth())
7031 _link_local(mdr
, destdn
, targeti
, target_realm
);
7033 _link_remote(mdr
, true, destdn
, targeti
);
7034 mds
->balancer
->maybe_fragment(dir
, false);
7038 class C_MDS_link_local_finish
: public ServerLogContext
{
7045 C_MDS_link_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ti
,
7046 version_t dnpv_
, version_t tipv_
, bool ar
) :
7047 ServerLogContext(s
, r
), dn(d
), targeti(ti
),
7048 dnpv(dnpv_
), tipv(tipv_
), adjust_realm(ar
) { }
7049 void finish(int r
) override
{
7050 ceph_assert(r
== 0);
7051 server
->_link_local_finish(mdr
, dn
, targeti
, dnpv
, tipv
, adjust_realm
);
7056 void Server::_link_local(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
, SnapRealm
*target_realm
)
7058 dout(10) << "_link_local " << *dn
<< " to " << *targeti
<< dendl
;
7060 mdr
->ls
= mdlog
->get_current_segment();
7062 // predirty NEW dentry
7063 version_t dnpv
= dn
->pre_dirty();
7064 version_t tipv
= targeti
->pre_dirty();
7066 // project inode update
7067 auto pi
= targeti
->project_inode(mdr
);
7069 pi
.inode
->ctime
= mdr
->get_op_stamp();
7070 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
7071 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
7072 pi
.inode
->change_attr
++;
7073 pi
.inode
->version
= tipv
;
7075 bool adjust_realm
= false;
7076 if (!target_realm
->get_subvolume_ino() && !targeti
->is_projected_snaprealm_global()) {
7077 sr_t
*newsnap
= targeti
->project_snaprealm();
7078 targeti
->mark_snaprealm_global(newsnap
);
7079 targeti
->record_snaprealm_parent_dentry(newsnap
, target_realm
, targeti
->get_projected_parent_dn(), true);
7080 adjust_realm
= true;
7084 EUpdate
*le
= new EUpdate(mdlog
, "link_local");
7085 mdlog
->start_entry(le
);
7086 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
7087 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1); // new dn
7088 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, 0, PREDIRTY_PRIMARY
); // targeti
7089 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
7090 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, targeti
);
7092 // do this after predirty_*, to avoid funky extra dnl arg
7093 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
7095 journal_and_reply(mdr
, targeti
, dn
, le
,
7096 new C_MDS_link_local_finish(this, mdr
, dn
, targeti
, dnpv
, tipv
, adjust_realm
));
7099 void Server::_link_local_finish(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
,
7100 version_t dnpv
, version_t tipv
, bool adjust_realm
)
7102 dout(10) << "_link_local_finish " << *dn
<< " to " << *targeti
<< dendl
;
7104 // link and unlock the NEW dentry
7105 CDentry::linkage_t
*dnl
= dn
->pop_projected_linkage();
7106 if (!dnl
->get_inode())
7107 dn
->link_remote(dnl
, targeti
);
7108 dn
->mark_dirty(dnpv
, mdr
->ls
);
7113 MDRequestRef null_ref
;
7114 mdcache
->send_dentry_link(dn
, null_ref
);
7117 int op
= CEPH_SNAP_OP_SPLIT
;
7118 mds
->mdcache
->send_snap_update(targeti
, 0, op
);
7119 mds
->mdcache
->do_realm_invalidate_and_update_notify(targeti
, op
);
7122 // bump target popularity
7123 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
7124 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
7127 respond_to_request(mdr
, 0);
7131 // link / unlink remote
7133 class C_MDS_link_remote_finish
: public ServerLogContext
{
7139 C_MDS_link_remote_finish(Server
*s
, MDRequestRef
& r
, bool i
, CDentry
*d
, CInode
*ti
) :
7140 ServerLogContext(s
, r
), inc(i
), dn(d
), targeti(ti
),
7141 dpv(d
->get_projected_version()) {}
7142 void finish(int r
) override
{
7143 ceph_assert(r
== 0);
7144 server
->_link_remote_finish(mdr
, inc
, dn
, targeti
, dpv
);
7148 void Server::_link_remote(MDRequestRef
& mdr
, bool inc
, CDentry
*dn
, CInode
*targeti
)
7150 dout(10) << "_link_remote "
7151 << (inc
? "link ":"unlink ")
7152 << *dn
<< " to " << *targeti
<< dendl
;
7154 // 1. send LinkPrepare to dest (journal nlink++ prepare)
7155 mds_rank_t linkauth
= targeti
->authority().first
;
7156 if (mdr
->more()->witnessed
.count(linkauth
) == 0) {
7157 if (mds
->is_cluster_degraded() &&
7158 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(linkauth
)) {
7159 dout(10) << " targeti auth mds." << linkauth
<< " is not active" << dendl
;
7160 if (mdr
->more()->waiting_on_peer
.empty())
7161 mds
->wait_for_active_peer(linkauth
, new C_MDS_RetryRequest(mdcache
, mdr
));
7165 dout(10) << " targeti auth must prepare nlink++/--" << dendl
;
7168 op
= MMDSPeerRequest::OP_LINKPREP
;
7170 op
= MMDSPeerRequest::OP_UNLINKPREP
;
7171 auto req
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, op
);
7172 targeti
->set_object_info(req
->get_object_info());
7173 req
->op_stamp
= mdr
->get_op_stamp();
7174 if (auto& desti_srnode
= mdr
->more()->desti_srnode
)
7175 encode(*desti_srnode
, req
->desti_snapbl
);
7176 mds
->send_message_mds(req
, linkauth
);
7178 ceph_assert(mdr
->more()->waiting_on_peer
.count(linkauth
) == 0);
7179 mdr
->more()->waiting_on_peer
.insert(linkauth
);
7182 dout(10) << " targeti auth has prepared nlink++/--" << dendl
;
7184 ceph_assert(g_conf()->mds_kill_link_at
!= 2);
7186 if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
7187 delete desti_srnode
;
7188 desti_srnode
= NULL
;
7191 mdr
->set_mds_stamp(ceph_clock_now());
7194 mdr
->ls
= mdlog
->get_current_segment();
7195 EUpdate
*le
= new EUpdate(mdlog
, inc
? "link_remote":"unlink_remote");
7196 mdlog
->start_entry(le
);
7197 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
7198 if (!mdr
->more()->witnessed
.empty()) {
7199 dout(20) << " noting uncommitted_peers " << mdr
->more()->witnessed
<< dendl
;
7200 le
->reqid
= mdr
->reqid
;
7201 le
->had_peers
= true;
7202 mdcache
->add_uncommitted_leader(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
7207 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1);
7208 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
7209 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
7212 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, -1);
7213 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
7214 le
->metablob
.add_null_dentry(dn
, true);
7215 dn
->push_projected_linkage();
7218 journal_and_reply(mdr
, (inc
? targeti
: nullptr), dn
, le
,
7219 new C_MDS_link_remote_finish(this, mdr
, inc
, dn
, targeti
));
7222 void Server::_link_remote_finish(MDRequestRef
& mdr
, bool inc
,
7223 CDentry
*dn
, CInode
*targeti
,
7226 dout(10) << "_link_remote_finish "
7227 << (inc
? "link ":"unlink ")
7228 << *dn
<< " to " << *targeti
<< dendl
;
7230 ceph_assert(g_conf()->mds_kill_link_at
!= 3);
7232 if (!mdr
->more()->witnessed
.empty())
7233 mdcache
->logged_leader_update(mdr
->reqid
);
7236 // link the new dentry
7237 CDentry::linkage_t
*dnl
= dn
->pop_projected_linkage();
7238 if (!dnl
->get_inode())
7239 dn
->link_remote(dnl
, targeti
);
7240 dn
->mark_dirty(dpv
, mdr
->ls
);
7242 // unlink main dentry
7243 dn
->get_dir()->unlink_inode(dn
);
7244 dn
->pop_projected_linkage();
7245 dn
->mark_dirty(dn
->get_projected_version(), mdr
->ls
); // dirty old dentry
7250 MDRequestRef null_ref
;
7252 mdcache
->send_dentry_link(dn
, null_ref
);
7254 mdcache
->send_dentry_unlink(dn
, NULL
, null_ref
);
7256 // bump target popularity
7257 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
7258 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
7261 respond_to_request(mdr
, 0);
7264 // removing a new dn?
7265 dn
->get_dir()->try_remove_unlinked_dn(dn
);
7269 // remote linking/unlinking
7271 class C_MDS_PeerLinkPrep
: public ServerLogContext
{
7275 C_MDS_PeerLinkPrep(Server
*s
, MDRequestRef
& r
, CInode
*t
, bool ar
) :
7276 ServerLogContext(s
, r
), targeti(t
), adjust_realm(ar
) { }
7277 void finish(int r
) override
{
7278 ceph_assert(r
== 0);
7279 server
->_logged_peer_link(mdr
, targeti
, adjust_realm
);
7283 class C_MDS_PeerLinkCommit
: public ServerContext
{
7287 C_MDS_PeerLinkCommit(Server
*s
, MDRequestRef
& r
, CInode
*t
) :
7288 ServerContext(s
), mdr(r
), targeti(t
) { }
7289 void finish(int r
) override
{
7290 server
->_commit_peer_link(mdr
, r
, targeti
);
7294 void Server::handle_peer_link_prep(MDRequestRef
& mdr
)
7296 dout(10) << "handle_peer_link_prep " << *mdr
7297 << " on " << mdr
->peer_request
->get_object_info()
7300 ceph_assert(g_conf()->mds_kill_link_at
!= 4);
7302 CInode
*targeti
= mdcache
->get_inode(mdr
->peer_request
->get_object_info().ino
);
7303 ceph_assert(targeti
);
7304 dout(10) << "targeti " << *targeti
<< dendl
;
7305 CDentry
*dn
= targeti
->get_parent_dn();
7306 CDentry::linkage_t
*dnl
= dn
->get_linkage();
7307 ceph_assert(dnl
->is_primary());
7309 mdr
->set_op_stamp(mdr
->peer_request
->op_stamp
);
7311 mdr
->auth_pin(targeti
);
7313 //ceph_abort(); // test hack: make sure leader can handle a peer that fails to prepare...
7314 ceph_assert(g_conf()->mds_kill_link_at
!= 5);
7317 mdr
->ls
= mdlog
->get_current_segment();
7318 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_link_prep", mdr
->reqid
, mdr
->peer_to_mds
,
7319 EPeerUpdate::OP_PREPARE
, EPeerUpdate::LINK
);
7320 mdlog
->start_entry(le
);
7322 auto pi
= dnl
->get_inode()->project_inode(mdr
);
7324 // update journaled target inode
7326 bool adjust_realm
= false;
7327 bool realm_projected
= false;
7328 if (mdr
->peer_request
->get_op() == MMDSPeerRequest::OP_LINKPREP
) {
7332 CDentry
*target_pdn
= targeti
->get_projected_parent_dn();
7333 SnapRealm
*target_realm
= target_pdn
->get_dir()->inode
->find_snaprealm();
7334 if (!target_realm
->get_subvolume_ino() && !targeti
->is_projected_snaprealm_global()) {
7335 sr_t
*newsnap
= targeti
->project_snaprealm();
7336 targeti
->mark_snaprealm_global(newsnap
);
7337 targeti
->record_snaprealm_parent_dentry(newsnap
, target_realm
, target_pdn
, true);
7338 adjust_realm
= true;
7339 realm_projected
= true;
7344 if (targeti
->is_projected_snaprealm_global()) {
7345 ceph_assert(mdr
->peer_request
->desti_snapbl
.length());
7346 auto p
= mdr
->peer_request
->desti_snapbl
.cbegin();
7348 sr_t
*newsnap
= targeti
->project_snaprealm();
7349 decode(*newsnap
, p
);
7351 if (pi
.inode
->nlink
== 0)
7352 ceph_assert(!newsnap
->is_parent_global());
7354 realm_projected
= true;
7356 ceph_assert(mdr
->peer_request
->desti_snapbl
.length() == 0);
7360 link_rollback rollback
;
7361 rollback
.reqid
= mdr
->reqid
;
7362 rollback
.ino
= targeti
->ino();
7363 rollback
.old_ctime
= targeti
->get_inode()->ctime
; // we hold versionlock xlock; no concorrent projections
7364 const auto& pf
= targeti
->get_parent_dn()->get_dir()->get_projected_fnode();
7365 rollback
.old_dir_mtime
= pf
->fragstat
.mtime
;
7366 rollback
.old_dir_rctime
= pf
->rstat
.rctime
;
7367 rollback
.was_inc
= inc
;
7368 if (realm_projected
) {
7369 if (targeti
->snaprealm
) {
7370 encode(true, rollback
.snapbl
);
7371 targeti
->encode_snap_blob(rollback
.snapbl
);
7373 encode(false, rollback
.snapbl
);
7376 encode(rollback
, le
->rollback
);
7377 mdr
->more()->rollback_bl
= le
->rollback
;
7379 pi
.inode
->ctime
= mdr
->get_op_stamp();
7380 pi
.inode
->version
= targeti
->pre_dirty();
7382 dout(10) << " projected inode " << pi
.inode
->ino
<< " v " << pi
.inode
->version
<< dendl
;
7385 mdcache
->predirty_journal_parents(mdr
, &le
->commit
, dnl
->get_inode(), 0, PREDIRTY_SHALLOW
|PREDIRTY_PRIMARY
);
7386 mdcache
->journal_dirty_inode(mdr
.get(), &le
->commit
, targeti
);
7387 mdcache
->add_uncommitted_peer(mdr
->reqid
, mdr
->ls
, mdr
->peer_to_mds
);
7389 // set up commit waiter
7390 mdr
->more()->peer_commit
= new C_MDS_PeerLinkCommit(this, mdr
, targeti
);
7392 mdr
->more()->peer_update_journaled
= true;
7393 submit_mdlog_entry(le
, new C_MDS_PeerLinkPrep(this, mdr
, targeti
, adjust_realm
),
7398 void Server::_logged_peer_link(MDRequestRef
& mdr
, CInode
*targeti
, bool adjust_realm
)
7400 dout(10) << "_logged_peer_link " << *mdr
7401 << " " << *targeti
<< dendl
;
7403 ceph_assert(g_conf()->mds_kill_link_at
!= 6);
7405 // update the target
7409 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
7412 mdr
->reset_peer_request();
7415 int op
= CEPH_SNAP_OP_SPLIT
;
7416 mds
->mdcache
->send_snap_update(targeti
, 0, op
);
7417 mds
->mdcache
->do_realm_invalidate_and_update_notify(targeti
, op
);
7421 if (!mdr
->aborted
) {
7422 auto reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_LINKPREPACK
);
7423 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
7425 dout(10) << " abort flag set, finishing" << dendl
;
7426 mdcache
->request_finish(mdr
);
7431 struct C_MDS_CommittedPeer
: public ServerLogContext
{
7432 C_MDS_CommittedPeer(Server
*s
, MDRequestRef
& m
) : ServerLogContext(s
, m
) {}
7433 void finish(int r
) override
{
7434 server
->_committed_peer(mdr
);
7438 void Server::_commit_peer_link(MDRequestRef
& mdr
, int r
, CInode
*targeti
)
7440 dout(10) << "_commit_peer_link " << *mdr
7442 << " " << *targeti
<< dendl
;
7444 ceph_assert(g_conf()->mds_kill_link_at
!= 7);
7447 // drop our pins, etc.
7450 // write a commit to the journal
7451 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_link_commit", mdr
->reqid
, mdr
->peer_to_mds
,
7452 EPeerUpdate::OP_COMMIT
, EPeerUpdate::LINK
);
7453 mdlog
->start_entry(le
);
7454 submit_mdlog_entry(le
, new C_MDS_CommittedPeer(this, mdr
), mdr
, __func__
);
7457 do_link_rollback(mdr
->more()->rollback_bl
, mdr
->peer_to_mds
, mdr
);
7461 void Server::_committed_peer(MDRequestRef
& mdr
)
7463 dout(10) << "_committed_peer " << *mdr
<< dendl
;
7465 ceph_assert(g_conf()->mds_kill_link_at
!= 8);
7467 bool assert_exist
= mdr
->more()->peer_update_journaled
;
7468 mdcache
->finish_uncommitted_peer(mdr
->reqid
, assert_exist
);
7469 auto req
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_COMMITTED
);
7470 mds
->send_message_mds(req
, mdr
->peer_to_mds
);
7471 mdcache
->request_finish(mdr
);
7474 struct C_MDS_LoggedLinkRollback
: public ServerLogContext
{
7476 map
<client_t
,ref_t
<MClientSnap
>> splits
;
7477 C_MDS_LoggedLinkRollback(Server
*s
, MutationRef
& m
, MDRequestRef
& r
,
7478 map
<client_t
,ref_t
<MClientSnap
>>&& _splits
) :
7479 ServerLogContext(s
, r
), mut(m
), splits(std::move(_splits
)) {
7481 void finish(int r
) override
{
7482 server
->_link_rollback_finish(mut
, mdr
, splits
);
7486 void Server::do_link_rollback(bufferlist
&rbl
, mds_rank_t leader
, MDRequestRef
& mdr
)
7488 link_rollback rollback
;
7489 auto p
= rbl
.cbegin();
7490 decode(rollback
, p
);
7492 dout(10) << "do_link_rollback on " << rollback
.reqid
7493 << (rollback
.was_inc
? " inc":" dec")
7494 << " ino " << rollback
.ino
7497 ceph_assert(g_conf()->mds_kill_link_at
!= 9);
7499 mdcache
->add_rollback(rollback
.reqid
, leader
); // need to finish this update before resolve finishes
7500 ceph_assert(mdr
|| mds
->is_resolve());
7502 MutationRef
mut(new MutationImpl(nullptr, utime_t(), rollback
.reqid
));
7503 mut
->ls
= mds
->mdlog
->get_current_segment();
7505 CInode
*in
= mdcache
->get_inode(rollback
.ino
);
7507 dout(10) << " target is " << *in
<< dendl
;
7508 ceph_assert(!in
->is_projected()); // live peer request hold versionlock xlock.
7510 auto pi
= in
->project_inode(mut
);
7511 pi
.inode
->version
= in
->pre_dirty();
7513 // parent dir rctime
7514 CDir
*parent
= in
->get_projected_parent_dn()->get_dir();
7515 auto pf
= parent
->project_fnode(mut
);
7516 pf
->version
= parent
->pre_dirty();
7517 if (pf
->fragstat
.mtime
== pi
.inode
->ctime
) {
7518 pf
->fragstat
.mtime
= rollback
.old_dir_mtime
;
7519 if (pf
->rstat
.rctime
== pi
.inode
->ctime
)
7520 pf
->rstat
.rctime
= rollback
.old_dir_rctime
;
7521 mut
->add_updated_lock(&parent
->get_inode()->filelock
);
7522 mut
->add_updated_lock(&parent
->get_inode()->nestlock
);
7526 pi
.inode
->ctime
= rollback
.old_ctime
;
7527 if (rollback
.was_inc
)
7532 map
<client_t
,ref_t
<MClientSnap
>> splits
;
7533 if (rollback
.snapbl
.length() && in
->snaprealm
) {
7535 auto p
= rollback
.snapbl
.cbegin();
7536 decode(hadrealm
, p
);
7538 if (!mds
->is_resolve()) {
7539 sr_t
*new_srnode
= new sr_t();
7540 decode(*new_srnode
, p
);
7541 in
->project_snaprealm(new_srnode
);
7543 decode(in
->snaprealm
->srnode
, p
);
7546 SnapRealm
*realm
= parent
->get_inode()->find_snaprealm();
7547 if (!mds
->is_resolve())
7548 mdcache
->prepare_realm_merge(in
->snaprealm
, realm
, splits
);
7549 in
->project_snaprealm(NULL
);
7554 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_link_rollback", rollback
.reqid
, leader
,
7555 EPeerUpdate::OP_ROLLBACK
, EPeerUpdate::LINK
);
7556 mdlog
->start_entry(le
);
7557 le
->commit
.add_dir_context(parent
);
7558 le
->commit
.add_dir(parent
, true);
7559 le
->commit
.add_primary_dentry(in
->get_projected_parent_dn(), 0, true);
7561 submit_mdlog_entry(le
, new C_MDS_LoggedLinkRollback(this, mut
, mdr
, std::move(splits
)),
7566 void Server::_link_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
,
7567 map
<client_t
,ref_t
<MClientSnap
>>& splits
)
7569 dout(10) << "_link_rollback_finish" << dendl
;
7571 ceph_assert(g_conf()->mds_kill_link_at
!= 10);
7575 if (!mds
->is_resolve())
7576 mdcache
->send_snaps(splits
);
7579 mdcache
->request_finish(mdr
);
7581 mdcache
->finish_rollback(mut
->reqid
, mdr
);
7587 void Server::handle_peer_link_prep_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &m
)
7589 dout(10) << "handle_peer_link_prep_ack " << *mdr
7590 << " " << *m
<< dendl
;
7591 mds_rank_t from
= mds_rank_t(m
->get_source().num());
7593 ceph_assert(g_conf()->mds_kill_link_at
!= 11);
7596 mdr
->more()->peers
.insert(from
);
7599 ceph_assert(mdr
->more()->witnessed
.count(from
) == 0);
7600 mdr
->more()->witnessed
.insert(from
);
7601 ceph_assert(!m
->is_not_journaled());
7602 mdr
->more()->has_journaled_peers
= true;
7604 // remove from waiting list
7605 ceph_assert(mdr
->more()->waiting_on_peer
.count(from
));
7606 mdr
->more()->waiting_on_peer
.erase(from
);
7608 ceph_assert(mdr
->more()->waiting_on_peer
.empty());
7610 dispatch_client_request(mdr
); // go again!
7619 void Server::handle_client_unlink(MDRequestRef
& mdr
)
7621 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
7622 client_t client
= mdr
->get_client();
7625 bool rmdir
= (req
->get_op() == CEPH_MDS_OP_RMDIR
);
7628 mdr
->disable_lock_cache();
7629 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, false, true);
7633 CDentry::linkage_t
*dnl
= dn
->get_linkage(client
, mdr
);
7634 ceph_assert(!dnl
->is_null());
7635 CInode
*in
= dnl
->get_inode();
7638 dout(7) << "handle_client_rmdir on " << *dn
<< dendl
;
7640 dout(7) << "handle_client_unlink on " << *dn
<< dendl
;
7642 dout(7) << "dn links to " << *in
<< dendl
;
7647 // do empty directory checks
7648 if (_dir_is_nonempty_unlocked(mdr
, in
)) {
7649 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
7653 dout(7) << "handle_client_unlink on dir " << *in
<< ", returning error" << dendl
;
7654 respond_to_request(mdr
, -CEPHFS_EISDIR
);
7660 dout(7) << "handle_client_rmdir on non-dir " << *in
<< ", returning error" << dendl
;
7661 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
7666 CInode
*diri
= dn
->get_dir()->get_inode();
7667 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
7668 if (!check_access(mdr
, diri
, MAY_WRITE
))
7672 // -- create stray dentry? --
7673 CDentry
*straydn
= NULL
;
7674 if (dnl
->is_primary()) {
7675 straydn
= prepare_stray_dentry(mdr
, dnl
->get_inode());
7678 dout(10) << " straydn is " << *straydn
<< dendl
;
7679 } else if (mdr
->straydn
) {
7680 mdr
->unpin(mdr
->straydn
);
7681 mdr
->straydn
= NULL
;
7685 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
7686 MutationImpl::LockOpVec lov
;
7688 lov
.add_xlock(&in
->linklock
);
7689 lov
.add_xlock(&in
->snaplock
);
7691 lov
.add_rdlock(&in
->filelock
); // to verify it's empty
7694 lov
.add_wrlock(&straydn
->get_dir()->inode
->filelock
);
7695 lov
.add_wrlock(&straydn
->get_dir()->inode
->nestlock
);
7696 lov
.add_xlock(&straydn
->lock
);
7699 if (!mds
->locker
->acquire_locks(mdr
, lov
))
7702 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
7706 _dir_is_nonempty(mdr
, in
)) {
7707 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
7712 straydn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
7714 if (!mdr
->more()->desti_srnode
) {
7715 if (in
->is_projected_snaprealm_global()) {
7716 sr_t
*new_srnode
= in
->prepare_new_srnode(0);
7717 in
->record_snaprealm_parent_dentry(new_srnode
, nullptr, dn
, dnl
->is_primary());
7718 // dropping the last linkage or dropping the last remote linkage,
7719 // detch the inode from global snaprealm
7720 auto nlink
= in
->get_projected_inode()->nlink
;
7722 (nlink
== 2 && !dnl
->is_primary() &&
7723 !in
->get_projected_parent_dir()->inode
->is_stray()))
7724 in
->clear_snaprealm_global(new_srnode
);
7725 mdr
->more()->desti_srnode
= new_srnode
;
7726 } else if (dnl
->is_primary()) {
7727 // prepare snaprealm blob for peer request
7728 SnapRealm
*realm
= in
->find_snaprealm();
7729 snapid_t follows
= realm
->get_newest_seq();
7730 if (in
->snaprealm
|| follows
+ 1 > in
->get_oldest_snap()) {
7731 sr_t
*new_srnode
= in
->prepare_new_srnode(follows
);
7732 in
->record_snaprealm_past_parent(new_srnode
, straydn
->get_dir()->inode
->find_snaprealm());
7733 mdr
->more()->desti_srnode
= new_srnode
;
7739 if (in
->is_dir() && in
->has_subtree_root_dirfrag()) {
7740 // subtree root auths need to be witnesses
7741 set
<mds_rank_t
> witnesses
;
7742 in
->list_replicas(witnesses
);
7743 dout(10) << " witnesses " << witnesses
<< ", have " << mdr
->more()->witnessed
<< dendl
;
7745 for (set
<mds_rank_t
>::iterator p
= witnesses
.begin();
7746 p
!= witnesses
.end();
7748 if (mdr
->more()->witnessed
.count(*p
)) {
7749 dout(10) << " already witnessed by mds." << *p
<< dendl
;
7750 } else if (mdr
->more()->waiting_on_peer
.count(*p
)) {
7751 dout(10) << " already waiting on witness mds." << *p
<< dendl
;
7753 if (!_rmdir_prepare_witness(mdr
, *p
, mdr
->dn
[0], straydn
))
7757 if (!mdr
->more()->waiting_on_peer
.empty())
7758 return; // we're waiting for a witness.
7761 if (!rmdir
&& dnl
->is_primary() && mdr
->dn
[0].size() == 1)
7762 mds
->locker
->create_lock_cache(mdr
, diri
);
7765 if (dnl
->is_remote() && !dnl
->get_inode()->is_auth())
7766 _link_remote(mdr
, false, dn
, dnl
->get_inode());
7768 _unlink_local(mdr
, dn
, straydn
);
7771 class C_MDS_unlink_local_finish
: public ServerLogContext
{
7774 version_t dnpv
; // deleted dentry
7776 C_MDS_unlink_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CDentry
*sd
) :
7777 ServerLogContext(s
, r
), dn(d
), straydn(sd
),
7778 dnpv(d
->get_projected_version()) {}
7779 void finish(int r
) override
{
7780 ceph_assert(r
== 0);
7781 server
->_unlink_local_finish(mdr
, dn
, straydn
, dnpv
);
7785 void Server::_unlink_local(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
)
7787 dout(10) << "_unlink_local " << *dn
<< dendl
;
7789 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
7790 CInode
*in
= dnl
->get_inode();
7794 mdr
->ls
= mdlog
->get_current_segment();
7796 // prepare log entry
7797 EUpdate
*le
= new EUpdate(mdlog
, "unlink_local");
7798 mdlog
->start_entry(le
);
7799 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
7800 if (!mdr
->more()->witnessed
.empty()) {
7801 dout(20) << " noting uncommitted_peers " << mdr
->more()->witnessed
<< dendl
;
7802 le
->reqid
= mdr
->reqid
;
7803 le
->had_peers
= true;
7804 mdcache
->add_uncommitted_leader(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
7808 ceph_assert(dnl
->is_primary());
7809 straydn
->push_projected_linkage(in
);
7812 // the unlinked dentry
7815 auto pi
= in
->project_inode(mdr
);
7818 dn
->make_path_string(t
, true);
7819 pi
.inode
->stray_prior_path
= std::move(t
);
7821 pi
.inode
->version
= in
->pre_dirty();
7822 pi
.inode
->ctime
= mdr
->get_op_stamp();
7823 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
7824 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
7825 pi
.inode
->change_attr
++;
7827 if (pi
.inode
->nlink
== 0)
7828 in
->state_set(CInode::STATE_ORPHAN
);
7830 if (mdr
->more()->desti_srnode
) {
7831 auto& desti_srnode
= mdr
->more()->desti_srnode
;
7832 in
->project_snaprealm(desti_srnode
);
7833 desti_srnode
= NULL
;
7837 // will manually pop projected inode
7839 // primary link. add stray dentry.
7840 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, -1);
7841 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, straydn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
7843 pi
.inode
->update_backtrace();
7844 le
->metablob
.add_primary_dentry(straydn
, in
, true, true);
7846 // remote link. update remote inode.
7847 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_DIR
, -1);
7848 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
7849 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, in
);
7852 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
7853 le
->metablob
.add_null_dentry(dn
, true);
7856 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
7857 le
->metablob
.renamed_dirino
= in
->ino();
7860 dn
->push_projected_linkage();
7863 ceph_assert(in
->first
<= straydn
->first
);
7864 in
->first
= straydn
->first
;
7868 ceph_assert(straydn
);
7869 mdcache
->project_subtree_rename(in
, dn
->get_dir(), straydn
->get_dir());
7872 journal_and_reply(mdr
, 0, dn
, le
, new C_MDS_unlink_local_finish(this, mdr
, dn
, straydn
));
7875 void Server::_unlink_local_finish(MDRequestRef
& mdr
,
7876 CDentry
*dn
, CDentry
*straydn
,
7879 dout(10) << "_unlink_local_finish " << *dn
<< dendl
;
7881 if (!mdr
->more()->witnessed
.empty())
7882 mdcache
->logged_leader_update(mdr
->reqid
);
7884 CInode
*strayin
= NULL
;
7885 bool hadrealm
= false;
7887 // if there is newly created snaprealm, need to split old snaprealm's
7888 // inodes_with_caps. So pop snaprealm before linkage changes.
7889 strayin
= dn
->get_linkage()->get_inode();
7890 hadrealm
= strayin
->snaprealm
? true : false;
7891 strayin
->early_pop_projected_snaprealm();
7894 // unlink main dentry
7895 dn
->get_dir()->unlink_inode(dn
);
7896 dn
->pop_projected_linkage();
7897 dn
->mark_dirty(dnpv
, mdr
->ls
);
7899 // relink as stray? (i.e. was primary link?)
7901 dout(20) << " straydn is " << *straydn
<< dendl
;
7902 straydn
->pop_projected_linkage();
7903 mdcache
->touch_dentry_bottom(straydn
);
7908 mdcache
->send_dentry_unlink(dn
, straydn
, mdr
);
7911 // update subtree map?
7912 if (strayin
->is_dir())
7913 mdcache
->adjust_subtree_after_rename(strayin
, dn
->get_dir(), true);
7915 if (strayin
->snaprealm
&& !hadrealm
)
7916 mdcache
->do_realm_invalidate_and_update_notify(strayin
, CEPH_SNAP_OP_SPLIT
, false);
7920 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
7923 respond_to_request(mdr
, 0);
7925 // removing a new dn?
7926 dn
->get_dir()->try_remove_unlinked_dn(dn
);
7929 // respond_to_request() drops locks. So stray reintegration can race with us.
7930 if (straydn
&& !straydn
->get_projected_linkage()->is_null()) {
7931 // Tip off the MDCache that this dentry is a stray that
7932 // might be elegible for purge.
7933 mdcache
->notify_stray(straydn
);
7937 bool Server::_rmdir_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, vector
<CDentry
*>& trace
, CDentry
*straydn
)
7939 if (mds
->is_cluster_degraded() &&
7940 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(who
)) {
7941 dout(10) << "_rmdir_prepare_witness mds." << who
<< " is not active" << dendl
;
7942 if (mdr
->more()->waiting_on_peer
.empty())
7943 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(mdcache
, mdr
));
7947 dout(10) << "_rmdir_prepare_witness mds." << who
<< dendl
;
7948 auto req
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RMDIRPREP
);
7949 req
->srcdnpath
= filepath(trace
.front()->get_dir()->ino());
7950 for (auto dn
: trace
)
7951 req
->srcdnpath
.push_dentry(dn
->get_name());
7952 mdcache
->encode_replica_stray(straydn
, who
, req
->straybl
);
7953 if (mdr
->more()->desti_srnode
)
7954 encode(*mdr
->more()->desti_srnode
, req
->desti_snapbl
);
7956 req
->op_stamp
= mdr
->get_op_stamp();
7957 mds
->send_message_mds(req
, who
);
7959 ceph_assert(mdr
->more()->waiting_on_peer
.count(who
) == 0);
7960 mdr
->more()->waiting_on_peer
.insert(who
);
7964 struct C_MDS_PeerRmdirPrep
: public ServerLogContext
{
7965 CDentry
*dn
, *straydn
;
7966 C_MDS_PeerRmdirPrep(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CDentry
*st
)
7967 : ServerLogContext(s
, r
), dn(d
), straydn(st
) {}
7968 void finish(int r
) override
{
7969 server
->_logged_peer_rmdir(mdr
, dn
, straydn
);
7973 struct C_MDS_PeerRmdirCommit
: public ServerContext
{
7976 C_MDS_PeerRmdirCommit(Server
*s
, MDRequestRef
& r
, CDentry
*sd
)
7977 : ServerContext(s
), mdr(r
), straydn(sd
) { }
7978 void finish(int r
) override
{
7979 server
->_commit_peer_rmdir(mdr
, r
, straydn
);
7983 void Server::handle_peer_rmdir_prep(MDRequestRef
& mdr
)
7985 dout(10) << "handle_peer_rmdir_prep " << *mdr
7986 << " " << mdr
->peer_request
->srcdnpath
7987 << " to " << mdr
->peer_request
->destdnpath
7990 vector
<CDentry
*> trace
;
7991 filepath
srcpath(mdr
->peer_request
->srcdnpath
);
7992 dout(10) << " src " << srcpath
<< dendl
;
7994 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, false);
7995 int r
= mdcache
->path_traverse(mdr
, cf
, srcpath
,
7996 MDS_TRAVERSE_DISCOVER
| MDS_TRAVERSE_PATH_LOCKED
,
7999 if (r
== -CEPHFS_ESTALE
) {
8000 mdcache
->find_ino_peers(srcpath
.get_ino(), new C_MDS_RetryRequest(mdcache
, mdr
),
8001 mdr
->peer_to_mds
, true);
8004 ceph_assert(r
== 0);
8005 CDentry
*dn
= trace
.back();
8006 dout(10) << " dn " << *dn
<< dendl
;
8009 ceph_assert(mdr
->straydn
);
8010 CDentry
*straydn
= mdr
->straydn
;
8011 dout(10) << " straydn " << *straydn
<< dendl
;
8013 mdr
->set_op_stamp(mdr
->peer_request
->op_stamp
);
8015 rmdir_rollback rollback
;
8016 rollback
.reqid
= mdr
->reqid
;
8017 rollback
.src_dir
= dn
->get_dir()->dirfrag();
8018 rollback
.src_dname
= dn
->get_name();
8019 rollback
.dest_dir
= straydn
->get_dir()->dirfrag();
8020 rollback
.dest_dname
= straydn
->get_name();
8021 if (mdr
->peer_request
->desti_snapbl
.length()) {
8022 if (in
->snaprealm
) {
8023 encode(true, rollback
.snapbl
);
8024 in
->encode_snap_blob(rollback
.snapbl
);
8026 encode(false, rollback
.snapbl
);
8029 encode(rollback
, mdr
->more()->rollback_bl
);
8030 // FIXME: rollback snaprealm
8031 dout(20) << " rollback is " << mdr
->more()->rollback_bl
.length() << " bytes" << dendl
;
8033 // set up commit waiter
8034 mdr
->more()->peer_commit
= new C_MDS_PeerRmdirCommit(this, mdr
, straydn
);
8036 straydn
->push_projected_linkage(in
);
8037 dn
->push_projected_linkage();
8039 ceph_assert(straydn
->first
>= in
->first
);
8040 in
->first
= straydn
->first
;
8042 if (!in
->has_subtree_root_dirfrag(mds
->get_nodeid())) {
8043 dout(10) << " no auth subtree in " << *in
<< ", skipping journal" << dendl
;
8044 _logged_peer_rmdir(mdr
, dn
, straydn
);
8048 mdr
->ls
= mdlog
->get_current_segment();
8049 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rmdir", mdr
->reqid
, mdr
->peer_to_mds
,
8050 EPeerUpdate::OP_PREPARE
, EPeerUpdate::RMDIR
);
8051 mdlog
->start_entry(le
);
8052 le
->rollback
= mdr
->more()->rollback_bl
;
8054 le
->commit
.add_dir_context(straydn
->get_dir());
8055 le
->commit
.add_primary_dentry(straydn
, in
, true);
8056 // peer: no need to journal original dentry
8058 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
8059 le
->commit
.renamed_dirino
= in
->ino();
8061 mdcache
->project_subtree_rename(in
, dn
->get_dir(), straydn
->get_dir());
8062 mdcache
->add_uncommitted_peer(mdr
->reqid
, mdr
->ls
, mdr
->peer_to_mds
);
8064 mdr
->more()->peer_update_journaled
= true;
8065 submit_mdlog_entry(le
, new C_MDS_PeerRmdirPrep(this, mdr
, dn
, straydn
),
8070 void Server::_logged_peer_rmdir(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
)
8072 dout(10) << "_logged_peer_rmdir " << *mdr
<< " on " << *dn
<< dendl
;
8073 CInode
*in
= dn
->get_linkage()->get_inode();
8076 if (mdr
->peer_request
->desti_snapbl
.length()) {
8077 new_realm
= !in
->snaprealm
;
8078 in
->decode_snap_blob(mdr
->peer_request
->desti_snapbl
);
8079 ceph_assert(in
->snaprealm
);
8084 // update our cache now, so we are consistent with what is in the journal
8085 // when we journal a subtree map
8086 dn
->get_dir()->unlink_inode(dn
);
8087 straydn
->pop_projected_linkage();
8088 dn
->pop_projected_linkage();
8090 mdcache
->adjust_subtree_after_rename(in
, dn
->get_dir(), mdr
->more()->peer_update_journaled
);
8093 mdcache
->do_realm_invalidate_and_update_notify(in
, CEPH_SNAP_OP_SPLIT
, false);
8096 mdr
->reset_peer_request();
8099 if (!mdr
->aborted
) {
8100 auto reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RMDIRPREPACK
);
8101 if (!mdr
->more()->peer_update_journaled
)
8102 reply
->mark_not_journaled();
8103 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
8105 dout(10) << " abort flag set, finishing" << dendl
;
8106 mdcache
->request_finish(mdr
);
8110 void Server::handle_peer_rmdir_prep_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &ack
)
8112 dout(10) << "handle_peer_rmdir_prep_ack " << *mdr
8113 << " " << *ack
<< dendl
;
8115 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
8117 mdr
->more()->peers
.insert(from
);
8118 mdr
->more()->witnessed
.insert(from
);
8119 if (!ack
->is_not_journaled())
8120 mdr
->more()->has_journaled_peers
= true;
8122 // remove from waiting list
8123 ceph_assert(mdr
->more()->waiting_on_peer
.count(from
));
8124 mdr
->more()->waiting_on_peer
.erase(from
);
8126 if (mdr
->more()->waiting_on_peer
.empty())
8127 dispatch_client_request(mdr
); // go again!
8129 dout(10) << "still waiting on peers " << mdr
->more()->waiting_on_peer
<< dendl
;
// Peer-side commit/abort of a prepared rmdir.  r == 0 commits: clear stray
// dirty bits, clean up the mutation, and (if the prepare was journaled)
// journal an EPeerUpdate commit before acking the leader via
// _committed_peer().  r != 0 aborts: replay the rollback blob recorded at
// prepare time.
void Server::_commit_peer_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
{
  dout(10) << "_commit_peer_rmdir " << *mdr << " r=" << r << dendl;

  if (r == 0) {
    if (mdr->more()->peer_update_journaled) {
      CInode *strayin = straydn->get_projected_linkage()->get_inode();
      if (strayin && !strayin->snaprealm)
	mdcache->clear_dirty_bits_for_stray(strayin);
    }

    mdr->cleanup();

    if (mdr->more()->peer_update_journaled) {
      // write a commit to the journal
      EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_commit", mdr->reqid,
					mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
					EPeerUpdate::RMDIR);
      mdlog->start_entry(le);
      submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
      mdlog->flush();
    } else {
      // nothing was journaled; ack the leader immediately
      _committed_peer(mdr);
    }
  } else {
    // abort: undo the projected changes using the recorded rollback blob
    do_rmdir_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr);
  }
}
// Journal-completion context for a peer rmdir rollback: once the
// EPeerUpdate OP_ROLLBACK event is safely logged, finish undoing the
// projected unlink (see Server::_rmdir_rollback_finish).
struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
  metareqid_t reqid;   // id of the original request being rolled back
  CDentry *dn;         // source dentry being re-linked
  CDentry *straydn;    // stray dentry whose linkage is being dropped
  C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
    : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
  void finish(int r) override {
    server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
  }
};
// Undo a prepared (possibly journaled) peer rmdir: re-project the inode back
// onto its source dentry, restore any snaprealm state saved in the rollback
// blob, and — if the prepare was journaled — journal a matching rollback
// event before finishing via C_MDS_LoggedRmdirRollback.
void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr)
{
  // unlike the other rollback methods, the rmdir rollback is only
  // needed to record the subtree changes in the journal for inode
  // replicas who are auth for empty dirfrags.  no actual changes to
  // the file system are taking place here, so there is no Mutation.

  rmdir_rollback rollback;
  auto p = rbl.cbegin();
  decode(rollback, p);

  dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
  mdcache->add_rollback(rollback.reqid, leader); // need to finish this update before resolve finishes
  ceph_assert(mdr || mds->is_resolve());

  CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
  if (!dir)
    dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
  ceph_assert(dir);
  CDentry *dn = dir->lookup(rollback.src_dname);
  ceph_assert(dn);
  dout(10) << " dn " << *dn << dendl;
  CDir *straydir = mdcache->get_dirfrag(rollback.dest_dir);
  ceph_assert(straydir);
  CDentry *straydn = straydir->lookup(rollback.dest_dname);
  ceph_assert(straydn);
  dout(10) << " straydn " << *straydn << dendl;
  CInode *in = straydn->get_linkage()->get_inode();

  // project the inode back onto the source dentry, and a null linkage
  // onto the stray dentry
  dn->push_projected_linkage(in);
  straydn->push_projected_linkage();

  if (rollback.snapbl.length() && in->snaprealm) {
    bool hadrealm;
    auto p = rollback.snapbl.cbegin();
    decode(hadrealm, p);
    if (hadrealm) {
      // restore the snaprealm recorded at prepare time
      decode(in->snaprealm->srnode, p);
    } else {
      // the realm was created by the prepare; fold it back into the parent
      in->snaprealm->merge_to(dir->get_inode()->find_snaprealm());
    }
  }

  if (mdr && !mdr->more()->peer_update_journaled) {
    // prepare was never journaled: no replay concerns, finish immediately
    ceph_assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));

    _rmdir_rollback_finish(mdr, rollback.reqid, dn, straydn);
    return;
  }

  EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_rollback", rollback.reqid, leader,
				    EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RMDIR);
  mdlog->start_entry(le);

  le->commit.add_dir_context(dn->get_dir());
  le->commit.add_primary_dentry(dn, in, true);
  // peer: no need to journal straydn

  dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
  le->commit.renamed_dirino = in->ino();

  mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());

  submit_mdlog_entry(le,
		     new C_MDS_LoggedRmdirRollback(this, mdr, rollback.reqid,
						   dn, straydn),
		     mdr, __func__);
  mdlog->flush();
}
// Final stage of the rmdir rollback: pop the projected linkages (re-linking
// the inode to its source dentry), re-adjust the subtree map, trim non-auth
// subtrees during resolve, and mark the rollback complete.
void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
{
  dout(10) << "_rmdir_rollback_finish " << reqid << dendl;

  straydn->get_dir()->unlink_inode(straydn);
  dn->pop_projected_linkage();
  straydn->pop_projected_linkage();

  CInode *in = dn->get_linkage()->get_inode();
  // move any subtree bounds back under the source dir; the rename was
  // journaled iff the peer update was (or we are replaying during resolve)
  mdcache->adjust_subtree_after_rename(in, straydn->get_dir(),
				       !mdr || mdr->more()->peer_update_journaled);

  if (mds->is_resolve()) {
    // during resolve we may now be able to drop the whole non-auth subtree
    CDir *root = mdcache->get_subtree_root(straydn->get_dir());
    mdcache->try_trim_non_auth_subtree(root);
  }

  if (mdr)
    mdcache->request_finish(mdr);

  mdcache->finish_rollback(reqid, mdr);
}
/** _dir_is_nonempty[_unlocked]
 *
 * check if a directory is non-empty (i.e. whether we can rmdir it).
 *
 * the unlocked variant is a fastpath check; we can't really be
 * sure until we rdlock the filelock.
 */
8275 bool Server::_dir_is_nonempty_unlocked(MDRequestRef
& mdr
, CInode
*in
)
8277 dout(10) << "dir_is_nonempty_unlocked " << *in
<< dendl
;
8278 ceph_assert(in
->is_auth());
8280 if (in
->filelock
.is_cached())
8281 return false; // there can be pending async create/unlink. don't know.
8282 if (in
->snaprealm
&& in
->snaprealm
->srnode
.snaps
.size())
8283 return true; // in a snapshot!
8285 auto&& ls
= in
->get_dirfrags();
8286 for (const auto& dir
: ls
) {
8287 // is the frag obviously non-empty?
8288 if (dir
->is_auth()) {
8289 if (dir
->get_projected_fnode()->fragstat
.size()) {
8290 dout(10) << "dir_is_nonempty_unlocked dirstat has "
8291 << dir
->get_projected_fnode()->fragstat
.size() << " items " << *dir
<< dendl
;
8300 bool Server::_dir_is_nonempty(MDRequestRef
& mdr
, CInode
*in
)
8302 dout(10) << "dir_is_nonempty " << *in
<< dendl
;
8303 ceph_assert(in
->is_auth());
8304 ceph_assert(in
->filelock
.can_read(mdr
->get_client()));
8306 frag_info_t dirstat
;
8307 version_t dirstat_version
= in
->get_projected_inode()->dirstat
.version
;
8309 auto&& ls
= in
->get_dirfrags();
8310 for (const auto& dir
: ls
) {
8311 const auto& pf
= dir
->get_projected_fnode();
8312 if (pf
->fragstat
.size()) {
8313 dout(10) << "dir_is_nonempty dirstat has "
8314 << pf
->fragstat
.size() << " items " << *dir
<< dendl
;
8318 if (pf
->accounted_fragstat
.version
== dirstat_version
)
8319 dirstat
.add(pf
->accounted_fragstat
);
8321 dirstat
.add(pf
->fragstat
);
8324 return dirstat
.size() != in
->get_projected_inode()->dirstat
.size();
8328 // ======================================================
// Journal-completion context for a client rename: once the EUpdate is
// safely logged, apply the rename to the cache and send the reply
// (Server::_rename_finish).
class C_MDS_rename_finish : public ServerLogContext {
  CDentry *srcdn;
  CDentry *destdn;
  CDentry *straydn;   // may be null; receives the overwritten inode, if any
public:
  C_MDS_rename_finish(Server *s, MDRequestRef& r,
		      CDentry *sdn, CDentry *ddn, CDentry *stdn) :
    ServerLogContext(s, r),
    srcdn(sdn), destdn(ddn), straydn(stdn) { }
  void finish(int r) override {
    ceph_assert(r == 0);
    server->_rename_finish(mdr, srcdn, destdn, straydn);
  }
};
/** handle_client_rename
 *
 * rename leader is the destdn auth.  this is because cached inodes
 * must remain connected.  thus, any replica of srci, must also
 * replicate destdn, and possibly straydn, so that srci (and
 * destdn->inode) remain connected during the rename.
 *
 * to do this, we freeze srci, then leader (destdn auth) verifies that
 * all other nodes have also replicated destdn and straydn.  note that
 * destdn replicas need not also replicate srci.  this only works when
 * destdn is leader.
 *
 * This function takes responsibility for the passed mdr.
 */
// Leader-side entry point for a client rename.  Validates paths, resolves
// src/dest dentries, takes all required locks, prepares snaprealm updates,
// gathers prepare acks from all witness MDSs, then journals an EUpdate and
// commits locally via C_MDS_rename_finish.  May return early many times
// (waiting on locks, witnesses, or journal flush) and be re-dispatched.
void Server::handle_client_rename(MDRequestRef& mdr)
{
  const auto& req = mdr->client_request;
  dout(7) << "handle_client_rename " << *req << dendl;

  filepath destpath = req->get_filepath();
  filepath srcpath = req->get_filepath2();
  if (srcpath.is_last_dot_or_dotdot() || destpath.is_last_dot_or_dotdot()) {
    respond_to_request(mdr, -CEPHFS_EBUSY);
    return;
  }

  if (req->get_alternate_name().size() > alternate_name_max) {
    dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
    respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
    return;
  }

  auto [destdn, srcdn] = rdlock_two_paths_xlock_destdn(mdr, true);
  if (!destdn)
    return;

  dout(10) << " destdn " << *destdn << dendl;
  CDir *destdir = destdn->get_dir();
  ceph_assert(destdir->is_auth());
  CDentry::linkage_t *destdnl = destdn->get_projected_linkage();

  dout(10) << " srcdn " << *srcdn << dendl;
  CDir *srcdir = srcdn->get_dir();
  CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
  CInode *srci = srcdnl->get_inode();
  dout(10) << " srci " << *srci << dendl;

  // -- some sanity checks --
  if (destdn == srcdn) {
    dout(7) << "rename src=dest, noop" << dendl;
    respond_to_request(mdr, 0);
    return;
  }

  // dest a child of src?
  // e.g. mv /usr /usr/foo
  if (srci->is_dir() && srci->is_projected_ancestor_of(destdir->get_inode())) {
    dout(7) << "cannot rename item to be a child of itself" << dendl;
    respond_to_request(mdr, -CEPHFS_EINVAL);
    return;
  }

  // is this a stray migration, reintegration or merge? (sanity checks!)
  if (mdr->reqid.name.is_mds() &&
      !(MDS_INO_IS_STRAY(srcpath.get_ino()) &&
	MDS_INO_IS_STRAY(destpath.get_ino())) &&
      !(destdnl->is_remote() &&
	destdnl->get_remote_ino() == srci->ino())) {
    respond_to_request(mdr, -CEPHFS_EINVAL);  // actually, this won't reply, but whatev.
    return;
  }

  CInode *oldin = 0;
  if (!destdnl->is_null()) {
    //dout(10) << "dest dn exists " << *destdn << dendl;
    oldin = mdcache->get_dentry_inode(destdn, mdr, true);
    if (!oldin) return;
    dout(10) << " oldin " << *oldin << dendl;

    // non-empty dir? do trivial fast unlocked check, do another check later with read locks
    if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
      respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
      return;
    }

    // mv /some/thing /to/some/existing_other_thing
    if (oldin->is_dir() && !srci->is_dir()) {
      respond_to_request(mdr, -CEPHFS_EISDIR);
      return;
    }
    if (!oldin->is_dir() && srci->is_dir()) {
      respond_to_request(mdr, -CEPHFS_ENOTDIR);
      return;
    }
    if (srci == oldin && !srcdir->inode->is_stray()) {
      respond_to_request(mdr, 0);  // no-op.  POSIX makes no sense.
      return;
    }
    if (destdn->get_alternate_name() != req->get_alternate_name()) {
      /* the dentry exists but the alternate_names do not match, fail... */
      respond_to_request(mdr, -CEPHFS_EINVAL);
      return;
    }
  }

  vector<CDentry*>& srctrace = mdr->dn[1];
  vector<CDentry*>& desttrace = mdr->dn[0];

  // src+dest traces _must_ share a common ancestor for locking to prevent orphans
  if (destpath.get_ino() != srcpath.get_ino() &&
      !(req->get_source().is_mds() &&
	MDS_INO_IS_STRAY(srcpath.get_ino()))) {  // <-- mds 'rename' out of stray dir is ok!
    CInode *srcbase = srctrace[0]->get_dir()->get_inode();
    CInode *destbase = desttrace[0]->get_dir()->get_inode();
    // ok, extend srctrace toward root until it is an ancestor of desttrace.
    while (srcbase != destbase &&
	   !srcbase->is_projected_ancestor_of(destbase)) {
      CDentry *pdn = srcbase->get_projected_parent_dn();
      srctrace.insert(srctrace.begin(), pdn);
      dout(10) << "rename prepending srctrace with " << *pdn << dendl;
      srcbase = pdn->get_dir()->get_inode();
    }

    // then, extend destpath until it shares the same parent inode as srcpath.
    while (destbase != srcbase) {
      CDentry *pdn = destbase->get_projected_parent_dn();
      desttrace.insert(desttrace.begin(), pdn);
      dout(10) << "rename prepending desttrace with " << *pdn << dendl;
      destbase = pdn->get_dir()->get_inode();
    }
    dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
  }

  // linkmerge: the dest (remote) dentry already points at the src inode
  bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
  if (linkmerge)
    dout(10) << " this is a link merge" << dendl;

  // -- create stray dentry? --
  CDentry *straydn = NULL;
  if (destdnl->is_primary() && !linkmerge) {
    straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
    if (!straydn)
      return;
    dout(10) << " straydn is " << *straydn << dendl;
  } else if (mdr->straydn) {
    mdr->unpin(mdr->straydn);
    mdr->straydn = NULL;
  }

  // -- locks --
  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;

    // we need to update srci's ctime.  xlock its least contended lock to do that...
    lov.add_xlock(&srci->linklock);
    lov.add_xlock(&srci->snaplock);

    if (oldin) {
      // xlock oldin (for nlink--)
      lov.add_xlock(&oldin->linklock);
      lov.add_xlock(&oldin->snaplock);
      if (oldin->is_dir()) {
	ceph_assert(srci->is_dir());
	lov.add_rdlock(&oldin->filelock);   // to verify it's empty

	// adjust locking order?
	int cmp = mdr->compare_paths();
	if (cmp < 0 || (cmp == 0 && oldin->ino() < srci->ino()))
	  std::reverse(lov.begin(), lov.end());
      } else {
	ceph_assert(!srci->is_dir());
	// adjust locking order;
	if (srci->ino() > oldin->ino())
	  std::reverse(lov.begin(), lov.end());
      }
    }

    // straydn?
    if (straydn) {
      lov.add_wrlock(&straydn->get_dir()->inode->filelock);
      lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
      lov.add_xlock(&straydn->lock);
    }

    // a non-auth primary srcdn means we must freeze srci for export
    CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : nullptr;
    if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
      return;

    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  }

  if (linkmerge)
    ceph_assert(srcdir->inode->is_stray() && srcdnl->is_primary() && destdnl->is_remote());

  if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
    if (!check_access(mdr, srcdir->get_inode(), MAY_WRITE))
      return;

    if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
      return;

    if (!linkmerge && !check_fragment_space(mdr, destdn->get_dir()))
      return;

    if (!linkmerge && !check_dir_max_entries(mdr, destdn->get_dir()))
      return;

    if (!check_access(mdr, srci, MAY_WRITE))
      return;
  }

  // with read lock, really verify oldin is empty
  if (oldin &&
      oldin->is_dir() &&
      _dir_is_nonempty(mdr, oldin)) {
    respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
    return;
  }

  /* project_snaprealm_past_parent() will do this job
   *
  // moving between snaprealms?
  if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
    SnapRealm *srcrealm = srci->find_snaprealm();
    SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
    if (srcrealm != destrealm &&
	(srcrealm->get_newest_seq() + 1 > srcdn->first ||
	 destrealm->get_newest_seq() + 1 > srcdn->first)) {
      dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
      mdcache->snaprealm_create(mdr, srci);
      return;
    }
  }
  */

  SnapRealm *dest_realm = nullptr;
  SnapRealm *src_realm = nullptr;
  if (!linkmerge) {
    dest_realm = destdir->inode->find_snaprealm();
    if (srcdir->inode == destdir->inode)
      src_realm = dest_realm;
    else
      src_realm = srcdir->inode->find_snaprealm();
    // renames may not cross subvolume boundaries
    if (src_realm != dest_realm &&
	src_realm->get_subvolume_ino() != dest_realm->get_subvolume_ino()) {
      respond_to_request(mdr, -CEPHFS_EXDEV);
      return;
    }
  }

  ceph_assert(g_conf()->mds_kill_rename_at != 1);

  // -- open all srcdn inode frags, if any --
  // we need these open so that auth can properly delegate from inode to dirfrags
  // after the inode is _ours_.
  if (srcdnl->is_primary() &&
      !srcdn->is_auth() &&
      srci->is_dir()) {
    dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
    mdr->set_stickydirs(srci);

    frag_vec_t leaves;
    srci->dirfragtree.get_leaves(leaves);
    for (const auto& leaf : leaves) {
      CDir *dir = srci->get_dirfrag(leaf);
      if (!dir) {
	dout(10) << " opening " << leaf << " under " << *srci << dendl;
	mdcache->open_remote_dirfrag(srci, leaf, new C_MDS_RetryRequest(mdcache, mdr));
	return;
      }
    }
  }

  // -- prepare snaprealm ---

  if (linkmerge) {
    if (!mdr->more()->srci_srnode &&
	srci->get_projected_inode()->nlink == 1 &&
	srci->is_projected_snaprealm_global()) {
      sr_t *new_srnode = srci->prepare_new_srnode(0);
      srci->record_snaprealm_parent_dentry(new_srnode, nullptr, destdn, false);

      srci->clear_snaprealm_global(new_srnode);
      mdr->more()->srci_srnode = new_srnode;
    }
  } else {
    if (oldin && !mdr->more()->desti_srnode) {
      if (oldin->is_projected_snaprealm_global()) {
	sr_t *new_srnode = oldin->prepare_new_srnode(0);
	oldin->record_snaprealm_parent_dentry(new_srnode, dest_realm, destdn, destdnl->is_primary());
	// dropping the last linkage or dropping the last remote linkage,
	// detach the inode from global snaprealm
	auto nlink = oldin->get_projected_inode()->nlink;
	if (nlink == 1 ||
	    (nlink == 2 && !destdnl->is_primary() &&
	     !oldin->get_projected_parent_dir()->inode->is_stray()))
	  oldin->clear_snaprealm_global(new_srnode);
	mdr->more()->desti_srnode = new_srnode;
      } else if (destdnl->is_primary()) {
	snapid_t follows = dest_realm->get_newest_seq();
	if (oldin->snaprealm || follows + 1 > oldin->get_oldest_snap()) {
	  sr_t *new_srnode = oldin->prepare_new_srnode(follows);
	  oldin->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
	  mdr->more()->desti_srnode = new_srnode;
	}
      }
    }
    if (!mdr->more()->srci_srnode) {
      if (srci->is_projected_snaprealm_global()) {
	sr_t *new_srnode = srci->prepare_new_srnode(0);
	srci->record_snaprealm_parent_dentry(new_srnode, src_realm, srcdn, srcdnl->is_primary());
	mdr->more()->srci_srnode = new_srnode;
      } else if (srcdnl->is_primary()) {
	snapid_t follows = src_realm->get_newest_seq();
	if (src_realm != dest_realm &&
	    (srci->snaprealm || follows + 1 > srci->get_oldest_snap())) {
	  sr_t *new_srnode = srci->prepare_new_srnode(follows);
	  srci->record_snaprealm_past_parent(new_srnode, dest_realm);
	  mdr->more()->srci_srnode = new_srnode;
	}
      }
    }
  }

  // -- prepare witnesses --

  /*
   * NOTE: we use _all_ replicas as witnesses.
   * this probably isn't totally necessary (esp for file renames),
   * but if/when we change that, we have to make sure rejoin is
   * sufficiently robust to handle strong rejoins from survivors
   * with totally wrong dentry->inode linkage.
   * (currently, it can ignore rename effects, because the resolve
   * stage will sort them out.)
   */
  set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
  if (srcdn->is_auth())
    srcdn->list_replicas(witnesses);
  else
    witnesses.insert(srcdn->authority().first);
  if (srcdnl->is_remote() && !srci->is_auth())
    witnesses.insert(srci->authority().first);
  destdn->list_replicas(witnesses);
  if (destdnl->is_remote() && !oldin->is_auth())
    witnesses.insert(oldin->authority().first);
  dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;

  if (!witnesses.empty()) {
    // Replicas can't see projected dentry linkages and will get confused.
    // We have taken snaplocks on ancestor inodes. Later rename/rmdir requests
    // can't project these inodes' linkages.
    bool need_flush = false;
    for (auto& dn : srctrace) {
      if (dn->is_projected()) {
	need_flush = true;
	break;
      }
    }
    if (!need_flush) {
      CDentry *dn = destdn;
      while (dn) {
	if (dn->is_projected()) {
	  need_flush = true;
	  break;
	}
	CInode *diri = dn->get_dir()->get_inode();
	dn = diri->get_projected_parent_dn();
      }
    }
    if (need_flush) {
      // wait for pending projections to be journaled before involving peers
      mdlog->wait_for_safe(
	  new MDSInternalContextWrapper(mds,
	    new C_MDS_RetryRequest(mdcache, mdr)));
      mdlog->flush();
      return;
    }
  }

  // do srcdn auth last
  mds_rank_t last = MDS_RANK_NONE;
  if (!srcdn->is_auth()) {
    last = srcdn->authority().first;
    mdr->more()->srcdn_auth_mds = last;
    // ask auth of srci to mark srci as ambiguous auth if more than two MDS
    // are involved in the rename operation.
    if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
      dout(10) << " preparing ambiguous auth for srci" << dendl;
      ceph_assert(mdr->more()->is_remote_frozen_authpin);
      ceph_assert(mdr->more()->rename_inode == srci);
      _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
      return;
    }
  }

  for (set<mds_rank_t>::iterator p = witnesses.begin();
       p != witnesses.end();
       ++p) {
    if (*p == last) continue;  // do it last!
    if (mdr->more()->witnessed.count(*p)) {
      dout(10) << " already witnessed by mds." << *p << dendl;
    } else if (mdr->more()->waiting_on_peer.count(*p)) {
      dout(10) << " already waiting on witness mds." << *p << dendl;
    } else {
      if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
	return;
    }
  }
  if (!mdr->more()->waiting_on_peer.empty())
    return;  // we're waiting for a witness.

  if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
    dout(10) << " preparing last witness (srcdn auth)" << dendl;
    ceph_assert(mdr->more()->waiting_on_peer.count(last) == 0);
    _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
    return;
  }

  // test hack: bail after peer does prepare, so we can verify it's _live_ rollback.
  if (!mdr->more()->peers.empty() && !srci->is_dir())
    ceph_assert(g_conf()->mds_kill_rename_at != 3);
  if (!mdr->more()->peers.empty() && srci->is_dir())
    ceph_assert(g_conf()->mds_kill_rename_at != 4);

  // -- declare now --
  mdr->set_mds_stamp(ceph_clock_now());

  // -- prepare journal entry --
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "rename");
  mdlog->start_entry(le);
  le->metablob.add_client_req(mdr->reqid, req->get_oldest_client_tid());
  if (!mdr->more()->witnessed.empty()) {
    dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;

    le->reqid = mdr->reqid;
    le->had_peers = true;

    mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
    // no need to send frozen auth pin to recovering auth MDS of srci
    mdr->more()->is_remote_frozen_authpin = false;
  }

  _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, req->get_alternate_name(), straydn);
  if (le->client_map.length())
    le->cmapv = mds->sessionmap.get_projected();

  // -- commit locally --
  C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);

  journal_and_reply(mdr, srci, destdn, le, fin);
  mds->balancer->maybe_fragment(destdn->get_dir(), false);
}
// Runs after the rename EUpdate is safely journaled: apply the rename to the
// cache, notify replicas of the new dentry link, bump balancer stats, reply
// to the client, and kick off stray handling for any overwritten inode.
void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_rename_finish " << *mdr << dendl;

  if (!mdr->more()->witnessed.empty())
    mdcache->logged_leader_update(mdr->reqid);

  // apply
  _rename_apply(mdr, srcdn, destdn, straydn);

  mdcache->send_dentry_link(destdn, mdr);

  CDentry::linkage_t *destdnl = destdn->get_linkage();
  CInode *in = destdnl->get_inode();
  // did this rename import caps for the inode?  if so, re-eval after replying
  bool need_eval = mdr->more()->cap_imports.count(in);

  // test hack: test peer commit
  if (!mdr->more()->peers.empty() && !in->is_dir())
    ceph_assert(g_conf()->mds_kill_rename_at != 5);
  if (!mdr->more()->peers.empty() && in->is_dir())
    ceph_assert(g_conf()->mds_kill_rename_at != 6);

  // bump popularity
  mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
  if (destdnl->is_remote() && in->is_auth())
    mds->balancer->hit_inode(in, META_POP_IWR);

  // did we import srci?  if so, explicitly ack that import that, before we unlock and reply.

  ceph_assert(g_conf()->mds_kill_rename_at != 7);

  // reply
  respond_to_request(mdr, 0);

  if (need_eval)
    mds->locker->eval(in, CEPH_CAP_LOCKS, true);

  // clean up?
  // respond_to_request() drops locks. So stray reintegration can race with us.
  if (straydn && !straydn->get_projected_linkage()->is_null()) {
    mdcache->notify_stray(straydn);
  }
}
// Send an OP_RENAMEPREP peer request to witness MDS 'who', carrying the
// src/dest dentry paths, alternate_name, replica stray state, projected
// snaprealm blobs, and the full witness list.  Returns false (after arranging
// a retry) if the target MDS is not yet active; true once the request is
// sent and 'who' has been added to waiting_on_peer.
bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesse,
				     vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
{
  const auto& client_req = mdr->client_request;
  ceph_assert(client_req);

  if (mds->is_cluster_degraded() &&
      !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
    dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
    if (mdr->more()->waiting_on_peer.empty())
      mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
    return false;
  }

  dout(10) << "_rename_prepare_witness mds." << who << dendl;
  auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt,
					   MMDSPeerRequest::OP_RENAMEPREP);

  // encode the full src/dest traces so the witness can resolve both paths
  req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
  for (auto dn : srctrace)
    req->srcdnpath.push_dentry(dn->get_name());
  req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
  for (auto dn : dsttrace)
    req->destdnpath.push_dentry(dn->get_name());
  req->alternate_name = client_req->alternate_name;
  if (straydn)
    mdcache->encode_replica_stray(straydn, who, req->straybl);

  if (mdr->more()->srci_srnode)
    encode(*mdr->more()->srci_srnode, req->srci_snapbl);
  if (mdr->more()->desti_srnode)
    encode(*mdr->more()->desti_srnode, req->desti_snapbl);

  req->srcdn_auth = mdr->more()->srcdn_auth_mds;

  // srcdn auth will verify our current witness list is sufficient
  req->witnesses = witnesse;

  req->op_stamp = mdr->get_op_stamp();
  mds->send_message_mds(req, who);

  ceph_assert(mdr->more()->waiting_on_peer.count(who) == 0);
  mdr->more()->waiting_on_peer.insert(who);
  return true;
}
// Decode the inode (and client/cap state) that the srcdn auth bundled into
// the rename prepare ack, importing it locally.  Returns the imported
// inode's old projected version.  The inode is temporarily forced !auth and
// clean; the real auth transfer happens when the rename is applied.
version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
{
  version_t oldpv = mdr->more()->inode_import_v;

  CDentry::linkage_t *srcdnl = srcdn->get_linkage();

  /* import node */
  auto blp = mdr->more()->inode_import.cbegin();

  // imported caps
  map<client_t,entity_inst_t> client_map;
  map<client_t, client_metadata_t> client_metadata_map;
  decode(client_map, blp);
  decode(client_metadata_map, blp);
  prepare_force_open_sessions(client_map, client_metadata_map,
			      mdr->more()->imported_session_map);
  encode(client_map, *client_map_bl, mds->mdsmap->get_up_features());
  encode(client_metadata_map, *client_map_bl);

  list<ScatterLock*> updated_scatterlocks;
  mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
					 mdr->more()->cap_imports, updated_scatterlocks);

  // hack: force back to !auth and clean, temporarily
  srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
  srcdnl->get_inode()->mark_clean();

  return oldpv;
}
// Decide whether a rename involving directory 'diri' must be journaled even
// though we are not auth for the affected dentry: journal replay needs the
// event if we are auth for any subtree rooted at (empty==true) or nested
// under (empty==false) diri's dirfrags.
bool Server::_need_force_journal(CInode *diri, bool empty)
{
  auto&& dirs = diri->get_dirfrags();

  bool force_journal = false;
  if (empty) {
    // is any of diri's own frags an auth subtree root?
    for (const auto& dir : dirs) {
      if (dir->is_subtree_root() && dir->get_dir_auth().first == mds->get_nodeid()) {
	dout(10) << " frag " << dir->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
	force_journal = true;
	break;
      } else
	dout(20) << " frag " << dir->get_frag() << " is not auth subtree dirfrag" << dendl;
    }
  } else {
    // see if any children of our frags are auth subtrees.
    std::vector<CDir*> subtrees;
    mdcache->get_subtrees(subtrees);
    dout(10) << " subtrees " << subtrees << " frags " << dirs << dendl;
    for (const auto& dir : dirs) {
      for (const auto& subtree : subtrees) {
	if (dir->contains(subtree)) {
	  if (subtree->get_dir_auth().first == mds->get_nodeid()) {
	    dout(10) << " frag " << dir->get_frag() << " contains (maybe) auth subtree, will force journal "
		     << *subtree << dendl;
	    force_journal = true;
	    break;
	  } else
	    dout(20) << " frag " << dir->get_frag() << " contains but isn't auth for " << *subtree << dendl;
	} else
	  dout(20) << " frag " << dir->get_frag() << " does not contain " << *subtree << dendl;
      }
      if (force_journal)
	break;
    }
  }
  return force_journal;
}
8965 void Server::_rename_prepare(MDRequestRef
& mdr
,
8966 EMetaBlob
*metablob
, bufferlist
*client_map_bl
,
8967 CDentry
*srcdn
, CDentry
*destdn
, std::string_view alternate_name
,
8970 dout(10) << "_rename_prepare " << *mdr
<< " " << *srcdn
<< " " << *destdn
<< dendl
;
8972 dout(10) << " straydn " << *straydn
<< dendl
;
8974 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
8975 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
8976 CInode
*srci
= srcdnl
->get_inode();
8977 CInode
*oldin
= destdnl
->get_inode();
8979 // primary+remote link merge?
8980 bool linkmerge
= (srci
== oldin
);
8982 ceph_assert(srcdnl
->is_primary() && destdnl
->is_remote());
8983 bool silent
= srcdn
->get_dir()->inode
->is_stray();
8985 bool force_journal_dest
= false;
8986 if (srci
->is_dir() && !destdn
->is_auth()) {
8987 if (srci
->is_auth()) {
8988 // if we are auth for srci and exporting it, force journal because journal replay needs
8989 // the source inode to create auth subtrees.
8990 dout(10) << " we are exporting srci, will force journal destdn" << dendl
;
8991 force_journal_dest
= true;
8993 force_journal_dest
= _need_force_journal(srci
, false);
8996 bool force_journal_stray
= false;
8997 if (oldin
&& oldin
->is_dir() && straydn
&& !straydn
->is_auth())
8998 force_journal_stray
= _need_force_journal(oldin
, true);
9001 dout(10) << " merging remote and primary links to the same inode" << dendl
;
9003 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl
;
9004 if (force_journal_dest
)
9005 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl
;
9006 if (force_journal_stray
)
9007 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl
;
9009 if (srci
->is_dir() && (destdn
->is_auth() || force_journal_dest
)) {
9010 dout(10) << " noting renamed dir ino " << srci
->ino() << " in metablob" << dendl
;
9011 metablob
->renamed_dirino
= srci
->ino();
9012 } else if (oldin
&& oldin
->is_dir() && force_journal_stray
) {
9013 dout(10) << " noting rename target dir " << oldin
->ino() << " in metablob" << dendl
;
9014 metablob
->renamed_dirino
= oldin
->ino();
9018 CInode::mempool_inode
*spi
= 0; // renamed inode
9019 CInode::mempool_inode
*tpi
= 0; // target/overwritten inode
9023 if (destdnl
->is_primary()) {
9024 ceph_assert(straydn
); // moving to straydn.
9025 // link--, and move.
9026 if (destdn
->is_auth()) {
9027 auto pi
= oldin
->project_inode(mdr
); //project_snaprealm
9028 pi
.inode
->version
= straydn
->pre_dirty(pi
.inode
->version
);
9029 pi
.inode
->update_backtrace();
9030 tpi
= pi
.inode
.get();
9032 straydn
->push_projected_linkage(oldin
);
9033 } else if (destdnl
->is_remote()) {
9035 if (oldin
->is_auth()) {
9036 auto pi
= oldin
->project_inode(mdr
);
9037 pi
.inode
->version
= oldin
->pre_dirty();
9038 tpi
= pi
.inode
.get();
9044 if (destdnl
->is_null()) {
9045 /* handle_client_rename checks that alternate_name matches for existing destdn */
9046 destdn
->set_alternate_name(alternate_name
);
9048 if (srcdnl
->is_remote()) {
9051 if (destdn
->is_auth())
9052 mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty();
9053 destdn
->push_projected_linkage(srcdnl
->get_remote_ino(), srcdnl
->get_remote_d_type());
9055 if (srci
->is_auth()) {
9056 auto pi
= srci
->project_inode(mdr
);
9057 pi
.inode
->version
= srci
->pre_dirty();
9058 spi
= pi
.inode
.get();
9061 dout(10) << " will merge remote onto primary link" << dendl
;
9062 if (destdn
->is_auth()) {
9063 auto pi
= oldin
->project_inode(mdr
);
9064 pi
.inode
->version
= mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty(oldin
->get_version());
9065 spi
= pi
.inode
.get();
9069 if (destdn
->is_auth()) {
9071 if (srcdn
->is_auth())
9072 oldpv
= srci
->get_projected_version();
9074 oldpv
= _rename_prepare_import(mdr
, srcdn
, client_map_bl
);
9076 // note which dirfrags have child subtrees in the journal
9077 // event, so that we can open those (as bounds) during replay.
9078 if (srci
->is_dir()) {
9079 auto&& ls
= srci
->get_dirfrags();
9080 for (const auto& dir
: ls
) {
9081 if (!dir
->is_auth())
9082 metablob
->renamed_dir_frags
.push_back(dir
->get_frag());
9084 dout(10) << " noting renamed dir open frags " << metablob
->renamed_dir_frags
<< dendl
;
9087 auto pi
= srci
->project_inode(mdr
); // project snaprealm if srcdnl->is_primary
9088 // & srcdnl->snaprealm
9089 pi
.inode
->version
= mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty(oldpv
);
9090 pi
.inode
->update_backtrace();
9091 spi
= pi
.inode
.get();
9093 destdn
->push_projected_linkage(srci
);
9097 if (srcdn
->is_auth())
9098 mdr
->more()->pvmap
[srcdn
] = srcdn
->pre_dirty();
9099 srcdn
->push_projected_linkage(); // push null linkage
9103 spi
->ctime
= mdr
->get_op_stamp();
9104 if (mdr
->get_op_stamp() > spi
->rstat
.rctime
)
9105 spi
->rstat
.rctime
= mdr
->get_op_stamp();
9111 tpi
->ctime
= mdr
->get_op_stamp();
9112 if (mdr
->get_op_stamp() > tpi
->rstat
.rctime
)
9113 tpi
->rstat
.rctime
= mdr
->get_op_stamp();
9117 destdn
->make_path_string(t
, true);
9118 tpi
->stray_prior_path
= std::move(t
);
9121 if (tpi
->nlink
== 0)
9122 oldin
->state_set(CInode::STATE_ORPHAN
);
9126 // prepare nesting, mtime updates
9127 int predirty_dir
= silent
? 0:PREDIRTY_DIR
;
9129 // guarantee stray dir is processed first during journal replay. unlink the old inode,
9130 // then link the source inode to destdn
9131 if (destdnl
->is_primary()) {
9132 ceph_assert(straydn
);
9133 if (straydn
->is_auth()) {
9134 metablob
->add_dir_context(straydn
->get_dir());
9135 metablob
->add_dir(straydn
->get_dir(), true);
9139 if (!linkmerge
&& destdnl
->is_remote() && oldin
->is_auth()) {
9140 CDir
*oldin_dir
= oldin
->get_projected_parent_dir();
9141 if (oldin_dir
!= srcdn
->get_dir() && oldin_dir
!= destdn
->get_dir())
9142 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, oldin_dir
, PREDIRTY_PRIMARY
);
9146 if (destdn
->is_auth() && !destdnl
->is_null()) {
9147 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, destdn
->get_dir(),
9148 (destdnl
->is_primary() ? PREDIRTY_PRIMARY
:0)|predirty_dir
, -1);
9149 if (destdnl
->is_primary()) {
9150 ceph_assert(straydn
);
9151 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, straydn
->get_dir(),
9152 PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
9156 if (srcdnl
->is_remote() && srci
->is_auth()) {
9157 CDir
*srci_dir
= srci
->get_projected_parent_dir();
9158 if (srci_dir
!= srcdn
->get_dir() && srci_dir
!= destdn
->get_dir())
9159 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, srci_dir
, PREDIRTY_PRIMARY
);
9163 int predirty_primary
= (srcdnl
->is_primary() && srcdn
->get_dir() != destdn
->get_dir()) ? PREDIRTY_PRIMARY
:0;
9164 int flags
= predirty_dir
| predirty_primary
;
9165 if (srcdn
->is_auth())
9166 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, srcdn
->get_dir(), PREDIRTY_SHALLOW
|flags
, -1);
9167 if (destdn
->is_auth())
9168 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, destdn
->get_dir(), flags
, 1);
9170 // add it all to the metablob
9173 if (destdnl
->is_primary()) {
9174 ceph_assert(straydn
);
9175 if (destdn
->is_auth()) {
9176 // project snaprealm, too
9177 if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
9178 oldin
->project_snaprealm(desti_srnode
);
9179 if (tpi
->nlink
== 0)
9180 ceph_assert(!desti_srnode
->is_parent_global());
9181 desti_srnode
= NULL
;
9183 straydn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
9184 metablob
->add_primary_dentry(straydn
, oldin
, true, true);
9185 } else if (force_journal_stray
) {
9186 dout(10) << " forced journaling straydn " << *straydn
<< dendl
;
9187 metablob
->add_dir_context(straydn
->get_dir());
9188 metablob
->add_primary_dentry(straydn
, oldin
, true);
9190 } else if (destdnl
->is_remote()) {
9191 if (oldin
->is_auth()) {
9192 sr_t
*new_srnode
= NULL
;
9193 if (mdr
->peer_request
) {
9194 if (mdr
->peer_request
->desti_snapbl
.length() > 0) {
9195 new_srnode
= new sr_t();
9196 auto p
= mdr
->peer_request
->desti_snapbl
.cbegin();
9197 decode(*new_srnode
, p
);
9199 } else if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
9200 new_srnode
= desti_srnode
;
9201 desti_srnode
= NULL
;
9204 oldin
->project_snaprealm(new_srnode
);
9205 if (tpi
->nlink
== 0)
9206 ceph_assert(!new_srnode
->is_parent_global());
9209 CDentry
*oldin_pdn
= oldin
->get_projected_parent_dn();
9210 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, oldin_pdn
);
9211 metablob
->add_primary_dentry(oldin_pdn
, oldin
, true);
9217 if (srcdnl
->is_remote()) {
9218 ceph_assert(!linkmerge
);
9219 if (destdn
->is_auth() && !destdnl
->is_null())
9220 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
9222 destdn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
9224 if (destdn
->is_auth())
9225 metablob
->add_remote_dentry(destdn
, true, srcdnl
->get_remote_ino(), srcdnl
->get_remote_d_type());
9227 if (srci
->is_auth() ) { // it's remote
9228 if (mdr
->peer_request
) {
9229 if (mdr
->peer_request
->srci_snapbl
.length() > 0) {
9230 sr_t
*new_srnode
= new sr_t();
9231 auto p
= mdr
->peer_request
->srci_snapbl
.cbegin();
9232 decode(*new_srnode
, p
);
9233 srci
->project_snaprealm(new_srnode
);
9235 } else if (auto& srci_srnode
= mdr
->more()->srci_srnode
) {
9236 srci
->project_snaprealm(srci_srnode
);
9240 CDentry
*srci_pdn
= srci
->get_projected_parent_dn();
9241 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, srci_pdn
);
9242 metablob
->add_primary_dentry(srci_pdn
, srci
, true);
9244 } else if (srcdnl
->is_primary()) {
9245 // project snap parent update?
9246 if (destdn
->is_auth()) {
9247 if (auto& srci_srnode
= mdr
->more()->srci_srnode
) {
9248 srci
->project_snaprealm(srci_srnode
);
9253 if (destdn
->is_auth() && !destdnl
->is_null())
9254 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
9256 destdn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
9258 if (destdn
->is_auth())
9259 metablob
->add_primary_dentry(destdn
, srci
, true, true);
9260 else if (force_journal_dest
) {
9261 dout(10) << " forced journaling destdn " << *destdn
<< dendl
;
9262 metablob
->add_dir_context(destdn
->get_dir());
9263 metablob
->add_primary_dentry(destdn
, srci
, true);
9264 if (srcdn
->is_auth() && srci
->is_dir()) {
9265 // journal new subtrees root dirfrags
9266 auto&& ls
= srci
->get_dirfrags();
9267 for (const auto& dir
: ls
) {
9269 metablob
->add_dir(dir
, true);
9276 if (srcdn
->is_auth()) {
9277 dout(10) << " journaling srcdn " << *srcdn
<< dendl
;
9278 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, srcdn
, CEPH_NOSNAP
, 0, srcdnl
);
9279 // also journal the inode in case we need do peer rename rollback. It is Ok to add
9280 // both primary and NULL dentries. Because during journal replay, null dentry is
9281 // processed after primary dentry.
9282 if (srcdnl
->is_primary() && !srci
->is_dir() && !destdn
->is_auth())
9283 metablob
->add_primary_dentry(srcdn
, srci
, true);
9284 metablob
->add_null_dentry(srcdn
, true);
9286 dout(10) << " NOT journaling srcdn " << *srcdn
<< dendl
;
9288 // make renamed inode first track the dn
9289 if (srcdnl
->is_primary() && destdn
->is_auth()) {
9290 ceph_assert(srci
->first
<= destdn
->first
);
9291 srci
->first
= destdn
->first
;
9293 // make stray inode first track the straydn
9294 if (straydn
&& straydn
->is_auth()) {
9295 ceph_assert(oldin
->first
<= straydn
->first
);
9296 oldin
->first
= straydn
->first
;
9299 if (oldin
&& oldin
->is_dir()) {
9300 ceph_assert(straydn
);
9301 mdcache
->project_subtree_rename(oldin
, destdn
->get_dir(), straydn
->get_dir());
9304 mdcache
->project_subtree_rename(srci
, srcdn
->get_dir(), destdn
->get_dir());
/**
 * Apply a prepared rename to the in-memory cache.
 *
 * Pops the projected linkages/inodes that _rename_prepare() pushed, in an
 * order that mirrors journal replay: the old dest target is unlinked (and
 * moved to the stray dir if it was primary) before the source inode is
 * linked at destdn. Also handles the cross-MDS case where the source inode
 * is being imported to this rank (cap import, xlock import, auth bit).
 *
 * @param mdr     the rename request (leader or peer side)
 * @param srcdn   source dentry (will end up null-linked)
 * @param destdn  destination dentry (will end up linked to srci)
 * @param straydn stray dentry for the displaced primary target, or NULL
 */
void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
  dout(10) << " pvs " << mdr->more()->pvmap << dendl;

  CDentry::linkage_t *srcdnl = srcdn->get_linkage();
  CDentry::linkage_t *destdnl = destdn->get_linkage();

  CInode *oldin = destdnl->get_inode();

  // primary+remote link merge?
  bool linkmerge = (srcdnl->get_inode() == oldin);
  if (linkmerge)
    ceph_assert(srcdnl->is_primary() || destdnl->is_remote());

  bool new_in_snaprealm = false;
  bool new_oldin_snaprealm = false;

  // target inode: unlink the displaced dest target first
  if (!linkmerge) {
    if (destdnl->is_primary()) {
      ceph_assert(straydn);
      dout(10) << "straydn is " << *straydn << dendl;

      // if there is newly created snaprealm, need to split old snaprealm's
      // inodes_with_caps. So pop snaprealm before linkage changes.
      if (destdn->is_auth()) {
	bool hadrealm = (oldin->snaprealm ? true : false);
	oldin->early_pop_projected_snaprealm();
	new_oldin_snaprealm = (oldin->snaprealm && !hadrealm);
      } else {
	// non-auth peer: snaprealm state comes from the leader's blob
	ceph_assert(mdr->peer_request);
	if (mdr->peer_request->desti_snapbl.length()) {
	  new_oldin_snaprealm = !oldin->snaprealm;
	  oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
	  ceph_assert(oldin->snaprealm);
	}
      }

      destdn->get_dir()->unlink_inode(destdn, false);

      straydn->pop_projected_linkage();
      if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
	ceph_assert(!straydn->is_projected()); // no other projected

      // nlink-- targeti
      if (destdn->is_auth())
	oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);

      mdcache->touch_dentry_bottom(straydn);  // drop dn as quickly as possible.
    } else if (destdnl->is_remote()) {
      destdn->get_dir()->unlink_inode(destdn, false);
      if (oldin->is_auth()) {
	oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
      } else if (mdr->peer_request) {
	if (mdr->peer_request->desti_snapbl.length() > 0) {
	  ceph_assert(oldin->snaprealm);
	  oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
	}
      } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
	// projected snaprealm was never consumed; free it
	delete desti_srnode;
	desti_srnode = NULL;
      }
    }
  }

  // unlink src before we relink it at dest
  CInode *in = srcdnl->get_inode();
  ceph_assert(in);

  bool srcdn_was_remote = srcdnl->is_remote();
  if (!srcdn_was_remote) {
    // if there is newly created snaprealm, need to split old snaprealm's
    // inodes_with_caps. So pop snaprealm before linkage changes.
    if (destdn->is_auth()) {
      bool hadrealm = (in->snaprealm ? true : false);
      in->early_pop_projected_snaprealm();
      new_in_snaprealm = (in->snaprealm && !hadrealm);
    } else {
      ceph_assert(mdr->peer_request);
      if (mdr->peer_request->srci_snapbl.length()) {
	new_in_snaprealm = !in->snaprealm;
	in->decode_snap_blob(mdr->peer_request->srci_snapbl);
	ceph_assert(in->snaprealm);
      }
    }
  }

  srcdn->get_dir()->unlink_inode(srcdn);

  // dest
  if (srcdn_was_remote) {
    if (!linkmerge) {
      // destdn
      destdnl = destdn->pop_projected_linkage();
      if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
	ceph_assert(!destdn->is_projected()); // no other projected

      destdn->link_remote(destdnl, in);
      if (destdn->is_auth())
	destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
      // in
      if (in->is_auth()) {
	in->pop_and_dirty_projected_inode(mdr->ls, mdr);
      } else if (mdr->peer_request) {
	if (mdr->peer_request->srci_snapbl.length() > 0) {
	  ceph_assert(in->snaprealm);
	  in->decode_snap_blob(mdr->peer_request->srci_snapbl);
	}
      } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
	delete srci_srnode;
	srci_srnode = NULL;
      }
    } else {
      dout(10) << "merging remote onto primary link" << dendl;
      oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
    }
  } else { // primary
    if (linkmerge) {
      dout(10) << "merging primary onto remote link" << dendl;
      destdn->get_dir()->unlink_inode(destdn, false);
    }
    destdnl = destdn->pop_projected_linkage();
    if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
      ceph_assert(!destdn->is_projected()); // no other projected

    // srcdn inode import?
    if (!srcdn->is_auth() && destdn->is_auth()) {
      ceph_assert(mdr->more()->inode_import.length() > 0);

      map<client_t,Capability::Import> imported_caps;

      // finish cap imports
      finish_force_open_sessions(mdr->more()->imported_session_map);
      if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
	mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
						    mdr->more()->srcdn_auth_mds, true,
						    mdr->more()->imported_session_map,
						    mdr->more()->cap_imports[destdnl->get_inode()],
						    imported_caps);
      }

      mdr->more()->inode_import.clear();
      encode(imported_caps, mdr->more()->inode_import);

      /* hack: add an auth pin for each xlock we hold. These were
       * remote xlocks previously but now they're local and
       * we're going to try and unpin when we xlock_finish. */

      for (auto i = mdr->locks.lower_bound(&destdnl->get_inode()->versionlock);
	   i != mdr->locks.end();
	   ++i) {
	SimpleLock *lock = i->lock;
	if (lock->get_parent() != destdnl->get_inode())
	  break;
	if (i->is_xlock() && !lock->is_locallock())
	  mds->locker->xlock_import(lock);
      }

      // hack: fix auth bit
      in->state_set(CInode::STATE_AUTH);

      mdr->clear_ambiguous_auth();
    }

    if (destdn->is_auth())
      in->pop_and_dirty_projected_inode(mdr->ls, mdr);
  }

  // src
  if (srcdn->is_auth())
    srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
  srcdn->pop_projected_linkage();
  if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
    ceph_assert(!srcdn->is_projected()); // no other projected

  // apply remaining projected inodes (nested)
  mdr->apply();

  // update subtree map?
  if (destdnl->is_primary() && in->is_dir())
    mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);

  if (straydn && oldin->is_dir())
    mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);

  if (new_oldin_snaprealm)
    mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT, false);
  if (new_in_snaprealm)
    mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, true);

  // removing a new dn?
  if (srcdn->is_auth())
    srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
}
// Journal-completion context for the peer side of a rename: once the
// EPeerUpdate prepare entry is durable, continue with _logged_peer_rename().
// The mdr is held by the ServerLogContext base.
class C_MDS_PeerRenamePrep : public ServerLogContext {
  CDentry *srcdn, *destdn, *straydn;  // dentries involved in the rename
public:
  C_MDS_PeerRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
    ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
  void finish(int r) override {
    server->_logged_peer_rename(mdr, srcdn, destdn, straydn);
  }
};
// Commit waiter for the peer side of a rename: invoked when the leader
// resolves the transaction; r tells us whether to commit (0) or roll back.
class C_MDS_PeerRenameCommit : public ServerContext {
  MDRequestRef mdr;                   // keep the peer request alive until commit/abort
  CDentry *srcdn, *destdn, *straydn;  // dentries involved in the rename
public:
  C_MDS_PeerRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
    ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
  void finish(int r) override {
    server->_commit_peer_rename(mdr, r, srcdn, destdn, straydn);
  }
};
// Fired when all client sessions affected by the rename export have been
// flushed; lets handle_peer_rename_prep() resume via
// _peer_rename_sessions_flushed().
class C_MDS_PeerRenameSessionsFlushed : public ServerContext {
  MDRequestRef mdr;  // the waiting peer rename request
public:
  C_MDS_PeerRenameSessionsFlushed(Server *s, MDRequestRef& r) :
    ServerContext(s), mdr(r) {}
  void finish(int r) override {
    server->_peer_rename_sessions_flushed(mdr);
  }
};
/**
 * Handle an OP_RENAMEPREP peer request from the rename leader.
 *
 * As a witness/peer of a cross-MDS rename we: resolve both paths, freeze and
 * mark the source inode ambiguous-auth if we are its auth, make sure the
 * leader's witness list covers every srcdn replica (replying with the replica
 * list if not), record a rename_rollback blob, then journal an
 * EPeerUpdate::OP_PREPARE and ack with OP_RENAMEPREPACK once durable.
 */
void Server::handle_peer_rename_prep(MDRequestRef& mdr)
{
  dout(10) << "handle_peer_rename_prep " << *mdr
	   << " " << mdr->peer_request->srcdnpath
	   << " to " << mdr->peer_request->destdnpath
	   << dendl;

  if (mdr->peer_request->is_interrupted()) {
    dout(10) << " peer request interrupted, sending noop reply" << dendl;
    auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
    reply->mark_interrupted();
    mds->send_message_mds(reply, mdr->peer_to_mds);
    mdr->reset_peer_request();
    return;
  }

  // discover destdn
  filepath destpath(mdr->peer_request->destdnpath);
  dout(10) << " dest " << destpath << dendl;
  vector<CDentry*> trace;
  CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
  int r = mdcache->path_traverse(mdr, cf, destpath,
				 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED | MDS_TRAVERSE_WANT_DENTRY,
				 &trace);
  if (r > 0) return;  // traversal is waiting; we will be retried
  if (r == -CEPHFS_ESTALE) {
    mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
			    mdr->peer_to_mds, true);
    return;
  }
  ceph_assert(r == 0);  // we shouldn't get an error here!

  CDentry *destdn = trace.back();
  CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
  dout(10) << " destdn " << *destdn << dendl;
  mdr->pin(destdn);

  // discover srcdn
  filepath srcpath(mdr->peer_request->srcdnpath);
  dout(10) << " src " << srcpath << dendl;
  CInode *srci = nullptr;
  r = mdcache->path_traverse(mdr, cf, srcpath,
			     MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
			     &trace, &srci);
  if (r > 0) return;
  ceph_assert(r == 0);

  CDentry *srcdn = trace.back();
  CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
  dout(10) << " srcdn " << *srcdn << dendl;
  mdr->pin(srcdn);
  mdr->pin(srci);

  // stray?
  bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
  if (linkmerge)
    ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
  CDentry *straydn = mdr->straydn;
  if (destdnl->is_primary() && !linkmerge)
    ceph_assert(straydn);

  mdr->set_op_stamp(mdr->peer_request->op_stamp);
  mdr->more()->srcdn_auth_mds = srcdn->authority().first;

  // set up commit waiter (early, to clean up any freezing etc we do)
  if (!mdr->more()->peer_commit)
    mdr->more()->peer_commit = new C_MDS_PeerRenameCommit(this, mdr, srcdn, destdn, straydn);

  // am i srcdn auth?
  if (srcdn->is_auth()) {
    set<mds_rank_t> srcdnrep;
    srcdn->list_replicas(srcdnrep);

    bool reply_witness = false;
    if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
      // freeze?
      // we need this to
      //  - avoid conflicting lock state changes
      //  - avoid concurrent updates to the inode
      //    (this could also be accomplished with the versionlock)
      int allowance = 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
      dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
      bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);

      // unfreeze auth pin after freezing the inode to avoid queueing waiters
      if (srcdnl->get_inode()->is_frozen_auth_pin())
	mdr->unfreeze_auth_pin();

      if (!frozen_inode) {
	srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
	return;
      }

      /*
       * set ambiguous auth for srci
       * NOTE: we don't worry about ambiguous cache expire as we do
       * with subtree migrations because all peers will pin
       * srcdn->get_inode() for duration of this rename.
       */
      mdr->set_ambiguous_auth(srcdnl->get_inode());

      // just mark the source inode as ambiguous auth if more than two MDS are involved.
      // the leader will send another OP_RENAMEPREP peer request later.
      if (mdr->peer_request->witnesses.size() > 1) {
	dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
	reply_witness = true;
      }

      // make sure bystanders have received all lock related messages
      for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
	if (*p == mdr->peer_to_mds ||
	    (mds->is_cluster_degraded() &&
	     !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
	  continue;
	auto notify = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMENOTIFY);
	mds->send_message_mds(notify, *p);
	mdr->more()->waiting_on_peer.insert(*p);
      }

      // make sure clients have received all cap related messages
      set<client_t> export_client_set;
      mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);

      MDSGatherBuilder gather(g_ceph_context);
      flush_client_sessions(export_client_set, gather);
      if (gather.has_subs()) {
	// MDS_RANK_NONE stands in for "waiting on session flush", not a peer
	mdr->more()->waiting_on_peer.insert(MDS_RANK_NONE);
	gather.set_finisher(new C_MDS_PeerRenameSessionsFlushed(this, mdr));
	gather.activate();
      }
    }

    // is witness list sufficient?
    for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
      if (*p == mdr->peer_to_mds ||
	  mdr->peer_request->witnesses.count(*p)) continue;
      dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
      reply_witness = true;
      break;
    }

    if (reply_witness) {
      ceph_assert(!srcdnrep.empty());
      auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
      reply->witnesses.swap(srcdnrep);
      mds->send_message_mds(reply, mdr->peer_to_mds);
      mdr->reset_peer_request();
      return;
    }
    dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
    if (!mdr->more()->waiting_on_peer.empty()) {
      dout(10) << " still waiting for rename notify acks from "
	       << mdr->more()->waiting_on_peer << dendl;
      return;
    }
  } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
    // set ambiguous auth for srci on witnesses
    mdr->set_ambiguous_auth(srcdnl->get_inode());
  }

  // encode everything we'd need to roll this back... basically, just the original state.
  rename_rollback rollback;

  rollback.reqid = mdr->reqid;

  rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
  rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
  rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
  rollback.orig_src.dname = srcdn->get_name();
  if (srcdnl->is_primary())
    rollback.orig_src.ino = srcdnl->get_inode()->ino();
  else {
    ceph_assert(srcdnl->is_remote());
    rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
    rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
  }

  rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
  rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
  rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
  rollback.orig_dest.dname = destdn->get_name();
  if (destdnl->is_primary())
    rollback.orig_dest.ino = destdnl->get_inode()->ino();
  else if (destdnl->is_remote()) {
    rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
    rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
  }

  if (straydn) {
    rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
    rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
    rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
    rollback.stray.dname = straydn->get_name();
  }

  if (mdr->peer_request->desti_snapbl.length()) {
    CInode *oldin = destdnl->get_inode();
    if (oldin->snaprealm) {
      encode(true, rollback.desti_snapbl);
      oldin->encode_snap_blob(rollback.desti_snapbl);
    } else {
      encode(false, rollback.desti_snapbl);
    }
  }

  if (mdr->peer_request->srci_snapbl.length()) {
    if (srci->snaprealm) {
      encode(true, rollback.srci_snapbl);
      srci->encode_snap_blob(rollback.srci_snapbl);
    } else {
      encode(false, rollback.srci_snapbl);
    }
  }

  encode(rollback, mdr->more()->rollback_bl);
  // FIXME: rollback snaprealm
  dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;

  // journal.
  mdr->ls = mdlog->get_current_segment();
  EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_prep", mdr->reqid, mdr->peer_to_mds,
				    EPeerUpdate::OP_PREPARE, EPeerUpdate::RENAME);
  mdlog->start_entry(le);
  le->rollback = mdr->more()->rollback_bl;

  bufferlist blah;  // inode import data... obviously not used if we're the peer
  _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, mdr->peer_request->alternate_name, straydn);

  if (le->commit.empty()) {
    dout(10) << " empty metablob, skipping journal" << dendl;
    mdlog->cancel_entry(le);
    mdr->ls = NULL;
    _logged_peer_rename(mdr, srcdn, destdn, straydn);
  } else {
    mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
    mdr->more()->peer_update_journaled = true;
    submit_mdlog_entry(le, new C_MDS_PeerRenamePrep(this, mdr, srcdn, destdn, straydn),
		       mdr, __func__);
    mdlog->flush();
  }
}
/**
 * Continuation after the peer's rename-prepare journal entry is durable.
 *
 * If we are auth for a primary srcdn, encode the inode (plus cap/client
 * maps) into the OP_RENAMEPREPACK reply so the leader side can import it,
 * then apply the rename locally and ack the leader. If the request was
 * aborted while journaling, no reply is sent and the request is finished.
 */
void Server::_logged_peer_rename(MDRequestRef& mdr,
				 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_logged_peer_rename " << *mdr << dendl;

  // prepare ack
  ref_t<MMDSPeerRequest> reply;
  if (!mdr->aborted) {
    reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
    if (!mdr->more()->peer_update_journaled)
      reply->mark_not_journaled();
  }

  CDentry::linkage_t *srcdnl = srcdn->get_linkage();
  //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;

  // export srci?
  if (srcdn->is_auth() && srcdnl->is_primary()) {
    // set export bounds for CInode::encode_export()
    if (reply) {
      std::vector<CDir*> bounds;
      if (srcdnl->get_inode()->is_dir()) {
	srcdnl->get_inode()->get_dirfrags(bounds);
	for (const auto& bound : bounds) {
	  bound->state_set(CDir::STATE_EXPORTBOUND);
	}
      }

      map<client_t,entity_inst_t> exported_client_map;
      map<client_t, client_metadata_t> exported_client_metadata_map;
      bufferlist inodebl;
      mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
					     exported_client_map,
					     exported_client_metadata_map);

      for (const auto& bound : bounds) {
	bound->state_clear(CDir::STATE_EXPORTBOUND);
      }

      encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
      encode(exported_client_metadata_map, reply->inode_export);
      reply->inode_export.claim_append(inodebl);
      reply->inode_export_v = srcdnl->get_inode()->get_version();
    }

    // remove mdr auth pin
    mdr->auth_unpin(srcdnl->get_inode());
    mdr->more()->is_inode_exporter = true;

    // the importer now journals the (dirty) inode; drop our dirty bit
    if (srcdnl->get_inode()->is_dirty())
      srcdnl->get_inode()->mark_clean();

    dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
  }

  // apply
  _rename_apply(mdr, srcdn, destdn, straydn);

  CDentry::linkage_t *destdnl = destdn->get_linkage();

  // bump popularity
  mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
  if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
    mds->balancer->hit_inode(destdnl->get_inode(), META_POP_IWR);

  // done.
  mdr->reset_peer_request();
  mdr->straydn = NULL;

  if (reply) {
    mds->send_message_mds(reply, mdr->peer_to_mds);
  } else {
    ceph_assert(mdr->aborted);
    dout(10) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);
  }
}
/**
 * Finalize the peer side of a rename once the leader's decision is known.
 *
 * @param r  0 to commit; nonzero to abort and roll back.
 *
 * On commit: finish the inode export (clear exported xlocks, hand off caps,
 * unfreeze), drop ambiguous auth, and journal an EPeerUpdate::OP_COMMIT (or
 * call _committed_peer() directly if nothing was journaled). On abort: undo
 * the export and replay the recorded rollback blob, if any.
 */
void Server::_commit_peer_rename(MDRequestRef& mdr, int r,
				 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_commit_peer_rename " << *mdr << " r=" << r << dendl;

  CInode *in = destdn->get_linkage()->get_inode();

  inodeno_t migrated_stray;
  if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
    migrated_stray = in->ino();

  MDSContext::vec finished;
  if (r == 0) {
    // unfreeze+singleauth inode
    //  hmm, do i really need to delay this?
    if (mdr->more()->is_inode_exporter) {
      // we exported, clear out any xlocks that we moved to another MDS

      for (auto i = mdr->locks.lower_bound(&in->versionlock);
	   i != mdr->locks.end(); ) {
	SimpleLock *lock = i->lock;
	if (lock->get_parent() != in)
	  break;
	// we only care about xlocks on the exported inode
	if (i->is_xlock() && !lock->is_locallock())
	  mds->locker->xlock_export(i++, mdr.get());
	else
	  ++i;
      }

      map<client_t,Capability::Import> peer_imported;
      auto bp = mdr->more()->inode_import.cbegin();
      decode(peer_imported, bp);

      dout(10) << " finishing inode export on " << *in << dendl;
      mdcache->migrator->finish_export_inode(in, mdr->peer_to_mds, peer_imported, finished);
      mds->queue_waiters(finished);   // this includes SINGLEAUTH waiters.

      // unfreeze
      ceph_assert(in->is_frozen_inode());
      in->unfreeze_inode(finished);
    }

    // singleauth
    if (mdr->more()->is_ambiguous_auth) {
      mdr->more()->rename_inode->clear_ambiguous_auth(finished);
      mdr->more()->is_ambiguous_auth = false;
    }

    if (straydn && mdr->more()->peer_update_journaled) {
      CInode *strayin = straydn->get_projected_linkage()->get_inode();
      if (strayin && !strayin->snaprealm)
	mdcache->clear_dirty_bits_for_stray(strayin);
    }

    mds->queue_waiters(finished);
    mdr->cleanup();

    if (mdr->more()->peer_update_journaled) {
      // write a commit to the journal
      EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_commit", mdr->reqid,
					mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
					EPeerUpdate::RENAME);
      mdlog->start_entry(le);
      submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
      mdlog->flush();
    } else {
      _committed_peer(mdr);
    }
  } else {
    // abort
    //  rollback_bl may be empty if we froze the inode but had to provide an expanded
    // witness list from the leader, and they failed before we tried prep again.
    if (mdr->more()->rollback_bl.length()) {
      if (mdr->more()->is_inode_exporter) {
	dout(10) << " reversing inode export of " << *in << dendl;
	in->abort_export();
      }
      if (mdcache->is_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds)) {
	mdcache->remove_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds);
	// rollback but preserve the peer request
	do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, false);
	mdr->more()->rollback_bl.clear();
      } else
	do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, true);
    } else {
      dout(10) << " rollback_bl empty, not rollback back rename (leader failed after getting extra witnesses?)" << dendl;
      // singleauth
      if (mdr->more()->is_ambiguous_auth) {
	if (srcdn->is_auth())
	  mdr->more()->rename_inode->unfreeze_inode(finished);

	mdr->more()->rename_inode->clear_ambiguous_auth(finished);
	mdr->more()->is_ambiguous_auth = false;
      }
      mds->queue_waiters(finished);
      mdcache->request_finish(mdr);
    }
  }

  if (migrated_stray && mds->is_stopping())
    mdcache->shutdown_export_stray_finish(migrated_stray);
}
/**
 * Re-add one entry's worth of stats to a dirfrag while rolling back a rename.
 *
 * Projects a new fnode for @p dir, bumps fragstat (file or subdir count by
 * @p isdir) and folds @p rstat back into the frag's nested rstat. If the
 * frag's mtime/rctime still equal the rename's @p ctime (i.e. nothing newer
 * touched the frag since), restore the pre-rename timestamps recorded in
 * @p r. The touched filelock/nestlock are registered on @p mut for scatter.
 */
static void _rollback_repair_dir(MutationRef& mut, CDir *dir,
				 rename_rollback::drec &r, utime_t ctime,
				 bool isdir, const nest_info_t &rstat)
{
  auto pf = dir->project_fnode(mut);
  pf->version = dir->pre_dirty();

  if (isdir) {
    pf->fragstat.nsubdirs += 1;
  } else {
    pf->fragstat.nfiles += 1;
  }

  pf->rstat.rbytes += rstat.rbytes;
  pf->rstat.rfiles += rstat.rfiles;
  pf->rstat.rsubdirs += rstat.rsubdirs;
  pf->rstat.rsnaps += rstat.rsnaps;

  // only restore timestamps if the rename was the last thing to touch them
  if (pf->fragstat.mtime == ctime) {
    pf->fragstat.mtime = r.dirfrag_old_mtime;
    if (pf->rstat.rctime == ctime)
      pf->rstat.rctime = r.dirfrag_old_rctime;
  }

  mut->add_updated_lock(&dir->get_inode()->filelock);
  mut->add_updated_lock(&dir->get_inode()->nestlock);
}
// Journal-completion context for do_rename_rollback(): once the rollback
// journal entry is durable, finish undoing the rename in the cache via
// _rename_rollback_finish(). splits[] carries the pending snap-split client
// notifications for the two affected inodes (moved in, not copied).
struct C_MDS_LoggedRenameRollback : public ServerLogContext {
  MutationRef mut;       // the rollback mutation being applied
  CDentry *srcdn;
  version_t srcdnpv;     // pre-dirtied version for srcdn, 0 if not ours
  CDentry *destdn;
  CDentry *straydn;
  map<client_t,ref_t<MClientSnap>> splits[2];
  bool finish_mdr;       // whether to also finish the peer request
  C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
			     CDentry *sd, version_t pv, CDentry *dd, CDentry *st,
			     map<client_t,ref_t<MClientSnap>> _splits[2], bool f) :
    ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
    straydn(st), finish_mdr(f) {
    splits[0].swap(_splits[0]);
    splits[1].swap(_splits[1]);
  }
  void finish(int r) override {
    server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
				    destdn, straydn, splits, finish_mdr);
  }
};
// Roll back a partially-applied cross-MDS rename on a peer rank.
// Decodes a rename_rollback record from `rbl`, restores the projected
// linkages of the source/dest/stray dentries, undoes ctime/snaprealm
// projections on the moved inode and the rename target, journals an
// EPeerUpdate rollback event (or cancels it when nothing was journaled),
// and finishes via _rename_rollback_finish().
// NOTE(review): the extraction dropped interleaved lines throughout this
// function (several if/else headers, closing braces, and the declarations
// of locals such as `in`, `realm`, `hadrealm`, `projected`, `blah`, plus
// the tail of the signature incl. `bool finish_mdr`) — confirm against the
// upstream file before relying on exact control flow.
void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr,
  rename_rollback rollback;
  auto p = rbl.cbegin();
  decode(rollback, p);
  dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
  // need to finish this update before sending resolve to claim the subtree
  mdcache->add_rollback(rollback.reqid, leader);
  MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
  mut->ls = mds->mdlog->get_current_segment();
  // locate the original source dentry (may be absent from cache)
  CDentry *srcdn = NULL;
  CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
  srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
  dout(10) << " srcdir " << *srcdir << dendl;
  srcdn = srcdir->lookup(rollback.orig_src.dname);
  dout(10) << " srcdn " << *srcdn << dendl;
  ceph_assert(srcdn->get_linkage()->is_null());
  dout(10) << " srcdn not found" << dendl;
  dout(10) << " srcdir not found" << dendl;
  // locate the original destination dentry
  CDentry *destdn = NULL;
  CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
  destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
  dout(10) << " destdir " << *destdir << dendl;
  destdn = destdir->lookup(rollback.orig_dest.dname);
  dout(10) << " destdn " << *destdn << dendl;
  dout(10) << " destdn not found" << dendl;
  dout(10) << " destdir not found" << dendl;
  // the renamed inode itself (primary or remote linkage)
  if (rollback.orig_src.ino) {
    in = mdcache->get_inode(rollback.orig_src.ino);
    if (in && in->is_dir())
      ceph_assert(srcdn && destdn);
    in = mdcache->get_inode(rollback.orig_src.remote_ino);
  // stray dentry holding the clobbered rename target, if any
  CDir *straydir = NULL;
  CDentry *straydn = NULL;
  if (rollback.stray.dirfrag.ino) {
    straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
    dout(10) << "straydir " << *straydir << dendl;
    straydn = straydir->lookup(rollback.stray.dname);
    dout(10) << " straydn " << *straydn << dendl;
    ceph_assert(straydn->get_linkage()->is_primary());
    dout(10) << " straydn not found" << dendl;
    dout(10) << "straydir not found" << dendl;
  // the inode that was overwritten by the rename, if any
  CInode *target = NULL;
  if (rollback.orig_dest.ino) {
    target = mdcache->get_inode(rollback.orig_dest.ino);
    ceph_assert(destdn && straydn);
  } else if (rollback.orig_dest.remote_ino)
    target = mdcache->get_inode(rollback.orig_dest.remote_ino);
  // can't use is_auth() in the resolve stage
  mds_rank_t whoami = mds->get_nodeid();
  ceph_assert(!destdn || destdn->authority().first != whoami);
  ceph_assert(!straydn || straydn->authority().first != whoami);
  bool force_journal_src = false;
  bool force_journal_dest = false;
  if (in && in->is_dir() && srcdn->authority().first != whoami)
    force_journal_src = _need_force_journal(in, false);
  if (in && target && target->is_dir())
    force_journal_dest = _need_force_journal(in, true);
  version_t srcdnpv = 0;
  // re-link the source dentry
  if (srcdn->authority().first == whoami)
    srcdnpv = srcdn->pre_dirty();
  if (rollback.orig_src.ino) {
    srcdn->push_projected_linkage(in);
    srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
                                  rollback.orig_src.remote_d_type);
  // per-client snap-split messages for [0]=srci realm, [1]=desti realm
  map<client_t,ref_t<MClientSnap>> splits[2];
  const CInode::mempool_inode *pip = nullptr;
  // undo ctime/fnode projections on the renamed inode
  CDir *pdir = in->get_projected_parent_dir();
  if (pdir->authority().first == whoami) {
    auto pi = in->project_inode(mut);
    pi.inode->version = in->pre_dirty();
    if (pdir != srcdir) {
      auto pf = pdir->project_fnode(mut);
      pf->version = pdir->pre_dirty();
    if (pi.inode->ctime == rollback.ctime)
      pi.inode->ctime = rollback.orig_src.old_ctime;
    if (in->get_inode()->ctime == rollback.ctime) {
      auto _inode = CInode::allocate_inode(*in->get_inode());
      _inode->ctime = rollback.orig_src.old_ctime;
      in->reset_inode(_inode);
  pip = in->get_projected_inode().get();
  // restore the source inode's snaprealm from the rollback blob
  if (rollback.srci_snapbl.length() && in->snaprealm) {
    auto p = rollback.srci_snapbl.cbegin();
    decode(hadrealm, p);
    if (projected && !mds->is_resolve()) {
      sr_t *new_srnode = new sr_t();
      decode(*new_srnode, p);
      in->project_snaprealm(new_srnode);
      decode(in->snaprealm->srnode, p);
    if (rollback.orig_src.ino) {
      ceph_assert(srcdir);
      realm = srcdir->get_inode()->find_snaprealm();
      realm = in->snaprealm->parent;
    if (!mds->is_resolve())
      mdcache->prepare_realm_merge(in->snaprealm, realm, splits[0]);
    in->project_snaprealm(NULL);
    in->snaprealm->merge_to(realm);
  // re-link the destination dentry to its original target
  if (rollback.orig_dest.ino && target) {
    destdn->push_projected_linkage(target);
  } else if (rollback.orig_dest.remote_ino) {
    destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
                                   rollback.orig_dest.remote_d_type);
    // the dentry will be trimmed soon, it's ok to have wrong linkage
    if (rollback.orig_dest.ino)
      ceph_assert(mds->is_resolve());
    destdn->push_projected_linkage();
  straydn->push_projected_linkage();
  // undo projections on the rename target inode
  CInode::inode_ptr ti;
  CDir *pdir = target->get_projected_parent_dir();
  if (pdir->authority().first == whoami) {
    auto pi = target->project_inode(mut);
    pi.inode->version = target->pre_dirty();
    if (pdir != srcdir) {
      auto pf = pdir->project_fnode(mut);
      pf->version = pdir->pre_dirty();
  ti = CInode::allocate_inode(*target->get_inode());
  if (ti->ctime == rollback.ctime)
    ti->ctime = rollback.orig_dest.old_ctime;
  if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
    if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
      ceph_assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
    ceph_assert(rollback.orig_dest.remote_ino &&
                rollback.orig_dest.remote_ino == rollback.orig_src.ino);
  target->reset_inode(ti);
  // restore the target inode's snaprealm from the rollback blob
  if (rollback.desti_snapbl.length() && target->snaprealm) {
    auto p = rollback.desti_snapbl.cbegin();
    decode(hadrealm, p);
    if (projected && !mds->is_resolve()) {
      sr_t *new_srnode = new sr_t();
      decode(*new_srnode, p);
      target->project_snaprealm(new_srnode);
      decode(target->snaprealm->srnode, p);
    if (rollback.orig_dest.ino) {
      ceph_assert(destdir);
      realm = destdir->get_inode()->find_snaprealm();
      realm = target->snaprealm->parent;
    if (!mds->is_resolve())
      mdcache->prepare_realm_merge(target->snaprealm, realm, splits[1]);
    target->project_snaprealm(NULL);
    target->snaprealm->merge_to(realm);
  // repair rstats on the source directory if we are its authority
  if (srcdn && srcdn->authority().first == whoami) {
    _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
                         in && in->is_dir(), pip ? pip->accounted_rstat : blah);
  dout(0) << " srcdn back to " << *srcdn << dendl;
  dout(0) << " srci back to " << *in << dendl;
  dout(0) << " destdn back to " << *destdn << dendl;
  dout(0) << " desti back to " << *target << dendl;
  // journal the rollback as a peer update
  EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_rollback", rollback.reqid, leader,
                                    EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RENAME);
  mdlog->start_entry(le);
  if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
    le->commit.add_dir_context(srcdir);
    if (rollback.orig_src.ino)
      le->commit.add_primary_dentry(srcdn, 0, true);
    le->commit.add_remote_dentry(srcdn, true);
  if (!rollback.orig_src.ino && // remote linkage
      in && in->authority().first == whoami) {
    le->commit.add_dir_context(in->get_projected_parent_dir());
    le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
  if (force_journal_dest) {
    ceph_assert(rollback.orig_dest.ino);
    le->commit.add_dir_context(destdir);
    le->commit.add_primary_dentry(destdn, 0, true);
  // peer: no need to journal straydn
  if (target && target != in && target->authority().first == whoami) {
    ceph_assert(rollback.orig_dest.remote_ino);
    le->commit.add_dir_context(target->get_projected_parent_dir());
    le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
  if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
    dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
    le->commit.renamed_dirino = in->ino();
    if (srcdn->authority().first == whoami) {
      auto&& ls = in->get_dirfrags();
      for (const auto& dir : ls) {
        if (!dir->is_auth())
          le->commit.renamed_dir_frags.push_back(dir->get_frag());
      dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
  } else if (force_journal_dest) {
    dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
    le->commit.renamed_dirino = target->ino();
  // pre-adjust the subtree map for the reverse moves
  if (target && target->is_dir()) {
    ceph_assert(destdn);
    mdcache->project_subtree_rename(target, straydir, destdir);
  if (in && in->is_dir()) {
    ceph_assert(srcdn);
    mdcache->project_subtree_rename(in, destdir, srcdir);
  if (mdr && !mdr->more()->peer_update_journaled) {
    // nothing was journaled on this peer — skip the log entry entirely
    ceph_assert(le->commit.empty());
    mdlog->cancel_entry(le);
    _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, splits, finish_mdr);
    ceph_assert(!le->commit.empty());
    mdr->more()->peer_update_journaled = false;
    MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr,
                                                            srcdn, srcdnpv, destdn, straydn,
                                                            splits, finish_mdr);
    submit_mdlog_entry(le, fin, mdr, __func__);
// Second half of the rename rollback: pops the projected linkages pushed by
// do_rename_rollback(), re-adjusts the subtree map for both moved dirs,
// flushes queued snap-split messages, and finishes or releases the mdr.
// NOTE(review): extraction dropped some braces/else lines; verify exact
// nesting (e.g. the straydn/destdn guards) against the upstream file.
void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
                                     version_t srcdnpv, CDentry *destdn, CDentry *straydn,
                                     map<client_t,ref_t<MClientSnap>> splits[2], bool finish_mdr)
  dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
  // drop the temporary stray and destination linkages
  straydn->get_dir()->unlink_inode(straydn);
  straydn->pop_projected_linkage();
  destdn->get_dir()->unlink_inode(destdn);
  destdn->pop_projected_linkage();
  // restore the source linkage; mark it dirty if we are its authority
  srcdn->pop_projected_linkage();
  if (srcdn->authority().first == mds->get_nodeid()) {
    srcdn->mark_dirty(srcdnpv, mut->ls);
    if (srcdn->get_linkage()->is_primary())
      srcdn->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH);
  // move the renamed directory's subtree back under its source parent
  if (srcdn && srcdn->get_linkage()->is_primary()) {
    CInode *in = srcdn->get_linkage()->get_inode();
    if (in && in->is_dir()) {
      ceph_assert(destdn);
      mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
  CInode *oldin = destdn->get_linkage()->get_inode();
  // update subtree map?
  if (oldin && oldin->is_dir()) {
    ceph_assert(straydn);
    mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
  // during resolve, trim any non-auth subtree we no longer need
  if (mds->is_resolve()) {
    root = mdcache->get_subtree_root(straydn->get_dir());
    root = mdcache->get_subtree_root(destdn->get_dir());
    mdcache->try_trim_non_auth_subtree(root);
  // deliver the snap-split notifications gathered during rollback
  mdcache->send_snaps(splits[1]);
  mdcache->send_snaps(splits[0]);
  // unfreeze / clear ambiguous auth state and wake waiters
  MDSContext::vec finished;
  if (mdr->more()->is_ambiguous_auth) {
    if (srcdn->is_auth())
      mdr->more()->rename_inode->unfreeze_inode(finished);
    mdr->more()->rename_inode->clear_ambiguous_auth(finished);
    mdr->more()->is_ambiguous_auth = false;
  mds->queue_waiters(finished);
  if (finish_mdr || mdr->aborted)
    mdcache->request_finish(mdr);
  mdr->more()->peer_rolling_back = false;
  mdcache->finish_rollback(mut->reqid, mdr);
// Leader-side handler for a peer's rename-prepare ack: records the peer as a
// witness (or collects extra witnesses it demands), imports the source inode
// state if the peer exported it, and re-dispatches the client request once
// all awaited peers have answered.
// NOTE(review): extraction dropped some closing braces/else lines; verify
// nesting against the upstream file.
void Server::handle_peer_rename_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
  dout(10) << "handle_peer_rename_prep_ack " << *mdr
           << " witnessed by " << ack->get_source()
           << " " << *ack << dendl;
  mds_rank_t from = mds_rank_t(ack->get_source().num());
  mdr->more()->peers.insert(from);
  // srcdn auth froze+authpinned the inode remotely: mark auth ambiguous
  if (mdr->more()->srcdn_auth_mds == from &&
      mdr->more()->is_remote_frozen_authpin &&
      !mdr->more()->is_ambiguous_auth) {
    mdr->set_ambiguous_auth(mdr->more()->rename_inode);
  // witnessed? or add extra witnesses?
  ceph_assert(mdr->more()->witnessed.count(from) == 0);
  if (ack->is_interrupted()) {
    dout(10) << " peer request interrupted, noop" << dendl;
  } else if (ack->witnesses.empty()) {
    mdr->more()->witnessed.insert(from);
    if (!ack->is_not_journaled())
      mdr->more()->has_journaled_peers = true;
    dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
    mdr->more()->extra_witnesses = ack->witnesses;
    mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me!
  // srcdn auth may bundle the inode's exported state with the ack
  if (ack->inode_export.length()) {
    dout(10) << " got srci import" << dendl;
    mdr->more()->inode_import.share(ack->inode_export);
    mdr->more()->inode_import_v = ack->inode_export_v;
  // remove from waiting list
  ceph_assert(mdr->more()->waiting_on_peer.count(from));
  mdr->more()->waiting_on_peer.erase(from);
  if (mdr->more()->waiting_on_peer.empty())
    dispatch_client_request(mdr); // go again!
  dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
// Peer-side handler for a rename-notify ack: clears the sender from the
// waiting set and re-dispatches the pending peer request once the set is
// empty.
// NOTE(review): extraction dropped closing braces/else lines; the final dout
// is presumably the else-branch of the empty() check — confirm upstream.
void Server::handle_peer_rename_notify_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
  dout(10) << "handle_peer_rename_notify_ack " << *mdr << " from mds."
           << ack->get_source() << dendl;
  ceph_assert(mdr->is_peer());
  mds_rank_t from = mds_rank_t(ack->get_source().num());
  if (mdr->more()->waiting_on_peer.count(from)) {
    mdr->more()->waiting_on_peer.erase(from);
    if (mdr->more()->waiting_on_peer.empty()) {
      if (mdr->peer_request)
        dispatch_peer_request(mdr);
      dout(10) << " still waiting for rename notify acks from "
               << mdr->more()->waiting_on_peer << dendl;
// Callback run once client sessions are flushed during a peer rename: clears
// the MDS_RANK_NONE placeholder from the waiting set and re-dispatches the
// pending peer request when nothing else is awaited.
// NOTE(review): extraction dropped closing braces/else lines — confirm
// nesting against the upstream file.
void Server::_peer_rename_sessions_flushed(MDRequestRef& mdr)
  dout(10) << "_peer_rename_sessions_flushed " << *mdr << dendl;
  if (mdr->more()->waiting_on_peer.count(MDS_RANK_NONE)) {
    mdr->more()->waiting_on_peer.erase(MDS_RANK_NONE);
    if (mdr->more()->waiting_on_peer.empty()) {
      if (mdr->peer_request)
        dispatch_peer_request(mdr);
      dout(10) << " still waiting for rename notify acks from "
               << mdr->more()->waiting_on_peer << dendl;
/* This function takes responsibility for the passed mdr*/
// CEPH_MDS_OP_LSSNAP: list the snapshots of a directory in readdir-style
// pages. Encodes a DirStat header, then (snap name, lease, inodestat)
// triples bounded by the client's max_entries/max_bytes, with path2 used as
// the pagination offset. Replies via respond_to_request().
// NOTE(review): extraction dropped interleaved lines (early returns after
// error replies, declarations of `dirbl`/`dnbl`/`num`/`flags`/`snap_name`,
// several if/else headers and braces) — confirm against the upstream file.
void Server::handle_client_lssnap(MDRequestRef& mdr)
  const cref_t<MClientRequest> &req = mdr->client_request;
  // traverse to path
  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!diri->is_dir()) {
    respond_to_request(mdr, -CEPHFS_ENOTDIR);
  dout(10) << "lssnap on " << *diri << dendl;
  if (!mds->locker->try_rdlock_snap_layout(diri, mdr))
  if (!check_access(mdr, diri, MAY_READ))
  SnapRealm *realm = diri->find_snaprealm();
  map<snapid_t,const SnapInfo*> infomap;
  realm->get_snap_info(infomap, diri->get_oldest_snap());
  unsigned max_entries = req->head.args.readdir.max_entries;
  max_entries = infomap.size();
  int max_bytes = req->head.args.readdir.max_bytes;
  // make sure at least one item can be encoded
  max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
  // path2 carries the last snap name seen by the client (pagination)
  __u64 last_snapid = 0;
  string offset_str = req->get_path2();
  if (!offset_str.empty())
    last_snapid = realm->resolve_snapname(offset_str, diri->ino());
  static DirStat empty;
  CDir::encode_dirstat(dirbl, mdr->session->info, empty);
  max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;
  // encode one entry per snapshot until limits are hit
  auto p = infomap.upper_bound(last_snapid);
  for (; p != infomap.end() && num < max_entries; ++p) {
    dout(10) << p->first << " -> " << *p->second << dendl;
    // own snaps use their short name; inherited ones the long "_name_ino" form
    if (p->second->ino == diri->ino())
      snap_name = p->second->name;
      snap_name = p->second->get_long_name();
    unsigned start_len = dnbl.length();
    if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
    encode(snap_name, dnbl);
    LeaseStat e(CEPH_LEASE_VALID, -1, 0);
    mds->locker->encode_lease(dnbl, mdr->session->info, e);
    dout(20) << "encode_infinite_lease" << dendl;
    int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first,
                                   max_bytes - (int)dnbl.length());
    // roll back the partial entry if the inodestat did not fit
    keep.substr_of(dnbl, 0, start_len);
  encode(num, dirbl);
  if (p == infomap.end()) {
    flags = CEPH_READDIR_FRAG_END;
    if (last_snapid == 0)
      flags |= CEPH_READDIR_FRAG_COMPLETE;
  encode(flags, dirbl);
  dirbl.claim_append(dnbl);
  mdr->reply_extra_bl = dirbl;
  mdr->tracei = diri;
  respond_to_request(mdr, 0);
// Log-completion context for mksnap: once the EUpdate is durable, hand
// control back to Server::_mksnap_finish().
// NOTE(review): the member declarations (diri, info) were dropped by the
// extraction; only ctor/finish are visible.
struct C_MDS_mksnap_finish : public ServerLogContext {
  C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
    ServerLogContext(s, r), diri(di), info(i) {}
  void finish(int r) override {
    server->_mksnap_finish(mdr, diri, info);
/* This function takes responsibility for the passed mdr*/
// CEPH_MDS_OP_MKSNAP: create a snapshot of directory `diri`.
// Validates permissions/limits/name, allocates a snapid through the
// snapclient (two-phase: prepare, then retry with a cached stid), projects
// the inode and its snaprealm, and journals an EUpdate finished by
// C_MDS_mksnap_finish.
// NOTE(review): extraction dropped interleaved lines (early `return`s after
// error replies, `try {`, the `snapid` decode, the SnapInfo declaration,
// closing braces, and the tail of submit_mdlog_entry) — confirm against the
// upstream file.
void Server::handle_client_mksnap(MDRequestRef& mdr)
  const cref_t<MClientRequest> &req = mdr->client_request;
  // make sure we have as new a map as the client
  if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
    mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
  if (!mds->mdsmap->allows_snaps()) {
    // you can't make snapshots until you set an option right now
    dout(5) << "new snapshots are disabled for this fs" << dendl;
    respond_to_request(mdr, -CEPHFS_EPERM);
  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!diri->is_dir()) {
    respond_to_request(mdr, -CEPHFS_ENOTDIR);
  if (diri->is_system() && !diri->is_root()) {
    // no snaps in system dirs (root is ok)
    dout(5) << "is an internal system dir" << dendl;
    respond_to_request(mdr, -CEPHFS_EPERM);
  std::string_view snapname = req->get_filepath().last_dentry();
  // uid must fall within the configured snap uid window
  if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
      mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
    dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid "
             << mdr->client_request->get_caller_uid() << dendl;
    respond_to_request(mdr, -CEPHFS_EPERM);
  dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
  // lock the snaplock (and parent's snap layout) once per request
  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;
    lov.add_xlock(&diri->snaplock);
    if (!mds->locker->acquire_locks(mdr, lov))
    if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
      if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
  // snapshots are only allowed at/above the subvolume root
  if (inodeno_t subvol_ino = diri->find_snaprealm()->get_subvolume_ino();
      (subvol_ino && subvol_ino != diri->ino())) {
    dout(5) << "is a descendent of a subvolume dir" << dendl;
    respond_to_request(mdr, -CEPHFS_EPERM);
  // check if we can create any more snapshots
  // we don't allow any more if we are already at or beyond the limit
  if (diri->snaprealm &&
      diri->snaprealm->get_snaps().size() >= max_snaps_per_dir) {
    respond_to_request(mdr, -CEPHFS_EMLINK);
  // make sure name is unique
  if (diri->snaprealm &&
      diri->snaprealm->exists(snapname)) {
    respond_to_request(mdr, -CEPHFS_EEXIST);
  if (snapname.length() == 0 ||
      snapname[0] == '_') {
    respond_to_request(mdr, -CEPHFS_EINVAL);
  // allocate a snapid
  if (!mdr->more()->stid) {
    mds->snapclient->prepare_create(diri->ino(), snapname,
                                    mdr->get_mds_stamp(),
                                    &mdr->more()->stid, &mdr->more()->snapidbl,
                                    new C_MDS_RetryRequest(mdcache, mdr));
  version_t stid = mdr->more()->stid;
  auto p = mdr->more()->snapidbl.cbegin();
  dout(10) << " stid " << stid << " snapid " << snapid << dendl;
  ceph_assert(mds->snapclient->get_cached_version() >= stid);
  // optional client-supplied snapshot metadata
  SnapPayload payload;
  if (req->get_data().length()) {
    auto iter = req->get_data().cbegin();
    decode(payload, iter);
  } catch (const ceph::buffer::error &e) {
    // backward compat -- client sends xattr bufferlist. however,
    // that is not used anywhere -- so (log and) ignore.
    dout(20) << ": no metadata in payload (old client?)" << dendl;
  info.ino = diri->ino();
  info.snapid = snapid;
  info.name = snapname;
  info.stamp = mdr->get_op_stamp();
  info.metadata = payload.metadata;
  // project the inode: bump ctime/rctime/rsnaps/version
  auto pi = diri->project_inode(mdr, false, true);
  pi.inode->ctime = info.stamp;
  if (info.stamp > pi.inode->rstat.rctime)
    pi.inode->rstat.rctime = info.stamp;
  pi.inode->rstat.rsnaps++;
  pi.inode->version = diri->pre_dirty();
  // project the snaprealm
  auto &newsnap = *pi.snapnode;
  newsnap.created = snapid;
  auto em = newsnap.snaps.emplace(std::piecewise_construct,
                                  std::forward_as_tuple(snapid),
                                  std::forward_as_tuple(info));
  em.first->second = info;
  newsnap.seq = snapid;
  newsnap.last_created = snapid;
  // journal the inode changes
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "mksnap");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
  // journal the snaprealm changes
  submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
// Post-journal completion for mksnap: commits the snaptable transaction,
// notifies the other MDS ranks and clients of the realm change, and replies
// to the client with the new snapid.
// NOTE(review): extraction dropped a few lines (e.g. the pop_and_dirty
// projection step usually between commit and the dout) — confirm upstream.
void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
  dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;
  // CREATE if the realm already existed, SPLIT if it was just opened
  int op = (diri->snaprealm ? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);
  mds->snapclient->commit(mdr->more()->stid, mdr->ls);
  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
  // notify other mds
  mdcache->send_snap_update(diri, mdr->more()->stid, op);
  mdcache->do_realm_invalidate_and_update_notify(diri, op);
  mdr->snapid = info.snapid;
  mdr->tracei = diri;
  respond_to_request(mdr, 0);
// Log-completion context for rmsnap: once the EUpdate is durable, hand
// control back to Server::_rmsnap_finish().
// NOTE(review): member declarations (diri, snapid) were dropped by the
// extraction; only ctor/finish are visible.
struct C_MDS_rmsnap_finish : public ServerLogContext {
  C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
    ServerLogContext(s, r), diri(di), snapid(sn) {}
  void finish(int r) override {
    server->_rmsnap_finish(mdr, diri, snapid);
/* This function takes responsibility for the passed mdr*/
// CEPH_MDS_OP_RMSNAP: remove snapshot `snapname` from directory `diri`.
// Mirrors mksnap: validate, lock, two-phase snaptable prepare_destroy,
// project inode + snaprealm (erase the snap, record last_destroyed), then
// journal an EUpdate finished by C_MDS_rmsnap_finish.
// NOTE(review): extraction dropped interleaved lines (early `return`s after
// error replies, the `seq` decode, closing braces, the tail of
// submit_mdlog_entry) — confirm against the upstream file.
void Server::handle_client_rmsnap(MDRequestRef& mdr)
  const cref_t<MClientRequest> &req = mdr->client_request;
  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!diri->is_dir()) {
    respond_to_request(mdr, -CEPHFS_ENOTDIR);
  std::string_view snapname = req->get_filepath().last_dentry();
  // uid must fall within the configured snap uid window
  if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
      mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
    dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid "
             << mdr->client_request->get_caller_uid() << dendl;
    respond_to_request(mdr, -CEPHFS_EPERM);
  dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;
  // does snap exist?
  if (snapname.length() == 0 || snapname[0] == '_') {
    respond_to_request(mdr, -CEPHFS_EINVAL); // can't prune a parent snap, currently.
  if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
    respond_to_request(mdr, -CEPHFS_ENOENT);
  snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
  dout(10) << " snapname " << snapname << " is " << snapid << dendl;
  // lock the snaplock (and parent's snap layout) once per request
  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;
    lov.add_xlock(&diri->snaplock);
    if (!mds->locker->acquire_locks(mdr, lov))
    if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
      if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
  // two-phase snaptable destroy: prepare, then retry with cached stid
  if (!mdr->more()->stid) {
    mds->snapclient->prepare_destroy(diri->ino(), snapid,
                                     &mdr->more()->stid, &mdr->more()->snapidbl,
                                     new C_MDS_RetryRequest(mdcache, mdr));
  version_t stid = mdr->more()->stid;
  auto p = mdr->more()->snapidbl.cbegin();
  dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
  ceph_assert(mds->snapclient->get_cached_version() >= stid);
  // project the inode: bump ctime/rctime, decrement rsnaps
  auto pi = diri->project_inode(mdr, false, true);
  pi.inode->version = diri->pre_dirty();
  pi.inode->ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
    pi.inode->rstat.rctime = mdr->get_op_stamp();
  pi.inode->rstat.rsnaps--;
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "rmsnap");
  mdlog->start_entry(le);
  // project the snaprealm
  auto &newnode = *pi.snapnode;
  newnode.snaps.erase(snapid);
  newnode.last_destroyed = seq;
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
  submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
// Post-journal completion for rmsnap: commits the snaptable transaction,
// broadcasts CEPH_SNAP_OP_DESTROY to other ranks and clients, replies to
// the client, then purges stale snap data for the removed snapshot.
// NOTE(review): extraction dropped a few lines (e.g. the decode from `p`
// and the pop_and_dirty projection step) — confirm upstream.
void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
  dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
  snapid_t stid = mdr->more()->stid;
  auto p = mdr->more()->snapidbl.cbegin();
  mds->snapclient->commit(stid, mdr->ls);
  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
  // notify other mds
  mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_DESTROY);
  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);
  respond_to_request(mdr, 0);
  // purge snapshot data
  diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
// Log-completion context for renamesnap: once the EUpdate is durable, hand
// control back to Server::_renamesnap_finish().
// NOTE(review): member declarations (diri, snapid) were dropped by the
// extraction; only ctor/finish are visible.
struct C_MDS_renamesnap_finish : public ServerLogContext {
  C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
    ServerLogContext(s, r), diri(di), snapid(sn) {}
  void finish(int r) override {
    server->_renamesnap_finish(mdr, diri, snapid);
/* This function takes responsibility for the passed mdr*/
// CEPH_MDS_OP_RENAMESNAP: rename snapshot `srcname` (path2) to `dstname`
// (path) on directory `diri`. Validates both names, locks, runs a two-phase
// snaptable prepare_update, projects the inode and the renamed snap entry,
// and journals an EUpdate finished by C_MDS_renamesnap_finish.
// NOTE(review): extraction dropped interleaved lines (early `return`s after
// error replies, closing braces, the tail of submit_mdlog_entry) — confirm
// against the upstream file.
void Server::handle_client_renamesnap(MDRequestRef& mdr)
  const cref_t<MClientRequest> &req = mdr->client_request;
  // src and dst must name snaps of the same directory
  if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
    respond_to_request(mdr, -CEPHFS_EINVAL);
  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!diri->is_dir()) { // dir only
    respond_to_request(mdr, -CEPHFS_ENOTDIR);
  // uid must fall within the configured snap uid window
  if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
      mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
    respond_to_request(mdr, -CEPHFS_EPERM);
  std::string_view dstname = req->get_filepath().last_dentry();
  std::string_view srcname = req->get_filepath2().last_dentry();
  dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;
  if (srcname.length() == 0 || srcname[0] == '_') {
    respond_to_request(mdr, -CEPHFS_EINVAL); // can't rename a parent snap.
  if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
    respond_to_request(mdr, -CEPHFS_ENOENT);
  if (dstname.length() == 0 || dstname[0] == '_') {
    respond_to_request(mdr, -CEPHFS_EINVAL);
  if (diri->snaprealm->exists(dstname)) {
    respond_to_request(mdr, -CEPHFS_EEXIST);
  snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
  dout(10) << " snapname " << srcname << " is " << snapid << dendl;
  // lock the snaplock (and parent's snap layout) once per request
  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;
    lov.add_xlock(&diri->snaplock);
    if (!mds->locker->acquire_locks(mdr, lov))
    if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
      if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
  // two-phase snaptable update: prepare, then retry with cached stid
  if (!mdr->more()->stid) {
    mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
                                    &mdr->more()->stid,
                                    new C_MDS_RetryRequest(mdcache, mdr));
  version_t stid = mdr->more()->stid;
  dout(10) << " stid is " << stid << dendl;
  ceph_assert(mds->snapclient->get_cached_version() >= stid);
  // project the inode: bump ctime/rctime/version
  auto pi = diri->project_inode(mdr, false, true);
  pi.inode->ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
    pi.inode->rstat.rctime = mdr->get_op_stamp();
  pi.inode->version = diri->pre_dirty();
  // project the snaprealm
  auto &newsnap = *pi.snapnode;
  auto it = newsnap.snaps.find(snapid);
  ceph_assert(it != newsnap.snaps.end());
  it->second.name = dstname;
  // journal the inode changes
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "renamesnap");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
  // journal the snaprealm changes
  submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
// Post-journal completion for renamesnap: commits the snaptable
// transaction, broadcasts CEPH_SNAP_OP_UPDATE to other ranks and clients,
// and replies to the client with the (unchanged) snapid.
// NOTE(review): extraction dropped a few lines (e.g. the pop_and_dirty
// projection step) — confirm upstream.
void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
  dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;
  mds->snapclient->commit(mdr->more()->stid, mdr->ls);
  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
  // notify other mds
  mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_UPDATE);
  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE);
  mdr->tracei = diri;
  mdr->snapid = snapid;
  respond_to_request(mdr, 0);
/*
 * Return true if server is in state RECONNECT and this
 * client has not yet reconnected.
 */
// Membership test on the reconnect gather set; the set is presumably empty
// outside the RECONNECT state — TODO confirm against the rest of the file.
bool Server::waiting_for_reconnect(client_t c) const
  return client_reconnect_gather.count(c) > 0;
11067 void Server::dump_reconnect_status(Formatter
*f
) const
11069 f
->open_object_section("reconnect_status");
11070 f
->dump_stream("client_reconnect_gather") << client_reconnect_gather
;
11071 f
->close_section();