1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include <boost/lexical_cast.hpp>
16 #include "include/ceph_assert.h" // lexical_cast includes system assert.h
18 #include <boost/config/warning_disable.hpp>
19 #include <boost/fusion/include/std_pair.hpp>
20 #include <boost/range/adaptor/reversed.hpp>
28 #include "MDBalancer.h"
30 #include "SnapClient.h"
32 #include "MetricsHandler.h"
33 #include "cephfs_features.h"
35 #include "msg/Messenger.h"
37 #include "osdc/Objecter.h"
39 #include "events/EUpdate.h"
40 #include "events/EPeerUpdate.h"
41 #include "events/ESession.h"
42 #include "events/EOpen.h"
43 #include "events/ECommitted.h"
44 #include "events/EPurged.h"
46 #include "include/stringify.h"
47 #include "include/filepath.h"
48 #include "common/errno.h"
49 #include "common/Timer.h"
50 #include "common/perf_counters.h"
51 #include "include/compat.h"
52 #include "osd/OSDMap.h"
58 #include <string_view>
61 #include "common/config.h"
63 #define dout_context g_ceph_context
64 #define dout_subsys ceph_subsys_mds
66 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
// ServerContext: an MDSContext completion callback bound to its owning
// Server; the MDS rank is resolved through the Server instance.
// NOTE(review): extraction gaps — the `server` member declaration, the
// get_mds() body and the closing braces are missing from this view.
70 class ServerContext
 : public MDSContext
{
// Resolve the MDSRank through the owning Server (body not visible here).
73 MDSRank
*get_mds() override
// Construct bound to a Server; a null Server is a programmer error.
79 explicit ServerContext(Server
*s
) : server(s
) {
80 ceph_assert(server
!= NULL
);
// Batch_Getattr_Lookup: a BatchOp that coalesces concurrent getattr/lookup
// requests behind a single "head" MDRequest (mdr); the queued duplicates are
// kept in batch_reqs and all forwarded/answered together with the head.
// NOTE(review): extraction gaps — several statements and closing braces of
// this class are missing from this view.
84 class Batch_Getattr_Lookup
: public BatchOp
{
// The head request currently driving the batch.
87 ceph::ref_t
<MDRequestImpl
> mdr
;
// Requests queued behind the head; replied to when the head completes.
88 std::vector
<ceph::ref_t
<MDRequestImpl
>> batch_reqs
;
// Register this batch op on the dentry (LOOKUP) or inode (getattr) so
// later duplicate requests can find and join it.
91 Batch_Getattr_Lookup(Server
* s
, const ceph::ref_t
<MDRequestImpl
>& r
)
93 if (mdr
->client_request
->get_op() == CEPH_MDS_OP_LOOKUP
)
94 mdr
->batch_op_map
= &mdr
->dn
[0].back()->batch_ops
;
96 mdr
->batch_op_map
= &mdr
->in
[0]->batch_ops
;
// Queue another duplicate request behind the head.
98 void add_request(const ceph::ref_t
<MDRequestImpl
>& r
) override
{
99 batch_reqs
.push_back(r
);
// Promote a queued request to be the new head (hands over batch_op_map
// from the old head). Used when the current head is aborted.
101 ceph::ref_t
<MDRequestImpl
> find_new_head() override
{
102 while (!batch_reqs
.empty()) {
103 auto r
= std::move(batch_reqs
.back());
.back());
104 batch_reqs
.pop_back();
108 r
->batch_op_map
= mdr
->batch_op_map
;
109 mdr
->batch_op_map
= nullptr;
// Forward the head and every queued request to MDS rank t.
115 void _forward(mds_rank_t t
) override
{
116 MDCache
* mdcache
= server
->mdcache
;
117 mdcache
->mds
->forward_message_mds(mdr
->release_client_request(), t
);
118 mdr
->set_mds_stamp(ceph_clock_now());
119 for (auto& m
: batch_reqs
) {
121 mdcache
->request_forward(m
, t
);
// Reply r to all queued requests (copying the head's trace results to
// each), then reply to the head itself.
125 void _respond(int r
) override
{
126 mdr
->set_mds_stamp(ceph_clock_now());
127 for (auto& m
: batch_reqs
) {
129 m
->tracei
= mdr
->tracei
;
130 m
->tracedn
= mdr
->tracedn
;
131 server
->respond_to_request(m
, r
);
135 server
->reply_client_request(mdr
, make_message
<MClientReply
>(*mdr
->client_request
, r
));
137 void print(std::ostream
& o
) {
138 o
<< "[batch front=" << *mdr
<< "]";
// ServerLogContext: journal-commit callback bound to a Server and
// (optionally) the MDRequest whose update was journaled.
// NOTE(review): extraction gaps — the `server`/`mdr` member declarations,
// get_mds()/pre_finish() bodies' surrounding braces and the class closing
// brace are missing from this view.
142 class ServerLogContext
: public MDSLogContextBase
{
145 MDSRank
*get_mds() override
// Mark the request's event trail when the journal entry commits.
151 void pre_finish(int r
) override
{
153 mdr
->mark_event("journal_committed: ");
156 explicit ServerLogContext(Server
*s
) : server(s
) {
157 ceph_assert(server
!= NULL
);
159 explicit ServerLogContext(Server
*s
, MDRequestRef
& r
) : server(s
), mdr(r
) {
160 ceph_assert(server
!= NULL
);
// Build and register the "mds_server" perf counter set: a few
// PRIO_INTERESTING request/session counters, per-op latency averages at
// PRIO_USEFUL, and dispatch counters at PRIO_DEBUGONLY. The resulting
// PerfCounters object is stored in `logger` and added to the global
// collection.
164 void Server::create_logger()
166 PerfCountersBuilder
plb(g_ceph_context
, "mds_server", l_mdss_first
, l_mdss_last
);
168 plb
.add_u64_counter(l_mdss_handle_client_request
, "handle_client_request",
169 "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING
);
170 plb
.add_u64_counter(l_mdss_handle_peer_request
, "handle_peer_request",
171 "Peer requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING
);
172 plb
.add_u64_counter(l_mdss_handle_client_session
,
173 "handle_client_session", "Client session messages", "hcs",
174 PerfCountersBuilder::PRIO_INTERESTING
);
175 plb
.add_u64_counter(l_mdss_cap_revoke_eviction
, "cap_revoke_eviction",
176 "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING
);
177 plb
.add_u64_counter(l_mdss_cap_acquisition_throttle
,
178 "cap_acquisition_throttle", "Cap acquisition throttle counter", "cat",
179 PerfCountersBuilder::PRIO_INTERESTING
);
181 // fop latencies are useful
182 plb
.set_prio_default(PerfCountersBuilder::PRIO_USEFUL
);
183 plb
.add_time_avg(l_mdss_req_lookuphash_latency
, "req_lookuphash_latency",
184 "Request type lookup hash of inode latency");
185 plb
.add_time_avg(l_mdss_req_lookupino_latency
, "req_lookupino_latency",
186 "Request type lookup inode latency");
187 plb
.add_time_avg(l_mdss_req_lookupparent_latency
, "req_lookupparent_latency",
188 "Request type lookup parent latency");
189 plb
.add_time_avg(l_mdss_req_lookupname_latency
, "req_lookupname_latency",
190 "Request type lookup name latency");
191 plb
.add_time_avg(l_mdss_req_lookup_latency
, "req_lookup_latency",
192 "Request type lookup latency");
193 plb
.add_time_avg(l_mdss_req_lookupsnap_latency
, "req_lookupsnap_latency",
194 "Request type lookup snapshot latency");
195 plb
.add_time_avg(l_mdss_req_getattr_latency
, "req_getattr_latency",
196 "Request type get attribute latency");
197 plb
.add_time_avg(l_mdss_req_setattr_latency
, "req_setattr_latency",
198 "Request type set attribute latency");
199 plb
.add_time_avg(l_mdss_req_setlayout_latency
, "req_setlayout_latency",
200 "Request type set file layout latency");
201 plb
.add_time_avg(l_mdss_req_setdirlayout_latency
, "req_setdirlayout_latency",
202 "Request type set directory layout latency");
203 plb
.add_time_avg(l_mdss_req_getvxattr_latency
, "req_getvxattr_latency",
204 "Request type get virtual extended attribute latency");
205 plb
.add_time_avg(l_mdss_req_setxattr_latency
, "req_setxattr_latency",
206 "Request type set extended attribute latency");
207 plb
.add_time_avg(l_mdss_req_rmxattr_latency
, "req_rmxattr_latency",
208 "Request type remove extended attribute latency");
209 plb
.add_time_avg(l_mdss_req_readdir_latency
, "req_readdir_latency",
210 "Request type read directory latency");
211 plb
.add_time_avg(l_mdss_req_setfilelock_latency
, "req_setfilelock_latency",
212 "Request type set file lock latency");
213 plb
.add_time_avg(l_mdss_req_getfilelock_latency
, "req_getfilelock_latency",
214 "Request type get file lock latency");
215 plb
.add_time_avg(l_mdss_req_create_latency
, "req_create_latency",
216 "Request type create latency");
217 plb
.add_time_avg(l_mdss_req_open_latency
, "req_open_latency",
218 "Request type open latency");
219 plb
.add_time_avg(l_mdss_req_mknod_latency
, "req_mknod_latency",
220 "Request type make node latency");
221 plb
.add_time_avg(l_mdss_req_link_latency
, "req_link_latency",
222 "Request type link latency");
223 plb
.add_time_avg(l_mdss_req_unlink_latency
, "req_unlink_latency",
224 "Request type unlink latency");
225 plb
.add_time_avg(l_mdss_req_rmdir_latency
, "req_rmdir_latency",
226 "Request type remove directory latency");
227 plb
.add_time_avg(l_mdss_req_rename_latency
, "req_rename_latency",
228 "Request type rename latency");
229 plb
.add_time_avg(l_mdss_req_mkdir_latency
, "req_mkdir_latency",
230 "Request type make directory latency");
231 plb
.add_time_avg(l_mdss_req_symlink_latency
, "req_symlink_latency",
232 "Request type symbolic link latency");
233 plb
.add_time_avg(l_mdss_req_lssnap_latency
, "req_lssnap_latency",
234 "Request type list snapshot latency");
235 plb
.add_time_avg(l_mdss_req_mksnap_latency
, "req_mksnap_latency",
236 "Request type make snapshot latency");
237 plb
.add_time_avg(l_mdss_req_rmsnap_latency
, "req_rmsnap_latency",
238 "Request type remove snapshot latency");
239 plb
.add_time_avg(l_mdss_req_renamesnap_latency
, "req_renamesnap_latency",
240 "Request type rename snapshot latency");
// Remaining counters are debug-only.
242 plb
.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY
);
243 plb
.add_u64_counter(l_mdss_dispatch_client_request
, "dispatch_client_request",
244 "Client requests dispatched");
245 plb
.add_u64_counter(l_mdss_dispatch_peer_request
, "dispatch_server_request",
246 "Server requests dispatched");
248 logger
= plb
.create_perf_counters();
249 g_ceph_context
->get_perfcounters_collection()->add(logger
);
// Server constructor: caches mdcache/mdlog pointers from the owning
// MDSRank and snapshots the tunables this Server consults from the
// current configuration.
// NOTE(review): extraction gaps — part of the initializer list (e.g. the
// `mds(m)` member init visible in upstream) and the opening/closing braces
// are missing from this view.
252 Server::Server(MDSRank
*m
, MetricsHandler
*metrics_handler
) :
254 mdcache(mds
->mdcache
), mdlog(mds
->mdlog
),
255 recall_throttle(g_conf().get_val
<double>("mds_recall_max_decay_rate")),
256 metrics_handler(metrics_handler
)
// Cache config values; these mirror mds_* options of the same names.
258 forward_all_requests_to_auth
= g_conf().get_val
<bool>("mds_forward_all_requests_to_auth");
259 replay_unsafe_with_closed_session
= g_conf().get_val
<bool>("mds_replay_unsafe_with_closed_session");
260 cap_revoke_eviction_timeout
= g_conf().get_val
<double>("mds_cap_revoke_eviction_timeout");
261 max_snaps_per_dir
= g_conf().get_val
<uint64_t>("mds_max_snaps_per_dir");
262 delegate_inos_pct
= g_conf().get_val
<uint64_t>("mds_client_delegate_inos_pct");
263 max_caps_per_client
= g_conf().get_val
<uint64_t>("mds_max_caps_per_client");
264 cap_acquisition_throttle
= g_conf().get_val
<uint64_t>("mds_session_cap_acquisition_throttle");
265 max_caps_throttle_ratio
= g_conf().get_val
<double>("mds_session_max_caps_throttle_ratio");
266 caps_throttle_retry_request_timeout
= g_conf().get_val
<double>("mds_cap_acquisition_throttle_retry_request_timeout");
267 dir_max_entries
= g_conf().get_val
<uint64_t>("mds_dir_max_entries");
268 bal_fragment_size_max
= g_conf().get_val
<int64_t>("mds_bal_fragment_size_max");
// Feature/metric sets this MDS advertises to clients at session open.
269 supported_features
= feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED
);
270 supported_metric_spec
= feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL
);
// Top-level message dispatch for the Server subsystem: routes reconnects,
// client session/request/reclaim messages and peer MDS requests. Client
// requests that arrive before the MDS is active are either queued for
// replay or re-queued until the MDS becomes active.
// NOTE(review): extraction gaps — several return/break statements and
// closing braces are missing from this view.
273 void Server::dispatch(const cref_t
<Message
> &m
)
275 switch (m
->get_type()) {
276 case CEPH_MSG_CLIENT_RECONNECT
:
277 handle_client_reconnect(ref_cast
<MClientReconnect
>(m
));
282 *In reconnect phase, client sent unsafe requests to mds before reconnect msg. Seting sessionclosed_isok will handle scenario like this:
284 1. In reconnect phase, client sent unsafe requests to mds.
285 2. It reached reconnect timeout. All sessions without sending reconnect msg in time, some of which may had sent unsafe requests, are marked as closed.
286 (Another situation is #31668, which will deny all client reconnect msg to speed up reboot).
287 3.So these unsafe request from session without sending reconnect msg in time or being denied could be handled in clientreplay phase.
290 bool sessionclosed_isok
= replay_unsafe_with_closed_session
;
292 // handle_peer_request()/handle_client_session() will wait if necessary
293 if (m
->get_type() == CEPH_MSG_CLIENT_REQUEST
&& !mds
->is_active()) {
294 const auto &req
= ref_cast
<MClientRequest
>(m
);
295 if (mds
->is_reconnect() || mds
->get_want_state() == CEPH_MDS_STATE_RECONNECT
) {
296 Session
*session
= mds
->get_session(req
);
// Drop requests from sessions that are gone (unless closed sessions are
// tolerated during replay, per sessionclosed_isok above).
297 if (!session
|| (!session
->is_open() && !sessionclosed_isok
)) {
298 dout(5) << "session is closed, dropping " << req
->get_reqid() << dendl
;
301 bool queue_replay
= false;
302 if (req
->is_replay() || req
->is_async()) {
303 dout(3) << "queuing replayed op" << dendl
;
306 !session
->have_completed_request(req
->get_reqid().tid
, nullptr)) {
// Record the ino a replayed create will re-allocate.
307 inodeno_t
ino(req
->head
.ino
);
308 mdcache
->add_replay_ino_alloc(ino
);
309 if (replay_unsafe_with_closed_session
&&
310 session
->free_prealloc_inos
.contains(ino
)) {
311 // don't purge inodes that will be created by later replay
312 session
->free_prealloc_inos
.erase(ino
);
313 session
->delegated_inos
.insert(ino
);
316 } else if (req
->get_retry_attempt()) {
317 // process completed request in clientreplay stage. The completed request
318 // might have created new file/directorie. This guarantees MDS sends a reply
319 // to client before other request modifies the new file/directorie.
320 if (session
->have_completed_request(req
->get_reqid().tid
, NULL
)) {
321 dout(3) << "queuing completed op" << dendl
;
324 // this request was created before the cap reconnect message, drop any embedded
326 req
->releases
.clear();
329 req
->mark_queued_for_replay();
330 mds
->enqueue_replay(new C_MDS_RetryMessage(mds
, m
));
// Decide whether to park this request until the MDS is active.
335 bool wait_for_active
= true;
336 if (mds
->is_stopping()) {
337 wait_for_active
= false;
338 } else if (mds
->is_clientreplay()) {
339 if (req
->is_queued_for_replay()) {
340 wait_for_active
= false;
343 if (wait_for_active
) {
344 dout(3) << "not active yet, waiting" << dendl
;
345 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
// Route by message type; unknown types abort.
350 switch (m
->get_type()) {
351 case CEPH_MSG_CLIENT_SESSION
:
352 handle_client_session(ref_cast
<MClientSession
>(m
));
354 case CEPH_MSG_CLIENT_REQUEST
:
355 handle_client_request(ref_cast
<MClientRequest
>(m
));
357 case CEPH_MSG_CLIENT_RECLAIM
:
358 handle_client_reclaim(ref_cast
<MClientReclaim
>(m
));
360 case MSG_MDS_PEER_REQUEST
:
361 handle_peer_request(ref_cast
<MMDSPeerRequest
>(m
));
364 derr
<< "server unknown message " << m
->get_type() << dendl
;
365 ceph_abort_msg("server unknown message");
371 // ----------------------------------------------------------
372 // SESSION management
// C_MDS_session_finish: journal-commit callback for session open/close.
// Once the ESession entry is safe it calls Server::_session_logged() with
// the recorded state (session map version, inos to free/purge, etc.).
// NOTE(review): extraction gaps — the session/state_seq/open/cmapv/
// inotablev/fin member declarations and the class closing brace are
// missing from this view.
374 class C_MDS_session_finish
: public ServerLogContext
{
// Preallocated inos to return to the inotable when the session closes.
379 interval_set
<inodeno_t
> inos_to_free
;
// Preallocated inos whose backing objects must be purged.
381 interval_set
<inodeno_t
> inos_to_purge
;
382 LogSegment
*ls
= nullptr;
// Variant with no ino bookkeeping (plain open/close).
385 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
, Context
*fin_
= nullptr) :
386 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
), inotablev(0), fin(fin_
) { }
// Variant carrying inos to free/purge plus the inotable version they map to.
387 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
,
388 const interval_set
<inodeno_t
>& to_free
, version_t iv
,
389 const interval_set
<inodeno_t
>& to_purge
, LogSegment
*_ls
, Context
*fin_
= nullptr) :
390 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
),
391 inos_to_free(to_free
), inotablev(iv
), inos_to_purge(to_purge
), ls(_ls
), fin(fin_
) {}
392 void finish(int r
) override
{
394 server
->_session_logged(session
, state_seq
, open
, cmapv
, inos_to_free
, inotablev
, inos_to_purge
, ls
);
// Scan the session map for a session whose client metadata "uuid" matches;
// asserts consistency of any reclaim relationship between duplicates.
// NOTE(review): extraction gaps — the first match branch, the `continue`,
// the final return and closing braces are missing from this view.
401 Session
* Server::find_session_by_uuid(std::string_view uuid
)
403 Session
* session
= nullptr;
404 for (auto& it
: mds
->sessionmap
.get_sessions()) {
405 auto& metadata
= it
.second
->info
.client_metadata
;
// Skip sessions without a matching "uuid" metadata entry.
407 auto p
= metadata
.find("uuid");
408 if (p
== metadata
.end() || p
->second
!= uuid
)
413 } else if (!session
->reclaiming_from
) {
414 ceph_assert(it
.second
->reclaiming_from
== session
);
417 ceph_assert(session
->reclaiming_from
== it
.second
);
// Handle a client MClientReclaim: validate the request (uuid present,
// supported flags, matching auth_name), then link this session to the
// target it reclaims from and reply with the target's address. With
// CEPH_RECLAIM_RESET the reclaim is completed immediately.
// NOTE(review): extraction gaps — the early `return`s after each error
// reply, the null-target branch and closing braces are missing from view.
423 void Server::reclaim_session(Session
*session
, const cref_t
<MClientReclaim
> &m
)
425 if (!session
->is_open() && !session
->is_stale()) {
426 dout(10) << "session not open, dropping this req" << dendl
;
430 auto reply
= make_message
<MClientReclaimReply
>(0);
431 if (m
->get_uuid().empty()) {
432 dout(10) << __func__
<< " invalid message (no uuid)" << dendl
;
433 reply
->set_result(-CEPHFS_EINVAL
);
434 mds
->send_message_client(reply
, session
);
438 unsigned flags
= m
->get_flags();
439 if (flags
!= CEPH_RECLAIM_RESET
) { // currently only support reset
440 dout(10) << __func__
<< " unsupported flags" << dendl
;
441 reply
->set_result(-CEPHFS_EOPNOTSUPP
);
442 mds
->send_message_client(reply
, session
);
446 Session
* target
= find_session_by_uuid(m
->get_uuid());
// Only the same authenticated entity may reclaim the target session.
448 if (session
->info
.auth_name
!= target
->info
.auth_name
) {
449 dout(10) << __func__
<< " session auth_name " << session
->info
.auth_name
450 << " != target auth_name " << target
->info
.auth_name
<< dendl
;
451 reply
->set_result(-CEPHFS_EPERM
);
452 mds
->send_message_client(reply
, session
);
455 ceph_assert(!target
->reclaiming_from
);
456 ceph_assert(!session
->reclaiming_from
);
457 session
->reclaiming_from
= target
;
458 reply
->set_addrs(entity_addrvec_t(target
->info
.inst
.addr
));
461 if (flags
& CEPH_RECLAIM_RESET
) {
462 finish_reclaim_session(session
, reply
);
// Complete a reclaim: detach the session from its reclaim target, build a
// deferred reply callback (re-looks up the session by id since eviction may
// run asynchronously and drop the MDS lock), then either kill or evict/
// blocklist the target before replying with the current OSD epoch.
// NOTE(review): extraction gaps — the `send_reply` Context declaration,
// several braces and the no-target fast path are missing from this view.
469 void Server::finish_reclaim_session(Session
*session
, const ref_t
<MClientReclaimReply
> &reply
)
471 Session
*target
= session
->reclaiming_from
;
473 session
->reclaiming_from
= nullptr;
// Capture the client id (not the Session*) — the session may be
// invalidated before this callback runs.
477 int64_t session_id
= session
->get_client().v
;
478 send_reply
= new LambdaContext([this, session_id
, reply
](int r
) {
479 ceph_assert(ceph_mutex_is_locked_by_me(mds
->mds_lock
));
480 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(session_id
));
// Report the OSD epoch so the client can wait for blocklist propagation.
484 auto epoch
= mds
->objecter
->with_osdmap([](const OSDMap
&map
){ return map
.get_epoch(); });
485 reply
->set_epoch(epoch
);
486 mds
->send_message_client(reply
, session
);
489 send_reply
= nullptr;
492 bool blocklisted
= mds
->objecter
->with_osdmap([target
](const OSDMap
&map
) {
493 return map
.is_blocklisted(target
->info
.inst
.addr
);
// Already blocklisted (or blocklisting disabled): just kill the target;
// otherwise evict it with blocklisting.
496 if (blocklisted
|| !g_conf()->mds_session_blocklist_on_evict
) {
497 kill_session(target
, send_reply
);
499 CachedStackStringStream css
;
500 mds
->evict_client(target
->get_client().v
, false, true, *css
, send_reply
);
503 mds
->send_message_client(reply
, session
);
// Entry point for MClientReclaim: sanity-check the sender/session and fs
// capability, wait for clientreplay if the MDS isn't that far yet, then
// either finish or start the reclaim depending on FLAG_FINISH.
// NOTE(review): extraction gaps — the sessionless check, the `return`s
// after the drop/wait paths and closing braces are missing from this view.
507 void Server::handle_client_reclaim(const cref_t
<MClientReclaim
> &m
)
509 Session
*session
= mds
->get_session(m
);
510 dout(3) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
511 ceph_assert(m
->get_source().is_client()); // should _not_ come from an mds!
514 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
// Reject clients whose caps don't cover this filesystem's name.
518 std::string_view fs_name
= mds
->mdsmap
->get_fs_name();
519 if (!fs_name
.empty() && !session
->fs_name_capable(fs_name
, MAY_READ
)) {
520 dout(0) << " dropping message not allowed for this fs_name: " << *m
<< dendl
;
524 if (mds
->get_state() < MDSMap::STATE_CLIENTREPLAY
) {
525 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
529 if (m
->get_flags() & MClientReclaim::FLAG_FINISH
) {
530 finish_reclaim_session(session
);
532 reclaim_session(session
, m
);
// Central handler for MClientSession messages: OPEN (validated against
// blocklist, required features, claimed root path and uuid uniqueness,
// then journaled via ESession + C_MDS_session_finish), RENEWCAPS, CLOSE
// (push-seq checked, then journaled), FLUSHMSG_ACK and FLUSH_MDLOG.
// NOTE(review): extraction gaps — many `return`/`break` statements,
// several braces and the `pv`/`sseq` declarations are missing from view.
536 void Server::handle_client_session(const cref_t
<MClientSession
> &m
)
539 Session
*session
= mds
->get_session(m
);
541 dout(3) << "handle_client_session " << *m
<< " from " << m
->get_source() << dendl
;
542 ceph_assert(m
->get_source().is_client()); // should _not_ come from an mds!
545 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
546 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
547 reply
->metadata
["error_string"] = "sessionless";
548 mds
->send_message(reply
, m
->get_connection());
// Reject clients whose caps don't cover this filesystem's name.
552 std::string_view fs_name
= mds
->mdsmap
->get_fs_name();
553 if (!fs_name
.empty() && !session
->fs_name_capable(fs_name
, MAY_READ
)) {
554 dout(0) << " dropping message not allowed for this fs_name: " << *m
<< dendl
;
555 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
556 reply
->metadata
["error_string"] = "client doesn't have caps for FS \"" +
557 std::string(fs_name
) + "\"";
558 mds
->send_message(std::move(reply
), m
->get_connection());
562 if (m
->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS
) {
563 // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
564 } else if (m
->get_op() == CEPH_SESSION_REQUEST_CLOSE
) {
565 // close requests need to be handled when mds is active
566 if (mds
->get_state() < MDSMap::STATE_ACTIVE
) {
567 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
571 if (mds
->get_state() < MDSMap::STATE_CLIENTREPLAY
) {
572 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
578 logger
->inc(l_mdss_handle_client_session
);
581 switch (m
->get_op()) {
582 case CEPH_SESSION_REQUEST_OPEN
:
// Ignore duplicate opens while already open(ing)/stale/killing.
583 if (session
->is_opening() ||
584 session
->is_open() ||
585 session
->is_stale() ||
586 session
->is_killing() ||
587 terminating_sessions
) {
588 dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl
;
591 ceph_assert(session
->is_closed() || session
->is_closing());
593 if (mds
->is_stopping()) {
594 dout(10) << "mds is stopping, dropping open req" << dendl
;
599 auto& addr
= session
->info
.inst
.addr
;
600 session
->set_client_metadata(client_metadata_t(m
->metadata
, m
->supported_features
, m
->metric_spec
));
601 auto& client_metadata
= session
->info
.client_metadata
;
// Helper: log one summary line about the session-open outcome.
603 auto log_session_status
= [this, m
, session
](std::string_view status
, std::string_view err
) {
604 auto now
= ceph_clock_now();
605 auto throttle_elapsed
= m
->get_recv_complete_stamp() - m
->get_throttle_stamp();
606 auto elapsed
= now
- m
->get_recv_stamp();
607 CachedStackStringStream css
;
608 *css
<< "New client session:"
609 << " addr=\"" << session
->info
.inst
.addr
<< "\""
610 << ",elapsed=" << elapsed
611 << ",throttled=" << throttle_elapsed
612 << ",status=\"" << status
<< "\"";
614 *css
<< ",error=\"" << err
<< "\"";
616 const auto& metadata
= session
->info
.client_metadata
;
617 if (auto it
= metadata
.find("root"); it
!= metadata
.end()) {
618 *css
<< ",root=\"" << it
->second
<< "\"";
620 dout(2) << css
->strv() << dendl
;
// Helper: send a REJECT (with error_string for MIMIC+ clients) and log it.
623 auto send_reject_message
= [this, &session
, &log_session_status
](std::string_view err_str
, unsigned flags
=0) {
624 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
, 0, flags
);
625 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
))
626 m
->metadata
["error_string"] = err_str
;
627 mds
->send_message_client(m
, session
);
628 log_session_status("REJECTED", err_str
);
631 bool blocklisted
= mds
->objecter
->with_osdmap(
632 [&addr
](const OSDMap
&osd_map
) -> bool {
633 return osd_map
.is_blocklisted(addr
);
637 dout(10) << "rejecting blocklisted client " << addr
<< dendl
;
638 // This goes on the wire and the "blacklisted" substring is
639 // depended upon by the kernel client for detecting whether it
640 // has been blocklisted. If mounted with recover_session=clean
641 // (since 5.4), it tries to automatically recover itself from
644 flags
|= MClientSession::SESSION_BLOCKLISTED
;
645 send_reject_message("blocklisted (blacklisted)", flags
);
650 if (client_metadata
.features
.empty())
651 infer_supported_features(session
, client_metadata
);
653 dout(20) << __func__
<< " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl
;
654 dout(20) << " features: '" << client_metadata
.features
<< "'" << dendl
;
655 dout(20) << " metric specification: [" << client_metadata
.metric_spec
<< "]" << dendl
;
656 for (const auto& p
: client_metadata
) {
657 dout(20) << " " << p
.first
<< ": " << p
.second
<< dendl
;
// Reject clients lacking any feature this MDS requires.
660 feature_bitset_t missing_features
= required_client_features
;
661 missing_features
-= client_metadata
.features
;
662 if (!missing_features
.empty()) {
663 CachedStackStringStream css
;
664 *css
<< "missing required features '" << missing_features
<< "'";
665 send_reject_message(css
->strv());
666 mds
->clog
->warn() << "client session (" << session
->info
.inst
667 << ") lacks required features " << missing_features
668 << "; client supports " << client_metadata
.features
;
673 // Special case for the 'root' metadata path; validate that the claimed
674 // root is actually within the caps of the session
675 if (auto it
= client_metadata
.find("root"); it
!= client_metadata
.end()) {
676 auto claimed_root
= it
->second
;
677 CachedStackStringStream css
;
679 // claimed_root has a leading "/" which we strip before passing
681 if (claimed_root
.empty() || claimed_root
[0] != '/') {
// NOTE(review): "invalue" below looks like a typo for "invalid"; it is a
// wire-visible error string, so fixing it should be a deliberate change.
683 *css
<< "invalue root '" << claimed_root
<< "'";
684 } else if (!session
->auth_caps
.path_capable(claimed_root
.substr(1))) {
686 *css
<< "non-allowable root '" << claimed_root
<< "'";
690 // Tell the client we're rejecting their open
691 send_reject_message(css
->strv());
692 mds
->clog
->warn() << "client session with " << css
->strv()
693 << " denied (" << session
->info
.inst
<< ")";
// A uuid may only be claimed by one live session at a time.
699 if (auto it
= client_metadata
.find("uuid"); it
!= client_metadata
.end()) {
700 if (find_session_by_uuid(it
->second
)) {
701 send_reject_message("duplicated session uuid");
702 mds
->clog
->warn() << "client session with duplicated session uuid '"
703 << it
->second
<< "' denied (" << session
->info
.inst
<< ")";
709 if (session
->is_closed()) {
710 mds
->sessionmap
.add_session(session
);
// Project the session map, mark OPENING, and journal the open; the
// C_MDS_session_finish callback completes it on commit.
713 pv
= mds
->sessionmap
.mark_projected(session
);
714 sseq
= mds
->sessionmap
.set_state(session
, Session::STATE_OPENING
);
715 mds
->sessionmap
.touch_session(session
);
716 auto fin
= new LambdaContext([log_session_status
= std::move(log_session_status
)](int r
){
718 log_session_status("ACCEPTED", "");
720 mdlog
->start_submit_entry(new ESession(m
->get_source_inst(), true, pv
, client_metadata
),
721 new C_MDS_session_finish(this, session
, sseq
, true, pv
, fin
));
726 case CEPH_SESSION_REQUEST_RENEWCAPS
:
727 if (session
->is_open() || session
->is_stale()) {
728 mds
->sessionmap
.touch_session(session
);
729 if (session
->is_stale()) {
730 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
731 mds
->locker
->resume_stale_caps(session
);
732 mds
->sessionmap
.touch_session(session
);
734 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_RENEWCAPS
, m
->get_seq());
735 mds
->send_message_client(reply
, session
);
737 dout(10) << "ignoring renewcaps on non open|stale session (" << session
->get_state_name() << ")" << dendl
;
741 case CEPH_SESSION_REQUEST_CLOSE
:
743 if (session
->is_closed() ||
744 session
->is_closing() ||
745 session
->is_killing()) {
746 dout(10) << "already closed|closing|killing, dropping this req" << dendl
;
749 if (session
->is_importing()) {
750 dout(10) << "ignoring close req on importing session" << dendl
;
753 ceph_assert(session
->is_open() ||
754 session
->is_stale() ||
755 session
->is_opening());
756 if (m
->get_seq() < session
->get_push_seq()) {
757 dout(10) << "old push seq " << m
->get_seq() << " < " << session
->get_push_seq()
758 << ", dropping" << dendl
;
761 // We are getting a seq that is higher than expected.
762 // Handle the same as any other seqn error.
764 if (m
->get_seq() != session
->get_push_seq()) {
765 dout(0) << "old push seq " << m
->get_seq() << " != " << session
->get_push_seq()
766 << ", BUGGY!" << dendl
;
767 mds
->clog
->warn() << "incorrect push seq " << m
->get_seq() << " != "
768 << session
->get_push_seq() << ", dropping" << " from client : " << session
->get_human_name();
771 journal_close_session(session
, Session::STATE_CLOSING
, NULL
);
775 case CEPH_SESSION_FLUSHMSG_ACK
:
776 finish_flush_session(session
, m
->get_seq());
779 case CEPH_SESSION_REQUEST_FLUSH_MDLOG
:
780 if (mds
->is_active())
// Ask one client to ack a FLUSHMSG: register a waiter on the session (tied
// to a gather sub) and send CEPH_SESSION_FLUSHMSG with the flush seq.
// Sessions that are not open, have no connection, or lack the
// EXPORT_PEER feature are skipped.
// NOTE(review): extraction gap — the early `return` of the skip branch is
// missing from this view.
789 void Server::flush_session(Session
*session
, MDSGatherBuilder
& gather
) {
790 if (!session
->is_open() ||
791 !session
->get_connection() ||
792 !session
->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER
)) {
796 version_t seq
= session
->wait_for_flush(gather
.new_sub());
797 mds
->send_message_client(
798 make_message
<MClientSession
>(CEPH_SESSION_FLUSHMSG
, seq
), session
);
801 void Server::flush_client_sessions(set
<client_t
>& client_set
, MDSGatherBuilder
& gather
)
803 for (const auto& client
: client_set
) {
804 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(client
.v
));
805 ceph_assert(session
);
806 flush_session(session
, gather
);
810 void Server::finish_flush_session(Session
*session
, version_t seq
)
812 MDSContext::vec finished
;
813 session
->finish_flush(seq
, finished
);
814 mds
->queue_waiters(finished
);
// Journal-commit continuation for session open/close (called from
// C_MDS_session_finish): applies the journaled ino free/purge sets,
// then either completes the open (send CEPH_SESSION_OPEN, advertise
// features) or tears the session down (revoke caps/leases, notify
// reconnect/reclaim gathers, send CLOSE or drop the connection).
// NOTE(review): extraction gaps — several `return`s, braces and a few
// statements are missing from this view.
817 void Server::_session_logged(Session
*session
, uint64_t state_seq
, bool open
, version_t pv
,
818 const interval_set
<inodeno_t
>& inos_to_free
, version_t piv
,
819 const interval_set
<inodeno_t
>& inos_to_purge
, LogSegment
*ls
)
821 dout(10) << "_session_logged " << session
->info
.inst
822 << " state_seq " << state_seq
823 << " " << (open
? "open":"close") << " " << pv
824 << " inos_to_free " << inos_to_free
<< " inotablev " << piv
825 << " inos_to_purge " << inos_to_purge
<< dendl
;
// Purge preallocated inodes the closing session will never use.
828 if (inos_to_purge
.size()){
830 session
->info
.prealloc_inos
.subtract(inos_to_purge
);
831 ls
->purging_inodes
.insert(inos_to_purge
);
832 if (mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping())
833 mdcache
->purge_inodes(inos_to_purge
, ls
);
// Return the remaining preallocated inos to the inotable.
836 if (inos_to_free
.size()) {
838 ceph_assert(session
->is_closing() || session
->is_killing() ||
839 session
->is_opening()); // re-open closing session
840 session
->info
.prealloc_inos
.subtract(inos_to_free
);
841 mds
->inotable
->apply_release_ids(inos_to_free
);
842 ceph_assert(mds
->inotable
->get_version() == piv
);
844 session
->free_prealloc_inos
= session
->info
.prealloc_inos
;
845 session
->delegated_inos
.clear();
848 mds
->sessionmap
.mark_dirty(session
);
// If the session changed state since we journaled, this commit is stale.
851 if (session
->get_state_seq() != state_seq
) {
852 dout(10) << " journaled state_seq " << state_seq
<< " != current " << session
->get_state_seq()
853 << ", noop" << dendl
;
854 // close must have been canceled (by an import?), or any number of other things..
// Open path: transition to OPEN, register with metrics, and ack the client.
856 ceph_assert(session
->is_opening());
857 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
858 mds
->sessionmap
.touch_session(session
);
859 metrics_handler
->add_session(session
);
860 ceph_assert(session
->get_connection());
861 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_OPEN
);
862 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
)) {
863 reply
->supported_features
= supported_features
;
864 reply
->metric_spec
= supported_metric_spec
;
866 mds
->send_message_client(reply
, session
);
867 if (mdcache
->is_readonly()) {
868 auto m
= make_message
<MClientSession
>(CEPH_SESSION_FORCE_RO
);
869 mds
->send_message_client(m
, session
);
871 } else if (session
->is_closing() ||
872 session
->is_killing()) {
873 // kill any lingering capabilities, leases, requests
874 bool killing
= session
->is_killing();
875 while (!session
->caps
.empty()) {
876 Capability
*cap
= session
->caps
.front();
877 CInode
*in
= cap
->get_inode();
878 dout(20) << " killing capability " << ccap_string(cap
->issued()) << " on " << *in
<< dendl
;
879 mds
->locker
->remove_client_cap(in
, cap
, killing
);
881 while (!session
->leases
.empty()) {
882 ClientLease
*r
= session
->leases
.front();
883 CDentry
*dn
= static_cast<CDentry
*>(r
->parent
);
884 dout(20) << " killing client lease of " << *dn
<< dendl
;
885 dn
->remove_client_lease(r
, mds
->locker
);
// Drop the client from the reconnect/reclaim gathers; finish a gather
// if this client was the last one pending.
887 if (client_reconnect_gather
.erase(session
->info
.get_client())) {
888 dout(20) << " removing client from reconnect set" << dendl
;
889 if (client_reconnect_gather
.empty()) {
890 dout(7) << " client " << session
->info
.inst
<< " was last reconnect, finishing" << dendl
;
891 reconnect_gather_finish();
894 if (client_reclaim_gather
.erase(session
->info
.get_client())) {
895 dout(20) << " removing client from reclaim set" << dendl
;
896 if (client_reclaim_gather
.empty()) {
897 dout(7) << " client " << session
->info
.inst
<< " was last reclaimed, finishing" << dendl
;
898 mds
->maybe_clientreplay_done();
902 if (session
->is_closing()) {
903 // mark con disposable. if there is a fault, we will get a
904 // reset and clean it up. if the client hasn't received the
905 // CLOSE message yet, they will reconnect and get an
906 // ms_handle_remote_reset() and realize they had in fact closed.
907 // do this *before* sending the message to avoid a possible
909 if (session
->get_connection()) {
910 // Conditional because terminate_sessions will indiscrimately
911 // put sessions in CLOSING whether they ever had a conn or not.
912 session
->get_connection()->mark_disposable();
916 mds
->send_message_client(make_message
<MClientSession
>(CEPH_SESSION_CLOSE
), session
);
917 mds
->sessionmap
.set_state(session
, Session::STATE_CLOSED
);
919 metrics_handler
->remove_session(session
);
920 mds
->sessionmap
.remove_session(session
);
921 } else if (session
->is_killing()) {
922 // destroy session, close connection
923 if (session
->get_connection()) {
924 session
->get_connection()->mark_down();
925 mds
->sessionmap
.set_state(session
, Session::STATE_CLOSED
);
926 session
->set_connection(nullptr);
928 metrics_handler
->remove_session(session
);
929 mds
->sessionmap
.remove_session(session
);
939 * Inject sessions from some source other than actual connections.
942 * - sessions inferred from journal replay
943 * - sessions learned from other MDSs during rejoin
944 * - sessions learned from other MDSs during dir/caps migration
945 * - sessions learned from other MDSs during a cross-MDS rename
947 version_t
Server::prepare_force_open_sessions(map
<client_t
,entity_inst_t
>& cm
,
948 map
<client_t
,client_metadata_t
>& cmm
,
949 map
<client_t
, pair
<Session
*,uint64_t> >& smap
)
951 version_t pv
= mds
->sessionmap
.get_projected();
953 dout(10) << "prepare_force_open_sessions " << pv
954 << " on " << cm
.size() << " clients"
957 mds
->objecter
->with_osdmap(
958 [this, &cm
, &cmm
](const OSDMap
&osd_map
) {
959 for (auto p
= cm
.begin(); p
!= cm
.end(); ) {
960 if (osd_map
.is_blocklisted(p
->second
.addr
)) {
961 dout(10) << " ignoring blocklisted client." << p
->first
962 << " (" << p
->second
.addr
<< ")" << dendl
;
971 for (map
<client_t
,entity_inst_t
>::iterator p
= cm
.begin(); p
!= cm
.end(); ++p
) {
972 Session
*session
= mds
->sessionmap
.get_or_add_session(p
->second
);
973 pv
= mds
->sessionmap
.mark_projected(session
);
975 if (session
->is_closed() ||
976 session
->is_closing() ||
977 session
->is_killing()) {
978 sseq
= mds
->sessionmap
.set_state(session
, Session::STATE_OPENING
);
979 auto q
= cmm
.find(p
->first
);
981 session
->info
.client_metadata
.merge(q
->second
);
983 ceph_assert(session
->is_open() ||
984 session
->is_opening() ||
985 session
->is_stale());
988 smap
[p
->first
] = make_pair(session
, sseq
);
989 session
->inc_importing();
994 void Server::finish_force_open_sessions(const map
<client_t
,pair
<Session
*,uint64_t> >& smap
,
998 * FIXME: need to carefully consider the race conditions between a
999 * client trying to close a session and an MDS doing an import
1000 * trying to force open a session...
1002 dout(10) << "finish_force_open_sessions on " << smap
.size() << " clients,"
1003 << " initial v " << mds
->sessionmap
.get_version() << dendl
;
1005 for (auto &it
: smap
) {
1006 Session
*session
= it
.second
.first
;
1007 uint64_t sseq
= it
.second
.second
;
1009 if (session
->get_state_seq() != sseq
) {
1010 dout(10) << "force_open_sessions skipping changed " << session
->info
.inst
<< dendl
;
1012 dout(10) << "force_open_sessions opened " << session
->info
.inst
<< dendl
;
1013 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
1014 mds
->sessionmap
.touch_session(session
);
1015 metrics_handler
->add_session(session
);
1017 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_OPEN
);
1018 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
)) {
1019 reply
->supported_features
= supported_features
;
1020 reply
->metric_spec
= supported_metric_spec
;
1022 mds
->send_message_client(reply
, session
);
1024 if (mdcache
->is_readonly())
1025 mds
->send_message_client(make_message
<MClientSession
>(CEPH_SESSION_FORCE_RO
), session
);
1028 dout(10) << "force_open_sessions skipping already-open " << session
->info
.inst
<< dendl
;
1029 ceph_assert(session
->is_open() || session
->is_stale());
1033 session
->dec_importing();
1036 mds
->sessionmap
.mark_dirty(session
);
1039 dout(10) << __func__
<< ": final v " << mds
->sessionmap
.get_version() << dendl
;
1042 class C_MDS_TerminatedSessions
: public ServerContext
{
1043 void finish(int r
) override
{
1044 server
->terminating_sessions
= false;
1047 explicit C_MDS_TerminatedSessions(Server
*s
) : ServerContext(s
) {}
1050 void Server::terminate_sessions()
1052 dout(5) << "terminating all sessions..." << dendl
;
1054 terminating_sessions
= true;
1056 // kill them off. clients will retry etc.
1057 set
<Session
*> sessions
;
1058 mds
->sessionmap
.get_client_session_set(sessions
);
1059 for (set
<Session
*>::const_iterator p
= sessions
.begin();
1060 p
!= sessions
.end();
1062 Session
*session
= *p
;
1063 if (session
->is_closing() ||
1064 session
->is_killing() ||
1065 session
->is_closed())
1067 journal_close_session(session
, Session::STATE_CLOSING
, NULL
);
1070 mdlog
->wait_for_safe(new C_MDS_TerminatedSessions(this));
1074 void Server::find_idle_sessions()
1076 auto now
= clock::now();
1077 auto last_cleared_laggy
= mds
->last_cleared_laggy();
1079 dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy
<< "s ago" << dendl
;
1082 // (caps go stale, lease die)
1083 double queue_max_age
= mds
->get_dispatch_queue_max_age(ceph_clock_now());
1084 double cutoff
= queue_max_age
+ mds
->mdsmap
->get_session_timeout();
1086 // don't kick clients if we've been laggy
1087 if (last_cleared_laggy
< cutoff
) {
1088 dout(10) << " last cleared laggy " << last_cleared_laggy
<< "s ago (< cutoff " << cutoff
1089 << "), not marking any client stale" << dendl
;
1093 std::vector
<Session
*> to_evict
;
1095 bool defer_session_stale
= g_conf().get_val
<bool>("mds_defer_session_stale");
1096 const auto sessions_p1
= mds
->sessionmap
.by_state
.find(Session::STATE_OPEN
);
1097 if (sessions_p1
!= mds
->sessionmap
.by_state
.end() && !sessions_p1
->second
->empty()) {
1098 std::vector
<Session
*> new_stale
;
1100 for (auto session
: *(sessions_p1
->second
)) {
1101 auto last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_cap_renew
).count();
1102 if (last_cap_renew_span
< cutoff
) {
1103 dout(20) << "laggiest active session is " << session
->info
.inst
1104 << " and renewed caps recently (" << last_cap_renew_span
<< "s ago)" << dendl
;
1108 if (session
->last_seen
> session
->last_cap_renew
) {
1109 last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_seen
).count();
1110 if (last_cap_renew_span
< cutoff
) {
1111 dout(20) << "laggiest active session is " << session
->info
.inst
1112 << " and renewed caps recently (" << last_cap_renew_span
<< "s ago)" << dendl
;
1117 if (last_cap_renew_span
>= mds
->mdsmap
->get_session_autoclose()) {
1118 dout(20) << "evicting session " << session
->info
.inst
<< " since autoclose "
1119 "has arrived" << dendl
;
1120 // evict session without marking it stale
1121 to_evict
.push_back(session
);
1125 if (defer_session_stale
&&
1126 !session
->is_any_flush_waiter() &&
1127 !mds
->locker
->is_revoking_any_caps_from(session
->get_client())) {
1128 dout(20) << "deferring marking session " << session
->info
.inst
<< " stale "
1129 "since it holds no caps" << dendl
;
1133 auto it
= session
->info
.client_metadata
.find("timeout");
1134 if (it
!= session
->info
.client_metadata
.end()) {
1135 unsigned timeout
= strtoul(it
->second
.c_str(), nullptr, 0);
1137 dout(10) << "skipping session " << session
->info
.inst
1138 << ", infinite timeout specified" << dendl
;
1141 double cutoff
= queue_max_age
+ timeout
;
1142 if (last_cap_renew_span
< cutoff
) {
1143 dout(10) << "skipping session " << session
->info
.inst
1144 << ", timeout (" << timeout
<< ") specified"
1145 << " and renewed caps recently (" << last_cap_renew_span
<< "s ago)" << dendl
;
1149 // do not go through stale, evict it directly.
1150 to_evict
.push_back(session
);
1152 dout(10) << "new stale session " << session
->info
.inst
1153 << " last renewed caps " << last_cap_renew_span
<< "s ago" << dendl
;
1154 new_stale
.push_back(session
);
1158 for (auto session
: new_stale
) {
1159 mds
->sessionmap
.set_state(session
, Session::STATE_STALE
);
1160 if (mds
->locker
->revoke_stale_caps(session
)) {
1161 mds
->locker
->remove_stale_leases(session
);
1162 finish_flush_session(session
, session
->get_push_seq());
1163 auto m
= make_message
<MClientSession
>(CEPH_SESSION_STALE
, session
->get_push_seq());
1164 mds
->send_message_client(m
, session
);
1166 to_evict
.push_back(session
);
1172 cutoff
= queue_max_age
+ mds
->mdsmap
->get_session_autoclose();
1174 // Collect a list of sessions exceeding the autoclose threshold
1175 const auto sessions_p2
= mds
->sessionmap
.by_state
.find(Session::STATE_STALE
);
1176 if (sessions_p2
!= mds
->sessionmap
.by_state
.end() && !sessions_p2
->second
->empty()) {
1177 for (auto session
: *(sessions_p2
->second
)) {
1178 ceph_assert(session
->is_stale());
1179 auto last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_cap_renew
).count();
1180 if (last_cap_renew_span
< cutoff
) {
1181 dout(20) << "oldest stale session is " << session
->info
.inst
1182 << " and recently renewed caps " << last_cap_renew_span
<< "s ago" << dendl
;
1185 to_evict
.push_back(session
);
1189 for (auto session
: to_evict
) {
1190 if (session
->is_importing()) {
1191 dout(10) << "skipping session " << session
->info
.inst
<< ", it's being imported" << dendl
;
1195 auto last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_cap_renew
).count();
1196 mds
->clog
->warn() << "evicting unresponsive client " << *session
1197 << ", after " << last_cap_renew_span
<< " seconds";
1198 dout(10) << "autoclosing stale session " << session
->info
.inst
1199 << " last renewed caps " << last_cap_renew_span
<< "s ago" << dendl
;
1201 if (g_conf()->mds_session_blocklist_on_timeout
) {
1202 CachedStackStringStream css
;
1203 mds
->evict_client(session
->get_client().v
, false, true, *css
, nullptr);
1205 kill_session(session
, NULL
);
1210 void Server::evict_cap_revoke_non_responders() {
1211 if (!cap_revoke_eviction_timeout
) {
1215 auto&& to_evict
= mds
->locker
->get_late_revoking_clients(cap_revoke_eviction_timeout
);
1217 for (auto const &client
: to_evict
) {
1218 mds
->clog
->warn() << "client id " << client
<< " has not responded to"
1219 << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
1220 << " seconds, evicting";
1221 dout(1) << __func__
<< ": evicting cap revoke non-responder client id "
1224 CachedStackStringStream css
;
1225 bool evicted
= mds
->evict_client(client
.v
, false,
1226 g_conf()->mds_session_blocklist_on_evict
,
1228 if (evicted
&& logger
) {
1229 logger
->inc(l_mdss_cap_revoke_eviction
);
1234 void Server::handle_conf_change(const std::set
<std::string
>& changed
) {
1235 if (changed
.count("mds_forward_all_requests_to_auth")){
1236 forward_all_requests_to_auth
= g_conf().get_val
<bool>("mds_forward_all_requests_to_auth");
1238 if (changed
.count("mds_cap_revoke_eviction_timeout")) {
1239 cap_revoke_eviction_timeout
= g_conf().get_val
<double>("mds_cap_revoke_eviction_timeout");
1240 dout(20) << __func__
<< " cap revoke eviction timeout changed to "
1241 << cap_revoke_eviction_timeout
<< dendl
;
1243 if (changed
.count("mds_recall_max_decay_rate")) {
1244 recall_throttle
= DecayCounter(g_conf().get_val
<double>("mds_recall_max_decay_rate"));
1246 if (changed
.count("mds_max_snaps_per_dir")) {
1247 max_snaps_per_dir
= g_conf().get_val
<uint64_t>("mds_max_snaps_per_dir");
1248 dout(20) << __func__
<< " max snapshots per directory changed to "
1249 << max_snaps_per_dir
<< dendl
;
1251 if (changed
.count("mds_client_delegate_inos_pct")) {
1252 delegate_inos_pct
= g_conf().get_val
<uint64_t>("mds_client_delegate_inos_pct");
1254 if (changed
.count("mds_max_caps_per_client")) {
1255 max_caps_per_client
= g_conf().get_val
<uint64_t>("mds_max_caps_per_client");
1257 if (changed
.count("mds_session_cap_acquisition_throttle")) {
1258 cap_acquisition_throttle
= g_conf().get_val
<uint64_t>("mds_session_cap_acquisition_throttle");
1260 if (changed
.count("mds_session_max_caps_throttle_ratio")) {
1261 max_caps_throttle_ratio
= g_conf().get_val
<double>("mds_session_max_caps_throttle_ratio");
1263 if (changed
.count("mds_cap_acquisition_throttle_retry_request_timeout")) {
1264 caps_throttle_retry_request_timeout
= g_conf().get_val
<double>("mds_cap_acquisition_throttle_retry_request_timeout");
1266 if (changed
.count("mds_alternate_name_max")) {
1267 alternate_name_max
= g_conf().get_val
<Option::size_t>("mds_alternate_name_max");
1269 if (changed
.count("mds_dir_max_entries")) {
1270 dir_max_entries
= g_conf().get_val
<uint64_t>("mds_dir_max_entries");
1271 dout(20) << __func__
<< " max entries per directory changed to "
1272 << dir_max_entries
<< dendl
;
1274 if (changed
.count("mds_bal_fragment_size_max")) {
1275 bal_fragment_size_max
= g_conf().get_val
<int64_t>("mds_bal_fragment_size_max");
1276 dout(20) << __func__
<< " max fragment size changed to "
1277 << bal_fragment_size_max
<< dendl
;
1282 * XXX bump in the interface here, not using an MDSContext here
1283 * because all the callers right now happen to use a SaferCond
1285 void Server::kill_session(Session
*session
, Context
*on_safe
)
1287 ceph_assert(ceph_mutex_is_locked_by_me(mds
->mds_lock
));
1289 if ((session
->is_opening() ||
1290 session
->is_open() ||
1291 session
->is_stale()) &&
1292 !session
->is_importing()) {
1293 dout(10) << "kill_session " << session
<< dendl
;
1294 journal_close_session(session
, Session::STATE_KILLING
, on_safe
);
1296 dout(10) << "kill_session importing or already closing/killing " << session
<< dendl
;
1297 if (session
->is_closing() ||
1298 session
->is_killing()) {
1300 mdlog
->wait_for_safe(new MDSInternalContextWrapper(mds
, on_safe
));
1302 ceph_assert(session
->is_closed() ||
1303 session
->is_importing());
1305 on_safe
->complete(0);
1310 size_t Server::apply_blocklist()
1312 std::vector
<Session
*> victims
;
1313 const auto& sessions
= mds
->sessionmap
.get_sessions();
1314 mds
->objecter
->with_osdmap(
1315 [&](const OSDMap
& o
) {
1316 for (const auto& p
: sessions
) {
1317 if (!p
.first
.is_client()) {
1318 // Do not apply OSDMap blocklist to MDS daemons, we find out
1319 // about their death via MDSMap.
1322 if (o
.is_blocklisted(p
.second
->info
.inst
.addr
)) {
1323 victims
.push_back(p
.second
);
1328 for (const auto& s
: victims
) {
1329 kill_session(s
, nullptr);
1332 dout(10) << "apply_blocklist: killed " << victims
.size() << dendl
;
1334 return victims
.size();
1337 void Server::journal_close_session(Session
*session
, int state
, Context
*on_safe
)
1339 dout(10) << __func__
<< " : "
1340 << session
->info
.inst
1341 << " pending_prealloc_inos " << session
->pending_prealloc_inos
1342 << " free_prealloc_inos " << session
->free_prealloc_inos
1343 << " delegated_inos " << session
->delegated_inos
<< dendl
;
1345 uint64_t sseq
= mds
->sessionmap
.set_state(session
, state
);
1346 version_t pv
= mds
->sessionmap
.mark_projected(session
);
1349 // release alloc and pending-alloc inos for this session
1350 // and wipe out session state, in case the session close aborts for some reason
1351 interval_set
<inodeno_t
> inos_to_free
;
1352 inos_to_free
.insert(session
->pending_prealloc_inos
);
1353 inos_to_free
.insert(session
->free_prealloc_inos
);
1354 if (inos_to_free
.size()) {
1355 mds
->inotable
->project_release_ids(inos_to_free
);
1356 piv
= mds
->inotable
->get_projected_version();
1360 auto le
= new ESession(session
->info
.inst
, false, pv
, inos_to_free
, piv
, session
->delegated_inos
);
1361 auto fin
= new C_MDS_session_finish(this, session
, sseq
, false, pv
, inos_to_free
, piv
,
1362 session
->delegated_inos
, mdlog
->get_current_segment(), on_safe
);
1363 mdlog
->start_submit_entry(le
, fin
);
1366 // clean up requests, too
1367 while(!session
->requests
.empty()) {
1368 auto mdr
= MDRequestRef(*session
->requests
.begin());
1369 mdcache
->request_kill(mdr
);
1372 finish_flush_session(session
, session
->get_push_seq());
1375 void Server::reconnect_clients(MDSContext
*reconnect_done_
)
1377 reconnect_done
= reconnect_done_
;
1379 auto now
= clock::now();
1380 set
<Session
*> sessions
;
1381 mds
->sessionmap
.get_client_session_set(sessions
);
1382 for (auto session
: sessions
) {
1383 if (session
->is_open()) {
1384 client_reconnect_gather
.insert(session
->get_client());
1385 session
->set_reconnecting(true);
1386 session
->last_cap_renew
= now
;
1390 if (client_reconnect_gather
.empty()) {
1391 dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl
;
1392 reconnect_gather_finish();
1396 // clients will get the mdsmap and discover we're reconnecting via the monitor.
1398 reconnect_start
= now
;
1399 dout(1) << "reconnect_clients -- " << client_reconnect_gather
.size() << " sessions" << dendl
;
1400 mds
->sessionmap
.dump();
1403 void Server::handle_client_reconnect(const cref_t
<MClientReconnect
> &m
)
1405 dout(7) << "handle_client_reconnect " << m
->get_source()
1406 << (m
->has_more() ? " (more)" : "") << dendl
;
1407 client_t from
= m
->get_source().num();
1408 Session
*session
= mds
->get_session(m
);
1410 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
1411 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
1412 reply
->metadata
["error_string"] = "sessionless";
1413 mds
->send_message(reply
, m
->get_connection());
1417 if (!session
->is_open()) {
1418 dout(0) << " ignoring msg from not-open session" << *m
<< dendl
;
1419 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_CLOSE
);
1420 mds
->send_message(reply
, m
->get_connection());
1424 bool reconnect_all_deny
= g_conf().get_val
<bool>("mds_deny_all_reconnect");
1426 if (!mds
->is_reconnect() && mds
->get_want_state() == CEPH_MDS_STATE_RECONNECT
) {
1427 dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl
;
1428 mds
->wait_for_reconnect(new C_MDS_RetryMessage(mds
, m
));
1432 auto delay
= std::chrono::duration
<double>(clock::now() - reconnect_start
).count();
1433 dout(10) << " reconnect_start " << reconnect_start
<< " delay " << delay
<< dendl
;
1436 if (reconnect_all_deny
|| !mds
->is_reconnect() || mds
->get_want_state() != CEPH_MDS_STATE_RECONNECT
|| reconnect_evicting
) {
1437 // XXX maybe in the future we can do better than this?
1438 if (reconnect_all_deny
) {
1439 dout(1) << "mds_deny_all_reconnect was set to speed up reboot phase, ignoring reconnect, sending close" << dendl
;
1441 dout(1) << "no longer in reconnect state, ignoring reconnect, sending close" << dendl
;
1443 mds
->clog
->info() << "denied reconnect attempt (mds is "
1444 << ceph_mds_state_name(mds
->get_state())
1445 << ") from " << m
->get_source_inst()
1446 << " after " << delay
<< " (allowed interval " << g_conf()->mds_reconnect_timeout
<< ")";
1449 std::string error_str
;
1450 if (!session
->is_open()) {
1451 error_str
= "session is closed";
1452 } else if (mdcache
->is_readonly()) {
1453 error_str
= "mds is readonly";
1455 if (session
->info
.client_metadata
.features
.empty())
1456 infer_supported_features(session
, session
->info
.client_metadata
);
1458 feature_bitset_t missing_features
= required_client_features
;
1459 missing_features
-= session
->info
.client_metadata
.features
;
1460 if (!missing_features
.empty()) {
1461 CachedStackStringStream css
;
1462 *css
<< "missing required features '" << missing_features
<< "'";
1463 error_str
= css
->strv();
1467 if (!error_str
.empty()) {
1469 dout(1) << " " << error_str
<< ", ignoring reconnect, sending close" << dendl
;
1470 mds
->clog
->info() << "denied reconnect attempt from "
1471 << m
->get_source_inst() << " (" << error_str
<< ")";
1476 auto r
= make_message
<MClientSession
>(CEPH_SESSION_CLOSE
);
1477 mds
->send_message_client(r
, session
);
1478 if (session
->is_open()) {
1479 client_reconnect_denied
.insert(session
->get_client());
1484 if (!m
->has_more()) {
1485 metrics_handler
->add_session(session
);
1486 // notify client of success with an OPEN
1487 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_OPEN
);
1488 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
)) {
1489 reply
->supported_features
= supported_features
;
1490 reply
->metric_spec
= supported_metric_spec
;
1492 mds
->send_message_client(reply
, session
);
1493 mds
->clog
->debug() << "reconnect by " << session
->info
.inst
<< " after " << delay
;
1496 session
->last_cap_renew
= clock::now();
1499 for (const auto &r
: m
->realms
) {
1500 CInode
*in
= mdcache
->get_inode(inodeno_t(r
.realm
.ino
));
1501 if (in
&& in
->state_test(CInode::STATE_PURGING
))
1504 if (in
->snaprealm
) {
1505 dout(15) << "open snaprealm (w inode) on " << *in
<< dendl
;
1507 // this can happen if we are non-auth or we rollback snaprealm
1508 dout(15) << "open snaprealm (null snaprealm) on " << *in
<< dendl
;
1510 mdcache
->add_reconnected_snaprealm(from
, inodeno_t(r
.realm
.ino
), snapid_t(r
.realm
.seq
));
1512 dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r
.realm
.ino
)
1513 << " seq " << r
.realm
.seq
<< dendl
;
1514 mdcache
->add_reconnected_snaprealm(from
, inodeno_t(r
.realm
.ino
), snapid_t(r
.realm
.seq
));
1519 for (const auto &p
: m
->caps
) {
1520 // make sure our last_cap_id is MAX over all issued caps
1521 if (p
.second
.capinfo
.cap_id
> mdcache
->last_cap_id
)
1522 mdcache
->last_cap_id
= p
.second
.capinfo
.cap_id
;
1524 CInode
*in
= mdcache
->get_inode(p
.first
);
1525 if (in
&& in
->state_test(CInode::STATE_PURGING
))
1527 if (in
&& in
->is_auth()) {
1528 // we recovered it, and it's ours. take note.
1529 dout(15) << "open cap realm " << inodeno_t(p
.second
.capinfo
.snaprealm
)
1530 << " on " << *in
<< dendl
;
1531 in
->reconnect_cap(from
, p
.second
, session
);
1532 mdcache
->add_reconnected_cap(from
, p
.first
, p
.second
);
1533 recover_filelocks(in
, p
.second
.flockbl
, m
->get_orig_source().num());
1537 if (in
&& !in
->is_auth()) {
1539 dout(10) << "non-auth " << *in
<< ", will pass off to authority" << dendl
;
1540 // add to cap export list.
1541 mdcache
->rejoin_export_caps(p
.first
, from
, p
.second
,
1542 in
->authority().first
, true);
1544 // don't know if the inode is mine
1545 dout(10) << "missing ino " << p
.first
<< ", will load later" << dendl
;
1546 mdcache
->rejoin_recovered_caps(p
.first
, from
, p
.second
, MDS_RANK_NONE
);
1550 reconnect_last_seen
= clock::now();
1552 if (!m
->has_more()) {
1553 mdcache
->rejoin_recovered_client(session
->get_client(), session
->info
.inst
);
1555 // remove from gather set
1556 client_reconnect_gather
.erase(from
);
1557 session
->set_reconnecting(false);
1558 if (client_reconnect_gather
.empty())
1559 reconnect_gather_finish();
1563 void Server::infer_supported_features(Session
*session
, client_metadata_t
& client_metadata
)
1566 auto it
= client_metadata
.find("ceph_version");
1567 if (it
!= client_metadata
.end()) {
1568 // user space client
1569 if (it
->second
.compare(0, 16, "ceph version 12.") == 0)
1570 supported
= CEPHFS_FEATURE_LUMINOUS
;
1571 else if (session
->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR
))
1572 supported
= CEPHFS_FEATURE_KRAKEN
;
1574 it
= client_metadata
.find("kernel_version");
1575 if (it
!= client_metadata
.end()) {
1577 if (session
->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING
))
1578 supported
= CEPHFS_FEATURE_LUMINOUS
;
1581 if (supported
== -1 &&
1582 session
->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2
))
1583 supported
= CEPHFS_FEATURE_JEWEL
;
1585 if (supported
>= 0) {
1586 unsigned long value
= (1UL << (supported
+ 1)) - 1;
1587 client_metadata
.features
= feature_bitset_t(value
);
1588 dout(10) << __func__
<< " got '" << client_metadata
.features
<< "'" << dendl
;
1592 void Server::update_required_client_features()
1594 required_client_features
= mds
->mdsmap
->get_required_client_features();
1595 dout(7) << "required_client_features: " << required_client_features
<< dendl
;
1597 if (mds
->get_state() >= MDSMap::STATE_RECONNECT
) {
1598 set
<Session
*> sessions
;
1599 mds
->sessionmap
.get_client_session_set(sessions
);
1600 for (auto session
: sessions
) {
1601 feature_bitset_t missing_features
= required_client_features
;
1602 missing_features
-= session
->info
.client_metadata
.features
;
1603 if (!missing_features
.empty()) {
1604 bool blocklisted
= mds
->objecter
->with_osdmap(
1605 [session
](const OSDMap
&osd_map
) -> bool {
1606 return osd_map
.is_blocklisted(session
->info
.inst
.addr
);
1611 mds
->clog
->warn() << "evicting session " << *session
<< ", missing required features '"
1612 << missing_features
<< "'";
1613 CachedStackStringStream css
;
1614 mds
->evict_client(session
->get_client().v
, false,
1615 g_conf()->mds_session_blocklist_on_evict
, *css
);
1621 void Server::reconnect_gather_finish()
1623 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects
<< " clients" << dendl
;
1624 ceph_assert(reconnect_done
);
1626 if (!mds
->snapclient
->is_synced()) {
1627 // make sure snaptable cache is populated. snaprealms will be
1628 // extensively used in rejoin stage.
1629 dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl
;
1630 mds
->snapclient
->wait_for_sync(reconnect_done
);
1632 reconnect_done
->complete(0);
1634 reconnect_done
= NULL
;
1637 void Server::reconnect_tick()
1639 bool reject_all_reconnect
= false;
1640 if (reconnect_evicting
) {
1641 dout(7) << "reconnect_tick: waiting for evictions" << dendl
;
1646 * Set mds_deny_all_reconnect to reject all the reconnect req ,
1647 * then load less meta information in rejoin phase. This will shorten reboot time.
1648 * Moreover, loading less meta increases the chance standby with less memory can failover.
1650 * Why not shorten reconnect period?
1651 * Clients may send unsafe or retry requests, which haven't been
1652 * completed before old mds stop, to new mds. These requests may
1653 * need to be processed during new mds's clientreplay phase,
1654 * see: #https://github.com/ceph/ceph/pull/29059.
1656 bool reconnect_all_deny
= g_conf().get_val
<bool>("mds_deny_all_reconnect");
1657 if (client_reconnect_gather
.empty())
1660 if (reconnect_all_deny
&& (client_reconnect_gather
== client_reconnect_denied
))
1661 reject_all_reconnect
= true;
1663 auto now
= clock::now();
1664 auto elapse1
= std::chrono::duration
<double>(now
- reconnect_start
).count();
1665 if (elapse1
< g_conf()->mds_reconnect_timeout
&& !reject_all_reconnect
)
1668 vector
<Session
*> remaining_sessions
;
1669 remaining_sessions
.reserve(client_reconnect_gather
.size());
1670 for (auto c
: client_reconnect_gather
) {
1671 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(c
.v
));
1672 ceph_assert(session
);
1673 remaining_sessions
.push_back(session
);
1674 // client re-sends cap flush messages before the reconnect message
1675 if (session
->last_seen
> reconnect_last_seen
)
1676 reconnect_last_seen
= session
->last_seen
;
1679 auto elapse2
= std::chrono::duration
<double>(now
- reconnect_last_seen
).count();
1680 if (elapse2
< g_conf()->mds_reconnect_timeout
/ 2 && !reject_all_reconnect
) {
1681 dout(7) << "reconnect_tick: last seen " << elapse2
1682 << " seconds ago, extending reconnect interval" << dendl
;
1686 dout(7) << "reconnect timed out, " << remaining_sessions
.size()
1687 << " clients have not reconnected in time" << dendl
;
1689 // If we're doing blocklist evictions, use this to wait for them before
1690 // proceeding to reconnect_gather_finish
1691 MDSGatherBuilder
gather(g_ceph_context
);
1693 for (auto session
: remaining_sessions
) {
1694 // Keep sessions that have specified timeout. These sessions will prevent
1695 // mds from going to active. MDS goes to active after they all have been
1696 // killed or reclaimed.
1697 if (session
->info
.client_metadata
.find("timeout") !=
1698 session
->info
.client_metadata
.end()) {
1699 dout(1) << "reconnect keeps " << session
->info
.inst
1700 << ", need to be reclaimed" << dendl
;
1701 client_reclaim_gather
.insert(session
->get_client());
1705 dout(1) << "reconnect gives up on " << session
->info
.inst
<< dendl
;
1707 mds
->clog
->warn() << "evicting unresponsive client " << *session
1708 << ", after waiting " << elapse1
1709 << " seconds during MDS startup";
1711 // make _session_logged() purge orphan objects of lost async/unsafe requests
1712 session
->delegated_inos
.swap(session
->free_prealloc_inos
);
1714 if (g_conf()->mds_session_blocklist_on_timeout
) {
1715 CachedStackStringStream css
;
1716 mds
->evict_client(session
->get_client().v
, false, true, *css
,
1719 kill_session(session
, NULL
);
1722 failed_reconnects
++;
1724 client_reconnect_gather
.clear();
1725 client_reconnect_denied
.clear();
1727 if (gather
.has_subs()) {
1728 dout(1) << "reconnect will complete once clients are evicted" << dendl
;
1729 gather
.set_finisher(new MDSInternalContextWrapper(mds
, new LambdaContext(
1730 [this](int r
){reconnect_gather_finish();})));
1732 reconnect_evicting
= true;
1734 reconnect_gather_finish();
1738 void Server::recover_filelocks(CInode
*in
, bufferlist locks
, int64_t client
)
1740 if (!locks
.length()) return;
1743 auto p
= locks
.cbegin();
1744 decode(numlocks
, p
);
1745 for (int i
= 0; i
< numlocks
; ++i
) {
1747 lock
.client
= client
;
1748 in
->get_fcntl_lock_state()->held_locks
.insert(pair
<uint64_t, ceph_filelock
>(lock
.start
, lock
));
1749 ++in
->get_fcntl_lock_state()->client_held_lock_counts
[client
];
1751 decode(numlocks
, p
);
1752 for (int i
= 0; i
< numlocks
; ++i
) {
1754 lock
.client
= client
;
1755 in
->get_flock_lock_state()->held_locks
.insert(pair
<uint64_t, ceph_filelock
> (lock
.start
, lock
));
1756 ++in
->get_flock_lock_state()->client_held_lock_counts
[client
];
1761 * Call this when the MDCache is oversized, to send requests to the clients
1762 * to trim some caps, and consequently unpin some inodes in the MDCache so
1763 * that it can trim too.
1765 std::pair
<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder
* gather
, RecallFlags flags
)
1767 const auto now
= clock::now();
1768 const bool steady
= !!(flags
&RecallFlags::STEADY
);
1769 const bool enforce_max
= !!(flags
&RecallFlags::ENFORCE_MAX
);
1770 const bool enforce_liveness
= !!(flags
&RecallFlags::ENFORCE_LIVENESS
);
1771 const bool trim
= !!(flags
&RecallFlags::TRIM
);
1773 const auto max_caps_per_client
= g_conf().get_val
<uint64_t>("mds_max_caps_per_client");
1774 const auto min_caps_per_client
= g_conf().get_val
<uint64_t>("mds_min_caps_per_client");
1775 const auto recall_global_max_decay_threshold
= g_conf().get_val
<Option::size_t>("mds_recall_global_max_decay_threshold");
1776 const auto recall_max_caps
= g_conf().get_val
<Option::size_t>("mds_recall_max_caps");
1777 const auto recall_max_decay_threshold
= g_conf().get_val
<Option::size_t>("mds_recall_max_decay_threshold");
1778 const auto cache_liveness_magnitude
= g_conf().get_val
<Option::size_t>("mds_session_cache_liveness_magnitude");
1780 dout(7) << __func__
<< ":"
1781 << " min=" << min_caps_per_client
1782 << " max=" << max_caps_per_client
1783 << " total=" << Capability::count()
1784 << " flags=" << flags
1787 /* trim caps of sessions with the most caps first */
1788 std::multimap
<uint64_t, Session
*> caps_session
;
1789 auto f
= [&caps_session
, enforce_max
, enforce_liveness
, trim
, max_caps_per_client
, cache_liveness_magnitude
](auto& s
) {
1790 auto num_caps
= s
->caps
.size();
1791 auto cache_liveness
= s
->get_session_cache_liveness();
1792 if (trim
|| (enforce_max
&& num_caps
> max_caps_per_client
) || (enforce_liveness
&& cache_liveness
< (num_caps
>>cache_liveness_magnitude
))) {
1793 caps_session
.emplace(std::piecewise_construct
, std::forward_as_tuple(num_caps
), std::forward_as_tuple(s
));
1796 mds
->sessionmap
.get_client_sessions(std::move(f
));
1798 std::pair
<bool, uint64_t> result
= {false, 0};
1799 auto& [throttled
, caps_recalled
] = result
;
1800 last_recall_state
= now
;
1801 for (const auto& [num_caps
, session
] : boost::adaptors::reverse(caps_session
)) {
1802 if (!session
->is_open() ||
1803 !session
->get_connection() ||
1804 !session
->info
.inst
.name
.is_client())
1807 dout(10) << __func__
<< ":"
1808 << " session " << session
->info
.inst
1809 << " caps " << num_caps
1810 << ", leases " << session
->leases
.size()
1814 if (num_caps
< recall_max_caps
|| (num_caps
-recall_max_caps
) < min_caps_per_client
) {
1815 newlim
= min_caps_per_client
;
1817 newlim
= num_caps
-recall_max_caps
;
1819 if (num_caps
> newlim
) {
1820 /* now limit the number of caps we recall at a time to prevent overloading ourselves */
1821 uint64_t recall
= std::min
<uint64_t>(recall_max_caps
, num_caps
-newlim
);
1822 newlim
= num_caps
-recall
;
1823 const uint64_t session_recall_throttle
= session
->get_recall_caps_throttle();
1824 const uint64_t session_recall_throttle2o
= session
->get_recall_caps_throttle2o();
1825 const uint64_t global_recall_throttle
= recall_throttle
.get();
1826 if (session_recall_throttle
+recall
> recall_max_decay_threshold
) {
1827 dout(15) << " session recall threshold (" << recall_max_decay_threshold
<< ") hit at " << session_recall_throttle
<< "; skipping!" << dendl
;
1830 } else if (session_recall_throttle2o
+recall
> recall_max_caps
*2) {
1831 dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps
<< ") hit at " << session_recall_throttle2o
<< "; skipping!" << dendl
;
1834 } else if (global_recall_throttle
+recall
> recall_global_max_decay_threshold
) {
1835 dout(15) << " global recall threshold (" << recall_global_max_decay_threshold
<< ") hit at " << global_recall_throttle
<< "; skipping!" << dendl
;
1840 // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
1842 const auto session_recall
= session
->get_recall_caps();
1843 const auto session_release
= session
->get_release_caps();
1844 if (2*session_release
< session_recall
&& 2*session_recall
> recall_max_decay_threshold
) {
1845 /* The session has been unable to keep up with the number of caps
1846 * recalled (by half); additionally, to prevent marking sessions
1847 * we've just begun to recall from, the session_recall counter
1848 * (decayed count of caps recently recalled) is **greater** than the
1849 * session threshold for the session's cap recall throttle.
1851 dout(15) << " 2*session_release < session_recall"
1852 " (2*" << session_release
<< " < " << session_recall
<< ") &&"
1853 " 2*session_recall < recall_max_decay_threshold"
1854 " (2*" << session_recall
<< " > " << recall_max_decay_threshold
<< ")"
1855 " Skipping because we are unlikely to get more released." << dendl
;
1857 } else if (recall
< recall_max_caps
&& 2*recall
< session_recall
) {
1858 /* The number of caps recalled is less than the number we *could*
1859 * recall (so there isn't much left to recall?) and the number of
1860 * caps is less than the current recall_caps counter (decayed count
1861 * of caps recently recalled).
1863 dout(15) << " 2*recall < session_recall "
1864 " (2*" << recall
<< " < " << session_recall
<< ") &&"
1865 " recall < recall_max_caps (" << recall
<< " < " << recall_max_caps
<< ");"
1866 " Skipping because we are unlikely to get more released." << dendl
;
1871 dout(7) << " recalling " << recall
<< " caps; session_recall_throttle = " << session_recall_throttle
<< "; global_recall_throttle = " << global_recall_throttle
<< dendl
;
1873 auto m
= make_message
<MClientSession
>(CEPH_SESSION_RECALL_STATE
);
1874 m
->head
.max_caps
= newlim
;
1875 mds
->send_message_client(m
, session
);
1877 flush_session(session
, *gather
);
1879 caps_recalled
+= session
->notify_recall_sent(newlim
);
1880 recall_throttle
.hit(recall
);
1884 dout(7) << "recalled" << (throttled
? " (throttled)" : "") << " " << caps_recalled
<< " client caps." << dendl
;
// Force every open (or stale) client session into read-only mode by sending
// each one a CEPH_SESSION_FORCE_RO message.  Non-client sessions and sessions
// in other states are skipped.
// NOTE(review): gaps in the embedded original numbering (1889->1891, 1895->1897,
// 1899->1901) mean braces/`continue;` lines were lost in this extraction;
// verify against upstream Server.cc before editing.
1889 void Server::force_clients_readonly()
1891 dout(10) << "force_clients_readonly" << dendl
;
// Snapshot the current set of client sessions from the session map.
1892 set
<Session
*> sessions
;
1893 mds
->sessionmap
.get_client_session_set(sessions
);
1894 for (set
<Session
*>::const_iterator p
= sessions
.begin();
1895 p
!= sessions
.end();
1897 Session
*session
= *p
;
// Only open/stale *client* sessions are forced read-only.
1898 if (!session
->info
.inst
.name
.is_client() ||
1899 !(session
->is_open() || session
->is_stale()))
1901 mds
->send_message_client(make_message
<MClientSession
>(CEPH_SESSION_FORCE_RO
), session
);
1906 * some generic stuff for finishing off requests
// Journal a request's log event and reply to the client.
//  mdr : the request being completed
//  in  : trace inode for the reply (may be null)
//  dn  : trace dentry for the reply (may be null)
//  le  : log event to journal
//  fin : completion invoked once the event is safely journaled
// Sends an (unsafe) early reply where permitted, then submits the log entry.
// NOTE(review): numbering gaps (1913->1922, 1929->1931) show lines were lost
// in extraction; compare with upstream before modifying.
1908 void Server::journal_and_reply(MDRequestRef
& mdr
, CInode
*in
, CDentry
*dn
, LogEvent
*le
, MDSLogContextBase
*fin
)
1910 dout(10) << "journal_and_reply tracei " << in
<< " tracedn " << dn
<< dendl
;
// Must not already have replied/completed this request.
1911 ceph_assert(!mdr
->has_completed
);
1913 // note trace items for eventual reply.
// Attempt the unsafe early reply before the journal write completes.
1922 early_reply(mdr
, in
, dn
);
1924 mdr
->committing
= true;
1925 submit_mdlog_entry(le
, fin
, mdr
, __func__
);
// Replayed ops are processed one at a time: queue the next one (if any).
1927 if (mdr
->client_request
&& mdr
->client_request
->is_queued_for_replay()) {
1928 if (mds
->queue_one_replay()) {
1929 dout(10) << " queued next replay op" << dendl
;
1931 dout(10) << " journaled last replay op" << dendl
;
// If we early-replied, rdlocks are no longer needed while the journal flushes.
1933 } else if (mdr
->did_early_reply
)
1934 mds
->locker
->drop_rdlocks_for_early_reply(mdr
.get());
// Record an "submit entry" event on the request for op tracking, then hand the
// log event (with its completion context) to the MDLog.
//  le    : log event to submit
//  fin   : context fired when the entry is safely journaled
//  mdr   : request the event belongs to (used for event marking)
//  event : caller-supplied label for the tracked event
// NOTE(review): the line building event_str from `event` (orig 1944) appears
// lost in extraction — verify against upstream.
1939 void Server::submit_mdlog_entry(LogEvent
*le
, MDSLogContextBase
*fin
, MDRequestRef
& mdr
,
1940 std::string_view event
)
1943 string
event_str("submit entry: ");
1945 mdr
->mark_event(event_str
);
1947 mdlog
->submit_entry(le
, fin
);
1951 * send response built from mdr contents and error code; clean up mdr
// Send a response for `mdr` with result code `r` and clean the request up.
// Client requests: batch heads fan the result out to all batched followers,
// otherwise a normal MClientReply is sent.  Internal ops: complete the stored
// finisher (aborting if none) and finish the request in the cache.
1953 void Server::respond_to_request(MDRequestRef
& mdr
, int r
)
1955 if (mdr
->client_request
) {
// Batch head: one respond() covers every request batched behind this one.
1956 if (mdr
->is_batch_head()) {
1957 dout(20) << __func__
<< " batch head " << *mdr
<< dendl
;
1958 mdr
->release_batch_op()->respond(r
);
1960 reply_client_request(mdr
, make_message
<MClientReply
>(*mdr
->client_request
, r
));
// Internal (server-generated) operation: fire its finisher with the result.
1962 } else if (mdr
->internal_op
> -1) {
1963 dout(10) << "respond_to_request on internal request " << mdr
<< dendl
;
1964 if (!mdr
->internal_op_finish
)
1965 ceph_abort_msg("trying to respond to internal op without finisher");
1966 mdr
->internal_op_finish
->complete(r
);
1967 mdcache
->request_finish(mdr
);
1971 // statistics mds req op number and latency
// Record per-op-type request latency into the server perf counters.
//  req : the client request whose op type selects the counter
//  lat : measured latency to record
// Maps each CEPH_MDS_OP_* to its l_mdss_req_*_latency counter, then tinc()s.
// NOTE(review): the `break;` lines between cases were lost in extraction
// (numbering gaps, e.g. 1977->1979); upstream has a break per case.
1972 void Server::perf_gather_op_latency(const cref_t
<MClientRequest
> &req
, utime_t lat
)
1974 int code
= l_mdss_first
;
1975 switch(req
->get_op()) {
1976 case CEPH_MDS_OP_LOOKUPHASH
:
1977 code
= l_mdss_req_lookuphash_latency
;
1979 case CEPH_MDS_OP_LOOKUPINO
:
1980 code
= l_mdss_req_lookupino_latency
;
1982 case CEPH_MDS_OP_LOOKUPPARENT
:
1983 code
= l_mdss_req_lookupparent_latency
;
1985 case CEPH_MDS_OP_LOOKUPNAME
:
1986 code
= l_mdss_req_lookupname_latency
;
1988 case CEPH_MDS_OP_LOOKUP
:
1989 code
= l_mdss_req_lookup_latency
;
1991 case CEPH_MDS_OP_LOOKUPSNAP
:
1992 code
= l_mdss_req_lookupsnap_latency
;
1994 case CEPH_MDS_OP_GETATTR
:
1995 code
= l_mdss_req_getattr_latency
;
1997 case CEPH_MDS_OP_SETATTR
:
1998 code
= l_mdss_req_setattr_latency
;
2000 case CEPH_MDS_OP_SETLAYOUT
:
2001 code
= l_mdss_req_setlayout_latency
;
2003 case CEPH_MDS_OP_SETDIRLAYOUT
:
2004 code
= l_mdss_req_setdirlayout_latency
;
2006 case CEPH_MDS_OP_GETVXATTR
:
2007 code
= l_mdss_req_getvxattr_latency
;
2009 case CEPH_MDS_OP_SETXATTR
:
2010 code
= l_mdss_req_setxattr_latency
;
2012 case CEPH_MDS_OP_RMXATTR
:
2013 code
= l_mdss_req_rmxattr_latency
;
2015 case CEPH_MDS_OP_READDIR
:
2016 code
= l_mdss_req_readdir_latency
;
2018 case CEPH_MDS_OP_SETFILELOCK
:
2019 code
= l_mdss_req_setfilelock_latency
;
2021 case CEPH_MDS_OP_GETFILELOCK
:
2022 code
= l_mdss_req_getfilelock_latency
;
2024 case CEPH_MDS_OP_CREATE
:
2025 code
= l_mdss_req_create_latency
;
2027 case CEPH_MDS_OP_OPEN
:
2028 code
= l_mdss_req_open_latency
;
2030 case CEPH_MDS_OP_MKNOD
:
2031 code
= l_mdss_req_mknod_latency
;
2033 case CEPH_MDS_OP_LINK
:
2034 code
= l_mdss_req_link_latency
;
2036 case CEPH_MDS_OP_UNLINK
:
2037 code
= l_mdss_req_unlink_latency
;
2039 case CEPH_MDS_OP_RMDIR
:
2040 code
= l_mdss_req_rmdir_latency
;
2042 case CEPH_MDS_OP_RENAME
:
2043 code
= l_mdss_req_rename_latency
;
2045 case CEPH_MDS_OP_MKDIR
:
2046 code
= l_mdss_req_mkdir_latency
;
2048 case CEPH_MDS_OP_SYMLINK
:
2049 code
= l_mdss_req_symlink_latency
;
2051 case CEPH_MDS_OP_LSSNAP
:
2052 code
= l_mdss_req_lssnap_latency
;
2054 case CEPH_MDS_OP_MKSNAP
:
2055 code
= l_mdss_req_mksnap_latency
;
2057 case CEPH_MDS_OP_RMSNAP
:
2058 code
= l_mdss_req_rmsnap_latency
;
2060 case CEPH_MDS_OP_RENAMESNAP
:
2061 code
= l_mdss_req_renamesnap_latency
;
// Unknown op: log it; `code` stays l_mdss_first (the tinc below still runs).
2064 dout(1) << ": unknown client op" << dendl
;
2067 logger
->tinc(code
, lat
);
// Send an "unsafe" early reply to the client before the journal commit, when
// permitted.  Skipped when: the mds_early_reply config is off, the request is
// flagged no_early_reply, peers have journaled state, an inode was allocated,
// the requester is an MDS, or the request is a replay.
// On success: marks xlocks done (except for rename), attaches the trace and
// extra bl, sends the reply, and records latency/perf stats.
// NOTE(review): numbering gaps indicate the early `return;` lines inside the
// guard branches were lost in extraction.
2070 void Server::early_reply(MDRequestRef
& mdr
, CInode
*tracei
, CDentry
*tracedn
)
2072 if (!g_conf()->mds_early_reply
)
2075 if (mdr
->no_early_reply
) {
2076 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl
;
2080 if (mdr
->has_more() && mdr
->more()->has_journaled_peers
) {
2081 dout(10) << "early_reply - there are journaled peers, not allowed." << dendl
;
2085 if (mdr
->alloc_ino
) {
2086 dout(10) << "early_reply - allocated ino, not allowed" << dendl
;
2090 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
2091 entity_inst_t client_inst
= req
->get_source_inst();
// Requests originating from another MDS never get an early reply.
2092 if (client_inst
.name
.is_mds())
2095 if (req
->is_replay()) {
2096 dout(10) << " no early reply on replay op" << dendl
;
// Build the reply and mark it unsafe (journal commit has not happened yet).
2101 auto reply
= make_message
<MClientReply
>(*req
, 0);
2102 reply
->set_unsafe();
2104 // mark xlocks "done", indicating that we are exposing uncommitted changes.
2106 //_rename_finish() does not send dentry link/unlink message to replicas.
2107 // so do not set xlocks on dentries "done", the xlocks prevent dentries
2108 // that have projected linkages from getting new replica.
2109 mds
->locker
->set_xlocks_done(mdr
.get(), req
->get_op() == CEPH_MDS_OP_RENAME
);
2111 dout(10) << "early_reply " << reply
->get_result()
2112 << " (" << cpp_strerror(reply
->get_result())
2113 << ") " << *req
<< dendl
;
// Drop pending cap releases on the traced objects, then embed the trace.
2115 if (tracei
|| tracedn
) {
2117 mdr
->cap_releases
.erase(tracei
->vino());
2119 mdr
->cap_releases
.erase(tracedn
->get_dir()->get_inode()->vino());
2121 set_trace_dist(reply
, tracei
, tracedn
, mdr
);
2124 reply
->set_extra_bl(mdr
->reply_extra_bl
);
2125 mds
->send_message_client(reply
, mdr
->session
);
2127 mdr
->did_early_reply
= true;
// Reply accounting: count, latency, slow-reply threshold, per-op perf stats.
2129 mds
->logger
->inc(l_mds_reply
);
2130 utime_t lat
= ceph_clock_now() - req
->get_recv_stamp();
2131 mds
->logger
->tinc(l_mds_reply_latency
, lat
);
2132 if (lat
>= g_conf()->mds_op_complaint_time
) {
2133 mds
->logger
->inc(l_mds_slow_reply
);
2135 if (client_inst
.name
.is_client()) {
// Keep the session's activity (liveness) counter warm.
2136 mds
->sessionmap
.hit_session(mdr
->session
);
2138 perf_gather_op_latency(req
, lat
);
2139 dout(20) << "lat " << lat
<< dendl
;
2141 mdr
->mark_event("early_replied");
2146 * include a trace to tracei
// Send the final (safe) response built from mdr's contents and clean up the
// request: record it as completed in the session, hand out preallocated inos,
// gather latency stats (unless already counted by early_reply or a replay),
// drop non-rdlocks, attach trace/lease data, send, and finish the request.
// NOTE(review): numbering gaps show some lines (braces, the `if` guarding the
// tracei/tracedn erases at 2203/2205, the condition line before 2250) were
// lost in extraction; verify against upstream Server.cc.
2149 void Server::reply_client_request(MDRequestRef
& mdr
, const ref_t
<MClientReply
> &reply
)
2151 ceph_assert(mdr
.get());
2152 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
2154 dout(7) << "reply_client_request " << reply
->get_result()
2155 << " (" << cpp_strerror(reply
->get_result())
2156 << ") " << *req
<< dendl
;
2158 mdr
->mark_event("replying");
2160 Session
*session
= mdr
->session
;
2162 // note successful request in session map?
2164 // setfilelock requests are special, they only modify states in MDS memory.
2165 // The states get lost when MDS fails. If Client re-send a completed
2166 // setfilelock request, it means that client did not receive corresponding
2167 // setfilelock reply. So MDS should re-execute the setfilelock request.
2168 if (req
->may_write() && req
->get_op() != CEPH_MDS_OP_SETFILELOCK
&&
2169 reply
->get_result() == 0 && session
) {
// Remember the completed tid (and any created ino) for replay dedup.
2170 inodeno_t created
= mdr
->alloc_ino
? mdr
->alloc_ino
: mdr
->used_prealloc_ino
;
2171 session
->add_completed_request(mdr
->reqid
.tid
, created
);
// Mark the session dirty in this log segment so the record is flushed.
2173 mdr
->ls
->touched_sessions
.insert(session
->info
.inst
.name
);
2177 // give any preallocated inos to the session
2178 apply_allocated_inos(mdr
, session
);
2180 // get tracei/tracedn from mdr?
2181 CInode
*tracei
= mdr
->tracei
;
2182 CDentry
*tracedn
= mdr
->tracedn
;
2184 bool is_replay
= mdr
->client_request
->is_replay();
2185 bool did_early_reply
= mdr
->did_early_reply
;
2186 entity_inst_t client_inst
= req
->get_source_inst();
// Stats were already counted by early_reply; replays are not counted at all.
2188 if (!did_early_reply
&& !is_replay
) {
2190 mds
->logger
->inc(l_mds_reply
);
2191 utime_t lat
= ceph_clock_now() - mdr
->client_request
->get_recv_stamp();
2192 mds
->logger
->tinc(l_mds_reply_latency
, lat
);
2193 if (lat
>= g_conf()->mds_op_complaint_time
) {
2194 mds
->logger
->inc(l_mds_slow_reply
);
2196 if (session
&& client_inst
.name
.is_client()) {
2197 mds
->sessionmap
.hit_session(session
);
2199 perf_gather_op_latency(req
, lat
);
2200 dout(20) << "lat " << lat
<< dendl
;
// Cancel pending cap releases on the objects we are about to trace.
2203 mdr
->cap_releases
.erase(tracei
->vino());
2205 mdr
->cap_releases
.erase(tracedn
->get_dir()->get_inode()->vino());
2208 // drop non-rdlocks before replying, so that we can issue leases
2209 mdcache
->request_drop_non_rdlocks(mdr
);
2212 if (session
&& !client_inst
.name
.is_mds()) {
2214 if (!did_early_reply
&& // don't issue leases if we sent an earlier reply already
2215 (tracei
|| tracedn
)) {
2218 mdcache
->try_reconnect_cap(tracei
, session
);
2220 // include metadata in reply
2221 set_trace_dist(reply
, tracei
, tracedn
, mdr
);
2225 // We can set the extra bl unconditionally: if it's already been sent in the
2226 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
2227 reply
->set_extra_bl(mdr
->reply_extra_bl
);
2229 reply
->set_mdsmap_epoch(mds
->mdsmap
->get_epoch());
2230 mds
->send_message_client(reply
, session
);
// Replayed op finished (or failed): log failures and advance the replay queue.
2233 if (req
->is_queued_for_replay() &&
2234 (mdr
->has_completed
|| reply
->get_result() < 0)) {
2235 if (reply
->get_result() < 0) {
2236 int r
= reply
->get_result();
2237 derr
<< "reply_client_request: failed to replay " << *req
2238 << " error " << r
<< " (" << cpp_strerror(r
) << ")" << dendl
;
2239 mds
->clog
->warn() << "failed to replay " << req
->get_reqid() << " error " << r
;
2241 mds
->queue_one_replay();
2245 mdcache
->request_finish(mdr
);
2247 // take a closer look at tracei, if it happens to be a remote link
2250 tracedn
->get_projected_linkage()->is_remote()) {
2251 mdcache
->eval_remote(tracedn
);
2256 * pass inode OR dentry (not both, or we may get confused)
2258 * trace is in reverse order (i.e. root inode comes last)
// Build the metadata trace carried in a client reply: snap-realm trace,
// then (if a dentry is traced) the parent dir inode stat, dirstat, dentry
// name and lease, then the target inode stat.
// Pass inode OR dentry (not both, or we may get confused); the trace is in
// reverse order (root inode comes last).
// NOTE(review): the mdr parameter line, the bufferlist `bl` declaration, the
// `DirStat ds` declaration and several braces are missing from this
// extraction (numbering gaps, e.g. 2261->2264, 2303->2306) — verify upstream.
2260 void Server::set_trace_dist(const ref_t
<MClientReply
> &reply
,
2261 CInode
*in
, CDentry
*dn
,
2264 // skip doing this for debugging purposes?
2265 if (g_conf()->mds_inject_traceless_reply_probability
&&
2266 mdr
->ls
&& !mdr
->o_trunc
&&
2267 (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability
* 10000.0)) {
2268 dout(5) << "deliberately skipping trace for " << *reply
<< dendl
;
2272 // inode, dentry, dir, ..., inode
2274 mds_rank_t whoami
= mds
->get_nodeid();
2275 Session
*session
= mdr
->session
;
2276 snapid_t snapid
= mdr
->snapid
;
2277 utime_t now
= ceph_clock_now();
2279 dout(20) << "set_trace_dist snapid " << snapid
<< dendl
;
// For live (non-snapshot) lookups, attach the snap realm's trace so the
// client can keep its realm hierarchy up to date.
2282 if (snapid
== CEPH_NOSNAP
) {
2285 realm
= in
->find_snaprealm();
2287 realm
= dn
->get_dir()->get_inode()->find_snaprealm();
2288 reply
->snapbl
= realm
->get_snap_trace();
2289 dout(10) << "set_trace_dist snaprealm " << *realm
<< " len=" << reply
->snapbl
.length() << dendl
;
// Dentry branch: encode parent inode stat, dirstat, name and client lease.
2294 reply
->head
.is_dentry
= 1;
2295 CDir
*dir
= dn
->get_dir();
2296 CInode
*diri
= dir
->get_inode();
2298 diri
->encode_inodestat(bl
, session
, NULL
, snapid
);
2299 dout(20) << "set_trace_dist added diri " << *diri
<< dendl
;
2301 #ifdef MDS_VERIFY_FRAGSTAT
2302 if (dir
->is_complete())
2303 dir
->verify_fragstat();
2306 ds
.frag
= dir
->get_frag();
2307 ds
.auth
= dir
->get_dir_auth().first
;
// Only advertise the dir's distribution when we are auth and not forcing
// all requests to the auth MDS.
2308 if (dir
->is_auth() && !forward_all_requests_to_auth
)
2309 dir
->get_dist_spec(ds
.dist
, whoami
);
2311 dir
->encode_dirstat(bl
, session
->info
, ds
);
2312 dout(20) << "set_trace_dist added dir " << *dir
<< dendl
;
2314 encode(dn
->get_name(), bl
);
// Resolve the dentry's (possibly projected) linkage for this client.
2317 CDentry::linkage_t
*dnl
= dn
->get_linkage(mdr
->get_client(), mdr
);
2318 if (dnl
->is_primary()) {
2319 ceph_assert(dnl
->get_inode() == in
);
2320 lease_mask
= CEPH_LEASE_PRIMARY_LINK
;
2322 if (dnl
->is_remote())
2323 ceph_assert(dnl
->get_remote_ino() == in
->ino());
2327 mds
->locker
->issue_client_lease(dn
, mdr
, lease_mask
, now
, bl
);
2328 dout(20) << "set_trace_dist added dn " << snapid
<< " " << *dn
<< dendl
;
2330 reply
->head
.is_dentry
= 0;
// Target inode stat (honoring any getattr caps mask on the request).
2334 in
->encode_inodestat(bl
, session
, NULL
, snapid
, 0, mdr
->getattr_caps
);
2335 dout(20) << "set_trace_dist added in " << *in
<< dendl
;
2336 reply
->head
.is_target
= 1;
2338 reply
->head
.is_target
= 0;
2340 reply
->set_trace(bl
);
// Entry point for an incoming MClientRequest: validate the session, handle
// retried/replayed requests that already completed, trim the session's
// completed-request list (warning clients that do not advance their oldest
// tid), register an MDRequest, process embedded cap releases, and dispatch.
// NOTE(review): many lines (returns, braces, the `created`/`extra` decls,
// several guards) were lost in extraction — numbering gaps throughout;
// verify against upstream Server.cc before editing.
2343 void Server::handle_client_request(const cref_t
<MClientRequest
> &req
)
2345 dout(4) << "handle_client_request " << *req
<< dendl
;
2348 mds
->logger
->inc(l_mds_request
);
2350 logger
->inc(l_mdss_handle_client_request
);
// Cache not open yet (root not loaded): park the message and retry later.
2352 if (!mdcache
->is_open()) {
2353 dout(5) << "waiting for root" << dendl
;
2354 mdcache
->wait_for_open(new C_MDS_RetryMessage(mds
, req
));
2358 bool sessionclosed_isok
= replay_unsafe_with_closed_session
;
// Session validation: client requests need a live (not closed/closing/
// killing) session, except closed sessions during clientreplay when allowed.
2360 Session
*session
= 0;
2361 if (req
->get_source().is_client()) {
2362 session
= mds
->get_session(req
);
2364 dout(5) << "no session for " << req
->get_source() << ", dropping" << dendl
;
2365 } else if ((session
->is_closed() && (!mds
->is_clientreplay() || !sessionclosed_isok
)) ||
2366 session
->is_closing() ||
2367 session
->is_killing()) {
2368 dout(5) << "session closed|closing|killing, dropping" << dendl
;
// Even when dropping, keep the replay queue moving.
2372 if (req
->is_queued_for_replay())
2373 mds
->queue_one_replay();
2379 if (req
->get_mdsmap_epoch() < mds
->mdsmap
->get_epoch()) {
2380 // send it? hrm, this isn't ideal; they may get a lot of copies if
2381 // they have a high request rate.
2384 // completed request?
2385 bool has_completed
= false;
2386 if (req
->is_replay() || req
->get_retry_attempt()) {
2387 ceph_assert(session
);
2389 if (session
->have_completed_request(req
->get_reqid().tid
, &created
)) {
2390 has_completed
= true;
2391 if (!session
->is_open())
2393 // Don't send traceless reply if the completed request has created
2394 // new inode. Treat the request as lookup request instead.
2395 if (req
->is_replay() ||
2396 ((created
== inodeno_t() || !mds
->is_clientreplay()) &&
2397 req
->get_op() != CEPH_MDS_OP_OPEN
&&
2398 req
->get_op() != CEPH_MDS_OP_CREATE
)) {
2399 dout(5) << "already completed " << req
->get_reqid() << dendl
;
// Traceless success reply; include the created ino (if any) as extra data.
2400 auto reply
= make_message
<MClientReply
>(*req
, 0);
2401 if (created
!= inodeno_t()) {
2403 encode(created
, extra
);
2404 reply
->set_extra_bl(extra
);
2406 mds
->send_message_client(reply
, session
);
2408 if (req
->is_queued_for_replay())
2409 mds
->queue_one_replay();
// Completed request that created an inode: re-run it as a lookup/getattr.
2413 if (req
->get_op() != CEPH_MDS_OP_OPEN
&&
2414 req
->get_op() != CEPH_MDS_OP_CREATE
) {
2415 dout(10) << " completed request which created new inode " << created
2416 << ", convert it to lookup request" << dendl
;
2417 req
->head
.op
= req
->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP
: CEPH_MDS_OP_GETATTR
;
2418 req
->head
.args
.getattr
.mask
= CEPH_STAT_CAP_INODE_ALL
;
2423 // trim completed_request list
2424 if (req
->get_oldest_client_tid() > 0) {
2425 dout(15) << " oldest_client_tid=" << req
->get_oldest_client_tid() << dendl
;
2426 ceph_assert(session
);
2427 if (session
->trim_completed_requests(req
->get_oldest_client_tid())) {
2428 // Sessions 'completed_requests' was dirtied, mark it to be
2429 // potentially flushed at segment expiry.
2430 mdlog
->get_current_segment()->touched_sessions
.insert(session
->info
.inst
.name
);
// Reset the warning counter once the backlog has shrunk enough.
2432 if (session
->get_num_trim_requests_warnings() > 0 &&
2433 session
->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests
)
2434 session
->reset_num_trim_requests_warnings();
// Warn (with exponential backoff via the warnings shift) when a client
// fails to advance its oldest_client_tid and the list keeps growing.
2436 if (session
->get_num_completed_requests() >=
2437 (g_conf()->mds_max_completed_requests
<< session
->get_num_trim_requests_warnings())) {
2438 session
->inc_num_trim_requests_warnings();
2439 CachedStackStringStream css
;
2440 *css
<< "client." << session
->get_client() << " does not advance its oldest_client_tid ("
2441 << req
->get_oldest_client_tid() << "), "
2442 << session
->get_num_completed_requests()
2443 << " completed requests recorded in session\n";
2444 mds
->clog
->warn() << css
->strv();
2445 dout(20) << __func__
<< " " << css
->strv() << dendl
;
2450 // register + dispatch
2451 MDRequestRef mdr
= mdcache
->request_start(req
);
2456 mdr
->session
= session
;
2457 session
->requests
.push_back(&mdr
->item_session_request
);
2461 mdr
->has_completed
= true;
2463 // process embedded cap releases?
2464 // (only if NOT replay!)
2465 if (!req
->releases
.empty() && req
->get_source().is_client() && !req
->is_replay()) {
2466 client_t client
= req
->get_source().num();
2467 for (const auto &r
: req
->releases
) {
2468 mds
->locker
->process_request_cap_release(mdr
, client
, r
.item
, r
.dname
);
2470 req
->releases
.clear();
2473 dispatch_client_request(mdr
);
// React to a new OSDMap: cache whether the metadata pool currently has the
// FULL flag set (into this->is_full), which gates write ops elsewhere.
2477 void Server::handle_osd_map()
2479 /* Note that we check the OSDMAP_FULL flag directly rather than
2480 * using osdmap_full_flag(), because we want to know "is the flag set"
2481 * rather than "does the flag apply to us?" */
2482 mds
->objecter
->with_osdmap([this](const OSDMap
& o
) {
2483 auto pi
= o
.get_pg_pool(mds
->get_metadata_pool());
// Pool may be absent from the map; treat that as not-full.
2484 is_full
= pi
&& pi
->has_flag(pg_pool_t::FLAG_FULL
);
2485 dout(7) << __func__
<< ": full = " << is_full
<< " epoch = "
2486 << o
.get_epoch() << dendl
;
// Dispatch a registered client MDRequest to its op-specific handler.
// Handles killed requests (promoting a batch follower to new batch head),
// aborted requests, read-only FS rejection, propagated peer errors, and the
// metadata-pool-full policy (ENOSPC unless the caller holds FULL caps for
// space-freeing ops) before the big per-op switch.
// NOTE(review): numbering gaps show lost lines (the killed/full guards'
// conditions, `break;`s between cases, returns) — verify against upstream.
2490 void Server::dispatch_client_request(MDRequestRef
& mdr
)
2492 // we shouldn't be waiting on anyone.
2493 ceph_assert(!mdr
->has_more() || mdr
->more()->waiting_on_peer
.empty());
2496 dout(10) << "request " << *mdr
<< " was killed" << dendl
;
2497 //if the mdr is a "batch_op" and it has followers, pick a follower as
2498 //the new "head of the batch ops" and go on processing the new one.
2499 if (mdr
->is_batch_head()) {
2500 int mask
= mdr
->client_request
->head
.args
.getattr
.mask
;
2501 auto it
= mdr
->batch_op_map
->find(mask
);
2502 auto new_batch_head
= it
->second
->find_new_head();
2503 if (!new_batch_head
) {
2504 mdr
->batch_op_map
->erase(it
);
// Continue processing under the promoted follower.
2507 mdr
= std::move(new_batch_head
);
2511 } else if (mdr
->aborted
) {
2512 mdr
->aborted
= false;
2513 mdcache
->request_kill(mdr
);
2517 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
2519 if (logger
) logger
->inc(l_mdss_dispatch_client_request
);
2521 dout(7) << "dispatch_client_request " << *req
<< dendl
;
2523 if (req
->may_write() && mdcache
->is_readonly()) {
2524 dout(10) << " read-only FS" << dendl
;
2525 respond_to_request(mdr
, -CEPHFS_EROFS
);
2528 if (mdr
->has_more() && mdr
->more()->peer_error
) {
2529 dout(10) << " got error from peers" << dendl
;
2530 respond_to_request(mdr
, mdr
->more()->peer_error
);
2535 CInode
*cur
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
2537 // the request is already responded to
// Ops listed below may free space (or are already mid-flight as peer ops),
// so when the pool is full they are permitted only with FULL caps.
// NOTE(review): CEPH_MDS_OP_SETLAYOUT appears twice in this condition
// (orig 2540 and 2542) — looks like a duplicate; confirm against upstream.
2540 if (req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
2541 req
->get_op() == CEPH_MDS_OP_SETDIRLAYOUT
||
2542 req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
2543 req
->get_op() == CEPH_MDS_OP_RMXATTR
||
2544 req
->get_op() == CEPH_MDS_OP_SETXATTR
||
2545 req
->get_op() == CEPH_MDS_OP_CREATE
||
2546 req
->get_op() == CEPH_MDS_OP_SYMLINK
||
2547 req
->get_op() == CEPH_MDS_OP_MKSNAP
||
2548 ((req
->get_op() == CEPH_MDS_OP_LINK
||
2549 req
->get_op() == CEPH_MDS_OP_RENAME
) &&
2550 (!mdr
->has_more() || mdr
->more()->witnessed
.empty())) // haven't started peer request
2553 if (check_access(mdr
, cur
, MAY_FULL
)) {
2554 dout(20) << __func__
<< ": full, has FULL caps, permitting op " << ceph_mds_op_name(req
->get_op()) << dendl
;
2556 dout(20) << __func__
<< ": full, responding CEPHFS_ENOSPC to op " << ceph_mds_op_name(req
->get_op()) << dendl
;
2557 respond_to_request(mdr
, -CEPHFS_ENOSPC
);
2561 dout(20) << __func__
<< ": full, permitting op " << ceph_mds_op_name(req
->get_op()) << dendl
;
// Per-op dispatch.
2565 switch (req
->get_op()) {
2566 case CEPH_MDS_OP_LOOKUPHASH
:
2567 case CEPH_MDS_OP_LOOKUPINO
:
2568 handle_client_lookup_ino(mdr
, false, false);
2570 case CEPH_MDS_OP_LOOKUPPARENT
:
2571 handle_client_lookup_ino(mdr
, true, false);
2573 case CEPH_MDS_OP_LOOKUPNAME
:
2574 handle_client_lookup_ino(mdr
, false, true);
2578 case CEPH_MDS_OP_LOOKUP
:
2579 handle_client_getattr(mdr
, true);
2582 case CEPH_MDS_OP_LOOKUPSNAP
:
2583 // lookupsnap does not reference a CDentry; treat it as a getattr
2584 case CEPH_MDS_OP_GETATTR
:
2585 handle_client_getattr(mdr
, false);
2587 case CEPH_MDS_OP_GETVXATTR
:
2588 handle_client_getvxattr(mdr
);
2591 case CEPH_MDS_OP_SETATTR
:
2592 handle_client_setattr(mdr
);
2594 case CEPH_MDS_OP_SETLAYOUT
:
2595 handle_client_setlayout(mdr
);
2597 case CEPH_MDS_OP_SETDIRLAYOUT
:
2598 handle_client_setdirlayout(mdr
);
2600 case CEPH_MDS_OP_SETXATTR
:
2601 handle_client_setxattr(mdr
);
2603 case CEPH_MDS_OP_RMXATTR
:
2604 handle_client_removexattr(mdr
);
2607 case CEPH_MDS_OP_READDIR
:
2608 handle_client_readdir(mdr
);
2611 case CEPH_MDS_OP_SETFILELOCK
:
2612 handle_client_file_setlock(mdr
);
2615 case CEPH_MDS_OP_GETFILELOCK
:
2616 handle_client_file_readlock(mdr
);
2620 case CEPH_MDS_OP_CREATE
:
2621 if (mdr
->has_completed
)
2622 handle_client_open(mdr
); // already created.. just open
2624 handle_client_openc(mdr
);
2627 case CEPH_MDS_OP_OPEN
:
2628 handle_client_open(mdr
);
2633 case CEPH_MDS_OP_MKNOD
:
2634 handle_client_mknod(mdr
);
2636 case CEPH_MDS_OP_LINK
:
2637 handle_client_link(mdr
);
2639 case CEPH_MDS_OP_UNLINK
:
2640 case CEPH_MDS_OP_RMDIR
:
2641 handle_client_unlink(mdr
);
2643 case CEPH_MDS_OP_RENAME
:
2644 handle_client_rename(mdr
);
2646 case CEPH_MDS_OP_MKDIR
:
2647 handle_client_mkdir(mdr
);
2649 case CEPH_MDS_OP_SYMLINK
:
2650 handle_client_symlink(mdr
);
2655 case CEPH_MDS_OP_LSSNAP
:
2656 handle_client_lssnap(mdr
);
2658 case CEPH_MDS_OP_MKSNAP
:
2659 handle_client_mksnap(mdr
);
2661 case CEPH_MDS_OP_RMSNAP
:
2662 handle_client_rmsnap(mdr
);
2664 case CEPH_MDS_OP_RENAMESNAP
:
2665 handle_client_renamesnap(mdr
);
// Unknown op: reject with EOPNOTSUPP.
2669 dout(1) << " unknown client op " << req
->get_op() << dendl
;
2670 respond_to_request(mdr
, -CEPHFS_EOPNOTSUPP
);
2675 // ---------------------------------------
// Handle an MMDSPeerRequest from the leader MDS of a multi-MDS operation.
// Routes replies to handle_peer_request_reply, acks RENAMENOTIFY, decodes a
// replica stray dentry if attached, waits for a usable MDS state, then either
// matches an existing peer MDRequest (attempt/ownership checks, DROPLOCKS,
// FINISH/abort handling) or starts a new one, and dispatches it.
// NOTE(review): numbering gaps show lost lines (the is-reply guard before
// 2687, returns, braces) — verify against upstream Server.cc.
2678 void Server::handle_peer_request(const cref_t
<MMDSPeerRequest
> &m
)
2680 dout(4) << "handle_peer_request " << m
->get_reqid() << " from " << m
->get_source() << dendl
;
2681 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2683 if (logger
) logger
->inc(l_mdss_handle_peer_request
);
2687 return handle_peer_request_reply(m
);
2689 // the purpose of rename notify is enforcing causal message ordering. making sure
2690 // bystanders have received all messages from rename srcdn's auth MDS.
2691 if (m
->get_op() == MMDSPeerRequest::OP_RENAMENOTIFY
) {
2692 auto reply
= make_message
<MMDSPeerRequest
>(m
->get_reqid(), m
->get_attempt(), MMDSPeerRequest::OP_RENAMENOTIFYACK
);
2693 mds
->send_message(reply
, m
->get_connection());
// Decode the stray dentry replica carried with the request, if any.
2697 CDentry
*straydn
= NULL
;
2698 if (m
->straybl
.length() > 0) {
2699 mdcache
->decode_replica_stray(straydn
, nullptr, m
->straybl
, from
);
2700 ceph_assert(straydn
);
// Not ready to process peer requests yet: park the message until replay.
2704 if (!mds
->is_clientreplay() && !mds
->is_active() && !mds
->is_stopping()) {
2705 dout(3) << "not clientreplay|active yet, waiting" << dendl
;
2706 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
2712 if (mdcache
->have_request(m
->get_reqid())) {
2714 mdr
= mdcache
->request_get(m
->get_reqid());
2716 // is my request newer?
2717 if (mdr
->attempt
> m
->get_attempt()) {
2718 dout(10) << "local request " << *mdr
<< " attempt " << mdr
->attempt
<< " > " << m
->get_attempt()
2719 << ", dropping " << *m
<< dendl
;
2723 if (mdr
->attempt
< m
->get_attempt()) {
2724 // mine is old, close it out
2725 dout(10) << "local request " << *mdr
<< " attempt " << mdr
->attempt
<< " < " << m
->get_attempt()
2726 << ", closing out" << dendl
;
2727 mdcache
->request_finish(mdr
);
2729 } else if (mdr
->peer_to_mds
!= from
) {
2730 dout(10) << "local request " << *mdr
<< " not peer to mds." << from
<< dendl
;
2734 // may get these while mdr->peer_request is non-null
2735 if (m
->get_op() == MMDSPeerRequest::OP_DROPLOCKS
) {
2736 mds
->locker
->drop_locks(mdr
.get());
2739 if (m
->get_op() == MMDSPeerRequest::OP_FINISH
) {
2740 if (m
->is_abort()) {
2741 mdr
->aborted
= true;
2742 if (mdr
->peer_request
) {
2743 // only abort on-going xlock, wrlock and auth pin
2744 ceph_assert(!mdr
->peer_did_prepare());
2746 mdcache
->request_finish(mdr
);
// Non-abort FINISH: import any exported inode state, then finish.
2749 if (m
->inode_export
.length() > 0)
2750 mdr
->more()->inode_import
= m
->inode_export
;
2751 // finish off request.
2752 mdcache
->request_finish(mdr
);
// No matching local request: a FINISH here lost a race with a forward.
2759 if (m
->get_op() == MMDSPeerRequest::OP_FINISH
) {
2760 dout(10) << "missing peer request for " << m
->get_reqid()
2761 << " OP_FINISH, must have lost race with a forward" << dendl
;
// Start a fresh peer MDRequest for this leader request.
2764 mdr
= mdcache
->request_start_peer(m
->get_reqid(), m
->get_attempt(), m
);
2765 mdr
->set_op_stamp(m
->op_stamp
);
2767 ceph_assert(mdr
->peer_request
== 0); // only one at a time, please!
2771 mdr
->straydn
= straydn
;
// During clientreplay, defer peer requests from non-clientreplay ranks
// unless we already hold locks for this request.
2774 if (mds
->is_clientreplay() && !mds
->mdsmap
->is_clientreplay(from
) &&
2775 mdr
->locks
.empty()) {
2776 dout(3) << "not active yet, waiting" << dendl
;
2777 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
2781 mdr
->reset_peer_request(m
);
2783 dispatch_peer_request(mdr
);
// Handle a peer MDS's reply (ack) to a request we lead.  Waits for a usable
// MDS state (ignoring replies for unknown uncommitted leaders), processes
// OP_COMMITTED directly, verifies the attempt matches our MDRequest, then
// dispatches on the ack type: XLOCKACK/WRLOCKACK record the granted remote
// lock and re-dispatch; the *PREPACK/NOTIFYACK types go to their handlers.
// NOTE(review): numbering gaps show lost lines (returns, braces, `break;`s
// between switch cases) — verify against upstream Server.cc.
2786 void Server::handle_peer_request_reply(const cref_t
<MMDSPeerRequest
> &m
)
2788 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2790 if (!mds
->is_clientreplay() && !mds
->is_active() && !mds
->is_stopping()) {
2791 metareqid_t r
= m
->get_reqid();
// No uncommitted leader op for this reqid/peer: the reply is stale; ignore.
2792 if (!mdcache
->have_uncommitted_leader(r
, from
)) {
2793 dout(10) << "handle_peer_request_reply ignoring peer reply from mds."
2794 << from
<< " reqid " << r
<< dendl
;
2797 dout(3) << "not clientreplay|active yet, waiting" << dendl
;
2798 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
// Peer acknowledges it committed: record it and we're done.
2802 if (m
->get_op() == MMDSPeerRequest::OP_COMMITTED
) {
2803 metareqid_t r
= m
->get_reqid();
2804 mdcache
->committed_leader_peer(r
, from
);
2808 MDRequestRef mdr
= mdcache
->request_get(m
->get_reqid());
// Replies from a superseded attempt are ignored.
2809 if (m
->get_attempt() != mdr
->attempt
) {
2810 dout(10) << "handle_peer_request_reply " << *mdr
<< " ignoring reply from other attempt "
2811 << m
->get_attempt() << dendl
;
2815 switch (m
->get_op()) {
2816 case MMDSPeerRequest::OP_XLOCKACK
:
2818 // identify lock, leader request
2819 SimpleLock
*lock
= mds
->locker
->get_lock(m
->get_lock_type(),
2820 m
->get_object_info());
2821 mdr
->more()->peers
.insert(from
);
// Adopt the peer's lock state, record the xlock locally, and continue.
2822 lock
->decode_locked_state(m
->get_lock_data());
2823 dout(10) << "got remote xlock on " << *lock
<< " on " << *lock
->get_parent() << dendl
;
2824 mdr
->emplace_lock(lock
, MutationImpl::LockOp::XLOCK
);
2825 mdr
->finish_locking(lock
);
2826 lock
->get_xlock(mdr
, mdr
->get_client());
// This must have been the only peer we were waiting on.
2828 ceph_assert(mdr
->more()->waiting_on_peer
.count(from
));
2829 mdr
->more()->waiting_on_peer
.erase(from
);
2830 ceph_assert(mdr
->more()->waiting_on_peer
.empty());
2831 mdcache
->dispatch_request(mdr
);
2835 case MMDSPeerRequest::OP_WRLOCKACK
:
2837 // identify lock, leader request
2838 SimpleLock
*lock
= mds
->locker
->get_lock(m
->get_lock_type(),
2839 m
->get_object_info());
2840 mdr
->more()->peers
.insert(from
);
2841 dout(10) << "got remote wrlock on " << *lock
<< " on " << *lock
->get_parent() << dendl
;
// Record the remote wrlock with its target rank, then continue.
2842 auto it
= mdr
->emplace_lock(lock
, MutationImpl::LockOp::REMOTE_WRLOCK
, from
);
2843 ceph_assert(it
->is_remote_wrlock());
2844 ceph_assert(it
->wrlock_target
== from
);
2846 mdr
->finish_locking(lock
);
2848 ceph_assert(mdr
->more()->waiting_on_peer
.count(from
));
2849 mdr
->more()->waiting_on_peer
.erase(from
);
2850 ceph_assert(mdr
->more()->waiting_on_peer
.empty());
2851 mdcache
->dispatch_request(mdr
);
2855 case MMDSPeerRequest::OP_AUTHPINACK
:
2856 handle_peer_auth_pin_ack(mdr
, m
);
2859 case MMDSPeerRequest::OP_LINKPREPACK
:
2860 handle_peer_link_prep_ack(mdr
, m
);
2863 case MMDSPeerRequest::OP_RMDIRPREPACK
:
2864 handle_peer_rmdir_prep_ack(mdr
, m
);
2867 case MMDSPeerRequest::OP_RENAMEPREPACK
:
2868 handle_peer_rename_prep_ack(mdr
, m
);
2871 case MMDSPeerRequest::OP_RENAMENOTIFYACK
:
2872 handle_peer_rename_notify_ack(mdr
, m
);
2880 void Server::dispatch_peer_request(MDRequestRef
& mdr
)
2882 dout(7) << "dispatch_peer_request " << *mdr
<< " " << *mdr
->peer_request
<< dendl
;
2885 dout(7) << " abort flag set, finishing" << dendl
;
2886 mdcache
->request_finish(mdr
);
2890 if (logger
) logger
->inc(l_mdss_dispatch_peer_request
);
2892 int op
= mdr
->peer_request
->get_op();
2894 case MMDSPeerRequest::OP_XLOCK
:
2895 case MMDSPeerRequest::OP_WRLOCK
:
2898 SimpleLock
*lock
= mds
->locker
->get_lock(mdr
->peer_request
->get_lock_type(),
2899 mdr
->peer_request
->get_object_info());
2902 dout(10) << "don't have object, dropping" << dendl
;
2903 ceph_abort(); // can this happen, if we auth pinned properly.
2905 if (op
== MMDSPeerRequest::OP_XLOCK
&& !lock
->get_parent()->is_auth()) {
2906 dout(10) << "not auth for remote xlock attempt, dropping on "
2907 << *lock
<< " on " << *lock
->get_parent() << dendl
;
2909 // use acquire_locks so that we get auth_pinning.
2910 MutationImpl::LockOpVec lov
;
2911 for (const auto& p
: mdr
->locks
) {
2913 lov
.add_xlock(p
.lock
);
2914 else if (p
.is_wrlock())
2915 lov
.add_wrlock(p
.lock
);
2920 case MMDSPeerRequest::OP_XLOCK
:
2921 lov
.add_xlock(lock
);
2922 replycode
= MMDSPeerRequest::OP_XLOCKACK
;
2924 case MMDSPeerRequest::OP_WRLOCK
:
2925 lov
.add_wrlock(lock
);
2926 replycode
= MMDSPeerRequest::OP_WRLOCKACK
;
2930 if (!mds
->locker
->acquire_locks(mdr
, lov
))
2934 auto r
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, replycode
);
2935 r
->set_lock_type(lock
->get_type());
2936 lock
->get_parent()->set_object_info(r
->get_object_info());
2937 if (replycode
== MMDSPeerRequest::OP_XLOCKACK
)
2938 lock
->encode_locked_state(r
->get_lock_data());
2939 mds
->send_message(r
, mdr
->peer_request
->get_connection());
2943 mdr
->reset_peer_request();
2947 case MMDSPeerRequest::OP_UNXLOCK
:
2948 case MMDSPeerRequest::OP_UNWRLOCK
:
2950 SimpleLock
*lock
= mds
->locker
->get_lock(mdr
->peer_request
->get_lock_type(),
2951 mdr
->peer_request
->get_object_info());
2953 auto it
= mdr
->locks
.find(lock
);
2954 ceph_assert(it
!= mdr
->locks
.end());
2955 bool need_issue
= false;
2957 case MMDSPeerRequest::OP_UNXLOCK
:
2958 mds
->locker
->xlock_finish(it
, mdr
.get(), &need_issue
);
2960 case MMDSPeerRequest::OP_UNWRLOCK
:
2961 mds
->locker
->wrlock_finish(it
, mdr
.get(), &need_issue
);
2965 mds
->locker
->issue_caps(static_cast<CInode
*>(lock
->get_parent()));
2967 // done. no ack necessary.
2968 mdr
->reset_peer_request();
2972 case MMDSPeerRequest::OP_AUTHPIN
:
2973 handle_peer_auth_pin(mdr
);
2976 case MMDSPeerRequest::OP_LINKPREP
:
2977 case MMDSPeerRequest::OP_UNLINKPREP
:
2978 handle_peer_link_prep(mdr
);
2981 case MMDSPeerRequest::OP_RMDIRPREP
:
2982 handle_peer_rmdir_prep(mdr
);
2985 case MMDSPeerRequest::OP_RENAMEPREP
:
2986 handle_peer_rename_prep(mdr
);
2994 void Server::handle_peer_auth_pin(MDRequestRef
& mdr
)
2996 dout(10) << "handle_peer_auth_pin " << *mdr
<< dendl
;
2998 // build list of objects
2999 list
<MDSCacheObject
*> objects
;
3000 CInode
*auth_pin_freeze
= NULL
;
3001 bool nonblocking
= mdr
->peer_request
->is_nonblocking();
3002 bool fail
= false, wouldblock
= false, readonly
= false;
3003 ref_t
<MMDSPeerRequest
> reply
;
3005 if (mdcache
->is_readonly()) {
3006 dout(10) << " read-only FS" << dendl
;
3012 for (const auto &oi
: mdr
->peer_request
->get_authpins()) {
3013 MDSCacheObject
*object
= mdcache
->get_object(oi
);
3015 dout(10) << " don't have " << oi
<< dendl
;
3020 objects
.push_back(object
);
3021 if (oi
== mdr
->peer_request
->get_authpin_freeze())
3022 auth_pin_freeze
= static_cast<CInode
*>(object
);
3026 // can we auth pin them?
3028 for (const auto& obj
: objects
) {
3029 if (!obj
->is_auth()) {
3030 dout(10) << " not auth for " << *obj
<< dendl
;
3034 if (mdr
->is_auth_pinned(obj
))
3036 if (!mdr
->can_auth_pin(obj
)) {
3038 dout(10) << " can't auth_pin (freezing?) " << *obj
<< " nonblocking" << dendl
;
3044 dout(10) << " waiting for authpinnable on " << *obj
<< dendl
;
3045 obj
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
3046 mdr
->drop_local_auth_pins();
3048 mds
->locker
->notify_freeze_waiter(obj
);
3055 /* freeze authpin wrong inode */
3056 if (mdr
->has_more() && mdr
->more()->is_freeze_authpin
&&
3057 mdr
->more()->rename_inode
!= auth_pin_freeze
)
3058 mdr
->unfreeze_auth_pin(true);
3060 /* handle_peer_rename_prep() call freeze_inode() to wait for all other operations
3061 * on the source inode to complete. This happens after all locks for the rename
3062 * operation are acquired. But to acquire locks, we need auth pin locks' parent
3063 * objects first. So there is an ABBA deadlock if someone auth pins the source inode
3064 * after locks are acquired and before Server::handle_peer_rename_prep() is called.
3065 * The solution is freeze the inode and prevent other MDRequests from getting new
3068 if (auth_pin_freeze
) {
3069 dout(10) << " freezing auth pin on " << *auth_pin_freeze
<< dendl
;
3070 if (!mdr
->freeze_auth_pin(auth_pin_freeze
)) {
3071 auth_pin_freeze
->add_waiter(CInode::WAIT_FROZEN
, new C_MDS_RetryRequest(mdcache
, mdr
));
3072 mds
->mdlog
->flush();
3078 reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_AUTHPINACK
);
3081 mdr
->drop_local_auth_pins(); // just in case
3083 reply
->mark_error_rofs();
3085 reply
->mark_error_wouldblock();
3088 for (const auto& obj
: objects
) {
3089 dout(10) << "auth_pinning " << *obj
<< dendl
;
3092 // return list of my auth_pins (if any)
3093 for (const auto &p
: mdr
->object_states
) {
3094 if (!p
.second
.auth_pinned
)
3096 MDSCacheObjectInfo info
;
3097 p
.first
->set_object_info(info
);
3098 reply
->get_authpins().push_back(info
);
3099 if (p
.first
== (MDSCacheObject
*)auth_pin_freeze
)
3100 auth_pin_freeze
->set_object_info(reply
->get_authpin_freeze());
3104 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
3106 // clean up this request
3107 mdr
->reset_peer_request();
3111 if (mdr
->peer_request
->should_notify_blocking()) {
3112 reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_AUTHPINACK
);
3113 reply
->mark_req_blocked();
3114 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
3115 mdr
->peer_request
->clear_notify_blocking();
3120 void Server::handle_peer_auth_pin_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &ack
)
3122 dout(10) << "handle_peer_auth_pin_ack on " << *mdr
<< " " << *ack
<< dendl
;
3123 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
3125 if (ack
->is_req_blocked()) {
3126 mdr
->disable_lock_cache();
3127 // peer auth pin is blocked, drop locks to avoid deadlock
3128 mds
->locker
->drop_locks(mdr
.get(), nullptr);
3133 set
<MDSCacheObject
*> pinned
;
3134 for (const auto &oi
: ack
->get_authpins()) {
3135 MDSCacheObject
*object
= mdcache
->get_object(oi
);
3136 ceph_assert(object
); // we pinned it
3137 dout(10) << " remote has pinned " << *object
<< dendl
;
3138 mdr
->set_remote_auth_pinned(object
, from
);
3139 if (oi
== ack
->get_authpin_freeze())
3140 mdr
->set_remote_frozen_auth_pin(static_cast<CInode
*>(object
));
3141 pinned
.insert(object
);
3144 // removed frozen auth pin ?
3145 if (mdr
->more()->is_remote_frozen_authpin
&&
3146 ack
->get_authpin_freeze() == MDSCacheObjectInfo()) {
3147 auto stat_p
= mdr
->find_object_state(mdr
->more()->rename_inode
);
3148 ceph_assert(stat_p
);
3149 if (stat_p
->remote_auth_pinned
== from
) {
3150 mdr
->more()->is_remote_frozen_authpin
= false;
3154 // removed auth pins?
3155 for (auto& p
: mdr
->object_states
) {
3156 if (p
.second
.remote_auth_pinned
== MDS_RANK_NONE
)
3158 MDSCacheObject
* object
= p
.first
;
3159 if (p
.second
.remote_auth_pinned
== from
&& pinned
.count(object
) == 0) {
3160 dout(10) << " remote has unpinned " << *object
<< dendl
;
3161 mdr
->_clear_remote_auth_pinned(p
.second
);
3166 mdr
->more()->peers
.insert(from
);
3168 // clear from waiting list
3169 auto ret
= mdr
->more()->waiting_on_peer
.erase(from
);
3172 if (ack
->is_error_rofs()) {
3173 mdr
->more()->peer_error
= -CEPHFS_EROFS
;
3174 } else if (ack
->is_error_wouldblock()) {
3175 mdr
->more()->peer_error
= -CEPHFS_EWOULDBLOCK
;
3179 if (mdr
->more()->waiting_on_peer
.empty())
3180 mdcache
->dispatch_request(mdr
);
3182 dout(10) << "still waiting on peers " << mdr
->more()->waiting_on_peer
<< dendl
;
3186 // ---------------------------------------
3191 * check whether we are permitted to complete a request
3193 * Check whether we have permission to perform the operation specified
3194 * by mask on the given inode, based on the capability in the mdr's
3197 bool Server::check_access(MDRequestRef
& mdr
, CInode
*in
, unsigned mask
)
3200 int r
= mdr
->session
->check_access(
3202 mdr
->client_request
->get_caller_uid(),
3203 mdr
->client_request
->get_caller_gid(),
3204 &mdr
->client_request
->get_caller_gid_list(),
3205 mdr
->client_request
->head
.args
.setattr
.uid
,
3206 mdr
->client_request
->head
.args
.setattr
.gid
);
3208 respond_to_request(mdr
, r
);
3216 * check whether fragment has reached maximum size
3219 bool Server::check_fragment_space(MDRequestRef
&mdr
, CDir
*dir
)
3221 const auto size
= dir
->get_frag_size();
3222 const auto max
= bal_fragment_size_max
;
3224 dout(10) << "fragment " << *dir
<< " size exceeds " << max
<< " (CEPHFS_ENOSPC)" << dendl
;
3225 respond_to_request(mdr
, -CEPHFS_ENOSPC
);
3228 dout(20) << "fragment " << *dir
<< " size " << size
<< " < " << max
<< dendl
;
3235 * check whether entries in a dir reached maximum size
3238 bool Server::check_dir_max_entries(MDRequestRef
&mdr
, CDir
*in
)
3240 const uint64_t size
= in
->inode
->get_projected_inode()->dirstat
.nfiles
+
3241 in
->inode
->get_projected_inode()->dirstat
.nsubdirs
;
3242 if (dir_max_entries
&& size
>= dir_max_entries
) {
3243 dout(10) << "entries per dir " << *in
<< " size exceeds " << dir_max_entries
<< " (ENOSPC)" << dendl
;
3244 respond_to_request(mdr
, -ENOSPC
);
3251 CDentry
* Server::prepare_stray_dentry(MDRequestRef
& mdr
, CInode
*in
)
3254 in
->name_stray_dentry(straydname
);
3256 CDentry
*straydn
= mdr
->straydn
;
3258 ceph_assert(straydn
->get_name() == straydname
);
3261 CDir
*straydir
= mdcache
->get_stray_dir(in
);
3263 if (!mdr
->client_request
->is_replay() &&
3264 !check_fragment_space(mdr
, straydir
))
3267 straydn
= straydir
->lookup(straydname
);
3269 if (straydir
->is_frozen_dir()) {
3270 dout(10) << __func__
<< ": " << *straydir
<< " is frozen, waiting" << dendl
;
3271 mds
->locker
->drop_locks(mdr
.get());
3272 mdr
->drop_local_auth_pins();
3273 straydir
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
3276 straydn
= straydir
->add_null_dentry(straydname
);
3277 straydn
->mark_new();
3279 ceph_assert(straydn
->get_projected_linkage()->is_null());
3282 straydn
->state_set(CDentry::STATE_STRAY
);
3283 mdr
->straydn
= straydn
;
3289 /** prepare_new_inode
3291 * create a new inode. set c/m/atime. hit dir pop.
3293 CInode
* Server::prepare_new_inode(MDRequestRef
& mdr
, CDir
*dir
, inodeno_t useino
, unsigned mode
,
3294 const file_layout_t
*layout
)
3296 CInode
*in
= new CInode(mdcache
);
3297 auto _inode
= in
->_get_inode();
3299 // Server::prepare_force_open_sessions() can re-open session in closing
3300 // state. In that corner case, session's prealloc_inos are being freed.
3301 // To simplify the code, we disallow using/refilling session's prealloc_ino
3302 // while session is opening.
3303 bool allow_prealloc_inos
= mdr
->session
->is_open();
3306 if (allow_prealloc_inos
&& (mdr
->used_prealloc_ino
= _inode
->ino
= mdr
->session
->take_ino(useino
))) {
3307 mds
->sessionmap
.mark_projected(mdr
->session
);
3308 dout(10) << "prepare_new_inode used_prealloc " << mdr
->used_prealloc_ino
3309 << " (" << mdr
->session
->info
.prealloc_inos
.size() << " left)"
3313 _inode
->ino
= mds
->inotable
->project_alloc_id(useino
);
3314 dout(10) << "prepare_new_inode alloc " << mdr
->alloc_ino
<< dendl
;
3317 if (useino
&& useino
!= _inode
->ino
) {
3318 dout(0) << "WARNING: client specified " << useino
<< " and i allocated " << _inode
->ino
<< dendl
;
3319 mds
->clog
->error() << mdr
->client_request
->get_source()
3320 << " specified ino " << useino
3321 << " but mds." << mds
->get_nodeid() << " allocated " << _inode
->ino
;
3322 //ceph_abort(); // just for now.
3325 if (allow_prealloc_inos
&&
3326 mdr
->session
->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos
/ 2) {
3327 int need
= g_conf()->mds_client_prealloc_inos
- mdr
->session
->get_num_projected_prealloc_inos();
3328 mds
->inotable
->project_alloc_ids(mdr
->prealloc_inos
, need
);
3329 ceph_assert(mdr
->prealloc_inos
.size()); // or else fix projected increment semantics
3330 mdr
->session
->pending_prealloc_inos
.insert(mdr
->prealloc_inos
);
3331 mds
->sessionmap
.mark_projected(mdr
->session
);
3332 dout(10) << "prepare_new_inode prealloc " << mdr
->prealloc_inos
<< dendl
;
3335 _inode
->version
= 1;
3336 _inode
->xattr_version
= 1;
3337 _inode
->nlink
= 1; // FIXME
3339 _inode
->mode
= mode
;
3341 // FIPS zeroization audit 20191117: this memset is not security related.
3342 memset(&_inode
->dir_layout
, 0, sizeof(_inode
->dir_layout
));
3343 if (_inode
->is_dir()) {
3344 _inode
->dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
3345 } else if (layout
) {
3346 _inode
->layout
= *layout
;
3348 _inode
->layout
= mdcache
->default_file_layout
;
3351 _inode
->truncate_size
= -1ull; // not truncated, yet!
3352 _inode
->truncate_seq
= 1; /* starting with 1, 0 is kept for no-truncation logic */
3354 CInode
*diri
= dir
->get_inode();
3356 dout(10) << oct
<< " dir mode 0" << diri
->get_inode()->mode
<< " new mode 0" << mode
<< dec
<< dendl
;
3358 if (diri
->get_inode()->mode
& S_ISGID
) {
3359 dout(10) << " dir is sticky" << dendl
;
3360 _inode
->gid
= diri
->get_inode()->gid
;
3361 if (S_ISDIR(mode
)) {
3362 dout(10) << " new dir also sticky" << dendl
;
3363 _inode
->mode
|= S_ISGID
;
3366 _inode
->gid
= mdr
->client_request
->get_caller_gid();
3368 _inode
->uid
= mdr
->client_request
->get_caller_uid();
3370 _inode
->btime
= _inode
->ctime
= _inode
->mtime
= _inode
->atime
=
3371 mdr
->get_op_stamp();
3373 _inode
->change_attr
= 0;
3375 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
3376 if (req
->get_data().length()) {
3377 auto p
= req
->get_data().cbegin();
3379 // xattrs on new inode?
3380 auto _xattrs
= CInode::allocate_xattr_map();
3381 decode_noshare(*_xattrs
, p
);
3382 dout(10) << "prepare_new_inode setting xattrs " << *_xattrs
<< dendl
;
3383 if (_xattrs
->count("encryption.ctx")) {
3384 _inode
->fscrypt
= true;
3386 in
->reset_xattrs(std::move(_xattrs
));
3389 if (!mds
->mdsmap
->get_inline_data_enabled() ||
3390 !mdr
->session
->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
))
3391 _inode
->inline_data
.version
= CEPH_INLINE_NONE
;
3393 mdcache
->add_inode(in
); // add
3394 dout(10) << "prepare_new_inode " << *in
<< dendl
;
3398 void Server::journal_allocated_inos(MDRequestRef
& mdr
, EMetaBlob
*blob
)
3400 dout(20) << "journal_allocated_inos sessionmapv " << mds
->sessionmap
.get_projected()
3401 << " inotablev " << mds
->inotable
->get_projected_version()
3403 blob
->set_ino_alloc(mdr
->alloc_ino
,
3404 mdr
->used_prealloc_ino
,
3406 mdr
->client_request
->get_source(),
3407 mds
->sessionmap
.get_projected(),
3408 mds
->inotable
->get_projected_version());
3411 void Server::apply_allocated_inos(MDRequestRef
& mdr
, Session
*session
)
3413 dout(10) << "apply_allocated_inos " << mdr
->alloc_ino
3414 << " / " << mdr
->prealloc_inos
3415 << " / " << mdr
->used_prealloc_ino
<< dendl
;
3417 if (mdr
->alloc_ino
) {
3418 mds
->inotable
->apply_alloc_id(mdr
->alloc_ino
);
3420 if (mdr
->prealloc_inos
.size()) {
3421 ceph_assert(session
);
3422 session
->pending_prealloc_inos
.subtract(mdr
->prealloc_inos
);
3423 session
->free_prealloc_inos
.insert(mdr
->prealloc_inos
);
3424 session
->info
.prealloc_inos
.insert(mdr
->prealloc_inos
);
3425 mds
->sessionmap
.mark_dirty(session
, !mdr
->used_prealloc_ino
);
3426 mds
->inotable
->apply_alloc_ids(mdr
->prealloc_inos
);
3428 if (mdr
->used_prealloc_ino
) {
3429 ceph_assert(session
);
3430 session
->info
.prealloc_inos
.erase(mdr
->used_prealloc_ino
);
3431 mds
->sessionmap
.mark_dirty(session
);
3435 class C_MDS_TryFindInode
: public ServerContext
{
3438 C_MDS_TryFindInode(Server
*s
, MDRequestRef
& r
) : ServerContext(s
), mdr(r
) {}
3439 void finish(int r
) override
{
3440 if (r
== -CEPHFS_ESTALE
) // :( find_ino_peers failed
3441 server
->respond_to_request(mdr
, r
);
3443 server
->dispatch_client_request(mdr
);
3447 /* If this returns null, the request has been handled
3448 * as appropriate: forwarded on, or the client's been replied to */
3449 CInode
* Server::rdlock_path_pin_ref(MDRequestRef
& mdr
,
3453 const filepath
& refpath
= mdr
->get_filepath();
3454 dout(10) << "rdlock_path_pin_ref " << *mdr
<< " " << refpath
<< dendl
;
3456 if (mdr
->locking_state
& MutationImpl::PATH_LOCKED
)
3460 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, true);
3462 if (refpath
.is_last_snap()) {
3466 if (!no_want_auth
&& forward_all_requests_to_auth
)
3468 flags
|= MDS_TRAVERSE_RDLOCK_PATH
| MDS_TRAVERSE_RDLOCK_SNAP
;
3471 flags
|= MDS_TRAVERSE_WANT_AUTH
;
3472 int r
= mdcache
->path_traverse(mdr
, cf
, refpath
, flags
, &mdr
->dn
[0], &mdr
->in
[0]);
3474 return nullptr; // delayed
3475 if (r
< 0) { // error
3476 if (r
== -CEPHFS_ENOENT
&& !mdr
->dn
[0].empty()) {
3477 if (mdr
->client_request
&&
3478 mdr
->client_request
->get_dentry_wanted())
3479 mdr
->tracedn
= mdr
->dn
[0].back();
3480 respond_to_request(mdr
, r
);
3481 } else if (r
== -CEPHFS_ESTALE
) {
3482 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl
;
3483 MDSContext
*c
= new C_MDS_TryFindInode(this, mdr
);
3484 mdcache
->find_ino_peers(refpath
.get_ino(), c
);
3486 dout(10) << "FAIL on error " << r
<< dendl
;
3487 respond_to_request(mdr
, r
);
3491 CInode
*ref
= mdr
->in
[0];
3492 dout(10) << "ref is " << *ref
<< dendl
;
3496 // do NOT proceed if freezing, as cap release may defer in that case, and
3497 // we could deadlock when we try to lock @ref.
3498 // if we're already auth_pinned, continue; the release has already been processed.
3499 if (ref
->is_frozen() || ref
->is_frozen_auth_pin() ||
3500 (ref
->is_freezing() && !mdr
->is_auth_pinned(ref
))) {
3501 dout(7) << "waiting for !frozen/authpinnable on " << *ref
<< dendl
;
3502 ref
->add_waiter(CInode::WAIT_UNFREEZE
, cf
.build());
3503 if (mdr
->is_any_remote_auth_pin())
3504 mds
->locker
->notify_freeze_waiter(ref
);
3516 /** rdlock_path_xlock_dentry
3517 * traverse path to the directory that could/would contain dentry.
3518 * make sure i am auth for that dentry, forward as necessary.
3519 * create null dentry in place (or use existing if okexist).
3520 * get rdlocks on traversed dentries, xlock on new dentry.
3522 CDentry
* Server::rdlock_path_xlock_dentry(MDRequestRef
& mdr
,
3523 bool create
, bool okexist
, bool want_layout
)
3525 const filepath
& refpath
= mdr
->get_filepath();
3526 dout(10) << "rdlock_path_xlock_dentry " << *mdr
<< " " << refpath
<< dendl
;
3528 if (mdr
->locking_state
& MutationImpl::PATH_LOCKED
)
3529 return mdr
->dn
[0].back();
3531 // figure parent dir vs dname
3532 if (refpath
.depth() == 0) {
3533 dout(7) << "invalid path (zero length)" << dendl
;
3534 respond_to_request(mdr
, -CEPHFS_EINVAL
);
3538 if (refpath
.is_last_snap()) {
3539 respond_to_request(mdr
, -CEPHFS_EROFS
);
3543 if (refpath
.is_last_dot_or_dotdot()) {
3544 dout(7) << "invalid path (last dot or dot_dot)" << dendl
;
3546 respond_to_request(mdr
, -CEPHFS_EEXIST
);
3548 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
3552 // traverse to parent dir
3553 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, true);
3554 int flags
= MDS_TRAVERSE_RDLOCK_SNAP
| MDS_TRAVERSE_RDLOCK_PATH
|
3555 MDS_TRAVERSE_WANT_DENTRY
| MDS_TRAVERSE_XLOCK_DENTRY
|
3556 MDS_TRAVERSE_WANT_AUTH
;
3557 if (refpath
.depth() == 1 && !mdr
->lock_cache_disabled
)
3558 flags
|= MDS_TRAVERSE_CHECK_LOCKCACHE
;
3560 flags
|= MDS_TRAVERSE_RDLOCK_AUTHLOCK
;
3562 flags
|= MDS_TRAVERSE_WANT_DIRLAYOUT
;
3563 int r
= mdcache
->path_traverse(mdr
, cf
, refpath
, flags
, &mdr
->dn
[0]);
3565 return nullptr; // delayed
3567 if (r
== -CEPHFS_ESTALE
) {
3568 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl
;
3569 mdcache
->find_ino_peers(refpath
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
3572 respond_to_request(mdr
, r
);
3576 CDentry
*dn
= mdr
->dn
[0].back();
3577 CDir
*dir
= dn
->get_dir();
3578 CInode
*diri
= dir
->get_inode();
3580 if (!mdr
->reqid
.name
.is_mds()) {
3581 if (diri
->is_system() && !diri
->is_root()) {
3582 respond_to_request(mdr
, -CEPHFS_EROFS
);
3587 if (!diri
->is_base() && diri
->get_projected_parent_dir()->inode
->is_stray()) {
3588 respond_to_request(mdr
, -CEPHFS_ENOENT
);
3592 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
3593 if (dnl
->is_null()) {
3594 if (!create
&& okexist
) {
3595 respond_to_request(mdr
, -CEPHFS_ENOENT
);
3599 snapid_t next_snap
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
3600 dn
->first
= std::max(dn
->first
, next_snap
);
3603 respond_to_request(mdr
, -CEPHFS_EEXIST
);
3606 mdr
->in
[0] = dnl
->get_inode();
3612 /** rdlock_two_paths_xlock_destdn
3613 * traverse two paths and lock the two paths in proper order.
3614 * The order of taking locks is:
3615 * 1. Lock directory inodes or dentries according to which trees they
3616 * are under. Lock objects under fs root before objects under mdsdir.
3617 * 2. Lock directory inodes or dentries according to their depth, in
3619 * 3. Lock directory inodes or dentries according to inode numbers or
3620 * dentries' parent inode numbers, in ascending order.
3621 * 4. Lock dentries in the same directory in order of their keys.
3622 * 5. Lock non-directory inodes according to inode numbers, in ascending
3625 std::pair
<CDentry
*, CDentry
*>
3626 Server::rdlock_two_paths_xlock_destdn(MDRequestRef
& mdr
, bool xlock_srcdn
)
3629 const filepath
& refpath
= mdr
->get_filepath();
3630 const filepath
& refpath2
= mdr
->get_filepath2();
3632 dout(10) << "rdlock_two_paths_xlock_destdn " << *mdr
<< " " << refpath
<< " " << refpath2
<< dendl
;
3634 if (mdr
->locking_state
& MutationImpl::PATH_LOCKED
)
3635 return std::make_pair(mdr
->dn
[0].back(), mdr
->dn
[1].back());
3637 if (refpath
.depth() != 1 || refpath2
.depth() != 1) {
3638 respond_to_request(mdr
, -CEPHFS_EINVAL
);
3639 return std::pair
<CDentry
*, CDentry
*>(nullptr, nullptr);
3642 if (refpath
.is_last_snap() || refpath2
.is_last_snap()) {
3643 respond_to_request(mdr
, -CEPHFS_EROFS
);
3644 return std::make_pair(nullptr, nullptr);
3647 // traverse to parent dir
3648 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, true);
3649 int flags
= MDS_TRAVERSE_RDLOCK_SNAP
| MDS_TRAVERSE_WANT_DENTRY
| MDS_TRAVERSE_WANT_AUTH
;
3650 int r
= mdcache
->path_traverse(mdr
, cf
, refpath
, flags
, &mdr
->dn
[0]);
3652 if (r
== -CEPHFS_ESTALE
) {
3653 dout(10) << "CEPHFS_ESTALE on path, attempting recovery" << dendl
;
3654 mdcache
->find_ino_peers(refpath
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
3656 respond_to_request(mdr
, r
);
3658 return std::make_pair(nullptr, nullptr);
3661 flags
= MDS_TRAVERSE_RDLOCK_SNAP2
| MDS_TRAVERSE_WANT_DENTRY
| MDS_TRAVERSE_DISCOVER
;
3662 r
= mdcache
->path_traverse(mdr
, cf
, refpath2
, flags
, &mdr
->dn
[1]);
3664 if (r
== -CEPHFS_ESTALE
) {
3665 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl
;
3666 mdcache
->find_ino_peers(refpath2
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
3668 respond_to_request(mdr
, r
);
3670 return std::make_pair(nullptr, nullptr);
3673 CDentry
*srcdn
= mdr
->dn
[1].back();
3674 CDir
*srcdir
= srcdn
->get_dir();
3675 CDentry
*destdn
= mdr
->dn
[0].back();
3676 CDir
*destdir
= destdn
->get_dir();
3678 if (!mdr
->reqid
.name
.is_mds()) {
3679 if ((srcdir
->get_inode()->is_system() && !srcdir
->get_inode()->is_root()) ||
3680 (destdir
->get_inode()->is_system() && !destdir
->get_inode()->is_root())) {
3681 respond_to_request(mdr
, -CEPHFS_EROFS
);
3682 return std::make_pair(nullptr, nullptr);
3686 if (!destdir
->get_inode()->is_base() &&
3687 destdir
->get_inode()->get_projected_parent_dir()->inode
->is_stray()) {
3688 respond_to_request(mdr
, -CEPHFS_ENOENT
);
3689 return std::make_pair(nullptr, nullptr);
3692 MutationImpl::LockOpVec lov
;
3693 if (srcdir
->get_inode() == destdir
->get_inode()) {
3694 lov
.add_wrlock(&destdir
->inode
->filelock
);
3695 lov
.add_wrlock(&destdir
->inode
->nestlock
);
3696 if (xlock_srcdn
&& srcdir
!= destdir
) {
3697 mds_rank_t srcdir_auth
= srcdir
->authority().first
;
3698 if (srcdir_auth
!= mds
->get_nodeid()) {
3699 lov
.add_remote_wrlock(&srcdir
->inode
->filelock
, srcdir_auth
);
3700 lov
.add_remote_wrlock(&srcdir
->inode
->nestlock
, srcdir_auth
);
3704 if (srcdn
->get_name() > destdn
->get_name())
3705 lov
.add_xlock(&destdn
->lock
);
3708 lov
.add_xlock(&srcdn
->lock
);
3710 lov
.add_rdlock(&srcdn
->lock
);
3712 if (srcdn
->get_name() < destdn
->get_name())
3713 lov
.add_xlock(&destdn
->lock
);
3715 int cmp
= mdr
->compare_paths();
3716 bool lock_destdir_first
=
3717 (cmp
< 0 || (cmp
== 0 && destdir
->ino() < srcdir
->ino()));
3719 if (lock_destdir_first
) {
3720 lov
.add_wrlock(&destdir
->inode
->filelock
);
3721 lov
.add_wrlock(&destdir
->inode
->nestlock
);
3722 lov
.add_xlock(&destdn
->lock
);
3726 mds_rank_t srcdir_auth
= srcdir
->authority().first
;
3727 if (srcdir_auth
== mds
->get_nodeid()) {
3728 lov
.add_wrlock(&srcdir
->inode
->filelock
);
3729 lov
.add_wrlock(&srcdir
->inode
->nestlock
);
3731 lov
.add_remote_wrlock(&srcdir
->inode
->filelock
, srcdir_auth
);
3732 lov
.add_remote_wrlock(&srcdir
->inode
->nestlock
, srcdir_auth
);
3734 lov
.add_xlock(&srcdn
->lock
);
3736 lov
.add_rdlock(&srcdn
->lock
);
3739 if (!lock_destdir_first
) {
3740 lov
.add_wrlock(&destdir
->inode
->filelock
);
3741 lov
.add_wrlock(&destdir
->inode
->nestlock
);
3742 lov
.add_xlock(&destdn
->lock
);
3746 CInode
*auth_pin_freeze
= nullptr;
3747 // XXX any better way to do this?
3748 if (xlock_srcdn
&& !srcdn
->is_auth()) {
3749 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
3750 auth_pin_freeze
= srcdnl
->is_primary() ? srcdnl
->get_inode() : nullptr;
3752 if (!mds
->locker
->acquire_locks(mdr
, lov
, auth_pin_freeze
))
3753 return std::make_pair(nullptr, nullptr);
3755 if (srcdn
->get_projected_linkage()->is_null()) {
3756 respond_to_request(mdr
, -CEPHFS_ENOENT
);
3757 return std::make_pair(nullptr, nullptr);
3760 if (destdn
->get_projected_linkage()->is_null()) {
3761 snapid_t next_snap
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
3762 destdn
->first
= std::max(destdn
->first
, next_snap
);
3765 mdr
->locking_state
|= MutationImpl::PATH_LOCKED
;
3767 return std::make_pair(destdn
, srcdn
);
3771 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
3773 * @param diri base inode
3774 * @param fg the exact frag we want
3775 * @param mdr request
3776 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
3778 CDir
* Server::try_open_auth_dirfrag(CInode
*diri
, frag_t fg
, MDRequestRef
& mdr
)
3780 CDir
*dir
= diri
->get_dirfrag(fg
);
3783 // am i auth for the dirfrag?
3784 if (!dir
->is_auth()) {
3785 mds_rank_t auth
= dir
->authority().first
;
3786 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3787 << ", fw to mds." << auth
<< dendl
;
3788 mdcache
->request_forward(mdr
, auth
);
3792 // not open and inode not mine?
3793 if (!diri
->is_auth()) {
3794 mds_rank_t inauth
= diri
->authority().first
;
3795 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth
<< dendl
;
3796 mdcache
->request_forward(mdr
, inauth
);
3800 // not open and inode frozen?
3801 if (diri
->is_frozen()) {
3802 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri
<< dendl
;
3803 ceph_assert(diri
->get_parent_dir());
3804 diri
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
3809 dir
= diri
->get_or_open_dirfrag(mdcache
, fg
);
3816 // ===============================================================================
3819 void Server::handle_client_getattr(MDRequestRef
& mdr
, bool is_lookup
)
3821 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
3823 if (req
->get_filepath().depth() == 0 && is_lookup
) {
3824 // refpath can't be empty for lookup but it can for
3825 // getattr (we do getattr with empty refpath for mount of '/')
3826 respond_to_request(mdr
, -CEPHFS_EINVAL
);
3830 bool want_auth
= false;
3831 int mask
= req
->head
.args
.getattr
.mask
;
3832 if (mask
& CEPH_STAT_RSTAT
)
3833 want_auth
= true; // set want_auth for CEPH_STAT_RSTAT mask
3835 if (!mdr
->is_batch_head() && mdr
->can_batch()) {
3836 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, false);
3837 int r
= mdcache
->path_traverse(mdr
, cf
, mdr
->get_filepath(),
3838 (want_auth
? MDS_TRAVERSE_WANT_AUTH
: 0),
3839 &mdr
->dn
[0], &mdr
->in
[0]);
3844 // fall-thru. let rdlock_path_pin_ref() check again.
3845 } else if (is_lookup
) {
3846 CDentry
* dn
= mdr
->dn
[0].back();
3848 auto em
= dn
->batch_ops
.emplace(std::piecewise_construct
, std::forward_as_tuple(mask
), std::forward_as_tuple());
3850 em
.first
->second
= std::make_unique
<Batch_Getattr_Lookup
>(this, mdr
);
3852 dout(20) << __func__
<< ": LOOKUP op, wait for previous same getattr ops to respond. " << *mdr
<< dendl
;
3853 em
.first
->second
->add_request(mdr
);
3857 CInode
*in
= mdr
->in
[0];
3859 auto em
= in
->batch_ops
.emplace(std::piecewise_construct
, std::forward_as_tuple(mask
), std::forward_as_tuple());
3861 em
.first
->second
= std::make_unique
<Batch_Getattr_Lookup
>(this, mdr
);
3863 dout(20) << __func__
<< ": GETATTR op, wait for previous same getattr ops to respond. " << *mdr
<< dendl
;
3864 em
.first
->second
->add_request(mdr
);
3870 CInode
*ref
= rdlock_path_pin_ref(mdr
, want_auth
, false);
3874 mdr
->getattr_caps
= mask
;
3877 * if client currently holds the EXCL cap on a field, do not rdlock
3878 * it; client's stat() will result in valid info if _either_ EXCL
3879 * cap is held or MDS rdlocks and reads the value here.
3881 * handling this case here is easier than weakening rdlock
3882 * semantics... that would cause problems elsewhere.
3884 client_t client
= mdr
->get_client();
3886 Capability
*cap
= ref
->get_client_cap(client
);
3887 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
||
3888 mdr
->snapid
<= cap
->client_follows
))
3889 issued
= cap
->issued();
3892 MutationImpl::LockOpVec lov
;
3893 if ((mask
& CEPH_CAP_LINK_SHARED
) && !(issued
& CEPH_CAP_LINK_EXCL
))
3894 lov
.add_rdlock(&ref
->linklock
);
3895 if ((mask
& CEPH_CAP_AUTH_SHARED
) && !(issued
& CEPH_CAP_AUTH_EXCL
))
3896 lov
.add_rdlock(&ref
->authlock
);
3897 if ((mask
& CEPH_CAP_XATTR_SHARED
) && !(issued
& CEPH_CAP_XATTR_EXCL
))
3898 lov
.add_rdlock(&ref
->xattrlock
);
3899 if ((mask
& CEPH_CAP_FILE_SHARED
) && !(issued
& CEPH_CAP_FILE_EXCL
)) {
3900 // Don't wait on unstable filelock if client is allowed to read file size.
3901 // This can reduce the response time of getattr in the case that multiple
3902 // clients do stat(2) and there are writers.
3903 // The downside of this optimization is that mds may not issue Fs caps along
3904 // with getattr reply. Client may need to send more getattr requests.
3905 if (mdr
->is_rdlocked(&ref
->filelock
)) {
3906 lov
.add_rdlock(&ref
->filelock
);
3907 } else if (ref
->filelock
.is_stable() ||
3908 ref
->filelock
.get_num_wrlocks() > 0 ||
3909 !ref
->filelock
.can_read(mdr
->get_client())) {
3910 lov
.add_rdlock(&ref
->filelock
);
3911 mdr
->locking_state
&= ~MutationImpl::ALL_LOCKED
;
3915 if (!mds
->locker
->acquire_locks(mdr
, lov
))
3918 if (!check_access(mdr
, ref
, MAY_READ
))
3921 utime_t now
= ceph_clock_now();
3922 mdr
->set_mds_stamp(now
);
3924 // note which caps are requested, so we return at least a snapshot
3925 // value for them. (currently this matters for xattrs and inline data)
3926 mdr
->getattr_caps
= mask
;
3928 mds
->balancer
->hit_inode(ref
, META_POP_IRD
, req
->get_source().num());
3931 dout(10) << "reply to stat on " << *req
<< dendl
;
3934 mdr
->tracedn
= mdr
->dn
[0].back();
3935 respond_to_request(mdr
, 0);
3938 struct C_MDS_LookupIno2
: public ServerContext
{
3940 C_MDS_LookupIno2(Server
*s
, MDRequestRef
& r
) : ServerContext(s
), mdr(r
) {}
3941 void finish(int r
) override
{
3942 server
->_lookup_ino_2(mdr
, r
);
3949 void Server::handle_client_lookup_ino(MDRequestRef
& mdr
,
3950 bool want_parent
, bool want_dentry
)
3952 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
3954 if ((uint64_t)req
->head
.args
.lookupino
.snapid
> 0)
3955 return _lookup_snap_ino(mdr
);
3957 inodeno_t ino
= req
->get_filepath().get_ino();
3958 auto _ino
= ino
.val
;
3960 /* It's been observed [1] that a client may lookup a private ~mdsdir inode.
3961 * I do not have an explanation for how that happened organically but this
3962 * check will ensure that the client can no longer do that.
3964 * [1] https://tracker.ceph.com/issues/49922
3966 if (MDS_IS_PRIVATE_INO(_ino
)) {
3967 respond_to_request(mdr
, -CEPHFS_ESTALE
);
3971 CInode
*in
= mdcache
->get_inode(ino
);
3972 if (in
&& in
->state_test(CInode::STATE_PURGING
)) {
3973 respond_to_request(mdr
, -CEPHFS_ESTALE
);
3977 mdcache
->open_ino(ino
, (int64_t)-1, new C_MDS_LookupIno2(this, mdr
), false);
3981 // check for nothing (not read or write); this still applies the
3983 if (!check_access(mdr
, in
, 0))
3986 CDentry
*dn
= in
->get_projected_parent_dn();
3987 CInode
*diri
= dn
? dn
->get_dir()->inode
: NULL
;
3989 MutationImpl::LockOpVec lov
;
3990 if (dn
&& (want_parent
|| want_dentry
)) {
3992 lov
.add_rdlock(&dn
->lock
);
3995 unsigned mask
= req
->head
.args
.lookupino
.mask
;
3997 Capability
*cap
= in
->get_client_cap(mdr
->get_client());
3999 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
|| mdr
->snapid
<= cap
->client_follows
))
4000 issued
= cap
->issued();
4002 // permission bits, ACL/security xattrs
4003 if ((mask
& CEPH_CAP_AUTH_SHARED
) && (issued
& CEPH_CAP_AUTH_EXCL
) == 0)
4004 lov
.add_rdlock(&in
->authlock
);
4005 if ((mask
& CEPH_CAP_XATTR_SHARED
) && (issued
& CEPH_CAP_XATTR_EXCL
) == 0)
4006 lov
.add_rdlock(&in
->xattrlock
);
4008 mdr
->getattr_caps
= mask
;
4012 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4016 // need read access to directory inode
4017 if (!check_access(mdr
, diri
, MAY_READ
))
4023 if (in
->is_base()) {
4024 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4027 if (!diri
|| diri
->is_stray()) {
4028 respond_to_request(mdr
, -CEPHFS_ESTALE
);
4031 dout(10) << "reply to lookup_parent " << *in
<< dendl
;
4033 respond_to_request(mdr
, 0);
4036 inodeno_t dirino
= req
->get_filepath2().get_ino();
4037 if (!diri
|| (dirino
!= inodeno_t() && diri
->ino() != dirino
)) {
4038 respond_to_request(mdr
, -CEPHFS_ENOENT
);
4041 dout(10) << "reply to lookup_name " << *in
<< dendl
;
4043 dout(10) << "reply to lookup_ino " << *in
<< dendl
;
4048 respond_to_request(mdr
, 0);
4052 void Server::_lookup_snap_ino(MDRequestRef
& mdr
)
4054 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4057 vino
.ino
= req
->get_filepath().get_ino();
4058 vino
.snapid
= (__u64
)req
->head
.args
.lookupino
.snapid
;
4059 inodeno_t parent_ino
= (__u64
)req
->head
.args
.lookupino
.parent
;
4060 __u32 hash
= req
->head
.args
.lookupino
.hash
;
4062 dout(7) << "lookup_snap_ino " << vino
<< " parent " << parent_ino
<< " hash " << hash
<< dendl
;
4064 CInode
*in
= mdcache
->lookup_snap_inode(vino
);
4066 in
= mdcache
->get_inode(vino
.ino
);
4068 if (in
->state_test(CInode::STATE_PURGING
) ||
4069 !in
->has_snap_data(vino
.snapid
)) {
4070 if (in
->is_dir() || !parent_ino
) {
4071 respond_to_request(mdr
, -CEPHFS_ESTALE
);
4080 dout(10) << "reply to lookup_snap_ino " << *in
<< dendl
;
4081 mdr
->snapid
= vino
.snapid
;
4083 respond_to_request(mdr
, 0);
4087 CInode
*diri
= NULL
;
4089 diri
= mdcache
->get_inode(parent_ino
);
4091 mdcache
->open_ino(parent_ino
, mds
->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr
));
4095 if (!diri
->is_dir()) {
4096 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4100 MutationImpl::LockOpVec lov
;
4101 lov
.add_rdlock(&diri
->dirfragtreelock
);
4102 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4105 frag_t frag
= diri
->dirfragtree
[hash
];
4106 CDir
*dir
= try_open_auth_dirfrag(diri
, frag
, mdr
);
4110 if (!dir
->is_complete()) {
4111 if (dir
->is_frozen()) {
4112 mds
->locker
->drop_locks(mdr
.get());
4113 mdr
->drop_local_auth_pins();
4114 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
4117 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
), true);
4121 respond_to_request(mdr
, -CEPHFS_ESTALE
);
4123 mdcache
->open_ino(vino
.ino
, mds
->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr
), false);
4127 void Server::_lookup_ino_2(MDRequestRef
& mdr
, int r
)
4129 inodeno_t ino
= mdr
->client_request
->get_filepath().get_ino();
4130 dout(10) << "_lookup_ino_2 " << mdr
.get() << " ino " << ino
<< " r=" << r
<< dendl
;
4132 // `r` is a rank if >=0, else an error code
4134 mds_rank_t
dest_rank(r
);
4135 if (dest_rank
== mds
->get_nodeid())
4136 dispatch_client_request(mdr
);
4138 mdcache
->request_forward(mdr
, dest_rank
);
4143 if (r
== -CEPHFS_ENOENT
|| r
== -CEPHFS_ENODATA
)
4145 respond_to_request(mdr
, r
);
4149 /* This function takes responsibility for the passed mdr*/
4150 void Server::handle_client_open(MDRequestRef
& mdr
)
4152 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4153 dout(7) << "open on " << req
->get_filepath() << dendl
;
4155 int flags
= req
->head
.args
.open
.flags
;
4156 int cmode
= ceph_flags_to_mode(flags
);
4158 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4162 bool need_auth
= !file_mode_is_readonly(cmode
) ||
4163 (flags
& (CEPH_O_TRUNC
| CEPH_O_DIRECTORY
));
4165 if ((cmode
& CEPH_FILE_MODE_WR
) && mdcache
->is_readonly()) {
4166 dout(7) << "read-only FS" << dendl
;
4167 respond_to_request(mdr
, -CEPHFS_EROFS
);
4171 CInode
*cur
= rdlock_path_pin_ref(mdr
, need_auth
);
4175 if (cur
->is_frozen() || cur
->state_test(CInode::STATE_EXPORTINGCAPS
)) {
4176 ceph_assert(!need_auth
);
4177 mdr
->locking_state
&= ~(MutationImpl::PATH_LOCKED
| MutationImpl::ALL_LOCKED
);
4178 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
4183 if (!cur
->is_file()) {
4184 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
4185 cmode
= CEPH_FILE_MODE_PIN
;
4186 // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
4187 if (cur
->is_symlink() && !(flags
& CEPH_O_NOFOLLOW
))
4188 flags
&= ~CEPH_O_TRUNC
;
4191 dout(10) << "open flags = " << flags
4192 << ", filemode = " << cmode
4193 << ", need_auth = " << need_auth
4197 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
4198 dout(7) << "not a file or dir " << *cur << dendl;
4199 respond_to_request(mdr, -CEPHFS_ENXIO); // FIXME what error do we want?
4202 if ((flags
& CEPH_O_DIRECTORY
) && !cur
->is_dir() && !cur
->is_symlink()) {
4203 dout(7) << "specified O_DIRECTORY on non-directory " << *cur
<< dendl
;
4204 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4208 if ((flags
& CEPH_O_TRUNC
) && !cur
->is_file()) {
4209 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur
<< dendl
;
4210 // we should return -CEPHFS_EISDIR for directory, return -CEPHFS_EINVAL for other non-regular
4211 respond_to_request(mdr
, cur
->is_dir() ? -CEPHFS_EISDIR
: -CEPHFS_EINVAL
);
4215 if (cur
->get_inode()->inline_data
.version
!= CEPH_INLINE_NONE
&&
4216 !mdr
->session
->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
)) {
4217 dout(7) << "old client cannot open inline data file " << *cur
<< dendl
;
4218 respond_to_request(mdr
, -CEPHFS_EPERM
);
4222 // snapped data is read only
4223 if (mdr
->snapid
!= CEPH_NOSNAP
&&
4224 ((cmode
& CEPH_FILE_MODE_WR
) || req
->may_write())) {
4225 dout(7) << "snap " << mdr
->snapid
<< " is read-only " << *cur
<< dendl
;
4226 respond_to_request(mdr
, -CEPHFS_EROFS
);
4230 MutationImpl::LockOpVec lov
;
4232 unsigned mask
= req
->head
.args
.open
.mask
;
4234 Capability
*cap
= cur
->get_client_cap(mdr
->get_client());
4236 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
|| mdr
->snapid
<= cap
->client_follows
))
4237 issued
= cap
->issued();
4238 // permission bits, ACL/security xattrs
4239 if ((mask
& CEPH_CAP_AUTH_SHARED
) && (issued
& CEPH_CAP_AUTH_EXCL
) == 0)
4240 lov
.add_rdlock(&cur
->authlock
);
4241 if ((mask
& CEPH_CAP_XATTR_SHARED
) && (issued
& CEPH_CAP_XATTR_EXCL
) == 0)
4242 lov
.add_rdlock(&cur
->xattrlock
);
4244 mdr
->getattr_caps
= mask
;
4248 if ((flags
& CEPH_O_TRUNC
) && !mdr
->has_completed
) {
4249 ceph_assert(cur
->is_auth());
4251 lov
.add_xlock(&cur
->filelock
);
4252 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4255 if (!check_access(mdr
, cur
, MAY_WRITE
))
4258 // wait for pending truncate?
4259 const auto& pi
= cur
->get_projected_inode();
4260 if (pi
->is_truncating()) {
4261 dout(10) << " waiting for pending truncate from " << pi
->truncate_from
4262 << " to " << pi
->truncate_size
<< " to complete on " << *cur
<< dendl
;
4263 mds
->locker
->drop_locks(mdr
.get());
4264 mdr
->drop_local_auth_pins();
4265 cur
->add_waiter(CInode::WAIT_TRUNC
, new C_MDS_RetryRequest(mdcache
, mdr
));
4269 do_open_truncate(mdr
, cmode
);
4273 // sync filelock if snapped.
4274 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
4275 // and that data itself is flushed so that we can read the snapped data off disk.
4276 if (mdr
->snapid
!= CEPH_NOSNAP
&& !cur
->is_dir()) {
4277 lov
.add_rdlock(&cur
->filelock
);
4280 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4284 if (cmode
& CEPH_FILE_MODE_WR
)
4286 if (!check_access(mdr
, cur
, mask
))
4289 utime_t now
= ceph_clock_now();
4290 mdr
->set_mds_stamp(now
);
4292 if (cur
->is_file() || cur
->is_dir()) {
4293 if (mdr
->snapid
== CEPH_NOSNAP
) {
4295 Capability
*cap
= mds
->locker
->issue_new_caps(cur
, cmode
, mdr
, nullptr);
4297 dout(12) << "open issued caps " << ccap_string(cap
->pending())
4298 << " for " << req
->get_source()
4299 << " on " << *cur
<< dendl
;
4301 int caps
= ceph_caps_for_mode(cmode
);
4302 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps
)
4303 << " for " << req
->get_source()
4304 << " snapid " << mdr
->snapid
4305 << " on " << *cur
<< dendl
;
4306 mdr
->snap_caps
= caps
;
4310 // increase max_size?
4311 if (cmode
& CEPH_FILE_MODE_WR
)
4312 mds
->locker
->check_inode_max_size(cur
);
4314 // make sure this inode gets into the journal
4315 if (cur
->is_auth() && cur
->last
== CEPH_NOSNAP
&&
4316 mdcache
->open_file_table
.should_log_open(cur
)) {
4317 EOpen
*le
= new EOpen(mds
->mdlog
);
4318 mdlog
->start_entry(le
);
4319 le
->add_clean_inode(cur
);
4320 mdlog
->submit_entry(le
);
4324 if (cmode
& CEPH_FILE_MODE_WR
)
4325 mds
->balancer
->hit_inode(cur
, META_POP_IWR
);
4327 mds
->balancer
->hit_inode(cur
, META_POP_IRD
,
4328 mdr
->client_request
->get_source().num());
4331 if (req
->get_dentry_wanted()) {
4332 ceph_assert(mdr
->dn
[0].size());
4333 dn
= mdr
->dn
[0].back();
4338 respond_to_request(mdr
, 0);
4341 class C_MDS_openc_finish
: public ServerLogContext
{
4345 C_MDS_openc_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ni
) :
4346 ServerLogContext(s
, r
), dn(d
), newi(ni
) {}
4347 void finish(int r
) override
{
4348 ceph_assert(r
== 0);
4350 dn
->pop_projected_linkage();
4352 // dirty inode, dn, dir
4353 newi
->mark_dirty(mdr
->ls
);
4354 newi
->mark_dirty_parent(mdr
->ls
, true);
4358 get_mds()->locker
->share_inode_max_size(newi
);
4360 MDRequestRef null_ref
;
4361 get_mds()->mdcache
->send_dentry_link(dn
, null_ref
);
4363 get_mds()->balancer
->hit_inode(newi
, META_POP_IWR
);
4365 server
->respond_to_request(mdr
, 0);
4367 ceph_assert(g_conf()->mds_kill_openc_at
!= 1);
4371 /* This function takes responsibility for the passed mdr*/
4372 void Server::handle_client_openc(MDRequestRef
& mdr
)
4374 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4375 client_t client
= mdr
->get_client();
4377 dout(7) << "open w/ O_CREAT on " << req
->get_filepath() << dendl
;
4379 int cmode
= ceph_flags_to_mode(req
->head
.args
.open
.flags
);
4381 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4385 bool excl
= req
->head
.args
.open
.flags
& CEPH_O_EXCL
;
4386 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, true, !excl
, true);
4390 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
4391 if (!excl
&& !dnl
->is_null()) {
4393 mds
->locker
->xlock_downgrade(&dn
->lock
, mdr
.get());
4395 MutationImpl::LockOpVec lov
;
4396 lov
.add_rdlock(&dnl
->get_inode()->snaplock
);
4397 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4400 handle_client_open(mdr
);
4404 ceph_assert(dnl
->is_null());
4406 if (req
->get_alternate_name().size() > alternate_name_max
) {
4407 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
4408 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
4411 dn
->set_alternate_name(req
->get_alternate_name());
4414 file_layout_t layout
;
4415 if (mdr
->dir_layout
!= file_layout_t())
4416 layout
= mdr
->dir_layout
;
4418 layout
= mdcache
->default_file_layout
;
4420 // What kind of client caps are required to complete this operation
4421 uint64_t access
= MAY_WRITE
;
4423 const auto default_layout
= layout
;
4425 // fill in any special params from client
4426 if (req
->head
.args
.open
.stripe_unit
)
4427 layout
.stripe_unit
= req
->head
.args
.open
.stripe_unit
;
4428 if (req
->head
.args
.open
.stripe_count
)
4429 layout
.stripe_count
= req
->head
.args
.open
.stripe_count
;
4430 if (req
->head
.args
.open
.object_size
)
4431 layout
.object_size
= req
->head
.args
.open
.object_size
;
4432 if (req
->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID
) &&
4433 (__s32
)req
->head
.args
.open
.pool
>= 0) {
4434 layout
.pool_id
= req
->head
.args
.open
.pool
;
4436 // make sure we have as new a map as the client
4437 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
4438 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
4443 // If client doesn't have capability to modify layout pools, then
4444 // only permit this request if the requested pool matches what the
4445 // file would have inherited anyway from its parent.
4446 if (default_layout
!= layout
) {
4447 access
|= MAY_SET_VXATTR
;
4450 if (!layout
.is_valid()) {
4451 dout(10) << " invalid initial file layout" << dendl
;
4452 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4455 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
4456 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
4457 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4462 CDir
*dir
= dn
->get_dir();
4463 CInode
*diri
= dir
->get_inode();
4464 if (!check_access(mdr
, diri
, access
))
4466 if (!check_fragment_space(mdr
, dir
))
4468 if (!check_dir_max_entries(mdr
, dir
))
4471 if (mdr
->dn
[0].size() == 1)
4472 mds
->locker
->create_lock_cache(mdr
, diri
, &mdr
->dir_layout
);
4475 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
),
4476 req
->head
.args
.open
.mode
| S_IFREG
, &layout
);
4480 dn
->push_projected_linkage(newi
);
4482 auto _inode
= newi
->_get_inode();
4483 _inode
->version
= dn
->pre_dirty();
4484 if (layout
.pool_id
!= mdcache
->default_file_layout
.pool_id
)
4485 _inode
->add_old_pool(mdcache
->default_file_layout
.pool_id
);
4486 _inode
->update_backtrace();
4487 _inode
->rstat
.rfiles
= 1;
4488 _inode
->accounted_rstat
= _inode
->rstat
;
4490 SnapRealm
*realm
= diri
->find_snaprealm();
4491 snapid_t follows
= mdcache
->get_global_snaprealm()->get_newest_seq();
4492 ceph_assert(follows
>= realm
->get_newest_seq());
4494 ceph_assert(dn
->first
== follows
+1);
4495 newi
->first
= dn
->first
;
4498 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
, realm
);
4499 newi
->authlock
.set_state(LOCK_EXCL
);
4500 newi
->xattrlock
.set_state(LOCK_EXCL
);
4502 if (cap
&& (cmode
& CEPH_FILE_MODE_WR
)) {
4503 _inode
->client_ranges
[client
].range
.first
= 0;
4504 _inode
->client_ranges
[client
].range
.last
= _inode
->layout
.stripe_unit
;
4505 _inode
->client_ranges
[client
].follows
= follows
;
4506 newi
->mark_clientwriteable();
4507 cap
->mark_clientwriteable();
4511 mdr
->ls
= mdlog
->get_current_segment();
4512 EUpdate
*le
= new EUpdate(mdlog
, "openc");
4513 mdlog
->start_entry(le
);
4514 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4515 journal_allocated_inos(mdr
, &le
->metablob
);
4516 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
4517 le
->metablob
.add_primary_dentry(dn
, newi
, true, true, true);
4519 // make sure this inode gets into the journal
4520 le
->metablob
.add_opened_ino(newi
->ino());
4522 C_MDS_openc_finish
*fin
= new C_MDS_openc_finish(this, mdr
, dn
, newi
);
4524 if (mdr
->session
->info
.has_feature(CEPHFS_FEATURE_DELEG_INO
)) {
4525 openc_response_t ocresp
;
4527 dout(10) << "adding created_ino and delegated_inos" << dendl
;
4528 ocresp
.created_ino
= _inode
->ino
;
4530 if (delegate_inos_pct
&& !req
->is_queued_for_replay()) {
4531 // Try to delegate some prealloc_inos to the client, if it's down to half the max
4532 unsigned frac
= 100 / delegate_inos_pct
;
4533 if (mdr
->session
->delegated_inos
.size() < (unsigned)g_conf()->mds_client_prealloc_inos
/ frac
/ 2)
4534 mdr
->session
->delegate_inos(g_conf()->mds_client_prealloc_inos
/ frac
, ocresp
.delegated_inos
);
4537 encode(ocresp
, mdr
->reply_extra_bl
);
4538 } else if (mdr
->client_request
->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE
)) {
4539 dout(10) << "adding ino to reply to indicate inode was created" << dendl
;
4540 // add the file created flag onto the reply if create_flags features is supported
4541 encode(newi
->ino(), mdr
->reply_extra_bl
);
4544 journal_and_reply(mdr
, newi
, dn
, le
, fin
);
4546 // We hit_dir (via hit_inode) in our finish callback, but by then we might
4547 // have overshot the split size (multiple opencs in flight), so here is
4548 // an early chance to split the dir if this openc makes it oversized.
4549 mds
->balancer
->maybe_fragment(dir
, false);
4554 void Server::handle_client_readdir(MDRequestRef
& mdr
)
4556 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4557 Session
*session
= mds
->get_session(req
);
4558 client_t client
= req
->get_source().num();
4559 MutationImpl::LockOpVec lov
;
4560 CInode
*diri
= rdlock_path_pin_ref(mdr
, false, true);
4563 // it's a directory, right?
4564 if (!diri
->is_dir()) {
4566 dout(10) << "reply to " << *req
<< " readdir -CEPHFS_ENOTDIR" << dendl
;
4567 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
4571 auto num_caps
= session
->get_num_caps();
4572 auto session_cap_acquisition
= session
->get_cap_acquisition();
4574 if (num_caps
> static_cast<uint64_t>(max_caps_per_client
* max_caps_throttle_ratio
) && session_cap_acquisition
>= cap_acquisition_throttle
) {
4575 dout(20) << "readdir throttled. max_caps_per_client: " << max_caps_per_client
<< " num_caps: " << num_caps
4576 << " session_cap_acquistion: " << session_cap_acquisition
<< " cap_acquisition_throttle: " << cap_acquisition_throttle
<< dendl
;
4578 logger
->inc(l_mdss_cap_acquisition_throttle
);
4580 mds
->timer
.add_event_after(caps_throttle_retry_request_timeout
, new C_MDS_RetryRequest(mdcache
, mdr
));
4584 lov
.add_rdlock(&diri
->filelock
);
4585 lov
.add_rdlock(&diri
->dirfragtreelock
);
4587 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4590 if (!check_access(mdr
, diri
, MAY_READ
))
4594 frag_t fg
= (__u32
)req
->head
.args
.readdir
.frag
;
4595 unsigned req_flags
= (__u32
)req
->head
.args
.readdir
.flags
;
4596 string offset_str
= req
->get_path2();
4598 __u32 offset_hash
= 0;
4599 if (!offset_str
.empty())
4600 offset_hash
= ceph_frag_value(diri
->hash_dentry_name(offset_str
));
4602 offset_hash
= (__u32
)req
->head
.args
.readdir
.offset_hash
;
4604 dout(10) << " frag " << fg
<< " offset '" << offset_str
<< "'"
4605 << " offset_hash " << offset_hash
<< " flags " << req_flags
<< dendl
;
4607 // does the frag exist?
4608 if (diri
->dirfragtree
[fg
.value()] != fg
) {
4610 if (req_flags
& CEPH_READDIR_REPLY_BITFLAGS
) {
4611 if (fg
.contains((unsigned)offset_hash
)) {
4612 newfg
= diri
->dirfragtree
[offset_hash
];
4614 // client actually wants next frag
4615 newfg
= diri
->dirfragtree
[fg
.value()];
4619 newfg
= diri
->dirfragtree
[fg
.value()];
4621 dout(10) << " adjust frag " << fg
<< " -> " << newfg
<< " " << diri
->dirfragtree
<< dendl
;
4625 CDir
*dir
= try_open_auth_dirfrag(diri
, fg
, mdr
);
4629 dout(10) << "handle_client_readdir on " << *dir
<< dendl
;
4630 ceph_assert(dir
->is_auth());
4632 if (!dir
->is_complete()) {
4633 if (dir
->is_frozen()) {
4634 dout(7) << "dir is frozen " << *dir
<< dendl
;
4635 mds
->locker
->drop_locks(mdr
.get());
4636 mdr
->drop_local_auth_pins();
4637 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
4641 dout(10) << " incomplete dir contents for readdir on " << *dir
<< ", fetching" << dendl
;
4642 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
), true);
4646 #ifdef MDS_VERIFY_FRAGSTAT
4647 dir
->verify_fragstat();
4650 utime_t now
= ceph_clock_now();
4651 mdr
->set_mds_stamp(now
);
4653 snapid_t snapid
= mdr
->snapid
;
4654 dout(10) << "snapid " << snapid
<< dendl
;
4656 SnapRealm
*realm
= diri
->find_snaprealm();
4658 unsigned max
= req
->head
.args
.readdir
.max_entries
;
4660 max
= dir
->get_num_any(); // whatever, something big.
4661 unsigned max_bytes
= req
->head
.args
.readdir
.max_bytes
;
4663 // make sure at least one item can be encoded
4664 max_bytes
= (512 << 10) + g_conf()->mds_max_xattr_pairs_size
;
4669 ds
.frag
= dir
->get_frag();
4670 ds
.auth
= dir
->get_dir_auth().first
;
4671 if (dir
->is_auth() && !forward_all_requests_to_auth
)
4672 dir
->get_dist_spec(ds
.dist
, mds
->get_nodeid());
4674 dir
->encode_dirstat(dirbl
, mdr
->session
->info
, ds
);
4676 // count bytes available.
4677 // this isn't perfect, but we should capture the main variable/unbounded size items!
4678 int front_bytes
= dirbl
.length() + sizeof(__u32
) + sizeof(__u8
)*2;
4679 int bytes_left
= max_bytes
- front_bytes
;
4680 bytes_left
-= realm
->get_snap_trace().length();
4682 // build dir contents
4685 bool start
= !offset_hash
&& offset_str
.empty();
4686 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
4687 dentry_key_t
skip_key(snapid
, offset_str
.c_str(), offset_hash
);
4688 auto it
= start
? dir
->begin() : dir
->lower_bound(skip_key
);
4689 bool end
= (it
== dir
->end());
4690 for (; !end
&& numfiles
< max
; end
= (it
== dir
->end())) {
4691 CDentry
*dn
= it
->second
;
4694 if (dn
->state_test(CDentry::STATE_PURGING
))
4697 bool dnp
= dn
->use_projected(client
, mdr
);
4698 CDentry::linkage_t
*dnl
= dnp
? dn
->get_projected_linkage() : dn
->get_linkage();
4703 if (dn
->last
< snapid
|| dn
->first
> snapid
) {
4704 dout(20) << "skipping non-overlapping snap " << *dn
<< dendl
;
4709 dentry_key_t
offset_key(dn
->last
, offset_str
.c_str(), offset_hash
);
4710 if (!(offset_key
< dn
->key()))
4714 CInode
*in
= dnl
->get_inode();
4716 if (in
&& in
->ino() == CEPH_INO_CEPH
)
4720 // better for the MDS to do the work, if we think the client will stat any of these files.
4721 if (dnl
->is_remote() && !in
) {
4722 in
= mdcache
->get_inode(dnl
->get_remote_ino());
4724 dn
->link_remote(dnl
, in
);
4725 } else if (dn
->state_test(CDentry::STATE_BADREMOTEINO
)) {
4726 dout(10) << "skipping bad remote ino on " << *dn
<< dendl
;
4729 // touch everything i _do_ have
4730 for (auto &p
: *dir
) {
4731 if (!p
.second
->get_linkage()->is_null())
4732 mdcache
->lru
.lru_touch(p
.second
);
4735 // already issued caps and leases, reply immediately.
4736 if (dnbl
.length() > 0) {
4737 mdcache
->open_remote_dentry(dn
, dnp
, new C_MDSInternalNoop
);
4738 dout(10) << " open remote dentry after caps were issued, stopping at "
4739 << dnbl
.length() << " < " << bytes_left
<< dendl
;
4743 mds
->locker
->drop_locks(mdr
.get());
4744 mdr
->drop_local_auth_pins();
4745 mdcache
->open_remote_dentry(dn
, dnp
, new C_MDS_RetryRequest(mdcache
, mdr
));
4751 if ((int)(dnbl
.length() + dn
->get_name().length() + sizeof(__u32
) + sizeof(LeaseStat
)) > bytes_left
) {
4752 dout(10) << " ran out of room, stopping at " << dnbl
.length() << " < " << bytes_left
<< dendl
;
4756 unsigned start_len
= dnbl
.length();
4759 dout(12) << "including dn " << *dn
<< dendl
;
4760 encode(dn
->get_name(), dnbl
);
4761 int lease_mask
= dnl
->is_primary() ? CEPH_LEASE_PRIMARY_LINK
: 0;
4762 mds
->locker
->issue_client_lease(dn
, mdr
, lease_mask
, now
, dnbl
);
4765 dout(12) << "including inode " << *in
<< dendl
;
4766 int r
= in
->encode_inodestat(dnbl
, mdr
->session
, realm
, snapid
, bytes_left
- (int)dnbl
.length());
4768 // chop off dn->name, lease
4769 dout(10) << " ran out of room, stopping at " << start_len
<< " < " << bytes_left
<< dendl
;
4771 keep
.substr_of(dnbl
, 0, start_len
);
4775 ceph_assert(r
>= 0);
4779 mdcache
->lru
.lru_touch(dn
);
4782 session
->touch_readdir_cap(numfiles
);
4786 flags
= CEPH_READDIR_FRAG_END
;
4788 flags
|= CEPH_READDIR_FRAG_COMPLETE
; // FIXME: what purpose does this serve
4790 // client only understand END and COMPLETE flags ?
4791 if (req_flags
& CEPH_READDIR_REPLY_BITFLAGS
) {
4792 flags
|= CEPH_READDIR_HASH_ORDER
| CEPH_READDIR_OFFSET_HASH
;
4795 // finish final blob
4796 encode(numfiles
, dirbl
);
4797 encode(flags
, dirbl
);
4798 dirbl
.claim_append(dnbl
);
4801 dout(10) << "reply to " << *req
<< " readdir num=" << numfiles
4802 << " bytes=" << dirbl
.length()
4803 << " start=" << (int)start
4804 << " end=" << (int)end
4806 mdr
->reply_extra_bl
= dirbl
;
4808 // bump popularity. NOTE: this doesn't quite capture it.
4809 mds
->balancer
->hit_dir(dir
, META_POP_READDIR
, -1, numfiles
);
4813 respond_to_request(mdr
, 0);
4818 // ===============================================================================
4823 * finisher for basic inode updates
4825 class C_MDS_inode_update_finish
: public ServerLogContext
{
4827 bool truncating_smaller
, changed_ranges
, adjust_realm
;
4829 C_MDS_inode_update_finish(Server
*s
, MDRequestRef
& r
, CInode
*i
,
4830 bool sm
=false, bool cr
=false, bool ar
=false) :
4831 ServerLogContext(s
, r
), in(i
),
4832 truncating_smaller(sm
), changed_ranges(cr
), adjust_realm(ar
) { }
4833 void finish(int r
) override
{
4834 ceph_assert(r
== 0);
4836 int snap_op
= (in
->snaprealm
? CEPH_SNAP_OP_UPDATE
: CEPH_SNAP_OP_SPLIT
);
4841 MDSRank
*mds
= get_mds();
4843 // notify any clients
4844 if (truncating_smaller
&& in
->get_inode()->is_truncating()) {
4845 mds
->locker
->issue_truncate(in
);
4846 mds
->mdcache
->truncate_inode(in
, mdr
->ls
);
4850 mds
->mdcache
->send_snap_update(in
, 0, snap_op
);
4851 mds
->mdcache
->do_realm_invalidate_and_update_notify(in
, snap_op
);
4854 get_mds()->balancer
->hit_inode(in
, META_POP_IWR
);
4856 server
->respond_to_request(mdr
, 0);
4859 get_mds()->locker
->share_inode_max_size(in
);
4863 void Server::handle_client_file_setlock(MDRequestRef
& mdr
)
4865 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4866 MutationImpl::LockOpVec lov
;
4868 // get the inode to operate on, and set up any locks needed for that
4869 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
4873 lov
.add_xlock(&cur
->flocklock
);
4874 /* acquire_locks will return true if it gets the locks. If it fails,
4875 it will redeliver this request at a later date, so drop the request.
4877 if (!mds
->locker
->acquire_locks(mdr
, lov
)) {
4878 dout(10) << "handle_client_file_setlock could not get locks!" << dendl
;
4882 // copy the lock change into a ceph_filelock so we can store/apply it
4883 ceph_filelock set_lock
;
4884 set_lock
.start
= req
->head
.args
.filelock_change
.start
;
4885 set_lock
.length
= req
->head
.args
.filelock_change
.length
;
4886 set_lock
.client
= req
->get_orig_source().num();
4887 set_lock
.owner
= req
->head
.args
.filelock_change
.owner
;
4888 set_lock
.pid
= req
->head
.args
.filelock_change
.pid
;
4889 set_lock
.type
= req
->head
.args
.filelock_change
.type
;
4890 bool will_wait
= req
->head
.args
.filelock_change
.wait
;
4892 dout(10) << "handle_client_file_setlock: " << set_lock
<< dendl
;
4894 ceph_lock_state_t
*lock_state
= NULL
;
4895 bool interrupt
= false;
4897 // get the appropriate lock state
4898 switch (req
->head
.args
.filelock_change
.rule
) {
4899 case CEPH_LOCK_FLOCK_INTR
:
4902 case CEPH_LOCK_FLOCK
:
4903 lock_state
= cur
->get_flock_lock_state();
4906 case CEPH_LOCK_FCNTL_INTR
:
4909 case CEPH_LOCK_FCNTL
:
4910 lock_state
= cur
->get_fcntl_lock_state();
4914 dout(10) << "got unknown lock type " << set_lock
.type
4915 << ", dropping request!" << dendl
;
4916 respond_to_request(mdr
, -CEPHFS_EOPNOTSUPP
);
4920 dout(10) << " state prior to lock change: " << *lock_state
<< dendl
;
4921 if (CEPH_LOCK_UNLOCK
== set_lock
.type
) {
4922 list
<ceph_filelock
> activated_locks
;
4923 MDSContext::vec waiters
;
4924 if (lock_state
->is_waiting(set_lock
)) {
4925 dout(10) << " unlock removing waiting lock " << set_lock
<< dendl
;
4926 lock_state
->remove_waiting(set_lock
);
4927 cur
->take_waiting(CInode::WAIT_FLOCK
, waiters
);
4928 } else if (!interrupt
) {
4929 dout(10) << " unlock attempt on " << set_lock
<< dendl
;
4930 lock_state
->remove_lock(set_lock
, activated_locks
);
4931 cur
->take_waiting(CInode::WAIT_FLOCK
, waiters
);
4933 mds
->queue_waiters(waiters
);
4935 respond_to_request(mdr
, 0);
4937 dout(10) << " lock attempt on " << set_lock
<< dendl
;
4938 bool deadlock
= false;
4939 if (mdr
->more()->flock_was_waiting
&&
4940 !lock_state
->is_waiting(set_lock
)) {
4941 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock
<< dendl
;
4942 respond_to_request(mdr
, -CEPHFS_EINTR
);
4943 } else if (!lock_state
->add_lock(set_lock
, will_wait
, mdr
->more()->flock_was_waiting
, &deadlock
)) {
4944 dout(10) << " it failed on this attempt" << dendl
;
4945 // couldn't set lock right now
4947 respond_to_request(mdr
, -CEPHFS_EDEADLK
);
4948 } else if (!will_wait
) {
4949 respond_to_request(mdr
, -CEPHFS_EWOULDBLOCK
);
4951 dout(10) << " added to waiting list" << dendl
;
4952 ceph_assert(lock_state
->is_waiting(set_lock
));
4953 mdr
->more()->flock_was_waiting
= true;
4954 mds
->locker
->drop_locks(mdr
.get());
4955 mdr
->drop_local_auth_pins();
4956 mdr
->mark_event("failed to add lock, waiting");
4958 cur
->add_waiter(CInode::WAIT_FLOCK
, new C_MDS_RetryRequest(mdcache
, mdr
));
4961 respond_to_request(mdr
, 0);
4963 dout(10) << " state after lock change: " << *lock_state
<< dendl
;
4966 void Server::handle_client_file_readlock(MDRequestRef
& mdr
)
4968 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4969 MutationImpl::LockOpVec lov
;
4971 // get the inode to operate on, and set up any locks needed for that
4972 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
4976 /* acquire_locks will return true if it gets the locks. If it fails,
4977 it will redeliver this request at a later date, so drop the request.
4979 lov
.add_rdlock(&cur
->flocklock
);
4980 if (!mds
->locker
->acquire_locks(mdr
, lov
)) {
4981 dout(10) << "handle_client_file_readlock could not get locks!" << dendl
;
4985 // copy the lock change into a ceph_filelock so we can store/apply it
4986 ceph_filelock checking_lock
;
4987 checking_lock
.start
= req
->head
.args
.filelock_change
.start
;
4988 checking_lock
.length
= req
->head
.args
.filelock_change
.length
;
4989 checking_lock
.client
= req
->get_orig_source().num();
4990 checking_lock
.owner
= req
->head
.args
.filelock_change
.owner
;
4991 checking_lock
.pid
= req
->head
.args
.filelock_change
.pid
;
4992 checking_lock
.type
= req
->head
.args
.filelock_change
.type
;
4994 // get the appropriate lock state
4995 ceph_lock_state_t
*lock_state
= NULL
;
4996 switch (req
->head
.args
.filelock_change
.rule
) {
4997 case CEPH_LOCK_FLOCK
:
4998 lock_state
= cur
->get_flock_lock_state();
5001 case CEPH_LOCK_FCNTL
:
5002 lock_state
= cur
->get_fcntl_lock_state();
5006 dout(10) << "got unknown lock type " << checking_lock
.type
<< dendl
;
5007 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5010 lock_state
->look_for_lock(checking_lock
);
5013 encode(checking_lock
, lock_bl
);
5015 mdr
->reply_extra_bl
= lock_bl
;
5016 respond_to_request(mdr
, 0);
5019 void Server::handle_client_setattr(MDRequestRef
& mdr
)
5021 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5022 MutationImpl::LockOpVec lov
;
5023 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
5026 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5027 respond_to_request(mdr
, -CEPHFS_EROFS
);
5030 if (cur
->ino() < MDS_INO_SYSTEM_BASE
&& !cur
->is_base()) {
5031 respond_to_request(mdr
, -CEPHFS_EPERM
);
5035 __u32 mask
= req
->head
.args
.setattr
.mask
;
5036 __u32 access_mask
= MAY_WRITE
;
5039 if (mask
& (CEPH_SETATTR_MODE
|CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_BTIME
|CEPH_SETATTR_KILL_SGUID
))
5040 lov
.add_xlock(&cur
->authlock
);
5041 if (mask
& (CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
|CEPH_SETATTR_SIZE
))
5042 lov
.add_xlock(&cur
->filelock
);
5043 if (mask
& CEPH_SETATTR_CTIME
)
5044 lov
.add_wrlock(&cur
->versionlock
);
5046 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5049 if ((mask
& CEPH_SETATTR_UID
) && (cur
->get_inode()->uid
!= req
->head
.args
.setattr
.uid
))
5050 access_mask
|= MAY_CHOWN
;
5052 if ((mask
& CEPH_SETATTR_GID
) && (cur
->get_inode()->gid
!= req
->head
.args
.setattr
.gid
))
5053 access_mask
|= MAY_CHGRP
;
5055 if (!check_access(mdr
, cur
, access_mask
))
5058 // trunc from bigger -> smaller?
5059 const auto& pip
= cur
->get_projected_inode();
5061 uint64_t old_size
= std::max
<uint64_t>(pip
->size
, req
->head
.args
.setattr
.old_size
);
5063 // CEPHFS_ENOSPC on growing file while full, but allow shrinks
5064 if (is_full
&& req
->head
.args
.setattr
.size
> old_size
) {
5065 dout(20) << __func__
<< ": full, responding CEPHFS_ENOSPC to setattr with larger size" << dendl
;
5066 respond_to_request(mdr
, -CEPHFS_ENOSPC
);
5070 bool truncating_smaller
= false;
5071 if (mask
& CEPH_SETATTR_SIZE
) {
5072 truncating_smaller
= req
->head
.args
.setattr
.size
< old_size
;
5073 if (truncating_smaller
&& pip
->is_truncating()) {
5074 dout(10) << " waiting for pending truncate from " << pip
->truncate_from
5075 << " to " << pip
->truncate_size
<< " to complete on " << *cur
<< dendl
;
5076 mds
->locker
->drop_locks(mdr
.get());
5077 mdr
->drop_local_auth_pins();
5078 cur
->add_waiter(CInode::WAIT_TRUNC
, new C_MDS_RetryRequest(mdcache
, mdr
));
5083 bool changed_ranges
= false;
5086 mdr
->ls
= mdlog
->get_current_segment();
5087 EUpdate
*le
= new EUpdate(mdlog
, "setattr");
5088 mdlog
->start_entry(le
);
5090 auto pi
= cur
->project_inode(mdr
);
5092 if (mask
& CEPH_SETATTR_UID
)
5093 pi
.inode
->uid
= req
->head
.args
.setattr
.uid
;
5094 if (mask
& CEPH_SETATTR_GID
)
5095 pi
.inode
->gid
= req
->head
.args
.setattr
.gid
;
5097 if (mask
& CEPH_SETATTR_MODE
)
5098 pi
.inode
->mode
= (pi
.inode
->mode
& ~07777) | (req
->head
.args
.setattr
.mode
& 07777);
5099 else if ((mask
& (CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_KILL_SGUID
)) &&
5100 S_ISREG(pi
.inode
->mode
) &&
5101 (pi
.inode
->mode
& (S_IXUSR
|S_IXGRP
|S_IXOTH
))) {
5102 pi
.inode
->mode
&= ~(S_ISUID
|S_ISGID
);
5105 if (mask
& CEPH_SETATTR_MTIME
)
5106 pi
.inode
->mtime
= req
->head
.args
.setattr
.mtime
;
5107 if (mask
& CEPH_SETATTR_ATIME
)
5108 pi
.inode
->atime
= req
->head
.args
.setattr
.atime
;
5109 if (mask
& CEPH_SETATTR_BTIME
)
5110 pi
.inode
->btime
= req
->head
.args
.setattr
.btime
;
5111 if (mask
& (CEPH_SETATTR_ATIME
| CEPH_SETATTR_MTIME
| CEPH_SETATTR_BTIME
))
5112 pi
.inode
->time_warp_seq
++; // maybe not a timewarp, but still a serialization point.
5113 if (mask
& CEPH_SETATTR_SIZE
) {
5114 if (truncating_smaller
) {
5115 pi
.inode
->truncate(old_size
, req
->head
.args
.setattr
.size
);
5116 le
->metablob
.add_truncate_start(cur
->ino());
5118 pi
.inode
->size
= req
->head
.args
.setattr
.size
;
5119 pi
.inode
->rstat
.rbytes
= pi
.inode
->size
;
5121 pi
.inode
->mtime
= mdr
->get_op_stamp();
5123 // adjust client's max_size?
5124 if (mds
->locker
->calc_new_client_ranges(cur
, pi
.inode
->size
)) {
5125 dout(10) << " client_ranges " << cur
->get_previous_projected_inode()->client_ranges
5126 << " -> " << pi
.inode
->client_ranges
<< dendl
;
5127 changed_ranges
= true;
5131 pi
.inode
->version
= cur
->pre_dirty();
5132 pi
.inode
->ctime
= mdr
->get_op_stamp();
5133 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
5134 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
5135 pi
.inode
->change_attr
++;
5138 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5139 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5140 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5142 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
,
5143 truncating_smaller
, changed_ranges
));
5145 // flush immediately if there are readers/writers waiting
5146 if (mdr
->is_xlocked(&cur
->filelock
) &&
5147 (cur
->get_caps_wanted() & (CEPH_CAP_FILE_RD
|CEPH_CAP_FILE_WR
)))
5148 mds
->mdlog
->flush();
5151 /* Takes responsibility for mdr */
5152 void Server::do_open_truncate(MDRequestRef
& mdr
, int cmode
)
5154 CInode
*in
= mdr
->in
[0];
5155 client_t client
= mdr
->get_client();
5158 dout(10) << "do_open_truncate " << *in
<< dendl
;
5160 SnapRealm
*realm
= in
->find_snaprealm();
5161 Capability
*cap
= mds
->locker
->issue_new_caps(in
, cmode
, mdr
, realm
);
5163 mdr
->ls
= mdlog
->get_current_segment();
5164 EUpdate
*le
= new EUpdate(mdlog
, "open_truncate");
5165 mdlog
->start_entry(le
);
5168 auto pi
= in
->project_inode(mdr
);
5169 pi
.inode
->version
= in
->pre_dirty();
5170 pi
.inode
->mtime
= pi
.inode
->ctime
= mdr
->get_op_stamp();
5171 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
5172 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
5173 pi
.inode
->change_attr
++;
5175 uint64_t old_size
= std::max
<uint64_t>(pi
.inode
->size
, mdr
->client_request
->head
.args
.open
.old_size
);
5177 pi
.inode
->truncate(old_size
, 0);
5178 le
->metablob
.add_truncate_start(in
->ino());
5181 bool changed_ranges
= false;
5182 if (cap
&& (cmode
& CEPH_FILE_MODE_WR
)) {
5183 pi
.inode
->client_ranges
[client
].range
.first
= 0;
5184 pi
.inode
->client_ranges
[client
].range
.last
= pi
.inode
->get_layout_size_increment();
5185 pi
.inode
->client_ranges
[client
].follows
= realm
->get_newest_seq();
5186 changed_ranges
= true;
5187 in
->mark_clientwriteable();
5188 cap
->mark_clientwriteable();
5191 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
5193 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
5194 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, in
);
5196 // make sure ino gets into the journal
5197 le
->metablob
.add_opened_ino(in
->ino());
5199 mdr
->o_trunc
= true;
5202 if (mdr
->client_request
->get_dentry_wanted()) {
5203 ceph_assert(mdr
->dn
[0].size());
5204 dn
= mdr
->dn
[0].back();
5207 journal_and_reply(mdr
, in
, dn
, le
, new C_MDS_inode_update_finish(this, mdr
, in
, old_size
> 0,
5209 // Although the `open` part can give an early reply, the truncation won't
5210 // happen until our EUpdate is persistent, to give the client a prompt
5211 // response we must also flush that event.
5216 /* This function cleans up the passed mdr */
5217 void Server::handle_client_setlayout(MDRequestRef
& mdr
)
5219 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5220 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
5223 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5224 respond_to_request(mdr
, -CEPHFS_EROFS
);
5227 if (!cur
->is_file()) {
5228 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5231 if (cur
->get_projected_inode()->size
||
5232 cur
->get_projected_inode()->truncate_seq
> 1) {
5233 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
5238 file_layout_t layout
= cur
->get_projected_inode()->layout
;
5239 // save existing layout for later
5240 const auto old_layout
= layout
;
5242 int access
= MAY_WRITE
;
5244 if (req
->head
.args
.setlayout
.layout
.fl_object_size
> 0)
5245 layout
.object_size
= req
->head
.args
.setlayout
.layout
.fl_object_size
;
5246 if (req
->head
.args
.setlayout
.layout
.fl_stripe_unit
> 0)
5247 layout
.stripe_unit
= req
->head
.args
.setlayout
.layout
.fl_stripe_unit
;
5248 if (req
->head
.args
.setlayout
.layout
.fl_stripe_count
> 0)
5249 layout
.stripe_count
=req
->head
.args
.setlayout
.layout
.fl_stripe_count
;
5250 if (req
->head
.args
.setlayout
.layout
.fl_pg_pool
> 0) {
5251 layout
.pool_id
= req
->head
.args
.setlayout
.layout
.fl_pg_pool
;
5253 // make sure we have as new a map as the client
5254 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
5255 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
5260 // Don't permit layout modifications without 'p' caps
5261 if (layout
!= old_layout
) {
5262 access
|= MAY_SET_VXATTR
;
5265 if (!layout
.is_valid()) {
5266 dout(10) << "bad layout" << dendl
;
5267 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5270 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
5271 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
5272 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5276 MutationImpl::LockOpVec lov
;
5277 lov
.add_xlock(&cur
->filelock
);
5278 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5281 if (!check_access(mdr
, cur
, access
))
5285 auto pi
= cur
->project_inode(mdr
);
5286 pi
.inode
->layout
= layout
;
5287 // add the old pool to the inode
5288 pi
.inode
->add_old_pool(old_layout
.pool_id
);
5289 pi
.inode
->version
= cur
->pre_dirty();
5290 pi
.inode
->ctime
= mdr
->get_op_stamp();
5291 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
5292 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
5293 pi
.inode
->change_attr
++;
5296 mdr
->ls
= mdlog
->get_current_segment();
5297 EUpdate
*le
= new EUpdate(mdlog
, "setlayout");
5298 mdlog
->start_entry(le
);
5299 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5300 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5301 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5303 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
5306 bool Server::xlock_policylock(MDRequestRef
& mdr
, CInode
*in
, bool want_layout
, bool xlock_snaplock
)
5308 if (mdr
->locking_state
& MutationImpl::ALL_LOCKED
)
5311 MutationImpl::LockOpVec lov
;
5312 lov
.add_xlock(&in
->policylock
);
5314 lov
.add_xlock(&in
->snaplock
);
5316 lov
.add_rdlock(&in
->snaplock
);
5317 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5320 if (want_layout
&& in
->get_projected_inode()->has_layout()) {
5321 mdr
->dir_layout
= in
->get_projected_inode()->layout
;
5322 want_layout
= false;
5324 if (CDentry
*pdn
= in
->get_projected_parent_dn(); pdn
) {
5325 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
, 0, want_layout
))
5329 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
5333 CInode
* Server::try_get_auth_inode(MDRequestRef
& mdr
, inodeno_t ino
)
5335 CInode
*in
= mdcache
->get_inode(ino
);
5336 if (!in
|| in
->state_test(CInode::STATE_PURGING
)) {
5337 respond_to_request(mdr
, -CEPHFS_ESTALE
);
5340 if (!in
->is_auth()) {
5341 mdcache
->request_forward(mdr
, in
->authority().first
);
5348 void Server::handle_client_setdirlayout(MDRequestRef
& mdr
)
5350 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5352 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5353 CInode
*cur
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
5357 if (!cur
->is_dir()) {
5358 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
5362 if (!xlock_policylock(mdr
, cur
, true))
5366 const auto& old_pi
= cur
->get_projected_inode();
5367 file_layout_t layout
;
5368 if (old_pi
->has_layout())
5369 layout
= old_pi
->layout
;
5370 else if (mdr
->dir_layout
!= file_layout_t())
5371 layout
= mdr
->dir_layout
;
5373 layout
= mdcache
->default_file_layout
;
5375 // Level of access required to complete
5376 int access
= MAY_WRITE
;
5378 const auto old_layout
= layout
;
5380 if (req
->head
.args
.setlayout
.layout
.fl_object_size
> 0)
5381 layout
.object_size
= req
->head
.args
.setlayout
.layout
.fl_object_size
;
5382 if (req
->head
.args
.setlayout
.layout
.fl_stripe_unit
> 0)
5383 layout
.stripe_unit
= req
->head
.args
.setlayout
.layout
.fl_stripe_unit
;
5384 if (req
->head
.args
.setlayout
.layout
.fl_stripe_count
> 0)
5385 layout
.stripe_count
=req
->head
.args
.setlayout
.layout
.fl_stripe_count
;
5386 if (req
->head
.args
.setlayout
.layout
.fl_pg_pool
> 0) {
5387 layout
.pool_id
= req
->head
.args
.setlayout
.layout
.fl_pg_pool
;
5388 // make sure we have as new a map as the client
5389 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
5390 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
5395 if (layout
!= old_layout
) {
5396 access
|= MAY_SET_VXATTR
;
5399 if (!layout
.is_valid()) {
5400 dout(10) << "bad layout" << dendl
;
5401 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5404 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
5405 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
5406 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5410 if (!check_access(mdr
, cur
, access
))
5413 auto pi
= cur
->project_inode(mdr
);
5414 pi
.inode
->layout
= layout
;
5415 pi
.inode
->version
= cur
->pre_dirty();
5418 mdr
->ls
= mdlog
->get_current_segment();
5419 EUpdate
*le
= new EUpdate(mdlog
, "setlayout");
5420 mdlog
->start_entry(le
);
5421 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5422 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5423 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5425 mdr
->no_early_reply
= true;
5426 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
5430 int Server::parse_layout_vxattr_json(
5431 string name
, string value
, const OSDMap
& osdmap
, file_layout_t
*layout
)
5433 auto parse_pool
= [&](std::string pool_name
, int64_t pool_id
) -> int64_t {
5434 if (pool_name
!= "") {
5435 int64_t _pool_id
= osdmap
.lookup_pg_pool_name(pool_name
);
5437 dout(10) << __func__
<< ": unknown pool name:" << pool_name
<< dendl
;
5438 return -CEPHFS_EINVAL
;
5441 } else if (pool_id
>= 0) {
5442 const auto pools
= osdmap
.get_pools();
5443 if (pools
.find(pool_id
) == pools
.end()) {
5444 dout(10) << __func__
<< ": unknown pool id:" << pool_id
<< dendl
;
5445 return -CEPHFS_EINVAL
;
5449 return -CEPHFS_EINVAL
;
5454 if (name
== "layout.json") {
5455 JSONParser json_parser
;
5456 if (json_parser
.parse(value
.c_str(), value
.length()) and json_parser
.is_object()) {
5459 field
= "object_size";
5460 JSONDecoder::decode_json("object_size", layout
->object_size
, &json_parser
, true);
5462 field
= "stripe_unit";
5463 JSONDecoder::decode_json("stripe_unit", layout
->stripe_unit
, &json_parser
, true);
5465 field
= "stripe_count";
5466 JSONDecoder::decode_json("stripe_count", layout
->stripe_count
, &json_parser
, true);
5468 field
= "pool_namespace";
5469 JSONDecoder::decode_json("pool_namespace", layout
->pool_ns
, &json_parser
, false);
5472 int64_t pool_id
= 0;
5473 JSONDecoder::decode_json("pool_id", pool_id
, &json_parser
, false);
5475 field
= "pool_name";
5476 std::string pool_name
;
5477 JSONDecoder::decode_json("pool_name", pool_name
, &json_parser
, false);
5479 pool_id
= parse_pool(pool_name
, pool_id
);
5481 return (int)pool_id
;
5483 layout
->pool_id
= pool_id
;
5484 } catch (JSONDecoder::err
&) {
5485 dout(10) << __func__
<< ": json is missing a mandatory field named "
5487 return -CEPHFS_EINVAL
;
5490 dout(10) << __func__
<< ": bad json" << dendl
;
5491 return -CEPHFS_EINVAL
;
5494 dout(10) << __func__
<< ": unknown layout vxattr " << name
<< dendl
;
5495 return -CEPHFS_ENODATA
; // no such attribute
5497 } catch (boost::bad_lexical_cast
const&) {
5498 dout(10) << __func__
<< ": bad vxattr value:" << value
5499 << ", unable to parse for xattr:" << name
<< dendl
;
5500 return -CEPHFS_EINVAL
;
5505 // parse old style layout string
5506 int Server::parse_layout_vxattr_string(
5507 string name
, string value
, const OSDMap
& osdmap
, file_layout_t
*layout
)
5510 if (name
== "layout") {
5511 string::iterator begin
= value
.begin();
5512 string::iterator end
= value
.end();
5513 keys_and_values
<string::iterator
> p
; // create instance of parser
5514 std::map
<string
, string
> m
; // map to receive results
5515 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
5516 return -CEPHFS_EINVAL
;
5518 string
left(begin
, end
);
5519 dout(10) << __func__
<< ": parsed " << m
<< " left '" << left
<< "'" << dendl
;
5521 return -CEPHFS_EINVAL
;
5522 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
5523 // Skip validation on each attr, we do it once at the end (avoid
5524 // rejecting intermediate states if the overall result is ok)
5525 int r
= parse_layout_vxattr_string(string("layout.") + q
->first
, q
->second
,
5530 } else if (name
== "layout.object_size") {
5531 layout
->object_size
= boost::lexical_cast
<unsigned>(value
);
5532 } else if (name
== "layout.stripe_unit") {
5533 layout
->stripe_unit
= boost::lexical_cast
<unsigned>(value
);
5534 } else if (name
== "layout.stripe_count") {
5535 layout
->stripe_count
= boost::lexical_cast
<unsigned>(value
);
5536 } else if (name
== "layout.pool") {
5538 layout
->pool_id
= boost::lexical_cast
<unsigned>(value
);
5539 } catch (boost::bad_lexical_cast
const&) {
5540 int64_t pool
= osdmap
.lookup_pg_pool_name(value
);
5542 dout(10) << __func__
<< ": unknown pool " << value
<< dendl
;
5543 return -CEPHFS_ENOENT
;
5545 layout
->pool_id
= pool
;
5547 } else if (name
== "layout.pool_id") {
5548 layout
->pool_id
= boost::lexical_cast
<int64_t>(value
);
5549 } else if (name
== "layout.pool_name") {
5550 layout
->pool_id
= osdmap
.lookup_pg_pool_name(value
);
5551 if (layout
->pool_id
< 0) {
5552 dout(10) << __func__
<< ": unknown pool " << value
<< dendl
;
5553 return -CEPHFS_EINVAL
;
5555 } else if (name
== "layout.pool_namespace") {
5556 layout
->pool_ns
= value
;
5558 dout(10) << __func__
<< ": unknown layout vxattr " << name
<< dendl
;
5559 return -CEPHFS_ENODATA
; // no such attribute
5561 } catch (boost::bad_lexical_cast
const&) {
5562 dout(10) << __func__
<< ": bad vxattr value, unable to parse int for "
5564 return -CEPHFS_EINVAL
;
5569 int Server::parse_layout_vxattr(string name
, string value
, const OSDMap
& osdmap
,
5570 file_layout_t
*layout
, bool validate
)
5572 dout(20) << __func__
<< ": name:" << name
<< " value:'" << value
<< "'" << dendl
;
5575 if (name
== "layout.json") {
5576 r
= parse_layout_vxattr_json(name
, value
, osdmap
, layout
);
5578 r
= parse_layout_vxattr_string(name
, value
, osdmap
, layout
);
5584 if (validate
&& !layout
->is_valid()) {
5585 dout(10) << __func__
<< ": bad layout" << dendl
;
5586 return -CEPHFS_EINVAL
;
5588 if (!mds
->mdsmap
->is_data_pool(layout
->pool_id
)) {
5589 dout(10) << __func__
<< ": invalid data pool " << layout
->pool_id
<< dendl
;
5590 return -CEPHFS_EINVAL
;
5595 int Server::parse_quota_vxattr(string name
, string value
, quota_info_t
*quota
)
5597 dout(20) << "parse_quota_vxattr name " << name
<< " value '" << value
<< "'" << dendl
;
5599 if (name
== "quota") {
5600 string::iterator begin
= value
.begin();
5601 string::iterator end
= value
.end();
5603 // keep quota unchanged. (for create_quota_realm())
5606 keys_and_values
<string::iterator
> p
; // create instance of parser
5607 std::map
<string
, string
> m
; // map to receive results
5608 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
5609 return -CEPHFS_EINVAL
;
5611 string
left(begin
, end
);
5612 dout(10) << " parsed " << m
<< " left '" << left
<< "'" << dendl
;
5614 return -CEPHFS_EINVAL
;
5615 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
5616 int r
= parse_quota_vxattr(string("quota.") + q
->first
, q
->second
, quota
);
5620 } else if (name
== "quota.max_bytes") {
5621 int64_t q
= boost::lexical_cast
<int64_t>(value
);
5623 return -CEPHFS_EINVAL
;
5624 quota
->max_bytes
= q
;
5625 } else if (name
== "quota.max_files") {
5626 int64_t q
= boost::lexical_cast
<int64_t>(value
);
5628 return -CEPHFS_EINVAL
;
5629 quota
->max_files
= q
;
5631 dout(10) << " unknown quota vxattr " << name
<< dendl
;
5632 return -CEPHFS_EINVAL
;
5634 } catch (boost::bad_lexical_cast
const&) {
5635 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
5636 return -CEPHFS_EINVAL
;
5639 if (!quota
->is_valid()) {
5640 dout(10) << "bad quota" << dendl
;
5641 return -CEPHFS_EINVAL
;
5646 void Server::create_quota_realm(CInode
*in
)
5648 dout(10) << __func__
<< " " << *in
<< dendl
;
5650 auto req
= make_message
<MClientRequest
>(CEPH_MDS_OP_SETXATTR
);
5651 req
->set_filepath(filepath(in
->ino()));
5652 req
->set_string2("ceph.quota");
5653 // empty vxattr value
5654 req
->set_tid(mds
->issue_tid());
5656 mds
->send_message_mds(req
, in
->authority().first
);
5660 * Verify that the file layout attribute carried by client
5661 * is well-formatted.
5662 * Return 0 on success, otherwise this function takes
5663 * responsibility for the passed mdr.
5665 int Server::check_layout_vxattr(MDRequestRef
& mdr
,
5668 file_layout_t
*layout
)
5670 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5674 mds
->objecter
->with_osdmap([&](const OSDMap
& osdmap
) {
5675 r
= parse_layout_vxattr(name
, value
, osdmap
, layout
);
5676 epoch
= osdmap
.get_epoch();
5679 if (r
== -CEPHFS_ENOENT
) {
5681 // we don't have the specified pool, make sure our map
5682 // is newer than or as new as the client.
5683 epoch_t req_epoch
= req
->get_osdmap_epoch();
5685 if (req_epoch
> epoch
) {
5687 // well, our map is older. consult mds.
5688 auto fin
= new C_IO_Wrapper(mds
, new C_MDS_RetryRequest(mdcache
, mdr
));
5690 mds
->objecter
->wait_for_map(req_epoch
, lambdafy(fin
));
5692 } else if (req_epoch
== 0 && !mdr
->waited_for_osdmap
) {
5694 // For compatibility with client w/ old code, we still need get the
5695 // latest map. One day if COMPACT_VERSION of MClientRequest >=3,
5696 // we can remove those code.
5697 mdr
->waited_for_osdmap
= true;
5698 mds
->objecter
->wait_for_latest_osdmap(std::ref(*new C_IO_Wrapper(
5699 mds
, new C_MDS_RetryRequest(mdcache
, mdr
))));
5706 if (r
== -CEPHFS_ENOENT
)
5709 respond_to_request(mdr
, r
);
5717 void Server::handle_set_vxattr(MDRequestRef
& mdr
, CInode
*cur
)
5719 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5720 string
name(req
->get_path2());
5721 bufferlist bl
= req
->get_data();
5722 string
value (bl
.c_str(), bl
.length());
5723 dout(10) << "handle_set_vxattr " << name
5724 << " val " << value
.length()
5725 << " bytes on " << *cur
5728 CInode::mempool_inode
*pip
= nullptr;
5731 if (!check_access(mdr
, cur
, MAY_SET_VXATTR
)) {
5735 bool adjust_realm
= false;
5736 if (name
.compare(0, 15, "ceph.dir.layout") == 0) {
5737 if (!cur
->is_dir()) {
5738 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5742 if (!xlock_policylock(mdr
, cur
, true))
5745 file_layout_t layout
;
5746 if (cur
->get_projected_inode()->has_layout())
5747 layout
= cur
->get_projected_inode()->layout
;
5748 else if (mdr
->dir_layout
!= file_layout_t())
5749 layout
= mdr
->dir_layout
;
5751 layout
= mdcache
->default_file_layout
;
5753 rest
= name
.substr(name
.find("layout"));
5754 if (check_layout_vxattr(mdr
, rest
, value
, &layout
) < 0)
5757 auto pi
= cur
->project_inode(mdr
);
5758 pi
.inode
->layout
= layout
;
5759 mdr
->no_early_reply
= true;
5760 pip
= pi
.inode
.get();
5761 } else if (name
.compare(0, 16, "ceph.file.layout") == 0) {
5762 if (!cur
->is_file()) {
5763 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5766 if (cur
->get_projected_inode()->size
||
5767 cur
->get_projected_inode()->truncate_seq
> 1) {
5768 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
5771 file_layout_t layout
= cur
->get_projected_inode()->layout
;
5772 rest
= name
.substr(name
.find("layout"));
5773 if (check_layout_vxattr(mdr
, rest
, value
, &layout
) < 0)
5776 MutationImpl::LockOpVec lov
;
5777 lov
.add_xlock(&cur
->filelock
);
5778 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5781 auto pi
= cur
->project_inode(mdr
);
5782 int64_t old_pool
= pi
.inode
->layout
.pool_id
;
5783 pi
.inode
->add_old_pool(old_pool
);
5784 pi
.inode
->layout
= layout
;
5785 pip
= pi
.inode
.get();
5786 } else if (name
.compare(0, 10, "ceph.quota") == 0) {
5787 if (!cur
->is_dir()) {
5788 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5792 quota_info_t quota
= cur
->get_projected_inode()->quota
;
5794 rest
= name
.substr(name
.find("quota"));
5795 int r
= parse_quota_vxattr(rest
, value
, "a
);
5797 respond_to_request(mdr
, r
);
5801 if (quota
.is_enable() && !cur
->get_projected_srnode())
5802 adjust_realm
= true;
5804 if (!xlock_policylock(mdr
, cur
, false, adjust_realm
))
5807 if (cur
->get_projected_inode()->quota
== quota
) {
5808 respond_to_request(mdr
, 0);
5812 auto pi
= cur
->project_inode(mdr
, false, adjust_realm
);
5813 pi
.inode
->quota
= quota
;
5816 pi
.snapnode
->created
= pi
.snapnode
->seq
= cur
->find_snaprealm()->get_newest_seq();
5818 mdr
->no_early_reply
= true;
5819 pip
= pi
.inode
.get();
5821 client_t exclude_ct
= mdr
->get_client();
5822 mdcache
->broadcast_quota_to_client(cur
, exclude_ct
, true);
5823 } else if (name
== "ceph.dir.subvolume"sv
) {
5824 if (!cur
->is_dir()) {
5825 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5831 val
= boost::lexical_cast
<bool>(value
);
5832 } catch (boost::bad_lexical_cast
const&) {
5833 dout(10) << "bad vxattr value, unable to parse bool for " << name
<< dendl
;
5834 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5838 /* Verify it's not already a subvolume with lighter weight
5841 if (!mdr
->more()->rdonly_checks
) {
5842 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
5843 MutationImpl::LockOpVec lov
;
5844 lov
.add_rdlock(&cur
->snaplock
);
5845 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5847 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
5849 const auto srnode
= cur
->get_projected_srnode();
5850 if (val
== (srnode
&& srnode
->is_subvolume())) {
5851 dout(20) << "already marked subvolume" << dendl
;
5852 respond_to_request(mdr
, 0);
5855 mdr
->more()->rdonly_checks
= true;
5858 if ((mdr
->locking_state
& MutationImpl::ALL_LOCKED
) && !mdr
->is_xlocked(&cur
->snaplock
)) {
5859 /* drop the rdlock and acquire xlocks */
5860 dout(20) << "dropping rdlocks" << dendl
;
5861 mds
->locker
->drop_locks(mdr
.get());
5862 if (!xlock_policylock(mdr
, cur
, false, true))
5866 /* repeat rdonly checks in case changed between rdlock -> xlock */
5867 SnapRealm
*realm
= cur
->find_snaprealm();
5869 inodeno_t subvol_ino
= realm
->get_subvolume_ino();
5870 // can't create subvolume inside another subvolume
5871 if (subvol_ino
&& subvol_ino
!= cur
->ino()) {
5872 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5877 const auto srnode
= cur
->get_projected_srnode();
5878 if (val
== (srnode
&& srnode
->is_subvolume())) {
5879 respond_to_request(mdr
, 0);
5883 auto pi
= cur
->project_inode(mdr
, false, true);
5885 pi
.snapnode
->created
= pi
.snapnode
->seq
= realm
->get_newest_seq();
5887 pi
.snapnode
->mark_subvolume();
5889 pi
.snapnode
->clear_subvolume();
5891 mdr
->no_early_reply
= true;
5892 pip
= pi
.inode
.get();
5893 adjust_realm
= true;
5894 } else if (name
== "ceph.dir.pin"sv
) {
5895 if (!cur
->is_dir() || cur
->is_root()) {
5896 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5902 rank
= boost::lexical_cast
<mds_rank_t
>(value
);
5903 if (rank
< 0) rank
= MDS_RANK_NONE
;
5904 else if (rank
>= MAX_MDS
) {
5905 respond_to_request(mdr
, -CEPHFS_EDOM
);
5908 } catch (boost::bad_lexical_cast
const&) {
5909 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
5910 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5914 if (!xlock_policylock(mdr
, cur
))
5917 auto pi
= cur
->project_inode(mdr
);
5918 cur
->set_export_pin(rank
);
5919 pip
= pi
.inode
.get();
5920 } else if (name
== "ceph.dir.pin.random"sv
) {
5921 if (!cur
->is_dir() || cur
->is_root()) {
5922 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5928 val
= boost::lexical_cast
<double>(value
);
5929 } catch (boost::bad_lexical_cast
const&) {
5930 dout(10) << "bad vxattr value, unable to parse float for " << name
<< dendl
;
5931 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5935 if (val
< 0.0 || 1.0 < val
) {
5936 respond_to_request(mdr
, -CEPHFS_EDOM
);
5938 } else if (mdcache
->export_ephemeral_random_max
< val
) {
5939 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5943 if (!xlock_policylock(mdr
, cur
))
5946 auto pi
= cur
->project_inode(mdr
);
5947 cur
->setxattr_ephemeral_rand(val
);
5948 pip
= pi
.inode
.get();
5949 } else if (name
== "ceph.dir.pin.distributed"sv
) {
5950 if (!cur
->is_dir() || cur
->is_root()) {
5951 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5957 val
= boost::lexical_cast
<bool>(value
);
5958 } catch (boost::bad_lexical_cast
const&) {
5959 dout(10) << "bad vxattr value, unable to parse bool for " << name
<< dendl
;
5960 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5964 if (!xlock_policylock(mdr
, cur
))
5967 auto pi
= cur
->project_inode(mdr
);
5968 cur
->setxattr_ephemeral_dist(val
);
5969 pip
= pi
.inode
.get();
5971 dout(10) << " unknown vxattr " << name
<< dendl
;
5972 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5977 pip
->ctime
= mdr
->get_op_stamp();
5978 if (mdr
->get_op_stamp() > pip
->rstat
.rctime
)
5979 pip
->rstat
.rctime
= mdr
->get_op_stamp();
5980 pip
->version
= cur
->pre_dirty();
5982 pip
->update_backtrace();
5985 mdr
->ls
= mdlog
->get_current_segment();
5986 EUpdate
*le
= new EUpdate(mdlog
, "set vxattr layout");
5987 mdlog
->start_entry(le
);
5988 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5989 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5990 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5992 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
,
5993 false, false, adjust_realm
));
5997 void Server::handle_remove_vxattr(MDRequestRef
& mdr
, CInode
*cur
)
5999 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6000 string
name(req
->get_path2());
6002 dout(10) << __func__
<< " " << name
<< " on " << *cur
<< dendl
;
6004 if (name
== "ceph.dir.layout") {
6005 if (!cur
->is_dir()) {
6006 respond_to_request(mdr
, -CEPHFS_ENODATA
);
6009 if (cur
->is_root()) {
6010 dout(10) << "can't remove layout policy on the root directory" << dendl
;
6011 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6015 if (!cur
->get_projected_inode()->has_layout()) {
6016 respond_to_request(mdr
, -CEPHFS_ENODATA
);
6020 MutationImpl::LockOpVec lov
;
6021 lov
.add_xlock(&cur
->policylock
);
6022 if (!mds
->locker
->acquire_locks(mdr
, lov
))
6025 auto pi
= cur
->project_inode(mdr
);
6026 pi
.inode
->clear_layout();
6027 pi
.inode
->version
= cur
->pre_dirty();
6030 mdr
->ls
= mdlog
->get_current_segment();
6031 EUpdate
*le
= new EUpdate(mdlog
, "remove dir layout vxattr");
6032 mdlog
->start_entry(le
);
6033 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6034 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
6035 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
6037 mdr
->no_early_reply
= true;
6038 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
6040 } else if (name
== "ceph.dir.layout.pool_namespace"
6041 || name
== "ceph.file.layout.pool_namespace") {
6042 // Namespace is the only layout field that has a meaningful
6043 // null/none value (empty string, means default layout). Is equivalent
6044 // to a setxattr with empty string: pass through the empty payload of
6045 // the rmxattr request to do this.
6046 handle_set_vxattr(mdr
, cur
);
6050 respond_to_request(mdr
, -CEPHFS_ENODATA
);
// Table of xattr handlers: a catch-all default entry (keyed by
// Server::DEFAULT_HANDLER) plus a dedicated entry for "ceph.mirror.info".
// Each entry supplies validate/setxattr/removexattr callbacks used by
// handle_client_setxattr()/handle_client_removexattr().
// NOTE(review): extraction-mangled text; several original lines (struct
// braces/terminators) are missing from this view. Tokens kept verbatim.
6053 const Server::XattrHandler
Server::xattr_handlers
[] = {
6055 xattr_name
: Server::DEFAULT_HANDLER
,
6056 description
: "default xattr handler",
6057 validate
: &Server::default_xattr_validate
,
6058 setxattr
: &Server::default_setxattr_handler
,
6059 removexattr
: &Server::default_removexattr_handler
,
6062 xattr_name
: "ceph.mirror.info",
6063 description
: "mirror info xattr handler",
6064 validate
: &Server::mirror_info_xattr_validate
,
6065 setxattr
: &Server::mirror_info_setxattr_handler
,
6066 removexattr
: &Server::mirror_info_removexattr_handler
// Look up the handler whose xattr_name matches exactly; while scanning,
// remember the DEFAULT_HANDLER entry. Falls back to (and asserts the
// existence of) the default handler when no exact match is found.
// The ceph_assert at 6075 also enforces that the table contains at most
// one default entry.
6070 const Server::XattrHandler
* Server::get_xattr_or_default_handler(std::string_view xattr_name
) {
6071 const XattrHandler
*default_xattr_handler
= nullptr;
6073 for (auto &handler
: xattr_handlers
) {
6074 if (handler
.xattr_name
== Server::DEFAULT_HANDLER
) {
6075 ceph_assert(default_xattr_handler
== nullptr);
6076 default_xattr_handler
= &handler
;
6078 if (handler
.xattr_name
== xattr_name
) {
6079 dout(20) << "handler=" << handler
.description
<< dendl
;
// (the "return &handler" on the dropped original line ~6080 is not
//  visible in this extraction)
6084 ceph_assert(default_xattr_handler
!= nullptr);
6085 dout(20) << "handler=" << default_xattr_handler
->description
<< dendl
;
6086 return default_xattr_handler
;
// Shared validation for plain (non-virtual) xattr ops.
// - SETXATTR + XATTR_CREATE fails with -CEPHFS_EEXIST if the key exists;
// - SETXATTR + XATTR_REPLACE fails with -CEPHFS_ENODATA if it does not;
// - RMXATTR fails with -CEPHFS_ENODATA if the key is absent.
// Returns 0 on success (on dropped original lines), -CEPHFS_EINVAL for
// unrecognized ops.
// NOTE(review): at 6093 xattrs is dereferenced without a null check while
// 6098 guards with `xattrs &&` — a guard may exist on the dropped original
// line 6092; confirm against the full source before assuming a bug.
6089 int Server::xattr_validate(CInode
*cur
, const InodeStoreBase::xattr_map_const_ptr xattrs
,
6090 const std::string
&xattr_name
, int op
, int flags
) {
6091 if (op
== CEPH_MDS_OP_SETXATTR
) {
6093 if ((flags
& CEPH_XATTR_CREATE
) && xattrs
->count(mempool::mds_co::string(xattr_name
))) {
6094 dout(10) << "setxattr '" << xattr_name
<< "' XATTR_CREATE and CEPHFS_EEXIST on " << *cur
<< dendl
;
6095 return -CEPHFS_EEXIST
;
6098 if ((flags
& CEPH_XATTR_REPLACE
) && !(xattrs
&& xattrs
->count(mempool::mds_co::string(xattr_name
)))) {
6099 dout(10) << "setxattr '" << xattr_name
<< "' XATTR_REPLACE and CEPHFS_ENODATA on " << *cur
<< dendl
;
6100 return -CEPHFS_ENODATA
;
6106 if (op
== CEPH_MDS_OP_RMXATTR
) {
6107 if (!xattrs
|| xattrs
->count(mempool::mds_co::string(xattr_name
)) == 0) {
6108 dout(10) << "removexattr '" << xattr_name
<< "' and CEPHFS_ENODATA on " << *cur
<< dendl
;
6109 return -CEPHFS_ENODATA
;
// unreachable for SETXATTR/RMXATTR: unknown op falls through to EINVAL
6115 derr
<< ": unhandled validation for: " << xattr_name
<< dendl
;
6116 return -CEPHFS_EINVAL
;
// Insert or overwrite one xattr key/value in the projected xattr map.
// The value is copied into a freshly allocated bufferptr; if emplace
// finds the key already present, the existing value is overwritten (6130).
6119 void Server::xattr_set(InodeStoreBase::xattr_map_ptr xattrs
, const std::string
&xattr_name
,
6120 const bufferlist
&xattr_value
) {
6121 size_t len
= xattr_value
.length();
6122 bufferptr b
= buffer::create(len
);
6124 xattr_value
.begin().copy(len
, b
.c_str());
6126 auto em
= xattrs
->emplace(std::piecewise_construct
,
6127 std::forward_as_tuple(mempool::mds_co::string(xattr_name
)),
6128 std::forward_as_tuple(b
));
// emplace is a no-op on an existing key; assign explicitly in that case
// (the `if (!em.second)` guard is on a dropped original line ~6129)
6130 em
.first
->second
= b
;
// Erase one xattr key from the projected xattr map; silently a no-op if
// the key is absent (callers validate existence beforehand).
6134 void Server::xattr_rm(InodeStoreBase::xattr_map_ptr xattrs
, const std::string
&xattr_name
) {
6135 xattrs
->erase(mempool::mds_co::string(xattr_name
));
// Default-handler validate callback: thin adapter that unpacks the XattrOp
// and defers to the shared xattr_validate() above.
6138 int Server::default_xattr_validate(CInode
*cur
, const InodeStoreBase::xattr_map_const_ptr xattrs
,
6139 XattrOp
*xattr_op
) {
6140 return xattr_validate(cur
, xattrs
, xattr_op
->xattr_name
, xattr_op
->op
, xattr_op
->flags
);
// Default-handler set callback: store the request's value under its name
// via xattr_set(); `cur` is unused here.
6143 void Server::default_setxattr_handler(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
6144 const XattrOp
&xattr_op
) {
6145 xattr_set(xattrs
, xattr_op
.xattr_name
, xattr_op
.xattr_value
);
// Default-handler remove callback: erase the named key via xattr_rm();
// `cur` is unused here.
6148 void Server::default_removexattr_handler(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
6149 const XattrOp
&xattr_op
) {
6150 xattr_rm(xattrs
, xattr_op
.xattr_name
);
6153 // mirror info xattr handlers
// Regex for the "ceph.mirror.info" value: "cluster_id=<uuid>" (capture 1);
// the fs_id capture (group 2) continues on a dropped original line (~6157).
6154 const std::string
Server::MirrorXattrInfo::MIRROR_INFO_REGEX
= "^cluster_id=([a-f0-9]{8}-" \
6155 "[a-f0-9]{4}-[a-f0-9]{4}-" \
6156 "[a-f0-9]{4}-[a-f0-9]{12})" \
// Internal xattr keys the mirror-info handlers actually store/remove.
6158 const std::string
Server::MirrorXattrInfo::CLUSTER_ID
= "ceph.mirror.info.cluster_id";
6159 const std::string
Server::MirrorXattrInfo::FS_ID
= "ceph.mirror.info.fs_id";
// Parse a "ceph.mirror.info" xattr value into cluster_id/fs_id out-params.
// Uses a function-local static std::regex compiled once from
// MIRROR_INFO_REGEX; expects exactly 3 match groups (whole match +
// cluster_id + fs_id), else returns -CEPHFS_EINVAL.
// Returns 0 on success (return on a dropped original line).
6160 int Server::parse_mirror_info_xattr(const std::string
&name
, const std::string
&value
,
6161 std::string
&cluster_id
, std::string
&fs_id
) {
6162 dout(20) << "parsing name=" << name
<< ", value=" << value
<< dendl
;
6164 static const std::regex
regex(Server::MirrorXattrInfo::MIRROR_INFO_REGEX
);
6167 std::regex_search(value
, match
, regex
);
6168 if (match
.size() != 3) {
6169 derr
<< "mirror info parse error" << dendl
;
6170 return -CEPHFS_EINVAL
;
6173 cluster_id
= match
[1];
// (fs_id = match[2] is on a dropped original line ~6174)
6175 dout(20) << " parsed cluster_id=" << cluster_id
<< ", fs_id=" << fs_id
<< dendl
;
// Validate a set/remove of "ceph.mirror.info": only permitted on the root
// inode. Validates both internal keys (CLUSTER_ID and FS_ID) and treats a
// mismatch between the two results as corrupt state (-CEPHFS_EINVAL).
// For SETXATTR the value is parsed and the resulting MirrorXattrInfo is
// stashed on the XattrOp for the set handler to consume (6208).
// NOTE(review): the comparison of v1/v2 and the RMXATTR early-return logic
// sit on dropped original lines (6187, 6197-6199); hedged accordingly.
6179 int Server::mirror_info_xattr_validate(CInode
*cur
, const InodeStoreBase::xattr_map_const_ptr xattrs
,
6180 XattrOp
*xattr_op
) {
6181 if (!cur
->is_root()) {
6182 return -CEPHFS_EINVAL
;
6185 int v1
= xattr_validate(cur
, xattrs
, Server::MirrorXattrInfo::CLUSTER_ID
, xattr_op
->op
, xattr_op
->flags
);
6186 int v2
= xattr_validate(cur
, xattrs
, Server::MirrorXattrInfo::FS_ID
, xattr_op
->op
, xattr_op
->flags
);
6188 derr
<< "inconsistent mirror info state (" << v1
<< "," << v2
<< ")" << dendl
;
6189 return -CEPHFS_EINVAL
;
6196 if (xattr_op
->op
== CEPH_MDS_OP_RMXATTR
) {
6200 std::string cluster_id
;
6202 int r
= parse_mirror_info_xattr(xattr_op
->xattr_name
, xattr_op
->xattr_value
.to_str(),
// parsed ids are carried to the setxattr handler via xattr_op->xinfo
6208 xattr_op
->xinfo
= std::make_unique
<MirrorXattrInfo
>(cluster_id
, fs_id
);
// Apply a validated "ceph.mirror.info" set: store the parsed cluster_id
// and fs_id under their dedicated internal keys (CLUSTER_ID / FS_ID).
// xattr_op.xinfo was populated by mirror_info_xattr_validate().
6212 void Server::mirror_info_setxattr_handler(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
6213 const XattrOp
&xattr_op
) {
6214 auto mirror_info
= dynamic_cast<MirrorXattrInfo
&>(*(xattr_op
.xinfo
));
6217 bl
.append(mirror_info
.cluster_id
.c_str(), mirror_info
.cluster_id
.length());
6218 xattr_set(xattrs
, Server::MirrorXattrInfo::CLUSTER_ID
, bl
);
// (bl is cleared between the two appends on a dropped original line ~6220)
6221 bl
.append(mirror_info
.fs_id
.c_str(), mirror_info
.fs_id
.length());
6222 xattr_set(xattrs
, Server::MirrorXattrInfo::FS_ID
, bl
);
// Remove "ceph.mirror.info": drop both internal keys it maps to.
6225 void Server::mirror_info_removexattr_handler(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
6226 const XattrOp
&xattr_op
) {
6227 xattr_rm(xattrs
, Server::MirrorXattrInfo::CLUSTER_ID
);
6228 xattr_rm(xattrs
, Server::MirrorXattrInfo::FS_ID
);
// Handle a client SETXATTR request.
// Flow: route ceph.* virtual xattrs to handle_set_vxattr(); reject other
// "ceph."-prefixed names; refuse writes on snapshots (EROFS); xlock the
// inode's xattrlock; enforce the aggregate kv-size cap
// (mds_max_xattr_pairs_size -> ENOSPC); run the per-name handler's
// validate/set callbacks on a projected inode; then journal an EUpdate and
// reply via C_MDS_inode_update_finish.
// NOTE(review): several early-return lines after respond_to_request()
// calls are dropped in this extraction; control flow hedged accordingly.
6231 void Server::handle_client_setxattr(MDRequestRef
& mdr
)
6233 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6234 string
name(req
->get_path2());
6236 // is a ceph virtual xattr?
6237 if (is_ceph_vxattr(name
)) {
6238 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6239 CInode
*cur
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
6243 handle_set_vxattr(mdr
, cur
);
6247 if (!is_allowed_ceph_xattr(name
)) {
6248 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6252 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
// snapshots are read-only
6256 if (mdr
->snapid
!= CEPH_NOSNAP
) {
6257 respond_to_request(mdr
, -CEPHFS_EROFS
);
6261 int flags
= req
->head
.args
.setxattr
.flags
;
6263 MutationImpl::LockOpVec lov
;
6264 lov
.add_xlock(&cur
->xattrlock
);
6265 if (!mds
->locker
->acquire_locks(mdr
, lov
))
6268 if (!check_access(mdr
, cur
, MAY_WRITE
))
6271 size_t len
= req
->get_data().length();
6272 size_t inc
= len
+ name
.length();
6274 auto handler
= Server::get_xattr_or_default_handler(name
);
6275 const auto& pxattrs
= cur
->get_projected_xattrs();
6277 // check xattrs kv pairs size
6278 size_t cur_xattrs_size
= 0;
6279 for (const auto& p
: *pxattrs
) {
// a REPLACE of an existing key does not add to the running size
6280 if ((flags
& CEPH_XATTR_REPLACE
) && name
.compare(p
.first
) == 0) {
6283 cur_xattrs_size
+= p
.first
.length() + p
.second
.length();
6286 if (((cur_xattrs_size
+ inc
) > g_conf()->mds_max_xattr_pairs_size
)) {
6287 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
6288 << cur_xattrs_size
<< ", inc " << inc
<< dendl
;
6289 respond_to_request(mdr
, -CEPHFS_ENOSPC
);
6294 XattrOp
xattr_op(CEPH_MDS_OP_SETXATTR
, name
, req
->get_data(), flags
);
6295 int r
= std::invoke(handler
->validate
, this, cur
, pxattrs
, &xattr_op
);
6297 respond_to_request(mdr
, r
);
6301 dout(10) << "setxattr '" << name
<< "' len " << len
<< " on " << *cur
<< dendl
;
// project the inode and stamp metadata-change bookkeeping
6304 auto pi
= cur
->project_inode(mdr
, true);
6305 pi
.inode
->version
= cur
->pre_dirty();
6306 pi
.inode
->ctime
= mdr
->get_op_stamp();
6307 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
6308 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
// setting the fscrypt context marks the inode as encrypted
6309 if (name
== "encryption.ctx"sv
)
6310 pi
.inode
->fscrypt
= true;
6311 pi
.inode
->change_attr
++;
6312 pi
.inode
->xattr_version
++;
6314 if ((flags
& CEPH_XATTR_REMOVE
)) {
6315 std::invoke(handler
->removexattr
, this, cur
, pi
.xattrs
, xattr_op
);
6317 std::invoke(handler
->setxattr
, this, cur
, pi
.xattrs
, xattr_op
);
// journal the update and reply when it commits
6321 mdr
->ls
= mdlog
->get_current_segment();
6322 EUpdate
*le
= new EUpdate(mdlog
, "setxattr");
6323 mdlog
->start_entry(le
);
6324 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6325 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
6326 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
6328 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
// Handle a client RMXATTR request.
// Mirrors handle_client_setxattr: vxattrs go to handle_remove_vxattr();
// other "ceph."-prefixed names are rejected; snapshots are EROFS; the
// inode's xattrlock is xlocked; the per-name handler validates then
// removes on a projected inode; the change is journaled as an EUpdate and
// the reply is sent by C_MDS_inode_update_finish.
6331 void Server::handle_client_removexattr(MDRequestRef
& mdr
)
6333 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6334 std::string
name(req
->get_path2());
6336 // is a ceph virtual xattr?
6337 if (is_ceph_vxattr(name
)) {
6338 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6339 CInode
*cur
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
6343 handle_remove_vxattr(mdr
, cur
);
6347 if (!is_allowed_ceph_xattr(name
)) {
6348 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6352 CInode
* cur
= rdlock_path_pin_ref(mdr
, true);
// snapshots are read-only
6356 if (mdr
->snapid
!= CEPH_NOSNAP
) {
6357 respond_to_request(mdr
, -CEPHFS_EROFS
);
6361 MutationImpl::LockOpVec lov
;
6362 lov
.add_xlock(&cur
->xattrlock
);
6363 if (!mds
->locker
->acquire_locks(mdr
, lov
))
6367 auto handler
= Server::get_xattr_or_default_handler(name
);
// empty bufferlist: removal carries no value payload
6369 XattrOp
xattr_op(CEPH_MDS_OP_RMXATTR
, name
, bl
, 0);
6371 const auto& pxattrs
= cur
->get_projected_xattrs();
6372 int r
= std::invoke(handler
->validate
, this, cur
, pxattrs
, &xattr_op
);
6374 respond_to_request(mdr
, r
);
6378 dout(10) << "removexattr '" << name
<< "' on " << *cur
<< dendl
;
// project the inode and stamp metadata-change bookkeeping
6381 auto pi
= cur
->project_inode(mdr
, true);
6382 pi
.inode
->version
= cur
->pre_dirty();
6383 pi
.inode
->ctime
= mdr
->get_op_stamp();
6384 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
6385 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
6386 pi
.inode
->change_attr
++;
6387 pi
.inode
->xattr_version
++;
6388 std::invoke(handler
->removexattr
, this, cur
, pi
.xattrs
, xattr_op
);
// journal the update and reply when it commits
6391 mdr
->ls
= mdlog
->get_current_segment();
6392 EUpdate
*le
= new EUpdate(mdlog
, "removexattr");
6393 mdlog
->start_entry(le
);
6394 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6395 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
6396 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
6398 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
// Handle a read of a ceph virtual xattr (GETVXATTR).
// Supports the layout family (ceph.dir.layout* / ceph.file.layout*: JSON
// dump, pool_name, pool_id) and the ceph.dir.pin family (pin, pin.random,
// pin.distributed). Unknown names yield -CEPHFS_ENODATA. The string result
// is encoded into reply_extra_bl with versioned encoding.
// Layout resolution walks up the parent chain until an inode with an
// explicit layout is found, classifying it DEFAULT/SET/INHERITED.
6401 void Server::handle_client_getvxattr(MDRequestRef
& mdr
)
6403 const auto& req
= mdr
->client_request
;
6404 string xattr_name
{req
->get_path2()};
6406 // is a ceph virtual xattr?
6407 if (!is_ceph_vxattr(xattr_name
)) {
6408 respond_to_request(mdr
, -CEPHFS_ENODATA
);
6412 CInode
*cur
= rdlock_path_pin_ref(mdr
, true, false);
// dir-only vxattrs on non-dirs (and vice versa) have no data
6417 if (is_ceph_dir_vxattr(xattr_name
)) {
6418 if (!cur
->is_dir()) {
6419 respond_to_request(mdr
, -CEPHFS_ENODATA
);
6422 } else if (is_ceph_file_vxattr(xattr_name
)) {
6423 if (cur
->is_dir()) {
6424 respond_to_request(mdr
, -CEPHFS_ENODATA
);
6429 CachedStackStringStream css
;
6431 ceph::bufferlist bl
;
6432 // handle these vxattrs
6433 if ((xattr_name
.substr(0, 15) == "ceph.dir.layout"sv
) ||
6434 (xattr_name
.substr(0, 16) == "ceph.file.layout"sv
)) {
6435 std::string layout_field
;
// local helper type: a layout plus where it came from
6437 struct layout_xattr_info_t
{
6438 enum class InheritanceStatus
: uint32_t {
6444 const file_layout_t layout
;
6445 const InheritanceStatus status
;
6447 layout_xattr_info_t(const file_layout_t
& l
, InheritanceStatus inh
)
6448 : layout(l
), status(inh
) { }
6450 static std::string
status_to_string(InheritanceStatus status
) {
6452 case InheritanceStatus::DEFAULT
: return "default"s
;
6453 case InheritanceStatus::SET
: return "set"s
;
6454 case InheritanceStatus::INHERITED
: return "inherited"s
;
6455 default: return "unknown"s
;
6460 auto is_default_layout
= [&](const file_layout_t
& layout
) -> bool {
6461 return (layout
== mdcache
->default_file_layout
);
// walk ancestors until an explicit layout (or the root) is found
6463 auto get_inherited_layout
= [&](CInode
*cur
) -> layout_xattr_info_t
{
6467 if (cur
->get_projected_inode()->has_layout()) {
6468 auto& curr_layout
= cur
->get_projected_inode()->layout
;
6469 if (is_default_layout(curr_layout
)) {
6470 return {curr_layout
, layout_xattr_info_t::InheritanceStatus::DEFAULT
};
6472 if (cur
== orig_in
) {
6473 // we've found a new layout at this inode
6474 return {curr_layout
, layout_xattr_info_t::InheritanceStatus::SET
};
6476 return {curr_layout
, layout_xattr_info_t::InheritanceStatus::INHERITED
};
6480 if (cur
->is_root()) {
6484 cur
= cur
->get_projected_parent_dir()->get_inode();
// reaching here means the walk exhausted without a layout — fatal
6486 mds
->clog
->error() << "no layout found at root dir!";
6487 ceph_abort("no layout found at root dir! something is really messed up with layouts!");
6490 if (xattr_name
== "ceph.dir.layout.json"sv
||
6491 xattr_name
== "ceph.file.layout.json"sv
) {
6492 // fetch layout only for valid xattr_name
6493 const auto lxi
= get_inherited_layout(cur
);
// emit the layout as a JSON object; pool name resolved via the OSDMap
6495 *css
<< "{\"stripe_unit\": " << lxi
.layout
.stripe_unit
6496 << ", \"stripe_count\": " << lxi
.layout
.stripe_count
6497 << ", \"object_size\": " << lxi
.layout
.object_size
6498 << ", \"pool_name\": ";
6499 mds
->objecter
->with_osdmap([lxi
, &css
](const OSDMap
& o
) {
6501 if (o
.have_pg_pool(lxi
.layout
.pool_id
)) {
6502 *css
<< o
.get_pool_name(lxi
.layout
.pool_id
);
6506 *css
<< ", \"pool_id\": " << (uint64_t)lxi
.layout
.pool_id
;
6507 *css
<< ", \"pool_namespace\": \"" << lxi
.layout
.pool_ns
<< "\"";
6508 *css
<< ", \"inheritance\": \"@"
6509 << layout_xattr_info_t::status_to_string(lxi
.status
) << "\"}";
6510 } else if ((xattr_name
== "ceph.dir.layout.pool_name"sv
) ||
6511 (xattr_name
== "ceph.file.layout.pool_name"sv
)) {
6512 // fetch layout only for valid xattr_name
6513 const auto lxi
= get_inherited_layout(cur
);
6514 mds
->objecter
->with_osdmap([lxi
, &css
](const OSDMap
& o
) {
6515 if (o
.have_pg_pool(lxi
.layout
.pool_id
)) {
6516 *css
<< o
.get_pool_name(lxi
.layout
.pool_id
);
6519 } else if ((xattr_name
== "ceph.dir.layout.pool_id"sv
) ||
6520 (xattr_name
== "ceph.file.layout.pool_id"sv
)) {
6521 // fetch layout only for valid xattr_name
6522 const auto lxi
= get_inherited_layout(cur
);
6523 *css
<< (uint64_t)lxi
.layout
.pool_id
;
6525 r
= -CEPHFS_ENODATA
; // no such attribute
6527 } else if (xattr_name
.substr(0, 12) == "ceph.dir.pin"sv
) {
6528 if (xattr_name
== "ceph.dir.pin"sv
) {
6529 *css
<< cur
->get_projected_inode()->export_pin
;
6530 } else if (xattr_name
== "ceph.dir.pin.random"sv
) {
6531 *css
<< cur
->get_projected_inode()->export_ephemeral_random_pin
;
6532 } else if (xattr_name
== "ceph.dir.pin.distributed"sv
) {
6533 *css
<< cur
->get_projected_inode()->export_ephemeral_distributed_pin
;
6535 // otherwise respond as invalid request
6536 // since we only handle ceph vxattrs here
6537 r
= -CEPHFS_ENODATA
; // no such attribute
6540 // otherwise respond as invalid request
6541 // since we only handle ceph vxattrs here
6542 r
= -CEPHFS_ENODATA
; // no such attribute
// on success, encode the rendered string into the reply payload
6546 ENCODE_START(1, 1, bl
);
6547 encode(css
->strv(), bl
);
6549 mdr
->reply_extra_bl
= bl
;
6552 respond_to_request(mdr
, r
);
6555 // =================================================================
6556 // DIRECTORY and NAMESPACE OPS
6559 // ------------------------------------------------
// Journal-commit callback shared by mknod/mkdir/symlink: finalizes the new
// dentry/inode once the EUpdate is safely logged — pops the projected
// linkage, marks the new inode (and a new dir's dirfrag) dirty, notifies
// replicas of the dentry link, shares max_size for files / picks ephemeral
// pins for dirs, bumps balancer stats, and replies 0 to the client.
// NOTE(review): member declarations (dn, newi) sit on dropped original
// lines 6564-6566.
6563 class C_MDS_mknod_finish
: public ServerLogContext
{
6567 C_MDS_mknod_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ni
) :
6568 ServerLogContext(s
, r
), dn(d
), newi(ni
) {}
6569 void finish(int r
) override
{
6570 ceph_assert(r
== 0);
// link the inode in place of the projected (null -> primary) linkage
6573 dn
->pop_projected_linkage();
6575 // be a bit hacky with the inode version, here.. we decrement it
6576 // just to keep mark_dirty() happy. (we didn't bother projecting
6577 // a new version of the inode since it's just been created)
6578 newi
->mark_dirty(mdr
->ls
);
6579 newi
->mark_dirty_parent(mdr
->ls
, true);
// mkdir: also dirty the new (empty) dirfrag
6582 if (newi
->is_dir()) {
6583 CDir
*dir
= newi
->get_dirfrag(frag_t());
6585 dir
->mark_dirty(mdr
->ls
);
6586 dir
->mark_new(mdr
->ls
);
6591 MDRequestRef null_ref
;
6592 get_mds()->mdcache
->send_dentry_link(dn
, null_ref
);
6594 if (newi
->is_file()) {
6595 get_mds()->locker
->share_inode_max_size(newi
);
6596 } else if (newi
->is_dir()) {
6597 // We do this now so that the linkages on the new directory are stable.
6598 newi
->maybe_ephemeral_rand();
// hit the new inode (and via it the dir) for load balancing
6602 get_mds()->balancer
->hit_inode(newi
, META_POP_IWR
);
6605 server
->respond_to_request(mdr
, 0);
// Handle a client MKNOD request: create a new non-directory inode.
// Validates mode/access/fragment-space/entry-count/alternate-name, then
// allocates the inode, projects the dentry linkage, pre-issues RDWR caps
// with EXCL lock states for regular files (clients typically write right
// after mknod, e.g. NFS reexport), and journals the create as an EUpdate
// finished by C_MDS_mknod_finish.
6610 void Server::handle_client_mknod(MDRequestRef
& mdr
)
6612 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6613 client_t client
= mdr
->get_client();
// a mode with no file-type bits is invalid for mknod
6615 unsigned mode
= req
->head
.args
.mknod
.mode
;
6616 if ((mode
& S_IFMT
) == 0)
6619 mdr
->disable_lock_cache();
6620 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, true, false, S_ISREG(mode
));
6624 CDir
*dir
= dn
->get_dir();
6625 CInode
*diri
= dir
->get_inode();
6626 if (!check_access(mdr
, diri
, MAY_WRITE
))
6628 if (!check_fragment_space(mdr
, dir
))
6630 if (!check_dir_max_entries(mdr
, dir
))
6633 ceph_assert(dn
->get_projected_linkage()->is_null());
6634 if (req
->get_alternate_name().size() > alternate_name_max
) {
6635 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
6636 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
6639 dn
->set_alternate_name(req
->get_alternate_name());
// pick the effective file layout: inherited dir layout or fs default
6642 file_layout_t layout
;
6643 if (mdr
->dir_layout
!= file_layout_t())
6644 layout
= mdr
->dir_layout
;
6646 layout
= mdcache
->default_file_layout
;
6648 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
), mode
, &layout
);
6651 dn
->push_projected_linkage(newi
);
// initialize the fresh inode's stats and backtrace
6653 auto _inode
= newi
->_get_inode();
6654 _inode
->version
= dn
->pre_dirty();
6655 _inode
->rdev
= req
->head
.args
.mknod
.rdev
;
6656 _inode
->rstat
.rfiles
= 1;
6657 _inode
->accounted_rstat
= _inode
->rstat
;
6658 if (layout
.pool_id
!= mdcache
->default_file_layout
.pool_id
)
6659 _inode
->add_old_pool(mdcache
->default_file_layout
.pool_id
);
6660 _inode
->update_backtrace();
6662 snapid_t follows
= mdcache
->get_global_snaprealm()->get_newest_seq();
6663 SnapRealm
*realm
= dn
->get_dir()->inode
->find_snaprealm();
6664 ceph_assert(follows
>= realm
->get_newest_seq());
6666 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
6667 // want to write to it (e.g., if they are reexporting NFS)
6668 if (S_ISREG(_inode
->mode
)) {
6669 // issue a cap on the file
6670 int cmode
= CEPH_FILE_MODE_RDWR
;
6671 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
, realm
);
6675 // put locks in excl mode
6676 newi
->filelock
.set_state(LOCK_EXCL
);
6677 newi
->authlock
.set_state(LOCK_EXCL
);
6678 newi
->xattrlock
.set_state(LOCK_EXCL
);
6680 dout(15) << " setting a client_range too, since this is a regular file" << dendl
;
6681 _inode
->client_ranges
[client
].range
.first
= 0;
6682 _inode
->client_ranges
[client
].range
.last
= _inode
->layout
.stripe_unit
;
6683 _inode
->client_ranges
[client
].follows
= follows
;
6684 newi
->mark_clientwriteable();
6685 cap
->mark_clientwriteable();
6689 ceph_assert(dn
->first
== follows
+ 1);
6690 newi
->first
= dn
->first
;
6692 dout(10) << "mknod mode " << _inode
->mode
<< " rdev " << _inode
->rdev
<< dendl
;
// journal the creation and reply on commit
6695 mdr
->ls
= mdlog
->get_current_segment();
6696 EUpdate
*le
= new EUpdate(mdlog
, "mknod");
6697 mdlog
->start_entry(le
);
6698 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6699 journal_allocated_inos(mdr
, &le
->metablob
);
6701 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(),
6702 PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
6703 le
->metablob
.add_primary_dentry(dn
, newi
, true, true, true);
6705 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
6706 mds
->balancer
->maybe_fragment(dn
->get_dir(), false);
6712 /* This function takes responsibility for the passed mdr*/
// Handle a client MKDIR request: create a new directory inode plus its
// initial (empty, complete) dirfrag, issue RDWR caps with EXCL lock
// states on the new dir, and journal an EUpdate finished by
// C_MDS_mknod_finish (which also dirties the new dirfrag).
6713 void Server::handle_client_mkdir(MDRequestRef
& mdr
)
6715 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6717 mdr
->disable_lock_cache();
6718 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, true);
6722 CDir
*dir
= dn
->get_dir();
6723 CInode
*diri
= dir
->get_inode();
6725 // mkdir check access
6726 if (!check_access(mdr
, diri
, MAY_WRITE
))
6729 if (!check_fragment_space(mdr
, dir
))
6731 if (!check_dir_max_entries(mdr
, dir
))
6734 ceph_assert(dn
->get_projected_linkage()->is_null());
6735 if (req
->get_alternate_name().size() > alternate_name_max
) {
6736 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
6737 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
6740 dn
->set_alternate_name(req
->get_alternate_name());
// new inode
6743 unsigned mode
= req
->head
.args
.mkdir
.mode
;
6746 CInode
*newi
= prepare_new_inode(mdr
, dir
, inodeno_t(req
->head
.ino
), mode
);
6749 // it's a directory.
6750 dn
->push_projected_linkage(newi
);
6752 auto _inode
= newi
->_get_inode();
6753 _inode
->version
= dn
->pre_dirty();
6754 _inode
->rstat
.rsubdirs
= 1;
6755 _inode
->accounted_rstat
= _inode
->rstat
;
6756 _inode
->update_backtrace();
6758 snapid_t follows
= mdcache
->get_global_snaprealm()->get_newest_seq();
6759 SnapRealm
*realm
= dn
->get_dir()->inode
->find_snaprealm();
6760 ceph_assert(follows
>= realm
->get_newest_seq());
6762 dout(12) << " follows " << follows
<< dendl
;
6763 ceph_assert(dn
->first
== follows
+ 1);
6764 newi
->first
= dn
->first
;
6766 // ...and that new dir is empty.
6767 CDir
*newdir
= newi
->get_or_open_dirfrag(mdcache
, frag_t());
6768 newdir
->state_set(CDir::STATE_CREATING
);
6769 newdir
->mark_complete();
6770 newdir
->_get_fnode()->version
= newdir
->pre_dirty();
// journal the creation and reply on commit
6773 mdr
->ls
= mdlog
->get_current_segment();
6774 EUpdate
*le
= new EUpdate(mdlog
, "mkdir");
6775 mdlog
->start_entry(le
);
6776 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6777 journal_allocated_inos(mdr
, &le
->metablob
);
6778 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
6779 le
->metablob
.add_primary_dentry(dn
, newi
, true, true);
6780 le
->metablob
.add_new_dir(newdir
); // dirty AND complete AND new
6782 // issue a cap on the directory
6783 int cmode
= CEPH_FILE_MODE_RDWR
;
6784 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
, realm
);
6788 // put locks in excl mode
6789 newi
->filelock
.set_state(LOCK_EXCL
);
6790 newi
->authlock
.set_state(LOCK_EXCL
);
6791 newi
->xattrlock
.set_state(LOCK_EXCL
);
6794 // make sure this inode gets into the journal
6795 le
->metablob
.add_opened_ino(newi
->ino());
6797 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
6799 // We hit_dir (via hit_inode) in our finish callback, but by then we might
6800 // have overshot the split size (multiple mkdir in flight), so here is
6801 // an early chance to split the dir if this mkdir makes it oversized.
6802 mds
->balancer
->maybe_fragment(dir
, false);
// Handle a client SYMLINK request: create a new S_IFLNK|0777 inode whose
// symlink target is the request's path2; size/rbytes reflect the target
// string length. Journaled as an EUpdate finished by C_MDS_mknod_finish.
6808 void Server::handle_client_symlink(MDRequestRef
& mdr
)
6810 const auto& req
= mdr
->client_request
;
6812 mdr
->disable_lock_cache();
6813 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, true);
6817 CDir
*dir
= dn
->get_dir();
6818 CInode
*diri
= dir
->get_inode();
6820 if (!check_access(mdr
, diri
, MAY_WRITE
))
6822 if (!check_fragment_space(mdr
, dir
))
6824 if (!check_dir_max_entries(mdr
, dir
))
6827 ceph_assert(dn
->get_projected_linkage()->is_null());
6828 if (req
->get_alternate_name().size() > alternate_name_max
) {
6829 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
6830 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
6832 dn
->set_alternate_name(req
->get_alternate_name());
// symlinks are always created mode lrwxrwxrwx
6834 unsigned mode
= S_IFLNK
| 0777;
6835 CInode
*newi
= prepare_new_inode(mdr
, dir
, inodeno_t(req
->head
.ino
), mode
);
6839 dn
->push_projected_linkage(newi
);
// the link target travels in path2 of the request
6841 newi
->symlink
= req
->get_path2();
6842 auto _inode
= newi
->_get_inode();
6843 _inode
->version
= dn
->pre_dirty();
6844 _inode
->size
= newi
->symlink
.length();
6845 _inode
->rstat
.rbytes
= _inode
->size
;
6846 _inode
->rstat
.rfiles
= 1;
6847 _inode
->accounted_rstat
= _inode
->rstat
;
6848 _inode
->update_backtrace();
6850 newi
->first
= dn
->first
;
// journal the creation and reply on commit
6853 mdr
->ls
= mdlog
->get_current_segment();
6854 EUpdate
*le
= new EUpdate(mdlog
, "symlink");
6855 mdlog
->start_entry(le
);
6856 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6857 journal_allocated_inos(mdr
, &le
->metablob
);
6858 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
6859 le
->metablob
.add_primary_dentry(dn
, newi
, true, true);
6861 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
6862 mds
->balancer
->maybe_fragment(dir
, false);
// Handle a client LINK request (hard link).
// Resolves the target either by ino (path2 depth 0) or by path; locks the
// new dentry plus the target's snaplock/linklock; rejects directories,
// zero-nlink targets, and cross-subvolume links (EXDEV). Dispatches to
// _link_local() when this MDS is auth for the target, otherwise to the
// distributed _link_remote() path.
// NOTE(review): many early-return lines after respond_to_request() calls
// are dropped in this extraction.
6871 void Server::handle_client_link(MDRequestRef
& mdr
)
6873 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6875 dout(7) << "handle_client_link " << req
->get_filepath()
6876 << " to " << req
->get_filepath2()
6879 mdr
->disable_lock_cache();
// target given by ino: look it up directly, recover from ESTALE via peers
6884 if (req
->get_filepath2().depth() == 0) {
6885 targeti
= mdcache
->get_inode(req
->get_filepath2().get_ino());
6887 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl
;
6888 mdcache
->find_ino_peers(req
->get_filepath2().get_ino(), new C_MDS_TryFindInode(this, mdr
));
// take a snap-layout rdlock on the target's parent before proceeding
6893 if (!(mdr
->locking_state
& MutationImpl::SNAP2_LOCKED
)) {
6894 CDentry
*pdn
= targeti
->get_projected_parent_dn();
6896 dout(7) << "target has no parent dn, failing..." << dendl
;
6897 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6900 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
, 1))
6902 mdr
->locking_state
|= MutationImpl::SNAP2_LOCKED
;
6905 destdn
= rdlock_path_xlock_dentry(mdr
, false);
// target given by path: lock both paths, xlock the destination dentry
6909 auto ret
= rdlock_two_paths_xlock_destdn(mdr
, false);
6914 if (!destdn
->get_projected_linkage()->is_null()) {
6915 respond_to_request(mdr
, -CEPHFS_EEXIST
);
6919 targeti
= ret
.second
->get_projected_linkage()->get_inode();
6922 ceph_assert(destdn
->get_projected_linkage()->is_null());
6923 if (req
->get_alternate_name().size() > alternate_name_max
) {
6924 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
6925 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
6928 destdn
->set_alternate_name(req
->get_alternate_name());
// hard links to directories are not allowed
6930 if (targeti
->is_dir()) {
6931 dout(7) << "target is a dir, failing..." << dendl
;
6932 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6936 CDir
*dir
= destdn
->get_dir();
6937 dout(7) << "handle_client_link link " << destdn
->get_name() << " in " << *dir
<< dendl
;
6938 dout(7) << "target is " << *targeti
<< dendl
;
6940 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
6941 MutationImpl::LockOpVec lov
;
6942 lov
.add_xlock(&targeti
->snaplock
);
6943 lov
.add_xlock(&targeti
->linklock
);
6945 if (!mds
->locker
->acquire_locks(mdr
, lov
))
6948 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
// can't link to an inode already slated for removal
6951 if (targeti
->get_projected_inode()->nlink
== 0) {
6952 dout(7) << "target has no link, failing..." << dendl
;
6953 respond_to_request(mdr
, -CEPHFS_ENOENT
);
// access/space checks only on the first pass (before any peer witnessed)
6957 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
6958 if (!check_access(mdr
, targeti
, MAY_WRITE
))
6961 if (!check_access(mdr
, dir
->get_inode(), MAY_WRITE
))
6964 if (!check_fragment_space(mdr
, dir
))
6967 if (!check_dir_max_entries(mdr
, dir
))
// hard links across subvolume boundaries are forbidden
6971 CInode
* target_pin
= targeti
->get_projected_parent_dir()->inode
;
6972 SnapRealm
*target_realm
= target_pin
->find_snaprealm();
6973 if (target_pin
!= dir
->inode
&&
6974 target_realm
->get_subvolume_ino() !=
6975 dir
->inode
->find_snaprealm()->get_subvolume_ino()) {
6976 dout(7) << "target is in different subvolume, failing..." << dendl
;
6977 respond_to_request(mdr
, -CEPHFS_EXDEV
);
// test injection point for link failure handling
6982 ceph_assert(g_conf()->mds_kill_link_at
!= 1);
// local or remote?
6985 if (targeti
->is_auth())
6986 _link_local(mdr
, destdn
, targeti
, target_realm
);
6988 _link_remote(mdr
, true, destdn
, targeti
);
6989 mds
->balancer
->maybe_fragment(dir
, false);
// Journal-commit callback for the local hard-link path: forwards the
// captured dentry/target/versions/realm-adjust flag to
// Server::_link_local_finish() once the EUpdate is safely logged.
// NOTE(review): member declarations (dn, targeti, dnpv, tipv,
// adjust_realm) sit on dropped original lines 6994-6999.
6993 class C_MDS_link_local_finish
: public ServerLogContext
{
7000 C_MDS_link_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ti
,
7001 version_t dnpv_
, version_t tipv_
, bool ar
) :
7002 ServerLogContext(s
, r
), dn(d
), targeti(ti
),
7003 dnpv(dnpv_
), tipv(tipv_
), adjust_realm(ar
) { }
7004 void finish(int r
) override
{
7005 ceph_assert(r
== 0);
7006 server
->_link_local_finish(mdr
, dn
, targeti
, dnpv
, tipv
, adjust_realm
);
// Perform a hard link when this MDS is auth for the target inode.
// Projects an nlink/ctime update on the target, makes the target's
// snaprealm global when it is being linked outside a subvolume for the
// first time (adjust_realm), journals a remote dentry pointing at the
// target's ino, and completes via C_MDS_link_local_finish.
7011 void Server::_link_local(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
, SnapRealm
*target_realm
)
7013 dout(10) << "_link_local " << *dn
<< " to " << *targeti
<< dendl
;
7015 mdr
->ls
= mdlog
->get_current_segment();
7017 // predirty NEW dentry
7018 version_t dnpv
= dn
->pre_dirty();
7019 version_t tipv
= targeti
->pre_dirty();
7021 // project inode update
7022 auto pi
= targeti
->project_inode(mdr
);
// (nlink++ sits on a dropped original line ~7023)
7024 pi
.inode
->ctime
= mdr
->get_op_stamp();
7025 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
7026 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
7027 pi
.inode
->change_attr
++;
7028 pi
.inode
->version
= tipv
;
// first link from outside a subvolume: make the snaprealm global so all
// links observe consistent snapshots
7030 bool adjust_realm
= false;
7031 if (!target_realm
->get_subvolume_ino() && !targeti
->is_projected_snaprealm_global()) {
7032 sr_t
*newsnap
= targeti
->project_snaprealm();
7033 targeti
->mark_snaprealm_global(newsnap
);
7034 targeti
->record_snaprealm_parent_dentry(newsnap
, target_realm
, targeti
->get_projected_parent_dn(), true);
7035 adjust_realm
= true;
// journal the new remote dentry plus the dirtied target inode
7039 EUpdate
*le
= new EUpdate(mdlog
, "link_local");
7040 mdlog
->start_entry(le
);
7041 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
7042 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1); // new dn
7043 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, 0, PREDIRTY_PRIMARY
); // targeti
7044 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
7045 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, targeti
);
7047 // do this after predirty_*, to avoid funky extra dnl arg
7048 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
7050 journal_and_reply(mdr
, targeti
, dn
, le
,
7051 new C_MDS_link_local_finish(this, mdr
, dn
, targeti
, dnpv
, tipv
, adjust_realm
));
// Commit side of _link_local(): pop the projected linkage and wire up the
// remote dentry, mark it dirty at the pre-reserved version, notify
// replicas of the new link, propagate the snaprealm split to clients when
// adjust_realm was set, bump balancer popularity, and reply 0.
7054 void Server::_link_local_finish(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
,
7055 version_t dnpv
, version_t tipv
, bool adjust_realm
)
7057 dout(10) << "_link_local_finish " << *dn
<< " to " << *targeti
<< dendl
;
7059 // link and unlock the NEW dentry
7060 CDentry::linkage_t
*dnl
= dn
->pop_projected_linkage();
7061 if (!dnl
->get_inode())
7062 dn
->link_remote(dnl
, targeti
);
7063 dn
->mark_dirty(dnpv
, mdr
->ls
);
// tell replicas about the new dentry link
7068 MDRequestRef null_ref
;
7069 mdcache
->send_dentry_link(dn
, null_ref
);
// snaprealm became global in _link_local(): broadcast the split
7072 int op
= CEPH_SNAP_OP_SPLIT
;
7073 mds
->mdcache
->send_snap_update(targeti
, 0, op
);
7074 mds
->mdcache
->do_realm_invalidate_and_update_notify(targeti
, op
);
7077 // bump target popularity
7078 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
7079 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
// reply
7082 respond_to_request(mdr
, 0);
7086 // link / unlink remote
// Journal-commit callback for the distributed link/unlink path: forwards
// to Server::_link_remote_finish() with the link-vs-unlink flag (inc),
// the dentry, the target inode, and the dentry's projected version
// captured at construction time.
// NOTE(review): member declarations (inc, dn, targeti, dpv) sit on
// dropped original lines 7089-7093.
7088 class C_MDS_link_remote_finish
: public ServerLogContext
{
7094 C_MDS_link_remote_finish(Server
*s
, MDRequestRef
& r
, bool i
, CDentry
*d
, CInode
*ti
) :
7095 ServerLogContext(s
, r
), inc(i
), dn(d
), targeti(ti
),
7096 dpv(d
->get_projected_version()) {}
7097 void finish(int r
) override
{
7098 ceph_assert(r
== 0);
7099 server
->_link_remote_finish(mdr
, inc
, dn
, targeti
, dpv
);
7103 void Server::_link_remote(MDRequestRef
& mdr
, bool inc
, CDentry
*dn
, CInode
*targeti
)
7105 dout(10) << "_link_remote "
7106 << (inc
? "link ":"unlink ")
7107 << *dn
<< " to " << *targeti
<< dendl
;
7109 // 1. send LinkPrepare to dest (journal nlink++ prepare)
7110 mds_rank_t linkauth
= targeti
->authority().first
;
7111 if (mdr
->more()->witnessed
.count(linkauth
) == 0) {
7112 if (mds
->is_cluster_degraded() &&
7113 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(linkauth
)) {
7114 dout(10) << " targeti auth mds." << linkauth
<< " is not active" << dendl
;
7115 if (mdr
->more()->waiting_on_peer
.empty())
7116 mds
->wait_for_active_peer(linkauth
, new C_MDS_RetryRequest(mdcache
, mdr
));
7120 dout(10) << " targeti auth must prepare nlink++/--" << dendl
;
7123 op
= MMDSPeerRequest::OP_LINKPREP
;
7125 op
= MMDSPeerRequest::OP_UNLINKPREP
;
7126 auto req
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, op
);
7127 targeti
->set_object_info(req
->get_object_info());
7128 req
->op_stamp
= mdr
->get_op_stamp();
7129 if (auto& desti_srnode
= mdr
->more()->desti_srnode
)
7130 encode(*desti_srnode
, req
->desti_snapbl
);
7131 mds
->send_message_mds(req
, linkauth
);
7133 ceph_assert(mdr
->more()->waiting_on_peer
.count(linkauth
) == 0);
7134 mdr
->more()->waiting_on_peer
.insert(linkauth
);
7137 dout(10) << " targeti auth has prepared nlink++/--" << dendl
;
7139 ceph_assert(g_conf()->mds_kill_link_at
!= 2);
7141 if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
7142 delete desti_srnode
;
7143 desti_srnode
= NULL
;
7146 mdr
->set_mds_stamp(ceph_clock_now());
7149 mdr
->ls
= mdlog
->get_current_segment();
7150 EUpdate
*le
= new EUpdate(mdlog
, inc
? "link_remote":"unlink_remote");
7151 mdlog
->start_entry(le
);
7152 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
7153 if (!mdr
->more()->witnessed
.empty()) {
7154 dout(20) << " noting uncommitted_peers " << mdr
->more()->witnessed
<< dendl
;
7155 le
->reqid
= mdr
->reqid
;
7156 le
->had_peers
= true;
7157 mdcache
->add_uncommitted_leader(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
7162 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1);
7163 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
7164 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
7167 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, -1);
7168 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
7169 le
->metablob
.add_null_dentry(dn
, true);
7170 dn
->push_projected_linkage();
7173 journal_and_reply(mdr
, (inc
? targeti
: nullptr), dn
, le
,
7174 new C_MDS_link_remote_finish(this, mdr
, inc
, dn
, targeti
));
7177 void Server::_link_remote_finish(MDRequestRef
& mdr
, bool inc
,
7178 CDentry
*dn
, CInode
*targeti
,
7181 dout(10) << "_link_remote_finish "
7182 << (inc
? "link ":"unlink ")
7183 << *dn
<< " to " << *targeti
<< dendl
;
7185 ceph_assert(g_conf()->mds_kill_link_at
!= 3);
7187 if (!mdr
->more()->witnessed
.empty())
7188 mdcache
->logged_leader_update(mdr
->reqid
);
7191 // link the new dentry
7192 CDentry::linkage_t
*dnl
= dn
->pop_projected_linkage();
7193 if (!dnl
->get_inode())
7194 dn
->link_remote(dnl
, targeti
);
7195 dn
->mark_dirty(dpv
, mdr
->ls
);
7197 // unlink main dentry
7198 dn
->get_dir()->unlink_inode(dn
);
7199 dn
->pop_projected_linkage();
7200 dn
->mark_dirty(dn
->get_projected_version(), mdr
->ls
); // dirty old dentry
7205 MDRequestRef null_ref
;
7207 mdcache
->send_dentry_link(dn
, null_ref
);
7209 mdcache
->send_dentry_unlink(dn
, NULL
, null_ref
);
7211 // bump target popularity
7212 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
7213 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
7216 respond_to_request(mdr
, 0);
7219 // removing a new dn?
7220 dn
->get_dir()->try_remove_unlinked_dn(dn
);
7224 // remote linking/unlinking
7226 class C_MDS_PeerLinkPrep
: public ServerLogContext
{
7230 C_MDS_PeerLinkPrep(Server
*s
, MDRequestRef
& r
, CInode
*t
, bool ar
) :
7231 ServerLogContext(s
, r
), targeti(t
), adjust_realm(ar
) { }
7232 void finish(int r
) override
{
7233 ceph_assert(r
== 0);
7234 server
->_logged_peer_link(mdr
, targeti
, adjust_realm
);
7238 class C_MDS_PeerLinkCommit
: public ServerContext
{
7242 C_MDS_PeerLinkCommit(Server
*s
, MDRequestRef
& r
, CInode
*t
) :
7243 ServerContext(s
), mdr(r
), targeti(t
) { }
7244 void finish(int r
) override
{
7245 server
->_commit_peer_link(mdr
, r
, targeti
);
7249 void Server::handle_peer_link_prep(MDRequestRef
& mdr
)
7251 dout(10) << "handle_peer_link_prep " << *mdr
7252 << " on " << mdr
->peer_request
->get_object_info()
7255 ceph_assert(g_conf()->mds_kill_link_at
!= 4);
7257 CInode
*targeti
= mdcache
->get_inode(mdr
->peer_request
->get_object_info().ino
);
7258 ceph_assert(targeti
);
7259 dout(10) << "targeti " << *targeti
<< dendl
;
7260 CDentry
*dn
= targeti
->get_parent_dn();
7261 CDentry::linkage_t
*dnl
= dn
->get_linkage();
7262 ceph_assert(dnl
->is_primary());
7264 mdr
->set_op_stamp(mdr
->peer_request
->op_stamp
);
7266 mdr
->auth_pin(targeti
);
7268 //ceph_abort(); // test hack: make sure leader can handle a peer that fails to prepare...
7269 ceph_assert(g_conf()->mds_kill_link_at
!= 5);
7272 mdr
->ls
= mdlog
->get_current_segment();
7273 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_link_prep", mdr
->reqid
, mdr
->peer_to_mds
,
7274 EPeerUpdate::OP_PREPARE
, EPeerUpdate::LINK
);
7275 mdlog
->start_entry(le
);
7277 auto pi
= dnl
->get_inode()->project_inode(mdr
);
7279 // update journaled target inode
7281 bool adjust_realm
= false;
7282 bool realm_projected
= false;
7283 if (mdr
->peer_request
->get_op() == MMDSPeerRequest::OP_LINKPREP
) {
7287 CDentry
*target_pdn
= targeti
->get_projected_parent_dn();
7288 SnapRealm
*target_realm
= target_pdn
->get_dir()->inode
->find_snaprealm();
7289 if (!target_realm
->get_subvolume_ino() && !targeti
->is_projected_snaprealm_global()) {
7290 sr_t
*newsnap
= targeti
->project_snaprealm();
7291 targeti
->mark_snaprealm_global(newsnap
);
7292 targeti
->record_snaprealm_parent_dentry(newsnap
, target_realm
, target_pdn
, true);
7293 adjust_realm
= true;
7294 realm_projected
= true;
7299 if (targeti
->is_projected_snaprealm_global()) {
7300 ceph_assert(mdr
->peer_request
->desti_snapbl
.length());
7301 auto p
= mdr
->peer_request
->desti_snapbl
.cbegin();
7303 sr_t
*newsnap
= targeti
->project_snaprealm();
7304 decode(*newsnap
, p
);
7306 if (pi
.inode
->nlink
== 0)
7307 ceph_assert(!newsnap
->is_parent_global());
7309 realm_projected
= true;
7311 ceph_assert(mdr
->peer_request
->desti_snapbl
.length() == 0);
7315 link_rollback rollback
;
7316 rollback
.reqid
= mdr
->reqid
;
7317 rollback
.ino
= targeti
->ino();
7318 rollback
.old_ctime
= targeti
->get_inode()->ctime
; // we hold versionlock xlock; no concorrent projections
7319 const auto& pf
= targeti
->get_parent_dn()->get_dir()->get_projected_fnode();
7320 rollback
.old_dir_mtime
= pf
->fragstat
.mtime
;
7321 rollback
.old_dir_rctime
= pf
->rstat
.rctime
;
7322 rollback
.was_inc
= inc
;
7323 if (realm_projected
) {
7324 if (targeti
->snaprealm
) {
7325 encode(true, rollback
.snapbl
);
7326 targeti
->encode_snap_blob(rollback
.snapbl
);
7328 encode(false, rollback
.snapbl
);
7331 encode(rollback
, le
->rollback
);
7332 mdr
->more()->rollback_bl
= le
->rollback
;
7334 pi
.inode
->ctime
= mdr
->get_op_stamp();
7335 pi
.inode
->version
= targeti
->pre_dirty();
7337 dout(10) << " projected inode " << pi
.inode
->ino
<< " v " << pi
.inode
->version
<< dendl
;
7340 mdcache
->predirty_journal_parents(mdr
, &le
->commit
, dnl
->get_inode(), 0, PREDIRTY_SHALLOW
|PREDIRTY_PRIMARY
);
7341 mdcache
->journal_dirty_inode(mdr
.get(), &le
->commit
, targeti
);
7342 mdcache
->add_uncommitted_peer(mdr
->reqid
, mdr
->ls
, mdr
->peer_to_mds
);
7344 // set up commit waiter
7345 mdr
->more()->peer_commit
= new C_MDS_PeerLinkCommit(this, mdr
, targeti
);
7347 mdr
->more()->peer_update_journaled
= true;
7348 submit_mdlog_entry(le
, new C_MDS_PeerLinkPrep(this, mdr
, targeti
, adjust_realm
),
7353 void Server::_logged_peer_link(MDRequestRef
& mdr
, CInode
*targeti
, bool adjust_realm
)
7355 dout(10) << "_logged_peer_link " << *mdr
7356 << " " << *targeti
<< dendl
;
7358 ceph_assert(g_conf()->mds_kill_link_at
!= 6);
7360 // update the target
7364 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
7367 mdr
->reset_peer_request();
7370 int op
= CEPH_SNAP_OP_SPLIT
;
7371 mds
->mdcache
->send_snap_update(targeti
, 0, op
);
7372 mds
->mdcache
->do_realm_invalidate_and_update_notify(targeti
, op
);
7376 if (!mdr
->aborted
) {
7377 auto reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_LINKPREPACK
);
7378 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
7380 dout(10) << " abort flag set, finishing" << dendl
;
7381 mdcache
->request_finish(mdr
);
7386 struct C_MDS_CommittedPeer
: public ServerLogContext
{
7387 C_MDS_CommittedPeer(Server
*s
, MDRequestRef
& m
) : ServerLogContext(s
, m
) {}
7388 void finish(int r
) override
{
7389 server
->_committed_peer(mdr
);
7393 void Server::_commit_peer_link(MDRequestRef
& mdr
, int r
, CInode
*targeti
)
7395 dout(10) << "_commit_peer_link " << *mdr
7397 << " " << *targeti
<< dendl
;
7399 ceph_assert(g_conf()->mds_kill_link_at
!= 7);
7402 // drop our pins, etc.
7405 // write a commit to the journal
7406 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_link_commit", mdr
->reqid
, mdr
->peer_to_mds
,
7407 EPeerUpdate::OP_COMMIT
, EPeerUpdate::LINK
);
7408 mdlog
->start_entry(le
);
7409 submit_mdlog_entry(le
, new C_MDS_CommittedPeer(this, mdr
), mdr
, __func__
);
7412 do_link_rollback(mdr
->more()->rollback_bl
, mdr
->peer_to_mds
, mdr
);
7416 void Server::_committed_peer(MDRequestRef
& mdr
)
7418 dout(10) << "_committed_peer " << *mdr
<< dendl
;
7420 ceph_assert(g_conf()->mds_kill_link_at
!= 8);
7422 bool assert_exist
= mdr
->more()->peer_update_journaled
;
7423 mdcache
->finish_uncommitted_peer(mdr
->reqid
, assert_exist
);
7424 auto req
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_COMMITTED
);
7425 mds
->send_message_mds(req
, mdr
->peer_to_mds
);
7426 mdcache
->request_finish(mdr
);
7429 struct C_MDS_LoggedLinkRollback
: public ServerLogContext
{
7431 map
<client_t
,ref_t
<MClientSnap
>> splits
;
7432 C_MDS_LoggedLinkRollback(Server
*s
, MutationRef
& m
, MDRequestRef
& r
,
7433 map
<client_t
,ref_t
<MClientSnap
>>&& _splits
) :
7434 ServerLogContext(s
, r
), mut(m
), splits(std::move(_splits
)) {
7436 void finish(int r
) override
{
7437 server
->_link_rollback_finish(mut
, mdr
, splits
);
7441 void Server::do_link_rollback(bufferlist
&rbl
, mds_rank_t leader
, MDRequestRef
& mdr
)
7443 link_rollback rollback
;
7444 auto p
= rbl
.cbegin();
7445 decode(rollback
, p
);
7447 dout(10) << "do_link_rollback on " << rollback
.reqid
7448 << (rollback
.was_inc
? " inc":" dec")
7449 << " ino " << rollback
.ino
7452 ceph_assert(g_conf()->mds_kill_link_at
!= 9);
7454 mdcache
->add_rollback(rollback
.reqid
, leader
); // need to finish this update before resolve finishes
7455 ceph_assert(mdr
|| mds
->is_resolve());
7457 MutationRef
mut(new MutationImpl(nullptr, utime_t(), rollback
.reqid
));
7458 mut
->ls
= mds
->mdlog
->get_current_segment();
7460 CInode
*in
= mdcache
->get_inode(rollback
.ino
);
7462 dout(10) << " target is " << *in
<< dendl
;
7463 ceph_assert(!in
->is_projected()); // live peer request hold versionlock xlock.
7465 auto pi
= in
->project_inode(mut
);
7466 pi
.inode
->version
= in
->pre_dirty();
7468 // parent dir rctime
7469 CDir
*parent
= in
->get_projected_parent_dn()->get_dir();
7470 auto pf
= parent
->project_fnode(mut
);
7471 pf
->version
= parent
->pre_dirty();
7472 if (pf
->fragstat
.mtime
== pi
.inode
->ctime
) {
7473 pf
->fragstat
.mtime
= rollback
.old_dir_mtime
;
7474 if (pf
->rstat
.rctime
== pi
.inode
->ctime
)
7475 pf
->rstat
.rctime
= rollback
.old_dir_rctime
;
7476 mut
->add_updated_lock(&parent
->get_inode()->filelock
);
7477 mut
->add_updated_lock(&parent
->get_inode()->nestlock
);
7481 pi
.inode
->ctime
= rollback
.old_ctime
;
7482 if (rollback
.was_inc
)
7487 map
<client_t
,ref_t
<MClientSnap
>> splits
;
7488 if (rollback
.snapbl
.length() && in
->snaprealm
) {
7490 auto p
= rollback
.snapbl
.cbegin();
7491 decode(hadrealm
, p
);
7493 if (!mds
->is_resolve()) {
7494 sr_t
*new_srnode
= new sr_t();
7495 decode(*new_srnode
, p
);
7496 in
->project_snaprealm(new_srnode
);
7498 decode(in
->snaprealm
->srnode
, p
);
7501 SnapRealm
*realm
= parent
->get_inode()->find_snaprealm();
7502 if (!mds
->is_resolve())
7503 mdcache
->prepare_realm_merge(in
->snaprealm
, realm
, splits
);
7504 in
->project_snaprealm(NULL
);
7509 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_link_rollback", rollback
.reqid
, leader
,
7510 EPeerUpdate::OP_ROLLBACK
, EPeerUpdate::LINK
);
7511 mdlog
->start_entry(le
);
7512 le
->commit
.add_dir_context(parent
);
7513 le
->commit
.add_dir(parent
, true);
7514 le
->commit
.add_primary_dentry(in
->get_projected_parent_dn(), 0, true);
7516 submit_mdlog_entry(le
, new C_MDS_LoggedLinkRollback(this, mut
, mdr
, std::move(splits
)),
7521 void Server::_link_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
,
7522 map
<client_t
,ref_t
<MClientSnap
>>& splits
)
7524 dout(10) << "_link_rollback_finish" << dendl
;
7526 ceph_assert(g_conf()->mds_kill_link_at
!= 10);
7530 if (!mds
->is_resolve())
7531 mdcache
->send_snaps(splits
);
7534 mdcache
->request_finish(mdr
);
7536 mdcache
->finish_rollback(mut
->reqid
, mdr
);
7542 void Server::handle_peer_link_prep_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &m
)
7544 dout(10) << "handle_peer_link_prep_ack " << *mdr
7545 << " " << *m
<< dendl
;
7546 mds_rank_t from
= mds_rank_t(m
->get_source().num());
7548 ceph_assert(g_conf()->mds_kill_link_at
!= 11);
7551 mdr
->more()->peers
.insert(from
);
7554 ceph_assert(mdr
->more()->witnessed
.count(from
) == 0);
7555 mdr
->more()->witnessed
.insert(from
);
7556 ceph_assert(!m
->is_not_journaled());
7557 mdr
->more()->has_journaled_peers
= true;
7559 // remove from waiting list
7560 ceph_assert(mdr
->more()->waiting_on_peer
.count(from
));
7561 mdr
->more()->waiting_on_peer
.erase(from
);
7563 ceph_assert(mdr
->more()->waiting_on_peer
.empty());
7565 dispatch_client_request(mdr
); // go again!
7574 void Server::handle_client_unlink(MDRequestRef
& mdr
)
7576 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
7577 client_t client
= mdr
->get_client();
7580 bool rmdir
= (req
->get_op() == CEPH_MDS_OP_RMDIR
);
7583 mdr
->disable_lock_cache();
7584 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, false, true);
7588 CDentry::linkage_t
*dnl
= dn
->get_linkage(client
, mdr
);
7589 ceph_assert(!dnl
->is_null());
7590 CInode
*in
= dnl
->get_inode();
7593 dout(7) << "handle_client_rmdir on " << *dn
<< dendl
;
7595 dout(7) << "handle_client_unlink on " << *dn
<< dendl
;
7597 dout(7) << "dn links to " << *in
<< dendl
;
7602 // do empty directory checks
7603 if (_dir_is_nonempty_unlocked(mdr
, in
)) {
7604 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
7608 dout(7) << "handle_client_unlink on dir " << *in
<< ", returning error" << dendl
;
7609 respond_to_request(mdr
, -CEPHFS_EISDIR
);
7615 dout(7) << "handle_client_rmdir on non-dir " << *in
<< ", returning error" << dendl
;
7616 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
7621 CInode
*diri
= dn
->get_dir()->get_inode();
7622 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
7623 if (!check_access(mdr
, diri
, MAY_WRITE
))
7627 // -- create stray dentry? --
7628 CDentry
*straydn
= NULL
;
7629 if (dnl
->is_primary()) {
7630 straydn
= prepare_stray_dentry(mdr
, dnl
->get_inode());
7633 dout(10) << " straydn is " << *straydn
<< dendl
;
7634 } else if (mdr
->straydn
) {
7635 mdr
->unpin(mdr
->straydn
);
7636 mdr
->straydn
= NULL
;
7640 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
7641 MutationImpl::LockOpVec lov
;
7643 lov
.add_xlock(&in
->linklock
);
7644 lov
.add_xlock(&in
->snaplock
);
7646 lov
.add_rdlock(&in
->filelock
); // to verify it's empty
7649 lov
.add_wrlock(&straydn
->get_dir()->inode
->filelock
);
7650 lov
.add_wrlock(&straydn
->get_dir()->inode
->nestlock
);
7651 lov
.add_xlock(&straydn
->lock
);
7654 if (!mds
->locker
->acquire_locks(mdr
, lov
))
7657 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
7661 _dir_is_nonempty(mdr
, in
)) {
7662 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
7667 straydn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
7669 if (!mdr
->more()->desti_srnode
) {
7670 if (in
->is_projected_snaprealm_global()) {
7671 sr_t
*new_srnode
= in
->prepare_new_srnode(0);
7672 in
->record_snaprealm_parent_dentry(new_srnode
, nullptr, dn
, dnl
->is_primary());
7673 // dropping the last linkage or dropping the last remote linkage,
7674 // detch the inode from global snaprealm
7675 auto nlink
= in
->get_projected_inode()->nlink
;
7677 (nlink
== 2 && !dnl
->is_primary() &&
7678 !in
->get_projected_parent_dir()->inode
->is_stray()))
7679 in
->clear_snaprealm_global(new_srnode
);
7680 mdr
->more()->desti_srnode
= new_srnode
;
7681 } else if (dnl
->is_primary()) {
7682 // prepare snaprealm blob for peer request
7683 SnapRealm
*realm
= in
->find_snaprealm();
7684 snapid_t follows
= realm
->get_newest_seq();
7685 if (in
->snaprealm
|| follows
+ 1 > in
->get_oldest_snap()) {
7686 sr_t
*new_srnode
= in
->prepare_new_srnode(follows
);
7687 in
->record_snaprealm_past_parent(new_srnode
, straydn
->get_dir()->inode
->find_snaprealm());
7688 mdr
->more()->desti_srnode
= new_srnode
;
7694 if (in
->is_dir() && in
->has_subtree_root_dirfrag()) {
7695 // subtree root auths need to be witnesses
7696 set
<mds_rank_t
> witnesses
;
7697 in
->list_replicas(witnesses
);
7698 dout(10) << " witnesses " << witnesses
<< ", have " << mdr
->more()->witnessed
<< dendl
;
7700 for (set
<mds_rank_t
>::iterator p
= witnesses
.begin();
7701 p
!= witnesses
.end();
7703 if (mdr
->more()->witnessed
.count(*p
)) {
7704 dout(10) << " already witnessed by mds." << *p
<< dendl
;
7705 } else if (mdr
->more()->waiting_on_peer
.count(*p
)) {
7706 dout(10) << " already waiting on witness mds." << *p
<< dendl
;
7708 if (!_rmdir_prepare_witness(mdr
, *p
, mdr
->dn
[0], straydn
))
7712 if (!mdr
->more()->waiting_on_peer
.empty())
7713 return; // we're waiting for a witness.
7716 if (!rmdir
&& dnl
->is_primary() && mdr
->dn
[0].size() == 1)
7717 mds
->locker
->create_lock_cache(mdr
, diri
);
7720 if (dnl
->is_remote() && !dnl
->get_inode()->is_auth())
7721 _link_remote(mdr
, false, dn
, dnl
->get_inode());
7723 _unlink_local(mdr
, dn
, straydn
);
7726 class C_MDS_unlink_local_finish
: public ServerLogContext
{
7729 version_t dnpv
; // deleted dentry
7731 C_MDS_unlink_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CDentry
*sd
) :
7732 ServerLogContext(s
, r
), dn(d
), straydn(sd
),
7733 dnpv(d
->get_projected_version()) {}
7734 void finish(int r
) override
{
7735 ceph_assert(r
== 0);
7736 server
->_unlink_local_finish(mdr
, dn
, straydn
, dnpv
);
7740 void Server::_unlink_local(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
)
7742 dout(10) << "_unlink_local " << *dn
<< dendl
;
7744 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
7745 CInode
*in
= dnl
->get_inode();
7749 mdr
->ls
= mdlog
->get_current_segment();
7751 // prepare log entry
7752 EUpdate
*le
= new EUpdate(mdlog
, "unlink_local");
7753 mdlog
->start_entry(le
);
7754 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
7755 if (!mdr
->more()->witnessed
.empty()) {
7756 dout(20) << " noting uncommitted_peers " << mdr
->more()->witnessed
<< dendl
;
7757 le
->reqid
= mdr
->reqid
;
7758 le
->had_peers
= true;
7759 mdcache
->add_uncommitted_leader(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
7763 ceph_assert(dnl
->is_primary());
7764 straydn
->push_projected_linkage(in
);
7767 // the unlinked dentry
7770 auto pi
= in
->project_inode(mdr
);
7773 dn
->make_path_string(t
, true);
7774 pi
.inode
->stray_prior_path
= std::move(t
);
7776 pi
.inode
->version
= in
->pre_dirty();
7777 pi
.inode
->ctime
= mdr
->get_op_stamp();
7778 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
7779 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
7780 pi
.inode
->change_attr
++;
7782 if (pi
.inode
->nlink
== 0)
7783 in
->state_set(CInode::STATE_ORPHAN
);
7785 if (mdr
->more()->desti_srnode
) {
7786 auto& desti_srnode
= mdr
->more()->desti_srnode
;
7787 in
->project_snaprealm(desti_srnode
);
7788 desti_srnode
= NULL
;
7792 // will manually pop projected inode
7794 // primary link. add stray dentry.
7795 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, -1);
7796 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, straydn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
7798 pi
.inode
->update_backtrace();
7799 le
->metablob
.add_primary_dentry(straydn
, in
, true, true);
7801 // remote link. update remote inode.
7802 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_DIR
, -1);
7803 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
7804 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, in
);
7807 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
7808 le
->metablob
.add_null_dentry(dn
, true);
7811 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
7812 le
->metablob
.renamed_dirino
= in
->ino();
7815 dn
->push_projected_linkage();
7818 ceph_assert(in
->first
<= straydn
->first
);
7819 in
->first
= straydn
->first
;
7823 ceph_assert(straydn
);
7824 mdcache
->project_subtree_rename(in
, dn
->get_dir(), straydn
->get_dir());
7827 journal_and_reply(mdr
, 0, dn
, le
, new C_MDS_unlink_local_finish(this, mdr
, dn
, straydn
));
7830 void Server::_unlink_local_finish(MDRequestRef
& mdr
,
7831 CDentry
*dn
, CDentry
*straydn
,
7834 dout(10) << "_unlink_local_finish " << *dn
<< dendl
;
7836 if (!mdr
->more()->witnessed
.empty())
7837 mdcache
->logged_leader_update(mdr
->reqid
);
7839 CInode
*strayin
= NULL
;
7840 bool hadrealm
= false;
7842 // if there is newly created snaprealm, need to split old snaprealm's
7843 // inodes_with_caps. So pop snaprealm before linkage changes.
7844 strayin
= dn
->get_linkage()->get_inode();
7845 hadrealm
= strayin
->snaprealm
? true : false;
7846 strayin
->early_pop_projected_snaprealm();
7849 // unlink main dentry
7850 dn
->get_dir()->unlink_inode(dn
);
7851 dn
->pop_projected_linkage();
7852 dn
->mark_dirty(dnpv
, mdr
->ls
);
7854 // relink as stray? (i.e. was primary link?)
7856 dout(20) << " straydn is " << *straydn
<< dendl
;
7857 straydn
->pop_projected_linkage();
7858 mdcache
->touch_dentry_bottom(straydn
);
7863 mdcache
->send_dentry_unlink(dn
, straydn
, mdr
);
7866 // update subtree map?
7867 if (strayin
->is_dir())
7868 mdcache
->adjust_subtree_after_rename(strayin
, dn
->get_dir(), true);
7870 if (strayin
->snaprealm
&& !hadrealm
)
7871 mdcache
->do_realm_invalidate_and_update_notify(strayin
, CEPH_SNAP_OP_SPLIT
, false);
7875 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
7878 respond_to_request(mdr
, 0);
7880 // removing a new dn?
7881 dn
->get_dir()->try_remove_unlinked_dn(dn
);
7884 // respond_to_request() drops locks. So stray reintegration can race with us.
7885 if (straydn
&& !straydn
->get_projected_linkage()->is_null()) {
7886 // Tip off the MDCache that this dentry is a stray that
7887 // might be elegible for purge.
7888 mdcache
->notify_stray(straydn
);
7892 bool Server::_rmdir_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, vector
<CDentry
*>& trace
, CDentry
*straydn
)
7894 if (mds
->is_cluster_degraded() &&
7895 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(who
)) {
7896 dout(10) << "_rmdir_prepare_witness mds." << who
<< " is not active" << dendl
;
7897 if (mdr
->more()->waiting_on_peer
.empty())
7898 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(mdcache
, mdr
));
7902 dout(10) << "_rmdir_prepare_witness mds." << who
<< dendl
;
7903 auto req
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RMDIRPREP
);
7904 req
->srcdnpath
= filepath(trace
.front()->get_dir()->ino());
7905 for (auto dn
: trace
)
7906 req
->srcdnpath
.push_dentry(dn
->get_name());
7907 mdcache
->encode_replica_stray(straydn
, who
, req
->straybl
);
7908 if (mdr
->more()->desti_srnode
)
7909 encode(*mdr
->more()->desti_srnode
, req
->desti_snapbl
);
7911 req
->op_stamp
= mdr
->get_op_stamp();
7912 mds
->send_message_mds(req
, who
);
7914 ceph_assert(mdr
->more()->waiting_on_peer
.count(who
) == 0);
7915 mdr
->more()->waiting_on_peer
.insert(who
);
7919 struct C_MDS_PeerRmdirPrep
: public ServerLogContext
{
7920 CDentry
*dn
, *straydn
;
7921 C_MDS_PeerRmdirPrep(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CDentry
*st
)
7922 : ServerLogContext(s
, r
), dn(d
), straydn(st
) {}
7923 void finish(int r
) override
{
7924 server
->_logged_peer_rmdir(mdr
, dn
, straydn
);
7928 struct C_MDS_PeerRmdirCommit
: public ServerContext
{
7931 C_MDS_PeerRmdirCommit(Server
*s
, MDRequestRef
& r
, CDentry
*sd
)
7932 : ServerContext(s
), mdr(r
), straydn(sd
) { }
7933 void finish(int r
) override
{
7934 server
->_commit_peer_rmdir(mdr
, r
, straydn
);
7938 void Server::handle_peer_rmdir_prep(MDRequestRef
& mdr
)
7940 dout(10) << "handle_peer_rmdir_prep " << *mdr
7941 << " " << mdr
->peer_request
->srcdnpath
7942 << " to " << mdr
->peer_request
->destdnpath
7945 vector
<CDentry
*> trace
;
7946 filepath
srcpath(mdr
->peer_request
->srcdnpath
);
7947 dout(10) << " src " << srcpath
<< dendl
;
7949 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, false);
7950 int r
= mdcache
->path_traverse(mdr
, cf
, srcpath
,
7951 MDS_TRAVERSE_DISCOVER
| MDS_TRAVERSE_PATH_LOCKED
,
7954 if (r
== -CEPHFS_ESTALE
) {
7955 mdcache
->find_ino_peers(srcpath
.get_ino(), new C_MDS_RetryRequest(mdcache
, mdr
),
7956 mdr
->peer_to_mds
, true);
7959 ceph_assert(r
== 0);
7960 CDentry
*dn
= trace
.back();
7961 dout(10) << " dn " << *dn
<< dendl
;
7964 ceph_assert(mdr
->straydn
);
7965 CDentry
*straydn
= mdr
->straydn
;
7966 dout(10) << " straydn " << *straydn
<< dendl
;
7968 mdr
->set_op_stamp(mdr
->peer_request
->op_stamp
);
7970 rmdir_rollback rollback
;
7971 rollback
.reqid
= mdr
->reqid
;
7972 rollback
.src_dir
= dn
->get_dir()->dirfrag();
7973 rollback
.src_dname
= dn
->get_name();
7974 rollback
.dest_dir
= straydn
->get_dir()->dirfrag();
7975 rollback
.dest_dname
= straydn
->get_name();
7976 if (mdr
->peer_request
->desti_snapbl
.length()) {
7977 if (in
->snaprealm
) {
7978 encode(true, rollback
.snapbl
);
7979 in
->encode_snap_blob(rollback
.snapbl
);
7981 encode(false, rollback
.snapbl
);
7984 encode(rollback
, mdr
->more()->rollback_bl
);
7985 // FIXME: rollback snaprealm
7986 dout(20) << " rollback is " << mdr
->more()->rollback_bl
.length() << " bytes" << dendl
;
7988 // set up commit waiter
7989 mdr
->more()->peer_commit
= new C_MDS_PeerRmdirCommit(this, mdr
, straydn
);
7991 straydn
->push_projected_linkage(in
);
7992 dn
->push_projected_linkage();
7994 ceph_assert(straydn
->first
>= in
->first
);
7995 in
->first
= straydn
->first
;
7997 if (!in
->has_subtree_root_dirfrag(mds
->get_nodeid())) {
7998 dout(10) << " no auth subtree in " << *in
<< ", skipping journal" << dendl
;
7999 _logged_peer_rmdir(mdr
, dn
, straydn
);
8003 mdr
->ls
= mdlog
->get_current_segment();
8004 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rmdir", mdr
->reqid
, mdr
->peer_to_mds
,
8005 EPeerUpdate::OP_PREPARE
, EPeerUpdate::RMDIR
);
8006 mdlog
->start_entry(le
);
8007 le
->rollback
= mdr
->more()->rollback_bl
;
8009 le
->commit
.add_dir_context(straydn
->get_dir());
8010 le
->commit
.add_primary_dentry(straydn
, in
, true);
8011 // peer: no need to journal original dentry
8013 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
8014 le
->commit
.renamed_dirino
= in
->ino();
8016 mdcache
->project_subtree_rename(in
, dn
->get_dir(), straydn
->get_dir());
8017 mdcache
->add_uncommitted_peer(mdr
->reqid
, mdr
->ls
, mdr
->peer_to_mds
);
8019 mdr
->more()->peer_update_journaled
= true;
8020 submit_mdlog_entry(le
, new C_MDS_PeerRmdirPrep(this, mdr
, dn
, straydn
),
8025 void Server::_logged_peer_rmdir(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
)
8027 dout(10) << "_logged_peer_rmdir " << *mdr
<< " on " << *dn
<< dendl
;
8028 CInode
*in
= dn
->get_linkage()->get_inode();
8031 if (mdr
->peer_request
->desti_snapbl
.length()) {
8032 new_realm
= !in
->snaprealm
;
8033 in
->decode_snap_blob(mdr
->peer_request
->desti_snapbl
);
8034 ceph_assert(in
->snaprealm
);
8039 // update our cache now, so we are consistent with what is in the journal
8040 // when we journal a subtree map
8041 dn
->get_dir()->unlink_inode(dn
);
8042 straydn
->pop_projected_linkage();
8043 dn
->pop_projected_linkage();
8045 mdcache
->adjust_subtree_after_rename(in
, dn
->get_dir(), mdr
->more()->peer_update_journaled
);
8048 mdcache
->do_realm_invalidate_and_update_notify(in
, CEPH_SNAP_OP_SPLIT
, false);
8051 mdr
->reset_peer_request();
8054 if (!mdr
->aborted
) {
8055 auto reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RMDIRPREPACK
);
8056 if (!mdr
->more()->peer_update_journaled
)
8057 reply
->mark_not_journaled();
8058 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
8060 dout(10) << " abort flag set, finishing" << dendl
;
8061 mdcache
->request_finish(mdr
);
8065 void Server::handle_peer_rmdir_prep_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &ack
)
8067 dout(10) << "handle_peer_rmdir_prep_ack " << *mdr
8068 << " " << *ack
<< dendl
;
8070 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
8072 mdr
->more()->peers
.insert(from
);
8073 mdr
->more()->witnessed
.insert(from
);
8074 if (!ack
->is_not_journaled())
8075 mdr
->more()->has_journaled_peers
= true;
8077 // remove from waiting list
8078 ceph_assert(mdr
->more()->waiting_on_peer
.count(from
));
8079 mdr
->more()->waiting_on_peer
.erase(from
);
8081 if (mdr
->more()->waiting_on_peer
.empty())
8082 dispatch_client_request(mdr
); // go again!
8084 dout(10) << "still waiting on peers " << mdr
->more()->waiting_on_peer
<< dendl
;
8087 void Server::_commit_peer_rmdir(MDRequestRef
& mdr
, int r
, CDentry
*straydn
)
8089 dout(10) << "_commit_peer_rmdir " << *mdr
<< " r=" << r
<< dendl
;
8092 if (mdr
->more()->peer_update_journaled
) {
8093 CInode
*strayin
= straydn
->get_projected_linkage()->get_inode();
8094 if (strayin
&& !strayin
->snaprealm
)
8095 mdcache
->clear_dirty_bits_for_stray(strayin
);
8100 if (mdr
->more()->peer_update_journaled
) {
8101 // write a commit to the journal
8102 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rmdir_commit", mdr
->reqid
,
8103 mdr
->peer_to_mds
, EPeerUpdate::OP_COMMIT
,
8104 EPeerUpdate::RMDIR
);
8105 mdlog
->start_entry(le
);
8106 submit_mdlog_entry(le
, new C_MDS_CommittedPeer(this, mdr
), mdr
, __func__
);
8109 _committed_peer(mdr
);
8113 do_rmdir_rollback(mdr
->more()->rollback_bl
, mdr
->peer_to_mds
, mdr
);
8117 struct C_MDS_LoggedRmdirRollback
: public ServerLogContext
{
8121 C_MDS_LoggedRmdirRollback(Server
*s
, MDRequestRef
& m
, metareqid_t mr
, CDentry
*d
, CDentry
*st
)
8122 : ServerLogContext(s
, m
), reqid(mr
), dn(d
), straydn(st
) {}
8123 void finish(int r
) override
{
8124 server
->_rmdir_rollback_finish(mdr
, reqid
, dn
, straydn
);
8128 void Server::do_rmdir_rollback(bufferlist
&rbl
, mds_rank_t leader
, MDRequestRef
& mdr
)
8130 // unlink the other rollback methods, the rmdir rollback is only
8131 // needed to record the subtree changes in the journal for inode
8132 // replicas who are auth for empty dirfrags. no actual changes to
8133 // the file system are taking place here, so there is no Mutation.
8135 rmdir_rollback rollback
;
8136 auto p
= rbl
.cbegin();
8137 decode(rollback
, p
);
8139 dout(10) << "do_rmdir_rollback on " << rollback
.reqid
<< dendl
;
8140 mdcache
->add_rollback(rollback
.reqid
, leader
); // need to finish this update before resolve finishes
8141 ceph_assert(mdr
|| mds
->is_resolve());
8143 CDir
*dir
= mdcache
->get_dirfrag(rollback
.src_dir
);
8145 dir
= mdcache
->get_dirfrag(rollback
.src_dir
.ino
, rollback
.src_dname
);
8147 CDentry
*dn
= dir
->lookup(rollback
.src_dname
);
8149 dout(10) << " dn " << *dn
<< dendl
;
8150 CDir
*straydir
= mdcache
->get_dirfrag(rollback
.dest_dir
);
8151 ceph_assert(straydir
);
8152 CDentry
*straydn
= straydir
->lookup(rollback
.dest_dname
);
8153 ceph_assert(straydn
);
8154 dout(10) << " straydn " << *straydn
<< dendl
;
8155 CInode
*in
= straydn
->get_linkage()->get_inode();
8157 dn
->push_projected_linkage(in
);
8158 straydn
->push_projected_linkage();
8160 if (rollback
.snapbl
.length() && in
->snaprealm
) {
8162 auto p
= rollback
.snapbl
.cbegin();
8163 decode(hadrealm
, p
);
8165 decode(in
->snaprealm
->srnode
, p
);
8167 in
->snaprealm
->merge_to(dir
->get_inode()->find_snaprealm());
8171 if (mdr
&& !mdr
->more()->peer_update_journaled
) {
8172 ceph_assert(!in
->has_subtree_root_dirfrag(mds
->get_nodeid()));
8174 _rmdir_rollback_finish(mdr
, rollback
.reqid
, dn
, straydn
);
8179 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rmdir_rollback", rollback
.reqid
, leader
,
8180 EPeerUpdate::OP_ROLLBACK
, EPeerUpdate::RMDIR
);
8181 mdlog
->start_entry(le
);
8183 le
->commit
.add_dir_context(dn
->get_dir());
8184 le
->commit
.add_primary_dentry(dn
, in
, true);
8185 // peer: no need to journal straydn
8187 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
8188 le
->commit
.renamed_dirino
= in
->ino();
8190 mdcache
->project_subtree_rename(in
, straydn
->get_dir(), dn
->get_dir());
8192 submit_mdlog_entry(le
,
8193 new C_MDS_LoggedRmdirRollback(this, mdr
,rollback
.reqid
,
8199 void Server::_rmdir_rollback_finish(MDRequestRef
& mdr
, metareqid_t reqid
, CDentry
*dn
, CDentry
*straydn
)
8201 dout(10) << "_rmdir_rollback_finish " << reqid
<< dendl
;
8203 straydn
->get_dir()->unlink_inode(straydn
);
8204 dn
->pop_projected_linkage();
8205 straydn
->pop_projected_linkage();
8207 CInode
*in
= dn
->get_linkage()->get_inode();
8208 mdcache
->adjust_subtree_after_rename(in
, straydn
->get_dir(),
8209 !mdr
|| mdr
->more()->peer_update_journaled
);
8211 if (mds
->is_resolve()) {
8212 CDir
*root
= mdcache
->get_subtree_root(straydn
->get_dir());
8213 mdcache
->try_trim_non_auth_subtree(root
);
8217 mdcache
->request_finish(mdr
);
8219 mdcache
->finish_rollback(reqid
, mdr
);
8223 /** _dir_is_nonempty[_unlocked]
8225 * check if a directory is non-empty (i.e. we can rmdir it).
8227 * the unlocked variant is a fastpath check; we can't really be
8228 * sure until we rdlock the filelock.
8230 bool Server::_dir_is_nonempty_unlocked(MDRequestRef
& mdr
, CInode
*in
)
8232 dout(10) << "dir_is_nonempty_unlocked " << *in
<< dendl
;
8233 ceph_assert(in
->is_auth());
8235 if (in
->filelock
.is_cached())
8236 return false; // there can be pending async create/unlink. don't know.
8237 if (in
->snaprealm
&& in
->snaprealm
->srnode
.snaps
.size())
8238 return true; // in a snapshot!
8240 auto&& ls
= in
->get_dirfrags();
8241 for (const auto& dir
: ls
) {
8242 // is the frag obviously non-empty?
8243 if (dir
->is_auth()) {
8244 if (dir
->get_projected_fnode()->fragstat
.size()) {
8245 dout(10) << "dir_is_nonempty_unlocked dirstat has "
8246 << dir
->get_projected_fnode()->fragstat
.size() << " items " << *dir
<< dendl
;
8255 bool Server::_dir_is_nonempty(MDRequestRef
& mdr
, CInode
*in
)
8257 dout(10) << "dir_is_nonempty " << *in
<< dendl
;
8258 ceph_assert(in
->is_auth());
8259 ceph_assert(in
->filelock
.can_read(mdr
->get_client()));
8261 frag_info_t dirstat
;
8262 version_t dirstat_version
= in
->get_projected_inode()->dirstat
.version
;
8264 auto&& ls
= in
->get_dirfrags();
8265 for (const auto& dir
: ls
) {
8266 const auto& pf
= dir
->get_projected_fnode();
8267 if (pf
->fragstat
.size()) {
8268 dout(10) << "dir_is_nonempty dirstat has "
8269 << pf
->fragstat
.size() << " items " << *dir
<< dendl
;
8273 if (pf
->accounted_fragstat
.version
== dirstat_version
)
8274 dirstat
.add(pf
->accounted_fragstat
);
8276 dirstat
.add(pf
->fragstat
);
8279 return dirstat
.size() != in
->get_projected_inode()->dirstat
.size();
8283 // ======================================================
8286 class C_MDS_rename_finish
: public ServerLogContext
{
8291 C_MDS_rename_finish(Server
*s
, MDRequestRef
& r
,
8292 CDentry
*sdn
, CDentry
*ddn
, CDentry
*stdn
) :
8293 ServerLogContext(s
, r
),
8294 srcdn(sdn
), destdn(ddn
), straydn(stdn
) { }
8295 void finish(int r
) override
{
8296 ceph_assert(r
== 0);
8297 server
->_rename_finish(mdr
, srcdn
, destdn
, straydn
);
8302 /** handle_client_rename
8304 * rename leader is the destdn auth. this is because cached inodes
8305 * must remain connected. thus, any replica of srci, must also
8306 * replicate destdn, and possibly straydn, so that srci (and
8307 * destdn->inode) remain connected during the rename.
8309 * to do this, we freeze srci, then leader (destdn auth) verifies that
8310 * all other nodes have also replicated destdn and straydn. note that
8311 * destdn replicas need not also replicate srci. this only works when
8314 * This function takes responsibility for the passed mdr.
8316 void Server::handle_client_rename(MDRequestRef
& mdr
)
8318 const auto& req
= mdr
->client_request
;
8319 dout(7) << "handle_client_rename " << *req
<< dendl
;
8321 filepath destpath
= req
->get_filepath();
8322 filepath srcpath
= req
->get_filepath2();
8323 if (srcpath
.is_last_dot_or_dotdot() || destpath
.is_last_dot_or_dotdot()) {
8324 respond_to_request(mdr
, -CEPHFS_EBUSY
);
8328 if (req
->get_alternate_name().size() > alternate_name_max
) {
8329 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
8330 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
8334 auto [destdn
, srcdn
] = rdlock_two_paths_xlock_destdn(mdr
, true);
8338 dout(10) << " destdn " << *destdn
<< dendl
;
8339 CDir
*destdir
= destdn
->get_dir();
8340 ceph_assert(destdir
->is_auth());
8341 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
8343 dout(10) << " srcdn " << *srcdn
<< dendl
;
8344 CDir
*srcdir
= srcdn
->get_dir();
8345 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
8346 CInode
*srci
= srcdnl
->get_inode();
8347 dout(10) << " srci " << *srci
<< dendl
;
8349 // -- some sanity checks --
8350 if (destdn
== srcdn
) {
8351 dout(7) << "rename src=dest, noop" << dendl
;
8352 respond_to_request(mdr
, 0);
8356 // dest a child of src?
8357 // e.g. mv /usr /usr/foo
8358 if (srci
->is_dir() && srci
->is_projected_ancestor_of(destdir
->get_inode())) {
8359 dout(7) << "cannot rename item to be a child of itself" << dendl
;
8360 respond_to_request(mdr
, -CEPHFS_EINVAL
);
8364 // is this a stray migration, reintegration or merge? (sanity checks!)
8365 if (mdr
->reqid
.name
.is_mds() &&
8366 !(MDS_INO_IS_STRAY(srcpath
.get_ino()) &&
8367 MDS_INO_IS_STRAY(destpath
.get_ino())) &&
8368 !(destdnl
->is_remote() &&
8369 destdnl
->get_remote_ino() == srci
->ino())) {
8370 respond_to_request(mdr
, -CEPHFS_EINVAL
); // actually, this won't reply, but whatev.
8375 if (!destdnl
->is_null()) {
8376 //dout(10) << "dest dn exists " << *destdn << dendl;
8377 oldin
= mdcache
->get_dentry_inode(destdn
, mdr
, true);
8379 dout(10) << " oldin " << *oldin
<< dendl
;
8381 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
8382 if (oldin
->is_dir() && _dir_is_nonempty_unlocked(mdr
, oldin
)) {
8383 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
8387 // mv /some/thing /to/some/existing_other_thing
8388 if (oldin
->is_dir() && !srci
->is_dir()) {
8389 respond_to_request(mdr
, -CEPHFS_EISDIR
);
8392 if (!oldin
->is_dir() && srci
->is_dir()) {
8393 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
8396 if (srci
== oldin
&& !srcdir
->inode
->is_stray()) {
8397 respond_to_request(mdr
, 0); // no-op. POSIX makes no sense.
8400 if (destdn
->get_alternate_name() != req
->get_alternate_name()) {
8401 /* the dentry exists but the alternate_names do not match, fail... */
8402 respond_to_request(mdr
, -CEPHFS_EINVAL
);
8407 vector
<CDentry
*>& srctrace
= mdr
->dn
[1];
8408 vector
<CDentry
*>& desttrace
= mdr
->dn
[0];
8410 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
8411 if (destpath
.get_ino() != srcpath
.get_ino() &&
8412 !(req
->get_source().is_mds() &&
8413 MDS_INO_IS_STRAY(srcpath
.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
8414 CInode
*srcbase
= srctrace
[0]->get_dir()->get_inode();
8415 CInode
*destbase
= desttrace
[0]->get_dir()->get_inode();
8416 // ok, extend srctrace toward root until it is an ancestor of desttrace.
8417 while (srcbase
!= destbase
&&
8418 !srcbase
->is_projected_ancestor_of(destbase
)) {
8419 CDentry
*pdn
= srcbase
->get_projected_parent_dn();
8420 srctrace
.insert(srctrace
.begin(), pdn
);
8421 dout(10) << "rename prepending srctrace with " << *pdn
<< dendl
;
8422 srcbase
= pdn
->get_dir()->get_inode();
8425 // then, extend destpath until it shares the same parent inode as srcpath.
8426 while (destbase
!= srcbase
) {
8427 CDentry
*pdn
= destbase
->get_projected_parent_dn();
8428 desttrace
.insert(desttrace
.begin(), pdn
);
8429 dout(10) << "rename prepending desttrace with " << *pdn
<< dendl
;
8430 destbase
= pdn
->get_dir()->get_inode();
8432 dout(10) << "rename src and dest traces now share common ancestor " << *destbase
<< dendl
;
8436 bool linkmerge
= srcdnl
->get_inode() == destdnl
->get_inode();
8438 dout(10) << " this is a link merge" << dendl
;
8440 // -- create stray dentry? --
8441 CDentry
*straydn
= NULL
;
8442 if (destdnl
->is_primary() && !linkmerge
) {
8443 straydn
= prepare_stray_dentry(mdr
, destdnl
->get_inode());
8446 dout(10) << " straydn is " << *straydn
<< dendl
;
8447 } else if (mdr
->straydn
) {
8448 mdr
->unpin(mdr
->straydn
);
8449 mdr
->straydn
= NULL
;
8454 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
8455 MutationImpl::LockOpVec lov
;
8457 // we need to update srci's ctime. xlock its least contended lock to do that...
8458 lov
.add_xlock(&srci
->linklock
);
8459 lov
.add_xlock(&srci
->snaplock
);
8462 // xlock oldin (for nlink--)
8463 lov
.add_xlock(&oldin
->linklock
);
8464 lov
.add_xlock(&oldin
->snaplock
);
8465 if (oldin
->is_dir()) {
8466 ceph_assert(srci
->is_dir());
8467 lov
.add_rdlock(&oldin
->filelock
); // to verify it's empty
8469 // adjust locking order?
8470 int cmp
= mdr
->compare_paths();
8471 if (cmp
< 0 || (cmp
== 0 && oldin
->ino() < srci
->ino()))
8472 std::reverse(lov
.begin(), lov
.end());
8474 ceph_assert(!srci
->is_dir());
8475 // adjust locking order;
8476 if (srci
->ino() > oldin
->ino())
8477 std::reverse(lov
.begin(), lov
.end());
8483 lov
.add_wrlock(&straydn
->get_dir()->inode
->filelock
);
8484 lov
.add_wrlock(&straydn
->get_dir()->inode
->nestlock
);
8485 lov
.add_xlock(&straydn
->lock
);
8488 CInode
*auth_pin_freeze
= !srcdn
->is_auth() && srcdnl
->is_primary() ? srci
: nullptr;
8489 if (!mds
->locker
->acquire_locks(mdr
, lov
, auth_pin_freeze
))
8492 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
8496 ceph_assert(srcdir
->inode
->is_stray() && srcdnl
->is_primary() && destdnl
->is_remote());
8498 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
8499 if (!check_access(mdr
, srcdir
->get_inode(), MAY_WRITE
))
8502 if (!check_access(mdr
, destdn
->get_dir()->get_inode(), MAY_WRITE
))
8505 if (!linkmerge
&& !check_fragment_space(mdr
, destdn
->get_dir()))
8508 if (!linkmerge
&& !check_dir_max_entries(mdr
, destdn
->get_dir()))
8511 if (!check_access(mdr
, srci
, MAY_WRITE
))
8515 // with read lock, really verify oldin is empty
8518 _dir_is_nonempty(mdr
, oldin
)) {
8519 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
8523 /* project_snaprealm_past_parent() will do this job
8525 // moving between snaprealms?
8526 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
8527 SnapRealm *srcrealm = srci->find_snaprealm();
8528 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
8529 if (srcrealm != destrealm &&
8530 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
8531 destrealm->get_newest_seq() + 1 > srcdn->first)) {
8532 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
8533 mdcache->snaprealm_create(mdr, srci);
8539 SnapRealm
*dest_realm
= nullptr;
8540 SnapRealm
*src_realm
= nullptr;
8542 dest_realm
= destdir
->inode
->find_snaprealm();
8543 if (srcdir
->inode
== destdir
->inode
)
8544 src_realm
= dest_realm
;
8546 src_realm
= srcdir
->inode
->find_snaprealm();
8547 if (src_realm
!= dest_realm
&&
8548 src_realm
->get_subvolume_ino() != dest_realm
->get_subvolume_ino()) {
8549 respond_to_request(mdr
, -CEPHFS_EXDEV
);
8554 ceph_assert(g_conf()->mds_kill_rename_at
!= 1);
8556 // -- open all srcdn inode frags, if any --
8557 // we need these open so that auth can properly delegate from inode to dirfrags
8558 // after the inode is _ours_.
8559 if (srcdnl
->is_primary() &&
8560 !srcdn
->is_auth() &&
8562 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl
;
8563 mdr
->set_stickydirs(srci
);
8566 srci
->dirfragtree
.get_leaves(leaves
);
8567 for (const auto& leaf
: leaves
) {
8568 CDir
*dir
= srci
->get_dirfrag(leaf
);
8570 dout(10) << " opening " << leaf
<< " under " << *srci
<< dendl
;
8571 mdcache
->open_remote_dirfrag(srci
, leaf
, new C_MDS_RetryRequest(mdcache
, mdr
));
8577 // -- prepare snaprealm ---
8580 if (!mdr
->more()->srci_srnode
&&
8581 srci
->get_projected_inode()->nlink
== 1 &&
8582 srci
->is_projected_snaprealm_global()) {
8583 sr_t
*new_srnode
= srci
->prepare_new_srnode(0);
8584 srci
->record_snaprealm_parent_dentry(new_srnode
, nullptr, destdn
, false);
8586 srci
->clear_snaprealm_global(new_srnode
);
8587 mdr
->more()->srci_srnode
= new_srnode
;
8590 if (oldin
&& !mdr
->more()->desti_srnode
) {
8591 if (oldin
->is_projected_snaprealm_global()) {
8592 sr_t
*new_srnode
= oldin
->prepare_new_srnode(0);
8593 oldin
->record_snaprealm_parent_dentry(new_srnode
, dest_realm
, destdn
, destdnl
->is_primary());
8594 // dropping the last linkage or dropping the last remote linkage,
8595 // detch the inode from global snaprealm
8596 auto nlink
= oldin
->get_projected_inode()->nlink
;
8598 (nlink
== 2 && !destdnl
->is_primary() &&
8599 !oldin
->get_projected_parent_dir()->inode
->is_stray()))
8600 oldin
->clear_snaprealm_global(new_srnode
);
8601 mdr
->more()->desti_srnode
= new_srnode
;
8602 } else if (destdnl
->is_primary()) {
8603 snapid_t follows
= dest_realm
->get_newest_seq();
8604 if (oldin
->snaprealm
|| follows
+ 1 > oldin
->get_oldest_snap()) {
8605 sr_t
*new_srnode
= oldin
->prepare_new_srnode(follows
);
8606 oldin
->record_snaprealm_past_parent(new_srnode
, straydn
->get_dir()->inode
->find_snaprealm());
8607 mdr
->more()->desti_srnode
= new_srnode
;
8611 if (!mdr
->more()->srci_srnode
) {
8612 if (srci
->is_projected_snaprealm_global()) {
8613 sr_t
*new_srnode
= srci
->prepare_new_srnode(0);
8614 srci
->record_snaprealm_parent_dentry(new_srnode
, src_realm
, srcdn
, srcdnl
->is_primary());
8615 mdr
->more()->srci_srnode
= new_srnode
;
8616 } else if (srcdnl
->is_primary()) {
8617 snapid_t follows
= src_realm
->get_newest_seq();
8618 if (src_realm
!= dest_realm
&&
8619 (srci
->snaprealm
|| follows
+ 1 > srci
->get_oldest_snap())) {
8620 sr_t
*new_srnode
= srci
->prepare_new_srnode(follows
);
8621 srci
->record_snaprealm_past_parent(new_srnode
, dest_realm
);
8622 mdr
->more()->srci_srnode
= new_srnode
;
8628 // -- prepare witnesses --
8631 * NOTE: we use _all_ replicas as witnesses.
8632 * this probably isn't totally necessary (esp for file renames),
8633 * but if/when we change that, we have to make sure rejoin is
8634 * sufficiently robust to handle strong rejoins from survivors
8635 * with totally wrong dentry->inode linkage.
8636 * (currently, it can ignore rename effects, because the resolve
8637 * stage will sort them out.)
8639 set
<mds_rank_t
> witnesses
= mdr
->more()->extra_witnesses
;
8640 if (srcdn
->is_auth())
8641 srcdn
->list_replicas(witnesses
);
8643 witnesses
.insert(srcdn
->authority().first
);
8644 if (srcdnl
->is_remote() && !srci
->is_auth())
8645 witnesses
.insert(srci
->authority().first
);
8646 destdn
->list_replicas(witnesses
);
8647 if (destdnl
->is_remote() && !oldin
->is_auth())
8648 witnesses
.insert(oldin
->authority().first
);
8649 dout(10) << " witnesses " << witnesses
<< ", have " << mdr
->more()->witnessed
<< dendl
;
8651 if (!witnesses
.empty()) {
8652 // Replicas can't see projected dentry linkages and will get confused.
8653 // We have taken snaplocks on ancestor inodes. Later rename/rmdir requests
8654 // can't project these inodes' linkages.
8655 bool need_flush
= false;
8656 for (auto& dn
: srctrace
) {
8657 if (dn
->is_projected()) {
8663 CDentry
*dn
= destdn
;
8665 if (dn
->is_projected()) {
8669 CInode
*diri
= dn
->get_dir()->get_inode();
8670 dn
= diri
->get_projected_parent_dn();
8674 mdlog
->wait_for_safe(
8675 new MDSInternalContextWrapper(mds
,
8676 new C_MDS_RetryRequest(mdcache
, mdr
)));
8682 // do srcdn auth last
8683 mds_rank_t last
= MDS_RANK_NONE
;
8684 if (!srcdn
->is_auth()) {
8685 last
= srcdn
->authority().first
;
8686 mdr
->more()->srcdn_auth_mds
= last
;
8687 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
8688 // are involved in the rename operation.
8689 if (srcdnl
->is_primary() && !mdr
->more()->is_ambiguous_auth
) {
8690 dout(10) << " preparing ambiguous auth for srci" << dendl
;
8691 ceph_assert(mdr
->more()->is_remote_frozen_authpin
);
8692 ceph_assert(mdr
->more()->rename_inode
== srci
);
8693 _rename_prepare_witness(mdr
, last
, witnesses
, srctrace
, desttrace
, straydn
);
8698 for (set
<mds_rank_t
>::iterator p
= witnesses
.begin();
8699 p
!= witnesses
.end();
8701 if (*p
== last
) continue; // do it last!
8702 if (mdr
->more()->witnessed
.count(*p
)) {
8703 dout(10) << " already witnessed by mds." << *p
<< dendl
;
8704 } else if (mdr
->more()->waiting_on_peer
.count(*p
)) {
8705 dout(10) << " already waiting on witness mds." << *p
<< dendl
;
8707 if (!_rename_prepare_witness(mdr
, *p
, witnesses
, srctrace
, desttrace
, straydn
))
8711 if (!mdr
->more()->waiting_on_peer
.empty())
8712 return; // we're waiting for a witness.
8714 if (last
!= MDS_RANK_NONE
&& mdr
->more()->witnessed
.count(last
) == 0) {
8715 dout(10) << " preparing last witness (srcdn auth)" << dendl
;
8716 ceph_assert(mdr
->more()->waiting_on_peer
.count(last
) == 0);
8717 _rename_prepare_witness(mdr
, last
, witnesses
, srctrace
, desttrace
, straydn
);
8721 // test hack: bail after peer does prepare, so we can verify it's _live_ rollback.
8722 if (!mdr
->more()->peers
.empty() && !srci
->is_dir())
8723 ceph_assert(g_conf()->mds_kill_rename_at
!= 3);
8724 if (!mdr
->more()->peers
.empty() && srci
->is_dir())
8725 ceph_assert(g_conf()->mds_kill_rename_at
!= 4);
8727 // -- declare now --
8728 mdr
->set_mds_stamp(ceph_clock_now());
8730 // -- prepare journal entry --
8731 mdr
->ls
= mdlog
->get_current_segment();
8732 EUpdate
*le
= new EUpdate(mdlog
, "rename");
8733 mdlog
->start_entry(le
);
8734 le
->metablob
.add_client_req(mdr
->reqid
, req
->get_oldest_client_tid());
8735 if (!mdr
->more()->witnessed
.empty()) {
8736 dout(20) << " noting uncommitted_peers " << mdr
->more()->witnessed
<< dendl
;
8738 le
->reqid
= mdr
->reqid
;
8739 le
->had_peers
= true;
8741 mdcache
->add_uncommitted_leader(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
8742 // no need to send frozen auth pin to recovring auth MDS of srci
8743 mdr
->more()->is_remote_frozen_authpin
= false;
8746 _rename_prepare(mdr
, &le
->metablob
, &le
->client_map
, srcdn
, destdn
, req
->get_alternate_name(), straydn
);
8747 if (le
->client_map
.length())
8748 le
->cmapv
= mds
->sessionmap
.get_projected();
8750 // -- commit locally --
8751 C_MDS_rename_finish
*fin
= new C_MDS_rename_finish(this, mdr
, srcdn
, destdn
, straydn
);
8753 journal_and_reply(mdr
, srci
, destdn
, le
, fin
);
8754 mds
->balancer
->maybe_fragment(destdn
->get_dir(), false);
8758 void Server::_rename_finish(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
8760 dout(10) << "_rename_finish " << *mdr
<< dendl
;
8762 if (!mdr
->more()->witnessed
.empty())
8763 mdcache
->logged_leader_update(mdr
->reqid
);
8766 _rename_apply(mdr
, srcdn
, destdn
, straydn
);
8768 mdcache
->send_dentry_link(destdn
, mdr
);
8770 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
8771 CInode
*in
= destdnl
->get_inode();
8772 bool need_eval
= mdr
->more()->cap_imports
.count(in
);
8774 // test hack: test peer commit
8775 if (!mdr
->more()->peers
.empty() && !in
->is_dir())
8776 ceph_assert(g_conf()->mds_kill_rename_at
!= 5);
8777 if (!mdr
->more()->peers
.empty() && in
->is_dir())
8778 ceph_assert(g_conf()->mds_kill_rename_at
!= 6);
8781 mds
->balancer
->hit_dir(srcdn
->get_dir(), META_POP_IWR
);
8782 if (destdnl
->is_remote() && in
->is_auth())
8783 mds
->balancer
->hit_inode(in
, META_POP_IWR
);
8785 // did we import srci? if so, explicitly ack that import that, before we unlock and reply.
8787 ceph_assert(g_conf()->mds_kill_rename_at
!= 7);
8790 respond_to_request(mdr
, 0);
8793 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
, true);
8796 // respond_to_request() drops locks. So stray reintegration can race with us.
8797 if (straydn
&& !straydn
->get_projected_linkage()->is_null()) {
8798 mdcache
->notify_stray(straydn
);
8806 bool Server::_rename_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, set
<mds_rank_t
> &witnesse
,
8807 vector
<CDentry
*>& srctrace
, vector
<CDentry
*>& dsttrace
, CDentry
*straydn
)
8809 const auto& client_req
= mdr
->client_request
;
8810 ceph_assert(client_req
);
8812 if (mds
->is_cluster_degraded() &&
8813 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(who
)) {
8814 dout(10) << "_rename_prepare_witness mds." << who
<< " is not active" << dendl
;
8815 if (mdr
->more()->waiting_on_peer
.empty())
8816 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(mdcache
, mdr
));
8820 dout(10) << "_rename_prepare_witness mds." << who
<< dendl
;
8821 auto req
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RENAMEPREP
);
8823 req
->srcdnpath
= filepath(srctrace
.front()->get_dir()->ino());
8824 for (auto dn
: srctrace
)
8825 req
->srcdnpath
.push_dentry(dn
->get_name());
8826 req
->destdnpath
= filepath(dsttrace
.front()->get_dir()->ino());
8827 for (auto dn
: dsttrace
)
8828 req
->destdnpath
.push_dentry(dn
->get_name());
8829 req
->alternate_name
= client_req
->alternate_name
;
8831 mdcache
->encode_replica_stray(straydn
, who
, req
->straybl
);
8833 if (mdr
->more()->srci_srnode
)
8834 encode(*mdr
->more()->srci_srnode
, req
->srci_snapbl
);
8835 if (mdr
->more()->desti_srnode
)
8836 encode(*mdr
->more()->desti_srnode
, req
->desti_snapbl
);
8838 req
->srcdn_auth
= mdr
->more()->srcdn_auth_mds
;
8840 // srcdn auth will verify our current witness list is sufficient
8841 req
->witnesses
= witnesse
;
8843 req
->op_stamp
= mdr
->get_op_stamp();
8844 mds
->send_message_mds(req
, who
);
8846 ceph_assert(mdr
->more()->waiting_on_peer
.count(who
) == 0);
8847 mdr
->more()->waiting_on_peer
.insert(who
);
8851 version_t
Server::_rename_prepare_import(MDRequestRef
& mdr
, CDentry
*srcdn
, bufferlist
*client_map_bl
)
8853 version_t oldpv
= mdr
->more()->inode_import_v
;
8855 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
8858 auto blp
= mdr
->more()->inode_import
.cbegin();
8861 map
<client_t
,entity_inst_t
> client_map
;
8862 map
<client_t
, client_metadata_t
> client_metadata_map
;
8863 decode(client_map
, blp
);
8864 decode(client_metadata_map
, blp
);
8865 prepare_force_open_sessions(client_map
, client_metadata_map
,
8866 mdr
->more()->imported_session_map
);
8867 encode(client_map
, *client_map_bl
, mds
->mdsmap
->get_up_features());
8868 encode(client_metadata_map
, *client_map_bl
);
8870 list
<ScatterLock
*> updated_scatterlocks
;
8871 mdcache
->migrator
->decode_import_inode(srcdn
, blp
, srcdn
->authority().first
, mdr
->ls
,
8872 mdr
->more()->cap_imports
, updated_scatterlocks
);
8874 // hack: force back to !auth and clean, temporarily
8875 srcdnl
->get_inode()->state_clear(CInode::STATE_AUTH
);
8876 srcdnl
->get_inode()->mark_clean();
8881 bool Server::_need_force_journal(CInode
*diri
, bool empty
)
8883 auto&& dirs
= diri
->get_dirfrags();
8885 bool force_journal
= false;
8887 for (const auto& dir
: dirs
) {
8888 if (dir
->is_subtree_root() && dir
->get_dir_auth().first
== mds
->get_nodeid()) {
8889 dout(10) << " frag " << dir
->get_frag() << " is auth subtree dirfrag, will force journal" << dendl
;
8890 force_journal
= true;
8893 dout(20) << " frag " << dir
->get_frag() << " is not auth subtree dirfrag" << dendl
;
8896 // see if any children of our frags are auth subtrees.
8897 std::vector
<CDir
*> subtrees
;
8898 mdcache
->get_subtrees(subtrees
);
8899 dout(10) << " subtrees " << subtrees
<< " frags " << dirs
<< dendl
;
8900 for (const auto& dir
: dirs
) {
8901 for (const auto& subtree
: subtrees
) {
8902 if (dir
->contains(subtree
)) {
8903 if (subtree
->get_dir_auth().first
== mds
->get_nodeid()) {
8904 dout(10) << " frag " << dir
->get_frag() << " contains (maybe) auth subtree, will force journal "
8905 << *subtree
<< dendl
;
8906 force_journal
= true;
8909 dout(20) << " frag " << dir
->get_frag() << " contains but isn't auth for " << *subtree
<< dendl
;
8911 dout(20) << " frag " << dir
->get_frag() << " does not contain " << *subtree
<< dendl
;
8917 return force_journal
;
8920 void Server::_rename_prepare(MDRequestRef
& mdr
,
8921 EMetaBlob
*metablob
, bufferlist
*client_map_bl
,
8922 CDentry
*srcdn
, CDentry
*destdn
, std::string_view alternate_name
,
8925 dout(10) << "_rename_prepare " << *mdr
<< " " << *srcdn
<< " " << *destdn
<< dendl
;
8927 dout(10) << " straydn " << *straydn
<< dendl
;
8929 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
8930 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
8931 CInode
*srci
= srcdnl
->get_inode();
8932 CInode
*oldin
= destdnl
->get_inode();
8934 // primary+remote link merge?
8935 bool linkmerge
= (srci
== oldin
);
8937 ceph_assert(srcdnl
->is_primary() && destdnl
->is_remote());
8938 bool silent
= srcdn
->get_dir()->inode
->is_stray();
8940 bool force_journal_dest
= false;
8941 if (srci
->is_dir() && !destdn
->is_auth()) {
8942 if (srci
->is_auth()) {
8943 // if we are auth for srci and exporting it, force journal because journal replay needs
8944 // the source inode to create auth subtrees.
8945 dout(10) << " we are exporting srci, will force journal destdn" << dendl
;
8946 force_journal_dest
= true;
8948 force_journal_dest
= _need_force_journal(srci
, false);
8951 bool force_journal_stray
= false;
8952 if (oldin
&& oldin
->is_dir() && straydn
&& !straydn
->is_auth())
8953 force_journal_stray
= _need_force_journal(oldin
, true);
8956 dout(10) << " merging remote and primary links to the same inode" << dendl
;
8958 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl
;
8959 if (force_journal_dest
)
8960 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl
;
8961 if (force_journal_stray
)
8962 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl
;
8964 if (srci
->is_dir() && (destdn
->is_auth() || force_journal_dest
)) {
8965 dout(10) << " noting renamed dir ino " << srci
->ino() << " in metablob" << dendl
;
8966 metablob
->renamed_dirino
= srci
->ino();
8967 } else if (oldin
&& oldin
->is_dir() && force_journal_stray
) {
8968 dout(10) << " noting rename target dir " << oldin
->ino() << " in metablob" << dendl
;
8969 metablob
->renamed_dirino
= oldin
->ino();
8973 CInode::mempool_inode
*spi
= 0; // renamed inode
8974 CInode::mempool_inode
*tpi
= 0; // target/overwritten inode
8978 if (destdnl
->is_primary()) {
8979 ceph_assert(straydn
); // moving to straydn.
8980 // link--, and move.
8981 if (destdn
->is_auth()) {
8982 auto pi
= oldin
->project_inode(mdr
); //project_snaprealm
8983 pi
.inode
->version
= straydn
->pre_dirty(pi
.inode
->version
);
8984 pi
.inode
->update_backtrace();
8985 tpi
= pi
.inode
.get();
8987 straydn
->push_projected_linkage(oldin
);
8988 } else if (destdnl
->is_remote()) {
8990 if (oldin
->is_auth()) {
8991 auto pi
= oldin
->project_inode(mdr
);
8992 pi
.inode
->version
= oldin
->pre_dirty();
8993 tpi
= pi
.inode
.get();
8999 if (destdnl
->is_null()) {
9000 /* handle_client_rename checks that alternate_name matches for existing destdn */
9001 destdn
->set_alternate_name(alternate_name
);
9003 if (srcdnl
->is_remote()) {
9006 if (destdn
->is_auth())
9007 mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty();
9008 destdn
->push_projected_linkage(srcdnl
->get_remote_ino(), srcdnl
->get_remote_d_type());
9010 if (srci
->is_auth()) {
9011 auto pi
= srci
->project_inode(mdr
);
9012 pi
.inode
->version
= srci
->pre_dirty();
9013 spi
= pi
.inode
.get();
9016 dout(10) << " will merge remote onto primary link" << dendl
;
9017 if (destdn
->is_auth()) {
9018 auto pi
= oldin
->project_inode(mdr
);
9019 pi
.inode
->version
= mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty(oldin
->get_version());
9020 spi
= pi
.inode
.get();
9024 if (destdn
->is_auth()) {
9026 if (srcdn
->is_auth())
9027 oldpv
= srci
->get_projected_version();
9029 oldpv
= _rename_prepare_import(mdr
, srcdn
, client_map_bl
);
9031 // note which dirfrags have child subtrees in the journal
9032 // event, so that we can open those (as bounds) during replay.
9033 if (srci
->is_dir()) {
9034 auto&& ls
= srci
->get_dirfrags();
9035 for (const auto& dir
: ls
) {
9036 if (!dir
->is_auth())
9037 metablob
->renamed_dir_frags
.push_back(dir
->get_frag());
9039 dout(10) << " noting renamed dir open frags " << metablob
->renamed_dir_frags
<< dendl
;
9042 auto pi
= srci
->project_inode(mdr
); // project snaprealm if srcdnl->is_primary
9043 // & srcdnl->snaprealm
9044 pi
.inode
->version
= mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty(oldpv
);
9045 pi
.inode
->update_backtrace();
9046 spi
= pi
.inode
.get();
9048 destdn
->push_projected_linkage(srci
);
9052 if (srcdn
->is_auth())
9053 mdr
->more()->pvmap
[srcdn
] = srcdn
->pre_dirty();
9054 srcdn
->push_projected_linkage(); // push null linkage
9058 spi
->ctime
= mdr
->get_op_stamp();
9059 if (mdr
->get_op_stamp() > spi
->rstat
.rctime
)
9060 spi
->rstat
.rctime
= mdr
->get_op_stamp();
9066 tpi
->ctime
= mdr
->get_op_stamp();
9067 if (mdr
->get_op_stamp() > tpi
->rstat
.rctime
)
9068 tpi
->rstat
.rctime
= mdr
->get_op_stamp();
9072 destdn
->make_path_string(t
, true);
9073 tpi
->stray_prior_path
= std::move(t
);
9076 if (tpi
->nlink
== 0)
9077 oldin
->state_set(CInode::STATE_ORPHAN
);
9081 // prepare nesting, mtime updates
9082 int predirty_dir
= silent
? 0:PREDIRTY_DIR
;
9084 // guarantee stray dir is processed first during journal replay. unlink the old inode,
9085 // then link the source inode to destdn
9086 if (destdnl
->is_primary()) {
9087 ceph_assert(straydn
);
9088 if (straydn
->is_auth()) {
9089 metablob
->add_dir_context(straydn
->get_dir());
9090 metablob
->add_dir(straydn
->get_dir(), true);
9094 if (!linkmerge
&& destdnl
->is_remote() && oldin
->is_auth()) {
9095 CDir
*oldin_dir
= oldin
->get_projected_parent_dir();
9096 if (oldin_dir
!= srcdn
->get_dir() && oldin_dir
!= destdn
->get_dir())
9097 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, oldin_dir
, PREDIRTY_PRIMARY
);
9101 if (destdn
->is_auth() && !destdnl
->is_null()) {
9102 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, destdn
->get_dir(),
9103 (destdnl
->is_primary() ? PREDIRTY_PRIMARY
:0)|predirty_dir
, -1);
9104 if (destdnl
->is_primary()) {
9105 ceph_assert(straydn
);
9106 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, straydn
->get_dir(),
9107 PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
9111 if (srcdnl
->is_remote() && srci
->is_auth()) {
9112 CDir
*srci_dir
= srci
->get_projected_parent_dir();
9113 if (srci_dir
!= srcdn
->get_dir() && srci_dir
!= destdn
->get_dir())
9114 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, srci_dir
, PREDIRTY_PRIMARY
);
9118 int predirty_primary
= (srcdnl
->is_primary() && srcdn
->get_dir() != destdn
->get_dir()) ? PREDIRTY_PRIMARY
:0;
9119 int flags
= predirty_dir
| predirty_primary
;
9120 if (srcdn
->is_auth())
9121 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, srcdn
->get_dir(), PREDIRTY_SHALLOW
|flags
, -1);
9122 if (destdn
->is_auth())
9123 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, destdn
->get_dir(), flags
, 1);
9125 // add it all to the metablob
9128 if (destdnl
->is_primary()) {
9129 ceph_assert(straydn
);
9130 if (destdn
->is_auth()) {
9131 // project snaprealm, too
9132 if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
9133 oldin
->project_snaprealm(desti_srnode
);
9134 if (tpi
->nlink
== 0)
9135 ceph_assert(!desti_srnode
->is_parent_global());
9136 desti_srnode
= NULL
;
9138 straydn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
9139 metablob
->add_primary_dentry(straydn
, oldin
, true, true);
9140 } else if (force_journal_stray
) {
9141 dout(10) << " forced journaling straydn " << *straydn
<< dendl
;
9142 metablob
->add_dir_context(straydn
->get_dir());
9143 metablob
->add_primary_dentry(straydn
, oldin
, true);
9145 } else if (destdnl
->is_remote()) {
9146 if (oldin
->is_auth()) {
9147 sr_t
*new_srnode
= NULL
;
9148 if (mdr
->peer_request
) {
9149 if (mdr
->peer_request
->desti_snapbl
.length() > 0) {
9150 new_srnode
= new sr_t();
9151 auto p
= mdr
->peer_request
->desti_snapbl
.cbegin();
9152 decode(*new_srnode
, p
);
9154 } else if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
9155 new_srnode
= desti_srnode
;
9156 desti_srnode
= NULL
;
9159 oldin
->project_snaprealm(new_srnode
);
9160 if (tpi
->nlink
== 0)
9161 ceph_assert(!new_srnode
->is_parent_global());
9164 CDentry
*oldin_pdn
= oldin
->get_projected_parent_dn();
9165 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, oldin_pdn
);
9166 metablob
->add_primary_dentry(oldin_pdn
, oldin
, true);
9172 if (srcdnl
->is_remote()) {
9173 ceph_assert(!linkmerge
);
9174 if (destdn
->is_auth() && !destdnl
->is_null())
9175 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
9177 destdn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
9179 if (destdn
->is_auth())
9180 metablob
->add_remote_dentry(destdn
, true, srcdnl
->get_remote_ino(), srcdnl
->get_remote_d_type());
9182 if (srci
->is_auth() ) { // it's remote
9183 if (mdr
->peer_request
) {
9184 if (mdr
->peer_request
->srci_snapbl
.length() > 0) {
9185 sr_t
*new_srnode
= new sr_t();
9186 auto p
= mdr
->peer_request
->srci_snapbl
.cbegin();
9187 decode(*new_srnode
, p
);
9188 srci
->project_snaprealm(new_srnode
);
9190 } else if (auto& srci_srnode
= mdr
->more()->srci_srnode
) {
9191 srci
->project_snaprealm(srci_srnode
);
9195 CDentry
*srci_pdn
= srci
->get_projected_parent_dn();
9196 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, srci_pdn
);
9197 metablob
->add_primary_dentry(srci_pdn
, srci
, true);
9199 } else if (srcdnl
->is_primary()) {
9200 // project snap parent update?
9201 if (destdn
->is_auth()) {
9202 if (auto& srci_srnode
= mdr
->more()->srci_srnode
) {
9203 srci
->project_snaprealm(srci_srnode
);
9208 if (destdn
->is_auth() && !destdnl
->is_null())
9209 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
9211 destdn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
9213 if (destdn
->is_auth())
9214 metablob
->add_primary_dentry(destdn
, srci
, true, true);
9215 else if (force_journal_dest
) {
9216 dout(10) << " forced journaling destdn " << *destdn
<< dendl
;
9217 metablob
->add_dir_context(destdn
->get_dir());
9218 metablob
->add_primary_dentry(destdn
, srci
, true);
9219 if (srcdn
->is_auth() && srci
->is_dir()) {
9220 // journal new subtrees root dirfrags
9221 auto&& ls
= srci
->get_dirfrags();
9222 for (const auto& dir
: ls
) {
9224 metablob
->add_dir(dir
, true);
9231 if (srcdn
->is_auth()) {
9232 dout(10) << " journaling srcdn " << *srcdn
<< dendl
;
9233 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, srcdn
, CEPH_NOSNAP
, 0, srcdnl
);
9234 // also journal the inode in case we need do peer rename rollback. It is Ok to add
9235 // both primary and NULL dentries. Because during journal replay, null dentry is
9236 // processed after primary dentry.
9237 if (srcdnl
->is_primary() && !srci
->is_dir() && !destdn
->is_auth())
9238 metablob
->add_primary_dentry(srcdn
, srci
, true);
9239 metablob
->add_null_dentry(srcdn
, true);
9241 dout(10) << " NOT journaling srcdn " << *srcdn
<< dendl
;
9243 // make renamed inode first track the dn
9244 if (srcdnl
->is_primary() && destdn
->is_auth()) {
9245 ceph_assert(srci
->first
<= destdn
->first
);
9246 srci
->first
= destdn
->first
;
9248 // make stray inode first track the straydn
9249 if (straydn
&& straydn
->is_auth()) {
9250 ceph_assert(oldin
->first
<= straydn
->first
);
9251 oldin
->first
= straydn
->first
;
9254 if (oldin
&& oldin
->is_dir()) {
9255 ceph_assert(straydn
);
9256 mdcache
->project_subtree_rename(oldin
, destdn
->get_dir(), straydn
->get_dir());
9259 mdcache
->project_subtree_rename(srci
, srcdn
->get_dir(), destdn
->get_dir());
// Apply a (possibly cross-MDS) rename to the in-memory cache after it has
// been journaled: pop the projected linkages/inodes for destdn, srcdn and
// straydn, import caps/xlocks when the source inode migrates to this rank,
// then fix up subtree placement and send snaprealm split notifications.
// NOTE(review): this text is a lossy extract — brace/blank lines were
// dropped, so statement grouping must be read from the embedded original
// line numbers; do not edit without the full source.
9264 void Server::_rename_apply(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
9266 dout(10) << "_rename_apply " << *mdr
<< " " << *srcdn
<< " " << *destdn
<< dendl
;
9267 dout(10) << " pvs " << mdr
->more()->pvmap
<< dendl
;
9269 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
9270 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
9272 CInode
*oldin
= destdnl
->get_inode();
9274 // primary+remote link merge?
9275 bool linkmerge
= (srcdnl
->get_inode() == oldin
);
9277 ceph_assert(srcdnl
->is_primary() || destdnl
->is_remote());
9279 bool new_in_snaprealm
= false;
9280 bool new_oldin_snaprealm
= false;
// Unlink destdn's existing target first: a primary target (oldin) is moved
// to straydn, a remote link is simply dropped.
9284 if (destdnl
->is_primary()) {
9285 ceph_assert(straydn
);
9286 dout(10) << "straydn is " << *straydn
<< dendl
;
9288 // if there is newly created snaprealm, need to split old snaprealm's
9289 // inodes_with_caps. So pop snaprealm before linkage changes.
9290 if (destdn
->is_auth()) {
9291 bool hadrealm
= (oldin
->snaprealm
? true : false);
9292 oldin
->early_pop_projected_snaprealm();
9293 new_oldin_snaprealm
= (oldin
->snaprealm
&& !hadrealm
);
9295 ceph_assert(mdr
->peer_request
);
9296 if (mdr
->peer_request
->desti_snapbl
.length()) {
9297 new_oldin_snaprealm
= !oldin
->snaprealm
;
9298 oldin
->decode_snap_blob(mdr
->peer_request
->desti_snapbl
);
9299 ceph_assert(oldin
->snaprealm
);
9303 destdn
->get_dir()->unlink_inode(destdn
, false);
9305 straydn
->pop_projected_linkage();
9306 if (mdr
->is_peer() && !mdr
->more()->peer_update_journaled
)
9307 ceph_assert(!straydn
->is_projected()); // no other projected
9310 if (destdn
->is_auth())
9311 oldin
->pop_and_dirty_projected_inode(mdr
->ls
, mdr
);
9313 mdcache
->touch_dentry_bottom(straydn
); // drop dn as quickly as possible.
9314 } else if (destdnl
->is_remote()) {
9315 destdn
->get_dir()->unlink_inode(destdn
, false);
9316 if (oldin
->is_auth()) {
9317 oldin
->pop_and_dirty_projected_inode(mdr
->ls
, mdr
);
9318 } else if (mdr
->peer_request
) {
9319 if (mdr
->peer_request
->desti_snapbl
.length() > 0) {
9320 ceph_assert(oldin
->snaprealm
);
9321 oldin
->decode_snap_blob(mdr
->peer_request
->desti_snapbl
);
9323 } else if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
9324 delete desti_srnode
;
9325 desti_srnode
= NULL
;
9330 // unlink src before we relink it at dest
9331 CInode
*in
= srcdnl
->get_inode();
9334 bool srcdn_was_remote
= srcdnl
->is_remote();
9335 if (!srcdn_was_remote
) {
9336 // if there is newly created snaprealm, need to split old snaprealm's
9337 // inodes_with_caps. So pop snaprealm before linkage changes.
9338 if (destdn
->is_auth()) {
9339 bool hadrealm
= (in
->snaprealm
? true : false);
9340 in
->early_pop_projected_snaprealm();
9341 new_in_snaprealm
= (in
->snaprealm
&& !hadrealm
);
9343 ceph_assert(mdr
->peer_request
);
9344 if (mdr
->peer_request
->srci_snapbl
.length()) {
9345 new_in_snaprealm
= !in
->snaprealm
;
9346 in
->decode_snap_blob(mdr
->peer_request
->srci_snapbl
);
9347 ceph_assert(in
->snaprealm
);
9352 srcdn
->get_dir()->unlink_inode(srcdn
);
// Relink at destdn: a remote src re-links by ino; a primary src pops the
// projected primary linkage (importing the inode and its caps/xlocks if
// this rank just became auth for it).
9355 if (srcdn_was_remote
) {
9358 destdnl
= destdn
->pop_projected_linkage();
9359 if (mdr
->is_peer() && !mdr
->more()->peer_update_journaled
)
9360 ceph_assert(!destdn
->is_projected()); // no other projected
9362 destdn
->link_remote(destdnl
, in
);
9363 if (destdn
->is_auth())
9364 destdn
->mark_dirty(mdr
->more()->pvmap
[destdn
], mdr
->ls
);
9366 if (in
->is_auth()) {
9367 in
->pop_and_dirty_projected_inode(mdr
->ls
, mdr
);
9368 } else if (mdr
->peer_request
) {
9369 if (mdr
->peer_request
->srci_snapbl
.length() > 0) {
9370 ceph_assert(in
->snaprealm
);
9371 in
->decode_snap_blob(mdr
->peer_request
->srci_snapbl
);
9373 } else if (auto& srci_srnode
= mdr
->more()->srci_srnode
) {
9378 dout(10) << "merging remote onto primary link" << dendl
;
9379 oldin
->pop_and_dirty_projected_inode(mdr
->ls
, mdr
);
9383 dout(10) << "merging primary onto remote link" << dendl
;
9384 destdn
->get_dir()->unlink_inode(destdn
, false);
9386 destdnl
= destdn
->pop_projected_linkage();
9387 if (mdr
->is_peer() && !mdr
->more()->peer_update_journaled
)
9388 ceph_assert(!destdn
->is_projected()); // no other projected
9390 // srcdn inode import?
9391 if (!srcdn
->is_auth() && destdn
->is_auth()) {
9392 ceph_assert(mdr
->more()->inode_import
.length() > 0);
9394 map
<client_t
,Capability::Import
> imported_caps
;
9396 // finish cap imports
9397 finish_force_open_sessions(mdr
->more()->imported_session_map
);
9398 if (mdr
->more()->cap_imports
.count(destdnl
->get_inode())) {
9399 mdcache
->migrator
->finish_import_inode_caps(destdnl
->get_inode(),
9400 mdr
->more()->srcdn_auth_mds
, true,
9401 mdr
->more()->imported_session_map
,
9402 mdr
->more()->cap_imports
[destdnl
->get_inode()],
9406 mdr
->more()->inode_import
.clear();
9407 encode(imported_caps
, mdr
->more()->inode_import
);
9409 /* hack: add an auth pin for each xlock we hold. These were
9410 * remote xlocks previously but now they're local and
9411 * we're going to try and unpin when we xlock_finish. */
9413 for (auto i
= mdr
->locks
.lower_bound(&destdnl
->get_inode()->versionlock
);
9414 i
!= mdr
->locks
.end();
9416 SimpleLock
*lock
= i
->lock
;
9417 if (lock
->get_parent() != destdnl
->get_inode())
9419 if (i
->is_xlock() && !lock
->is_locallock())
9420 mds
->locker
->xlock_import(lock
);
9423 // hack: fix auth bit
9424 in
->state_set(CInode::STATE_AUTH
);
9426 mdr
->clear_ambiguous_auth();
9429 if (destdn
->is_auth())
9430 in
->pop_and_dirty_projected_inode(mdr
->ls
, mdr
);
// Dirty the (now null) src dentry and pop its projected linkage.
9434 if (srcdn
->is_auth())
9435 srcdn
->mark_dirty(mdr
->more()->pvmap
[srcdn
], mdr
->ls
);
9436 srcdn
->pop_projected_linkage();
9437 if (mdr
->is_peer() && !mdr
->more()->peer_update_journaled
)
9438 ceph_assert(!srcdn
->is_projected()); // no other projected
9440 // apply remaining projected inodes (nested)
9443 // update subtree map?
9444 if (destdnl
->is_primary() && in
->is_dir())
9445 mdcache
->adjust_subtree_after_rename(in
, srcdn
->get_dir(), true);
9447 if (straydn
&& oldin
->is_dir())
9448 mdcache
->adjust_subtree_after_rename(oldin
, destdn
->get_dir(), true);
9450 if (new_oldin_snaprealm
)
9451 mdcache
->do_realm_invalidate_and_update_notify(oldin
, CEPH_SNAP_OP_SPLIT
, false);
9452 if (new_in_snaprealm
)
9453 mdcache
->do_realm_invalidate_and_update_notify(in
, CEPH_SNAP_OP_SPLIT
, true);
9455 // removing a new dn?
9456 if (srcdn
->is_auth())
9457 srcdn
->get_dir()->try_remove_unlinked_dn(srcdn
);
9465 class C_MDS_PeerRenamePrep
: public ServerLogContext
{
9466 CDentry
*srcdn
, *destdn
, *straydn
;
9468 C_MDS_PeerRenamePrep(Server
*s
, MDRequestRef
& m
, CDentry
*sr
, CDentry
*de
, CDentry
*st
) :
9469 ServerLogContext(s
, m
), srcdn(sr
), destdn(de
), straydn(st
) {}
9470 void finish(int r
) override
{
9471 server
->_logged_peer_rename(mdr
, srcdn
, destdn
, straydn
);
9475 class C_MDS_PeerRenameCommit
: public ServerContext
{
9477 CDentry
*srcdn
, *destdn
, *straydn
;
9479 C_MDS_PeerRenameCommit(Server
*s
, MDRequestRef
& m
, CDentry
*sr
, CDentry
*de
, CDentry
*st
) :
9480 ServerContext(s
), mdr(m
), srcdn(sr
), destdn(de
), straydn(st
) {}
9481 void finish(int r
) override
{
9482 server
->_commit_peer_rename(mdr
, r
, srcdn
, destdn
, straydn
);
9486 class C_MDS_PeerRenameSessionsFlushed
: public ServerContext
{
9489 C_MDS_PeerRenameSessionsFlushed(Server
*s
, MDRequestRef
& r
) :
9490 ServerContext(s
), mdr(r
) {}
9491 void finish(int r
) override
{
9492 server
->_peer_rename_sessions_flushed(mdr
);
// Handle an OP_RENAMEPREP peer request from the rename leader: traverse the
// src/dest paths, optionally freeze the source inode and mark it
// ambiguous-auth, verify the leader's witness list covers all srcdn
// replicas, record rollback state, and journal an EPeerUpdate PREPARE
// before replying with OP_RENAMEPREPACK.
// NOTE(review): lossy extract — brace/blank lines were dropped; read
// structure from the embedded original line numbers.
9496 void Server::handle_peer_rename_prep(MDRequestRef
& mdr
)
9498 dout(10) << "handle_peer_rename_prep " << *mdr
9499 << " " << mdr
->peer_request
->srcdnpath
9500 << " to " << mdr
->peer_request
->destdnpath
9503 if (mdr
->peer_request
->is_interrupted()) {
9504 dout(10) << " peer request interrupted, sending noop reply" << dendl
;
9505 auto reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RENAMEPREPACK
);
9506 reply
->mark_interrupted();
9507 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
9508 mdr
->reset_peer_request();
// Discover the dest dentry; MDS_TRAVERSE_PATH_LOCKED reflects that the
// leader already holds the path locks for this rename.
9513 filepath
destpath(mdr
->peer_request
->destdnpath
);
9514 dout(10) << " dest " << destpath
<< dendl
;
9515 vector
<CDentry
*> trace
;
9516 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, false);
9517 int r
= mdcache
->path_traverse(mdr
, cf
, destpath
,
9518 MDS_TRAVERSE_DISCOVER
| MDS_TRAVERSE_PATH_LOCKED
| MDS_TRAVERSE_WANT_DENTRY
,
9521 if (r
== -CEPHFS_ESTALE
) {
9522 mdcache
->find_ino_peers(destpath
.get_ino(), new C_MDS_RetryRequest(mdcache
, mdr
),
9523 mdr
->peer_to_mds
, true);
9526 ceph_assert(r
== 0); // we shouldn't get an error here!
9528 CDentry
*destdn
= trace
.back();
9529 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
9530 dout(10) << " destdn " << *destdn
<< dendl
;
9534 filepath
srcpath(mdr
->peer_request
->srcdnpath
);
9535 dout(10) << " src " << srcpath
<< dendl
;
9536 CInode
*srci
= nullptr;
9537 r
= mdcache
->path_traverse(mdr
, cf
, srcpath
,
9538 MDS_TRAVERSE_DISCOVER
| MDS_TRAVERSE_PATH_LOCKED
,
9541 ceph_assert(r
== 0);
9543 CDentry
*srcdn
= trace
.back();
9544 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
9545 dout(10) << " srcdn " << *srcdn
<< dendl
;
9550 bool linkmerge
= srcdnl
->get_inode() == destdnl
->get_inode();
9552 ceph_assert(srcdnl
->is_primary() && destdnl
->is_remote());
9553 CDentry
*straydn
= mdr
->straydn
;
9554 if (destdnl
->is_primary() && !linkmerge
)
9555 ceph_assert(straydn
);
9557 mdr
->set_op_stamp(mdr
->peer_request
->op_stamp
);
9558 mdr
->more()->srcdn_auth_mds
= srcdn
->authority().first
;
9560 // set up commit waiter (early, to clean up any freezing etc we do)
9561 if (!mdr
->more()->peer_commit
)
9562 mdr
->more()->peer_commit
= new C_MDS_PeerRenameCommit(this, mdr
, srcdn
, destdn
, straydn
);
// If we are auth for srcdn: freeze the source inode, notify bystander
// replicas (OP_RENAMENOTIFY), flush client sessions with caps on it, and
// verify the leader's witness list before proceeding.
9565 if (srcdn
->is_auth()) {
9566 set
<mds_rank_t
> srcdnrep
;
9567 srcdn
->list_replicas(srcdnrep
);
9569 bool reply_witness
= false;
9570 if (srcdnl
->is_primary() && !srcdnl
->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH
)) {
9573 // - avoid conflicting lock state changes
9574 // - avoid concurrent updates to the inode
9575 // (this could also be accomplished with the versionlock)
9576 int allowance
= 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
9577 dout(10) << " freezing srci " << *srcdnl
->get_inode() << " with allowance " << allowance
<< dendl
;
9578 bool frozen_inode
= srcdnl
->get_inode()->freeze_inode(allowance
);
9580 // unfreeze auth pin after freezing the inode to avoid queueing waiters
9581 if (srcdnl
->get_inode()->is_frozen_auth_pin())
9582 mdr
->unfreeze_auth_pin();
9584 if (!frozen_inode
) {
9585 srcdnl
->get_inode()->add_waiter(CInode::WAIT_FROZEN
, new C_MDS_RetryRequest(mdcache
, mdr
));
9590 * set ambiguous auth for srci
9591 * NOTE: we don't worry about ambiguous cache expire as we do
9592 * with subtree migrations because all peers will pin
9593 * srcdn->get_inode() for duration of this rename.
9595 mdr
->set_ambiguous_auth(srcdnl
->get_inode());
9597 // just mark the source inode as ambiguous auth if more than two MDS are involved.
9598 // the leader will send another OP_RENAMEPREP peer request later.
9599 if (mdr
->peer_request
->witnesses
.size() > 1) {
9600 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl
;
9601 reply_witness
= true;
9604 // make sure bystanders have received all lock related messages
9605 for (set
<mds_rank_t
>::iterator p
= srcdnrep
.begin(); p
!= srcdnrep
.end(); ++p
) {
9606 if (*p
== mdr
->peer_to_mds
||
9607 (mds
->is_cluster_degraded() &&
9608 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(*p
)))
9610 auto notify
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RENAMENOTIFY
);
9611 mds
->send_message_mds(notify
, *p
);
9612 mdr
->more()->waiting_on_peer
.insert(*p
);
9615 // make sure clients have received all cap related messages
9616 set
<client_t
> export_client_set
;
9617 mdcache
->migrator
->get_export_client_set(srcdnl
->get_inode(), export_client_set
);
9619 MDSGatherBuilder
gather(g_ceph_context
);
9620 flush_client_sessions(export_client_set
, gather
);
9621 if (gather
.has_subs()) {
9622 mdr
->more()->waiting_on_peer
.insert(MDS_RANK_NONE
);
9623 gather
.set_finisher(new C_MDS_PeerRenameSessionsFlushed(this, mdr
));
9628 // is witness list sufficient?
9629 for (set
<mds_rank_t
>::iterator p
= srcdnrep
.begin(); p
!= srcdnrep
.end(); ++p
) {
9630 if (*p
== mdr
->peer_to_mds
||
9631 mdr
->peer_request
->witnesses
.count(*p
)) continue;
9632 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl
;
9633 reply_witness
= true;
9637 if (reply_witness
) {
9638 ceph_assert(!srcdnrep
.empty());
9639 auto reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RENAMEPREPACK
);
9640 reply
->witnesses
.swap(srcdnrep
);
9641 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
9642 mdr
->reset_peer_request();
9645 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl
;
9646 if (!mdr
->more()->waiting_on_peer
.empty()) {
9647 dout(10) << " still waiting for rename notify acks from "
9648 << mdr
->more()->waiting_on_peer
<< dendl
;
9651 } else if (srcdnl
->is_primary() && srcdn
->authority() != destdn
->authority()) {
9652 // set ambiguous auth for srci on witnesses
9653 mdr
->set_ambiguous_auth(srcdnl
->get_inode());
9656 // encode everything we'd need to roll this back... basically, just the original state.
9657 rename_rollback rollback
;
9659 rollback
.reqid
= mdr
->reqid
;
9661 rollback
.orig_src
.dirfrag
= srcdn
->get_dir()->dirfrag();
9662 rollback
.orig_src
.dirfrag_old_mtime
= srcdn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
9663 rollback
.orig_src
.dirfrag_old_rctime
= srcdn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
9664 rollback
.orig_src
.dname
= srcdn
->get_name();
9665 if (srcdnl
->is_primary())
9666 rollback
.orig_src
.ino
= srcdnl
->get_inode()->ino();
9668 ceph_assert(srcdnl
->is_remote());
9669 rollback
.orig_src
.remote_ino
= srcdnl
->get_remote_ino();
9670 rollback
.orig_src
.remote_d_type
= srcdnl
->get_remote_d_type();
9673 rollback
.orig_dest
.dirfrag
= destdn
->get_dir()->dirfrag();
9674 rollback
.orig_dest
.dirfrag_old_mtime
= destdn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
9675 rollback
.orig_dest
.dirfrag_old_rctime
= destdn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
9676 rollback
.orig_dest
.dname
= destdn
->get_name();
9677 if (destdnl
->is_primary())
9678 rollback
.orig_dest
.ino
= destdnl
->get_inode()->ino();
9679 else if (destdnl
->is_remote()) {
9680 rollback
.orig_dest
.remote_ino
= destdnl
->get_remote_ino();
9681 rollback
.orig_dest
.remote_d_type
= destdnl
->get_remote_d_type();
9685 rollback
.stray
.dirfrag
= straydn
->get_dir()->dirfrag();
9686 rollback
.stray
.dirfrag_old_mtime
= straydn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
9687 rollback
.stray
.dirfrag_old_rctime
= straydn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
9688 rollback
.stray
.dname
= straydn
->get_name();
9690 if (mdr
->peer_request
->desti_snapbl
.length()) {
9691 CInode
*oldin
= destdnl
->get_inode();
9692 if (oldin
->snaprealm
) {
9693 encode(true, rollback
.desti_snapbl
);
9694 oldin
->encode_snap_blob(rollback
.desti_snapbl
);
9696 encode(false, rollback
.desti_snapbl
);
9699 if (mdr
->peer_request
->srci_snapbl
.length()) {
9700 if (srci
->snaprealm
) {
9701 encode(true, rollback
.srci_snapbl
);
9702 srci
->encode_snap_blob(rollback
.srci_snapbl
);
9704 encode(false, rollback
.srci_snapbl
);
9707 encode(rollback
, mdr
->more()->rollback_bl
);
9708 // FIXME: rollback snaprealm
9709 dout(20) << " rollback is " << mdr
->more()->rollback_bl
.length() << " bytes" << dendl
;
// Journal the PREPARE EPeerUpdate; skip journaling entirely when the
// prepared metablob turns out empty.
9712 mdr
->ls
= mdlog
->get_current_segment();
9713 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rename_prep", mdr
->reqid
, mdr
->peer_to_mds
,
9714 EPeerUpdate::OP_PREPARE
, EPeerUpdate::RENAME
);
9715 mdlog
->start_entry(le
);
9716 le
->rollback
= mdr
->more()->rollback_bl
;
9718 bufferlist blah
; // inode import data... obviously not used if we're the peer
9719 _rename_prepare(mdr
, &le
->commit
, &blah
, srcdn
, destdn
, mdr
->peer_request
->alternate_name
, straydn
);
9721 if (le
->commit
.empty()) {
9722 dout(10) << " empty metablob, skipping journal" << dendl
;
9723 mdlog
->cancel_entry(le
);
9725 _logged_peer_rename(mdr
, srcdn
, destdn
, straydn
);
9727 mdcache
->add_uncommitted_peer(mdr
->reqid
, mdr
->ls
, mdr
->peer_to_mds
);
9728 mdr
->more()->peer_update_journaled
= true;
9729 submit_mdlog_entry(le
, new C_MDS_PeerRenamePrep(this, mdr
, srcdn
, destdn
, straydn
),
// Called once the peer PREPARE is journaled (or journaling was skipped):
// if we are auth for a primary srcdn, encode the inode plus its client
// cap/metadata maps into the OP_RENAMEPREPACK reply (inode export to the
// leader), apply the rename to the cache, and send the reply; an aborted
// mdr is finished instead.
// NOTE(review): lossy extract — some lines/braces are missing.
9735 void Server::_logged_peer_rename(MDRequestRef
& mdr
,
9736 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
9738 dout(10) << "_logged_peer_rename " << *mdr
<< dendl
;
9741 ref_t
<MMDSPeerRequest
> reply
;
9742 if (!mdr
->aborted
) {
9743 reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RENAMEPREPACK
);
9744 if (!mdr
->more()->peer_update_journaled
)
9745 reply
->mark_not_journaled();
9748 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
9749 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
9752 if (srcdn
->is_auth() && srcdnl
->is_primary()) {
9753 // set export bounds for CInode::encode_export()
9755 std::vector
<CDir
*> bounds
;
9756 if (srcdnl
->get_inode()->is_dir()) {
9757 srcdnl
->get_inode()->get_dirfrags(bounds
);
9758 for (const auto& bound
: bounds
) {
9759 bound
->state_set(CDir::STATE_EXPORTBOUND
);
9763 map
<client_t
,entity_inst_t
> exported_client_map
;
9764 map
<client_t
, client_metadata_t
> exported_client_metadata_map
;
9766 mdcache
->migrator
->encode_export_inode(srcdnl
->get_inode(), inodebl
,
9767 exported_client_map
,
9768 exported_client_metadata_map
);
9770 for (const auto& bound
: bounds
) {
9771 bound
->state_clear(CDir::STATE_EXPORTBOUND
);
9774 encode(exported_client_map
, reply
->inode_export
, mds
->mdsmap
->get_up_features());
9775 encode(exported_client_metadata_map
, reply
->inode_export
);
9776 reply
->inode_export
.claim_append(inodebl
);
9777 reply
->inode_export_v
= srcdnl
->get_inode()->get_version();
9780 // remove mdr auth pin
9781 mdr
->auth_unpin(srcdnl
->get_inode());
9782 mdr
->more()->is_inode_exporter
= true;
9784 if (srcdnl
->get_inode()->is_dirty())
9785 srcdnl
->get_inode()->mark_clean();
9787 dout(10) << " exported srci " << *srcdnl
->get_inode() << dendl
;
// Apply the rename to our cache.
9791 _rename_apply(mdr
, srcdn
, destdn
, straydn
);
9793 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
9796 mds
->balancer
->hit_dir(srcdn
->get_dir(), META_POP_IWR
);
9797 if (destdnl
->get_inode() && destdnl
->get_inode()->is_auth())
9798 mds
->balancer
->hit_inode(destdnl
->get_inode(), META_POP_IWR
);
9801 mdr
->reset_peer_request();
9805 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
9807 ceph_assert(mdr
->aborted
);
9808 dout(10) << " abort flag set, finishing" << dendl
;
9809 mdcache
->request_finish(mdr
);
// React to the leader's verdict (r) on a prepared peer rename. On the
// commit path: hand off any xlocks moved to the new auth, finish the inode
// export, unfreeze/clear ambiguous auth, and journal an EPeerUpdate COMMIT
// (or finish immediately if nothing was journaled). On the abort path:
// replay rollback_bl via do_rename_rollback().
// NOTE(review): the r==0/else branch condition lines are not visible in
// this lossy extract — confirm control flow against the full source.
9813 void Server::_commit_peer_rename(MDRequestRef
& mdr
, int r
,
9814 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
9816 dout(10) << "_commit_peer_rename " << *mdr
<< " r=" << r
<< dendl
;
9818 CInode
*in
= destdn
->get_linkage()->get_inode();
9820 inodeno_t migrated_stray
;
9821 if (srcdn
->is_auth() && srcdn
->get_dir()->inode
->is_stray())
9822 migrated_stray
= in
->ino();
9824 MDSContext::vec finished
;
9826 // unfreeze+singleauth inode
9827 // hmm, do i really need to delay this?
9828 if (mdr
->more()->is_inode_exporter
) {
9830 // we exported, clear out any xlocks that we moved to another MDS
9832 for (auto i
= mdr
->locks
.lower_bound(&in
->versionlock
);
9833 i
!= mdr
->locks
.end(); ) {
9834 SimpleLock
*lock
= i
->lock
;
9835 if (lock
->get_parent() != in
)
9837 // we only care about xlocks on the exported inode
9838 if (i
->is_xlock() && !lock
->is_locallock())
9839 mds
->locker
->xlock_export(i
++, mdr
.get());
9844 map
<client_t
,Capability::Import
> peer_imported
;
9845 auto bp
= mdr
->more()->inode_import
.cbegin();
9846 decode(peer_imported
, bp
);
9848 dout(10) << " finishing inode export on " << *in
<< dendl
;
9849 mdcache
->migrator
->finish_export_inode(in
, mdr
->peer_to_mds
, peer_imported
, finished
);
9850 mds
->queue_waiters(finished
); // this includes SINGLEAUTH waiters.
9853 ceph_assert(in
->is_frozen_inode());
9854 in
->unfreeze_inode(finished
);
9858 if (mdr
->more()->is_ambiguous_auth
) {
9859 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
9860 mdr
->more()->is_ambiguous_auth
= false;
9863 if (straydn
&& mdr
->more()->peer_update_journaled
) {
9864 CInode
*strayin
= straydn
->get_projected_linkage()->get_inode();
9865 if (strayin
&& !strayin
->snaprealm
)
9866 mdcache
->clear_dirty_bits_for_stray(strayin
);
9869 mds
->queue_waiters(finished
);
// Commit path: persist the COMMIT record if a PREPARE was journaled.
9872 if (mdr
->more()->peer_update_journaled
) {
9873 // write a commit to the journal
9874 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rename_commit", mdr
->reqid
,
9875 mdr
->peer_to_mds
, EPeerUpdate::OP_COMMIT
,
9876 EPeerUpdate::RENAME
);
9877 mdlog
->start_entry(le
);
9878 submit_mdlog_entry(le
, new C_MDS_CommittedPeer(this, mdr
), mdr
, __func__
);
9881 _committed_peer(mdr
);
9886 // rollback_bl may be empty if we froze the inode but had to provide an expanded
9887 // witness list from the leader, and they failed before we tried prep again.
9888 if (mdr
->more()->rollback_bl
.length()) {
9889 if (mdr
->more()->is_inode_exporter
) {
9890 dout(10) << " reversing inode export of " << *in
<< dendl
;
9893 if (mdcache
->is_ambiguous_peer_update(mdr
->reqid
, mdr
->peer_to_mds
)) {
9894 mdcache
->remove_ambiguous_peer_update(mdr
->reqid
, mdr
->peer_to_mds
);
9895 // rollback but preserve the peer request
9896 do_rename_rollback(mdr
->more()->rollback_bl
, mdr
->peer_to_mds
, mdr
, false);
9897 mdr
->more()->rollback_bl
.clear();
9899 do_rename_rollback(mdr
->more()->rollback_bl
, mdr
->peer_to_mds
, mdr
, true);
9901 dout(10) << " rollback_bl empty, not rollback back rename (leader failed after getting extra witnesses?)" << dendl
;
9903 if (mdr
->more()->is_ambiguous_auth
) {
9904 if (srcdn
->is_auth())
9905 mdr
->more()->rename_inode
->unfreeze_inode(finished
);
9907 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
9908 mdr
->more()->is_ambiguous_auth
= false;
9910 mds
->queue_waiters(finished
);
9911 mdcache
->request_finish(mdr
);
9915 if (migrated_stray
&& mds
->is_stopping())
9916 mdcache
->shutdown_export_stray_finish(migrated_stray
);
// Undo this rename's accounting on one directory fragment during rollback:
// re-apply the entry count (nsubdirs/nfiles) and the rstat contribution
// being restored, and revert the fragment's mtime/rctime to the saved
// pre-rename values when this operation was the last writer (detected by
// ctime equality). Also flags filelock/nestlock as updated on the mutation.
// NOTE(review): the `if (isdir)`/`else` lines around the nsubdirs/nfiles
// increments are truncated in this extract.
9919 static void _rollback_repair_dir(MutationRef
& mut
, CDir
*dir
,
9920 rename_rollback::drec
&r
, utime_t ctime
,
9921 bool isdir
, const nest_info_t
&rstat
)
9923 auto pf
= dir
->project_fnode(mut
);
9924 pf
->version
= dir
->pre_dirty();
9927 pf
->fragstat
.nsubdirs
+= 1;
9929 pf
->fragstat
.nfiles
+= 1;
9932 pf
->rstat
.rbytes
+= rstat
.rbytes
;
9933 pf
->rstat
.rfiles
+= rstat
.rfiles
;
9934 pf
->rstat
.rsubdirs
+= rstat
.rsubdirs
;
9935 pf
->rstat
.rsnaps
+= rstat
.rsnaps
;
9937 if (pf
->fragstat
.mtime
== ctime
) {
9938 pf
->fragstat
.mtime
= r
.dirfrag_old_mtime
;
9939 if (pf
->rstat
.rctime
== ctime
)
9940 pf
->rstat
.rctime
= r
.dirfrag_old_rctime
;
9942 mut
->add_updated_lock(&dir
->get_inode()->filelock
);
9943 mut
->add_updated_lock(&dir
->get_inode()->nestlock
);
9946 struct C_MDS_LoggedRenameRollback
: public ServerLogContext
{
9952 map
<client_t
,ref_t
<MClientSnap
>> splits
[2];
9954 C_MDS_LoggedRenameRollback(Server
*s
, MutationRef
& m
, MDRequestRef
& r
,
9955 CDentry
*sd
, version_t pv
, CDentry
*dd
, CDentry
*st
,
9956 map
<client_t
,ref_t
<MClientSnap
>> _splits
[2], bool f
) :
9957 ServerLogContext(s
, r
), mut(m
), srcdn(sd
), srcdnpv(pv
), destdn(dd
),
9958 straydn(st
), finish_mdr(f
) {
9959 splits
[0].swap(_splits
[0]);
9960 splits
[1].swap(_splits
[1]);
9962 void finish(int r
) override
{
9963 server
->_rename_rollback_finish(mut
, mdr
, srcdn
, srcdnpv
,
9964 destdn
, straydn
, splits
, finish_mdr
);
9968 void Server::do_rename_rollback(bufferlist
&rbl
, mds_rank_t leader
, MDRequestRef
& mdr
,
9971 rename_rollback rollback
;
9972 auto p
= rbl
.cbegin();
9973 decode(rollback
, p
);
9975 dout(10) << "do_rename_rollback on " << rollback
.reqid
<< dendl
;
9976 // need to finish this update before sending resolve to claim the subtree
9977 mdcache
->add_rollback(rollback
.reqid
, leader
);
9979 MutationRef
mut(new MutationImpl(nullptr, utime_t(), rollback
.reqid
));
9980 mut
->ls
= mds
->mdlog
->get_current_segment();
9982 CDentry
*srcdn
= NULL
;
9983 CDir
*srcdir
= mdcache
->get_dirfrag(rollback
.orig_src
.dirfrag
);
9985 srcdir
= mdcache
->get_dirfrag(rollback
.orig_src
.dirfrag
.ino
, rollback
.orig_src
.dname
);
9987 dout(10) << " srcdir " << *srcdir
<< dendl
;
9988 srcdn
= srcdir
->lookup(rollback
.orig_src
.dname
);
9990 dout(10) << " srcdn " << *srcdn
<< dendl
;
9991 ceph_assert(srcdn
->get_linkage()->is_null());
9993 dout(10) << " srcdn not found" << dendl
;
9995 dout(10) << " srcdir not found" << dendl
;
9997 CDentry
*destdn
= NULL
;
9998 CDir
*destdir
= mdcache
->get_dirfrag(rollback
.orig_dest
.dirfrag
);
10000 destdir
= mdcache
->get_dirfrag(rollback
.orig_dest
.dirfrag
.ino
, rollback
.orig_dest
.dname
);
10002 dout(10) << " destdir " << *destdir
<< dendl
;
10003 destdn
= destdir
->lookup(rollback
.orig_dest
.dname
);
10005 dout(10) << " destdn " << *destdn
<< dendl
;
10007 dout(10) << " destdn not found" << dendl
;
10009 dout(10) << " destdir not found" << dendl
;
10012 if (rollback
.orig_src
.ino
) {
10013 in
= mdcache
->get_inode(rollback
.orig_src
.ino
);
10014 if (in
&& in
->is_dir())
10015 ceph_assert(srcdn
&& destdn
);
10017 in
= mdcache
->get_inode(rollback
.orig_src
.remote_ino
);
10019 CDir
*straydir
= NULL
;
10020 CDentry
*straydn
= NULL
;
10021 if (rollback
.stray
.dirfrag
.ino
) {
10022 straydir
= mdcache
->get_dirfrag(rollback
.stray
.dirfrag
);
10024 dout(10) << "straydir " << *straydir
<< dendl
;
10025 straydn
= straydir
->lookup(rollback
.stray
.dname
);
10027 dout(10) << " straydn " << *straydn
<< dendl
;
10028 ceph_assert(straydn
->get_linkage()->is_primary());
10030 dout(10) << " straydn not found" << dendl
;
10032 dout(10) << "straydir not found" << dendl
;
10035 CInode
*target
= NULL
;
10036 if (rollback
.orig_dest
.ino
) {
10037 target
= mdcache
->get_inode(rollback
.orig_dest
.ino
);
10039 ceph_assert(destdn
&& straydn
);
10040 } else if (rollback
.orig_dest
.remote_ino
)
10041 target
= mdcache
->get_inode(rollback
.orig_dest
.remote_ino
);
10043 // can't use is_auth() in the resolve stage
10044 mds_rank_t whoami
= mds
->get_nodeid();
10046 ceph_assert(!destdn
|| destdn
->authority().first
!= whoami
);
10047 ceph_assert(!straydn
|| straydn
->authority().first
!= whoami
);
10049 bool force_journal_src
= false;
10050 bool force_journal_dest
= false;
10051 if (in
&& in
->is_dir() && srcdn
->authority().first
!= whoami
)
10052 force_journal_src
= _need_force_journal(in
, false);
10053 if (in
&& target
&& target
->is_dir())
10054 force_journal_dest
= _need_force_journal(in
, true);
10056 version_t srcdnpv
= 0;
10059 if (srcdn
->authority().first
== whoami
)
10060 srcdnpv
= srcdn
->pre_dirty();
10061 if (rollback
.orig_src
.ino
) {
10063 srcdn
->push_projected_linkage(in
);
10065 srcdn
->push_projected_linkage(rollback
.orig_src
.remote_ino
,
10066 rollback
.orig_src
.remote_d_type
);
10069 map
<client_t
,ref_t
<MClientSnap
>> splits
[2];
10071 const CInode::mempool_inode
*pip
= nullptr;
10074 CDir
*pdir
= in
->get_projected_parent_dir();
10075 if (pdir
->authority().first
== whoami
) {
10076 auto pi
= in
->project_inode(mut
);
10077 pi
.inode
->version
= in
->pre_dirty();
10078 if (pdir
!= srcdir
) {
10079 auto pf
= pdir
->project_fnode(mut
);
10080 pf
->version
= pdir
->pre_dirty();
10082 if (pi
.inode
->ctime
== rollback
.ctime
)
10083 pi
.inode
->ctime
= rollback
.orig_src
.old_ctime
;
10086 if (in
->get_inode()->ctime
== rollback
.ctime
) {
10087 auto _inode
= CInode::allocate_inode(*in
->get_inode());
10088 _inode
->ctime
= rollback
.orig_src
.old_ctime
;
10089 in
->reset_inode(_inode
);
10093 pip
= in
->get_projected_inode().get();
10095 if (rollback
.srci_snapbl
.length() && in
->snaprealm
) {
10097 auto p
= rollback
.srci_snapbl
.cbegin();
10098 decode(hadrealm
, p
);
10100 if (projected
&& !mds
->is_resolve()) {
10101 sr_t
*new_srnode
= new sr_t();
10102 decode(*new_srnode
, p
);
10103 in
->project_snaprealm(new_srnode
);
10105 decode(in
->snaprealm
->srnode
, p
);
10108 if (rollback
.orig_src
.ino
) {
10109 ceph_assert(srcdir
);
10110 realm
= srcdir
->get_inode()->find_snaprealm();
10112 realm
= in
->snaprealm
->parent
;
10114 if (!mds
->is_resolve())
10115 mdcache
->prepare_realm_merge(in
->snaprealm
, realm
, splits
[0]);
10117 in
->project_snaprealm(NULL
);
10119 in
->snaprealm
->merge_to(realm
);
10126 if (rollback
.orig_dest
.ino
&& target
) {
10127 destdn
->push_projected_linkage(target
);
10128 } else if (rollback
.orig_dest
.remote_ino
) {
10129 destdn
->push_projected_linkage(rollback
.orig_dest
.remote_ino
,
10130 rollback
.orig_dest
.remote_d_type
);
10132 // the dentry will be trimmed soon, it's ok to have wrong linkage
10133 if (rollback
.orig_dest
.ino
)
10134 ceph_assert(mds
->is_resolve());
10135 destdn
->push_projected_linkage();
10140 straydn
->push_projected_linkage();
10144 CInode::inode_ptr ti
;
10145 CDir
*pdir
= target
->get_projected_parent_dir();
10146 if (pdir
->authority().first
== whoami
) {
10147 auto pi
= target
->project_inode(mut
);
10148 pi
.inode
->version
= target
->pre_dirty();
10149 if (pdir
!= srcdir
) {
10150 auto pf
= pdir
->project_fnode(mut
);
10151 pf
->version
= pdir
->pre_dirty();
10156 ti
= CInode::allocate_inode(*target
->get_inode());
10160 if (ti
->ctime
== rollback
.ctime
)
10161 ti
->ctime
= rollback
.orig_dest
.old_ctime
;
10162 if (MDS_INO_IS_STRAY(rollback
.orig_src
.dirfrag
.ino
)) {
10163 if (MDS_INO_IS_STRAY(rollback
.orig_dest
.dirfrag
.ino
))
10164 ceph_assert(!rollback
.orig_dest
.ino
&& !rollback
.orig_dest
.remote_ino
);
10166 ceph_assert(rollback
.orig_dest
.remote_ino
&&
10167 rollback
.orig_dest
.remote_ino
== rollback
.orig_src
.ino
);
10172 target
->reset_inode(ti
);
10174 if (rollback
.desti_snapbl
.length() && target
->snaprealm
) {
10176 auto p
= rollback
.desti_snapbl
.cbegin();
10177 decode(hadrealm
, p
);
10179 if (projected
&& !mds
->is_resolve()) {
10180 sr_t
*new_srnode
= new sr_t();
10181 decode(*new_srnode
, p
);
10182 target
->project_snaprealm(new_srnode
);
10184 decode(target
->snaprealm
->srnode
, p
);
10187 if (rollback
.orig_dest
.ino
) {
10188 ceph_assert(destdir
);
10189 realm
= destdir
->get_inode()->find_snaprealm();
10191 realm
= target
->snaprealm
->parent
;
10193 if (!mds
->is_resolve())
10194 mdcache
->prepare_realm_merge(target
->snaprealm
, realm
, splits
[1]);
10196 target
->project_snaprealm(NULL
);
10198 target
->snaprealm
->merge_to(realm
);
10203 if (srcdn
&& srcdn
->authority().first
== whoami
) {
10205 _rollback_repair_dir(mut
, srcdir
, rollback
.orig_src
, rollback
.ctime
,
10206 in
&& in
->is_dir(), pip
? pip
->accounted_rstat
: blah
);
10210 dout(0) << " srcdn back to " << *srcdn
<< dendl
;
10212 dout(0) << " srci back to " << *in
<< dendl
;
10214 dout(0) << " destdn back to " << *destdn
<< dendl
;
10216 dout(0) << " desti back to " << *target
<< dendl
;
10219 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rename_rollback", rollback
.reqid
, leader
,
10220 EPeerUpdate::OP_ROLLBACK
, EPeerUpdate::RENAME
);
10221 mdlog
->start_entry(le
);
10223 if (srcdn
&& (srcdn
->authority().first
== whoami
|| force_journal_src
)) {
10224 le
->commit
.add_dir_context(srcdir
);
10225 if (rollback
.orig_src
.ino
)
10226 le
->commit
.add_primary_dentry(srcdn
, 0, true);
10228 le
->commit
.add_remote_dentry(srcdn
, true);
10231 if (!rollback
.orig_src
.ino
&& // remote linkage
10232 in
&& in
->authority().first
== whoami
) {
10233 le
->commit
.add_dir_context(in
->get_projected_parent_dir());
10234 le
->commit
.add_primary_dentry(in
->get_projected_parent_dn(), in
, true);
10237 if (force_journal_dest
) {
10238 ceph_assert(rollback
.orig_dest
.ino
);
10239 le
->commit
.add_dir_context(destdir
);
10240 le
->commit
.add_primary_dentry(destdn
, 0, true);
10243 // peer: no need to journal straydn
10245 if (target
&& target
!= in
&& target
->authority().first
== whoami
) {
10246 ceph_assert(rollback
.orig_dest
.remote_ino
);
10247 le
->commit
.add_dir_context(target
->get_projected_parent_dir());
10248 le
->commit
.add_primary_dentry(target
->get_projected_parent_dn(), target
, true);
10251 if (in
&& in
->is_dir() && (srcdn
->authority().first
== whoami
|| force_journal_src
)) {
10252 dout(10) << " noting renamed dir ino " << in
->ino() << " in metablob" << dendl
;
10253 le
->commit
.renamed_dirino
= in
->ino();
10254 if (srcdn
->authority().first
== whoami
) {
10255 auto&& ls
= in
->get_dirfrags();
10256 for (const auto& dir
: ls
) {
10257 if (!dir
->is_auth())
10258 le
->commit
.renamed_dir_frags
.push_back(dir
->get_frag());
10260 dout(10) << " noting renamed dir open frags " << le
->commit
.renamed_dir_frags
<< dendl
;
10262 } else if (force_journal_dest
) {
10263 dout(10) << " noting rename target ino " << target
->ino() << " in metablob" << dendl
;
10264 le
->commit
.renamed_dirino
= target
->ino();
10267 if (target
&& target
->is_dir()) {
10268 ceph_assert(destdn
);
10269 mdcache
->project_subtree_rename(target
, straydir
, destdir
);
10272 if (in
&& in
->is_dir()) {
10273 ceph_assert(srcdn
);
10274 mdcache
->project_subtree_rename(in
, destdir
, srcdir
);
10277 if (mdr
&& !mdr
->more()->peer_update_journaled
) {
10278 ceph_assert(le
->commit
.empty());
10279 mdlog
->cancel_entry(le
);
10281 _rename_rollback_finish(mut
, mdr
, srcdn
, srcdnpv
, destdn
, straydn
, splits
, finish_mdr
);
10283 ceph_assert(!le
->commit
.empty());
10285 mdr
->more()->peer_update_journaled
= false;
10286 MDSLogContextBase
*fin
= new C_MDS_LoggedRenameRollback(this, mut
, mdr
,
10287 srcdn
, srcdnpv
, destdn
, straydn
,
10288 splits
, finish_mdr
);
10289 submit_mdlog_entry(le
, fin
, mdr
, __func__
);
// Completion callback for a peer-side rename rollback: pops the projected
// dentry linkages installed during rollback, repairs subtree placement for
// any renamed directories, and either finishes the MDRequest or tells the
// cache the rollback is done.
// NOTE(review): this text looks like a lossy extraction -- structural lines
// (braces / else) appear to be missing between the numbered statements.
// Code is left byte-identical; only comments were added.
10294 void Server::_rename_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
, CDentry
*srcdn
,
10295 version_t srcdnpv
, CDentry
*destdn
, CDentry
*straydn
,
10296 map
<client_t
,ref_t
<MClientSnap
>> splits
[2], bool finish_mdr
)
10298 dout(10) << "_rename_rollback_finish " << mut
->reqid
<< dendl
;
// undo the stray dentry's temporary linkage
10301 straydn
->get_dir()->unlink_inode(straydn
);
10302 straydn
->pop_projected_linkage();
// undo the destination dentry's temporary linkage
10305 destdn
->get_dir()->unlink_inode(destdn
);
10306 destdn
->pop_projected_linkage();
10309 srcdn
->pop_projected_linkage();
// if we are auth for the source dentry, persist its restored version
10310 if (srcdn
->authority().first
== mds
->get_nodeid()) {
10311 srcdn
->mark_dirty(srcdnpv
, mut
->ls
);
10312 if (srcdn
->get_linkage()->is_primary())
10313 srcdn
->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH
);
// a primary-linked source inode that is a directory moved back:
// fix up the subtree map accordingly
10319 if (srcdn
&& srcdn
->get_linkage()->is_primary()) {
10320 CInode
*in
= srcdn
->get_linkage()->get_inode();
10321 if (in
&& in
->is_dir()) {
10322 ceph_assert(destdn
);
10323 mdcache
->adjust_subtree_after_rename(in
, destdn
->get_dir(), true);
10328 CInode
*oldin
= destdn
->get_linkage()->get_inode();
10329 // update subtree map?
10330 if (oldin
&& oldin
->is_dir()) {
10331 ceph_assert(straydn
);
10332 mdcache
->adjust_subtree_after_rename(oldin
, straydn
->get_dir(), true);
// during resolve we are not auth here (asserted earlier in rollback);
// trim any non-auth subtree we no longer need
10336 if (mds
->is_resolve()) {
10339 root
= mdcache
->get_subtree_root(straydn
->get_dir());
10341 root
= mdcache
->get_subtree_root(destdn
->get_dir());
10343 mdcache
->try_trim_non_auth_subtree(root
);
// send any snap-realm split/merge notifications collected during rollback
10345 mdcache
->send_snaps(splits
[1]);
10346 mdcache
->send_snaps(splits
[0]);
10350 MDSContext::vec finished
;
// release the ambiguous-auth state taken while the rename was in flight
10351 if (mdr
->more()->is_ambiguous_auth
) {
10352 if (srcdn
->is_auth())
10353 mdr
->more()->rename_inode
->unfreeze_inode(finished
);
10355 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
10356 mdr
->more()->is_ambiguous_auth
= false;
10358 mds
->queue_waiters(finished
);
10359 if (finish_mdr
|| mdr
->aborted
)
10360 mdcache
->request_finish(mdr
);
10362 mdr
->more()->peer_rolling_back
= false;
10365 mdcache
->finish_rollback(mut
->reqid
, mdr
);
// Leader-side handler for a peer's OP_RENAMEPREPACK: records the peer as a
// witness (or collects the extra witnesses it reports), imports the source
// inode state if the peer exported it, and re-dispatches the client request
// once no peers remain outstanding.
10370 void Server::handle_peer_rename_prep_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &ack
)
10372 dout(10) << "handle_peer_rename_prep_ack " << *mdr
10373 << " witnessed by " << ack
->get_source()
10374 << " " << *ack
<< dendl
;
10375 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
10378 mdr
->more()->peers
.insert(from
);
// if the srcdn-auth peer froze+authpinned the renamed inode for us,
// mark auth as ambiguous until the rename commits or aborts
10379 if (mdr
->more()->srcdn_auth_mds
== from
&&
10380 mdr
->more()->is_remote_frozen_authpin
&&
10381 !mdr
->more()->is_ambiguous_auth
) {
10382 mdr
->set_ambiguous_auth(mdr
->more()->rename_inode
);
10385 // witnessed? or add extra witnesses?
10386 ceph_assert(mdr
->more()->witnessed
.count(from
) == 0);
10387 if (ack
->is_interrupted()) {
10388 dout(10) << " peer request interrupted, noop" << dendl
;
10389 } else if (ack
->witnesses
.empty()) {
// empty witness list means the peer itself witnessed the prepare
10390 mdr
->more()->witnessed
.insert(from
);
10391 if (!ack
->is_not_journaled())
10392 mdr
->more()->has_journaled_peers
= true;
// otherwise the peer told us which other ranks must also witness
10394 dout(10) << " extra witnesses (srcdn replicas) are " << ack
->witnesses
<< dendl
;
10395 mdr
->more()->extra_witnesses
= ack
->witnesses
;
10396 mdr
->more()->extra_witnesses
.erase(mds
->get_nodeid()); // not me!
// srci inode state piggybacked on the ack, stash it for the commit
10400 if (ack
->inode_export
.length()) {
10401 dout(10) << " got srci import" << dendl
;
10402 mdr
->more()->inode_import
.share(ack
->inode_export
);
10403 mdr
->more()->inode_import_v
= ack
->inode_export_v
;
10406 // remove from waiting list
10407 ceph_assert(mdr
->more()->waiting_on_peer
.count(from
));
10408 mdr
->more()->waiting_on_peer
.erase(from
);
10410 if (mdr
->more()->waiting_on_peer
.empty())
10411 dispatch_client_request(mdr
); // go again!
10413 dout(10) << "still waiting on peers " << mdr
->more()->waiting_on_peer
<< dendl
;
// Peer-side handler for a rename-notify ack: clears the sender from the
// waiting set and, once all acks are in, resumes the pending peer request.
10416 void Server::handle_peer_rename_notify_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &ack
)
10418 dout(10) << "handle_peer_rename_notify_ack " << *mdr
<< " from mds."
10419 << ack
->get_source() << dendl
;
10420 ceph_assert(mdr
->is_peer());
10421 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
10423 if (mdr
->more()->waiting_on_peer
.count(from
)) {
10424 mdr
->more()->waiting_on_peer
.erase(from
);
// all notify acks received: continue processing the peer request
10426 if (mdr
->more()->waiting_on_peer
.empty()) {
10427 if (mdr
->peer_request
)
10428 dispatch_peer_request(mdr
);
10430 dout(10) << " still waiting for rename notify acks from "
10431 << mdr
->more()->waiting_on_peer
<< dendl
;
// Called once client sessions affected by a peer rename have been flushed;
// MDS_RANK_NONE is the sentinel used in waiting_on_peer for this flush.
// Resumes the pending peer request when nothing else is outstanding.
10435 void Server::_peer_rename_sessions_flushed(MDRequestRef
& mdr
)
10437 dout(10) << "_peer_rename_sessions_flushed " << *mdr
<< dendl
;
10439 if (mdr
->more()->waiting_on_peer
.count(MDS_RANK_NONE
)) {
10440 mdr
->more()->waiting_on_peer
.erase(MDS_RANK_NONE
);
10442 if (mdr
->more()->waiting_on_peer
.empty()) {
10443 if (mdr
->peer_request
)
10444 dispatch_peer_request(mdr
);
10446 dout(10) << " still waiting for rename notify acks from "
10447 << mdr
->more()->waiting_on_peer
<< dendl
;
// Handle CEPH_MDS_OP_LSSNAP: enumerate the snapshots of a directory's snap
// realm and encode them as a readdir-style reply (dirstat, entry count,
// per-snap name + lease + inodestat, then flags), honoring the client's
// max_entries/max_bytes limits and the path2-encoded resume offset.
// NOTE(review): lossy extraction -- some statements and braces from the
// original (e.g. the !diri early-return and local bufferlist declarations)
// appear to be missing here; code left byte-identical.
10452 /* This function takes responsibility for the passed mdr*/
10453 void Server::handle_client_lssnap(MDRequestRef
& mdr
)
10455 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
10457 // traverse to path
10458 CInode
*diri
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
10462 if (!diri
->is_dir()) {
10463 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
10466 dout(10) << "lssnap on " << *diri
<< dendl
;
// lock the snap layout for read; check_access enforces MAY_READ caps
10469 if (!mds
->locker
->try_rdlock_snap_layout(diri
, mdr
))
10472 if (!check_access(mdr
, diri
, MAY_READ
))
10475 SnapRealm
*realm
= diri
->find_snaprealm();
10476 map
<snapid_t
,const SnapInfo
*> infomap
;
10477 realm
->get_snap_info(infomap
, diri
->get_oldest_snap());
10479 unsigned max_entries
= req
->head
.args
.readdir
.max_entries
;
10481 max_entries
= infomap
.size();
10482 int max_bytes
= req
->head
.args
.readdir
.max_bytes
;
10484 // make sure at least one item can be encoded
10485 max_bytes
= (512 << 10) + g_conf()->mds_max_xattr_pairs_size
;
// path2 carries the snap name to resume after, for paged listings
10487 __u64 last_snapid
= 0;
10488 string offset_str
= req
->get_path2();
10489 if (!offset_str
.empty())
10490 last_snapid
= realm
->resolve_snapname(offset_str
, diri
->ino());
10494 static DirStat empty
;
10495 CDir::encode_dirstat(dirbl
, mdr
->session
->info
, empty
);
// reserve room for the trailing count/flags fields
10497 max_bytes
-= dirbl
.length() - sizeof(__u32
) + sizeof(__u8
) * 2;
10501 auto p
= infomap
.upper_bound(last_snapid
);
10502 for (; p
!= infomap
.end() && num
< max_entries
; ++p
) {
10503 dout(10) << p
->first
<< " -> " << *p
->second
<< dendl
;
// snaps taken on this dir use the short name; inherited ones the long form
10507 if (p
->second
->ino
== diri
->ino())
10508 snap_name
= p
->second
->name
;
10510 snap_name
= p
->second
->get_long_name();
10512 unsigned start_len
= dnbl
.length();
10513 if (int(start_len
+ snap_name
.length() + sizeof(__u32
) + sizeof(LeaseStat
)) > max_bytes
)
10516 encode(snap_name
, dnbl
);
// snapshot entries never change, so hand out an infinite lease
10518 LeaseStat
e(CEPH_LEASE_VALID
, -1, 0);
10519 mds
->locker
->encode_lease(dnbl
, mdr
->session
->info
, e
);
10520 dout(20) << "encode_infinite_lease" << dendl
;
10522 int r
= diri
->encode_inodestat(dnbl
, mdr
->session
, realm
, p
->first
, max_bytes
- (int)dnbl
.length());
// on overflow, roll back this entry by truncating to the pre-entry length
10525 keep
.substr_of(dnbl
, 0, start_len
);
10532 encode(num
, dirbl
);
// FRAG_END when we drained the map; COMPLETE only on an unpaged listing
10534 if (p
== infomap
.end()) {
10535 flags
= CEPH_READDIR_FRAG_END
;
10536 if (last_snapid
== 0)
10537 flags
|= CEPH_READDIR_FRAG_COMPLETE
;
10539 encode(flags
, dirbl
);
10540 dirbl
.claim_append(dnbl
);
10542 mdr
->reply_extra_bl
= dirbl
;
10543 mdr
->tracei
= diri
;
10544 respond_to_request(mdr
, 0);
// Log-completion context for mksnap: invoked after the EUpdate is journaled,
// forwards to Server::_mksnap_finish with the captured dir inode + snap info.
// NOTE(review): the member declarations (diri/info) appear to have been
// dropped by the extraction; only the ctor and finish() are visible.
10550 struct C_MDS_mksnap_finish
: public ServerLogContext
{
10553 C_MDS_mksnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, SnapInfo
&i
) :
10554 ServerLogContext(s
, r
), diri(di
), info(i
) {}
10555 void finish(int r
) override
{
10556 server
->_mksnap_finish(mdr
, diri
, info
);
// Handle CEPH_MDS_OP_MKSNAP: validate the request (snaps enabled, dir inode,
// uid range, name rules, per-dir limit, not inside a subvolume), take the
// snaplock, allocate a snapid via the snap table (two-phase: retries after
// prepare_create completes), project the inode + snaprealm changes, and
// journal them with an EUpdate completed by C_MDS_mksnap_finish.
10560 /* This function takes responsibility for the passed mdr*/
10561 void Server::handle_client_mksnap(MDRequestRef
& mdr
)
10563 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
10564 // make sure we have as new a map as the client
10565 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
10566 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
10569 if (!mds
->mdsmap
->allows_snaps()) {
10570 // you can't make snapshots until you set an option right now
10571 dout(5) << "new snapshots are disabled for this fs" << dendl
;
10572 respond_to_request(mdr
, -CEPHFS_EPERM
);
10576 CInode
*diri
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
10581 if (!diri
->is_dir()) {
10582 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
10585 if (diri
->is_system() && !diri
->is_root()) {
10586 // no snaps in system dirs (root is ok)
10587 dout(5) << "is an internal system dir" << dendl
;
10588 respond_to_request(mdr
, -CEPHFS_EPERM
);
// the snapshot name is the last component of the request path
10592 std::string_view snapname
= req
->get_filepath().last_dentry();
// only uids inside [mds_snap_min_uid, mds_snap_max_uid] may take snapshots
10594 if (mdr
->client_request
->get_caller_uid() < g_conf()->mds_snap_min_uid
|| mdr
->client_request
->get_caller_uid() > g_conf()->mds_snap_max_uid
) {
10595 dout(20) << "mksnap " << snapname
<< " on " << *diri
<< " denied to uid " << mdr
->client_request
->get_caller_uid() << dendl
;
10596 respond_to_request(mdr
, -CEPHFS_EPERM
);
10600 dout(10) << "mksnap " << snapname
<< " on " << *diri
<< dendl
;
// xlock the dir's snaplock; also rdlock the parent's snap layout
10603 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
10604 MutationImpl::LockOpVec lov
;
10605 lov
.add_xlock(&diri
->snaplock
);
10606 if (!mds
->locker
->acquire_locks(mdr
, lov
))
10609 if (CDentry
*pdn
= diri
->get_projected_parent_dn(); pdn
) {
10610 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
))
10613 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
10616 if (!check_access(mdr
, diri
, MAY_WRITE
|MAY_SNAPSHOT
))
// snapshots are only allowed at or above the subvolume root
10619 if (inodeno_t subvol_ino
= diri
->find_snaprealm()->get_subvolume_ino();
10620 (subvol_ino
&& subvol_ino
!= diri
->ino())) {
10621 dout(5) << "is a descendent of a subvolume dir" << dendl
;
10622 respond_to_request(mdr
, -CEPHFS_EPERM
);
10626 // check if we can create any more snapshots
10627 // we don't allow any more if we are already at or beyond the limit
10628 if (diri
->snaprealm
&&
10629 diri
->snaprealm
->get_snaps().size() >= max_snaps_per_dir
) {
10630 respond_to_request(mdr
, -CEPHFS_EMLINK
);
10634 // make sure name is unique
10635 if (diri
->snaprealm
&&
10636 diri
->snaprealm
->exists(snapname
)) {
10637 respond_to_request(mdr
, -CEPHFS_EEXIST
);
// names starting with '_' are reserved for parent/long snap names
10640 if (snapname
.length() == 0 ||
10641 snapname
[0] == '_') {
10642 respond_to_request(mdr
, -CEPHFS_EINVAL
);
10646 // allocate a snapid
10647 if (!mdr
->more()->stid
) {
// async snap-table prepare; the request is retried when it completes
10649 mds
->snapclient
->prepare_create(diri
->ino(), snapname
,
10650 mdr
->get_mds_stamp(),
10651 &mdr
->more()->stid
, &mdr
->more()->snapidbl
,
10652 new C_MDS_RetryRequest(mdcache
, mdr
));
10656 version_t stid
= mdr
->more()->stid
;
10658 auto p
= mdr
->more()->snapidbl
.cbegin();
10660 dout(10) << " stid " << stid
<< " snapid " << snapid
<< dendl
;
10662 ceph_assert(mds
->snapclient
->get_cached_version() >= stid
);
// optional client-supplied snapshot metadata rides in the request data
10664 SnapPayload payload
;
10665 if (req
->get_data().length()) {
10667 auto iter
= req
->get_data().cbegin();
10668 decode(payload
, iter
);
10669 } catch (const ceph::buffer::error
&e
) {
10670 // backward compat -- client sends xattr bufferlist. however,
10671 // that is not used anywhere -- so (log and) ignore.
10672 dout(20) << ": no metadata in payload (old client?)" << dendl
;
10678 info
.ino
= diri
->ino();
10679 info
.snapid
= snapid
;
10680 info
.name
= snapname
;
10681 info
.stamp
= mdr
->get_op_stamp();
10682 info
.metadata
= payload
.metadata
;
// project inode: bump ctime/rctime/rsnaps and version
10684 auto pi
= diri
->project_inode(mdr
, false, true);
10685 pi
.inode
->ctime
= info
.stamp
;
10686 if (info
.stamp
> pi
.inode
->rstat
.rctime
)
10687 pi
.inode
->rstat
.rctime
= info
.stamp
;
10688 pi
.inode
->rstat
.rsnaps
++;
10689 pi
.inode
->version
= diri
->pre_dirty();
10691 // project the snaprealm
10692 auto &newsnap
= *pi
.snapnode
;
10693 newsnap
.created
= snapid
;
10694 auto em
= newsnap
.snaps
.emplace(std::piecewise_construct
, std::forward_as_tuple(snapid
), std::forward_as_tuple(info
));
10696 em
.first
->second
= info
;
10697 newsnap
.seq
= snapid
;
10698 newsnap
.last_created
= snapid
;
10700 // journal the inode changes
10701 mdr
->ls
= mdlog
->get_current_segment();
10702 EUpdate
*le
= new EUpdate(mdlog
, "mksnap");
10703 mdlog
->start_entry(le
);
10705 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
10706 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
10707 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
10708 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
10710 // journal the snaprealm changes
10711 submit_mdlog_entry(le
, new C_MDS_mksnap_finish(this, mdr
, diri
, info
),
// Post-journal completion for mksnap: commits the snap-table transaction,
// notifies other MDS ranks and clients of the realm change, then replies.
// OP_CREATE vs OP_SPLIT depends on whether the dir already had a snaprealm.
10716 void Server::_mksnap_finish(MDRequestRef
& mdr
, CInode
*diri
, SnapInfo
&info
)
10718 dout(10) << "_mksnap_finish " << *mdr
<< " " << info
<< dendl
;
10720 int op
= (diri
->snaprealm
? CEPH_SNAP_OP_CREATE
: CEPH_SNAP_OP_SPLIT
);
10724 mds
->snapclient
->commit(mdr
->more()->stid
, mdr
->ls
);
10727 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
10729 // notify other mds
10730 mdcache
->send_snap_update(diri
, mdr
->more()->stid
, op
);
10732 mdcache
->do_realm_invalidate_and_update_notify(diri
, op
);
// reply carries the new snapid and traces to the snapped dir
10736 mdr
->snapid
= info
.snapid
;
10737 mdr
->tracei
= diri
;
10738 respond_to_request(mdr
, 0);
// Log-completion context for rmsnap: forwards to Server::_rmsnap_finish with
// the captured dir inode and the snapid being removed.
// NOTE(review): member declarations (diri/snapid) appear to have been
// dropped by the extraction.
10744 struct C_MDS_rmsnap_finish
: public ServerLogContext
{
10747 C_MDS_rmsnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, snapid_t sn
) :
10748 ServerLogContext(s
, r
), diri(di
), snapid(sn
) {}
10749 void finish(int r
) override
{
10750 server
->_rmsnap_finish(mdr
, diri
, snapid
);
// Handle CEPH_MDS_OP_RMSNAP: validate (dir inode, uid range, snap exists,
// not a '_'-prefixed parent snap), take the snaplock, run the two-phase
// snap-table destroy (prepare_destroy then retry), project the inode and
// snaprealm to drop the snap, and journal via EUpdate + C_MDS_rmsnap_finish.
10754 /* This function takes responsibility for the passed mdr*/
10755 void Server::handle_client_rmsnap(MDRequestRef
& mdr
)
10757 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
10759 CInode
*diri
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
10763 if (!diri
->is_dir()) {
10764 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
10768 std::string_view snapname
= req
->get_filepath().last_dentry();
// same uid gate as mksnap
10770 if (mdr
->client_request
->get_caller_uid() < g_conf()->mds_snap_min_uid
|| mdr
->client_request
->get_caller_uid() > g_conf()->mds_snap_max_uid
) {
10771 dout(20) << "rmsnap " << snapname
<< " on " << *diri
<< " denied to uid " << mdr
->client_request
->get_caller_uid() << dendl
;
10772 respond_to_request(mdr
, -CEPHFS_EPERM
);
10776 dout(10) << "rmsnap " << snapname
<< " on " << *diri
<< dendl
;
10778 // does snap exist?
10779 if (snapname
.length() == 0 || snapname
[0] == '_') {
10780 respond_to_request(mdr
, -CEPHFS_EINVAL
); // can't prune a parent snap, currently.
10783 if (!diri
->snaprealm
|| !diri
->snaprealm
->exists(snapname
)) {
10784 respond_to_request(mdr
, -CEPHFS_ENOENT
);
10787 snapid_t snapid
= diri
->snaprealm
->resolve_snapname(snapname
, diri
->ino());
10788 dout(10) << " snapname " << snapname
<< " is " << snapid
<< dendl
;
// xlock the dir's snaplock; also rdlock the parent's snap layout
10790 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
10791 MutationImpl::LockOpVec lov
;
10792 lov
.add_xlock(&diri
->snaplock
);
10793 if (!mds
->locker
->acquire_locks(mdr
, lov
))
10795 if (CDentry
*pdn
= diri
->get_projected_parent_dn(); pdn
) {
10796 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
))
10799 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
10802 if (!check_access(mdr
, diri
, MAY_WRITE
|MAY_SNAPSHOT
))
// phase one: async snap-table destroy; retried once it completes
10806 if (!mdr
->more()->stid
) {
10807 mds
->snapclient
->prepare_destroy(diri
->ino(), snapid
,
10808 &mdr
->more()->stid
, &mdr
->more()->snapidbl
,
10809 new C_MDS_RetryRequest(mdcache
, mdr
));
10812 version_t stid
= mdr
->more()->stid
;
10813 auto p
= mdr
->more()->snapidbl
.cbegin();
10816 dout(10) << " stid is " << stid
<< ", seq is " << seq
<< dendl
;
10818 ceph_assert(mds
->snapclient
->get_cached_version() >= stid
);
// project inode: bump version/ctime/rctime, decrement rsnaps
10821 auto pi
= diri
->project_inode(mdr
, false, true);
10822 pi
.inode
->version
= diri
->pre_dirty();
10823 pi
.inode
->ctime
= mdr
->get_op_stamp();
10824 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
10825 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
10826 pi
.inode
->rstat
.rsnaps
--;
10828 mdr
->ls
= mdlog
->get_current_segment();
10829 EUpdate
*le
= new EUpdate(mdlog
, "rmsnap");
10830 mdlog
->start_entry(le
);
10832 // project the snaprealm
10833 auto &newnode
= *pi
.snapnode
;
10834 newnode
.snaps
.erase(snapid
);
10836 newnode
.last_destroyed
= seq
;
10838 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
10839 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
10840 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
10841 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
10843 submit_mdlog_entry(le
, new C_MDS_rmsnap_finish(this, mdr
, diri
, snapid
),
// Post-journal completion for rmsnap: commits the snap-table transaction,
// broadcasts OP_DESTROY to other ranks and clients, replies, then purges
// cached data belonging to the now-stale snapshot.
10848 void Server::_rmsnap_finish(MDRequestRef
& mdr
, CInode
*diri
, snapid_t snapid
)
10850 dout(10) << "_rmsnap_finish " << *mdr
<< " " << snapid
<< dendl
;
10851 snapid_t stid
= mdr
->more()->stid
;
10852 auto p
= mdr
->more()->snapidbl
.cbegin();
10858 mds
->snapclient
->commit(stid
, mdr
->ls
);
10860 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
10862 // notify other mds
10863 mdcache
->send_snap_update(diri
, mdr
->more()->stid
, CEPH_SNAP_OP_DESTROY
);
10865 mdcache
->do_realm_invalidate_and_update_notify(diri
, CEPH_SNAP_OP_DESTROY
);
10869 respond_to_request(mdr
, 0);
10871 // purge snapshot data
10872 diri
->purge_stale_snap_data(diri
->snaprealm
->get_snaps());
// Log-completion context for renamesnap: forwards to
// Server::_renamesnap_finish with the captured dir inode and snapid.
// NOTE(review): member declarations (diri/snapid) appear to have been
// dropped by the extraction.
10875 struct C_MDS_renamesnap_finish
: public ServerLogContext
{
10878 C_MDS_renamesnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, snapid_t sn
) :
10879 ServerLogContext(s
, r
), diri(di
), snapid(sn
) {}
10880 void finish(int r
) override
{
10881 server
->_renamesnap_finish(mdr
, diri
, snapid
);
// Handle CEPH_MDS_OP_RENAMESNAP: path2 names the existing snapshot (source),
// path names the new one (dest); both must be on the same inode. Validates
// names and uid range, takes the snaplock, runs the two-phase snap-table
// update (prepare_update then retry), projects the renamed entry in the
// snaprealm, and journals via EUpdate + C_MDS_renamesnap_finish.
10885 /* This function takes responsibility for the passed mdr*/
10886 void Server::handle_client_renamesnap(MDRequestRef
& mdr
)
10888 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
// src and dest must refer to the same directory inode
10889 if (req
->get_filepath().get_ino() != req
->get_filepath2().get_ino()) {
10890 respond_to_request(mdr
, -CEPHFS_EINVAL
);
10894 CInode
*diri
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
10898 if (!diri
->is_dir()) { // dir only
10899 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
10903 if (mdr
->client_request
->get_caller_uid() < g_conf()->mds_snap_min_uid
||
10904 mdr
->client_request
->get_caller_uid() > g_conf()->mds_snap_max_uid
) {
10905 respond_to_request(mdr
, -CEPHFS_EPERM
);
10909 std::string_view dstname
= req
->get_filepath().last_dentry();
10910 std::string_view srcname
= req
->get_filepath2().last_dentry();
10911 dout(10) << "renamesnap " << srcname
<< "->" << dstname
<< " on " << *diri
<< dendl
;
// '_'-prefixed names are reserved (parent/long snap names)
10913 if (srcname
.length() == 0 || srcname
[0] == '_') {
10914 respond_to_request(mdr
, -CEPHFS_EINVAL
); // can't rename a parent snap.
10917 if (!diri
->snaprealm
|| !diri
->snaprealm
->exists(srcname
)) {
10918 respond_to_request(mdr
, -CEPHFS_ENOENT
);
10921 if (dstname
.length() == 0 || dstname
[0] == '_') {
10922 respond_to_request(mdr
, -CEPHFS_EINVAL
);
10925 if (diri
->snaprealm
->exists(dstname
)) {
10926 respond_to_request(mdr
, -CEPHFS_EEXIST
);
10930 snapid_t snapid
= diri
->snaprealm
->resolve_snapname(srcname
, diri
->ino());
10931 dout(10) << " snapname " << srcname
<< " is " << snapid
<< dendl
;
// xlock the dir's snaplock; also rdlock the parent's snap layout
10934 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
10935 MutationImpl::LockOpVec lov
;
10936 lov
.add_xlock(&diri
->snaplock
);
10937 if (!mds
->locker
->acquire_locks(mdr
, lov
))
10939 if (CDentry
*pdn
= diri
->get_projected_parent_dn(); pdn
) {
10940 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
))
10943 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
10946 if (!check_access(mdr
, diri
, MAY_WRITE
|MAY_SNAPSHOT
))
// phase one: async snap-table update; retried once it completes
10950 if (!mdr
->more()->stid
) {
10951 mds
->snapclient
->prepare_update(diri
->ino(), snapid
, dstname
, utime_t(),
10952 &mdr
->more()->stid
,
10953 new C_MDS_RetryRequest(mdcache
, mdr
));
10957 version_t stid
= mdr
->more()->stid
;
10958 dout(10) << " stid is " << stid
<< dendl
;
10960 ceph_assert(mds
->snapclient
->get_cached_version() >= stid
);
// project inode: bump ctime/rctime and version
10963 auto pi
= diri
->project_inode(mdr
, false, true);
10964 pi
.inode
->ctime
= mdr
->get_op_stamp();
10965 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
10966 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
10967 pi
.inode
->version
= diri
->pre_dirty();
10969 // project the snaprealm
10970 auto &newsnap
= *pi
.snapnode
;
10971 auto it
= newsnap
.snaps
.find(snapid
);
10972 ceph_assert(it
!= newsnap
.snaps
.end());
10973 it
->second
.name
= dstname
;
10975 // journal the inode changes
10976 mdr
->ls
= mdlog
->get_current_segment();
10977 EUpdate
*le
= new EUpdate(mdlog
, "renamesnap");
10978 mdlog
->start_entry(le
);
10980 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
10981 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
10982 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
10983 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
10985 // journal the snaprealm changes
10986 submit_mdlog_entry(le
, new C_MDS_renamesnap_finish(this, mdr
, diri
, snapid
),
// Post-journal completion for renamesnap: commits the snap-table
// transaction, broadcasts OP_UPDATE to other ranks and clients, and replies
// with the (unchanged) snapid tracing to the directory.
10991 void Server::_renamesnap_finish(MDRequestRef
& mdr
, CInode
*diri
, snapid_t snapid
)
10993 dout(10) << "_renamesnap_finish " << *mdr
<< " " << snapid
<< dendl
;
10997 mds
->snapclient
->commit(mdr
->more()->stid
, mdr
->ls
);
10999 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
11001 // notify other mds
11002 mdcache
->send_snap_update(diri
, mdr
->more()->stid
, CEPH_SNAP_OP_UPDATE
);
11004 mdcache
->do_realm_invalidate_and_update_notify(diri
, CEPH_SNAP_OP_UPDATE
);
11008 mdr
->tracei
= diri
;
11009 mdr
->snapid
= snapid
;
11010 respond_to_request(mdr
, 0);
11014 * Return true if server is in state RECONNECT and this
11015 * client has not yet reconnected.
11017 bool Server::waiting_for_reconnect(client_t c
) const
11019 return client_reconnect_gather
.count(c
) > 0;
// Emit reconnect progress for admin-socket/status output: a single
// "reconnect_status" object listing the clients we are still waiting on.
// (Function continues past the end of this chunk.)
11022 void Server::dump_reconnect_status(Formatter
*f
) const
11024 f
->open_object_section("reconnect_status");
11025 f
->dump_stream("client_reconnect_gather") << client_reconnect_gather
;
11026 f
->close_section();