1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include <boost/lexical_cast.hpp>
16 #include "include/ceph_assert.h" // lexical_cast includes system assert.h
18 #include <boost/config/warning_disable.hpp>
19 #include <boost/fusion/include/std_pair.hpp>
20 #include <boost/range/adaptor/reversed.hpp>
28 #include "MDBalancer.h"
30 #include "SnapClient.h"
32 #include "MetricsHandler.h"
33 #include "cephfs_features.h"
35 #include "msg/Messenger.h"
37 #include "osdc/Objecter.h"
39 #include "events/EUpdate.h"
40 #include "events/EPeerUpdate.h"
41 #include "events/ESession.h"
42 #include "events/EOpen.h"
43 #include "events/ECommitted.h"
44 #include "events/EPurged.h"
46 #include "include/stringify.h"
47 #include "include/filepath.h"
48 #include "common/errno.h"
49 #include "common/Timer.h"
50 #include "common/perf_counters.h"
51 #include "include/compat.h"
52 #include "osd/OSDMap.h"
58 #include <string_view>
61 #include "common/config.h"
63 #define dout_context g_ceph_context
64 #define dout_subsys ceph_subsys_mds
66 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
// NOTE(review): lossy extraction — member declarations and the closing
// braces of this class were dropped; only fragments are visible.
// Context base for Server-owned MDSContext callbacks: stores the owning
// Server* and exposes the MDS rank via get_mds() (body not visible here —
// presumably returns server->mds; confirm against full source).
68 class ServerContext
 : public MDSContext
{
71 MDSRank
*get_mds() override
// Constructor: keep the owning Server and assert it is non-null.
77 explicit ServerContext(Server
*s
) : server(s
) {
78 ceph_assert(server
!= NULL
);
// NOTE(review): lossy extraction — several original lines (braces, returns,
// some statements) are missing from this class body.
// BatchOp implementation that coalesces concurrent getattr/lookup requests
// on the same target: one "head" MDRequest (mdr) does the work, the rest
// are queued in batch_reqs and answered from the head's result.
82 class Batch_Getattr_Lookup
 : public BatchOp
{
85 ceph::ref_t
<MDRequestImpl
> mdr
;
86 std::vector
<ceph::ref_t
<MDRequestImpl
>> batch_reqs
;
// Constructor: register this batch op on the target — for LOOKUP the batch
// map lives on the dentry (mdr->dn[0].back()), otherwise (getattr) on the
// inode (mdr->in[0]).
89 Batch_Getattr_Lookup(Server
* s
, const ceph::ref_t
<MDRequestImpl
>& r
)
91 if (mdr
->client_request
->get_op() == CEPH_MDS_OP_LOOKUP
)
92 mdr
->batch_op_map
= &mdr
->dn
[0].back()->batch_ops
;
94 mdr
->batch_op_map
= &mdr
->in
[0]->batch_ops
;
// Queue another equivalent request behind the current head.
96 void add_request(const ceph::ref_t
<MDRequestImpl
>& r
) override
{
97 batch_reqs
.push_back(r
);
// Promote a queued request to be the new head (e.g. after the old head is
// forwarded/killed); hands over the batch_op_map registration.
99 ceph::ref_t
<MDRequestImpl
> find_new_head() override
{
100 while (!batch_reqs
.empty()) {
101 auto r
= std::move(batch_reqs
.back());
102 batch_reqs
.pop_back();
106 r
->batch_op_map
= mdr
->batch_op_map
;
107 mdr
->batch_op_map
= nullptr;
// Forward the whole batch to another MDS rank: the head's client request is
// released and re-sent, and every queued request is forwarded via mdcache.
113 void _forward(mds_rank_t t
) override
{
114 MDCache
* mdcache
= server
->mdcache
;
115 mdcache
->mds
->forward_message_mds(mdr
->release_client_request(), t
);
116 mdr
->set_mds_stamp(ceph_clock_now());
117 for (auto& m
: batch_reqs
) {
119 mdcache
->request_forward(m
, t
);
// Reply to every request in the batch with result r: queued requests copy
// the head's trace targets (tracei/tracedn) before being answered, then the
// head itself is answered.
123 void _respond(int r
) override
{
124 mdr
->set_mds_stamp(ceph_clock_now());
125 for (auto& m
: batch_reqs
) {
127 m
->tracei
= mdr
->tracei
;
128 m
->tracedn
= mdr
->tracedn
;
129 server
->respond_to_request(m
, r
);
133 server
->reply_client_request(mdr
, make_message
<MClientReply
>(*mdr
->client_request
, r
));
// Debug printer: identifies the batch by its head request.
135 void print(std::ostream
& o
) {
136 o
<< "[batch front=" << *mdr
<< "]";
// NOTE(review): lossy extraction — member declarations and closing braces
// of this class were dropped.
// Journal-completion context for Server: fires after an MDLog entry is
// committed, optionally tagging an associated MDRequest.
140 class ServerLogContext
 : public MDSLogContextBase
{
143 MDSRank
*get_mds() override
// Before finish(): record the journal-commit event on the request (if any)
// for op tracking.
149 void pre_finish(int r
) override
{
151 mdr
->mark_event("journal_committed: ");
// Constructor without a request: server must be non-null.
154 explicit ServerLogContext(Server
*s
) : server(s
) {
155 ceph_assert(server
!= NULL
);
// Constructor binding a specific MDRequest to this log context.
157 explicit ServerLogContext(Server
*s
, MDRequestRef
& r
) : server(s
), mdr(r
) {
158 ceph_assert(server
!= NULL
);
// Build and register the "mds_server" perf-counter set: a few
// PRIO_INTERESTING event counters, per-request-type average latencies at
// PRIO_USEFUL, and dispatch counters at PRIO_DEBUGONLY. The resulting
// PerfCounters object is stored in `logger` and added to the global
// collection. (NOTE(review): lossy extraction — the function's closing
// brace was dropped by the extractor.)
162 void Server::create_logger()
164 PerfCountersBuilder
plb(g_ceph_context
, "mds_server", l_mdss_first
, l_mdss_last
);
166 plb
.add_u64_counter(l_mdss_handle_client_request
, "handle_client_request",
167 "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING
);
168 plb
.add_u64_counter(l_mdss_handle_peer_request
, "handle_peer_request",
169 "Peer requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING
);
170 plb
.add_u64_counter(l_mdss_handle_client_session
,
171 "handle_client_session", "Client session messages", "hcs",
172 PerfCountersBuilder::PRIO_INTERESTING
);
173 plb
.add_u64_counter(l_mdss_cap_revoke_eviction
, "cap_revoke_eviction",
174 "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING
);
175 plb
.add_u64_counter(l_mdss_cap_acquisition_throttle
,
176 "cap_acquisition_throttle", "Cap acquisition throttle counter", "cat",
177 PerfCountersBuilder::PRIO_INTERESTING
);
179 // fop latencies are useful
180 plb
.set_prio_default(PerfCountersBuilder::PRIO_USEFUL
);
181 plb
.add_time_avg(l_mdss_req_lookuphash_latency
, "req_lookuphash_latency",
182 "Request type lookup hash of inode latency");
183 plb
.add_time_avg(l_mdss_req_lookupino_latency
, "req_lookupino_latency",
184 "Request type lookup inode latency");
185 plb
.add_time_avg(l_mdss_req_lookupparent_latency
, "req_lookupparent_latency",
186 "Request type lookup parent latency");
187 plb
.add_time_avg(l_mdss_req_lookupname_latency
, "req_lookupname_latency",
188 "Request type lookup name latency");
189 plb
.add_time_avg(l_mdss_req_lookup_latency
, "req_lookup_latency",
190 "Request type lookup latency");
191 plb
.add_time_avg(l_mdss_req_lookupsnap_latency
, "req_lookupsnap_latency",
192 "Request type lookup snapshot latency");
193 plb
.add_time_avg(l_mdss_req_getattr_latency
, "req_getattr_latency",
194 "Request type get attribute latency");
195 plb
.add_time_avg(l_mdss_req_setattr_latency
, "req_setattr_latency",
196 "Request type set attribute latency");
197 plb
.add_time_avg(l_mdss_req_setlayout_latency
, "req_setlayout_latency",
198 "Request type set file layout latency");
199 plb
.add_time_avg(l_mdss_req_setdirlayout_latency
, "req_setdirlayout_latency",
200 "Request type set directory layout latency");
201 plb
.add_time_avg(l_mdss_req_setxattr_latency
, "req_setxattr_latency",
202 "Request type set extended attribute latency");
203 plb
.add_time_avg(l_mdss_req_rmxattr_latency
, "req_rmxattr_latency",
204 "Request type remove extended attribute latency");
205 plb
.add_time_avg(l_mdss_req_readdir_latency
, "req_readdir_latency",
206 "Request type read directory latency");
207 plb
.add_time_avg(l_mdss_req_setfilelock_latency
, "req_setfilelock_latency",
208 "Request type set file lock latency");
209 plb
.add_time_avg(l_mdss_req_getfilelock_latency
, "req_getfilelock_latency",
210 "Request type get file lock latency");
211 plb
.add_time_avg(l_mdss_req_create_latency
, "req_create_latency",
212 "Request type create latency");
213 plb
.add_time_avg(l_mdss_req_open_latency
, "req_open_latency",
214 "Request type open latency");
215 plb
.add_time_avg(l_mdss_req_mknod_latency
, "req_mknod_latency",
216 "Request type make node latency");
217 plb
.add_time_avg(l_mdss_req_link_latency
, "req_link_latency",
218 "Request type link latency");
219 plb
.add_time_avg(l_mdss_req_unlink_latency
, "req_unlink_latency",
220 "Request type unlink latency");
221 plb
.add_time_avg(l_mdss_req_rmdir_latency
, "req_rmdir_latency",
222 "Request type remove directory latency");
223 plb
.add_time_avg(l_mdss_req_rename_latency
, "req_rename_latency",
224 "Request type rename latency");
225 plb
.add_time_avg(l_mdss_req_mkdir_latency
, "req_mkdir_latency",
226 "Request type make directory latency");
227 plb
.add_time_avg(l_mdss_req_symlink_latency
, "req_symlink_latency",
228 "Request type symbolic link latency");
229 plb
.add_time_avg(l_mdss_req_lssnap_latency
, "req_lssnap_latency",
230 "Request type list snapshot latency");
231 plb
.add_time_avg(l_mdss_req_mksnap_latency
, "req_mksnap_latency",
232 "Request type make snapshot latency");
233 plb
.add_time_avg(l_mdss_req_rmsnap_latency
, "req_rmsnap_latency",
234 "Request type remove snapshot latency");
235 plb
.add_time_avg(l_mdss_req_renamesnap_latency
, "req_renamesnap_latency",
236 "Request type rename snapshot latency");
// Dispatch counters are debug-only.
238 plb
.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY
);
239 plb
.add_u64_counter(l_mdss_dispatch_client_request
, "dispatch_client_request",
240 "Client requests dispatched");
241 plb
.add_u64_counter(l_mdss_dispatch_peer_request
, "dispatch_server_request",
242 "Server requests dispatched");
// Materialize the counters and register them globally.
244 logger
= plb
.create_perf_counters();
245 g_ceph_context
->get_perfcounters_collection()->add(logger
);
// Constructor: wire up the owning MDSRank's caches/log, snapshot the
// relevant mds_* config values, and advertise the MDS-supported client
// feature bits. (NOTE(review): lossy extraction — the `mds(m)` initializer
// and the closing brace were dropped by the extractor.)
248 Server::Server(MDSRank
*m
, MetricsHandler
*metrics_handler
) :
250 mdcache(mds
->mdcache
), mdlog(mds
->mdlog
),
251 recall_throttle(g_conf().get_val
<double>("mds_recall_max_decay_rate")),
252 metrics_handler(metrics_handler
)
// Cache config knobs at construction; these mirror mds_* options.
254 forward_all_requests_to_auth
= g_conf().get_val
<bool>("mds_forward_all_requests_to_auth");
255 replay_unsafe_with_closed_session
= g_conf().get_val
<bool>("mds_replay_unsafe_with_closed_session");
256 cap_revoke_eviction_timeout
= g_conf().get_val
<double>("mds_cap_revoke_eviction_timeout");
257 max_snaps_per_dir
= g_conf().get_val
<uint64_t>("mds_max_snaps_per_dir");
258 delegate_inos_pct
= g_conf().get_val
<uint64_t>("mds_client_delegate_inos_pct");
259 max_caps_per_client
= g_conf().get_val
<uint64_t>("mds_max_caps_per_client");
260 cap_acquisition_throttle
= g_conf().get_val
<uint64_t>("mds_session_cap_acquisition_throttle");
261 max_caps_throttle_ratio
= g_conf().get_val
<double>("mds_session_max_caps_throttle_ratio");
262 caps_throttle_retry_request_timeout
= g_conf().get_val
<double>("mds_cap_acquisition_throttle_retry_request_timeout");
263 supported_features
= feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED
);
// Top-level message dispatcher for the Server subsystem.
// NOTE(review): lossy extraction — `break`/`return` statements and several
// braces between the visible fragments were dropped; read against the full
// source before relying on control flow here.
// Flow: handle CLIENT_RECONNECT immediately; while not active, queue or
// drop client requests depending on replay/reconnect state; otherwise
// route by message type to the per-type handlers.
266 void Server::dispatch(const cref_t
<Message
> &m
)
268 switch (m
->get_type()) {
269 case CEPH_MSG_CLIENT_RECONNECT
:
270 handle_client_reconnect(ref_cast
<MClientReconnect
>(m
));
275 *In reconnect phase, client sent unsafe requests to mds before reconnect msg. Seting sessionclosed_isok will handle scenario like this:
277 1. In reconnect phase, client sent unsafe requests to mds.
278 2. It reached reconnect timeout. All sessions without sending reconnect msg in time, some of which may had sent unsafe requests, are marked as closed.
279 (Another situation is #31668, which will deny all client reconnect msg to speed up reboot).
280 3.So these unsafe request from session without sending reconnect msg in time or being denied could be handled in clientreplay phase.
283 bool sessionclosed_isok
= replay_unsafe_with_closed_session
;
285 // handle_peer_request()/handle_client_session() will wait if necessary
286 if (m
->get_type() == CEPH_MSG_CLIENT_REQUEST
&& !mds
->is_active()) {
287 const auto &req
= ref_cast
<MClientRequest
>(m
);
288 if (mds
->is_reconnect() || mds
->get_want_state() == CEPH_MDS_STATE_RECONNECT
) {
289 Session
*session
= mds
->get_session(req
);
// Drop requests from sessions that are closed (unless configured to
// accept unsafe replayed requests from closed sessions — see above).
290 if (!session
|| (!session
->is_open() && !sessionclosed_isok
)) {
291 dout(5) << "session is closed, dropping " << req
->get_reqid() << dendl
;
294 bool queue_replay
= false;
295 if (req
->is_replay() || req
->is_async()) {
296 dout(3) << "queuing replayed op" << dendl
;
299 !session
->have_completed_request(req
->get_reqid().tid
, nullptr)) {
300 inodeno_t
ino(req
->head
.ino
);
301 mdcache
->add_replay_ino_alloc(ino
);
// Keep prealloc inos that a later replayed create will consume out of
// the purge set; track them as delegated instead.
302 if (replay_unsafe_with_closed_session
&&
303 session
->free_prealloc_inos
.contains(ino
)) {
304 // don't purge inodes that will be created by later replay
305 session
->free_prealloc_inos
.erase(ino
);
306 session
->delegated_inos
.insert(ino
);
309 } else if (req
->get_retry_attempt()) {
310 // process completed request in clientreplay stage. The completed request
311 // might have created new file/directorie. This guarantees MDS sends a reply
312 // to client before other request modifies the new file/directorie.
313 if (session
->have_completed_request(req
->get_reqid().tid
, NULL
)) {
314 dout(3) << "queuing completed op" << dendl
;
317 // this request was created before the cap reconnect message, drop any embedded
319 req
->releases
.clear();
322 req
->mark_queued_for_replay();
323 mds
->enqueue_replay(new C_MDS_RetryMessage(mds
, m
));
// Not in reconnect: decide whether to park the request until active.
328 bool wait_for_active
= true;
329 if (mds
->is_stopping()) {
330 wait_for_active
= false;
331 } else if (mds
->is_clientreplay()) {
332 if (req
->is_queued_for_replay()) {
333 wait_for_active
= false;
336 if (wait_for_active
) {
337 dout(3) << "not active yet, waiting" << dendl
;
338 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
// Normal routing by message type.
343 switch (m
->get_type()) {
344 case CEPH_MSG_CLIENT_SESSION
:
345 handle_client_session(ref_cast
<MClientSession
>(m
));
347 case CEPH_MSG_CLIENT_REQUEST
:
348 handle_client_request(ref_cast
<MClientRequest
>(m
));
350 case CEPH_MSG_CLIENT_RECLAIM
:
351 handle_client_reclaim(ref_cast
<MClientReclaim
>(m
));
353 case MSG_MDS_PEER_REQUEST
:
354 handle_peer_request(ref_cast
<MMDSPeerRequest
>(m
));
// Unknown types are a programming error — abort loudly.
357 derr
<< "server unknown message " << m
->get_type() << dendl
;
358 ceph_abort_msg("server unknown message");
364 // ----------------------------------------------------------
365 // SESSION management
// NOTE(review): lossy extraction — member declarations (session, state_seq,
// open, cmapv, inotablev, fin) and closing braces were dropped.
// Journal-commit context for session open/close: when the ESession entry is
// committed, applies the session state change (and optional ino
// free/purge) via Server::_session_logged().
367 class C_MDS_session_finish
 : public ServerLogContext
{
372 interval_set
<inodeno_t
> inos_to_free
;
374 interval_set
<inodeno_t
> inos_to_purge
;
375 LogSegment
*ls
= nullptr;
// Simple form: no inode bookkeeping (inotablev == 0).
378 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
, Context
*fin_
= nullptr) :
379 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
), inotablev(0), fin(fin_
) { }
// Full form: also releases `to_free` inos (inotable version iv) and purges
// `to_purge` inos against log segment _ls.
380 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
,
381 const interval_set
<inodeno_t
>& to_free
, version_t iv
,
382 const interval_set
<inodeno_t
>& to_purge
, LogSegment
*_ls
, Context
*fin_
= nullptr) :
383 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
),
384 inos_to_free(to_free
), inotablev(iv
), inos_to_purge(to_purge
), ls(_ls
), fin(fin_
) {}
// On journal commit, apply the state change.
385 void finish(int r
) override
{
387 server
->_session_logged(session
, state_seq
, open
, cmapv
, inos_to_free
, inotablev
, inos_to_purge
, ls
);
// Find a session whose client metadata "uuid" matches `uuid`.
// NOTE(review): lossy extraction — the `continue`, the first-match branch
// and the trailing return were dropped; the visible asserts sanity-check
// the reclaiming_from linkage between duplicate-uuid sessions.
394 Session
* Server::find_session_by_uuid(std::string_view uuid
)
396 Session
* session
= nullptr;
397 for (auto& it
: mds
->sessionmap
.get_sessions()) {
398 auto& metadata
= it
.second
->info
.client_metadata
;
400 auto p
= metadata
.find("uuid");
// Skip sessions without a matching uuid entry.
401 if (p
== metadata
.end() || p
->second
!= uuid
)
406 } else if (!session
->reclaiming_from
) {
407 assert(it
.second
->reclaiming_from
== session
);
410 assert(session
->reclaiming_from
== it
.second
);
// Handle the first phase of MClientReclaim: validate the request (uuid
// present, supported flags, matching auth identity), link the requesting
// session to the session being reclaimed, and — for RESET — finish
// immediately. Replies carry a result code and, on success, the target's
// address. NOTE(review): lossy extraction — `return` statements after each
// error reply and some braces were dropped.
416 void Server::reclaim_session(Session
*session
, const cref_t
<MClientReclaim
> &m
)
418 if (!session
->is_open() && !session
->is_stale()) {
419 dout(10) << "session not open, dropping this req" << dendl
;
423 auto reply
= make_message
<MClientReclaimReply
>(0);
// Reject: reclaim requires a uuid to identify the old session.
424 if (m
->get_uuid().empty()) {
425 dout(10) << __func__
<< " invalid message (no uuid)" << dendl
;
426 reply
->set_result(-CEPHFS_EINVAL
);
427 mds
->send_message_client(reply
, session
);
431 unsigned flags
= m
->get_flags();
432 if (flags
!= CEPH_RECLAIM_RESET
) { // currently only support reset
433 dout(10) << __func__
<< " unsupported flags" << dendl
;
434 reply
->set_result(-CEPHFS_EOPNOTSUPP
);
435 mds
->send_message_client(reply
, session
);
439 Session
* target
= find_session_by_uuid(m
->get_uuid());
// Only the same authenticated entity may reclaim its old session.
441 if (session
->info
.auth_name
!= target
->info
.auth_name
) {
442 dout(10) << __func__
<< " session auth_name " << session
->info
.auth_name
443 << " != target auth_name " << target
->info
.auth_name
<< dendl
;
444 reply
->set_result(-CEPHFS_EPERM
);
445 mds
->send_message_client(reply
, session
);
// Link the reclaiming session to its target and tell the client where the
// old session lived.
448 assert(!target
->reclaiming_from
);
449 assert(!session
->reclaiming_from
);
450 session
->reclaiming_from
= target
;
451 reply
->set_addrs(entity_addrvec_t(target
->info
.inst
.addr
));
454 if (flags
& CEPH_RECLAIM_RESET
) {
455 finish_reclaim_session(session
, reply
);
// Second phase of session reclaim: tear down the old (target) session —
// either kill it directly or evict/blocklist the client — and reply to the
// reclaiming client with the current OSD epoch once teardown completes.
// The reply is deferred through a LambdaContext that re-resolves the
// session by id, since the original pointer may be gone by then.
// NOTE(review): lossy extraction — several braces/returns are missing.
462 void Server::finish_reclaim_session(Session
*session
, const ref_t
<MClientReclaimReply
> &reply
)
464 Session
*target
= session
->reclaiming_from
;
466 session
->reclaiming_from
= nullptr;
// Deferred reply: capture the client id (not the Session*) so the
// callback can safely re-look-up the session under the mds_lock.
470 int64_t session_id
= session
->get_client().v
;
471 send_reply
= new LambdaContext([this, session_id
, reply
](int r
) {
472 assert(ceph_mutex_is_locked_by_me(mds
->mds_lock
));
473 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(session_id
));
477 auto epoch
= mds
->objecter
->with_osdmap([](const OSDMap
&map
){ return map
.get_epoch(); });
478 reply
->set_epoch(epoch
);
479 mds
->send_message_client(reply
, session
);
482 send_reply
= nullptr;
// If the target is already blocklisted (or blocklist-on-evict is off),
// a plain kill_session suffices; otherwise evict (and blocklist) it.
485 bool blocklisted
= mds
->objecter
->with_osdmap([target
](const OSDMap
&map
) {
486 return map
.is_blocklisted(target
->info
.inst
.addr
);
489 if (blocklisted
|| !g_conf()->mds_session_blocklist_on_evict
) {
490 kill_session(target
, send_reply
);
492 CachedStackStringStream css
;
493 mds
->evict_client(target
->get_client().v
, false, true, *css
, send_reply
);
// No target to tear down: reply immediately.
496 mds
->send_message_client(reply
, session
);
// Entry point for MClientReclaim messages: validate sender/session and
// fs-name caps, wait for clientreplay if the MDS isn't ready, then route
// to finish_reclaim_session() (FLAG_FINISH) or reclaim_session().
// NOTE(review): lossy extraction — the sessionless-check `if` and several
// `return` statements were dropped.
500 void Server::handle_client_reclaim(const cref_t
<MClientReclaim
> &m
)
502 Session
*session
= mds
->get_session(m
);
503 dout(3) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
504 assert(m
->get_source().is_client()); // should _not_ come from an mds!
507 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
// Enforce per-filesystem capability before acting on the message.
511 std::string_view fs_name
= mds
->get_fs_name();
512 if (!fs_name
.empty() && !session
->fs_name_capable(fs_name
, MAY_READ
)) {
513 dout(0) << " dropping message not allowed for this fs_name: " << *m
<< dendl
;
// Too early in boot: retry once we reach clientreplay.
517 if (mds
->get_state() < MDSMap::STATE_CLIENTREPLAY
) {
518 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
522 if (m
->get_flags() & MClientReclaim::FLAG_FINISH
) {
523 finish_reclaim_session(session
);
525 reclaim_session(session
, m
);
// Handle all MClientSession operations (open, renewcaps, close, flush ack,
// flush-mdlog request). Validates the sender, fs-name caps and MDS state,
// then switches on the op. NOTE(review): lossy extraction — many `return`/
// `break` statements, braces and some statements between the visible
// fragments were dropped; treat control flow here as indicative only.
529 void Server::handle_client_session(const cref_t
<MClientSession
> &m
)
532 Session
*session
= mds
->get_session(m
);
534 dout(3) << "handle_client_session " << *m
<< " from " << m
->get_source() << dendl
;
535 ceph_assert(m
->get_source().is_client()); // should _not_ come from an mds!
// No session: reject outright.
538 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
539 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
540 reply
->metadata
["error_string"] = "sessionless";
541 mds
->send_message(reply
, m
->get_connection());
// Per-filesystem capability check.
545 std::string_view fs_name
= mds
->get_fs_name();
546 if (!fs_name
.empty() && !session
->fs_name_capable(fs_name
, MAY_READ
)) {
547 dout(0) << " dropping message not allowed for this fs_name: " << *m
<< dendl
;
548 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
549 reply
->metadata
["error_string"] = "client doesn't have caps for FS \"" +
550 std::string(fs_name
) + "\"";
551 mds
->send_message(std::move(reply
), m
->get_connection());
// Gate on MDS state: renewcaps is always ok; close needs active; the rest
// need at least clientreplay.
555 if (m
->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS
) {
556 // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
557 } else if (m
->get_op() == CEPH_SESSION_REQUEST_CLOSE
) {
558 // close requests need to be handled when mds is active
559 if (mds
->get_state() < MDSMap::STATE_ACTIVE
) {
560 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
564 if (mds
->get_state() < MDSMap::STATE_CLIENTREPLAY
) {
565 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
571 logger
->inc(l_mdss_handle_client_session
);
574 switch (m
->get_op()) {
575 case CEPH_SESSION_REQUEST_OPEN
:
// Duplicate/late opens are dropped; only closed/closing sessions proceed.
576 if (session
->is_opening() ||
577 session
->is_open() ||
578 session
->is_stale() ||
579 session
->is_killing() ||
580 terminating_sessions
) {
581 dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl
;
584 ceph_assert(session
->is_closed() || session
->is_closing());
586 if (mds
->is_stopping()) {
587 dout(10) << "mds is stopping, dropping open req" << dendl
;
// Record the client's metadata/features as supplied in the open message.
592 auto& addr
= session
->info
.inst
.addr
;
593 session
->set_client_metadata(client_metadata_t(m
->metadata
, m
->supported_features
, m
->metric_spec
));
594 auto& client_metadata
= session
->info
.client_metadata
;
// Helper: log a structured one-line summary of the session-open outcome
// (status, elapsed/throttle time, optional error and claimed root).
596 auto log_session_status
= [this, m
, session
](std::string_view status
, std::string_view err
) {
597 auto now
= ceph_clock_now();
598 auto throttle_elapsed
= m
->get_recv_complete_stamp() - m
->get_throttle_stamp();
599 auto elapsed
= now
- m
->get_recv_stamp();
600 CachedStackStringStream css
;
601 *css
<< "New client session:"
602 << " addr=\"" << session
->info
.inst
.addr
<< "\""
603 << ",elapsed=" << elapsed
604 << ",throttled=" << throttle_elapsed
605 << ",status=\"" << status
<< "\"";
607 *css
<< ",error=\"" << err
<< "\"";
609 const auto& metadata
= session
->info
.client_metadata
;
610 if (auto it
= metadata
.find("root"); it
!= metadata
.end()) {
611 *css
<< ",root=\"" << it
->second
<< "\"";
613 dout(2) << css
->strv() << dendl
;
// Helper: send a REJECT (with error_string when the client understands
// it, i.e. has the MIMIC feature) and log the rejection.
616 auto send_reject_message
= [this, &session
, &log_session_status
](std::string_view err_str
) {
617 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
618 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
))
619 m
->metadata
["error_string"] = err_str
;
620 mds
->send_message_client(m
, session
);
621 log_session_status("REJECTED", err_str
);
// Reject clients whose address is blocklisted in the OSDMap.
624 bool blocklisted
= mds
->objecter
->with_osdmap(
625 [&addr
](const OSDMap
&osd_map
) -> bool {
626 return osd_map
.is_blocklisted(addr
);
630 dout(10) << "rejecting blocklisted client " << addr
<< dendl
;
631 // This goes on the wire and the "blacklisted" substring is
632 // depended upon by the kernel client for detecting whether it
633 // has been blocklisted. If mounted with recover_session=clean
634 // (since 5.4), it tries to automatically recover itself from
636 send_reject_message("blocklisted (blacklisted)");
// Old clients may not report features; infer them.
641 if (client_metadata
.features
.empty())
642 infer_supported_features(session
, client_metadata
);
644 dout(20) << __func__
<< " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl
;
645 dout(20) << " features: '" << client_metadata
.features
<< "'" << dendl
;
646 dout(20) << " metric specification: [" << client_metadata
.metric_spec
<< "]" << dendl
;
647 for (const auto& p
: client_metadata
) {
648 dout(20) << " " << p
.first
<< ": " << p
.second
<< dendl
;
// Reject clients lacking any feature this MDS requires.
651 feature_bitset_t missing_features
= required_client_features
;
652 missing_features
-= client_metadata
.features
;
653 if (!missing_features
.empty()) {
654 CachedStackStringStream css
;
655 *css
<< "missing required features '" << missing_features
<< "'";
656 send_reject_message(css
->strv());
657 mds
->clog
->warn() << "client session (" << session
->info
.inst
658 << ") lacks required features " << missing_features
659 << "; client supports " << client_metadata
.features
;
664 // Special case for the 'root' metadata path; validate that the claimed
665 // root is actually within the caps of the session
666 if (auto it
= client_metadata
.find("root"); it
!= client_metadata
.end()) {
667 auto claimed_root
= it
->second
;
668 CachedStackStringStream css
;
670 // claimed_root has a leading "/" which we strip before passing
672 if (claimed_root
.empty() || claimed_root
[0] != '/') {
674 *css
<< "invalue root '" << claimed_root
<< "'";
675 } else if (!session
->auth_caps
.path_capable(claimed_root
.substr(1))) {
677 *css
<< "non-allowable root '" << claimed_root
<< "'";
681 // Tell the client we're rejecting their open
682 send_reject_message(css
->strv());
683 mds
->clog
->warn() << "client session with " << css
->strv()
684 << " denied (" << session
->info
.inst
<< ")";
// Reject duplicate session uuids (used by reclaim).
690 if (auto it
= client_metadata
.find("uuid"); it
!= client_metadata
.end()) {
691 if (find_session_by_uuid(it
->second
)) {
692 send_reject_message("duplicated session uuid");
693 mds
->clog
->warn() << "client session with duplicated session uuid '"
694 << it
->second
<< "' denied (" << session
->info
.inst
<< ")";
// Accept: project the sessionmap, move to OPENING, and journal an
// ESession; the C_MDS_session_finish callback completes the open.
700 if (session
->is_closed()) {
701 mds
->sessionmap
.add_session(session
);
704 pv
= mds
->sessionmap
.mark_projected(session
);
705 sseq
= mds
->sessionmap
.set_state(session
, Session::STATE_OPENING
);
706 mds
->sessionmap
.touch_session(session
);
707 auto fin
= new LambdaContext([log_session_status
= std::move(log_session_status
)](int r
){
709 log_session_status("ACCEPTED", "");
711 mdlog
->start_submit_entry(new ESession(m
->get_source_inst(), true, pv
, client_metadata
),
712 new C_MDS_session_finish(this, session
, sseq
, true, pv
, fin
));
717 case CEPH_SESSION_REQUEST_RENEWCAPS
:
// Renew caps for open/stale sessions; a stale session is revived.
718 if (session
->is_open() || session
->is_stale()) {
719 mds
->sessionmap
.touch_session(session
);
720 if (session
->is_stale()) {
721 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
722 mds
->locker
->resume_stale_caps(session
);
723 mds
->sessionmap
.touch_session(session
);
725 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_RENEWCAPS
, m
->get_seq());
726 mds
->send_message_client(reply
, session
);
728 dout(10) << "ignoring renewcaps on non open|stale session (" << session
->get_state_name() << ")" << dendl
;
732 case CEPH_SESSION_REQUEST_CLOSE
:
// Close: drop if already closing/killing or importing; otherwise verify
// the push seq before journaling the close.
734 if (session
->is_closed() ||
735 session
->is_closing() ||
736 session
->is_killing()) {
737 dout(10) << "already closed|closing|killing, dropping this req" << dendl
;
740 if (session
->is_importing()) {
741 dout(10) << "ignoring close req on importing session" << dendl
;
744 ceph_assert(session
->is_open() ||
745 session
->is_stale() ||
746 session
->is_opening());
747 if (m
->get_seq() < session
->get_push_seq()) {
748 dout(10) << "old push seq " << m
->get_seq() << " < " << session
->get_push_seq()
749 << ", dropping" << dendl
;
752 // We are getting a seq that is higher than expected.
753 // Handle the same as any other seqn error.
755 if (m
->get_seq() != session
->get_push_seq()) {
756 dout(0) << "old push seq " << m
->get_seq() << " != " << session
->get_push_seq()
757 << ", BUGGY!" << dendl
;
758 mds
->clog
->warn() << "incorrect push seq " << m
->get_seq() << " != "
759 << session
->get_push_seq() << ", dropping" << " from client : " << session
->get_human_name();
762 journal_close_session(session
, Session::STATE_CLOSING
, NULL
);
766 case CEPH_SESSION_FLUSHMSG_ACK
:
767 finish_flush_session(session
, m
->get_seq());
770 case CEPH_SESSION_REQUEST_FLUSH_MDLOG
:
771 if (mds
->is_active())
// Ask a client to ack a flush: registers a waiter on the gather and sends
// CEPH_SESSION_FLUSHMSG with the wait seq. Skipped (early return, dropped
// by extraction) for sessions that are not open, have no connection, or
// whose peer lacks CEPH_FEATURE_EXPORT_PEER.
780 void Server::flush_session(Session
*session
, MDSGatherBuilder
& gather
) {
781 if (!session
->is_open() ||
782 !session
->get_connection() ||
783 !session
->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER
)) {
787 version_t seq
= session
->wait_for_flush(gather
.new_sub());
788 mds
->send_message_client(
789 make_message
<MClientSession
>(CEPH_SESSION_FLUSHMSG
, seq
), session
);
// Flush every session in `client_set`; each client id must resolve to an
// existing session (asserted). Completion is tracked via `gather`.
792 void Server::flush_client_sessions(set
<client_t
>& client_set
, MDSGatherBuilder
& gather
)
794 for (const auto& client
: client_set
) {
795 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(client
.v
));
796 ceph_assert(session
);
797 flush_session(session
, gather
);
// Complete a session flush at `seq`: collect the waiters registered by
// flush_session() and requeue them on the MDS.
801 void Server::finish_flush_session(Session
*session
, version_t seq
)
803 MDSContext::vec finished
;
804 session
->finish_flush(seq
, finished
);
805 mds
->queue_waiters(finished
);
// Callback run once an ESession open/close has been journaled (via
// C_MDS_session_finish): applies inode purge/free bookkeeping, marks the
// sessionmap dirty, then finishes the state transition — OPENING→OPEN
// (send CEPH_SESSION_OPEN), or CLOSING/KILLING→CLOSED (tear down caps,
// leases, reconnect/reclaim tracking, connection, and the session itself).
// NOTE(review): lossy extraction — many `return`/`break` statements and
// braces between the visible fragments were dropped.
808 void Server::_session_logged(Session
*session
, uint64_t state_seq
, bool open
, version_t pv
,
809 const interval_set
<inodeno_t
>& inos_to_free
, version_t piv
,
810 const interval_set
<inodeno_t
>& inos_to_purge
, LogSegment
*ls
)
812 dout(10) << "_session_logged " << session
->info
.inst
813 << " state_seq " << state_seq
814 << " " << (open
? "open":"close") << " " << pv
815 << " inos_to_free " << inos_to_free
<< " inotablev " << piv
816 << " inos_to_purge " << inos_to_purge
<< dendl
;
// Purge: remove from the session's preallocated inos and schedule purge
// against the log segment (only actionable in clientreplay/active/stopping).
819 if (inos_to_purge
.size()){
821 session
->info
.prealloc_inos
.subtract(inos_to_purge
);
822 ls
->purging_inodes
.insert(inos_to_purge
);
823 if (mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping())
824 mdcache
->purge_inodes(inos_to_purge
, ls
);
// Free: release preallocated inos back to the inotable; the journaled
// inotable version must match what we applied.
827 if (inos_to_free
.size()) {
829 ceph_assert(session
->is_closing() || session
->is_killing() ||
830 session
->is_opening()); // re-open closing session
831 session
->info
.prealloc_inos
.subtract(inos_to_free
);
832 mds
->inotable
->apply_release_ids(inos_to_free
);
833 ceph_assert(mds
->inotable
->get_version() == piv
);
835 session
->free_prealloc_inos
= session
->info
.prealloc_inos
;
836 session
->delegated_inos
.clear();
839 mds
->sessionmap
.mark_dirty(session
);
// A state_seq mismatch means the transition was superseded (e.g. canceled
// by an import) while the journal entry was in flight — do nothing.
842 if (session
->get_state_seq() != state_seq
) {
843 dout(10) << " journaled state_seq " << state_seq
<< " != current " << session
->get_state_seq()
844 << ", noop" << dendl
;
845 // close must have been canceled (by an import?), or any number of other things..
// Open path: OPENING -> OPEN, notify metrics, send CEPH_SESSION_OPEN
// (with supported features for MIMIC+ clients), and FORCE_RO if readonly.
847 ceph_assert(session
->is_opening());
848 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
849 mds
->sessionmap
.touch_session(session
);
850 metrics_handler
->add_session(session
);
851 ceph_assert(session
->get_connection());
852 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_OPEN
);
853 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
))
854 reply
->supported_features
= supported_features
;
855 mds
->send_message_client(reply
, session
);
856 if (mdcache
->is_readonly()) {
857 auto m
= make_message
<MClientSession
>(CEPH_SESSION_FORCE_RO
);
858 mds
->send_message_client(m
, session
);
860 } else if (session
->is_closing() ||
861 session
->is_killing()) {
862 // kill any lingering capabilities, leases, requests
863 bool killing
= session
->is_killing();
864 while (!session
->caps
.empty()) {
865 Capability
*cap
= session
->caps
.front();
866 CInode
*in
= cap
->get_inode();
867 dout(20) << " killing capability " << ccap_string(cap
->issued()) << " on " << *in
<< dendl
;
868 mds
->locker
->remove_client_cap(in
, cap
, killing
);
870 while (!session
->leases
.empty()) {
871 ClientLease
*r
= session
->leases
.front();
872 CDentry
*dn
= static_cast<CDentry
*>(r
->parent
);
873 dout(20) << " killing client lease of " << *dn
<< dendl
;
874 dn
->remove_client_lease(r
, mds
->locker
);
// Drop the client from reconnect/reclaim gathers, finishing either phase
// if this was the last outstanding client.
876 if (client_reconnect_gather
.erase(session
->info
.get_client())) {
877 dout(20) << " removing client from reconnect set" << dendl
;
878 if (client_reconnect_gather
.empty()) {
879 dout(7) << " client " << session
->info
.inst
<< " was last reconnect, finishing" << dendl
;
880 reconnect_gather_finish();
883 if (client_reclaim_gather
.erase(session
->info
.get_client())) {
884 dout(20) << " removing client from reclaim set" << dendl
;
885 if (client_reclaim_gather
.empty()) {
886 dout(7) << " client " << session
->info
.inst
<< " was last reclaimed, finishing" << dendl
;
887 mds
->maybe_clientreplay_done();
891 if (session
->is_closing()) {
892 // mark con disposable. if there is a fault, we will get a
893 // reset and clean it up. if the client hasn't received the
894 // CLOSE message yet, they will reconnect and get an
895 // ms_handle_remote_reset() and realize they had in fact closed.
896 // do this *before* sending the message to avoid a possible
898 if (session
->get_connection()) {
899 // Conditional because terminate_sessions will indiscrimately
900 // put sessions in CLOSING whether they ever had a conn or not.
901 session
->get_connection()->mark_disposable();
905 mds
->send_message_client(make_message
<MClientSession
>(CEPH_SESSION_CLOSE
), session
);
906 mds
->sessionmap
.set_state(session
, Session::STATE_CLOSED
);
908 metrics_handler
->remove_session(session
);
909 mds
->sessionmap
.remove_session(session
);
910 } else if (session
->is_killing()) {
911 // destroy session, close connection
912 if (session
->get_connection()) {
913 session
->get_connection()->mark_down();
914 mds
->sessionmap
.set_state(session
, Session::STATE_CLOSED
);
915 session
->set_connection(nullptr);
917 metrics_handler
->remove_session(session
);
918 mds
->sessionmap
.remove_session(session
);
928 * Inject sessions from some source other than actual connections.
931 * - sessions inferred from journal replay
932 * - sessions learned from other MDSs during rejoin
933 * - sessions learned from other MDSs during dir/caps migration
934 * - sessions learned from other MDSs during a cross-MDS rename
// Force-open sessions learned from a source other than a live connection
// (journal replay, rejoin, migration, cross-MDS rename — see comment block
// above). Filters out blocklisted clients, projects each remaining session
// in the sessionmap (moving closed/closing/killing ones to OPENING and
// merging supplied client metadata), records (session, state_seq) pairs in
// `smap`, and returns the projected sessionmap version.
// NOTE(review): lossy extraction — erase/continue statements inside the
// blocklist filter loop and several braces were dropped.
936 version_t
Server::prepare_force_open_sessions(map
<client_t
,entity_inst_t
>& cm
,
937 map
<client_t
,client_metadata_t
>& cmm
,
938 map
<client_t
, pair
<Session
*,uint64_t> >& smap
)
940 version_t pv
= mds
->sessionmap
.get_projected();
942 dout(10) << "prepare_force_open_sessions " << pv
943 << " on " << cm
.size() << " clients"
// Drop blocklisted clients from the candidate map before opening anything.
946 mds
->objecter
->with_osdmap(
947 [this, &cm
, &cmm
](const OSDMap
&osd_map
) {
948 for (auto p
= cm
.begin(); p
!= cm
.end(); ) {
949 if (osd_map
.is_blocklisted(p
->second
.addr
)) {
950 dout(10) << " ignoring blocklisted client." << p
->first
951 << " (" << p
->second
.addr
<< ")" << dendl
;
// Project each surviving client's session; new/closed sessions move to
// OPENING and pick up any provided metadata.
960 for (map
<client_t
,entity_inst_t
>::iterator p
= cm
.begin(); p
!= cm
.end(); ++p
) {
961 Session
*session
= mds
->sessionmap
.get_or_add_session(p
->second
);
962 pv
= mds
->sessionmap
.mark_projected(session
);
964 if (session
->is_closed() ||
965 session
->is_closing() ||
966 session
->is_killing()) {
967 sseq
= mds
->sessionmap
.set_state(session
, Session::STATE_OPENING
);
968 auto q
= cmm
.find(p
->first
);
970 session
->info
.client_metadata
.merge(q
->second
);
972 ceph_assert(session
->is_open() ||
973 session
->is_opening() ||
974 session
->is_stale());
977 smap
[p
->first
] = make_pair(session
, sseq
);
978 session
->inc_importing();
// Second half of the force-open dance: commit the OPENING sessions projected by
// prepare_force_open_sessions once the journal event is durable, notify the
// clients, and drop the importing pin.
// NOTE(review): elided extraction — some original lines (second parameter,
// braces, else arms) are missing between the numbered fragments.
983 void Server::finish_force_open_sessions(const map
<client_t
,pair
<Session
*,uint64_t> >& smap
,
987 * FIXME: need to carefully consider the race conditions between a
988 * client trying to close a session and an MDS doing an import
989 * trying to force open a session...
991 dout(10) << "finish_force_open_sessions on " << smap
.size() << " clients,"
992 << " initial v " << mds
->sessionmap
.get_version() << dendl
;
994 for (auto &it
: smap
) {
995 Session
*session
= it
.second
.first
;
996 uint64_t sseq
= it
.second
.second
;
// If the state sequence moved since we projected, the session changed state
// underneath us (e.g. client-initiated close) — skip it.
998 if (session
->get_state_seq() != sseq
) {
999 dout(10) << "force_open_sessions skipping changed " << session
->info
.inst
<< dendl
;
1001 dout(10) << "force_open_sessions opened " << session
->info
.inst
<< dendl
;
1002 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
1003 mds
->sessionmap
.touch_session(session
);
1004 metrics_handler
->add_session(session
);
// Tell the client its session is open; advertise our feature set only to
// clients that understand it (MIMIC+).
1006 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_OPEN
);
1007 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
))
1008 reply
->supported_features
= supported_features
;
1009 mds
->send_message_client(reply
, session
);
1011 if (mdcache
->is_readonly())
1012 mds
->send_message_client(make_message
<MClientSession
>(CEPH_SESSION_FORCE_RO
), session
);
// Already-open (or stale) sessions need no transition.
1015 dout(10) << "force_open_sessions skipping already-open " << session
->info
.inst
<< dendl
;
1016 ceph_assert(session
->is_open() || session
->is_stale());
// Drop the pin taken in prepare_force_open_sessions.
1020 session
->dec_importing();
1023 mds
->sessionmap
.mark_dirty(session
);
1026 dout(10) << __func__
<< ": final v " << mds
->sessionmap
.get_version() << dendl
;
// Completion context queued by terminate_sessions(): once the session-close
// journal entries are safe, clear the terminating_sessions flag on the Server.
// NOTE(review): closing braces/access specifier elided in this extraction.
1029 class C_MDS_TerminatedSessions
: public ServerContext
{
1030 void finish(int r
) override
{
1031 server
->terminating_sessions
= false;
1034 explicit C_MDS_TerminatedSessions(Server
*s
) : ServerContext(s
) {}
// Close every client session (e.g. at shutdown). Sessions already
// closing/killing/closed are skipped; the rest are journaled into CLOSING.
// Completion is signaled via C_MDS_TerminatedSessions once the log is safe.
1037 void Server::terminate_sessions()
1039 dout(5) << "terminating all sessions..." << dendl
;
1041 terminating_sessions
= true;
1043 // kill them off. clients will retry etc.
1044 set
<Session
*> sessions
;
1045 mds
->sessionmap
.get_client_session_set(sessions
);
1046 for (set
<Session
*>::const_iterator p
= sessions
.begin();
1047 p
!= sessions
.end();
1049 Session
*session
= *p
;
// Skip sessions already on their way out.
1050 if (session
->is_closing() ||
1051 session
->is_killing() ||
1052 session
->is_closed())
1054 journal_close_session(session
, Session::STATE_CLOSING
, NULL
);
// Flip terminating_sessions back to false only after the log is safe.
1057 mdlog
->wait_for_safe(new C_MDS_TerminatedSessions(this));
// Periodic scan for unresponsive clients: marks laggy OPEN sessions STALE
// (revoking their caps/leases), and evicts sessions that stayed unresponsive
// past the autoclose threshold or a per-session "timeout" metadata override.
// Does nothing if this MDS itself was recently laggy, to avoid punishing
// clients for our own stall.
// NOTE(review): elided extraction — continues/braces/else arms between the
// numbered fragments are missing from this view.
1061 void Server::find_idle_sessions()
1063 auto now
= clock::now();
1064 auto last_cleared_laggy
= mds
->last_cleared_laggy();
1066 dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy
<< "s ago" << dendl
;
1069 // (caps go stale, lease die)
1070 double queue_max_age
= mds
->get_dispatch_queue_max_age(ceph_clock_now());
1071 double cutoff
= queue_max_age
+ mds
->mdsmap
->get_session_timeout();
1073 // don't kick clients if we've been laggy
1074 if (last_cleared_laggy
< cutoff
) {
1075 dout(10) << " last cleared laggy " << last_cleared_laggy
<< "s ago (< cutoff " << cutoff
1076 << "), not marking any client stale" << dendl
;
1080 std::vector
<Session
*> to_evict
;
// Pass 1: walk OPEN sessions (ordered by staleness in by_state) and decide
// which become STALE, which are evicted directly, and which are deferred.
1082 bool defer_session_stale
= g_conf().get_val
<bool>("mds_defer_session_stale");
1083 const auto sessions_p1
= mds
->sessionmap
.by_state
.find(Session::STATE_OPEN
);
1084 if (sessions_p1
!= mds
->sessionmap
.by_state
.end() && !sessions_p1
->second
->empty()) {
1085 std::vector
<Session
*> new_stale
;
1087 for (auto session
: *(sessions_p1
->second
)) {
1088 auto last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_cap_renew
).count();
1089 if (last_cap_renew_span
< cutoff
) {
1090 dout(20) << "laggiest active session is " << session
->info
.inst
1091 << " and renewed caps recently (" << last_cap_renew_span
<< "s ago)" << dendl
;
// Any message from the client counts as liveness, not just cap renewals.
1095 if (session
->last_seen
> session
->last_cap_renew
) {
1096 last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_seen
).count();
1097 if (last_cap_renew_span
< cutoff
) {
1098 dout(20) << "laggiest active session is " << session
->info
.inst
1099 << " and renewed caps recently (" << last_cap_renew_span
<< "s ago)" << dendl
;
1104 if (last_cap_renew_span
>= mds
->mdsmap
->get_session_autoclose()) {
1105 dout(20) << "evicting session " << session
->info
.inst
<< " since autoclose "
1106 "has arrived" << dendl
;
1107 // evict session without marking it stale
1108 to_evict
.push_back(session
);
// Optionally defer the STALE transition for sessions holding no caps we are
// revoking and with no flush waiters — marking them stale buys nothing.
1112 if (defer_session_stale
&&
1113 !session
->is_any_flush_waiter() &&
1114 !mds
->locker
->is_revoking_any_caps_from(session
->get_client())) {
1115 dout(20) << "deferring marking session " << session
->info
.inst
<< " stale "
1116 "since it holds no caps" << dendl
;
// Per-session "timeout" metadata overrides the global session timeout;
// 0 means infinite (never auto-evicted here).
1120 auto it
= session
->info
.client_metadata
.find("timeout");
1121 if (it
!= session
->info
.client_metadata
.end()) {
1122 unsigned timeout
= strtoul(it
->second
.c_str(), nullptr, 0);
1124 dout(10) << "skipping session " << session
->info
.inst
1125 << ", infinite timeout specified" << dendl
;
1128 double cutoff
= queue_max_age
+ timeout
;
1129 if (last_cap_renew_span
< cutoff
) {
1130 dout(10) << "skipping session " << session
->info
.inst
1131 << ", timeout (" << timeout
<< ") specified"
1132 << " and renewed caps recently (" << last_cap_renew_span
<< "s ago)" << dendl
;
1136 // do not go through stale, evict it directly.
1137 to_evict
.push_back(session
);
1139 dout(10) << "new stale session " << session
->info
.inst
1140 << " last renewed caps " << last_cap_renew_span
<< "s ago" << dendl
;
1141 new_stale
.push_back(session
);
// Transition the newly-stale sessions; if cap revocation succeeds, drop
// leases and ping the client with SESSION_STALE, else evict.
1145 for (auto session
: new_stale
) {
1146 mds
->sessionmap
.set_state(session
, Session::STATE_STALE
);
1147 if (mds
->locker
->revoke_stale_caps(session
)) {
1148 mds
->locker
->remove_stale_leases(session
);
1149 finish_flush_session(session
, session
->get_push_seq());
1150 auto m
= make_message
<MClientSession
>(CEPH_SESSION_STALE
, session
->get_push_seq());
1151 mds
->send_message_client(m
, session
);
1153 to_evict
.push_back(session
);
// Pass 2: STALE sessions past the autoclose threshold are evicted.
1159 cutoff
= queue_max_age
+ mds
->mdsmap
->get_session_autoclose();
1161 // Collect a list of sessions exceeding the autoclose threshold
1162 const auto sessions_p2
= mds
->sessionmap
.by_state
.find(Session::STATE_STALE
);
1163 if (sessions_p2
!= mds
->sessionmap
.by_state
.end() && !sessions_p2
->second
->empty()) {
1164 for (auto session
: *(sessions_p2
->second
)) {
// Fixed: use ceph_assert (always compiled in, consistent with the rest of
// this file) instead of plain assert, which vanishes under NDEBUG.
1165 ceph_assert(session
->is_stale());
1166 auto last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_cap_renew
).count();
1167 if (last_cap_renew_span
< cutoff
) {
1168 dout(20) << "oldest stale session is " << session
->info
.inst
1169 << " and recently renewed caps " << last_cap_renew_span
<< "s ago" << dendl
;
1172 to_evict
.push_back(session
);
// Finally: evict everything collected, skipping sessions pinned by an
// in-flight import (their lifetime is owned by the import path).
1176 for (auto session
: to_evict
) {
1177 if (session
->is_importing()) {
1178 dout(10) << "skipping session " << session
->info
.inst
<< ", it's being imported" << dendl
;
1182 auto last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_cap_renew
).count();
1183 mds
->clog
->warn() << "evicting unresponsive client " << *session
1184 << ", after " << last_cap_renew_span
<< " seconds";
1185 dout(10) << "autoclosing stale session " << session
->info
.inst
1186 << " last renewed caps " << last_cap_renew_span
<< "s ago" << dendl
;
1188 if (g_conf()->mds_session_blocklist_on_timeout
) {
1189 CachedStackStringStream css
;
1190 mds
->evict_client(session
->get_client().v
, false, true, *css
, nullptr);
1192 kill_session(session
, NULL
);
// Evict clients that have ignored cap revokes for longer than
// cap_revoke_eviction_timeout. A timeout of 0 disables the feature.
1197 void Server::evict_cap_revoke_non_responders() {
1198 if (!cap_revoke_eviction_timeout
) {
1202 auto&& to_evict
= mds
->locker
->get_late_revoking_clients(cap_revoke_eviction_timeout
);
1204 for (auto const &client
: to_evict
) {
1205 mds
->clog
->warn() << "client id " << client
<< " has not responded to"
1206 << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
1207 << " seconds, evicting";
1208 dout(1) << __func__
<< ": evicting cap revoke non-responder client id "
1211 CachedStackStringStream css
;
1212 bool evicted
= mds
->evict_client(client
.v
, false,
1213 g_conf()->mds_session_blocklist_on_evict
,
// Count successful evictions in the perf counters (if registered).
1215 if (evicted
&& logger
) {
1216 logger
->inc(l_mdss_cap_revoke_eviction
);
// React to runtime config changes: re-read each mds_* option this Server
// caches locally. Only options present in `changed` are refreshed.
1221 void Server::handle_conf_change(const std::set
<std::string
>& changed
) {
1222 if (changed
.count("mds_forward_all_requests_to_auth")){
1223 forward_all_requests_to_auth
= g_conf().get_val
<bool>("mds_forward_all_requests_to_auth");
1225 if (changed
.count("mds_cap_revoke_eviction_timeout")) {
1226 cap_revoke_eviction_timeout
= g_conf().get_val
<double>("mds_cap_revoke_eviction_timeout");
1227 dout(20) << __func__
<< " cap revoke eviction timeout changed to "
1228 << cap_revoke_eviction_timeout
<< dendl
;
// Changing the decay rate requires rebuilding the DecayCounter.
1230 if (changed
.count("mds_recall_max_decay_rate")) {
1231 recall_throttle
= DecayCounter(g_conf().get_val
<double>("mds_recall_max_decay_rate"));
1233 if (changed
.count("mds_max_snaps_per_dir")) {
1234 max_snaps_per_dir
= g_conf().get_val
<uint64_t>("mds_max_snaps_per_dir");
1235 dout(20) << __func__
<< " max snapshots per directory changed to "
1236 << max_snaps_per_dir
<< dendl
;
1238 if (changed
.count("mds_client_delegate_inos_pct")) {
1239 delegate_inos_pct
= g_conf().get_val
<uint64_t>("mds_client_delegate_inos_pct");
1241 if (changed
.count("mds_max_caps_per_client")) {
1242 max_caps_per_client
= g_conf().get_val
<uint64_t>("mds_max_caps_per_client");
1244 if (changed
.count("mds_session_cap_acquisition_throttle")) {
1245 cap_acquisition_throttle
= g_conf().get_val
<uint64_t>("mds_session_cap_acquisition_throttle");
1247 if (changed
.count("mds_session_max_caps_throttle_ratio")) {
1248 max_caps_throttle_ratio
= g_conf().get_val
<double>("mds_session_max_caps_throttle_ratio");
1250 if (changed
.count("mds_cap_acquisition_throttle_retry_request_timeout")) {
1251 caps_throttle_retry_request_timeout
= g_conf().get_val
<double>("mds_cap_acquisition_throttle_retry_request_timeout");
1253 if (changed
.count("mds_alternate_name_max")) {
1254 alternate_name_max
= g_conf().get_val
<Option::size_t>("mds_alternate_name_max");
1259 * XXX bump in the interface here, not using an MDSContext here
1260 * because all the callers right now happen to use a SaferCond
// Forcibly terminate a session. If the session is live (opening/open/stale)
// and not pinned by an import, journal it into KILLING; otherwise arrange for
// on_safe to fire appropriately. Caller must hold mds_lock.
1262 void Server::kill_session(Session
*session
, Context
*on_safe
)
1264 ceph_assert(ceph_mutex_is_locked_by_me(mds
->mds_lock
));
1266 if ((session
->is_opening() ||
1267 session
->is_open() ||
1268 session
->is_stale()) &&
1269 !session
->is_importing()) {
1270 dout(10) << "kill_session " << session
<< dendl
;
1271 journal_close_session(session
, Session::STATE_KILLING
, on_safe
);
1273 dout(10) << "kill_session importing or already closing/killing " << session
<< dendl
;
// Already closing/killing: piggyback on_safe on the in-flight log flush.
1274 if (session
->is_closing() ||
1275 session
->is_killing()) {
1277 mdlog
->wait_for_safe(new MDSInternalContextWrapper(mds
, on_safe
));
// Otherwise nothing to journal; complete immediately.
1279 ceph_assert(session
->is_closed() ||
1280 session
->is_importing());
1282 on_safe
->complete(0);
// Kill every client session whose address appears in the OSDMap blocklist.
// Returns the number of sessions killed. Addr types are normalized because
// blocklist entries are TYPE_ANY on nautilus+ but TYPE_LEGACY before.
1287 size_t Server::apply_blocklist(const std::set
<entity_addr_t
> &blocklist
)
1289 bool prenautilus
= mds
->objecter
->with_osdmap(
1290 [&](const OSDMap
& o
) {
1291 return o
.require_osd_release
< ceph_release_t::nautilus
;
1294 std::vector
<Session
*> victims
;
1295 const auto& sessions
= mds
->sessionmap
.get_sessions();
1296 for (const auto& p
: sessions
) {
1297 if (!p
.first
.is_client()) {
1298 // Do not apply OSDMap blocklist to MDS daemons, we find out
1299 // about their death via MDSMap.
1303 Session
*s
= p
.second
;
1304 auto inst_addr
= s
->info
.inst
.addr
;
1305 // blocklist entries are always TYPE_ANY for nautilus+
1306 inst_addr
.set_type(entity_addr_t::TYPE_ANY
);
1307 if (blocklist
.count(inst_addr
)) {
1308 victims
.push_back(s
);
1312 // ...except pre-nautilus, they were TYPE_LEGACY
1313 inst_addr
.set_type(entity_addr_t::TYPE_LEGACY
);
1314 if (blocklist
.count(inst_addr
)) {
1315 victims
.push_back(s
);
// Collect first, then kill: kill_session mutates the session map.
1320 for (const auto& s
: victims
) {
1321 kill_session(s
, nullptr);
1324 dout(10) << "apply_blocklist: killed " << victims
.size() << dendl
;
1326 return victims
.size();
// Journal a session close/kill: move the session into `state` (CLOSING or
// KILLING), release its preallocated inode ranges back to the inotable, log
// an ESession, and kill its in-flight requests. on_safe fires once the
// ESession entry is durable.
// NOTE(review): elided extraction — the declaration of piv and some braces
// are missing between the numbered fragments.
1329 void Server::journal_close_session(Session
*session
, int state
, Context
*on_safe
)
1331 dout(10) << __func__
<< " : "
1332 << session
->info
.inst
1333 << " pending_prealloc_inos " << session
->pending_prealloc_inos
1334 << " free_prealloc_inos " << session
->free_prealloc_inos
1335 << " delegated_inos " << session
->delegated_inos
<< dendl
;
1337 uint64_t sseq
= mds
->sessionmap
.set_state(session
, state
);
1338 version_t pv
= mds
->sessionmap
.mark_projected(session
);
1341 // release alloc and pending-alloc inos for this session
1342 // and wipe out session state, in case the session close aborts for some reason
1343 interval_set
<inodeno_t
> inos_to_free
;
1344 inos_to_free
.insert(session
->pending_prealloc_inos
);
1345 inos_to_free
.insert(session
->free_prealloc_inos
);
1346 if (inos_to_free
.size()) {
1347 mds
->inotable
->project_release_ids(inos_to_free
);
1348 piv
= mds
->inotable
->get_projected_version();
// Log the close; C_MDS_session_finish applies the projected session/inotable
// changes when the entry is safe, then runs on_safe.
1352 auto le
= new ESession(session
->info
.inst
, false, pv
, inos_to_free
, piv
, session
->delegated_inos
);
1353 auto fin
= new C_MDS_session_finish(this, session
, sseq
, false, pv
, inos_to_free
, piv
,
1354 session
->delegated_inos
, mdlog
->get_current_segment(), on_safe
);
1355 mdlog
->start_submit_entry(le
, fin
);
1358 // clean up requests, too
1359 while(!session
->requests
.empty()) {
1360 auto mdr
= MDRequestRef(*session
->requests
.begin());
1361 mdcache
->request_kill(mdr
);
1364 finish_flush_session(session
, session
->get_push_seq());
// Entry point of the reconnect phase after an MDS restart/failover: collect
// all open client sessions into client_reconnect_gather and wait for each to
// send an MClientReconnect. reconnect_done_ is invoked when the gather ends.
1367 void Server::reconnect_clients(MDSContext
*reconnect_done_
)
1369 reconnect_done
= reconnect_done_
;
1371 auto now
= clock::now();
1372 set
<Session
*> sessions
;
1373 mds
->sessionmap
.get_client_session_set(sessions
);
1374 for (auto session
: sessions
) {
1375 if (session
->is_open()) {
1376 client_reconnect_gather
.insert(session
->get_client());
1377 session
->set_reconnecting(true);
// Reset the cap-renew clock so reconnecting clients aren't seen as stale.
1378 session
->last_cap_renew
= now
;
1382 if (client_reconnect_gather
.empty()) {
1383 dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl
;
1384 reconnect_gather_finish();
1388 // clients will get the mdsmap and discover we're reconnecting via the monitor.
1390 reconnect_start
= now
;
1391 dout(1) << "reconnect_clients -- " << client_reconnect_gather
.size() << " sessions" << dendl
;
1392 mds
->sessionmap
.dump();
// Handle an MClientReconnect during the reconnect phase: validate the session
// and MDS state, then absorb the client's snaprealm and cap state into the
// cache for rejoin. A reconnect may span several messages (has_more()); only
// the final one opens the session and removes the client from the gather set.
// NOTE(review): elided extraction — returns, braces and else arms between the
// numbered fragments are missing from this view.
1395 void Server::handle_client_reconnect(const cref_t
<MClientReconnect
> &m
)
1397 dout(7) << "handle_client_reconnect " << m
->get_source()
1398 << (m
->has_more() ? " (more)" : "") << dendl
;
1399 client_t from
= m
->get_source().num();
1400 Session
*session
= mds
->get_session(m
);
// No session at all: reject outright.
1402 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
1403 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
1404 reply
->metadata
["error_string"] = "sessionless";
1405 mds
->send_message(reply
, m
->get_connection());
1409 if (!session
->is_open()) {
1410 dout(0) << " ignoring msg from not-open session" << *m
<< dendl
;
1411 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_CLOSE
);
1412 mds
->send_message(reply
, m
->get_connection());
1416 bool reconnect_all_deny
= g_conf().get_val
<bool>("mds_deny_all_reconnect");
// mdsmap delivery race: we want RECONNECT but aren't there yet — requeue.
1418 if (!mds
->is_reconnect() && mds
->get_want_state() == CEPH_MDS_STATE_RECONNECT
) {
1419 dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl
;
1420 mds
->wait_for_reconnect(new C_MDS_RetryMessage(mds
, m
));
1424 auto delay
= std::chrono::duration
<double>(clock::now() - reconnect_start
).count();
1425 dout(10) << " reconnect_start " << reconnect_start
<< " delay " << delay
<< dendl
;
// Deny if configured to, or if the reconnect window has already closed.
1428 if (reconnect_all_deny
|| !mds
->is_reconnect() || mds
->get_want_state() != CEPH_MDS_STATE_RECONNECT
|| reconnect_evicting
) {
1429 // XXX maybe in the future we can do better than this?
1430 if (reconnect_all_deny
) {
1431 dout(1) << "mds_deny_all_reconnect was set to speed up reboot phase, ignoring reconnect, sending close" << dendl
;
1433 dout(1) << "no longer in reconnect state, ignoring reconnect, sending close" << dendl
;
1435 mds
->clog
->info() << "denied reconnect attempt (mds is "
1436 << ceph_mds_state_name(mds
->get_state())
1437 << ") from " << m
->get_source_inst()
1438 << " after " << delay
<< " (allowed interval " << g_conf()->mds_reconnect_timeout
<< ")";
// Additional per-session sanity checks that can force a deny.
1441 std::string error_str
;
1442 if (!session
->is_open()) {
1443 error_str
= "session is closed";
1444 } else if (mdcache
->is_readonly()) {
1445 error_str
= "mds is readonly";
// Older clients may not report features; infer them from the connection.
1447 if (session
->info
.client_metadata
.features
.empty())
1448 infer_supported_features(session
, session
->info
.client_metadata
);
1450 feature_bitset_t missing_features
= required_client_features
;
1451 missing_features
-= session
->info
.client_metadata
.features
;
1452 if (!missing_features
.empty()) {
1453 CachedStackStringStream css
;
1454 *css
<< "missing required features '" << missing_features
<< "'";
1455 error_str
= css
->strv();
1459 if (!error_str
.empty()) {
1461 dout(1) << " " << error_str
<< ", ignoring reconnect, sending close" << dendl
;
1462 mds
->clog
->info() << "denied reconnect attempt from "
1463 << m
->get_source_inst() << " (" << error_str
<< ")";
// Denied path: send CLOSE and remember the denial for reconnect_tick.
1468 auto r
= make_message
<MClientSession
>(CEPH_SESSION_CLOSE
);
1469 mds
->send_message_client(r
, session
);
1470 if (session
->is_open()) {
1471 client_reconnect_denied
.insert(session
->get_client());
// Accepted: on the final message, confirm the session with OPEN.
1476 if (!m
->has_more()) {
1477 metrics_handler
->add_session(session
);
1478 // notify client of success with an OPEN
1479 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_OPEN
);
1480 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
))
1481 reply
->supported_features
= supported_features
;
1482 mds
->send_message_client(reply
, session
);
1483 mds
->clog
->debug() << "reconnect by " << session
->info
.inst
<< " after " << delay
;
1486 session
->last_cap_renew
= clock::now();
// Record the snaprealms the client knows about, for rejoin reconciliation.
1489 for (const auto &r
: m
->realms
) {
1490 CInode
*in
= mdcache
->get_inode(inodeno_t(r
.realm
.ino
));
1491 if (in
&& in
->state_test(CInode::STATE_PURGING
))
1494 if (in
->snaprealm
) {
1495 dout(15) << "open snaprealm (w inode) on " << *in
<< dendl
;
1497 // this can happen if we are non-auth or we rollback snaprealm
1498 dout(15) << "open snaprealm (null snaprealm) on " << *in
<< dendl
;
1500 mdcache
->add_reconnected_snaprealm(from
, inodeno_t(r
.realm
.ino
), snapid_t(r
.realm
.seq
));
1502 dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r
.realm
.ino
)
1503 << " seq " << r
.realm
.seq
<< dendl
;
1504 mdcache
->add_reconnected_snaprealm(from
, inodeno_t(r
.realm
.ino
), snapid_t(r
.realm
.seq
));
// Absorb the client's cap table: auth inodes are reconnected directly,
// non-auth caps are exported to the authority, unknown inos are deferred.
1509 for (const auto &p
: m
->caps
) {
1510 // make sure our last_cap_id is MAX over all issued caps
1511 if (p
.second
.capinfo
.cap_id
> mdcache
->last_cap_id
)
1512 mdcache
->last_cap_id
= p
.second
.capinfo
.cap_id
;
1514 CInode
*in
= mdcache
->get_inode(p
.first
);
1515 if (in
&& in
->state_test(CInode::STATE_PURGING
))
1517 if (in
&& in
->is_auth()) {
1518 // we recovered it, and it's ours. take note.
1519 dout(15) << "open cap realm " << inodeno_t(p
.second
.capinfo
.snaprealm
)
1520 << " on " << *in
<< dendl
;
1521 in
->reconnect_cap(from
, p
.second
, session
);
1522 mdcache
->add_reconnected_cap(from
, p
.first
, p
.second
);
1523 recover_filelocks(in
, p
.second
.flockbl
, m
->get_orig_source().num());
1527 if (in
&& !in
->is_auth()) {
1529 dout(10) << "non-auth " << *in
<< ", will pass off to authority" << dendl
;
1530 // add to cap export list.
1531 mdcache
->rejoin_export_caps(p
.first
, from
, p
.second
,
1532 in
->authority().first
, true);
1534 // don't know if the inode is mine
1535 dout(10) << "missing ino " << p
.first
<< ", will load later" << dendl
;
1536 mdcache
->rejoin_recovered_caps(p
.first
, from
, p
.second
, MDS_RANK_NONE
);
1540 reconnect_last_seen
= clock::now();
1542 if (!m
->has_more()) {
1543 mdcache
->rejoin_recovered_client(session
->get_client(), session
->info
.inst
);
1545 // remove from gather set
1546 client_reconnect_gather
.erase(from
);
1547 session
->set_reconnecting(false);
1548 if (client_reconnect_gather
.empty())
1549 reconnect_gather_finish();
// For clients that did not report a feature bitset, infer their CephFS
// feature level (jewel/kraken/luminous) from version metadata and connection
// feature bits, then synthesize client_metadata.features from it.
// NOTE(review): elided extraction — the declaration of `supported`
// (presumably initialized to -1, given the check below) is missing here.
1553 void Server::infer_supported_features(Session
*session
, client_metadata_t
& client_metadata
)
1556 auto it
= client_metadata
.find("ceph_version");
1557 if (it
!= client_metadata
.end()) {
1558 // user space client
1559 if (it
->second
.compare(0, 16, "ceph version 12.") == 0)
1560 supported
= CEPHFS_FEATURE_LUMINOUS
;
1561 else if (session
->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR
))
1562 supported
= CEPHFS_FEATURE_KRAKEN
;
// Kernel clients report kernel_version instead of ceph_version.
1564 it
= client_metadata
.find("kernel_version");
1565 if (it
!= client_metadata
.end()) {
1567 if (session
->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING
))
1568 supported
= CEPHFS_FEATURE_LUMINOUS
;
1571 if (supported
== -1 &&
1572 session
->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2
))
1573 supported
= CEPHFS_FEATURE_JEWEL
;
// Features are cumulative: set all bits up to and including `supported`.
1575 if (supported
>= 0) {
1576 unsigned long value
= (1UL << (supported
+ 1)) - 1;
1577 client_metadata
.features
= feature_bitset_t(value
);
1578 dout(10) << __func__
<< " got '" << client_metadata
.features
<< "'" << dendl
;
// Refresh the required client feature set from the MDSMap, and (once past
// RECONNECT) evict any already-connected client missing a required feature,
// unless it is already blocklisted.
1582 void Server::update_required_client_features()
1584 required_client_features
= mds
->mdsmap
->get_required_client_features();
1585 dout(7) << "required_client_features: " << required_client_features
<< dendl
;
1587 if (mds
->get_state() >= MDSMap::STATE_RECONNECT
) {
1588 set
<Session
*> sessions
;
1589 mds
->sessionmap
.get_client_session_set(sessions
);
1590 for (auto session
: sessions
) {
1591 feature_bitset_t missing_features
= required_client_features
;
1592 missing_features
-= session
->info
.client_metadata
.features
;
1593 if (!missing_features
.empty()) {
// Skip eviction work for clients the OSDMap already blocklists;
// presumably the elided lines gate the evict on !blocklisted — TODO confirm.
1594 bool blocklisted
= mds
->objecter
->with_osdmap(
1595 [session
](const OSDMap
&osd_map
) -> bool {
1596 return osd_map
.is_blocklisted(session
->info
.inst
.addr
);
1601 mds
->clog
->warn() << "evicting session " << *session
<< ", missing required features '"
1602 << missing_features
<< "'";
1603 CachedStackStringStream css
;
1604 mds
->evict_client(session
->get_client().v
, false,
1605 g_conf()->mds_session_blocklist_on_evict
, *css
);
// Called once every expected client has reconnected (or been given up on).
// Defers the state transition until the snaptable cache is synced, since
// snaprealms are used heavily during rejoin; otherwise completes
// reconnect_done immediately.
1611 void Server::reconnect_gather_finish()
1613 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects
<< " clients" << dendl
;
1614 ceph_assert(reconnect_done
);
1616 if (!mds
->snapclient
->is_synced()) {
1617 // make sure snaptable cache is populated. snaprealms will be
1618 // extensively used in rejoin stage.
1619 dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl
;
1620 mds
->snapclient
->wait_for_sync(reconnect_done
);
1622 reconnect_done
->complete(0);
1624 reconnect_done
= NULL
;
// Periodic timer during the reconnect phase: extends the window while clients
// are still trickling in, and once the timeout expires (or all reconnects are
// denied via mds_deny_all_reconnect) gives up on stragglers — keeping only
// sessions with a "timeout" metadata entry for later reclaim, evicting the
// rest. Finishes the gather once any blocklist evictions complete.
// NOTE(review): elided extraction — returns/braces between the numbered
// fragments are missing from this view.
1627 void Server::reconnect_tick()
1629 bool reject_all_reconnect
= false;
1630 if (reconnect_evicting
) {
1631 dout(7) << "reconnect_tick: waiting for evictions" << dendl
;
1636 * Set mds_deny_all_reconnect to reject all the reconnect req ,
1637 * then load less meta information in rejoin phase. This will shorten reboot time.
1638 * Moreover, loading less meta increases the chance standby with less memory can failover.
1640 * Why not shorten reconnect period?
1641 * Clients may send unsafe or retry requests, which haven't been
1642 * completed before old mds stop, to new mds. These requests may
1643 * need to be processed during new mds's clientreplay phase,
1644 * see: #https://github.com/ceph/ceph/pull/29059.
1646 bool reconnect_all_deny
= g_conf().get_val
<bool>("mds_deny_all_reconnect");
1647 if (client_reconnect_gather
.empty())
// If every remaining client has already been denied, stop waiting.
1650 if (reconnect_all_deny
&& (client_reconnect_gather
== client_reconnect_denied
))
1651 reject_all_reconnect
= true;
1653 auto now
= clock::now();
1654 auto elapse1
= std::chrono::duration
<double>(now
- reconnect_start
).count();
1655 if (elapse1
< g_conf()->mds_reconnect_timeout
&& !reject_all_reconnect
)
1658 vector
<Session
*> remaining_sessions
;
1659 remaining_sessions
.reserve(client_reconnect_gather
.size());
1660 for (auto c
: client_reconnect_gather
) {
1661 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(c
.v
));
1662 ceph_assert(session
);
1663 remaining_sessions
.push_back(session
);
1664 // client re-sends cap flush messages before the reconnect message
1665 if (session
->last_seen
> reconnect_last_seen
)
1666 reconnect_last_seen
= session
->last_seen
;
// If we heard from a client recently, extend the window rather than evict.
1669 auto elapse2
= std::chrono::duration
<double>(now
- reconnect_last_seen
).count();
1670 if (elapse2
< g_conf()->mds_reconnect_timeout
/ 2 && !reject_all_reconnect
) {
1671 dout(7) << "reconnect_tick: last seen " << elapse2
1672 << " seconds ago, extending reconnect interval" << dendl
;
1676 dout(7) << "reconnect timed out, " << remaining_sessions
.size()
1677 << " clients have not reconnected in time" << dendl
;
1679 // If we're doing blocklist evictions, use this to wait for them before
1680 // proceeding to reconnect_gather_finish
1681 MDSGatherBuilder
gather(g_ceph_context
);
1683 for (auto session
: remaining_sessions
) {
1684 // Keep sessions that have specified timeout. These sessions will prevent
1685 // mds from going to active. MDS goes to active after they all have been
1686 // killed or reclaimed.
1687 if (session
->info
.client_metadata
.find("timeout") !=
1688 session
->info
.client_metadata
.end()) {
1689 dout(1) << "reconnect keeps " << session
->info
.inst
1690 << ", need to be reclaimed" << dendl
;
1691 client_reclaim_gather
.insert(session
->get_client());
1695 dout(1) << "reconnect gives up on " << session
->info
.inst
<< dendl
;
1697 mds
->clog
->warn() << "evicting unresponsive client " << *session
1698 << ", after waiting " << elapse1
1699 << " seconds during MDS startup";
1701 // make _session_logged() purge orphan objects of lost async/unsafe requests
1702 session
->delegated_inos
.swap(session
->free_prealloc_inos
);
1704 if (g_conf()->mds_session_blocklist_on_timeout
) {
1705 CachedStackStringStream css
;
1706 mds
->evict_client(session
->get_client().v
, false, true, *css
,
1709 kill_session(session
, NULL
);
1712 failed_reconnects
++;
1714 client_reconnect_gather
.clear();
1715 client_reconnect_denied
.clear();
// Either finish now, or after pending blocklist evictions complete.
1717 if (gather
.has_subs()) {
1718 dout(1) << "reconnect will complete once clients are evicted" << dendl
;
1719 gather
.set_finisher(new MDSInternalContextWrapper(mds
, new LambdaContext(
1720 [this](int r
){reconnect_gather_finish();})));
1722 reconnect_evicting
= true;
1724 reconnect_gather_finish();
// Rebuild an inode's advisory lock state from the encoded lock blob a client
// sent in its reconnect message: first the fcntl (POSIX) locks, then the
// flock locks, each prefixed by a count. No-op if the blob is empty.
// NOTE(review): elided extraction — the declarations of numlocks/lock and the
// per-iteration decode of each lock are missing between the fragments.
1728 void Server::recover_filelocks(CInode
*in
, bufferlist locks
, int64_t client
)
1730 if (!locks
.length()) return;
1733 auto p
= locks
.cbegin();
1734 decode(numlocks
, p
);
1735 for (int i
= 0; i
< numlocks
; ++i
) {
// Stamp the owning client id onto each recovered lock.
1737 lock
.client
= client
;
1738 in
->get_fcntl_lock_state()->held_locks
.insert(pair
<uint64_t, ceph_filelock
>(lock
.start
, lock
));
1739 ++in
->get_fcntl_lock_state()->client_held_lock_counts
[client
];
// Second section of the blob: flock-style locks.
1741 decode(numlocks
, p
);
1742 for (int i
= 0; i
< numlocks
; ++i
) {
1744 lock
.client
= client
;
1745 in
->get_flock_lock_state()->held_locks
.insert(pair
<uint64_t, ceph_filelock
> (lock
.start
, lock
));
1746 ++in
->get_flock_lock_state()->client_held_lock_counts
[client
];
1751 * Call this when the MDCache is oversized, to send requests to the clients
1752 * to trim some caps, and consequently unpin some inodes in the MDCache so
1753 * that it can trim too.
1755 std::pair
<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder
* gather
, RecallFlags flags
)
1757 const auto now
= clock::now();
1758 const bool steady
= !!(flags
&RecallFlags::STEADY
);
1759 const bool enforce_max
= !!(flags
&RecallFlags::ENFORCE_MAX
);
1760 const bool enforce_liveness
= !!(flags
&RecallFlags::ENFORCE_LIVENESS
);
1761 const bool trim
= !!(flags
&RecallFlags::TRIM
);
1763 const auto max_caps_per_client
= g_conf().get_val
<uint64_t>("mds_max_caps_per_client");
1764 const auto min_caps_per_client
= g_conf().get_val
<uint64_t>("mds_min_caps_per_client");
1765 const auto recall_global_max_decay_threshold
= g_conf().get_val
<Option::size_t>("mds_recall_global_max_decay_threshold");
1766 const auto recall_max_caps
= g_conf().get_val
<Option::size_t>("mds_recall_max_caps");
1767 const auto recall_max_decay_threshold
= g_conf().get_val
<Option::size_t>("mds_recall_max_decay_threshold");
1768 const auto cache_liveness_magnitude
= g_conf().get_val
<Option::size_t>("mds_session_cache_liveness_magnitude");
1770 dout(7) << __func__
<< ":"
1771 << " min=" << min_caps_per_client
1772 << " max=" << max_caps_per_client
1773 << " total=" << Capability::count()
1774 << " flags=" << flags
1777 /* trim caps of sessions with the most caps first */
1778 std::multimap
<uint64_t, Session
*> caps_session
;
1779 auto f
= [&caps_session
, enforce_max
, enforce_liveness
, trim
, max_caps_per_client
, cache_liveness_magnitude
](auto& s
) {
1780 auto num_caps
= s
->caps
.size();
1781 auto cache_liveness
= s
->get_session_cache_liveness();
1782 if (trim
|| (enforce_max
&& num_caps
> max_caps_per_client
) || (enforce_liveness
&& cache_liveness
< (num_caps
>>cache_liveness_magnitude
))) {
1783 caps_session
.emplace(std::piecewise_construct
, std::forward_as_tuple(num_caps
), std::forward_as_tuple(s
));
1786 mds
->sessionmap
.get_client_sessions(std::move(f
));
1788 std::pair
<bool, uint64_t> result
= {false, 0};
1789 auto& [throttled
, caps_recalled
] = result
;
1790 last_recall_state
= now
;
1791 for (const auto& [num_caps
, session
] : boost::adaptors::reverse(caps_session
)) {
1792 if (!session
->is_open() ||
1793 !session
->get_connection() ||
1794 !session
->info
.inst
.name
.is_client())
1797 dout(10) << __func__
<< ":"
1798 << " session " << session
->info
.inst
1799 << " caps " << num_caps
1800 << ", leases " << session
->leases
.size()
1804 if (num_caps
< recall_max_caps
|| (num_caps
-recall_max_caps
) < min_caps_per_client
) {
1805 newlim
= min_caps_per_client
;
1807 newlim
= num_caps
-recall_max_caps
;
1809 if (num_caps
> newlim
) {
1810 /* now limit the number of caps we recall at a time to prevent overloading ourselves */
1811 uint64_t recall
= std::min
<uint64_t>(recall_max_caps
, num_caps
-newlim
);
1812 newlim
= num_caps
-recall
;
1813 const uint64_t session_recall_throttle
= session
->get_recall_caps_throttle();
1814 const uint64_t session_recall_throttle2o
= session
->get_recall_caps_throttle2o();
1815 const uint64_t global_recall_throttle
= recall_throttle
.get();
1816 if (session_recall_throttle
+recall
> recall_max_decay_threshold
) {
1817 dout(15) << " session recall threshold (" << recall_max_decay_threshold
<< ") hit at " << session_recall_throttle
<< "; skipping!" << dendl
;
1820 } else if (session_recall_throttle2o
+recall
> recall_max_caps
*2) {
1821 dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps
<< ") hit at " << session_recall_throttle2o
<< "; skipping!" << dendl
;
1824 } else if (global_recall_throttle
+recall
> recall_global_max_decay_threshold
) {
1825 dout(15) << " global recall threshold (" << recall_global_max_decay_threshold
<< ") hit at " << global_recall_throttle
<< "; skipping!" << dendl
;
1830 // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
1832 const auto session_recall
= session
->get_recall_caps();
1833 const auto session_release
= session
->get_release_caps();
1834 if (2*session_release
< session_recall
&& 2*session_recall
> recall_max_decay_threshold
) {
1835 /* The session has been unable to keep up with the number of caps
1836 * recalled (by half); additionally, to prevent marking sessions
1837 * we've just begun to recall from, the session_recall counter
1838 * (decayed count of caps recently recalled) is **greater** than the
1839 * session threshold for the session's cap recall throttle.
1841 dout(15) << " 2*session_release < session_recall"
1842 " (2*" << session_release
<< " < " << session_recall
<< ") &&"
1843 " 2*session_recall < recall_max_decay_threshold"
1844 " (2*" << session_recall
<< " > " << recall_max_decay_threshold
<< ")"
1845 " Skipping because we are unlikely to get more released." << dendl
;
1847 } else if (recall
< recall_max_caps
&& 2*recall
< session_recall
) {
1848 /* The number of caps recalled is less than the number we *could*
1849 * recall (so there isn't much left to recall?) and the number of
1850 * caps is less than the current recall_caps counter (decayed count
1851 * of caps recently recalled).
1853 dout(15) << " 2*recall < session_recall "
1854 " (2*" << recall
<< " < " << session_recall
<< ") &&"
1855 " recall < recall_max_caps (" << recall
<< " < " << recall_max_caps
<< ");"
1856 " Skipping because we are unlikely to get more released." << dendl
;
1861 dout(7) << " recalling " << recall
<< " caps; session_recall_throttle = " << session_recall_throttle
<< "; global_recall_throttle = " << global_recall_throttle
<< dendl
;
1863 auto m
= make_message
<MClientSession
>(CEPH_SESSION_RECALL_STATE
);
1864 m
->head
.max_caps
= newlim
;
1865 mds
->send_message_client(m
, session
);
1867 flush_session(session
, *gather
);
1869 caps_recalled
+= session
->notify_recall_sent(newlim
);
1870 recall_throttle
.hit(recall
);
1874 dout(7) << "recalled" << (throttled
? " (throttled)" : "") << " " << caps_recalled
<< " client caps." << dendl
;
1879 void Server::force_clients_readonly()
1881 dout(10) << "force_clients_readonly" << dendl
;
1882 set
<Session
*> sessions
;
1883 mds
->sessionmap
.get_client_session_set(sessions
);
1884 for (set
<Session
*>::const_iterator p
= sessions
.begin();
1885 p
!= sessions
.end();
1887 Session
*session
= *p
;
1888 if (!session
->info
.inst
.name
.is_client() ||
1889 !(session
->is_open() || session
->is_stale()))
1891 mds
->send_message_client(make_message
<MClientSession
>(CEPH_SESSION_FORCE_RO
), session
);
1896 * some generic stuff for finishing off requests
/*
 * Journal the given event for this request and arrange the reply.
 * Records the trace targets (inode/dentry) on the mdr, attempts an early
 * (unsafe) reply before the journal commit, then submits the log entry.
 * For replayed ops, advances the replay queue instead of flushing the log.
 */
void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
{
  dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
  ceph_assert(!mdr->has_completed);

  // note trace items for eventual reply.
  mdr->tracei = in;
  if (in)
    mdr->pin(in);

  mdr->tracedn = dn;
  if (dn)
    mdr->pin(dn);

  // try to reply before the journal commit lands (unsafe reply)
  early_reply(mdr, in, dn);

  mdr->committing = true;
  submit_mdlog_entry(le, fin, mdr, __func__);

  if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
    // during client replay, keep the replay queue moving rather than
    // waiting for this entry to be durable
    if (mds->queue_one_replay()) {
      dout(10) << " queued next replay op" << dendl;
    } else {
      dout(10) << " journaled last replay op" << dendl;
    }
  } else if (mdr->did_early_reply)
    // client already has the unsafe reply; rdlocks are no longer needed
    mds->locker->drop_rdlocks_for_early_reply(mdr.get());
  else
    mdlog->flush();
}
/*
 * Submit a log event to the MDS journal, tagging the request (if any)
 * with a "submit entry: <event>" marker for op tracking.
 */
void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
				std::string_view event)
{
  if (mdr) {
    string event_str("submit entry: ");
    event_str += event;
    mdr->mark_event(event_str);
  }
  mdlog->submit_entry(le, fin);
}
1941 * send response built from mdr contents and error code; clean up mdr
/*
 * Send a response built from mdr contents and error code r; clean up mdr.
 * Dispatches to the right completion path depending on what kind of
 * request this is: batched getattr/lookup head, plain client request,
 * or internal op.
 */
void Server::respond_to_request(MDRequestRef& mdr, int r)
{
  if (mdr->client_request) {
    if (mdr->is_batch_head()) {
      // batch head answers all the batched followers in one go
      dout(20) << __func__ << " batch head " << *mdr << dendl;
      mdr->release_batch_op()->respond(r);
    } else {
      reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
    }
  } else if (mdr->internal_op > -1) {
    dout(10) << "respond_to_request on internal request " << mdr << dendl;
    if (!mdr->internal_op_finish)
      ceph_abort_msg("trying to respond to internal op without finisher");
    mdr->internal_op_finish->complete(r);
    mdcache->request_finish(mdr);
  }
}
1961 // statistics mds req op number and latency
1962 void Server::perf_gather_op_latency(const cref_t
<MClientRequest
> &req
, utime_t lat
)
1964 int code
= l_mdss_first
;
1965 switch(req
->get_op()) {
1966 case CEPH_MDS_OP_LOOKUPHASH
:
1967 code
= l_mdss_req_lookuphash_latency
;
1969 case CEPH_MDS_OP_LOOKUPINO
:
1970 code
= l_mdss_req_lookupino_latency
;
1972 case CEPH_MDS_OP_LOOKUPPARENT
:
1973 code
= l_mdss_req_lookupparent_latency
;
1975 case CEPH_MDS_OP_LOOKUPNAME
:
1976 code
= l_mdss_req_lookupname_latency
;
1978 case CEPH_MDS_OP_LOOKUP
:
1979 code
= l_mdss_req_lookup_latency
;
1981 case CEPH_MDS_OP_LOOKUPSNAP
:
1982 code
= l_mdss_req_lookupsnap_latency
;
1984 case CEPH_MDS_OP_GETATTR
:
1985 code
= l_mdss_req_getattr_latency
;
1987 case CEPH_MDS_OP_SETATTR
:
1988 code
= l_mdss_req_setattr_latency
;
1990 case CEPH_MDS_OP_SETLAYOUT
:
1991 code
= l_mdss_req_setlayout_latency
;
1993 case CEPH_MDS_OP_SETDIRLAYOUT
:
1994 code
= l_mdss_req_setdirlayout_latency
;
1996 case CEPH_MDS_OP_SETXATTR
:
1997 code
= l_mdss_req_setxattr_latency
;
1999 case CEPH_MDS_OP_RMXATTR
:
2000 code
= l_mdss_req_rmxattr_latency
;
2002 case CEPH_MDS_OP_READDIR
:
2003 code
= l_mdss_req_readdir_latency
;
2005 case CEPH_MDS_OP_SETFILELOCK
:
2006 code
= l_mdss_req_setfilelock_latency
;
2008 case CEPH_MDS_OP_GETFILELOCK
:
2009 code
= l_mdss_req_getfilelock_latency
;
2011 case CEPH_MDS_OP_CREATE
:
2012 code
= l_mdss_req_create_latency
;
2014 case CEPH_MDS_OP_OPEN
:
2015 code
= l_mdss_req_open_latency
;
2017 case CEPH_MDS_OP_MKNOD
:
2018 code
= l_mdss_req_mknod_latency
;
2020 case CEPH_MDS_OP_LINK
:
2021 code
= l_mdss_req_link_latency
;
2023 case CEPH_MDS_OP_UNLINK
:
2024 code
= l_mdss_req_unlink_latency
;
2026 case CEPH_MDS_OP_RMDIR
:
2027 code
= l_mdss_req_rmdir_latency
;
2029 case CEPH_MDS_OP_RENAME
:
2030 code
= l_mdss_req_rename_latency
;
2032 case CEPH_MDS_OP_MKDIR
:
2033 code
= l_mdss_req_mkdir_latency
;
2035 case CEPH_MDS_OP_SYMLINK
:
2036 code
= l_mdss_req_symlink_latency
;
2038 case CEPH_MDS_OP_LSSNAP
:
2039 code
= l_mdss_req_lssnap_latency
;
2041 case CEPH_MDS_OP_MKSNAP
:
2042 code
= l_mdss_req_mksnap_latency
;
2044 case CEPH_MDS_OP_RMSNAP
:
2045 code
= l_mdss_req_rmsnap_latency
;
2047 case CEPH_MDS_OP_RENAMESNAP
:
2048 code
= l_mdss_req_renamesnap_latency
;
2050 default: ceph_abort();
2052 logger
->tinc(code
, lat
);
/*
 * Attempt an "unsafe" reply to the client before the journal entry for
 * this request is durable.  Bails out whenever early reply is disallowed
 * (config off, per-request flag, journaled peers, allocated ino, mds-to-mds
 * request, replayed op).  On success marks the request's xlocks done,
 * sends the trace, and records reply latency stats.
 */
void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
{
  if (!g_conf()->mds_early_reply)
    return;

  if (mdr->no_early_reply) {
    dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
    return;
  }

  if (mdr->has_more() && mdr->more()->has_journaled_peers) {
    // peers have journaled state; an unsafe reply could expose it
    dout(10) << "early_reply - there are journaled peers, not allowed." << dendl;
    return;
  }

  if (mdr->alloc_ino) {
    dout(10) << "early_reply - allocated ino, not allowed" << dendl;
    return;
  }

  const cref_t<MClientRequest> &req = mdr->client_request;
  entity_inst_t client_inst = req->get_source_inst();
  if (client_inst.name.is_mds())
    return;

  if (req->is_replay()) {
    dout(10) << " no early reply on replay op" << dendl;
    return;
  }

  auto reply = make_message<MClientReply>(*req, 0);
  reply->set_unsafe();

  // mark xlocks "done", indicating that we are exposing uncommitted changes.
  //
  //_rename_finish() does not send dentry link/unlink message to replicas.
  // so do not set xlocks on dentries "done", the xlocks prevent dentries
  // that have projected linkages from getting new replica.
  mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);

  dout(10) << "early_reply " << reply->get_result()
	   << " (" << cpp_strerror(reply->get_result())
	   << ") " << *req << dendl;

  if (tracei || tracedn) {
    // the reply already carries the cap state for these; drop pending
    // releases so they are not re-applied
    if (tracei)
      mdr->cap_releases.erase(tracei->vino());
    if (tracedn)
      mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());

    set_trace_dist(reply, tracei, tracedn, mdr);
  }

  reply->set_extra_bl(mdr->reply_extra_bl);
  mds->send_message_client(reply, mdr->session);

  mdr->did_early_reply = true;

  // latency accounting for the early reply
  mds->logger->inc(l_mds_reply);
  utime_t lat = ceph_clock_now() - req->get_recv_stamp();
  mds->logger->tinc(l_mds_reply_latency, lat);
  if (client_inst.name.is_client()) {
    mds->sessionmap.hit_session(mdr->session);
  }
  perf_gather_op_latency(req, lat);
  dout(20) << "lat " << lat << dendl;

  mdr->mark_event("early_replied");
}
2128 * include a trace to tracei
/*
 * Send the final (safe) reply for a client request and clean up the mdr.
 * Records the completed request in the session (so retries can be
 * detected), issues the trace/leases unless an early reply already went
 * out, advances the replay queue when applicable, and finishes the
 * request in the cache.
 */
void Server::reply_client_request(MDRequestRef& mdr, const ref_t<MClientReply> &reply)
{
  ceph_assert(mdr.get());
  const cref_t<MClientRequest> &req = mdr->client_request;

  dout(7) << "reply_client_request " << reply->get_result()
	  << " (" << cpp_strerror(reply->get_result())
	  << ") " << *req << dendl;

  mdr->mark_event("replying");

  Session *session = mdr->session;

  // note successful request in session map?
  //
  // setfilelock requests are special, they only modify states in MDS memory.
  // The states get lost when MDS fails. If Client re-send a completed
  // setfilelock request, it means that client did not receive corresponding
  // setfilelock reply.  So MDS should re-execute the setfilelock request.
  if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
      reply->get_result() == 0 && session) {
    inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
    session->add_completed_request(mdr->reqid.tid, created);
    if (mdr->ls) {
      // mark the session dirty in this log segment so the completed-request
      // list gets persisted at segment expiry
      mdr->ls->touched_sessions.insert(session->info.inst.name);
    }
  }

  // give any preallocated inos to the session
  apply_allocated_inos(mdr, session);

  // get tracei/tracedn from mdr?
  CInode *tracei = mdr->tracei;
  CDentry *tracedn = mdr->tracedn;

  bool is_replay = mdr->client_request->is_replay();
  bool did_early_reply = mdr->did_early_reply;
  entity_inst_t client_inst = req->get_source_inst();

  if (!did_early_reply && !is_replay) {
    // stats were not recorded by early_reply; record them here
    mds->logger->inc(l_mds_reply);
    utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
    mds->logger->tinc(l_mds_reply_latency, lat);
    if (session && client_inst.name.is_client()) {
      mds->sessionmap.hit_session(session);
    }
    perf_gather_op_latency(req, lat);
    dout(20) << "lat " << lat << dendl;

    if (tracei)
      mdr->cap_releases.erase(tracei->vino());
    if (tracedn)
      mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
  }

  // drop non-rdlocks before replying, so that we can issue leases
  mdcache->request_drop_non_rdlocks(mdr);

  // reply at all?
  if (session && !client_inst.name.is_mds()) {
    if (!did_early_reply &&   // don't issue leases if we sent an earlier reply already
	(tracei || tracedn)) {
      if (is_replay) {
	if (tracei)
	  mdcache->try_reconnect_cap(tracei, session);
      } else {
	// include metadata in reply
	set_trace_dist(reply, tracei, tracedn, mdr);
      }
    }

    // We can set the extra bl unconditionally: if it's already been sent in the
    // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
    reply->set_extra_bl(mdr->reply_extra_bl);

    reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
    mds->send_message_client(reply, session);
  }

  if (req->is_queued_for_replay() &&
      (mdr->has_completed || reply->get_result() < 0)) {
    if (reply->get_result() < 0) {
      int r = reply->get_result();
      derr << "reply_client_request: failed to replay " << *req
	   << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
      mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
    }
    mds->queue_one_replay();
  }

  // clean up request
  mdcache->request_finish(mdr);

  // take a closer look at tracei, if it happens to be a remote link
  if (tracei &&
      tracedn &&
      tracedn->get_projected_linkage()->is_remote()) {
    mdcache->eval_remote(tracedn);
  }
}
2235 * pass inode OR dentry (not both, or we may get confused)
2237 * trace is in reverse order (i.e. root inode comes last)
/*
 * Encode the metadata "trace" (snap realm, parent dir, dentry, target
 * inode) into the reply.  Pass inode OR dentry (not both, or we may get
 * confused).  The encode order below is the client-visible wire format;
 * do not reorder.
 */
void Server::set_trace_dist(const ref_t<MClientReply> &reply,
			    CInode *in, CDentry *dn,
			    MDRequestRef& mdr)
{
  // skip doing this for debugging purposes?
  if (g_conf()->mds_inject_traceless_reply_probability &&
      mdr->ls && !mdr->o_trunc &&
      (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability * 10000.0)) {
    dout(5) << "deliberately skipping trace for " << *reply << dendl;
    return;
  }

  // inode, dentry, dir, ..., inode
  bufferlist bl;
  mds_rank_t whoami = mds->get_nodeid();
  Session *session = mdr->session;
  snapid_t snapid = mdr->snapid;
  utime_t now = ceph_clock_now();

  dout(20) << "set_trace_dist snapid " << snapid << dendl;

  // realm: only for head (non-snapshot) operations
  if (snapid == CEPH_NOSNAP) {
    SnapRealm *realm;
    if (in)
      realm = in->find_snaprealm();
    else
      realm = dn->get_dir()->get_inode()->find_snaprealm();
    reply->snapbl = realm->get_snap_trace();
    dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
  }

  // dir + dentry?
  if (dn) {
    reply->head.is_dentry = 1;
    CDir *dir = dn->get_dir();
    CInode *diri = dir->get_inode();

    diri->encode_inodestat(bl, session, NULL, snapid);
    dout(20) << "set_trace_dist added diri " << *diri << dendl;

#ifdef MDS_VERIFY_FRAGSTAT
    if (dir->is_complete())
      dir->verify_fragstat();
#endif
    DirStat ds;
    ds.frag = dir->get_frag();
    ds.auth = dir->get_dir_auth().first;
    if (dir->is_auth() && !forward_all_requests_to_auth)
      dir->get_dist_spec(ds.dist, whoami);

    dir->encode_dirstat(bl, session->info, ds);
    dout(20) << "set_trace_dist added dir " << *dir << dendl;

    encode(dn->get_name(), bl);

    // lease: a primary link gets the stronger primary-link lease
    int lease_mask = 0;
    CDentry::linkage_t *dnl = dn->get_linkage(mdr->get_client(), mdr);
    if (dnl->is_primary()) {
      ceph_assert(dnl->get_inode() == in);
      lease_mask = CEPH_LEASE_PRIMARY_LINK;
    } else {
      if (dnl->is_remote())
	ceph_assert(dnl->get_remote_ino() == in->ino());
      else
	ceph_assert(!in);
    }
    mds->locker->issue_client_lease(dn, mdr, lease_mask, now, bl);
    dout(20) << "set_trace_dist added dn " << snapid << " " << *dn << dendl;
  } else
    reply->head.is_dentry = 0;

  // inode
  if (in) {
    in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
    dout(20) << "set_trace_dist added in " << *in << dendl;
    reply->head.is_target = 1;
  } else
    reply->head.is_target = 0;

  reply->set_trace(bl);
}
/*
 * Entry point for a client request message.  Validates the session,
 * detects retried/replayed requests that already completed (answering
 * them directly or converting open/create retries into lookups), trims
 * the session's completed-request list, then registers an MDRequest and
 * dispatches it.
 */
void Server::handle_client_request(const cref_t<MClientRequest> &req)
{
  dout(4) << "handle_client_request " << *req << dendl;

  if (mds->logger)
    mds->logger->inc(l_mds_request);
  if (logger)
    logger->inc(l_mdss_handle_client_request);

  if (!mdcache->is_open()) {
    dout(5) << "waiting for root" << dendl;
    mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
    return;
  }

  bool sessionclosed_isok = replay_unsafe_with_closed_session;
  // active session?
  Session *session = 0;
  if (req->get_source().is_client()) {
    session = mds->get_session(req);
    if (!session) {
      dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
    } else if ((session->is_closed() && (!mds->is_clientreplay() || !sessionclosed_isok)) ||
	       session->is_closing() ||
	       session->is_killing()) {
      dout(5) << "session closed|closing|killing, dropping" << dendl;
      session = NULL;
    }
    if (!session) {
      // still keep the replay queue moving even though we drop the op
      if (req->is_queued_for_replay())
	mds->queue_one_replay();
      return;
    }
  }

  // old mdsmap?
  if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
    // send it?  hrm, this isn't ideal; they may get a lot of copies if
    // they have a high request rate.
  }

  // completed request?
  bool has_completed = false;
  if (req->is_replay() || req->get_retry_attempt()) {
    ceph_assert(session);
    inodeno_t created;
    if (session->have_completed_request(req->get_reqid().tid, &created)) {
      has_completed = true;
      if (!session->is_open())
	return;
      // Don't send traceless reply if the completed request has created
      // new inode. Treat the request as lookup request instead.
      if (req->is_replay() ||
	  ((created == inodeno_t() || !mds->is_clientreplay()) &&
	   req->get_op() != CEPH_MDS_OP_OPEN &&
	   req->get_op() != CEPH_MDS_OP_CREATE)) {
	dout(5) << "already completed " << req->get_reqid() << dendl;
	auto reply = make_message<MClientReply>(*req, 0);
	if (created != inodeno_t()) {
	  bufferlist extra;
	  encode(created, extra);
	  reply->set_extra_bl(extra);
	}
	mds->send_message_client(reply, session);

	if (req->is_queued_for_replay())
	  mds->queue_one_replay();

	return;
      }
      if (req->get_op() != CEPH_MDS_OP_OPEN &&
	  req->get_op() != CEPH_MDS_OP_CREATE) {
	dout(10) << " completed request which created new inode " << created
		 << ", convert it to lookup request" << dendl;
	req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
	req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
      }
    }
  }

  // trim completed_request list
  if (req->get_oldest_client_tid() > 0) {
    dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
    ceph_assert(session);
    if (session->trim_completed_requests(req->get_oldest_client_tid())) {
      // Sessions 'completed_requests' was dirtied, mark it to be
      // potentially flushed at segment expiry.
      mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);

      if (session->get_num_trim_requests_warnings() > 0 &&
	  session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests)
	session->reset_num_trim_requests_warnings();
    } else {
      // the client is not advancing oldest_client_tid; warn with an
      // exponentially growing threshold so the log is not spammed
      if (session->get_num_completed_requests() >=
	  (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
	session->inc_num_trim_requests_warnings();
	CachedStackStringStream css;
	*css << "client." << session->get_client() << " does not advance its oldest_client_tid ("
	     << req->get_oldest_client_tid() << "), "
	     << session->get_num_completed_requests()
	     << " completed requests recorded in session\n";
	mds->clog->warn() << css->strv();
	dout(20) << __func__ << " " << css->strv() << dendl;
      }
    }
  }

  // register + dispatch
  MDRequestRef mdr = mdcache->request_start(req);
  if (!mdr.get())
    return;

  if (session) {
    mdr->session = session;
    session->requests.push_back(&mdr->item_session_request);
  }

  if (has_completed)
    mdr->has_completed = true;

  // process embedded cap releases?
  //  (only if NOT replay!)
  if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
    client_t client = req->get_source().num();
    for (const auto &r : req->releases) {
      mds->locker->process_request_cap_release(mdr, client, r.item, r.dname);
    }
    req->releases.clear();
  }

  dispatch_client_request(mdr);
}
2456 void Server::handle_osd_map()
2458 /* Note that we check the OSDMAP_FULL flag directly rather than
2459 * using osdmap_full_flag(), because we want to know "is the flag set"
2460 * rather than "does the flag apply to us?" */
2461 mds
->objecter
->with_osdmap([this](const OSDMap
& o
) {
2462 auto pi
= o
.get_pg_pool(mds
->mdsmap
->get_metadata_pool());
2463 is_full
= pi
&& pi
->has_flag(pg_pool_t::FLAG_FULL
);
2464 dout(7) << __func__
<< ": full = " << is_full
<< " epoch = "
2465 << o
.get_epoch() << dendl
;
2469 void Server::dispatch_client_request(MDRequestRef
& mdr
)
2471 // we shouldn't be waiting on anyone.
2472 ceph_assert(!mdr
->has_more() || mdr
->more()->waiting_on_peer
.empty());
2475 dout(10) << "request " << *mdr
<< " was killed" << dendl
;
2476 //if the mdr is a "batch_op" and it has followers, pick a follower as
2477 //the new "head of the batch ops" and go on processing the new one.
2478 if (mdr
->is_batch_head()) {
2479 int mask
= mdr
->client_request
->head
.args
.getattr
.mask
;
2480 auto it
= mdr
->batch_op_map
->find(mask
);
2481 auto new_batch_head
= it
->second
->find_new_head();
2482 if (!new_batch_head
) {
2483 mdr
->batch_op_map
->erase(it
);
2486 mdr
= std::move(new_batch_head
);
2490 } else if (mdr
->aborted
) {
2491 mdr
->aborted
= false;
2492 mdcache
->request_kill(mdr
);
2496 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
2498 if (logger
) logger
->inc(l_mdss_dispatch_client_request
);
2500 dout(7) << "dispatch_client_request " << *req
<< dendl
;
2502 if (req
->may_write() && mdcache
->is_readonly()) {
2503 dout(10) << " read-only FS" << dendl
;
2504 respond_to_request(mdr
, -CEPHFS_EROFS
);
2507 if (mdr
->has_more() && mdr
->more()->peer_error
) {
2508 dout(10) << " got error from peers" << dendl
;
2509 respond_to_request(mdr
, mdr
->more()->peer_error
);
2514 if (req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
2515 req
->get_op() == CEPH_MDS_OP_SETDIRLAYOUT
||
2516 req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
2517 req
->get_op() == CEPH_MDS_OP_RMXATTR
||
2518 req
->get_op() == CEPH_MDS_OP_SETXATTR
||
2519 req
->get_op() == CEPH_MDS_OP_CREATE
||
2520 req
->get_op() == CEPH_MDS_OP_SYMLINK
||
2521 req
->get_op() == CEPH_MDS_OP_MKSNAP
||
2522 ((req
->get_op() == CEPH_MDS_OP_LINK
||
2523 req
->get_op() == CEPH_MDS_OP_RENAME
) &&
2524 (!mdr
->has_more() || mdr
->more()->witnessed
.empty())) // haven't started peer request
2527 dout(20) << __func__
<< ": full, responding CEPHFS_ENOSPC to op " << ceph_mds_op_name(req
->get_op()) << dendl
;
2528 respond_to_request(mdr
, -CEPHFS_ENOSPC
);
2531 dout(20) << __func__
<< ": full, permitting op " << ceph_mds_op_name(req
->get_op()) << dendl
;
2535 switch (req
->get_op()) {
2536 case CEPH_MDS_OP_LOOKUPHASH
:
2537 case CEPH_MDS_OP_LOOKUPINO
:
2538 handle_client_lookup_ino(mdr
, false, false);
2540 case CEPH_MDS_OP_LOOKUPPARENT
:
2541 handle_client_lookup_ino(mdr
, true, false);
2543 case CEPH_MDS_OP_LOOKUPNAME
:
2544 handle_client_lookup_ino(mdr
, false, true);
2548 case CEPH_MDS_OP_LOOKUP
:
2549 handle_client_getattr(mdr
, true);
2552 case CEPH_MDS_OP_LOOKUPSNAP
:
2553 // lookupsnap does not reference a CDentry; treat it as a getattr
2554 case CEPH_MDS_OP_GETATTR
:
2555 handle_client_getattr(mdr
, false);
2558 case CEPH_MDS_OP_SETATTR
:
2559 handle_client_setattr(mdr
);
2561 case CEPH_MDS_OP_SETLAYOUT
:
2562 handle_client_setlayout(mdr
);
2564 case CEPH_MDS_OP_SETDIRLAYOUT
:
2565 handle_client_setdirlayout(mdr
);
2567 case CEPH_MDS_OP_SETXATTR
:
2568 handle_client_setxattr(mdr
);
2570 case CEPH_MDS_OP_RMXATTR
:
2571 handle_client_removexattr(mdr
);
2574 case CEPH_MDS_OP_READDIR
:
2575 handle_client_readdir(mdr
);
2578 case CEPH_MDS_OP_SETFILELOCK
:
2579 handle_client_file_setlock(mdr
);
2582 case CEPH_MDS_OP_GETFILELOCK
:
2583 handle_client_file_readlock(mdr
);
2587 case CEPH_MDS_OP_CREATE
:
2588 if (mdr
->has_completed
)
2589 handle_client_open(mdr
); // already created.. just open
2591 handle_client_openc(mdr
);
2594 case CEPH_MDS_OP_OPEN
:
2595 handle_client_open(mdr
);
2600 case CEPH_MDS_OP_MKNOD
:
2601 handle_client_mknod(mdr
);
2603 case CEPH_MDS_OP_LINK
:
2604 handle_client_link(mdr
);
2606 case CEPH_MDS_OP_UNLINK
:
2607 case CEPH_MDS_OP_RMDIR
:
2608 handle_client_unlink(mdr
);
2610 case CEPH_MDS_OP_RENAME
:
2611 handle_client_rename(mdr
);
2613 case CEPH_MDS_OP_MKDIR
:
2614 handle_client_mkdir(mdr
);
2616 case CEPH_MDS_OP_SYMLINK
:
2617 handle_client_symlink(mdr
);
2622 case CEPH_MDS_OP_LSSNAP
:
2623 handle_client_lssnap(mdr
);
2625 case CEPH_MDS_OP_MKSNAP
:
2626 handle_client_mksnap(mdr
);
2628 case CEPH_MDS_OP_RMSNAP
:
2629 handle_client_rmsnap(mdr
);
2631 case CEPH_MDS_OP_RENAMESNAP
:
2632 handle_client_renamesnap(mdr
);
2636 dout(1) << " unknown client op " << req
->get_op() << dendl
;
2637 respond_to_request(mdr
, -CEPHFS_EOPNOTSUPP
);
2642 // ---------------------------------------
/*
 * Entry point for an MMDSPeerRequest from another MDS rank (this MDS is
 * the peer of a multi-MDS operation).  Routes replies to
 * handle_peer_request_reply, answers rename-notify barriers immediately,
 * reconciles attempt numbers with any existing local request, and
 * finally dispatches the peer request.
 */
void Server::handle_peer_request(const cref_t<MMDSPeerRequest> &m)
{
  dout(4) << "handle_peer_request " << m->get_reqid() << " from " << m->get_source() << dendl;
  mds_rank_t from = mds_rank_t(m->get_source().num());

  if (logger) logger->inc(l_mdss_handle_peer_request);

  // reply?
  if (m->is_reply())
    return handle_peer_request_reply(m);

  // the purpose of rename notify is enforcing causal message ordering. making sure
  // bystanders have received all messages from rename srcdn's auth MDS.
  if (m->get_op() == MMDSPeerRequest::OP_RENAMENOTIFY) {
    auto reply = make_message<MMDSPeerRequest>(m->get_reqid(), m->get_attempt(), MMDSPeerRequest::OP_RENAMENOTIFYACK);
    mds->send_message(reply, m->get_connection());
    return;
  }

  CDentry *straydn = NULL;
  if (m->straybl.length() > 0) {
    mdcache->decode_replica_stray(straydn, m->straybl, from);
    ceph_assert(straydn);
    m->straybl.clear();
  }

  if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
    dout(3) << "not clientreplay|active yet, waiting" << dendl;
    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
    return;
  }

  // am i a new peer?
  MDRequestRef mdr;
  if (mdcache->have_request(m->get_reqid())) {
    // existing?
    mdr = mdcache->request_get(m->get_reqid());

    // is my request newer?
    if (mdr->attempt > m->get_attempt()) {
      dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
	       << ", dropping " << *m << dendl;
      return;
    }

    if (mdr->attempt < m->get_attempt()) {
      // mine is old, close it out
      dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
	       << ", closing out" << dendl;
      mdcache->request_finish(mdr);
      mdr.reset();
    } else if (mdr->peer_to_mds != from) {
      dout(10) << "local request " << *mdr << " not peer to mds." << from << dendl;
      return;
    }

    // may get these while mdr->peer_request is non-null
    if (m->get_op() == MMDSPeerRequest::OP_DROPLOCKS) {
      mds->locker->drop_locks(mdr.get());
      return;
    }
    if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
      if (m->is_abort()) {
	mdr->aborted = true;
	if (mdr->peer_request) {
	  // only abort on-going xlock, wrlock and auth pin
	  ceph_assert(!mdr->peer_did_prepare());
	} else {
	  mdcache->request_finish(mdr);
	}
      } else {
	if (m->inode_export.length() > 0)
	  mdr->more()->inode_import = m->inode_export;
	// finish off request.
	mdcache->request_finish(mdr);
      }
      return;
    }
  }
  if (!mdr.get()) {
    // new?
    if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
      dout(10) << "missing peer request for " << m->get_reqid()
	       << " OP_FINISH, must have lost race with a forward" << dendl;
      return;
    }
    mdr = mdcache->request_start_peer(m->get_reqid(), m->get_attempt(), m);
    mdr->set_op_stamp(m->op_stamp);
  }
  ceph_assert(mdr->peer_request == 0);     // only one at a time, please!

  if (straydn) {
    mdr->pin(straydn);
    mdr->straydn = straydn;
  }

  if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
      mdr->locks.empty()) {
    dout(3) << "not active yet, waiting" << dendl;
    mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
    return;
  }

  mdr->reset_peer_request(m);

  dispatch_peer_request(mdr);
}
/*
 * Handle a peer's reply to a request for which this MDS is the leader.
 * Validates MDS state and attempt number, records remote xlock/wrlock
 * grants on the mdr, and forwards the remaining ack types to their
 * op-specific handlers.
 */
void Server::handle_peer_request_reply(const cref_t<MMDSPeerRequest> &m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());

  if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
    metareqid_t r = m->get_reqid();
    if (!mdcache->have_uncommitted_leader(r, from)) {
      dout(10) << "handle_peer_request_reply ignoring peer reply from mds."
	       << from << " reqid " << r << dendl;
      return;
    }
    dout(3) << "not clientreplay|active yet, waiting" << dendl;
    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
    return;
  }

  if (m->get_op() == MMDSPeerRequest::OP_COMMITTED) {
    metareqid_t r = m->get_reqid();
    mdcache->committed_leader_peer(r, from);
    return;
  }

  MDRequestRef mdr = mdcache->request_get(m->get_reqid());
  if (m->get_attempt() != mdr->attempt) {
    dout(10) << "handle_peer_request_reply " << *mdr << " ignoring reply from other attempt "
	     << m->get_attempt() << dendl;
    return;
  }

  switch (m->get_op()) {
  case MMDSPeerRequest::OP_XLOCKACK:
    {
      // identify lock, leader request
      SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
					       m->get_object_info());
      mdr->more()->peers.insert(from);
      lock->decode_locked_state(m->get_lock_data());
      dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
      mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
      mdr->finish_locking(lock);
      lock->get_xlock(mdr, mdr->get_client());

      ceph_assert(mdr->more()->waiting_on_peer.count(from));
      mdr->more()->waiting_on_peer.erase(from);
      ceph_assert(mdr->more()->waiting_on_peer.empty());
      mdcache->dispatch_request(mdr);
    }
    break;

  case MMDSPeerRequest::OP_WRLOCKACK:
    {
      // identify lock, leader request
      SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
					       m->get_object_info());
      mdr->more()->peers.insert(from);
      dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
      auto it = mdr->emplace_lock(lock, MutationImpl::LockOp::REMOTE_WRLOCK, from);
      ceph_assert(it->is_remote_wrlock());
      ceph_assert(it->wrlock_target == from);

      mdr->finish_locking(lock);

      ceph_assert(mdr->more()->waiting_on_peer.count(from));
      mdr->more()->waiting_on_peer.erase(from);
      ceph_assert(mdr->more()->waiting_on_peer.empty());
      mdcache->dispatch_request(mdr);
    }
    break;

  case MMDSPeerRequest::OP_AUTHPINACK:
    handle_peer_auth_pin_ack(mdr, m);
    break;

  case MMDSPeerRequest::OP_LINKPREPACK:
    handle_peer_link_prep_ack(mdr, m);
    break;

  case MMDSPeerRequest::OP_RMDIRPREPACK:
    handle_peer_rmdir_prep_ack(mdr, m);
    break;

  case MMDSPeerRequest::OP_RENAMEPREPACK:
    handle_peer_rename_prep_ack(mdr, m);
    break;

  case MMDSPeerRequest::OP_RENAMENOTIFYACK:
    handle_peer_rename_notify_ack(mdr, m);
    break;

  default:
    ceph_abort();
  }
}
/*
 * Dispatch a peer request (this MDS acting as the peer of a multi-MDS
 * operation).  Remote lock requests are acquired through the normal
 * locker path (so auth-pinning happens) and acked; unlock requests are
 * applied directly; everything else goes to an op-specific handler.
 */
void Server::dispatch_peer_request(MDRequestRef& mdr)
{
  dout(7) << "dispatch_peer_request " << *mdr << " " << *mdr->peer_request << dendl;

  if (mdr->aborted) {
    dout(7) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);
    return;
  }

  if (logger) logger->inc(l_mdss_dispatch_peer_request);

  int op = mdr->peer_request->get_op();
  switch (op) {
  case MMDSPeerRequest::OP_XLOCK:
  case MMDSPeerRequest::OP_WRLOCK:
    {
      // identify object
      SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
					       mdr->peer_request->get_object_info());

      if (!lock) {
	dout(10) << "don't have object, dropping" << dendl;
	ceph_abort(); // can this happen, if we auth pinned properly.
      }
      if (op == MMDSPeerRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
	dout(10) << "not auth for remote xlock attempt, dropping on "
		 << *lock << " on " << *lock->get_parent() << dendl;
      } else {
	// use acquire_locks so that we get auth_pinning.
	MutationImpl::LockOpVec lov;
	// carry over the locks we already hold for this request
	for (const auto& p : mdr->locks) {
	  if (p.is_xlock())
	    lov.add_xlock(p.lock);
	  else if (p.is_wrlock())
	    lov.add_wrlock(p.lock);
	}

	int replycode = 0;
	switch (op) {
	case MMDSPeerRequest::OP_XLOCK:
	  lov.add_xlock(lock);
	  replycode = MMDSPeerRequest::OP_XLOCKACK;
	  break;
	case MMDSPeerRequest::OP_WRLOCK:
	  lov.add_wrlock(lock);
	  replycode = MMDSPeerRequest::OP_WRLOCKACK;
	  break;
	}

	if (!mds->locker->acquire_locks(mdr, lov))
	  return;

	// ack
	auto r = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, replycode);
	r->set_lock_type(lock->get_type());
	lock->get_parent()->set_object_info(r->get_object_info());
	if (replycode == MMDSPeerRequest::OP_XLOCKACK)
	  lock->encode_locked_state(r->get_lock_data());
	mds->send_message(r, mdr->peer_request->get_connection());
      }

      // done.
      mdr->reset_peer_request();
    }
    break;

  case MMDSPeerRequest::OP_UNXLOCK:
  case MMDSPeerRequest::OP_UNWRLOCK:
    {
      SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
					       mdr->peer_request->get_object_info());
      ceph_assert(lock);
      auto it = mdr->locks.find(lock);
      ceph_assert(it != mdr->locks.end());
      bool need_issue = false;
      switch (op) {
      case MMDSPeerRequest::OP_UNXLOCK:
	mds->locker->xlock_finish(it, mdr.get(), &need_issue);
	break;
      case MMDSPeerRequest::OP_UNWRLOCK:
	mds->locker->wrlock_finish(it, mdr.get(), &need_issue);
	break;
      }
      if (need_issue)
	mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));

      // done.  no ack necessary.
      mdr->reset_peer_request();
    }
    break;

  case MMDSPeerRequest::OP_AUTHPIN:
    handle_peer_auth_pin(mdr);
    break;

  case MMDSPeerRequest::OP_LINKPREP:
  case MMDSPeerRequest::OP_UNLINKPREP:
    handle_peer_link_prep(mdr);
    break;

  case MMDSPeerRequest::OP_RMDIRPREP:
    handle_peer_rmdir_prep(mdr);
    break;

  case MMDSPeerRequest::OP_RENAMEPREP:
    handle_peer_rename_prep(mdr);
    break;

  default:
    ceph_abort();
  }
}
2961 void Server::handle_peer_auth_pin(MDRequestRef
& mdr
)
2963 dout(10) << "handle_peer_auth_pin " << *mdr
<< dendl
;
2965 // build list of objects
2966 list
<MDSCacheObject
*> objects
;
2967 CInode
*auth_pin_freeze
= NULL
;
2968 bool nonblocking
= mdr
->peer_request
->is_nonblocking();
2969 bool fail
= false, wouldblock
= false, readonly
= false;
2970 ref_t
<MMDSPeerRequest
> reply
;
2972 if (mdcache
->is_readonly()) {
2973 dout(10) << " read-only FS" << dendl
;
2979 for (const auto &oi
: mdr
->peer_request
->get_authpins()) {
2980 MDSCacheObject
*object
= mdcache
->get_object(oi
);
2982 dout(10) << " don't have " << oi
<< dendl
;
2987 objects
.push_back(object
);
2988 if (oi
== mdr
->peer_request
->get_authpin_freeze())
2989 auth_pin_freeze
= static_cast<CInode
*>(object
);
2993 // can we auth pin them?
2995 for (const auto& obj
: objects
) {
2996 if (!obj
->is_auth()) {
2997 dout(10) << " not auth for " << *obj
<< dendl
;
3001 if (mdr
->is_auth_pinned(obj
))
3003 if (!mdr
->can_auth_pin(obj
)) {
3005 dout(10) << " can't auth_pin (freezing?) " << *obj
<< " nonblocking" << dendl
;
3011 dout(10) << " waiting for authpinnable on " << *obj
<< dendl
;
3012 obj
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
3013 mdr
->drop_local_auth_pins();
3015 mds
->locker
->notify_freeze_waiter(obj
);
3022 /* freeze authpin wrong inode */
3023 if (mdr
->has_more() && mdr
->more()->is_freeze_authpin
&&
3024 mdr
->more()->rename_inode
!= auth_pin_freeze
)
3025 mdr
->unfreeze_auth_pin(true);
3027 /* handle_peer_rename_prep() call freeze_inode() to wait for all other operations
3028 * on the source inode to complete. This happens after all locks for the rename
3029 * operation are acquired. But to acquire locks, we need auth pin locks' parent
3030 * objects first. So there is an ABBA deadlock if someone auth pins the source inode
3031 * after locks are acquired and before Server::handle_peer_rename_prep() is called.
3032 * The solution is freeze the inode and prevent other MDRequests from getting new
3035 if (auth_pin_freeze
) {
3036 dout(10) << " freezing auth pin on " << *auth_pin_freeze
<< dendl
;
3037 if (!mdr
->freeze_auth_pin(auth_pin_freeze
)) {
3038 auth_pin_freeze
->add_waiter(CInode::WAIT_FROZEN
, new C_MDS_RetryRequest(mdcache
, mdr
));
3039 mds
->mdlog
->flush();
3045 reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_AUTHPINACK
);
3048 mdr
->drop_local_auth_pins(); // just in case
3050 reply
->mark_error_rofs();
3052 reply
->mark_error_wouldblock();
3055 for (const auto& obj
: objects
) {
3056 dout(10) << "auth_pinning " << *obj
<< dendl
;
3059 // return list of my auth_pins (if any)
3060 for (const auto &p
: mdr
->object_states
) {
3061 if (!p
.second
.auth_pinned
)
3063 MDSCacheObjectInfo info
;
3064 p
.first
->set_object_info(info
);
3065 reply
->get_authpins().push_back(info
);
3066 if (p
.first
== (MDSCacheObject
*)auth_pin_freeze
)
3067 auth_pin_freeze
->set_object_info(reply
->get_authpin_freeze());
3071 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
3073 // clean up this request
3074 mdr
->reset_peer_request();
3078 if (mdr
->peer_request
->should_notify_blocking()) {
3079 reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_AUTHPINACK
);
3080 reply
->mark_req_blocked();
3081 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
3082 mdr
->peer_request
->clear_notify_blocking();
3087 void Server::handle_peer_auth_pin_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &ack
)
3089 dout(10) << "handle_peer_auth_pin_ack on " << *mdr
<< " " << *ack
<< dendl
;
3090 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
3092 if (ack
->is_req_blocked()) {
3093 mdr
->disable_lock_cache();
3094 // peer auth pin is blocked, drop locks to avoid deadlock
3095 mds
->locker
->drop_locks(mdr
.get(), nullptr);
3100 set
<MDSCacheObject
*> pinned
;
3101 for (const auto &oi
: ack
->get_authpins()) {
3102 MDSCacheObject
*object
= mdcache
->get_object(oi
);
3103 ceph_assert(object
); // we pinned it
3104 dout(10) << " remote has pinned " << *object
<< dendl
;
3105 mdr
->set_remote_auth_pinned(object
, from
);
3106 if (oi
== ack
->get_authpin_freeze())
3107 mdr
->set_remote_frozen_auth_pin(static_cast<CInode
*>(object
));
3108 pinned
.insert(object
);
3111 // removed frozen auth pin ?
3112 if (mdr
->more()->is_remote_frozen_authpin
&&
3113 ack
->get_authpin_freeze() == MDSCacheObjectInfo()) {
3114 auto stat_p
= mdr
->find_object_state(mdr
->more()->rename_inode
);
3115 ceph_assert(stat_p
);
3116 if (stat_p
->remote_auth_pinned
== from
) {
3117 mdr
->more()->is_remote_frozen_authpin
= false;
3121 // removed auth pins?
3122 for (auto& p
: mdr
->object_states
) {
3123 if (p
.second
.remote_auth_pinned
== MDS_RANK_NONE
)
3125 MDSCacheObject
* object
= p
.first
;
3126 if (p
.second
.remote_auth_pinned
== from
&& pinned
.count(object
) == 0) {
3127 dout(10) << " remote has unpinned " << *object
<< dendl
;
3128 mdr
->_clear_remote_auth_pinned(p
.second
);
3133 mdr
->more()->peers
.insert(from
);
3135 // clear from waiting list
3136 auto ret
= mdr
->more()->waiting_on_peer
.erase(from
);
3139 if (ack
->is_error_rofs()) {
3140 mdr
->more()->peer_error
= -CEPHFS_EROFS
;
3141 } else if (ack
->is_error_wouldblock()) {
3142 mdr
->more()->peer_error
= -CEPHFS_EWOULDBLOCK
;
3146 if (mdr
->more()->waiting_on_peer
.empty())
3147 mdcache
->dispatch_request(mdr
);
3149 dout(10) << "still waiting on peers " << mdr
->more()->waiting_on_peer
<< dendl
;
3153 // ---------------------------------------
3158 * check whether we are permitted to complete a request
3160 * Check whether we have permission to perform the operation specified
3161 * by mask on the given inode, based on the capability in the mdr's
3164 bool Server::check_access(MDRequestRef
& mdr
, CInode
*in
, unsigned mask
)
3167 int r
= mdr
->session
->check_access(
3169 mdr
->client_request
->get_caller_uid(),
3170 mdr
->client_request
->get_caller_gid(),
3171 &mdr
->client_request
->get_caller_gid_list(),
3172 mdr
->client_request
->head
.args
.setattr
.uid
,
3173 mdr
->client_request
->head
.args
.setattr
.gid
);
3175 respond_to_request(mdr
, r
);
3183 * check whether fragment has reached maximum size
3186 bool Server::check_fragment_space(MDRequestRef
&mdr
, CDir
*in
)
3188 const auto size
= in
->get_frag_size();
3189 if (size
>= g_conf()->mds_bal_fragment_size_max
) {
3190 dout(10) << "fragment " << *in
<< " size exceeds " << g_conf()->mds_bal_fragment_size_max
<< " (CEPHFS_ENOSPC)" << dendl
;
3191 respond_to_request(mdr
, -CEPHFS_ENOSPC
);
3198 CDentry
* Server::prepare_stray_dentry(MDRequestRef
& mdr
, CInode
*in
)
3201 in
->name_stray_dentry(straydname
);
3203 CDentry
*straydn
= mdr
->straydn
;
3205 ceph_assert(straydn
->get_name() == straydname
);
3208 CDir
*straydir
= mdcache
->get_stray_dir(in
);
3210 if (!mdr
->client_request
->is_replay() &&
3211 !check_fragment_space(mdr
, straydir
))
3214 straydn
= straydir
->lookup(straydname
);
3216 if (straydir
->is_frozen_dir()) {
3217 dout(10) << __func__
<< ": " << *straydir
<< " is frozen, waiting" << dendl
;
3218 mds
->locker
->drop_locks(mdr
.get());
3219 mdr
->drop_local_auth_pins();
3220 straydir
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
3223 straydn
= straydir
->add_null_dentry(straydname
);
3224 straydn
->mark_new();
3226 ceph_assert(straydn
->get_projected_linkage()->is_null());
3229 straydn
->state_set(CDentry::STATE_STRAY
);
3230 mdr
->straydn
= straydn
;
3236 /** prepare_new_inode
3238 * create a new inode. set c/m/atime. hit dir pop.
3240 CInode
* Server::prepare_new_inode(MDRequestRef
& mdr
, CDir
*dir
, inodeno_t useino
, unsigned mode
,
3241 const file_layout_t
*layout
)
3243 CInode
*in
= new CInode(mdcache
);
3244 auto _inode
= in
->_get_inode();
3246 // Server::prepare_force_open_sessions() can re-open session in closing
3247 // state. In that corner case, session's prealloc_inos are being freed.
3248 // To simplify the code, we disallow using/refilling session's prealloc_ino
3249 // while session is opening.
3250 bool allow_prealloc_inos
= mdr
->session
->is_open();
3253 if (allow_prealloc_inos
&& (mdr
->used_prealloc_ino
= _inode
->ino
= mdr
->session
->take_ino(useino
))) {
3254 mds
->sessionmap
.mark_projected(mdr
->session
);
3255 dout(10) << "prepare_new_inode used_prealloc " << mdr
->used_prealloc_ino
3256 << " (" << mdr
->session
->info
.prealloc_inos
.size() << " left)"
3260 _inode
->ino
= mds
->inotable
->project_alloc_id(useino
);
3261 dout(10) << "prepare_new_inode alloc " << mdr
->alloc_ino
<< dendl
;
3264 if (useino
&& useino
!= _inode
->ino
) {
3265 dout(0) << "WARNING: client specified " << useino
<< " and i allocated " << _inode
->ino
<< dendl
;
3266 mds
->clog
->error() << mdr
->client_request
->get_source()
3267 << " specified ino " << useino
3268 << " but mds." << mds
->get_nodeid() << " allocated " << _inode
->ino
;
3269 //ceph_abort(); // just for now.
3272 if (allow_prealloc_inos
&&
3273 mdr
->session
->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos
/ 2) {
3274 int need
= g_conf()->mds_client_prealloc_inos
- mdr
->session
->get_num_projected_prealloc_inos();
3275 mds
->inotable
->project_alloc_ids(mdr
->prealloc_inos
, need
);
3276 ceph_assert(mdr
->prealloc_inos
.size()); // or else fix projected increment semantics
3277 mdr
->session
->pending_prealloc_inos
.insert(mdr
->prealloc_inos
);
3278 mds
->sessionmap
.mark_projected(mdr
->session
);
3279 dout(10) << "prepare_new_inode prealloc " << mdr
->prealloc_inos
<< dendl
;
3282 _inode
->version
= 1;
3283 _inode
->xattr_version
= 1;
3284 _inode
->nlink
= 1; // FIXME
3286 _inode
->mode
= mode
;
3288 // FIPS zeroization audit 20191117: this memset is not security related.
3289 memset(&_inode
->dir_layout
, 0, sizeof(_inode
->dir_layout
));
3290 if (_inode
->is_dir()) {
3291 _inode
->dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
3292 } else if (layout
) {
3293 _inode
->layout
= *layout
;
3295 _inode
->layout
= mdcache
->default_file_layout
;
3298 _inode
->truncate_size
= -1ull; // not truncated, yet!
3299 _inode
->truncate_seq
= 1; /* starting with 1, 0 is kept for no-truncation logic */
3301 CInode
*diri
= dir
->get_inode();
3303 dout(10) << oct
<< " dir mode 0" << diri
->get_inode()->mode
<< " new mode 0" << mode
<< dec
<< dendl
;
3305 if (diri
->get_inode()->mode
& S_ISGID
) {
3306 dout(10) << " dir is sticky" << dendl
;
3307 _inode
->gid
= diri
->get_inode()->gid
;
3308 if (S_ISDIR(mode
)) {
3309 dout(10) << " new dir also sticky" << dendl
;
3310 _inode
->mode
|= S_ISGID
;
3313 _inode
->gid
= mdr
->client_request
->get_caller_gid();
3315 _inode
->uid
= mdr
->client_request
->get_caller_uid();
3317 _inode
->btime
= _inode
->ctime
= _inode
->mtime
= _inode
->atime
=
3318 mdr
->get_op_stamp();
3320 _inode
->change_attr
= 0;
3322 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
3323 if (req
->get_data().length()) {
3324 auto p
= req
->get_data().cbegin();
3326 // xattrs on new inode?
3327 auto _xattrs
= CInode::allocate_xattr_map();
3328 decode_noshare(*_xattrs
, p
);
3329 dout(10) << "prepare_new_inode setting xattrs " << *_xattrs
<< dendl
;
3330 in
->reset_xattrs(std::move(_xattrs
));
3333 if (!mds
->mdsmap
->get_inline_data_enabled() ||
3334 !mdr
->session
->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
))
3335 _inode
->inline_data
.version
= CEPH_INLINE_NONE
;
3337 mdcache
->add_inode(in
); // add
3338 dout(10) << "prepare_new_inode " << *in
<< dendl
;
3342 void Server::journal_allocated_inos(MDRequestRef
& mdr
, EMetaBlob
*blob
)
3344 dout(20) << "journal_allocated_inos sessionmapv " << mds
->sessionmap
.get_projected()
3345 << " inotablev " << mds
->inotable
->get_projected_version()
3347 blob
->set_ino_alloc(mdr
->alloc_ino
,
3348 mdr
->used_prealloc_ino
,
3350 mdr
->client_request
->get_source(),
3351 mds
->sessionmap
.get_projected(),
3352 mds
->inotable
->get_projected_version());
3355 void Server::apply_allocated_inos(MDRequestRef
& mdr
, Session
*session
)
3357 dout(10) << "apply_allocated_inos " << mdr
->alloc_ino
3358 << " / " << mdr
->prealloc_inos
3359 << " / " << mdr
->used_prealloc_ino
<< dendl
;
3361 if (mdr
->alloc_ino
) {
3362 mds
->inotable
->apply_alloc_id(mdr
->alloc_ino
);
3364 if (mdr
->prealloc_inos
.size()) {
3365 ceph_assert(session
);
3366 session
->pending_prealloc_inos
.subtract(mdr
->prealloc_inos
);
3367 session
->free_prealloc_inos
.insert(mdr
->prealloc_inos
);
3368 session
->info
.prealloc_inos
.insert(mdr
->prealloc_inos
);
3369 mds
->sessionmap
.mark_dirty(session
, !mdr
->used_prealloc_ino
);
3370 mds
->inotable
->apply_alloc_ids(mdr
->prealloc_inos
);
3372 if (mdr
->used_prealloc_ino
) {
3373 ceph_assert(session
);
3374 session
->info
.prealloc_inos
.erase(mdr
->used_prealloc_ino
);
3375 mds
->sessionmap
.mark_dirty(session
);
3379 class C_MDS_TryFindInode
: public ServerContext
{
3382 C_MDS_TryFindInode(Server
*s
, MDRequestRef
& r
) : ServerContext(s
), mdr(r
) {}
3383 void finish(int r
) override
{
3384 if (r
== -CEPHFS_ESTALE
) // :( find_ino_peers failed
3385 server
->respond_to_request(mdr
, r
);
3387 server
->dispatch_client_request(mdr
);
3391 /* If this returns null, the request has been handled
3392 * as appropriate: forwarded on, or the client's been replied to */
3393 CInode
* Server::rdlock_path_pin_ref(MDRequestRef
& mdr
,
3397 const filepath
& refpath
= mdr
->get_filepath();
3398 dout(10) << "rdlock_path_pin_ref " << *mdr
<< " " << refpath
<< dendl
;
3400 if (mdr
->locking_state
& MutationImpl::PATH_LOCKED
)
3404 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, true);
3406 if (refpath
.is_last_snap()) {
3410 if (!no_want_auth
&& forward_all_requests_to_auth
)
3412 flags
|= MDS_TRAVERSE_RDLOCK_PATH
| MDS_TRAVERSE_RDLOCK_SNAP
;
3415 flags
|= MDS_TRAVERSE_WANT_AUTH
;
3416 int r
= mdcache
->path_traverse(mdr
, cf
, refpath
, flags
, &mdr
->dn
[0], &mdr
->in
[0]);
3418 return nullptr; // delayed
3419 if (r
< 0) { // error
3420 if (r
== -CEPHFS_ENOENT
&& !mdr
->dn
[0].empty()) {
3421 if (mdr
->client_request
&&
3422 mdr
->client_request
->get_dentry_wanted())
3423 mdr
->tracedn
= mdr
->dn
[0].back();
3424 respond_to_request(mdr
, r
);
3425 } else if (r
== -CEPHFS_ESTALE
) {
3426 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl
;
3427 MDSContext
*c
= new C_MDS_TryFindInode(this, mdr
);
3428 mdcache
->find_ino_peers(refpath
.get_ino(), c
);
3430 dout(10) << "FAIL on error " << r
<< dendl
;
3431 respond_to_request(mdr
, r
);
3435 CInode
*ref
= mdr
->in
[0];
3436 dout(10) << "ref is " << *ref
<< dendl
;
3440 // do NOT proceed if freezing, as cap release may defer in that case, and
3441 // we could deadlock when we try to lock @ref.
3442 // if we're already auth_pinned, continue; the release has already been processed.
3443 if (ref
->is_frozen() || ref
->is_frozen_auth_pin() ||
3444 (ref
->is_freezing() && !mdr
->is_auth_pinned(ref
))) {
3445 dout(7) << "waiting for !frozen/authpinnable on " << *ref
<< dendl
;
3446 ref
->add_waiter(CInode::WAIT_UNFREEZE
, cf
.build());
3447 if (mdr
->is_any_remote_auth_pin())
3448 mds
->locker
->notify_freeze_waiter(ref
);
3460 /** rdlock_path_xlock_dentry
3461 * traverse path to the directory that could/would contain dentry.
3462 * make sure i am auth for that dentry, forward as necessary.
3463 * create null dentry in place (or use existing if okexist).
3464 * get rdlocks on traversed dentries, xlock on new dentry.
3466 CDentry
* Server::rdlock_path_xlock_dentry(MDRequestRef
& mdr
,
3467 bool create
, bool okexist
, bool want_layout
)
3469 const filepath
& refpath
= mdr
->get_filepath();
3470 dout(10) << "rdlock_path_xlock_dentry " << *mdr
<< " " << refpath
<< dendl
;
3472 if (mdr
->locking_state
& MutationImpl::PATH_LOCKED
)
3473 return mdr
->dn
[0].back();
3475 // figure parent dir vs dname
3476 if (refpath
.depth() == 0) {
3477 dout(7) << "invalid path (zero length)" << dendl
;
3478 respond_to_request(mdr
, -CEPHFS_EINVAL
);
3482 if (refpath
.is_last_snap()) {
3483 respond_to_request(mdr
, -CEPHFS_EROFS
);
3487 if (refpath
.is_last_dot_or_dotdot()) {
3488 dout(7) << "invalid path (last dot or dot_dot)" << dendl
;
3490 respond_to_request(mdr
, -CEPHFS_EEXIST
);
3492 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
3496 // traverse to parent dir
3497 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, true);
3498 int flags
= MDS_TRAVERSE_RDLOCK_SNAP
| MDS_TRAVERSE_RDLOCK_PATH
|
3499 MDS_TRAVERSE_WANT_DENTRY
| MDS_TRAVERSE_XLOCK_DENTRY
|
3500 MDS_TRAVERSE_WANT_AUTH
;
3501 if (refpath
.depth() == 1 && !mdr
->lock_cache_disabled
)
3502 flags
|= MDS_TRAVERSE_CHECK_LOCKCACHE
;
3504 flags
|= MDS_TRAVERSE_RDLOCK_AUTHLOCK
;
3506 flags
|= MDS_TRAVERSE_WANT_DIRLAYOUT
;
3507 int r
= mdcache
->path_traverse(mdr
, cf
, refpath
, flags
, &mdr
->dn
[0]);
3509 return nullptr; // delayed
3511 if (r
== -CEPHFS_ESTALE
) {
3512 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl
;
3513 mdcache
->find_ino_peers(refpath
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
3516 respond_to_request(mdr
, r
);
3520 CDentry
*dn
= mdr
->dn
[0].back();
3521 CDir
*dir
= dn
->get_dir();
3522 CInode
*diri
= dir
->get_inode();
3524 if (!mdr
->reqid
.name
.is_mds()) {
3525 if (diri
->is_system() && !diri
->is_root()) {
3526 respond_to_request(mdr
, -CEPHFS_EROFS
);
3531 if (!diri
->is_base() && diri
->get_projected_parent_dir()->inode
->is_stray()) {
3532 respond_to_request(mdr
, -CEPHFS_ENOENT
);
3536 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
3537 if (dnl
->is_null()) {
3538 if (!create
&& okexist
) {
3539 respond_to_request(mdr
, -CEPHFS_ENOENT
);
3543 snapid_t next_snap
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
3544 dn
->first
= std::max(dn
->first
, next_snap
);
3547 respond_to_request(mdr
, -CEPHFS_EEXIST
);
3550 mdr
->in
[0] = dnl
->get_inode();
3556 /** rdlock_two_paths_xlock_destdn
3557 * traverse two paths and lock the two paths in proper order.
3558 * The order of taking locks is:
3559 * 1. Lock directory inodes or dentries according to which trees they
3560 * are under. Lock objects under fs root before objects under mdsdir.
3561 * 2. Lock directory inodes or dentries according to their depth, in
3563 * 3. Lock directory inodes or dentries according to inode numbers or
3564 * dentries' parent inode numbers, in ascending order.
3565 * 4. Lock dentries in the same directory in order of their keys.
3566 * 5. Lock non-directory inodes according to inode numbers, in ascending
3569 std::pair
<CDentry
*, CDentry
*>
3570 Server::rdlock_two_paths_xlock_destdn(MDRequestRef
& mdr
, bool xlock_srcdn
)
3573 const filepath
& refpath
= mdr
->get_filepath();
3574 const filepath
& refpath2
= mdr
->get_filepath2();
3576 dout(10) << "rdlock_two_paths_xlock_destdn " << *mdr
<< " " << refpath
<< " " << refpath2
<< dendl
;
3578 if (mdr
->locking_state
& MutationImpl::PATH_LOCKED
)
3579 return std::make_pair(mdr
->dn
[0].back(), mdr
->dn
[1].back());
3581 if (refpath
.depth() != 1 || refpath2
.depth() != 1) {
3582 respond_to_request(mdr
, -CEPHFS_EINVAL
);
3583 return std::pair
<CDentry
*, CDentry
*>(nullptr, nullptr);
3586 if (refpath
.is_last_snap() || refpath2
.is_last_snap()) {
3587 respond_to_request(mdr
, -CEPHFS_EROFS
);
3588 return std::make_pair(nullptr, nullptr);
3591 // traverse to parent dir
3592 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, true);
3593 int flags
= MDS_TRAVERSE_RDLOCK_SNAP
| MDS_TRAVERSE_WANT_DENTRY
| MDS_TRAVERSE_WANT_AUTH
;
3594 int r
= mdcache
->path_traverse(mdr
, cf
, refpath
, flags
, &mdr
->dn
[0]);
3596 if (r
== -CEPHFS_ESTALE
) {
3597 dout(10) << "CEPHFS_ESTALE on path, attempting recovery" << dendl
;
3598 mdcache
->find_ino_peers(refpath
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
3600 respond_to_request(mdr
, r
);
3602 return std::make_pair(nullptr, nullptr);
3605 flags
= MDS_TRAVERSE_RDLOCK_SNAP2
| MDS_TRAVERSE_WANT_DENTRY
| MDS_TRAVERSE_DISCOVER
;
3606 r
= mdcache
->path_traverse(mdr
, cf
, refpath2
, flags
, &mdr
->dn
[1]);
3608 if (r
== -CEPHFS_ESTALE
) {
3609 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl
;
3610 mdcache
->find_ino_peers(refpath2
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
3612 respond_to_request(mdr
, r
);
3614 return std::make_pair(nullptr, nullptr);
3617 CDentry
*srcdn
= mdr
->dn
[1].back();
3618 CDir
*srcdir
= srcdn
->get_dir();
3619 CDentry
*destdn
= mdr
->dn
[0].back();
3620 CDir
*destdir
= destdn
->get_dir();
3622 if (!mdr
->reqid
.name
.is_mds()) {
3623 if ((srcdir
->get_inode()->is_system() && !srcdir
->get_inode()->is_root()) ||
3624 (destdir
->get_inode()->is_system() && !destdir
->get_inode()->is_root())) {
3625 respond_to_request(mdr
, -CEPHFS_EROFS
);
3626 return std::make_pair(nullptr, nullptr);
3630 if (!destdir
->get_inode()->is_base() &&
3631 destdir
->get_inode()->get_projected_parent_dir()->inode
->is_stray()) {
3632 respond_to_request(mdr
, -CEPHFS_ENOENT
);
3633 return std::make_pair(nullptr, nullptr);
3636 MutationImpl::LockOpVec lov
;
3637 if (srcdir
->get_inode() == destdir
->get_inode()) {
3638 lov
.add_wrlock(&destdir
->inode
->filelock
);
3639 lov
.add_wrlock(&destdir
->inode
->nestlock
);
3640 if (xlock_srcdn
&& srcdir
!= destdir
) {
3641 mds_rank_t srcdir_auth
= srcdir
->authority().first
;
3642 if (srcdir_auth
!= mds
->get_nodeid()) {
3643 lov
.add_remote_wrlock(&srcdir
->inode
->filelock
, srcdir_auth
);
3644 lov
.add_remote_wrlock(&srcdir
->inode
->nestlock
, srcdir_auth
);
3648 if (srcdn
->get_name() > destdn
->get_name())
3649 lov
.add_xlock(&destdn
->lock
);
3652 lov
.add_xlock(&srcdn
->lock
);
3654 lov
.add_rdlock(&srcdn
->lock
);
3656 if (srcdn
->get_name() < destdn
->get_name())
3657 lov
.add_xlock(&destdn
->lock
);
3659 int cmp
= mdr
->compare_paths();
3660 bool lock_destdir_first
=
3661 (cmp
< 0 || (cmp
== 0 && destdir
->ino() < srcdir
->ino()));
3663 if (lock_destdir_first
) {
3664 lov
.add_wrlock(&destdir
->inode
->filelock
);
3665 lov
.add_wrlock(&destdir
->inode
->nestlock
);
3666 lov
.add_xlock(&destdn
->lock
);
3670 mds_rank_t srcdir_auth
= srcdir
->authority().first
;
3671 if (srcdir_auth
== mds
->get_nodeid()) {
3672 lov
.add_wrlock(&srcdir
->inode
->filelock
);
3673 lov
.add_wrlock(&srcdir
->inode
->nestlock
);
3675 lov
.add_remote_wrlock(&srcdir
->inode
->filelock
, srcdir_auth
);
3676 lov
.add_remote_wrlock(&srcdir
->inode
->nestlock
, srcdir_auth
);
3678 lov
.add_xlock(&srcdn
->lock
);
3680 lov
.add_rdlock(&srcdn
->lock
);
3683 if (!lock_destdir_first
) {
3684 lov
.add_wrlock(&destdir
->inode
->filelock
);
3685 lov
.add_wrlock(&destdir
->inode
->nestlock
);
3686 lov
.add_xlock(&destdn
->lock
);
3690 CInode
*auth_pin_freeze
= nullptr;
3691 // XXX any better way to do this?
3692 if (xlock_srcdn
&& !srcdn
->is_auth()) {
3693 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
3694 auth_pin_freeze
= srcdnl
->is_primary() ? srcdnl
->get_inode() : nullptr;
3696 if (!mds
->locker
->acquire_locks(mdr
, lov
, auth_pin_freeze
))
3697 return std::make_pair(nullptr, nullptr);
3699 if (srcdn
->get_projected_linkage()->is_null()) {
3700 respond_to_request(mdr
, -CEPHFS_ENOENT
);
3701 return std::make_pair(nullptr, nullptr);
3704 if (destdn
->get_projected_linkage()->is_null()) {
3705 snapid_t next_snap
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
3706 destdn
->first
= std::max(destdn
->first
, next_snap
);
3709 mdr
->locking_state
|= MutationImpl::PATH_LOCKED
;
3711 return std::make_pair(destdn
, srcdn
);
3715 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
3717 * @param diri base inode
3718 * @param fg the exact frag we want
3719 * @param mdr request
3720 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
3722 CDir
* Server::try_open_auth_dirfrag(CInode
*diri
, frag_t fg
, MDRequestRef
& mdr
)
3724 CDir
*dir
= diri
->get_dirfrag(fg
);
3727 // am i auth for the dirfrag?
3728 if (!dir
->is_auth()) {
3729 mds_rank_t auth
= dir
->authority().first
;
3730 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3731 << ", fw to mds." << auth
<< dendl
;
3732 mdcache
->request_forward(mdr
, auth
);
3736 // not open and inode not mine?
3737 if (!diri
->is_auth()) {
3738 mds_rank_t inauth
= diri
->authority().first
;
3739 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth
<< dendl
;
3740 mdcache
->request_forward(mdr
, inauth
);
3744 // not open and inode frozen?
3745 if (diri
->is_frozen()) {
3746 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri
<< dendl
;
3747 ceph_assert(diri
->get_parent_dir());
3748 diri
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
3753 dir
= diri
->get_or_open_dirfrag(mdcache
, fg
);
3760 // ===============================================================================
3763 void Server::handle_client_getattr(MDRequestRef
& mdr
, bool is_lookup
)
3765 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
3767 if (req
->get_filepath().depth() == 0 && is_lookup
) {
3768 // refpath can't be empty for lookup but it can for
3769 // getattr (we do getattr with empty refpath for mount of '/')
3770 respond_to_request(mdr
, -CEPHFS_EINVAL
);
3774 bool want_auth
= false;
3775 int mask
= req
->head
.args
.getattr
.mask
;
3776 if (mask
& CEPH_STAT_RSTAT
)
3777 want_auth
= true; // set want_auth for CEPH_STAT_RSTAT mask
3779 if (!mdr
->is_batch_head() && mdr
->can_batch()) {
3780 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, false);
3781 int r
= mdcache
->path_traverse(mdr
, cf
, mdr
->get_filepath(),
3782 (want_auth
? MDS_TRAVERSE_WANT_AUTH
: 0),
3783 &mdr
->dn
[0], &mdr
->in
[0]);
3788 // fall-thru. let rdlock_path_pin_ref() check again.
3789 } else if (is_lookup
) {
3790 CDentry
* dn
= mdr
->dn
[0].back();
3792 auto em
= dn
->batch_ops
.emplace(std::piecewise_construct
, std::forward_as_tuple(mask
), std::forward_as_tuple());
3794 em
.first
->second
= std::make_unique
<Batch_Getattr_Lookup
>(this, mdr
);
3796 dout(20) << __func__
<< ": LOOKUP op, wait for previous same getattr ops to respond. " << *mdr
<< dendl
;
3797 em
.first
->second
->add_request(mdr
);
3801 CInode
*in
= mdr
->in
[0];
3803 auto em
= in
->batch_ops
.emplace(std::piecewise_construct
, std::forward_as_tuple(mask
), std::forward_as_tuple());
3805 em
.first
->second
= std::make_unique
<Batch_Getattr_Lookup
>(this, mdr
);
3807 dout(20) << __func__
<< ": GETATTR op, wait for previous same getattr ops to respond. " << *mdr
<< dendl
;
3808 em
.first
->second
->add_request(mdr
);
3814 CInode
*ref
= rdlock_path_pin_ref(mdr
, want_auth
, false);
3818 mdr
->getattr_caps
= mask
;
3821 * if client currently holds the EXCL cap on a field, do not rdlock
3822 * it; client's stat() will result in valid info if _either_ EXCL
3823 * cap is held or MDS rdlocks and reads the value here.
3825 * handling this case here is easier than weakening rdlock
3826 * semantics... that would cause problems elsewhere.
3828 client_t client
= mdr
->get_client();
3830 Capability
*cap
= ref
->get_client_cap(client
);
3831 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
||
3832 mdr
->snapid
<= cap
->client_follows
))
3833 issued
= cap
->issued();
3836 MutationImpl::LockOpVec lov
;
3837 if ((mask
& CEPH_CAP_LINK_SHARED
) && !(issued
& CEPH_CAP_LINK_EXCL
))
3838 lov
.add_rdlock(&ref
->linklock
);
3839 if ((mask
& CEPH_CAP_AUTH_SHARED
) && !(issued
& CEPH_CAP_AUTH_EXCL
))
3840 lov
.add_rdlock(&ref
->authlock
);
3841 if ((mask
& CEPH_CAP_XATTR_SHARED
) && !(issued
& CEPH_CAP_XATTR_EXCL
))
3842 lov
.add_rdlock(&ref
->xattrlock
);
3843 if ((mask
& CEPH_CAP_FILE_SHARED
) && !(issued
& CEPH_CAP_FILE_EXCL
)) {
3844 // Don't wait on unstable filelock if client is allowed to read file size.
3845 // This can reduce the response time of getattr in the case that multiple
3846 // clients do stat(2) and there are writers.
3847 // The downside of this optimization is that mds may not issue Fs caps along
3848 // with getattr reply. Client may need to send more getattr requests.
3849 if (mdr
->is_rdlocked(&ref
->filelock
)) {
3850 lov
.add_rdlock(&ref
->filelock
);
3851 } else if (ref
->filelock
.is_stable() ||
3852 ref
->filelock
.get_num_wrlocks() > 0 ||
3853 !ref
->filelock
.can_read(mdr
->get_client())) {
3854 lov
.add_rdlock(&ref
->filelock
);
3855 mdr
->locking_state
&= ~MutationImpl::ALL_LOCKED
;
3859 if (!mds
->locker
->acquire_locks(mdr
, lov
))
3862 if (!check_access(mdr
, ref
, MAY_READ
))
3865 utime_t now
= ceph_clock_now();
3866 mdr
->set_mds_stamp(now
);
3868 // note which caps are requested, so we return at least a snapshot
3869 // value for them. (currently this matters for xattrs and inline data)
3870 mdr
->getattr_caps
= mask
;
3872 mds
->balancer
->hit_inode(ref
, META_POP_IRD
, req
->get_source().num());
3875 dout(10) << "reply to stat on " << *req
<< dendl
;
3878 mdr
->tracedn
= mdr
->dn
[0].back();
3879 respond_to_request(mdr
, 0);
3882 struct C_MDS_LookupIno2
: public ServerContext
{
3884 C_MDS_LookupIno2(Server
*s
, MDRequestRef
& r
) : ServerContext(s
), mdr(r
) {}
3885 void finish(int r
) override
{
3886 server
->_lookup_ino_2(mdr
, r
);
3893 void Server::handle_client_lookup_ino(MDRequestRef
& mdr
,
3894 bool want_parent
, bool want_dentry
)
3896 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
3898 if ((uint64_t)req
->head
.args
.lookupino
.snapid
> 0)
3899 return _lookup_snap_ino(mdr
);
3901 inodeno_t ino
= req
->get_filepath().get_ino();
3902 CInode
*in
= mdcache
->get_inode(ino
);
3903 if (in
&& in
->state_test(CInode::STATE_PURGING
)) {
3904 respond_to_request(mdr
, -CEPHFS_ESTALE
);
3908 mdcache
->open_ino(ino
, (int64_t)-1, new C_MDS_LookupIno2(this, mdr
), false);
3912 // check for nothing (not read or write); this still applies the
3914 if (!check_access(mdr
, in
, 0))
3917 CDentry
*dn
= in
->get_projected_parent_dn();
3918 CInode
*diri
= dn
? dn
->get_dir()->inode
: NULL
;
3920 MutationImpl::LockOpVec lov
;
3921 if (dn
&& (want_parent
|| want_dentry
)) {
3923 lov
.add_rdlock(&dn
->lock
);
3926 unsigned mask
= req
->head
.args
.lookupino
.mask
;
3928 Capability
*cap
= in
->get_client_cap(mdr
->get_client());
3930 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
|| mdr
->snapid
<= cap
->client_follows
))
3931 issued
= cap
->issued();
3933 // permission bits, ACL/security xattrs
3934 if ((mask
& CEPH_CAP_AUTH_SHARED
) && (issued
& CEPH_CAP_AUTH_EXCL
) == 0)
3935 lov
.add_rdlock(&in
->authlock
);
3936 if ((mask
& CEPH_CAP_XATTR_SHARED
) && (issued
& CEPH_CAP_XATTR_EXCL
) == 0)
3937 lov
.add_rdlock(&in
->xattrlock
);
3939 mdr
->getattr_caps
= mask
;
3943 if (!mds
->locker
->acquire_locks(mdr
, lov
))
3947 // need read access to directory inode
3948 if (!check_access(mdr
, diri
, MAY_READ
))
3954 if (in
->is_base()) {
3955 respond_to_request(mdr
, -CEPHFS_EINVAL
);
3958 if (!diri
|| diri
->is_stray()) {
3959 respond_to_request(mdr
, -CEPHFS_ESTALE
);
3962 dout(10) << "reply to lookup_parent " << *in
<< dendl
;
3964 respond_to_request(mdr
, 0);
3967 inodeno_t dirino
= req
->get_filepath2().get_ino();
3968 if (!diri
|| (dirino
!= inodeno_t() && diri
->ino() != dirino
)) {
3969 respond_to_request(mdr
, -CEPHFS_ENOENT
);
3972 dout(10) << "reply to lookup_name " << *in
<< dendl
;
3974 dout(10) << "reply to lookup_ino " << *in
<< dendl
;
3979 respond_to_request(mdr
, 0);
3983 void Server::_lookup_snap_ino(MDRequestRef
& mdr
)
3985 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
3988 vino
.ino
= req
->get_filepath().get_ino();
3989 vino
.snapid
= (__u64
)req
->head
.args
.lookupino
.snapid
;
3990 inodeno_t parent_ino
= (__u64
)req
->head
.args
.lookupino
.parent
;
3991 __u32 hash
= req
->head
.args
.lookupino
.hash
;
3993 dout(7) << "lookup_snap_ino " << vino
<< " parent " << parent_ino
<< " hash " << hash
<< dendl
;
3995 CInode
*in
= mdcache
->lookup_snap_inode(vino
);
3997 in
= mdcache
->get_inode(vino
.ino
);
3999 if (in
->state_test(CInode::STATE_PURGING
) ||
4000 !in
->has_snap_data(vino
.snapid
)) {
4001 if (in
->is_dir() || !parent_ino
) {
4002 respond_to_request(mdr
, -CEPHFS_ESTALE
);
4011 dout(10) << "reply to lookup_snap_ino " << *in
<< dendl
;
4012 mdr
->snapid
= vino
.snapid
;
4014 respond_to_request(mdr
, 0);
4018 CInode
*diri
= NULL
;
4020 diri
= mdcache
->get_inode(parent_ino
);
4022 mdcache
->open_ino(parent_ino
, mds
->mdsmap
->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr
));
4026 if (!diri
->is_dir()) {
4027 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4031 MutationImpl::LockOpVec lov
;
4032 lov
.add_rdlock(&diri
->dirfragtreelock
);
4033 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4036 frag_t frag
= diri
->dirfragtree
[hash
];
4037 CDir
*dir
= try_open_auth_dirfrag(diri
, frag
, mdr
);
4041 if (!dir
->is_complete()) {
4042 if (dir
->is_frozen()) {
4043 mds
->locker
->drop_locks(mdr
.get());
4044 mdr
->drop_local_auth_pins();
4045 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
4048 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
), true);
4052 respond_to_request(mdr
, -CEPHFS_ESTALE
);
4054 mdcache
->open_ino(vino
.ino
, mds
->mdsmap
->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr
), false);
4058 void Server::_lookup_ino_2(MDRequestRef
& mdr
, int r
)
4060 inodeno_t ino
= mdr
->client_request
->get_filepath().get_ino();
4061 dout(10) << "_lookup_ino_2 " << mdr
.get() << " ino " << ino
<< " r=" << r
<< dendl
;
4063 // `r` is a rank if >=0, else an error code
4065 mds_rank_t
dest_rank(r
);
4066 if (dest_rank
== mds
->get_nodeid())
4067 dispatch_client_request(mdr
);
4069 mdcache
->request_forward(mdr
, dest_rank
);
4074 if (r
== -CEPHFS_ENOENT
|| r
== -CEPHFS_ENODATA
)
4076 respond_to_request(mdr
, r
);
4080 /* This function takes responsibility for the passed mdr*/
4081 void Server::handle_client_open(MDRequestRef
& mdr
)
4083 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4084 dout(7) << "open on " << req
->get_filepath() << dendl
;
4086 int flags
= req
->head
.args
.open
.flags
;
4087 int cmode
= ceph_flags_to_mode(flags
);
4089 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4093 bool need_auth
= !file_mode_is_readonly(cmode
) ||
4094 (flags
& (CEPH_O_TRUNC
| CEPH_O_DIRECTORY
));
4096 if ((cmode
& CEPH_FILE_MODE_WR
) && mdcache
->is_readonly()) {
4097 dout(7) << "read-only FS" << dendl
;
4098 respond_to_request(mdr
, -CEPHFS_EROFS
);
4102 CInode
*cur
= rdlock_path_pin_ref(mdr
, need_auth
);
4106 if (cur
->is_frozen() || cur
->state_test(CInode::STATE_EXPORTINGCAPS
)) {
4107 ceph_assert(!need_auth
);
4108 mdr
->locking_state
&= ~(MutationImpl::PATH_LOCKED
| MutationImpl::ALL_LOCKED
);
4109 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
4114 if (!cur
->is_file()) {
4115 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
4116 cmode
= CEPH_FILE_MODE_PIN
;
4117 // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
4118 if (cur
->is_symlink() && !(flags
& CEPH_O_NOFOLLOW
))
4119 flags
&= ~CEPH_O_TRUNC
;
4122 dout(10) << "open flags = " << flags
4123 << ", filemode = " << cmode
4124 << ", need_auth = " << need_auth
4128 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
4129 dout(7) << "not a file or dir " << *cur << dendl;
4130 respond_to_request(mdr, -CEPHFS_ENXIO); // FIXME what error do we want?
4133 if ((flags
& CEPH_O_DIRECTORY
) && !cur
->is_dir() && !cur
->is_symlink()) {
4134 dout(7) << "specified O_DIRECTORY on non-directory " << *cur
<< dendl
;
4135 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4139 if ((flags
& CEPH_O_TRUNC
) && !cur
->is_file()) {
4140 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur
<< dendl
;
4141 // we should return -CEPHFS_EISDIR for directory, return -CEPHFS_EINVAL for other non-regular
4142 respond_to_request(mdr
, cur
->is_dir() ? -CEPHFS_EISDIR
: -CEPHFS_EINVAL
);
4146 if (cur
->get_inode()->inline_data
.version
!= CEPH_INLINE_NONE
&&
4147 !mdr
->session
->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
)) {
4148 dout(7) << "old client cannot open inline data file " << *cur
<< dendl
;
4149 respond_to_request(mdr
, -CEPHFS_EPERM
);
4153 // snapped data is read only
4154 if (mdr
->snapid
!= CEPH_NOSNAP
&&
4155 ((cmode
& CEPH_FILE_MODE_WR
) || req
->may_write())) {
4156 dout(7) << "snap " << mdr
->snapid
<< " is read-only " << *cur
<< dendl
;
4157 respond_to_request(mdr
, -CEPHFS_EROFS
);
4161 MutationImpl::LockOpVec lov
;
4163 unsigned mask
= req
->head
.args
.open
.mask
;
4165 Capability
*cap
= cur
->get_client_cap(mdr
->get_client());
4167 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
|| mdr
->snapid
<= cap
->client_follows
))
4168 issued
= cap
->issued();
4169 // permission bits, ACL/security xattrs
4170 if ((mask
& CEPH_CAP_AUTH_SHARED
) && (issued
& CEPH_CAP_AUTH_EXCL
) == 0)
4171 lov
.add_rdlock(&cur
->authlock
);
4172 if ((mask
& CEPH_CAP_XATTR_SHARED
) && (issued
& CEPH_CAP_XATTR_EXCL
) == 0)
4173 lov
.add_rdlock(&cur
->xattrlock
);
4175 mdr
->getattr_caps
= mask
;
4179 if ((flags
& CEPH_O_TRUNC
) && !mdr
->has_completed
) {
4180 ceph_assert(cur
->is_auth());
4182 lov
.add_xlock(&cur
->filelock
);
4183 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4186 if (!check_access(mdr
, cur
, MAY_WRITE
))
4189 // wait for pending truncate?
4190 const auto& pi
= cur
->get_projected_inode();
4191 if (pi
->is_truncating()) {
4192 dout(10) << " waiting for pending truncate from " << pi
->truncate_from
4193 << " to " << pi
->truncate_size
<< " to complete on " << *cur
<< dendl
;
4194 mds
->locker
->drop_locks(mdr
.get());
4195 mdr
->drop_local_auth_pins();
4196 cur
->add_waiter(CInode::WAIT_TRUNC
, new C_MDS_RetryRequest(mdcache
, mdr
));
4200 do_open_truncate(mdr
, cmode
);
4204 // sync filelock if snapped.
4205 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
4206 // and that data itself is flushed so that we can read the snapped data off disk.
4207 if (mdr
->snapid
!= CEPH_NOSNAP
&& !cur
->is_dir()) {
4208 lov
.add_rdlock(&cur
->filelock
);
4211 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4215 if (cmode
& CEPH_FILE_MODE_WR
)
4217 if (!check_access(mdr
, cur
, mask
))
4220 utime_t now
= ceph_clock_now();
4221 mdr
->set_mds_stamp(now
);
4223 if (cur
->is_file() || cur
->is_dir()) {
4224 if (mdr
->snapid
== CEPH_NOSNAP
) {
4226 Capability
*cap
= mds
->locker
->issue_new_caps(cur
, cmode
, mdr
, nullptr);
4228 dout(12) << "open issued caps " << ccap_string(cap
->pending())
4229 << " for " << req
->get_source()
4230 << " on " << *cur
<< dendl
;
4232 int caps
= ceph_caps_for_mode(cmode
);
4233 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps
)
4234 << " for " << req
->get_source()
4235 << " snapid " << mdr
->snapid
4236 << " on " << *cur
<< dendl
;
4237 mdr
->snap_caps
= caps
;
4241 // increase max_size?
4242 if (cmode
& CEPH_FILE_MODE_WR
)
4243 mds
->locker
->check_inode_max_size(cur
);
4245 // make sure this inode gets into the journal
4246 if (cur
->is_auth() && cur
->last
== CEPH_NOSNAP
&&
4247 mdcache
->open_file_table
.should_log_open(cur
)) {
4248 EOpen
*le
= new EOpen(mds
->mdlog
);
4249 mdlog
->start_entry(le
);
4250 le
->add_clean_inode(cur
);
4251 mdlog
->submit_entry(le
);
4255 if (cmode
& CEPH_FILE_MODE_WR
)
4256 mds
->balancer
->hit_inode(cur
, META_POP_IWR
);
4258 mds
->balancer
->hit_inode(cur
, META_POP_IRD
,
4259 mdr
->client_request
->get_source().num());
4262 if (req
->get_dentry_wanted()) {
4263 ceph_assert(mdr
->dn
[0].size());
4264 dn
= mdr
->dn
[0].back();
4269 respond_to_request(mdr
, 0);
4272 class C_MDS_openc_finish
: public ServerLogContext
{
4276 C_MDS_openc_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ni
) :
4277 ServerLogContext(s
, r
), dn(d
), newi(ni
) {}
4278 void finish(int r
) override
{
4279 ceph_assert(r
== 0);
4281 dn
->pop_projected_linkage();
4283 // dirty inode, dn, dir
4284 newi
->mark_dirty(mdr
->ls
);
4285 newi
->mark_dirty_parent(mdr
->ls
, true);
4289 get_mds()->locker
->share_inode_max_size(newi
);
4291 MDRequestRef null_ref
;
4292 get_mds()->mdcache
->send_dentry_link(dn
, null_ref
);
4294 get_mds()->balancer
->hit_inode(newi
, META_POP_IWR
);
4296 server
->respond_to_request(mdr
, 0);
4298 ceph_assert(g_conf()->mds_kill_openc_at
!= 1);
4302 /* This function takes responsibility for the passed mdr*/
4303 void Server::handle_client_openc(MDRequestRef
& mdr
)
4305 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4306 client_t client
= mdr
->get_client();
4308 dout(7) << "open w/ O_CREAT on " << req
->get_filepath() << dendl
;
4310 int cmode
= ceph_flags_to_mode(req
->head
.args
.open
.flags
);
4312 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4316 bool excl
= req
->head
.args
.open
.flags
& CEPH_O_EXCL
;
4317 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, true, !excl
, true);
4321 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
4322 if (!excl
&& !dnl
->is_null()) {
4324 mds
->locker
->xlock_downgrade(&dn
->lock
, mdr
.get());
4326 MutationImpl::LockOpVec lov
;
4327 lov
.add_rdlock(&dnl
->get_inode()->snaplock
);
4328 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4331 handle_client_open(mdr
);
4335 ceph_assert(dnl
->is_null());
4337 if (req
->get_alternate_name().size() > alternate_name_max
) {
4338 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
4339 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
4342 dn
->set_alternate_name(req
->get_alternate_name());
4345 file_layout_t layout
;
4346 if (mdr
->dir_layout
!= file_layout_t())
4347 layout
= mdr
->dir_layout
;
4349 layout
= mdcache
->default_file_layout
;
4351 // What kind of client caps are required to complete this operation
4352 uint64_t access
= MAY_WRITE
;
4354 const auto default_layout
= layout
;
4356 // fill in any special params from client
4357 if (req
->head
.args
.open
.stripe_unit
)
4358 layout
.stripe_unit
= req
->head
.args
.open
.stripe_unit
;
4359 if (req
->head
.args
.open
.stripe_count
)
4360 layout
.stripe_count
= req
->head
.args
.open
.stripe_count
;
4361 if (req
->head
.args
.open
.object_size
)
4362 layout
.object_size
= req
->head
.args
.open
.object_size
;
4363 if (req
->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID
) &&
4364 (__s32
)req
->head
.args
.open
.pool
>= 0) {
4365 layout
.pool_id
= req
->head
.args
.open
.pool
;
4367 // make sure we have as new a map as the client
4368 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
4369 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
4374 // If client doesn't have capability to modify layout pools, then
4375 // only permit this request if the requested pool matches what the
4376 // file would have inherited anyway from its parent.
4377 if (default_layout
!= layout
) {
4378 access
|= MAY_SET_VXATTR
;
4381 if (!layout
.is_valid()) {
4382 dout(10) << " invalid initial file layout" << dendl
;
4383 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4386 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
4387 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
4388 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4393 CDir
*dir
= dn
->get_dir();
4394 CInode
*diri
= dir
->get_inode();
4395 if (!check_access(mdr
, diri
, access
))
4397 if (!check_fragment_space(mdr
, dir
))
4400 if (mdr
->dn
[0].size() == 1)
4401 mds
->locker
->create_lock_cache(mdr
, diri
, &mdr
->dir_layout
);
4404 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
),
4405 req
->head
.args
.open
.mode
| S_IFREG
, &layout
);
4409 dn
->push_projected_linkage(newi
);
4411 auto _inode
= newi
->_get_inode();
4412 _inode
->version
= dn
->pre_dirty();
4413 if (layout
.pool_id
!= mdcache
->default_file_layout
.pool_id
)
4414 _inode
->add_old_pool(mdcache
->default_file_layout
.pool_id
);
4415 _inode
->update_backtrace();
4416 _inode
->rstat
.rfiles
= 1;
4417 _inode
->accounted_rstat
= _inode
->rstat
;
4419 SnapRealm
*realm
= diri
->find_snaprealm();
4420 snapid_t follows
= mdcache
->get_global_snaprealm()->get_newest_seq();
4421 ceph_assert(follows
>= realm
->get_newest_seq());
4423 ceph_assert(dn
->first
== follows
+1);
4424 newi
->first
= dn
->first
;
4427 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
, realm
);
4428 newi
->authlock
.set_state(LOCK_EXCL
);
4429 newi
->xattrlock
.set_state(LOCK_EXCL
);
4431 if (cap
&& (cmode
& CEPH_FILE_MODE_WR
)) {
4432 _inode
->client_ranges
[client
].range
.first
= 0;
4433 _inode
->client_ranges
[client
].range
.last
= _inode
->layout
.stripe_unit
;
4434 _inode
->client_ranges
[client
].follows
= follows
;
4435 newi
->mark_clientwriteable();
4436 cap
->mark_clientwriteable();
4440 mdr
->ls
= mdlog
->get_current_segment();
4441 EUpdate
*le
= new EUpdate(mdlog
, "openc");
4442 mdlog
->start_entry(le
);
4443 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4444 journal_allocated_inos(mdr
, &le
->metablob
);
4445 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
4446 le
->metablob
.add_primary_dentry(dn
, newi
, true, true, true);
4448 // make sure this inode gets into the journal
4449 le
->metablob
.add_opened_ino(newi
->ino());
4451 C_MDS_openc_finish
*fin
= new C_MDS_openc_finish(this, mdr
, dn
, newi
);
4453 if (mdr
->session
->info
.has_feature(CEPHFS_FEATURE_DELEG_INO
)) {
4454 openc_response_t ocresp
;
4456 dout(10) << "adding created_ino and delegated_inos" << dendl
;
4457 ocresp
.created_ino
= _inode
->ino
;
4459 if (delegate_inos_pct
&& !req
->is_queued_for_replay()) {
4460 // Try to delegate some prealloc_inos to the client, if it's down to half the max
4461 unsigned frac
= 100 / delegate_inos_pct
;
4462 if (mdr
->session
->delegated_inos
.size() < (unsigned)g_conf()->mds_client_prealloc_inos
/ frac
/ 2)
4463 mdr
->session
->delegate_inos(g_conf()->mds_client_prealloc_inos
/ frac
, ocresp
.delegated_inos
);
4466 encode(ocresp
, mdr
->reply_extra_bl
);
4467 } else if (mdr
->client_request
->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE
)) {
4468 dout(10) << "adding ino to reply to indicate inode was created" << dendl
;
4469 // add the file created flag onto the reply if create_flags features is supported
4470 encode(newi
->ino(), mdr
->reply_extra_bl
);
4473 journal_and_reply(mdr
, newi
, dn
, le
, fin
);
4475 // We hit_dir (via hit_inode) in our finish callback, but by then we might
4476 // have overshot the split size (multiple opencs in flight), so here is
4477 // an early chance to split the dir if this openc makes it oversized.
4478 mds
->balancer
->maybe_fragment(dir
, false);
4483 void Server::handle_client_readdir(MDRequestRef
& mdr
)
4485 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4486 Session
*session
= mds
->get_session(req
);
4487 client_t client
= req
->get_source().num();
4488 MutationImpl::LockOpVec lov
;
4489 CInode
*diri
= rdlock_path_pin_ref(mdr
, false, true);
4492 // it's a directory, right?
4493 if (!diri
->is_dir()) {
4495 dout(10) << "reply to " << *req
<< " readdir -CEPHFS_ENOTDIR" << dendl
;
4496 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
4500 auto num_caps
= session
->get_num_caps();
4501 auto session_cap_acquisition
= session
->get_cap_acquisition();
4503 if (num_caps
> static_cast<uint64_t>(max_caps_per_client
* max_caps_throttle_ratio
) && session_cap_acquisition
>= cap_acquisition_throttle
) {
4504 dout(20) << "readdir throttled. max_caps_per_client: " << max_caps_per_client
<< " num_caps: " << num_caps
4505 << " session_cap_acquistion: " << session_cap_acquisition
<< " cap_acquisition_throttle: " << cap_acquisition_throttle
<< dendl
;
4507 logger
->inc(l_mdss_cap_acquisition_throttle
);
4509 mds
->timer
.add_event_after(caps_throttle_retry_request_timeout
, new C_MDS_RetryRequest(mdcache
, mdr
));
4513 lov
.add_rdlock(&diri
->filelock
);
4514 lov
.add_rdlock(&diri
->dirfragtreelock
);
4516 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4519 if (!check_access(mdr
, diri
, MAY_READ
))
4523 frag_t fg
= (__u32
)req
->head
.args
.readdir
.frag
;
4524 unsigned req_flags
= (__u32
)req
->head
.args
.readdir
.flags
;
4525 string offset_str
= req
->get_path2();
4527 __u32 offset_hash
= 0;
4528 if (!offset_str
.empty())
4529 offset_hash
= ceph_frag_value(diri
->hash_dentry_name(offset_str
));
4531 offset_hash
= (__u32
)req
->head
.args
.readdir
.offset_hash
;
4533 dout(10) << " frag " << fg
<< " offset '" << offset_str
<< "'"
4534 << " offset_hash " << offset_hash
<< " flags " << req_flags
<< dendl
;
4536 // does the frag exist?
4537 if (diri
->dirfragtree
[fg
.value()] != fg
) {
4539 if (req_flags
& CEPH_READDIR_REPLY_BITFLAGS
) {
4540 if (fg
.contains((unsigned)offset_hash
)) {
4541 newfg
= diri
->dirfragtree
[offset_hash
];
4543 // client actually wants next frag
4544 newfg
= diri
->dirfragtree
[fg
.value()];
4548 newfg
= diri
->dirfragtree
[fg
.value()];
4550 dout(10) << " adjust frag " << fg
<< " -> " << newfg
<< " " << diri
->dirfragtree
<< dendl
;
4554 CDir
*dir
= try_open_auth_dirfrag(diri
, fg
, mdr
);
4558 dout(10) << "handle_client_readdir on " << *dir
<< dendl
;
4559 ceph_assert(dir
->is_auth());
4561 if (!dir
->is_complete()) {
4562 if (dir
->is_frozen()) {
4563 dout(7) << "dir is frozen " << *dir
<< dendl
;
4564 mds
->locker
->drop_locks(mdr
.get());
4565 mdr
->drop_local_auth_pins();
4566 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
4570 dout(10) << " incomplete dir contents for readdir on " << *dir
<< ", fetching" << dendl
;
4571 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
), true);
4575 #ifdef MDS_VERIFY_FRAGSTAT
4576 dir
->verify_fragstat();
4579 utime_t now
= ceph_clock_now();
4580 mdr
->set_mds_stamp(now
);
4582 snapid_t snapid
= mdr
->snapid
;
4583 dout(10) << "snapid " << snapid
<< dendl
;
4585 SnapRealm
*realm
= diri
->find_snaprealm();
4587 unsigned max
= req
->head
.args
.readdir
.max_entries
;
4589 max
= dir
->get_num_any(); // whatever, something big.
4590 unsigned max_bytes
= req
->head
.args
.readdir
.max_bytes
;
4592 // make sure at least one item can be encoded
4593 max_bytes
= (512 << 10) + g_conf()->mds_max_xattr_pairs_size
;
4598 ds
.frag
= dir
->get_frag();
4599 ds
.auth
= dir
->get_dir_auth().first
;
4600 if (dir
->is_auth() && !forward_all_requests_to_auth
)
4601 dir
->get_dist_spec(ds
.dist
, mds
->get_nodeid());
4603 dir
->encode_dirstat(dirbl
, mdr
->session
->info
, ds
);
4605 // count bytes available.
4606 // this isn't perfect, but we should capture the main variable/unbounded size items!
4607 int front_bytes
= dirbl
.length() + sizeof(__u32
) + sizeof(__u8
)*2;
4608 int bytes_left
= max_bytes
- front_bytes
;
4609 bytes_left
-= realm
->get_snap_trace().length();
4611 // build dir contents
4614 bool start
= !offset_hash
&& offset_str
.empty();
4615 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
4616 dentry_key_t
skip_key(snapid
, offset_str
.c_str(), offset_hash
);
4617 auto it
= start
? dir
->begin() : dir
->lower_bound(skip_key
);
4618 bool end
= (it
== dir
->end());
4619 for (; !end
&& numfiles
< max
; end
= (it
== dir
->end())) {
4620 CDentry
*dn
= it
->second
;
4623 if (dn
->state_test(CDentry::STATE_PURGING
))
4626 bool dnp
= dn
->use_projected(client
, mdr
);
4627 CDentry::linkage_t
*dnl
= dnp
? dn
->get_projected_linkage() : dn
->get_linkage();
4632 if (dn
->last
< snapid
|| dn
->first
> snapid
) {
4633 dout(20) << "skipping non-overlapping snap " << *dn
<< dendl
;
4638 dentry_key_t
offset_key(dn
->last
, offset_str
.c_str(), offset_hash
);
4639 if (!(offset_key
< dn
->key()))
4643 CInode
*in
= dnl
->get_inode();
4645 if (in
&& in
->ino() == CEPH_INO_CEPH
)
4649 // better for the MDS to do the work, if we think the client will stat any of these files.
4650 if (dnl
->is_remote() && !in
) {
4651 in
= mdcache
->get_inode(dnl
->get_remote_ino());
4653 dn
->link_remote(dnl
, in
);
4654 } else if (dn
->state_test(CDentry::STATE_BADREMOTEINO
)) {
4655 dout(10) << "skipping bad remote ino on " << *dn
<< dendl
;
4658 // touch everything i _do_ have
4659 for (auto &p
: *dir
) {
4660 if (!p
.second
->get_linkage()->is_null())
4661 mdcache
->lru
.lru_touch(p
.second
);
4664 // already issued caps and leases, reply immediately.
4665 if (dnbl
.length() > 0) {
4666 mdcache
->open_remote_dentry(dn
, dnp
, new C_MDSInternalNoop
);
4667 dout(10) << " open remote dentry after caps were issued, stopping at "
4668 << dnbl
.length() << " < " << bytes_left
<< dendl
;
4672 mds
->locker
->drop_locks(mdr
.get());
4673 mdr
->drop_local_auth_pins();
4674 mdcache
->open_remote_dentry(dn
, dnp
, new C_MDS_RetryRequest(mdcache
, mdr
));
4680 if ((int)(dnbl
.length() + dn
->get_name().length() + sizeof(__u32
) + sizeof(LeaseStat
)) > bytes_left
) {
4681 dout(10) << " ran out of room, stopping at " << dnbl
.length() << " < " << bytes_left
<< dendl
;
4685 unsigned start_len
= dnbl
.length();
4688 dout(12) << "including dn " << *dn
<< dendl
;
4689 encode(dn
->get_name(), dnbl
);
4690 int lease_mask
= dnl
->is_primary() ? CEPH_LEASE_PRIMARY_LINK
: 0;
4691 mds
->locker
->issue_client_lease(dn
, mdr
, lease_mask
, now
, dnbl
);
4694 dout(12) << "including inode " << *in
<< dendl
;
4695 int r
= in
->encode_inodestat(dnbl
, mdr
->session
, realm
, snapid
, bytes_left
- (int)dnbl
.length());
4697 // chop off dn->name, lease
4698 dout(10) << " ran out of room, stopping at " << start_len
<< " < " << bytes_left
<< dendl
;
4700 keep
.substr_of(dnbl
, 0, start_len
);
4704 ceph_assert(r
>= 0);
4708 mdcache
->lru
.lru_touch(dn
);
4711 session
->touch_readdir_cap(numfiles
);
4715 flags
= CEPH_READDIR_FRAG_END
;
4717 flags
|= CEPH_READDIR_FRAG_COMPLETE
; // FIXME: what purpose does this serve
4719 // client only understand END and COMPLETE flags ?
4720 if (req_flags
& CEPH_READDIR_REPLY_BITFLAGS
) {
4721 flags
|= CEPH_READDIR_HASH_ORDER
| CEPH_READDIR_OFFSET_HASH
;
4724 // finish final blob
4725 encode(numfiles
, dirbl
);
4726 encode(flags
, dirbl
);
4727 dirbl
.claim_append(dnbl
);
4730 dout(10) << "reply to " << *req
<< " readdir num=" << numfiles
4731 << " bytes=" << dirbl
.length()
4732 << " start=" << (int)start
4733 << " end=" << (int)end
4735 mdr
->reply_extra_bl
= dirbl
;
4737 // bump popularity. NOTE: this doesn't quite capture it.
4738 mds
->balancer
->hit_dir(dir
, META_POP_IRD
, -1, numfiles
);
4742 respond_to_request(mdr
, 0);
4747 // ===============================================================================
4752 * finisher for basic inode updates
4754 class C_MDS_inode_update_finish
: public ServerLogContext
{
4756 bool truncating_smaller
, changed_ranges
, adjust_realm
;
4758 C_MDS_inode_update_finish(Server
*s
, MDRequestRef
& r
, CInode
*i
,
4759 bool sm
=false, bool cr
=false, bool ar
=false) :
4760 ServerLogContext(s
, r
), in(i
),
4761 truncating_smaller(sm
), changed_ranges(cr
), adjust_realm(ar
) { }
4762 void finish(int r
) override
{
4763 ceph_assert(r
== 0);
4765 int snap_op
= (in
->snaprealm
? CEPH_SNAP_OP_UPDATE
: CEPH_SNAP_OP_SPLIT
);
4770 MDSRank
*mds
= get_mds();
4772 // notify any clients
4773 if (truncating_smaller
&& in
->get_inode()->is_truncating()) {
4774 mds
->locker
->issue_truncate(in
);
4775 mds
->mdcache
->truncate_inode(in
, mdr
->ls
);
4779 mds
->mdcache
->send_snap_update(in
, 0, snap_op
);
4780 mds
->mdcache
->do_realm_invalidate_and_update_notify(in
, snap_op
);
4783 get_mds()->balancer
->hit_inode(in
, META_POP_IWR
);
4785 server
->respond_to_request(mdr
, 0);
4788 get_mds()->locker
->share_inode_max_size(in
);
4792 void Server::handle_client_file_setlock(MDRequestRef
& mdr
)
4794 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4795 MutationImpl::LockOpVec lov
;
4797 // get the inode to operate on, and set up any locks needed for that
4798 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
4802 lov
.add_xlock(&cur
->flocklock
);
4803 /* acquire_locks will return true if it gets the locks. If it fails,
4804 it will redeliver this request at a later date, so drop the request.
4806 if (!mds
->locker
->acquire_locks(mdr
, lov
)) {
4807 dout(10) << "handle_client_file_setlock could not get locks!" << dendl
;
4811 // copy the lock change into a ceph_filelock so we can store/apply it
4812 ceph_filelock set_lock
;
4813 set_lock
.start
= req
->head
.args
.filelock_change
.start
;
4814 set_lock
.length
= req
->head
.args
.filelock_change
.length
;
4815 set_lock
.client
= req
->get_orig_source().num();
4816 set_lock
.owner
= req
->head
.args
.filelock_change
.owner
;
4817 set_lock
.pid
= req
->head
.args
.filelock_change
.pid
;
4818 set_lock
.type
= req
->head
.args
.filelock_change
.type
;
4819 bool will_wait
= req
->head
.args
.filelock_change
.wait
;
4821 dout(10) << "handle_client_file_setlock: " << set_lock
<< dendl
;
4823 ceph_lock_state_t
*lock_state
= NULL
;
4824 bool interrupt
= false;
4826 // get the appropriate lock state
4827 switch (req
->head
.args
.filelock_change
.rule
) {
4828 case CEPH_LOCK_FLOCK_INTR
:
4831 case CEPH_LOCK_FLOCK
:
4832 lock_state
= cur
->get_flock_lock_state();
4835 case CEPH_LOCK_FCNTL_INTR
:
4838 case CEPH_LOCK_FCNTL
:
4839 lock_state
= cur
->get_fcntl_lock_state();
4843 dout(10) << "got unknown lock type " << set_lock
.type
4844 << ", dropping request!" << dendl
;
4845 respond_to_request(mdr
, -CEPHFS_EOPNOTSUPP
);
4849 dout(10) << " state prior to lock change: " << *lock_state
<< dendl
;
4850 if (CEPH_LOCK_UNLOCK
== set_lock
.type
) {
4851 list
<ceph_filelock
> activated_locks
;
4852 MDSContext::vec waiters
;
4853 if (lock_state
->is_waiting(set_lock
)) {
4854 dout(10) << " unlock removing waiting lock " << set_lock
<< dendl
;
4855 lock_state
->remove_waiting(set_lock
);
4856 cur
->take_waiting(CInode::WAIT_FLOCK
, waiters
);
4857 } else if (!interrupt
) {
4858 dout(10) << " unlock attempt on " << set_lock
<< dendl
;
4859 lock_state
->remove_lock(set_lock
, activated_locks
);
4860 cur
->take_waiting(CInode::WAIT_FLOCK
, waiters
);
4862 mds
->queue_waiters(waiters
);
4864 respond_to_request(mdr
, 0);
4866 dout(10) << " lock attempt on " << set_lock
<< dendl
;
4867 bool deadlock
= false;
4868 if (mdr
->more()->flock_was_waiting
&&
4869 !lock_state
->is_waiting(set_lock
)) {
4870 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock
<< dendl
;
4871 respond_to_request(mdr
, -CEPHFS_EINTR
);
4872 } else if (!lock_state
->add_lock(set_lock
, will_wait
, mdr
->more()->flock_was_waiting
, &deadlock
)) {
4873 dout(10) << " it failed on this attempt" << dendl
;
4874 // couldn't set lock right now
4876 respond_to_request(mdr
, -CEPHFS_EDEADLK
);
4877 } else if (!will_wait
) {
4878 respond_to_request(mdr
, -CEPHFS_EWOULDBLOCK
);
4880 dout(10) << " added to waiting list" << dendl
;
4881 ceph_assert(lock_state
->is_waiting(set_lock
));
4882 mdr
->more()->flock_was_waiting
= true;
4883 mds
->locker
->drop_locks(mdr
.get());
4884 mdr
->drop_local_auth_pins();
4885 mdr
->mark_event("failed to add lock, waiting");
4887 cur
->add_waiter(CInode::WAIT_FLOCK
, new C_MDS_RetryRequest(mdcache
, mdr
));
4890 respond_to_request(mdr
, 0);
4892 dout(10) << " state after lock change: " << *lock_state
<< dendl
;
4895 void Server::handle_client_file_readlock(MDRequestRef
& mdr
)
4897 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4898 MutationImpl::LockOpVec lov
;
4900 // get the inode to operate on, and set up any locks needed for that
4901 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
4905 /* acquire_locks will return true if it gets the locks. If it fails,
4906 it will redeliver this request at a later date, so drop the request.
4908 lov
.add_rdlock(&cur
->flocklock
);
4909 if (!mds
->locker
->acquire_locks(mdr
, lov
)) {
4910 dout(10) << "handle_client_file_readlock could not get locks!" << dendl
;
4914 // copy the lock change into a ceph_filelock so we can store/apply it
4915 ceph_filelock checking_lock
;
4916 checking_lock
.start
= req
->head
.args
.filelock_change
.start
;
4917 checking_lock
.length
= req
->head
.args
.filelock_change
.length
;
4918 checking_lock
.client
= req
->get_orig_source().num();
4919 checking_lock
.owner
= req
->head
.args
.filelock_change
.owner
;
4920 checking_lock
.pid
= req
->head
.args
.filelock_change
.pid
;
4921 checking_lock
.type
= req
->head
.args
.filelock_change
.type
;
4923 // get the appropriate lock state
4924 ceph_lock_state_t
*lock_state
= NULL
;
4925 switch (req
->head
.args
.filelock_change
.rule
) {
4926 case CEPH_LOCK_FLOCK
:
4927 lock_state
= cur
->get_flock_lock_state();
4930 case CEPH_LOCK_FCNTL
:
4931 lock_state
= cur
->get_fcntl_lock_state();
4935 dout(10) << "got unknown lock type " << checking_lock
.type
<< dendl
;
4936 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4939 lock_state
->look_for_lock(checking_lock
);
4942 encode(checking_lock
, lock_bl
);
4944 mdr
->reply_extra_bl
= lock_bl
;
4945 respond_to_request(mdr
, 0);
4948 void Server::handle_client_setattr(MDRequestRef
& mdr
)
4950 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4951 MutationImpl::LockOpVec lov
;
4952 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
4955 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4956 respond_to_request(mdr
, -CEPHFS_EROFS
);
4959 if (cur
->ino() < MDS_INO_SYSTEM_BASE
&& !cur
->is_base()) {
4960 respond_to_request(mdr
, -CEPHFS_EPERM
);
4964 __u32 mask
= req
->head
.args
.setattr
.mask
;
4965 __u32 access_mask
= MAY_WRITE
;
4968 if (mask
& (CEPH_SETATTR_MODE
|CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_BTIME
|CEPH_SETATTR_KILL_SGUID
))
4969 lov
.add_xlock(&cur
->authlock
);
4970 if (mask
& (CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
|CEPH_SETATTR_SIZE
))
4971 lov
.add_xlock(&cur
->filelock
);
4972 if (mask
& CEPH_SETATTR_CTIME
)
4973 lov
.add_wrlock(&cur
->versionlock
);
4975 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4978 if ((mask
& CEPH_SETATTR_UID
) && (cur
->get_inode()->uid
!= req
->head
.args
.setattr
.uid
))
4979 access_mask
|= MAY_CHOWN
;
4981 if ((mask
& CEPH_SETATTR_GID
) && (cur
->get_inode()->gid
!= req
->head
.args
.setattr
.gid
))
4982 access_mask
|= MAY_CHGRP
;
4984 if (!check_access(mdr
, cur
, access_mask
))
4987 // trunc from bigger -> smaller?
4988 const auto& pip
= cur
->get_projected_inode();
4990 uint64_t old_size
= std::max
<uint64_t>(pip
->size
, req
->head
.args
.setattr
.old_size
);
4992 // CEPHFS_ENOSPC on growing file while full, but allow shrinks
4993 if (is_full
&& req
->head
.args
.setattr
.size
> old_size
) {
4994 dout(20) << __func__
<< ": full, responding CEPHFS_ENOSPC to setattr with larger size" << dendl
;
4995 respond_to_request(mdr
, -CEPHFS_ENOSPC
);
4999 bool truncating_smaller
= false;
5000 if (mask
& CEPH_SETATTR_SIZE
) {
5001 truncating_smaller
= req
->head
.args
.setattr
.size
< old_size
;
5002 if (truncating_smaller
&& pip
->is_truncating()) {
5003 dout(10) << " waiting for pending truncate from " << pip
->truncate_from
5004 << " to " << pip
->truncate_size
<< " to complete on " << *cur
<< dendl
;
5005 mds
->locker
->drop_locks(mdr
.get());
5006 mdr
->drop_local_auth_pins();
5007 cur
->add_waiter(CInode::WAIT_TRUNC
, new C_MDS_RetryRequest(mdcache
, mdr
));
5012 bool changed_ranges
= false;
5015 mdr
->ls
= mdlog
->get_current_segment();
5016 EUpdate
*le
= new EUpdate(mdlog
, "setattr");
5017 mdlog
->start_entry(le
);
5019 auto pi
= cur
->project_inode(mdr
);
5021 if (mask
& CEPH_SETATTR_UID
)
5022 pi
.inode
->uid
= req
->head
.args
.setattr
.uid
;
5023 if (mask
& CEPH_SETATTR_GID
)
5024 pi
.inode
->gid
= req
->head
.args
.setattr
.gid
;
5026 if (mask
& CEPH_SETATTR_MODE
)
5027 pi
.inode
->mode
= (pi
.inode
->mode
& ~07777) | (req
->head
.args
.setattr
.mode
& 07777);
5028 else if ((mask
& (CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_KILL_SGUID
)) &&
5029 S_ISREG(pi
.inode
->mode
) &&
5030 (pi
.inode
->mode
& (S_IXUSR
|S_IXGRP
|S_IXOTH
))) {
5031 pi
.inode
->mode
&= ~(S_ISUID
|S_ISGID
);
5034 if (mask
& CEPH_SETATTR_MTIME
)
5035 pi
.inode
->mtime
= req
->head
.args
.setattr
.mtime
;
5036 if (mask
& CEPH_SETATTR_ATIME
)
5037 pi
.inode
->atime
= req
->head
.args
.setattr
.atime
;
5038 if (mask
& CEPH_SETATTR_BTIME
)
5039 pi
.inode
->btime
= req
->head
.args
.setattr
.btime
;
5040 if (mask
& (CEPH_SETATTR_ATIME
| CEPH_SETATTR_MTIME
| CEPH_SETATTR_BTIME
))
5041 pi
.inode
->time_warp_seq
++; // maybe not a timewarp, but still a serialization point.
5042 if (mask
& CEPH_SETATTR_SIZE
) {
5043 if (truncating_smaller
) {
5044 pi
.inode
->truncate(old_size
, req
->head
.args
.setattr
.size
);
5045 le
->metablob
.add_truncate_start(cur
->ino());
5047 pi
.inode
->size
= req
->head
.args
.setattr
.size
;
5048 pi
.inode
->rstat
.rbytes
= pi
.inode
->size
;
5050 pi
.inode
->mtime
= mdr
->get_op_stamp();
5052 // adjust client's max_size?
5053 if (mds
->locker
->calc_new_client_ranges(cur
, pi
.inode
->size
)) {
5054 dout(10) << " client_ranges " << cur
->get_previous_projected_inode()->client_ranges
5055 << " -> " << pi
.inode
->client_ranges
<< dendl
;
5056 changed_ranges
= true;
5060 pi
.inode
->version
= cur
->pre_dirty();
5061 pi
.inode
->ctime
= mdr
->get_op_stamp();
5062 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
5063 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
5064 pi
.inode
->change_attr
++;
5067 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5068 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5069 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5071 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
,
5072 truncating_smaller
, changed_ranges
));
5074 // flush immediately if there are readers/writers waiting
5075 if (mdr
->is_xlocked(&cur
->filelock
) &&
5076 (cur
->get_caps_wanted() & (CEPH_CAP_FILE_RD
|CEPH_CAP_FILE_WR
)))
5077 mds
->mdlog
->flush();
5080 /* Takes responsibility for mdr */
5081 void Server::do_open_truncate(MDRequestRef
& mdr
, int cmode
)
5083 CInode
*in
= mdr
->in
[0];
5084 client_t client
= mdr
->get_client();
5087 dout(10) << "do_open_truncate " << *in
<< dendl
;
5089 SnapRealm
*realm
= in
->find_snaprealm();
5090 Capability
*cap
= mds
->locker
->issue_new_caps(in
, cmode
, mdr
, realm
);
5092 mdr
->ls
= mdlog
->get_current_segment();
5093 EUpdate
*le
= new EUpdate(mdlog
, "open_truncate");
5094 mdlog
->start_entry(le
);
5097 auto pi
= in
->project_inode(mdr
);
5098 pi
.inode
->version
= in
->pre_dirty();
5099 pi
.inode
->mtime
= pi
.inode
->ctime
= mdr
->get_op_stamp();
5100 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
5101 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
5102 pi
.inode
->change_attr
++;
5104 uint64_t old_size
= std::max
<uint64_t>(pi
.inode
->size
, mdr
->client_request
->head
.args
.open
.old_size
);
5106 pi
.inode
->truncate(old_size
, 0);
5107 le
->metablob
.add_truncate_start(in
->ino());
5110 bool changed_ranges
= false;
5111 if (cap
&& (cmode
& CEPH_FILE_MODE_WR
)) {
5112 pi
.inode
->client_ranges
[client
].range
.first
= 0;
5113 pi
.inode
->client_ranges
[client
].range
.last
= pi
.inode
->get_layout_size_increment();
5114 pi
.inode
->client_ranges
[client
].follows
= realm
->get_newest_seq();
5115 changed_ranges
= true;
5116 in
->mark_clientwriteable();
5117 cap
->mark_clientwriteable();
5120 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
5122 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
5123 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, in
);
5125 // make sure ino gets into the journal
5126 le
->metablob
.add_opened_ino(in
->ino());
5128 mdr
->o_trunc
= true;
5131 if (mdr
->client_request
->get_dentry_wanted()) {
5132 ceph_assert(mdr
->dn
[0].size());
5133 dn
= mdr
->dn
[0].back();
5136 journal_and_reply(mdr
, in
, dn
, le
, new C_MDS_inode_update_finish(this, mdr
, in
, old_size
> 0,
5138 // Although the `open` part can give an early reply, the truncation won't
5139 // happen until our EUpdate is persistent, to give the client a prompt
5140 // response we must also flush that event.
5145 /* This function cleans up the passed mdr */
5146 void Server::handle_client_setlayout(MDRequestRef
& mdr
)
5148 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5149 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
5152 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5153 respond_to_request(mdr
, -CEPHFS_EROFS
);
5156 if (!cur
->is_file()) {
5157 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5160 if (cur
->get_projected_inode()->size
||
5161 cur
->get_projected_inode()->truncate_seq
> 1) {
5162 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
5167 file_layout_t layout
= cur
->get_projected_inode()->layout
;
5168 // save existing layout for later
5169 const auto old_layout
= layout
;
5171 int access
= MAY_WRITE
;
5173 if (req
->head
.args
.setlayout
.layout
.fl_object_size
> 0)
5174 layout
.object_size
= req
->head
.args
.setlayout
.layout
.fl_object_size
;
5175 if (req
->head
.args
.setlayout
.layout
.fl_stripe_unit
> 0)
5176 layout
.stripe_unit
= req
->head
.args
.setlayout
.layout
.fl_stripe_unit
;
5177 if (req
->head
.args
.setlayout
.layout
.fl_stripe_count
> 0)
5178 layout
.stripe_count
=req
->head
.args
.setlayout
.layout
.fl_stripe_count
;
5179 if (req
->head
.args
.setlayout
.layout
.fl_pg_pool
> 0) {
5180 layout
.pool_id
= req
->head
.args
.setlayout
.layout
.fl_pg_pool
;
5182 // make sure we have as new a map as the client
5183 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
5184 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
5189 // Don't permit layout modifications without 'p' caps
5190 if (layout
!= old_layout
) {
5191 access
|= MAY_SET_VXATTR
;
5194 if (!layout
.is_valid()) {
5195 dout(10) << "bad layout" << dendl
;
5196 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5199 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
5200 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
5201 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5205 MutationImpl::LockOpVec lov
;
5206 lov
.add_xlock(&cur
->filelock
);
5207 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5210 if (!check_access(mdr
, cur
, access
))
5214 auto pi
= cur
->project_inode(mdr
);
5215 pi
.inode
->layout
= layout
;
5216 // add the old pool to the inode
5217 pi
.inode
->add_old_pool(old_layout
.pool_id
);
5218 pi
.inode
->version
= cur
->pre_dirty();
5219 pi
.inode
->ctime
= mdr
->get_op_stamp();
5220 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
5221 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
5222 pi
.inode
->change_attr
++;
5225 mdr
->ls
= mdlog
->get_current_segment();
5226 EUpdate
*le
= new EUpdate(mdlog
, "setlayout");
5227 mdlog
->start_entry(le
);
5228 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5229 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5230 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5232 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
5235 bool Server::xlock_policylock(MDRequestRef
& mdr
, CInode
*in
, bool want_layout
, bool xlock_snaplock
)
5237 if (mdr
->locking_state
& MutationImpl::ALL_LOCKED
)
5240 MutationImpl::LockOpVec lov
;
5241 lov
.add_xlock(&in
->policylock
);
5243 lov
.add_xlock(&in
->snaplock
);
5245 lov
.add_rdlock(&in
->snaplock
);
5246 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5249 if (want_layout
&& in
->get_projected_inode()->has_layout()) {
5250 mdr
->dir_layout
= in
->get_projected_inode()->layout
;
5251 want_layout
= false;
5253 if (CDentry
*pdn
= in
->get_projected_parent_dn(); pdn
) {
5254 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
, 0, want_layout
))
5258 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
5262 CInode
* Server::try_get_auth_inode(MDRequestRef
& mdr
, inodeno_t ino
)
5264 CInode
*in
= mdcache
->get_inode(ino
);
5265 if (!in
|| in
->state_test(CInode::STATE_PURGING
)) {
5266 respond_to_request(mdr
, -CEPHFS_ESTALE
);
5269 if (!in
->is_auth()) {
5270 mdcache
->request_forward(mdr
, in
->authority().first
);
5277 void Server::handle_client_setdirlayout(MDRequestRef
& mdr
)
5279 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5281 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5282 CInode
*cur
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
5286 if (!cur
->is_dir()) {
5287 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
5291 if (!xlock_policylock(mdr
, cur
, true))
5295 const auto& old_pi
= cur
->get_projected_inode();
5296 file_layout_t layout
;
5297 if (old_pi
->has_layout())
5298 layout
= old_pi
->layout
;
5299 else if (mdr
->dir_layout
!= file_layout_t())
5300 layout
= mdr
->dir_layout
;
5302 layout
= mdcache
->default_file_layout
;
5304 // Level of access required to complete
5305 int access
= MAY_WRITE
;
5307 const auto old_layout
= layout
;
5309 if (req
->head
.args
.setlayout
.layout
.fl_object_size
> 0)
5310 layout
.object_size
= req
->head
.args
.setlayout
.layout
.fl_object_size
;
5311 if (req
->head
.args
.setlayout
.layout
.fl_stripe_unit
> 0)
5312 layout
.stripe_unit
= req
->head
.args
.setlayout
.layout
.fl_stripe_unit
;
5313 if (req
->head
.args
.setlayout
.layout
.fl_stripe_count
> 0)
5314 layout
.stripe_count
=req
->head
.args
.setlayout
.layout
.fl_stripe_count
;
5315 if (req
->head
.args
.setlayout
.layout
.fl_pg_pool
> 0) {
5316 layout
.pool_id
= req
->head
.args
.setlayout
.layout
.fl_pg_pool
;
5317 // make sure we have as new a map as the client
5318 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
5319 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
5324 if (layout
!= old_layout
) {
5325 access
|= MAY_SET_VXATTR
;
5328 if (!layout
.is_valid()) {
5329 dout(10) << "bad layout" << dendl
;
5330 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5333 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
5334 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
5335 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5339 if (!check_access(mdr
, cur
, access
))
5342 auto pi
= cur
->project_inode(mdr
);
5343 pi
.inode
->layout
= layout
;
5344 pi
.inode
->version
= cur
->pre_dirty();
5347 mdr
->ls
= mdlog
->get_current_segment();
5348 EUpdate
*le
= new EUpdate(mdlog
, "setlayout");
5349 mdlog
->start_entry(le
);
5350 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5351 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5352 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5354 mdr
->no_early_reply
= true;
5355 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
5360 int Server::parse_layout_vxattr(string name
, string value
, const OSDMap
& osdmap
,
5361 file_layout_t
*layout
, bool validate
)
5363 dout(20) << "parse_layout_vxattr name " << name
<< " value '" << value
<< "'" << dendl
;
5365 if (name
== "layout") {
5366 string::iterator begin
= value
.begin();
5367 string::iterator end
= value
.end();
5368 keys_and_values
<string::iterator
> p
; // create instance of parser
5369 std::map
<string
, string
> m
; // map to receive results
5370 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
5371 return -CEPHFS_EINVAL
;
5373 string
left(begin
, end
);
5374 dout(10) << " parsed " << m
<< " left '" << left
<< "'" << dendl
;
5376 return -CEPHFS_EINVAL
;
5377 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
5378 // Skip validation on each attr, we do it once at the end (avoid
5379 // rejecting intermediate states if the overall result is ok)
5380 int r
= parse_layout_vxattr(string("layout.") + q
->first
, q
->second
,
5381 osdmap
, layout
, false);
5385 } else if (name
== "layout.object_size") {
5386 layout
->object_size
= boost::lexical_cast
<unsigned>(value
);
5387 } else if (name
== "layout.stripe_unit") {
5388 layout
->stripe_unit
= boost::lexical_cast
<unsigned>(value
);
5389 } else if (name
== "layout.stripe_count") {
5390 layout
->stripe_count
= boost::lexical_cast
<unsigned>(value
);
5391 } else if (name
== "layout.pool") {
5393 layout
->pool_id
= boost::lexical_cast
<unsigned>(value
);
5394 } catch (boost::bad_lexical_cast
const&) {
5395 int64_t pool
= osdmap
.lookup_pg_pool_name(value
);
5397 dout(10) << " unknown pool " << value
<< dendl
;
5398 return -CEPHFS_ENOENT
;
5400 layout
->pool_id
= pool
;
5402 } else if (name
== "layout.pool_namespace") {
5403 layout
->pool_ns
= value
;
5405 dout(10) << " unknown layout vxattr " << name
<< dendl
;
5406 return -CEPHFS_EINVAL
;
5408 } catch (boost::bad_lexical_cast
const&) {
5409 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
5410 return -CEPHFS_EINVAL
;
5413 if (validate
&& !layout
->is_valid()) {
5414 dout(10) << "bad layout" << dendl
;
5415 return -CEPHFS_EINVAL
;
5417 if (!mds
->mdsmap
->is_data_pool(layout
->pool_id
)) {
5418 dout(10) << " invalid data pool " << layout
->pool_id
<< dendl
;
5419 return -CEPHFS_EINVAL
;
5424 int Server::parse_quota_vxattr(string name
, string value
, quota_info_t
*quota
)
5426 dout(20) << "parse_quota_vxattr name " << name
<< " value '" << value
<< "'" << dendl
;
5428 if (name
== "quota") {
5429 string::iterator begin
= value
.begin();
5430 string::iterator end
= value
.end();
5432 // keep quota unchanged. (for create_quota_realm())
5435 keys_and_values
<string::iterator
> p
; // create instance of parser
5436 std::map
<string
, string
> m
; // map to receive results
5437 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
5438 return -CEPHFS_EINVAL
;
5440 string
left(begin
, end
);
5441 dout(10) << " parsed " << m
<< " left '" << left
<< "'" << dendl
;
5443 return -CEPHFS_EINVAL
;
5444 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
5445 int r
= parse_quota_vxattr(string("quota.") + q
->first
, q
->second
, quota
);
5449 } else if (name
== "quota.max_bytes") {
5450 int64_t q
= boost::lexical_cast
<int64_t>(value
);
5452 return -CEPHFS_EINVAL
;
5453 quota
->max_bytes
= q
;
5454 } else if (name
== "quota.max_files") {
5455 int64_t q
= boost::lexical_cast
<int64_t>(value
);
5457 return -CEPHFS_EINVAL
;
5458 quota
->max_files
= q
;
5460 dout(10) << " unknown quota vxattr " << name
<< dendl
;
5461 return -CEPHFS_EINVAL
;
5463 } catch (boost::bad_lexical_cast
const&) {
5464 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
5465 return -CEPHFS_EINVAL
;
5468 if (!quota
->is_valid()) {
5469 dout(10) << "bad quota" << dendl
;
5470 return -CEPHFS_EINVAL
;
5475 void Server::create_quota_realm(CInode
*in
)
5477 dout(10) << __func__
<< " " << *in
<< dendl
;
5479 auto req
= make_message
<MClientRequest
>(CEPH_MDS_OP_SETXATTR
);
5480 req
->set_filepath(filepath(in
->ino()));
5481 req
->set_string2("ceph.quota");
5482 // empty vxattr value
5483 req
->set_tid(mds
->issue_tid());
5485 mds
->send_message_mds(req
, in
->authority().first
);
5489 * Verify that the file layout attribute carried by client
5490 * is well-formatted.
5491 * Return 0 on success, otherwise this function takes
5492 * responsibility for the passed mdr.
5494 int Server::check_layout_vxattr(MDRequestRef
& mdr
,
5497 file_layout_t
*layout
)
5499 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5503 mds
->objecter
->with_osdmap([&](const OSDMap
& osdmap
) {
5504 r
= parse_layout_vxattr(name
, value
, osdmap
, layout
);
5505 epoch
= osdmap
.get_epoch();
5508 if (r
== -CEPHFS_ENOENT
) {
5510 // we don't have the specified pool, make sure our map
5511 // is newer than or as new as the client.
5512 epoch_t req_epoch
= req
->get_osdmap_epoch();
5514 if (req_epoch
> epoch
) {
5516 // well, our map is older. consult mds.
5517 auto fin
= new C_IO_Wrapper(mds
, new C_MDS_RetryRequest(mdcache
, mdr
));
5519 mds
->objecter
->wait_for_map(req_epoch
, lambdafy(fin
));
5521 } else if (req_epoch
== 0 && !mdr
->waited_for_osdmap
) {
5523 // For compatibility with client w/ old code, we still need get the
5524 // latest map. One day if COMPACT_VERSION of MClientRequest >=3,
5525 // we can remove those code.
5526 mdr
->waited_for_osdmap
= true;
5527 mds
->objecter
->wait_for_latest_osdmap(std::ref(*new C_IO_Wrapper(
5528 mds
, new C_MDS_RetryRequest(mdcache
, mdr
))));
5535 if (r
== -CEPHFS_ENOENT
)
5538 respond_to_request(mdr
, r
);
5546 void Server::handle_set_vxattr(MDRequestRef
& mdr
, CInode
*cur
)
5548 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5549 string
name(req
->get_path2());
5550 bufferlist bl
= req
->get_data();
5551 string
value (bl
.c_str(), bl
.length());
5552 dout(10) << "handle_set_vxattr " << name
5553 << " val " << value
.length()
5554 << " bytes on " << *cur
5557 CInode::mempool_inode
*pip
= nullptr;
5560 if (!check_access(mdr
, cur
, MAY_SET_VXATTR
)) {
5564 bool adjust_realm
= false;
5565 if (name
.compare(0, 15, "ceph.dir.layout") == 0) {
5566 if (!cur
->is_dir()) {
5567 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5571 if (!xlock_policylock(mdr
, cur
, true))
5574 file_layout_t layout
;
5575 if (cur
->get_projected_inode()->has_layout())
5576 layout
= cur
->get_projected_inode()->layout
;
5577 else if (mdr
->dir_layout
!= file_layout_t())
5578 layout
= mdr
->dir_layout
;
5580 layout
= mdcache
->default_file_layout
;
5582 rest
= name
.substr(name
.find("layout"));
5583 if (check_layout_vxattr(mdr
, rest
, value
, &layout
) < 0)
5586 auto pi
= cur
->project_inode(mdr
);
5587 pi
.inode
->layout
= layout
;
5588 mdr
->no_early_reply
= true;
5589 pip
= pi
.inode
.get();
5590 } else if (name
.compare(0, 16, "ceph.file.layout") == 0) {
5591 if (!cur
->is_file()) {
5592 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5595 if (cur
->get_projected_inode()->size
||
5596 cur
->get_projected_inode()->truncate_seq
> 1) {
5597 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
5600 file_layout_t layout
= cur
->get_projected_inode()->layout
;
5601 rest
= name
.substr(name
.find("layout"));
5602 if (check_layout_vxattr(mdr
, rest
, value
, &layout
) < 0)
5605 MutationImpl::LockOpVec lov
;
5606 lov
.add_xlock(&cur
->filelock
);
5607 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5610 auto pi
= cur
->project_inode(mdr
);
5611 int64_t old_pool
= pi
.inode
->layout
.pool_id
;
5612 pi
.inode
->add_old_pool(old_pool
);
5613 pi
.inode
->layout
= layout
;
5614 pip
= pi
.inode
.get();
5615 } else if (name
.compare(0, 10, "ceph.quota") == 0) {
5616 if (!cur
->is_dir()) {
5617 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5621 quota_info_t quota
= cur
->get_projected_inode()->quota
;
5623 rest
= name
.substr(name
.find("quota"));
5624 int r
= parse_quota_vxattr(rest
, value
, "a
);
5626 respond_to_request(mdr
, r
);
5630 if (quota
.is_enable() && !cur
->get_projected_srnode())
5631 adjust_realm
= true;
5633 if (!xlock_policylock(mdr
, cur
, false, adjust_realm
))
5636 if (cur
->get_projected_inode()->quota
== quota
) {
5637 respond_to_request(mdr
, 0);
5641 auto pi
= cur
->project_inode(mdr
, false, adjust_realm
);
5642 pi
.inode
->quota
= quota
;
5645 pi
.snapnode
->created
= pi
.snapnode
->seq
= cur
->find_snaprealm()->get_newest_seq();
5647 mdr
->no_early_reply
= true;
5648 pip
= pi
.inode
.get();
5650 client_t exclude_ct
= mdr
->get_client();
5651 mdcache
->broadcast_quota_to_client(cur
, exclude_ct
, true);
5652 } else if (name
== "ceph.dir.subvolume"sv
) {
5653 if (!cur
->is_dir()) {
5654 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5660 val
= boost::lexical_cast
<bool>(value
);
5661 } catch (boost::bad_lexical_cast
const&) {
5662 dout(10) << "bad vxattr value, unable to parse bool for " << name
<< dendl
;
5663 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5667 if (!xlock_policylock(mdr
, cur
, false, true))
5670 SnapRealm
*realm
= cur
->find_snaprealm();
5672 inodeno_t subvol_ino
= realm
->get_subvolume_ino();
5673 // can't create subvolume inside another subvolume
5674 if (subvol_ino
&& subvol_ino
!= cur
->ino()) {
5675 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5680 const auto srnode
= cur
->get_projected_srnode();
5681 if (val
== (srnode
&& srnode
->is_subvolume())) {
5682 respond_to_request(mdr
, 0);
5686 auto pi
= cur
->project_inode(mdr
, false, true);
5688 pi
.snapnode
->created
= pi
.snapnode
->seq
= realm
->get_newest_seq();
5690 pi
.snapnode
->mark_subvolume();
5692 pi
.snapnode
->clear_subvolume();
5694 mdr
->no_early_reply
= true;
5695 pip
= pi
.inode
.get();
5696 adjust_realm
= true;
5697 } else if (name
== "ceph.dir.pin"sv
) {
5698 if (!cur
->is_dir() || cur
->is_root()) {
5699 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5705 rank
= boost::lexical_cast
<mds_rank_t
>(value
);
5706 if (rank
< 0) rank
= MDS_RANK_NONE
;
5707 } catch (boost::bad_lexical_cast
const&) {
5708 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
5709 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5713 if (!xlock_policylock(mdr
, cur
))
5716 auto pi
= cur
->project_inode(mdr
);
5717 cur
->set_export_pin(rank
);
5718 pip
= pi
.inode
.get();
5719 } else if (name
== "ceph.dir.pin.random"sv
) {
5720 if (!cur
->is_dir() || cur
->is_root()) {
5721 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5727 val
= boost::lexical_cast
<double>(value
);
5728 } catch (boost::bad_lexical_cast
const&) {
5729 dout(10) << "bad vxattr value, unable to parse float for " << name
<< dendl
;
5730 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5734 if (val
< 0.0 || 1.0 < val
) {
5735 respond_to_request(mdr
, -CEPHFS_EDOM
);
5737 } else if (mdcache
->export_ephemeral_random_max
< val
) {
5738 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5742 if (!xlock_policylock(mdr
, cur
))
5745 auto pi
= cur
->project_inode(mdr
);
5746 cur
->setxattr_ephemeral_rand(val
);
5747 pip
= pi
.inode
.get();
5748 } else if (name
== "ceph.dir.pin.distributed"sv
) {
5749 if (!cur
->is_dir() || cur
->is_root()) {
5750 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5756 val
= boost::lexical_cast
<bool>(value
);
5757 } catch (boost::bad_lexical_cast
const&) {
5758 dout(10) << "bad vxattr value, unable to parse bool for " << name
<< dendl
;
5759 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5763 if (!xlock_policylock(mdr
, cur
))
5766 auto pi
= cur
->project_inode(mdr
);
5767 cur
->setxattr_ephemeral_dist(val
);
5768 pip
= pi
.inode
.get();
5770 dout(10) << " unknown vxattr " << name
<< dendl
;
5771 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5776 pip
->ctime
= mdr
->get_op_stamp();
5777 if (mdr
->get_op_stamp() > pip
->rstat
.rctime
)
5778 pip
->rstat
.rctime
= mdr
->get_op_stamp();
5779 pip
->version
= cur
->pre_dirty();
5781 pip
->update_backtrace();
5784 mdr
->ls
= mdlog
->get_current_segment();
5785 EUpdate
*le
= new EUpdate(mdlog
, "set vxattr layout");
5786 mdlog
->start_entry(le
);
5787 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5788 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5789 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5791 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
,
5792 false, false, adjust_realm
));
5796 void Server::handle_remove_vxattr(MDRequestRef
& mdr
, CInode
*cur
)
5798 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5799 string
name(req
->get_path2());
5801 dout(10) << __func__
<< " " << name
<< " on " << *cur
<< dendl
;
5803 if (name
== "ceph.dir.layout") {
5804 if (!cur
->is_dir()) {
5805 respond_to_request(mdr
, -CEPHFS_ENODATA
);
5808 if (cur
->is_root()) {
5809 dout(10) << "can't remove layout policy on the root directory" << dendl
;
5810 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5814 if (!cur
->get_projected_inode()->has_layout()) {
5815 respond_to_request(mdr
, -CEPHFS_ENODATA
);
5819 MutationImpl::LockOpVec lov
;
5820 lov
.add_xlock(&cur
->policylock
);
5821 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5824 auto pi
= cur
->project_inode(mdr
);
5825 pi
.inode
->clear_layout();
5826 pi
.inode
->version
= cur
->pre_dirty();
5829 mdr
->ls
= mdlog
->get_current_segment();
5830 EUpdate
*le
= new EUpdate(mdlog
, "remove dir layout vxattr");
5831 mdlog
->start_entry(le
);
5832 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5833 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5834 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5836 mdr
->no_early_reply
= true;
5837 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
5839 } else if (name
== "ceph.dir.layout.pool_namespace"
5840 || name
== "ceph.file.layout.pool_namespace") {
5841 // Namespace is the only layout field that has a meaningful
5842 // null/none value (empty string, means default layout). Is equivalent
5843 // to a setxattr with empty string: pass through the empty payload of
5844 // the rmxattr request to do this.
5845 handle_set_vxattr(mdr
, cur
);
5849 respond_to_request(mdr
, -CEPHFS_ENODATA
);
5852 const Server::XattrHandler
Server::xattr_handlers
[] = {
5854 xattr_name
: Server::DEFAULT_HANDLER
,
5855 description
: "default xattr handler",
5856 validate
: &Server::default_xattr_validate
,
5857 setxattr
: &Server::default_setxattr_handler
,
5858 removexattr
: &Server::default_removexattr_handler
,
5861 xattr_name
: "ceph.mirror.info",
5862 description
: "mirror info xattr handler",
5863 validate
: &Server::mirror_info_xattr_validate
,
5864 setxattr
: &Server::mirror_info_setxattr_handler
,
5865 removexattr
: &Server::mirror_info_removexattr_handler
5869 const Server::XattrHandler
* Server::get_xattr_or_default_handler(std::string_view xattr_name
) {
5870 const XattrHandler
*default_xattr_handler
= nullptr;
5872 for (auto &handler
: xattr_handlers
) {
5873 if (handler
.xattr_name
== Server::DEFAULT_HANDLER
) {
5874 ceph_assert(default_xattr_handler
== nullptr);
5875 default_xattr_handler
= &handler
;
5877 if (handler
.xattr_name
== xattr_name
) {
5878 dout(20) << "handler=" << handler
.description
<< dendl
;
5883 ceph_assert(default_xattr_handler
!= nullptr);
5884 dout(20) << "handler=" << default_xattr_handler
->description
<< dendl
;
5885 return default_xattr_handler
;
5888 int Server::xattr_validate(CInode
*cur
, const InodeStoreBase::xattr_map_const_ptr xattrs
,
5889 const std::string
&xattr_name
, int op
, int flags
) {
5890 if (op
== CEPH_MDS_OP_SETXATTR
) {
5892 if ((flags
& CEPH_XATTR_CREATE
) && xattrs
->count(mempool::mds_co::string(xattr_name
))) {
5893 dout(10) << "setxattr '" << xattr_name
<< "' XATTR_CREATE and CEPHFS_EEXIST on " << *cur
<< dendl
;
5894 return -CEPHFS_EEXIST
;
5897 if ((flags
& CEPH_XATTR_REPLACE
) && !(xattrs
&& xattrs
->count(mempool::mds_co::string(xattr_name
)))) {
5898 dout(10) << "setxattr '" << xattr_name
<< "' XATTR_REPLACE and CEPHFS_ENODATA on " << *cur
<< dendl
;
5899 return -CEPHFS_ENODATA
;
5905 if (op
== CEPH_MDS_OP_RMXATTR
) {
5906 if (!xattrs
|| xattrs
->count(mempool::mds_co::string(xattr_name
)) == 0) {
5907 dout(10) << "removexattr '" << xattr_name
<< "' and CEPHFS_ENODATA on " << *cur
<< dendl
;
5908 return -CEPHFS_ENODATA
;
5914 derr
<< ": unhandled validation for: " << xattr_name
<< dendl
;
5915 return -CEPHFS_EINVAL
;
5918 void Server::xattr_set(InodeStoreBase::xattr_map_ptr xattrs
, const std::string
&xattr_name
,
5919 const bufferlist
&xattr_value
) {
5920 size_t len
= xattr_value
.length();
5921 bufferptr b
= buffer::create(len
);
5923 xattr_value
.begin().copy(len
, b
.c_str());
5925 auto em
= xattrs
->emplace(std::piecewise_construct
,
5926 std::forward_as_tuple(mempool::mds_co::string(xattr_name
)),
5927 std::forward_as_tuple(b
));
5929 em
.first
->second
= b
;
5933 void Server::xattr_rm(InodeStoreBase::xattr_map_ptr xattrs
, const std::string
&xattr_name
) {
5934 xattrs
->erase(mempool::mds_co::string(xattr_name
));
5937 int Server::default_xattr_validate(CInode
*cur
, const InodeStoreBase::xattr_map_const_ptr xattrs
,
5938 XattrOp
*xattr_op
) {
5939 return xattr_validate(cur
, xattrs
, xattr_op
->xattr_name
, xattr_op
->op
, xattr_op
->flags
);
5942 void Server::default_setxattr_handler(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
5943 const XattrOp
&xattr_op
) {
5944 xattr_set(xattrs
, xattr_op
.xattr_name
, xattr_op
.xattr_value
);
5947 void Server::default_removexattr_handler(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
5948 const XattrOp
&xattr_op
) {
5949 xattr_rm(xattrs
, xattr_op
.xattr_name
);
5952 // mirror info xattr handlers
5953 const std::string
Server::MirrorXattrInfo::MIRROR_INFO_REGEX
= "^cluster_id=([a-f0-9]{8}-" \
5954 "[a-f0-9]{4}-[a-f0-9]{4}-" \
5955 "[a-f0-9]{4}-[a-f0-9]{12})" \
5957 const std::string
Server::MirrorXattrInfo::CLUSTER_ID
= "ceph.mirror.info.cluster_id";
5958 const std::string
Server::MirrorXattrInfo::FS_ID
= "ceph.mirror.info.fs_id";
5959 int Server::parse_mirror_info_xattr(const std::string
&name
, const std::string
&value
,
5960 std::string
&cluster_id
, std::string
&fs_id
) {
5961 dout(20) << "parsing name=" << name
<< ", value=" << value
<< dendl
;
5963 static const std::regex
regex(Server::MirrorXattrInfo::MIRROR_INFO_REGEX
);
5966 std::regex_search(value
, match
, regex
);
5967 if (match
.size() != 3) {
5968 derr
<< "mirror info parse error" << dendl
;
5969 return -CEPHFS_EINVAL
;
5972 cluster_id
= match
[1];
5974 dout(20) << " parsed cluster_id=" << cluster_id
<< ", fs_id=" << fs_id
<< dendl
;
5978 int Server::mirror_info_xattr_validate(CInode
*cur
, const InodeStoreBase::xattr_map_const_ptr xattrs
,
5979 XattrOp
*xattr_op
) {
5980 if (!cur
->is_root()) {
5981 return -CEPHFS_EINVAL
;
5984 int v1
= xattr_validate(cur
, xattrs
, Server::MirrorXattrInfo::CLUSTER_ID
, xattr_op
->op
, xattr_op
->flags
);
5985 int v2
= xattr_validate(cur
, xattrs
, Server::MirrorXattrInfo::FS_ID
, xattr_op
->op
, xattr_op
->flags
);
5987 derr
<< "inconsistent mirror info state (" << v1
<< "," << v2
<< ")" << dendl
;
5988 return -CEPHFS_EINVAL
;
5995 if (xattr_op
->op
== CEPH_MDS_OP_RMXATTR
) {
5999 std::string cluster_id
;
6001 int r
= parse_mirror_info_xattr(xattr_op
->xattr_name
, xattr_op
->xattr_value
.to_str(),
6007 xattr_op
->xinfo
= std::make_unique
<MirrorXattrInfo
>(cluster_id
, fs_id
);
6011 void Server::mirror_info_setxattr_handler(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
6012 const XattrOp
&xattr_op
) {
6013 auto mirror_info
= dynamic_cast<MirrorXattrInfo
&>(*(xattr_op
.xinfo
));
6016 bl
.append(mirror_info
.cluster_id
.c_str(), mirror_info
.cluster_id
.length());
6017 xattr_set(xattrs
, Server::MirrorXattrInfo::CLUSTER_ID
, bl
);
6020 bl
.append(mirror_info
.fs_id
.c_str(), mirror_info
.fs_id
.length());
6021 xattr_set(xattrs
, Server::MirrorXattrInfo::FS_ID
, bl
);
6024 void Server::mirror_info_removexattr_handler(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
6025 const XattrOp
&xattr_op
) {
6026 xattr_rm(xattrs
, Server::MirrorXattrInfo::CLUSTER_ID
);
6027 xattr_rm(xattrs
, Server::MirrorXattrInfo::FS_ID
);
6030 void Server::handle_client_setxattr(MDRequestRef
& mdr
)
6032 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6033 string
name(req
->get_path2());
6035 // is a ceph virtual xattr?
6036 if (is_ceph_vxattr(name
)) {
6037 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6038 CInode
*cur
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
6042 handle_set_vxattr(mdr
, cur
);
6046 if (!is_allowed_ceph_xattr(name
)) {
6047 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6051 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
6055 if (mdr
->snapid
!= CEPH_NOSNAP
) {
6056 respond_to_request(mdr
, -CEPHFS_EROFS
);
6060 int flags
= req
->head
.args
.setxattr
.flags
;
6062 MutationImpl::LockOpVec lov
;
6063 lov
.add_xlock(&cur
->xattrlock
);
6064 if (!mds
->locker
->acquire_locks(mdr
, lov
))
6067 if (!check_access(mdr
, cur
, MAY_WRITE
))
6070 size_t len
= req
->get_data().length();
6071 size_t inc
= len
+ name
.length();
6073 auto handler
= Server::get_xattr_or_default_handler(name
);
6074 const auto& pxattrs
= cur
->get_projected_xattrs();
6076 // check xattrs kv pairs size
6077 size_t cur_xattrs_size
= 0;
6078 for (const auto& p
: *pxattrs
) {
6079 if ((flags
& CEPH_XATTR_REPLACE
) && name
.compare(p
.first
) == 0) {
6082 cur_xattrs_size
+= p
.first
.length() + p
.second
.length();
6085 if (((cur_xattrs_size
+ inc
) > g_conf()->mds_max_xattr_pairs_size
)) {
6086 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
6087 << cur_xattrs_size
<< ", inc " << inc
<< dendl
;
6088 respond_to_request(mdr
, -CEPHFS_ENOSPC
);
6093 XattrOp
xattr_op(CEPH_MDS_OP_SETXATTR
, name
, req
->get_data(), flags
);
6094 int r
= std::invoke(handler
->validate
, this, cur
, pxattrs
, &xattr_op
);
6096 respond_to_request(mdr
, r
);
6100 dout(10) << "setxattr '" << name
<< "' len " << len
<< " on " << *cur
<< dendl
;
6103 auto pi
= cur
->project_inode(mdr
, true);
6104 pi
.inode
->version
= cur
->pre_dirty();
6105 pi
.inode
->ctime
= mdr
->get_op_stamp();
6106 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
6107 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
6108 if (name
== "encryption.ctx"sv
)
6109 pi
.inode
->fscrypt
= true;
6110 pi
.inode
->change_attr
++;
6111 pi
.inode
->xattr_version
++;
6113 if ((flags
& CEPH_XATTR_REMOVE
)) {
6114 std::invoke(handler
->removexattr
, this, cur
, pi
.xattrs
, xattr_op
);
6116 std::invoke(handler
->setxattr
, this, cur
, pi
.xattrs
, xattr_op
);
6120 mdr
->ls
= mdlog
->get_current_segment();
6121 EUpdate
*le
= new EUpdate(mdlog
, "setxattr");
6122 mdlog
->start_entry(le
);
6123 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6124 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
6125 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
6127 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
6130 void Server::handle_client_removexattr(MDRequestRef
& mdr
)
6132 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6133 std::string
name(req
->get_path2());
6135 // is a ceph virtual xattr?
6136 if (is_ceph_vxattr(name
)) {
6137 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6138 CInode
*cur
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
6142 handle_remove_vxattr(mdr
, cur
);
6146 if (!is_allowed_ceph_xattr(name
)) {
6147 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6151 CInode
* cur
= rdlock_path_pin_ref(mdr
, true);
6155 if (mdr
->snapid
!= CEPH_NOSNAP
) {
6156 respond_to_request(mdr
, -CEPHFS_EROFS
);
6160 MutationImpl::LockOpVec lov
;
6161 lov
.add_xlock(&cur
->xattrlock
);
6162 if (!mds
->locker
->acquire_locks(mdr
, lov
))
6166 auto handler
= Server::get_xattr_or_default_handler(name
);
6168 XattrOp
xattr_op(CEPH_MDS_OP_RMXATTR
, name
, bl
, 0);
6170 const auto& pxattrs
= cur
->get_projected_xattrs();
6171 int r
= std::invoke(handler
->validate
, this, cur
, pxattrs
, &xattr_op
);
6173 respond_to_request(mdr
, r
);
6177 dout(10) << "removexattr '" << name
<< "' on " << *cur
<< dendl
;
6180 auto pi
= cur
->project_inode(mdr
, true);
6181 pi
.inode
->version
= cur
->pre_dirty();
6182 pi
.inode
->ctime
= mdr
->get_op_stamp();
6183 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
6184 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
6185 pi
.inode
->change_attr
++;
6186 pi
.inode
->xattr_version
++;
6187 std::invoke(handler
->removexattr
, this, cur
, pi
.xattrs
, xattr_op
);
6190 mdr
->ls
= mdlog
->get_current_segment();
6191 EUpdate
*le
= new EUpdate(mdlog
, "removexattr");
6192 mdlog
->start_entry(le
);
6193 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6194 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
6195 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
6197 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
6201 // =================================================================
6202 // DIRECTORY and NAMESPACE OPS
6205 // ------------------------------------------------
6209 class C_MDS_mknod_finish
: public ServerLogContext
{
6213 C_MDS_mknod_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ni
) :
6214 ServerLogContext(s
, r
), dn(d
), newi(ni
) {}
6215 void finish(int r
) override
{
6216 ceph_assert(r
== 0);
6219 dn
->pop_projected_linkage();
6221 // be a bit hacky with the inode version, here.. we decrement it
6222 // just to keep mark_dirty() happen. (we didn't bother projecting
6223 // a new version of hte inode since it's just been created)
6224 newi
->mark_dirty(mdr
->ls
);
6225 newi
->mark_dirty_parent(mdr
->ls
, true);
6228 if (newi
->is_dir()) {
6229 CDir
*dir
= newi
->get_dirfrag(frag_t());
6231 dir
->mark_dirty(mdr
->ls
);
6232 dir
->mark_new(mdr
->ls
);
6237 MDRequestRef null_ref
;
6238 get_mds()->mdcache
->send_dentry_link(dn
, null_ref
);
6240 if (newi
->is_file()) {
6241 get_mds()->locker
->share_inode_max_size(newi
);
6242 } else if (newi
->is_dir()) {
6243 // We do this now so that the linkages on the new directory are stable.
6244 newi
->maybe_ephemeral_rand();
6248 get_mds()->balancer
->hit_inode(newi
, META_POP_IWR
);
6251 server
->respond_to_request(mdr
, 0);
6256 void Server::handle_client_mknod(MDRequestRef
& mdr
)
6258 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6259 client_t client
= mdr
->get_client();
6261 unsigned mode
= req
->head
.args
.mknod
.mode
;
6262 if ((mode
& S_IFMT
) == 0)
6265 mdr
->disable_lock_cache();
6266 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, true, false, S_ISREG(mode
));
6270 CDir
*dir
= dn
->get_dir();
6271 CInode
*diri
= dir
->get_inode();
6272 if (!check_access(mdr
, diri
, MAY_WRITE
))
6274 if (!check_fragment_space(mdr
, dn
->get_dir()))
6277 ceph_assert(dn
->get_projected_linkage()->is_null());
6278 if (req
->get_alternate_name().size() > alternate_name_max
) {
6279 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
6280 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
6283 dn
->set_alternate_name(req
->get_alternate_name());
6286 file_layout_t layout
;
6287 if (mdr
->dir_layout
!= file_layout_t())
6288 layout
= mdr
->dir_layout
;
6290 layout
= mdcache
->default_file_layout
;
6292 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
), mode
, &layout
);
6295 dn
->push_projected_linkage(newi
);
6297 auto _inode
= newi
->_get_inode();
6298 _inode
->version
= dn
->pre_dirty();
6299 _inode
->rdev
= req
->head
.args
.mknod
.rdev
;
6300 _inode
->rstat
.rfiles
= 1;
6301 _inode
->accounted_rstat
= _inode
->rstat
;
6302 if (layout
.pool_id
!= mdcache
->default_file_layout
.pool_id
)
6303 _inode
->add_old_pool(mdcache
->default_file_layout
.pool_id
);
6304 _inode
->update_backtrace();
6306 snapid_t follows
= mdcache
->get_global_snaprealm()->get_newest_seq();
6307 SnapRealm
*realm
= dn
->get_dir()->inode
->find_snaprealm();
6308 ceph_assert(follows
>= realm
->get_newest_seq());
6310 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
6311 // want to write to it (e.g., if they are reexporting NFS)
6312 if (S_ISREG(_inode
->mode
)) {
6313 // issue a cap on the file
6314 int cmode
= CEPH_FILE_MODE_RDWR
;
6315 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
, realm
);
6319 // put locks in excl mode
6320 newi
->filelock
.set_state(LOCK_EXCL
);
6321 newi
->authlock
.set_state(LOCK_EXCL
);
6322 newi
->xattrlock
.set_state(LOCK_EXCL
);
6324 dout(15) << " setting a client_range too, since this is a regular file" << dendl
;
6325 _inode
->client_ranges
[client
].range
.first
= 0;
6326 _inode
->client_ranges
[client
].range
.last
= _inode
->layout
.stripe_unit
;
6327 _inode
->client_ranges
[client
].follows
= follows
;
6328 newi
->mark_clientwriteable();
6329 cap
->mark_clientwriteable();
6333 ceph_assert(dn
->first
== follows
+ 1);
6334 newi
->first
= dn
->first
;
6336 dout(10) << "mknod mode " << _inode
->mode
<< " rdev " << _inode
->rdev
<< dendl
;
6339 mdr
->ls
= mdlog
->get_current_segment();
6340 EUpdate
*le
= new EUpdate(mdlog
, "mknod");
6341 mdlog
->start_entry(le
);
6342 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6343 journal_allocated_inos(mdr
, &le
->metablob
);
6345 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(),
6346 PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
6347 le
->metablob
.add_primary_dentry(dn
, newi
, true, true, true);
6349 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
6350 mds
->balancer
->maybe_fragment(dn
->get_dir(), false);
6356 /* This function takes responsibility for the passed mdr*/
6357 void Server::handle_client_mkdir(MDRequestRef
& mdr
)
6359 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6361 mdr
->disable_lock_cache();
6362 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, true);
6366 CDir
*dir
= dn
->get_dir();
6367 CInode
*diri
= dir
->get_inode();
6369 // mkdir check access
6370 if (!check_access(mdr
, diri
, MAY_WRITE
))
6373 if (!check_fragment_space(mdr
, dir
))
6376 ceph_assert(dn
->get_projected_linkage()->is_null());
6377 if (req
->get_alternate_name().size() > alternate_name_max
) {
6378 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
6379 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
6382 dn
->set_alternate_name(req
->get_alternate_name());
6385 unsigned mode
= req
->head
.args
.mkdir
.mode
;
6388 CInode
*newi
= prepare_new_inode(mdr
, dir
, inodeno_t(req
->head
.ino
), mode
);
6391 // it's a directory.
6392 dn
->push_projected_linkage(newi
);
6394 auto _inode
= newi
->_get_inode();
6395 _inode
->version
= dn
->pre_dirty();
6396 _inode
->rstat
.rsubdirs
= 1;
6397 _inode
->accounted_rstat
= _inode
->rstat
;
6398 _inode
->update_backtrace();
6400 snapid_t follows
= mdcache
->get_global_snaprealm()->get_newest_seq();
6401 SnapRealm
*realm
= dn
->get_dir()->inode
->find_snaprealm();
6402 ceph_assert(follows
>= realm
->get_newest_seq());
6404 dout(12) << " follows " << follows
<< dendl
;
6405 ceph_assert(dn
->first
== follows
+ 1);
6406 newi
->first
= dn
->first
;
6408 // ...and that new dir is empty.
6409 CDir
*newdir
= newi
->get_or_open_dirfrag(mdcache
, frag_t());
6410 newdir
->state_set(CDir::STATE_CREATING
);
6411 newdir
->mark_complete();
6412 newdir
->_get_fnode()->version
= newdir
->pre_dirty();
6415 mdr
->ls
= mdlog
->get_current_segment();
6416 EUpdate
*le
= new EUpdate(mdlog
, "mkdir");
6417 mdlog
->start_entry(le
);
6418 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6419 journal_allocated_inos(mdr
, &le
->metablob
);
6420 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
6421 le
->metablob
.add_primary_dentry(dn
, newi
, true, true);
6422 le
->metablob
.add_new_dir(newdir
); // dirty AND complete AND new
6424 // issue a cap on the directory
6425 int cmode
= CEPH_FILE_MODE_RDWR
;
6426 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
, realm
);
6430 // put locks in excl mode
6431 newi
->filelock
.set_state(LOCK_EXCL
);
6432 newi
->authlock
.set_state(LOCK_EXCL
);
6433 newi
->xattrlock
.set_state(LOCK_EXCL
);
6436 // make sure this inode gets into the journal
6437 le
->metablob
.add_opened_ino(newi
->ino());
6439 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
6441 // We hit_dir (via hit_inode) in our finish callback, but by then we might
6442 // have overshot the split size (multiple mkdir in flight), so here is
6443 // an early chance to split the dir if this mkdir makes it oversized.
6444 mds
->balancer
->maybe_fragment(dir
, false);
6450 void Server::handle_client_symlink(MDRequestRef
& mdr
)
6452 const auto& req
= mdr
->client_request
;
6454 mdr
->disable_lock_cache();
6455 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, true);
6459 CDir
*dir
= dn
->get_dir();
6460 CInode
*diri
= dir
->get_inode();
6462 if (!check_access(mdr
, diri
, MAY_WRITE
))
6464 if (!check_fragment_space(mdr
, dir
))
6467 ceph_assert(dn
->get_projected_linkage()->is_null());
6468 if (req
->get_alternate_name().size() > alternate_name_max
) {
6469 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
6470 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
6472 dn
->set_alternate_name(req
->get_alternate_name());
6474 unsigned mode
= S_IFLNK
| 0777;
6475 CInode
*newi
= prepare_new_inode(mdr
, dir
, inodeno_t(req
->head
.ino
), mode
);
6479 dn
->push_projected_linkage(newi
);
6481 newi
->symlink
= req
->get_path2();
6482 auto _inode
= newi
->_get_inode();
6483 _inode
->version
= dn
->pre_dirty();
6484 _inode
->size
= newi
->symlink
.length();
6485 _inode
->rstat
.rbytes
= _inode
->size
;
6486 _inode
->rstat
.rfiles
= 1;
6487 _inode
->accounted_rstat
= _inode
->rstat
;
6488 _inode
->update_backtrace();
6490 newi
->first
= dn
->first
;
6493 mdr
->ls
= mdlog
->get_current_segment();
6494 EUpdate
*le
= new EUpdate(mdlog
, "symlink");
6495 mdlog
->start_entry(le
);
6496 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
6497 journal_allocated_inos(mdr
, &le
->metablob
);
6498 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
6499 le
->metablob
.add_primary_dentry(dn
, newi
, true, true);
6501 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
6502 mds
->balancer
->maybe_fragment(dir
, false);
6511 void Server::handle_client_link(MDRequestRef
& mdr
)
6513 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
6515 dout(7) << "handle_client_link " << req
->get_filepath()
6516 << " to " << req
->get_filepath2()
6519 mdr
->disable_lock_cache();
6524 if (req
->get_filepath2().depth() == 0) {
6525 targeti
= mdcache
->get_inode(req
->get_filepath2().get_ino());
6527 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl
;
6528 mdcache
->find_ino_peers(req
->get_filepath2().get_ino(), new C_MDS_TryFindInode(this, mdr
));
6533 if (!(mdr
->locking_state
& MutationImpl::SNAP2_LOCKED
)) {
6534 CDentry
*pdn
= targeti
->get_projected_parent_dn();
6536 dout(7) << "target has no parent dn, failing..." << dendl
;
6537 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6540 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
, 1))
6542 mdr
->locking_state
|= MutationImpl::SNAP2_LOCKED
;
6545 destdn
= rdlock_path_xlock_dentry(mdr
, false);
6549 auto ret
= rdlock_two_paths_xlock_destdn(mdr
, false);
6554 if (!destdn
->get_projected_linkage()->is_null()) {
6555 respond_to_request(mdr
, -CEPHFS_EEXIST
);
6559 targeti
= ret
.second
->get_projected_linkage()->get_inode();
6562 ceph_assert(destdn
->get_projected_linkage()->is_null());
6563 if (req
->get_alternate_name().size() > alternate_name_max
) {
6564 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
6565 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
6568 destdn
->set_alternate_name(req
->get_alternate_name());
6570 if (targeti
->is_dir()) {
6571 dout(7) << "target is a dir, failing..." << dendl
;
6572 respond_to_request(mdr
, -CEPHFS_EINVAL
);
6576 CDir
*dir
= destdn
->get_dir();
6577 dout(7) << "handle_client_link link " << destdn
->get_name() << " in " << *dir
<< dendl
;
6578 dout(7) << "target is " << *targeti
<< dendl
;
6580 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
6581 MutationImpl::LockOpVec lov
;
6582 lov
.add_xlock(&targeti
->snaplock
);
6583 lov
.add_xlock(&targeti
->linklock
);
6585 if (!mds
->locker
->acquire_locks(mdr
, lov
))
6588 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
6591 if (targeti
->get_projected_inode()->nlink
== 0) {
6592 dout(7) << "target has no link, failing..." << dendl
;
6593 respond_to_request(mdr
, -CEPHFS_ENOENT
);
6596 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
6597 if (!check_access(mdr
, targeti
, MAY_WRITE
))
6600 if (!check_access(mdr
, dir
->get_inode(), MAY_WRITE
))
6603 if (!check_fragment_space(mdr
, dir
))
6607 CInode
* target_pin
= targeti
->get_projected_parent_dir()->inode
;
6608 SnapRealm
*target_realm
= target_pin
->find_snaprealm();
6609 if (target_pin
!= dir
->inode
&&
6610 target_realm
->get_subvolume_ino() !=
6611 dir
->inode
->find_snaprealm()->get_subvolume_ino()) {
6612 dout(7) << "target is in different subvolume, failing..." << dendl
;
6613 respond_to_request(mdr
, -CEPHFS_EXDEV
);
6618 ceph_assert(g_conf()->mds_kill_link_at
!= 1);
6621 if (targeti
->is_auth())
6622 _link_local(mdr
, destdn
, targeti
, target_realm
);
6624 _link_remote(mdr
, true, destdn
, targeti
);
6625 mds
->balancer
->maybe_fragment(dir
, false);
6629 class C_MDS_link_local_finish
: public ServerLogContext
{
6636 C_MDS_link_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ti
,
6637 version_t dnpv_
, version_t tipv_
, bool ar
) :
6638 ServerLogContext(s
, r
), dn(d
), targeti(ti
),
6639 dnpv(dnpv_
), tipv(tipv_
), adjust_realm(ar
) { }
6640 void finish(int r
) override
{
6641 ceph_assert(r
== 0);
6642 server
->_link_local_finish(mdr
, dn
, targeti
, dnpv
, tipv
, adjust_realm
);
6647 void Server::_link_local(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
, SnapRealm
*target_realm
)
6649 dout(10) << "_link_local " << *dn
<< " to " << *targeti
<< dendl
;
6651 mdr
->ls
= mdlog
->get_current_segment();
6653 // predirty NEW dentry
6654 version_t dnpv
= dn
->pre_dirty();
6655 version_t tipv
= targeti
->pre_dirty();
6657 // project inode update
6658 auto pi
= targeti
->project_inode(mdr
);
6660 pi
.inode
->ctime
= mdr
->get_op_stamp();
6661 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
6662 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
6663 pi
.inode
->change_attr
++;
6664 pi
.inode
->version
= tipv
;
6666 bool adjust_realm
= false;
6667 if (!target_realm
->get_subvolume_ino() && !targeti
->is_projected_snaprealm_global()) {
6668 sr_t
*newsnap
= targeti
->project_snaprealm();
6669 targeti
->mark_snaprealm_global(newsnap
);
6670 targeti
->record_snaprealm_parent_dentry(newsnap
, target_realm
, targeti
->get_projected_parent_dn(), true);
6671 adjust_realm
= true;
6675 EUpdate
*le
= new EUpdate(mdlog
, "link_local");
6676 mdlog
->start_entry(le
);
6677 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
6678 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1); // new dn
6679 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, 0, PREDIRTY_PRIMARY
); // targeti
6680 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
6681 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, targeti
);
6683 // do this after predirty_*, to avoid funky extra dnl arg
6684 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
6686 journal_and_reply(mdr
, targeti
, dn
, le
,
6687 new C_MDS_link_local_finish(this, mdr
, dn
, targeti
, dnpv
, tipv
, adjust_realm
));
6690 void Server::_link_local_finish(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
,
6691 version_t dnpv
, version_t tipv
, bool adjust_realm
)
6693 dout(10) << "_link_local_finish " << *dn
<< " to " << *targeti
<< dendl
;
6695 // link and unlock the NEW dentry
6696 CDentry::linkage_t
*dnl
= dn
->pop_projected_linkage();
6697 if (!dnl
->get_inode())
6698 dn
->link_remote(dnl
, targeti
);
6699 dn
->mark_dirty(dnpv
, mdr
->ls
);
6704 MDRequestRef null_ref
;
6705 mdcache
->send_dentry_link(dn
, null_ref
);
6708 int op
= CEPH_SNAP_OP_SPLIT
;
6709 mds
->mdcache
->send_snap_update(targeti
, 0, op
);
6710 mds
->mdcache
->do_realm_invalidate_and_update_notify(targeti
, op
);
6713 // bump target popularity
6714 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
6715 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
6718 respond_to_request(mdr
, 0);
6722 // link / unlink remote
6724 class C_MDS_link_remote_finish
: public ServerLogContext
{
6730 C_MDS_link_remote_finish(Server
*s
, MDRequestRef
& r
, bool i
, CDentry
*d
, CInode
*ti
) :
6731 ServerLogContext(s
, r
), inc(i
), dn(d
), targeti(ti
),
6732 dpv(d
->get_projected_version()) {}
6733 void finish(int r
) override
{
6734 ceph_assert(r
== 0);
6735 server
->_link_remote_finish(mdr
, inc
, dn
, targeti
, dpv
);
6739 void Server::_link_remote(MDRequestRef
& mdr
, bool inc
, CDentry
*dn
, CInode
*targeti
)
6741 dout(10) << "_link_remote "
6742 << (inc
? "link ":"unlink ")
6743 << *dn
<< " to " << *targeti
<< dendl
;
6745 // 1. send LinkPrepare to dest (journal nlink++ prepare)
6746 mds_rank_t linkauth
= targeti
->authority().first
;
6747 if (mdr
->more()->witnessed
.count(linkauth
) == 0) {
6748 if (mds
->is_cluster_degraded() &&
6749 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(linkauth
)) {
6750 dout(10) << " targeti auth mds." << linkauth
<< " is not active" << dendl
;
6751 if (mdr
->more()->waiting_on_peer
.empty())
6752 mds
->wait_for_active_peer(linkauth
, new C_MDS_RetryRequest(mdcache
, mdr
));
6756 dout(10) << " targeti auth must prepare nlink++/--" << dendl
;
6759 op
= MMDSPeerRequest::OP_LINKPREP
;
6761 op
= MMDSPeerRequest::OP_UNLINKPREP
;
6762 auto req
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, op
);
6763 targeti
->set_object_info(req
->get_object_info());
6764 req
->op_stamp
= mdr
->get_op_stamp();
6765 if (auto& desti_srnode
= mdr
->more()->desti_srnode
)
6766 encode(*desti_srnode
, req
->desti_snapbl
);
6767 mds
->send_message_mds(req
, linkauth
);
6769 ceph_assert(mdr
->more()->waiting_on_peer
.count(linkauth
) == 0);
6770 mdr
->more()->waiting_on_peer
.insert(linkauth
);
6773 dout(10) << " targeti auth has prepared nlink++/--" << dendl
;
6775 ceph_assert(g_conf()->mds_kill_link_at
!= 2);
6777 if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
6778 delete desti_srnode
;
6779 desti_srnode
= NULL
;
6782 mdr
->set_mds_stamp(ceph_clock_now());
6785 mdr
->ls
= mdlog
->get_current_segment();
6786 EUpdate
*le
= new EUpdate(mdlog
, inc
? "link_remote":"unlink_remote");
6787 mdlog
->start_entry(le
);
6788 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
6789 if (!mdr
->more()->witnessed
.empty()) {
6790 dout(20) << " noting uncommitted_peers " << mdr
->more()->witnessed
<< dendl
;
6791 le
->reqid
= mdr
->reqid
;
6792 le
->had_peers
= true;
6793 mdcache
->add_uncommitted_leader(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
6798 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1);
6799 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
6800 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
6803 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, -1);
6804 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
6805 le
->metablob
.add_null_dentry(dn
, true);
6806 dn
->push_projected_linkage();
6809 journal_and_reply(mdr
, (inc
? targeti
: nullptr), dn
, le
,
6810 new C_MDS_link_remote_finish(this, mdr
, inc
, dn
, targeti
));
6813 void Server::_link_remote_finish(MDRequestRef
& mdr
, bool inc
,
6814 CDentry
*dn
, CInode
*targeti
,
6817 dout(10) << "_link_remote_finish "
6818 << (inc
? "link ":"unlink ")
6819 << *dn
<< " to " << *targeti
<< dendl
;
6821 ceph_assert(g_conf()->mds_kill_link_at
!= 3);
6823 if (!mdr
->more()->witnessed
.empty())
6824 mdcache
->logged_leader_update(mdr
->reqid
);
6827 // link the new dentry
6828 CDentry::linkage_t
*dnl
= dn
->pop_projected_linkage();
6829 if (!dnl
->get_inode())
6830 dn
->link_remote(dnl
, targeti
);
6831 dn
->mark_dirty(dpv
, mdr
->ls
);
6833 // unlink main dentry
6834 dn
->get_dir()->unlink_inode(dn
);
6835 dn
->pop_projected_linkage();
6836 dn
->mark_dirty(dn
->get_projected_version(), mdr
->ls
); // dirty old dentry
6841 MDRequestRef null_ref
;
6843 mdcache
->send_dentry_link(dn
, null_ref
);
6845 mdcache
->send_dentry_unlink(dn
, NULL
, null_ref
);
6847 // bump target popularity
6848 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
6849 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
6852 respond_to_request(mdr
, 0);
6855 // removing a new dn?
6856 dn
->get_dir()->try_remove_unlinked_dn(dn
);
6860 // remote linking/unlinking
6862 class C_MDS_PeerLinkPrep
: public ServerLogContext
{
6866 C_MDS_PeerLinkPrep(Server
*s
, MDRequestRef
& r
, CInode
*t
, bool ar
) :
6867 ServerLogContext(s
, r
), targeti(t
), adjust_realm(ar
) { }
6868 void finish(int r
) override
{
6869 ceph_assert(r
== 0);
6870 server
->_logged_peer_link(mdr
, targeti
, adjust_realm
);
6874 class C_MDS_PeerLinkCommit
: public ServerContext
{
6878 C_MDS_PeerLinkCommit(Server
*s
, MDRequestRef
& r
, CInode
*t
) :
6879 ServerContext(s
), mdr(r
), targeti(t
) { }
6880 void finish(int r
) override
{
6881 server
->_commit_peer_link(mdr
, r
, targeti
);
6885 void Server::handle_peer_link_prep(MDRequestRef
& mdr
)
6887 dout(10) << "handle_peer_link_prep " << *mdr
6888 << " on " << mdr
->peer_request
->get_object_info()
6891 ceph_assert(g_conf()->mds_kill_link_at
!= 4);
6893 CInode
*targeti
= mdcache
->get_inode(mdr
->peer_request
->get_object_info().ino
);
6894 ceph_assert(targeti
);
6895 dout(10) << "targeti " << *targeti
<< dendl
;
6896 CDentry
*dn
= targeti
->get_parent_dn();
6897 CDentry::linkage_t
*dnl
= dn
->get_linkage();
6898 ceph_assert(dnl
->is_primary());
6900 mdr
->set_op_stamp(mdr
->peer_request
->op_stamp
);
6902 mdr
->auth_pin(targeti
);
6904 //ceph_abort(); // test hack: make sure leader can handle a peer that fails to prepare...
6905 ceph_assert(g_conf()->mds_kill_link_at
!= 5);
6908 mdr
->ls
= mdlog
->get_current_segment();
6909 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_link_prep", mdr
->reqid
, mdr
->peer_to_mds
,
6910 EPeerUpdate::OP_PREPARE
, EPeerUpdate::LINK
);
6911 mdlog
->start_entry(le
);
6913 auto pi
= dnl
->get_inode()->project_inode(mdr
);
6915 // update journaled target inode
6917 bool adjust_realm
= false;
6918 bool realm_projected
= false;
6919 if (mdr
->peer_request
->get_op() == MMDSPeerRequest::OP_LINKPREP
) {
6923 CDentry
*target_pdn
= targeti
->get_projected_parent_dn();
6924 SnapRealm
*target_realm
= target_pdn
->get_dir()->inode
->find_snaprealm();
6925 if (!target_realm
->get_subvolume_ino() && !targeti
->is_projected_snaprealm_global()) {
6926 sr_t
*newsnap
= targeti
->project_snaprealm();
6927 targeti
->mark_snaprealm_global(newsnap
);
6928 targeti
->record_snaprealm_parent_dentry(newsnap
, target_realm
, target_pdn
, true);
6929 adjust_realm
= true;
6930 realm_projected
= true;
6935 if (targeti
->is_projected_snaprealm_global()) {
6936 ceph_assert(mdr
->peer_request
->desti_snapbl
.length());
6937 auto p
= mdr
->peer_request
->desti_snapbl
.cbegin();
6939 sr_t
*newsnap
= targeti
->project_snaprealm();
6940 decode(*newsnap
, p
);
6942 if (pi
.inode
->nlink
== 0)
6943 ceph_assert(!newsnap
->is_parent_global());
6945 realm_projected
= true;
6947 ceph_assert(mdr
->peer_request
->desti_snapbl
.length() == 0);
6951 link_rollback rollback
;
6952 rollback
.reqid
= mdr
->reqid
;
6953 rollback
.ino
= targeti
->ino();
6954 rollback
.old_ctime
= targeti
->get_inode()->ctime
; // we hold versionlock xlock; no concorrent projections
6955 const auto& pf
= targeti
->get_parent_dn()->get_dir()->get_projected_fnode();
6956 rollback
.old_dir_mtime
= pf
->fragstat
.mtime
;
6957 rollback
.old_dir_rctime
= pf
->rstat
.rctime
;
6958 rollback
.was_inc
= inc
;
6959 if (realm_projected
) {
6960 if (targeti
->snaprealm
) {
6961 encode(true, rollback
.snapbl
);
6962 targeti
->encode_snap_blob(rollback
.snapbl
);
6964 encode(false, rollback
.snapbl
);
6967 encode(rollback
, le
->rollback
);
6968 mdr
->more()->rollback_bl
= le
->rollback
;
6970 pi
.inode
->ctime
= mdr
->get_op_stamp();
6971 pi
.inode
->version
= targeti
->pre_dirty();
6973 dout(10) << " projected inode " << pi
.inode
->ino
<< " v " << pi
.inode
->version
<< dendl
;
6976 mdcache
->predirty_journal_parents(mdr
, &le
->commit
, dnl
->get_inode(), 0, PREDIRTY_SHALLOW
|PREDIRTY_PRIMARY
);
6977 mdcache
->journal_dirty_inode(mdr
.get(), &le
->commit
, targeti
);
6978 mdcache
->add_uncommitted_peer(mdr
->reqid
, mdr
->ls
, mdr
->peer_to_mds
);
6980 // set up commit waiter
6981 mdr
->more()->peer_commit
= new C_MDS_PeerLinkCommit(this, mdr
, targeti
);
6983 mdr
->more()->peer_update_journaled
= true;
6984 submit_mdlog_entry(le
, new C_MDS_PeerLinkPrep(this, mdr
, targeti
, adjust_realm
),
6989 void Server::_logged_peer_link(MDRequestRef
& mdr
, CInode
*targeti
, bool adjust_realm
)
6991 dout(10) << "_logged_peer_link " << *mdr
6992 << " " << *targeti
<< dendl
;
6994 ceph_assert(g_conf()->mds_kill_link_at
!= 6);
6996 // update the target
7000 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
7003 mdr
->reset_peer_request();
7006 int op
= CEPH_SNAP_OP_SPLIT
;
7007 mds
->mdcache
->send_snap_update(targeti
, 0, op
);
7008 mds
->mdcache
->do_realm_invalidate_and_update_notify(targeti
, op
);
7012 if (!mdr
->aborted
) {
7013 auto reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_LINKPREPACK
);
7014 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
7016 dout(10) << " abort flag set, finishing" << dendl
;
7017 mdcache
->request_finish(mdr
);
7022 struct C_MDS_CommittedPeer
: public ServerLogContext
{
7023 C_MDS_CommittedPeer(Server
*s
, MDRequestRef
& m
) : ServerLogContext(s
, m
) {}
7024 void finish(int r
) override
{
7025 server
->_committed_peer(mdr
);
7029 void Server::_commit_peer_link(MDRequestRef
& mdr
, int r
, CInode
*targeti
)
7031 dout(10) << "_commit_peer_link " << *mdr
7033 << " " << *targeti
<< dendl
;
7035 ceph_assert(g_conf()->mds_kill_link_at
!= 7);
7038 // drop our pins, etc.
7041 // write a commit to the journal
7042 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_link_commit", mdr
->reqid
, mdr
->peer_to_mds
,
7043 EPeerUpdate::OP_COMMIT
, EPeerUpdate::LINK
);
7044 mdlog
->start_entry(le
);
7045 submit_mdlog_entry(le
, new C_MDS_CommittedPeer(this, mdr
), mdr
, __func__
);
7048 do_link_rollback(mdr
->more()->rollback_bl
, mdr
->peer_to_mds
, mdr
);
7052 void Server::_committed_peer(MDRequestRef
& mdr
)
7054 dout(10) << "_committed_peer " << *mdr
<< dendl
;
7056 ceph_assert(g_conf()->mds_kill_link_at
!= 8);
7058 bool assert_exist
= mdr
->more()->peer_update_journaled
;
7059 mdcache
->finish_uncommitted_peer(mdr
->reqid
, assert_exist
);
7060 auto req
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_COMMITTED
);
7061 mds
->send_message_mds(req
, mdr
->peer_to_mds
);
7062 mdcache
->request_finish(mdr
);
7065 struct C_MDS_LoggedLinkRollback
: public ServerLogContext
{
7067 map
<client_t
,ref_t
<MClientSnap
>> splits
;
7068 C_MDS_LoggedLinkRollback(Server
*s
, MutationRef
& m
, MDRequestRef
& r
,
7069 map
<client_t
,ref_t
<MClientSnap
>>&& _splits
) :
7070 ServerLogContext(s
, r
), mut(m
), splits(std::move(_splits
)) {
7072 void finish(int r
) override
{
7073 server
->_link_rollback_finish(mut
, mdr
, splits
);
7077 void Server::do_link_rollback(bufferlist
&rbl
, mds_rank_t leader
, MDRequestRef
& mdr
)
7079 link_rollback rollback
;
7080 auto p
= rbl
.cbegin();
7081 decode(rollback
, p
);
7083 dout(10) << "do_link_rollback on " << rollback
.reqid
7084 << (rollback
.was_inc
? " inc":" dec")
7085 << " ino " << rollback
.ino
7088 ceph_assert(g_conf()->mds_kill_link_at
!= 9);
7090 mdcache
->add_rollback(rollback
.reqid
, leader
); // need to finish this update before resolve finishes
7091 ceph_assert(mdr
|| mds
->is_resolve());
7093 MutationRef
mut(new MutationImpl(nullptr, utime_t(), rollback
.reqid
));
7094 mut
->ls
= mds
->mdlog
->get_current_segment();
7096 CInode
*in
= mdcache
->get_inode(rollback
.ino
);
7098 dout(10) << " target is " << *in
<< dendl
;
7099 ceph_assert(!in
->is_projected()); // live peer request hold versionlock xlock.
7101 auto pi
= in
->project_inode(mut
);
7102 pi
.inode
->version
= in
->pre_dirty();
7104 // parent dir rctime
7105 CDir
*parent
= in
->get_projected_parent_dn()->get_dir();
7106 auto pf
= parent
->project_fnode(mut
);
7107 pf
->version
= parent
->pre_dirty();
7108 if (pf
->fragstat
.mtime
== pi
.inode
->ctime
) {
7109 pf
->fragstat
.mtime
= rollback
.old_dir_mtime
;
7110 if (pf
->rstat
.rctime
== pi
.inode
->ctime
)
7111 pf
->rstat
.rctime
= rollback
.old_dir_rctime
;
7112 mut
->add_updated_lock(&parent
->get_inode()->filelock
);
7113 mut
->add_updated_lock(&parent
->get_inode()->nestlock
);
7117 pi
.inode
->ctime
= rollback
.old_ctime
;
7118 if (rollback
.was_inc
)
7123 map
<client_t
,ref_t
<MClientSnap
>> splits
;
7124 if (rollback
.snapbl
.length() && in
->snaprealm
) {
7126 auto p
= rollback
.snapbl
.cbegin();
7127 decode(hadrealm
, p
);
7129 if (!mds
->is_resolve()) {
7130 sr_t
*new_srnode
= new sr_t();
7131 decode(*new_srnode
, p
);
7132 in
->project_snaprealm(new_srnode
);
7134 decode(in
->snaprealm
->srnode
, p
);
7137 SnapRealm
*realm
= parent
->get_inode()->find_snaprealm();
7138 if (!mds
->is_resolve())
7139 mdcache
->prepare_realm_merge(in
->snaprealm
, realm
, splits
);
7140 in
->project_snaprealm(NULL
);
7145 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_link_rollback", rollback
.reqid
, leader
,
7146 EPeerUpdate::OP_ROLLBACK
, EPeerUpdate::LINK
);
7147 mdlog
->start_entry(le
);
7148 le
->commit
.add_dir_context(parent
);
7149 le
->commit
.add_dir(parent
, true);
7150 le
->commit
.add_primary_dentry(in
->get_projected_parent_dn(), 0, true);
7152 submit_mdlog_entry(le
, new C_MDS_LoggedLinkRollback(this, mut
, mdr
, std::move(splits
)),
7157 void Server::_link_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
,
7158 map
<client_t
,ref_t
<MClientSnap
>>& splits
)
7160 dout(10) << "_link_rollback_finish" << dendl
;
7162 ceph_assert(g_conf()->mds_kill_link_at
!= 10);
7166 if (!mds
->is_resolve())
7167 mdcache
->send_snaps(splits
);
7170 mdcache
->request_finish(mdr
);
7172 mdcache
->finish_rollback(mut
->reqid
, mdr
);
7178 void Server::handle_peer_link_prep_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &m
)
7180 dout(10) << "handle_peer_link_prep_ack " << *mdr
7181 << " " << *m
<< dendl
;
7182 mds_rank_t from
= mds_rank_t(m
->get_source().num());
7184 ceph_assert(g_conf()->mds_kill_link_at
!= 11);
7187 mdr
->more()->peers
.insert(from
);
7190 ceph_assert(mdr
->more()->witnessed
.count(from
) == 0);
7191 mdr
->more()->witnessed
.insert(from
);
7192 ceph_assert(!m
->is_not_journaled());
7193 mdr
->more()->has_journaled_peers
= true;
7195 // remove from waiting list
7196 ceph_assert(mdr
->more()->waiting_on_peer
.count(from
));
7197 mdr
->more()->waiting_on_peer
.erase(from
);
7199 ceph_assert(mdr
->more()->waiting_on_peer
.empty());
7201 dispatch_client_request(mdr
); // go again!
7210 void Server::handle_client_unlink(MDRequestRef
& mdr
)
7212 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
7213 client_t client
= mdr
->get_client();
7216 bool rmdir
= (req
->get_op() == CEPH_MDS_OP_RMDIR
);
7219 mdr
->disable_lock_cache();
7220 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, false, true);
7224 CDentry::linkage_t
*dnl
= dn
->get_linkage(client
, mdr
);
7225 ceph_assert(!dnl
->is_null());
7226 CInode
*in
= dnl
->get_inode();
7229 dout(7) << "handle_client_rmdir on " << *dn
<< dendl
;
7231 dout(7) << "handle_client_unlink on " << *dn
<< dendl
;
7233 dout(7) << "dn links to " << *in
<< dendl
;
7238 // do empty directory checks
7239 if (_dir_is_nonempty_unlocked(mdr
, in
)) {
7240 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
7244 dout(7) << "handle_client_unlink on dir " << *in
<< ", returning error" << dendl
;
7245 respond_to_request(mdr
, -CEPHFS_EISDIR
);
7251 dout(7) << "handle_client_rmdir on non-dir " << *in
<< ", returning error" << dendl
;
7252 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
7257 CInode
*diri
= dn
->get_dir()->get_inode();
7258 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
7259 if (!check_access(mdr
, diri
, MAY_WRITE
))
7263 // -- create stray dentry? --
7264 CDentry
*straydn
= NULL
;
7265 if (dnl
->is_primary()) {
7266 straydn
= prepare_stray_dentry(mdr
, dnl
->get_inode());
7269 dout(10) << " straydn is " << *straydn
<< dendl
;
7270 } else if (mdr
->straydn
) {
7271 mdr
->unpin(mdr
->straydn
);
7272 mdr
->straydn
= NULL
;
7276 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
7277 MutationImpl::LockOpVec lov
;
7279 lov
.add_xlock(&in
->linklock
);
7280 lov
.add_xlock(&in
->snaplock
);
7282 lov
.add_rdlock(&in
->filelock
); // to verify it's empty
7285 lov
.add_wrlock(&straydn
->get_dir()->inode
->filelock
);
7286 lov
.add_wrlock(&straydn
->get_dir()->inode
->nestlock
);
7287 lov
.add_xlock(&straydn
->lock
);
7290 if (!mds
->locker
->acquire_locks(mdr
, lov
))
7293 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
7297 _dir_is_nonempty(mdr
, in
)) {
7298 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
7303 straydn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
7305 if (!mdr
->more()->desti_srnode
) {
7306 if (in
->is_projected_snaprealm_global()) {
7307 sr_t
*new_srnode
= in
->prepare_new_srnode(0);
7308 in
->record_snaprealm_parent_dentry(new_srnode
, nullptr, dn
, dnl
->is_primary());
7309 // dropping the last linkage or dropping the last remote linkage,
7310 // detch the inode from global snaprealm
7311 auto nlink
= in
->get_projected_inode()->nlink
;
7313 (nlink
== 2 && !dnl
->is_primary() &&
7314 !in
->get_projected_parent_dir()->inode
->is_stray()))
7315 in
->clear_snaprealm_global(new_srnode
);
7316 mdr
->more()->desti_srnode
= new_srnode
;
7317 } else if (dnl
->is_primary()) {
7318 // prepare snaprealm blob for peer request
7319 SnapRealm
*realm
= in
->find_snaprealm();
7320 snapid_t follows
= realm
->get_newest_seq();
7321 if (in
->snaprealm
|| follows
+ 1 > in
->get_oldest_snap()) {
7322 sr_t
*new_srnode
= in
->prepare_new_srnode(follows
);
7323 in
->record_snaprealm_past_parent(new_srnode
, straydn
->get_dir()->inode
->find_snaprealm());
7324 mdr
->more()->desti_srnode
= new_srnode
;
7330 if (in
->is_dir() && in
->has_subtree_root_dirfrag()) {
7331 // subtree root auths need to be witnesses
7332 set
<mds_rank_t
> witnesses
;
7333 in
->list_replicas(witnesses
);
7334 dout(10) << " witnesses " << witnesses
<< ", have " << mdr
->more()->witnessed
<< dendl
;
7336 for (set
<mds_rank_t
>::iterator p
= witnesses
.begin();
7337 p
!= witnesses
.end();
7339 if (mdr
->more()->witnessed
.count(*p
)) {
7340 dout(10) << " already witnessed by mds." << *p
<< dendl
;
7341 } else if (mdr
->more()->waiting_on_peer
.count(*p
)) {
7342 dout(10) << " already waiting on witness mds." << *p
<< dendl
;
7344 if (!_rmdir_prepare_witness(mdr
, *p
, mdr
->dn
[0], straydn
))
7348 if (!mdr
->more()->waiting_on_peer
.empty())
7349 return; // we're waiting for a witness.
7352 if (!rmdir
&& dnl
->is_primary() && mdr
->dn
[0].size() == 1)
7353 mds
->locker
->create_lock_cache(mdr
, diri
);
7356 if (dnl
->is_remote() && !dnl
->get_inode()->is_auth())
7357 _link_remote(mdr
, false, dn
, dnl
->get_inode());
7359 _unlink_local(mdr
, dn
, straydn
);
7362 class C_MDS_unlink_local_finish
: public ServerLogContext
{
7365 version_t dnpv
; // deleted dentry
7367 C_MDS_unlink_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CDentry
*sd
) :
7368 ServerLogContext(s
, r
), dn(d
), straydn(sd
),
7369 dnpv(d
->get_projected_version()) {}
7370 void finish(int r
) override
{
7371 ceph_assert(r
== 0);
7372 server
->_unlink_local_finish(mdr
, dn
, straydn
, dnpv
);
7376 void Server::_unlink_local(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
)
7378 dout(10) << "_unlink_local " << *dn
<< dendl
;
7380 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
7381 CInode
*in
= dnl
->get_inode();
7385 mdr
->ls
= mdlog
->get_current_segment();
7387 // prepare log entry
7388 EUpdate
*le
= new EUpdate(mdlog
, "unlink_local");
7389 mdlog
->start_entry(le
);
7390 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
7391 if (!mdr
->more()->witnessed
.empty()) {
7392 dout(20) << " noting uncommitted_peers " << mdr
->more()->witnessed
<< dendl
;
7393 le
->reqid
= mdr
->reqid
;
7394 le
->had_peers
= true;
7395 mdcache
->add_uncommitted_leader(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
7399 ceph_assert(dnl
->is_primary());
7400 straydn
->push_projected_linkage(in
);
7403 // the unlinked dentry
7406 auto pi
= in
->project_inode(mdr
);
7409 dn
->make_path_string(t
, true);
7410 pi
.inode
->stray_prior_path
= std::move(t
);
7412 pi
.inode
->version
= in
->pre_dirty();
7413 pi
.inode
->ctime
= mdr
->get_op_stamp();
7414 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
7415 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
7416 pi
.inode
->change_attr
++;
7418 if (pi
.inode
->nlink
== 0)
7419 in
->state_set(CInode::STATE_ORPHAN
);
7421 if (mdr
->more()->desti_srnode
) {
7422 auto& desti_srnode
= mdr
->more()->desti_srnode
;
7423 in
->project_snaprealm(desti_srnode
);
7424 desti_srnode
= NULL
;
7428 // will manually pop projected inode
7430 // primary link. add stray dentry.
7431 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, -1);
7432 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, straydn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
7434 pi
.inode
->update_backtrace();
7435 le
->metablob
.add_primary_dentry(straydn
, in
, true, true);
7437 // remote link. update remote inode.
7438 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_DIR
, -1);
7439 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
7440 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, in
);
7443 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
7444 le
->metablob
.add_null_dentry(dn
, true);
7447 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
7448 le
->metablob
.renamed_dirino
= in
->ino();
7451 dn
->push_projected_linkage();
7454 ceph_assert(in
->first
<= straydn
->first
);
7455 in
->first
= straydn
->first
;
7459 ceph_assert(straydn
);
7460 mdcache
->project_subtree_rename(in
, dn
->get_dir(), straydn
->get_dir());
7463 journal_and_reply(mdr
, 0, dn
, le
, new C_MDS_unlink_local_finish(this, mdr
, dn
, straydn
));
7466 void Server::_unlink_local_finish(MDRequestRef
& mdr
,
7467 CDentry
*dn
, CDentry
*straydn
,
7470 dout(10) << "_unlink_local_finish " << *dn
<< dendl
;
7472 if (!mdr
->more()->witnessed
.empty())
7473 mdcache
->logged_leader_update(mdr
->reqid
);
7475 CInode
*strayin
= NULL
;
7476 bool hadrealm
= false;
7478 // if there is newly created snaprealm, need to split old snaprealm's
7479 // inodes_with_caps. So pop snaprealm before linkage changes.
7480 strayin
= dn
->get_linkage()->get_inode();
7481 hadrealm
= strayin
->snaprealm
? true : false;
7482 strayin
->early_pop_projected_snaprealm();
7485 // unlink main dentry
7486 dn
->get_dir()->unlink_inode(dn
);
7487 dn
->pop_projected_linkage();
7488 dn
->mark_dirty(dnpv
, mdr
->ls
);
7490 // relink as stray? (i.e. was primary link?)
7492 dout(20) << " straydn is " << *straydn
<< dendl
;
7493 straydn
->pop_projected_linkage();
7494 mdcache
->touch_dentry_bottom(straydn
);
7499 mdcache
->send_dentry_unlink(dn
, straydn
, mdr
);
7502 // update subtree map?
7503 if (strayin
->is_dir())
7504 mdcache
->adjust_subtree_after_rename(strayin
, dn
->get_dir(), true);
7506 if (strayin
->snaprealm
&& !hadrealm
)
7507 mdcache
->do_realm_invalidate_and_update_notify(strayin
, CEPH_SNAP_OP_SPLIT
, false);
7511 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
7514 respond_to_request(mdr
, 0);
7516 // removing a new dn?
7517 dn
->get_dir()->try_remove_unlinked_dn(dn
);
7520 // respond_to_request() drops locks. So stray reintegration can race with us.
7521 if (straydn
&& !straydn
->get_projected_linkage()->is_null()) {
7522 // Tip off the MDCache that this dentry is a stray that
7523 // might be elegible for purge.
7524 mdcache
->notify_stray(straydn
);
7528 bool Server::_rmdir_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, vector
<CDentry
*>& trace
, CDentry
*straydn
)
7530 if (mds
->is_cluster_degraded() &&
7531 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(who
)) {
7532 dout(10) << "_rmdir_prepare_witness mds." << who
<< " is not active" << dendl
;
7533 if (mdr
->more()->waiting_on_peer
.empty())
7534 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(mdcache
, mdr
));
7538 dout(10) << "_rmdir_prepare_witness mds." << who
<< dendl
;
7539 auto req
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RMDIRPREP
);
7540 req
->srcdnpath
= filepath(trace
.front()->get_dir()->ino());
7541 for (auto dn
: trace
)
7542 req
->srcdnpath
.push_dentry(dn
->get_name());
7543 mdcache
->encode_replica_stray(straydn
, who
, req
->straybl
);
7544 if (mdr
->more()->desti_srnode
)
7545 encode(*mdr
->more()->desti_srnode
, req
->desti_snapbl
);
7547 req
->op_stamp
= mdr
->get_op_stamp();
7548 mds
->send_message_mds(req
, who
);
7550 ceph_assert(mdr
->more()->waiting_on_peer
.count(who
) == 0);
7551 mdr
->more()->waiting_on_peer
.insert(who
);
7555 struct C_MDS_PeerRmdirPrep
: public ServerLogContext
{
7556 CDentry
*dn
, *straydn
;
7557 C_MDS_PeerRmdirPrep(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CDentry
*st
)
7558 : ServerLogContext(s
, r
), dn(d
), straydn(st
) {}
7559 void finish(int r
) override
{
7560 server
->_logged_peer_rmdir(mdr
, dn
, straydn
);
7564 struct C_MDS_PeerRmdirCommit
: public ServerContext
{
7567 C_MDS_PeerRmdirCommit(Server
*s
, MDRequestRef
& r
, CDentry
*sd
)
7568 : ServerContext(s
), mdr(r
), straydn(sd
) { }
7569 void finish(int r
) override
{
7570 server
->_commit_peer_rmdir(mdr
, r
, straydn
);
7574 void Server::handle_peer_rmdir_prep(MDRequestRef
& mdr
)
7576 dout(10) << "handle_peer_rmdir_prep " << *mdr
7577 << " " << mdr
->peer_request
->srcdnpath
7578 << " to " << mdr
->peer_request
->destdnpath
7581 vector
<CDentry
*> trace
;
7582 filepath
srcpath(mdr
->peer_request
->srcdnpath
);
7583 dout(10) << " src " << srcpath
<< dendl
;
7585 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, false);
7586 int r
= mdcache
->path_traverse(mdr
, cf
, srcpath
,
7587 MDS_TRAVERSE_DISCOVER
| MDS_TRAVERSE_PATH_LOCKED
,
7590 if (r
== -CEPHFS_ESTALE
) {
7591 mdcache
->find_ino_peers(srcpath
.get_ino(), new C_MDS_RetryRequest(mdcache
, mdr
),
7592 mdr
->peer_to_mds
, true);
7595 ceph_assert(r
== 0);
7596 CDentry
*dn
= trace
.back();
7597 dout(10) << " dn " << *dn
<< dendl
;
7600 ceph_assert(mdr
->straydn
);
7601 CDentry
*straydn
= mdr
->straydn
;
7602 dout(10) << " straydn " << *straydn
<< dendl
;
7604 mdr
->set_op_stamp(mdr
->peer_request
->op_stamp
);
7606 rmdir_rollback rollback
;
7607 rollback
.reqid
= mdr
->reqid
;
7608 rollback
.src_dir
= dn
->get_dir()->dirfrag();
7609 rollback
.src_dname
= dn
->get_name();
7610 rollback
.dest_dir
= straydn
->get_dir()->dirfrag();
7611 rollback
.dest_dname
= straydn
->get_name();
7612 if (mdr
->peer_request
->desti_snapbl
.length()) {
7613 if (in
->snaprealm
) {
7614 encode(true, rollback
.snapbl
);
7615 in
->encode_snap_blob(rollback
.snapbl
);
7617 encode(false, rollback
.snapbl
);
7620 encode(rollback
, mdr
->more()->rollback_bl
);
7621 // FIXME: rollback snaprealm
7622 dout(20) << " rollback is " << mdr
->more()->rollback_bl
.length() << " bytes" << dendl
;
7624 // set up commit waiter
7625 mdr
->more()->peer_commit
= new C_MDS_PeerRmdirCommit(this, mdr
, straydn
);
7627 straydn
->push_projected_linkage(in
);
7628 dn
->push_projected_linkage();
7630 ceph_assert(straydn
->first
>= in
->first
);
7631 in
->first
= straydn
->first
;
7633 if (!in
->has_subtree_root_dirfrag(mds
->get_nodeid())) {
7634 dout(10) << " no auth subtree in " << *in
<< ", skipping journal" << dendl
;
7635 _logged_peer_rmdir(mdr
, dn
, straydn
);
7639 mdr
->ls
= mdlog
->get_current_segment();
7640 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rmdir", mdr
->reqid
, mdr
->peer_to_mds
,
7641 EPeerUpdate::OP_PREPARE
, EPeerUpdate::RMDIR
);
7642 mdlog
->start_entry(le
);
7643 le
->rollback
= mdr
->more()->rollback_bl
;
7645 le
->commit
.add_dir_context(straydn
->get_dir());
7646 le
->commit
.add_primary_dentry(straydn
, in
, true);
7647 // peer: no need to journal original dentry
7649 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
7650 le
->commit
.renamed_dirino
= in
->ino();
7652 mdcache
->project_subtree_rename(in
, dn
->get_dir(), straydn
->get_dir());
7653 mdcache
->add_uncommitted_peer(mdr
->reqid
, mdr
->ls
, mdr
->peer_to_mds
);
7655 mdr
->more()->peer_update_journaled
= true;
7656 submit_mdlog_entry(le
, new C_MDS_PeerRmdirPrep(this, mdr
, dn
, straydn
),
7661 void Server::_logged_peer_rmdir(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
)
7663 dout(10) << "_logged_peer_rmdir " << *mdr
<< " on " << *dn
<< dendl
;
7664 CInode
*in
= dn
->get_linkage()->get_inode();
7667 if (mdr
->peer_request
->desti_snapbl
.length()) {
7668 new_realm
= !in
->snaprealm
;
7669 in
->decode_snap_blob(mdr
->peer_request
->desti_snapbl
);
7670 ceph_assert(in
->snaprealm
);
7675 // update our cache now, so we are consistent with what is in the journal
7676 // when we journal a subtree map
7677 dn
->get_dir()->unlink_inode(dn
);
7678 straydn
->pop_projected_linkage();
7679 dn
->pop_projected_linkage();
7681 mdcache
->adjust_subtree_after_rename(in
, dn
->get_dir(), mdr
->more()->peer_update_journaled
);
7684 mdcache
->do_realm_invalidate_and_update_notify(in
, CEPH_SNAP_OP_SPLIT
, false);
7687 mdr
->reset_peer_request();
7690 if (!mdr
->aborted
) {
7691 auto reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RMDIRPREPACK
);
7692 if (!mdr
->more()->peer_update_journaled
)
7693 reply
->mark_not_journaled();
7694 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
7696 dout(10) << " abort flag set, finishing" << dendl
;
7697 mdcache
->request_finish(mdr
);
7701 void Server::handle_peer_rmdir_prep_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &ack
)
7703 dout(10) << "handle_peer_rmdir_prep_ack " << *mdr
7704 << " " << *ack
<< dendl
;
7706 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
7708 mdr
->more()->peers
.insert(from
);
7709 mdr
->more()->witnessed
.insert(from
);
7710 if (!ack
->is_not_journaled())
7711 mdr
->more()->has_journaled_peers
= true;
7713 // remove from waiting list
7714 ceph_assert(mdr
->more()->waiting_on_peer
.count(from
));
7715 mdr
->more()->waiting_on_peer
.erase(from
);
7717 if (mdr
->more()->waiting_on_peer
.empty())
7718 dispatch_client_request(mdr
); // go again!
7720 dout(10) << "still waiting on peers " << mdr
->more()->waiting_on_peer
<< dendl
;
7723 void Server::_commit_peer_rmdir(MDRequestRef
& mdr
, int r
, CDentry
*straydn
)
7725 dout(10) << "_commit_peer_rmdir " << *mdr
<< " r=" << r
<< dendl
;
7728 if (mdr
->more()->peer_update_journaled
) {
7729 CInode
*strayin
= straydn
->get_projected_linkage()->get_inode();
7730 if (strayin
&& !strayin
->snaprealm
)
7731 mdcache
->clear_dirty_bits_for_stray(strayin
);
7736 if (mdr
->more()->peer_update_journaled
) {
7737 // write a commit to the journal
7738 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rmdir_commit", mdr
->reqid
,
7739 mdr
->peer_to_mds
, EPeerUpdate::OP_COMMIT
,
7740 EPeerUpdate::RMDIR
);
7741 mdlog
->start_entry(le
);
7742 submit_mdlog_entry(le
, new C_MDS_CommittedPeer(this, mdr
), mdr
, __func__
);
7745 _committed_peer(mdr
);
7749 do_rmdir_rollback(mdr
->more()->rollback_bl
, mdr
->peer_to_mds
, mdr
);
7753 struct C_MDS_LoggedRmdirRollback
: public ServerLogContext
{
7757 C_MDS_LoggedRmdirRollback(Server
*s
, MDRequestRef
& m
, metareqid_t mr
, CDentry
*d
, CDentry
*st
)
7758 : ServerLogContext(s
, m
), reqid(mr
), dn(d
), straydn(st
) {}
7759 void finish(int r
) override
{
7760 server
->_rmdir_rollback_finish(mdr
, reqid
, dn
, straydn
);
7764 void Server::do_rmdir_rollback(bufferlist
&rbl
, mds_rank_t leader
, MDRequestRef
& mdr
)
7766 // unlink the other rollback methods, the rmdir rollback is only
7767 // needed to record the subtree changes in the journal for inode
7768 // replicas who are auth for empty dirfrags. no actual changes to
7769 // the file system are taking place here, so there is no Mutation.
7771 rmdir_rollback rollback
;
7772 auto p
= rbl
.cbegin();
7773 decode(rollback
, p
);
7775 dout(10) << "do_rmdir_rollback on " << rollback
.reqid
<< dendl
;
7776 mdcache
->add_rollback(rollback
.reqid
, leader
); // need to finish this update before resolve finishes
7777 ceph_assert(mdr
|| mds
->is_resolve());
7779 CDir
*dir
= mdcache
->get_dirfrag(rollback
.src_dir
);
7781 dir
= mdcache
->get_dirfrag(rollback
.src_dir
.ino
, rollback
.src_dname
);
7783 CDentry
*dn
= dir
->lookup(rollback
.src_dname
);
7785 dout(10) << " dn " << *dn
<< dendl
;
7786 CDir
*straydir
= mdcache
->get_dirfrag(rollback
.dest_dir
);
7787 ceph_assert(straydir
);
7788 CDentry
*straydn
= straydir
->lookup(rollback
.dest_dname
);
7789 ceph_assert(straydn
);
7790 dout(10) << " straydn " << *straydn
<< dendl
;
7791 CInode
*in
= straydn
->get_linkage()->get_inode();
7793 dn
->push_projected_linkage(in
);
7794 straydn
->push_projected_linkage();
7796 if (rollback
.snapbl
.length() && in
->snaprealm
) {
7798 auto p
= rollback
.snapbl
.cbegin();
7799 decode(hadrealm
, p
);
7801 decode(in
->snaprealm
->srnode
, p
);
7803 in
->snaprealm
->merge_to(dir
->get_inode()->find_snaprealm());
7807 if (mdr
&& !mdr
->more()->peer_update_journaled
) {
7808 ceph_assert(!in
->has_subtree_root_dirfrag(mds
->get_nodeid()));
7810 _rmdir_rollback_finish(mdr
, rollback
.reqid
, dn
, straydn
);
7815 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rmdir_rollback", rollback
.reqid
, leader
,
7816 EPeerUpdate::OP_ROLLBACK
, EPeerUpdate::RMDIR
);
7817 mdlog
->start_entry(le
);
7819 le
->commit
.add_dir_context(dn
->get_dir());
7820 le
->commit
.add_primary_dentry(dn
, in
, true);
7821 // peer: no need to journal straydn
7823 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
7824 le
->commit
.renamed_dirino
= in
->ino();
7826 mdcache
->project_subtree_rename(in
, straydn
->get_dir(), dn
->get_dir());
7828 submit_mdlog_entry(le
,
7829 new C_MDS_LoggedRmdirRollback(this, mdr
,rollback
.reqid
,
7835 void Server::_rmdir_rollback_finish(MDRequestRef
& mdr
, metareqid_t reqid
, CDentry
*dn
, CDentry
*straydn
)
7837 dout(10) << "_rmdir_rollback_finish " << reqid
<< dendl
;
7839 straydn
->get_dir()->unlink_inode(straydn
);
7840 dn
->pop_projected_linkage();
7841 straydn
->pop_projected_linkage();
7843 CInode
*in
= dn
->get_linkage()->get_inode();
7844 mdcache
->adjust_subtree_after_rename(in
, straydn
->get_dir(),
7845 !mdr
|| mdr
->more()->peer_update_journaled
);
7847 if (mds
->is_resolve()) {
7848 CDir
*root
= mdcache
->get_subtree_root(straydn
->get_dir());
7849 mdcache
->try_trim_non_auth_subtree(root
);
7853 mdcache
->request_finish(mdr
);
7855 mdcache
->finish_rollback(reqid
, mdr
);
/** _dir_is_nonempty[_unlocked]
 *
 * Check whether a directory is non-empty (i.e. whether we can rmdir it).
 *
 * The unlocked variant is a fast-path check; we can't really be
 * sure until we rdlock the filelock.
 */
7866 bool Server::_dir_is_nonempty_unlocked(MDRequestRef
& mdr
, CInode
*in
)
7868 dout(10) << "dir_is_nonempty_unlocked " << *in
<< dendl
;
7869 ceph_assert(in
->is_auth());
7871 if (in
->filelock
.is_cached())
7872 return false; // there can be pending async create/unlink. don't know.
7873 if (in
->snaprealm
&& in
->snaprealm
->srnode
.snaps
.size())
7874 return true; // in a snapshot!
7876 auto&& ls
= in
->get_dirfrags();
7877 for (const auto& dir
: ls
) {
7878 // is the frag obviously non-empty?
7879 if (dir
->is_auth()) {
7880 if (dir
->get_projected_fnode()->fragstat
.size()) {
7881 dout(10) << "dir_is_nonempty_unlocked dirstat has "
7882 << dir
->get_projected_fnode()->fragstat
.size() << " items " << *dir
<< dendl
;
7891 bool Server::_dir_is_nonempty(MDRequestRef
& mdr
, CInode
*in
)
7893 dout(10) << "dir_is_nonempty " << *in
<< dendl
;
7894 ceph_assert(in
->is_auth());
7895 ceph_assert(in
->filelock
.can_read(mdr
->get_client()));
7897 frag_info_t dirstat
;
7898 version_t dirstat_version
= in
->get_projected_inode()->dirstat
.version
;
7900 auto&& ls
= in
->get_dirfrags();
7901 for (const auto& dir
: ls
) {
7902 const auto& pf
= dir
->get_projected_fnode();
7903 if (pf
->fragstat
.size()) {
7904 dout(10) << "dir_is_nonempty dirstat has "
7905 << pf
->fragstat
.size() << " items " << *dir
<< dendl
;
7909 if (pf
->accounted_fragstat
.version
== dirstat_version
)
7910 dirstat
.add(pf
->accounted_fragstat
);
7912 dirstat
.add(pf
->fragstat
);
7915 return dirstat
.size() != in
->get_projected_inode()->dirstat
.size();
7919 // ======================================================
7922 class C_MDS_rename_finish
: public ServerLogContext
{
7927 C_MDS_rename_finish(Server
*s
, MDRequestRef
& r
,
7928 CDentry
*sdn
, CDentry
*ddn
, CDentry
*stdn
) :
7929 ServerLogContext(s
, r
),
7930 srcdn(sdn
), destdn(ddn
), straydn(stdn
) { }
7931 void finish(int r
) override
{
7932 ceph_assert(r
== 0);
7933 server
->_rename_finish(mdr
, srcdn
, destdn
, straydn
);
/** handle_client_rename
 *
 * The rename leader is the destdn auth. This is because cached inodes
 * must remain connected. Thus, any replica of srci must also
 * replicate destdn, and possibly straydn, so that srci (and
 * destdn->inode) remain connected during the rename.
 *
 * To do this, we freeze srci, then the leader (destdn auth) verifies that
 * all other nodes have also replicated destdn and straydn. Note that
 * destdn replicas need not also replicate srci.
 *
 * This function takes responsibility for the passed mdr.
 */
7952 void Server::handle_client_rename(MDRequestRef
& mdr
)
7954 const auto& req
= mdr
->client_request
;
7955 dout(7) << "handle_client_rename " << *req
<< dendl
;
7957 filepath destpath
= req
->get_filepath();
7958 filepath srcpath
= req
->get_filepath2();
7959 if (srcpath
.is_last_dot_or_dotdot() || destpath
.is_last_dot_or_dotdot()) {
7960 respond_to_request(mdr
, -CEPHFS_EBUSY
);
7964 if (req
->get_alternate_name().size() > alternate_name_max
) {
7965 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
7966 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
7970 auto [destdn
, srcdn
] = rdlock_two_paths_xlock_destdn(mdr
, true);
7974 dout(10) << " destdn " << *destdn
<< dendl
;
7975 CDir
*destdir
= destdn
->get_dir();
7976 ceph_assert(destdir
->is_auth());
7977 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
7979 dout(10) << " srcdn " << *srcdn
<< dendl
;
7980 CDir
*srcdir
= srcdn
->get_dir();
7981 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
7982 CInode
*srci
= srcdnl
->get_inode();
7983 dout(10) << " srci " << *srci
<< dendl
;
7985 // -- some sanity checks --
7986 if (destdn
== srcdn
) {
7987 dout(7) << "rename src=dest, noop" << dendl
;
7988 respond_to_request(mdr
, 0);
7992 // dest a child of src?
7993 // e.g. mv /usr /usr/foo
7994 if (srci
->is_dir() && srci
->is_projected_ancestor_of(destdir
->get_inode())) {
7995 dout(7) << "cannot rename item to be a child of itself" << dendl
;
7996 respond_to_request(mdr
, -CEPHFS_EINVAL
);
8000 // is this a stray migration, reintegration or merge? (sanity checks!)
8001 if (mdr
->reqid
.name
.is_mds() &&
8002 !(MDS_INO_IS_STRAY(srcpath
.get_ino()) &&
8003 MDS_INO_IS_STRAY(destpath
.get_ino())) &&
8004 !(destdnl
->is_remote() &&
8005 destdnl
->get_remote_ino() == srci
->ino())) {
8006 respond_to_request(mdr
, -CEPHFS_EINVAL
); // actually, this won't reply, but whatev.
8011 if (!destdnl
->is_null()) {
8012 //dout(10) << "dest dn exists " << *destdn << dendl;
8013 oldin
= mdcache
->get_dentry_inode(destdn
, mdr
, true);
8015 dout(10) << " oldin " << *oldin
<< dendl
;
8017 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
8018 if (oldin
->is_dir() && _dir_is_nonempty_unlocked(mdr
, oldin
)) {
8019 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
8023 // mv /some/thing /to/some/existing_other_thing
8024 if (oldin
->is_dir() && !srci
->is_dir()) {
8025 respond_to_request(mdr
, -CEPHFS_EISDIR
);
8028 if (!oldin
->is_dir() && srci
->is_dir()) {
8029 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
8032 if (srci
== oldin
&& !srcdir
->inode
->is_stray()) {
8033 respond_to_request(mdr
, 0); // no-op. POSIX makes no sense.
8036 if (destdn
->get_alternate_name() != req
->get_alternate_name()) {
8037 /* the dentry exists but the alternate_names do not match, fail... */
8038 respond_to_request(mdr
, -CEPHFS_EINVAL
);
8043 vector
<CDentry
*>& srctrace
= mdr
->dn
[1];
8044 vector
<CDentry
*>& desttrace
= mdr
->dn
[0];
8046 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
8047 if (destpath
.get_ino() != srcpath
.get_ino() &&
8048 !(req
->get_source().is_mds() &&
8049 MDS_INO_IS_STRAY(srcpath
.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
8050 CInode
*srcbase
= srctrace
[0]->get_dir()->get_inode();
8051 CInode
*destbase
= desttrace
[0]->get_dir()->get_inode();
8052 // ok, extend srctrace toward root until it is an ancestor of desttrace.
8053 while (srcbase
!= destbase
&&
8054 !srcbase
->is_projected_ancestor_of(destbase
)) {
8055 CDentry
*pdn
= srcbase
->get_projected_parent_dn();
8056 srctrace
.insert(srctrace
.begin(), pdn
);
8057 dout(10) << "rename prepending srctrace with " << *pdn
<< dendl
;
8058 srcbase
= pdn
->get_dir()->get_inode();
8061 // then, extend destpath until it shares the same parent inode as srcpath.
8062 while (destbase
!= srcbase
) {
8063 CDentry
*pdn
= destbase
->get_projected_parent_dn();
8064 desttrace
.insert(desttrace
.begin(), pdn
);
8065 dout(10) << "rename prepending desttrace with " << *pdn
<< dendl
;
8066 destbase
= pdn
->get_dir()->get_inode();
8068 dout(10) << "rename src and dest traces now share common ancestor " << *destbase
<< dendl
;
8072 bool linkmerge
= srcdnl
->get_inode() == destdnl
->get_inode();
8074 dout(10) << " this is a link merge" << dendl
;
8076 // -- create stray dentry? --
8077 CDentry
*straydn
= NULL
;
8078 if (destdnl
->is_primary() && !linkmerge
) {
8079 straydn
= prepare_stray_dentry(mdr
, destdnl
->get_inode());
8082 dout(10) << " straydn is " << *straydn
<< dendl
;
8083 } else if (mdr
->straydn
) {
8084 mdr
->unpin(mdr
->straydn
);
8085 mdr
->straydn
= NULL
;
8090 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
8091 MutationImpl::LockOpVec lov
;
8093 // we need to update srci's ctime. xlock its least contended lock to do that...
8094 lov
.add_xlock(&srci
->linklock
);
8095 lov
.add_xlock(&srci
->snaplock
);
8098 // xlock oldin (for nlink--)
8099 lov
.add_xlock(&oldin
->linklock
);
8100 lov
.add_xlock(&oldin
->snaplock
);
8101 if (oldin
->is_dir()) {
8102 ceph_assert(srci
->is_dir());
8103 lov
.add_rdlock(&oldin
->filelock
); // to verify it's empty
8105 // adjust locking order?
8106 int cmp
= mdr
->compare_paths();
8107 if (cmp
< 0 || (cmp
== 0 && oldin
->ino() < srci
->ino()))
8108 std::reverse(lov
.begin(), lov
.end());
8110 ceph_assert(!srci
->is_dir());
8111 // adjust locking order;
8112 if (srci
->ino() > oldin
->ino())
8113 std::reverse(lov
.begin(), lov
.end());
8119 lov
.add_wrlock(&straydn
->get_dir()->inode
->filelock
);
8120 lov
.add_wrlock(&straydn
->get_dir()->inode
->nestlock
);
8121 lov
.add_xlock(&straydn
->lock
);
8124 CInode
*auth_pin_freeze
= !srcdn
->is_auth() && srcdnl
->is_primary() ? srci
: nullptr;
8125 if (!mds
->locker
->acquire_locks(mdr
, lov
, auth_pin_freeze
))
8128 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
8132 ceph_assert(srcdir
->inode
->is_stray() && srcdnl
->is_primary() && destdnl
->is_remote());
8134 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
8135 if (!check_access(mdr
, srcdir
->get_inode(), MAY_WRITE
))
8138 if (!check_access(mdr
, destdn
->get_dir()->get_inode(), MAY_WRITE
))
8141 if (!check_fragment_space(mdr
, destdn
->get_dir()))
8144 if (!check_access(mdr
, srci
, MAY_WRITE
))
8148 // with read lock, really verify oldin is empty
8151 _dir_is_nonempty(mdr
, oldin
)) {
8152 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
8156 /* project_snaprealm_past_parent() will do this job
8158 // moving between snaprealms?
8159 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
8160 SnapRealm *srcrealm = srci->find_snaprealm();
8161 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
8162 if (srcrealm != destrealm &&
8163 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
8164 destrealm->get_newest_seq() + 1 > srcdn->first)) {
8165 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
8166 mdcache->snaprealm_create(mdr, srci);
8172 SnapRealm
*dest_realm
= nullptr;
8173 SnapRealm
*src_realm
= nullptr;
8175 dest_realm
= destdir
->inode
->find_snaprealm();
8176 if (srcdir
->inode
== destdir
->inode
)
8177 src_realm
= dest_realm
;
8179 src_realm
= srcdir
->inode
->find_snaprealm();
8180 if (src_realm
!= dest_realm
&&
8181 src_realm
->get_subvolume_ino() != dest_realm
->get_subvolume_ino()) {
8182 respond_to_request(mdr
, -CEPHFS_EXDEV
);
8187 ceph_assert(g_conf()->mds_kill_rename_at
!= 1);
8189 // -- open all srcdn inode frags, if any --
8190 // we need these open so that auth can properly delegate from inode to dirfrags
8191 // after the inode is _ours_.
8192 if (srcdnl
->is_primary() &&
8193 !srcdn
->is_auth() &&
8195 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl
;
8196 mdr
->set_stickydirs(srci
);
8199 srci
->dirfragtree
.get_leaves(leaves
);
8200 for (const auto& leaf
: leaves
) {
8201 CDir
*dir
= srci
->get_dirfrag(leaf
);
8203 dout(10) << " opening " << leaf
<< " under " << *srci
<< dendl
;
8204 mdcache
->open_remote_dirfrag(srci
, leaf
, new C_MDS_RetryRequest(mdcache
, mdr
));
8210 // -- prepare snaprealm ---
8213 if (!mdr
->more()->srci_srnode
&&
8214 srci
->get_projected_inode()->nlink
== 1 &&
8215 srci
->is_projected_snaprealm_global()) {
8216 sr_t
*new_srnode
= srci
->prepare_new_srnode(0);
8217 srci
->record_snaprealm_parent_dentry(new_srnode
, nullptr, destdn
, false);
8219 srci
->clear_snaprealm_global(new_srnode
);
8220 mdr
->more()->srci_srnode
= new_srnode
;
8223 if (oldin
&& !mdr
->more()->desti_srnode
) {
8224 if (oldin
->is_projected_snaprealm_global()) {
8225 sr_t
*new_srnode
= oldin
->prepare_new_srnode(0);
8226 oldin
->record_snaprealm_parent_dentry(new_srnode
, dest_realm
, destdn
, destdnl
->is_primary());
8227 // dropping the last linkage or dropping the last remote linkage,
8228 // detch the inode from global snaprealm
8229 auto nlink
= oldin
->get_projected_inode()->nlink
;
8231 (nlink
== 2 && !destdnl
->is_primary() &&
8232 !oldin
->get_projected_parent_dir()->inode
->is_stray()))
8233 oldin
->clear_snaprealm_global(new_srnode
);
8234 mdr
->more()->desti_srnode
= new_srnode
;
8235 } else if (destdnl
->is_primary()) {
8236 snapid_t follows
= dest_realm
->get_newest_seq();
8237 if (oldin
->snaprealm
|| follows
+ 1 > oldin
->get_oldest_snap()) {
8238 sr_t
*new_srnode
= oldin
->prepare_new_srnode(follows
);
8239 oldin
->record_snaprealm_past_parent(new_srnode
, straydn
->get_dir()->inode
->find_snaprealm());
8240 mdr
->more()->desti_srnode
= new_srnode
;
8244 if (!mdr
->more()->srci_srnode
) {
8245 if (srci
->is_projected_snaprealm_global()) {
8246 sr_t
*new_srnode
= srci
->prepare_new_srnode(0);
8247 srci
->record_snaprealm_parent_dentry(new_srnode
, src_realm
, srcdn
, srcdnl
->is_primary());
8248 mdr
->more()->srci_srnode
= new_srnode
;
8249 } else if (srcdnl
->is_primary()) {
8250 snapid_t follows
= src_realm
->get_newest_seq();
8251 if (src_realm
!= dest_realm
&&
8252 (srci
->snaprealm
|| follows
+ 1 > srci
->get_oldest_snap())) {
8253 sr_t
*new_srnode
= srci
->prepare_new_srnode(follows
);
8254 srci
->record_snaprealm_past_parent(new_srnode
, dest_realm
);
8255 mdr
->more()->srci_srnode
= new_srnode
;
8261 // -- prepare witnesses --
8264 * NOTE: we use _all_ replicas as witnesses.
8265 * this probably isn't totally necessary (esp for file renames),
8266 * but if/when we change that, we have to make sure rejoin is
8267 * sufficiently robust to handle strong rejoins from survivors
8268 * with totally wrong dentry->inode linkage.
8269 * (currently, it can ignore rename effects, because the resolve
8270 * stage will sort them out.)
8272 set
<mds_rank_t
> witnesses
= mdr
->more()->extra_witnesses
;
8273 if (srcdn
->is_auth())
8274 srcdn
->list_replicas(witnesses
);
8276 witnesses
.insert(srcdn
->authority().first
);
8277 if (srcdnl
->is_remote() && !srci
->is_auth())
8278 witnesses
.insert(srci
->authority().first
);
8279 destdn
->list_replicas(witnesses
);
8280 if (destdnl
->is_remote() && !oldin
->is_auth())
8281 witnesses
.insert(oldin
->authority().first
);
8282 dout(10) << " witnesses " << witnesses
<< ", have " << mdr
->more()->witnessed
<< dendl
;
8284 if (!witnesses
.empty()) {
8285 // Replicas can't see projected dentry linkages and will get confused.
8286 // We have taken snaplocks on ancestor inodes. Later rename/rmdir requests
8287 // can't project these inodes' linkages.
8288 bool need_flush
= false;
8289 for (auto& dn
: srctrace
) {
8290 if (dn
->is_projected()) {
8296 CDentry
*dn
= destdn
;
8298 if (dn
->is_projected()) {
8302 CInode
*diri
= dn
->get_dir()->get_inode();
8303 dn
= diri
->get_projected_parent_dn();
8307 mdlog
->wait_for_safe(
8308 new MDSInternalContextWrapper(mds
,
8309 new C_MDS_RetryRequest(mdcache
, mdr
)));
8315 // do srcdn auth last
8316 mds_rank_t last
= MDS_RANK_NONE
;
8317 if (!srcdn
->is_auth()) {
8318 last
= srcdn
->authority().first
;
8319 mdr
->more()->srcdn_auth_mds
= last
;
8320 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
8321 // are involved in the rename operation.
8322 if (srcdnl
->is_primary() && !mdr
->more()->is_ambiguous_auth
) {
8323 dout(10) << " preparing ambiguous auth for srci" << dendl
;
8324 ceph_assert(mdr
->more()->is_remote_frozen_authpin
);
8325 ceph_assert(mdr
->more()->rename_inode
== srci
);
8326 _rename_prepare_witness(mdr
, last
, witnesses
, srctrace
, desttrace
, straydn
);
8331 for (set
<mds_rank_t
>::iterator p
= witnesses
.begin();
8332 p
!= witnesses
.end();
8334 if (*p
== last
) continue; // do it last!
8335 if (mdr
->more()->witnessed
.count(*p
)) {
8336 dout(10) << " already witnessed by mds." << *p
<< dendl
;
8337 } else if (mdr
->more()->waiting_on_peer
.count(*p
)) {
8338 dout(10) << " already waiting on witness mds." << *p
<< dendl
;
8340 if (!_rename_prepare_witness(mdr
, *p
, witnesses
, srctrace
, desttrace
, straydn
))
8344 if (!mdr
->more()->waiting_on_peer
.empty())
8345 return; // we're waiting for a witness.
8347 if (last
!= MDS_RANK_NONE
&& mdr
->more()->witnessed
.count(last
) == 0) {
8348 dout(10) << " preparing last witness (srcdn auth)" << dendl
;
8349 ceph_assert(mdr
->more()->waiting_on_peer
.count(last
) == 0);
8350 _rename_prepare_witness(mdr
, last
, witnesses
, srctrace
, desttrace
, straydn
);
8354 // test hack: bail after peer does prepare, so we can verify it's _live_ rollback.
8355 if (!mdr
->more()->peers
.empty() && !srci
->is_dir())
8356 ceph_assert(g_conf()->mds_kill_rename_at
!= 3);
8357 if (!mdr
->more()->peers
.empty() && srci
->is_dir())
8358 ceph_assert(g_conf()->mds_kill_rename_at
!= 4);
8360 // -- declare now --
8361 mdr
->set_mds_stamp(ceph_clock_now());
8363 // -- prepare journal entry --
8364 mdr
->ls
= mdlog
->get_current_segment();
8365 EUpdate
*le
= new EUpdate(mdlog
, "rename");
8366 mdlog
->start_entry(le
);
8367 le
->metablob
.add_client_req(mdr
->reqid
, req
->get_oldest_client_tid());
8368 if (!mdr
->more()->witnessed
.empty()) {
8369 dout(20) << " noting uncommitted_peers " << mdr
->more()->witnessed
<< dendl
;
8371 le
->reqid
= mdr
->reqid
;
8372 le
->had_peers
= true;
8374 mdcache
->add_uncommitted_leader(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
8375 // no need to send frozen auth pin to recovring auth MDS of srci
8376 mdr
->more()->is_remote_frozen_authpin
= false;
8379 _rename_prepare(mdr
, &le
->metablob
, &le
->client_map
, srcdn
, destdn
, req
->get_alternate_name(), straydn
);
8380 if (le
->client_map
.length())
8381 le
->cmapv
= mds
->sessionmap
.get_projected();
8383 // -- commit locally --
8384 C_MDS_rename_finish
*fin
= new C_MDS_rename_finish(this, mdr
, srcdn
, destdn
, straydn
);
8386 journal_and_reply(mdr
, srci
, destdn
, le
, fin
);
8387 mds
->balancer
->maybe_fragment(destdn
->get_dir(), false);
8391 void Server::_rename_finish(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
8393 dout(10) << "_rename_finish " << *mdr
<< dendl
;
8395 if (!mdr
->more()->witnessed
.empty())
8396 mdcache
->logged_leader_update(mdr
->reqid
);
8399 _rename_apply(mdr
, srcdn
, destdn
, straydn
);
8401 mdcache
->send_dentry_link(destdn
, mdr
);
8403 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
8404 CInode
*in
= destdnl
->get_inode();
8405 bool need_eval
= mdr
->more()->cap_imports
.count(in
);
8407 // test hack: test peer commit
8408 if (!mdr
->more()->peers
.empty() && !in
->is_dir())
8409 ceph_assert(g_conf()->mds_kill_rename_at
!= 5);
8410 if (!mdr
->more()->peers
.empty() && in
->is_dir())
8411 ceph_assert(g_conf()->mds_kill_rename_at
!= 6);
8414 mds
->balancer
->hit_dir(srcdn
->get_dir(), META_POP_IWR
);
8415 if (destdnl
->is_remote() && in
->is_auth())
8416 mds
->balancer
->hit_inode(in
, META_POP_IWR
);
8418 // did we import srci? if so, explicitly ack that import that, before we unlock and reply.
8420 ceph_assert(g_conf()->mds_kill_rename_at
!= 7);
8423 respond_to_request(mdr
, 0);
8426 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
, true);
8429 // respond_to_request() drops locks. So stray reintegration can race with us.
8430 if (straydn
&& !straydn
->get_projected_linkage()->is_null()) {
8431 mdcache
->notify_stray(straydn
);
8439 bool Server::_rename_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, set
<mds_rank_t
> &witnesse
,
8440 vector
<CDentry
*>& srctrace
, vector
<CDentry
*>& dsttrace
, CDentry
*straydn
)
8442 const auto& client_req
= mdr
->client_request
;
8443 ceph_assert(client_req
);
8445 if (mds
->is_cluster_degraded() &&
8446 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(who
)) {
8447 dout(10) << "_rename_prepare_witness mds." << who
<< " is not active" << dendl
;
8448 if (mdr
->more()->waiting_on_peer
.empty())
8449 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(mdcache
, mdr
));
8453 dout(10) << "_rename_prepare_witness mds." << who
<< dendl
;
8454 auto req
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RENAMEPREP
);
8456 req
->srcdnpath
= filepath(srctrace
.front()->get_dir()->ino());
8457 for (auto dn
: srctrace
)
8458 req
->srcdnpath
.push_dentry(dn
->get_name());
8459 req
->destdnpath
= filepath(dsttrace
.front()->get_dir()->ino());
8460 for (auto dn
: dsttrace
)
8461 req
->destdnpath
.push_dentry(dn
->get_name());
8462 req
->alternate_name
= client_req
->alternate_name
;
8464 mdcache
->encode_replica_stray(straydn
, who
, req
->straybl
);
8466 if (mdr
->more()->srci_srnode
)
8467 encode(*mdr
->more()->srci_srnode
, req
->srci_snapbl
);
8468 if (mdr
->more()->desti_srnode
)
8469 encode(*mdr
->more()->desti_srnode
, req
->desti_snapbl
);
8471 req
->srcdn_auth
= mdr
->more()->srcdn_auth_mds
;
8473 // srcdn auth will verify our current witness list is sufficient
8474 req
->witnesses
= witnesse
;
8476 req
->op_stamp
= mdr
->get_op_stamp();
8477 mds
->send_message_mds(req
, who
);
8479 ceph_assert(mdr
->more()->waiting_on_peer
.count(who
) == 0);
8480 mdr
->more()->waiting_on_peer
.insert(who
);
8484 version_t
Server::_rename_prepare_import(MDRequestRef
& mdr
, CDentry
*srcdn
, bufferlist
*client_map_bl
)
8486 version_t oldpv
= mdr
->more()->inode_import_v
;
8488 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
8491 auto blp
= mdr
->more()->inode_import
.cbegin();
8494 map
<client_t
,entity_inst_t
> client_map
;
8495 map
<client_t
, client_metadata_t
> client_metadata_map
;
8496 decode(client_map
, blp
);
8497 decode(client_metadata_map
, blp
);
8498 prepare_force_open_sessions(client_map
, client_metadata_map
,
8499 mdr
->more()->imported_session_map
);
8500 encode(client_map
, *client_map_bl
, mds
->mdsmap
->get_up_features());
8501 encode(client_metadata_map
, *client_map_bl
);
8503 list
<ScatterLock
*> updated_scatterlocks
;
8504 mdcache
->migrator
->decode_import_inode(srcdn
, blp
, srcdn
->authority().first
, mdr
->ls
,
8505 mdr
->more()->cap_imports
, updated_scatterlocks
);
8507 // hack: force back to !auth and clean, temporarily
8508 srcdnl
->get_inode()->state_clear(CInode::STATE_AUTH
);
8509 srcdnl
->get_inode()->mark_clean();
8514 bool Server::_need_force_journal(CInode
*diri
, bool empty
)
8516 auto&& dirs
= diri
->get_dirfrags();
8518 bool force_journal
= false;
8520 for (const auto& dir
: dirs
) {
8521 if (dir
->is_subtree_root() && dir
->get_dir_auth().first
== mds
->get_nodeid()) {
8522 dout(10) << " frag " << dir
->get_frag() << " is auth subtree dirfrag, will force journal" << dendl
;
8523 force_journal
= true;
8526 dout(20) << " frag " << dir
->get_frag() << " is not auth subtree dirfrag" << dendl
;
8529 // see if any children of our frags are auth subtrees.
8530 std::vector
<CDir
*> subtrees
;
8531 mdcache
->get_subtrees(subtrees
);
8532 dout(10) << " subtrees " << subtrees
<< " frags " << dirs
<< dendl
;
8533 for (const auto& dir
: dirs
) {
8534 for (const auto& subtree
: subtrees
) {
8535 if (dir
->contains(subtree
)) {
8536 if (subtree
->get_dir_auth().first
== mds
->get_nodeid()) {
8537 dout(10) << " frag " << dir
->get_frag() << " contains (maybe) auth subtree, will force journal "
8538 << *subtree
<< dendl
;
8539 force_journal
= true;
8542 dout(20) << " frag " << dir
->get_frag() << " contains but isn't auth for " << *subtree
<< dendl
;
8544 dout(20) << " frag " << dir
->get_frag() << " does not contain " << *subtree
<< dendl
;
8550 return force_journal
;
8553 void Server::_rename_prepare(MDRequestRef
& mdr
,
8554 EMetaBlob
*metablob
, bufferlist
*client_map_bl
,
8555 CDentry
*srcdn
, CDentry
*destdn
, std::string_view alternate_name
,
8558 dout(10) << "_rename_prepare " << *mdr
<< " " << *srcdn
<< " " << *destdn
<< dendl
;
8560 dout(10) << " straydn " << *straydn
<< dendl
;
8562 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
8563 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
8564 CInode
*srci
= srcdnl
->get_inode();
8565 CInode
*oldin
= destdnl
->get_inode();
8567 // primary+remote link merge?
8568 bool linkmerge
= (srci
== oldin
);
8570 ceph_assert(srcdnl
->is_primary() && destdnl
->is_remote());
8571 bool silent
= srcdn
->get_dir()->inode
->is_stray();
8573 bool force_journal_dest
= false;
8574 if (srci
->is_dir() && !destdn
->is_auth()) {
8575 if (srci
->is_auth()) {
8576 // if we are auth for srci and exporting it, force journal because journal replay needs
8577 // the source inode to create auth subtrees.
8578 dout(10) << " we are exporting srci, will force journal destdn" << dendl
;
8579 force_journal_dest
= true;
8581 force_journal_dest
= _need_force_journal(srci
, false);
8584 bool force_journal_stray
= false;
8585 if (oldin
&& oldin
->is_dir() && straydn
&& !straydn
->is_auth())
8586 force_journal_stray
= _need_force_journal(oldin
, true);
8589 dout(10) << " merging remote and primary links to the same inode" << dendl
;
8591 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl
;
8592 if (force_journal_dest
)
8593 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl
;
8594 if (force_journal_stray
)
8595 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl
;
8597 if (srci
->is_dir() && (destdn
->is_auth() || force_journal_dest
)) {
8598 dout(10) << " noting renamed dir ino " << srci
->ino() << " in metablob" << dendl
;
8599 metablob
->renamed_dirino
= srci
->ino();
8600 } else if (oldin
&& oldin
->is_dir() && force_journal_stray
) {
8601 dout(10) << " noting rename target dir " << oldin
->ino() << " in metablob" << dendl
;
8602 metablob
->renamed_dirino
= oldin
->ino();
8606 CInode::mempool_inode
*spi
= 0; // renamed inode
8607 CInode::mempool_inode
*tpi
= 0; // target/overwritten inode
8611 if (destdnl
->is_primary()) {
8612 ceph_assert(straydn
); // moving to straydn.
8613 // link--, and move.
8614 if (destdn
->is_auth()) {
8615 auto pi
= oldin
->project_inode(mdr
); //project_snaprealm
8616 pi
.inode
->version
= straydn
->pre_dirty(pi
.inode
->version
);
8617 pi
.inode
->update_backtrace();
8618 tpi
= pi
.inode
.get();
8620 straydn
->push_projected_linkage(oldin
);
8621 } else if (destdnl
->is_remote()) {
8623 if (oldin
->is_auth()) {
8624 auto pi
= oldin
->project_inode(mdr
);
8625 pi
.inode
->version
= oldin
->pre_dirty();
8626 tpi
= pi
.inode
.get();
8632 if (destdnl
->is_null()) {
8633 /* handle_client_rename checks that alternate_name matches for existing destdn */
8634 destdn
->set_alternate_name(alternate_name
);
8636 if (srcdnl
->is_remote()) {
8639 if (destdn
->is_auth())
8640 mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty();
8641 destdn
->push_projected_linkage(srcdnl
->get_remote_ino(), srcdnl
->get_remote_d_type());
8643 if (srci
->is_auth()) {
8644 auto pi
= srci
->project_inode(mdr
);
8645 pi
.inode
->version
= srci
->pre_dirty();
8646 spi
= pi
.inode
.get();
8649 dout(10) << " will merge remote onto primary link" << dendl
;
8650 if (destdn
->is_auth()) {
8651 auto pi
= oldin
->project_inode(mdr
);
8652 pi
.inode
->version
= mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty(oldin
->get_version());
8653 spi
= pi
.inode
.get();
8657 if (destdn
->is_auth()) {
8659 if (srcdn
->is_auth())
8660 oldpv
= srci
->get_projected_version();
8662 oldpv
= _rename_prepare_import(mdr
, srcdn
, client_map_bl
);
8664 // note which dirfrags have child subtrees in the journal
8665 // event, so that we can open those (as bounds) during replay.
8666 if (srci
->is_dir()) {
8667 auto&& ls
= srci
->get_dirfrags();
8668 for (const auto& dir
: ls
) {
8669 if (!dir
->is_auth())
8670 metablob
->renamed_dir_frags
.push_back(dir
->get_frag());
8672 dout(10) << " noting renamed dir open frags " << metablob
->renamed_dir_frags
<< dendl
;
8675 auto pi
= srci
->project_inode(mdr
); // project snaprealm if srcdnl->is_primary
8676 // & srcdnl->snaprealm
8677 pi
.inode
->version
= mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty(oldpv
);
8678 pi
.inode
->update_backtrace();
8679 spi
= pi
.inode
.get();
8681 destdn
->push_projected_linkage(srci
);
8685 if (srcdn
->is_auth())
8686 mdr
->more()->pvmap
[srcdn
] = srcdn
->pre_dirty();
8687 srcdn
->push_projected_linkage(); // push null linkage
8691 spi
->ctime
= mdr
->get_op_stamp();
8692 if (mdr
->get_op_stamp() > spi
->rstat
.rctime
)
8693 spi
->rstat
.rctime
= mdr
->get_op_stamp();
8699 tpi
->ctime
= mdr
->get_op_stamp();
8700 if (mdr
->get_op_stamp() > tpi
->rstat
.rctime
)
8701 tpi
->rstat
.rctime
= mdr
->get_op_stamp();
8705 destdn
->make_path_string(t
, true);
8706 tpi
->stray_prior_path
= std::move(t
);
8709 if (tpi
->nlink
== 0)
8710 oldin
->state_set(CInode::STATE_ORPHAN
);
8714 // prepare nesting, mtime updates
8715 int predirty_dir
= silent
? 0:PREDIRTY_DIR
;
8717 // guarantee stray dir is processed first during journal replay. unlink the old inode,
8718 // then link the source inode to destdn
8719 if (destdnl
->is_primary()) {
8720 ceph_assert(straydn
);
8721 if (straydn
->is_auth()) {
8722 metablob
->add_dir_context(straydn
->get_dir());
8723 metablob
->add_dir(straydn
->get_dir(), true);
8727 if (!linkmerge
&& destdnl
->is_remote() && oldin
->is_auth()) {
8728 CDir
*oldin_dir
= oldin
->get_projected_parent_dir();
8729 if (oldin_dir
!= srcdn
->get_dir() && oldin_dir
!= destdn
->get_dir())
8730 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, oldin_dir
, PREDIRTY_PRIMARY
);
8734 if (destdn
->is_auth() && !destdnl
->is_null()) {
8735 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, destdn
->get_dir(),
8736 (destdnl
->is_primary() ? PREDIRTY_PRIMARY
:0)|predirty_dir
, -1);
8737 if (destdnl
->is_primary()) {
8738 ceph_assert(straydn
);
8739 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, straydn
->get_dir(),
8740 PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
8744 if (srcdnl
->is_remote() && srci
->is_auth()) {
8745 CDir
*srci_dir
= srci
->get_projected_parent_dir();
8746 if (srci_dir
!= srcdn
->get_dir() && srci_dir
!= destdn
->get_dir())
8747 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, srci_dir
, PREDIRTY_PRIMARY
);
8751 int predirty_primary
= (srcdnl
->is_primary() && srcdn
->get_dir() != destdn
->get_dir()) ? PREDIRTY_PRIMARY
:0;
8752 int flags
= predirty_dir
| predirty_primary
;
8753 if (srcdn
->is_auth())
8754 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, srcdn
->get_dir(), PREDIRTY_SHALLOW
|flags
, -1);
8755 if (destdn
->is_auth())
8756 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, destdn
->get_dir(), flags
, 1);
8758 // add it all to the metablob
8761 if (destdnl
->is_primary()) {
8762 ceph_assert(straydn
);
8763 if (destdn
->is_auth()) {
8764 // project snaprealm, too
8765 if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
8766 oldin
->project_snaprealm(desti_srnode
);
8767 if (tpi
->nlink
== 0)
8768 ceph_assert(!desti_srnode
->is_parent_global());
8769 desti_srnode
= NULL
;
8771 straydn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
8772 metablob
->add_primary_dentry(straydn
, oldin
, true, true);
8773 } else if (force_journal_stray
) {
8774 dout(10) << " forced journaling straydn " << *straydn
<< dendl
;
8775 metablob
->add_dir_context(straydn
->get_dir());
8776 metablob
->add_primary_dentry(straydn
, oldin
, true);
8778 } else if (destdnl
->is_remote()) {
8779 if (oldin
->is_auth()) {
8780 sr_t
*new_srnode
= NULL
;
8781 if (mdr
->peer_request
) {
8782 if (mdr
->peer_request
->desti_snapbl
.length() > 0) {
8783 new_srnode
= new sr_t();
8784 auto p
= mdr
->peer_request
->desti_snapbl
.cbegin();
8785 decode(*new_srnode
, p
);
8787 } else if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
8788 new_srnode
= desti_srnode
;
8789 desti_srnode
= NULL
;
8792 oldin
->project_snaprealm(new_srnode
);
8793 if (tpi
->nlink
== 0)
8794 ceph_assert(!new_srnode
->is_parent_global());
8797 CDentry
*oldin_pdn
= oldin
->get_projected_parent_dn();
8798 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, oldin_pdn
);
8799 metablob
->add_primary_dentry(oldin_pdn
, oldin
, true);
8805 if (srcdnl
->is_remote()) {
8806 ceph_assert(!linkmerge
);
8807 if (destdn
->is_auth() && !destdnl
->is_null())
8808 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
8810 destdn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
8812 if (destdn
->is_auth())
8813 metablob
->add_remote_dentry(destdn
, true, srcdnl
->get_remote_ino(), srcdnl
->get_remote_d_type());
8815 if (srci
->is_auth() ) { // it's remote
8816 if (mdr
->peer_request
) {
8817 if (mdr
->peer_request
->srci_snapbl
.length() > 0) {
8818 sr_t
*new_srnode
= new sr_t();
8819 auto p
= mdr
->peer_request
->srci_snapbl
.cbegin();
8820 decode(*new_srnode
, p
);
8821 srci
->project_snaprealm(new_srnode
);
8823 } else if (auto& srci_srnode
= mdr
->more()->srci_srnode
) {
8824 srci
->project_snaprealm(srci_srnode
);
8828 CDentry
*srci_pdn
= srci
->get_projected_parent_dn();
8829 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, srci_pdn
);
8830 metablob
->add_primary_dentry(srci_pdn
, srci
, true);
8832 } else if (srcdnl
->is_primary()) {
8833 // project snap parent update?
8834 if (destdn
->is_auth()) {
8835 if (auto& srci_srnode
= mdr
->more()->srci_srnode
) {
8836 srci
->project_snaprealm(srci_srnode
);
8841 if (destdn
->is_auth() && !destdnl
->is_null())
8842 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
8844 destdn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
8846 if (destdn
->is_auth())
8847 metablob
->add_primary_dentry(destdn
, srci
, true, true);
8848 else if (force_journal_dest
) {
8849 dout(10) << " forced journaling destdn " << *destdn
<< dendl
;
8850 metablob
->add_dir_context(destdn
->get_dir());
8851 metablob
->add_primary_dentry(destdn
, srci
, true);
8852 if (srcdn
->is_auth() && srci
->is_dir()) {
8853 // journal new subtrees root dirfrags
8854 auto&& ls
= srci
->get_dirfrags();
8855 for (const auto& dir
: ls
) {
8857 metablob
->add_dir(dir
, true);
8864 if (srcdn
->is_auth()) {
8865 dout(10) << " journaling srcdn " << *srcdn
<< dendl
;
8866 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, srcdn
, CEPH_NOSNAP
, 0, srcdnl
);
8867 // also journal the inode in case we need do peer rename rollback. It is Ok to add
8868 // both primary and NULL dentries. Because during journal replay, null dentry is
8869 // processed after primary dentry.
8870 if (srcdnl
->is_primary() && !srci
->is_dir() && !destdn
->is_auth())
8871 metablob
->add_primary_dentry(srcdn
, srci
, true);
8872 metablob
->add_null_dentry(srcdn
, true);
8874 dout(10) << " NOT journaling srcdn " << *srcdn
<< dendl
;
8876 // make renamed inode first track the dn
8877 if (srcdnl
->is_primary() && destdn
->is_auth()) {
8878 ceph_assert(srci
->first
<= destdn
->first
);
8879 srci
->first
= destdn
->first
;
8881 // make stray inode first track the straydn
8882 if (straydn
&& straydn
->is_auth()) {
8883 ceph_assert(oldin
->first
<= straydn
->first
);
8884 oldin
->first
= straydn
->first
;
8887 if (oldin
&& oldin
->is_dir()) {
8888 ceph_assert(straydn
);
8889 mdcache
->project_subtree_rename(oldin
, destdn
->get_dir(), straydn
->get_dir());
8892 mdcache
->project_subtree_rename(srci
, srcdn
->get_dir(), destdn
->get_dir());
8897 void Server::_rename_apply(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
8899 dout(10) << "_rename_apply " << *mdr
<< " " << *srcdn
<< " " << *destdn
<< dendl
;
8900 dout(10) << " pvs " << mdr
->more()->pvmap
<< dendl
;
8902 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
8903 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
8905 CInode
*oldin
= destdnl
->get_inode();
8907 // primary+remote link merge?
8908 bool linkmerge
= (srcdnl
->get_inode() == oldin
);
8910 ceph_assert(srcdnl
->is_primary() || destdnl
->is_remote());
8912 bool new_in_snaprealm
= false;
8913 bool new_oldin_snaprealm
= false;
8917 if (destdnl
->is_primary()) {
8918 ceph_assert(straydn
);
8919 dout(10) << "straydn is " << *straydn
<< dendl
;
8921 // if there is newly created snaprealm, need to split old snaprealm's
8922 // inodes_with_caps. So pop snaprealm before linkage changes.
8923 if (destdn
->is_auth()) {
8924 bool hadrealm
= (oldin
->snaprealm
? true : false);
8925 oldin
->early_pop_projected_snaprealm();
8926 new_oldin_snaprealm
= (oldin
->snaprealm
&& !hadrealm
);
8928 ceph_assert(mdr
->peer_request
);
8929 if (mdr
->peer_request
->desti_snapbl
.length()) {
8930 new_oldin_snaprealm
= !oldin
->snaprealm
;
8931 oldin
->decode_snap_blob(mdr
->peer_request
->desti_snapbl
);
8932 ceph_assert(oldin
->snaprealm
);
8936 destdn
->get_dir()->unlink_inode(destdn
, false);
8938 straydn
->pop_projected_linkage();
8939 if (mdr
->is_peer() && !mdr
->more()->peer_update_journaled
)
8940 ceph_assert(!straydn
->is_projected()); // no other projected
8943 if (destdn
->is_auth())
8944 oldin
->pop_and_dirty_projected_inode(mdr
->ls
, mdr
);
8946 mdcache
->touch_dentry_bottom(straydn
); // drop dn as quickly as possible.
8947 } else if (destdnl
->is_remote()) {
8948 destdn
->get_dir()->unlink_inode(destdn
, false);
8949 if (oldin
->is_auth()) {
8950 oldin
->pop_and_dirty_projected_inode(mdr
->ls
, mdr
);
8951 } else if (mdr
->peer_request
) {
8952 if (mdr
->peer_request
->desti_snapbl
.length() > 0) {
8953 ceph_assert(oldin
->snaprealm
);
8954 oldin
->decode_snap_blob(mdr
->peer_request
->desti_snapbl
);
8956 } else if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
8957 delete desti_srnode
;
8958 desti_srnode
= NULL
;
8963 // unlink src before we relink it at dest
8964 CInode
*in
= srcdnl
->get_inode();
8967 bool srcdn_was_remote
= srcdnl
->is_remote();
8968 if (!srcdn_was_remote
) {
8969 // if there is newly created snaprealm, need to split old snaprealm's
8970 // inodes_with_caps. So pop snaprealm before linkage changes.
8971 if (destdn
->is_auth()) {
8972 bool hadrealm
= (in
->snaprealm
? true : false);
8973 in
->early_pop_projected_snaprealm();
8974 new_in_snaprealm
= (in
->snaprealm
&& !hadrealm
);
8976 ceph_assert(mdr
->peer_request
);
8977 if (mdr
->peer_request
->srci_snapbl
.length()) {
8978 new_in_snaprealm
= !in
->snaprealm
;
8979 in
->decode_snap_blob(mdr
->peer_request
->srci_snapbl
);
8980 ceph_assert(in
->snaprealm
);
8985 srcdn
->get_dir()->unlink_inode(srcdn
);
8988 if (srcdn_was_remote
) {
8991 destdnl
= destdn
->pop_projected_linkage();
8992 if (mdr
->is_peer() && !mdr
->more()->peer_update_journaled
)
8993 ceph_assert(!destdn
->is_projected()); // no other projected
8995 destdn
->link_remote(destdnl
, in
);
8996 if (destdn
->is_auth())
8997 destdn
->mark_dirty(mdr
->more()->pvmap
[destdn
], mdr
->ls
);
8999 if (in
->is_auth()) {
9000 in
->pop_and_dirty_projected_inode(mdr
->ls
, mdr
);
9001 } else if (mdr
->peer_request
) {
9002 if (mdr
->peer_request
->srci_snapbl
.length() > 0) {
9003 ceph_assert(in
->snaprealm
);
9004 in
->decode_snap_blob(mdr
->peer_request
->srci_snapbl
);
9006 } else if (auto& srci_srnode
= mdr
->more()->srci_srnode
) {
9011 dout(10) << "merging remote onto primary link" << dendl
;
9012 oldin
->pop_and_dirty_projected_inode(mdr
->ls
, mdr
);
9016 dout(10) << "merging primary onto remote link" << dendl
;
9017 destdn
->get_dir()->unlink_inode(destdn
, false);
9019 destdnl
= destdn
->pop_projected_linkage();
9020 if (mdr
->is_peer() && !mdr
->more()->peer_update_journaled
)
9021 ceph_assert(!destdn
->is_projected()); // no other projected
9023 // srcdn inode import?
9024 if (!srcdn
->is_auth() && destdn
->is_auth()) {
9025 ceph_assert(mdr
->more()->inode_import
.length() > 0);
9027 map
<client_t
,Capability::Import
> imported_caps
;
9029 // finish cap imports
9030 finish_force_open_sessions(mdr
->more()->imported_session_map
);
9031 if (mdr
->more()->cap_imports
.count(destdnl
->get_inode())) {
9032 mdcache
->migrator
->finish_import_inode_caps(destdnl
->get_inode(),
9033 mdr
->more()->srcdn_auth_mds
, true,
9034 mdr
->more()->imported_session_map
,
9035 mdr
->more()->cap_imports
[destdnl
->get_inode()],
9039 mdr
->more()->inode_import
.clear();
9040 encode(imported_caps
, mdr
->more()->inode_import
);
9042 /* hack: add an auth pin for each xlock we hold. These were
9043 * remote xlocks previously but now they're local and
9044 * we're going to try and unpin when we xlock_finish. */
9046 for (auto i
= mdr
->locks
.lower_bound(&destdnl
->get_inode()->versionlock
);
9047 i
!= mdr
->locks
.end();
9049 SimpleLock
*lock
= i
->lock
;
9050 if (lock
->get_parent() != destdnl
->get_inode())
9052 if (i
->is_xlock() && !lock
->is_locallock())
9053 mds
->locker
->xlock_import(lock
);
9056 // hack: fix auth bit
9057 in
->state_set(CInode::STATE_AUTH
);
9059 mdr
->clear_ambiguous_auth();
9062 if (destdn
->is_auth())
9063 in
->pop_and_dirty_projected_inode(mdr
->ls
, mdr
);
9067 if (srcdn
->is_auth())
9068 srcdn
->mark_dirty(mdr
->more()->pvmap
[srcdn
], mdr
->ls
);
9069 srcdn
->pop_projected_linkage();
9070 if (mdr
->is_peer() && !mdr
->more()->peer_update_journaled
)
9071 ceph_assert(!srcdn
->is_projected()); // no other projected
9073 // apply remaining projected inodes (nested)
9076 // update subtree map?
9077 if (destdnl
->is_primary() && in
->is_dir())
9078 mdcache
->adjust_subtree_after_rename(in
, srcdn
->get_dir(), true);
9080 if (straydn
&& oldin
->is_dir())
9081 mdcache
->adjust_subtree_after_rename(oldin
, destdn
->get_dir(), true);
9083 if (new_oldin_snaprealm
)
9084 mdcache
->do_realm_invalidate_and_update_notify(oldin
, CEPH_SNAP_OP_SPLIT
, false);
9085 if (new_in_snaprealm
)
9086 mdcache
->do_realm_invalidate_and_update_notify(in
, CEPH_SNAP_OP_SPLIT
, true);
9088 // removing a new dn?
9089 if (srcdn
->is_auth())
9090 srcdn
->get_dir()->try_remove_unlinked_dn(srcdn
);
9098 class C_MDS_PeerRenamePrep
: public ServerLogContext
{
9099 CDentry
*srcdn
, *destdn
, *straydn
;
9101 C_MDS_PeerRenamePrep(Server
*s
, MDRequestRef
& m
, CDentry
*sr
, CDentry
*de
, CDentry
*st
) :
9102 ServerLogContext(s
, m
), srcdn(sr
), destdn(de
), straydn(st
) {}
9103 void finish(int r
) override
{
9104 server
->_logged_peer_rename(mdr
, srcdn
, destdn
, straydn
);
9108 class C_MDS_PeerRenameCommit
: public ServerContext
{
9110 CDentry
*srcdn
, *destdn
, *straydn
;
9112 C_MDS_PeerRenameCommit(Server
*s
, MDRequestRef
& m
, CDentry
*sr
, CDentry
*de
, CDentry
*st
) :
9113 ServerContext(s
), mdr(m
), srcdn(sr
), destdn(de
), straydn(st
) {}
9114 void finish(int r
) override
{
9115 server
->_commit_peer_rename(mdr
, r
, srcdn
, destdn
, straydn
);
9119 class C_MDS_PeerRenameSessionsFlushed
: public ServerContext
{
9122 C_MDS_PeerRenameSessionsFlushed(Server
*s
, MDRequestRef
& r
) :
9123 ServerContext(s
), mdr(r
) {}
9124 void finish(int r
) override
{
9125 server
->_peer_rename_sessions_flushed(mdr
);
9129 void Server::handle_peer_rename_prep(MDRequestRef
& mdr
)
9131 dout(10) << "handle_peer_rename_prep " << *mdr
9132 << " " << mdr
->peer_request
->srcdnpath
9133 << " to " << mdr
->peer_request
->destdnpath
9136 if (mdr
->peer_request
->is_interrupted()) {
9137 dout(10) << " peer request interrupted, sending noop reply" << dendl
;
9138 auto reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RENAMEPREPACK
);
9139 reply
->mark_interrupted();
9140 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
9141 mdr
->reset_peer_request();
9146 filepath
destpath(mdr
->peer_request
->destdnpath
);
9147 dout(10) << " dest " << destpath
<< dendl
;
9148 vector
<CDentry
*> trace
;
9149 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, false);
9150 int r
= mdcache
->path_traverse(mdr
, cf
, destpath
,
9151 MDS_TRAVERSE_DISCOVER
| MDS_TRAVERSE_PATH_LOCKED
| MDS_TRAVERSE_WANT_DENTRY
,
9154 if (r
== -CEPHFS_ESTALE
) {
9155 mdcache
->find_ino_peers(destpath
.get_ino(), new C_MDS_RetryRequest(mdcache
, mdr
),
9156 mdr
->peer_to_mds
, true);
9159 ceph_assert(r
== 0); // we shouldn't get an error here!
9161 CDentry
*destdn
= trace
.back();
9162 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
9163 dout(10) << " destdn " << *destdn
<< dendl
;
9167 filepath
srcpath(mdr
->peer_request
->srcdnpath
);
9168 dout(10) << " src " << srcpath
<< dendl
;
9169 CInode
*srci
= nullptr;
9170 r
= mdcache
->path_traverse(mdr
, cf
, srcpath
,
9171 MDS_TRAVERSE_DISCOVER
| MDS_TRAVERSE_PATH_LOCKED
,
9174 ceph_assert(r
== 0);
9176 CDentry
*srcdn
= trace
.back();
9177 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
9178 dout(10) << " srcdn " << *srcdn
<< dendl
;
9183 bool linkmerge
= srcdnl
->get_inode() == destdnl
->get_inode();
9185 ceph_assert(srcdnl
->is_primary() && destdnl
->is_remote());
9186 CDentry
*straydn
= mdr
->straydn
;
9187 if (destdnl
->is_primary() && !linkmerge
)
9188 ceph_assert(straydn
);
9190 mdr
->set_op_stamp(mdr
->peer_request
->op_stamp
);
9191 mdr
->more()->srcdn_auth_mds
= srcdn
->authority().first
;
9193 // set up commit waiter (early, to clean up any freezing etc we do)
9194 if (!mdr
->more()->peer_commit
)
9195 mdr
->more()->peer_commit
= new C_MDS_PeerRenameCommit(this, mdr
, srcdn
, destdn
, straydn
);
9198 if (srcdn
->is_auth()) {
9199 set
<mds_rank_t
> srcdnrep
;
9200 srcdn
->list_replicas(srcdnrep
);
9202 bool reply_witness
= false;
9203 if (srcdnl
->is_primary() && !srcdnl
->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH
)) {
9206 // - avoid conflicting lock state changes
9207 // - avoid concurrent updates to the inode
9208 // (this could also be accomplished with the versionlock)
9209 int allowance
= 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
9210 dout(10) << " freezing srci " << *srcdnl
->get_inode() << " with allowance " << allowance
<< dendl
;
9211 bool frozen_inode
= srcdnl
->get_inode()->freeze_inode(allowance
);
9213 // unfreeze auth pin after freezing the inode to avoid queueing waiters
9214 if (srcdnl
->get_inode()->is_frozen_auth_pin())
9215 mdr
->unfreeze_auth_pin();
9217 if (!frozen_inode
) {
9218 srcdnl
->get_inode()->add_waiter(CInode::WAIT_FROZEN
, new C_MDS_RetryRequest(mdcache
, mdr
));
9223 * set ambiguous auth for srci
9224 * NOTE: we don't worry about ambiguous cache expire as we do
9225 * with subtree migrations because all peers will pin
9226 * srcdn->get_inode() for duration of this rename.
9228 mdr
->set_ambiguous_auth(srcdnl
->get_inode());
9230 // just mark the source inode as ambiguous auth if more than two MDS are involved.
9231 // the leader will send another OP_RENAMEPREP peer request later.
9232 if (mdr
->peer_request
->witnesses
.size() > 1) {
9233 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl
;
9234 reply_witness
= true;
9237 // make sure bystanders have received all lock related messages
9238 for (set
<mds_rank_t
>::iterator p
= srcdnrep
.begin(); p
!= srcdnrep
.end(); ++p
) {
9239 if (*p
== mdr
->peer_to_mds
||
9240 (mds
->is_cluster_degraded() &&
9241 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(*p
)))
9243 auto notify
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RENAMENOTIFY
);
9244 mds
->send_message_mds(notify
, *p
);
9245 mdr
->more()->waiting_on_peer
.insert(*p
);
9248 // make sure clients have received all cap related messages
9249 set
<client_t
> export_client_set
;
9250 mdcache
->migrator
->get_export_client_set(srcdnl
->get_inode(), export_client_set
);
9252 MDSGatherBuilder
gather(g_ceph_context
);
9253 flush_client_sessions(export_client_set
, gather
);
9254 if (gather
.has_subs()) {
9255 mdr
->more()->waiting_on_peer
.insert(MDS_RANK_NONE
);
9256 gather
.set_finisher(new C_MDS_PeerRenameSessionsFlushed(this, mdr
));
9261 // is witness list sufficient?
9262 for (set
<mds_rank_t
>::iterator p
= srcdnrep
.begin(); p
!= srcdnrep
.end(); ++p
) {
9263 if (*p
== mdr
->peer_to_mds
||
9264 mdr
->peer_request
->witnesses
.count(*p
)) continue;
9265 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl
;
9266 reply_witness
= true;
9270 if (reply_witness
) {
9271 ceph_assert(!srcdnrep
.empty());
9272 auto reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RENAMEPREPACK
);
9273 reply
->witnesses
.swap(srcdnrep
);
9274 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
9275 mdr
->reset_peer_request();
9278 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl
;
9279 if (!mdr
->more()->waiting_on_peer
.empty()) {
9280 dout(10) << " still waiting for rename notify acks from "
9281 << mdr
->more()->waiting_on_peer
<< dendl
;
9284 } else if (srcdnl
->is_primary() && srcdn
->authority() != destdn
->authority()) {
9285 // set ambiguous auth for srci on witnesses
9286 mdr
->set_ambiguous_auth(srcdnl
->get_inode());
9289 // encode everything we'd need to roll this back... basically, just the original state.
9290 rename_rollback rollback
;
9292 rollback
.reqid
= mdr
->reqid
;
9294 rollback
.orig_src
.dirfrag
= srcdn
->get_dir()->dirfrag();
9295 rollback
.orig_src
.dirfrag_old_mtime
= srcdn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
9296 rollback
.orig_src
.dirfrag_old_rctime
= srcdn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
9297 rollback
.orig_src
.dname
= srcdn
->get_name();
9298 if (srcdnl
->is_primary())
9299 rollback
.orig_src
.ino
= srcdnl
->get_inode()->ino();
9301 ceph_assert(srcdnl
->is_remote());
9302 rollback
.orig_src
.remote_ino
= srcdnl
->get_remote_ino();
9303 rollback
.orig_src
.remote_d_type
= srcdnl
->get_remote_d_type();
9306 rollback
.orig_dest
.dirfrag
= destdn
->get_dir()->dirfrag();
9307 rollback
.orig_dest
.dirfrag_old_mtime
= destdn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
9308 rollback
.orig_dest
.dirfrag_old_rctime
= destdn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
9309 rollback
.orig_dest
.dname
= destdn
->get_name();
9310 if (destdnl
->is_primary())
9311 rollback
.orig_dest
.ino
= destdnl
->get_inode()->ino();
9312 else if (destdnl
->is_remote()) {
9313 rollback
.orig_dest
.remote_ino
= destdnl
->get_remote_ino();
9314 rollback
.orig_dest
.remote_d_type
= destdnl
->get_remote_d_type();
9318 rollback
.stray
.dirfrag
= straydn
->get_dir()->dirfrag();
9319 rollback
.stray
.dirfrag_old_mtime
= straydn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
9320 rollback
.stray
.dirfrag_old_rctime
= straydn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
9321 rollback
.stray
.dname
= straydn
->get_name();
9323 if (mdr
->peer_request
->desti_snapbl
.length()) {
9324 CInode
*oldin
= destdnl
->get_inode();
9325 if (oldin
->snaprealm
) {
9326 encode(true, rollback
.desti_snapbl
);
9327 oldin
->encode_snap_blob(rollback
.desti_snapbl
);
9329 encode(false, rollback
.desti_snapbl
);
9332 if (mdr
->peer_request
->srci_snapbl
.length()) {
9333 if (srci
->snaprealm
) {
9334 encode(true, rollback
.srci_snapbl
);
9335 srci
->encode_snap_blob(rollback
.srci_snapbl
);
9337 encode(false, rollback
.srci_snapbl
);
9340 encode(rollback
, mdr
->more()->rollback_bl
);
9341 // FIXME: rollback snaprealm
9342 dout(20) << " rollback is " << mdr
->more()->rollback_bl
.length() << " bytes" << dendl
;
9345 mdr
->ls
= mdlog
->get_current_segment();
9346 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rename_prep", mdr
->reqid
, mdr
->peer_to_mds
,
9347 EPeerUpdate::OP_PREPARE
, EPeerUpdate::RENAME
);
9348 mdlog
->start_entry(le
);
9349 le
->rollback
= mdr
->more()->rollback_bl
;
9351 bufferlist blah
; // inode import data... obviously not used if we're the peer
9352 _rename_prepare(mdr
, &le
->commit
, &blah
, srcdn
, destdn
, mdr
->peer_request
->alternate_name
, straydn
);
9354 if (le
->commit
.empty()) {
9355 dout(10) << " empty metablob, skipping journal" << dendl
;
9356 mdlog
->cancel_entry(le
);
9358 _logged_peer_rename(mdr
, srcdn
, destdn
, straydn
);
9360 mdcache
->add_uncommitted_peer(mdr
->reqid
, mdr
->ls
, mdr
->peer_to_mds
);
9361 mdr
->more()->peer_update_journaled
= true;
9362 submit_mdlog_entry(le
, new C_MDS_PeerRenamePrep(this, mdr
, srcdn
, destdn
, straydn
),
9368 void Server::_logged_peer_rename(MDRequestRef
& mdr
,
9369 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
9371 dout(10) << "_logged_peer_rename " << *mdr
<< dendl
;
9374 ref_t
<MMDSPeerRequest
> reply
;
9375 if (!mdr
->aborted
) {
9376 reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RENAMEPREPACK
);
9377 if (!mdr
->more()->peer_update_journaled
)
9378 reply
->mark_not_journaled();
9381 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
9382 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
9385 if (srcdn
->is_auth() && srcdnl
->is_primary()) {
9386 // set export bounds for CInode::encode_export()
9388 std::vector
<CDir
*> bounds
;
9389 if (srcdnl
->get_inode()->is_dir()) {
9390 srcdnl
->get_inode()->get_dirfrags(bounds
);
9391 for (const auto& bound
: bounds
) {
9392 bound
->state_set(CDir::STATE_EXPORTBOUND
);
9396 map
<client_t
,entity_inst_t
> exported_client_map
;
9397 map
<client_t
, client_metadata_t
> exported_client_metadata_map
;
9399 mdcache
->migrator
->encode_export_inode(srcdnl
->get_inode(), inodebl
,
9400 exported_client_map
,
9401 exported_client_metadata_map
);
9403 for (const auto& bound
: bounds
) {
9404 bound
->state_clear(CDir::STATE_EXPORTBOUND
);
9407 encode(exported_client_map
, reply
->inode_export
, mds
->mdsmap
->get_up_features());
9408 encode(exported_client_metadata_map
, reply
->inode_export
);
9409 reply
->inode_export
.claim_append(inodebl
);
9410 reply
->inode_export_v
= srcdnl
->get_inode()->get_version();
9413 // remove mdr auth pin
9414 mdr
->auth_unpin(srcdnl
->get_inode());
9415 mdr
->more()->is_inode_exporter
= true;
9417 if (srcdnl
->get_inode()->is_dirty())
9418 srcdnl
->get_inode()->mark_clean();
9420 dout(10) << " exported srci " << *srcdnl
->get_inode() << dendl
;
9424 _rename_apply(mdr
, srcdn
, destdn
, straydn
);
9426 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
9429 mds
->balancer
->hit_dir(srcdn
->get_dir(), META_POP_IWR
);
9430 if (destdnl
->get_inode() && destdnl
->get_inode()->is_auth())
9431 mds
->balancer
->hit_inode(destdnl
->get_inode(), META_POP_IWR
);
9434 mdr
->reset_peer_request();
9438 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
9440 ceph_assert(mdr
->aborted
);
9441 dout(10) << " abort flag set, finishing" << dendl
;
9442 mdcache
->request_finish(mdr
);
9446 void Server::_commit_peer_rename(MDRequestRef
& mdr
, int r
,
9447 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
9449 dout(10) << "_commit_peer_rename " << *mdr
<< " r=" << r
<< dendl
;
9451 CInode
*in
= destdn
->get_linkage()->get_inode();
9453 inodeno_t migrated_stray
;
9454 if (srcdn
->is_auth() && srcdn
->get_dir()->inode
->is_stray())
9455 migrated_stray
= in
->ino();
9457 MDSContext::vec finished
;
9459 // unfreeze+singleauth inode
9460 // hmm, do i really need to delay this?
9461 if (mdr
->more()->is_inode_exporter
) {
9463 // we exported, clear out any xlocks that we moved to another MDS
9465 for (auto i
= mdr
->locks
.lower_bound(&in
->versionlock
);
9466 i
!= mdr
->locks
.end(); ) {
9467 SimpleLock
*lock
= i
->lock
;
9468 if (lock
->get_parent() != in
)
9470 // we only care about xlocks on the exported inode
9471 if (i
->is_xlock() && !lock
->is_locallock())
9472 mds
->locker
->xlock_export(i
++, mdr
.get());
9477 map
<client_t
,Capability::Import
> peer_imported
;
9478 auto bp
= mdr
->more()->inode_import
.cbegin();
9479 decode(peer_imported
, bp
);
9481 dout(10) << " finishing inode export on " << *in
<< dendl
;
9482 mdcache
->migrator
->finish_export_inode(in
, mdr
->peer_to_mds
, peer_imported
, finished
);
9483 mds
->queue_waiters(finished
); // this includes SINGLEAUTH waiters.
9486 ceph_assert(in
->is_frozen_inode());
9487 in
->unfreeze_inode(finished
);
9491 if (mdr
->more()->is_ambiguous_auth
) {
9492 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
9493 mdr
->more()->is_ambiguous_auth
= false;
9496 if (straydn
&& mdr
->more()->peer_update_journaled
) {
9497 CInode
*strayin
= straydn
->get_projected_linkage()->get_inode();
9498 if (strayin
&& !strayin
->snaprealm
)
9499 mdcache
->clear_dirty_bits_for_stray(strayin
);
9502 mds
->queue_waiters(finished
);
9505 if (mdr
->more()->peer_update_journaled
) {
9506 // write a commit to the journal
9507 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rename_commit", mdr
->reqid
,
9508 mdr
->peer_to_mds
, EPeerUpdate::OP_COMMIT
,
9509 EPeerUpdate::RENAME
);
9510 mdlog
->start_entry(le
);
9511 submit_mdlog_entry(le
, new C_MDS_CommittedPeer(this, mdr
), mdr
, __func__
);
9514 _committed_peer(mdr
);
9519 // rollback_bl may be empty if we froze the inode but had to provide an expanded
9520 // witness list from the leader, and they failed before we tried prep again.
9521 if (mdr
->more()->rollback_bl
.length()) {
9522 if (mdr
->more()->is_inode_exporter
) {
9523 dout(10) << " reversing inode export of " << *in
<< dendl
;
9526 if (mdcache
->is_ambiguous_peer_update(mdr
->reqid
, mdr
->peer_to_mds
)) {
9527 mdcache
->remove_ambiguous_peer_update(mdr
->reqid
, mdr
->peer_to_mds
);
9528 // rollback but preserve the peer request
9529 do_rename_rollback(mdr
->more()->rollback_bl
, mdr
->peer_to_mds
, mdr
, false);
9530 mdr
->more()->rollback_bl
.clear();
9532 do_rename_rollback(mdr
->more()->rollback_bl
, mdr
->peer_to_mds
, mdr
, true);
9534 dout(10) << " rollback_bl empty, not rollback back rename (leader failed after getting extra witnesses?)" << dendl
;
9536 if (mdr
->more()->is_ambiguous_auth
) {
9537 if (srcdn
->is_auth())
9538 mdr
->more()->rename_inode
->unfreeze_inode(finished
);
9540 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
9541 mdr
->more()->is_ambiguous_auth
= false;
9543 mds
->queue_waiters(finished
);
9544 mdcache
->request_finish(mdr
);
9548 if (migrated_stray
&& mds
->is_stopping())
9549 mdcache
->shutdown_export_stray_finish(migrated_stray
);
9552 static void _rollback_repair_dir(MutationRef
& mut
, CDir
*dir
,
9553 rename_rollback::drec
&r
, utime_t ctime
,
9554 bool isdir
, const nest_info_t
&rstat
)
9556 auto pf
= dir
->project_fnode(mut
);
9557 pf
->version
= dir
->pre_dirty();
9560 pf
->fragstat
.nsubdirs
+= 1;
9562 pf
->fragstat
.nfiles
+= 1;
9565 pf
->rstat
.rbytes
+= rstat
.rbytes
;
9566 pf
->rstat
.rfiles
+= rstat
.rfiles
;
9567 pf
->rstat
.rsubdirs
+= rstat
.rsubdirs
;
9568 pf
->rstat
.rsnaps
+= rstat
.rsnaps
;
9570 if (pf
->fragstat
.mtime
== ctime
) {
9571 pf
->fragstat
.mtime
= r
.dirfrag_old_mtime
;
9572 if (pf
->rstat
.rctime
== ctime
)
9573 pf
->rstat
.rctime
= r
.dirfrag_old_rctime
;
9575 mut
->add_updated_lock(&dir
->get_inode()->filelock
);
9576 mut
->add_updated_lock(&dir
->get_inode()->nestlock
);
9579 struct C_MDS_LoggedRenameRollback
: public ServerLogContext
{
9585 map
<client_t
,ref_t
<MClientSnap
>> splits
[2];
9587 C_MDS_LoggedRenameRollback(Server
*s
, MutationRef
& m
, MDRequestRef
& r
,
9588 CDentry
*sd
, version_t pv
, CDentry
*dd
, CDentry
*st
,
9589 map
<client_t
,ref_t
<MClientSnap
>> _splits
[2], bool f
) :
9590 ServerLogContext(s
, r
), mut(m
), srcdn(sd
), srcdnpv(pv
), destdn(dd
),
9591 straydn(st
), finish_mdr(f
) {
9592 splits
[0].swap(_splits
[0]);
9593 splits
[1].swap(_splits
[1]);
9595 void finish(int r
) override
{
9596 server
->_rename_rollback_finish(mut
, mdr
, srcdn
, srcdnpv
,
9597 destdn
, straydn
, splits
, finish_mdr
);
9601 void Server::do_rename_rollback(bufferlist
&rbl
, mds_rank_t leader
, MDRequestRef
& mdr
,
9604 rename_rollback rollback
;
9605 auto p
= rbl
.cbegin();
9606 decode(rollback
, p
);
9608 dout(10) << "do_rename_rollback on " << rollback
.reqid
<< dendl
;
9609 // need to finish this update before sending resolve to claim the subtree
9610 mdcache
->add_rollback(rollback
.reqid
, leader
);
9612 MutationRef
mut(new MutationImpl(nullptr, utime_t(), rollback
.reqid
));
9613 mut
->ls
= mds
->mdlog
->get_current_segment();
9615 CDentry
*srcdn
= NULL
;
9616 CDir
*srcdir
= mdcache
->get_dirfrag(rollback
.orig_src
.dirfrag
);
9618 srcdir
= mdcache
->get_dirfrag(rollback
.orig_src
.dirfrag
.ino
, rollback
.orig_src
.dname
);
9620 dout(10) << " srcdir " << *srcdir
<< dendl
;
9621 srcdn
= srcdir
->lookup(rollback
.orig_src
.dname
);
9623 dout(10) << " srcdn " << *srcdn
<< dendl
;
9624 ceph_assert(srcdn
->get_linkage()->is_null());
9626 dout(10) << " srcdn not found" << dendl
;
9628 dout(10) << " srcdir not found" << dendl
;
9630 CDentry
*destdn
= NULL
;
9631 CDir
*destdir
= mdcache
->get_dirfrag(rollback
.orig_dest
.dirfrag
);
9633 destdir
= mdcache
->get_dirfrag(rollback
.orig_dest
.dirfrag
.ino
, rollback
.orig_dest
.dname
);
9635 dout(10) << " destdir " << *destdir
<< dendl
;
9636 destdn
= destdir
->lookup(rollback
.orig_dest
.dname
);
9638 dout(10) << " destdn " << *destdn
<< dendl
;
9640 dout(10) << " destdn not found" << dendl
;
9642 dout(10) << " destdir not found" << dendl
;
9645 if (rollback
.orig_src
.ino
) {
9646 in
= mdcache
->get_inode(rollback
.orig_src
.ino
);
9647 if (in
&& in
->is_dir())
9648 ceph_assert(srcdn
&& destdn
);
9650 in
= mdcache
->get_inode(rollback
.orig_src
.remote_ino
);
9652 CDir
*straydir
= NULL
;
9653 CDentry
*straydn
= NULL
;
9654 if (rollback
.stray
.dirfrag
.ino
) {
9655 straydir
= mdcache
->get_dirfrag(rollback
.stray
.dirfrag
);
9657 dout(10) << "straydir " << *straydir
<< dendl
;
9658 straydn
= straydir
->lookup(rollback
.stray
.dname
);
9660 dout(10) << " straydn " << *straydn
<< dendl
;
9661 ceph_assert(straydn
->get_linkage()->is_primary());
9663 dout(10) << " straydn not found" << dendl
;
9665 dout(10) << "straydir not found" << dendl
;
9668 CInode
*target
= NULL
;
9669 if (rollback
.orig_dest
.ino
) {
9670 target
= mdcache
->get_inode(rollback
.orig_dest
.ino
);
9672 ceph_assert(destdn
&& straydn
);
9673 } else if (rollback
.orig_dest
.remote_ino
)
9674 target
= mdcache
->get_inode(rollback
.orig_dest
.remote_ino
);
9676 // can't use is_auth() in the resolve stage
9677 mds_rank_t whoami
= mds
->get_nodeid();
9679 ceph_assert(!destdn
|| destdn
->authority().first
!= whoami
);
9680 ceph_assert(!straydn
|| straydn
->authority().first
!= whoami
);
9682 bool force_journal_src
= false;
9683 bool force_journal_dest
= false;
9684 if (in
&& in
->is_dir() && srcdn
->authority().first
!= whoami
)
9685 force_journal_src
= _need_force_journal(in
, false);
9686 if (in
&& target
&& target
->is_dir())
9687 force_journal_dest
= _need_force_journal(in
, true);
9689 version_t srcdnpv
= 0;
9692 if (srcdn
->authority().first
== whoami
)
9693 srcdnpv
= srcdn
->pre_dirty();
9694 if (rollback
.orig_src
.ino
) {
9696 srcdn
->push_projected_linkage(in
);
9698 srcdn
->push_projected_linkage(rollback
.orig_src
.remote_ino
,
9699 rollback
.orig_src
.remote_d_type
);
9702 map
<client_t
,ref_t
<MClientSnap
>> splits
[2];
9704 const CInode::mempool_inode
*pip
= nullptr;
9707 CDir
*pdir
= in
->get_projected_parent_dir();
9708 if (pdir
->authority().first
== whoami
) {
9709 auto pi
= in
->project_inode(mut
);
9710 pi
.inode
->version
= in
->pre_dirty();
9711 if (pdir
!= srcdir
) {
9712 auto pf
= pdir
->project_fnode(mut
);
9713 pf
->version
= pdir
->pre_dirty();
9715 if (pi
.inode
->ctime
== rollback
.ctime
)
9716 pi
.inode
->ctime
= rollback
.orig_src
.old_ctime
;
9719 if (in
->get_inode()->ctime
== rollback
.ctime
) {
9720 auto _inode
= CInode::allocate_inode(*in
->get_inode());
9721 _inode
->ctime
= rollback
.orig_src
.old_ctime
;
9722 in
->reset_inode(_inode
);
9726 pip
= in
->get_projected_inode().get();
9728 if (rollback
.srci_snapbl
.length() && in
->snaprealm
) {
9730 auto p
= rollback
.srci_snapbl
.cbegin();
9731 decode(hadrealm
, p
);
9733 if (projected
&& !mds
->is_resolve()) {
9734 sr_t
*new_srnode
= new sr_t();
9735 decode(*new_srnode
, p
);
9736 in
->project_snaprealm(new_srnode
);
9738 decode(in
->snaprealm
->srnode
, p
);
9741 if (rollback
.orig_src
.ino
) {
9742 ceph_assert(srcdir
);
9743 realm
= srcdir
->get_inode()->find_snaprealm();
9745 realm
= in
->snaprealm
->parent
;
9747 if (!mds
->is_resolve())
9748 mdcache
->prepare_realm_merge(in
->snaprealm
, realm
, splits
[0]);
9750 in
->project_snaprealm(NULL
);
9752 in
->snaprealm
->merge_to(realm
);
9759 if (rollback
.orig_dest
.ino
&& target
) {
9760 destdn
->push_projected_linkage(target
);
9761 } else if (rollback
.orig_dest
.remote_ino
) {
9762 destdn
->push_projected_linkage(rollback
.orig_dest
.remote_ino
,
9763 rollback
.orig_dest
.remote_d_type
);
9765 // the dentry will be trimmed soon, it's ok to have wrong linkage
9766 if (rollback
.orig_dest
.ino
)
9767 ceph_assert(mds
->is_resolve());
9768 destdn
->push_projected_linkage();
9773 straydn
->push_projected_linkage();
9777 CInode::inode_ptr ti
;
9778 CDir
*pdir
= target
->get_projected_parent_dir();
9779 if (pdir
->authority().first
== whoami
) {
9780 auto pi
= target
->project_inode(mut
);
9781 pi
.inode
->version
= target
->pre_dirty();
9782 if (pdir
!= srcdir
) {
9783 auto pf
= pdir
->project_fnode(mut
);
9784 pf
->version
= pdir
->pre_dirty();
9789 ti
= CInode::allocate_inode(*target
->get_inode());
9793 if (ti
->ctime
== rollback
.ctime
)
9794 ti
->ctime
= rollback
.orig_dest
.old_ctime
;
9795 if (MDS_INO_IS_STRAY(rollback
.orig_src
.dirfrag
.ino
)) {
9796 if (MDS_INO_IS_STRAY(rollback
.orig_dest
.dirfrag
.ino
))
9797 ceph_assert(!rollback
.orig_dest
.ino
&& !rollback
.orig_dest
.remote_ino
);
9799 ceph_assert(rollback
.orig_dest
.remote_ino
&&
9800 rollback
.orig_dest
.remote_ino
== rollback
.orig_src
.ino
);
9805 target
->reset_inode(ti
);
9807 if (rollback
.desti_snapbl
.length() && target
->snaprealm
) {
9809 auto p
= rollback
.desti_snapbl
.cbegin();
9810 decode(hadrealm
, p
);
9812 if (projected
&& !mds
->is_resolve()) {
9813 sr_t
*new_srnode
= new sr_t();
9814 decode(*new_srnode
, p
);
9815 target
->project_snaprealm(new_srnode
);
9817 decode(target
->snaprealm
->srnode
, p
);
9820 if (rollback
.orig_dest
.ino
) {
9821 ceph_assert(destdir
);
9822 realm
= destdir
->get_inode()->find_snaprealm();
9824 realm
= target
->snaprealm
->parent
;
9826 if (!mds
->is_resolve())
9827 mdcache
->prepare_realm_merge(target
->snaprealm
, realm
, splits
[1]);
9829 target
->project_snaprealm(NULL
);
9831 target
->snaprealm
->merge_to(realm
);
9836 if (srcdn
&& srcdn
->authority().first
== whoami
) {
9838 _rollback_repair_dir(mut
, srcdir
, rollback
.orig_src
, rollback
.ctime
,
9839 in
&& in
->is_dir(), pip
? pip
->accounted_rstat
: blah
);
9843 dout(0) << " srcdn back to " << *srcdn
<< dendl
;
9845 dout(0) << " srci back to " << *in
<< dendl
;
9847 dout(0) << " destdn back to " << *destdn
<< dendl
;
9849 dout(0) << " desti back to " << *target
<< dendl
;
9852 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rename_rollback", rollback
.reqid
, leader
,
9853 EPeerUpdate::OP_ROLLBACK
, EPeerUpdate::RENAME
);
9854 mdlog
->start_entry(le
);
9856 if (srcdn
&& (srcdn
->authority().first
== whoami
|| force_journal_src
)) {
9857 le
->commit
.add_dir_context(srcdir
);
9858 if (rollback
.orig_src
.ino
)
9859 le
->commit
.add_primary_dentry(srcdn
, 0, true);
9861 le
->commit
.add_remote_dentry(srcdn
, true);
9864 if (!rollback
.orig_src
.ino
&& // remote linkage
9865 in
&& in
->authority().first
== whoami
) {
9866 le
->commit
.add_dir_context(in
->get_projected_parent_dir());
9867 le
->commit
.add_primary_dentry(in
->get_projected_parent_dn(), in
, true);
9870 if (force_journal_dest
) {
9871 ceph_assert(rollback
.orig_dest
.ino
);
9872 le
->commit
.add_dir_context(destdir
);
9873 le
->commit
.add_primary_dentry(destdn
, 0, true);
9876 // peer: no need to journal straydn
9878 if (target
&& target
!= in
&& target
->authority().first
== whoami
) {
9879 ceph_assert(rollback
.orig_dest
.remote_ino
);
9880 le
->commit
.add_dir_context(target
->get_projected_parent_dir());
9881 le
->commit
.add_primary_dentry(target
->get_projected_parent_dn(), target
, true);
9884 if (in
&& in
->is_dir() && (srcdn
->authority().first
== whoami
|| force_journal_src
)) {
9885 dout(10) << " noting renamed dir ino " << in
->ino() << " in metablob" << dendl
;
9886 le
->commit
.renamed_dirino
= in
->ino();
9887 if (srcdn
->authority().first
== whoami
) {
9888 auto&& ls
= in
->get_dirfrags();
9889 for (const auto& dir
: ls
) {
9890 if (!dir
->is_auth())
9891 le
->commit
.renamed_dir_frags
.push_back(dir
->get_frag());
9893 dout(10) << " noting renamed dir open frags " << le
->commit
.renamed_dir_frags
<< dendl
;
9895 } else if (force_journal_dest
) {
9896 dout(10) << " noting rename target ino " << target
->ino() << " in metablob" << dendl
;
9897 le
->commit
.renamed_dirino
= target
->ino();
9900 if (target
&& target
->is_dir()) {
9901 ceph_assert(destdn
);
9902 mdcache
->project_subtree_rename(target
, straydir
, destdir
);
9905 if (in
&& in
->is_dir()) {
9907 mdcache
->project_subtree_rename(in
, destdir
, srcdir
);
9910 if (mdr
&& !mdr
->more()->peer_update_journaled
) {
9911 ceph_assert(le
->commit
.empty());
9912 mdlog
->cancel_entry(le
);
9914 _rename_rollback_finish(mut
, mdr
, srcdn
, srcdnpv
, destdn
, straydn
, splits
, finish_mdr
);
9916 ceph_assert(!le
->commit
.empty());
9918 mdr
->more()->peer_update_journaled
= false;
9919 MDSLogContextBase
*fin
= new C_MDS_LoggedRenameRollback(this, mut
, mdr
,
9920 srcdn
, srcdnpv
, destdn
, straydn
,
9921 splits
, finish_mdr
);
9922 submit_mdlog_entry(le
, fin
, mdr
, __func__
);
9927 void Server::_rename_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
, CDentry
*srcdn
,
9928 version_t srcdnpv
, CDentry
*destdn
, CDentry
*straydn
,
9929 map
<client_t
,ref_t
<MClientSnap
>> splits
[2], bool finish_mdr
)
9931 dout(10) << "_rename_rollback_finish " << mut
->reqid
<< dendl
;
9934 straydn
->get_dir()->unlink_inode(straydn
);
9935 straydn
->pop_projected_linkage();
9938 destdn
->get_dir()->unlink_inode(destdn
);
9939 destdn
->pop_projected_linkage();
9942 srcdn
->pop_projected_linkage();
9943 if (srcdn
->authority().first
== mds
->get_nodeid()) {
9944 srcdn
->mark_dirty(srcdnpv
, mut
->ls
);
9945 if (srcdn
->get_linkage()->is_primary())
9946 srcdn
->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH
);
9952 if (srcdn
&& srcdn
->get_linkage()->is_primary()) {
9953 CInode
*in
= srcdn
->get_linkage()->get_inode();
9954 if (in
&& in
->is_dir()) {
9955 ceph_assert(destdn
);
9956 mdcache
->adjust_subtree_after_rename(in
, destdn
->get_dir(), true);
9961 CInode
*oldin
= destdn
->get_linkage()->get_inode();
9962 // update subtree map?
9963 if (oldin
&& oldin
->is_dir()) {
9964 ceph_assert(straydn
);
9965 mdcache
->adjust_subtree_after_rename(oldin
, straydn
->get_dir(), true);
9969 if (mds
->is_resolve()) {
9972 root
= mdcache
->get_subtree_root(straydn
->get_dir());
9974 root
= mdcache
->get_subtree_root(destdn
->get_dir());
9976 mdcache
->try_trim_non_auth_subtree(root
);
9978 mdcache
->send_snaps(splits
[1]);
9979 mdcache
->send_snaps(splits
[0]);
9983 MDSContext::vec finished
;
9984 if (mdr
->more()->is_ambiguous_auth
) {
9985 if (srcdn
->is_auth())
9986 mdr
->more()->rename_inode
->unfreeze_inode(finished
);
9988 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
9989 mdr
->more()->is_ambiguous_auth
= false;
9991 mds
->queue_waiters(finished
);
9992 if (finish_mdr
|| mdr
->aborted
)
9993 mdcache
->request_finish(mdr
);
9995 mdr
->more()->peer_rolling_back
= false;
9998 mdcache
->finish_rollback(mut
->reqid
, mdr
);
10003 void Server::handle_peer_rename_prep_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &ack
)
10005 dout(10) << "handle_peer_rename_prep_ack " << *mdr
10006 << " witnessed by " << ack
->get_source()
10007 << " " << *ack
<< dendl
;
10008 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
10011 mdr
->more()->peers
.insert(from
);
10012 if (mdr
->more()->srcdn_auth_mds
== from
&&
10013 mdr
->more()->is_remote_frozen_authpin
&&
10014 !mdr
->more()->is_ambiguous_auth
) {
10015 mdr
->set_ambiguous_auth(mdr
->more()->rename_inode
);
10018 // witnessed? or add extra witnesses?
10019 ceph_assert(mdr
->more()->witnessed
.count(from
) == 0);
10020 if (ack
->is_interrupted()) {
10021 dout(10) << " peer request interrupted, noop" << dendl
;
10022 } else if (ack
->witnesses
.empty()) {
10023 mdr
->more()->witnessed
.insert(from
);
10024 if (!ack
->is_not_journaled())
10025 mdr
->more()->has_journaled_peers
= true;
10027 dout(10) << " extra witnesses (srcdn replicas) are " << ack
->witnesses
<< dendl
;
10028 mdr
->more()->extra_witnesses
= ack
->witnesses
;
10029 mdr
->more()->extra_witnesses
.erase(mds
->get_nodeid()); // not me!
10033 if (ack
->inode_export
.length()) {
10034 dout(10) << " got srci import" << dendl
;
10035 mdr
->more()->inode_import
.share(ack
->inode_export
);
10036 mdr
->more()->inode_import_v
= ack
->inode_export_v
;
10039 // remove from waiting list
10040 ceph_assert(mdr
->more()->waiting_on_peer
.count(from
));
10041 mdr
->more()->waiting_on_peer
.erase(from
);
10043 if (mdr
->more()->waiting_on_peer
.empty())
10044 dispatch_client_request(mdr
); // go again!
10046 dout(10) << "still waiting on peers " << mdr
->more()->waiting_on_peer
<< dendl
;
/*
 * Handle a peer's ack to a rename NOTIFY.  Once all notify acks are in,
 * resume the stalled peer request (if any).
 */
void Server::handle_peer_rename_notify_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
{
  dout(10) << "handle_peer_rename_notify_ack " << *mdr << " from mds."
	   << ack->get_source() << dendl;
  ceph_assert(mdr->is_peer());
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  if (mdr->more()->waiting_on_peer.count(from)) {
    mdr->more()->waiting_on_peer.erase(from);

    if (mdr->more()->waiting_on_peer.empty()) {
      // all acks received; continue the peer request if one is pending
      if (mdr->peer_request)
	dispatch_peer_request(mdr);
    } else
      dout(10) << " still waiting for rename notify acks from "
	       << mdr->more()->waiting_on_peer << dendl;
  }
}
/*
 * Callback fired once client sessions have been flushed for a peer
 * rename.  MDS_RANK_NONE is used as the sentinel "waiting on session
 * flush" entry in waiting_on_peer; clear it and resume the peer request
 * when nothing else is outstanding.
 */
void Server::_peer_rename_sessions_flushed(MDRequestRef& mdr)
{
  dout(10) << "_peer_rename_sessions_flushed " << *mdr << dendl;

  if (mdr->more()->waiting_on_peer.count(MDS_RANK_NONE)) {
    mdr->more()->waiting_on_peer.erase(MDS_RANK_NONE);

    if (mdr->more()->waiting_on_peer.empty()) {
      if (mdr->peer_request)
	dispatch_peer_request(mdr);
    } else
      dout(10) << " still waiting for rename notify acks from "
	       << mdr->more()->waiting_on_peer << dendl;
  }
}
/* This function takes responsibility for the passed mdr*/
/*
 * List the snapshots of a directory (readdir-style reply).  Encodes up
 * to max_entries snapshot dentries (bounded by max_bytes) into the
 * reply's extra bufferlist, resuming after the snapid named by path2
 * when the client is paginating.
 */
void Server::handle_client_lssnap(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;

  // traverse to path
  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!diri)
    return;

  if (!diri->is_dir()) {
    respond_to_request(mdr, -CEPHFS_ENOTDIR);
    return;
  }
  dout(10) << "lssnap on " << *diri << dendl;

  // lock snap
  if (!mds->locker->try_rdlock_snap_layout(diri, mdr))
    return;

  if (!check_access(mdr, diri, MAY_READ))
    return;

  SnapRealm *realm = diri->find_snaprealm();
  map<snapid_t,const SnapInfo*> infomap;
  realm->get_snap_info(infomap, diri->get_oldest_snap());

  unsigned max_entries = req->head.args.readdir.max_entries;
  if (!max_entries)
    max_entries = infomap.size();
  int max_bytes = req->head.args.readdir.max_bytes;
  if (!max_bytes)
    // make sure at least one item can be encoded
    max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;

  // path2 carries the last snapshot name the client saw (pagination)
  __u64 last_snapid = 0;
  string offset_str = req->get_path2();
  if (!offset_str.empty())
    last_snapid = realm->resolve_snapname(offset_str, diri->ino());

  // encode an empty dirstat header for the synthetic .snap dir
  bufferlist dirbl;
  static DirStat empty;
  CDir::encode_dirstat(dirbl, mdr->session->info, empty);

  // reserve room for the trailing num (u32) and flags (u16 = 2 x u8)
  max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;

  __u32 num = 0;
  bufferlist dnbl;
  auto p = infomap.upper_bound(last_snapid);
  for (; p != infomap.end() && num < max_entries; ++p) {
    dout(10) << p->first << " -> " << *p->second << dendl;

    // snapshots of this dir use the plain name; snapshots inherited
    // from an ancestor realm use the long "_name_ino" form
    string snap_name;
    if (p->second->ino == diri->ino())
      snap_name = p->second->name;
    else
      snap_name = p->second->get_long_name();

    unsigned start_len = dnbl.length();
    if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
      break;

    encode(snap_name, dnbl);
    // snapshots never change: hand out an infinite lease
    LeaseStat e(CEPH_LEASE_VALID, -1, 0);
    mds->locker->encode_lease(dnbl, mdr->session->info, e);
    dout(20) << "encode_infinite_lease" << dendl;

    int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
    if (r < 0) {
      // out of space: roll back the partially-encoded entry
      bufferlist keep;
      keep.substr_of(dnbl, 0, start_len);
      dnbl.swap(keep);
      break;
    }
    ++num;
  }

  encode(num, dirbl);
  __u16 flags = 0;
  if (p == infomap.end()) {
    flags = CEPH_READDIR_FRAG_END;
    if (last_snapid == 0)
      flags |= CEPH_READDIR_FRAG_COMPLETE;
  }
  encode(flags, dirbl);
  dirbl.claim_append(dnbl);

  mdr->reply_extra_bl = dirbl;
  mdr->tracei = diri;
  respond_to_request(mdr, 0);
}
// Log-completion context for mksnap: once the EUpdate is safely
// journaled, finish the snapshot creation.
struct C_MDS_mksnap_finish : public ServerLogContext {
  CInode *diri;    // directory being snapshotted
  SnapInfo info;   // copy of the new snapshot's metadata
  C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
    ServerLogContext(s, r), diri(di), info(i) {}
  void finish(int r) override {
    server->_mksnap_finish(mdr, diri, info);
  }
};
/* This function takes responsibility for the passed mdr*/
/*
 * Create a snapshot of a directory.  Validates permissions / limits,
 * allocates a snapid through the snaptable (prepare_create, retried via
 * C_MDS_RetryRequest until the stid arrives), then journals the inode
 * and snaprealm changes; C_MDS_mksnap_finish completes the request.
 */
void Server::handle_client_mksnap(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;
  // make sure we have as new a map as the client
  if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
    mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }
  if (!mds->mdsmap->allows_snaps()) {
    // you can't make snapshots until you set an option right now
    respond_to_request(mdr, -CEPHFS_EPERM);
    return;
  }

  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!diri)
    return;

  // dir only
  if (!diri->is_dir()) {
    respond_to_request(mdr, -CEPHFS_ENOTDIR);
    return;
  }
  if (diri->is_system() && !diri->is_root()) {
    // no snaps in system dirs (root is ok)
    respond_to_request(mdr, -CEPHFS_EPERM);
    return;
  }

  std::string_view snapname = req->get_filepath().last_dentry();

  // snapshot creation is restricted to a configurable uid range
  if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
    dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
    respond_to_request(mdr, -CEPHFS_EPERM);
    return;
  }

  dout(10) << "mksnap " << snapname << " on " << *diri << dendl;

  // lock snap
  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;
    lov.add_xlock(&diri->snaplock);
    if (!mds->locker->acquire_locks(mdr, lov))
      return;

    if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
      if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
	return;
    }
    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  }

  if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
    return;

  // inside a subvolume, snapshots may only be taken at the subvolume root
  if (inodeno_t subvol_ino = diri->find_snaprealm()->get_subvolume_ino();
      (subvol_ino && subvol_ino != diri->ino())) {
    respond_to_request(mdr, -CEPHFS_EPERM);
    return;
  }

  // check if we can create any more snapshots
  // we don't allow any more if we are already at or beyond the limit
  if (diri->snaprealm &&
      diri->snaprealm->get_snaps().size() >= max_snaps_per_dir) {
    respond_to_request(mdr, -CEPHFS_EMLINK);
    return;
  }

  // make sure name is unique
  if (diri->snaprealm &&
      diri->snaprealm->exists(snapname)) {
    respond_to_request(mdr, -CEPHFS_EEXIST);
    return;
  }
  // names starting with '_' are reserved (long-form ancestor snaps)
  if (snapname.length() == 0 ||
      snapname[0] == '_') {
    respond_to_request(mdr, -CEPHFS_EINVAL);
    return;
  }

  // allocate a snapid
  if (!mdr->more()->stid) {
    // prepare the snaptable transaction; retry this request when it lands
    mds->snapclient->prepare_create(diri->ino(), snapname,
				    mdr->get_mds_stamp(),
				    &mdr->more()->stid, &mdr->more()->snapidbl,
				    new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }

  version_t stid = mdr->more()->stid;
  snapid_t snapid;
  auto p = mdr->more()->snapidbl.cbegin();
  decode(snapid, p);
  dout(10) << " stid " << stid << " snapid " << snapid << dendl;

  ceph_assert(mds->snapclient->get_cached_version() >= stid);

  SnapPayload payload;
  if (req->get_data().length()) {
    try {
      auto iter = req->get_data().cbegin();
      decode(payload, iter);
    } catch (const ceph::buffer::error &e) {
      // backward compat -- client sends xattr bufferlist. however,
      // that is not used anywhere -- so (log and) ignore.
      dout(20) << ": no metadata in payload (old client?)" << dendl;
    }
  }

  // journal
  SnapInfo info;
  info.ino = diri->ino();
  info.snapid = snapid;
  info.name = snapname;
  info.stamp = mdr->get_op_stamp();
  info.metadata = payload.metadata;

  auto pi = diri->project_inode(mdr, false, true);
  pi.inode->ctime = info.stamp;
  if (info.stamp > pi.inode->rstat.rctime)
    pi.inode->rstat.rctime = info.stamp;
  pi.inode->rstat.rsnaps++;
  pi.inode->version = diri->pre_dirty();

  // project the snaprealm
  auto &newsnap = *pi.snapnode;
  newsnap.created = snapid;
  auto em = newsnap.snaps.emplace(std::piecewise_construct, std::forward_as_tuple(snapid), std::forward_as_tuple(info));
  if (!em.second)
    em.first->second = info;
  newsnap.seq = snapid;
  newsnap.last_created = snapid;

  // journal the inode changes
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "mksnap");
  mdlog->start_entry(le);

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  // journal the snaprealm changes
  submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
		     mdr, __func__);
  mdlog->flush();
}
/*
 * Journal-safe completion of mksnap: apply the projected changes,
 * commit the snaptable transaction, notify other MDSs and clients of
 * the new/updated realm, and reply to the client.
 */
void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
{
  dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;

  // SPLIT if the dir had no realm of its own yet, CREATE otherwise
  int op = (diri->snaprealm? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);

  mdr->apply();

  mds->snapclient->commit(mdr->more()->stid, mdr->ls);

  // create snap
  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // notify other mds
  mdcache->send_snap_update(diri, mdr->more()->stid, op);

  mdcache->do_realm_invalidate_and_update_notify(diri, op);

  // yay
  mdr->in[0] = diri;
  mdr->snapid = info.snapid;
  mdr->tracei = diri;
  respond_to_request(mdr, 0);
}
// Log-completion context for rmsnap: once the EUpdate is safely
// journaled, finish the snapshot removal.
struct C_MDS_rmsnap_finish : public ServerLogContext {
  CInode *diri;     // directory whose snapshot is being removed
  snapid_t snapid;  // id of the snapshot being removed
  C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
    ServerLogContext(s, r), diri(di), snapid(sn) {}
  void finish(int r) override {
    server->_rmsnap_finish(mdr, diri, snapid);
  }
};
/* This function takes responsibility for the passed mdr*/
/*
 * Remove a snapshot from a directory.  Validates the name and caller,
 * runs a snaptable prepare_destroy (retried until the stid arrives),
 * then journals the inode and snaprealm changes; C_MDS_rmsnap_finish
 * completes the request.
 */
void Server::handle_client_rmsnap(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;

  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!diri)
    return;

  if (!diri->is_dir()) {
    respond_to_request(mdr, -CEPHFS_ENOTDIR);
    return;
  }

  std::string_view snapname = req->get_filepath().last_dentry();

  // snapshot removal is restricted to a configurable uid range
  if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
    dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
    respond_to_request(mdr, -CEPHFS_EPERM);
    return;
  }

  dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;

  // does snap exist?
  if (snapname.length() == 0 || snapname[0] == '_') {
    respond_to_request(mdr, -CEPHFS_EINVAL);   // can't prune a parent snap, currently.
    return;
  }
  if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
    respond_to_request(mdr, -CEPHFS_ENOENT);
    return;
  }
  snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
  dout(10) << " snapname " << snapname << " is " << snapid << dendl;

  // lock snap
  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;
    lov.add_xlock(&diri->snaplock);
    if (!mds->locker->acquire_locks(mdr, lov))
      return;
    if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
      if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
	return;
    }
    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  }

  if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
    return;

  // prepare the snaptable transaction; retry this request when it lands
  if (!mdr->more()->stid) {
    mds->snapclient->prepare_destroy(diri->ino(), snapid,
				     &mdr->more()->stid, &mdr->more()->snapidbl,
				     new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }
  version_t stid = mdr->more()->stid;
  auto p = mdr->more()->snapidbl.cbegin();
  snapid_t seq;
  decode(seq, p);
  dout(10) << " stid is " << stid << ", seq is " << seq << dendl;

  ceph_assert(mds->snapclient->get_cached_version() >= stid);

  // journal
  auto pi = diri->project_inode(mdr, false, true);
  pi.inode->version = diri->pre_dirty();
  pi.inode->ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
    pi.inode->rstat.rctime = mdr->get_op_stamp();
  pi.inode->rstat.rsnaps--;

  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "rmsnap");
  mdlog->start_entry(le);

  // project the snaprealm
  auto &newnode = *pi.snapnode;
  newnode.snaps.erase(snapid);
  newnode.seq = seq;
  newnode.last_destroyed = seq;

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
		     mdr, __func__);
  mdlog->flush();
}
/*
 * Journal-safe completion of rmsnap: apply the projected changes,
 * commit the snaptable transaction, notify peers/clients with
 * CEPH_SNAP_OP_DESTROY, reply, and purge now-stale snapshotted data.
 */
void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
{
  dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
  snapid_t stid = mdr->more()->stid;
  auto p = mdr->more()->snapidbl.cbegin();
  snapid_t seq;
  decode(seq, p);

  mdr->apply();

  mds->snapclient->commit(stid, mdr->ls);

  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // notify other mds
  mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_DESTROY);

  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);

  // yay
  mdr->in[0] = diri;
  respond_to_request(mdr, 0);

  // purge snapshot data
  diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
}
// Log-completion context for renamesnap: once the EUpdate is safely
// journaled, finish the snapshot rename.
struct C_MDS_renamesnap_finish : public ServerLogContext {
  CInode *diri;     // directory whose snapshot is being renamed
  snapid_t snapid;  // id of the snapshot being renamed
  C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
    ServerLogContext(s, r), diri(di), snapid(sn) {}
  void finish(int r) override {
    server->_renamesnap_finish(mdr, diri, snapid);
  }
};
/* This function takes responsibility for the passed mdr*/
/*
 * Rename a snapshot of a directory (path = new name, path2 = old name;
 * both must reference the same inode).  Validates both names, runs a
 * snaptable prepare_update (retried until the stid arrives), then
 * journals the inode and snaprealm changes; C_MDS_renamesnap_finish
 * completes the request.
 */
void Server::handle_client_renamesnap(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;
  if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
    respond_to_request(mdr, -CEPHFS_EINVAL);
    return;
  }

  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!diri)
    return;

  if (!diri->is_dir()) { // dir only
    respond_to_request(mdr, -CEPHFS_ENOTDIR);
    return;
  }

  // snapshot operations are restricted to a configurable uid range
  if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
      mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
    respond_to_request(mdr, -CEPHFS_EPERM);
    return;
  }

  std::string_view dstname = req->get_filepath().last_dentry();
  std::string_view srcname = req->get_filepath2().last_dentry();
  dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;

  if (srcname.length() == 0 || srcname[0] == '_') {
    respond_to_request(mdr, -CEPHFS_EINVAL);   // can't rename a parent snap.
    return;
  }
  if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
    respond_to_request(mdr, -CEPHFS_ENOENT);
    return;
  }
  if (dstname.length() == 0 || dstname[0] == '_') {
    respond_to_request(mdr, -CEPHFS_EINVAL);
    return;
  }
  if (diri->snaprealm->exists(dstname)) {
    respond_to_request(mdr, -CEPHFS_EEXIST);
    return;
  }

  snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
  dout(10) << " snapname " << srcname << " is " << snapid << dendl;

  // lock snap
  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;
    lov.add_xlock(&diri->snaplock);
    if (!mds->locker->acquire_locks(mdr, lov))
      return;
    if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
      if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
	return;
    }
    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  }

  if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
    return;

  // prepare the snaptable transaction; retry this request when it lands
  if (!mdr->more()->stid) {
    mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
				    &mdr->more()->stid,
				    new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }

  version_t stid = mdr->more()->stid;
  dout(10) << " stid is " << stid << dendl;

  ceph_assert(mds->snapclient->get_cached_version() >= stid);

  // journal
  auto pi = diri->project_inode(mdr, false, true);
  pi.inode->ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
    pi.inode->rstat.rctime = mdr->get_op_stamp();
  pi.inode->version = diri->pre_dirty();

  // project the snaprealm
  auto &newsnap = *pi.snapnode;
  auto it = newsnap.snaps.find(snapid);
  ceph_assert(it != newsnap.snaps.end());
  it->second.name = dstname;

  // journal the inode changes
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "renamesnap");
  mdlog->start_entry(le);

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  // journal the snaprealm changes
  submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
		     mdr, __func__);
  mdlog->flush();
}
/*
 * Journal-safe completion of renamesnap: apply the projected changes,
 * commit the snaptable transaction, notify peers/clients with
 * CEPH_SNAP_OP_UPDATE, and reply to the client.
 */
void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
{
  dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;

  mdr->apply();

  mds->snapclient->commit(mdr->more()->stid, mdr->ls);

  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // notify other mds
  mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_UPDATE);

  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE);

  // yay
  mdr->in[0] = diri;
  mdr->tracei = diri;
  mdr->snapid = snapid;
  respond_to_request(mdr, 0);
}
10644 * Return true if server is in state RECONNECT and this
10645 * client has not yet reconnected.
10647 bool Server::waiting_for_reconnect(client_t c
) const
10649 return client_reconnect_gather
.count(c
) > 0;
// Dump the set of clients we are still waiting on to reconnect into
// the given Formatter, as a "reconnect_status" object section.
void Server::dump_reconnect_status(Formatter *f) const
{
  f->open_object_section("reconnect_status");
  f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
  f->close_section();
}