1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include <boost/lexical_cast.hpp>
16 #include "include/ceph_assert.h" // lexical_cast includes system assert.h
18 #include <boost/config/warning_disable.hpp>
19 #include <boost/fusion/include/std_pair.hpp>
20 #include <boost/range/adaptor/reversed.hpp>
28 #include "MDBalancer.h"
30 #include "SnapClient.h"
32 #include "MetricsHandler.h"
33 #include "cephfs_features.h"
35 #include "msg/Messenger.h"
37 #include "osdc/Objecter.h"
39 #include "events/EUpdate.h"
40 #include "events/EPeerUpdate.h"
41 #include "events/ESession.h"
42 #include "events/EOpen.h"
43 #include "events/ECommitted.h"
44 #include "events/EPurged.h"
46 #include "include/stringify.h"
47 #include "include/filepath.h"
48 #include "common/errno.h"
49 #include "common/Timer.h"
50 #include "common/perf_counters.h"
51 #include "include/compat.h"
52 #include "osd/OSDMap.h"
59 #include <string_view>
62 #include "common/config.h"
64 #include "msg/Message.h"
66 #define dout_context g_ceph_context
67 #define dout_subsys ceph_subsys_mds
69 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
// ServerContext: an MDSContext bound to the owning Server.
// NOTE(review): extraction is lossy here — the member declaration
// (presumably `Server *server;`), the get_mds() body, and the closing
// braces were dropped from this view; confirm against the full file.
73 class ServerContext
: public MDSContext
{
// Resolves the MDSRank through the owning Server (body missing from this view).
76 MDSRank
*get_mds() override
// Constructor pins the owning Server; a null Server is a programming error.
82 explicit ServerContext(Server
*s
) : server(s
) {
83 ceph_assert(server
!= NULL
);
// Batch_Getattr_Lookup: a BatchOp that coalesces concurrent getattr/lookup
// requests behind one "front" request (mdr); the batched followers are
// answered from the front request's trace when it completes.
// NOTE(review): extraction is lossy — access specifiers, the `server`
// member, several guard statements and closing braces are missing from
// this view; confirm against the full file.
87 class Batch_Getattr_Lookup
: public BatchOp
{
// The front request currently driving the batch.
90 ceph::ref_t
<MDRequestImpl
> mdr
;
// Follower requests waiting on the front request's result.
91 std::vector
<ceph::ref_t
<MDRequestImpl
>> batch_reqs
;
94 Batch_Getattr_Lookup(Server
* s
, const ceph::ref_t
<MDRequestImpl
>& r
)
// Register this batch on the dentry (lookup) or inode (getattr) so that
// later requests on the same target can join it.
96 if (mdr
->client_request
->get_op() == CEPH_MDS_OP_LOOKUP
)
97 mdr
->batch_op_map
= &mdr
->dn
[0].back()->batch_ops
;
99 mdr
->batch_op_map
= &mdr
->in
[0]->batch_ops
;
101 void add_request(const ceph::ref_t
<MDRequestImpl
>& r
) override
{
102 batch_reqs
.push_back(r
);
// Promote a queued follower to become the new front request
// (statements around the transfer are missing from this view).
104 ceph::ref_t
<MDRequestImpl
> find_new_head() override
{
105 while (!batch_reqs
.empty()) {
106 auto r
= std::move(batch_reqs
.back());
107 batch_reqs
.pop_back();
// Hand the batch registration over to the new head.
111 r
->batch_op_map
= mdr
->batch_op_map
;
112 mdr
->batch_op_map
= nullptr;
// Forward the front request and all followers to rank t.
118 void _forward(mds_rank_t t
) override
{
119 MDCache
* mdcache
= server
->mdcache
;
120 mdcache
->mds
->forward_message_mds(mdr
, t
);
121 mdr
->set_mds_stamp(ceph_clock_now());
122 for (auto& m
: batch_reqs
) {
124 mdcache
->request_forward(m
, t
);
// Reply to every follower with the front request's trace, then reply
// to the front request itself.
128 void _respond(int r
) override
{
129 mdr
->set_mds_stamp(ceph_clock_now());
130 for (auto& m
: batch_reqs
) {
132 m
->tracei
= mdr
->tracei
;
133 m
->tracedn
= mdr
->tracedn
;
134 server
->respond_to_request(m
, r
);
138 server
->reply_client_request(mdr
, make_message
<MClientReply
>(*mdr
->client_request
, r
));
140 void print(std::ostream
& o
) const override
{
141 o
<< "[batch front=" << *mdr
<< "]";
// ServerLogContext: journal-completion context bound to the owning Server
// and (optionally) the MDRequest whose update was journaled.
// NOTE(review): extraction is lossy — member declarations, the get_mds()
// body and closing braces are missing from this view.
145 class ServerLogContext
: public MDSLogContextBase
{
148 MDSRank
*get_mds() override
// Record the journal-commit event on the request before finish() runs.
154 void pre_finish(int r
) override
{
156 mdr
->mark_event("journal_committed: ");
159 explicit ServerLogContext(Server
*s
) : server(s
) {
160 ceph_assert(server
!= NULL
);
162 explicit ServerLogContext(Server
*s
, MDRequestRef
& r
) : server(s
), mdr(r
) {
163 ceph_assert(server
!= NULL
);
167 void Server::create_logger()
169 PerfCountersBuilder
plb(g_ceph_context
, "mds_server", l_mdss_first
, l_mdss_last
);
171 plb
.add_u64_counter(l_mdss_handle_client_request
, "handle_client_request",
172 "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING
);
173 plb
.add_u64_counter(l_mdss_handle_peer_request
, "handle_peer_request",
174 "Peer requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING
);
175 plb
.add_u64_counter(l_mdss_handle_client_session
,
176 "handle_client_session", "Client session messages", "hcs",
177 PerfCountersBuilder::PRIO_INTERESTING
);
178 plb
.add_u64_counter(l_mdss_cap_revoke_eviction
, "cap_revoke_eviction",
179 "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING
);
180 plb
.add_u64_counter(l_mdss_cap_acquisition_throttle
,
181 "cap_acquisition_throttle", "Cap acquisition throttle counter", "cat",
182 PerfCountersBuilder::PRIO_INTERESTING
);
184 // fop latencies are useful
185 plb
.set_prio_default(PerfCountersBuilder::PRIO_USEFUL
);
186 plb
.add_time_avg(l_mdss_req_lookuphash_latency
, "req_lookuphash_latency",
187 "Request type lookup hash of inode latency");
188 plb
.add_time_avg(l_mdss_req_lookupino_latency
, "req_lookupino_latency",
189 "Request type lookup inode latency");
190 plb
.add_time_avg(l_mdss_req_lookupparent_latency
, "req_lookupparent_latency",
191 "Request type lookup parent latency");
192 plb
.add_time_avg(l_mdss_req_lookupname_latency
, "req_lookupname_latency",
193 "Request type lookup name latency");
194 plb
.add_time_avg(l_mdss_req_lookup_latency
, "req_lookup_latency",
195 "Request type lookup latency");
196 plb
.add_time_avg(l_mdss_req_lookupsnap_latency
, "req_lookupsnap_latency",
197 "Request type lookup snapshot latency");
198 plb
.add_time_avg(l_mdss_req_getattr_latency
, "req_getattr_latency",
199 "Request type get attribute latency");
200 plb
.add_time_avg(l_mdss_req_setattr_latency
, "req_setattr_latency",
201 "Request type set attribute latency");
202 plb
.add_time_avg(l_mdss_req_setlayout_latency
, "req_setlayout_latency",
203 "Request type set file layout latency");
204 plb
.add_time_avg(l_mdss_req_setdirlayout_latency
, "req_setdirlayout_latency",
205 "Request type set directory layout latency");
206 plb
.add_time_avg(l_mdss_req_getvxattr_latency
, "req_getvxattr_latency",
207 "Request type get virtual extended attribute latency");
208 plb
.add_time_avg(l_mdss_req_setxattr_latency
, "req_setxattr_latency",
209 "Request type set extended attribute latency");
210 plb
.add_time_avg(l_mdss_req_rmxattr_latency
, "req_rmxattr_latency",
211 "Request type remove extended attribute latency");
212 plb
.add_time_avg(l_mdss_req_readdir_latency
, "req_readdir_latency",
213 "Request type read directory latency");
214 plb
.add_time_avg(l_mdss_req_setfilelock_latency
, "req_setfilelock_latency",
215 "Request type set file lock latency");
216 plb
.add_time_avg(l_mdss_req_getfilelock_latency
, "req_getfilelock_latency",
217 "Request type get file lock latency");
218 plb
.add_time_avg(l_mdss_req_create_latency
, "req_create_latency",
219 "Request type create latency");
220 plb
.add_time_avg(l_mdss_req_open_latency
, "req_open_latency",
221 "Request type open latency");
222 plb
.add_time_avg(l_mdss_req_mknod_latency
, "req_mknod_latency",
223 "Request type make node latency");
224 plb
.add_time_avg(l_mdss_req_link_latency
, "req_link_latency",
225 "Request type link latency");
226 plb
.add_time_avg(l_mdss_req_unlink_latency
, "req_unlink_latency",
227 "Request type unlink latency");
228 plb
.add_time_avg(l_mdss_req_rmdir_latency
, "req_rmdir_latency",
229 "Request type remove directory latency");
230 plb
.add_time_avg(l_mdss_req_rename_latency
, "req_rename_latency",
231 "Request type rename latency");
232 plb
.add_time_avg(l_mdss_req_mkdir_latency
, "req_mkdir_latency",
233 "Request type make directory latency");
234 plb
.add_time_avg(l_mdss_req_symlink_latency
, "req_symlink_latency",
235 "Request type symbolic link latency");
236 plb
.add_time_avg(l_mdss_req_lssnap_latency
, "req_lssnap_latency",
237 "Request type list snapshot latency");
238 plb
.add_time_avg(l_mdss_req_mksnap_latency
, "req_mksnap_latency",
239 "Request type make snapshot latency");
240 plb
.add_time_avg(l_mdss_req_rmsnap_latency
, "req_rmsnap_latency",
241 "Request type remove snapshot latency");
242 plb
.add_time_avg(l_mdss_req_renamesnap_latency
, "req_renamesnap_latency",
243 "Request type rename snapshot latency");
244 plb
.add_time_avg(l_mdss_req_snapdiff_latency
, "req_snapdiff_latency",
245 "Request type snapshot difference latency");
247 plb
.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY
);
248 plb
.add_u64_counter(l_mdss_dispatch_client_request
, "dispatch_client_request",
249 "Client requests dispatched");
250 plb
.add_u64_counter(l_mdss_dispatch_peer_request
, "dispatch_server_request",
251 "Server requests dispatched");
253 logger
= plb
.create_perf_counters();
254 g_ceph_context
->get_perfcounters_collection()->add(logger
);
257 Server::Server(MDSRank
*m
, MetricsHandler
*metrics_handler
) :
259 mdcache(mds
->mdcache
), mdlog(mds
->mdlog
),
260 inject_rename_corrupt_dentry_first(g_conf().get_val
<double>("mds_inject_rename_corrupt_dentry_first")),
261 recall_throttle(g_conf().get_val
<double>("mds_recall_max_decay_rate")),
262 metrics_handler(metrics_handler
)
264 forward_all_requests_to_auth
= g_conf().get_val
<bool>("mds_forward_all_requests_to_auth");
265 replay_unsafe_with_closed_session
= g_conf().get_val
<bool>("mds_replay_unsafe_with_closed_session");
266 cap_revoke_eviction_timeout
= g_conf().get_val
<double>("mds_cap_revoke_eviction_timeout");
267 max_snaps_per_dir
= g_conf().get_val
<uint64_t>("mds_max_snaps_per_dir");
268 delegate_inos_pct
= g_conf().get_val
<uint64_t>("mds_client_delegate_inos_pct");
269 max_caps_per_client
= g_conf().get_val
<uint64_t>("mds_max_caps_per_client");
270 cap_acquisition_throttle
= g_conf().get_val
<uint64_t>("mds_session_cap_acquisition_throttle");
271 max_caps_throttle_ratio
= g_conf().get_val
<double>("mds_session_max_caps_throttle_ratio");
272 caps_throttle_retry_request_timeout
= g_conf().get_val
<double>("mds_cap_acquisition_throttle_retry_request_timeout");
273 dir_max_entries
= g_conf().get_val
<uint64_t>("mds_dir_max_entries");
274 bal_fragment_size_max
= g_conf().get_val
<int64_t>("mds_bal_fragment_size_max");
275 supported_features
= feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED
);
276 supported_metric_spec
= feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL
);
// Server::dispatch: top-level entry for client/peer messages. Reconnects
// are handled immediately; client requests that arrive before the rank is
// active are queued for replay or deferred until active; everything else
// fans out to the per-type handlers below.
// NOTE(review): extraction is lossy — several `return`/`break` statements,
// braces, and parts of conditions are missing from this view.
279 void Server::dispatch(const cref_t
<Message
> &m
)
281 switch (m
->get_type()) {
282 case CEPH_MSG_CLIENT_RECONNECT
:
283 handle_client_reconnect(ref_cast
<MClientReconnect
>(m
));
288 *In reconnect phase, client sent unsafe requests to mds before reconnect msg. Setting sessionclosed_isok will handle scenario like this:
290 1. In reconnect phase, client sent unsafe requests to mds.
291 2. It reached reconnect timeout. All sessions without sending reconnect msg in time, some of which may have sent unsafe requests, are marked as closed.
292 (Another situation is #31668, which will deny all client reconnect msg to speed up reboot).
293 3. So these unsafe requests from sessions that did not send the reconnect msg in time, or were denied, can be handled in the clientreplay phase.
296 bool sessionclosed_isok
= replay_unsafe_with_closed_session
;
298 // handle_peer_request()/handle_client_session() will wait if necessary
299 if (m
->get_type() == CEPH_MSG_CLIENT_REQUEST
&& !mds
->is_active()) {
300 const auto &req
= ref_cast
<MClientRequest
>(m
);
301 if (mds
->is_reconnect() || mds
->get_want_state() == CEPH_MDS_STATE_RECONNECT
) {
302 Session
*session
= mds
->get_session(req
);
303 if (!session
|| (!session
->is_open() && !sessionclosed_isok
)) {
304 dout(5) << "session is closed, dropping " << req
->get_reqid() << dendl
;
307 bool queue_replay
= false;
308 if (req
->is_replay() || req
->is_async()) {
309 dout(3) << "queuing replayed op" << dendl
;
// NOTE(review): the first half of this condition (original lines 310-311)
// is missing from this view.
312 !session
->have_completed_request(req
->get_reqid().tid
, nullptr)) {
313 inodeno_t
ino(req
->head
.ino
);
314 mdcache
->add_replay_ino_alloc(ino
);
315 if (replay_unsafe_with_closed_session
&&
316 session
->free_prealloc_inos
.contains(ino
)) {
317 // don't purge inodes that will be created by later replay
318 session
->free_prealloc_inos
.erase(ino
);
319 session
->delegated_inos
.insert(ino
);
322 } else if (req
->get_retry_attempt()) {
323 // process completed request in clientreplay stage. The completed request
324 // might have created a new file/directory. This guarantees MDS sends a reply
325 // to the client before another request modifies the new file/directory.
326 if (session
->have_completed_request(req
->get_reqid().tid
, NULL
)) {
327 dout(3) << "queuing completed op" << dendl
;
330 // this request was created before the cap reconnect message, drop any embedded
332 req
->releases
.clear();
335 req
->mark_queued_for_replay();
336 mds
->enqueue_replay(new C_MDS_RetryMessage(mds
, m
));
// Defer non-replay requests until the rank is active, except while
// stopping or when the request is already queued for clientreplay.
341 bool wait_for_active
= true;
342 if (mds
->is_stopping()) {
343 wait_for_active
= false;
344 } else if (mds
->is_clientreplay()) {
345 if (req
->is_queued_for_replay()) {
346 wait_for_active
= false;
349 if (wait_for_active
) {
350 dout(3) << "not active yet, waiting" << dendl
;
351 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
// Fan out by message type; unknown types abort.
356 switch (m
->get_type()) {
357 case CEPH_MSG_CLIENT_SESSION
:
358 handle_client_session(ref_cast
<MClientSession
>(m
));
360 case CEPH_MSG_CLIENT_REQUEST
:
361 handle_client_request(ref_cast
<MClientRequest
>(m
));
363 case CEPH_MSG_CLIENT_RECLAIM
:
364 handle_client_reclaim(ref_cast
<MClientReclaim
>(m
));
366 case MSG_MDS_PEER_REQUEST
:
367 handle_peer_request(ref_cast
<MMDSPeerRequest
>(m
));
370 derr
<< "Server unknown message " << m
->get_type() << " from peer type " << m
->get_connection()->get_peer_type() << dendl
;
371 ceph_abort_msg("server unknown message " + to_string(m
->get_type()) + " from peer type " + to_string(m
->get_connection()->get_peer_type()));
377 // ----------------------------------------------------------
378 // SESSION management
// C_MDS_session_finish: journal context for a session open/close event.
// When the ESession entry commits it calls Server::_session_logged() with
// the inos to free/purge recorded at submit time.
// NOTE(review): extraction is lossy — the scalar member declarations
// (session/state_seq/open/cmapv/inotablev/fin) and the tail of finish()
// are missing from this view.
380 class C_MDS_session_finish
: public ServerLogContext
{
// Preallocated inos to release back to the inotable on close.
385 interval_set
<inodeno_t
> inos_to_free
;
// Inos whose objects must be purged (session closed with unflushed allocs).
387 interval_set
<inodeno_t
> inos_to_purge
;
388 LogSegment
*ls
= nullptr;
// Simple form: open/close with no ino bookkeeping.
391 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
, Context
*fin_
= nullptr) :
392 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
), inotablev(0), fin(fin_
) { }
// Full form: carries the ino sets and inotable version to apply on commit.
393 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
,
394 const interval_set
<inodeno_t
>& to_free
, version_t iv
,
395 const interval_set
<inodeno_t
>& to_purge
, LogSegment
*_ls
, Context
*fin_
= nullptr) :
396 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
),
397 inos_to_free(to_free
), inotablev(iv
), inos_to_purge(to_purge
), ls(_ls
), fin(fin_
) {}
398 void finish(int r
) override
{
400 server
->_session_logged(session
, state_seq
, open
, cmapv
, inos_to_free
, inotablev
, inos_to_purge
, ls
);
// Find the session whose client metadata "uuid" matches `uuid`.
// Scans the whole session map; sessions without a uuid entry or with a
// different uuid are skipped.
// NOTE(review): extraction is lossy — the branch that assigns `session`
// on first match and the final return are missing from this view; the
// remaining asserts check consistency of reclaiming_from links between
// duplicate matches.
407 Session
* Server::find_session_by_uuid(std::string_view uuid
)
409 Session
* session
= nullptr;
410 for (auto& it
: mds
->sessionmap
.get_sessions()) {
411 auto& metadata
= it
.second
->info
.client_metadata
;
413 auto p
= metadata
.find("uuid");
414 if (p
== metadata
.end() || p
->second
!= uuid
)
419 } else if (!session
->reclaiming_from
) {
420 ceph_assert(it
.second
->reclaiming_from
== session
);
423 ceph_assert(session
->reclaiming_from
== it
.second
);
// Handle a client's CEPH_RECLAIM request: validate it, locate the old
// session by uuid, verify both sessions authenticate as the same entity,
// link the new session to the one being reclaimed, and (for RESET) finish
// the reclaim immediately.
// NOTE(review): extraction is lossy — `return` statements after the error
// replies and the "target not found / target == session" check are
// missing from this view.
429 void Server::reclaim_session(Session
*session
, const cref_t
<MClientReclaim
> &m
)
431 if (!session
->is_open() && !session
->is_stale()) {
432 dout(10) << "session not open, dropping this req" << dendl
;
436 auto reply
= make_message
<MClientReclaimReply
>(0);
437 if (m
->get_uuid().empty()) {
438 dout(10) << __func__
<< " invalid message (no uuid)" << dendl
;
439 reply
->set_result(-CEPHFS_EINVAL
);
440 mds
->send_message_client(reply
, session
);
444 unsigned flags
= m
->get_flags();
445 if (flags
!= CEPH_RECLAIM_RESET
) { // currently only support reset
446 dout(10) << __func__
<< " unsupported flags" << dendl
;
447 reply
->set_result(-CEPHFS_EINVAL
);
448 mds
->send_message_client(reply
, session
);
452 Session
* target
= find_session_by_uuid(m
->get_uuid());
// Only the same authenticated entity may reclaim its own old session.
454 if (session
->info
.auth_name
!= target
->info
.auth_name
) {
455 dout(10) << __func__
<< " session auth_name " << session
->info
.auth_name
456 << " != target auth_name " << target
->info
.auth_name
<< dendl
;
457 reply
->set_result(-CEPHFS_EPERM
);
458 mds
->send_message_client(reply
, session
);
461 ceph_assert(!target
->reclaiming_from
);
462 ceph_assert(!session
->reclaiming_from
);
463 session
->reclaiming_from
= target
;
464 reply
->set_addrs(entity_addrvec_t(target
->info
.inst
.addr
));
467 if (flags
& CEPH_RECLAIM_RESET
) {
468 finish_reclaim_session(session
, reply
);
469 } else ceph_assert(0); /* no other flags are handled at this time */
// Complete a session reclaim: tear down the old (reclaimed-from) session,
// then send the reply — via a deferred LambdaContext when the old session
// must first be killed/evicted, or directly when there is nothing to kill.
// NOTE(review): extraction is lossy — the declaration of `send_reply`, the
// surrounding if/else structure, and several braces/returns are missing
// from this view.
472 void Server::finish_reclaim_session(Session
*session
, const ref_t
<MClientReclaimReply
> &reply
)
474 Session
*target
= session
->reclaiming_from
;
476 session
->reclaiming_from
= nullptr;
// Capture the client id (not the Session*) so the callback can re-look up
// the session under mds_lock after the eviction completes.
480 int64_t session_id
= session
->get_client().v
;
481 send_reply
= new LambdaContext([this, session_id
, reply
](int r
) {
482 ceph_assert(ceph_mutex_is_locked_by_me(mds
->mds_lock
));
483 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(session_id
));
487 auto epoch
= mds
->objecter
->with_osdmap([](const OSDMap
&map
){ return map
.get_epoch(); });
488 reply
->set_epoch(epoch
);
489 mds
->send_message_client(reply
, session
);
492 send_reply
= nullptr;
// If the old client is already blocklisted (or blocklist-on-evict is
// disabled) just kill the session; otherwise evict it via the OSD map.
495 bool blocklisted
= mds
->objecter
->with_osdmap([target
](const OSDMap
&map
) {
496 return map
.is_blocklisted(target
->info
.inst
.addr
);
499 if (blocklisted
|| !g_conf()->mds_session_blocklist_on_evict
) {
500 kill_session(target
, send_reply
);
502 CachedStackStringStream css
;
503 mds
->evict_client(target
->get_client().v
, false, true, *css
, send_reply
);
506 mds
->send_message_client(reply
, session
);
// Entry point for MClientReclaim messages. Validates the session and fs
// capability, defers until clientreplay, then either finishes a reclaim
// (FLAG_FINISH, which must be the only flag set) or starts one.
// NOTE(review): extraction is lossy — several `return` statements and the
// `if (!session)` guard around the "sessionless" log line are missing
// from this view.
510 void Server::handle_client_reclaim(const cref_t
<MClientReclaim
> &m
)
512 Session
*session
= mds
->get_session(m
);
513 uint32_t flags
= m
->get_flags();
514 dout(3) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
515 ceph_assert(m
->is_a_client()); // should _not_ come from an mds!
518 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
522 std::string_view fs_name
= mds
->mdsmap
->get_fs_name();
523 if (!fs_name
.empty() && !session
->fs_name_capable(fs_name
, MAY_READ
)) {
524 dout(0) << " dropping message not allowed for this fs_name: " << *m
<< dendl
;
528 if (mds
->get_state() < MDSMap::STATE_CLIENTREPLAY
) {
529 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
533 if (flags
& MClientReclaim::FLAG_FINISH
) {
// FLAG_FINISH must not be combined with any other flag.
534 if (flags
^ MClientReclaim::FLAG_FINISH
) {
535 dout(0) << __func__
<< " client specified FLAG_FINISH with other flags."
536 " Other flags:" << flags
<< dendl
;
537 auto reply
= make_message
<MClientReclaimReply
>(0);
538 reply
->set_result(-CEPHFS_EINVAL
);
539 mds
->send_message_client(reply
, session
);
542 finish_reclaim_session(session
);
544 reclaim_session(session
, m
);
// Entry point for MClientSession messages (open / renewcaps / close /
// flushmsg-ack / flush-mdlog). Performs session and fs-capability
// validation, then dispatches on the session op. The OPEN path validates
// client metadata (required features, claimed root, uuid) and journals an
// ESession entry via C_MDS_session_finish.
// NOTE(review): extraction is lossy — many `return`/`break` statements,
// braces, and some mid-statement lines are missing from this view.
548 void Server::handle_client_session(const cref_t
<MClientSession
> &m
)
551 Session
*session
= mds
->get_session(m
);
553 dout(3) << "handle_client_session " << *m
<< " from " << m
->get_source() << dendl
;
554 ceph_assert(m
->is_a_client()); // should _not_ come from an mds!
// No session at all: reject outright.
557 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
558 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
559 reply
->metadata
["error_string"] = "sessionless";
560 mds
->send_message(reply
, m
->get_connection());
// Reject clients whose caps do not cover this filesystem's name.
564 std::string_view fs_name
= mds
->mdsmap
->get_fs_name();
565 if (!fs_name
.empty() && !session
->fs_name_capable(fs_name
, MAY_READ
)) {
566 dout(0) << " dropping message not allowed for this fs_name: " << *m
<< dendl
;
567 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
568 reply
->metadata
["error_string"] = "client doesn't have caps for FS \"" +
569 std::string(fs_name
) + "\"";
570 mds
->send_message(std::move(reply
), m
->get_connection());
// Gate ops on MDS state: renewcaps always; close needs active; the rest
// need at least clientreplay.
574 if (m
->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS
) {
575 // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
576 } else if (m
->get_op() == CEPH_SESSION_REQUEST_CLOSE
) {
577 // close requests need to be handled when mds is active
578 if (mds
->get_state() < MDSMap::STATE_ACTIVE
) {
579 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
583 if (mds
->get_state() < MDSMap::STATE_CLIENTREPLAY
) {
584 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
590 logger
->inc(l_mdss_handle_client_session
);
593 switch (m
->get_op()) {
594 case CEPH_SESSION_REQUEST_OPEN
:
// Refuse new sessions when the refuse_client_session fs flag is set.
595 if(mds
->mdsmap
->test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION
)) {
596 dout(0) << "new sessions are not permitted, enable again via"
597 "`ceph fs set <fs_name> refuse_client_session false`" << dendl
;
598 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
599 reply
->metadata
["error_string"] = "new sessions are not permitted,"
600 " enable again via `ceph fs set"
601 " <fs_name> refuse_client_session false`";
602 mds
->send_message(reply
, m
->get_connection());
// Duplicate OPEN on a live session: re-ack for feature-aware clients,
// otherwise drop.
605 if (session
->is_opening() ||
606 session
->is_open() ||
607 session
->is_stale() ||
608 session
->is_killing() ||
609 terminating_sessions
) {
610 if (m
->supported_features
.test(CEPHFS_FEATURE_NOTIFY_SESSION_STATE
)) {
611 if (session
->is_open() && !mds
->is_stopping()) {
612 dout(10) << "currently already opened" << dendl
;
614 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_OPEN
,
615 session
->get_push_seq());
616 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
))
617 reply
->supported_features
= supported_features
;
618 mds
->send_message_client(reply
, session
);
619 if (mdcache
->is_readonly()) {
620 auto m
= make_message
<MClientSession
>(CEPH_SESSION_FORCE_RO
);
621 mds
->send_message_client(m
, session
);
625 dout(10) << "currently " << session
->get_state_name()
626 << ", dropping this req" << dendl
;
629 ceph_assert(session
->is_closed() || session
->is_closing());
631 if (mds
->is_stopping()) {
632 dout(10) << "mds is stopping, dropping open req" << dendl
;
637 auto& addr
= session
->info
.inst
.addr
;
638 session
->set_client_metadata(client_metadata_t(m
->metadata
, m
->supported_features
, m
->metric_spec
));
639 auto& client_metadata
= session
->info
.client_metadata
;
// Helper: log one line summarizing the session-open outcome and timings.
641 auto log_session_status
= [this, m
, session
](std::string_view status
, std::string_view err
) {
642 auto now
= ceph_clock_now();
643 auto throttle_elapsed
= m
->get_recv_complete_stamp() - m
->get_throttle_stamp();
644 auto elapsed
= now
- m
->get_recv_stamp();
645 CachedStackStringStream css
;
646 *css
<< "New client session:"
647 << " addr=\"" << session
->info
.inst
.addr
<< "\""
648 << ",elapsed=" << elapsed
649 << ",throttled=" << throttle_elapsed
650 << ",status=\"" << status
<< "\"";
652 *css
<< ",error=\"" << err
<< "\"";
654 const auto& metadata
= session
->info
.client_metadata
;
655 if (auto it
= metadata
.find("root"); it
!= metadata
.end()) {
656 *css
<< ",root=\"" << it
->second
<< "\"";
658 dout(2) << css
->strv() << dendl
;
// Helper: send a REJECT (with error_string for mimic+ clients) and log it.
661 auto send_reject_message
= [this, &session
, &log_session_status
](std::string_view err_str
, unsigned flags
=0) {
662 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
, 0, flags
);
663 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
))
664 m
->metadata
["error_string"] = err_str
;
665 mds
->send_message_client(m
, session
);
666 log_session_status("REJECTED", err_str
);
669 bool blocklisted
= mds
->objecter
->with_osdmap(
670 [&addr
](const OSDMap
&osd_map
) -> bool {
671 return osd_map
.is_blocklisted(addr
);
675 dout(10) << "rejecting blocklisted client " << addr
<< dendl
;
676 // This goes on the wire and the "blacklisted" substring is
677 // depended upon by the kernel client for detecting whether it
678 // has been blocklisted. If mounted with recover_session=clean
679 // (since 5.4), it tries to automatically recover itself from
682 flags
|= MClientSession::SESSION_BLOCKLISTED
;
683 send_reject_message("blocklisted (blacklisted)", flags
);
688 if (client_metadata
.features
.empty())
689 infer_supported_features(session
, client_metadata
);
691 dout(20) << __func__
<< " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl
;
692 dout(20) << " features: '" << client_metadata
.features
<< "'" << dendl
;
693 dout(20) << " metric specification: [" << client_metadata
.metric_spec
<< "]" << dendl
;
694 for (const auto& p
: client_metadata
) {
695 dout(20) << " " << p
.first
<< ": " << p
.second
<< dendl
;
// Reject clients missing any feature this MDS requires.
698 feature_bitset_t missing_features
= required_client_features
;
699 missing_features
-= client_metadata
.features
;
700 if (!missing_features
.empty()) {
701 CachedStackStringStream css
;
702 *css
<< "missing required features '" << missing_features
<< "'";
703 send_reject_message(css
->strv());
704 mds
->clog
->warn() << "client session (" << session
->info
.inst
705 << ") lacks required features " << missing_features
706 << "; client supports " << client_metadata
.features
;
711 // Special case for the 'root' metadata path; validate that the claimed
712 // root is actually within the caps of the session
713 if (auto it
= client_metadata
.find("root"); it
!= client_metadata
.end()) {
714 auto claimed_root
= it
->second
;
715 CachedStackStringStream css
;
717 // claimed_root has a leading "/" which we strip before passing
719 if (claimed_root
.empty() || claimed_root
[0] != '/') {
// NOTE(review): "invalue" below looks like a typo for "invalid" in this
// client-visible message — confirm against upstream before changing, as
// the string goes over the wire.
721 *css
<< "invalue root '" << claimed_root
<< "'";
722 } else if (!session
->auth_caps
.path_capable(claimed_root
.substr(1))) {
724 *css
<< "non-allowable root '" << claimed_root
<< "'";
728 // Tell the client we're rejecting their open
729 send_reject_message(css
->strv());
730 mds
->clog
->warn() << "client session with " << css
->strv()
731 << " denied (" << session
->info
.inst
<< ")";
// A uuid may only belong to one live session at a time.
737 if (auto it
= client_metadata
.find("uuid"); it
!= client_metadata
.end()) {
738 if (find_session_by_uuid(it
->second
)) {
739 send_reject_message("duplicated session uuid");
740 mds
->clog
->warn() << "client session with duplicated session uuid '"
741 << it
->second
<< "' denied (" << session
->info
.inst
<< ")";
747 if (session
->is_closed()) {
748 mds
->sessionmap
.add_session(session
);
// Project the sessionmap change and journal the open; the reply is sent
// from _session_logged() once the ESession entry commits.
751 pv
= mds
->sessionmap
.mark_projected(session
);
752 sseq
= mds
->sessionmap
.set_state(session
, Session::STATE_OPENING
);
753 mds
->sessionmap
.touch_session(session
);
754 auto fin
= new LambdaContext([log_session_status
= std::move(log_session_status
)](int r
){
756 log_session_status("ACCEPTED", "");
758 mdlog
->start_submit_entry(new ESession(m
->get_source_inst(), true, pv
, client_metadata
),
759 new C_MDS_session_finish(this, session
, sseq
, true, pv
, fin
));
764 case CEPH_SESSION_REQUEST_RENEWCAPS
:
765 if (session
->is_open() || session
->is_stale()) {
766 mds
->sessionmap
.touch_session(session
);
767 if (session
->is_stale()) {
768 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
769 mds
->locker
->resume_stale_caps(session
);
770 mds
->sessionmap
.touch_session(session
);
772 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_RENEWCAPS
, m
->get_seq());
773 mds
->send_message_client(reply
, session
);
775 dout(10) << "ignoring renewcaps on non open|stale session (" << session
->get_state_name() << ")" << dendl
;
779 case CEPH_SESSION_REQUEST_CLOSE
:
781 if (session
->is_closed() ||
782 session
->is_closing() ||
783 session
->is_killing()) {
784 dout(10) << "already closed|closing|killing, dropping this req" << dendl
;
787 if (session
->is_importing()) {
788 dout(10) << "ignoring close req on importing session" << dendl
;
791 ceph_assert(session
->is_open() ||
792 session
->is_stale() ||
793 session
->is_opening());
794 if (m
->get_seq() < session
->get_push_seq()) {
795 dout(10) << "old push seq " << m
->get_seq() << " < " << session
->get_push_seq()
796 << ", dropping" << dendl
;
799 // We are getting a seq that is higher than expected.
800 // Handle the same as any other seqn error.
802 if (m
->get_seq() != session
->get_push_seq()) {
803 dout(0) << "old push seq " << m
->get_seq() << " != " << session
->get_push_seq()
804 << ", BUGGY!" << dendl
;
805 mds
->clog
->warn() << "incorrect push seq " << m
->get_seq() << " != "
806 << session
->get_push_seq() << ", dropping" << " from client : " << session
->get_human_name();
809 journal_close_session(session
, Session::STATE_CLOSING
, NULL
);
813 case CEPH_SESSION_FLUSHMSG_ACK
:
814 finish_flush_session(session
, m
->get_seq());
817 case CEPH_SESSION_REQUEST_FLUSH_MDLOG
:
818 if (mds
->is_active())
// Unknown session op: reject, then evict and blocklist the client.
823 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
824 mds
->send_message_client(m
, session
);
825 derr
<< "Server received unknown message " << m
->get_type() << ", closing session and blocklisting the client " << session
->get_client() << dendl
;
826 CachedStackStringStream css
;
827 mds
->evict_client(session
->get_client().v
, false, true, *css
, nullptr);
831 void Server::flush_session(Session
*session
, MDSGatherBuilder
& gather
) {
832 if (!session
->is_open() ||
833 !session
->get_connection() ||
834 !session
->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER
)) {
838 version_t seq
= session
->wait_for_flush(gather
.new_sub());
839 mds
->send_message_client(
840 make_message
<MClientSession
>(CEPH_SESSION_FLUSHMSG
, seq
), session
);
843 void Server::flush_client_sessions(set
<client_t
>& client_set
, MDSGatherBuilder
& gather
)
845 for (const auto& client
: client_set
) {
846 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(client
.v
));
847 ceph_assert(session
);
848 flush_session(session
, gather
);
852 void Server::finish_flush_session(Session
*session
, version_t seq
)
854 MDSContext::vec finished
;
855 session
->finish_flush(seq
, finished
);
856 mds
->queue_waiters(finished
);
859 void Server::_session_logged(Session
*session
, uint64_t state_seq
, bool open
, version_t pv
,
860 const interval_set
<inodeno_t
>& inos_to_free
, version_t piv
,
861 const interval_set
<inodeno_t
>& inos_to_purge
, LogSegment
*ls
)
863 dout(10) << "_session_logged " << session
->info
.inst
864 << " state_seq " << state_seq
865 << " " << (open
? "open":"close") << " " << pv
866 << " inos_to_free " << inos_to_free
<< " inotablev " << piv
867 << " inos_to_purge " << inos_to_purge
<< dendl
;
870 if (inos_to_purge
.size()){
872 session
->info
.prealloc_inos
.subtract(inos_to_purge
);
873 ls
->purging_inodes
.insert(inos_to_purge
);
874 if (mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping())
875 mdcache
->purge_inodes(inos_to_purge
, ls
);
878 if (inos_to_free
.size()) {
880 ceph_assert(session
->is_closing() || session
->is_killing() ||
881 session
->is_opening()); // re-open closing session
882 session
->info
.prealloc_inos
.subtract(inos_to_free
);
883 mds
->inotable
->apply_release_ids(inos_to_free
);
884 ceph_assert(mds
->inotable
->get_version() == piv
);
886 session
->free_prealloc_inos
= session
->info
.prealloc_inos
;
887 session
->delegated_inos
.clear();
890 mds
->sessionmap
.mark_dirty(session
);
893 if (session
->get_state_seq() != state_seq
) {
894 dout(10) << " journaled state_seq " << state_seq
<< " != current " << session
->get_state_seq()
895 << ", noop" << dendl
;
896 // close must have been canceled (by an import?), or any number of other things..
898 ceph_assert(session
->is_opening());
899 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
900 mds
->sessionmap
.touch_session(session
);
901 metrics_handler
->add_session(session
);
902 ceph_assert(session
->get_connection());
903 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_OPEN
);
904 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
)) {
905 reply
->supported_features
= supported_features
;
906 reply
->metric_spec
= supported_metric_spec
;
908 mds
->send_message_client(reply
, session
);
909 if (mdcache
->is_readonly()) {
910 auto m
= make_message
<MClientSession
>(CEPH_SESSION_FORCE_RO
);
911 mds
->send_message_client(m
, session
);
913 } else if (session
->is_closing() ||
914 session
->is_killing()) {
915 // kill any lingering capabilities, leases, requests
916 bool killing
= session
->is_killing();
917 while (!session
->caps
.empty()) {
918 Capability
*cap
= session
->caps
.front();
919 CInode
*in
= cap
->get_inode();
920 dout(20) << " killing capability " << ccap_string(cap
->issued()) << " on " << *in
<< dendl
;
921 mds
->locker
->remove_client_cap(in
, cap
, killing
);
923 while (!session
->leases
.empty()) {
924 ClientLease
*r
= session
->leases
.front();
925 CDentry
*dn
= static_cast<CDentry
*>(r
->parent
);
926 dout(20) << " killing client lease of " << *dn
<< dendl
;
927 dn
->remove_client_lease(r
, mds
->locker
);
929 if (client_reconnect_gather
.erase(session
->info
.get_client())) {
930 dout(20) << " removing client from reconnect set" << dendl
;
931 if (client_reconnect_gather
.empty()) {
932 dout(7) << " client " << session
->info
.inst
<< " was last reconnect, finishing" << dendl
;
933 reconnect_gather_finish();
936 if (client_reclaim_gather
.erase(session
->info
.get_client())) {
937 dout(20) << " removing client from reclaim set" << dendl
;
938 if (client_reclaim_gather
.empty()) {
939 dout(7) << " client " << session
->info
.inst
<< " was last reclaimed, finishing" << dendl
;
940 mds
->maybe_clientreplay_done();
944 if (session
->is_closing()) {
945 // mark con disposable. if there is a fault, we will get a
946 // reset and clean it up. if the client hasn't received the
947 // CLOSE message yet, they will reconnect and get an
948 // ms_handle_remote_reset() and realize they had in fact closed.
949 // do this *before* sending the message to avoid a possible
951 if (session
->get_connection()) {
952 // Conditional because terminate_sessions will indiscrimately
953 // put sessions in CLOSING whether they ever had a conn or not.
954 session
->get_connection()->mark_disposable();
958 mds
->send_message_client(make_message
<MClientSession
>(CEPH_SESSION_CLOSE
), session
);
959 mds
->sessionmap
.set_state(session
, Session::STATE_CLOSED
);
961 metrics_handler
->remove_session(session
);
962 mds
->sessionmap
.remove_session(session
);
963 } else if (session
->is_killing()) {
964 // destroy session, close connection
965 if (session
->get_connection()) {
966 session
->get_connection()->mark_down();
967 mds
->sessionmap
.set_state(session
, Session::STATE_CLOSED
);
968 session
->set_connection(nullptr);
970 metrics_handler
->remove_session(session
);
971 mds
->sessionmap
.remove_session(session
);
981 * Inject sessions from some source other than actual connections.
984 * - sessions inferred from journal replay
985 * - sessions learned from other MDSs during rejoin
986 * - sessions learned from other MDSs during dir/caps migration
987 * - sessions learned from other MDSs during a cross-MDS rename
989 version_t
Server::prepare_force_open_sessions(map
<client_t
,entity_inst_t
>& cm
,
990 map
<client_t
,client_metadata_t
>& cmm
,
991 map
<client_t
, pair
<Session
*,uint64_t> >& smap
)
993 version_t pv
= mds
->sessionmap
.get_projected();
995 dout(10) << "prepare_force_open_sessions " << pv
996 << " on " << cm
.size() << " clients"
999 mds
->objecter
->with_osdmap(
1000 [this, &cm
, &cmm
](const OSDMap
&osd_map
) {
1001 for (auto p
= cm
.begin(); p
!= cm
.end(); ) {
1002 if (osd_map
.is_blocklisted(p
->second
.addr
)) {
1003 dout(10) << " ignoring blocklisted client." << p
->first
1004 << " (" << p
->second
.addr
<< ")" << dendl
;
1005 cmm
.erase(p
->first
);
1013 for (map
<client_t
,entity_inst_t
>::iterator p
= cm
.begin(); p
!= cm
.end(); ++p
) {
1014 Session
*session
= mds
->sessionmap
.get_or_add_session(p
->second
);
1015 pv
= mds
->sessionmap
.mark_projected(session
);
1017 if (session
->is_closed() ||
1018 session
->is_closing() ||
1019 session
->is_killing()) {
1020 sseq
= mds
->sessionmap
.set_state(session
, Session::STATE_OPENING
);
1021 auto q
= cmm
.find(p
->first
);
1023 session
->info
.client_metadata
.merge(q
->second
);
1025 ceph_assert(session
->is_open() ||
1026 session
->is_opening() ||
1027 session
->is_stale());
1030 smap
[p
->first
] = make_pair(session
, sseq
);
1031 session
->inc_importing();
1036 void Server::finish_force_open_sessions(const map
<client_t
,pair
<Session
*,uint64_t> >& smap
,
1040 * FIXME: need to carefully consider the race conditions between a
1041 * client trying to close a session and an MDS doing an import
1042 * trying to force open a session...
1044 dout(10) << "finish_force_open_sessions on " << smap
.size() << " clients,"
1045 << " initial v " << mds
->sessionmap
.get_version() << dendl
;
1047 for (auto &it
: smap
) {
1048 Session
*session
= it
.second
.first
;
1049 uint64_t sseq
= it
.second
.second
;
1051 if (session
->get_state_seq() != sseq
) {
1052 dout(10) << "force_open_sessions skipping changed " << session
->info
.inst
<< dendl
;
1054 dout(10) << "force_open_sessions opened " << session
->info
.inst
<< dendl
;
1055 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
1056 mds
->sessionmap
.touch_session(session
);
1057 metrics_handler
->add_session(session
);
1059 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_OPEN
);
1060 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
)) {
1061 reply
->supported_features
= supported_features
;
1062 reply
->metric_spec
= supported_metric_spec
;
1064 mds
->send_message_client(reply
, session
);
1066 if (mdcache
->is_readonly())
1067 mds
->send_message_client(make_message
<MClientSession
>(CEPH_SESSION_FORCE_RO
), session
);
1070 dout(10) << "force_open_sessions skipping already-open " << session
->info
.inst
<< dendl
;
1071 ceph_assert(session
->is_open() || session
->is_stale());
1075 session
->dec_importing();
1078 mds
->sessionmap
.mark_dirty(session
);
1081 dout(10) << __func__
<< ": final v " << mds
->sessionmap
.get_version() << dendl
;
1084 class C_MDS_TerminatedSessions
: public ServerContext
{
1085 void finish(int r
) override
{
1086 server
->terminating_sessions
= false;
1089 explicit C_MDS_TerminatedSessions(Server
*s
) : ServerContext(s
) {}
1092 void Server::terminate_sessions()
1094 dout(5) << "terminating all sessions..." << dendl
;
1096 terminating_sessions
= true;
1098 // kill them off. clients will retry etc.
1099 set
<Session
*> sessions
;
1100 mds
->sessionmap
.get_client_session_set(sessions
);
1101 for (set
<Session
*>::const_iterator p
= sessions
.begin();
1102 p
!= sessions
.end();
1104 Session
*session
= *p
;
1105 if (session
->is_closing() ||
1106 session
->is_killing() ||
1107 session
->is_closed())
1109 journal_close_session(session
, Session::STATE_CLOSING
, NULL
);
1112 mdlog
->wait_for_safe(new C_MDS_TerminatedSessions(this));
1116 void Server::find_idle_sessions()
1118 auto now
= clock::now();
1119 auto last_cleared_laggy
= mds
->last_cleared_laggy();
1121 dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy
<< "s ago" << dendl
;
1124 // (caps go stale, lease die)
1125 double queue_max_age
= mds
->get_dispatch_queue_max_age(ceph_clock_now());
1126 double cutoff
= queue_max_age
+ mds
->mdsmap
->get_session_timeout();
1128 // don't kick clients if we've been laggy
1129 if (last_cleared_laggy
< cutoff
) {
1130 dout(10) << " last cleared laggy " << last_cleared_laggy
<< "s ago (< cutoff " << cutoff
1131 << "), not marking any client stale" << dendl
;
1135 std::vector
<Session
*> to_evict
;
1137 bool defer_session_stale
= g_conf().get_val
<bool>("mds_defer_session_stale");
1138 const auto sessions_p1
= mds
->sessionmap
.by_state
.find(Session::STATE_OPEN
);
1139 if (sessions_p1
!= mds
->sessionmap
.by_state
.end() && !sessions_p1
->second
->empty()) {
1140 std::vector
<Session
*> new_stale
;
1142 for (auto session
: *(sessions_p1
->second
)) {
1143 auto last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_cap_renew
).count();
1144 if (last_cap_renew_span
< cutoff
) {
1145 dout(20) << "laggiest active session is " << session
->info
.inst
1146 << " and renewed caps recently (" << last_cap_renew_span
<< "s ago)" << dendl
;
1150 if (session
->last_seen
> session
->last_cap_renew
) {
1151 last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_seen
).count();
1152 if (last_cap_renew_span
< cutoff
) {
1153 dout(20) << "laggiest active session is " << session
->info
.inst
1154 << " and renewed caps recently (" << last_cap_renew_span
<< "s ago)" << dendl
;
1159 if (last_cap_renew_span
>= mds
->mdsmap
->get_session_autoclose()) {
1160 dout(20) << "evicting session " << session
->info
.inst
<< " since autoclose "
1161 "has arrived" << dendl
;
1162 // evict session without marking it stale
1163 to_evict
.push_back(session
);
1167 if (defer_session_stale
&&
1168 !session
->is_any_flush_waiter() &&
1169 !mds
->locker
->is_revoking_any_caps_from(session
->get_client())) {
1170 dout(20) << "deferring marking session " << session
->info
.inst
<< " stale "
1171 "since it holds no caps" << dendl
;
1175 auto it
= session
->info
.client_metadata
.find("timeout");
1176 if (it
!= session
->info
.client_metadata
.end()) {
1177 unsigned timeout
= strtoul(it
->second
.c_str(), nullptr, 0);
1179 dout(10) << "skipping session " << session
->info
.inst
1180 << ", infinite timeout specified" << dendl
;
1183 double cutoff
= queue_max_age
+ timeout
;
1184 if (last_cap_renew_span
< cutoff
) {
1185 dout(10) << "skipping session " << session
->info
.inst
1186 << ", timeout (" << timeout
<< ") specified"
1187 << " and renewed caps recently (" << last_cap_renew_span
<< "s ago)" << dendl
;
1191 // do not go through stale, evict it directly.
1192 to_evict
.push_back(session
);
1194 dout(10) << "new stale session " << session
->info
.inst
1195 << " last renewed caps " << last_cap_renew_span
<< "s ago" << dendl
;
1196 new_stale
.push_back(session
);
1200 for (auto session
: new_stale
) {
1201 mds
->sessionmap
.set_state(session
, Session::STATE_STALE
);
1202 if (mds
->locker
->revoke_stale_caps(session
)) {
1203 mds
->locker
->remove_stale_leases(session
);
1204 finish_flush_session(session
, session
->get_push_seq());
1205 auto m
= make_message
<MClientSession
>(CEPH_SESSION_STALE
);
1206 mds
->send_message_client(m
, session
);
1208 to_evict
.push_back(session
);
1214 cutoff
= queue_max_age
+ mds
->mdsmap
->get_session_autoclose();
1216 // Collect a list of sessions exceeding the autoclose threshold
1217 const auto sessions_p2
= mds
->sessionmap
.by_state
.find(Session::STATE_STALE
);
1218 if (sessions_p2
!= mds
->sessionmap
.by_state
.end() && !sessions_p2
->second
->empty()) {
1219 for (auto session
: *(sessions_p2
->second
)) {
1220 ceph_assert(session
->is_stale());
1221 auto last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_cap_renew
).count();
1222 if (last_cap_renew_span
< cutoff
) {
1223 dout(20) << "oldest stale session is " << session
->info
.inst
1224 << " and recently renewed caps " << last_cap_renew_span
<< "s ago" << dendl
;
1227 to_evict
.push_back(session
);
1231 for (auto session
: to_evict
) {
1232 if (session
->is_importing()) {
1233 dout(10) << "skipping session " << session
->info
.inst
<< ", it's being imported" << dendl
;
1237 auto last_cap_renew_span
= std::chrono::duration
<double>(now
- session
->last_cap_renew
).count();
1238 mds
->clog
->warn() << "evicting unresponsive client " << *session
1239 << ", after " << last_cap_renew_span
<< " seconds";
1240 dout(10) << "autoclosing stale session " << session
->info
.inst
1241 << " last renewed caps " << last_cap_renew_span
<< "s ago" << dendl
;
1243 if (g_conf()->mds_session_blocklist_on_timeout
) {
1244 CachedStackStringStream css
;
1245 mds
->evict_client(session
->get_client().v
, false, true, *css
, nullptr);
1247 kill_session(session
, NULL
);
1252 void Server::evict_cap_revoke_non_responders() {
1253 if (!cap_revoke_eviction_timeout
) {
1257 auto&& to_evict
= mds
->locker
->get_late_revoking_clients(cap_revoke_eviction_timeout
);
1259 for (auto const &client
: to_evict
) {
1260 mds
->clog
->warn() << "client id " << client
<< " has not responded to"
1261 << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
1262 << " seconds, evicting";
1263 dout(1) << __func__
<< ": evicting cap revoke non-responder client id "
1266 CachedStackStringStream css
;
1267 bool evicted
= mds
->evict_client(client
.v
, false,
1268 g_conf()->mds_session_blocklist_on_evict
,
1270 if (evicted
&& logger
) {
1271 logger
->inc(l_mdss_cap_revoke_eviction
);
1276 void Server::handle_conf_change(const std::set
<std::string
>& changed
) {
1277 if (changed
.count("mds_forward_all_requests_to_auth")){
1278 forward_all_requests_to_auth
= g_conf().get_val
<bool>("mds_forward_all_requests_to_auth");
1280 if (changed
.count("mds_cap_revoke_eviction_timeout")) {
1281 cap_revoke_eviction_timeout
= g_conf().get_val
<double>("mds_cap_revoke_eviction_timeout");
1282 dout(20) << __func__
<< " cap revoke eviction timeout changed to "
1283 << cap_revoke_eviction_timeout
<< dendl
;
1285 if (changed
.count("mds_recall_max_decay_rate")) {
1286 recall_throttle
= DecayCounter(g_conf().get_val
<double>("mds_recall_max_decay_rate"));
1288 if (changed
.count("mds_max_snaps_per_dir")) {
1289 max_snaps_per_dir
= g_conf().get_val
<uint64_t>("mds_max_snaps_per_dir");
1290 dout(20) << __func__
<< " max snapshots per directory changed to "
1291 << max_snaps_per_dir
<< dendl
;
1293 if (changed
.count("mds_client_delegate_inos_pct")) {
1294 delegate_inos_pct
= g_conf().get_val
<uint64_t>("mds_client_delegate_inos_pct");
1296 if (changed
.count("mds_max_caps_per_client")) {
1297 max_caps_per_client
= g_conf().get_val
<uint64_t>("mds_max_caps_per_client");
1299 if (changed
.count("mds_session_cap_acquisition_throttle")) {
1300 cap_acquisition_throttle
= g_conf().get_val
<uint64_t>("mds_session_cap_acquisition_throttle");
1302 if (changed
.count("mds_session_max_caps_throttle_ratio")) {
1303 max_caps_throttle_ratio
= g_conf().get_val
<double>("mds_session_max_caps_throttle_ratio");
1305 if (changed
.count("mds_cap_acquisition_throttle_retry_request_timeout")) {
1306 caps_throttle_retry_request_timeout
= g_conf().get_val
<double>("mds_cap_acquisition_throttle_retry_request_timeout");
1308 if (changed
.count("mds_alternate_name_max")) {
1309 alternate_name_max
= g_conf().get_val
<Option::size_t>("mds_alternate_name_max");
1311 if (changed
.count("mds_fscrypt_last_block_max_size")) {
1312 fscrypt_last_block_max_size
= g_conf().get_val
<Option::size_t>("mds_fscrypt_last_block_max_size");
1314 if (changed
.count("mds_dir_max_entries")) {
1315 dir_max_entries
= g_conf().get_val
<uint64_t>("mds_dir_max_entries");
1316 dout(20) << __func__
<< " max entries per directory changed to "
1317 << dir_max_entries
<< dendl
;
1319 if (changed
.count("mds_bal_fragment_size_max")) {
1320 bal_fragment_size_max
= g_conf().get_val
<int64_t>("mds_bal_fragment_size_max");
1321 dout(20) << __func__
<< " max fragment size changed to "
1322 << bal_fragment_size_max
<< dendl
;
1324 if (changed
.count("mds_inject_rename_corrupt_dentry_first")) {
1325 inject_rename_corrupt_dentry_first
= g_conf().get_val
<double>("mds_inject_rename_corrupt_dentry_first");
1330 * XXX bump in the interface here, not using an MDSContext here
1331 * because all the callers right now happen to use a SaferCond
1333 void Server::kill_session(Session
*session
, Context
*on_safe
)
1335 ceph_assert(ceph_mutex_is_locked_by_me(mds
->mds_lock
));
1337 if ((session
->is_opening() ||
1338 session
->is_open() ||
1339 session
->is_stale()) &&
1340 !session
->is_importing()) {
1341 dout(10) << "kill_session " << session
<< dendl
;
1342 journal_close_session(session
, Session::STATE_KILLING
, on_safe
);
1344 dout(10) << "kill_session importing or already closing/killing " << session
<< dendl
;
1345 if (session
->is_closing() ||
1346 session
->is_killing()) {
1348 mdlog
->wait_for_safe(new MDSInternalContextWrapper(mds
, on_safe
));
1350 ceph_assert(session
->is_closed() ||
1351 session
->is_importing());
1353 on_safe
->complete(0);
1358 size_t Server::apply_blocklist()
1360 std::vector
<Session
*> victims
;
1361 const auto& sessions
= mds
->sessionmap
.get_sessions();
1362 mds
->objecter
->with_osdmap(
1363 [&](const OSDMap
& o
) {
1364 for (const auto& p
: sessions
) {
1365 if (!p
.first
.is_client()) {
1366 // Do not apply OSDMap blocklist to MDS daemons, we find out
1367 // about their death via MDSMap.
1370 if (o
.is_blocklisted(p
.second
->info
.inst
.addr
)) {
1371 victims
.push_back(p
.second
);
1376 for (const auto& s
: victims
) {
1377 kill_session(s
, nullptr);
1380 dout(10) << "apply_blocklist: killed " << victims
.size() << dendl
;
1382 return victims
.size();
1385 void Server::journal_close_session(Session
*session
, int state
, Context
*on_safe
)
1387 dout(10) << __func__
<< " : "
1388 << session
->info
.inst
1389 << " pending_prealloc_inos " << session
->pending_prealloc_inos
1390 << " free_prealloc_inos " << session
->free_prealloc_inos
1391 << " delegated_inos " << session
->delegated_inos
<< dendl
;
1393 uint64_t sseq
= mds
->sessionmap
.set_state(session
, state
);
1394 version_t pv
= mds
->sessionmap
.mark_projected(session
);
1397 // release alloc and pending-alloc inos for this session
1398 // and wipe out session state, in case the session close aborts for some reason
1399 interval_set
<inodeno_t
> inos_to_free
;
1400 inos_to_free
.insert(session
->pending_prealloc_inos
);
1401 inos_to_free
.insert(session
->free_prealloc_inos
);
1402 if (inos_to_free
.size()) {
1403 mds
->inotable
->project_release_ids(inos_to_free
);
1404 piv
= mds
->inotable
->get_projected_version();
1408 auto le
= new ESession(session
->info
.inst
, false, pv
, inos_to_free
, piv
, session
->delegated_inos
);
1409 auto fin
= new C_MDS_session_finish(this, session
, sseq
, false, pv
, inos_to_free
, piv
,
1410 session
->delegated_inos
, mdlog
->get_current_segment(), on_safe
);
1411 mdlog
->start_submit_entry(le
, fin
);
1414 // clean up requests, too
1415 while(!session
->requests
.empty()) {
1416 auto mdr
= MDRequestRef(*session
->requests
.begin());
1417 mdcache
->request_kill(mdr
);
1420 finish_flush_session(session
, session
->get_push_seq());
1423 void Server::reconnect_clients(MDSContext
*reconnect_done_
)
1425 reconnect_done
= reconnect_done_
;
1427 auto now
= clock::now();
1428 set
<Session
*> sessions
;
1429 mds
->sessionmap
.get_client_session_set(sessions
);
1430 for (auto session
: sessions
) {
1431 if (session
->is_open()) {
1432 client_reconnect_gather
.insert(session
->get_client());
1433 session
->set_reconnecting(true);
1434 session
->last_cap_renew
= now
;
1438 if (client_reconnect_gather
.empty()) {
1439 dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl
;
1440 reconnect_gather_finish();
1444 // clients will get the mdsmap and discover we're reconnecting via the monitor.
1446 reconnect_start
= now
;
1447 dout(1) << "reconnect_clients -- " << client_reconnect_gather
.size() << " sessions" << dendl
;
1448 mds
->sessionmap
.dump();
1451 void Server::handle_client_reconnect(const cref_t
<MClientReconnect
> &m
)
1453 dout(7) << "handle_client_reconnect " << m
->get_source()
1454 << (m
->has_more() ? " (more)" : "") << dendl
;
1455 client_t from
= m
->get_source().num();
1456 Session
*session
= mds
->get_session(m
);
1458 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
1459 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_REJECT
);
1460 reply
->metadata
["error_string"] = "sessionless";
1461 mds
->send_message(reply
, m
->get_connection());
1465 if(mds
->mdsmap
->test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION
)) {
1466 mds
->clog
->warn() << "client could not reconnect as"
1467 " file system flag refuse_client_session is set";
1468 dout(0) << "client cannot reconnect when file system flag"
1469 " refuse_client_session is set" << dendl
;
1470 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_CLOSE
);
1471 reply
->metadata
["error_string"] = "client cannot reconnect when file system flag"
1472 " refuse_client_session is set";
1473 mds
->send_message(reply
, m
->get_connection());
1477 if (!session
->is_open()) {
1478 dout(0) << " ignoring msg from not-open session" << *m
<< dendl
;
1479 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_CLOSE
);
1480 mds
->send_message(reply
, m
->get_connection());
1484 bool reconnect_all_deny
= g_conf().get_val
<bool>("mds_deny_all_reconnect");
1486 if (!mds
->is_reconnect() && mds
->get_want_state() == CEPH_MDS_STATE_RECONNECT
) {
1487 dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl
;
1488 mds
->wait_for_reconnect(new C_MDS_RetryMessage(mds
, m
));
1492 auto delay
= std::chrono::duration
<double>(clock::now() - reconnect_start
).count();
1493 dout(10) << " reconnect_start " << reconnect_start
<< " delay " << delay
<< dendl
;
1496 if (reconnect_all_deny
|| !mds
->is_reconnect() || mds
->get_want_state() != CEPH_MDS_STATE_RECONNECT
|| reconnect_evicting
) {
1497 // XXX maybe in the future we can do better than this?
1498 if (reconnect_all_deny
) {
1499 dout(1) << "mds_deny_all_reconnect was set to speed up reboot phase, ignoring reconnect, sending close" << dendl
;
1501 dout(1) << "no longer in reconnect state, ignoring reconnect, sending close" << dendl
;
1503 mds
->clog
->info() << "denied reconnect attempt (mds is "
1504 << ceph_mds_state_name(mds
->get_state())
1505 << ") from " << m
->get_source_inst()
1506 << " after " << delay
<< " (allowed interval " << g_conf()->mds_reconnect_timeout
<< ")";
1509 std::string error_str
;
1510 if (!session
->is_open()) {
1511 error_str
= "session is closed";
1512 } else if (mdcache
->is_readonly()) {
1513 error_str
= "mds is readonly";
1515 if (session
->info
.client_metadata
.features
.empty())
1516 infer_supported_features(session
, session
->info
.client_metadata
);
1518 feature_bitset_t missing_features
= required_client_features
;
1519 missing_features
-= session
->info
.client_metadata
.features
;
1520 if (!missing_features
.empty()) {
1521 CachedStackStringStream css
;
1522 *css
<< "missing required features '" << missing_features
<< "'";
1523 error_str
= css
->strv();
1527 if (!error_str
.empty()) {
1529 dout(1) << " " << error_str
<< ", ignoring reconnect, sending close" << dendl
;
1530 mds
->clog
->info() << "denied reconnect attempt from "
1531 << m
->get_source_inst() << " (" << error_str
<< ")";
1536 auto r
= make_message
<MClientSession
>(CEPH_SESSION_CLOSE
);
1537 mds
->send_message_client(r
, session
);
1538 if (session
->is_open()) {
1539 client_reconnect_denied
.insert(session
->get_client());
1544 if (!m
->has_more()) {
1545 metrics_handler
->add_session(session
);
1546 // notify client of success with an OPEN
1547 auto reply
= make_message
<MClientSession
>(CEPH_SESSION_OPEN
);
1548 if (session
->info
.has_feature(CEPHFS_FEATURE_MIMIC
)) {
1549 reply
->supported_features
= supported_features
;
1550 reply
->metric_spec
= supported_metric_spec
;
1552 mds
->send_message_client(reply
, session
);
1553 mds
->clog
->debug() << "reconnect by " << session
->info
.inst
<< " after " << delay
;
1556 session
->last_cap_renew
= clock::now();
1559 for (const auto &r
: m
->realms
) {
1560 CInode
*in
= mdcache
->get_inode(inodeno_t(r
.realm
.ino
));
1561 if (in
&& in
->state_test(CInode::STATE_PURGING
))
1564 if (in
->snaprealm
) {
1565 dout(15) << "open snaprealm (w inode) on " << *in
<< dendl
;
1567 // this can happen if we are non-auth or we rollback snaprealm
1568 dout(15) << "open snaprealm (null snaprealm) on " << *in
<< dendl
;
1570 mdcache
->add_reconnected_snaprealm(from
, inodeno_t(r
.realm
.ino
), snapid_t(r
.realm
.seq
));
1572 dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r
.realm
.ino
)
1573 << " seq " << r
.realm
.seq
<< dendl
;
1574 mdcache
->add_reconnected_snaprealm(from
, inodeno_t(r
.realm
.ino
), snapid_t(r
.realm
.seq
));
1579 for (const auto &p
: m
->caps
) {
1580 // make sure our last_cap_id is MAX over all issued caps
1581 if (p
.second
.capinfo
.cap_id
> mdcache
->last_cap_id
)
1582 mdcache
->last_cap_id
= p
.second
.capinfo
.cap_id
;
1584 CInode
*in
= mdcache
->get_inode(p
.first
);
1585 if (in
&& in
->state_test(CInode::STATE_PURGING
))
1587 if (in
&& in
->is_auth()) {
1588 // we recovered it, and it's ours. take note.
1589 dout(15) << "open cap realm " << inodeno_t(p
.second
.capinfo
.snaprealm
)
1590 << " on " << *in
<< dendl
;
1591 in
->reconnect_cap(from
, p
.second
, session
);
1592 mdcache
->add_reconnected_cap(from
, p
.first
, p
.second
);
1593 recover_filelocks(in
, p
.second
.flockbl
, m
->get_orig_source().num());
1597 if (in
&& !in
->is_auth()) {
1599 dout(10) << "non-auth " << *in
<< ", will pass off to authority" << dendl
;
1600 // add to cap export list.
1601 mdcache
->rejoin_export_caps(p
.first
, from
, p
.second
,
1602 in
->authority().first
, true);
1604 // don't know if the inode is mine
1605 dout(10) << "missing ino " << p
.first
<< ", will load later" << dendl
;
1606 mdcache
->rejoin_recovered_caps(p
.first
, from
, p
.second
, MDS_RANK_NONE
);
1610 reconnect_last_seen
= clock::now();
1612 if (!m
->has_more()) {
1613 mdcache
->rejoin_recovered_client(session
->get_client(), session
->info
.inst
);
1615 // remove from gather set
1616 client_reconnect_gather
.erase(from
);
1617 session
->set_reconnecting(false);
1618 if (client_reconnect_gather
.empty())
1619 reconnect_gather_finish();
1623 void Server::infer_supported_features(Session
*session
, client_metadata_t
& client_metadata
)
1626 auto it
= client_metadata
.find("ceph_version");
1627 if (it
!= client_metadata
.end()) {
1628 // user space client
1629 if (it
->second
.compare(0, 16, "ceph version 12.") == 0)
1630 supported
= CEPHFS_FEATURE_LUMINOUS
;
1631 else if (session
->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR
))
1632 supported
= CEPHFS_FEATURE_KRAKEN
;
1634 it
= client_metadata
.find("kernel_version");
1635 if (it
!= client_metadata
.end()) {
1637 if (session
->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING
))
1638 supported
= CEPHFS_FEATURE_LUMINOUS
;
1641 if (supported
== -1 &&
1642 session
->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2
))
1643 supported
= CEPHFS_FEATURE_JEWEL
;
1645 if (supported
>= 0) {
1646 unsigned long value
= (1UL << (supported
+ 1)) - 1;
1647 client_metadata
.features
= feature_bitset_t(value
);
1648 dout(10) << __func__
<< " got '" << client_metadata
.features
<< "'" << dendl
;
1652 void Server::update_required_client_features()
1654 required_client_features
= mds
->mdsmap
->get_required_client_features();
1655 dout(7) << "required_client_features: " << required_client_features
<< dendl
;
1657 if (mds
->get_state() >= MDSMap::STATE_RECONNECT
) {
1658 set
<Session
*> sessions
;
1659 mds
->sessionmap
.get_client_session_set(sessions
);
1660 for (auto session
: sessions
) {
1661 feature_bitset_t missing_features
= required_client_features
;
1662 missing_features
-= session
->info
.client_metadata
.features
;
1663 if (!missing_features
.empty()) {
1664 bool blocklisted
= mds
->objecter
->with_osdmap(
1665 [session
](const OSDMap
&osd_map
) -> bool {
1666 return osd_map
.is_blocklisted(session
->info
.inst
.addr
);
1671 mds
->clog
->warn() << "evicting session " << *session
<< ", missing required features '"
1672 << missing_features
<< "'";
1673 CachedStackStringStream css
;
1674 mds
->evict_client(session
->get_client().v
, false,
1675 g_conf()->mds_session_blocklist_on_evict
, *css
);
1681 void Server::reconnect_gather_finish()
1683 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects
<< " clients" << dendl
;
1684 ceph_assert(reconnect_done
);
1686 if (!mds
->snapclient
->is_synced()) {
1687 // make sure snaptable cache is populated. snaprealms will be
1688 // extensively used in rejoin stage.
1689 dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl
;
1690 mds
->snapclient
->wait_for_sync(reconnect_done
);
1692 reconnect_done
->complete(0);
1694 reconnect_done
= NULL
;
1697 void Server::reconnect_tick()
1699 bool reject_all_reconnect
= false;
1700 if (reconnect_evicting
) {
1701 dout(7) << "reconnect_tick: waiting for evictions" << dendl
;
1706 * Set mds_deny_all_reconnect to reject all the reconnect req ,
1707 * then load less meta information in rejoin phase. This will shorten reboot time.
1708 * Moreover, loading less meta increases the chance standby with less memory can failover.
1710 * Why not shorten reconnect period?
1711 * Clients may send unsafe or retry requests, which haven't been
1712 * completed before old mds stop, to new mds. These requests may
1713 * need to be processed during new mds's clientreplay phase,
1714 * see: #https://github.com/ceph/ceph/pull/29059.
1716 bool reconnect_all_deny
= g_conf().get_val
<bool>("mds_deny_all_reconnect");
1717 if (client_reconnect_gather
.empty())
1720 if (reconnect_all_deny
&& (client_reconnect_gather
== client_reconnect_denied
))
1721 reject_all_reconnect
= true;
1723 auto now
= clock::now();
1724 auto elapse1
= std::chrono::duration
<double>(now
- reconnect_start
).count();
1725 if (elapse1
< g_conf()->mds_reconnect_timeout
&& !reject_all_reconnect
)
1728 vector
<Session
*> remaining_sessions
;
1729 remaining_sessions
.reserve(client_reconnect_gather
.size());
1730 for (auto c
: client_reconnect_gather
) {
1731 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(c
.v
));
1732 ceph_assert(session
);
1733 remaining_sessions
.push_back(session
);
1734 // client re-sends cap flush messages before the reconnect message
1735 if (session
->last_seen
> reconnect_last_seen
)
1736 reconnect_last_seen
= session
->last_seen
;
1739 auto elapse2
= std::chrono::duration
<double>(now
- reconnect_last_seen
).count();
1740 if (elapse2
< g_conf()->mds_reconnect_timeout
/ 2 && !reject_all_reconnect
) {
1741 dout(7) << "reconnect_tick: last seen " << elapse2
1742 << " seconds ago, extending reconnect interval" << dendl
;
1746 dout(7) << "reconnect timed out, " << remaining_sessions
.size()
1747 << " clients have not reconnected in time" << dendl
;
1749 // If we're doing blocklist evictions, use this to wait for them before
1750 // proceeding to reconnect_gather_finish
1751 MDSGatherBuilder
gather(g_ceph_context
);
1753 for (auto session
: remaining_sessions
) {
1754 // Keep sessions that have specified timeout. These sessions will prevent
1755 // mds from going to active. MDS goes to active after they all have been
1756 // killed or reclaimed.
1757 if (session
->info
.client_metadata
.find("timeout") !=
1758 session
->info
.client_metadata
.end()) {
1759 dout(1) << "reconnect keeps " << session
->info
.inst
1760 << ", need to be reclaimed" << dendl
;
1761 client_reclaim_gather
.insert(session
->get_client());
1765 dout(1) << "reconnect gives up on " << session
->info
.inst
<< dendl
;
1767 mds
->clog
->warn() << "evicting unresponsive client " << *session
1768 << ", after waiting " << elapse1
1769 << " seconds during MDS startup";
1771 // make _session_logged() purge orphan objects of lost async/unsafe requests
1772 session
->delegated_inos
.swap(session
->free_prealloc_inos
);
1774 if (g_conf()->mds_session_blocklist_on_timeout
) {
1775 CachedStackStringStream css
;
1776 mds
->evict_client(session
->get_client().v
, false, true, *css
,
1779 kill_session(session
, NULL
);
1782 failed_reconnects
++;
1784 client_reconnect_gather
.clear();
1785 client_reconnect_denied
.clear();
1787 if (gather
.has_subs()) {
1788 dout(1) << "reconnect will complete once clients are evicted" << dendl
;
1789 gather
.set_finisher(new MDSInternalContextWrapper(mds
, new LambdaContext(
1790 [this](int r
){reconnect_gather_finish();})));
1792 reconnect_evicting
= true;
1794 reconnect_gather_finish();
/**
 * Restore a client's file locks onto an inode (used when rebuilding lock
 * state, e.g. during client reconnect).
 *
 * The bufferlist holds two consecutive sections, each an integer count
 * followed by that many encoded ceph_filelock records: first the fcntl
 * (POSIX) locks, then the flock locks.  Each decoded lock is stamped with
 * the owning client id and inserted into the inode's lock state.
 *
 * @param in     inode whose lock state is being rebuilt
 * @param locks  encoded lock data; empty means nothing to recover
 * @param client id of the client that owns every lock in the blob
 */
void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
{
  if (!locks.length()) return;
  int numlocks;
  ceph_filelock lock;
  auto p = locks.cbegin();
  // section 1: fcntl (POSIX) locks
  decode(numlocks, p);
  for (int i = 0; i < numlocks; ++i) {
    decode(lock, p);
    lock.client = client;  // the blob does not carry the owner; stamp it here
    in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
    ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
  }
  // section 2: flock locks
  decode(numlocks, p);
  for (int i = 0; i < numlocks; ++i) {
    decode(lock, p);
    lock.client = client;
    in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock> (lock.start, lock));
    ++in->get_flock_lock_state()->client_held_lock_counts[client];
  }
}
/**
 * Call this when the MDCache is oversized, to send requests to the clients
 * to trim some caps, and consequently unpin some inodes in the MDCache so
 * that it can trim too.
 *
 * Sessions holding the most caps are targeted first.  Several decayed
 * per-session and global counters throttle how aggressively we recall so
 * that neither the MDS nor any one client is overloaded.
 *
 * @param gather optional gather builder; when non-null, each targeted
 *               session is also flushed via flush_session()
 * @param flags  which recall policies to apply (STEADY/ENFORCE_MAX/
 *               ENFORCE_LIVENESS/TRIM)
 * @return {throttled, caps_recalled}: whether any recall was skipped due
 *         to throttling, and how many caps we asked clients to release
 */
std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
{
  const auto now = clock::now();
  const bool steady = !!(flags&RecallFlags::STEADY);
  const bool enforce_max = !!(flags&RecallFlags::ENFORCE_MAX);
  const bool enforce_liveness = !!(flags&RecallFlags::ENFORCE_LIVENESS);
  const bool trim = !!(flags&RecallFlags::TRIM);

  const auto max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
  const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
  const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
  const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
  const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
  const auto cache_liveness_magnitude = g_conf().get_val<Option::size_t>("mds_session_cache_liveness_magnitude");

  dout(7) << __func__ << ":"
          << " min=" << min_caps_per_client
          << " max=" << max_caps_per_client
          << " total=" << Capability::count()
          << " flags=" << flags
          << dendl;

  /* trim caps of sessions with the most caps first */
  std::multimap<uint64_t, Session*> caps_session;
  auto f = [&caps_session, enforce_max, enforce_liveness, trim, max_caps_per_client, cache_liveness_magnitude](auto& s) {
    auto num_caps = s->caps.size();
    auto cache_liveness = s->get_session_cache_liveness();
    // a session is a candidate if we are trimming everything, it exceeds the
    // per-client cap limit, or its cache activity is low relative to the
    // number of caps it holds (liveness check)
    if (trim || (enforce_max && num_caps > max_caps_per_client) || (enforce_liveness && cache_liveness < (num_caps>>cache_liveness_magnitude))) {
      caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s));
    }
  };
  mds->sessionmap.get_client_sessions(std::move(f));

  std::pair<bool, uint64_t> result = {false, 0};
  auto& [throttled, caps_recalled] = result;
  last_recall_state = now;
  // iterate sessions from most caps to fewest
  for (const auto& [num_caps, session] : boost::adaptors::reverse(caps_session)) {
    if (!session->is_open() ||
        !session->get_connection() ||
        !session->info.inst.name.is_client())
      continue;

    dout(10) << __func__ << ":"
             << " session " << session->info.inst
             << " caps " << num_caps
             << ", leases " << session->leases.size()
             << dendl;

    uint64_t newlim;
    // never push a session below min_caps_per_client
    if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
      newlim = min_caps_per_client;
    } else {
      newlim = num_caps-recall_max_caps;
    }
    if (num_caps > newlim) {
      /* now limit the number of caps we recall at a time to prevent overloading ourselves */
      uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
      newlim = num_caps-recall;
      const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
      const uint64_t session_recall_throttle2o = session->get_recall_caps_throttle2o();
      const uint64_t global_recall_throttle = recall_throttle.get();
      if (session_recall_throttle+recall > recall_max_decay_threshold) {
        dout(15) << "  session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
        throttled = true;
        continue;
      } else if (session_recall_throttle2o+recall > recall_max_caps*2) {
        dout(15) << "  session recall 2nd-order threshold (" << 2*recall_max_caps << ") hit at " << session_recall_throttle2o << "; skipping!" << dendl;
        throttled = true;
        continue;
      } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
        // the global throttle gates all sessions; no point continuing the loop
        dout(15) << "  global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
        throttled = true;
        break;
      }

      // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
      if (steady) {
        const auto session_recall = session->get_recall_caps();
        const auto session_release = session->get_release_caps();
        if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
          /* The session has been unable to keep up with the number of caps
           * recalled (by half); additionally, to prevent marking sessions
           * we've just begun to recall from, the session_recall counter
           * (decayed count of caps recently recalled) is **greater** than the
           * session threshold for the session's cap recall throttle.
           */
          dout(15) << "  2*session_release < session_recall"
                      " (2*" << session_release << " < " << session_recall << ") &&"
                      " 2*session_recall < recall_max_decay_threshold"
                      " (2*" << session_recall << " > " << recall_max_decay_threshold << ")"
                      " Skipping because we are unlikely to get more released." << dendl;
          continue;
        } else if (recall < recall_max_caps && 2*recall < session_recall) {
          /* The number of caps recalled is less than the number we *could*
           * recall (so there isn't much left to recall?) and the number of
           * caps is less than the current recall_caps counter (decayed count
           * of caps recently recalled).
           */
          dout(15) << "  2*recall < session_recall "
                      " (2*" << recall << " < " << session_recall << ") &&"
                      " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
                      " Skipping because we are unlikely to get more released." << dendl;
          continue;
        }
      }

      dout(7) << "  recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;

      auto m = make_message<MClientSession>(CEPH_SESSION_RECALL_STATE);
      m->head.max_caps = newlim;
      mds->send_message_client(m, session);
      if (gather) {
        flush_session(session, *gather);
      }
      caps_recalled += session->notify_recall_sent(newlim);
      recall_throttle.hit(recall);
    }
  }

  dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;

  return result;
}
1949 void Server::force_clients_readonly()
1951 dout(10) << "force_clients_readonly" << dendl
;
1952 set
<Session
*> sessions
;
1953 mds
->sessionmap
.get_client_session_set(sessions
);
1954 for (set
<Session
*>::const_iterator p
= sessions
.begin();
1955 p
!= sessions
.end();
1957 Session
*session
= *p
;
1958 if (!session
->info
.inst
.name
.is_client() ||
1959 !(session
->is_open() || session
->is_stale()))
1961 mds
->send_message_client(make_message
<MClientSession
>(CEPH_SESSION_FORCE_RO
), session
);
/*
 * some generic stuff for finishing off requests
 */

/**
 * Journal a (possibly early-replied) update and arrange for the reply.
 *
 * Records the trace targets (in/dn) on the mdr for the eventual reply,
 * attempts an early (unsafe) reply, then submits the log event.  For
 * replayed client requests this also advances the replay queue; otherwise,
 * if an early reply was sent, rdlocks can be dropped immediately, and if
 * not, the log is flushed so the safe reply isn't delayed.
 */
void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
{
  dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
  ceph_assert(!mdr->has_completed);

  // note trace items for eventual reply.
  mdr->tracei = in;
  if (in)
    mdr->pin(in);

  mdr->tracedn = dn;
  if (dn)
    mdr->pin(dn);

  early_reply(mdr, in, dn);

  mdr->committing = true;
  submit_mdlog_entry(le, fin, mdr, __func__);

  if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
    if (mds->queue_one_replay()) {
      dout(10) << " queued next replay op" << dendl;
    } else {
      dout(10) << " journaled last replay op" << dendl;
    }
  } else if (mdr->did_early_reply)
    // client already has the (unsafe) answer; the read locks are no longer needed
    mds->locker->drop_rdlocks_for_early_reply(mdr.get());
  else
    // no early reply was sent; flush so the safe reply isn't held up by the journal
    mdlog->flush();
}
/**
 * Submit a log event to the MDLog, tagging the request's event trail with
 * the caller-supplied event name (normally __func__) for op tracking.
 *
 * @param le    the log event to journal
 * @param fin   completion invoked once the entry is safely journaled
 * @param mdr   request being journaled (may be null for non-request events)
 * @param event short human-readable label recorded on the mdr
 */
void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
                                std::string_view event)
{
  if (mdr) {
    string event_str("submit entry: ");
    event_str += event;
    mdr->mark_event(event_str);
  }
  mdlog->submit_entry(le, fin);
}
/*
 * send response built from mdr contents and error code; clean up mdr
 */
void Server::respond_to_request(MDRequestRef& mdr, int r)
{
  if (mdr->client_request) {
    if (mdr->is_batch_head()) {
      // a batch head answers for every request folded into the batch
      dout(20) << __func__ << " batch head " << *mdr << dendl;
      mdr->release_batch_op()->respond(r);
    } else {
      reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
    }
  } else if (mdr->internal_op > -1) {
    // internal ops have no client to reply to; complete their finisher instead
    dout(10) << "respond_to_request on internal request " << mdr << dendl;
    if (!mdr->internal_op_finish)
      ceph_abort_msg("trying to respond to internal op without finisher");
    mdr->internal_op_finish->complete(r);
    mdcache->request_finish(mdr);
  }
}
// statistics mds req op number and latency
/**
 * Record the latency of a completed client request into the per-op
 * perf counter matching its opcode.  Unknown opcodes are logged and
 * not counted.
 */
void Server::perf_gather_op_latency(const cref_t<MClientRequest> &req, utime_t lat)
{
  int code = l_mdss_first;
  switch(req->get_op()) {
  case CEPH_MDS_OP_LOOKUPHASH:
    code = l_mdss_req_lookuphash_latency;
    break;
  case CEPH_MDS_OP_LOOKUPINO:
    code = l_mdss_req_lookupino_latency;
    break;
  case CEPH_MDS_OP_LOOKUPPARENT:
    code = l_mdss_req_lookupparent_latency;
    break;
  case CEPH_MDS_OP_LOOKUPNAME:
    code = l_mdss_req_lookupname_latency;
    break;
  case CEPH_MDS_OP_LOOKUP:
    code = l_mdss_req_lookup_latency;
    break;
  case CEPH_MDS_OP_LOOKUPSNAP:
    code = l_mdss_req_lookupsnap_latency;
    break;
  case CEPH_MDS_OP_GETATTR:
    code = l_mdss_req_getattr_latency;
    break;
  case CEPH_MDS_OP_SETATTR:
    code = l_mdss_req_setattr_latency;
    break;
  case CEPH_MDS_OP_SETLAYOUT:
    code = l_mdss_req_setlayout_latency;
    break;
  case CEPH_MDS_OP_SETDIRLAYOUT:
    code = l_mdss_req_setdirlayout_latency;
    break;
  case CEPH_MDS_OP_GETVXATTR:
    code = l_mdss_req_getvxattr_latency;
    break;
  case CEPH_MDS_OP_SETXATTR:
    code = l_mdss_req_setxattr_latency;
    break;
  case CEPH_MDS_OP_RMXATTR:
    code = l_mdss_req_rmxattr_latency;
    break;
  case CEPH_MDS_OP_READDIR:
    code = l_mdss_req_readdir_latency;
    break;
  case CEPH_MDS_OP_SETFILELOCK:
    code = l_mdss_req_setfilelock_latency;
    break;
  case CEPH_MDS_OP_GETFILELOCK:
    code = l_mdss_req_getfilelock_latency;
    break;
  case CEPH_MDS_OP_CREATE:
    code = l_mdss_req_create_latency;
    break;
  case CEPH_MDS_OP_OPEN:
    code = l_mdss_req_open_latency;
    break;
  case CEPH_MDS_OP_MKNOD:
    code = l_mdss_req_mknod_latency;
    break;
  case CEPH_MDS_OP_LINK:
    code = l_mdss_req_link_latency;
    break;
  case CEPH_MDS_OP_UNLINK:
    code = l_mdss_req_unlink_latency;
    break;
  case CEPH_MDS_OP_RMDIR:
    code = l_mdss_req_rmdir_latency;
    break;
  case CEPH_MDS_OP_RENAME:
    code = l_mdss_req_rename_latency;
    break;
  case CEPH_MDS_OP_MKDIR:
    code = l_mdss_req_mkdir_latency;
    break;
  case CEPH_MDS_OP_SYMLINK:
    code = l_mdss_req_symlink_latency;
    break;
  case CEPH_MDS_OP_LSSNAP:
    code = l_mdss_req_lssnap_latency;
    break;
  case CEPH_MDS_OP_MKSNAP:
    code = l_mdss_req_mksnap_latency;
    break;
  case CEPH_MDS_OP_RMSNAP:
    code = l_mdss_req_rmsnap_latency;
    break;
  case CEPH_MDS_OP_RENAMESNAP:
    code = l_mdss_req_renamesnap_latency;
    break;
  case CEPH_MDS_OP_READDIR_SNAPDIFF:
    code = l_mdss_req_snapdiff_latency;
    break;
  default:
    // unknown ops are not counted anywhere; just note them
    dout(1) << ": unknown client op" << dendl;
    return;
  }
  logger->tinc(code, lat);
}
/**
 * Send an early (unsafe) reply for a client request before its update is
 * durably journaled, when configuration and the request's state allow it.
 *
 * An early reply is skipped when: early replies are disabled, the request
 * forbids it, peers have journaled on its behalf, an inode was allocated
 * (the client must not learn the ino before it is safe), the requester is
 * another MDS, or this is a replayed op.  On success the reply is marked
 * unsafe, trace metadata is attached, and did_early_reply is set so the
 * final (safe) reply skips lease issuance.
 */
void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
{
  if (!g_conf()->mds_early_reply)
    return;

  if (mdr->no_early_reply) {
    dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
    return;
  }

  if (mdr->has_more() && mdr->more()->has_journaled_peers) {
    dout(10) << "early_reply - there are journaled peers, not allowed." << dendl;
    return;
  }

  if (mdr->alloc_ino) {
    dout(10) << "early_reply - allocated ino, not allowed" << dendl;
    return;
  }

  const cref_t<MClientRequest> &req = mdr->client_request;
  entity_inst_t client_inst = req->get_source_inst();
  if (client_inst.name.is_mds())
    return;

  if (req->is_replay()) {
    dout(10) << " no early reply on replay op" << dendl;
    return;
  }

  auto reply = make_message<MClientReply>(*req, 0);
  reply->set_unsafe();

  // mark xlocks "done", indicating that we are exposing uncommitted changes.
  //
  //_rename_finish() does not send dentry link/unlink message to replicas.
  // so do not set xlocks on dentries "done", the xlocks prevent dentries
  // that have projected linkages from getting new replica.
  mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);

  dout(10) << "early_reply " << reply->get_result()
           << " (" << cpp_strerror(reply->get_result())
           << ") " << *req << dendl;

  if (tracei || tracedn) {
    // don't process embedded cap releases for inodes we're about to trace;
    // the trace will reissue caps
    if (tracei)
      mdr->cap_releases.erase(tracei->vino());
    if (tracedn)
      mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());

    set_trace_dist(reply, tracei, tracedn, mdr);
  }

  reply->set_extra_bl(mdr->reply_extra_bl);
  mds->send_message_client(reply, mdr->session);

  mdr->did_early_reply = true;

  // reply-latency accounting (mirrors reply_client_request)
  mds->logger->inc(l_mds_reply);
  utime_t lat = ceph_clock_now() - req->get_recv_stamp();
  mds->logger->tinc(l_mds_reply_latency, lat);
  if (lat >= g_conf()->mds_op_complaint_time) {
    mds->logger->inc(l_mds_slow_reply);
  }
  if (client_inst.name.is_client()) {
    mds->sessionmap.hit_session(mdr->session);
  }
  perf_gather_op_latency(req, lat);
  dout(20) << "lat " << lat << dendl;

  mdr->mark_event("early_replied");
}
/*
 * include a trace to tracei
 */

/**
 * Send the final (safe) reply for a client request and clean up the mdr.
 *
 * Records completed writes in the session (except SETFILELOCK, which only
 * mutates in-memory MDS state and must be re-executed on resend), applies
 * preallocated inos, optionally attaches trace metadata and issues leases
 * (only if no early reply already did), advances the replay queue when
 * appropriate, and finishes the request.
 */
void Server::reply_client_request(MDRequestRef& mdr, const ref_t<MClientReply> &reply)
{
  ceph_assert(mdr.get());
  const cref_t<MClientRequest> &req = mdr->client_request;

  dout(7) << "reply_client_request " << reply->get_result()
          << " (" << cpp_strerror(reply->get_result())
          << ") " << *req << dendl;

  mdr->mark_event("replying");

  Session *session = mdr->session;

  // note successful request in session map?
  //
  // setfilelock requests are special, they only modify states in MDS memory.
  // The states get lost when MDS fails. If Client re-send a completed
  // setfilelock request, it means that client did not receive corresponding
  // setfilelock reply.  So MDS should re-execute the setfilelock request.
  if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
      reply->get_result() == 0 && session) {
    inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
    session->add_completed_request(mdr->reqid.tid, created);
    if (mdr->ls) {
      // mark the session dirty so completed_requests can be flushed at
      // segment expiry
      mdr->ls->touched_sessions.insert(session->info.inst.name);
    }
  }

  // give any preallocated inos to the session
  apply_allocated_inos(mdr, session);

  // get tracei/tracedn from mdr?
  CInode *tracei = mdr->tracei;
  CDentry *tracedn = mdr->tracedn;

  bool is_replay = mdr->client_request->is_replay();
  bool did_early_reply = mdr->did_early_reply;
  entity_inst_t client_inst = req->get_source_inst();

  if (!did_early_reply && !is_replay) {
    // latency accounting happens here only if early_reply didn't already do it
    mds->logger->inc(l_mds_reply);
    utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
    mds->logger->tinc(l_mds_reply_latency, lat);
    if (lat >= g_conf()->mds_op_complaint_time) {
      mds->logger->inc(l_mds_slow_reply);
    }
    if (session && client_inst.name.is_client()) {
      mds->sessionmap.hit_session(session);
    }
    perf_gather_op_latency(req, lat);
    dout(20) << "lat " << lat << dendl;

    if (tracei)
      mdr->cap_releases.erase(tracei->vino());
    if (tracedn)
      mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
  }

  // drop non-rdlocks before replying, so that we can issue leases
  mdcache->request_drop_non_rdlocks(mdr);

  // reply at all?
  if (session && !client_inst.name.is_mds()) {
    // send reply.
    if (!did_early_reply &&   // don't issue leases if we sent an earlier reply already
        (tracei || tracedn)) {
      if (is_replay) {
        if (tracei)
          mdcache->try_reconnect_cap(tracei, session);
      } else {
        // include metadata in reply
        set_trace_dist(reply, tracei, tracedn, mdr);
      }
    }

    // We can set the extra bl unconditionally: if it's already been sent in the
    // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
    reply->set_extra_bl(mdr->reply_extra_bl);

    reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
    mds->send_message_client(reply, session);
  }

  if (req->is_queued_for_replay() &&
      (mdr->has_completed || reply->get_result() < 0)) {
    if (reply->get_result() < 0) {
      int r = reply->get_result();
      derr << "reply_client_request: failed to replay " << *req
           << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
      mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
    }
    mds->queue_one_replay();
  }

  // clean up request
  mdcache->request_finish(mdr);

  // take a closer look at tracei, if it happens to be a remote link
  if (tracei &&
      tracedn &&
      tracedn->get_projected_linkage()->is_remote()) {
    mdcache->eval_remote(tracedn);
  }
}
/*
 * pass inode OR dentry (not both, or we may get confused)
 *
 * trace is in reverse order (i.e. root inode comes last)
 */
/**
 * Encode the reply "trace" (snap realm, parent dir, dentry, target inode)
 * into the reply message.  Encoding order matters: the client decodes the
 * same sequence, so do not reorder the encode() calls below.
 */
void Server::set_trace_dist(const ref_t<MClientReply> &reply,
                            CInode *in, CDentry *dn,
                            MDRequestRef& mdr)
{
  // skip doing this for debugging purposes?
  if (g_conf()->mds_inject_traceless_reply_probability &&
      mdr->ls && !mdr->o_trunc &&
      (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability * 10000.0)) {
    dout(5) << "deliberately skipping trace for " << *reply << dendl;
    return;
  }

  // inode, dentry, dir, ..., inode
  bufferlist bl;
  mds_rank_t whoami = mds->get_nodeid();
  Session *session = mdr->session;
  snapid_t snapid = mdr->snapid;
  utime_t now = ceph_clock_now();

  dout(20) << "set_trace_dist snapid " << snapid << dendl;

  // realm
  if (snapid == CEPH_NOSNAP) {
    SnapRealm *realm;
    if (in)
      realm = in->find_snaprealm();
    else
      realm = dn->get_dir()->get_inode()->find_snaprealm();
    reply->snapbl = get_snap_trace(session, realm);
    dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
  }

  // dentry
  if (dn) {
    reply->head.is_dentry = 1;
    CDir *dir = dn->get_dir();
    CInode *diri = dir->get_inode();

    diri->encode_inodestat(bl, session, NULL, snapid);
    dout(20) << "set_trace_dist added diri " << *diri << dendl;

#ifdef MDS_VERIFY_FRAGSTAT
    if (dir->is_complete())
      dir->verify_fragstat();
#endif
    DirStat ds;
    ds.frag = dir->get_frag();
    ds.auth = dir->get_dir_auth().first;
    // only advertise the dirfrag distribution when we are auth and requests
    // are not being funneled to the auth MDS
    if (dir->is_auth() && !forward_all_requests_to_auth)
      dir->get_dist_spec(ds.dist, whoami);

    dir->encode_dirstat(bl, session->info, ds);
    dout(20) << "set_trace_dist added dir  " << *dir << dendl;

    encode(dn->get_name(), bl);
    mds->locker->issue_client_lease(dn, in, mdr, now, bl);
  } else
    reply->head.is_dentry = 0;

  // inode
  if (in) {
    in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
    dout(20) << "set_trace_dist added snap " << snapid << " in " << *in
             << dendl;
    reply->head.is_target = 1;
  } else
    reply->head.is_target = 0;

  reply->set_trace(bl);
}
/**
 * Entry point for a client request message.
 *
 * Validates the session, detects already-completed (retried/replayed)
 * requests and answers them without re-execution, trims the session's
 * completed-request list using the client's oldest_client_tid, then
 * registers an MDRequest and dispatches it.
 */
void Server::handle_client_request(const cref_t<MClientRequest> &req)
{
  dout(4) << "handle_client_request " << *req << dendl;

  if (mds->logger)
    mds->logger->inc(l_mds_request);
  if (logger)
    logger->inc(l_mdss_handle_client_request);

  if (!mdcache->is_open()) {
    dout(5) << "waiting for root" << dendl;
    mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
    return;
  }

  bool sessionclosed_isok = replay_unsafe_with_closed_session;
  // active session?
  Session *session = 0;
  if (req->is_a_client()) {
    session = mds->get_session(req);
    if (!session) {
      dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
    } else if ((session->is_closed() && (!mds->is_clientreplay() || !sessionclosed_isok)) ||
               session->is_closing() ||
               session->is_killing()) {
      dout(5) << "session closed|closing|killing, dropping" << dendl;
      session = NULL;
    }
    if (!session) {
      // keep the replay queue moving even though we drop this op
      if (req->is_queued_for_replay())
        mds->queue_one_replay();
      return;
    }
  }

  // old mdsmap?
  if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
    // send it?  hrm, this isn't ideal; they may get a lot of copies if
    // they have a high request rate.
  }

  // completed request?
  bool has_completed = false;
  if (req->is_replay() || req->get_retry_attempt()) {
    ceph_assert(session);
    inodeno_t created;
    if (session->have_completed_request(req->get_reqid().tid, &created)) {
      has_completed = true;
      if (!session->is_open())
        return;
      // Don't send traceless reply if the completed request has created
      // new inode. Treat the request as lookup request instead.
      if (req->is_replay() ||
          ((created == inodeno_t() || !mds->is_clientreplay()) &&
           req->get_op() != CEPH_MDS_OP_OPEN &&
           req->get_op() != CEPH_MDS_OP_CREATE)) {
        dout(5) << "already completed " << req->get_reqid() << dendl;
        auto reply = make_message<MClientReply>(*req, 0);
        if (created != inodeno_t()) {
          bufferlist extra;
          encode(created, extra);
          reply->set_extra_bl(extra);
        }
        mds->send_message_client(reply, session);

        if (req->is_queued_for_replay())
          mds->queue_one_replay();

        return;
      }
      if (req->get_op() != CEPH_MDS_OP_OPEN &&
          req->get_op() != CEPH_MDS_OP_CREATE) {
        dout(10) << " completed request which created new inode " << created
                 << ", convert it to lookup request" << dendl;
        req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
        req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
      }
    }
  }

  // trim completed_request list
  if (req->get_oldest_client_tid() > 0) {
    dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
    ceph_assert(session);
    if (session->trim_completed_requests(req->get_oldest_client_tid())) {
      // Sessions 'completed_requests' was dirtied, mark it to be
      // potentially flushed at segment expiry.
      mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);

      if (session->get_num_trim_requests_warnings() > 0 &&
          session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests)
        session->reset_num_trim_requests_warnings();
    } else {
      // warn (with exponential backoff) about clients that never advance
      // their oldest_client_tid, pinning completed requests in memory
      if (session->get_num_completed_requests() >=
          (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
        session->inc_num_trim_requests_warnings();
        CachedStackStringStream css;
        *css << "client." << session->get_client() << " does not advance its oldest_client_tid ("
          << req->get_oldest_client_tid() << "), "
          << session->get_num_completed_requests()
          << " completed requests recorded in session\n";
        mds->clog->warn() << css->strv();
        dout(20) << __func__ << " " << css->strv() << dendl;
      }
    }
  }

  // register + dispatch
  MDRequestRef mdr = mdcache->request_start(req);
  if (!mdr.get())
    return;

  if (session) {
    mdr->session = session;
    session->requests.push_back(&mdr->item_session_request);
  }

  if (has_completed)
    mdr->has_completed = true;

  // process embedded cap releases?
  //  (only if NOT replay!)
  if (!req->releases.empty() && req->is_a_client() && !req->is_replay()) {
    client_t client = req->get_source().num();
    for (const auto &r : req->releases) {
      mds->locker->process_request_cap_release(mdr, client, r.item, r.dname);
    }
    req->releases.clear();
  }

  dispatch_client_request(mdr);
}
/**
 * React to a new OSDMap: refresh the cached is_full state for the
 * metadata pool so request dispatch can cheaply gate writes.
 */
void Server::handle_osd_map()
{
  /* Note that we check the OSDMAP_FULL flag directly rather than
   * using osdmap_full_flag(), because we want to know "is the flag set"
   * rather than "does the flag apply to us?" */
  mds->objecter->with_osdmap([this](const OSDMap& o) {
      auto pi = o.get_pg_pool(mds->get_metadata_pool());
      // pool may have been deleted; treat a missing pool as not-full
      is_full = pi && pi->has_flag(pg_pool_t::FLAG_FULL);
      dout(7) << __func__ << ": full = " << is_full << " epoch = "
              << o.get_epoch() << dendl;
    });
}
2541 void Server::dispatch_client_request(MDRequestRef
& mdr
)
2543 // we shouldn't be waiting on anyone.
2544 ceph_assert(!mdr
->has_more() || mdr
->more()->waiting_on_peer
.empty());
2547 dout(10) << "request " << *mdr
<< " was killed" << dendl
;
2548 //if the mdr is a "batch_op" and it has followers, pick a follower as
2549 //the new "head of the batch ops" and go on processing the new one.
2550 if (mdr
->is_batch_head()) {
2551 int mask
= mdr
->client_request
->head
.args
.getattr
.mask
;
2552 auto it
= mdr
->batch_op_map
->find(mask
);
2553 auto new_batch_head
= it
->second
->find_new_head();
2554 if (!new_batch_head
) {
2555 mdr
->batch_op_map
->erase(it
);
2558 mdr
= std::move(new_batch_head
);
2562 } else if (mdr
->aborted
) {
2563 mdr
->aborted
= false;
2564 mdcache
->request_kill(mdr
);
2568 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
2570 if (logger
) logger
->inc(l_mdss_dispatch_client_request
);
2572 dout(7) << "dispatch_client_request " << *req
<< dendl
;
2574 if (req
->may_write() && mdcache
->is_readonly()) {
2575 dout(10) << " read-only FS" << dendl
;
2576 respond_to_request(mdr
, -CEPHFS_EROFS
);
2579 if (mdr
->has_more() && mdr
->more()->peer_error
) {
2580 dout(10) << " got error from peers" << dendl
;
2581 respond_to_request(mdr
, mdr
->more()->peer_error
);
2586 CInode
*cur
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
2588 // the request is already responded to
2591 if (req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
2592 req
->get_op() == CEPH_MDS_OP_SETDIRLAYOUT
||
2593 req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
2594 req
->get_op() == CEPH_MDS_OP_RMXATTR
||
2595 req
->get_op() == CEPH_MDS_OP_SETXATTR
||
2596 req
->get_op() == CEPH_MDS_OP_CREATE
||
2597 req
->get_op() == CEPH_MDS_OP_SYMLINK
||
2598 req
->get_op() == CEPH_MDS_OP_MKSNAP
||
2599 ((req
->get_op() == CEPH_MDS_OP_LINK
||
2600 req
->get_op() == CEPH_MDS_OP_RENAME
) &&
2601 (!mdr
->has_more() || mdr
->more()->witnessed
.empty())) // haven't started peer request
2604 if (check_access(mdr
, cur
, MAY_FULL
)) {
2605 dout(20) << __func__
<< ": full, has FULL caps, permitting op " << ceph_mds_op_name(req
->get_op()) << dendl
;
2607 dout(20) << __func__
<< ": full, responding CEPHFS_ENOSPC to op " << ceph_mds_op_name(req
->get_op()) << dendl
;
2608 respond_to_request(mdr
, -CEPHFS_ENOSPC
);
2612 dout(20) << __func__
<< ": full, permitting op " << ceph_mds_op_name(req
->get_op()) << dendl
;
2616 switch (req
->get_op()) {
2617 case CEPH_MDS_OP_LOOKUPHASH
:
2618 case CEPH_MDS_OP_LOOKUPINO
:
2619 handle_client_lookup_ino(mdr
, false, false);
2621 case CEPH_MDS_OP_LOOKUPPARENT
:
2622 handle_client_lookup_ino(mdr
, true, false);
2624 case CEPH_MDS_OP_LOOKUPNAME
:
2625 handle_client_lookup_ino(mdr
, false, true);
2629 case CEPH_MDS_OP_LOOKUP
:
2630 handle_client_getattr(mdr
, true);
2633 case CEPH_MDS_OP_LOOKUPSNAP
:
2634 // lookupsnap does not reference a CDentry; treat it as a getattr
2635 case CEPH_MDS_OP_GETATTR
:
2636 handle_client_getattr(mdr
, false);
2638 case CEPH_MDS_OP_GETVXATTR
:
2639 handle_client_getvxattr(mdr
);
2642 case CEPH_MDS_OP_SETATTR
:
2643 handle_client_setattr(mdr
);
2645 case CEPH_MDS_OP_SETLAYOUT
:
2646 handle_client_setlayout(mdr
);
2648 case CEPH_MDS_OP_SETDIRLAYOUT
:
2649 handle_client_setdirlayout(mdr
);
2651 case CEPH_MDS_OP_SETXATTR
:
2652 handle_client_setxattr(mdr
);
2654 case CEPH_MDS_OP_RMXATTR
:
2655 handle_client_removexattr(mdr
);
2658 case CEPH_MDS_OP_READDIR
:
2659 handle_client_readdir(mdr
);
2662 case CEPH_MDS_OP_SETFILELOCK
:
2663 handle_client_file_setlock(mdr
);
2666 case CEPH_MDS_OP_GETFILELOCK
:
2667 handle_client_file_readlock(mdr
);
2671 case CEPH_MDS_OP_CREATE
:
2672 if (mdr
->has_completed
)
2673 handle_client_open(mdr
); // already created.. just open
2675 handle_client_openc(mdr
);
2678 case CEPH_MDS_OP_OPEN
:
2679 handle_client_open(mdr
);
2684 case CEPH_MDS_OP_MKNOD
:
2685 handle_client_mknod(mdr
);
2687 case CEPH_MDS_OP_LINK
:
2688 handle_client_link(mdr
);
2690 case CEPH_MDS_OP_UNLINK
:
2691 case CEPH_MDS_OP_RMDIR
:
2692 handle_client_unlink(mdr
);
2694 case CEPH_MDS_OP_RENAME
:
2695 handle_client_rename(mdr
);
2697 case CEPH_MDS_OP_MKDIR
:
2698 handle_client_mkdir(mdr
);
2700 case CEPH_MDS_OP_SYMLINK
:
2701 handle_client_symlink(mdr
);
2706 case CEPH_MDS_OP_LSSNAP
:
2707 handle_client_lssnap(mdr
);
2709 case CEPH_MDS_OP_MKSNAP
:
2710 handle_client_mksnap(mdr
);
2712 case CEPH_MDS_OP_RMSNAP
:
2713 handle_client_rmsnap(mdr
);
2715 case CEPH_MDS_OP_RENAMESNAP
:
2716 handle_client_renamesnap(mdr
);
2718 case CEPH_MDS_OP_READDIR_SNAPDIFF
:
2719 handle_client_readdir_snapdiff(mdr
);
2723 dout(1) << " unknown client op " << req
->get_op() << dendl
;
2724 respond_to_request(mdr
, -CEPHFS_EOPNOTSUPP
);
2729 // ---------------------------------------
/**
 * Entry point for an MMDSPeerRequest from the leader MDS of a
 * multi-MDS operation.
 *
 * Replies are routed to handle_peer_request_reply().  Otherwise this
 * reconciles request attempts (drop stale messages, close out superseded
 * local attempts), handles DROPLOCKS/FINISH control ops, registers a new
 * peer MDRequest when needed, and dispatches it.
 */
void Server::handle_peer_request(const cref_t<MMDSPeerRequest> &m)
{
  dout(4) << "handle_peer_request " << m->get_reqid() << " from " << m->get_source() << dendl;
  mds_rank_t from = mds_rank_t(m->get_source().num());

  if (logger) logger->inc(l_mdss_handle_peer_request);

  // reply?
  if (m->is_reply())
    return handle_peer_request_reply(m);

  // the purpose of rename notify is enforcing causal message ordering. making sure
  // bystanders have received all messages from rename srcdn's auth MDS.
  if (m->get_op() == MMDSPeerRequest::OP_RENAMENOTIFY) {
    auto reply = make_message<MMDSPeerRequest>(m->get_reqid(), m->get_attempt(), MMDSPeerRequest::OP_RENAMENOTIFYACK);
    mds->send_message(reply, m->get_connection());
    return;
  }

  CDentry *straydn = NULL;
  if (m->straybl.length() > 0) {
    mdcache->decode_replica_stray(straydn, nullptr, m->straybl, from);
    ceph_assert(straydn);
    m->straybl.clear();
  }

  if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
    dout(3) << "not clientreplay|active yet, waiting" << dendl;
    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
    return;
  }

  // am i a new peer?
  MDRequestRef mdr;
  if (mdcache->have_request(m->get_reqid())) {
    // existing?
    mdr = mdcache->request_get(m->get_reqid());

    // is my request newer?
    if (mdr->attempt > m->get_attempt()) {
      dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
               << ", dropping " << *m << dendl;
      return;
    }

    if (mdr->attempt < m->get_attempt()) {
      // mine is old, close it out
      dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
               << ", closing out" << dendl;
      mdcache->request_finish(mdr);
      mdr.reset();
    } else if (mdr->peer_to_mds != from) {
      dout(10) << "local request " << *mdr << " not peer to mds." << from << dendl;
      return;
    }

    // may get these while mdr->peer_request is non-null
    if (m->get_op() == MMDSPeerRequest::OP_DROPLOCKS) {
      mds->locker->drop_locks(mdr.get());
      return;
    }
    if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
      if (m->is_abort()) {
        mdr->aborted = true;
        if (mdr->peer_request) {
          // only abort on-going xlock, wrlock and auth pin
          ceph_assert(!mdr->peer_did_prepare());
        } else {
          mdcache->request_finish(mdr);
        }
      } else {
        if (m->inode_export.length() > 0)
          mdr->more()->inode_import = m->inode_export;
        // finish off request.
        mdcache->request_finish(mdr);
      }
      return;
    }
  }
  if (!mdr) {
    // new?
    if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
      dout(10) << "missing peer request for " << m->get_reqid()
               << " OP_FINISH, must have lost race with a forward" << dendl;
      return;
    }
    mdr = mdcache->request_start_peer(m->get_reqid(), m->get_attempt(), m);
    mdr->set_op_stamp(m->op_stamp);
  }
  ceph_assert(mdr->peer_request == 0);     // only one at a time, please!

  if (straydn) {
    mdr->pin(straydn);
    mdr->straydn = straydn;
  }

  if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
      mdr->locks.empty()) {
    dout(3) << "not active yet, waiting" << dendl;
    mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
    return;
  }

  mdr->reset_peer_request(m);

  dispatch_peer_request(mdr);
}
2840 void Server::handle_peer_request_reply(const cref_t
<MMDSPeerRequest
> &m
)
2842 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2844 if (!mds
->is_clientreplay() && !mds
->is_active() && !mds
->is_stopping()) {
2845 metareqid_t r
= m
->get_reqid();
2846 if (!mdcache
->have_uncommitted_leader(r
, from
)) {
2847 dout(10) << "handle_peer_request_reply ignoring peer reply from mds."
2848 << from
<< " reqid " << r
<< dendl
;
2851 dout(3) << "not clientreplay|active yet, waiting" << dendl
;
2852 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
2856 if (m
->get_op() == MMDSPeerRequest::OP_COMMITTED
) {
2857 metareqid_t r
= m
->get_reqid();
2858 mdcache
->committed_leader_peer(r
, from
);
2862 MDRequestRef mdr
= mdcache
->request_get(m
->get_reqid());
2863 if (m
->get_attempt() != mdr
->attempt
) {
2864 dout(10) << "handle_peer_request_reply " << *mdr
<< " ignoring reply from other attempt "
2865 << m
->get_attempt() << dendl
;
2869 switch (m
->get_op()) {
2870 case MMDSPeerRequest::OP_XLOCKACK
:
2872 // identify lock, leader request
2873 SimpleLock
*lock
= mds
->locker
->get_lock(m
->get_lock_type(),
2874 m
->get_object_info());
2875 mdr
->more()->peers
.insert(from
);
2876 lock
->decode_locked_state(m
->get_lock_data());
2877 dout(10) << "got remote xlock on " << *lock
<< " on " << *lock
->get_parent() << dendl
;
2878 mdr
->emplace_lock(lock
, MutationImpl::LockOp::XLOCK
);
2879 mdr
->finish_locking(lock
);
2880 lock
->get_xlock(mdr
, mdr
->get_client());
2882 ceph_assert(mdr
->more()->waiting_on_peer
.count(from
));
2883 mdr
->more()->waiting_on_peer
.erase(from
);
2884 ceph_assert(mdr
->more()->waiting_on_peer
.empty());
2885 mdcache
->dispatch_request(mdr
);
2889 case MMDSPeerRequest::OP_WRLOCKACK
:
2891 // identify lock, leader request
2892 SimpleLock
*lock
= mds
->locker
->get_lock(m
->get_lock_type(),
2893 m
->get_object_info());
2894 mdr
->more()->peers
.insert(from
);
2895 dout(10) << "got remote wrlock on " << *lock
<< " on " << *lock
->get_parent() << dendl
;
2896 auto it
= mdr
->emplace_lock(lock
, MutationImpl::LockOp::REMOTE_WRLOCK
, from
);
2897 ceph_assert(it
->is_remote_wrlock());
2898 ceph_assert(it
->wrlock_target
== from
);
2900 mdr
->finish_locking(lock
);
2902 ceph_assert(mdr
->more()->waiting_on_peer
.count(from
));
2903 mdr
->more()->waiting_on_peer
.erase(from
);
2904 ceph_assert(mdr
->more()->waiting_on_peer
.empty());
2905 mdcache
->dispatch_request(mdr
);
2909 case MMDSPeerRequest::OP_AUTHPINACK
:
2910 handle_peer_auth_pin_ack(mdr
, m
);
2913 case MMDSPeerRequest::OP_LINKPREPACK
:
2914 handle_peer_link_prep_ack(mdr
, m
);
2917 case MMDSPeerRequest::OP_RMDIRPREPACK
:
2918 handle_peer_rmdir_prep_ack(mdr
, m
);
2921 case MMDSPeerRequest::OP_RENAMEPREPACK
:
2922 handle_peer_rename_prep_ack(mdr
, m
);
2925 case MMDSPeerRequest::OP_RENAMENOTIFYACK
:
2926 handle_peer_rename_notify_ack(mdr
, m
);
2930 ceph_abort_msg("unknown op " + to_string(m
->get_op()) + " requested");
2934 void Server::dispatch_peer_request(MDRequestRef
& mdr
)
2936 dout(7) << "dispatch_peer_request " << *mdr
<< " " << *mdr
->peer_request
<< dendl
;
2939 dout(7) << " abort flag set, finishing" << dendl
;
2940 mdcache
->request_finish(mdr
);
2944 if (logger
) logger
->inc(l_mdss_dispatch_peer_request
);
2946 int op
= mdr
->peer_request
->get_op();
2948 case MMDSPeerRequest::OP_XLOCK
:
2949 case MMDSPeerRequest::OP_WRLOCK
:
2952 SimpleLock
*lock
= mds
->locker
->get_lock(mdr
->peer_request
->get_lock_type(),
2953 mdr
->peer_request
->get_object_info());
2956 dout(10) << "don't have object, dropping" << dendl
;
2957 ceph_abort_msg("don't have object"); // can this happen, if we auth pinned properly.
2959 if (op
== MMDSPeerRequest::OP_XLOCK
&& !lock
->get_parent()->is_auth()) {
2960 dout(10) << "not auth for remote xlock attempt, dropping on "
2961 << *lock
<< " on " << *lock
->get_parent() << dendl
;
2963 // use acquire_locks so that we get auth_pinning.
2964 MutationImpl::LockOpVec lov
;
2965 for (const auto& p
: mdr
->locks
) {
2967 lov
.add_xlock(p
.lock
);
2968 else if (p
.is_wrlock())
2969 lov
.add_wrlock(p
.lock
);
2974 case MMDSPeerRequest::OP_XLOCK
:
2975 lov
.add_xlock(lock
);
2976 replycode
= MMDSPeerRequest::OP_XLOCKACK
;
2978 case MMDSPeerRequest::OP_WRLOCK
:
2979 lov
.add_wrlock(lock
);
2980 replycode
= MMDSPeerRequest::OP_WRLOCKACK
;
2984 if (!mds
->locker
->acquire_locks(mdr
, lov
))
2988 auto r
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, replycode
);
2989 r
->set_lock_type(lock
->get_type());
2990 lock
->get_parent()->set_object_info(r
->get_object_info());
2991 if (replycode
== MMDSPeerRequest::OP_XLOCKACK
)
2992 lock
->encode_locked_state(r
->get_lock_data());
2993 mds
->send_message(r
, mdr
->peer_request
->get_connection());
2997 mdr
->reset_peer_request();
3001 case MMDSPeerRequest::OP_UNXLOCK
:
3002 case MMDSPeerRequest::OP_UNWRLOCK
:
3004 SimpleLock
*lock
= mds
->locker
->get_lock(mdr
->peer_request
->get_lock_type(),
3005 mdr
->peer_request
->get_object_info());
3007 auto it
= mdr
->locks
.find(lock
);
3008 ceph_assert(it
!= mdr
->locks
.end());
3009 bool need_issue
= false;
3011 case MMDSPeerRequest::OP_UNXLOCK
:
3012 mds
->locker
->xlock_finish(it
, mdr
.get(), &need_issue
);
3014 case MMDSPeerRequest::OP_UNWRLOCK
:
3015 mds
->locker
->wrlock_finish(it
, mdr
.get(), &need_issue
);
3019 mds
->locker
->issue_caps(static_cast<CInode
*>(lock
->get_parent()));
3021 // done. no ack necessary.
3022 mdr
->reset_peer_request();
3026 case MMDSPeerRequest::OP_AUTHPIN
:
3027 handle_peer_auth_pin(mdr
);
3030 case MMDSPeerRequest::OP_LINKPREP
:
3031 case MMDSPeerRequest::OP_UNLINKPREP
:
3032 handle_peer_link_prep(mdr
);
3035 case MMDSPeerRequest::OP_RMDIRPREP
:
3036 handle_peer_rmdir_prep(mdr
);
3039 case MMDSPeerRequest::OP_RENAMEPREP
:
3040 handle_peer_rename_prep(mdr
);
3044 ceph_abort_msg("unknown op "+ to_string(op
)+ " received");
3048 void Server::handle_peer_auth_pin(MDRequestRef
& mdr
)
3050 dout(10) << "handle_peer_auth_pin " << *mdr
<< dendl
;
3052 // build list of objects
3053 list
<MDSCacheObject
*> objects
;
3054 CInode
*auth_pin_freeze
= NULL
;
3055 bool nonblocking
= mdr
->peer_request
->is_nonblocking();
3056 bool fail
= false, wouldblock
= false, readonly
= false;
3057 ref_t
<MMDSPeerRequest
> reply
;
3059 if (mdcache
->is_readonly()) {
3060 dout(10) << " read-only FS" << dendl
;
3066 for (const auto &oi
: mdr
->peer_request
->get_authpins()) {
3067 MDSCacheObject
*object
= mdcache
->get_object(oi
);
3069 dout(10) << " don't have " << oi
<< dendl
;
3074 objects
.push_back(object
);
3075 if (oi
== mdr
->peer_request
->get_authpin_freeze())
3076 auth_pin_freeze
= static_cast<CInode
*>(object
);
3080 // can we auth pin them?
3082 for (const auto& obj
: objects
) {
3083 if (!obj
->is_auth()) {
3084 dout(10) << " not auth for " << *obj
<< dendl
;
3088 if (mdr
->is_auth_pinned(obj
))
3090 if (!mdr
->can_auth_pin(obj
)) {
3092 dout(10) << " can't auth_pin (freezing?) " << *obj
<< " nonblocking" << dendl
;
3098 dout(10) << " waiting for authpinnable on " << *obj
<< dendl
;
3099 obj
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
3100 mdr
->drop_local_auth_pins();
3102 mds
->locker
->notify_freeze_waiter(obj
);
3109 /* freeze authpin wrong inode */
3110 if (mdr
->has_more() && mdr
->more()->is_freeze_authpin
&&
3111 mdr
->more()->rename_inode
!= auth_pin_freeze
)
3112 mdr
->unfreeze_auth_pin(true);
3114 /* handle_peer_rename_prep() call freeze_inode() to wait for all other operations
3115 * on the source inode to complete. This happens after all locks for the rename
3116 * operation are acquired. But to acquire locks, we need auth pin locks' parent
3117 * objects first. So there is an ABBA deadlock if someone auth pins the source inode
3118 * after locks are acquired and before Server::handle_peer_rename_prep() is called.
3119 * The solution is freeze the inode and prevent other MDRequests from getting new
3122 if (auth_pin_freeze
) {
3123 dout(10) << " freezing auth pin on " << *auth_pin_freeze
<< dendl
;
3124 if (!mdr
->freeze_auth_pin(auth_pin_freeze
)) {
3125 auth_pin_freeze
->add_waiter(CInode::WAIT_FROZEN
, new C_MDS_RetryRequest(mdcache
, mdr
));
3126 mds
->mdlog
->flush();
3132 reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_AUTHPINACK
);
3135 mdr
->drop_local_auth_pins(); // just in case
3137 reply
->mark_error_rofs();
3139 reply
->mark_error_wouldblock();
3142 for (const auto& obj
: objects
) {
3143 dout(10) << "auth_pinning " << *obj
<< dendl
;
3146 // return list of my auth_pins (if any)
3147 for (const auto &p
: mdr
->object_states
) {
3148 if (!p
.second
.auth_pinned
)
3150 MDSCacheObjectInfo info
;
3151 p
.first
->set_object_info(info
);
3152 reply
->get_authpins().push_back(info
);
3153 if (p
.first
== (MDSCacheObject
*)auth_pin_freeze
)
3154 auth_pin_freeze
->set_object_info(reply
->get_authpin_freeze());
3158 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
3160 // clean up this request
3161 mdr
->reset_peer_request();
3165 if (mdr
->peer_request
->should_notify_blocking()) {
3166 reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_AUTHPINACK
);
3167 reply
->mark_req_blocked();
3168 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
3169 mdr
->peer_request
->clear_notify_blocking();
3174 void Server::handle_peer_auth_pin_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &ack
)
3176 dout(10) << "handle_peer_auth_pin_ack on " << *mdr
<< " " << *ack
<< dendl
;
3177 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
3179 if (ack
->is_req_blocked()) {
3180 mdr
->disable_lock_cache();
3181 // peer auth pin is blocked, drop locks to avoid deadlock
3182 mds
->locker
->drop_locks(mdr
.get(), nullptr);
3187 set
<MDSCacheObject
*> pinned
;
3188 for (const auto &oi
: ack
->get_authpins()) {
3189 MDSCacheObject
*object
= mdcache
->get_object(oi
);
3190 ceph_assert(object
); // we pinned it
3191 dout(10) << " remote has pinned " << *object
<< dendl
;
3192 mdr
->set_remote_auth_pinned(object
, from
);
3193 if (oi
== ack
->get_authpin_freeze())
3194 mdr
->set_remote_frozen_auth_pin(static_cast<CInode
*>(object
));
3195 pinned
.insert(object
);
3198 // removed frozen auth pin ?
3199 if (mdr
->more()->is_remote_frozen_authpin
&&
3200 ack
->get_authpin_freeze() == MDSCacheObjectInfo()) {
3201 auto stat_p
= mdr
->find_object_state(mdr
->more()->rename_inode
);
3202 ceph_assert(stat_p
);
3203 if (stat_p
->remote_auth_pinned
== from
) {
3204 mdr
->more()->is_remote_frozen_authpin
= false;
3208 // removed auth pins?
3209 for (auto& p
: mdr
->object_states
) {
3210 if (p
.second
.remote_auth_pinned
== MDS_RANK_NONE
)
3212 MDSCacheObject
* object
= p
.first
;
3213 if (p
.second
.remote_auth_pinned
== from
&& pinned
.count(object
) == 0) {
3214 dout(10) << " remote has unpinned " << *object
<< dendl
;
3215 mdr
->_clear_remote_auth_pinned(p
.second
);
3220 mdr
->more()->peers
.insert(from
);
3222 // clear from waiting list
3223 auto ret
= mdr
->more()->waiting_on_peer
.erase(from
);
3226 if (ack
->is_error_rofs()) {
3227 mdr
->more()->peer_error
= -CEPHFS_EROFS
;
3228 } else if (ack
->is_error_wouldblock()) {
3229 mdr
->more()->peer_error
= -CEPHFS_EWOULDBLOCK
;
3233 if (mdr
->more()->waiting_on_peer
.empty())
3234 mdcache
->dispatch_request(mdr
);
3236 dout(10) << "still waiting on peers " << mdr
->more()->waiting_on_peer
<< dendl
;
3240 // ---------------------------------------
3245 * check whether we are permitted to complete a request
3247 * Check whether we have permission to perform the operation specified
3248 * by mask on the given inode, based on the capability in the mdr's
3251 bool Server::check_access(MDRequestRef
& mdr
, CInode
*in
, unsigned mask
)
3254 int r
= mdr
->session
->check_access(
3256 mdr
->client_request
->get_caller_uid(),
3257 mdr
->client_request
->get_caller_gid(),
3258 &mdr
->client_request
->get_caller_gid_list(),
3259 mdr
->client_request
->head
.args
.setattr
.uid
,
3260 mdr
->client_request
->head
.args
.setattr
.gid
);
3262 respond_to_request(mdr
, r
);
3270 * check whether fragment has reached maximum size
3273 bool Server::check_fragment_space(MDRequestRef
&mdr
, CDir
*dir
)
3275 const auto size
= dir
->get_frag_size();
3276 const auto max
= bal_fragment_size_max
;
3278 dout(10) << "fragment " << *dir
<< " size exceeds " << max
<< " (CEPHFS_ENOSPC)" << dendl
;
3279 respond_to_request(mdr
, -CEPHFS_ENOSPC
);
3282 dout(20) << "fragment " << *dir
<< " size " << size
<< " < " << max
<< dendl
;
3289 * check whether entries in a dir reached maximum size
3292 bool Server::check_dir_max_entries(MDRequestRef
&mdr
, CDir
*in
)
3294 const uint64_t size
= in
->inode
->get_projected_inode()->dirstat
.nfiles
+
3295 in
->inode
->get_projected_inode()->dirstat
.nsubdirs
;
3296 if (dir_max_entries
&& size
>= dir_max_entries
) {
3297 dout(10) << "entries per dir " << *in
<< " size exceeds " << dir_max_entries
<< " (ENOSPC)" << dendl
;
3298 respond_to_request(mdr
, -ENOSPC
);
3305 CDentry
* Server::prepare_stray_dentry(MDRequestRef
& mdr
, CInode
*in
)
3308 in
->name_stray_dentry(straydname
);
3310 CDentry
*straydn
= mdr
->straydn
;
3312 ceph_assert(straydn
->get_name() == straydname
);
3315 CDir
*straydir
= mdcache
->get_stray_dir(in
);
3317 if (!mdr
->client_request
->is_replay() &&
3318 !check_fragment_space(mdr
, straydir
))
3321 straydn
= straydir
->lookup(straydname
);
3323 if (straydir
->is_frozen_dir()) {
3324 dout(10) << __func__
<< ": " << *straydir
<< " is frozen, waiting" << dendl
;
3325 mds
->locker
->drop_locks(mdr
.get());
3326 mdr
->drop_local_auth_pins();
3327 straydir
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
3330 straydn
= straydir
->add_null_dentry(straydname
);
3331 straydn
->mark_new();
3333 ceph_assert(straydn
->get_projected_linkage()->is_null());
3336 straydn
->state_set(CDentry::STATE_STRAY
);
3337 mdr
->straydn
= straydn
;
3343 /** prepare_new_inode
3345 * create a new inode. set c/m/atime. hit dir pop.
3347 CInode
* Server::prepare_new_inode(MDRequestRef
& mdr
, CDir
*dir
, inodeno_t useino
, unsigned mode
,
3348 const file_layout_t
*layout
)
3350 CInode
*in
= new CInode(mdcache
);
3351 auto _inode
= in
->_get_inode();
3353 // Server::prepare_force_open_sessions() can re-open session in closing
3354 // state. In that corner case, session's prealloc_inos are being freed.
3355 // To simplify the code, we disallow using/refilling session's prealloc_ino
3356 // while session is opening.
3357 bool allow_prealloc_inos
= mdr
->session
->is_open();
3359 inodeno_t _useino
= useino
;
3363 if (allow_prealloc_inos
&& (mdr
->used_prealloc_ino
= _inode
->ino
= mdr
->session
->take_ino(_useino
))) {
3364 if (mdcache
->test_and_clear_taken_inos(_inode
->ino
)) {
3366 dout(10) << "prepare_new_inode used_prealloc " << mdr
->used_prealloc_ino
3367 << " (" << mdr
->session
->info
.prealloc_inos
.size() << " left)"
3368 << " but has been taken, will try again!" << dendl
;
3370 mds
->sessionmap
.mark_projected(mdr
->session
);
3371 dout(10) << "prepare_new_inode used_prealloc " << mdr
->used_prealloc_ino
3372 << " (" << mdr
->session
->info
.prealloc_inos
.size() << " left)"
3377 _inode
->ino
= mds
->inotable
->project_alloc_id(_useino
);
3378 if (mdcache
->test_and_clear_taken_inos(_inode
->ino
)) {
3379 mds
->inotable
->apply_alloc_id(_inode
->ino
);
3381 dout(10) << "prepare_new_inode alloc " << mdr
->alloc_ino
3382 << " but has been taken, will try again!" << dendl
;
3384 dout(10) << "prepare_new_inode alloc " << mdr
->alloc_ino
<< dendl
;
3388 } while (!_inode
->ino
);
3390 if (useino
&& useino
!= _inode
->ino
) {
3391 dout(0) << "WARNING: client specified " << useino
<< " and i allocated " << _inode
->ino
<< dendl
;
3392 mds
->clog
->error() << mdr
->client_request
->get_source()
3393 << " specified ino " << useino
3394 << " but mds." << mds
->get_nodeid() << " allocated " << _inode
->ino
;
3395 //ceph_abort(); // just for now.
3398 if (allow_prealloc_inos
&&
3399 mdr
->session
->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos
/ 2) {
3400 int need
= g_conf()->mds_client_prealloc_inos
- mdr
->session
->get_num_projected_prealloc_inos();
3401 mds
->inotable
->project_alloc_ids(mdr
->prealloc_inos
, need
);
3402 ceph_assert(mdr
->prealloc_inos
.size()); // or else fix projected increment semantics
3403 mdr
->session
->pending_prealloc_inos
.insert(mdr
->prealloc_inos
);
3404 mds
->sessionmap
.mark_projected(mdr
->session
);
3405 dout(10) << "prepare_new_inode prealloc " << mdr
->prealloc_inos
<< dendl
;
3408 _inode
->version
= 1;
3409 _inode
->xattr_version
= 1;
3410 _inode
->nlink
= 1; // FIXME
3412 _inode
->mode
= mode
;
3414 // FIPS zeroization audit 20191117: this memset is not security related.
3415 memset(&_inode
->dir_layout
, 0, sizeof(_inode
->dir_layout
));
3416 if (_inode
->is_dir()) {
3417 _inode
->dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
3418 } else if (layout
) {
3419 _inode
->layout
= *layout
;
3421 _inode
->layout
= mdcache
->default_file_layout
;
3424 _inode
->truncate_size
= -1ull; // not truncated, yet!
3425 _inode
->truncate_seq
= 1; /* starting with 1, 0 is kept for no-truncation logic */
3427 CInode
*diri
= dir
->get_inode();
3428 auto pip
= diri
->get_projected_inode();
3430 dout(10) << oct
<< " dir mode 0" << pip
->mode
<< " new mode 0" << mode
<< dec
<< dendl
;
3432 if (pip
->mode
& S_ISGID
) {
3433 dout(10) << " dir is sticky" << dendl
;
3434 _inode
->gid
= pip
->gid
;
3435 if (S_ISDIR(mode
)) {
3436 dout(10) << " new dir also sticky" << dendl
;
3437 _inode
->mode
|= S_ISGID
;
3440 _inode
->gid
= mdr
->client_request
->get_owner_gid();
3441 ceph_assert(_inode
->gid
!= (unsigned)-1);
3444 _inode
->uid
= mdr
->client_request
->get_owner_uid();
3445 ceph_assert(_inode
->uid
!= (unsigned)-1);
3447 _inode
->btime
= _inode
->ctime
= _inode
->mtime
= _inode
->atime
=
3448 mdr
->get_op_stamp();
3450 _inode
->change_attr
= 0;
3452 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
3454 dout(10) << "copying fscrypt_auth len " << req
->fscrypt_auth
.size() << dendl
;
3455 _inode
->fscrypt_auth
= req
->fscrypt_auth
;
3456 _inode
->fscrypt_file
= req
->fscrypt_file
;
3458 if (req
->get_data().length()) {
3459 auto p
= req
->get_data().cbegin();
3461 // xattrs on new inode?
3462 auto _xattrs
= CInode::allocate_xattr_map();
3463 decode_noshare(*_xattrs
, p
);
3464 dout(10) << "prepare_new_inode setting xattrs " << *_xattrs
<< dendl
;
3465 in
->reset_xattrs(std::move(_xattrs
));
3468 if (!mds
->mdsmap
->get_inline_data_enabled() ||
3469 !mdr
->session
->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
))
3470 _inode
->inline_data
.version
= CEPH_INLINE_NONE
;
3472 mdcache
->add_inode(in
); // add
3473 dout(10) << "prepare_new_inode " << *in
<< dendl
;
3477 void Server::journal_allocated_inos(MDRequestRef
& mdr
, EMetaBlob
*blob
)
3479 dout(20) << "journal_allocated_inos sessionmapv " << mds
->sessionmap
.get_projected()
3480 << " inotablev " << mds
->inotable
->get_projected_version()
3482 blob
->set_ino_alloc(mdr
->alloc_ino
,
3483 mdr
->used_prealloc_ino
,
3485 mdr
->client_request
->get_source(),
3486 mds
->sessionmap
.get_projected(),
3487 mds
->inotable
->get_projected_version());
3490 void Server::apply_allocated_inos(MDRequestRef
& mdr
, Session
*session
)
3492 dout(10) << "apply_allocated_inos " << mdr
->alloc_ino
3493 << " / " << mdr
->prealloc_inos
3494 << " / " << mdr
->used_prealloc_ino
<< dendl
;
3496 if (mdr
->alloc_ino
) {
3497 mds
->inotable
->apply_alloc_id(mdr
->alloc_ino
);
3499 if (mdr
->prealloc_inos
.size()) {
3500 ceph_assert(session
);
3501 session
->pending_prealloc_inos
.subtract(mdr
->prealloc_inos
);
3502 session
->free_prealloc_inos
.insert(mdr
->prealloc_inos
);
3503 session
->info
.prealloc_inos
.insert(mdr
->prealloc_inos
);
3504 mds
->sessionmap
.mark_dirty(session
, !mdr
->used_prealloc_ino
);
3505 mds
->inotable
->apply_alloc_ids(mdr
->prealloc_inos
);
3507 if (mdr
->used_prealloc_ino
) {
3508 ceph_assert(session
);
3509 session
->info
.prealloc_inos
.erase(mdr
->used_prealloc_ino
);
3510 mds
->sessionmap
.mark_dirty(session
);
3514 struct C_MDS_TryOpenInode
: public ServerContext
{
3517 C_MDS_TryOpenInode(Server
*s
, MDRequestRef
& r
, inodeno_t i
) :
3518 ServerContext(s
), mdr(r
), ino(i
) {}
3519 void finish(int r
) override
{
3520 server
->_try_open_ino(mdr
, r
, ino
);
3524 void Server::_try_open_ino(MDRequestRef
& mdr
, int r
, inodeno_t ino
)
3526 dout(10) << "_try_open_ino " << mdr
.get() << " ino " << ino
<< " r=" << r
<< dendl
;
3528 // `r` is a rank if >=0, else an error code
3530 mds_rank_t
dest_rank(r
);
3531 if (dest_rank
== mds
->get_nodeid())
3532 dispatch_client_request(mdr
);
3534 mdcache
->request_forward(mdr
, dest_rank
);
3539 if (r
== -CEPHFS_ENOENT
|| r
== -CEPHFS_ENODATA
)
3541 respond_to_request(mdr
, r
);
3544 class C_MDS_TryFindInode
: public ServerContext
{
3549 C_MDS_TryFindInode(Server
*s
, MDRequestRef
& r
, MDCache
*m
, inodeno_t i
) :
3550 ServerContext(s
), mdr(r
), mdcache(m
), ino(i
) {}
3551 void finish(int r
) override
{
3552 if (r
== -CEPHFS_ESTALE
) { // :( find_ino_peers failed
3554 * There has one case that when the MDS crashes and the
3555 * openfiletable journal couldn't be flushed and then
3556 * the replacing MDS is possibly won't load some already
3557 * opened CInodes into the MDCache. And if the clients
3558 * will retry some requests after reconnected, the MDS
3559 * will return -ESTALE after failing to find the ino in
3562 * As a workaround users can run `ls -R ${mountpoint}`
3563 * to list all the sub-files or sub-direcotries from the
3566 * We need try to open the ino and try it again.
3568 CInode
*in
= mdcache
->get_inode(ino
);
3569 if (in
&& in
->state_test(CInode::STATE_PURGING
))
3570 server
->respond_to_request(mdr
, r
);
3572 mdcache
->open_ino(ino
, (int64_t)-1, new C_MDS_TryOpenInode(server
, mdr
, ino
));
3574 server
->dispatch_client_request(mdr
);
3579 /* If this returns null, the request has been handled
3580 * as appropriate: forwarded on, or the client's been replied to */
3581 CInode
* Server::rdlock_path_pin_ref(MDRequestRef
& mdr
,
3585 const filepath
& refpath
= mdr
->get_filepath();
3586 dout(10) << "rdlock_path_pin_ref " << *mdr
<< " " << refpath
<< dendl
;
3588 if (mdr
->locking_state
& MutationImpl::PATH_LOCKED
)
3592 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, true);
3594 if (refpath
.is_last_snap()) {
3598 if (!no_want_auth
&& forward_all_requests_to_auth
)
3600 flags
|= MDS_TRAVERSE_RDLOCK_PATH
| MDS_TRAVERSE_RDLOCK_SNAP
;
3603 flags
|= MDS_TRAVERSE_WANT_AUTH
;
3604 int r
= mdcache
->path_traverse(mdr
, cf
, refpath
, flags
, &mdr
->dn
[0], &mdr
->in
[0]);
3606 return nullptr; // delayed
3607 if (r
< 0) { // error
3608 if (r
== -CEPHFS_ENOENT
&& !mdr
->dn
[0].empty()) {
3609 if (mdr
->client_request
&&
3610 mdr
->client_request
->get_dentry_wanted())
3611 mdr
->tracedn
= mdr
->dn
[0].back();
3612 respond_to_request(mdr
, r
);
3613 } else if (r
== -CEPHFS_ESTALE
) {
3614 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl
;
3615 inodeno_t ino
= refpath
.get_ino();
3616 mdcache
->find_ino_peers(ino
, new C_MDS_TryFindInode(this, mdr
, mdcache
, ino
));
3618 dout(10) << "FAIL on error " << r
<< dendl
;
3619 respond_to_request(mdr
, r
);
3623 CInode
*ref
= mdr
->in
[0];
3624 dout(10) << "ref is " << *ref
<< dendl
;
3628 // do NOT proceed if freezing, as cap release may defer in that case, and
3629 // we could deadlock when we try to lock @ref.
3630 // if we're already auth_pinned, continue; the release has already been processed.
3631 if (ref
->is_frozen() || ref
->is_frozen_auth_pin() ||
3632 (ref
->is_freezing() && !mdr
->is_auth_pinned(ref
))) {
3633 dout(7) << "waiting for !frozen/authpinnable on " << *ref
<< dendl
;
3634 ref
->add_waiter(CInode::WAIT_UNFREEZE
, cf
.build());
3635 if (mdr
->is_any_remote_auth_pin())
3636 mds
->locker
->notify_freeze_waiter(ref
);
3648 /** rdlock_path_xlock_dentry
3649 * traverse path to the directory that could/would contain dentry.
3650 * make sure i am auth for that dentry (or target inode if it exists and authexist),
3651 * forward as necessary. create null dentry in place (or use existing if okexist).
3652 * get rdlocks on traversed dentries, xlock on new dentry.
3654 * set authexist true if caller requires the target inode to be auth when it exists.
3655 * the tail dentry is not always auth any more if authexist because it is impossible
3656 * to ensure tail dentry and target inode are both auth in one mds. the tail dentry
3657 * will not be xlocked too if authexist and the target inode exists.
3659 CDentry
* Server::rdlock_path_xlock_dentry(MDRequestRef
& mdr
,
3660 bool create
, bool okexist
, bool authexist
,
3663 const filepath
& refpath
= mdr
->get_filepath();
3664 dout(10) << "rdlock_path_xlock_dentry " << *mdr
<< " " << refpath
<< dendl
;
3666 if (mdr
->locking_state
& MutationImpl::PATH_LOCKED
)
3667 return mdr
->dn
[0].back();
3669 // figure parent dir vs dname
3670 if (refpath
.depth() == 0) {
3671 dout(7) << "invalid path (zero length)" << dendl
;
3672 respond_to_request(mdr
, -CEPHFS_EINVAL
);
3676 if (refpath
.is_last_snap()) {
3677 respond_to_request(mdr
, -CEPHFS_EROFS
);
3681 if (refpath
.is_last_dot_or_dotdot()) {
3682 dout(7) << "invalid path (last dot or dot_dot)" << dendl
;
3684 respond_to_request(mdr
, -CEPHFS_EEXIST
);
3686 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
3690 // traverse to parent dir
3691 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, true);
3692 int flags
= MDS_TRAVERSE_RDLOCK_SNAP
| MDS_TRAVERSE_RDLOCK_PATH
|
3693 MDS_TRAVERSE_WANT_DENTRY
| MDS_TRAVERSE_XLOCK_DENTRY
|
3694 MDS_TRAVERSE_WANT_AUTH
;
3695 if (refpath
.depth() == 1 && !mdr
->lock_cache_disabled
)
3696 flags
|= MDS_TRAVERSE_CHECK_LOCKCACHE
;
3698 flags
|= MDS_TRAVERSE_RDLOCK_AUTHLOCK
;
3700 flags
|= MDS_TRAVERSE_WANT_INODE
;
3702 flags
|= MDS_TRAVERSE_WANT_DIRLAYOUT
;
3703 int r
= mdcache
->path_traverse(mdr
, cf
, refpath
, flags
, &mdr
->dn
[0]);
3705 return nullptr; // delayed
3707 if (r
== -CEPHFS_ESTALE
) {
3708 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl
;
3709 inodeno_t ino
= refpath
.get_ino();
3710 mdcache
->find_ino_peers(ino
, new C_MDS_TryFindInode(this, mdr
, mdcache
, ino
));
3713 respond_to_request(mdr
, r
);
3717 CDentry
*dn
= mdr
->dn
[0].back();
3718 CDir
*dir
= dn
->get_dir();
3719 CInode
*diri
= dir
->get_inode();
3721 if (!mdr
->reqid
.name
.is_mds()) {
3722 if (diri
->is_system() && !diri
->is_root() &&
3723 (!diri
->is_lost_and_found() ||
3724 mdr
->client_request
->get_op() != CEPH_MDS_OP_UNLINK
)) {
3725 respond_to_request(mdr
, -CEPHFS_EROFS
);
3730 if (!diri
->is_base() && diri
->get_projected_parent_dir()->inode
->is_stray()) {
3731 respond_to_request(mdr
, -CEPHFS_ENOENT
);
3735 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
3736 if (dnl
->is_null()) {
3737 if (!create
&& okexist
) {
3738 respond_to_request(mdr
, -CEPHFS_ENOENT
);
3742 snapid_t next_snap
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
3743 dn
->first
= std::max(dn
->first
, next_snap
);
3746 respond_to_request(mdr
, -CEPHFS_EEXIST
);
3749 mdr
->in
[0] = dnl
->get_inode();
3755 /** rdlock_two_paths_xlock_destdn
3756 * traverse two paths and lock the two paths in proper order.
3757 * The order of taking locks is:
3758 * 1. Lock directory inodes or dentries according to which trees they
3759 * are under. Lock objects under fs root before objects under mdsdir.
3760 * 2. Lock directory inodes or dentries according to their depth, in
3762 * 3. Lock directory inodes or dentries according to inode numbers or
3763 * dentries' parent inode numbers, in ascending order.
3764 * 4. Lock dentries in the same directory in order of their keys.
3765 * 5. Lock non-directory inodes according to inode numbers, in ascending
3768 std::pair
<CDentry
*, CDentry
*>
3769 Server::rdlock_two_paths_xlock_destdn(MDRequestRef
& mdr
, bool xlock_srcdn
)
3772 const filepath
& refpath
= mdr
->get_filepath();
3773 const filepath
& refpath2
= mdr
->get_filepath2();
3775 dout(10) << "rdlock_two_paths_xlock_destdn " << *mdr
<< " " << refpath
<< " " << refpath2
<< dendl
;
3777 if (mdr
->locking_state
& MutationImpl::PATH_LOCKED
)
3778 return std::make_pair(mdr
->dn
[0].back(), mdr
->dn
[1].back());
3780 if (refpath
.depth() != 1 || refpath2
.depth() != 1) {
3781 respond_to_request(mdr
, -CEPHFS_EINVAL
);
3782 return std::pair
<CDentry
*, CDentry
*>(nullptr, nullptr);
3785 if (refpath
.is_last_snap() || refpath2
.is_last_snap()) {
3786 respond_to_request(mdr
, -CEPHFS_EROFS
);
3787 return std::make_pair(nullptr, nullptr);
3790 // traverse to parent dir
3791 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, true);
3792 int flags
= MDS_TRAVERSE_RDLOCK_SNAP
| MDS_TRAVERSE_WANT_DENTRY
| MDS_TRAVERSE_WANT_AUTH
;
3793 int r
= mdcache
->path_traverse(mdr
, cf
, refpath
, flags
, &mdr
->dn
[0]);
3795 if (r
== -CEPHFS_ESTALE
) {
3796 dout(10) << "CEPHFS_ESTALE on path, attempting recovery" << dendl
;
3797 inodeno_t ino
= refpath
.get_ino();
3798 mdcache
->find_ino_peers(ino
, new C_MDS_TryFindInode(this, mdr
, mdcache
, ino
));
3800 respond_to_request(mdr
, r
);
3802 return std::make_pair(nullptr, nullptr);
3805 flags
= MDS_TRAVERSE_RDLOCK_SNAP2
| MDS_TRAVERSE_WANT_DENTRY
| MDS_TRAVERSE_DISCOVER
;
3806 r
= mdcache
->path_traverse(mdr
, cf
, refpath2
, flags
, &mdr
->dn
[1]);
3808 if (r
== -CEPHFS_ESTALE
) {
3809 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl
;
3810 inodeno_t ino
= refpath2
.get_ino();
3811 mdcache
->find_ino_peers(ino
, new C_MDS_TryFindInode(this, mdr
, mdcache
, ino
));
3813 respond_to_request(mdr
, r
);
3815 return std::make_pair(nullptr, nullptr);
3818 CDentry
*srcdn
= mdr
->dn
[1].back();
3819 CDir
*srcdir
= srcdn
->get_dir();
3820 CDentry
*destdn
= mdr
->dn
[0].back();
3821 CDir
*destdir
= destdn
->get_dir();
3823 if (!mdr
->reqid
.name
.is_mds()) {
3824 if ((srcdir
->get_inode()->is_system() && !srcdir
->get_inode()->is_root()) ||
3825 (destdir
->get_inode()->is_system() && !destdir
->get_inode()->is_root())) {
3826 respond_to_request(mdr
, -CEPHFS_EROFS
);
3827 return std::make_pair(nullptr, nullptr);
3831 if (!destdir
->get_inode()->is_base() &&
3832 destdir
->get_inode()->get_projected_parent_dir()->inode
->is_stray()) {
3833 respond_to_request(mdr
, -CEPHFS_ENOENT
);
3834 return std::make_pair(nullptr, nullptr);
3837 MutationImpl::LockOpVec lov
;
3838 if (srcdir
->get_inode() == destdir
->get_inode()) {
3839 lov
.add_wrlock(&destdir
->inode
->filelock
);
3840 lov
.add_wrlock(&destdir
->inode
->nestlock
);
3841 if (xlock_srcdn
&& srcdir
!= destdir
) {
3842 mds_rank_t srcdir_auth
= srcdir
->authority().first
;
3843 if (srcdir_auth
!= mds
->get_nodeid()) {
3844 lov
.add_remote_wrlock(&srcdir
->inode
->filelock
, srcdir_auth
);
3845 lov
.add_remote_wrlock(&srcdir
->inode
->nestlock
, srcdir_auth
);
3849 if (srcdn
->get_name() > destdn
->get_name())
3850 lov
.add_xlock(&destdn
->lock
);
3853 lov
.add_xlock(&srcdn
->lock
);
3855 lov
.add_rdlock(&srcdn
->lock
);
3857 if (srcdn
->get_name() < destdn
->get_name())
3858 lov
.add_xlock(&destdn
->lock
);
3860 int cmp
= mdr
->compare_paths();
3861 bool lock_destdir_first
=
3862 (cmp
< 0 || (cmp
== 0 && destdir
->ino() < srcdir
->ino()));
3864 if (lock_destdir_first
) {
3865 lov
.add_wrlock(&destdir
->inode
->filelock
);
3866 lov
.add_wrlock(&destdir
->inode
->nestlock
);
3867 lov
.add_xlock(&destdn
->lock
);
3871 mds_rank_t srcdir_auth
= srcdir
->authority().first
;
3872 if (srcdir_auth
== mds
->get_nodeid()) {
3873 lov
.add_wrlock(&srcdir
->inode
->filelock
);
3874 lov
.add_wrlock(&srcdir
->inode
->nestlock
);
3876 lov
.add_remote_wrlock(&srcdir
->inode
->filelock
, srcdir_auth
);
3877 lov
.add_remote_wrlock(&srcdir
->inode
->nestlock
, srcdir_auth
);
3879 lov
.add_xlock(&srcdn
->lock
);
3881 lov
.add_rdlock(&srcdn
->lock
);
3884 if (!lock_destdir_first
) {
3885 lov
.add_wrlock(&destdir
->inode
->filelock
);
3886 lov
.add_wrlock(&destdir
->inode
->nestlock
);
3887 lov
.add_xlock(&destdn
->lock
);
3891 CInode
*auth_pin_freeze
= nullptr;
3892 // XXX any better way to do this?
3893 if (xlock_srcdn
&& !srcdn
->is_auth()) {
3894 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
3895 auth_pin_freeze
= srcdnl
->is_primary() ? srcdnl
->get_inode() : nullptr;
3897 if (!mds
->locker
->acquire_locks(mdr
, lov
, auth_pin_freeze
))
3898 return std::make_pair(nullptr, nullptr);
3900 if (srcdn
->get_projected_linkage()->is_null()) {
3901 respond_to_request(mdr
, -CEPHFS_ENOENT
);
3902 return std::make_pair(nullptr, nullptr);
3905 if (destdn
->get_projected_linkage()->is_null()) {
3906 snapid_t next_snap
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
3907 destdn
->first
= std::max(destdn
->first
, next_snap
);
3910 mdr
->locking_state
|= MutationImpl::PATH_LOCKED
;
3912 return std::make_pair(destdn
, srcdn
);
// try_open_auth_dirfrag: return the locally-auth CDir for (diri, fg), or
// forward/requeue the request and return nothing usable.
// NOTE(review): this text is a lossy extraction -- the embedded original line
// numbers skip (e.g. 3924, 3934-3936), so braces, return statements and blank
// lines are missing here. Code below is kept byte-identical to the extraction.
3916 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
3918 * @param diri base inode
3919 * @param fg the exact frag we want
3920 * @param mdr request
3921 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
3923 CDir
* Server::try_open_auth_dirfrag(CInode
*diri
, frag_t fg
, MDRequestRef
& mdr
)
3925 CDir
*dir
= diri
->get_dirfrag(fg
);
3928 // am i auth for the dirfrag?
// Not auth for an open dirfrag: forward the request to the dirfrag's auth MDS.
3929 if (!dir
->is_auth()) {
3930 mds_rank_t auth
= dir
->authority().first
;
3931 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3932 << ", fw to mds." << auth
<< dendl
;
3933 mdcache
->request_forward(mdr
, auth
);
3937 // not open and inode not mine?
// Frag not open and we are not auth for the inode either: forward to inode auth.
3938 if (!diri
->is_auth()) {
3939 mds_rank_t inauth
= diri
->authority().first
;
3940 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth
<< dendl
;
3941 mdcache
->request_forward(mdr
, inauth
);
3945 // not open and inode frozen?
// Frozen inode: park the request until unfreeze, then retry.
3946 if (diri
->is_frozen()) {
3947 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri
<< dendl
;
3948 ceph_assert(diri
->get_parent_dir());
3949 diri
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
// Otherwise open (or create) the dirfrag locally.
3954 dir
= diri
->get_or_open_dirfrag(mdcache
, fg
);
3961 // ===============================================================================
// handle_client_getattr: service a client GETATTR (is_lookup=false) or
// LOOKUP (is_lookup=true) request: optionally batch with identical in-flight
// ops, take the needed rdlocks (skipping fields the client holds EXCL caps
// for), run access checks, and reply with a trace on the target inode.
// NOTE(review): lossy extraction -- embedded original line numbers skip
// (e.g. 3972-3974, 3985-3988), so braces/returns are missing from this view.
// Code kept byte-identical to the extraction.
3964 void Server::handle_client_getattr(MDRequestRef
& mdr
, bool is_lookup
)
3966 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
3968 if (req
->get_filepath().depth() == 0 && is_lookup
) {
3969 // refpath can't be empty for lookup but it can for
3970 // getattr (we do getattr with empty refpath for mount of '/')
3971 respond_to_request(mdr
, -CEPHFS_EINVAL
);
3975 bool want_auth
= false;
3976 int mask
= req
->head
.args
.getattr
.mask
;
3977 if (mask
& CEPH_STAT_RSTAT
)
3978 want_auth
= true; // set want_auth for CEPH_STAT_RSTAT mask
// Batching: if this request can join an existing batch op on the same
// dentry/inode+mask, register with it instead of acquiring locks itself.
3980 if (!mdr
->is_batch_head() && mdr
->can_batch()) {
3981 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, false);
3982 int r
= mdcache
->path_traverse(mdr
, cf
, mdr
->get_filepath(),
3983 (want_auth
? MDS_TRAVERSE_WANT_AUTH
: 0),
3984 &mdr
->dn
[0], &mdr
->in
[0]);
3989 // fall-thru. let rdlock_path_pin_ref() check again.
3990 } else if (is_lookup
) {
3991 CDentry
* dn
= mdr
->dn
[0].back();
3993 auto em
= dn
->batch_ops
.emplace(std::piecewise_construct
, std::forward_as_tuple(mask
), std::forward_as_tuple());
3995 em
.first
->second
= std::make_unique
<Batch_Getattr_Lookup
>(this, mdr
);
3997 dout(20) << __func__
<< ": LOOKUP op, wait for previous same getattr ops to respond. " << *mdr
<< dendl
;
3998 em
.first
->second
->add_request(mdr
);
3999 mdr
->mark_event("joining batch lookup");
4003 CInode
*in
= mdr
->in
[0];
4005 auto em
= in
->batch_ops
.emplace(std::piecewise_construct
, std::forward_as_tuple(mask
), std::forward_as_tuple());
4007 em
.first
->second
= std::make_unique
<Batch_Getattr_Lookup
>(this, mdr
);
4009 dout(20) << __func__
<< ": GETATTR op, wait for previous same getattr ops to respond. " << *mdr
<< dendl
;
4010 em
.first
->second
->add_request(mdr
);
4011 mdr
->mark_event("joining batch getattr");
4017 CInode
*ref
= rdlock_path_pin_ref(mdr
, want_auth
, false);
4022 * if client currently holds the EXCL cap on a field, do not rdlock
4023 * it; client's stat() will result in valid info if _either_ EXCL
4024 * cap is held or MDS rdlocks and reads the value here.
4026 * handling this case here is easier than weakening rdlock
4027 * semantics... that would cause problems elsewhere.
4029 client_t client
= mdr
->get_client();
4031 Capability
*cap
= ref
->get_client_cap(client
);
4032 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
||
4033 mdr
->snapid
<= cap
->client_follows
))
4034 issued
= cap
->issued();
// Build the rdlock set; each lock is skipped if the client already holds the
// corresponding EXCL cap (per the comment block above).
4037 MutationImpl::LockOpVec lov
;
4038 if ((mask
& CEPH_CAP_LINK_SHARED
) && !(issued
& CEPH_CAP_LINK_EXCL
))
4039 lov
.add_rdlock(&ref
->linklock
);
4040 if ((mask
& CEPH_CAP_AUTH_SHARED
) && !(issued
& CEPH_CAP_AUTH_EXCL
))
4041 lov
.add_rdlock(&ref
->authlock
);
4042 if ((mask
& CEPH_CAP_XATTR_SHARED
) && !(issued
& CEPH_CAP_XATTR_EXCL
))
4043 lov
.add_rdlock(&ref
->xattrlock
);
4044 if ((mask
& CEPH_CAP_FILE_SHARED
) && !(issued
& CEPH_CAP_FILE_EXCL
)) {
4045 // Don't wait on unstable filelock if client is allowed to read file size.
4046 // This can reduce the response time of getattr in the case that multiple
4047 // clients do stat(2) and there are writers.
4048 // The downside of this optimization is that mds may not issue Fs caps along
4049 // with getattr reply. Client may need to send more getattr requests.
4050 if (mdr
->is_rdlocked(&ref
->filelock
)) {
4051 lov
.add_rdlock(&ref
->filelock
);
4052 } else if (ref
->filelock
.is_stable() ||
4053 ref
->filelock
.get_num_wrlocks() > 0 ||
4054 !ref
->filelock
.can_read(mdr
->get_client())) {
4055 /* Since we're taking advantage of an optimization here:
4057 * We cannot suddenly, due to a changing condition, add this filelock as
4058 * it can cause lock-order deadlocks. In this case, that condition is the
4059 * lock state changes between request retries. If that happens, we need
4060 * to check if we've acquired the other locks in this vector. If we have,
4061 * then we need to drop those locks and retry.
4063 if (mdr
->is_rdlocked(&ref
->linklock
) ||
4064 mdr
->is_rdlocked(&ref
->authlock
) ||
4065 mdr
->is_rdlocked(&ref
->xattrlock
)) {
4067 dout(20) << " dropping locks and restarting request because filelock state change" << dendl
;
4068 mds
->locker
->drop_locks(mdr
.get());
4069 mdr
->drop_local_auth_pins();
4070 mds
->queue_waiter(new C_MDS_RetryRequest(mdcache
, mdr
));
4073 lov
.add_rdlock(&ref
->filelock
);
4074 mdr
->locking_state
&= ~MutationImpl::ALL_LOCKED
;
4078 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4081 if (!check_access(mdr
, ref
, MAY_READ
))
4084 utime_t now
= ceph_clock_now();
4085 mdr
->set_mds_stamp(now
);
4087 // note which caps are requested, so we return at least a snapshot
4088 // value for them. (currently this matters for xattrs and inline data)
4089 mdr
->getattr_caps
= mask
;
// Record a read hit for the balancer, then reply (tracedn set for LOOKUP --
// presumably guarded by is_lookup in the dropped lines; TODO confirm).
4091 mds
->balancer
->hit_inode(ref
, META_POP_IRD
);
4094 dout(10) << "reply to stat on " << *req
<< dendl
;
4097 mdr
->tracedn
= mdr
->dn
[0].back();
4098 respond_to_request(mdr
, 0);
// C_MDS_LookupIno2: completion context for mdcache->open_ino() during
// lookup-by-ino; forwards the result code to Server::_lookup_ino_2().
// NOTE(review): lossy extraction -- the member declaration (original line
// 4102) and closing brace lines are missing from this view.
4101 struct C_MDS_LookupIno2
: public ServerContext
{
4103 C_MDS_LookupIno2(Server
*s
, MDRequestRef
& r
) : ServerContext(s
), mdr(r
) {}
4104 void finish(int r
) override
{
4105 server
->_lookup_ino_2(mdr
, r
);
// handle_client_lookup_ino: resolve an inode by number for LOOKUPINO /
// LOOKUPPARENT (want_parent) / LOOKUPNAME (want_dentry). Rejects private
// (~mdsdir) and purging inodes, opens the ino if not cached, takes the
// needed rdlocks, and replies with the inode (and parent/name if requested).
// NOTE(review): lossy extraction -- embedded original line numbers skip
// (e.g. 4131-4133, 4137-4139), so braces/returns are missing from this view.
// Code kept byte-identical to the extraction.
4112 void Server::handle_client_lookup_ino(MDRequestRef
& mdr
,
4113 bool want_parent
, bool want_dentry
)
4115 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
// Snapshot lookups take a separate path.
4117 if ((uint64_t)req
->head
.args
.lookupino
.snapid
> 0)
4118 return _lookup_snap_ino(mdr
);
4120 inodeno_t ino
= req
->get_filepath().get_ino();
4121 auto _ino
= ino
.val
;
4123 /* It's been observed [1] that a client may lookup a private ~mdsdir inode.
4124 * I do not have an explanation for how that happened organically but this
4125 * check will ensure that the client can no longer do that.
4127 * [1] https://tracker.ceph.com/issues/49922
4129 if (MDS_IS_PRIVATE_INO(_ino
)) {
4130 respond_to_request(mdr
, -CEPHFS_ESTALE
);
4134 CInode
*in
= mdcache
->get_inode(ino
);
4135 if (in
&& in
->state_test(CInode::STATE_PURGING
)) {
4136 respond_to_request(mdr
, -CEPHFS_ESTALE
);
// Inode not in cache: open it asynchronously, then _lookup_ino_2 re-dispatches.
4140 mdcache
->open_ino(ino
, (int64_t)-1, new C_MDS_LookupIno2(this, mdr
), false);
4144 // check for nothing (not read or write); this still applies the
4146 if (!check_access(mdr
, in
, 0))
4149 CDentry
*dn
= in
->get_projected_parent_dn();
4150 CInode
*diri
= dn
? dn
->get_dir()->inode
: NULL
;
4152 MutationImpl::LockOpVec lov
;
4153 if (dn
&& (want_parent
|| want_dentry
)) {
4155 lov
.add_rdlock(&dn
->lock
);
4158 unsigned mask
= req
->head
.args
.lookupino
.mask
;
4160 Capability
*cap
= in
->get_client_cap(mdr
->get_client());
4162 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
|| mdr
->snapid
<= cap
->client_follows
))
4163 issued
= cap
->issued();
4165 // permission bits, ACL/security xattrs
4166 if ((mask
& CEPH_CAP_AUTH_SHARED
) && (issued
& CEPH_CAP_AUTH_EXCL
) == 0)
4167 lov
.add_rdlock(&in
->authlock
);
4168 if ((mask
& CEPH_CAP_XATTR_SHARED
) && (issued
& CEPH_CAP_XATTR_EXCL
) == 0)
4169 lov
.add_rdlock(&in
->xattrlock
);
4171 mdr
->getattr_caps
= mask
;
4175 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4179 // need read access to directory inode
4180 if (!check_access(mdr
, diri
, MAY_READ
))
// want_parent: reply with the parent directory inode instead of `in` itself
// (presumably -- dropped lines hide the exact reply target; TODO confirm).
4186 if (in
->is_base()) {
4187 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4190 if (!diri
|| diri
->is_stray()) {
4191 respond_to_request(mdr
, -CEPHFS_ESTALE
);
4194 dout(10) << "reply to lookup_parent " << *in
<< dendl
;
4196 respond_to_request(mdr
, 0);
// want_dentry (lookup_name): verify the parent matches filepath2's ino.
4199 inodeno_t dirino
= req
->get_filepath2().get_ino();
4200 if (!diri
|| (dirino
!= inodeno_t() && diri
->ino() != dirino
)) {
4201 respond_to_request(mdr
, -CEPHFS_ENOENT
);
4204 dout(10) << "reply to lookup_name " << *in
<< dendl
;
4206 dout(10) << "reply to lookup_ino " << *in
<< dendl
;
4211 respond_to_request(mdr
, 0);
// _lookup_snap_ino: resolve (ino, snapid) from a LOOKUPINO request. Uses the
// cached snap inode if present; otherwise walks the parent directory frag
// (selected by the request's hash) to find the snapped dentry, fetching or
// opening as needed, and falls back to open_ino on the target.
// NOTE(review): lossy extraction -- embedded original line numbers skip
// (e.g. 4218-4219 `vinodeno_t vino;`, various braces/returns), so this view
// is incomplete. Code kept byte-identical to the extraction.
4215 void Server::_lookup_snap_ino(MDRequestRef
& mdr
)
4217 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4220 vino
.ino
= req
->get_filepath().get_ino();
4221 vino
.snapid
= (__u64
)req
->head
.args
.lookupino
.snapid
;
4222 inodeno_t parent_ino
= (__u64
)req
->head
.args
.lookupino
.parent
;
4223 __u32 hash
= req
->head
.args
.lookupino
.hash
;
4225 dout(7) << "lookup_snap_ino " << vino
<< " parent " << parent_ino
<< " hash " << hash
<< dendl
;
4227 CInode
*in
= mdcache
->lookup_snap_inode(vino
);
4229 in
= mdcache
->get_inode(vino
.ino
);
4231 if (in
->state_test(CInode::STATE_PURGING
) ||
4232 !in
->has_snap_data(vino
.snapid
)) {
4233 if (in
->is_dir() || !parent_ino
) {
4234 respond_to_request(mdr
, -CEPHFS_ESTALE
);
// Found a usable snap inode: record the snapid on the request and reply.
4243 dout(10) << "reply to lookup_snap_ino " << *in
<< dendl
;
4244 mdr
->snapid
= vino
.snapid
;
4246 respond_to_request(mdr
, 0);
4250 CInode
*diri
= NULL
;
4252 diri
= mdcache
->get_inode(parent_ino
);
// Parent not cached: open it, then retry via _lookup_ino_2.
4254 mdcache
->open_ino(parent_ino
, mds
->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr
));
4258 if (!diri
->is_dir()) {
4259 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4263 MutationImpl::LockOpVec lov
;
4264 lov
.add_rdlock(&diri
->dirfragtreelock
);
4265 if (!mds
->locker
->acquire_locks(mdr
, lov
))
// Pick the dirfrag that covers the request's hash value.
4268 frag_t frag
= diri
->dirfragtree
[hash
];
4269 CDir
*dir
= try_open_auth_dirfrag(diri
, frag
, mdr
);
4273 if (!dir
->is_complete()) {
4274 if (dir
->is_frozen()) {
4275 mds
->locker
->drop_locks(mdr
.get());
4276 mdr
->drop_local_auth_pins();
4277 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
4280 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
), true);
4284 respond_to_request(mdr
, -CEPHFS_ESTALE
);
4286 mdcache
->open_ino(vino
.ino
, mds
->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr
), false);
// _lookup_ino_2: open_ino() completion for lookup-by-ino. `r` >= 0 is the
// rank now authoritative for the ino (re-dispatch locally or forward);
// `r` < 0 is an error code passed through to the client (ENOENT/ENODATA
// handled on the visible branch).
// NOTE(review): lossy extraction -- braces/returns between the branches are
// missing from this view; code kept byte-identical to the extraction.
4290 void Server::_lookup_ino_2(MDRequestRef
& mdr
, int r
)
4292 inodeno_t ino
= mdr
->client_request
->get_filepath().get_ino();
4293 dout(10) << "_lookup_ino_2 " << mdr
.get() << " ino " << ino
<< " r=" << r
<< dendl
;
4295 // `r` is a rank if >=0, else an error code
4297 mds_rank_t
dest_rank(r
);
4298 if (dest_rank
== mds
->get_nodeid())
4299 dispatch_client_request(mdr
);
4301 mdcache
->request_forward(mdr
, dest_rank
);
4306 if (r
== -CEPHFS_ENOENT
|| r
== -CEPHFS_ENODATA
)
4308 respond_to_request(mdr
, r
);
// handle_client_open: service a client OPEN. Validates flags/mode, enforces
// read-only FS and read-only snapshots, handles O_TRUNC (via
// do_open_truncate), takes cap-aware rdlocks, issues caps, journals an EOpen
// entry when needed, and replies.
// NOTE(review): lossy extraction -- embedded original line numbers skip
// (e.g. 4320, 4322-4324, 4331-4333), so braces/returns and some statements
// are missing from this view. Code kept byte-identical to the extraction.
4312 /* This function takes responsibility for the passed mdr*/
4313 void Server::handle_client_open(MDRequestRef
& mdr
)
4315 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4316 dout(7) << "open on " << req
->get_filepath() << dendl
;
4318 int flags
= req
->head
.args
.open
.flags
;
4319 int cmode
= ceph_flags_to_mode(flags
);
// Invalid open mode -> EINVAL (guard condition on dropped line 4320).
4321 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4325 bool need_auth
= !file_mode_is_readonly(cmode
) ||
4326 (flags
& (CEPH_O_TRUNC
| CEPH_O_DIRECTORY
));
4328 if ((cmode
& CEPH_FILE_MODE_WR
) && mdcache
->is_readonly()) {
4329 dout(7) << "read-only FS" << dendl
;
4330 respond_to_request(mdr
, -CEPHFS_EROFS
);
4334 CInode
*cur
= rdlock_path_pin_ref(mdr
, need_auth
);
// If the inode is frozen/exporting caps, retry the path walk requiring auth.
4338 if (cur
->is_frozen() || cur
->state_test(CInode::STATE_EXPORTINGCAPS
)) {
4339 ceph_assert(!need_auth
);
4340 mdr
->locking_state
&= ~(MutationImpl::PATH_LOCKED
| MutationImpl::ALL_LOCKED
);
4341 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
4346 if (!cur
->is_file()) {
4347 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
4348 cmode
= CEPH_FILE_MODE_PIN
;
4349 // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
4350 if (cur
->is_symlink() && !(flags
& CEPH_O_NOFOLLOW
))
4351 flags
&= ~CEPH_O_TRUNC
;
4354 dout(10) << "open flags = " << flags
4355 << ", filemode = " << cmode
4356 << ", need_auth = " << need_auth
4360 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
4361 dout(7) << "not a file or dir " << *cur << dendl;
4362 respond_to_request(mdr, -CEPHFS_ENXIO); // FIXME what error do we want?
4365 if ((flags
& CEPH_O_DIRECTORY
) && !cur
->is_dir() && !cur
->is_symlink()) {
4366 dout(7) << "specified O_DIRECTORY on non-directory " << *cur
<< dendl
;
4367 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4371 if ((flags
& CEPH_O_TRUNC
) && !cur
->is_file()) {
4372 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur
<< dendl
;
4373 // we should return -CEPHFS_EISDIR for directory, return -CEPHFS_EINVAL for other non-regular
4374 respond_to_request(mdr
, cur
->is_dir() ? -CEPHFS_EISDIR
: -CEPHFS_EINVAL
);
4378 if (cur
->get_inode()->inline_data
.version
!= CEPH_INLINE_NONE
&&
4379 !mdr
->session
->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
)) {
4380 dout(7) << "old client cannot open inline data file " << *cur
<< dendl
;
4381 respond_to_request(mdr
, -CEPHFS_EPERM
);
4385 // snapped data is read only
4386 if (mdr
->snapid
!= CEPH_NOSNAP
&&
4387 ((cmode
& CEPH_FILE_MODE_WR
) || req
->may_write())) {
4388 dout(7) << "snap " << mdr
->snapid
<< " is read-only " << *cur
<< dendl
;
4389 respond_to_request(mdr
, -CEPHFS_EROFS
);
4393 MutationImpl::LockOpVec lov
;
4394 lov
.add_rdlock(&cur
->snaplock
);
4396 unsigned mask
= req
->head
.args
.open
.mask
;
4398 Capability
*cap
= cur
->get_client_cap(mdr
->get_client());
4400 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
|| mdr
->snapid
<= cap
->client_follows
))
4401 issued
= cap
->issued();
4402 // permission bits, ACL/security xattrs
4403 if ((mask
& CEPH_CAP_AUTH_SHARED
) && (issued
& CEPH_CAP_AUTH_EXCL
) == 0)
4404 lov
.add_rdlock(&cur
->authlock
);
4405 if ((mask
& CEPH_CAP_XATTR_SHARED
) && (issued
& CEPH_CAP_XATTR_EXCL
) == 0)
4406 lov
.add_rdlock(&cur
->xattrlock
);
4408 mdr
->getattr_caps
= mask
;
// O_TRUNC on a fresh (non-replayed) request: xlock the filelock, wait out any
// in-progress truncate, then hand off to do_open_truncate().
4412 if ((flags
& CEPH_O_TRUNC
) && !mdr
->has_completed
) {
4413 ceph_assert(cur
->is_auth());
4415 lov
.add_xlock(&cur
->filelock
);
4416 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4419 if (!check_access(mdr
, cur
, MAY_WRITE
))
4422 // wait for pending truncate?
4423 const auto& pi
= cur
->get_projected_inode();
4424 if (pi
->is_truncating()) {
4425 dout(10) << " waiting for pending truncate from " << pi
->truncate_from
4426 << " to " << pi
->truncate_size
<< " to complete on " << *cur
<< dendl
;
4427 mds
->locker
->drop_locks(mdr
.get());
4428 mdr
->drop_local_auth_pins();
4429 cur
->add_waiter(CInode::WAIT_TRUNC
, new C_MDS_RetryRequest(mdcache
, mdr
));
4433 do_open_truncate(mdr
, cmode
);
4437 // sync filelock if snapped.
4438 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
4439 // and that data itself is flushed so that we can read the snapped data off disk.
4440 if (mdr
->snapid
!= CEPH_NOSNAP
&& !cur
->is_dir()) {
4441 lov
.add_rdlock(&cur
->filelock
);
4444 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4448 if (cmode
& CEPH_FILE_MODE_WR
)
4450 if (!check_access(mdr
, cur
, mask
))
4453 utime_t now
= ceph_clock_now();
4454 mdr
->set_mds_stamp(now
);
// Issue caps: live (CEPH_NOSNAP) opens get real caps; snapshot opens get a
// fixed read-only cap set recorded in mdr->snap_caps.
4456 if (cur
->is_file() || cur
->is_dir()) {
4457 if (mdr
->snapid
== CEPH_NOSNAP
) {
4459 Capability
*cap
= mds
->locker
->issue_new_caps(cur
, cmode
, mdr
, nullptr);
4461 dout(12) << "open issued caps " << ccap_string(cap
->pending())
4462 << " for " << req
->get_source()
4463 << " on " << *cur
<< dendl
;
4465 int caps
= ceph_caps_for_mode(cmode
);
4466 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps
)
4467 << " for " << req
->get_source()
4468 << " snapid " << mdr
->snapid
4469 << " on " << *cur
<< dendl
;
4470 mdr
->snap_caps
= caps
;
4474 // increase max_size?
4475 if (cmode
& CEPH_FILE_MODE_WR
)
4476 mds
->locker
->check_inode_max_size(cur
);
4478 // make sure this inode gets into the journal
4479 if (cur
->is_auth() && cur
->last
== CEPH_NOSNAP
&&
4480 mdcache
->open_file_table
.should_log_open(cur
)) {
4481 EOpen
*le
= new EOpen(mds
->mdlog
);
4482 mdlog
->start_entry(le
);
4483 le
->add_clean_inode(cur
);
4484 mdlog
->submit_entry(le
);
4488 if (cmode
& CEPH_FILE_MODE_WR
)
4489 mds
->balancer
->hit_inode(cur
, META_POP_IWR
);
4491 mds
->balancer
->hit_inode(cur
, META_POP_IRD
);
4494 if (req
->get_dentry_wanted()) {
4495 ceph_assert(mdr
->dn
[0].size());
4496 dn
= mdr
->dn
[0].back();
4501 respond_to_request(mdr
, 0);
// C_MDS_openc_finish: journal-commit callback for handle_client_openc.
// Makes the projected dentry linkage live, marks the new inode dirty,
// shares max_size, notifies replicas of the new dentry link, bumps balancer
// write stats, and replies to the client.
// NOTE(review): lossy extraction -- member declarations (original lines
// 4505-4507) and closing braces are missing from this view.
4504 class C_MDS_openc_finish
: public ServerLogContext
{
4508 C_MDS_openc_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ni
) :
4509 ServerLogContext(s
, r
), dn(d
), newi(ni
) {}
4510 void finish(int r
) override
{
4511 ceph_assert(r
== 0);
4513 // crash current MDS and the replacing MDS will test the journal
4514 ceph_assert(!g_conf()->mds_kill_skip_replaying_inotable
);
4516 dn
->pop_projected_linkage();
4518 // dirty inode, dn, dir
4519 newi
->mark_dirty(mdr
->ls
);
4520 newi
->mark_dirty_parent(mdr
->ls
, true);
4524 get_mds()->locker
->share_inode_max_size(newi
);
4526 MDRequestRef null_ref
;
4527 get_mds()->mdcache
->send_dentry_link(dn
, null_ref
);
4529 get_mds()->balancer
->hit_inode(newi
, META_POP_IWR
);
4531 server
->respond_to_request(mdr
, 0);
4533 ceph_assert(g_conf()->mds_kill_openc_at
!= 1);
// handle_client_openc: service open-with-O_CREAT. Falls through to
// handle_client_open if the dentry already exists (and !O_EXCL); otherwise
// validates the requested file layout, runs quota/fragment checks, allocates
// and links a new inode, issues initial caps, journals an EUpdate ("openc"),
// and replies via C_MDS_openc_finish (optionally delegating inos or encoding
// the created ino for capable clients).
// NOTE(review): lossy extraction -- embedded original line numbers skip
// (e.g. 4546, 4548-4550, braces/returns throughout), so this view is
// incomplete. Code kept byte-identical to the extraction.
4537 /* This function takes responsibility for the passed mdr*/
4538 void Server::handle_client_openc(MDRequestRef
& mdr
)
4540 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4541 client_t client
= mdr
->get_client();
4543 dout(7) << "open w/ O_CREAT on " << req
->get_filepath() << dendl
;
4545 int cmode
= ceph_flags_to_mode(req
->head
.args
.open
.flags
);
// Invalid open mode -> EINVAL (guard condition on dropped line 4546).
4547 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4551 bool excl
= req
->head
.args
.open
.flags
& CEPH_O_EXCL
;
4552 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, true, !excl
, true, true);
4556 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
// Target already exists and not O_EXCL: treat as a plain open.
4557 if (!excl
&& !dnl
->is_null()) {
4559 ceph_assert(mdr
.get()->is_rdlocked(&dn
->lock
));
4561 handle_client_open(mdr
);
4565 ceph_assert(dnl
->is_null());
4567 if (req
->get_alternate_name().size() > alternate_name_max
) {
4568 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
4569 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
4572 dn
->set_alternate_name(req
->get_alternate_name());
// Start from the inherited directory layout (or the FS default), then apply
// any per-open layout overrides from the client.
4575 file_layout_t layout
;
4576 if (mdr
->dir_layout
!= file_layout_t())
4577 layout
= mdr
->dir_layout
;
4579 layout
= mdcache
->default_file_layout
;
4581 // What kind of client caps are required to complete this operation
4582 uint64_t access
= MAY_WRITE
;
4584 const auto default_layout
= layout
;
4586 // fill in any special params from client
4587 if (req
->head
.args
.open
.stripe_unit
)
4588 layout
.stripe_unit
= req
->head
.args
.open
.stripe_unit
;
4589 if (req
->head
.args
.open
.stripe_count
)
4590 layout
.stripe_count
= req
->head
.args
.open
.stripe_count
;
4591 if (req
->head
.args
.open
.object_size
)
4592 layout
.object_size
= req
->head
.args
.open
.object_size
;
4593 if (req
->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID
) &&
4594 (__s32
)req
->head
.args
.open
.pool
>= 0) {
4595 layout
.pool_id
= req
->head
.args
.open
.pool
;
4597 // make sure we have as new a map as the client
4598 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
4599 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
4604 // If client doesn't have capability to modify layout pools, then
4605 // only permit this request if the requested pool matches what the
4606 // file would have inherited anyway from its parent.
4607 if (default_layout
!= layout
) {
4608 access
|= MAY_SET_VXATTR
;
4611 if (!layout
.is_valid()) {
4612 dout(10) << " invalid initial file layout" << dendl
;
4613 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4616 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
4617 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
4618 respond_to_request(mdr
, -CEPHFS_EINVAL
);
4623 CDir
*dir
= dn
->get_dir();
4624 CInode
*diri
= dir
->get_inode();
4625 if (!check_access(mdr
, diri
, access
))
4627 if (!check_fragment_space(mdr
, dir
))
4629 if (!check_dir_max_entries(mdr
, dir
))
4632 if (mdr
->dn
[0].size() == 1)
4633 mds
->locker
->create_lock_cache(mdr
, diri
, &mdr
->dir_layout
);
// Allocate the new inode and set up its projected state.
4636 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
),
4637 req
->head
.args
.open
.mode
| S_IFREG
, &layout
);
4641 dn
->push_projected_linkage(newi
);
4643 auto _inode
= newi
->_get_inode();
4644 _inode
->version
= dn
->pre_dirty();
4645 if (layout
.pool_id
!= mdcache
->default_file_layout
.pool_id
)
4646 _inode
->add_old_pool(mdcache
->default_file_layout
.pool_id
);
4647 _inode
->update_backtrace();
4648 _inode
->rstat
.rfiles
= 1;
4649 _inode
->accounted_rstat
= _inode
->rstat
;
4651 SnapRealm
*realm
= diri
->find_snaprealm();
4652 snapid_t follows
= mdcache
->get_global_snaprealm()->get_newest_seq();
4653 ceph_assert(follows
>= realm
->get_newest_seq());
4655 ceph_assert(dn
->first
== follows
+1);
4656 newi
->first
= dn
->first
;
// Issue initial caps; writers also get an initial client_range.
4659 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
, realm
);
4660 newi
->authlock
.set_state(LOCK_EXCL
);
4661 newi
->xattrlock
.set_state(LOCK_EXCL
);
4663 if (cap
&& (cmode
& CEPH_FILE_MODE_WR
)) {
4664 _inode
->client_ranges
[client
].range
.first
= 0;
4665 _inode
->client_ranges
[client
].range
.last
= _inode
->layout
.stripe_unit
;
4666 _inode
->client_ranges
[client
].follows
= follows
;
4667 newi
->mark_clientwriteable();
4668 cap
->mark_clientwriteable();
// Journal the creation as an EUpdate("openc").
4672 mdr
->ls
= mdlog
->get_current_segment();
4673 EUpdate
*le
= new EUpdate(mdlog
, "openc");
4674 mdlog
->start_entry(le
);
4675 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4676 journal_allocated_inos(mdr
, &le
->metablob
);
4677 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
4678 le
->metablob
.add_primary_dentry(dn
, newi
, true, true, true);
4680 // make sure this inode gets into the journal
4681 le
->metablob
.add_opened_ino(newi
->ino());
4683 C_MDS_openc_finish
*fin
= new C_MDS_openc_finish(this, mdr
, dn
, newi
);
4685 if (mdr
->session
->info
.has_feature(CEPHFS_FEATURE_DELEG_INO
)) {
4686 openc_response_t ocresp
;
4688 dout(10) << "adding created_ino and delegated_inos" << dendl
;
4689 ocresp
.created_ino
= _inode
->ino
;
4691 if (delegate_inos_pct
&& !req
->is_queued_for_replay()) {
4692 // Try to delegate some prealloc_inos to the client, if it's down to half the max
4693 unsigned frac
= 100 / delegate_inos_pct
;
4694 if (mdr
->session
->delegated_inos
.size() < (unsigned)g_conf()->mds_client_prealloc_inos
/ frac
/ 2)
4695 mdr
->session
->delegate_inos(g_conf()->mds_client_prealloc_inos
/ frac
, ocresp
.delegated_inos
);
4698 encode(ocresp
, mdr
->reply_extra_bl
);
4699 } else if (mdr
->client_request
->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE
)) {
4700 dout(10) << "adding ino to reply to indicate inode was created" << dendl
;
4701 // add the file created flag onto the reply if create_flags features is supported
4702 encode(newi
->ino(), mdr
->reply_extra_bl
);
4705 journal_and_reply(mdr
, newi
, dn
, le
, fin
);
4707 // We hit_dir (via hit_inode) in our finish callback, but by then we might
4708 // have overshot the split size (multiple opencs in flight), so here is
4709 // an early chance to split the dir if this openc makes it oversized.
4710 mds
->balancer
->maybe_fragment(dir
, false);
// _finalize_readdir: assemble and send the readdir reply: charge the
// session's cap-acquisition counter, set END/COMPLETE flags, append the
// per-dentry blob to the header blob, bump balancer stats, and respond.
// NOTE(review): lossy extraction -- the remaining parameters (original lines
// 4715-4723: presumably diri/dir/start/end/flags/numfiles/dirbl/dnbl per the
// call site in handle_client_readdir -- TODO confirm) and several
// braces/conditions are missing from this view. Code kept byte-identical.
4714 void Server::_finalize_readdir(MDRequestRef
& mdr
,
4724 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4725 Session
*session
= mds
->get_session(req
);
4727 session
->touch_readdir_cap(numfiles
);
4730 flags
|= CEPH_READDIR_FRAG_END
;
4732 flags
|= CEPH_READDIR_FRAG_COMPLETE
; // FIXME: what purpose does this serve
4735 // finish final blob
4736 encode(numfiles
, dirbl
);
4737 encode(flags
, dirbl
);
4738 dirbl
.claim_append(dnbl
);
4741 dout(10) << "reply to " << *req
<< " readdir num=" << numfiles
4742 << " bytes=" << dirbl
.length()
4743 << " start=" << (int)start
4744 << " end=" << (int)end
4746 mdr
->reply_extra_bl
= dirbl
;
4748 // bump popularity. NOTE: this doesn't quite capture it.
4749 mds
->balancer
->hit_dir(dir
, META_POP_READDIR
, numfiles
);
4753 respond_to_request(mdr
, 0);
// handle_client_readdir: service a client READDIR. Applies the per-session
// cap-acquisition throttle, locks filelock+dirfragtreelock, resolves and
// opens the requested dirfrag (fetching/waiting as needed), then encodes as
// many dentries+inodes as fit in max_bytes/max_entries starting from the
// client-provided offset, and finishes via _finalize_readdir().
// NOTE(review): lossy extraction -- embedded original line numbers skip
// (braces, returns, declarations such as the dirbl/dnbl bufferlists and
// numfiles counter), so this view is incomplete. Code kept byte-identical
// to the extraction.
4756 void Server::handle_client_readdir(MDRequestRef
& mdr
)
4758 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
4759 Session
*session
= mds
->get_session(req
);
4760 client_t client
= req
->get_source().num();
4761 MutationImpl::LockOpVec lov
;
4762 CInode
*diri
= rdlock_path_pin_ref(mdr
, false, true);
4765 // it's a directory, right?
4766 if (!diri
->is_dir()) {
4768 dout(10) << "reply to " << *req
<< " readdir -CEPHFS_ENOTDIR" << dendl
;
4769 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
// Throttle: defer the readdir if this session is already holding close to
// the max caps and has been acquiring caps quickly.
4773 auto num_caps
= session
->get_num_caps();
4774 auto session_cap_acquisition
= session
->get_cap_acquisition();
4776 if (num_caps
> static_cast<uint64_t>(max_caps_per_client
* max_caps_throttle_ratio
) && session_cap_acquisition
>= cap_acquisition_throttle
) {
4777 dout(20) << "readdir throttled. max_caps_per_client: " << max_caps_per_client
<< " num_caps: " << num_caps
4778 << " session_cap_acquistion: " << session_cap_acquisition
<< " cap_acquisition_throttle: " << cap_acquisition_throttle
<< dendl
;
4780 logger
->inc(l_mdss_cap_acquisition_throttle
);
4782 mdr
->mark_event("cap_acquisition_throttle");
4783 mds
->timer
.add_event_after(caps_throttle_retry_request_timeout
, new C_MDS_RetryRequest(mdcache
, mdr
));
4787 lov
.add_rdlock(&diri
->filelock
);
4788 lov
.add_rdlock(&diri
->dirfragtreelock
);
4790 if (!mds
->locker
->acquire_locks(mdr
, lov
))
4793 if (!check_access(mdr
, diri
, MAY_READ
))
// Decode the client's resume position (frag + offset string/hash).
4797 frag_t fg
= (__u32
)req
->head
.args
.readdir
.frag
;
4798 unsigned req_flags
= (__u32
)req
->head
.args
.readdir
.flags
;
4799 string offset_str
= req
->get_path2();
4801 __u32 offset_hash
= 0;
4802 if (!offset_str
.empty())
4803 offset_hash
= ceph_frag_value(diri
->hash_dentry_name(offset_str
));
4805 offset_hash
= (__u32
)req
->head
.args
.readdir
.offset_hash
;
4807 dout(10) << " frag " << fg
<< " offset '" << offset_str
<< "'"
4808 << " offset_hash " << offset_hash
<< " flags " << req_flags
<< dendl
;
4810 // does the frag exist?
// Frag no longer matches the dirfragtree (split/merge happened): pick the
// closest valid frag for the client's position.
4811 if (diri
->dirfragtree
[fg
.value()] != fg
) {
4813 if (req_flags
& CEPH_READDIR_REPLY_BITFLAGS
) {
4814 if (fg
.contains((unsigned)offset_hash
)) {
4815 newfg
= diri
->dirfragtree
[offset_hash
];
4817 // client actually wants next frag
4818 newfg
= diri
->dirfragtree
[fg
.value()];
4822 newfg
= diri
->dirfragtree
[fg
.value()];
4824 dout(10) << " adjust frag " << fg
<< " -> " << newfg
<< " " << diri
->dirfragtree
<< dendl
;
4828 CDir
*dir
= try_open_auth_dirfrag(diri
, fg
, mdr
);
4832 dout(10) << "handle_client_readdir on " << *dir
<< dendl
;
4833 ceph_assert(dir
->is_auth());
4835 if (!dir
->is_complete()) {
4836 if (dir
->is_frozen()) {
4837 dout(7) << "dir is frozen " << *dir
<< dendl
;
4838 mds
->locker
->drop_locks(mdr
.get());
4839 mdr
->drop_local_auth_pins();
4840 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
4844 dout(10) << " incomplete dir contents for readdir on " << *dir
<< ", fetching" << dendl
;
4845 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
), true);
4849 #ifdef MDS_VERIFY_FRAGSTAT
4850 dir
->verify_fragstat();
4853 utime_t now
= ceph_clock_now();
4854 mdr
->set_mds_stamp(now
);
4856 snapid_t snapid
= mdr
->snapid
;
4857 dout(10) << "snapid " << snapid
<< dendl
;
4859 SnapRealm
*realm
= diri
->find_snaprealm();
4861 unsigned max
= req
->head
.args
.readdir
.max_entries
;
4863 max
= dir
->get_num_any(); // whatever, something big.
4864 unsigned max_bytes
= req
->head
.args
.readdir
.max_bytes
;
4866 // make sure at least one item can be encoded
4867 max_bytes
= (512 << 10) + g_conf()->mds_max_xattr_pairs_size
;
// Encode the dirstat header, then budget the remaining reply bytes.
4872 ds
.frag
= dir
->get_frag();
4873 ds
.auth
= dir
->get_dir_auth().first
;
4874 if (dir
->is_auth() && !forward_all_requests_to_auth
)
4875 dir
->get_dist_spec(ds
.dist
, mds
->get_nodeid());
4877 dir
->encode_dirstat(dirbl
, mdr
->session
->info
, ds
);
4879 // count bytes available.
4880 // this isn't perfect, but we should capture the main variable/unbounded size items!
4881 int front_bytes
= dirbl
.length() + sizeof(__u32
) + sizeof(__u8
)*2;
4882 int bytes_left
= max_bytes
- front_bytes
;
4883 bytes_left
-= get_snap_trace(session
, realm
).length();
4885 // build dir contents
4888 bool start
= !offset_hash
&& offset_str
.empty();
4889 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
4890 dentry_key_t
skip_key(snapid
, offset_str
.c_str(), offset_hash
);
4891 auto it
= start
? dir
->begin() : dir
->lower_bound(skip_key
);
4892 bool end
= (it
== dir
->end());
4893 for (; !end
&& numfiles
< max
; end
= (it
== dir
->end())) {
4894 CDentry
*dn
= it
->second
;
4897 if (dn
->state_test(CDentry::STATE_PURGING
))
4900 bool dnp
= dn
->use_projected(client
, mdr
);
4901 CDentry::linkage_t
*dnl
= dnp
? dn
->get_projected_linkage() : dn
->get_linkage();
4903 if (dnl
->is_null()) {
4904 if (dn
->get_num_ref() == 0 && !dn
->is_projected())
4905 dir
->remove_dentry(dn
);
4909 if (dn
->last
< snapid
|| dn
->first
> snapid
) {
4910 dout(20) << "skipping non-overlapping snap " << *dn
<< dendl
;
4915 dentry_key_t
offset_key(dn
->last
, offset_str
.c_str(), offset_hash
);
4916 if (!(offset_key
< dn
->key()))
4920 CInode
*in
= dnl
->get_inode();
4922 if (in
&& in
->ino() == CEPH_INO_CEPH
)
4926 // better for the MDS to do the work, if we think the client will stat any of these files.
// Remote dentry without a cached inode: link it if cached, otherwise open it
// asynchronously (replying with what we have if any entries were encoded).
4927 if (dnl
->is_remote() && !in
) {
4928 in
= mdcache
->get_inode(dnl
->get_remote_ino());
4930 dn
->link_remote(dnl
, in
);
4931 } else if (dn
->state_test(CDentry::STATE_BADREMOTEINO
)) {
4932 dout(10) << "skipping bad remote ino on " << *dn
<< dendl
;
4935 // touch everything i _do_ have
4936 for (auto &p
: *dir
) {
4937 if (!p
.second
->get_linkage()->is_null())
4938 mdcache
->lru
.lru_touch(p
.second
);
4941 // already issued caps and leases, reply immediately.
4942 if (dnbl
.length() > 0) {
4943 mdcache
->open_remote_dentry(dn
, dnp
, new C_MDSInternalNoop
);
4944 dout(10) << " open remote dentry after caps were issued, stopping at "
4945 << dnbl
.length() << " < " << bytes_left
<< dendl
;
4949 mds
->locker
->drop_locks(mdr
.get());
4950 mdr
->drop_local_auth_pins();
4951 mdcache
->open_remote_dentry(dn
, dnp
, new C_MDS_RetryRequest(mdcache
, mdr
));
4957 if ((int)(dnbl
.length() + dn
->get_name().length() + sizeof(__u32
) + sizeof(LeaseStat
)) > bytes_left
) {
4958 dout(10) << " ran out of room, stopping at " << dnbl
.length() << " < " << bytes_left
<< dendl
;
4962 unsigned start_len
= dnbl
.length();
// Encode this entry: name + lease, then the inode stat; roll back to
// start_len if the inode doesn't fit.
4965 dout(12) << "including dn " << *dn
<< dendl
;
4966 encode(dn
->get_name(), dnbl
);
4967 mds
->locker
->issue_client_lease(dn
, in
, mdr
, now
, dnbl
);
4970 dout(12) << "including inode in " << *in
<< " snap " << snapid
<< dendl
;
4971 int r
= in
->encode_inodestat(dnbl
, mdr
->session
, realm
, snapid
, bytes_left
- (int)dnbl
.length());
4973 // chop off dn->name, lease
4974 dout(10) << " ran out of room, stopping at " << start_len
<< " < " << bytes_left
<< dendl
;
4976 keep
.substr_of(dnbl
, 0, start_len
);
4980 ceph_assert(r
>= 0);
4984 mdcache
->lru
.lru_touch(dn
);
4987 // client only understand END and COMPLETE flags ?
4988 if (req_flags
& CEPH_READDIR_REPLY_BITFLAGS
) {
4989 flags
|= CEPH_READDIR_HASH_ORDER
| CEPH_READDIR_OFFSET_HASH
;
4991 _finalize_readdir(mdr
, diri
, dir
, start
, end
, flags
, numfiles
, dirbl
, dnbl
);
4996 // ===============================================================================
5001 * finisher for basic inode updates
5003 class C_MDS_inode_update_finish
: public ServerLogContext
{
5005 bool truncating_smaller
, changed_ranges
, adjust_realm
;
5007 C_MDS_inode_update_finish(Server
*s
, MDRequestRef
& r
, CInode
*i
,
5008 bool sm
=false, bool cr
=false, bool ar
=false) :
5009 ServerLogContext(s
, r
), in(i
),
5010 truncating_smaller(sm
), changed_ranges(cr
), adjust_realm(ar
) { }
5011 void finish(int r
) override
{
5012 ceph_assert(r
== 0);
5014 int snap_op
= (in
->snaprealm
? CEPH_SNAP_OP_UPDATE
: CEPH_SNAP_OP_SPLIT
);
5019 MDSRank
*mds
= get_mds();
5021 // notify any clients
5022 if (truncating_smaller
&& in
->get_inode()->is_truncating()) {
5023 mds
->locker
->issue_truncate(in
);
5024 mds
->mdcache
->truncate_inode(in
, mdr
->ls
);
5028 mds
->mdcache
->send_snap_update(in
, 0, snap_op
);
5029 mds
->mdcache
->do_realm_invalidate_and_update_notify(in
, snap_op
);
5032 get_mds()->balancer
->hit_inode(in
, META_POP_IWR
);
5034 server
->respond_to_request(mdr
, 0);
5037 get_mds()->locker
->share_inode_max_size(in
);
5041 void Server::handle_client_file_setlock(MDRequestRef
& mdr
)
5043 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5044 MutationImpl::LockOpVec lov
;
5046 // get the inode to operate on, and set up any locks needed for that
5047 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
5051 lov
.add_xlock(&cur
->flocklock
);
5052 /* acquire_locks will return true if it gets the locks. If it fails,
5053 it will redeliver this request at a later date, so drop the request.
5055 if (!mds
->locker
->acquire_locks(mdr
, lov
)) {
5056 dout(10) << "handle_client_file_setlock could not get locks!" << dendl
;
5060 // copy the lock change into a ceph_filelock so we can store/apply it
5061 ceph_filelock set_lock
;
5062 set_lock
.start
= req
->head
.args
.filelock_change
.start
;
5063 set_lock
.length
= req
->head
.args
.filelock_change
.length
;
5064 set_lock
.client
= req
->get_orig_source().num();
5065 set_lock
.owner
= req
->head
.args
.filelock_change
.owner
;
5066 set_lock
.pid
= req
->head
.args
.filelock_change
.pid
;
5067 set_lock
.type
= req
->head
.args
.filelock_change
.type
;
5068 bool will_wait
= req
->head
.args
.filelock_change
.wait
;
5070 dout(10) << "handle_client_file_setlock: " << set_lock
<< dendl
;
5072 ceph_lock_state_t
*lock_state
= NULL
;
5073 bool interrupt
= false;
5075 // get the appropriate lock state
5076 switch (req
->head
.args
.filelock_change
.rule
) {
5077 case CEPH_LOCK_FLOCK_INTR
:
5080 case CEPH_LOCK_FLOCK
:
5081 lock_state
= cur
->get_flock_lock_state();
5084 case CEPH_LOCK_FCNTL_INTR
:
5087 case CEPH_LOCK_FCNTL
:
5088 lock_state
= cur
->get_fcntl_lock_state();
5092 dout(10) << "got unknown lock type " << set_lock
.type
5093 << ", dropping request!" << dendl
;
5094 respond_to_request(mdr
, -CEPHFS_EOPNOTSUPP
);
5098 dout(10) << " state prior to lock change: " << *lock_state
<< dendl
;
5099 if (CEPH_LOCK_UNLOCK
== set_lock
.type
) {
5100 list
<ceph_filelock
> activated_locks
;
5101 MDSContext::vec waiters
;
5102 if (lock_state
->is_waiting(set_lock
)) {
5103 dout(10) << " unlock removing waiting lock " << set_lock
<< dendl
;
5104 lock_state
->remove_waiting(set_lock
);
5105 cur
->take_waiting(CInode::WAIT_FLOCK
, waiters
);
5106 } else if (!interrupt
) {
5107 dout(10) << " unlock attempt on " << set_lock
<< dendl
;
5108 lock_state
->remove_lock(set_lock
, activated_locks
);
5109 cur
->take_waiting(CInode::WAIT_FLOCK
, waiters
);
5111 mds
->queue_waiters(waiters
);
5113 respond_to_request(mdr
, 0);
5115 dout(10) << " lock attempt on " << set_lock
<< dendl
;
5116 bool deadlock
= false;
5117 if (mdr
->more()->flock_was_waiting
&&
5118 !lock_state
->is_waiting(set_lock
)) {
5119 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock
<< dendl
;
5120 respond_to_request(mdr
, -CEPHFS_EINTR
);
5121 } else if (!lock_state
->add_lock(set_lock
, will_wait
, mdr
->more()->flock_was_waiting
, &deadlock
)) {
5122 dout(10) << " it failed on this attempt" << dendl
;
5123 // couldn't set lock right now
5125 respond_to_request(mdr
, -CEPHFS_EDEADLK
);
5126 } else if (!will_wait
) {
5127 respond_to_request(mdr
, -CEPHFS_EWOULDBLOCK
);
5129 dout(10) << " added to waiting list" << dendl
;
5130 ceph_assert(lock_state
->is_waiting(set_lock
));
5131 mdr
->more()->flock_was_waiting
= true;
5132 mds
->locker
->drop_locks(mdr
.get());
5133 mdr
->drop_local_auth_pins();
5134 mdr
->mark_event("failed to add lock, waiting");
5136 cur
->add_waiter(CInode::WAIT_FLOCK
, new C_MDS_RetryRequest(mdcache
, mdr
));
5139 respond_to_request(mdr
, 0);
5141 dout(10) << " state after lock change: " << *lock_state
<< dendl
;
5144 void Server::handle_client_file_readlock(MDRequestRef
& mdr
)
5146 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5147 MutationImpl::LockOpVec lov
;
5149 // get the inode to operate on, and set up any locks needed for that
5150 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
5154 /* acquire_locks will return true if it gets the locks. If it fails,
5155 it will redeliver this request at a later date, so drop the request.
5157 lov
.add_rdlock(&cur
->flocklock
);
5158 if (!mds
->locker
->acquire_locks(mdr
, lov
)) {
5159 dout(10) << "handle_client_file_readlock could not get locks!" << dendl
;
5163 // copy the lock change into a ceph_filelock so we can store/apply it
5164 ceph_filelock checking_lock
;
5165 checking_lock
.start
= req
->head
.args
.filelock_change
.start
;
5166 checking_lock
.length
= req
->head
.args
.filelock_change
.length
;
5167 checking_lock
.client
= req
->get_orig_source().num();
5168 checking_lock
.owner
= req
->head
.args
.filelock_change
.owner
;
5169 checking_lock
.pid
= req
->head
.args
.filelock_change
.pid
;
5170 checking_lock
.type
= req
->head
.args
.filelock_change
.type
;
5172 // get the appropriate lock state
5173 ceph_lock_state_t
*lock_state
= NULL
;
5174 switch (req
->head
.args
.filelock_change
.rule
) {
5175 case CEPH_LOCK_FLOCK
:
5176 lock_state
= cur
->get_flock_lock_state();
5179 case CEPH_LOCK_FCNTL
:
5180 lock_state
= cur
->get_fcntl_lock_state();
5184 dout(10) << "got unknown lock type " << checking_lock
.type
<< dendl
;
5185 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5188 lock_state
->look_for_lock(checking_lock
);
5191 encode(checking_lock
, lock_bl
);
5193 mdr
->reply_extra_bl
= lock_bl
;
5194 respond_to_request(mdr
, 0);
5197 void Server::handle_client_setattr(MDRequestRef
& mdr
)
5199 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5200 MutationImpl::LockOpVec lov
;
5201 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
5204 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5205 respond_to_request(mdr
, -CEPHFS_EROFS
);
5208 if (cur
->ino() < MDS_INO_SYSTEM_BASE
&& !cur
->is_base()) {
5209 respond_to_request(mdr
, -CEPHFS_EPERM
);
5213 __u32 mask
= req
->head
.args
.setattr
.mask
;
5214 __u32 access_mask
= MAY_WRITE
;
5216 if (req
->get_header().version
< 6) {
5217 // No changes to fscrypted inodes by downrevved clients
5218 if (!cur
->get_inode()->fscrypt_auth
.empty()) {
5219 respond_to_request(mdr
, -CEPHFS_EPERM
);
5223 // Only allow fscrypt field changes by capable clients
5224 if (mask
& (CEPH_SETATTR_FSCRYPT_FILE
|CEPH_SETATTR_FSCRYPT_AUTH
)) {
5225 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5231 if (mask
& (CEPH_SETATTR_MODE
|CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_BTIME
|CEPH_SETATTR_KILL_SGUID
|CEPH_SETATTR_FSCRYPT_AUTH
|CEPH_SETATTR_KILL_SUID
|CEPH_SETATTR_KILL_SGID
))
5232 lov
.add_xlock(&cur
->authlock
);
5233 if (mask
& (CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
|CEPH_SETATTR_SIZE
|CEPH_SETATTR_FSCRYPT_FILE
))
5234 lov
.add_xlock(&cur
->filelock
);
5235 if (mask
& CEPH_SETATTR_CTIME
)
5236 lov
.add_wrlock(&cur
->versionlock
);
5238 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5241 if ((mask
& CEPH_SETATTR_UID
) && (cur
->get_inode()->uid
!= req
->head
.args
.setattr
.uid
))
5242 access_mask
|= MAY_CHOWN
;
5244 if ((mask
& CEPH_SETATTR_GID
) && (cur
->get_inode()->gid
!= req
->head
.args
.setattr
.gid
))
5245 access_mask
|= MAY_CHGRP
;
5247 if (!check_access(mdr
, cur
, access_mask
))
5250 // trunc from bigger -> smaller?
5251 const auto& pip
= cur
->get_projected_inode();
5253 uint64_t old_size
= std::max
<uint64_t>(pip
->size
, req
->head
.args
.setattr
.old_size
);
5255 // CEPHFS_ENOSPC on growing file while full, but allow shrinks
5256 if (is_full
&& req
->head
.args
.setattr
.size
> old_size
) {
5257 dout(20) << __func__
<< ": full, responding CEPHFS_ENOSPC to setattr with larger size" << dendl
;
5258 respond_to_request(mdr
, -CEPHFS_ENOSPC
);
5262 bool truncating_smaller
= false;
5263 if (mask
& CEPH_SETATTR_SIZE
) {
5264 if (req
->get_data().length() >
5265 sizeof(struct ceph_fscrypt_last_block_header
) + fscrypt_last_block_max_size
) {
5266 dout(10) << __func__
<< ": the last block size is too large" << dendl
;
5267 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5271 truncating_smaller
= req
->head
.args
.setattr
.size
< old_size
||
5272 (req
->head
.args
.setattr
.size
== old_size
&& req
->get_data().length());
5273 if (truncating_smaller
&& pip
->is_truncating()) {
5274 dout(10) << " waiting for pending truncate from " << pip
->truncate_from
5275 << " to " << pip
->truncate_size
<< " to complete on " << *cur
<< dendl
;
5276 mds
->locker
->drop_locks(mdr
.get());
5277 mdr
->drop_local_auth_pins();
5278 cur
->add_waiter(CInode::WAIT_TRUNC
, new C_MDS_RetryRequest(mdcache
, mdr
));
5282 if (truncating_smaller
&& req
->get_data().length()) {
5283 struct ceph_fscrypt_last_block_header header
;
5284 memset(&header
, 0, sizeof(header
));
5285 auto bl
= req
->get_data().cbegin();
5286 DECODE_START(1, bl
);
5287 decode(header
.change_attr
, bl
);
5290 dout(20) << __func__
<< " mdr->retry:" << mdr
->retry
5291 << " header.change_attr: " << header
.change_attr
5292 << " header.file_offset: " << header
.file_offset
5293 << " header.block_size: " << header
.block_size
5296 if (header
.change_attr
!= pip
->change_attr
) {
5297 dout(5) << __func__
<< ": header.change_attr:" << header
.change_attr
5298 << " != current change_attr:" << pip
->change_attr
5299 << ", let client retry it!" << dendl
;
5300 // flush the journal to make sure the clients will get the lasted
5301 // change_attr as possible for the next retry
5302 mds
->mdlog
->flush();
5303 respond_to_request(mdr
, -CEPHFS_EAGAIN
);
5309 bool changed_ranges
= false;
5312 mdr
->ls
= mdlog
->get_current_segment();
5313 EUpdate
*le
= new EUpdate(mdlog
, "setattr");
5314 mdlog
->start_entry(le
);
5316 auto pi
= cur
->project_inode(mdr
);
5318 if (mask
& CEPH_SETATTR_UID
)
5319 pi
.inode
->uid
= req
->head
.args
.setattr
.uid
;
5320 if (mask
& CEPH_SETATTR_GID
)
5321 pi
.inode
->gid
= req
->head
.args
.setattr
.gid
;
5323 if (mask
& CEPH_SETATTR_MODE
)
5324 pi
.inode
->mode
= (pi
.inode
->mode
& ~07777) | (req
->head
.args
.setattr
.mode
& 07777);
5325 else if ((mask
& (CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_KILL_SGUID
|
5326 CEPH_SETATTR_KILL_SUID
|CEPH_SETATTR_KILL_SGID
)) &&
5327 S_ISREG(pi
.inode
->mode
)) {
5328 if (mask
& (CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_KILL_SGUID
) &&
5329 (pi
.inode
->mode
& (S_IXUSR
|S_IXGRP
|S_IXOTH
))) {
5330 pi
.inode
->mode
&= ~(S_ISUID
|S_ISGID
);
5332 if (mask
& CEPH_SETATTR_KILL_SUID
) {
5333 pi
.inode
->mode
&= ~S_ISUID
;
5335 if (mask
& CEPH_SETATTR_KILL_SGID
) {
5336 pi
.inode
->mode
&= ~S_ISGID
;
5341 if (mask
& CEPH_SETATTR_MTIME
)
5342 pi
.inode
->mtime
= req
->head
.args
.setattr
.mtime
;
5343 if (mask
& CEPH_SETATTR_ATIME
)
5344 pi
.inode
->atime
= req
->head
.args
.setattr
.atime
;
5345 if (mask
& CEPH_SETATTR_BTIME
)
5346 pi
.inode
->btime
= req
->head
.args
.setattr
.btime
;
5347 if (mask
& (CEPH_SETATTR_ATIME
| CEPH_SETATTR_MTIME
| CEPH_SETATTR_BTIME
))
5348 pi
.inode
->time_warp_seq
++; // maybe not a timewarp, but still a serialization point.
5349 if (mask
& CEPH_SETATTR_SIZE
) {
5350 if (truncating_smaller
) {
5351 pi
.inode
->truncate(old_size
, req
->head
.args
.setattr
.size
, req
->get_data());
5352 le
->metablob
.add_truncate_start(cur
->ino());
5354 pi
.inode
->size
= req
->head
.args
.setattr
.size
;
5355 pi
.inode
->rstat
.rbytes
= pi
.inode
->size
;
5357 pi
.inode
->mtime
= mdr
->get_op_stamp();
5359 // adjust client's max_size?
5360 if (mds
->locker
->calc_new_client_ranges(cur
, pi
.inode
->size
)) {
5361 dout(10) << " client_ranges " << cur
->get_previous_projected_inode()->client_ranges
5362 << " -> " << pi
.inode
->client_ranges
<< dendl
;
5363 changed_ranges
= true;
5367 if (mask
& CEPH_SETATTR_FSCRYPT_AUTH
)
5368 pi
.inode
->fscrypt_auth
= req
->fscrypt_auth
;
5369 if (mask
& CEPH_SETATTR_FSCRYPT_FILE
)
5370 pi
.inode
->fscrypt_file
= req
->fscrypt_file
;
5372 pi
.inode
->version
= cur
->pre_dirty();
5373 pi
.inode
->ctime
= mdr
->get_op_stamp();
5374 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
5375 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
5376 pi
.inode
->change_attr
++;
5379 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5380 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5381 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5383 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
,
5384 truncating_smaller
, changed_ranges
));
5386 // flush immediately if there are readers/writers waiting
5387 if (mdr
->is_xlocked(&cur
->filelock
) &&
5388 (cur
->get_caps_wanted() & (CEPH_CAP_FILE_RD
|CEPH_CAP_FILE_WR
)))
5389 mds
->mdlog
->flush();
5392 /* Takes responsibility for mdr */
5393 void Server::do_open_truncate(MDRequestRef
& mdr
, int cmode
)
5395 CInode
*in
= mdr
->in
[0];
5396 client_t client
= mdr
->get_client();
5399 dout(10) << "do_open_truncate " << *in
<< dendl
;
5401 SnapRealm
*realm
= in
->find_snaprealm();
5402 Capability
*cap
= mds
->locker
->issue_new_caps(in
, cmode
, mdr
, realm
);
5404 mdr
->ls
= mdlog
->get_current_segment();
5405 EUpdate
*le
= new EUpdate(mdlog
, "open_truncate");
5406 mdlog
->start_entry(le
);
5409 auto pi
= in
->project_inode(mdr
);
5410 pi
.inode
->version
= in
->pre_dirty();
5411 pi
.inode
->mtime
= pi
.inode
->ctime
= mdr
->get_op_stamp();
5412 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
5413 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
5414 pi
.inode
->change_attr
++;
5416 uint64_t old_size
= std::max
<uint64_t>(pi
.inode
->size
, mdr
->client_request
->head
.args
.open
.old_size
);
5418 pi
.inode
->truncate(old_size
, 0);
5419 le
->metablob
.add_truncate_start(in
->ino());
5422 bool changed_ranges
= false;
5423 if (cap
&& (cmode
& CEPH_FILE_MODE_WR
)) {
5424 pi
.inode
->client_ranges
[client
].range
.first
= 0;
5425 pi
.inode
->client_ranges
[client
].range
.last
= pi
.inode
->get_layout_size_increment();
5426 pi
.inode
->client_ranges
[client
].follows
= realm
->get_newest_seq();
5427 changed_ranges
= true;
5428 in
->mark_clientwriteable();
5429 cap
->mark_clientwriteable();
5432 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
5434 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
5435 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, in
);
5437 // make sure ino gets into the journal
5438 le
->metablob
.add_opened_ino(in
->ino());
5440 mdr
->o_trunc
= true;
5443 if (mdr
->client_request
->get_dentry_wanted()) {
5444 ceph_assert(mdr
->dn
[0].size());
5445 dn
= mdr
->dn
[0].back();
5448 journal_and_reply(mdr
, in
, dn
, le
, new C_MDS_inode_update_finish(this, mdr
, in
, old_size
> 0,
5450 // Although the `open` part can give an early reply, the truncation won't
5451 // happen until our EUpdate is persistent, to give the client a prompt
5452 // response we must also flush that event.
5457 /* This function cleans up the passed mdr */
5458 void Server::handle_client_setlayout(MDRequestRef
& mdr
)
5460 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5461 CInode
*cur
= rdlock_path_pin_ref(mdr
, true);
5464 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5465 respond_to_request(mdr
, -CEPHFS_EROFS
);
5468 if (!cur
->is_file()) {
5469 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5472 if (cur
->get_projected_inode()->size
||
5473 cur
->get_projected_inode()->truncate_seq
> 1) {
5474 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
5479 file_layout_t layout
= cur
->get_projected_inode()->layout
;
5480 // save existing layout for later
5481 const auto old_layout
= layout
;
5483 int access
= MAY_WRITE
;
5485 if (req
->head
.args
.setlayout
.layout
.fl_object_size
> 0)
5486 layout
.object_size
= req
->head
.args
.setlayout
.layout
.fl_object_size
;
5487 if (req
->head
.args
.setlayout
.layout
.fl_stripe_unit
> 0)
5488 layout
.stripe_unit
= req
->head
.args
.setlayout
.layout
.fl_stripe_unit
;
5489 if (req
->head
.args
.setlayout
.layout
.fl_stripe_count
> 0)
5490 layout
.stripe_count
=req
->head
.args
.setlayout
.layout
.fl_stripe_count
;
5491 if (req
->head
.args
.setlayout
.layout
.fl_pg_pool
> 0) {
5492 layout
.pool_id
= req
->head
.args
.setlayout
.layout
.fl_pg_pool
;
5494 // make sure we have as new a map as the client
5495 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
5496 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
5501 // Don't permit layout modifications without 'p' caps
5502 if (layout
!= old_layout
) {
5503 access
|= MAY_SET_VXATTR
;
5506 if (!layout
.is_valid()) {
5507 dout(10) << "bad layout" << dendl
;
5508 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5511 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
5512 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
5513 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5517 MutationImpl::LockOpVec lov
;
5518 lov
.add_xlock(&cur
->filelock
);
5519 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5522 if (!check_access(mdr
, cur
, access
))
5526 auto pi
= cur
->project_inode(mdr
);
5527 pi
.inode
->layout
= layout
;
5528 // add the old pool to the inode
5529 pi
.inode
->add_old_pool(old_layout
.pool_id
);
5530 pi
.inode
->version
= cur
->pre_dirty();
5531 pi
.inode
->ctime
= mdr
->get_op_stamp();
5532 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
5533 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
5534 pi
.inode
->change_attr
++;
5537 mdr
->ls
= mdlog
->get_current_segment();
5538 EUpdate
*le
= new EUpdate(mdlog
, "setlayout");
5539 mdlog
->start_entry(le
);
5540 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5541 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5542 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5544 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
5547 bool Server::xlock_policylock(MDRequestRef
& mdr
, CInode
*in
, bool want_layout
, bool xlock_snaplock
)
5549 if (mdr
->locking_state
& MutationImpl::ALL_LOCKED
)
5552 MutationImpl::LockOpVec lov
;
5553 lov
.add_xlock(&in
->policylock
);
5555 lov
.add_xlock(&in
->snaplock
);
5557 lov
.add_rdlock(&in
->snaplock
);
5558 if (!mds
->locker
->acquire_locks(mdr
, lov
))
5561 if (want_layout
&& in
->get_projected_inode()->has_layout()) {
5562 mdr
->dir_layout
= in
->get_projected_inode()->layout
;
5563 want_layout
= false;
5565 if (CDentry
*pdn
= in
->get_projected_parent_dn(); pdn
) {
5566 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
, 0, want_layout
))
5570 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
5574 CInode
* Server::try_get_auth_inode(MDRequestRef
& mdr
, inodeno_t ino
)
5576 CInode
*in
= mdcache
->get_inode(ino
);
5577 if (!in
|| in
->state_test(CInode::STATE_PURGING
)) {
5578 respond_to_request(mdr
, -CEPHFS_ESTALE
);
5581 if (!in
->is_auth()) {
5582 mdcache
->request_forward(mdr
, in
->authority().first
);
5589 void Server::handle_client_setdirlayout(MDRequestRef
& mdr
)
5591 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5593 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5594 CInode
*cur
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
5598 if (!cur
->is_dir()) {
5599 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
5603 if (!xlock_policylock(mdr
, cur
, true))
5607 const auto& old_pi
= cur
->get_projected_inode();
5608 file_layout_t layout
;
5609 if (old_pi
->has_layout())
5610 layout
= old_pi
->layout
;
5611 else if (mdr
->dir_layout
!= file_layout_t())
5612 layout
= mdr
->dir_layout
;
5614 layout
= mdcache
->default_file_layout
;
5616 // Level of access required to complete
5617 int access
= MAY_WRITE
;
5619 const auto old_layout
= layout
;
5621 if (req
->head
.args
.setlayout
.layout
.fl_object_size
> 0)
5622 layout
.object_size
= req
->head
.args
.setlayout
.layout
.fl_object_size
;
5623 if (req
->head
.args
.setlayout
.layout
.fl_stripe_unit
> 0)
5624 layout
.stripe_unit
= req
->head
.args
.setlayout
.layout
.fl_stripe_unit
;
5625 if (req
->head
.args
.setlayout
.layout
.fl_stripe_count
> 0)
5626 layout
.stripe_count
=req
->head
.args
.setlayout
.layout
.fl_stripe_count
;
5627 if (req
->head
.args
.setlayout
.layout
.fl_pg_pool
> 0) {
5628 layout
.pool_id
= req
->head
.args
.setlayout
.layout
.fl_pg_pool
;
5629 // make sure we have as new a map as the client
5630 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
5631 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
5636 if (layout
!= old_layout
) {
5637 access
|= MAY_SET_VXATTR
;
5640 if (!layout
.is_valid()) {
5641 dout(10) << "bad layout" << dendl
;
5642 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5645 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
5646 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
5647 respond_to_request(mdr
, -CEPHFS_EINVAL
);
5651 if (!check_access(mdr
, cur
, access
))
5654 auto pi
= cur
->project_inode(mdr
);
5655 pi
.inode
->layout
= layout
;
5656 pi
.inode
->version
= cur
->pre_dirty();
5659 mdr
->ls
= mdlog
->get_current_segment();
5660 EUpdate
*le
= new EUpdate(mdlog
, "setlayout");
5661 mdlog
->start_entry(le
);
5662 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5663 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5664 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5666 mdr
->no_early_reply
= true;
5667 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
5671 int Server::parse_layout_vxattr_json(
5672 string name
, string value
, const OSDMap
& osdmap
, file_layout_t
*layout
)
5674 auto parse_pool
= [&](std::string pool_name
, int64_t pool_id
) -> int64_t {
5675 if (pool_name
!= "") {
5676 int64_t _pool_id
= osdmap
.lookup_pg_pool_name(pool_name
);
5678 dout(10) << __func__
<< ": unknown pool name:" << pool_name
<< dendl
;
5679 return -CEPHFS_EINVAL
;
5682 } else if (pool_id
>= 0) {
5683 const auto pools
= osdmap
.get_pools();
5684 if (pools
.find(pool_id
) == pools
.end()) {
5685 dout(10) << __func__
<< ": unknown pool id:" << pool_id
<< dendl
;
5686 return -CEPHFS_EINVAL
;
5690 return -CEPHFS_EINVAL
;
5695 if (name
== "layout.json") {
5696 JSONParser json_parser
;
5697 if (json_parser
.parse(value
.c_str(), value
.length()) and json_parser
.is_object()) {
5700 field
= "object_size";
5701 JSONDecoder::decode_json("object_size", layout
->object_size
, &json_parser
, true);
5703 field
= "stripe_unit";
5704 JSONDecoder::decode_json("stripe_unit", layout
->stripe_unit
, &json_parser
, true);
5706 field
= "stripe_count";
5707 JSONDecoder::decode_json("stripe_count", layout
->stripe_count
, &json_parser
, true);
5709 field
= "pool_namespace";
5710 JSONDecoder::decode_json("pool_namespace", layout
->pool_ns
, &json_parser
, false);
5713 int64_t pool_id
= 0;
5714 JSONDecoder::decode_json("pool_id", pool_id
, &json_parser
, false);
5716 field
= "pool_name";
5717 std::string pool_name
;
5718 JSONDecoder::decode_json("pool_name", pool_name
, &json_parser
, false);
5720 pool_id
= parse_pool(pool_name
, pool_id
);
5722 return (int)pool_id
;
5724 layout
->pool_id
= pool_id
;
5725 } catch (JSONDecoder::err
&) {
5726 dout(10) << __func__
<< ": json is missing a mandatory field named "
5728 return -CEPHFS_EINVAL
;
5731 dout(10) << __func__
<< ": bad json" << dendl
;
5732 return -CEPHFS_EINVAL
;
5735 dout(10) << __func__
<< ": unknown layout vxattr " << name
<< dendl
;
5736 return -CEPHFS_ENODATA
; // no such attribute
5738 } catch (boost::bad_lexical_cast
const&) {
5739 dout(10) << __func__
<< ": bad vxattr value:" << value
5740 << ", unable to parse for xattr:" << name
<< dendl
;
5741 return -CEPHFS_EINVAL
;
5746 // parse old style layout string
5747 int Server::parse_layout_vxattr_string(
5748 string name
, string value
, const OSDMap
& osdmap
, file_layout_t
*layout
)
5751 if (name
== "layout") {
5752 string::iterator begin
= value
.begin();
5753 string::iterator end
= value
.end();
5754 keys_and_values
<string::iterator
> p
; // create instance of parser
5755 std::map
<string
, string
> m
; // map to receive results
5756 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
5757 return -CEPHFS_EINVAL
;
5759 string
left(begin
, end
);
5760 dout(10) << __func__
<< ": parsed " << m
<< " left '" << left
<< "'" << dendl
;
5762 return -CEPHFS_EINVAL
;
5763 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
5764 // Skip validation on each attr, we do it once at the end (avoid
5765 // rejecting intermediate states if the overall result is ok)
5766 int r
= parse_layout_vxattr_string(string("layout.") + q
->first
, q
->second
,
5771 } else if (name
== "layout.object_size") {
5772 layout
->object_size
= boost::lexical_cast
<unsigned>(value
);
5773 } else if (name
== "layout.stripe_unit") {
5774 layout
->stripe_unit
= boost::lexical_cast
<unsigned>(value
);
5775 } else if (name
== "layout.stripe_count") {
5776 layout
->stripe_count
= boost::lexical_cast
<unsigned>(value
);
5777 } else if (name
== "layout.pool") {
5779 layout
->pool_id
= boost::lexical_cast
<unsigned>(value
);
5780 } catch (boost::bad_lexical_cast
const&) {
5781 int64_t pool
= osdmap
.lookup_pg_pool_name(value
);
5783 dout(10) << __func__
<< ": unknown pool " << value
<< dendl
;
5784 return -CEPHFS_ENOENT
;
5786 layout
->pool_id
= pool
;
5788 } else if (name
== "layout.pool_id") {
5789 layout
->pool_id
= boost::lexical_cast
<int64_t>(value
);
5790 } else if (name
== "layout.pool_name") {
5791 layout
->pool_id
= osdmap
.lookup_pg_pool_name(value
);
5792 if (layout
->pool_id
< 0) {
5793 dout(10) << __func__
<< ": unknown pool " << value
<< dendl
;
5794 return -CEPHFS_EINVAL
;
5796 } else if (name
== "layout.pool_namespace") {
5797 layout
->pool_ns
= value
;
5799 dout(10) << __func__
<< ": unknown layout vxattr " << name
<< dendl
;
5800 return -CEPHFS_ENODATA
; // no such attribute
5802 } catch (boost::bad_lexical_cast
const&) {
5803 dout(10) << __func__
<< ": bad vxattr value, unable to parse int for "
5805 return -CEPHFS_EINVAL
;
5810 int Server::parse_layout_vxattr(string name
, string value
, const OSDMap
& osdmap
,
5811 file_layout_t
*layout
, bool validate
)
5813 dout(20) << __func__
<< ": name:" << name
<< " value:'" << value
<< "'" << dendl
;
5816 if (name
== "layout.json") {
5817 r
= parse_layout_vxattr_json(name
, value
, osdmap
, layout
);
5819 r
= parse_layout_vxattr_string(name
, value
, osdmap
, layout
);
5825 if (validate
&& !layout
->is_valid()) {
5826 dout(10) << __func__
<< ": bad layout" << dendl
;
5827 return -CEPHFS_EINVAL
;
5829 if (!mds
->mdsmap
->is_data_pool(layout
->pool_id
)) {
5830 dout(10) << __func__
<< ": invalid data pool " << layout
->pool_id
<< dendl
;
5831 return -CEPHFS_EINVAL
;
5836 int Server::parse_quota_vxattr(string name
, string value
, quota_info_t
*quota
)
5838 dout(20) << "parse_quota_vxattr name " << name
<< " value '" << value
<< "'" << dendl
;
5840 if (name
== "quota") {
5841 string::iterator begin
= value
.begin();
5842 string::iterator end
= value
.end();
5844 // keep quota unchanged. (for create_quota_realm())
5847 keys_and_values
<string::iterator
> p
; // create instance of parser
5848 std::map
<string
, string
> m
; // map to receive results
5849 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
5850 return -CEPHFS_EINVAL
;
5852 string
left(begin
, end
);
5853 dout(10) << " parsed " << m
<< " left '" << left
<< "'" << dendl
;
5855 return -CEPHFS_EINVAL
;
5856 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
5857 int r
= parse_quota_vxattr(string("quota.") + q
->first
, q
->second
, quota
);
5861 } else if (name
== "quota.max_bytes") {
5862 int64_t q
= boost::lexical_cast
<int64_t>(value
);
5864 return -CEPHFS_EINVAL
;
5865 quota
->max_bytes
= q
;
5866 } else if (name
== "quota.max_files") {
5867 int64_t q
= boost::lexical_cast
<int64_t>(value
);
5869 return -CEPHFS_EINVAL
;
5870 quota
->max_files
= q
;
5872 dout(10) << " unknown quota vxattr " << name
<< dendl
;
5873 return -CEPHFS_EINVAL
;
5875 } catch (boost::bad_lexical_cast
const&) {
5876 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
5877 return -CEPHFS_EINVAL
;
5880 if (!quota
->is_valid()) {
5881 dout(10) << "bad quota" << dendl
;
5882 return -CEPHFS_EINVAL
;
5887 void Server::create_quota_realm(CInode
*in
)
5889 dout(10) << __func__
<< " " << *in
<< dendl
;
5891 auto req
= make_message
<MClientRequest
>(CEPH_MDS_OP_SETXATTR
);
5892 req
->set_filepath(filepath(in
->ino()));
5893 req
->set_string2("ceph.quota");
5894 // empty vxattr value
5895 req
->set_tid(mds
->issue_tid());
5897 mds
->send_message_mds(req
, in
->authority().first
);
5901 * Verify that the file layout attribute carried by client
5902 * is well-formatted.
5903 * Return 0 on success, otherwise this function takes
5904 * responsibility for the passed mdr.
5906 int Server::check_layout_vxattr(MDRequestRef
& mdr
,
5909 file_layout_t
*layout
)
5911 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
5915 mds
->objecter
->with_osdmap([&](const OSDMap
& osdmap
) {
5916 r
= parse_layout_vxattr(name
, value
, osdmap
, layout
);
5917 epoch
= osdmap
.get_epoch();
5920 if (r
== -CEPHFS_ENOENT
) {
5922 // we don't have the specified pool, make sure our map
5923 // is newer than or as new as the client.
5924 epoch_t req_epoch
= req
->get_osdmap_epoch();
5926 if (req_epoch
> epoch
) {
5928 // well, our map is older. consult mds.
5929 auto fin
= new C_IO_Wrapper(mds
, new C_MDS_RetryRequest(mdcache
, mdr
));
5931 mds
->objecter
->wait_for_map(req_epoch
, lambdafy(fin
));
5933 } else if (req_epoch
== 0 && !mdr
->waited_for_osdmap
) {
5935 // For compatibility with client w/ old code, we still need get the
5936 // latest map. One day if COMPACT_VERSION of MClientRequest >=3,
5937 // we can remove those code.
5938 mdr
->waited_for_osdmap
= true;
5939 mds
->objecter
->wait_for_latest_osdmap(std::ref(*new C_IO_Wrapper(
5940 mds
, new C_MDS_RetryRequest(mdcache
, mdr
))));
5947 if (r
== -CEPHFS_ENOENT
)
5950 respond_to_request(mdr
, r
);
// Handle a setxattr on a CephFS virtual xattr (ceph.dir.layout,
// ceph.file.layout, ceph.quota, ceph.dir.subvolume, ceph.dir.pin*).
// Validates the value, takes the needed locks, projects the inode update,
// and journals it.  Takes responsibility for mdr (replies on every path).
// NOTE(review): this extract has dropped lines throughout (opening/closing
// braces, early "return"s after respond_to_request, "else"/"try" keywords,
// local declarations of rest/val/rank, and the &quota argument on the
// parse_quota_vxattr call which appears garbled below) — restore from
// upstream before use.
void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur)
  const cref_t<MClientRequest> &req = mdr->client_request;
  MutationImpl::LockOpVec lov;
  string name(req->get_path2());          // vxattr name comes in path2
  bufferlist bl = req->get_data();
  string value (bl.c_str(), bl.length()); // vxattr value is the request payload
  dout(10) << "handle_set_vxattr " << name
           << " val " << value.length()
           << " bytes on " << *cur
  // Points at the projected inode of whichever branch applied a change;
  // the common journaling tail below stamps ctime/version through it.
  CInode::mempool_inode *pip = nullptr;
  if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
  bool adjust_realm = false;              // set when a snaprealm must be opened/changed
  if (name.compare(0, 15, "ceph.dir.layout") == 0) {
    if (!cur->is_dir()) {
      respond_to_request(mdr, -CEPHFS_EINVAL);
    if (!xlock_policylock(mdr, cur, true))
    /* We need 'As' caps for the fscrypt context */
    lov.add_xlock(&cur->authlock);
    if (!mds->locker->acquire_locks(mdr, lov)) {
    /* encrypted directories can't have their layout changed */
    if (!cur->get_inode()->fscrypt_auth.empty()) {
      respond_to_request(mdr, -CEPHFS_EINVAL);
    // Start from the directory's own layout, else the inherited dir layout,
    // else the filesystem default, then overlay the requested field(s).
    file_layout_t layout;
    if (cur->get_projected_inode()->has_layout())
      layout = cur->get_projected_inode()->layout;
    else if (mdr->dir_layout != file_layout_t())
      layout = mdr->dir_layout;
      layout = mdcache->default_file_layout;
    rest = name.substr(name.find("layout"));
    if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
    auto pi = cur->project_inode(mdr);
    pi.inode->layout = layout;
    mdr->no_early_reply = true;
    pip = pi.inode.get();
  } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
    if (!cur->is_file()) {
      respond_to_request(mdr, -CEPHFS_EINVAL);
    // Layout can only change while the file has never been written/truncated.
    if (cur->get_projected_inode()->size ||
        cur->get_projected_inode()->truncate_seq > 1) {
      respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
    file_layout_t layout = cur->get_projected_inode()->layout;
    rest = name.substr(name.find("layout"));
    if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
    lov.add_xlock(&cur->filelock);
    if (!mds->locker->acquire_locks(mdr, lov))
    /* encrypted files can't have their layout changed */
    if (!cur->get_inode()->fscrypt_auth.empty()) {
      respond_to_request(mdr, -CEPHFS_EINVAL);
    auto pi = cur->project_inode(mdr);
    // Remember the previous pool so backtraces there stay cleanable.
    int64_t old_pool = pi.inode->layout.pool_id;
    pi.inode->add_old_pool(old_pool);
    pi.inode->layout = layout;
    pip = pi.inode.get();
  } else if (name.compare(0, 10, "ceph.quota") == 0) {
    if (!cur->is_dir()) {
      respond_to_request(mdr, -CEPHFS_EINVAL);
    quota_info_t quota = cur->get_projected_inode()->quota;
    rest = name.substr(name.find("quota"));
    // NOTE(review): third argument garbled by extraction; upstream passes &quota.
    int r = parse_quota_vxattr(rest, value, "a);
      respond_to_request(mdr, r);
    // Enabling quota on a dir with no snaprealm node requires opening one.
    if (quota.is_enabled() && !cur->get_projected_srnode())
      adjust_realm = true;
    if (!xlock_policylock(mdr, cur, false, adjust_realm))
    if (cur->get_projected_inode()->quota == quota) {
      respond_to_request(mdr, 0);
    auto pi = cur->project_inode(mdr, false, adjust_realm);
    pi.inode->quota = quota;
    pi.snapnode->created = pi.snapnode->seq = cur->find_snaprealm()->get_newest_seq();
    mdr->no_early_reply = true;
    pip = pi.inode.get();
    // Tell other clients about the new quota, excluding the requester
    // (it learns via the reply).
    client_t exclude_ct = mdr->get_client();
    mdcache->broadcast_quota_to_client(cur, exclude_ct, true);
  } else if (name == "ceph.dir.subvolume"sv) {
    if (!cur->is_dir()) {
      respond_to_request(mdr, -CEPHFS_EINVAL);
      val = boost::lexical_cast<bool>(value);
    } catch (boost::bad_lexical_cast const&) {
      dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
      respond_to_request(mdr, -CEPHFS_EINVAL);
    /* Verify it's not already a subvolume with lighter weight
    // First pass: answer no-op requests under a cheap rdlock before
    // escalating to xlocks.
    if (!mdr->more()->rdonly_checks) {
      if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
        lov.add_rdlock(&cur->snaplock);
        if (!mds->locker->acquire_locks(mdr, lov))
        mdr->locking_state |= MutationImpl::ALL_LOCKED;
      const auto srnode = cur->get_projected_srnode();
      if (val == (srnode && srnode->is_subvolume())) {
        dout(20) << "already marked subvolume" << dendl;
        respond_to_request(mdr, 0);
      mdr->more()->rdonly_checks = true;
    if ((mdr->locking_state & MutationImpl::ALL_LOCKED) && !mdr->is_xlocked(&cur->snaplock)) {
      /* drop the rdlock and acquire xlocks */
      dout(20) << "dropping rdlocks" << dendl;
      mds->locker->drop_locks(mdr.get());
      if (!xlock_policylock(mdr, cur, false, true))
    /* repeat rdonly checks in case changed between rdlock -> xlock */
    SnapRealm *realm = cur->find_snaprealm();
    inodeno_t subvol_ino = realm->get_subvolume_ino();
    // can't create subvolume inside another subvolume
    if (subvol_ino && subvol_ino != cur->ino()) {
      respond_to_request(mdr, -CEPHFS_EINVAL);
    const auto srnode = cur->get_projected_srnode();
    if (val == (srnode && srnode->is_subvolume())) {
      respond_to_request(mdr, 0);
    auto pi = cur->project_inode(mdr, false, true);
    pi.snapnode->created = pi.snapnode->seq = realm->get_newest_seq();
    // NOTE(review): upstream guards these two on "val"; the if/else was
    // dropped by the extraction.
    pi.snapnode->mark_subvolume();
    pi.snapnode->clear_subvolume();
    mdr->no_early_reply = true;
    pip = pi.inode.get();
    adjust_realm = true;
  } else if (name == "ceph.dir.pin"sv) {
    if (!cur->is_dir() || cur->is_root()) {
      respond_to_request(mdr, -CEPHFS_EINVAL);
      rank = boost::lexical_cast<mds_rank_t>(value);
      // Negative means "unpin"; out-of-range positive ranks are rejected.
      if (rank < 0) rank = MDS_RANK_NONE;
      else if (rank >= MAX_MDS) {
        respond_to_request(mdr, -CEPHFS_EDOM);
    } catch (boost::bad_lexical_cast const&) {
      dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
      respond_to_request(mdr, -CEPHFS_EINVAL);
    if (!xlock_policylock(mdr, cur))
    auto pi = cur->project_inode(mdr);
    cur->set_export_pin(rank);
    pip = pi.inode.get();
  } else if (name == "ceph.dir.pin.random"sv) {
    if (!cur->is_dir() || cur->is_root()) {
      respond_to_request(mdr, -CEPHFS_EINVAL);
      val = boost::lexical_cast<double>(value);
    } catch (boost::bad_lexical_cast const&) {
      dout(10) << "bad vxattr value, unable to parse float for " << name << dendl;
      respond_to_request(mdr, -CEPHFS_EINVAL);
    // Probability must lie in [0,1] and below the configured cap.
    if (val < 0.0 || 1.0 < val) {
      respond_to_request(mdr, -CEPHFS_EDOM);
    } else if (mdcache->export_ephemeral_random_max < val) {
      respond_to_request(mdr, -CEPHFS_EINVAL);
    if (!xlock_policylock(mdr, cur))
    auto pi = cur->project_inode(mdr);
    cur->setxattr_ephemeral_rand(val);
    pip = pi.inode.get();
  } else if (name == "ceph.dir.pin.distributed"sv) {
    if (!cur->is_dir() || cur->is_root()) {
      respond_to_request(mdr, -CEPHFS_EINVAL);
      val = boost::lexical_cast<bool>(value);
    } catch (boost::bad_lexical_cast const&) {
      dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
      respond_to_request(mdr, -CEPHFS_EINVAL);
    if (!xlock_policylock(mdr, cur))
    auto pi = cur->project_inode(mdr);
    cur->setxattr_ephemeral_dist(val);
    pip = pi.inode.get();
    // Unknown vxattr name: reject.  (The enclosing "} else {" was dropped
    // by the extraction.)
    dout(10) << " unknown vxattr " << name << dendl;
    respond_to_request(mdr, -CEPHFS_EINVAL);
  // Common tail: stamp the projected inode and journal the update.
  pip->ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pip->rstat.rctime)
    pip->rstat.rctime = mdr->get_op_stamp();
  pip->version = cur->pre_dirty();
  pip->update_backtrace();
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
                    false, false, adjust_realm));
// Handle a removexattr on a CephFS virtual xattr.  Only ceph.dir.layout is
// truly removable; the pool_namespace fields are routed back through
// handle_set_vxattr with an empty payload.  Takes responsibility for mdr.
// NOTE(review): early "return"s after respond_to_request and several closing
// braces were dropped by the extraction — restore from upstream before use.
void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur)
  const cref_t<MClientRequest> &req = mdr->client_request;
  string name(req->get_path2());
  dout(10) << __func__ << " " << name << " on " << *cur << dendl;
  if (name == "ceph.dir.layout") {
    if (!cur->is_dir()) {
      respond_to_request(mdr, -CEPHFS_ENODATA);
    if (cur->is_root()) {
      dout(10) << "can't remove layout policy on the root directory" << dendl;
      respond_to_request(mdr, -CEPHFS_EINVAL);
    if (!cur->get_projected_inode()->has_layout()) {
      respond_to_request(mdr, -CEPHFS_ENODATA);
    MutationImpl::LockOpVec lov;
    lov.add_xlock(&cur->policylock);
    if (!mds->locker->acquire_locks(mdr, lov))
    auto pi = cur->project_inode(mdr);
    pi.inode->clear_layout();
    pi.inode->version = cur->pre_dirty();
    // Journal the policy change.
    mdr->ls = mdlog->get_current_segment();
    EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
    mdlog->start_entry(le);
    le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
    mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
    mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
    mdr->no_early_reply = true;
    journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
  } else if (name == "ceph.dir.layout.pool_namespace"
             || name == "ceph.file.layout.pool_namespace") {
    // Namespace is the only layout field that has a meaningful
    // null/none value (empty string, means default layout). Is equivalent
    // to a setxattr with empty string: pass through the empty payload of
    // the rmxattr request to do this.
    handle_set_vxattr(mdr, cur);
  // Any other vxattr name: nothing to remove.
  respond_to_request(mdr, -CEPHFS_ENODATA);
// Dispatch table mapping xattr names to validate/set/remove handlers.
// DEFAULT_HANDLER is the fallback entry used by
// get_xattr_or_default_handler() when no name matches.
// NOTE(review): the per-entry "{" / "}," braces were dropped by the
// extraction — restore from upstream before use.
const Server::XattrHandler Server::xattr_handlers[] = {
    xattr_name: Server::DEFAULT_HANDLER,
    description: "default xattr handler",
    validate: &Server::default_xattr_validate,
    setxattr: &Server::default_setxattr_handler,
    removexattr: &Server::default_removexattr_handler,
    xattr_name: "ceph.mirror.info",
    description: "mirror info xattr handler",
    validate: &Server::mirror_info_xattr_validate,
    setxattr: &Server::mirror_info_setxattr_handler,
    removexattr: &Server::mirror_info_removexattr_handler
// Look up the handler registered for xattr_name in xattr_handlers[];
// fall back to the DEFAULT_HANDLER entry (asserted to exist exactly once)
// when no specific handler matches.
// NOTE(review): the "return &handler;" inside the match branch and some
// closing braces were dropped by the extraction.
const Server::XattrHandler * Server::get_xattr_or_default_handler(std::string_view xattr_name) {
  const XattrHandler *default_xattr_handler = nullptr;
  for (auto &handler : xattr_handlers) {
    if (handler.xattr_name == Server::DEFAULT_HANDLER) {
      // Exactly one default entry is allowed in the table.
      ceph_assert(default_xattr_handler == nullptr);
      default_xattr_handler = &handler;
    if (handler.xattr_name == xattr_name) {
      dout(20) << "handler=" << handler.description << dendl;
  ceph_assert(default_xattr_handler != nullptr);
  dout(20) << "handler=" << default_xattr_handler->description << dendl;
  return default_xattr_handler;
// Common validation for plain xattr operations.
// For SETXATTR: enforce CEPH_XATTR_CREATE (name must not exist) and
// CEPH_XATTR_REPLACE (name must exist).  For RMXATTR: the name must exist.
// Returns 0 on success (the "return 0"s were dropped by the extraction),
// -CEPHFS_EEXIST / -CEPHFS_ENODATA on the checks above, and
// -CEPHFS_EINVAL for an op this function does not understand.
int Server::xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
                           const std::string &xattr_name, int op, int flags) {
  if (op == CEPH_MDS_OP_SETXATTR) {
    // NOTE(review): upstream guards this count() with "xattrs &&" like the
    // REPLACE branch below; confirm against upstream when restoring.
    if ((flags & CEPH_XATTR_CREATE) && xattrs->count(mempool::mds_co::string(xattr_name))) {
      dout(10) << "setxattr '" << xattr_name << "' XATTR_CREATE and CEPHFS_EEXIST on " << *cur << dendl;
      return -CEPHFS_EEXIST;
    if ((flags & CEPH_XATTR_REPLACE) && !(xattrs && xattrs->count(mempool::mds_co::string(xattr_name)))) {
      dout(10) << "setxattr '" << xattr_name << "' XATTR_REPLACE and CEPHFS_ENODATA on " << *cur << dendl;
      return -CEPHFS_ENODATA;
  if (op == CEPH_MDS_OP_RMXATTR) {
    if (!xattrs || xattrs->count(mempool::mds_co::string(xattr_name)) == 0) {
      dout(10) << "removexattr '" << xattr_name << "' and CEPHFS_ENODATA on " << *cur << dendl;
      return -CEPHFS_ENODATA;
  derr << ": unhandled validation for: " << xattr_name << dendl;
  return -CEPHFS_EINVAL;
// Insert or overwrite xattr_name in the given xattr map, copying
// xattr_value into a freshly-allocated contiguous bufferptr.
// NOTE(review): upstream wraps the copy in "if (len)" and the final
// assignment in "if (!em.second)" (overwrite when the key already existed);
// both guard lines were dropped by the extraction.
void Server::xattr_set(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name,
                       const bufferlist &xattr_value) {
  size_t len = xattr_value.length();
  bufferptr b = buffer::create(len);
  xattr_value.begin().copy(len, b.c_str());
  // emplace avoids constructing the value when the key already exists.
  auto em = xattrs->emplace(std::piecewise_construct,
                            std::forward_as_tuple(mempool::mds_co::string(xattr_name)),
                            std::forward_as_tuple(b));
  em.first->second = b;
// Remove xattr_name from the map; erasing a missing key is a no-op.
// (Closing brace dropped by the extraction.)
void Server::xattr_rm(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name) {
  xattrs->erase(mempool::mds_co::string(xattr_name));
// Default validate handler: unpack the XattrOp and defer to the common
// xattr_validate().  (Closing brace dropped by the extraction.)
int Server::default_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
                                   XattrOp *xattr_op) {
  return xattr_validate(cur, xattrs, xattr_op->xattr_name, xattr_op->op, xattr_op->flags);
// Default setxattr handler: store the raw value under the requested name.
// (Closing brace dropped by the extraction.)
void Server::default_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
                                      const XattrOp &xattr_op) {
  xattr_set(xattrs, xattr_op.xattr_name, xattr_op.xattr_value);
// Default removexattr handler: erase the requested name.
// (Closing brace dropped by the extraction.)
void Server::default_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
                                         const XattrOp &xattr_op) {
  xattr_rm(xattrs, xattr_op.xattr_name);
// mirror info xattr handlers
// Regex capturing cluster_id (a UUID) and fs_id from a "ceph.mirror.info"
// value.  NOTE(review): the final fragment of the pattern (the fs_id
// capture group and the terminating ";") was dropped by the extraction.
const std::string Server::MirrorXattrInfo::MIRROR_INFO_REGEX = "^cluster_id=([a-f0-9]{8}-" \
                                                               "[a-f0-9]{4}-[a-f0-9]{4}-" \
                                                               "[a-f0-9]{4}-[a-f0-9]{12})" \
// Internal xattr names the mirror handler fans a single set/remove out to.
const std::string Server::MirrorXattrInfo::CLUSTER_ID = "ceph.mirror.info.cluster_id";
const std::string Server::MirrorXattrInfo::FS_ID = "ceph.mirror.info.fs_id";
// Parse a "ceph.mirror.info" value of the form
// "cluster_id=<uuid> fs_id=<id>" into its two output parameters using
// MIRROR_INFO_REGEX.  Returns -CEPHFS_EINVAL when the value does not
// produce the full match plus two capture groups.
// NOTE(review): the declaration of the match object, the fs_id assignment,
// the "return 0;" and closing braces were dropped by the extraction.
int Server::parse_mirror_info_xattr(const std::string &name, const std::string &value,
                                    std::string &cluster_id, std::string &fs_id) {
  dout(20) << "parsing name=" << name << ", value=" << value << dendl;
  // Compiled once; the pattern string is a class constant.
  static const std::regex regex(Server::MirrorXattrInfo::MIRROR_INFO_REGEX);
  std::regex_search(value, match, regex);
  if (match.size() != 3) {
    derr << "mirror info parse error" << dendl;
    return -CEPHFS_EINVAL;
  cluster_id = match[1];
  dout(20) << " parsed cluster_id=" << cluster_id << ", fs_id=" << fs_id << dendl;
// Validate a mirror-info xattr op: only allowed on the filesystem root,
// and the two underlying internal xattrs (cluster_id, fs_id) must be in a
// consistent state.  On a successful SETXATTR, parses the value and stores
// the result in xattr_op->xinfo for the setxattr handler to consume.
// NOTE(review): several guard conditions, early returns and closing braces
// (including the consistency check "if (v1 != v2)" that upstream places
// before the derr below) were dropped by the extraction.
int Server::mirror_info_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
                                       XattrOp *xattr_op) {
  if (!cur->is_root()) {
    return -CEPHFS_EINVAL;
  // Validate both internal xattrs; they must agree (both present or both
  // absent) for the op to make sense.
  int v1 = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::CLUSTER_ID, xattr_op->op, xattr_op->flags);
  int v2 = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::FS_ID, xattr_op->op, xattr_op->flags);
  derr << "inconsistent mirror info state (" << v1 << "," << v2 << ")" << dendl;
  return -CEPHFS_EINVAL;
  if (xattr_op->op == CEPH_MDS_OP_RMXATTR) {
  std::string cluster_id;
  // NOTE(review): the trailing arguments (cluster_id, fs_id) and the error
  // check on r were dropped by the extraction.
  int r = parse_mirror_info_xattr(xattr_op->xattr_name, xattr_op->xattr_value.to_str(),
  // Hand the parsed fields to mirror_info_setxattr_handler via xinfo.
  xattr_op->xinfo = std::make_unique<MirrorXattrInfo>(cluster_id, fs_id);
// Setxattr handler for "ceph.mirror.info": fan the parsed cluster_id and
// fs_id (prepared by mirror_info_xattr_validate in xattr_op.xinfo) out into
// the two internal xattrs.
// NOTE(review): upstream declares a bufferlist and clears it between the
// two appends; those lines were dropped by the extraction.
void Server::mirror_info_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
                                          const XattrOp &xattr_op) {
  auto mirror_info = dynamic_cast<MirrorXattrInfo&>(*(xattr_op.xinfo));
  bl.append(mirror_info.cluster_id.c_str(), mirror_info.cluster_id.length());
  xattr_set(xattrs, Server::MirrorXattrInfo::CLUSTER_ID, bl);
  bl.append(mirror_info.fs_id.c_str(), mirror_info.fs_id.length());
  xattr_set(xattrs, Server::MirrorXattrInfo::FS_ID, bl);
// Removexattr handler for "ceph.mirror.info": remove both internal xattrs.
// (Closing brace dropped by the extraction.)
void Server::mirror_info_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
                                             const XattrOp &xattr_op) {
  xattr_rm(xattrs, Server::MirrorXattrInfo::CLUSTER_ID);
  xattr_rm(xattrs, Server::MirrorXattrInfo::FS_ID);
// Handle CEPH_MDS_OP_SETXATTR: dispatch virtual xattrs to
// handle_set_vxattr, otherwise lock the inode, validate via the registered
// handler, enforce the total xattr-size limit, apply the change to the
// projected xattr map and journal it.  Takes responsibility for mdr.
// NOTE(review): early "return"s after the error replies and several braces
// (including the null-checks on cur and the "else" before the setxattr
// invoke) were dropped by the extraction.
void Server::handle_client_setxattr(MDRequestRef& mdr)
  const cref_t<MClientRequest> &req = mdr->client_request;
  string name(req->get_path2());
  // is a ceph virtual xattr?
  if (is_ceph_vxattr(name)) {
    // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
    CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
    handle_set_vxattr(mdr, cur);
  if (!is_allowed_ceph_xattr(name)) {
    respond_to_request(mdr, -CEPHFS_EINVAL);
  CInode *cur = rdlock_path_pin_ref(mdr, true);
  // Snapshots are read-only.
  if (mdr->snapid != CEPH_NOSNAP) {
    respond_to_request(mdr, -CEPHFS_EROFS);
  int flags = req->head.args.setxattr.flags;
  MutationImpl::LockOpVec lov;
  lov.add_xlock(&cur->xattrlock);
  if (!mds->locker->acquire_locks(mdr, lov))
  if (!check_access(mdr, cur, MAY_WRITE))
  size_t len = req->get_data().length();
  size_t inc = len + name.length();
  auto handler = Server::get_xattr_or_default_handler(name);
  const auto& pxattrs = cur->get_projected_xattrs();
  // check xattrs kv pairs size
  size_t cur_xattrs_size = 0;
  for (const auto& p : *pxattrs) {
    // A REPLACE of an existing key doesn't grow the map; skip its old size.
    if ((flags & CEPH_XATTR_REPLACE) && name.compare(p.first) == 0) {
    cur_xattrs_size += p.first.length() + p.second.length();
  if (((cur_xattrs_size + inc) > g_conf()->mds_max_xattr_pairs_size)) {
    dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
             << cur_xattrs_size << ", inc " << inc << dendl;
    respond_to_request(mdr, -CEPHFS_ENOSPC);
  XattrOp xattr_op(CEPH_MDS_OP_SETXATTR, name, req->get_data(), flags);
  int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
  respond_to_request(mdr, r);
  dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
  // Project the inode, bump versions/timestamps, and apply the xattr change.
  auto pi = cur->project_inode(mdr, true);
  pi.inode->version = cur->pre_dirty();
  pi.inode->ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
    pi.inode->rstat.rctime = mdr->get_op_stamp();
  pi.inode->change_attr++;
  pi.inode->xattr_version++;
  // CEPH_XATTR_REMOVE routes a set request to the remove handler.
  if ((flags & CEPH_XATTR_REMOVE)) {
    std::invoke(handler->removexattr, this, cur, pi.xattrs, xattr_op);
    std::invoke(handler->setxattr, this, cur, pi.xattrs, xattr_op);
  // Journal and reply.
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "setxattr");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
// Handle CEPH_MDS_OP_RMXATTR: dispatch virtual xattrs to
// handle_remove_vxattr, otherwise lock the inode, validate via the
// registered handler, remove from the projected xattr map and journal.
// Takes responsibility for mdr.
// NOTE(review): early "return"s, the null-checks on cur, the bufferlist
// declaration for xattr_op, and several braces were dropped by the
// extraction.
void Server::handle_client_removexattr(MDRequestRef& mdr)
  const cref_t<MClientRequest> &req = mdr->client_request;
  std::string name(req->get_path2());
  // is a ceph virtual xattr?
  if (is_ceph_vxattr(name)) {
    // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
    CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
    handle_remove_vxattr(mdr, cur);
  if (!is_allowed_ceph_xattr(name)) {
    respond_to_request(mdr, -CEPHFS_EINVAL);
  CInode* cur = rdlock_path_pin_ref(mdr, true);
  // Snapshots are read-only.
  if (mdr->snapid != CEPH_NOSNAP) {
    respond_to_request(mdr, -CEPHFS_EROFS);
  MutationImpl::LockOpVec lov;
  lov.add_xlock(&cur->xattrlock);
  if (!mds->locker->acquire_locks(mdr, lov))
  auto handler = Server::get_xattr_or_default_handler(name);
  XattrOp xattr_op(CEPH_MDS_OP_RMXATTR, name, bl, 0);
  const auto& pxattrs = cur->get_projected_xattrs();
  int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
  respond_to_request(mdr, r);
  dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
  // Project the inode, bump versions/timestamps, and drop the xattr.
  auto pi = cur->project_inode(mdr, true);
  pi.inode->version = cur->pre_dirty();
  pi.inode->ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
    pi.inode->rstat.rctime = mdr->get_op_stamp();
  pi.inode->change_attr++;
  pi.inode->xattr_version++;
  std::invoke(handler->removexattr, this, cur, pi.xattrs, xattr_op);
  // Journal and reply.
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "removexattr");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
// Handle CEPH_MDS_OP_GETVXATTR: read-only retrieval of a CephFS virtual
// xattr (layout fields, dir pins).  The value is formatted into a stack
// stringstream, encoded into reply_extra_bl, and returned to the client.
// Takes responsibility for mdr.
// NOTE(review): early "return"s, the declaration/initialization of r, the
// enum body (DEFAULT/SET/INHERITED), the "switch" keyword, lambda-closing
// "};" lines and several braces were dropped by the extraction.
void Server::handle_client_getvxattr(MDRequestRef& mdr)
  const auto& req = mdr->client_request;
  string xattr_name{req->get_path2()};
  // is a ceph virtual xattr?
  if (!is_ceph_vxattr(xattr_name)) {
    respond_to_request(mdr, -CEPHFS_ENODATA);
  CInode *cur = rdlock_path_pin_ref(mdr, true, false);
  // Directory-only vxattrs on a file (or vice versa) have no data.
  if (is_ceph_dir_vxattr(xattr_name)) {
    if (!cur->is_dir()) {
      respond_to_request(mdr, -CEPHFS_ENODATA);
  } else if (is_ceph_file_vxattr(xattr_name)) {
    if (cur->is_dir()) {
      respond_to_request(mdr, -CEPHFS_ENODATA);
  CachedStackStringStream css;
  ceph::bufferlist bl;
  // handle these vxattrs
  if ((xattr_name.substr(0, 15) == "ceph.dir.layout"sv) ||
      (xattr_name.substr(0, 16) == "ceph.file.layout"sv)) {
    std::string layout_field;
    // Helper type pairing a layout with where it came from (default /
    // set on this inode / inherited from an ancestor).
    struct layout_xattr_info_t {
      enum class InheritanceStatus : uint32_t {
      const file_layout_t layout;
      const InheritanceStatus status;
      layout_xattr_info_t(const file_layout_t& l, InheritanceStatus inh)
        : layout(l), status(inh) { }
      static std::string status_to_string(InheritanceStatus status) {
        case InheritanceStatus::DEFAULT: return "default"s;
        case InheritanceStatus::SET: return "set"s;
        case InheritanceStatus::INHERITED: return "inherited"s;
        default: return "unknown"s;
    auto is_default_layout = [&](const file_layout_t& layout) -> bool {
      return (layout == mdcache->default_file_layout);
    // Walk up from cur until an explicit layout is found, classifying it.
    // NOTE(review): the "orig_in" capture and the walk loop header were
    // dropped by the extraction.
    auto get_inherited_layout = [&](CInode *cur) -> layout_xattr_info_t {
      if (cur->get_projected_inode()->has_layout()) {
        auto& curr_layout = cur->get_projected_inode()->layout;
        if (is_default_layout(curr_layout)) {
          return {curr_layout, layout_xattr_info_t::InheritanceStatus::DEFAULT};
        if (cur == orig_in) {
          // we've found a new layout at this inode
          return {curr_layout, layout_xattr_info_t::InheritanceStatus::SET};
        return {curr_layout, layout_xattr_info_t::InheritanceStatus::INHERITED};
      if (cur->is_root()) {
      cur = cur->get_projected_parent_dir()->get_inode();
      // Reaching here means even the root carried no layout — invariant
      // violation, abort loudly.
      mds->clog->error() << "no layout found at root dir!";
      ceph_abort("no layout found at root dir! something is really messed up with layouts!");
    if (xattr_name == "ceph.dir.layout.json"sv ||
        xattr_name == "ceph.file.layout.json"sv) {
      // fetch layout only for valid xattr_name
      const auto lxi = get_inherited_layout(cur);
      // Emit the whole layout as a JSON object.
      *css << "{\"stripe_unit\": " << lxi.layout.stripe_unit
           << ", \"stripe_count\": " << lxi.layout.stripe_count
           << ", \"object_size\": " << lxi.layout.object_size
           << ", \"pool_name\": ";
      mds->objecter->with_osdmap([lxi, &css](const OSDMap& o) {
        if (o.have_pg_pool(lxi.layout.pool_id)) {
          *css << o.get_pool_name(lxi.layout.pool_id);
      *css << ", \"pool_id\": " << (uint64_t)lxi.layout.pool_id;
      *css << ", \"pool_namespace\": \"" << lxi.layout.pool_ns << "\"";
      *css << ", \"inheritance\": \"@"
           << layout_xattr_info_t::status_to_string(lxi.status) << "\"}";
    } else if ((xattr_name == "ceph.dir.layout.pool_name"sv) ||
               (xattr_name == "ceph.file.layout.pool_name"sv)) {
      // fetch layout only for valid xattr_name
      const auto lxi = get_inherited_layout(cur);
      mds->objecter->with_osdmap([lxi, &css](const OSDMap& o) {
        if (o.have_pg_pool(lxi.layout.pool_id)) {
          *css << o.get_pool_name(lxi.layout.pool_id);
    } else if ((xattr_name == "ceph.dir.layout.pool_id"sv) ||
               (xattr_name == "ceph.file.layout.pool_id"sv)) {
      // fetch layout only for valid xattr_name
      const auto lxi = get_inherited_layout(cur);
      *css << (uint64_t)lxi.layout.pool_id;
      r = -CEPHFS_ENODATA; // no such attribute
  } else if (xattr_name.substr(0, 12) == "ceph.dir.pin"sv) {
    if (xattr_name == "ceph.dir.pin"sv) {
      *css << cur->get_projected_inode()->export_pin;
    } else if (xattr_name == "ceph.dir.pin.random"sv) {
      *css << cur->get_projected_inode()->export_ephemeral_random_pin;
    } else if (xattr_name == "ceph.dir.pin.distributed"sv) {
      *css << cur->get_projected_inode()->export_ephemeral_distributed_pin;
      // otherwise respond as invalid request
      // since we only handle ceph vxattrs here
      r = -CEPHFS_ENODATA; // no such attribute
    // otherwise respond as invalid request
    // since we only handle ceph vxattrs here
    r = -CEPHFS_ENODATA; // no such attribute
  // Encode the formatted value for the client reply.
  ENCODE_START(1, 1, bl);
  encode(css->strv(), bl);
  mdr->reply_extra_bl = bl;
  respond_to_request(mdr, r);
6811 // =================================================================
6812 // DIRECTORY and NAMESPACE OPS
6815 // ------------------------------------------------
// Journal-commit completion for mknod/mkdir/symlink/create: links the new
// dentry/inode into the cache, marks everything dirty, notifies replicas
// and clients, and sends the final reply.
// NOTE(review): the member declarations (CDentry *dn; CInode *newi;),
// access specifiers, and several closing braces were dropped by the
// extraction.
class C_MDS_mknod_finish : public ServerLogContext {
  C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
    ServerLogContext(s, r), dn(d), newi(ni) {}
  void finish(int r) override {
    ceph_assert(r == 0);
    // crash current MDS and the replacing MDS will test the journal
    ceph_assert(!g_conf()->mds_kill_skip_replaying_inotable);
    // link the inode
    dn->pop_projected_linkage();
    // be a bit hacky with the inode version, here.. we decrement it
    // just to keep mark_dirty() happy. (we didn't bother projecting
    // a new version of the inode since it's just been created)
    newi->mark_dirty(mdr->ls);
    newi->mark_dirty_parent(mdr->ls, true);
    // mkdir?
    if (newi->is_dir()) {
      CDir *dir = newi->get_dirfrag(frag_t());
      dir->mark_dirty(mdr->ls);
      dir->mark_new(mdr->ls);
    // Tell replica MDSs about the new dentry.
    MDRequestRef null_ref;
    get_mds()->mdcache->send_dentry_link(dn, null_ref);
    if (newi->is_file()) {
      get_mds()->locker->share_inode_max_size(newi);
    } else if (newi->is_dir()) {
      // We do this now so that the linkages on the new directory are stable.
      newi->maybe_ephemeral_rand();
    // hit pop
    get_mds()->balancer->hit_inode(newi, META_POP_IWR);
    // reply
    server->respond_to_request(mdr, 0);
// Handle CEPH_MDS_OP_MKNOD: create a new (typically device or regular)
// inode under the requested dentry, optionally issue write caps for
// regular files, and journal the creation.  Takes responsibility for mdr.
// NOTE(review): early "return"s, null-checks on dn/newi, the "else" before
// the default-layout assignment, and several braces were dropped by the
// extraction.
void Server::handle_client_mknod(MDRequestRef& mdr)
  const cref_t<MClientRequest> &req = mdr->client_request;
  client_t client = mdr->get_client();
  unsigned mode = req->head.args.mknod.mode;
  // Clients that pass no file type get one defaulted (assignment dropped
  // in this extract).
  if ((mode & S_IFMT) == 0)
  mdr->disable_lock_cache();
  CDentry *dn = rdlock_path_xlock_dentry(mdr, true, false, false, S_ISREG(mode));
  CDir *dir = dn->get_dir();
  CInode *diri = dir->get_inode();
  if (!check_access(mdr, diri, MAY_WRITE))
  if (!check_fragment_space(mdr, dir))
  if (!check_dir_max_entries(mdr, dir))
  ceph_assert(dn->get_projected_linkage()->is_null());
  if (req->get_alternate_name().size() > alternate_name_max) {
    dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
    respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
  dn->set_alternate_name(req->get_alternate_name());
  // Pick the effective file layout: inherited dir layout if set, else the
  // filesystem default.
  file_layout_t layout;
  if (mdr->dir_layout != file_layout_t())
    layout = mdr->dir_layout;
    layout = mdcache->default_file_layout;
  CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode, &layout);
  dn->push_projected_linkage(newi);
  auto _inode = newi->_get_inode();
  _inode->version = dn->pre_dirty();
  _inode->rdev = req->head.args.mknod.rdev;
  _inode->rstat.rfiles = 1;
  _inode->accounted_rstat = _inode->rstat;
  // Keep the default pool in old_pools so its backtrace stays cleanable.
  if (layout.pool_id != mdcache->default_file_layout.pool_id)
    _inode->add_old_pool(mdcache->default_file_layout.pool_id);
  _inode->update_backtrace();
  snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
  SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
  ceph_assert(follows >= realm->get_newest_seq());
  // if the client created a _regular_ file via MKNOD, it's highly likely they'll
  // want to write to it (e.g., if they are reexporting NFS)
  if (S_ISREG(_inode->mode)) {
    // issue a cap on the file
    int cmode = CEPH_FILE_MODE_RDWR;
    Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
    // put locks in excl mode
    newi->filelock.set_state(LOCK_EXCL);
    newi->authlock.set_state(LOCK_EXCL);
    newi->xattrlock.set_state(LOCK_EXCL);
    dout(15) << " setting a client_range too, since this is a regular file" << dendl;
    _inode->client_ranges[client].range.first = 0;
    _inode->client_ranges[client].range.last = _inode->layout.stripe_unit;
    _inode->client_ranges[client].follows = follows;
    newi->mark_clientwriteable();
    cap->mark_clientwriteable();
  ceph_assert(dn->first == follows + 1);
  newi->first = dn->first;
  dout(10) << "mknod mode " << _inode->mode << " rdev " << _inode->rdev << dendl;
  // prepare finisher
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "mknod");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  journal_allocated_inos(mdr, &le->metablob);
  mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
                                    PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
  le->metablob.add_primary_dentry(dn, newi, true, true, true);
  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
  // Opportunistic fragmentation check while we still hold the dir.
  mds->balancer->maybe_fragment(dn->get_dir(), false);
/* This function takes responsibility for the passed mdr*/
// Handle CEPH_MDS_OP_MKDIR: create a new directory inode plus its first
// (empty, complete) dirfrag, issue caps on it, and journal the creation.
// NOTE(review): early "return"s, null-checks on dn/newi, and several
// braces were dropped by the extraction.
void Server::handle_client_mkdir(MDRequestRef& mdr)
  const cref_t<MClientRequest> &req = mdr->client_request;
  mdr->disable_lock_cache();
  CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
  CDir *dir = dn->get_dir();
  CInode *diri = dir->get_inode();
  // mkdir check access
  if (!check_access(mdr, diri, MAY_WRITE))
  if (!check_fragment_space(mdr, dir))
  if (!check_dir_max_entries(mdr, dir))
  ceph_assert(dn->get_projected_linkage()->is_null());
  if (req->get_alternate_name().size() > alternate_name_max) {
    dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
    respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
  dn->set_alternate_name(req->get_alternate_name());
  // new inode
  unsigned mode = req->head.args.mkdir.mode;
  CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
  // it's a directory.
  dn->push_projected_linkage(newi);
  auto _inode = newi->_get_inode();
  _inode->version = dn->pre_dirty();
  _inode->rstat.rsubdirs = 1;
  _inode->accounted_rstat = _inode->rstat;
  _inode->update_backtrace();
  snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
  SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
  ceph_assert(follows >= realm->get_newest_seq());
  dout(12) << " follows " << follows << dendl;
  ceph_assert(dn->first == follows + 1);
  newi->first = dn->first;
  // ...and that new dir is empty.
  CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
  newdir->state_set(CDir::STATE_CREATING);
  newdir->mark_complete();
  newdir->_get_fnode()->version = newdir->pre_dirty();
  // prepare finisher
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "mkdir");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  journal_allocated_inos(mdr, &le->metablob);
  mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
  le->metablob.add_primary_dentry(dn, newi, true, true);
  le->metablob.add_new_dir(newdir); // dirty AND complete AND new
  // issue a cap on the directory
  int cmode = CEPH_FILE_MODE_RDWR;
  Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
  // put locks in excl mode
  newi->filelock.set_state(LOCK_EXCL);
  newi->authlock.set_state(LOCK_EXCL);
  newi->xattrlock.set_state(LOCK_EXCL);
  // make sure this inode gets into the journal
  le->metablob.add_opened_ino(newi->ino());
  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
  // We hit_dir (via hit_inode) in our finish callback, but by then we might
  // have overshot the split size (multiple mkdir in flight), so here is
  // an early chance to split the dir if this mkdir makes it oversized.
  mds->balancer->maybe_fragment(dir, false);
7067 void Server::handle_client_symlink(MDRequestRef
& mdr
)
7069 const auto& req
= mdr
->client_request
;
7071 mdr
->disable_lock_cache();
7072 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, true);
7076 CDir
*dir
= dn
->get_dir();
7077 CInode
*diri
= dir
->get_inode();
7079 if (!check_access(mdr
, diri
, MAY_WRITE
))
7081 if (!check_fragment_space(mdr
, dir
))
7083 if (!check_dir_max_entries(mdr
, dir
))
7086 ceph_assert(dn
->get_projected_linkage()->is_null());
7087 if (req
->get_alternate_name().size() > alternate_name_max
) {
7088 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
7089 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
7091 dn
->set_alternate_name(req
->get_alternate_name());
7093 unsigned mode
= S_IFLNK
| 0777;
7094 CInode
*newi
= prepare_new_inode(mdr
, dir
, inodeno_t(req
->head
.ino
), mode
);
7098 dn
->push_projected_linkage(newi
);
7100 newi
->symlink
= req
->get_path2();
7101 auto _inode
= newi
->_get_inode();
7102 _inode
->version
= dn
->pre_dirty();
7103 _inode
->size
= newi
->symlink
.length();
7104 _inode
->rstat
.rbytes
= _inode
->size
;
7105 _inode
->rstat
.rfiles
= 1;
7106 _inode
->accounted_rstat
= _inode
->rstat
;
7107 _inode
->update_backtrace();
7109 newi
->first
= dn
->first
;
7112 mdr
->ls
= mdlog
->get_current_segment();
7113 EUpdate
*le
= new EUpdate(mdlog
, "symlink");
7114 mdlog
->start_entry(le
);
7115 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
7116 journal_allocated_inos(mdr
, &le
->metablob
);
7117 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
7118 le
->metablob
.add_primary_dentry(dn
, newi
, true, true);
7120 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
7121 mds
->balancer
->maybe_fragment(dir
, false);
7123 // flush the journal as soon as possible
7124 if (g_conf()->mds_kill_skip_replaying_inotable
) {
7135 void Server::handle_client_link(MDRequestRef
& mdr
)
7137 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
7139 dout(7) << "handle_client_link " << req
->get_filepath()
7140 << " to " << req
->get_filepath2()
7143 mdr
->disable_lock_cache();
7148 if (req
->get_filepath2().depth() == 0) {
7149 targeti
= mdcache
->get_inode(req
->get_filepath2().get_ino());
7151 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl
;
7152 inodeno_t ino
= req
->get_filepath2().get_ino();
7153 mdcache
->find_ino_peers(ino
, new C_MDS_TryFindInode(this, mdr
, mdcache
, ino
));
7158 if (!(mdr
->locking_state
& MutationImpl::SNAP2_LOCKED
)) {
7159 CDentry
*pdn
= targeti
->get_projected_parent_dn();
7161 dout(7) << "target has no parent dn, failing..." << dendl
;
7162 respond_to_request(mdr
, -CEPHFS_EINVAL
);
7165 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
, 1))
7167 mdr
->locking_state
|= MutationImpl::SNAP2_LOCKED
;
7170 destdn
= rdlock_path_xlock_dentry(mdr
, false);
7174 auto ret
= rdlock_two_paths_xlock_destdn(mdr
, false);
7179 if (!destdn
->get_projected_linkage()->is_null()) {
7180 respond_to_request(mdr
, -CEPHFS_EEXIST
);
7184 targeti
= ret
.second
->get_projected_linkage()->get_inode();
7187 ceph_assert(destdn
->get_projected_linkage()->is_null());
7188 if (req
->get_alternate_name().size() > alternate_name_max
) {
7189 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
7190 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
7193 destdn
->set_alternate_name(req
->get_alternate_name());
7195 if (targeti
->is_dir()) {
7196 dout(7) << "target is a dir, failing..." << dendl
;
7197 respond_to_request(mdr
, -CEPHFS_EINVAL
);
7201 CDir
*dir
= destdn
->get_dir();
7202 dout(7) << "handle_client_link link " << destdn
->get_name() << " in " << *dir
<< dendl
;
7203 dout(7) << "target is " << *targeti
<< dendl
;
7205 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
7206 MutationImpl::LockOpVec lov
;
7207 lov
.add_xlock(&targeti
->snaplock
);
7208 lov
.add_xlock(&targeti
->linklock
);
7210 if (!mds
->locker
->acquire_locks(mdr
, lov
))
7213 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
7216 if (targeti
->get_projected_inode()->nlink
== 0) {
7217 dout(7) << "target has no link, failing..." << dendl
;
7218 respond_to_request(mdr
, -CEPHFS_ENOENT
);
7222 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
7223 if (!check_access(mdr
, targeti
, MAY_WRITE
))
7226 if (!check_access(mdr
, dir
->get_inode(), MAY_WRITE
))
7229 if (!check_fragment_space(mdr
, dir
))
7232 if (!check_dir_max_entries(mdr
, dir
))
7236 CInode
* target_pin
= targeti
->get_projected_parent_dir()->inode
;
7237 SnapRealm
*target_realm
= target_pin
->find_snaprealm();
7238 if (target_pin
!= dir
->inode
&&
7239 target_realm
->get_subvolume_ino() !=
7240 dir
->inode
->find_snaprealm()->get_subvolume_ino() &&
7241 /* The inode is temporarily located in the stray dir pending reintegration */
7242 !target_pin
->is_stray()) {
7243 dout(7) << "target is in different subvolume, failing..." << dendl
;
7244 respond_to_request(mdr
, -CEPHFS_EXDEV
);
7249 ceph_assert(g_conf()->mds_kill_link_at
!= 1);
7252 if (targeti
->is_auth())
7253 _link_local(mdr
, destdn
, targeti
, target_realm
);
7255 _link_remote(mdr
, true, destdn
, targeti
);
7256 mds
->balancer
->maybe_fragment(dir
, false);
7260 class C_MDS_link_local_finish
: public ServerLogContext
{
7267 C_MDS_link_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ti
,
7268 version_t dnpv_
, version_t tipv_
, bool ar
) :
7269 ServerLogContext(s
, r
), dn(d
), targeti(ti
),
7270 dnpv(dnpv_
), tipv(tipv_
), adjust_realm(ar
) { }
7271 void finish(int r
) override
{
7272 ceph_assert(r
== 0);
7273 server
->_link_local_finish(mdr
, dn
, targeti
, dnpv
, tipv
, adjust_realm
);
7278 void Server::_link_local(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
, SnapRealm
*target_realm
)
7280 dout(10) << "_link_local " << *dn
<< " to " << *targeti
<< dendl
;
7282 mdr
->ls
= mdlog
->get_current_segment();
7284 // predirty NEW dentry
7285 version_t dnpv
= dn
->pre_dirty();
7286 version_t tipv
= targeti
->pre_dirty();
7288 // project inode update
7289 auto pi
= targeti
->project_inode(mdr
);
7291 pi
.inode
->ctime
= mdr
->get_op_stamp();
7292 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
7293 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
7294 pi
.inode
->change_attr
++;
7295 pi
.inode
->version
= tipv
;
7297 bool adjust_realm
= false;
7298 if (!target_realm
->get_subvolume_ino() && !targeti
->is_projected_snaprealm_global()) {
7299 sr_t
*newsnap
= targeti
->project_snaprealm();
7300 targeti
->mark_snaprealm_global(newsnap
);
7301 targeti
->record_snaprealm_parent_dentry(newsnap
, target_realm
, targeti
->get_projected_parent_dn(), true);
7302 adjust_realm
= true;
7306 EUpdate
*le
= new EUpdate(mdlog
, "link_local");
7307 mdlog
->start_entry(le
);
7308 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
7309 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1); // new dn
7310 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, 0, PREDIRTY_PRIMARY
); // targeti
7311 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
7312 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, targeti
);
7314 // do this after predirty_*, to avoid funky extra dnl arg
7315 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
7317 journal_and_reply(mdr
, targeti
, dn
, le
,
7318 new C_MDS_link_local_finish(this, mdr
, dn
, targeti
, dnpv
, tipv
, adjust_realm
));
7321 void Server::_link_local_finish(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
,
7322 version_t dnpv
, version_t tipv
, bool adjust_realm
)
7324 dout(10) << "_link_local_finish " << *dn
<< " to " << *targeti
<< dendl
;
7326 // link and unlock the NEW dentry
7327 CDentry::linkage_t
*dnl
= dn
->pop_projected_linkage();
7328 if (!dnl
->get_inode())
7329 dn
->link_remote(dnl
, targeti
);
7330 dn
->mark_dirty(dnpv
, mdr
->ls
);
7335 MDRequestRef null_ref
;
7336 mdcache
->send_dentry_link(dn
, null_ref
);
7339 int op
= CEPH_SNAP_OP_SPLIT
;
7340 mds
->mdcache
->send_snap_update(targeti
, 0, op
);
7341 mds
->mdcache
->do_realm_invalidate_and_update_notify(targeti
, op
);
7344 // bump target popularity
7345 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
7346 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
7349 respond_to_request(mdr
, 0);
7353 // link / unlink remote
7355 class C_MDS_link_remote_finish
: public ServerLogContext
{
7361 C_MDS_link_remote_finish(Server
*s
, MDRequestRef
& r
, bool i
, CDentry
*d
, CInode
*ti
) :
7362 ServerLogContext(s
, r
), inc(i
), dn(d
), targeti(ti
),
7363 dpv(d
->get_projected_version()) {}
7364 void finish(int r
) override
{
7365 ceph_assert(r
== 0);
7366 server
->_link_remote_finish(mdr
, inc
, dn
, targeti
, dpv
);
7370 void Server::_link_remote(MDRequestRef
& mdr
, bool inc
, CDentry
*dn
, CInode
*targeti
)
7372 dout(10) << "_link_remote "
7373 << (inc
? "link ":"unlink ")
7374 << *dn
<< " to " << *targeti
<< dendl
;
7376 // 1. send LinkPrepare to dest (journal nlink++ prepare)
7377 mds_rank_t linkauth
= targeti
->authority().first
;
7378 if (mdr
->more()->witnessed
.count(linkauth
) == 0) {
7379 if (mds
->is_cluster_degraded() &&
7380 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(linkauth
)) {
7381 dout(10) << " targeti auth mds." << linkauth
<< " is not active" << dendl
;
7382 if (mdr
->more()->waiting_on_peer
.empty())
7383 mds
->wait_for_active_peer(linkauth
, new C_MDS_RetryRequest(mdcache
, mdr
));
7387 dout(10) << " targeti auth must prepare nlink++/--" << dendl
;
7390 op
= MMDSPeerRequest::OP_LINKPREP
;
7392 op
= MMDSPeerRequest::OP_UNLINKPREP
;
7393 auto req
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, op
);
7394 targeti
->set_object_info(req
->get_object_info());
7395 req
->op_stamp
= mdr
->get_op_stamp();
7396 if (auto& desti_srnode
= mdr
->more()->desti_srnode
)
7397 encode(*desti_srnode
, req
->desti_snapbl
);
7398 mds
->send_message_mds(req
, linkauth
);
7400 ceph_assert(mdr
->more()->waiting_on_peer
.count(linkauth
) == 0);
7401 mdr
->more()->waiting_on_peer
.insert(linkauth
);
7404 dout(10) << " targeti auth has prepared nlink++/--" << dendl
;
7406 ceph_assert(g_conf()->mds_kill_link_at
!= 2);
7408 if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
7409 delete desti_srnode
;
7410 desti_srnode
= NULL
;
7413 mdr
->set_mds_stamp(ceph_clock_now());
7416 mdr
->ls
= mdlog
->get_current_segment();
7417 EUpdate
*le
= new EUpdate(mdlog
, inc
? "link_remote":"unlink_remote");
7418 mdlog
->start_entry(le
);
7419 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
7420 if (!mdr
->more()->witnessed
.empty()) {
7421 dout(20) << " noting uncommitted_peers " << mdr
->more()->witnessed
<< dendl
;
7422 le
->reqid
= mdr
->reqid
;
7423 le
->had_peers
= true;
7424 mdcache
->add_uncommitted_leader(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
7429 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1);
7430 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
7431 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
7434 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, -1);
7435 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
7436 le
->metablob
.add_null_dentry(dn
, true);
7437 dn
->push_projected_linkage();
7440 journal_and_reply(mdr
, (inc
? targeti
: nullptr), dn
, le
,
7441 new C_MDS_link_remote_finish(this, mdr
, inc
, dn
, targeti
));
7444 void Server::_link_remote_finish(MDRequestRef
& mdr
, bool inc
,
7445 CDentry
*dn
, CInode
*targeti
,
7448 dout(10) << "_link_remote_finish "
7449 << (inc
? "link ":"unlink ")
7450 << *dn
<< " to " << *targeti
<< dendl
;
7452 ceph_assert(g_conf()->mds_kill_link_at
!= 3);
7454 if (!mdr
->more()->witnessed
.empty())
7455 mdcache
->logged_leader_update(mdr
->reqid
);
7458 // link the new dentry
7459 CDentry::linkage_t
*dnl
= dn
->pop_projected_linkage();
7460 if (!dnl
->get_inode())
7461 dn
->link_remote(dnl
, targeti
);
7462 dn
->mark_dirty(dpv
, mdr
->ls
);
7464 // unlink main dentry
7465 dn
->get_dir()->unlink_inode(dn
);
7466 dn
->pop_projected_linkage();
7467 dn
->mark_dirty(dn
->get_projected_version(), mdr
->ls
); // dirty old dentry
7472 MDRequestRef null_ref
;
7474 mdcache
->send_dentry_link(dn
, null_ref
);
7476 mdcache
->send_dentry_unlink(dn
, NULL
, null_ref
);
7478 // bump target popularity
7479 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
7480 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
7483 respond_to_request(mdr
, 0);
7486 // removing a new dn?
7487 dn
->get_dir()->try_remove_unlinked_dn(dn
);
7491 // remote linking/unlinking
7493 class C_MDS_PeerLinkPrep
: public ServerLogContext
{
7497 C_MDS_PeerLinkPrep(Server
*s
, MDRequestRef
& r
, CInode
*t
, bool ar
) :
7498 ServerLogContext(s
, r
), targeti(t
), adjust_realm(ar
) { }
7499 void finish(int r
) override
{
7500 ceph_assert(r
== 0);
7501 server
->_logged_peer_link(mdr
, targeti
, adjust_realm
);
7505 class C_MDS_PeerLinkCommit
: public ServerContext
{
7509 C_MDS_PeerLinkCommit(Server
*s
, MDRequestRef
& r
, CInode
*t
) :
7510 ServerContext(s
), mdr(r
), targeti(t
) { }
7511 void finish(int r
) override
{
7512 server
->_commit_peer_link(mdr
, r
, targeti
);
7516 void Server::handle_peer_link_prep(MDRequestRef
& mdr
)
7518 dout(10) << "handle_peer_link_prep " << *mdr
7519 << " on " << mdr
->peer_request
->get_object_info()
7522 ceph_assert(g_conf()->mds_kill_link_at
!= 4);
7524 CInode
*targeti
= mdcache
->get_inode(mdr
->peer_request
->get_object_info().ino
);
7525 ceph_assert(targeti
);
7526 dout(10) << "targeti " << *targeti
<< dendl
;
7527 CDentry
*dn
= targeti
->get_parent_dn();
7528 CDentry::linkage_t
*dnl
= dn
->get_linkage();
7529 ceph_assert(dnl
->is_primary());
7531 mdr
->set_op_stamp(mdr
->peer_request
->op_stamp
);
7533 mdr
->auth_pin(targeti
);
7535 //ceph_abort(); // test hack: make sure leader can handle a peer that fails to prepare...
7536 ceph_assert(g_conf()->mds_kill_link_at
!= 5);
7539 mdr
->ls
= mdlog
->get_current_segment();
7540 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_link_prep", mdr
->reqid
, mdr
->peer_to_mds
,
7541 EPeerUpdate::OP_PREPARE
, EPeerUpdate::LINK
);
7542 mdlog
->start_entry(le
);
7544 auto pi
= dnl
->get_inode()->project_inode(mdr
);
7546 // update journaled target inode
7548 bool adjust_realm
= false;
7549 bool realm_projected
= false;
7550 if (mdr
->peer_request
->get_op() == MMDSPeerRequest::OP_LINKPREP
) {
7554 CDentry
*target_pdn
= targeti
->get_projected_parent_dn();
7555 SnapRealm
*target_realm
= target_pdn
->get_dir()->inode
->find_snaprealm();
7556 if (!target_realm
->get_subvolume_ino() && !targeti
->is_projected_snaprealm_global()) {
7557 sr_t
*newsnap
= targeti
->project_snaprealm();
7558 targeti
->mark_snaprealm_global(newsnap
);
7559 targeti
->record_snaprealm_parent_dentry(newsnap
, target_realm
, target_pdn
, true);
7560 adjust_realm
= true;
7561 realm_projected
= true;
7566 if (targeti
->is_projected_snaprealm_global()) {
7567 ceph_assert(mdr
->peer_request
->desti_snapbl
.length());
7568 auto p
= mdr
->peer_request
->desti_snapbl
.cbegin();
7570 sr_t
*newsnap
= targeti
->project_snaprealm();
7571 decode(*newsnap
, p
);
7573 if (pi
.inode
->nlink
== 0)
7574 ceph_assert(!newsnap
->is_parent_global());
7576 realm_projected
= true;
7578 ceph_assert(mdr
->peer_request
->desti_snapbl
.length() == 0);
7582 link_rollback rollback
;
7583 rollback
.reqid
= mdr
->reqid
;
7584 rollback
.ino
= targeti
->ino();
7585 rollback
.old_ctime
= targeti
->get_inode()->ctime
; // we hold versionlock xlock; no concorrent projections
7586 const auto& pf
= targeti
->get_parent_dn()->get_dir()->get_projected_fnode();
7587 rollback
.old_dir_mtime
= pf
->fragstat
.mtime
;
7588 rollback
.old_dir_rctime
= pf
->rstat
.rctime
;
7589 rollback
.was_inc
= inc
;
7590 if (realm_projected
) {
7591 if (targeti
->snaprealm
) {
7592 encode(true, rollback
.snapbl
);
7593 targeti
->encode_snap_blob(rollback
.snapbl
);
7595 encode(false, rollback
.snapbl
);
7598 encode(rollback
, le
->rollback
);
7599 mdr
->more()->rollback_bl
= le
->rollback
;
7601 pi
.inode
->ctime
= mdr
->get_op_stamp();
7602 pi
.inode
->version
= targeti
->pre_dirty();
7604 dout(10) << " projected inode " << pi
.inode
->ino
<< " v " << pi
.inode
->version
<< dendl
;
7607 mdcache
->predirty_journal_parents(mdr
, &le
->commit
, dnl
->get_inode(), 0, PREDIRTY_SHALLOW
|PREDIRTY_PRIMARY
);
7608 mdcache
->journal_dirty_inode(mdr
.get(), &le
->commit
, targeti
);
7609 mdcache
->add_uncommitted_peer(mdr
->reqid
, mdr
->ls
, mdr
->peer_to_mds
);
7611 // set up commit waiter
7612 mdr
->more()->peer_commit
= new C_MDS_PeerLinkCommit(this, mdr
, targeti
);
7614 mdr
->more()->peer_update_journaled
= true;
7615 submit_mdlog_entry(le
, new C_MDS_PeerLinkPrep(this, mdr
, targeti
, adjust_realm
),
7620 void Server::_logged_peer_link(MDRequestRef
& mdr
, CInode
*targeti
, bool adjust_realm
)
7622 dout(10) << "_logged_peer_link " << *mdr
7623 << " " << *targeti
<< dendl
;
7625 ceph_assert(g_conf()->mds_kill_link_at
!= 6);
7627 // update the target
7631 mds
->balancer
->hit_inode(targeti
, META_POP_IWR
);
7634 mdr
->reset_peer_request();
7637 int op
= CEPH_SNAP_OP_SPLIT
;
7638 mds
->mdcache
->send_snap_update(targeti
, 0, op
);
7639 mds
->mdcache
->do_realm_invalidate_and_update_notify(targeti
, op
);
7643 if (!mdr
->aborted
) {
7644 auto reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_LINKPREPACK
);
7645 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
7647 dout(10) << " abort flag set, finishing" << dendl
;
7648 mdcache
->request_finish(mdr
);
7653 struct C_MDS_CommittedPeer
: public ServerLogContext
{
7654 C_MDS_CommittedPeer(Server
*s
, MDRequestRef
& m
) : ServerLogContext(s
, m
) {}
7655 void finish(int r
) override
{
7656 server
->_committed_peer(mdr
);
7660 void Server::_commit_peer_link(MDRequestRef
& mdr
, int r
, CInode
*targeti
)
7662 dout(10) << "_commit_peer_link " << *mdr
7664 << " " << *targeti
<< dendl
;
7666 ceph_assert(g_conf()->mds_kill_link_at
!= 7);
7669 // drop our pins, etc.
7672 // write a commit to the journal
7673 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_link_commit", mdr
->reqid
, mdr
->peer_to_mds
,
7674 EPeerUpdate::OP_COMMIT
, EPeerUpdate::LINK
);
7675 mdlog
->start_entry(le
);
7676 submit_mdlog_entry(le
, new C_MDS_CommittedPeer(this, mdr
), mdr
, __func__
);
7679 do_link_rollback(mdr
->more()->rollback_bl
, mdr
->peer_to_mds
, mdr
);
7683 void Server::_committed_peer(MDRequestRef
& mdr
)
7685 dout(10) << "_committed_peer " << *mdr
<< dendl
;
7687 ceph_assert(g_conf()->mds_kill_link_at
!= 8);
7689 bool assert_exist
= mdr
->more()->peer_update_journaled
;
7690 mdcache
->finish_uncommitted_peer(mdr
->reqid
, assert_exist
);
7691 auto req
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_COMMITTED
);
7692 mds
->send_message_mds(req
, mdr
->peer_to_mds
);
7693 mdcache
->request_finish(mdr
);
7696 struct C_MDS_LoggedLinkRollback
: public ServerLogContext
{
7698 map
<client_t
,ref_t
<MClientSnap
>> splits
;
7699 C_MDS_LoggedLinkRollback(Server
*s
, MutationRef
& m
, MDRequestRef
& r
,
7700 map
<client_t
,ref_t
<MClientSnap
>>&& _splits
) :
7701 ServerLogContext(s
, r
), mut(m
), splits(std::move(_splits
)) {
7703 void finish(int r
) override
{
7704 server
->_link_rollback_finish(mut
, mdr
, splits
);
7708 void Server::do_link_rollback(bufferlist
&rbl
, mds_rank_t leader
, MDRequestRef
& mdr
)
7710 link_rollback rollback
;
7711 auto p
= rbl
.cbegin();
7712 decode(rollback
, p
);
7714 dout(10) << "do_link_rollback on " << rollback
.reqid
7715 << (rollback
.was_inc
? " inc":" dec")
7716 << " ino " << rollback
.ino
7719 ceph_assert(g_conf()->mds_kill_link_at
!= 9);
7721 mdcache
->add_rollback(rollback
.reqid
, leader
); // need to finish this update before resolve finishes
7722 ceph_assert(mdr
|| mds
->is_resolve());
7724 MutationRef
mut(new MutationImpl(nullptr, utime_t(), rollback
.reqid
));
7725 mut
->ls
= mds
->mdlog
->get_current_segment();
7727 CInode
*in
= mdcache
->get_inode(rollback
.ino
);
7729 dout(10) << " target is " << *in
<< dendl
;
7730 ceph_assert(!in
->is_projected()); // live peer request hold versionlock xlock.
7732 auto pi
= in
->project_inode(mut
);
7733 pi
.inode
->version
= in
->pre_dirty();
7735 // parent dir rctime
7736 CDir
*parent
= in
->get_projected_parent_dn()->get_dir();
7737 auto pf
= parent
->project_fnode(mut
);
7738 pf
->version
= parent
->pre_dirty();
7739 if (pf
->fragstat
.mtime
== pi
.inode
->ctime
) {
7740 pf
->fragstat
.mtime
= rollback
.old_dir_mtime
;
7741 if (pf
->rstat
.rctime
== pi
.inode
->ctime
)
7742 pf
->rstat
.rctime
= rollback
.old_dir_rctime
;
7743 mut
->add_updated_lock(&parent
->get_inode()->filelock
);
7744 mut
->add_updated_lock(&parent
->get_inode()->nestlock
);
7748 pi
.inode
->ctime
= rollback
.old_ctime
;
7749 if (rollback
.was_inc
)
7754 map
<client_t
,ref_t
<MClientSnap
>> splits
;
7755 if (rollback
.snapbl
.length() && in
->snaprealm
) {
7757 auto p
= rollback
.snapbl
.cbegin();
7758 decode(hadrealm
, p
);
7760 if (!mds
->is_resolve()) {
7761 sr_t
*new_srnode
= new sr_t();
7762 decode(*new_srnode
, p
);
7763 in
->project_snaprealm(new_srnode
);
7765 decode(in
->snaprealm
->srnode
, p
);
7768 SnapRealm
*realm
= parent
->get_inode()->find_snaprealm();
7769 if (!mds
->is_resolve())
7770 mdcache
->prepare_realm_merge(in
->snaprealm
, realm
, splits
);
7771 in
->project_snaprealm(NULL
);
7776 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_link_rollback", rollback
.reqid
, leader
,
7777 EPeerUpdate::OP_ROLLBACK
, EPeerUpdate::LINK
);
7778 mdlog
->start_entry(le
);
7779 le
->commit
.add_dir_context(parent
);
7780 le
->commit
.add_dir(parent
, true);
7781 le
->commit
.add_primary_dentry(in
->get_projected_parent_dn(), 0, true);
7783 submit_mdlog_entry(le
, new C_MDS_LoggedLinkRollback(this, mut
, mdr
, std::move(splits
)),
7788 void Server::_link_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
,
7789 map
<client_t
,ref_t
<MClientSnap
>>& splits
)
7791 dout(10) << "_link_rollback_finish" << dendl
;
7793 ceph_assert(g_conf()->mds_kill_link_at
!= 10);
7797 if (!mds
->is_resolve())
7798 mdcache
->send_snaps(splits
);
7801 mdcache
->request_finish(mdr
);
7803 mdcache
->finish_rollback(mut
->reqid
, mdr
);
7809 void Server::handle_peer_link_prep_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &m
)
7811 dout(10) << "handle_peer_link_prep_ack " << *mdr
7812 << " " << *m
<< dendl
;
7813 mds_rank_t from
= mds_rank_t(m
->get_source().num());
7815 ceph_assert(g_conf()->mds_kill_link_at
!= 11);
7818 mdr
->more()->peers
.insert(from
);
7821 ceph_assert(mdr
->more()->witnessed
.count(from
) == 0);
7822 mdr
->more()->witnessed
.insert(from
);
7823 ceph_assert(!m
->is_not_journaled());
7824 mdr
->more()->has_journaled_peers
= true;
7826 // remove from waiting list
7827 ceph_assert(mdr
->more()->waiting_on_peer
.count(from
));
7828 mdr
->more()->waiting_on_peer
.erase(from
);
7830 ceph_assert(mdr
->more()->waiting_on_peer
.empty());
7832 dispatch_client_request(mdr
); // go again!
7841 void Server::handle_client_unlink(MDRequestRef
& mdr
)
7843 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
7844 client_t client
= mdr
->get_client();
7847 bool rmdir
= (req
->get_op() == CEPH_MDS_OP_RMDIR
);
7850 mdr
->disable_lock_cache();
7851 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, false, true);
7855 CDentry::linkage_t
*dnl
= dn
->get_linkage(client
, mdr
);
7856 ceph_assert(!dnl
->is_null());
7857 CInode
*in
= dnl
->get_inode();
7860 dout(7) << "handle_client_rmdir on " << *dn
<< dendl
;
7862 dout(7) << "handle_client_unlink on " << *dn
<< dendl
;
7864 dout(7) << "dn links to " << *in
<< dendl
;
7869 // do empty directory checks
7870 if (_dir_is_nonempty_unlocked(mdr
, in
)) {
7871 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
7875 dout(7) << "handle_client_unlink on dir " << *in
<< ", returning error" << dendl
;
7876 respond_to_request(mdr
, -CEPHFS_EISDIR
);
7882 dout(7) << "handle_client_rmdir on non-dir " << *in
<< ", returning error" << dendl
;
7883 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
7888 CInode
*diri
= dn
->get_dir()->get_inode();
7889 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
7890 if (!check_access(mdr
, diri
, MAY_WRITE
))
7894 // -- create stray dentry? --
7895 CDentry
*straydn
= NULL
;
7896 if (dnl
->is_primary()) {
7897 straydn
= prepare_stray_dentry(mdr
, dnl
->get_inode());
7900 dout(10) << " straydn is " << *straydn
<< dendl
;
7901 } else if (mdr
->straydn
) {
7902 mdr
->unpin(mdr
->straydn
);
7903 mdr
->straydn
= NULL
;
7907 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
7908 MutationImpl::LockOpVec lov
;
7910 lov
.add_xlock(&in
->linklock
);
7911 lov
.add_xlock(&in
->snaplock
);
7913 lov
.add_rdlock(&in
->filelock
); // to verify it's empty
7916 lov
.add_wrlock(&straydn
->get_dir()->inode
->filelock
);
7917 lov
.add_wrlock(&straydn
->get_dir()->inode
->nestlock
);
7918 lov
.add_xlock(&straydn
->lock
);
7921 if (!mds
->locker
->acquire_locks(mdr
, lov
))
7924 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
7928 _dir_is_nonempty(mdr
, in
)) {
7929 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
7934 straydn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
7936 if (!mdr
->more()->desti_srnode
) {
7937 if (in
->is_projected_snaprealm_global()) {
7938 sr_t
*new_srnode
= in
->prepare_new_srnode(0);
7939 in
->record_snaprealm_parent_dentry(new_srnode
, nullptr, dn
, dnl
->is_primary());
7940 // dropping the last linkage or dropping the last remote linkage,
7941 // detch the inode from global snaprealm
7942 auto nlink
= in
->get_projected_inode()->nlink
;
7944 (nlink
== 2 && !dnl
->is_primary() &&
7945 !in
->get_projected_parent_dir()->inode
->is_stray()))
7946 in
->clear_snaprealm_global(new_srnode
);
7947 mdr
->more()->desti_srnode
= new_srnode
;
7948 } else if (dnl
->is_primary()) {
7949 // prepare snaprealm blob for peer request
7950 SnapRealm
*realm
= in
->find_snaprealm();
7951 snapid_t follows
= realm
->get_newest_seq();
7952 if (in
->snaprealm
|| follows
+ 1 > in
->get_oldest_snap()) {
7953 sr_t
*new_srnode
= in
->prepare_new_srnode(follows
);
7954 in
->record_snaprealm_past_parent(new_srnode
, straydn
->get_dir()->inode
->find_snaprealm());
7955 mdr
->more()->desti_srnode
= new_srnode
;
7961 if (in
->is_dir() && in
->has_subtree_root_dirfrag()) {
7962 // subtree root auths need to be witnesses
7963 set
<mds_rank_t
> witnesses
;
7964 in
->list_replicas(witnesses
);
7965 dout(10) << " witnesses " << witnesses
<< ", have " << mdr
->more()->witnessed
<< dendl
;
7967 for (set
<mds_rank_t
>::iterator p
= witnesses
.begin();
7968 p
!= witnesses
.end();
7970 if (mdr
->more()->witnessed
.count(*p
)) {
7971 dout(10) << " already witnessed by mds." << *p
<< dendl
;
7972 } else if (mdr
->more()->waiting_on_peer
.count(*p
)) {
7973 dout(10) << " already waiting on witness mds." << *p
<< dendl
;
7975 if (!_rmdir_prepare_witness(mdr
, *p
, mdr
->dn
[0], straydn
))
7979 if (!mdr
->more()->waiting_on_peer
.empty())
7980 return; // we're waiting for a witness.
7983 if (!rmdir
&& dnl
->is_primary() && mdr
->dn
[0].size() == 1)
7984 mds
->locker
->create_lock_cache(mdr
, diri
);
7987 if (dnl
->is_remote() && !dnl
->get_inode()->is_auth())
7988 _link_remote(mdr
, false, dn
, dnl
->get_inode());
7990 _unlink_local(mdr
, dn
, straydn
);
7993 class C_MDS_unlink_local_finish
: public ServerLogContext
{
7996 version_t dnpv
; // deleted dentry
7998 C_MDS_unlink_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CDentry
*sd
) :
7999 ServerLogContext(s
, r
), dn(d
), straydn(sd
),
8000 dnpv(d
->get_projected_version()) {}
8001 void finish(int r
) override
{
8002 ceph_assert(r
== 0);
8003 server
->_unlink_local_finish(mdr
, dn
, straydn
, dnpv
);
8007 void Server::_unlink_local(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
)
8009 dout(10) << "_unlink_local " << *dn
<< dendl
;
8011 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
8012 CInode
*in
= dnl
->get_inode();
8016 mdr
->ls
= mdlog
->get_current_segment();
8018 // prepare log entry
8019 EUpdate
*le
= new EUpdate(mdlog
, "unlink_local");
8020 mdlog
->start_entry(le
);
8021 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
8022 if (!mdr
->more()->witnessed
.empty()) {
8023 dout(20) << " noting uncommitted_peers " << mdr
->more()->witnessed
<< dendl
;
8024 le
->reqid
= mdr
->reqid
;
8025 le
->had_peers
= true;
8026 mdcache
->add_uncommitted_leader(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
8030 ceph_assert(dnl
->is_primary());
8031 straydn
->push_projected_linkage(in
);
8034 // the unlinked dentry
8037 auto pi
= in
->project_inode(mdr
);
8040 dn
->make_path_string(t
, true);
8041 pi
.inode
->stray_prior_path
= std::move(t
);
8043 pi
.inode
->version
= in
->pre_dirty();
8044 pi
.inode
->ctime
= mdr
->get_op_stamp();
8045 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
8046 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
8047 pi
.inode
->change_attr
++;
8049 if (pi
.inode
->nlink
== 0)
8050 in
->state_set(CInode::STATE_ORPHAN
);
8052 if (mdr
->more()->desti_srnode
) {
8053 auto& desti_srnode
= mdr
->more()->desti_srnode
;
8054 in
->project_snaprealm(desti_srnode
);
8055 desti_srnode
= NULL
;
8059 // will manually pop projected inode
8061 // primary link. add stray dentry.
8062 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, -1);
8063 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, straydn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
8065 pi
.inode
->update_backtrace();
8066 le
->metablob
.add_primary_dentry(straydn
, in
, true, true);
8068 // remote link. update remote inode.
8069 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_DIR
, -1);
8070 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
8071 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, in
);
8074 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
8075 le
->metablob
.add_null_dentry(dn
, true);
8078 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
8079 le
->metablob
.renamed_dirino
= in
->ino();
8082 dn
->push_projected_linkage();
8085 ceph_assert(in
->first
<= straydn
->first
);
8086 in
->first
= straydn
->first
;
8090 ceph_assert(straydn
);
8091 mdcache
->project_subtree_rename(in
, dn
->get_dir(), straydn
->get_dir());
8094 journal_and_reply(mdr
, 0, dn
, le
, new C_MDS_unlink_local_finish(this, mdr
, dn
, straydn
));
8097 void Server::_unlink_local_finish(MDRequestRef
& mdr
,
8098 CDentry
*dn
, CDentry
*straydn
,
8101 dout(10) << "_unlink_local_finish " << *dn
<< dendl
;
8103 if (!mdr
->more()->witnessed
.empty())
8104 mdcache
->logged_leader_update(mdr
->reqid
);
8106 CInode
*strayin
= NULL
;
8107 bool hadrealm
= false;
8109 // if there is newly created snaprealm, need to split old snaprealm's
8110 // inodes_with_caps. So pop snaprealm before linkage changes.
8111 strayin
= dn
->get_linkage()->get_inode();
8112 hadrealm
= strayin
->snaprealm
? true : false;
8113 strayin
->early_pop_projected_snaprealm();
8116 // unlink main dentry
8117 dn
->get_dir()->unlink_inode(dn
);
8118 dn
->pop_projected_linkage();
8119 dn
->mark_dirty(dnpv
, mdr
->ls
);
8121 // relink as stray? (i.e. was primary link?)
8123 dout(20) << " straydn is " << *straydn
<< dendl
;
8124 straydn
->pop_projected_linkage();
8125 mdcache
->touch_dentry_bottom(straydn
);
8130 mdcache
->send_dentry_unlink(dn
, straydn
, mdr
);
8133 // update subtree map?
8134 if (strayin
->is_dir())
8135 mdcache
->adjust_subtree_after_rename(strayin
, dn
->get_dir(), true);
8137 if (strayin
->snaprealm
&& !hadrealm
)
8138 mdcache
->do_realm_invalidate_and_update_notify(strayin
, CEPH_SNAP_OP_SPLIT
, false);
8142 mds
->balancer
->hit_dir(dn
->get_dir(), META_POP_IWR
);
8145 respond_to_request(mdr
, 0);
8147 // removing a new dn?
8148 dn
->get_dir()->try_remove_unlinked_dn(dn
);
8151 // respond_to_request() drops locks. So stray reintegration can race with us.
8152 if (straydn
&& !straydn
->get_projected_linkage()->is_null()) {
8153 // Tip off the MDCache that this dentry is a stray that
8154 // might be elegible for purge.
8155 mdcache
->notify_stray(straydn
);
8159 bool Server::_rmdir_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, vector
<CDentry
*>& trace
, CDentry
*straydn
)
8161 if (mds
->is_cluster_degraded() &&
8162 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(who
)) {
8163 dout(10) << "_rmdir_prepare_witness mds." << who
<< " is not active" << dendl
;
8164 if (mdr
->more()->waiting_on_peer
.empty())
8165 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(mdcache
, mdr
));
8169 dout(10) << "_rmdir_prepare_witness mds." << who
<< dendl
;
8170 auto req
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RMDIRPREP
);
8171 req
->srcdnpath
= filepath(trace
.front()->get_dir()->ino());
8172 for (auto dn
: trace
)
8173 req
->srcdnpath
.push_dentry(dn
->get_name());
8174 mdcache
->encode_replica_stray(straydn
, who
, req
->straybl
);
8175 if (mdr
->more()->desti_srnode
)
8176 encode(*mdr
->more()->desti_srnode
, req
->desti_snapbl
);
8178 req
->op_stamp
= mdr
->get_op_stamp();
8179 mds
->send_message_mds(req
, who
);
8181 ceph_assert(mdr
->more()->waiting_on_peer
.count(who
) == 0);
8182 mdr
->more()->waiting_on_peer
.insert(who
);
8186 struct C_MDS_PeerRmdirPrep
: public ServerLogContext
{
8187 CDentry
*dn
, *straydn
;
8188 C_MDS_PeerRmdirPrep(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CDentry
*st
)
8189 : ServerLogContext(s
, r
), dn(d
), straydn(st
) {}
8190 void finish(int r
) override
{
8191 server
->_logged_peer_rmdir(mdr
, dn
, straydn
);
8195 struct C_MDS_PeerRmdirCommit
: public ServerContext
{
8198 C_MDS_PeerRmdirCommit(Server
*s
, MDRequestRef
& r
, CDentry
*sd
)
8199 : ServerContext(s
), mdr(r
), straydn(sd
) { }
8200 void finish(int r
) override
{
8201 server
->_commit_peer_rmdir(mdr
, r
, straydn
);
8205 void Server::handle_peer_rmdir_prep(MDRequestRef
& mdr
)
8207 dout(10) << "handle_peer_rmdir_prep " << *mdr
8208 << " " << mdr
->peer_request
->srcdnpath
8209 << " to " << mdr
->peer_request
->destdnpath
8212 vector
<CDentry
*> trace
;
8213 filepath
srcpath(mdr
->peer_request
->srcdnpath
);
8214 dout(10) << " src " << srcpath
<< dendl
;
8216 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, false);
8217 int r
= mdcache
->path_traverse(mdr
, cf
, srcpath
,
8218 MDS_TRAVERSE_DISCOVER
| MDS_TRAVERSE_PATH_LOCKED
,
8221 if (r
== -CEPHFS_ESTALE
) {
8222 mdcache
->find_ino_peers(srcpath
.get_ino(), new C_MDS_RetryRequest(mdcache
, mdr
),
8223 mdr
->peer_to_mds
, true);
8226 ceph_assert(r
== 0);
8227 CDentry
*dn
= trace
.back();
8228 dout(10) << " dn " << *dn
<< dendl
;
8231 ceph_assert(mdr
->straydn
);
8232 CDentry
*straydn
= mdr
->straydn
;
8233 dout(10) << " straydn " << *straydn
<< dendl
;
8235 mdr
->set_op_stamp(mdr
->peer_request
->op_stamp
);
8237 rmdir_rollback rollback
;
8238 rollback
.reqid
= mdr
->reqid
;
8239 rollback
.src_dir
= dn
->get_dir()->dirfrag();
8240 rollback
.src_dname
= dn
->get_name();
8241 rollback
.dest_dir
= straydn
->get_dir()->dirfrag();
8242 rollback
.dest_dname
= straydn
->get_name();
8243 if (mdr
->peer_request
->desti_snapbl
.length()) {
8244 if (in
->snaprealm
) {
8245 encode(true, rollback
.snapbl
);
8246 in
->encode_snap_blob(rollback
.snapbl
);
8248 encode(false, rollback
.snapbl
);
8251 encode(rollback
, mdr
->more()->rollback_bl
);
8252 // FIXME: rollback snaprealm
8253 dout(20) << " rollback is " << mdr
->more()->rollback_bl
.length() << " bytes" << dendl
;
8255 // set up commit waiter
8256 mdr
->more()->peer_commit
= new C_MDS_PeerRmdirCommit(this, mdr
, straydn
);
8258 straydn
->push_projected_linkage(in
);
8259 dn
->push_projected_linkage();
8261 ceph_assert(straydn
->first
>= in
->first
);
8262 in
->first
= straydn
->first
;
8264 if (!in
->has_subtree_root_dirfrag(mds
->get_nodeid())) {
8265 dout(10) << " no auth subtree in " << *in
<< ", skipping journal" << dendl
;
8266 _logged_peer_rmdir(mdr
, dn
, straydn
);
8270 mdr
->ls
= mdlog
->get_current_segment();
8271 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rmdir", mdr
->reqid
, mdr
->peer_to_mds
,
8272 EPeerUpdate::OP_PREPARE
, EPeerUpdate::RMDIR
);
8273 mdlog
->start_entry(le
);
8274 le
->rollback
= mdr
->more()->rollback_bl
;
8276 le
->commit
.add_dir_context(straydn
->get_dir());
8277 le
->commit
.add_primary_dentry(straydn
, in
, true);
8278 // peer: no need to journal original dentry
8280 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
8281 le
->commit
.renamed_dirino
= in
->ino();
8283 mdcache
->project_subtree_rename(in
, dn
->get_dir(), straydn
->get_dir());
8284 mdcache
->add_uncommitted_peer(mdr
->reqid
, mdr
->ls
, mdr
->peer_to_mds
);
8286 mdr
->more()->peer_update_journaled
= true;
8287 submit_mdlog_entry(le
, new C_MDS_PeerRmdirPrep(this, mdr
, dn
, straydn
),
8292 void Server::_logged_peer_rmdir(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
)
8294 dout(10) << "_logged_peer_rmdir " << *mdr
<< " on " << *dn
<< dendl
;
8295 CInode
*in
= dn
->get_linkage()->get_inode();
8298 if (mdr
->peer_request
->desti_snapbl
.length()) {
8299 new_realm
= !in
->snaprealm
;
8300 in
->decode_snap_blob(mdr
->peer_request
->desti_snapbl
);
8301 ceph_assert(in
->snaprealm
);
8306 // update our cache now, so we are consistent with what is in the journal
8307 // when we journal a subtree map
8308 dn
->get_dir()->unlink_inode(dn
);
8309 straydn
->pop_projected_linkage();
8310 dn
->pop_projected_linkage();
8312 mdcache
->adjust_subtree_after_rename(in
, dn
->get_dir(), mdr
->more()->peer_update_journaled
);
8315 mdcache
->do_realm_invalidate_and_update_notify(in
, CEPH_SNAP_OP_SPLIT
, false);
8318 mdr
->reset_peer_request();
8321 if (!mdr
->aborted
) {
8322 auto reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RMDIRPREPACK
);
8323 if (!mdr
->more()->peer_update_journaled
)
8324 reply
->mark_not_journaled();
8325 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
8327 dout(10) << " abort flag set, finishing" << dendl
;
8328 mdcache
->request_finish(mdr
);
8332 void Server::handle_peer_rmdir_prep_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &ack
)
8334 dout(10) << "handle_peer_rmdir_prep_ack " << *mdr
8335 << " " << *ack
<< dendl
;
8337 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
8339 mdr
->more()->peers
.insert(from
);
8340 mdr
->more()->witnessed
.insert(from
);
8341 if (!ack
->is_not_journaled())
8342 mdr
->more()->has_journaled_peers
= true;
8344 // remove from waiting list
8345 ceph_assert(mdr
->more()->waiting_on_peer
.count(from
));
8346 mdr
->more()->waiting_on_peer
.erase(from
);
8348 if (mdr
->more()->waiting_on_peer
.empty())
8349 dispatch_client_request(mdr
); // go again!
8351 dout(10) << "still waiting on peers " << mdr
->more()->waiting_on_peer
<< dendl
;
8354 void Server::_commit_peer_rmdir(MDRequestRef
& mdr
, int r
, CDentry
*straydn
)
8356 dout(10) << "_commit_peer_rmdir " << *mdr
<< " r=" << r
<< dendl
;
8359 if (mdr
->more()->peer_update_journaled
) {
8360 CInode
*strayin
= straydn
->get_projected_linkage()->get_inode();
8361 if (strayin
&& !strayin
->snaprealm
)
8362 mdcache
->clear_dirty_bits_for_stray(strayin
);
8367 if (mdr
->more()->peer_update_journaled
) {
8368 // write a commit to the journal
8369 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rmdir_commit", mdr
->reqid
,
8370 mdr
->peer_to_mds
, EPeerUpdate::OP_COMMIT
,
8371 EPeerUpdate::RMDIR
);
8372 mdlog
->start_entry(le
);
8373 submit_mdlog_entry(le
, new C_MDS_CommittedPeer(this, mdr
), mdr
, __func__
);
8376 _committed_peer(mdr
);
8380 do_rmdir_rollback(mdr
->more()->rollback_bl
, mdr
->peer_to_mds
, mdr
);
8384 struct C_MDS_LoggedRmdirRollback
: public ServerLogContext
{
8388 C_MDS_LoggedRmdirRollback(Server
*s
, MDRequestRef
& m
, metareqid_t mr
, CDentry
*d
, CDentry
*st
)
8389 : ServerLogContext(s
, m
), reqid(mr
), dn(d
), straydn(st
) {}
8390 void finish(int r
) override
{
8391 server
->_rmdir_rollback_finish(mdr
, reqid
, dn
, straydn
);
8395 void Server::do_rmdir_rollback(bufferlist
&rbl
, mds_rank_t leader
, MDRequestRef
& mdr
)
8397 // unlink the other rollback methods, the rmdir rollback is only
8398 // needed to record the subtree changes in the journal for inode
8399 // replicas who are auth for empty dirfrags. no actual changes to
8400 // the file system are taking place here, so there is no Mutation.
8402 rmdir_rollback rollback
;
8403 auto p
= rbl
.cbegin();
8404 decode(rollback
, p
);
8406 dout(10) << "do_rmdir_rollback on " << rollback
.reqid
<< dendl
;
8407 mdcache
->add_rollback(rollback
.reqid
, leader
); // need to finish this update before resolve finishes
8408 ceph_assert(mdr
|| mds
->is_resolve());
8410 CDir
*dir
= mdcache
->get_dirfrag(rollback
.src_dir
);
8412 dir
= mdcache
->get_dirfrag(rollback
.src_dir
.ino
, rollback
.src_dname
);
8414 CDentry
*dn
= dir
->lookup(rollback
.src_dname
);
8416 dout(10) << " dn " << *dn
<< dendl
;
8417 CDir
*straydir
= mdcache
->get_dirfrag(rollback
.dest_dir
);
8418 ceph_assert(straydir
);
8419 CDentry
*straydn
= straydir
->lookup(rollback
.dest_dname
);
8420 ceph_assert(straydn
);
8421 dout(10) << " straydn " << *straydn
<< dendl
;
8422 CInode
*in
= straydn
->get_linkage()->get_inode();
8424 dn
->push_projected_linkage(in
);
8425 straydn
->push_projected_linkage();
8427 if (rollback
.snapbl
.length() && in
->snaprealm
) {
8429 auto p
= rollback
.snapbl
.cbegin();
8430 decode(hadrealm
, p
);
8432 decode(in
->snaprealm
->srnode
, p
);
8434 in
->snaprealm
->merge_to(dir
->get_inode()->find_snaprealm());
8438 if (mdr
&& !mdr
->more()->peer_update_journaled
) {
8439 ceph_assert(!in
->has_subtree_root_dirfrag(mds
->get_nodeid()));
8441 _rmdir_rollback_finish(mdr
, rollback
.reqid
, dn
, straydn
);
8446 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rmdir_rollback", rollback
.reqid
, leader
,
8447 EPeerUpdate::OP_ROLLBACK
, EPeerUpdate::RMDIR
);
8448 mdlog
->start_entry(le
);
8450 le
->commit
.add_dir_context(dn
->get_dir());
8451 le
->commit
.add_primary_dentry(dn
, in
, true);
8452 // peer: no need to journal straydn
8454 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
8455 le
->commit
.renamed_dirino
= in
->ino();
8457 mdcache
->project_subtree_rename(in
, straydn
->get_dir(), dn
->get_dir());
8459 submit_mdlog_entry(le
,
8460 new C_MDS_LoggedRmdirRollback(this, mdr
,rollback
.reqid
,
8466 void Server::_rmdir_rollback_finish(MDRequestRef
& mdr
, metareqid_t reqid
, CDentry
*dn
, CDentry
*straydn
)
8468 dout(10) << "_rmdir_rollback_finish " << reqid
<< dendl
;
8470 straydn
->get_dir()->unlink_inode(straydn
);
8471 dn
->pop_projected_linkage();
8472 straydn
->pop_projected_linkage();
8474 CInode
*in
= dn
->get_linkage()->get_inode();
8475 mdcache
->adjust_subtree_after_rename(in
, straydn
->get_dir(),
8476 !mdr
|| mdr
->more()->peer_update_journaled
);
8478 if (mds
->is_resolve()) {
8479 CDir
*root
= mdcache
->get_subtree_root(straydn
->get_dir());
8480 mdcache
->try_trim_non_auth_subtree(root
);
8484 mdcache
->request_finish(mdr
);
8486 mdcache
->finish_rollback(reqid
, mdr
);
8490 /** _dir_is_nonempty[_unlocked]
8492 * check if a directory is non-empty (i.e. we can rmdir it).
8494 * the unlocked variant of this is a fastpath check. we can't really be
8495 * sure until we rdlock the filelock.
8497 bool Server::_dir_is_nonempty_unlocked(MDRequestRef
& mdr
, CInode
*in
)
8499 dout(10) << "dir_is_nonempty_unlocked " << *in
<< dendl
;
8500 ceph_assert(in
->is_auth());
8502 if (in
->filelock
.is_cached())
8503 return false; // there can be pending async create/unlink. don't know.
8504 if (in
->snaprealm
&& in
->snaprealm
->srnode
.snaps
.size())
8505 return true; // in a snapshot!
8507 auto&& ls
= in
->get_dirfrags();
8508 for (const auto& dir
: ls
) {
8509 // is the frag obviously non-empty?
8510 if (dir
->is_auth()) {
8511 if (dir
->get_projected_fnode()->fragstat
.size()) {
8512 dout(10) << "dir_is_nonempty_unlocked dirstat has "
8513 << dir
->get_projected_fnode()->fragstat
.size() << " items " << *dir
<< dendl
;
8522 bool Server::_dir_is_nonempty(MDRequestRef
& mdr
, CInode
*in
)
8524 dout(10) << "dir_is_nonempty " << *in
<< dendl
;
8525 ceph_assert(in
->is_auth());
8526 ceph_assert(in
->filelock
.can_read(mdr
->get_client()));
8528 frag_info_t dirstat
;
8529 version_t dirstat_version
= in
->get_projected_inode()->dirstat
.version
;
8531 auto&& ls
= in
->get_dirfrags();
8532 for (const auto& dir
: ls
) {
8533 const auto& pf
= dir
->get_projected_fnode();
8534 if (pf
->fragstat
.size()) {
8535 dout(10) << "dir_is_nonempty dirstat has "
8536 << pf
->fragstat
.size() << " items " << *dir
<< dendl
;
8540 if (pf
->accounted_fragstat
.version
== dirstat_version
)
8541 dirstat
.add(pf
->accounted_fragstat
);
8543 dirstat
.add(pf
->fragstat
);
8546 return dirstat
.size() != in
->get_projected_inode()->dirstat
.size();
8550 // ======================================================
8553 class C_MDS_rename_finish
: public ServerLogContext
{
8558 C_MDS_rename_finish(Server
*s
, MDRequestRef
& r
,
8559 CDentry
*sdn
, CDentry
*ddn
, CDentry
*stdn
) :
8560 ServerLogContext(s
, r
),
8561 srcdn(sdn
), destdn(ddn
), straydn(stdn
) { }
8562 void finish(int r
) override
{
8563 ceph_assert(r
== 0);
8564 server
->_rename_finish(mdr
, srcdn
, destdn
, straydn
);
8569 /** handle_client_rename
8571 * rename leader is the destdn auth. this is because cached inodes
8572 * must remain connected. thus, any replica of srci, must also
8573 * replicate destdn, and possibly straydn, so that srci (and
8574 * destdn->inode) remain connected during the rename.
8576 * to do this, we freeze srci, then leader (destdn auth) verifies that
8577 * all other nodes have also replicated destdn and straydn. note that
8578 * destdn replicas need not also replicate srci. this only works when
8581 * This function takes responsibility for the passed mdr.
8583 void Server::handle_client_rename(MDRequestRef
& mdr
)
8585 const auto& req
= mdr
->client_request
;
8586 dout(7) << "handle_client_rename " << *req
<< dendl
;
8588 filepath destpath
= req
->get_filepath();
8589 filepath srcpath
= req
->get_filepath2();
8590 if (srcpath
.is_last_dot_or_dotdot() || destpath
.is_last_dot_or_dotdot()) {
8591 respond_to_request(mdr
, -CEPHFS_EBUSY
);
8595 if (req
->get_alternate_name().size() > alternate_name_max
) {
8596 dout(10) << " alternate_name longer than " << alternate_name_max
<< dendl
;
8597 respond_to_request(mdr
, -CEPHFS_ENAMETOOLONG
);
8601 auto [destdn
, srcdn
] = rdlock_two_paths_xlock_destdn(mdr
, true);
8605 dout(10) << " destdn " << *destdn
<< dendl
;
8606 CDir
*destdir
= destdn
->get_dir();
8607 ceph_assert(destdir
->is_auth());
8608 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
8610 dout(10) << " srcdn " << *srcdn
<< dendl
;
8611 CDir
*srcdir
= srcdn
->get_dir();
8612 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
8613 CInode
*srci
= srcdnl
->get_inode();
8614 dout(10) << " srci " << *srci
<< dendl
;
8616 // -- some sanity checks --
8617 if (destdn
== srcdn
) {
8618 dout(7) << "rename src=dest, noop" << dendl
;
8619 respond_to_request(mdr
, 0);
8623 // dest a child of src?
8624 // e.g. mv /usr /usr/foo
8625 if (srci
->is_dir() && srci
->is_projected_ancestor_of(destdir
->get_inode())) {
8626 dout(7) << "cannot rename item to be a child of itself" << dendl
;
8627 respond_to_request(mdr
, -CEPHFS_EINVAL
);
8631 // is this a stray migration, reintegration or merge? (sanity checks!)
8632 if (mdr
->reqid
.name
.is_mds() &&
8633 !(MDS_INO_IS_STRAY(srcpath
.get_ino()) &&
8634 MDS_INO_IS_STRAY(destpath
.get_ino())) &&
8635 !(destdnl
->is_remote() &&
8636 destdnl
->get_remote_ino() == srci
->ino())) {
8637 respond_to_request(mdr
, -CEPHFS_EINVAL
); // actually, this won't reply, but whatev.
8642 if (!destdnl
->is_null()) {
8643 //dout(10) << "dest dn exists " << *destdn << dendl;
8644 oldin
= mdcache
->get_dentry_inode(destdn
, mdr
, true);
8646 dout(10) << " oldin " << *oldin
<< dendl
;
8648 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
8649 if (oldin
->is_dir() && _dir_is_nonempty_unlocked(mdr
, oldin
)) {
8650 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
8654 // mv /some/thing /to/some/existing_other_thing
8655 if (oldin
->is_dir() && !srci
->is_dir()) {
8656 respond_to_request(mdr
, -CEPHFS_EISDIR
);
8659 if (!oldin
->is_dir() && srci
->is_dir()) {
8660 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
8663 if (srci
== oldin
&& !srcdir
->inode
->is_stray()) {
8664 respond_to_request(mdr
, 0); // no-op. POSIX makes no sense.
8667 if (destdn
->get_alternate_name() != req
->get_alternate_name()) {
8668 /* the dentry exists but the alternate_names do not match, fail... */
8669 respond_to_request(mdr
, -CEPHFS_EINVAL
);
8674 vector
<CDentry
*>& srctrace
= mdr
->dn
[1];
8675 vector
<CDentry
*>& desttrace
= mdr
->dn
[0];
8677 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
8678 if (destpath
.get_ino() != srcpath
.get_ino() &&
8679 !(req
->get_source().is_mds() &&
8680 MDS_INO_IS_STRAY(srcpath
.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
8681 CInode
*srcbase
= srctrace
[0]->get_dir()->get_inode();
8682 CInode
*destbase
= desttrace
[0]->get_dir()->get_inode();
8683 // ok, extend srctrace toward root until it is an ancestor of desttrace.
8684 while (srcbase
!= destbase
&&
8685 !srcbase
->is_projected_ancestor_of(destbase
)) {
8686 CDentry
*pdn
= srcbase
->get_projected_parent_dn();
8687 srctrace
.insert(srctrace
.begin(), pdn
);
8688 dout(10) << "rename prepending srctrace with " << *pdn
<< dendl
;
8689 srcbase
= pdn
->get_dir()->get_inode();
8692 // then, extend destpath until it shares the same parent inode as srcpath.
8693 while (destbase
!= srcbase
) {
8694 CDentry
*pdn
= destbase
->get_projected_parent_dn();
8695 desttrace
.insert(desttrace
.begin(), pdn
);
8696 dout(10) << "rename prepending desttrace with " << *pdn
<< dendl
;
8697 destbase
= pdn
->get_dir()->get_inode();
8699 dout(10) << "rename src and dest traces now share common ancestor " << *destbase
<< dendl
;
8703 bool linkmerge
= srcdnl
->get_inode() == destdnl
->get_inode();
8705 dout(10) << " this is a link merge" << dendl
;
8707 // -- create stray dentry? --
8708 CDentry
*straydn
= NULL
;
8709 if (destdnl
->is_primary() && !linkmerge
) {
8710 straydn
= prepare_stray_dentry(mdr
, destdnl
->get_inode());
8713 dout(10) << " straydn is " << *straydn
<< dendl
;
8714 } else if (mdr
->straydn
) {
8715 mdr
->unpin(mdr
->straydn
);
8716 mdr
->straydn
= NULL
;
8721 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
8722 MutationImpl::LockOpVec lov
;
8724 // we need to update srci's ctime. xlock its least contended lock to do that...
8725 lov
.add_xlock(&srci
->linklock
);
8726 lov
.add_xlock(&srci
->snaplock
);
8729 // xlock oldin (for nlink--)
8730 lov
.add_xlock(&oldin
->linklock
);
8731 lov
.add_xlock(&oldin
->snaplock
);
8732 if (oldin
->is_dir()) {
8733 ceph_assert(srci
->is_dir());
8734 lov
.add_rdlock(&oldin
->filelock
); // to verify it's empty
8736 // adjust locking order?
8737 int cmp
= mdr
->compare_paths();
8738 if (cmp
< 0 || (cmp
== 0 && oldin
->ino() < srci
->ino()))
8739 std::reverse(lov
.begin(), lov
.end());
8741 ceph_assert(!srci
->is_dir());
8742 // adjust locking order;
8743 if (srci
->ino() > oldin
->ino())
8744 std::reverse(lov
.begin(), lov
.end());
8750 lov
.add_wrlock(&straydn
->get_dir()->inode
->filelock
);
8751 lov
.add_wrlock(&straydn
->get_dir()->inode
->nestlock
);
8752 lov
.add_xlock(&straydn
->lock
);
8755 CInode
*auth_pin_freeze
= !srcdn
->is_auth() && srcdnl
->is_primary() ? srci
: nullptr;
8756 if (!mds
->locker
->acquire_locks(mdr
, lov
, auth_pin_freeze
))
8759 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
8763 ceph_assert(srcdir
->inode
->is_stray() && srcdnl
->is_primary() && destdnl
->is_remote());
8765 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
8766 if (!check_access(mdr
, srcdir
->get_inode(), MAY_WRITE
))
8769 if (!check_access(mdr
, destdn
->get_dir()->get_inode(), MAY_WRITE
))
8772 if (!linkmerge
&& !check_fragment_space(mdr
, destdn
->get_dir()))
8775 if (!linkmerge
&& !check_dir_max_entries(mdr
, destdn
->get_dir()))
8778 if (!check_access(mdr
, srci
, MAY_WRITE
))
8782 // with read lock, really verify oldin is empty
8785 _dir_is_nonempty(mdr
, oldin
)) {
8786 respond_to_request(mdr
, -CEPHFS_ENOTEMPTY
);
8790 /* project_snaprealm_past_parent() will do this job
8792 // moving between snaprealms?
8793 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
8794 SnapRealm *srcrealm = srci->find_snaprealm();
8795 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
8796 if (srcrealm != destrealm &&
8797 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
8798 destrealm->get_newest_seq() + 1 > srcdn->first)) {
8799 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
8800 mdcache->snaprealm_create(mdr, srci);
8806 SnapRealm
*dest_realm
= nullptr;
8807 SnapRealm
*src_realm
= nullptr;
8809 dest_realm
= destdir
->inode
->find_snaprealm();
8810 if (srcdir
->inode
== destdir
->inode
)
8811 src_realm
= dest_realm
;
8813 src_realm
= srcdir
->inode
->find_snaprealm();
8814 if (src_realm
!= dest_realm
&&
8815 src_realm
->get_subvolume_ino() != dest_realm
->get_subvolume_ino()) {
8816 respond_to_request(mdr
, -CEPHFS_EXDEV
);
8821 ceph_assert(g_conf()->mds_kill_rename_at
!= 1);
8823 // -- open all srcdn inode frags, if any --
8824 // we need these open so that auth can properly delegate from inode to dirfrags
8825 // after the inode is _ours_.
8826 if (srcdnl
->is_primary() &&
8827 !srcdn
->is_auth() &&
8829 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl
;
8830 mdr
->set_stickydirs(srci
);
8833 srci
->dirfragtree
.get_leaves(leaves
);
8834 for (const auto& leaf
: leaves
) {
8835 CDir
*dir
= srci
->get_dirfrag(leaf
);
8837 dout(10) << " opening " << leaf
<< " under " << *srci
<< dendl
;
8838 mdcache
->open_remote_dirfrag(srci
, leaf
, new C_MDS_RetryRequest(mdcache
, mdr
));
8844 // -- prepare snaprealm ---
8847 if (!mdr
->more()->srci_srnode
&&
8848 srci
->get_projected_inode()->nlink
== 1 &&
8849 srci
->is_projected_snaprealm_global()) {
8850 sr_t
*new_srnode
= srci
->prepare_new_srnode(0);
8851 srci
->record_snaprealm_parent_dentry(new_srnode
, nullptr, destdn
, false);
8853 srci
->clear_snaprealm_global(new_srnode
);
8854 mdr
->more()->srci_srnode
= new_srnode
;
8857 if (oldin
&& !mdr
->more()->desti_srnode
) {
8858 if (oldin
->is_projected_snaprealm_global()) {
8859 sr_t
*new_srnode
= oldin
->prepare_new_srnode(0);
8860 oldin
->record_snaprealm_parent_dentry(new_srnode
, dest_realm
, destdn
, destdnl
->is_primary());
8861 // dropping the last linkage or dropping the last remote linkage,
8862 // detch the inode from global snaprealm
8863 auto nlink
= oldin
->get_projected_inode()->nlink
;
8865 (nlink
== 2 && !destdnl
->is_primary() &&
8866 !oldin
->get_projected_parent_dir()->inode
->is_stray()))
8867 oldin
->clear_snaprealm_global(new_srnode
);
8868 mdr
->more()->desti_srnode
= new_srnode
;
8869 } else if (destdnl
->is_primary()) {
8870 snapid_t follows
= dest_realm
->get_newest_seq();
8871 if (oldin
->snaprealm
|| follows
+ 1 > oldin
->get_oldest_snap()) {
8872 sr_t
*new_srnode
= oldin
->prepare_new_srnode(follows
);
8873 oldin
->record_snaprealm_past_parent(new_srnode
, straydn
->get_dir()->inode
->find_snaprealm());
8874 mdr
->more()->desti_srnode
= new_srnode
;
8878 if (!mdr
->more()->srci_srnode
) {
8879 if (srci
->is_projected_snaprealm_global()) {
8880 sr_t
*new_srnode
= srci
->prepare_new_srnode(0);
8881 srci
->record_snaprealm_parent_dentry(new_srnode
, src_realm
, srcdn
, srcdnl
->is_primary());
8882 mdr
->more()->srci_srnode
= new_srnode
;
8883 } else if (srcdnl
->is_primary()) {
8884 snapid_t follows
= src_realm
->get_newest_seq();
8885 if (src_realm
!= dest_realm
&&
8886 (srci
->snaprealm
|| follows
+ 1 > srci
->get_oldest_snap())) {
8887 sr_t
*new_srnode
= srci
->prepare_new_srnode(follows
);
8888 srci
->record_snaprealm_past_parent(new_srnode
, dest_realm
);
8889 mdr
->more()->srci_srnode
= new_srnode
;
8895 // -- prepare witnesses --
8898 * NOTE: we use _all_ replicas as witnesses.
8899 * this probably isn't totally necessary (esp for file renames),
8900 * but if/when we change that, we have to make sure rejoin is
8901 * sufficiently robust to handle strong rejoins from survivors
8902 * with totally wrong dentry->inode linkage.
8903 * (currently, it can ignore rename effects, because the resolve
8904 * stage will sort them out.)
8906 set
<mds_rank_t
> witnesses
= mdr
->more()->extra_witnesses
;
8907 if (srcdn
->is_auth())
8908 srcdn
->list_replicas(witnesses
);
8910 witnesses
.insert(srcdn
->authority().first
);
8911 if (srcdnl
->is_remote() && !srci
->is_auth())
8912 witnesses
.insert(srci
->authority().first
);
8913 destdn
->list_replicas(witnesses
);
8914 if (destdnl
->is_remote() && !oldin
->is_auth())
8915 witnesses
.insert(oldin
->authority().first
);
8916 dout(10) << " witnesses " << witnesses
<< ", have " << mdr
->more()->witnessed
<< dendl
;
8918 if (!witnesses
.empty()) {
8919 // Replicas can't see projected dentry linkages and will get confused.
8920 // We have taken snaplocks on ancestor inodes. Later rename/rmdir requests
8921 // can't project these inodes' linkages.
8922 bool need_flush
= false;
8923 for (auto& dn
: srctrace
) {
8924 if (dn
->is_projected()) {
8930 CDentry
*dn
= destdn
;
8932 if (dn
->is_projected()) {
8936 CInode
*diri
= dn
->get_dir()->get_inode();
8937 dn
= diri
->get_projected_parent_dn();
8941 mdlog
->wait_for_safe(
8942 new MDSInternalContextWrapper(mds
,
8943 new C_MDS_RetryRequest(mdcache
, mdr
)));
8949 // do srcdn auth last
8950 mds_rank_t last
= MDS_RANK_NONE
;
8951 if (!srcdn
->is_auth()) {
8952 last
= srcdn
->authority().first
;
8953 mdr
->more()->srcdn_auth_mds
= last
;
8954 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
8955 // are involved in the rename operation.
8956 if (srcdnl
->is_primary() && !mdr
->more()->is_ambiguous_auth
) {
8957 dout(10) << " preparing ambiguous auth for srci" << dendl
;
8958 ceph_assert(mdr
->more()->is_remote_frozen_authpin
);
8959 ceph_assert(mdr
->more()->rename_inode
== srci
);
8960 _rename_prepare_witness(mdr
, last
, witnesses
, srctrace
, desttrace
, straydn
);
8965 for (set
<mds_rank_t
>::iterator p
= witnesses
.begin();
8966 p
!= witnesses
.end();
8968 if (*p
== last
) continue; // do it last!
8969 if (mdr
->more()->witnessed
.count(*p
)) {
8970 dout(10) << " already witnessed by mds." << *p
<< dendl
;
8971 } else if (mdr
->more()->waiting_on_peer
.count(*p
)) {
8972 dout(10) << " already waiting on witness mds." << *p
<< dendl
;
8974 if (!_rename_prepare_witness(mdr
, *p
, witnesses
, srctrace
, desttrace
, straydn
))
8978 if (!mdr
->more()->waiting_on_peer
.empty())
8979 return; // we're waiting for a witness.
8981 if (last
!= MDS_RANK_NONE
&& mdr
->more()->witnessed
.count(last
) == 0) {
8982 dout(10) << " preparing last witness (srcdn auth)" << dendl
;
8983 ceph_assert(mdr
->more()->waiting_on_peer
.count(last
) == 0);
8984 _rename_prepare_witness(mdr
, last
, witnesses
, srctrace
, desttrace
, straydn
);
8988 // test hack: bail after peer does prepare, so we can verify it's _live_ rollback.
8989 if (!mdr
->more()->peers
.empty() && !srci
->is_dir())
8990 ceph_assert(g_conf()->mds_kill_rename_at
!= 3);
8991 if (!mdr
->more()->peers
.empty() && srci
->is_dir())
8992 ceph_assert(g_conf()->mds_kill_rename_at
!= 4);
8994 // -- declare now --
8995 mdr
->set_mds_stamp(ceph_clock_now());
8997 // -- prepare journal entry --
8998 mdr
->ls
= mdlog
->get_current_segment();
8999 EUpdate
*le
= new EUpdate(mdlog
, "rename");
9000 mdlog
->start_entry(le
);
9001 le
->metablob
.add_client_req(mdr
->reqid
, req
->get_oldest_client_tid());
9002 if (!mdr
->more()->witnessed
.empty()) {
9003 dout(20) << " noting uncommitted_peers " << mdr
->more()->witnessed
<< dendl
;
9005 le
->reqid
= mdr
->reqid
;
9006 le
->had_peers
= true;
9008 mdcache
->add_uncommitted_leader(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
9009 // no need to send frozen auth pin to recovring auth MDS of srci
9010 mdr
->more()->is_remote_frozen_authpin
= false;
9013 _rename_prepare(mdr
, &le
->metablob
, &le
->client_map
, srcdn
, destdn
, req
->get_alternate_name(), straydn
);
9014 if (le
->client_map
.length())
9015 le
->cmapv
= mds
->sessionmap
.get_projected();
9017 // -- commit locally --
9018 C_MDS_rename_finish
*fin
= new C_MDS_rename_finish(this, mdr
, srcdn
, destdn
, straydn
);
9020 journal_and_reply(mdr
, srci
, destdn
, le
, fin
);
9021 mds
->balancer
->maybe_fragment(destdn
->get_dir(), false);
9025 void Server::_rename_finish(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
9027 dout(10) << "_rename_finish " << *mdr
<< dendl
;
9029 if (!mdr
->more()->witnessed
.empty())
9030 mdcache
->logged_leader_update(mdr
->reqid
);
9033 _rename_apply(mdr
, srcdn
, destdn
, straydn
);
9035 mdcache
->send_dentry_link(destdn
, mdr
);
9037 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
9038 CInode
*in
= destdnl
->get_inode();
9039 bool need_eval
= mdr
->more()->cap_imports
.count(in
);
9041 // test hack: test peer commit
9042 if (!mdr
->more()->peers
.empty() && !in
->is_dir())
9043 ceph_assert(g_conf()->mds_kill_rename_at
!= 5);
9044 if (!mdr
->more()->peers
.empty() && in
->is_dir())
9045 ceph_assert(g_conf()->mds_kill_rename_at
!= 6);
9048 mds
->balancer
->hit_dir(srcdn
->get_dir(), META_POP_IWR
);
9049 if (destdnl
->is_remote() && in
->is_auth())
9050 mds
->balancer
->hit_inode(in
, META_POP_IWR
);
9052 // did we import srci? if so, explicitly ack that import that, before we unlock and reply.
9054 ceph_assert(g_conf()->mds_kill_rename_at
!= 7);
9057 respond_to_request(mdr
, 0);
9060 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
, true);
9063 // respond_to_request() drops locks. So stray reintegration can race with us.
9064 if (straydn
&& !straydn
->get_projected_linkage()->is_null()) {
9065 mdcache
->notify_stray(straydn
);
9073 bool Server::_rename_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, set
<mds_rank_t
> &witnesse
,
9074 vector
<CDentry
*>& srctrace
, vector
<CDentry
*>& dsttrace
, CDentry
*straydn
)
9076 const auto& client_req
= mdr
->client_request
;
9077 ceph_assert(client_req
);
9079 if (mds
->is_cluster_degraded() &&
9080 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(who
)) {
9081 dout(10) << "_rename_prepare_witness mds." << who
<< " is not active" << dendl
;
9082 if (mdr
->more()->waiting_on_peer
.empty())
9083 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(mdcache
, mdr
));
9087 dout(10) << "_rename_prepare_witness mds." << who
<< dendl
;
9088 auto req
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RENAMEPREP
);
9090 req
->srcdnpath
= filepath(srctrace
.front()->get_dir()->ino());
9091 for (auto dn
: srctrace
)
9092 req
->srcdnpath
.push_dentry(dn
->get_name());
9093 req
->destdnpath
= filepath(dsttrace
.front()->get_dir()->ino());
9094 for (auto dn
: dsttrace
)
9095 req
->destdnpath
.push_dentry(dn
->get_name());
9096 req
->alternate_name
= client_req
->alternate_name
;
9098 mdcache
->encode_replica_stray(straydn
, who
, req
->straybl
);
9100 if (mdr
->more()->srci_srnode
)
9101 encode(*mdr
->more()->srci_srnode
, req
->srci_snapbl
);
9102 if (mdr
->more()->desti_srnode
)
9103 encode(*mdr
->more()->desti_srnode
, req
->desti_snapbl
);
9105 req
->srcdn_auth
= mdr
->more()->srcdn_auth_mds
;
9107 // srcdn auth will verify our current witness list is sufficient
9108 req
->witnesses
= witnesse
;
9110 req
->op_stamp
= mdr
->get_op_stamp();
9111 mds
->send_message_mds(req
, who
);
9113 ceph_assert(mdr
->more()->waiting_on_peer
.count(who
) == 0);
9114 mdr
->more()->waiting_on_peer
.insert(who
);
9118 version_t
Server::_rename_prepare_import(MDRequestRef
& mdr
, CDentry
*srcdn
, bufferlist
*client_map_bl
)
9120 version_t oldpv
= mdr
->more()->inode_import_v
;
9122 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
9125 auto blp
= mdr
->more()->inode_import
.cbegin();
9128 map
<client_t
,entity_inst_t
> client_map
;
9129 map
<client_t
, client_metadata_t
> client_metadata_map
;
9130 decode(client_map
, blp
);
9131 decode(client_metadata_map
, blp
);
9132 prepare_force_open_sessions(client_map
, client_metadata_map
,
9133 mdr
->more()->imported_session_map
);
9134 encode(client_map
, *client_map_bl
, mds
->mdsmap
->get_up_features());
9135 encode(client_metadata_map
, *client_map_bl
);
9137 list
<ScatterLock
*> updated_scatterlocks
;
9138 mdcache
->migrator
->decode_import_inode(srcdn
, blp
, srcdn
->authority().first
, mdr
->ls
,
9139 mdr
->more()->cap_imports
, updated_scatterlocks
);
9141 // hack: force back to !auth and clean, temporarily
9142 srcdnl
->get_inode()->state_clear(CInode::STATE_AUTH
);
9143 srcdnl
->get_inode()->mark_clean();
9148 bool Server::_need_force_journal(CInode
*diri
, bool empty
)
9150 auto&& dirs
= diri
->get_dirfrags();
9152 bool force_journal
= false;
9154 for (const auto& dir
: dirs
) {
9155 if (dir
->is_subtree_root() && dir
->get_dir_auth().first
== mds
->get_nodeid()) {
9156 dout(10) << " frag " << dir
->get_frag() << " is auth subtree dirfrag, will force journal" << dendl
;
9157 force_journal
= true;
9160 dout(20) << " frag " << dir
->get_frag() << " is not auth subtree dirfrag" << dendl
;
9163 // see if any children of our frags are auth subtrees.
9164 std::vector
<CDir
*> subtrees
;
9165 mdcache
->get_subtrees(subtrees
);
9166 dout(10) << " subtrees " << subtrees
<< " frags " << dirs
<< dendl
;
9167 for (const auto& dir
: dirs
) {
9168 for (const auto& subtree
: subtrees
) {
9169 if (dir
->contains(subtree
)) {
9170 if (subtree
->get_dir_auth().first
== mds
->get_nodeid()) {
9171 dout(10) << " frag " << dir
->get_frag() << " contains (maybe) auth subtree, will force journal "
9172 << *subtree
<< dendl
;
9173 force_journal
= true;
9176 dout(20) << " frag " << dir
->get_frag() << " contains but isn't auth for " << *subtree
<< dendl
;
9178 dout(20) << " frag " << dir
->get_frag() << " does not contain " << *subtree
<< dendl
;
9184 return force_journal
;
9187 void Server::_rename_prepare(MDRequestRef
& mdr
,
9188 EMetaBlob
*metablob
, bufferlist
*client_map_bl
,
9189 CDentry
*srcdn
, CDentry
*destdn
, std::string_view alternate_name
,
9192 dout(10) << "_rename_prepare " << *mdr
<< " " << *srcdn
<< " " << *destdn
<< dendl
;
9194 dout(10) << " straydn " << *straydn
<< dendl
;
9196 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
9197 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
9198 CInode
*srci
= srcdnl
->get_inode();
9199 CInode
*oldin
= destdnl
->get_inode();
9201 // primary+remote link merge?
9202 bool linkmerge
= (srci
== oldin
);
9204 ceph_assert(srcdnl
->is_primary() && destdnl
->is_remote());
9205 bool silent
= srcdn
->get_dir()->inode
->is_stray();
9207 bool force_journal_dest
= false;
9208 if (srci
->is_dir() && !destdn
->is_auth()) {
9209 if (srci
->is_auth()) {
9210 // if we are auth for srci and exporting it, force journal because journal replay needs
9211 // the source inode to create auth subtrees.
9212 dout(10) << " we are exporting srci, will force journal destdn" << dendl
;
9213 force_journal_dest
= true;
9215 force_journal_dest
= _need_force_journal(srci
, false);
9218 bool force_journal_stray
= false;
9219 if (oldin
&& oldin
->is_dir() && straydn
&& !straydn
->is_auth())
9220 force_journal_stray
= _need_force_journal(oldin
, true);
9223 dout(10) << " merging remote and primary links to the same inode" << dendl
;
9225 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl
;
9226 if (force_journal_dest
)
9227 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl
;
9228 if (force_journal_stray
)
9229 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl
;
9231 if (srci
->is_dir() && (destdn
->is_auth() || force_journal_dest
)) {
9232 dout(10) << " noting renamed dir ino " << srci
->ino() << " in metablob" << dendl
;
9233 metablob
->renamed_dirino
= srci
->ino();
9234 } else if (oldin
&& oldin
->is_dir() && force_journal_stray
) {
9235 dout(10) << " noting rename target dir " << oldin
->ino() << " in metablob" << dendl
;
9236 metablob
->renamed_dirino
= oldin
->ino();
9240 CInode::mempool_inode
*spi
= 0; // renamed inode
9241 CInode::mempool_inode
*tpi
= 0; // target/overwritten inode
9245 if (destdnl
->is_primary()) {
9246 ceph_assert(straydn
); // moving to straydn.
9247 // link--, and move.
9248 if (destdn
->is_auth()) {
9249 auto pi
= oldin
->project_inode(mdr
); //project_snaprealm
9250 pi
.inode
->version
= straydn
->pre_dirty(pi
.inode
->version
);
9251 pi
.inode
->update_backtrace();
9252 tpi
= pi
.inode
.get();
9254 straydn
->push_projected_linkage(oldin
);
9255 } else if (destdnl
->is_remote()) {
9257 if (oldin
->is_auth()) {
9258 auto pi
= oldin
->project_inode(mdr
);
9259 pi
.inode
->version
= oldin
->pre_dirty();
9260 tpi
= pi
.inode
.get();
9266 if (destdnl
->is_null()) {
9267 /* handle_client_rename checks that alternate_name matches for existing destdn */
9268 destdn
->set_alternate_name(alternate_name
);
9270 if (srcdnl
->is_remote()) {
9273 if (destdn
->is_auth())
9274 mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty();
9275 destdn
->push_projected_linkage(srcdnl
->get_remote_ino(), srcdnl
->get_remote_d_type());
9277 if (srci
->is_auth()) {
9278 auto pi
= srci
->project_inode(mdr
);
9279 pi
.inode
->version
= srci
->pre_dirty();
9280 spi
= pi
.inode
.get();
9283 dout(10) << " will merge remote onto primary link" << dendl
;
9284 if (destdn
->is_auth()) {
9285 auto pi
= oldin
->project_inode(mdr
);
9286 pi
.inode
->version
= mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty(oldin
->get_version());
9287 spi
= pi
.inode
.get();
9291 if (destdn
->is_auth()) {
9293 if (srcdn
->is_auth())
9294 oldpv
= srci
->get_projected_version();
9296 oldpv
= _rename_prepare_import(mdr
, srcdn
, client_map_bl
);
9298 // note which dirfrags have child subtrees in the journal
9299 // event, so that we can open those (as bounds) during replay.
9300 if (srci
->is_dir()) {
9301 auto&& ls
= srci
->get_dirfrags();
9302 for (const auto& dir
: ls
) {
9303 if (!dir
->is_auth())
9304 metablob
->renamed_dir_frags
.push_back(dir
->get_frag());
9306 dout(10) << " noting renamed dir open frags " << metablob
->renamed_dir_frags
<< dendl
;
9309 auto pi
= srci
->project_inode(mdr
); // project snaprealm if srcdnl->is_primary
9310 // & srcdnl->snaprealm
9311 pi
.inode
->version
= mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty(oldpv
);
9312 pi
.inode
->update_backtrace();
9313 spi
= pi
.inode
.get();
9315 destdn
->push_projected_linkage(srci
);
9319 if (srcdn
->is_auth())
9320 mdr
->more()->pvmap
[srcdn
] = srcdn
->pre_dirty();
9321 srcdn
->push_projected_linkage(); // push null linkage
9325 spi
->ctime
= mdr
->get_op_stamp();
9326 if (mdr
->get_op_stamp() > spi
->rstat
.rctime
)
9327 spi
->rstat
.rctime
= mdr
->get_op_stamp();
9333 tpi
->ctime
= mdr
->get_op_stamp();
9334 if (mdr
->get_op_stamp() > tpi
->rstat
.rctime
)
9335 tpi
->rstat
.rctime
= mdr
->get_op_stamp();
9339 destdn
->make_path_string(t
, true);
9340 tpi
->stray_prior_path
= std::move(t
);
9343 if (tpi
->nlink
== 0)
9344 oldin
->state_set(CInode::STATE_ORPHAN
);
9348 // prepare nesting, mtime updates
9349 int predirty_dir
= silent
? 0:PREDIRTY_DIR
;
9351 // guarantee stray dir is processed first during journal replay. unlink the old inode,
9352 // then link the source inode to destdn
9353 if (destdnl
->is_primary()) {
9354 ceph_assert(straydn
);
9355 if (straydn
->is_auth()) {
9356 metablob
->add_dir_context(straydn
->get_dir());
9357 metablob
->add_dir(straydn
->get_dir(), true);
9361 if (!linkmerge
&& destdnl
->is_remote() && oldin
->is_auth()) {
9362 CDir
*oldin_dir
= oldin
->get_projected_parent_dir();
9363 if (oldin_dir
!= srcdn
->get_dir() && oldin_dir
!= destdn
->get_dir())
9364 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, oldin_dir
, PREDIRTY_PRIMARY
);
9368 if (destdn
->is_auth() && !destdnl
->is_null()) {
9369 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, destdn
->get_dir(),
9370 (destdnl
->is_primary() ? PREDIRTY_PRIMARY
:0)|predirty_dir
, -1);
9371 if (destdnl
->is_primary()) {
9372 ceph_assert(straydn
);
9373 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, straydn
->get_dir(),
9374 PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
9378 if (srcdnl
->is_remote() && srci
->is_auth()) {
9379 CDir
*srci_dir
= srci
->get_projected_parent_dir();
9380 if (srci_dir
!= srcdn
->get_dir() && srci_dir
!= destdn
->get_dir())
9381 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, srci_dir
, PREDIRTY_PRIMARY
);
9385 int predirty_primary
= (srcdnl
->is_primary() && srcdn
->get_dir() != destdn
->get_dir()) ? PREDIRTY_PRIMARY
:0;
9386 int flags
= predirty_dir
| predirty_primary
;
9387 if (srcdn
->is_auth())
9388 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, srcdn
->get_dir(), PREDIRTY_SHALLOW
|flags
, -1);
9389 if (destdn
->is_auth())
9390 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, destdn
->get_dir(), flags
, 1);
9392 // add it all to the metablob
9395 if (destdnl
->is_primary()) {
9396 ceph_assert(straydn
);
9397 if (destdn
->is_auth()) {
9398 // project snaprealm, too
9399 if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
9400 oldin
->project_snaprealm(desti_srnode
);
9401 if (tpi
->nlink
== 0)
9402 ceph_assert(!desti_srnode
->is_parent_global());
9403 desti_srnode
= NULL
;
9405 straydn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
9406 metablob
->add_primary_dentry(straydn
, oldin
, true, true);
9407 } else if (force_journal_stray
) {
9408 dout(10) << " forced journaling straydn " << *straydn
<< dendl
;
9409 metablob
->add_dir_context(straydn
->get_dir());
9410 metablob
->add_primary_dentry(straydn
, oldin
, true);
9412 } else if (destdnl
->is_remote()) {
9413 if (oldin
->is_auth()) {
9414 sr_t
*new_srnode
= NULL
;
9415 if (mdr
->peer_request
) {
9416 if (mdr
->peer_request
->desti_snapbl
.length() > 0) {
9417 new_srnode
= new sr_t();
9418 auto p
= mdr
->peer_request
->desti_snapbl
.cbegin();
9419 decode(*new_srnode
, p
);
9421 } else if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
9422 new_srnode
= desti_srnode
;
9423 desti_srnode
= NULL
;
9426 oldin
->project_snaprealm(new_srnode
);
9427 if (tpi
->nlink
== 0)
9428 ceph_assert(!new_srnode
->is_parent_global());
9431 CDentry
*oldin_pdn
= oldin
->get_projected_parent_dn();
9432 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, oldin_pdn
);
9433 metablob
->add_primary_dentry(oldin_pdn
, oldin
, true);
9439 if (srcdnl
->is_remote()) {
9440 ceph_assert(!linkmerge
);
9441 if (destdn
->is_auth() && !destdnl
->is_null())
9442 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
9444 destdn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
9446 if (destdn
->is_auth())
9447 metablob
->add_remote_dentry(destdn
, true, srcdnl
->get_remote_ino(), srcdnl
->get_remote_d_type());
9449 if (srci
->is_auth() ) { // it's remote
9450 if (mdr
->peer_request
) {
9451 if (mdr
->peer_request
->srci_snapbl
.length() > 0) {
9452 sr_t
*new_srnode
= new sr_t();
9453 auto p
= mdr
->peer_request
->srci_snapbl
.cbegin();
9454 decode(*new_srnode
, p
);
9455 srci
->project_snaprealm(new_srnode
);
9457 } else if (auto& srci_srnode
= mdr
->more()->srci_srnode
) {
9458 srci
->project_snaprealm(srci_srnode
);
9462 CDentry
*srci_pdn
= srci
->get_projected_parent_dn();
9463 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, srci_pdn
);
9464 metablob
->add_primary_dentry(srci_pdn
, srci
, true);
9466 } else if (srcdnl
->is_primary()) {
9467 // project snap parent update?
9468 if (destdn
->is_auth()) {
9469 if (auto& srci_srnode
= mdr
->more()->srci_srnode
) {
9470 srci
->project_snaprealm(srci_srnode
);
9475 if (destdn
->is_auth() && !destdnl
->is_null())
9476 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
9478 destdn
->first
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
9480 auto do_corruption
= inject_rename_corrupt_dentry_first
;
9481 if (unlikely(do_corruption
> 0.0)) {
9482 auto r
= ceph::util::generate_random_number(0.0, 1.0);
9483 if (r
< do_corruption
) {
9484 dout(0) << "corrupting dn: " << *destdn
<< dendl
;
9485 destdn
->first
= -10;
9490 if (destdn
->is_auth())
9491 metablob
->add_primary_dentry(destdn
, srci
, true, true);
9492 else if (force_journal_dest
) {
9493 dout(10) << " forced journaling destdn " << *destdn
<< dendl
;
9494 metablob
->add_dir_context(destdn
->get_dir());
9495 metablob
->add_primary_dentry(destdn
, srci
, true);
9496 if (srcdn
->is_auth() && srci
->is_dir()) {
9497 // journal new subtrees root dirfrags
9498 auto&& ls
= srci
->get_dirfrags();
9499 for (const auto& dir
: ls
) {
9501 metablob
->add_dir(dir
, true);
9508 if (srcdn
->is_auth()) {
9509 dout(10) << " journaling srcdn " << *srcdn
<< dendl
;
9510 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, srcdn
, CEPH_NOSNAP
, 0, srcdnl
);
9511 // also journal the inode in case we need do peer rename rollback. It is Ok to add
9512 // both primary and NULL dentries. Because during journal replay, null dentry is
9513 // processed after primary dentry.
9514 if (srcdnl
->is_primary() && !srci
->is_dir() && !destdn
->is_auth())
9515 metablob
->add_primary_dentry(srcdn
, srci
, true);
9516 metablob
->add_null_dentry(srcdn
, true);
9518 dout(10) << " NOT journaling srcdn " << *srcdn
<< dendl
;
9520 // make renamed inode first track the dn
9521 if (srcdnl
->is_primary() && destdn
->is_auth()) {
9522 ceph_assert(srci
->first
<= destdn
->first
);
9523 srci
->first
= destdn
->first
;
9525 // make stray inode first track the straydn
9526 if (straydn
&& straydn
->is_auth()) {
9527 ceph_assert(oldin
->first
<= straydn
->first
);
9528 oldin
->first
= straydn
->first
;
9531 if (oldin
&& oldin
->is_dir()) {
9532 ceph_assert(straydn
);
9533 mdcache
->project_subtree_rename(oldin
, destdn
->get_dir(), straydn
->get_dir());
9536 mdcache
->project_subtree_rename(srci
, srcdn
->get_dir(), destdn
->get_dir());
9541 void Server::_rename_apply(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
9543 dout(10) << "_rename_apply " << *mdr
<< " " << *srcdn
<< " " << *destdn
<< dendl
;
9544 dout(10) << " pvs " << mdr
->more()->pvmap
<< dendl
;
9546 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
9547 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
9549 CInode
*oldin
= destdnl
->get_inode();
9551 // primary+remote link merge?
9552 bool linkmerge
= (srcdnl
->get_inode() == oldin
);
9554 ceph_assert(srcdnl
->is_primary() && destdnl
->is_remote());
9556 bool new_in_snaprealm
= false;
9557 bool new_oldin_snaprealm
= false;
9561 if (destdnl
->is_primary()) {
9562 ceph_assert(straydn
);
9563 dout(10) << "straydn is " << *straydn
<< dendl
;
9565 // if there is newly created snaprealm, need to split old snaprealm's
9566 // inodes_with_caps. So pop snaprealm before linkage changes.
9567 if (destdn
->is_auth()) {
9568 bool hadrealm
= (oldin
->snaprealm
? true : false);
9569 oldin
->early_pop_projected_snaprealm();
9570 new_oldin_snaprealm
= (oldin
->snaprealm
&& !hadrealm
);
9572 ceph_assert(mdr
->peer_request
);
9573 if (mdr
->peer_request
->desti_snapbl
.length()) {
9574 new_oldin_snaprealm
= !oldin
->snaprealm
;
9575 oldin
->decode_snap_blob(mdr
->peer_request
->desti_snapbl
);
9576 ceph_assert(oldin
->snaprealm
);
9580 destdn
->get_dir()->unlink_inode(destdn
, false);
9582 straydn
->pop_projected_linkage();
9583 if (mdr
->is_peer() && !mdr
->more()->peer_update_journaled
)
9584 ceph_assert(!straydn
->is_projected()); // no other projected
9587 if (destdn
->is_auth())
9588 oldin
->pop_and_dirty_projected_inode(mdr
->ls
, mdr
);
9590 mdcache
->touch_dentry_bottom(straydn
); // drop dn as quickly as possible.
9591 } else if (destdnl
->is_remote()) {
9592 destdn
->get_dir()->unlink_inode(destdn
, false);
9593 if (oldin
->is_auth()) {
9594 oldin
->pop_and_dirty_projected_inode(mdr
->ls
, mdr
);
9595 } else if (mdr
->peer_request
) {
9596 if (mdr
->peer_request
->desti_snapbl
.length() > 0) {
9597 ceph_assert(oldin
->snaprealm
);
9598 oldin
->decode_snap_blob(mdr
->peer_request
->desti_snapbl
);
9600 } else if (auto& desti_srnode
= mdr
->more()->desti_srnode
) {
9601 delete desti_srnode
;
9602 desti_srnode
= NULL
;
9607 // unlink src before we relink it at dest
9608 CInode
*in
= srcdnl
->get_inode();
9611 bool srcdn_was_remote
= srcdnl
->is_remote();
9612 if (!srcdn_was_remote
) {
9613 // if there is newly created snaprealm, need to split old snaprealm's
9614 // inodes_with_caps. So pop snaprealm before linkage changes.
9615 if (destdn
->is_auth()) {
9616 bool hadrealm
= (in
->snaprealm
? true : false);
9617 in
->early_pop_projected_snaprealm();
9618 new_in_snaprealm
= (in
->snaprealm
&& !hadrealm
);
9620 ceph_assert(mdr
->peer_request
);
9621 if (mdr
->peer_request
->srci_snapbl
.length()) {
9622 new_in_snaprealm
= !in
->snaprealm
;
9623 in
->decode_snap_blob(mdr
->peer_request
->srci_snapbl
);
9624 ceph_assert(in
->snaprealm
);
9629 srcdn
->get_dir()->unlink_inode(srcdn
);
9632 if (srcdn_was_remote
) {
9635 destdnl
= destdn
->pop_projected_linkage();
9636 if (mdr
->is_peer() && !mdr
->more()->peer_update_journaled
)
9637 ceph_assert(!destdn
->is_projected()); // no other projected
9639 destdn
->link_remote(destdnl
, in
);
9640 if (destdn
->is_auth())
9641 destdn
->mark_dirty(mdr
->more()->pvmap
[destdn
], mdr
->ls
);
9643 if (in
->is_auth()) {
9644 in
->pop_and_dirty_projected_inode(mdr
->ls
, mdr
);
9645 } else if (mdr
->peer_request
) {
9646 if (mdr
->peer_request
->srci_snapbl
.length() > 0) {
9647 ceph_assert(in
->snaprealm
);
9648 in
->decode_snap_blob(mdr
->peer_request
->srci_snapbl
);
9650 } else if (auto& srci_srnode
= mdr
->more()->srci_srnode
) {
9655 dout(10) << "merging remote onto primary link" << dendl
;
9656 oldin
->pop_and_dirty_projected_inode(mdr
->ls
, mdr
);
9660 dout(10) << "merging primary onto remote link" << dendl
;
9661 destdn
->get_dir()->unlink_inode(destdn
, false);
9663 destdnl
= destdn
->pop_projected_linkage();
9664 if (mdr
->is_peer() && !mdr
->more()->peer_update_journaled
)
9665 ceph_assert(!destdn
->is_projected()); // no other projected
9667 // srcdn inode import?
9668 if (!srcdn
->is_auth() && destdn
->is_auth()) {
9669 ceph_assert(mdr
->more()->inode_import
.length() > 0);
9671 map
<client_t
,Capability::Import
> imported_caps
;
9673 // finish cap imports
9674 finish_force_open_sessions(mdr
->more()->imported_session_map
);
9675 if (mdr
->more()->cap_imports
.count(destdnl
->get_inode())) {
9676 mdcache
->migrator
->finish_import_inode_caps(destdnl
->get_inode(),
9677 mdr
->more()->srcdn_auth_mds
, true,
9678 mdr
->more()->imported_session_map
,
9679 mdr
->more()->cap_imports
[destdnl
->get_inode()],
9683 mdr
->more()->inode_import
.clear();
9684 encode(imported_caps
, mdr
->more()->inode_import
);
9686 /* hack: add an auth pin for each xlock we hold. These were
9687 * remote xlocks previously but now they're local and
9688 * we're going to try and unpin when we xlock_finish. */
9690 for (auto i
= mdr
->locks
.lower_bound(&destdnl
->get_inode()->versionlock
);
9691 i
!= mdr
->locks
.end();
9693 SimpleLock
*lock
= i
->lock
;
9694 if (lock
->get_parent() != destdnl
->get_inode())
9696 if (i
->is_xlock() && !lock
->is_locallock())
9697 mds
->locker
->xlock_import(lock
);
9700 // hack: fix auth bit
9701 in
->state_set(CInode::STATE_AUTH
);
9703 mdr
->clear_ambiguous_auth();
9706 if (destdn
->is_auth())
9707 in
->pop_and_dirty_projected_inode(mdr
->ls
, mdr
);
9711 if (srcdn
->is_auth())
9712 srcdn
->mark_dirty(mdr
->more()->pvmap
[srcdn
], mdr
->ls
);
9713 srcdn
->pop_projected_linkage();
9714 if (mdr
->is_peer() && !mdr
->more()->peer_update_journaled
)
9715 ceph_assert(!srcdn
->is_projected()); // no other projected
9717 // apply remaining projected inodes (nested)
9720 // update subtree map?
9721 if (destdnl
->is_primary() && in
->is_dir())
9722 mdcache
->adjust_subtree_after_rename(in
, srcdn
->get_dir(), true);
9724 if (straydn
&& oldin
->is_dir())
9725 mdcache
->adjust_subtree_after_rename(oldin
, destdn
->get_dir(), true);
9727 if (new_oldin_snaprealm
)
9728 mdcache
->do_realm_invalidate_and_update_notify(oldin
, CEPH_SNAP_OP_SPLIT
, false);
9729 if (new_in_snaprealm
)
9730 mdcache
->do_realm_invalidate_and_update_notify(in
, CEPH_SNAP_OP_SPLIT
, true);
9732 // removing a new dn?
9733 if (srcdn
->is_auth())
9734 srcdn
->get_dir()->try_remove_unlinked_dn(srcdn
);
9742 class C_MDS_PeerRenamePrep
: public ServerLogContext
{
9743 CDentry
*srcdn
, *destdn
, *straydn
;
9745 C_MDS_PeerRenamePrep(Server
*s
, MDRequestRef
& m
, CDentry
*sr
, CDentry
*de
, CDentry
*st
) :
9746 ServerLogContext(s
, m
), srcdn(sr
), destdn(de
), straydn(st
) {}
9747 void finish(int r
) override
{
9748 server
->_logged_peer_rename(mdr
, srcdn
, destdn
, straydn
);
9752 class C_MDS_PeerRenameCommit
: public ServerContext
{
9754 CDentry
*srcdn
, *destdn
, *straydn
;
9756 C_MDS_PeerRenameCommit(Server
*s
, MDRequestRef
& m
, CDentry
*sr
, CDentry
*de
, CDentry
*st
) :
9757 ServerContext(s
), mdr(m
), srcdn(sr
), destdn(de
), straydn(st
) {}
9758 void finish(int r
) override
{
9759 server
->_commit_peer_rename(mdr
, r
, srcdn
, destdn
, straydn
);
9763 class C_MDS_PeerRenameSessionsFlushed
: public ServerContext
{
9766 C_MDS_PeerRenameSessionsFlushed(Server
*s
, MDRequestRef
& r
) :
9767 ServerContext(s
), mdr(r
) {}
9768 void finish(int r
) override
{
9769 server
->_peer_rename_sessions_flushed(mdr
);
9773 void Server::handle_peer_rename_prep(MDRequestRef
& mdr
)
9775 dout(10) << "handle_peer_rename_prep " << *mdr
9776 << " " << mdr
->peer_request
->srcdnpath
9777 << " to " << mdr
->peer_request
->destdnpath
9780 if (mdr
->peer_request
->is_interrupted()) {
9781 dout(10) << " peer request interrupted, sending noop reply" << dendl
;
9782 auto reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RENAMEPREPACK
);
9783 reply
->mark_interrupted();
9784 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
9785 mdr
->reset_peer_request();
9790 filepath
destpath(mdr
->peer_request
->destdnpath
);
9791 dout(10) << " dest " << destpath
<< dendl
;
9792 vector
<CDentry
*> trace
;
9793 CF_MDS_RetryRequestFactory
cf(mdcache
, mdr
, false);
9794 int r
= mdcache
->path_traverse(mdr
, cf
, destpath
,
9795 MDS_TRAVERSE_DISCOVER
| MDS_TRAVERSE_PATH_LOCKED
| MDS_TRAVERSE_WANT_DENTRY
,
9798 if (r
== -CEPHFS_ESTALE
) {
9799 mdcache
->find_ino_peers(destpath
.get_ino(), new C_MDS_RetryRequest(mdcache
, mdr
),
9800 mdr
->peer_to_mds
, true);
9803 ceph_assert(r
== 0); // we shouldn't get an error here!
9805 CDentry
*destdn
= trace
.back();
9806 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
9807 dout(10) << " destdn " << *destdn
<< dendl
;
9811 filepath
srcpath(mdr
->peer_request
->srcdnpath
);
9812 dout(10) << " src " << srcpath
<< dendl
;
9813 CInode
*srci
= nullptr;
9814 r
= mdcache
->path_traverse(mdr
, cf
, srcpath
,
9815 MDS_TRAVERSE_DISCOVER
| MDS_TRAVERSE_PATH_LOCKED
,
9818 ceph_assert(r
== 0);
9820 CDentry
*srcdn
= trace
.back();
9821 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
9822 dout(10) << " srcdn " << *srcdn
<< dendl
;
9827 bool linkmerge
= srcdnl
->get_inode() == destdnl
->get_inode();
9829 ceph_assert(srcdnl
->is_primary() && destdnl
->is_remote());
9830 CDentry
*straydn
= mdr
->straydn
;
9831 if (destdnl
->is_primary() && !linkmerge
)
9832 ceph_assert(straydn
);
9834 mdr
->set_op_stamp(mdr
->peer_request
->op_stamp
);
9835 mdr
->more()->srcdn_auth_mds
= srcdn
->authority().first
;
9837 // set up commit waiter (early, to clean up any freezing etc we do)
9838 if (!mdr
->more()->peer_commit
)
9839 mdr
->more()->peer_commit
= new C_MDS_PeerRenameCommit(this, mdr
, srcdn
, destdn
, straydn
);
9842 if (srcdn
->is_auth()) {
9843 set
<mds_rank_t
> srcdnrep
;
9844 srcdn
->list_replicas(srcdnrep
);
9846 bool reply_witness
= false;
9847 if (srcdnl
->is_primary() && !srcdnl
->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH
)) {
9850 // - avoid conflicting lock state changes
9851 // - avoid concurrent updates to the inode
9852 // (this could also be accomplished with the versionlock)
9853 int allowance
= 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
9854 dout(10) << " freezing srci " << *srcdnl
->get_inode() << " with allowance " << allowance
<< dendl
;
9855 bool frozen_inode
= srcdnl
->get_inode()->freeze_inode(allowance
);
9857 // unfreeze auth pin after freezing the inode to avoid queueing waiters
9858 if (srcdnl
->get_inode()->is_frozen_auth_pin())
9859 mdr
->unfreeze_auth_pin();
9861 if (!frozen_inode
) {
9862 srcdnl
->get_inode()->add_waiter(CInode::WAIT_FROZEN
, new C_MDS_RetryRequest(mdcache
, mdr
));
9867 * set ambiguous auth for srci
9868 * NOTE: we don't worry about ambiguous cache expire as we do
9869 * with subtree migrations because all peers will pin
9870 * srcdn->get_inode() for duration of this rename.
9872 mdr
->set_ambiguous_auth(srcdnl
->get_inode());
9874 // just mark the source inode as ambiguous auth if more than two MDS are involved.
9875 // the leader will send another OP_RENAMEPREP peer request later.
9876 if (mdr
->peer_request
->witnesses
.size() > 1) {
9877 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl
;
9878 reply_witness
= true;
9881 // make sure bystanders have received all lock related messages
9882 for (set
<mds_rank_t
>::iterator p
= srcdnrep
.begin(); p
!= srcdnrep
.end(); ++p
) {
9883 if (*p
== mdr
->peer_to_mds
||
9884 (mds
->is_cluster_degraded() &&
9885 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(*p
)))
9887 auto notify
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RENAMENOTIFY
);
9888 mds
->send_message_mds(notify
, *p
);
9889 mdr
->more()->waiting_on_peer
.insert(*p
);
9892 // make sure clients have received all cap related messages
9893 set
<client_t
> export_client_set
;
9894 mdcache
->migrator
->get_export_client_set(srcdnl
->get_inode(), export_client_set
);
9896 MDSGatherBuilder
gather(g_ceph_context
);
9897 flush_client_sessions(export_client_set
, gather
);
9898 if (gather
.has_subs()) {
9899 mdr
->more()->waiting_on_peer
.insert(MDS_RANK_NONE
);
9900 gather
.set_finisher(new C_MDS_PeerRenameSessionsFlushed(this, mdr
));
9905 // is witness list sufficient?
9906 for (set
<mds_rank_t
>::iterator p
= srcdnrep
.begin(); p
!= srcdnrep
.end(); ++p
) {
9907 if (*p
== mdr
->peer_to_mds
||
9908 mdr
->peer_request
->witnesses
.count(*p
)) continue;
9909 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl
;
9910 reply_witness
= true;
9914 if (reply_witness
) {
9915 ceph_assert(!srcdnrep
.empty());
9916 auto reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RENAMEPREPACK
);
9917 reply
->witnesses
.swap(srcdnrep
);
9918 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
9919 mdr
->reset_peer_request();
9922 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl
;
9923 if (!mdr
->more()->waiting_on_peer
.empty()) {
9924 dout(10) << " still waiting for rename notify acks from "
9925 << mdr
->more()->waiting_on_peer
<< dendl
;
9928 } else if (srcdnl
->is_primary() && srcdn
->authority() != destdn
->authority()) {
9929 // set ambiguous auth for srci on witnesses
9930 mdr
->set_ambiguous_auth(srcdnl
->get_inode());
9933 // encode everything we'd need to roll this back... basically, just the original state.
9934 rename_rollback rollback
;
9936 rollback
.reqid
= mdr
->reqid
;
9938 rollback
.orig_src
.dirfrag
= srcdn
->get_dir()->dirfrag();
9939 rollback
.orig_src
.dirfrag_old_mtime
= srcdn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
9940 rollback
.orig_src
.dirfrag_old_rctime
= srcdn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
9941 rollback
.orig_src
.dname
= srcdn
->get_name();
9942 if (srcdnl
->is_primary())
9943 rollback
.orig_src
.ino
= srcdnl
->get_inode()->ino();
9945 ceph_assert(srcdnl
->is_remote());
9946 rollback
.orig_src
.remote_ino
= srcdnl
->get_remote_ino();
9947 rollback
.orig_src
.remote_d_type
= srcdnl
->get_remote_d_type();
9950 rollback
.orig_dest
.dirfrag
= destdn
->get_dir()->dirfrag();
9951 rollback
.orig_dest
.dirfrag_old_mtime
= destdn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
9952 rollback
.orig_dest
.dirfrag_old_rctime
= destdn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
9953 rollback
.orig_dest
.dname
= destdn
->get_name();
9954 if (destdnl
->is_primary())
9955 rollback
.orig_dest
.ino
= destdnl
->get_inode()->ino();
9956 else if (destdnl
->is_remote()) {
9957 rollback
.orig_dest
.remote_ino
= destdnl
->get_remote_ino();
9958 rollback
.orig_dest
.remote_d_type
= destdnl
->get_remote_d_type();
9962 rollback
.stray
.dirfrag
= straydn
->get_dir()->dirfrag();
9963 rollback
.stray
.dirfrag_old_mtime
= straydn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
9964 rollback
.stray
.dirfrag_old_rctime
= straydn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
9965 rollback
.stray
.dname
= straydn
->get_name();
9967 if (mdr
->peer_request
->desti_snapbl
.length()) {
9968 CInode
*oldin
= destdnl
->get_inode();
9969 if (oldin
->snaprealm
) {
9970 encode(true, rollback
.desti_snapbl
);
9971 oldin
->encode_snap_blob(rollback
.desti_snapbl
);
9973 encode(false, rollback
.desti_snapbl
);
9976 if (mdr
->peer_request
->srci_snapbl
.length()) {
9977 if (srci
->snaprealm
) {
9978 encode(true, rollback
.srci_snapbl
);
9979 srci
->encode_snap_blob(rollback
.srci_snapbl
);
9981 encode(false, rollback
.srci_snapbl
);
9984 encode(rollback
, mdr
->more()->rollback_bl
);
9985 // FIXME: rollback snaprealm
9986 dout(20) << " rollback is " << mdr
->more()->rollback_bl
.length() << " bytes" << dendl
;
9989 mdr
->ls
= mdlog
->get_current_segment();
9990 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rename_prep", mdr
->reqid
, mdr
->peer_to_mds
,
9991 EPeerUpdate::OP_PREPARE
, EPeerUpdate::RENAME
);
9992 mdlog
->start_entry(le
);
9993 le
->rollback
= mdr
->more()->rollback_bl
;
9995 bufferlist blah
; // inode import data... obviously not used if we're the peer
9996 _rename_prepare(mdr
, &le
->commit
, &blah
, srcdn
, destdn
, mdr
->peer_request
->alternate_name
, straydn
);
9998 if (le
->commit
.empty()) {
9999 dout(10) << " empty metablob, skipping journal" << dendl
;
10000 mdlog
->cancel_entry(le
);
10002 _logged_peer_rename(mdr
, srcdn
, destdn
, straydn
);
10004 mdcache
->add_uncommitted_peer(mdr
->reqid
, mdr
->ls
, mdr
->peer_to_mds
);
10005 mdr
->more()->peer_update_journaled
= true;
10006 submit_mdlog_entry(le
, new C_MDS_PeerRenamePrep(this, mdr
, srcdn
, destdn
, straydn
),
10012 void Server::_logged_peer_rename(MDRequestRef
& mdr
,
10013 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
10015 dout(10) << "_logged_peer_rename " << *mdr
<< dendl
;
10018 ref_t
<MMDSPeerRequest
> reply
;
10019 if (!mdr
->aborted
) {
10020 reply
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
, MMDSPeerRequest::OP_RENAMEPREPACK
);
10021 if (!mdr
->more()->peer_update_journaled
)
10022 reply
->mark_not_journaled();
10025 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
10026 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
10029 if (srcdn
->is_auth() && srcdnl
->is_primary()) {
10030 // set export bounds for CInode::encode_export()
10032 std::vector
<CDir
*> bounds
;
10033 if (srcdnl
->get_inode()->is_dir()) {
10034 srcdnl
->get_inode()->get_dirfrags(bounds
);
10035 for (const auto& bound
: bounds
) {
10036 bound
->state_set(CDir::STATE_EXPORTBOUND
);
10040 map
<client_t
,entity_inst_t
> exported_client_map
;
10041 map
<client_t
, client_metadata_t
> exported_client_metadata_map
;
10042 bufferlist inodebl
;
10043 mdcache
->migrator
->encode_export_inode(srcdnl
->get_inode(), inodebl
,
10044 exported_client_map
,
10045 exported_client_metadata_map
);
10047 for (const auto& bound
: bounds
) {
10048 bound
->state_clear(CDir::STATE_EXPORTBOUND
);
10051 encode(exported_client_map
, reply
->inode_export
, mds
->mdsmap
->get_up_features());
10052 encode(exported_client_metadata_map
, reply
->inode_export
);
10053 reply
->inode_export
.claim_append(inodebl
);
10054 reply
->inode_export_v
= srcdnl
->get_inode()->get_version();
10057 // remove mdr auth pin
10058 mdr
->auth_unpin(srcdnl
->get_inode());
10059 mdr
->more()->is_inode_exporter
= true;
10061 if (srcdnl
->get_inode()->is_dirty())
10062 srcdnl
->get_inode()->mark_clean();
10064 dout(10) << " exported srci " << *srcdnl
->get_inode() << dendl
;
10068 _rename_apply(mdr
, srcdn
, destdn
, straydn
);
10070 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
10073 mds
->balancer
->hit_dir(srcdn
->get_dir(), META_POP_IWR
);
10074 if (destdnl
->get_inode() && destdnl
->get_inode()->is_auth())
10075 mds
->balancer
->hit_inode(destdnl
->get_inode(), META_POP_IWR
);
10078 mdr
->reset_peer_request();
10082 mds
->send_message_mds(reply
, mdr
->peer_to_mds
);
10084 ceph_assert(mdr
->aborted
);
10085 dout(10) << " abort flag set, finishing" << dendl
;
10086 mdcache
->request_finish(mdr
);
10090 void Server::_commit_peer_rename(MDRequestRef
& mdr
, int r
,
10091 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
10093 dout(10) << "_commit_peer_rename " << *mdr
<< " r=" << r
<< dendl
;
10095 CInode
*in
= destdn
->get_linkage()->get_inode();
10097 inodeno_t migrated_stray
;
10098 if (srcdn
->is_auth() && srcdn
->get_dir()->inode
->is_stray())
10099 migrated_stray
= in
->ino();
10101 MDSContext::vec finished
;
10103 // unfreeze+singleauth inode
10104 // hmm, do i really need to delay this?
10105 if (mdr
->more()->is_inode_exporter
) {
10107 // we exported, clear out any xlocks that we moved to another MDS
10109 for (auto i
= mdr
->locks
.lower_bound(&in
->versionlock
);
10110 i
!= mdr
->locks
.end(); ) {
10111 SimpleLock
*lock
= i
->lock
;
10112 if (lock
->get_parent() != in
)
10114 // we only care about xlocks on the exported inode
10115 if (i
->is_xlock() && !lock
->is_locallock())
10116 mds
->locker
->xlock_export(i
++, mdr
.get());
10121 map
<client_t
,Capability::Import
> peer_imported
;
10122 auto bp
= mdr
->more()->inode_import
.cbegin();
10123 decode(peer_imported
, bp
);
10125 dout(10) << " finishing inode export on " << *in
<< dendl
;
10126 mdcache
->migrator
->finish_export_inode(in
, mdr
->peer_to_mds
, peer_imported
, finished
);
10127 mds
->queue_waiters(finished
); // this includes SINGLEAUTH waiters.
10130 ceph_assert(in
->is_frozen_inode());
10131 in
->unfreeze_inode(finished
);
10135 if (mdr
->more()->is_ambiguous_auth
) {
10136 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
10137 mdr
->more()->is_ambiguous_auth
= false;
10140 if (straydn
&& mdr
->more()->peer_update_journaled
) {
10141 CInode
*strayin
= straydn
->get_projected_linkage()->get_inode();
10142 if (strayin
&& !strayin
->snaprealm
)
10143 mdcache
->clear_dirty_bits_for_stray(strayin
);
10146 mds
->queue_waiters(finished
);
10149 if (mdr
->more()->peer_update_journaled
) {
10150 // write a commit to the journal
10151 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rename_commit", mdr
->reqid
,
10152 mdr
->peer_to_mds
, EPeerUpdate::OP_COMMIT
,
10153 EPeerUpdate::RENAME
);
10154 mdlog
->start_entry(le
);
10155 submit_mdlog_entry(le
, new C_MDS_CommittedPeer(this, mdr
), mdr
, __func__
);
10158 _committed_peer(mdr
);
10163 // rollback_bl may be empty if we froze the inode but had to provide an expanded
10164 // witness list from the leader, and they failed before we tried prep again.
10165 if (mdr
->more()->rollback_bl
.length()) {
10166 if (mdr
->more()->is_inode_exporter
) {
10167 dout(10) << " reversing inode export of " << *in
<< dendl
;
10168 in
->abort_export();
10170 if (mdcache
->is_ambiguous_peer_update(mdr
->reqid
, mdr
->peer_to_mds
)) {
10171 mdcache
->remove_ambiguous_peer_update(mdr
->reqid
, mdr
->peer_to_mds
);
10172 // rollback but preserve the peer request
10173 do_rename_rollback(mdr
->more()->rollback_bl
, mdr
->peer_to_mds
, mdr
, false);
10174 mdr
->more()->rollback_bl
.clear();
10176 do_rename_rollback(mdr
->more()->rollback_bl
, mdr
->peer_to_mds
, mdr
, true);
10178 dout(10) << " rollback_bl empty, not rollback back rename (leader failed after getting extra witnesses?)" << dendl
;
10180 if (mdr
->more()->is_ambiguous_auth
) {
10181 if (srcdn
->is_auth())
10182 mdr
->more()->rename_inode
->unfreeze_inode(finished
);
10184 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
10185 mdr
->more()->is_ambiguous_auth
= false;
10187 mds
->queue_waiters(finished
);
10188 mdcache
->request_finish(mdr
);
10192 if (migrated_stray
&& mds
->is_stopping())
10193 mdcache
->shutdown_export_stray_finish(migrated_stray
);
10196 static void _rollback_repair_dir(MutationRef
& mut
, CDir
*dir
,
10197 rename_rollback::drec
&r
, utime_t ctime
,
10198 bool isdir
, const nest_info_t
&rstat
)
10200 auto pf
= dir
->project_fnode(mut
);
10201 pf
->version
= dir
->pre_dirty();
10204 pf
->fragstat
.nsubdirs
+= 1;
10206 pf
->fragstat
.nfiles
+= 1;
10209 pf
->rstat
.rbytes
+= rstat
.rbytes
;
10210 pf
->rstat
.rfiles
+= rstat
.rfiles
;
10211 pf
->rstat
.rsubdirs
+= rstat
.rsubdirs
;
10212 pf
->rstat
.rsnaps
+= rstat
.rsnaps
;
10214 if (pf
->fragstat
.mtime
== ctime
) {
10215 pf
->fragstat
.mtime
= r
.dirfrag_old_mtime
;
10216 if (pf
->rstat
.rctime
== ctime
)
10217 pf
->rstat
.rctime
= r
.dirfrag_old_rctime
;
10219 mut
->add_updated_lock(&dir
->get_inode()->filelock
);
10220 mut
->add_updated_lock(&dir
->get_inode()->nestlock
);
10223 struct C_MDS_LoggedRenameRollback
: public ServerLogContext
{
10229 map
<client_t
,ref_t
<MClientSnap
>> splits
[2];
10231 C_MDS_LoggedRenameRollback(Server
*s
, MutationRef
& m
, MDRequestRef
& r
,
10232 CDentry
*sd
, version_t pv
, CDentry
*dd
, CDentry
*st
,
10233 map
<client_t
,ref_t
<MClientSnap
>> _splits
[2], bool f
) :
10234 ServerLogContext(s
, r
), mut(m
), srcdn(sd
), srcdnpv(pv
), destdn(dd
),
10235 straydn(st
), finish_mdr(f
) {
10236 splits
[0].swap(_splits
[0]);
10237 splits
[1].swap(_splits
[1]);
10239 void finish(int r
) override
{
10240 server
->_rename_rollback_finish(mut
, mdr
, srcdn
, srcdnpv
,
10241 destdn
, straydn
, splits
, finish_mdr
);
10245 void Server::do_rename_rollback(bufferlist
&rbl
, mds_rank_t leader
, MDRequestRef
& mdr
,
10248 rename_rollback rollback
;
10249 auto p
= rbl
.cbegin();
10250 decode(rollback
, p
);
10252 dout(10) << "do_rename_rollback on " << rollback
.reqid
<< dendl
;
10253 // need to finish this update before sending resolve to claim the subtree
10254 mdcache
->add_rollback(rollback
.reqid
, leader
);
10256 MutationRef
mut(new MutationImpl(nullptr, utime_t(), rollback
.reqid
));
10257 mut
->ls
= mds
->mdlog
->get_current_segment();
10259 CDentry
*srcdn
= NULL
;
10260 CDir
*srcdir
= mdcache
->get_dirfrag(rollback
.orig_src
.dirfrag
);
10262 srcdir
= mdcache
->get_dirfrag(rollback
.orig_src
.dirfrag
.ino
, rollback
.orig_src
.dname
);
10264 dout(10) << " srcdir " << *srcdir
<< dendl
;
10265 srcdn
= srcdir
->lookup(rollback
.orig_src
.dname
);
10267 dout(10) << " srcdn " << *srcdn
<< dendl
;
10268 ceph_assert(srcdn
->get_linkage()->is_null());
10270 dout(10) << " srcdn not found" << dendl
;
10272 dout(10) << " srcdir not found" << dendl
;
10274 CDentry
*destdn
= NULL
;
10275 CDir
*destdir
= mdcache
->get_dirfrag(rollback
.orig_dest
.dirfrag
);
10277 destdir
= mdcache
->get_dirfrag(rollback
.orig_dest
.dirfrag
.ino
, rollback
.orig_dest
.dname
);
10279 dout(10) << " destdir " << *destdir
<< dendl
;
10280 destdn
= destdir
->lookup(rollback
.orig_dest
.dname
);
10282 dout(10) << " destdn " << *destdn
<< dendl
;
10284 dout(10) << " destdn not found" << dendl
;
10286 dout(10) << " destdir not found" << dendl
;
10289 if (rollback
.orig_src
.ino
) {
10290 in
= mdcache
->get_inode(rollback
.orig_src
.ino
);
10291 if (in
&& in
->is_dir())
10292 ceph_assert(srcdn
&& destdn
);
10294 in
= mdcache
->get_inode(rollback
.orig_src
.remote_ino
);
10296 CDir
*straydir
= NULL
;
10297 CDentry
*straydn
= NULL
;
10298 if (rollback
.stray
.dirfrag
.ino
) {
10299 straydir
= mdcache
->get_dirfrag(rollback
.stray
.dirfrag
);
10301 dout(10) << "straydir " << *straydir
<< dendl
;
10302 straydn
= straydir
->lookup(rollback
.stray
.dname
);
10304 dout(10) << " straydn " << *straydn
<< dendl
;
10305 ceph_assert(straydn
->get_linkage()->is_primary());
10307 dout(10) << " straydn not found" << dendl
;
10309 dout(10) << "straydir not found" << dendl
;
10312 CInode
*target
= NULL
;
10313 if (rollback
.orig_dest
.ino
) {
10314 target
= mdcache
->get_inode(rollback
.orig_dest
.ino
);
10316 ceph_assert(destdn
&& straydn
);
10317 } else if (rollback
.orig_dest
.remote_ino
)
10318 target
= mdcache
->get_inode(rollback
.orig_dest
.remote_ino
);
10320 // can't use is_auth() in the resolve stage
10321 mds_rank_t whoami
= mds
->get_nodeid();
10323 ceph_assert(!destdn
|| destdn
->authority().first
!= whoami
);
10324 ceph_assert(!straydn
|| straydn
->authority().first
!= whoami
);
10326 bool force_journal_src
= false;
10327 bool force_journal_dest
= false;
10328 if (in
&& in
->is_dir() && srcdn
->authority().first
!= whoami
)
10329 force_journal_src
= _need_force_journal(in
, false);
10330 if (in
&& target
&& target
->is_dir())
10331 force_journal_dest
= _need_force_journal(in
, true);
10333 version_t srcdnpv
= 0;
10336 if (srcdn
->authority().first
== whoami
)
10337 srcdnpv
= srcdn
->pre_dirty();
10338 if (rollback
.orig_src
.ino
) {
10340 srcdn
->push_projected_linkage(in
);
10342 srcdn
->push_projected_linkage(rollback
.orig_src
.remote_ino
,
10343 rollback
.orig_src
.remote_d_type
);
10346 map
<client_t
,ref_t
<MClientSnap
>> splits
[2];
10348 const CInode::mempool_inode
*pip
= nullptr;
10351 CDir
*pdir
= in
->get_projected_parent_dir();
10352 if (pdir
->authority().first
== whoami
) {
10353 auto pi
= in
->project_inode(mut
);
10354 pi
.inode
->version
= in
->pre_dirty();
10355 if (pdir
!= srcdir
) {
10356 auto pf
= pdir
->project_fnode(mut
);
10357 pf
->version
= pdir
->pre_dirty();
10359 if (pi
.inode
->ctime
== rollback
.ctime
)
10360 pi
.inode
->ctime
= rollback
.orig_src
.old_ctime
;
10363 if (in
->get_inode()->ctime
== rollback
.ctime
) {
10364 auto _inode
= CInode::allocate_inode(*in
->get_inode());
10365 _inode
->ctime
= rollback
.orig_src
.old_ctime
;
10366 in
->reset_inode(_inode
);
10370 pip
= in
->get_projected_inode().get();
10372 if (rollback
.srci_snapbl
.length() && in
->snaprealm
) {
10374 auto p
= rollback
.srci_snapbl
.cbegin();
10375 decode(hadrealm
, p
);
10377 if (projected
&& !mds
->is_resolve()) {
10378 sr_t
*new_srnode
= new sr_t();
10379 decode(*new_srnode
, p
);
10380 in
->project_snaprealm(new_srnode
);
10382 decode(in
->snaprealm
->srnode
, p
);
10385 if (rollback
.orig_src
.ino
) {
10386 ceph_assert(srcdir
);
10387 realm
= srcdir
->get_inode()->find_snaprealm();
10389 realm
= in
->snaprealm
->parent
;
10391 if (!mds
->is_resolve())
10392 mdcache
->prepare_realm_merge(in
->snaprealm
, realm
, splits
[0]);
10394 in
->project_snaprealm(NULL
);
10396 in
->snaprealm
->merge_to(realm
);
10403 if (rollback
.orig_dest
.ino
&& target
) {
10404 destdn
->push_projected_linkage(target
);
10405 } else if (rollback
.orig_dest
.remote_ino
) {
10406 destdn
->push_projected_linkage(rollback
.orig_dest
.remote_ino
,
10407 rollback
.orig_dest
.remote_d_type
);
10409 // the dentry will be trimmed soon, it's ok to have wrong linkage
10410 if (rollback
.orig_dest
.ino
)
10411 ceph_assert(mds
->is_resolve());
10412 destdn
->push_projected_linkage();
10417 straydn
->push_projected_linkage();
10421 CInode::inode_ptr ti
;
10422 CDir
*pdir
= target
->get_projected_parent_dir();
10423 if (pdir
->authority().first
== whoami
) {
10424 auto pi
= target
->project_inode(mut
);
10425 pi
.inode
->version
= target
->pre_dirty();
10426 if (pdir
!= srcdir
) {
10427 auto pf
= pdir
->project_fnode(mut
);
10428 pf
->version
= pdir
->pre_dirty();
10433 ti
= CInode::allocate_inode(*target
->get_inode());
10437 if (ti
->ctime
== rollback
.ctime
)
10438 ti
->ctime
= rollback
.orig_dest
.old_ctime
;
10439 if (MDS_INO_IS_STRAY(rollback
.orig_src
.dirfrag
.ino
)) {
10440 if (MDS_INO_IS_STRAY(rollback
.orig_dest
.dirfrag
.ino
))
10441 ceph_assert(!rollback
.orig_dest
.ino
&& !rollback
.orig_dest
.remote_ino
);
10443 ceph_assert(rollback
.orig_dest
.remote_ino
&&
10444 rollback
.orig_dest
.remote_ino
== rollback
.orig_src
.ino
);
10449 target
->reset_inode(ti
);
10451 if (rollback
.desti_snapbl
.length() && target
->snaprealm
) {
10453 auto p
= rollback
.desti_snapbl
.cbegin();
10454 decode(hadrealm
, p
);
10456 if (projected
&& !mds
->is_resolve()) {
10457 sr_t
*new_srnode
= new sr_t();
10458 decode(*new_srnode
, p
);
10459 target
->project_snaprealm(new_srnode
);
10461 decode(target
->snaprealm
->srnode
, p
);
10464 if (rollback
.orig_dest
.ino
) {
10465 ceph_assert(destdir
);
10466 realm
= destdir
->get_inode()->find_snaprealm();
10468 realm
= target
->snaprealm
->parent
;
10470 if (!mds
->is_resolve())
10471 mdcache
->prepare_realm_merge(target
->snaprealm
, realm
, splits
[1]);
10473 target
->project_snaprealm(NULL
);
10475 target
->snaprealm
->merge_to(realm
);
10480 if (srcdn
&& srcdn
->authority().first
== whoami
) {
10482 _rollback_repair_dir(mut
, srcdir
, rollback
.orig_src
, rollback
.ctime
,
10483 in
&& in
->is_dir(), pip
? pip
->accounted_rstat
: blah
);
10487 dout(0) << " srcdn back to " << *srcdn
<< dendl
;
10489 dout(0) << " srci back to " << *in
<< dendl
;
10491 dout(0) << " destdn back to " << *destdn
<< dendl
;
10493 dout(0) << " desti back to " << *target
<< dendl
;
10496 EPeerUpdate
*le
= new EPeerUpdate(mdlog
, "peer_rename_rollback", rollback
.reqid
, leader
,
10497 EPeerUpdate::OP_ROLLBACK
, EPeerUpdate::RENAME
);
10498 mdlog
->start_entry(le
);
10500 if (srcdn
&& (srcdn
->authority().first
== whoami
|| force_journal_src
)) {
10501 le
->commit
.add_dir_context(srcdir
);
10502 if (rollback
.orig_src
.ino
)
10503 le
->commit
.add_primary_dentry(srcdn
, 0, true);
10505 le
->commit
.add_remote_dentry(srcdn
, true);
10508 if (!rollback
.orig_src
.ino
&& // remote linkage
10509 in
&& in
->authority().first
== whoami
) {
10510 le
->commit
.add_dir_context(in
->get_projected_parent_dir());
10511 le
->commit
.add_primary_dentry(in
->get_projected_parent_dn(), in
, true);
10514 if (force_journal_dest
) {
10515 ceph_assert(rollback
.orig_dest
.ino
);
10516 le
->commit
.add_dir_context(destdir
);
10517 le
->commit
.add_primary_dentry(destdn
, 0, true);
10520 // peer: no need to journal straydn
10522 if (target
&& target
!= in
&& target
->authority().first
== whoami
) {
10523 ceph_assert(rollback
.orig_dest
.remote_ino
);
10524 le
->commit
.add_dir_context(target
->get_projected_parent_dir());
10525 le
->commit
.add_primary_dentry(target
->get_projected_parent_dn(), target
, true);
10528 if (in
&& in
->is_dir() && (srcdn
->authority().first
== whoami
|| force_journal_src
)) {
10529 dout(10) << " noting renamed dir ino " << in
->ino() << " in metablob" << dendl
;
10530 le
->commit
.renamed_dirino
= in
->ino();
10531 if (srcdn
->authority().first
== whoami
) {
10532 auto&& ls
= in
->get_dirfrags();
10533 for (const auto& dir
: ls
) {
10534 if (!dir
->is_auth())
10535 le
->commit
.renamed_dir_frags
.push_back(dir
->get_frag());
10537 dout(10) << " noting renamed dir open frags " << le
->commit
.renamed_dir_frags
<< dendl
;
10539 } else if (force_journal_dest
) {
10540 dout(10) << " noting rename target ino " << target
->ino() << " in metablob" << dendl
;
10541 le
->commit
.renamed_dirino
= target
->ino();
10544 if (target
&& target
->is_dir()) {
10545 ceph_assert(destdn
);
10546 mdcache
->project_subtree_rename(target
, straydir
, destdir
);
10549 if (in
&& in
->is_dir()) {
10550 ceph_assert(srcdn
);
10551 mdcache
->project_subtree_rename(in
, destdir
, srcdir
);
10554 if (mdr
&& !mdr
->more()->peer_update_journaled
) {
10555 ceph_assert(le
->commit
.empty());
10556 mdlog
->cancel_entry(le
);
10558 _rename_rollback_finish(mut
, mdr
, srcdn
, srcdnpv
, destdn
, straydn
, splits
, finish_mdr
);
10560 ceph_assert(!le
->commit
.empty());
10562 mdr
->more()->peer_update_journaled
= false;
10563 MDSLogContextBase
*fin
= new C_MDS_LoggedRenameRollback(this, mut
, mdr
,
10564 srcdn
, srcdnpv
, destdn
, straydn
,
10565 splits
, finish_mdr
);
10566 submit_mdlog_entry(le
, fin
, mdr
, __func__
);
10571 void Server::_rename_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
, CDentry
*srcdn
,
10572 version_t srcdnpv
, CDentry
*destdn
, CDentry
*straydn
,
10573 map
<client_t
,ref_t
<MClientSnap
>> splits
[2], bool finish_mdr
)
10575 dout(10) << "_rename_rollback_finish " << mut
->reqid
<< dendl
;
10578 straydn
->get_dir()->unlink_inode(straydn
);
10579 straydn
->pop_projected_linkage();
10582 destdn
->get_dir()->unlink_inode(destdn
);
10583 destdn
->pop_projected_linkage();
10586 srcdn
->pop_projected_linkage();
10587 if (srcdn
->authority().first
== mds
->get_nodeid()) {
10588 srcdn
->mark_dirty(srcdnpv
, mut
->ls
);
10589 if (srcdn
->get_linkage()->is_primary())
10590 srcdn
->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH
);
10596 if (srcdn
&& srcdn
->get_linkage()->is_primary()) {
10597 CInode
*in
= srcdn
->get_linkage()->get_inode();
10598 if (in
&& in
->is_dir()) {
10599 ceph_assert(destdn
);
10600 mdcache
->adjust_subtree_after_rename(in
, destdn
->get_dir(), true);
10605 CInode
*oldin
= destdn
->get_linkage()->get_inode();
10606 // update subtree map?
10607 if (oldin
&& oldin
->is_dir()) {
10608 ceph_assert(straydn
);
10609 mdcache
->adjust_subtree_after_rename(oldin
, straydn
->get_dir(), true);
10613 if (mds
->is_resolve()) {
10616 root
= mdcache
->get_subtree_root(straydn
->get_dir());
10618 root
= mdcache
->get_subtree_root(destdn
->get_dir());
10620 mdcache
->try_trim_non_auth_subtree(root
);
10622 mdcache
->send_snaps(splits
[1]);
10623 mdcache
->send_snaps(splits
[0]);
10627 MDSContext::vec finished
;
10628 if (mdr
->more()->is_ambiguous_auth
) {
10629 if (srcdn
->is_auth())
10630 mdr
->more()->rename_inode
->unfreeze_inode(finished
);
10632 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
10633 mdr
->more()->is_ambiguous_auth
= false;
10635 mds
->queue_waiters(finished
);
10636 if (finish_mdr
|| mdr
->aborted
)
10637 mdcache
->request_finish(mdr
);
10639 mdr
->more()->peer_rolling_back
= false;
10642 mdcache
->finish_rollback(mut
->reqid
, mdr
);
10647 void Server::handle_peer_rename_prep_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &ack
)
10649 dout(10) << "handle_peer_rename_prep_ack " << *mdr
10650 << " witnessed by " << ack
->get_source()
10651 << " " << *ack
<< dendl
;
10652 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
10655 mdr
->more()->peers
.insert(from
);
10656 if (mdr
->more()->srcdn_auth_mds
== from
&&
10657 mdr
->more()->is_remote_frozen_authpin
&&
10658 !mdr
->more()->is_ambiguous_auth
) {
10659 mdr
->set_ambiguous_auth(mdr
->more()->rename_inode
);
10662 // witnessed? or add extra witnesses?
10663 ceph_assert(mdr
->more()->witnessed
.count(from
) == 0);
10664 if (ack
->is_interrupted()) {
10665 dout(10) << " peer request interrupted, noop" << dendl
;
10666 } else if (ack
->witnesses
.empty()) {
10667 mdr
->more()->witnessed
.insert(from
);
10668 if (!ack
->is_not_journaled())
10669 mdr
->more()->has_journaled_peers
= true;
10671 dout(10) << " extra witnesses (srcdn replicas) are " << ack
->witnesses
<< dendl
;
10672 mdr
->more()->extra_witnesses
= ack
->witnesses
;
10673 mdr
->more()->extra_witnesses
.erase(mds
->get_nodeid()); // not me!
10677 if (ack
->inode_export
.length()) {
10678 dout(10) << " got srci import" << dendl
;
10679 mdr
->more()->inode_import
.share(ack
->inode_export
);
10680 mdr
->more()->inode_import_v
= ack
->inode_export_v
;
10683 // remove from waiting list
10684 ceph_assert(mdr
->more()->waiting_on_peer
.count(from
));
10685 mdr
->more()->waiting_on_peer
.erase(from
);
10687 if (mdr
->more()->waiting_on_peer
.empty())
10688 dispatch_client_request(mdr
); // go again!
10690 dout(10) << "still waiting on peers " << mdr
->more()->waiting_on_peer
<< dendl
;
10693 void Server::handle_peer_rename_notify_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &ack
)
10695 dout(10) << "handle_peer_rename_notify_ack " << *mdr
<< " from mds."
10696 << ack
->get_source() << dendl
;
10697 ceph_assert(mdr
->is_peer());
10698 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
10700 if (mdr
->more()->waiting_on_peer
.count(from
)) {
10701 mdr
->more()->waiting_on_peer
.erase(from
);
10703 if (mdr
->more()->waiting_on_peer
.empty()) {
10704 if (mdr
->peer_request
)
10705 dispatch_peer_request(mdr
);
10707 dout(10) << " still waiting for rename notify acks from "
10708 << mdr
->more()->waiting_on_peer
<< dendl
;
10712 void Server::_peer_rename_sessions_flushed(MDRequestRef
& mdr
)
10714 dout(10) << "_peer_rename_sessions_flushed " << *mdr
<< dendl
;
10716 if (mdr
->more()->waiting_on_peer
.count(MDS_RANK_NONE
)) {
10717 mdr
->more()->waiting_on_peer
.erase(MDS_RANK_NONE
);
10719 if (mdr
->more()->waiting_on_peer
.empty()) {
10720 if (mdr
->peer_request
)
10721 dispatch_peer_request(mdr
);
10723 dout(10) << " still waiting for rename notify acks from "
10724 << mdr
->more()->waiting_on_peer
<< dendl
;
10729 /* This function takes responsibility for the passed mdr*/
10730 void Server::handle_client_lssnap(MDRequestRef
& mdr
)
10732 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
10734 // traverse to path
10735 CInode
*diri
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
10739 if (!diri
->is_dir()) {
10740 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
10743 dout(10) << "lssnap on " << *diri
<< dendl
;
10746 if (!mds
->locker
->try_rdlock_snap_layout(diri
, mdr
))
10749 if (!check_access(mdr
, diri
, MAY_READ
))
10752 SnapRealm
*realm
= diri
->find_snaprealm();
10753 map
<snapid_t
,const SnapInfo
*> infomap
;
10754 realm
->get_snap_info(infomap
, diri
->get_oldest_snap());
10756 unsigned max_entries
= req
->head
.args
.readdir
.max_entries
;
10758 max_entries
= infomap
.size();
10759 int max_bytes
= req
->head
.args
.readdir
.max_bytes
;
10761 // make sure at least one item can be encoded
10762 max_bytes
= (512 << 10) + g_conf()->mds_max_xattr_pairs_size
;
10764 __u64 last_snapid
= 0;
10765 string offset_str
= req
->get_path2();
10766 if (!offset_str
.empty())
10767 last_snapid
= realm
->resolve_snapname(offset_str
, diri
->ino());
10771 static DirStat empty
;
10772 CDir::encode_dirstat(dirbl
, mdr
->session
->info
, empty
);
10774 max_bytes
-= dirbl
.length() - sizeof(__u32
) + sizeof(__u8
) * 2;
10778 auto p
= infomap
.upper_bound(last_snapid
);
10779 for (; p
!= infomap
.end() && num
< max_entries
; ++p
) {
10780 dout(10) << p
->first
<< " -> " << *p
->second
<< dendl
;
10784 if (p
->second
->ino
== diri
->ino())
10785 snap_name
= p
->second
->name
;
10787 snap_name
= p
->second
->get_long_name();
10789 unsigned start_len
= dnbl
.length();
10790 if (int(start_len
+ snap_name
.length() + sizeof(__u32
) + sizeof(LeaseStat
)) > max_bytes
)
10793 encode(snap_name
, dnbl
);
10795 LeaseStat
e(CEPH_LEASE_VALID
, -1, 0);
10796 mds
->locker
->encode_lease(dnbl
, mdr
->session
->info
, e
);
10797 dout(20) << "encode_infinite_lease" << dendl
;
10799 int r
= diri
->encode_inodestat(dnbl
, mdr
->session
, realm
, p
->first
, max_bytes
- (int)dnbl
.length());
10802 keep
.substr_of(dnbl
, 0, start_len
);
10809 encode(num
, dirbl
);
10811 if (p
== infomap
.end()) {
10812 flags
= CEPH_READDIR_FRAG_END
;
10813 if (last_snapid
== 0)
10814 flags
|= CEPH_READDIR_FRAG_COMPLETE
;
10816 encode(flags
, dirbl
);
10817 dirbl
.claim_append(dnbl
);
10819 mdr
->reply_extra_bl
= dirbl
;
10820 mdr
->tracei
= diri
;
10821 respond_to_request(mdr
, 0);
10827 struct C_MDS_mksnap_finish
: public ServerLogContext
{
10830 C_MDS_mksnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, SnapInfo
&i
) :
10831 ServerLogContext(s
, r
), diri(di
), info(i
) {}
10832 void finish(int r
) override
{
10833 server
->_mksnap_finish(mdr
, diri
, info
);
10837 /* This function takes responsibility for the passed mdr*/
10838 void Server::handle_client_mksnap(MDRequestRef
& mdr
)
10840 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
10841 // make sure we have as new a map as the client
10842 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
10843 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
10846 if (!mds
->mdsmap
->allows_snaps()) {
10847 // you can't make snapshots until you set an option right now
10848 dout(5) << "new snapshots are disabled for this fs" << dendl
;
10849 respond_to_request(mdr
, -CEPHFS_EPERM
);
10853 CInode
*diri
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
10858 if (!diri
->is_dir()) {
10859 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
10862 if (diri
->is_system() && !diri
->is_root()) {
10863 // no snaps in system dirs (root is ok)
10864 dout(5) << "is an internal system dir" << dendl
;
10865 respond_to_request(mdr
, -CEPHFS_EPERM
);
10869 std::string_view snapname
= req
->get_filepath().last_dentry();
10871 if (mdr
->client_request
->get_caller_uid() < g_conf()->mds_snap_min_uid
|| mdr
->client_request
->get_caller_uid() > g_conf()->mds_snap_max_uid
) {
10872 dout(20) << "mksnap " << snapname
<< " on " << *diri
<< " denied to uid " << mdr
->client_request
->get_caller_uid() << dendl
;
10873 respond_to_request(mdr
, -CEPHFS_EPERM
);
10877 dout(10) << "mksnap " << snapname
<< " on " << *diri
<< dendl
;
10880 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
10881 MutationImpl::LockOpVec lov
;
10882 lov
.add_xlock(&diri
->snaplock
);
10883 if (!mds
->locker
->acquire_locks(mdr
, lov
))
10886 if (CDentry
*pdn
= diri
->get_projected_parent_dn(); pdn
) {
10887 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
))
10890 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
10893 if (!check_access(mdr
, diri
, MAY_WRITE
|MAY_SNAPSHOT
))
10896 if (inodeno_t subvol_ino
= diri
->find_snaprealm()->get_subvolume_ino();
10897 (subvol_ino
&& subvol_ino
!= diri
->ino())) {
10898 dout(5) << "is a descendent of a subvolume dir" << dendl
;
10899 respond_to_request(mdr
, -CEPHFS_EPERM
);
10903 // check if we can create any more snapshots
10904 // we don't allow any more if we are already at or beyond the limit
10905 if (diri
->snaprealm
&&
10906 diri
->snaprealm
->get_snaps().size() >= max_snaps_per_dir
) {
10907 respond_to_request(mdr
, -CEPHFS_EMLINK
);
10911 // make sure name is unique
10912 if (diri
->snaprealm
&&
10913 diri
->snaprealm
->exists(snapname
)) {
10914 respond_to_request(mdr
, -CEPHFS_EEXIST
);
10917 if (snapname
.length() == 0 ||
10918 snapname
.length() > snapshot_name_max
||
10919 snapname
[0] == '_') {
10920 respond_to_request(mdr
, -CEPHFS_EINVAL
);
10924 // allocate a snapid
10925 if (!mdr
->more()->stid
) {
10927 mds
->snapclient
->prepare_create(diri
->ino(), snapname
,
10928 mdr
->get_mds_stamp(),
10929 &mdr
->more()->stid
, &mdr
->more()->snapidbl
,
10930 new C_MDS_RetryRequest(mdcache
, mdr
));
10934 version_t stid
= mdr
->more()->stid
;
10936 auto p
= mdr
->more()->snapidbl
.cbegin();
10938 dout(10) << " stid " << stid
<< " snapid " << snapid
<< dendl
;
10940 ceph_assert(mds
->snapclient
->get_cached_version() >= stid
);
10942 SnapPayload payload
;
10943 if (req
->get_data().length()) {
10945 auto iter
= req
->get_data().cbegin();
10946 decode(payload
, iter
);
10947 } catch (const ceph::buffer::error
&e
) {
10948 // backward compat -- client sends xattr bufferlist. however,
10949 // that is not used anywhere -- so (log and) ignore.
10950 dout(20) << ": no metadata in payload (old client?)" << dendl
;
10956 info
.ino
= diri
->ino();
10957 info
.snapid
= snapid
;
10958 info
.name
= snapname
;
10959 info
.stamp
= mdr
->get_op_stamp();
10960 info
.metadata
= payload
.metadata
;
10962 auto pi
= diri
->project_inode(mdr
, false, true);
10963 pi
.inode
->ctime
= info
.stamp
;
10964 if (info
.stamp
> pi
.inode
->rstat
.rctime
)
10965 pi
.inode
->rstat
.rctime
= info
.stamp
;
10966 pi
.inode
->rstat
.rsnaps
++;
10967 pi
.inode
->version
= diri
->pre_dirty();
10969 // project the snaprealm
10970 auto &newsnap
= *pi
.snapnode
;
10971 newsnap
.created
= snapid
;
10972 auto em
= newsnap
.snaps
.emplace(std::piecewise_construct
, std::forward_as_tuple(snapid
), std::forward_as_tuple(info
));
10974 em
.first
->second
= info
;
10975 newsnap
.seq
= snapid
;
10976 newsnap
.last_created
= snapid
;
10977 newsnap
.last_modified
= info
.stamp
;
10978 newsnap
.change_attr
++;
10980 // journal the inode changes
10981 mdr
->ls
= mdlog
->get_current_segment();
10982 EUpdate
*le
= new EUpdate(mdlog
, "mksnap");
10983 mdlog
->start_entry(le
);
10985 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
10986 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
10987 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
10988 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
10990 // journal the snaprealm changes
10991 submit_mdlog_entry(le
, new C_MDS_mksnap_finish(this, mdr
, diri
, info
),
10996 void Server::_mksnap_finish(MDRequestRef
& mdr
, CInode
*diri
, SnapInfo
&info
)
10998 dout(10) << "_mksnap_finish " << *mdr
<< " " << info
<< dendl
;
11000 int op
= (diri
->snaprealm
? CEPH_SNAP_OP_CREATE
: CEPH_SNAP_OP_SPLIT
);
11004 mds
->snapclient
->commit(mdr
->more()->stid
, mdr
->ls
);
11007 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
11009 // notify other mds
11010 mdcache
->send_snap_update(diri
, mdr
->more()->stid
, op
);
11012 mdcache
->do_realm_invalidate_and_update_notify(diri
, op
);
11016 mdr
->snapid
= info
.snapid
;
11017 mdr
->tracei
= diri
;
11018 respond_to_request(mdr
, 0);
11024 struct C_MDS_rmsnap_finish
: public ServerLogContext
{
11027 C_MDS_rmsnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, snapid_t sn
) :
11028 ServerLogContext(s
, r
), diri(di
), snapid(sn
) {}
11029 void finish(int r
) override
{
11030 server
->_rmsnap_finish(mdr
, diri
, snapid
);
11034 /* This function takes responsibility for the passed mdr*/
11035 void Server::handle_client_rmsnap(MDRequestRef
& mdr
)
11037 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
11039 CInode
*diri
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
11043 if (!diri
->is_dir()) {
11044 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
11048 std::string_view snapname
= req
->get_filepath().last_dentry();
11050 if (mdr
->client_request
->get_caller_uid() < g_conf()->mds_snap_min_uid
|| mdr
->client_request
->get_caller_uid() > g_conf()->mds_snap_max_uid
) {
11051 dout(20) << "rmsnap " << snapname
<< " on " << *diri
<< " denied to uid " << mdr
->client_request
->get_caller_uid() << dendl
;
11052 respond_to_request(mdr
, -CEPHFS_EPERM
);
11056 dout(10) << "rmsnap " << snapname
<< " on " << *diri
<< dendl
;
11058 // does snap exist?
11059 if (snapname
.length() == 0 || snapname
[0] == '_') {
11060 respond_to_request(mdr
, -CEPHFS_EINVAL
); // can't prune a parent snap, currently.
11063 if (!diri
->snaprealm
|| !diri
->snaprealm
->exists(snapname
)) {
11064 respond_to_request(mdr
, -CEPHFS_ENOENT
);
11067 snapid_t snapid
= diri
->snaprealm
->resolve_snapname(snapname
, diri
->ino());
11068 dout(10) << " snapname " << snapname
<< " is " << snapid
<< dendl
;
11069 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
11070 MutationImpl::LockOpVec lov
;
11071 lov
.add_xlock(&diri
->snaplock
);
11072 if (!mds
->locker
->acquire_locks(mdr
, lov
))
11074 if (CDentry
*pdn
= diri
->get_projected_parent_dn(); pdn
) {
11075 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
))
11078 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
11081 if (!check_access(mdr
, diri
, MAY_WRITE
|MAY_SNAPSHOT
))
11085 if (!mdr
->more()->stid
) {
11086 mds
->snapclient
->prepare_destroy(diri
->ino(), snapid
,
11087 &mdr
->more()->stid
, &mdr
->more()->snapidbl
,
11088 new C_MDS_RetryRequest(mdcache
, mdr
));
11091 version_t stid
= mdr
->more()->stid
;
11092 auto p
= mdr
->more()->snapidbl
.cbegin();
11095 dout(10) << " stid is " << stid
<< ", seq is " << seq
<< dendl
;
11097 ceph_assert(mds
->snapclient
->get_cached_version() >= stid
);
11100 auto pi
= diri
->project_inode(mdr
, false, true);
11101 pi
.inode
->version
= diri
->pre_dirty();
11102 pi
.inode
->ctime
= mdr
->get_op_stamp();
11103 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
11104 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
11105 pi
.inode
->rstat
.rsnaps
--;
11107 mdr
->ls
= mdlog
->get_current_segment();
11108 EUpdate
*le
= new EUpdate(mdlog
, "rmsnap");
11109 mdlog
->start_entry(le
);
11111 // project the snaprealm
11112 auto &newnode
= *pi
.snapnode
;
11113 newnode
.snaps
.erase(snapid
);
11115 newnode
.last_destroyed
= seq
;
11116 newnode
.last_modified
= mdr
->get_op_stamp();
11117 newnode
.change_attr
++;
11119 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
11120 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
11121 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
11122 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
11124 submit_mdlog_entry(le
, new C_MDS_rmsnap_finish(this, mdr
, diri
, snapid
),
11129 void Server::_rmsnap_finish(MDRequestRef
& mdr
, CInode
*diri
, snapid_t snapid
)
11131 dout(10) << "_rmsnap_finish " << *mdr
<< " " << snapid
<< dendl
;
11132 snapid_t stid
= mdr
->more()->stid
;
11136 mds
->snapclient
->commit(stid
, mdr
->ls
);
11138 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
11140 // notify other mds
11141 mdcache
->send_snap_update(diri
, mdr
->more()->stid
, CEPH_SNAP_OP_DESTROY
);
11143 mdcache
->do_realm_invalidate_and_update_notify(diri
, CEPH_SNAP_OP_DESTROY
);
11147 mdr
->tracei
= diri
;
11148 mdr
->snapid
= snapid
;
11149 respond_to_request(mdr
, 0);
11151 // purge snapshot data
11152 diri
->purge_stale_snap_data(diri
->snaprealm
->get_snaps());
11155 struct C_MDS_renamesnap_finish
: public ServerLogContext
{
11158 C_MDS_renamesnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, snapid_t sn
) :
11159 ServerLogContext(s
, r
), diri(di
), snapid(sn
) {}
11160 void finish(int r
) override
{
11161 server
->_renamesnap_finish(mdr
, diri
, snapid
);
11165 /* This function takes responsibility for the passed mdr*/
11166 void Server::handle_client_renamesnap(MDRequestRef
& mdr
)
11168 const cref_t
<MClientRequest
> &req
= mdr
->client_request
;
11169 if (req
->get_filepath().get_ino() != req
->get_filepath2().get_ino()) {
11170 respond_to_request(mdr
, -CEPHFS_EINVAL
);
11174 CInode
*diri
= try_get_auth_inode(mdr
, req
->get_filepath().get_ino());
11178 if (!diri
->is_dir()) { // dir only
11179 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
11183 if (mdr
->client_request
->get_caller_uid() < g_conf()->mds_snap_min_uid
||
11184 mdr
->client_request
->get_caller_uid() > g_conf()->mds_snap_max_uid
) {
11185 respond_to_request(mdr
, -CEPHFS_EPERM
);
11189 std::string_view dstname
= req
->get_filepath().last_dentry();
11190 std::string_view srcname
= req
->get_filepath2().last_dentry();
11191 dout(10) << "renamesnap " << srcname
<< "->" << dstname
<< " on " << *diri
<< dendl
;
11193 if (srcname
.length() == 0 || srcname
[0] == '_') {
11194 respond_to_request(mdr
, -CEPHFS_EINVAL
); // can't rename a parent snap.
11197 if (!diri
->snaprealm
|| !diri
->snaprealm
->exists(srcname
)) {
11198 respond_to_request(mdr
, -CEPHFS_ENOENT
);
11201 if (dstname
.length() == 0 || dstname
[0] == '_') {
11202 respond_to_request(mdr
, -CEPHFS_EINVAL
);
11205 if (diri
->snaprealm
->exists(dstname
)) {
11206 respond_to_request(mdr
, -CEPHFS_EEXIST
);
11210 snapid_t snapid
= diri
->snaprealm
->resolve_snapname(srcname
, diri
->ino());
11212 dout(10) << " snapname " << srcname
<< " is " << snapid
<< dendl
;
11215 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
11216 MutationImpl::LockOpVec lov
;
11217 lov
.add_xlock(&diri
->snaplock
);
11218 if (!mds
->locker
->acquire_locks(mdr
, lov
))
11220 if (CDentry
*pdn
= diri
->get_projected_parent_dn(); pdn
) {
11221 if (!mds
->locker
->try_rdlock_snap_layout(pdn
->get_dir()->get_inode(), mdr
))
11224 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
11227 if (!check_access(mdr
, diri
, MAY_WRITE
|MAY_SNAPSHOT
))
11231 if (!mdr
->more()->stid
) {
11232 mds
->snapclient
->prepare_update(diri
->ino(), snapid
, dstname
, utime_t(),
11233 &mdr
->more()->stid
,
11234 new C_MDS_RetryRequest(mdcache
, mdr
));
11238 version_t stid
= mdr
->more()->stid
;
11239 dout(10) << " stid is " << stid
<< dendl
;
11241 ceph_assert(mds
->snapclient
->get_cached_version() >= stid
);
11244 auto pi
= diri
->project_inode(mdr
, false, true);
11245 pi
.inode
->ctime
= mdr
->get_op_stamp();
11246 if (mdr
->get_op_stamp() > pi
.inode
->rstat
.rctime
)
11247 pi
.inode
->rstat
.rctime
= mdr
->get_op_stamp();
11248 pi
.inode
->version
= diri
->pre_dirty();
11250 // project the snaprealm
11251 auto &newsnap
= *pi
.snapnode
;
11252 auto it
= newsnap
.snaps
.find(snapid
);
11253 ceph_assert(it
!= newsnap
.snaps
.end());
11254 it
->second
.name
= dstname
;
11255 newsnap
.last_modified
= mdr
->get_op_stamp();
11256 newsnap
.change_attr
++;
11258 // journal the inode changes
11259 mdr
->ls
= mdlog
->get_current_segment();
11260 EUpdate
*le
= new EUpdate(mdlog
, "renamesnap");
11261 mdlog
->start_entry(le
);
11263 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
11264 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
11265 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
11266 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
11268 // journal the snaprealm changes
11269 submit_mdlog_entry(le
, new C_MDS_renamesnap_finish(this, mdr
, diri
, snapid
),
11274 void Server::_renamesnap_finish(MDRequestRef
& mdr
, CInode
*diri
, snapid_t snapid
)
11276 dout(10) << "_renamesnap_finish " << *mdr
<< " " << snapid
<< dendl
;
11280 mds
->snapclient
->commit(mdr
->more()->stid
, mdr
->ls
);
11282 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
11284 // notify other mds
11285 mdcache
->send_snap_update(diri
, mdr
->more()->stid
, CEPH_SNAP_OP_UPDATE
);
11287 mdcache
->do_realm_invalidate_and_update_notify(diri
, CEPH_SNAP_OP_UPDATE
);
11291 mdr
->tracei
= diri
;
11292 mdr
->snapid
= snapid
;
11293 respond_to_request(mdr
, 0);
11296 void Server::handle_client_readdir_snapdiff(MDRequestRef
& mdr
)
11298 const cref_t
<MClientRequest
>& req
= mdr
->client_request
;
11299 Session
* session
= mds
->get_session(req
);
11300 MutationImpl::LockOpVec lov
;
11301 CInode
* diri
= rdlock_path_pin_ref(mdr
, false, true);
11304 // it's a directory, right?
11305 if (!diri
->is_dir()) {
11307 dout(10) << "reply to " << *req
<< " snapdiff -CEPHFS_ENOTDIR" << dendl
;
11308 respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
11312 auto num_caps
= session
->get_num_caps();
11313 auto session_cap_acquisition
= session
->get_cap_acquisition();
11315 if (num_caps
> static_cast<uint64_t>(max_caps_per_client
* max_caps_throttle_ratio
) && session_cap_acquisition
>= cap_acquisition_throttle
) {
11316 dout(20) << "snapdiff throttled. max_caps_per_client: " << max_caps_per_client
<< " num_caps: " << num_caps
11317 << " session_cap_acquistion: " << session_cap_acquisition
<< " cap_acquisition_throttle: " << cap_acquisition_throttle
<< dendl
;
11319 logger
->inc(l_mdss_cap_acquisition_throttle
);
11321 mds
->timer
.add_event_after(caps_throttle_retry_request_timeout
, new C_MDS_RetryRequest(mdcache
, mdr
));
11325 lov
.add_rdlock(&diri
->filelock
);
11326 lov
.add_rdlock(&diri
->dirfragtreelock
);
11328 if (!mds
->locker
->acquire_locks(mdr
, lov
))
11331 if (!check_access(mdr
, diri
, MAY_READ
))
11335 frag_t fg
= (__u32
)req
->head
.args
.snapdiff
.frag
;
11336 unsigned req_flags
= (__u32
)req
->head
.args
.snapdiff
.flags
;
11337 string offset_str
= req
->get_path2();
11339 __u32 offset_hash
= 0;
11340 if (!offset_str
.empty()) {
11341 offset_hash
= ceph_frag_value(diri
->hash_dentry_name(offset_str
));
11343 offset_hash
= (__u32
)req
->head
.args
.snapdiff
.offset_hash
;
11346 dout(10) << " frag " << fg
<< " offset '" << offset_str
<< "'"
11347 << " offset_hash " << offset_hash
<< " flags " << req_flags
<< dendl
;
11349 // does the frag exist?
11350 if (diri
->dirfragtree
[fg
.value()] != fg
) {
11352 if (req_flags
& CEPH_READDIR_REPLY_BITFLAGS
) {
11353 if (fg
.contains((unsigned)offset_hash
)) {
11354 newfg
= diri
->dirfragtree
[offset_hash
];
11356 // client actually wants next frag
11357 newfg
= diri
->dirfragtree
[fg
.value()];
11360 offset_str
.clear();
11361 newfg
= diri
->dirfragtree
[fg
.value()];
11363 dout(10) << " adjust frag " << fg
<< " -> " << newfg
<< " " << diri
->dirfragtree
<< dendl
;
11367 CDir
* dir
= try_open_auth_dirfrag(diri
, fg
, mdr
);
11371 dout(10) << __func__
<< " on " << *dir
<< dendl
;
11372 ceph_assert(dir
->is_auth());
11374 if (!dir
->is_complete()) {
11375 if (dir
->is_frozen()) {
11376 dout(7) << "dir is frozen " << *dir
<< dendl
;
11377 mds
->locker
->drop_locks(mdr
.get());
11378 mdr
->drop_local_auth_pins();
11379 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
11383 dout(10) << " incomplete dir contents for snapdiff on " << *dir
<< ", fetching" << dendl
;
11384 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
), true);
11388 #ifdef MDS_VERIFY_FRAGSTAT
11389 dir
->verify_fragstat();
11392 utime_t now
= ceph_clock_now();
11393 mdr
->set_mds_stamp(now
);
11395 mdr
->snapid_diff_other
= (uint64_t)req
->head
.args
.snapdiff
.snap_other
;
11396 if (mdr
->snapid_diff_other
== mdr
->snapid
||
11397 mdr
->snapid
== CEPH_NOSNAP
||
11398 mdr
->snapid_diff_other
== CEPH_NOSNAP
) {
11399 dout(10) << "reply to " << *req
<< " snapdiff -CEPHFS_EINVAL" << dendl
;
11400 respond_to_request(mdr
, -CEPHFS_EINVAL
);
11403 dout(10) << __func__
11404 << " snap " << mdr
->snapid
11405 << " vs. snap " << mdr
->snapid_diff_other
11408 SnapRealm
* realm
= diri
->find_snaprealm();
11410 unsigned max
= req
->head
.args
.snapdiff
.max_entries
;
11412 max
= dir
->get_num_any(); // whatever, something big.
11413 unsigned max_bytes
= req
->head
.args
.snapdiff
.max_bytes
;
11415 // make sure at least one item can be encoded
11416 max_bytes
= (512 << 10) + g_conf()->mds_max_xattr_pairs_size
;
11418 // start final blob
11421 ds
.frag
= dir
->get_frag();
11422 ds
.auth
= dir
->get_dir_auth().first
;
11423 if (dir
->is_auth() && !forward_all_requests_to_auth
)
11424 dir
->get_dist_spec(ds
.dist
, mds
->get_nodeid());
11426 dir
->encode_dirstat(dirbl
, mdr
->session
->info
, ds
);
11428 // count bytes available.
11429 // this isn't perfect, but we should capture the main variable/unbounded size items!
11430 int front_bytes
= dirbl
.length() + sizeof(__u32
) + sizeof(__u8
) * 2;
11431 int bytes_left
= max_bytes
- front_bytes
;
11432 bytes_left
-= get_snap_trace(session
, realm
).length();
11450 * Return true if server is in state RECONNECT and this
11451 * client has not yet reconnected.
11453 bool Server::waiting_for_reconnect(client_t c
) const
11455 return client_reconnect_gather
.count(c
) > 0;
11458 void Server::dump_reconnect_status(Formatter
*f
) const
11460 f
->open_object_section("reconnect_status");
11461 f
->dump_stream("client_reconnect_gather") << client_reconnect_gather
;
11462 f
->close_section();
11465 const bufferlist
& Server::get_snap_trace(Session
*session
, SnapRealm
*realm
) const {
11466 ceph_assert(session
);
11467 ceph_assert(realm
);
11468 if (session
->info
.has_feature(CEPHFS_FEATURE_NEW_SNAPREALM_INFO
)) {
11469 return realm
->get_snap_trace_new();
11471 return realm
->get_snap_trace();
11475 const bufferlist
& Server::get_snap_trace(client_t client
, SnapRealm
*realm
) const {
11476 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(client
.v
));
11477 return get_snap_trace(session
, realm
);
11480 void Server::_readdir_diff(
11486 unsigned max_entries
,
11488 const string
& offset_str
,
11489 uint32_t offset_hash
,
11490 unsigned req_flags
,
11493 // build dir contents
11495 __u32 numfiles
= 0;
11497 snapid_t snapid
= mdr
->snapid
;
11498 snapid_t snapid_prev
= mdr
->snapid_diff_other
;
11499 if (snapid
< snapid_prev
) {
11500 std::swap(snapid
, snapid_prev
);
11502 bool from_the_beginning
= !offset_hash
&& offset_str
.empty();
11503 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
11504 dentry_key_t
skip_key(snapid_prev
, offset_str
.c_str(), offset_hash
);
11506 bool end
= build_snap_diff(
11510 from_the_beginning
? nullptr : & skip_key
,
11514 [&](CDentry
* dn
, CInode
* in
, bool exists
) {
11516 snapid_t effective_snapid
;
11517 const auto& dn_name
= dn
->get_name();
11518 // provide the first snapid for removed entries and
11519 // the last one for existent ones
11520 effective_snapid
= exists
? snapid
: snapid_prev
;
11521 name
.append(dn_name
);
11522 if ((int)(dnbl
.length() + name
.length() + sizeof(__u32
) + sizeof(LeaseStat
)) > bytes_left
) {
11523 dout(10) << " ran out of room, stopping at " << dnbl
.length() << " < " << bytes_left
<< dendl
;
11527 auto diri
= dir
->get_inode();
11528 auto hash
= ceph_frag_value(diri
->hash_dentry_name(dn_name
));
11529 unsigned start_len
= dnbl
.length();
11530 dout(10) << "inc dn " << *dn
<< " as " << name
11531 << std::hex
<< " hash 0x" << hash
<< std::dec
11533 encode(name
, dnbl
);
11534 mds
->locker
->issue_client_lease(dn
, in
, mdr
, now
, dnbl
);
11537 dout(10) << "inc inode " << *in
<< " snap " << effective_snapid
<< dendl
;
11538 int r
= in
->encode_inodestat(dnbl
, mdr
->session
, realm
, effective_snapid
, bytes_left
- (int)dnbl
.length());
11540 // chop off dn->name, lease
11541 dout(10) << " ran out of room, stopping at "
11542 << start_len
<< " < " << bytes_left
<< dendl
;
11544 keep
.substr_of(dnbl
, 0, start_len
);
11550 mdcache
->lru
.lru_touch(dn
);
11556 if (req_flags
& CEPH_READDIR_REPLY_BITFLAGS
) {
11557 flags
|= CEPH_READDIR_HASH_ORDER
| CEPH_READDIR_OFFSET_HASH
;
11560 std::swap(mdr
->snapid
, mdr
->snapid_diff_other
); // we want opponent snapid to be used for tracei
11562 _finalize_readdir(mdr
, diri
, dir
, from_the_beginning
, end
, flags
, numfiles
,
11566 bool Server::build_snap_diff(
11570 dentry_key_t
* skip_key
,
11571 snapid_t snapid_prev
,
11573 const bufferlist
& dnbl
,
11574 std::function
<bool (CDentry
*, CInode
*, bool)> add_result_cb
)
11576 client_t client
= mdr
->client_request
->get_source().num();
11579 CDentry
* dn
= nullptr;
11580 CInode
* in
= nullptr;
11584 *this = EntryInfo();
11588 auto insert_deleted
= [&](EntryInfo
& ei
) {
11589 dout(20) << "build_snap_diff deleted file " << ei
.dn
->get_name() << " "
11590 << ei
.dn
->first
<< "/" << ei
.dn
->last
<< dendl
;
11591 int r
= add_result_cb(ei
.dn
, ei
.in
, false);
11596 auto it
= !skip_key
? dir
->begin() : dir
->lower_bound(*skip_key
);
11598 while(it
!= dir
->end()) {
11599 CDentry
* dn
= it
->second
;
11600 dout(20) << __func__
<< " " << it
->first
<< "->" << *dn
<< dendl
;
11602 if (dn
->state_test(CDentry::STATE_PURGING
))
11605 bool dnp
= dn
->use_projected(client
, mdr
);
11606 CDentry::linkage_t
* dnl
= dnp
? dn
->get_projected_linkage() : dn
->get_linkage();
11608 if (dnl
->is_null()) {
11609 dout(20) << __func__
<< " linkage is null, skipping" << dendl
;
11613 if (dn
->last
< snapid_prev
|| dn
->first
> snapid
) {
11614 dout(20) << __func__
<< " not in range, skipping" << dendl
;
11618 skip_key
->snapid
= dn
->last
;
11619 if (!(*skip_key
< dn
->key()))
11623 CInode
* in
= dnl
->get_inode();
11624 if (in
&& in
->ino() == CEPH_INO_CEPH
)
11628 // better for the MDS to do the work, if we think the client will stat any of these files.
11629 if (dnl
->is_remote() && !in
) {
11630 in
= mdcache
->get_inode(dnl
->get_remote_ino());
11631 dout(20) << __func__
<< " remote in: " << *in
<< " ino " << std::hex
<< dnl
->get_remote_ino() << std::dec
<< dendl
;
11633 dn
->link_remote(dnl
, in
);
11634 } else if (dn
->state_test(CDentry::STATE_BADREMOTEINO
)) {
11635 dout(10) << "skipping bad remote ino on " << *dn
<< dendl
;
11638 // touch everything i _do_ have
11639 for (auto& p
: *dir
) {
11640 if (!p
.second
->get_linkage()->is_null())
11641 mdcache
->lru
.lru_touch(p
.second
);
11644 // already issued caps and leases, reply immediately.
11645 if (dnbl
.length() > 0) {
11646 mdcache
->open_remote_dentry(dn
, dnp
, new C_MDSInternalNoop
);
11647 dout(10) << " open remote dentry after caps were issued, stopping at "
11648 << dnbl
.length() << " < " << bytes_left
<< dendl
;
11650 mds
->locker
->drop_locks(mdr
.get());
11651 mdr
->drop_local_auth_pins();
11652 mdcache
->open_remote_dentry(dn
, dnp
, new C_MDS_RetryRequest(mdcache
, mdr
));
11659 utime_t mtime
= in
->get_inode()->mtime
;
11661 if (in
->is_dir()) {
11663 // we need to maintain the order of entries (determined by their name hashes)
11664 // hence need to insert the previous entry if any immediately.
11666 if (!insert_deleted(before
)) {
11671 bool exists
= true;
11672 if (snapid_prev
< dn
->first
&& dn
->last
< snapid
) {
11673 dout(20) << __func__
<< " skipping inner " << dn
->get_name() << " "
11674 << dn
->first
<< "/" << dn
->last
<< dendl
;
11676 } else if (dn
->first
<= snapid_prev
&& dn
->last
< snapid
) {
11678 dout(20) << __func__
<< " deleted dir " << dn
->get_name() << " "
11679 << dn
->first
<< "/" << dn
->last
<< dendl
;
11682 bool r
= add_result_cb(dn
, in
, exists
);
11687 if (snapid_prev
>= dn
->first
&& snapid
<= dn
->last
) {
11688 dout(20) << __func__
<< " skipping unchanged " << dn
->get_name() << " "
11689 << dn
->first
<< "/" << dn
->last
<< dendl
;
11691 } else if (snapid_prev
< dn
->first
&& snapid
> dn
->last
) {
11692 dout(20) << __func__
<< " skipping inner modification " << dn
->get_name() << " "
11693 << dn
->first
<< "/" << dn
->last
<< dendl
;
11696 string_view name_before
=
11697 before
.dn
? string_view(before
.dn
->get_name()) : string_view();
11698 if (before
.dn
&& dn
->get_name() != name_before
) {
11699 if (!insert_deleted(before
)) {
11704 if (snapid_prev
>= dn
->first
&& snapid_prev
<= dn
->last
) {
11705 dout(30) << __func__
<< " dn_before " << dn
->get_name() << " "
11706 << dn
->first
<< "/" << dn
->last
<< dendl
;
11707 before
= EntryInfo
{dn
, in
, mtime
};
11710 if (before
.dn
&& dn
->get_name() == name_before
) {
11711 if (mtime
== before
.mtime
) {
11712 dout(30) << __func__
<< " timestamp not changed " << dn
->get_name() << " "
11713 << dn
->first
<< "/" << dn
->last
11719 dout(30) << __func__
<< " timestamp changed " << dn
->get_name() << " "
11720 << dn
->first
<< "/" << dn
->last
11721 << " " << before
.mtime
<< " vs. " << mtime
11726 dout(20) << __func__
<< " new file " << dn
->get_name() << " "
11727 << dn
->first
<< "/" << dn
->last
11729 ceph_assert(snapid
>= dn
->first
&& snapid
<= dn
->last
);
11731 if (!add_result_cb(dn
, in
, true)) {
11737 insert_deleted(before
);
11739 return it
== dir
->end();