1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
15 #include <boost/lexical_cast.hpp>
16 #include "include/assert.h" // lexical_cast includes system assert.h
18 #include <boost/config/warning_disable.hpp>
19 #include <boost/fusion/include/std_pair.hpp>
27 #include "MDBalancer.h"
29 #include "SnapClient.h"
32 #include "msg/Messenger.h"
34 #include "osdc/Objecter.h"
36 #include "messages/MClientSession.h"
37 #include "messages/MClientRequest.h"
38 #include "messages/MClientReply.h"
39 #include "messages/MClientReconnect.h"
40 #include "messages/MClientCaps.h"
41 #include "messages/MClientSnap.h"
43 #include "messages/MMDSSlaveRequest.h"
45 #include "messages/MLock.h"
47 #include "events/EUpdate.h"
48 #include "events/ESlaveUpdate.h"
49 #include "events/ESession.h"
50 #include "events/EOpen.h"
51 #include "events/ECommitted.h"
53 #include "include/filepath.h"
54 #include "common/errno.h"
55 #include "common/Timer.h"
56 #include "common/perf_counters.h"
57 #include "include/compat.h"
58 #include "osd/OSDMap.h"
64 #include <boost/utility/string_view.hpp>
67 #include "common/config.h"
69 #define dout_context g_ceph_context
70 #define dout_subsys ceph_subsys_mds
72 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
74 class ServerContext
: public MDSInternalContextBase
{
77 MDSRank
*get_mds() override
83 explicit ServerContext(Server
*s
) : server(s
) {
84 assert(server
!= NULL
);
88 class ServerLogContext
: public MDSLogContextBase
{
91 MDSRank
*get_mds() override
97 void pre_finish(int r
) override
{
99 mdr
->mark_event("journal_committed: ");
102 explicit ServerLogContext(Server
*s
) : server(s
) {
103 assert(server
!= NULL
);
105 explicit ServerLogContext(Server
*s
, MDRequestRef
& r
) : server(s
), mdr(r
) {
106 assert(server
!= NULL
);
110 void Server::create_logger()
112 PerfCountersBuilder
plb(g_ceph_context
, "mds_server", l_mdss_first
, l_mdss_last
);
113 plb
.add_u64_counter(l_mdss_handle_client_request
,"handle_client_request",
114 "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING
);
115 plb
.add_u64_counter(l_mdss_handle_slave_request
, "handle_slave_request",
116 "Slave requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING
);
117 plb
.add_u64_counter(l_mdss_handle_client_session
, "handle_client_session",
118 "Client session messages", "hcs", PerfCountersBuilder::PRIO_INTERESTING
);
119 plb
.add_u64_counter(l_mdss_dispatch_client_request
, "dispatch_client_request", "Client requests dispatched");
120 plb
.add_u64_counter(l_mdss_dispatch_slave_request
, "dispatch_server_request", "Server requests dispatched");
121 plb
.add_u64_counter(l_mdss_req_lookuphash
, "req_lookuphash",
122 "Request type lookup hash of inode");
123 plb
.add_u64_counter(l_mdss_req_lookupino
, "req_lookupino",
124 "Request type lookup inode");
125 plb
.add_u64_counter(l_mdss_req_lookupparent
, "req_lookupparent",
126 "Request type lookup parent");
127 plb
.add_u64_counter(l_mdss_req_lookupname
, "req_lookupname",
128 "Request type lookup name");
129 plb
.add_u64_counter(l_mdss_req_lookup
, "req_lookup",
130 "Request type lookup");
131 plb
.add_u64_counter(l_mdss_req_lookupsnap
, "req_lookupsnap",
132 "Request type lookup snapshot");
133 plb
.add_u64_counter(l_mdss_req_getattr
, "req_getattr",
134 "Request type get attribute");
135 plb
.add_u64_counter(l_mdss_req_setattr
, "req_setattr",
136 "Request type set attribute");
137 plb
.add_u64_counter(l_mdss_req_setlayout
, "req_setlayout",
138 "Request type set file layout");
139 plb
.add_u64_counter(l_mdss_req_setdirlayout
, "req_setdirlayout",
140 "Request type set directory layout");
141 plb
.add_u64_counter(l_mdss_req_setxattr
, "req_setxattr",
142 "Request type set extended attribute");
143 plb
.add_u64_counter(l_mdss_req_rmxattr
, "req_rmxattr",
144 "Request type remove extended attribute");
145 plb
.add_u64_counter(l_mdss_req_readdir
, "req_readdir",
146 "Request type read directory");
147 plb
.add_u64_counter(l_mdss_req_setfilelock
, "req_setfilelock",
148 "Request type set file lock");
149 plb
.add_u64_counter(l_mdss_req_getfilelock
, "req_getfilelock",
150 "Request type get file lock");
151 plb
.add_u64_counter(l_mdss_req_create
, "req_create",
152 "Request type create");
153 plb
.add_u64_counter(l_mdss_req_open
, "req_open",
154 "Request type open");
155 plb
.add_u64_counter(l_mdss_req_mknod
, "req_mknod",
156 "Request type make node");
157 plb
.add_u64_counter(l_mdss_req_link
, "req_link",
158 "Request type link");
159 plb
.add_u64_counter(l_mdss_req_unlink
, "req_unlink",
160 "Request type unlink");
161 plb
.add_u64_counter(l_mdss_req_rmdir
, "req_rmdir",
162 "Request type remove directory");
163 plb
.add_u64_counter(l_mdss_req_rename
, "req_rename",
164 "Request type rename");
165 plb
.add_u64_counter(l_mdss_req_mkdir
, "req_mkdir",
166 "Request type make directory");
167 plb
.add_u64_counter(l_mdss_req_symlink
, "req_symlink",
168 "Request type symbolic link");
169 plb
.add_u64_counter(l_mdss_req_lssnap
, "req_lssnap",
170 "Request type list snapshot");
171 plb
.add_u64_counter(l_mdss_req_mksnap
, "req_mksnap",
172 "Request type make snapshot");
173 plb
.add_u64_counter(l_mdss_req_rmsnap
, "req_rmsnap",
174 "Request type remove snapshot");
175 plb
.add_u64_counter(l_mdss_req_renamesnap
, "req_renamesnap",
176 "Request type rename snapshot");
177 logger
= plb
.create_perf_counters();
178 g_ceph_context
->get_perfcounters_collection()->add(logger
);
181 Server::Server(MDSRank
*m
) :
183 mdcache(mds
->mdcache
), mdlog(mds
->mdlog
),
186 reconnect_done(NULL
),
187 failed_reconnects(0),
188 reconnect_evicting(false),
189 terminating_sessions(false)
194 /* This function DOES put the passed message before returning*/
195 void Server::dispatch(Message
*m
)
197 switch (m
->get_type()) {
198 case CEPH_MSG_CLIENT_RECONNECT
:
199 handle_client_reconnect(static_cast<MClientReconnect
*>(m
));
204 // handle_slave_request()/handle_client_session() will wait if necessary
205 if (m
->get_type() == CEPH_MSG_CLIENT_REQUEST
&& !mds
->is_active()) {
206 MClientRequest
*req
= static_cast<MClientRequest
*>(m
);
207 if (mds
->is_reconnect() || mds
->get_want_state() == CEPH_MDS_STATE_RECONNECT
) {
208 Session
*session
= mds
->get_session(req
);
209 if (!session
|| session
->is_closed()) {
210 dout(5) << "session is closed, dropping " << req
->get_reqid() << dendl
;
214 bool queue_replay
= false;
215 if (req
->is_replay()) {
216 dout(3) << "queuing replayed op" << dendl
;
218 } else if (req
->get_retry_attempt()) {
219 // process completed request in clientreplay stage. The completed request
220 // might have created new file/directorie. This guarantees MDS sends a reply
221 // to client before other request modifies the new file/directorie.
222 if (session
->have_completed_request(req
->get_reqid().tid
, NULL
)) {
223 dout(3) << "queuing completed op" << dendl
;
226 // this request was created before the cap reconnect message, drop any embedded
228 req
->releases
.clear();
231 req
->mark_queued_for_replay();
232 mds
->enqueue_replay(new C_MDS_RetryMessage(mds
, m
));
237 bool wait_for_active
= true;
238 if (mds
->is_stopping()) {
239 wait_for_active
= false;
240 } else if (mds
->is_clientreplay()) {
241 if (req
->is_queued_for_replay()) {
242 wait_for_active
= false;
245 if (wait_for_active
) {
246 dout(3) << "not active yet, waiting" << dendl
;
247 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
252 switch (m
->get_type()) {
253 case CEPH_MSG_CLIENT_SESSION
:
254 handle_client_session(static_cast<MClientSession
*>(m
));
256 case CEPH_MSG_CLIENT_REQUEST
:
257 handle_client_request(static_cast<MClientRequest
*>(m
));
259 case MSG_MDS_SLAVE_REQUEST
:
260 handle_slave_request(static_cast<MMDSSlaveRequest
*>(m
));
263 derr
<< "server unknown message " << m
->get_type() << dendl
;
264 assert(0 == "server unknown message");
270 // ----------------------------------------------------------
271 // SESSION management
273 class C_MDS_session_finish
: public ServerLogContext
{
278 interval_set
<inodeno_t
> inos
;
282 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
, Context
*fin_
= NULL
) :
283 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
), inotablev(0), fin(fin_
) { }
284 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
, interval_set
<inodeno_t
>& i
, version_t iv
, Context
*fin_
= NULL
) :
285 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
), inos(i
), inotablev(iv
), fin(fin_
) { }
286 void finish(int r
) override
{
288 server
->_session_logged(session
, state_seq
, open
, cmapv
, inos
, inotablev
);
295 /* This function DOES put the passed message before returning*/
296 void Server::handle_client_session(MClientSession
*m
)
299 bool blacklisted
= false;
300 Session
*session
= mds
->get_session(m
);
302 dout(3) << "handle_client_session " << *m
<< " from " << m
->get_source() << dendl
;
303 assert(m
->get_source().is_client()); // should _not_ come from an mds!
306 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
311 if (m
->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS
) {
312 // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
313 } else if (m
->get_op() == CEPH_SESSION_REQUEST_CLOSE
) {
314 // close requests need to be handled when mds is active
315 if (mds
->get_state() < MDSMap::STATE_ACTIVE
) {
316 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
320 if (mds
->get_state() < MDSMap::STATE_CLIENTREPLAY
) {
321 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
327 logger
->inc(l_mdss_handle_client_session
);
330 switch (m
->get_op()) {
331 case CEPH_SESSION_REQUEST_OPEN
:
332 if (session
->is_opening() ||
333 session
->is_open() ||
334 session
->is_stale() ||
335 session
->is_killing() ||
336 terminating_sessions
) {
337 dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl
;
338 // set client metadata for session opened by prepare_force_open_sessions
339 if (!m
->client_meta
.empty())
340 session
->set_client_metadata(m
->client_meta
);
344 assert(session
->is_closed() ||
345 session
->is_closing());
347 if (mds
->is_stopping()) {
348 dout(10) << "mds is stopping, dropping open req" << dendl
;
353 blacklisted
= mds
->objecter
->with_osdmap(
354 [session
](const OSDMap
&osd_map
) -> bool {
355 return osd_map
.is_blacklisted(session
->info
.inst
.addr
);
359 dout(10) << "rejecting blacklisted client " << session
->info
.inst
.addr
<< dendl
;
360 mds
->send_message_client(new MClientSession(CEPH_SESSION_REJECT
), session
);
365 session
->set_client_metadata(m
->client_meta
);
366 dout(20) << __func__
<< " CEPH_SESSION_REQUEST_OPEN "
367 << session
->info
.client_metadata
.size() << " metadata entries:" << dendl
;
368 for (map
<string
, string
>::iterator i
= session
->info
.client_metadata
.begin();
369 i
!= session
->info
.client_metadata
.end(); ++i
) {
370 dout(20) << " " << i
->first
<< ": " << i
->second
<< dendl
;
373 // Special case for the 'root' metadata path; validate that the claimed
374 // root is actually within the caps of the session
375 if (session
->info
.client_metadata
.count("root")) {
376 const auto claimed_root
= session
->info
.client_metadata
.at("root");
377 // claimed_root has a leading "/" which we strip before passing
379 if (claimed_root
.empty() || claimed_root
[0] != '/' ||
380 !session
->auth_caps
.path_capable(claimed_root
.substr(1))) {
381 derr
<< __func__
<< " forbidden path claimed as mount root: "
382 << claimed_root
<< " by " << m
->get_source() << dendl
;
383 // Tell the client we're rejecting their open
384 mds
->send_message_client(new MClientSession(CEPH_SESSION_REJECT
), session
);
385 mds
->clog
->warn() << "client session with invalid root '" <<
386 claimed_root
<< "' denied (" << session
->info
.inst
<< ")";
388 // Drop out; don't record this session in SessionMap or journal it.
393 if (session
->is_closed())
394 mds
->sessionmap
.add_session(session
);
396 pv
= mds
->sessionmap
.mark_projected(session
);
397 sseq
= mds
->sessionmap
.set_state(session
, Session::STATE_OPENING
);
398 mds
->sessionmap
.touch_session(session
);
399 mdlog
->start_submit_entry(new ESession(m
->get_source_inst(), true, pv
, m
->client_meta
),
400 new C_MDS_session_finish(this, session
, sseq
, true, pv
));
404 case CEPH_SESSION_REQUEST_RENEWCAPS
:
405 if (session
->is_open() ||
406 session
->is_stale()) {
407 mds
->sessionmap
.touch_session(session
);
408 if (session
->is_stale()) {
409 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
410 mds
->locker
->resume_stale_caps(session
);
411 mds
->sessionmap
.touch_session(session
);
413 m
->get_connection()->send_message(new MClientSession(CEPH_SESSION_RENEWCAPS
, m
->get_seq()));
415 dout(10) << "ignoring renewcaps on non open|stale session (" << session
->get_state_name() << ")" << dendl
;
419 case CEPH_SESSION_REQUEST_CLOSE
:
421 if (session
->is_closed() ||
422 session
->is_closing() ||
423 session
->is_killing()) {
424 dout(10) << "already closed|closing|killing, dropping this req" << dendl
;
428 if (session
->is_importing()) {
429 dout(10) << "ignoring close req on importing session" << dendl
;
433 assert(session
->is_open() ||
434 session
->is_stale() ||
435 session
->is_opening());
436 if (m
->get_seq() < session
->get_push_seq()) {
437 dout(10) << "old push seq " << m
->get_seq() << " < " << session
->get_push_seq()
438 << ", dropping" << dendl
;
442 // We are getting a seq that is higher than expected.
443 // Handle the same as any other seqn error.
445 if (m
->get_seq() != session
->get_push_seq()) {
446 dout(0) << "old push seq " << m
->get_seq() << " != " << session
->get_push_seq()
447 << ", BUGGY!" << dendl
;
448 mds
->clog
->warn() << "incorrect push seq " << m
->get_seq() << " != "
449 << session
->get_push_seq() << ", dropping" << " from client : " << session
->get_human_name();
453 journal_close_session(session
, Session::STATE_CLOSING
, NULL
);
457 case CEPH_SESSION_FLUSHMSG_ACK
:
458 finish_flush_session(session
, m
->get_seq());
461 case CEPH_SESSION_REQUEST_FLUSH_MDLOG
:
462 if (mds
->is_active())
472 void Server::flush_client_sessions(set
<client_t
>& client_set
, MDSGatherBuilder
& gather
)
474 for (set
<client_t
>::iterator p
= client_set
.begin(); p
!= client_set
.end(); ++p
) {
475 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(p
->v
));
477 if (!session
->is_open() ||
478 !session
->connection
.get() ||
479 !session
->connection
->has_feature(CEPH_FEATURE_EXPORT_PEER
))
481 version_t seq
= session
->wait_for_flush(gather
.new_sub());
482 mds
->send_message_client(new MClientSession(CEPH_SESSION_FLUSHMSG
, seq
), session
);
486 void Server::finish_flush_session(Session
*session
, version_t seq
)
488 list
<MDSInternalContextBase
*> finished
;
489 session
->finish_flush(seq
, finished
);
490 mds
->queue_waiters(finished
);
493 void Server::_session_logged(Session
*session
, uint64_t state_seq
, bool open
, version_t pv
,
494 interval_set
<inodeno_t
>& inos
, version_t piv
)
496 dout(10) << "_session_logged " << session
->info
.inst
<< " state_seq " << state_seq
<< " " << (open
? "open":"close")
497 << " " << pv
<< dendl
;
500 assert(session
->is_closing() || session
->is_killing() ||
501 session
->is_opening()); // re-open closing session
502 session
->info
.prealloc_inos
.subtract(inos
);
503 mds
->inotable
->apply_release_ids(inos
);
504 assert(mds
->inotable
->get_version() == piv
);
507 mds
->sessionmap
.mark_dirty(session
);
510 if (session
->get_state_seq() != state_seq
) {
511 dout(10) << " journaled state_seq " << state_seq
<< " != current " << session
->get_state_seq()
512 << ", noop" << dendl
;
513 // close must have been canceled (by an import?), or any number of other things..
515 assert(session
->is_opening());
516 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
517 mds
->sessionmap
.touch_session(session
);
518 assert(session
->connection
!= NULL
);
519 session
->connection
->send_message(new MClientSession(CEPH_SESSION_OPEN
));
520 if (mdcache
->is_readonly())
521 session
->connection
->send_message(new MClientSession(CEPH_SESSION_FORCE_RO
));
522 } else if (session
->is_closing() ||
523 session
->is_killing()) {
524 // kill any lingering capabilities, leases, requests
525 while (!session
->caps
.empty()) {
526 Capability
*cap
= session
->caps
.front();
527 CInode
*in
= cap
->get_inode();
528 dout(20) << " killing capability " << ccap_string(cap
->issued()) << " on " << *in
<< dendl
;
529 mds
->locker
->remove_client_cap(in
, session
->info
.inst
.name
.num());
531 while (!session
->leases
.empty()) {
532 ClientLease
*r
= session
->leases
.front();
533 CDentry
*dn
= static_cast<CDentry
*>(r
->parent
);
534 dout(20) << " killing client lease of " << *dn
<< dendl
;
535 dn
->remove_client_lease(r
, mds
->locker
);
537 if (client_reconnect_gather
.count(session
->info
.get_client())) {
538 dout(20) << " removing client from reconnect set" << dendl
;
539 client_reconnect_gather
.erase(session
->info
.get_client());
541 if (client_reconnect_gather
.empty()) {
542 dout(7) << " client " << session
->info
.inst
<< " was last reconnect, finishing" << dendl
;
543 reconnect_gather_finish();
547 if (session
->is_closing()) {
548 // mark con disposable. if there is a fault, we will get a
549 // reset and clean it up. if the client hasn't received the
550 // CLOSE message yet, they will reconnect and get an
551 // ms_handle_remote_reset() and realize they had in fact closed.
552 // do this *before* sending the message to avoid a possible
554 if (session
->connection
!= NULL
) {
555 // Conditional because terminate_sessions will indiscrimately
556 // put sessions in CLOSING whether they ever had a conn or not.
557 session
->connection
->mark_disposable();
561 mds
->send_message_client(new MClientSession(CEPH_SESSION_CLOSE
), session
);
562 mds
->sessionmap
.set_state(session
, Session::STATE_CLOSED
);
564 mds
->sessionmap
.remove_session(session
);
565 } else if (session
->is_killing()) {
566 // destroy session, close connection
567 if (session
->connection
!= NULL
) {
568 session
->connection
->mark_down();
569 session
->connection
->set_priv(NULL
);
571 mds
->sessionmap
.remove_session(session
);
581 * Inject sessions from some source other than actual connections.
584 * - sessions inferred from journal replay
585 * - sessions learned from other MDSs during rejoin
586 * - sessions learned from other MDSs during dir/caps migration
587 * - sessions learned from other MDSs during a cross-MDS rename
589 version_t
Server::prepare_force_open_sessions(map
<client_t
,entity_inst_t
>& cm
,
590 map
<client_t
, pair
<Session
*,uint64_t> >& smap
)
592 version_t pv
= mds
->sessionmap
.get_projected();
594 dout(10) << "prepare_force_open_sessions " << pv
595 << " on " << cm
.size() << " clients"
598 mds
->objecter
->with_osdmap(
599 [this, &cm
](const OSDMap
&osd_map
) {
600 for (auto p
= cm
.begin(); p
!= cm
.end(); ) {
601 if (osd_map
.is_blacklisted(p
->second
.addr
)) {
602 dout(10) << " ignoring blacklisted client." << p
->first
603 << " (" << p
->second
.addr
<< ")" << dendl
;
611 for (map
<client_t
,entity_inst_t
>::iterator p
= cm
.begin(); p
!= cm
.end(); ++p
) {
612 Session
*session
= mds
->sessionmap
.get_or_add_session(p
->second
);
613 pv
= mds
->sessionmap
.mark_projected(session
);
615 if (session
->is_closed() ||
616 session
->is_closing() ||
617 session
->is_killing()) {
618 sseq
= mds
->sessionmap
.set_state(session
, Session::STATE_OPENING
);
620 assert(session
->is_open() ||
621 session
->is_opening() ||
622 session
->is_stale());
625 smap
[p
->first
] = make_pair(session
, sseq
);
626 session
->inc_importing();
631 void Server::finish_force_open_sessions(const map
<client_t
,pair
<Session
*,uint64_t> >& smap
,
635 * FIXME: need to carefully consider the race conditions between a
636 * client trying to close a session and an MDS doing an import
637 * trying to force open a session...
639 dout(10) << "finish_force_open_sessions on " << smap
.size() << " clients,"
640 << " initial v " << mds
->sessionmap
.get_version() << dendl
;
642 for (auto &it
: smap
) {
643 Session
*session
= it
.second
.first
;
644 uint64_t sseq
= it
.second
.second
;
646 if (session
->get_state_seq() != sseq
) {
647 dout(10) << "force_open_sessions skipping changed " << session
->info
.inst
<< dendl
;
649 dout(10) << "force_open_sessions opened " << session
->info
.inst
<< dendl
;
650 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
651 mds
->sessionmap
.touch_session(session
);
652 mds
->send_message_client(new MClientSession(CEPH_SESSION_OPEN
), session
);
653 if (mdcache
->is_readonly())
654 mds
->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO
), session
);
657 dout(10) << "force_open_sessions skipping already-open " << session
->info
.inst
<< dendl
;
658 assert(session
->is_open() || session
->is_stale());
662 session
->dec_importing();
665 mds
->sessionmap
.mark_dirty(session
);
668 dout(10) << __func__
<< ": final v " << mds
->sessionmap
.get_version() << dendl
;
671 class C_MDS_TerminatedSessions
: public ServerContext
{
672 void finish(int r
) override
{
673 server
->terminating_sessions
= false;
676 explicit C_MDS_TerminatedSessions(Server
*s
) : ServerContext(s
) {}
679 void Server::terminate_sessions()
681 dout(2) << "terminate_sessions" << dendl
;
683 terminating_sessions
= true;
685 // kill them off. clients will retry etc.
686 set
<Session
*> sessions
;
687 mds
->sessionmap
.get_client_session_set(sessions
);
688 for (set
<Session
*>::const_iterator p
= sessions
.begin();
691 Session
*session
= *p
;
692 if (session
->is_closing() ||
693 session
->is_killing() ||
694 session
->is_closed())
696 journal_close_session(session
, Session::STATE_CLOSING
, NULL
);
699 mdlog
->wait_for_safe(new C_MDS_TerminatedSessions(this));
703 void Server::find_idle_sessions()
705 dout(10) << "find_idle_sessions. laggy until " << mds
->get_laggy_until() << dendl
;
708 // (caps go stale, lease die)
709 utime_t now
= ceph_clock_now();
710 utime_t cutoff
= now
;
711 cutoff
-= g_conf
->mds_session_timeout
;
713 Session
*session
= mds
->sessionmap
.get_oldest_session(Session::STATE_OPEN
);
715 dout(20) << "laggiest active session is " << session
->info
.inst
<< dendl
;
716 if (session
->last_cap_renew
>= cutoff
) {
717 dout(20) << "laggiest active session is " << session
->info
.inst
<< " and sufficiently new ("
718 << session
->last_cap_renew
<< ")" << dendl
;
722 dout(10) << "new stale session " << session
->info
.inst
<< " last " << session
->last_cap_renew
<< dendl
;
723 mds
->sessionmap
.set_state(session
, Session::STATE_STALE
);
724 mds
->locker
->revoke_stale_caps(session
);
725 mds
->locker
->remove_stale_leases(session
);
726 mds
->send_message_client(new MClientSession(CEPH_SESSION_STALE
, session
->get_push_seq()), session
);
727 finish_flush_session(session
, session
->get_push_seq());
732 cutoff
-= g_conf
->mds_session_autoclose
;
734 // don't kick clients if we've been laggy
735 if (mds
->get_laggy_until() > cutoff
) {
736 dout(10) << " laggy_until " << mds
->get_laggy_until() << " > cutoff " << cutoff
737 << ", not kicking any clients to be safe" << dendl
;
741 if (mds
->sessionmap
.get_sessions().size() == 1 &&
742 mds
->mdsmap
->get_num_in_mds() == 1) {
743 dout(20) << "not evicting a slow client, because there is only one"
748 // Collect a list of sessions exceeding the autoclose threshold
749 std::vector
<Session
*> to_evict
;
750 const auto sessions_p
= mds
->sessionmap
.by_state
.find(Session::STATE_STALE
);
751 if (sessions_p
== mds
->sessionmap
.by_state
.end() || sessions_p
->second
->empty()) {
754 const auto &stale_sessions
= sessions_p
->second
;
755 assert(stale_sessions
!= nullptr);
757 for (const auto &session
: *stale_sessions
) {
758 if (session
->is_importing()) {
759 dout(10) << "stopping at importing session " << session
->info
.inst
<< dendl
;
762 assert(session
->is_stale());
763 if (session
->last_cap_renew
>= cutoff
) {
764 dout(20) << "oldest stale session is " << session
->info
.inst
<< " and sufficiently new ("
765 << session
->last_cap_renew
<< ")" << dendl
;
769 to_evict
.push_back(session
);
772 for (const auto &session
: to_evict
) {
774 age
-= session
->last_cap_renew
;
775 mds
->clog
->warn() << "evicting unresponsive client " << *session
776 << ", after " << age
<< " seconds";
777 dout(10) << "autoclosing stale session " << session
->info
.inst
<< " last "
778 << session
->last_cap_renew
<< dendl
;
780 if (g_conf
->mds_session_blacklist_on_timeout
) {
781 std::stringstream ss
;
782 mds
->evict_client(session
->info
.inst
.name
.num(), false, true,
785 kill_session(session
, NULL
);
791 * XXX bump in the interface here, not using an MDSInternalContextBase here
792 * because all the callers right now happen to use a SaferCond
794 void Server::kill_session(Session
*session
, Context
*on_safe
)
796 assert(mds
->mds_lock
.is_locked_by_me());
798 if ((session
->is_opening() ||
799 session
->is_open() ||
800 session
->is_stale()) &&
801 !session
->is_importing()) {
802 dout(10) << "kill_session " << session
<< dendl
;
803 journal_close_session(session
, Session::STATE_KILLING
, on_safe
);
805 dout(10) << "kill_session importing or already closing/killing " << session
<< dendl
;
806 assert(session
->is_closing() ||
807 session
->is_closed() ||
808 session
->is_killing() ||
809 session
->is_importing());
811 on_safe
->complete(0);
816 size_t Server::apply_blacklist(const std::set
<entity_addr_t
> &blacklist
)
818 std::list
<Session
*> victims
;
819 const auto sessions
= mds
->sessionmap
.get_sessions();
820 for (const auto p
: sessions
) {
821 if (!p
.first
.is_client()) {
822 // Do not apply OSDMap blacklist to MDS daemons, we find out
823 // about their death via MDSMap.
827 Session
*s
= p
.second
;
828 if (blacklist
.count(s
->info
.inst
.addr
)) {
829 victims
.push_back(s
);
833 for (const auto s
: victims
) {
834 kill_session(s
, nullptr);
837 dout(10) << "apply_blacklist: killed " << victims
.size() << dendl
;
839 return victims
.size();
842 void Server::journal_close_session(Session
*session
, int state
, Context
*on_safe
)
844 uint64_t sseq
= mds
->sessionmap
.set_state(session
, state
);
845 version_t pv
= mds
->sessionmap
.mark_projected(session
);
848 // release alloc and pending-alloc inos for this session
849 // and wipe out session state, in case the session close aborts for some reason
850 interval_set
<inodeno_t
> both
;
851 both
.insert(session
->info
.prealloc_inos
);
852 both
.insert(session
->pending_prealloc_inos
);
854 mds
->inotable
->project_release_ids(both
);
855 piv
= mds
->inotable
->get_projected_version();
859 mdlog
->start_submit_entry(new ESession(session
->info
.inst
, false, pv
, both
, piv
),
860 new C_MDS_session_finish(this, session
, sseq
, false, pv
, both
, piv
, on_safe
));
863 // clean up requests, too
864 elist
<MDRequestImpl
*>::iterator p
=
865 session
->requests
.begin(member_offset(MDRequestImpl
,
866 item_session_request
));
868 MDRequestRef mdr
= mdcache
->request_get((*p
)->reqid
);
870 mdcache
->request_kill(mdr
);
873 finish_flush_session(session
, session
->get_push_seq());
876 void Server::reconnect_clients(MDSInternalContext
*reconnect_done_
)
878 reconnect_done
= reconnect_done_
;
880 set
<Session
*> sessions
;
881 mds
->sessionmap
.get_client_session_set(sessions
);
882 for (auto session
: sessions
) {
883 if (session
->is_open())
884 client_reconnect_gather
.insert(session
->get_client());
887 if (client_reconnect_gather
.empty()) {
888 dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl
;
889 reconnect_gather_finish();
893 // clients will get the mdsmap and discover we're reconnecting via the monitor.
895 reconnect_start
= ceph_clock_now();
896 dout(1) << "reconnect_clients -- " << client_reconnect_gather
.size() << " sessions" << dendl
;
897 mds
->sessionmap
.dump();
900 /* This function DOES put the passed message before returning*/
901 void Server::handle_client_reconnect(MClientReconnect
*m
)
903 dout(7) << "handle_client_reconnect " << m
->get_source() << dendl
;
904 client_t from
= m
->get_source().num();
905 Session
*session
= mds
->get_session(m
);
908 if (!mds
->is_reconnect() && mds
->get_want_state() == CEPH_MDS_STATE_RECONNECT
) {
909 dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl
;
910 mds
->wait_for_reconnect(new C_MDS_RetryMessage(mds
, m
));
914 utime_t delay
= ceph_clock_now();
915 delay
-= reconnect_start
;
916 dout(10) << " reconnect_start " << reconnect_start
<< " delay " << delay
<< dendl
;
919 if (!mds
->is_reconnect() || mds
->get_want_state() != CEPH_MDS_STATE_RECONNECT
|| reconnect_evicting
) {
920 // XXX maybe in the future we can do better than this?
921 dout(1) << " no longer in reconnect state, ignoring reconnect, sending close" << dendl
;
922 mds
->clog
->info() << "denied reconnect attempt (mds is "
923 << ceph_mds_state_name(mds
->get_state())
924 << ") from " << m
->get_source_inst()
925 << " after " << delay
<< " (allowed interval " << g_conf
->mds_reconnect_timeout
<< ")";
927 } else if (!session
->is_open()) {
928 dout(1) << " session is closed, ignoring reconnect, sending close" << dendl
;
929 mds
->clog
->info() << "denied reconnect attempt (mds is "
930 << ceph_mds_state_name(mds
->get_state())
931 << ") from " << m
->get_source_inst() << " (session is closed)";
933 } else if (mdcache
->is_readonly()) {
934 dout(1) << " read-only FS, ignoring reconnect, sending close" << dendl
;
935 mds
->clog
->info() << "denied reconnect attempt (mds is read-only)";
940 m
->get_connection()->send_message(new MClientSession(CEPH_SESSION_CLOSE
));
945 // notify client of success with an OPEN
946 m
->get_connection()->send_message(new MClientSession(CEPH_SESSION_OPEN
));
947 session
->last_cap_renew
= ceph_clock_now();
948 mds
->clog
->debug() << "reconnect by " << session
->info
.inst
<< " after " << delay
;
951 for (vector
<ceph_mds_snaprealm_reconnect
>::iterator p
= m
->realms
.begin();
952 p
!= m
->realms
.end();
954 CInode
*in
= mdcache
->get_inode(inodeno_t(p
->ino
));
955 if (in
&& in
->state_test(CInode::STATE_PURGING
))
958 assert(in
->snaprealm
);
959 if (in
->snaprealm
->have_past_parents_open()) {
960 dout(15) << "open snaprealm (w/ past parents) on " << *in
<< dendl
;
961 mdcache
->finish_snaprealm_reconnect(from
, in
->snaprealm
, snapid_t(p
->seq
));
963 dout(15) << "open snaprealm (w/o past parents) on " << *in
<< dendl
;
964 mdcache
->add_reconnected_snaprealm(from
, inodeno_t(p
->ino
), snapid_t(p
->seq
));
967 dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(p
->ino
)
968 << " seq " << p
->seq
<< dendl
;
969 mdcache
->add_reconnected_snaprealm(from
, inodeno_t(p
->ino
), snapid_t(p
->seq
));
974 for (map
<inodeno_t
, cap_reconnect_t
>::iterator p
= m
->caps
.begin();
977 // make sure our last_cap_id is MAX over all issued caps
978 if (p
->second
.capinfo
.cap_id
> mdcache
->last_cap_id
)
979 mdcache
->last_cap_id
= p
->second
.capinfo
.cap_id
;
981 CInode
*in
= mdcache
->get_inode(p
->first
);
982 if (in
&& in
->state_test(CInode::STATE_PURGING
))
984 if (in
&& in
->is_auth()) {
985 // we recovered it, and it's ours. take note.
986 dout(15) << "open cap realm " << inodeno_t(p
->second
.capinfo
.snaprealm
)
987 << " on " << *in
<< dendl
;
988 in
->reconnect_cap(from
, p
->second
, session
);
989 mdcache
->add_reconnected_cap(from
, p
->first
, p
->second
);
990 recover_filelocks(in
, p
->second
.flockbl
, m
->get_orig_source().num());
994 if (in
&& !in
->is_auth()) {
996 dout(10) << "non-auth " << *in
<< ", will pass off to authority" << dendl
;
997 // add to cap export list.
998 p
->second
.path
.clear(); // we don't need path
999 mdcache
->rejoin_export_caps(p
->first
, from
, p
->second
,
1000 in
->authority().first
);
1002 // don't know if the inode is mine
1003 dout(10) << "missing ino " << p
->first
<< ", will load later" << dendl
;
1004 p
->second
.path
.clear(); // we don't need path
1005 mdcache
->rejoin_recovered_caps(p
->first
, from
, p
->second
, MDS_RANK_NONE
);
1008 mdcache
->rejoin_recovered_client(session
->get_client(), session
->info
.inst
);
1010 // remove from gather set
1011 client_reconnect_gather
.erase(from
);
1012 if (client_reconnect_gather
.empty())
1013 reconnect_gather_finish();
// Completes the client-reconnect phase: logs how many clients failed to
// reconnect, then fires and clears the one-shot reconnect_done callback.
// NOTE(review): this extract elides some interior lines of the original.
1020 void Server::reconnect_gather_finish()
1022 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects
<< " clients" << dendl
;
// reconnect_done must have been armed before the gather can finish.
1023 assert(reconnect_done
);
1024 reconnect_done
->complete(0);
// Null the pointer so a stray second completion trips the assert above.
1025 reconnect_done
= NULL
;
// Periodic timer during the reconnect phase.  Once mds_reconnect_timeout
// seconds have elapsed since reconnect_start and some clients still have
// not reconnected, each straggler is warned about, optionally blacklisted
// (mds_session_blacklist_on_timeout) or killed locally, and counted in
// failed_reconnects; reconnect_gather_finish() runs immediately or once
// the eviction gather completes.
1028 void Server::reconnect_tick()
// Evictions already in flight: the gather's finisher will call
// reconnect_gather_finish(), so there is nothing to do this tick.
1030 if (reconnect_evicting
) {
1031 dout(4) << "reconnect_tick: waiting for evictions" << dendl
;
// Deadline = start of reconnect phase + configured timeout.
1035 utime_t reconnect_end
= reconnect_start
;
1036 reconnect_end
+= g_conf
->mds_reconnect_timeout
;
1037 if (ceph_clock_now() >= reconnect_end
&&
1038 !client_reconnect_gather
.empty()) {
1039 dout(10) << "reconnect timed out" << dendl
;
1041 // If we're doing blacklist evictions, use this to wait for them before
1042 // proceeding to reconnect_gather_finish
1043 MDSGatherBuilder
gather(g_ceph_context
);
// Walk every client that has not reconnected yet.
1045 for (set
<client_t
>::iterator p
= client_reconnect_gather
.begin();
1046 p
!= client_reconnect_gather
.end();
1048 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(p
->v
));
1050 dout(1) << "reconnect gave up on " << session
->info
.inst
<< dendl
;
1052 mds
->clog
->warn() << "evicting unresponsive client " << *session
1053 << ", after waiting " << g_conf
->mds_reconnect_timeout
1054 << " seconds during MDS startup";
// Blacklist path (asynchronous, tracked by the gather) vs. plain local
// session kill; elided lines likely pass gather sub-contexts here — TODO
// confirm against the full source.
1056 if (g_conf
->mds_session_blacklist_on_timeout
) {
1057 std::stringstream ss
;
1058 mds
->evict_client(session
->info
.inst
.name
.num(), false, true, ss
,
1061 kill_session(session
, NULL
);
1064 failed_reconnects
++;
1066 client_reconnect_gather
.clear();
1068 if (gather
.has_subs()) {
1069 dout(1) << "reconnect will complete once clients are evicted" << dendl
;
// Finish the reconnect phase only after all evictions complete.
1070 gather
.set_finisher(new MDSInternalContextWrapper(mds
, new FunctionContext(
1071 [this](int r
){reconnect_gather_finish();})));
1073 reconnect_evicting
= true;
1075 reconnect_gather_finish();
// Rebuilds the in-memory file-lock state for inode `in` from the opaque
// buffer a reconnecting client sent back (see the reconnect handling
// above).  The buffer holds two counted lists decoded in order: fcntl
// (POSIX) locks first, then flock locks; each decoded lock is stamped
// with the owning client id and inserted into the matching lock-state
// map, keyed by lock start offset, alongside a per-client lock count.
1080 void Server::recover_filelocks(CInode
*in
, bufferlist locks
, int64_t client
)
// Nothing to recover if the client sent no lock state.
1082 if (!locks
.length()) return;
1085 bufferlist::iterator p
= locks
.begin();
// First list: fcntl locks.
1086 ::decode(numlocks
, p
);
1087 for (int i
= 0; i
< numlocks
; ++i
) {
// The lock is decoded from `p` on an elided line; the client field is
// overwritten so ownership is authoritative on this MDS.
1089 lock
.client
= client
;
1090 in
->get_fcntl_lock_state()->held_locks
.insert(pair
<uint64_t, ceph_filelock
>(lock
.start
, lock
));
1091 ++in
->get_fcntl_lock_state()->client_held_lock_counts
[client
];
// Second list: flock locks, same shape as above.
1093 ::decode(numlocks
, p
);
1094 for (int i
= 0; i
< numlocks
; ++i
) {
1096 lock
.client
= client
;
1097 in
->get_flock_lock_state()->held_locks
.insert(pair
<uint64_t, ceph_filelock
> (lock
.start
, lock
));
1098 ++in
->get_flock_lock_state()->client_held_lock_counts
[client
];
1104 * Call this when the MDCache is oversized, to send requests to the clients
1105 * to trim some caps, and consequently unpin some inodes in the MDCache so
1106 * that it can trim too.
// See the header comment above: asks clients to drop capabilities when the
// MDCache is oversized.  Computes a per-client cap limit (clamped between
// mds_min_caps_per_client and a ratio-derived maximum) and sends a
// CEPH_SESSION_RECALL_STATE message to every open client session holding
// more caps than its new limit.
1108 void Server::recall_client_state(void)
1110 /* try to recall at least 80% of all caps */
1111 uint64_t max_caps_per_client
= Capability::count() * g_conf
->get_val
<double>("mds_max_ratio_caps_per_client");
1112 uint64_t min_caps_per_client
= g_conf
->get_val
<uint64_t>("mds_min_caps_per_client");
// Keep the clamp sane if the configured ratio makes max fall below min.
1113 if (max_caps_per_client
< min_caps_per_client
) {
1114 dout(0) << "max_caps_per_client " << max_caps_per_client
1115 << " < min_caps_per_client " << min_caps_per_client
<< dendl
;
1116 max_caps_per_client
= min_caps_per_client
+ 1;
1119 /* unless this ratio is smaller: */
1120 /* ratio: determine the amount of caps to recall from each client. Use
1121 * percentage full over the cache reservation. Cap the ratio at 80% of client
1123 double ratio
= 1.0-fmin(0.80, mdcache
->cache_toofull_ratio());
1125 dout(10) << "recall_client_state " << ratio
1126 << ", caps per client " << min_caps_per_client
<< "-" << max_caps_per_client
1129 set
<Session
*> sessions
;
1130 mds
->sessionmap
.get_client_session_set(sessions
);
1131 for (auto &session
: sessions
) {
// Skip sessions that are not open client sessions (continue elided).
1132 if (!session
->is_open() ||
1133 !session
->info
.inst
.name
.is_client())
1136 dout(10) << " session " << session
->info
.inst
1137 << " caps " << session
->caps
.size()
1138 << ", leases " << session
->leases
.size()
// New limit: scale the session's cap count by `ratio`, then clamp into
// [min_caps_per_client, max_caps_per_client].
1141 uint64_t newlim
= MAX(MIN((session
->caps
.size() * ratio
), max_caps_per_client
), min_caps_per_client
);
1142 if (session
->caps
.size() > newlim
) {
1143 MClientSession
*m
= new MClientSession(CEPH_SESSION_RECALL_STATE
);
1144 m
->head
.max_caps
= newlim
;
1145 mds
->send_message_client(m
, session
);
// Record the recall so session state can track the outstanding request.
1146 session
->notify_recall_sent(newlim
);
// Broadcasts CEPH_SESSION_FORCE_RO to every open or stale client session,
// forcing clients into read-only mode (used when the FS must stop
// accepting writes).
1151 void Server::force_clients_readonly()
1153 dout(10) << "force_clients_readonly" << dendl
;
1154 set
<Session
*> sessions
;
1155 mds
->sessionmap
.get_client_session_set(sessions
);
1156 for (set
<Session
*>::const_iterator p
= sessions
.begin();
1157 p
!= sessions
.end();
1159 Session
*session
= *p
;
// Only real client sessions in open|stale state get the message
// (the skip/continue line is elided in this extract).
1160 if (!session
->info
.inst
.name
.is_client() ||
1161 !(session
->is_open() || session
->is_stale()))
1163 mds
->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO
), session
);
1168 * some generic stuff for finishing off requests
// Common tail for update operations: optionally sends an early (unsafe)
// reply with trace items `in`/`dn`, marks the request as committing, and
// submits log event `le` with completion `fin` to the MDLog.  For replayed
// client requests it advances the replay queue; otherwise, if an early
// reply went out, rdlocks can be dropped immediately.
1170 void Server::journal_and_reply(MDRequestRef
& mdr
, CInode
*in
, CDentry
*dn
, LogEvent
*le
, MDSLogContextBase
*fin
)
1172 dout(10) << "journal_and_reply tracei " << in
<< " tracedn " << dn
<< dendl
;
// A request that already completed must not be journaled again.
1173 assert(!mdr
->has_completed
);
1175 // note trace items for eventual reply.
// Early reply happens before the journal entry is durable; the reply is
// marked unsafe inside early_reply().
1184 early_reply(mdr
, in
, dn
);
1186 mdr
->committing
= true;
1187 submit_mdlog_entry(le
, fin
, mdr
, __func__
);
1189 if (mdr
->client_request
&& mdr
->client_request
->is_queued_for_replay()) {
1190 if (mds
->queue_one_replay()) {
1191 dout(10) << " queued next replay op" << dendl
;
1193 dout(10) << " journaled last replay op, flushing" << dendl
;
// Early reply already sent: the read locks are no longer needed while
// the journal entry commits.
1196 } else if (mdr
->did_early_reply
)
1197 mds
->locker
->drop_rdlocks_for_early_reply(mdr
.get());
// Records a "submit entry" event on the request (the caller's name is
// appended on an elided line) and hands the log event plus its completion
// context to the MDLog.
1202 void Server::submit_mdlog_entry(LogEvent
*le
, MDSLogContextBase
*fin
, MDRequestRef
& mdr
,
1206 string
event_str("submit entry: ");
1208 mdr
->mark_event_string(event_str
);
1210 mdlog
->submit_entry(le
, fin
);
1214 * send response built from mdr contents and error code; clean up mdr
// Sends the response for `mdr` with result `r` and cleans the request up.
// Client requests get a full MClientReply via reply_client_request() and
// bump the per-op perf counter exactly once here; internal ops complete
// their stored finisher and are finished directly in the cache.
1216 void Server::respond_to_request(MDRequestRef
& mdr
, int r
)
1218 if (mdr
->client_request
) {
1219 reply_client_request(mdr
, new MClientReply(mdr
->client_request
, r
));
1221 // add here to avoid counting ops multiple times (e.g., locks, loading)
1222 switch(mdr
->client_request
->get_op()) {
1223 case CEPH_MDS_OP_LOOKUPHASH
:
1224 logger
->inc(l_mdss_req_lookuphash
);
1226 case CEPH_MDS_OP_LOOKUPINO
:
1227 logger
->inc(l_mdss_req_lookupino
);
1229 case CEPH_MDS_OP_LOOKUPPARENT
:
1230 logger
->inc(l_mdss_req_lookupparent
);
1232 case CEPH_MDS_OP_LOOKUPNAME
:
1233 logger
->inc(l_mdss_req_lookupname
);
1235 case CEPH_MDS_OP_LOOKUP
:
1236 logger
->inc(l_mdss_req_lookup
);
1238 case CEPH_MDS_OP_LOOKUPSNAP
:
1239 logger
->inc(l_mdss_req_lookupsnap
);
1241 case CEPH_MDS_OP_GETATTR
:
1242 logger
->inc(l_mdss_req_getattr
);
1244 case CEPH_MDS_OP_SETATTR
:
1245 logger
->inc(l_mdss_req_setattr
);
1247 case CEPH_MDS_OP_SETLAYOUT
:
1248 logger
->inc(l_mdss_req_setlayout
);
1250 case CEPH_MDS_OP_SETDIRLAYOUT
:
1251 logger
->inc(l_mdss_req_setdirlayout
);
1253 case CEPH_MDS_OP_SETXATTR
:
1254 logger
->inc(l_mdss_req_setxattr
);
1256 case CEPH_MDS_OP_RMXATTR
:
1257 logger
->inc(l_mdss_req_rmxattr
);
1259 case CEPH_MDS_OP_READDIR
:
1260 logger
->inc(l_mdss_req_readdir
);
1262 case CEPH_MDS_OP_SETFILELOCK
:
1263 logger
->inc(l_mdss_req_setfilelock
);
1265 case CEPH_MDS_OP_GETFILELOCK
:
1266 logger
->inc(l_mdss_req_getfilelock
);
// NOTE(review): in the original numbering CREATE (1269) runs straight
// into the OPEN label (1270) with no break line between them, so a
// CREATE increments both req_create and req_open — confirm this
// fall-through is intentional against the full source.
1268 case CEPH_MDS_OP_CREATE
:
1269 logger
->inc(l_mdss_req_create
);
1270 case CEPH_MDS_OP_OPEN
:
1271 logger
->inc(l_mdss_req_open
);
1273 case CEPH_MDS_OP_MKNOD
:
1274 logger
->inc(l_mdss_req_mknod
);
1276 case CEPH_MDS_OP_LINK
:
1277 logger
->inc(l_mdss_req_link
);
1279 case CEPH_MDS_OP_UNLINK
:
1280 logger
->inc(l_mdss_req_unlink
);
1282 case CEPH_MDS_OP_RMDIR
:
1283 logger
->inc(l_mdss_req_rmdir
);
1285 case CEPH_MDS_OP_RENAME
:
1286 logger
->inc(l_mdss_req_rename
);
1288 case CEPH_MDS_OP_MKDIR
:
1289 logger
->inc(l_mdss_req_mkdir
);
1291 case CEPH_MDS_OP_SYMLINK
:
1292 logger
->inc(l_mdss_req_symlink
);
1294 case CEPH_MDS_OP_LSSNAP
:
1295 logger
->inc(l_mdss_req_lssnap
);
1297 case CEPH_MDS_OP_MKSNAP
:
1298 logger
->inc(l_mdss_req_mksnap
);
1300 case CEPH_MDS_OP_RMSNAP
:
1301 logger
->inc(l_mdss_req_rmsnap
);
1303 case CEPH_MDS_OP_RENAMESNAP
:
1304 logger
->inc(l_mdss_req_renamesnap
);
// Internal (non-client) request path: complete the stored finisher.
1307 } else if (mdr
->internal_op
> -1) {
1308 dout(10) << "respond_to_request on internal request " << mdr
<< dendl
;
1309 if (!mdr
->internal_op_finish
)
1310 assert(0 == "trying to respond to internal op without finisher");
1311 mdr
->internal_op_finish
->complete(r
);
1312 mdcache
->request_finish(mdr
);
// Sends an "unsafe" success reply before the journal entry commits, when
// permitted.  Bails out (returns are on elided lines) when early reply is
// disabled, explicitly forbidden for this request, slaves have journaled,
// an inode was allocated, the requester is an MDS, or this is a replay.
1316 void Server::early_reply(MDRequestRef
& mdr
, CInode
*tracei
, CDentry
*tracedn
)
1318 if (!g_conf
->mds_early_reply
)
1321 if (mdr
->no_early_reply
) {
1322 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl
;
1326 if (mdr
->has_more() && mdr
->more()->has_journaled_slaves
) {
1327 dout(10) << "early_reply - there are journaled slaves, not allowed." << dendl
;
1331 if (mdr
->alloc_ino
) {
1332 dout(10) << "early_reply - allocated ino, not allowed" << dendl
;
1336 MClientRequest
*req
= mdr
->client_request
;
1337 entity_inst_t client_inst
= req
->get_source_inst();
1338 if (client_inst
.name
.is_mds())
1341 if (req
->is_replay()) {
1342 dout(10) << " no early reply on replay op" << dendl
;
// Result 0: early replies are only ever success replies; the unsafe flag
// tells the client the change is not yet durable.
1347 MClientReply
*reply
= new MClientReply(req
, 0);
1348 reply
->set_unsafe();
1350 // mark xlocks "done", indicating that we are exposing uncommitted changes.
1352 //_rename_finish() does not send dentry link/unlink message to replicas.
1353 // so do not set xlocks on dentries "done", the xlocks prevent dentries
1354 // that have projected linkages from getting new replica.
1355 mds
->locker
->set_xlocks_done(mdr
.get(), req
->get_op() == CEPH_MDS_OP_RENAME
);
1357 dout(10) << "early_reply " << reply
->get_result()
1358 << " (" << cpp_strerror(reply
->get_result())
1359 << ") " << *req
<< dendl
;
// Drop cap releases covered by the trace, then embed the trace metadata.
1361 if (tracei
|| tracedn
) {
1363 mdr
->cap_releases
.erase(tracei
->vino());
1365 mdr
->cap_releases
.erase(tracedn
->get_dir()->get_inode()->vino());
1367 set_trace_dist(mdr
->session
, reply
, tracei
, tracedn
, mdr
->snapid
,
1368 req
->get_dentry_wanted(), mdr
);
1371 reply
->set_extra_bl(mdr
->reply_extra_bl
);
1372 req
->get_connection()->send_message(reply
);
// Remember that the (only) client-visible reply already went out.
1374 mdr
->did_early_reply
= true;
1376 mds
->logger
->inc(l_mds_reply
);
1377 utime_t lat
= ceph_clock_now() - req
->get_recv_stamp();
1378 mds
->logger
->tinc(l_mds_reply_latency
, lat
);
1379 dout(20) << "lat " << lat
<< dendl
;
1381 mdr
->mark_event("early_replied");
1386 * include a trace to tracei
// Sends the final (safe) reply for a client request and tears the request
// down: records successful writes in the session's completed-request list,
// applies preallocated inos, drops non-rdlocks so leases can be issued,
// attaches the trace (unless an early reply already carried it), and
// finishes the request in the cache.  Replayed requests that failed are
// logged and the replay queue is advanced.
1389 void Server::reply_client_request(MDRequestRef
& mdr
, MClientReply
*reply
)
1392 MClientRequest
*req
= mdr
->client_request
;
1394 dout(7) << "reply_client_request " << reply
->get_result()
1395 << " (" << cpp_strerror(reply
->get_result())
1396 << ") " << *req
<< dendl
;
1398 mdr
->mark_event("replying");
1400 Session
*session
= mdr
->session
;
1402 // note successful request in session map?
1404 // setfilelock requests are special, they only modify states in MDS memory.
1405 // The states get lost when MDS fails. If Client re-send a completed
1406 // setfilelock request, it means that client did not receive corresponding
1407 // setfilelock reply. So MDS should re-execute the setfilelock request.
1408 if (req
->may_write() && req
->get_op() != CEPH_MDS_OP_SETFILELOCK
&&
1409 reply
->get_result() == 0 && session
) {
1410 inodeno_t created
= mdr
->alloc_ino
? mdr
->alloc_ino
: mdr
->used_prealloc_ino
;
1411 session
->add_completed_request(mdr
->reqid
.tid
, created
);
// Mark the session dirty in this log segment so the completed-request
// list is flushed at segment expiry.
1413 mdr
->ls
->touched_sessions
.insert(session
->info
.inst
.name
);
1417 // give any preallocated inos to the session
1418 apply_allocated_inos(mdr
, session
);
1420 // get tracei/tracedn from mdr?
1421 snapid_t snapid
= mdr
->snapid
;
1422 CInode
*tracei
= mdr
->tracei
;
1423 CDentry
*tracedn
= mdr
->tracedn
;
1425 bool is_replay
= mdr
->client_request
->is_replay();
1426 bool did_early_reply
= mdr
->did_early_reply
;
1427 entity_inst_t client_inst
= req
->get_source_inst();
1428 int dentry_wanted
= req
->get_dentry_wanted();
// Only count reply/latency once; the early reply already did if it ran.
1430 if (!did_early_reply
&& !is_replay
) {
1432 mds
->logger
->inc(l_mds_reply
);
1433 utime_t lat
= ceph_clock_now() - mdr
->client_request
->get_recv_stamp();
1434 mds
->logger
->tinc(l_mds_reply_latency
, lat
);
1435 dout(20) << "lat " << lat
<< dendl
;
1438 mdr
->cap_releases
.erase(tracei
->vino());
1440 mdr
->cap_releases
.erase(tracedn
->get_dir()->get_inode()->vino());
1443 // drop non-rdlocks before replying, so that we can issue leases
1444 mdcache
->request_drop_non_rdlocks(mdr
);
1447 if (client_inst
.name
.is_mds() || !session
) {
1448 reply
->put(); // mds doesn't need a reply
1452 if (!did_early_reply
&& // don't issue leases if we sent an earlier reply already
1453 (tracei
|| tracedn
)) {
1456 mdcache
->try_reconnect_cap(tracei
, session
);
1458 // include metadata in reply
1459 set_trace_dist(session
, reply
, tracei
, tracedn
,
1460 snapid
, dentry_wanted
,
1465 // We can set the extra bl unconditionally: if it's already been sent in the
1466 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
1467 reply
->set_extra_bl(mdr
->reply_extra_bl
);
1469 reply
->set_mdsmap_epoch(mds
->mdsmap
->get_epoch());
1470 req
->get_connection()->send_message(reply
);
// Replay bookkeeping: a failed replay is surfaced loudly before moving on.
1473 if (req
->is_queued_for_replay() &&
1474 (mdr
->has_completed
|| reply
->get_result() < 0)) {
1475 if (reply
->get_result() < 0) {
1476 int r
= reply
->get_result();
1477 derr
<< "reply_client_request: failed to replay " << *req
1478 << " error " << r
<< " (" << cpp_strerror(r
) << ")" << dendl
;
1479 mds
->clog
->warn() << "failed to replay " << req
->get_reqid() << " error " << r
;
1481 mds
->queue_one_replay();
1485 mdcache
->request_finish(mdr
);
1487 // take a closer look at tracei, if it happens to be a remote link
1490 tracedn
->get_projected_linkage()->is_remote()) {
1491 mdcache
->eval_remote(tracedn
);
// Appends an empty (default-constructed) DirStat to `bl`; the actual
// encode call sits on a line elided from this extract.
1496 void Server::encode_empty_dirstat(bufferlist
& bl
)
1498 static DirStat empty
;
// Appends a lease (`e`, built on elided lines) with an infinite duration
// to `bl` — used where the client may cache the item indefinitely.
1502 void Server::encode_infinite_lease(bufferlist
& bl
)
1509 dout(20) << "encode_infinite_lease " << e
<< dendl
;
// Appends a null (no-caching) lease (`e`, built on elided lines) to `bl`.
1512 void Server::encode_null_lease(bufferlist
& bl
)
1519 dout(20) << "encode_null_lease " << e
<< dendl
;
1524 * pass inode OR dentry (not both, or we may get confused)
1526 * trace is in reverse order (i.e. root inode comes last)
// Builds the reply "trace": optionally a snap-realm trace, then the
// dentry's parent inode + dir + dentry (with a client lease when not a
// snapshot), then the target inode.  May deliberately skip the whole
// trace when mds_inject_traceless_reply_probability fires (debug aid).
1528 void Server::set_trace_dist(Session
*session
, MClientReply
*reply
,
1529 CInode
*in
, CDentry
*dn
,
1534 // skip doing this for debugging purposes?
1535 if (g_conf
->mds_inject_traceless_reply_probability
&&
1536 mdr
->ls
&& !mdr
->o_trunc
&&
1537 (rand() % 10000 < g_conf
->mds_inject_traceless_reply_probability
* 10000.0)) {
1538 dout(5) << "deliberately skipping trace for " << *reply
<< dendl
;
1542 // inode, dentry, dir, ..., inode
1544 mds_rank_t whoami
= mds
->get_nodeid();
1545 client_t client
= session
->get_client();
1546 utime_t now
= ceph_clock_now();
1548 dout(20) << "set_trace_dist snapid " << snapid
<< dendl
;
1550 //assert((bool)dn == (bool)dentry_wanted); // not true for snapshot lookups
// For live (non-snapshot) lookups, include the snap realm trace of the
// target inode (or the dentry's parent inode when no inode was passed).
1553 if (snapid
== CEPH_NOSNAP
) {
1556 realm
= in
->find_snaprealm();
1558 realm
= dn
->get_dir()->get_inode()->find_snaprealm();
1559 reply
->snapbl
= realm
->get_snap_trace();
1560 dout(10) << "set_trace_dist snaprealm " << *realm
<< " len=" << reply
->snapbl
.length() << dendl
;
// Dentry branch: encode parent inode, dir stats, name and a lease.
1565 reply
->head
.is_dentry
= 1;
1566 CDir
*dir
= dn
->get_dir();
1567 CInode
*diri
= dir
->get_inode();
1569 diri
->encode_inodestat(bl
, session
, NULL
, snapid
);
1570 dout(20) << "set_trace_dist added diri " << *diri
<< dendl
;
1572 #ifdef MDS_VERIFY_FRAGSTAT
1573 if (dir
->is_complete())
1574 dir
->verify_fragstat();
1576 dir
->encode_dirstat(bl
, whoami
);
1577 dout(20) << "set_trace_dist added dir " << *dir
<< dendl
;
1579 ::encode(dn
->get_name(), bl
);
// Real leases only make sense on the live namespace; snapshots get a
// null lease instead.
1580 if (snapid
== CEPH_NOSNAP
)
1581 mds
->locker
->issue_client_lease(dn
, client
, bl
, now
, session
);
1583 encode_null_lease(bl
);
1584 dout(20) << "set_trace_dist added dn " << snapid
<< " " << *dn
<< dendl
;
1586 reply
->head
.is_dentry
= 0;
// Target inode branch (when `in` is non-null on an elided condition).
1590 in
->encode_inodestat(bl
, session
, NULL
, snapid
, 0, mdr
->getattr_caps
);
1591 dout(20) << "set_trace_dist added in " << *in
<< dendl
;
1592 reply
->head
.is_target
= 1;
1594 reply
->head
.is_target
= 0;
1596 reply
->set_trace(bl
);
1603 * process a client request
1604 * This function DOES put the passed message before returning
// Entry point for a client request message (consumes `req`).  Validates
// the session, short-circuits already-completed retries/replays (possibly
// downgrading them to lookups), trims the session's completed-request
// list, then registers an MDRequest, processes embedded cap releases, and
// dispatches it.
1606 void Server::handle_client_request(MClientRequest
*req
)
1608 dout(4) << "handle_client_request " << *req
<< dendl
;
1611 mds
->logger
->inc(l_mds_request
);
1613 logger
->inc(l_mdss_handle_client_request
);
// Cannot serve anything until the root of the cache is open; retry later.
1615 if (!mdcache
->is_open()) {
1616 dout(5) << "waiting for root" << dendl
;
1617 mdcache
->wait_for_open(new C_MDS_RetryMessage(mds
, req
));
// Requests from clients require a live session; messages without one, or
// on a dying session, are dropped (cleanup lines elided).
1622 Session
*session
= 0;
1623 if (req
->get_source().is_client()) {
1624 session
= mds
->get_session(req
);
1626 dout(5) << "no session for " << req
->get_source() << ", dropping" << dendl
;
1627 } else if (session
->is_closed() ||
1628 session
->is_closing() ||
1629 session
->is_killing()) {
1630 dout(5) << "session closed|closing|killing, dropping" << dendl
;
1634 if (req
->is_queued_for_replay())
1635 mds
->queue_one_replay();
// Client's mdsmap is older than ours; the original sends an updated map
// here (elided).
1642 if (req
->get_mdsmap_epoch() < mds
->mdsmap
->get_epoch()) {
1643 // send it? hrm, this isn't ideal; they may get a lot of copies if
1644 // they have a high request rate.
1647 // completed request?
1648 bool has_completed
= false;
1649 if (req
->is_replay() || req
->get_retry_attempt()) {
1652 if (session
->have_completed_request(req
->get_reqid().tid
, &created
)) {
1653 has_completed
= true;
1654 // Don't send traceless reply if the completed request has created
1655 // new inode. Treat the request as lookup request instead.
1656 if (req
->is_replay() ||
1657 ((created
== inodeno_t() || !mds
->is_clientreplay()) &&
1658 req
->get_op() != CEPH_MDS_OP_OPEN
&&
1659 req
->get_op() != CEPH_MDS_OP_CREATE
)) {
1660 dout(5) << "already completed " << req
->get_reqid() << dendl
;
// Traceless success reply; if an inode was created, its number rides in
// the extra bufferlist so the client can learn it.
1661 MClientReply
*reply
= new MClientReply(req
, 0);
1662 if (created
!= inodeno_t()) {
1664 ::encode(created
, extra
);
1665 reply
->set_extra_bl(extra
);
1667 req
->get_connection()->send_message(reply
);
1669 if (req
->is_queued_for_replay())
1670 mds
->queue_one_replay();
// Completed create-like op being retried: rewrite it in place as a
// LOOKUP/GETATTR so the client gets a full trace instead.
1675 if (req
->get_op() != CEPH_MDS_OP_OPEN
&&
1676 req
->get_op() != CEPH_MDS_OP_CREATE
) {
1677 dout(10) << " completed request which created new inode " << created
1678 << ", convert it to lookup request" << dendl
;
1679 req
->head
.op
= req
->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP
: CEPH_MDS_OP_GETATTR
;
1680 req
->head
.args
.getattr
.mask
= CEPH_STAT_CAP_INODE_ALL
;
1685 // trim completed_request list
1686 if (req
->get_oldest_client_tid() > 0) {
1687 dout(15) << " oldest_client_tid=" << req
->get_oldest_client_tid() << dendl
;
1689 if (session
->trim_completed_requests(req
->get_oldest_client_tid())) {
1690 // Sessions 'completed_requests' was dirtied, mark it to be
1691 // potentially flushed at segment expiry.
1692 mdlog
->get_current_segment()->touched_sessions
.insert(session
->info
.inst
.name
);
1694 if (session
->get_num_trim_requests_warnings() > 0 &&
1695 session
->get_num_completed_requests() * 2 < g_conf
->mds_max_completed_requests
)
1696 session
->reset_num_trim_requests_warnings();
// Escalating warning when a client never advances oldest_client_tid and
// its completed-request list keeps growing.
1698 if (session
->get_num_completed_requests() >=
1699 (g_conf
->mds_max_completed_requests
<< session
->get_num_trim_requests_warnings())) {
1700 session
->inc_num_trim_requests_warnings();
1702 ss
<< "client." << session
->get_client() << " does not advance its oldest_client_tid ("
1703 << req
->get_oldest_client_tid() << "), "
1704 << session
->get_num_completed_requests()
1705 << " completed requests recorded in session\n";
1706 mds
->clog
->warn() << ss
.str();
1707 dout(20) << __func__
<< " " << ss
.str() << dendl
;
1712 // register + dispatch
1713 MDRequestRef mdr
= mdcache
->request_start(req
);
1718 mdr
->session
= session
;
1719 session
->requests
.push_back(&mdr
->item_session_request
);
1723 mdr
->has_completed
= true;
1725 // process embedded cap releases?
1726 // (only if NOT replay!)
1727 if (!req
->releases
.empty() && req
->get_source().is_client() && !req
->is_replay()) {
1728 client_t client
= req
->get_source().num();
1729 for (vector
<MClientRequest::Release
>::iterator p
= req
->releases
.begin();
1730 p
!= req
->releases
.end();
1732 mds
->locker
->process_request_cap_release(mdr
, client
, p
->item
, p
->dname
);
1733 req
->releases
.clear();
1736 dispatch_client_request(mdr
);
// Refreshes Server::is_full from the latest OSDMap: true when the
// metadata pool itself carries the FULL flag (checked directly on the
// pool, not via osdmap_full_flag() — see the comment below).
1740 void Server::handle_osd_map()
1742 /* Note that we check the OSDMAP_FULL flag directly rather than
1743 * using osdmap_full_flag(), because we want to know "is the flag set"
1744 * rather than "does the flag apply to us?" */
1745 mds
->objecter
->with_osdmap([this](const OSDMap
& o
) {
1746 auto pi
= o
.get_pg_pool(mds
->mdsmap
->get_metadata_pool());
// pi may be null if the pool is not in this map; treat that as not full.
1747 is_full
= pi
&& pi
->has_flag(pg_pool_t::FLAG_FULL
);
1748 dout(7) << __func__
<< ": full = " << is_full
<< " epoch = "
1749 << o
.get_epoch() << dendl
;
// Routes a registered client request to its per-op handler.  First
// handles killed/aborted requests, read-only-FS and slave-error rejection
// for writes, and an ENOSPC gate for space-consuming ops when the
// metadata pool is full; then switches on the op code.
1753 void Server::dispatch_client_request(MDRequestRef
& mdr
)
1755 // we shouldn't be waiting on anyone.
1756 assert(!mdr
->has_more() || mdr
->more()->waiting_on_slave
.empty());
1759 dout(10) << "request " << *mdr
<< " was killed" << dendl
;
1761 } else if (mdr
->aborted
) {
1762 mdr
->aborted
= false;
1763 mdcache
->request_kill(mdr
);
1767 MClientRequest
*req
= mdr
->client_request
;
1769 if (logger
) logger
->inc(l_mdss_dispatch_client_request
);
1771 dout(7) << "dispatch_client_request " << *req
<< dendl
;
1773 if (req
->may_write()) {
1774 if (mdcache
->is_readonly()) {
1775 dout(10) << " read-only FS" << dendl
;
1776 respond_to_request(mdr
, -EROFS
);
1779 if (mdr
->has_more() && mdr
->more()->slave_error
) {
1780 dout(10) << " got error from slaves" << dendl
;
1781 respond_to_request(mdr
, mdr
->more()->slave_error
);
// Pool-full gate: ops that can consume new space are refused with
// ENOSPC; others (e.g. unlink, which frees space) are let through.
// NOTE(review): CEPH_MDS_OP_SETLAYOUT is tested twice in this
// condition (original lines 1787 and 1789) — the second test is
// redundant; confirm against upstream whether a different op was
// intended.
1787 if (req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
1788 req
->get_op() == CEPH_MDS_OP_SETDIRLAYOUT
||
1789 req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
1790 req
->get_op() == CEPH_MDS_OP_RMXATTR
||
1791 req
->get_op() == CEPH_MDS_OP_SETXATTR
||
1792 req
->get_op() == CEPH_MDS_OP_CREATE
||
1793 req
->get_op() == CEPH_MDS_OP_SYMLINK
||
1794 req
->get_op() == CEPH_MDS_OP_MKSNAP
||
1795 ((req
->get_op() == CEPH_MDS_OP_LINK
||
1796 req
->get_op() == CEPH_MDS_OP_RENAME
) &&
1797 (!mdr
->has_more() || mdr
->more()->witnessed
.empty())) // haven't started slave request
1800 dout(20) << __func__
<< ": full, responding ENOSPC to op " << ceph_mds_op_name(req
->get_op()) << dendl
;
1801 respond_to_request(mdr
, -ENOSPC
);
1804 dout(20) << __func__
<< ": full, permitting op " << ceph_mds_op_name(req
->get_op()) << dendl
;
// Per-op dispatch (break lines between cases are elided in this extract).
1808 switch (req
->get_op()) {
1809 case CEPH_MDS_OP_LOOKUPHASH
:
1810 case CEPH_MDS_OP_LOOKUPINO
:
1811 handle_client_lookup_ino(mdr
, false, false);
1813 case CEPH_MDS_OP_LOOKUPPARENT
:
1814 handle_client_lookup_ino(mdr
, true, false);
1816 case CEPH_MDS_OP_LOOKUPNAME
:
1817 handle_client_lookup_ino(mdr
, false, true);
1821 case CEPH_MDS_OP_LOOKUP
:
1822 handle_client_getattr(mdr
, true);
1825 case CEPH_MDS_OP_LOOKUPSNAP
:
1826 // lookupsnap does not reference a CDentry; treat it as a getattr
1827 case CEPH_MDS_OP_GETATTR
:
1828 handle_client_getattr(mdr
, false);
1831 case CEPH_MDS_OP_SETATTR
:
1832 handle_client_setattr(mdr
);
1834 case CEPH_MDS_OP_SETLAYOUT
:
1835 handle_client_setlayout(mdr
);
1837 case CEPH_MDS_OP_SETDIRLAYOUT
:
1838 handle_client_setdirlayout(mdr
);
1840 case CEPH_MDS_OP_SETXATTR
:
1841 handle_client_setxattr(mdr
);
1843 case CEPH_MDS_OP_RMXATTR
:
1844 handle_client_removexattr(mdr
);
1847 case CEPH_MDS_OP_READDIR
:
1848 handle_client_readdir(mdr
);
1851 case CEPH_MDS_OP_SETFILELOCK
:
1852 handle_client_file_setlock(mdr
);
1855 case CEPH_MDS_OP_GETFILELOCK
:
1856 handle_client_file_readlock(mdr
);
1860 case CEPH_MDS_OP_CREATE
:
1861 if (mdr
->has_completed
)
1862 handle_client_open(mdr
); // already created.. just open
1864 handle_client_openc(mdr
);
1867 case CEPH_MDS_OP_OPEN
:
1868 handle_client_open(mdr
);
1873 case CEPH_MDS_OP_MKNOD
:
1874 handle_client_mknod(mdr
);
1876 case CEPH_MDS_OP_LINK
:
1877 handle_client_link(mdr
);
1879 case CEPH_MDS_OP_UNLINK
:
1880 case CEPH_MDS_OP_RMDIR
:
1881 handle_client_unlink(mdr
);
1883 case CEPH_MDS_OP_RENAME
:
1884 handle_client_rename(mdr
);
1886 case CEPH_MDS_OP_MKDIR
:
1887 handle_client_mkdir(mdr
);
1889 case CEPH_MDS_OP_SYMLINK
:
1890 handle_client_symlink(mdr
);
1895 case CEPH_MDS_OP_LSSNAP
:
1896 handle_client_lssnap(mdr
);
1898 case CEPH_MDS_OP_MKSNAP
:
1899 handle_client_mksnap(mdr
);
1901 case CEPH_MDS_OP_RMSNAP
:
1902 handle_client_rmsnap(mdr
);
1904 case CEPH_MDS_OP_RENAMESNAP
:
1905 handle_client_renamesnap(mdr
);
// Unknown op: reject rather than silently drop.
1909 dout(1) << " unknown client op " << req
->get_op() << dendl
;
1910 respond_to_request(mdr
, -EOPNOTSUPP
);
1915 // ---------------------------------------
1918 /* This function DOES put the passed message before returning*/
// Entry point for a slave (inter-MDS) request message (consumes `m`).
// Replies are delegated to handle_slave_request_reply(); RENAMENOTIFY is
// acked immediately for causal ordering; otherwise the message is matched
// against (or registered as) a local slave MDRequest with attempt-number
// arbitration, state-gated on MDS readiness, and dispatched.
1919 void Server::handle_slave_request(MMDSSlaveRequest
*m
)
1921 dout(4) << "handle_slave_request " << m
->get_reqid() << " from " << m
->get_source() << dendl
;
1922 mds_rank_t from
= mds_rank_t(m
->get_source().num());
1924 if (logger
) logger
->inc(l_mdss_handle_slave_request
);
// Reply messages take a separate path (the is-reply test is elided).
1928 return handle_slave_request_reply(m
);
1930 // the purpose of rename notify is enforcing causal message ordering. making sure
1931 // bystanders have received all messages from rename srcdn's auth MDS.
1932 if (m
->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY
) {
1933 MMDSSlaveRequest
*reply
= new MMDSSlaveRequest(m
->get_reqid(), m
->get_attempt(),
1934 MMDSSlaveRequest::OP_RENAMENOTIFYACK
);
1935 mds
->send_message(reply
, m
->get_connection());
// A stray dentry replica may ride along with the request.
1940 CDentry
*straydn
= NULL
;
1941 if (m
->stray
.length() > 0) {
1942 straydn
= mdcache
->add_replica_stray(m
->stray
, from
);
1947 // am i a new slave?
1949 if (mdcache
->have_request(m
->get_reqid())) {
1951 mdr
= mdcache
->request_get(m
->get_reqid());
1953 // is my request newer?
1954 if (mdr
->attempt
> m
->get_attempt()) {
1955 dout(10) << "local request " << *mdr
<< " attempt " << mdr
->attempt
<< " > " << m
->get_attempt()
1956 << ", dropping " << *m
<< dendl
;
1962 if (mdr
->attempt
< m
->get_attempt()) {
1963 // mine is old, close it out
1964 dout(10) << "local request " << *mdr
<< " attempt " << mdr
->attempt
<< " < " << m
->get_attempt()
1965 << ", closing out" << dendl
;
1966 mdcache
->request_finish(mdr
);
1968 } else if (mdr
->slave_to_mds
!= from
) {
1969 dout(10) << "local request " << *mdr
<< " not slave to mds." << from
<< dendl
;
// Abortive OP_FINISH: flag the request; it is torn down now only when no
// slave op is currently being processed.
1974 if (m
->get_op() == MMDSSlaveRequest::OP_FINISH
&& m
->is_abort()) {
1975 mdr
->aborted
= true;
1976 if (mdr
->slave_request
) {
1977 // only abort on-going xlock, wrlock and auth pin
1978 assert(!mdr
->slave_did_prepare());
1980 mdcache
->request_finish(mdr
);
1988 if (m
->get_op() == MMDSSlaveRequest::OP_FINISH
) {
1989 dout(10) << "missing slave request for " << m
->get_reqid()
1990 << " OP_FINISH, must have lost race with a forward" << dendl
;
// New slave request: register it locally.
1994 mdr
= mdcache
->request_start_slave(m
->get_reqid(), m
->get_attempt(), m
);
1995 mdr
->set_op_stamp(m
->op_stamp
);
1997 assert(mdr
->slave_request
== 0); // only one at a time, please!
2001 mdr
->straydn
= straydn
;
// Gate on MDS state: wait for replay/active as appropriate before
// touching any metadata on behalf of the master.
2004 if (!mds
->is_clientreplay() && !mds
->is_active() && !mds
->is_stopping()) {
2005 dout(3) << "not clientreplay|active yet, waiting" << dendl
;
2006 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
2008 } else if (mds
->is_clientreplay() && !mds
->mdsmap
->is_clientreplay(from
) &&
2009 mdr
->locks
.empty()) {
2010 dout(3) << "not active yet, waiting" << dendl
;
2011 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
2015 mdr
->slave_request
= m
;
2017 dispatch_slave_request(mdr
);
2020 /* This function DOES put the passed message before returning*/
// Handles acks from slave MDSs for a master request: XLOCK/WRLOCK acks
// record the granted remote lock and re-dispatch the request once no more
// slaves are awaited; the remaining ack types are forwarded to their
// dedicated handlers.  Also gates on MDS state and processes OP_COMMITTED
// bookkeeping for uncommitted masters.
2021 void Server::handle_slave_request_reply(MMDSSlaveRequest
*m
)
2023 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2025 if (!mds
->is_clientreplay() && !mds
->is_active() && !mds
->is_stopping()) {
2026 metareqid_t r
= m
->get_reqid();
// Replies for requests we have no uncommitted master record of are
// stale and ignored; otherwise wait until replay completes.
2027 if (!mdcache
->have_uncommitted_master(r
, from
)) {
2028 dout(10) << "handle_slave_request_reply ignoring slave reply from mds."
2029 << from
<< " reqid " << r
<< dendl
;
2033 dout(3) << "not clientreplay|active yet, waiting" << dendl
;
2034 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
2038 if (m
->get_op() == MMDSSlaveRequest::OP_COMMITTED
) {
2039 metareqid_t r
= m
->get_reqid();
2040 mdcache
->committed_master_slave(r
, from
);
2045 MDRequestRef mdr
= mdcache
->request_get(m
->get_reqid());
// Replies from a superseded attempt are dropped.
2046 if (m
->get_attempt() != mdr
->attempt
) {
2047 dout(10) << "handle_slave_request_reply " << *mdr
<< " ignoring reply from other attempt "
2048 << m
->get_attempt() << dendl
;
2053 switch (m
->get_op()) {
2054 case MMDSSlaveRequest::OP_XLOCKACK
:
2056 // identify lock, master request
2057 SimpleLock
*lock
= mds
->locker
->get_lock(m
->get_lock_type(),
2058 m
->get_object_info());
2059 mdr
->more()->slaves
.insert(from
);
2060 dout(10) << "got remote xlock on " << *lock
<< " on " << *lock
->get_parent() << dendl
;
// Record the granted xlock locally and resume the master request.
2061 mdr
->xlocks
.insert(lock
);
2062 mdr
->locks
.insert(lock
);
2063 mdr
->finish_locking(lock
);
2064 lock
->get_xlock(mdr
, mdr
->get_client());
2066 assert(mdr
->more()->waiting_on_slave
.count(from
));
2067 mdr
->more()->waiting_on_slave
.erase(from
);
2068 assert(mdr
->more()->waiting_on_slave
.empty());
2069 mdcache
->dispatch_request(mdr
);
2073 case MMDSSlaveRequest::OP_WRLOCKACK
:
2075 // identify lock, master request
2076 SimpleLock
*lock
= mds
->locker
->get_lock(m
->get_lock_type(),
2077 m
->get_object_info());
2078 mdr
->more()->slaves
.insert(from
);
2079 dout(10) << "got remote wrlock on " << *lock
<< " on " << *lock
->get_parent() << dendl
;
// Remote wrlocks are tracked per-rank (the lock is held on `from`).
2080 mdr
->remote_wrlocks
[lock
] = from
;
2081 mdr
->locks
.insert(lock
);
2082 mdr
->finish_locking(lock
);
2084 assert(mdr
->more()->waiting_on_slave
.count(from
));
2085 mdr
->more()->waiting_on_slave
.erase(from
);
2086 assert(mdr
->more()->waiting_on_slave
.empty());
2087 mdcache
->dispatch_request(mdr
);
// Remaining ack types are routed to their specific handlers.
2091 case MMDSSlaveRequest::OP_AUTHPINACK
:
2092 handle_slave_auth_pin_ack(mdr
, m
);
2095 case MMDSSlaveRequest::OP_LINKPREPACK
:
2096 handle_slave_link_prep_ack(mdr
, m
);
2099 case MMDSSlaveRequest::OP_RMDIRPREPACK
:
2100 handle_slave_rmdir_prep_ack(mdr
, m
);
2103 case MMDSSlaveRequest::OP_RENAMEPREPACK
:
2104 handle_slave_rename_prep_ack(mdr
, m
);
2107 case MMDSSlaveRequest::OP_RENAMENOTIFYACK
:
2108 handle_slave_rename_notify_ack(mdr
, m
);
2119 /* This function DOES put the mdr->slave_request before returning*/
2120 void Server::dispatch_slave_request(MDRequestRef
& mdr
)
2122 dout(7) << "dispatch_slave_request " << *mdr
<< " " << *mdr
->slave_request
<< dendl
;
2125 dout(7) << " abort flag set, finishing" << dendl
;
2126 mdcache
->request_finish(mdr
);
2130 if (logger
) logger
->inc(l_mdss_dispatch_slave_request
);
2132 int op
= mdr
->slave_request
->get_op();
2134 case MMDSSlaveRequest::OP_XLOCK
:
2135 case MMDSSlaveRequest::OP_WRLOCK
:
2138 SimpleLock
*lock
= mds
->locker
->get_lock(mdr
->slave_request
->get_lock_type(),
2139 mdr
->slave_request
->get_object_info());
2142 dout(10) << "don't have object, dropping" << dendl
;
2143 ceph_abort(); // can this happen, if we auth pinned properly.
2145 if (op
== MMDSSlaveRequest::OP_XLOCK
&& !lock
->get_parent()->is_auth()) {
2146 dout(10) << "not auth for remote xlock attempt, dropping on "
2147 << *lock
<< " on " << *lock
->get_parent() << dendl
;
2149 // use acquire_locks so that we get auth_pinning.
2150 set
<SimpleLock
*> rdlocks
;
2151 set
<SimpleLock
*> wrlocks
= mdr
->wrlocks
;
2152 set
<SimpleLock
*> xlocks
= mdr
->xlocks
;
2156 case MMDSSlaveRequest::OP_XLOCK
:
2157 xlocks
.insert(lock
);
2158 replycode
= MMDSSlaveRequest::OP_XLOCKACK
;
2160 case MMDSSlaveRequest::OP_WRLOCK
:
2161 wrlocks
.insert(lock
);
2162 replycode
= MMDSSlaveRequest::OP_WRLOCKACK
;
2166 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
2170 MMDSSlaveRequest
*r
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
, replycode
);
2171 r
->set_lock_type(lock
->get_type());
2172 lock
->get_parent()->set_object_info(r
->get_object_info());
2173 mds
->send_message(r
, mdr
->slave_request
->get_connection());
2177 mdr
->slave_request
->put();
2178 mdr
->slave_request
= 0;
2182 case MMDSSlaveRequest::OP_UNXLOCK
:
2183 case MMDSSlaveRequest::OP_UNWRLOCK
:
2185 SimpleLock
*lock
= mds
->locker
->get_lock(mdr
->slave_request
->get_lock_type(),
2186 mdr
->slave_request
->get_object_info());
2188 bool need_issue
= false;
2190 case MMDSSlaveRequest::OP_UNXLOCK
:
2191 mds
->locker
->xlock_finish(lock
, mdr
.get(), &need_issue
);
2193 case MMDSSlaveRequest::OP_UNWRLOCK
:
2194 mds
->locker
->wrlock_finish(lock
, mdr
.get(), &need_issue
);
2198 mds
->locker
->issue_caps(static_cast<CInode
*>(lock
->get_parent()));
2200 // done. no ack necessary.
2201 mdr
->slave_request
->put();
2202 mdr
->slave_request
= 0;
2206 case MMDSSlaveRequest::OP_DROPLOCKS
:
2207 mds
->locker
->drop_locks(mdr
.get());
2208 mdr
->slave_request
->put();
2209 mdr
->slave_request
= 0;
2212 case MMDSSlaveRequest::OP_AUTHPIN
:
2213 handle_slave_auth_pin(mdr
);
2216 case MMDSSlaveRequest::OP_LINKPREP
:
2217 case MMDSSlaveRequest::OP_UNLINKPREP
:
2218 handle_slave_link_prep(mdr
);
2221 case MMDSSlaveRequest::OP_RMDIRPREP
:
2222 handle_slave_rmdir_prep(mdr
);
2225 case MMDSSlaveRequest::OP_RENAMEPREP
:
2226 handle_slave_rename_prep(mdr
);
2229 case MMDSSlaveRequest::OP_FINISH
:
2230 // information about rename imported caps
2231 if (mdr
->slave_request
->inode_export
.length() > 0)
2232 mdr
->more()->inode_import
.claim(mdr
->slave_request
->inode_export
);
2233 // finish off request.
2234 mdcache
->request_finish(mdr
);
2242 /* This function DOES put the mdr->slave_request before returning*/
2243 void Server::handle_slave_auth_pin(MDRequestRef
& mdr
)
2245 dout(10) << "handle_slave_auth_pin " << *mdr
<< dendl
;
2247 // build list of objects
2248 list
<MDSCacheObject
*> objects
;
2249 CInode
*auth_pin_freeze
= NULL
;
2250 bool fail
= false, wouldblock
= false, readonly
= false;
2252 if (mdcache
->is_readonly()) {
2253 dout(10) << " read-only FS" << dendl
;
2259 for (vector
<MDSCacheObjectInfo
>::iterator p
= mdr
->slave_request
->get_authpins().begin();
2260 p
!= mdr
->slave_request
->get_authpins().end();
2262 MDSCacheObject
*object
= mdcache
->get_object(*p
);
2264 dout(10) << " don't have " << *p
<< dendl
;
2269 objects
.push_back(object
);
2270 if (*p
== mdr
->slave_request
->get_authpin_freeze())
2271 auth_pin_freeze
= static_cast<CInode
*>(object
);
2275 // can we auth pin them?
2277 for (list
<MDSCacheObject
*>::iterator p
= objects
.begin();
2280 if (!(*p
)->is_auth()) {
2281 dout(10) << " not auth for " << **p
<< dendl
;
2285 if (mdr
->is_auth_pinned(*p
))
2287 if (!mdr
->can_auth_pin(*p
)) {
2288 if (mdr
->slave_request
->is_nonblock()) {
2289 dout(10) << " can't auth_pin (freezing?) " << **p
<< " nonblocking" << dendl
;
2295 dout(10) << " waiting for authpinnable on " << **p
<< dendl
;
2296 (*p
)->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
2297 mdr
->drop_local_auth_pins();
2299 mds
->locker
->notify_freeze_waiter(*p
);
2307 mdr
->drop_local_auth_pins(); // just in case
2309 /* freeze authpin wrong inode */
2310 if (mdr
->has_more() && mdr
->more()->is_freeze_authpin
&&
2311 mdr
->more()->rename_inode
!= auth_pin_freeze
)
2312 mdr
->unfreeze_auth_pin(true);
2314 /* handle_slave_rename_prep() call freeze_inode() to wait for all other operations
2315 * on the source inode to complete. This happens after all locks for the rename
2316 * operation are acquired. But to acquire locks, we need auth pin locks' parent
2317 * objects first. So there is an ABBA deadlock if someone auth pins the source inode
2318 * after locks are acquired and before Server::handle_slave_rename_prep() is called.
2319 * The solution is freeze the inode and prevent other MDRequests from getting new
2322 if (auth_pin_freeze
) {
2323 dout(10) << " freezing auth pin on " << *auth_pin_freeze
<< dendl
;
2324 if (!mdr
->freeze_auth_pin(auth_pin_freeze
)) {
2325 auth_pin_freeze
->add_waiter(CInode::WAIT_FROZEN
, new C_MDS_RetryRequest(mdcache
, mdr
));
2326 mds
->mdlog
->flush();
2330 for (list
<MDSCacheObject
*>::iterator p
= objects
.begin();
2333 dout(10) << "auth_pinning " << **p
<< dendl
;
2339 MMDSSlaveRequest
*reply
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_AUTHPINACK
);
2341 // return list of my auth_pins (if any)
2342 for (set
<MDSCacheObject
*>::iterator p
= mdr
->auth_pins
.begin();
2343 p
!= mdr
->auth_pins
.end();
2345 MDSCacheObjectInfo info
;
2346 (*p
)->set_object_info(info
);
2347 reply
->get_authpins().push_back(info
);
2348 if (*p
== (MDSCacheObject
*)auth_pin_freeze
)
2349 auth_pin_freeze
->set_object_info(reply
->get_authpin_freeze());
2353 reply
->mark_error_wouldblock();
2355 reply
->mark_error_rofs();
2357 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
2359 // clean up this request
2360 mdr
->slave_request
->put();
2361 mdr
->slave_request
= 0;
2365 /* This function DOES NOT put the passed ack before returning*/
2366 void Server::handle_slave_auth_pin_ack(MDRequestRef
& mdr
, MMDSSlaveRequest
*ack
)
2368 dout(10) << "handle_slave_auth_pin_ack on " << *mdr
<< " " << *ack
<< dendl
;
2369 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
2372 set
<MDSCacheObject
*> pinned
;
2373 for (vector
<MDSCacheObjectInfo
>::iterator p
= ack
->get_authpins().begin();
2374 p
!= ack
->get_authpins().end();
2376 MDSCacheObject
*object
= mdcache
->get_object(*p
);
2377 assert(object
); // we pinned it
2378 dout(10) << " remote has pinned " << *object
<< dendl
;
2379 if (!mdr
->is_auth_pinned(object
))
2380 mdr
->remote_auth_pins
[object
] = from
;
2381 if (*p
== ack
->get_authpin_freeze())
2382 mdr
->set_remote_frozen_auth_pin(static_cast<CInode
*>(object
));
2383 pinned
.insert(object
);
2386 // removed frozen auth pin ?
2387 if (mdr
->more()->is_remote_frozen_authpin
&&
2388 ack
->get_authpin_freeze() == MDSCacheObjectInfo()) {
2389 auto p
= mdr
->remote_auth_pins
.find(mdr
->more()->rename_inode
);
2390 assert(p
!= mdr
->remote_auth_pins
.end());
2391 if (p
->second
== from
) {
2392 mdr
->more()->is_remote_frozen_authpin
= false;
2396 // removed auth pins?
2397 map
<MDSCacheObject
*, mds_rank_t
>::iterator p
= mdr
->remote_auth_pins
.begin();
2398 while (p
!= mdr
->remote_auth_pins
.end()) {
2399 MDSCacheObject
* object
= p
->first
;
2400 if (p
->second
== from
&& pinned
.count(object
) == 0) {
2401 dout(10) << " remote has unpinned " << *object
<< dendl
;
2402 mdr
->remote_auth_pins
.erase(p
++);
2408 if (ack
->is_error_rofs()) {
2409 mdr
->more()->slave_error
= -EROFS
;
2410 mdr
->aborted
= true;
2411 } else if (ack
->is_error_wouldblock()) {
2412 mdr
->more()->slave_error
= -EWOULDBLOCK
;
2413 mdr
->aborted
= true;
2417 mdr
->more()->slaves
.insert(from
);
2419 // clear from waiting list
2420 assert(mdr
->more()->waiting_on_slave
.count(from
));
2421 mdr
->more()->waiting_on_slave
.erase(from
);
2424 if (mdr
->more()->waiting_on_slave
.empty())
2425 mdcache
->dispatch_request(mdr
);
2427 dout(10) << "still waiting on slaves " << mdr
->more()->waiting_on_slave
<< dendl
;
2431 // ---------------------------------------
2436 * check whether we are permitted to complete a request
2438 * Check whether we have permission to perform the operation specified
2439 * by mask on the given inode, based on the capability in the mdr's
2442 bool Server::check_access(MDRequestRef
& mdr
, CInode
*in
, unsigned mask
)
2445 int r
= mdr
->session
->check_access(
2447 mdr
->client_request
->get_caller_uid(),
2448 mdr
->client_request
->get_caller_gid(),
2449 &mdr
->client_request
->get_caller_gid_list(),
2450 mdr
->client_request
->head
.args
.setattr
.uid
,
2451 mdr
->client_request
->head
.args
.setattr
.gid
);
2453 respond_to_request(mdr
, r
);
2461 * check whether fragment has reached maximum size
2464 bool Server::check_fragment_space(MDRequestRef
&mdr
, CDir
*in
)
2466 const auto size
= in
->get_frag_size();
2467 if (size
>= g_conf
->mds_bal_fragment_size_max
) {
2468 dout(10) << "fragment " << *in
<< " size exceeds " << g_conf
->mds_bal_fragment_size_max
<< " (ENOSPC)" << dendl
;
2469 respond_to_request(mdr
, -ENOSPC
);
2477 /** validate_dentry_dir
2479 * verify that the dir exists and would own the dname.
2480 * do not check if the dentry exists.
2482 CDir
*Server::validate_dentry_dir(MDRequestRef
& mdr
, CInode
*diri
, boost::string_view dname
)
2484 // make sure parent is a dir?
2485 if (!diri
->is_dir()) {
2486 dout(7) << "validate_dentry_dir: not a dir" << dendl
;
2487 respond_to_request(mdr
, -ENOTDIR
);
2492 frag_t fg
= diri
->pick_dirfrag(dname
);
2493 CDir
*dir
= try_open_auth_dirfrag(diri
, fg
, mdr
);
2498 if (dir
->is_frozen()) {
2499 dout(7) << "dir is frozen " << *dir
<< dendl
;
2500 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
2508 /** prepare_null_dentry
2509 * prepare a null (or existing) dentry in given dir.
2510 * wait for any dn lock.
2512 CDentry
* Server::prepare_null_dentry(MDRequestRef
& mdr
, CDir
*dir
, boost::string_view dname
, bool okexist
)
2514 dout(10) << "prepare_null_dentry " << dname
<< " in " << *dir
<< dendl
;
2515 assert(dir
->is_auth());
2517 client_t client
= mdr
->get_client();
2519 // does it already exist?
2520 CDentry
*dn
= dir
->lookup(dname
);
2523 if (dn->lock.is_xlocked_by_other(mdr)) {
2524 dout(10) << "waiting on xlocked dentry " << *dn << dendl;
2525 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
2529 if (!dn
->get_linkage(client
, mdr
)->is_null()) {
2530 // name already exists
2531 dout(10) << "dentry " << dname
<< " exists in " << *dir
<< dendl
;
2533 respond_to_request(mdr
, -EEXIST
);
2537 dn
->first
= dir
->inode
->find_snaprealm()->get_newest_seq() + 1;
2543 // make sure dir is complete
2544 if (!dir
->is_complete() && (!dir
->has_bloom() || dir
->is_in_bloom(dname
))) {
2545 dout(7) << " incomplete dir contents for " << *dir
<< ", fetching" << dendl
;
2546 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
));
2551 dn
= dir
->add_null_dentry(dname
, dir
->inode
->find_snaprealm()->get_newest_seq() + 1);
2553 dout(10) << "prepare_null_dentry added " << *dn
<< dendl
;
2557 CDentry
* Server::prepare_stray_dentry(MDRequestRef
& mdr
, CInode
*in
)
2559 CDentry
*straydn
= mdr
->straydn
;
2562 in
->name_stray_dentry(straydname
);
2563 if (straydn
->get_name() == straydname
)
2566 assert(!mdr
->done_locking
);
2567 mdr
->unpin(straydn
);
2570 CDir
*straydir
= mdcache
->get_stray_dir(in
);
2572 if (!mdr
->client_request
->is_replay() &&
2573 !check_fragment_space(mdr
, straydir
))
2576 straydn
= mdcache
->get_or_create_stray_dentry(in
);
2577 mdr
->straydn
= straydn
;
2582 /** prepare_new_inode
2584 * create a new inode. set c/m/atime. hit dir pop.
2586 CInode
* Server::prepare_new_inode(MDRequestRef
& mdr
, CDir
*dir
, inodeno_t useino
, unsigned mode
,
2587 file_layout_t
*layout
)
2589 CInode
*in
= new CInode(mdcache
);
2591 // Server::prepare_force_open_sessions() can re-open session in closing
2592 // state. In that corner case, session's prealloc_inos are being freed.
2593 // To simplify the code, we disallow using/refilling session's prealloc_ino
2594 // while session is opening.
2595 bool allow_prealloc_inos
= !mdr
->session
->is_opening();
2598 if (allow_prealloc_inos
&&
2599 mdr
->session
->info
.prealloc_inos
.size()) {
2600 mdr
->used_prealloc_ino
=
2601 in
->inode
.ino
= mdr
->session
->take_ino(useino
); // prealloc -> used
2602 mds
->sessionmap
.mark_projected(mdr
->session
);
2604 dout(10) << "prepare_new_inode used_prealloc " << mdr
->used_prealloc_ino
2605 << " (" << mdr
->session
->info
.prealloc_inos
2606 << ", " << mdr
->session
->info
.prealloc_inos
.size() << " left)"
2610 in
->inode
.ino
= mds
->inotable
->project_alloc_id();
2611 dout(10) << "prepare_new_inode alloc " << mdr
->alloc_ino
<< dendl
;
2614 if (useino
&& useino
!= in
->inode
.ino
) {
2615 dout(0) << "WARNING: client specified " << useino
<< " and i allocated " << in
->inode
.ino
<< dendl
;
2616 mds
->clog
->error() << mdr
->client_request
->get_source()
2617 << " specified ino " << useino
2618 << " but mds." << mds
->get_nodeid() << " allocated " << in
->inode
.ino
;
2619 //ceph_abort(); // just for now.
2622 if (allow_prealloc_inos
&&
2623 mdr
->session
->get_num_projected_prealloc_inos() < g_conf
->mds_client_prealloc_inos
/ 2) {
2624 int need
= g_conf
->mds_client_prealloc_inos
- mdr
->session
->get_num_projected_prealloc_inos();
2625 mds
->inotable
->project_alloc_ids(mdr
->prealloc_inos
, need
);
2626 assert(mdr
->prealloc_inos
.size()); // or else fix projected increment semantics
2627 mdr
->session
->pending_prealloc_inos
.insert(mdr
->prealloc_inos
);
2628 mds
->sessionmap
.mark_projected(mdr
->session
);
2629 dout(10) << "prepare_new_inode prealloc " << mdr
->prealloc_inos
<< dendl
;
2632 in
->inode
.version
= 1;
2633 in
->inode
.xattr_version
= 1;
2634 in
->inode
.nlink
= 1; // FIXME
2636 in
->inode
.mode
= mode
;
2638 memset(&in
->inode
.dir_layout
, 0, sizeof(in
->inode
.dir_layout
));
2639 if (in
->inode
.is_dir()) {
2640 in
->inode
.dir_layout
.dl_dir_hash
= g_conf
->mds_default_dir_hash
;
2641 } else if (layout
) {
2642 in
->inode
.layout
= *layout
;
2644 in
->inode
.layout
= mdcache
->default_file_layout
;
2647 in
->inode
.truncate_size
= -1ull; // not truncated, yet!
2648 in
->inode
.truncate_seq
= 1; /* starting with 1, 0 is kept for no-truncation logic */
2650 CInode
*diri
= dir
->get_inode();
2652 dout(10) << oct
<< " dir mode 0" << diri
->inode
.mode
<< " new mode 0" << mode
<< dec
<< dendl
;
2654 if (diri
->inode
.mode
& S_ISGID
) {
2655 dout(10) << " dir is sticky" << dendl
;
2656 in
->inode
.gid
= diri
->inode
.gid
;
2657 if (S_ISDIR(mode
)) {
2658 dout(10) << " new dir also sticky" << dendl
;
2659 in
->inode
.mode
|= S_ISGID
;
2662 in
->inode
.gid
= mdr
->client_request
->get_caller_gid();
2664 in
->inode
.uid
= mdr
->client_request
->get_caller_uid();
2666 in
->inode
.btime
= in
->inode
.ctime
= in
->inode
.mtime
= in
->inode
.atime
=
2667 mdr
->get_op_stamp();
2669 in
->inode
.change_attr
= 0;
2671 MClientRequest
*req
= mdr
->client_request
;
2672 if (req
->get_data().length()) {
2673 bufferlist::iterator p
= req
->get_data().begin();
2675 // xattrs on new inode?
2676 CInode::mempool_xattr_map xattrs
;
2677 ::decode(xattrs
, p
);
2678 for (const auto &p
: xattrs
) {
2679 dout(10) << "prepare_new_inode setting xattr " << p
.first
<< dendl
;
2680 auto em
= in
->xattrs
.emplace(std::piecewise_construct
, std::forward_as_tuple(p
.first
), std::forward_as_tuple(p
.second
));
2682 em
.first
->second
= p
.second
;
2686 if (!mds
->mdsmap
->get_inline_data_enabled() ||
2687 !mdr
->session
->connection
->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
))
2688 in
->inode
.inline_data
.version
= CEPH_INLINE_NONE
;
2690 mdcache
->add_inode(in
); // add
2691 dout(10) << "prepare_new_inode " << *in
<< dendl
;
2695 void Server::journal_allocated_inos(MDRequestRef
& mdr
, EMetaBlob
*blob
)
2697 dout(20) << "journal_allocated_inos sessionmapv " << mds
->sessionmap
.get_projected()
2698 << " inotablev " << mds
->inotable
->get_projected_version()
2700 blob
->set_ino_alloc(mdr
->alloc_ino
,
2701 mdr
->used_prealloc_ino
,
2703 mdr
->client_request
->get_source(),
2704 mds
->sessionmap
.get_projected(),
2705 mds
->inotable
->get_projected_version());
2708 void Server::apply_allocated_inos(MDRequestRef
& mdr
, Session
*session
)
2710 dout(10) << "apply_allocated_inos " << mdr
->alloc_ino
2711 << " / " << mdr
->prealloc_inos
2712 << " / " << mdr
->used_prealloc_ino
<< dendl
;
2714 if (mdr
->alloc_ino
) {
2715 mds
->inotable
->apply_alloc_id(mdr
->alloc_ino
);
2717 if (mdr
->prealloc_inos
.size()) {
2719 session
->pending_prealloc_inos
.subtract(mdr
->prealloc_inos
);
2720 session
->info
.prealloc_inos
.insert(mdr
->prealloc_inos
);
2721 mds
->sessionmap
.mark_dirty(session
);
2722 mds
->inotable
->apply_alloc_ids(mdr
->prealloc_inos
);
2724 if (mdr
->used_prealloc_ino
) {
2726 session
->info
.used_inos
.erase(mdr
->used_prealloc_ino
);
2727 mds
->sessionmap
.mark_dirty(session
);
2731 class C_MDS_TryFindInode
: public ServerContext
{
2734 C_MDS_TryFindInode(Server
*s
, MDRequestRef
& r
) : ServerContext(s
), mdr(r
) {}
2735 void finish(int r
) override
{
2736 if (r
== -ESTALE
) // :( find_ino_peers failed
2737 server
->respond_to_request(mdr
, r
);
2739 server
->dispatch_client_request(mdr
);
2743 CDir
*Server::traverse_to_auth_dir(MDRequestRef
& mdr
, vector
<CDentry
*> &trace
, filepath refpath
)
2745 // figure parent dir vs dname
2746 if (refpath
.depth() == 0) {
2747 dout(7) << "can't do that to root" << dendl
;
2748 respond_to_request(mdr
, -EINVAL
);
2751 string dname
= refpath
.last_dentry();
2752 refpath
.pop_dentry();
2754 dout(10) << "traverse_to_auth_dir dirpath " << refpath
<< " dname " << dname
<< dendl
;
2756 // traverse to parent dir
2758 int r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, refpath
, &trace
, &diri
, MDS_TRAVERSE_FORWARD
);
2759 if (r
> 0) return 0; // delayed
2762 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
2763 mdcache
->find_ino_peers(refpath
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
2766 respond_to_request(mdr
, r
);
2770 // is it an auth dir?
2771 CDir
*dir
= validate_dentry_dir(mdr
, diri
, dname
);
2773 return 0; // forwarded or waiting for freeze
2775 dout(10) << "traverse_to_auth_dir " << *dir
<< dendl
;
2779 /* If this returns null, the request has been handled
2780 * as appropriate: forwarded on, or the client's been replied to */
2781 CInode
* Server::rdlock_path_pin_ref(MDRequestRef
& mdr
, int n
,
2782 set
<SimpleLock
*> &rdlocks
,
2784 bool no_want_auth
, /* for readdir, who doesn't want auth _even_if_ it's
2786 file_layout_t
**layout
,
2787 bool no_lookup
) // true if we cannot return a null dentry lease
2789 const filepath
& refpath
= n
? mdr
->get_filepath2() : mdr
->get_filepath();
2790 dout(10) << "rdlock_path_pin_ref " << *mdr
<< " " << refpath
<< dendl
;
2792 if (mdr
->done_locking
)
2796 int r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, refpath
, &mdr
->dn
[n
], &mdr
->in
[n
], MDS_TRAVERSE_FORWARD
);
2798 return NULL
; // delayed
2799 if (r
< 0) { // error
2800 if (r
== -ENOENT
&& n
== 0 && mdr
->dn
[n
].size()) {
2802 mdr
->tracedn
= mdr
->dn
[n
][mdr
->dn
[n
].size()-1];
2803 respond_to_request(mdr
, r
);
2804 } else if (r
== -ESTALE
) {
2805 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
2806 MDSInternalContextBase
*c
= new C_MDS_TryFindInode(this, mdr
);
2807 mdcache
->find_ino_peers(refpath
.get_ino(), c
);
2809 dout(10) << "FAIL on error " << r
<< dendl
;
2810 respond_to_request(mdr
, r
);
2814 CInode
*ref
= mdr
->in
[n
];
2815 dout(10) << "ref is " << *ref
<< dendl
;
2817 // fw to inode auth?
2818 if (mdr
->snapid
!= CEPH_NOSNAP
&& !no_want_auth
)
2822 if (ref
->is_ambiguous_auth()) {
2823 dout(10) << "waiting for single auth on " << *ref
<< dendl
;
2824 ref
->add_waiter(CInode::WAIT_SINGLEAUTH
, new C_MDS_RetryRequest(mdcache
, mdr
));
2827 if (!ref
->is_auth()) {
2828 dout(10) << "fw to auth for " << *ref
<< dendl
;
2829 mdcache
->request_forward(mdr
, ref
->authority().first
);
2834 // do NOT proceed if freezing, as cap release may defer in that case, and
2835 // we could deadlock when we try to lock @ref.
2836 // if we're already auth_pinned, continue; the release has already been processed.
2837 if (ref
->is_frozen() || ref
->is_frozen_auth_pin() ||
2838 (ref
->is_freezing() && !mdr
->is_auth_pinned(ref
))) {
2839 dout(7) << "waiting for !frozen/authpinnable on " << *ref
<< dendl
;
2840 ref
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
2841 /* If we have any auth pins, this will deadlock.
2842 * But the only way to get here if we've already got auth pins
2843 * is because we're on an inode with snapshots that got updated
2844 * between dispatches of this request. So we're going to drop
2845 * our locks and our auth pins and reacquire them later.
2847 * This is safe since we're only in this function when working on
2848 * a single MDS request; otherwise we'd be in
2849 * rdlock_path_xlock_dentry.
2851 mds
->locker
->drop_locks(mdr
.get(), NULL
);
2852 mdr
->drop_local_auth_pins();
2853 if (!mdr
->remote_auth_pins
.empty())
2854 mds
->locker
->notify_freeze_waiter(ref
);
2861 for (int i
=0; i
<(int)mdr
->dn
[n
].size(); i
++)
2862 rdlocks
.insert(&mdr
->dn
[n
][i
]->lock
);
2864 mds
->locker
->include_snap_rdlocks_wlayout(rdlocks
, ref
, layout
);
2866 mds
->locker
->include_snap_rdlocks(rdlocks
, ref
);
2874 /** rdlock_path_xlock_dentry
2875 * traverse path to the directory that could/would contain dentry.
2876 * make sure i am auth for that dentry, forward as necessary.
2877 * create null dentry in place (or use existing if okexist).
2878 * get rdlocks on traversed dentries, xlock on new dentry.
2880 CDentry
* Server::rdlock_path_xlock_dentry(MDRequestRef
& mdr
, int n
,
2881 set
<SimpleLock
*>& rdlocks
, set
<SimpleLock
*>& wrlocks
, set
<SimpleLock
*>& xlocks
,
2882 bool okexist
, bool mustexist
, bool alwaysxlock
,
2883 file_layout_t
**layout
)
2885 const filepath
& refpath
= n
? mdr
->get_filepath2() : mdr
->get_filepath();
2887 dout(10) << "rdlock_path_xlock_dentry " << *mdr
<< " " << refpath
<< dendl
;
2889 client_t client
= mdr
->get_client();
2891 if (mdr
->done_locking
)
2892 return mdr
->dn
[n
].back();
2894 CDir
*dir
= traverse_to_auth_dir(mdr
, mdr
->dn
[n
], refpath
);
2897 CInode
*diri
= dir
->get_inode();
2898 if (!mdr
->reqid
.name
.is_mds()) {
2899 if (diri
->is_system() && !diri
->is_root()) {
2900 respond_to_request(mdr
, -EROFS
);
2904 if (!diri
->is_base() && diri
->get_projected_parent_dir()->inode
->is_stray()) {
2905 respond_to_request(mdr
, -ENOENT
);
2909 // make a null dentry?
2910 boost::string_view dname
= refpath
.last_dentry();
2913 dn
= dir
->lookup(dname
);
2915 // make sure dir is complete
2916 if (!dn
&& !dir
->is_complete() &&
2917 (!dir
->has_bloom() || dir
->is_in_bloom(dname
))) {
2918 dout(7) << " incomplete dir contents for " << *dir
<< ", fetching" << dendl
;
2919 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
));
2924 if (dn
&& !dn
->lock
.can_read(client
) && dn
->lock
.get_xlock_by() != mdr
) {
2925 dout(10) << "waiting on xlocked dentry " << *dn
<< dendl
;
2926 dn
->lock
.add_waiter(SimpleLock::WAIT_RD
, new C_MDS_RetryRequest(mdcache
, mdr
));
2931 if (!dn
|| dn
->get_linkage(client
, mdr
)->is_null()) {
2932 dout(7) << "dentry " << dname
<< " dne in " << *dir
<< dendl
;
2933 respond_to_request(mdr
, -ENOENT
);
2937 dn
= prepare_null_dentry(mdr
, dir
, dname
, okexist
);
2942 mdr
->dn
[n
].push_back(dn
);
2943 CDentry::linkage_t
*dnl
= dn
->get_linkage(client
, mdr
);
2944 mdr
->in
[n
] = dnl
->get_inode();
2947 // NOTE: rename takes the same set of locks for srcdn
2948 for (int i
=0; i
<(int)mdr
->dn
[n
].size(); i
++)
2949 rdlocks
.insert(&mdr
->dn
[n
][i
]->lock
);
2950 if (alwaysxlock
|| dnl
->is_null())
2951 xlocks
.insert(&dn
->lock
); // new dn, xlock
2953 rdlocks
.insert(&dn
->lock
); // existing dn, rdlock
2954 wrlocks
.insert(&dn
->get_dir()->inode
->filelock
); // also, wrlock on dir mtime
2955 wrlocks
.insert(&dn
->get_dir()->inode
->nestlock
); // also, wrlock on dir mtime
2957 mds
->locker
->include_snap_rdlocks_wlayout(rdlocks
, dn
->get_dir()->inode
, layout
);
2959 mds
->locker
->include_snap_rdlocks(rdlocks
, dn
->get_dir()->inode
);
2969 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
2971 * @param diri base inode
2972 * @param fg the exact frag we want
2973 * @param mdr request
2974 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
2976 CDir
* Server::try_open_auth_dirfrag(CInode
*diri
, frag_t fg
, MDRequestRef
& mdr
)
2978 CDir
*dir
= diri
->get_dirfrag(fg
);
2980 // not open and inode not mine?
2981 if (!dir
&& !diri
->is_auth()) {
2982 mds_rank_t inauth
= diri
->authority().first
;
2983 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth
<< dendl
;
2984 mdcache
->request_forward(mdr
, inauth
);
2988 // not open and inode frozen?
2989 if (!dir
&& diri
->is_frozen()) {
2990 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri
<< dendl
;
2991 assert(diri
->get_parent_dir());
2992 diri
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
2998 dir
= diri
->get_or_open_dirfrag(mdcache
, fg
);
3000 // am i auth for the dirfrag?
3001 if (!dir
->is_auth()) {
3002 mds_rank_t auth
= dir
->authority().first
;
3003 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3004 << ", fw to mds." << auth
<< dendl
;
3005 mdcache
->request_forward(mdr
, auth
);
3013 // ===============================================================================
3016 void Server::handle_client_getattr(MDRequestRef
& mdr
, bool is_lookup
)
3018 MClientRequest
*req
= mdr
->client_request
;
3019 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
3021 if (req
->get_filepath().depth() == 0 && is_lookup
) {
3022 // refpath can't be empty for lookup but it can for
3023 // getattr (we do getattr with empty refpath for mount of '/')
3024 respond_to_request(mdr
, -EINVAL
);
3028 bool want_auth
= false;
3029 int mask
= req
->head
.args
.getattr
.mask
;
3030 if (mask
& CEPH_STAT_RSTAT
)
3031 want_auth
= true; // set want_auth for CEPH_STAT_RSTAT mask
3033 CInode
*ref
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, want_auth
, false, NULL
,
3038 * if client currently holds the EXCL cap on a field, do not rdlock
3039 * it; client's stat() will result in valid info if _either_ EXCL
3040 * cap is held or MDS rdlocks and reads the value here.
3042 * handling this case here is easier than weakening rdlock
3043 * semantics... that would cause problems elsewhere.
3045 client_t client
= mdr
->get_client();
3047 Capability
*cap
= ref
->get_client_cap(client
);
3048 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
||
3049 mdr
->snapid
<= cap
->client_follows
))
3050 issued
= cap
->issued();
3052 if ((mask
& CEPH_CAP_LINK_SHARED
) && !(issued
& CEPH_CAP_LINK_EXCL
))
3053 rdlocks
.insert(&ref
->linklock
);
3054 if ((mask
& CEPH_CAP_AUTH_SHARED
) && !(issued
& CEPH_CAP_AUTH_EXCL
))
3055 rdlocks
.insert(&ref
->authlock
);
3056 if ((mask
& CEPH_CAP_XATTR_SHARED
) && !(issued
& CEPH_CAP_XATTR_EXCL
))
3057 rdlocks
.insert(&ref
->xattrlock
);
3058 if ((mask
& CEPH_CAP_FILE_SHARED
) && !(issued
& CEPH_CAP_FILE_EXCL
)) {
3059 // Don't wait on unstable filelock if client is allowed to read file size.
3060 // This can reduce the response time of getattr in the case that multiple
3061 // clients do stat(2) and there are writers.
3062 // The downside of this optimization is that mds may not issue Fs caps along
3063 // with getattr reply. Client may need to send more getattr requests.
3064 if (mdr
->rdlocks
.count(&ref
->filelock
)) {
3065 rdlocks
.insert(&ref
->filelock
);
3066 } else if (ref
->filelock
.is_stable() ||
3067 ref
->filelock
.get_num_wrlocks() > 0 ||
3068 !ref
->filelock
.can_read(mdr
->get_client())) {
3069 rdlocks
.insert(&ref
->filelock
);
3070 mdr
->done_locking
= false;
3074 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
3077 if (!check_access(mdr
, ref
, MAY_READ
))
3080 utime_t now
= ceph_clock_now();
3081 mdr
->set_mds_stamp(now
);
3083 // note which caps are requested, so we return at least a snapshot
3084 // value for them. (currently this matters for xattrs and inline data)
3085 mdr
->getattr_caps
= mask
;
3087 mds
->balancer
->hit_inode(now
, ref
, META_POP_IRD
,
3088 req
->get_source().num());
3091 dout(10) << "reply to stat on " << *req
<< dendl
;
3094 mdr
->tracedn
= mdr
->dn
[0].back();
3095 respond_to_request(mdr
, 0);
3098 struct C_MDS_LookupIno2
: public ServerContext
{
3100 C_MDS_LookupIno2(Server
*s
, MDRequestRef
& r
) : ServerContext(s
), mdr(r
) {}
3101 void finish(int r
) override
{
3102 server
->_lookup_ino_2(mdr
, r
);
3106 /* This function DOES clean up the mdr before returning*/
3110 void Server::handle_client_lookup_ino(MDRequestRef
& mdr
,
3111 bool want_parent
, bool want_dentry
)
3113 MClientRequest
*req
= mdr
->client_request
;
3115 inodeno_t ino
= req
->get_filepath().get_ino();
3116 CInode
*in
= mdcache
->get_inode(ino
);
3117 if (in
&& in
->state_test(CInode::STATE_PURGING
)) {
3118 respond_to_request(mdr
, -ESTALE
);
3122 mdcache
->open_ino(ino
, (int64_t)-1, new C_MDS_LookupIno2(this, mdr
), false);
3126 if (mdr
&& in
->snaprealm
&& !in
->snaprealm
->is_open() &&
3127 !in
->snaprealm
->open_parents(new C_MDS_RetryRequest(mdcache
, mdr
))) {
3131 // check for nothing (not read or write); this still applies the
3133 if (!check_access(mdr
, in
, 0))
3136 CDentry
*dn
= in
->get_projected_parent_dn();
3137 CInode
*diri
= dn
? dn
->get_dir()->inode
: NULL
;
3139 set
<SimpleLock
*> rdlocks
;
3140 if (dn
&& (want_parent
|| want_dentry
)) {
3142 rdlocks
.insert(&dn
->lock
);
3145 unsigned mask
= req
->head
.args
.getattr
.mask
;
3147 Capability
*cap
= in
->get_client_cap(mdr
->get_client());
3149 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
|| mdr
->snapid
<= cap
->client_follows
))
3150 issued
= cap
->issued();
3151 // permission bits, ACL/security xattrs
3152 if ((mask
& CEPH_CAP_AUTH_SHARED
) && (issued
& CEPH_CAP_AUTH_EXCL
) == 0)
3153 rdlocks
.insert(&in
->authlock
);
3154 if ((mask
& CEPH_CAP_XATTR_SHARED
) && (issued
& CEPH_CAP_XATTR_EXCL
) == 0)
3155 rdlocks
.insert(&in
->xattrlock
);
3157 mdr
->getattr_caps
= mask
;
3160 if (!rdlocks
.empty()) {
3161 set
<SimpleLock
*> wrlocks
, xlocks
;
3162 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
3166 // need read access to directory inode
3167 if (!check_access(mdr
, diri
, MAY_READ
))
3173 if (in
->is_base()) {
3174 respond_to_request(mdr
, -EINVAL
);
3177 if (!diri
|| diri
->is_stray()) {
3178 respond_to_request(mdr
, -ESTALE
);
3181 dout(10) << "reply to lookup_parent " << *in
<< dendl
;
3183 respond_to_request(mdr
, 0);
3186 inodeno_t dirino
= req
->get_filepath2().get_ino();
3187 if (!diri
|| (dirino
!= inodeno_t() && diri
->ino() != dirino
)) {
3188 respond_to_request(mdr
, -ENOENT
);
3191 dout(10) << "reply to lookup_name " << *in
<< dendl
;
3193 dout(10) << "reply to lookup_ino " << *in
<< dendl
;
3198 respond_to_request(mdr
, 0);
3202 void Server::_lookup_ino_2(MDRequestRef
& mdr
, int r
)
3204 inodeno_t ino
= mdr
->client_request
->get_filepath().get_ino();
3205 dout(10) << "_lookup_ino_2 " << mdr
.get() << " ino " << ino
<< " r=" << r
<< dendl
;
3207 // `r` is a rank if >=0, else an error code
3209 mds_rank_t
dest_rank(r
);
3210 if (dest_rank
== mds
->get_nodeid())
3211 dispatch_client_request(mdr
);
3213 mdcache
->request_forward(mdr
, dest_rank
);
3218 if (r
== -ENOENT
|| r
== -ENODATA
)
3220 respond_to_request(mdr
, r
);
3224 /* This function takes responsibility for the passed mdr*/
3225 void Server::handle_client_open(MDRequestRef
& mdr
)
3227 MClientRequest
*req
= mdr
->client_request
;
3228 dout(7) << "open on " << req
->get_filepath() << dendl
;
3230 int flags
= req
->head
.args
.open
.flags
;
3231 int cmode
= ceph_flags_to_mode(flags
);
3233 respond_to_request(mdr
, -EINVAL
);
3237 bool need_auth
= !file_mode_is_readonly(cmode
) ||
3238 (flags
& (CEPH_O_TRUNC
| CEPH_O_DIRECTORY
));
3240 if ((cmode
& CEPH_FILE_MODE_WR
) && mdcache
->is_readonly()) {
3241 dout(7) << "read-only FS" << dendl
;
3242 respond_to_request(mdr
, -EROFS
);
3246 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
3247 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, need_auth
);
3251 if (cur
->is_frozen() || cur
->state_test(CInode::STATE_EXPORTINGCAPS
)) {
3253 mdr
->done_locking
= false;
3254 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
3259 if (!cur
->inode
.is_file()) {
3260 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
3261 cmode
= CEPH_FILE_MODE_PIN
;
3262 // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
3263 if (cur
->inode
.is_symlink() && !(flags
& CEPH_O_NOFOLLOW
))
3264 flags
&= ~CEPH_O_TRUNC
;
3267 dout(10) << "open flags = " << flags
3268 << ", filemode = " << cmode
3269 << ", need_auth = " << need_auth
3273 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
3274 dout(7) << "not a file or dir " << *cur << dendl;
3275 respond_to_request(mdr, -ENXIO); // FIXME what error do we want?
3278 if ((flags
& CEPH_O_DIRECTORY
) && !cur
->inode
.is_dir() && !cur
->inode
.is_symlink()) {
3279 dout(7) << "specified O_DIRECTORY on non-directory " << *cur
<< dendl
;
3280 respond_to_request(mdr
, -EINVAL
);
3284 if ((flags
& CEPH_O_TRUNC
) && !cur
->inode
.is_file()) {
3285 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur
<< dendl
;
3286 // we should return -EISDIR for directory, return -EINVAL for other non-regular
3287 respond_to_request(mdr
, cur
->inode
.is_dir() ? -EISDIR
: -EINVAL
);
3291 if (cur
->inode
.inline_data
.version
!= CEPH_INLINE_NONE
&&
3292 !mdr
->session
->connection
->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
)) {
3293 dout(7) << "old client cannot open inline data file " << *cur
<< dendl
;
3294 respond_to_request(mdr
, -EPERM
);
3298 // snapped data is read only
3299 if (mdr
->snapid
!= CEPH_NOSNAP
&&
3300 ((cmode
& CEPH_FILE_MODE_WR
) || req
->may_write())) {
3301 dout(7) << "snap " << mdr
->snapid
<< " is read-only " << *cur
<< dendl
;
3302 respond_to_request(mdr
, -EROFS
);
3306 unsigned mask
= req
->head
.args
.open
.mask
;
3308 Capability
*cap
= cur
->get_client_cap(mdr
->get_client());
3310 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
|| mdr
->snapid
<= cap
->client_follows
))
3311 issued
= cap
->issued();
3312 // permission bits, ACL/security xattrs
3313 if ((mask
& CEPH_CAP_AUTH_SHARED
) && (issued
& CEPH_CAP_AUTH_EXCL
) == 0)
3314 rdlocks
.insert(&cur
->authlock
);
3315 if ((mask
& CEPH_CAP_XATTR_SHARED
) && (issued
& CEPH_CAP_XATTR_EXCL
) == 0)
3316 rdlocks
.insert(&cur
->xattrlock
);
3318 mdr
->getattr_caps
= mask
;
3322 if ((flags
& CEPH_O_TRUNC
) && !mdr
->has_completed
) {
3323 assert(cur
->is_auth());
3325 xlocks
.insert(&cur
->filelock
);
3326 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
3329 if (!check_access(mdr
, cur
, MAY_WRITE
))
3332 // wait for pending truncate?
3333 const auto pi
= cur
->get_projected_inode();
3334 if (pi
->is_truncating()) {
3335 dout(10) << " waiting for pending truncate from " << pi
->truncate_from
3336 << " to " << pi
->truncate_size
<< " to complete on " << *cur
<< dendl
;
3337 mds
->locker
->drop_locks(mdr
.get());
3338 mdr
->drop_local_auth_pins();
3339 cur
->add_waiter(CInode::WAIT_TRUNC
, new C_MDS_RetryRequest(mdcache
, mdr
));
3343 do_open_truncate(mdr
, cmode
);
3347 // sync filelock if snapped.
3348 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
3349 // and that data itself is flushed so that we can read the snapped data off disk.
3350 if (mdr
->snapid
!= CEPH_NOSNAP
&& !cur
->is_dir()) {
3351 rdlocks
.insert(&cur
->filelock
);
3354 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
3358 if (cmode
& CEPH_FILE_MODE_WR
)
3360 if (!check_access(mdr
, cur
, mask
))
3363 utime_t now
= ceph_clock_now();
3364 mdr
->set_mds_stamp(now
);
3366 if (cur
->is_file() || cur
->is_dir()) {
3367 if (mdr
->snapid
== CEPH_NOSNAP
) {
3369 Capability
*cap
= mds
->locker
->issue_new_caps(cur
, cmode
, mdr
->session
, 0, req
->is_replay());
3371 dout(12) << "open issued caps " << ccap_string(cap
->pending())
3372 << " for " << req
->get_source()
3373 << " on " << *cur
<< dendl
;
3375 int caps
= ceph_caps_for_mode(cmode
);
3376 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps
)
3377 << " for " << req
->get_source()
3378 << " snapid " << mdr
->snapid
3379 << " on " << *cur
<< dendl
;
3380 mdr
->snap_caps
= caps
;
3384 // increase max_size?
3385 if (cmode
& CEPH_FILE_MODE_WR
)
3386 mds
->locker
->check_inode_max_size(cur
);
3388 // make sure this inode gets into the journal
3389 if (cur
->is_auth() && cur
->last
== CEPH_NOSNAP
&&
3390 !cur
->item_open_file
.is_on_list()) {
3391 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
3392 EOpen
*le
= new EOpen(mds
->mdlog
);
3393 mdlog
->start_entry(le
);
3394 le
->add_clean_inode(cur
);
3395 ls
->open_files
.push_back(&cur
->item_open_file
);
3396 mdlog
->submit_entry(le
);
3400 if (cmode
& CEPH_FILE_MODE_WR
)
3401 mds
->balancer
->hit_inode(now
, cur
, META_POP_IWR
);
3403 mds
->balancer
->hit_inode(now
, cur
, META_POP_IRD
,
3404 mdr
->client_request
->get_source().num());
3407 if (req
->get_dentry_wanted()) {
3408 assert(mdr
->dn
[0].size());
3409 dn
= mdr
->dn
[0].back();
3414 respond_to_request(mdr
, 0);
3417 class C_MDS_openc_finish
: public ServerLogContext
{
3422 C_MDS_openc_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ni
, snapid_t f
) :
3423 ServerLogContext(s
, r
), dn(d
), newi(ni
), follows(f
) {}
3424 void finish(int r
) override
{
3427 dn
->pop_projected_linkage();
3429 // dirty inode, dn, dir
3430 newi
->inode
.version
--; // a bit hacky, see C_MDS_mknod_finish
3431 newi
->mark_dirty(newi
->inode
.version
+1, mdr
->ls
);
3432 newi
->mark_dirty_parent(mdr
->ls
, true);
3436 get_mds()->locker
->share_inode_max_size(newi
);
3438 MDRequestRef null_ref
;
3439 get_mds()->mdcache
->send_dentry_link(dn
, null_ref
);
3441 utime_t now
= ceph_clock_now();
3442 get_mds()->balancer
->hit_inode(now
, newi
, META_POP_IWR
);
3444 server
->respond_to_request(mdr
, 0);
3446 assert(g_conf
->mds_kill_openc_at
!= 1);
3450 /* This function takes responsibility for the passed mdr*/
3451 void Server::handle_client_openc(MDRequestRef
& mdr
)
3453 MClientRequest
*req
= mdr
->client_request
;
3454 client_t client
= mdr
->get_client();
3456 dout(7) << "open w/ O_CREAT on " << req
->get_filepath() << dendl
;
3458 int cmode
= ceph_flags_to_mode(req
->head
.args
.open
.flags
);
3460 respond_to_request(mdr
, -EINVAL
);
3464 bool excl
= req
->head
.args
.open
.flags
& CEPH_O_EXCL
;
3467 int r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, req
->get_filepath(),
3468 &mdr
->dn
[0], NULL
, MDS_TRAVERSE_FORWARD
);
3472 handle_client_open(mdr
);
3475 if (r
< 0 && r
!= -ENOENT
) {
3477 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
3478 MDSInternalContextBase
*c
= new C_MDS_TryFindInode(this, mdr
);
3479 mdcache
->find_ino_peers(req
->get_filepath().get_ino(), c
);
3481 dout(10) << "FAIL on error " << r
<< dendl
;
3482 respond_to_request(mdr
, r
);
3488 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
3489 file_layout_t
*dir_layout
= NULL
;
3490 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, 0, rdlocks
, wrlocks
, xlocks
,
3491 !excl
, false, false, &dir_layout
);
3493 if (mdr
->snapid
!= CEPH_NOSNAP
) {
3494 respond_to_request(mdr
, -EROFS
);
3498 file_layout_t layout
;
3500 layout
= *dir_layout
;
3502 layout
= mdcache
->default_file_layout
;
3504 // What kind of client caps are required to complete this operation
3505 uint64_t access
= MAY_WRITE
;
3507 const auto default_layout
= layout
;
3509 // fill in any special params from client
3510 if (req
->head
.args
.open
.stripe_unit
)
3511 layout
.stripe_unit
= req
->head
.args
.open
.stripe_unit
;
3512 if (req
->head
.args
.open
.stripe_count
)
3513 layout
.stripe_count
= req
->head
.args
.open
.stripe_count
;
3514 if (req
->head
.args
.open
.object_size
)
3515 layout
.object_size
= req
->head
.args
.open
.object_size
;
3516 if (req
->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID
) &&
3517 (__s32
)req
->head
.args
.open
.pool
>= 0) {
3518 layout
.pool_id
= req
->head
.args
.open
.pool
;
3520 // make sure we have as new a map as the client
3521 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
3522 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
3527 // If client doesn't have capability to modify layout pools, then
3528 // only permit this request if the requested pool matches what the
3529 // file would have inherited anyway from its parent.
3530 if (default_layout
!= layout
) {
3531 access
|= MAY_SET_VXATTR
;
3534 if (!layout
.is_valid()) {
3535 dout(10) << " invalid initial file layout" << dendl
;
3536 respond_to_request(mdr
, -EINVAL
);
3539 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
3540 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
3541 respond_to_request(mdr
, -EINVAL
);
3546 CDir
*dir
= dn
->get_dir();
3547 CInode
*diri
= dir
->get_inode();
3548 rdlocks
.insert(&diri
->authlock
);
3549 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
3552 if (!check_access(mdr
, diri
, access
))
3555 if (!check_fragment_space(mdr
, dir
))
3558 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
3560 if (!dnl
->is_null()) {
3562 assert(req
->head
.args
.open
.flags
& CEPH_O_EXCL
);
3563 dout(10) << "O_EXCL, target exists, failing with -EEXIST" << dendl
;
3564 mdr
->tracei
= dnl
->get_inode();
3566 respond_to_request(mdr
, -EEXIST
);
3571 SnapRealm
*realm
= diri
->find_snaprealm(); // use directory's realm; inode isn't attached yet.
3572 snapid_t follows
= realm
->get_newest_seq();
3574 CInode
*in
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
),
3575 req
->head
.args
.open
.mode
| S_IFREG
, &layout
);
3579 dn
->push_projected_linkage(in
);
3581 in
->inode
.version
= dn
->pre_dirty();
3582 if (layout
.pool_id
!= mdcache
->default_file_layout
.pool_id
)
3583 in
->inode
.add_old_pool(mdcache
->default_file_layout
.pool_id
);
3584 in
->inode
.update_backtrace();
3585 if (cmode
& CEPH_FILE_MODE_WR
) {
3586 in
->inode
.client_ranges
[client
].range
.first
= 0;
3587 in
->inode
.client_ranges
[client
].range
.last
= in
->inode
.get_layout_size_increment();
3588 in
->inode
.client_ranges
[client
].follows
= follows
;
3590 in
->inode
.rstat
.rfiles
= 1;
3592 assert(dn
->first
== follows
+1);
3593 in
->first
= dn
->first
;
3596 mdr
->ls
= mdlog
->get_current_segment();
3597 EUpdate
*le
= new EUpdate(mdlog
, "openc");
3598 mdlog
->start_entry(le
);
3599 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
3600 journal_allocated_inos(mdr
, &le
->metablob
);
3601 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
3602 le
->metablob
.add_primary_dentry(dn
, in
, true, true, true);
3605 mds
->locker
->issue_new_caps(in
, cmode
, mdr
->session
, realm
, req
->is_replay());
3606 in
->authlock
.set_state(LOCK_EXCL
);
3607 in
->xattrlock
.set_state(LOCK_EXCL
);
3609 // make sure this inode gets into the journal
3610 le
->metablob
.add_opened_ino(in
->ino());
3611 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
3612 ls
->open_files
.push_back(&in
->item_open_file
);
3614 C_MDS_openc_finish
*fin
= new C_MDS_openc_finish(this, mdr
, dn
, in
, follows
);
3616 if (mdr
->client_request
->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE
)) {
3617 dout(10) << "adding ino to reply to indicate inode was created" << dendl
;
3618 // add the file created flag onto the reply if create_flags features is supported
3619 ::encode(in
->inode
.ino
, mdr
->reply_extra_bl
);
3622 journal_and_reply(mdr
, in
, dn
, le
, fin
);
3624 // We hit_dir (via hit_inode) in our finish callback, but by then we might
3625 // have overshot the split size (multiple opencs in flight), so here is
3626 // an early chance to split the dir if this openc makes it oversized.
3627 mds
->balancer
->maybe_fragment(dir
, false);
3632 void Server::handle_client_readdir(MDRequestRef
& mdr
)
3634 MClientRequest
*req
= mdr
->client_request
;
3635 client_t client
= req
->get_source().num();
3636 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
3637 CInode
*diri
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, false, true);
3640 // it's a directory, right?
3641 if (!diri
->is_dir()) {
3643 dout(10) << "reply to " << *req
<< " readdir -ENOTDIR" << dendl
;
3644 respond_to_request(mdr
, -ENOTDIR
);
3648 rdlocks
.insert(&diri
->filelock
);
3649 rdlocks
.insert(&diri
->dirfragtreelock
);
3651 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
3654 if (!check_access(mdr
, diri
, MAY_READ
))
3658 frag_t fg
= (__u32
)req
->head
.args
.readdir
.frag
;
3659 unsigned req_flags
= (__u32
)req
->head
.args
.readdir
.flags
;
3660 string offset_str
= req
->get_path2();
3662 __u32 offset_hash
= 0;
3663 if (!offset_str
.empty())
3664 offset_hash
= ceph_frag_value(diri
->hash_dentry_name(offset_str
));
3666 offset_hash
= (__u32
)req
->head
.args
.readdir
.offset_hash
;
3668 dout(10) << " frag " << fg
<< " offset '" << offset_str
<< "'"
3669 << " offset_hash " << offset_hash
<< " flags " << req_flags
<< dendl
;
3671 // does the frag exist?
3672 if (diri
->dirfragtree
[fg
.value()] != fg
) {
3674 if (req_flags
& CEPH_READDIR_REPLY_BITFLAGS
) {
3675 if (fg
.contains((unsigned)offset_hash
)) {
3676 newfg
= diri
->dirfragtree
[offset_hash
];
3678 // client actually wants next frag
3679 newfg
= diri
->dirfragtree
[fg
.value()];
3683 newfg
= diri
->dirfragtree
[fg
.value()];
3685 dout(10) << " adjust frag " << fg
<< " -> " << newfg
<< " " << diri
->dirfragtree
<< dendl
;
3689 CDir
*dir
= try_open_auth_dirfrag(diri
, fg
, mdr
);
3693 dout(10) << "handle_client_readdir on " << *dir
<< dendl
;
3694 assert(dir
->is_auth());
3696 if (!dir
->is_complete()) {
3697 if (dir
->is_frozen()) {
3698 dout(7) << "dir is frozen " << *dir
<< dendl
;
3699 mds
->locker
->drop_locks(mdr
.get());
3700 mdr
->drop_local_auth_pins();
3701 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
3705 dout(10) << " incomplete dir contents for readdir on " << *dir
<< ", fetching" << dendl
;
3706 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
), true);
3710 #ifdef MDS_VERIFY_FRAGSTAT
3711 dir
->verify_fragstat();
3714 utime_t now
= ceph_clock_now();
3715 mdr
->set_mds_stamp(now
);
3717 snapid_t snapid
= mdr
->snapid
;
3718 dout(10) << "snapid " << snapid
<< dendl
;
3720 SnapRealm
*realm
= diri
->find_snaprealm();
3722 unsigned max
= req
->head
.args
.readdir
.max_entries
;
3724 max
= dir
->get_num_any(); // whatever, something big.
3725 unsigned max_bytes
= req
->head
.args
.readdir
.max_bytes
;
3727 // make sure at least one item can be encoded
3728 max_bytes
= (512 << 10) + g_conf
->mds_max_xattr_pairs_size
;
3732 dir
->encode_dirstat(dirbl
, mds
->get_nodeid());
3734 // count bytes available.
3735 // this isn't perfect, but we should capture the main variable/unbounded size items!
3736 int front_bytes
= dirbl
.length() + sizeof(__u32
) + sizeof(__u8
)*2;
3737 int bytes_left
= max_bytes
- front_bytes
;
3738 bytes_left
-= realm
->get_snap_trace().length();
3740 // build dir contents
3743 bool start
= !offset_hash
&& offset_str
.empty();
3744 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
3745 dentry_key_t
skip_key(snapid
, offset_str
.c_str(), offset_hash
);
3746 auto it
= start
? dir
->begin() : dir
->lower_bound(skip_key
);
3747 bool end
= (it
== dir
->end());
3748 for (; !end
&& numfiles
< max
; end
= (it
== dir
->end())) {
3749 CDentry
*dn
= it
->second
;
3752 if (dn
->state_test(CDentry::STATE_PURGING
))
3755 bool dnp
= dn
->use_projected(client
, mdr
);
3756 CDentry::linkage_t
*dnl
= dnp
? dn
->get_projected_linkage() : dn
->get_linkage();
3761 if (dn
->last
< snapid
|| dn
->first
> snapid
) {
3762 dout(20) << "skipping non-overlapping snap " << *dn
<< dendl
;
3767 dentry_key_t
offset_key(dn
->last
, offset_str
.c_str(), offset_hash
);
3768 if (!(offset_key
< dn
->key()))
3772 CInode
*in
= dnl
->get_inode();
3774 if (in
&& in
->ino() == CEPH_INO_CEPH
)
3778 // better for the MDS to do the work, if we think the client will stat any of these files.
3779 if (dnl
->is_remote() && !in
) {
3780 in
= mdcache
->get_inode(dnl
->get_remote_ino());
3782 dn
->link_remote(dnl
, in
);
3783 } else if (dn
->state_test(CDentry::STATE_BADREMOTEINO
)) {
3784 dout(10) << "skipping bad remote ino on " << *dn
<< dendl
;
3787 // touch everything i _do_ have
3788 for (auto &p
: *dir
) {
3789 if (!p
.second
->get_linkage()->is_null())
3790 mdcache
->lru
.lru_touch(p
.second
);
3793 // already issued caps and leases, reply immediately.
3794 if (dnbl
.length() > 0) {
3795 mdcache
->open_remote_dentry(dn
, dnp
, new C_MDSInternalNoop
);
3796 dout(10) << " open remote dentry after caps were issued, stopping at "
3797 << dnbl
.length() << " < " << bytes_left
<< dendl
;
3801 mds
->locker
->drop_locks(mdr
.get());
3802 mdr
->drop_local_auth_pins();
3803 mdcache
->open_remote_dentry(dn
, dnp
, new C_MDS_RetryRequest(mdcache
, mdr
));
3809 if ((int)(dnbl
.length() + dn
->get_name().length() + sizeof(__u32
) + sizeof(LeaseStat
)) > bytes_left
) {
3810 dout(10) << " ran out of room, stopping at " << dnbl
.length() << " < " << bytes_left
<< dendl
;
3814 unsigned start_len
= dnbl
.length();
3817 dout(12) << "including dn " << *dn
<< dendl
;
3818 ::encode(dn
->get_name(), dnbl
);
3819 mds
->locker
->issue_client_lease(dn
, client
, dnbl
, now
, mdr
->session
);
3822 dout(12) << "including inode " << *in
<< dendl
;
3823 int r
= in
->encode_inodestat(dnbl
, mdr
->session
, realm
, snapid
, bytes_left
- (int)dnbl
.length());
3825 // chop off dn->name, lease
3826 dout(10) << " ran out of room, stopping at " << start_len
<< " < " << bytes_left
<< dendl
;
3828 keep
.substr_of(dnbl
, 0, start_len
);
3836 mdcache
->lru
.lru_touch(dn
);
3841 flags
= CEPH_READDIR_FRAG_END
;
3843 flags
|= CEPH_READDIR_FRAG_COMPLETE
; // FIXME: what purpose does this serve
3845 // client only understand END and COMPLETE flags ?
3846 if (req_flags
& CEPH_READDIR_REPLY_BITFLAGS
) {
3847 flags
|= CEPH_READDIR_HASH_ORDER
| CEPH_READDIR_OFFSET_HASH
;
3850 // finish final blob
3851 ::encode(numfiles
, dirbl
);
3852 ::encode(flags
, dirbl
);
3853 dirbl
.claim_append(dnbl
);
3856 dout(10) << "reply to " << *req
<< " readdir num=" << numfiles
3857 << " bytes=" << dirbl
.length()
3858 << " start=" << (int)start
3859 << " end=" << (int)end
3861 mdr
->reply_extra_bl
= dirbl
;
3863 // bump popularity. NOTE: this doesn't quite capture it.
3864 mds
->balancer
->hit_dir(now
, dir
, META_POP_IRD
, -1, numfiles
);
3868 respond_to_request(mdr
, 0);
3873 // ===============================================================================
3878 * finisher for basic inode updates
3880 class C_MDS_inode_update_finish
: public ServerLogContext
{
3882 bool truncating_smaller
, changed_ranges
;
3884 C_MDS_inode_update_finish(Server
*s
, MDRequestRef
& r
, CInode
*i
,
3885 bool sm
=false, bool cr
=false) :
3886 ServerLogContext(s
, r
), in(i
), truncating_smaller(sm
), changed_ranges(cr
) { }
3887 void finish(int r
) override
{
3891 in
->pop_and_dirty_projected_inode(mdr
->ls
);
3894 // notify any clients
3895 if (truncating_smaller
&& in
->inode
.is_truncating()) {
3896 get_mds()->locker
->issue_truncate(in
);
3897 get_mds()->mdcache
->truncate_inode(in
, mdr
->ls
);
3900 utime_t now
= ceph_clock_now();
3901 get_mds()->balancer
->hit_inode(now
, in
, META_POP_IWR
);
3903 server
->respond_to_request(mdr
, 0);
3906 get_mds()->locker
->share_inode_max_size(in
);
3910 void Server::handle_client_file_setlock(MDRequestRef
& mdr
)
3912 MClientRequest
*req
= mdr
->client_request
;
3913 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
3915 // get the inode to operate on, and set up any locks needed for that
3916 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
3920 xlocks
.insert(&cur
->flocklock
);
3921 /* acquire_locks will return true if it gets the locks. If it fails,
3922 it will redeliver this request at a later date, so drop the request.
3924 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
)) {
3925 dout(10) << "handle_client_file_setlock could not get locks!" << dendl
;
3929 // copy the lock change into a ceph_filelock so we can store/apply it
3930 ceph_filelock set_lock
;
3931 set_lock
.start
= req
->head
.args
.filelock_change
.start
;
3932 set_lock
.length
= req
->head
.args
.filelock_change
.length
;
3933 set_lock
.client
= req
->get_orig_source().num();
3934 set_lock
.owner
= req
->head
.args
.filelock_change
.owner
;
3935 set_lock
.pid
= req
->head
.args
.filelock_change
.pid
;
3936 set_lock
.type
= req
->head
.args
.filelock_change
.type
;
3937 bool will_wait
= req
->head
.args
.filelock_change
.wait
;
3939 dout(10) << "handle_client_file_setlock: " << set_lock
<< dendl
;
3941 ceph_lock_state_t
*lock_state
= NULL
;
3942 bool interrupt
= false;
3944 // get the appropriate lock state
3945 switch (req
->head
.args
.filelock_change
.rule
) {
3946 case CEPH_LOCK_FLOCK_INTR
:
3949 case CEPH_LOCK_FLOCK
:
3950 lock_state
= cur
->get_flock_lock_state();
3953 case CEPH_LOCK_FCNTL_INTR
:
3956 case CEPH_LOCK_FCNTL
:
3957 lock_state
= cur
->get_fcntl_lock_state();
3961 dout(10) << "got unknown lock type " << set_lock
.type
3962 << ", dropping request!" << dendl
;
3963 respond_to_request(mdr
, -EOPNOTSUPP
);
3967 dout(10) << " state prior to lock change: " << *lock_state
<< dendl
;
3968 if (CEPH_LOCK_UNLOCK
== set_lock
.type
) {
3969 list
<ceph_filelock
> activated_locks
;
3970 list
<MDSInternalContextBase
*> waiters
;
3971 if (lock_state
->is_waiting(set_lock
)) {
3972 dout(10) << " unlock removing waiting lock " << set_lock
<< dendl
;
3973 lock_state
->remove_waiting(set_lock
);
3974 cur
->take_waiting(CInode::WAIT_FLOCK
, waiters
);
3975 } else if (!interrupt
) {
3976 dout(10) << " unlock attempt on " << set_lock
<< dendl
;
3977 lock_state
->remove_lock(set_lock
, activated_locks
);
3978 cur
->take_waiting(CInode::WAIT_FLOCK
, waiters
);
3980 mds
->queue_waiters(waiters
);
3982 respond_to_request(mdr
, 0);
3984 dout(10) << " lock attempt on " << set_lock
<< dendl
;
3985 bool deadlock
= false;
3986 if (mdr
->more()->flock_was_waiting
&&
3987 !lock_state
->is_waiting(set_lock
)) {
3988 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock
<< dendl
;
3989 respond_to_request(mdr
, -EINTR
);
3990 } else if (!lock_state
->add_lock(set_lock
, will_wait
, mdr
->more()->flock_was_waiting
, &deadlock
)) {
3991 dout(10) << " it failed on this attempt" << dendl
;
3992 // couldn't set lock right now
3994 respond_to_request(mdr
, -EDEADLK
);
3995 } else if (!will_wait
) {
3996 respond_to_request(mdr
, -EWOULDBLOCK
);
3998 dout(10) << " added to waiting list" << dendl
;
3999 assert(lock_state
->is_waiting(set_lock
));
4000 mdr
->more()->flock_was_waiting
= true;
4001 mds
->locker
->drop_locks(mdr
.get());
4002 mdr
->drop_local_auth_pins();
4003 cur
->add_waiter(CInode::WAIT_FLOCK
, new C_MDS_RetryRequest(mdcache
, mdr
));
4006 respond_to_request(mdr
, 0);
4008 dout(10) << " state after lock change: " << *lock_state
<< dendl
;
4011 void Server::handle_client_file_readlock(MDRequestRef
& mdr
)
4013 MClientRequest
*req
= mdr
->client_request
;
4014 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
4016 // get the inode to operate on, and set up any locks needed for that
4017 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
4021 /* acquire_locks will return true if it gets the locks. If it fails,
4022 it will redeliver this request at a later date, so drop the request.
4024 rdlocks
.insert(&cur
->flocklock
);
4025 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
)) {
4026 dout(10) << "handle_client_file_readlock could not get locks!" << dendl
;
4030 // copy the lock change into a ceph_filelock so we can store/apply it
4031 ceph_filelock checking_lock
;
4032 checking_lock
.start
= req
->head
.args
.filelock_change
.start
;
4033 checking_lock
.length
= req
->head
.args
.filelock_change
.length
;
4034 checking_lock
.client
= req
->get_orig_source().num();
4035 checking_lock
.owner
= req
->head
.args
.filelock_change
.owner
;
4036 checking_lock
.pid
= req
->head
.args
.filelock_change
.pid
;
4037 checking_lock
.type
= req
->head
.args
.filelock_change
.type
;
4039 // get the appropriate lock state
4040 ceph_lock_state_t
*lock_state
= NULL
;
4041 switch (req
->head
.args
.filelock_change
.rule
) {
4042 case CEPH_LOCK_FLOCK
:
4043 lock_state
= cur
->get_flock_lock_state();
4046 case CEPH_LOCK_FCNTL
:
4047 lock_state
= cur
->get_fcntl_lock_state();
4051 dout(10) << "got unknown lock type " << checking_lock
.type
<< dendl
;
4052 respond_to_request(mdr
, -EINVAL
);
4055 lock_state
->look_for_lock(checking_lock
);
4058 ::encode(checking_lock
, lock_bl
);
4060 mdr
->reply_extra_bl
= lock_bl
;
4061 respond_to_request(mdr
, 0);
4064 void Server::handle_client_setattr(MDRequestRef
& mdr
)
4066 MClientRequest
*req
= mdr
->client_request
;
4067 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
4068 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
4071 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4072 respond_to_request(mdr
, -EROFS
);
4075 if (cur
->ino() < MDS_INO_SYSTEM_BASE
&& !cur
->is_base()) {
4076 respond_to_request(mdr
, -EPERM
);
4080 __u32 mask
= req
->head
.args
.setattr
.mask
;
4081 __u32 access_mask
= MAY_WRITE
;
4084 if (mask
& (CEPH_SETATTR_MODE
|CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_BTIME
|CEPH_SETATTR_KILL_SGUID
))
4085 xlocks
.insert(&cur
->authlock
);
4086 if (mask
& (CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
|CEPH_SETATTR_SIZE
))
4087 xlocks
.insert(&cur
->filelock
);
4088 if (mask
& CEPH_SETATTR_CTIME
)
4089 wrlocks
.insert(&cur
->versionlock
);
4091 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4094 if ((mask
& CEPH_SETATTR_UID
) && (cur
->inode
.uid
!= req
->head
.args
.setattr
.uid
))
4095 access_mask
|= MAY_CHOWN
;
4097 if ((mask
& CEPH_SETATTR_GID
) && (cur
->inode
.gid
!= req
->head
.args
.setattr
.gid
))
4098 access_mask
|= MAY_CHGRP
;
4100 if (!check_access(mdr
, cur
, access_mask
))
4103 // trunc from bigger -> smaller?
4104 auto pip
= cur
->get_projected_inode();
4106 uint64_t old_size
= std::max
<uint64_t>(pip
->size
, req
->head
.args
.setattr
.old_size
);
4108 // ENOSPC on growing file while full, but allow shrinks
4109 if (is_full
&& req
->head
.args
.setattr
.size
> old_size
) {
4110 dout(20) << __func__
<< ": full, responding ENOSPC to setattr with larger size" << dendl
;
4111 respond_to_request(mdr
, -ENOSPC
);
4115 bool truncating_smaller
= false;
4116 if (mask
& CEPH_SETATTR_SIZE
) {
4117 truncating_smaller
= req
->head
.args
.setattr
.size
< old_size
;
4118 if (truncating_smaller
&& pip
->is_truncating()) {
4119 dout(10) << " waiting for pending truncate from " << pip
->truncate_from
4120 << " to " << pip
->truncate_size
<< " to complete on " << *cur
<< dendl
;
4121 mds
->locker
->drop_locks(mdr
.get());
4122 mdr
->drop_local_auth_pins();
4123 cur
->add_waiter(CInode::WAIT_TRUNC
, new C_MDS_RetryRequest(mdcache
, mdr
));
4128 bool changed_ranges
= false;
4131 mdr
->ls
= mdlog
->get_current_segment();
4132 EUpdate
*le
= new EUpdate(mdlog
, "setattr");
4133 mdlog
->start_entry(le
);
4135 auto &pi
= cur
->project_inode();
4137 if (mask
& CEPH_SETATTR_UID
)
4138 pi
.inode
.uid
= req
->head
.args
.setattr
.uid
;
4139 if (mask
& CEPH_SETATTR_GID
)
4140 pi
.inode
.gid
= req
->head
.args
.setattr
.gid
;
4142 if (mask
& CEPH_SETATTR_MODE
)
4143 pi
.inode
.mode
= (pi
.inode
.mode
& ~07777) | (req
->head
.args
.setattr
.mode
& 07777);
4144 else if ((mask
& (CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_KILL_SGUID
)) &&
4145 S_ISREG(pi
.inode
.mode
) &&
4146 (pi
.inode
.mode
& (S_IXUSR
|S_IXGRP
|S_IXOTH
))) {
4147 pi
.inode
.mode
&= ~(S_ISUID
|S_ISGID
);
4150 if (mask
& CEPH_SETATTR_MTIME
)
4151 pi
.inode
.mtime
= req
->head
.args
.setattr
.mtime
;
4152 if (mask
& CEPH_SETATTR_ATIME
)
4153 pi
.inode
.atime
= req
->head
.args
.setattr
.atime
;
4154 if (mask
& CEPH_SETATTR_BTIME
)
4155 pi
.inode
.btime
= req
->head
.args
.setattr
.btime
;
4156 if (mask
& (CEPH_SETATTR_ATIME
| CEPH_SETATTR_MTIME
| CEPH_SETATTR_BTIME
))
4157 pi
.inode
.time_warp_seq
++; // maybe not a timewarp, but still a serialization point.
4158 if (mask
& CEPH_SETATTR_SIZE
) {
4159 if (truncating_smaller
) {
4160 pi
.inode
.truncate(old_size
, req
->head
.args
.setattr
.size
);
4161 le
->metablob
.add_truncate_start(cur
->ino());
4163 pi
.inode
.size
= req
->head
.args
.setattr
.size
;
4164 pi
.inode
.rstat
.rbytes
= pi
.inode
.size
;
4166 pi
.inode
.mtime
= mdr
->get_op_stamp();
4168 // adjust client's max_size?
4169 CInode::mempool_inode::client_range_map new_ranges
;
4170 bool max_increased
= false;
4171 mds
->locker
->calc_new_client_ranges(cur
, pi
.inode
.size
, &new_ranges
, &max_increased
);
4172 if (pi
.inode
.client_ranges
!= new_ranges
) {
4173 dout(10) << " client_ranges " << pi
.inode
.client_ranges
<< " -> " << new_ranges
<< dendl
;
4174 pi
.inode
.client_ranges
= new_ranges
;
4175 changed_ranges
= true;
4179 pi
.inode
.version
= cur
->pre_dirty();
4180 pi
.inode
.ctime
= pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
4181 pi
.inode
.change_attr
++;
4184 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4185 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4186 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4188 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
,
4189 truncating_smaller
, changed_ranges
));
4191 // flush immediately if there are readers/writers waiting
4192 if (xlocks
.count(&cur
->filelock
) &&
4193 (cur
->get_caps_wanted() & (CEPH_CAP_FILE_RD
|CEPH_CAP_FILE_WR
)))
4194 mds
->mdlog
->flush();
4197 /* Takes responsibility for mdr */
4198 void Server::do_open_truncate(MDRequestRef
& mdr
, int cmode
)
4200 CInode
*in
= mdr
->in
[0];
4201 client_t client
= mdr
->get_client();
4204 dout(10) << "do_open_truncate " << *in
<< dendl
;
4206 SnapRealm
*realm
= in
->find_snaprealm();
4207 mds
->locker
->issue_new_caps(in
, cmode
, mdr
->session
, realm
, mdr
->client_request
->is_replay());
4209 mdr
->ls
= mdlog
->get_current_segment();
4210 EUpdate
*le
= new EUpdate(mdlog
, "open_truncate");
4211 mdlog
->start_entry(le
);
4214 auto &pi
= in
->project_inode();
4215 pi
.inode
.version
= in
->pre_dirty();
4216 pi
.inode
.mtime
= pi
.inode
.ctime
= pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
4217 pi
.inode
.change_attr
++;
4219 uint64_t old_size
= std::max
<uint64_t>(pi
.inode
.size
, mdr
->client_request
->head
.args
.open
.old_size
);
4221 pi
.inode
.truncate(old_size
, 0);
4222 le
->metablob
.add_truncate_start(in
->ino());
4225 bool changed_ranges
= false;
4226 if (cmode
& CEPH_FILE_MODE_WR
) {
4227 pi
.inode
.client_ranges
[client
].range
.first
= 0;
4228 pi
.inode
.client_ranges
[client
].range
.last
= pi
.inode
.get_layout_size_increment();
4229 pi
.inode
.client_ranges
[client
].follows
= in
->find_snaprealm()->get_newest_seq();
4230 changed_ranges
= true;
4233 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
4235 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
4236 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, in
);
4238 // make sure ino gets into the journal
4239 le
->metablob
.add_opened_ino(in
->ino());
4240 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
4241 ls
->open_files
.push_back(&in
->item_open_file
);
4243 mdr
->o_trunc
= true;
4246 if (mdr
->client_request
->get_dentry_wanted()) {
4247 assert(mdr
->dn
[0].size());
4248 dn
= mdr
->dn
[0].back();
4251 journal_and_reply(mdr
, in
, dn
, le
, new C_MDS_inode_update_finish(this, mdr
, in
, old_size
> 0,
4253 // Although the `open` part can give an early reply, the truncation won't
4254 // happen until our EUpdate is persistent, to give the client a prompt
4255 // response we must also flush that event.
4260 /* This function cleans up the passed mdr */
4261 void Server::handle_client_setlayout(MDRequestRef
& mdr
)
4263 MClientRequest
*req
= mdr
->client_request
;
4264 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
4265 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
4268 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4269 respond_to_request(mdr
, -EROFS
);
4272 if (!cur
->is_file()) {
4273 respond_to_request(mdr
, -EINVAL
);
4276 if (cur
->get_projected_inode()->size
||
4277 cur
->get_projected_inode()->truncate_seq
> 1) {
4278 respond_to_request(mdr
, -ENOTEMPTY
);
4283 file_layout_t layout
= cur
->get_projected_inode()->layout
;
4284 // save existing layout for later
4285 const auto old_layout
= layout
;
4287 int access
= MAY_WRITE
;
4289 if (req
->head
.args
.setlayout
.layout
.fl_object_size
> 0)
4290 layout
.object_size
= req
->head
.args
.setlayout
.layout
.fl_object_size
;
4291 if (req
->head
.args
.setlayout
.layout
.fl_stripe_unit
> 0)
4292 layout
.stripe_unit
= req
->head
.args
.setlayout
.layout
.fl_stripe_unit
;
4293 if (req
->head
.args
.setlayout
.layout
.fl_stripe_count
> 0)
4294 layout
.stripe_count
=req
->head
.args
.setlayout
.layout
.fl_stripe_count
;
4295 if (req
->head
.args
.setlayout
.layout
.fl_pg_pool
> 0) {
4296 layout
.pool_id
= req
->head
.args
.setlayout
.layout
.fl_pg_pool
;
4298 // make sure we have as new a map as the client
4299 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
4300 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
4305 // Don't permit layout modifications without 'p' caps
4306 if (layout
!= old_layout
) {
4307 access
|= MAY_SET_VXATTR
;
4310 if (!layout
.is_valid()) {
4311 dout(10) << "bad layout" << dendl
;
4312 respond_to_request(mdr
, -EINVAL
);
4315 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
4316 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
4317 respond_to_request(mdr
, -EINVAL
);
4321 xlocks
.insert(&cur
->filelock
);
4322 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4325 if (!check_access(mdr
, cur
, access
))
4329 auto &pi
= cur
->project_inode();
4330 pi
.inode
.layout
= layout
;
4331 // add the old pool to the inode
4332 pi
.inode
.add_old_pool(old_layout
.pool_id
);
4333 pi
.inode
.version
= cur
->pre_dirty();
4334 pi
.inode
.ctime
= pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
4335 pi
.inode
.change_attr
++;
4338 mdr
->ls
= mdlog
->get_current_segment();
4339 EUpdate
*le
= new EUpdate(mdlog
, "setlayout");
4340 mdlog
->start_entry(le
);
4341 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4342 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4343 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4345 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
4348 void Server::handle_client_setdirlayout(MDRequestRef
& mdr
)
4350 MClientRequest
*req
= mdr
->client_request
;
4351 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
4352 file_layout_t
*dir_layout
= NULL
;
4353 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true, false, &dir_layout
);
4356 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4357 respond_to_request(mdr
, -EROFS
);
4361 if (!cur
->is_dir()) {
4362 respond_to_request(mdr
, -ENOTDIR
);
4366 xlocks
.insert(&cur
->policylock
);
4367 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4371 const auto old_pi
= cur
->get_projected_inode();
4372 file_layout_t layout
;
4373 if (old_pi
->has_layout())
4374 layout
= old_pi
->layout
;
4375 else if (dir_layout
)
4376 layout
= *dir_layout
;
4378 layout
= mdcache
->default_file_layout
;
4380 // Level of access required to complete
4381 int access
= MAY_WRITE
;
4383 const auto old_layout
= layout
;
4385 if (req
->head
.args
.setlayout
.layout
.fl_object_size
> 0)
4386 layout
.object_size
= req
->head
.args
.setlayout
.layout
.fl_object_size
;
4387 if (req
->head
.args
.setlayout
.layout
.fl_stripe_unit
> 0)
4388 layout
.stripe_unit
= req
->head
.args
.setlayout
.layout
.fl_stripe_unit
;
4389 if (req
->head
.args
.setlayout
.layout
.fl_stripe_count
> 0)
4390 layout
.stripe_count
=req
->head
.args
.setlayout
.layout
.fl_stripe_count
;
4391 if (req
->head
.args
.setlayout
.layout
.fl_pg_pool
> 0) {
4392 layout
.pool_id
= req
->head
.args
.setlayout
.layout
.fl_pg_pool
;
4393 // make sure we have as new a map as the client
4394 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
4395 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
4400 if (layout
!= old_layout
) {
4401 access
|= MAY_SET_VXATTR
;
4404 if (!layout
.is_valid()) {
4405 dout(10) << "bad layout" << dendl
;
4406 respond_to_request(mdr
, -EINVAL
);
4409 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
4410 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
4411 respond_to_request(mdr
, -EINVAL
);
4415 if (!check_access(mdr
, cur
, access
))
4418 auto &pi
= cur
->project_inode();
4419 pi
.inode
.layout
= layout
;
4420 pi
.inode
.version
= cur
->pre_dirty();
4423 mdr
->ls
= mdlog
->get_current_segment();
4424 EUpdate
*le
= new EUpdate(mdlog
, "setlayout");
4425 mdlog
->start_entry(le
);
4426 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4427 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4428 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4430 mdr
->no_early_reply
= true;
4431 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
4436 int Server::parse_layout_vxattr(string name
, string value
, const OSDMap
& osdmap
,
4437 file_layout_t
*layout
, bool validate
)
4439 dout(20) << "parse_layout_vxattr name " << name
<< " value '" << value
<< "'" << dendl
;
4441 if (name
== "layout") {
4442 string::iterator begin
= value
.begin();
4443 string::iterator end
= value
.end();
4444 keys_and_values
<string::iterator
> p
; // create instance of parser
4445 std::map
<string
, string
> m
; // map to receive results
4446 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
4449 string
left(begin
, end
);
4450 dout(10) << " parsed " << m
<< " left '" << left
<< "'" << dendl
;
4453 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
4454 // Skip validation on each attr, we do it once at the end (avoid
4455 // rejecting intermediate states if the overall result is ok)
4456 int r
= parse_layout_vxattr(string("layout.") + q
->first
, q
->second
,
4457 osdmap
, layout
, false);
4461 } else if (name
== "layout.object_size") {
4462 layout
->object_size
= boost::lexical_cast
<unsigned>(value
);
4463 } else if (name
== "layout.stripe_unit") {
4464 layout
->stripe_unit
= boost::lexical_cast
<unsigned>(value
);
4465 } else if (name
== "layout.stripe_count") {
4466 layout
->stripe_count
= boost::lexical_cast
<unsigned>(value
);
4467 } else if (name
== "layout.pool") {
4469 layout
->pool_id
= boost::lexical_cast
<unsigned>(value
);
4470 } catch (boost::bad_lexical_cast
const&) {
4471 int64_t pool
= osdmap
.lookup_pg_pool_name(value
);
4473 dout(10) << " unknown pool " << value
<< dendl
;
4476 layout
->pool_id
= pool
;
4478 } else if (name
== "layout.pool_namespace") {
4479 layout
->pool_ns
= value
;
4481 dout(10) << " unknown layout vxattr " << name
<< dendl
;
4484 } catch (boost::bad_lexical_cast
const&) {
4485 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
4489 if (validate
&& !layout
->is_valid()) {
4490 dout(10) << "bad layout" << dendl
;
4493 if (!mds
->mdsmap
->is_data_pool(layout
->pool_id
)) {
4494 dout(10) << " invalid data pool " << layout
->pool_id
<< dendl
;
4500 int Server::parse_quota_vxattr(string name
, string value
, quota_info_t
*quota
)
4502 dout(20) << "parse_quota_vxattr name " << name
<< " value '" << value
<< "'" << dendl
;
4504 if (name
== "quota") {
4505 string::iterator begin
= value
.begin();
4506 string::iterator end
= value
.end();
4507 keys_and_values
<string::iterator
> p
; // create instance of parser
4508 std::map
<string
, string
> m
; // map to receive results
4509 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
4512 string
left(begin
, end
);
4513 dout(10) << " parsed " << m
<< " left '" << left
<< "'" << dendl
;
4516 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
4517 int r
= parse_quota_vxattr(string("quota.") + q
->first
, q
->second
, quota
);
4521 } else if (name
== "quota.max_bytes") {
4522 int64_t q
= boost::lexical_cast
<int64_t>(value
);
4525 quota
->max_bytes
= q
;
4526 } else if (name
== "quota.max_files") {
4527 int64_t q
= boost::lexical_cast
<int64_t>(value
);
4530 quota
->max_files
= q
;
4532 dout(10) << " unknown quota vxattr " << name
<< dendl
;
4535 } catch (boost::bad_lexical_cast
const&) {
4536 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
4540 if (!quota
->is_valid()) {
4541 dout(10) << "bad quota" << dendl
;
4548 * Verify that the file layout attribute carried by client
4549 * is well-formatted.
4550 * Return 0 on success, otherwise this function takes
4551 * responsibility for the passed mdr.
4553 int Server::check_layout_vxattr(MDRequestRef
& mdr
,
4556 file_layout_t
*layout
)
4558 MClientRequest
*req
= mdr
->client_request
;
4562 mds
->objecter
->with_osdmap([&](const OSDMap
& osdmap
) {
4563 r
= parse_layout_vxattr(name
, value
, osdmap
, layout
);
4564 epoch
= osdmap
.get_epoch();
4569 // we don't have the specified pool, make sure our map
4570 // is newer than or as new as the client.
4571 epoch_t req_epoch
= req
->get_osdmap_epoch();
4573 if (req_epoch
> epoch
) {
4575 // well, our map is older. consult mds.
4576 Context
*fin
= new C_IO_Wrapper(mds
, new C_MDS_RetryRequest(mdcache
, mdr
));
4578 if (!mds
->objecter
->wait_for_map(req_epoch
, fin
))
4579 return r
; // wait, fin will retry this request later
4583 // now we have at least as new a map as the client, try again.
4584 mds
->objecter
->with_osdmap([&](const OSDMap
& osdmap
) {
4585 r
= parse_layout_vxattr(name
, value
, osdmap
, layout
);
4586 epoch
= osdmap
.get_epoch();
4589 assert(epoch
>= req_epoch
); // otherwise wait_for_map() told a lie
4591 } else if (req_epoch
== 0 && !mdr
->waited_for_osdmap
) {
4593 // For compatibility with client w/ old code, we still need get the
4594 // latest map. One day if COMPACT_VERSION of MClientRequest >=3,
4595 // we can remove those code.
4596 mdr
->waited_for_osdmap
= true;
4597 mds
->objecter
->wait_for_latest_osdmap(new C_IO_Wrapper(
4598 mds
, new C_MDS_RetryRequest(mdcache
, mdr
)));
4608 respond_to_request(mdr
, r
);
4616 void Server::handle_set_vxattr(MDRequestRef
& mdr
, CInode
*cur
,
4617 file_layout_t
*dir_layout
,
4618 set
<SimpleLock
*> rdlocks
,
4619 set
<SimpleLock
*> wrlocks
,
4620 set
<SimpleLock
*> xlocks
)
4622 MClientRequest
*req
= mdr
->client_request
;
4623 string
name(req
->get_path2());
4624 bufferlist bl
= req
->get_data();
4625 string
value (bl
.c_str(), bl
.length());
4626 dout(10) << "handle_set_vxattr " << name
4627 << " val " << value
.length()
4628 << " bytes on " << *cur
4631 CInode::mempool_inode
*pip
= nullptr;
4634 if (!check_access(mdr
, cur
, MAY_SET_VXATTR
)) {
4638 if (name
.compare(0, 15, "ceph.dir.layout") == 0) {
4639 if (!cur
->is_dir()) {
4640 respond_to_request(mdr
, -EINVAL
);
4644 file_layout_t layout
;
4645 if (cur
->get_projected_inode()->has_layout())
4646 layout
= cur
->get_projected_inode()->layout
;
4647 else if (dir_layout
)
4648 layout
= *dir_layout
;
4650 layout
= mdcache
->default_file_layout
;
4652 rest
= name
.substr(name
.find("layout"));
4653 if (check_layout_vxattr(mdr
, rest
, value
, &layout
) < 0)
4656 xlocks
.insert(&cur
->policylock
);
4657 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4660 auto &pi
= cur
->project_inode();
4661 pi
.inode
.layout
= layout
;
4662 mdr
->no_early_reply
= true;
4664 } else if (name
.compare(0, 16, "ceph.file.layout") == 0) {
4665 if (!cur
->is_file()) {
4666 respond_to_request(mdr
, -EINVAL
);
4669 if (cur
->get_projected_inode()->size
||
4670 cur
->get_projected_inode()->truncate_seq
> 1) {
4671 respond_to_request(mdr
, -ENOTEMPTY
);
4674 file_layout_t layout
= cur
->get_projected_inode()->layout
;
4675 rest
= name
.substr(name
.find("layout"));
4676 if (check_layout_vxattr(mdr
, rest
, value
, &layout
) < 0)
4679 xlocks
.insert(&cur
->filelock
);
4680 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4683 auto &pi
= cur
->project_inode();
4684 int64_t old_pool
= pi
.inode
.layout
.pool_id
;
4685 pi
.inode
.add_old_pool(old_pool
);
4686 pi
.inode
.layout
= layout
;
4688 } else if (name
.compare(0, 10, "ceph.quota") == 0) {
4689 if (!cur
->is_dir() || cur
->is_root()) {
4690 respond_to_request(mdr
, -EINVAL
);
4694 quota_info_t quota
= cur
->get_projected_inode()->quota
;
4696 rest
= name
.substr(name
.find("quota"));
4697 int r
= parse_quota_vxattr(rest
, value
, "a
);
4699 respond_to_request(mdr
, r
);
4703 xlocks
.insert(&cur
->policylock
);
4704 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4707 auto &pi
= cur
->project_inode();
4708 pi
.inode
.quota
= quota
;
4710 mdr
->no_early_reply
= true;
4713 client_t exclude_ct
= mdr
->get_client();
4714 mdcache
->broadcast_quota_to_client(cur
, exclude_ct
);
4715 } else if (name
.find("ceph.dir.pin") == 0) {
4716 if (!cur
->is_dir() || cur
->is_root()) {
4717 respond_to_request(mdr
, -EINVAL
);
4723 rank
= boost::lexical_cast
<mds_rank_t
>(value
);
4724 if (rank
< 0) rank
= MDS_RANK_NONE
;
4725 } catch (boost::bad_lexical_cast
const&) {
4726 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
4727 respond_to_request(mdr
, -EINVAL
);
4731 xlocks
.insert(&cur
->policylock
);
4732 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4735 auto &pi
= cur
->project_inode();
4736 cur
->set_export_pin(rank
);
4739 dout(10) << " unknown vxattr " << name
<< dendl
;
4740 respond_to_request(mdr
, -EINVAL
);
4745 pip
->ctime
= pip
->rstat
.rctime
= mdr
->get_op_stamp();
4746 pip
->version
= cur
->pre_dirty();
4748 pip
->update_backtrace();
4751 mdr
->ls
= mdlog
->get_current_segment();
4752 EUpdate
*le
= new EUpdate(mdlog
, "set vxattr layout");
4753 mdlog
->start_entry(le
);
4754 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4755 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4756 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4758 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
4762 void Server::handle_remove_vxattr(MDRequestRef
& mdr
, CInode
*cur
,
4763 file_layout_t
*dir_layout
,
4764 set
<SimpleLock
*> rdlocks
,
4765 set
<SimpleLock
*> wrlocks
,
4766 set
<SimpleLock
*> xlocks
)
4768 MClientRequest
*req
= mdr
->client_request
;
4769 string
name(req
->get_path2());
4771 dout(10) << __func__
<< " " << name
<< " on " << *cur
<< dendl
;
4773 if (name
== "ceph.dir.layout") {
4774 if (!cur
->is_dir()) {
4775 respond_to_request(mdr
, -ENODATA
);
4778 if (cur
->is_root()) {
4779 dout(10) << "can't remove layout policy on the root directory" << dendl
;
4780 respond_to_request(mdr
, -EINVAL
);
4784 if (!cur
->get_projected_inode()->has_layout()) {
4785 respond_to_request(mdr
, -ENODATA
);
4789 xlocks
.insert(&cur
->policylock
);
4790 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4793 auto &pi
= cur
->project_inode();
4794 pi
.inode
.clear_layout();
4795 pi
.inode
.version
= cur
->pre_dirty();
4798 mdr
->ls
= mdlog
->get_current_segment();
4799 EUpdate
*le
= new EUpdate(mdlog
, "remove dir layout vxattr");
4800 mdlog
->start_entry(le
);
4801 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4802 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4803 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4805 mdr
->no_early_reply
= true;
4806 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
4808 } else if (name
== "ceph.dir.layout.pool_namespace"
4809 || name
== "ceph.file.layout.pool_namespace") {
4810 // Namespace is the only layout field that has a meaningful
4811 // null/none value (empty string, means default layout). Is equivalent
4812 // to a setxattr with empty string: pass through the empty payload of
4813 // the rmxattr request to do this.
4814 handle_set_vxattr(mdr
, cur
, dir_layout
, rdlocks
, wrlocks
, xlocks
);
4818 respond_to_request(mdr
, -ENODATA
);
4821 class C_MDS_inode_xattr_update_finish
: public ServerLogContext
{
4825 C_MDS_inode_xattr_update_finish(Server
*s
, MDRequestRef
& r
, CInode
*i
) :
4826 ServerLogContext(s
, r
), in(i
) { }
4827 void finish(int r
) override
{
4831 in
->pop_and_dirty_projected_inode(mdr
->ls
);
4835 utime_t now
= ceph_clock_now();
4836 get_mds()->balancer
->hit_inode(now
, in
, META_POP_IWR
);
4838 server
->respond_to_request(mdr
, 0);
4842 void Server::handle_client_setxattr(MDRequestRef
& mdr
)
4844 MClientRequest
*req
= mdr
->client_request
;
4845 string
name(req
->get_path2());
4846 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
4849 file_layout_t
*dir_layout
= NULL
;
4850 if (name
.compare(0, 15, "ceph.dir.layout") == 0)
4851 cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true, false, &dir_layout
);
4853 cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
4857 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4858 respond_to_request(mdr
, -EROFS
);
4862 int flags
= req
->head
.args
.setxattr
.flags
;
4864 // magic ceph.* namespace?
4865 if (name
.compare(0, 5, "ceph.") == 0) {
4866 handle_set_vxattr(mdr
, cur
, dir_layout
, rdlocks
, wrlocks
, xlocks
);
4870 xlocks
.insert(&cur
->xattrlock
);
4871 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4874 if (!check_access(mdr
, cur
, MAY_WRITE
))
4877 auto pxattrs
= cur
->get_projected_xattrs();
4878 size_t len
= req
->get_data().length();
4879 size_t inc
= len
+ name
.length();
4881 // check xattrs kv pairs size
4882 size_t cur_xattrs_size
= 0;
4883 for (const auto& p
: *pxattrs
) {
4884 if ((flags
& CEPH_XATTR_REPLACE
) && (name
.compare(std::string(boost::string_view(p
.first
))) == 0)) {
4887 cur_xattrs_size
+= p
.first
.length() + p
.second
.length();
4890 if (((cur_xattrs_size
+ inc
) > g_conf
->mds_max_xattr_pairs_size
)) {
4891 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
4892 << cur_xattrs_size
<< ", inc " << inc
<< dendl
;
4893 respond_to_request(mdr
, -ENOSPC
);
4897 if ((flags
& CEPH_XATTR_CREATE
) && pxattrs
->count(mempool::mds_co::string(boost::string_view(name
)))) {
4898 dout(10) << "setxattr '" << name
<< "' XATTR_CREATE and EEXIST on " << *cur
<< dendl
;
4899 respond_to_request(mdr
, -EEXIST
);
4902 if ((flags
& CEPH_XATTR_REPLACE
) && !pxattrs
->count(mempool::mds_co::string(boost::string_view(name
)))) {
4903 dout(10) << "setxattr '" << name
<< "' XATTR_REPLACE and ENODATA on " << *cur
<< dendl
;
4904 respond_to_request(mdr
, -ENODATA
);
4908 dout(10) << "setxattr '" << name
<< "' len " << len
<< " on " << *cur
<< dendl
;
4911 auto &pi
= cur
->project_inode(true);
4912 pi
.inode
.version
= cur
->pre_dirty();
4913 pi
.inode
.ctime
= pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
4914 pi
.inode
.change_attr
++;
4915 pi
.inode
.xattr_version
++;
4916 auto &px
= *pi
.xattrs
;
4917 if ((flags
& CEPH_XATTR_REMOVE
)) {
4918 px
.erase(mempool::mds_co::string(boost::string_view(name
)));
4920 bufferptr b
= buffer::create(len
);
4922 req
->get_data().copy(0, len
, b
.c_str());
4923 auto em
= px
.emplace(std::piecewise_construct
, std::forward_as_tuple(mempool::mds_co::string(boost::string_view(name
))), std::forward_as_tuple(b
));
4925 em
.first
->second
= b
;
4929 mdr
->ls
= mdlog
->get_current_segment();
4930 EUpdate
*le
= new EUpdate(mdlog
, "setxattr");
4931 mdlog
->start_entry(le
);
4932 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4933 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4934 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4936 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
4939 void Server::handle_client_removexattr(MDRequestRef
& mdr
)
4941 MClientRequest
*req
= mdr
->client_request
;
4942 std::string
name(req
->get_path2());
4943 std::set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
4944 file_layout_t
*dir_layout
= NULL
;
4946 if (name
== "ceph.dir.layout")
4947 cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true, false, &dir_layout
);
4949 cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
4953 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4954 respond_to_request(mdr
, -EROFS
);
4958 if (name
.compare(0, 5, "ceph.") == 0) {
4959 handle_remove_vxattr(mdr
, cur
, dir_layout
, rdlocks
, wrlocks
, xlocks
);
4963 xlocks
.insert(&cur
->xattrlock
);
4964 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4967 auto pxattrs
= cur
->get_projected_xattrs();
4968 if (pxattrs
->count(mempool::mds_co::string(boost::string_view(name
))) == 0) {
4969 dout(10) << "removexattr '" << name
<< "' and ENODATA on " << *cur
<< dendl
;
4970 respond_to_request(mdr
, -ENODATA
);
4974 dout(10) << "removexattr '" << name
<< "' on " << *cur
<< dendl
;
4977 auto &pi
= cur
->project_inode(true);
4978 auto &px
= *pi
.xattrs
;
4979 pi
.inode
.version
= cur
->pre_dirty();
4980 pi
.inode
.ctime
= pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
4981 pi
.inode
.change_attr
++;
4982 pi
.inode
.xattr_version
++;
4983 px
.erase(mempool::mds_co::string(boost::string_view(name
)));
4986 mdr
->ls
= mdlog
->get_current_segment();
4987 EUpdate
*le
= new EUpdate(mdlog
, "removexattr");
4988 mdlog
->start_entry(le
);
4989 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4990 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4991 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4993 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
4997 // =================================================================
4998 // DIRECTORY and NAMESPACE OPS
5001 // ------------------------------------------------
5005 class C_MDS_mknod_finish
: public ServerLogContext
{
5009 C_MDS_mknod_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ni
) :
5010 ServerLogContext(s
, r
), dn(d
), newi(ni
) {}
5011 void finish(int r
) override
{
5015 dn
->pop_projected_linkage();
5017 // be a bit hacky with the inode version, here.. we decrement it
5018 // just to keep mark_dirty() happen. (we didn't bother projecting
5019 // a new version of hte inode since it's just been created)
5020 newi
->inode
.version
--;
5021 newi
->mark_dirty(newi
->inode
.version
+ 1, mdr
->ls
);
5022 newi
->mark_dirty_parent(mdr
->ls
, true);
5025 if (newi
->inode
.is_dir()) {
5026 CDir
*dir
= newi
->get_dirfrag(frag_t());
5028 dir
->fnode
.version
--;
5029 dir
->mark_dirty(dir
->fnode
.version
+ 1, mdr
->ls
);
5030 dir
->mark_new(mdr
->ls
);
5035 MDRequestRef null_ref
;
5036 get_mds()->mdcache
->send_dentry_link(dn
, null_ref
);
5038 if (newi
->inode
.is_file())
5039 get_mds()->locker
->share_inode_max_size(newi
);
5042 utime_t now
= ceph_clock_now();
5043 get_mds()->balancer
->hit_inode(now
, newi
, META_POP_IWR
);
5046 server
->respond_to_request(mdr
, 0);
5051 void Server::handle_client_mknod(MDRequestRef
& mdr
)
5053 MClientRequest
*req
= mdr
->client_request
;
5054 client_t client
= mdr
->get_client();
5055 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
5056 file_layout_t
*dir_layout
= NULL
;
5057 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, 0, rdlocks
, wrlocks
, xlocks
, false, false, false,
5060 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5061 respond_to_request(mdr
, -EROFS
);
5064 CInode
*diri
= dn
->get_dir()->get_inode();
5065 rdlocks
.insert(&diri
->authlock
);
5066 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
5069 if (!check_access(mdr
, diri
, MAY_WRITE
))
5072 if (!check_fragment_space(mdr
, dn
->get_dir()))
5075 unsigned mode
= req
->head
.args
.mknod
.mode
;
5076 if ((mode
& S_IFMT
) == 0)
5080 file_layout_t layout
;
5081 if (dir_layout
&& S_ISREG(mode
))
5082 layout
= *dir_layout
;
5084 layout
= mdcache
->default_file_layout
;
5086 SnapRealm
*realm
= dn
->get_dir()->inode
->find_snaprealm();
5087 snapid_t follows
= realm
->get_newest_seq();
5088 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
),
5092 dn
->push_projected_linkage(newi
);
5094 newi
->inode
.rdev
= req
->head
.args
.mknod
.rdev
;
5095 newi
->inode
.version
= dn
->pre_dirty();
5096 newi
->inode
.rstat
.rfiles
= 1;
5097 if (layout
.pool_id
!= mdcache
->default_file_layout
.pool_id
)
5098 newi
->inode
.add_old_pool(mdcache
->default_file_layout
.pool_id
);
5099 newi
->inode
.update_backtrace();
5101 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
5102 // want to write to it (e.g., if they are reexporting NFS)
5103 if (S_ISREG(newi
->inode
.mode
)) {
5104 dout(15) << " setting a client_range too, since this is a regular file" << dendl
;
5105 newi
->inode
.client_ranges
[client
].range
.first
= 0;
5106 newi
->inode
.client_ranges
[client
].range
.last
= newi
->inode
.get_layout_size_increment();
5107 newi
->inode
.client_ranges
[client
].follows
= follows
;
5109 // issue a cap on the file
5110 int cmode
= CEPH_FILE_MODE_RDWR
;
5111 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
->session
, realm
, req
->is_replay());
5115 // put locks in excl mode
5116 newi
->filelock
.set_state(LOCK_EXCL
);
5117 newi
->authlock
.set_state(LOCK_EXCL
);
5118 newi
->xattrlock
.set_state(LOCK_EXCL
);
5122 assert(dn
->first
== follows
+ 1);
5123 newi
->first
= dn
->first
;
5125 dout(10) << "mknod mode " << newi
->inode
.mode
<< " rdev " << newi
->inode
.rdev
<< dendl
;
5128 mdr
->ls
= mdlog
->get_current_segment();
5129 EUpdate
*le
= new EUpdate(mdlog
, "mknod");
5130 mdlog
->start_entry(le
);
5131 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5132 journal_allocated_inos(mdr
, &le
->metablob
);
5134 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(),
5135 PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
5136 le
->metablob
.add_primary_dentry(dn
, newi
, true, true, true);
5138 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
5144 /* This function takes responsibility for the passed mdr*/
5145 void Server::handle_client_mkdir(MDRequestRef
& mdr
)
5147 MClientRequest
*req
= mdr
->client_request
;
5148 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
5149 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, 0, rdlocks
, wrlocks
, xlocks
, false, false, false);
5151 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5152 respond_to_request(mdr
, -EROFS
);
5155 CDir
*dir
= dn
->get_dir();
5156 CInode
*diri
= dir
->get_inode();
5157 rdlocks
.insert(&diri
->authlock
);
5158 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
5161 // mkdir check access
5162 if (!check_access(mdr
, diri
, MAY_WRITE
))
5165 if (!check_fragment_space(mdr
, dir
))
5169 SnapRealm
*realm
= dn
->get_dir()->inode
->find_snaprealm();
5170 snapid_t follows
= realm
->get_newest_seq();
5172 unsigned mode
= req
->head
.args
.mkdir
.mode
;
5175 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
), mode
);
5178 // it's a directory.
5179 dn
->push_projected_linkage(newi
);
5181 newi
->inode
.version
= dn
->pre_dirty();
5182 newi
->inode
.rstat
.rsubdirs
= 1;
5183 newi
->inode
.update_backtrace();
5185 dout(12) << " follows " << follows
<< dendl
;
5186 assert(dn
->first
== follows
+ 1);
5187 newi
->first
= dn
->first
;
5189 // ...and that new dir is empty.
5190 CDir
*newdir
= newi
->get_or_open_dirfrag(mdcache
, frag_t());
5191 newdir
->state_set(CDir::STATE_CREATING
);
5192 newdir
->mark_complete();
5193 newdir
->fnode
.version
= newdir
->pre_dirty();
5196 mdr
->ls
= mdlog
->get_current_segment();
5197 EUpdate
*le
= new EUpdate(mdlog
, "mkdir");
5198 mdlog
->start_entry(le
);
5199 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5200 journal_allocated_inos(mdr
, &le
->metablob
);
5201 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
5202 le
->metablob
.add_primary_dentry(dn
, newi
, true, true);
5203 le
->metablob
.add_new_dir(newdir
); // dirty AND complete AND new
5205 // issue a cap on the directory
5206 int cmode
= CEPH_FILE_MODE_RDWR
;
5207 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
->session
, realm
, req
->is_replay());
5211 // put locks in excl mode
5212 newi
->filelock
.set_state(LOCK_EXCL
);
5213 newi
->authlock
.set_state(LOCK_EXCL
);
5214 newi
->xattrlock
.set_state(LOCK_EXCL
);
5217 // make sure this inode gets into the journal
5218 le
->metablob
.add_opened_ino(newi
->ino());
5219 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
5220 ls
->open_files
.push_back(&newi
->item_open_file
);
5222 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
5228 void Server::handle_client_symlink(MDRequestRef
& mdr
)
5230 MClientRequest
*req
= mdr
->client_request
;
5231 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
5232 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, 0, rdlocks
, wrlocks
, xlocks
, false, false, false);
5234 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5235 respond_to_request(mdr
, -EROFS
);
5238 CDir
*dir
= dn
->get_dir();
5239 CInode
*diri
= dir
->get_inode();
5240 rdlocks
.insert(&diri
->authlock
);
5241 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
5244 if (!check_access(mdr
, diri
, MAY_WRITE
))
5247 if (!check_fragment_space(mdr
, dir
))
5250 unsigned mode
= S_IFLNK
| 0777;
5251 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
), mode
);
5255 dn
->push_projected_linkage(newi
);
5257 newi
->symlink
= mempool::mds_co::string(boost::string_view(req
->get_path2()));
5258 newi
->inode
.size
= newi
->symlink
.length();
5259 newi
->inode
.rstat
.rbytes
= newi
->inode
.size
;
5260 newi
->inode
.rstat
.rfiles
= 1;
5261 newi
->inode
.version
= dn
->pre_dirty();
5262 newi
->inode
.update_backtrace();
5264 newi
->first
= dn
->first
;
5267 mdr
->ls
= mdlog
->get_current_segment();
5268 EUpdate
*le
= new EUpdate(mdlog
, "symlink");
5269 mdlog
->start_entry(le
);
5270 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5271 journal_allocated_inos(mdr
, &le
->metablob
);
5272 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
5273 le
->metablob
.add_primary_dentry(dn
, newi
, true, true);
5275 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
5284 void Server::handle_client_link(MDRequestRef
& mdr
)
5286 MClientRequest
*req
= mdr
->client_request
;
5288 dout(7) << "handle_client_link " << req
->get_filepath()
5289 << " to " << req
->get_filepath2()
5292 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
5294 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, 0, rdlocks
, wrlocks
, xlocks
, false, false, false);
5296 CInode
*targeti
= rdlock_path_pin_ref(mdr
, 1, rdlocks
, false);
5297 if (!targeti
) return;
5298 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5299 respond_to_request(mdr
, -EROFS
);
5303 CDir
*dir
= dn
->get_dir();
5304 dout(7) << "handle_client_link link " << dn
->get_name() << " in " << *dir
<< dendl
;
5305 dout(7) << "target is " << *targeti
<< dendl
;
5306 if (targeti
->is_dir()) {
5307 // if srcdn is replica, need to make sure its linkage is correct
5308 vector
<CDentry
*>& trace
= mdr
->dn
[1];
5309 if (trace
.empty() ||
5310 trace
.back()->is_auth() ||
5311 trace
.back()->lock
.can_read(mdr
->get_client())) {
5312 dout(7) << "target is a dir, failing..." << dendl
;
5313 respond_to_request(mdr
, -EINVAL
);
5318 xlocks
.insert(&targeti
->linklock
);
5320 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
5323 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
5324 if (!check_access(mdr
, targeti
, MAY_WRITE
))
5327 if (!check_access(mdr
, dir
->get_inode(), MAY_WRITE
))
5330 if (!check_fragment_space(mdr
, dir
))
5335 assert(g_conf
->mds_kill_link_at
!= 1);
5338 if (targeti
->is_auth())
5339 _link_local(mdr
, dn
, targeti
);
5341 _link_remote(mdr
, true, dn
, targeti
);
5345 class C_MDS_link_local_finish
: public ServerLogContext
{
5351 C_MDS_link_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ti
,
5352 version_t dnpv_
, version_t tipv_
) :
5353 ServerLogContext(s
, r
), dn(d
), targeti(ti
),
5354 dnpv(dnpv_
), tipv(tipv_
) { }
5355 void finish(int r
) override
{
5357 server
->_link_local_finish(mdr
, dn
, targeti
, dnpv
, tipv
);
5362 void Server::_link_local(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
)
5364 dout(10) << "_link_local " << *dn
<< " to " << *targeti
<< dendl
;
5366 mdr
->ls
= mdlog
->get_current_segment();
5368 // predirty NEW dentry
5369 version_t dnpv
= dn
->pre_dirty();
5370 version_t tipv
= targeti
->pre_dirty();
5372 // project inode update
5373 auto &pi
= targeti
->project_inode();
5375 pi
.inode
.ctime
= pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
5376 pi
.inode
.change_attr
++;
5377 pi
.inode
.version
= tipv
;
5380 EUpdate
*le
= new EUpdate(mdlog
, "link_local");
5381 mdlog
->start_entry(le
);
5382 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
5383 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1); // new dn
5384 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, 0, PREDIRTY_PRIMARY
); // targeti
5385 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
5386 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, targeti
);
5388 // do this after predirty_*, to avoid funky extra dnl arg
5389 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
5391 journal_and_reply(mdr
, targeti
, dn
, le
, new C_MDS_link_local_finish(this, mdr
, dn
, targeti
, dnpv
, tipv
));
5394 void Server::_link_local_finish(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
,
5395 version_t dnpv
, version_t tipv
)
5397 dout(10) << "_link_local_finish " << *dn
<< " to " << *targeti
<< dendl
;
5399 // link and unlock the NEW dentry
5400 CDentry::linkage_t
*dnl
= dn
->pop_projected_linkage();
5401 if (!dnl
->get_inode())
5402 dn
->link_remote(dnl
, targeti
);
5403 dn
->mark_dirty(dnpv
, mdr
->ls
);
5406 targeti
->pop_and_dirty_projected_inode(mdr
->ls
);
5410 MDRequestRef null_ref
;
5411 mdcache
->send_dentry_link(dn
, null_ref
);
5413 // bump target popularity
5414 utime_t now
= ceph_clock_now();
5415 mds
->balancer
->hit_inode(now
, targeti
, META_POP_IWR
);
5416 mds
->balancer
->hit_dir(now
, dn
->get_dir(), META_POP_IWR
);
5419 respond_to_request(mdr
, 0);
5423 // link / unlink remote
5425 class C_MDS_link_remote_finish
: public ServerLogContext
{
5431 C_MDS_link_remote_finish(Server
*s
, MDRequestRef
& r
, bool i
, CDentry
*d
, CInode
*ti
) :
5432 ServerLogContext(s
, r
), inc(i
), dn(d
), targeti(ti
),
5433 dpv(d
->get_projected_version()) {}
5434 void finish(int r
) override
{
5436 server
->_link_remote_finish(mdr
, inc
, dn
, targeti
, dpv
);
5440 void Server::_link_remote(MDRequestRef
& mdr
, bool inc
, CDentry
*dn
, CInode
*targeti
)
5442 dout(10) << "_link_remote "
5443 << (inc
? "link ":"unlink ")
5444 << *dn
<< " to " << *targeti
<< dendl
;
5446 // 1. send LinkPrepare to dest (journal nlink++ prepare)
5447 mds_rank_t linkauth
= targeti
->authority().first
;
5448 if (mdr
->more()->witnessed
.count(linkauth
) == 0) {
5449 if (mds
->is_cluster_degraded() &&
5450 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(linkauth
)) {
5451 dout(10) << " targeti auth mds." << linkauth
<< " is not active" << dendl
;
5452 if (mdr
->more()->waiting_on_slave
.empty())
5453 mds
->wait_for_active_peer(linkauth
, new C_MDS_RetryRequest(mdcache
, mdr
));
5457 dout(10) << " targeti auth must prepare nlink++/--" << dendl
;
5460 op
= MMDSSlaveRequest::OP_LINKPREP
;
5462 op
= MMDSSlaveRequest::OP_UNLINKPREP
;
5463 MMDSSlaveRequest
*req
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
, op
);
5464 targeti
->set_object_info(req
->get_object_info());
5465 req
->op_stamp
= mdr
->get_op_stamp();
5466 mds
->send_message_mds(req
, linkauth
);
5468 assert(mdr
->more()->waiting_on_slave
.count(linkauth
) == 0);
5469 mdr
->more()->waiting_on_slave
.insert(linkauth
);
5472 dout(10) << " targeti auth has prepared nlink++/--" << dendl
;
5474 assert(g_conf
->mds_kill_link_at
!= 2);
5476 mdr
->set_mds_stamp(ceph_clock_now());
5479 mdr
->ls
= mdlog
->get_current_segment();
5480 EUpdate
*le
= new EUpdate(mdlog
, inc
? "link_remote":"unlink_remote");
5481 mdlog
->start_entry(le
);
5482 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
5483 if (!mdr
->more()->witnessed
.empty()) {
5484 dout(20) << " noting uncommitted_slaves " << mdr
->more()->witnessed
<< dendl
;
5485 le
->reqid
= mdr
->reqid
;
5486 le
->had_slaves
= true;
5487 mdcache
->add_uncommitted_master(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
5492 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1);
5493 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
5494 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
5497 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, -1);
5498 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
5499 le
->metablob
.add_null_dentry(dn
, true);
5500 dn
->push_projected_linkage();
5503 journal_and_reply(mdr
, targeti
, dn
, le
, new C_MDS_link_remote_finish(this, mdr
, inc
, dn
, targeti
));
5506 void Server::_link_remote_finish(MDRequestRef
& mdr
, bool inc
,
5507 CDentry
*dn
, CInode
*targeti
,
5510 dout(10) << "_link_remote_finish "
5511 << (inc
? "link ":"unlink ")
5512 << *dn
<< " to " << *targeti
<< dendl
;
5514 assert(g_conf
->mds_kill_link_at
!= 3);
5516 if (!mdr
->more()->witnessed
.empty())
5517 mdcache
->logged_master_update(mdr
->reqid
);
5520 // link the new dentry
5521 CDentry::linkage_t
*dnl
= dn
->pop_projected_linkage();
5522 if (!dnl
->get_inode())
5523 dn
->link_remote(dnl
, targeti
);
5524 dn
->mark_dirty(dpv
, mdr
->ls
);
5526 // unlink main dentry
5527 dn
->get_dir()->unlink_inode(dn
);
5528 dn
->pop_projected_linkage();
5529 dn
->mark_dirty(dn
->get_projected_version(), mdr
->ls
); // dirty old dentry
5534 MDRequestRef null_ref
;
5536 mdcache
->send_dentry_link(dn
, null_ref
);
5538 mdcache
->send_dentry_unlink(dn
, NULL
, null_ref
);
5540 // bump target popularity
5541 utime_t now
= ceph_clock_now();
5542 mds
->balancer
->hit_inode(now
, targeti
, META_POP_IWR
);
5543 mds
->balancer
->hit_dir(now
, dn
->get_dir(), META_POP_IWR
);
5546 respond_to_request(mdr
, 0);
5549 // removing a new dn?
5550 dn
->get_dir()->try_remove_unlinked_dn(dn
);
5554 // remote linking/unlinking
5556 class C_MDS_SlaveLinkPrep
: public ServerLogContext
{
5559 C_MDS_SlaveLinkPrep(Server
*s
, MDRequestRef
& r
, CInode
*t
) :
5560 ServerLogContext(s
, r
), targeti(t
) { }
5561 void finish(int r
) override
{
5563 server
->_logged_slave_link(mdr
, targeti
);
5567 class C_MDS_SlaveLinkCommit
: public ServerContext
{
5571 C_MDS_SlaveLinkCommit(Server
*s
, MDRequestRef
& r
, CInode
*t
) :
5572 ServerContext(s
), mdr(r
), targeti(t
) { }
5573 void finish(int r
) override
{
5574 server
->_commit_slave_link(mdr
, r
, targeti
);
5578 /* This function DOES put the mdr->slave_request before returning*/
5579 void Server::handle_slave_link_prep(MDRequestRef
& mdr
)
5581 dout(10) << "handle_slave_link_prep " << *mdr
5582 << " on " << mdr
->slave_request
->get_object_info()
5585 assert(g_conf
->mds_kill_link_at
!= 4);
5587 CInode
*targeti
= mdcache
->get_inode(mdr
->slave_request
->get_object_info().ino
);
5589 dout(10) << "targeti " << *targeti
<< dendl
;
5590 CDentry
*dn
= targeti
->get_parent_dn();
5591 CDentry::linkage_t
*dnl
= dn
->get_linkage();
5592 assert(dnl
->is_primary());
5594 mdr
->set_op_stamp(mdr
->slave_request
->op_stamp
);
5596 mdr
->auth_pin(targeti
);
5598 //ceph_abort(); // test hack: make sure master can handle a slave that fails to prepare...
5599 assert(g_conf
->mds_kill_link_at
!= 5);
5602 mdr
->ls
= mdlog
->get_current_segment();
5603 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_link_prep", mdr
->reqid
, mdr
->slave_to_mds
,
5604 ESlaveUpdate::OP_PREPARE
, ESlaveUpdate::LINK
);
5605 mdlog
->start_entry(le
);
5607 auto &pi
= dnl
->get_inode()->project_inode();
5609 // update journaled target inode
5611 if (mdr
->slave_request
->get_op() == MMDSSlaveRequest::OP_LINKPREP
) {
5619 link_rollback rollback
;
5620 rollback
.reqid
= mdr
->reqid
;
5621 rollback
.ino
= targeti
->ino();
5622 rollback
.old_ctime
= targeti
->inode
.ctime
; // we hold versionlock xlock; no concorrent projections
5623 const fnode_t
*pf
= targeti
->get_parent_dn()->get_dir()->get_projected_fnode();
5624 rollback
.old_dir_mtime
= pf
->fragstat
.mtime
;
5625 rollback
.old_dir_rctime
= pf
->rstat
.rctime
;
5626 rollback
.was_inc
= inc
;
5627 ::encode(rollback
, le
->rollback
);
5628 mdr
->more()->rollback_bl
= le
->rollback
;
5630 pi
.inode
.ctime
= mdr
->get_op_stamp();
5631 pi
.inode
.version
= targeti
->pre_dirty();
5633 dout(10) << " projected inode " << pi
.inode
.ino
<< " v " << pi
.inode
.version
<< dendl
;
5636 mdcache
->predirty_journal_parents(mdr
, &le
->commit
, dnl
->get_inode(), 0, PREDIRTY_SHALLOW
|PREDIRTY_PRIMARY
);
5637 mdcache
->journal_dirty_inode(mdr
.get(), &le
->commit
, targeti
);
5639 // set up commit waiter
5640 mdr
->more()->slave_commit
= new C_MDS_SlaveLinkCommit(this, mdr
, targeti
);
5642 mdr
->more()->slave_update_journaled
= true;
5643 submit_mdlog_entry(le
, new C_MDS_SlaveLinkPrep(this, mdr
, targeti
),
5648 void Server::_logged_slave_link(MDRequestRef
& mdr
, CInode
*targeti
)
5650 dout(10) << "_logged_slave_link " << *mdr
5651 << " " << *targeti
<< dendl
;
5653 assert(g_conf
->mds_kill_link_at
!= 6);
5655 // update the target
5656 targeti
->pop_and_dirty_projected_inode(mdr
->ls
);
5660 utime_t now
= ceph_clock_now();
5661 mds
->balancer
->hit_inode(now
, targeti
, META_POP_IWR
);
5664 mdr
->slave_request
->put();
5665 mdr
->slave_request
= 0;
5668 if (!mdr
->aborted
) {
5669 MMDSSlaveRequest
*reply
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
5670 MMDSSlaveRequest::OP_LINKPREPACK
);
5671 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
5673 dout(10) << " abort flag set, finishing" << dendl
;
5674 mdcache
->request_finish(mdr
);
5679 struct C_MDS_CommittedSlave
: public ServerLogContext
{
5680 C_MDS_CommittedSlave(Server
*s
, MDRequestRef
& m
) : ServerLogContext(s
, m
) {}
5681 void finish(int r
) override
{
5682 server
->_committed_slave(mdr
);
5686 void Server::_commit_slave_link(MDRequestRef
& mdr
, int r
, CInode
*targeti
)
5688 dout(10) << "_commit_slave_link " << *mdr
5690 << " " << *targeti
<< dendl
;
5692 assert(g_conf
->mds_kill_link_at
!= 7);
5695 // drop our pins, etc.
5698 // write a commit to the journal
5699 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_link_commit", mdr
->reqid
, mdr
->slave_to_mds
,
5700 ESlaveUpdate::OP_COMMIT
, ESlaveUpdate::LINK
);
5701 mdlog
->start_entry(le
);
5702 submit_mdlog_entry(le
, new C_MDS_CommittedSlave(this, mdr
), mdr
, __func__
);
5705 do_link_rollback(mdr
->more()->rollback_bl
, mdr
->slave_to_mds
, mdr
);
5709 void Server::_committed_slave(MDRequestRef
& mdr
)
5711 dout(10) << "_committed_slave " << *mdr
<< dendl
;
5713 assert(g_conf
->mds_kill_link_at
!= 8);
5715 MMDSSlaveRequest
*req
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
5716 MMDSSlaveRequest::OP_COMMITTED
);
5717 mds
->send_message_mds(req
, mdr
->slave_to_mds
);
5718 mdcache
->request_finish(mdr
);
5721 struct C_MDS_LoggedLinkRollback
: public ServerLogContext
{
5723 C_MDS_LoggedLinkRollback(Server
*s
, MutationRef
& m
, MDRequestRef
& r
) : ServerLogContext(s
, r
), mut(m
) {}
5724 void finish(int r
) override
{
5725 server
->_link_rollback_finish(mut
, mdr
);
5729 void Server::do_link_rollback(bufferlist
&rbl
, mds_rank_t master
, MDRequestRef
& mdr
)
5731 link_rollback rollback
;
5732 bufferlist::iterator p
= rbl
.begin();
5733 ::decode(rollback
, p
);
5735 dout(10) << "do_link_rollback on " << rollback
.reqid
5736 << (rollback
.was_inc
? " inc":" dec")
5737 << " ino " << rollback
.ino
5740 assert(g_conf
->mds_kill_link_at
!= 9);
5742 mdcache
->add_rollback(rollback
.reqid
, master
); // need to finish this update before resolve finishes
5743 assert(mdr
|| mds
->is_resolve());
5745 MutationRef
mut(new MutationImpl(nullptr, utime_t(), rollback
.reqid
));
5746 mut
->ls
= mds
->mdlog
->get_current_segment();
5748 CInode
*in
= mdcache
->get_inode(rollback
.ino
);
5750 dout(10) << " target is " << *in
<< dendl
;
5751 assert(!in
->is_projected()); // live slave request hold versionlock xlock.
5753 auto &pi
= in
->project_inode();
5754 pi
.inode
.version
= in
->pre_dirty();
5755 mut
->add_projected_inode(in
);
5757 // parent dir rctime
5758 CDir
*parent
= in
->get_projected_parent_dn()->get_dir();
5759 fnode_t
*pf
= parent
->project_fnode();
5760 mut
->add_projected_fnode(parent
);
5761 pf
->version
= parent
->pre_dirty();
5762 if (pf
->fragstat
.mtime
== pi
.inode
.ctime
) {
5763 pf
->fragstat
.mtime
= rollback
.old_dir_mtime
;
5764 if (pf
->rstat
.rctime
== pi
.inode
.ctime
)
5765 pf
->rstat
.rctime
= rollback
.old_dir_rctime
;
5766 mut
->add_updated_lock(&parent
->get_inode()->filelock
);
5767 mut
->add_updated_lock(&parent
->get_inode()->nestlock
);
5771 pi
.inode
.ctime
= pi
.inode
.rstat
.rctime
= rollback
.old_ctime
;
5772 if (rollback
.was_inc
)
5778 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_link_rollback", rollback
.reqid
, master
,
5779 ESlaveUpdate::OP_ROLLBACK
, ESlaveUpdate::LINK
);
5780 mdlog
->start_entry(le
);
5781 le
->commit
.add_dir_context(parent
);
5782 le
->commit
.add_dir(parent
, true);
5783 le
->commit
.add_primary_dentry(in
->get_projected_parent_dn(), 0, true);
5785 submit_mdlog_entry(le
, new C_MDS_LoggedLinkRollback(this, mut
, mdr
),
5790 void Server::_link_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
)
5792 dout(10) << "_link_rollback_finish" << dendl
;
5794 assert(g_conf
->mds_kill_link_at
!= 10);
5798 mdcache
->request_finish(mdr
);
5800 mdcache
->finish_rollback(mut
->reqid
);
5806 /* This function DOES NOT put the passed message before returning*/
5807 void Server::handle_slave_link_prep_ack(MDRequestRef
& mdr
, MMDSSlaveRequest
*m
)
5809 dout(10) << "handle_slave_link_prep_ack " << *mdr
5810 << " " << *m
<< dendl
;
5811 mds_rank_t from
= mds_rank_t(m
->get_source().num());
5813 assert(g_conf
->mds_kill_link_at
!= 11);
5816 mdr
->more()->slaves
.insert(from
);
5819 assert(mdr
->more()->witnessed
.count(from
) == 0);
5820 mdr
->more()->witnessed
.insert(from
);
5821 assert(!m
->is_not_journaled());
5822 mdr
->more()->has_journaled_slaves
= true;
5824 // remove from waiting list
5825 assert(mdr
->more()->waiting_on_slave
.count(from
));
5826 mdr
->more()->waiting_on_slave
.erase(from
);
5828 assert(mdr
->more()->waiting_on_slave
.empty());
5830 dispatch_client_request(mdr
); // go again!
5839 void Server::handle_client_unlink(MDRequestRef
& mdr
)
5841 MClientRequest
*req
= mdr
->client_request
;
5842 client_t client
= mdr
->get_client();
5846 if (req
->get_op() == CEPH_MDS_OP_RMDIR
) rmdir
= true;
5848 if (req
->get_filepath().depth() == 0) {
5849 respond_to_request(mdr
, -EINVAL
);
5854 vector
<CDentry
*> trace
;
5856 int r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, req
->get_filepath(), &trace
, &in
, MDS_TRAVERSE_FORWARD
);
5860 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
5861 mdcache
->find_ino_peers(req
->get_filepath().get_ino(), new C_MDS_TryFindInode(this, mdr
));
5864 respond_to_request(mdr
, r
);
5867 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5868 respond_to_request(mdr
, -EROFS
);
5872 CDentry
*dn
= trace
[trace
.size()-1];
5874 if (!dn
->is_auth()) {
5875 mdcache
->request_forward(mdr
, dn
->authority().first
);
5879 CInode
*diri
= dn
->get_dir()->get_inode();
5881 CDentry::linkage_t
*dnl
= dn
->get_linkage(client
, mdr
);
5882 assert(!dnl
->is_null());
5885 dout(7) << "handle_client_rmdir on " << *dn
<< dendl
;
5887 dout(7) << "handle_client_unlink on " << *dn
<< dendl
;
5889 dout(7) << "dn links to " << *in
<< dendl
;
5894 // do empty directory checks
5895 if (_dir_is_nonempty_unlocked(mdr
, in
)) {
5896 respond_to_request(mdr
, -ENOTEMPTY
);
5900 dout(7) << "handle_client_unlink on dir " << *in
<< ", returning error" << dendl
;
5901 respond_to_request(mdr
, -EISDIR
);
5907 dout(7) << "handle_client_rmdir on non-dir " << *in
<< ", returning error" << dendl
;
5908 respond_to_request(mdr
, -ENOTDIR
);
5913 // -- create stray dentry? --
5914 CDentry
*straydn
= NULL
;
5915 if (dnl
->is_primary()) {
5916 straydn
= prepare_stray_dentry(mdr
, dnl
->get_inode());
5919 dout(10) << " straydn is " << *straydn
<< dendl
;
5920 } else if (mdr
->straydn
) {
5921 mdr
->unpin(mdr
->straydn
);
5922 mdr
->straydn
= NULL
;
5926 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
5928 for (int i
=0; i
<(int)trace
.size()-1; i
++)
5929 rdlocks
.insert(&trace
[i
]->lock
);
5930 xlocks
.insert(&dn
->lock
);
5931 wrlocks
.insert(&diri
->filelock
);
5932 wrlocks
.insert(&diri
->nestlock
);
5933 xlocks
.insert(&in
->linklock
);
5935 wrlocks
.insert(&straydn
->get_dir()->inode
->filelock
);
5936 wrlocks
.insert(&straydn
->get_dir()->inode
->nestlock
);
5937 xlocks
.insert(&straydn
->lock
);
5940 rdlocks
.insert(&in
->filelock
); // to verify it's empty
5941 mds
->locker
->include_snap_rdlocks(rdlocks
, dnl
->get_inode());
5943 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
5947 _dir_is_nonempty(mdr
, in
)) {
5948 respond_to_request(mdr
, -ENOTEMPTY
);
5952 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
5953 if (!check_access(mdr
, diri
, MAY_WRITE
))
5958 if (in
->is_dir() && in
->has_subtree_root_dirfrag()) {
5959 // subtree root auths need to be witnesses
5960 set
<mds_rank_t
> witnesses
;
5961 in
->list_replicas(witnesses
);
5962 dout(10) << " witnesses " << witnesses
<< ", have " << mdr
->more()->witnessed
<< dendl
;
5964 for (set
<mds_rank_t
>::iterator p
= witnesses
.begin();
5965 p
!= witnesses
.end();
5967 if (mdr
->more()->witnessed
.count(*p
)) {
5968 dout(10) << " already witnessed by mds." << *p
<< dendl
;
5969 } else if (mdr
->more()->waiting_on_slave
.count(*p
)) {
5970 dout(10) << " already waiting on witness mds." << *p
<< dendl
;
5972 if (!_rmdir_prepare_witness(mdr
, *p
, trace
, straydn
))
5976 if (!mdr
->more()->waiting_on_slave
.empty())
5977 return; // we're waiting for a witness.
5981 if (dnl
->is_remote() && !dnl
->get_inode()->is_auth())
5982 _link_remote(mdr
, false, dn
, dnl
->get_inode());
5984 _unlink_local(mdr
, dn
, straydn
);
5987 class C_MDS_unlink_local_finish
: public ServerLogContext
{
5990 version_t dnpv
; // deleted dentry
5992 C_MDS_unlink_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CDentry
*sd
) :
5993 ServerLogContext(s
, r
), dn(d
), straydn(sd
),
5994 dnpv(d
->get_projected_version()) {}
5995 void finish(int r
) override
{
5997 server
->_unlink_local_finish(mdr
, dn
, straydn
, dnpv
);
6001 void Server::_unlink_local(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
)
6003 dout(10) << "_unlink_local " << *dn
<< dendl
;
6005 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
6006 CInode
*in
= dnl
->get_inode();
6008 SnapRealm
*realm
= in
->find_snaprealm();
6009 snapid_t follows
= realm
->get_newest_seq();
6012 mdr
->ls
= mdlog
->get_current_segment();
6014 // prepare log entry
6015 EUpdate
*le
= new EUpdate(mdlog
, "unlink_local");
6016 mdlog
->start_entry(le
);
6017 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
6018 if (!mdr
->more()->witnessed
.empty()) {
6019 dout(20) << " noting uncommitted_slaves " << mdr
->more()->witnessed
<< dendl
;
6020 le
->reqid
= mdr
->reqid
;
6021 le
->had_slaves
= true;
6022 mdcache
->add_uncommitted_master(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
6026 assert(dnl
->is_primary());
6027 straydn
->push_projected_linkage(in
);
6028 straydn
->first
= follows
+ 1;
6031 // the unlinked dentry
6034 auto &pi
= in
->project_inode();
6037 dn
->make_path_string(t
, true);
6038 pi
.inode
.stray_prior_path
= mempool::mds_co::string(boost::string_view(t
));
6040 mdr
->add_projected_inode(in
); // do this _after_ my dn->pre_dirty().. we apply that one manually.
6041 pi
.inode
.version
= in
->pre_dirty();
6042 pi
.inode
.ctime
= pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
6043 pi
.inode
.change_attr
++;
6045 if (pi
.inode
.nlink
== 0)
6046 in
->state_set(CInode::STATE_ORPHAN
);
6048 if (dnl
->is_primary()) {
6049 // primary link. add stray dentry.
6051 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, -1);
6052 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, straydn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
6054 // project snaprealm, too
6055 if (in
->snaprealm
|| follows
+ 1 > in
->get_oldest_snap())
6056 in
->project_past_snaprealm_parent(straydn
->get_dir()->inode
->find_snaprealm());
6058 pi
.inode
.update_backtrace();
6059 le
->metablob
.add_primary_dentry(straydn
, in
, true, true);
6061 // remote link. update remote inode.
6062 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_DIR
, -1);
6063 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
6064 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, in
);
6067 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
6068 le
->metablob
.add_null_dentry(dn
, true);
6071 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
6072 le
->metablob
.renamed_dirino
= in
->ino();
6075 dn
->push_projected_linkage();
6079 mdcache
->project_subtree_rename(in
, dn
->get_dir(), straydn
->get_dir());
6081 in
->maybe_export_pin(true);
6084 journal_and_reply(mdr
, 0, dn
, le
, new C_MDS_unlink_local_finish(this, mdr
, dn
, straydn
));
6087 void Server::_unlink_local_finish(MDRequestRef
& mdr
,
6088 CDentry
*dn
, CDentry
*straydn
,
6091 dout(10) << "_unlink_local_finish " << *dn
<< dendl
;
6093 if (!mdr
->more()->witnessed
.empty())
6094 mdcache
->logged_master_update(mdr
->reqid
);
6096 // unlink main dentry
6097 dn
->get_dir()->unlink_inode(dn
);
6098 dn
->pop_projected_linkage();
6100 // relink as stray? (i.e. was primary link?)
6101 CInode
*strayin
= NULL
;
6102 bool snap_is_new
= false;
6104 dout(20) << " straydn is " << *straydn
<< dendl
;
6105 CDentry::linkage_t
*straydnl
= straydn
->pop_projected_linkage();
6106 strayin
= straydnl
->get_inode();
6108 snap_is_new
= strayin
->snaprealm
? true : false;
6109 mdcache
->touch_dentry_bottom(straydn
);
6112 dn
->mark_dirty(dnpv
, mdr
->ls
);
6115 if (snap_is_new
) //only new if strayin exists
6116 mdcache
->do_realm_invalidate_and_update_notify(strayin
, CEPH_SNAP_OP_SPLIT
, true);
6118 mdcache
->send_dentry_unlink(dn
, straydn
, mdr
);
6120 // update subtree map?
6121 if (straydn
&& strayin
->is_dir())
6122 mdcache
->adjust_subtree_after_rename(strayin
, dn
->get_dir(), true);
6125 utime_t now
= ceph_clock_now();
6126 mds
->balancer
->hit_dir(now
, dn
->get_dir(), META_POP_IWR
);
6129 respond_to_request(mdr
, 0);
6131 // removing a new dn?
6132 dn
->get_dir()->try_remove_unlinked_dn(dn
);
6135 // respond_to_request() drops locks. So stray reintegration can race with us.
6136 if (straydn
&& !straydn
->get_projected_linkage()->is_null()) {
6137 // Tip off the MDCache that this dentry is a stray that
6138 // might be elegible for purge.
6139 mdcache
->notify_stray(straydn
);
6143 bool Server::_rmdir_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, vector
<CDentry
*>& trace
, CDentry
*straydn
)
6145 if (mds
->is_cluster_degraded() &&
6146 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(who
)) {
6147 dout(10) << "_rmdir_prepare_witness mds." << who
<< " is not active" << dendl
;
6148 if (mdr
->more()->waiting_on_slave
.empty())
6149 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(mdcache
, mdr
));
6153 dout(10) << "_rmdir_prepare_witness mds." << who
<< dendl
;
6154 MMDSSlaveRequest
*req
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
6155 MMDSSlaveRequest::OP_RMDIRPREP
);
6156 req
->srcdnpath
= filepath(trace
.front()->get_dir()->ino());
6157 for (auto dn
: trace
)
6158 req
->srcdnpath
.push_dentry(dn
->get_name());
6159 mdcache
->replicate_stray(straydn
, who
, req
->stray
);
6161 req
->op_stamp
= mdr
->get_op_stamp();
6162 mds
->send_message_mds(req
, who
);
6164 assert(mdr
->more()->waiting_on_slave
.count(who
) == 0);
6165 mdr
->more()->waiting_on_slave
.insert(who
);
6169 struct C_MDS_SlaveRmdirPrep
: public ServerLogContext
{
6170 CDentry
*dn
, *straydn
;
6171 C_MDS_SlaveRmdirPrep(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CDentry
*st
)
6172 : ServerLogContext(s
, r
), dn(d
), straydn(st
) {}
6173 void finish(int r
) override
{
6174 server
->_logged_slave_rmdir(mdr
, dn
, straydn
);
6178 struct C_MDS_SlaveRmdirCommit
: public ServerContext
{
6181 C_MDS_SlaveRmdirCommit(Server
*s
, MDRequestRef
& r
, CDentry
*sd
)
6182 : ServerContext(s
), mdr(r
), straydn(sd
) { }
6183 void finish(int r
) override
{
6184 server
->_commit_slave_rmdir(mdr
, r
, straydn
);
6188 void Server::handle_slave_rmdir_prep(MDRequestRef
& mdr
)
6190 dout(10) << "handle_slave_rmdir_prep " << *mdr
6191 << " " << mdr
->slave_request
->srcdnpath
6192 << " to " << mdr
->slave_request
->destdnpath
6195 vector
<CDentry
*> trace
;
6196 filepath
srcpath(mdr
->slave_request
->srcdnpath
);
6197 dout(10) << " src " << srcpath
<< dendl
;
6199 int r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, srcpath
, &trace
, &in
, MDS_TRAVERSE_DISCOVERXLOCK
);
6202 mdcache
->find_ino_peers(srcpath
.get_ino(), new C_MDS_RetryRequest(mdcache
, mdr
),
6207 CDentry
*dn
= trace
[trace
.size()-1];
6208 dout(10) << " dn " << *dn
<< dendl
;
6211 assert(mdr
->straydn
);
6212 CDentry
*straydn
= mdr
->straydn
;
6213 dout(10) << " straydn " << *straydn
<< dendl
;
6215 mdr
->set_op_stamp(mdr
->slave_request
->op_stamp
);
6217 rmdir_rollback rollback
;
6218 rollback
.reqid
= mdr
->reqid
;
6219 rollback
.src_dir
= dn
->get_dir()->dirfrag();
6220 rollback
.src_dname
= std::string(dn
->get_name());
6221 rollback
.dest_dir
= straydn
->get_dir()->dirfrag();
6222 rollback
.dest_dname
= std::string(straydn
->get_name());
6223 ::encode(rollback
, mdr
->more()->rollback_bl
);
6224 dout(20) << " rollback is " << mdr
->more()->rollback_bl
.length() << " bytes" << dendl
;
6226 // set up commit waiter
6227 mdr
->more()->slave_commit
= new C_MDS_SlaveRmdirCommit(this, mdr
, straydn
);
6229 if (!in
->has_subtree_root_dirfrag(mds
->get_nodeid())) {
6230 dout(10) << " no auth subtree in " << *in
<< ", skipping journal" << dendl
;
6231 dn
->get_dir()->unlink_inode(dn
);
6232 straydn
->get_dir()->link_primary_inode(straydn
, in
);
6234 assert(straydn
->first
>= in
->first
);
6235 in
->first
= straydn
->first
;
6237 mdcache
->adjust_subtree_after_rename(in
, dn
->get_dir(), false);
6239 MMDSSlaveRequest
*reply
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
6240 MMDSSlaveRequest::OP_RMDIRPREPACK
);
6241 reply
->mark_not_journaled();
6242 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
6244 // send caps to auth (if we're not already)
6245 if (in
->is_any_caps() && !in
->state_test(CInode::STATE_EXPORTINGCAPS
))
6246 mdcache
->migrator
->export_caps(in
);
6248 mdcache
->touch_dentry_bottom(straydn
); // move stray to end of lru
6250 mdr
->slave_request
->put();
6251 mdr
->slave_request
= 0;
6256 straydn
->push_projected_linkage(in
);
6257 dn
->push_projected_linkage();
6259 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rmdir", mdr
->reqid
, mdr
->slave_to_mds
,
6260 ESlaveUpdate::OP_PREPARE
, ESlaveUpdate::RMDIR
);
6261 mdlog
->start_entry(le
);
6262 le
->rollback
= mdr
->more()->rollback_bl
;
6264 le
->commit
.add_dir_context(straydn
->get_dir());
6265 le
->commit
.add_primary_dentry(straydn
, in
, true);
6266 // slave: no need to journal original dentry
6268 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
6269 le
->commit
.renamed_dirino
= in
->ino();
6271 mdcache
->project_subtree_rename(in
, dn
->get_dir(), straydn
->get_dir());
6273 mdr
->more()->slave_update_journaled
= true;
6274 submit_mdlog_entry(le
, new C_MDS_SlaveRmdirPrep(this, mdr
, dn
, straydn
),
6279 void Server::_logged_slave_rmdir(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
)
6281 dout(10) << "_logged_slave_rmdir " << *mdr
<< " on " << *dn
<< dendl
;
6283 // update our cache now, so we are consistent with what is in the journal
6284 // when we journal a subtree map
6285 CInode
*in
= dn
->get_linkage()->get_inode();
6286 dn
->get_dir()->unlink_inode(dn
);
6287 straydn
->pop_projected_linkage();
6288 dn
->pop_projected_linkage();
6289 mdcache
->adjust_subtree_after_rename(in
, dn
->get_dir(), true);
6292 mdr
->slave_request
->put();
6293 mdr
->slave_request
= 0;
6296 if (!mdr
->aborted
) {
6297 MMDSSlaveRequest
*reply
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
6298 MMDSSlaveRequest::OP_RMDIRPREPACK
);
6299 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
6301 dout(10) << " abort flag set, finishing" << dendl
;
6302 mdcache
->request_finish(mdr
);
6306 void Server::handle_slave_rmdir_prep_ack(MDRequestRef
& mdr
, MMDSSlaveRequest
*ack
)
6308 dout(10) << "handle_slave_rmdir_prep_ack " << *mdr
6309 << " " << *ack
<< dendl
;
6311 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
6313 mdr
->more()->slaves
.insert(from
);
6314 mdr
->more()->witnessed
.insert(from
);
6315 if (!ack
->is_not_journaled())
6316 mdr
->more()->has_journaled_slaves
= true;
6318 // remove from waiting list
6319 assert(mdr
->more()->waiting_on_slave
.count(from
));
6320 mdr
->more()->waiting_on_slave
.erase(from
);
6322 if (mdr
->more()->waiting_on_slave
.empty())
6323 dispatch_client_request(mdr
); // go again!
6325 dout(10) << "still waiting on slaves " << mdr
->more()->waiting_on_slave
<< dendl
;
6328 void Server::_commit_slave_rmdir(MDRequestRef
& mdr
, int r
, CDentry
*straydn
)
6330 dout(10) << "_commit_slave_rmdir " << *mdr
<< " r=" << r
<< dendl
;
6333 if (mdr
->more()->slave_update_journaled
) {
6334 CInode
*strayin
= straydn
->get_projected_linkage()->get_inode();
6335 if (strayin
&& !strayin
->snaprealm
)
6336 mdcache
->clear_dirty_bits_for_stray(strayin
);
6341 if (mdr
->more()->slave_update_journaled
) {
6342 // write a commit to the journal
6343 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rmdir_commit", mdr
->reqid
,
6344 mdr
->slave_to_mds
, ESlaveUpdate::OP_COMMIT
,
6345 ESlaveUpdate::RMDIR
);
6346 mdlog
->start_entry(le
);
6347 submit_mdlog_entry(le
, new C_MDS_CommittedSlave(this, mdr
), mdr
, __func__
);
6350 _committed_slave(mdr
);
6354 do_rmdir_rollback(mdr
->more()->rollback_bl
, mdr
->slave_to_mds
, mdr
);
// Log-completion context for a journaled rmdir rollback: once the
// ESlaveUpdate::OP_ROLLBACK entry is durable, finish undoing the rmdir.
// NOTE(review): the member declarations (reqid/dn/straydn fields,
// access specifiers) from the original are missing in this extraction.
6358 struct C_MDS_LoggedRmdirRollback
: public ServerLogContext
{
6362 C_MDS_LoggedRmdirRollback(Server
*s
, MDRequestRef
& m
, metareqid_t mr
, CDentry
*d
, CDentry
*st
)
6363 : ServerLogContext(s
, m
), reqid(mr
), dn(d
), straydn(st
) {}
6364 void finish(int r
) override
{
// relink the inode back under dn and clean up the stray dentry
6365 server
->_rmdir_rollback_finish(mdr
, reqid
, dn
, straydn
);
// Undo a prepared (possibly journaled) slave rmdir: move the inode from
// the stray dentry back to its original dentry, re-adjusting subtrees.
// Called either with the originating MDRequest (abort path) or during
// resolve with a bare rollback blob.
// NOTE(review): the extraction has dropped several original lines
// (asserts, an if/else around the two get_dirfrag lookups, braces).
6369 void Server::do_rmdir_rollback(bufferlist
&rbl
, mds_rank_t master
, MDRequestRef
& mdr
)
6371 // unlike the other rollback methods, the rmdir rollback is only
6372 // needed to record the subtree changes in the journal for inode
6373 // replicas who are auth for empty dirfrags. no actual changes to
6374 // the file system are taking place here, so there is no Mutation.
6376 rmdir_rollback rollback
;
6377 bufferlist::iterator p
= rbl
.begin();
6378 ::decode(rollback
, p
);
6380 dout(10) << "do_rmdir_rollback on " << rollback
.reqid
<< dendl
;
6381 mdcache
->add_rollback(rollback
.reqid
, master
); // need to finish this update before resolve finishes
6382 assert(mdr
|| mds
->is_resolve());
// locate the original (source) dirfrag; fall back to ino+dname lookup
6384 CDir
*dir
= mdcache
->get_dirfrag(rollback
.src_dir
);
6386 dir
= mdcache
->get_dirfrag(rollback
.src_dir
.ino
, rollback
.src_dname
);
6388 CDentry
*dn
= dir
->lookup(rollback
.src_dname
);
6390 dout(10) << " dn " << *dn
<< dendl
;
// locate the stray (destination) dirfrag and dentry
6391 dir
= mdcache
->get_dirfrag(rollback
.dest_dir
);
6393 CDentry
*straydn
= dir
->lookup(rollback
.dest_dname
);
// NOTE(review): prints *dn under a "straydn" label; upstream prints
// *straydn here -- looks like a copy/paste slip, confirm before fixing.
6395 dout(10) << " straydn " << *dn
<< dendl
;
6396 CInode
*in
= straydn
->get_linkage()->get_inode();
// fast path: prepare was never journaled, so just relink in memory
6398 if (mdr
&& !mdr
->more()->slave_update_journaled
) {
6399 assert(!in
->has_subtree_root_dirfrag(mds
->get_nodeid()));
6401 straydn
->get_dir()->unlink_inode(straydn
);
6402 dn
->get_dir()->link_primary_inode(dn
, in
);
6404 mdcache
->adjust_subtree_after_rename(in
, straydn
->get_dir(), false);
6406 mdcache
->request_finish(mdr
);
6407 mdcache
->finish_rollback(rollback
.reqid
);
// journaled path: project the relink and write an OP_ROLLBACK entry
6411 dn
->push_projected_linkage(in
);
6412 straydn
->push_projected_linkage();
6414 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rmdir_rollback", rollback
.reqid
, master
,
6415 ESlaveUpdate::OP_ROLLBACK
, ESlaveUpdate::RMDIR
);
6416 mdlog
->start_entry(le
);
6418 le
->commit
.add_dir_context(dn
->get_dir());
6419 le
->commit
.add_primary_dentry(dn
, in
, true);
6420 // slave: no need to journal straydn
6422 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
// record the rename so replay can open the moved dir's bounds
6423 le
->commit
.renamed_dirino
= in
->ino();
6425 mdcache
->project_subtree_rename(in
, straydn
->get_dir(), dn
->get_dir());
6427 submit_mdlog_entry(le
,
6428 new C_MDS_LoggedRmdirRollback(this, mdr
,rollback
.reqid
,
// Completion of a journaled rmdir rollback: apply the projected
// linkages (inode back under dn, stray dentry null), fix subtrees,
// and release the rollback bookkeeping.
// NOTE(review): closing braces and an if(mdr) guard around
// request_finish appear to be missing from this extraction.
6434 void Server::_rmdir_rollback_finish(MDRequestRef
& mdr
, metareqid_t reqid
, CDentry
*dn
, CDentry
*straydn
)
6436 dout(10) << "_rmdir_rollback_finish " << reqid
<< dendl
;
6438 straydn
->get_dir()->unlink_inode(straydn
);
6439 dn
->pop_projected_linkage();
6440 straydn
->pop_projected_linkage();
6442 CInode
*in
= dn
->get_linkage()->get_inode();
// 'true' = pop the projected subtree rename recorded earlier
6443 mdcache
->adjust_subtree_after_rename(in
, straydn
->get_dir(), true);
6444 if (mds
->is_resolve()) {
// during resolve we may now be able to trim the non-auth subtree
6445 CDir
*root
= mdcache
->get_subtree_root(straydn
->get_dir());
6446 mdcache
->try_trim_non_auth_subtree(root
);
6450 mdcache
->request_finish(mdr
);
6452 mdcache
->finish_rollback(reqid
);
6456 /** _dir_is_nonempty[_unlocked]
6458 * check if a directory is non-empty (i.e. we can rmdir it).
6460 * the unlocked variant is a fastpath check. we can't really be
6461 * sure until we rdlock the filelock.
// Fast, lock-free heuristic: is this directory obviously non-empty?
// Returns true if the dir is in a snapshot or any auth dirfrag's
// projected fragstat reports items. Not authoritative -- callers
// re-check with _dir_is_nonempty() under the filelock rdlock.
// NOTE(review): the declaration of 'ls', the 'CDir *dir = *p' line
// and the return statements are missing from this extraction.
6463 bool Server::_dir_is_nonempty_unlocked(MDRequestRef
& mdr
, CInode
*in
)
6465 dout(10) << "dir_is_nonempty_unlocked " << *in
<< dendl
;
6466 assert(in
->is_auth());
// directories captured by a snapshot cannot be removed
6468 if (in
->snaprealm
&& in
->snaprealm
->srnode
.snaps
.size())
6469 return true; // in a snapshot!
6472 in
->get_dirfrags(ls
);
6473 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
6475 // is the frag obviously non-empty?
6476 if (dir
->is_auth()) {
6477 if (dir
->get_projected_fnode()->fragstat
.size()) {
6478 dout(10) << "dir_is_nonempty_unlocked dirstat has "
6479 << dir
->get_projected_fnode()->fragstat
.size() << " items " << *dir
<< dendl
;
// Authoritative non-empty check, valid only while the caller holds a
// readable filelock on 'in'. Sums per-frag fragstats (preferring
// accounted stats that match the inode's dirstat version) and compares
// against the inode-level dirstat size.
// NOTE(review): the 'ls' declaration, 'CDir *dir = *p', early-return
// and closing braces are missing from this extraction.
6488 bool Server::_dir_is_nonempty(MDRequestRef
& mdr
, CInode
*in
)
6490 dout(10) << "dir_is_nonempty " << *in
<< dendl
;
6491 assert(in
->is_auth());
// contract: caller already rdlocked the filelock for this client
6492 assert(in
->filelock
.can_read(mdr
->get_client()));
6494 frag_info_t dirstat
;
6495 version_t dirstat_version
= in
->get_projected_inode()->dirstat
.version
;
6498 in
->get_dirfrags(ls
);
6499 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
6501 const fnode_t
*pf
= dir
->get_projected_fnode();
// any frag with items means non-empty
6502 if (pf
->fragstat
.size()) {
6503 dout(10) << "dir_is_nonempty dirstat has "
6504 << pf
->fragstat
.size() << " items " << *dir
<< dendl
;
// accumulate: use accounted stats only if they match the dirstat version
6508 if (pf
->accounted_fragstat
.version
== dirstat_version
)
6509 dirstat
.add(pf
->accounted_fragstat
);
6511 dirstat
.add(pf
->fragstat
);
// mismatch between summed frag stats and the inode dirstat => non-empty
6514 return dirstat
.size() != in
->get_projected_inode()->dirstat
.size();
6518 // ======================================================
// Log-completion context for a client rename: once the EUpdate is
// durable, apply the rename and reply to the client.
// NOTE(review): the member declarations (srcdn/destdn/straydn fields,
// access specifiers) are missing from this extraction.
6521 class C_MDS_rename_finish
: public ServerLogContext
{
6526 C_MDS_rename_finish(Server
*s
, MDRequestRef
& r
,
6527 CDentry
*sdn
, CDentry
*ddn
, CDentry
*stdn
) :
6528 ServerLogContext(s
, r
),
6529 srcdn(sdn
), destdn(ddn
), straydn(stdn
) { }
6530 void finish(int r
) override
{
6532 server
->_rename_finish(mdr
, srcdn
, destdn
, straydn
);
6537 /** handle_client_rename
6539 * rename master is the destdn auth. this is because cached inodes
6540 * must remain connected. thus, any replica of srci, must also
6541 * replicate destdn, and possibly straydn, so that srci (and
6542 * destdn->inode) remain connected during the rename.
6544 * to do this, we freeze srci, then master (destdn auth) verifies that
6545 * all other nodes have also replicated destdn and straydn. note that
6546 * destdn replicas need not also replicate srci. this only works when
6549 * This function takes responsibility for the passed mdr.
// Entry point for a client rename. The MDS auth for destdn acts as
// master: it validates paths, gathers witnesses/locks across the
// cluster, then journals an EUpdate and replies via C_MDS_rename_finish.
// NOTE(review): this extraction has dropped many original lines
// (returns after respond_to_request, braces, else keywords, some
// conditions); do not edit the control flow without the upstream file.
6551 void Server::handle_client_rename(MDRequestRef
& mdr
)
6553 MClientRequest
*req
= mdr
->client_request
;
6554 dout(7) << "handle_client_rename " << *req
<< dendl
;
6556 filepath destpath
= req
->get_filepath();
6557 filepath srcpath
= req
->get_filepath2();
// both paths must name a dentry
6558 if (destpath
.depth() == 0 || srcpath
.depth() == 0) {
6559 respond_to_request(mdr
, -EINVAL
);
6562 boost::string_view destname
= destpath
.last_dentry();
6564 vector
<CDentry
*>& srctrace
= mdr
->dn
[1];
6565 vector
<CDentry
*>& desttrace
= mdr
->dn
[0];
6567 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
// resolve + lock the destination dentry (may create a null dentry)
6569 CDentry
*destdn
= rdlock_path_xlock_dentry(mdr
, 0, rdlocks
, wrlocks
, xlocks
, true, false, true);
6570 if (!destdn
) return;
6571 dout(10) << " destdn " << *destdn
<< dendl
;
// renames inside snapshots are not allowed
6572 if (mdr
->snapid
!= CEPH_NOSNAP
) {
6573 respond_to_request(mdr
, -EROFS
);
6576 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
6577 CDir
*destdir
= destdn
->get_dir();
6578 assert(destdir
->is_auth());
// traverse to the source dentry, discovering remote pieces as needed
6580 int r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, srcpath
, &srctrace
, NULL
, MDS_TRAVERSE_DISCOVER
);
6585 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
6586 mdcache
->find_ino_peers(srcpath
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
6588 dout(10) << "FAIL on error " << r
<< dendl
;
6589 respond_to_request(mdr
, r
);
6594 assert(!srctrace
.empty());
6595 CDentry
*srcdn
= srctrace
[srctrace
.size()-1];
6596 dout(10) << " srcdn " << *srcdn
<< dendl
;
6597 if (srcdn
->last
!= CEPH_NOSNAP
) {
6598 respond_to_request(mdr
, -EROFS
);
6601 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
6602 CInode
*srci
= srcdnl
->get_inode();
6603 dout(10) << " srci " << *srci
<< dendl
;
6606 if (!destdnl
->is_null()) {
6607 //dout(10) << "dest dn exists " << *destdn << dendl;
6608 oldin
= mdcache
->get_dentry_inode(destdn
, mdr
, true);
6610 dout(10) << " oldin " << *oldin
<< dendl
;
6612 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
6613 if (oldin
->is_dir() && _dir_is_nonempty_unlocked(mdr
, oldin
)) {
6614 respond_to_request(mdr
, -ENOTEMPTY
);
6618 // if srcdn is replica, need to make sure its linkage is correct
6619 if (srcdn
->is_auth() ||
6620 srcdn
->lock
.can_read(mdr
->get_client()) ||
6621 (srcdn
->lock
.is_xlocked() && srcdn
->lock
.get_xlock_by() == mdr
)) {
6622 // mv /some/thing /to/some/existing_other_thing
6623 if (oldin
->is_dir() && !srci
->is_dir()) {
6624 respond_to_request(mdr
, -EISDIR
);
6627 if (!oldin
->is_dir() && srci
->is_dir()) {
6628 respond_to_request(mdr
, -ENOTDIR
);
6631 if (srci
== oldin
&& !srcdn
->get_dir()->inode
->is_stray()) {
6632 respond_to_request(mdr
, 0); // no-op. POSIX makes no sense.
6638 // -- some sanity checks --
6640 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
6641 if (destpath
.get_ino() != srcpath
.get_ino() &&
6642 !(req
->get_source().is_mds() &&
6643 MDS_INO_IS_MDSDIR(srcpath
.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
6644 CInode
*srcbase
= srctrace
[0]->get_dir()->get_inode();
6645 CInode
*destbase
= desttrace
[0]->get_dir()->get_inode();
6646 // ok, extend srctrace toward root until it is an ancestor of desttrace.
6647 while (srcbase
!= destbase
&&
6648 !srcbase
->is_projected_ancestor_of(destbase
)) {
6649 CDentry
*pdn
= srcbase
->get_projected_parent_dn();
6650 srctrace
.insert(srctrace
.begin(), pdn
);
6651 dout(10) << "rename prepending srctrace with " << *pdn
<< dendl
;
6652 srcbase
= pdn
->get_dir()->get_inode();
6655 // then, extend destpath until it shares the same parent inode as srcpath.
6656 while (destbase
!= srcbase
) {
6657 CDentry
*pdn
= destbase
->get_projected_parent_dn();
6658 desttrace
.insert(desttrace
.begin(), pdn
);
6659 rdlocks
.insert(&pdn
->lock
);
6660 dout(10) << "rename prepending desttrace with " << *pdn
<< dendl
;
6661 destbase
= pdn
->get_dir()->get_inode();
6663 dout(10) << "rename src and dest traces now share common ancestor " << *destbase
<< dendl
;
// renaming something onto itself (same dir + name) is a no-op
6667 if (srcdn
->get_dir() == destdir
&& srcdn
->get_name() == destname
) {
6668 dout(7) << "rename src=dest, noop" << dendl
;
6669 respond_to_request(mdr
, 0);
6673 // dest a child of src?
6674 // e.g. mv /usr /usr/foo
6675 CDentry
*pdn
= destdir
->inode
->get_projected_parent_dn();
6678 dout(7) << "cannot rename item to be a child of itself" << dendl
;
6679 respond_to_request(mdr
, -EINVAL
);
6682 pdn
= pdn
->get_dir()->inode
->parent
;
6685 // is this a stray migration, reintegration or merge? (sanity checks!)
6686 if (mdr
->reqid
.name
.is_mds() &&
6687 !(MDS_INO_IS_MDSDIR(srcpath
.get_ino()) &&
6688 MDS_INO_IS_MDSDIR(destpath
.get_ino())) &&
6689 !(destdnl
->is_remote() &&
6690 destdnl
->get_remote_ino() == srci
->ino())) {
6691 respond_to_request(mdr
, -EINVAL
); // actually, this won't reply, but whatev.
// linkmerge: src and dest already point at the same inode
6695 bool linkmerge
= (srcdnl
->get_inode() == destdnl
->get_inode() &&
6696 (srcdnl
->is_primary() || destdnl
->is_primary()));
6698 dout(10) << " this is a link merge" << dendl
;
6700 // -- create stray dentry? --
6701 CDentry
*straydn
= NULL
;
6702 if (destdnl
->is_primary() && !linkmerge
) {
6703 straydn
= prepare_stray_dentry(mdr
, destdnl
->get_inode());
6706 dout(10) << " straydn is " << *straydn
<< dendl
;
6707 } else if (mdr
->straydn
) {
6708 mdr
->unpin(mdr
->straydn
);
6709 mdr
->straydn
= NULL
;
6712 // -- prepare witness list --
6714 * NOTE: we use _all_ replicas as witnesses.
6715 * this probably isn't totally necessary (esp for file renames),
6716 * but if/when we change that, we have to make sure rejoin is
6717 * sufficiently robust to handle strong rejoins from survivors
6718 * with totally wrong dentry->inode linkage.
6719 * (currently, it can ignore rename effects, because the resolve
6720 * stage will sort them out.)
6722 set
<mds_rank_t
> witnesses
= mdr
->more()->extra_witnesses
;
6723 if (srcdn
->is_auth())
6724 srcdn
->list_replicas(witnesses
);
6726 witnesses
.insert(srcdn
->authority().first
);
6727 if (srcdnl
->is_remote() && !srci
->is_auth())
6728 witnesses
.insert(srci
->authority().first
);
6729 destdn
->list_replicas(witnesses
);
6730 if (destdnl
->is_remote() && !oldin
->is_auth())
6731 witnesses
.insert(oldin
->authority().first
);
6732 dout(10) << " witnesses " << witnesses
<< ", have " << mdr
->more()->witnessed
<< dendl
;
6736 map
<SimpleLock
*, mds_rank_t
> remote_wrlocks
;
6738 // srctrace items. this mirrors locks taken in rdlock_path_xlock_dentry
6739 for (int i
=0; i
<(int)srctrace
.size(); i
++)
6740 rdlocks
.insert(&srctrace
[i
]->lock
);
6741 xlocks
.insert(&srcdn
->lock
);
6742 mds_rank_t srcdirauth
= srcdn
->get_dir()->authority().first
;
// scatterlocks on a remote src dir must be wrlocked at their auth
6743 if (srcdirauth
!= mds
->get_nodeid()) {
6744 dout(10) << " will remote_wrlock srcdir scatterlocks on mds." << srcdirauth
<< dendl
;
6745 remote_wrlocks
[&srcdn
->get_dir()->inode
->filelock
] = srcdirauth
;
6746 remote_wrlocks
[&srcdn
->get_dir()->inode
->nestlock
] = srcdirauth
;
6748 rdlocks
.insert(&srci
->dirfragtreelock
);
6750 wrlocks
.insert(&srcdn
->get_dir()->inode
->filelock
);
6751 wrlocks
.insert(&srcdn
->get_dir()->inode
->nestlock
);
6753 mds
->locker
->include_snap_rdlocks(rdlocks
, srcdn
->get_dir()->inode
);
6757 wrlocks
.insert(&straydn
->get_dir()->inode
->filelock
);
6758 wrlocks
.insert(&straydn
->get_dir()->inode
->nestlock
);
6759 xlocks
.insert(&straydn
->lock
);
6762 // xlock versionlock on dentries if there are witnesses.
6763 // replicas can't see projected dentry linkages, and will get
6764 // confused if we try to pipeline things.
6765 if (!witnesses
.empty()) {
6766 // take xlock on all projected ancestor dentries for srcdn and destdn.
6767 // this ensures the srcdn and destdn can be traversed to by the witnesses.
6768 for (int i
= 0; i
<(int)srctrace
.size(); i
++) {
6769 if (srctrace
[i
]->is_auth() && srctrace
[i
]->is_projected())
6770 xlocks
.insert(&srctrace
[i
]->versionlock
);
6772 for (int i
=0; i
<(int)desttrace
.size(); i
++) {
6773 if (desttrace
[i
]->is_auth() && desttrace
[i
]->is_projected())
6774 xlocks
.insert(&desttrace
[i
]->versionlock
);
6776 // xlock srci and oldin's primary dentries, so witnesses can call
6777 // open_remote_ino() with 'want_locked=true' when the srcdn or destdn
6779 if (srcdnl
->is_remote())
6780 xlocks
.insert(&srci
->get_projected_parent_dn()->lock
);
6781 if (destdnl
->is_remote())
6782 xlocks
.insert(&oldin
->get_projected_parent_dn()->lock
);
6785 // we need to update srci's ctime. xlock its least contended lock to do that...
6786 xlocks
.insert(&srci
->linklock
);
6788 // xlock oldin (for nlink--)
6790 xlocks
.insert(&oldin
->linklock
);
6791 if (oldin
->is_dir())
6792 rdlocks
.insert(&oldin
->filelock
);
6794 if (srcdnl
->is_primary() && srci
->is_dir())
6795 // FIXME: this should happen whenever we are renaming between
6796 // realms, regardless of the file type
6797 // FIXME: If/when this changes, make sure to update the
6798 // "allowance" in handle_slave_rename_prep
6799 xlocks
.insert(&srci
->snaplock
); // FIXME: an auth bcast could be sufficient?
6801 rdlocks
.insert(&srci
->snaplock
);
// if srcdn is a remote primary dentry, the inode must be frozen at its auth
6803 CInode
*auth_pin_freeze
= !srcdn
->is_auth() && srcdnl
->is_primary() ? srci
: NULL
;
6804 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
,
6805 &remote_wrlocks
, auth_pin_freeze
))
// only do access checks on the first pass (before any witness acked)
6808 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
6809 if (!check_access(mdr
, srcdn
->get_dir()->get_inode(), MAY_WRITE
))
6812 if (!check_access(mdr
, destdn
->get_dir()->get_inode(), MAY_WRITE
))
6815 if (!check_fragment_space(mdr
, destdn
->get_dir()))
6818 if (!check_access(mdr
, srci
, MAY_WRITE
))
6822 // with read lock, really verify oldin is empty
6825 _dir_is_nonempty(mdr
, oldin
)) {
6826 respond_to_request(mdr
, -ENOTEMPTY
);
6830 /* project_past_snaprealm_parent() will do this job
6832 // moving between snaprealms?
6833 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
6834 SnapRealm *srcrealm = srci->find_snaprealm();
6835 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
6836 if (srcrealm != destrealm &&
6837 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
6838 destrealm->get_newest_seq() + 1 > srcdn->first)) {
6839 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
6840 mdcache->snaprealm_create(mdr, srci);
6846 assert(g_conf
->mds_kill_rename_at
!= 1);
6848 // -- open all srcdn inode frags, if any --
6849 // we need these open so that auth can properly delegate from inode to dirfrags
6850 // after the inode is _ours_.
6851 if (srcdnl
->is_primary() &&
6852 !srcdn
->is_auth() &&
6854 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl
;
6855 mdr
->set_stickydirs(srci
);
6858 srci
->dirfragtree
.get_leaves(frags
);
6859 for (list
<frag_t
>::iterator p
= frags
.begin();
6862 CDir
*dir
= srci
->get_dirfrag(*p
);
6864 dout(10) << " opening " << *p
<< " under " << *srci
<< dendl
;
6865 mdcache
->open_remote_dirfrag(srci
, *p
, new C_MDS_RetryRequest(mdcache
, mdr
));
6871 // -- prepare witnesses --
6873 // do srcdn auth last
6874 mds_rank_t last
= MDS_RANK_NONE
;
6875 if (!srcdn
->is_auth()) {
6876 last
= srcdn
->authority().first
;
6877 mdr
->more()->srcdn_auth_mds
= last
;
6878 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
6879 // are involved in the rename operation.
6880 if (srcdnl
->is_primary() && !mdr
->more()->is_ambiguous_auth
) {
6881 dout(10) << " preparing ambiguous auth for srci" << dendl
;
6882 assert(mdr
->more()->is_remote_frozen_authpin
);
6883 assert(mdr
->more()->rename_inode
== srci
);
6884 _rename_prepare_witness(mdr
, last
, witnesses
, srctrace
, desttrace
, straydn
);
// send prepare to every remaining witness (srcdn auth deferred to 'last')
6889 for (set
<mds_rank_t
>::iterator p
= witnesses
.begin();
6890 p
!= witnesses
.end();
6892 if (*p
== last
) continue; // do it last!
6893 if (mdr
->more()->witnessed
.count(*p
)) {
6894 dout(10) << " already witnessed by mds." << *p
<< dendl
;
6895 } else if (mdr
->more()->waiting_on_slave
.count(*p
)) {
6896 dout(10) << " already waiting on witness mds." << *p
<< dendl
;
6898 if (!_rename_prepare_witness(mdr
, *p
, witnesses
, srctrace
, desttrace
, straydn
))
6902 if (!mdr
->more()->waiting_on_slave
.empty())
6903 return; // we're waiting for a witness.
6905 if (last
!= MDS_RANK_NONE
&& mdr
->more()->witnessed
.count(last
) == 0) {
6906 dout(10) << " preparing last witness (srcdn auth)" << dendl
;
6907 assert(mdr
->more()->waiting_on_slave
.count(last
) == 0);
6908 _rename_prepare_witness(mdr
, last
, witnesses
, srctrace
, desttrace
, straydn
);
6912 // test hack: bail after slave does prepare, so we can verify it's _live_ rollback.
6913 if (!mdr
->more()->slaves
.empty() && !srci
->is_dir())
6914 assert(g_conf
->mds_kill_rename_at
!= 3);
6915 if (!mdr
->more()->slaves
.empty() && srci
->is_dir())
6916 assert(g_conf
->mds_kill_rename_at
!= 4);
6918 // -- declare now --
6919 mdr
->set_mds_stamp(ceph_clock_now());
6921 // -- prepare journal entry --
6922 mdr
->ls
= mdlog
->get_current_segment();
6923 EUpdate
*le
= new EUpdate(mdlog
, "rename");
6924 mdlog
->start_entry(le
);
6925 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
6926 if (!mdr
->more()->witnessed
.empty()) {
6927 dout(20) << " noting uncommitted_slaves " << mdr
->more()->witnessed
<< dendl
;
6929 le
->reqid
= mdr
->reqid
;
6930 le
->had_slaves
= true;
6932 mdcache
->add_uncommitted_master(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
6933 // no need to send frozen auth pin to recovering auth MDS of srci
6934 mdr
->more()->is_remote_frozen_authpin
= false;
6937 _rename_prepare(mdr
, &le
->metablob
, &le
->client_map
, srcdn
, destdn
, straydn
);
6938 if (le
->client_map
.length())
6939 le
->cmapv
= mds
->sessionmap
.get_projected();
6941 // -- commit locally --
6942 C_MDS_rename_finish
*fin
= new C_MDS_rename_finish(this, mdr
, srcdn
, destdn
, straydn
);
6944 journal_and_reply(mdr
, srci
, destdn
, le
, fin
);
// Journal-committed half of a client rename: apply the projected
// changes, notify replicas, update balancer popularity, and reply.
// NOTE(review): several original lines (braces, the need_eval /
// imported-caps handling around 6976-6984) are missing from this
// extraction.
6948 void Server::_rename_finish(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
6950 dout(10) << "_rename_finish " << *mdr
<< dendl
;
// master update is durable; let uncommitted-master tracking know
6952 if (!mdr
->more()->witnessed
.empty())
6953 mdcache
->logged_master_update(mdr
->reqid
);
6956 _rename_apply(mdr
, srcdn
, destdn
, straydn
);
6958 mdcache
->send_dentry_link(destdn
, mdr
);
6960 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
6961 CInode
*in
= destdnl
->get_inode();
6962 bool need_eval
= mdr
->more()->cap_imports
.count(in
);
6964 // test hack: test slave commit
6965 if (!mdr
->more()->slaves
.empty() && !in
->is_dir())
6966 assert(g_conf
->mds_kill_rename_at
!= 5);
6967 if (!mdr
->more()->slaves
.empty() && in
->is_dir())
6968 assert(g_conf
->mds_kill_rename_at
!= 6);
// feed the balancer: the rename touched both directories
6971 utime_t now
= ceph_clock_now();
6972 mds
->balancer
->hit_dir(now
, srcdn
->get_dir(), META_POP_IWR
);
6973 if (destdnl
->is_remote() && in
->is_auth())
6974 mds
->balancer
->hit_inode(now
, in
, META_POP_IWR
);
6976 // did we import srci? if so, explicitly ack that import, before we unlock and reply.
6978 assert(g_conf
->mds_kill_rename_at
!= 7);
6981 respond_to_request(mdr
, 0);
// re-evaluate cap state for the inode whose caps we imported
6984 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
, true);
6987 // respond_to_request() drops locks. So stray reintegration can race with us.
6988 if (straydn
&& !straydn
->get_projected_linkage()->is_null()) {
6989 mdcache
->notify_stray(straydn
);
// Send an OP_RENAMEPREP slave request to witness 'who', carrying the
// src/dest dentry paths, replicated stray dentry, srcdn auth, and the
// full witness set. Returns false (after arranging a retry) when the
// peer is not yet active; otherwise records it in waiting_on_slave.
// NOTE(review): the 'return false;'/'return true;' lines and some
// braces are missing from this extraction. The parameter name
// 'witnesse' matches upstream (historical typo in the original code).
6997 bool Server::_rename_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, set
<mds_rank_t
> &witnesse
,
6998 vector
<CDentry
*>& srctrace
, vector
<CDentry
*>& dsttrace
, CDentry
*straydn
)
// can't prepare against a peer that isn't up yet; wait for it
7000 if (mds
->is_cluster_degraded() &&
7001 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(who
)) {
7002 dout(10) << "_rename_prepare_witness mds." << who
<< " is not active" << dendl
;
7003 if (mdr
->more()->waiting_on_slave
.empty())
7004 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(mdcache
, mdr
));
7008 dout(10) << "_rename_prepare_witness mds." << who
<< dendl
;
7009 MMDSSlaveRequest
*req
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
7010 MMDSSlaveRequest::OP_RENAMEPREP
);
// encode both dentry traces as filepaths rooted at their dirfrag inos
7012 req
->srcdnpath
= filepath(srctrace
.front()->get_dir()->ino());
7013 for (auto dn
: srctrace
)
7014 req
->srcdnpath
.push_dentry(dn
->get_name());
7015 req
->destdnpath
= filepath(dsttrace
.front()->get_dir()->ino());
7016 for (auto dn
: dsttrace
)
7017 req
->destdnpath
.push_dentry(dn
->get_name());
7019 mdcache
->replicate_stray(straydn
, who
, req
->stray
);
7021 req
->srcdn_auth
= mdr
->more()->srcdn_auth_mds
;
7023 // srcdn auth will verify our current witness list is sufficient
7024 req
->witnesses
= witnesse
;
7026 req
->op_stamp
= mdr
->get_op_stamp();
7027 mds
->send_message_mds(req
, who
);
7029 assert(mdr
->more()->waiting_on_slave
.count(who
) == 0);
7030 mdr
->more()->waiting_on_slave
.insert(who
);
// When destdn's auth imports the renamed inode from srcdn's auth:
// decode the bundled client map (force-opening sessions) and the inode
// itself from the prepare-ack payload, then temporarily mark the inode
// !auth+clean so journaling of the prepare does not dirty it early.
// Returns the pre-import inode version (oldpv).
// NOTE(review): the 'return oldpv;' line and the undo of the auth/dirty
// hack are outside this extraction's visible lines.
7034 version_t
Server::_rename_prepare_import(MDRequestRef
& mdr
, CDentry
*srcdn
, bufferlist
*client_map_bl
)
7036 version_t oldpv
= mdr
->more()->inode_import_v
;
7038 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
7041 bufferlist::iterator blp
= mdr
->more()->inode_import
.begin();
// import the client sessions that hold caps on the inode
7044 map
<client_t
,entity_inst_t
> client_map
;
7045 decode(client_map
, blp
);
7046 prepare_force_open_sessions(client_map
, mdr
->more()->imported_session_map
);
7047 encode(client_map
, *client_map_bl
, mds
->mdsmap
->get_up_features());
7049 list
<ScatterLock
*> updated_scatterlocks
;
7050 mdcache
->migrator
->decode_import_inode(srcdn
, blp
, srcdn
->authority().first
, mdr
->ls
,
7051 mdr
->more()->cap_imports
, updated_scatterlocks
);
7053 // hack: force back to !auth and clean, temporarily
7054 srcdnl
->get_inode()->state_clear(CInode::STATE_AUTH
);
7055 srcdnl
->get_inode()->mark_clean();
// Decide whether a rename involving directory 'diri' must be journaled
// even on a non-auth MDS: true when any of diri's dirfrags is an auth
// subtree root here, or when a frag contains an auth subtree beneath it
// (journal replay needs those subtree bounds).
// NOTE(review): the 'ls' declaration, 'CDir *dir = *p', break
// statements, else keywords and braces are missing from this
// extraction; the 'empty' parameter's use is not visible here.
7060 bool Server::_need_force_journal(CInode
*diri
, bool empty
)
7063 diri
->get_dirfrags(ls
);
7065 bool force_journal
= false;
// pass 1: is any frag itself an auth subtree root on this rank?
7067 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
7068 if ((*p
)->is_subtree_root() && (*p
)->get_dir_auth().first
== mds
->get_nodeid()) {
7069 dout(10) << " frag " << (*p
)->get_frag() << " is auth subtree dirfrag, will force journal" << dendl
;
7070 force_journal
= true;
7073 dout(20) << " frag " << (*p
)->get_frag() << " is not auth subtree dirfrag" << dendl
;
7076 // see if any children of our frags are auth subtrees.
7077 list
<CDir
*> subtrees
;
7078 mdcache
->list_subtrees(subtrees
);
7079 dout(10) << " subtrees " << subtrees
<< " frags " << ls
<< dendl
;
// pass 2: does any frag contain an auth subtree somewhere below it?
7080 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
7082 for (list
<CDir
*>::iterator q
= subtrees
.begin(); q
!= subtrees
.end(); ++q
) {
7083 if (dir
->contains(*q
)) {
7084 if ((*q
)->get_dir_auth().first
== mds
->get_nodeid()) {
7085 dout(10) << " frag " << (*p
)->get_frag() << " contains (maybe) auth subtree, will force journal "
7087 force_journal
= true;
7090 dout(20) << " frag " << (*p
)->get_frag() << " contains but isn't auth for " << **q
<< dendl
;
7092 dout(20) << " frag " << (*p
)->get_frag() << " does not contain " << **q
<< dendl
;
7098 return force_journal
;
7101 void Server::_rename_prepare(MDRequestRef
& mdr
,
7102 EMetaBlob
*metablob
, bufferlist
*client_map_bl
,
7103 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
7105 dout(10) << "_rename_prepare " << *mdr
<< " " << *srcdn
<< " " << *destdn
<< dendl
;
7107 dout(10) << " straydn " << *straydn
<< dendl
;
7109 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
7110 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
7111 CInode
*srci
= srcdnl
->get_inode();
7112 CInode
*oldin
= destdnl
->get_inode();
7114 // primary+remote link merge?
7115 bool linkmerge
= (srci
== destdnl
->get_inode() &&
7116 (srcdnl
->is_primary() || destdnl
->is_primary()));
7117 bool silent
= srcdn
->get_dir()->inode
->is_stray();
7119 bool force_journal_dest
= false;
7120 if (srci
->is_dir() && !destdn
->is_auth()) {
7121 if (srci
->is_auth()) {
7122 // if we are auth for srci and exporting it, force journal because journal replay needs
7123 // the source inode to create auth subtrees.
7124 dout(10) << " we are exporting srci, will force journal destdn" << dendl
;
7125 force_journal_dest
= true;
7127 force_journal_dest
= _need_force_journal(srci
, false);
7130 bool force_journal_stray
= false;
7131 if (oldin
&& oldin
->is_dir() && straydn
&& !straydn
->is_auth())
7132 force_journal_stray
= _need_force_journal(oldin
, true);
7135 dout(10) << " merging remote and primary links to the same inode" << dendl
;
7137 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl
;
7138 if (force_journal_dest
)
7139 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl
;
7140 if (force_journal_stray
)
7141 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl
;
7143 if (srci
->is_dir() && (destdn
->is_auth() || force_journal_dest
)) {
7144 dout(10) << " noting renamed dir ino " << srci
->ino() << " in metablob" << dendl
;
7145 metablob
->renamed_dirino
= srci
->ino();
7146 } else if (oldin
&& oldin
->is_dir() && force_journal_stray
) {
7147 dout(10) << " noting rename target dir " << oldin
->ino() << " in metablob" << dendl
;
7148 metablob
->renamed_dirino
= oldin
->ino();
7152 CInode::mempool_inode
*spi
= 0; // renamed inode
7153 CInode::mempool_inode
*tpi
= 0; // target/overwritten inode
7157 if (destdnl
->is_primary()) {
7158 assert(straydn
); // moving to straydn.
7159 // link--, and move.
7160 if (destdn
->is_auth()) {
7161 auto &pi
= oldin
->project_inode(); //project_snaprealm
7162 pi
.inode
.version
= straydn
->pre_dirty(pi
.inode
.version
);
7163 pi
.inode
.update_backtrace();
7166 straydn
->push_projected_linkage(oldin
);
7167 } else if (destdnl
->is_remote()) {
7169 if (oldin
->is_auth()) {
7170 auto &pi
= oldin
->project_inode();
7171 pi
.inode
.version
= oldin
->pre_dirty();
7178 if (srcdnl
->is_remote()) {
7181 if (destdn
->is_auth())
7182 mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty();
7183 destdn
->push_projected_linkage(srcdnl
->get_remote_ino(), srcdnl
->get_remote_d_type());
7185 if (srci
->is_auth()) {
7186 auto &pi
= srci
->project_inode();
7187 pi
.inode
.version
= srci
->pre_dirty();
7191 dout(10) << " will merge remote onto primary link" << dendl
;
7192 if (destdn
->is_auth()) {
7193 auto &pi
= oldin
->project_inode();
7194 pi
.inode
.version
= mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty(oldin
->inode
.version
);
7199 if (destdn
->is_auth()) {
7201 if (srcdn
->is_auth())
7202 oldpv
= srci
->get_projected_version();
7204 oldpv
= _rename_prepare_import(mdr
, srcdn
, client_map_bl
);
7206 // note which dirfrags have child subtrees in the journal
7207 // event, so that we can open those (as bounds) during replay.
7208 if (srci
->is_dir()) {
7210 srci
->get_dirfrags(ls
);
7211 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
7213 if (!dir
->is_auth())
7214 metablob
->renamed_dir_frags
.push_back(dir
->get_frag());
7216 dout(10) << " noting renamed dir open frags " << metablob
->renamed_dir_frags
<< dendl
;
7219 auto &pi
= srci
->project_inode(); // project snaprealm if srcdnl->is_primary
7220 // & srcdnl->snaprealm
7221 pi
.inode
.version
= mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty(oldpv
);
7222 pi
.inode
.update_backtrace();
7225 destdn
->push_projected_linkage(srci
);
7229 if (srcdn
->is_auth())
7230 mdr
->more()->pvmap
[srcdn
] = srcdn
->pre_dirty();
7231 srcdn
->push_projected_linkage(); // push null linkage
7235 spi
->ctime
= spi
->rstat
.rctime
= mdr
->get_op_stamp();
7241 tpi
->ctime
= tpi
->rstat
.rctime
= mdr
->get_op_stamp();
7245 destdn
->make_path_string(t
, true);
7246 tpi
->stray_prior_path
= mempool::mds_co::string(boost::string_view(t
));
7249 if (tpi
->nlink
== 0)
7250 oldin
->state_set(CInode::STATE_ORPHAN
);
7254 // prepare nesting, mtime updates
7255 int predirty_dir
= silent
? 0:PREDIRTY_DIR
;
7257 // guarantee stray dir is processed first during journal replay. unlink the old inode,
7258 // then link the source inode to destdn
7259 if (destdnl
->is_primary()) {
7261 if (straydn
->is_auth()) {
7262 metablob
->add_dir_context(straydn
->get_dir());
7263 metablob
->add_dir(straydn
->get_dir(), true);
7268 if (destdn
->is_auth() && !destdnl
->is_null()) {
7269 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, destdn
->get_dir(),
7270 (destdnl
->is_primary() ? PREDIRTY_PRIMARY
:0)|predirty_dir
, -1);
7271 if (destdnl
->is_primary()) {
7273 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, straydn
->get_dir(),
7274 PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
7279 int predirty_primary
= (srcdnl
->is_primary() && srcdn
->get_dir() != destdn
->get_dir()) ? PREDIRTY_PRIMARY
:0;
7280 int flags
= predirty_dir
| predirty_primary
;
7281 if (srcdn
->is_auth())
7282 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, srcdn
->get_dir(), PREDIRTY_SHALLOW
|flags
, -1);
7283 if (destdn
->is_auth())
7284 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, destdn
->get_dir(), flags
, 1);
7286 SnapRealm
*src_realm
= srci
->find_snaprealm();
7287 SnapRealm
*dest_realm
= destdn
->get_dir()->inode
->find_snaprealm();
7288 snapid_t next_dest_snap
= dest_realm
->get_newest_seq() + 1;
7290 // add it all to the metablob
7293 if (destdnl
->is_primary()) {
7295 if (destdn
->is_auth()) {
7296 // project snaprealm, too
7297 if (oldin
->snaprealm
|| dest_realm
->get_newest_seq() + 1 > oldin
->get_oldest_snap())
7298 oldin
->project_past_snaprealm_parent(straydn
->get_dir()->inode
->find_snaprealm());
7299 straydn
->first
= MAX(oldin
->first
, next_dest_snap
);
7300 metablob
->add_primary_dentry(straydn
, oldin
, true, true);
7301 } else if (force_journal_stray
) {
7302 dout(10) << " forced journaling straydn " << *straydn
<< dendl
;
7303 metablob
->add_dir_context(straydn
->get_dir());
7304 metablob
->add_primary_dentry(straydn
, oldin
, true);
7306 } else if (destdnl
->is_remote()) {
7307 if (oldin
->is_auth()) {
7309 metablob
->add_dir_context(oldin
->get_projected_parent_dir());
7310 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, oldin
->get_projected_parent_dn(),
7311 CEPH_NOSNAP
, 0, destdnl
);
7312 metablob
->add_primary_dentry(oldin
->get_projected_parent_dn(), oldin
, true);
7318 if (srcdnl
->is_remote()) {
7320 if (destdn
->is_auth() && !destdnl
->is_null())
7321 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
7323 destdn
->first
= MAX(destdn
->first
, next_dest_snap
);
7325 if (destdn
->is_auth())
7326 metablob
->add_remote_dentry(destdn
, true, srcdnl
->get_remote_ino(), srcdnl
->get_remote_d_type());
7327 if (srci
->get_projected_parent_dn()->is_auth()) { // it's remote
7328 metablob
->add_dir_context(srci
->get_projected_parent_dir());
7329 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, srci
->get_projected_parent_dn(), CEPH_NOSNAP
, 0, srcdnl
);
7330 metablob
->add_primary_dentry(srci
->get_projected_parent_dn(), srci
, true);
7333 if (destdn
->is_auth() && !destdnl
->is_null())
7334 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
7336 destdn
->first
= MAX(destdn
->first
, next_dest_snap
);
7338 if (destdn
->is_auth())
7339 metablob
->add_primary_dentry(destdn
, destdnl
->get_inode(), true, true);
7341 } else if (srcdnl
->is_primary()) {
7342 // project snap parent update?
7343 if (destdn
->is_auth() && src_realm
!= dest_realm
&&
7344 (srci
->snaprealm
|| src_realm
->get_newest_seq() + 1 > srci
->get_oldest_snap()))
7345 srci
->project_past_snaprealm_parent(dest_realm
);
7347 if (destdn
->is_auth() && !destdnl
->is_null())
7348 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
7350 destdn
->first
= MAX(destdn
->first
, next_dest_snap
);
7352 if (destdn
->is_auth())
7353 metablob
->add_primary_dentry(destdn
, srci
, true, true);
7354 else if (force_journal_dest
) {
7355 dout(10) << " forced journaling destdn " << *destdn
<< dendl
;
7356 metablob
->add_dir_context(destdn
->get_dir());
7357 metablob
->add_primary_dentry(destdn
, srci
, true);
7358 if (srcdn
->is_auth() && srci
->is_dir()) {
7359 // journal new subtrees root dirfrags
7361 srci
->get_dirfrags(ls
);
7362 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
7365 metablob
->add_dir(dir
, true);
7372 if (srcdn
->is_auth()) {
7373 dout(10) << " journaling srcdn " << *srcdn
<< dendl
;
7374 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, srcdn
, CEPH_NOSNAP
, 0, srcdnl
);
7375 // also journal the inode in case we need do slave rename rollback. It is Ok to add
7376 // both primary and NULL dentries. Because during journal replay, null dentry is
7377 // processed after primary dentry.
7378 if (srcdnl
->is_primary() && !srci
->is_dir() && !destdn
->is_auth())
7379 metablob
->add_primary_dentry(srcdn
, srci
, true);
7380 metablob
->add_null_dentry(srcdn
, true);
7382 dout(10) << " NOT journaling srcdn " << *srcdn
<< dendl
;
7384 // make renamed inode first track the dn
7385 if (srcdnl
->is_primary() && destdn
->is_auth())
7386 srci
->first
= destdn
->first
;
7388 if (oldin
&& oldin
->is_dir()) {
7390 mdcache
->project_subtree_rename(oldin
, destdn
->get_dir(), straydn
->get_dir());
7393 mdcache
->project_subtree_rename(srci
, srcdn
->get_dir(), destdn
->get_dir());
7398 void Server::_rename_apply(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
7400 dout(10) << "_rename_apply " << *mdr
<< " " << *srcdn
<< " " << *destdn
<< dendl
;
7401 dout(10) << " pvs " << mdr
->more()->pvmap
<< dendl
;
7403 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
7404 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
7406 CInode
*oldin
= destdnl
->get_inode();
7408 // primary+remote link merge?
7409 bool linkmerge
= (srcdnl
->get_inode() == destdnl
->get_inode() &&
7410 (srcdnl
->is_primary() || destdnl
->is_primary()));
7414 if (destdnl
->is_primary()) {
7416 dout(10) << "straydn is " << *straydn
<< dendl
;
7417 destdn
->get_dir()->unlink_inode(destdn
, false);
7419 straydn
->pop_projected_linkage();
7420 if (mdr
->is_slave() && !mdr
->more()->slave_update_journaled
)
7421 assert(!straydn
->is_projected()); // no other projected
7423 mdcache
->touch_dentry_bottom(straydn
); // drop dn as quickly as possible.
7426 if (destdn
->is_auth()) {
7427 bool hadrealm
= (oldin
->snaprealm
? true : false);
7428 oldin
->pop_and_dirty_projected_inode(mdr
->ls
);
7429 if (oldin
->snaprealm
&& !hadrealm
)
7430 mdcache
->do_realm_invalidate_and_update_notify(oldin
, CEPH_SNAP_OP_SPLIT
);
7432 // FIXME this snaprealm is not filled out correctly
7433 //oldin->open_snaprealm(); might be sufficient..
7435 } else if (destdnl
->is_remote()) {
7436 destdn
->get_dir()->unlink_inode(destdn
, false);
7437 if (oldin
->is_auth())
7438 oldin
->pop_and_dirty_projected_inode(mdr
->ls
);
7442 // unlink src before we relink it at dest
7443 CInode
*in
= srcdnl
->get_inode();
7446 bool srcdn_was_remote
= srcdnl
->is_remote();
7447 srcdn
->get_dir()->unlink_inode(srcdn
);
7450 if (srcdn_was_remote
) {
7453 destdnl
= destdn
->pop_projected_linkage();
7454 if (mdr
->is_slave() && !mdr
->more()->slave_update_journaled
)
7455 assert(!destdn
->is_projected()); // no other projected
7457 destdn
->link_remote(destdnl
, in
);
7458 if (destdn
->is_auth())
7459 destdn
->mark_dirty(mdr
->more()->pvmap
[destdn
], mdr
->ls
);
7462 in
->pop_and_dirty_projected_inode(mdr
->ls
);
7464 dout(10) << "merging remote onto primary link" << dendl
;
7465 oldin
->pop_and_dirty_projected_inode(mdr
->ls
);
7469 dout(10) << "merging primary onto remote link" << dendl
;
7470 destdn
->get_dir()->unlink_inode(destdn
, false);
7472 destdnl
= destdn
->pop_projected_linkage();
7473 if (mdr
->is_slave() && !mdr
->more()->slave_update_journaled
)
7474 assert(!destdn
->is_projected()); // no other projected
7476 // srcdn inode import?
7477 if (!srcdn
->is_auth() && destdn
->is_auth()) {
7478 assert(mdr
->more()->inode_import
.length() > 0);
7480 map
<client_t
,Capability::Import
> imported_caps
;
7482 // finish cap imports
7483 finish_force_open_sessions(mdr
->more()->imported_session_map
);
7484 if (mdr
->more()->cap_imports
.count(destdnl
->get_inode())) {
7485 mdcache
->migrator
->finish_import_inode_caps(destdnl
->get_inode(),
7486 mdr
->more()->srcdn_auth_mds
, true,
7487 mdr
->more()->imported_session_map
,
7488 mdr
->more()->cap_imports
[destdnl
->get_inode()],
7492 mdr
->more()->inode_import
.clear();
7493 ::encode(imported_caps
, mdr
->more()->inode_import
);
7495 /* hack: add an auth pin for each xlock we hold. These were
7496 * remote xlocks previously but now they're local and
7497 * we're going to try and unpin when we xlock_finish. */
7498 for (set
<SimpleLock
*>::iterator i
= mdr
->xlocks
.begin();
7499 i
!= mdr
->xlocks
.end();
7501 if ((*i
)->get_parent() == destdnl
->get_inode() &&
7502 !(*i
)->is_locallock())
7503 mds
->locker
->xlock_import(*i
);
7505 // hack: fix auth bit
7506 in
->state_set(CInode::STATE_AUTH
);
7508 mdr
->clear_ambiguous_auth();
7511 if (destdn
->is_auth()) {
7512 in
->pop_and_dirty_projected_inode(mdr
->ls
);
7515 // FIXME: fix up snaprealm!
7520 if (srcdn
->is_auth())
7521 srcdn
->mark_dirty(mdr
->more()->pvmap
[srcdn
], mdr
->ls
);
7522 srcdn
->pop_projected_linkage();
7523 if (mdr
->is_slave() && !mdr
->more()->slave_update_journaled
)
7524 assert(!srcdn
->is_projected()); // no other projected
7526 // apply remaining projected inodes (nested)
7529 // update subtree map?
7530 if (destdnl
->is_primary() && in
->is_dir())
7531 mdcache
->adjust_subtree_after_rename(in
, srcdn
->get_dir(), true);
7533 if (straydn
&& oldin
->is_dir())
7534 mdcache
->adjust_subtree_after_rename(oldin
, destdn
->get_dir(), true);
7536 // removing a new dn?
7537 if (srcdn
->is_auth())
7538 srcdn
->get_dir()->try_remove_unlinked_dn(srcdn
);
// Journal-completion context for the slave side of a rename prepare:
// once the ESlaveUpdate is committed, hand control back to
// Server::_logged_slave_rename with the three dentries involved.
// NOTE(review): the extraction dropped lines 7548/7553-7555 (access
// specifier and closing braces) from this listing.
7546 class C_MDS_SlaveRenamePrep : public ServerLogContext {
7547 CDentry *srcdn, *destdn, *straydn;  // dentries pinned for the rename
7549 C_MDS_SlaveRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
7550 ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
7551 void finish(int r) override {
7552 server->_logged_slave_rename(mdr, srcdn, destdn, straydn);
// Commit/abort context installed as mdr->more()->slave_commit in
// handle_slave_rename_prep; forwards the master's decision (r) to
// Server::_commit_slave_rename.
// NOTE(review): extraction gaps — some lines (e.g. 7557/7559, closing
// braces) are missing from this listing.
7556 class C_MDS_SlaveRenameCommit : public ServerContext {
7558 CDentry *srcdn, *destdn, *straydn;  // dentries pinned for the rename
7560 C_MDS_SlaveRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
7561 ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
7562 void finish(int r) override {
7563 server->_commit_slave_rename(mdr, r, srcdn, destdn, straydn);
// Gather finisher used in handle_slave_rename_prep: fires once all client
// sessions affected by the cap export have been flushed, then notifies the
// server via _slave_rename_sessions_flushed.
// NOTE(review): extraction gaps — closing braces are missing from this listing.
7567 class C_MDS_SlaveRenameSessionsFlushed : public ServerContext {
7570 C_MDS_SlaveRenameSessionsFlushed(Server *s, MDRequestRef& r) :
7571 ServerContext(s), mdr(r) {}
7572 void finish(int r) override {
7573 server->_slave_rename_sessions_flushed(mdr);
// Slave-side handler for a master's OP_RENAMEPREP request. Resolves the
// src/dest paths locally, (if srcdn is auth) freezes the source inode and
// verifies the witness list against srcdn's replica set, records a
// rename_rollback blob, then journals an ESlaveUpdate PREPARE and replies
// with OP_RENAMEPREPACK from _logged_slave_rename.
// NOTE(review): lossy extraction — several original lines (early returns,
// braces, some statements, e.g. 7592, 7600-7601, 7635) are missing here.
7577 /* This function DOES put the mdr->slave_request before returning*/
7578 void Server::handle_slave_rename_prep(MDRequestRef& mdr)
7580 dout(10) << "handle_slave_rename_prep " << *mdr
7581 << " " << mdr->slave_request->srcdnpath
7582 << " to " << mdr->slave_request->destdnpath
// Master interrupted the rename: answer with a no-op PREPACK and bail.
7585 if (mdr->slave_request->is_interrupted()) {
7586 dout(10) << " slave request interrupted, sending noop reply" << dendl;
7587 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
7588 reply->mark_interrupted();
7589 mds->send_message_mds(reply, mdr->slave_to_mds);
7590 mdr->slave_request->put();
7591 mdr->slave_request = 0;
// Resolve the destination dentry from the path the master sent.
7596 filepath destpath(mdr->slave_request->destdnpath);
7597 dout(10) << " dest " << destpath << dendl;
7598 vector<CDentry*> trace;
7599 int r = mdcache->path_traverse(mdr, NULL, NULL, destpath, &trace, NULL, MDS_TRAVERSE_DISCOVERXLOCK);
7602 mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
7606 assert(r == 0); // we shouldn't get an error here!
7608 CDentry *destdn = trace[trace.size()-1];
7609 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7610 dout(10) << " destdn " << *destdn << dendl;
// Resolve the source dentry and inode the same way.
7614 filepath srcpath(mdr->slave_request->srcdnpath);
7615 dout(10) << " src " << srcpath << dendl;
7616 CInode *srci = nullptr;
7617 r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &trace, &srci, MDS_TRAVERSE_DISCOVERXLOCK);
7621 // srcpath must not point to a null dentry
7622 assert(srci != nullptr);
7624 CDentry *srcdn = trace[trace.size()-1];
7625 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
7626 dout(10) << " srcdn " << *srcdn << dendl;
7631 bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
7632 (srcdnl->is_primary() || destdnl->is_primary()));
7633 CDentry *straydn = mdr->straydn;
7634 if (destdnl->is_primary() && !linkmerge)
7637 mdr->set_op_stamp(mdr->slave_request->op_stamp);
7638 mdr->more()->srcdn_auth_mds = srcdn->authority().first;
7640 // set up commit waiter (early, to clean up any freezing etc we do)
7641 if (!mdr->more()->slave_commit)
7642 mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn);
// We are authoritative for srcdn: freeze srci and validate the witness list.
7645 if (srcdn->is_auth()) {
7646 set<mds_rank_t> srcdnrep;
7647 srcdn->list_replicas(srcdnrep);
7649 bool reply_witness = false;
7650 if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
7653 // - avoid conflicting lock state changes
7654 // - avoid concurrent updates to the inode
7655 // (this could also be accomplished with the versionlock)
7656 int allowance = 2; // 1 for the mdr auth_pin, 1 for the link lock
7657 allowance += srcdnl->get_inode()->is_dir(); // for the snap lock
7658 dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
7659 bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
7661 // unfreeze auth pin after freezing the inode to avoid queueing waiters
7662 if (srcdnl->get_inode()->is_frozen_auth_pin())
7663 mdr->unfreeze_auth_pin();
// Could not freeze yet: wait for WAIT_FROZEN and retry this request.
7665 if (!frozen_inode) {
7666 srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
7671 * set ambiguous auth for srci
7672 * NOTE: we don't worry about ambiguous cache expire as we do
7673 * with subtree migrations because all slaves will pin
7674 * srcdn->get_inode() for duration of this rename.
7676 mdr->set_ambiguous_auth(srcdnl->get_inode());
7678 // just mark the source inode as ambiguous auth if more than two MDS are involved.
7679 // the master will send another OP_RENAMEPREP slave request later.
7680 if (mdr->slave_request->witnesses.size() > 1) {
7681 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
7682 reply_witness = true;
7685 // make sure bystanders have received all lock related messages
7686 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
7687 if (*p == mdr->slave_to_mds ||
7688 (mds->is_cluster_degraded() &&
7689 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
7691 MMDSSlaveRequest *notify = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
7692 MMDSSlaveRequest::OP_RENAMENOTIFY);
7693 mds->send_message_mds(notify, *p);
7694 mdr->more()->waiting_on_slave.insert(*p);
7697 // make sure clients have received all cap related messages
7698 set<client_t> export_client_set;
7699 mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
7701 MDSGatherBuilder gather(g_ceph_context);
7702 flush_client_sessions(export_client_set, gather);
7703 if (gather.has_subs()) {
// MDS_RANK_NONE marks the session-flush sub-wait in waiting_on_slave.
7704 mdr->more()->waiting_on_slave.insert(MDS_RANK_NONE);
7705 gather.set_finisher(new C_MDS_SlaveRenameSessionsFlushed(this, mdr));
7710 // is witness list sufficient?
7711 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
7712 if (*p == mdr->slave_to_mds ||
7713 mdr->slave_request->witnesses.count(*p)) continue;
7714 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
7715 reply_witness = true;
// Witness list too small: reply with our replica list so the master retries.
7719 if (reply_witness) {
7720 assert(!srcdnrep.empty());
7721 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
7722 MMDSSlaveRequest::OP_RENAMEPREPACK);
7723 reply->witnesses.swap(srcdnrep);
7724 mds->send_message_mds(reply, mdr->slave_to_mds);
7725 mdr->slave_request->put();
7726 mdr->slave_request = 0;
7729 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
7730 if (!mdr->more()->waiting_on_slave.empty()) {
7731 dout(10) << " still waiting for rename notify acks from "
7732 << mdr->more()->waiting_on_slave << dendl;
7735 } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
7736 // set ambiguous auth for srci on witnesses
7737 mdr->set_ambiguous_auth(srcdnl->get_inode());
7740 // encode everything we'd need to roll this back... basically, just the original state.
7741 rename_rollback rollback;
7743 rollback.reqid = mdr->reqid;
7745 rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
7746 rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
7747 rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
7748 rollback.orig_src.dname = std::string(srcdn->get_name());
7749 if (srcdnl->is_primary())
7750 rollback.orig_src.ino = srcdnl->get_inode()->ino();
7752 assert(srcdnl->is_remote());
7753 rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
7754 rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
7757 rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
7758 rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
7759 rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
7760 rollback.orig_dest.dname = std::string(destdn->get_name());
7761 if (destdnl->is_primary())
7762 rollback.orig_dest.ino = destdnl->get_inode()->ino();
7763 else if (destdnl->is_remote()) {
7764 rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
7765 rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
7769 rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
7770 rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
7771 rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
7772 rollback.stray.dname = std::string(straydn->get_name());
7774 ::encode(rollback, mdr->more()->rollback_bl);
7775 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
// Journal the PREPARE; the rollback blob rides along in the log entry.
7778 mdr->ls = mdlog->get_current_segment();
7779 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds,
7780 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RENAME);
7781 mdlog->start_entry(le);
7782 le->rollback = mdr->more()->rollback_bl;
7784 bufferlist blah;  // inode import data... obviously not used if we're the slave
7785 _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, straydn);
7787 if (le->commit.empty()) {
7788 dout(10) << " empty metablob, skipping journal" << dendl;
7789 mdlog->cancel_entry(le);
7791 _logged_slave_rename(mdr, srcdn, destdn, straydn);
7793 mdr->more()->slave_update_journaled = true;
7794 submit_mdlog_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn),
// Runs after the slave's PREPARE is journaled (or was skipped): if we are
// exporting the renamed inode, encode it (with its cap/client map) into the
// OP_RENAMEPREPACK reply, apply the rename to the cache, update balancer
// stats, and send the ack to the master.
// NOTE(review): lossy extraction — several lines (braces, some declarations
// such as the `bounds`/`inodebl` declarations at 7820/7828) are missing.
7800 void Server::_logged_slave_rename(MDRequestRef& mdr,
7801 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7803 dout(10) << "_logged_slave_rename " << *mdr << dendl;
7806 MMDSSlaveRequest *reply = NULL;
7807 if (!mdr->aborted) {
7808 reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
7809 if (!mdr->more()->slave_update_journaled)
7810 reply->mark_not_journaled();
7813 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
7814 CDentry::linkage_t *destdnl = NULL;
7815 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
// We hold the primary inode: export it to the destination MDS in the reply.
7818 if (srcdn->is_auth() && srcdnl->is_primary()) {
7819 // set export bounds for CInode::encode_export()
7821 if (srcdnl->get_inode()->is_dir()) {
7822 srcdnl->get_inode()->get_dirfrags(bounds);
7823 for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
7824 (*p)->state_set(CDir::STATE_EXPORTBOUND);
7827 map<client_t,entity_inst_t> exported_client_map;
7829 mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
7830 exported_client_map);
7832 for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
7833 (*p)->state_clear(CDir::STATE_EXPORTBOUND);
7836 ::encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
7837 reply->inode_export.claim_append(inodebl);
7838 reply->inode_export_v = srcdnl->get_inode()->inode.version;
7841 // remove mdr auth pin
7842 mdr->auth_unpin(srcdnl->get_inode());
7843 mdr->more()->is_inode_exporter = true;
7845 if (srcdnl->get_inode()->is_dirty())
7846 srcdnl->get_inode()->mark_clean();
7848 dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
// Apply the rename to the in-memory cache now that it is journaled.
7852 _rename_apply(mdr, srcdn, destdn, straydn);
7854 destdnl = destdn->get_linkage();
7857 utime_t now = ceph_clock_now();
7858 mds->balancer->hit_dir(now, srcdn->get_dir(), META_POP_IWR);
7859 if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
7860 mds->balancer->hit_inode(now, destdnl->get_inode(), META_POP_IWR);
7863 mdr->slave_request->put();
7864 mdr->slave_request = 0;
7868 mds->send_message_mds(reply, mdr->slave_to_mds);
// No reply was built above — only legal if the request was aborted.
7870 assert(mdr->aborted);
7871 dout(10) << " abort flag set, finishing" << dendl;
7872 mdcache->request_finish(mdr);
// Slave-side commit/abort of a prepared rename. r == 0: finish the inode
// export (hand off xlocks and caps), unfreeze, and journal an
// ESlaveUpdate::OP_COMMIT. r != 0: the master failed — roll back using the
// rollback blob recorded at prepare time (if any) and clean up.
// NOTE(review): lossy extraction — braces and a few lines (e.g. around
// 7888-7891, 7942-7945) are missing from this listing.
7876 void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
7877 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7879 dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;
7881 CDentry::linkage_t *destdnl = destdn->get_linkage();
7883 list<MDSInternalContextBase*> finished;
7885 // unfreeze+singleauth inode
7886 // hmm, do i really need to delay this?
7887 if (mdr->more()->is_inode_exporter) {
7889 CInode *in = destdnl->get_inode();
7892 // we exported, clear out any xlocks that we moved to another MDS
7893 set<SimpleLock*>::iterator i = mdr->xlocks.begin();
7894 while (i != mdr->xlocks.end()) {
7895 SimpleLock *lock = *i++;
7897 // we only care about xlocks on the exported inode
7898 if (lock->get_parent() == in &&
7899 !lock->is_locallock())
7900 mds->locker->xlock_export(lock, mdr.get());
// Decode the caps the peer imported and finish the export on our side.
7903 map<client_t,Capability::Import> peer_imported;
7904 bufferlist::iterator bp = mdr->more()->inode_import.begin();
7905 ::decode(peer_imported, bp);
7907 dout(10) << " finishing inode export on " << *destdnl->get_inode() << dendl;
7908 mdcache->migrator->finish_export_inode(destdnl->get_inode(), ceph_clock_now(),
7909 mdr->slave_to_mds, peer_imported, finished);
7910 mds->queue_waiters(finished);   // this includes SINGLEAUTH waiters.
7913 assert(destdnl->get_inode()->is_frozen_inode());
7914 destdnl->get_inode()->unfreeze_inode(finished);
7918 if (mdr->more()->is_ambiguous_auth) {
7919 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
7920 mdr->more()->is_ambiguous_auth = false;
7923 if (straydn && mdr->more()->slave_update_journaled) {
7924 CInode *strayin = straydn->get_projected_linkage()->get_inode();
7925 if (strayin && !strayin->snaprealm)
7926 mdcache->clear_dirty_bits_for_stray(strayin);
7929 mds->queue_waiters(finished);
7932 if (mdr->more()->slave_update_journaled) {
7933 // write a commit to the journal
7934 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid,
7935 mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
7936 ESlaveUpdate::RENAME);
7937 mdlog->start_entry(le);
7938 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
7941 _committed_slave(mdr);
// --- abort path (r != 0) ---
7946 // rollback_bl may be empty if we froze the inode but had to provide an expanded
7947 // witness list from the master, and they failed before we tried prep again.
7948 if (mdr->more()->rollback_bl.length()) {
7949 if (mdr->more()->is_inode_exporter) {
7950 dout(10) << " reversing inode export of " << *destdnl->get_inode() << dendl;
7951 destdnl->get_inode()->abort_export();
7953 if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
7954 mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
7955 // rollback but preserve the slave request
7956 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, false);
7957 mdr->more()->rollback_bl.clear();
7959 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, true);
7961 dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl;
7963 if (mdr->more()->is_ambiguous_auth) {
7964 if (srcdn->is_auth())
7965 mdr->more()->rename_inode->unfreeze_inode(finished);
7967 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
7968 mdr->more()->is_ambiguous_auth = false;
7970 mds->queue_waiters(finished);
7971 mdcache->request_finish(mdr);
// Helper for rename rollback: project a new fnode on `dir` and undo the
// fragstat/rstat deltas of the rolled-back link/unlink (linkunlink is +1 or
// -1), restoring the saved mtime/rctime when this update was the last one to
// touch them.
// NOTE(review): lossy extraction — the declaration of `pf` (original line
// ~7979) and some braces are missing from this listing.
7976 void _rollback_repair_dir(MutationRef& mut, CDir *dir, rename_rollback::drec &r, utime_t ctime,
7977 bool isdir, int linkunlink, nest_info_t &rstat)
7980 pf = dir->project_fnode();
7981 mut->add_projected_fnode(dir);
7982 pf->version = dir->pre_dirty();
// Undo the directory-entry count change (subdir vs file).
7985 pf->fragstat.nsubdirs += linkunlink;
7987 pf->fragstat.nfiles += linkunlink;
// Undo the recursive-stat deltas contributed by the moved inode.
7990 pf->rstat.rbytes += linkunlink * rstat.rbytes;
7991 pf->rstat.rfiles += linkunlink * rstat.rfiles;
7992 pf->rstat.rsubdirs += linkunlink * rstat.rsubdirs;
7993 pf->rstat.rsnaprealms += linkunlink * rstat.rsnaprealms;
// Only restore timestamps if the rolled-back op was the last to set them.
7995 if (pf->fragstat.mtime == ctime) {
7996 pf->fragstat.mtime = r.dirfrag_old_mtime;
7997 if (pf->rstat.rctime == ctime)
7998 pf->rstat.rctime = r.dirfrag_old_rctime;
8000 mut->add_updated_lock(&dir->get_inode()->filelock);
8001 mut->add_updated_lock(&dir->get_inode()->nestlock);
// Journal-completion context for a rename rollback: once the
// ESlaveUpdate::OP_ROLLBACK entry is committed, finish the rollback via
// Server::_rename_rollback_finish.
// NOTE(review): lossy extraction — the member declarations (original lines
// ~8005-8010: mut, srcdn, srcdnpv, destdn, straydn, finish_mdr) and closing
// braces are missing from this listing.
8004 struct C_MDS_LoggedRenameRollback : public ServerLogContext {
8011 C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
8012 CDentry *sd, version_t pv, CDentry *dd,
8013 CDentry *st, bool f) :
8014 ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
8015 straydn(st), finish_mdr(f) {}
8016 void finish(int r) override {
8017 server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
8018 destdn, straydn, finish_mdr);
// Undo a prepared-but-uncommitted slave rename using the rename_rollback blob
// recorded at prepare time: look up the original src/dest/stray dentries,
// re-project the original linkages and inode timestamps, journal an
// ESlaveUpdate::OP_ROLLBACK (when needed), and finish via
// _rename_rollback_finish.
// NOTE(review): lossy extraction — many original lines are missing here
// (parameter `finish_mdr`, `CInode *in` declaration, several braces and
// else-branches); gaps in the embedded numbering mark them.
8022 void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr,
8025 rename_rollback rollback;
8026 bufferlist::iterator p = rbl.begin();
8027 ::decode(rollback, p);
8029 dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
8030 // need to finish this update before sending resolve to claim the subtree
8031 mdcache->add_rollback(rollback.reqid, master);
8033 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
8034 mut->ls = mds->mdlog->get_current_segment();
// Locate the original source dentry (may be gone from cache).
8036 CDentry *srcdn = NULL;
8037 CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
8039 srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
8041 dout(10) << " srcdir " << *srcdir << dendl;
8042 srcdn = srcdir->lookup(rollback.orig_src.dname);
8044 dout(10) << " srcdn " << *srcdn << dendl;
8045 assert(srcdn->get_linkage()->is_null());
8047 dout(10) << " srcdn not found" << dendl;
8049 dout(10) << " srcdir not found" << dendl;
// Locate the original destination dentry.
8051 CDentry *destdn = NULL;
8052 CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
8054 destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
8056 dout(10) << " destdir " << *destdir << dendl;
8057 destdn = destdir->lookup(rollback.orig_dest.dname);
8059 dout(10) << " destdn " << *destdn << dendl;
8061 dout(10) << " destdn not found" << dendl;
8063 dout(10) << " destdir not found" << dendl;
// Resolve the renamed inode (primary vs remote link).
8066 if (rollback.orig_src.ino) {
8067 in = mdcache->get_inode(rollback.orig_src.ino);
8068 if (in && in->is_dir())
8069 assert(srcdn && destdn);
8071 in = mdcache->get_inode(rollback.orig_src.remote_ino);
// Locate the stray dentry the displaced destination inode went to.
8073 CDir *straydir = NULL;
8074 CDentry *straydn = NULL;
8075 if (rollback.stray.dirfrag.ino) {
8076 straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
8078 dout(10) << "straydir " << *straydir << dendl;
8079 straydn = straydir->lookup(rollback.stray.dname);
8081 dout(10) << " straydn " << *straydn << dendl;
8082 assert(straydn->get_linkage()->is_primary());
8084 dout(10) << " straydn not found" << dendl;
8086 dout(10) << "straydir not found" << dendl;
8089 CInode *target = NULL;
8090 if (rollback.orig_dest.ino) {
8091 target = mdcache->get_inode(rollback.orig_dest.ino);
8093 assert(destdn && straydn);
8094 } else if (rollback.orig_dest.remote_ino)
8095 target = mdcache->get_inode(rollback.orig_dest.remote_ino);
8097 // can't use is_auth() in the resolve stage
8098 mds_rank_t whoami = mds->get_nodeid();
8100 assert(!destdn || destdn->authority().first != whoami);
8101 assert(!straydn || straydn->authority().first != whoami);
8103 bool force_journal_src = false;
8104 bool force_journal_dest = false;
8105 if (in && in->is_dir() && srcdn->authority().first != whoami)
8106 force_journal_src = _need_force_journal(in, false);
8107 if (in && target && target->is_dir())
8108 force_journal_dest = _need_force_journal(in, true);
// Re-project the original src linkage (primary inode or remote link).
8110 version_t srcdnpv = 0;
8113 if (srcdn->authority().first == whoami)
8114 srcdnpv = srcdn->pre_dirty();
8115 if (rollback.orig_src.ino) {
8117 srcdn->push_projected_linkage(in);
8119 srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
8120 rollback.orig_src.remote_d_type);
// Restore the renamed inode's ctime if this update was the last to set it.
8123 CInode::mempool_inode *pip = 0;
8125 if (in->authority().first == whoami) {
8126 auto &pi = in->project_inode();
8127 mut->add_projected_inode(in);
8128 pi.inode.version = in->pre_dirty();
8131 pip = in->get_projected_inode();
8132 if (pip->ctime == rollback.ctime)
8133 pip->ctime = pip->rstat.rctime = rollback.orig_src.old_ctime;
8136 if (srcdn && srcdn->authority().first == whoami) {
8138 _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
8139 in ? in->is_dir() : false, 1, pip ? pip->accounted_rstat : blah);
// Re-project the original dest linkage.
8144 if (rollback.orig_dest.ino && target) {
8145 destdn->push_projected_linkage(target);
8146 } else if (rollback.orig_dest.remote_ino) {
8147 destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
8148 rollback.orig_dest.remote_d_type);
8150 // the dentry will be trimmed soon, it's ok to have wrong linkage
8151 if (rollback.orig_dest.ino)
8152 assert(mds->is_resolve());
8153 destdn->push_projected_linkage();
8158 straydn->push_projected_linkage();
// Restore the displaced target inode's ctime similarly.
8161 CInode::mempool_inode *ti = NULL;
8162 if (target->authority().first == whoami) {
8163 auto &pi = target->project_inode();
8164 mut->add_projected_inode(target);
8165 pi.inode.version = target->pre_dirty();
8168 ti = target->get_projected_inode();
8169 if (ti->ctime == rollback.ctime)
8170 ti->ctime = ti->rstat.rctime = rollback.orig_dest.old_ctime;
8171 if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
8172 if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
8173 assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
8175 assert(rollback.orig_dest.remote_ino &&
8176 rollback.orig_dest.remote_ino == rollback.orig_src.ino);
8182 dout(0) << " srcdn back to " << *srcdn << dendl;
8184 dout(0) << " srci back to " << *in << dendl;
8186 dout(0) << " destdn back to " << *destdn << dendl;
8188 dout(0) << " desti back to " << *target << dendl;
// Journal the rollback itself.
8191 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_rollback", rollback.reqid, master,
8192 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RENAME);
8193 mdlog->start_entry(le);
8195 if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
8196 le->commit.add_dir_context(srcdir);
8197 if (rollback.orig_src.ino)
8198 le->commit.add_primary_dentry(srcdn, 0, true);
8200 le->commit.add_remote_dentry(srcdn, true);
8203 if (!rollback.orig_src.ino && // remote linkage
8204 in && in->authority().first == whoami) {
8205 le->commit.add_dir_context(in->get_projected_parent_dir());
8206 le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
8209 if (force_journal_dest) {
8210 assert(rollback.orig_dest.ino);
8211 le->commit.add_dir_context(destdir);
8212 le->commit.add_primary_dentry(destdn, 0, true);
8215 // slave: no need to journal straydn
8217 if (target && target != in && target->authority().first == whoami) {
8218 assert(rollback.orig_dest.remote_ino);
8219 le->commit.add_dir_context(target->get_projected_parent_dir());
8220 le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
8223 if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
8224 dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
8225 le->commit.renamed_dirino = in->ino();
8226 if (srcdn->authority().first == whoami) {
8228 in->get_dirfrags(ls);
8229 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
8231 if (!dir->is_auth())
8232 le->commit.renamed_dir_frags.push_back(dir->get_frag());
8234 dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
8236 } else if (force_journal_dest) {
8237 dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
8238 le->commit.renamed_dirino = target->ino();
// Re-project the subtree renames being undone.
8241 if (target && target->is_dir()) {
8243 mdcache->project_subtree_rename(target, straydir, destdir);
8246 if (in && in->is_dir()) {
8248 mdcache->project_subtree_rename(in, destdir, srcdir);
// If the slave update was never journaled, skip the journal and finish now.
8251 if (mdr && !mdr->more()->slave_update_journaled) {
8252 assert(le->commit.empty());
8253 mdlog->cancel_entry(le);
8255 _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, finish_mdr);
8257 assert(!le->commit.empty());
8259 mdr->more()->slave_update_journaled = false;
8260 MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr, srcdn, srcdnpv,
8261 destdn, straydn, finish_mdr);
8262 submit_mdlog_entry(le, fin, mdr, __func__);
8267 void Server::_rename_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
, CDentry
*srcdn
,
8268 version_t srcdnpv
, CDentry
*destdn
,
8269 CDentry
*straydn
, bool finish_mdr
)
8271 dout(10) << "_rename_rollback_finish " << mut
->reqid
<< dendl
;
8274 straydn
->get_dir()->unlink_inode(straydn
);
8275 straydn
->pop_projected_linkage();
8278 destdn
->get_dir()->unlink_inode(destdn
);
8279 destdn
->pop_projected_linkage();
8282 srcdn
->pop_projected_linkage();
8283 if (srcdn
->authority().first
== mds
->get_nodeid())
8284 srcdn
->mark_dirty(srcdnpv
, mut
->ls
);
8289 if (srcdn
&& srcdn
->get_linkage()->is_primary()) {
8290 CInode
*in
= srcdn
->get_linkage()->get_inode();
8291 if (srcdn
->authority().first
== mds
->get_nodeid())
8292 in
->state_set(CInode::STATE_AUTH
);
8293 // update subtree map?
8294 if (in
&& in
->is_dir()) {
8296 mdcache
->adjust_subtree_after_rename(in
, destdn
->get_dir(), true);
8301 CInode
*oldin
= destdn
->get_linkage()->get_inode();
8302 // update subtree map?
8303 if (oldin
&& oldin
->is_dir()) {
8305 mdcache
->adjust_subtree_after_rename(oldin
, straydn
->get_dir(), true);
8309 if (mds
->is_resolve()) {
8312 root
= mdcache
->get_subtree_root(straydn
->get_dir());
8314 root
= mdcache
->get_subtree_root(destdn
->get_dir());
8316 mdcache
->try_trim_non_auth_subtree(root
);
8320 list
<MDSInternalContextBase
*> finished
;
8321 if (mdr
->more()->is_ambiguous_auth
) {
8322 if (srcdn
->is_auth())
8323 mdr
->more()->rename_inode
->unfreeze_inode(finished
);
8325 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
8326 mdr
->more()->is_ambiguous_auth
= false;
8328 mds
->queue_waiters(finished
);
8329 if (finish_mdr
|| mdr
->aborted
)
8330 mdcache
->request_finish(mdr
);
8332 mdr
->more()->slave_rolling_back
= false;
8335 mdcache
->finish_rollback(mut
->reqid
);
8340 /* This function DOES put the passed message before returning*/
8341 void Server::handle_slave_rename_prep_ack(MDRequestRef
& mdr
, MMDSSlaveRequest
*ack
)
8343 dout(10) << "handle_slave_rename_prep_ack " << *mdr
8344 << " witnessed by " << ack
->get_source()
8345 << " " << *ack
<< dendl
;
8346 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
8349 mdr
->more()->slaves
.insert(from
);
8350 if (mdr
->more()->srcdn_auth_mds
== from
&&
8351 mdr
->more()->is_remote_frozen_authpin
&&
8352 !mdr
->more()->is_ambiguous_auth
) {
8353 mdr
->set_ambiguous_auth(mdr
->more()->rename_inode
);
8356 // witnessed? or add extra witnesses?
8357 assert(mdr
->more()->witnessed
.count(from
) == 0);
8358 if (ack
->is_interrupted()) {
8359 dout(10) << " slave request interrupted, noop" << dendl
;
8360 } else if (ack
->witnesses
.empty()) {
8361 mdr
->more()->witnessed
.insert(from
);
8362 if (!ack
->is_not_journaled())
8363 mdr
->more()->has_journaled_slaves
= true;
8365 dout(10) << " extra witnesses (srcdn replicas) are " << ack
->witnesses
<< dendl
;
8366 mdr
->more()->extra_witnesses
.swap(ack
->witnesses
);
8367 mdr
->more()->extra_witnesses
.erase(mds
->get_nodeid()); // not me!
8371 if (ack
->inode_export
.length()) {
8372 dout(10) << " got srci import" << dendl
;
8373 mdr
->more()->inode_import
.claim(ack
->inode_export
);
8374 mdr
->more()->inode_import_v
= ack
->inode_export_v
;
8377 // remove from waiting list
8378 assert(mdr
->more()->waiting_on_slave
.count(from
));
8379 mdr
->more()->waiting_on_slave
.erase(from
);
8381 if (mdr
->more()->waiting_on_slave
.empty())
8382 dispatch_client_request(mdr
); // go again!
8384 dout(10) << "still waiting on slaves " << mdr
->more()->waiting_on_slave
<< dendl
;
8387 void Server::handle_slave_rename_notify_ack(MDRequestRef
& mdr
, MMDSSlaveRequest
*ack
)
8389 dout(10) << "handle_slave_rename_notify_ack " << *mdr
<< " from mds."
8390 << ack
->get_source() << dendl
;
8391 assert(mdr
->is_slave());
8392 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
8394 if (mdr
->more()->waiting_on_slave
.count(from
)) {
8395 mdr
->more()->waiting_on_slave
.erase(from
);
8397 if (mdr
->more()->waiting_on_slave
.empty()) {
8398 if (mdr
->slave_request
)
8399 dispatch_slave_request(mdr
);
8401 dout(10) << " still waiting for rename notify acks from "
8402 << mdr
->more()->waiting_on_slave
<< dendl
;
8406 void Server::_slave_rename_sessions_flushed(MDRequestRef
& mdr
)
8408 dout(10) << "_slave_rename_sessions_flushed " << *mdr
<< dendl
;
8410 if (mdr
->more()->waiting_on_slave
.count(MDS_RANK_NONE
)) {
8411 mdr
->more()->waiting_on_slave
.erase(MDS_RANK_NONE
);
8413 if (mdr
->more()->waiting_on_slave
.empty()) {
8414 if (mdr
->slave_request
)
8415 dispatch_slave_request(mdr
);
8417 dout(10) << " still waiting for rename notify acks from "
8418 << mdr
->more()->waiting_on_slave
<< dendl
;
8423 /* This function takes responsibility for the passed mdr*/
8424 void Server::handle_client_lssnap(MDRequestRef
& mdr
)
8426 MClientRequest
*req
= mdr
->client_request
;
8429 CInode
*diri
= mdcache
->get_inode(req
->get_filepath().get_ino());
8430 if (!diri
|| diri
->state_test(CInode::STATE_PURGING
)) {
8431 respond_to_request(mdr
, -ESTALE
);
8434 if (!diri
->is_auth()) {
8435 mdcache
->request_forward(mdr
, diri
->authority().first
);
8438 if (!diri
->is_dir()) {
8439 respond_to_request(mdr
, -ENOTDIR
);
8442 dout(10) << "lssnap on " << *diri
<< dendl
;
8445 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
8446 mds
->locker
->include_snap_rdlocks(rdlocks
, diri
);
8447 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
8450 if (!check_access(mdr
, diri
, MAY_READ
))
8453 SnapRealm
*realm
= diri
->find_snaprealm();
8454 map
<snapid_t
,SnapInfo
*> infomap
;
8455 realm
->get_snap_info(infomap
, diri
->get_oldest_snap());
8457 unsigned max_entries
= req
->head
.args
.readdir
.max_entries
;
8459 max_entries
= infomap
.size();
8460 int max_bytes
= req
->head
.args
.readdir
.max_bytes
;
8462 // make sure at least one item can be encoded
8463 max_bytes
= (512 << 10) + g_conf
->mds_max_xattr_pairs_size
;
8465 __u64 last_snapid
= 0;
8466 string offset_str
= req
->get_path2();
8467 if (!offset_str
.empty())
8468 last_snapid
= realm
->resolve_snapname(offset_str
, diri
->ino());
8471 encode_empty_dirstat(dirbl
);
8473 max_bytes
-= dirbl
.length() - sizeof(__u32
) + sizeof(__u8
) * 2;
8477 map
<snapid_t
,SnapInfo
*>::iterator p
= infomap
.upper_bound(last_snapid
);
8478 for (; p
!= infomap
.end() && num
< max_entries
; ++p
) {
8479 dout(10) << p
->first
<< " -> " << *p
->second
<< dendl
;
8483 if (p
->second
->ino
== diri
->ino())
8484 snap_name
= std::string(p
->second
->name
);
8486 snap_name
= std::string(p
->second
->get_long_name());
8488 unsigned start_len
= dnbl
.length();
8489 if (int(start_len
+ snap_name
.length() + sizeof(__u32
) + sizeof(LeaseStat
)) > max_bytes
)
8492 ::encode(snap_name
, dnbl
);
8493 encode_infinite_lease(dnbl
);
8495 int r
= diri
->encode_inodestat(dnbl
, mdr
->session
, realm
, p
->first
, max_bytes
- (int)dnbl
.length());
8498 keep
.substr_of(dnbl
, 0, start_len
);
8505 ::encode(num
, dirbl
);
8507 if (p
== infomap
.end()) {
8508 flags
= CEPH_READDIR_FRAG_END
;
8509 if (last_snapid
== 0)
8510 flags
|= CEPH_READDIR_FRAG_COMPLETE
;
8512 ::encode(flags
, dirbl
);
8513 dirbl
.claim_append(dnbl
);
8515 mdr
->reply_extra_bl
= dirbl
;
8517 respond_to_request(mdr
, 0);
8523 struct C_MDS_mksnap_finish
: public ServerLogContext
{
8526 C_MDS_mksnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, SnapInfo
&i
) :
8527 ServerLogContext(s
, r
), diri(di
), info(i
) {}
8528 void finish(int r
) override
{
8529 server
->_mksnap_finish(mdr
, diri
, info
);
8533 /* This function takes responsibility for the passed mdr*/
8534 void Server::handle_client_mksnap(MDRequestRef
& mdr
)
8536 if (!mds
->mdsmap
->allows_snaps()) {
8537 // you can't make snapshots until you set an option right now
8538 respond_to_request(mdr
, -EPERM
);
8542 MClientRequest
*req
= mdr
->client_request
;
8543 CInode
*diri
= mdcache
->get_inode(req
->get_filepath().get_ino());
8544 if (!diri
|| diri
->state_test(CInode::STATE_PURGING
)) {
8545 respond_to_request(mdr
, -ESTALE
);
8549 if (!diri
->is_auth()) { // fw to auth?
8550 mdcache
->request_forward(mdr
, diri
->authority().first
);
8555 if (!diri
->is_dir()) {
8556 respond_to_request(mdr
, -ENOTDIR
);
8559 if (diri
->is_system() && !diri
->is_root()) {
8560 // no snaps in system dirs (root is ok)
8561 respond_to_request(mdr
, -EPERM
);
8565 boost::string_view snapname
= req
->get_filepath().last_dentry();
8567 if (mdr
->client_request
->get_caller_uid() < g_conf
->mds_snap_min_uid
|| mdr
->client_request
->get_caller_uid() > g_conf
->mds_snap_max_uid
) {
8568 dout(20) << "mksnap " << snapname
<< " on " << *diri
<< " denied to uid " << mdr
->client_request
->get_caller_uid() << dendl
;
8569 respond_to_request(mdr
, -EPERM
);
8573 dout(10) << "mksnap " << snapname
<< " on " << *diri
<< dendl
;
8576 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
8578 mds
->locker
->include_snap_rdlocks(rdlocks
, diri
);
8579 rdlocks
.erase(&diri
->snaplock
);
8580 xlocks
.insert(&diri
->snaplock
);
8582 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
8585 if (!check_access(mdr
, diri
, MAY_WRITE
))
8588 // make sure name is unique
8589 if (diri
->snaprealm
&&
8590 diri
->snaprealm
->exists(snapname
)) {
8591 respond_to_request(mdr
, -EEXIST
);
8594 if (snapname
.length() == 0 ||
8595 snapname
[0] == '_') {
8596 respond_to_request(mdr
, -EINVAL
);
8600 // allocate a snapid
8601 if (!mdr
->more()->stid
) {
8603 mds
->snapclient
->prepare_create(diri
->ino(), snapname
,
8604 mdr
->get_mds_stamp(),
8605 &mdr
->more()->stid
, &mdr
->more()->snapidbl
,
8606 new C_MDS_RetryRequest(mdcache
, mdr
));
8610 version_t stid
= mdr
->more()->stid
;
8612 bufferlist::iterator p
= mdr
->more()->snapidbl
.begin();
8613 ::decode(snapid
, p
);
8614 dout(10) << " stid " << stid
<< " snapid " << snapid
<< dendl
;
8618 info
.ino
= diri
->ino();
8619 info
.snapid
= snapid
;
8620 info
.name
= std::string(snapname
);
8621 info
.stamp
= mdr
->get_op_stamp();
8623 auto &pi
= diri
->project_inode(false, true);
8624 pi
.inode
.ctime
= pi
.inode
.rstat
.rctime
= info
.stamp
;
8625 pi
.inode
.version
= diri
->pre_dirty();
8627 // project the snaprealm
8628 auto &newsnap
= *pi
.snapnode
;
8629 newsnap
.created
= snapid
;
8630 auto em
= newsnap
.snaps
.emplace(std::piecewise_construct
, std::forward_as_tuple(snapid
), std::forward_as_tuple(info
));
8632 em
.first
->second
= info
;
8633 newsnap
.seq
= snapid
;
8634 newsnap
.last_created
= snapid
;
8636 // journal the inode changes
8637 mdr
->ls
= mdlog
->get_current_segment();
8638 EUpdate
*le
= new EUpdate(mdlog
, "mksnap");
8639 mdlog
->start_entry(le
);
8641 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
8642 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
8643 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
8644 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
8646 // journal the snaprealm changes
8647 submit_mdlog_entry(le
, new C_MDS_mksnap_finish(this, mdr
, diri
, info
),
8652 void Server::_mksnap_finish(MDRequestRef
& mdr
, CInode
*diri
, SnapInfo
&info
)
8654 dout(10) << "_mksnap_finish " << *mdr
<< " " << info
<< dendl
;
8656 int op
= (diri
->snaprealm
? CEPH_SNAP_OP_CREATE
: CEPH_SNAP_OP_SPLIT
);
8658 diri
->pop_and_dirty_projected_inode(mdr
->ls
);
8661 mds
->snapclient
->commit(mdr
->more()->stid
, mdr
->ls
);
8664 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
8666 mdcache
->do_realm_invalidate_and_update_notify(diri
, op
);
8670 mdr
->snapid
= info
.snapid
;
8672 respond_to_request(mdr
, 0);
8678 struct C_MDS_rmsnap_finish
: public ServerLogContext
{
8681 C_MDS_rmsnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, snapid_t sn
) :
8682 ServerLogContext(s
, r
), diri(di
), snapid(sn
) {}
8683 void finish(int r
) override
{
8684 server
->_rmsnap_finish(mdr
, diri
, snapid
);
8688 /* This function takes responsibility for the passed mdr*/
8689 void Server::handle_client_rmsnap(MDRequestRef
& mdr
)
8691 MClientRequest
*req
= mdr
->client_request
;
8693 CInode
*diri
= mdcache
->get_inode(req
->get_filepath().get_ino());
8694 if (!diri
|| diri
->state_test(CInode::STATE_PURGING
)) {
8695 respond_to_request(mdr
, -ESTALE
);
8698 if (!diri
->is_auth()) { // fw to auth?
8699 mdcache
->request_forward(mdr
, diri
->authority().first
);
8702 if (!diri
->is_dir()) {
8703 respond_to_request(mdr
, -ENOTDIR
);
8707 boost::string_view snapname
= req
->get_filepath().last_dentry();
8709 if (mdr
->client_request
->get_caller_uid() < g_conf
->mds_snap_min_uid
|| mdr
->client_request
->get_caller_uid() > g_conf
->mds_snap_max_uid
) {
8710 dout(20) << "rmsnap " << snapname
<< " on " << *diri
<< " denied to uid " << mdr
->client_request
->get_caller_uid() << dendl
;
8711 respond_to_request(mdr
, -EPERM
);
8715 dout(10) << "rmsnap " << snapname
<< " on " << *diri
<< dendl
;
8718 if (snapname
.length() == 0 || snapname
[0] == '_') {
8719 respond_to_request(mdr
, -EINVAL
); // can't prune a parent snap, currently.
8722 if (!diri
->snaprealm
|| !diri
->snaprealm
->exists(snapname
)) {
8723 respond_to_request(mdr
, -ENOENT
);
8726 snapid_t snapid
= diri
->snaprealm
->resolve_snapname(snapname
, diri
->ino());
8727 dout(10) << " snapname " << snapname
<< " is " << snapid
<< dendl
;
8729 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
8730 mds
->locker
->include_snap_rdlocks(rdlocks
, diri
);
8731 rdlocks
.erase(&diri
->snaplock
);
8732 xlocks
.insert(&diri
->snaplock
);
8734 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
8737 if (!check_access(mdr
, diri
, MAY_WRITE
))
8741 if (!mdr
->more()->stid
) {
8742 mds
->snapclient
->prepare_destroy(diri
->ino(), snapid
,
8743 &mdr
->more()->stid
, &mdr
->more()->snapidbl
,
8744 new C_MDS_RetryRequest(mdcache
, mdr
));
8747 version_t stid
= mdr
->more()->stid
;
8748 bufferlist::iterator p
= mdr
->more()->snapidbl
.begin();
8751 dout(10) << " stid is " << stid
<< ", seq is " << seq
<< dendl
;
8754 auto &pi
= diri
->project_inode(false, true);
8755 pi
.inode
.version
= diri
->pre_dirty();
8756 pi
.inode
.ctime
= pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
8758 mdr
->ls
= mdlog
->get_current_segment();
8759 EUpdate
*le
= new EUpdate(mdlog
, "rmsnap");
8760 mdlog
->start_entry(le
);
8762 // project the snaprealm
8763 auto &newnode
= *pi
.snapnode
;
8764 newnode
.snaps
.erase(snapid
);
8766 newnode
.last_destroyed
= seq
;
8768 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
8769 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
8770 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
8771 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
8773 submit_mdlog_entry(le
, new C_MDS_rmsnap_finish(this, mdr
, diri
, snapid
),
8778 void Server::_rmsnap_finish(MDRequestRef
& mdr
, CInode
*diri
, snapid_t snapid
)
8780 dout(10) << "_rmsnap_finish " << *mdr
<< " " << snapid
<< dendl
;
8781 snapid_t stid
= mdr
->more()->stid
;
8782 bufferlist::iterator p
= mdr
->more()->snapidbl
.begin();
8786 diri
->pop_and_dirty_projected_inode(mdr
->ls
);
8789 mds
->snapclient
->commit(stid
, mdr
->ls
);
8791 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
8793 mdcache
->do_realm_invalidate_and_update_notify(diri
, CEPH_SNAP_OP_DESTROY
);
8797 respond_to_request(mdr
, 0);
8799 // purge snapshot data
8800 if (diri
->snaprealm
->have_past_parents_open())
8801 diri
->purge_stale_snap_data(diri
->snaprealm
->get_snaps());
8804 struct C_MDS_renamesnap_finish
: public ServerLogContext
{
8807 C_MDS_renamesnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, snapid_t sn
) :
8808 ServerLogContext(s
, r
), diri(di
), snapid(sn
) {}
8809 void finish(int r
) override
{
8810 server
->_renamesnap_finish(mdr
, diri
, snapid
);
8814 /* This function takes responsibility for the passed mdr*/
8815 void Server::handle_client_renamesnap(MDRequestRef
& mdr
)
8817 MClientRequest
*req
= mdr
->client_request
;
8818 if (req
->get_filepath().get_ino() != req
->get_filepath2().get_ino()) {
8819 respond_to_request(mdr
, -EINVAL
);
8823 CInode
*diri
= mdcache
->get_inode(req
->get_filepath().get_ino());
8824 if (!diri
|| diri
->state_test(CInode::STATE_PURGING
)) {
8825 respond_to_request(mdr
, -ESTALE
);
8829 if (!diri
->is_auth()) { // fw to auth?
8830 mdcache
->request_forward(mdr
, diri
->authority().first
);
8834 if (!diri
->is_dir()) { // dir only
8835 respond_to_request(mdr
, -ENOTDIR
);
8839 if (mdr
->client_request
->get_caller_uid() < g_conf
->mds_snap_min_uid
||
8840 mdr
->client_request
->get_caller_uid() > g_conf
->mds_snap_max_uid
) {
8841 respond_to_request(mdr
, -EPERM
);
8845 boost::string_view dstname
= req
->get_filepath().last_dentry();
8846 boost::string_view srcname
= req
->get_filepath2().last_dentry();
8847 dout(10) << "renamesnap " << srcname
<< "->" << dstname
<< " on " << *diri
<< dendl
;
8849 if (srcname
.length() == 0 || srcname
[0] == '_') {
8850 respond_to_request(mdr
, -EINVAL
); // can't rename a parent snap.
8853 if (!diri
->snaprealm
|| !diri
->snaprealm
->exists(srcname
)) {
8854 respond_to_request(mdr
, -ENOENT
);
8857 if (dstname
.length() == 0 || dstname
[0] == '_') {
8858 respond_to_request(mdr
, -EINVAL
);
8861 if (diri
->snaprealm
->exists(dstname
)) {
8862 respond_to_request(mdr
, -EEXIST
);
8866 snapid_t snapid
= diri
->snaprealm
->resolve_snapname(srcname
, diri
->ino());
8867 dout(10) << " snapname " << srcname
<< " is " << snapid
<< dendl
;
8870 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
8872 mds
->locker
->include_snap_rdlocks(rdlocks
, diri
);
8873 rdlocks
.erase(&diri
->snaplock
);
8874 xlocks
.insert(&diri
->snaplock
);
8876 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
8879 if (!check_access(mdr
, diri
, MAY_WRITE
))
8883 if (!mdr
->more()->stid
) {
8884 mds
->snapclient
->prepare_update(diri
->ino(), snapid
, dstname
, utime_t(),
8885 &mdr
->more()->stid
, &mdr
->more()->snapidbl
,
8886 new C_MDS_RetryRequest(mdcache
, mdr
));
8890 version_t stid
= mdr
->more()->stid
;
8891 bufferlist::iterator p
= mdr
->more()->snapidbl
.begin();
8894 dout(10) << " stid is " << stid
<< ", seq is " << seq
<< dendl
;
8897 auto &pi
= diri
->project_inode(false, true);
8898 pi
.inode
.ctime
= pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
8899 pi
.inode
.version
= diri
->pre_dirty();
8901 // project the snaprealm
8902 auto &newsnap
= *pi
.snapnode
;
8903 auto it
= newsnap
.snaps
.find(snapid
);
8904 assert(it
!= newsnap
.snaps
.end());
8905 it
->second
.name
= std::string(dstname
);
8907 // journal the inode changes
8908 mdr
->ls
= mdlog
->get_current_segment();
8909 EUpdate
*le
= new EUpdate(mdlog
, "renamesnap");
8910 mdlog
->start_entry(le
);
8912 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
8913 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
8914 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
8915 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
8917 // journal the snaprealm changes
8918 submit_mdlog_entry(le
, new C_MDS_renamesnap_finish(this, mdr
, diri
, snapid
),
8923 void Server::_renamesnap_finish(MDRequestRef
& mdr
, CInode
*diri
, snapid_t snapid
)
8925 dout(10) << "_renamesnap_finish " << *mdr
<< " " << snapid
<< dendl
;
8927 diri
->pop_and_dirty_projected_inode(mdr
->ls
);
8930 mds
->snapclient
->commit(mdr
->more()->stid
, mdr
->ls
);
8932 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
8934 mdcache
->do_realm_invalidate_and_update_notify(diri
, CEPH_SNAP_OP_UPDATE
, true);
8939 mdr
->snapid
= snapid
;
8940 respond_to_request(mdr
, 0);
8944 * Return true if server is in state RECONNECT and this
8945 * client has not yet reconnected.
8947 bool Server::waiting_for_reconnect(client_t c
) const
8949 return client_reconnect_gather
.count(c
) > 0;
8952 void Server::dump_reconnect_status(Formatter
*f
) const
8954 f
->open_object_section("reconnect_status");
8955 f
->dump_stream("client_reconnect_gather") << client_reconnect_gather
;