1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
15 #include <boost/lexical_cast.hpp>
16 #include "include/assert.h" // lexical_cast includes system assert.h
18 #include <boost/config/warning_disable.hpp>
19 #include <boost/fusion/include/std_pair.hpp>
27 #include "MDBalancer.h"
29 #include "SnapClient.h"
32 #include "msg/Messenger.h"
34 #include "osdc/Objecter.h"
36 #include "messages/MClientSession.h"
37 #include "messages/MClientRequest.h"
38 #include "messages/MClientReply.h"
39 #include "messages/MClientReconnect.h"
40 #include "messages/MClientCaps.h"
41 #include "messages/MClientSnap.h"
43 #include "messages/MMDSSlaveRequest.h"
45 #include "messages/MLock.h"
47 #include "events/EUpdate.h"
48 #include "events/ESlaveUpdate.h"
49 #include "events/ESession.h"
50 #include "events/EOpen.h"
51 #include "events/ECommitted.h"
53 #include "include/filepath.h"
54 #include "common/errno.h"
55 #include "common/Timer.h"
56 #include "common/perf_counters.h"
57 #include "include/compat.h"
58 #include "osd/OSDMap.h"
66 #include "common/config.h"
68 #define dout_context g_ceph_context
69 #define dout_subsys ceph_subsys_mds
71 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
// Context base for Server callbacks: get_mds() routes generic MDS
// internal contexts to this Server's rank.
// NOTE(review): fragmentary extraction — member declarations and closing
// braces are missing from this chunk; code left byte-identical.
73 class ServerContext
: public MDSInternalContextBase
{
76 MDSRank
*get_mds() override
// Pins the owning Server; a null Server is a programming error.
82 explicit ServerContext(Server
*s
) : server(s
) {
83 assert(server
!= NULL
);
// Log-completion context base for Server: fires after an MDLog entry is
// safely journaled; pre_finish() stamps the associated MDRequest (when
// one was supplied) before finish() runs.
// NOTE(review): fragmentary extraction — several original lines (members,
// braces, the guard around mdr) are missing; code left byte-identical.
87 class ServerLogContext
: public MDSLogContextBase
{
90 MDSRank
*get_mds() override
96 void pre_finish(int r
) override
{
// Record journal commit time on the request's event trace.
98 mdr
->mark_event("journal_committed: ");
101 explicit ServerLogContext(Server
*s
) : server(s
) {
102 assert(server
!= NULL
);
// Variant that also carries the MDRequest being journaled.
104 explicit ServerLogContext(Server
*s
, MDRequestRef
& r
) : server(s
), mdr(r
) {
105 assert(server
!= NULL
);
109 void Server::create_logger()
111 PerfCountersBuilder
plb(g_ceph_context
, "mds_server", l_mdss_first
, l_mdss_last
);
112 plb
.add_u64_counter(l_mdss_handle_client_request
,"handle_client_request",
113 "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING
);
114 plb
.add_u64_counter(l_mdss_handle_slave_request
, "handle_slave_request",
115 "Slave requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING
);
116 plb
.add_u64_counter(l_mdss_handle_client_session
, "handle_client_session",
117 "Client session messages", "hcs", PerfCountersBuilder::PRIO_INTERESTING
);
118 plb
.add_u64_counter(l_mdss_dispatch_client_request
, "dispatch_client_request", "Client requests dispatched");
119 plb
.add_u64_counter(l_mdss_dispatch_slave_request
, "dispatch_server_request", "Server requests dispatched");
120 plb
.add_u64_counter(l_mdss_req_lookuphash
, "req_lookuphash",
121 "Request type lookup hash of inode");
122 plb
.add_u64_counter(l_mdss_req_lookupino
, "req_lookupino",
123 "Request type lookup inode");
124 plb
.add_u64_counter(l_mdss_req_lookupparent
, "req_lookupparent",
125 "Request type lookup parent");
126 plb
.add_u64_counter(l_mdss_req_lookupname
, "req_lookupname",
127 "Request type lookup name");
128 plb
.add_u64_counter(l_mdss_req_lookup
, "req_lookup",
129 "Request type lookup");
130 plb
.add_u64_counter(l_mdss_req_lookupsnap
, "req_lookupsnap",
131 "Request type lookup snapshot");
132 plb
.add_u64_counter(l_mdss_req_getattr
, "req_getattr",
133 "Request type get attribute");
134 plb
.add_u64_counter(l_mdss_req_setattr
, "req_setattr",
135 "Request type set attribute");
136 plb
.add_u64_counter(l_mdss_req_setlayout
, "req_setlayout",
137 "Request type set file layout");
138 plb
.add_u64_counter(l_mdss_req_setdirlayout
, "req_setdirlayout",
139 "Request type set directory layout");
140 plb
.add_u64_counter(l_mdss_req_setxattr
, "req_setxattr",
141 "Request type set extended attribute");
142 plb
.add_u64_counter(l_mdss_req_rmxattr
, "req_rmxattr",
143 "Request type remove extended attribute");
144 plb
.add_u64_counter(l_mdss_req_readdir
, "req_readdir",
145 "Request type read directory");
146 plb
.add_u64_counter(l_mdss_req_setfilelock
, "req_setfilelock",
147 "Request type set file lock");
148 plb
.add_u64_counter(l_mdss_req_getfilelock
, "req_getfilelock",
149 "Request type get file lock");
150 plb
.add_u64_counter(l_mdss_req_create
, "req_create",
151 "Request type create");
152 plb
.add_u64_counter(l_mdss_req_open
, "req_open",
153 "Request type open");
154 plb
.add_u64_counter(l_mdss_req_mknod
, "req_mknod",
155 "Request type make node");
156 plb
.add_u64_counter(l_mdss_req_link
, "req_link",
157 "Request type link");
158 plb
.add_u64_counter(l_mdss_req_unlink
, "req_unlink",
159 "Request type unlink");
160 plb
.add_u64_counter(l_mdss_req_rmdir
, "req_rmdir",
161 "Request type remove directory");
162 plb
.add_u64_counter(l_mdss_req_rename
, "req_rename",
163 "Request type rename");
164 plb
.add_u64_counter(l_mdss_req_mkdir
, "req_mkdir",
165 "Request type make directory");
166 plb
.add_u64_counter(l_mdss_req_symlink
, "req_symlink",
167 "Request type symbolic link");
168 plb
.add_u64_counter(l_mdss_req_lssnap
, "req_lssnap",
169 "Request type list snapshot");
170 plb
.add_u64_counter(l_mdss_req_mksnap
, "req_mksnap",
171 "Request type make snapshot");
172 plb
.add_u64_counter(l_mdss_req_rmsnap
, "req_rmsnap",
173 "Request type remove snapshot");
174 plb
.add_u64_counter(l_mdss_req_renamesnap
, "req_renamesnap",
175 "Request type rename snapshot");
176 logger
= plb
.create_perf_counters();
177 g_ceph_context
->get_perfcounters_collection()->add(logger
);
// Server constructor: caches pointers into the owning MDSRank and
// zeroes the reconnect/termination bookkeeping.
// NOTE(review): fragmentary extraction — parts of the initializer list
// (e.g. the mds member, locker) and the body are missing from this
// chunk; code left byte-identical.
180 Server::Server(MDSRank
*m
) :
182 mdcache(mds
->mdcache
), mdlog(mds
->mdlog
),
185 reconnect_done(NULL
),
186 failed_reconnects(0),
187 reconnect_evicting(false),
188 terminating_sessions(false)
// Top-level message dispatcher for the Server subsystem.
// Routing logic visible in this fragment:
//  - CLIENT_RECONNECT is handled immediately, regardless of state;
//  - while the rank is not active, replayed / previously-completed client
//    requests are queued for replay; other messages may be deferred via
//    wait_for_active() depending on state (stopping / clientreplay);
//  - once past the gate, CLIENT_SESSION / CLIENT_REQUEST / SLAVE_REQUEST
//    are routed to their handlers; anything else aborts.
// NOTE(review): fragmentary extraction — several returns/breaks and
// closing braces are missing from this chunk; code left byte-identical.
193 /* This function DOES put the passed message before returning*/
194 void Server::dispatch(Message
*m
)
196 switch (m
->get_type()) {
197 case CEPH_MSG_CLIENT_RECONNECT
:
198 handle_client_reconnect(static_cast<MClientReconnect
*>(m
));
// Not active yet: decide whether to queue for replay or defer.
203 if (!mds
->is_active()) {
204 if (m
->get_type() == CEPH_MSG_CLIENT_REQUEST
&&
205 (mds
->is_reconnect() || mds
->get_want_state() == CEPH_MDS_STATE_RECONNECT
)) {
206 MClientRequest
*req
= static_cast<MClientRequest
*>(m
);
207 Session
*session
= get_session(req
);
208 if (!session
|| session
->is_closed()) {
209 dout(5) << "session is closed, dropping " << req
->get_reqid() << dendl
;
213 bool queue_replay
= false;
214 if (req
->is_replay()) {
215 dout(3) << "queuing replayed op" << dendl
;
217 } else if (req
->get_retry_attempt()) {
218 // process completed request in clientreplay stage. The completed request
219 // might have created new file/directorie. This guarantees MDS sends a reply
220 // to client before other request modifies the new file/directorie.
221 if (session
->have_completed_request(req
->get_reqid().tid
, NULL
)) {
222 dout(3) << "queuing completed op" << dendl
;
225 // this request was created before the cap reconnect message, drop any embedded
227 req
->releases
.clear();
230 req
->mark_queued_for_replay();
231 mds
->enqueue_replay(new C_MDS_RetryMessage(mds
, m
));
// Decide whether this message must wait for the rank to become active.
236 bool wait_for_active
= true;
237 if (m
->get_type() == MSG_MDS_SLAVE_REQUEST
) {
238 // handle_slave_request() will wait if necessary
239 wait_for_active
= false;
240 } else if (mds
->is_stopping()) {
241 if (m
->get_source().is_mds() ||
242 m
->get_type() == CEPH_MSG_CLIENT_SESSION
)
243 wait_for_active
= false;
244 } else if (mds
->is_clientreplay()) {
245 // session open requests need to be handled during replay,
246 // close requests need to be delayed
247 if ((m
->get_type() == CEPH_MSG_CLIENT_SESSION
&&
248 (static_cast<MClientSession
*>(m
))->get_op() != CEPH_SESSION_REQUEST_CLOSE
)) {
249 wait_for_active
= false;
250 } else if (m
->get_type() == CEPH_MSG_CLIENT_REQUEST
) {
251 MClientRequest
*req
= static_cast<MClientRequest
*>(m
);
252 if (req
->is_queued_for_replay()) {
253 wait_for_active
= false;
257 if (wait_for_active
) {
258 dout(3) << "not active yet, waiting" << dendl
;
259 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
// Active: route to the per-type handler.
264 switch (m
->get_type()) {
265 case CEPH_MSG_CLIENT_SESSION
:
266 handle_client_session(static_cast<MClientSession
*>(m
));
268 case CEPH_MSG_CLIENT_REQUEST
:
269 handle_client_request(static_cast<MClientRequest
*>(m
));
271 case MSG_MDS_SLAVE_REQUEST
:
272 handle_slave_request(static_cast<MMDSSlaveRequest
*>(m
));
275 derr
<< "server unknown message " << m
->get_type() << dendl
;
276 assert(0 == "server unknown message");
282 // ----------------------------------------------------------
283 // SESSION management
// Journal-completion context for session open/close: once the ESession
// entry is safe, finish() calls Server::_session_logged() with the
// captured session state (and, for closes, the released ino set and
// projected inotable version).
// NOTE(review): fragmentary extraction — member declarations and braces
// are missing from this chunk; code left byte-identical.
285 class C_MDS_session_finish
: public ServerLogContext
{
290 interval_set
<inodeno_t
> inos
;
// Open variant: no inos released, inotablev fixed at 0.
294 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
, Context
*fin_
= NULL
) :
295 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
), inotablev(0), fin(fin_
) { }
// Close variant: also carries released preallocated inos + inotable version.
296 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
, interval_set
<inodeno_t
>& i
, version_t iv
, Context
*fin_
= NULL
) :
297 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
), inos(i
), inotablev(iv
), fin(fin_
) { }
298 void finish(int r
) override
{
300 server
->_session_logged(session
, state_seq
, open
, cmapv
, inos
, inotablev
);
// Look up the Session attached to a message's connection (connection
// priv pointer). Drops the ref taken by get_priv(); the returned
// pointer is a borrow, not a reference.
// NOTE(review): fragmentary extraction — the null-check branching and
// return statement are missing from this chunk; code left byte-identical.
307 Session
*Server::get_session(Message
*m
)
309 Session
*session
= static_cast<Session
*>(m
->get_connection()->get_priv());
311 dout(20) << "get_session have " << session
<< " " << session
->info
.inst
312 << " state " << session
->get_state_name() << dendl
;
313 session
->put(); // not carry ref
315 dout(20) << "get_session dne for " << m
->get_source_inst() << dendl
;
// Handle a CEPH_SESSION_* message from a client: session open (with
// blacklist + claimed-root validation, then journaled via ESession +
// C_MDS_session_finish), cap renewal, close (with push-seq checks),
// flush acks, and mdlog-flush requests.
// NOTE(review): fragmentary extraction — early returns, breaks and
// closing braces are missing from this chunk; code left byte-identical.
320 /* This function DOES put the passed message before returning*/
321 void Server::handle_client_session(MClientSession
*m
)
324 bool blacklisted
= false;
325 Session
*session
= get_session(m
);
327 dout(3) << "handle_client_session " << *m
<< " from " << m
->get_source() << dendl
;
328 assert(m
->get_source().is_client()); // should _not_ come from an mds!
331 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
337 logger
->inc(l_mdss_handle_client_session
);
340 switch (m
->get_op()) {
341 case CEPH_SESSION_REQUEST_OPEN
:
342 if (session
->is_opening() ||
343 session
->is_open() ||
344 session
->is_stale() ||
345 session
->is_killing()) {
346 dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl
;
347 // set client metadata for session opened by prepare_force_open_sessions
348 if (!m
->client_meta
.empty())
349 session
->set_client_metadata(m
->client_meta
);
353 assert(session
->is_closed() ||
354 session
->is_closing());
356 if (mds
->is_stopping()) {
357 dout(10) << "mds is stopping, dropping open req" << dendl
;
// Reject opens from clients the OSDMap has blacklisted.
362 blacklisted
= mds
->objecter
->with_osdmap(
363 [session
](const OSDMap
&osd_map
) -> bool {
364 return osd_map
.is_blacklisted(session
->info
.inst
.addr
);
368 dout(10) << "ignoring blacklisted client " << session
->info
.inst
.addr
<< dendl
;
373 session
->set_client_metadata(m
->client_meta
);
374 dout(20) << __func__
<< " CEPH_SESSION_REQUEST_OPEN "
375 << session
->info
.client_metadata
.size() << " metadata entries:" << dendl
;
376 for (map
<string
, string
>::iterator i
= session
->info
.client_metadata
.begin();
377 i
!= session
->info
.client_metadata
.end(); ++i
) {
378 dout(20) << " " << i
->first
<< ": " << i
->second
<< dendl
;
381 // Special case for the 'root' metadata path; validate that the claimed
382 // root is actually within the caps of the session
383 if (session
->info
.client_metadata
.count("root")) {
384 const auto claimed_root
= session
->info
.client_metadata
.at("root");
385 // claimed_root has a leading "/" which we strip before passing
387 if (claimed_root
.empty() || claimed_root
[0] != '/' ||
388 !session
->auth_caps
.path_capable(claimed_root
.substr(1))) {
389 derr
<< __func__
<< " forbidden path claimed as mount root: "
390 << claimed_root
<< " by " << m
->get_source() << dendl
;
391 // Tell the client we're rejecting their open
392 mds
->send_message_client(new MClientSession(CEPH_SESSION_REJECT
), session
);
393 mds
->clog
->warn() << "client session with invalid root '" <<
394 claimed_root
<< "' denied (" << session
->info
.inst
<< ")";
396 // Drop out; don't record this session in SessionMap or journal it.
// Accept: project the sessionmap change and journal an ESession open.
401 if (session
->is_closed())
402 mds
->sessionmap
.add_session(session
);
404 pv
= mds
->sessionmap
.mark_projected(session
);
405 sseq
= mds
->sessionmap
.set_state(session
, Session::STATE_OPENING
);
406 mds
->sessionmap
.touch_session(session
);
407 mdlog
->start_submit_entry(new ESession(m
->get_source_inst(), true, pv
, m
->client_meta
),
408 new C_MDS_session_finish(this, session
, sseq
, true, pv
));
412 case CEPH_SESSION_REQUEST_RENEWCAPS
:
413 if (session
->is_open() ||
414 session
->is_stale()) {
415 mds
->sessionmap
.touch_session(session
);
416 if (session
->is_stale()) {
417 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
418 mds
->locker
->resume_stale_caps(session
);
419 mds
->sessionmap
.touch_session(session
);
421 m
->get_connection()->send_message(new MClientSession(CEPH_SESSION_RENEWCAPS
, m
->get_seq()));
423 dout(10) << "ignoring renewcaps on non open|stale session (" << session
->get_state_name() << ")" << dendl
;
427 case CEPH_SESSION_REQUEST_CLOSE
:
429 if (session
->is_closed() ||
430 session
->is_closing() ||
431 session
->is_killing()) {
432 dout(10) << "already closed|closing|killing, dropping this req" << dendl
;
436 if (session
->is_importing()) {
437 dout(10) << "ignoring close req on importing session" << dendl
;
441 assert(session
->is_open() ||
442 session
->is_stale() ||
443 session
->is_opening());
444 if (m
->get_seq() < session
->get_push_seq()) {
445 dout(10) << "old push seq " << m
->get_seq() << " < " << session
->get_push_seq()
446 << ", dropping" << dendl
;
450 // We are getting a seq that is higher than expected.
451 // Handle the same as any other seqn error.
453 if (m
->get_seq() != session
->get_push_seq()) {
454 dout(0) << "old push seq " << m
->get_seq() << " != " << session
->get_push_seq()
455 << ", BUGGY!" << dendl
;
456 mds
->clog
->warn() << "incorrect push seq " << m
->get_seq() << " != "
457 << session
->get_push_seq() << ", dropping" << " from client : " << session
->get_human_name();
461 journal_close_session(session
, Session::STATE_CLOSING
, NULL
);
465 case CEPH_SESSION_FLUSHMSG_ACK
:
466 finish_flush_session(session
, m
->get_seq());
469 case CEPH_SESSION_REQUEST_FLUSH_MDLOG
:
470 if (mds
->is_active())
// For each client in client_set, ask it to acknowledge a FLUSHMSG;
// sessions register a flush waiter on the gather so the caller can
// rendezvous on all acks. Skips sessions that are not open, have no
// connection, or whose peer lacks the EXPORT_PEER feature.
// NOTE(review): fragmentary extraction — the per-session null assert and
// the `continue` after the feature check are missing from this chunk;
// code left byte-identical.
480 void Server::flush_client_sessions(set
<client_t
>& client_set
, MDSGatherBuilder
& gather
)
482 for (set
<client_t
>::iterator p
= client_set
.begin(); p
!= client_set
.end(); ++p
) {
483 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(p
->v
));
485 if (!session
->is_open() ||
486 !session
->connection
.get() ||
487 !session
->connection
->has_feature(CEPH_FEATURE_EXPORT_PEER
))
489 version_t seq
= session
->wait_for_flush(gather
.new_sub());
490 mds
->send_message_client(new MClientSession(CEPH_SESSION_FLUSHMSG
, seq
), session
);
494 void Server::finish_flush_session(Session
*session
, version_t seq
)
496 list
<MDSInternalContextBase
*> finished
;
497 session
->finish_flush(seq
, finished
);
498 mds
->queue_waiters(finished
);
// Journal-safe callback for session open/close (invoked by
// C_MDS_session_finish): applies the logged state to the live session.
// For closes it releases preallocated inos via the inotable; if the
// session's state_seq changed since journaling the whole thing is a
// no-op; otherwise it finishes the open (send OPEN/FORCE_RO) or the
// close/kill (revoke caps and leases, update reconnect gather, send
// CLOSE or mark the connection down, remove the session).
// NOTE(review): fragmentary extraction — returns, breaks and closing
// braces are missing from this chunk; code left byte-identical.
501 void Server::_session_logged(Session
*session
, uint64_t state_seq
, bool open
, version_t pv
,
502 interval_set
<inodeno_t
>& inos
, version_t piv
)
504 dout(10) << "_session_logged " << session
->info
.inst
<< " state_seq " << state_seq
<< " " << (open
? "open":"close")
505 << " " << pv
<< dendl
;
508 assert(session
->is_closing() || session
->is_killing() ||
509 session
->is_opening()); // re-open closing session
// Release the inos the close projected against the inotable.
510 session
->info
.prealloc_inos
.subtract(inos
);
511 mds
->inotable
->apply_release_ids(inos
);
512 assert(mds
->inotable
->get_version() == piv
);
515 mds
->sessionmap
.mark_dirty(session
);
// Stale journal entry: the session changed state after we journaled.
518 if (session
->get_state_seq() != state_seq
) {
519 dout(10) << " journaled state_seq " << state_seq
<< " != current " << session
->get_state_seq()
520 << ", noop" << dendl
;
521 // close must have been canceled (by an import?), or any number of other things..
523 assert(session
->is_opening());
524 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
525 mds
->sessionmap
.touch_session(session
);
526 assert(session
->connection
!= NULL
);
527 session
->connection
->send_message(new MClientSession(CEPH_SESSION_OPEN
));
528 if (mdcache
->is_readonly())
529 session
->connection
->send_message(new MClientSession(CEPH_SESSION_FORCE_RO
));
530 } else if (session
->is_closing() ||
531 session
->is_killing()) {
532 // kill any lingering capabilities, leases, requests
533 while (!session
->caps
.empty()) {
534 Capability
*cap
= session
->caps
.front();
535 CInode
*in
= cap
->get_inode();
536 dout(20) << " killing capability " << ccap_string(cap
->issued()) << " on " << *in
<< dendl
;
537 mds
->locker
->remove_client_cap(in
, session
->info
.inst
.name
.num());
539 while (!session
->leases
.empty()) {
540 ClientLease
*r
= session
->leases
.front();
541 CDentry
*dn
= static_cast<CDentry
*>(r
->parent
);
542 dout(20) << " killing client lease of " << *dn
<< dendl
;
543 dn
->remove_client_lease(r
, mds
->locker
);
545 if (client_reconnect_gather
.count(session
->info
.get_client())) {
546 dout(20) << " removing client from reconnect set" << dendl
;
547 client_reconnect_gather
.erase(session
->info
.get_client());
549 if (client_reconnect_gather
.empty()) {
550 dout(7) << " client " << session
->info
.inst
<< " was last reconnect, finishing" << dendl
;
551 reconnect_gather_finish();
555 if (session
->is_closing()) {
556 // mark con disposable. if there is a fault, we will get a
557 // reset and clean it up. if the client hasn't received the
558 // CLOSE message yet, they will reconnect and get an
559 // ms_handle_remote_reset() and realize they had in fact closed.
560 // do this *before* sending the message to avoid a possible
562 if (session
->connection
!= NULL
) {
563 // Conditional because terminate_sessions will indiscrimately
564 // put sessions in CLOSING whether they ever had a conn or not.
565 session
->connection
->mark_disposable();
569 mds
->send_message_client(new MClientSession(CEPH_SESSION_CLOSE
), session
);
570 mds
->sessionmap
.set_state(session
, Session::STATE_CLOSED
);
572 mds
->sessionmap
.remove_session(session
);
573 } else if (session
->is_killing()) {
574 // destroy session, close connection
575 if (session
->connection
!= NULL
) {
576 session
->connection
->mark_down();
578 mds
->sessionmap
.remove_session(session
);
// Force-open sessions learned indirectly (replay, rejoin, migration,
// cross-MDS rename): projects a sessionmap change per client and, for
// closed/closing/killing sessions, flips them to OPENING and records
// the new state seq in sseqmap; already-live sessions are pinned with
// inc_importing(). Returns the final projected sessionmap version.
// NOTE(review): fragmentary extraction — the surrounding comment's
// delimiters, an else branch and closing braces are missing from this
// chunk; code left byte-identical.
588 * Inject sessions from some source other than actual connections.
591 * - sessions inferred from journal replay
592 * - sessions learned from other MDSs during rejoin
593 * - sessions learned from other MDSs during dir/caps migration
594 * - sessions learned from other MDSs during a cross-MDS rename
596 version_t
Server::prepare_force_open_sessions(map
<client_t
,entity_inst_t
>& cm
,
597 map
<client_t
,uint64_t>& sseqmap
)
599 version_t pv
= mds
->sessionmap
.get_projected();
601 dout(10) << "prepare_force_open_sessions " << pv
602 << " on " << cm
.size() << " clients"
604 for (map
<client_t
,entity_inst_t
>::iterator p
= cm
.begin(); p
!= cm
.end(); ++p
) {
606 Session
*session
= mds
->sessionmap
.get_or_add_session(p
->second
);
607 pv
= mds
->sessionmap
.mark_projected(session
);
608 if (session
->is_closed() ||
609 session
->is_closing() ||
610 session
->is_killing())
611 sseqmap
[p
->first
] = mds
->sessionmap
.set_state(session
, Session::STATE_OPENING
);
613 assert(session
->is_open() ||
614 session
->is_opening() ||
615 session
->is_stale());
616 session
->inc_importing();
// Commit the sessions projected by prepare_force_open_sessions(): for
// clients whose recorded state seq is unchanged, move them to OPEN,
// notify the client (OPEN, plus FORCE_RO when the cache is read-only);
// otherwise skip. Each session's importing pin is dropped and the
// sessionmap marked dirty.
// NOTE(review): fragmentary extraction — part of the signature (a third
// parameter in upstream), asserts and closing braces are missing from
// this chunk; code left byte-identical.
621 void Server::finish_force_open_sessions(map
<client_t
,entity_inst_t
>& cm
,
622 map
<client_t
,uint64_t>& sseqmap
,
626 * FIXME: need to carefully consider the race conditions between a
627 * client trying to close a session and an MDS doing an import
628 * trying to force open a session...
630 dout(10) << "finish_force_open_sessions on " << cm
.size() << " clients,"
631 << " initial v " << mds
->sessionmap
.get_version() << dendl
;
634 for (map
<client_t
,entity_inst_t
>::iterator p
= cm
.begin(); p
!= cm
.end(); ++p
) {
636 Session
*session
= mds
->sessionmap
.get_session(p
->second
.name
);
639 if (sseqmap
.count(p
->first
)) {
640 uint64_t sseq
= sseqmap
[p
->first
];
641 if (session
->get_state_seq() != sseq
) {
642 dout(10) << "force_open_sessions skipping changed " << session
->info
.inst
<< dendl
;
644 dout(10) << "force_open_sessions opened " << session
->info
.inst
<< dendl
;
645 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
646 mds
->sessionmap
.touch_session(session
);
647 mds
->send_message_client(new MClientSession(CEPH_SESSION_OPEN
), session
);
648 if (mdcache
->is_readonly())
649 mds
->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO
), session
);
652 dout(10) << "force_open_sessions skipping already-open " << session
->info
.inst
<< dendl
;
653 assert(session
->is_open() || session
->is_stale());
657 session
->dec_importing();
660 mds
->sessionmap
.mark_dirty(session
);
663 dout(10) << __func__
<< ": final v " << mds
->sessionmap
.get_version() << dendl
;
// One-shot context: clears the terminating_sessions flag once the log
// flush queued by terminate_sessions() is safe.
// NOTE(review): fragmentary extraction — access specifiers and closing
// braces are missing from this chunk; code left byte-identical.
666 class C_MDS_TerminatedSessions
: public ServerContext
{
667 void finish(int r
) override
{
668 server
->terminating_sessions
= false;
671 explicit C_MDS_TerminatedSessions(Server
*s
) : ServerContext(s
) {}
// Begin shutdown of all client sessions: journal a close for every
// session not already closed/closing/killing, then wait for the log to
// be safe before clearing terminating_sessions (via
// C_MDS_TerminatedSessions). Clients are expected to retry.
// NOTE(review): fragmentary extraction — loop increment/terminator lines
// and closing braces are missing from this chunk; code left byte-identical.
674 void Server::terminate_sessions()
676 dout(2) << "terminate_sessions" << dendl
;
678 terminating_sessions
= true;
680 // kill them off. clients will retry etc.
681 set
<Session
*> sessions
;
682 mds
->sessionmap
.get_client_session_set(sessions
);
683 for (set
<Session
*>::const_iterator p
= sessions
.begin();
686 Session
*session
= *p
;
687 if (session
->is_closing() ||
688 session
->is_killing() ||
689 session
->is_closed())
691 journal_close_session(session
, Session::STATE_CLOSING
, NULL
);
694 mdlog
->wait_for_safe(new C_MDS_TerminatedSessions(this));
// Periodic idle-session scan. Phase 1: move sessions past
// mds_session_timeout to STALE (revoke caps/leases, notify the client).
// Phase 2: collect STALE sessions past mds_session_autoclose and evict
// them (optionally blacklisting, per mds_session_blacklist_on_timeout) —
// skipped if the MDS was recently laggy or it is the only MDS with a
// single client.
// NOTE(review): fragmentary extraction — loop constructs, returns and
// closing braces are missing from this chunk; code left byte-identical.
698 void Server::find_idle_sessions()
700 dout(10) << "find_idle_sessions. laggy until " << mds
->get_laggy_until() << dendl
;
703 // (caps go stale, lease die)
704 utime_t now
= ceph_clock_now();
705 utime_t cutoff
= now
;
706 cutoff
-= g_conf
->mds_session_timeout
;
708 Session
*session
= mds
->sessionmap
.get_oldest_session(Session::STATE_OPEN
);
710 dout(20) << "laggiest active session is " << session
->info
.inst
<< dendl
;
711 if (session
->last_cap_renew
>= cutoff
) {
712 dout(20) << "laggiest active session is " << session
->info
.inst
<< " and sufficiently new ("
713 << session
->last_cap_renew
<< ")" << dendl
;
717 dout(10) << "new stale session " << session
->info
.inst
<< " last " << session
->last_cap_renew
<< dendl
;
718 mds
->sessionmap
.set_state(session
, Session::STATE_STALE
);
719 mds
->locker
->revoke_stale_caps(session
);
720 mds
->locker
->remove_stale_leases(session
);
721 mds
->send_message_client(new MClientSession(CEPH_SESSION_STALE
, session
->get_push_seq()), session
);
722 finish_flush_session(session
, session
->get_push_seq());
// Phase 2: autoclose cutoff is further back in time.
727 cutoff
-= g_conf
->mds_session_autoclose
;
729 // don't kick clients if we've been laggy
730 if (mds
->get_laggy_until() > cutoff
) {
731 dout(10) << " laggy_until " << mds
->get_laggy_until() << " > cutoff " << cutoff
732 << ", not kicking any clients to be safe" << dendl
;
736 if (mds
->sessionmap
.get_sessions().size() == 1 &&
737 mds
->mdsmap
->get_num_in_mds() == 1) {
738 dout(20) << "not evicting a slow client, because there is only one"
743 // Collect a list of sessions exceeding the autoclose threshold
744 std::vector
<Session
*> to_evict
;
745 const auto sessions_p
= mds
->sessionmap
.by_state
.find(Session::STATE_STALE
);
746 if (sessions_p
== mds
->sessionmap
.by_state
.end() || sessions_p
->second
->empty()) {
749 const auto &stale_sessions
= sessions_p
->second
;
750 assert(stale_sessions
!= nullptr);
752 for (const auto &session
: *stale_sessions
) {
753 if (session
->is_importing()) {
754 dout(10) << "stopping at importing session " << session
->info
.inst
<< dendl
;
757 assert(session
->is_stale());
758 if (session
->last_cap_renew
>= cutoff
) {
759 dout(20) << "oldest stale session is " << session
->info
.inst
<< " and sufficiently new ("
760 << session
->last_cap_renew
<< ")" << dendl
;
764 to_evict
.push_back(session
);
767 for (const auto &session
: to_evict
) {
769 age
-= session
->last_cap_renew
;
770 mds
->clog
->warn() << "evicting unresponsive client " << *session
771 << ", after " << age
<< " seconds";
772 dout(10) << "autoclosing stale session " << session
->info
.inst
<< " last "
773 << session
->last_cap_renew
<< dendl
;
775 if (g_conf
->mds_session_blacklist_on_timeout
) {
776 std::stringstream ss
;
777 mds
->evict_client(session
->info
.inst
.name
.num(), false, true,
780 kill_session(session
, NULL
);
// Forcibly terminate a session (caller holds mds_lock). Live,
// non-importing sessions are journaled into KILLING via
// journal_close_session(); sessions already closing/closed/killing (or
// importing) just get on_safe completed immediately when provided.
// NOTE(review): fragmentary extraction — comment delimiters, the else
// keyword and closing braces are missing from this chunk; code left
// byte-identical.
786 * XXX bump in the interface here, not using an MDSInternalContextBase here
787 * because all the callers right now happen to use a SaferCond
789 void Server::kill_session(Session
*session
, Context
*on_safe
)
791 assert(mds
->mds_lock
.is_locked_by_me());
793 if ((session
->is_opening() ||
794 session
->is_open() ||
795 session
->is_stale()) &&
796 !session
->is_importing()) {
797 dout(10) << "kill_session " << session
<< dendl
;
798 journal_close_session(session
, Session::STATE_KILLING
, on_safe
);
800 dout(10) << "kill_session importing or already closing/killing " << session
<< dendl
;
801 assert(session
->is_closing() ||
802 session
->is_closed() ||
803 session
->is_killing() ||
804 session
->is_importing());
806 on_safe
->complete(0);
// Kill every client session whose address appears in the OSDMap
// blacklist (MDS-to-MDS sessions are exempt — MDS death is learned via
// the MDSMap). Returns the number of sessions killed.
// NOTE(review): fragmentary extraction — a `continue` and closing braces
// are missing from this chunk; code left byte-identical.
811 size_t Server::apply_blacklist(const std::set
<entity_addr_t
> &blacklist
)
813 std::list
<Session
*> victims
;
814 const auto sessions
= mds
->sessionmap
.get_sessions();
815 for (const auto p
: sessions
) {
816 if (!p
.first
.is_client()) {
817 // Do not apply OSDMap blacklist to MDS daemons, we find out
818 // about their death via MDSMap.
822 Session
*s
= p
.second
;
823 if (blacklist
.count(s
->info
.inst
.addr
)) {
824 victims
.push_back(s
);
828 for (const auto s
: victims
) {
829 kill_session(s
, nullptr);
832 dout(10) << "apply_blacklist: killed " << victims
.size() << dendl
;
834 return victims
.size();
// Start journaling a session close/kill: set the target state, project
// the sessionmap change, project release of the session's preallocated
// (and pending-preallocated) inos against the inotable, then submit an
// ESession close with a C_MDS_session_finish completion. Also kills the
// session's in-flight MDRequests and wakes any flush waiters.
// NOTE(review): fragmentary extraction — loop framing around the request
// iterator and closing braces are missing from this chunk; code left
// byte-identical.
837 void Server::journal_close_session(Session
*session
, int state
, Context
*on_safe
)
839 uint64_t sseq
= mds
->sessionmap
.set_state(session
, state
);
840 version_t pv
= mds
->sessionmap
.mark_projected(session
);
843 // release alloc and pending-alloc inos for this session
844 // and wipe out session state, in case the session close aborts for some reason
845 interval_set
<inodeno_t
> both
;
846 both
.insert(session
->info
.prealloc_inos
);
847 both
.insert(session
->pending_prealloc_inos
);
849 mds
->inotable
->project_release_ids(both
);
850 piv
= mds
->inotable
->get_projected_version();
854 mdlog
->start_submit_entry(new ESession(session
->info
.inst
, false, pv
, both
, piv
),
855 new C_MDS_session_finish(this, session
, sseq
, false, pv
, both
, piv
, on_safe
));
858 // clean up requests, too
859 elist
<MDRequestImpl
*>::iterator p
=
860 session
->requests
.begin(member_offset(MDRequestImpl
,
861 item_session_request
));
863 MDRequestRef mdr
= mdcache
->request_get((*p
)->reqid
);
865 mdcache
->request_kill(mdr
);
868 finish_flush_session(session
, session
->get_push_seq());
// Begin the client-reconnect phase: remember the completion callback,
// seed client_reconnect_gather with every known client, and either
// finish immediately (no sessions) or record the start time and dump
// the sessionmap while waiting for MClientReconnect messages.
// NOTE(review): fragmentary extraction — braces/returns are missing from
// this chunk; code left byte-identical.
871 void Server::reconnect_clients(MDSInternalContext
*reconnect_done_
)
873 reconnect_done
= reconnect_done_
;
874 mds
->sessionmap
.get_client_set(client_reconnect_gather
);
876 if (client_reconnect_gather
.empty()) {
877 dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl
;
878 reconnect_gather_finish();
882 // clients will get the mdsmap and discover we're reconnecting via the monitor.
884 reconnect_start
= ceph_clock_now();
885 dout(1) << "reconnect_clients -- " << client_reconnect_gather
.size() << " sessions" << dendl
;
886 mds
->sessionmap
.dump();
// Handle an MClientReconnect: defer if the rank is not yet in reconnect
// state; deny (send CLOSE) when reconnect is over, the session is
// closed, or the FS is read-only; otherwise accept (send OPEN), replay
// the client's snaprealms and caps into the cache (exporting non-auth
// caps to their authority), and remove the client from the gather set,
// finishing the phase when it empties.
// NOTE(review): fragmentary extraction — returns, loop increments and
// closing braces are missing from this chunk; code left byte-identical.
889 /* This function DOES put the passed message before returning*/
890 void Server::handle_client_reconnect(MClientReconnect
*m
)
892 dout(7) << "handle_client_reconnect " << m
->get_source() << dendl
;
893 client_t from
= m
->get_source().num();
894 Session
*session
= get_session(m
);
897 if (!mds
->is_reconnect() && mds
->get_want_state() == CEPH_MDS_STATE_RECONNECT
) {
898 dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl
;
899 mds
->wait_for_reconnect(new C_MDS_RetryMessage(mds
, m
));
903 utime_t delay
= ceph_clock_now();
904 delay
-= reconnect_start
;
905 dout(10) << " reconnect_start " << reconnect_start
<< " delay " << delay
<< dendl
;
908 if (!mds
->is_reconnect() || mds
->get_want_state() != CEPH_MDS_STATE_RECONNECT
|| reconnect_evicting
) {
909 // XXX maybe in the future we can do better than this?
910 dout(1) << " no longer in reconnect state, ignoring reconnect, sending close" << dendl
;
911 mds
->clog
->info() << "denied reconnect attempt (mds is "
912 << ceph_mds_state_name(mds
->get_state())
913 << ") from " << m
->get_source_inst()
914 << " after " << delay
<< " (allowed interval " << g_conf
->mds_reconnect_timeout
<< ")";
916 } else if (session
->is_closed()) {
917 dout(1) << " session is closed, ignoring reconnect, sending close" << dendl
;
918 mds
->clog
->info() << "denied reconnect attempt (mds is "
919 << ceph_mds_state_name(mds
->get_state())
920 << ") from " << m
->get_source_inst() << " (session is closed)";
922 } else if (mdcache
->is_readonly()) {
923 dout(1) << " read-only FS, ignoring reconnect, sending close" << dendl
;
924 mds
->clog
->info() << "denied reconnect attempt (mds is read-only)";
929 m
->get_connection()->send_message(new MClientSession(CEPH_SESSION_CLOSE
));
934 // notify client of success with an OPEN
935 m
->get_connection()->send_message(new MClientSession(CEPH_SESSION_OPEN
));
936 session
->last_cap_renew
= ceph_clock_now();
937 mds
->clog
->debug() << "reconnect by " << session
->info
.inst
<< " after " << delay
;
// Replay the client's snaprealm state.
940 for (vector
<ceph_mds_snaprealm_reconnect
>::iterator p
= m
->realms
.begin();
941 p
!= m
->realms
.end();
943 CInode
*in
= mdcache
->get_inode(inodeno_t(p
->ino
));
944 if (in
&& in
->state_test(CInode::STATE_PURGING
))
947 assert(in
->snaprealm
);
948 if (in
->snaprealm
->have_past_parents_open()) {
949 dout(15) << "open snaprealm (w/ past parents) on " << *in
<< dendl
;
950 mdcache
->finish_snaprealm_reconnect(from
, in
->snaprealm
, snapid_t(p
->seq
));
952 dout(15) << "open snaprealm (w/o past parents) on " << *in
<< dendl
;
953 mdcache
->add_reconnected_snaprealm(from
, inodeno_t(p
->ino
), snapid_t(p
->seq
));
956 dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(p
->ino
)
957 << " seq " << p
->seq
<< dendl
;
958 mdcache
->add_reconnected_snaprealm(from
, inodeno_t(p
->ino
), snapid_t(p
->seq
));
// Replay the client's caps.
963 for (map
<inodeno_t
, cap_reconnect_t
>::iterator p
= m
->caps
.begin();
966 // make sure our last_cap_id is MAX over all issued caps
967 if (p
->second
.capinfo
.cap_id
> mdcache
->last_cap_id
)
968 mdcache
->last_cap_id
= p
->second
.capinfo
.cap_id
;
970 CInode
*in
= mdcache
->get_inode(p
->first
);
971 if (in
&& in
->state_test(CInode::STATE_PURGING
))
973 if (in
&& in
->is_auth()) {
974 // we recovered it, and it's ours. take note.
975 dout(15) << "open cap realm " << inodeno_t(p
->second
.capinfo
.snaprealm
)
976 << " on " << *in
<< dendl
;
977 in
->reconnect_cap(from
, p
->second
, session
);
978 mdcache
->add_reconnected_cap(from
, p
->first
, p
->second
);
979 recover_filelocks(in
, p
->second
.flockbl
, m
->get_orig_source().num());
983 if (in
&& !in
->is_auth()) {
985 dout(10) << "non-auth " << *in
<< ", will pass off to authority" << dendl
;
986 // add to cap export list.
987 p
->second
.path
.clear(); // we don't need path
988 mdcache
->rejoin_export_caps(p
->first
, from
, p
->second
,
989 in
->authority().first
);
991 // don't know if the inode is mine
992 dout(10) << "missing ino " << p
->first
<< ", will load later" << dendl
;
993 p
->second
.path
.clear(); // we don't need path
994 mdcache
->rejoin_recovered_caps(p
->first
, from
, p
->second
, MDS_RANK_NONE
);
998 // remove from gather set
999 client_reconnect_gather
.erase(from
);
1000 if (client_reconnect_gather
.empty())
1001 reconnect_gather_finish();
1008 void Server::reconnect_gather_finish()
1010 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects
<< " clients" << dendl
;
1011 assert(reconnect_done
);
1012 reconnect_done
->complete(0);
1013 reconnect_done
= NULL
;
1016 void Server::reconnect_tick()
1018 if (reconnect_evicting
) {
1019 dout(4) << "reconnect_tick: waiting for evictions" << dendl
;
1023 utime_t reconnect_end
= reconnect_start
;
1024 reconnect_end
+= g_conf
->mds_reconnect_timeout
;
1025 if (ceph_clock_now() >= reconnect_end
&&
1026 !client_reconnect_gather
.empty()) {
1027 dout(10) << "reconnect timed out" << dendl
;
1029 // If we're doing blacklist evictions, use this to wait for them before
1030 // proceeding to reconnect_gather_finish
1031 MDSGatherBuilder
gather(g_ceph_context
);
1033 for (set
<client_t
>::iterator p
= client_reconnect_gather
.begin();
1034 p
!= client_reconnect_gather
.end();
1036 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(p
->v
));
1038 dout(1) << "reconnect gave up on " << session
->info
.inst
<< dendl
;
1040 mds
->clog
->warn() << "evicting unresponsive client " << *session
1041 << ", after waiting " << g_conf
->mds_reconnect_timeout
1042 << " seconds during MDS startup";
1044 if (g_conf
->mds_session_blacklist_on_timeout
) {
1045 std::stringstream ss
;
1046 mds
->evict_client(session
->info
.inst
.name
.num(), false, true, ss
,
1049 kill_session(session
, NULL
);
1052 failed_reconnects
++;
1054 client_reconnect_gather
.clear();
1056 if (gather
.has_subs()) {
1057 dout(1) << "reconnect will complete once clients are evicted" << dendl
;
1058 gather
.set_finisher(new MDSInternalContextWrapper(mds
, new FunctionContext(
1059 [this](int r
){reconnect_gather_finish();})));
1061 reconnect_evicting
= true;
1063 reconnect_gather_finish();
1068 void Server::recover_filelocks(CInode
*in
, bufferlist locks
, int64_t client
)
1070 if (!locks
.length()) return;
1073 bufferlist::iterator p
= locks
.begin();
1074 ::decode(numlocks
, p
);
1075 for (int i
= 0; i
< numlocks
; ++i
) {
1077 lock
.client
= client
;
1078 in
->get_fcntl_lock_state()->held_locks
.insert(pair
<uint64_t, ceph_filelock
>(lock
.start
, lock
));
1079 ++in
->get_fcntl_lock_state()->client_held_lock_counts
[client
];
1081 ::decode(numlocks
, p
);
1082 for (int i
= 0; i
< numlocks
; ++i
) {
1084 lock
.client
= client
;
1085 in
->get_flock_lock_state()->held_locks
.insert(pair
<uint64_t, ceph_filelock
> (lock
.start
, lock
));
1086 ++in
->get_flock_lock_state()->client_held_lock_counts
[client
];
1092 * Call this when the MDCache is oversized, to send requests to the clients
1093 * to trim some caps, and consequently unpin some inodes in the MDCache so
1094 * that it can trim too.
1096 void Server::recall_client_state(void)
1098 /* try to recall at least 80% of all caps */
1099 uint64_t max_caps_per_client
= Capability::count() * g_conf
->get_val
<double>("mds_max_ratio_caps_per_client");
1100 uint64_t min_caps_per_client
= g_conf
->get_val
<uint64_t>("mds_min_caps_per_client");
1101 if (max_caps_per_client
< min_caps_per_client
) {
1102 dout(0) << "max_caps_per_client " << max_caps_per_client
1103 << " < min_caps_per_client " << min_caps_per_client
<< dendl
;
1104 max_caps_per_client
= min_caps_per_client
+ 1;
1107 /* unless this ratio is smaller: */
1108 /* ratio: determine the amount of caps to recall from each client. Use
1109 * percentage full over the cache reservation. Cap the ratio at 80% of client
1111 double ratio
= 1.0-fmin(0.80, mdcache
->cache_toofull_ratio());
1113 dout(10) << "recall_client_state " << ratio
1114 << ", caps per client " << min_caps_per_client
<< "-" << max_caps_per_client
1117 set
<Session
*> sessions
;
1118 mds
->sessionmap
.get_client_session_set(sessions
);
1119 for (auto &session
: sessions
) {
1120 if (!session
->is_open() ||
1121 !session
->info
.inst
.name
.is_client())
1124 dout(10) << " session " << session
->info
.inst
1125 << " caps " << session
->caps
.size()
1126 << ", leases " << session
->leases
.size()
1129 uint64_t newlim
= MAX(MIN((session
->caps
.size() * ratio
), max_caps_per_client
), min_caps_per_client
);
1130 if (session
->caps
.size() > newlim
) {
1131 MClientSession
*m
= new MClientSession(CEPH_SESSION_RECALL_STATE
);
1132 m
->head
.max_caps
= newlim
;
1133 mds
->send_message_client(m
, session
);
1134 session
->notify_recall_sent(newlim
);
1139 void Server::force_clients_readonly()
1141 dout(10) << "force_clients_readonly" << dendl
;
1142 set
<Session
*> sessions
;
1143 mds
->sessionmap
.get_client_session_set(sessions
);
1144 for (set
<Session
*>::const_iterator p
= sessions
.begin();
1145 p
!= sessions
.end();
1147 Session
*session
= *p
;
1148 if (!session
->info
.inst
.name
.is_client() ||
1149 !(session
->is_open() || session
->is_stale()))
1151 mds
->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO
), session
);
1156 * some generic stuff for finishing off requests
1158 void Server::journal_and_reply(MDRequestRef
& mdr
, CInode
*in
, CDentry
*dn
, LogEvent
*le
, MDSLogContextBase
*fin
)
1160 dout(10) << "journal_and_reply tracei " << in
<< " tracedn " << dn
<< dendl
;
1161 assert(!mdr
->has_completed
);
1163 // note trace items for eventual reply.
1172 early_reply(mdr
, in
, dn
);
1174 mdr
->committing
= true;
1175 submit_mdlog_entry(le
, fin
, mdr
, __func__
);
1177 if (mdr
->client_request
&& mdr
->client_request
->is_queued_for_replay()) {
1178 if (mds
->queue_one_replay()) {
1179 dout(10) << " queued next replay op" << dendl
;
1181 dout(10) << " journaled last replay op, flushing" << dendl
;
1184 } else if (mdr
->did_early_reply
)
1185 mds
->locker
->drop_rdlocks_for_early_reply(mdr
.get());
1190 void Server::submit_mdlog_entry(LogEvent
*le
, MDSLogContextBase
*fin
, MDRequestRef
& mdr
,
1194 string
event_str("submit entry: ");
1196 mdr
->mark_event_string(event_str
);
1198 mdlog
->submit_entry(le
, fin
);
1202 * send response built from mdr contents and error code; clean up mdr
1204 void Server::respond_to_request(MDRequestRef
& mdr
, int r
)
1206 if (mdr
->client_request
) {
1207 reply_client_request(mdr
, new MClientReply(mdr
->client_request
, r
));
1209 // add here to avoid counting ops multiple times (e.g., locks, loading)
1210 switch(mdr
->client_request
->get_op()) {
1211 case CEPH_MDS_OP_LOOKUPHASH
:
1212 logger
->inc(l_mdss_req_lookuphash
);
1214 case CEPH_MDS_OP_LOOKUPINO
:
1215 logger
->inc(l_mdss_req_lookupino
);
1217 case CEPH_MDS_OP_LOOKUPPARENT
:
1218 logger
->inc(l_mdss_req_lookupparent
);
1220 case CEPH_MDS_OP_LOOKUPNAME
:
1221 logger
->inc(l_mdss_req_lookupname
);
1223 case CEPH_MDS_OP_LOOKUP
:
1224 logger
->inc(l_mdss_req_lookup
);
1226 case CEPH_MDS_OP_LOOKUPSNAP
:
1227 logger
->inc(l_mdss_req_lookupsnap
);
1229 case CEPH_MDS_OP_GETATTR
:
1230 logger
->inc(l_mdss_req_getattr
);
1232 case CEPH_MDS_OP_SETATTR
:
1233 logger
->inc(l_mdss_req_setattr
);
1235 case CEPH_MDS_OP_SETLAYOUT
:
1236 logger
->inc(l_mdss_req_setlayout
);
1238 case CEPH_MDS_OP_SETDIRLAYOUT
:
1239 logger
->inc(l_mdss_req_setdirlayout
);
1241 case CEPH_MDS_OP_SETXATTR
:
1242 logger
->inc(l_mdss_req_setxattr
);
1244 case CEPH_MDS_OP_RMXATTR
:
1245 logger
->inc(l_mdss_req_rmxattr
);
1247 case CEPH_MDS_OP_READDIR
:
1248 logger
->inc(l_mdss_req_readdir
);
1250 case CEPH_MDS_OP_SETFILELOCK
:
1251 logger
->inc(l_mdss_req_setfilelock
);
1253 case CEPH_MDS_OP_GETFILELOCK
:
1254 logger
->inc(l_mdss_req_getfilelock
);
1256 case CEPH_MDS_OP_CREATE
:
1257 logger
->inc(l_mdss_req_create
);
1258 case CEPH_MDS_OP_OPEN
:
1259 logger
->inc(l_mdss_req_open
);
1261 case CEPH_MDS_OP_MKNOD
:
1262 logger
->inc(l_mdss_req_mknod
);
1264 case CEPH_MDS_OP_LINK
:
1265 logger
->inc(l_mdss_req_link
);
1267 case CEPH_MDS_OP_UNLINK
:
1268 logger
->inc(l_mdss_req_unlink
);
1270 case CEPH_MDS_OP_RMDIR
:
1271 logger
->inc(l_mdss_req_rmdir
);
1273 case CEPH_MDS_OP_RENAME
:
1274 logger
->inc(l_mdss_req_rename
);
1276 case CEPH_MDS_OP_MKDIR
:
1277 logger
->inc(l_mdss_req_mkdir
);
1279 case CEPH_MDS_OP_SYMLINK
:
1280 logger
->inc(l_mdss_req_symlink
);
1282 case CEPH_MDS_OP_LSSNAP
:
1283 logger
->inc(l_mdss_req_lssnap
);
1285 case CEPH_MDS_OP_MKSNAP
:
1286 logger
->inc(l_mdss_req_mksnap
);
1288 case CEPH_MDS_OP_RMSNAP
:
1289 logger
->inc(l_mdss_req_rmsnap
);
1291 case CEPH_MDS_OP_RENAMESNAP
:
1292 logger
->inc(l_mdss_req_renamesnap
);
1295 } else if (mdr
->internal_op
> -1) {
1296 dout(10) << "respond_to_request on internal request " << mdr
<< dendl
;
1297 if (!mdr
->internal_op_finish
)
1298 assert(0 == "trying to respond to internal op without finisher");
1299 mdr
->internal_op_finish
->complete(r
);
1300 mdcache
->request_finish(mdr
);
1304 void Server::early_reply(MDRequestRef
& mdr
, CInode
*tracei
, CDentry
*tracedn
)
1306 if (!g_conf
->mds_early_reply
)
1309 if (mdr
->no_early_reply
) {
1310 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl
;
1314 if (mdr
->has_more() && mdr
->more()->has_journaled_slaves
) {
1315 dout(10) << "early_reply - there are journaled slaves, not allowed." << dendl
;
1319 if (mdr
->alloc_ino
) {
1320 dout(10) << "early_reply - allocated ino, not allowed" << dendl
;
1324 MClientRequest
*req
= mdr
->client_request
;
1325 entity_inst_t client_inst
= req
->get_source_inst();
1326 if (client_inst
.name
.is_mds())
1329 if (req
->is_replay()) {
1330 dout(10) << " no early reply on replay op" << dendl
;
1335 MClientReply
*reply
= new MClientReply(req
, 0);
1336 reply
->set_unsafe();
1338 // mark xlocks "done", indicating that we are exposing uncommitted changes.
1340 //_rename_finish() does not send dentry link/unlink message to replicas.
1341 // so do not set xlocks on dentries "done", the xlocks prevent dentries
1342 // that have projected linkages from getting new replica.
1343 mds
->locker
->set_xlocks_done(mdr
.get(), req
->get_op() == CEPH_MDS_OP_RENAME
);
1345 dout(10) << "early_reply " << reply
->get_result()
1346 << " (" << cpp_strerror(reply
->get_result())
1347 << ") " << *req
<< dendl
;
1349 if (tracei
|| tracedn
) {
1351 mdr
->cap_releases
.erase(tracei
->vino());
1353 mdr
->cap_releases
.erase(tracedn
->get_dir()->get_inode()->vino());
1355 set_trace_dist(mdr
->session
, reply
, tracei
, tracedn
, mdr
->snapid
,
1356 req
->get_dentry_wanted(), mdr
);
1359 reply
->set_extra_bl(mdr
->reply_extra_bl
);
1360 req
->get_connection()->send_message(reply
);
1362 mdr
->did_early_reply
= true;
1364 mds
->logger
->inc(l_mds_reply
);
1365 utime_t lat
= ceph_clock_now() - req
->get_recv_stamp();
1366 mds
->logger
->tinc(l_mds_reply_latency
, lat
);
1367 dout(20) << "lat " << lat
<< dendl
;
1369 mdr
->mark_event("early_replied");
1374 * include a trace to tracei
1377 void Server::reply_client_request(MDRequestRef
& mdr
, MClientReply
*reply
)
1380 MClientRequest
*req
= mdr
->client_request
;
1382 dout(7) << "reply_client_request " << reply
->get_result()
1383 << " (" << cpp_strerror(reply
->get_result())
1384 << ") " << *req
<< dendl
;
1386 mdr
->mark_event("replying");
1388 Session
*session
= mdr
->session
;
1390 // note successful request in session map?
1392 // setfilelock requests are special, they only modify states in MDS memory.
1393 // The states get lost when MDS fails. If Client re-send a completed
1394 // setfilelock request, it means that client did not receive corresponding
1395 // setfilelock reply. So MDS should re-execute the setfilelock request.
1396 if (req
->may_write() && req
->get_op() != CEPH_MDS_OP_SETFILELOCK
&&
1397 reply
->get_result() == 0 && session
) {
1398 inodeno_t created
= mdr
->alloc_ino
? mdr
->alloc_ino
: mdr
->used_prealloc_ino
;
1399 session
->add_completed_request(mdr
->reqid
.tid
, created
);
1401 mdr
->ls
->touched_sessions
.insert(session
->info
.inst
.name
);
1405 // give any preallocated inos to the session
1406 apply_allocated_inos(mdr
, session
);
1408 // get tracei/tracedn from mdr?
1409 snapid_t snapid
= mdr
->snapid
;
1410 CInode
*tracei
= mdr
->tracei
;
1411 CDentry
*tracedn
= mdr
->tracedn
;
1413 bool is_replay
= mdr
->client_request
->is_replay();
1414 bool did_early_reply
= mdr
->did_early_reply
;
1415 entity_inst_t client_inst
= req
->get_source_inst();
1416 int dentry_wanted
= req
->get_dentry_wanted();
1418 if (!did_early_reply
&& !is_replay
) {
1420 mds
->logger
->inc(l_mds_reply
);
1421 utime_t lat
= ceph_clock_now() - mdr
->client_request
->get_recv_stamp();
1422 mds
->logger
->tinc(l_mds_reply_latency
, lat
);
1423 dout(20) << "lat " << lat
<< dendl
;
1426 mdr
->cap_releases
.erase(tracei
->vino());
1428 mdr
->cap_releases
.erase(tracedn
->get_dir()->get_inode()->vino());
1431 // drop non-rdlocks before replying, so that we can issue leases
1432 mdcache
->request_drop_non_rdlocks(mdr
);
1435 if (client_inst
.name
.is_mds() || !session
) {
1436 reply
->put(); // mds doesn't need a reply
1440 if (!did_early_reply
&& // don't issue leases if we sent an earlier reply already
1441 (tracei
|| tracedn
)) {
1444 mdcache
->try_reconnect_cap(tracei
, session
);
1446 // include metadata in reply
1447 set_trace_dist(session
, reply
, tracei
, tracedn
,
1448 snapid
, dentry_wanted
,
1453 // We can set the extra bl unconditionally: if it's already been sent in the
1454 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
1455 reply
->set_extra_bl(mdr
->reply_extra_bl
);
1457 reply
->set_mdsmap_epoch(mds
->mdsmap
->get_epoch());
1458 req
->get_connection()->send_message(reply
);
1461 if (req
->is_queued_for_replay() &&
1462 (mdr
->has_completed
|| reply
->get_result() < 0)) {
1463 if (reply
->get_result() < 0) {
1464 int r
= reply
->get_result();
1465 derr
<< "reply_client_request: failed to replay " << *req
1466 << " error " << r
<< " (" << cpp_strerror(r
) << ")" << dendl
;
1467 mds
->clog
->warn() << "failed to replay " << req
->get_reqid() << " error " << r
;
1469 mds
->queue_one_replay();
1473 mdcache
->request_finish(mdr
);
1475 // take a closer look at tracei, if it happens to be a remote link
1478 tracedn
->get_projected_linkage()->is_remote()) {
1479 mdcache
->eval_remote(tracedn
);
// Encode a default-constructed (all-zero) DirStat into bl.
// NOTE(review): the extraction this chunk came from dropped several
// lines of this function (opening brace, the encode call and closing
// brace); the surviving fragments are kept verbatim below.
1484 void Server::encode_empty_dirstat(bufferlist
& bl
)
1486 static DirStat empty
;
// Encode a lease that never expires into bl.
// NOTE(review): the body lines constructing and populating the lease
// structure before the dout were dropped by the extraction; only the
// surviving fragments are kept verbatim below — restore from upstream.
1490 void Server::encode_infinite_lease(bufferlist
& bl
)
1497 dout(20) << "encode_infinite_lease " << e
<< dendl
;
// Encode an empty/null lease (no caps leased) into bl.
// NOTE(review): the body lines constructing and populating the lease
// structure before the dout were dropped by the extraction; only the
// surviving fragments are kept verbatim below — restore from upstream.
1500 void Server::encode_null_lease(bufferlist
& bl
)
1507 dout(20) << "encode_null_lease " << e
<< dendl
;
1512 * pass inode OR dentry (not both, or we may get confused)
1514 * trace is in reverse order (i.e. root inode comes last)
1516 void Server::set_trace_dist(Session
*session
, MClientReply
*reply
,
1517 CInode
*in
, CDentry
*dn
,
1522 // skip doing this for debugging purposes?
1523 if (g_conf
->mds_inject_traceless_reply_probability
&&
1524 mdr
->ls
&& !mdr
->o_trunc
&&
1525 (rand() % 10000 < g_conf
->mds_inject_traceless_reply_probability
* 10000.0)) {
1526 dout(5) << "deliberately skipping trace for " << *reply
<< dendl
;
1530 // inode, dentry, dir, ..., inode
1532 mds_rank_t whoami
= mds
->get_nodeid();
1533 client_t client
= session
->get_client();
1534 utime_t now
= ceph_clock_now();
1536 dout(20) << "set_trace_dist snapid " << snapid
<< dendl
;
1538 //assert((bool)dn == (bool)dentry_wanted); // not true for snapshot lookups
1541 if (snapid
== CEPH_NOSNAP
) {
1544 realm
= in
->find_snaprealm();
1546 realm
= dn
->get_dir()->get_inode()->find_snaprealm();
1547 reply
->snapbl
= realm
->get_snap_trace();
1548 dout(10) << "set_trace_dist snaprealm " << *realm
<< " len=" << reply
->snapbl
.length() << dendl
;
1553 reply
->head
.is_dentry
= 1;
1554 CDir
*dir
= dn
->get_dir();
1555 CInode
*diri
= dir
->get_inode();
1557 diri
->encode_inodestat(bl
, session
, NULL
, snapid
);
1558 dout(20) << "set_trace_dist added diri " << *diri
<< dendl
;
1560 #ifdef MDS_VERIFY_FRAGSTAT
1561 if (dir
->is_complete())
1562 dir
->verify_fragstat();
1564 dir
->encode_dirstat(bl
, whoami
);
1565 dout(20) << "set_trace_dist added dir " << *dir
<< dendl
;
1567 ::encode(dn
->get_name(), bl
);
1568 if (snapid
== CEPH_NOSNAP
)
1569 mds
->locker
->issue_client_lease(dn
, client
, bl
, now
, session
);
1571 encode_null_lease(bl
);
1572 dout(20) << "set_trace_dist added dn " << snapid
<< " " << *dn
<< dendl
;
1574 reply
->head
.is_dentry
= 0;
1578 in
->encode_inodestat(bl
, session
, NULL
, snapid
, 0, mdr
->getattr_caps
);
1579 dout(20) << "set_trace_dist added in " << *in
<< dendl
;
1580 reply
->head
.is_target
= 1;
1582 reply
->head
.is_target
= 0;
1584 reply
->set_trace(bl
);
1591 * process a client request
1592 * This function DOES put the passed message before returning
1594 void Server::handle_client_request(MClientRequest
*req
)
1596 dout(4) << "handle_client_request " << *req
<< dendl
;
1599 mds
->logger
->inc(l_mds_request
);
1601 logger
->inc(l_mdss_handle_client_request
);
1603 if (!mdcache
->is_open()) {
1604 dout(5) << "waiting for root" << dendl
;
1605 mdcache
->wait_for_open(new C_MDS_RetryMessage(mds
, req
));
1610 Session
*session
= 0;
1611 if (req
->get_source().is_client()) {
1612 session
= get_session(req
);
1614 dout(5) << "no session for " << req
->get_source() << ", dropping" << dendl
;
1615 } else if (session
->is_closed() ||
1616 session
->is_closing() ||
1617 session
->is_killing()) {
1618 dout(5) << "session closed|closing|killing, dropping" << dendl
;
1622 if (req
->is_queued_for_replay())
1623 mds
->queue_one_replay();
1630 if (req
->get_mdsmap_epoch() < mds
->mdsmap
->get_epoch()) {
1631 // send it? hrm, this isn't ideal; they may get a lot of copies if
1632 // they have a high request rate.
1635 // completed request?
1636 bool has_completed
= false;
1637 if (req
->is_replay() || req
->get_retry_attempt()) {
1640 if (session
->have_completed_request(req
->get_reqid().tid
, &created
)) {
1641 has_completed
= true;
1642 // Don't send traceless reply if the completed request has created
1643 // new inode. Treat the request as lookup request instead.
1644 if (req
->is_replay() ||
1645 ((created
== inodeno_t() || !mds
->is_clientreplay()) &&
1646 req
->get_op() != CEPH_MDS_OP_OPEN
&&
1647 req
->get_op() != CEPH_MDS_OP_CREATE
)) {
1648 dout(5) << "already completed " << req
->get_reqid() << dendl
;
1649 MClientReply
*reply
= new MClientReply(req
, 0);
1650 if (created
!= inodeno_t()) {
1652 ::encode(created
, extra
);
1653 reply
->set_extra_bl(extra
);
1655 req
->get_connection()->send_message(reply
);
1657 if (req
->is_queued_for_replay())
1658 mds
->queue_one_replay();
1663 if (req
->get_op() != CEPH_MDS_OP_OPEN
&&
1664 req
->get_op() != CEPH_MDS_OP_CREATE
) {
1665 dout(10) << " completed request which created new inode " << created
1666 << ", convert it to lookup request" << dendl
;
1667 req
->head
.op
= req
->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP
: CEPH_MDS_OP_GETATTR
;
1668 req
->head
.args
.getattr
.mask
= CEPH_STAT_CAP_INODE_ALL
;
1673 // trim completed_request list
1674 if (req
->get_oldest_client_tid() > 0) {
1675 dout(15) << " oldest_client_tid=" << req
->get_oldest_client_tid() << dendl
;
1677 if (session
->trim_completed_requests(req
->get_oldest_client_tid())) {
1678 // Sessions 'completed_requests' was dirtied, mark it to be
1679 // potentially flushed at segment expiry.
1680 mdlog
->get_current_segment()->touched_sessions
.insert(session
->info
.inst
.name
);
1682 if (session
->get_num_trim_requests_warnings() > 0 &&
1683 session
->get_num_completed_requests() * 2 < g_conf
->mds_max_completed_requests
)
1684 session
->reset_num_trim_requests_warnings();
1686 if (session
->get_num_completed_requests() >=
1687 (g_conf
->mds_max_completed_requests
<< session
->get_num_trim_requests_warnings())) {
1688 session
->inc_num_trim_requests_warnings();
1690 ss
<< "client." << session
->get_client() << " does not advance its oldest_client_tid ("
1691 << req
->get_oldest_client_tid() << "), "
1692 << session
->get_num_completed_requests()
1693 << " completed requests recorded in session\n";
1694 mds
->clog
->warn() << ss
.str();
1695 dout(20) << __func__
<< " " << ss
.str() << dendl
;
1700 // register + dispatch
1701 MDRequestRef mdr
= mdcache
->request_start(req
);
1706 mdr
->session
= session
;
1707 session
->requests
.push_back(&mdr
->item_session_request
);
1711 mdr
->has_completed
= true;
1713 // process embedded cap releases?
1714 // (only if NOT replay!)
1715 if (!req
->releases
.empty() && req
->get_source().is_client() && !req
->is_replay()) {
1716 client_t client
= req
->get_source().num();
1717 for (vector
<MClientRequest::Release
>::iterator p
= req
->releases
.begin();
1718 p
!= req
->releases
.end();
1720 mds
->locker
->process_request_cap_release(mdr
, client
, p
->item
, p
->dname
);
1721 req
->releases
.clear();
1724 dispatch_client_request(mdr
);
1728 void Server::handle_osd_map()
1730 /* Note that we check the OSDMAP_FULL flag directly rather than
1731 * using osdmap_full_flag(), because we want to know "is the flag set"
1732 * rather than "does the flag apply to us?" */
1733 mds
->objecter
->with_osdmap([this](const OSDMap
& o
) {
1734 auto pi
= o
.get_pg_pool(mds
->mdsmap
->get_metadata_pool());
1735 is_full
= pi
&& pi
->has_flag(pg_pool_t::FLAG_FULL
);
1736 dout(7) << __func__
<< ": full = " << is_full
<< " epoch = "
1737 << o
.get_epoch() << dendl
;
1741 void Server::dispatch_client_request(MDRequestRef
& mdr
)
1743 // we shouldn't be waiting on anyone.
1744 assert(!mdr
->has_more() || mdr
->more()->waiting_on_slave
.empty());
1747 dout(10) << "request " << *mdr
<< " was killed" << dendl
;
1751 MClientRequest
*req
= mdr
->client_request
;
1753 if (logger
) logger
->inc(l_mdss_dispatch_client_request
);
1755 dout(7) << "dispatch_client_request " << *req
<< dendl
;
1757 if (req
->may_write()) {
1758 if (mdcache
->is_readonly()) {
1759 dout(10) << " read-only FS" << dendl
;
1760 respond_to_request(mdr
, -EROFS
);
1763 if (mdr
->has_more() && mdr
->more()->slave_error
) {
1764 dout(10) << " got error from slaves" << dendl
;
1765 respond_to_request(mdr
, mdr
->more()->slave_error
);
1771 if (req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
1772 req
->get_op() == CEPH_MDS_OP_SETDIRLAYOUT
||
1773 req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
1774 req
->get_op() == CEPH_MDS_OP_RMXATTR
||
1775 req
->get_op() == CEPH_MDS_OP_SETXATTR
||
1776 req
->get_op() == CEPH_MDS_OP_CREATE
||
1777 req
->get_op() == CEPH_MDS_OP_SYMLINK
||
1778 req
->get_op() == CEPH_MDS_OP_MKSNAP
||
1779 ((req
->get_op() == CEPH_MDS_OP_LINK
||
1780 req
->get_op() == CEPH_MDS_OP_RENAME
) &&
1781 (!mdr
->has_more() || mdr
->more()->witnessed
.empty())) // haven't started slave request
1784 dout(20) << __func__
<< ": full, responding ENOSPC to op " << ceph_mds_op_name(req
->get_op()) << dendl
;
1785 respond_to_request(mdr
, -ENOSPC
);
1788 dout(20) << __func__
<< ": full, permitting op " << ceph_mds_op_name(req
->get_op()) << dendl
;
1792 switch (req
->get_op()) {
1793 case CEPH_MDS_OP_LOOKUPHASH
:
1794 case CEPH_MDS_OP_LOOKUPINO
:
1795 handle_client_lookup_ino(mdr
, false, false);
1797 case CEPH_MDS_OP_LOOKUPPARENT
:
1798 handle_client_lookup_ino(mdr
, true, false);
1800 case CEPH_MDS_OP_LOOKUPNAME
:
1801 handle_client_lookup_ino(mdr
, false, true);
1805 case CEPH_MDS_OP_LOOKUP
:
1806 handle_client_getattr(mdr
, true);
1809 case CEPH_MDS_OP_LOOKUPSNAP
:
1810 // lookupsnap does not reference a CDentry; treat it as a getattr
1811 case CEPH_MDS_OP_GETATTR
:
1812 handle_client_getattr(mdr
, false);
1815 case CEPH_MDS_OP_SETATTR
:
1816 handle_client_setattr(mdr
);
1818 case CEPH_MDS_OP_SETLAYOUT
:
1819 handle_client_setlayout(mdr
);
1821 case CEPH_MDS_OP_SETDIRLAYOUT
:
1822 handle_client_setdirlayout(mdr
);
1824 case CEPH_MDS_OP_SETXATTR
:
1825 handle_client_setxattr(mdr
);
1827 case CEPH_MDS_OP_RMXATTR
:
1828 handle_client_removexattr(mdr
);
1831 case CEPH_MDS_OP_READDIR
:
1832 handle_client_readdir(mdr
);
1835 case CEPH_MDS_OP_SETFILELOCK
:
1836 handle_client_file_setlock(mdr
);
1839 case CEPH_MDS_OP_GETFILELOCK
:
1840 handle_client_file_readlock(mdr
);
1844 case CEPH_MDS_OP_CREATE
:
1845 if (mdr
->has_completed
)
1846 handle_client_open(mdr
); // already created.. just open
1848 handle_client_openc(mdr
);
1851 case CEPH_MDS_OP_OPEN
:
1852 handle_client_open(mdr
);
1857 case CEPH_MDS_OP_MKNOD
:
1858 handle_client_mknod(mdr
);
1860 case CEPH_MDS_OP_LINK
:
1861 handle_client_link(mdr
);
1863 case CEPH_MDS_OP_UNLINK
:
1864 case CEPH_MDS_OP_RMDIR
:
1865 handle_client_unlink(mdr
);
1867 case CEPH_MDS_OP_RENAME
:
1868 handle_client_rename(mdr
);
1870 case CEPH_MDS_OP_MKDIR
:
1871 handle_client_mkdir(mdr
);
1873 case CEPH_MDS_OP_SYMLINK
:
1874 handle_client_symlink(mdr
);
1879 case CEPH_MDS_OP_LSSNAP
:
1880 handle_client_lssnap(mdr
);
1882 case CEPH_MDS_OP_MKSNAP
:
1883 handle_client_mksnap(mdr
);
1885 case CEPH_MDS_OP_RMSNAP
:
1886 handle_client_rmsnap(mdr
);
1888 case CEPH_MDS_OP_RENAMESNAP
:
1889 handle_client_renamesnap(mdr
);
1893 dout(1) << " unknown client op " << req
->get_op() << dendl
;
1894 respond_to_request(mdr
, -EOPNOTSUPP
);
1899 // ---------------------------------------
1902 /* This function DOES put the passed message before returning*/
1903 void Server::handle_slave_request(MMDSSlaveRequest
*m
)
1905 dout(4) << "handle_slave_request " << m
->get_reqid() << " from " << m
->get_source() << dendl
;
1906 mds_rank_t from
= mds_rank_t(m
->get_source().num());
1908 if (logger
) logger
->inc(l_mdss_handle_slave_request
);
1912 return handle_slave_request_reply(m
);
1914 // the purpose of rename notify is enforcing causal message ordering. making sure
1915 // bystanders have received all messages from rename srcdn's auth MDS.
1916 if (m
->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY
) {
1917 MMDSSlaveRequest
*reply
= new MMDSSlaveRequest(m
->get_reqid(), m
->get_attempt(),
1918 MMDSSlaveRequest::OP_RENAMENOTIFYACK
);
1919 mds
->send_message(reply
, m
->get_connection());
1924 CDentry
*straydn
= NULL
;
1925 if (m
->stray
.length() > 0) {
1926 straydn
= mdcache
->add_replica_stray(m
->stray
, from
);
1931 // am i a new slave?
1933 if (mdcache
->have_request(m
->get_reqid())) {
1935 mdr
= mdcache
->request_get(m
->get_reqid());
1937 // is my request newer?
1938 if (mdr
->attempt
> m
->get_attempt()) {
1939 dout(10) << "local request " << *mdr
<< " attempt " << mdr
->attempt
<< " > " << m
->get_attempt()
1940 << ", dropping " << *m
<< dendl
;
1946 if (mdr
->attempt
< m
->get_attempt()) {
1947 // mine is old, close it out
1948 dout(10) << "local request " << *mdr
<< " attempt " << mdr
->attempt
<< " < " << m
->get_attempt()
1949 << ", closing out" << dendl
;
1950 mdcache
->request_finish(mdr
);
1952 } else if (mdr
->slave_to_mds
!= from
) {
1953 dout(10) << "local request " << *mdr
<< " not slave to mds." << from
<< dendl
;
1958 if (m
->get_op() == MMDSSlaveRequest::OP_FINISH
&& m
->is_abort()) {
1959 mdr
->aborted
= true;
1960 if (mdr
->slave_request
) {
1961 // only abort on-going xlock, wrlock and auth pin
1962 assert(!mdr
->slave_did_prepare());
1964 mdcache
->request_finish(mdr
);
1971 if (m
->get_op() == MMDSSlaveRequest::OP_FINISH
) {
1972 dout(10) << "missing slave request for " << m
->get_reqid()
1973 << " OP_FINISH, must have lost race with a forward" << dendl
;
1977 mdr
= mdcache
->request_start_slave(m
->get_reqid(), m
->get_attempt(), m
);
1978 mdr
->set_op_stamp(m
->op_stamp
);
1980 assert(mdr
->slave_request
== 0); // only one at a time, please!
1984 mdr
->straydn
= straydn
;
1987 if (!mds
->is_clientreplay() && !mds
->is_active() && !mds
->is_stopping()) {
1988 dout(3) << "not clientreplay|active yet, waiting" << dendl
;
1989 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
1991 } else if (mds
->is_clientreplay() && !mds
->mdsmap
->is_clientreplay(from
) &&
1992 mdr
->locks
.empty()) {
1993 dout(3) << "not active yet, waiting" << dendl
;
1994 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
1998 mdr
->slave_request
= m
;
2000 dispatch_slave_request(mdr
);
2003 /* This function DOES put the passed message before returning*/
2004 void Server::handle_slave_request_reply(MMDSSlaveRequest
*m
)
2006 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2008 if (!mds
->is_clientreplay() && !mds
->is_active() && !mds
->is_stopping()) {
2009 metareqid_t r
= m
->get_reqid();
2010 if (!mdcache
->have_uncommitted_master(r
, from
)) {
2011 dout(10) << "handle_slave_request_reply ignoring slave reply from mds."
2012 << from
<< " reqid " << r
<< dendl
;
2016 dout(3) << "not clientreplay|active yet, waiting" << dendl
;
2017 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
2021 if (m
->get_op() == MMDSSlaveRequest::OP_COMMITTED
) {
2022 metareqid_t r
= m
->get_reqid();
2023 mdcache
->committed_master_slave(r
, from
);
2028 MDRequestRef mdr
= mdcache
->request_get(m
->get_reqid());
2029 if (m
->get_attempt() != mdr
->attempt
) {
2030 dout(10) << "handle_slave_request_reply " << *mdr
<< " ignoring reply from other attempt "
2031 << m
->get_attempt() << dendl
;
2036 switch (m
->get_op()) {
2037 case MMDSSlaveRequest::OP_XLOCKACK
:
2039 // identify lock, master request
2040 SimpleLock
*lock
= mds
->locker
->get_lock(m
->get_lock_type(),
2041 m
->get_object_info());
2042 mdr
->more()->slaves
.insert(from
);
2043 dout(10) << "got remote xlock on " << *lock
<< " on " << *lock
->get_parent() << dendl
;
2044 mdr
->xlocks
.insert(lock
);
2045 mdr
->locks
.insert(lock
);
2046 mdr
->finish_locking(lock
);
2047 lock
->get_xlock(mdr
, mdr
->get_client());
2049 assert(mdr
->more()->waiting_on_slave
.count(from
));
2050 mdr
->more()->waiting_on_slave
.erase(from
);
2051 assert(mdr
->more()->waiting_on_slave
.empty());
2052 mdcache
->dispatch_request(mdr
);
2056 case MMDSSlaveRequest::OP_WRLOCKACK
:
2058 // identify lock, master request
2059 SimpleLock
*lock
= mds
->locker
->get_lock(m
->get_lock_type(),
2060 m
->get_object_info());
2061 mdr
->more()->slaves
.insert(from
);
2062 dout(10) << "got remote wrlock on " << *lock
<< " on " << *lock
->get_parent() << dendl
;
2063 mdr
->remote_wrlocks
[lock
] = from
;
2064 mdr
->locks
.insert(lock
);
2065 mdr
->finish_locking(lock
);
2067 assert(mdr
->more()->waiting_on_slave
.count(from
));
2068 mdr
->more()->waiting_on_slave
.erase(from
);
2069 assert(mdr
->more()->waiting_on_slave
.empty());
2070 mdcache
->dispatch_request(mdr
);
2074 case MMDSSlaveRequest::OP_AUTHPINACK
:
2075 handle_slave_auth_pin_ack(mdr
, m
);
2078 case MMDSSlaveRequest::OP_LINKPREPACK
:
2079 handle_slave_link_prep_ack(mdr
, m
);
2082 case MMDSSlaveRequest::OP_RMDIRPREPACK
:
2083 handle_slave_rmdir_prep_ack(mdr
, m
);
2086 case MMDSSlaveRequest::OP_RENAMEPREPACK
:
2087 handle_slave_rename_prep_ack(mdr
, m
);
2090 case MMDSSlaveRequest::OP_RENAMENOTIFYACK
:
2091 handle_slave_rename_notify_ack(mdr
, m
);
2102 /* This function DOES put the mdr->slave_request before returning*/
2103 void Server::dispatch_slave_request(MDRequestRef
& mdr
)
2105 dout(7) << "dispatch_slave_request " << *mdr
<< " " << *mdr
->slave_request
<< dendl
;
2108 dout(7) << " abort flag set, finishing" << dendl
;
2109 mdcache
->request_finish(mdr
);
2113 if (logger
) logger
->inc(l_mdss_dispatch_slave_request
);
2115 int op
= mdr
->slave_request
->get_op();
2117 case MMDSSlaveRequest::OP_XLOCK
:
2118 case MMDSSlaveRequest::OP_WRLOCK
:
2121 SimpleLock
*lock
= mds
->locker
->get_lock(mdr
->slave_request
->get_lock_type(),
2122 mdr
->slave_request
->get_object_info());
2125 dout(10) << "don't have object, dropping" << dendl
;
2126 ceph_abort(); // can this happen, if we auth pinned properly.
2128 if (op
== MMDSSlaveRequest::OP_XLOCK
&& !lock
->get_parent()->is_auth()) {
2129 dout(10) << "not auth for remote xlock attempt, dropping on "
2130 << *lock
<< " on " << *lock
->get_parent() << dendl
;
2132 // use acquire_locks so that we get auth_pinning.
2133 set
<SimpleLock
*> rdlocks
;
2134 set
<SimpleLock
*> wrlocks
= mdr
->wrlocks
;
2135 set
<SimpleLock
*> xlocks
= mdr
->xlocks
;
2139 case MMDSSlaveRequest::OP_XLOCK
:
2140 xlocks
.insert(lock
);
2141 replycode
= MMDSSlaveRequest::OP_XLOCKACK
;
2143 case MMDSSlaveRequest::OP_WRLOCK
:
2144 wrlocks
.insert(lock
);
2145 replycode
= MMDSSlaveRequest::OP_WRLOCKACK
;
2149 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
2153 MMDSSlaveRequest
*r
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
, replycode
);
2154 r
->set_lock_type(lock
->get_type());
2155 lock
->get_parent()->set_object_info(r
->get_object_info());
2156 mds
->send_message(r
, mdr
->slave_request
->get_connection());
2160 mdr
->slave_request
->put();
2161 mdr
->slave_request
= 0;
2165 case MMDSSlaveRequest::OP_UNXLOCK
:
2166 case MMDSSlaveRequest::OP_UNWRLOCK
:
2168 SimpleLock
*lock
= mds
->locker
->get_lock(mdr
->slave_request
->get_lock_type(),
2169 mdr
->slave_request
->get_object_info());
2171 bool need_issue
= false;
2173 case MMDSSlaveRequest::OP_UNXLOCK
:
2174 mds
->locker
->xlock_finish(lock
, mdr
.get(), &need_issue
);
2176 case MMDSSlaveRequest::OP_UNWRLOCK
:
2177 mds
->locker
->wrlock_finish(lock
, mdr
.get(), &need_issue
);
2181 mds
->locker
->issue_caps(static_cast<CInode
*>(lock
->get_parent()));
2183 // done. no ack necessary.
2184 mdr
->slave_request
->put();
2185 mdr
->slave_request
= 0;
2189 case MMDSSlaveRequest::OP_DROPLOCKS
:
2190 mds
->locker
->drop_locks(mdr
.get());
2191 mdr
->slave_request
->put();
2192 mdr
->slave_request
= 0;
2195 case MMDSSlaveRequest::OP_AUTHPIN
:
2196 handle_slave_auth_pin(mdr
);
2199 case MMDSSlaveRequest::OP_LINKPREP
:
2200 case MMDSSlaveRequest::OP_UNLINKPREP
:
2201 handle_slave_link_prep(mdr
);
2204 case MMDSSlaveRequest::OP_RMDIRPREP
:
2205 handle_slave_rmdir_prep(mdr
);
2208 case MMDSSlaveRequest::OP_RENAMEPREP
:
2209 handle_slave_rename_prep(mdr
);
2212 case MMDSSlaveRequest::OP_FINISH
:
2213 // information about rename imported caps
2214 if (mdr
->slave_request
->inode_export
.length() > 0)
2215 mdr
->more()->inode_import
.claim(mdr
->slave_request
->inode_export
);
2216 // finish off request.
2217 mdcache
->request_finish(mdr
);
2225 /* This function DOES put the mdr->slave_request before returning*/
2226 void Server::handle_slave_auth_pin(MDRequestRef
& mdr
)
2228 dout(10) << "handle_slave_auth_pin " << *mdr
<< dendl
;
2230 // build list of objects
2231 list
<MDSCacheObject
*> objects
;
2232 CInode
*auth_pin_freeze
= NULL
;
2233 bool fail
= false, wouldblock
= false, readonly
= false;
2235 if (mdcache
->is_readonly()) {
2236 dout(10) << " read-only FS" << dendl
;
2242 for (vector
<MDSCacheObjectInfo
>::iterator p
= mdr
->slave_request
->get_authpins().begin();
2243 p
!= mdr
->slave_request
->get_authpins().end();
2245 MDSCacheObject
*object
= mdcache
->get_object(*p
);
2247 dout(10) << " don't have " << *p
<< dendl
;
2252 objects
.push_back(object
);
2253 if (*p
== mdr
->slave_request
->get_authpin_freeze())
2254 auth_pin_freeze
= static_cast<CInode
*>(object
);
2258 // can we auth pin them?
2260 for (list
<MDSCacheObject
*>::iterator p
= objects
.begin();
2263 if (!(*p
)->is_auth()) {
2264 dout(10) << " not auth for " << **p
<< dendl
;
2268 if (mdr
->is_auth_pinned(*p
))
2270 if (!mdr
->can_auth_pin(*p
)) {
2271 if (mdr
->slave_request
->is_nonblock()) {
2272 dout(10) << " can't auth_pin (freezing?) " << **p
<< " nonblocking" << dendl
;
2278 dout(10) << " waiting for authpinnable on " << **p
<< dendl
;
2279 (*p
)->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
2280 mdr
->drop_local_auth_pins();
2282 mds
->locker
->notify_freeze_waiter(*p
);
2290 mdr
->drop_local_auth_pins(); // just in case
2292 /* freeze authpin wrong inode */
2293 if (mdr
->has_more() && mdr
->more()->is_freeze_authpin
&&
2294 mdr
->more()->rename_inode
!= auth_pin_freeze
)
2295 mdr
->unfreeze_auth_pin(true);
2297 /* handle_slave_rename_prep() call freeze_inode() to wait for all other operations
2298 * on the source inode to complete. This happens after all locks for the rename
2299 * operation are acquired. But to acquire locks, we need auth pin locks' parent
2300 * objects first. So there is an ABBA deadlock if someone auth pins the source inode
2301 * after locks are acquired and before Server::handle_slave_rename_prep() is called.
2302 * The solution is freeze the inode and prevent other MDRequests from getting new
2305 if (auth_pin_freeze
) {
2306 dout(10) << " freezing auth pin on " << *auth_pin_freeze
<< dendl
;
2307 if (!mdr
->freeze_auth_pin(auth_pin_freeze
)) {
2308 auth_pin_freeze
->add_waiter(CInode::WAIT_FROZEN
, new C_MDS_RetryRequest(mdcache
, mdr
));
2309 mds
->mdlog
->flush();
2313 for (list
<MDSCacheObject
*>::iterator p
= objects
.begin();
2316 dout(10) << "auth_pinning " << **p
<< dendl
;
2322 MMDSSlaveRequest
*reply
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_AUTHPINACK
);
2324 // return list of my auth_pins (if any)
2325 for (set
<MDSCacheObject
*>::iterator p
= mdr
->auth_pins
.begin();
2326 p
!= mdr
->auth_pins
.end();
2328 MDSCacheObjectInfo info
;
2329 (*p
)->set_object_info(info
);
2330 reply
->get_authpins().push_back(info
);
2331 if (*p
== (MDSCacheObject
*)auth_pin_freeze
)
2332 auth_pin_freeze
->set_object_info(reply
->get_authpin_freeze());
2336 reply
->mark_error_wouldblock();
2338 reply
->mark_error_rofs();
2340 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
2342 // clean up this request
2343 mdr
->slave_request
->put();
2344 mdr
->slave_request
= 0;
2348 /* This function DOES NOT put the passed ack before returning*/
2349 void Server::handle_slave_auth_pin_ack(MDRequestRef
& mdr
, MMDSSlaveRequest
*ack
)
2351 dout(10) << "handle_slave_auth_pin_ack on " << *mdr
<< " " << *ack
<< dendl
;
2352 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
2355 set
<MDSCacheObject
*> pinned
;
2356 for (vector
<MDSCacheObjectInfo
>::iterator p
= ack
->get_authpins().begin();
2357 p
!= ack
->get_authpins().end();
2359 MDSCacheObject
*object
= mdcache
->get_object(*p
);
2360 assert(object
); // we pinned it
2361 dout(10) << " remote has pinned " << *object
<< dendl
;
2362 if (!mdr
->is_auth_pinned(object
))
2363 mdr
->remote_auth_pins
[object
] = from
;
2364 if (*p
== ack
->get_authpin_freeze())
2365 mdr
->set_remote_frozen_auth_pin(static_cast<CInode
*>(object
));
2366 pinned
.insert(object
);
2369 // removed frozen auth pin ?
2370 if (mdr
->more()->is_remote_frozen_authpin
&&
2371 ack
->get_authpin_freeze() == MDSCacheObjectInfo()) {
2372 auto p
= mdr
->remote_auth_pins
.find(mdr
->more()->rename_inode
);
2373 assert(p
!= mdr
->remote_auth_pins
.end());
2374 if (p
->second
== from
) {
2375 mdr
->more()->is_remote_frozen_authpin
= false;
2379 // removed auth pins?
2380 map
<MDSCacheObject
*, mds_rank_t
>::iterator p
= mdr
->remote_auth_pins
.begin();
2381 while (p
!= mdr
->remote_auth_pins
.end()) {
2382 MDSCacheObject
* object
= p
->first
;
2383 if (p
->second
== from
&& pinned
.count(object
) == 0) {
2384 dout(10) << " remote has unpinned " << *object
<< dendl
;
2385 mdr
->remote_auth_pins
.erase(p
++);
2391 if (ack
->is_error_rofs()) {
2392 mdr
->more()->slave_error
= -EROFS
;
2393 mdr
->aborted
= true;
2394 } else if (ack
->is_error_wouldblock()) {
2395 mdr
->more()->slave_error
= -EWOULDBLOCK
;
2396 mdr
->aborted
= true;
2400 mdr
->more()->slaves
.insert(from
);
2402 // clear from waiting list
2403 assert(mdr
->more()->waiting_on_slave
.count(from
));
2404 mdr
->more()->waiting_on_slave
.erase(from
);
2407 if (mdr
->more()->waiting_on_slave
.empty())
2408 mdcache
->dispatch_request(mdr
);
2410 dout(10) << "still waiting on slaves " << mdr
->more()->waiting_on_slave
<< dendl
;
2414 // ---------------------------------------
2419 * check whether we are permitted to complete a request
2421 * Check whether we have permission to perform the operation specified
2422 * by mask on the given inode, based on the capability in the mdr's
2425 bool Server::check_access(MDRequestRef
& mdr
, CInode
*in
, unsigned mask
)
2428 int r
= mdr
->session
->check_access(
2430 mdr
->client_request
->get_caller_uid(),
2431 mdr
->client_request
->get_caller_gid(),
2432 &mdr
->client_request
->get_caller_gid_list(),
2433 mdr
->client_request
->head
.args
.setattr
.uid
,
2434 mdr
->client_request
->head
.args
.setattr
.gid
);
2436 respond_to_request(mdr
, r
);
2444 * check whether fragment has reached maximum size
2447 bool Server::check_fragment_space(MDRequestRef
&mdr
, CDir
*in
)
2449 const auto size
= in
->get_frag_size();
2450 if (size
>= g_conf
->mds_bal_fragment_size_max
) {
2451 dout(10) << "fragment " << *in
<< " size exceeds " << g_conf
->mds_bal_fragment_size_max
<< " (ENOSPC)" << dendl
;
2452 respond_to_request(mdr
, -ENOSPC
);
2460 /** validate_dentry_dir
2462 * verify that the dir exists and would own the dname.
2463 * do not check if the dentry exists.
2465 CDir
*Server::validate_dentry_dir(MDRequestRef
& mdr
, CInode
*diri
, const string
& dname
)
2467 // make sure parent is a dir?
2468 if (!diri
->is_dir()) {
2469 dout(7) << "validate_dentry_dir: not a dir" << dendl
;
2470 respond_to_request(mdr
, -ENOTDIR
);
2475 frag_t fg
= diri
->pick_dirfrag(dname
);
2476 CDir
*dir
= try_open_auth_dirfrag(diri
, fg
, mdr
);
2481 if (dir
->is_frozen()) {
2482 dout(7) << "dir is frozen " << *dir
<< dendl
;
2483 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
2491 /** prepare_null_dentry
2492 * prepare a null (or existing) dentry in given dir.
2493 * wait for any dn lock.
2495 CDentry
* Server::prepare_null_dentry(MDRequestRef
& mdr
, CDir
*dir
, const string
& dname
, bool okexist
)
2497 dout(10) << "prepare_null_dentry " << dname
<< " in " << *dir
<< dendl
;
2498 assert(dir
->is_auth());
2500 client_t client
= mdr
->get_client();
2502 // does it already exist?
2503 CDentry
*dn
= dir
->lookup(dname
);
2506 if (dn->lock.is_xlocked_by_other(mdr)) {
2507 dout(10) << "waiting on xlocked dentry " << *dn << dendl;
2508 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
2512 if (!dn
->get_linkage(client
, mdr
)->is_null()) {
2513 // name already exists
2514 dout(10) << "dentry " << dname
<< " exists in " << *dir
<< dendl
;
2516 respond_to_request(mdr
, -EEXIST
);
2520 dn
->first
= dir
->inode
->find_snaprealm()->get_newest_seq() + 1;
2526 // make sure dir is complete
2527 if (!dir
->is_complete() && (!dir
->has_bloom() || dir
->is_in_bloom(dname
))) {
2528 dout(7) << " incomplete dir contents for " << *dir
<< ", fetching" << dendl
;
2529 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
));
2534 dn
= dir
->add_null_dentry(dname
, dir
->inode
->find_snaprealm()->get_newest_seq() + 1);
2536 dout(10) << "prepare_null_dentry added " << *dn
<< dendl
;
2540 CDentry
* Server::prepare_stray_dentry(MDRequestRef
& mdr
, CInode
*in
)
2542 CDentry
*straydn
= mdr
->straydn
;
2545 in
->name_stray_dentry(straydname
);
2546 if (straydn
->get_name() == straydname
)
2549 assert(!mdr
->done_locking
);
2550 mdr
->unpin(straydn
);
2553 CDir
*straydir
= mdcache
->get_stray_dir(in
);
2555 if (!mdr
->client_request
->is_replay() &&
2556 !check_fragment_space(mdr
, straydir
))
2559 straydn
= mdcache
->get_or_create_stray_dentry(in
);
2560 mdr
->straydn
= straydn
;
2565 /** prepare_new_inode
2567 * create a new inode. set c/m/atime. hit dir pop.
2569 CInode
* Server::prepare_new_inode(MDRequestRef
& mdr
, CDir
*dir
, inodeno_t useino
, unsigned mode
,
2570 file_layout_t
*layout
)
2572 CInode
*in
= new CInode(mdcache
);
2574 // Server::prepare_force_open_sessions() can re-open session in closing
2575 // state. In that corner case, session's prealloc_inos are being freed.
2576 // To simplify the code, we disallow using/refilling session's prealloc_ino
2577 // while session is opening.
2578 bool allow_prealloc_inos
= !mdr
->session
->is_opening();
2581 if (allow_prealloc_inos
&&
2582 mdr
->session
->info
.prealloc_inos
.size()) {
2583 mdr
->used_prealloc_ino
=
2584 in
->inode
.ino
= mdr
->session
->take_ino(useino
); // prealloc -> used
2585 mds
->sessionmap
.mark_projected(mdr
->session
);
2587 dout(10) << "prepare_new_inode used_prealloc " << mdr
->used_prealloc_ino
2588 << " (" << mdr
->session
->info
.prealloc_inos
2589 << ", " << mdr
->session
->info
.prealloc_inos
.size() << " left)"
2593 in
->inode
.ino
= mds
->inotable
->project_alloc_id();
2594 dout(10) << "prepare_new_inode alloc " << mdr
->alloc_ino
<< dendl
;
2597 if (useino
&& useino
!= in
->inode
.ino
) {
2598 dout(0) << "WARNING: client specified " << useino
<< " and i allocated " << in
->inode
.ino
<< dendl
;
2599 mds
->clog
->error() << mdr
->client_request
->get_source()
2600 << " specified ino " << useino
2601 << " but mds." << mds
->get_nodeid() << " allocated " << in
->inode
.ino
;
2602 //ceph_abort(); // just for now.
2605 if (allow_prealloc_inos
&&
2606 mdr
->session
->get_num_projected_prealloc_inos() < g_conf
->mds_client_prealloc_inos
/ 2) {
2607 int need
= g_conf
->mds_client_prealloc_inos
- mdr
->session
->get_num_projected_prealloc_inos();
2608 mds
->inotable
->project_alloc_ids(mdr
->prealloc_inos
, need
);
2609 assert(mdr
->prealloc_inos
.size()); // or else fix projected increment semantics
2610 mdr
->session
->pending_prealloc_inos
.insert(mdr
->prealloc_inos
);
2611 mds
->sessionmap
.mark_projected(mdr
->session
);
2612 dout(10) << "prepare_new_inode prealloc " << mdr
->prealloc_inos
<< dendl
;
2615 in
->inode
.version
= 1;
2616 in
->inode
.xattr_version
= 1;
2617 in
->inode
.nlink
= 1; // FIXME
2619 in
->inode
.mode
= mode
;
2621 memset(&in
->inode
.dir_layout
, 0, sizeof(in
->inode
.dir_layout
));
2622 if (in
->inode
.is_dir()) {
2623 in
->inode
.dir_layout
.dl_dir_hash
= g_conf
->mds_default_dir_hash
;
2624 } else if (layout
) {
2625 in
->inode
.layout
= *layout
;
2627 in
->inode
.layout
= mdcache
->default_file_layout
;
2630 in
->inode
.truncate_size
= -1ull; // not truncated, yet!
2631 in
->inode
.truncate_seq
= 1; /* starting with 1, 0 is kept for no-truncation logic */
2633 CInode
*diri
= dir
->get_inode();
2635 dout(10) << oct
<< " dir mode 0" << diri
->inode
.mode
<< " new mode 0" << mode
<< dec
<< dendl
;
2637 if (diri
->inode
.mode
& S_ISGID
) {
2638 dout(10) << " dir is sticky" << dendl
;
2639 in
->inode
.gid
= diri
->inode
.gid
;
2640 if (S_ISDIR(mode
)) {
2641 dout(10) << " new dir also sticky" << dendl
;
2642 in
->inode
.mode
|= S_ISGID
;
2645 in
->inode
.gid
= mdr
->client_request
->get_caller_gid();
2647 in
->inode
.uid
= mdr
->client_request
->get_caller_uid();
2649 in
->inode
.btime
= in
->inode
.ctime
= in
->inode
.mtime
= in
->inode
.atime
=
2650 mdr
->get_op_stamp();
2652 in
->inode
.change_attr
= 0;
2654 MClientRequest
*req
= mdr
->client_request
;
2655 if (req
->get_data().length()) {
2656 bufferlist::iterator p
= req
->get_data().begin();
2658 // xattrs on new inode?
2659 map
<string
,bufferptr
> xattrs
;
2660 ::decode(xattrs
, p
);
2661 for (map
<string
,bufferptr
>::iterator p
= xattrs
.begin(); p
!= xattrs
.end(); ++p
) {
2662 dout(10) << "prepare_new_inode setting xattr " << p
->first
<< dendl
;
2663 in
->xattrs
[p
->first
] = p
->second
;
2667 if (!mds
->mdsmap
->get_inline_data_enabled() ||
2668 !mdr
->session
->connection
->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
))
2669 in
->inode
.inline_data
.version
= CEPH_INLINE_NONE
;
2671 mdcache
->add_inode(in
); // add
2672 dout(10) << "prepare_new_inode " << *in
<< dendl
;
2676 void Server::journal_allocated_inos(MDRequestRef
& mdr
, EMetaBlob
*blob
)
2678 dout(20) << "journal_allocated_inos sessionmapv " << mds
->sessionmap
.get_projected()
2679 << " inotablev " << mds
->inotable
->get_projected_version()
2681 blob
->set_ino_alloc(mdr
->alloc_ino
,
2682 mdr
->used_prealloc_ino
,
2684 mdr
->client_request
->get_source(),
2685 mds
->sessionmap
.get_projected(),
2686 mds
->inotable
->get_projected_version());
2689 void Server::apply_allocated_inos(MDRequestRef
& mdr
, Session
*session
)
2691 dout(10) << "apply_allocated_inos " << mdr
->alloc_ino
2692 << " / " << mdr
->prealloc_inos
2693 << " / " << mdr
->used_prealloc_ino
<< dendl
;
2695 if (mdr
->alloc_ino
) {
2696 mds
->inotable
->apply_alloc_id(mdr
->alloc_ino
);
2698 if (mdr
->prealloc_inos
.size()) {
2700 session
->pending_prealloc_inos
.subtract(mdr
->prealloc_inos
);
2701 session
->info
.prealloc_inos
.insert(mdr
->prealloc_inos
);
2702 mds
->sessionmap
.mark_dirty(session
);
2703 mds
->inotable
->apply_alloc_ids(mdr
->prealloc_inos
);
2705 if (mdr
->used_prealloc_ino
) {
2707 session
->info
.used_inos
.erase(mdr
->used_prealloc_ino
);
2708 mds
->sessionmap
.mark_dirty(session
);
2712 class C_MDS_TryFindInode
: public ServerContext
{
2715 C_MDS_TryFindInode(Server
*s
, MDRequestRef
& r
) : ServerContext(s
), mdr(r
) {}
2716 void finish(int r
) override
{
2717 if (r
== -ESTALE
) // :( find_ino_peers failed
2718 server
->respond_to_request(mdr
, r
);
2720 server
->dispatch_client_request(mdr
);
2724 CDir
*Server::traverse_to_auth_dir(MDRequestRef
& mdr
, vector
<CDentry
*> &trace
, filepath refpath
)
2726 // figure parent dir vs dname
2727 if (refpath
.depth() == 0) {
2728 dout(7) << "can't do that to root" << dendl
;
2729 respond_to_request(mdr
, -EINVAL
);
2732 string dname
= refpath
.last_dentry();
2733 refpath
.pop_dentry();
2735 dout(10) << "traverse_to_auth_dir dirpath " << refpath
<< " dname " << dname
<< dendl
;
2737 // traverse to parent dir
2739 int r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, refpath
, &trace
, &diri
, MDS_TRAVERSE_FORWARD
);
2740 if (r
> 0) return 0; // delayed
2743 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
2744 mdcache
->find_ino_peers(refpath
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
2747 respond_to_request(mdr
, r
);
2751 // is it an auth dir?
2752 CDir
*dir
= validate_dentry_dir(mdr
, diri
, dname
);
2754 return 0; // forwarded or waiting for freeze
2756 dout(10) << "traverse_to_auth_dir " << *dir
<< dendl
;
2760 /* If this returns null, the request has been handled
2761 * as appropriate: forwarded on, or the client's been replied to */
2762 CInode
* Server::rdlock_path_pin_ref(MDRequestRef
& mdr
, int n
,
2763 set
<SimpleLock
*> &rdlocks
,
2765 bool no_want_auth
, /* for readdir, who doesn't want auth _even_if_ it's
2767 file_layout_t
**layout
,
2768 bool no_lookup
) // true if we cannot return a null dentry lease
2770 const filepath
& refpath
= n
? mdr
->get_filepath2() : mdr
->get_filepath();
2771 dout(10) << "rdlock_path_pin_ref " << *mdr
<< " " << refpath
<< dendl
;
2773 if (mdr
->done_locking
)
2777 int r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, refpath
, &mdr
->dn
[n
], &mdr
->in
[n
], MDS_TRAVERSE_FORWARD
);
2779 return NULL
; // delayed
2780 if (r
< 0) { // error
2781 if (r
== -ENOENT
&& n
== 0 && mdr
->dn
[n
].size()) {
2783 mdr
->tracedn
= mdr
->dn
[n
][mdr
->dn
[n
].size()-1];
2784 respond_to_request(mdr
, r
);
2785 } else if (r
== -ESTALE
) {
2786 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
2787 MDSInternalContextBase
*c
= new C_MDS_TryFindInode(this, mdr
);
2788 mdcache
->find_ino_peers(refpath
.get_ino(), c
);
2790 dout(10) << "FAIL on error " << r
<< dendl
;
2791 respond_to_request(mdr
, r
);
2795 CInode
*ref
= mdr
->in
[n
];
2796 dout(10) << "ref is " << *ref
<< dendl
;
2798 // fw to inode auth?
2799 if (mdr
->snapid
!= CEPH_NOSNAP
&& !no_want_auth
)
2803 if (ref
->is_ambiguous_auth()) {
2804 dout(10) << "waiting for single auth on " << *ref
<< dendl
;
2805 ref
->add_waiter(CInode::WAIT_SINGLEAUTH
, new C_MDS_RetryRequest(mdcache
, mdr
));
2808 if (!ref
->is_auth()) {
2809 dout(10) << "fw to auth for " << *ref
<< dendl
;
2810 mdcache
->request_forward(mdr
, ref
->authority().first
);
2815 // do NOT proceed if freezing, as cap release may defer in that case, and
2816 // we could deadlock when we try to lock @ref.
2817 // if we're already auth_pinned, continue; the release has already been processed.
2818 if (ref
->is_frozen() || ref
->is_frozen_auth_pin() ||
2819 (ref
->is_freezing() && !mdr
->is_auth_pinned(ref
))) {
2820 dout(7) << "waiting for !frozen/authpinnable on " << *ref
<< dendl
;
2821 ref
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
2822 /* If we have any auth pins, this will deadlock.
2823 * But the only way to get here if we've already got auth pins
2824 * is because we're on an inode with snapshots that got updated
2825 * between dispatches of this request. So we're going to drop
2826 * our locks and our auth pins and reacquire them later.
2828 * This is safe since we're only in this function when working on
2829 * a single MDS request; otherwise we'd be in
2830 * rdlock_path_xlock_dentry.
2832 mds
->locker
->drop_locks(mdr
.get(), NULL
);
2833 mdr
->drop_local_auth_pins();
2834 if (!mdr
->remote_auth_pins
.empty())
2835 mds
->locker
->notify_freeze_waiter(ref
);
2842 for (int i
=0; i
<(int)mdr
->dn
[n
].size(); i
++)
2843 rdlocks
.insert(&mdr
->dn
[n
][i
]->lock
);
2845 mds
->locker
->include_snap_rdlocks_wlayout(rdlocks
, ref
, layout
);
2847 mds
->locker
->include_snap_rdlocks(rdlocks
, ref
);
2855 /** rdlock_path_xlock_dentry
2856 * traverse path to the directory that could/would contain dentry.
2857 * make sure i am auth for that dentry, forward as necessary.
2858 * create null dentry in place (or use existing if okexist).
2859 * get rdlocks on traversed dentries, xlock on new dentry.
2861 CDentry
* Server::rdlock_path_xlock_dentry(MDRequestRef
& mdr
, int n
,
2862 set
<SimpleLock
*>& rdlocks
, set
<SimpleLock
*>& wrlocks
, set
<SimpleLock
*>& xlocks
,
2863 bool okexist
, bool mustexist
, bool alwaysxlock
,
2864 file_layout_t
**layout
)
2866 const filepath
& refpath
= n
? mdr
->get_filepath2() : mdr
->get_filepath();
2868 dout(10) << "rdlock_path_xlock_dentry " << *mdr
<< " " << refpath
<< dendl
;
2870 client_t client
= mdr
->get_client();
2872 if (mdr
->done_locking
)
2873 return mdr
->dn
[n
].back();
2875 CDir
*dir
= traverse_to_auth_dir(mdr
, mdr
->dn
[n
], refpath
);
2878 CInode
*diri
= dir
->get_inode();
2879 if (!mdr
->reqid
.name
.is_mds()) {
2880 if (diri
->is_system() && !diri
->is_root()) {
2881 respond_to_request(mdr
, -EROFS
);
2885 if (!diri
->is_base() && diri
->get_projected_parent_dir()->inode
->is_stray()) {
2886 respond_to_request(mdr
, -ENOENT
);
2890 // make a null dentry?
2891 const string
&dname
= refpath
.last_dentry();
2894 dn
= dir
->lookup(dname
);
2896 // make sure dir is complete
2897 if (!dn
&& !dir
->is_complete() &&
2898 (!dir
->has_bloom() || dir
->is_in_bloom(dname
))) {
2899 dout(7) << " incomplete dir contents for " << *dir
<< ", fetching" << dendl
;
2900 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
));
2905 if (dn
&& !dn
->lock
.can_read(client
) && dn
->lock
.get_xlock_by() != mdr
) {
2906 dout(10) << "waiting on xlocked dentry " << *dn
<< dendl
;
2907 dn
->lock
.add_waiter(SimpleLock::WAIT_RD
, new C_MDS_RetryRequest(mdcache
, mdr
));
2912 if (!dn
|| dn
->get_linkage(client
, mdr
)->is_null()) {
2913 dout(7) << "dentry " << dname
<< " dne in " << *dir
<< dendl
;
2914 respond_to_request(mdr
, -ENOENT
);
2918 dn
= prepare_null_dentry(mdr
, dir
, dname
, okexist
);
2923 mdr
->dn
[n
].push_back(dn
);
2924 CDentry::linkage_t
*dnl
= dn
->get_linkage(client
, mdr
);
2925 mdr
->in
[n
] = dnl
->get_inode();
2928 // NOTE: rename takes the same set of locks for srcdn
2929 for (int i
=0; i
<(int)mdr
->dn
[n
].size(); i
++)
2930 rdlocks
.insert(&mdr
->dn
[n
][i
]->lock
);
2931 if (alwaysxlock
|| dnl
->is_null())
2932 xlocks
.insert(&dn
->lock
); // new dn, xlock
2934 rdlocks
.insert(&dn
->lock
); // existing dn, rdlock
2935 wrlocks
.insert(&dn
->get_dir()->inode
->filelock
); // also, wrlock on dir mtime
2936 wrlocks
.insert(&dn
->get_dir()->inode
->nestlock
); // also, wrlock on dir mtime
2938 mds
->locker
->include_snap_rdlocks_wlayout(rdlocks
, dn
->get_dir()->inode
, layout
);
2940 mds
->locker
->include_snap_rdlocks(rdlocks
, dn
->get_dir()->inode
);
2950 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
2952 * @param diri base inode
2953 * @param fg the exact frag we want
2954 * @param mdr request
2955 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
2957 CDir
* Server::try_open_auth_dirfrag(CInode
*diri
, frag_t fg
, MDRequestRef
& mdr
)
2959 CDir
*dir
= diri
->get_dirfrag(fg
);
2961 // not open and inode not mine?
2962 if (!dir
&& !diri
->is_auth()) {
2963 mds_rank_t inauth
= diri
->authority().first
;
2964 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth
<< dendl
;
2965 mdcache
->request_forward(mdr
, inauth
);
2969 // not open and inode frozen?
2970 if (!dir
&& diri
->is_frozen()) {
2971 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri
<< dendl
;
2972 assert(diri
->get_parent_dir());
2973 diri
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
2979 dir
= diri
->get_or_open_dirfrag(mdcache
, fg
);
2981 // am i auth for the dirfrag?
2982 if (!dir
->is_auth()) {
2983 mds_rank_t auth
= dir
->authority().first
;
2984 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
2985 << ", fw to mds." << auth
<< dendl
;
2986 mdcache
->request_forward(mdr
, auth
);
2994 // ===============================================================================
2997 void Server::handle_client_getattr(MDRequestRef
& mdr
, bool is_lookup
)
2999 MClientRequest
*req
= mdr
->client_request
;
3000 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
3002 if (req
->get_filepath().depth() == 0 && is_lookup
) {
3003 // refpath can't be empty for lookup but it can for
3004 // getattr (we do getattr with empty refpath for mount of '/')
3005 respond_to_request(mdr
, -EINVAL
);
3009 CInode
*ref
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, false, false, NULL
, !is_lookup
);
3013 * if client currently holds the EXCL cap on a field, do not rdlock
3014 * it; client's stat() will result in valid info if _either_ EXCL
3015 * cap is held or MDS rdlocks and reads the value here.
3017 * handling this case here is easier than weakening rdlock
3018 * semantics... that would cause problems elsewhere.
3020 client_t client
= mdr
->get_client();
3022 Capability
*cap
= ref
->get_client_cap(client
);
3023 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
||
3024 mdr
->snapid
<= cap
->client_follows
))
3025 issued
= cap
->issued();
3027 int mask
= req
->head
.args
.getattr
.mask
;
3028 if ((mask
& CEPH_CAP_LINK_SHARED
) && (issued
& CEPH_CAP_LINK_EXCL
) == 0) rdlocks
.insert(&ref
->linklock
);
3029 if ((mask
& CEPH_CAP_AUTH_SHARED
) && (issued
& CEPH_CAP_AUTH_EXCL
) == 0) rdlocks
.insert(&ref
->authlock
);
3030 if ((mask
& CEPH_CAP_FILE_SHARED
) && (issued
& CEPH_CAP_FILE_EXCL
) == 0) rdlocks
.insert(&ref
->filelock
);
3031 if ((mask
& CEPH_CAP_XATTR_SHARED
) && (issued
& CEPH_CAP_XATTR_EXCL
) == 0) rdlocks
.insert(&ref
->xattrlock
);
3033 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
3036 if (!check_access(mdr
, ref
, MAY_READ
))
3039 // note which caps are requested, so we return at least a snapshot
3040 // value for them. (currently this matters for xattrs and inline data)
3041 mdr
->getattr_caps
= mask
;
3043 mds
->balancer
->hit_inode(ceph_clock_now(), ref
, META_POP_IRD
,
3044 req
->get_source().num());
3047 dout(10) << "reply to stat on " << *req
<< dendl
;
3050 mdr
->tracedn
= mdr
->dn
[0].back();
3051 respond_to_request(mdr
, 0);
3054 struct C_MDS_LookupIno2
: public ServerContext
{
3056 C_MDS_LookupIno2(Server
*s
, MDRequestRef
& r
) : ServerContext(s
), mdr(r
) {}
3057 void finish(int r
) override
{
3058 server
->_lookup_ino_2(mdr
, r
);
3062 /* This function DOES clean up the mdr before returning*/
3066 void Server::handle_client_lookup_ino(MDRequestRef
& mdr
,
3067 bool want_parent
, bool want_dentry
)
3069 MClientRequest
*req
= mdr
->client_request
;
3071 inodeno_t ino
= req
->get_filepath().get_ino();
3072 CInode
*in
= mdcache
->get_inode(ino
);
3073 if (in
&& in
->state_test(CInode::STATE_PURGING
)) {
3074 respond_to_request(mdr
, -ESTALE
);
3078 mdcache
->open_ino(ino
, (int64_t)-1, new C_MDS_LookupIno2(this, mdr
), false);
3082 if (mdr
&& in
->snaprealm
&& !in
->snaprealm
->is_open() &&
3083 !in
->snaprealm
->open_parents(new C_MDS_RetryRequest(mdcache
, mdr
))) {
3087 // check for nothing (not read or write); this still applies the
3089 if (!check_access(mdr
, in
, 0))
3092 CDentry
*dn
= in
->get_projected_parent_dn();
3093 CInode
*diri
= dn
? dn
->get_dir()->inode
: NULL
;
3095 set
<SimpleLock
*> rdlocks
;
3096 if (dn
&& (want_parent
|| want_dentry
)) {
3098 rdlocks
.insert(&dn
->lock
);
3101 unsigned mask
= req
->head
.args
.getattr
.mask
;
3103 Capability
*cap
= in
->get_client_cap(mdr
->get_client());
3105 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
|| mdr
->snapid
<= cap
->client_follows
))
3106 issued
= cap
->issued();
3107 // permission bits, ACL/security xattrs
3108 if ((mask
& CEPH_CAP_AUTH_SHARED
) && (issued
& CEPH_CAP_AUTH_EXCL
) == 0)
3109 rdlocks
.insert(&in
->authlock
);
3110 if ((mask
& CEPH_CAP_XATTR_SHARED
) && (issued
& CEPH_CAP_XATTR_EXCL
) == 0)
3111 rdlocks
.insert(&in
->xattrlock
);
3113 mdr
->getattr_caps
= mask
;
3116 if (!rdlocks
.empty()) {
3117 set
<SimpleLock
*> wrlocks
, xlocks
;
3118 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
3122 // need read access to directory inode
3123 if (!check_access(mdr
, diri
, MAY_READ
))
3129 if (in
->is_base()) {
3130 respond_to_request(mdr
, -EINVAL
);
3133 if (!diri
|| diri
->is_stray()) {
3134 respond_to_request(mdr
, -ESTALE
);
3137 dout(10) << "reply to lookup_parent " << *in
<< dendl
;
3139 respond_to_request(mdr
, 0);
3142 inodeno_t dirino
= req
->get_filepath2().get_ino();
3143 if (!diri
|| (dirino
!= inodeno_t() && diri
->ino() != dirino
)) {
3144 respond_to_request(mdr
, -ENOENT
);
3147 dout(10) << "reply to lookup_name " << *in
<< dendl
;
3149 dout(10) << "reply to lookup_ino " << *in
<< dendl
;
3154 respond_to_request(mdr
, 0);
3158 void Server::_lookup_ino_2(MDRequestRef
& mdr
, int r
)
3160 inodeno_t ino
= mdr
->client_request
->get_filepath().get_ino();
3161 dout(10) << "_lookup_ino_2 " << mdr
.get() << " ino " << ino
<< " r=" << r
<< dendl
;
3163 // `r` is a rank if >=0, else an error code
3165 mds_rank_t
dest_rank(r
);
3166 if (dest_rank
== mds
->get_nodeid())
3167 dispatch_client_request(mdr
);
3169 mdcache
->request_forward(mdr
, dest_rank
);
3174 if (r
== -ENOENT
|| r
== -ENODATA
)
3176 respond_to_request(mdr
, r
);
3180 /* This function takes responsibility for the passed mdr*/
3181 void Server::handle_client_open(MDRequestRef
& mdr
)
3183 MClientRequest
*req
= mdr
->client_request
;
3184 dout(7) << "open on " << req
->get_filepath() << dendl
;
3186 int flags
= req
->head
.args
.open
.flags
;
3187 int cmode
= ceph_flags_to_mode(flags
);
3189 respond_to_request(mdr
, -EINVAL
);
3193 bool need_auth
= !file_mode_is_readonly(cmode
) ||
3194 (flags
& (CEPH_O_TRUNC
| CEPH_O_DIRECTORY
));
3196 if ((cmode
& CEPH_FILE_MODE_WR
) && mdcache
->is_readonly()) {
3197 dout(7) << "read-only FS" << dendl
;
3198 respond_to_request(mdr
, -EROFS
);
3202 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
3203 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, need_auth
);
3207 if (cur
->is_frozen() || cur
->state_test(CInode::STATE_EXPORTINGCAPS
)) {
3209 mdr
->done_locking
= false;
3210 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
3215 if (!cur
->inode
.is_file()) {
3216 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
3217 cmode
= CEPH_FILE_MODE_PIN
;
3218 // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
3219 if (cur
->inode
.is_symlink() && !(flags
& CEPH_O_NOFOLLOW
))
3220 flags
&= ~CEPH_O_TRUNC
;
3223 dout(10) << "open flags = " << flags
3224 << ", filemode = " << cmode
3225 << ", need_auth = " << need_auth
3229 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
3230 dout(7) << "not a file or dir " << *cur << dendl;
3231 respond_to_request(mdr, -ENXIO); // FIXME what error do we want?
3234 if ((flags
& CEPH_O_DIRECTORY
) && !cur
->inode
.is_dir() && !cur
->inode
.is_symlink()) {
3235 dout(7) << "specified O_DIRECTORY on non-directory " << *cur
<< dendl
;
3236 respond_to_request(mdr
, -EINVAL
);
3240 if ((flags
& CEPH_O_TRUNC
) && !cur
->inode
.is_file()) {
3241 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur
<< dendl
;
3242 // we should return -EISDIR for directory, return -EINVAL for other non-regular
3243 respond_to_request(mdr
, cur
->inode
.is_dir() ? -EISDIR
: -EINVAL
);
3247 if (cur
->inode
.inline_data
.version
!= CEPH_INLINE_NONE
&&
3248 !mdr
->session
->connection
->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
)) {
3249 dout(7) << "old client cannot open inline data file " << *cur
<< dendl
;
3250 respond_to_request(mdr
, -EPERM
);
3254 // snapped data is read only
3255 if (mdr
->snapid
!= CEPH_NOSNAP
&&
3256 ((cmode
& CEPH_FILE_MODE_WR
) || req
->may_write())) {
3257 dout(7) << "snap " << mdr
->snapid
<< " is read-only " << *cur
<< dendl
;
3258 respond_to_request(mdr
, -EROFS
);
3262 unsigned mask
= req
->head
.args
.open
.mask
;
3264 Capability
*cap
= cur
->get_client_cap(mdr
->get_client());
3266 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
|| mdr
->snapid
<= cap
->client_follows
))
3267 issued
= cap
->issued();
3268 // permission bits, ACL/security xattrs
3269 if ((mask
& CEPH_CAP_AUTH_SHARED
) && (issued
& CEPH_CAP_AUTH_EXCL
) == 0)
3270 rdlocks
.insert(&cur
->authlock
);
3271 if ((mask
& CEPH_CAP_XATTR_SHARED
) && (issued
& CEPH_CAP_XATTR_EXCL
) == 0)
3272 rdlocks
.insert(&cur
->xattrlock
);
3274 mdr
->getattr_caps
= mask
;
3278 if ((flags
& CEPH_O_TRUNC
) && !mdr
->has_completed
) {
3279 assert(cur
->is_auth());
3281 xlocks
.insert(&cur
->filelock
);
3282 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
3285 if (!check_access(mdr
, cur
, MAY_WRITE
))
3288 // wait for pending truncate?
3289 const inode_t
*pi
= cur
->get_projected_inode();
3290 if (pi
->is_truncating()) {
3291 dout(10) << " waiting for pending truncate from " << pi
->truncate_from
3292 << " to " << pi
->truncate_size
<< " to complete on " << *cur
<< dendl
;
3293 mds
->locker
->drop_locks(mdr
.get());
3294 mdr
->drop_local_auth_pins();
3295 cur
->add_waiter(CInode::WAIT_TRUNC
, new C_MDS_RetryRequest(mdcache
, mdr
));
3299 do_open_truncate(mdr
, cmode
);
3303 // sync filelock if snapped.
3304 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
3305 // and that data itself is flushed so that we can read the snapped data off disk.
3306 if (mdr
->snapid
!= CEPH_NOSNAP
&& !cur
->is_dir()) {
3307 rdlocks
.insert(&cur
->filelock
);
3310 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
3314 if (cmode
& CEPH_FILE_MODE_WR
)
3316 if (!check_access(mdr
, cur
, mask
))
3319 if (cur
->is_file() || cur
->is_dir()) {
3320 if (mdr
->snapid
== CEPH_NOSNAP
) {
3322 Capability
*cap
= mds
->locker
->issue_new_caps(cur
, cmode
, mdr
->session
, 0, req
->is_replay());
3324 dout(12) << "open issued caps " << ccap_string(cap
->pending())
3325 << " for " << req
->get_source()
3326 << " on " << *cur
<< dendl
;
3328 int caps
= ceph_caps_for_mode(cmode
);
3329 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps
)
3330 << " for " << req
->get_source()
3331 << " snapid " << mdr
->snapid
3332 << " on " << *cur
<< dendl
;
3333 mdr
->snap_caps
= caps
;
3337 // increase max_size?
3338 if (cmode
& CEPH_FILE_MODE_WR
)
3339 mds
->locker
->check_inode_max_size(cur
);
3341 // make sure this inode gets into the journal
3342 if (cur
->is_auth() && cur
->last
== CEPH_NOSNAP
&&
3343 !cur
->item_open_file
.is_on_list()) {
3344 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
3345 EOpen
*le
= new EOpen(mds
->mdlog
);
3346 mdlog
->start_entry(le
);
3347 le
->add_clean_inode(cur
);
3348 ls
->open_files
.push_back(&cur
->item_open_file
);
3349 mdlog
->submit_entry(le
);
3353 if (cmode
& CEPH_FILE_MODE_WR
)
3354 mds
->balancer
->hit_inode(mdr
->get_mds_stamp(), cur
, META_POP_IWR
);
3356 mds
->balancer
->hit_inode(mdr
->get_mds_stamp(), cur
, META_POP_IRD
,
3357 mdr
->client_request
->get_source().num());
3360 if (req
->get_dentry_wanted()) {
3361 assert(mdr
->dn
[0].size());
3362 dn
= mdr
->dn
[0].back();
3367 respond_to_request(mdr
, 0);
3370 class C_MDS_openc_finish
: public ServerLogContext
{
3375 C_MDS_openc_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ni
, snapid_t f
) :
3376 ServerLogContext(s
, r
), dn(d
), newi(ni
), follows(f
) {}
3377 void finish(int r
) override
{
3380 dn
->pop_projected_linkage();
3382 // dirty inode, dn, dir
3383 newi
->inode
.version
--; // a bit hacky, see C_MDS_mknod_finish
3384 newi
->mark_dirty(newi
->inode
.version
+1, mdr
->ls
);
3385 newi
->_mark_dirty_parent(mdr
->ls
, true);
3389 get_mds()->locker
->share_inode_max_size(newi
);
3391 MDRequestRef null_ref
;
3392 get_mds()->mdcache
->send_dentry_link(dn
, null_ref
);
3394 get_mds()->balancer
->hit_inode(mdr
->get_mds_stamp(), newi
, META_POP_IWR
);
3396 server
->respond_to_request(mdr
, 0);
3398 assert(g_conf
->mds_kill_openc_at
!= 1);
3402 /* This function takes responsibility for the passed mdr*/
3403 void Server::handle_client_openc(MDRequestRef
& mdr
)
3405 MClientRequest
*req
= mdr
->client_request
;
3406 client_t client
= mdr
->get_client();
3408 dout(7) << "open w/ O_CREAT on " << req
->get_filepath() << dendl
;
3410 int cmode
= ceph_flags_to_mode(req
->head
.args
.open
.flags
);
3412 respond_to_request(mdr
, -EINVAL
);
3416 bool excl
= req
->head
.args
.open
.flags
& CEPH_O_EXCL
;
3419 int r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, req
->get_filepath(),
3420 &mdr
->dn
[0], NULL
, MDS_TRAVERSE_FORWARD
);
3424 handle_client_open(mdr
);
3427 if (r
< 0 && r
!= -ENOENT
) {
3429 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
3430 MDSInternalContextBase
*c
= new C_MDS_TryFindInode(this, mdr
);
3431 mdcache
->find_ino_peers(req
->get_filepath().get_ino(), c
);
3433 dout(10) << "FAIL on error " << r
<< dendl
;
3434 respond_to_request(mdr
, r
);
3440 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
3441 file_layout_t
*dir_layout
= NULL
;
3442 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, 0, rdlocks
, wrlocks
, xlocks
,
3443 !excl
, false, false, &dir_layout
);
3445 if (mdr
->snapid
!= CEPH_NOSNAP
) {
3446 respond_to_request(mdr
, -EROFS
);
3450 file_layout_t layout
;
3452 layout
= *dir_layout
;
3454 layout
= mdcache
->default_file_layout
;
3456 // What kind of client caps are required to complete this operation
3457 uint64_t access
= MAY_WRITE
;
3459 const auto default_layout
= layout
;
3461 // fill in any special params from client
3462 if (req
->head
.args
.open
.stripe_unit
)
3463 layout
.stripe_unit
= req
->head
.args
.open
.stripe_unit
;
3464 if (req
->head
.args
.open
.stripe_count
)
3465 layout
.stripe_count
= req
->head
.args
.open
.stripe_count
;
3466 if (req
->head
.args
.open
.object_size
)
3467 layout
.object_size
= req
->head
.args
.open
.object_size
;
3468 if (req
->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID
) &&
3469 (__s32
)req
->head
.args
.open
.pool
>= 0) {
3470 layout
.pool_id
= req
->head
.args
.open
.pool
;
3472 // make sure we have as new a map as the client
3473 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
3474 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
3479 // If client doesn't have capability to modify layout pools, then
3480 // only permit this request if the requested pool matches what the
3481 // file would have inherited anyway from its parent.
3482 if (default_layout
!= layout
) {
3483 access
|= MAY_SET_VXATTR
;
3486 if (!layout
.is_valid()) {
3487 dout(10) << " invalid initial file layout" << dendl
;
3488 respond_to_request(mdr
, -EINVAL
);
3491 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
3492 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
3493 respond_to_request(mdr
, -EINVAL
);
3498 CDir
*dir
= dn
->get_dir();
3499 CInode
*diri
= dir
->get_inode();
3500 rdlocks
.insert(&diri
->authlock
);
3501 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
3504 if (!check_access(mdr
, diri
, access
))
3507 if (!check_fragment_space(mdr
, dir
))
3510 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
3512 if (!dnl
->is_null()) {
3514 assert(req
->head
.args
.open
.flags
& CEPH_O_EXCL
);
3515 dout(10) << "O_EXCL, target exists, failing with -EEXIST" << dendl
;
3516 mdr
->tracei
= dnl
->get_inode();
3518 respond_to_request(mdr
, -EEXIST
);
3523 SnapRealm
*realm
= diri
->find_snaprealm(); // use directory's realm; inode isn't attached yet.
3524 snapid_t follows
= realm
->get_newest_seq();
3526 CInode
*in
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
),
3527 req
->head
.args
.open
.mode
| S_IFREG
, &layout
);
3531 dn
->push_projected_linkage(in
);
3533 in
->inode
.version
= dn
->pre_dirty();
3534 if (layout
.pool_id
!= mdcache
->default_file_layout
.pool_id
)
3535 in
->inode
.add_old_pool(mdcache
->default_file_layout
.pool_id
);
3536 in
->inode
.update_backtrace();
3537 if (cmode
& CEPH_FILE_MODE_WR
) {
3538 in
->inode
.client_ranges
[client
].range
.first
= 0;
3539 in
->inode
.client_ranges
[client
].range
.last
= in
->inode
.get_layout_size_increment();
3540 in
->inode
.client_ranges
[client
].follows
= follows
;
3542 in
->inode
.rstat
.rfiles
= 1;
3544 assert(dn
->first
== follows
+1);
3545 in
->first
= dn
->first
;
3548 mdr
->ls
= mdlog
->get_current_segment();
3549 EUpdate
*le
= new EUpdate(mdlog
, "openc");
3550 mdlog
->start_entry(le
);
3551 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
3552 journal_allocated_inos(mdr
, &le
->metablob
);
3553 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
3554 le
->metablob
.add_primary_dentry(dn
, in
, true, true, true);
3557 mds
->locker
->issue_new_caps(in
, cmode
, mdr
->session
, realm
, req
->is_replay());
3558 in
->authlock
.set_state(LOCK_EXCL
);
3559 in
->xattrlock
.set_state(LOCK_EXCL
);
3561 // make sure this inode gets into the journal
3562 le
->metablob
.add_opened_ino(in
->ino());
3563 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
3564 ls
->open_files
.push_back(&in
->item_open_file
);
3566 C_MDS_openc_finish
*fin
= new C_MDS_openc_finish(this, mdr
, dn
, in
, follows
);
3568 if (mdr
->client_request
->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE
)) {
3569 dout(10) << "adding ino to reply to indicate inode was created" << dendl
;
3570 // add the file created flag onto the reply if create_flags features is supported
3571 ::encode(in
->inode
.ino
, mdr
->reply_extra_bl
);
3574 journal_and_reply(mdr
, in
, dn
, le
, fin
);
3576 // We hit_dir (via hit_inode) in our finish callback, but by then we might
3577 // have overshot the split size (multiple opencs in flight), so here is
3578 // an early chance to split the dir if this openc makes it oversized.
3579 mds
->balancer
->maybe_fragment(dir
, false);
3584 void Server::handle_client_readdir(MDRequestRef
& mdr
)
3586 MClientRequest
*req
= mdr
->client_request
;
3587 client_t client
= req
->get_source().num();
3588 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
3589 CInode
*diri
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, false, true);
3592 // it's a directory, right?
3593 if (!diri
->is_dir()) {
3595 dout(10) << "reply to " << *req
<< " readdir -ENOTDIR" << dendl
;
3596 respond_to_request(mdr
, -ENOTDIR
);
3600 rdlocks
.insert(&diri
->filelock
);
3601 rdlocks
.insert(&diri
->dirfragtreelock
);
3603 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
3606 if (!check_access(mdr
, diri
, MAY_READ
))
3610 frag_t fg
= (__u32
)req
->head
.args
.readdir
.frag
;
3611 unsigned req_flags
= (__u32
)req
->head
.args
.readdir
.flags
;
3612 string offset_str
= req
->get_path2();
3614 __u32 offset_hash
= 0;
3615 if (!offset_str
.empty())
3616 offset_hash
= ceph_frag_value(diri
->hash_dentry_name(offset_str
));
3618 offset_hash
= (__u32
)req
->head
.args
.readdir
.offset_hash
;
3620 dout(10) << " frag " << fg
<< " offset '" << offset_str
<< "'"
3621 << " offset_hash " << offset_hash
<< " flags " << req_flags
<< dendl
;
3623 // does the frag exist?
3624 if (diri
->dirfragtree
[fg
.value()] != fg
) {
3626 if (req_flags
& CEPH_READDIR_REPLY_BITFLAGS
) {
3627 if (fg
.contains((unsigned)offset_hash
)) {
3628 newfg
= diri
->dirfragtree
[offset_hash
];
3630 // client actually wants next frag
3631 newfg
= diri
->dirfragtree
[fg
.value()];
3635 newfg
= diri
->dirfragtree
[fg
.value()];
3637 dout(10) << " adjust frag " << fg
<< " -> " << newfg
<< " " << diri
->dirfragtree
<< dendl
;
3641 CDir
*dir
= try_open_auth_dirfrag(diri
, fg
, mdr
);
3645 dout(10) << "handle_client_readdir on " << *dir
<< dendl
;
3646 assert(dir
->is_auth());
3648 if (!dir
->is_complete()) {
3649 if (dir
->is_frozen()) {
3650 dout(7) << "dir is frozen " << *dir
<< dendl
;
3651 mds
->locker
->drop_locks(mdr
.get());
3652 mdr
->drop_local_auth_pins();
3653 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
3657 dout(10) << " incomplete dir contents for readdir on " << *dir
<< ", fetching" << dendl
;
3658 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
), true);
3662 #ifdef MDS_VERIFY_FRAGSTAT
3663 dir
->verify_fragstat();
3666 utime_t now
= ceph_clock_now();
3667 mdr
->set_mds_stamp(now
);
3669 snapid_t snapid
= mdr
->snapid
;
3670 dout(10) << "snapid " << snapid
<< dendl
;
3672 SnapRealm
*realm
= diri
->find_snaprealm();
3674 unsigned max
= req
->head
.args
.readdir
.max_entries
;
3676 max
= dir
->get_num_any(); // whatever, something big.
3677 unsigned max_bytes
= req
->head
.args
.readdir
.max_bytes
;
3679 // make sure at least one item can be encoded
3680 max_bytes
= (512 << 10) + g_conf
->mds_max_xattr_pairs_size
;
3684 dir
->encode_dirstat(dirbl
, mds
->get_nodeid());
3686 // count bytes available.
3687 // this isn't perfect, but we should capture the main variable/unbounded size items!
3688 int front_bytes
= dirbl
.length() + sizeof(__u32
) + sizeof(__u8
)*2;
3689 int bytes_left
= max_bytes
- front_bytes
;
3690 bytes_left
-= realm
->get_snap_trace().length();
3692 // build dir contents
3695 bool start
= !offset_hash
&& offset_str
.empty();
3696 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
3697 dentry_key_t
skip_key(snapid
, offset_str
.c_str(), offset_hash
);
3698 auto it
= start
? dir
->begin() : dir
->lower_bound(skip_key
);
3699 bool end
= (it
== dir
->end());
3700 for (; !end
&& numfiles
< max
; end
= (it
== dir
->end())) {
3701 CDentry
*dn
= it
->second
;
3704 if (dn
->state_test(CDentry::STATE_PURGING
))
3707 bool dnp
= dn
->use_projected(client
, mdr
);
3708 CDentry::linkage_t
*dnl
= dnp
? dn
->get_projected_linkage() : dn
->get_linkage();
3713 if (dn
->last
< snapid
|| dn
->first
> snapid
) {
3714 dout(20) << "skipping non-overlapping snap " << *dn
<< dendl
;
3719 dentry_key_t
offset_key(dn
->last
, offset_str
.c_str(), offset_hash
);
3720 if (!(offset_key
< dn
->key()))
3724 CInode
*in
= dnl
->get_inode();
3726 if (in
&& in
->ino() == CEPH_INO_CEPH
)
3730 // better for the MDS to do the work, if we think the client will stat any of these files.
3731 if (dnl
->is_remote() && !in
) {
3732 in
= mdcache
->get_inode(dnl
->get_remote_ino());
3734 dn
->link_remote(dnl
, in
);
3735 } else if (dn
->state_test(CDentry::STATE_BADREMOTEINO
)) {
3736 dout(10) << "skipping bad remote ino on " << *dn
<< dendl
;
3739 // touch everything i _do_ have
3740 for (CDir::map_t::iterator p
= dir
->begin(); p
!= dir
->end(); ++p
)
3741 if (!p
->second
->get_linkage()->is_null())
3742 mdcache
->lru
.lru_touch(p
->second
);
3744 // already issued caps and leases, reply immediately.
3745 if (dnbl
.length() > 0) {
3746 mdcache
->open_remote_dentry(dn
, dnp
, new C_MDSInternalNoop
);
3747 dout(10) << " open remote dentry after caps were issued, stopping at "
3748 << dnbl
.length() << " < " << bytes_left
<< dendl
;
3752 mds
->locker
->drop_locks(mdr
.get());
3753 mdr
->drop_local_auth_pins();
3754 mdcache
->open_remote_dentry(dn
, dnp
, new C_MDS_RetryRequest(mdcache
, mdr
));
3760 if ((int)(dnbl
.length() + dn
->name
.length() + sizeof(__u32
) + sizeof(LeaseStat
)) > bytes_left
) {
3761 dout(10) << " ran out of room, stopping at " << dnbl
.length() << " < " << bytes_left
<< dendl
;
3765 unsigned start_len
= dnbl
.length();
3768 dout(12) << "including dn " << *dn
<< dendl
;
3769 ::encode(dn
->name
, dnbl
);
3770 mds
->locker
->issue_client_lease(dn
, client
, dnbl
, now
, mdr
->session
);
3773 dout(12) << "including inode " << *in
<< dendl
;
3774 int r
= in
->encode_inodestat(dnbl
, mdr
->session
, realm
, snapid
, bytes_left
- (int)dnbl
.length());
3776 // chop off dn->name, lease
3777 dout(10) << " ran out of room, stopping at " << start_len
<< " < " << bytes_left
<< dendl
;
3779 keep
.substr_of(dnbl
, 0, start_len
);
3787 mdcache
->lru
.lru_touch(dn
);
3792 flags
= CEPH_READDIR_FRAG_END
;
3794 flags
|= CEPH_READDIR_FRAG_COMPLETE
; // FIXME: what purpose does this serve
3796 // client only understand END and COMPLETE flags ?
3797 if (req_flags
& CEPH_READDIR_REPLY_BITFLAGS
) {
3798 flags
|= CEPH_READDIR_HASH_ORDER
| CEPH_READDIR_OFFSET_HASH
;
3801 // finish final blob
3802 ::encode(numfiles
, dirbl
);
3803 ::encode(flags
, dirbl
);
3804 dirbl
.claim_append(dnbl
);
3807 dout(10) << "reply to " << *req
<< " readdir num=" << numfiles
3808 << " bytes=" << dirbl
.length()
3809 << " start=" << (int)start
3810 << " end=" << (int)end
3812 mdr
->reply_extra_bl
= dirbl
;
3814 // bump popularity. NOTE: this doesn't quite capture it.
3815 mds
->balancer
->hit_dir(now
, dir
, META_POP_IRD
, -1, numfiles
);
3819 respond_to_request(mdr
, 0);
3824 // ===============================================================================
3829 * finisher for basic inode updates
3831 class C_MDS_inode_update_finish
: public ServerLogContext
{
3833 bool truncating_smaller
, changed_ranges
;
3835 C_MDS_inode_update_finish(Server
*s
, MDRequestRef
& r
, CInode
*i
,
3836 bool sm
=false, bool cr
=false) :
3837 ServerLogContext(s
, r
), in(i
), truncating_smaller(sm
), changed_ranges(cr
) { }
3838 void finish(int r
) override
{
3842 in
->pop_and_dirty_projected_inode(mdr
->ls
);
3845 // notify any clients
3846 if (truncating_smaller
&& in
->inode
.is_truncating()) {
3847 get_mds()->locker
->issue_truncate(in
);
3848 get_mds()->mdcache
->truncate_inode(in
, mdr
->ls
);
3851 get_mds()->balancer
->hit_inode(mdr
->get_mds_stamp(), in
, META_POP_IWR
);
3853 server
->respond_to_request(mdr
, 0);
3856 get_mds()->locker
->share_inode_max_size(in
);
3860 void Server::handle_client_file_setlock(MDRequestRef
& mdr
)
3862 MClientRequest
*req
= mdr
->client_request
;
3863 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
3865 // get the inode to operate on, and set up any locks needed for that
3866 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
3870 xlocks
.insert(&cur
->flocklock
);
3871 /* acquire_locks will return true if it gets the locks. If it fails,
3872 it will redeliver this request at a later date, so drop the request.
3874 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
)) {
3875 dout(10) << "handle_client_file_setlock could not get locks!" << dendl
;
3879 // copy the lock change into a ceph_filelock so we can store/apply it
3880 ceph_filelock set_lock
;
3881 set_lock
.start
= req
->head
.args
.filelock_change
.start
;
3882 set_lock
.length
= req
->head
.args
.filelock_change
.length
;
3883 set_lock
.client
= req
->get_orig_source().num();
3884 set_lock
.owner
= req
->head
.args
.filelock_change
.owner
;
3885 set_lock
.pid
= req
->head
.args
.filelock_change
.pid
;
3886 set_lock
.type
= req
->head
.args
.filelock_change
.type
;
3887 bool will_wait
= req
->head
.args
.filelock_change
.wait
;
3889 dout(10) << "handle_client_file_setlock: " << set_lock
<< dendl
;
3891 ceph_lock_state_t
*lock_state
= NULL
;
3892 bool interrupt
= false;
3894 // get the appropriate lock state
3895 switch (req
->head
.args
.filelock_change
.rule
) {
3896 case CEPH_LOCK_FLOCK_INTR
:
3899 case CEPH_LOCK_FLOCK
:
3900 lock_state
= cur
->get_flock_lock_state();
3903 case CEPH_LOCK_FCNTL_INTR
:
3906 case CEPH_LOCK_FCNTL
:
3907 lock_state
= cur
->get_fcntl_lock_state();
3911 dout(10) << "got unknown lock type " << set_lock
.type
3912 << ", dropping request!" << dendl
;
3913 respond_to_request(mdr
, -EOPNOTSUPP
);
3917 dout(10) << " state prior to lock change: " << *lock_state
<< dendl
;
3918 if (CEPH_LOCK_UNLOCK
== set_lock
.type
) {
3919 list
<ceph_filelock
> activated_locks
;
3920 list
<MDSInternalContextBase
*> waiters
;
3921 if (lock_state
->is_waiting(set_lock
)) {
3922 dout(10) << " unlock removing waiting lock " << set_lock
<< dendl
;
3923 lock_state
->remove_waiting(set_lock
);
3924 cur
->take_waiting(CInode::WAIT_FLOCK
, waiters
);
3925 } else if (!interrupt
) {
3926 dout(10) << " unlock attempt on " << set_lock
<< dendl
;
3927 lock_state
->remove_lock(set_lock
, activated_locks
);
3928 cur
->take_waiting(CInode::WAIT_FLOCK
, waiters
);
3930 mds
->queue_waiters(waiters
);
3932 respond_to_request(mdr
, 0);
3934 dout(10) << " lock attempt on " << set_lock
<< dendl
;
3935 bool deadlock
= false;
3936 if (mdr
->more()->flock_was_waiting
&&
3937 !lock_state
->is_waiting(set_lock
)) {
3938 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock
<< dendl
;
3939 respond_to_request(mdr
, -EINTR
);
3940 } else if (!lock_state
->add_lock(set_lock
, will_wait
, mdr
->more()->flock_was_waiting
, &deadlock
)) {
3941 dout(10) << " it failed on this attempt" << dendl
;
3942 // couldn't set lock right now
3944 respond_to_request(mdr
, -EDEADLK
);
3945 } else if (!will_wait
) {
3946 respond_to_request(mdr
, -EWOULDBLOCK
);
3948 dout(10) << " added to waiting list" << dendl
;
3949 assert(lock_state
->is_waiting(set_lock
));
3950 mdr
->more()->flock_was_waiting
= true;
3951 mds
->locker
->drop_locks(mdr
.get());
3952 mdr
->drop_local_auth_pins();
3953 cur
->add_waiter(CInode::WAIT_FLOCK
, new C_MDS_RetryRequest(mdcache
, mdr
));
3956 respond_to_request(mdr
, 0);
3958 dout(10) << " state after lock change: " << *lock_state
<< dendl
;
3961 void Server::handle_client_file_readlock(MDRequestRef
& mdr
)
3963 MClientRequest
*req
= mdr
->client_request
;
3964 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
3966 // get the inode to operate on, and set up any locks needed for that
3967 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
3971 /* acquire_locks will return true if it gets the locks. If it fails,
3972 it will redeliver this request at a later date, so drop the request.
3974 rdlocks
.insert(&cur
->flocklock
);
3975 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
)) {
3976 dout(10) << "handle_client_file_readlock could not get locks!" << dendl
;
3980 // copy the lock change into a ceph_filelock so we can store/apply it
3981 ceph_filelock checking_lock
;
3982 checking_lock
.start
= req
->head
.args
.filelock_change
.start
;
3983 checking_lock
.length
= req
->head
.args
.filelock_change
.length
;
3984 checking_lock
.client
= req
->get_orig_source().num();
3985 checking_lock
.owner
= req
->head
.args
.filelock_change
.owner
;
3986 checking_lock
.pid
= req
->head
.args
.filelock_change
.pid
;
3987 checking_lock
.type
= req
->head
.args
.filelock_change
.type
;
3989 // get the appropriate lock state
3990 ceph_lock_state_t
*lock_state
= NULL
;
3991 switch (req
->head
.args
.filelock_change
.rule
) {
3992 case CEPH_LOCK_FLOCK
:
3993 lock_state
= cur
->get_flock_lock_state();
3996 case CEPH_LOCK_FCNTL
:
3997 lock_state
= cur
->get_fcntl_lock_state();
4001 dout(10) << "got unknown lock type " << checking_lock
.type
<< dendl
;
4002 respond_to_request(mdr
, -EINVAL
);
4005 lock_state
->look_for_lock(checking_lock
);
4008 ::encode(checking_lock
, lock_bl
);
4010 mdr
->reply_extra_bl
= lock_bl
;
4011 respond_to_request(mdr
, 0);
4014 void Server::handle_client_setattr(MDRequestRef
& mdr
)
4016 MClientRequest
*req
= mdr
->client_request
;
4017 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
4018 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
4021 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4022 respond_to_request(mdr
, -EROFS
);
4025 if (cur
->ino() < MDS_INO_SYSTEM_BASE
&& !cur
->is_base()) {
4026 respond_to_request(mdr
, -EPERM
);
4030 __u32 mask
= req
->head
.args
.setattr
.mask
;
4031 __u32 access_mask
= MAY_WRITE
;
4034 if (mask
& (CEPH_SETATTR_MODE
|CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_BTIME
|CEPH_SETATTR_KILL_SGUID
))
4035 xlocks
.insert(&cur
->authlock
);
4036 if (mask
& (CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
|CEPH_SETATTR_SIZE
))
4037 xlocks
.insert(&cur
->filelock
);
4038 if (mask
& CEPH_SETATTR_CTIME
)
4039 wrlocks
.insert(&cur
->versionlock
);
4041 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4044 if ((mask
& CEPH_SETATTR_UID
) && (cur
->inode
.uid
!= req
->head
.args
.setattr
.uid
))
4045 access_mask
|= MAY_CHOWN
;
4047 if ((mask
& CEPH_SETATTR_GID
) && (cur
->inode
.gid
!= req
->head
.args
.setattr
.gid
))
4048 access_mask
|= MAY_CHGRP
;
4050 if (!check_access(mdr
, cur
, access_mask
))
4053 // trunc from bigger -> smaller?
4054 inode_t
*pi
= cur
->get_projected_inode();
4056 uint64_t old_size
= MAX(pi
->size
, req
->head
.args
.setattr
.old_size
);
4058 // ENOSPC on growing file while full, but allow shrinks
4059 if (is_full
&& req
->head
.args
.setattr
.size
> old_size
) {
4060 dout(20) << __func__
<< ": full, responding ENOSPC to setattr with larger size" << dendl
;
4061 respond_to_request(mdr
, -ENOSPC
);
4065 bool truncating_smaller
= false;
4066 if (mask
& CEPH_SETATTR_SIZE
) {
4067 truncating_smaller
= req
->head
.args
.setattr
.size
< old_size
;
4068 if (truncating_smaller
&& pi
->is_truncating()) {
4069 dout(10) << " waiting for pending truncate from " << pi
->truncate_from
4070 << " to " << pi
->truncate_size
<< " to complete on " << *cur
<< dendl
;
4071 mds
->locker
->drop_locks(mdr
.get());
4072 mdr
->drop_local_auth_pins();
4073 cur
->add_waiter(CInode::WAIT_TRUNC
, new C_MDS_RetryRequest(mdcache
, mdr
));
4078 bool changed_ranges
= false;
4081 mdr
->ls
= mdlog
->get_current_segment();
4082 EUpdate
*le
= new EUpdate(mdlog
, "setattr");
4083 mdlog
->start_entry(le
);
4085 pi
= cur
->project_inode();
4087 if (mask
& CEPH_SETATTR_UID
)
4088 pi
->uid
= req
->head
.args
.setattr
.uid
;
4089 if (mask
& CEPH_SETATTR_GID
)
4090 pi
->gid
= req
->head
.args
.setattr
.gid
;
4092 if (mask
& CEPH_SETATTR_MODE
)
4093 pi
->mode
= (pi
->mode
& ~07777) | (req
->head
.args
.setattr
.mode
& 07777);
4094 else if ((mask
& (CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_KILL_SGUID
)) &&
4095 S_ISREG(pi
->mode
) &&
4096 (pi
->mode
& (S_IXUSR
|S_IXGRP
|S_IXOTH
))) {
4097 pi
->mode
&= ~(S_ISUID
|S_ISGID
);
4100 if (mask
& CEPH_SETATTR_MTIME
)
4101 pi
->mtime
= req
->head
.args
.setattr
.mtime
;
4102 if (mask
& CEPH_SETATTR_ATIME
)
4103 pi
->atime
= req
->head
.args
.setattr
.atime
;
4104 if (mask
& CEPH_SETATTR_BTIME
)
4105 pi
->btime
= req
->head
.args
.setattr
.btime
;
4106 if (mask
& (CEPH_SETATTR_ATIME
| CEPH_SETATTR_MTIME
| CEPH_SETATTR_BTIME
))
4107 pi
->time_warp_seq
++; // maybe not a timewarp, but still a serialization point.
4108 if (mask
& CEPH_SETATTR_SIZE
) {
4109 if (truncating_smaller
) {
4110 pi
->truncate(old_size
, req
->head
.args
.setattr
.size
);
4111 le
->metablob
.add_truncate_start(cur
->ino());
4113 pi
->size
= req
->head
.args
.setattr
.size
;
4114 pi
->rstat
.rbytes
= pi
->size
;
4116 pi
->mtime
= mdr
->get_op_stamp();
4118 // adjust client's max_size?
4119 map
<client_t
,client_writeable_range_t
> new_ranges
;
4120 bool max_increased
= false;
4121 mds
->locker
->calc_new_client_ranges(cur
, pi
->size
, &new_ranges
, &max_increased
);
4122 if (pi
->client_ranges
!= new_ranges
) {
4123 dout(10) << " client_ranges " << pi
->client_ranges
<< " -> " << new_ranges
<< dendl
;
4124 pi
->client_ranges
= new_ranges
;
4125 changed_ranges
= true;
4129 pi
->version
= cur
->pre_dirty();
4130 pi
->ctime
= mdr
->get_op_stamp();
4134 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4135 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4136 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4138 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
,
4139 truncating_smaller
, changed_ranges
));
4141 // flush immediately if there are readers/writers waiting
4142 if (xlocks
.count(&cur
->filelock
) &&
4143 (cur
->get_caps_wanted() & (CEPH_CAP_FILE_RD
|CEPH_CAP_FILE_WR
)))
4144 mds
->mdlog
->flush();
4147 /* Takes responsibility for mdr */
4148 void Server::do_open_truncate(MDRequestRef
& mdr
, int cmode
)
4150 CInode
*in
= mdr
->in
[0];
4151 client_t client
= mdr
->get_client();
4154 dout(10) << "do_open_truncate " << *in
<< dendl
;
4156 SnapRealm
*realm
= in
->find_snaprealm();
4157 mds
->locker
->issue_new_caps(in
, cmode
, mdr
->session
, realm
, mdr
->client_request
->is_replay());
4159 mdr
->ls
= mdlog
->get_current_segment();
4160 EUpdate
*le
= new EUpdate(mdlog
, "open_truncate");
4161 mdlog
->start_entry(le
);
4164 inode_t
*pi
= in
->project_inode();
4165 pi
->version
= in
->pre_dirty();
4166 pi
->mtime
= pi
->ctime
= mdr
->get_op_stamp();
4169 uint64_t old_size
= MAX(pi
->size
, mdr
->client_request
->head
.args
.open
.old_size
);
4171 pi
->truncate(old_size
, 0);
4172 le
->metablob
.add_truncate_start(in
->ino());
4175 bool changed_ranges
= false;
4176 if (cmode
& CEPH_FILE_MODE_WR
) {
4177 pi
->client_ranges
[client
].range
.first
= 0;
4178 pi
->client_ranges
[client
].range
.last
= pi
->get_layout_size_increment();
4179 pi
->client_ranges
[client
].follows
= in
->find_snaprealm()->get_newest_seq();
4180 changed_ranges
= true;
4183 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
4185 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
4186 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, in
);
4188 // make sure ino gets into the journal
4189 le
->metablob
.add_opened_ino(in
->ino());
4190 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
4191 ls
->open_files
.push_back(&in
->item_open_file
);
4193 mdr
->o_trunc
= true;
4196 if (mdr
->client_request
->get_dentry_wanted()) {
4197 assert(mdr
->dn
[0].size());
4198 dn
= mdr
->dn
[0].back();
4201 journal_and_reply(mdr
, in
, dn
, le
, new C_MDS_inode_update_finish(this, mdr
, in
, old_size
> 0,
4203 // Although the `open` part can give an early reply, the truncation won't
4204 // happen until our EUpdate is persistent, to give the client a prompt
4205 // response we must also flush that event.
4210 /* This function cleans up the passed mdr */
4211 void Server::handle_client_setlayout(MDRequestRef
& mdr
)
4213 MClientRequest
*req
= mdr
->client_request
;
4214 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
4215 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
4218 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4219 respond_to_request(mdr
, -EROFS
);
4222 if (!cur
->is_file()) {
4223 respond_to_request(mdr
, -EINVAL
);
4226 if (cur
->get_projected_inode()->size
||
4227 cur
->get_projected_inode()->truncate_seq
> 1) {
4228 respond_to_request(mdr
, -ENOTEMPTY
);
4233 file_layout_t layout
= cur
->get_projected_inode()->layout
;
4234 // save existing layout for later
4235 const auto old_layout
= layout
;
4237 int access
= MAY_WRITE
;
4239 if (req
->head
.args
.setlayout
.layout
.fl_object_size
> 0)
4240 layout
.object_size
= req
->head
.args
.setlayout
.layout
.fl_object_size
;
4241 if (req
->head
.args
.setlayout
.layout
.fl_stripe_unit
> 0)
4242 layout
.stripe_unit
= req
->head
.args
.setlayout
.layout
.fl_stripe_unit
;
4243 if (req
->head
.args
.setlayout
.layout
.fl_stripe_count
> 0)
4244 layout
.stripe_count
=req
->head
.args
.setlayout
.layout
.fl_stripe_count
;
4245 if (req
->head
.args
.setlayout
.layout
.fl_pg_pool
> 0) {
4246 layout
.pool_id
= req
->head
.args
.setlayout
.layout
.fl_pg_pool
;
4248 // make sure we have as new a map as the client
4249 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
4250 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
4255 // Don't permit layout modifications without 'p' caps
4256 if (layout
!= old_layout
) {
4257 access
|= MAY_SET_VXATTR
;
4260 if (!layout
.is_valid()) {
4261 dout(10) << "bad layout" << dendl
;
4262 respond_to_request(mdr
, -EINVAL
);
4265 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
4266 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
4267 respond_to_request(mdr
, -EINVAL
);
4271 xlocks
.insert(&cur
->filelock
);
4272 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4275 if (!check_access(mdr
, cur
, access
))
4279 inode_t
*pi
= cur
->project_inode();
4280 pi
->layout
= layout
;
4281 // add the old pool to the inode
4282 pi
->add_old_pool(old_layout
.pool_id
);
4283 pi
->version
= cur
->pre_dirty();
4284 pi
->ctime
= mdr
->get_op_stamp();
4288 mdr
->ls
= mdlog
->get_current_segment();
4289 EUpdate
*le
= new EUpdate(mdlog
, "setlayout");
4290 mdlog
->start_entry(le
);
4291 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4292 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4293 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4295 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
4298 void Server::handle_client_setdirlayout(MDRequestRef
& mdr
)
4300 MClientRequest
*req
= mdr
->client_request
;
4301 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
4302 file_layout_t
*dir_layout
= NULL
;
4303 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true, false, &dir_layout
);
4306 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4307 respond_to_request(mdr
, -EROFS
);
4311 if (!cur
->is_dir()) {
4312 respond_to_request(mdr
, -ENOTDIR
);
4316 xlocks
.insert(&cur
->policylock
);
4317 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4321 const inode_t
*old_pi
= cur
->get_projected_inode();
4322 file_layout_t layout
;
4323 if (old_pi
->has_layout())
4324 layout
= old_pi
->layout
;
4325 else if (dir_layout
)
4326 layout
= *dir_layout
;
4328 layout
= mdcache
->default_file_layout
;
4330 // Level of access required to complete
4331 int access
= MAY_WRITE
;
4333 const auto old_layout
= layout
;
4335 if (req
->head
.args
.setlayout
.layout
.fl_object_size
> 0)
4336 layout
.object_size
= req
->head
.args
.setlayout
.layout
.fl_object_size
;
4337 if (req
->head
.args
.setlayout
.layout
.fl_stripe_unit
> 0)
4338 layout
.stripe_unit
= req
->head
.args
.setlayout
.layout
.fl_stripe_unit
;
4339 if (req
->head
.args
.setlayout
.layout
.fl_stripe_count
> 0)
4340 layout
.stripe_count
=req
->head
.args
.setlayout
.layout
.fl_stripe_count
;
4341 if (req
->head
.args
.setlayout
.layout
.fl_pg_pool
> 0) {
4342 layout
.pool_id
= req
->head
.args
.setlayout
.layout
.fl_pg_pool
;
4343 // make sure we have as new a map as the client
4344 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
4345 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
4350 if (layout
!= old_layout
) {
4351 access
|= MAY_SET_VXATTR
;
4354 if (!layout
.is_valid()) {
4355 dout(10) << "bad layout" << dendl
;
4356 respond_to_request(mdr
, -EINVAL
);
4359 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
4360 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
4361 respond_to_request(mdr
, -EINVAL
);
4365 if (!check_access(mdr
, cur
, access
))
4368 inode_t
*pi
= cur
->project_inode();
4369 pi
->layout
= layout
;
4370 pi
->version
= cur
->pre_dirty();
4373 mdr
->ls
= mdlog
->get_current_segment();
4374 EUpdate
*le
= new EUpdate(mdlog
, "setlayout");
4375 mdlog
->start_entry(le
);
4376 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4377 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4378 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4380 mdr
->no_early_reply
= true;
4381 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
4386 int Server::parse_layout_vxattr(string name
, string value
, const OSDMap
& osdmap
,
4387 file_layout_t
*layout
, bool validate
)
4389 dout(20) << "parse_layout_vxattr name " << name
<< " value '" << value
<< "'" << dendl
;
4391 if (name
== "layout") {
4392 string::iterator begin
= value
.begin();
4393 string::iterator end
= value
.end();
4394 keys_and_values
<string::iterator
> p
; // create instance of parser
4395 std::map
<string
, string
> m
; // map to receive results
4396 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
4399 string
left(begin
, end
);
4400 dout(10) << " parsed " << m
<< " left '" << left
<< "'" << dendl
;
4403 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
4404 // Skip validation on each attr, we do it once at the end (avoid
4405 // rejecting intermediate states if the overall result is ok)
4406 int r
= parse_layout_vxattr(string("layout.") + q
->first
, q
->second
,
4407 osdmap
, layout
, false);
4411 } else if (name
== "layout.object_size") {
4412 layout
->object_size
= boost::lexical_cast
<unsigned>(value
);
4413 } else if (name
== "layout.stripe_unit") {
4414 layout
->stripe_unit
= boost::lexical_cast
<unsigned>(value
);
4415 } else if (name
== "layout.stripe_count") {
4416 layout
->stripe_count
= boost::lexical_cast
<unsigned>(value
);
4417 } else if (name
== "layout.pool") {
4419 layout
->pool_id
= boost::lexical_cast
<unsigned>(value
);
4420 } catch (boost::bad_lexical_cast
const&) {
4421 int64_t pool
= osdmap
.lookup_pg_pool_name(value
);
4423 dout(10) << " unknown pool " << value
<< dendl
;
4426 layout
->pool_id
= pool
;
4428 } else if (name
== "layout.pool_namespace") {
4429 layout
->pool_ns
= value
;
4431 dout(10) << " unknown layout vxattr " << name
<< dendl
;
4434 } catch (boost::bad_lexical_cast
const&) {
4435 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
4439 if (validate
&& !layout
->is_valid()) {
4440 dout(10) << "bad layout" << dendl
;
4443 if (!mds
->mdsmap
->is_data_pool(layout
->pool_id
)) {
4444 dout(10) << " invalid data pool " << layout
->pool_id
<< dendl
;
4450 int Server::parse_quota_vxattr(string name
, string value
, quota_info_t
*quota
)
4452 dout(20) << "parse_quota_vxattr name " << name
<< " value '" << value
<< "'" << dendl
;
4454 if (name
== "quota") {
4455 string::iterator begin
= value
.begin();
4456 string::iterator end
= value
.end();
4457 keys_and_values
<string::iterator
> p
; // create instance of parser
4458 std::map
<string
, string
> m
; // map to receive results
4459 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
4462 string
left(begin
, end
);
4463 dout(10) << " parsed " << m
<< " left '" << left
<< "'" << dendl
;
4466 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
4467 int r
= parse_quota_vxattr(string("quota.") + q
->first
, q
->second
, quota
);
4471 } else if (name
== "quota.max_bytes") {
4472 int64_t q
= boost::lexical_cast
<int64_t>(value
);
4475 quota
->max_bytes
= q
;
4476 } else if (name
== "quota.max_files") {
4477 int64_t q
= boost::lexical_cast
<int64_t>(value
);
4480 quota
->max_files
= q
;
4482 dout(10) << " unknown quota vxattr " << name
<< dendl
;
4485 } catch (boost::bad_lexical_cast
const&) {
4486 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
4490 if (!quota
->is_valid()) {
4491 dout(10) << "bad quota" << dendl
;
4498 * Verify that the file layout attribute carried by client
4499 * is well-formatted.
4500 * Return 0 on success, otherwise this function takes
4501 * responsibility for the passed mdr.
4503 int Server::check_layout_vxattr(MDRequestRef
& mdr
,
4506 file_layout_t
*layout
)
4508 MClientRequest
*req
= mdr
->client_request
;
4512 mds
->objecter
->with_osdmap([&](const OSDMap
& osdmap
) {
4513 r
= parse_layout_vxattr(name
, value
, osdmap
, layout
);
4514 epoch
= osdmap
.get_epoch();
4519 // we don't have the specified pool, make sure our map
4520 // is newer than or as new as the client.
4521 epoch_t req_epoch
= req
->get_osdmap_epoch();
4523 if (req_epoch
> epoch
) {
4525 // well, our map is older. consult mds.
4526 Context
*fin
= new C_IO_Wrapper(mds
, new C_MDS_RetryRequest(mdcache
, mdr
));
4528 if (!mds
->objecter
->wait_for_map(req_epoch
, fin
))
4529 return r
; // wait, fin will retry this request later
4533 // now we have at least as new a map as the client, try again.
4534 mds
->objecter
->with_osdmap([&](const OSDMap
& osdmap
) {
4535 r
= parse_layout_vxattr(name
, value
, osdmap
, layout
);
4536 epoch
= osdmap
.get_epoch();
4539 assert(epoch
>= req_epoch
); // otherwise wait_for_map() told a lie
4541 } else if (req_epoch
== 0 && !mdr
->waited_for_osdmap
) {
4543 // For compatibility with client w/ old code, we still need get the
4544 // latest map. One day if COMPACT_VERSION of MClientRequest >=3,
4545 // we can remove those code.
4546 mdr
->waited_for_osdmap
= true;
4547 mds
->objecter
->wait_for_latest_osdmap(new C_IO_Wrapper(
4548 mds
, new C_MDS_RetryRequest(mdcache
, mdr
)));
4558 respond_to_request(mdr
, r
);
4566 void Server::handle_set_vxattr(MDRequestRef
& mdr
, CInode
*cur
,
4567 file_layout_t
*dir_layout
,
4568 set
<SimpleLock
*> rdlocks
,
4569 set
<SimpleLock
*> wrlocks
,
4570 set
<SimpleLock
*> xlocks
)
4572 MClientRequest
*req
= mdr
->client_request
;
4573 string
name(req
->get_path2());
4574 bufferlist bl
= req
->get_data();
4575 string
value (bl
.c_str(), bl
.length());
4576 dout(10) << "handle_set_vxattr " << name
4577 << " val " << value
.length()
4578 << " bytes on " << *cur
4584 if (!check_access(mdr
, cur
, MAY_SET_VXATTR
)) {
4588 if (name
.compare(0, 15, "ceph.dir.layout") == 0) {
4589 if (!cur
->is_dir()) {
4590 respond_to_request(mdr
, -EINVAL
);
4594 file_layout_t layout
;
4595 if (cur
->get_projected_inode()->has_layout())
4596 layout
= cur
->get_projected_inode()->layout
;
4597 else if (dir_layout
)
4598 layout
= *dir_layout
;
4600 layout
= mdcache
->default_file_layout
;
4602 rest
= name
.substr(name
.find("layout"));
4603 if (check_layout_vxattr(mdr
, rest
, value
, &layout
) < 0)
4606 xlocks
.insert(&cur
->policylock
);
4607 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4610 pi
= cur
->project_inode();
4611 pi
->layout
= layout
;
4612 mdr
->no_early_reply
= true;
4613 } else if (name
.compare(0, 16, "ceph.file.layout") == 0) {
4614 if (!cur
->is_file()) {
4615 respond_to_request(mdr
, -EINVAL
);
4618 if (cur
->get_projected_inode()->size
||
4619 cur
->get_projected_inode()->truncate_seq
> 1) {
4620 respond_to_request(mdr
, -ENOTEMPTY
);
4623 file_layout_t layout
= cur
->get_projected_inode()->layout
;
4624 rest
= name
.substr(name
.find("layout"));
4625 if (check_layout_vxattr(mdr
, rest
, value
, &layout
) < 0)
4628 xlocks
.insert(&cur
->filelock
);
4629 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4632 pi
= cur
->project_inode();
4633 int64_t old_pool
= pi
->layout
.pool_id
;
4634 pi
->add_old_pool(old_pool
);
4635 pi
->layout
= layout
;
4636 pi
->ctime
= mdr
->get_op_stamp();
4637 } else if (name
.compare(0, 10, "ceph.quota") == 0) {
4638 if (!cur
->is_dir() || cur
->is_root()) {
4639 respond_to_request(mdr
, -EINVAL
);
4643 quota_info_t quota
= cur
->get_projected_inode()->quota
;
4645 rest
= name
.substr(name
.find("quota"));
4646 int r
= parse_quota_vxattr(rest
, value
, "a
);
4648 respond_to_request(mdr
, r
);
4652 xlocks
.insert(&cur
->policylock
);
4653 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4656 pi
= cur
->project_inode();
4658 mdr
->no_early_reply
= true;
4659 } else if (name
.find("ceph.dir.pin") == 0) {
4660 if (!cur
->is_dir() || cur
->is_root()) {
4661 respond_to_request(mdr
, -EINVAL
);
4667 rank
= boost::lexical_cast
<mds_rank_t
>(value
);
4668 if (rank
< 0) rank
= MDS_RANK_NONE
;
4669 } catch (boost::bad_lexical_cast
const&) {
4670 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
4671 respond_to_request(mdr
, -EINVAL
);
4675 xlocks
.insert(&cur
->policylock
);
4676 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4679 pi
= cur
->project_inode();
4680 cur
->set_export_pin(rank
);
4682 dout(10) << " unknown vxattr " << name
<< dendl
;
4683 respond_to_request(mdr
, -EINVAL
);
4688 pi
->ctime
= mdr
->get_op_stamp();
4689 pi
->version
= cur
->pre_dirty();
4691 pi
->update_backtrace();
4694 mdr
->ls
= mdlog
->get_current_segment();
4695 EUpdate
*le
= new EUpdate(mdlog
, "set vxattr layout");
4696 mdlog
->start_entry(le
);
4697 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4698 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4699 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4701 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
4705 void Server::handle_remove_vxattr(MDRequestRef
& mdr
, CInode
*cur
,
4706 file_layout_t
*dir_layout
,
4707 set
<SimpleLock
*> rdlocks
,
4708 set
<SimpleLock
*> wrlocks
,
4709 set
<SimpleLock
*> xlocks
)
4711 MClientRequest
*req
= mdr
->client_request
;
4712 string
name(req
->get_path2());
4714 dout(10) << __func__
<< " " << name
<< " on " << *cur
<< dendl
;
4716 if (name
== "ceph.dir.layout") {
4717 if (!cur
->is_dir()) {
4718 respond_to_request(mdr
, -ENODATA
);
4721 if (cur
->is_root()) {
4722 dout(10) << "can't remove layout policy on the root directory" << dendl
;
4723 respond_to_request(mdr
, -EINVAL
);
4727 if (!cur
->get_projected_inode()->has_layout()) {
4728 respond_to_request(mdr
, -ENODATA
);
4732 xlocks
.insert(&cur
->policylock
);
4733 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4736 inode_t
*pi
= cur
->project_inode();
4738 pi
->version
= cur
->pre_dirty();
4741 mdr
->ls
= mdlog
->get_current_segment();
4742 EUpdate
*le
= new EUpdate(mdlog
, "remove dir layout vxattr");
4743 mdlog
->start_entry(le
);
4744 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4745 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4746 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4748 mdr
->no_early_reply
= true;
4749 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
4751 } else if (name
== "ceph.dir.layout.pool_namespace"
4752 || name
== "ceph.file.layout.pool_namespace") {
4753 // Namespace is the only layout field that has a meaningful
4754 // null/none value (empty string, means default layout). Is equivalent
4755 // to a setxattr with empty string: pass through the empty payload of
4756 // the rmxattr request to do this.
4757 handle_set_vxattr(mdr
, cur
, dir_layout
, rdlocks
, wrlocks
, xlocks
);
4761 respond_to_request(mdr
, -ENODATA
);
4764 class C_MDS_inode_xattr_update_finish
: public ServerLogContext
{
4768 C_MDS_inode_xattr_update_finish(Server
*s
, MDRequestRef
& r
, CInode
*i
) :
4769 ServerLogContext(s
, r
), in(i
) { }
4770 void finish(int r
) override
{
4774 in
->pop_and_dirty_projected_inode(mdr
->ls
);
4778 get_mds()->balancer
->hit_inode(mdr
->get_mds_stamp(), in
, META_POP_IWR
);
4780 server
->respond_to_request(mdr
, 0);
4784 void Server::handle_client_setxattr(MDRequestRef
& mdr
)
4786 MClientRequest
*req
= mdr
->client_request
;
4787 string
name(req
->get_path2());
4788 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
4791 file_layout_t
*dir_layout
= NULL
;
4792 if (name
.compare(0, 15, "ceph.dir.layout") == 0)
4793 cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true, false, &dir_layout
);
4795 cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
4799 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4800 respond_to_request(mdr
, -EROFS
);
4804 int flags
= req
->head
.args
.setxattr
.flags
;
4806 // magic ceph.* namespace?
4807 if (name
.compare(0, 5, "ceph.") == 0) {
4808 handle_set_vxattr(mdr
, cur
, dir_layout
, rdlocks
, wrlocks
, xlocks
);
4812 xlocks
.insert(&cur
->xattrlock
);
4813 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4816 if (!check_access(mdr
, cur
, MAY_WRITE
))
4819 map
<string
, bufferptr
> *pxattrs
= cur
->get_projected_xattrs();
4820 size_t len
= req
->get_data().length();
4821 size_t inc
= len
+ name
.length();
4823 // check xattrs kv pairs size
4824 size_t cur_xattrs_size
= 0;
4825 for (const auto& p
: *pxattrs
) {
4826 if ((flags
& CEPH_XATTR_REPLACE
) && (name
.compare(p
.first
) == 0)) {
4829 cur_xattrs_size
+= p
.first
.length() + p
.second
.length();
4832 if (((cur_xattrs_size
+ inc
) > g_conf
->mds_max_xattr_pairs_size
)) {
4833 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
4834 << cur_xattrs_size
<< ", inc " << inc
<< dendl
;
4835 respond_to_request(mdr
, -ENOSPC
);
4839 if ((flags
& CEPH_XATTR_CREATE
) && pxattrs
->count(name
)) {
4840 dout(10) << "setxattr '" << name
<< "' XATTR_CREATE and EEXIST on " << *cur
<< dendl
;
4841 respond_to_request(mdr
, -EEXIST
);
4844 if ((flags
& CEPH_XATTR_REPLACE
) && !pxattrs
->count(name
)) {
4845 dout(10) << "setxattr '" << name
<< "' XATTR_REPLACE and ENODATA on " << *cur
<< dendl
;
4846 respond_to_request(mdr
, -ENODATA
);
4850 dout(10) << "setxattr '" << name
<< "' len " << len
<< " on " << *cur
<< dendl
;
4853 map
<string
,bufferptr
> *px
= new map
<string
,bufferptr
>;
4854 inode_t
*pi
= cur
->project_inode(px
);
4855 pi
->version
= cur
->pre_dirty();
4856 pi
->ctime
= mdr
->get_op_stamp();
4858 pi
->xattr_version
++;
4860 if (!(flags
& CEPH_XATTR_REMOVE
)) {
4861 (*px
)[name
] = buffer::create(len
);
4863 req
->get_data().copy(0, len
, (*px
)[name
].c_str());
4867 mdr
->ls
= mdlog
->get_current_segment();
4868 EUpdate
*le
= new EUpdate(mdlog
, "setxattr");
4869 mdlog
->start_entry(le
);
4870 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4871 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4872 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4874 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
4877 void Server::handle_client_removexattr(MDRequestRef
& mdr
)
4879 MClientRequest
*req
= mdr
->client_request
;
4880 string
name(req
->get_path2());
4881 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
4882 file_layout_t
*dir_layout
= NULL
;
4884 if (name
== "ceph.dir.layout")
4885 cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true, false, &dir_layout
);
4887 cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
4891 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4892 respond_to_request(mdr
, -EROFS
);
4896 if (name
.compare(0, 5, "ceph.") == 0) {
4897 handle_remove_vxattr(mdr
, cur
, dir_layout
, rdlocks
, wrlocks
, xlocks
);
4901 xlocks
.insert(&cur
->xattrlock
);
4902 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4905 map
<string
, bufferptr
> *pxattrs
= cur
->get_projected_xattrs();
4906 if (pxattrs
->count(name
) == 0) {
4907 dout(10) << "removexattr '" << name
<< "' and ENODATA on " << *cur
<< dendl
;
4908 respond_to_request(mdr
, -ENODATA
);
4912 dout(10) << "removexattr '" << name
<< "' on " << *cur
<< dendl
;
4915 map
<string
,bufferptr
> *px
= new map
<string
,bufferptr
>;
4916 inode_t
*pi
= cur
->project_inode(px
);
4917 pi
->version
= cur
->pre_dirty();
4918 pi
->ctime
= mdr
->get_op_stamp();
4920 pi
->xattr_version
++;
4924 mdr
->ls
= mdlog
->get_current_segment();
4925 EUpdate
*le
= new EUpdate(mdlog
, "removexattr");
4926 mdlog
->start_entry(le
);
4927 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4928 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4929 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4931 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
4935 // =================================================================
4936 // DIRECTORY and NAMESPACE OPS
4939 // ------------------------------------------------
4943 class C_MDS_mknod_finish
: public ServerLogContext
{
4947 C_MDS_mknod_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ni
) :
4948 ServerLogContext(s
, r
), dn(d
), newi(ni
) {}
4949 void finish(int r
) override
{
4953 dn
->pop_projected_linkage();
4955 // be a bit hacky with the inode version, here.. we decrement it
4956 // just to keep mark_dirty() happen. (we didn't bother projecting
4957 // a new version of hte inode since it's just been created)
4958 newi
->inode
.version
--;
4959 newi
->mark_dirty(newi
->inode
.version
+ 1, mdr
->ls
);
4960 newi
->_mark_dirty_parent(mdr
->ls
, true);
4963 if (newi
->inode
.is_dir()) {
4964 CDir
*dir
= newi
->get_dirfrag(frag_t());
4966 dir
->fnode
.version
--;
4967 dir
->mark_dirty(dir
->fnode
.version
+ 1, mdr
->ls
);
4968 dir
->mark_new(mdr
->ls
);
4973 MDRequestRef null_ref
;
4974 get_mds()->mdcache
->send_dentry_link(dn
, null_ref
);
4976 if (newi
->inode
.is_file())
4977 get_mds()->locker
->share_inode_max_size(newi
);
4980 get_mds()->balancer
->hit_inode(mdr
->get_mds_stamp(), newi
, META_POP_IWR
);
4983 server
->respond_to_request(mdr
, 0);
4988 void Server::handle_client_mknod(MDRequestRef
& mdr
)
4990 MClientRequest
*req
= mdr
->client_request
;
4991 client_t client
= mdr
->get_client();
4992 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
4993 file_layout_t
*dir_layout
= NULL
;
4994 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, 0, rdlocks
, wrlocks
, xlocks
, false, false, false,
4997 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4998 respond_to_request(mdr
, -EROFS
);
5001 CInode
*diri
= dn
->get_dir()->get_inode();
5002 rdlocks
.insert(&diri
->authlock
);
5003 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
5006 if (!check_access(mdr
, diri
, MAY_WRITE
))
5009 if (!check_fragment_space(mdr
, dn
->get_dir()))
5012 unsigned mode
= req
->head
.args
.mknod
.mode
;
5013 if ((mode
& S_IFMT
) == 0)
5017 file_layout_t layout
;
5018 if (dir_layout
&& S_ISREG(mode
))
5019 layout
= *dir_layout
;
5021 layout
= mdcache
->default_file_layout
;
5023 SnapRealm
*realm
= dn
->get_dir()->inode
->find_snaprealm();
5024 snapid_t follows
= realm
->get_newest_seq();
5025 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
),
5029 dn
->push_projected_linkage(newi
);
5031 newi
->inode
.rdev
= req
->head
.args
.mknod
.rdev
;
5032 newi
->inode
.version
= dn
->pre_dirty();
5033 newi
->inode
.rstat
.rfiles
= 1;
5034 if (layout
.pool_id
!= mdcache
->default_file_layout
.pool_id
)
5035 newi
->inode
.add_old_pool(mdcache
->default_file_layout
.pool_id
);
5036 newi
->inode
.update_backtrace();
5038 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
5039 // want to write to it (e.g., if they are reexporting NFS)
5040 if (S_ISREG(newi
->inode
.mode
)) {
5041 dout(15) << " setting a client_range too, since this is a regular file" << dendl
;
5042 newi
->inode
.client_ranges
[client
].range
.first
= 0;
5043 newi
->inode
.client_ranges
[client
].range
.last
= newi
->inode
.get_layout_size_increment();
5044 newi
->inode
.client_ranges
[client
].follows
= follows
;
5046 // issue a cap on the file
5047 int cmode
= CEPH_FILE_MODE_RDWR
;
5048 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
->session
, realm
, req
->is_replay());
5052 // put locks in excl mode
5053 newi
->filelock
.set_state(LOCK_EXCL
);
5054 newi
->authlock
.set_state(LOCK_EXCL
);
5055 newi
->xattrlock
.set_state(LOCK_EXCL
);
5059 assert(dn
->first
== follows
+ 1);
5060 newi
->first
= dn
->first
;
5062 dout(10) << "mknod mode " << newi
->inode
.mode
<< " rdev " << newi
->inode
.rdev
<< dendl
;
5065 mdr
->ls
= mdlog
->get_current_segment();
5066 EUpdate
*le
= new EUpdate(mdlog
, "mknod");
5067 mdlog
->start_entry(le
);
5068 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5069 journal_allocated_inos(mdr
, &le
->metablob
);
5071 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(),
5072 PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
5073 le
->metablob
.add_primary_dentry(dn
, newi
, true, true, true);
5075 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
5081 /* This function takes responsibility for the passed mdr*/
5082 void Server::handle_client_mkdir(MDRequestRef
& mdr
)
5084 MClientRequest
*req
= mdr
->client_request
;
5085 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
5086 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, 0, rdlocks
, wrlocks
, xlocks
, false, false, false);
5088 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5089 respond_to_request(mdr
, -EROFS
);
5092 CDir
*dir
= dn
->get_dir();
5093 CInode
*diri
= dir
->get_inode();
5094 rdlocks
.insert(&diri
->authlock
);
5095 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
5098 // mkdir check access
5099 if (!check_access(mdr
, diri
, MAY_WRITE
))
5102 if (!check_fragment_space(mdr
, dir
))
5106 SnapRealm
*realm
= dn
->get_dir()->inode
->find_snaprealm();
5107 snapid_t follows
= realm
->get_newest_seq();
5109 unsigned mode
= req
->head
.args
.mkdir
.mode
;
5112 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
), mode
);
5115 // it's a directory.
5116 dn
->push_projected_linkage(newi
);
5118 newi
->inode
.version
= dn
->pre_dirty();
5119 newi
->inode
.rstat
.rsubdirs
= 1;
5120 newi
->inode
.update_backtrace();
5122 dout(12) << " follows " << follows
<< dendl
;
5123 assert(dn
->first
== follows
+ 1);
5124 newi
->first
= dn
->first
;
5126 // ...and that new dir is empty.
5127 CDir
*newdir
= newi
->get_or_open_dirfrag(mdcache
, frag_t());
5128 newdir
->state_set(CDir::STATE_CREATING
);
5129 newdir
->mark_complete();
5130 newdir
->fnode
.version
= newdir
->pre_dirty();
5133 mdr
->ls
= mdlog
->get_current_segment();
5134 EUpdate
*le
= new EUpdate(mdlog
, "mkdir");
5135 mdlog
->start_entry(le
);
5136 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5137 journal_allocated_inos(mdr
, &le
->metablob
);
5138 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
5139 le
->metablob
.add_primary_dentry(dn
, newi
, true, true);
5140 le
->metablob
.add_new_dir(newdir
); // dirty AND complete AND new
5142 // issue a cap on the directory
5143 int cmode
= CEPH_FILE_MODE_RDWR
;
5144 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
->session
, realm
, req
->is_replay());
5148 // put locks in excl mode
5149 newi
->filelock
.set_state(LOCK_EXCL
);
5150 newi
->authlock
.set_state(LOCK_EXCL
);
5151 newi
->xattrlock
.set_state(LOCK_EXCL
);
5154 // make sure this inode gets into the journal
5155 le
->metablob
.add_opened_ino(newi
->ino());
5156 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
5157 ls
->open_files
.push_back(&newi
->item_open_file
);
5159 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
5165 void Server::handle_client_symlink(MDRequestRef
& mdr
)
5167 MClientRequest
*req
= mdr
->client_request
;
5168 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
5169 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, 0, rdlocks
, wrlocks
, xlocks
, false, false, false);
5171 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5172 respond_to_request(mdr
, -EROFS
);
5175 CDir
*dir
= dn
->get_dir();
5176 CInode
*diri
= dir
->get_inode();
5177 rdlocks
.insert(&diri
->authlock
);
5178 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
5181 if (!check_access(mdr
, diri
, MAY_WRITE
))
5184 if (!check_fragment_space(mdr
, dir
))
5187 unsigned mode
= S_IFLNK
| 0777;
5188 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
), mode
);
5192 dn
->push_projected_linkage(newi
);
5194 newi
->symlink
= req
->get_path2();
5195 newi
->inode
.size
= newi
->symlink
.length();
5196 newi
->inode
.rstat
.rbytes
= newi
->inode
.size
;
5197 newi
->inode
.rstat
.rfiles
= 1;
5198 newi
->inode
.version
= dn
->pre_dirty();
5199 newi
->inode
.update_backtrace();
5201 newi
->first
= dn
->first
;
5204 mdr
->ls
= mdlog
->get_current_segment();
5205 EUpdate
*le
= new EUpdate(mdlog
, "symlink");
5206 mdlog
->start_entry(le
);
5207 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5208 journal_allocated_inos(mdr
, &le
->metablob
);
5209 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
5210 le
->metablob
.add_primary_dentry(dn
, newi
, true, true);
5212 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
5221 void Server::handle_client_link(MDRequestRef
& mdr
)
5223 MClientRequest
*req
= mdr
->client_request
;
5225 dout(7) << "handle_client_link " << req
->get_filepath()
5226 << " to " << req
->get_filepath2()
5229 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
5231 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, 0, rdlocks
, wrlocks
, xlocks
, false, false, false);
5233 CInode
*targeti
= rdlock_path_pin_ref(mdr
, 1, rdlocks
, false);
5234 if (!targeti
) return;
5235 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5236 respond_to_request(mdr
, -EROFS
);
5240 CDir
*dir
= dn
->get_dir();
5241 dout(7) << "handle_client_link link " << dn
->get_name() << " in " << *dir
<< dendl
;
5242 dout(7) << "target is " << *targeti
<< dendl
;
5243 if (targeti
->is_dir()) {
5244 // if srcdn is replica, need to make sure its linkage is correct
5245 vector
<CDentry
*>& trace
= mdr
->dn
[1];
5246 if (trace
.empty() ||
5247 trace
.back()->is_auth() ||
5248 trace
.back()->lock
.can_read(mdr
->get_client())) {
5249 dout(7) << "target is a dir, failing..." << dendl
;
5250 respond_to_request(mdr
, -EINVAL
);
5255 xlocks
.insert(&targeti
->linklock
);
5257 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
5260 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
5261 if (!check_access(mdr
, targeti
, MAY_WRITE
))
5264 if (!check_access(mdr
, dir
->get_inode(), MAY_WRITE
))
5267 if (!check_fragment_space(mdr
, dir
))
5272 assert(g_conf
->mds_kill_link_at
!= 1);
5275 if (targeti
->is_auth())
5276 _link_local(mdr
, dn
, targeti
);
5278 _link_remote(mdr
, true, dn
, targeti
);
5282 class C_MDS_link_local_finish
: public ServerLogContext
{
5288 C_MDS_link_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ti
,
5289 version_t dnpv_
, version_t tipv_
) :
5290 ServerLogContext(s
, r
), dn(d
), targeti(ti
),
5291 dnpv(dnpv_
), tipv(tipv_
) { }
5292 void finish(int r
) override
{
5294 server
->_link_local_finish(mdr
, dn
, targeti
, dnpv
, tipv
);
5299 void Server::_link_local(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
)
5301 dout(10) << "_link_local " << *dn
<< " to " << *targeti
<< dendl
;
5303 mdr
->ls
= mdlog
->get_current_segment();
5305 // predirty NEW dentry
5306 version_t dnpv
= dn
->pre_dirty();
5307 version_t tipv
= targeti
->pre_dirty();
5309 // project inode update
5310 inode_t
*pi
= targeti
->project_inode();
5312 pi
->ctime
= mdr
->get_op_stamp();
5317 EUpdate
*le
= new EUpdate(mdlog
, "link_local");
5318 mdlog
->start_entry(le
);
5319 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
5320 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1); // new dn
5321 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, 0, PREDIRTY_PRIMARY
); // targeti
5322 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
5323 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, targeti
);
5325 // do this after predirty_*, to avoid funky extra dnl arg
5326 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
5328 journal_and_reply(mdr
, targeti
, dn
, le
, new C_MDS_link_local_finish(this, mdr
, dn
, targeti
, dnpv
, tipv
));
5331 void Server::_link_local_finish(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
,
5332 version_t dnpv
, version_t tipv
)
5334 dout(10) << "_link_local_finish " << *dn
<< " to " << *targeti
<< dendl
;
5336 // link and unlock the NEW dentry
5337 CDentry::linkage_t
*dnl
= dn
->pop_projected_linkage();
5338 if (!dnl
->get_inode())
5339 dn
->link_remote(dnl
, targeti
);
5340 dn
->mark_dirty(dnpv
, mdr
->ls
);
5343 targeti
->pop_and_dirty_projected_inode(mdr
->ls
);
5347 MDRequestRef null_ref
;
5348 mdcache
->send_dentry_link(dn
, null_ref
);
5350 // bump target popularity
5351 mds
->balancer
->hit_inode(mdr
->get_mds_stamp(), targeti
, META_POP_IWR
);
5352 mds
->balancer
->hit_dir(mdr
->get_mds_stamp(), dn
->get_dir(), META_POP_IWR
);
5355 respond_to_request(mdr
, 0);
5359 // link / unlink remote
5361 class C_MDS_link_remote_finish
: public ServerLogContext
{
5367 C_MDS_link_remote_finish(Server
*s
, MDRequestRef
& r
, bool i
, CDentry
*d
, CInode
*ti
) :
5368 ServerLogContext(s
, r
), inc(i
), dn(d
), targeti(ti
),
5369 dpv(d
->get_projected_version()) {}
5370 void finish(int r
) override
{
5372 server
->_link_remote_finish(mdr
, inc
, dn
, targeti
, dpv
);
5376 void Server::_link_remote(MDRequestRef
& mdr
, bool inc
, CDentry
*dn
, CInode
*targeti
)
5378 dout(10) << "_link_remote "
5379 << (inc
? "link ":"unlink ")
5380 << *dn
<< " to " << *targeti
<< dendl
;
5382 // 1. send LinkPrepare to dest (journal nlink++ prepare)
5383 mds_rank_t linkauth
= targeti
->authority().first
;
5384 if (mdr
->more()->witnessed
.count(linkauth
) == 0) {
5385 if (mds
->is_cluster_degraded() &&
5386 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(linkauth
)) {
5387 dout(10) << " targeti auth mds." << linkauth
<< " is not active" << dendl
;
5388 if (mdr
->more()->waiting_on_slave
.empty())
5389 mds
->wait_for_active_peer(linkauth
, new C_MDS_RetryRequest(mdcache
, mdr
));
5393 dout(10) << " targeti auth must prepare nlink++/--" << dendl
;
5396 op
= MMDSSlaveRequest::OP_LINKPREP
;
5398 op
= MMDSSlaveRequest::OP_UNLINKPREP
;
5399 MMDSSlaveRequest
*req
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
, op
);
5400 targeti
->set_object_info(req
->get_object_info());
5401 req
->op_stamp
= mdr
->get_op_stamp();
5402 mds
->send_message_mds(req
, linkauth
);
5404 assert(mdr
->more()->waiting_on_slave
.count(linkauth
) == 0);
5405 mdr
->more()->waiting_on_slave
.insert(linkauth
);
5408 dout(10) << " targeti auth has prepared nlink++/--" << dendl
;
5410 assert(g_conf
->mds_kill_link_at
!= 2);
5412 mdr
->set_mds_stamp(ceph_clock_now());
5415 mdr
->ls
= mdlog
->get_current_segment();
5416 EUpdate
*le
= new EUpdate(mdlog
, inc
? "link_remote":"unlink_remote");
5417 mdlog
->start_entry(le
);
5418 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
5419 if (!mdr
->more()->witnessed
.empty()) {
5420 dout(20) << " noting uncommitted_slaves " << mdr
->more()->witnessed
<< dendl
;
5421 le
->reqid
= mdr
->reqid
;
5422 le
->had_slaves
= true;
5423 mdcache
->add_uncommitted_master(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
5428 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1);
5429 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
5430 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
5433 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, -1);
5434 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
5435 le
->metablob
.add_null_dentry(dn
, true);
5436 dn
->push_projected_linkage();
5439 journal_and_reply(mdr
, targeti
, dn
, le
, new C_MDS_link_remote_finish(this, mdr
, inc
, dn
, targeti
));
5442 void Server::_link_remote_finish(MDRequestRef
& mdr
, bool inc
,
5443 CDentry
*dn
, CInode
*targeti
,
5446 dout(10) << "_link_remote_finish "
5447 << (inc
? "link ":"unlink ")
5448 << *dn
<< " to " << *targeti
<< dendl
;
5450 assert(g_conf
->mds_kill_link_at
!= 3);
5452 if (!mdr
->more()->witnessed
.empty())
5453 mdcache
->logged_master_update(mdr
->reqid
);
5456 // link the new dentry
5457 CDentry::linkage_t
*dnl
= dn
->pop_projected_linkage();
5458 if (!dnl
->get_inode())
5459 dn
->link_remote(dnl
, targeti
);
5460 dn
->mark_dirty(dpv
, mdr
->ls
);
5462 // unlink main dentry
5463 dn
->get_dir()->unlink_inode(dn
);
5464 dn
->pop_projected_linkage();
5465 dn
->mark_dirty(dn
->get_projected_version(), mdr
->ls
); // dirty old dentry
5470 MDRequestRef null_ref
;
5472 mdcache
->send_dentry_link(dn
, null_ref
);
5474 mdcache
->send_dentry_unlink(dn
, NULL
, null_ref
);
5476 // bump target popularity
5477 mds
->balancer
->hit_inode(mdr
->get_mds_stamp(), targeti
, META_POP_IWR
);
5478 mds
->balancer
->hit_dir(mdr
->get_mds_stamp(), dn
->get_dir(), META_POP_IWR
);
5481 respond_to_request(mdr
, 0);
5484 // removing a new dn?
5485 dn
->get_dir()->try_remove_unlinked_dn(dn
);
5489 // remote linking/unlinking
5491 class C_MDS_SlaveLinkPrep
: public ServerLogContext
{
5494 C_MDS_SlaveLinkPrep(Server
*s
, MDRequestRef
& r
, CInode
*t
) :
5495 ServerLogContext(s
, r
), targeti(t
) { }
5496 void finish(int r
) override
{
5498 server
->_logged_slave_link(mdr
, targeti
);
5502 class C_MDS_SlaveLinkCommit
: public ServerContext
{
5506 C_MDS_SlaveLinkCommit(Server
*s
, MDRequestRef
& r
, CInode
*t
) :
5507 ServerContext(s
), mdr(r
), targeti(t
) { }
5508 void finish(int r
) override
{
5509 server
->_commit_slave_link(mdr
, r
, targeti
);
5513 /* This function DOES put the mdr->slave_request before returning*/
5514 void Server::handle_slave_link_prep(MDRequestRef
& mdr
)
5516 dout(10) << "handle_slave_link_prep " << *mdr
5517 << " on " << mdr
->slave_request
->get_object_info()
5520 assert(g_conf
->mds_kill_link_at
!= 4);
5522 CInode
*targeti
= mdcache
->get_inode(mdr
->slave_request
->get_object_info().ino
);
5524 dout(10) << "targeti " << *targeti
<< dendl
;
5525 CDentry
*dn
= targeti
->get_parent_dn();
5526 CDentry::linkage_t
*dnl
= dn
->get_linkage();
5527 assert(dnl
->is_primary());
5529 mdr
->set_op_stamp(mdr
->slave_request
->op_stamp
);
5531 mdr
->auth_pin(targeti
);
5533 //ceph_abort(); // test hack: make sure master can handle a slave that fails to prepare...
5534 assert(g_conf
->mds_kill_link_at
!= 5);
5537 mdr
->ls
= mdlog
->get_current_segment();
5538 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_link_prep", mdr
->reqid
, mdr
->slave_to_mds
,
5539 ESlaveUpdate::OP_PREPARE
, ESlaveUpdate::LINK
);
5540 mdlog
->start_entry(le
);
5542 inode_t
*pi
= dnl
->get_inode()->project_inode();
5544 // update journaled target inode
5546 if (mdr
->slave_request
->get_op() == MMDSSlaveRequest::OP_LINKPREP
) {
5554 link_rollback rollback
;
5555 rollback
.reqid
= mdr
->reqid
;
5556 rollback
.ino
= targeti
->ino();
5557 rollback
.old_ctime
= targeti
->inode
.ctime
; // we hold versionlock xlock; no concorrent projections
5558 const fnode_t
*pf
= targeti
->get_parent_dn()->get_dir()->get_projected_fnode();
5559 rollback
.old_dir_mtime
= pf
->fragstat
.mtime
;
5560 rollback
.old_dir_rctime
= pf
->rstat
.rctime
;
5561 rollback
.was_inc
= inc
;
5562 ::encode(rollback
, le
->rollback
);
5563 mdr
->more()->rollback_bl
= le
->rollback
;
5565 pi
->ctime
= mdr
->get_op_stamp();
5566 pi
->version
= targeti
->pre_dirty();
5568 dout(10) << " projected inode " << pi
<< " v " << pi
->version
<< dendl
;
5571 mdcache
->predirty_journal_parents(mdr
, &le
->commit
, dnl
->get_inode(), 0, PREDIRTY_SHALLOW
|PREDIRTY_PRIMARY
);
5572 mdcache
->journal_dirty_inode(mdr
.get(), &le
->commit
, targeti
);
5574 // set up commit waiter
5575 mdr
->more()->slave_commit
= new C_MDS_SlaveLinkCommit(this, mdr
, targeti
);
5577 mdr
->more()->slave_update_journaled
= true;
5578 submit_mdlog_entry(le
, new C_MDS_SlaveLinkPrep(this, mdr
, targeti
),
5583 void Server::_logged_slave_link(MDRequestRef
& mdr
, CInode
*targeti
)
5585 dout(10) << "_logged_slave_link " << *mdr
5586 << " " << *targeti
<< dendl
;
5588 assert(g_conf
->mds_kill_link_at
!= 6);
5590 // update the target
5591 targeti
->pop_and_dirty_projected_inode(mdr
->ls
);
5595 mds
->balancer
->hit_inode(mdr
->get_mds_stamp(), targeti
, META_POP_IWR
);
5598 mdr
->slave_request
->put();
5599 mdr
->slave_request
= 0;
5602 if (!mdr
->aborted
) {
5603 MMDSSlaveRequest
*reply
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
5604 MMDSSlaveRequest::OP_LINKPREPACK
);
5605 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
5607 dout(10) << " abort flag set, finishing" << dendl
;
5608 mdcache
->request_finish(mdr
);
5613 struct C_MDS_CommittedSlave
: public ServerLogContext
{
5614 C_MDS_CommittedSlave(Server
*s
, MDRequestRef
& m
) : ServerLogContext(s
, m
) {}
5615 void finish(int r
) override
{
5616 server
->_committed_slave(mdr
);
5620 void Server::_commit_slave_link(MDRequestRef
& mdr
, int r
, CInode
*targeti
)
5622 dout(10) << "_commit_slave_link " << *mdr
5624 << " " << *targeti
<< dendl
;
5626 assert(g_conf
->mds_kill_link_at
!= 7);
5629 // drop our pins, etc.
5632 // write a commit to the journal
5633 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_link_commit", mdr
->reqid
, mdr
->slave_to_mds
,
5634 ESlaveUpdate::OP_COMMIT
, ESlaveUpdate::LINK
);
5635 mdlog
->start_entry(le
);
5636 submit_mdlog_entry(le
, new C_MDS_CommittedSlave(this, mdr
), mdr
, __func__
);
5639 do_link_rollback(mdr
->more()->rollback_bl
, mdr
->slave_to_mds
, mdr
);
5643 void Server::_committed_slave(MDRequestRef
& mdr
)
5645 dout(10) << "_committed_slave " << *mdr
<< dendl
;
5647 assert(g_conf
->mds_kill_link_at
!= 8);
5649 MMDSSlaveRequest
*req
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
5650 MMDSSlaveRequest::OP_COMMITTED
);
5651 mds
->send_message_mds(req
, mdr
->slave_to_mds
);
5652 mdcache
->request_finish(mdr
);
5655 struct C_MDS_LoggedLinkRollback
: public ServerLogContext
{
5657 C_MDS_LoggedLinkRollback(Server
*s
, MutationRef
& m
, MDRequestRef
& r
) : ServerLogContext(s
, r
), mut(m
) {}
5658 void finish(int r
) override
{
5659 server
->_link_rollback_finish(mut
, mdr
);
5663 void Server::do_link_rollback(bufferlist
&rbl
, mds_rank_t master
, MDRequestRef
& mdr
)
5665 link_rollback rollback
;
5666 bufferlist::iterator p
= rbl
.begin();
5667 ::decode(rollback
, p
);
5669 dout(10) << "do_link_rollback on " << rollback
.reqid
5670 << (rollback
.was_inc
? " inc":" dec")
5671 << " ino " << rollback
.ino
5674 assert(g_conf
->mds_kill_link_at
!= 9);
5676 mdcache
->add_rollback(rollback
.reqid
, master
); // need to finish this update before resolve finishes
5677 assert(mdr
|| mds
->is_resolve());
5679 MutationRef
mut(new MutationImpl(nullptr, utime_t(), rollback
.reqid
));
5680 mut
->ls
= mds
->mdlog
->get_current_segment();
5682 CInode
*in
= mdcache
->get_inode(rollback
.ino
);
5684 dout(10) << " target is " << *in
<< dendl
;
5685 assert(!in
->is_projected()); // live slave request hold versionlock xlock.
5687 inode_t
*pi
= in
->project_inode();
5688 pi
->version
= in
->pre_dirty();
5689 mut
->add_projected_inode(in
);
5691 // parent dir rctime
5692 CDir
*parent
= in
->get_projected_parent_dn()->get_dir();
5693 fnode_t
*pf
= parent
->project_fnode();
5694 mut
->add_projected_fnode(parent
);
5695 pf
->version
= parent
->pre_dirty();
5696 if (pf
->fragstat
.mtime
== pi
->ctime
) {
5697 pf
->fragstat
.mtime
= rollback
.old_dir_mtime
;
5698 if (pf
->rstat
.rctime
== pi
->ctime
)
5699 pf
->rstat
.rctime
= rollback
.old_dir_rctime
;
5700 mut
->add_updated_lock(&parent
->get_inode()->filelock
);
5701 mut
->add_updated_lock(&parent
->get_inode()->nestlock
);
5705 pi
->ctime
= rollback
.old_ctime
;
5706 if (rollback
.was_inc
)
5712 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_link_rollback", rollback
.reqid
, master
,
5713 ESlaveUpdate::OP_ROLLBACK
, ESlaveUpdate::LINK
);
5714 mdlog
->start_entry(le
);
5715 le
->commit
.add_dir_context(parent
);
5716 le
->commit
.add_dir(parent
, true);
5717 le
->commit
.add_primary_dentry(in
->get_projected_parent_dn(), 0, true);
5719 submit_mdlog_entry(le
, new C_MDS_LoggedLinkRollback(this, mut
, mdr
),
5724 void Server::_link_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
)
5726 dout(10) << "_link_rollback_finish" << dendl
;
5728 assert(g_conf
->mds_kill_link_at
!= 10);
5732 mdcache
->request_finish(mdr
);
5734 mdcache
->finish_rollback(mut
->reqid
);
5740 /* This function DOES NOT put the passed message before returning*/
5741 void Server::handle_slave_link_prep_ack(MDRequestRef
& mdr
, MMDSSlaveRequest
*m
)
5743 dout(10) << "handle_slave_link_prep_ack " << *mdr
5744 << " " << *m
<< dendl
;
5745 mds_rank_t from
= mds_rank_t(m
->get_source().num());
5747 assert(g_conf
->mds_kill_link_at
!= 11);
5750 mdr
->more()->slaves
.insert(from
);
5753 assert(mdr
->more()->witnessed
.count(from
) == 0);
5754 mdr
->more()->witnessed
.insert(from
);
5755 assert(!m
->is_not_journaled());
5756 mdr
->more()->has_journaled_slaves
= true;
5758 // remove from waiting list
5759 assert(mdr
->more()->waiting_on_slave
.count(from
));
5760 mdr
->more()->waiting_on_slave
.erase(from
);
5762 assert(mdr
->more()->waiting_on_slave
.empty());
5764 dispatch_client_request(mdr
); // go again!
5773 void Server::handle_client_unlink(MDRequestRef
& mdr
)
5775 MClientRequest
*req
= mdr
->client_request
;
5776 client_t client
= mdr
->get_client();
5780 if (req
->get_op() == CEPH_MDS_OP_RMDIR
) rmdir
= true;
5782 if (req
->get_filepath().depth() == 0) {
5783 respond_to_request(mdr
, -EINVAL
);
5788 vector
<CDentry
*> trace
;
5790 int r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, req
->get_filepath(), &trace
, &in
, MDS_TRAVERSE_FORWARD
);
5794 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
5795 mdcache
->find_ino_peers(req
->get_filepath().get_ino(), new C_MDS_TryFindInode(this, mdr
));
5798 respond_to_request(mdr
, r
);
5801 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5802 respond_to_request(mdr
, -EROFS
);
5806 CDentry
*dn
= trace
[trace
.size()-1];
5808 if (!dn
->is_auth()) {
5809 mdcache
->request_forward(mdr
, dn
->authority().first
);
5813 CInode
*diri
= dn
->get_dir()->get_inode();
5815 CDentry::linkage_t
*dnl
= dn
->get_linkage(client
, mdr
);
5816 assert(!dnl
->is_null());
5819 dout(7) << "handle_client_rmdir on " << *dn
<< dendl
;
5821 dout(7) << "handle_client_unlink on " << *dn
<< dendl
;
5823 dout(7) << "dn links to " << *in
<< dendl
;
5828 // do empty directory checks
5829 if (_dir_is_nonempty_unlocked(mdr
, in
)) {
5830 respond_to_request(mdr
, -ENOTEMPTY
);
5834 dout(7) << "handle_client_unlink on dir " << *in
<< ", returning error" << dendl
;
5835 respond_to_request(mdr
, -EISDIR
);
5841 dout(7) << "handle_client_rmdir on non-dir " << *in
<< ", returning error" << dendl
;
5842 respond_to_request(mdr
, -ENOTDIR
);
5847 // -- create stray dentry? --
5848 CDentry
*straydn
= NULL
;
5849 if (dnl
->is_primary()) {
5850 straydn
= prepare_stray_dentry(mdr
, dnl
->get_inode());
5853 dout(10) << " straydn is " << *straydn
<< dendl
;
5854 } else if (mdr
->straydn
) {
5855 mdr
->unpin(mdr
->straydn
);
5856 mdr
->straydn
= NULL
;
5860 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
5862 for (int i
=0; i
<(int)trace
.size()-1; i
++)
5863 rdlocks
.insert(&trace
[i
]->lock
);
5864 xlocks
.insert(&dn
->lock
);
5865 wrlocks
.insert(&diri
->filelock
);
5866 wrlocks
.insert(&diri
->nestlock
);
5867 xlocks
.insert(&in
->linklock
);
5869 wrlocks
.insert(&straydn
->get_dir()->inode
->filelock
);
5870 wrlocks
.insert(&straydn
->get_dir()->inode
->nestlock
);
5871 xlocks
.insert(&straydn
->lock
);
5874 rdlocks
.insert(&in
->filelock
); // to verify it's empty
5875 mds
->locker
->include_snap_rdlocks(rdlocks
, dnl
->get_inode());
5877 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
5881 _dir_is_nonempty(mdr
, in
)) {
5882 respond_to_request(mdr
, -ENOTEMPTY
);
5886 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
5887 if (!check_access(mdr
, diri
, MAY_WRITE
))
5892 if (in
->is_dir() && in
->has_subtree_root_dirfrag()) {
5893 // subtree root auths need to be witnesses
5894 set
<mds_rank_t
> witnesses
;
5895 in
->list_replicas(witnesses
);
5896 dout(10) << " witnesses " << witnesses
<< ", have " << mdr
->more()->witnessed
<< dendl
;
5898 for (set
<mds_rank_t
>::iterator p
= witnesses
.begin();
5899 p
!= witnesses
.end();
5901 if (mdr
->more()->witnessed
.count(*p
)) {
5902 dout(10) << " already witnessed by mds." << *p
<< dendl
;
5903 } else if (mdr
->more()->waiting_on_slave
.count(*p
)) {
5904 dout(10) << " already waiting on witness mds." << *p
<< dendl
;
5906 if (!_rmdir_prepare_witness(mdr
, *p
, trace
, straydn
))
5910 if (!mdr
->more()->waiting_on_slave
.empty())
5911 return; // we're waiting for a witness.
5915 if (dnl
->is_remote() && !dnl
->get_inode()->is_auth())
5916 _link_remote(mdr
, false, dn
, dnl
->get_inode());
5918 _unlink_local(mdr
, dn
, straydn
);
5921 class C_MDS_unlink_local_finish
: public ServerLogContext
{
5924 version_t dnpv
; // deleted dentry
5926 C_MDS_unlink_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CDentry
*sd
) :
5927 ServerLogContext(s
, r
), dn(d
), straydn(sd
),
5928 dnpv(d
->get_projected_version()) {}
5929 void finish(int r
) override
{
5931 server
->_unlink_local_finish(mdr
, dn
, straydn
, dnpv
);
5935 void Server::_unlink_local(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
)
5937 dout(10) << "_unlink_local " << *dn
<< dendl
;
5939 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
5940 CInode
*in
= dnl
->get_inode();
5942 SnapRealm
*realm
= in
->find_snaprealm();
5943 snapid_t follows
= realm
->get_newest_seq();
5946 mdr
->ls
= mdlog
->get_current_segment();
5948 // prepare log entry
5949 EUpdate
*le
= new EUpdate(mdlog
, "unlink_local");
5950 mdlog
->start_entry(le
);
5951 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
5952 if (!mdr
->more()->witnessed
.empty()) {
5953 dout(20) << " noting uncommitted_slaves " << mdr
->more()->witnessed
<< dendl
;
5954 le
->reqid
= mdr
->reqid
;
5955 le
->had_slaves
= true;
5956 mdcache
->add_uncommitted_master(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
5960 assert(dnl
->is_primary());
5961 straydn
->push_projected_linkage(in
);
5962 straydn
->first
= follows
+ 1;
5965 // the unlinked dentry
5968 inode_t
*pi
= in
->project_inode();
5969 dn
->make_path_string(pi
->stray_prior_path
, true);
5970 mdr
->add_projected_inode(in
); // do this _after_ my dn->pre_dirty().. we apply that one manually.
5971 pi
->version
= in
->pre_dirty();
5972 pi
->ctime
= mdr
->get_op_stamp();
5976 in
->state_set(CInode::STATE_ORPHAN
);
5978 if (dnl
->is_primary()) {
5979 // primary link. add stray dentry.
5981 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, -1);
5982 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, straydn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
5984 // project snaprealm, too
5985 if (in
->snaprealm
|| follows
+ 1 > in
->get_oldest_snap())
5986 in
->project_past_snaprealm_parent(straydn
->get_dir()->inode
->find_snaprealm());
5988 pi
->update_backtrace();
5989 le
->metablob
.add_primary_dentry(straydn
, in
, true, true);
5991 // remote link. update remote inode.
5992 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_DIR
, -1);
5993 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
5994 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, in
);
5997 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
5998 le
->metablob
.add_null_dentry(dn
, true);
6001 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
6002 le
->metablob
.renamed_dirino
= in
->ino();
6005 dn
->push_projected_linkage();
6009 mdcache
->project_subtree_rename(in
, dn
->get_dir(), straydn
->get_dir());
6011 in
->maybe_export_pin(true);
6014 journal_and_reply(mdr
, 0, dn
, le
, new C_MDS_unlink_local_finish(this, mdr
, dn
, straydn
));
6017 void Server::_unlink_local_finish(MDRequestRef
& mdr
,
6018 CDentry
*dn
, CDentry
*straydn
,
6021 dout(10) << "_unlink_local_finish " << *dn
<< dendl
;
6023 if (!mdr
->more()->witnessed
.empty())
6024 mdcache
->logged_master_update(mdr
->reqid
);
6026 // unlink main dentry
6027 dn
->get_dir()->unlink_inode(dn
);
6028 dn
->pop_projected_linkage();
6030 // relink as stray? (i.e. was primary link?)
6031 CInode
*strayin
= NULL
;
6032 bool snap_is_new
= false;
6034 dout(20) << " straydn is " << *straydn
<< dendl
;
6035 CDentry::linkage_t
*straydnl
= straydn
->pop_projected_linkage();
6036 strayin
= straydnl
->get_inode();
6038 snap_is_new
= strayin
->snaprealm
? true : false;
6039 mdcache
->touch_dentry_bottom(straydn
);
6042 dn
->mark_dirty(dnpv
, mdr
->ls
);
6045 if (snap_is_new
) //only new if strayin exists
6046 mdcache
->do_realm_invalidate_and_update_notify(strayin
, CEPH_SNAP_OP_SPLIT
, true);
6048 mdcache
->send_dentry_unlink(dn
, straydn
, mdr
);
6050 // update subtree map?
6051 if (straydn
&& strayin
->is_dir())
6052 mdcache
->adjust_subtree_after_rename(strayin
, dn
->get_dir(), true);
6055 mds
->balancer
->hit_dir(mdr
->get_mds_stamp(), dn
->get_dir(), META_POP_IWR
);
6058 respond_to_request(mdr
, 0);
6060 // removing a new dn?
6061 dn
->get_dir()->try_remove_unlinked_dn(dn
);
6064 // respond_to_request() drops locks. So stray reintegration can race with us.
6065 if (straydn
&& !straydn
->get_projected_linkage()->is_null()) {
6066 // Tip off the MDCache that this dentry is a stray that
6067 // might be elegible for purge.
6068 mdcache
->notify_stray(straydn
);
6072 bool Server::_rmdir_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, vector
<CDentry
*>& trace
, CDentry
*straydn
)
6074 if (mds
->is_cluster_degraded() &&
6075 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(who
)) {
6076 dout(10) << "_rmdir_prepare_witness mds." << who
<< " is not active" << dendl
;
6077 if (mdr
->more()->waiting_on_slave
.empty())
6078 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(mdcache
, mdr
));
6082 dout(10) << "_rmdir_prepare_witness mds." << who
<< dendl
;
6083 MMDSSlaveRequest
*req
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
6084 MMDSSlaveRequest::OP_RMDIRPREP
);
6085 req
->srcdnpath
= filepath(trace
.front()->get_dir()->ino());
6086 for (auto dn
: trace
)
6087 req
->srcdnpath
.push_dentry(dn
->name
);
6088 mdcache
->replicate_stray(straydn
, who
, req
->stray
);
6090 req
->op_stamp
= mdr
->get_op_stamp();
6091 mds
->send_message_mds(req
, who
);
6093 assert(mdr
->more()->waiting_on_slave
.count(who
) == 0);
6094 mdr
->more()->waiting_on_slave
.insert(who
);
6098 struct C_MDS_SlaveRmdirPrep
: public ServerLogContext
{
6099 CDentry
*dn
, *straydn
;
6100 C_MDS_SlaveRmdirPrep(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CDentry
*st
)
6101 : ServerLogContext(s
, r
), dn(d
), straydn(st
) {}
6102 void finish(int r
) override
{
6103 server
->_logged_slave_rmdir(mdr
, dn
, straydn
);
6107 struct C_MDS_SlaveRmdirCommit
: public ServerContext
{
6110 C_MDS_SlaveRmdirCommit(Server
*s
, MDRequestRef
& r
, CDentry
*sd
)
6111 : ServerContext(s
), mdr(r
), straydn(sd
) { }
6112 void finish(int r
) override
{
6113 server
->_commit_slave_rmdir(mdr
, r
, straydn
);
6117 void Server::handle_slave_rmdir_prep(MDRequestRef
& mdr
)
6119 dout(10) << "handle_slave_rmdir_prep " << *mdr
6120 << " " << mdr
->slave_request
->srcdnpath
6121 << " to " << mdr
->slave_request
->destdnpath
6124 vector
<CDentry
*> trace
;
6125 filepath
srcpath(mdr
->slave_request
->srcdnpath
);
6126 dout(10) << " src " << srcpath
<< dendl
;
6128 int r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, srcpath
, &trace
, &in
, MDS_TRAVERSE_DISCOVERXLOCK
);
6131 mdcache
->find_ino_peers(srcpath
.get_ino(), new C_MDS_RetryRequest(mdcache
, mdr
),
6136 CDentry
*dn
= trace
[trace
.size()-1];
6137 dout(10) << " dn " << *dn
<< dendl
;
6140 assert(mdr
->straydn
);
6141 CDentry
*straydn
= mdr
->straydn
;
6142 dout(10) << " straydn " << *straydn
<< dendl
;
6144 mdr
->set_op_stamp(mdr
->slave_request
->op_stamp
);
6146 rmdir_rollback rollback
;
6147 rollback
.reqid
= mdr
->reqid
;
6148 rollback
.src_dir
= dn
->get_dir()->dirfrag();
6149 rollback
.src_dname
= dn
->name
;
6150 rollback
.dest_dir
= straydn
->get_dir()->dirfrag();
6151 rollback
.dest_dname
= straydn
->name
;
6152 ::encode(rollback
, mdr
->more()->rollback_bl
);
6153 dout(20) << " rollback is " << mdr
->more()->rollback_bl
.length() << " bytes" << dendl
;
6155 // set up commit waiter
6156 mdr
->more()->slave_commit
= new C_MDS_SlaveRmdirCommit(this, mdr
, straydn
);
6158 if (!in
->has_subtree_root_dirfrag(mds
->get_nodeid())) {
6159 dout(10) << " no auth subtree in " << *in
<< ", skipping journal" << dendl
;
6160 dn
->get_dir()->unlink_inode(dn
);
6161 straydn
->get_dir()->link_primary_inode(straydn
, in
);
6163 assert(straydn
->first
>= in
->first
);
6164 in
->first
= straydn
->first
;
6166 mdcache
->adjust_subtree_after_rename(in
, dn
->get_dir(), false);
6168 MMDSSlaveRequest
*reply
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
6169 MMDSSlaveRequest::OP_RMDIRPREPACK
);
6170 reply
->mark_not_journaled();
6171 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
6173 // send caps to auth (if we're not already)
6174 if (in
->is_any_caps() && !in
->state_test(CInode::STATE_EXPORTINGCAPS
))
6175 mdcache
->migrator
->export_caps(in
);
6177 mdcache
->touch_dentry_bottom(straydn
); // move stray to end of lru
6179 mdr
->slave_request
->put();
6180 mdr
->slave_request
= 0;
6185 straydn
->push_projected_linkage(in
);
6186 dn
->push_projected_linkage();
6188 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rmdir", mdr
->reqid
, mdr
->slave_to_mds
,
6189 ESlaveUpdate::OP_PREPARE
, ESlaveUpdate::RMDIR
);
6190 mdlog
->start_entry(le
);
6191 le
->rollback
= mdr
->more()->rollback_bl
;
6193 le
->commit
.add_dir_context(straydn
->get_dir());
6194 le
->commit
.add_primary_dentry(straydn
, in
, true);
6195 // slave: no need to journal original dentry
6197 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
6198 le
->commit
.renamed_dirino
= in
->ino();
6200 mdcache
->project_subtree_rename(in
, dn
->get_dir(), straydn
->get_dir());
6202 mdr
->more()->slave_update_journaled
= true;
6203 submit_mdlog_entry(le
, new C_MDS_SlaveRmdirPrep(this, mdr
, dn
, straydn
),
6208 void Server::_logged_slave_rmdir(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
)
6210 dout(10) << "_logged_slave_rmdir " << *mdr
<< " on " << *dn
<< dendl
;
6212 // update our cache now, so we are consistent with what is in the journal
6213 // when we journal a subtree map
6214 CInode
*in
= dn
->get_linkage()->get_inode();
6215 dn
->get_dir()->unlink_inode(dn
);
6216 straydn
->pop_projected_linkage();
6217 dn
->pop_projected_linkage();
6218 mdcache
->adjust_subtree_after_rename(in
, dn
->get_dir(), true);
6221 mdr
->slave_request
->put();
6222 mdr
->slave_request
= 0;
6225 if (!mdr
->aborted
) {
6226 MMDSSlaveRequest
*reply
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
6227 MMDSSlaveRequest::OP_RMDIRPREPACK
);
6228 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
6230 dout(10) << " abort flag set, finishing" << dendl
;
6231 mdcache
->request_finish(mdr
);
6235 void Server::handle_slave_rmdir_prep_ack(MDRequestRef
& mdr
, MMDSSlaveRequest
*ack
)
6237 dout(10) << "handle_slave_rmdir_prep_ack " << *mdr
6238 << " " << *ack
<< dendl
;
6240 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
6242 mdr
->more()->slaves
.insert(from
);
6243 mdr
->more()->witnessed
.insert(from
);
6244 if (!ack
->is_not_journaled())
6245 mdr
->more()->has_journaled_slaves
= true;
6247 // remove from waiting list
6248 assert(mdr
->more()->waiting_on_slave
.count(from
));
6249 mdr
->more()->waiting_on_slave
.erase(from
);
6251 if (mdr
->more()->waiting_on_slave
.empty())
6252 dispatch_client_request(mdr
); // go again!
6254 dout(10) << "still waiting on slaves " << mdr
->more()->waiting_on_slave
<< dendl
;
6257 void Server::_commit_slave_rmdir(MDRequestRef
& mdr
, int r
, CDentry
*straydn
)
6259 dout(10) << "_commit_slave_rmdir " << *mdr
<< " r=" << r
<< dendl
;
6262 if (mdr
->more()->slave_update_journaled
) {
6263 CInode
*strayin
= straydn
->get_projected_linkage()->get_inode();
6264 if (strayin
&& !strayin
->snaprealm
)
6265 mdcache
->clear_dirty_bits_for_stray(strayin
);
6270 if (mdr
->more()->slave_update_journaled
) {
6271 // write a commit to the journal
6272 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rmdir_commit", mdr
->reqid
,
6273 mdr
->slave_to_mds
, ESlaveUpdate::OP_COMMIT
,
6274 ESlaveUpdate::RMDIR
);
6275 mdlog
->start_entry(le
);
6276 submit_mdlog_entry(le
, new C_MDS_CommittedSlave(this, mdr
), mdr
, __func__
);
6279 _committed_slave(mdr
);
6283 do_rmdir_rollback(mdr
->more()->rollback_bl
, mdr
->slave_to_mds
, mdr
);
6287 struct C_MDS_LoggedRmdirRollback
: public ServerLogContext
{
6291 C_MDS_LoggedRmdirRollback(Server
*s
, MDRequestRef
& m
, metareqid_t mr
, CDentry
*d
, CDentry
*st
)
6292 : ServerLogContext(s
, m
), reqid(mr
), dn(d
), straydn(st
) {}
6293 void finish(int r
) override
{
6294 server
->_rmdir_rollback_finish(mdr
, reqid
, dn
, straydn
);
6298 void Server::do_rmdir_rollback(bufferlist
&rbl
, mds_rank_t master
, MDRequestRef
& mdr
)
6300 // unlink the other rollback methods, the rmdir rollback is only
6301 // needed to record the subtree changes in the journal for inode
6302 // replicas who are auth for empty dirfrags. no actual changes to
6303 // the file system are taking place here, so there is no Mutation.
6305 rmdir_rollback rollback
;
6306 bufferlist::iterator p
= rbl
.begin();
6307 ::decode(rollback
, p
);
6309 dout(10) << "do_rmdir_rollback on " << rollback
.reqid
<< dendl
;
6310 mdcache
->add_rollback(rollback
.reqid
, master
); // need to finish this update before resolve finishes
6311 assert(mdr
|| mds
->is_resolve());
6313 CDir
*dir
= mdcache
->get_dirfrag(rollback
.src_dir
);
6315 dir
= mdcache
->get_dirfrag(rollback
.src_dir
.ino
, rollback
.src_dname
);
6317 CDentry
*dn
= dir
->lookup(rollback
.src_dname
);
6319 dout(10) << " dn " << *dn
<< dendl
;
6320 dir
= mdcache
->get_dirfrag(rollback
.dest_dir
);
6322 CDentry
*straydn
= dir
->lookup(rollback
.dest_dname
);
6324 dout(10) << " straydn " << *dn
<< dendl
;
6325 CInode
*in
= straydn
->get_linkage()->get_inode();
6327 if (mdr
&& !mdr
->more()->slave_update_journaled
) {
6328 assert(!in
->has_subtree_root_dirfrag(mds
->get_nodeid()));
6330 straydn
->get_dir()->unlink_inode(straydn
);
6331 dn
->get_dir()->link_primary_inode(dn
, in
);
6333 mdcache
->adjust_subtree_after_rename(in
, straydn
->get_dir(), false);
6335 mdcache
->request_finish(mdr
);
6336 mdcache
->finish_rollback(rollback
.reqid
);
6340 dn
->push_projected_linkage(in
);
6341 straydn
->push_projected_linkage();
6343 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rmdir_rollback", rollback
.reqid
, master
,
6344 ESlaveUpdate::OP_ROLLBACK
, ESlaveUpdate::RMDIR
);
6345 mdlog
->start_entry(le
);
6347 le
->commit
.add_dir_context(dn
->get_dir());
6348 le
->commit
.add_primary_dentry(dn
, in
, true);
6349 // slave: no need to journal straydn
6351 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
6352 le
->commit
.renamed_dirino
= in
->ino();
6354 mdcache
->project_subtree_rename(in
, straydn
->get_dir(), dn
->get_dir());
6356 submit_mdlog_entry(le
,
6357 new C_MDS_LoggedRmdirRollback(this, mdr
,rollback
.reqid
,
/**
 * _rmdir_rollback_finish -- journal-commit callback for the rmdir rollback.
 *
 * Undoes a prepared (but uncommitted) rmdir on a slave: relinks the inode
 * from the stray dentry back under its original dentry, fixes up subtree
 * bookkeeping, then retires the slave request and the rollback record.
 *
 * @param mdr      slave request being rolled back
 * @param reqid    id of the master request whose effects are being undone
 * @param dn       original dentry the inode is restored to
 * @param straydn  stray dentry the inode had been moved to
 *
 * NOTE(review): the extraction of this file dropped several physical lines
 * here (braces and at least one guard around request_finish); the visible
 * statements are preserved verbatim below.
 */
void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
  dout(10) << "_rmdir_rollback_finish " << reqid << dendl;

  // detach the inode from the stray dentry and pop the projected linkages
  // pushed by the rollback prepare step.
  straydn->get_dir()->unlink_inode(straydn);
  dn->pop_projected_linkage();
  straydn->pop_projected_linkage();

  CInode *in = dn->get_linkage()->get_inode();
  // fix up the subtree map now that 'in' has moved back out of straydn's dir.
  mdcache->adjust_subtree_after_rename(in, straydn->get_dir(), true);
  if (mds->is_resolve()) {
    // during resolve we may now be able to trim the non-auth subtree.
    CDir *root = mdcache->get_subtree_root(straydn->get_dir());
    mdcache->try_trim_non_auth_subtree(root);

  mdcache->request_finish(mdr);

  // tell the cache this rollback is complete so resolve can proceed.
  mdcache->finish_rollback(reqid);
6385 /** _dir_is_nonempty[_unlocked]
6387 * check if a directory is non-empty (i.e. whether we can rmdir it).
6389 * the unlocked variant is a fast-path check. we can't really be
6390 * sure until we rdlock the filelock.
/**
 * _dir_is_nonempty_unlocked -- fast-path emptiness probe, no filelock held.
 *
 * Returns true when 'in' is known non-empty (so rmdir / rename-over must
 * fail with -ENOTEMPTY).  This is only a hint: the authoritative check is
 * _dir_is_nonempty(), which requires the filelock rdlocked.
 *
 * NOTE(review): extraction dropped some physical lines here (the 'ls'
 * declaration, the per-frag 'dir' binding, and the trailing returns);
 * visible statements preserved verbatim.
 */
bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
  dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
  assert(in->is_auth());

  // a directory with live snapshots is treated as non-empty.
  if (in->snaprealm && in->snaprealm->srnode.snaps.size())
    return true; // in a snapshot!

  in->get_dirfrags(ls);
  for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
    // is the frag obviously non-empty?
    if (dir->is_auth()) {
      if (dir->get_projected_fnode()->fragstat.size()) {
	dout(10) << "dir_is_nonempty_unlocked dirstat has "
		 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
/**
 * _dir_is_nonempty -- authoritative directory emptiness check.
 *
 * Requires that the requesting client can rdlock in->filelock (asserted),
 * so the per-frag fragstats are stable.  Sums fragstats across all
 * dirfrags and compares against the inode's projected dirstat; any
 * nonzero per-frag count or final mismatch means "non-empty".
 *
 * NOTE(review): extraction dropped some physical lines here (the 'ls'
 * declaration, the per-frag 'dir' binding, some returns/else branches);
 * visible statements preserved verbatim.
 */
bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
  dout(10) << "dir_is_nonempty " << *in << dendl;
  assert(in->is_auth());
  assert(in->filelock.can_read(mdr->get_client()));

  frag_info_t dirstat;
  version_t dirstat_version = in->get_projected_inode()->dirstat.version;

  in->get_dirfrags(ls);
  for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
    const fnode_t *pf = dir->get_projected_fnode();
    if (pf->fragstat.size()) {
      dout(10) << "dir_is_nonempty dirstat has "
	       << pf->fragstat.size() << " items " << *dir << dendl;

    // accounted stats are only comparable if they match the inode's
    // dirstat version; otherwise fall back to the raw fragstat.
    if (pf->accounted_fragstat.version == dirstat_version)
      dirstat.add(pf->accounted_fragstat);
      dirstat.add(pf->fragstat);

  // any mismatch vs the inode's dirstat means we cannot prove emptiness.
  return dirstat.size() != in->get_projected_inode()->dirstat.size();
6447 // ======================================================
/**
 * C_MDS_rename_finish -- log-commit continuation for a client rename.
 *
 * Queued by handle_client_rename() via journal_and_reply(); once the
 * rename EUpdate is safely journaled, finish() applies the rename via
 * Server::_rename_finish() and replies to the client.
 *
 * NOTE(review): the member declarations (srcdn/destdn/straydn) and the
 * access specifiers were lost in extraction; visible tokens preserved
 * verbatim.
 */
class C_MDS_rename_finish : public ServerLogContext {
  C_MDS_rename_finish(Server *s, MDRequestRef& r,
		      CDentry *sdn, CDentry *ddn, CDentry *stdn) :
    ServerLogContext(s, r),
    srcdn(sdn), destdn(ddn), straydn(stdn) { }
  void finish(int r) override {
    server->_rename_finish(mdr, srcdn, destdn, straydn);
6466 /** handle_client_rename
6468 * rename master is the destdn auth. this is because cached inodes
6469 * must remain connected. thus, any replica of srci, must also
6470 * replicate destdn, and possibly straydn, so that srci (and
6471 * destdn->inode) remain connected during the rename.
6473 * to do this, we freeze srci, then master (destdn auth) verifies that
6474 * all other nodes have also replicated destdn and straydn. note that
6475 * destdn replicas need not also replicate srci. this only works when
6478 * This function takes responsibility for the passed mdr.
/**
 * handle_client_rename -- entry point for a client rename request.
 *
 * This MDS (destdn auth) acts as rename master; see the block comment
 * above this function.  Overall flow:
 *  1. resolve dest and src paths; basic EINVAL/EROFS/ENOTEMPTY screening;
 *  2. sanity checks: common ancestor, src != dest, no rename-into-self,
 *     stray migration/reintegration rules;
 *  3. prepare a stray dentry if a primary target will be overwritten;
 *  4. build the witness list and the full rdlock/wrlock/xlock (and
 *     remote wrlock) set, then acquire all locks;
 *  5. access / fragment-space checks and the authoritative emptiness check;
 *  6. prepare all witness MDSs (srcdn auth last);
 *  7. journal an EUpdate and reply via C_MDS_rename_finish.
 *
 * This function takes responsibility for the passed mdr.
 *
 * NOTE(review): many physical lines (braces, 'return' statements, 'else'
 * keywords, and some declarations, e.g. of 'oldin' and 'frags') were lost
 * in extraction; the visible tokens are preserved verbatim.
 */
void Server::handle_client_rename(MDRequestRef& mdr)
  MClientRequest *req = mdr->client_request;
  dout(7) << "handle_client_rename " << *req << dendl;

  filepath destpath = req->get_filepath();
  filepath srcpath = req->get_filepath2();
  if (destpath.depth() == 0 || srcpath.depth() == 0) {
    respond_to_request(mdr, -EINVAL);
  const string &destname = destpath.last_dentry();

  // dentry traces filled in by path traversal: dn[0] = dest, dn[1] = src.
  vector<CDentry*>& srctrace = mdr->dn[1];
  vector<CDentry*>& desttrace = mdr->dn[0];

  set<SimpleLock*> rdlocks, wrlocks, xlocks;

  CDentry *destdn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, true, false, true);
  if (!destdn) return;
  dout(10) << " destdn " << *destdn << dendl;
  if (mdr->snapid != CEPH_NOSNAP) {
    respond_to_request(mdr, -EROFS);
  CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
  CDir *destdir = destdn->get_dir();
  assert(destdir->is_auth());

  // discover the source dentry; ESTALE triggers an ino-peer search.
  int r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &srctrace, NULL, MDS_TRAVERSE_DISCOVER);
    dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
    mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
    dout(10) << "FAIL on error " << r << dendl;
    respond_to_request(mdr, r);

  assert(!srctrace.empty());
  CDentry *srcdn = srctrace[srctrace.size()-1];
  dout(10) << " srcdn " << *srcdn << dendl;
  if (srcdn->last != CEPH_NOSNAP) {
    respond_to_request(mdr, -EROFS);
  CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
  CInode *srci = srcdnl->get_inode();
  dout(10) << " srci " << *srci << dendl;

  if (!destdnl->is_null()) {
    //dout(10) << "dest dn exists " << *destdn << dendl;
    oldin = mdcache->get_dentry_inode(destdn, mdr, true);
    dout(10) << " oldin " << *oldin << dendl;

    // non-empty dir? do trivial fast unlocked check, do another check later with read locks
    if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
      respond_to_request(mdr, -ENOTEMPTY);

    // if srcdn is replica, need to make sure its linkage is correct
    if (srcdn->is_auth() ||
	srcdn->lock.can_read(mdr->get_client()) ||
	(srcdn->lock.is_xlocked() && srcdn->lock.get_xlock_by() == mdr)) {
      // mv /some/thing /to/some/existing_other_thing
      if (oldin->is_dir() && !srci->is_dir()) {
	respond_to_request(mdr, -EISDIR);
      if (!oldin->is_dir() && srci->is_dir()) {
	respond_to_request(mdr, -ENOTDIR);
      if (srci == oldin && !srcdn->get_dir()->inode->is_stray()) {
	respond_to_request(mdr, 0);  // no-op.  POSIX makes no sense.

  // -- some sanity checks --

  // src+dest traces _must_ share a common ancestor for locking to prevent orphans
  if (destpath.get_ino() != srcpath.get_ino() &&
      !(req->get_source().is_mds() &&
	MDS_INO_IS_MDSDIR(srcpath.get_ino()))) {  // <-- mds 'rename' out of stray dir is ok!
    CInode *srcbase = srctrace[0]->get_dir()->get_inode();
    CInode *destbase = desttrace[0]->get_dir()->get_inode();
    // ok, extend srctrace toward root until it is an ancestor of desttrace.
    while (srcbase != destbase &&
	   !srcbase->is_projected_ancestor_of(destbase)) {
      CDentry *pdn = srcbase->get_projected_parent_dn();
      srctrace.insert(srctrace.begin(), pdn);
      dout(10) << "rename prepending srctrace with " << *pdn << dendl;
      srcbase = pdn->get_dir()->get_inode();

    // then, extend destpath until it shares the same parent inode as srcpath.
    while (destbase != srcbase) {
      CDentry *pdn = destbase->get_projected_parent_dn();
      desttrace.insert(desttrace.begin(), pdn);
      rdlocks.insert(&pdn->lock);
      dout(10) << "rename prepending desttrace with " << *pdn << dendl;
      destbase = pdn->get_dir()->get_inode();
    dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;

  // renaming a dentry onto itself in the same dir is a no-op.
  if (srcdn->get_dir() == destdir && srcdn->name == destname) {
    dout(7) << "rename src=dest, noop" << dendl;
    respond_to_request(mdr, 0);

  // dest a child of src?
  // e.g. mv /usr /usr/foo
  CDentry *pdn = destdir->inode->get_projected_parent_dn();
    dout(7) << "cannot rename item to be a child of itself" << dendl;
    respond_to_request(mdr, -EINVAL);
    pdn = pdn->get_dir()->inode->parent;

  // is this a stray migration, reintegration or merge? (sanity checks!)
  if (mdr->reqid.name.is_mds() &&
      !(MDS_INO_IS_MDSDIR(srcpath.get_ino()) &&
	MDS_INO_IS_MDSDIR(destpath.get_ino())) &&
      !(destdnl->is_remote() &&
	destdnl->get_remote_ino() == srci->ino())) {
    respond_to_request(mdr, -EINVAL);  // actually, this won't reply, but whatev.

  // a link merge: src and dest already point at the same inode.
  bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
		    (srcdnl->is_primary() || destdnl->is_primary()));
    dout(10) << " this is a link merge" << dendl;

  // -- create stray dentry? --
  CDentry *straydn = NULL;
  if (destdnl->is_primary() && !linkmerge) {
    straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
    dout(10) << " straydn is " << *straydn << dendl;
  } else if (mdr->straydn) {
    mdr->unpin(mdr->straydn);
    mdr->straydn = NULL;

  // -- prepare witness list --
  /*
   * NOTE: we use _all_ replicas as witnesses.
   * this probably isn't totally necessary (esp for file renames),
   * but if/when we change that, we have to make sure rejoin is
   * sufficiently robust to handle strong rejoins from survivors
   * with totally wrong dentry->inode linkage.
   * (currently, it can ignore rename effects, because the resolve
   * stage will sort them out.)
   */
  set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
  if (srcdn->is_auth())
    srcdn->list_replicas(witnesses);
    witnesses.insert(srcdn->authority().first);
  if (srcdnl->is_remote() && !srci->is_auth())
    witnesses.insert(srci->authority().first);
  destdn->list_replicas(witnesses);
  if (destdnl->is_remote() && !oldin->is_auth())
    witnesses.insert(oldin->authority().first);
  dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;

  // scatterlocks that must be wrlocked on the (remote) src dir auth.
  map<SimpleLock*, mds_rank_t> remote_wrlocks;

  // srctrace items.  this mirrors locks taken in rdlock_path_xlock_dentry
  for (int i=0; i<(int)srctrace.size(); i++)
    rdlocks.insert(&srctrace[i]->lock);
  xlocks.insert(&srcdn->lock);
  mds_rank_t srcdirauth = srcdn->get_dir()->authority().first;
  if (srcdirauth != mds->get_nodeid()) {
    dout(10) << " will remote_wrlock srcdir scatterlocks on mds." << srcdirauth << dendl;
    remote_wrlocks[&srcdn->get_dir()->inode->filelock] = srcdirauth;
    remote_wrlocks[&srcdn->get_dir()->inode->nestlock] = srcdirauth;
      rdlocks.insert(&srci->dirfragtreelock);
    wrlocks.insert(&srcdn->get_dir()->inode->filelock);
    wrlocks.insert(&srcdn->get_dir()->inode->nestlock);
  mds->locker->include_snap_rdlocks(rdlocks, srcdn->get_dir()->inode);

    wrlocks.insert(&straydn->get_dir()->inode->filelock);
    wrlocks.insert(&straydn->get_dir()->inode->nestlock);
    xlocks.insert(&straydn->lock);

  // xlock versionlock on dentries if there are witnesses.
  //  replicas can't see projected dentry linkages, and will get
  //  confused if we try to pipeline things.
  if (!witnesses.empty()) {
    // take xlock on all projected ancestor dentries for srcdn and destdn.
    // this ensures the srcdn and destdn can be traversed to by the witnesses.
    for (int i= 0; i<(int)srctrace.size(); i++) {
      if (srctrace[i]->is_auth() && srctrace[i]->is_projected())
	xlocks.insert(&srctrace[i]->versionlock);
    for (int i=0; i<(int)desttrace.size(); i++) {
      if (desttrace[i]->is_auth() && desttrace[i]->is_projected())
	xlocks.insert(&desttrace[i]->versionlock);
    // xlock srci and oldin's primary dentries, so witnesses can call
    // open_remote_ino() with 'want_locked=true' when the srcdn or destdn
    if (srcdnl->is_remote())
      xlocks.insert(&srci->get_projected_parent_dn()->lock);
    if (destdnl->is_remote())
      xlocks.insert(&oldin->get_projected_parent_dn()->lock);

  // we need to update srci's ctime.  xlock its least contended lock to do that...
  xlocks.insert(&srci->linklock);

  // xlock oldin (for nlink--)
    xlocks.insert(&oldin->linklock);
    if (oldin->is_dir())
      rdlocks.insert(&oldin->filelock);
  if (srcdnl->is_primary() && srci->is_dir())
    // FIXME: this should happen whenever we are renaming between
    // realms, regardless of the file type
    // FIXME: If/when this changes, make sure to update the
    // "allowance" in handle_slave_rename_prep
    xlocks.insert(&srci->snaplock);  // FIXME: an auth bcast could be sufficient?
    rdlocks.insert(&srci->snaplock);

  // if srcdn is not ours but its linkage is primary, its auth must freeze it.
  CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : NULL;
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks,
				  &remote_wrlocks, auth_pin_freeze))

  // only check access on the first pass (before any witness has prepared).
  if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
    if (!check_access(mdr, srcdn->get_dir()->get_inode(), MAY_WRITE))
    if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
    if (!check_fragment_space(mdr, destdn->get_dir()))
    if (!check_access(mdr, srci, MAY_WRITE))

  // with read lock, really verify oldin is empty
      _dir_is_nonempty(mdr, oldin)) {
    respond_to_request(mdr, -ENOTEMPTY);

  /* project_past_snaprealm_parent() will do this job
  // moving between snaprealms?
  if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
    SnapRealm *srcrealm = srci->find_snaprealm();
    SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
    if (srcrealm != destrealm &&
	(srcrealm->get_newest_seq() + 1 > srcdn->first ||
	 destrealm->get_newest_seq() + 1 > srcdn->first)) {
      dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
      mdcache->snaprealm_create(mdr, srci);
  */

  assert(g_conf->mds_kill_rename_at != 1);

  // -- open all srcdn inode frags, if any --
  // we need these open so that auth can properly delegate from inode to dirfrags
  // after the inode is _ours_.
  if (srcdnl->is_primary() &&
      !srcdn->is_auth() &&
    dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
    mdr->set_stickydirs(srci);

    srci->dirfragtree.get_leaves(frags);
    for (list<frag_t>::iterator p = frags.begin();
      CDir *dir = srci->get_dirfrag(*p);
	dout(10) << " opening " << *p << " under " << *srci << dendl;
	mdcache->open_remote_dirfrag(srci, *p, new C_MDS_RetryRequest(mdcache, mdr));

  // -- prepare witnesses --

  // do srcdn auth last
  mds_rank_t last = MDS_RANK_NONE;
  if (!srcdn->is_auth()) {
    last = srcdn->authority().first;
    mdr->more()->srcdn_auth_mds = last;
    // ask auth of srci to mark srci as ambiguous auth if more than two MDS
    // are involved in the rename operation.
    if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
      dout(10) << " preparing ambiguous auth for srci" << dendl;
      assert(mdr->more()->is_remote_frozen_authpin);
      assert(mdr->more()->rename_inode == srci);
      _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);

  for (set<mds_rank_t>::iterator p = witnesses.begin();
       p != witnesses.end();
    if (*p == last) continue;  // do it last!
    if (mdr->more()->witnessed.count(*p)) {
      dout(10) << " already witnessed by mds." << *p << dendl;
    } else if (mdr->more()->waiting_on_slave.count(*p)) {
      dout(10) << " already waiting on witness mds." << *p << dendl;
      if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
  if (!mdr->more()->waiting_on_slave.empty())
    return;  // we're waiting for a witness.

  if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
    dout(10) << " preparing last witness (srcdn auth)" << dendl;
    assert(mdr->more()->waiting_on_slave.count(last) == 0);
    _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);

  // test hack: bail after slave does prepare, so we can verify it's _live_ rollback.
  if (!mdr->more()->slaves.empty() && !srci->is_dir())
    assert(g_conf->mds_kill_rename_at != 3);
  if (!mdr->more()->slaves.empty() && srci->is_dir())
    assert(g_conf->mds_kill_rename_at != 4);

  // -- declare now --
  mdr->set_mds_stamp(ceph_clock_now());

  // -- prepare journal entry --
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "rename");
  mdlog->start_entry(le);
  le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
  if (!mdr->more()->witnessed.empty()) {
    dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
    le->reqid = mdr->reqid;
    le->had_slaves = true;
    mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
    // no need to send frozen auth pin to recovering auth MDS of srci
    mdr->more()->is_remote_frozen_authpin = false;

  _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn);
  if (le->client_map.length())
    le->cmapv = mds->sessionmap.get_projected();

  // -- commit locally --
  C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);

  journal_and_reply(mdr, srci, destdn, le, fin);
/**
 * _rename_finish -- called once the rename EUpdate is safely journaled.
 *
 * Applies the projected rename to the cache, notifies replicas of the new
 * dentry link, hits the configured kill-points, updates balancer stats,
 * replies to the client, and (per the visible eval call) kicks cap/lock
 * state on the renamed inode.
 *
 * NOTE(review): extraction dropped braces and a few statements here;
 * visible tokens preserved verbatim.
 */
void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
  dout(10) << "_rename_finish " << *mdr << dendl;

  if (!mdr->more()->witnessed.empty())
    mdcache->logged_master_update(mdr->reqid);

  // apply the projected changes to the cache
  _rename_apply(mdr, srcdn, destdn, straydn);

  mdcache->send_dentry_link(destdn, mdr);

  CDentry::linkage_t *destdnl = destdn->get_linkage();
  CInode *in = destdnl->get_inode();
  // true iff caps for 'in' were imported from another MDS during this rename.
  bool need_eval = mdr->more()->cap_imports.count(in);

  // test hack: test slave commit
  if (!mdr->more()->slaves.empty() && !in->is_dir())
    assert(g_conf->mds_kill_rename_at != 5);
  if (!mdr->more()->slaves.empty() && in->is_dir())
    assert(g_conf->mds_kill_rename_at != 6);

  // the rename dirtied the source dir (and, for a remote link, the inode).
  mds->balancer->hit_dir(mdr->get_mds_stamp(), srcdn->get_dir(), META_POP_IWR);
  if (destdnl->is_remote() && in->is_auth())
    mds->balancer->hit_inode(mdr->get_mds_stamp(), in, META_POP_IWR);

  // did we import srci? if so, explicitly ack that import, before we unlock and reply.
    assert(g_conf->mds_kill_rename_at != 7);

  respond_to_request(mdr, 0);

    mds->locker->eval(in, CEPH_CAP_LOCKS, true);

  // respond_to_request() drops locks. So stray reintegration can race with us.
  if (straydn && !straydn->get_projected_linkage()->is_null()) {
    mdcache->notify_stray(straydn);
/**
 * _rename_prepare_witness -- send OP_RENAMEPREP to one witness MDS.
 *
 * If the cluster is degraded and 'who' is not yet usable, arranges a
 * retry (wait_for_active_peer) instead.  Otherwise builds the slave
 * request carrying the src/dest paths, a replica of the stray dentry, the
 * srcdn auth rank and the current witness set, sends it, and records
 * 'who' in waiting_on_slave.
 *
 * @param who       rank of the witness MDS to prepare
 * @param witnesse  full witness set (sic: parameter name as in original);
 *                  srcdn auth will verify this set is sufficient
 *
 * NOTE(review): extraction dropped some physical lines here (braces and
 * at least one return); visible tokens preserved verbatim.
 */
bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesse,
				     vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
  if (mds->is_cluster_degraded() &&
      !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
    dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
    if (mdr->more()->waiting_on_slave.empty())
      mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));

  dout(10) << "_rename_prepare_witness mds." << who << dendl;
  MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
					       MMDSSlaveRequest::OP_RENAMEPREP);

  // encode src and dest paths relative to their trace root dirs.
  req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
  for (auto dn : srctrace)
    req->srcdnpath.push_dentry(dn->name);
  req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
  for (auto dn : dsttrace)
    req->destdnpath.push_dentry(dn->name);
    mdcache->replicate_stray(straydn, who, req->stray);

  req->srcdn_auth = mdr->more()->srcdn_auth_mds;

  // srcdn auth will verify our current witness list is sufficient
  req->witnesses = witnesse;

  req->op_stamp = mdr->get_op_stamp();
  mds->send_message_mds(req, who);

  assert(mdr->more()->waiting_on_slave.count(who) == 0);
  mdr->more()->waiting_on_slave.insert(who);
/**
 * _rename_prepare_import -- decode inode state bundled by srcdn's auth.
 *
 * Used when this rank is importing srci as part of the rename: decodes
 * the exported client map (re-encoding it into client_map_bl for the
 * journal), force-opens those sessions, and decodes the inode/cap state
 * via the migrator.  The imported inode is then temporarily forced back
 * to !auth and clean; the real auth flip happens later at commit.
 *
 * @return the pre-import projected version of the inode (oldpv)
 *         NOTE(review): the trailing 'return oldpv;' and closing brace
 *         were lost in extraction; visible tokens preserved verbatim.
 */
version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
  version_t oldpv = mdr->more()->inode_import_v;

  CDentry::linkage_t *srcdnl = srcdn->get_linkage();

  bufferlist::iterator blp = mdr->more()->inode_import.begin();

  // client map travels with the inode; re-encode it for our journal entry.
  ::decode(mdr->more()->imported_client_map, blp);
  ::encode(mdr->more()->imported_client_map, *client_map_bl,
	   mds->mdsmap->get_up_features());
  prepare_force_open_sessions(mdr->more()->imported_client_map, mdr->more()->sseq_map);

  list<ScatterLock*> updated_scatterlocks;
  mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
					 mdr->more()->cap_imports, updated_scatterlocks);

  // hack: force back to !auth and clean, temporarily
  srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
  srcdnl->get_inode()->mark_clean();
/**
 * _need_force_journal -- must a non-auth dentry change still be journaled?
 *
 * Returns true when 'diri' has (or contains, beneath its dirfrags) auth
 * subtree roots on this rank; journal replay then needs the renamed
 * dentry journaled so those subtree bounds can be reconstructed.
 *
 * @param diri   directory inode being renamed / overwritten
 * @param empty  not referenced in the visible code -- presumably a hint
 *               about whether diri is expected empty; TODO confirm upstream
 *
 * NOTE(review): extraction dropped braces, 'else' lines and the per-frag
 * 'dir' binding; visible tokens preserved verbatim.
 */
bool Server::_need_force_journal(CInode *diri, bool empty)
  diri->get_dirfrags(ls);

  bool force_journal = false;

  // first: is any dirfrag of diri itself an auth subtree root here?
  for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
    if ((*p)->is_subtree_root() && (*p)->get_dir_auth().first == mds->get_nodeid()) {
      dout(10) << " frag " << (*p)->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
      force_journal = true;
      dout(20) << " frag " << (*p)->get_frag() << " is not auth subtree dirfrag" << dendl;

  // see if any children of our frags are auth subtrees.
  list<CDir*> subtrees;
  mdcache->list_subtrees(subtrees);
  dout(10) << " subtrees " << subtrees << " frags " << ls << dendl;
  for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
    for (list<CDir*>::iterator q = subtrees.begin(); q != subtrees.end(); ++q) {
      if (dir->contains(*q)) {
	if ((*q)->get_dir_auth().first == mds->get_nodeid()) {
	  dout(10) << " frag " << (*p)->get_frag() << " contains (maybe) auth subtree, will force journal "
	  force_journal = true;
	  dout(20) << " frag " << (*p)->get_frag() << " contains but isn't auth for " << **q << dendl;
	dout(20) << " frag " << (*p)->get_frag() << " does not contain " << **q << dendl;

  return force_journal;
/**
 * _rename_prepare -- project the rename and build its journal metablob.
 *
 * Projects dentry linkages and inodes for every case visible below
 * (remote vs primary source link, overwrite-to-stray, link merge,
 * cross-MDS inode import), runs predirty_journal_parents() for
 * rstat/mtime propagation, and records everything in 'metablob'.
 *
 * @param metablob       journal blob to populate
 * @param client_map_bl  filled with the imported client map when importing
 * @param srcdn/destdn   source and destination dentries
 * @param straydn        stray dentry for an overwritten primary link,
 *                       else NULL
 *
 * NOTE(review): extraction lost many brace/else/declaration lines in this
 * function (e.g. the 'oldpv' and 'ls' declarations, several closing
 * braces); visible tokens preserved verbatim.
 */
void Server::_rename_prepare(MDRequestRef& mdr,
			     EMetaBlob *metablob, bufferlist *client_map_bl,
			     CDentry *srcdn, CDentry *destdn, CDentry *straydn)
  dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
    dout(10) << " straydn " << *straydn << dendl;

  CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
  CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
  CInode *srci = srcdnl->get_inode();
  CInode *oldin = destdnl->get_inode();

  // primary+remote link merge?
  bool linkmerge = (srci == destdnl->get_inode() &&
		    (srcdnl->is_primary() || destdnl->is_primary()));
  // renames out of a stray dir are "silent": no nlink/mtime updates.
  bool silent = srcdn->get_dir()->inode->is_stray();

  bool force_journal_dest = false;
  if (srci->is_dir() && !destdn->is_auth()) {
    if (srci->is_auth()) {
      // if we are auth for srci and exporting it, force journal because journal replay needs
      // the source inode to create auth subtrees.
      dout(10) << " we are exporting srci, will force journal destdn" << dendl;
      force_journal_dest = true;
      force_journal_dest = _need_force_journal(srci, false);

  bool force_journal_stray = false;
  if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
    force_journal_stray = _need_force_journal(oldin, true);

    dout(10) << " merging remote and primary links to the same inode" << dendl;
    dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
  if (force_journal_dest)
    dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
  if (force_journal_stray)
    dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;

  if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
    dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
    metablob->renamed_dirino = srci->ino();
  } else if (oldin && oldin->is_dir() && force_journal_stray) {
    dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
    metablob->renamed_dirino = oldin->ino();

  inode_t *pi = 0;   // renamed inode
  inode_t *tpi = 0;  // target/overwritten inode

  // -- project the target (overwritten) inode --
  if (destdnl->is_primary()) {
      assert(straydn);  // moving to straydn.
      // link--, and move.
      if (destdn->is_auth()) {
	tpi = oldin->project_inode(); //project_snaprealm
	tpi->version = straydn->pre_dirty(tpi->version);
	tpi->update_backtrace();
      straydn->push_projected_linkage(oldin);
    } else if (destdnl->is_remote()) {
      // overwritten remote link: just nlink-- on the target inode.
      if (oldin->is_auth()) {
	tpi = oldin->project_inode();
	tpi->version = oldin->pre_dirty();

  // -- project the destination dentry/inode --
  if (srcdnl->is_remote()) {
      if (destdn->is_auth())
	mdr->more()->pvmap[destdn] = destdn->pre_dirty();
      destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
      if (srci->is_auth()) {
	pi = srci->project_inode();
	pi->version = srci->pre_dirty();
      dout(10) << " will merge remote onto primary link" << dendl;
      if (destdn->is_auth()) {
	pi = oldin->project_inode();
	pi->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->inode.version);
    if (destdn->is_auth()) {
      // pick the pre-dirty base version; importing decodes it from srcdn's auth.
      if (srcdn->is_auth())
	oldpv = srci->get_projected_version();
	oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);

      // note which dirfrags have child subtrees in the journal
      // event, so that we can open those (as bounds) during replay.
      if (srci->is_dir()) {
	srci->get_dirfrags(ls);
	for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
	  if (!dir->is_auth())
	    metablob->renamed_dir_frags.push_back(dir->get_frag());
	dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;

      pi = srci->project_inode(); // project snaprealm if srcdnl->is_primary
                                  // & srcdnl->snaprealm
      pi->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
      pi->update_backtrace();
    destdn->push_projected_linkage(srci);

  // -- project the source dentry (becomes null) --
    if (srcdn->is_auth())
      mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
    srcdn->push_projected_linkage();  // push null linkage

    pi->ctime = mdr->get_op_stamp();

      tpi->ctime = mdr->get_op_stamp();
      destdn->make_path_string(tpi->stray_prior_path, true);
      if (tpi->nlink == 0)
	oldin->state_set(CInode::STATE_ORPHAN);

  // prepare nesting, mtime updates
  int predirty_dir = silent ? 0:PREDIRTY_DIR;

  // guarantee stray dir is processed first during journal replay. unlink the old inode,
  // then link the source inode to destdn
  if (destdnl->is_primary()) {
    if (straydn->is_auth()) {
      metablob->add_dir_context(straydn->get_dir());
      metablob->add_dir(straydn->get_dir(), true);

  // subtract the overwritten target from its dir's stats.
  if (destdn->is_auth() && !destdnl->is_null()) {
    mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
				      (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
    if (destdnl->is_primary()) {
      mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
					PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);

  // account srci leaving srcdir and entering destdir.
  int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
  int flags = predirty_dir | predirty_primary;
  if (srcdn->is_auth())
    mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
  if (destdn->is_auth())
    mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);

  SnapRealm *src_realm = srci->find_snaprealm();
  SnapRealm *dest_realm = destdn->get_dir()->inode->find_snaprealm();
  snapid_t next_dest_snap = dest_realm->get_newest_seq() + 1;

  // add it all to the metablob
  // overwritten target first:
  if (destdnl->is_primary()) {
    if (destdn->is_auth()) {
      // project snaprealm, too
      if (oldin->snaprealm || dest_realm->get_newest_seq() + 1 > oldin->get_oldest_snap())
	oldin->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
      straydn->first = MAX(oldin->first, next_dest_snap);
      metablob->add_primary_dentry(straydn, oldin, true, true);
    } else if (force_journal_stray) {
      dout(10) << " forced journaling straydn " << *straydn << dendl;
      metablob->add_dir_context(straydn->get_dir());
      metablob->add_primary_dentry(straydn, oldin, true);
  } else if (destdnl->is_remote()) {
    if (oldin->is_auth()) {
      // journal the target's primary dentry to capture the nlink change.
      metablob->add_dir_context(oldin->get_projected_parent_dir());
      mdcache->journal_cow_dentry(mdr.get(), metablob, oldin->get_projected_parent_dn(),
				  CEPH_NOSNAP, 0, destdnl);
      metablob->add_primary_dentry(oldin->get_projected_parent_dn(), oldin, true);

  // then the destination dentry:
  if (srcdnl->is_remote()) {
      if (destdn->is_auth() && !destdnl->is_null())
	mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
	destdn->first = MAX(destdn->first, next_dest_snap);

      if (destdn->is_auth())
	metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
      if (srci->get_projected_parent_dn()->is_auth()) { // it's remote
	metablob->add_dir_context(srci->get_projected_parent_dir());
	mdcache->journal_cow_dentry(mdr.get(), metablob, srci->get_projected_parent_dn(), CEPH_NOSNAP, 0, srcdnl);
	metablob->add_primary_dentry(srci->get_projected_parent_dn(), srci, true);
      if (destdn->is_auth() && !destdnl->is_null())
	mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
	destdn->first = MAX(destdn->first, next_dest_snap);

      if (destdn->is_auth())
	metablob->add_primary_dentry(destdn, destdnl->get_inode(), true, true);
  } else if (srcdnl->is_primary()) {
    // project snap parent update?
    if (destdn->is_auth() && src_realm != dest_realm &&
	(srci->snaprealm || src_realm->get_newest_seq() + 1 > srci->get_oldest_snap()))
      srci->project_past_snaprealm_parent(dest_realm);

    if (destdn->is_auth() && !destdnl->is_null())
      mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
    destdn->first = MAX(destdn->first, next_dest_snap);

    if (destdn->is_auth())
      metablob->add_primary_dentry(destdn, srci, true, true);
    else if (force_journal_dest) {
      dout(10) << " forced journaling destdn " << *destdn << dendl;
      metablob->add_dir_context(destdn->get_dir());
      metablob->add_primary_dentry(destdn, srci, true);
      if (srcdn->is_auth() && srci->is_dir()) {
	// journal new subtrees root dirfrags
	srci->get_dirfrags(ls);
	for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
	    metablob->add_dir(dir, true);

  // finally the (now null) source dentry:
  if (srcdn->is_auth()) {
    dout(10) << " journaling srcdn " << *srcdn << dendl;
    mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
    // also journal the inode in case we need do slave rename rollback. It is Ok to add
    // both primary and NULL dentries. Because during journal replay, null dentry is
    // processed after primary dentry.
    if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
      metablob->add_primary_dentry(srcdn, srci, true);
    metablob->add_null_dentry(srcdn, true);
    dout(10) << " NOT journaling srcdn " << *srcdn << dendl;

  // make renamed inode first track the dn
  if (srcdnl->is_primary() && destdn->is_auth())
    srci->first = destdn->first;

  // pre-register the pending subtree moves for the target and the source.
  if (oldin && oldin->is_dir()) {
    mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
  mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
// Apply a prepared rename to the in-memory metadata cache: pop the projected
// linkages/inodes that _rename_prepare set up, move the renamed inode from
// srcdn to destdn (stray-ing or link-merging any previous dest target),
// finish any cross-MDS inode import (caps, xlocks, auth bit), and adjust the
// subtree map for renamed directories.
// NOTE(review): reformatted from a garbled extraction; a few lines dropped by
// the extraction (braces, declarations, `mdr->apply()`) were restored to match
// upstream Ceph — verify against the original file.
void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
  dout(10) << " pvs " << mdr->more()->pvmap << dendl;

  CDentry::linkage_t *srcdnl = srcdn->get_linkage();
  CDentry::linkage_t *destdnl = destdn->get_linkage();

  CInode *oldin = destdnl->get_inode();

  // primary+remote link merge?
  bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
		    (srcdnl->is_primary() || destdnl->is_primary()));

  // target inode: detach the old dest target (to a stray dir if it was
  // primary) before relinking the source inode at dest.
  if (!linkmerge) {
    if (destdnl->is_primary()) {
      assert(straydn);
      dout(10) << "straydn is " << *straydn << dendl;
      destdn->get_dir()->unlink_inode(destdn, false);

      straydn->pop_projected_linkage();
      if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
	assert(!straydn->is_projected()); // no other projected

      mdcache->touch_dentry_bottom(straydn);  // drop dn as quickly as possible.

      // nlink-- targeti
      if (destdn->is_auth()) {
	bool hadrealm = (oldin->snaprealm ? true : false);
	oldin->pop_and_dirty_projected_inode(mdr->ls);
	if (oldin->snaprealm && !hadrealm)
	  mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT);
      } else {
	// FIXME this snaprealm is not filled out correctly
	//oldin->open_snaprealm();  might be sufficient..
      }
    } else if (destdnl->is_remote()) {
      destdn->get_dir()->unlink_inode(destdn, false);
      if (oldin->is_auth())
	oldin->pop_and_dirty_projected_inode(mdr->ls);
    }
  }

  // unlink src before we relink it at dest
  CInode *in = srcdnl->get_inode();

  bool srcdn_was_remote = srcdnl->is_remote();
  srcdn->get_dir()->unlink_inode(srcdn);

  // dest
  if (srcdn_was_remote) {
    if (!linkmerge) {
      // destdn
      destdnl = destdn->pop_projected_linkage();
      if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
	assert(!destdn->is_projected()); // no other projected

      destdn->link_remote(destdnl, in);
      if (destdn->is_auth())
	destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
      // in
      in->pop_and_dirty_projected_inode(mdr->ls);
    } else {
      dout(10) << "merging remote onto primary link" << dendl;
      oldin->pop_and_dirty_projected_inode(mdr->ls);
    }
  } else { // primary
    if (linkmerge) {
      dout(10) << "merging primary onto remote link" << dendl;
      destdn->get_dir()->unlink_inode(destdn, false);
    }
    destdnl = destdn->pop_projected_linkage();
    if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
      assert(!destdn->is_projected()); // no other projected

    // srcdn inode import?  (we are dest-auth but not src-auth: the srcdn auth
    // MDS exported the inode to us in its RENAMEPREPACK)
    if (!srcdn->is_auth() && destdn->is_auth()) {
      assert(mdr->more()->inode_import.length() > 0);

      map<client_t,Capability::Import> imported_caps;

      // finish cap imports
      finish_force_open_sessions(mdr->more()->imported_client_map, mdr->more()->sseq_map);
      if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
	mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
						    mdr->more()->srcdn_auth_mds, true,
						    mdr->more()->cap_imports[destdnl->get_inode()],
						    imported_caps);
      }

      mdr->more()->inode_import.clear();
      ::encode(imported_caps, mdr->more()->inode_import);

      /* hack: add an auth pin for each xlock we hold. These were
       * remote xlocks previously but now they're local and
       * we're going to try and unpin when we xlock_finish. */
      for (set<SimpleLock *>::iterator i = mdr->xlocks.begin();
	   i != mdr->xlocks.end();
	   ++i) {
	if ((*i)->get_parent() == destdnl->get_inode() &&
	    !(*i)->is_locallock())
	  mds->locker->xlock_import(*i);
      }

      // hack: fix auth bit
      in->state_set(CInode::STATE_AUTH);

      mdr->clear_ambiguous_auth();
    }

    if (destdn->is_auth()) {
      in->pop_and_dirty_projected_inode(mdr->ls);
    } else {
      // FIXME: fix up snaprealm!
    }
  }

  // src
  if (srcdn->is_auth())
    srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
  srcdn->pop_projected_linkage();
  if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
    assert(!srcdn->is_projected()); // no other projected

  // apply remaining projected inodes (nested)
  mdr->apply();

  // update subtree map?
  if (destdnl->is_primary() && in->is_dir())
    mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);

  if (straydn && oldin->is_dir())
    mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);

  // removing a new dn?
  if (srcdn->is_auth())
    srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
}
// Journal-completion context for a slave rename prepare: once the
// ESlaveUpdate::OP_PREPARE entry is durable, continue in
// Server::_logged_slave_rename (apply + send RENAMEPREPACK).
class C_MDS_SlaveRenamePrep : public ServerLogContext {
  CDentry *srcdn, *destdn, *straydn;
public:
  C_MDS_SlaveRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
    ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
  void finish(int r) override {
    server->_logged_slave_rename(mdr, srcdn, destdn, straydn);
  }
};
// Commit/abort context stored in mdr->more()->slave_commit: invoked when the
// master resolves the rename, with r indicating commit (0) or abort.
// NOTE(review): the `MDRequestRef mdr;` member line was dropped by the
// extraction and is restored here to match the constructor's `mdr(m)`.
class C_MDS_SlaveRenameCommit : public ServerContext {
  MDRequestRef mdr;
  CDentry *srcdn, *destdn, *straydn;
public:
  C_MDS_SlaveRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
    ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
  void finish(int r) override {
    server->_commit_slave_rename(mdr, r, srcdn, destdn, straydn);
  }
};
// Gather finisher used while preparing a slave rename: fires after all
// client sessions holding caps on the source inode have been flushed.
// NOTE(review): the `MDRequestRef mdr;` member line was dropped by the
// extraction and is restored here to match the constructor's `mdr(r)`.
class C_MDS_SlaveRenameSessionsFlushed : public ServerContext {
  MDRequestRef mdr;
public:
  C_MDS_SlaveRenameSessionsFlushed(Server *s, MDRequestRef& r) :
    ServerContext(s), mdr(r) {}
  void finish(int r) override {
    server->_slave_rename_sessions_flushed(mdr);
  }
};
/* This function DOES put the mdr->slave_request before returning*/
// Slave-side handler for OP_RENAMEPREP from the master MDS: resolve src/dest
// paths, (if src-auth) freeze the source inode and notify bystander replicas,
// reply early with an expanded witness list if the master's witness set is
// insufficient, record a rename_rollback blob, then journal an
// ESlaveUpdate::OP_PREPARE via _rename_prepare.
// NOTE(review): reformatted from a garbled extraction; lines dropped by the
// extraction (returns, braces, `gather.activate()`, `mdlog->flush()`) were
// restored to match upstream Ceph — verify against the original file.
void Server::handle_slave_rename_prep(MDRequestRef& mdr)
{
  dout(10) << "handle_slave_rename_prep " << *mdr
	   << " " << mdr->slave_request->srcdnpath
	   << " to " << mdr->slave_request->destdnpath
	   << dendl;

  if (mdr->slave_request->is_interrupted()) {
    dout(10) << " slave request interrupted, sending noop reply" << dendl;
    MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
    reply->mark_interrupted();
    mds->send_message_mds(reply, mdr->slave_to_mds);
    mdr->slave_request->put();
    mdr->slave_request = 0;
    return;
  }

  // discover destdn
  filepath destpath(mdr->slave_request->destdnpath);
  dout(10) << " dest " << destpath << dendl;
  vector<CDentry*> trace;
  int r = mdcache->path_traverse(mdr, NULL, NULL, destpath, &trace, NULL, MDS_TRAVERSE_DISCOVERXLOCK);
  if (r > 0) return;
  if (r == -ESTALE) {
    mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
			    mdr->slave_to_mds);
    return;
  }
  assert(r == 0);  // we shouldn't get an error here!

  CDentry *destdn = trace[trace.size()-1];
  CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
  dout(10) << " destdn " << *destdn << dendl;
  mdr->pin(destdn);

  // discover srcdn
  filepath srcpath(mdr->slave_request->srcdnpath);
  dout(10) << " src " << srcpath << dendl;
  CInode *srci = nullptr;
  r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &trace, &srci, MDS_TRAVERSE_DISCOVERXLOCK);
  if (r > 0) return;
  assert(r == 0);

  // srcpath must not point to a null dentry
  assert(srci != nullptr);

  CDentry *srcdn = trace[trace.size()-1];
  CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
  dout(10) << " srcdn " << *srcdn << dendl;
  mdr->pin(srcdn);
  mdr->pin(srci);

  // stray?
  bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
		    (srcdnl->is_primary() || destdnl->is_primary()));
  CDentry *straydn = mdr->straydn;
  if (destdnl->is_primary() && !linkmerge)
    assert(straydn);

  mdr->set_op_stamp(mdr->slave_request->op_stamp);
  mdr->more()->srcdn_auth_mds = srcdn->authority().first;

  // set up commit waiter (early, to clean up any freezing etc we do)
  if (!mdr->more()->slave_commit)
    mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn);

  // am i srcdn auth?
  if (srcdn->is_auth()) {
    set<mds_rank_t> srcdnrep;
    srcdn->list_replicas(srcdnrep);

    bool reply_witness = false;
    if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
      // freeze?
      // we need this to
      //  - avoid conflicting lock state changes
      //  - avoid concurrent updates to the inode
      //     (this could also be accomplished with the versionlock)
      int allowance = 2; // 1 for the mdr auth_pin, 1 for the link lock
      allowance += srcdnl->get_inode()->is_dir(); // for the snap lock
      dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
      bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);

      // unfreeze auth pin after freezing the inode to avoid queueing waiters
      if (srcdnl->get_inode()->is_frozen_auth_pin())
	mdr->unfreeze_auth_pin();

      if (!frozen_inode) {
	srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
	return;
      }

      /*
       * set ambiguous auth for srci
       * NOTE: we don't worry about ambiguous cache expire as we do
       * with subtree migrations because all slaves will pin
       * srcdn->get_inode() for duration of this rename.
       */
      mdr->set_ambiguous_auth(srcdnl->get_inode());

      // just mark the source inode as ambiguous auth if more than two MDS are involved.
      // the master will send another OP_RENAMEPREP slave request later.
      if (mdr->slave_request->witnesses.size() > 1) {
	dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
	reply_witness = true;
      }

      // make sure bystanders have received all lock related messages
      for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
	if (*p == mdr->slave_to_mds ||
	    (mds->is_cluster_degraded() &&
	     !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
	  continue;
	MMDSSlaveRequest *notify = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
							MMDSSlaveRequest::OP_RENAMENOTIFY);
	mds->send_message_mds(notify, *p);
	mdr->more()->waiting_on_slave.insert(*p);
      }

      // make sure clients have received all cap related messages
      set<client_t> export_client_set;
      mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);

      MDSGatherBuilder gather(g_ceph_context);
      flush_client_sessions(export_client_set, gather);
      if (gather.has_subs()) {
	// MDS_RANK_NONE stands in for "session flush" on the wait list
	mdr->more()->waiting_on_slave.insert(MDS_RANK_NONE);
	gather.set_finisher(new C_MDS_SlaveRenameSessionsFlushed(this, mdr));
	gather.activate();
      }
    }

    // is witness list sufficient?
    for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
      if (*p == mdr->slave_to_mds ||
	  mdr->slave_request->witnesses.count(*p)) continue;
      dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
      reply_witness = true;
      break;
    }

    if (reply_witness) {
      assert(!srcdnrep.empty());
      MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
						     MMDSSlaveRequest::OP_RENAMEPREPACK);
      reply->witnesses.swap(srcdnrep);
      mds->send_message_mds(reply, mdr->slave_to_mds);
      mdr->slave_request->put();
      mdr->slave_request = 0;
      return;
    }
    dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
    if (!mdr->more()->waiting_on_slave.empty()) {
      dout(10) << " still waiting for rename notify acks from "
	       << mdr->more()->waiting_on_slave << dendl;
      return;
    }
  } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
    // set ambiguous auth for srci on witnesses
    mdr->set_ambiguous_auth(srcdnl->get_inode());
  }

  // encode everything we'd need to roll this back... basically, just the original state.
  rename_rollback rollback;

  rollback.reqid = mdr->reqid;

  rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
  rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
  rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
  rollback.orig_src.dname = srcdn->name;
  if (srcdnl->is_primary())
    rollback.orig_src.ino = srcdnl->get_inode()->ino();
  else {
    assert(srcdnl->is_remote());
    rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
    rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
  }

  rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
  rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
  rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
  rollback.orig_dest.dname = destdn->name;
  if (destdnl->is_primary())
    rollback.orig_dest.ino = destdnl->get_inode()->ino();
  else if (destdnl->is_remote()) {
    rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
    rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
  }

  if (straydn) {
    rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
    rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
    rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
    rollback.stray.dname = straydn->name;
  }
  ::encode(rollback, mdr->more()->rollback_bl);
  dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;

  // journal.
  mdr->ls = mdlog->get_current_segment();
  ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds,
				      ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RENAME);
  mdlog->start_entry(le);
  le->rollback = mdr->more()->rollback_bl;

  bufferlist blah;  // inode import data... obviously not used if we're the slave
  _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, straydn);

  if (le->commit.empty()) {
    dout(10) << " empty metablob, skipping journal" << dendl;
    mdlog->cancel_entry(le);
    mdr->ls = NULL;
    _logged_slave_rename(mdr, srcdn, destdn, straydn);
  } else {
    mdr->more()->slave_update_journaled = true;
    submit_mdlog_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn),
		       mdr, __func__);
    mdlog->flush();
  }
}
// Called once the slave's OP_PREPARE journal entry is safe (or skipped):
// export the source inode to the dest-auth MDS if we are src-auth and the
// link is primary, apply the rename locally, bump balancer popularity, and
// send the RENAMEPREPACK reply to the master (unless the request aborted).
// NOTE(review): reformatted from a garbled extraction; dropped lines
// (declarations, `if (reply)` guard, braces) restored to match upstream Ceph.
void Server::_logged_slave_rename(MDRequestRef& mdr,
				  CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_logged_slave_rename " << *mdr << dendl;

  // prepare ack
  MMDSSlaveRequest *reply = NULL;
  if (!mdr->aborted) {
    reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
    if (!mdr->more()->slave_update_journaled)
      reply->mark_not_journaled();
  }

  CDentry::linkage_t *srcdnl = srcdn->get_linkage();
  CDentry::linkage_t *destdnl = NULL;
  //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;

  // export srci?
  if (srcdn->is_auth() && srcdnl->is_primary()) {
    // set export bounds for CInode::encode_export()
    list<CDir*> bounds;
    if (srcdnl->get_inode()->is_dir()) {
      srcdnl->get_inode()->get_dirfrags(bounds);
      for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
	(*p)->state_set(CDir::STATE_EXPORTBOUND);
    }

    map<client_t,entity_inst_t> exported_client_map;
    bufferlist inodebl;
    mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
					   exported_client_map);

    for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
      (*p)->state_clear(CDir::STATE_EXPORTBOUND);

    if (reply) {
      ::encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
      reply->inode_export.claim_append(inodebl);
      reply->inode_export_v = srcdnl->get_inode()->inode.version;
    }

    // remove mdr auth pin
    mdr->auth_unpin(srcdnl->get_inode());
    mdr->more()->is_inode_exporter = true;

    if (srcdnl->get_inode()->is_dirty())
      srcdnl->get_inode()->mark_clean();

    dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
  }

  // apply
  _rename_apply(mdr, srcdn, destdn, straydn);

  destdnl = destdn->get_linkage();

  // bump popularity
  mds->balancer->hit_dir(mdr->get_mds_stamp(), srcdn->get_dir(), META_POP_IWR);
  if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
    mds->balancer->hit_inode(mdr->get_mds_stamp(), destdnl->get_inode(),
			     META_POP_IWR);

  // done.
  mdr->slave_request->put();
  mdr->slave_request = 0;

  if (reply) {
    mds->send_message_mds(reply, mdr->slave_to_mds);
  } else {
    assert(mdr->aborted);
    dout(10) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);
  }
}
// Resolve a previously prepared slave rename.  r == 0: finish the inode
// export (caps, xlocks), clear ambiguous auth, and journal/record the
// OP_COMMIT.  r != 0: undo — abort the export and roll the rename back from
// the recorded rollback blob (or just clean up if no blob was recorded).
// NOTE(review): reformatted from a garbled extraction; dropped lines
// (`if (r == 0) {`, `mdr->cleanup()`, `mdlog->flush()`, braces) restored to
// match upstream Ceph — verify against the original file.
void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
				  CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;

  CDentry::linkage_t *destdnl = destdn->get_linkage();

  list<MDSInternalContextBase*> finished;
  if (r == 0) {
    // unfreeze+singleauth inode
    //  hmm, do i really need to delay this?
    if (mdr->more()->is_inode_exporter) {

      CInode *in = destdnl->get_inode();

      // we exported, clear out any xlocks that we moved to another MDS
      set<SimpleLock*>::iterator i = mdr->xlocks.begin();
      while (i != mdr->xlocks.end()) {
	SimpleLock *lock = *i++;

	// we only care about xlocks on the exported inode
	if (lock->get_parent() == in &&
	    !lock->is_locallock())
	  mds->locker->xlock_export(lock, mdr.get());
      }

      map<client_t,Capability::Import> peer_imported;
      bufferlist::iterator bp = mdr->more()->inode_import.begin();
      ::decode(peer_imported, bp);

      dout(10) << " finishing inode export on " << *destdnl->get_inode() << dendl;
      mdcache->migrator->finish_export_inode(destdnl->get_inode(),
					     mdr->get_mds_stamp(),
					     mdr->slave_to_mds, peer_imported, finished);
      mds->queue_waiters(finished);   // this includes SINGLEAUTH waiters.

      // unfreeze
      assert(destdnl->get_inode()->is_frozen_inode());
      destdnl->get_inode()->unfreeze_inode(finished);
    }

    // singleauth
    if (mdr->more()->is_ambiguous_auth) {
      mdr->more()->rename_inode->clear_ambiguous_auth(finished);
      mdr->more()->is_ambiguous_auth = false;
    }

    if (straydn && mdr->more()->slave_update_journaled) {
      CInode *strayin = straydn->get_projected_linkage()->get_inode();
      if (strayin && !strayin->snaprealm)
	mdcache->clear_dirty_bits_for_stray(strayin);
    }

    mds->queue_waiters(finished);
    mdr->cleanup();

    if (mdr->more()->slave_update_journaled) {
      // write a commit to the journal
      ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid,
					  mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
					  ESlaveUpdate::RENAME);
      mdlog->start_entry(le);
      submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
      mdlog->flush();
    } else {
      _committed_slave(mdr);
    }
  } else {
    // abort
    //  rollback_bl may be empty if we froze the inode but had to provide an expanded
    // witness list from the master, and they failed before we tried prep again.
    if (mdr->more()->rollback_bl.length()) {
      if (mdr->more()->is_inode_exporter) {
	dout(10) << " reversing inode export of " << *destdnl->get_inode() << dendl;
	destdnl->get_inode()->abort_export();
      }
      if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
	mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
	// rollback but preserve the slave request
	do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, false);
	mdr->more()->rollback_bl.clear();
      } else
	do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, true);
    } else {
      dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl;
      // singleauth
      if (mdr->more()->is_ambiguous_auth) {
	if (srcdn->is_auth())
	  mdr->more()->rename_inode->unfreeze_inode(finished);

	mdr->more()->rename_inode->clear_ambiguous_auth(finished);
	mdr->more()->is_ambiguous_auth = false;
      }
      mds->queue_waiters(finished);
      mdcache->request_finish(mdr);
    }
  }
}
// Helper for rename rollback: project a new fnode on dir and undo the
// accounting the rename applied — adjust nsubdirs/nfiles by linkunlink,
// back out the target's rstat contribution, and restore the dirfrag's old
// mtime/rctime if the rename's ctime is still the latest.  The touched
// filelock/nestlock are queued on mut for scatter-gather update.
// NOTE(review): reformatted from a garbled extraction; dropped lines
// (the `fnode_t *pf;` declaration and `if` guards) restored to match
// upstream Ceph — verify against the original file.
void _rollback_repair_dir(MutationRef& mut, CDir *dir, rename_rollback::drec &r, utime_t ctime,
			  bool isdir, int linkunlink, nest_info_t &rstat)
{
  fnode_t *pf;
  pf = dir->project_fnode();
  mut->add_projected_fnode(dir);
  pf->version = dir->pre_dirty();

  if (isdir) {
    pf->fragstat.nsubdirs += linkunlink;
  } else {
    pf->fragstat.nfiles += linkunlink;
  }
  if (r.ino) {
    pf->rstat.rbytes += linkunlink * rstat.rbytes;
    pf->rstat.rfiles += linkunlink * rstat.rfiles;
    pf->rstat.rsubdirs += linkunlink * rstat.rsubdirs;
    pf->rstat.rsnaprealms += linkunlink * rstat.rsnaprealms;
  }
  if (pf->fragstat.mtime == ctime) {
    pf->fragstat.mtime = r.dirfrag_old_mtime;
    if (pf->rstat.rctime == ctime)
      pf->rstat.rctime = r.dirfrag_old_rctime;
  }
  mut->add_updated_lock(&dir->get_inode()->filelock);
  mut->add_updated_lock(&dir->get_inode()->nestlock);
}
// Journal-completion context for a rename rollback: once the
// ESlaveUpdate::OP_ROLLBACK entry is durable, finish applying the rollback
// in Server::_rename_rollback_finish.
// NOTE(review): member declarations were dropped by the extraction and are
// restored here to match the constructor's initializer list.
struct C_MDS_LoggedRenameRollback : public ServerLogContext {
  MutationRef mut;
  CDentry *srcdn;
  version_t srcdnpv;
  CDentry *destdn;
  CDentry *straydn;
  bool finish_mdr;
  C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
			     CDentry *sd, version_t pv, CDentry *dd,
			     CDentry *st, bool f) :
    ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
    straydn(st), finish_mdr(f) {}
  void finish(int r) override {
    server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
				    destdn, straydn, finish_mdr);
  }
};
// Undo an uncommitted slave rename from its recorded rename_rollback blob:
// look up whatever src/dest/stray dentries and inodes are still cached,
// re-project the original linkages, restore ctimes and dirfrag stats,
// journal an ESlaveUpdate::OP_ROLLBACK (or skip it if the prepare was never
// journaled), and finish in _rename_rollback_finish.
// NOTE(review): reformatted from a garbled extraction; dropped lines (guards
// like `if (srcdir)`, declarations, the trailing `else` branches) restored to
// match upstream Ceph — verify against the original file.
void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr,
				bool finish_mdr)
{
  rename_rollback rollback;
  bufferlist::iterator p = rbl.begin();
  ::decode(rollback, p);

  dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
  // need to finish this update before sending resolve to claim the subtree
  mdcache->add_rollback(rollback.reqid, master);

  MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
  mut->ls = mds->mdlog->get_current_segment();

  CDentry *srcdn = NULL;
  CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
  if (!srcdir)
    srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
  if (srcdir) {
    dout(10) << "  srcdir " << *srcdir << dendl;
    srcdn = srcdir->lookup(rollback.orig_src.dname);
    if (srcdn) {
      dout(10) << "   srcdn " << *srcdn << dendl;
      assert(srcdn->get_linkage()->is_null());
    } else
      dout(10) << "   srcdn not found" << dendl;
  } else
    dout(10) << "  srcdir not found" << dendl;

  CDentry *destdn = NULL;
  CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
  if (!destdir)
    destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
  if (destdir) {
    dout(10) << " destdir " << *destdir << dendl;
    destdn = destdir->lookup(rollback.orig_dest.dname);
    if (destdn)
      dout(10) << "  destdn " << *destdn << dendl;
    else
      dout(10) << "  destdn not found" << dendl;
  } else
    dout(10) << " destdir not found" << dendl;

  CInode *in = NULL;
  if (rollback.orig_src.ino) {
    in = mdcache->get_inode(rollback.orig_src.ino);
    if (in && in->is_dir())
      assert(srcdn && destdn);
  } else
    in = mdcache->get_inode(rollback.orig_src.remote_ino);

  CDir *straydir = NULL;
  CDentry *straydn = NULL;
  if (rollback.stray.dirfrag.ino) {
    straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
    if (straydir) {
      dout(10) << "straydir " << *straydir << dendl;
      straydn = straydir->lookup(rollback.stray.dname);
      if (straydn) {
	dout(10) << " straydn " << *straydn << dendl;
	assert(straydn->get_linkage()->is_primary());
      } else
	dout(10) << " straydn not found" << dendl;
    } else
      dout(10) << "straydir not found" << dendl;
  }

  CInode *target = NULL;
  if (rollback.orig_dest.ino) {
    target = mdcache->get_inode(rollback.orig_dest.ino);
    if (target)
      assert(destdn && straydn);
  } else if (rollback.orig_dest.remote_ino)
    target = mdcache->get_inode(rollback.orig_dest.remote_ino);

  // can't use is_auth() in the resolve stage
  mds_rank_t whoami = mds->get_nodeid();
  // slave: we never own dest/stray here
  assert(!destdn || destdn->authority().first != whoami);
  assert(!straydn || straydn->authority().first != whoami);

  bool force_journal_src = false;
  bool force_journal_dest = false;
  if (in && in->is_dir() && srcdn->authority().first != whoami)
    force_journal_src = _need_force_journal(in, false);
  if (in && target && target->is_dir())
    force_journal_dest = _need_force_journal(in, true);

  version_t srcdnpv = 0;
  // repair src
  if (srcdn) {
    if (srcdn->authority().first == whoami)
      srcdnpv = srcdn->pre_dirty();
    if (rollback.orig_src.ino) {
      srcdn->push_projected_linkage(in);
    } else
      srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
				    rollback.orig_src.remote_d_type);
  }

  inode_t *pi = 0;
  if (in) {
    if (in->authority().first == whoami) {
      pi = in->project_inode();
      mut->add_projected_inode(in);
      pi->version = in->pre_dirty();
    } else
      pi = in->get_projected_inode();
    if (pi->ctime == rollback.ctime)
      pi->ctime = rollback.orig_src.old_ctime;
  }

  if (srcdn && srcdn->authority().first == whoami) {
    nest_info_t blah;
    _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
			 in ? in->is_dir() : false, 1, pi ? pi->accounted_rstat : blah);
  }

  // repair dest
  if (destdn) {
    if (rollback.orig_dest.ino && target) {
      destdn->push_projected_linkage(target);
    } else if (rollback.orig_dest.remote_ino) {
      destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
				     rollback.orig_dest.remote_d_type);
    } else {
      // the dentry will be trimmed soon, it's ok to have wrong linkage
      if (rollback.orig_dest.ino)
	assert(mds->is_resolve());
      destdn->push_projected_linkage();
    }
  }

  if (straydn)
    straydn->push_projected_linkage();

  if (target) {
    inode_t *ti = NULL;
    if (target->authority().first == whoami) {
      ti = target->project_inode();
      mut->add_projected_inode(target);
      ti->version = target->pre_dirty();
    } else
      ti = target->get_projected_inode();
    if (ti->ctime == rollback.ctime)
      ti->ctime = rollback.orig_dest.old_ctime;
    if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
      if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
	assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
      else
	assert(rollback.orig_dest.remote_ino &&
	       rollback.orig_dest.remote_ino == rollback.orig_src.ino);
    } else
      ti->nlink++;
  }

  if (srcdn)
    dout(0) << " srcdn back to " << *srcdn << dendl;
  if (in)
    dout(0) << "  srci back to " << *in << dendl;
  if (destdn)
    dout(0) << " destdn back to " << *destdn << dendl;
  if (target)
    dout(0) << "  desti back to " << *target << dendl;

  // journal it
  ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_rollback", rollback.reqid, master,
				      ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RENAME);
  mdlog->start_entry(le);

  if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
    le->commit.add_dir_context(srcdir);
    if (rollback.orig_src.ino)
      le->commit.add_primary_dentry(srcdn, 0, true);
    else
      le->commit.add_remote_dentry(srcdn, true);
  }

  if (!rollback.orig_src.ino && // remote linkage
      in && in->authority().first == whoami) {
    le->commit.add_dir_context(in->get_projected_parent_dir());
    le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
  }

  if (force_journal_dest) {
    assert(rollback.orig_dest.ino);
    le->commit.add_dir_context(destdir);
    le->commit.add_primary_dentry(destdn, 0, true);
  }

  // slave: no need to journal straydn

  if (target && target != in && target->authority().first == whoami) {
    assert(rollback.orig_dest.remote_ino);
    le->commit.add_dir_context(target->get_projected_parent_dir());
    le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
  }

  if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
    dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
    le->commit.renamed_dirino = in->ino();
    if (srcdn->authority().first == whoami) {
      list<CDir*> ls;
      in->get_dirfrags(ls);
      for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
	CDir *dir = *p;
	if (!dir->is_auth())
	  le->commit.renamed_dir_frags.push_back(dir->get_frag());
      }
      dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
    }
  } else if (force_journal_dest) {
    dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
    le->commit.renamed_dirino = target->ino();
  }

  if (target && target->is_dir()) {
    assert(destdn);
    mdcache->project_subtree_rename(target, straydir, destdir);
  }

  if (in && in->is_dir()) {
    assert(srcdn);
    mdcache->project_subtree_rename(in, destdir, srcdir);
  }

  if (mdr && !mdr->more()->slave_update_journaled) {
    assert(le->commit.empty());
    mdlog->cancel_entry(le);
    mut->ls = NULL;
    _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, finish_mdr);
  } else {
    assert(!le->commit.empty());
    if (mdr)
      mdr->more()->slave_update_journaled = false;
    MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr, srcdn, srcdnpv,
							    destdn, straydn, finish_mdr);
    submit_mdlog_entry(le, fin, mdr, __func__);
    mdlog->flush();
  }
}
// Final step of a rename rollback (after the rollback journal entry, if any,
// is safe): pop the projected linkages back into place, restore the auth bit
// on the source inode, re-adjust subtree maps, trim non-auth subtrees during
// resolve, release any frozen/ambiguous-auth state, and retire the rollback.
// NOTE(review): reformatted from a garbled extraction; dropped lines (`if`
// guards on straydn/destdn/srcdn/mdr, `mut->apply()`, `mut->cleanup()`)
// restored to match upstream Ceph — verify against the original file.
void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
				     version_t srcdnpv, CDentry *destdn,
				     CDentry *straydn, bool finish_mdr)
{
  dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;

  if (straydn) {
    straydn->get_dir()->unlink_inode(straydn);
    straydn->pop_projected_linkage();
  }
  if (destdn) {
    destdn->get_dir()->unlink_inode(destdn);
    destdn->pop_projected_linkage();
  }
  if (srcdn) {
    srcdn->pop_projected_linkage();
    if (srcdn->authority().first == mds->get_nodeid())
      srcdn->mark_dirty(srcdnpv, mut->ls);
  }

  mut->apply();

  if (srcdn && srcdn->get_linkage()->is_primary()) {
    CInode *in = srcdn->get_linkage()->get_inode();
    if (srcdn->authority().first == mds->get_nodeid())
      in->state_set(CInode::STATE_AUTH);
    // update subtree map?
    if (in && in->is_dir()) {
      assert(destdn);
      mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
    }
  }

  if (destdn) {
    CInode *oldin = destdn->get_linkage()->get_inode();
    // update subtree map?
    if (oldin && oldin->is_dir()) {
      assert(straydn);
      mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
    }
  }

  if (mds->is_resolve()) {
    CDir *root = NULL;
    if (straydn)
      root = mdcache->get_subtree_root(straydn->get_dir());
    else if (destdn)
      root = mdcache->get_subtree_root(destdn->get_dir());
    if (root)
      mdcache->try_trim_non_auth_subtree(root);
  }

  if (mdr) {
    list<MDSInternalContextBase*> finished;
    if (mdr->more()->is_ambiguous_auth) {
      if (srcdn->is_auth())
	mdr->more()->rename_inode->unfreeze_inode(finished);

      mdr->more()->rename_inode->clear_ambiguous_auth(finished);
      mdr->more()->is_ambiguous_auth = false;
    }
    mds->queue_waiters(finished);
    if (finish_mdr || mdr->aborted)
      mdcache->request_finish(mdr);
    else
      mdr->more()->slave_rolling_back = false;
  }

  mdcache->finish_rollback(mut->reqid);

  mut->cleanup();
}
/* This function DOES put the passed message before returning*/
// Master-side handler for a slave's RENAMEPREPACK: record the slave as a
// witness (or absorb the expanded witness list it sent back), claim any
// exported source-inode blob, and re-dispatch the client request once all
// awaited slaves have answered.
void Server::handle_slave_rename_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
{
  dout(10) << "handle_slave_rename_prep_ack " << *mdr
	   << " witnessed by " << ack->get_source()
	   << " " << *ack << dendl;
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  // note slave
  mdr->more()->slaves.insert(from);
  if (mdr->more()->srcdn_auth_mds == from &&
      mdr->more()->is_remote_frozen_authpin &&
      !mdr->more()->is_ambiguous_auth) {
    mdr->set_ambiguous_auth(mdr->more()->rename_inode);
  }

  // witnessed?  or add extra witnesses?
  assert(mdr->more()->witnessed.count(from) == 0);
  if (ack->is_interrupted()) {
    dout(10) << " slave request interrupted, noop" << dendl;
  } else if (ack->witnesses.empty()) {
    mdr->more()->witnessed.insert(from);
    if (!ack->is_not_journaled())
      mdr->more()->has_journaled_slaves = true;
  } else {
    dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
    mdr->more()->extra_witnesses.swap(ack->witnesses);
    mdr->more()->extra_witnesses.erase(mds->get_nodeid());  // not me!
  }

  // srci import?
  if (ack->inode_export.length()) {
    dout(10) << " got srci import" << dendl;
    mdr->more()->inode_import.claim(ack->inode_export);
    mdr->more()->inode_import_v = ack->inode_export_v;
  }

  // remove from waiting list
  assert(mdr->more()->waiting_on_slave.count(from));
  mdr->more()->waiting_on_slave.erase(from);

  if (mdr->more()->waiting_on_slave.empty())
    dispatch_client_request(mdr);  // go again!
  else
    dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
}
8304 void Server::handle_slave_rename_notify_ack(MDRequestRef
& mdr
, MMDSSlaveRequest
*ack
)
8306 dout(10) << "handle_slave_rename_notify_ack " << *mdr
<< " from mds."
8307 << ack
->get_source() << dendl
;
8308 assert(mdr
->is_slave());
8309 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
8311 if (mdr
->more()->waiting_on_slave
.count(from
)) {
8312 mdr
->more()->waiting_on_slave
.erase(from
);
8314 if (mdr
->more()->waiting_on_slave
.empty()) {
8315 if (mdr
->slave_request
)
8316 dispatch_slave_request(mdr
);
8318 dout(10) << " still waiting for rename notify acks from "
8319 << mdr
->more()->waiting_on_slave
<< dendl
;
8323 void Server::_slave_rename_sessions_flushed(MDRequestRef
& mdr
)
8325 dout(10) << "_slave_rename_sessions_flushed " << *mdr
<< dendl
;
8327 if (mdr
->more()->waiting_on_slave
.count(MDS_RANK_NONE
)) {
8328 mdr
->more()->waiting_on_slave
.erase(MDS_RANK_NONE
);
8330 if (mdr
->more()->waiting_on_slave
.empty()) {
8331 if (mdr
->slave_request
)
8332 dispatch_slave_request(mdr
);
8334 dout(10) << " still waiting for rename notify acks from "
8335 << mdr
->more()->waiting_on_slave
<< dendl
;
8340 /* This function takes responsibility for the passed mdr*/
8341 void Server::handle_client_lssnap(MDRequestRef
& mdr
)
8343 MClientRequest
*req
= mdr
->client_request
;
8346 CInode
*diri
= mdcache
->get_inode(req
->get_filepath().get_ino());
8347 if (!diri
|| diri
->state_test(CInode::STATE_PURGING
)) {
8348 respond_to_request(mdr
, -ESTALE
);
8351 if (!diri
->is_auth()) {
8352 mdcache
->request_forward(mdr
, diri
->authority().first
);
8355 if (!diri
->is_dir()) {
8356 respond_to_request(mdr
, -ENOTDIR
);
8359 dout(10) << "lssnap on " << *diri
<< dendl
;
8362 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
8363 mds
->locker
->include_snap_rdlocks(rdlocks
, diri
);
8364 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
8367 if (!check_access(mdr
, diri
, MAY_READ
))
8370 SnapRealm
*realm
= diri
->find_snaprealm();
8371 map
<snapid_t
,SnapInfo
*> infomap
;
8372 realm
->get_snap_info(infomap
, diri
->get_oldest_snap());
8374 unsigned max_entries
= req
->head
.args
.readdir
.max_entries
;
8376 max_entries
= infomap
.size();
8377 int max_bytes
= req
->head
.args
.readdir
.max_bytes
;
8379 // make sure at least one item can be encoded
8380 max_bytes
= (512 << 10) + g_conf
->mds_max_xattr_pairs_size
;
8382 __u64 last_snapid
= 0;
8383 string offset_str
= req
->get_path2();
8384 if (!offset_str
.empty())
8385 last_snapid
= realm
->resolve_snapname(offset_str
, diri
->ino());
8388 encode_empty_dirstat(dirbl
);
8390 max_bytes
-= dirbl
.length() - sizeof(__u32
) + sizeof(__u8
) * 2;
8394 map
<snapid_t
,SnapInfo
*>::iterator p
= infomap
.upper_bound(last_snapid
);
8395 for (; p
!= infomap
.end() && num
< max_entries
; ++p
) {
8396 dout(10) << p
->first
<< " -> " << *p
->second
<< dendl
;
8400 if (p
->second
->ino
== diri
->ino())
8401 snap_name
= p
->second
->name
;
8403 snap_name
= p
->second
->get_long_name();
8405 unsigned start_len
= dnbl
.length();
8406 if (int(start_len
+ snap_name
.length() + sizeof(__u32
) + sizeof(LeaseStat
)) > max_bytes
)
8409 ::encode(snap_name
, dnbl
);
8410 encode_infinite_lease(dnbl
);
8412 int r
= diri
->encode_inodestat(dnbl
, mdr
->session
, realm
, p
->first
, max_bytes
- (int)dnbl
.length());
8415 keep
.substr_of(dnbl
, 0, start_len
);
8422 ::encode(num
, dirbl
);
8424 if (p
== infomap
.end()) {
8425 flags
= CEPH_READDIR_FRAG_END
;
8426 if (last_snapid
== 0)
8427 flags
|= CEPH_READDIR_FRAG_COMPLETE
;
8429 ::encode(flags
, dirbl
);
8430 dirbl
.claim_append(dnbl
);
8432 mdr
->reply_extra_bl
= dirbl
;
8434 respond_to_request(mdr
, 0);
8440 struct C_MDS_mksnap_finish
: public ServerLogContext
{
8443 C_MDS_mksnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, SnapInfo
&i
) :
8444 ServerLogContext(s
, r
), diri(di
), info(i
) {}
8445 void finish(int r
) override
{
8446 server
->_mksnap_finish(mdr
, diri
, info
);
8450 /* This function takes responsibility for the passed mdr*/
8451 void Server::handle_client_mksnap(MDRequestRef
& mdr
)
8453 if (!mds
->mdsmap
->allows_snaps()) {
8454 // you can't make snapshots until you set an option right now
8455 respond_to_request(mdr
, -EPERM
);
8459 MClientRequest
*req
= mdr
->client_request
;
8460 CInode
*diri
= mdcache
->get_inode(req
->get_filepath().get_ino());
8461 if (!diri
|| diri
->state_test(CInode::STATE_PURGING
)) {
8462 respond_to_request(mdr
, -ESTALE
);
8466 if (!diri
->is_auth()) { // fw to auth?
8467 mdcache
->request_forward(mdr
, diri
->authority().first
);
8472 if (!diri
->is_dir()) {
8473 respond_to_request(mdr
, -ENOTDIR
);
8476 if (diri
->is_system() && !diri
->is_root()) {
8477 // no snaps in system dirs (root is ok)
8478 respond_to_request(mdr
, -EPERM
);
8482 const string
&snapname
= req
->get_filepath().last_dentry();
8484 if (mdr
->client_request
->get_caller_uid() < g_conf
->mds_snap_min_uid
|| mdr
->client_request
->get_caller_uid() > g_conf
->mds_snap_max_uid
) {
8485 dout(20) << "mksnap " << snapname
<< " on " << *diri
<< " denied to uid " << mdr
->client_request
->get_caller_uid() << dendl
;
8486 respond_to_request(mdr
, -EPERM
);
8490 dout(10) << "mksnap " << snapname
<< " on " << *diri
<< dendl
;
8493 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
8495 mds
->locker
->include_snap_rdlocks(rdlocks
, diri
);
8496 rdlocks
.erase(&diri
->snaplock
);
8497 xlocks
.insert(&diri
->snaplock
);
8499 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
8502 if (!check_access(mdr
, diri
, MAY_WRITE
))
8505 // make sure name is unique
8506 if (diri
->snaprealm
&&
8507 diri
->snaprealm
->exists(snapname
)) {
8508 respond_to_request(mdr
, -EEXIST
);
8511 if (snapname
.length() == 0 ||
8512 snapname
[0] == '_') {
8513 respond_to_request(mdr
, -EINVAL
);
8517 // allocate a snapid
8518 if (!mdr
->more()->stid
) {
8520 mds
->snapclient
->prepare_create(diri
->ino(), snapname
,
8521 mdr
->get_mds_stamp(),
8522 &mdr
->more()->stid
, &mdr
->more()->snapidbl
,
8523 new C_MDS_RetryRequest(mdcache
, mdr
));
8527 version_t stid
= mdr
->more()->stid
;
8529 bufferlist::iterator p
= mdr
->more()->snapidbl
.begin();
8530 ::decode(snapid
, p
);
8531 dout(10) << " stid " << stid
<< " snapid " << snapid
<< dendl
;
8535 info
.ino
= diri
->ino();
8536 info
.snapid
= snapid
;
8537 info
.name
= snapname
;
8538 info
.stamp
= mdr
->get_op_stamp();
8540 inode_t
*pi
= diri
->project_inode();
8541 pi
->ctime
= info
.stamp
;
8542 pi
->version
= diri
->pre_dirty();
8544 // project the snaprealm
8545 sr_t
*newsnap
= diri
->project_snaprealm(snapid
);
8546 newsnap
->snaps
[snapid
] = info
;
8547 newsnap
->seq
= snapid
;
8548 newsnap
->last_created
= snapid
;
8550 // journal the inode changes
8551 mdr
->ls
= mdlog
->get_current_segment();
8552 EUpdate
*le
= new EUpdate(mdlog
, "mksnap");
8553 mdlog
->start_entry(le
);
8555 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
8556 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
8557 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
8558 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
8560 // journal the snaprealm changes
8561 submit_mdlog_entry(le
, new C_MDS_mksnap_finish(this, mdr
, diri
, info
),
8566 void Server::_mksnap_finish(MDRequestRef
& mdr
, CInode
*diri
, SnapInfo
&info
)
8568 dout(10) << "_mksnap_finish " << *mdr
<< " " << info
<< dendl
;
8570 int op
= (diri
->snaprealm
? CEPH_SNAP_OP_CREATE
: CEPH_SNAP_OP_SPLIT
);
8572 diri
->pop_and_dirty_projected_inode(mdr
->ls
);
8575 mds
->snapclient
->commit(mdr
->more()->stid
, mdr
->ls
);
8578 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
8580 mdcache
->do_realm_invalidate_and_update_notify(diri
, op
);
8584 mdr
->snapid
= info
.snapid
;
8586 respond_to_request(mdr
, 0);
8592 struct C_MDS_rmsnap_finish
: public ServerLogContext
{
8595 C_MDS_rmsnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, snapid_t sn
) :
8596 ServerLogContext(s
, r
), diri(di
), snapid(sn
) {}
8597 void finish(int r
) override
{
8598 server
->_rmsnap_finish(mdr
, diri
, snapid
);
8602 /* This function takes responsibility for the passed mdr*/
8603 void Server::handle_client_rmsnap(MDRequestRef
& mdr
)
8605 MClientRequest
*req
= mdr
->client_request
;
8607 CInode
*diri
= mdcache
->get_inode(req
->get_filepath().get_ino());
8608 if (!diri
|| diri
->state_test(CInode::STATE_PURGING
)) {
8609 respond_to_request(mdr
, -ESTALE
);
8612 if (!diri
->is_auth()) { // fw to auth?
8613 mdcache
->request_forward(mdr
, diri
->authority().first
);
8616 if (!diri
->is_dir()) {
8617 respond_to_request(mdr
, -ENOTDIR
);
8621 const string
&snapname
= req
->get_filepath().last_dentry();
8623 if (mdr
->client_request
->get_caller_uid() < g_conf
->mds_snap_min_uid
|| mdr
->client_request
->get_caller_uid() > g_conf
->mds_snap_max_uid
) {
8624 dout(20) << "rmsnap " << snapname
<< " on " << *diri
<< " denied to uid " << mdr
->client_request
->get_caller_uid() << dendl
;
8625 respond_to_request(mdr
, -EPERM
);
8629 dout(10) << "rmsnap " << snapname
<< " on " << *diri
<< dendl
;
8632 if (snapname
.length() == 0 || snapname
[0] == '_') {
8633 respond_to_request(mdr
, -EINVAL
); // can't prune a parent snap, currently.
8636 if (!diri
->snaprealm
|| !diri
->snaprealm
->exists(snapname
)) {
8637 respond_to_request(mdr
, -ENOENT
);
8640 snapid_t snapid
= diri
->snaprealm
->resolve_snapname(snapname
, diri
->ino());
8641 dout(10) << " snapname " << snapname
<< " is " << snapid
<< dendl
;
8643 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
8644 mds
->locker
->include_snap_rdlocks(rdlocks
, diri
);
8645 rdlocks
.erase(&diri
->snaplock
);
8646 xlocks
.insert(&diri
->snaplock
);
8648 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
8651 if (!check_access(mdr
, diri
, MAY_WRITE
))
8655 if (!mdr
->more()->stid
) {
8656 mds
->snapclient
->prepare_destroy(diri
->ino(), snapid
,
8657 &mdr
->more()->stid
, &mdr
->more()->snapidbl
,
8658 new C_MDS_RetryRequest(mdcache
, mdr
));
8661 version_t stid
= mdr
->more()->stid
;
8662 bufferlist::iterator p
= mdr
->more()->snapidbl
.begin();
8665 dout(10) << " stid is " << stid
<< ", seq is " << seq
<< dendl
;
8668 inode_t
*pi
= diri
->project_inode();
8669 pi
->version
= diri
->pre_dirty();
8670 pi
->ctime
= mdr
->get_op_stamp();
8672 mdr
->ls
= mdlog
->get_current_segment();
8673 EUpdate
*le
= new EUpdate(mdlog
, "rmsnap");
8674 mdlog
->start_entry(le
);
8676 // project the snaprealm
8677 sr_t
*newnode
= diri
->project_snaprealm();
8678 newnode
->snaps
.erase(snapid
);
8680 newnode
->last_destroyed
= seq
;
8682 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
8683 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
8684 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
8685 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
8687 submit_mdlog_entry(le
, new C_MDS_rmsnap_finish(this, mdr
, diri
, snapid
),
8692 void Server::_rmsnap_finish(MDRequestRef
& mdr
, CInode
*diri
, snapid_t snapid
)
8694 dout(10) << "_rmsnap_finish " << *mdr
<< " " << snapid
<< dendl
;
8695 snapid_t stid
= mdr
->more()->stid
;
8696 bufferlist::iterator p
= mdr
->more()->snapidbl
.begin();
8700 diri
->pop_and_dirty_projected_inode(mdr
->ls
);
8703 mds
->snapclient
->commit(stid
, mdr
->ls
);
8705 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
8707 mdcache
->do_realm_invalidate_and_update_notify(diri
, CEPH_SNAP_OP_DESTROY
);
8711 respond_to_request(mdr
, 0);
8713 // purge snapshot data
8714 if (diri
->snaprealm
->have_past_parents_open())
8715 diri
->purge_stale_snap_data(diri
->snaprealm
->get_snaps());
8718 struct C_MDS_renamesnap_finish
: public ServerLogContext
{
8721 C_MDS_renamesnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, snapid_t sn
) :
8722 ServerLogContext(s
, r
), diri(di
), snapid(sn
) {}
8723 void finish(int r
) override
{
8724 server
->_renamesnap_finish(mdr
, diri
, snapid
);
8728 /* This function takes responsibility for the passed mdr*/
8729 void Server::handle_client_renamesnap(MDRequestRef
& mdr
)
8731 MClientRequest
*req
= mdr
->client_request
;
8732 if (req
->get_filepath().get_ino() != req
->get_filepath2().get_ino()) {
8733 respond_to_request(mdr
, -EINVAL
);
8737 CInode
*diri
= mdcache
->get_inode(req
->get_filepath().get_ino());
8738 if (!diri
|| diri
->state_test(CInode::STATE_PURGING
)) {
8739 respond_to_request(mdr
, -ESTALE
);
8743 if (!diri
->is_auth()) { // fw to auth?
8744 mdcache
->request_forward(mdr
, diri
->authority().first
);
8748 if (!diri
->is_dir()) { // dir only
8749 respond_to_request(mdr
, -ENOTDIR
);
8753 if (mdr
->client_request
->get_caller_uid() < g_conf
->mds_snap_min_uid
||
8754 mdr
->client_request
->get_caller_uid() > g_conf
->mds_snap_max_uid
) {
8755 respond_to_request(mdr
, -EPERM
);
8759 const string
&dstname
= req
->get_filepath().last_dentry();
8760 const string
&srcname
= req
->get_filepath2().last_dentry();
8761 dout(10) << "renamesnap " << srcname
<< "->" << dstname
<< " on " << *diri
<< dendl
;
8763 if (srcname
.length() == 0 || srcname
[0] == '_') {
8764 respond_to_request(mdr
, -EINVAL
); // can't rename a parent snap.
8767 if (!diri
->snaprealm
|| !diri
->snaprealm
->exists(srcname
)) {
8768 respond_to_request(mdr
, -ENOENT
);
8771 if (dstname
.length() == 0 || dstname
[0] == '_') {
8772 respond_to_request(mdr
, -EINVAL
);
8775 if (diri
->snaprealm
->exists(dstname
)) {
8776 respond_to_request(mdr
, -EEXIST
);
8780 snapid_t snapid
= diri
->snaprealm
->resolve_snapname(srcname
, diri
->ino());
8781 dout(10) << " snapname " << srcname
<< " is " << snapid
<< dendl
;
8784 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
8786 mds
->locker
->include_snap_rdlocks(rdlocks
, diri
);
8787 rdlocks
.erase(&diri
->snaplock
);
8788 xlocks
.insert(&diri
->snaplock
);
8790 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
8793 if (!check_access(mdr
, diri
, MAY_WRITE
))
8797 if (!mdr
->more()->stid
) {
8798 mds
->snapclient
->prepare_update(diri
->ino(), snapid
, dstname
, utime_t(),
8799 &mdr
->more()->stid
, &mdr
->more()->snapidbl
,
8800 new C_MDS_RetryRequest(mdcache
, mdr
));
8804 version_t stid
= mdr
->more()->stid
;
8805 bufferlist::iterator p
= mdr
->more()->snapidbl
.begin();
8808 dout(10) << " stid is " << stid
<< ", seq is " << seq
<< dendl
;
8811 inode_t
*pi
= diri
->project_inode();
8812 pi
->ctime
= mdr
->get_op_stamp();
8813 pi
->version
= diri
->pre_dirty();
8815 // project the snaprealm
8816 sr_t
*newsnap
= diri
->project_snaprealm();
8817 assert(newsnap
->snaps
.count(snapid
));
8818 newsnap
->snaps
[snapid
].name
= dstname
;
8820 // journal the inode changes
8821 mdr
->ls
= mdlog
->get_current_segment();
8822 EUpdate
*le
= new EUpdate(mdlog
, "renamesnap");
8823 mdlog
->start_entry(le
);
8825 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
8826 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
8827 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
8828 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
8830 // journal the snaprealm changes
8831 submit_mdlog_entry(le
, new C_MDS_renamesnap_finish(this, mdr
, diri
, snapid
),
8836 void Server::_renamesnap_finish(MDRequestRef
& mdr
, CInode
*diri
, snapid_t snapid
)
8838 dout(10) << "_renamesnap_finish " << *mdr
<< " " << snapid
<< dendl
;
8840 diri
->pop_and_dirty_projected_inode(mdr
->ls
);
8843 mds
->snapclient
->commit(mdr
->more()->stid
, mdr
->ls
);
8845 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
8847 mdcache
->do_realm_invalidate_and_update_notify(diri
, CEPH_SNAP_OP_UPDATE
, true);
8852 mdr
->snapid
= snapid
;
8853 respond_to_request(mdr
, 0);
8857 * Return true if server is in state RECONNECT and this
8858 * client has not yet reconnected.
8860 bool Server::waiting_for_reconnect(client_t c
) const
8862 return client_reconnect_gather
.count(c
) > 0;
8865 void Server::dump_reconnect_status(Formatter
*f
) const
8867 f
->open_object_section("reconnect_status");
8868 f
->dump_stream("client_reconnect_gather") << client_reconnect_gather
;